Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-24.04
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v5

- name: Liberate disk space
uses: jlumbroso/free-disk-space@main
Expand All @@ -32,4 +32,4 @@ jobs:
# BuildTests
dotnet build ./tests/Tests.csproj /p:Configuration=Release /v:quiet /p:WarningLevel=1 && \
# Run Tests
dotnet test ./tests/bin/Release/net9.0/Tests.dll
dotnet test ./tests/bin/Release/net10.0/Tests.dll
79 changes: 31 additions & 48 deletions process.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import sys
from json import dumps
from pathlib import Path
from datetime import datetime, timezone
from os import environ
from requests import post
import threading
from concurrent.futures import ThreadPoolExecutor
from requests import Session

URL = environ.get("REGALYTICS_API_BASE_URL", "https://api.regalytics.ai/api/v3")
API_KEY = environ.get("REGALYTICS_API_KEY", "")
Expand All @@ -16,6 +17,8 @@

process_date = datetime.strptime(DEPLOYMENT_DATE, '%Y%m%d').strftime('%Y-%m-%d')

SESSION = Session()

def get_data_from_source(process_date):

payload = dumps({
Expand All @@ -28,31 +31,20 @@ def get_data_from_source(process_date):
"page_size": 1000,
},
})
headers = {'Content-Type': 'application/json'}

def get_page(p):
page = post(f"https://api.regalytics.ai/api/v3/search?page={p}", headers={ 'Content-Type': 'application/json' }, data=payload).json()
all_responses.append(page)

page_1 = post(f"https://api.regalytics.ai/api/v3/search", headers={ 'Content-Type': 'application/json' }, data=payload).json()
all_responses = [page_1]
def fetch(page):
url = f"{URL}/search" if page == 1 else f"{URL}/search?page={page}"
return SESSION.post(url, headers=headers, data=payload).json()

page_1 = fetch(1)
if page_1['total_pages'] == 1:
return all_responses

threads = []

for p in range(2, page_1['total_pages'] + 1):
threads.append(threading.Thread(target=get_page, args=(p,)))

for t in threads:
t.start()
return [page_1]

for t in threads:
t.join()
with ThreadPoolExecutor(max_workers=8) as executor:
remaining = list(executor.map(fetch, range(2, page_1['total_pages'] + 1)))

all_responses.sort(key=lambda x: x['page_number'])

return all_responses
return [page_1] + remaining

# "agencies": [
# {
Expand All @@ -76,54 +68,45 @@ def get_page(p):

def process(process_date):
all_responses = get_data_from_source(process_date)
print(f'Fetched {len(all_responses)} response page(s) for {process_date}')

for response in all_responses:
for article in response.get('results', []):
article['in_federal_register'] = 'yes' in article['in_federal_register'].lower()
# Convert `in_federal_register` field into boolean value, default to False if the field is missing or empty or None.
article['in_federal_register'] = 'yes' in (article.get('in_federal_register') or '').lower()
# State -> Dictionary<string, List<string>>
states = {}
agencies = article.get('agencies', [])
if not agencies:
agencies = []
agencies = article.get('agencies') or []
for agency in agencies:
state = agency.get('state')
if not state:
continue

state_names = [x['name'] for x in state]
for country in agency.get('country', []):
name = country['name']

if not name in states:
country_states = []
states[name] = country_states
else:
country_states = states[name]

country_states.extend([x['name'] for x in state])
states.setdefault(country['name'], []).extend(state_names)

article['states'] = states
article['agencies'] = [agency['name'] for agency in agencies]
# search using `created_at` returns all with UTC time between 00:00-23:59 in a single day,

# search using `created_at` returns all with UTC time between 00:00-23:59 in a single day,
# so it includes some articles created at 20:00-00:00 in EST of the "previous day" (-04:00).
# Adjust timezone info of `created_at` field into UTC time to avoid overwriting the previous day file
article['created_at'] = article['created_at'][:-3] + article['created_at'][-2:] # %z only accepts `-0400` instead of `-04:00` in Python3.6
created_at = datetime.strptime(article['created_at'], '%Y-%m-%dT%H:%M:%S.%f%z').astimezone(timezone.utc)
article['created_at'] = created_at.strftime('%Y-%m-%dT%H:%M:%S.%f')
date_key = created_at.strftime('%Y%m%d')

if date_key not in articles_by_date:
date_articles = []
articles_by_date[date_key] = date_articles
else:
date_articles = articles_by_date[date_key]
articles_by_date.setdefault(date_key, []).append(article)

date_articles.append(article)
date_count = len(articles_by_date)
print(f'Writing {date_count} date file(s)')
for date, articles in articles_by_date.items():
print(f' {date}: {len(articles)} article(s)')
with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file:
article_file.write('\n'.join(dumps(article, indent=None) for article in articles))

for date, articles in articles_by_date.items():
with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file:
article_lines = '\n'.join([dumps(article, indent=None) for article in articles])
article_file.write(article_lines)
if date_count > 1:
sys.exit(f'ERROR: expected 1 date, got {date_count}: {sorted(articles_by_date)}')

if __name__ == "__main__":
process(process_date)