diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 286187b..4fd6f82 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v5 - name: Liberate disk space uses: jlumbroso/free-disk-space@main @@ -32,4 +32,4 @@ jobs: # BuildTests dotnet build ./tests/Tests.csproj /p:Configuration=Release /v:quiet /p:WarningLevel=1 && \ # Run Tests - dotnet test ./tests/bin/Release/net9.0/Tests.dll + dotnet test ./tests/bin/Release/net10.0/Tests.dll diff --git a/process.py b/process.py index 3f0709a..971c1ba 100644 --- a/process.py +++ b/process.py @@ -1,9 +1,10 @@ +import sys from json import dumps from pathlib import Path from datetime import datetime, timezone from os import environ -from requests import post -import threading +from concurrent.futures import ThreadPoolExecutor +from requests import Session URL = environ.get("REGALYTICS_API_BASE_URL", "https://api.regalytics.ai/api/v3") API_KEY = environ.get("REGALYTICS_API_KEY", "") @@ -16,6 +17,8 @@ process_date = datetime.strptime(DEPLOYMENT_DATE, '%Y%m%d').strftime('%Y-%m-%d') +SESSION = Session() + def get_data_from_source(process_date): payload = dumps({ @@ -28,31 +31,20 @@ def get_data_from_source(process_date): "page_size": 1000, }, }) + headers = {'Content-Type': 'application/json'} - def get_page(p): - page = post(f"https://api.regalytics.ai/api/v3/search?page={p}", headers={ 'Content-Type': 'application/json' }, data=payload).json() - all_responses.append(page) - - page_1 = post(f"https://api.regalytics.ai/api/v3/search", headers={ 'Content-Type': 'application/json' }, data=payload).json() - all_responses = [page_1] + def fetch(page): + url = f"{URL}/search" if page == 1 else f"{URL}/search?page={page}" + return SESSION.post(url, headers=headers, data=payload).json() + page_1 = fetch(1) if page_1['total_pages'] == 1: - return all_responses - - threads = [] - - for p in range(2, page_1['total_pages'] + 1): - threads.append(threading.Thread(target=get_page, args=(p,))) - - for t in threads: - t.start() + return [page_1] - for t in threads: - t.join() + with ThreadPoolExecutor(max_workers=8) as executor: + remaining = list(executor.map(fetch, range(2, page_1['total_pages'] + 1))) - all_responses.sort(key=lambda x: x['page_number']) - - return all_responses + return [page_1] + remaining # "agencies": [ # { @@ -76,54 +68,45 @@ def get_page(p): def process(process_date): all_responses = get_data_from_source(process_date) + print(f'Fetched {len(all_responses)} response page(s) for {process_date}') for response in all_responses: for article in response.get('results', []): - article['in_federal_register'] = 'yes' in article['in_federal_register'].lower() + # Convert `in_federal_register` field into boolean value, default to False if the field is missing or empty or None. + article['in_federal_register'] = 'yes' in (article.get('in_federal_register') or '').lower() # State -> Dictionary> states = {} - agencies = article.get('agencies', []) - if not agencies: - agencies = [] + agencies = article.get('agencies') or [] for agency in agencies: state = agency.get('state') if not state: continue + state_names = [x['name'] for x in state] for country in agency.get('country', []): - name = country['name'] - - if not name in states: - country_states = [] - states[name] = country_states - else: - country_states = states[name] - - country_states.extend([x['name'] for x in state]) + states.setdefault(country['name'], []).extend(state_names) article['states'] = states article['agencies'] = [agency['name'] for agency in agencies] - - # search using `created_at` returns all with UTC time between 00:00-23:59 in a single day, + + # search using `created_at` returns all with UTC time between 00:00-23:59 in a single day, # so it include some articles created at 20:00-00:00 in EST of the "previous day" (-04:00). # Adjust timezone info of `created_at` field into UTC time to avoid overwriting the previous day file - article['created_at'] = article['created_at'][:-3] + article['created_at'][-2:] # %z only accepts `-0400` instead of `-04:00` in Python3.6 created_at = datetime.strptime(article['created_at'], '%Y-%m-%dT%H:%M:%S.%f%z').astimezone(timezone.utc) article['created_at'] = created_at.strftime('%Y-%m-%dT%H:%M:%S.%f') date_key = created_at.strftime('%Y%m%d') - if date_key not in articles_by_date: - date_articles = [] - articles_by_date[date_key] = date_articles - else: - date_articles = articles_by_date[date_key] + articles_by_date.setdefault(date_key, []).append(article) - date_articles.append(article) + date_count = len(articles_by_date) + print(f'Writing {date_count} date file(s)') + for date, articles in articles_by_date.items(): + print(f' {date}: {len(articles)} article(s)') + with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file: + article_file.write('\n'.join(dumps(article, indent=None) for article in articles)) - for date, articles in articles_by_date.items(): - with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file: - article_lines = '\n'.join([dumps(article, indent=None) for article in articles]) - article_file.write(article_lines) + if date_count > 1: + sys.exit(f'ERROR: expected 1 date, got {date_count}: {sorted(articles_by_date)}') if __name__ == "__main__": process(process_date) \ No newline at end of file