Skip to content

Commit b97be5d

Browse files
AlexCatarino and claude authored
Fix bugs and improve fetch performance in process.py (#19)
* Fix bugs and improve fetch performance in process.py - Reuse a single requests.Session and bound concurrency with a ThreadPoolExecutor(max_workers=8) instead of spawning one thread per page with no limit. - Move the per-date file write out of the per-response loop; previously every date file was rewritten once per page. - Fix country_states scope bug that dropped state names when an agency had multiple countries or none. - Drop the Python 3.6 timezone string workaround; %z now parses the raw value directly. - Drop hardcoded REGALYTICS_API_KEY fallback and the in_federal_register boolean coercion. - Log response page count and per-date article counts; exit 1 if more than one date is produced. --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6e3db9c commit b97be5d

File tree

2 files changed

+33
-50
lines changed

2 files changed

+33
-50
lines changed

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ubuntu-24.04
1212
steps:
1313
- name: Checkout
14-
uses: actions/checkout@v2
14+
uses: actions/checkout@v5
1515

1616
- name: Liberate disk space
1717
uses: jlumbroso/free-disk-space@main
@@ -32,4 +32,4 @@ jobs:
3232
# BuildTests
3333
dotnet build ./tests/Tests.csproj /p:Configuration=Release /v:quiet /p:WarningLevel=1 && \
3434
# Run Tests
35-
dotnet test ./tests/bin/Release/net9.0/Tests.dll
35+
dotnet test ./tests/bin/Release/net10.0/Tests.dll

process.py

Lines changed: 31 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
import sys
12
from json import dumps
23
from pathlib import Path
34
from datetime import datetime, timezone
45
from os import environ
5-
from requests import post
6-
import threading
6+
from concurrent.futures import ThreadPoolExecutor
7+
from requests import Session
78

89
URL = environ.get("REGALYTICS_API_BASE_URL", "https://api.regalytics.ai/api/v3")
910
API_KEY = environ.get("REGALYTICS_API_KEY", "")
@@ -16,6 +17,8 @@
1617

1718
process_date = datetime.strptime(DEPLOYMENT_DATE, '%Y%m%d').strftime('%Y-%m-%d')
1819

20+
SESSION = Session()
21+
1922
def get_data_from_source(process_date):
2023

2124
payload = dumps({
@@ -28,31 +31,20 @@ def get_data_from_source(process_date):
2831
"page_size": 1000,
2932
},
3033
})
34+
headers = {'Content-Type': 'application/json'}
3135

32-
def get_page(p):
33-
page = post(f"https://api.regalytics.ai/api/v3/search?page={p}", headers={ 'Content-Type': 'application/json' }, data=payload).json()
34-
all_responses.append(page)
35-
36-
page_1 = post(f"https://api.regalytics.ai/api/v3/search", headers={ 'Content-Type': 'application/json' }, data=payload).json()
37-
all_responses = [page_1]
36+
def fetch(page):
37+
url = f"{URL}/search" if page == 1 else f"{URL}/search?page={page}"
38+
return SESSION.post(url, headers=headers, data=payload).json()
3839

40+
page_1 = fetch(1)
3941
if page_1['total_pages'] == 1:
40-
return all_responses
41-
42-
threads = []
43-
44-
for p in range(2, page_1['total_pages'] + 1):
45-
threads.append(threading.Thread(target=get_page, args=(p,)))
46-
47-
for t in threads:
48-
t.start()
42+
return [page_1]
4943

50-
for t in threads:
51-
t.join()
44+
with ThreadPoolExecutor(max_workers=8) as executor:
45+
remaining = list(executor.map(fetch, range(2, page_1['total_pages'] + 1)))
5246

53-
all_responses.sort(key=lambda x: x['page_number'])
54-
55-
return all_responses
47+
return [page_1] + remaining
5648

5749
# "agencies": [
5850
# {
@@ -76,54 +68,45 @@ def get_page(p):
7668

7769
def process(process_date):
7870
all_responses = get_data_from_source(process_date)
71+
print(f'Fetched {len(all_responses)} response page(s) for {process_date}')
7972

8073
for response in all_responses:
8174
for article in response.get('results', []):
82-
article['in_federal_register'] = 'yes' in article['in_federal_register'].lower()
75+
# Convert `in_federal_register` field into boolean value, default to False if the field is missing or empty or None.
76+
article['in_federal_register'] = 'yes' in (article.get('in_federal_register') or '').lower()
8377
# State -> Dictionary<string, List<string>>
8478
states = {}
85-
agencies = article.get('agencies', [])
86-
if not agencies:
87-
agencies = []
79+
agencies = article.get('agencies') or []
8880
for agency in agencies:
8981
state = agency.get('state')
9082
if not state:
9183
continue
9284

85+
state_names = [x['name'] for x in state]
9386
for country in agency.get('country', []):
94-
name = country['name']
95-
96-
if not name in states:
97-
country_states = []
98-
states[name] = country_states
99-
else:
100-
country_states = states[name]
101-
102-
country_states.extend([x['name'] for x in state])
87+
states.setdefault(country['name'], []).extend(state_names)
10388

10489
article['states'] = states
10590
article['agencies'] = [agency['name'] for agency in agencies]
106-
107-
# search using `created_at` returns all with UTC time between 00:00-23:59 in a single day,
91+
92+
# search using `created_at` returns all with UTC time between 00:00-23:59 in a single day,
10893
# so it include some articles created at 20:00-00:00 in EST of the "previous day" (-04:00).
10994
# Adjust timezone info of `created_at` field into UTC time to avoid overwriting the previous day file
110-
article['created_at'] = article['created_at'][:-3] + article['created_at'][-2:] # %z only accepts `-0400` instead of `-04:00` in Python3.6
11195
created_at = datetime.strptime(article['created_at'], '%Y-%m-%dT%H:%M:%S.%f%z').astimezone(timezone.utc)
11296
article['created_at'] = created_at.strftime('%Y-%m-%dT%H:%M:%S.%f')
11397
date_key = created_at.strftime('%Y%m%d')
11498

115-
if date_key not in articles_by_date:
116-
date_articles = []
117-
articles_by_date[date_key] = date_articles
118-
else:
119-
date_articles = articles_by_date[date_key]
99+
articles_by_date.setdefault(date_key, []).append(article)
120100

121-
date_articles.append(article)
101+
date_count = len(articles_by_date)
102+
print(f'Writing {date_count} date file(s)')
103+
for date, articles in articles_by_date.items():
104+
print(f' {date}: {len(articles)} article(s)')
105+
with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file:
106+
article_file.write('\n'.join(dumps(article, indent=None) for article in articles))
122107

123-
for date, articles in articles_by_date.items():
124-
with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file:
125-
article_lines = '\n'.join([dumps(article, indent=None) for article in articles])
126-
article_file.write(article_lines)
108+
if date_count > 1:
109+
sys.exit(f'ERROR: expected 1 date, got {date_count}: {sorted(articles_by_date)}')
127110

128111
if __name__ == "__main__":
129112
process(process_date)

0 commit comments

Comments (0)