import sys
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from json import dumps
from os import environ
from pathlib import Path

from requests import Session
89URL = environ .get ("REGALYTICS_API_BASE_URL" , "https://api.regalytics.ai/api/v3" )
910API_KEY = environ .get ("REGALYTICS_API_KEY" , "" )
1617
1718process_date = datetime .strptime (DEPLOYMENT_DATE , '%Y%m%d' ).strftime ('%Y-%m-%d' )
1819
20+ SESSION = Session ()
21+
1922def get_data_from_source (process_date ):
2023
2124 payload = dumps ({
@@ -28,31 +31,20 @@ def get_data_from_source(process_date):
2831 "page_size" : 1000 ,
2932 },
3033 })
34+ headers = {'Content-Type' : 'application/json' }
3135
32- def get_page (p ):
33- page = post (f"https://api.regalytics.ai/api/v3/search?page={ p } " , headers = { 'Content-Type' : 'application/json' }, data = payload ).json ()
34- all_responses .append (page )
35-
36- page_1 = post (f"https://api.regalytics.ai/api/v3/search" , headers = { 'Content-Type' : 'application/json' }, data = payload ).json ()
37- all_responses = [page_1 ]
36+ def fetch (page ):
37+ url = f"{ URL } /search" if page == 1 else f"{ URL } /search?page={ page } "
38+ return SESSION .post (url , headers = headers , data = payload ).json ()
3839
40+ page_1 = fetch (1 )
3941 if page_1 ['total_pages' ] == 1 :
40- return all_responses
41-
42- threads = []
43-
44- for p in range (2 , page_1 ['total_pages' ] + 1 ):
45- threads .append (threading .Thread (target = get_page , args = (p ,)))
46-
47- for t in threads :
48- t .start ()
42+ return [page_1 ]
4943
50- for t in threads :
51- t . join ( )
44+ with ThreadPoolExecutor ( max_workers = 8 ) as executor :
45+ remaining = list ( executor . map ( fetch , range ( 2 , page_1 [ 'total_pages' ] + 1 )) )
5246
53- all_responses .sort (key = lambda x : x ['page_number' ])
54-
55- return all_responses
47+ return [page_1 ] + remaining
5648
5749# "agencies": [
5850# {
@@ -76,54 +68,45 @@ def get_page(p):
7668
7769def process (process_date ):
7870 all_responses = get_data_from_source (process_date )
71+ print (f'Fetched { len (all_responses )} response page(s) for { process_date } ' )
7972
8073 for response in all_responses :
8174 for article in response .get ('results' , []):
82- article ['in_federal_register' ] = 'yes' in article ['in_federal_register' ].lower ()
75+ # Convert `in_federal_register` field into boolean value, default to False if the field is missing or empty or None.
76+ article ['in_federal_register' ] = 'yes' in (article .get ('in_federal_register' ) or '' ).lower ()
8377 # State -> Dictionary<string, List<string>>
8478 states = {}
85- agencies = article .get ('agencies' , [])
86- if not agencies :
87- agencies = []
79+ agencies = article .get ('agencies' ) or []
8880 for agency in agencies :
8981 state = agency .get ('state' )
9082 if not state :
9183 continue
9284
85+ state_names = [x ['name' ] for x in state ]
9386 for country in agency .get ('country' , []):
94- name = country ['name' ]
95-
96- if not name in states :
97- country_states = []
98- states [name ] = country_states
99- else :
100- country_states = states [name ]
101-
102- country_states .extend ([x ['name' ] for x in state ])
87+ states .setdefault (country ['name' ], []).extend (state_names )
10388
10489 article ['states' ] = states
10590 article ['agencies' ] = [agency ['name' ] for agency in agencies ]
106-
107- # search using `created_at` returns all with UTC time between 00:00-23:59 in a single day,
91+
92+ # search using `created_at` returns all with UTC time between 00:00-23:59 in a single day,
10893 # so it include some articles created at 20:00-00:00 in EST of the "previous day" (-04:00).
10994 # Adjust timezone info of `created_at` field into UTC time to avoid overwriting the previous day file
110- article ['created_at' ] = article ['created_at' ][:- 3 ] + article ['created_at' ][- 2 :] # %z only accepts `-0400` instead of `-04:00` in Python3.6
11195 created_at = datetime .strptime (article ['created_at' ], '%Y-%m-%dT%H:%M:%S.%f%z' ).astimezone (timezone .utc )
11296 article ['created_at' ] = created_at .strftime ('%Y-%m-%dT%H:%M:%S.%f' )
11397 date_key = created_at .strftime ('%Y%m%d' )
11498
115- if date_key not in articles_by_date :
116- date_articles = []
117- articles_by_date [date_key ] = date_articles
118- else :
119- date_articles = articles_by_date [date_key ]
99+ articles_by_date .setdefault (date_key , []).append (article )
120100
121- date_articles .append (article )
101+ date_count = len (articles_by_date )
102+ print (f'Writing { date_count } date file(s)' )
103+ for date , articles in articles_by_date .items ():
104+ print (f' { date } : { len (articles )} article(s)' )
105+ with open (ARTICLE_PATH / f'{ date } .json' , 'w' ) as article_file :
106+ article_file .write ('\n ' .join (dumps (article , indent = None ) for article in articles ))
122107
123- for date , articles in articles_by_date .items ():
124- with open (ARTICLE_PATH / f'{ date } .json' , 'w' ) as article_file :
125- article_lines = '\n ' .join ([dumps (article , indent = None ) for article in articles ])
126- article_file .write (article_lines )
108+ if date_count > 1 :
109+ sys .exit (f'ERROR: expected 1 date, got { date_count } : { sorted (articles_by_date )} ' )
127110
128111if __name__ == "__main__" :
129112 process (process_date )
# (scraper residue: "0 commit comments" footer from the commit page — not source code)