from datetime import datetime, timezone
from json import dumps
from os import environ
from pathlib import Path
import threading

from requests import post
# Objective: download articles from the Regalytics API and write the processed
# data to /temp-output-directory/alternative/regalytics/articles/yyyyMMdd.json.

# Base URL of the Regalytics API (overridable for testing/staging).
URL = environ.get("REGALYTICS_API_BASE_URL", "https://api.regalytics.ai/api/v3")

# SECURITY: never commit a real API key as a default value. The key must come
# from the environment; an empty fallback makes a missing key fail loudly at
# the API instead of silently using a leaked credential.
API_KEY = environ.get("REGALYTICS_API_KEY", "")

# Date to process, 'yyyyMMdd'; defaults to today when not supplied.
DEPLOYMENT_DATE = environ.get('QC_DATAFLEET_DEPLOYMENT_DATE', f'{datetime.now():%Y%m%d}')

# Output directory for processed article files (one file per creation date).
ARTICLE_PATH = Path('/temp-output-directory/alternative/regalytics/articles')
ARTICLE_PATH.mkdir(parents=True, exist_ok=True)

# 'yyyyMMdd' date key -> list of processed article dicts, filled by process().
articles_by_date = {}

# The search API expects dates formatted as 'YYYY-MM-DD'.
process_date = datetime.strptime(DEPLOYMENT_DATE, '%Y%m%d').strftime('%Y-%m-%d')
1718
def get_data_from_source(process_date):
    """Fetch every result page for articles created on `process_date`.

    Issues the first search request synchronously to learn `total_pages`,
    then fetches the remaining pages concurrently with one thread per page.

    :param process_date: date string in 'YYYY-MM-DD' form.
    :return: list of raw JSON page responses, sorted by page number.
    """
    payload = dumps({
        "apikey": API_KEY,
        "search_options": {
            "created_at": {
                "start": process_date,
                "end": process_date,
            },
            "page_size": 1000,
        },
    })
    headers = {'Content-Type': 'application/json'}

    # Use the configured base URL (was hardcoded, bypassing
    # REGALYTICS_API_BASE_URL) — first page tells us the total page count.
    page_1 = post(f"{URL}/search", headers=headers, data=payload).json()
    all_responses = [page_1]

    if page_1['total_pages'] == 1:
        return all_responses

    def get_page(p):
        # list.append is atomic under the GIL, so concurrent appends are safe.
        page = post(f"{URL}/search?page={p}", headers=headers, data=payload).json()
        all_responses.append(page)

    threads = [threading.Thread(target=get_page, args=(p,))
               for p in range(2, page_1['total_pages'] + 1)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # Threads finish in arbitrary order; sort so downstream processing and
    # output are deterministic.
    all_responses.sort(key=lambda x: x['page_number'])
    return all_responses
2956
3057# "agencies": [
3158# {
4774# 1. query all data, -> /api/v2/.../get-all; 2. look at latest_update, add delta of 1/2 days;
4875# 3. write data to date of latest_update + delta. This date must be on the date we published the article on Regalytics
4976
def process(process_date):
    """Download, normalize and persist Regalytics articles for `process_date`.

    Normalization per article:
    - 'in_federal_register' text flag -> bool
    - 'states': country name -> list of state names, built from the agencies
    - 'agencies': reduced to a list of agency names
    - 'created_at' converted to UTC so articles land in the correct day file

    Articles are grouped into the module-level `articles_by_date` and written
    to ARTICLE_PATH as one JSON-lines file per 'yyyyMMdd' date key.
    """
    all_responses = get_data_from_source(process_date)

    for response in all_responses:
        for article in response.get('results', []):
            article['in_federal_register'] = 'yes' in article['in_federal_register'].lower()

            # Country name -> list of state names (Dictionary<string, List<string>>).
            states = {}
            # 'agencies' may be missing or explicitly null — treat both as empty.
            agencies = article.get('agencies') or []
            for agency in agencies:
                state = agency.get('state')
                if not state:
                    continue
                for country in agency.get('country', []):
                    # setdefault replaces the manual "if name not in states" dance.
                    states.setdefault(country['name'], []).extend(
                        x['name'] for x in state)

            article['states'] = states
            article['agencies'] = [agency['name'] for agency in agencies]

            # Search by `created_at` returns all articles with UTC time between
            # 00:00-23:59 of a single day, which includes articles created at
            # 20:00-00:00 EST of the "previous day" (-04:00). Convert
            # `created_at` to UTC so we don't overwrite the previous day's file.
            # %z only accepts '-0400' (not '-04:00') in Python 3.6, so strip the colon.
            article['created_at'] = article['created_at'][:-3] + article['created_at'][-2:]
            created_at = datetime.strptime(
                article['created_at'], '%Y-%m-%dT%H:%M:%S.%f%z').astimezone(timezone.utc)
            article['created_at'] = created_at.strftime('%Y-%m-%dT%H:%M:%S.%f')

            date_key = created_at.strftime('%Y%m%d')
            articles_by_date.setdefault(date_key, []).append(article)

    # One JSON-lines file per date: one compact JSON object per line.
    for date, articles in articles_by_date.items():
        with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file:
            article_file.write(
                '\n'.join(dumps(article, indent=None) for article in articles))
if __name__ == "__main__":
    # Run the full pipeline for the configured deployment date.
    process(process_date)