from datetime import datetime
import json
import os
import pathlib

import requests

# Base URL of the Regalytics API; fails fast (KeyError) if the deployment
# environment did not provide it.
URL = os.environ["REGALYTICS_API_BASE_URL"]
HEADERS = {
    'Content-Type': 'application/json'
}
# Output directory for processed articles (one yyyyMMdd.json file per date).
ARTICLE_PATH = pathlib.Path('/temp-output-directory/alternative/regalytics/articles')
1312
14- def main (date ):
15- # objectives:# download data from API -> temp folder or in memory. Output processed datat to /temp-output-directory/alternative/regalytics/articles/yyyyMMdd.json
16- ARTICLE_PATH .mkdir (parents = True , exist_ok = True )
17- articles_by_date = {}
13+ # objectives:# download data from API -> temp folder or in memory. Output processed datat to /temp-output-directory/alternative/regalytics/articles/yyyyMMdd.json
14+ ARTICLE_PATH .mkdir (parents = True , exist_ok = True )
15+ articles_by_date = {}
1816
19- if date == "all" :
20- url = f"{ URL } /get-all"
21- payload = json .dumps ({
22- "apikey" : os .environ ["REGALYTICS_API_KEY" ]
23- })
24-
25- response = requests .post (url , headers = HEADERS , data = payload ).json ()
26- max_page = response ['all_pages' ]
27- articles = response ['results' ]
28-
29- for i in range (2 , max_page + 1 ):
30- response = requests .post (f'{ url } ?page={ i } ' , headers = HEADERS , data = payload ).json ()
31- articles += response ['results' ]
32-
33- else :
34- url = f"{ URL } /search"
35- payload = json .dumps ({
36- "apikey" : os .environ ["REGALYTICS_API_KEY" ],
37- "search_options" : {
38- "created_at" : {
39- "start" : date ,
40- "end" : date
41- }
42- }
43- })
44-
45- response = requests .post (url , headers = HEADERS , data = payload ).json ()
46- articles = response ['articles' ]
47-
48- # "agencies": [
49- # {
50- # "name": "Iowa Department of Human Services",
51- # "states": [
52- # {
53- # "name": "Iowa"
54- # }
55- # ],
56- # "countries": [
57- # {
58- # "name": "United States"
59- # }
60- # ]
61- # }
62- # ]
63- # if states is more than 0
64- # loop into state and get the state name
65- # 1. query all data, -> /api/v2/.../get-all; 2. look at latest_update, add delta of 1/2 days;
66- # 3. write data to date of latest_update + delta. This date must be on the date we published the article on Regalytics
17+ process_datetime = datetime .strptime (os .environ ['QC_DATAFLEET_DEPLOYMENT_DATE' ], '%Y%m%d' ).date ()
18+ process_date = process_datetime .strftime ('%Y-%m-%d' )
6719
68- for article in articles :
69- article ['in_federal_register' ] = 'yes' in article ['in_federal_register' ].lower ()
70- # State -> Dictionary<string, List<string>>
71- states = {}
72- for agency in article ['agencies' ]:
73- state = agency ['states' ]
74-
75- if 'states' not in agency or state is None :
76- continue
20+ url = f"{ URL } /search"
21+ payload = json .dumps ({
22+ "apikey" : os .environ ["REGALYTICS_API_KEY" ],
23+ "search_options" : {
24+ "created_at" : {
25+ "start" : process_date ,
26+ "end" : process_date
27+ }
28+ }
29+ })
7730
78- if 'countries' not in agency :
79- continue
31+ response = requests .post (url , headers = HEADERS , data = payload ).json ()
32+ articles = response ['articles' ]
33+
34+ # "agencies": [
35+ # {
36+ # "name": "Iowa Department of Human Services",
37+ # "states": [
38+ # {
39+ # "name": "Iowa"
40+ # }
41+ # ],
42+ # "countries": [
43+ # {
44+ # "name": "United States"
45+ # }
46+ # ]
47+ # }
48+ # ]
49+ # if states is more than 0
50+ # loop into state and get the state name
51+ # 1. query all data, -> /api/v2/.../get-all; 2. look at latest_update, add delta of 1/2 days;
52+ # 3. write data to date of latest_update + delta. This date must be on the date we published the article on Regalytics
8053
81- countries = agency ['countries' ]
82- if countries is None :
83- continue
84-
85- for country in countries :
86- name = country ['name' ]
87-
88- if not name in states :
89- country_states = []
90- states [name ] = country_states
91- else :
92- country_states = states [name ]
54+ for article in articles :
55+ article ['in_federal_register' ] = 'yes' in article ['in_federal_register' ].lower ()
56+ # State -> Dictionary<string, List<string>>
57+ states = {}
58+ for agency in article ['agencies' ]:
59+ state = agency ['states' ]
60+
61+ if 'states' not in agency or state is None :
62+ continue
9363
94- country_states .extend ([x ['name' ] for x in state ])
64+ if 'countries' not in agency :
65+ continue
9566
96- article ['states' ] = states
97- article ['agencies' ] = [agency ['name' ] for agency in article ['agencies' ]]
67+ countries = agency ['countries' ]
68+ if countries is None :
69+ continue
9870
99- # remove timezone info (-04:00) [NewYork]
100- article ['created_at' ] = article ['created_at' ][:- 6 ]
101-
102- # all data received during day T would confer into day T+1 00:00
103- date = datetime .strptime (article ['created_at' ], '%Y-%m-%dT%H:%M:%S.%f' ).date ()
104- date_key = date .strftime ('%Y%m%d' )
71+ for country in countries :
72+ name = country ['name' ]
73+
74+ if not name in states :
75+ country_states = []
76+ states [name ] = country_states
77+ else :
78+ country_states = states [name ]
10579
106- if date_key not in articles_by_date :
107- date_articles = []
108- articles_by_date [date_key ] = date_articles
109- else :
110- date_articles = articles_by_date [date_key ]
80+ country_states .extend ([x ['name' ] for x in state ])
11181
112- date_articles .append (article )
82+ article ['states' ] = states
83+ article ['agencies' ] = [agency ['name' ] for agency in article ['agencies' ]]
84+
85+ # remove timezone info (-04:00) [NewYork]
86+ article ['created_at' ] = article ['created_at' ][:- 6 ]
87+
88+ # all data received during day T would confer into day T+1 00:00
89+ date = datetime .strptime (article ['created_at' ], '%Y-%m-%dT%H:%M:%S.%f' ).date ()
90+ date_key = date .strftime ('%Y%m%d' )
11391
114- for date , articles in articles_by_date .items ():
115- lines = []
116- for article in articles :
117- lines .append (json .dumps (article , indent = None ))
92+ if date_key not in articles_by_date :
93+ date_articles = []
94+ articles_by_date [date_key ] = date_articles
95+ else :
96+ date_articles = articles_by_date [date_key ]
97+
98+ date_articles .append (article )
11899
119- article_lines = '\n ' .join (lines )
100+ for date , articles in articles_by_date .items ():
101+ lines = []
102+ for article in articles :
103+ lines .append (json .dumps (article , indent = None ))
120104
121- with open (ARTICLE_PATH / f'{ date } .json' , 'w' ) as article_file :
122- article_file .write (article_lines )
105+ article_lines = '\n ' .join (lines )
123106
124- if __name__ == '__main__' :
125- if len (sys .argv ) != 2 :
126- raise ValueError ("process.py only takes 1 argument." )
127- main (sys .argv [- 1 ])
107+ with open (ARTICLE_PATH / f'{ date } .json' , 'w' ) as article_file :
108+ article_file .write (article_lines )
0 commit comments