Skip to content

Commit ae9ad39

Browse files
Merge pull request #7 from LouisSzeto/master
Address review
2 parents 21a87bd + fb843ee commit ae9ad39

1 file changed

Lines changed: 83 additions & 102 deletions

File tree

process.py

Lines changed: 83 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -3,125 +3,106 @@
33
from datetime import datetime
44
import os
55
import requests
6-
import sys
76

87
URL = os.environ["REGALYTICS_API_BASE_URL"]
98
HEADERS = {
109
'Content-Type': 'application/json'
1110
}
1211
ARTICLE_PATH = pathlib.Path('/temp-output-directory/alternative/regalytics/articles')
1312

14-
def main(date):
15-
# objectives:# download data from API -> temp folder or in memory. Output processed datat to /temp-output-directory/alternative/regalytics/articles/yyyyMMdd.json
16-
ARTICLE_PATH.mkdir(parents=True, exist_ok=True)
17-
articles_by_date = {}
13+
# objectives:# download data from API -> temp folder or in memory. Output processed datat to /temp-output-directory/alternative/regalytics/articles/yyyyMMdd.json
14+
ARTICLE_PATH.mkdir(parents=True, exist_ok=True)
15+
articles_by_date = {}
1816

19-
if date == "all":
20-
url = f"{URL}/get-all"
21-
payload = json.dumps({
22-
"apikey": os.environ["REGALYTICS_API_KEY"]
23-
})
24-
25-
response = requests.post(url, headers=HEADERS, data=payload).json()
26-
max_page = response['all_pages']
27-
articles = response['results']
28-
29-
for i in range(2, max_page + 1):
30-
response = requests.post(f'{url}?page={i}', headers=HEADERS, data=payload).json()
31-
articles += response['results']
32-
33-
else:
34-
url = f"{URL}/search"
35-
payload = json.dumps({
36-
"apikey": os.environ["REGALYTICS_API_KEY"],
37-
"search_options": {
38-
"created_at": {
39-
"start": date,
40-
"end": date
41-
}
42-
}
43-
})
44-
45-
response = requests.post(url, headers=HEADERS, data=payload).json()
46-
articles = response['articles']
47-
48-
# "agencies": [
49-
# {
50-
# "name": "Iowa Department of Human Services",
51-
# "states": [
52-
# {
53-
# "name": "Iowa"
54-
# }
55-
# ],
56-
# "countries": [
57-
# {
58-
# "name": "United States"
59-
# }
60-
# ]
61-
# }
62-
# ]
63-
# if states is more than 0
64-
# loop into state and get the state name
65-
# 1. query all data, -> /api/v2/.../get-all; 2. look at latest_update, add delta of 1/2 days;
66-
# 3. write data to date of latest_update + delta. This date must be on the date we published the article on Regalytics
17+
process_datetime = datetime.strptime(os.environ['QC_DATAFLEET_DEPLOYMENT_DATE'], '%Y%m%d').date()
18+
process_date = process_datetime.strftime('%Y-%m-%d')
6719

68-
for article in articles:
69-
article['in_federal_register'] = 'yes' in article['in_federal_register'].lower()
70-
# State -> Dictionary<string, List<string>>
71-
states = {}
72-
for agency in article['agencies']:
73-
state = agency['states']
74-
75-
if 'states' not in agency or state is None:
76-
continue
20+
url = f"{URL}/search"
21+
payload = json.dumps({
22+
"apikey": os.environ["REGALYTICS_API_KEY"],
23+
"search_options": {
24+
"created_at": {
25+
"start": process_date,
26+
"end": process_date
27+
}
28+
}
29+
})
7730

78-
if 'countries' not in agency:
79-
continue
31+
response = requests.post(url, headers=HEADERS, data=payload).json()
32+
articles = response['articles']
33+
34+
# "agencies": [
35+
# {
36+
# "name": "Iowa Department of Human Services",
37+
# "states": [
38+
# {
39+
# "name": "Iowa"
40+
# }
41+
# ],
42+
# "countries": [
43+
# {
44+
# "name": "United States"
45+
# }
46+
# ]
47+
# }
48+
# ]
49+
# if states is more than 0
50+
# loop into state and get the state name
51+
# 1. query all data, -> /api/v2/.../get-all; 2. look at latest_update, add delta of 1/2 days;
52+
# 3. write data to date of latest_update + delta. This date must be on the date we published the article on Regalytics
8053

81-
countries = agency['countries']
82-
if countries is None:
83-
continue
84-
85-
for country in countries:
86-
name = country['name']
87-
88-
if not name in states:
89-
country_states = []
90-
states[name] = country_states
91-
else:
92-
country_states = states[name]
54+
for article in articles:
55+
article['in_federal_register'] = 'yes' in article['in_federal_register'].lower()
56+
# State -> Dictionary<string, List<string>>
57+
states = {}
58+
for agency in article['agencies']:
59+
state = agency['states']
60+
61+
if 'states' not in agency or state is None:
62+
continue
9363

94-
country_states.extend([x['name'] for x in state])
64+
if 'countries' not in agency:
65+
continue
9566

96-
article['states'] = states
97-
article['agencies'] = [agency['name'] for agency in article['agencies']]
67+
countries = agency['countries']
68+
if countries is None:
69+
continue
9870

99-
# remove timezone info (-04:00) [NewYork]
100-
article['created_at'] = article['created_at'][:-6]
101-
102-
# all data received during day T would confer into day T+1 00:00
103-
date = datetime.strptime(article['created_at'], '%Y-%m-%dT%H:%M:%S.%f').date()
104-
date_key = date.strftime('%Y%m%d')
71+
for country in countries:
72+
name = country['name']
73+
74+
if not name in states:
75+
country_states = []
76+
states[name] = country_states
77+
else:
78+
country_states = states[name]
10579

106-
if date_key not in articles_by_date:
107-
date_articles = []
108-
articles_by_date[date_key] = date_articles
109-
else:
110-
date_articles = articles_by_date[date_key]
80+
country_states.extend([x['name'] for x in state])
11181

112-
date_articles.append(article)
82+
article['states'] = states
83+
article['agencies'] = [agency['name'] for agency in article['agencies']]
84+
85+
# remove timezone info (-04:00) [NewYork]
86+
article['created_at'] = article['created_at'][:-6]
87+
88+
# all data received during day T would confer into day T+1 00:00
89+
date = datetime.strptime(article['created_at'], '%Y-%m-%dT%H:%M:%S.%f').date()
90+
date_key = date.strftime('%Y%m%d')
11391

114-
for date, articles in articles_by_date.items():
115-
lines = []
116-
for article in articles:
117-
lines.append(json.dumps(article, indent=None))
92+
if date_key not in articles_by_date:
93+
date_articles = []
94+
articles_by_date[date_key] = date_articles
95+
else:
96+
date_articles = articles_by_date[date_key]
97+
98+
date_articles.append(article)
11899

119-
article_lines = '\n'.join(lines)
100+
for date, articles in articles_by_date.items():
101+
lines = []
102+
for article in articles:
103+
lines.append(json.dumps(article, indent=None))
120104

121-
with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file:
122-
article_file.write(article_lines)
105+
article_lines = '\n'.join(lines)
123106

124-
if __name__ == '__main__':
125-
if len(sys.argv) != 2:
126-
raise ValueError("process.py only takes 1 argument.")
127-
main(sys.argv[-1])
107+
with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file:
108+
article_file.write(article_lines)

0 commit comments

Comments (0)