Skip to content

Commit fe533b4

Browse files
Update process.py
Updated process.py to read the data contents of all result pages; previously only one page was read.
1 parent 892f91d commit fe533b4

1 file changed

Lines changed: 90 additions & 56 deletions

File tree

process.py

Lines changed: 90 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,56 @@
33
from datetime import datetime, timezone
from os import environ
from requests import post
import threading


# Base URL of the Regalytics REST API; overridable for staging/testing.
URL = environ.get("REGALYTICS_API_BASE_URL", "https://api.regalytics.ai/api/v3")
# SECURITY: the key must come from the environment — never commit a real key
# as the fallback default (the previous default leaked a credential).
API_KEY = environ.get("REGALYTICS_API_KEY", "")
# Processing date in yyyyMMdd; defaults to today when the fleet does not supply one.
DEPLOYMENT_DATE = environ.get('QC_DATAFLEET_DEPLOYMENT_DATE', f'{datetime.now():%Y%m%d}')

# objectives: download data from API -> temp folder or in memory. Output processed data
# to /temp-output-directory/alternative/regalytics/articles/yyyyMMdd.json
# NOTE(review): the path below ('temp-output-directory_temp') diverges from the
# documented '/temp-output-directory' target above — confirm this is not a
# leftover local-debug change before deploying.
ARTICLE_PATH = Path('temp-output-directory_temp/alternative/regalytics/articles')
ARTICLE_PATH.mkdir(parents=True, exist_ok=True)

# date key (yyyyMMdd, UTC) -> list of processed article dicts; filled by process().
articles_by_date = {}

# The API search filter expects ISO dates (yyyy-MM-dd).
process_date = datetime.strptime(DEPLOYMENT_DATE, '%Y%m%d').strftime('%Y-%m-%d')
1718

18-
payload = dumps({
19-
"apikey": API_KEY,
20-
"search_options": {
21-
"created_at": {
22-
"start": process_date,
23-
"end": process_date
24-
}
25-
}
26-
})
27-
28-
response = post(f"{URL}/search", headers={ 'Content-Type': 'application/json' }, data=payload).json()
19+
def get_data_from_source(process_date):
    """Fetch every result page of articles created on `process_date` (yyyy-MM-dd).

    Returns the list of raw response dicts, one per page, ordered by page number.
    Page 1 is fetched first to learn the total page count; the remaining pages
    are fetched concurrently with threads (the work is network-bound).
    """
    payload = dumps({
        "apikey": API_KEY,
        "search_options": {
            "created_at": {
                "start": process_date,
                "end": process_date
            },
            "page_size": 1000,
        },
    })
    headers = {'Content-Type': 'application/json'}

    def get_page(p):
        # list.append is atomic in CPython, so no lock is needed for the shared list.
        page = post(f"{URL}/search?page={p}", headers=headers, data=payload).json()
        all_responses.append(page)

    # Use the configurable URL constant (the previous code hardcoded the
    # production endpoint and ignored REGALYTICS_API_BASE_URL).
    page_1 = post(f"{URL}/search", headers=headers, data=payload).json()
    all_responses = [page_1]

    # Treat a response without 'total_pages' as a single page rather than crashing.
    total_pages = page_1.get('total_pages', 1)
    if total_pages == 1:
        return all_responses

    threads = [threading.Thread(target=get_page, args=(p,))
               for p in range(2, total_pages + 1)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # Threads complete in arbitrary order; restore page order for deterministic output.
    all_responses.sort(key=lambda x: x['page_number'])
    return all_responses
2956

3057
# "agencies": [
3158
# {
@@ -47,49 +74,56 @@
4774
# 1. query all data, -> /api/v2/.../get-all; 2. look at latest_update, add delta of 1/2 days;
4875
# 3. write data to date of latest_update + delta. This date must be on the date we published the article on Regalytics
4976

50-
for article in response.get('results', []):
51-
article['in_federal_register'] = 'yes' in article['in_federal_register'].lower()
52-
# State -> Dictionary<string, List<string>>
53-
states = {}
54-
agencies = article.get('agencies', [])
55-
if not agencies:
56-
agencies = []
57-
for agency in agencies:
58-
state = agency.get('state')
59-
if not state:
60-
continue
61-
62-
for country in agency.get('country', []):
63-
name = country['name']
64-
65-
if not name in states:
66-
country_states = []
67-
states[name] = country_states
77+
def _country_states(agencies):
    """Map country name -> list of state names collected from `agencies`.

    Agencies without a truthy 'state' entry are skipped; the same country may
    appear under several agencies, so state names are accumulated per country.
    """
    states = {}
    for agency in agencies:
        state = agency.get('state')
        if not state:
            continue
        for country in agency.get('country', []):
            # setdefault replaces the original if/else list bookkeeping.
            states.setdefault(country['name'], []).extend(x['name'] for x in state)
    return states


def process(process_date):
    """Download all article pages for `process_date`, normalize each article,
    and write one JSON-lines file per UTC creation date under ARTICLE_PATH.

    NOTE(review): results accumulate in the module-level `articles_by_date`,
    so calling process() twice in one interpreter re-writes earlier dates too.
    """
    for response in get_data_from_source(process_date):
        for article in response.get('results', []):
            # Free-text 'Yes'/'No' flag -> bool.
            article['in_federal_register'] = 'yes' in article['in_federal_register'].lower()

            # `or []` also covers an explicit null value, matching the original
            # two-step default.
            agencies = article.get('agencies') or []
            # State -> Dictionary<string, List<string>>
            article['states'] = _country_states(agencies)
            article['agencies'] = [agency['name'] for agency in agencies]

            # search using `created_at` returns all with UTC time between 00:00-23:59
            # in a single day, so it includes some articles created at 20:00-00:00 in
            # EST of the "previous day" (-04:00). Adjust timezone info of `created_at`
            # into UTC time to avoid overwriting the previous day file.
            article['created_at'] = article['created_at'][:-3] + article['created_at'][-2:]  # %z only accepts `-0400` instead of `-04:00` in Python3.6
            created_at = datetime.strptime(article['created_at'], '%Y-%m-%dT%H:%M:%S.%f%z').astimezone(timezone.utc)
            article['created_at'] = created_at.strftime('%Y-%m-%dT%H:%M:%S.%f')

            articles_by_date.setdefault(created_at.strftime('%Y%m%d'), []).append(article)

    # One JSON object per line (JSON-lines), one file per UTC date.
    for date, articles in articles_by_date.items():
        with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file:
            article_file.write('\n'.join(dumps(article, indent=None) for article in articles))


if __name__ == "__main__":
    process(process_date)

0 commit comments

Comments
 (0)