Skip to content

Commit a7fd300

Browse files
committed
Consistently pass a user agent for update scripts
The scourge of AI bots is forcing more and more sites to block bots that don't set particular user agents.
1 parent e7afc61 commit a7fd300

12 files changed

Lines changed: 68 additions & 26 deletions

File tree

update/at_postleitzahl.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
# update/at_postleitzahl.py - download list of Austrian postal codes
55
#
6-
# Copyright (C) 2018-2025 Arthur de Jong
6+
# Copyright (C) 2018-2026 Arthur de Jong
77
#
88
# This library is free software; you can redistribute it and/or
99
# modify it under the terms of the GNU Lesser General Public
@@ -31,6 +31,10 @@
3131
download_url = 'https://data.rtr.at/api/v1/tables/plz.json'
3232

3333

34+
# The user agent that will be passed in requests
35+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
36+
37+
3438
# The list of regions that can be used in the document.
3539
regions = {
3640
'B': 'Burgenland',
@@ -46,7 +50,7 @@
4650

4751

4852
if __name__ == '__main__':
49-
response = requests.get(download_url, timeout=30)
53+
response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
5054
response.raise_for_status()
5155
data = response.json()
5256
# print header

update/be_banks.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@
3535
download_url = 'https://www.nbb.be/doc/be/be/protocol/grouped_list_current.xlsx'
3636

3737

38+
# The user agent that will be passed in requests
39+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
40+
41+
3842
# List of values that refer to non-existing, reserved or otherwise not-
3943
# allocated entries.
4044
not_applicable_values = (
@@ -79,7 +83,7 @@ def get_values(sheet):
7983

8084

8185
if __name__ == '__main__':
82-
response = requests.get(download_url, timeout=30)
86+
response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
8387
response.raise_for_status()
8488
workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True)
8589
sheet = workbook.worksheets[0]

update/cfi.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# update/cfi.py - script to download CFI code list from the SIX group
44
#
5-
# Copyright (C) 2022-2025 Arthur de Jong
5+
# Copyright (C) 2022-2026 Arthur de Jong
66
#
77
# This library is free software; you can redistribute it and/or
88
# modify it under the terms of the GNU Lesser General Public
@@ -33,6 +33,10 @@
3333
download_url = 'https://www.six-group.com/en/products-services/financial-information/data-standards.html'
3434

3535

36+
# The user agent that will be passed in requests
37+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
38+
39+
3640
def normalise(value):
3741
"""Clean and minimise attribute names and values."""
3842
return re.sub(r' *[(\[\n].*', '', value, flags=re.MULTILINE).strip()
@@ -76,14 +80,14 @@ def print_attributes(attributes, index=0):
7680

7781
if __name__ == '__main__':
7882
# Download the page that contains the link to the current XLS file
79-
response = requests.get(download_url, timeout=30)
83+
response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
8084
response.raise_for_status()
8185
# Find the download link
8286
document = lxml.html.document_fromstring(response.content)
8387
links = [a.get('href') for a in document.findall('.//a[@href]')]
8488
link_url = next(a for a in links if re.match(r'.*/cfi/.*xlsx?$', a))
8589
# Download and parse the spreadsheet
86-
response = requests.get(link_url, timeout=30)
90+
response = requests.get(link_url, timeout=30, headers={'User-Agent': user_agent})
8791
response.raise_for_status()
8892
workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True)
8993

update/cn_loc.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# update/cn_loc.py - script to fetch data from the CN Open Data community
44
#
55
# Copyright (C) 2014-2015 Jiangge Zhang
6-
# Copyright (C) 2015-2025 Arthur de Jong
6+
# Copyright (C) 2015-2026 Arthur de Jong
77
#
88
# This library is free software; you can redistribute it and/or
99
# modify it under the terms of the GNU Lesser General Public
@@ -33,6 +33,10 @@
3333
wikipedia_pages = [f'中华人民共和国行政区划代码 ({i}区)' for i in range(1, 9)]
3434

3535

36+
# The user agent that will be passed in requests
37+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
38+
39+
3640
def get_wikipedia_url(page):
3741
"""Get the Simplified Chinese Wikipedia page URL."""
3842
return f'https://zh.wikipedia.org/w/index.php?title={page.replace(" ", "_")}&action=raw' # noqa: E231
@@ -128,7 +132,7 @@ def parse_page(content):
128132
provinces = {}
129133
numbers = defaultdict(lambda: defaultdict(list))
130134
for page in wikipedia_pages:
131-
response = requests.get(get_wikipedia_url(page), timeout=30)
135+
response = requests.get(get_wikipedia_url(page), timeout=30, headers={'User-Agent': user_agent})
132136
response.raise_for_status()
133137
for prefix, province, number, county in parse_page(response.text):
134138
provinces[prefix] = province

update/cz_banks.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# update/cz_banks.py - script to download Bank list from Czech National Bank
55
#
66
# Copyright (C) 2022 Petr Přikryl
7+
# Copyright (C) 2026 Arthur de Jong
78
#
89
# This library is free software; you can redistribute it and/or
910
# modify it under the terms of the GNU Lesser General Public
@@ -35,6 +36,10 @@
3536
download_url = 'https://www.cnb.cz/cs/platebni-styk/.galleries/ucty_kody_bank/download/kody_bank_CR.csv'
3637

3738

39+
# The user agent that will be passed in requests
40+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
41+
42+
3843
def get_values(csv_reader):
3944
"""Return values (bank_number, bic, bank_name, certis) from the CSV."""
4045
# skip first row (header)
@@ -48,7 +53,7 @@ def get_values(csv_reader):
4853

4954

5055
if __name__ == '__main__':
51-
response = requests.get(download_url, timeout=30)
56+
response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
5257
response.raise_for_status()
5358
csv_reader = csv.reader(StringIO(response.content.decode('utf-8')), delimiter=';')
5459
print('# generated from %s downloaded from' % os.path.basename(download_url))

update/do_whitelists.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
# update/do_whitelists.py - script to update do.rnc and do.cedula whitelists
55
#
6-
# Copyright (C) 2017-2019 Arthur de Jong
6+
# Copyright (C) 2017-2026 Arthur de Jong
77
#
88
# This library is free software; you can redistribute it and/or
99
# modify it under the terms of the GNU Lesser General Public
@@ -45,6 +45,10 @@
4545
download_url = 'https://www.dgii.gov.do/app/WebApps/Consultas/rnc/DGII_RNC.zip'
4646

4747

48+
# The user agent that will be passed in requests
49+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
50+
51+
4852
def handle_zipfile(f):
4953
"""Parse the ZIP file and return a set of invalid RNC and Cedula."""
5054
# collections of invalid numbers found
@@ -70,7 +74,7 @@ def handle_zipfile(f):
7074
# Download and read the ZIP file with valid data
7175
with tempfile.TemporaryFile() as tmp:
7276
# Download the zip file to a temporary file
73-
response = requests.get(download_url, stream=True, timeout=30)
77+
response = requests.get(download_url, stream=True, timeout=30, headers={'User-Agent': user_agent})
7478
response.raise_for_status()
7579
print('%s: %s' % (
7680
os.path.basename(download_url),

update/gs1_ai.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# update/gs1_ai.py - script to get GS1 application identifiers
44
#
5-
# Copyright (C) 2019-2025 Arthur de Jong
5+
# Copyright (C) 2019-2026 Arthur de Jong
66
#
77
# This library is free software; you can redistribute it and/or
88
# modify it under the terms of the GNU Lesser General Public
@@ -39,10 +39,7 @@
3939

4040
def fetch_ais():
4141
"""Download application identifiers from the GS1 website."""
42-
headers = {
43-
'User-Agent': user_agent,
44-
}
45-
response = requests.get(download_url, headers=headers, timeout=30)
42+
response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
4643
document = lxml.html.document_fromstring(response.content)
4744
element = document.findall('.//script[@type="application/ld+json"]')[0]
4845
data = json.loads(element.text)

update/iban.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# update/iban.py - script to download and parse data from the IBAN registry
44
#
5-
# Copyright (C) 2011-2019 Arthur de Jong
5+
# Copyright (C) 2011-2026 Arthur de Jong
66
#
77
# This library is free software; you can redistribute it and/or
88
# modify it under the terms of the GNU Lesser General Public
@@ -37,6 +37,10 @@
3737
download_url = 'https://www.swift.com/node/11971'
3838

3939

40+
# The user agent that will be passed in requests
41+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
42+
43+
4044
def get_country_codes(line):
4145
"""Return the list of country codes this line has."""
4246
# simplest case first
@@ -53,7 +57,7 @@ def get_country_codes(line):
5357
print(f'# generated from {os.path.basename(sys.argv[1])}')
5458
print(f'# downloaded from {download_url}')
5559
else:
56-
response = requests.get(download_url, timeout=30)
60+
response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
5761
response.raise_for_status()
5862
print('# generated from iban-registry_1.txt')
5963
print(f'# downloaded from {download_url}')

update/imsi.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# update/imsi.py - script to download from Wikipedia to build the database
44
#
5-
# Copyright (C) 2011-2022 Arthur de Jong
5+
# Copyright (C) 2011-2026 Arthur de Jong
66
#
77
# This library is free software; you can redistribute it and/or
88
# modify it under the terms of the GNU Lesser General Public
@@ -51,6 +51,10 @@
5151
# https://www.itu.int/net/ITU-T/inrdb/
5252

5353

54+
# The user agent that will be passed in requests
55+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
56+
57+
5458
cleanup_replacements = {
5559
'Anguilla (United Kingdom)': 'Anguilla',
5660
'Argentina|Argentine Republic': 'Argentina',
@@ -155,7 +159,7 @@ def get_mncs_from_wikipedia():
155159
for page in wikipedia_pages:
156160
url = 'https://en.wikipedia.org/w/index.php?title=%s&action=raw' % (
157161
page.replace(' ', '_'))
158-
response = requests.get(url, timeout=30)
162+
response = requests.get(url, timeout=30, headers={'User-Agent': user_agent})
159163
response.raise_for_status()
160164
country = cc = ''
161165
for line in response.iter_lines(decode_unicode=True):

update/isbn.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# update/isbn.py - script to get ISBN prefix data
44
#
5-
# Copyright (C) 2010-2019 Arthur de Jong
5+
# Copyright (C) 2010-2026 Arthur de Jong
66
#
77
# This library is free software; you can redistribute it and/or
88
# modify it under the terms of the GNU Lesser General Public
@@ -33,6 +33,10 @@
3333
download_url = 'https://www.isbn-international.org/export_rangemessage.xml'
3434

3535

36+
# The user agent that will be passed in requests
37+
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
38+
39+
3640
def ranges(group):
3741
"""Provide the ranges for the group."""
3842
for rule in group.findall('./Rules/Rule'):
@@ -56,7 +60,7 @@ def wrap(text):
5660
if __name__ == '__main__':
5761
print('# generated from RangeMessage.xml, downloaded from')
5862
print('# %s' % download_url)
59-
response = requests.get(download_url, timeout=30)
63+
response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
6064
response.raise_for_status()
6165

6266
# parse XML document

0 commit comments

Comments
 (0)