Consistently pass a user agent for update scripts

arthurdejong · arthurdejong · commit a7fd30071cd3 · 2026-01-04T18:08:52.000+01:00
The scourge of AI bots is forcing more and more sites to block clients
that don't send particular user agents, so every request made by the
update scripts now passes an explicit User-Agent header.
diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py
@@ -3,7 +3,7 @@
 
 # update/at_postleitzahl.py - download list of Austrian postal codes
 #
-# Copyright (C) 2018-2025 Arthur de Jong
+# Copyright (C) 2018-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -31,6 +31,10 @@
 download_url = 'https://data.rtr.at/api/v1/tables/plz.json'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 # The list of regions that can be used in the document.
 regions = {
     'B': 'Burgenland',
@@ -46,7 +50,7 @@
 
 
 if __name__ == '__main__':
-    response = requests.get(download_url, timeout=30)
+    response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
     response.raise_for_status()
     data = response.json()
     # print header
diff --git a/update/be_banks.py b/update/be_banks.py
@@ -35,6 +35,10 @@
 download_url = 'https://www.nbb.be/doc/be/be/protocol/grouped_list_current.xlsx'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 # List of values that refer to non-existing, reserved or otherwise not-
 # allocated entries.
 not_applicable_values = (
@@ -79,7 +83,7 @@ def get_values(sheet):
 
 
 if __name__ == '__main__':
-    response = requests.get(download_url, timeout=30)
+    response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
     response.raise_for_status()
     workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True)
     sheet = workbook.worksheets[0]
diff --git a/update/cfi.py b/update/cfi.py
@@ -2,7 +2,7 @@
 
 # update/cfi.py - script to download CFI code list from the SIX group
 #
-# Copyright (C) 2022-2025 Arthur de Jong
+# Copyright (C) 2022-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -33,6 +33,10 @@
 download_url = 'https://www.six-group.com/en/products-services/financial-information/data-standards.html'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 def normalise(value):
     """Clean and minimise attribute names and values."""
     return re.sub(r' *[(\[\n].*', '', value, flags=re.MULTILINE).strip()
@@ -76,14 +80,14 @@ def print_attributes(attributes, index=0):
 
 if __name__ == '__main__':
     # Download the page that contains the link to the current XLS file
-    response = requests.get(download_url, timeout=30)
+    response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
     response.raise_for_status()
     # Find the download link
     document = lxml.html.document_fromstring(response.content)
     links = [a.get('href') for a in document.findall('.//a[@href]')]
     link_url = next(a for a in links if re.match(r'.*/cfi/.*xlsx?$', a))
     # Download and parse the spreadsheet
-    response = requests.get(link_url, timeout=30)
+    response = requests.get(link_url, timeout=30, headers={'User-Agent': user_agent})
     response.raise_for_status()
     workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True)
 
diff --git a/update/cn_loc.py b/update/cn_loc.py
@@ -3,7 +3,7 @@
 # update/cn_loc.py - script to fetch data from the CN Open Data community
 #
 # Copyright (C) 2014-2015 Jiangge Zhang
-# Copyright (C) 2015-2025 Arthur de Jong
+# Copyright (C) 2015-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -33,6 +33,10 @@
 wikipedia_pages = [f'中华人民共和国行政区划代码 ({i}区)' for i in range(1, 9)]
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 def get_wikipedia_url(page):
     """Get the Simplified Chinese Wikipedia page URL."""
     return f'https://zh.wikipedia.org/w/index.php?title={page.replace(" ", "_")}&action=raw'  # noqa: E231
@@ -128,7 +132,7 @@ def parse_page(content):
     provinces = {}
     numbers = defaultdict(lambda: defaultdict(list))
     for page in wikipedia_pages:
-        response = requests.get(get_wikipedia_url(page), timeout=30)
+        response = requests.get(get_wikipedia_url(page), timeout=30, headers={'User-Agent': user_agent})
         response.raise_for_status()
         for prefix, province, number, county in parse_page(response.text):
             provinces[prefix] = province
diff --git a/update/cz_banks.py b/update/cz_banks.py
@@ -4,6 +4,7 @@
 # update/cz_banks.py - script to download Bank list from Czech National Bank
 #
 # Copyright (C) 2022 Petr Přikryl
+# Copyright (C) 2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -35,6 +36,10 @@
 download_url = 'https://www.cnb.cz/cs/platebni-styk/.galleries/ucty_kody_bank/download/kody_bank_CR.csv'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 def get_values(csv_reader):
     """Return values (bank_number, bic, bank_name, certis) from the CSV."""
     # skip first row (header)
@@ -48,7 +53,7 @@ def get_values(csv_reader):
 
 
 if __name__ == '__main__':
-    response = requests.get(download_url, timeout=30)
+    response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
     response.raise_for_status()
     csv_reader = csv.reader(StringIO(response.content.decode('utf-8')), delimiter=';')
     print('# generated from %s downloaded from' % os.path.basename(download_url))
diff --git a/update/do_whitelists.py b/update/do_whitelists.py
@@ -3,7 +3,7 @@
 
 # update/do_whitelists.py - script to update do.rnc and do.cedula whitelists
 #
-# Copyright (C) 2017-2019 Arthur de Jong
+# Copyright (C) 2017-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -45,6 +45,10 @@
 download_url = 'https://www.dgii.gov.do/app/WebApps/Consultas/rnc/DGII_RNC.zip'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 def handle_zipfile(f):
     """Parse the ZIP file and return a set of invalid RNC and Cedula."""
     # collections of invalid numbers found
@@ -70,7 +74,7 @@ def handle_zipfile(f):
     # Download and read the ZIP file with valid data
     with tempfile.TemporaryFile() as tmp:
         # Download the zip file to a temporary file
-        response = requests.get(download_url, stream=True, timeout=30)
+        response = requests.get(download_url, stream=True, timeout=30, headers={'User-Agent': user_agent})
         response.raise_for_status()
         print('%s: %s' % (
             os.path.basename(download_url),
diff --git a/update/gs1_ai.py b/update/gs1_ai.py
@@ -2,7 +2,7 @@
 
 # update/gs1_ai.py - script to get GS1 application identifiers
 #
-# Copyright (C) 2019-2025 Arthur de Jong
+# Copyright (C) 2019-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -39,10 +39,7 @@
 
 def fetch_ais():
     """Download application identifiers frm the GS1 website."""
-    headers = {
-        'User-Agent': user_agent,
-    }
-    response = requests.get(download_url, headers=headers, timeout=30)
+    response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
     document = lxml.html.document_fromstring(response.content)
     element = document.findall('.//script[@type="application/ld+json"]')[0]
     data = json.loads(element.text)
diff --git a/update/iban.py b/update/iban.py
@@ -2,7 +2,7 @@
 
 # update/iban.py - script to download and parse data from the IBAN registry
 #
-# Copyright (C) 2011-2019 Arthur de Jong
+# Copyright (C) 2011-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -37,6 +37,10 @@
 download_url = 'https://www.swift.com/node/11971'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 def get_country_codes(line):
     """Return the list of country codes this line has."""
     # simplest case first
@@ -53,7 +57,7 @@ def get_country_codes(line):
         print(f'# generated from {os.path.basename(sys.argv[1])}')
         print(f'# downloaded from {download_url}')
     else:
-        response = requests.get(download_url, timeout=30)
+        response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
         response.raise_for_status()
         print('# generated from iban-registry_1.txt')
         print(f'# downloaded from {download_url}')
diff --git a/update/imsi.py b/update/imsi.py
@@ -2,7 +2,7 @@
 
 # update/imsi.py - script to download from Wikipedia to build the database
 #
-# Copyright (C) 2011-2022 Arthur de Jong
+# Copyright (C) 2011-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -51,6 +51,10 @@
 # https://www.itu.int/net/ITU-T/inrdb/
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 cleanup_replacements = {
     'Anguilla (United Kingdom)': 'Anguilla',
     'Argentina|Argentine Republic': 'Argentina',
@@ -155,7 +159,7 @@ def get_mncs_from_wikipedia():
     for page in wikipedia_pages:
         url = 'https://en.wikipedia.org/w/index.php?title=%s&action=raw' % (
             page.replace(' ', '_'))
-        response = requests.get(url, timeout=30)
+        response = requests.get(url, timeout=30, headers={'User-Agent': user_agent})
         response.raise_for_status()
         country = cc = ''
         for line in response.iter_lines(decode_unicode=True):
diff --git a/update/isbn.py b/update/isbn.py
@@ -2,7 +2,7 @@
 
 # update/isbn.py - script to get ISBN prefix data
 #
-# Copyright (C) 2010-2019 Arthur de Jong
+# Copyright (C) 2010-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -33,6 +33,10 @@
 download_url = 'https://www.isbn-international.org/export_rangemessage.xml'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 def ranges(group):
     """Provide the ranges for the group."""
     for rule in group.findall('./Rules/Rule'):
@@ -56,7 +60,7 @@ def wrap(text):
 if __name__ == '__main__':
     print('# generated from RangeMessage.xml, downloaded from')
     print('# %s' % download_url)
-    response = requests.get(download_url, timeout=30)
+    response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
     response.raise_for_status()
 
     # parse XML document
diff --git a/update/isil.py b/update/isil.py
@@ -2,7 +2,7 @@
 
 # update/isil.py - script to download ISIL agencies
 #
-# Copyright (C) 2011-2025 Arthur de Jong
+# Copyright (C) 2011-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -35,14 +35,18 @@
 download_url = 'https://slks.dk/english/work-areas/libraries-and-literature/library-standards/isil'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 def clean(td):
     """Clean up the element removing unneeded stuff from it."""
     s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
     return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip()
 
 
 if __name__ == '__main__':
-    response = requests.get(download_url, timeout=30)
+    response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
     response.raise_for_status()
     print('# generated from ISIL Registration Authority, downloaded from')
     print('# %s' % download_url)
diff --git a/update/nz_banks.py b/update/nz_banks.py
@@ -3,7 +3,7 @@
 
 # update/nz_banks.py - script to download Bank list from Bank Branch Register
 #
-# Copyright (C) 2019-2024 Arthur de Jong
+# Copyright (C) 2019-2026 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -35,6 +35,10 @@
 download_url = 'https://www.paymentsnz.co.nz/resources/industry-registers/bank-branch-register/download/xlsx/'
 
 
+# The user agent that will be passed in requests
+user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'
+
+
 def get_values(sheet):
     """Return rows from the worksheet as a dict per row."""
     rows = sheet.iter_rows()
@@ -67,7 +71,7 @@ def branch_list(branches):
 
 if __name__ == '__main__':
     # parse the download as an XLS
-    response = requests.get(download_url, timeout=30)
+    response = requests.get(download_url, timeout=30, headers={'User-Agent': user_agent})
     response.raise_for_status()
     content_disposition = response.headers.get('content-disposition', '')
     filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')