From 26fd25b32a5d4377d2b17e22adc2db0d27503f93 Mon Sep 17 00:00:00 2001 From: Arthur de Jong Date: Sun, 17 Mar 2024 17:34:26 +0100 Subject: [PATCH] Switch to using openpyxl for parsing XLSX files The xlrd has dropped support for parsing XLSX files. We still use xlrd for update/be_banks.py because they use the classic XLS format and openpyxl does not support that format. --- setup.cfg | 1 + update/cfi.py | 21 +++++++++++---------- update/nz_banks.py | 16 ++++++---------- update/requirements.txt | 1 + 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/setup.cfg b/setup.cfg index 757a9f19..afd45e7c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,5 +47,6 @@ lines_after_imports = 2 multi_line_output = 4 known_third_party = lxml + openpyxl requests xlrd diff --git a/update/cfi.py b/update/cfi.py index e47e6bc3..f5431a89 100755 --- a/update/cfi.py +++ b/update/cfi.py @@ -2,7 +2,7 @@ # update/cfi.py - script to download CFI code list from the SIX group # -# Copyright (C) 2022 Arthur de Jong +# Copyright (C) 2022-2024 Arthur de Jong # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -21,11 +21,12 @@ """This script downloads the list of CFI codes as published by the SIX group.""" +import io import re import lxml.html +import openpyxl import requests -import xlrd # the location of the Statistical Classification file @@ -39,8 +40,8 @@ def normalise(value): def get_categories(sheet): """Get the list of top-level CFI categories.""" - for row in sheet.get_rows(): - if len(row[0].value) == 1 and row[1].value: + for row in sheet.iter_rows(): + if row[0].value and len(row[0].value) == 1 and row[1].value: yield (row[0].value, row[1].value) @@ -49,7 +50,7 @@ def get_attributes(sheet): attribute = None value_list = [] values = None - for row in sheet.get_rows(): + for row in sheet.iter_rows(): if row[0].value and not row[1].value and row[2].value: attribute = normalise(row[2].value) values = [] @@ -84,15 +85,15 @@ def print_attributes(attributes, index=0): # Download and parse the spreadsheet response = requests.get(link_url, timeout=30) response.raise_for_status() - workbook = xlrd.open_workbook(file_contents=response.content) + workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True) print('# generated from %s, downloaded from' % link_url.split('/')[-1]) print('# %s' % download_url) - groups = sorted(x for x in workbook.sheet_names() if len(x) == 6 and x.endswith('XXXX')) - for category, name in sorted(get_categories(workbook.sheet_by_name('Categories'))): + groups = sorted(x for x in workbook.sheetnames if len(x) == 6 and x.endswith('XXXX')) + for category, name in sorted(get_categories(workbook['Categories'])): print('%s category="%s"' % (category, name)) for group in (x for x in groups if x.startswith(category)): - sheet = workbook.sheet_by_name(group) - print(' %s group="%s"' % (group[1], normalise(sheet.cell(0, 0).value))) + sheet = workbook[group] + print(' %s group="%s"' % (group[1], normalise(sheet.cell(1, 1).value))) print_attributes(get_attributes(sheet)) diff --git a/update/nz_banks.py b/update/nz_banks.py index 816930f0..af1a4334 100755 --- a/update/nz_banks.py +++ b/update/nz_banks.py @@ -3,7 +3,7 @@ # update/nz_banks.py - script to download Bank list from Bank Branch Register # -# Copyright (C) 2019-2021 Arthur de Jong +# Copyright (C) 2019-2024 Arthur de Jong # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -23,16 +23,12 @@ """This script downloads the list of banks with bank codes as used in the New Zealand bank account numbers.""" +import io import re from collections import OrderedDict, defaultdict +import openpyxl import requests -import xlrd - - -# Monkey patch xlrd avoiding bug in combination with Python 3.9 -xlrd.xlsx.ensure_elementtree_imported(False, None) -xlrd.xlsx.Element_has_iter = True # The page that contains a link to the latest XLS version of the codes. @@ -41,7 +37,7 @@ def get_values(sheet): """Return rows from the worksheet as a dict per row.""" - rows = sheet.get_rows() + rows = sheet.iter_rows() # the first row has column names columns = [column.value.lower().replace(' ', '_') for column in next(rows)] # go over rows with values @@ -75,8 +71,8 @@ def branch_list(branches): response.raise_for_status() content_disposition = response.headers.get('content-disposition', '') filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"') - workbook = xlrd.open_workbook(file_contents=response.content) - sheet = workbook.sheet_by_index(0) + workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True) + sheet = workbook.worksheets[0] # print header print('# generated from %s downloaded from' % filename) print('# %s' % download_url) diff --git a/update/requirements.txt b/update/requirements.txt index 16e367c9..b51c6849 100644 --- a/update/requirements.txt +++ b/update/requirements.txt @@ -1,3 +1,4 @@ lxml +openpyxl requests xlrd