Switch to using openpyxl for parsing XLSX files

The xlrd has dropped support for parsing XLSX files. We still use xlrd for update/be_banks.py because they use the classic XLS format and openpyxl does not support that format.
arthurdejong · Mar 17, 2024 · 26fd25b · 26fd25b
1 parent 9230604
commit 26fd25b
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 20 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -47,5 +47,6 @@ lines_after_imports = 2
 multi_line_output = 4
 known_third_party =
   lxml
+  openpyxl
   requests
   xlrd
diff --git a/update/cfi.py b/update/cfi.py
@@ -2,7 +2,7 @@
 
 # update/cfi.py - script to download CFI code list from the SIX group
 #
-# Copyright (C) 2022 Arthur de Jong
+# Copyright (C) 2022-2024 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -21,11 +21,12 @@
 
 """This script downloads the list of CFI codes as published by the SIX group."""
 
+import io
 import re
 
 import lxml.html
+import openpyxl
 import requests
-import xlrd
 
 
 # the location of the Statistical Classification file
@@ -39,8 +40,8 @@ def normalise(value):
 
 def get_categories(sheet):
     """Get the list of top-level CFI categories."""
-    for row in sheet.get_rows():
-        if len(row[0].value) == 1 and row[1].value:
+    for row in sheet.iter_rows():
+        if row[0].value and len(row[0].value) == 1 and row[1].value:
             yield (row[0].value, row[1].value)
 
 
@@ -49,7 +50,7 @@ def get_attributes(sheet):
     attribute = None
     value_list = []
     values = None
-    for row in sheet.get_rows():
+    for row in sheet.iter_rows():
         if row[0].value and not row[1].value and row[2].value:
             attribute = normalise(row[2].value)
             values = []
@@ -84,15 +85,15 @@ def print_attributes(attributes, index=0):
     # Download and parse the spreadsheet
     response = requests.get(link_url, timeout=30)
     response.raise_for_status()
-    workbook = xlrd.open_workbook(file_contents=response.content)
+    workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True)
 
     print('# generated from %s, downloaded from' % link_url.split('/')[-1])
     print('# %s' % download_url)
 
-    groups = sorted(x for x in workbook.sheet_names() if len(x) == 6 and x.endswith('XXXX'))
-    for category, name in sorted(get_categories(workbook.sheet_by_name('Categories'))):
+    groups = sorted(x for x in workbook.sheetnames if len(x) == 6 and x.endswith('XXXX'))
+    for category, name in sorted(get_categories(workbook['Categories'])):
         print('%s category="%s"' % (category, name))
         for group in (x for x in groups if x.startswith(category)):
-            sheet = workbook.sheet_by_name(group)
-            print(' %s group="%s"' % (group[1], normalise(sheet.cell(0, 0).value)))
+            sheet = workbook[group]
+            print(' %s group="%s"' % (group[1], normalise(sheet.cell(1, 1).value)))
             print_attributes(get_attributes(sheet))
diff --git a/update/nz_banks.py b/update/nz_banks.py
@@ -3,7 +3,7 @@
 
 # update/nz_banks.py - script to download Bank list from Bank Branch Register
 #
-# Copyright (C) 2019-2021 Arthur de Jong
+# Copyright (C) 2019-2024 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -23,16 +23,12 @@
 """This script downloads the list of banks with bank codes as used in the
 New Zealand bank account numbers."""
 
+import io
 import re
 from collections import OrderedDict, defaultdict
 
+import openpyxl
 import requests
-import xlrd
-
-
-# Monkey patch xlrd avoiding bug in combination with Python 3.9
-xlrd.xlsx.ensure_elementtree_imported(False, None)
-xlrd.xlsx.Element_has_iter = True
 
 
 # The page that contains a link to the latest XLS version of the codes.
@@ -41,7 +37,7 @@
 
 def get_values(sheet):
     """Return rows from the worksheet as a dict per row."""
-    rows = sheet.get_rows()
+    rows = sheet.iter_rows()
     # the first row has column names
     columns = [column.value.lower().replace(' ', '_') for column in next(rows)]
     # go over rows with values
@@ -75,8 +71,8 @@ def branch_list(branches):
     response.raise_for_status()
     content_disposition = response.headers.get('content-disposition', '')
     filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')
-    workbook = xlrd.open_workbook(file_contents=response.content)
-    sheet = workbook.sheet_by_index(0)
+    workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True)
+    sheet = workbook.worksheets[0]
     # print header
     print('# generated from %s downloaded from' % filename)
     print('# %s' % download_url)

diff --git a/update/requirements.txt b/update/requirements.txt
@@ -1,3 +1,4 @@
 lxml
+openpyxl
 requests
 xlrd