Skip to content

Commit

Permalink
Switch to using openpyxl for parsing XLSX files
Browse files Browse the repository at this point in the history
The xlrd library has dropped support for parsing XLSX files. We still use xlrd
for update/be_banks.py because its input is still in the classic XLS format,
which openpyxl does not support.
  • Loading branch information
arthurdejong committed Mar 17, 2024
1 parent 9230604 commit 26fd25b
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 20 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,6 @@ lines_after_imports = 2
multi_line_output = 4
known_third_party =
lxml
openpyxl
requests
xlrd
21 changes: 11 additions & 10 deletions update/cfi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# update/cfi.py - script to download CFI code list from the SIX group
#
# Copyright (C) 2022 Arthur de Jong
# Copyright (C) 2022-2024 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
Expand All @@ -21,11 +21,12 @@

"""This script downloads the list of CFI codes as published by the SIX group."""

import io
import re

import lxml.html
import openpyxl
import requests
import xlrd


# the location of the Statistical Classification file
Expand All @@ -39,8 +40,8 @@ def normalise(value):

def get_categories(sheet):
"""Get the list of top-level CFI categories."""
for row in sheet.get_rows():
if len(row[0].value) == 1 and row[1].value:
for row in sheet.iter_rows():
if row[0].value and len(row[0].value) == 1 and row[1].value:
yield (row[0].value, row[1].value)


Expand All @@ -49,7 +50,7 @@ def get_attributes(sheet):
attribute = None
value_list = []
values = None
for row in sheet.get_rows():
for row in sheet.iter_rows():
if row[0].value and not row[1].value and row[2].value:
attribute = normalise(row[2].value)
values = []
Expand Down Expand Up @@ -84,15 +85,15 @@ def print_attributes(attributes, index=0):
# Download and parse the spreadsheet
response = requests.get(link_url, timeout=30)
response.raise_for_status()
workbook = xlrd.open_workbook(file_contents=response.content)
workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True)

print('# generated from %s, downloaded from' % link_url.split('/')[-1])
print('# %s' % download_url)

groups = sorted(x for x in workbook.sheet_names() if len(x) == 6 and x.endswith('XXXX'))
for category, name in sorted(get_categories(workbook.sheet_by_name('Categories'))):
groups = sorted(x for x in workbook.sheetnames if len(x) == 6 and x.endswith('XXXX'))
for category, name in sorted(get_categories(workbook['Categories'])):
print('%s category="%s"' % (category, name))
for group in (x for x in groups if x.startswith(category)):
sheet = workbook.sheet_by_name(group)
print(' %s group="%s"' % (group[1], normalise(sheet.cell(0, 0).value)))
sheet = workbook[group]
print(' %s group="%s"' % (group[1], normalise(sheet.cell(1, 1).value)))
print_attributes(get_attributes(sheet))
16 changes: 6 additions & 10 deletions update/nz_banks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# update/nz_banks.py - script to download Bank list from Bank Branch Register
#
# Copyright (C) 2019-2021 Arthur de Jong
# Copyright (C) 2019-2024 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
Expand All @@ -23,16 +23,12 @@
"""This script downloads the list of banks with bank codes as used in the
New Zealand bank account numbers."""

import io
import re
from collections import OrderedDict, defaultdict

import openpyxl
import requests
import xlrd


# Monkey patch xlrd avoiding bug in combination with Python 3.9
xlrd.xlsx.ensure_elementtree_imported(False, None)
xlrd.xlsx.Element_has_iter = True


# The page that contains a link to the latest XLS version of the codes.
Expand All @@ -41,7 +37,7 @@

def get_values(sheet):
"""Return rows from the worksheet as a dict per row."""
rows = sheet.get_rows()
rows = sheet.iter_rows()
# the first row has column names
columns = [column.value.lower().replace(' ', '_') for column in next(rows)]
# go over rows with values
Expand Down Expand Up @@ -75,8 +71,8 @@ def branch_list(branches):
response.raise_for_status()
content_disposition = response.headers.get('content-disposition', '')
filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')
workbook = xlrd.open_workbook(file_contents=response.content)
sheet = workbook.sheet_by_index(0)
workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True)
sheet = workbook.worksheets[0]
# print header
print('# generated from %s downloaded from' % filename)
print('# %s' % download_url)
Expand Down
1 change: 1 addition & 0 deletions update/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
lxml
openpyxl
requests
xlrd

0 comments on commit 26fd25b

Please sign in to comment.