From a58b997bb328c8e31e7f28f47ff97f0d1cf28c06 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Tue, 18 Apr 2023 15:46:58 +1000 Subject: [PATCH 1/4] reduce Flake8 exceptions --- .flake8 | 4 ---- ckanext/xloader/jobs.py | 2 +- ckanext/xloader/loader.py | 1 - ckanext/xloader/parser.py | 2 -- ckanext/xloader/plugin.py | 1 - ckanext/xloader/tests/ckan_setup.py | 2 +- ckanext/xloader/tests/fixtures.py | 5 ++--- 7 files changed, 4 insertions(+), 13 deletions(-) diff --git a/.flake8 b/.flake8 index a4eea9e3..32068ca7 100644 --- a/.flake8 +++ b/.flake8 @@ -17,8 +17,4 @@ max-line-length=127 # List ignore rules one per line. ignore = - E501 - C901 W503 - F401 - F403 diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 4c4068f9..0d242db1 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -16,7 +16,7 @@ import sqlalchemy as sa from ckan import model -from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config, check_ckan_version +from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config from . import loader from . import db diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index afc3c980..dfddd0ff 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -14,7 +14,6 @@ from unidecode import unidecode import ckan.plugins as p -import ckan.plugins.toolkit as tk from .job_exceptions import FileCouldNotBeLoadedError, LoaderError from .parser import XloaderCSVParser diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index b2a6f889..b52c59a3 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -1,10 +1,8 @@ # -*- coding: utf-8 -*- import csv -from codecs import iterencode from decimal import Decimal, InvalidOperation from itertools import chain -import six from ckan.plugins.toolkit import asbool from dateutil.parser import isoparser, parser from dateutil.parser import ParserError diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index dbde8ed5..159b99de 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -6,7 +6,6 @@ from ckan.plugins import toolkit from . import action, auth, helpers as xloader_helpers, utils -from .loader import fulltext_function_exists, get_write_engine try: config_declarations = toolkit.blanket.config_declarations diff --git a/ckanext/xloader/tests/ckan_setup.py b/ckanext/xloader/tests/ckan_setup.py index ae8bfb3e..ff43d74c 100644 --- a/ckanext/xloader/tests/ckan_setup.py +++ b/ckanext/xloader/tests/ckan_setup.py @@ -1,5 +1,5 @@ try: - from ckan.tests.pytest_ckan.ckan_setup import * + from ckan.tests.pytest_ckan.ckan_setup import * # noqa except ImportError: import pkg_resources from paste.deploy import loadapp diff --git a/ckanext/xloader/tests/fixtures.py b/ckanext/xloader/tests/fixtures.py index f43916ab..9a7ad37f 100644 --- a/ckanext/xloader/tests/fixtures.py +++ b/ckanext/xloader/tests/fixtures.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -import sqlalchemy -import sqlalchemy.orm as orm +from sqlalchemy import orm import os from ckanext.datastore.tests import helpers as datastore_helpers @@ -11,7 +10,7 @@ ) try: - from ckan.tests.pytest_ckan.fixtures import * + from ckan.tests.pytest_ckan.fixtures import * # noqa except ImportError: import pytest From 7c433e2a0a820bd4be6415b1e0d095a23126e4cd Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Tue, 18 Apr 2023 15:50:33 +1000 Subject: [PATCH 2/4] fix fallback to 'str' type when no other types are guessed, #182 - Columns that used numeric on some rows and free text on others resulted in no type being guessed and an error --- .../tests/samples/mixed_numeric_string_sample.csv | 3 +++ ckanext/xloader/tests/test_loader.py | 12 ++++++++++++ ckanext/xloader/utils.py | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv diff --git a/ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv b/ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv new file mode 100644 index 00000000..7f59686c --- /dev/null +++ b/ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv @@ -0,0 +1,3 @@ +Funding agency,Program title,Maximum (indicative) grant amount +DTIS,Accessible Tourism Infrastructure Grants,Five hundred thousand dollars +DTIS,Boosting Accessible Tourism Experiences Grants,5000 diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index f31b663b..0241693d 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -612,6 +612,18 @@ def test_german(self, Session): u"tsvector", ] + [u"text"] * (len(records[0]) - 1) + def test_with_mixed_types(self, Session): + csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv") + resource_id = "test1" + factories.Resource(id=resource_id) + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert len(self._get_records(Session, "test1")) == 2 + def test_reload(self, Session): csv_filepath = get_sample_filepath("simple.csv") resource_id = "test1" diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index cbffaa2f..79facbea 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -175,10 +175,10 @@ def type_guess(rows, types=TYPES, strict=False): for ci, cell in enumerate(row): if not cell: continue - at_least_one_value[ci] = True for type in list(guesses[ci].keys()): if not isinstance(cell, type): guesses[ci].pop(type) + at_least_one_value[ci] = True if guesses[ci] else False # no need to set guessing weights before this # because we only accept a type if it never fails for i, guess in enumerate(guesses): From a1a5193d3f5a2f26c183fa0726ead0a8d75654c0 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Tue, 18 Apr 2023 15:52:51 +1000 Subject: [PATCH 3/4] fix parsing failures when converting empty strings to SQL types, #182 - 'timestamp' and 'numeric' cannot handle empty strings, so convert to None --- ckanext/xloader/loader.py | 7 +++++++ ckanext/xloader/tests/samples/sample_with_blanks.csv | 4 ++++ ckanext/xloader/tests/test_loader.py | 12 ++++++++++++ 3 files changed, 23 insertions(+) create mode 100644 ckanext/xloader/tests/samples/sample_with_blanks.csv diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index dfddd0ff..55c9cab5 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -317,9 +317,16 @@ def row_iterator(): logger.info('Copying to database...') count = 0 + # Some types cannot be stored as empty strings and must be converted to None, + # https://github.com/ckan/ckanext-xloader/issues/182 + non_empty_types = ['timestamp', 'numeric'] for i, records in enumerate(chunky(result, 250)): count += len(records) logger.info('Saving chunk {number}'.format(number=i)) + for row in records: + for column_index, column_name in enumerate(row): + if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '': + row[column_name] = None send_resource_to_datastore(resource_id, headers_dicts, records) logger.info('...copying done') diff --git a/ckanext/xloader/tests/samples/sample_with_blanks.csv b/ckanext/xloader/tests/samples/sample_with_blanks.csv new file mode 100644 index 00000000..2b7c415c --- /dev/null +++ b/ckanext/xloader/tests/samples/sample_with_blanks.csv @@ -0,0 +1,4 @@ +Funding agency,Program title,Opening date,Service ID +DTIS,Visitor First Experiences Fund,23/03/2023,63039 +DTIS,First Nations Sport and Recreation Program Round 2,22/03/2023,63040 +,,,63041 diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 0241693d..68452d11 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -612,6 +612,18 @@ def test_german(self, Session): u"tsvector", ] + [u"text"] * (len(records[0]) - 1) + def test_with_blanks(self, Session): + csv_filepath = get_sample_filepath("sample_with_blanks.csv") + resource_id = "test1" + factories.Resource(id=resource_id) + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert len(self._get_records(Session, "test1")) == 3 + def test_with_mixed_types(self, Session): csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv") resource_id = "test1" From d17e55f40f8fe0714f676ecb4bdf1aee39d4f514 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 29 May 2023 12:42:49 +1000 Subject: [PATCH 4/4] increase CSV sample size for identifying quote character, #182 - Messytables used to use 1000 rows, the Tabulator approach should do the same --- ckanext/xloader/loader.py | 5 +- ckanext/xloader/parser.py | 2 +- .../samples/sample_with_mixed_quotes.csv | 136 ++++++++++++++++++ ckanext/xloader/tests/test_loader.py | 24 ++++ 4 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 55c9cab5..2060a9ef 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -10,13 +10,13 @@ import psycopg2 from six.moves import zip -from tabulator import Stream, TabulatorException +from tabulator import config as tabulator_config, Stream, TabulatorException from unidecode import unidecode import ckan.plugins as p from .job_exceptions import FileCouldNotBeLoadedError, LoaderError -from .parser import XloaderCSVParser +from .parser import CSV_SAMPLE_LINES, XloaderCSVParser from .utils import headers_guess, type_guess from ckan.plugins.toolkit import config @@ -28,6 +28,7 @@ _drop_indexes = datastore_db._drop_indexes MAX_COLUMN_LENGTH = 63 +tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index b52c59a3..82539f4d 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -12,7 +12,7 @@ from ckan.plugins.toolkit import config -CSV_SAMPLE_LINES = 100 +CSV_SAMPLE_LINES = 1000 class XloaderCSVParser(Parser): diff --git a/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv b/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv new file mode 100644 index 00000000..a9527cf7 --- /dev/null +++ b/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv @@ -0,0 +1,136 @@ +Category,Category name,Priority,Initiative name,Investment objectives,Primary digital priority,Initiative stage,Actual start date,Approved end date,Date data current at,Percentage complete,Overall status,Project commencement allocation,Approved expenditure,Actual cost to date,Scope change event,Cost re-evaluation event,Delivery delay event,Project journey and reasons for variance,Learn more (URL) +DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Silly Walks project - Stage 2,"Lorum ipsum.",Collaboration,Delivery,01/07/1970,30/06/1971,31/03/1971,41,G,5633000,5739000,2352000,N,N,N,"As at 31 March 1971 +- Overall 'green' (on track) status +- Revised user journey following results of Silly Walk UX/UI testing +- Transition to support progressing with documentation and walk-through of the solution. +- Ongoing high levels of silly walk usage reflecting the success of search engine marketing. Silly walk focused campaign to further increase awareness and usage is being finalised. + +As at 28 February 1971 +- Overall 'green' (on track) status +- Results of Silly Walk UX/UI testing is guiding development of the revised user journey. +- Silly Walk transition to BAU support continuing with workshops, showcases and handover documentation. +- Silly Walk usage is increasing + +As at 31 January 1971 +- Continued amber status [closely monitored] with risks under management +- Search Engine Marketing -'Always On' yielding good results with continued increase in users and the proportion benefitting from Silly Walk +- Good progress on development of revised Silly Walk user journey. + +As at 31 December 1970 +Status AMBER [Closely monitored] +- Search Engine Marketing commenced 19 December 1970 and already showing increased users and proportion of customers benefitting from Silly Walk +- External assurance review completed and reported 'green' rating for confidence of delivery. + +As at 30 November 1970 +- Continued amber status pending risk management +- Marketing to commence to increase awareness of platform +- Good progress on development of revised user journey + +As at 31 October 1970 +Status AMBER [Closely monitored] +- Silly Walk Stage 2 continue reporting amber status reflective of ongoing high-level risks associated with demand-driven labour-market conditions and planned transition to support. +- Communications and engagement are in progress. +- The revised user journey continues development and testing. This is planned to be ready for release in the first quarter of 1971. As at 30 September 1970 +Status AMBER [Closely monitored] +Project journey events: +- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress. +- Silly Walk industries expanded to include all industries. +- Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion to encompass all industries. + +As at 31 August 1970 +Status GREEN [On track] +The project is reporting green overall. Ongoing resourcing risk will continue to be monitored and managed for the life of the project, due to a tight labour market. +Project journey events: +- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress. +- Further analysis of June/July 1970 marketing campaign has offered recommendations for consideration, to improve target audience awareness and Silly Walk uptake. +- Silly Walk industries expanded to include Retail Trade, Accommodation and Non-residential Construction industries finalised. +- Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion with three additional industries. + +As at 31 July 1970 +Status AMBER [Closely monitored] +The project is continuing to report amber overall mainly due to ongoing resourcing challenges. +Project journey events: +- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness, is progressing. +- Analysis of a major marketing campaign conducted in June/July 1970 showed a significant step-up in number of Silly Walk users. +- The target of 95% of Circus population coverage was met in June 1970 with 100% of Circus population now covered on Silly Walk. +- Agency engagement for extension industries has commenced. + +As at 1 July 1970 +Silly Walk commenced work on expanding industries to include Retail Trade, Accommodation and Non-residential Construction industries. + +As at June 1970 +Stage 2 of the project is commencing and will build up the solution delivered in Silly Walk Stage 1. Customer journey will be revised in line with outcome of customer testing. The increased coverage target of at least 95% of the Circus population was met in June 1970, with all local governments included on Silly Walk. Benefits realisation through marketing and promotion of Silly Walk.",https://example.com +DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Flying Circus Modernisation and Transformation Program - Tranche 1,"The Flying Circus Modernisation and Transformation (FCMT) Program seeks to reduce the risks associated with department legacy systems by delivering contemporary, consolidated, integrated, user-friendly applications to support delivery of Flying Circus outcomes. To optimise the technical capabilities of the new solutions, engagement with business teams in the review and development of business processes is a priority. ",Trust,Delivery,01/07/1969,31/08/1971,28/02/1971,52,G,8692200,9614968,4961147,Y,Y,Y,"As at 28 February 1971 +- Tranche 1 FCMT projects continue on schedule and on budget for Tranche 1 completion by 31 August 1971. +- Customer Engagement and Contract Establishment projects continue to progress focusing on delivery activities for new CRM and Portal enhancements. +- FCMT Tranche 2 Business Case tracking for completion April 1971. + +As at 31 January 1971 +- FCMT Projects continue to track to schedule and on budget for Tranche 1 completion 31 August 1971. +- Customer Engagement and Contract Establishment Projects progressing well with delivery activities for new CRM and Portal enhancements. + +As at 31 December 1970 +Status GREEN +- FCMT projects continuing to track to board endorsed updated schedule and on budget for Tranche 1 completion on 31 August 1971. +- Customer Engagement and Contract Establishment projects completed partner onboarding and delivery activities underway. +- Planning in progress for Tranche 2, focusing on remaining legacy systems for planned commencement at completion of Tranch 1. + +As at 30 November 1970 +Status GREEN +- Tranche 1 delivery date extended to 31 August 1971 due to CRM vendor procurement delays and subsequent additional time requirements for build completion and testing of new CRM. +- All projects maintaining momentum and progressing to revised schedule within budget. + +As at 31 October 1970 +Status GREEN +-New 'Partner Portal' Digital Channel continues to perform well with 3516 registered, active, external users from 634 different organisations. Update release being planned for January 1971. +-SkillsCRM (CEP Project) delivery partner on-boarded and formal delivery stage commenced. +-Contract Establishment and Variation (CEV PRoject) continuing delivery partner select with a view to commencing prior to end of December 1970. + +As at 30 September 1970 Status GREEN. +The FCMT 'Partner Portal' solution was successfully launched on the 17 August 1970. The decommissioning of the outdated legacy application, 'WalkConnect', has completed. Work is now increasing on the next Flying Circus systems to be replaced, SkillsCRM (via the Customer Engagement Project) and Policy on Line (via the Contract Establishment and Variation Project). +Project Journey Events: +- Partner Portal. After the successful launch of Partner Portal and decommissioning of WalkConnect, the transition to BAU is underway with the Project team continuing to support business until BAU transition is completed. +- Data, Infrastructure and Reporting. +New 'Data Lake' infrastructure built. Data ingestion processes being trialled. QTS report requirement gathering underway which will showcase new capability once completed. Compliance tool SMCM successfully launched September 30. +-Customer Engagement Project (CEP). Completed assurance reviews successfully. Delivery partner selection completed. Partner and formal delivery stage due to start 18 October 1970. Ramp up of activities continuing with business demonstrations of CRM proof of concept. +-Contract Establishment and Variation (CEV). +Requirements gathering completed. Delivery partner selection process commenced. 'As is' process documentation underway. + +As at 31 August 1970 +Status GREEN. The project remains on track. Successful launch of new secure 'Partner Portal' Digital Channel for Flying Circus related organisations occurred 17 August 1970. + +Current Projects underway: +- Partner Portal. Go-live occurred on track 17 August 1970. All registered Flying Circus organisations now able to use the portal to access key applications and send information to DDSSHHESW via secure channel. Enhanced support being provided for 6 weeks. Legacy system decommissioning underway. +- Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) continuing and requirement gathering of first report planned to use new capabilites commenced. +- Customer Services Hub (CRM). Implementation partner selection complete. Solution delivery activities due to start by end September 1970. +- Contract Engagement and Variation. Requirements gathering complete and partner selection process to commence by end September 1970. + +As at 31 July 1970 +Status GREEN + +Project journey events: +Implementation of next changes to FCMT applications remain on track for August 1970 with full launch of new secure Partner Portal Digital Channel for Flying Circus related organisations. +FCMT Program scope adjusted to include additional at risk system decommission activties during this financial year. Approved expenditure updated to align with revised scope. + +Current Projects underway +- Partner Portal. Opened for registrations 4 July 1970. Majority of Flying Circus related organisation now registered. Full access (go-live) on track to commence 17 August 1970. Legacy system to be disabled and decommissioned September 1970. +- Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) underway with population and work on first report to commence in September. +- Customer Services Hub (CRM). Requirements confirmed and partner selection underway. Work on legacy CRM replacement due to start September/October 1970. +- Contract Engagement and Variation. Requirements gathering and new process design activities in progress. + +15 May 1970 Update +Status GREEN + +Implementation of next changes to Flying Circus applications on track for August 1970 with introduction of new secure 'Silly Portal' Digital Channel for Flying Circus related organisations. + +Projects Completed +-Database consolidation - key databases transitioned to supported versions and platforms. Completed November 1969. +-System to System Integration platform. Completed 9 May 1970. + +Current projects underway +-Partner Portal secure digital channel, in final testing. Pilot successfully complete and on track for release in August 1970. +Projects in startup +-Data, Infrastructure and Reporting, planning underway. +-Customer Services Hub (CRM), planning underway. +-Contract Engagement and Variation, planning underway. +-Planning continues for Tranche 2.",https://example.com diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 68452d11..14b54eeb 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -624,6 +624,18 @@ def test_with_blanks(self, Session): ) assert len(self._get_records(Session, "test1")) == 3 + def test_with_mixed_quotes(self, Session): + csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv") + resource_id = "test1" + factories.Resource(id=resource_id) + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert len(self._get_records(Session, "test1")) == 2 + def test_with_mixed_types(self, Session): csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv") resource_id = "test1" @@ -1159,3 +1171,15 @@ def test_no_entries(self): mimetype="csv", logger=logger, ) + + def test_with_mixed_quotes(self, Session): + csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv") + resource_id = "test1" + factories.Resource(id=resource_id) + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert len(self._get_records(Session, "test1")) == 2