Skip to content

Commit

Permalink
Merge pull request #186 from qld-gov-au/182-type-guessing-fixes
Browse files Browse the repository at this point in the history
#182 Type guessing fixes

These changes have been further tested in the `open-data` fork.
  • Loading branch information
ThrawnCA authored Jan 29, 2024
2 parents 4f3864f + d17e55f commit 1ee470a
Show file tree
Hide file tree
Showing 12 changed files with 207 additions and 17 deletions.
4 changes: 0 additions & 4 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,4 @@ max-line-length=127

# List ignore rules one per line.
ignore =
E501
C901
W503
F401
F403
2 changes: 1 addition & 1 deletion ckanext/xloader/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import sqlalchemy as sa

from ckan import model
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config, check_ckan_version
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config

from . import loader
from . import db
Expand Down
13 changes: 10 additions & 3 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,13 @@

import psycopg2
from six.moves import zip
from tabulator import Stream, TabulatorException
from tabulator import config as tabulator_config, Stream, TabulatorException
from unidecode import unidecode

import ckan.plugins as p
import ckan.plugins.toolkit as tk

from .job_exceptions import FileCouldNotBeLoadedError, LoaderError
from .parser import XloaderCSVParser
from .parser import CSV_SAMPLE_LINES, XloaderCSVParser
from .utils import headers_guess, type_guess

from ckan.plugins.toolkit import config
Expand All @@ -29,6 +28,7 @@
_drop_indexes = datastore_db._drop_indexes

MAX_COLUMN_LENGTH = 63
tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES


def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
Expand Down Expand Up @@ -318,9 +318,16 @@ def row_iterator():

logger.info('Copying to database...')
count = 0
# Some types cannot be stored as empty strings and must be converted to None,
# https://github.com/ckan/ckanext-xloader/issues/182
non_empty_types = ['timestamp', 'numeric']
for i, records in enumerate(chunky(result, 250)):
count += len(records)
logger.info('Saving chunk {number}'.format(number=i))
for row in records:
for column_index, column_name in enumerate(row):
if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '':
row[column_name] = None
send_resource_to_datastore(resource_id, headers_dicts, records)
logger.info('...copying done')

Expand Down
4 changes: 1 addition & 3 deletions ckanext/xloader/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
import csv
from codecs import iterencode
from decimal import Decimal, InvalidOperation
from itertools import chain

import six
from ckan.plugins.toolkit import asbool
from dateutil.parser import isoparser, parser
from dateutil.parser import ParserError
Expand All @@ -14,7 +12,7 @@

from ckan.plugins.toolkit import config

CSV_SAMPLE_LINES = 100
CSV_SAMPLE_LINES = 1000


class XloaderCSVParser(Parser):
Expand Down
1 change: 0 additions & 1 deletion ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from ckan.model.resource import Resource

from . import action, auth, helpers as xloader_helpers, utils
from .loader import fulltext_function_exists, get_write_engine

try:
config_declarations = toolkit.blanket.config_declarations
Expand Down
2 changes: 1 addition & 1 deletion ckanext/xloader/tests/ckan_setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
try:
from ckan.tests.pytest_ckan.ckan_setup import *
from ckan.tests.pytest_ckan.ckan_setup import * # noqa
except ImportError:
import pkg_resources
from paste.deploy import loadapp
Expand Down
5 changes: 2 additions & 3 deletions ckanext/xloader/tests/fixtures.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
import sqlalchemy
import sqlalchemy.orm as orm
from sqlalchemy import orm
import os

from ckanext.datastore.tests import helpers as datastore_helpers
Expand All @@ -11,7 +10,7 @@
)

try:
from ckan.tests.pytest_ckan.fixtures import *
from ckan.tests.pytest_ckan.fixtures import * # noqa
except ImportError:
import pytest

Expand Down
3 changes: 3 additions & 0 deletions ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Funding agency,Program title,Maximum (indicative) grant amount
DTIS,Accessible Tourism Infrastructure Grants,Five hundred thousand dollars
DTIS,Boosting Accessible Tourism Experiences Grants,5000
4 changes: 4 additions & 0 deletions ckanext/xloader/tests/samples/sample_with_blanks.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Funding agency,Program title,Opening date,Service ID
DTIS,Visitor First Experiences Fund,23/03/2023,63039
DTIS,First Nations Sport and Recreation Program Round 2,22/03/2023,63040
,,,63041
136 changes: 136 additions & 0 deletions ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
Category,Category name,Priority,Initiative name,Investment objectives,Primary digital priority,Initiative stage,Actual start date,Approved end date,Date data current at,Percentage complete,Overall status,Project commencement allocation,Approved expenditure,Actual cost to date,Scope change event,Cost re-evaluation event,Delivery delay event,Project journey and reasons for variance,Learn more (URL)
DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Silly Walks project - Stage 2,"Lorum ipsum.",Collaboration,Delivery,01/07/1970,30/06/1971,31/03/1971,41,G,5633000,5739000,2352000,N,N,N,"As at 31 March 1971
- Overall 'green' (on track) status
- Revised user journey following results of Silly Walk UX/UI testing
- Transition to support progressing with documentation and walk-through of the solution.
- Ongoing high levels of silly walk usage reflecting the success of search engine marketing. Silly walk focused campaign to further increase awareness and usage is being finalised.

As at 28 February 1971
- Overall 'green' (on track) status
- Results of Silly Walk UX/UI testing is guiding development of the revised user journey.
- Silly Walk transition to BAU support continuing with workshops, showcases and handover documentation.
- Silly Walk usage is increasing

As at 31 January 1971
- Continued amber status [closely monitored] with risks under management
- Search Engine Marketing -'Always On' yielding good results with continued increase in users and the proportion benefitting from Silly Walk
- Good progress on development of revised Silly Walk user journey.

As at 31 December 1970
Status AMBER [Closely monitored]
- Search Engine Marketing commenced 19 December 1970 and already showing increased users and proportion of customers benefitting from Silly Walk
- External assurance review completed and reported 'green' rating for confidence of delivery.

As at 30 November 1970
- Continued amber status pending risk management
- Marketing to commence to increase awareness of platform
- Good progress on development of revised user journey

As at 31 October 1970
Status AMBER [Closely monitored]
- Silly Walk Stage 2 continue reporting amber status reflective of ongoing high-level risks associated with demand-driven labour-market conditions and planned transition to support.
- Communications and engagement are in progress.
- The revised user journey continues development and testing. This is planned to be ready for release in the first quarter of 1971. As at 30 September 1970
Status AMBER [Closely monitored]
Project journey events:
- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress.
- Silly Walk industries expanded to include all industries.
- Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion to encompass all industries.

As at 31 August 1970
Status GREEN [On track]
The project is reporting green overall. Ongoing resourcing risk will continue to be monitored and managed for the life of the project, due to a tight labour market.
Project journey events:
- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress.
- Further analysis of June/July 1970 marketing campaign has offered recommendations for consideration, to improve target audience awareness and Silly Walk uptake.
- Silly Walk industries expanded to include Retail Trade, Accommodation and Non-residential Construction industries finalised.
- Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion with three additional industries.

As at 31 July 1970
Status AMBER [Closely monitored]
The project is continuing to report amber overall mainly due to ongoing resourcing challenges.
Project journey events:
- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness, is progressing.
- Analysis of a major marketing campaign conducted in June/July 1970 showed a significant step-up in number of Silly Walk users.
- The target of 95% of Circus population coverage was met in June 1970 with 100% of Circus population now covered on Silly Walk.
- Agency engagement for extension industries has commenced.

As at 1 July 1970
Silly Walk commenced work on expanding industries to include Retail Trade, Accommodation and Non-residential Construction industries.

As at June 1970
Stage 2 of the project is commencing and will build up the solution delivered in Silly Walk Stage 1. Customer journey will be revised in line with outcome of customer testing. The increased coverage target of at least 95% of the Circus population was met in June 1970, with all local governments included on Silly Walk. Benefits realisation through marketing and promotion of Silly Walk.",https://example.com
DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Flying Circus Modernisation and Transformation Program - Tranche 1,"The Flying Circus Modernisation and Transformation (FCMT) Program seeks to reduce the risks associated with department legacy systems by delivering contemporary, consolidated, integrated, user-friendly applications to support delivery of Flying Circus outcomes. To optimise the technical capabilities of the new solutions, engagement with business teams in the review and development of business processes is a priority. ",Trust,Delivery,01/07/1969,31/08/1971,28/02/1971,52,G,8692200,9614968,4961147,Y,Y,Y,"As at 28 February 1971
- Tranche 1 FCMT projects continue on schedule and on budget for Tranche 1 completion by 31 August 1971.
- Customer Engagement and Contract Establishment projects continue to progress focusing on delivery activities for new CRM and Portal enhancements.
- FCMT Tranche 2 Business Case tracking for completion April 1971.

As at 31 January 1971
- FCMT Projects continue to track to schedule and on budget for Tranche 1 completion 31 August 1971.
- Customer Engagement and Contract Establishment Projects progressing well with delivery activities for new CRM and Portal enhancements.

As at 31 December 1970
Status GREEN
- FCMT projects continuing to track to board endorsed updated schedule and on budget for Tranche 1 completion on 31 August 1971.
- Customer Engagement and Contract Establishment projects completed partner onboarding and delivery activities underway.
- Planning in progress for Tranche 2, focusing on remaining legacy systems for planned commencement at completion of Tranche 1.

As at 30 November 1970
Status GREEN
- Tranche 1 delivery date extended to 31 August 1971 due to CRM vendor procurement delays and subsequent additional time requirements for build completion and testing of new CRM.
- All projects maintaining momentum and progressing to revised schedule within budget.

As at 31 October 1970
Status GREEN
-New 'Partner Portal' Digital Channel continues to perform well with 3516 registered, active, external users from 634 different organisations. Update release being planned for January 1971.
-SkillsCRM (CEP Project) delivery partner on-boarded and formal delivery stage commenced.
-Contract Establishment and Variation (CEV Project) continuing delivery partner selection with a view to commencing prior to end of December 1970.

As at 30 September 1970 Status GREEN.
The FCMT 'Partner Portal' solution was successfully launched on the 17 August 1970. The decommissioning of the outdated legacy application, 'WalkConnect', has completed. Work is now increasing on the next Flying Circus systems to be replaced, SkillsCRM (via the Customer Engagement Project) and Policy on Line (via the Contract Establishment and Variation Project).
Project Journey Events:
- Partner Portal. After the successful launch of Partner Portal and decommissioning of WalkConnect, the transition to BAU is underway with the Project team continuing to support business until BAU transition is completed.
- Data, Infrastructure and Reporting.
New 'Data Lake' infrastructure built. Data ingestion processes being trialled. QTS report requirement gathering underway which will showcase new capability once completed. Compliance tool SMCM successfully launched September 30.
-Customer Engagement Project (CEP). Completed assurance reviews successfully. Delivery partner selection completed. Partner and formal delivery stage due to start 18 October 1970. Ramp up of activities continuing with business demonstrations of CRM proof of concept.
-Contract Establishment and Variation (CEV).
Requirements gathering completed. Delivery partner selection process commenced. 'As is' process documentation underway.

As at 31 August 1970
Status GREEN. The project remains on track. Successful launch of new secure 'Partner Portal' Digital Channel for Flying Circus related organisations occurred 17 August 1970.

Current Projects underway:
- Partner Portal. Go-live occurred on track 17 August 1970. All registered Flying Circus organisations now able to use the portal to access key applications and send information to DDSSHHESW via secure channel. Enhanced support being provided for 6 weeks. Legacy system decommissioning underway.
- Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) continuing and requirement gathering of first report planned to use new capabilities commenced.
- Customer Services Hub (CRM). Implementation partner selection complete. Solution delivery activities due to start by end September 1970.
- Contract Engagement and Variation. Requirements gathering complete and partner selection process to commence by end September 1970.

As at 31 July 1970
Status GREEN

Project journey events:
Implementation of next changes to FCMT applications remain on track for August 1970 with full launch of new secure Partner Portal Digital Channel for Flying Circus related organisations.
FCMT Program scope adjusted to include additional at risk system decommission activities during this financial year. Approved expenditure updated to align with revised scope.

Current Projects underway
- Partner Portal. Opened for registrations 4 July 1970. Majority of Flying Circus related organisation now registered. Full access (go-live) on track to commence 17 August 1970. Legacy system to be disabled and decommissioned September 1970.
- Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) underway with population and work on first report to commence in September.
- Customer Services Hub (CRM). Requirements confirmed and partner selection underway. Work on legacy CRM replacement due to start September/October 1970.
- Contract Engagement and Variation. Requirements gathering and new process design activities in progress.

15 May 1970 Update
Status GREEN

Implementation of next changes to Flying Circus applications on track for August 1970 with introduction of new secure 'Silly Portal' Digital Channel for Flying Circus related organisations.

Projects Completed
-Database consolidation - key databases transitioned to supported versions and platforms. Completed November 1969.
-System to System Integration platform. Completed 9 May 1970.

Current projects underway
-Partner Portal secure digital channel, in final testing. Pilot successfully complete and on track for release in August 1970.
Projects in startup
-Data, Infrastructure and Reporting, planning underway.
-Customer Services Hub (CRM), planning underway.
-Contract Engagement and Variation, planning underway.
-Planning continues for Tranche 2.",https://example.com
48 changes: 48 additions & 0 deletions ckanext/xloader/tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,42 @@ def test_german(self, Session):
u"tsvector",
] + [u"text"] * (len(records[0]) - 1)

def test_with_blanks(self, Session):
    """Load a CSV that contains blank cells and check all rows are stored.

    ``sample_with_blanks.csv`` has three data rows; the last one leaves
    every column except the final one empty.  The load must not fail on
    those blanks and must keep all three rows.
    """
    sample_path = get_sample_filepath("sample_with_blanks.csv")
    resource_id = "test1"
    factories.Resource(id=resource_id)

    load_options = {
        "resource_id": resource_id,
        "mimetype": "text/csv",
        "logger": logger,
    }
    loader.load_csv(sample_path, **load_options)

    stored_records = self._get_records(Session, resource_id)
    assert len(stored_records) == 3

def test_with_mixed_quotes(self, Session):
    """Load a CSV with long quoted, multi-line fields via ``load_csv``.

    ``sample_with_mixed_quotes.csv`` holds two records whose free-text
    columns are double-quoted, span many physical lines and contain
    commas and single quotes.  Exactly two records must be stored.
    """
    sample_path = get_sample_filepath("sample_with_mixed_quotes.csv")
    resource_id = "test1"
    factories.Resource(id=resource_id)

    load_options = {
        "resource_id": resource_id,
        "mimetype": "text/csv",
        "logger": logger,
    }
    loader.load_csv(sample_path, **load_options)

    stored_records = self._get_records(Session, resource_id)
    assert len(stored_records) == 2

def test_with_mixed_types(self, Session):
    """Load a CSV whose column mixes text and numeric values.

    In ``mixed_numeric_string_sample.csv`` the third column holds text in
    the first data row and a number in the second.  Both rows must be
    stored.
    """
    sample_path = get_sample_filepath("mixed_numeric_string_sample.csv")
    resource_id = "test1"
    factories.Resource(id=resource_id)

    load_options = {
        "resource_id": resource_id,
        "mimetype": "text/csv",
        "logger": logger,
    }
    loader.load_csv(sample_path, **load_options)

    stored_records = self._get_records(Session, resource_id)
    assert len(stored_records) == 2

def test_reload(self, Session):
csv_filepath = get_sample_filepath("simple.csv")
resource = factories.Resource()
Expand Down Expand Up @@ -1143,3 +1179,15 @@ def test_no_entries(self):
mimetype="csv",
logger=logger,
)

def test_with_mixed_quotes(self, Session):
    """Load a CSV with long quoted, multi-line fields via ``load_table``.

    ``sample_with_mixed_quotes.csv`` holds two records whose free-text
    columns are double-quoted, span many physical lines and contain
    commas and single quotes.  Exactly two records must be stored.
    """
    sample_path = get_sample_filepath("sample_with_mixed_quotes.csv")
    resource_id = "test1"
    factories.Resource(id=resource_id)

    load_options = {
        "resource_id": resource_id,
        "mimetype": "text/csv",
        "logger": logger,
    }
    loader.load_table(sample_path, **load_options)

    stored_records = self._get_records(Session, resource_id)
    assert len(stored_records) == 2
2 changes: 1 addition & 1 deletion ckanext/xloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,10 @@ def type_guess(rows, types=TYPES, strict=False):
for ci, cell in enumerate(row):
if not cell:
continue
at_least_one_value[ci] = True
for type in list(guesses[ci].keys()):
if not isinstance(cell, type):
guesses[ci].pop(type)
at_least_one_value[ci] = True if guesses[ci] else False
# no need to set guessing weights before this
# because we only accept a type if it never fails
for i, guess in enumerate(guesses):
Expand Down

0 comments on commit 1ee470a

Please sign in to comment.