From 2f9b89423c29cbf9572b83c00589bc99662fcb51 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Wed, 1 Nov 2023 08:51:50 +1000 Subject: [PATCH 1/5] Quality of life improvements * Performance Improvement: Skip rows when showing datastore load when handing multi million line records * Improve testing on mixed formatting where type loading causes bad loads when it has a empty field on type timestamp/numeric, 'can't be string' must be none * Include shortcut to datastore when loaded for Package/Resource maintainers+ * Include dependabot pr auto raising to help keep current * Make flake8 more stringent --- .flake8 | 4 - .github/dependabot.yml | 18 ++ ckanext/xloader/action.py | 7 +- ckanext/xloader/command.py | 3 +- ckanext/xloader/config_declaration.yaml | 13 +- ckanext/xloader/helpers.py | 19 ++ ckanext/xloader/jobs.py | 25 +- ckanext/xloader/loader.py | 20 +- ckanext/xloader/plugin.py | 33 +-- .../templates/package/resource_edit_base.html | 4 +- .../templates/package/resource_read.html | 8 + .../package/snippets/resource_item.html | 8 + .../templates/package/snippets/resources.html | 8 + .../templates/xloader/resource_data.html | 32 ++- ckanext/xloader/tests/ckan_setup.py | 2 +- ckanext/xloader/tests/fixtures.py | 5 +- .../samples/mixed_numeric_string_sample.csv | 3 + .../tests/samples/non_timestamp_sample.csv | 4 + .../xloader/tests/samples/non_utf8_sample.csv | 267 ++++++++++++++++++ .../tests/samples/sample_with_blanks.csv | 4 + .../samples/sample_with_mixed_quotes.csv | 136 +++++++++ .../samples/sample_with_quoted_commas.csv | 4 + ckanext/xloader/tests/test_loader.py | 105 +++++++ ckanext/xloader/utils.py | 61 +++- ckanext/xloader/views.py | 12 +- 25 files changed, 724 insertions(+), 81 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 ckanext/xloader/templates/package/resource_read.html create mode 100644 ckanext/xloader/templates/package/snippets/resource_item.html create mode 100644 ckanext/xloader/templates/package/snippets/resources.html create mode 100644 ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv create mode 100644 ckanext/xloader/tests/samples/non_timestamp_sample.csv create mode 100644 ckanext/xloader/tests/samples/non_utf8_sample.csv create mode 100644 ckanext/xloader/tests/samples/sample_with_blanks.csv create mode 100644 ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv create mode 100644 ckanext/xloader/tests/samples/sample_with_quoted_commas.csv diff --git a/.flake8 b/.flake8 index a4eea9e3..32068ca7 100644 --- a/.flake8 +++ b/.flake8 @@ -17,8 +17,4 @@ max-line-length=127 # List ignore rules one per line. ignore = - E501 - C901 W503 - F401 - F403 diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..b5158981 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,18 @@ +version: 2 +registries: + python-index-pypi-org: + type: python-index + url: https://pypi.org/ + replaces-base: true + username: "${{secrets.PYTHON_INDEX_PYPI_ORG_USERNAME}}" + password: "${{secrets.PYTHON_INDEX_PYPI_ORG_PASSWORD}}" + +updates: +- package-ecosystem: pip + directory: "/" + schedule: + interval: daily + time: "19:00" + open-pull-requests-limit: 10 + registries: + - python-index-pypi-org diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index 3fa26803..e45394a9 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -152,7 +152,12 @@ def xloader_submit(context, data_dict): 'original_url': resource_dict.get('url'), } } - timeout = config.get('ckanext.xloader.job_timeout', '3600') + # Expand timeout for resources that have to be type-guessed + timeout = config.get( + 'ckanext.xloader.job_timeout', + '3600' if utils.datastore_resource_exists(res_id) else '10800') + log.debug("Timeout for XLoading resource %s is %s", res_id, timeout) + try: job = enqueue_job( jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=timeout) diff --git a/ckanext/xloader/command.py b/ckanext/xloader/command.py index 7f2c000a..64b79754 100644 --- a/ckanext/xloader/command.py +++ b/ckanext/xloader/command.py @@ -3,6 +3,7 @@ import sys import logging import ckan.plugins.toolkit as tk +from ckanext.xloader.utils import XLoaderFormats class XloaderCmd: @@ -84,8 +85,6 @@ def _submit_resource(self, resource, user, indent=0): '''resource: resource dictionary ''' indentation = ' ' * indent - # import here, so that that loggers are setup - from ckanext.xloader.plugin import XLoaderFormats if not XLoaderFormats.is_it_an_xloader_format(resource['format']): print(indentation diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index b31f12e2..feb1cc9c 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -29,9 +29,7 @@ groups: default: 1_000_000_000 example: 100000 description: | - The connection string for the jobs database used by XLoader. The - default of an sqlite file is fine for development. For production use a - Postgresql database. + The maximum file size that XLoader will attempt to load. type: int required: false - key: ckanext.xloader.use_type_guessing @@ -48,6 +46,15 @@ groups: type: bool required: false legacy_key: ckanext.xloader.just_load_with_messytables + - key: ckanext.xloader.max_type_guessing_length + default: 0 + example: 100000 + description: | + The maximum file size that will be passed to Tabulator if the + use_type_guessing flag is enabled. Larger files will use COPY even if + the flag is set. Defaults to 1/10 of the maximum content length. + type: int + required: false - key: ckanext.xloader.parse_dates_dayfirst default: False example: False diff --git a/ckanext/xloader/helpers.py b/ckanext/xloader/helpers.py index 8c94387a..8b9dee8f 100644 --- a/ckanext/xloader/helpers.py +++ b/ckanext/xloader/helpers.py @@ -1,4 +1,5 @@ import ckan.plugins.toolkit as toolkit +from ckanext.xloader.utils import XLoaderFormats def xloader_status(resource_id): @@ -25,3 +26,21 @@ def xloader_status_description(status): return captions.get(status['status'], status['status'].capitalize()) else: return _('Not Uploaded Yet') + + +def is_resource_supported_by_xloader(res_dict, check_access=True): + is_supported_format = XLoaderFormats.is_it_an_xloader_format(res_dict.get('format')) + is_datastore_active = res_dict.get('datastore_active', False) + if check_access: + user_has_access = toolkit.h.check_access('package_update', {'id': res_dict.get('package_id')}) + else: + user_has_access = True + url_type = res_dict.get('url_type') + if url_type: + try: + is_supported_url_type = url_type not in toolkit.h.datastore_rw_resource_url_types() + except AttributeError: + is_supported_url_type = (url_type == 'upload') + else: + is_supported_url_type = True + return (is_supported_format or is_datastore_active) and user_has_access and is_supported_url_type diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 4c4068f9..9c6e0a67 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -7,6 +7,7 @@ import tempfile import json import datetime +import os import traceback import sys @@ -16,23 +17,26 @@ import sqlalchemy as sa from ckan import model -from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config, check_ckan_version +from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config -from . import loader -from . import db +from . import db, loader from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError -from .utils import set_resource_metadata +from .utils import datastore_resource_exists, set_resource_metadata try: from ckan.lib.api_token import get_user_from_token except ImportError: get_user_from_token = None +log = logging.getLogger(__name__) + SSL_VERIFY = asbool(config.get('ckanext.xloader.ssl_verify', True)) if not SSL_VERIFY: requests.packages.urllib3.disable_warnings() MAX_CONTENT_LENGTH = int(config.get('ckanext.xloader.max_content_length') or 1e9) +# Don't try Tabulator load on large files +MAX_TYPE_GUESSING_LENGTH = int(config.get('ckanext.xloader.max_type_guessing_length') or MAX_CONTENT_LENGTH / 10) MAX_EXCERPT_LINES = int(config.get('ckanext.xloader.max_excerpt_lines') or 0) CHUNK_SIZE = 16 * 1024 # 16kb DOWNLOAD_TIMEOUT = 30 @@ -80,7 +84,6 @@ def xloader_data_into_datastore(input): db.mark_job_as_errored(job_id, str(e)) job_dict['status'] = 'error' job_dict['error'] = str(e) - log = logging.getLogger(__name__) log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc())) errored = True except Exception as e: @@ -88,7 +91,6 @@ def xloader_data_into_datastore(input): job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e)) job_dict['status'] = 'error' job_dict['error'] = str(e) - log = logging.getLogger(__name__) log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc())) errored = True finally: @@ -206,11 +208,12 @@ def tabulator_load(): logger.info('Loading CSV') # If ckanext.xloader.use_type_guessing is not configured, fall back to # deprecated ckanext.xloader.just_load_with_messytables - use_type_guessing = asbool(config.get( - 'ckanext.xloader.use_type_guessing', config.get( - 'ckanext.xloader.just_load_with_messytables', False))) - logger.info("'use_type_guessing' mode is: %s", - use_type_guessing) + use_type_guessing = asbool( + config.get('ckanext.xloader.use_type_guessing', config.get( + 'ckanext.xloader.just_load_with_messytables', False))) \ + and not datastore_resource_exists(resource['id']) \ + and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH + logger.info("'use_type_guessing' mode is: %s", use_type_guessing) try: if use_type_guessing: tabulator_load() diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index afc3c980..46c38dec 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -18,7 +18,7 @@ from .job_exceptions import FileCouldNotBeLoadedError, LoaderError from .parser import XloaderCSVParser -from .utils import headers_guess, type_guess +from .utils import datastore_resource_exists, headers_guess, type_guess from ckan.plugins.toolkit import config @@ -318,9 +318,16 @@ def row_iterator(): logger.info('Copying to database...') count = 0 + # Some types cannot be stored as empty strings and must be converted to None, + # https://github.com/ckan/ckanext-xloader/issues/182 + non_empty_types = ['timestamp', 'numeric'] for i, records in enumerate(chunky(result, 250)): count += len(records) logger.info('Saving chunk {number}'.format(number=i)) + for row in records: + for column_index, column_name in enumerate(row): + if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '': + row[column_name] = None send_resource_to_datastore(resource_id, headers_dicts, records) logger.info('...copying done') @@ -395,17 +402,6 @@ def send_resource_to_datastore(resource_id, headers, records): .format(str(e))) -def datastore_resource_exists(resource_id): - from ckan import model - context = {'model': model, 'ignore_auth': True} - try: - response = p.toolkit.get_action('datastore_search')(context, dict( - id=resource_id, limit=0)) - except p.toolkit.ObjectNotFound: - return False - return response or {'fields': []} - - def delete_datastore_resource(resource_id): from ckan import model context = {'model': model, 'user': '', 'ignore_auth': True} diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index dbde8ed5..1b8417b9 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -6,7 +6,7 @@ from ckan.plugins import toolkit from . import action, auth, helpers as xloader_helpers, utils -from .loader import fulltext_function_exists, get_write_engine +from ckanext.xloader.utils import XLoaderFormats try: config_declarations = toolkit.blanket.config_declarations @@ -19,36 +19,6 @@ def config_declarations(cls): log = logging.getLogger(__name__) -# resource.formats accepted by ckanext-xloader. Must be lowercase here. -DEFAULT_FORMATS = [ - "csv", - "application/csv", - "xls", - "xlsx", - "tsv", - "application/vnd.ms-excel", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "ods", - "application/vnd.oasis.opendocument.spreadsheet", -] - - -class XLoaderFormats(object): - formats = None - - @classmethod - def is_it_an_xloader_format(cls, format_): - if cls.formats is None: - cls._formats = toolkit.config.get("ckanext.xloader.formats") - if cls._formats is not None: - cls._formats = cls._formats.lower().split() - else: - cls._formats = DEFAULT_FORMATS - if not format_: - return False - return format_.lower() in cls._formats - - @config_declarations class xloaderPlugin(plugins.SingletonPlugin): plugins.implements(plugins.IConfigurer) @@ -211,4 +181,5 @@ def get_helpers(self): return { "xloader_status": xloader_helpers.xloader_status, "xloader_status_description": xloader_helpers.xloader_status_description, + "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader, } diff --git a/ckanext/xloader/templates/package/resource_edit_base.html b/ckanext/xloader/templates/package/resource_edit_base.html index 34403521..5c02815a 100644 --- a/ckanext/xloader/templates/package/resource_edit_base.html +++ b/ckanext/xloader/templates/package/resource_edit_base.html @@ -2,5 +2,7 @@ {% block inner_primary_nav %} {{ super() }} - {{ h.build_nav_icon('xloader.resource_data', _('DataStore'), id=pkg.name, resource_id=res.id) }} + {% if h.is_resource_supported_by_xloader(res) %} + {{ h.build_nav_icon('xloader.resource_data', _('DataStore'), id=pkg.name, resource_id=res.id, icon='cloud-upload') }} + {% endif %} {% endblock %} diff --git a/ckanext/xloader/templates/package/resource_read.html b/ckanext/xloader/templates/package/resource_read.html new file mode 100644 index 00000000..56bf0266 --- /dev/null +++ b/ckanext/xloader/templates/package/resource_read.html @@ -0,0 +1,8 @@ +{% ckan_extends %} + +{% block action_manage %} + {{ super() }} + {% if h.is_resource_supported_by_xloader(res) %} +
  • {% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=res.id, class_='btn btn-light', icon='cloud-upload' %}
  • + {% endif %} +{% endblock %} diff --git a/ckanext/xloader/templates/package/snippets/resource_item.html b/ckanext/xloader/templates/package/snippets/resource_item.html new file mode 100644 index 00000000..37ed457c --- /dev/null +++ b/ckanext/xloader/templates/package/snippets/resource_item.html @@ -0,0 +1,8 @@ +{% ckan_extends %} + +{% block resource_item_explore_inner %} + {{ super() }} + {% if h.is_resource_supported_by_xloader(res) %} +
  • {% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=res.id, class_='dropdown-item', icon='cloud-upload' %}
  • + {% endif %} +{% endblock %} diff --git a/ckanext/xloader/templates/package/snippets/resources.html b/ckanext/xloader/templates/package/snippets/resources.html new file mode 100644 index 00000000..e04dde4d --- /dev/null +++ b/ckanext/xloader/templates/package/snippets/resources.html @@ -0,0 +1,8 @@ +{% ckan_extends %} + +{% block resources_list_edit_dropdown_inner %} + {{ super() }} + {% if h.is_resource_supported_by_xloader(resource) %} +
  • {% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=resource.id, class_='dropdown-item', icon='cloud-upload' %}
  • + {% endif %} +{% endblock %} diff --git a/ckanext/xloader/templates/xloader/resource_data.html b/ckanext/xloader/templates/xloader/resource_data.html index a94ad631..d9a22058 100644 --- a/ckanext/xloader/templates/xloader/resource_data.html +++ b/ckanext/xloader/templates/xloader/resource_data.html @@ -61,7 +61,25 @@ {% if status.status and status.task_info and show_table %}

    {{ _('Upload Log') }}