Skip to content

Commit

Permalink
Merge pull request #56 from qld-gov-au/develop
Browse files Browse the repository at this point in the history
Develop to master - performance enhancements
  • Loading branch information
ThrawnCA authored Aug 7, 2023
2 parents 971a3ed + 340778b commit 1938a07
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 25 deletions.
7 changes: 6 additions & 1 deletion ckanext/xloader/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,12 @@ def xloader_submit(context, data_dict):
'original_url': resource_dict.get('url'),
}
}
timeout = config.get('ckanext.xloader.job_timeout', '3600')
# Expand timeout for resources that have to be type-guessed
timeout = config.get(
'ckanext.xloader.job_timeout',
'3600' if utils.datastore_resource_exists(res_id) else '10800')
log.debug("Timeout for XLoading resource %s is %s", res_id, timeout)

try:
job = enqueue_job(
jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=timeout)
Expand Down
13 changes: 10 additions & 3 deletions ckanext/xloader/config_declaration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@ groups:
default: 1_000_000_000
example: 100000
description: |
The connection string for the jobs database used by XLoader. The
default of an sqlite file is fine for development. For production use a
Postgresql database.
The maximum file size that XLoader will attempt to load.
type: int
required: false
- key: ckanext.xloader.use_type_guessing
Expand All @@ -48,6 +46,15 @@ groups:
type: bool
required: false
legacy_key: ckanext.xloader.just_load_with_messytables
- key: ckanext.xloader.max_type_guessing_length
default: 0
example: 100000
description: |
The maximum file size, in bytes, that will be passed to Tabulator when
the use_type_guessing flag is enabled. Larger files are loaded with COPY
even if the flag is set. When unset, this falls back to one tenth of
ckanext.xloader.max_content_length.
type: int
required: false
- key: ckanext.xloader.parse_dates_dayfirst
default: False
example: False
Expand Down
19 changes: 11 additions & 8 deletions ckanext/xloader/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import tempfile
import json
import datetime
import os
import traceback
import sys

Expand All @@ -18,10 +19,9 @@
from ckan import model
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config

from . import loader
from . import db
from . import db, loader
from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError
from .utils import set_resource_metadata
from .utils import datastore_resource_exists, set_resource_metadata

try:
from ckan.lib.api_token import get_user_from_token
Expand All @@ -33,6 +33,8 @@
requests.packages.urllib3.disable_warnings()

# Module-level limits, resolved once at import time from the CKAN config.

# Read from 'ckanext.xloader.max_content_length'; falls back to 1e9 when the
# option is unset or falsy.
MAX_CONTENT_LENGTH = int(config.get('ckanext.xloader.max_content_length') or 1e9)
# Don't try Tabulator load on large files.
# Read from 'ckanext.xloader.max_type_guessing_length'; falls back to one tenth
# of MAX_CONTENT_LENGTH when the option is unset or falsy (int() truncates the
# float produced by the division).
MAX_TYPE_GUESSING_LENGTH = int(config.get('ckanext.xloader.max_type_guessing_length') or MAX_CONTENT_LENGTH / 10)
# Read from 'ckanext.xloader.max_excerpt_lines'; falls back to 0 when the
# option is unset or falsy.
# NOTE(review): 0 presumably means "no excerpt" -- confirm against the code
# that consumes this value.
MAX_EXCERPT_LINES = int(config.get('ckanext.xloader.max_excerpt_lines') or 0)
CHUNK_SIZE = 16 * 1024 # 16 KiB
# NOTE(review): presumably seconds -- confirm where this is passed on.
DOWNLOAD_TIMEOUT = 30
Expand Down Expand Up @@ -206,11 +208,12 @@ def tabulator_load():
logger.info('Loading CSV')
# If ckanext.xloader.use_type_guessing is not configured, fall back to
# deprecated ckanext.xloader.just_load_with_messytables
use_type_guessing = asbool(config.get(
'ckanext.xloader.use_type_guessing', config.get(
'ckanext.xloader.just_load_with_messytables', False)))
logger.info("'use_type_guessing' mode is: %s",
use_type_guessing)
use_type_guessing = asbool(
config.get('ckanext.xloader.use_type_guessing', config.get(
'ckanext.xloader.just_load_with_messytables', False))) \
and not datastore_resource_exists(resource['id']) \
and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH
logger.info("'use_type_guessing' mode is: %s", use_type_guessing)
try:
if use_type_guessing:
tabulator_load()
Expand Down
13 changes: 1 addition & 12 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from .job_exceptions import FileCouldNotBeLoadedError, LoaderError
from .parser import CSV_SAMPLE_LINES, XloaderCSVParser
from .utils import headers_guess, type_guess
from .utils import datastore_resource_exists, headers_guess, type_guess

from ckan.plugins.toolkit import config

Expand Down Expand Up @@ -402,17 +402,6 @@ def send_resource_to_datastore(resource_id, headers, records):
.format(str(e)))


def datastore_resource_exists(resource_id):
    """Look up ``resource_id`` in the DataStore.

    Calls the ``datastore_search`` action with ``limit=0`` and
    ``ignore_auth`` set in the context.

    :param resource_id: id passed to ``datastore_search`` as ``id``.
    :returns: ``False`` when the action raises ``ObjectNotFound``; otherwise
        the action's response, or ``{'fields': []}`` if that response is
        falsy.
    """
    # Imported locally, as in the original implementation.
    from ckan import model

    search_context = {'model': model, 'ignore_auth': True}
    try:
        result = p.toolkit.get_action('datastore_search')(
            search_context, {'id': resource_id, 'limit': 0})
    except p.toolkit.ObjectNotFound:
        return False

    if result:
        return result
    # Callers always get a dict with a 'fields' key on the "exists" path.
    return {'fields': []}


def delete_datastore_resource(resource_id):
from ckan import model
context = {'model': model, 'user': '', 'ignore_auth': True}
Expand Down
2 changes: 1 addition & 1 deletion ckanext/xloader/templates/package/resource_read.html
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{% ckan_extends %}

{% block action_manage_inner %}
{% block action_manage %}
{{ super() }}
{% if h.is_resource_supported_by_xloader(res) %}
<li>{% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=res.id, class_='btn btn-light', icon='cloud-upload' %}</li>
Expand Down
10 changes: 10 additions & 0 deletions ckanext/xloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,13 @@ def type_guess(rows, types=TYPES, strict=False):
guesses_tuples = [(t, guess[t]) for t in types if t in guess]
_columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0])
return _columns


def datastore_resource_exists(resource_id):
    """Report whether ``resource_id`` is known to the DataStore.

    Runs the ``datastore_search`` action with ``limit=0`` and ``ignore_auth``
    set in the context.

    :param resource_id: id passed to ``datastore_search`` as ``id``.
    :returns: ``False`` if the action raises ``ObjectNotFound``. Otherwise the
        search response, substituting ``{'fields': []}`` when the response is
        falsy, so the "exists" result is always truthy-checkable by key.
    """
    ctx = {'model': model, 'ignore_auth': True}
    query = {'id': resource_id, 'limit': 0}
    try:
        found = p.toolkit.get_action('datastore_search')(ctx, query)
    except p.toolkit.ObjectNotFound:
        return False
    return found if found else {'fields': []}

0 comments on commit 1938a07

Please sign in to comment.