Skip to content

Commit 958b80b

Browse files
authored
Merge pull request ckan#8530 from ckan/5847-selective-indexes
configurable datastore FTS indexes
2 parents 822245e + f3023ae commit 958b80b

File tree

9 files changed, with 155 additions and 78 deletions

changes/5847.feature

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
Allow configuring datastore full text field indexes with the new
2+
ckan.datastore.default_fts_index_field_types config option.
3+
4+
The default is an empty list, avoiding automatically creating
5+
separate full text indexes for any individual columns. The
6+
whole-row full text index still exists for all tables.
7+
8+
Use the `ckan datastore fts-index` command to remove existing
9+
column indexes to reclaim database space.

ckanext/datastore/backend/postgres.py

Lines changed: 18 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -445,7 +445,7 @@ def _where_clauses(
445445
# add full-text search where clause
446446
q: Union[dict[str, str], str, Any] = data_dict.get('q')
447447
full_text = data_dict.get('full_text')
448-
if q and not full_text:
448+
if q:
449449
if isinstance(q, str):
450450
ts_query_alias = _ts_query_alias()
451451
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
@@ -459,6 +459,7 @@ def _where_clauses(
459459

460460
ftyp = fields_types[field]
461461
if not datastore_helpers.should_fts_index_field_type(ftyp):
462+
# use general full text search to narrow results
462463
clause_str = u'_full_text @@ {0}'.format(query_field)
463464
clauses.append((clause_str,))
464465

@@ -468,49 +469,14 @@ def _where_clauses(
468469
identifier(field),
469470
query_field)
470471
clauses.append((clause_str,))
471-
elif (full_text and not q):
472-
ts_query_alias = _ts_query_alias()
473-
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
474-
clauses.append((clause_str,))
475-
476-
elif full_text and isinstance(q, dict):
477-
ts_query_alias = _ts_query_alias()
478-
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
479-
clauses.append((clause_str,))
480-
# update clauses with q dict
481-
_update_where_clauses_on_q_dict(data_dict, fields_types, q, clauses)
482-
483-
elif full_text and isinstance(q, str):
472+
if full_text:
484473
ts_query_alias = _ts_query_alias()
485474
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
486475
clauses.append((clause_str,))
487476

488477
return clauses
489478

490479

491-
def _update_where_clauses_on_q_dict(
492-
data_dict: dict[str, str], fields_types: dict[str, str],
493-
q: dict[str, str],
494-
clauses: WhereClauses) -> None:
495-
lang = _fts_lang(data_dict.get('language'))
496-
for field, _ in q.items():
497-
if field not in fields_types:
498-
continue
499-
query_field = _ts_query_alias(field)
500-
501-
ftyp = fields_types[field]
502-
if not datastore_helpers.should_fts_index_field_type(ftyp):
503-
clause_str = u'_full_text @@ {0}'.format(query_field)
504-
clauses.append((clause_str,))
505-
506-
clause_str = (
507-
u'to_tsvector({0}, cast({1} as text)) @@ {2}').format(
508-
literal_string(lang),
509-
identifier(field),
510-
query_field)
511-
clauses.append((clause_str,))
512-
513-
514480
def _textsearch_query(
515481
lang: str, q: Optional[Union[str, dict[str, str], Any]], plain: bool,
516482
full_text: Optional[str]) -> tuple[str, dict[str, str]]:
@@ -709,6 +675,7 @@ def _build_fts_indexes(
709675
data_dict: dict[str, Any], # noqa
710676
sql_index_str_method: str, fields: list[dict[str, Any]]):
711677
fts_indexes: list[str] = []
678+
fts_noindexes: list[str] = []
712679
resource_id = data_dict['resource_id']
713680
fts_lang = data_dict.get(
714681
'language', config.get('ckan.datastore.default_fts_lang'))
@@ -722,23 +689,25 @@ def cast_as_text(x: str):
722689

723690
full_text_field = {'type': 'tsvector', 'id': '_full_text'}
724691
for field in [full_text_field] + fields:
725-
if not datastore_helpers.should_fts_index_field_type(field['type']):
726-
continue
727-
728692
field_str = field['id']
729693
if field['type'] not in ['text', 'tsvector']:
730694
field_str = cast_as_text(field_str)
731695
else:
732696
field_str = u'"{0}"'.format(field_str)
733697
if field['type'] != 'tsvector':
734698
field_str = to_tsvector(field_str)
699+
if field['id'] != '_full_text' and not (
700+
datastore_helpers.should_fts_index_field_type(field['type'])):
701+
fts_noindexes.append(_generate_index_name(resource_id, field_str))
702+
continue
703+
735704
fts_indexes.append(sql_index_str_method.format(
736705
res_id=resource_id,
737706
unique='',
738707
name=_generate_index_name(resource_id, field_str),
739708
method=_get_fts_index_method(), fields=field_str))
740709

741-
return fts_indexes
710+
return fts_indexes, fts_noindexes
742711

743712

744713
def _drop_indexes(context: Context, data_dict: dict[str, Any],
@@ -944,9 +913,8 @@ def create_indexes(context: Context, data_dict: dict[str, Any]):
944913
field_ids = _pluck('id', fields)
945914
json_fields = [x['id'] for x in fields if x['type'] == 'nested']
946915

947-
fts_indexes = _build_fts_indexes(data_dict,
948-
sql_index_string_method,
949-
fields)
916+
fts_indexes, fts_noindexes = _build_fts_indexes(
917+
data_dict, sql_index_string_method, fields)
950918
sql_index_strings = sql_index_strings + fts_indexes
951919

952920
if indexes is not None:
@@ -986,10 +954,13 @@ def create_indexes(context: Context, data_dict: dict[str, Any]):
986954

987955
current_indexes = _get_index_names(context['connection'],
988956
data_dict['resource_id'])
957+
958+
for fts_idx in current_indexes:
959+
if fts_idx in fts_noindexes:
960+
connection.execute(sa.text(
961+
'DROP INDEX {0} CASCADE'.format(sa.column(fts_idx))))
989962
for sql_index_string in sql_index_strings:
990-
has_index = [c for c in current_indexes
991-
if sql_index_string.find(c) != -1]
992-
if not has_index:
963+
if not any(c in sql_index_string for c in current_indexes):
993964
connection.execute(sa.text(sql_index_string))
994965

995966

ckanext/datastore/cli.py

Lines changed: 37 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@
1313
import ckan.logic as logic
1414

1515
import ckanext.datastore as datastore_module
16+
from ckanext.datastore.backend import get_all_resources_ids_in_datastore
1617
from ckanext.datastore.backend.postgres import (
1718
identifier,
1819
literal_string,
1920
get_read_engine,
2021
get_write_engine,
2122
_get_raw_field_info,
23+
_TIMEOUT,
2224
)
2325
from ckanext.datastore.blueprint import DUMP_FORMATS, dump_to
2426

@@ -137,27 +139,17 @@ def purge():
137139

138140
site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {})
139141

140-
result = logic.get_action('datastore_search')(
141-
{'user': site_user['name']},
142-
{'resource_id': '_table_metadata'}
143-
)
144-
145142
resource_id_list = []
146-
for record in result['records']:
143+
for resid in get_all_resources_ids_in_datastore():
147144
try:
148-
# ignore 'alias' records (views) as they are automatically
149-
# deleted when the parent resource table is dropped
150-
if record['alias_of']:
151-
continue
152-
153145
logic.get_action('resource_show')(
154146
{'user': site_user['name']},
155-
{'id': record['name']}
147+
{'id': resid}
156148
)
157149
except logic.NotFound:
158-
resource_id_list.append(record['name'])
150+
resource_id_list.append(resid)
159151
click.echo("Resource '%s' orphaned - queued for drop" %
160-
record[u'name'])
152+
resid)
161153
except KeyError:
162154
continue
163155

@@ -191,22 +183,12 @@ def upgrade():
191183
'''Move field info to _info so that plugins may add private information
192184
to each field for their own purposes.'''
193185

194-
site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {})
195-
196-
result = logic.get_action('datastore_search')(
197-
{'user': site_user['name']},
198-
{'resource_id': '_table_metadata'}
199-
)
200-
201186
count = 0
202187
skipped = 0
203188
noinfo = 0
204189
read_connection = get_read_engine()
205-
for record in result['records']:
206-
if record['alias_of']:
207-
continue
208-
209-
raw_fields, old = _get_raw_field_info(read_connection, record['name'])
190+
for resid in get_all_resources_ids_in_datastore():
191+
raw_fields, old = _get_raw_field_info(read_connection, resid)
210192
if not old:
211193
if not raw_fields:
212194
noinfo += 1
@@ -222,7 +204,7 @@ def upgrade():
222204
raw_sql = literal_string(' ' + json.dumps(
223205
raw, ensure_ascii=False, separators=(',', ':')))
224206
alter_sql.append(u'COMMENT ON COLUMN {0}.{1} is {2}'.format(
225-
identifier(record['name']),
207+
identifier(resid),
226208
identifier(fid),
227209
raw_sql))
228210

@@ -236,5 +218,33 @@ def upgrade():
236218
count, skipped, noinfo))
237219

238220

221+
@datastore.command(
222+
'fts-index',
223+
short_help='create or remove full-text search indexes after changing '
224+
'the ckan.datastore.default_fts_index_field_types setting'
225+
)
226+
@click.option(
227+
'--timeout', metavar='SECONDS',
228+
type=click.FloatRange(0, 2147483.647), # because postgres max int
229+
default=_TIMEOUT / 1000, show_default=True,
230+
help='maximum index creation time in seconds',
231+
)
232+
def fts_index(timeout: float):
233+
'''Use to create or remove full-text search indexes after changing
234+
the ckan.datastore.default_fts_index_field_types setting.
235+
'''
236+
site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {})
237+
resource_ids = get_all_resources_ids_in_datastore()
238+
239+
for i, resid in enumerate(get_all_resources_ids_in_datastore(), 1):
240+
print(f'\r{resid} [{i}/{len(resource_ids)}] ...', end='')
241+
logic.get_action('datastore_create')(
242+
{'user': site_user['name'],
243+
'query_timeout': int(timeout * 1000)}, # type: ignore
244+
{'resource_id': resid, 'force': True}
245+
)
246+
print('\x08\x08\x08done')
247+
248+
239249
def get_commands():
240250
return (set_permissions, dump, purge, upgrade)

ckanext/datastore/config_declaration.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,15 @@ groups:
9999
The default method used when creating full-text search indexes. Currently it
100100
can be "gin" or "gist". Refer to PostgreSQL's documentation to understand the
101101
characteristics of each one and pick the best for your instance.
102+
103+
- key: ckan.datastore.default_fts_index_field_types
104+
type: list
105+
default: ''
106+
example: text tsvector
107+
description: >
108+
A separate full-text search index will be created by default for fields
109+
with these types, and used when searching on fields by passing a
110+
dictionary to the datastore_search q parameter.
111+
112+
Indexes increase the time and disk space required to load data
113+
into the DataStore.

ckanext/datastore/helpers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ def _strip(s: Any):
8282

8383

8484
def should_fts_index_field_type(field_type: str):
85-
return field_type.lower() in ['tsvector', 'text', 'number']
85+
return field_type in tk.config.get(
86+
'ckan.datastore.default_fts_index_field_types', [])
8687

8788

8889
def get_table_and_function_names_from_sql(context: Context, sql: str):

ckanext/datastore/tests/test_create.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,8 @@ def test_create_adds_index_on_full_text_search_when_not_creating_other_indexes(
159159
resource_id = result["resource_id"]
160160
assert self._has_index_on_field(resource_id, '"_full_text"')
161161

162+
@pytest.mark.ckan_config(
163+
"ckan.datastore.default_fts_index_field_types", "text")
162164
def test_create_add_full_text_search_indexes_on_every_text_field(self):
163165
package = factories.Dataset()
164166
data = {

ckanext/datastore/tests/test_db.py

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,15 @@ def test_default_fts_index_method_can_be_overwritten_by_config_var(self):
3939
"_full_text", connection, resource_id, method="gin"
4040
)
4141

42+
@pytest.mark.ckan_config(
43+
"ckan.datastore.default_fts_index_field_types", "text tsvector")
4244
@mock.patch("ckanext.datastore.backend.postgres._get_fields")
4345
def test_creates_fts_index_on_all_fields_except_dates_nested_and_arrays_with_english_as_default(
4446
self, _get_fields
4547
):
4648
_get_fields.return_value = [
4749
{"id": "text", "type": "text"},
48-
{"id": "number", "type": "number"},
50+
{"id": "tsvector", "type": "tsvector"},
4951
{"id": "nested", "type": "nested"},
5052
{"id": "date", "type": "date"},
5153
{"id": "text array", "type": "text[]"},
@@ -62,9 +64,37 @@ def test_creates_fts_index_on_all_fields_except_dates_nested_and_arrays_with_eng
6264
"text", connection, resource_id, "english"
6365
)
6466
self._assert_created_index_on(
65-
"number", connection, resource_id, "english", cast=True
67+
"tsvector", connection, resource_id,
6668
)
6769

70+
@mock.patch("ckanext.datastore.backend.postgres._get_fields")
71+
def test_creates_no_fts_indexes_by_default(
72+
self, _get_fields
73+
):
74+
_get_fields.return_value = [
75+
{"id": "text", "type": "text"},
76+
{"id": "tsvector", "type": "tsvector"},
77+
{"id": "nested", "type": "nested"},
78+
{"id": "date", "type": "date"},
79+
{"id": "text array", "type": "text[]"},
80+
{"id": "timestamp", "type": "timestamp"},
81+
]
82+
connection = mock.MagicMock()
83+
context = {"connection": connection}
84+
resource_id = "resource_id"
85+
data_dict = {"resource_id": resource_id}
86+
87+
db.create_indexes(context, data_dict)
88+
89+
self._assert_no_index_created_on(
90+
"text", connection, resource_id, "english"
91+
)
92+
self._assert_no_index_created_on(
93+
"tsvector", connection, resource_id,
94+
)
95+
96+
@pytest.mark.ckan_config(
97+
"ckan.datastore.default_fts_index_field_types", "text tsvector")
6898
@pytest.mark.ckan_config("ckan.datastore.default_fts_lang", "simple")
6999
@mock.patch("ckanext.datastore.backend.postgres._get_fields")
70100
def test_creates_fts_index_on_textual_fields_can_overwrite_lang_with_config_var(
@@ -80,6 +110,8 @@ def test_creates_fts_index_on_textual_fields_can_overwrite_lang_with_config_var(
80110

81111
self._assert_created_index_on("foo", connection, resource_id, "simple")
82112

113+
@pytest.mark.ckan_config(
114+
"ckan.datastore.default_fts_index_field_types", "text tsvector")
83115
@pytest.mark.ckan_config("ckan.datastore.default_fts_lang", "simple")
84116
@mock.patch("ckanext.datastore.backend.postgres._get_fields")
85117
def test_creates_fts_index_on_textual_fields_can_overwrite_lang_using_lang_param(
@@ -127,6 +159,38 @@ def _assert_created_index_on(
127159
"called with a string containing '%s'" % sql_str
128160
)
129161

162+
def _assert_no_index_created_on(
163+
self,
164+
field,
165+
connection,
166+
resource_id,
167+
lang=None,
168+
cast=False,
169+
method="gist",
170+
):
171+
field = u'"{0}"'.format(field)
172+
if cast:
173+
field = u"cast({0} AS text)".format(field)
174+
if lang is not None:
175+
sql_str = (
176+
u'ON "resource_id" '
177+
u"USING {method}(to_tsvector('{lang}', {field}))"
178+
)
179+
sql_str = sql_str.format(method=method, lang=lang, field=field)
180+
else:
181+
sql_str = u"USING {method}({field})".format(
182+
method=method, field=field
183+
)
184+
185+
calls = connection.execute.call_args_list
186+
187+
was_called = any(sql_str in str(call.args[0]) for call in calls)
188+
189+
assert not was_called, (
190+
"Expected 'connection.execute' to not have been "
191+
"called with a string containing '%s'" % sql_str
192+
)
193+
130194

131195
class TestGetAllResourcesIdsInDatastore(object):
132196
@pytest.mark.ckan_config(u"ckan.plugins", u"datastore")

0 commit comments

Comments
 (0)