Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use top-level keyword fields instead of subfields #3161

Merged
merged 20 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions api/api/controllers/search_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,12 +350,14 @@ def search(
("extension", None),
("category", None),
("categories", "category"),
("source", None),
("license", None),
("license_type", "license"),
# Audio-specific filters
("length", None),
# Image-specific filters
("aspect_ratio", None),
("size", None),
("source", None),
("license", "license__keyword"),
("license_type", "license__keyword"),
]
for serializer_field, es_field in filters:
if serializer_field in search_params.data:
Expand Down Expand Up @@ -512,9 +514,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:

# Search the default index for the item itself as it might be sensitive.
item_search = Search(index=index)
# TODO: remove `__keyword` after
# https://github.com/WordPress/openverse/pull/3143 is merged.
item_hit = item_search.query(Term(identifier__keyword=uuid)).execute().hits[0]
item_hit = item_search.query(Term(identifier=uuid)).execute().hits[0]

# Match related using title.
title = item_hit.title
Expand All @@ -539,9 +539,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
s = Search(index=f"{index}-filtered")

# Exclude the current item and mature content.
# TODO: remove `__keyword` after
# https://github.com/WordPress/openverse/pull/3143 is merged.
s = s.query(related_query & ~Term(identifier__keyword=uuid) & ~Term(mature=True))
s = s.query(related_query & ~Term(identifier=uuid) & ~Term(mature=True))
# Exclude the dynamically disabled sources.
s = _exclude_filtered(s)

Expand Down Expand Up @@ -579,7 +577,7 @@ def get_sources(index):
aggs = {
"unique_sources": {
"terms": {
"field": "source.keyword",
"field": "source",
"size": size,
"order": {"_key": "desc"},
}
Expand Down
8 changes: 1 addition & 7 deletions api/api/utils/search_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,7 @@ def build(
# Use `identifier` rather than the document `id` due to
# `id` instability between refreshes:
# https://github.com/WordPress/openverse/issues/2306
# `identifier` is mapped as `text` which will match fuzzily.
# Use `identifier.keyword` to match _exactly_
# cf: https://github.com/WordPress/openverse/issues/2154
Q(
"terms",
**{"identifier.keyword": all_result_identifiers},
)
Q("terms", identifier=all_result_identifiers)
)

# The default query size is 10, so we need to slice the query
Expand Down
65 changes: 29 additions & 36 deletions ingestion_server/ingestion_server/elasticsearch_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,38 @@ def get_instance_attrs(row, schema):
# cleanup tests in CI: test/unit_tests/test_cleanup.py
category = row[schema["category"]] if "category" in schema else None

provider = row[schema["provider"]]
authority_boost = Media.get_authority_boost(meta, provider)

# This matches the order of fields defined in ``es_mapping.py``.
return {
"_id": row[schema["id"]],
"id": row[schema["id"]],
"created_on": row[schema["created_on"]],
"mature": Media.get_maturity(meta, row[schema["mature"]]),
# Keyword fields
"identifier": row[schema["identifier"]],
"license": row[schema["license"]].lower(),
"provider": provider,
"source": row[schema["source"]],
"category": category,
# Text-based fields
"title": row[schema["title"]],
"foreign_landing_url": row[schema["foreign_landing_url"]],
"description": Media.parse_description(meta),
"creator": row[schema["creator"]],
"creator_url": row[schema["creator_url"]],
# Rank feature fields
"standardized_popularity": popularity,
"authority_boost": authority_boost,
"max_boost": max(popularity or 1, authority_boost or 1),
"min_boost": min(popularity or 1, authority_boost or 1),
# Nested fields
"tags": Media.parse_detailed_tags(row[schema["tags"]]),
# Extra fields, not indexed
"url": row[schema["url"]],
"license": row[schema["license"]].lower(),
"foreign_landing_url": row[schema["foreign_landing_url"]],
"creator_url": row[schema["creator_url"]],
"license_version": row[schema["license_version"]],
"license_url": Media.get_license_url(meta),
"provider": row[schema["provider"]],
"source": row[schema["source"]],
"category": category,
"created_on": row[schema["created_on"]],
"tags": Media.parse_detailed_tags(row[schema["tags"]]),
"mature": Media.get_maturity(meta, row[schema["mature"]]),
"standardized_popularity": popularity,
}

@staticmethod
Expand Down Expand Up @@ -187,7 +199,7 @@ def get_popularity(raw):
@staticmethod
def parse_detailed_tags(json_tags):
if not json_tags:
return None
return []
parsed_tags = []
for tag in json_tags:
if "name" in tag:
Expand Down Expand Up @@ -230,28 +242,18 @@ class Index:
@staticmethod
def database_row_to_elasticsearch_doc(row, schema):
extension = Image.get_extension(row[schema["url"]])

height = row[schema["height"]]
width = row[schema["width"]]
aspect_ratio = Image.get_aspect_ratio(height, width)
size = Image.get_size(height, width)

meta = row[schema["meta_data"]]
provider = row[schema["provider"]]
authority_boost = Image.get_authority_boost(meta, provider)

attrs = Image.get_instance_attrs(row, schema)
attrs["category"] = attrs["category"]
popularity = attrs["standardized_popularity"]

return Image(
thumbnail=row[schema["thumbnail"]],
aspect_ratio=aspect_ratio,
extension=extension,
size=size,
authority_boost=authority_boost,
max_boost=max(popularity or 1, authority_boost or 1),
min_boost=min(popularity or 1, authority_boost or 1),
# Extra fields, not indexed
thumbnail=row[schema["thumbnail"]],
**attrs,
)

Expand Down Expand Up @@ -319,27 +321,18 @@ def database_row_to_elasticsearch_doc(row, schema):
alt_files = row[schema["alt_files"]]
filetype = row[schema["filetype"]]
extension = Audio.get_extensions(filetype, alt_files)

meta = row[schema["meta_data"]]
provider = row[schema["provider"]]
authority_boost = Audio.get_authority_boost(meta, provider)

attrs = Audio.get_instance_attrs(row, schema)
popularity = attrs["standardized_popularity"]

length = Audio.get_length(row[schema["duration"]])

return Audio(
length=length,
filetype=filetype,
extension=extension,
# Extra fields, not indexed
bit_rate=row[schema["bit_rate"]],
sample_rate=row[schema["sample_rate"]],
genres=row[schema["genres"]],
duration=row[schema["duration"]],
length=length,
filetype=filetype,
extension=extension,
authority_boost=authority_boost,
max_boost=max(popularity or 1, authority_boost or 1),
min_boost=min(popularity or 1, authority_boost or 1),
**attrs,
)

Expand Down
121 changes: 55 additions & 66 deletions ingestion_server/ingestion_server/es_mapping.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
def index_settings(table_name):
from ingestion_server.constants.media_types import AUDIO_TYPE, IMAGE_TYPE, MediaType


def index_settings(media_type: MediaType):
"""
Return the Elasticsearch mapping for a given table in the database.

:param table_name: The name of the table in the upstream database.
:return:
:param media_type: The name of the table in the upstream database.
:return: the settings for the ES mapping
"""

number_of_shards: dict[MediaType, int] = {
IMAGE_TYPE: 18,
AUDIO_TYPE: 1,
}

settings = {
"index": {
"number_of_shards": 18,
"number_of_shards": number_of_shards[media_type],
"number_of_replicas": 0,
"refresh_interval": "-1",
},
Expand Down Expand Up @@ -51,109 +60,89 @@ def index_settings(table_name):
},
}
common_mappings = {
"dynamic": False, # extra fields are stored in ``_source`` but not indexed
"properties": {
"id": {"type": "long"},
"created_on": {"type": "date"},
"mature": {"type": "boolean"},
# Keyword fields
"identifier": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"extension": {"type": "keyword"},
"license": {
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"provider": {"type": "keyword"},
"source": {
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"filetype": {"type": "keyword"},
"category": {"type": "keyword"},
# Text-based fields
"title": {
"type": "text",
"analyzer": "custom_english",
"similarity": "boolean",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"raw": {"type": "text", "index": True},
},
"analyzer": "custom_english",
},
"foreign_landing_url": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
},
"description": {
"fields": {
"keyword": {"type": "keyword", "similarity": "boolean"},
"raw": {"type": "text", "index": True},
},
"type": "text",
"analyzer": "custom_english",
"similarity": "boolean",
"fields": {"raw": {"type": "text", "index": True}},
},
"creator": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"url": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
},
"extension": {
"fields": {"keyword": {"ignore_above": 8, "type": "keyword"}},
"type": "text",
},
"license": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
},
"license_version": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"license_url": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
},
"provider": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"source": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
# Rank feature fields
"standardized_popularity": {"type": "rank_feature"},
"authority_boost": {"type": "rank_feature"},
"authority_penalty": {
"type": "rank_feature",
"positive_score_impact": False,
},
"filetype": {"type": "keyword"},
"created_on": {"type": "date"},
"max_boost": {"type": "rank_feature"},
"min_boost": {"type": "rank_feature"},
# Nested fields
"tags": {
"properties": {
"accuracy": {"type": "float"},
# Text-based fields
"name": {
"type": "text",
"analyzer": "custom_english",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"raw": {"type": "text", "index": True},
},
"analyzer": "custom_english",
},
}
},
"mature": {"type": "boolean"},
"standardized_popularity": {"type": "rank_feature"},
"authority_boost": {"type": "rank_feature"},
"authority_penalty": {
"type": "rank_feature",
"positive_score_impact": False,
},
"max_boost": {"type": "rank_feature"},
"min_boost": {"type": "rank_feature"},
"category": {"type": "keyword"},
}
},
}
media_properties = {
"image": {
"aspect_ratio": {
"fields": {"keyword": {"type": "keyword"}},
"type": "text",
},
"size": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
# Keyword fields
"aspect_ratio": {"type": "keyword"},
"size": {"type": "keyword"},
},
"audio": {
"bit_rate": {"type": "integer"},
"sample_rate": {"type": "integer"},
"genres": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
"duration": {"type": "integer"},
# Keyword fields
"length": {"type": "keyword"},
},
}
media_mappings = common_mappings.copy()
media_mappings["properties"].update(media_properties[table_name])
media_mappings["properties"].update(media_properties[media_type])
result = {"settings": settings.copy(), "mappings": media_mappings}
return result