Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce ES shard count and simplify index properties #3143

Merged
merged 16 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 29 additions & 36 deletions ingestion_server/ingestion_server/elasticsearch_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,38 @@ def get_instance_attrs(row, schema):
# cleanup tests in CI: test/unit_tests/test_cleanup.py
category = row[schema["category"]] if "category" in schema else None

provider = row[schema["provider"]]
authority_boost = Media.get_authority_boost(meta, provider)

# This matches the order of fields defined in ``es_mapping.py``.
return {
"_id": row[schema["id"]],
"id": row[schema["id"]],
"created_on": row[schema["created_on"]],
"mature": Media.get_maturity(meta, row[schema["mature"]]),
# Keyword fields
"identifier": row[schema["identifier"]],
"license": row[schema["license"]].lower(),
"provider": provider,
"source": row[schema["source"]],
"category": category,
# Text-based fields
"title": row[schema["title"]],
"foreign_landing_url": row[schema["foreign_landing_url"]],
"description": Media.parse_description(meta),
"creator": row[schema["creator"]],
"creator_url": row[schema["creator_url"]],
# Rank feature fields
"standardized_popularity": popularity,
"authority_boost": authority_boost,
"max_boost": max(popularity or 1, authority_boost or 1),
"min_boost": min(popularity or 1, authority_boost or 1),
# Nested fields
"tags": Media.parse_detailed_tags(row[schema["tags"]]),
# Extra fields, not indexed
"url": row[schema["url"]],
"license": row[schema["license"]].lower(),
"foreign_landing_url": row[schema["foreign_landing_url"]],
"creator_url": row[schema["creator_url"]],
"license_version": row[schema["license_version"]],
"license_url": Media.get_license_url(meta),
"provider": row[schema["provider"]],
"source": row[schema["source"]],
"category": category,
"created_on": row[schema["created_on"]],
"tags": Media.parse_detailed_tags(row[schema["tags"]]),
"mature": Media.get_maturity(meta, row[schema["mature"]]),
"standardized_popularity": popularity,
}

@staticmethod
Expand Down Expand Up @@ -187,7 +199,7 @@ def get_popularity(raw):
@staticmethod
def parse_detailed_tags(json_tags):
if not json_tags:
return None
return []
parsed_tags = []
for tag in json_tags:
if "name" in tag:
Expand Down Expand Up @@ -230,28 +242,18 @@ class Index:
@staticmethod
def database_row_to_elasticsearch_doc(row, schema):
extension = Image.get_extension(row[schema["url"]])

height = row[schema["height"]]
width = row[schema["width"]]
aspect_ratio = Image.get_aspect_ratio(height, width)
size = Image.get_size(height, width)

meta = row[schema["meta_data"]]
provider = row[schema["provider"]]
authority_boost = Image.get_authority_boost(meta, provider)

attrs = Image.get_instance_attrs(row, schema)
attrs["category"] = attrs["category"]
popularity = attrs["standardized_popularity"]

return Image(
thumbnail=row[schema["thumbnail"]],
aspect_ratio=aspect_ratio,
extension=extension,
size=size,
authority_boost=authority_boost,
max_boost=max(popularity or 1, authority_boost or 1),
min_boost=min(popularity or 1, authority_boost or 1),
# Extra fields, not indexed
thumbnail=row[schema["thumbnail"]],
**attrs,
)

Expand Down Expand Up @@ -319,27 +321,18 @@ def database_row_to_elasticsearch_doc(row, schema):
alt_files = row[schema["alt_files"]]
filetype = row[schema["filetype"]]
extension = Audio.get_extensions(filetype, alt_files)

meta = row[schema["meta_data"]]
provider = row[schema["provider"]]
authority_boost = Audio.get_authority_boost(meta, provider)

attrs = Audio.get_instance_attrs(row, schema)
popularity = attrs["standardized_popularity"]

length = Audio.get_length(row[schema["duration"]])

return Audio(
length=length,
filetype=filetype,
extension=extension,
# Extra fields, not indexed
bit_rate=row[schema["bit_rate"]],
sample_rate=row[schema["sample_rate"]],
genres=row[schema["genres"]],
duration=row[schema["duration"]],
length=length,
filetype=filetype,
extension=extension,
authority_boost=authority_boost,
max_boost=max(popularity or 1, authority_boost or 1),
min_boost=min(popularity or 1, authority_boost or 1),
**attrs,
)

Expand Down
121 changes: 55 additions & 66 deletions ingestion_server/ingestion_server/es_mapping.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
def index_settings(table_name):
from ingestion_server.constants.media_types import AUDIO_TYPE, IMAGE_TYPE, MediaType


def index_settings(media_type: MediaType):
"""
Return the Elasticsearch mapping for a given table in the database.

:param table_name: The name of the table in the upstream database.
:return:
:param media_type: The name of the table in the upstream database.
:return: the settings for the ES mapping
"""

number_of_shards: dict[MediaType, int] = {
IMAGE_TYPE: 18,
AUDIO_TYPE: 1,
}

dhruvkb marked this conversation as resolved.
Show resolved Hide resolved
settings = {
"index": {
"number_of_shards": 18,
"number_of_shards": number_of_shards[media_type],
"number_of_replicas": 0,
"refresh_interval": "-1",
},
Expand Down Expand Up @@ -51,109 +60,89 @@ def index_settings(table_name):
},
}
common_mappings = {
"dynamic": False, # extra fields are stored in ``_source`` but not indexed
"properties": {
"id": {"type": "long"},
"created_on": {"type": "date"},
"mature": {"type": "boolean"},
# Keyword fields
"identifier": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"extension": {"type": "keyword"},
"license": {
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"provider": {"type": "keyword"},
"source": {
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"filetype": {"type": "keyword"},
"category": {"type": "keyword"},
# Text-based fields
"title": {
"type": "text",
"analyzer": "custom_english",
"similarity": "boolean",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"raw": {"type": "text", "index": True},
},
"analyzer": "custom_english",
},
"foreign_landing_url": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
},
"description": {
"fields": {
"keyword": {"type": "keyword", "similarity": "boolean"},
"raw": {"type": "text", "index": True},
},
"type": "text",
"analyzer": "custom_english",
"similarity": "boolean",
"fields": {"raw": {"type": "text", "index": True}},
},
"creator": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"url": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
},
"extension": {
"fields": {"keyword": {"ignore_above": 8, "type": "keyword"}},
"type": "text",
},
"license": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
},
"license_version": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"license_url": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
},
"provider": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"source": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
# Rank feature fields
"standardized_popularity": {"type": "rank_feature"},
"authority_boost": {"type": "rank_feature"},
"authority_penalty": {
"type": "rank_feature",
"positive_score_impact": False,
},
"filetype": {"type": "keyword"},
"created_on": {"type": "date"},
"max_boost": {"type": "rank_feature"},
"min_boost": {"type": "rank_feature"},
# Nested fields
"tags": {
"properties": {
"accuracy": {"type": "float"},
# Text-based fields
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think setting it to keyword-only would greatly reduce the size of the index. It would change the search by tags a little bit: for example, if you search for "draw", you would probably not get media that has the "drawing" tag. But do we really want it to match like that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an important point. We could convert it into keywords (and potentially use stems so that 'drawing' and 'draw' evaluate to the same keyword). I'll let others chime in on this discussion as well.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we maybe see how this set of changes goes, then consider moving tags to keywords as a follow-up? I agree that keywords + stemming could be useful (if that's a distinction we can make), but it would be a significant change to the way our search currently operates which leaves me a little hesitant.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, we can keep this possibility open as a future optimisation.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When discussing the collection views, we explicitly said that tag stemming was desirable. The example we used was that the tag "dogs" should match "dog". Any other approach doesn't make much sense and would confound users (and ourselves, I have to think!).

"name": {
"type": "text",
"analyzer": "custom_english",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"raw": {"type": "text", "index": True},
},
"analyzer": "custom_english",
},
}
},
"mature": {"type": "boolean"},
"standardized_popularity": {"type": "rank_feature"},
"authority_boost": {"type": "rank_feature"},
"authority_penalty": {
"type": "rank_feature",
"positive_score_impact": False,
},
"max_boost": {"type": "rank_feature"},
"min_boost": {"type": "rank_feature"},
"category": {"type": "keyword"},
}
},
}
media_properties = {
"image": {
"aspect_ratio": {
"fields": {"keyword": {"type": "keyword"}},
"type": "text",
},
"size": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
# Keyword fields
"aspect_ratio": {"type": "keyword"},
"size": {"type": "keyword"},
},
"audio": {
"bit_rate": {"type": "integer"},
"sample_rate": {"type": "integer"},
"genres": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
"duration": {"type": "integer"},
dhruvkb marked this conversation as resolved.
Show resolved Hide resolved
# Keyword fields
"length": {"type": "keyword"},
},
}
media_mappings = common_mappings.copy()
media_mappings["properties"].update(media_properties[table_name])
media_mappings["properties"].update(media_properties[media_type])
result = {"settings": settings.copy(), "mappings": media_mappings}
return result