diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py index 498b59d98..9a4b49608 100644 --- a/databuilder/extractor/neo4j_search_data_extractor.py +++ b/databuilder/extractor/neo4j_search_data_extractor.py @@ -24,7 +24,8 @@ class Neo4jSearchDataExtractor(Extractor): OPTIONAL MATCH (table)-[read:READ_BY]->(user:User) OPTIONAL MATCH (table)-[:COLUMN]->(cols:Column) OPTIONAL MATCH (cols)-[:DESCRIPTION]->(col_description:Description) - OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) + OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default' + OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge' OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp) RETURN db.name as database, cluster.name AS cluster, schema.name AS schema, table.name AS name, table.key AS key, table_description.description AS description, @@ -33,7 +34,8 @@ class Neo4jSearchDataExtractor(Extractor): EXTRACT(cd IN COLLECT(DISTINCT col_description)| cd.description) AS column_descriptions, REDUCE(sum_r = 0, r in COLLECT(DISTINCT read)| sum_r + r.read_count) AS total_usage, COUNT(DISTINCT user.email) as unique_usage, - COLLECT(DISTINCT tags.key) as tags + COLLECT(DISTINCT tags.key) as tags, + COLLECT(DISTINCT badges.key) as badges ORDER BY table.name; """ ) diff --git a/databuilder/models/table_elasticsearch_document.py b/databuilder/models/table_elasticsearch_document.py index da7a797e8..577256cf3 100644 --- a/databuilder/models/table_elasticsearch_document.py +++ b/databuilder/models/table_elasticsearch_document.py @@ -19,7 +19,8 @@ def __init__(self, column_descriptions, # type: List[str] total_usage, # type: int unique_usage, # type: int - tags, # type: List[str] + tags, # type: List[str] + badges=None, # type: Optional[List[str]] display_name=None, # type: Optional[str] ): # type: (...) 
-> None @@ -38,3 +39,4 @@ def __init__(self, self.unique_usage = unique_usage # todo: will include tag_type once we have better understanding from UI flow. self.tags = tags + self.badges = badges diff --git a/databuilder/publisher/elasticsearch_publisher.py b/databuilder/publisher/elasticsearch_publisher.py index 21cdc68ac..97c8da029 100644 --- a/databuilder/publisher/elasticsearch_publisher.py +++ b/databuilder/publisher/elasticsearch_publisher.py @@ -36,6 +36,7 @@ class ElasticsearchPublisher(Publisher): # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simple-analyzer.html # Standard Analyzer is used for all text fields that don't explicitly specify an analyzer # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html + # TODO use amundsencommon for this when this project is updated to py3 DEFAULT_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent( """ { @@ -87,6 +88,9 @@ class ElasticsearchPublisher(Publisher): "tags": { "type": "keyword" }, + "badges": { + "type": "keyword" + }, "cluster": { "type": "text" }, diff --git a/requirements.txt b/requirements.txt index 9feecab75..1c8744f22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,8 +56,6 @@ statsd==3.2.1 retrying==1.3.3 unicodecsv==0.14.1,<1.0 - - httplib2~=0.9.2 unidecode diff --git a/tests/unit/extractor/test_neo4j_extractor.py b/tests/unit/extractor/test_neo4j_extractor.py index 180cab11e..7e14f372e 100644 --- a/tests/unit/extractor/test_neo4j_extractor.py +++ b/tests/unit/extractor/test_neo4j_extractor.py @@ -112,7 +112,8 @@ def test_extraction_with_model_class(self): column_descriptions=['test_description1', 'test_description2', ''], total_usage=100, unique_usage=5, - tags=['hive']) + tags=['hive'], + badges=['badge1']) extractor.results = [result_dict] result_obj = extractor.extract() diff --git a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py index 
ba035a5b3..7781e1c69 100644 --- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py +++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py @@ -100,7 +100,8 @@ def test_loading_with_single_object(self): column_descriptions=['test_comment1', 'test_comment2'], total_usage=10, unique_usage=5, - tags=['test_tag1', 'test_tag2']) + tags=['test_tag1', 'test_tag2'], + badges=['badge1']) loader.load(data) loader.close() @@ -110,7 +111,7 @@ def test_loading_with_single_object(self): '"column_names": ["test_col1", "test_col2"], "name": "test_table", ' '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", ' '"description": "test_description", "unique_usage": 5, "total_usage": 10, ' - '"tags": ["test_tag1", "test_tag2"]}') + '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}') ] self._check_results_helper(expected=expected) @@ -136,7 +137,8 @@ def test_loading_with_list_of_objects(self): column_descriptions=['test_comment1', 'test_comment2'], total_usage=10, unique_usage=5, - tags=['test_tag1', 'test_tag2'])] * 5 + tags=['test_tag1', 'test_tag2'], + badges=['badge1'])] * 5 for d in data: loader.load(d) @@ -148,7 +150,7 @@ def test_loading_with_list_of_objects(self): '"column_names": ["test_col1", "test_col2"], "name": "test_table", ' '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", ' '"description": "test_description", "unique_usage": 5, "total_usage": 10, ' - '"tags": ["test_tag1", "test_tag2"]}') + '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}') ] * 5 self._check_results_helper(expected=expected) diff --git a/tests/unit/models/test_table_elasticsearch_document.py b/tests/unit/models/test_table_elasticsearch_document.py index 6f38324c0..9ddea689f 100644 --- a/tests/unit/models/test_table_elasticsearch_document.py +++ b/tests/unit/models/test_table_elasticsearch_document.py @@ -22,7 +22,8 @@ def test_to_json(self): column_descriptions=['test_description1', 
'test_description2'], total_usage=100, unique_usage=10, - tags=['test']) + tags=['test'], + badges=['badge1']) expected_document_dict = {"database": "test_database", "cluster": "test_cluster", @@ -36,7 +37,8 @@ def test_to_json(self): "column_descriptions": ["test_description1", "test_description2"], "total_usage": 100, "unique_usage": 10, - "tags": ["test"] + "tags": ["test"], + "badges": ["badge1"] } result = test_obj.to_json()