Add badges to Neo4jExtractor and elastic search (#204)

* Add badges to Neo4jSearchExtractor * update publisher to have badges * update elastic search document * fix typo * update name * filter tags by type * typo * do not filter tags because then i can't get badges on staging :| * update tests * fix tests * use amunsen_common for elastic search index * revert commit using amundsencommon * add comment * make backwards compatible * remove badges from tags
amundsen-io · Mar 3, 2020 · 33fd3be · 33fd3be
1 parent 7b1d55a
commit 33fd3be
Show file tree

Hide file tree

Showing 7 changed files with 23 additions and 12 deletions.
diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -24,7 +24,8 @@ class Neo4jSearchDataExtractor(Extractor):
         OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
         OPTIONAL MATCH (table)-[:COLUMN]->(cols:Column)
         OPTIONAL MATCH (cols)-[:DESCRIPTION]->(col_description:Description)
-        OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag)
+        OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
+        OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
         OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
         RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
         table.name AS name, table.key AS key, table_description.description AS description,
@@ -33,7 +34,8 @@ class Neo4jSearchDataExtractor(Extractor):
         EXTRACT(cd IN COLLECT(DISTINCT col_description)| cd.description) AS column_descriptions,
         REDUCE(sum_r = 0, r in COLLECT(DISTINCT read)| sum_r + r.read_count) AS total_usage,
         COUNT(DISTINCT user.email) as unique_usage,
-        COLLECT(DISTINCT tags.key) as tags
+        COLLECT(DISTINCT tags.key) as tags,
+        COLLECT(DISTINCT badges.key) as badges
         ORDER BY table.name;
         """
     )

diff --git a/databuilder/models/table_elasticsearch_document.py b/databuilder/models/table_elasticsearch_document.py
@@ -19,7 +19,8 @@ def __init__(self,
                  column_descriptions,  # type: List[str]
                  total_usage,  # type: int
                  unique_usage,  # type: int
-                 tags,  # type: List[str]
+                 tags,  # type: List[str],
+                 badges=None,  # type: Optional[List[str]]
                  display_name=None,  # type: Optional[str]
                  ):
         # type: (...) -> None
@@ -38,3 +39,4 @@ def __init__(self,
         self.unique_usage = unique_usage
         # todo: will include tag_type once we have better understanding from UI flow.
         self.tags = tags
+        self.badges = badges
diff --git a/databuilder/publisher/elasticsearch_publisher.py b/databuilder/publisher/elasticsearch_publisher.py
@@ -36,6 +36,7 @@ class ElasticsearchPublisher(Publisher):
     # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simple-analyzer.html
     # Standard Analyzer is used for all text fields that don't explicitly specify an analyzer
     # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html
+    # TODO use amundsencommon for this when this project is updated to py3
     DEFAULT_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent(
         """
         {
@@ -87,6 +88,9 @@ class ElasticsearchPublisher(Publisher):
                 "tags": {
                   "type": "keyword"
                 },
+                "badges": {
+                  "type": "keyword"
+                },
                 "cluster": {
                   "type": "text"
                 },

diff --git a/requirements.txt b/requirements.txt
@@ -56,8 +56,6 @@ statsd==3.2.1
 retrying==1.3.3
 unicodecsv==0.14.1,<1.0
 
-
-
 httplib2~=0.9.2
 unidecode
 
diff --git a/tests/unit/extractor/test_neo4j_extractor.py b/tests/unit/extractor/test_neo4j_extractor.py
@@ -112,7 +112,8 @@ def test_extraction_with_model_class(self):
                                column_descriptions=['test_description1', 'test_description2', ''],
                                total_usage=100,
                                unique_usage=5,
-                               tags=['hive'])
+                               tags=['hive'],
+                               badges=['badge1'])
 
             extractor.results = [result_dict]
             result_obj = extractor.extract()

diff --git a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
@@ -100,7 +100,8 @@ def test_loading_with_single_object(self):
                                column_descriptions=['test_comment1', 'test_comment2'],
                                total_usage=10,
                                unique_usage=5,
-                               tags=['test_tag1', 'test_tag2'])
+                               tags=['test_tag1', 'test_tag2'],
+                               badges=['badge1'])
         loader.load(data)
         loader.close()
 
@@ -110,7 +111,7 @@ def test_loading_with_single_object(self):
              '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
              '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
              '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"]}')
+             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
         ]
 
         self._check_results_helper(expected=expected)
@@ -136,7 +137,8 @@ def test_loading_with_list_of_objects(self):
                                 column_descriptions=['test_comment1', 'test_comment2'],
                                 total_usage=10,
                                 unique_usage=5,
-                                tags=['test_tag1', 'test_tag2'])] * 5
+                                tags=['test_tag1', 'test_tag2'],
+                                badges=['badge1'])] * 5
 
         for d in data:
             loader.load(d)
@@ -148,7 +150,7 @@ def test_loading_with_list_of_objects(self):
              '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
              '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
              '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"]}')
+             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
         ] * 5
 
         self._check_results_helper(expected=expected)
diff --git a/tests/unit/models/test_table_elasticsearch_document.py b/tests/unit/models/test_table_elasticsearch_document.py
@@ -22,7 +22,8 @@ def test_to_json(self):
                                    column_descriptions=['test_description1', 'test_description2'],
                                    total_usage=100,
                                    unique_usage=10,
-                                   tags=['test'])
+                                   tags=['test'],
+                                   badges=['badge1'])
 
         expected_document_dict = {"database": "test_database",
                                   "cluster": "test_cluster",
@@ -36,7 +37,8 @@ def test_to_json(self):
                                   "column_descriptions": ["test_description1", "test_description2"],
                                   "total_usage": 100,
                                   "unique_usage": 10,
-                                  "tags": ["test"]
+                                  "tags": ["test"],
+                                  "badges": ["badge1"]
                                   }
 
         result = test_obj.to_json()