Skip to content

Commit

Permalink
Add badges to Neo4jExtractor and elastic search (#204)
Browse files Browse the repository at this point in the history
* Add badges to Neo4jSearchExtractor

* update publisher to have badges

* update elastic search document

* fix typo

* update name

* filter tags by type

* typo

* do not filter tags because then i can't get badges on staging :|

* update tests

* fix tests

* use amunsen_common for elastic search index

* revert commit using amundsencommon

* add comment

* make backwards compatible

* remove badges from tags
  • Loading branch information
csteez authored Mar 3, 2020
1 parent 7b1d55a commit 33fd3be
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 12 deletions.
6 changes: 4 additions & 2 deletions databuilder/extractor/neo4j_search_data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ class Neo4jSearchDataExtractor(Extractor):
OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
OPTIONAL MATCH (table)-[:COLUMN]->(cols:Column)
OPTIONAL MATCH (cols)-[:DESCRIPTION]->(col_description:Description)
OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag)
OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
table.name AS name, table.key AS key, table_description.description AS description,
Expand All @@ -33,7 +34,8 @@ class Neo4jSearchDataExtractor(Extractor):
EXTRACT(cd IN COLLECT(DISTINCT col_description)| cd.description) AS column_descriptions,
REDUCE(sum_r = 0, r in COLLECT(DISTINCT read)| sum_r + r.read_count) AS total_usage,
COUNT(DISTINCT user.email) as unique_usage,
COLLECT(DISTINCT tags.key) as tags
COLLECT(DISTINCT tags.key) as tags,
COLLECT(DISTINCT badges.key) as badges
ORDER BY table.name;
"""
)
Expand Down
4 changes: 3 additions & 1 deletion databuilder/models/table_elasticsearch_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def __init__(self,
column_descriptions, # type: List[str]
total_usage, # type: int
unique_usage, # type: int
tags, # type: List[str]
tags, # type: List[str],
badges=None, # type: Optional[List[str]]
display_name=None, # type: Optional[str]
):
# type: (...) -> None
Expand All @@ -38,3 +39,4 @@ def __init__(self,
self.unique_usage = unique_usage
# todo: will include tag_type once we have better understanding from UI flow.
self.tags = tags
self.badges = badges
4 changes: 4 additions & 0 deletions databuilder/publisher/elasticsearch_publisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class ElasticsearchPublisher(Publisher):
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simple-analyzer.html
# Standard Analyzer is used for all text fields that don't explicitly specify an analyzer
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html
# TODO use amundsencommon for this when this project is updated to py3
DEFAULT_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent(
"""
{
Expand Down Expand Up @@ -87,6 +88,9 @@ class ElasticsearchPublisher(Publisher):
"tags": {
"type": "keyword"
},
"badges": {
"type": "keyword"
},
"cluster": {
"type": "text"
},
Expand Down
2 changes: 0 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ statsd==3.2.1
retrying==1.3.3
unicodecsv==0.14.1,<1.0



httplib2~=0.9.2
unidecode

3 changes: 2 additions & 1 deletion tests/unit/extractor/test_neo4j_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def test_extraction_with_model_class(self):
column_descriptions=['test_description1', 'test_description2', ''],
total_usage=100,
unique_usage=5,
tags=['hive'])
tags=['hive'],
badges=['badge1'])

extractor.results = [result_dict]
result_obj = extractor.extract()
Expand Down
10 changes: 6 additions & 4 deletions tests/unit/loader/test_file_system_elasticsearch_json_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ def test_loading_with_single_object(self):
column_descriptions=['test_comment1', 'test_comment2'],
total_usage=10,
unique_usage=5,
tags=['test_tag1', 'test_tag2'])
tags=['test_tag1', 'test_tag2'],
badges=['badge1'])
loader.load(data)
loader.close()

Expand All @@ -110,7 +111,7 @@ def test_loading_with_single_object(self):
'"column_names": ["test_col1", "test_col2"], "name": "test_table", '
'"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
'"description": "test_description", "unique_usage": 5, "total_usage": 10, '
'"tags": ["test_tag1", "test_tag2"]}')
'"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
]

self._check_results_helper(expected=expected)
Expand All @@ -136,7 +137,8 @@ def test_loading_with_list_of_objects(self):
column_descriptions=['test_comment1', 'test_comment2'],
total_usage=10,
unique_usage=5,
tags=['test_tag1', 'test_tag2'])] * 5
tags=['test_tag1', 'test_tag2'],
badges=['badge1'])] * 5

for d in data:
loader.load(d)
Expand All @@ -148,7 +150,7 @@ def test_loading_with_list_of_objects(self):
'"column_names": ["test_col1", "test_col2"], "name": "test_table", '
'"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
'"description": "test_description", "unique_usage": 5, "total_usage": 10, '
'"tags": ["test_tag1", "test_tag2"]}')
'"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
] * 5

self._check_results_helper(expected=expected)
6 changes: 4 additions & 2 deletions tests/unit/models/test_table_elasticsearch_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def test_to_json(self):
column_descriptions=['test_description1', 'test_description2'],
total_usage=100,
unique_usage=10,
tags=['test'])
tags=['test'],
badges=['badge1'])

expected_document_dict = {"database": "test_database",
"cluster": "test_cluster",
Expand All @@ -36,7 +37,8 @@ def test_to_json(self):
"column_descriptions": ["test_description1", "test_description2"],
"total_usage": 100,
"unique_usage": 10,
"tags": ["test"]
"tags": ["test"],
"badges": ["badge1"]
}

result = test_obj.to_json()
Expand Down

0 comments on commit 33fd3be

Please sign in to comment.