From 86c05750c8f7b42ca238244196b2ceb5ab80574e Mon Sep 17 00:00:00 2001 From: Robert Dempsey Date: Fri, 29 Apr 2016 09:52:00 -0400 Subject: [PATCH 1/8] Rule matching patch part 1 Refactored the code to move most of the rule dictionary processing before Traptor adds a rule tag and value to a tweet. Sped things up nicely. More to come! --- traptor/traptor.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/traptor/traptor.py b/traptor/traptor.py index 309145d..7015640 100644 --- a/traptor/traptor.py +++ b/traptor/traptor.py @@ -208,8 +208,8 @@ def _make_twitter_rules(self, rules): def _add_rule_tag_and_value_to_tweet(self, tweet_dict, search_str, matched_rule): - for k, v in FlatDict(tweet_dict).iteritems(): - if isinstance(v, unicode) and search_str.lower() in v.lower(): + for k, v in tweet_dict.iteritems(): + if isinstance(v, unicode) and search_str in v: # These two lines kept for backwards compatibility tweet_dict['traptor']['rule_tag'] = matched_rule['tag'] tweet_dict['traptor']['rule_value'] = matched_rule['value'] @@ -241,10 +241,19 @@ def _find_rule_matches(self, tweet_dict): for rule in self.redis_rules: search_str = rule['value'] + search_str = search_str.lower() # self.logger.debug("Search string used for the rule match: {}".format(search_str.encode('utf-8'))) if re.search(',', search_str): for s in search_str.split(','): - new_dict = self._add_rule_tag_and_value_to_tweet(new_dict, s, rule) + # Lower the search string + s = s.lower() + # Lowercase everything in the dict and flatten it out + new_dict = dict((k.lower(), v.lower()) for k, v in new_dict.iteritems()) + new_dict = FlatDict(new_dict) + # Add the rule to the tweet + new_dict = self._add_rule_tag_and_value_to_tweet(new_dict, + s, + rule) else: search_str = rule['value'].split()[0] for i in new_dict.keys(): From 36c6af83013623fe37beb44a74e002be92508457 Mon Sep 17 00:00:00 2001 From: Robert Dempsey Date: Fri, 29 Apr 2016 10:54:56 -0400 Subject: [PATCH 2/8] Make tweet data readable, fix rule matching Updated rule matching to not worry about comma-separated rules. --- tests/data/follow_tweet.json | 90 +++++++++++++++++++++++- tests/data/locations_tweet.json | 118 +++++++++++++++++++++++++++++++- tests/data/track_tweet.json | 92 ++++++++++++++++++++++++- tests/test_traptor_offline.py | 39 +---------- traptor/traptor.py | 32 ++++----- 5 files changed, 315 insertions(+), 56 deletions(-) diff --git a/tests/data/follow_tweet.json b/tests/data/follow_tweet.json index 5789cfd..aeb97ce 100644 --- a/tests/data/follow_tweet.json +++ b/tests/data/follow_tweet.json @@ -1 +1,89 @@ -{"favorited": false, "contributors": null, "truncated": false, "text": "I'm very happy to say the next single from 1989 will be 'New Romantics'.", "is_quote_status": false, "in_reply_to_status_id": null, "user": {"follow_request_sent": false, "has_extended_profile": false, "profile_use_background_image": false, "time_zone": "Dublin", "id": 17919972, "description": "Born in 1989.", "verified": true, "entities": {"url": {"urls": [{"url": "https://t.co/blhi4NNEJr", "indices": [0, 23], "expanded_url": "http://smarturl.it/1989TourLIVE", "display_url": "smarturl.it/1989TourLIVE"}]}, "description": {"urls": []}}, "profile_image_url_https": "https://pbs.twimg.com/profile_images/505200807503867904/osJXmYRl_normal.jpeg", "profile_sidebar_fill_color": "DDEEF6", "is_translator": false, "geo_enabled": false, "profile_text_color": "333333", "followers_count": 71643131, "protected": false, "id_str": "17919972", "default_profile_image": false, "listed_count": 122360, "lang": "en", "utc_offset": 0, "statuses_count": 4106, "profile_background_color": "C0DEED", "friends_count": 245, "profile_link_color": "0084B4", "profile_image_url": "http://pbs.twimg.com/profile_images/505200807503867904/osJXmYRl_normal.jpeg", "notifications": false, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/687293757/6d2ec27f32fa8cc2fcb7e6a9eada9945.jpeg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/17919972/1409286315", "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/687293757/6d2ec27f32fa8cc2fcb7e6a9eada9945.jpeg", "name": "Taylor Swift", "is_translation_enabled": false, "profile_background_tile": false, "favourites_count": 2109, "screen_name": "taylorswift13", "url": "https://t.co/blhi4NNEJr", "created_at": "Sat Dec 06 10:10:54 +0000 2008", "contributors_enabled": false, "location": "", "profile_sidebar_border_color": "FFFFFF", "default_profile": false, "following": false}, "geo": null, "id": 700890866920067072, "favorite_count": 146590, "lang": "en", "entities": {"symbols": [], "user_mentions": [], "hashtags": [], "urls": []}, "created_at": "Sat Feb 20 03:52:59 +0000 2016", "retweeted": false, "coordinates": null, "in_reply_to_user_id_str": null, "source": "Twitter for iPhone", "in_reply_to_status_id_str": null, "in_reply_to_screen_name": null, "id_str": "700890866920067072", "place": null, "retweet_count": 69616, "in_reply_to_user_id": null} +{ + "favorited": false, + "contributors": null, + "truncated": false, + "text": "I'm very happy to say the next single from 1989 will be 'New Romantics'.", + "is_quote_status": false, + "in_reply_to_status_id": null, + "user": { + "follow_request_sent": false, + "has_extended_profile": false, + "profile_use_background_image": false, + "time_zone": "Dublin", + "id": 17919972, + "description": "Born in 1989.", + "verified": true, + "entities": { + "url": { + "urls": [ + { + "url": "https://t.co/blhi4NNEJr", + "indices": [ + 0, + 23 + ], + "expanded_url": "http://smarturl.it/1989TourLIVE", + "display_url": "smarturl.it/1989TourLIVE" + } + ] + }, + "description": { + "urls": [] + } + }, + "profile_image_url_https": "https://pbs.twimg.com/profile_images/505200807503867904/osJXmYRl_normal.jpeg", + "profile_sidebar_fill_color": "DDEEF6", + "is_translator": false, + "geo_enabled": false, + "profile_text_color": "333333", + "followers_count": 71643131, + "protected": false, + "id_str": "17919972", + "default_profile_image": false, + "listed_count": 122360, + "lang": "en", + "utc_offset": 0, + "statuses_count": 4106, + "profile_background_color": "C0DEED", + "friends_count": 245, + "profile_link_color": "0084B4", + "profile_image_url": "http://pbs.twimg.com/profile_images/505200807503867904/osJXmYRl_normal.jpeg", + "notifications": false, + "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/687293757/6d2ec27f32fa8cc2fcb7e6a9eada9945.jpeg", + "profile_banner_url": "https://pbs.twimg.com/profile_banners/17919972/1409286315", + "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/687293757/6d2ec27f32fa8cc2fcb7e6a9eada9945.jpeg", + "name": "Taylor Swift", + "is_translation_enabled": false, + "profile_background_tile": false, + "favourites_count": 2109, + "screen_name": "taylorswift13", + "url": "https://t.co/blhi4NNEJr", + "created_at": "Sat Dec 06 10:10:54 +0000 2008", + "contributors_enabled": false, + "location": "", + "profile_sidebar_border_color": "FFFFFF", + "default_profile": false, + "following": false + }, + "geo": null, + "id": 700890866920067072, + "favorite_count": 146590, + "lang": "en", + "entities": { + "symbols": [], + "user_mentions": [], + "hashtags": [], + "urls": [] + }, + "created_at": "Sat Feb 20 03:52:59 +0000 2016", + "retweeted": false, + "coordinates": null, + "in_reply_to_user_id_str": null, + "source": "Twitter for iPhone", + "in_reply_to_status_id_str": null, + "in_reply_to_screen_name": null, + "id_str": "700890866920067072", + "place": null, + "retweet_count": 69616, + "in_reply_to_user_id": null +} \ No newline at end of file diff --git a/tests/data/locations_tweet.json b/tests/data/locations_tweet.json index c2310d9..579bffa 100644 --- a/tests/data/locations_tweet.json +++ b/tests/data/locations_tweet.json @@ -1 +1,117 @@ -{"contributors": null, "truncated": false, "text": "@Alexx__27 idk", "is_quote_status": false, "in_reply_to_status_id": 701949606007996416, "id": 701950328078348288, "favorite_count": 0, "source": "Twitter for iPhone", "retweeted": false, "coordinates": null, "timestamp_ms": "1456192974477", "entities": {"user_mentions": [{"indices": [0, 10], "screen_name": "Alexx__27", "id": 2431399232, "name": "Alex \u2668\ufe0f", "id_str": "2431399232"}], "symbols": [], "hashtags": [], "urls": []}, "in_reply_to_screen_name": "Alexx__27", "in_reply_to_user_id": 2431399232, "traptor": {"created_at_iso": "2016-02-23T02:02:54+00:00"}, "id_str": "701950328078348288", "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 562524394, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/701947518737715200/7vPEepur_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "is_translator": false, "geo_enabled": true, "profile_text_color": "333333", "followers_count": 754, "protected": false, "location": null, "default_profile_image": false, "id_str": "562524394", "utc_offset": -21600, "statuses_count": 45525, "description": null, "friends_count": 665, "profile_link_color": "B40B43", "profile_image_url": "http://pbs.twimg.com/profile_images/701947518737715200/7vPEepur_normal.jpg", "notifications": null, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/437502442461069312/yGDRa5xj.jpeg", "profile_background_color": "FF6699", "profile_banner_url": "https://pbs.twimg.com/profile_banners/562524394/1455651000", "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/437502442461069312/yGDRa5xj.jpeg", "screen_name": "jeennnyy_", "lang": "en", "profile_background_tile": true, "favourites_count": 31358, "name": "Jenny Hernandez", "url": "http://Instagram.com/jeennnnyy_", "created_at": "Wed Apr 25 02:25:20 +0000 2012", "contributors_enabled": false, "time_zone": "Central Time (US & Canada)", "profile_sidebar_border_color": "000000", "default_profile": false, "following": null, "listed_count": 3}, "geo": null, "in_reply_to_user_id_str": "2431399232", "lang": "und", "retweet_count": 0, "created_at": "Tue Feb 23 02:02:54 +0000 2016", "filter_level": "low", "in_reply_to_status_id_str": "701949606007996416", "place": {"country_code": "US", "url": "https://api.twitter.com/1.1/geo/id/45cadd6ef118ec9f.json", "country": "United States", "place_type": "city", "bounding_box": {"type": "Polygon", "coordinates": [[[-122.065206, 37.330068], [-122.065206, 37.426726], [-121.982475, 37.426726], [-121.982475, 37.330068]]]}, "full_name": "Sunnyvale, CA", "attributes": {}, "id": "45cadd6ef118ec9f", "name": "Sunnyvale"}} \ No newline at end of file +{ + "contributors": null, + "truncated": false, + "text": "@Alexx__27 idk", + "is_quote_status": false, + "in_reply_to_status_id": 701949606007996416, + "id": 701950328078348288, + "favorite_count": 0, + "source": "Twitter for iPhone", + "retweeted": false, + "coordinates": null, + "timestamp_ms": "1456192974477", + "entities": { + "user_mentions": [ + { + "indices": [ + 0, + 10 + ], + "screen_name": "Alexx__27", + "id": 2431399232, + "name": "Alex ♨️", + "id_str": "2431399232" + } + ], + "symbols": [], + "hashtags": [], + "urls": [] + }, + "in_reply_to_screen_name": "Alexx__27", + "in_reply_to_user_id": 2431399232, + "traptor": { + "created_at_iso": "2016-02-23T02:02:54+00:00" + }, + "id_str": "701950328078348288", + "favorited": false, + "user": { + "follow_request_sent": null, + "profile_use_background_image": true, + "id": 562524394, + "verified": false, + "profile_image_url_https": "https://pbs.twimg.com/profile_images/701947518737715200/7vPEepur_normal.jpg", + "profile_sidebar_fill_color": "DDEEF6", + "is_translator": false, + "geo_enabled": true, + "profile_text_color": "333333", + "followers_count": 754, + "protected": false, + "location": null, + "default_profile_image": false, + "id_str": "562524394", + "utc_offset": -21600, + "statuses_count": 45525, + "description": null, + "friends_count": 665, + "profile_link_color": "B40B43", + "profile_image_url": "http://pbs.twimg.com/profile_images/701947518737715200/7vPEepur_normal.jpg", + "notifications": null, + "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/437502442461069312/yGDRa5xj.jpeg", + "profile_background_color": "FF6699", + "profile_banner_url": "https://pbs.twimg.com/profile_banners/562524394/1455651000", + "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/437502442461069312/yGDRa5xj.jpeg", + "screen_name": "jeennnyy_", + "lang": "en", + "profile_background_tile": true, + "favourites_count": 31358, + "name": "Jenny Hernandez", + "url": "http://Instagram.com/jeennnnyy_", + "created_at": "Wed Apr 25 02:25:20 +0000 2012", + "contributors_enabled": false, + "time_zone": "Central Time (US & Canada)", + "profile_sidebar_border_color": "000000", + "default_profile": false, + "following": null, + "listed_count": 3 + }, + "geo": null, + "in_reply_to_user_id_str": "2431399232", + "lang": "und", + "retweet_count": 0, + "created_at": "Tue Feb 23 02:02:54 +0000 2016", + "filter_level": "low", + "in_reply_to_status_id_str": "701949606007996416", + "place": { + "country_code": "US", + "url": "https://api.twitter.com/1.1/geo/id/45cadd6ef118ec9f.json", + "country": "United States", + "place_type": "city", + "bounding_box": { + "type": "Polygon", + "coordinates": [ + [ + [ + -122.065206, + 37.330068 + ], + [ + -122.065206, + 37.426726 + ], + [ + -121.982475, + 37.426726 + ], + [ + -121.982475, + 37.330068 + ] + ] + ] + }, + "full_name": "Sunnyvale, CA", + "attributes": {}, + "id": "45cadd6ef118ec9f", + "name": "Sunnyvale" + } +} \ No newline at end of file diff --git a/tests/data/track_tweet.json b/tests/data/track_tweet.json index 6140974..e700c4c 100644 --- a/tests/data/track_tweet.json +++ b/tests/data/track_tweet.json @@ -1 +1,91 @@ -{"favorited": false, "contributors": null, "truncated": false, "text": "@DannyTaughtYou_ happy birthday", "is_quote_status": false, "in_reply_to_status_id": 701579336004972545, "user": {"follow_request_sent": false, "has_extended_profile": false, "profile_use_background_image": true, "time_zone": "Central Time (US & Canada)", "id": 35969755, "description": "Show No Love, Love Will Get You Hurt!!! #TeamNunns Instagram- itaint_nunn1 snapchat: Nunnwhitney FEBRUARY 20th", "verified": false, "entities": {"description": {"urls": []}}, "profile_image_url_https": "https://pbs.twimg.com/profile_images/673722421677981696/-MNnci6a_normal.jpg", "profile_sidebar_fill_color": "EFEFEF", "is_translator": false, "geo_enabled": false, "profile_text_color": "333333", "followers_count": 757, "protected": false, "id_str": "35969755", "default_profile_image": false, "listed_count": 3, "lang": "en", "utc_offset": -21600, "statuses_count": 94410, "profile_background_color": "131516", "friends_count": 714, "profile_link_color": "009999", "profile_image_url": "http://pbs.twimg.com/profile_images/673722421677981696/-MNnci6a_normal.jpg", "notifications": false, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif", "profile_banner_url": "https://pbs.twimg.com/profile_banners/35969755/1441172070", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif", "name": "February20", "is_translation_enabled": false, "profile_background_tile": true, "favourites_count": 3669, "screen_name": "ItAint_Nunn", "url": null, "created_at": "Tue Apr 28 03:11:26 +0000 2009", "contributors_enabled": false, "location": "Where the money at.!!! ", "profile_sidebar_border_color": "EEEEEE", "default_profile": false, "following": false}, "geo": null, "id": 701580888455389187, "favorite_count": 0, "lang": "en", "entities": {"symbols": [], "user_mentions": [{"indices": [0, 16], "screen_name": "DannyTaughtYou_", "id": 101886075, "name": "DANNYDAY2/21", "id_str": "101886075"}], "hashtags": [], "urls": []}, "created_at": "Mon Feb 22 01:34:53 +0000 2016", "retweeted": false, "metadata": {"iso_language_code": "en", "result_type": "recent"}, "coordinates": null, "in_reply_to_user_id_str": "101886075", "source": "Twitter for iPhone", "in_reply_to_status_id_str": "701579336004972545", "in_reply_to_screen_name": "DannyTaughtYou_", "in_reply_to_user_id": 101886075, "place": null, "retweet_count": 0, "id_str": "701580888455389187"} +{ + "favorited": false, + "contributors": null, + "truncated": false, + "text": "@DannyTaughtYou_ happy birthday", + "is_quote_status": false, + "in_reply_to_status_id": 701579336004972545, + "user": { + "follow_request_sent": false, + "has_extended_profile": false, + "profile_use_background_image": true, + "time_zone": "Central Time (US & Canada)", + "id": 35969755, + "description": "Show No Love, Love Will Get You Hurt!!! #TeamNunns Instagram- itaint_nunn1 snapchat: Nunnwhitney FEBRUARY 20th", + "verified": false, + "entities": { + "description": { + "urls": [] + } + }, + "profile_image_url_https": "https://pbs.twimg.com/profile_images/673722421677981696/-MNnci6a_normal.jpg", + "profile_sidebar_fill_color": "EFEFEF", + "is_translator": false, + "geo_enabled": false, + "profile_text_color": "333333", + "followers_count": 757, + "protected": false, + "id_str": "35969755", + "default_profile_image": false, + "listed_count": 3, + "lang": "en", + "utc_offset": -21600, + "statuses_count": 94410, + "profile_background_color": "131516", + "friends_count": 714, + "profile_link_color": "009999", + "profile_image_url": "http://pbs.twimg.com/profile_images/673722421677981696/-MNnci6a_normal.jpg", + "notifications": false, + "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif", + "profile_banner_url": "https://pbs.twimg.com/profile_banners/35969755/1441172070", + "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif", + "name": "February20", + "is_translation_enabled": false, + "profile_background_tile": true, + "favourites_count": 3669, + "screen_name": "ItAint_Nunn", + "url": null, + "created_at": "Tue Apr 28 03:11:26 +0000 2009", + "contributors_enabled": false, + "location": "Where the money at.!!! ", + "profile_sidebar_border_color": "EEEEEE", + "default_profile": false, + "following": false + }, + "geo": null, + "id": 701580888455389187, + "favorite_count": 0, + "lang": "en", + "entities": { + "symbols": [], + "user_mentions": [ + { + "indices": [ + 0, + 16 + ], + "screen_name": "DannyTaughtYou_", + "id": 101886075, + "name": "DANNYDAY2/21", + "id_str": "101886075" + } + ], + "hashtags": [], + "urls": [] + }, + "created_at": "Mon Feb 22 01:34:53 +0000 2016", + "retweeted": false, + "metadata": { + "iso_language_code": "en", + "result_type": "recent" + }, + "coordinates": null, + "in_reply_to_user_id_str": "101886075", + "source": "Twitter for iPhone", + "in_reply_to_status_id_str": "701579336004972545", + "in_reply_to_screen_name": "DannyTaughtYou_", + "in_reply_to_user_id": 101886075, + "place": null, + "retweet_count": 0, + "id_str": "701580888455389187" +} \ No newline at end of file diff --git a/tests/test_traptor_offline.py b/tests/test_traptor_offline.py index 9e76bd2..0fed48d 100644 --- a/tests/test_traptor_offline.py +++ b/tests/test_traptor_offline.py @@ -24,7 +24,7 @@ def redis_rules(request): with open('tests/data/locations_rules.json') as f: locations_rules = [json.loads(line) for line in f] - conn = StrictRedis(host='localhost', port=6379, db=5) + conn = StrictRedis(host='scdev', port=6379, db=5) conn.flushdb() rc = RulesToRedis(conn) @@ -43,7 +43,7 @@ def cleanup(): @pytest.fixture() def pubsub_conn(): """Create a connection for the Redis PubSub.""" - p_conn = StrictRedis(host='localhost', port=6379, db=5) + p_conn = StrictRedis(host='scdev', port=6379, db=5) return p_conn @@ -71,7 +71,7 @@ def traptor(request, redis_rules, pubsub_conn, heartbeat_conn, traptor_notify_ch traptor_type=request.param, apikeys=APIKEYS, traptor_id=0, - kafka_hosts='localhost:9092', + kafka_hosts='scdev:9092', kafka_topic='traptor_test', kafka_enabled=False, log_level='INFO', @@ -155,31 +155,6 @@ def test_create_kafka_producer(self, traptor): traptor._create_kafka_producer('testtopic') assert traptor.kafka_producer == None - - # def test_create_birdy_stream(self): - # pass - - # def test_make_twitter_rules(self): - # pass - - # def test_add_rule_tag_and_value_to_tweet(self): - # pass - - # def test_find_rule_matches(self): - # pass - - # def test_get_redis_rules(self): - # pass - - # def test_tweet_time_to_iso(self): - # pass - - # def test_create_traptor_obj(self): - # pass - - # def test_fix_tweet_object(self): - # pass - def test_check_redis_pubsub_for_restart(self, traptor, pubsub_conn): """Test pubsub message causes the restart_flag to be set to True.""" traptor._setup() @@ -240,14 +215,6 @@ def test_main_loop(self, traptor, tweets): if traptor.traptor_type == 'locations': assert data['traptor']['created_at_iso'] == '2016-02-23T02:02:54+00:00' - # TODO. - # Need to figure out how to map location rules back to results. - # Need to do some coordinate math on the geo bounding boxes. - - # assert enriched_data['traptor']['rule_tag'] == 'test' - # assert enriched_data['traptor']['rule_value'] == \ - # '-122.75,36.8,-121.75,37.8' - def test_ensure_heartbeat_message_is_produced(self, traptor): """Ensure Traptor can produce heartbeat messages.""" traptor._setup() diff --git a/traptor/traptor.py b/traptor/traptor.py index 7015640..a6c1ec2 100644 --- a/traptor/traptor.py +++ b/traptor/traptor.py @@ -243,23 +243,21 @@ def _find_rule_matches(self, tweet_dict): search_str = rule['value'] search_str = search_str.lower() # self.logger.debug("Search string used for the rule match: {}".format(search_str.encode('utf-8'))) - if re.search(',', search_str): - for s in search_str.split(','): - # Lower the search string - s = s.lower() - # Lowercase everything in the dict and flatten it out - new_dict = dict((k.lower(), v.lower()) for k, v in new_dict.iteritems()) - new_dict = FlatDict(new_dict) - # Add the rule to the tweet - new_dict = self._add_rule_tag_and_value_to_tweet(new_dict, - s, - rule) - else: - search_str = rule['value'].split()[0] - for i in new_dict.keys(): - new_dict = self._add_rule_tag_and_value_to_tweet(new_dict, search_str, rule) - # self.logger.debug('Rule matched - tag:{}, value:{}'.format(rule['tag'], - # rule['value'].encode('utf-8'))) + for s in search_str.split(','): + # Lower the search string + s = s.lower() + # Lowercase everything in the dict and flatten it out + for k, v in new_dict.iteritems(): + if v is not None and isinstance(v, str): + new_dict[k] = v.lower() + new_dict = FlatDict(new_dict) + # Add the rule to the tweet + new_dict = self._add_rule_tag_and_value_to_tweet(new_dict, + s, + rule) + + self.logger.debug('Rule matched - tag:{}, value:{}'.format(rule['tag'], + rule['value'].encode('utf-8'))) if 'rule_tag' not in new_dict['traptor']: self.logger.warning('Could not find rule_tag: {}, rule_value: {}, in tweet {}'.format( From 5b42b3c2ab859bd956dae6f7218ffa35da23a761 Mon Sep 17 00:00:00 2001 From: Robert Dempsey Date: Fri, 29 Apr 2016 16:22:08 -0400 Subject: [PATCH 3/8] Refactoring of geo and follow rule matching --- traptor/traptor.py | 51 +++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/traptor/traptor.py b/traptor/traptor.py index a6c1ec2..060b5e3 100644 --- a/traptor/traptor.py +++ b/traptor/traptor.py @@ -1,8 +1,8 @@ #!/usr/bin/env python import json -import re import sys import time +import re from datetime import datetime import dateutil.parser as parser @@ -215,8 +215,8 @@ def _add_rule_tag_and_value_to_tweet(self, tweet_dict, search_str, matched_rule) tweet_dict['traptor']['rule_value'] = matched_rule['value'] # Pass all key/value pairs from matched rule through to Traptor - for k, v in matched_rule.iteritems(): - tweet_dict['traptor'][k] = v + for key, value in matched_rule.iteritems(): + tweet_dict['traptor'][key] = value return tweet_dict @@ -237,35 +237,44 @@ def _find_rule_matches(self, tweet_dict): new_dict['traptor']['rule_tag'] = rule['tag'] new_dict['traptor']['rule_value'] = rule['value'] + for key, value in rule.iteritems(): + new_dict['traptor'][key] = value + return new_dict - for rule in self.redis_rules: - search_str = rule['value'] - search_str = search_str.lower() - # self.logger.debug("Search string used for the rule match: {}".format(search_str.encode('utf-8'))) - for s in search_str.split(','): - # Lower the search string - s = s.lower() - # Lowercase everything in the dict and flatten it out + if self.traptor_type == 'track': + + for rule in self.redis_rules: + # Get the rule to search for and lowercase it + search_str = rule['value'] + search_str = search_str.lower() + + self.logger.debug("Search string used for the rule match: {}".format(search_str.encode('utf-8'))) + # Lowercase everything in the dict for k, v in new_dict.iteritems(): if v is not None and isinstance(v, str): new_dict[k] = v.lower() + + # Flatten it out new_dict = FlatDict(new_dict) + # Add the rule to the tweet new_dict = self._add_rule_tag_and_value_to_tweet(new_dict, - s, + search_str, rule) - self.logger.debug('Rule matched - tag:{}, value:{}'.format(rule['tag'], - rule['value'].encode('utf-8'))) + if 'rule_tag' not in new_dict['traptor']: + self.logger.warning('Could not find rule_tag: {}, rule_value: {}, in tweet {}'.format( + rule['tag'], rule['value'].encode('utf-8'), new_dict.get('id_str'))) + new_dict['traptor']['rule_tag'] = 'Not found' + new_dict['traptor']['rule_value'] = 'Not found' - if 'rule_tag' not in new_dict['traptor']: - self.logger.warning('Could not find rule_tag: {}, rule_value: {}, in tweet {}'.format( - rule['tag'], rule['value'].encode('utf-8'), new_dict.get('id_str'))) - new_dict['traptor']['rule_tag'] = 'Not found' - new_dict['traptor']['rule_value'] = 'Not found' + return new_dict - return new_dict + # If this is a follow Traptor, chuck everything but the id_str field + if self.traptor_type == 'follow': + + pass def _get_redis_rules(self): """ Yields a traptor rule from redis. This function @@ -544,7 +553,7 @@ def main(sentry, stdout, info, debug, delay, id, type, key): if sentry: client = Client(SENTRY_SECRET) client.captureException() - logger.errror(e) + logger.error(e) if __name__ == '__main__': From ab9497fa103695020a1275a6687918ab26721152 Mon Sep 17 00:00:00 2001 From: Robert Dempsey Date: Mon, 2 May 2016 10:16:04 -0400 Subject: [PATCH 4/8] Only use the user/id field of a tweet for Follow Traptor rule checking --- traptor/traptor.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/traptor/traptor.py b/traptor/traptor.py index 060b5e3..3be5b08 100644 --- a/traptor/traptor.py +++ b/traptor/traptor.py @@ -271,10 +271,22 @@ def _find_rule_matches(self, tweet_dict): return new_dict - # If this is a follow Traptor, chuck everything but the id_str field + # If this is a follow Traptor, only check the user/id field of the tweet if self.traptor_type == 'follow': + for rule in self.redis_rules: + # Get the rule to search for + search_str = int(rule['value']) + + # Get the id field of the tweet object - that's all we need + if new_dict['user']['id'] and new_dict['user']['id'] == search_str: + new_dict['traptor']['rule_tag'] = rule['tag'] + new_dict['traptor']['rule_value'] = rule['value'] + + for key, value in rule.iteritems(): + new_dict['traptor'][key] = value + + return new_dict - pass def _get_redis_rules(self): """ Yields a traptor rule from redis. This function From 38a98f77cff975f9b3d3daffc869cdea657e3d85 Mon Sep 17 00:00:00 2001 From: Robert Dempsey Date: Mon, 2 May 2016 10:56:01 -0400 Subject: [PATCH 5/8] For follow Traptors, ensure the field we want is there MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check the dictionary for the ‘user’ key. If there’s no user key then we don’t have a tweet we can use, we have another type of message. --- tests/data/other_tweet_messages.json | 67 ++++++++++++++++++++++++++++ traptor/traptor.py | 2 +- 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 tests/data/other_tweet_messages.json diff --git a/tests/data/other_tweet_messages.json b/tests/data/other_tweet_messages.json new file mode 100644 index 0000000..88177ce --- /dev/null +++ b/tests/data/other_tweet_messages.json @@ -0,0 +1,67 @@ +{ + "delete":{ + "status":{ + "id":1234, + "id_str":"1234", + "user_id":3, + "user_id_str":"3" + } + } +}, +{ + "warning":{ + "code":"FALLING_BEHIND", + "message":"Your connection is falling behind and messages are being queued for delivery to you. Your queue is now over 60% full. You will be disconnected when the queue is full.", + "percent_full": 60 + } +}, +{ + "scrub_geo":{ + "user_id":14090452, + "user_id_str":"14090452", + "up_to_status_id":23260136625, + "up_to_status_id_str":"23260136625" + } +}, +{ + "limit":{ + "track":1234 + } +}, +{ + "status_withheld":{ + "id":1234567890, + "user_id":123456, + "withheld_in_countries":["DE", "AR"] + } +}, +{ + "user_withheld":{ + "id":123456, + "withheld_in_countries":["DE","AR"] + } +}, +{ + "disconnect":{ + "code": 4, + "stream_name":"", + "reason":"" + } +}, +{ + "warning":{ + "code":"FALLING_BEHIND", + "message":"Your connection is falling behind and messages are being queued for delivery to you. Your queue is now over 60% full. You will be disconnected when the queue is full.", + "percent_full": 60 + } +}, +{ + "created_at": "Tue Aug 06 02:23:21 +0000 2013", + "source": { + ... + }, + "target": { + ... + }, + "event": "user_update" +} diff --git a/traptor/traptor.py b/traptor/traptor.py index 3be5b08..50b4c49 100644 --- a/traptor/traptor.py +++ b/traptor/traptor.py @@ -278,7 +278,7 @@ def _find_rule_matches(self, tweet_dict): search_str = int(rule['value']) # Get the id field of the tweet object - that's all we need - if new_dict['user']['id'] and new_dict['user']['id'] == search_str: + if 'user' in new_dict and new_dict['user']['id'] == search_str: new_dict['traptor']['rule_tag'] = rule['tag'] new_dict['traptor']['rule_value'] = rule['value'] From 7fe2433778c678777236c78a7cbd300bb1894a19 Mon Sep 17 00:00:00 2001 From: Robert Dempsey Date: Mon, 2 May 2016 11:20:24 -0400 Subject: [PATCH 6/8] Change test back to localhost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Need a way to make this work more better… --- tests/test_traptor_offline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_traptor_offline.py b/tests/test_traptor_offline.py index 0fed48d..d565b20 100644 --- a/tests/test_traptor_offline.py +++ b/tests/test_traptor_offline.py @@ -24,7 +24,7 @@ def redis_rules(request): with open('tests/data/locations_rules.json') as f: locations_rules = [json.loads(line) for line in f] - conn = StrictRedis(host='scdev', port=6379, db=5) + conn = StrictRedis(host='localhost', port=6379, db=5) conn.flushdb() rc = RulesToRedis(conn) @@ -43,7 +43,7 @@ def cleanup(): @pytest.fixture() def pubsub_conn(): """Create a connection for the Redis PubSub.""" - p_conn = StrictRedis(host='scdev', port=6379, db=5) + p_conn = StrictRedis(host='localhost', port=6379, db=5) return p_conn @@ -71,7 +71,7 @@ def traptor(request, redis_rules, pubsub_conn, heartbeat_conn, traptor_notify_ch traptor_type=request.param, apikeys=APIKEYS, traptor_id=0, - kafka_hosts='scdev:9092', + kafka_hosts='localhost:9092', kafka_topic='traptor_test', kafka_enabled=False, log_level='INFO', From b6fea2ff914fa7d1d566a478293cdc064fe065c0 Mon Sep 17 00:00:00 2001 From: Robert Dempsey Date: Tue, 3 May 2016 13:28:48 -0400 Subject: [PATCH 7/8] Replace str with basestring to cover unicode cases --- traptor/traptor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traptor/traptor.py b/traptor/traptor.py index 50b4c49..3f1caae 100644 --- a/traptor/traptor.py +++ b/traptor/traptor.py @@ -252,7 +252,7 @@ def _find_rule_matches(self, tweet_dict): self.logger.debug("Search string used for the rule match: {}".format(search_str.encode('utf-8'))) # Lowercase everything in the dict for k, v in new_dict.iteritems(): - if v is not None and isinstance(v, str): + if v is not None and isinstance(v, basestring): new_dict[k] = v.lower() # Flatten it out From 466cb6be80160595264c78326e73c953c77a90ae Mon Sep 17 00:00:00 2001 From: Robert Dempsey Date: Tue, 3 May 2016 14:25:03 -0400 Subject: [PATCH 8/8] Bump the version Because I always forget --- traptor/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traptor/version.py b/traptor/version.py index c58428b..1ae594e 100644 --- a/traptor/version.py +++ b/traptor/version.py @@ -1,2 +1,2 @@ -__version__ = '1.2.0' +__version__ = '1.2.1' VERSION = tuple(int(x) for x in __version__.split('.'))