-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwitter-filter.py
79 lines (53 loc) · 2.16 KB
/
twitter-filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import tweepy
import ConfigParser
from collections import defaultdict
from pymongo import MongoClient, ASCENDING
from tokenizer import tokenize
# Read the Twitter API credentials from the DEFAULT section of apikey.cfg
# (relative path, so it is resolved against the current working directory).
# NOTE(review): ConfigParser.read() is documented to skip files it cannot
# open, so a missing apikey.cfg would surface as an error on the first
# config.get() below rather than here -- confirm this is acceptable.
config = ConfigParser.ConfigParser()
config.read('apikey.cfg')
consumer_key = config.get('DEFAULT', 'CONSUMER_KEY')
consumer_secret = config.get('DEFAULT', 'CONSUMER_SECRET')
access_token_key = config.get('DEFAULT', 'ACCESS_TOKEN_KEY')
access_token_secret = config.get('DEFAULT', 'ACCESS_TOKEN_SECRET')
# OAuth handler passed to tweepy.Stream in main().
auth1 = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth1.set_access_token(access_token_key, access_token_secret)
# Keywords passed as the `track` argument of the stream filter in main().
setTerms = ['mhacks']
# MongoClient() is created with no arguments, so pymongo's default
# connection settings apply. Documents go to the `tweets` collection of the
# `twitter_ngrams` database.
mongo = MongoClient()
mongo_db = mongo['twitter_ngrams']
mongo_coll = mongo_db['tweets']
# Strings that token_counter() uses to reject tokens before counting them.
bad_tokens = ['.', '$'] # TODO: Do this in regex in tokenizer
class StreamListener(tweepy.StreamListener):
    """Stream callback handler that stores every received status via ``insert_tweet``."""

    def on_status(self, tweet):
        """Store one incoming status.

        Any exception raised by ``insert_tweet`` is printed and swallowed, so
        a single bad tweet never propagates out of the callback. The method
        always returns ``None``.
        """
        try:
            insert_tweet(tweet)
        except Exception as err:
            # Best-effort: report the failure and keep going.
            print(err)

    def on_error(self, status_code):
        """Print the reported status code and return ``False``.

        NOTE(review): returning ``False`` presumably tells tweepy to stop the
        stream -- confirm against the tweepy version in use.
        """
        message = 'Error: ' + repr(status_code)
        print(message)
        return False
def token_counter(def_dict, tweet):
    """Add one tweet's tokens to a running count and build the hourly document.

    Args:
        def_dict: Mapping of token -> count that supports ``+=`` on missing
            keys (the caller passes a ``defaultdict(int)``). Updated in place.
        tweet: Status object exposing ``text`` and ``created_at``.

    Returns:
        A dict whose ``_id`` is ``tweet.created_at`` truncated to the hour and
        whose ``text`` is a plain-dict copy of the updated counts.
    """
    # Non-ASCII characters are dropped once, before tokenizing.
    ascii_text = tweet.text.encode('ascii', 'ignore')

    for raw_token in tokenize(ascii_text):
        token = raw_token.strip().lower()

        if len(token) < 2:
            continue
        # Tokens end up as field names of the stored document, so a token
        # that merely *contains* a forbidden character (e.g. "u.s.") must be
        # rejected too. Comparing the whole token against bad_tokens only
        # matched single-character tokens, which the length check above
        # already discards.
        if any(bad in token for bad in bad_tokens):
            continue
        # Skip URLs, replies/mentions and hashtags.
        if token.startswith(('http', '@', '#')):
            continue

        def_dict[token] += 1

    hour_bucket = tweet.created_at.replace(minute=0, second=0, microsecond=0)
    # Copy into a plain dict so the stored document is not a defaultdict.
    return {'_id': hour_bucket, 'text': dict(def_dict)}
def insert_tweet(tweet):
    """Merge one tweet's token counts into the document for its hour.

    Looks up the document whose ``_id`` is ``tweet.created_at`` truncated to
    the hour. If found, its ``text`` counts seed the running total; otherwise
    counting starts from empty. The document produced by ``token_counter`` is
    then written with ``mongo_coll.save``.

    Returns:
        Whatever ``mongo_coll.save`` returns.
    """
    hour_bucket = tweet.created_at.replace(minute=0, second=0, microsecond=0)
    stored = mongo_coll.find_one({'_id': hour_bucket})

    counts = (
        defaultdict(int)
        if stored is None
        else defaultdict(int, stored['text'])
    )
    return mongo_coll.save(token_counter(counts, tweet))
def main():
    """Open a tweepy stream with the module's credentials and filter on ``setTerms``."""
    listener = StreamListener()
    stream = tweepy.Stream(auth=auth1, listener=listener)
    stream.filter(track=setTerms)


# Start streaming only when executed as a script, not when imported.
if __name__ == "__main__":
    main()