-
Notifications
You must be signed in to change notification settings - Fork 1
/
twitter_data_ingestion.py
57 lines (46 loc) · 1.82 KB
/
twitter_data_ingestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
Created by Jairo Duarte on 22/02/2018.
"""
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import json
import twitter_config
import pykafka
import sys
import tweet_utils
import preprocessor as p
def getlocation(location, states=None):
    """Normalise a free-form user location to a state name.

    The location is converted to text once and every entry of *states* is
    tried in order. An entry matches when its ``'name'`` occurs anywhere in
    the text, or when its ``'abbreviation'`` appears as a stand-alone,
    case-sensitive word. Matching abbreviations on whole words keeps
    ``'CA'`` from matching ``'CANADA'`` or ``'IN'`` from matching
    ``'BERLIN'``.

    Args:
        location: Raw location value. It may be ``None`` or any other
            object, because it is passed through ``str()`` before matching.
        states: Optional iterable of mappings that each provide ``'name'``
            and ``'abbreviation'`` keys. Defaults to ``tweet_utils.STATES``.

    Returns:
        The ``'name'`` of the first matching entry, or *location* itself
        (unchanged, including ``None``) when nothing matches.
    """
    if states is None:
        states = tweet_utils.STATES

    text = str(location)
    # Break the text into letter-only words so abbreviations are compared
    # against whole words rather than arbitrary substrings.
    words = set(''.join(ch if ch.isalpha() else ' ' for ch in text).split())

    for state in states:
        if state['name'] in text or state['abbreviation'] in words:
            return state['name']
    return location
class TweetListener(StreamListener):
    """Stream listener that forwards cleaned tweets to a Kafka topic.

    Each incoming payload is parsed as JSON, its user location is normalised
    with ``getlocation``, its text is cleaned with ``p.clean`` and the
    resulting document is produced to the ``twitter_input`` topic.
    """

    def __init__(self):
        """Connect to the Kafka broker and create the topic producer."""
        # Run the base-class initialiser first so any state it sets up is
        # present on this listener.
        super().__init__()
        # Connect to the Kafka server and instantiate the producer for the
        # twitter_input topic.
        self.client = pykafka.KafkaClient("localhost:9092")
        topic = self.client.topics[bytes('twitter_input', 'ascii')]
        self.producer = topic.get_producer()

    def on_data(self, data):
        """Clean one raw stream payload and publish it to Kafka.

        Args:
            data: JSON text of a single stream message.

        Returns:
            Always ``True``, whether the message was published or skipped.
        """
        try:
            json_data = json.loads(data)
            user = json_data['user']
            user['location'] = getlocation(user['location'])
            json_data['text'] = p.clean(json_data['text'])
        except KeyError:
            # Payloads lacking the 'user', 'location' or 'text' fields are
            # skipped on purpose instead of interrupting the stream.
            return True

        print(json_data['text'], '>>>>', json_data['user']['location'])
        # Send the Twitter data on to the consumer.
        self.producer.produce(json.dumps(json_data).encode())
        return True

    def on_error(self, status):
        """Print the error status reported by the stream.

        Args:
            status: Status value handed over by the stream.

        Returns:
            Always ``True``.
            NOTE(review): presumably this tells tweepy to keep the stream
            running after an error; confirm against the installed version.
        """
        print(status)
        return True
# Script entry point: stream English tweets that track the word given on the
# command line and hand them to TweetListener.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: PYSPARK_PYTHON=python3 spark-submit file.py <WORD>", file=sys.stderr)
        # sys.exit is always available, unlike the exit() helper that the
        # site module only installs for interactive use.
        sys.exit(-1)
    word = sys.argv[1]

    # Connect to the Twitter API.
    auth = OAuthHandler(twitter_config.CONSUMER_KEY, twitter_config.CONSUMER_SECRET)
    auth.set_access_token(twitter_config.ACCESS_TOKEN, twitter_config.ACCESS_TOKEN_SECRET)

    listener = TweetListener()
    twitter_stream = Stream(auth, listener)
    try:
        twitter_stream.filter(languages=['en'], track=[word])
    finally:
        # Stop the producer when the stream ends or is interrupted so that
        # messages still queued for Kafka are not dropped at exit.
        listener.producer.stop()