-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweets.sql
136 lines (106 loc) · 3.64 KB
/
tweets.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
--create the tweets_raw table containing the records as received from Twitter
ADD JAR /usr/hadoop/hadoop/hive/apache-hive-1.2.1-bin/lib/json-serde-1.3.6-SNAPSHOT-jar-with-dependencies.jar;
CREATE EXTERNAL TABLE Mytweets_raw (
id BIGINT,
created_at STRING,
source STRING,
favorited BOOLEAN,
retweet_count INT,
retweeted_status STRUCT<
text:STRING,
user_2:STRUCT<screen_name:STRING,name:STRING>
,retweet_count:INT
>,
entities STRUCT<
urls:ARRAY<STRUCT<expanded_url:STRING>>,
user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
hashtags:ARRAY<STRUCT<text:STRING>>>,
text STRING,
user_2 STRUCT<
screen_name:STRING,
name:STRING,
friends_count:INT,
followers_count:INT,
statuses_count:INT,
verified:BOOLEAN,
utc_offset:INT,
time_zone:STRING>,
in_reply_to_screen_name STRING
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION 'hdfs://namenode/user/cc/data/tweets_raw';
-- create sentiment dictionary
CREATE EXTERNAL TABLE dictionary (
type string,
length int,
word string,
pos string,
stemmed string,
polarity string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 'hdfs://namenode/user/cc/data/dictionary';
-- loading data to the table dictionary
load data inpath 'hdfs://namenode/user/cc/data/dictionary/dictionary.tsv' INTO TABLE dictionary;
CREATE EXTERNAL TABLE time_zone_map (
time_zone string,
country string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 'hdfs://namenode/user/cc/data/time_zone_map';
-- loading data to the table time_zone_map
load data inpath 'hdfs://namenode/user/cc/data/time_zone_map/time_zone_map.tsv' INTO TABLE time_zone_map;
-- Clean up tweets
CREATE VIEW tweets_simple AS
SELECT
id,
cast ( from_unixtime( unix_timestamp(concat( '2016 ', substring(created_at,12,2)), 'yyyy MMM dd hh:mm:ss')) as timestamp) ts,
text,
user_2.time_zone
FROM Mytweets_raw;
CREATE VIEW tweets_clean AS
SELECT
id,
ts,
text,
m.country
FROM tweets_simple t LEFT OUTER JOIN time_zone_map m ON t.time_zone = m.time_zone;
-- Compute sentiment
create view l1 as select id, words from Mytweets_raw lateral view explode(sentences(lower(text))) dummy as words;
create view l2 as select id, word from l1 lateral view explode( words ) dummy as word ;
create view l3 as select
id,
l2.word,
case d.polarity
when 'negative' then -1
when 'positive' then 1
else 0 end as polarity
from l2 left outer join dictionary d on l2.word = d.word;
create table tweets_sentiment as select
id,
case
when sum( polarity ) > 0 then 'positive'
when sum( polarity ) < 0 then 'negative'
else 'neutral' end as sentiment
from l3 group by id;
-- put everything back together and re-name sentiments...
CREATE TABLE tweetsbi
AS
SELECT
t.*,
s.sentiment
FROM tweets_clean t LEFT OUTER JOIN tweets_sentiment s on t.id = s.id;
-- data with tweet counts.....
CREATE TABLE tweetsbiaggr
AS
SELECT
country,sentiment, count(sentiment) as tweet_count
FROM tweetsbi
group by country,sentiment;
-- store data for analysis......
CREATE VIEW A as select country,tweet_count as positive_response from tweetsbiaggr where sentiment='positive';
CREATE VIEW B as select country,tweet_count as negative_response from tweetsbiaggr where sentiment='negative';
CREATE VIEW C as select country,tweet_count as neutral_response from tweetsbiaggr where sentiment='neutral';
CREATE TABLE tweetcompare as select A.*,B.negative_response as negative_response,C.neutral_response as neutral_response from A join B on A.country= B.country join C on B.country=C.country;