-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tweets - json - hive.txt
126 lines (107 loc) · 4 KB
/
Tweets - json - hive.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
-- Register the JSON SerDe jar. It is expected to provide the
-- org.openx.data.jsonserde.JsonSerDe class named in the table definition.
ADD JAR Desktop/json-serde-1.3.8-jar-with-dependencies.jar;

-- Staging table: holds every tweet from the dump regardless of its type.
-- Only the JSON fields used later in this script are declared.
-- quoted_status and retweeted_status repeat the shape of the top-level
-- tweet (id, created_at, entities.hashtags, text, user) and are NULL
-- when the tweet is neither a quote nor a retweet.
-- The ignore.malformed.json property asks the SerDe to tolerate broken
-- records instead of failing the query (per the property name, see the
-- SerDe documentation for the exact behaviour).
-- NOTE(review): user is a reserved word in newer Hive releases and may
-- need backticks there - confirm against the target Hive version.
CREATE TABLE tweet_all (
    id BIGINT,
    created_at STRING,
    entities STRUCT<
        hashtags:ARRAY<STRUCT<text:STRING>>>,
    text STRING,
    user STRUCT<
        name:STRING,
        followers_count:INT,
        location:STRING>,
    quoted_status STRUCT<
        id:BIGINT,
        created_at:STRING,
        entities:STRUCT<
            hashtags:ARRAY<STRUCT<text:STRING>>>,
        text:STRING,
        user:STRUCT<
            name:STRING,
            followers_count:INT,
            location:STRING>>,
    retweeted_status STRUCT<
        id:BIGINT,
        created_at:STRING,
        entities:STRUCT<
            hashtags:ARRAY<STRUCT<text:STRING>>>,
        text:STRING,
        user:STRUCT<
            name:STRING,
            followers_count:INT,
            location:STRING>>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES ("ignore.malformed.json" = "true");

-- Append the raw tweet file from the local file system into the staging table.
LOAD DATA LOCAL INPATH 'Desktop/tweet_out.json' INTO TABLE tweet_all;
-- Collapse the three kinds of tweet (direct, quoted, retweeted) into one
-- flat table. This statement creates tweets_table from the top-level tweets.
-- created_at is rebuilt as a calendar date from three pieces of the raw
-- string: the year (last four characters) and the month and day
-- (characters 5 to 10). The time of day and UTC offset are ignored.
-- Assumes the Twitter layout Www Mmm dd HH:mm:ss +zzzz yyyy - TODO confirm.
-- Reading the year from the data replaces a hard-coded 2016 prefix and
-- yields the same result for tweets created in 2016.
-- Newlines inside the tweet text are replaced by spaces so that each
-- tweet stays on one output line.
CREATE TABLE tweets_table AS
SELECT
    id,
    TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(
        CONCAT(SUBSTRING(created_at, -4), ' ', SUBSTRING(created_at, 5, 6)),
        'yyyy MMM dd'))) AS created_at,
    entities.hashtags AS hashtags,
    REGEXP_REPLACE(text, '\n', ' ') AS text,
    user.name AS uname,
    user.followers_count AS followers_cnt,
    user.location AS location
FROM tweet_all;
-- Append the tweets that were quoted by the collected tweets.
-- Column order must match tweets_table:
-- id, created_at, hashtags, text, uname, followers_cnt, location.
-- The date is rebuilt from the year (last four characters) plus the
-- month and day (characters 5 to 10) of the raw created_at string.
-- Assumes the Twitter layout Www Mmm dd HH:mm:ss +zzzz yyyy - TODO confirm.
-- A quoted tweet may have been created in a different year than the tweet
-- quoting it, so the year is read from the data instead of assuming 2016.
INSERT INTO TABLE tweets_table
SELECT
    quoted_status.id,
    TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(
        CONCAT(SUBSTRING(quoted_status.created_at, -4), ' ',
               SUBSTRING(quoted_status.created_at, 5, 6)),
        'yyyy MMM dd'))) AS created_at,
    quoted_status.entities.hashtags,
    REGEXP_REPLACE(quoted_status.text, '\n', ' ') AS text,
    quoted_status.user.name,
    quoted_status.user.followers_count,
    quoted_status.user.location
FROM tweet_all
WHERE quoted_status IS NOT NULL;
-- Append the original tweets that the collected tweets are retweets of.
-- Column order must match tweets_table:
-- id, created_at, hashtags, text, uname, followers_cnt, location.
-- The date is rebuilt from the year (last four characters) plus the
-- month and day (characters 5 to 10) of the raw created_at string.
-- Assumes the Twitter layout Www Mmm dd HH:mm:ss +zzzz yyyy - TODO confirm.
-- A retweeted tweet may have been created in a different year than the
-- retweet itself, so the year is read from the data instead of assuming 2016.
INSERT INTO TABLE tweets_table
SELECT
    retweeted_status.id,
    TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(
        CONCAT(SUBSTRING(retweeted_status.created_at, -4), ' ',
               SUBSTRING(retweeted_status.created_at, 5, 6)),
        'yyyy MMM dd'))) AS created_at,
    retweeted_status.entities.hashtags,
    REGEXP_REPLACE(retweeted_status.text, '\n', ' '),
    retweeted_status.user.name,
    retweeted_status.user.followers_count,
    retweeted_status.user.location
FROM tweet_all
WHERE retweeted_status IS NOT NULL;

-- Sanity check: total number of rows after all three loads.
SELECT COUNT(*) FROM tweets_table;
-- List every hashtag with its number of occurrences, most popular first.
-- hashtags is an array of structs, hashtags.text projects it to an array
-- of strings, and the lateral view emits one row per hashtag of each tweet.
-- Hashtags are compared exactly as written, so differently cased spellings
-- are counted separately.
CREATE VIEW hashtags_view AS
SELECT
    ht AS hashtag,
    COUNT(ht) AS count
FROM tweets_table
LATERAL VIEW EXPLODE(hashtags.text) dummy AS ht
GROUP BY ht
ORDER BY count DESC;

-- Display the ranked hashtag list.
SELECT * FROM hashtags_view;
-- Find the state with the most tweets.
-- location is free text typed by the user. A state code is extracted only
-- when the value ends with a comma followed by two capital letters, as in
-- Dallas, TX. Any other value, including a missing location, gives NULL.
-- Taking the last two characters unconditionally would turn arbitrary text
-- into bogus states and let the NULL group win the ranking.
-- NOTE(review): two-letter country codes such as Paris, FR still pass this
-- check - confirm whether a list of valid US states should be applied.
CREATE VIEW location_view AS
SELECT
    id,
    CASE
        WHEN location RLIKE ', *[A-Z]{2} *$' THEN SUBSTR(TRIM(location), -2)
    END AS state
FROM tweets_table;

-- Rank the recognised states by number of tweets and keep the top one.
SELECT
    state,
    COUNT(id) AS tweets_count
FROM location_view
WHERE state IS NOT NULL
GROUP BY state
ORDER BY tweets_count DESC
LIMIT 1;
-- Select the ten users with the most followers.
-- A user can appear in many rows with different follower counts, so the
-- highest count seen for each name is used.
-- Users are grouped by display name because it is the only user
-- identifier kept in tweets_table.
SELECT
    uname,
    MAX(followers_cnt) AS count
FROM tweets_table
GROUP BY uname
ORDER BY count DESC
LIMIT 10;
-- Sentiment analysis: calculate a sentiment score for each tweet.
-- Step 1: load the AFINN dictionary, a tab-separated file with one
-- word and its integer score per line.
CREATE TABLE dictionary (
    word STRING,
    score INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

-- Append the dictionary file from the local file system.
LOAD DATA LOCAL INPATH 'Desktop/Dictionary.txt' INTO TABLE dictionary;
-- Build a word list from the tweet text in two steps.
-- Step 1: lower-case the text and split it with sentences(), which returns
-- an array of sentences where each sentence is an array of words.
-- The lateral view emits one row per sentence.
CREATE VIEW text_view AS
SELECT
    id,
    uname,
    created_at,
    words
FROM tweets_table
LATERAL VIEW EXPLODE(SENTENCES(LOWER(text))) dummy AS words;

-- Step 2: explode each sentence so that every word gets its own row.
CREATE VIEW text_view2 AS
SELECT
    id,
    uname,
    created_at,
    word
FROM text_view
LATERAL VIEW EXPLODE(words) dummy AS word;
-- Attach a sentiment score to every word of every tweet.
-- The outer join keeps words that are missing from the dictionary and
-- COALESCE gives them a score of 0.
CREATE VIEW text_view3 AS
SELECT
    t.id,
    t.uname,
    t.created_at,
    t.word,
    COALESCE(d.score, 0) AS score
FROM text_view2 t
LEFT OUTER JOIN dictionary d
    ON t.word = d.word;
-- Sum the word scores of each tweet and label the tweet as positive,
-- negative or neutral according to the sign of the total.
-- NOTE(review): tweets_table can hold the same tweet id more than once,
-- because every retweet row contributes its retweeted_status. Such
-- duplicates are merged by the GROUP BY and multiply total_score. The
-- sign, and therefore the sentiment label, is unaffected. Confirm whether
-- de-duplication is wanted.
CREATE VIEW score_view AS
SELECT
    id,
    uname,
    created_at,
    SUM(score) AS total_score,
    CASE
        WHEN SUM(score) > 0 THEN 'positive'
        WHEN SUM(score) < 0 THEN 'negative'
        ELSE 'neutral'
    END AS sentiment
FROM text_view3
GROUP BY created_at, uname, id
ORDER BY id DESC;