Skip to content

Commit 8458ef5

Browse files
author
cnt7-naya-cdh6
committed
Presented on final lesson
Signed-off-by: cnt7-naya-cdh6 <naya@cnt7-naya-cdh6.org>
1 parent 731b095 commit 8458ef5

6 files changed

+428
-522
lines changed

Dashboard.ipynb

+19 -17
Original file line number | Diff line number | Diff line change
@@ -25,14 +25,15 @@
2525
"metadata": {},
2626
"outputs": [],
2727
"source": [
28-
"hdfs_host = 'localhost'\n",
29-
"hdfs_port = 9870\n",
30-
"hive_port = 10000\n",
31-
"hive_username = 'hdfs'\n",
32-
"hive_password = 'naya'\n",
33-
"hive_database = 'twitter'\n",
34-
"hive_mode = 'CUSTOM'\n",
35-
"\n",
28+
"from common_vars import *"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": null,
34+
"metadata": {},
35+
"outputs": [],
36+
"source": [
3637
"hive_cnx = hive.Connection(\n",
3738
" host = hdfs_host, \n",
3839
" port = hive_port, \n",
@@ -156,7 +157,7 @@
156157
"metadata": {},
157158
"outputs": [],
158159
"source": [
159-
"q = .95\n",
160+
"q = .7\n",
160161
"qS = abs(t.Sentiment).quantile(q)\n",
161162
"qS"
162163
]
@@ -169,7 +170,7 @@
169170
"source": [
170171
"plt.figure(figsize=[13,9])\n",
171172
"plt.hist(t.Sentiment[t.Sentiment!=0], bins=200)\n",
172-
"plt.vlines([-qS,qS],ymin= 0, ymax = 600,colors='red')"
173+
"plt.vlines([-qS,qS],ymin= 0, ymax = 10,colors='red')"
173174
]
174175
},
175176
{
@@ -190,7 +191,7 @@
190191
"source": [
191192
"plt.figure(figsize=[9,9])\n",
192193
"plt.hist(t.user_followers[t.Sentiment!=0], bins=200)\n",
193-
"plt.vlines([quf],ymin= 0, ymax = 350,colors='red')"
194+
"plt.vlines([quf],ymin= 0, ymax = 5,colors='red')"
194195
]
195196
},
196197
{
@@ -199,7 +200,8 @@
199200
"metadata": {},
200201
"outputs": [],
201202
"source": [
202-
"tt = (t.user_followers>quf) & (t.user_followers<5) & ((t.Sentiment) < -qS)\n"
203+
"#tt = (t.user_followers>quf) & (t.user_followers<5) & ((t.Sentiment) < -qS)\n",
204+
"tt = ((t.Sentiment) < -qS)\n"
203205
]
204206
},
205207
{
@@ -221,7 +223,7 @@
221223
"badboys = tt.sort_values(by=['Sentiment','user_followers'], ascending = [True, False] )\n",
222224
"cols = \"text created_at user_followers Sentiment\".split(\" \")\n",
223225
"\n",
224-
"print(\"\\n-------\\n\".join(badboys['text']))"
226+
"print(\"\\n-----------------------------\\n\".join(badboys['text']))"
225227
]
226228
},
227229
{
@@ -254,8 +256,8 @@
254256
},
255257
"outputs": [],
256258
"source": [
257-
"print(badboys.text[i])\n",
258-
"i+=1"
259+
"# print(badboys.text[i])\n",
260+
"# i+=1"
259261
]
260262
},
261263
{
@@ -264,8 +266,8 @@
264266
"metadata": {},
265267
"outputs": [],
266268
"source": [
267-
"a,b = min(t['created_at']), max(t['created_at'])\n",
268-
"a,b"
269+
"# a,b = min(t['created_at']), max(t['created_at'])\n",
270+
"# a,b"
269271
]
270272
},
271273
{

HDFS_DB_Init.ipynb

-106
Original file line number | Diff line number | Diff line change
@@ -332,112 +332,6 @@
332332
"source": [
333333
"run_queries(q1)"
334334
]
335-
},
336-
{
337-
"cell_type": "code",
338-
"execution_count": null,
339-
"metadata": {},
340-
"outputs": [],
341-
"source": []
342-
},
343-
{
344-
"cell_type": "code",
345-
"execution_count": null,
346-
"metadata": {},
347-
"outputs": [],
348-
"source": []
349-
},
350-
{
351-
"cell_type": "code",
352-
"execution_count": null,
353-
"metadata": {},
354-
"outputs": [],
355-
"source": []
356-
},
357-
{
358-
"cell_type": "code",
359-
"execution_count": null,
360-
"metadata": {},
361-
"outputs": [],
362-
"source": []
363-
},
364-
{
365-
"cell_type": "code",
366-
"execution_count": null,
367-
"metadata": {},
368-
"outputs": [],
369-
"source": []
370-
},
371-
{
372-
"cell_type": "code",
373-
"execution_count": null,
374-
"metadata": {},
375-
"outputs": [],
376-
"source": [
377-
"from pyspark.sql.types import StructType, StringType, MapType, StructField,\\\n",
378-
" BooleanType, DateType, NumericType, IntegerType,\\\n",
379-
" LongType, TimestampType, FloatType, ArrayType\n",
380-
"import pyarrow as pa\n",
381-
"\n",
382-
"\n",
383-
"topic = 'TweeterArchive'\n",
384-
"partitionCol = \"created_ym\"\n",
385-
"Keywords = 'Israel'\n",
386-
"\n",
387-
"\n",
388-
"hdfs_host = 'localhost'\n",
389-
"hdfs_port = 9870\n",
390-
"hive_port = 10000\n",
391-
"hive_username = 'hdfs'\n",
392-
"hive_password = 'naya'\n",
393-
"hive_database = 'twitter'\n",
394-
"hive_mode = 'CUSTOM'\n",
395-
"\n",
396-
"\n",
397-
"fs = pa.hdfs.connect(\n",
398-
" host=hdfs_host, \n",
399-
" port=8020, \n",
400-
" user=hive_username, \n",
401-
" kerb_ticket=None, \n",
402-
" driver='libhdfs', \n",
403-
" extra_conf=None)\n",
404-
"\n",
405-
"\n",
406-
"event_fields = [ 'id', 'text','created_at', 'geo', 'coordinates', 'place',\n",
407-
" 'quote_count', 'reply_count', 'retweet_count', 'favorite_count' ]\n",
408-
"\n",
409-
"tweet_keys = event_fields + ['user_id', 'user_followers' ]\n",
410-
"\n",
411-
"tweet_types = [LongType, StringType, TimestampType, StringType, StringType, StringType, \n",
412-
" IntegerType, IntegerType, IntegerType, IntegerType, LongType, IntegerType]\n",
413-
"\n",
414-
"# tweet_types = [StringType]* 11 \n",
415-
"\n",
416-
"\n",
417-
"user_fields = ['id', 'name', 'screen_name','created_at', 'location', 'url',\n",
418-
" 'protected', 'verified', 'followers_count', 'friends_count',\n",
419-
" 'listed_count', 'favourites_count', 'statuses_count', 'withheld_in_countries']\n",
420-
"\n",
421-
"user_keys = user_fields + [partitionCol]\n",
422-
"\n",
423-
"user_types = [LongType, StringType, StringType, TimestampType, StringType, StringType, \n",
424-
" BooleanType, BooleanType, IntegerType, IntegerType, \n",
425-
" IntegerType, IntegerType, IntegerType, StringType, StringType]\n",
426-
"\n",
427-
"# user_types = [StringType]* 14\n",
428-
"\n",
429-
"\n",
430-
"\n"
431-
]
432-
},
433-
{
434-
"cell_type": "code",
435-
"execution_count": null,
436-
"metadata": {},
437-
"outputs": [],
438-
"source": [
439-
"conf.py"
440-
]
441335
}
442336
],
443337
"metadata": {

0 commit comments

Comments
 (0)