-
Notifications
You must be signed in to change notification settings - Fork 1
/
Making df from tweets_json.py
74 lines (59 loc) · 1.87 KB
/
Making df from tweets_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 24 00:05:02 2020
@author: Pranav Tumkur
"""
import os.path
import json
import pandas as pd
from pandas.io.json import json_normalize
import _pickle as pickle
# Input file: newline-delimited JSON, one record per line (each line is fed
# to json.loads on its own).
filename='../SQL for data science/Capstone/US_PoliticalTweets/US_PoliticalTweets/tweets.json'
# Single location for the pickle cache.  The existence check, the read and
# the write must all use the same path; previously the read pointed at a
# different file ('tweets.pkl') than the one that was checked and written.
pickle_path = '../SQL for data science/tweets_pickle.pkl'
# Number of new records appended to the cached frame on each run.
rows_to_extract=1000


def read_tweet_frames(path, skip_rows, max_rows):
    """Parse a window of records from a newline-delimited JSON file.

    Parameters
    ----------
    path : str
        Location of the JSON-lines file.
    skip_rows : int
        Number of leading records to pass over (records already cached).
    max_rows : int
        Upper bound on the number of records to parse.

    Returns
    -------
    list of pandas.DataFrame
        One frame per parsed record, in file order.  Empty when the file has
        no records left after ``skip_rows`` or when ``max_rows`` is not
        positive.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    frames = []
    if max_rows <= 0:
        return frames
    seen = 0
    # Stream the file instead of reading it whole; the context manager also
    # guarantees the handle is closed.  JSON text is UTF-8 per RFC 8259, so
    # do not rely on the platform default encoding.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                # A blank line holds no record (json.loads would raise on it).
                continue
            seen += 1
            if seen <= skip_rows:
                continue
            # Iterating the handle also yields a final line that lacks a
            # trailing newline, so the last record is not dropped.
            frames.append(pd.json_normalize(json.loads(line)))
            if len(frames) >= max_rows:
                break
    return frames


def build_tweets_frame(json_path, cache_path, max_rows):
    """Extend the cached DataFrame with the next batch of records.

    If ``cache_path`` exists it is loaded and its row count decides how many
    records of ``json_path`` are skipped; otherwise ingestion starts at the
    first record.  The combined frame is written back to ``cache_path``.

    Parameters
    ----------
    json_path : str
        Newline-delimited JSON input file.
    cache_path : str
        Pickle file used to persist progress between runs.
    max_rows : int
        Maximum number of new records to append in this call.

    Returns
    -------
    pandas.DataFrame
        The cached rows followed by the newly parsed rows.  An empty frame is
        returned (and cached) when there is neither a cache nor any record.
    """
    cached = None
    if os.path.exists(cache_path):
        # NOTE: unpickling executes arbitrary code; only load cache files
        # produced by this script.
        cached = pd.read_pickle(cache_path)
    already_loaded = 0 if cached is None else int(cached.shape[0])

    pieces = [] if cached is None else [cached]
    pieces.extend(read_tweet_frames(json_path, already_loaded, max_rows))

    # One concat over all pieces instead of one per record avoids re-copying
    # the growing frame on every iteration.  ignore_index is deliberately not
    # set so row labels stay as earlier runs pickled them.
    combined = pd.concat(pieces) if pieces else pd.DataFrame()
    combined.to_pickle(cache_path)
    return combined


if __name__ == '__main__':
    df = build_tweets_frame(filename, pickle_path, rows_to_extract)
    print(df)
'''print(y)
clean_jsonu=(json.loads(json.dumps(lst)))
clean_jsont=(json.loads(json.dumps(lst1)))
#print(clean_json)
dfu=pd.json_normalize(clean_jsonu)
dft=pd.json_normalize(clean_jsont)
#df['created_at']=pd.to_datetime(df['created_at'])
#pd.set_option('display.max_colwidth', None)
dfu.to_csv('Users_json.csv')
dft.to_csv('Tweets_json.csv')
#print(df.head(25))'''