-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest.py
46 lines (38 loc) · 1.06 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
import os
import re

import pandas as pd
# Stage 1: read the full contents of every scraped tweet file into memory.
# `data` ends up holding one string per file found in `directory`.
data = []
directory = '/home/jahnic/Git/Hackathon/TweetScraper/Data/tweet'
for filename in os.listdir(directory):
    tweet_file = os.path.join(directory, filename)
    # Skip sub-directories and other non-regular entries: open() would raise
    # on them and abort the whole run.
    if not os.path.isfile(tweet_file):
        continue
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding.
    with open(tweet_file, 'r', encoding='utf-8') as file:
        tweet_data = file.read()
    data.append(tweet_data)
# Stage 2: locate the `full_text` field in each raw document.
# The pattern matches from the literal `full_text": "` to the end of that
# line ('.' does not match a newline). It is compiled once, outside the loop.
_full_text_re = re.compile(r'full_text": ".*')

raw_text = []
for dat in data:
    text = _full_text_re.search(dat)
    if text is None:
        # No `full_text` field in this document: report it and move on.
        print('='*50)
        print('Could not match:')
        print('='*50)
        continue
    # Keep at most the first 400 characters of the match.
    raw_text.append(text.group()[:400])
# Stage 3: reduce each raw match to just the double-quoted tweet text.
# The pattern is a double-quoted string in which a backslash escapes the next
# character, so an escaped quote (\") inside the tweet does not end the match
# early.
_quoted_value_re = re.compile(r'"(?:[^"\\]|\\.)*"')

clean_text = []
for txt in raw_text:
    # Drop the 12-character prefix `full_text": ` so that the value's opening
    # quote is the first character.
    trimmed = txt[12:]
    # Anchor at the start: the value must begin at the opening quote.
    clean = _quoted_value_re.match(trimmed)
    if clean is None:
        # No closing quote was found (for example, the text was cut off).
        print('='*50)
        print('Could not match:')
        print('='*50)
        continue
    # group() includes the surrounding double quotes.
    clean_text.append(clean.group())
# Stage 4: tabulate the extracted tweets, drop the short ones, and save to CSV.
df = pd.DataFrame({'tweets': clean_text})

# Character count of every entry in the `tweets` column.
low_character_count = df.tweets.apply(len)

# Histogram of the counts, with bin edges every 10 characters from 0 to 370.
# NOTE(review): nothing after this shows or saves the figure, so it is
# presumably only visible in an interactive session - confirm.
low_character_count.hist(bins=range(0, 380, 10))

# Keep only records longer than 30 characters (entries of exactly 30 are
# dropped as well).
df = df[low_character_count > 30]
df.to_csv("pro_trump2.csv")