-
Notifications
You must be signed in to change notification settings - Fork 0
/
textblob_lang_classification.py
46 lines (38 loc) · 1.27 KB
/
textblob_lang_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
Find the language of each tweet using TextBlob.
It turned out to be the best solution, but if you
don't sleep after every 10 requests, you'll get a 'Too Many Requests'
error. I slept 3 seconds but still got the error occasionally and
had to restart the process in order to get the rest.
It happened 3 times, and that's why you see the tweet_lang02 file;
the whole run took hours.
"""
# - *- coding: utf- 8 - *-
import csv
from time import sleep
from urllib.error import HTTPError

import pandas as pd
from textblob import TextBlob
# ========================= textblob ==========================
INPUT_PATH = "cleaned_data01.csv"
OUTPUT_PATH = 'tweet_lang02.csv'
# Row of the input at which this run starts; presumably the rows before it
# were classified by earlier (interrupted) runs -- adjust before restarting.
START_INDEX = 188277
# Throttling: pause PAUSE_SECONDS before every REQUESTS_PER_PAUSE-th request.
REQUESTS_PER_PAUSE = 10
PAUSE_SECONDS = 3
# Retry policy for rate-limit responses (waits grow linearly per attempt).
MAX_RETRIES = 5
RETRY_BACKOFF_SECONDS = 30


def prepare_text(text):
    """Return *text* with '#' removed and underscores turned into spaces."""
    return text.replace("#", "").replace("_", " ")


def detect_language(text):
    """Return the language code TextBlob detects for *text*.

    NOTE(review): ``TextBlob.detect_language`` is reportedly deprecated in
    newer TextBlob releases -- confirm it exists in the installed version.
    """
    return TextBlob(text).detect_language()


def detect_with_retry(text, detect, pause=sleep, max_retries=MAX_RETRIES,
                      backoff_seconds=RETRY_BACKOFF_SECONDS):
    """Call ``detect(text)``, retrying when the service answers HTTP 429.

    Args:
        text: The string handed to *detect*.
        detect: Callable taking a string and returning a language code.
        pause: Callable taking a number of seconds; ``time.sleep`` by default.
        max_retries: How many times to retry after a 429 before giving up.
        backoff_seconds: Base wait; attempt ``k`` (1-based) waits ``k`` times
            this value.

    Returns:
        Whatever *detect* returns.

    Raises:
        HTTPError: For any non-429 HTTP error, or a 429 once the retries are
            exhausted.
    """
    # NOTE(review): assumes "Too Many Requests" surfaces as
    # urllib.error.HTTPError with code 429 -- confirm against a real failure.
    for attempt in range(max_retries + 1):
        try:
            return detect(text)
        except HTTPError as err:
            if err.code != 429 or attempt == max_retries:
                raise
            pause(backoff_seconds * (attempt + 1))


def classify_tweets(texts, csv_file, start=0, detect=None, pause=sleep):
    """Detect the language of ``texts[start:]`` and write rows to *csv_file*.

    A ``text,language`` header is written only when *csv_file* is still
    empty, so resuming into an existing file does not insert a second header.
    Entries that are not strings (e.g. NaN from an empty CSV cell) are written
    as an empty row so output rows stay aligned with input rows.

    Args:
        texts: Indexable, sized collection of tweets (list or pandas Series
            with a default integer index).
        csv_file: Writable text file positioned at its end.
        start: Index of the first entry to process.
        detect: Callable mapping a string to a language code; defaults to
            :func:`detect_language`.
        pause: Callable taking a number of seconds; ``time.sleep`` by default.
    """
    if detect is None:
        detect = detect_language

    writer = csv.writer(csv_file)
    if csv_file.tell() == 0:
        writer.writerow(['text', 'language'])

    total = len(texts)
    requests_since_pause = 0
    for i in range(start, total):
        # Progress: number of rows reached so far, and rows still to go.
        print(i + 1, ' ', total - i - 1)
        tweet = texts[i]
        if not isinstance(tweet, str):
            writer.writerow(['', ''])
            continue

        requests_since_pause += 1
        if requests_since_pause == REQUESTS_PER_PAUSE:
            pause(PAUSE_SECONDS)
            requests_since_pause = 0

        language = detect_with_retry(prepare_text(tweet), detect, pause)
        # The original (uncleaned-for-detection) text is what gets stored.
        writer.writerow([tweet, language])


def main():
    """Classify the not-yet-processed tweets and append them to the output."""
    texts = pd.read_csv(INPUT_PATH)['cleaned_text']
    # newline='' is required by the csv module to avoid blank rows on Windows.
    with open(OUTPUT_PATH, 'a', encoding="utf-8-sig", newline='') as csv_file:
        classify_tweets(texts, csv_file, start=START_INDEX)


if __name__ == "__main__":
    main()
# =====================end of textblob ==============================