-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcount_english_word.py
84 lines (59 loc) · 2.24 KB
/
count_english_word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import logging
import pandas as pd
import numpy
from functools import wraps
from collections import Counter
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
nltk.download('words')
def clean_tweets(text_list):
    """Normalize raw tweet strings for word counting.

    Each text is lowercased, newlines and tabs are turned into single
    spaces, and every character in ``string.punctuation`` is removed.

    Args:
        text_list: Iterable of ``str`` objects. Non-string items (for
            example missing values) are not handled and will raise.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    # One translation table does both jobs in a single pass per text:
    # '\n' and '\t' become spaces, punctuation characters are deleted.
    table = str.maketrans('\n\t', '  ', string.punctuation)
    return [text.lower().translate(table) for text in text_list]
def count_english_words(text_list, english_words):
    """Count, per text, how many whitespace-separated tokens are in a vocabulary.

    Args:
        text_list: Iterable of strings; each one is split on whitespace.
        english_words: Container supporting ``in`` tests for the tokens
            (a ``set`` keeps each lookup cheap).

    Returns:
        A tuple of three lists aligned with ``text_list``:
        ``(total_word_counts, english_word_count, english_word_proportion)``.
        The proportion is ``recognized / total`` as a float, and ``0.0``
        for a text that contains no tokens at all.
    """
    total_word_counts = []
    english_word_count = []
    english_word_proportion = []
    for text in text_list:
        tokens = text.split()
        text_length = len(tokens)
        # Booleans sum as 0/1, so this is the number of recognized tokens.
        num_word_recognized = sum(token in english_words for token in tokens)
        total_word_counts.append(text_length)
        english_word_count.append(num_word_recognized)
        # An empty text (e.g. one that was only punctuation before cleaning)
        # has no tokens; report 0.0 rather than dividing by zero.
        if text_length:
            proportion = num_word_recognized / text_length
        else:
            proportion = 0.0
        english_word_proportion.append(proportion)
    return total_word_counts, english_word_count, english_word_proportion
if __name__ == '__main__':
    # Script entry point: load the two tweet exports, measure how many tokens
    # of each tweet appear in the NLTK word list, and write the result table.
    vocabulary = set(words.words())
    print('Length english words', len(vocabulary))

    # Read the CSV exports in order and stack them into a single frame.
    source_files = ('top_tweets_2019_03.csv', 'top_tweets_2020_03.csv')
    frames = [pd.read_csv(path) for path in source_files]
    all_tweets = pd.concat(frames)
    raw_texts = list(all_tweets['text'])

    print('Cleaning text...')
    normalized_texts = clean_tweets(raw_texts)

    print('Counting words...')
    lengths, recognized, ratios = count_english_words(normalized_texts, vocabulary)

    print('Saving table...')
    # NOTE: the 'english_word_count' column carries the recognized/total
    # ratio, not a count; the key is kept as-is for existing consumers.
    columns = {
        'text': raw_texts,
        'cleaned_text': normalized_texts,
        'tweet_length': lengths,
        'eng_words_recognized': recognized,
        'english_word_count': ratios,
    }
    result_table = pd.DataFrame(columns)
    result_table.to_csv('english_word_counts.csv', index=False)
    print('Done.')