-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_count.py
85 lines (71 loc) · 2.27 KB
/
word_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import glob
import os
import re
import string
from collections import Counter

import pandas as pd
from pdfminer.high_level import extract_pages, extract_text

import dev.attempt2 as juwon
def word_count_text(t):
    """Count how often each alphabetic word occurs in a piece of text.

    The text is lower-cased and every character that is not a regex word
    character is replaced by a single space. A word is then a run of ASCII
    letters that is immediately followed by a space; letters that run
    straight into a digit or underscore (e.g. ``abc1``) are therefore not
    counted.

    Args:
        t: The raw text to analyse.

    Returns:
        A tuple ``(t, text, df)`` where ``t`` is the normalised text,
        ``text`` is the list of matched tokens in order of appearance (each
        one still carries its trailing space), and ``df`` is a DataFrame
        indexed by word with a single integer column ``freq``, sorted by
        descending frequency.
    """
    # ~ only text + space
    word_pattern = re.compile(r'[a-zA-Z]+\s{1}')
    t = re.sub(pattern=r'[^\w]', repl=' ', string=t.lower())
    # Pad with one space for matching only: the pattern requires trailing
    # whitespace, so without the pad a word at the very end of the text
    # would be silently dropped. The returned ``t`` is left unpadded.
    text = word_pattern.findall(t + ' ')
    # Counter does the tallying in one pass instead of scanning the
    # DataFrame index for every token.
    counts = Counter(token.replace(' ', '') for token in text)
    # most_common() yields (word, count) pairs by descending count.
    ranked = counts.most_common()
    freq = pd.Series(
        [count for _, count in ranked],
        index=[word for word, _ in ranked],
        dtype='int64',
        name='freq',
    )
    df = freq.to_frame()
    return t, text, df
def word_count_pdf(fn):
    """Count word frequencies in the text extracted from ``fn``.

    Args:
        fn: Handed unchanged to ``juwon.get_full_text``; presumably the path
            of a PDF file -- confirm against ``dev.attempt2``.

    Returns:
        The ``(t, text, df)`` tuple produced by ``word_count_text`` for the
        extracted text.
    """
    return word_count_text(juwon.get_full_text(fn))
# join two dataframes
def merge_freq_two(df1, df2):
    """Add up the ``freq`` columns of two word-frequency tables.

    The tables are outer-joined on their index, so a word present in only
    one of them keeps its count (the missing side is treated as 0).

    Args:
        df1: DataFrame indexed by word with a ``freq`` column.
        df2: DataFrame indexed by word with a ``freq`` column.

    Returns:
        A new DataFrame with a single ``freq`` column holding the summed
        counts for every word found in either input.
    """
    left = df1.rename(columns={'freq': 'freq1'})
    right = df2.rename(columns={'freq': 'freq2'})
    # Words missing on one side come out of the outer join as NaN; zero them
    # before summing.
    combined = left.join(right, how='outer').fillna(0)
    combined['freq'] = combined['freq1'] + combined['freq2']
    return combined[['freq']]
# join a list of dataframes
def merge_freq(dfs):
    """Fold a sequence of word-frequency tables into one.

    Args:
        dfs: Iterable of DataFrames, each with a ``freq`` column.

    Returns:
        An empty DataFrame when ``dfs`` is empty, the first table itself
        when there is only one, otherwise the result of merging the tables
        left to right with ``merge_freq_two``.
    """
    total = None
    for frame in dfs:
        # The first table seeds the running total; later ones are merged in.
        total = frame if total is None else merge_freq_two(total, frame)
    return pd.DataFrame() if total is None else total
def _excel_sheet_name(name, taken):
    """Return a valid, unused Excel sheet name derived from ``name``.

    Excel limits sheet names to 31 characters, forbids ``[ ] : * ? / \\``
    and compares names case-insensitively. ``taken`` is the set of
    lower-cased names already used; the chosen name is added to it.
    """
    base = re.sub(r'[\[\]:*?/\\]', '_', name)[:31] or 'sheet'
    candidate = base
    attempt = 1
    while candidate.lower() in taken:
        # Make room for a numeric suffix while staying within 31 characters.
        suffix = f'~{attempt}'
        candidate = base[:31 - len(suffix)] + suffix
        attempt += 1
    taken.add(candidate.lower())
    return candidate


# main function
if __name__ == '__main__':
    # list of pdf files in the pdfs directory; os.path.join builds the
    # pattern with the platform's own separator instead of a hard-coded
    # Windows backslash
    files = glob.glob(os.path.join('pdfs', '*.pdf'))

    # per-file frequency tables, in the same order as ``files``
    dfs = []
    # to extract text and count the frequency of words
    for path in files:
        print(path)
        # Only the frequency table is needed; the raw text and token list
        # are discarded rather than kept in memory for every document.
        _, _, df_i = word_count_pdf(path)
        dfs.append(df_i)
    df_final = merge_freq(dfs)

    # to save the result as a file: one writer creates the workbook with the
    # combined sheet first, followed by one sheet per input file
    taken = set()
    with pd.ExcelWriter('text.xlsx') as writer:
        df_final.to_excel(
            writer, sheet_name=_excel_sheet_name('df_final', taken)
        )
        for path, df_i in zip(files, dfs):
            sheet = _excel_sheet_name(os.path.basename(path), taken)
            df_i.to_excel(writer, sheet_name=sheet)