txtScrape.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
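# txtScrape.py: for each *.txt file under a folder tree, replace every synonym with its
# headword from a tab-separated synonym dictionary, count the headword occurrences in the
# text, and write the resulting per-file count matrix to a CSV file.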
import jieba
from glob import glob
import os
import re
import pandas as pd
# f = open('stopwords.txt', 'r', encoding='utf-8')
# def loadStopwords():
#     stopwords = []
#     for line in f.readlines():
#         stopwords.append(line.strip())
#     return stopwords
def seg(origin, file_dir, wordMatrix, pdfno):
    combine_dict = {}
    first_word = []
    # Extract the folder name and the file date from the txt file's path.
    index0 = pdfno.find('\\')
    index = pdfno.rfind('\\')
    foldname = pdfno[index0 + 1:index0 + 5]
    pdfdate = pdfno[index + 1:index + 9]
    wordnum = [foldname, pdfdate]
    for line in open(file_dir, "r", encoding='utf-8-sig'):  # store the synonyms as a dictionary
        seperate_word = line.strip().split("\t")
        num = len(seperate_word)
        first_word.append(seperate_word[0])
        for i in range(0, num - 1):
            combine_dict[seperate_word[i + 1]] = seperate_word[0]
        # Replace every synonym with its headword, then count how often the headword occurs.
        for word in seperate_word:
            origin = origin.replace(word, seperate_word[0])
        n = len(re.findall(seperate_word[0], origin))
        wordnum.append(n)
    # print(wordnum)
    wordMatrix.append(wordnum)
    print(wordMatrix)
    return wordMatrix
# stopwords = loadStopwords()
#
# combine_dict = {}
# first_word = []
# for line in open(file_dir, "r", encoding='utf-8'):  # store the synonyms as a dictionary
#     seperate_word = line.strip().split("\t")
#     num = len(seperate_word)
#     first_word.append(seperate_word[0])
#     for i in range(0, num - 1):
#         combine_dict[seperate_word[i + 1]] = seperate_word[0]
# jieba.suggest_freq('马大', tune=True)  # register every word from the synonym dictionary as a single token
# jieba.suggest_freq('马利亚', tune=True)
#
# seg_list = jieba.cut(origin.replace('\n', ''))  # whether to use the HMM model
# segs = list(seg_list)
# for word in segs:
#     if word in stopwords:
#         segs.remove(word)
# final_sentence = []
# for word in segs:
#     if word in combine_dict:
#         word = combine_dict[word]
#         final_sentence.append(word)
#     else:
#         final_sentence.append(word)
# count = segs.count('马大')
# print(count)
# print(first_word[word])
# count = segs.count(first_word[word])
# print(count)
# return  # '/'.join(segs)
# origin = open("C:\\\Users\\Administrator\\Desktop\\new\\a.txt", "r").read()
# origin = open("C:\\Users\\Deep Learning\\Desktop\\new\\a.txt", "r", encoding='utf-8').read()
# jieba.suggest_freq("马大", tune=True)
# jieba.suggest_freq("大黑牛", tune=True)
# jieba.suggest_freq("能力者", tune=True)
# seg_list = jieba.cut(origin, cut_all=False)
# seg_list = jieba.cut(origin.replace('\n', ''))
# f = "/".join(seg_list)
# fn = f.split("/")
# count = fn.count('马大')
# print(count)
def main():
    dir_path = 'C:/Users/Deep Learning/Desktop/new/fold'
    file_dir = 'C:/Users/Deep Learning/Desktop/new/dic.txt'  # path to the synonym dictionary
    fold_list = glob(os.path.join(dir_path, '*'))
    wordMatrix = []
    # Walk every sub-folder and count the dictionary headwords in each txt file.
    for Pdf_fold_no in fold_list:
        pdf_list = glob(os.path.join(Pdf_fold_no, '*.txt'))
        for pdfno in pdf_list:
            origin = open(pdfno, "r", encoding='utf-8-sig').read()
            wordMatrix = seg(origin, file_dir, wordMatrix, pdfno)
    df = pd.DataFrame(wordMatrix)
    df.to_csv('C:/Users/Deep Learning/Desktop/new/testcsv.csv', encoding='GB18030')  # gb2312
# fn = segs.split("/")
# count=fn.count('马大')
# print(count)
# output_1 = open("C:\\\Users\\Administrator\\Desktop\\new\\c.txt", "w")
# output_1 = open("C:\\Users\\Deep Learning\\Desktop\\new\\c.txt", "wb+")
# output_1.write(segs.encode('utf-8'))
# output_1.close()
if __name__ == '__main__':
    main()