-
Notifications
You must be signed in to change notification settings - Fork 4
/
PreProcessing.py
88 lines (64 loc) · 2.24 KB
/
PreProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 13 19:31:12 2017
@author: shalin
"""
from DataSet import DataSet
import nltk
from tqdm import tqdm
# Load the dataset once at module import time; shared by PreProcessing.run().
d = DataSet()
#d.articles -> Dictionary (key = body id, value = article body)
#d.train_stances -> List (Each element is Dict) (key = 'stances,headline,body id')
# Single shared WordNet lemmatizer instance used by PreProcessing.word_lemma.
lemma = nltk.WordNetLemmatizer()
class PreProcessing:
    """Build a lemmatized, stopword-free corpus from the DataSet article bodies.

    On construction the full pipeline runs: all article bodies are concatenated,
    lowercased, sentence/word tokenized, lemmatized, stripped of punctuation
    tokens and NLTK stopwords, dumped to ``combined_articles.txt``, and stored
    in ``self.datadict['corpus']`` as a list of token lists (one per sentence).
    """

    # Punctuation tokens stripped from each tokenized sentence.
    # (The original list carried a duplicate '"' entry; deduplicated here.)
    _PUNCT = ('*', '...', '.', '-', '"', ',', ',,')

    def __init__(self):
        self.datadict = {}
        print("Preprocessing data....")
        self.run()

    def word_lemma(self, word):
        """Return the WordNet lemma of *word* (noun lemmatization by default)."""
        return lemma.lemmatize(word)

    def tokenize_lemma(self, s):
        """Split *s* into sentences; return a list of cleaned lemma lists.

        Each element of the result corresponds to one sentence: its words are
        lemmatized, then punctuation tokens are removed via clean_list.
        """
        tokens = []
        for sent in nltk.sent_tokenize(s):
            lemmas = [self.word_lemma(word) for word in nltk.word_tokenize(sent)]
            tokens.append(self.clean_list(lemmas))
        return tokens

    def clean_list(self, a):
        """Remove ALL punctuation tokens from *a* in place and return it.

        Bug fix: the original looped ``a.remove(sw)`` which deletes only the
        FIRST occurrence of each punctuation token, leaving repeats behind.
        """
        a[:] = [tok for tok in a if tok not in self._PUNCT]
        return a

    def clean(self, s):
        """Normalize raw article text to lowercase."""
        return s.lower()

    def stop_words(self, corpus):
        """Drop NLTK stopwords from every sentence (token list) in *corpus*.

        Performance fix: the original called nltk.corpus.stopwords.words()
        once PER WORD, re-reading the stopword files each time; the set is
        now built once and gives O(1) membership tests.
        """
        stopset = set(nltk.corpus.stopwords.words())
        return [[word for word in sent if word not in stopset]
                for sent in tqdm(corpus)]

    def generatedict(self, finalcorpus):
        """Store the processed corpus under the 'corpus' key of datadict."""
        self.datadict['corpus'] = finalcorpus

    def run(self):
        """Execute the full pipeline: concatenate, clean, tokenize, persist."""
        articles = d.articles
        # ''.join avoids the quadratic string concatenation of a += loop.
        articles_body = ''.join(articles[key] for key in tqdm(articles.keys()))
        clean_article_body = self.clean(articles_body)
        lemma_tokens = self.tokenize_lemma(clean_article_body)
        finalcorpus = self.stop_words(lemma_tokens)
        with open('combined_articles.txt', 'w+', encoding='utf-8') as w:
            w.write(str(finalcorpus))
        self.generatedict(finalcorpus)
if __name__=="__main__":
p = PreProcessing()