forked from MoritaDataLand/Natural_Language_Processing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BoW-TFIDF.py
47 lines (37 loc) · 2.15 KB
/
BoW-TFIDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# -*- coding: utf-8 -*-
"""
@channel Morita DataLand
@author Morita Tarvirdians
@email tarvirdians.morita@gmail.com
@desc Bag of Words and TF-IDF tutorial
"""
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
# Sample passage used as the input document for this tutorial. It is split
# into sentences by sent_tokenize() further down, and each cleaned sentence
# becomes one entry of `corpus`.
text = """A major drawback of statistical methods is that they require elaborate feature engineering.
Since the early 2010s, the field has thus largely abandoned statistical methods and shifted to neural networks for machine learning.
Popular techniques include the use of word embeddings to capture semantic properties of words, and an increase in end-to-end learning of a higher-level task (e.g., question answering) instead of relying on a pipeline of separate intermediate tasks (e.g., part-of-speech tagging and dependency parsing).
In some areas, this shift has entailed substantial changes in how NLP systems are designed, such that deep neural network-based approaches may be viewed as a new paradigm distinct from statistical natural language processing.
For instance, the term neural machine translation (NMT) emphasizes the fact that deep learning-based approaches to machine translation directly learn sequence-to-sequence transformations, obviating the need for intermediate steps such as word alignment and language modeling that was used in statistical machine translation (SMT).
Latest works tend to use non-technical structure of a given task to build proper neural network
"""
# Cleaning text: turn every sentence of `text` into one space-joined string of
# lower-cased, stemmed, non-stopword tokens and collect the results in `corpus`.
sentences = sent_tokenize(text)
stemmer = PorterStemmer()
# Build the stopword set once up front so each membership test is a cheap set
# lookup instead of re-reading the stopword list for every word.
stop_words = set(stopwords.words('english'))
corpus = []
for sent in sentences:
    # Replace every non-letter character (digits, punctuation, hyphens) with
    # a space so only alphabetic tokens remain.
    review = re.sub(r"[^a-zA-Z]", " ", sent)
    # Drop stand-alone single letters (such as the "e" and "g" left over from
    # "e.g."). The pattern has to be a raw string: in a regular string literal
    # "\b" is the backspace character, not a regex word boundary, and the
    # substitution would never match anything.
    review = re.sub(r"\b[a-zA-Z]\b", " ", review)
    tokens = review.lower().split()
    stems = [stemmer.stem(word) for word in tokens if word not in stop_words]
    review = " ".join(stems)
    corpus.append(review)
# Vectorization: encode the cleaned `corpus` twice, once with CountVectorizer
# (Bag of Words) and once with TfidfVectorizer (TF-IDF), keeping both the
# fitted vectorizers and their dense array outputs.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag of Words: fit on the corpus, then expand the result with toarray().
cv = CountVectorizer()
bow = cv.fit_transform(corpus)
bow = bow.toarray()

# TF-IDF: same corpus, same two steps, using the TF-IDF vectorizer.
tf = TfidfVectorizer()
tfidf = tf.fit_transform(corpus)
tfidf = tfidf.toarray()