#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 16 20:59:19 2020
@author: pranjal
"""
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import metrics, model_selection, naive_bayes, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
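# One-time setup (assumed here; skip if these corpora are already installed):
# the tokenizer, stop-word list, WordNet data, and POS tagger must be
# downloaded before first use.
#   import nltk
#   nltk.download("punkt")
#   nltk.download("stopwords")
#   nltk.download("wordnet")
#   nltk.download("averaged_perceptron_tagger")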
def get_wordnet_pos(word):
    """Map the POS tag of `word` to the first-character form lemmatize() accepts."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    # Default to NOUN, which is also lemmatize()'s own default
    return tag_dict.get(tag, wordnet.NOUN)
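# Illustrative only -- the exact tag depends on NLTK's tagger model:
#   get_wordnet_pos("dogs")    -> wordnet.NOUN  ("NNS" -> "N")
#   get_wordnet_pos("quickly") -> wordnet.ADV   ("RB"  -> "R")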
def preprocess(text):
    # Strip surrounding whitespace and remove digits
    text = text.strip()
    text = re.sub(r'\d+', '', text)
    # Replace every punctuation symbol with a space
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # Collapse runs of whitespace (including newlines) into single spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    w_tokenized = word_tokenize(text)
    filtered_text = [w for w in w_tokenized if w not in stop_words]
    # Lemmatize each remaining token using its own POS tag
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(word)) for word in filtered_text]
    # Return a single string of lemmas, to be vectorized later
    return " ".join(lemmas)
def train_model(classifier, feature_vector_train, label, feature_vector_valid, valid_y):
    # Fit the classifier on the training set
    classifier.fit(feature_vector_train, label)
    # Predict labels on the validation set and report accuracy
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(valid_y, predictions)
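# The helper is classifier-agnostic, so other sklearn estimators can be
# dropped in unchanged (a sketch, assuming the same tf-idf features built
# in main() below):
#   from sklearn.svm import LinearSVC
#   accuracy = train_model(LinearSVC(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)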
def main():
    data = []
    labels = []
    path = r"/home/pranjal/Desktop/song_classification"  # Path to the data set downloaded by the web crawler
    filelist = os.listdir(path)
    for i in range(4):  # Read every file in the first 4 category directories
        category_dir = os.path.join(path, filelist[i])
        for filename in os.listdir(category_dir):
            with open(os.path.join(category_dir, filename), 'r') as file:
                text = file.read()
            data.append(preprocess(text))
            labels.append(filelist[i])
    df = pd.DataFrame({"songs": data, "labels": labels})
    # Drop songs whose text became empty after preprocessing
    df['songs'] = df['songs'].replace('', np.nan)
    df.dropna(subset=['songs'], inplace=True)
    # Shuffle before splitting so the train/test sets mix all categories,
    # then reset the indexes scrambled by the shuffle
    df = shuffle(df)
    df.reset_index(inplace=True, drop=True)
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['songs'], df['labels'])
    # Label-encode the target: fit on the training labels and reuse the same
    # mapping for the validation labels (fit_transform on both would be a bug)
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)
    # tf-idf bag-of-words features; fit the vectorizer on the training split
    # only, so no validation vocabulary leaks into the features
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    tfidf_vect.fit(train_x)
    xtrain_tfidf = tfidf_vect.transform(train_x)
    xvalid_tfidf = tfidf_vect.transform(valid_x)
    accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
    print("Accuracy for Naive Bayes classifier is:", accuracy)


if __name__ == "__main__":
    main()
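# Assumed on-disk layout (hypothetical -- the category names here are only
# examples; adjust `path` in main() to match your crawler's output):
#   song_classification/
#       rock/     song1.txt  song2.txt  ...
#       pop/      ...
#       hiphop/   ...
#       jazz/     ...
# Run with:  python3 song_classify.py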