-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathobidroidMR.py
120 lines (73 loc) · 2.24 KB
/
obidroidMR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from mrjob.job import MRJob
# from sentClassifier import sentClassify
# from cPickle import load
from textblob import TextBlob
import re
import sys
import math
class ObidroidReview(MRJob):
    """MapReduce job that converts raw app-review records into feature vectors.

    Each input line is treated as a comma-separated record whose first three
    fields are ``review_id``, a field containing the dotted app id, and the
    review text.  The mapper emits ``(app_id, features + [review_id])`` and
    the reducer gathers every feature vector that shares an app id.
    """

    # INPUT_PROTOCOL = RawValueProtocol

    # Patterns are compiled once for the whole job rather than once per record.
    _WORD_RE = re.compile(r'\w+')
    _CAPS_RE = re.compile(r'([A-Z])+\w')
    _APP_ID_RE = re.compile(r'(\w+\.+\w+[(\.+)(\w+)]+)')

    # POS tags (as reported by ``TextBlob.tags``) counted by the "adjective"
    # feature.  NOTE(review): 'VBP' is normally a verb tag; it is kept because
    # the original implementation counted it -- confirm that this is intended.
    _ADJ_TAGS = ('JJ', 'VBP')

    @staticmethod
    def getFeatures(rev):
        """Compute the feature vector for a single review text.

        Args:
            rev: The review text, either UTF-8 encoded bytes or an already
                decoded text string.  Undecodable bytes are ignored.

        Returns:
            A list with, in order: character count, word count, unique word
            count, number of capitalised-run matches, count of tokens tagged
            with one of ``_ADJ_TAGS``, sentiment polarity (rounded to 4
            places), sentiment subjectivity (rounded to 4 places) and the
            number of exclamation marks.
        """
        # Only byte strings need decoding; calling ``.decode`` on text fails
        # on Python 3 and triggers an implicit ASCII encode on Python 2.
        if isinstance(rev, bytes):
            rev = rev.decode('utf-8', 'ignore')

        words = ObidroidReview._WORD_RE.findall(rev)

        blob = TextBlob(rev)
        adj_count = sum(
            1 for _, pos in blob.tags if pos in ObidroidReview._ADJ_TAGS
        )

        # Overall sentiment comes from TextBlob's default analyzer; earlier
        # experiments with sentClassify / NaiveBayesAnalyzer are not used.
        sentiment = blob.sentiment

        return [
            len(rev),
            len(words),
            len(set(words)),
            len(ObidroidReview._CAPS_RE.findall(rev)),
            adj_count,
            # revSentAgg (aggregate classifier score) is intentionally omitted.
            round(sentiment.polarity, 4),
            round(sentiment.subjectivity, 4),
            rev.count('!'),
        ]

    def getRecord(self, _, record):  # Mapper 1
        """Map one raw input line to ``(app_id, features + [review_id])``.

        Args:
            _: Input key supplied by mrjob; unused.
            record: One raw comma-separated input line.

        Yields:
            A single ``(app_id, features)`` pair, where ``features`` is the
            output of :meth:`getFeatures` with the review id appended.
            Records with fewer than three fields, or whose second field holds
            no recognisable app id, are reported on stderr and skipped so a
            single malformed line does not abort the whole job.
        """
        fields = record.split(',')
        if len(fields) < 3:
            sys.stderr.write(
                "MAPPER SKIPPED (fewer than 3 fields): {0!r}\n".format(record))
            return

        reviewid = fields[0]

        # ``search().group(1)`` yields the same text the original obtained via
        # ``split(...)[1]`` but lets us detect a non-matching field explicitly.
        appid_match = self._APP_ID_RE.search(fields[1])
        if appid_match is None:
            sys.stderr.write(
                "MAPPER SKIPPED (no app id found): {0!r}\n".format(record))
            return
        appid = appid_match.group(1)

        # NOTE(review): because the line is split on every comma, only the
        # review text up to its first comma reaches getFeatures -- confirm the
        # input schema before changing this.
        features = ObidroidReview.getFeatures(fields[2])
        features.append(reviewid)
        sys.stderr.write("MAPPER OUTPUT: ({0},{1})\n".format(appid, features))
        yield appid, features

    def performAction(self, appid, revfeatures):  # Reducer 1
        """Collect every feature vector emitted for one app id.

        Args:
            appid: The app id shared by all values.
            revfeatures: Iterable of feature lists produced by the mapper.

        Yields:
            ``(appid, list_of_feature_lists)``.
        """
        sys.stderr.write(
            "REDUCER INPUT: ({0},{1})\n".format(appid, revfeatures))
        yield appid, list(revfeatures)

    def steps(self):
        """Return the single map/reduce step that makes up this job."""
        try:
            # Imported locally so this class does not depend on a file-level
            # import being added.
            from mrjob.step import MRStep
        except ImportError:
            # Old mrjob releases without MRStep: keep the original call.
            return [
                self.mr(mapper=self.getRecord, reducer=self.performAction)
            ]
        return [
            MRStep(mapper=self.getRecord, reducer=self.performAction)
        ]
if __name__ == '__main__':
    # Script entry point: launch the job through mrjob's runner.
    ObidroidReview.run()