-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_main_contributor.py
59 lines (45 loc) · 1.95 KB
/
find_main_contributor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from __future__ import unicode_literals
import os
import codecs
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from download import DATA_DIR
class FindMainContributor(object):
    """Text classifier mapping a free-text query to a "main contributor" line.

    On construction every file under ``DATA_DIR`` whose name contains ``'@'``
    is parsed as JSON.  Its ``readme_content`` becomes a training document and
    a human-readable sentence built from its ``main_contributor`` entry
    becomes the label.  A TF-IDF vectorizer feeding a logistic regression is
    then fitted on those pairs.  Calling the instance with a query returns the
    label sentence predicted for that query.
    """

    def __init__(self, vocabulary_size=10000):
        """Load the training data from ``DATA_DIR`` and fit the model.

        Args:
            vocabulary_size: Upper bound on the number of TF-IDF features
                (passed to the vectorizer as ``max_features``).

        Raises:
            ValueError: If no file with ``'@'`` in its name is found under
                ``DATA_DIR``, so there is nothing to train on.
        """
        main_contributors, content = [], []
        for root, _, files in os.walk(DATA_DIR):
            for filename in files:
                # Only files with '@' in their name are treated as project
                # data (presumably the downloader's naming scheme -- verify
                # against download.py).
                if '@' not in filename:
                    continue
                path = os.path.join(root, filename)
                # Decode explicitly as UTF-8: without an encoding the file
                # would be read with the platform default, which corrupts or
                # rejects non-ASCII README text and contributor names.
                with codecs.open(path, encoding='utf-8') as f:
                    data = json.load(f)
                main_contributors.append(
                    self._main_contributor_string(filename, data)
                )
                content.append(data['readme_content'])

        if not content:
            # Fail with a clear message instead of the vectorizer's generic
            # complaint about an empty vocabulary.
            raise ValueError(
                'No project files (names containing "@") found under {}'
                .format(DATA_DIR)
            )

        # max_df=0.9 drops terms occurring in more than 90% of the documents.
        self._preprocessor = TfidfVectorizer(
            max_df=0.9, max_features=vocabulary_size
        )
        self._label_encoder = LabelEncoder()
        self._model = LogisticRegression(C=100)
        self._model.fit(
            X=self._preprocessor.fit_transform(content),
            y=self._label_encoder.fit_transform(main_contributors)
        )

    @staticmethod
    def _main_contributor_string(filename, data):
        """Build the label sentence for one project file.

        Args:
            filename: Name of the data file; inserted as the project name.
            data: Parsed JSON mapping with a ``main_contributor`` entry
                holding ``name`` and ``email``.

        Returns:
            The formatted sentence encoded as UTF-8.
        """
        # NOTE(review): the result is encoded, so on Python 3 the labels (and
        # therefore the value returned by __call__) are bytes, not str.  Kept
        # as-is so existing callers see the same return type.
        return 'The main contributor of related GitHub project {} is {}, try to email him at {}'.format(
            filename, data['main_contributor']['name'], data['main_contributor']['email']
        ).encode('utf-8')

    def __call__(self, request):
        """Return the label sentence predicted for the query ``request``.

        Args:
            request: Query text; vectorized with the fitted TF-IDF
                preprocessor before prediction.

        Returns:
            The first (only) decoded label for the single-query batch.
        """
        return self._label_encoder.inverse_transform(
            self._model.predict(self._preprocessor.transform([request]))
        )[0]
def prompt(find_main_contributor):
    """Run an interactive query loop on standard input.

    Each line typed after the ``>`` prompt is passed to
    ``find_main_contributor`` and the returned value is printed.  The loop
    ends quietly when input is exhausted (EOF, e.g. Ctrl-D or a finished
    pipe) or when the user interrupts the prompt (Ctrl-C), instead of
    terminating with a traceback.

    Args:
        find_main_contributor: Callable taking the query string and
            returning the value to print.
    """
    from sys import version_info

    # Python 2's ``input`` evaluates what is typed, so ``raw_input`` is used
    # there.  The conditional expression keeps ``raw_input`` from being
    # looked up on Python 3, where the name does not exist.
    read_line = input if version_info[0] > 2 else raw_input  # noqa: F821

    while True:
        try:
            request = read_line('>')
        except (EOFError, KeyboardInterrupt):
            # Finish the prompt line so the shell starts on a fresh one.
            print('')
            return
        print(find_main_contributor(request))
if __name__ == '__main__':
    # Constructing the finder reads the data directory and fits the model,
    # so the status line is printed before the constructor runs.
    print('Inititalize model...')
    find_main_contributor = FindMainContributor()

    # Hand the fitted model to the interactive query loop.
    print('Type your query:')
    prompt(find_main_contributor)