This repository has been archived by the owner on Jul 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocess-readme.py
172 lines (146 loc) · 5.91 KB
/
process-readme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3.4
#
# @file process-readme.py
# @brief Process README and description texts (based on science concierge)
# @author Matthew Graham
#
# <!---------------------------------------------------------------------------
# Copyright (C) 2016 by the California Institute of Technology.
# This software is part of CASICS, the Comprehensive and Automated Software
# Inventory Creation System. For more information, visit http://casics.org.
# ------------------------------------------------------------------------- -->
import plac
import os
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append('/Users/mjg/Projects/casics/casics/src')
# sys.path.append('/home/mjg/code/casics/src/')
import casics
import re
import string
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer
from unidecode import unidecode
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt
from bhtsne import bh_tsne
from time import time
if __name__ == '__main__' and __package__ is None:
sys.path.append(os.path.join(os.path.dirname(__file__), "../database"))
from casicsdb import *
# Globals
# .............................................................................
# Shared text-processing helpers, created once when the module is loaded and
# used by preprocess().
w_tokenizer = WhitespaceTokenizer()
stemmer = PorterStemmer()
# Character class matching any single character of string.punctuation.
_escaped_punct = re.escape(string.punctuation)
punct_re = re.compile('[{}]'.format(_escaped_punct))
# Main body.
# .............................................................................
def main():
    """
    Embed README/description texts in two dimensions and plot them

    Pipeline: load the documents with get_text(), normalize each one with
    preprocess(), build a tf-idf matrix with tfidf_vectorizer(), reduce it to
    200 components with svd_vectorizer(), run Barnes-Hut t-SNE (bh_tsne) on
    the reduced vectors, and show a scatter plot of the resulting 2-D points.
    """
    # Imported locally: the body needs `np`, and no import visible at the top
    # of this file binds that name.
    import numpy as np

    docs = get_text()  # list of READMEs and descriptions
    # Build a real list so the documents are materialized whether map() is
    # eager or lazy in the running interpreter.
    docs_preprocess = [preprocess(doc) for doc in docs]  # stemmed strings
    tfidf_matrix = tfidf_vectorizer(docs_preprocess)  # convert to tf-idf matrix
    svd_vect = svd_vectorizer(tfidf_matrix, n_components=200, n_iter=150)  # reduce dimensions

    # Run t-distributed Stochastic Neighbor Embedding (t-SNE; Barnes-Hut implementation)
    # Timings: sklearn - 1k: 15.9195120335, 2k - 41.7645118237, 4k - 185.737361908
    #          t-sne   - 1k: 15.7083182335, 2k - 38.8270409107, 4k - 78.0439789295
    # bh_tsne yields one result per iteration step; collect them all as rows.
    embedded = np.array(list(bh_tsne(svd_vect, no_dims=2, perplexity=40,
                                     verbose=True)))
    # We can use this as input to identify clusters of projects

    # Plot t-SNE
    fig, ax = plt.subplots(figsize=(10, 10))
    plt.setp(ax, xticks=(), yticks=())
    fig.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                        wspace=0.0, hspace=0.0)
    ax.scatter(embedded[:, 0], embedded[:, 1], marker='x')
    # To write the figure to disk instead of (or before) showing it:
    # fig.savefig('tsne.pdf', format='pdf')
    plt.show()
# Helpers
# .............................................................................
def preprocess(text, stem=True):
    """
    Normalize a document into a single space-separated string of tokens

    The text is transliterated with unidecode, lower-cased, has every
    punctuation character replaced by a space, and is split on whitespace.
    When requested, each token is then stemmed with the module-level
    stemmer (a PorterStemmer).

    Parameters
    ----------
    text : input document string (README and/or description)
    stem : apply stemmer to every token if True, default True
    """
    lowered = unidecode(text).lower()
    tokens = w_tokenizer.tokenize(punct_re.sub(' ', lowered))  # punctuation -> spaces
    if stem:
        tokens = [stemmer.stem(tok) for tok in tokens]
    return ' '.join(tokens)
def tfidf_vectorizer(abstract_list, min_df=3, max_df=0.8,
                     ngram_range=(1, 2), return_model=False):
    """
    Fit a TfidfVectorizer on a collection of texts and return the tf-idf matrix

    Parameters
    ----------
    abstract_list : iterable of document strings to vectorize
    min_df, max_df, ngram_range : forwarded unchanged to TfidfVectorizer
    return_model : if True, return (matrix, fitted vectorizer) instead of
        the matrix alone
    """
    vectorizer_options = dict(
        min_df=min_df,
        max_df=max_df,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=ngram_range,
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words='english',
    )
    model = TfidfVectorizer(**vectorizer_options)
    matrix = model.fit_transform(abstract_list)
    return (matrix, model) if return_model else matrix
def svd_vectorizer(tfidf_matrix, n_components=400,
                   n_iter=150, return_model=False):
    """
    Reduce a tf-idf matrix with truncated SVD (Latent Semantic Analysis)

    Parameters
    ----------
    tfidf_matrix : matrix to reduce, e.g. the output of tfidf_vectorizer()
    n_components, n_iter : forwarded unchanged to TruncatedSVD
    return_model : if True, return (reduced vectors, fitted TruncatedSVD)
        instead of the reduced vectors alone
    """
    reducer = TruncatedSVD(n_components=n_components, n_iter=n_iter,
                           algorithm='arpack')
    reduced = reducer.fit_transform(tfidf_matrix)
    return (reduced, reducer) if return_model else reduced
def get_text(path="docs_4k.dat"):
    """
    Read the documents to analyze from a text file, one document per line

    Parameters
    ----------
    path : file to read; defaults to "docs_4k.dat" in the current working
        directory, which is what this function always read before the
        parameter was added

    Returns
    -------
    List of the file's lines in order, each still carrying its trailing
    newline (if any). An empty file yields an empty list.
    """
    # The context manager closes the file even if reading fails; the earlier
    # bare open() left the handle open.
    with open(path) as infile:
        return list(infile)
def get_text_mongo(limit = 1000):
    """
    Retrieve the descriptions and READMEs from the db

    Iterates over the `repos` collection of the 'github' database and builds
    one text per repository by joining its description and its README with a
    single space. Descriptions of 200 or more whitespace-separated words are
    ignored, as are READMEs that are empty or equal to -1. Repositories that
    contribute neither a description nor a README are skipped.

    Parameters
    ----------
    limit : maximum number of texts to return, default 1000

    Returns
    -------
    List of at most `limit` strings.
    """
    # NOTE(review): the server address and credentials are hard-coded in
    # source; they should come from configuration or the environment.
    connection = CasicsDB(server = 'hyponym.caltech.edu', port = '9988',
                          login = 'mjg', password = 'casicsfun')
    github_db = connection.open('github')
    text = []
    try:
        for repo in github_db.repos.find():
            # Stop once `limit` texts are collected. The previous
            # `count > limit` test let one extra text through.
            if len(text) >= limit:
                break
            descrip = ''
            readme = ''
            description = repo['description']
            if description and len(description.split()) < 200:
                descrip = description
            if repo['readme'] and repo['readme'] != -1:
                readme = repo['readme']
            if descrip or readme:
                text.append(descrip + " " + readme)
    finally:
        # NOTE(review): assumes CasicsDB exposes close() -- confirm. The
        # earlier code called `casicsdb.close()`, but no name `casicsdb` is
        # bound anywhere in the visible file (a star import does not bind
        # the module name).
        connection.close()
    return text
# Plac annotations for main function arguments
# .............................................................................
# Argument annotations are: (help, kind, abbrev, type, choices, metavar)
# Plac automatically adds a -h argument for help, so no need to do it here.
# NOTE(review): main() as defined in this file takes no parameters, so no
# argument named `file` or `id` exists for these entries to describe --
# confirm whether they are leftovers or main() is meant to accept them.
main.__annotations__ = {
    'file': ('file containing repository identifiers', 'option', 'f'),
    'id': ('comma-separated list of repository ids', 'option', 'i'),
}
# Entry point
# .............................................................................
def cli_main():
    """Let plac parse the command line and invoke main() with the result."""
    plac.call(main)


# Run the pipeline only when this file is executed as a script, so that
# loading it as a module (e.g. for testing the helpers) has no side effects.
if __name__ == '__main__':
    cli_main()