Bug & Compatibility Fixes (v1.0.4)
JaredFern committed Aug 1, 2017
1 parent 3b907e3 commit 10618c8
Showing 5 changed files with 44 additions and 32 deletions.
14 changes: 7 additions & 7 deletions setup.py
@@ -30,7 +30,7 @@ def find_version(*paths):
version = find_version('vecshare','__init__.py'),
description = 'Python library for sharing word embeddings',
long_description = read('README.md'),
-url = 'hhttps://github.com/JaredFern/VecShare',
+url = 'https://github.com/JaredFern/VecShare',
author = 'JaredFern',
author_email = 'jared.fern@u.northwestern.edu',
license = 'Apache 2.0',
@@ -48,11 +48,11 @@ def find_version(*paths):
'Intended Audience :: Science/Research',
'Operating System :: OS Independent',
'License :: OSI Approved :: Apache Software License',
-'Programming language :: Python :: 2.7',
-'Programming language :: Python :: 3',
-'Programming language :: Python :: 3.4',
-'Programming language :: Python :: 3.5',
-'Programming language :: Python :: 3.6',
+'Programming Language :: Python :: 2.7',
+'Programming Language :: Python :: 3',
+'Programming Language :: Python :: 3.4',
+'Programming Language :: Python :: 3.5',
+'Programming Language :: Python :: 3.6',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Scientific/Engineering :: Information Analysis',
],
@@ -65,7 +65,7 @@ def find_version(*paths):
'datadotworld',
'bs4',
'selenium',
-'progressbar',
+'progressbar2',
],

setup_requires=[
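Note on the dependency swap above: the progressbar2 distribution still installs a module named progressbar, so existing imports keep working. A minimal sketch (not from this repo) of the package in use:

    import progressbar  # provided by the progressbar2 distribution

    bar = progressbar.ProgressBar(max_value=100)
    for i in range(100):
        bar.update(i + 1)
    bar.finish()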
6 changes: 3 additions & 3 deletions vecshare/__init__.py
@@ -1,15 +1,15 @@
"""
VecShare:
A word embedding selection, query, and download platform whose purpose is to
help natural language processing researchers use word embeddings more efficiently.
Associated with this platform is a broker-centered model.
"""

from __future__ import absolute_import


-__version__ = '1.0.0-beta.1'
+__version__ = '1.0.4'
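setup.py reads this version string via find_version('vecshare', '__init__.py'). The helper's body is outside this diff; a hypothetical sketch following the common PyPA pattern (regex-scan the file so the version lives in exactly one place):

    import codecs, os, re

    def find_version(*paths):
        # Assumed implementation; the project's actual helper may differ.
        here = os.path.abspath(os.path.dirname(__file__))
        with codecs.open(os.path.join(here, *paths), 'r') as fp:
            contents = fp.read()
        match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", contents, re.M)
        if match:
            return match.group(1)
        raise RuntimeError('Unable to find version string.')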
14 changes: 10 additions & 4 deletions vecshare/indexer.py
@@ -8,9 +8,11 @@
import datadotworld as dw
import pandas as pd
import csv,os,datetime,requests,string
-from StringIO import StringIO
-import cPickle as pickle

+import sys  # used by the sys.version_info checks added below
+try:
+    from StringIO import StringIO   # Python 2
+    import cPickle as pickle
+except ImportError:                 # Python 3
+    import io, pickle
INDEXER = 'jaredfern/vecshare-indexer'
INDEX_FILE = 'index_file'
EMB_TAG = 'vecshare'
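The try/except import above is the usual Python 2/3 shim: Python 2 resolves StringIO.StringIO and cPickle, while Python 3 raises ImportError and falls back to io and pickle. A self-contained sketch of the same pattern:

    import sys

    try:                                # Python 2
        from StringIO import StringIO
        import cPickle as pickle
    except ImportError:                 # Python 3
        import io, pickle

    def text_buffer(text):
        # In-memory file-like object on either interpreter.
        if sys.version_info < (3,):
            return StringIO(text)
        return io.StringIO(text)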
@@ -124,7 +126,11 @@ def avgrank_refresh(tolerance = 0.60,sig_cnt = 5000,stopword_cnt = 100):
emb_name, set_name = row['embedding_name'], row['dataset_name']
query_url = "https://query.data.world/file_download/"+set_name+"/"+ emb_name + '.csv'
payload, headers = "{}", {'authorization': 'Bearer '+ DW_API_TOKEN}
-emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)
+if sys.version_info < (3,):
+    emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)
+else:
+    emb_text = io.StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)

emb_df = pd.read_csv(emb_text, nrows = int(1.5 * sig_cnt))

wordlist = emb_df.iloc[0:2*stopword_cnt,0].values
@@ -142,7 +148,7 @@ def avgrank_refresh(tolerance = 0.60,sig_cnt = 5000,stopword_cnt = 100):
for emb_name, emb_sig in signatures.items():
emb_sig = emb_sig.tolist()
for word in stopwords:
if word in emb_sig: emb_sig.remove(word)
emb_sig = emb_sig[:sig_cnt]
print ("Generated AvgRank signature for: " + emb_name)
signatures.update({emb_name:emb_sig})
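The loop above builds each embedding's AvgRank signature by removing the shared stopwords and truncating to sig_cnt words. Condensed into a standalone sketch (function name assumed, not part of the library's API):

    def build_signature(vocab_by_freq, stopwords, sig_cnt=5000):
        # Keep the sig_cnt most frequent words that are not shared stopwords.
        sig = [w for w in vocab_by_freq if w not in stopwords]
        return sig[:sig_cnt]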
40 changes: 23 additions & 17 deletions vecshare/signatures.py
@@ -1,12 +1,14 @@
import pandas as pd
import datadotworld as dw
-import cPickle as pickle
-import os,string,re,codecs,requests,indexer
+import os,string,re,codecs,requests,indexer,io,sys
from nltk.tokenize import sent_tokenize,word_tokenize
from collections import Counter
from operator import itemgetter
-from StringIO import StringIO

+try:
+    from StringIO import StringIO   # Python 2
+    import cPickle as pickle
+except ImportError:                 # Python 3
+    import pickle
"""
File containing signature similarity measures and associated helpers.
Implemented similarity measures:
@@ -18,19 +20,23 @@ def avgrank(inp_dir):
DW_API_TOKEN = os.environ['DW_AUTH_TOKEN']
query_url = "https://query.data.world/file_download/jaredfern/vecshare-signatures/ar_sig.txt"
payload, headers = "{}", {'authorization': 'Bearer '+ DW_API_TOKEN}
-emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)
+if sys.version_info < (3,):
+    emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)
+else:
+    # pickle.load below needs a binary stream under Python 3
+    emb_text = io.BytesIO(requests.request("GET", query_url, data=payload, headers=headers).content)

signatures = pickle.load(emb_text)

stopwords = signatures.pop('stopwords', None)
test_vocab = avgrank_corp(inp_dir,stopwords)
for emb_name,emb_sig in signatures.items():
rank_dict.update({emb_name: 0})
for ind in range(0,len(signatures[emb_name])):
curr_inpword = signatures[emb_name][ind]
if curr_inpword in test_vocab:
rank_dict[emb_name] += test_vocab.index(curr_inpword)/float(len(test_vocab))
else:
rank_dict[emb_name] += len(signatures[emb_name])/float(len(test_vocab))

ranked_embs = sorted(rank_dict.items(),key=itemgetter(1))
return ranked_embs[0][0]
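Concretely, avgrank scores each embedding by summing, over its signature words, each word's normalized rank in the query-corpus vocabulary, with a fixed penalty of len(signature)/len(vocab) for missing words; the lowest total wins. A toy walk-through under those rules:

    test_vocab = ['the', 'cat', 'sat', 'mat']   # corpus vocab, most frequent first
    sig = ['cat', 'dog']                        # one embedding's signature

    score = 0.0
    for w in sig:
        if w in test_vocab:
            score += test_vocab.index(w) / float(len(test_vocab))  # 'cat': 1/4
        else:
            score += len(sig) / float(len(test_vocab))             # 'dog': 2/4
    print(score)  # 0.75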
@@ -40,15 +46,15 @@ def avgrank_corp(inp_dir,hdv_vocab, num = 5000):
cnt, vocab = Counter(), []
# Counter for all words in the corpus
for (root, dirs, files) in os.walk(inp_dir):
files = [f for f in files if not f[0] == '.']
for f in files:
filepath = os.path.join(root,f)
with codecs.open(filepath,'r', encoding="utf-8") as f:
tok_txt = word_tokenize(f.read())
for word in tok_txt: cnt[word] += 1
for word in hdv_vocab:
if word in cnt.keys(): del cnt[word]
for word in cnt.most_common(num):
try: vocab.append(str(word[0]))
except: continue
return vocab
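avgrank_corp walks a corpus directory, counts tokens, drops the supplied stopwords, and returns the num most frequent remaining words. Hypothetical usage, with a placeholder path and stopword list:

    stopwords = ['the', 'a', 'of']
    vocab = avgrank_corp('data/query_corpus/', stopwords, num=5000)
    print(vocab[:10])  # ten most frequent non-stopword tokens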
2 changes: 1 addition & 1 deletion vecshare/vecshare.py
@@ -151,7 +151,7 @@ def extract(emb_name, file_dir, set_name = None, case_sensitive = False, downloa
for s in sentences:
inp_vocab.update(word_tokenize(s))
if case_sensitive: inp_vocab = list(inp_vocab)
-else: inp_vocab = [lower(word) for word in list(inp_vocab)]
+else: inp_vocab = [word.lower() for word in list(inp_vocab)]
inp_vsize = len(inp_vocab)

print ('Embedding extraction begins.')
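The one-line change above fixes a genuine bug: lower is not a Python builtin, so lower(word) raises NameError, while word.lower() is the method form. A two-line illustration:

    words = ['The', 'Cat']
    # lower(words[0])                  # NameError: name 'lower' is not defined
    print([w.lower() for w in words])  # ['the', 'cat']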