From 10618c8ba84b2e7443a32c7492dbb58264f26555 Mon Sep 17 00:00:00 2001
From: JaredFern
Date: Tue, 1 Aug 2017 05:36:41 -0500
Subject: [PATCH] Bug & Compatibility Fixes (v1.0.4)

---
 setup.py               | 14 +++++++-------
 vecshare/__init__.py   |  8 ++++----
 vecshare/indexer.py    | 19 +++++++++++++-----
 vecshare/signatures.py | 46 ++++++++++++++++++++++++-----------------
 vecshare/vecshare.py   |  2 +-
 5 files changed, 54 insertions(+), 35 deletions(-)

diff --git a/setup.py b/setup.py
index 453794f..87b841d 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@ def find_version(*paths):
     version = find_version('vecshare','__init__.py'),
     description = 'Python library for sharing word embeddings',
     long_description = read('README.md'),
-    url = 'hhttps://github.com/JaredFern/VecShare',
+    url = 'https://github.com/JaredFern/VecShare',
     author = 'JaredFern',
     author_email = 'jared.fern@u.northwestern.edu',
     license = 'Apache 2.0',
@@ -48,11 +48,11 @@ def find_version(*paths):
         'Intended Audience :: Science/Research',
         'Operating System :: OS Independent',
         'License :: OSI Approved :: Apache Software License',
-        'Programming language :: Python :: 2.7',
-        'Programming language :: Python :: 3',
-        'Programming language :: Python :: 3.4',
-        'Programming language :: Python :: 3.5',
-        'Programming language :: Python :: 3.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
         'Topic :: Software Development :: Libraries :: Python Modules',
         'Topic :: Scientific/Engineering :: Information Analysis',
     ],
@@ -65,7 +65,7 @@ def find_version(*paths):
         'datadotworld',
         'bs4',
         'selenium',
-        'progressbar',
+        'progressbar2',
     ],
     setup_requires=[
diff --git a/vecshare/__init__.py b/vecshare/__init__.py
index ca1bb57..24d1a09 100644
--- a/vecshare/__init__.py
+++ b/vecshare/__init__.py
@@ -1,15 +1,15 @@
 """
-VecShare: 
-This is a word embedding selection , query and download platform of which function is to facilitate natural language processing researcher to use word embeddings more efficiently.
-Associated with this platform is a broker-centered model. 
+VecShare:
+A platform for selecting, querying, and downloading word embeddings, built to help natural language processing researchers use shared embeddings more efficiently.
+Associated with this platform is a broker-centered model.
 """
 from __future__ import absolute_import

-__version__ = '1.0.0-beta.1'
+__version__ = '1.0.4'
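Note on the setup.py hunks: PyPI trove classifiers are exact strings, so the lowercase "Programming language :: ..." entries fixed above are not recognized as valid classifiers, and progressbar2 is the maintained, Python 3 compatible fork of progressbar. Classifier typos like this can be caught before release with the third-party trove-classifiers package; a minimal sketch, assuming that package is installed (it is not a VecShare dependency):

    # Sketch: verify classifier strings from setup.py against the
    # official list. Assumes `pip install trove-classifiers`.
    from trove_classifiers import classifiers

    project_classifiers = [
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
    ]

    for c in project_classifiers:
        # The pre-fix 'Programming language :: ...' strings fail this check.
        if c not in classifiers:
            raise SystemExit("Unknown trove classifier: %r" % c)
    print("All classifiers valid.")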
""" from __future__ import absolute_import -__version__ = '1.0.0-beta.1' +__version__ = '1.0.4' diff --git a/vecshare/indexer.py b/vecshare/indexer.py index fea27bc..d893758 100644 --- a/vecshare/indexer.py +++ b/vecshare/indexer.py @@ -8,9 +8,11 @@ import datadotworld as dw import pandas as pd import csv,os,datetime,requests,string -from StringIO import StringIO -import cPickle as pickle +try: + from StringIO import StringIO + import cPickle as pickle +except: import io, pickle INDEXER = 'jaredfern/vecshare-indexer' INDEX_FILE = 'index_file' EMB_TAG = 'vecshare' @@ -124,7 +126,11 @@ def avgrank_refresh(tolerance = 0.60,sig_cnt = 5000,stopword_cnt = 100): emb_name, set_name = row['embedding_name'], row['dataset_name'] query_url = "https://query.data.world/file_download/"+set_name+"/"+ emb_name + '.csv' payload, headers = "{}", {'authorization': 'Bearer '+ DW_API_TOKEN} - emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text) + if sys.version_info < (3,): + emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text) + else: + emb_text = io.StringIO(requests.request("GET", query_url, data=payload, headers=headers).text) + emb_df = pd.read_csv(emb_text, nrows = 1.5 *sig_cnt) wordlist = emb_df.iloc[0:2*stopword_cnt,0].values @@ -142,7 +148,7 @@ def avgrank_refresh(tolerance = 0.60,sig_cnt = 5000,stopword_cnt = 100): for emb_name, emb_sig in signatures.items(): emb_sig = emb_sig.tolist() for word in stopwords: - if word in emb_sig: emb_sig.remove(word) + if word in emb_sig: emb_sig.remove(word) emb_sig = emb_sig[:sig_cnt] print ("Generated AvgRank signature for: " + emb_name) signatures.update({emb_name:emb_sig}) diff --git a/vecshare/signatures.py b/vecshare/signatures.py index 8772a37..7541aa6 100644 --- a/vecshare/signatures.py +++ b/vecshare/signatures.py @@ -1,12 +1,14 @@ import pandas as pd import datadotworld as dw -import cPickle as pickle -import os,string,re,codecs,requests,indexer +import os,string,re,codecs,requests,indexer,io from nltk.tokenize import sent_tokenize,word_tokenize from collections import Counter from operator import itemgetter -from StringIO import StringIO +try: + from StringIO import StringIO + import cPickle as pickle +except: import io, pickle """ File containing signature similarity measures and aassociated helpers. 
diff --git a/vecshare/signatures.py b/vecshare/signatures.py
index 8772a37..7541aa6 100644
--- a/vecshare/signatures.py
+++ b/vecshare/signatures.py
@@ -1,11 +1,16 @@
 import pandas as pd
 import datadotworld as dw
-import cPickle as pickle
-import os,string,re,codecs,requests,indexer
+import os,string,re,codecs,requests,io,sys
 from nltk.tokenize import sent_tokenize,word_tokenize
 from collections import Counter
 from operator import itemgetter
-from StringIO import StringIO
+try:
+    from StringIO import StringIO
+    import cPickle as pickle
+    import indexer
+except ImportError:
+    import pickle
+    from . import indexer
 """
-File containing signature similarity measures and aassociated helpers.
+File containing signature similarity measures and associated helpers.
 Implemented similarity measures:
@@ -18,7 +23,12 @@ def avgrank(inp_dir):
     DW_API_TOKEN = os.environ['DW_AUTH_TOKEN']
     query_url = "https://query.data.world/file_download/jaredfern/vecshare-signatures/ar_sig.txt"
     payload, headers = "{}", {'authorization': 'Bearer '+ DW_API_TOKEN}
-    emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)
+    if sys.version_info < (3,):
+        emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)
+    else:
+        # py3 pickle.load needs a byte stream, so wrap the raw response bytes
+        emb_text = io.BytesIO(requests.request("GET", query_url, data=payload, headers=headers).content)

     signatures = pickle.load(emb_text)
     stopwords = signatures.pop('stopwords', None)
@@ -26,11 +36,11 @@ def avgrank(inp_dir):
     for emb_name,emb_sig in signatures.items():
         rank_dict.update({emb_name: 0})
         for ind in range(0,len(signatures[emb_name])):
-          curr_inpword = signatures[emb_name][ind]
-          if curr_inpword in test_vocab:
-              rank_dict[emb_name] += test_vocab.index(curr_inpword)/float(len(test_vocab))
-          else:
-              rank_dict[emb_name] += len(signatures[emb_name])/float(len(test_vocab))
+            curr_inpword = signatures[emb_name][ind]
+            if curr_inpword in test_vocab:
+                rank_dict[emb_name] += test_vocab.index(curr_inpword)/float(len(test_vocab))
+            else:
+                rank_dict[emb_name] += len(signatures[emb_name])/float(len(test_vocab))
     ranked_embs = sorted(rank_dict.items(),key=itemgetter(1))
     return ranked_embs[0][0]
@@ -40,15 +50,15 @@ def avgrank_corp(inp_dir,hdv_vocab, num = 5000):
     cnt, vocab = Counter(), [] # Counter for all words in the corpus
     for (root, dirs, files) in os.walk(inp_dir):
-      files = [f for f in files if not f[0] == '.']
-      for f in files:
-          filepath = os.path.join(root,f)
-          with codecs.open(filepath,'r', encoding="utf-8") as f:
-              tok_txt = word_tokenize(f.read())
-              for word in tok_txt: cnt[word] += 1
+        files = [f for f in files if not f[0] == '.']
+        for f in files:
+            filepath = os.path.join(root,f)
+            with codecs.open(filepath,'r', encoding="utf-8") as f:
+                tok_txt = word_tokenize(f.read())
+                for word in tok_txt: cnt[word] += 1
     for word in hdv_vocab:
         if word in cnt.keys(): del cnt[word]
     for word in cnt.most_common(num):
-      try: vocab.append(str(word[0]))
-      except: continue
+        try: vocab.append(str(word[0]))
+        except: continue
     return vocab
diff --git a/vecshare/vecshare.py b/vecshare/vecshare.py
index 1d16163..006e3e2 100644
--- a/vecshare/vecshare.py
+++ b/vecshare/vecshare.py
@@ -151,7 +151,7 @@ def extract(emb_name, file_dir, set_name = None, case_sensitive = False, downloa
     for s in sentences: inp_vocab.update(word_tokenize(s))
     if case_sensitive: inp_vocab = list(inp_vocab)
-    else: inp_vocab = [lower(word) for word in list(inp_vocab)]
+    else: inp_vocab = [word.lower() for word in list(inp_vocab)]
     inp_vsize = len(inp_vocab)

     print ('Embedding extraction begins.')
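For reference, the avgrank hunks above only normalize indentation; the scoring rule itself is unchanged: each word in a stored embedding signature contributes its normalized rank in the query corpus vocabulary if present, or a flat out-of-vocabulary penalty if not, and the embedding with the lowest total wins. A toy, self-contained sketch of that rule (hypothetical data, not the library's API):

    # Toy AvgRank scoring in the spirit of avgrank() above; lower = better.
    from operator import itemgetter

    def avgrank_score(signature, test_vocab):
        """Sum normalized corpus ranks of signature words; words missing
        from the corpus vocabulary pay a flat len(signature) penalty."""
        rank = {w: i for i, w in enumerate(test_vocab)}  # word -> corpus rank
        total = 0.0
        for word in signature:
            total += rank.get(word, len(signature)) / float(len(test_vocab))
        return total

    # Hypothetical signatures and a corpus vocabulary, most frequent first.
    signatures = {'emb_a': ['data', 'word', 'model'],
                  'emb_b': ['cat', 'dog', 'word']}
    test_vocab = ['word', 'model', 'data', 'text']

    ranked = sorted(((name, avgrank_score(sig, test_vocab))
                     for name, sig in signatures.items()), key=itemgetter(1))
    print(ranked[0])  # ('emb_a', 0.75): best match for this corpus

The dict-based rank lookup replaces the repeated test_vocab.index scan in the library code but computes the same quantity.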