215 changes: 155 additions & 60 deletions LinkingUtil.py
@@ -1,37 +1,54 @@
#!/eecs/research/asr/mingbin/python-workspace/hopeless/bin/python

"""
Created on Mon Aug 4 2016
Modified on Mon Jan 13 2017
Filename : LinkingUtil.mingbin.feature.test.2016.py
Description : prepare test data for KBP EL
Author : fwei
"""


import numpy, os, codecs, itertools, logging, math, re
import scipy.sparse
from gigaword2feature import *
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from collections import defaultdict

logger = logging.getLogger( __name__ )

# module-level state shared with the __main__ block below
offsets_of_original_list = []
document_id = []
entity_type = []
mention_type = []

def LoadED( rspecifier, language = 'eng' ):
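# LoadED walks the processed KBP file given by rspecifier and yields an example
# per sentence: the tokens plus mention boundaries, entity-class labels (mapped
# through entity2cls below) and Freebase mids when gold annotations are present.
# As a side effect it fills the module-level offsets_of_original_list and
# document_id lists used by the __main__ block; the exact yielded tuple is
# constructed further down in this function.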

entity2cls = { # KBP2015 label
'PER_NAM' : 0,
'PER_NOM' : 5,
'ORG_NAM' : 1,
'GPE_NAM' : 2,
'LOC_NAM' : 3,
'FAC_NAM' : 4,
'TTL_NAM' : 5,

# iflytek / KBP2016 label
'PER_NAME' : 0,
'ORG_NAME' : 1,
'GPE_NAME' : 2,
'LOC_NAME' : 3,
'FAC_NAME' : 4,
'PER_NOMINAL' : 5,
'ORG_NOMINAL' : 6,
'GPE_NOMINAL' : 7,
'LOC_NOMINAL' : 8,
'FAC_NOMINAL' : 9,
'ORG_NOM' : 6,
'GPE_NOM' : 7,
'LOC_NOM' : 8,
'FAC_NOM' : 9,
'TITLE_NAME' : 5,
'TITLE_NOMINAL' : 5

}

if os.path.isfile( rspecifier ):
@@ -41,6 +58,8 @@ def LoadED( rspecifier, language = 'eng' ):

# texts, tags, failures = processed.split( u'\n\n\n', 2 )
texts = processed.split( u'\n\n\n' )[0]


for text in texts.split( u'\n\n' ):
parts = text.split( u'\n' )
# assert len(parts) in [2, 3], 'sentence, offsets, labels(optional)'
@@ -53,7 +72,8 @@ def LoadED( rspecifier, language = 'eng' ):
[ offsets[1:-1].split(u',') for offsets in parts[1].split() ] )
assert len(offsets) == len(sent), rspecifier + '\n' + \
str( offsets ) + '\n' + str( sent ) + '\n%d vs %d' % (len(offsets), len(sent))



if len(parts) == 3:
for ans in parts[-1].split():
try:
@@ -62,7 +82,14 @@ def LoadED( rspecifier, language = 'eng' ):
boe.append( int(begin_idx) )
eoe.append( int(end_idx) )
mids.append( mid )
offsets_of_original_list.append("{0}-{1}".format(offsets[boe[-1]][0],
offsets[eoe[-1] - 1][1] - 1))
#print offsets_of_original_list
spelling.append( original[ offsets[boe[-1]][0] : offsets[eoe[-1] - 1][1] ] )
#print spelling
document_id.append( rspecifier.split('/')[-1] )
#print document_id
#exit(0)
except ValueError as ex1:
logger.exception( rspecifier )
logger.exception( ans )
@@ -93,7 +120,6 @@ def LoadED( rspecifier, language = 'eng' ):
yield X



def LoadEL( rspecifier, language = 'eng', window = 1 ):
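# LoadEL is a thin wrapper around LoadED that materializes the per-sentence
# examples of a file so the linking code below can iterate over them; the
# window argument presumably controls how many neighbouring sentences of
# context are kept.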
if os.path.isfile( rspecifier ):
data = list( LoadED( rspecifier, language ) )
@@ -130,11 +156,11 @@ def PositiveEL( embedding_basename,
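# PositiveEL numericizes the gold mentions with the FOFE batch_constructor and
# returns, for every positive example, its Freebase mid and surface string
# together with sparse left/right-context and bag-of-words features
# (l1, l2, r1, r2, bow) over the embedding word list.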
n_word = len( fp.read().strip().split() )
logger.debug( 'a vocabulary of %d words is used' % n_word )

numericizer = vocabulary( embedding_basename + '.wordlist',
case_sensitive = False )

bc = batch_constructor( [ rd[:4] for rd in raw_data ],
numericizer, numericizer,
window = 1024, n_label_type = 7 )
logger.debug( bc )

@@ -144,13 +170,13 @@ def PositiveEL( embedding_basename,

mention = itertools.chain.from_iterable( rd[-1] for rd in raw_data )

# for sent, boe, eoe, _, _ in raw_data:
# for b,e in zip( boe, eoe ):
# mention.append( sent[b:e] )

# feature_itr = bc.mini_batch( 1,
# shuffle_needed = False,
# overlap_rate = 0, disjoint_rate = 0,
# feature_choice = 7 )
# # assert( len(list(mid_itr)) == len(list(feature_itr)) )

@@ -160,10 +186,10 @@ def PositiveEL( embedding_basename,
# for i,f in enumerate(feature[:9]) ]

l1v, r1v, l1i, r1i, l2v, r2v, l2i, r2i, bow = \
bc.mini_batch( len(bc.positive),
shuffle_needed = False,
overlap_rate = 0,
disjoint_rate = 0,
feature_choice = 7 ).next()[:9]
l1 = csr_matrix( ( l1v, ( l1i[:,0].reshape([-1]), l1i[:,1].reshape([-1]) ) ),
shape = [len(bc.positive), n_word] ).astype( numpy.float32 )
@@ -178,72 +204,141 @@ def PositiveEL( embedding_basename,
shape = [len(bc.positive), n_word] ).astype( numpy.float32 )
return list(mid_itr), mention, l1, l2, r1, r2, bow




def LoadTfidf( tfidf_basename, col ):
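# LoadTfidf reads the pre-computed tf-idf description of the knowledge base:
# a CSR matrix with one row per Freebase mid (columns are the embedding word
# vocabulary) plus the mid <-> row-index lookup tables.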

with open( tfidf_basename + '.list' ) as fp:
idx2mid = [ mid[1:-1] for mid in fp.read().strip().split() ]
mid2idx = { m:i for i,m in enumerate( idx2mid ) }

indices = numpy.fromfile( tfidf_basename + '.indices', dtype = numpy.int32 )
data = numpy.fromfile( tfidf_basename + '.data', dtype = numpy.float32 )
indptr = numpy.fromfile( tfidf_basename + '.indptr', dtype = numpy.int32 )
assert indices.shape == data.shape

mid2tfidf = csr_matrix( (data, indices, indptr),
shape = (indptr.shape[0] - 1, col) )
del data, indices, indptr
mid2tfidf = mid2tfidf.astype( numpy.float32 )

mid2tfidf.sort_indices()
# with open( tfidf_basename + '.list' ) as fp:
# idx2mid = [ mid[1:-1] for mid in fp.read().strip().split() ]
# mid2idx = { m:i for i,m in enumerate( idx2mid ) }

return mid2tfidf, idx2mid, mid2idx


def LoadCandiDict( candi_filename ):
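# LoadCandiDict parses the tab-separated candidate file: field 0 is used as the
# lookup key (the code below queries it with "document-id:offsets" strings) and
# field 5 holds the bracket/pipe-delimited list of candidate mids, whose '.'
# separators are rewritten to '/' so they line up with the tf-idf index.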
candi_item_dict = {}
# open candidate file
with open( candi_filename, 'rb') as candifile:
for cline in candifile.readlines():
array = cline.split('\t')
# if(array[4] != ''): # remove the NIL items
if(True):
candi_item = array[5]
candi_item = re.sub('[|\[\]]', '', candi_item)
candi_item = candi_item.split(' ')

candi_item = [word.replace('.', '/') for word in candi_item] # replace '.' with '/' in mid
# candi_item_list.append(candi_item)

candi_item = map(lambda s: s.strip(), candi_item)
candi_item_dict[array[0]] = candi_item

logger.info( 'candidate loaded' )
return candi_item_dict

if __name__ == '__main__':
logging.basicConfig( format = '%(asctime)s : %(levelname)s : %(message)s',
level = logging.DEBUG )

embedding_basename = '/eecs/research/asr/mingbin/cleaner/word2vec/gigaword/gigaword128-case-insensitive'
tfidf_basename = '/eecs/research/asr/Shared/Entity_Linking_training_data_from_Freebase/result/FBeasy/mid2tfidf'
candi_filename = '/local/scratch/fwei/KBP/EL/data/out.edl2run3.candidate.txt'
output_dir = '/local/scratch/fwei/KBP/EL/result/2016'
input_dir = '/local/scratch/fwei/KBP/EL/xml_output/2016'


window_fofe = 1 # how many sentences before or behind the target sentence


with open( embedding_basename + '.word2vec', 'rb' ) as fp:
shape = numpy.fromfile( fp, dtype = numpy.int32, count = 2 )
projection = numpy.fromfile( fp, dtype = numpy.float32 ).reshape( shape )
logger.info( 'embedding loaded' )

solution, mention, l1, l2, r1, r2, bow = PositiveEL( embedding_basename,
'kbp-raw-data/eng-train-parsed' )
logger.info( 'fofe loaded' )

mid2tfidf, idx2mid, mid2idx = LoadTfidf( tfidf_basename, projection.shape[0] )
logger.info( 'tfidf loaded' )

l1p = l1.dot( projection )
l2p = l2.dot( projection )
r1p = r1.dot( projection )
r2p = r2.dot( projection )
bowp = bow.dot( projection )
mid2tfidfp = mid2tfidf.dot( projection )
logger.info( 'projection done' )
del l1, l2, r1, r2, bow   # mid2tfidf is still needed below for the candidate features

bow_coef = 0.5

# convex blend of the bag-of-words projection and the averaged left/right
# context projections, weighted by bow_coef
feature = bow_coef * bowp + (1. - bow_coef) * (l2p + r2p) / 2.
del l1p, l2p, r1p, r2p, bowp

normalized_feature = preprocessing.normalize(feature, norm = 'l2')
logger.info( 'feature computed & normalized' )
del feature

normalized_mid2tfidfp = preprocessing.normalize(mid2tfidfp, norm = 'l2')
logger.info( 'tfidf normalized' )
del mid2tfidfp

solution, mention, l1, l2, r1, r2, bow = PositiveEL( embedding_basename, input_dir, window = window_fofe )
logger.info( 'fofe loaded' )

for i,(s,m) in enumerate( zip( solution, mention ) ):
print s, m
# similarity = numpy.dot( normalized_feature[i:i + 1], normalized_mid2tfidfp.T )
# top = numpy.argsort( similarity, axis = 1, kind = 'heapsort' )
# print m, s, idx2mid[top[0,-1]]
candi_item_dict = LoadCandiDict( candi_filename )

f_test_kb = open(os.path.join( output_dir, 'no_sorted_uniq_test.kb'), 'w')
f_test_pair = open(os.path.join( output_dir, 'no_shuf_test.pair'), 'w')
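# sanity check of the mention/offset bookkeeping; note that the exit(0) below
# stops the script before any mention or candidate records are written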

print 'r2: ' + str(r2.shape[0])
print 'offset list: ' + str(len(offsets_of_original_list))
exit(0)

# test.ment & test.kb & test.pair & test.mentMap
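# test.mentMap maps each mention id back to its document / character offsets and
# surface string, test.ment holds the mention-side sparse features, test.kb the
# candidate entities' tf-idf features, and test.pair the (mention id, candidate
# mid, label) rows to be scored.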
with open(os.path.join( output_dir, 'test.ment'), 'w') as f_test_ment, \
codecs.open(os.path.join( output_dir, 'test.mentMap'), 'w', encoding='utf-8') as f_test_map:
for t, (s, m) in enumerate( zip (solution, mention) ):
# variable
numCopyPos = 0

# test.mentMap
f_test_map.write('\t'.join([str(t), u':'.join([document_id[t], offsets_of_original_list[t]]), m.replace('\n',' ')]) + '\n')

# test.ment
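# (each field is a sparse vector written as space-separated "index,value" pairs)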
strL2 = ' '.join('%s,%s' % x for x in zip(l2[t].indices, l2[t].data))
strR2 = ' '.join('%s,%s' % x for x in zip(r2[t].indices, r2[t].data))
strBow = ' '.join('%s,%s' % x for x in zip(bow[t].indices, bow[t].data / bow[t].data.shape[0]))

f_test_ment.write('\t'.join([str(t), strL2, strR2, strBow]) + '\n')


# negative --> test.kb & test.pair
query_feature = u':'.join([document_id[t], offsets_of_original_list[t]])
if query_feature in candi_item_dict:
candi_list = candi_item_dict.get(query_feature)

# candi_idx_list = [ mid2idx[c] for c in candi_list if c in mid2idx and c != s.replace('.','/')] # variant that also
# checks that the true label is not in the candidate list
candi_idx_list = [ mid2idx[c] for c in candi_list if c in mid2idx ]
numCopyPos = math.floor(len(candi_idx_list) / 2)
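# numCopyPos is only consumed by the commented-out positive-pair block below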
for candi in candi_idx_list:
# assert idx2mid[candi].replace('/','.') != s # make sure the true label is not in the candidate list
negFea = mid2tfidf[candi]
strNeg = ' '.join('%s,%s' % x for x in zip(negFea.indices, negFea.data))
f_test_kb.write('\t'.join([idx2mid[candi].replace('/','.'), strNeg]) + '\n')
f_test_pair.write('\t'.join([str(t), idx2mid[candi].replace('/','.'), '0']) + '\n')


# positive --> test.fb & test.pair
# if s.replace('.','/') in mid2idx:
# posFea = mid2tfidf[mid2idx[s.replace('.','/')]]
# strPos = ' '.join('%s,%s' % x for x in zip(posFea.indices, posFea.data))
# f_train_fb.write('\t'.join([s, strPos]) + '\n')
# for i in range(int(numCopyPos) + 1):
# f_train_pair.write('\t'.join([str(t), s, '1']) + '\n')

f_test_kb.close()
f_test_pair.close()

# remove duplicate from .kb
os.system('cat ' + os.path.join( output_dir, 'no_sorted_uniq_test.kb') + ' | sort | uniq > ' + \
os.path.join( output_dir, 'test.kb') )

# shuffle .pair
os.system('cat ' + os.path.join( output_dir, 'no_shuf_test.pair') + ' | perl -MList::Util=shuffle -e \'print shuffle(<STDIN>);\' > ' + \
os.path.join( output_dir, 'test.pair') )

# move the intermediate cache files to /tmp
os.system('mv ' + os.path.join( output_dir, 'no_shuf_test.pair ') + os.path.join( output_dir, 'no_sorted_uniq_test.kb') + ' /tmp')