diff --git a/LinkingUtil.py b/LinkingUtil.py
index 50e2ec88..8e925e2e 100755
--- a/LinkingUtil.py
+++ b/LinkingUtil.py
@@ -1,37 +1,54 @@
 #!/eecs/research/asr/mingbin/python-workspace/hopeless/bin/python
-import numpy, os, codecs, itertools, logging
+"""
+Created on Mon Aug 4 2016
+Modified on Mon Jan 13 2017
+Filename    : LinkingUtil.mingbin.feature.test.2016.py
+Description : prepare test data for KBP EL
+Author      : fwei
+"""
+
+
+import numpy, os, codecs, itertools, logging, math, re
+import scipy.sparse
 from gigaword2feature import *
 from scipy.sparse import csr_matrix
 from sklearn import preprocessing
+from collections import defaultdict
 
 logger = logging.getLogger( __name__ )
 
+# global variables, filled in by LoadED and consumed by __main__
+offsets_of_original_list = []
+document_id = []
+entity_type = []
+mention_type = []
 
 def LoadED( rspecifier, language = 'eng' ):
     entity2cls = {
         # KBP2015 label
         'PER_NAM' : 0,
-        'PER_NOM' : 5,
         'ORG_NAM' : 1,
         'GPE_NAM' : 2,
         'LOC_NAM' : 3,
         'FAC_NAM' : 4,
+        'PER_NOM' : 5,
         'TTL_NAM' : 5,
-
-        # iflytek label
+
+        # KBP2016
         'PER_NAME' : 0,
         'ORG_NAME' : 1,
         'GPE_NAME' : 2,
         'LOC_NAME' : 3,
         'FAC_NAME' : 4,
         'PER_NOMINAL' : 5,
-        'ORG_NOMINAL' : 6,
-        'GPE_NOMINAL' : 7,
-        'LOC_NOMINAL' : 8,
-        'FAC_NOMINAL' : 9,
+        'ORG_NOM' : 6,
+        'GPE_NOM' : 7,
+        'LOC_NOM' : 8,
+        'FAC_NOM' : 9,
         'TITLE_NAME' : 5,
         'TITLE_NOMINAL' : 5
+
     }
 
     if os.path.isfile( rspecifier ):
@@ -41,6 +58,8 @@ def LoadED( rspecifier, language = 'eng' ):
             # texts, tags, failures = processed.split( u'\n\n\n', 2 )
             texts = processed.split( u'\n\n\n' )[0]
+
+
 
             for text in texts.split( u'\n\n' ):
                 parts = text.split( u'\n' )
                 # assert len(parts) in [2, 3], 'sentence, offsets, labels(optional)'
@@ -53,7 +72,8 @@ def LoadED( rspecifier, language = 'eng' ):
                               [ offsets[1:-1].split(u',') for offsets in parts[1].split() ] )
                 assert len(offsets) == len(sent), rspecifier + '\n' + \
                         str( offsets ) + '\n' + str( sent ) + '\n%d vs %d' % (len(offsets), len(sent))
-
+
+
                 if len(parts) == 3:
                     for ans in parts[-1].split():
                         try:
@@ -62,7 +82,14 @@ def LoadED( rspecifier, language = 'eng' ):
                             boe.append( int(begin_idx) )
                             eoe.append( int(end_idx) )
                             mids.append( mid )
+                            offsets_of_original_list.append("{0}-{1}".format(offsets[boe[-1]][0],
+                                                                             offsets[eoe[-1] - 1][1] - 1))
+                            #print offsets_of_original_list
                             spelling.append( original[ offsets[boe[-1]][0] : offsets[eoe[-1] - 1][1] ] )
+                            #print spelling
+                            document_id.append( rspecifier.split('/')[-1] )
+                            #print document_id
+                            #exit(0)
                         except ValueError as ex1:
                             logger.exception( rspecifier )
                             logger.exception( ans )
@@ -93,7 +120,6 @@ def LoadED( rspecifier, language = 'eng' ):
             yield X
 
 
-
 def LoadEL( rspecifier, language = 'eng', window = 1 ):
     if os.path.isfile( rspecifier ):
         data = list( LoadED( rspecifier, language ) )
@@ -130,11 +156,11 @@ def PositiveEL( embedding_basename,
         n_word = len( fp.read().strip().split() )
     logger.debug( 'a vocabulary of %d words is used' % n_word )
 
-    numericizer = vocabulary( embedding_basename + '.wordlist', 
+    numericizer = vocabulary( embedding_basename + '.wordlist',
                               case_sensitive = False )
 
    bc = batch_constructor( [ rd[:4] for rd in raw_data ],
-                            numericizer, numericizer, 
+                            numericizer, numericizer,
                             window = 1024, n_label_type = 7 )
     logger.debug( bc )
 
@@ -144,13 +170,13 @@ def PositiveEL( embedding_basename,
 
     mention = itertools.chain.from_iterable( rd[-1] for rd in raw_data )
-    # for sent, boe, eoe, _, _ in raw_data: 
+    # for sent, boe, eoe, _, _ in raw_data:
     #     for b,e in zip( boe, eoe ):
     #         mention.append( sent[b:e] )
 
-    # feature_itr = bc.mini_batch( 1, 
-    #                              shuffle_needed = False, 
-    #                              overlap_rate = 0, disjoint_rate = 0, 
+    # feature_itr = bc.mini_batch( 1,
+    #                              shuffle_needed = False,
+    #                              overlap_rate = 0, disjoint_rate = 0,
     #                              feature_choice = 7 )
     #
     # assert( len(list(mid_itr)) == len(list(feature_itr)) )
 
@@ -160,10 +186,10 @@ def PositiveEL( embedding_basename,
     #                 for i,f in enumerate(feature[:9]) ]
 
     l1v, r1v, l1i, r1i, l2v, r2v, l2i, r2i, bow = \
-            bc.mini_batch( len(bc.positive), 
-                           shuffle_needed = False, 
-                           overlap_rate = 0, 
-                           disjoint_rate = 0, 
+            bc.mini_batch( len(bc.positive),
+                           shuffle_needed = False,
+                           overlap_rate = 0,
+                           disjoint_rate = 0,
                            feature_choice = 7 ).next()[:9]
     l1 = csr_matrix( ( l1v, ( l1i[:,0].reshape([-1]), l1i[:,1].reshape([-1]) ) ),
                      shape = [len(bc.positive), n_word] ).astype( numpy.float32 )
@@ -178,72 +204,141 @@ def PositiveEL( embedding_basename,
                      shape = [len(bc.positive), n_word] ).astype( numpy.float32 )
 
     return list(mid_itr), mention, l1, l2, r1, r2, bow
 
-
-def LoadTfidf( tfidf_basename, col ): 
+
+def LoadTfidf( tfidf_basename, col ):
+
+    with open( tfidf_basename + '.list' ) as fp:
+        idx2mid = [ mid[1:-1] for mid in fp.read().strip().split() ]
+    mid2idx = { m:i for i,m in enumerate( idx2mid ) }
+
     indices = numpy.fromfile( tfidf_basename + '.indices', dtype = numpy.int32 )
     data = numpy.fromfile( tfidf_basename + '.data', dtype = numpy.float32 )
     indptr = numpy.fromfile( tfidf_basename + '.indptr', dtype = numpy.int32 )
     assert indices.shape == data.shape
-    mid2tfidf = csr_matrix( (data, indices, indptr), 
+    mid2tfidf = csr_matrix( (data, indices, indptr),
                             shape = (indptr.shape[0] - 1, col) )
     del data, indices, indptr
     mid2tfidf = mid2tfidf.astype( numpy.float32 )
-
-    with open( tfidf_basename + '.list' ) as fp:
-        idx2mid = [ mid[1:-1] for mid in fp.readlines() ]
-    mid2idx = { m:i for i,m in enumerate( idx2mid ) }
+    mid2tfidf.sort_indices()
+#    with open( tfidf_basename + '.list' ) as fp:
+#        idx2mid = [ mid[1:-1] for mid in fp.read().strip().split() ]
+#    mid2idx = { m:i for i,m in enumerate( idx2mid ) }
 
     return mid2tfidf, idx2mid, mid2idx
 
 
+def LoadCandiDict( candi_filename ):
+    candi_item_dict = {}
+    # the candidate file is tab-separated: column 0 holds the query key,
+    # column 5 the '|'-delimited list of candidate mids
+    with open( candi_filename, 'rb') as candifile:
+        for cline in candifile.readlines():
+            array = cline.split('\t')
+#            if(array[4] != ''):    # remove the NIL items
+            if(True):
+                candi_item = array[5]
+                candi_item = re.sub('[|\[\]]', '', candi_item)
+                candi_item = candi_item.split(' ')
+
+                candi_item = [word.replace('.', '/') for word in candi_item]    # replace '.' with '/' in mid
+#                candi_item_list.append(candi_item)
+
+                candi_item = map(lambda s: s.strip(), candi_item)
+                candi_item_dict[array[0]] = candi_item
+
+    logger.info( 'candidate loaded' )
+    return candi_item_dict
+
 
 if __name__ == '__main__':
-    logging.basicConfig( format = '%(asctime)s : %(levelname)s : %(message)s', 
+    logging.basicConfig( format = '%(asctime)s : %(levelname)s : %(message)s',
                          level = logging.DEBUG )
 
-    embedding_basename = 'word2vec/gigaword128-case-insensitive'
-    tfidf_basename = '/eecs/research/asr/Shared/Entity_Linking_training_data_from_Freebase/mid2tfidf'
+    embedding_basename = '/eecs/research/asr/mingbin/cleaner/word2vec/gigaword/gigaword128-case-insensitive'
+    tfidf_basename = '/eecs/research/asr/Shared/Entity_Linking_training_data_from_Freebase/result/FBeasy/mid2tfidf'
+    candi_filename = '/local/scratch/fwei/KBP/EL/data/out.edl2run3.candidate.txt'
+    output_dir = '/local/scratch/fwei/KBP/EL/result/2016'
+    input_dir = '/local/scratch/fwei/KBP/EL/xml_output/2016'
+
+
+    window_fofe = 1    # how many sentences before or after the target sentence
+
     with open( embedding_basename + '.word2vec', 'rb' ) as fp:
         shape = numpy.fromfile( fp, dtype = numpy.int32, count = 2 )
         projection = numpy.fromfile( fp, dtype = numpy.float32 ).reshape( shape )
     logger.info( 'embedding loaded' )
 
-    solution, mention, l1, l2, r1, r2, bow = PositiveEL( embedding_basename, 
-                                                         'kbp-raw-data/eng-train-parsed' )
-    logger.info( 'fofe loaded' )
-
     mid2tfidf, idx2mid, mid2idx = LoadTfidf( tfidf_basename, projection.shape[0] )
     logger.info( 'tfidf loaded' )
 
-    l1p = l1.dot( projection )
-    l2p = l2.dot( projection )
-    r1p = r1.dot( projection )
-    r2p = r2.dot( projection )
-    bowp = bow.dot( projection )
-    mid2tfidfp = mid2tfidf.dot( projection )
-    logger.info( 'projection done' )
-    del l1, l2, r1, r2, bow, mid2tfidf
-
-    bow_coef = 0.5
-
-    feature = bow_coef * bowp + (1. - bowp) * (l2p + r2p) / 2.
-    del l1p, l2p, r1p, r2p, bowp
-
-    normalized_feature = preprocessing.normalize(feature, norm = 'l2')
-    logger.info( 'feature computed & normalized' )
-    del feature
-
-    normalized_mid2tfidfp = preprocessing.normalize(mid2tfidfp, norm = 'l2')
-    logger.info( 'tfidf normalized' )
-    del mid2tfidfp
+    solution, mention, l1, l2, r1, r2, bow = PositiveEL( embedding_basename, input_dir, window = window_fofe )
+    logger.info( 'fofe loaded' )
 
-    for i,(s,m) in enumerate( zip( solution, mention ) ):
-        print s, m
-        # similarity = numpy.dot( normalized_feature[i:i + 1], normalized_mid2tfidfp.T )
-        # top = numpy.argsort( similarity, axis = 1, kind = 'heapsort' )
-        # print m, s, idx2mid[top[0,-1]]
+    candi_item_dict = LoadCandiDict( candi_filename )
+
+    f_test_kb = open(os.path.join( output_dir, 'no_sorted_uniq_test.kb'), 'w')
+    f_test_pair = open(os.path.join( output_dir, 'no_shuf_test.pair'), 'w')
+
+    print 'r2: ' + str(r2.shape[0])
+    print 'offset list: ' + str(len(offsets_of_original_list))
+    #exit(0)    # debug exit; keep commented out so the test files below are actually written
+
+    # test.ment & test.fb & test.pair & test.map
+    with open(os.path.join( output_dir, 'test.ment'), 'w') as f_test_ment, \
+         codecs.open(os.path.join( output_dir, 'test.mentMap'), 'w', encoding='utf-8') as f_test_map:
+        for t, (s, m) in enumerate( zip(solution, mention) ):
+            # variable
+            numCopyPos = 0
+
+            # test.map
+            f_test_map.write('\t'.join([str(t), u':'.join([document_id[t], offsets_of_original_list[t]]), m.replace('\n',' ')]) + '\n')
+
+            # test.ment
+            strL2 = ' '.join('%s,%s' % x for x in zip(l2[t].indices, l2[t].data))
+            strR2 = ' '.join('%s,%s' % x for x in zip(r2[t].indices, r2[t].data))
+            strBow = ' '.join('%s,%s' % x for x in zip(bow[t].indices, bow[t].data / bow[t].data.shape[0]))
+
+            f_test_ment.write('\t'.join([str(t), strL2, strR2, strBow]) + '\n')
+
+
+            # negative --> test.fb & test.pair
+            query_feature = u':'.join([document_id[t], offsets_of_original_list[t]])
+            if query_feature in candi_item_dict:
+                candi_list = candi_item_dict.get(query_feature)
+
+#                candi_idx_list = [ mid2idx[c] for c in candi_list if c in mid2idx and c != s.replace('.','/')]    # check whether the true
+#                                                                                                                  # label is in the candidate list
+                candi_idx_list = [ mid2idx[c] for c in candi_list if c in mid2idx ]
+                numCopyPos = math.floor(len(candi_idx_list) / 2)
+                for candi in candi_idx_list:
+#                    assert idx2mid[candi].replace('/','.') != s    # ensure the true label is not in the candidate list
+                    negFea = mid2tfidf[candi]
+                    strNeg = ' '.join('%s,%s' % x for x in zip(negFea.indices, negFea.data))
+                    f_test_kb.write('\t'.join([idx2mid[candi].replace('/','.'), strNeg]) + '\n')
+                    f_test_pair.write('\t'.join([str(t), idx2mid[candi].replace('/','.'), '0']) + '\n')
+
+
+            # positive --> test.fb & test.pair
+#            if s.replace('.','/') in mid2idx:
+#                posFea = mid2tfidf[mid2idx[s.replace('.','/')]]
+#                strPos = ' '.join('%s,%s' % x for x in zip(posFea.indices, posFea.data))
+#                f_train_fb.write('\t'.join([s, strPos]) + '\n')
+#                for i in range(int(numCopyPos) + 1):
+#                    f_train_pair.write('\t'.join([str(t), s, '1']) + '\n')
+
+    f_test_kb.close()
+    f_test_pair.close()
+
+    # remove duplicates from .kb
+    os.system('cat ' + os.path.join( output_dir, 'no_sorted_uniq_test.kb') + ' | sort | uniq > ' + \
+              os.path.join( output_dir, 'test.kb') )
+
+    # shuffle .pair
+    os.system('cat ' + os.path.join( output_dir, 'no_shuf_test.pair') + ' | perl -MList::Util=shuffle -e \'print shuffle(<STDIN>);\' > ' + \
+              os.path.join( output_dir, 'test.pair') )
+
+    # move cache files to /tmp
+    os.system('mv ' + os.path.join( output_dir, 'no_shuf_test.pair') + ' ' + os.path.join( output_dir, 'no_sorted_uniq_test.kb') + ' /tmp')
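
For reference, `test.ment` and `test.kb` serialize each sparse vector as a tab-separated line whose feature fields are space-separated `index,value` pairs (e.g. `3,0.5 17,0.25`), written by the `'%s,%s'` joins above. Below is a minimal sketch of reading one such field back into a SciPy CSR row; the helper name `parse_sparse_field` and the `n_col` vocabulary-size argument are illustrative assumptions, not part of the patch:

    import numpy
    from scipy.sparse import csr_matrix

    def parse_sparse_field( field, n_col ):
        # split the space-separated 'index,value' pairs of one field
        pairs = [ p.split(',') for p in field.split() ]
        indices = numpy.array( [ int(i) for i, _ in pairs ], dtype = numpy.int32 )
        data = numpy.array( [ float(v) for _, v in pairs ], dtype = numpy.float32 )
        indptr = numpy.array( [ 0, len(pairs) ], dtype = numpy.int32 )
        # same (data, indices, indptr) CSR constructor as in LoadTfidf
        return csr_matrix( (data, indices, indptr), shape = (1, n_col) )

Each `test.ment` line is then `mention_index \t L2 \t R2 \t BOW`, and each `test.kb` line is `mid \t tfidf`, matching the `f_test_ment` and `f_test_kb` writes in the patch.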