From bb6d581662ee4a08c6dc5869c2103836a43a81c5 Mon Sep 17 00:00:00 2001
From: lvyilin
Date: Tue, 31 Jul 2018 14:25:51 +0800
Subject: [PATCH] update: many files

---
 co-occur_network_building.py  | 56 +++++++++++++++++++++++++++++
 co_occur_network_statistic.py | 22 ++++++++++++
 filter_entity_sentences.py    | 45 +++++++++++++++++++++++
 get_sentence_degree_range.py  | 59 ++++++++++++++++++++++++++++++
 merge_annotation_to_db.py     | 68 +++++++++++++++++++++++++++++++++++
 merge_entity_sentences.py     | 14 ++++++++
 neo4j.py                      | 34 ++++++++++--------
 parse_new.py                  |  2 +-
 person_relation.txt           |  3 ++
 9 files changed, 288 insertions(+), 15 deletions(-)
 create mode 100644 co-occur_network_building.py
 create mode 100644 co_occur_network_statistic.py
 create mode 100644 filter_entity_sentences.py
 create mode 100644 get_sentence_degree_range.py
 create mode 100644 merge_annotation_to_db.py
 create mode 100644 merge_entity_sentences.py

diff --git a/co-occur_network_building.py b/co-occur_network_building.py
new file mode 100644
index 0000000..b04728d
--- /dev/null
+++ b/co-occur_network_building.py
@@ -0,0 +1,56 @@
+from py2neo import Graph, Node, Relationship
+import os
+
+CWD = os.getcwd()
+
+SENTENCE_FILE = os.path.join(CWD, "entity_sentences_lite.txt")
+DB = Graph(
+    "bolt://localhost:7687",
+    username="neo4j",
+    password="admin"
+)
+
+
+def build_Node(node_name, entity_type="person"):
+    n = Node(entity_type, name=node_name)
+    return n
+
+
+def build_Relation(nodeA, nodeB, relation_type):
+    r1 = Relationship(nodeA, relation_type, nodeB)
+    r1['weight'] = 1
+    return r1
+
+
+def build_N_R(nodeA_name, nodeB_name, relation_type, entityA_type="person", entityB_type="person"):
+    n1 = build_Node(nodeA_name, entityA_type)
+    DB.merge(n1, entityA_type, 'name')
+    n2 = build_Node(nodeB_name, entityB_type)
+    DB.merge(n2, entityB_type, 'name')
+    r = DB.match_one(n1, relation_type, n2)
+    if r is None:
+        r = build_Relation(n1, n2, relation_type)
+        DB.merge(r)
+    else:
+        # if r['weight'] is None:
+        #     r['weight'] = 1
+        # else:
+        r['weight'] += 1
+        r.push()
+
+
+def split_sentence(line):
+    spl = str(line).split(" ", 2)
+    return spl[0], spl[1], spl[2]
+
+
+def main():
+    with open(SENTENCE_FILE, "r", encoding="utf8") as fp:
+        for line in fp:
+            print(line)
+            entity_a, entity_b, sentence = split_sentence(line.strip())
+            build_N_R(entity_a, entity_b, "occur")
+
+
+if __name__ == '__main__':
+    main()
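Note: build_N_R() above needs several round trips per sentence pair (two merges,
match_one, then push). A single parameterized Cypher MERGE could upsert both
nodes and the edge and bump the weight in one statement -- a minimal sketch,
untested, assuming py2neo v3's Graph.run and Neo4j 3.1+ "$" parameter syntax;
build_n_r_cypher is a name chosen here, not from the patch:

def build_n_r_cypher(db, name_a, name_b):
    # Upsert both person nodes and the undirected occur edge; initialize the
    # weight on creation, increment it on every later match.
    db.run(
        "MERGE (a:person {name: $a}) "
        "MERGE (b:person {name: $b}) "
        "MERGE (a)-[r:occur]-(b) "
        "ON CREATE SET r.weight = 1 "
        "ON MATCH SET r.weight = r.weight + 1",
        a=name_a, b=name_b)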
diff --git a/co_occur_network_statistic.py b/co_occur_network_statistic.py
new file mode 100644
index 0000000..78c4676
--- /dev/null
+++ b/co_occur_network_statistic.py
@@ -0,0 +1,22 @@
+from py2neo import Graph, Node, Relationship
+import os
+
+CWD = os.getcwd()
+
+DB = Graph(
+    "bolt://localhost:7687",
+    username="neo4j",
+    password="admin"
+)
+
+
+def main():
+    res = DB.data("MATCH (a)-[r]-() RETURN a.name, sum(r.weight) as sz ORDER BY sz DESC ")
+    with open("co_occur_stats_lite.txt", "w", encoding="utf8") as g:
+        for item in res:
+            if 10 <= item['sz'] <= 50:
+                g.write("{}\t{}\n".format(item['a.name'], item['sz']))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/filter_entity_sentences.py b/filter_entity_sentences.py
new file mode 100644
index 0000000..e1c515b
--- /dev/null
+++ b/filter_entity_sentences.py
@@ -0,0 +1,45 @@
+import os
+import re
+
+FILE = "D:\\Projects\\Baike\\entity_sentences_lite.txt"
+NEW_FILE = "D:\\Projects\\Baike\\entity_sentences_v7.txt"
+
+
+def load_relation():
+    d = set()
+    with open("person_relation.txt", "r", encoding="utf8") as f:
+        for line in f:
+            li = line.split(" ")
+            # ENTITY_MAP.add(line.split(" ")[0])
+            d.add(li[0])
+    return d
+
+
+def build_relation_pattern(d):
+    s = u""
+    for k in d:
+        s += k + "|"
+    s = s.rstrip('|')
+    ptn = u"(" + s + u")"
+    return re.compile(ptn)
+
+
+def split_sentence(line):
+    spl = str(line).split(" ", 2)
+    return spl[0], spl[1], spl[2]
+
+
+RELATION_DICT = load_relation()
+RELATION_PATTERN = build_relation_pattern(RELATION_DICT)
+
+with open(FILE, 'r', encoding="utf8") as f:
+    lines = f.readlines()
+with open(NEW_FILE, "w", encoding="utf8") as f:
+    for line in lines:
+        # print(len(line))
+        # if len(line) >= 175 and len(RELATION_PATTERN.findall(line)) == 0:
+        entity_a, entity_b, sentence = split_sentence(line.strip())
+        if "'" == entity_a or "'" == entity_b:
+            print(line)
+            continue
+        f.write(line)
diff --git a/get_sentence_degree_range.py b/get_sentence_degree_range.py
new file mode 100644
index 0000000..d157610
--- /dev/null
+++ b/get_sentence_degree_range.py
@@ -0,0 +1,59 @@
+import sqlite3
+import os
+
+CWD = os.getcwd()
+
+DATA = []
+
+
+def split_sentence(line):
+    spl = str(line).split(" ", 2)
+    return spl[0], spl[1], spl[2]
+
+
+def save_to_file():
+    with open("co_occur_stats_filtered.txt", "w", encoding="utf8") as f:
+        for item in DATA:
+            f.write("{} {} {}\n".format(item[0], item[1], item[2]))
+
+
+def save_to_sqlite():
+    conn = sqlite3.connect('baike.sqlite')
+    c = conn.cursor()
+    for item in DATA:
+        item[0] = str(item[0]).replace("'", "''")
+        item[1] = str(item[1]).replace("'", "''")
+        item[2] = str(item[2]).replace("'", "''")
+        # item[2] = str(item[2]).replace('"','""')
+        sql = "insert into Data(entity_a,entity_b,sentence,relation) VALUES('{}','{}','{}',0)".format(item[0], item[1],
+                                                                                                      item[2])
+        print(sql)
+        c.execute(sql)
+
+    conn.commit()
+    conn.close()
+
+
+def read_data():
+    entity_set = set()
+    with open("co_occur_stats_lite.txt", "r", encoding="utf8") as g:
+        for line in g:
+            spl = line.split("\t")
+            entity_set.add(spl[0])
+
+    with open("entity_sentences_lite.txt", "r", encoding="utf8") as f:
+        for line in f:
+            line = line.strip()
+            entity_a, entity_b, sentence = split_sentence(line)
+            if entity_a in entity_set or entity_b in entity_set:
+                DATA.append([entity_a, entity_b, sentence])
+
+
+def main():
+    read_data()
+    # save_to_file()
+    save_to_sqlite()
+
+
+if __name__ == '__main__':
+    main()
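Note: save_to_sqlite() above escapes quotes by hand and interpolates values
straight into the SQL string. A parameterized sketch (untested; assumes the
same DATA list and table layout as the file above, and
save_to_sqlite_parameterized is a name chosen here) that makes the manual
"'" -> "''" replacement unnecessary:

def save_to_sqlite_parameterized():
    # "?" placeholders let sqlite3 quote values itself, so sentences that
    # contain quotes can no longer break the statement.
    conn = sqlite3.connect('baike.sqlite')
    c = conn.cursor()
    c.executemany(
        "insert into Data(entity_a, entity_b, sentence, relation) "
        "VALUES (?, ?, ?, 0)",
        ((item[0], item[1], item[2]) for item in DATA))
    conn.commit()
    conn.close()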
diff --git a/merge_annotation_to_db.py b/merge_annotation_to_db.py
new file mode 100644
index 0000000..1241c26
--- /dev/null
+++ b/merge_annotation_to_db.py
@@ -0,0 +1,68 @@
+# Merge the annotations produced by the earlier matching-based method into the database of co-occurrence annotations
+import sqlite3
+
+ANNOTATION_OLD_FILE = "D:\\Projects\\Baike\\sentences_annotation_auto\\annotation_fin.txt"
+PERSON_RELATION_FILE = "D:\\Projects\\Baike\\person_relation.txt"
+RELATION_MAP_FILE = "D:\\Projects\\Baike\\relation_stat_for_map.txt"
+DB = "D:\\Projects\\Baike\\baike.db"
+
+
+def split_line(line):
+    spl = str(line).rsplit(" ", 3)
+    return spl[0], spl[1], spl[2], spl[3]
+
+
+def load_relation():
+    d = dict()
+    with open(PERSON_RELATION_FILE, "r", encoding="utf8") as f:
+        for line in f:
+            li = line.split(" ")
+            d[li[0]] = li[1]
+    return d
+
+
+def load_relation_map():
+    d_num = dict()
+    with open(RELATION_MAP_FILE, "r", encoding="utf8") as f:
+        i = 1
+        for line in f:
+            lst = line.split(":")
+            relation_type = lst[0]
+            relation_name = lst[1].split(" ")
+            for rel in relation_name:
+                rel = rel.strip()
+                d_num[rel] = i
+            i += 1
+    return d_num
+
+
+RELATION_NAME = load_relation()
+RELATION_NUM_MAP = load_relation_map()
+
+
+def map_relation_to_num(relation):
+    if relation in RELATION_NAME:
+        rel = RELATION_NAME[relation]
+        print("{},{},{}".format(relation, rel, RELATION_NUM_MAP[rel]))
+        return RELATION_NUM_MAP[rel]
+
+
+with open(ANNOTATION_OLD_FILE, "r", encoding="utf8") as f:
+    conn = sqlite3.connect(DB)
+    c = conn.cursor()
+    for line in f:
+        sentence, entity_a, entity_b, relation = split_line(line)
+        relation = relation.strip()
+        c.execute(
+            "select * from Data where relation!=8 and sentence='{}' and entity_a='{}' and entity_b='{}'".format(
+                sentence, entity_a, entity_b))
+        for row in c:
+            print(row)
+            r = map_relation_to_num(relation)
+            if len(row) != 0 and r is not None:
+                id = row[0]
+                c.execute("update Data set relation={} where id={}".format(r, id))
+            else:
+                print(row)
+    conn.commit()
+    conn.close()
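Note: in the loop above, the update reuses cursor c while "for row in c" is
still iterating the SELECT; in sqlite3, calling execute() again on the same
cursor discards the pending result set, so at most one matching row is seen
per query. An untested sketch of one way around it -- drain the rows first and
parameterize the update (replacing the "for row in c" block at the same
indentation):

        rows = c.fetchall()  # materialize the SELECT before reusing the cursor
        r = map_relation_to_num(relation)
        for row in rows:
            if r is not None:
                c.execute("update Data set relation=? where id=?", (r, row[0]))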
diff --git a/merge_entity_sentences.py b/merge_entity_sentences.py
new file mode 100644
index 0000000..0f667e4
--- /dev/null
+++ b/merge_entity_sentences.py
@@ -0,0 +1,14 @@
+# Combine the extracted sentence files (sentences containing the subject entity and other entities) into one file
+import os
+
+DIR = "D:\\Projects\\Baike\\entity_sentences\\"
+FILE = "D:\\Projects\\Baike\\entity_sentences.txt"
+
+files = os.listdir(DIR)
+with open(FILE, "w", encoding="utf8") as f:
+    for filename in files:
+        with open(DIR + filename, "r", encoding="utf8") as g:
+            for line in g:
+                if line.strip() == "":
+                    continue
+                f.write(line)
diff --git a/neo4j.py b/neo4j.py
index 59b7af6..cd93e5a 100644
--- a/neo4j.py
+++ b/neo4j.py
@@ -2,7 +2,7 @@
 
 
 # This function is executed only once per run
-def delete_graph(DB, node = None, relation = None, all =0):
+def delete_graph(DB, node=None, relation=None, all=0):
     if node != None:
         if DB.exists(node):
             DB.delete(node)
@@ -12,6 +12,7 @@
     if all == 1:
         DB.delete_all()
 
+
 def build_Node(node_name, entity_type="person"):
     n = Node(entity_type, name=node_name)
     return n
@@ -22,25 +23,28 @@ def build_Relation(nodeA, nodeB, relation_type, location):
     r1 = Relationship(nodeA, relation_type, nodeB)
     r1[location] = 1
     return r1
 
+
 def add_type_unique(db, type):
     db.run("CREATE CONSTRAINT ON (ea:" + type + ")ASSERT ea.name IS UNIQUE")
 
+
 def build_N_R(m_Graph, nodeA_name, nodeB_name, relation_type, location, entityA_type="person", entityB_type="person"):
-    n1 = build_Node(nodeA_name,entityA_type)
-    m_Graph.merge(n1,entityA_type,'name')
-    n2 = build_Node(nodeB_name,entityB_type)
-    m_Graph.merge(n2,entityB_type,'name')
-    r = m_Graph.match_one(n1,relation_type,n2)
-    if r == None:
+    n1 = build_Node(nodeA_name, entityA_type)
+    m_Graph.merge(n1, entityA_type, 'name')
+    n2 = build_Node(nodeB_name, entityB_type)
+    m_Graph.merge(n2, entityB_type, 'name')
+    r = m_Graph.match_one(n1, relation_type, n2)
+    if r is None:
         r = build_Relation(n1, n2, relation_type, location)
         m_Graph.merge(r)
     else:
-        if r[location] == None:
+        if r[location] is None:
             r[location] = 1
         else:
             r[location] += 1
         r.push()
 
+
 def initDB():
     m_Graph_DB = Graph(
         "bolt://localhost:7687",
@@ -49,6 +53,7 @@
     )
     return m_Graph_DB
 
+
 def main():
     m_Graph_DB = Graph(
         "bolt://localhost:7687",
         username="neo4j",
         password="admin"
     )
     # add_type_unique(m_Graph_DB,"person")
-    delete_graph(m_Graph_DB,all=1)
+    delete_graph(m_Graph_DB, all=1)
     return
-    build_N_R(m_Graph_DB, "毛泽东", "杨开慧" , "朋友", "infobox")
-    build_N_R(m_Graph_DB, "杨开慧", "毛泽东" , "朋友", "infobox")
-    build_N_R(m_Graph_DB, "毛泽东", "周恩来" , "同事", "infobox")
+    build_N_R(m_Graph_DB, "毛泽东", "杨开慧", "朋友", "infobox")
+    build_N_R(m_Graph_DB, "杨开慧", "毛泽东", "朋友", "infobox")
+    build_N_R(m_Graph_DB, "毛泽东", "周恩来", "同事", "infobox")
 
-if __name__=="__main__":
-    main()
\ No newline at end of file
+if __name__ == "__main__":
+    main()
diff --git a/parse_new.py b/parse_new.py
index 0bcd9a2..f33b2e4 100644
--- a/parse_new.py
+++ b/parse_new.py
@@ -12,7 +12,7 @@
 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
 ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
 segmentor = pyltp.Segmentor()
-segmentor.load_with_lexicon(cws_model_path, "entity_dict.txt")
+segmentor.load_with_lexicon(cws_model_path, "entity_dict.txt")  # TODO: check that names like "周总理" are recognized
 postagger = pyltp.Postagger()
 postagger.load(pos_model_path)
 recognizer = pyltp.NamedEntityRecognizer()
diff --git a/person_relation.txt b/person_relation.txt
index 3a1616b..ea6cfd2 100644
--- a/person_relation.txt
+++ b/person_relation.txt
@@ -6,10 +6,12 @@
 外婆 祖孙 亲情
 外祖父 祖孙 亲情
 外祖母 祖孙 亲情
+之父 父子 亲情
 爸爸 父子 亲情
 妈妈 母子 亲情
 父亲 父子 亲情
 母亲 母子 亲情
+之母 母子 亲情
 大儿 父子 亲情
 次子 父子 亲情
 二子 父子 亲情
@@ -172,6 +174,7 @@
 祖姑母 姑孙 亲情
 女朋友 情侣 爱情
 男朋友 情侣 爱情
+夫妻 夫妻 爱情
 夫人 夫妻 爱情
 妻子 夫妻 爱情
 男友 情侣 爱情
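Note: rows in person_relation.txt, including the three added here, are three
space-separated columns: trigger word, relation name, relation category
(e.g. "之父 父子 亲情"). For reference, a minimal reader in the spirit of
load_relation() above (untested sketch; load_person_relations is a name chosen
here, not from the patch):

def load_person_relations(path="person_relation.txt"):
    rows = {}
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            # word -> (relation name, relation category)
            word, relation, category = line.split()
            rows[word] = (relation, category)
    return rows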