From bb6d581662ee4a08c6dc5869c2103836a43a81c5 Mon Sep 17 00:00:00 2001
From: lvyilin
Date: Tue, 31 Jul 2018 14:25:51 +0800
Subject: [PATCH] update: many files

---
 co-occur_network_building.py  | 56 +++++++++++++++++++++++++++++
 co_occur_network_statistic.py | 22 ++++++++++++
 filter_entity_sentences.py    | 45 +++++++++++++++++++++++
 get_sentence_degree_range.py  | 59 ++++++++++++++++++++++++++++++
 merge_annotation_to_db.py     | 68 +++++++++++++++++++++++++++++++++++
 merge_entity_sentences.py     | 14 ++++++++
 neo4j.py                      | 34 ++++++++++--------
 parse_new.py                  |  2 +-
 person_relation.txt           |  3 ++
 9 files changed, 288 insertions(+), 15 deletions(-)
 create mode 100644 co-occur_network_building.py
 create mode 100644 co_occur_network_statistic.py
 create mode 100644 filter_entity_sentences.py
 create mode 100644 get_sentence_degree_range.py
 create mode 100644 merge_annotation_to_db.py
 create mode 100644 merge_entity_sentences.py

diff --git a/co-occur_network_building.py b/co-occur_network_building.py
new file mode 100644
index 0000000..b04728d
--- /dev/null
+++ b/co-occur_network_building.py
@@ -0,0 +1,56 @@
+from py2neo import Graph, Node, Relationship
+import os
+
+CWD = os.getcwd()
+
+SENTENCE_FILE = os.path.join(CWD, "entity_sentences_lite.txt")
+DB = Graph(
+    "bolt://localhost:7687",
+    username="neo4j",
+    password="admin"
+)
+
+
+def build_Node(node_name, entity_type="person"):
+    n = Node(entity_type, name=node_name)
+    return n
+
+
+def build_Relation(nodeA, nodeB, relation_type):
+    r1 = Relationship(nodeA, relation_type, nodeB)
+    r1['weight'] = 1
+    return r1
+
+
+def build_N_R(nodeA_name, nodeB_name, relation_type, entityA_type="person", entityB_type="person"):
+    n1 = build_Node(nodeA_name, entityA_type)
+    DB.merge(n1, entityA_type, 'name')
+    n2 = build_Node(nodeB_name, entityB_type)
+    DB.merge(n2, entityB_type, 'name')
+    r = DB.match_one(n1, relation_type, n2)
+    if r is None:
+        r = build_Relation(n1, n2, relation_type)
+        DB.merge(r)
+    else:
+        # if r['weight'] is None:
+        #     r['weight'] = 1
+        # else:
+        r['weight'] += 1
+        r.push()
+
+
+def split_sentence(line):
+    spl = str(line).split(" ", 2)
+    return spl[0], spl[1], spl[2]
+
+
+def main():
+    with open(SENTENCE_FILE, "r", encoding="utf8") as fp:
+        for line in fp:
+            print(line)
+            entity_a, entity_b, sentence = split_sentence(line.strip())
+            build_N_R(entity_a, entity_b, "occur")
+
+
+if __name__ == '__main__':
+    main()
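Note: build_N_R() above needs several round trips per sentence pair (two merges,
match_one, then push). A single parameterized Cypher MERGE could upsert both
nodes and the edge and bump the weight in one statement -- a minimal sketch,
untested, assuming py2neo v3's Graph.run and Neo4j 3.1+ "$" parameter syntax;
build_n_r_cypher is a name chosen here, not from the patch:

def build_n_r_cypher(db, name_a, name_b):
    # Upsert both person nodes and the undirected occur edge; initialize the
    # weight on creation, increment it on every later match.
    db.run(
        "MERGE (a:person {name: $a}) "
        "MERGE (b:person {name: $b}) "
        "MERGE (a)-[r:occur]-(b) "
        "ON CREATE SET r.weight = 1 "
        "ON MATCH SET r.weight = r.weight + 1",
        a=name_a, b=name_b)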
diff --git a/co_occur_network_statistic.py b/co_occur_network_statistic.py
new file mode 100644
index 0000000..78c4676
--- /dev/null
+++ b/co_occur_network_statistic.py
@@ -0,0 +1,22 @@
+from py2neo import Graph, Node, Relationship
+import os
+
+CWD = os.getcwd()
+
+DB = Graph(
+    "bolt://localhost:7687",
+    username="neo4j",
+    password="admin"
+)
+
+
+def main():
+    res = DB.data("MATCH (a)-[r]-() RETURN a.name, sum(r.weight) as sz ORDER BY sz DESC ")
+    with open("co_occur_stats_lite.txt", "w", encoding="utf8") as g:
+        for item in res:
+            if 10 <= item['sz'] <= 50:
+                g.write("{}\t{}\n".format(item['a.name'], item['sz']))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/filter_entity_sentences.py b/filter_entity_sentences.py
new file mode 100644
index 0000000..e1c515b
--- /dev/null
+++ b/filter_entity_sentences.py
@@ -0,0 +1,45 @@
+import os
+import re
+
+FILE = "D:\\Projects\\Baike\\entity_sentences_lite.txt"
+NEW_FILE = "D:\\Projects\\Baike\\entity_sentences_v7.txt"
+
+
+def load_relation():
+    d = set()
+    with open("person_relation.txt", "r", encoding="utf8") as f:
+        for line in f:
+            li = line.split(" ")
+            # ENTITY_MAP.add(line.split(" ")[0])
+            d.add(li[0])
+    return d
+
+
+def build_relation_pattern(d):
+    s = u""
+    for k in d:
+        s += k + "|"
+    s = s.rstrip('|')
+    ptn = u"(" + s + u")"
+    return re.compile(ptn)
+
+
+def split_sentence(line):
+    spl = str(line).split(" ", 2)
+    return spl[0], spl[1], spl[2]
+
+
+RELATION_DICT = load_relation()
+RELATION_PATTERN = build_relation_pattern(RELATION_DICT)
+
+with open(FILE, 'r', encoding="utf8") as f:
+    lines = f.readlines()
+with open(NEW_FILE, "w", encoding="utf8") as f:
+    for line in lines:
+        # print(len(line))
+        # if len(line) >= 175 and len(RELATION_PATTERN.findall(line)) == 0:
+        entity_a, entity_b, sentence = split_sentence(line.strip())
+        if "'" == entity_a or "'" == entity_b:
+            print(line)
+            continue
+        f.write(line)
diff --git a/get_sentence_degree_range.py b/get_sentence_degree_range.py
new file mode 100644
index 0000000..d157610
--- /dev/null
+++ b/get_sentence_degree_range.py
@@ -0,0 +1,59 @@
+import sqlite3
+import os
+
+CWD = os.getcwd()
+
+DATA = []
+
+
+def split_sentence(line):
+    spl = str(line).split(" ", 2)
+    return spl[0], spl[1], spl[2]
+
+
+def save_to_file():
+    with open("co_occur_stats_filtered.txt", "w", encoding="utf8") as f:
+        for item in DATA:
+            f.write("{} {} {}\n".format(item[0], item[1], item[2]))
+
+
+def save_to_sqlite():
+    conn = sqlite3.connect('baike.sqlite')
+    c = conn.cursor()
+    for item in DATA:
+        item[0] = str(item[0]).replace("'", "''")
+        item[1] = str(item[1]).replace("'", "''")
+        item[2] = str(item[2]).replace("'", "''")
+        # item[2] = str(item[2]).replace('"','""')
+        sql = "insert into Data(entity_a,entity_b,sentence,relation) VALUES('{}','{}','{}',0)".format(item[0], item[1],
+                                                                                                      item[2])
+        print(sql)
+        c.execute(sql)
+
+    conn.commit()
+    conn.close()
+
+
+def read_data():
+    entity_set = set()
+    with open("co_occur_stats_lite.txt", "r", encoding="utf8") as g:
+        for line in g:
+            spl = line.split("\t")
+            entity_set.add(spl[0])
+
+    with open("entity_sentences_lite.txt", "r", encoding="utf8") as f:
+        for line in f:
+            line = line.strip()
+            entity_a, entity_b, sentence = split_sentence(line)
+            if entity_a in entity_set or entity_b in entity_set:
+                DATA.append([entity_a, entity_b, sentence])
+
+
+def main():
+    read_data()
+    # save_to_file()
+    save_to_sqlite()
+
+
+if __name__ == '__main__':
+    main()
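Note: save_to_sqlite() above escapes quotes by hand and interpolates values
straight into the SQL string. A parameterized sketch (untested; assumes the
same DATA list and table layout as the file above, and
save_to_sqlite_parameterized is a name chosen here) that makes the manual
"'" -> "''" replacement unnecessary:

def save_to_sqlite_parameterized():
    # "?" placeholders let sqlite3 quote values itself, so sentences that
    # contain quotes can no longer break the statement.
    conn = sqlite3.connect('baike.sqlite')
    c = conn.cursor()
    c.executemany(
        "insert into Data(entity_a, entity_b, sentence, relation) "
        "VALUES (?, ?, ?, 0)",
        ((item[0], item[1], item[2]) for item in DATA))
    conn.commit()
    conn.close()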
diff --git a/merge_annotation_to_db.py b/merge_annotation_to_db.py
new file mode 100644
index 0000000..1241c26
--- /dev/null
+++ b/merge_annotation_to_db.py
@@ -0,0 +1,68 @@
+# Merge the annotations produced by the earlier matching-based method into the database of co-occurrence annotations
+import sqlite3
+
+ANNOTATION_OLD_FILE = "D:\\Projects\\Baike\\sentences_annotation_auto\\annotation_fin.txt"
+PERSON_RELATION_FILE = "D:\\Projects\\Baike\\person_relation.txt"
+RELATION_MAP_FILE = "D:\\Projects\\Baike\\relation_stat_for_map.txt"
+DB = "D:\\Projects\\Baike\\baike.db"
+
+
+def split_line(line):
+    spl = str(line).rsplit(" ", 3)
+    return spl[0], spl[1], spl[2], spl[3]
+
+
+def load_relation():
+    d = dict()
+    with open(PERSON_RELATION_FILE, "r", encoding="utf8") as f:
+        for line in f:
+            li = line.split(" ")
+            d[li[0]] = li[1]
+    return d
+
+
+def load_relation_map():
+    d_num = dict()
+    with open(RELATION_MAP_FILE, "r", encoding="utf8") as f:
+        i = 1
+        for line in f:
+            lst = line.split(":")
+            relation_type = lst[0]
+            relation_name = lst[1].split(" ")
+            for rel in relation_name:
+                rel = rel.strip()
+                d_num[rel] = i
+            i += 1
+    return d_num
+
+
+RELATION_NAME = load_relation()
+RELATION_NUM_MAP = load_relation_map()
+
+
+def map_relation_to_num(relation):
+    if relation in RELATION_NAME:
+        rel = RELATION_NAME[relation]
+        print("{},{},{}".format(relation, rel, RELATION_NUM_MAP[rel]))
+        return RELATION_NUM_MAP[rel]
+
+
+with open(ANNOTATION_OLD_FILE, "r", encoding="utf8") as f:
+    conn = sqlite3.connect(DB)
+    c = conn.cursor()
+    for line in f:
+        sentence, entity_a, entity_b, relation = split_line(line)
+        relation = relation.strip()
+        c.execute(
+            "select * from Data where relation!=8 and sentence='{}' and entity_a='{}' and entity_b='{}'".format(
+                sentence, entity_a, entity_b))
+        for row in c:
+            print(row)
+            r = map_relation_to_num(relation)
+            if len(row) != 0 and r is not None:
+                id = row[0]
+                c.execute("update Data set relation={} where id={}".format(r, id))
+            else:
+                print(row)
+    conn.commit()
+    conn.close()
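Note: in the loop above, the update reuses cursor c while "for row in c" is
still iterating the SELECT; in sqlite3, calling execute() again on the same
cursor discards the pending result set, so at most one matching row is seen
per query. An untested sketch of one way around it -- drain the rows first and
parameterize the update (replacing the "for row in c" block at the same
indentation):

        rows = c.fetchall()  # materialize the SELECT before reusing the cursor
        r = map_relation_to_num(relation)
        for row in rows:
            if r is not None:
                c.execute("update Data set relation=? where id=?", (r, row[0]))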
diff --git a/merge_entity_sentences.py b/merge_entity_sentences.py
new file mode 100644
index 0000000..0f667e4
--- /dev/null
+++ b/merge_entity_sentences.py
@@ -0,0 +1,14 @@
+# Combine the extracted sentence files (sentences containing the subject entity and other entities) into one file
+import os
+
+DIR = "D:\\Projects\\Baike\\entity_sentences\\"
+FILE = "D:\\Projects\\Baike\\entity_sentences.txt"
+
+files = os.listdir(DIR)
+with open(FILE, "w", encoding="utf8") as f:
+    for filename in files:
+        with open(DIR + filename, "r", encoding="utf8") as g:
+            for line in g:
+                if line.strip() == "":
+                    continue
+                f.write(line)
diff --git a/neo4j.py b/neo4j.py
index 59b7af6..cd93e5a 100644
--- a/neo4j.py
+++ b/neo4j.py
@@ -2,7 +2,7 @@
 
 
 # This function is executed only once per run
-def delete_graph(DB, node = None, relation = None, all =0):
+def delete_graph(DB, node=None, relation=None, all=0):
     if node != None:
         if DB.exists(node):
             DB.delete(node)
@@ -12,6 +12,7 @@
     if all == 1:
         DB.delete_all()
 
+
 def build_Node(node_name, entity_type="person"):
     n = Node(entity_type, name=node_name)
     return n
@@ -22,25 +23,28 @@ def build_Relation(nodeA, nodeB, relation_type, location):
     r1 = Relationship(nodeA, relation_type, nodeB)
     r1[location] = 1
     return r1
 
+
 def add_type_unique(db, type):
     db.run("CREATE CONSTRAINT ON (ea:" + type + ")ASSERT ea.name IS UNIQUE")
 
+
 def build_N_R(m_Graph, nodeA_name, nodeB_name, relation_type, location, entityA_type="person", entityB_type="person"):
-    n1 = build_Node(nodeA_name,entityA_type)
-    m_Graph.merge(n1,entityA_type,'name')
-    n2 = build_Node(nodeB_name,entityB_type)
-    m_Graph.merge(n2,entityB_type,'name')
-    r = m_Graph.match_one(n1,relation_type,n2)
-    if r == None:
+    n1 = build_Node(nodeA_name, entityA_type)
+    m_Graph.merge(n1, entityA_type, 'name')
+    n2 = build_Node(nodeB_name, entityB_type)
+    m_Graph.merge(n2, entityB_type, 'name')
+    r = m_Graph.match_one(n1, relation_type, n2)
+    if r is None:
         r = build_Relation(n1, n2, relation_type, location)
         m_Graph.merge(r)
     else:
-        if r[location] == None:
+        if r[location] is None:
             r[location] = 1
         else:
             r[location] += 1
         r.push()
 
+
 def initDB():
     m_Graph_DB = Graph(
         "bolt://localhost:7687",
@@ -49,6 +53,7 @@
     )
     return m_Graph_DB
 
+
 def main():
     m_Graph_DB = Graph(
         "bolt://localhost:7687",
         username="neo4j",
         password="admin"
     )
     # add_type_unique(m_Graph_DB,"person")
-    delete_graph(m_Graph_DB,all=1)
+    delete_graph(m_Graph_DB, all=1)
     return
-    build_N_R(m_Graph_DB, "毛泽东", "杨开慧" , "朋友", "infobox")
-    build_N_R(m_Graph_DB, "杨开慧", "毛泽东" , "朋友", "infobox")
-    build_N_R(m_Graph_DB, "毛泽东", "周恩来" , "同事", "infobox")
+    build_N_R(m_Graph_DB, "毛泽东", "杨开慧", "朋友", "infobox")
+    build_N_R(m_Graph_DB, "杨开慧", "毛泽东", "朋友", "infobox")
+    build_N_R(m_Graph_DB, "毛泽东", "周恩来", "同事", "infobox")
 
-if __name__=="__main__":
-    main()
\ No newline at end of file
+if __name__ == "__main__":
+    main()
diff --git a/parse_new.py b/parse_new.py
index 0bcd9a2..f33b2e4 100644
--- a/parse_new.py
+++ b/parse_new.py
@@ -12,7 +12,7 @@
 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
 ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
 segmentor = pyltp.Segmentor()
-segmentor.load_with_lexicon(cws_model_path, "entity_dict.txt")
+segmentor.load_with_lexicon(cws_model_path, "entity_dict.txt")  # TODO: check that names like "周总理" are recognized
 postagger = pyltp.Postagger()
 postagger.load(pos_model_path)
 recognizer = pyltp.NamedEntityRecognizer()
diff --git a/person_relation.txt b/person_relation.txt
index 3a1616b..ea6cfd2 100644
--- a/person_relation.txt
+++ b/person_relation.txt
@@ -6,10 +6,12 @@
 外婆 祖孙 亲情
 外祖父 祖孙 亲情
 外祖母 祖孙 亲情
+之父 父子 亲情
 爸爸 父子 亲情
 妈妈 母子 亲情
 父亲 父子 亲情
 母亲 母子 亲情
+之母 母子 亲情
 大儿 父子 亲情
 次子 父子 亲情
 二子 父子 亲情
@@ -172,6 +174,7 @@
 祖姑母 姑孙 亲情
 女朋友 情侣 爱情
 男朋友 情侣 爱情
+夫妻 夫妻 爱情
 夫人 夫妻 爱情
 妻子 夫妻 爱情
 男友 情侣 爱情
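Note: rows in person_relation.txt, including the three added here, are three
space-separated columns: trigger word, relation name, relation category
(e.g. "之父 父子 亲情"). For reference, a minimal reader in the spirit of
load_relation() above (untested sketch; load_person_relations is a name chosen
here, not from the patch):

def load_person_relations(path="person_relation.txt"):
    rows = {}
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            # word -> (relation name, relation category)
            word, relation, category = line.split()
            rows[word] = (relation, category)
    return rows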