update: many files
lvyilin committed Jul 31, 2018
1 parent 4e0996f commit bb6d581
Showing 9 changed files with 288 additions and 15 deletions.
56 changes: 56 additions & 0 deletions co-occur_network_building.py
@@ -0,0 +1,56 @@
from py2neo import Graph, Node, Relationship
import os

CWD = os.getcwd()

SENTENCE_FILE = os.path.join(CWD, "entity_sentences_lite.txt")
DB = Graph(
    "bolt://localhost:7687",
    username="neo4j",
    password="admin"
)


def build_Node(node_name, entity_type="person"):
    n = Node(entity_type, name=node_name)
    return n


def build_Relation(nodeA, nodeB, relation_type):
    r1 = Relationship(nodeA, relation_type, nodeB)
    r1['weight'] = 1
    return r1


def build_N_R(nodeA_name, nodeB_name, relation_type, entityA_type="person", entityB_type="person"):
    n1 = build_Node(nodeA_name, entityA_type)
    DB.merge(n1, entityA_type, 'name')
    n2 = build_Node(nodeB_name, entityB_type)
    DB.merge(n2, entityB_type, 'name')
    r = DB.match_one(n1, relation_type, n2)
    if r is None:
        r = build_Relation(n1, n2, relation_type)
        DB.merge(r)
    else:
        # if r['weight'] is None:
        #     r['weight'] = 1
        # else:
        r['weight'] += 1
    r.push()


def split_sentence(line):
    spl = str(line).split(" ", 2)
    return spl[0], spl[1], spl[2]


def main():
    with open(SENTENCE_FILE, "r", encoding="utf8") as fp:
        for line in fp:
            print(line)
            entity_a, entity_b, sentence = split_sentence(line.strip())
            build_N_R(entity_a, entity_b, "occur")


if __name__ == '__main__':
    main()
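
Note: build_N_R makes several driver round trips per sentence pair (two node merges, a relationship lookup, then a merge or a push). The whole upsert can also be expressed as a single Cypher statement; a minimal sketch, assuming py2neo's Graph.run and Neo4j 3.x $-parameter syntax (build_N_R_cypher is an illustrative helper, not part of this commit):

def build_N_R_cypher(nodeA_name, nodeB_name):
    # MERGE creates the nodes/relationship on first sight and
    # increments the weight on every later co-occurrence
    DB.run(
        "MERGE (a:person {name: $a_name}) "
        "MERGE (b:person {name: $b_name}) "
        "MERGE (a)-[r:occur]->(b) "
        "ON CREATE SET r.weight = 1 "
        "ON MATCH SET r.weight = r.weight + 1",
        a_name=nodeA_name, b_name=nodeB_name)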
22 changes: 22 additions & 0 deletions co_occur_network_statistic.py
@@ -0,0 +1,22 @@
from py2neo import Graph, Node, Relationship
import os

CWD = os.getcwd()

DB = Graph(
    "bolt://localhost:7687",
    username="neo4j",
    password="admin"
)


def main():
    # Sum the weights on each node's relationships (its weighted co-occurrence degree)
    res = DB.data("MATCH (a)-[r]-() RETURN a.name, sum(r.weight) as sz ORDER BY sz DESC")
    with open("co_occur_stats_lite.txt", "w", encoding="utf8") as g:
        for item in res:
            if 10 <= item['sz'] <= 50:
                g.write("{}\t{}\n".format(item['a.name'], item['sz']))


if __name__ == '__main__':
    main()
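
Note: the 10-50 degree window is filtered client-side after pulling every row. A hedged variant pushes the range filter into Cypher instead (a sketch against the same DB object and weight property; main_server_side_filter is an illustrative name):

def main_server_side_filter():
    # WITH aggregates first so that WHERE can act on the summed weight
    res = DB.data(
        "MATCH (a)-[r]-() "
        "WITH a.name AS name, sum(r.weight) AS sz "
        "WHERE sz >= 10 AND sz <= 50 "
        "RETURN name, sz ORDER BY sz DESC")
    for item in res:
        print(item['name'], item['sz'])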
45 changes: 45 additions & 0 deletions filter_entity_sentences.py
@@ -0,0 +1,45 @@
import os
import re

FILE = "D:\\Projects\\Baike\\entity_sentences_lite.txt"
NEW_FILE = "D:\\Projects\\Baike\\entity_sentences_v7.txt"


def load_relation():
    d = set()
    with open("person_relation.txt", "r", encoding="utf8") as f:
        for line in f:
            li = line.split(" ")
            # ENTITY_MAP.add(line.split(" ")[0])
            d.add(li[0])
    return d


def build_relation_pattern(d):
    s = u""
    for k in d:
        s += k + "|"
    s = s.rstrip('|')
    ptn = u"(" + s + u")"
    return re.compile(ptn)


def split_sentence(line):
    spl = str(line).split(" ", 2)
    return spl[0], spl[1], spl[2]


RELATION_DICT = load_relation()
RELATION_PATTERN = build_relation_pattern(RELATION_DICT)

with open(FILE, 'r', encoding="utf8") as f:
    lines = f.readlines()
with open(NEW_FILE, "w", encoding="utf8") as f:
    for line in lines:
        # print(len(line))
        # if len(line) >= 175 and len(RELATION_PATTERN.findall(line)) == 0:
        entity_a, entity_b, sentence = split_sentence(line.strip())
        if "'" == entity_a or "'" == entity_b:
            print(line)
            continue
        f.write(line)
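
Note: build_relation_pattern concatenates the raw keywords, so a relation word containing a regex metacharacter would silently corrupt the alternation. An equivalent, safer construction over the same input set (a sketch; behavior is identical for plain keywords):

def build_relation_pattern_escaped(d):
    # re.escape guards each keyword; str.join removes the need for rstrip('|')
    return re.compile("(" + "|".join(re.escape(k) for k in d) + ")")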
59 changes: 59 additions & 0 deletions get_sentence_degree_range.py
@@ -0,0 +1,59 @@
import sqlite3
import os

CWD = os.getcwd()

DATA = []


def split_sentence(line):
    spl = str(line).split(" ", 2)
    return spl[0], spl[1], spl[2]


def save_to_file():
    with open("co_occur_stats_filtered.txt", "w", encoding="utf8") as f:
        for item in DATA:
            f.write("{} {} {}\n".format(item[0], item[1], item[2]))


def save_to_sqlite():
    conn = sqlite3.connect('baike.sqlite')
    c = conn.cursor()
    for item in DATA:
        item[0] = str(item[0]).replace("'", "''")
        item[1] = str(item[1]).replace("'", "''")
        item[2] = str(item[2]).replace("'", "''")
        # item[2] = str(item[2]).replace('"', '""')
        sql = "insert into Data(entity_a,entity_b,sentence,relation) VALUES('{}','{}','{}',0)".format(
            item[0], item[1], item[2])
        print(sql)
        c.execute(sql)

    conn.commit()
    conn.close()


def read_data():
    entity_set = set()
    with open("co_occur_stats_lite.txt", "r", encoding="utf8") as g:
        for line in g:
            spl = line.split("\t")
            entity_set.add(spl[0])

    with open("entity_sentences_lite.txt", "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            entity_a, entity_b, sentence = split_sentence(line)
            if entity_a in entity_set or entity_b in entity_set:
                DATA.append([entity_a, entity_b, sentence])


def main():
    read_data()
    # save_to_file()
    save_to_sqlite()


if __name__ == '__main__':
    main()
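
Note: save_to_sqlite escapes single quotes by hand and interpolates the values into the SQL string. sqlite3's parameter binding does the quoting itself; a minimal sketch of the same bulk insert (save_to_sqlite_parameterized is an illustrative name, and the existing Data table schema is assumed):

def save_to_sqlite_parameterized():
    conn = sqlite3.connect('baike.sqlite')
    c = conn.cursor()
    # '?' placeholders let sqlite3 bind the values, so no replace("'", "''") is needed
    c.executemany(
        "insert into Data(entity_a,entity_b,sentence,relation) VALUES(?,?,?,0)",
        [(item[0], item[1], item[2]) for item in DATA])
    conn.commit()
    conn.close()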
68 changes: 68 additions & 0 deletions merge_annotation_to_db.py
@@ -0,0 +1,68 @@
# Merge the annotations produced by the earlier pattern-matching approach
# into the database holding the co-occurrence annotation results
import sqlite3

ANNOTATION_OLD_FILE = "D:\\Projects\\Baike\\sentences_annotation_auto\\annotation_fin.txt"
PERSON_RELATION_FILE = "D:\\Projects\\Baike\\person_relation.txt"
RELATION_MAP_FILE = "D:\\Projects\\Baike\\relation_stat_for_map.txt"
DB = "D:\\Projects\\Baike\\baike.db"


def split_line(line):
    spl = str(line).rsplit(" ", 3)
    return spl[0], spl[1], spl[2], spl[3]


def load_relation():
    d = dict()
    with open(PERSON_RELATION_FILE, "r", encoding="utf8") as f:
        for line in f:
            li = line.split(" ")
            d[li[0]] = li[1]
    return d


def load_relation_map():
    d_num = dict()
    with open(RELATION_MAP_FILE, "r", encoding="utf8") as f:
        i = 1
        for line in f:
            lst = line.split(":")
            relation_type = lst[0]
            relation_name = lst[1].split(" ")
            for rel in relation_name:
                rel = rel.strip()
                d_num[rel] = i
            i += 1
    return d_num


RELATION_NAME = load_relation()
RELATION_NUM_MAP = load_relation_map()


def map_relation_to_num(relation):
    if relation in RELATION_NAME:
        rel = RELATION_NAME[relation]
        print("{},{},{}".format(relation, rel, RELATION_NUM_MAP[rel]))
        return RELATION_NUM_MAP[rel]


with open(ANNOTATION_OLD_FILE, "r", encoding="utf8") as f:
    conn = sqlite3.connect(DB)
    c = conn.cursor()
    for line in f:
        sentence, entity_a, entity_b, relation = split_line(line)
        relation = relation.strip()
        c.execute(
            "select * from Data where relation!=8 and sentence='{}' and entity_a='{}' and entity_b='{}'".format(
                sentence, entity_a, entity_b))
        # Fetch the matches up front: issuing the UPDATE below on the same
        # cursor would otherwise reset the pending SELECT iteration
        for row in c.fetchall():
            print(row)
            r = map_relation_to_num(relation)
            if len(row) != 0 and r is not None:
                row_id = row[0]
                c.execute("update Data set relation={} where id={}".format(r, row_id))
            else:
                print(row)
    conn.commit()
    conn.close()
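
Note: the SELECT above still breaks on any sentence containing a single quote, since the values are formatted into the SQL text. A hedged sketch of the same lookup and update with parameter binding (same connection; a hypothetical rewrite of the loop body, not the committed code):

c.execute(
    "select id from Data where relation!=8 and sentence=? and entity_a=? and entity_b=?",
    (sentence, entity_a, entity_b))
r = map_relation_to_num(relation)
if r is not None:
    for (row_id,) in c.fetchall():
        # conn.execute opens a fresh cursor, leaving c's result set intact
        conn.execute("update Data set relation=? where id=?", (r, row_id))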
14 changes: 14 additions & 0 deletions merge_entity_sentences.py
@@ -0,0 +1,14 @@
# Merge the extracted per-entity files of sentences (each containing the
# subject entity and other entities) into a single file
import os

DIR = "D:\\Projects\\Baike\\entity_sentences\\"
FILE = "D:\\Projects\\Baike\\entity_sentences.txt"

files = os.listdir(DIR)
with open(FILE, "w", encoding="utf8") as f:
    for filename in files:
        with open(DIR + filename, "r", encoding="utf8") as g:
            for line in g:
                if line.strip() == "":
                    continue
                f.write(line)
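
Note: DIR + filename only works because DIR carries a trailing backslash; os.path.join is the separator-safe spelling. A sketch of the same loop under that assumption:

for filename in os.listdir(DIR):
    # os.path.join works whether or not DIR ends with a separator
    with open(os.path.join(DIR, filename), "r", encoding="utf8") as g:
        for line in g:
            if line.strip():
                f.write(line)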
34 changes: 20 additions & 14 deletions neo4j.py
@@ -2,7 +2,7 @@


 # This function should be executed only once per run
-def delete_graph(DB, node = None, relation = None, all =0):
+def delete_graph(DB, node=None, relation=None, all=0):
     if node != None:
         if DB.exists(node):
             DB.delete(node)
@@ -12,6 +12,7 @@ def delete_graph(DB, node = None, relation = None, all =0):
     if all == 1:
         DB.delete_all()

+
 def build_Node(node_name, entity_type="person"):
     n = Node(entity_type, name=node_name)
     return n
@@ -22,25 +23,28 @@ def build_Relation(nodeA, nodeB, relation_type, location):
     r1[location] = 1
     return r1

+
 def add_type_unique(db, type):
     db.run("CREATE CONSTRAINT ON (ea:" + type + ")ASSERT ea.name IS UNIQUE")

+
 def build_N_R(m_Graph, nodeA_name, nodeB_name, relation_type, location, entityA_type="person", entityB_type="person"):
-    n1 = build_Node(nodeA_name,entityA_type)
-    m_Graph.merge(n1,entityA_type,'name')
-    n2 = build_Node(nodeB_name,entityB_type)
-    m_Graph.merge(n2,entityB_type,'name')
-    r = m_Graph.match_one(n1,relation_type,n2)
-    if r == None:
+    n1 = build_Node(nodeA_name, entityA_type)
+    m_Graph.merge(n1, entityA_type, 'name')
+    n2 = build_Node(nodeB_name, entityB_type)
+    m_Graph.merge(n2, entityB_type, 'name')
+    r = m_Graph.match_one(n1, relation_type, n2)
+    if r is None:
         r = build_Relation(n1, n2, relation_type, location)
         m_Graph.merge(r)
     else:
-        if r[location] == None:
+        if r[location] is None:
             r[location] = 1
         else:
             r[location] += 1
     r.push()

+
 def initDB():
     m_Graph_DB = Graph(
         "bolt://localhost:7687",
@@ -49,18 +53,20 @@ def initDB():
     )
     return m_Graph_DB

+
 def main():
     m_Graph_DB = Graph(
         "bolt://localhost:7687",
         username="neo4j",
         password="admin"
     )
     # add_type_unique(m_Graph_DB,"person")
-    delete_graph(m_Graph_DB,all=1)
+    delete_graph(m_Graph_DB, all=1)
     return
-    build_N_R(m_Graph_DB, "毛泽东", "杨开慧" , "朋友", "infobox")
-    build_N_R(m_Graph_DB, "杨开慧", "毛泽东" , "朋友", "infobox")
-    build_N_R(m_Graph_DB, "毛泽东", "周恩来" , "同事", "infobox")
+    build_N_R(m_Graph_DB, "毛泽东", "杨开慧", "朋友", "infobox")
+    build_N_R(m_Graph_DB, "杨开慧", "毛泽东", "朋友", "infobox")
+    build_N_R(m_Graph_DB, "毛泽东", "周恩来", "同事", "infobox")

+
-if __name__=="__main__":
-    main()
+if __name__ == "__main__":
+    main()
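
Note: one context line worth flagging: add_type_unique emits "CREATE CONSTRAINT ON (ea:<type>)ASSERT ea.name IS UNIQUE" with no space before ASSERT. The documented Neo4j 3.x constraint form is spaced; a sketch of the safer construction (same helper, shown for illustration only):

def add_type_unique(db, type):
    # keep a space before ASSERT to match the documented constraint syntax
    db.run("CREATE CONSTRAINT ON (ea:" + type + ") ASSERT ea.name IS UNIQUE")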
2 changes: 1 addition & 1 deletion parse_new.py
@@ -12,7 +12,7 @@
 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
 ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
 segmentor = pyltp.Segmentor()
-segmentor.load_with_lexicon(cws_model_path, "entity_dict.txt")
+segmentor.load_with_lexicon(cws_model_path, "entity_dict.txt")  # TODO: check that aliases such as "周总理" are recognized
 postagger = pyltp.Postagger()
 postagger.load(pos_model_path)
 recognizer = pyltp.NamedEntityRecognizer()
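Note: a quick way to act on that TODO is to segment a probe sentence and check that the alias survives as a single token (a sketch; assumes the segmentor above has loaded the lexicon):

words = list(segmentor.segment("周总理出席了会议"))
print(words)  # expect '周总理' to appear as one token if the lexicon entry took effect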
3 changes: 3 additions & 0 deletions person_relation.txt
@@ -6,10 +6,12 @@
 外婆 祖孙 亲情
 外祖父 祖孙 亲情
 外祖母 祖孙 亲情
+之父 父子 亲情
 爸爸 父子 亲情
 妈妈 母子 亲情
 父亲 父子 亲情
 母亲 母子 亲情
+之母 母子 亲情
 大儿 父子 亲情
 次子 父子 亲情
 二子 父子 亲情
@@ -172,6 +174,7 @@
 祖姑母 姑孙 亲情
 女朋友 情侣 爱情
 男朋友 情侣 爱情
+夫妻 夫妻 爱情
 夫人 夫妻 爱情
 妻子 夫妻 爱情
 男友 情侣 爱情
