-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
lvyilin
committed
Jul 31, 2018
1 parent
4e0996f
commit bb6d581
Showing
9 changed files
with
288 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
from py2neo import Graph, Node, Relationship | ||
import os | ||
|
||
# The input corpus is resolved relative to the directory the script is launched from.
CWD = os.getcwd()
SENTENCE_FILE = os.path.join(CWD, "entity_sentences_lite.txt")

# Module-wide py2neo graph handle, pointed at a Neo4j server on localhost via bolt://.
# NOTE(review): credentials are hard-coded here; consider moving them to configuration.
DB = Graph("bolt://localhost:7687", username="neo4j", password="admin")
|
||
|
||
def build_Node(node_name, entity_type="person"):
    """Return a new py2neo ``Node`` labelled *entity_type* with ``name=node_name``.

    The node is only constructed here; nothing is written to the graph.
    """
    return Node(entity_type, name=node_name)
|
||
|
||
def build_Relation(nodeA, nodeB, relation_type):
    """Return a new ``Relationship`` from *nodeA* to *nodeB* of type *relation_type*.

    The relationship starts with a ``weight`` property of 1 and is not written
    to the graph by this function.
    """
    relation = Relationship(nodeA, relation_type, nodeB)
    relation['weight'] = 1  # first observation of this pair
    return relation
|
||
|
||
def build_N_R(nodeA_name, nodeB_name, relation_type, entityA_type="person", entityB_type="person"):
    """Record one occurrence of *relation_type* between two named entities.

    Both nodes are merged into ``DB`` using their label and the ``name``
    property as merge arguments. If a relationship of *relation_type* from A
    to B is already found, its ``weight`` is incremented and pushed; otherwise
    a fresh relationship (weight 1) is merged in.
    """
    node_a = build_Node(nodeA_name, entityA_type)
    DB.merge(node_a, entityA_type, 'name')
    node_b = build_Node(nodeB_name, entityB_type)
    DB.merge(node_b, entityB_type, 'name')

    existing = DB.match_one(node_a, relation_type, node_b)
    if existing is not None:
        # Seen before: bump the counter and write the change back.
        # NOTE(review): assumes an existing relationship always has a
        # ``weight`` property; a missing one would make ``+=`` fail.
        existing['weight'] += 1
        existing.push()
        return

    DB.merge(build_Relation(node_a, node_b, relation_type))
|
||
|
||
def split_sentence(line):
    """Split *line* on its first two spaces into ``(entity_a, entity_b, sentence)``.

    The sentence part keeps any further spaces. Raises ``IndexError`` when the
    line holds fewer than three space-separated fields.
    """
    fields = str(line).split(" ", 2)
    return tuple(fields[i] for i in range(3))
|
||
|
||
def main():
    """Load every line of ``SENTENCE_FILE`` into the graph as an "occur" relation.

    Each line is echoed to stdout, stripped, split into two entity names plus
    a sentence, and the entity pair is passed to ``build_N_R``. The sentence
    text itself is not stored.
    """
    with open(SENTENCE_FILE, "r", encoding="utf8") as fp:
        for raw_line in fp:
            print(raw_line)
            entity_a, entity_b, _sentence = split_sentence(raw_line.strip())
            build_N_R(entity_a, entity_b, "occur")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from py2neo import Graph, Node, Relationship | ||
import os | ||
|
||
# Directory the script was launched from.
CWD = os.getcwd()

# Module-wide py2neo graph handle, pointed at a Neo4j server on localhost via bolt://.
# NOTE(review): credentials are hard-coded here; consider moving them to configuration.
DB = Graph("bolt://localhost:7687", username="neo4j", password="admin")
|
||
|
||
def main():
    """Export node names whose summed relationship weight lies in [10, 50].

    Runs one Cypher query that sums ``r.weight`` per ``a.name`` over
    relationships in either direction, then writes ``name<TAB>total`` lines to
    ``co_occur_stats_lite.txt`` for the rows inside the range.
    """
    records = DB.data("MATCH (a)-[r]-() RETURN a.name, sum(r.weight) as sz ORDER BY sz DESC ")
    with open("co_occur_stats_lite.txt", "w", encoding="utf8") as out:
        for record in records:
            total = record['sz']
            if 10 <= total <= 50:
                out.write("{}\t{}\n".format(record['a.name'], total))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import os | ||
import re | ||
|
||
# Input corpus: read in full by the filtering script at the bottom of this file.
FILE = "D:\\Projects\\Baike\\entity_sentences_lite.txt"
# Output corpus: receives every input line that survives the filter.
NEW_FILE = "D:\\Projects\\Baike\\entity_sentences_v7.txt"
|
||
|
||
def load_relation():
    """Return the set of first space-separated fields from ``person_relation.txt``.

    The file is opened relative to the current working directory.
    """
    with open("person_relation.txt", "r", encoding="utf8") as f:
        return {line.split(" ")[0] for line in f}
|
||
|
||
def build_relation_pattern(d):
    """Compile a regex with one capturing group alternating over the items of *d*.

    Items are inserted verbatim (no regex escaping), and any trailing ``|``
    left on the joined alternation is stripped before it is wrapped in
    parentheses. An empty *d* yields the pattern ``()``.
    """
    alternation = "|".join(d).rstrip('|')
    return re.compile("(" + alternation + ")")
|
||
|
||
def split_sentence(line):
    """Break *line* into ``(entity_a, entity_b, sentence)`` at its first two spaces.

    Raises ``IndexError`` if fewer than three fields are present.
    """
    parts = str(line).split(" ", 2)
    entity_a = parts[0]
    entity_b = parts[1]
    sentence = parts[2]
    return entity_a, entity_b, sentence
|
||
|
||
# Relation vocabulary and its regex; built at import time. The filter below
# does not use them (the check that did is no longer active).
RELATION_DICT = load_relation()
RELATION_PATTERN = build_relation_pattern(RELATION_DICT)

# Copy FILE to NEW_FILE, dropping (and echoing) every line in which either
# entity is a lone apostrophe.
with open(FILE, 'r', encoding="utf8") as f:
    lines = f.readlines()
with open(NEW_FILE, "w", encoding="utf8") as f:
    for line in lines:
        entity_a, entity_b, sentence = split_sentence(line.strip())
        if "'" in (entity_a, entity_b):
            print(line)
        else:
            f.write(line)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import sqlite3 | ||
import os | ||
|
||
# Directory the script was launched from.
CWD = os.getcwd()

# Shared row buffer: read_data() appends [entity_a, entity_b, sentence] lists,
# and the save_to_* functions consume them.
DATA = []
|
||
|
||
def split_sentence(line):
    """Return ``(entity_a, entity_b, sentence)`` from a space-delimited line.

    Only the first two spaces act as delimiters. A line with fewer than three
    fields raises ``IndexError``.
    """
    pieces = str(line).split(" ", 2)
    sentence = pieces[2]
    return pieces[0], pieces[1], sentence
|
||
|
||
def save_to_file():
    """Write every row of ``DATA`` to ``co_occur_stats_filtered.txt``.

    Each row becomes one line holding its first three items separated by
    single spaces.
    """
    with open("co_occur_stats_filtered.txt", "w", encoding="utf8") as out:
        out.writelines(
            "{} {} {}\n".format(row[0], row[1], row[2]) for row in DATA
        )
|
||
|
||
def save_to_sqlite():
    """Insert every row of ``DATA`` into the ``Data`` table of ``baike.sqlite``.

    Each row supplies ``entity_a``, ``entity_b`` and ``sentence`` (converted
    with ``str``); ``relation`` is always stored as 0. The ``Data`` table must
    already exist. All rows are committed together, and the connection is
    closed even if an insert fails.

    Values are passed as bound parameters, so quotes in the text need no
    manual escaping and the rows in ``DATA`` are left unmodified.
    """
    rows = [(str(item[0]), str(item[1]), str(item[2])) for item in DATA]
    conn = sqlite3.connect('baike.sqlite')
    try:
        conn.executemany(
            "insert into Data(entity_a,entity_b,sentence,relation) VALUES(?,?,?,0)",
            rows,
        )
        conn.commit()
    finally:
        conn.close()
|
||
|
||
def read_data():
    """Fill ``DATA`` with corpus rows that mention a selected entity.

    The selected entities are the first tab-separated field of each line in
    ``co_occur_stats_lite.txt``. Every line of ``entity_sentences_lite.txt``
    is stripped and split; rows where either entity is selected are appended
    to ``DATA`` as ``[entity_a, entity_b, sentence]``.
    """
    with open("co_occur_stats_lite.txt", "r", encoding="utf8") as stats:
        entity_set = {stat_line.split("\t")[0] for stat_line in stats}

    with open("entity_sentences_lite.txt", "r", encoding="utf8") as corpus:
        for raw in corpus:
            entity_a, entity_b, sentence = split_sentence(raw.strip())
            if not entity_set.isdisjoint((entity_a, entity_b)):
                DATA.append([entity_a, entity_b, sentence])
|
||
|
||
def main():
    """Load the filtered rows into ``DATA`` and store them in SQLite."""
    read_data()
    # The plain-text export (save_to_file) is currently switched off.
    save_to_sqlite()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Merge the labels produced by the earlier pattern-matching annotation into the database of co-occurrence annotation results | ||
import sqlite3 | ||
|
||
# Older annotation file; each line is parsed by split_line() in the update script below.
ANNOTATION_OLD_FILE = "D:\\Projects\\Baike\\sentences_annotation_auto\\annotation_fin.txt"
# Read by load_relation(): first field -> second field of each line.
PERSON_RELATION_FILE = "D:\\Projects\\Baike\\person_relation.txt"
# Read by load_relation_map(): names after the ":" on each line share that line's number.
RELATION_MAP_FILE = "D:\\Projects\\Baike\\relation_stat_for_map.txt"
# Path of the SQLite database holding the Data table that gets updated.
DB = "D:\\Projects\\Baike\\baike.db"
|
||
|
||
def split_line(line):
    """Split *line* from the right into ``(sentence, entity_a, entity_b, relation)``.

    Only the last three spaces act as delimiters, so the sentence may itself
    contain spaces. The relation keeps any trailing newline. Raises
    ``IndexError`` when fewer than four fields are present.
    """
    fields = str(line).rsplit(" ", 3)
    return tuple(fields[i] for i in range(4))
|
||
|
||
def load_relation():
    """Return a dict mapping field 0 to field 1 for each line of ``PERSON_RELATION_FILE``.

    Lines are split on single spaces; when a key repeats, the last line wins.
    """
    # NOTE(review): lines are not stripped, so on a two-field line the value
    # keeps its trailing newline. Confirm the file always has 3+ fields.
    with open(PERSON_RELATION_FILE, "r", encoding="utf8") as f:
        split_lines = (line.split(" ") for line in f)
        return {fields[0]: fields[1] for fields in split_lines}
|
||
|
||
def load_relation_map():
    """Return a dict mapping each relation name to the 1-based number of its line.

    Every line of ``RELATION_MAP_FILE`` is read as ``<type>:<name> <name> ...``;
    all names after the first colon (stripped of surrounding whitespace) get
    the line's number. Raises ``IndexError`` for a line without a colon.
    """
    mapping = {}
    with open(RELATION_MAP_FILE, "r", encoding="utf8") as f:
        for number, line in enumerate(f, start=1):
            names = line.split(":")[1].split(" ")
            mapping.update((name.strip(), number) for name in names)
    return mapping
|
||
|
||
# Lookup tables built once at import time; both loaders read their files immediately.
RELATION_NAME = load_relation()  # relation word -> value from PERSON_RELATION_FILE
RELATION_NUM_MAP = load_relation_map()  # relation name -> line number in RELATION_MAP_FILE
|
||
|
||
def map_relation_to_num(relation):
    """Translate a relation word into its number, or ``None`` if it is unknown.

    The word is first looked up in ``RELATION_NAME``; the resulting name is
    then looked up in ``RELATION_NUM_MAP``. A successful lookup is echoed to
    stdout as ``word,name,number``. Raises ``KeyError`` when the name is
    missing from ``RELATION_NUM_MAP``.
    """
    if relation not in RELATION_NAME:
        return None
    name = RELATION_NAME[relation]
    number = RELATION_NUM_MAP[name]
    print("{},{},{}".format(relation, name, number))
    return number
|
||
|
||
# Copy relation labels from the older annotation file onto matching rows of
# the Data table. A row matches when sentence, entity_a and entity_b are all
# equal and its relation is not 8.
with open(ANNOTATION_OLD_FILE, "r", encoding="utf8") as f:
    conn = sqlite3.connect(DB)
    try:
        c = conn.cursor()
        for line in f:
            sentence, entity_a, entity_b, relation = split_line(line)
            relation = relation.strip()
            # Bound parameters: sentences containing quotes would otherwise
            # break (or alter) a string-formatted query.
            c.execute(
                "select * from Data where relation!=8 and sentence=? and entity_a=? and entity_b=?",
                (sentence, entity_a, entity_b))
            # Fetch all matches before updating: running the UPDATE on this
            # same cursor would discard any rows not yet read.
            for row in c.fetchall():
                print(row)
                r = map_relation_to_num(relation)
                if r is not None:
                    row_id = row[0]  # assumes the first column of Data is its id
                    c.execute("update Data set relation=? where id=?", (r, row_id))
                else:
                    # No numeric code is known for this relation word; leave the row as is.
                    print(row)
        conn.commit()
    finally:
        # Release the database handle even if a line fails to parse.
        conn.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Combine the extracted sentence files (each containing the subject and other entities) into a single file | ||
import os | ||
|
||
# Directory holding the per-entity sentence files (note the trailing separator).
DIR = "D:\\Projects\\Baike\\entity_sentences\\"
# Combined output file.
FILE = "D:\\Projects\\Baike\\entity_sentences.txt"

# Concatenate every file in DIR into FILE, skipping whitespace-only lines.
files = os.listdir(DIR)
with open(FILE, "w", encoding="utf8") as f:
    for filename in files:
        with open(DIR + filename, "r", encoding="utf8") as g:
            for line in g:
                if not line.strip():
                    continue
                # The last line of a file may lack a newline; terminate it so
                # it is not fused with the first line of the next file.
                if not line.endswith("\n"):
                    line += "\n"
                f.write(line)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters