-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchatbot_database.py
118 lines (106 loc) · 4.44 KB
/
chatbot_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import sqlite3
import json
from datetime import datetime
timeframe='2015-01'
sql_transaction=[]
connection=sqlite3.connect('{}.db'.format(timeframe))
c=connection.cursor()
def create_table():
c.execute("""CREATE TABLE IF NOT EXISTS parent_reply
(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE,parent TEXT,
comment TEXT,subreddit TEXT,unix INT,score INT)""")
def format_data(data):
data=data.replace("\n"," newlinechar ").replace("\r"," newlinechar ").replace('"',"'")
return data
def find_existing_score(pid):
try:
sql="SELECT score FROM parent_reply WHERE parent_id='{}'LIMIT 1".format(pid)
c.execute(sql)
result=c.fetchone()
if result !=None:
return result[0]
else:
return False
except Exception as e:
#print("find_parent",e)
return False
def acceptable(data):
if len(data.split(' '))>50 or len(data)<1:
return False
elif len(data)>1000:
return False
elif data=='[deleted]'or data=='[removed]':
return False
else:
return True
def find_parent(pid):
try:
sql="SELECT comment FROM parent_reply WHERE comment_id='{}'LIMIT 1".format(pid)
c.execute(sql)
result=c.fetchone()
if result !=None:
return result[0]
else:
return False
except Exception as e:
#print("find_parent",e)
return False
def transaction_bldr(sql):
global sql_transaction
sql_transaction.append(sql)
if len(sql_transaction)>1000:
c.execute('BEGIN TRANSACTION')
for s in sql_transaction:
try:
c.execute(s)
except:
pass
connection.commit()
sql_transaction=[]
def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
transaction_bldr(sql)
except Exception as e:
print('s-UPDATE insertion',str(e))
def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
transaction_bldr(sql)
except Exception as e:
print('s-PARENT insertion',str(e))
def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
try:
sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
transaction_bldr(sql)
except Exception as e:
print('s-NO_PARENT insertion',str(e))
if __name__=="__main__":
create_table()
row_counter=0
paired_rows=0
with open("D:/Project/chatbot/Reddit data set/RC_2015-01",buffering=1000) as f:
for row in f:
row_counter+=1
row=json.loads(row)
parent_id=row['parent_id']
body=format_data(row['body'])
created_utc=row['created_utc']
score=row['score']
subreddit=row['subreddit']
comment_id=row['name']
parent_data=find_parent(parent_id)
if score>=2:
if acceptable(body):
existing_comment_score=find_existing_score(parent_id)
if existing_comment_score:
if score>existing_comment_score:
sql_insert_replace_comment(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
else:
if parent_data:
sql_insert_has_parent(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
paired_rows+=1
else:
sql_insert_no_parent(comment_id, parent_id,body,subreddit,created_utc,score)
if row_counter % 100000==0:
print("Total rows read: {},paired rows: {},Time: {},".format(row_counter,paired_rows,str(datetime.now())))