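#!/usr/bin/env python
"""Archive posts from a Facebook group's JSON feed into a MySQL database,
and mirror any links found in them to a Kippt list.

The archiver reads an INI-style config file with the sections and options
used below; the values shown here are illustrative placeholders only:

    [db]
    user = <mysql user>
    password = <mysql password>
    name = <database name>

    [group]
    url = <JSON feed URL of the Facebook group>

    [kippt]
    url = <Kippt API endpoint for creating clips>
    username = <Kippt username>
    apitoken = <Kippt API token>
    listuri = <URI of the Kippt list to post to>

    [logging]
    infolog = <path of the info log file>
    errorlog = <path of the error log file>

The configured database is expected to provide two stored functions,
InsertPost and InsertLink; InsertPost returns the number of rows it
inserted (1 for a new post, anything else if the post is already archived).
"""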
import urllib2
import urllib
import re
import ConfigParser
import json
import MySQLdb
import logging
import traceback
from datetime import datetime
import iso8601


class FbGroupArchiver:
    """Syncs a Facebook group feed into MySQL and mirrors links to Kippt."""

    def __init__(self, config_filename):
        self.config = ConfigParser.ConfigParser()
        # ConfigParser.read() silently skips unreadable files rather than
        # raising, so check its return value instead of using try/except.
        if not self.config.read(config_filename):
            raise IOError('Config file ' + str(config_filename) + ' cannot be read')
        config = self.config
        self.db = MySQLdb.connect(user=config.get('db', 'user'),
                                  passwd=config.get('db', 'password'),
                                  db=config.get('db', 'name'))
        self.cursor = self.db.cursor()
        logging.basicConfig(filename=config.get('logging', 'infolog'), level=logging.INFO)

    def process_data(self, group_url=None):
        """Fetch the group feed, archive each post, and push its links to Kippt."""
        controlchar_regex = re.compile(r'[\n\r\t]')
        cursor = self.cursor
        config = self.config
        if group_url is None:
            group_url = config.get('group', 'url')
        data = urllib2.urlopen(group_url).read()
        jsondata = json.loads(data)
        for i, post in enumerate(jsondata['data']):
            try:
                print post.get('id')
                # Feed ids look like '<group-id>_<post-id>'; keep the post part.
                post_id = post.get('id').rsplit('_', 1)[1]
                message = post.get('message')
                created_on_d = iso8601.parse_date(post.get('created_time'))
                created_on = created_on_d.strftime('%Y-%m-%d %H:%M:%S')
                updated_on_d = iso8601.parse_date(post.get('updated_time'))
                updated_on = updated_on_d.strftime('%Y-%m-%d %H:%M:%S')
                author_name = post.get('from').get('name').encode('ascii', 'ignore').replace('"', '')
                author_id = post.get('from').get('id')
                if message is not None:
                    message = message.encode('ascii', 'ignore').replace('"', '')
                comments_count = None
                likes_count = None
                title = ''
                if post.get('comments') is not None:
                    comments_count = post.get('comments').get('count')
                if post.get('likes') is not None:
                    likes_count = post.get('likes').get('count')
                # Do the DB part here
                cursor.execute("""SELECT InsertPost(%s, %s, %s, %s, %s, %s, %s, %s)""",
                               (author_name, author_id, message, likes_count, comments_count,
                                created_on, updated_on, post_id))
                code = cursor.fetchone()
                # code[0] is the number of affected rows: 1 means the insert
                # succeeded; anything else means the post is already in the DB.
                if code[0] == 1 and post.get('link') is not None:
                    link = post.get('link').encode('ascii', 'ignore')
                    if post.get('name') is not None:
                        title = controlchar_regex.sub(' ', post.get('name').encode('ascii', 'ignore').replace('"', ''))
                    if post.get('description') is not None:
                        description = post.get('description').encode('ascii', 'ignore').replace('"', '')
                        description = controlchar_regex.sub(' ', description) + ' - ' + author_name
                    else:
                        description = controlchar_regex.sub(' ', message) + ' - ' + author_name
                    # Build the JSON payload; json.dumps handles quoting and escaping.
                    values = json.dumps({'url': link,
                                         'list': config.get('kippt', 'listuri'),
                                         'title': title,
                                         'notes': description})
                    r = self.post_to_kippt(values)
                    self.post_link(r, post_id)
                elif code[0] == 1:
                    # No explicit link attachment: scrape URLs out of the message
                    # text. (This regex is deliberately simple; refine it if needed.)
                    try:
                        description = post.get('message').encode('ascii', 'ignore').replace('"', '')
                        description = controlchar_regex.sub(' ', description)
                        urls = re.findall(r"(?P<url>https?://[^\s]+)", description)
                        for url in urls:
                            description = description.replace(url, '')
                        description = description + ' - ' + author_name
                        for url in urls:
                            # Build the JSON payload for each extracted URL.
                            values = json.dumps({'url': url,
                                                 'list': config.get('kippt', 'listuri'),
                                                 'notes': description})
                            r = self.post_to_kippt(values)
                            self.post_link(r, post_id)
                    except Exception, err:
                        logging.error(str(datetime.now()) + ' ' + str(err))
                        with open(config.get('logging', 'errorlog'), 'a') as errlog:
                            traceback.print_exc(file=errlog)
            except Exception, err:
                print 'Error while processing post; see error log'
                logging.error(str(datetime.now()) + ' ' + str(err))
                with open(config.get('logging', 'errorlog'), 'a') as errlog:
                    traceback.print_exc(file=errlog)
        # MySQLdb runs with autocommit off by default, so persist the inserts.
        self.db.commit()
        print "Archiving Complete!"
        logging.info(str(datetime.now()) + ' Archiving Complete for page: ' + group_url)

    def post_to_kippt(self, values):
        """POST a JSON payload to the Kippt API and return the response body."""
        print values
        config = self.config
        req = urllib2.Request(config.get('kippt', 'url'), values)
        req.add_header('X-Kippt-Username', config.get('kippt', 'username'))
        req.add_header('X-Kippt-API-Token', config.get('kippt', 'apitoken'))
        r = urllib2.urlopen(req)
        return r.read()

    def post_link(self, response, post_id):
        """Record the Kippt API response for a post via the InsertLink stored function."""
        cursor = self.cursor
        resp_data = json.loads(response)
        resp_url = resp_data.get('url').encode('ascii', 'ignore').replace('"', '')
        resp_title = resp_data.get('title').encode('ascii', 'ignore').replace('"', '')
        resp_notes = resp_data.get('notes').encode('ascii', 'ignore').replace('"', '')
        cursor.execute("""SELECT InsertLink(%s, %s, %s, %s)""",
                       (resp_url, resp_title, resp_notes, post_id))
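

# Minimal usage sketch (an assumption, not part of the original script):
# 'fb_group_sync.cfg' is a hypothetical filename for a config file laid
# out as described in the module docstring.
if __name__ == '__main__':
    archiver = FbGroupArchiver('fb_group_sync.cfg')
    archiver.process_data()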