-
Notifications
You must be signed in to change notification settings - Fork 0
/
tianya.py
132 lines (120 loc) · 3.69 KB
/
tianya.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
#coding: utf-8
import requests, re
from db import DB
from bs4 import BeautifulSoup as bs
# Keys of every board entry, in the order the values appear in each row below.
_SITE_FIELDS = ("title", "baseurl", "pattern")

# Boards to crawl. Each entry is a dict with:
#   title   - board name, used in log output
#   baseurl - first listing page of the board
#   pattern - regex matching the site-relative path of a thread's first page,
#             e.g. http://bbs.tianya.cn/post-no05-507160-1.shtml for "no05"
_sites = [
    dict(zip(_SITE_FIELDS, row))
    for row in (
        ("天涯杂谈",
         "http://bbs.tianya.cn/list-free-1.shtml",
         r"/post-free-\d+-1\.shtml"),
        ("新闻众评",
         "http://bbs.tianya.cn/list-news-1.shtml",
         r"/post-news-\d+-1\.shtml"),
        ("我的大学",
         "http://bbs.tianya.cn/list-university-1.shtml",
         r"/post-university-\d+-1\.shtml"),
        ("关天茶馆",
         "http://bbs.tianya.cn/list-no01-1.shtml",
         r"/post-no01-\d+-1\.shtml"),
        ("闲闲书话",
         "http://bbs.tianya.cn/list-books-1.shtml",
         r"/post-books-\d+-1\.shtml"),
        ("国际观察",
         "http://bbs.tianya.cn/list-worldlook-1.shtml",
         r"/post-worldlook-\d+-1\.shtml"),
        ("心灵热线",
         "http://bbs.tianya.cn/list-spirit-1.shtml",
         r"/post-spirit-\d+-1\.shtml"),
        ("学术中国",
         "http://bbs.tianya.cn/list-666-1.shtml",
         r"/post-666-\d+-1\.shtml"),
        ("人物研究",
         "http://bbs.tianya.cn/list-113-1.shtml",
         r"/post-113-\d+-1\.shtml"),
        ("语文学习",
         "http://bbs.tianya.cn/list-1170-1.shtml",
         r"/post-1170-\d+-1\.shtml"),
        ("文学批评",
         "http://bbs.tianya.cn/list-187-1.shtml",
         r"/post-187-\d+-1\.shtml"),
        ("煮酒论史",
         "http://bbs.tianya.cn/list-no05-1.shtml",
         r"/post-no05-\d+-1\.shtml"),
        ("舞文弄墨",
         "http://bbs.tianya.cn/list-culture-1.shtml",
         r"/post-culture-\d+-1\.shtml"),
        ("影视评论",
         "http://bbs.tianya.cn/list-filmtv-1.shtml",
         r"/post-filmtv-\d+-1\.shtml"),
        ("法制天地",
         "http://bbs.tianya.cn/list-law-1.shtml",
         r"/post-law-\d+-1\.shtml"),
    )
]
def get(url, timeout=30):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises a timeout error. Pass ``None`` to wait indefinitely,
            which is what happened before this parameter existed.

    Returns:
        The ``requests.Response``. The body is downloaded before this
        function returns (the request is not streamed), so the response
        remains usable after the session has been closed.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    # A fresh session is created for every call; the ``with`` block closes it
    # so its pooled connection is released instead of being leaked.
    with requests.session() as sess:
        # Send a desktop-browser User-Agent to simulate a browser.
        sess.headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
        return sess.get(url, timeout=timeout)
def get_urls(logger, site):
    """Return the thread URLs linked from a board's listing page.

    Args:
        logger: Logger-like object; only ``info`` is called on it.
        site: Mapping with ``"title"`` (written to the log), ``"baseurl"``
            (listing page to download) and ``"pattern"`` (regex matching the
            site-relative path of a thread).

    Returns:
        A list of absolute URLs, one per non-overlapping pattern match, in
        the order they occur in the page. Duplicates are not removed.
    """
    logger.info("grab {}".format(site["title"]))
    res = get(site["baseurl"])
    # Encoding detection can yield None (e.g. for an empty body) or a wrong
    # guess; fall back to UTF-8 and replace undecodable bytes rather than
    # raising in the middle of a crawl.
    ct = res.content.decode(res.apparent_encoding or "utf-8", errors="replace")
    # finditer walks the page once. The earlier search-and-slice loop copied
    # the remainder of the page after every match and could never terminate
    # on a zero-width match.
    return [
        "http://bbs.tianya.cn" + m.group(0)
        for m in re.finditer(site["pattern"], ct)
    ]
def get_content(logger, url):
    """Download a thread page and return the text of its posts.

    Args:
        logger: Not used by this function; kept so the signature matches
            the other grab helpers.
        url: Absolute URL of the thread page.

    Returns:
        The text of every ``<div>`` with class ``bbs-content`` on the page,
        joined with newlines. An empty string when no such element exists.
    """
    res = get(url)
    # Encoding detection can yield None (e.g. for an empty body) or a wrong
    # guess; fall back to UTF-8 and replace undecodable bytes rather than
    # raising in the middle of a crawl.
    ct = res.content.decode(res.apparent_encoding or "utf-8", errors="replace")
    soup = bs(ct, features="html.parser")
    posts = soup.find_all("div", attrs={"class": "bbs-content"})
    return "\n".join(post.text for post in posts)
def grab(logger):
    """Crawl every board in ``_sites`` and store threads not yet saved.

    For each board the listing page is scanned for thread URLs. A URL for
    which ``db.has`` is true is skipped; any other URL is downloaded, saved
    with ``db.save`` and reported through ``logger.info``.

    Args:
        logger: Logger-like object passed on to the helpers and used for
            the per-thread progress message.
    """
    store = DB()
    for board in _sites:
        for link in get_urls(logger, board):
            if not store.has(link):
                body = get_content(logger, link)
                store.save(link, body)
                size = len(body)
                logger.info("\tgrab {} got {} bytes".format(link, size))