-
Notifications
You must be signed in to change notification settings - Fork 0
/
DGStructure.py
97 lines (80 loc) · 3.19 KB
/
DGStructure.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
'''
This module is a component of web crawler framework that help collecting
ZHIHU & Douban data.
DGStructure extends BaseStructure class to extract needed information in douban
group web page.
'''
from BaseStructure import BaseStructure
from bs4 import BeautifulSoup
from types import FunctionType
class DGStructure(BaseStructure):
'''
This class decribes how to draw out douban group data.
This douban group class has the following specific functionalities:
+ search for specific author
TODO:
(1) search for specific time
(2) search for posts containning specific keyword
(3) search for posts which have specific length of title
Attributes:
Same as super class.
'''
def __init__(self, data_row):
'''
Creates a data fields.
This initialization method overrides the method from supper class.
Arguments:
data_row : A bs4.element.Tag object that contains a row
of douban group posts. It is one element of the
list which is casted from a returned object of
findAll function.
Attributes:
title : title of the post in the table
title_url : url links to the post page
author : author of the post
author_url : url links to the author's homepage
follow_num : reply number of the post
time : last reply time of the post
'''
self.title = data_row.find('td', {'class': 'title'}).a.attrs['title']
self.title_url = data_row.find(
"td", {"class": "title"}).a.attrs["href"]
self.author = data_row.find("td", {"nowrap": "nowrap"}).a.text
self.author_url = data_row.find(
"td", {"nowrap": "nowrap"}).a.attrs["href"]
self.follow_num = data_row.find(lambda tag: len(
tag.attrs) == 2 and tag.name == "td").text
self.time = data_row.find(
"td", {"nowrap": "nowrap", "class": "time"}).text
if self.follow_num == "":
self.follow_num = "0"
def get_line_data(self)->list:
'''
This method return a list of all Attributes aquired from douban group.
'''
fields = [self.title, self.title_url, self.author, self.author_url,
self.follow_num, self.time]
for field in fields:
assert not isinstance(field, FunctionType)
assert field is not None
return fields
'''
Temporary test code:
[date: 2017.1.19][status: working]
'''
if __name__ == "__main__":
from bs4 import BeautifulSoup
import requests
base_url = 'https://www.douban.com/group/gz020/discussion?start='
res = requests.get(base_url + '0')
soup = BeautifulSoup(res.text, 'lxml')
table = soup.findAll('table', {'class': 'olt'})
rows = list(table)[0].findAll("tr", {"class": "", "id": ""})
row = rows[0]
print('row:\n', '-' * 10, '\n', row, end='\n\n')
print('-' * 10)
zf = DGStructure(row)
for ele in zf.get_line_data():
print('-->', ele)
print(zf.get_line_data())
print(len(zf.get_line_data()))