-
Notifications
You must be signed in to change notification settings - Fork 1
/
douban1.py
72 lines (57 loc) · 1.45 KB
/
douban1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import requests
from pyquery import PyQuery as pq
"""
这是一个普通爬虫
下载网页并解析打印出来
但是只下载了一个网页
"""
class Model():
    """
    Base class whose only job is to give subclasses a readable ``repr``.

    The representation starts with the concrete class name and then lists
    every instance attribute as ``name=(value)``, one per line.
    """

    def __repr__(self):
        # type(self) rather than Model, so subclasses print their own name.
        class_name = type(self).__name__
        # vars(self) is the instance __dict__; attributes come out in the
        # order they were assigned.
        fields = ['{}=({})'.format(key, value) for key, value in vars(self).items()]
        body = '\n '.join(fields)
        return '\n<{} \n {}>'.format(class_name, body)
class Movie(Model):
    """
    Stores the information of one movie.

    Every attribute starts out with an empty default: ``''`` for name,
    quote and cover_url, ``0`` for score and ranking.
    """

    def __init__(self):
        # Kept as an ordered table: Model.__repr__ prints attributes in the
        # order they were assigned, so the order here is part of the output.
        defaults = (
            ('name', ''),
            ('score', 0),
            ('quote', ''),
            ('cover_url', ''),
            ('ranking', 0),
        )
        for attr, value in defaults:
            setattr(self, attr, value)
def movie_from_div(div):
    """
    Build a Movie from a single div element.

    The div is wrapped in a PyQuery object and each field is read with a
    CSS selector:
        name      <- text of ``.title``
        score     <- text of ``.rating_num``
        quote     <- text of ``.inq``
        cover_url <- ``src`` attribute of ``img``
        ranking   <- text of the ``em`` found under ``.pic``

    Returns the populated Movie.
    """
    movie = Movie()
    node = pq(div)
    movie.name = node('.title').text()
    movie.score = node('.rating_num').text()
    movie.quote = node('.inq').text()
    movie.cover_url = node('img').attr('src')
    movie.ranking = node('.pic').find('em').text()
    return movie
def movies_from_url(url, timeout=10):
    """
    Download the page at ``url`` and parse every movie found in it.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it the request could
            block forever on an unresponsive server.

    Returns:
        A list with one Movie per element matching ``.item`` in the page.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the movie
    # list (which would silently yield an empty result).
    r.raise_for_status()
    e = pq(r.content)
    # Each .item element is handed to movie_from_div.
    return [movie_from_div(i) for i in e('.item')]
def main():
    """
    Fetch the Douban Top 250 page and print the movies parsed from it.

    Only this single URL is downloaded.
    """
    movies = movies_from_url('https://movie.douban.com/top250')
    print('top250 movies', movies)


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()