-
Notifications
You must be signed in to change notification settings - Fork 0
/
requests+BS4爬取猫眼电影排行榜.py
121 lines (80 loc) · 2.59 KB
/
requests+BS4爬取猫眼电影排行榜.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding: utf-8
# In[492]:
#description:a spider for grabbing some info of rank 100 movies on MAOYAN .
import requests
import pandas as pd
from bs4 import BeautifulSoup
from multiprocessing import Process
__author__ = 'Jonathan_Rowe'
__fstmdtime__ ='2019-03-19'
__lstmdtime__ ='2019-03-19'
# ### get web content
# In[493]:
def getUrl(url):
try:
response = requests.get(url)
print('response=%s'%response) #just for testing
if response.status_code ==200:
return response.text
else:return None
except:return None
# ### show all dataframe
# In[494]:
def showAll():
# expand dataframe lenth
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
return
# ### filter content by BS4
# In[495]:
def filter(res):
try:
soup = BeautifulSoup(res,'html.parser') #only soup object has find_all function
dd = soup.find_all(name='dd') #get data block
# print(dl)
for item in dd:
print('*************************************\n')
# print('c=%s'%c)
index = item.i.string
title = item.find(class_='name').string
actors = item.find(class_='star').string.strip()[3:]
img = item.find(class_='board-img')['data-src']
time = item.find(class_='releasetime').string[5:]
score =item.find(class_='integer').string+item.find(class_='fraction').string
# print('%s\n'%index,'%s\n'%title,'%s\n'%actors,'%s\n'%img,'%s\n'%time,'%s\n'%score) #just for testing
data = [[title,score,time,actors,img]]
exits=0
if not exits:
df = pd.DataFrame(data,index=[index],columns=['title','score','time','actors','img_link'])
exits=1
else:
df.append(data)
yield df
except Exception as e:
print(type(e),e)
return None
return None
# ### save data as file
# In[496]:
def save(df):
df.to_csv('MaoYanMovieRank.csv',mode='a')
# ### multiprocess main part
# In[497]:
def main():
for i in range(0, 100, 10):
url = 'https://maoyan.com/board/4?offset=%s'%i
res = getUrl(url)
if type(res) != None:
dfg = filter(res)
for df in dfg:
print(df) #for testing
save(df)
# ### main
# In[498]:
if __name__ == '__main__':
global exits,df
showAll()
p = Process(target=main)
p.start()
main()