-
Notifications
You must be signed in to change notification settings - Fork 6
/
ex_12_online_retrievehtml.py
43 lines (36 loc) · 1.03 KB
/
ex_12_online_retrievehtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# To run this, you can install BeautifulSoup
# https://pypi.python.org/pypi/beautifulsoup4
# Or download the file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors.
# NOTE(security): this disables hostname checking and certificate
# verification for every request made with ``ctx``, so the identity of the
# server is not checked. Kept because it is the script's existing behaviour;
# do not reuse this context for anything sensitive.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE


def link_at_position(tags, position, base_url):
    """Return the absolute URL of the anchor at 1-based ``position``.

    Args:
        tags: Sequence of anchor tags in document order. Each item must
            support ``.get('href', default)``.
        position: 1-based index of the anchor to follow; every anchor
            counts, including ones without an ``href``.
        base_url: URL of the page the anchors came from, used to resolve
            relative ``href`` values.

    Returns:
        The resolved URL, or ``None`` when ``position`` is outside
        ``1..len(tags)`` or the selected anchor has no ``href``.
    """
    if not 1 <= position <= len(tags):
        return None
    href = tags[position - 1].get('href', None)
    if href is None:
        return None
    # Relative links would otherwise make the next urlopen() call fail with
    # "unknown url type"; absolute links pass through unchanged.
    return urllib.parse.urljoin(base_url, href)


def main():
    """Follow the link at a fixed position across a chain of pages.

    Prompts for a start URL, a hop count and a 1-based link position. For
    each hop the current page is downloaded, the anchor at that position is
    selected, and its target is printed as ``Retrieving: <url>`` and becomes
    the page for the next hop.

    Raises:
        ValueError: If the count or position entered is not an integer.
        SystemExit: If a page has no usable link at the requested position.
    """
    url = input('Enter - ')
    count = int(input('Enter count: '))
    position = int(input('Enter position: '))

    for _ in range(count):
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(url, context=ctx) as response:
            html = response.read()
            # Use the post-redirect address as the base for relative links.
            page_url = response.geturl()
        soup = BeautifulSoup(html, 'html.parser')
        # Retrieve all of the anchor tags
        tags = soup('a')
        next_url = link_at_position(tags, position, page_url)
        if next_url is None:
            # Stop explicitly instead of silently following the wrong link
            # or re-fetching the same page.
            raise SystemExit(
                'No usable link at position {} on {}'.format(position, page_url)
            )
        url = next_url
        print('Retrieving:', url)


if __name__ == '__main__':
    main()