main.py
import os

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def extract_leaf_links(root_url, file_extension, exclude_title):
    """Recursively crawl from root_url, collecting links that end in file_extension."""
    visited_urls = set()
    file_meta = []

    def crawl(url):
        # Skip pages we have already visited and direct file links.
        if url in visited_urls or url.endswith(file_extension):
            return
        visited_urls.add(url)
        try:
            # Timeout added so a stalled server does not hang the crawl.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            for a_tag in soup.find_all('a', href=True):
                title = a_tag.text.strip()
                link = a_tag['href']
                # Skip directory-listing chrome such as column headers.
                if title in exclude_title:
                    continue
                # Resolve relative links against the current page URL.
                if not link.startswith('http'):
                    link = urljoin(url, link)
                if link not in visited_urls:
                    if link.endswith(file_extension):
                        # Mark the file as seen so a second page linking the
                        # same file does not produce a duplicate entry.
                        visited_urls.add(link)
                        file_meta.append({
                            "filename": link.split("/")[-1],
                            "url": link,
                        })
                    else:
                        print(f"crawling link - {link}")
                        crawl(link)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page: {e}")

    crawl(root_url)
    return file_meta
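

# A hedged alternative to the recursive crawl above, not part of the original
# script: deeply nested directory trees can exhaust Python's recursion limit,
# so this sketch walks the same pages iteratively with an explicit queue. It
# also restricts itself to the starting host, an addition over the original.
# The name extract_leaf_links_iterative is hypothetical.
def extract_leaf_links_iterative(root_url, file_extension, exclude_title):
    from collections import deque
    from urllib.parse import urlparse

    root_host = urlparse(root_url).netloc
    visited_urls = {root_url}
    file_meta = []
    queue = deque([root_url])
    while queue:
        url = queue.popleft()
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page: {e}")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        for a_tag in soup.find_all('a', href=True):
            if a_tag.text.strip() in exclude_title:
                continue
            link = urljoin(url, a_tag['href'])
            # Skip pages already seen and links that leave the starting host.
            if link in visited_urls or urlparse(link).netloc != root_host:
                continue
            visited_urls.add(link)
            if link.endswith(file_extension):
                file_meta.append({"filename": link.split("/")[-1], "url": link})
            else:
                queue.append(link)
    return file_meta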


if __name__ == '__main__':
    DOWNLOAD_PATH = "./paper"
    file_extension = ".pdf"
    url_index = "https://sapgrp.com/FreeTestPapers/"
    # Link titles produced by the server's directory listing that should be skipped.
    title_lst = ["Name", "Last modified", "Size", "Description", "Parent Directory"]

    file_meta = extract_leaf_links(url_index, file_extension, title_lst)

    if os.path.exists(DOWNLOAD_PATH):
        print("Folder already exists")
    else:
        os.mkdir(DOWNLOAD_PATH)

    # Fetch each discovered file and write its content to a local PDF.
    for file in file_meta:
        response = requests.get(file["url"], timeout=60)
        path = os.path.join(DOWNLOAD_PATH, file["filename"])
        with open(path, 'wb') as pdf:
            pdf.write(response.content)
        print(f'{file["filename"]} downloaded')
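

# A minimal sketch of a streaming download, assuming the same requests
# dependency; it is not wired into the script above. Streaming with
# iter_content avoids loading a whole PDF into memory at once, which helps
# with large files. The helper name download_file is hypothetical.
def download_file(url, dest_path, chunk_size=8192):
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest_path, 'wb') as fh:
            # Write the response body chunk by chunk instead of all at once.
            for chunk in response.iter_content(chunk_size=chunk_size):
                fh.write(chunk)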