From 057d37fc8d381c051486b38e5f076b71944f6f27 Mon Sep 17 00:00:00 2001 From: AaravSinghRathor <42621083+AaravSinghRathor@users.noreply.github.com> Date: Fri, 2 Oct 2020 00:18:36 +0530 Subject: [PATCH 1/2] Updated the readme file --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a7f4d4..101a982 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ geeksforgeeks-pdf ================= -This Python script aimed to download all Amazon Interview Experience from GeeksforGeeks website. You can modify this script to download as per your need. +This Python script is created with an objective to download all Amazon Interview Experience Questions from GeeksforGeeks website. You can modify this script to download as per your need. From 5cbe9a7105264c0e6a376ab9d157db507baa32ce Mon Sep 17 00:00:00 2001 From: AaravSinghRathor <42621083+AaravSinghRathor@users.noreply.github.com> Date: Fri, 2 Oct 2020 00:25:38 +0530 Subject: [PATCH 2/2] Added some Python 3 changes, docstrings and improved the code according to PEP 8 guidelines --- geeksforgeeks-pdf.py | 47 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/geeksforgeeks-pdf.py b/geeksforgeeks-pdf.py index 93922f3..8fb66ff 100644 --- a/geeksforgeeks-pdf.py +++ b/geeksforgeeks-pdf.py @@ -1,53 +1,52 @@ import httplib2 import pdfcrowd +import urllib2 from bs4 import BeautifulSoup, SoupStrainer http = httplib2.Http() -s= 'http://www.geeksforgeeks.org/' -i=0 -to_crawl=[] -to_crawl.append(s) +s = 'http://www.geeksforgeeks.org/' +i = 0 +to_crawl = [s] status, response = http.request(s) -crawled=[] -crawled.append(s) +crawled = [s] for link in BeautifulSoup(response, parse_only=SoupStrainer('a')): if link.has_attr('href'): - li=link['href'] - #print li + li = link['href'] + # print li if li.find('http://www.geeksforgeeks.org')==0 and li not in crawled and li.find('forums')<0: to_crawl.append(li) -#print to_crawl -print 
len(to_crawl) -count=0 +# print to_crawl +print(len(to_crawl)) +count = 0 +# Helper method to get page def get_page(page): - import urllib2 - source=urllib2.urlopen(page) + source = urllib2.urlopen(page) return source.read() - +# Helper method to save the pdf def save_as_pdf(s): global i try: client = pdfcrowd.Client("mkap1234", "fc5ada9fbd1c55f46822d6e9e985a9bb") output_file = open('amazon'+str(i)+'.pdf', 'wb') - i=i+1 - html=get_page(s) + i = i + 1 + html = get_page(s) client.convertHtml(html, output_file) output_file.close() - except pdfcrowd.Error,why: + except pdfcrowd.Error, why: print 'Failed:', why while len(to_crawl): - b=to_crawl.pop() + b = to_crawl.pop() if b.find('http://www.geeksforgeeks.org')==0 and b not in crawled and b.find('forums')<0: - count=count+1 - print count + count = count + 1 + print(count) crawled.append(b) status, response = http.request(b) for link in BeautifulSoup(response, parse_only=SoupStrainer('a')): @@ -64,14 +63,14 @@ def save_as_pdf(s): for st in crawled: if st.find('amazon')>=0 and st.find('#')<0 and st.find('tag')<0 and st.find('forum')<0: - print st + print(st) amazon.append(st) -print "Finished" -print len(amazon) +print("Processing Finished") +print(len(amazon)) - +# Saving all the pages fetched as pdf for page in amazon: save_as_pdf(page)