"""Crawl geeksforgeeks.org, collect Amazon interview-experience pages,
and save each one as a PDF via the pdfcrowd API.

Reconstructed from a git patch of geeksforgeeks-pdf.py. The patch mixed
Python 2 and Python 3 syntax (urllib2, ``except E, why``, ``print`` statement
alongside ``print(...)`` calls); the Python-2 remnants are fixed here so the
script is valid Python 3.

Requires third-party packages: httplib2, pdfcrowd, beautifulsoup4.
"""
import urllib.request  # Python 3 replacement for the Python 2 urllib2

import httplib2
import pdfcrowd
from bs4 import BeautifulSoup, SoupStrainer

http = httplib2.Http()
s = 'http://www.geeksforgeeks.org/'
i = 0  # running suffix for generated PDF file names (amazon0.pdf, amazon1.pdf, ...)
to_crawl = [s]
status, response = http.request(s)
crawled = [s]

# Seed the crawl frontier with on-site, non-forum links from the front page.
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        li = link['href']
        # startswith/in are the idiomatic equivalents of find()==0 / find()<0
        if li.startswith('http://www.geeksforgeeks.org') and li not in crawled and 'forums' not in li:
            to_crawl.append(li)

print(len(to_crawl))
count = 0


# Helper method to get page
def get_page(page):
    """Fetch the URL *page* over HTTP and return its raw body (bytes)."""
    # 'with' guarantees the connection is closed even if read() raises
    with urllib.request.urlopen(page) as source:
        return source.read()


# Helper method to save the pdf
def save_as_pdf(s):
    """Convert the page at URL *s* to 'amazon<i>.pdf' using pdfcrowd.

    Failures are reported and swallowed so one bad page does not stop
    the whole batch (preserves the original best-effort behavior).
    """
    global i
    try:
        # SECURITY: hard-coded API credentials — move to environment
        # variables or a config file before publishing this script.
        client = pdfcrowd.Client("mkap1234", "fc5ada9fbd1c55f46822d6e9e985a9bb")
        # 'with' closes the file even when convertHtml raises (the original
        # leaked the handle on a pdfcrowd.Error)
        with open('amazon' + str(i) + '.pdf', 'wb') as output_file:
            i = i + 1
            html = get_page(s)
            client.convertHtml(html, output_file)
    except pdfcrowd.Error as why:  # 'as' is Py3 syntax; patch had Py2 "except E, why"
        print('Failed:', why)


# Crawl: visit each frontier URL, record it, and harvest further links.
while len(to_crawl):
    b = to_crawl.pop()
    if b.startswith('http://www.geeksforgeeks.org') and b not in crawled and 'forums' not in b:
        count = count + 1
        print(count)
        crawled.append(b)
        status, response = http.request(b)
        for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
            # NOTE(review): this loop body was elided as unchanged context in
            # the patch; reconstructed to mirror the seeding loop — confirm
            # against the pre-patch file.
            if link.has_attr('href'):
                li = link['href']
                if li.startswith('http://www.geeksforgeeks.org') and li not in crawled and 'forums' not in li:
                    to_crawl.append(li)

# NOTE(review): 'amazon = []' was elided as unchanged context in the patch;
# it must exist before the filter loop below — confirm against the pre-patch file.
amazon = []
# Keep only Amazon interview-experience pages (no anchors, tags, or forums).
for st in crawled:
    if 'amazon' in st and '#' not in st and 'tag' not in st and 'forum' not in st:
        print(st)
        amazon.append(st)

print("Processing Finished")
print(len(amazon))

# Saving all the pages fetched as pdf
for page in amazon:
    save_as_pdf(page)