Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
geeksforgeeks-pdf
=================

This Python script aimed to download all Amazon Interview Experience from GeeksforGeeks website. You can modify this script to download as per your need.
This Python script was created to download all Amazon Interview Experience questions from the GeeksforGeeks website. You can modify this script to download other content as per your needs.
47 changes: 23 additions & 24 deletions geeksforgeeks-pdf.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,52 @@
import httplib2
import pdfcrowd
import urllib2
from bs4 import BeautifulSoup, SoupStrainer

# One-time setup: HTTP client, the crawl frontier seeded with the site root,
# and the list of URLs already visited.
http = httplib2.Http()
s = 'http://www.geeksforgeeks.org/'  # crawl root
i = 0  # counter used to number the generated PDF files (amazon<i>.pdf)
to_crawl = [s]
status, response = http.request(s)  # fetch the homepage once up front
crawled = [s]


# Seed the frontier with every on-site anchor found on the homepage,
# skipping forum pages and anything already crawled.
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        li = link['href']
        # Keep only absolute geeksforgeeks.org links that are not forum pages.
        if li.find('http://www.geeksforgeeks.org') == 0 and li not in crawled and li.find('forums') < 0:
            to_crawl.append(li)


# Report the initial frontier size before the crawl starts.
# print to_crawl
print(len(to_crawl))
count = 0  # number of pages visited by the crawl loop below

# Helper method to get page
def get_page(page):
    """Return the raw HTML body of *page* fetched via urllib2.

    Note: performs a blocking network request; any urllib2 error
    (URLError/HTTPError) propagates to the caller.
    """
    # urllib2 is already imported at module level; no local import needed.
    source = urllib2.urlopen(page)
    return source.read()


# Helper method to save the pdf
def save_as_pdf(s):
    """Fetch the page at URL *s* and save it as amazon<i>.pdf via pdfcrowd.

    Increments the module-level counter ``i`` so each PDF gets a unique
    name. pdfcrowd API failures are reported and swallowed (best effort).
    """
    global i
    try:
        client = pdfcrowd.Client("mkap1234", "fc5ada9fbd1c55f46822d6e9e985a9bb")
        # Fetch the HTML before opening the output file, so a fetch failure
        # does not leave an empty PDF behind.
        html = get_page(s)
        output_file = open('amazon' + str(i) + '.pdf', 'wb')
        i = i + 1
        try:
            client.convertHtml(html, output_file)
        finally:
            # Close the handle even if conversion fails (fixes an fd leak).
            output_file.close()
    except pdfcrowd.Error as why:
        print('Failed: %s' % why)


while len(to_crawl):
b=to_crawl.pop()
b = to_crawl.pop()
if b.find('http://www.geeksforgeeks.org')==0 and b not in crawled and b.find('forums')<0:
count=count+1
print count
count = count + 1
print(count)
crawled.append(b)
status, response = http.request(b)
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
Expand All @@ -64,14 +63,14 @@ def save_as_pdf(s):

# Keep only Amazon interview-experience article URLs out of everything
# crawled: drop fragment links (#), tag indexes, and forum pages.
for st in crawled:
    if st.find('amazon') >= 0 and st.find('#') < 0 and st.find('tag') < 0 and st.find('forum') < 0:
        print(st)
        amazon.append(st)



# Report how many Amazon pages survived the filter before converting them.
print("Processing Finished")
print(len(amazon))


# Saving all the pages fetched as pdf
# Iterates the filtered `amazon` URL list built above; each page is
# converted via save_as_pdf, which writes amazon<i>.pdf to the CWD.
for page in amazon:
    save_as_pdf(page)