main.py
import csv
import logging  # to create log files
import os
from datetime import date

import tldextract

# include project helpers: createFiles() and uploadFiles() come from these modules
from websitecrawler.local_functions import *
from websitecrawler.ftp_functions import *

# create the logs/ directory if needed, then create and configure the logger
os.makedirs(os.getcwd() + "/logs", exist_ok=True)
logging.basicConfig(filename=os.getcwd() + "/logs/" + str(date.today()) + ".log",
                    format='%(asctime)s %(message)s', filemode='w')
# name of the directory websites are saved under, kept as a global variable
savedWebsitesDirName = "/savedwebsites/"
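
# Illustrative example of the expected webpages_to_crawl.csv layout (assumed
# from how main() indexes each row; the real file ships alongside this script):
#   https://www.example.com.au/products,//div[@class='product-list']
#   https://www.example.org/news,//article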
# main program
def main():
    localDir = os.getcwd() + savedWebsitesDirName  # working directory for saved sites
    with open('webpages_to_crawl.csv') as csv_file:
        csvReader = csv.reader(csv_file, delimiter=',')  # rows of [url, xpath]
        # loop through the CSV rows, which hold urls (row[0]) and xpaths (row[1])
        for row in csvReader:
            if not row:
                continue  # skip blank lines so row[0] cannot raise IndexError
            try:
                domain = tldextract.extract(row[0]).domain  # domain without suffix, e.g. "example" for example.com.au
                # save files to disk
                createFiles(localDir, domain, row)
                # upload files to FTP
                uploadFiles(localDir, domain)
            except Exception:
                # log against row[0]: domain is unbound if tldextract.extract() itself raised
                logging.exception("Exception occurred while scraping \"" + row[0] + "\" website")
# start main program
if __name__ == "__main__":
    main()
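
For context, here is a minimal sketch of what createFiles() and uploadFiles() might look like, inferred only from their call sites in main(). The real implementations live in websitecrawler/local_functions.py and websitecrawler/ftp_functions.py; the requests/lxml/ftplib approach, the FTP host, and the credentials below are assumptions for illustration, not the repo's actual code.

# Hypothetical sketch only, not the repo's implementation.
import os
from ftplib import FTP

import requests
from lxml import html

def createFiles(localDir, domain, row):
    # fetch row[0] (the URL) and keep only the content selected by row[1] (the xpath)
    page = requests.get(row[0], timeout=30)
    page.raise_for_status()
    tree = html.fromstring(page.content)
    fragments = tree.xpath(row[1])
    os.makedirs(localDir + domain, exist_ok=True)
    with open(localDir + domain + "/index.html", "w", encoding="utf-8") as f:
        for fragment in fragments:
            # xpath results can be elements or plain strings (text/attribute selections)
            f.write(fragment if isinstance(fragment, str)
                    else html.tostring(fragment, encoding="unicode"))

def uploadFiles(localDir, domain):
    # placeholder host and credentials; the real module would read these from config
    with FTP("ftp.example.com") as ftp:
        ftp.login("user", "password")
        try:
            ftp.mkd(domain)  # create the remote folder on first upload
        except Exception:
            pass  # folder already exists
        for name in os.listdir(localDir + domain):
            with open(localDir + domain + "/" + name, "rb") as f:
                ftp.storbinary("STOR " + domain + "/" + name, f)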