First (and possibly last) release

raldnor · May 20, 2021 · 276808a · 276808a
commit 276808a
Show file tree

Hide file tree

Showing 2 changed files with 266 additions and 0 deletions.
diff --git a/README.MD b/README.MD
@@ -0,0 +1,121 @@
+# URL Scanner
+
+This is a simple utility that reads a list of URLs from a text file (one URL per line), queries each URL and returns the web server status code.
+Optionally, screenshots of the pages being queried can be made during this process (this requires Firefox to be installed).
+
+### Usage
+```
+usage: urlscanner.py [-h] [--inputfile INPUTFILE] [--outputdir OUTPUTDIR]
+                     [--outputfile OUTPUTFILE] [--verbose] [--append APPEND]
+                     [--firefoxpath FIREFOXPATH] [--screenshots]
+```
+
+#### Options:
+```-h``` or ```--help``` - Show command line options and a brief help description per option.  
+```-i``` or ```--inputfile``` - Input text file to read urls from. Basically a text file with one url per line.  
+```-d``` or ```--outputdir``` - Directory to store screenshots in. This directory has to exist before it can be specified.  
+```-o``` or ```--outputfile``` - Save the results to this file in CSV format (status code, reason, url).  
+```-a``` or ```--append``` - Append a string to the urls to be checked (e.g. 'index.html' or '?action=something').  
+```-f``` or ```--firefoxpath``` - Location where the firefox executable is found (required when creating screenshots).  
+```-s``` or ```--screenshots``` - Create .png screenshots of urls.  
+```-v``` or ```--verbose``` - More output while the utility is running.
+
+### Examples:
+Let's assume we have a text file called _urls.txt_ containing the following lines:
+```
+www.google.com
+https://www.reddit.com
+http://github.com
+```
+When no http:// or https:// prefix is specified, the script assumes https:// and prepends it. If a URL has to be queried over plain http://, make sure that prefix is specified explicitly in the text file.
+
+#### 1. Simple status code query with only console output
+Command:  
+``` 
+$ python3 ./urlscanner.py -i urls.txt 
+ ____ _____________.____
+|    |   \______   \    |
+|    |   /|       _/    |
+|    |  / |    |   \    |___ 
+|______/  |____|_  /_______ \
+  ______ ____ ___\/    ____\/ ____   ___________
+ /  ___// ___\\__  \  /    \ /    \_/ __ \_  __ \
+ \___ \\  \___ / __ \|   |  \   |  \  ___/|  | \/
+/____  >\___  >____  /___|  /___|  /\___  >__|
+     \/     \/     \/     \/     \/     \/
+URL scanner
+Peter Berends - 2021
+
+Reading input file (urls.txt)...
+[200][OK] - https://www.google.com
+[200][OK] - https://www.reddit.com
+[301][Moved Permanently] - http://github.com
+Done. Processed 3 URLs (0 failed).
+```
+
+#### 2. Output results to CSV file:
+```
+$ python3 ./urlscanner.py -i urls.txt -o out.txt
+ ____ _____________.____
+|    |   \______   \    |
+|    |   /|       _/    |
+|    |  / |    |   \    |___ 
+|______/  |____|_  /_______ \
+  ______ ____ ___\/    ____\/ ____   ___________
+ /  ___// ___\\__  \  /    \ /    \_/ __ \_  __ \
+ \___ \\  \___ / __ \|   |  \   |  \  ___/|  | \/
+/____  >\___  >____  /___|  /___|  /\___  >__|
+     \/     \/     \/     \/     \/     \/
+URL scanner
+Peter Berends - 2021
+
+Reading input file (urls.txt)...
+[200][OK] - https://www.google.com
+[200][OK] - https://www.reddit.com
+[301][Moved Permanently] - http://github.com
+Saving results to out.txt
+Done. Processed 3 URLs (0 failed).
+
+$ cat out.txt
+200,OK,https://www.google.com
+200,OK,https://www.reddit.com
+301,Moved Permanently,http://github.com
+```
+
+#### 3. Create screenshots in verbose mode
+```
+$ mkdir shots
+$ python3 ./urlscanner.py -i urls.txt -s -f /opt/firefox/firefox -d shots -v
+ ____ _____________.____
+|    |   \______   \    |
+|    |   /|       _/    |
+|    |  / |    |   \    |___ 
+|______/  |____|_  /_______ \
+  ______ ____ ___\/    ____\/ ____   ___________
+ /  ___// ___\\__  \  /    \ /    \_/ __ \_  __ \
+ \___ \\  \___ / __ \|   |  \   |  \  ___/|  | \/
+/____  >\___  >____  /___|  /___|  /\___  >__|
+     \/     \/     \/     \/     \/     \/
+URL scanner
+Peter Berends - 2021
+
+Verbose mode on
+Location to firefox given: /opt/firefox/firefox
+Taking screenshots (will be stored in 'shots')
+Activating Firefox as screenshot driver
+Using Firefox location: /opt/firefox/firefox
+Reading input file (urls.txt)...
+Prepend https:// to line (www.google.com).
+Processing: https://www.google.com
+Taking screenshot of https://www.google.com
+[200][OK] - https://www.google.com
+Processing: https://www.reddit.com
+Taking screenshot of https://www.reddit.com
+[200][OK] - https://www.reddit.com
+Processing: http://github.com
+Taking screenshot of http://github.com
+[301][Moved Permanently] - http://github.com
+Done. Processed 3 URLs (0 failed).
+
+$ ls -a shots
+.  ..  github.com.png  www.google.com.png  www.reddit.com.png
+```
diff --git a/urlscanner.py b/urlscanner.py
@@ -0,0 +1,145 @@
+import argparse
+import requests
+import re
+from http.client import responses
+from os import path
+
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from time import sleep
+
+# Running totals and settings shared by the script body below.
+urlcount = 0      # number of input lines processed as URLs
+verbose = 0       # set to 1 by --verbose to enable extra console output
+failedurls = 0    # number of URLs for which no status code was obtained
+outstring = ""    # accumulated "status,reason,url" lines written by --outputfile
+# Default Firefox binary (overridable with --firefoxpath). This must be a raw
+# string: in a normal literal "\f" is a form feed and would corrupt the path.
+firefoxpath = r"c:\Program Files\Mozilla Firefox\firefox.exe"
+interval = 3      # seconds to wait after loading a page before the screenshot
+
+def printbanner():
+    """Print the ASCII-art program banner followed by the name/author lines."""
+    # Adjacent literals inside the parentheses are concatenated into one string;
+    # print() then adds one more trailing newline after the author line.
+    art = (
+        " ____ _____________.____\n"
+        "|    |   \\______   \\    |\n"
+        "|    |   /|       _/    |\n"
+        "|    |  / |    |   \\    |___ \n"
+        "|______/  |____|_  /_______ \\\n"
+        "  ______ ____ ___\\/    ____\\/ ____   ___________\n"
+        " /  ___// ___\\\\__  \\  /    \\ /    \\_/ __ \\_  __ \\\n"
+        " \\___ \\\\  \\___ / __ \\|   |  \\   |  \\  ___/|  | \\/\n"
+        "/____  >\\___  >____  /___|  /___|  /\\___  >__|\n"
+        "     \\/     \\/     \\/     \\/     \\/     \\/\n"
+        "URL scanner\r\nPeter Berends - 2021\n"
+    )
+    print(art)
+
+def statuscode(url, timeout=10):
+    """Return the HTTP status code of a HEAD request to *url*, or None on failure.
+
+    url     -- full URL including scheme.
+    timeout -- seconds to wait for the server before giving up (default 10).
+
+    Any error while performing the request is treated as a failed URL: it is
+    reported in verbose mode and None is returned instead of raising.
+    """
+    try:
+        # Without a timeout requests waits indefinitely on an unresponsive
+        # host, which would stall the whole scan on a single URL.
+        response = requests.head(url, timeout=timeout)
+    except Exception as err:
+        # Deliberately broad: the scan is best effort and must continue with
+        # the next URL whatever went wrong with this one.
+        if verbose:
+            print("Request failed for %s (%s)." % (url, err))
+        return None
+    return response.status_code
+
+def formaturl(url):
+    """Return *url* with a scheme, prepending https:// when none is present.
+
+    URLs that already start with http:// or https:// (in any letter case) are
+    returned unchanged. In verbose mode a message is printed whenever the
+    prefix is added.
+    """
+    # Scheme names are case-insensitive, so "HTTP://host" must not be given a
+    # second "https://" prefix.
+    if re.match(r'https?://', url, re.IGNORECASE):
+        return url
+    if verbose:
+        print("Prepend https:// to line (%s)." % url)
+    return 'https://{}'.format(url)
+
+def checkdir(dir):
+    """Return True when *dir* is an existing directory, False otherwise."""
+    # isdir rather than exists: a regular file with that name is not a usable
+    # output directory for the screenshots.
+    return path.isdir(dir)
+
+
+printbanner()
+
+# Command line interface: every option has a long and a short form.
+parser = argparse.ArgumentParser()
+parser.add_argument("--inputfile", "-i", help="specify input file with URLs per line")
+parser.add_argument("--outputdir", "-d", help="specify output directory")
+parser.add_argument("--outputfile", "-o", help="save results to file (CSV format)")
+parser.add_argument("--verbose", "-v", help="verbose mode", action="store_true")
+parser.add_argument("--append", "-a", help="append a string to the urls to check")
+parser.add_argument("--firefoxpath", "-f", help="location where the executable of Firefox is found")
+parser.add_argument("--screenshots", "-s", help="create screenshots of URLs", action="store_true")
+
+args = parser.parse_args()
+
+# Validate the option combination before doing any work. SystemExit is raised
+# directly because exit() is a helper injected by the site module and is not
+# guaranteed to be available in every interpreter start-up mode.
+if not args.inputfile:
+    print("No input file specified, got nothing to work with. Bye.")
+    raise SystemExit(1)
+
+if args.screenshots and not args.outputdir:
+    print("No output directory specified for screenshot storage. Bye.")
+    raise SystemExit(1)
+
+if args.outputdir and not checkdir(args.outputdir):
+    print("Invalid output directory specified. Bye.")
+    raise SystemExit(1)
+
+if args.verbose:
+    print("Verbose mode on")
+    verbose = 1
+
+if args.firefoxpath:
+    firefoxpath = args.firefoxpath
+    if verbose:
+        print("Location to firefox given: %s" % args.firefoxpath)
+
+if args.screenshots:
+    print("Taking screenshots (will be stored in '%s')" % args.outputdir)
+    if verbose:
+        print("Activating Firefox as screenshot driver")
+        print("Using Firefox location: %s" % firefoxpath)
+    try:
+        # Use a distinct name: rebinding "Options" would shadow the imported
+        # selenium class with one of its own instances.
+        firefox_options = Options()
+        firefox_options.headless = True
+        browser = webdriver.Firefox(options=firefox_options, firefox_binary=firefoxpath)
+    except Exception as err:
+        print("Could not activate screenshot driver (%s). Try setting the path to Firefox using the --firefoxpath flag." % err)
+        raise SystemExit(1)
+
+print("Reading input file (%s)..." % args.inputfile)
+
+try:
+    # The context manager closes the file even when reading fails part-way.
+    with open(args.inputfile, 'r') as inputfile:
+        lines = inputfile.readlines()
+except (OSError, UnicodeError):
+    # Only I/O and decoding problems mean "invalid input file"; a bare except
+    # would also swallow KeyboardInterrupt and SystemExit.
+    print("ERROR: Input file specified is invalid. Bye.")
+    raise SystemExit(1)
+
+# Scan every URL: query its status code, optionally take a screenshot, and
+# collect a CSV line for each URL that returned a status code.
+for line in lines:
+    # Remove every whitespace character, including the trailing newline.
+    line = re.sub(r"\s+", "", line)
+    if not line:
+        # Blank lines would otherwise be scanned as the bogus URL "https://".
+        continue
+    line = formaturl(line)
+    if args.append:
+        line = line + args.append
+    urlcount += 1
+    if verbose:
+        print("Processing: %s" % line)
+    result = statuscode(line)
+    if args.screenshots:
+        # The browser object only exists when --screenshots was given.
+        if verbose:
+            print("Taking screenshot of %s" % line)
+        # Name the file after the URL without its scheme (a fixed [7:] slice
+        # only fits "http://"); characters that are unsafe in a file name,
+        # such as "/" or "?", are replaced by "_".
+        shotname = re.sub(r"^https?://", "", line, flags=re.IGNORECASE)
+        shotname = re.sub(r"[^\w.-]", "_", shotname)
+        try:
+            browser.get(line)
+            # Give the page time to render before capturing it.
+            sleep(interval)
+            browser.get_screenshot_as_file(path.join(args.outputdir, shotname + '.png'))
+        except Exception as err:
+            # Best effort: a failed screenshot must not abort the scan.
+            if verbose:
+                print("Could not create screenshot (%s)" % err)
+    if result:
+        # Non-standard status codes are missing from http.client.responses.
+        reason = responses.get(result, "Unknown")
+        outstring += "%s,%s,%s\n" % (result, reason, line)
+        print("[%s][%s] - %s" % (result, reason, line))
+    else:
+        failedurls += 1
+
+if args.outputfile:
+    print("Saving results to %s" % args.outputfile)
+    try:
+        # The context manager closes the file even if the write fails.
+        with open(args.outputfile, 'w') as outfile:
+            outfile.write(outstring)
+    except (OSError, UnicodeError) as err:
+        print("ERROR: Could not save results (%s)." % err)
+
+# Shut down the headless browser that was started for the screenshots.
+if args.screenshots:
+    browser.quit()
+
+print("Done. Processed %i URLs (%i failed)." % (urlcount, failedurls))