diff --git a/CHANGELOG.md b/CHANGELOG.md index 70da23e..f344ff4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [0.98.1] - 2021-02-15 +- Fixed filtering of columns +- Fixed List mode not working +- Fixed redirects in List mode + + ## [0.98] - 2021-02-04 - Added support for password protected sites (basic http auth) - Added option to respect/ignore nofollow href links diff --git a/greenflare/__init__.py b/greenflare/__init__.py index e56f876..4255313 100644 --- a/greenflare/__init__.py +++ b/greenflare/__init__.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/app.py b/greenflare/app.py index b0e0fb3..b7665db 100644 --- a/greenflare/app.py +++ b/greenflare/app.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/core/__init__.py b/greenflare/core/__init__.py index e56f876..4255313 100644 --- a/greenflare/core/__init__.py +++ b/greenflare/core/__init__.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. 
diff --git a/greenflare/core/defaults.py b/greenflare/core/defaults.py index 3d97e4a..6313059 100644 --- a/greenflare/core/defaults.py +++ b/greenflare/core/defaults.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. @@ -29,7 +29,7 @@ class Defaults: - version = '0.98' + version = '0.98.1' crawl_items = [ 'url', diff --git a/greenflare/core/gflarecrawler.py b/greenflare/core/gflarecrawler.py index 4f5309c..eef87aa 100644 --- a/greenflare/core/gflarecrawler.py +++ b/greenflare/core/gflarecrawler.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/core/gflaredb.py b/greenflare/core/gflaredb.py index 214f45e..461380d 100644 --- a/greenflare/core/gflaredb.py +++ b/greenflare/core/gflaredb.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. 
@@ -266,7 +266,8 @@ def get_urls_crawled(self): @exception_handler def get_crawl_data(self): cur = self.con.cursor() - cur.execute(f"SELECT {', '.join(self.columns)} FROM crawl WHERE status_code != ''") + query = f"SELECT VALUES ({','.join(['?'] * self.columns)}) FROM crawl WHERE status_code != ''" + cur.execute(query, (self.columns,)) out = cur.fetchall() cur.close() return out @@ -311,11 +312,14 @@ def query(self, filters, table, columns=None): 'Less Than Or Equal To': '<=' } + values = [] + if not table: table = 'crawl' if columns: columns = f"{', '.join(columns)}" + elif table == 'crawl': columns = f"{', '.join(self.columns)}" else: @@ -327,16 +331,17 @@ def query(self, filters, table, columns=None): queries = [] order_cols = [] + values = [] for f in filters: column, operator, value = f - + value = value.replace('%', r'\%').replace('_', r'\_') if operator == 'Begins With': - value = f'{value}%' + values.append(f'{value}%') elif operator == 'Ends With': - value = f'%{value}' + values.append(f'%{value}') elif 'Contain' in operator: - value = f'%{value}%' + values.append(f'%{value}%') elif operator == 'Sort A-Z' or operator == 'Sort Smallest To Largest': order_cols.append(f'{column} ASC') continue @@ -345,7 +350,13 @@ def query(self, filters, table, columns=None): continue operator = operator_mapping[operator] - queries.append(f"{column} {operator} '{value}'") + + # Like values need to be escaped and the escape character needs to be defined as there is no default in sqlite + if 'LIKE' in operator: + queries.append(f"{column} {operator} ? 
ESCAPE '\\'") + else: + values.append(value) + queries.append(f"{column} {operator} ?") if queries: query += 'WHERE ' + \ @@ -359,18 +370,27 @@ def query(self, filters, table, columns=None): query += "WHERE status_code != ''" cur = self.con.cursor() - cur.execute(query) - rows = cur.fetchall() + try: + if values: + cur.execute(query, tuple(values)) + else: + cur.execute(query) + + rows = cur.fetchall() + if rows != None: + return rows + except Exception as e: + print(e) + cur.close() - if rows != None: - return rows + return [] def get_inlinks(self, url): url_id = self.get_ids([url]).pop() - query = fr"SELECT url as inlink FROM crawl LEFT JOIN inlinks ON crawl.id = inlinks.url_from_id WHERE inlinks.url_to_id = {url_id}" + query = "SELECT url as inlink FROM crawl LEFT JOIN inlinks ON crawl.id = inlinks.url_from_id WHERE inlinks.url_to_id = ?" cur = self.con.cursor() - cur.execute(query) + cur.execute(query, (url_id,)) inlinks = cur.fetchall() cur.close() if inlinks: @@ -398,10 +418,10 @@ def get_new_urls(self, links, chunk_size=999, check_crawled=False): print("ERROR returning new urls") print(e) print(f"input: {links}") - + cur.row_factory = None cur.close() - + urls_not_in_db = list(set(links) - set(urls_in_db)) if not urls_not_in_db: @@ -416,6 +436,7 @@ def insert_new_urls(self, urls): cur = self.con.cursor() cur.executemany(query, rows) cur.close() + self.commit() @exception_handler def get_ids(self, urls): diff --git a/greenflare/core/gflareresponse.py b/greenflare/core/gflareresponse.py index ed2eaa2..33254f6 100644 --- a/greenflare/core/gflareresponse.py +++ b/greenflare/core/gflareresponse.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. 
@@ -185,6 +185,9 @@ def get_robots_txt_url(self, url): return url def is_external(self, url): + if self.settings.get('MODE') == 'List': + return False + domain = self.get_domain(url) if not domain: @@ -274,8 +277,9 @@ def valid_url(self, url): return False # Filter out external links if needed - if "external_links" not in self.settings.get("CRAWL_ITEMS", "") and self.is_external(url): - return False + if self.settings.get('MODE') != 'List': + if "external_links" not in self.settings.get("CRAWL_ITEMS", "") and self.is_external(url): + return False if self.is_excluded(url): return False @@ -495,36 +499,35 @@ def dict_to_row(self, data): def has_redirected(self): return len(self.response.history) > 0 - # @timing def get_redirects(self): data = [] hist = self.response.history - if len(hist) > 0: - for i in range(len(hist)): - hob_url = self.sanitise_url(hist[i].url) - if 'external_links' not in self.settings.get('CRAWL_ITEMS', ''): - if self.is_external(hob_url): - break + for i in range(len(hist)): + hob_url = self.sanitise_url(hist[i].url) - robots_status = self.get_robots_txt_status(hob_url) - if 'respect_robots_txt' in self.settings.get('CRAWL_ITEMS', '') and 'follow_blocked_redirects' not in self.settings.get('CRAWL_ITEMS', '') and robots_status == 'blocked': - continue + if 'external_links' not in self.settings.get('CRAWL_ITEMS', ''): + if self.is_external(hob_url): + break - if i + 1 < len(hist): - redirect_to_url = self.sanitise_url(str(hist[i + 1].url)) - else: - redirect_to_url = self.get_final_url() + robots_status = self.get_robots_txt_status(hob_url) + if 'respect_robots_txt' in self.settings.get('CRAWL_ITEMS', '') and 'follow_blocked_redirects' not in self.settings.get('CRAWL_ITEMS', '') and robots_status == 'blocked': + continue + + if i + 1 < len(hist): + redirect_to_url = self.sanitise_url(str(hist[i + 1].url)) + else: + redirect_to_url = self.get_final_url() - hob_data = {"url": hob_url, "content_type": hist[i].headers.get('Content-Type', ""), 
'status_code': hist[i].status_code, 'x_robots_tag': hist[ - i].headers.get('X-Robots-Tag', ''), 'redirect_url': redirect_to_url, 'robots_txt': robots_status} + hob_data = {"url": hob_url, "content_type": hist[i].headers.get('Content-Type', ""), 'status_code': hist[i].status_code, 'x_robots_tag': hist[ + i].headers.get('X-Robots-Tag', ''), 'redirect_url': redirect_to_url, 'robots_txt': robots_status} - hob_data['crawl_status'] = self.get_full_status( - hob_url, hob_data) - hob_row = self.dict_to_row(hob_data) + hob_data['crawl_status'] = self.get_full_status( + hob_url, hob_data) + hob_row = self.dict_to_row(hob_data) - data.append(hob_row) + data.append(hob_row) return data diff --git a/greenflare/core/gflarerobots.py b/greenflare/core/gflarerobots.py index 987377a..fbcb602 100644 --- a/greenflare/core/gflarerobots.py +++ b/greenflare/core/gflarerobots.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/widgets/aboutwindow.py b/greenflare/widgets/aboutwindow.py index ba89986..b742c3e 100644 --- a/greenflare/widgets/aboutwindow.py +++ b/greenflare/widgets/aboutwindow.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. 
@@ -60,7 +60,7 @@ def __init__(self): heading_1 = 'Greenflare SEO Crawler' heading_2 = f'\nVersion {Defaults.version}' - text = '\n\n© Greenflare Developers 2020\n\nCreated By Benjamin Görler (ben@greenflare.io)\n\nWebsite: ' + text = '\n\n© Greenflare Developers 2020-2021\n\nCreated By Benjamin Görler (ben@greenflare.io)\n\nWebsite: ' website = 'https://greenflare.io' twitter = 'https://twitter.com/GreenflareEN' github = 'https://github.com/beb7/gflare-tk/' diff --git a/greenflare/widgets/checkboxgroup.py b/greenflare/widgets/checkboxgroup.py index fcc8b02..9e5141d 100644 --- a/greenflare/widgets/checkboxgroup.py +++ b/greenflare/widgets/checkboxgroup.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/widgets/crawltab.py b/greenflare/widgets/crawltab.py index 07afae5..c711a03 100644 --- a/greenflare/widgets/crawltab.py +++ b/greenflare/widgets/crawltab.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. 
@@ -232,11 +232,11 @@ def btn_crawl_pushed(self): title='Invalid URL', message='Please enter a valid URL!') return - url = urlunsplit(url_components) - url = self.crawler.gf.sanitise_url(url, base_url='') + url = urlunsplit(url_components) + url = self.crawler.gf.sanitise_url(url, base_url='') - self.entry_url_input.entry.delete(0, 'end') - self.entry_url_input.entry.insert(0, url) + self.entry_url_input.entry.delete(0, 'end') + self.entry_url_input.entry.insert(0, url) self.start_new_crawl(url) @@ -430,10 +430,12 @@ def assign_treeview_click(self, event): col = self.treeview_table.identify_column(event.x) self.selected_column = self.treeview_table.heading(col)['text'] - try: - self.popup_menu.tk_popup(event.x_root, event.y_root + 20, 0) - finally: - self.popup_menu.grab_release() + # Only show a context menu if the table is not empty + if len(self.treeview_table.get_children()) > 0: + try: + self.popup_menu.tk_popup(event.x_root, event.y_root + 20, 0) + finally: + self.popup_menu.grab_release() def show_filter_window(self, label): columns = self.get_display_columns() diff --git a/greenflare/widgets/enhancedentry.py b/greenflare/widgets/enhancedentry.py index 75b7475..bbd9c40 100644 --- a/greenflare/widgets/enhancedentry.py +++ b/greenflare/widgets/enhancedentry.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/widgets/exclusionstab.py b/greenflare/widgets/exclusionstab.py index 94cc599..e90f06a 100644 --- a/greenflare/widgets/exclusionstab.py +++ b/greenflare/widgets/exclusionstab.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. 
This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/widgets/extractionstab.py b/greenflare/widgets/extractionstab.py index 32ef3f5..820f3e9 100644 --- a/greenflare/widgets/extractionstab.py +++ b/greenflare/widgets/extractionstab.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/widgets/filterwindow.py b/greenflare/widgets/filterwindow.py index 10665c3..d526b85 100644 --- a/greenflare/widgets/filterwindow.py +++ b/greenflare/widgets/filterwindow.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. @@ -106,7 +106,7 @@ def btn_ok_pushed(self): for w in self.widgets: children = w.winfo_children() - column = children[0].get() + column = children[0].get().lower().replace(' ', '_') operation = children[1].get() values = children[2].get() diff --git a/greenflare/widgets/helpers.py b/greenflare/widgets/helpers.py index 2353967..2a9b4f6 100644 --- a/greenflare/widgets/helpers.py +++ b/greenflare/widgets/helpers.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. 
diff --git a/greenflare/widgets/listcrawl.py b/greenflare/widgets/listcrawl.py index 02cee17..bb88d30 100644 --- a/greenflare/widgets/listcrawl.py +++ b/greenflare/widgets/listcrawl.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. @@ -74,8 +74,8 @@ def start_list_crawl(self): self.crawler.settings['MODE'] = 'List' self.crawler.list_mode_urls = urls self.crawl_tab.show_list_mode() - messagebox.showinfo(title='Reading URLs completed', message=f'Loaded {len(urls)} valid and unique URLs!') self.destroy() + messagebox.showinfo(title='Reading URLs completed', message=f'Loaded {len(urls)} valid and unique URLs!') else: messagebox.showerror(title='Reading URLs failed', message='No valid URLs found, please check your input!') diff --git a/greenflare/widgets/progresswindow.py b/greenflare/widgets/progresswindow.py index 47df77d..ab86974 100644 --- a/greenflare/widgets/progresswindow.py +++ b/greenflare/widgets/progresswindow.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/widgets/settingstab.py b/greenflare/widgets/settingstab.py index 2ed6707..33cf87a 100644 --- a/greenflare/widgets/settingstab.py +++ b/greenflare/widgets/settingstab.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. 
This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/widgets/updatewindow.py b/greenflare/widgets/updatewindow.py index b857198..d557b78 100644 --- a/greenflare/widgets/updatewindow.py +++ b/greenflare/widgets/updatewindow.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. diff --git a/greenflare/widgets/viewinlinks.py b/greenflare/widgets/viewinlinks.py index b1b7072..1a0cbf7 100644 --- a/greenflare/widgets/viewinlinks.py +++ b/greenflare/widgets/viewinlinks.py @@ -4,7 +4,7 @@ @section LICENSE Greenflare SEO Web Crawler (https://greenflare.io) -Copyright (C) 2020 Benjamin Görler. This file is part of +Copyright (C) 2020-2021 Benjamin Görler. This file is part of Greenflare, an open-source project dedicated to delivering high quality SEO insights and analysis solutions to the world. @@ -22,11 +22,17 @@ along with this program. If not, see <https://www.gnu.org/licenses/>. 
""" -from tkinter import ttk, Toplevel, RIGHT, filedialog as fd +import sys from os import path, remove -from greenflare.widgets.helpers import export_to_csv, run_in_background_with_window, tk_after +from tkinter import ( + filedialog as fd, ttk, + Toplevel, RIGHT, +) +from greenflare.widgets.helpers import ( + export_to_csv, run_in_background_with_window, + tk_after, +) from greenflare.widgets.windowhelper import center_on_parent -import sys class ViewInlinks(Toplevel): @@ -43,8 +49,9 @@ def __init__(self, url, query_func): self.top_frame = ttk.Frame(self) self.top_frame.pack(anchor='center', fill='x') - - self.btn_export = ttk.Button(self.top_frame, text='Export', command=self.export_button_pushed) + + self.btn_export = ttk.Button( + self.top_frame, text='Export', command=self.export_button_pushed) self.btn_export.pack(side=RIGHT, padx=20, pady=20) self.frame_tbl = ttk.Frame(self) @@ -52,20 +59,23 @@ def __init__(self, url, query_func): self.tbl = ttk.Treeview(self.frame_tbl, selectmode="browse") - self.scrollbar_vertical = ttk.Scrollbar(self.frame_tbl, orient='vertical', command=self.tbl.yview) - self.scrollbar_vertical.pack(side="right", fill="y") - - self.scrollbar_horizontal = ttk.Scrollbar(self.frame_tbl, orient='horizontal', command=self.tbl.xview) - self.scrollbar_horizontal.pack(side="bottom", fill="x") - - self.tbl.configure(yscrollcommand=self.scrollbar_vertical.set, xscrollcommand=self.scrollbar_horizontal.set) - self.tbl.pack(fill="both", expand=True) - + self.scrollbar_vertical = ttk.Scrollbar( + self.frame_tbl, orient='vertical', command=self.tbl.yview) + self.scrollbar_vertical.pack(side='right', fill='y') + + self.scrollbar_horizontal = ttk.Scrollbar( + self.frame_tbl, orient='horizontal', command=self.tbl.xview) + self.scrollbar_horizontal.pack(side='bottom', fill='x') + + self.tbl.configure(yscrollcommand=self.scrollbar_vertical.set, + xscrollcommand=self.scrollbar_horizontal.set) + self.tbl.pack(fill='both', expand=True) + column_name = 
'Linking URL' - self.tbl["columns"] = tuple([column_name]) + self.tbl['columns'] = tuple([column_name]) - self.tbl.heading("#0", text="id", anchor='w') - self.tbl.column("#0", width=55, stretch=False) + self.tbl.heading('#0', text='id', anchor='w') + self.tbl.column('#0', width=55, stretch=False) self.tbl.heading(column_name, text=column_name, anchor='w') self.tbl.column(column_name, width=750, stretch=True) @@ -81,7 +91,7 @@ def add_inlinks(self, inlinks): def _query_func(self): inlinks = self.query_func(self.url) self.add_inlinks(inlinks) - + def export_button_pushed(self): files = [('CSV files', '*.csv')] self.withdraw() @@ -99,4 +109,4 @@ def export_button_pushed(self): child)['values'] for child in self.tbl.get_children()] export_to_csv( - export_file, self.tbl['columns'], data) \ No newline at end of file + export_file, self.tbl['columns'], data) diff --git a/installer.iss b/installer.iss index 3e6375a..331c15a 100644 --- a/installer.iss +++ b/installer.iss @@ -2,7 +2,7 @@ ; SEE THE DOCUMENTATION FOR DETAILS ON CREATING INNO SETUP SCRIPT FILES! -#define VERSION "0.98" +#define VERSION "0.98.1" #define MyAppName "Greenflare SEO Web Crawler" #define MyAppShortName "Greenflare" #define MyAppProjectFileDesc "Greenflare Database File"