
Commit

Merge branch 'staging'
beb7 committed Feb 15, 2021
2 parents 45899d3 + eeaf663 commit 18eae06
Showing 23 changed files with 131 additions and 89 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## [0.98.1] - 2021-02-15
+- Fixed filtering of columns
+- Fixed List mode not working
+- Fixed redirects in List mode
+
+
 ## [0.98] - 2021-02-04
 - Added support for password protected sites (basic http auth)
 - Added option to respect/ignore nofollow href links
2 changes: 1 addition & 1 deletion greenflare/__init__.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
2 changes: 1 addition & 1 deletion greenflare/app.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
2 changes: 1 addition & 1 deletion greenflare/core/__init__.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
4 changes: 2 additions & 2 deletions greenflare/core/defaults.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
@@ -29,7 +29,7 @@
 
 class Defaults:
 
-    version = '0.98'
+    version = '0.98.1'
 
     crawl_items = [
         'url',
2 changes: 1 addition & 1 deletion greenflare/core/gflarecrawler.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
51 changes: 36 additions & 15 deletions greenflare/core/gflaredb.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
@@ -266,7 +266,8 @@ def get_urls_crawled(self):
     @exception_handler
     def get_crawl_data(self):
         cur = self.con.cursor()
-        cur.execute(f"SELECT {', '.join(self.columns)} FROM crawl WHERE status_code != ''")
+        query = f"SELECT VALUES ({','.join(['?'] * self.columns)}) FROM crawl WHERE status_code != ''"
+        cur.execute(query, (self.columns,))
         out = cur.fetchall()
         cur.close()
         return out
@@ -311,11 +312,14 @@ def query(self, filters, table, columns=None):
             'Less Than Or Equal To': '<='
         }
 
+        values = []
+
         if not table:
            table = 'crawl'
 
         if columns:
             columns = f"{', '.join(columns)}"
+
         elif table == 'crawl':
             columns = f"{', '.join(self.columns)}"
         else:
@@ -327,16 +331,17 @@
 
         queries = []
         order_cols = []
-        values = []
 
         for f in filters:
             column, operator, value = f
+
+            value = value.replace('%', r'\%').replace('_', r'\_')
             if operator == 'Begins With':
-                value = f'{value}%'
+                values.append(f'{value}%')
             elif operator == 'Ends With':
-                value = f'%{value}'
+                values.append(f'%{value}')
             elif 'Contain' in operator:
-                value = f'%{value}%'
+                values.append(f'%{value}%')
             elif operator == 'Sort A-Z' or operator == 'Sort Smallest To Largest':
                 order_cols.append(f'{column} ASC')
                 continue
@@ -345,7 +350,13 @@
                 continue
 
             operator = operator_mapping[operator]
-            queries.append(f"{column} {operator} '{value}'")
+
+            # Like values need to be escaped and the escape character needs to be defined as there is no default in sqlite
+            if 'LIKE' in operator:
+                queries.append(f"{column} {operator} ? ESCAPE '\\'")
+            else:
+                values.append(value)
+                queries.append(f"{column} {operator} ?")
 
         if queries:
             query += 'WHERE ' + \
@@ -359,18 +370,27 @@
             query += "WHERE status_code != ''"
 
         cur = self.con.cursor()
-        cur.execute(query)
-        rows = cur.fetchall()
+        try:
+            if values:
+                cur.execute(query, tuple(values))
+            else:
+                cur.execute(query)
+
+            rows = cur.fetchall()
+            if rows != None:
+                return rows
+        except Exception as e:
+            print(e)
 
         cur.close()
-        if rows != None:
-            return rows
 
         return []
 
     def get_inlinks(self, url):
         url_id = self.get_ids([url]).pop()
-        query = fr"SELECT url as inlink FROM crawl LEFT JOIN inlinks ON crawl.id = inlinks.url_from_id WHERE inlinks.url_to_id = {url_id}"
+        query = "SELECT url as inlink FROM crawl LEFT JOIN inlinks ON crawl.id = inlinks.url_from_id WHERE inlinks.url_to_id = ?"
         cur = self.con.cursor()
-        cur.execute(query)
+        cur.execute(query, (url_id,))
         inlinks = cur.fetchall()
         cur.close()
         if inlinks:
@@ -398,10 +418,10 @@ def get_new_urls(self, links, chunk_size=999, check_crawled=False):
             print("ERROR returning new urls")
             print(e)
             print(f"input: {links}")
 
         cur.row_factory = None
         cur.close()
 
         urls_not_in_db = list(set(links) - set(urls_in_db))
 
         if not urls_not_in_db:
@@ -416,6 +436,7 @@ def insert_new_urls(self, urls):
         cur = self.con.cursor()
         cur.executemany(query, rows)
         cur.close()
+        self.commit()
 
     @exception_handler
     def get_ids(self, urls):
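The gflaredb.py changes above replace string-interpolated filter values with bound parameters and escape LIKE wildcards explicitly, declaring the escape character because SQLite defines no default. A minimal standalone sketch of that pattern follows; the table layout and the filter_rows helper are illustrative stand-ins, not Greenflare's actual query builder.

import sqlite3

def filter_rows(con, column, operator, value):
    # column and operator are assumed to come from a fixed whitelist
    # (as the diff does via operator_mapping); placeholders cannot replace identifiers.
    if operator == 'LIKE':
        # Escape user-supplied wildcards and declare the escape character,
        # since SQLite's LIKE has no default ESCAPE.
        escaped = value.replace('%', r'\%').replace('_', r'\_')
        query = f"SELECT url, status_code FROM crawl WHERE {column} LIKE ? ESCAPE '\\'"
        params = (f'%{escaped}%',)
    else:
        query = f"SELECT url, status_code FROM crawl WHERE {column} {operator} ?"
        params = (value,)

    cur = con.cursor()
    cur.execute(query, params)   # the value is bound, never interpolated
    rows = cur.fetchall()
    cur.close()
    return rows

con = sqlite3.connect(':memory:')
con.execute("CREATE TABLE crawl (url TEXT, status_code TEXT)")
con.execute("INSERT INTO crawl VALUES ('https://example.com/50%_off', '200')")
print(filter_rows(con, 'url', 'LIKE', '50%_off'))      # matches the literal '50%_off'
print(filter_rows(con, 'status_code', '=', '200'))

Only the value travels as a parameter; the column name and comparison operator still have to come from a fixed mapping, since SQLite placeholders cannot stand in for identifiers.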
49 changes: 26 additions & 23 deletions greenflare/core/gflareresponse.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
@@ -185,6 +185,9 @@ def get_robots_txt_url(self, url):
         return url
 
     def is_external(self, url):
+        if self.settings.get('MODE') == 'List':
+            return False
+
         domain = self.get_domain(url)
 
         if not domain:
@@ -274,8 +277,9 @@ def valid_url(self, url):
             return False
 
         # Filter out external links if needed
-        if "external_links" not in self.settings.get("CRAWL_ITEMS", "") and self.is_external(url):
-            return False
+        if self.settings.get('MODE') != 'List':
+            if "external_links" not in self.settings.get("CRAWL_ITEMS", "") and self.is_external(url):
+                return False
 
         if self.is_excluded(url):
             return False
@@ -495,36 +499,35 @@ def dict_to_row(self, data):
     def has_redirected(self):
         return len(self.response.history) > 0
 
     # @timing
     def get_redirects(self):
         data = []
         hist = self.response.history
 
-        if len(hist) > 0:
-            for i in range(len(hist)):
-                hob_url = self.sanitise_url(hist[i].url)
-
-                if 'external_links' not in self.settings.get('CRAWL_ITEMS', ''):
-                    if self.is_external(hob_url):
-                        break
-
-                robots_status = self.get_robots_txt_status(hob_url)
-                if 'respect_robots_txt' in self.settings.get('CRAWL_ITEMS', '') and 'follow_blocked_redirects' not in self.settings.get('CRAWL_ITEMS', '') and robots_status == 'blocked':
-                    continue
-
-                if i + 1 < len(hist):
-                    redirect_to_url = self.sanitise_url(str(hist[i + 1].url))
-                else:
-                    redirect_to_url = self.get_final_url()
-
-                hob_data = {"url": hob_url, "content_type": hist[i].headers.get('Content-Type', ""), 'status_code': hist[i].status_code, 'x_robots_tag': hist[
-                    i].headers.get('X-Robots-Tag', ''), 'redirect_url': redirect_to_url, 'robots_txt': robots_status}
-
-                hob_data['crawl_status'] = self.get_full_status(
-                    hob_url, hob_data)
-                hob_row = self.dict_to_row(hob_data)
-
-                data.append(hob_row)
+        for i in range(len(hist)):
+            hob_url = self.sanitise_url(hist[i].url)
+
+            if 'external_links' not in self.settings.get('CRAWL_ITEMS', ''):
+                if self.is_external(hob_url):
+                    break
+
+            robots_status = self.get_robots_txt_status(hob_url)
+            if 'respect_robots_txt' in self.settings.get('CRAWL_ITEMS', '') and 'follow_blocked_redirects' not in self.settings.get('CRAWL_ITEMS', '') and robots_status == 'blocked':
+                continue
+
+            if i + 1 < len(hist):
+                redirect_to_url = self.sanitise_url(str(hist[i + 1].url))
+            else:
+                redirect_to_url = self.get_final_url()
+
+            hob_data = {"url": hob_url, "content_type": hist[i].headers.get('Content-Type', ""), 'status_code': hist[i].status_code, 'x_robots_tag': hist[
+                i].headers.get('X-Robots-Tag', ''), 'redirect_url': redirect_to_url, 'robots_txt': robots_status}
+
+            hob_data['crawl_status'] = self.get_full_status(
+                hob_url, hob_data)
+            hob_row = self.dict_to_row(hob_data)
+
+            data.append(hob_row)
 
         return data
 
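The reworked get_redirects above iterates over response.history, which the requests library populates with one Response per redirect hop, oldest first, while response.url holds the final resolved address. A rough sketch of that traversal, stripped of Greenflare's robots.txt and external-link handling:

import requests

def redirect_chain(url):
    """Return (hop_url, status_code, redirect_target) for each redirect hop."""
    response = requests.get(url, allow_redirects=True, timeout=10)
    hist = response.history  # intermediate responses, oldest first
    chain = []

    for i in range(len(hist)):
        hop = hist[i]
        # Each hop redirects either to the next entry in the history or,
        # for the last hop, to the final URL of the resolved response.
        if i + 1 < len(hist):
            target = hist[i + 1].url
        else:
            target = response.url
        chain.append((hop.url, hop.status_code, target))

    return chain

for hop_url, status, target in redirect_chain('http://github.com'):
    print(f'{status} {hop_url} -> {target}')

This is the same bookkeeping the diff performs with hist[i + 1] and get_final_url(), just without the per-hop crawl metadata.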
2 changes: 1 addition & 1 deletion greenflare/core/gflarerobots.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
4 changes: 2 additions & 2 deletions greenflare/widgets/aboutwindow.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
@@ -60,7 +60,7 @@ def __init__(self):
         heading_1 = 'Greenflare SEO Crawler'
         heading_2 = f'\nVersion {Defaults.version}'
 
-        text = '\n\n© Greenflare Developers 2020\n\nCreated By Benjamin Görler (ben@greenflare.io)\n\nWebsite: '
+        text = '\n\n© Greenflare Developers 2020-2021\n\nCreated By Benjamin Görler (ben@greenflare.io)\n\nWebsite: '
         website = 'https://greenflare.io'
         twitter = 'https://twitter.com/GreenflareEN'
         github = 'https://github.com/beb7/gflare-tk/'
2 changes: 1 addition & 1 deletion greenflare/widgets/checkboxgroup.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
20 changes: 11 additions & 9 deletions greenflare/widgets/crawltab.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
@@ -232,11 +232,11 @@ def btn_crawl_pushed(self):
                 title='Invalid URL', message='Please enter a valid URL!')
             return
 
-        url = urlunsplit(url_components)
-        url = self.crawler.gf.sanitise_url(url, base_url='')
+        url = urlunsplit(url_components)
+        url = self.crawler.gf.sanitise_url(url, base_url='')
 
-        self.entry_url_input.entry.delete(0, 'end')
-        self.entry_url_input.entry.insert(0, url)
+        self.entry_url_input.entry.delete(0, 'end')
+        self.entry_url_input.entry.insert(0, url)
 
         self.start_new_crawl(url)
 
@@ -430,10 +430,12 @@ def assign_treeview_click(self, event):
         col = self.treeview_table.identify_column(event.x)
         self.selected_column = self.treeview_table.heading(col)['text']
 
-        try:
-            self.popup_menu.tk_popup(event.x_root, event.y_root + 20, 0)
-        finally:
-            self.popup_menu.grab_release()
+        # Only show a context menu if the table is not empty
+        if len(self.treeview_table.get_children()) > 0:
+            try:
+                self.popup_menu.tk_popup(event.x_root, event.y_root + 20, 0)
+            finally:
+                self.popup_menu.grab_release()
 
     def show_filter_window(self, label):
         columns = self.get_display_columns()
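The assign_treeview_click change above guards the right-click menu so it only appears when the results table actually has rows. A small self-contained tkinter sketch of that guard; the widget names and the bound mouse button are illustrative, not Greenflare's exact layout.

import tkinter as tk
from tkinter import ttk

root = tk.Tk()
table = ttk.Treeview(root, columns=('url',), show='headings')
table.heading('url', text='url')
table.pack(fill='both', expand=True)

popup = tk.Menu(root, tearoff=0)
popup.add_command(label='Filter this column', command=lambda: print('filter'))

def on_right_click(event):
    # Only show a context menu if the table is not empty
    if len(table.get_children()) > 0:
        try:
            popup.tk_popup(event.x_root, event.y_root + 20, 0)
        finally:
            popup.grab_release()

table.bind('<Button-3>', on_right_click)  # right-click is Button-2 on macOS
root.mainloop()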
2 changes: 1 addition & 1 deletion greenflare/widgets/enhancedentry.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
2 changes: 1 addition & 1 deletion greenflare/widgets/exclusionstab.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.
2 changes: 1 addition & 1 deletion greenflare/widgets/extractionstab.py
@@ -4,7 +4,7 @@
 @section LICENSE
 Greenflare SEO Web Crawler (https://greenflare.io)
-Copyright (C) 2020 Benjamin Görler. This file is part of
+Copyright (C) 2020-2021 Benjamin Görler. This file is part of
 Greenflare, an open-source project dedicated to delivering
 high quality SEO insights and analysis solutions to the world.