From 1572852fd2f348d3a83b07f08330aeb990e8f8ca Mon Sep 17 00:00:00 2001
From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com>
Date: Tue, 4 Apr 2023 10:07:02 +0000
Subject: [PATCH 1/2] refactor: Use identity check for comparison to a singleton

Comparisons to the singleton objects, like `True`, `False`, and `None`, should
be done with identity, not equality. Use `is` or `is not`.
---
 main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index 96ae59a..109e60b 100644
--- a/main.py
+++ b/main.py
@@ -34,7 +34,7 @@ def extract_info_url(url, main=False):
     scheme = data_url.scheme
     url_ori = url.replace(file_name, '')
     black_list = ['', '/']
-    if main == True and file_name in black_list:
+    if main is True and file_name in black_list:
         file_name = 'index.html'
     return {"domain": domain, "path": path_file, "file_name": file_name, "scheme": scheme, "url": url_ori}

@@ -95,7 +95,7 @@ def check_exists(self, url):
         if info_url['domain'] != self.info_url['domain']:
             return False
         path_file = info_url['path'] + info_url['file_name']
-        return os.path.exists(path_file) == False
+        return os.path.exists(path_file) is False

     def get_all_urls_in_page(self, page_source):
         result = []

From 953018f7c8db4170faa9d73bf970327ddbf1275c Mon Sep 17 00:00:00 2001
From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com>
Date: Tue, 4 Apr 2023 10:07:15 +0000
Subject: [PATCH 2/2] style: Format code with yapf, black, autopep8 and isort

---
 main.py | 117 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 66 insertions(+), 51 deletions(-)

diff --git a/main.py b/main.py
index 109e60b..31612bf 100644
--- a/main.py
+++ b/main.py
@@ -4,12 +4,12 @@
 # __license__ = "GPL"
 # __version__ = "2.1.0"
 # __email__ = "a2FpdG9raWQxNDEyLmNvbmFuQGdtYWlsLmNvbQ=="
-__black_list_type__ = ['.php']
+__black_list_type__ = [".php"]
 __status_code__ = [200, 404]
 __clone_all__ = False
 __zip__ = False
 __headless__ = False
-__clone_url__ = 'https://themesbrand.com/velzon/html/default/index.html'
+__clone_url__ = "https://themesbrand.com/velzon/html/default/index.html"

 import os
 import os.path
@@ -29,14 +29,20 @@
 def extract_info_url(url, main=False):
     data_url = urlparse(url)
     domain = data_url.netloc
-    path_file = domain.replace('.', '') + os.path.split(data_url.path)[0] + '/'
+    path_file = domain.replace(".", "") + os.path.split(data_url.path)[0] + "/"
     file_name = os.path.split(data_url.path)[1]
     scheme = data_url.scheme
-    url_ori = url.replace(file_name, '')
-    black_list = ['', '/']
+    url_ori = url.replace(file_name, "")
+    black_list = ["", "/"]
     if main is True and file_name in black_list:
-        file_name = 'index.html'
-    return {"domain": domain, "path": path_file, "file_name": file_name, "scheme": scheme, "url": url_ori}
+        file_name = "index.html"
+    return {
+        "domain": domain,
+        "path": path_file,
+        "file_name": file_name,
+        "scheme": scheme,
+        "url": url_ori,
+    }


 def get_all_file_paths(directory):
@@ -49,13 +55,13 @@


 def compress(path_folder):
-    print(f'Compression files... {str(path_folder)}.zip')
+    print(f"Compression files... {str(path_folder)}.zip")
     directory = path_folder
     file_paths = get_all_file_paths(directory)
-    with ZipFile(f'{path_folder}.zip', 'w') as zip:
+    with ZipFile(f"{path_folder}.zip", "w") as zip:
         for file in file_paths:
             zip.write(file)
-    print('All files zipped successfully!')
+    print("All files zipped successfully!")


 def check_invalid(file_name):
@@ -66,7 +72,7 @@


 class File:
-    info_url = ''
+    info_url = ""

     def __init__(self, url):
         self.url = url
@@ -78,61 +84,67 @@ def download_file(self, url, headers):

         if url == self.url:
             info_url = extract_info_url(url, True)

-        if info_url['file_name'][-4:] not in __black_list_type__:
-            file_name = info_url['file_name']
-            black_list = ['', '/']
+        if info_url["file_name"][-4:] not in __black_list_type__:
+            file_name = info_url["file_name"]
+            black_list = ["", "/"]
             if file_name in black_list:
-                file_name = 'index.html'
-            path_file = info_url['path'] + file_name
+                file_name = "index.html"
+            path_file = info_url["path"] + file_name
             if not os.path.exists(path_file):
                 r = requests.get(url, headers=headers)
                 os.makedirs(os.path.dirname(path_file), exist_ok=True)
-                with open(path_file, 'wb') as f:
+                with open(path_file, "wb") as f:
                     f.write(r.content)

     def check_exists(self, url):
         info_url = extract_info_url(url)
-        if info_url['domain'] != self.info_url['domain']:
+        if info_url["domain"] != self.info_url["domain"]:
             return False
-        path_file = info_url['path'] + info_url['file_name']
+        path_file = info_url["path"] + info_url["file_name"]
         return os.path.exists(path_file) is False

     def get_all_urls_in_page(self, page_source):
         result = []
-        source = BeautifulSoup(page_source, 'html.parser')
+        source = BeautifulSoup(page_source, "html.parser")
         try:
             data_a = source.find_all("a")
         except Exception:
             data_a = None
         a_tag = []
         for a in data_a:
-            if a.get('href') != '' and a.get('href') != '#' and str(a.get('href')) not in a_tag and check_invalid(str(a.get('href'))) is not None:
-                a_tag.append(a.get('href'))
+            if (
+                a.get("href") != ""
+                and a.get("href") != "#"
+                and str(a.get("href")) not in a_tag
+                and check_invalid(str(a.get("href"))) is not None
+            ):
+                a_tag.append(a.get("href"))
         for href in a_tag:
             domain = urlparse(href).netloc
-            if domain == '':
-                if len(href.split('../')) > 1:
-                    cut = self.info_url['url'].split('/')[-(len(href.split('../'))):]
-                    link = self.info_url['url']
+            if domain == "":
+                if len(href.split("../")) > 1:
+                    cut = self.info_url["url"].split(
+                        "/")[-(len(href.split("../"))):]
+                    link = self.info_url["url"]
                     for text in cut:
-                        if text != '':
-                            link = link.replace(f'{str(text)}/', '')
-                    result.append(link + href.replace('../', ''))
-                elif href[:1] == '/':
-                    link = re.split('[\/]+', self.info_url['url'])[:2]
-                    link = f'{str(link[0])}//{str(link[1])}'
+                        if text != "":
+                            link = link.replace(f"{str(text)}/", "")
+                    result.append(link + href.replace("../", ""))
+                elif href[:1] == "/":
+                    link = re.split("[\/]+", self.info_url["url"])[:2]
+                    link = f"{str(link[0])}//{str(link[1])}"
                     result.append(link + href)
                 else:
-                    result.append(self.info_url['url'] + href)
-            if domain == self.info_url['domain']:
+                    result.append(self.info_url["url"] + href)
+            if domain == self.info_url["domain"]:
                 result.append(href)
         return result


 class BrowserClone(File):
-    driver = ''
-    page_source = ''
+    driver = ""
+    page_source = ""
     all_url = []
     url_down = []
     headers = {}
@@ -143,23 +155,26 @@ def __init__(self, url):

         self.open_browser()

     def open_browser(self):
-        print('============================== Begin ==============================')
+        print("============================== Begin ==============================")
         options = webdriver.ChromeOptions()
         if __headless__:
-            options.add_argument('--headless')
+            options.add_argument("--headless")
         options.add_argument("--no-sandbox")
         options.add_experimental_option("useAutomationExtension", False)
-        options.add_experimental_option("excludeSwitches", ["enable-automation"])
-        options.page_load_strategy = 'none'
+        options.add_experimental_option(
+            "excludeSwitches", ["enable-automation"])
+        options.page_load_strategy = "none"

-        self.driver = webdriver.Chrome(chrome_options=options, executable_path=ChromeDriverManager().install())
+        self.driver = webdriver.Chrome(
+            chrome_options=options, executable_path=ChromeDriverManager().install()
+        )
         self.driver.get(self.url)
-        print('Waiting 30s to make sure the page has finished loading...')
+        print("Waiting 30s to make sure the page has finished loading...")
         time.sleep(30)
         self.set_page_source()
         self.extract_file()
-        print('Getting all the links to crawl...')
+        print("Getting all the links to crawl...")
         all_urls_in_page = super().get_all_urls_in_page(self.page_source)
         for url_in_page in all_urls_in_page:
             self.all_url.append(url_in_page)
@@ -172,18 +187,18 @@

             self.driver.get(url)
             self.extract_file()

-        print('Get all the links done!')
+        print("Get all the links done!")
         self.extract_file(True)
         if __zip__:
             url_info = extract_info_url(self.url, True)
-            folder = './' + url_info['domain'].replace('.', '')
+            folder = "./" + url_info["domain"].replace(".", "")
             compress(folder)
             try:
                 shutil.rmtree(folder, ignore_errors=True)
             except OSError as e:
                 print(f"Error: {folder} : {e.strerror}")
-        print('============================== End Game ==============================')
+        print("============================== End Game ==============================")

     def extract_html(self, url):
         super().__init__(url)
@@ -197,13 +212,13 @@
     def extract_file(self, down=False):
         for request in self.driver.requests:
             if (
-                    request.response
-                    and request.response.status_code in __status_code__
-                    and request.url not in self.url_down
+                request.response
+                and request.response.status_code in __status_code__
+                and request.url not in self.url_down
             ):
                 self.url_down.append(request.url)
             if down:
-                print('Save files...')
+                print("Save files...")
                 super().__init__(self.url)
                 data = list(set(self.url_down))
                 with tqdm(total=len(data)) as pbar:
@@ -211,7 +226,7 @@
                     if super().check_exists(file):
                         super().download_file(file, self.headers)
                     pbar.update(1)
-                print('Save files Done!')
+                print("Save files Done!")

    def set_page_source(self):
        for _ in range(5):
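
Background on PATCH 1/2: the rule being applied is that comparisons against
the singleton objects True, False, and None should use `is` / `is not` rather
than `==` / `!=`. The reason is that `==` dispatches to a type's `__eq__`
method, which any class may override, while `is` compares object identity and
cannot be fooled. A minimal illustrative sketch (the `Always` class and the
values below are hypothetical, not part of this patch):

    class Always:
        """Hypothetical type whose __eq__ claims equality with everything."""

        def __eq__(self, other):
            return True

    value = Always()
    print(value == None)  # True  -- the overridden __eq__ answers the comparison
    print(value is None)  # False -- identity is checked, so the answer is reliable

    # For plain booleans, truthiness usually reads better than either form:
    # `if main is True and ...` could be written `if main and ...`, and
    # `return os.path.exists(path_file) is False` as
    # `return not os.path.exists(path_file)`. The autofix above only swaps
    # the operator and leaves that simplification to the maintainer.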