From b6d65fb7064f473818bdab754a5a737357e330b6 Mon Sep 17 00:00:00 2001 From: Sakan Date: Fri, 24 May 2024 21:09:25 +0700 Subject: [PATCH] v2.1.0 - Refactored web_scrap.py --- README.md | 2 + main.py | 72 -------------------------- tests/test.py | 1 - yoasobi_project/web_scrap.py | 2 - yoasobiscraper.py | 98 ++++++++++++++++++++++++++++++++++++ 5 files changed, 100 insertions(+), 75 deletions(-) delete mode 100644 main.py create mode 100644 yoasobiscraper.py diff --git a/README.md b/README.md index 18b3119..c13b302 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ Update May 4th, 2024 Showcase visualizations about the common Japanese words in YOASOBI's songs' lyrics. +[![CodeQL](https://github.com/sakan811/Common-Japanese-Words-in-YOASOBI-Lyrics/actions/workflows/codeql.yml/badge.svg)](https://github.com/sakan811/Common-Japanese-Words-in-YOASOBI-Lyrics/actions/workflows/codeql.yml) + ## Project Details Lyrics were based on [genius.com](https://genius.com/artists/Yoasobi) diff --git a/main.py b/main.py deleted file mode 100644 index 359eff1..0000000 --- a/main.py +++ /dev/null @@ -1,72 +0,0 @@ -from loguru import logger - -import yoasobi_project - -logger.add('yoasobi.log', - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {thread} | {name} | {module} | {function} | {line} | {message}", - mode='w') - - -class Main: - def __init__(self, db_dir: str): - logger.info('Initializing SQLite database if not exist...') - logger.info('Connect to the database if exist') - self.db_dir = db_dir - self.engine = yoasobi_project.connect_sqlite_db(db_dir) - - def main(self) -> None: - """ - Main function that runs the web-scraping process and SQLite data migration. - :return: None - """ - try: - urls: list[str] = yoasobi_project.return_url_list() - - page_source_list = yoasobi_project.thread_fetch_page_source(urls) - - yoasobi_project.connect_sqlite_db(self.db_dir) - - query = yoasobi_project.create_table_query() - yoasobi_project.execute_sql_query(self.engine, query) - - if page_source_list: - logger.info(f'Appended page sources to list successfully') - - logger.info(f'Delete all rows from the table \'Words\'') - query = yoasobi_project.delete_all_rows() - yoasobi_project.execute_sql_query(self.engine, query) - else: - logger.error('No page sources were found. Not delete all rows from the \'Words\' table.') - - for page_source in page_source_list: - lyrics_list: list[str] = yoasobi_project.scrap(page_source) - logger.debug(f'{lyrics_list = }') - - song_name: str = yoasobi_project.extract_song_name_from_lyrics_list(lyrics_list) - logger.debug(f'{song_name = }') - - lyrics: str = yoasobi_project.extract_lyrics_from_lyrics_list(lyrics_list) - logger.debug(f'{lyrics = }') - - words: list[str] = yoasobi_project.extract_words_from_lyrics(lyrics) - logger.debug(f'{words = }') - - romanized_words: list[str] = yoasobi_project.extract_romanji_from_words(words) - logger.debug(f'{romanized_words = }') - - part_of_speech_list: list[str] = yoasobi_project.extract_part_of_speech_from_words(words) - logger.debug(f'{part_of_speech_list = }') - - yoasobi_project.insert_data(words, romanized_words, part_of_speech_list, song_name, self.db_dir) - - logger.debug(f'{len(words) = }') - logger.debug(f'{len(romanized_words) = }') - logger.debug(f'{len(part_of_speech_list) = }') - except Exception as e: - logger.error(f'Error: {e}') - - -if __name__ == '__main__': - db_dir = 'yoasobi.db' - Main(db_dir).main() - diff --git a/tests/test.py b/tests/test.py index b686701..e985bc4 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,5 +1,4 @@ import sqlite3 -import requests import pytest from bs4 import BeautifulSoup from loguru import logger diff --git a/yoasobi_project/web_scrap.py b/yoasobi_project/web_scrap.py index 923a2e4..0df3766 100644 --- a/yoasobi_project/web_scrap.py +++ b/yoasobi_project/web_scrap.py @@ -4,8 +4,6 @@ import requests from bs4 import BeautifulSoup, ResultSet from loguru import logger -from selenium import webdriver -from selenium.webdriver.chrome.options import Options def return_url_list() -> list[str]: diff --git a/yoasobiscraper.py b/yoasobiscraper.py new file mode 100644 index 0000000..0ef97f9 --- /dev/null +++ b/yoasobiscraper.py @@ -0,0 +1,98 @@ +from typing import Tuple, List + +from loguru import logger + +import yoasobi_project + +logger.add('yoasobi.log', + format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {thread} | {name} | {module} | {function} | {line} | {message}", + mode='w') + + +class YoasobiScraper: + def __init__(self, db_dir: str): + """ + A 'YoasobiScraper' class creates or connects to the SQLite database when initialized. + :param db_dir: SQLite database directory. + """ + logger.info('Initializing SQLite database if not exist...') + logger.info('Connect to the database if exist') + self.db_dir = db_dir + self.engine = yoasobi_project.connect_sqlite_db(db_dir) + + def main(self) -> None: + """ + Main function that runs the web-scraping process and SQLite data migration. + :return: None + """ + logger.info('Start the scraping process...') + + urls: list[str] = yoasobi_project.return_url_list() + + page_source_list = yoasobi_project.thread_fetch_page_source(urls) + + yoasobi_project.connect_sqlite_db(self.db_dir) + + query = yoasobi_project.create_table_query() + yoasobi_project.execute_sql_query(self.engine, query) + + if page_source_list: + logger.info(f'Appended page sources to list successfully') + + logger.info(f'Delete all rows from the table \'Words\'') + query = yoasobi_project.delete_all_rows() + yoasobi_project.execute_sql_query(self.engine, query) + else: + logger.error('No page sources were found. Not delete all rows from the \'Words\' table.') + + self._scrape_each_page_source(page_source_list) + + def _scrape_each_page_source(self, page_source_list) -> None: + """ + Scrape each page source. + :param page_source_list: List of page sources. + :return: None + """ + logger.info('Scraping each page source...') + for page_source in page_source_list: + lyrics_list: list[str] = yoasobi_project.scrap(page_source) + logger.debug(f'{lyrics_list = }') + + part_of_speech_list, romanized_words, song_name, words = self._extract_data(lyrics_list) + + yoasobi_project.insert_data(words, romanized_words, part_of_speech_list, song_name, self.db_dir) + + logger.debug(f'{len(words) = }') + logger.debug(f'{len(romanized_words) = }') + logger.debug(f'{len(part_of_speech_list) = }') + + @staticmethod + def _extract_data(lyrics_list: list[str]) -> tuple[list[str], list[str], str, list[str]]: + """ + Extract data from the page sources. + :param lyrics_list: Lyrics list. + :return: Tuple of Lists that contain extracted data from the page sources. + """ + logger.info('Extracting data from the page sources...') + song_name: str = yoasobi_project.extract_song_name_from_lyrics_list(lyrics_list) + logger.debug(f'{song_name = }') + + lyrics: str = yoasobi_project.extract_lyrics_from_lyrics_list(lyrics_list) + logger.debug(f'{lyrics = }') + + words: list[str] = yoasobi_project.extract_words_from_lyrics(lyrics) + logger.debug(f'{words = }') + + romanized_words: list[str] = yoasobi_project.extract_romanji_from_words(words) + logger.debug(f'{romanized_words = }') + + part_of_speech_list: list[str] = yoasobi_project.extract_part_of_speech_from_words(words) + logger.debug(f'{part_of_speech_list = }') + + return part_of_speech_list, romanized_words, song_name, words + + +if __name__ == '__main__': + db_dir = 'yoasobi.db' + YoasobiScraper(db_dir).main() +