Skip to content

Commit

Permalink
v2.1.0
Browse files Browse the repository at this point in the history
- Refactored web_scrap.py
  • Loading branch information
sakan811 committed May 24, 2024
1 parent 15430f5 commit b6d65fb
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 75 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ Update May 4th, 2024

Showcase visualizations about the common Japanese words in YOASOBI's songs' lyrics.

[![CodeQL](https://github.com/sakan811/Common-Japanese-Words-in-YOASOBI-Lyrics/actions/workflows/codeql.yml/badge.svg)](https://github.com/sakan811/Common-Japanese-Words-in-YOASOBI-Lyrics/actions/workflows/codeql.yml)

## Project Details
Lyrics were based on [genius.com](https://genius.com/artists/Yoasobi)

Expand Down
72 changes: 0 additions & 72 deletions main.py

This file was deleted.

1 change: 0 additions & 1 deletion tests/test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import sqlite3
import requests
import pytest
from bs4 import BeautifulSoup
from loguru import logger
Expand Down
2 changes: 0 additions & 2 deletions yoasobi_project/web_scrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import requests
from bs4 import BeautifulSoup, ResultSet
from loguru import logger
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def return_url_list() -> list[str]:
Expand Down
98 changes: 98 additions & 0 deletions yoasobiscraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from typing import Tuple, List

from loguru import logger

import yoasobi_project

logger.add('yoasobi.log',
format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {thread} | {name} | {module} | {function} | {line} | {message}",
mode='w')


class YoasobiScraper:
def __init__(self, db_dir: str):
"""
A 'YoasobiScraper' class creates or connects to the SQLite database when initialized.
:param db_dir: SQLite database directory.
"""
logger.info('Initializing SQLite database if not exist...')
logger.info('Connect to the database if exist')
self.db_dir = db_dir
self.engine = yoasobi_project.connect_sqlite_db(db_dir)

def main(self) -> None:
"""
Main function that runs the web-scraping process and SQLite data migration.
:return: None
"""
logger.info('Start the scraping process...')

urls: list[str] = yoasobi_project.return_url_list()

page_source_list = yoasobi_project.thread_fetch_page_source(urls)

yoasobi_project.connect_sqlite_db(self.db_dir)

query = yoasobi_project.create_table_query()
yoasobi_project.execute_sql_query(self.engine, query)

if page_source_list:
logger.info(f'Appended page sources to list successfully')

logger.info(f'Delete all rows from the table \'Words\'')
query = yoasobi_project.delete_all_rows()
yoasobi_project.execute_sql_query(self.engine, query)
else:
logger.error('No page sources were found. Not delete all rows from the \'Words\' table.')

self._scrape_each_page_source(page_source_list)

def _scrape_each_page_source(self, page_source_list) -> None:
"""
Scrape each page source.
:param page_source_list: List of page sources.
:return: None
"""
logger.info('Scraping each page source...')
for page_source in page_source_list:
lyrics_list: list[str] = yoasobi_project.scrap(page_source)
logger.debug(f'{lyrics_list = }')

part_of_speech_list, romanized_words, song_name, words = self._extract_data(lyrics_list)

yoasobi_project.insert_data(words, romanized_words, part_of_speech_list, song_name, self.db_dir)

logger.debug(f'{len(words) = }')
logger.debug(f'{len(romanized_words) = }')
logger.debug(f'{len(part_of_speech_list) = }')

@staticmethod
def _extract_data(lyrics_list: list[str]) -> tuple[list[str], list[str], str, list[str]]:
"""
Extract data from the page sources.
:param lyrics_list: Lyrics list.
:return: Tuple of Lists that contain extracted data from the page sources.
"""
logger.info('Extracting data from the page sources...')
song_name: str = yoasobi_project.extract_song_name_from_lyrics_list(lyrics_list)
logger.debug(f'{song_name = }')

lyrics: str = yoasobi_project.extract_lyrics_from_lyrics_list(lyrics_list)
logger.debug(f'{lyrics = }')

words: list[str] = yoasobi_project.extract_words_from_lyrics(lyrics)
logger.debug(f'{words = }')

romanized_words: list[str] = yoasobi_project.extract_romanji_from_words(words)
logger.debug(f'{romanized_words = }')

part_of_speech_list: list[str] = yoasobi_project.extract_part_of_speech_from_words(words)
logger.debug(f'{part_of_speech_list = }')

return part_of_speech_list, romanized_words, song_name, words


if __name__ == '__main__':
db_dir = 'yoasobi.db'
YoasobiScraper(db_dir).main()

0 comments on commit b6d65fb

Please sign in to comment.