forked from 411A/DuomeScraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
211 lines (172 loc) · 7.79 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
from playwright.async_api import (
Playwright, async_playwright,
)
from playwright._impl._api_types import (
TimeoutError as PWTimeOutErr,
)
import asyncio
from pathlib import Path
import re
from rich.console import Console
from rich import print as richprint
import random
import json
import csv
# Vocabulary page to scrape; the two path segments after "vocabulary/" are language codes
DUOLINGO_WORDS_URL = "https://duome.eu/vocabulary/en/ja"
# Base name for the output file: the URL without its fixed prefix, slashes turned into underscores
FILENAME = "_".join(
    DUOLINGO_WORDS_URL.replace("https://duome.eu/vocabulary/", "").split("/")
)
# [
# ['original_word', 'html_original_word', 'html_definition', 'category', 'html_category'],
# [...]
# ]
async def pw_duome_scraper(playwright: Playwright) -> None:
    '''
    Scrapes the duome vocabulary page at `DUOLINGO_WORDS_URL` and appends every word as a row
    (phonetic word, definition, category) to the file `{FILENAME}_{total_words}.csv` using `append_to_csv`.
    The upper-cased language codes found in the URL are stored in the global variable `FOUND_LANGS`.
    ### Parameters
    `playwright (Playwright)`: Takes the playwright's class to open the browser asynchronously.
    ### Raises
    `ValueError`: If the URL holds no language codes, the total words counter has no number in it,
    or an expected element is missing from the page or from a word entry.
    '''
    global FOUND_LANGS

    # Extract language codes from the URL first, so a malformed URL fails before a browser is opened
    lang_code_matches = re.search(
        pattern=r"https://duome\.eu/vocabulary/([a-z]{2})/([a-z]{2})",
        string=DUOLINGO_WORDS_URL,
    )
    if lang_code_matches is None:
        raise ValueError(f"No language codes found in URL: {DUOLINGO_WORDS_URL}")
    FOUND_LANGS = [lang.upper() for lang in lang_code_matches.groups()]

    # The persistent browser profile is stored next to this script
    resolve_persistent_dir = Path(__file__).resolve().parent / "PersistentContext"

    browser = await playwright.chromium.launch_persistent_context(
        user_data_dir=str(resolve_persistent_dir),
        headless=False,
        channel="msedge",
        #slow_mo=10,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57",
        args=[
            "--disable-blink-features=AutomationControlled",
            "--disable-dev-shm-usage",
            "--disable-component-extensions-with-background-pages",
        ],
        # Set locale & timezone so websites always load in English
        locale="en-US",
        timezone_id="Europe/London",
    )

    # try/finally so the browser context is closed even when scraping fails half-way
    try:
        page = await browser.new_page()

        # Set viewport size like a fake laptop
        await page.set_viewport_size({"width": 1244, "height": 830})

        await page.goto(
            url=DUOLINGO_WORDS_URL,
            wait_until="load",
            # Timeout: 3 minutes
            timeout=180000,
        )

        #* Get total available words count
        #? document.querySelector("small[class='cCCC']").textContent
        total_words_text = await _text_of(page, "small[class='cCCC']")
        total_words_match = re.search(pattern=r"\b\d+\b", string=total_words_text)
        if total_words_match is None:
            raise ValueError(f"No word count found in text: {total_words_text!r}")
        total_words = int(total_words_match.group())

        #* Get a list of all words (li elements), exclude header alphabets that have class='single'
        #? document.querySelectorAll("div[id='words'] li:not(.single)")
        all_visible_words_element = await page.query_selector_all(
            selector="div[id='words'] li:not(.single)",
        )
        visible_words_count = len(all_visible_words_element)

        # Colorize the final word count: green on an exact match, orange (FFC411) on any mismatch.
        # Covering "more visible than total" too keeps the color always defined.
        colorized_word_count_result = "green" if visible_words_count == total_words else "#FFC411"
        richprint(
            f"[bold]Total Words Found: [{colorized_word_count_result}]{visible_words_count}[/{colorized_word_count_result}]/[green]{total_words}[/green][/bold] (Visible Words Elements/Total Words)"
        )

        rich_console = Console()
        # The output file name does not change between words, build it once
        csv_filename = f'{FILENAME}_{total_words}.csv'

        # Iterate through elements & access their attributes|content
        for i, word_element in enumerate(all_visible_words_element):
            # end="\r" to print on same line as before (no print on newline, updating current line)
            rich_console.print(f"[bold][red] Scarping Words:[/red] [#C5FF33]{i + 1}[/#C5FF33]/[green]{total_words}[/green][/bold]", end="\r")

            #* Access original word
            #? word_element.querySelector("span[class='hide wN']").textContent
            original_word = await _text_of(word_element, "span[class='hide wN']")

            #* Access original word with phonetic symbols (e.g. ò)
            #? word_element.querySelector("span[class='speak xs voice']").textContent
            original_phoneticword = await _text_of(word_element, "span[class='speak xs voice']")

            #* Access definition (displayed on hover on the word)
            #? word_element.querySelector("span[class='wA']").getAttribute("title")
            word_definition_element = await _require_element(word_element, "span[class='wA']")
            # A missing title attribute becomes an empty definition instead of the text "None"
            word_definition = await word_definition_element.get_attribute(name="title") or ""

            #* Remove the first "[original_word]" (and any whitespace after it) from the definition
            word_definition = re.sub(
                # The word is escaped so regex metacharacters in it are matched literally
                pattern=rf"\[{re.escape(original_word)}\]\s*",
                repl="",
                string=word_definition,
                # Only the first occurrence
                count=1,
            )

            #* Access word category (part of speech), like: Adverb, must remove "· " from the string
            #? word_element.querySelector("small[class='cCCC wP']").textContent
            word_category_text = await _text_of(word_element, "small[class='cCCC wP']")
            #* Remove the first dot and any number of whitespaces after it
            word_category = re.sub(
                pattern=r"·\s*",
                repl="",
                string=word_category_text,
                # Only the first occurrence
                count=1,
            )

            append_to_csv([original_phoneticword, word_definition, word_category], csv_filename)
    finally:
        await browser.close()


async def _require_element(parent, selector: str):
    '''
    Returns the first element under `parent` (a page or an element) that matches `selector`.
    Raises `ValueError` when nothing matches, instead of failing later on a `None` result.
    '''
    element = await parent.query_selector(selector=selector)
    if element is None:
        raise ValueError(f"No element matches selector: {selector}")
    return element


async def _text_of(parent, selector: str) -> str:
    '''
    Returns the text content of the first element under `parent` that matches `selector`.
    '''
    element = await _require_element(parent, selector)
    return str(await element.text_content())
def append_to_csv(input_data, filename):
    """Add one row to the end of a CSV file.

    Args:
        input_data (list): The column values of the new row, in order.
        filename (str): Path of the CSV file to extend.
    """
    # newline='' leaves line endings to the csv module; mode 'a' keeps the rows already written
    with open(filename, mode='a', newline='', encoding='utf-8') as target:
        csv.writer(target).writerow(input_data)
async def main():
    '''
    Starts Playwright and runs `pw_duome_scraper` with it; Playwright is shut down once the `async with` block is left.
    '''
    async with async_playwright() as pw:
        await pw_duome_scraper(playwright=pw)
# Run the async function only when this file is executed as a script,
# so importing the module does not launch the browser
if __name__ == "__main__":
    asyncio.run(
        main()
    )