-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
105 lines (81 loc) · 3.49 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import time
import re
import os
# --- Script setup -----------------------------------------------------------
# Remember the starting directory so the output step can return to it.
initial_path = os.getcwd()

# Load the collection of words to look up (iterated once per search below).
with open("YOUR_JSON_PATH", encoding='utf-8') as words_file:
    word_list = json.load(words_file)

# Result containers: per-word scraped data, and the words with no search hit.
output_object = {}
word_not_found = []

# Progress counters: words found / searches attempted.
nb_mots = nb_essais = 0

# Start a Firefox session and open the site's landing page.
driver = webdriver.Firefox()
driver.get('https://derja.ninja/')
# Search the site for every word in `word_list`.
#
# For each word: type it into the search box, submit, parse the resulting
# page, and store the Arabic term of the first hit plus up to three example
# sentences in `output_object`. Words without any hit are collected in
# `word_not_found`.

# Compiled once here because both patterns are reused for every word.
_whitespace_re = re.compile(r'\s+')
# Matches every run of characters that is neither whitespace nor inside
# U+0600-U+07FF; substituting it away leaves only the Arabic-script text.
# NOTE(review): the lower bound U+0600 (start of the Unicode Arabic block) is
# an assumption - confirm it covers the characters the site actually returns.
_non_arabic_re = re.compile(r'[^\u0600-\u07FF\s]+')

try:
    for word in word_list:
        # Count of searches attempted so far (shown in the progress line).
        nb_essais += 1

        input_element = driver.find_element(
            By.CSS_SELECTOR, ".search-input.search-input--large.js-search-input")
        input_element.send_keys(word)
        driver.find_element(By.CSS_SELECTOR, '.search-button').click()

        # NOTE(review): the page source is read immediately after the click;
        # presumably the results are already rendered by then - confirm, or
        # add an explicit wait if searches are missed intermittently.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('li', class_='search-result')

        if results:
            nb_mots += 1

            raw_term = results[0].find(
                'div', class_='search-result__term_in_arabic').text
            # Strip all whitespace, then drop the "Updateyourbrowser" text
            # that is left glued together once the whitespace is gone.
            term = _whitespace_re.sub('', raw_term).replace(
                "Updateyourbrowser", "")
            # Finally keep only the Arabic-script characters.
            term = _non_arabic_re.sub('', term)

            # The term is the same for every sample, so it is stored once.
            entry = {"result": term}

            # Keep the first three results of the query at most.
            for i, result in enumerate(results[:3]):
                arabic_div = result.find(
                    'div', class_='search_result__example_sentence_in_arabic')

                # Some results have no audio file: a missing element makes
                # `find` return None, and the chained call on it raises
                # AttributeError.
                audio_found = False
                try:
                    sentence_audio_arabic = arabic_div.find("audio").get("src")
                    audio_found = True
                except AttributeError as e:
                    print(e)

                # Sample sentence in English.
                sentence_in_english = result.find(
                    'div', class_='search_result__example_sentence_in_english'
                ).find("span").text
                # Its Arabic counterpart.
                sentence_in_arabic = arabic_div.find(
                    'span', class_='example-sentence').text

                sample = [sentence_in_english, sentence_in_arabic]
                if audio_found:
                    sample.append(sentence_audio_arabic)
                entry[f"sample_{i}"] = sample

            output_object[word] = entry
            print(f"{nb_mots}/{nb_essais}")
        else:
            word_not_found.append(word)

        # Go back to the main page for the next search.
        main_page = driver.find_element(
            By.CSS_SELECTOR, '.navbar > a:nth-child(2)')
        main_page.click()
        time.sleep(0.25)
finally:
    # Always close the browser, even if a lookup above raised.
    driver.quit()
# Persist the scrape: the found words go to `file_name` in the directory the
# script was started from, the words without a hit go to a file of the same
# name inside the "output_json" sub-directory.
file_name = "YOUR_FILE_NAME"
os.chdir(initial_path)

with open(file_name, 'w', encoding='utf-8') as found_file:
    # ensure_ascii=False keeps the Arabic text readable instead of \u escapes.
    json.dump(output_object, found_file, ensure_ascii=False)

# Create the target directory first so the second write cannot fail with
# FileNotFoundError after the whole scrape has already completed.
os.makedirs("output_json", exist_ok=True)
with open(f"output_json/{file_name}", 'w', encoding='utf-8') as missing_file:
    json.dump(word_not_found, missing_file, ensure_ascii=False)

# Final summary: number of words found, then the list of words not found.
print(nb_mots)
print(word_not_found)