Skip to content

Commit

Permalink
Fixed generate_cloze. Improved app and many fixes.
Browse files Browse the repository at this point in the history
  • Loading branch information
mohamedmujtabaraza committed Apr 2, 2022
1 parent def79f3 commit 35aef4c
Show file tree
Hide file tree
Showing 8 changed files with 212 additions and 116 deletions.
1 change: 1 addition & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions run_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from src.app_cli import run_spider
from src.dict_scraper.spiders import cambridge

from src.lib.json_to_apkg import generate_cloze

if __name__ == '__main__':
word_url = "https://dictionary.cambridge.org/dictionary/english/sit"
Expand All @@ -18,13 +18,14 @@
'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'Referer': 'https://www.google.com'
}
# phrase = generate_cloze("an eye for an eye")
# # response = requests.get(gcurl, headers=headers)
# # print(response.content)
# CONTAINER['url'] = gcurl

http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=1.0, read=2.0))
response = http.request('GET', word_url, headers=headers, retries=urllib3.Retry(5, redirect=2))
print(response.status)
# http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=1.0, read=2.0))
# response = http.request('GET', word_url, headers=headers, retries=urllib3.Retry(5, redirect=2))
# print(response.status)
# print(response.data)
# print(response["headers"])

Expand Down
Binary file modified src/__pycache__/app.cpython-38.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion src/app.kv
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ ScreenManager:
MDRectangleFlatButton:
text: 'Generate Anki Flashcard'
font_style: 'Button'
pos_hint: {'center_x': 0.5, 'center_y': 0.2}
pos_hint: {'center_x': 0.5, 'center_y': 0.25}
on_release: root.show_data()


Expand Down
53 changes: 41 additions & 12 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@

CONTAINER = {'current_url': '', 'requests': []}
DICTIONARIES = {
"Cambridge": "https://dictionary.cambridge.org/dictionary/english/",
"Dictionary.com": "https://www.dictionary.com/browse/",
"Merriam-Webster": "https://www.merriam-webster.com/dictionary/",
"Oxford": "https://www.oxfordlearnersdictionaries.com/definition/english/",
"Vocabulary.com": "https://www.vocabulary.com/dictionary/",
"Cambridge": "dictionary.cambridge.org/dictionary/english/",
"Dictionary.com": "dictionary.com/browse/",
"Merriam-Webster": "merriam-webster.com/dictionary/",
"Oxford": "oxfordlearnersdictionaries.com/definition/english/",
"Vocabulary.com": "vocabulary.com/dictionary/",
}
HEADERS = {
'Referer': 'https://www.google.com'
Expand All @@ -64,13 +64,22 @@
http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=1.0, read=2.0))


def remove_http_www(url):
    """Return *url* without a leading scheme and without a leading ``www.``.

    The result is used as a normalised key, so that
    ``https://www.example.com/x``, ``http://example.com/x`` and
    ``example.com/x`` all map to ``example.com/x``.

    Only a prefix is ever removed: text such as ``http`` or ``www`` that
    appears later in the path is left untouched, and a URL without a scheme
    is returned unchanged instead of raising ``IndexError``.

    :param url: URL string, with or without scheme / ``www.`` prefix.
    :return: the URL with those prefixes stripped.
    """
    # Anchor on the start of the string: a bare substring test for 'http'
    # would also fire on paths like ".../browse/http" and then fail on
    # split('//')[1]. The optional scheme also covers "//host/path".
    url = re.sub(r'^(?:https?:)?//', '', url, flags=re.IGNORECASE)
    # Escape the dot and anchor the match so only a leading "www." host
    # prefix is removed, not every "www" + any character inside the URL.
    url = re.sub(r'^www\.', '', url, flags=re.IGNORECASE)
    return url


def get_webpage(word_url):
url = remove_http_www(word_url)
r_text = None
global CONTAINER
for request in CONTAINER['requests']:
if word_url in request[0]:
if url == request[0]:
print("Found")
r_text = request[1]
# print("Found")
break
if not r_text:
headers = {'User-Agent': session.headers['User-Agent'], 'Referer': 'https://www.google.com'}
Expand All @@ -81,15 +90,31 @@ def get_webpage(word_url):
except:
gcurl = "https://webcache.googleusercontent.com/search?q=cache:" + word_url
response = http.request('GET', gcurl, headers=headers, retries=urllib3.Retry(5, redirect=2))
url = gcurl
r_text = response.data
print(response.status)
print(response.getheaders())

# print(session.headers['User-Agent'], session.headers['Referer'])
# r_text = session.get(word_url, verify=False).text

CONTAINER['requests'].append((word_url, r_text))
CONTAINER['requests'].append((url, r_text))
return r_text


def clear_request(word_url=None):
    """Forget cached page(s) stored in ``CONTAINER['requests']``.

    With no (or an empty) *word_url* the whole cache is replaced by an empty
    list. Otherwise the first cached entry whose key equals the normalised
    form of *word_url* (see ``remove_http_www``) is removed.

    :param word_url: URL of the page to drop, or ``None`` to drop everything.
    :return: ``True`` if the cache was reset or an entry was removed,
        ``False`` if no entry matched *word_url*.
    """
    if word_url:
        wanted_key = remove_http_www(word_url)
        cached_pages = CONTAINER['requests']
        for entry in cached_pages:
            if wanted_key != entry[0]:
                continue
            print("Found")
            # Returning right after the removal keeps the mutation safe even
            # though the list is being iterated.
            cached_pages.remove(entry)
            return True
        return False

    # No specific URL given: start over with a fresh, empty cache list.
    CONTAINER['requests'] = []
    return True

# ----------------------------------- KIVY -------------------------------------

# Window.size = (500, 400)
Expand Down Expand Up @@ -355,13 +380,12 @@ def generate_flashcard(self, btn, section_tuple):
)

def show_data(self):
# word_url = self.word_url.text
word_url = self.ids.word_input.text
word_url = self.ids.word_input.text.split('#')[0].split('?')[0]
dict_name = None

if not validators.url(word_url):
self.toast("URL not found. Please try again")
return False
# word_url = self.word_url.text
# todo: extract word from word_url
# url_list = word_url.split('/')
# word = url_list[-2] if not url_list[-1] else url_list[-1]
Expand Down Expand Up @@ -405,8 +429,11 @@ def show_data(self):
CONTAINER['current_url'] = word_url
r_text = get_webpage(word_url)
extracted_meanings = cambridge.MeaningsSpider(BeautifulSoup(r_text, "html.parser")).parse()
if not extracted_meanings:
clear_request(word_url)
self.toast("Invalid URL. Please try again")
return False
# CONTAINER['meanings'] = extracted_meanings

# self.dialog_popup("Processing...", "Please wait. Generating Flashcard..")
meanings_screen = self.manager.get_screen("meanings_screen")
for meaning in extracted_meanings:
Expand Down Expand Up @@ -521,6 +548,8 @@ def soft_restart(self):
CONTAINER['current_url'] = ''
self.root.transition.direction = 'right'
self.root.transition.duration = 0.5 # 0.5 second
meanings_screen = self.root.get_screen("meanings_screen")
meanings_screen.ids.meanings_container.clear_widgets()
self.root.current = 'menu_screen'


Expand Down
Loading

0 comments on commit 35aef4c

Please sign in to comment.