Commit

Welcome BeautifulSoup4
mohamedmujtabaraza committed Apr 1, 2022
1 parent 79ede36 commit dcb8470
Showing 24 changed files with 260 additions and 956 deletions.
5 changes: 2 additions & 3 deletions buildozer.spec
@@ -37,10 +37,9 @@ version = 0.1

# (list) Application requirements
# comma separated e.g. requirements = sqlite3,kivy
requirements = python3==3.8.10,appdirs,attrs,Automat,bcrypt,brotlipy,cached-property,certifi==2021.10.8,cffi,charset-normalizer,chevron,click,colorama,constantly,cryptography,cssselect,decorator,docutils,fake-useragent,Faker,filelock,frozendict,genanki,gTTS,hyperlink,idna,incremental,itemadapter,itemloaders,jmespath,Kivy,Kivy-Garden,kivymd,lxml,parsel,Pillow,pip,Protego,pyasn1,pyasn1-modules,pycparser,PyDispatcher,Pygments,pyOpenSSL,PySocks,python-dateutil,PyYAML,queuelib,requests,requests-file,Scrapy,scrapy-fake-useragent,service-identity,setuptools,six,tldextract,Twisted,typing_extensions,urllib3,validators,w3lib,wheel==0.37.1,zope.interface

# ,appdirs==1.4.4,attrs==21.4.0,Automat==20.2.0,bcrypt==3.2.0,brotlipy==0.7.0,cached-property==1.5.2,certifi==2021.10.8,cffi==1.15.0,charset-normalizer==2.0.12,chevron==0.14.0,click==8.0.4,colorama==0.4.4,constantly==15.1.0,cryptography==36.0.2,cssselect==1.1.0,decorator==5.1.1,docutils==0.18.1,fake-useragent==0.1.11,Faker==13.3.3,filelock==3.6.0,frozendict==2.3.0,genanki==0.13.0,gTTS==2.2.3,hyperlink==21.0.0,idna==3.3,incremental==21.3.0,itemadapter==0.5.0,itemloaders==1.0.4,jmespath==1.0.0,Kivy==2.1.0,kivy-deps.angle==0.3.2,kivy-deps.glew==0.3.1,kivy-deps.gstreamer==0.3.3,kivy-deps.sdl2==0.4.5,Kivy-Garden==0.1.5,kivymd==0.104.2,lxml==4.8.0,parsel==1.6.0,Pillow==9.0.1,pip==22.0.4,Protego==0.2.1,pyasn1==0.4.8,pyasn1-modules==0.2.7,pycparser==2.21,PyDispatcher==2.0.5,Pygments==2.11.2,pyOpenSSL==22.0.0,pypiwin32==223,PySocks==1.7.1,python-dateutil==2.8.2,pywin32==303,PyYAML==6.0,queuelib==1.6.2,requests==2.27.1,requests-file==1.5.1,Scrapy==2.6.1,scrapy-fake-useragent==1.4.4,service-identity==18.1.0,setuptools==61.1.1,six==1.16.0,tldextract==3.2.0,typing_extensions==4.1.1,urllib3==1.26.9,validators==0.18.2,w3lib==1.22.0,wheel==0.37.1,win-inet-pton==1.1.0,zope.interface==5.4.0
requirements = python3,beautifulsoup4,brotlipy,cached-property,certifi,cffi,charset-normalizer,chevron,click,colorama,cryptography,decorator,docutils,frozendict,genanki,gTTS,idna,Kivy,Kivy-Garden,kivymd,Pillow,pip,pycparser,Pygments,pyOpenSSL,PySocks,PyYAML,requests,setuptools,six,soupsieve,urllib3,validators,wheel

# beautifulsoup4==4.10.0,brotlipy==0.7.0,cached-property==1.5.2,certifi==2021.10.8,cffi==1.15.0,charset-normalizer==2.0.12,chevron==0.14.0,click==8.1.2,colorama==0.4.4,cryptography==36.0.2,decorator==5.1.1,docutils==0.18.1,frozendict==2.3.0,genanki==0.13.0,gTTS==2.2.4,idna==3.3,Kivy==2.1.0,kivy-deps.angle==0.3.2,kivy-deps.glew==0.3.1,kivy-deps.gstreamer==0.3.3,kivy-deps.sdl2==0.4.5,Kivy-Garden==0.1.5,kivymd==0.104.2,Pillow==9.0.1,pip==22.0.4,pycparser==2.21,Pygments==2.11.2,pyOpenSSL==22.0.0,pypiwin32==223,PySocks==1.7.1,pywin32==303,PyYAML==6.0,requests==2.27.1,setuptools==61.3.0,six==1.16.0,soupsieve==2.3.1,urllib3==1.26.9,validators==0.18.2,wheel==0.37.1,win-inet-pton==1.1.0

# (str) Custom source folders for requirements
# Sets custom source for any requirements with recipes
Expand Down
7 changes: 3 additions & 4 deletions environment.yml
@@ -4,12 +4,11 @@ channels:
dependencies:
- python=3.8
- pip
- twisted
- scrapy
- beautifulsoup4
- requests
- genanki
- validators
- gtts
- validators
- pip:
- scrapy-fake-useragent
- kivy[full]
- kivymd
6 changes: 1 addition & 5 deletions run.py → main.py
@@ -3,11 +3,7 @@
# scrapy genspider example example.com
# pip list --format=freeze > requirements.txt

from twisted.internet import reactor

from src.app import crawler_runner, MyApp
from src.app import MyApp

if __name__ == '__main__':
MyApp().run()
deferred = crawler_runner.join()
deferred.addBoth(lambda _: reactor.stop())
Binary file modified requirements.txt
16 changes: 9 additions & 7 deletions run_cli.py
@@ -1,10 +1,7 @@
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
import time

from src.app_cli import run_spider
from src.dict_scraper.spiders.cambridge import MeaningsSpider
from src.dict_scraper.spiders import cambridge


if __name__ == '__main__':
@@ -15,9 +12,14 @@
# # print(response.content)
# CONTAINER['url'] = gcurl

run_spider(MeaningsSpider, word_url)
time.sleep(5)
run_spider(MeaningsSpider, word_url2)
headers = {
'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'Referer': 'https://www.google.com'
}

print(run_spider(cambridge.MeaningsSpider, word_url, headers))
time.sleep(20)
print(run_spider(cambridge.MeaningsSpider, word_url2, headers))

# run_spider(CambridgeSpider, gcurl, "com", "cbed-2-4", False) # dt.now().strftime("%Y%m%d%H%M%S")
# run_spider("https://dictionary.cambridge.org/dictionary/english/water", "com", dt.now().strftime("%Y%m%d%H%M%S"))
11 changes: 0 additions & 11 deletions scrapy.cfg

This file was deleted.

Binary file modified src/__pycache__/app.cpython-38.pyc
Binary file modified src/__pycache__/app_cli.cpython-38.pyc
51 changes: 19 additions & 32 deletions src/app.py
@@ -1,9 +1,6 @@
import os
import platform
import subprocess
# from multiprocessing import Process, Queue
from threading import Thread
from queue import Queue
import webbrowser
import validators
from datetime import datetime as dt
@@ -12,7 +9,6 @@
import json
import re

from twisted.internet import reactor
from kivy import platform as kplatform
from kivymd.app import MDApp
from kivymd.toast import toast
@@ -28,25 +24,15 @@
from kivymd.uix.expansionpanel import MDExpansionPanel, MDExpansionPanelTwoLine
from kivymd.uix.label import MDLabel, MDIcon
from kivymd.uix.gridlayout import MDGridLayout
import kivymd
import kivy
from kivy.config import Config
from kivy.uix.widget import Widget
from kivymd.uix.textfield import MDTextField
from kivymd.uix.floatlayout import FloatLayout
from kivy.uix.boxlayout import BoxLayout
from kivy.uix.button import Button
from kivy.utils import get_color_from_hex
from gtts import gTTS
import scrapy
from scrapy import signals
from scrapy.http.request import Request
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner, CrawlerProcess

from src.dict_scraper.spiders.cambridge import CambridgeSpider, MeaningsSpider
from src.dict_scraper.items import CambridgeDictionaryItem
from src.lib.json_to_apkg import JsonToApkg
from src.dict_scraper.spiders import cambridge

# os.environ["KIVY_NO_CONSOLELOG"] = "1"
# kivy.require('1.9.0')
@@ -59,7 +45,12 @@
"Oxford": "https://www.oxfordlearnersdictionaries.com/definition/english/",
"Vocabulary.com": "https://www.vocabulary.com/dictionary/",
}

HEADERS = {
'User-Agent':
'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/85.0.4183.140 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'Referer': 'https://www.google.com'
}

# class UrlCrawlerScript(Process):
# def __init__(self, spider, q, *args):
@@ -74,22 +65,18 @@
# deferred.addBoth(lambda _: reactor.stop())
# reactor.run()

def run_spider(runner, spider, *args):
q = Queue()
runner.crawl(spider, q, args)
if not reactor.running:
Thread(target=reactor.run).start()
if spider is MeaningsSpider:
CONTAINER['meanings'] = q.get()[0]
else: # spider is CambridgeSpider:
CONTAINER['dictionary'] = q.get()[0]


crawler_runner = CrawlerRunner(get_project_settings())
def run_spider(soup_spider, url, headers, *args, **kwargs):
spider = soup_spider(url, headers, *args, **kwargs)
results = spider.parse()
if soup_spider is cambridge.MeaningsSpider:
CONTAINER['meanings'] = results
else: # spider is cambridge.CambridgeSpider:
CONTAINER['dictionary'] = results
return results

# ----------------------------------- KIVY -------------------------------------


# Window.size = (500, 400)


@@ -341,7 +328,7 @@ def confirm_generation(self, section_tuple):

def generate_flashcard(self, btn, section_tuple):
print(section_tuple)
run_spider(crawler_runner, CambridgeSpider, CONTAINER['url'], self.tld, section_tuple)
extracted_dictionary = cambridge.CambridgeSpider(CONTAINER['url'], HEADERS, self.tld, section_tuple).parse()
MDApp.get_running_app().soft_restart()
self.dialog_popup(
"Open Anki Package?",
@@ -399,12 +386,12 @@ def show_data(self):
# gcurl = "https://webcache.googleusercontent.com/search?q=cache:" + word_url

CONTAINER['url'] = word_url
run_spider(crawler_runner, MeaningsSpider, word_url)
extracted_meanings = cambridge.MeaningsSpider(word_url, HEADERS).parse()
# CONTAINER['meanings'] = extracted_meanings

# self.dialog_popup("Processing...", "Please wait. Generating Flashcard..")
# print(CONTAINER['meanings'])
meanings_screen = self.manager.get_screen("meanings_screen")
for meaning in CONTAINER['meanings']:
for meaning in extracted_meanings:
section_ids = meaning['cid']
word = meaning['word']
guide_word = meaning['gw']
(diffs for the remaining changed files did not load)

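Note: the rewritten spiders in src/dict_scraper/spiders/cambridge.py are not part of the diff shown above. A minimal sketch of the interface the call sites imply — __init__(url, headers), a synchronous parse(), and result dicts carrying the 'cid', 'word', and 'gw' keys read in show_data() — assuming a plain requests + BeautifulSoup fetch; the selectors below are illustrative placeholders, not the commit's actual markup handling:

import requests
from bs4 import BeautifulSoup


class MeaningsSpider:
    # Hypothetical sketch of the spider interface implied by the call sites
    # above; not the commit's actual implementation.
    def __init__(self, url, headers):
        self.url = url
        self.headers = headers

    def parse(self):
        # Fetch the page with the Googlebot-style HEADERS added in this
        # commit, then parse it synchronously instead of via Twisted.
        response = requests.get(self.url, headers=self.headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        meanings = []
        for entry in soup.select(".pr.dsense"):  # placeholder selector
            headword = entry.select_one(".hw")
            guideword = entry.select_one(".gw")
            meanings.append({
                "cid": entry.get("id", ""),
                "word": headword.get_text(strip=True) if headword else "",
                "gw": guideword.get_text(strip=True) if guideword else "",
            })
        return meanings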