
Commit

Added requests-random-user-agent package; improved CONTAINER value retrieval (i.e., fewer requests).
mohamedmujtabaraza committed Apr 1, 2022
1 parent dcb8470 commit f7e0c26
Showing 9 changed files with 64 additions and 56 deletions.
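
For context, a minimal sketch of what the new dependency changes (an assumption based on requests-random-user-agent's documented behavior, not code from this repository): importing the package is enough to patch requests so that every new Session carries a randomly chosen browser User-Agent instead of the default python-requests string, which is what lets this commit drop its hard-coded Googlebot header.

import requests
import requests_random_user_agent  # noqa: F401 -- patching happens at import time (assumed behavior)

s = requests.Session()
# With the patch applied, this prints a random browser UA string,
# not "python-requests/x.y.z".
print(s.headers['User-Agent'])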
4 changes: 2 additions & 2 deletions buildozer.spec
@@ -37,9 +37,9 @@ version = 0.1

# (list) Application requirements
# comma separated e.g. requirements = sqlite3,kivy
requirements = python3,beautifulsoup4,brotlipy,cached-property,certifi,cffi,charset-normalizer,chevron,click,colorama,cryptography,decorator,docutils,frozendict,genanki,gTTS,idna,Kivy,Kivy-Garden,kivymd,Pillow,pip,pycparser,Pygments,pyOpenSSL,PySocks,PyYAML,requests,setuptools,six,soupsieve,urllib3,validators,wheel
requirements = python3,beautifulsoup4,brotlipy,cached-property,certifi,cffi,charset-normalizer,chevron,click,colorama,cryptography,decorator,docutils,frozendict,genanki,gTTS,idna,Kivy,Kivy-Garden,kivymd,Pillow,pip,pycparser,Pygments,pyOpenSSL,PySocks,PyYAML,requests,requests-random-user-agent,setuptools,six,soupsieve,urllib3,validators,wheel

# beautifulsoup4==4.10.0,brotlipy==0.7.0,cached-property==1.5.2,certifi==2021.10.8,cffi==1.15.0,charset-normalizer==2.0.12,chevron==0.14.0,click==8.1.2,colorama==0.4.4,cryptography==36.0.2,decorator==5.1.1,docutils==0.18.1,frozendict==2.3.0,genanki==0.13.0,gTTS==2.2.4,idna==3.3,Kivy==2.1.0,kivy-deps.angle==0.3.2,kivy-deps.glew==0.3.1,kivy-deps.gstreamer==0.3.3,kivy-deps.sdl2==0.4.5,Kivy-Garden==0.1.5,kivymd==0.104.2,Pillow==9.0.1,pip==22.0.4,pycparser==2.21,Pygments==2.11.2,pyOpenSSL==22.0.0,pypiwin32==223,PySocks==1.7.1,pywin32==303,PyYAML==6.0,requests==2.27.1,setuptools==61.3.0,six==1.16.0,soupsieve==2.3.1,urllib3==1.26.9,validators==0.18.2,wheel==0.37.1,win-inet-pton==1.1.0
# beautifulsoup4==4.10.0,brotlipy==0.7.0,cached-property==1.5.2,certifi==2021.10.8,cffi==1.15.0,charset-normalizer==2.0.12,chevron==0.14.0,click==8.1.2,colorama==0.4.4,cryptography==36.0.2,decorator==5.1.1,docutils==0.18.1,frozendict==2.3.0,genanki==0.13.0,gTTS==2.2.4,idna==3.3,Kivy==2.1.0,kivy-deps.angle==0.3.2,kivy-deps.glew==0.3.1,kivy-deps.gstreamer==0.3.3,kivy-deps.sdl2==0.4.5,Kivy-Garden==0.1.5,kivymd==0.104.2,Pillow==9.0.1,pip==22.0.4,pycparser==2.21,Pygments==2.11.2,pyOpenSSL==22.0.0,pypiwin32==223,PySocks==1.7.1,pywin32==303,PyYAML==6.0,requests==2.27.1,requests-random-user-agent==2022.1.23,setuptools==61.3.0,six==1.16.0,soupsieve==2.3.1,urllib3==1.26.9,validators==0.18.2,wheel==0.37.1,win-inet-pton==1.1.0

# (str) Custom source folders for requirements
# Sets custom source for any requirements with recipes
3 changes: 2 additions & 1 deletion environment.yml
@@ -11,4 +11,5 @@ dependencies:
- validators
- pip:
- kivy[full]
- kivymd
- kivymd
- requests-random-user-agent
3 changes: 3 additions & 0 deletions main.py
@@ -2,6 +2,9 @@
# cd ./src/
# scrapy genspider example example.com
# pip list --format=freeze > requirements.txt
from os import environ
if 'ANDROID_STORAGE' in environ:
    environ['UA_PLATFORM'] = "android"

from src.app import MyApp

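
Note the ordering above: UA_PLATFORM is set before src.app is imported. The same ordering appears in run_cli.py below, where the variable is set before import requests_random_user_agent, which suggests the package reads UA_PLATFORM when it is first imported; ANDROID_STORAGE is defined by the Android runtime, so its presence doubles as a platform check. A hedged sketch of the pattern:

from os import environ

# Assumption: UA_PLATFORM must be set before requests_random_user_agent is
# first imported, mirroring the ordering this commit uses in both entry points.
if 'ANDROID_STORAGE' in environ:        # present under the Android runtime
    environ['UA_PLATFORM'] = "android"  # limit the UA pool to Android strings

import requests
import requests_random_user_agent  # noqa: F401

print(requests.Session().headers['User-Agent'])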
Binary file modified requirements.txt
Binary file not shown.
21 changes: 17 additions & 4 deletions run_cli.py
@@ -1,4 +1,7 @@
import time
import os
os.environ['UA_PLATFORM'] = "android"
import requests
import requests_random_user_agent

from src.app_cli import run_spider
from src.dict_scraper.spiders import cambridge
@@ -12,14 +15,24 @@
# # print(response.content)
# CONTAINER['url'] = gcurl

s = requests.Session()
s.headers.update({'Referer': 'https://www.google.com'})
print(s.headers['User-Agent'], s.headers['Referer'])

# Without a session
resp = requests.get('https://httpbin.org/user-agent')
print(resp.json()['user-agent'])

headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Referer': 'https://www.google.com'
}
# 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) '
# 'Chrome/85.0.4183.140 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'

print(run_spider(cambridge.MeaningsSpider, word_url, headers))
time.sleep(20)
print(run_spider(cambridge.MeaningsSpider, word_url2, headers))
# print(run_spider(cambridge.MeaningsSpider, word_url, headers))
# time.sleep(20)
# print(run_spider(cambridge.MeaningsSpider, word_url2, headers))

# run_spider(CambridgeSpider, gcurl, "com", "cbed-2-4", False) # dt.now().strftime("%Y%m%d%H%M%S")
# run_spider("https://dictionary.cambridge.org/dictionary/english/water", "com", dt.now().strftime("%Y%m%d%H%M%S"))
Binary file modified src/__pycache__/app.cpython-38.pyc
Binary file not shown.
Binary file modified src/__pycache__/app_cli.cpython-38.pyc
Binary file not shown.
73 changes: 35 additions & 38 deletions src/app.py
@@ -2,13 +2,15 @@
import platform
import subprocess
import webbrowser
import validators
from datetime import datetime as dt
import time
import requests
import json
import re

import requests
import validators
import requests_random_user_agent
from bs4 import BeautifulSoup
from kivy import platform as kplatform
from kivymd.app import MDApp
from kivymd.toast import toast
@@ -37,7 +39,7 @@
# os.environ["KIVY_NO_CONSOLELOG"] = "1"
# kivy.require('1.9.0')

CONTAINER = {'url': '', 'dictionary': [], 'meanings': []}
CONTAINER = {'current_url': '', 'requests': []}
DICTIONARIES = {
"Cambridge": "https://dictionary.cambridge.org/dictionary/english/",
"Dictionary.com": "https://www.dictionary.com/browse/",
@@ -46,34 +48,26 @@
"Vocabulary.com": "https://www.vocabulary.com/dictionary/",
}
HEADERS = {
    'User-Agent':
        'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/85.0.4183.140 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Referer': 'https://www.google.com'
}

# class UrlCrawlerScript(Process):
#     def __init__(self, spider, q, *args):
#         Process.__init__(self)
#         self.runner = CrawlerRunner(get_project_settings())
#         self.spider = spider
#         self.q = q
#         self.args = args
#
#     def run(self):
#         deferred = self.runner.crawl(self.spider, self.q, self.args)
#         deferred.addBoth(lambda _: reactor.stop())
#         reactor.run()

def run_spider(soup_spider, url, headers, *args, **kwargs):
    spider = soup_spider(url, headers, *args, **kwargs)
    results = spider.parse()
    if soup_spider is cambridge.MeaningsSpider:
        CONTAINER['meanings'] = results
    else:  # spider is cambridge.CambridgeSpider:
        CONTAINER['dictionary'] = results
    return results
s = requests.Session()
s.headers.update(HEADERS)


def get_webpage(word_url):
    r_text = None
    global CONTAINER
    for request in CONTAINER['requests']:
        if word_url in request[0]:
            r_text = request[1]
            # print("Found")
            break
    if not r_text:
        print(s.headers['User-Agent'], s.headers['Referer'])
        r_text = s.get(word_url).text
        CONTAINER['requests'].append((word_url, r_text))
    return r_text

# ----------------------------------- KIVY -------------------------------------

@@ -328,7 +322,10 @@ def confirm_generation(self, section_tuple):

def generate_flashcard(self, btn, section_tuple):
    print(section_tuple)
    extracted_dictionary = cambridge.CambridgeSpider(CONTAINER['url'], HEADERS, self.tld, section_tuple).parse()
    r_text = get_webpage(CONTAINER['current_url'])
    extracted_dictionary = cambridge.CambridgeSpider(
        BeautifulSoup(r_text, "html.parser"), self.tld, section_tuple
    ).parse()
    MDApp.get_running_app().soft_restart()
    self.dialog_popup(
        "Open Anki Package?",
@@ -339,7 +336,7 @@ def generate_flashcard(self, btn, section_tuple):
def show_data(self):
    # word_url = self.word_url.text
    word_url = self.ids.word_input.text
    url_found = False
    dict_name = None

    if not validators.url(word_url):
        self.toast("URL not found. Please try again")
@@ -359,11 +356,11 @@ def show_data(self):
    # #     'DEPTH_LIMIT': 2,
    # #     'CLOSESPIDER_PAGECOUNT': 3,
    # })
    for dict_url in DICTIONARIES.values():
    for name, dict_url in DICTIONARIES.items():
        if dict_url in word_url:
            url_found = True
            dict_name = name
            break
    if url_found:
    if dict_name:
        # d = runner.crawl(
        #     CambridgeSpider,
        #     url=word_url,
@@ -384,9 +381,9 @@ def show_data(self):
        # return False

        # gcurl = "https://webcache.googleusercontent.com/search?q=cache:" + word_url

        CONTAINER['url'] = word_url
        extracted_meanings = cambridge.MeaningsSpider(word_url, HEADERS).parse()
        CONTAINER['current_url'] = word_url
        r_text = get_webpage(word_url)
        extracted_meanings = cambridge.MeaningsSpider(BeautifulSoup(r_text, "html.parser")).parse()
        # CONTAINER['meanings'] = extracted_meanings

        # self.dialog_popup("Processing...", "Please wait. Generating Flashcard..")
@@ -484,7 +481,7 @@ def restart(self):
    self.root.clear_widgets()
    self.stop()
    global CONTAINER
    CONTAINER = {'url': '', 'dictionary': [], 'meanings': []}
    CONTAINER['current_url'] = ''
    return MyApp().run()

def change_screen(self):
@@ -500,7 +497,7 @@ def change_screen(self):

def soft_restart(self):
    global CONTAINER
    CONTAINER = {'url': '', 'dictionary': [], 'meanings': []}
    CONTAINER['current_url'] = ''
    self.root.transition.direction = 'right'
    self.root.transition.duration = 0.5  # 0.5 second
    self.root.current = 'menu_screen'
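
The new get_webpage helper is what the commit message means by "fewer requests": each fetched page is memoized in CONTAINER['requests'] as a (url, text) tuple, so showing the meanings and then generating a flashcard for the same word reuses a single download. A standalone sketch of the same pattern (the dict-based cache is illustrative; the commit itself keeps a list of tuples):

import requests

_session = requests.Session()
_page_cache = {}  # url -> page body

def fetch_cached(url: str) -> str:
    # Hit the network at most once per URL; afterwards serve from memory.
    if url not in _page_cache:
        _page_cache[url] = _session.get(url).text
    return _page_cache[url]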
16 changes: 5 additions & 11 deletions src/dict_scraper/spiders/cambridge.py
@@ -3,7 +3,6 @@

import requests
from gtts import gTTS
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag

from src.lib.json_to_apkg import JsonToApkg
@@ -40,11 +39,8 @@ def extract_text(data, join_char=''):


class MeaningsSpider:
    def __init__(self, url, headers, *args, **kwargs):
        self.url = url
        self.headers = headers
        self.result = requests.get(self.url, headers=self.headers)
        self.soup = BeautifulSoup(self.result.text, "html.parser")
    def __init__(self, soup, *args, **kwargs):
        self.soup = soup

    def parse(self):
        # print(response.request.headers.get('Referer', None))
@@ -180,13 +176,11 @@ def parse(self):


class CambridgeSpider:
    def __init__(self, url, headers, *args, **kwargs):
        self.url = url
        self.headers = headers
    def __init__(self, soup, *args, **kwargs):
        # print(url, headers, args)
        self.soup = soup
        self.tld = args[0]
        self.section_tuple = args[1]
        self.result = requests.get(self.url, headers=self.headers)
        self.soup = BeautifulSoup(self.result.text, "html.parser")

    # allowed_domains = ['dictionary.cambridge.org']
    # allowed_domains = ['web.archive.org']
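
With this refactor the spiders no longer download anything themselves: they receive an already-parsed BeautifulSoup document, so a single fetched page can feed both MeaningsSpider and CambridgeSpider. A hedged usage sketch (the URL and arguments are illustrative, not from the repository):

import requests
from bs4 import BeautifulSoup

from src.dict_scraper.spiders import cambridge

html = requests.get("https://dictionary.cambridge.org/dictionary/english/water").text
soup = BeautifulSoup(html, "html.parser")

meanings = cambridge.MeaningsSpider(soup).parse()
# CambridgeSpider additionally expects tld and section_tuple positionally:
# entry = cambridge.CambridgeSpider(soup, "org", section_tuple).parse()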

