
Commit

Added requests-random-user-agent package; improved CONTAINER value retrieval (i.e., fewer requests).
mohamedmujtabaraza committed Apr 1, 2022
1 parent dcb8470 commit f7e0c26
Showing 9 changed files with 64 additions and 56 deletions.
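
For context, a minimal sketch of what the new dependency changes (an assumption based on requests-random-user-agent's documented behavior, not code from this repository): importing the package is enough to patch requests so that every new Session carries a randomly chosen browser User-Agent instead of the default python-requests string, which is what lets this commit drop its hard-coded Googlebot header.

import requests
import requests_random_user_agent  # noqa: F401 -- patching happens at import time (assumed behavior)

s = requests.Session()
# With the patch applied, this prints a random browser UA string,
# not "python-requests/x.y.z".
print(s.headers['User-Agent'])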
4 changes: 2 additions & 2 deletions buildozer.spec
@@ -37,9 +37,9 @@ version = 0.1

# (list) Application requirements
# comma separated e.g. requirements = sqlite3,kivy
requirements = python3,beautifulsoup4,brotlipy,cached-property,certifi,cffi,charset-normalizer,chevron,click,colorama,cryptography,decorator,docutils,frozendict,genanki,gTTS,idna,Kivy,Kivy-Garden,kivymd,Pillow,pip,pycparser,Pygments,pyOpenSSL,PySocks,PyYAML,requests,setuptools,six,soupsieve,urllib3,validators,wheel
requirements = python3,beautifulsoup4,brotlipy,cached-property,certifi,cffi,charset-normalizer,chevron,click,colorama,cryptography,decorator,docutils,frozendict,genanki,gTTS,idna,Kivy,Kivy-Garden,kivymd,Pillow,pip,pycparser,Pygments,pyOpenSSL,PySocks,PyYAML,requests,requests-random-user-agent,setuptools,six,soupsieve,urllib3,validators,wheel

# beautifulsoup4==4.10.0,brotlipy==0.7.0,cached-property==1.5.2,certifi==2021.10.8,cffi==1.15.0,charset-normalizer==2.0.12,chevron==0.14.0,click==8.1.2,colorama==0.4.4,cryptography==36.0.2,decorator==5.1.1,docutils==0.18.1,frozendict==2.3.0,genanki==0.13.0,gTTS==2.2.4,idna==3.3,Kivy==2.1.0,kivy-deps.angle==0.3.2,kivy-deps.glew==0.3.1,kivy-deps.gstreamer==0.3.3,kivy-deps.sdl2==0.4.5,Kivy-Garden==0.1.5,kivymd==0.104.2,Pillow==9.0.1,pip==22.0.4,pycparser==2.21,Pygments==2.11.2,pyOpenSSL==22.0.0,pypiwin32==223,PySocks==1.7.1,pywin32==303,PyYAML==6.0,requests==2.27.1,setuptools==61.3.0,six==1.16.0,soupsieve==2.3.1,urllib3==1.26.9,validators==0.18.2,wheel==0.37.1,win-inet-pton==1.1.0
# beautifulsoup4==4.10.0,brotlipy==0.7.0,cached-property==1.5.2,certifi==2021.10.8,cffi==1.15.0,charset-normalizer==2.0.12,chevron==0.14.0,click==8.1.2,colorama==0.4.4,cryptography==36.0.2,decorator==5.1.1,docutils==0.18.1,frozendict==2.3.0,genanki==0.13.0,gTTS==2.2.4,idna==3.3,Kivy==2.1.0,kivy-deps.angle==0.3.2,kivy-deps.glew==0.3.1,kivy-deps.gstreamer==0.3.3,kivy-deps.sdl2==0.4.5,Kivy-Garden==0.1.5,kivymd==0.104.2,Pillow==9.0.1,pip==22.0.4,pycparser==2.21,Pygments==2.11.2,pyOpenSSL==22.0.0,pypiwin32==223,PySocks==1.7.1,pywin32==303,PyYAML==6.0,requests==2.27.1,requests-random-user-agent==2022.1.23,setuptools==61.3.0,six==1.16.0,soupsieve==2.3.1,urllib3==1.26.9,validators==0.18.2,wheel==0.37.1,win-inet-pton==1.1.0

# (str) Custom source folders for requirements
# Sets custom source for any requirements with recipes
3 changes: 2 additions & 1 deletion environment.yml
@@ -11,4 +11,5 @@ dependencies:
- validators
- pip:
- kivy[full]
- kivymd
- kivymd
- requests-random-user-agent
3 changes: 3 additions & 0 deletions main.py
@@ -2,6 +2,9 @@
# cd ./src/
# scrapy genspider example example.com
# pip list --format=freeze > requirements.txt
from os import environ
if 'ANDROID_STORAGE' in environ:
    environ['UA_PLATFORM'] = "android"

from src.app import MyApp

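
Note the ordering above: UA_PLATFORM is set before src.app is imported. The same ordering appears in run_cli.py below, where the variable is set before import requests_random_user_agent, which suggests the package reads UA_PLATFORM when it is first imported; ANDROID_STORAGE is defined by the Android runtime, so its presence doubles as a platform check. A hedged sketch of the pattern:

from os import environ

# Assumption: UA_PLATFORM must be set before requests_random_user_agent is
# first imported, mirroring the ordering this commit uses in both entry points.
if 'ANDROID_STORAGE' in environ:        # present under the Android runtime
    environ['UA_PLATFORM'] = "android"  # limit the UA pool to Android strings

import requests
import requests_random_user_agent  # noqa: F401

print(requests.Session().headers['User-Agent'])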
Binary file modified requirements.txt
Binary file not shown.
21 changes: 17 additions & 4 deletions run_cli.py
@@ -1,4 +1,7 @@
import time
import os
os.environ['UA_PLATFORM'] = "android"
import requests
import requests_random_user_agent

from src.app_cli import run_spider
from src.dict_scraper.spiders import cambridge
@@ -12,14 +15,24 @@
# # print(response.content)
# CONTAINER['url'] = gcurl

s = requests.Session()
s.headers.update({'Referer': 'https://www.google.com'})
print(s.headers['User-Agent'], s.headers['Referer'])

# Without a session
resp = requests.get('https://httpbin.org/user-agent')
print(resp.json()['user-agent'])

headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Referer': 'https://www.google.com'
}
# 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) '
# 'Chrome/85.0.4183.140 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'

print(run_spider(cambridge.MeaningsSpider, word_url, headers))
time.sleep(20)
print(run_spider(cambridge.MeaningsSpider, word_url2, headers))
# print(run_spider(cambridge.MeaningsSpider, word_url, headers))
# time.sleep(20)
# print(run_spider(cambridge.MeaningsSpider, word_url2, headers))

# run_spider(CambridgeSpider, gcurl, "com", "cbed-2-4", False) # dt.now().strftime("%Y%m%d%H%M%S")
# run_spider("https://dictionary.cambridge.org/dictionary/english/water", "com", dt.now().strftime("%Y%m%d%H%M%S"))
Binary file modified src/__pycache__/app.cpython-38.pyc
Binary file not shown.
Binary file modified src/__pycache__/app_cli.cpython-38.pyc
Binary file not shown.
73 changes: 35 additions & 38 deletions src/app.py
@@ -2,13 +2,15 @@
import platform
import subprocess
import webbrowser
import validators
from datetime import datetime as dt
import time
import requests
import json
import re

import requests
import validators
import requests_random_user_agent
from bs4 import BeautifulSoup
from kivy import platform as kplatform
from kivymd.app import MDApp
from kivymd.toast import toast
@@ -37,7 +39,7 @@
# os.environ["KIVY_NO_CONSOLELOG"] = "1"
# kivy.require('1.9.0')

CONTAINER = {'url': '', 'dictionary': [], 'meanings': []}
CONTAINER = {'current_url': '', 'requests': []}
DICTIONARIES = {
"Cambridge": "https://dictionary.cambridge.org/dictionary/english/",
"Dictionary.com": "https://www.dictionary.com/browse/",
@@ -46,34 +48,26 @@
"Vocabulary.com": "https://www.vocabulary.com/dictionary/",
}
HEADERS = {
    'User-Agent':
        'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/85.0.4183.140 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Referer': 'https://www.google.com'
}

# class UrlCrawlerScript(Process):
#     def __init__(self, spider, q, *args):
#         Process.__init__(self)
#         self.runner = CrawlerRunner(get_project_settings())
#         self.spider = spider
#         self.q = q
#         self.args = args
#
#     def run(self):
#         deferred = self.runner.crawl(self.spider, self.q, self.args)
#         deferred.addBoth(lambda _: reactor.stop())
#         reactor.run()

def run_spider(soup_spider, url, headers, *args, **kwargs):
    spider = soup_spider(url, headers, *args, **kwargs)
    results = spider.parse()
    if soup_spider is cambridge.MeaningsSpider:
        CONTAINER['meanings'] = results
    else:  # spider is cambridge.CambridgeSpider:
        CONTAINER['dictionary'] = results
    return results
s = requests.Session()
s.headers.update(HEADERS)


def get_webpage(word_url):
    r_text = None
    global CONTAINER
    for request in CONTAINER['requests']:
        if word_url in request[0]:
            r_text = request[1]
            # print("Found")
            break
    if not r_text:
        print(s.headers['User-Agent'], s.headers['Referer'])
        r_text = s.get(word_url).text
        CONTAINER['requests'].append((word_url, r_text))
    return r_text

# ----------------------------------- KIVY -------------------------------------

@@ -328,7 +322,10 @@ def confirm_generation(self, section_tuple):

def generate_flashcard(self, btn, section_tuple):
    print(section_tuple)
    extracted_dictionary = cambridge.CambridgeSpider(CONTAINER['url'], HEADERS, self.tld, section_tuple).parse()
    r_text = get_webpage(CONTAINER['current_url'])
    extracted_dictionary = cambridge.CambridgeSpider(
        BeautifulSoup(r_text, "html.parser"), self.tld, section_tuple
    ).parse()
    MDApp.get_running_app().soft_restart()
    self.dialog_popup(
        "Open Anki Package?",
@@ -339,7 +336,7 @@ def generate_flashcard(self, btn, section_tuple):
def show_data(self):
    # word_url = self.word_url.text
    word_url = self.ids.word_input.text
    url_found = False
    dict_name = None

    if not validators.url(word_url):
        self.toast("URL not found. Please try again")
@@ -359,11 +356,11 @@ def show_data(self):
    # #     'DEPTH_LIMIT': 2,
    # #     'CLOSESPIDER_PAGECOUNT': 3,
    # })
    for dict_url in DICTIONARIES.values():
    for name, dict_url in DICTIONARIES.items():
        if dict_url in word_url:
            url_found = True
            dict_name = name
            break
    if url_found:
    if dict_name:
        # d = runner.crawl(
        #     CambridgeSpider,
        #     url=word_url,
@@ -384,9 +381,9 @@ def show_data(self):
        # return False

        # gcurl = "https://webcache.googleusercontent.com/search?q=cache:" + word_url

        CONTAINER['url'] = word_url
        extracted_meanings = cambridge.MeaningsSpider(word_url, HEADERS).parse()
        CONTAINER['current_url'] = word_url
        r_text = get_webpage(word_url)
        extracted_meanings = cambridge.MeaningsSpider(BeautifulSoup(r_text, "html.parser")).parse()
        # CONTAINER['meanings'] = extracted_meanings

        # self.dialog_popup("Processing...", "Please wait. Generating Flashcard..")
@@ -484,7 +481,7 @@ def restart(self):
    self.root.clear_widgets()
    self.stop()
    global CONTAINER
    CONTAINER = {'url': '', 'dictionary': [], 'meanings': []}
    CONTAINER['current_url'] = ''
    return MyApp().run()

def change_screen(self):
@@ -500,7 +497,7 @@ def change_screen(self):

def soft_restart(self):
    global CONTAINER
    CONTAINER = {'url': '', 'dictionary': [], 'meanings': []}
    CONTAINER['current_url'] = ''
    self.root.transition.direction = 'right'
    self.root.transition.duration = 0.5  # 0.5 second
    self.root.current = 'menu_screen'
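
The new get_webpage helper is what the commit message means by "fewer requests": each fetched page is memoized in CONTAINER['requests'] as a (url, text) tuple, so showing the meanings and then generating a flashcard for the same word reuses a single download. A standalone sketch of the same pattern (the dict-based cache is illustrative; the commit itself keeps a list of tuples):

import requests

_session = requests.Session()
_page_cache = {}  # url -> page body

def fetch_cached(url: str) -> str:
    # Hit the network at most once per URL; afterwards serve from memory.
    if url not in _page_cache:
        _page_cache[url] = _session.get(url).text
    return _page_cache[url]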
16 changes: 5 additions & 11 deletions src/dict_scraper/spiders/cambridge.py
@@ -3,7 +3,6 @@

import requests
from gtts import gTTS
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag

from src.lib.json_to_apkg import JsonToApkg
@@ -40,11 +39,8 @@ def extract_text(data, join_char=''):


class MeaningsSpider:
    def __init__(self, url, headers, *args, **kwargs):
        self.url = url
        self.headers = headers
        self.result = requests.get(self.url, headers=self.headers)
        self.soup = BeautifulSoup(self.result.text, "html.parser")
    def __init__(self, soup, *args, **kwargs):
        self.soup = soup

    def parse(self):
        # print(response.request.headers.get('Referer', None))
@@ -180,13 +176,11 @@ def parse(self):


class CambridgeSpider:
    def __init__(self, url, headers, *args, **kwargs):
        self.url = url
        self.headers = headers
    def __init__(self, soup, *args, **kwargs):
        # print(url, headers, args)
        self.soup = soup
        self.tld = args[0]
        self.section_tuple = args[1]
        self.result = requests.get(self.url, headers=self.headers)
        self.soup = BeautifulSoup(self.result.text, "html.parser")

    # allowed_domains = ['dictionary.cambridge.org']
    # allowed_domains = ['web.archive.org']
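
With this refactor the spiders no longer download anything themselves: they receive an already-parsed BeautifulSoup document, so a single fetched page can feed both MeaningsSpider and CambridgeSpider. A hedged usage sketch (the URL and arguments are illustrative, not from the repository):

import requests
from bs4 import BeautifulSoup

from src.dict_scraper.spiders import cambridge

html = requests.get("https://dictionary.cambridge.org/dictionary/english/water").text
soup = BeautifulSoup(html, "html.parser")

meanings = cambridge.MeaningsSpider(soup).parse()
# CambridgeSpider additionally expects tld and section_tuple positionally:
# entry = cambridge.CambridgeSpider(soup, "org", section_tuple).parse()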

