
Commit 5956efe

Merge pull request #66 from ferru97/feature/add_selenium

Selenium & Scholar search by cites

2 parents aa06605 + 1844083, commit 5956efe

6 files changed: +76 −40 lines

PyPaperBot/Scholar.py

Lines changed: 26 additions & 10 deletions
@@ -1,6 +1,8 @@
 import time
 import requests
 import functools
+import undetected_chromedriver as uc
+from selenium.webdriver.chrome.options import Options
 from .HTMLparsers import schoolarParser
 from .Crossref import getPapersInfo
 from .NetInfo import NetInfo
@@ -18,14 +20,24 @@ def waithIPchange():
     return True


-def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
+def scholar_requests(scholar_pages, url, restrict, chrome_version, scholar_results=10):
     javascript_error = "Sorry, we can't verify that you're not a robot when JavaScript is turned off"
     to_download = []
+    driver = None
     for i in scholar_pages:
         while True:
             res_url = url % (scholar_results * (i - 1))
-            html = requests.get(res_url, headers=NetInfo.HEADERS)
-            html = html.text
+            if chrome_version is not None:
+                if driver is None:
+                    print("Using Selenium driver")
+                    options = Options()
+                    options.add_argument('--headless')
+                    driver = uc.Chrome(headless=True, use_subprocess=False, version_main=chrome_version)
+                driver.get(res_url)
+                html = driver.page_source
+            else:
+                html = requests.get(res_url, headers=NetInfo.HEADERS)
+                html = html.text

             if javascript_error in html:
                 is_continue = waithIPchange()
@@ -52,14 +64,18 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
     return to_download


-def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results=10):
-    url = r"https://scholar.google.com/scholar?hl=en&q=" + query + "&as_vis=1&as_sdt=1,5&start=%d"
-    if min_date is not None:
+def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results=10, chrome_version=None, cites=None):
+    url = r"https://scholar.google.com/scholar?hl=en&as_vis=1&as_sdt=1,5&start=%d"
+    if query:
+        if len(query) > 7 and (query.startswith("http://") or query.startswith("https://")):
+            url = query
+        else:
+            url += f"&q={query}"
+    if cites:
+        url += f"&cites={cites}"
+    if min_date:
         url += f"&as_ylo={min_date}"

-    if len(query) > 7 and (query.startswith("http://") or query.startswith("https://")):
-        url = query
-
-    to_download = scholar_requests(scholar_pages, url, restrict, scholar_results)
+    to_download = scholar_requests(scholar_pages, url, restrict, chrome_version, scholar_results)

     return [item for sublist in to_download for item in sublist]
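
The heart of the Scholar.py change is the new fetch path: when a Chrome version is supplied, result pages are retrieved through `undetected_chromedriver` (a real headless Chrome, harder for Scholar to flag as a bot) instead of `requests`. A minimal standalone sketch of that logic, assuming `undetected-chromedriver` is installed; `fetch_scholar_page` is a hypothetical helper and `HEADERS` a placeholder for the project's `NetInfo.HEADERS`:

```python
# Sketch of the dual fetch path introduced above (not the project's exact code).
import requests
import undetected_chromedriver as uc

HEADERS = {"User-Agent": "Mozilla/5.0"}  # placeholder for NetInfo.HEADERS

def fetch_scholar_page(res_url, chrome_version=None, driver=None):
    """Return (html, driver); the driver is created once and reused across pages."""
    if chrome_version is not None:
        if driver is None:
            # Headless real Chrome; version_main must match the installed Chrome.
            driver = uc.Chrome(headless=True, use_subprocess=False,
                               version_main=chrome_version)
        driver.get(res_url)
        return driver.page_source, driver
    # Plain HTTP fallback, the pre-1.3.0 behaviour.
    return requests.get(res_url, headers=HEADERS).text, driver
```

Note that `scholar_requests` keeps one driver alive for all requested pages rather than launching a fresh Chrome per request, which is why `driver` is initialised to `None` outside the page loop.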

PyPaperBot/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__= "1.2.2"
+__version__= "1.3.0"

PyPaperBot/__main__.py

Lines changed: 21 additions & 9 deletions
@@ -3,19 +3,22 @@
 import argparse
 import sys
 import os
+import time
 from .Paper import Paper
 from .PapersFilters import filterJurnals, filter_min_date, similarStrings
 from .Downloader import downloadPapers
 from .Scholar import ScholarPapersInfo
 from .Crossref import getPapersInfoFromDOIs
 from .proxy import proxy

-def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None):
+def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None,
+          filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None):

     to_download = []
     if DOIs is None:
         print("Query: {}".format(query))
-        to_download = ScholarPapersInfo(query, scholar_pages, restrict, min_date, scholar_results)
+        print("Cites: {}".format(cites))
+        to_download = ScholarPapersInfo(query, scholar_pages, restrict, min_date, scholar_results, chrome_version, cites)
     else:
         print("Downloading papers from DOIs\n")
         num = 1
@@ -50,12 +53,16 @@ def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None,

 def main():
     print(
-        """PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref and SciHub.\nIf you like this project, you can give me a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""")
-
+        """PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref and SciHub.
+        -Join the telegram channel to stay updated --> https://t.me/pypaperbotdatawizards <--
+        -If you like this project, you can share a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""")
+    time.sleep(4)
     parser = argparse.ArgumentParser(
         description='PyPaperBot is python tool to search and dwonload scientific papers using Google Scholar, Crossref and SciHub')
     parser.add_argument('--query', type=str, default=None,
                         help='Query to make on Google Scholar or Google Scholar page link')
+    parser.add_argument('--cites', type=str, default=None,
+                        help='Paper ID (from scholar address bar when you search citations) if you want get only citations of that paper')
     parser.add_argument('--doi', type=str, default=None,
                         help='DOI of the paper to download (this option uses only SciHub to download)')
     parser.add_argument('--doi-file', type=str, default=None,
@@ -82,6 +89,8 @@ def main():
                         help='Use proxychains, provide a seperated list of proxies to use.Please specify the argument al the end')
     parser.add_argument('--single-proxy', type=str, default=None,
                         help='Use a single proxy. Recommended if using --proxy gives errors')
+    parser.add_argument('--selenium-chrome-version', type=int, default=None,
+                        help='First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed.')
     args = parser.parse_args()

     if args.single_proxy is not None:
@@ -95,8 +104,8 @@ def main():
         pchain = args.proxy
         proxy(pchain)

-    if args.query is None and args.doi_file is None and args.doi is None:
-        print("Error, provide at least one of the following arguments: --query or --file")
+    if args.query is None and args.doi_file is None and args.doi is None and args.cites is None:
+        print("Error, provide at least one of the following arguments: --query, --file, or --cites")
         sys.exit()

     if (args.query is not None and args.doi_file is not None) or (args.query is not None and args.doi is not None) or (
@@ -119,7 +128,7 @@ def main():
         print("Error: Only one option between '--max-dwn-year' and '--max-dwn-cites' can be used ")
         sys.exit()

-    if args.query is not None:
+    if args.query is not None or args.cites is not None:
        if args.scholar_pages:
            try:
                split = args.scholar_pages.split('-')
@@ -164,9 +173,12 @@ def main():
         max_dwn_type = 1


-    start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror)
+    start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type ,
+          args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites)

 if __name__ == "__main__":
     main()
     print(
-        """\nWork completed!\nIf you like this project, you can offer me a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""")
+        """\nWork completed!
+        -Join the telegram channel to stay updated --> https://t.me/pypaperbotdatawizards <--
+        -If you like this project, you can share a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""")
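
Connecting the new flags to the Scholar.py hunk above: `--cites` flows through `start` into `ScholarPapersInfo`, which appends it to the search URL as a `cites` parameter instead of a `q` query. The URL assembly, extracted into a standalone sketch for illustration (`build_scholar_url` is a hypothetical helper; the example ID is the one used in the README example below):

```python
# Hypothetical helper mirroring the URL logic added to ScholarPapersInfo.
def build_scholar_url(query=None, cites=None, min_date=None):
    url = "https://scholar.google.com/scholar?hl=en&as_vis=1&as_sdt=1,5&start=%d"
    if query:
        # A full Scholar link replaces the template; anything else becomes a query.
        if len(query) > 7 and (query.startswith("http://") or query.startswith("https://")):
            url = query
        else:
            url += f"&q={query}"
    if cites:
        url += f"&cites={cites}"  # citations-of-a-paper search
    if min_date:
        url += f"&as_ylo={min_date}"
    return url

# First results page (start=0) of papers citing the README's example paper:
print(build_scholar_url(cites="3120460092236365926") % 0)
# -> https://scholar.google.com/scholar?hl=en&as_vis=1&as_sdt=1,5&start=0&cites=3120460092236365926
```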

README.md

Lines changed: 26 additions & 18 deletions
@@ -8,7 +8,7 @@

 PyPaperBot is a Python tool for **downloading scientific papers** using Google Scholar, Crossref, and SciHub.
 The tool tries to download papers from different sources such as PDF provided by Scholar, Scholar related links, and Scihub.
-PyPaerbot is also able to download the **bibtex** of each paper.
+PyPaperbot is also able to download the **bibtex** of each paper.

 ## Features

@@ -52,23 +52,25 @@ pip install PyPaperbot

 PyPaperBot arguments:

-| Arguments          | Description                                                                                | Type   |
-| ------------------ | ------------------------------------------------------------------------------------------ | ------ |
-| \-\-query          | Query to make on Google Scholar or Google Scholar page link                                | string |
-| \-\-doi            | DOI of the paper to download (this option uses only SciHub to download)                    | string |
-| \-\-doi-file       | File .txt containing the list of paper's DOIs to download                                  | string |
-| \-\-scholar-pages  | Number or range of Google Scholar pages to inspect. Each page has a maximum of 10 papers   | string |
-| \-\-dwn-dir        | Directory path in which to save the result                                                 | string |
-| \-\-min-year       | Minimal publication year of the paper to download                                          | int    |
-| \-\-max-dwn-year   | Maximum number of papers to download sorted by year                                        | int    |
-| \-\-max-dwn-cites  | Maximum number of papers to download sorted by number of citations                         | int    |
-| \-\-journal-filter | CSV file path of the journal filter (More info on github)                                  | string |
-| \-\-restrict       | 0:Download only Bibtex - 1:Down load only papers PDF                                       | int    |
-| \-\-scihub-mirror  | Mirror for downloading papers from sci-hub. If not set, it is selected automatically       | string |
-| \-\-scholar-results| Number of scholar results to bedownloaded when \-\-scholar-pages=1                         | int    |
-| \-\-proxy          | Proxies to be used. Please specify the protocol to be used.                                | string |
-| \-\-single-proxy   | Use a single proxy. Recommended if using --proxy gives errors.                             | string |
-| \-h                | Shows the help                                                                             | --     |
+| Arguments                   | Description                                                                                | Type   |
+|-----------------------------| ------------------------------------------------------------------------------------------ |--------|
+| \-\-query                   | Query to make on Google Scholar or Google Scholar page link                                | string |
+| \-\-cites                   | Paper ID (from scholar address bar when you search cites) if you want get only citations of that paper | string |
+| \-\-doi                     | DOI of the paper to download (this option uses only SciHub to download)                    | string |
+| \-\-doi-file                | File .txt containing the list of paper's DOIs to download                                  | string |
+| \-\-scholar-pages           | Number or range of Google Scholar pages to inspect. Each page has a maximum of 10 papers   | string |
+| \-\-dwn-dir                 | Directory path in which to save the result                                                 | string |
+| \-\-min-year                | Minimal publication year of the paper to download                                          | int    |
+| \-\-max-dwn-year            | Maximum number of papers to download sorted by year                                        | int    |
+| \-\-max-dwn-cites           | Maximum number of papers to download sorted by number of citations                         | int    |
+| \-\-journal-filter          | CSV file path of the journal filter (More info on github)                                  | string |
+| \-\-restrict                | 0:Download only Bibtex - 1:Down load only papers PDF                                       | int    |
+| \-\-scihub-mirror           | Mirror for downloading papers from sci-hub. If not set, it is selected automatically       | string |
+| \-\-scholar-results         | Number of scholar results to bedownloaded when \-\-scholar-pages=1                         | int    |
+| \-\-proxy                   | Proxies to be used. Please specify the protocol to be used.                                | string |
+| \-\-single-proxy            | Use a single proxy. Recommended if using --proxy gives errors.                             | string |
+| \-\-selenium-chrome-version | First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed. | int |
+| \-h                         | Shows the help                                                                             | --     |

 ### Note

@@ -124,6 +126,12 @@ If it doesn't work, try to use *py* instead of *python* i.e.
 py -m PyPaperBot --doi="10.0086/s41037-711-0132-1" --dwn-dir="C:\User\example\papers"`
 ```

+Search papers that cite another (find ID in scholar address bar when you search citations):
+
+```bash
+python -m PyPaperBot --cites=3120460092236365926
+```
+
 Using a proxy

 ```

requirements.txt

1.55 KB (binary file not shown)

setup.py

Lines changed: 2 additions & 2 deletions
@@ -6,15 +6,15 @@
 setuptools.setup(
     name = 'PyPaperBot',
     packages = setuptools.find_packages(),
-    version = '1.2.2',
+    version = '1.3.0',
     license='MIT',
     description = 'PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref, and SciHub.',
     long_description=long_description,
     long_description_content_type="text/markdown",
     author = 'Vito Ferrulli',
     author_email = 'vitof970@gmail.com',
     url = 'https://github.com/ferru97/PyPaperBot',
-    download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.2.2.tar.gz',
+    download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.3.0.tar.gz',
     keywords = ['download-papers','google-scholar', 'scihub', 'scholar', 'crossref', 'papers'],
     install_requires=[
         'astroid>=2.4.2,<=2.5',
