
Commit 5956efe

Merge pull request #66 from ferru97/feature/add_selenium

Selenium & Scholar search by cites

2 parents aa06605 + 1844083, commit 5956efe

6 files changed: +76 −40 lines

PyPaperBot/Scholar.py

Lines changed: 26 additions & 10 deletions
@@ -1,6 +1,8 @@
 import time
 import requests
 import functools
+import undetected_chromedriver as uc
+from selenium.webdriver.chrome.options import Options
 from .HTMLparsers import schoolarParser
 from .Crossref import getPapersInfo
 from .NetInfo import NetInfo
@@ -18,14 +20,24 @@ def waithIPchange():
     return True


-def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
+def scholar_requests(scholar_pages, url, restrict, chrome_version, scholar_results=10):
     javascript_error = "Sorry, we can't verify that you're not a robot when JavaScript is turned off"
     to_download = []
+    driver = None
     for i in scholar_pages:
         while True:
             res_url = url % (scholar_results * (i - 1))
-            html = requests.get(res_url, headers=NetInfo.HEADERS)
-            html = html.text
+            if chrome_version is not None:
+                if driver is None:
+                    print("Using Selenium driver")
+                    options = Options()
+                    options.add_argument('--headless')
+                    driver = uc.Chrome(headless=True, use_subprocess=False, version_main=chrome_version)
+                driver.get(res_url)
+                html = driver.page_source
+            else:
+                html = requests.get(res_url, headers=NetInfo.HEADERS)
+                html = html.text

             if javascript_error in html:
                 is_continue = waithIPchange()
@@ -52,14 +64,18 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
     return to_download


-def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results=10):
-    url = r"https://scholar.google.com/scholar?hl=en&q=" + query + "&as_vis=1&as_sdt=1,5&start=%d"
-    if min_date is not None:
+def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results=10, chrome_version=None, cites=None):
+    url = r"https://scholar.google.com/scholar?hl=en&as_vis=1&as_sdt=1,5&start=%d"
+    if query:
+        if len(query) > 7 and (query.startswith("http://") or query.startswith("https://")):
+            url = query
+        else:
+            url += f"&q={query}"
+    if cites:
+        url += f"&cites={cites}"
+    if min_date:
         url += f"&as_ylo={min_date}"

-    if len(query) > 7 and (query.startswith("http://") or query.startswith("https://")):
-        url = query
-
-    to_download = scholar_requests(scholar_pages, url, restrict, scholar_results)
+    to_download = scholar_requests(scholar_pages, url, restrict, chrome_version, scholar_results)

     return [item for sublist in to_download for item in sublist]
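
The heart of the Scholar.py change is the new fetch path: when a Chrome version is supplied, result pages are retrieved through `undetected_chromedriver` (a real headless Chrome, harder for Scholar to flag as a bot) instead of `requests`. A minimal standalone sketch of that logic, assuming `undetected-chromedriver` is installed; `fetch_scholar_page` is a hypothetical helper and `HEADERS` a placeholder for the project's `NetInfo.HEADERS`:

```python
# Sketch of the dual fetch path introduced above (not the project's exact code).
import requests
import undetected_chromedriver as uc

HEADERS = {"User-Agent": "Mozilla/5.0"}  # placeholder for NetInfo.HEADERS

def fetch_scholar_page(res_url, chrome_version=None, driver=None):
    """Return (html, driver); the driver is created once and reused across pages."""
    if chrome_version is not None:
        if driver is None:
            # Headless real Chrome; version_main must match the installed Chrome.
            driver = uc.Chrome(headless=True, use_subprocess=False,
                               version_main=chrome_version)
        driver.get(res_url)
        return driver.page_source, driver
    # Plain HTTP fallback, the pre-1.3.0 behaviour.
    return requests.get(res_url, headers=HEADERS).text, driver
```

Note that `scholar_requests` keeps one driver alive for all requested pages rather than launching a fresh Chrome per request, which is why `driver` is initialised to `None` outside the page loop.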

PyPaperBot/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__= "1.2.2"
+__version__= "1.3.0"

PyPaperBot/__main__.py

Lines changed: 21 additions & 9 deletions
@@ -3,19 +3,22 @@
 import argparse
 import sys
 import os
+import time
 from .Paper import Paper
 from .PapersFilters import filterJurnals, filter_min_date, similarStrings
 from .Downloader import downloadPapers
 from .Scholar import ScholarPapersInfo
 from .Crossref import getPapersInfoFromDOIs
 from .proxy import proxy

-def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None):
+def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None,
+          filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None):

     to_download = []
     if DOIs is None:
         print("Query: {}".format(query))
-        to_download = ScholarPapersInfo(query, scholar_pages, restrict, min_date, scholar_results)
+        print("Cites: {}".format(cites))
+        to_download = ScholarPapersInfo(query, scholar_pages, restrict, min_date, scholar_results, chrome_version, cites)
     else:
         print("Downloading papers from DOIs\n")
         num = 1
@@ -50,12 +53,16 @@ def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None,

 def main():
     print(
-        """PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref and SciHub.\nIf you like this project, you can give me a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""")
-
+        """PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref and SciHub.
+        -Join the telegram channel to stay updated --> https://t.me/pypaperbotdatawizards <--
+        -If you like this project, you can share a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""")
+    time.sleep(4)
     parser = argparse.ArgumentParser(
         description='PyPaperBot is python tool to search and dwonload scientific papers using Google Scholar, Crossref and SciHub')
     parser.add_argument('--query', type=str, default=None,
                         help='Query to make on Google Scholar or Google Scholar page link')
+    parser.add_argument('--cites', type=str, default=None,
+                        help='Paper ID (from scholar address bar when you search citations) if you want get only citations of that paper')
     parser.add_argument('--doi', type=str, default=None,
                         help='DOI of the paper to download (this option uses only SciHub to download)')
     parser.add_argument('--doi-file', type=str, default=None,
@@ -82,6 +89,8 @@ def main():
                         help='Use proxychains, provide a seperated list of proxies to use.Please specify the argument al the end')
     parser.add_argument('--single-proxy', type=str, default=None,
                         help='Use a single proxy. Recommended if using --proxy gives errors')
+    parser.add_argument('--selenium-chrome-version', type=int, default=None,
+                        help='First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed.')
     args = parser.parse_args()

     if args.single_proxy is not None:
@@ -95,8 +104,8 @@ def main():
         pchain = args.proxy
         proxy(pchain)

-    if args.query is None and args.doi_file is None and args.doi is None:
-        print("Error, provide at least one of the following arguments: --query or --file")
+    if args.query is None and args.doi_file is None and args.doi is None and args.cites is None:
+        print("Error, provide at least one of the following arguments: --query, --file, or --cites")
         sys.exit()

     if (args.query is not None and args.doi_file is not None) or (args.query is not None and args.doi is not None) or (
@@ -119,7 +128,7 @@ def main():
         print("Error: Only one option between '--max-dwn-year' and '--max-dwn-cites' can be used ")
         sys.exit()

-    if args.query is not None:
+    if args.query is not None or args.cites is not None:
        if args.scholar_pages:
            try:
                split = args.scholar_pages.split('-')
@@ -164,9 +173,12 @@ def main():
         max_dwn_type = 1


-    start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror)
+    start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type ,
+          args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites)

 if __name__ == "__main__":
     main()
     print(
-        """\nWork completed!\nIf you like this project, you can offer me a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""")
+        """\nWork completed!
+        -Join the telegram channel to stay updated --> https://t.me/pypaperbotdatawizards <--
+        -If you like this project, you can share a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""")
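
Connecting the new flags to the Scholar.py hunk above: `--cites` flows through `start` into `ScholarPapersInfo`, which appends it to the search URL as a `cites` parameter instead of a `q` query. The URL assembly, extracted into a standalone sketch for illustration (`build_scholar_url` is a hypothetical helper; the example ID is the one used in the README example below):

```python
# Hypothetical helper mirroring the URL logic added to ScholarPapersInfo.
def build_scholar_url(query=None, cites=None, min_date=None):
    url = "https://scholar.google.com/scholar?hl=en&as_vis=1&as_sdt=1,5&start=%d"
    if query:
        # A full Scholar link replaces the template; anything else becomes a query.
        if len(query) > 7 and (query.startswith("http://") or query.startswith("https://")):
            url = query
        else:
            url += f"&q={query}"
    if cites:
        url += f"&cites={cites}"  # citations-of-a-paper search
    if min_date:
        url += f"&as_ylo={min_date}"
    return url

# First results page (start=0) of papers citing the README's example paper:
print(build_scholar_url(cites="3120460092236365926") % 0)
# -> https://scholar.google.com/scholar?hl=en&as_vis=1&as_sdt=1,5&start=0&cites=3120460092236365926
```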

README.md

Lines changed: 26 additions & 18 deletions
@@ -8,7 +8,7 @@

 PyPaperBot is a Python tool for **downloading scientific papers** using Google Scholar, Crossref, and SciHub.
 The tool tries to download papers from different sources such as PDF provided by Scholar, Scholar related links, and Scihub.
-PyPaerbot is also able to download the **bibtex** of each paper.
+PyPaperbot is also able to download the **bibtex** of each paper.

 ## Features

@@ -52,23 +52,25 @@ pip install PyPaperbot

 PyPaperBot arguments:

-| Arguments          | Description                                                                                | Type   |
-| ------------------ | ------------------------------------------------------------------------------------------ | ------ |
-| \-\-query          | Query to make on Google Scholar or Google Scholar page link                                | string |
-| \-\-doi            | DOI of the paper to download (this option uses only SciHub to download)                    | string |
-| \-\-doi-file       | File .txt containing the list of paper's DOIs to download                                  | string |
-| \-\-scholar-pages  | Number or range of Google Scholar pages to inspect. Each page has a maximum of 10 papers   | string |
-| \-\-dwn-dir        | Directory path in which to save the result                                                 | string |
-| \-\-min-year       | Minimal publication year of the paper to download                                          | int    |
-| \-\-max-dwn-year   | Maximum number of papers to download sorted by year                                        | int    |
-| \-\-max-dwn-cites  | Maximum number of papers to download sorted by number of citations                         | int    |
-| \-\-journal-filter | CSV file path of the journal filter (More info on github)                                  | string |
-| \-\-restrict       | 0:Download only Bibtex - 1:Down load only papers PDF                                       | int    |
-| \-\-scihub-mirror  | Mirror for downloading papers from sci-hub. If not set, it is selected automatically       | string |
-| \-\-scholar-results| Number of scholar results to bedownloaded when \-\-scholar-pages=1                         | int    |
-| \-\-proxy          | Proxies to be used. Please specify the protocol to be used.                                | string |
-| \-\-single-proxy   | Use a single proxy. Recommended if using --proxy gives errors.                             | string |
-| \-h                | Shows the help                                                                             | --     |
+| Arguments                   | Description                                                                                | Type   |
+|-----------------------------| ------------------------------------------------------------------------------------------ |--------|
+| \-\-query                   | Query to make on Google Scholar or Google Scholar page link                                | string |
+| \-\-cites                   | Paper ID (from scholar address bar when you search cites) if you want get only citations of that paper | string |
+| \-\-doi                     | DOI of the paper to download (this option uses only SciHub to download)                    | string |
+| \-\-doi-file                | File .txt containing the list of paper's DOIs to download                                  | string |
+| \-\-scholar-pages           | Number or range of Google Scholar pages to inspect. Each page has a maximum of 10 papers   | string |
+| \-\-dwn-dir                 | Directory path in which to save the result                                                 | string |
+| \-\-min-year                | Minimal publication year of the paper to download                                          | int    |
+| \-\-max-dwn-year            | Maximum number of papers to download sorted by year                                        | int    |
+| \-\-max-dwn-cites           | Maximum number of papers to download sorted by number of citations                         | int    |
+| \-\-journal-filter          | CSV file path of the journal filter (More info on github)                                  | string |
+| \-\-restrict                | 0:Download only Bibtex - 1:Down load only papers PDF                                       | int    |
+| \-\-scihub-mirror           | Mirror for downloading papers from sci-hub. If not set, it is selected automatically       | string |
+| \-\-scholar-results         | Number of scholar results to bedownloaded when \-\-scholar-pages=1                         | int    |
+| \-\-proxy                   | Proxies to be used. Please specify the protocol to be used.                                | string |
+| \-\-single-proxy            | Use a single proxy. Recommended if using --proxy gives errors.                             | string |
+| \-\-selenium-chrome-version | First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed. | int |
+| \-h                         | Shows the help                                                                             | --     |

 ### Note

@@ -124,6 +126,12 @@ If it doesn't work, try to use *py* instead of *python* i.e.
 py -m PyPaperBot --doi="10.0086/s41037-711-0132-1" --dwn-dir="C:\User\example\papers"`
 ```

+Search papers that cite another (find ID in scholar address bar when you search citations):
+
+```bash
+python -m PyPaperBot --cites=3120460092236365926
+```
+
 Using a proxy

 ```

requirements.txt

1.55 KB (binary file not shown)

setup.py

Lines changed: 2 additions & 2 deletions
@@ -6,15 +6,15 @@
 setuptools.setup(
     name = 'PyPaperBot',
     packages = setuptools.find_packages(),
-    version = '1.2.2',
+    version = '1.3.0',
     license='MIT',
     description = 'PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref, and SciHub.',
     long_description=long_description,
     long_description_content_type="text/markdown",
     author = 'Vito Ferrulli',
     author_email = 'vitof970@gmail.com',
     url = 'https://github.com/ferru97/PyPaperBot',
-    download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.2.2.tar.gz',
+    download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.3.0.tar.gz',
     keywords = ['download-papers','google-scholar', 'scihub', 'scholar', 'crossref', 'papers'],
     install_requires=[
         'astroid>=2.4.2,<=2.5',
