gemrxiv.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# gemRxiv
# Copyright (C) 2024 by Sven Kochmann
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This program extracts the hidden gems from chemRxiv to brag about
# them on Twitter and the like.
#
# Feb 2024: ChemRxiv has its own API now:
# https://chemrxiv.org/engage/chemrxiv/public-api/documentation
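
# For reference, the parsing below assumes a response shaped roughly like the sketch here; the field
# names are the ones this script actually indexes, and the metric order (views, citations, downloads)
# is an assumption inferred from that indexing:
#
#   {"itemHits": [
#       {"item": {"id": "...",
#                 "title": "...",
#                 "publishedDate": "<ISO timestamp; first 10 characters are YYYY-MM-DD>",
#                 "metrics": [{"value": <views>}, {"value": <citations>}, {"value": <downloads>}]}},
#       ...
#   ]}
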
import argparse
import datetime
import json
import tabulate
import urllib.request
# Argument setup and parsing
parser = argparse.ArgumentParser(
    description = 'gemrxiv extracts the hidden gems from chemRxiv to brag about them on Twitter and the like.')
parser.add_argument('-v', '--version', help = 'prints version information', action = 'version', version = 'gemRxiv! 1.1 by Sven Kochmann')
parser.add_argument('-e', '--entries', metavar = 'E', help = 'Number of entries to load, maximum is 50 (default: 50)', type = int, default = 50)
parser.add_argument('-f', '--finish', metavar = 'F', help = 'Only considers entries BEFORE this finishing date (default: today). Should be in YYYY-MM-DD format', type = str, default=datetime.datetime.today().strftime('%Y-%m-%d'))
parser.add_argument('-d', '--days', metavar = 'D', help = 'Entries in the D days before the finishing date will not be considered (default: 180)', type = int, default = 180)
args = vars(parser.parse_args())
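
# Illustrative invocation (hypothetical values, not the defaults): fetch up to 25 entries for a
# 90-day search window ending on 2024-06-30:
#   ./gemrxiv.py --entries 25 --finish 2024-06-30 --days 90
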
# Calculate the starting and ending dates
date_end = datetime.date.fromisoformat(args['finish'])
date_start = date_end - datetime.timedelta(days=args['days'])
# According to the chemRxiv API, 50 items is the maximum
if not 0 < args['entries'] < 51:
    args['entries'] = 50
# Set up the URL (the chemRxiv API allows exclusions!)
url1 = ('https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?limit=' + str(args['entries'])
        + '&sort=VIEWS_COUNT_ASC&searchDateFrom=' + date_start.strftime('%Y-%m-%d')
        + '&searchDateTo=' + date_end.strftime('%Y-%m-%d'))
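# With the defaults on, say, 2024-08-01, the assembled request would look like this (illustrative):
#   https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?limit=50&sort=VIEWS_COUNT_ASC&searchDateFrom=2024-02-03&searchDateTo=2024-08-01
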
# Opens a URL and returns its contents as a JSON dictionary
def http_json_as_dict(urltogo):
    req = urllib.request.Request(url=urltogo, headers={'User-Agent': 'Mozilla/5.0'})
    return json.load(urllib.request.urlopen(req))
# Get all results
print('Sending request \'' + url1 + '\'...', flush=True)
print('Downloading preprints...', end='', flush=True)
preliminary_results = http_json_as_dict(url1)["itemHits"]
print('%d loaded.' % len(preliminary_results), flush=True)
# Create a list of dictionaries (article id, title, views, citations, downloads, publishing date,
# days since publishing, downloads per day) for the returned preprints; the view, citation, and
# download counts come from each item's metrics
results = []
for index, entry in enumerate(preliminary_results):
    print("Downloading data for preprint %d of %d" % (index + 1, len(preliminary_results)), end='\r')
    # Add entry
    data = {'id': entry['item']['id'],
            'title': entry['item']['title'],
            'views': entry['item']['metrics'][0]['value'],
            'citations': entry['item']['metrics'][1]['value'],
            'downloads': entry['item']['metrics'][2]['value'],
            'date': entry['item']['publishedDate'][0:10],
            'days_online': 0,
            'downloads_per_day': 0.0,
            'url': 'https://chemrxiv.org/engage/chemrxiv/article-details/' + entry['item']['id']}
    results.append(data)
print("")
for index, entry in enumerate(results):
    year, month, day = entry['date'].split('-')
    days_online = (datetime.date.today() - datetime.date(int(year), int(month), int(day))).days
    results[index]['days_online'] = int(days_online)
    if days_online > 0:
        results[index]['downloads_per_day'] = float(entry['downloads']) / float(days_online)
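
# Illustrative arithmetic for the loop above: a preprint with 42 downloads that has been online for
# 60 days gets 42/60 = 0.7 downloads per day.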
# Sort by downloads per day (ascending)
results = sorted(results, key=lambda entry: entry['downloads_per_day'])
print("")
print(tabulate.tabulate(results)) #.encode("ascii", "replace"))