From 4dd476909589d75f9020b92e29bea47a76f27f8f Mon Sep 17 00:00:00 2001 From: muzgash Date: Mon, 31 Jul 2023 22:45:24 -0500 Subject: [PATCH 1/4] Added the plugin which inserts or upgrades journals from openalex db --- Kahi_openalex_sources/LICENSE | 30 +++++ Kahi_openalex_sources/MANIFEST.in | 2 + Kahi_openalex_sources/README.md | 43 +++++++ .../Kahi_openalex_sources.py | 121 ++++++++++++++++++ .../kahi_openalex_sources/__init__.py | 0 .../kahi_openalex_sources/_version.py | 6 + Kahi_openalex_sources/setup.py | 89 +++++++++++++ 7 files changed, 291 insertions(+) create mode 100644 Kahi_openalex_sources/LICENSE create mode 100644 Kahi_openalex_sources/MANIFEST.in create mode 100644 Kahi_openalex_sources/README.md create mode 100644 Kahi_openalex_sources/kahi_openalex_sources/Kahi_openalex_sources.py create mode 100644 Kahi_openalex_sources/kahi_openalex_sources/__init__.py create mode 100644 Kahi_openalex_sources/kahi_openalex_sources/_version.py create mode 100644 Kahi_openalex_sources/setup.py diff --git a/Kahi_openalex_sources/LICENSE b/Kahi_openalex_sources/LICENSE new file mode 100644 index 0000000..cef2ea0 --- /dev/null +++ b/Kahi_openalex_sources/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2005-2020, Colav Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Kahi_openalex_sources/MANIFEST.in b/Kahi_openalex_sources/MANIFEST.in new file mode 100644 index 0000000..eab13ec --- /dev/null +++ b/Kahi_openalex_sources/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include kahi_openalex_sources/ *.py +recursive-include kahi_openalex_sources/ *.* \ No newline at end of file diff --git a/Kahi_openalex_sources/README.md b/Kahi_openalex_sources/README.md new file mode 100644 index 0000000..66c2c5c --- /dev/null +++ b/Kahi_openalex_sources/README.md @@ -0,0 +1,43 @@ +
+ +# Kahi template plugin +This is a template for xyz project +replace template for the name of the plugin everywhere. + +# Description +Write something meaningful here ;) + +# Installation + +## Dependencies +What do I need fot this plugin?, it could be external services etc.. + +## Package +Write here how to install this plugin +usauly is + +`pip install kahi_template` + + +# Usage +what should I know? +put it here. + +Additional parameters for kahi_run in the workflow should be here as well. +example : + +``` +template: + - my_param_example: value +``` +Those parameters are not really needed in the workflow file, it is just for illustration. + + +# License +BSD-3-Clause License + +# Links +http://colav.udea.edu.co/ + + + diff --git a/Kahi_openalex_sources/kahi_openalex_sources/Kahi_openalex_sources.py b/Kahi_openalex_sources/kahi_openalex_sources/Kahi_openalex_sources.py new file mode 100644 index 0000000..09d7a87 --- /dev/null +++ b/Kahi_openalex_sources/kahi_openalex_sources/Kahi_openalex_sources.py @@ -0,0 +1,121 @@ +from kahi.KahiBase import KahiBase +from pymongo import MongoClient +from datetime import datetime as dt +from time import time + + +class Kahi_openalex_sources(KahiBase): + + config = {} + + def __init__(self, config): + self.config = config + + self.mongodb_url = config["database_url"] + + self.client = MongoClient(self.mongodb_url) + + self.db = self.client[config["database_name"]] + self.collection = self.db["sources"] + + self.openalex_client = MongoClient( + config["openalex_sources"]["database_url"]) + self.openalex_db = self.openalex_client[config["openalex_sources"] + ["database_name"]] + self.openalex_collection = self.openalex_db[config["openalex_sources"] + ["collection_name"]] + + self.already_processed = [] + + def process_openalex(self): + with self.openalex_client.start_session() as session: + self.openalex_db = self.openalex_client[self.config["openalex_sources"] + ["database_name"]] + self.openalex_collection = 
self.openalex_db[self.config["openalex_sources"] + ["collection_name"]] + old = dt.now() + for source in self.openalex_collection.find({"id": {"$nin": self.already_processed}}): + if source["id"] in self.already_processed: + continue + source_db = None + if "issn" in source.keys(): + source_db = self.collection.find_one( + {"external_ids.id": source["issn"]}) + if not source_db: + if "issn_l" in source.keys(): + source_db = self.collection.find_one( + {"external_ids.id": source["issn_l"]}) + if source_db: + oa_found = False + for up in source_db["updated"]: + if up["source"] == "openalex": + oa_found = True + break + if oa_found: + continue + + source_db["updated"].append( + {"source": "openalex", "time": int(time())}) + source_db["external_ids"].append( + {"source": "openalex", "id": source["id"]}) + source_db["types"].append( + {"source": "openalex", "type": source["type"]}) + source_db["names"].append( + {"name": source["display_name"], "lang": "en", "source": "openalex"}) + + self.collection.update_one({"_id": source_db["_id"]}, {"$set": { + "updated": source_db["updated"], + "names": source_db["names"], + "external_ids": source_db["external_ids"], + "types": source_db["types"], + "subjects": source_db["subjects"] + }}) + else: + entry = self.empty_source() + entry["updated"] = [ + {"source": "openalex", "time": int(time())}] + entry["names"].append( + {"name": source["display_name"], "lang": "en", "source": "openalex"}) + entry["external_ids"].append( + {"source": "openalex", "id": source["id"]}) + if "issn" in source.keys(): + entry["external_ids"].append( + {"source": "issn", "id": source["issn"]}) + if "issn_l" in source.keys(): + entry["external_ids"].append( + {"source": "issn_l", "id": source["issn_l"]}) + entry["types"].append( + {"source": "openalex", "type": source["type"]}) + if "publisher" in source.keys(): + if source["publisher"]: + entry["publisher"] = { + "name": source["publisher"], "country_code": ""} + if "apc_usd" in source.keys(): + if 
source["apc_usd"]: + entry["apc"] = {"currency": "USD", + "charges": source["apc_usd"]} + if "abbreviated_title" in source.keys(): + if source["abbreviated_title"]: + entry["abbreviations"].append( + source["abbreviated_title"]) + for name in source["alternate_titles"]: + entry["abbreviations"].append(name) + if source["homepage_url"]: + entry["external_urls"].append( + {"source": "site", "url": source["homepage_url"]}) + if source["societies"]: + for soc in source["societies"]: + entry["external_urls"].append( + {"source": soc["organization"], "url": soc["url"]}) + + self.collection.insert_one(entry) + self.already_processed.append(source["id"]) + delta = dt.now()-old + if delta.seconds > 240: + self.openalex_client.admin.command( + 'refreshSessions', [session.session_id], session=session) + old = dt.now() + + def run(self): + self.process_openalex() + return 0 diff --git a/Kahi_openalex_sources/kahi_openalex_sources/__init__.py b/Kahi_openalex_sources/kahi_openalex_sources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Kahi_openalex_sources/kahi_openalex_sources/_version.py b/Kahi_openalex_sources/kahi_openalex_sources/_version.py new file mode 100644 index 0000000..7b6ecff --- /dev/null +++ b/Kahi_openalex_sources/kahi_openalex_sources/_version.py @@ -0,0 +1,6 @@ +# flake8: noqa +__version__ = '0.0.1-alpha' + + +def get_version(): + return __version__ diff --git a/Kahi_openalex_sources/setup.py b/Kahi_openalex_sources/setup.py new file mode 100644 index 0000000..66d8405 --- /dev/null +++ b/Kahi_openalex_sources/setup.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# Copyright (c) Colav. +# Distributed under the terms of the Modified BSD License. 
+ +# ----------------------------------------------------------------------------- +# Minimal Python version sanity check (from IPython) +# ----------------------------------------------------------------------------- + +# See https://stackoverflow.com/a/26737258/2268280 +# sudo pip3 install twine +# python3 setup.py sdist bdist_wheel +# twine upload dist/* +# For test purposes +# twine upload --repository-url https://test.pypi.org/legacy/ dist/* + +from __future__ import print_function +from setuptools import setup, find_packages + +import os +import sys +import codecs + + +v = sys.version_info + + +def read(rel_path): + here = os.path.abspath(os.path.dirname(__file__)) + with codecs.open(os.path.join(here, rel_path), 'r') as fp: + return fp.read() + + +def get_version(rel_path): + for line in read(rel_path).splitlines(): + if line.startswith('__version__'): + delim = '"' if '"' in line else "'" + return line.split(delim)[1] + else: + raise RuntimeError("Unable to find version string.") + + +shell = False +if os.name in ('nt', 'dos'): + shell = True + warning = "WARNING: Windows is not officially supported" + print(warning, file=sys.stderr) + + +def main(): + setup( + # Application name: + name="Kahi_openalex_sources", + + # Version number (initial): + version=get_version('kahi_openalex_sources/_version.py'), + + # Application author details: + author="Colav", + author_email="colav@udea.edu.co", + + # Packages + packages=find_packages(exclude=['tests']), + + # Include additional files into the package + include_package_data=True, + + # Details + url="https://github.com/colav/Kahi_plugins", + # + license="BSD", + + description="Kahi plugin to insert and update the sources from openalex", + + long_description=open("README.md").read(), + + long_description_content_type="text/markdown", + + # Dependent packages (distributions) + # put you packages here + install_requires=[ + 'kahi' + ], + ) + + +if __name__ == "__main__": + main() From 
442a0c7bdf89538835e91cdfceaeec92741507ee Mon Sep 17 00:00:00 2001 From: muzgash Date: Mon, 31 Jul 2023 22:49:00 -0500 Subject: [PATCH 2/4] Improved the readme --- Kahi_openalex_sources/README.md | 53 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/Kahi_openalex_sources/README.md b/Kahi_openalex_sources/README.md index 66c2c5c..86ee519 100644 --- a/Kahi_openalex_sources/README.md +++ b/Kahi_openalex_sources/README.md @@ -1,36 +1,43 @@
-# Kahi template plugin -This is a template for xyz project -replace template for the name of the plugin everywhere. +# Kahi OpenAlex sources plugin +Kahi will use this plugin to insert or update the journal information from DOAJ # Description -Write something meaningful here ;) +Plugin that reads the information from a mongodb collection with openalex information to update or insert the information of the journals in CoLav's database format. # Installation +You could download the repository from github. Go into the folder where the setup.py is located and run +```shell +pip3 install . +``` +From the package you can install by running +```shell +pip3 install kahi_openalex_sources +``` ## Dependencies -What do I need fot this plugin?, it could be external services etc.. - -## Package -Write here how to install this plugin -usauly is - -`pip install kahi_template` - +Software dependencies will automatically be installed when installing the plugin. +The user must have a copy of the openalex dump with the collection of venues, which can be downloaded at [OpenAlex data dump website](https://docs.openalex.org/download-all-data/openalex-snapshot "OpenAlex data dump website"), and import it into a mongodb database. # Usage -what should I know? -put it here. - -Additional parameters for kahi_run in the workflow should be here as well. -example : - -``` -template: - - my_param_example: value +To use this plugin you must have kahi installed in your system and construct a yaml file such as +```yaml +config: + database_url: localhost:27017 + database_name: kahi + log_database: kahi_log + log_collection: log +workflow: + doaj_sources: + database_url: localhost:27017 + database_name: doaj + collection_name: stage +openalex_sources: + database_url: localhost:27017 + database_name: openalex + collection_name: venues ``` -Those parameters are not really needed in the workflow file, it is just for illustration. 
# License @@ -39,5 +46,3 @@ BSD-3-Clause License # Links http://colav.udea.edu.co/ - - From 044f1c4e784106d2c2134371fddd83281f85333b Mon Sep 17 00:00:00 2001 From: muzgash Date: Tue, 1 Aug 2023 10:36:20 -0500 Subject: [PATCH 3/4] Minor fix for coding standard --- .../kahi_openalex_sources/Kahi_openalex_sources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Kahi_openalex_sources/kahi_openalex_sources/Kahi_openalex_sources.py b/Kahi_openalex_sources/kahi_openalex_sources/Kahi_openalex_sources.py index 09d7a87..83f328d 100644 --- a/Kahi_openalex_sources/kahi_openalex_sources/Kahi_openalex_sources.py +++ b/Kahi_openalex_sources/kahi_openalex_sources/Kahi_openalex_sources.py @@ -110,7 +110,7 @@ def process_openalex(self): self.collection.insert_one(entry) self.already_processed.append(source["id"]) - delta = dt.now()-old + delta = dt.now() - old if delta.seconds > 240: self.openalex_client.admin.command( 'refreshSessions', [session.session_id], session=session) From 88061791bcce100a66cf46df9f9ef2f03730a933 Mon Sep 17 00:00:00 2001 From: muzgash Date: Tue, 8 Aug 2023 08:25:51 -0500 Subject: [PATCH 4/4] Fixed minor errors on README file --- Kahi_openalex_sources/README.md | 8 ++------ Kahi_openalex_sources/setup.py | 3 ++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/Kahi_openalex_sources/README.md b/Kahi_openalex_sources/README.md index 86ee519..5e2ab30 100644 --- a/Kahi_openalex_sources/README.md +++ b/Kahi_openalex_sources/README.md @@ -1,7 +1,7 @@
# Kahi OpenAlex sources plugin -Kahi will use this plugin to insert or update the journal information from DOAJ +Kahi will use this plugin to insert or update the journal information from openalex # Description Plugin that reads the information from a mongodb collection with openalex information to update or insert the information of the journals in CoLav's database format. @@ -29,11 +29,7 @@ config: log_database: kahi_log log_collection: log workflow: - doaj_sources: - database_url: localhost:27017 - database_name: doaj - collection_name: stage -openalex_sources: + openalex_sources: database_url: localhost:27017 database_name: openalex collection_name: venues diff --git a/Kahi_openalex_sources/setup.py b/Kahi_openalex_sources/setup.py index 66d8405..f11bb39 100644 --- a/Kahi_openalex_sources/setup.py +++ b/Kahi_openalex_sources/setup.py @@ -80,7 +80,8 @@ def main(): # Dependent packages (distributions) # put you packages here install_requires=[ - 'kahi' + 'kahi', + 'pymongo' ], )