diff --git a/Kahi_scienti_sources/LICENSE b/Kahi_scienti_sources/LICENSE new file mode 100644 index 0000000..cef2ea0 --- /dev/null +++ b/Kahi_scienti_sources/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2005-2020, Colav Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/Kahi_scienti_sources/MANIFEST.in b/Kahi_scienti_sources/MANIFEST.in new file mode 100644 index 0000000..15271b5 --- /dev/null +++ b/Kahi_scienti_sources/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include kahi_scienti_sources/ *.py +recursive-include kahi_scienti_sources/ *.* \ No newline at end of file diff --git a/Kahi_scienti_sources/README.md b/Kahi_scienti_sources/README.md new file mode 100644 index 0000000..1abe201 --- /dev/null +++ b/Kahi_scienti_sources/README.md @@ -0,0 +1,66 @@ +
+ +# Kahi scienti sources plugin +Kahi will use this plugin to insert or update the journal information from scienti dump + +# Description +Plugin that reads the information from a scienti dump to insert or update journals in colav's database. + +# Installation +You could download the repository from github. Go into the folder where the setup.py is located and run +```shell +pip3 install . +``` +From the package you can install by running +```shell +pip3 install kahi_scienti_sources +``` + +## Dependencies +Software dependencies will automatically be installed when installing the plugin. +The user must have at least one database obtained from minciencias and previously processed by [kayPacha](https://github.com/colav/KayPacha "KayPacha") and uploaded on a mongodb database. + +# Usage +To use this plugin you must have kahi installed in your system and construct a yaml file such as +```yaml +config: + database_url: localhost:27017 + database_name: kahi + log_database: kahi_log + log_collection: log +workflow: + scienti_sources: + - database_url: localhost:27017 + database_name: scienti_111 + collection_name: products +``` +Where database_url, database_name and collection_name under the scienti_sources task identify the MongoDB server, the database and the collection that hold the scienti dump. 
"""Kahi plugin that inserts or updates journal sources from scienti dumps."""
from kahi.KahiBase import KahiBase
from pymongo import MongoClient
from datetime import datetime as dt
from time import time
from langid import classify

# Dotted path of the journal ISSN inside a scienti product document; every
# lookup against the scienti collection goes through this field.
_ISSN_FIELD = "details.article.journal.TXT_ISSN_SEP"

# Layouts tried, in order, when parsing a product's DTA_CREACION value.
_DATE_FORMATS = ("%a, %d %b %Y %H:%M:%S %Z", "%Y-%m-%d %H:%M:%S")


def _parse_creation_date(value):
    """Return ``value`` as an integer Unix timestamp, or None if unparseable.

    Each layout in ``_DATE_FORMATS`` is tried in turn. A missing value
    (``None``) or a string matching none of the layouts yields ``None``
    instead of raising, so one malformed product cannot abort a whole run.
    """
    for layout in _DATE_FORMATS:
        try:
            # NOTE(review): strptime yields a naive datetime for these
            # layouts, so timestamp() interprets it in the machine's local
            # timezone -- confirm this is the intended reference.
            return int(dt.strptime(value, layout).timestamp())
        except (TypeError, ValueError, OverflowError, OSError):
            continue
    return None


def _first_article(reg):
    """Return ``(paper, journal)`` for the first article that has a journal.

    ``paper`` is the first element of a detail's "article" list and
    ``journal`` the first element of that paper's "journal" list.
    ``(None, None)`` is returned when no detail carries both.
    """
    for detail in reg.get("details") or []:
        articles = detail.get("article")
        if not articles:
            continue
        paper = articles[0]
        journals = paper.get("journal")
        if journals:
            return paper, journals[0]
    return None, None


def _merge_rankings(products, issn, by_rank):
    """Fold the classifications found in ``products`` into ``by_rank``.

    Parameters
    ----------
    products : iterable of dict
        Scienti product documents that reference the journal.
    issn : str
        ISSN stored in every ranking created here.
    by_rank : dict
        Maps a TPO_CLASIFICACION value to its ranking dict. Rankings already
        present are widened in place; missing ones are created and added.

    Returns
    -------
    list of dict
        Only the rankings created by this call, in first-seen order.
    """
    created = []
    for product in products:
        paper, journal = _first_article(product)
        if journal is None or "TPO_CLASIFICACION" not in journal:
            continue
        label = journal["TPO_CLASIFICACION"]
        stamp = _parse_creation_date(paper.get("DTA_CREACION"))
        ranking = by_rank.get(label)
        if ranking is None:
            ranking = {
                "from_date": stamp,
                "to_date": stamp,
                "rank": label,
                "issn": issn,
                "order": None,
                "source": "scienti"
            }
            by_rank[label] = ranking
            created.append(ranking)
        elif stamp is not None:
            # Widen the validity window on the ranking itself so the stored
            # document reflects the earliest and latest product seen.
            if ranking.get("from_date") is None or stamp < ranking["from_date"]:
                ranking["from_date"] = stamp
            if ranking.get("to_date") is None or stamp > ranking["to_date"]:
                ranking["to_date"] = stamp
    return created


class Kahi_scienti_sources(KahiBase):
    """Insert or update documents of the "sources" collection from scienti."""

    config = {}

    def __init__(self, config):
        """Connect to the target database described by ``config``.

        ``config`` must provide "database_url" and "database_name"; the
        "scienti_sources" list it carries is consumed later by ``run``.
        """
        self.config = config

        self.mongodb_url = config["database_url"]

        self.client = MongoClient(self.mongodb_url)

        self.db = self.client[config["database_name"]]
        self.collection = self.db["sources"]

        self.already_in_db = []

    def update_scienti(self, reg, entry, issn):
        """Refresh an existing source ``entry`` with scienti information.

        Parameters
        ----------
        reg : dict
            One scienti product that references the journal.
        entry : dict
            The source document already stored in ``self.collection``; it is
            modified in memory and then written back.
        issn : str
            ISSN used to find every product of the journal.

        The method returns without writing anything when ``reg`` has no
        journal. It is safe to run repeatedly: the type, the scienti id and
        the rankings are only added when they are not present yet.
        """
        # Keep a single, fresh scienti stamp at the end of the list.
        entry["updated"] = [
            upd for upd in entry["updated"] if upd["source"] != "scienti"]
        entry["updated"].append({"source": "scienti", "time": int(time())})

        _, journal = _first_article(reg)
        if not journal:
            return

        if "TPO_REVISTA" in journal:
            journal_type = {"source": "scienti", "type": journal["TPO_REVISTA"]}
            if journal_type not in entry["types"]:
                entry["types"].append(journal_type)
        if "COD_REVISTA" in journal:
            scienti_id = {"source": "scienti", "id": journal["COD_REVISTA"]}
            if scienti_id not in entry["external_ids"]:
                entry["external_ids"].append(scienti_id)

        # Index the scienti rankings already stored for this ISSN so they are
        # widened in place instead of being appended again on every run.
        by_rank = {}
        for ranking in entry["ranking"]:
            if ranking.get("source") == "scienti" and ranking.get("issn") == issn:
                by_rank.setdefault(ranking.get("rank"), ranking)

        # scienti_collection is already the products collection; indexing it
        # again would address a different (sub-)collection.
        created = _merge_rankings(
            self.scienti_collection.find({_ISSN_FIELD: issn}), issn, by_rank)

        self.collection.update_one({"_id": entry["_id"]}, {"$set": {
            "types": entry["types"],
            "external_ids": entry["external_ids"],
            "updated": entry["updated"],
            "ranking": entry["ranking"] + created
        }})

    def _insert_source(self, reg_scienti, issn):
        """Create a new source document from ``reg_scienti`` and insert it."""
        _, journal = _first_article(reg_scienti)
        if not journal:
            return
        # empty_source is not defined in this module; presumably inherited
        # from KahiBase.
        entry = self.empty_source()
        entry["updated"] = [{"source": "scienti", "time": int(time())}]
        name = journal["TXT_NME_REVISTA"]
        entry["names"] = [
            {"lang": classify(name)[0], "name": name, "source": "scienti"}]
        entry["external_ids"].append(
            {"source": "issn", "id": journal["TXT_ISSN_SEP"]})
        if "COD_REVISTA" in journal:
            entry["external_ids"].append(
                {"source": "scienti", "id": journal["COD_REVISTA"]})
        if "TPO_REVISTA" in journal:
            entry["types"].append(
                {"source": "scienti", "type": journal["TPO_REVISTA"]})
        if "editorial" in journal:
            entry["publisher"] = {
                "country_code": "",
                "name": journal["editorial"][0]["TXT_NME_EDITORIAL"]}
        entry["ranking"] = _merge_rankings(
            self.scienti_collection.find({_ISSN_FIELD: issn}), issn, {})
        self.collection.insert_one(entry)

    def process_scienti(self, config, verbose=0):
        """Process every journal referenced by one scienti dump.

        Parameters
        ----------
        config : dict
            Needs "database_url", "database_name" and "collection_name" of
            the scienti dump.
        verbose : int
            Each ISSN is echoed to stdout when this is greater than 4.

        Raises
        ------
        Exception
            When the database or the collection does not exist.

        The connection to the scienti server is closed before returning, so
        ``self.scienti_collection`` must not be used afterwards.
        """
        self.scienti_client = MongoClient(config["database_url"])
        try:
            if config["database_name"] not in self.scienti_client.list_database_names():
                raise Exception("Database {} not found".format(config["database_name"]))

            self.scienti_db = self.scienti_client[config["database_name"]]

            if config["collection_name"] not in self.scienti_db.list_collection_names():
                raise Exception("Collection {} not found".format(config["collection_name"]))

            self.scienti_collection = self.scienti_db[config["collection_name"]]
            for issn in self.scienti_collection.distinct(_ISSN_FIELD):
                if not issn:
                    # An empty ISSN cannot identify a journal; matching on it
                    # could update an unrelated source.
                    continue
                if verbose > 4:
                    print(issn)
                reg_scienti = self.scienti_collection.find_one({_ISSN_FIELD: issn})
                if not reg_scienti:
                    continue
                reg_db = self.collection.find_one({"external_ids.id": issn})
                if reg_db:
                    self.update_scienti(reg_scienti, reg_db, issn)
                else:
                    self._insert_source(reg_scienti, issn)
        finally:
            self.scienti_client.close()

    def run(self):
        """Process every dump listed under "scienti_sources"; return 0."""
        for config in self.config["scienti_sources"]:
            print("Processing {} database".format(config["database_name"]))
            self.process_scienti(config, verbose=5)
        return 0
+ +# ----------------------------------------------------------------------------- +# Minimal Python version sanity check (from IPython) +# ----------------------------------------------------------------------------- + +# See https://stackoverflow.com/a/26737258/2268280 +# sudo pip3 install twine +# python3 setup.py sdist bdist_wheel +# twine upload dist/* +# For test purposes +# twine upload --repository-url https://test.pypi.org/legacy/ dist/* + +from __future__ import print_function +from setuptools import setup, find_packages + +import os +import sys +import codecs + + +v = sys.version_info + + +def read(rel_path): + here = os.path.abspath(os.path.dirname(__file__)) + with codecs.open(os.path.join(here, rel_path), 'r') as fp: + return fp.read() + + +def get_version(rel_path): + for line in read(rel_path).splitlines(): + if line.startswith('__version__'): + delim = '"' if '"' in line else "'" + return line.split(delim)[1] + else: + raise RuntimeError("Unable to find version string.") + + +shell = False +if os.name in ('nt', 'dos'): + shell = True + warning = "WARNING: Windows is not officially supported" + print(warning, file=sys.stderr) + + +def main(): + setup( + # Application name: + name="Kahi_scienti_sources", + + # Version number (initial): + version=get_version('kahi_scienti_sources/_version.py'), + + # Application author details: + author="Colav", + author_email="colav@udea.edu.co", + + # Packages + packages=find_packages(exclude=['tests']), + + # Include additional files into the package + include_package_data=True, + + # Details + url="https://github.com/colav/Kahi_plugins", + # + license="BSD", + + description="Kahi plugin to insert or update sources from scienti", + + long_description=open("README.md").read(), + + long_description_content_type="text/markdown", + + # Dependent packages (distributions) + # put you packages here + install_requires=[ + 'kahi', + 'langid', + 'pymongo' + ], + ) + + +if __name__ == "__main__": + main() diff --git 
a/Kahi_template/kahi_template/_version.py b/Kahi_template/kahi_template/_version.py index e3f75fa..7b6ecff 100644 --- a/Kahi_template/kahi_template/_version.py +++ b/Kahi_template/kahi_template/_version.py @@ -1,5 +1,6 @@ # flake8: noqa __version__ = '0.0.1-alpha' + def get_version(): return __version__