colav · muzgash · Aug 7, 2023 · Jul 31, 2023 · Jul 31, 2023 · Aug 1, 2023
diff --git a/Kahi_scienti_sources/LICENSE b/Kahi_scienti_sources/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2005-2020, Colav Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+       copyright notice, this list of conditions and the following
+       disclaimer in the documentation and/or other materials provided
+       with the distribution.
+
+    * Neither the name of the Colav Developers nor the names of any
+       contributors may be used to endorse or promote products derived
+       from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Kahi_scienti_sources/MANIFEST.in b/Kahi_scienti_sources/MANIFEST.in
@@ -0,0 +1,2 @@
+recursive-include kahi_scienti_sources/ *.py
+recursive-include kahi_scienti_sources/ *.*
diff --git a/Kahi_scienti_sources/README.md b/Kahi_scienti_sources/README.md
@@ -0,0 +1,66 @@
+<center><img src="https://raw.githubusercontent.com/colav/colav.github.io/master/img/Logo.png"/></center>
+
+# Kahi scienti sources plugin 
+Kahi will use this plugin to insert or update journal information from a scienti dump.
+
+# Description
+Plugin that reads the information from a scienti dump to insert or update journals in colav's database.
+
+# Installation
+You can download the repository from GitHub, go into the folder where setup.py is located, and run
+```shell
+pip3 install .
+```
+You can also install the released package by running
+```shell
+pip3 install kahi_scienti_sources
+```
+
+## Dependencies
+Software dependencies will automatically be installed when installing the plugin.
+The user must have at least one scienti database obtained from Minciencias, previously processed by [KayPacha](https://github.com/colav/KayPacha "KayPacha"), and loaded into a MongoDB instance.
+
+# Usage
+To use this plugin you must have kahi installed in your system and construct a yaml file such as
+```yaml
+config:
+  database_url: localhost:27017
+  database_name: kahi
+  log_database: kahi_log
+  log_collection: log
+workflow:
+  scienti_sources:
+    - database_url: localhost:27017
+      database_name: scienti_111
+      collection_name: products
+```
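+Where, for each entry under the scienti_sources task, database_url is the address of the MongoDB server holding the scienti dump, database_name is the name of that scienti database, and collection_name is the collection that contains the products.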
+
+If you have several scienti databases, use the YAML structure shown below
+```yaml
+config:
+  database_url: localhost:27017
+  database_name: kahi
+  log_database: kahi_log
+  log_collection: log
+workflow:
+  scienti_sources:
+    - database_url: localhost:27017
+      database_name: scienti_111
+      collection_name: products
+    - database_url: localhost:27017
+      database_name: scienti_uec_2022
+      collection_name: products
+    - database_url: localhost:27017
+      database_name: scienti_univalle_2022
+      collection_name: products
+```
+
+# License
+BSD-3-Clause License 
+
+# Links
+http://colav.udea.edu.co/
+
+
+
diff --git a/Kahi_scienti_sources/kahi_scienti_sources/Kahi_scienti_sources.py b/Kahi_scienti_sources/kahi_scienti_sources/Kahi_scienti_sources.py
@@ -0,0 +1,240 @@
+from kahi.KahiBase import KahiBase
+from pymongo import MongoClient
+from datetime import datetime as dt
+from time import time
+from langid import classify
+
+
+class Kahi_scienti_sources(KahiBase):
+
+    config = {}
+
+    def __init__(self, config):
+        self.config = config
+
+        self.mongodb_url = config["database_url"]
+
+        self.client = MongoClient(self.mongodb_url)
+
+        self.db = self.client[config["database_name"]]
+        self.collection = self.db["sources"]
+
+        self.already_in_db = []
+
+    def update_scienti(self, reg, entry, issn):
+        updated_scienti = False
+        for upd in entry["updated"]:
+            if upd["source"] == "scienti":
+                updated_scienti = True
+                entry["updated"].remove(upd)
+                entry["updated"].append(
+                    {"source": "scienti", "time": int(time())})
+                break
+        if not updated_scienti:
+            entry["updated"].append({"source": "scienti", "time": int(time())})
+        journal = None
+        for  detail in reg["details"]:
+            if "article" in detail.keys():
+                paper = detail["article"][0]
+                if "journal" in paper.keys():
+                    journal = paper["journal"][0]
+                    break
+        if not journal:
+            return
+        if "TPO_REVISTA" in journal.keys():
+            entry["types"].append(
+                {"source": "scienti", "type": journal["TPO_REVISTA"]})
+        entry["external_ids"].append(
+            {"source": "scienti", "id": journal["COD_REVISTA"]})
+
+        rankings_list = []
+        ranks = []
+        dates = [(rank["from_date"], rank["to_date"])
+                 for rank in entry["ranking"] if rank["source"] == "scienti"]
+        for reg_scienti in self.scienti_collection["products"].find({"details.article.journal.TXT_ISSN_SEP": issn}):
+            paper = None
+            journal = None
+            for  detail in reg_scienti["details"]:
+                if "article" in detail.keys():
+                    paper = detail["article"][0]
+                    if "journal" in paper.keys():
+                        journal = paper["journal"][0]
+                        break
+
+            if "TPO_CLASIFICACION" not in journal.keys():
+                continue
+            if not journal["TPO_CLASIFICACION"] in ranks:
+                ranking = {
+                    "from_date": int(dt.strptime(paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp()),
+                    "to_date": int(dt.strptime(paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp()),
+                    "rank": journal["TPO_CLASIFICACION"],
+                    "issn": issn,
+                    "order": None,
+                    "source": "scienti"
+                }
+                rankings_list.append(ranking)
+                ranks.append(journal["TPO_CLASIFICACION"])
+                dates_tuple = (
+                    int(dt.strptime(
+                        paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp()),
+                    int(dt.strptime(
+                        paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp())
+                )
+
+                dates.append(dates_tuple)
+            else:
+                idx = ranks.index(journal["TPO_CLASIFICACION"])
+                date1, date2 = dates[idx]
+
+                if date1 > int(dt.strptime(paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp()):
+                    date1 = int(dt.strptime(
+                        paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp())
+                if date2 < int(dt.strptime(paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp()):
+                    date2 = int(dt.strptime(
+                        paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp())
+                dates[idx] = (date1, date2)
+
+        self.collection.update_one({"_id": entry["_id"]}, {"$set": {
+            "types": entry["types"],
+            "external_ids": entry["external_ids"],
+            "updated": entry["updated"],
+            "ranking": entry["ranking"] + rankings_list
+        }})
+
+    def process_scienti(self, config, verbose=0):
+        self.scienti_client = MongoClient(config["database_url"])
+
+        if config["database_name"] not in self.scienti_client.list_database_names():
+            raise Exception("Database {} not found".format(config["database_name"]))
+
+        self.scienti_db = self.scienti_client[config["database_name"]]
+
+        if config["collection_name"] not in self.scienti_db.list_collection_names():
+            raise Exception("Collection {} not found".format(config["collection_name"]))
+
+        self.scienti_collection = self.scienti_db[config["collection_name"]]
+        for issn in self.scienti_collection.distinct("details.article.journal.TXT_ISSN_SEP"):
+            print(issn)
+            reg_db = self.collection.find_one({"external_ids.id": issn})
+            if reg_db:
+                reg_scienti = self.scienti_collection.find_one(
+                    {"details.article.journal.TXT_ISSN_SEP": issn})
+                if reg_scienti:
+                    self.update_scienti(reg_scienti, reg_db, issn)
+            else:
+                reg_scienti = self.scienti_collection.find_one(
+                    {"details.article.journal.TXT_ISSN_SEP": issn})
+                if reg_scienti:
+                    journal = None
+                    for  detail in reg_scienti["details"]:
+                        if "article" in detail.keys():
+                            paper = detail["article"][0]
+                            if "journal" in paper.keys():
+                                journal = paper["journal"][0]
+                                break
+                    if not journal:
+                        continue
+                    entry = self.empty_source()
+                    entry["updated"] = [
+                        {"source": "scienti", "time": int(time())}]
+                    lang = classify(journal["TXT_NME_REVISTA"])[0]
+                    entry["names"] = [
+                        {"lang": lang, "name": journal["TXT_NME_REVISTA"], "source": "scienti"}]
+                    entry["external_ids"].append(
+                        {"source": "issn", "id": journal["TXT_ISSN_SEP"]})
+                    entry["external_ids"].append(
+                        {"source": "scienti", "id": journal["COD_REVISTA"]})
+                    if "TPO_REVISTA" in journal.keys():
+                        entry["types"].append(
+                            {"source": "scienti", "type": journal["TPO_REVISTA"]})
+                    if "editorial" in journal.keys():
+                        entry["publisher"] = {
+                            "country_code": "", "name": journal["editorial"][0]["TXT_NME_EDITORIAL"]}
+                    rankings_list = []
+                    ranks = []
+                    dates = []
+                    for reg_scienti in self.scienti_collection.find({"details.article.journal.TXT_ISSN_SEP": issn}):
+                        paper = None
+                        journal = None
+                        for  detail in reg_scienti["details"]:
+                            if "article" in detail.keys():
+                                paper = detail["article"][0]
+                                if "journal" in paper.keys():
+                                    journal = paper["journal"][0]
+                                    break
+                        if "TPO_CLASIFICACION" not in journal.keys():
+                            continue
+                        if not journal["TPO_CLASIFICACION"] in ranks:
+                            try:
+                                from_date = int(dt.strptime(paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp())
+                                to_date = int(dt.strptime(paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp())
+                            except:
+                                try:
+                                    from_date = int(dt.strptime(paper["DTA_CREACION"], "%Y-%m-%d %H:%M:%S").timestamp())
+                                    to_date = int(dt.strptime(paper["DTA_CREACION"], "%Y-%m-%d %H:%M:%S").timestamp())
+                                except:
+                                    from_date = None
+                                    to_date = None
+                            ranking = {
+                                "from_date": from_date,
+                                "to_date": to_date,
+                                "rank": journal["TPO_CLASIFICACION"],
+                                "issn": issn,
+                                "order": None,
+                                "source": "scienti"
+                            }
+                            rankings_list.append(ranking)
+                            ranks.append(journal["TPO_CLASIFICACION"])
+                            try:
+                                dates_tuple = (
+                                    int(dt.strptime(
+                                        paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp()),
+                                    int(dt.strptime(
+                                        paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp())
+                                )
+                            except:
+                                try:
+                                    dates_tuple = (
+                                    int(dt.strptime(
+                                        paper["DTA_CREACION"], "%Y-%m-%d %H:%M:%S").timestamp()),
+                                    int(dt.strptime(
+                                        paper["DTA_CREACION"], "%Y-%m-%d %H:%M:%S").timestamp())
+                                )
+                                except:
+                                    dates_tuple = (
+                                        None,
+                                        None
+                                    )
+
+
+                            dates.append(dates_tuple)
+                        else:
+                            # if is already ranked but dates changed
+                            idx = ranks.index(journal["TPO_CLASIFICACION"])
+                            date1, date2 = dates[idx]
+                            try:
+                                if date1 > int(dt.strptime(paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp()):
+                                    date1 = int(dt.strptime(
+                                        paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp())
+                                if date2 < int(dt.strptime(paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp()):
+                                    date2 = int(dt.strptime(
+                                        paper["DTA_CREACION"], "%a, %d %b %Y %H:%M:%S %Z").timestamp())
+                            except:
+                                try:
+                                    if date1 > int(dt.strptime(paper["DTA_CREACION"], "%Y-%m-%d %H:%M:%S").timestamp()):
+                                        date1 = int(dt.strptime(
+                                            paper["DTA_CREACION"], "%Y-%m-%d %H:%M:%S").timestamp())
+                                    if date2 < int(dt.strptime(paper["DTA_CREACION"], "%Y-%m-%d %H:%M:%S").timestamp()):
+                                        date2 = int(dt.strptime(
+                                            paper["DTA_CREACION"], "%Y-%m-%d %H:%M:%S").timestamp())
+                                except:
+                                    pass
+                            dates[idx] = (date1, date2)
+                    entry["ranking"] = rankings_list
+                    self.collection.insert_one(entry)
+
+    def run(self):
+        for config in self.config["scienti_sources"]:
+            print("Processing {} database".format(config["database_name"]))
+            self.process_scienti(config, verbose=5)
+        return 0
diff --git a/Kahi_scienti_sources/kahi_scienti_sources/__init__.py b/Kahi_scienti_sources/kahi_scienti_sources/__init__.py
diff --git a/Kahi_scienti_sources/kahi_scienti_sources/_version.py b/Kahi_scienti_sources/kahi_scienti_sources/_version.py
@@ -0,0 +1,6 @@
+# flake8: noqa
+__version__ = '0.0.1-alpha'
+
+
+def get_version():
+    return __version__