Skip to content

Commit

Permalink
Merge pull request #331 from omazapa/main
Browse files Browse the repository at this point in the history
new Minciencias data
  • Loading branch information
omazapa authored Aug 20, 2024
2 parents adaa3ae + 5b99a85 commit 1b9ccf4
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -354,10 +354,9 @@ def process_openadata(self):
if self.verbose > 4:
print("Creating the aggregate for {} products.".format(
self.groups_production.count_documents({})))
categories = ['ART-00', 'ART-ART_A1', 'ART-ART_A2',
'ART-ART_B', 'ART-ART_C', 'ART-ART_D', 'ART-GC_ART']
pipeline = [
{'$match': {'id_tipo_pd_med': {'$in': categories}}},
# 0000000000 is a placeholder for a missing id_persona_pd; there is no record for it, so we can omit it
{'$match': {'id_persona_pd': {'$ne': '0000000000'}}},
{"$sort": {"ano_convo": -1}},
{'$group': {'_id': '$id_producto_pd', 'originalDoc': {'$first': '$$ROOT'}}},
{'$replaceRoot': {'newRoot': '$originalDoc'}},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def check_databases_and_collections(self):

# Check if collection exists
if collection_name not in db.list_collection_names():
raise ValueError(f"Collection {collection_name} in database {db_name} not found")
raise ValueError(
f"Collection {collection_name} in database {db_name} not found")

except ConnectionFailure:
raise ConnectionFailure("Failed to connect to MongoDB server.")
Expand All @@ -108,36 +109,91 @@ def process_opendata(self):
db = client[self.config["minciencias_opendata_works"]["database_name"]]
opendata = db[self.config["minciencias_opendata_works"]
["collection_name"]]
print("INFO: Creating indices")
opendata.create_index("id_producto_pd")
opendata.create_index("nme_tipologia_pd")
if self.task == "doi":
raise RuntimeError(
f'''{self.config["minciencias_opendata_works"]["task"]} is not a valid task for the minciencias_opendata database''')

categories = ['ART-00', 'ART-ART_A1', 'ART-ART_A2',
'ART-ART_B', 'ART-ART_C', 'ART-ART_D', 'ART-GC_ART']
# Bibliographic production requires a search in Elasticsearch;
# there will be a cut in openalex for those products.
biblio = ["Publicaciones editoriales no especializadas",
"Notas científica",
"Informe Final de Investigación",
"Capítulos de libro de investigación",
"Libros de investigación",
"Artículos de investigación",
"Libros de Formación",
"Libros",
"Tesis de doctorado",
"Capítulos de libro",
"Documento de trabajo",
"Tesis de pregrado",
"Informe técnico final",
"Artículos",
"Edicion",
"Manuales y Guías Especializadas",
"Boletín divulgativo de resultado de investigación",
"Libros de Divulgación de investigación y/o Compilación de Divulgación",
"Tesis de maestria",
"Generación de contenido impresa"]

pipeline = [
{'$match': {'id_tipo_pd_med': {'$in': categories}}},
{'$match': {"nme_producto_pd": {"$exists": True}}},
{'$match': {'nme_tipologia_pd': {'$in': biblio}}},
{'$group': {'_id': '$id_producto_pd', 'originalDoc': {'$first': '$$ROOT'}}},
{'$replaceRoot': {'newRoot': '$originalDoc'}}
]
paper_list = list(opendata.aggregate(pipeline, allowDiskUse=True))
print(
f"INFO: Processing bibliographic production {len(paper_list)} catgories {biblio}")
Parallel(
n_jobs=self.n_jobs,
verbose=self.verbose,
backend="threading")(
delayed(process_one)(
work,
self.db,
self.collection,
self.empty_work(),
self.es_handler,
insert_all=self.insert_all,
thresholds=self.thresholds,
verbose=self.verbose
) for work in paper_list
)

exclude = ["Evento científico", "Eventos artísticos, de arquitectura o de diseño con componentes de apropiación", "Eventos artísticos",
"Patente de invención", "Patente modelo de utilidad",
"Proyecto ID+I con Formación", "Proyecto de Investigacion y Desarrollo", "Proyecto de Investigación y Creación",
"Proyecto de extensión", "Proyecto de extensión y responsabilidad social en CTI"]
exclude.extend(biblio)
pipeline = [
{'$match': {"nme_producto_pd": {"$exists": True}}},
{'$match': {'nme_tipologia_pd': {'$nin': exclude}}},
{'$group': {'_id': '$id_producto_pd', 'originalDoc': {'$first': '$$ROOT'}}},
{'$replaceRoot': {'newRoot': '$originalDoc'}}
]

if self.task == "doi":
raise RuntimeError(
f'''{self.config["minciencias_opendata_works"]["task"]} is not a valid task for the minciencias_opendata database''')
else:
paper_list = list(opendata.aggregate(pipeline, allowDiskUse=True))
Parallel(
n_jobs=self.n_jobs,
verbose=self.verbose,
backend="threading")(
delayed(process_one)(
work,
self.db,
self.collection,
self.empty_work(),
self.es_handler,
insert_all=self.insert_all,
thresholds=self.thresholds,
verbose=self.verbose
) for work in paper_list
)
paper_list = list(opendata.aggregate(pipeline, allowDiskUse=True))
print(
f"INFO: Processing non-bibliographic production {len(paper_list)} excluding {exclude}")
Parallel(
n_jobs=self.n_jobs,
verbose=self.verbose,
backend="threading")(
delayed(process_one)(
work,
self.db,
self.collection,
self.empty_work(),
None,
insert_all=self.insert_all,
thresholds=self.thresholds,
verbose=self.verbose
) for work in paper_list
)
client.close()

def run(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ def process_one_update(openadata_reg, colav_reg, db, collection, empty_work, ver
for relation in rgroup["relations"]:
types = []
if "types" in relation.keys() and relation["types"]:
types = [rel["type"].lower() for rel in relation["types"]]
types = [rel["type"].lower()
for rel in relation["types"]]
if "education" in types:
if relation["id"] not in affs:
author["affiliations"].append(relation)
Expand Down Expand Up @@ -502,5 +503,7 @@ def process_one(openadata_reg, db, collection, empty_work, es_handler, insert_al
process_one_insert(
openadata_reg, db, collection, empty_work, es_handler, verbose)
else:
process_one_insert(
openadata_reg, db, collection, empty_work, es_handler, verbose)
if verbose > 4:
print("No elasticsearch index provided")

0 comments on commit 1b9ccf4

Please sign in to comment.