diff --git a/macrostrat_db_insertion/example_request.json b/macrostrat_db_insertion/example_request.json deleted file mode 100644 index 1ec3c89..0000000 --- a/macrostrat_db_insertion/example_request.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "run_id": "run_2024-04-29_18:56:40.697006", - "extraction_pipeline_id": "0", - "model_name": "example_model", - "model_version" : "example_version", - "results": [ - { - "text": { - "preprocessor_id": "haystack_v0.0.2", - "paper_id": "652102064f5dbdaca31cdc9d", - "hashed_text": "c8c89ab45db09711df194fb7104ee90b9fd8ee7f673b52b87c5858f0636f64aa", - "weaviate_id": "efd46bbb-4c4d-4d96-92c3-7f07056c63d8", - "paragraph_text": "The Castelo dos Sonhos Formation is a relic of a sedimentary basin that likely formed near the coast, where sediments that were eroded from higher elevations accumulated in alluvial fans and, occasionally, aeolian dunes.\nFigure 4 shows a schematic column of the broad stratigraphy of the Castelo dos Sonhos Formation. Most of the formation consists of medium to coarse-grained, cross-bedded sandstones that are described locally as metamorphosed arenites.\nMost of the gold mineralization in the Castelo dos Sonhos Formation lies in a central band where the various conglomerate lithologies dominate. At the base and top of this band, the conglomerates are interlayered with arenites, which become more frequent as one moves away from the conglomeratic band, either downward into the older rocks (the lower arenite) or upward into the younger rocks (the upper arenite)." - }, - "relationships": [ - { - "src": "Castelo dos Sonhos Formation", - "relationship_type": "strat_name_to_lith", - "dst": "medium to coarse-grained, cross-bedded sandstones" - }, - { - "src": "arenites", - "relationship_type": "lith_to_lith_type", - "dst": "metamorphosed" - }, - { - "src": "conglomerates", - "relationship_type": "att_bedform", - "dst": "central band" - } - ] - } - ] -} diff --git a/macrostrat_db_insertion/example_requests/feedback_examples/0.json b/macrostrat_db_insertion/example_requests/feedback_examples/0.json new file mode 100644 index 0000000..a1db035 --- /dev/null +++ b/macrostrat_db_insertion/example_requests/feedback_examples/0.json @@ -0,0 +1,83 @@ +{ + "nodes": [ + { + "id": 392536, + "type": 4, + "name": "Tree", + "txt_range": [ + [ + 597, + 601 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392535, + "type": 4, + "name": "Gowganda", + "txt_range": [ + [ + 579, + 587 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": -1, + "type": 1, + "name": "mafic", + "txt_range": [ + [ + 305, + 310 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": -2, + "type": 2, + "name": "apparent", + "txt_range": [ + [ + 207, + 215 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": -3, + "type": 3, + "name": "feldspar", + "txt_range": [ + [ + 64, + 72 + ] + ], + "reasoning": null, + "match": null + } + ], + "edges": [ + { + "source": -1, + "dest": -2 + }, + { + "source": -2, + "dest": -3 + } + ], + "sourceTextId": 22938, + "supersedesRunIds": [ + 26718 + ] +} \ No newline at end of file diff --git a/macrostrat_db_insertion/example_requests/feedback_examples/1.json b/macrostrat_db_insertion/example_requests/feedback_examples/1.json new file mode 100644 index 0000000..d3cd168 --- /dev/null +++ b/macrostrat_db_insertion/example_requests/feedback_examples/1.json @@ -0,0 +1,278 @@ +{ + "nodes": [ + { + "id": 392239, + "type": 4, + "name": "mediumto", + "txt_range": [ + [ + 986, + 994 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392238, + "type": 4, + "name": "DR3", + "txt_range": [ + [ + 829, + 832 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392237, + "type": 4, + "name": "Fig.", + "txt_range": [ + [ + 764, + 768 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392235, + "type": 4, + "name": "Suczek", + "txt_range": [ + [ + 476, + 482 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392236, + "type": 10, + "name": "Dickinson et al.", + "txt_range": [ + [ + 492, + 508 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392233, + "type": 4, + "name": "Ingersoll et al.", + "txt_range": [ + [ + 360, + 376 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392232, + "type": 11, + "name": "Gazzi Dickinson", + "txt_range": [ + [ + 320, + 335 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392234, + "type": 8, + "name": "Dickinson", + "txt_range": [ + [ + 326, + 335 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392231, + "type": 4, + "name": "Villeneuve", + "txt_range": [ + [ + 238, + 248 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392240, + "type": 4, + "name": "DeCelles et al.", + "txt_range": [ + [ + 1039, + 1054 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392228, + "type": 4, + "name": "North", + "txt_range": [ + [ + 44, + 49 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392227, + "type": 1, + "name": "North American basement", + "txt_range": [ + [ + 44, + 67 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392229, + "type": 4, + "name": "Ross", + "txt_range": [ + [ + 217, + 221 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392219, + "type": 1, + "name": "Ga Gehrels", + "txt_range": [ + [ + 202, + 212 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392220, + "type": 2, + "name": "rocks", + "txt_range": [ + [ + 81, + 86 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 392226, + "type": 3, + "name": "feldspar", + "txt_range": [ + [ + 155, + 163 + ] + ], + "reasoning": null, + "match": { + "type": "lith_att", + "id": 96 + } + }, + { + "id": 392224, + "type": 3, + "name": "quartz", + "txt_range": [ + [ + 144, + 150 + ] + ], + "reasoning": null, + "match": { + "type": "lith_att", + "id": 94 + } + }, + { + "id": 392222, + "type": 3, + "name": "Sedimentary", + "txt_range": [ + [ + 69, + 80 + ] + ], + "reasoning": null, + "match": null + } + ], + "edges": [ + { + "source": 392235, + "dest": 392236 + }, + { + "source": 392232, + "dest": 392234 + }, + { + "source": 392227, + "dest": 392229 + }, + { + "source": 392219, + "dest": 392220 + }, + { + "source": 392220, + "dest": 392226 + }, + { + "source": 392220, + "dest": 392224 + }, + { + "source": 392220, + "dest": 392222 + } + ], + "sourceTextId": 22924, + "supersedesRunIds": [ + 26704 + ] + } \ No newline at end of file diff --git a/macrostrat_db_insertion/example_requests/feedback_examples/2.json b/macrostrat_db_insertion/example_requests/feedback_examples/2.json new file mode 100644 index 0000000..441a743 --- /dev/null +++ b/macrostrat_db_insertion/example_requests/feedback_examples/2.json @@ -0,0 +1,202 @@ +{ + "nodes": [ + { + "id": 391819, + "type": 2, + "name": "Jordan", + "txt_range": [ + [ + 62, + 68 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 391821, + "type": 3, + "name": "Pryor", + "txt_range": [ + [ + 73, + 78 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 391823, + "type": 1, + "name": "Single", + "txt_range": [ + [ + 311, + 317 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 391822, + "type": 2, + "name": "Miall", + "txt_range": [ + [ + 284, + 289 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 391824, + "type": 4, + "name": "Fig.", + "txt_range": [ + [ + 385, + 389 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 391825, + "type": 4, + "name": "Bristow", + "txt_range": [ + [ + 640, + 647 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 391826, + "type": 4, + "name": "Best", + "txt_range": [ + [ + 652, + 656 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 391815, + "type": 1, + "name": "Pryor", + "txt_range": [ + [ + 73, + 78 + ] + ], + "reasoning": null, + "match": { + "type": "strat_name", + "id": 72932 + } + }, + { + "id": 391816, + "type": 2, + "name": "maps", + "txt_range": [ + [ + 96, + 100 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 391818, + "type": 3, + "name": "sedimentary", + "txt_range": [ + [ + 84, + 95 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": -1, + "type": 2, + "name": "designate", + "txt_range": [ + [ + 338, + 347 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": -3, + "type": 11, + "name": "dynamic greatly", + "txt_range": [ + [ + 669, + 684 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": -2, + "type": 10, + "name": "meander belt", + "txt_range": [ + [ + 133, + 145 + ] + ], + "reasoning": null, + "match": null + } + ], + "edges": [ + { + "source": 391819, + "dest": 391821 + }, + { + "source": 391823, + "dest": 391822 + }, + { + "source": 391815, + "dest": 391816 + }, + { + "source": 391816, + "dest": 391818 + }, + { + "source": -3, + "dest": -2 + } + ], + "sourceTextId": 22900, + "supersedesRunIds": [ + 26680 + ] + } \ No newline at end of file diff --git a/macrostrat_db_insertion/example_requests/feedback_examples/2024-10-14-simple.json b/macrostrat_db_insertion/example_requests/feedback_examples/2024-10-14-simple.json deleted file mode 100644 index 27fecc5..0000000 --- a/macrostrat_db_insertion/example_requests/feedback_examples/2024-10-14-simple.json +++ /dev/null @@ -1,178 +0,0 @@ -{ - "nodes": [ - { - "id": 308167, - "type": 5, - "name": "soil", - "txt_range": [ - [ - 152, - 156 - ] - ], - "reasoning": null, - "match": null - }, - { - "id": 308168, - "type": 5, - "name": "sandy", - "txt_range": [ - [ - 135, - 140 - ] - ], - "reasoning": null, - "match": null - }, - { - "id": 308170, - "type": 2, - "name": "sandstone", - "txt_range": [ - [ - 0, - 9 - ] - ], - "reasoning": null, - "match": { - "type": "lith", - "id": 10 - } - }, - { - "id": -4, - "type": 5, - "name": "boulders", - "txt_range": [ - [ - 109, - 117 - ] - ], - "reasoning": null, - "match": null - }, - { - "id": -5, - "type": 5, - "name": "pebbles", - "txt_range": [ - [ - 122, - 129 - ] - ], - "reasoning": null, - "match": null - }, - { - "id": -6, - "type": 3, - "name": "yellowish-brown", - "txt_range": [ - [ - 11, - 26 - ] - ], - "reasoning": null, - "match": null - }, - { - "id": -1, - "type": 3, - "name": "fine-grained", - "txt_range": [ - [ - 41, - 53 - ] - ], - "reasoning": null, - "match": null - }, - { - "id": 308172, - "type": 3, - "name": "very fine", - "txt_range": [ - [ - 28, - 37 - ] - ], - "reasoning": null, - "match": { - "type": "lith_att", - "id": 44 - } - }, - { - "id": -2, - "type": 3, - "name": "limonite-stained", - "txt_range": [ - [ - 55, - 71 - ] - ], - "reasoning": null, - "match": null - }, - { - "id": -3, - "type": 3, - "name": "friable", - "txt_range": [ - [ - 73, - 80 - ] - ], - "reasoning": null, - "match": null - } - ], - "edges": [ - { - "source": 308167, - "dest": 308168 - }, - { - "source": 308170, - "dest": -4 - }, - { - "source": 308170, - "dest": -5 - }, - { - "source": 308170, - "dest": -6 - }, - { - "source": 308170, - "dest": -1 - }, - { - "source": 308170, - "dest": 308172 - }, - { - "source": 308170, - "dest": -2 - }, - { - "source": 308170, - "dest": -3 - } - ], - "sourceTextId": 25186, - "supersedesRunIds": [ - 21858 - ] -} diff --git a/macrostrat_db_insertion/example_requests/feedback_examples/3.json b/macrostrat_db_insertion/example_requests/feedback_examples/3.json new file mode 100644 index 0000000..9bc1f26 --- /dev/null +++ b/macrostrat_db_insertion/example_requests/feedback_examples/3.json @@ -0,0 +1,224 @@ +{ + "nodes": [ + { + "id": 388806, + "type": 1, + "name": "Toodoggone", + "txt_range": [ + [ + 962, + 972 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388807, + "type": 2, + "name": "conglomerates", + "txt_range": [ + [ + 833, + 846 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388811, + "type": 4, + "name": "SW", + "txt_range": [ + [ + 400, + 402 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388813, + "type": 4, + "name": "Cliff Creek", + "txt_range": [ + [ + 646, + 657 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388814, + "type": 4, + "name": "Dukes Ridge", + "txt_range": [ + [ + 659, + 670 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388817, + "type": 1, + "name": "Lawyers", + "txt_range": [ + [ + 771, + 778 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388816, + "type": 2, + "name": "AGB", + "txt_range": [ + [ + 684, + 687 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388815, + "type": 3, + "name": "Phoenix", + "txt_range": [ + [ + 672, + 679 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388818, + "type": 4, + "name": "E", + "txt_range": [ + [ + 2, + 3 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388819, + "type": 4, + "name": "NW NNW", + "txt_range": [ + [ + 1084, + 1090 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388810, + "type": 10, + "name": "Property", + "txt_range": [ + [ + 24, + 32 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388812, + "type": 4, + "name": "a series of steep to subvertical, 310 to 340 striking faults that dip SW or NW", + "txt_range": [ + [ + 330, + 408 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388809, + "type": 4, + "name": "the Property", + "txt_range": [ + [ + 20, + 32 + ] + ], + "reasoning": null, + "match": null + }, + { + "id": 388808, + "type": 1, + "name": "extension", + "txt_range": [ + [ + 146, + 155 + ] + ], + "reasoning": null, + "match": { + "type": "strat_name", + "id": 76118 + } + }, + { + "id": -1, + "type": 11, + "name": "Figure 7.6", + "txt_range": [ + [ + 73, + 83 + ] + ], + "reasoning": null, + "match": null + } + ], + "edges": [ + { + "source": 388806, + "dest": 388807 + }, + { + "source": 388817, + "dest": 388816 + }, + { + "source": 388816, + "dest": 388815 + }, + { + "source": 388810, + "dest": 388812 + } + ], + "sourceTextId": 22730, + "supersedesRunIds": [ + 26510 + ] + } \ No newline at end of file diff --git a/macrostrat_db_insertion/insert_into_server.py b/macrostrat_db_insertion/insert_into_server.py new file mode 100644 index 0000000..51fca1e --- /dev/null +++ b/macrostrat_db_insertion/insert_into_server.py @@ -0,0 +1,41 @@ +import requests +import json +import os +import time +import argparse +import traceback +import datetime + +def read_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_dir", type=str, required=True, help = "The path to the directory of results we want to upload to the server") + return parser.parse_args() + +def make_requests(): + request_url = "http://127.0.0.1:9543/record_run" + args = read_args() + input_dir_str = str(args.input_dir) + for file_name in os.listdir(args.input_dir): + if "json" not in file_name or file_name[0] == '.': + continue + + # Read in the current file + with open(os.path.join(args.input_dir, file_name), "r") as reader: + request_data = json.load(reader) + + # Make the request + if "feedback" not in input_dir_str: + request_data["run_id"] = "run_" + str(abs(hash(file_name))) + "_" + str(datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S.%f")) + try: + start_time = time.time() + response = requests.post(url = request_url, json = request_data) + response.raise_for_status() + time_taken = round(1000.0 * (time.time() - start_time)) + + print("Processed file", file_name, "sucessfully in", time_taken, "ms") + except: + print("FAIL: Request for file", file_name, "due to error", traceback.format_exc()) + break + +if __name__ == "__main__": + make_requests() \ No newline at end of file diff --git a/macrostrat_db_insertion/old_server.py b/macrostrat_db_insertion/old_server.py deleted file mode 100644 index 5890115..0000000 --- a/macrostrat_db_insertion/old_server.py +++ /dev/null @@ -1,417 +0,0 @@ -from flask import Flask, jsonify, request -from flask_cors import CORS -from flask_sqlalchemy import SQLAlchemy -import sqlalchemy -from sqlalchemy.dialects.postgresql import insert as INSERT_STATEMENT -from sqlalchemy import select as SELECT_STATEMENT -from sqlalchemy.orm import declarative_base -import json -import traceback -from datetime import datetime, timezone -import os - -from re_detail_adder import * - -def load_flask_app(): - # Create the app - app = Flask(__name__) - app.config["SQLALCHEMY_DATABASE_URI"] = os.environ['uri'] - - # Create the db - Base = declarative_base(metadata = sqlalchemy.MetaData(schema = os.environ['macrostrat_xdd_schema_name'])) - db = SQLAlchemy(model_class=Base) - db.init_app(app) - with app.app_context(): - db.reflect() - - return app, db - -# Connect to the database -MAX_TRIES = 5 -app, db = load_flask_app() -CORS(app) -re_processor = REProcessor("id_maps") - -ENTITY_TYPE_TO_ID_MAP = { - "strat_name" : "macrostrat_strat_id", - "lith" : "macrostrat_lith_id", - "lith_att" : "macrostrat_lith_att_id" -} -def get_db_entity_id(run_id, entity_name, entity_type, source_id): - # Create the entity value - entity_unique_rows = ["run_id", "entity_name", "entity_type", "source_id"] - entities_table = db.metadata.tables['macrostrat_kg_new.entities'] - entities_values = { - "run_id" : run_id, - "entity_name" : entity_name, - "entity_type" : entity_type, - "source_id" : source_id - } - - # Get the entity id - entity_id = re_processor.get_entity_id(entity_name, entity_type) - if entity_id != -1: - key_name = ENTITY_TYPE_TO_ID_MAP[entity_type] - entities_values[key_name] = entity_id - - # Try to create this entity - try: - entities_insert_statement = INSERT_STATEMENT(entities_table).values(**entities_values) - entities_insert_statement = entities_insert_statement.on_conflict_do_nothing(index_elements = entity_unique_rows) - db.session.execute(entities_insert_statement) - db.session.commit() - except: - error_msg = "Failed to insert entity " + entity_name + " for run " + str(run_id) + " due to error: " + traceback.format_exc() - return False, error_msg - - # Get this entity id - entity_id = "" - try: - # Execute the select query - entities_select_statement = SELECT_STATEMENT(entities_table) - entities_select_statement = entities_select_statement.where(entities_table.c.run_id == run_id) - entities_select_statement = entities_select_statement.where(entities_table.c.entity_name == entity_name) - entities_select_statement = entities_select_statement.where(entities_table.c.entity_type == entity_type) - entities_result = db.session.execute(entities_select_statement).all() - - # Ensure we got a result - if len(entities_result) == 0: - raise Exception("Got zero rows matching query " + str(entities_select_statement)) - - # Extract the sources id - first_row = entities_result[0]._mapping - entity_id = str(first_row["entity_id"]) - except: - error_msg = "Failed to get sources id for entity " + str(entity_name) - error_msg += " for run " + str(run_id) + " due to error: " + traceback.format_exc() - return False, error_msg - - return True, entity_id - -RELATIONSHIP_DETAILS = { - "strat" : ("strat_to_lith", "strat_name", "lith"), - "att" : ("lith_to_attribute", "lith", "lith_att") -} -def record_relationship(run_id, source_id, relationship): - # Verify the fields - expected_fields = ["src", "relationship_type", "dst"] - relationship_values = {} - for field in expected_fields: - if field not in relationship: - return False, "Request relationship missing field " + field - relationship_values[field] = relationship[field] - - # Extract the types - provided_relationship_type = relationship["relationship_type"] - db_relationship_type, src_entity_type, dst_entity_type = "", "", "" - for key_name in RELATIONSHIP_DETAILS: - if provided_relationship_type.startswith(key_name): - db_relationship_type, src_entity_type, dst_entity_type = RELATIONSHIP_DETAILS[key_name] - break - - # Ignore this type - if len(db_relationship_type) == 0 or len(src_entity_type) == 0 or len(dst_entity_type) == 0: - return True, "" - - # Get the entity ids - sucessful, src_entity_id = get_db_entity_id(run_id, relationship_values["src"], src_entity_type, source_id) - if not sucessful: - return sucessful, src_entity_id - - sucessful, dst_entity_id = get_db_entity_id(run_id, relationship_values["dst"], dst_entity_type, source_id) - if not sucessful: - return sucessful, dst_entity_id - - # Record the relationship - db_relationship_insert_values = { - "run_id" : run_id, - "src_entity_id" : src_entity_id, - "dst_entity_id" : dst_entity_id, - "source_id" : source_id, - "relationship_type" : db_relationship_type - } - unique_columns = ["run_id", "src_entity_id", "dst_entity_id", "relationship_type", "source_id"] - relationship_tables = db.metadata.tables['macrostrat_kg_new.relationship'] - try: - relationship_insert_statement = INSERT_STATEMENT(relationship_tables).values(**db_relationship_insert_values) - relationship_insert_statement = relationship_insert_statement.on_conflict_do_nothing(index_elements = unique_columns) - db.session.execute(relationship_insert_statement) - db.session.commit() - except: - error_msg = "Failed to insert relationship type " + str(provided_relationship_type) + " for source " + str(source_id) - error_msg += " for run " + str(run_id) + " due to error: " + traceback.format_exc() - return False, error_msg - - return True, "" - -def record_for_result(run_id, request): - # Ensure txt exists - if "text" not in request: - return False, "result is missing text field" - - source_fields = ["preprocessor_id", "paper_id", "hashed_text", "weaviate_id", "paragraph_text"] - source_values = {"run_id" : run_id} - text_data = request["text"] - for field_name in source_fields: - if field_name not in text_data: - return False, "Request text is missing field " + str(field_name) - source_values[field_name] = text_data[field_name] - - # Remove non ascii data from text - paragraph_txt = source_values["paragraph_text"] - source_values["paragraph_text"] = paragraph_txt.encode("ascii", errors="ignore").decode() - - sources_table = db.metadata.tables['macrostrat_kg_new.sources'] - try: - # Try to insert the sources - sources_insert_statement = INSERT_STATEMENT(sources_table).values(**source_values) - sources_insert_statement = sources_insert_statement.on_conflict_do_nothing(index_elements=["run_id", "weaviate_id"]) - db.session.execute(sources_insert_statement) - db.session.commit() - except: - error_msg = "Failed to insert paragraph " + str(source_values["weaviate_id"]) - error_msg += " for run " + str(source_values["run_id"]) + " due to error: " + traceback.format_exc() - return False, error_msg - - # Deal with case if we have no relationships - if "relationships" not in request: - return True, "" - - # Get the sources id - source_id = "" - try: - # Execute the select query - sources_select_statement = SELECT_STATEMENT(sources_table) - sources_select_statement = sources_select_statement.where(sources_table.c.run_id == run_id) - sources_select_statement = sources_select_statement.where(sources_table.c.weaviate_id == source_values["weaviate_id"]) - sources_result = db.session.execute(sources_select_statement).all() - - # Ensure we got a result - if len(sources_result) == 0: - raise Exception("Got zero rows matching query " + str(sources_select_statement)) - - # Extract the sources id - first_row = sources_result[0]._mapping - source_id = str(first_row["source_id"]) - except: - error_msg = "Failed to get sources id for paragraph " + str(source_values["weaviate_id"]) - error_msg += " for run " + str(source_values["run_id"]) + " due to error: " + traceback.format_exc() - return False, error_msg - - # Record the relationships - if "relationships" in request: - for relationship in request["relationships"]: - sucessful, message = record_relationship(run_id, source_id, relationship) - if not sucessful: - return sucessful, message - - # Record the entities - if "just_entities" in request: - required_entity_keys = ["entity", "entity_type"] - for entity_data in request["just_entities"]: - # Ensure that it has all the required keys - for key in required_entity_keys: - if key not in entity_data: - return False, "Provided just entities missing key " + str(key) - - # Only record strats - entity_type = entity_data["entity_type"] - if not entity_type.startswith("strat"): - continue - - # Record the entity - sucessful, entity_id = get_db_entity_id(run_id, entity_data["entity"], "strat_name", source_id) - if not sucessful: - return sucessful, entity_id - - return True, "" - -def get_user_id(user_name): - # Create the users rows - users_table = db.metadata.tables['macrostrat_kg_new.users'] - users_row_values = { - "user_name" : user_name - } - - # Try to create this user - try: - users_insert_statement = INSERT_STATEMENT(users_table).values(**users_row_values) - users_insert_statement = users_insert_statement.on_conflict_do_nothing(index_elements = ["user_name"]) - db.session.execute(users_insert_statement) - db.session.commit() - except: - error_msg = "Failed to insert user " + user_name + " due to error: " + traceback.format_exc() - return False, error_msg - - # Get this entity id - user_id = "" - try: - # Execute the select query - users_select_statement = SELECT_STATEMENT(users_table) - users_select_statement = users_select_statement.where(users_table.c.user_name == user_name) - users_result = db.session.execute(users_select_statement).all() - - # Ensure we got a result - if len(users_result) == 0: - raise Exception("Got zero rows matching query " + str(users_select_statement)) - - # Extract the sources id - first_row = users_result[0]._mapping - user_id = str(first_row["user_id"]) - except: - error_msg = "Failed to get id for user " + str(user_name) + " due to error: " + traceback.format_exc() - return False, error_msg - - return True, user_id - -def get_model_internal_details(request): - # Extract the expected values - expected_fields = ["model_name", "model_version"] - model_values = {} - for field in expected_fields: - if field not in request: - return False, "Request missing field " + field - model_values[field] = str(request[field]) - - # Try to insert the model - model_name = model_values["model_name"] - models_tables = db.metadata.tables['macrostrat_kg_new.models'] - try: - # Try to insert the model - model_insert_statement = INSERT_STATEMENT(models_tables).values(**{ - "model_name" : model_name - }) - model_insert_statement = model_insert_statement.on_conflict_do_nothing(index_elements=["model_name"]) - db.session.execute(model_insert_statement) - db.session.commit() - except: - error_msg = "Failed to insert model " + model_name + " due to error: " + traceback.format_exc() - return False, error_msg - - # Try to get the model id - data_to_return = {} - try: - # Execute the select query - models_select_statement = SELECT_STATEMENT(models_tables) - models_select_statement = models_select_statement.where(models_tables.c.model_name == model_name) - models_result = db.session.execute(models_select_statement).all() - - # Ensure we got a result - if len(models_result) == 0: - raise Exception("Got zero rows matching query " + str(models_select_statement)) - - # Extract the sources id - first_row = models_result[0]._mapping - data_to_return["internal_model_id"] = str(first_row["model_id"]) - except: - error_msg = "Failed to get id for model " + model_name + " due to error: " + traceback.format_exc() - return False, error_msg - - # Try to insert the model version - model_version = model_values["model_version"] - versions_table = db.metadata.tables['macrostrat_kg_new.model_versions'] - try: - # Try to insert the model version - version_insert_statement = INSERT_STATEMENT(versions_table).values(**{ - "model_id" : data_to_return["internal_model_id"], - "model_version" : model_version - }) - version_insert_statement = version_insert_statement.on_conflict_do_nothing(index_elements=["model_id", "model_version"]) - db.session.execute(version_insert_statement) - db.session.commit() - except: - error_msg = "Failed to insert version " + model_version + " for model " + model_name + " due to error: " + traceback.format_exc() - return False, error_msg - - # Try to get the model version - try: - # Execute the select query - version_select_statement = SELECT_STATEMENT(versions_table) - version_select_statement = version_select_statement.where(versions_table.c.model_id == data_to_return["internal_model_id"]) - version_select_statement = version_select_statement.where(versions_table.c.model_version == model_version) - version_result = db.session.execute(version_select_statement).all() - - # Ensure we got a result - if len(version_result) == 0: - raise Exception("Got zero rows matching query " + str(version_select_statement)) - - # Extract the sources id - first_row = version_result[0]._mapping - data_to_return["internal_version_id"] = str(first_row["version_id"]) - except: - error_msg = "Failed to get id for version " + model_version + " for model " + model_name + " due to error: " + traceback.format_exc() - return False, error_msg - - data_to_return["model_id"] = model_name + "_" + model_version - - return True, data_to_return - -def process_input_request(request_data): - # Get the metadata fields - metadata_fields = ["run_id", "extraction_pipeline_id"] - metadata_values = {} - for field_name in metadata_fields: - if field_name not in request_data: - return False, "Request data is missing field " + str(field_name) - metadata_values[field_name] = request_data[field_name] - - # Add the model fields to the metadata - sucessful, model_fields = get_model_internal_details(request_data) - if not sucessful: - return sucessful, model_fields - - for key_name in model_fields: - metadata_values[key_name] = model_fields[key_name] - - # Determine if this is user provided feedback - if "user_name" in request_data: - sucessful, user_id = get_user_id(request_data["user_name"]) - if not sucessful: - return sucessful, user_id - metadata_values["user_id"] = user_id - - # Insert this run to the metadata - try: - metadata_table = db.metadata.tables['macrostrat_kg_new.metadata'] - metadata_insert_statement = INSERT_STATEMENT(metadata_table).values(**metadata_values) - metadata_insert_statement = metadata_insert_statement.on_conflict_do_update(index_elements=["run_id"], set_ = { - "internal_model_id" : metadata_values["internal_model_id"], - "internal_version_id" : metadata_values["internal_version_id"] - }) - db.session.execute(metadata_insert_statement) - db.session.commit() - except Exception: - return False, "Failed to insert run " + str(metadata_values["run_id"]) + " due to error: " + traceback.format_exc() - - # Record the results - if "results" in request_data: - for result in request_data["results"]: - sucessful, error_msg = record_for_result(request_data["run_id"], result) - if not sucessful: - return sucessful, error_msg - - return True, "" - -@app.route("/record_run", methods=["POST"]) -def record_run(): - # Record the run - request_data = request.get_json() - print("Got request of", request_data) - ''' - sucessful, error_msg = process_input_request(request_data) - if not sucessful: - print("Returning error of", error_msg) - return jsonify({"error" : error_msg}), 400 - ''' - return jsonify({"sucess" : "Sucessfully processed the run"}), 200 - - -@app.route("/health", methods=["GET"]) -def health(): - """Health check endpoint""" - - return jsonify({"status": "Server Running"}), 200 - - -if __name__ == "__main__": - app.run(host = "0.0.0.0", port = 9543, debug = True) diff --git a/macrostrat_db_insertion/server.py b/macrostrat_db_insertion/server.py index 5241cef..5be998f 100644 --- a/macrostrat_db_insertion/server.py +++ b/macrostrat_db_insertion/server.py @@ -793,7 +793,7 @@ def get_internal_user_id(external_user_id : str, session : Session): except: return False, "Failed to insert user " + str(external_user_id) + " into table " + str(users_table_name) + " due to error: " + traceback.format_exc() -def is_valid_entity_type_id(entity_type_id, session : Session): +def get_entity_type_text(entity_type_id, session : Session): entity_type_table_name = get_complete_table_name("entity_type") entity_type_table = get_base().metadata.tables[entity_type_table_name] @@ -804,15 +804,17 @@ def is_valid_entity_type_id(entity_type_id, session : Session): entity_type_result = session.execute(entity_type_id_select_statement).all() # Return if this type is valid or not - return len(entity_type_result) > 0 + if len(entity_type_result) == 0: + return False, "Failed to get text for entity type id of " + str(entity_type_id) + + return True, entity_type_result[0]._mapping["name"] except: err_msg = "Failed to find type id " + str(entity_type_id) + " in table " + str(entity_type_table_name) + " due to error: " + traceback.format_exc() - print(err_msg) - return False + return False, err_msg def record_user_node_info(node_info, request_additional_data, session : Session): # Ensure that we have the required metadata fields - run_verify_result = verify_key_presents(node_info, ["id", "name"]) + run_verify_result = verify_key_presents(node_info, ["id", "type", "name"]) if run_verify_result is not None: return False, run_verify_result @@ -829,15 +831,17 @@ def record_user_node_info(node_info, request_additional_data, session : Session) # Record if the user provided the id for an entity type curr_entity_type_id = node_info["type"] request_additional_data.pop("curr_entity_type_id", None) - if "type" in node_info and is_valid_entity_type_id(curr_entity_type_id, session): - request_additional_data["curr_entity_type_id"] = curr_entity_type_id + success, entity_type_name = get_entity_type_text(curr_entity_type_id, session) + if not success: + return success, entity_type_name + request_additional_data["curr_entity_type_id"] = curr_entity_type_id success, new_node_id = get_entity_id(entity_name, None, request_additional_data, session, start_idx, end_idx) if not success: return success, new_node_id # Record the mapping - request_additional_data["node_id_mappings"][old_node_id] = new_node_id + request_additional_data["node_id_mappings"][old_node_id] = (new_node_id, entity_type_name) return True, None def record_user_relationship_info(curr_edge, request_additional_data, session : Session): @@ -851,14 +855,15 @@ def record_user_relationship_info(curr_edge, request_additional_data, session : old_source_id = curr_edge["source"] if old_source_id not in node_mappings: return False, "Don't have mapping for source node " + str(old_source_id) - new_source_id = node_mappings[old_source_id] + new_source_id, source_type = node_mappings[old_source_id] old_dest_id = curr_edge["dest"] if old_dest_id not in node_mappings: return False, "Don't have mapping for dest node " + str(old_dest_id) - new_dest_id = node_mappings[old_dest_id] + new_dest_id, dest_type = node_mappings[old_dest_id] - return insert_relationship(new_source_id, new_dest_id, None, request_additional_data, session) + relationship_type = str(source_type) + "_to_" + str(dest_type) + return insert_relationship(new_source_id, new_dest_id, relationship_type, request_additional_data, session) def process_user_feedback_input_request(request_data, session): # Ensure that we have the required metadata fields @@ -938,12 +943,13 @@ async def record_run( # Record the run successful, error_msg = False, "Request is not a model or feedback run" - if "run_id" in request_data: - successful, error_msg = process_model_input_request(request_data, session) - elif "sourceTextId" in request_data: + if "sourceTextId" in request_data: successful, error_msg = process_user_feedback_input_request(request_data, session) + elif "run_id" in request_data: + successful, error_msg = process_model_input_request(request_data, session) if not successful: + print("Returning error message", error_msg) raise HTTPException(status_code=400, detail=error_msg) return JSONResponse(content={"success": "Successfully processed the run"}) diff --git a/retraining_runner/feedback_puller.py b/retraining_runner/feedback_puller.py new file mode 100644 index 0000000..dbbe328 --- /dev/null +++ b/retraining_runner/feedback_puller.py @@ -0,0 +1,87 @@ +from sqlalchemy import create_engine, inspect, MetaData +from sqlalchemy.ext.automap import automap_base +from sqlalchemy.orm import sessionmaker, declarative_base +from sqlalchemy import select as SELECT_STATEMENT +import argparse + +def read_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--uri", type=str, required=True, help="The URI to use to connect to the database") + parser.add_argument("--schema", type=str, required=True, help="The schema to connect to") + return parser.parse_args() + +def load_sqlalchemy(args): + # Create the engine + engine = create_engine(args.uri) + metadata = MetaData(schema = args.schema) + metadata.reflect(bind=engine) + Session = sessionmaker(bind=engine) + current_session = Session() + + return { + "engine" : engine, + "metadata" : metadata, + "session" : current_session, + "schema" : args.schema + } + +def get_complete_table_name(connection_details, table_name): + return connection_details["schema"] + "." + table_name + +def get_all_user_runs(connection_details): + # Create the query to get all the user runs + runs_table_name = get_complete_table_name(connection_details, "all_runs") + runs_table = connection_details["metadata"].tables[runs_table_name] + users_run_select_statement = SELECT_STATEMENT(runs_table) + users_run_select_statement = users_run_select_statement.where(runs_table.c.user_id != None) + + # Run the query and get the results + users_run_result = connection_details["session"].execute(users_run_select_statement).all() + all_user_runs = [] + for current_row in users_run_result: + current_row = current_row._mapping + all_user_runs.append((current_row["id"], current_row["source_text_id"])) + + return all_user_runs + +def get_user_run_relationships(connection_details, save_dir, run_id, source_text_id): + # Load the source text + texts_table_name = get_complete_table_name(connection_details, "source_text") + texts_table = connection_details["metadata"].tables[texts_table_name] + text_select_statement = SELECT_STATEMENT(texts_table) + text_select_statement = text_select_statement.where(texts_table.c.id == source_text_id) + + text_select_result = connection_details["session"].execute(text_select_statement).all() + if len(text_select_result) == 0: + raise Exception("Can't find text for source id " + str(source_text_id)) + + source_text = text_select_result[0]._mapping["paragraph_text"] + print(source_text_id, source_text) + + # Extract the relationship + relationship_table_name = get_complete_table_name(connection_details, "relationship") + relationship_table = connection_details["metadata"].tables[relationship_table_name] + relationship_select_statement = SELECT_STATEMENT(relationship_table) + relationship_select_statement = relationship_select_statement.where(relationship_table.c.run_id == run_id) + + all_relationships = connection_details["session"].execute(relationship_select_statement).all() + for curr_relationship in all_relationships: + print(curr_relationship._mapping) + break + +def main(): + # Load the schema + args = read_args() + connection_details = load_sqlalchemy(args) + + # Get all of the user runs + save_dir = "extracted_feedback" + all_user_runs = get_all_user_runs(connection_details) + for run_id, source_text_id in all_user_runs: + get_user_run_relationships(connection_details, save_dir, run_id, source_text_id) + break + + connection_details["session"].close() + +if __name__ == "__main__": + main() \ No newline at end of file