diff --git a/app.py b/app.py index f8d16b60..86aef026 100644 --- a/app.py +++ b/app.py @@ -65,6 +65,7 @@ from profiles.Profile import Profile from profiles.ProfileFactory import ( ProfileFactory, + PROFILES, find_conformsto_subkg, load_profiles, update_profiles, @@ -310,6 +311,9 @@ def update_vocab_status(): prod_logger.info("Updating banner status") +profiles = PROFILES + + @app.context_processor def display_vocab_status(): global DICT_BANNER_INFO @@ -620,6 +624,72 @@ def get(self): return inspect_onto_reg(kg, False) +def suggest_profile(kg): + entities = util.list_all_instances(kg) + results = {} + final_results = [] + + for e in entities: + sub_kg = ConjunctiveGraph() + for s, p, o in kg.triples((e, None, None)): + sub_kg.add((s, p, o)) + + has_matching_profile = False + for p_name in profiles.keys(): + profile = profiles[p_name] + sim = profile.compute_similarity(sub_kg) + # sim = profile.compute_loose_similarity(kg) + results[p_name] = {"score": sim, "ref": profile.get_name()} + if sim > 0: + # print(f"closests_profile({e},{p_name}) = {sim}") + has_matching_profile = True + + sorted_results = dict( + sorted( + results.items(), + key=lambda item: item[1]["score"], + reverse=True, + ) + ) + + if has_matching_profile: + for hit in sorted_results.keys(): + if sorted_results[hit]["score"] > 0: + final_results.append( + { + "entity": str(e), + "profile_name": sorted_results[hit]["ref"], + "score": sorted_results[hit]["score"], + "profile_url": hit, + } + ) + res_sorted = sorted(final_results, key=lambda item: item["score"], reverse=True) + return res_sorted + + +@fc_inspect_namespace.route("/suggest_profile") +class SuggestBioschemasProfile(Resource): + @fc_inspect_namespace.expect(reqparse) + def get(self): + """Validate an RDF JSON-LD graph against Bioschemas profiles""" + args = reqparse.parse_args() + url = args["url"] + + eval = Evaluation() + eval.set_start_time() + eval.set_target_uri(url) + eval.set_reason("profile recommendation") + + web_res = 
WebResource(url) + kg = web_res.get_rdf() + results = suggest_profile(kg) + + eval.set_end_time() + eval.persist(source="API") + + return results + + # TODO update method @fc_inspect_namespace.route("/bioschemas_validation") class InspectBioschemas(Resource): @@ -1488,121 +1558,6 @@ def handle_complete_kg(json): print("completing KG for " + str(json["url"])) -# def inspect_onto_reg(kg, is_inspect_ui): -# query_classes = """ -# SELECT DISTINCT ?class { ?s rdf:type ?class } ORDER BY ?class -# """ -# query_properties = """ -# SELECT DISTINCT ?prop { ?s ?prop ?o } ORDER BY ?prop -# """ - -# table_content = { -# "classes": [], -# "classes_false": [], -# "properties": [], -# "properties_false": [], -# "done": False, -# } -# qres = kg.query(query_classes) -# for row in qres: -# namespace = urlparse(row["class"]).netloc -# class_entry = {} - -# if namespace == "bioschemas.org": -# class_entry = { -# "name": row["class"], -# "tag": { -# "OLS": None, -# "LOV": None, -# "BioPortal": None, -# "Bioschemas": True, -# }, -# } -# else: -# class_entry = { -# "name": row["class"], -# "tag": {"OLS": None, "LOV": None, "BioPortal": None}, -# } - -# table_content["classes"].append(class_entry) - -# qres = kg.query(query_properties) -# for row in qres: -# namespace = urlparse(row["prop"]).netloc -# property_entry = {} - -# if namespace == "bioschemas.org": -# property_entry = { -# "name": row["prop"], -# "tag": { -# "OLS": None, -# "LOV": None, -# "BioPortal": None, -# "Bioschemas": True, -# }, -# } -# else: -# property_entry = { -# "name": row["prop"], -# "tag": {"OLS": None, "LOV": None, "BioPortal": None}, -# } - -# table_content["properties"].append(property_entry) - -# if is_inspect_ui: -# emit("done_check", table_content) - -# for c in table_content["classes"]: - -# c["tag"]["OLS"] = util.ask_OLS(c["name"]) -# if is_inspect_ui: -# emit("done_check", table_content) - -# c["tag"]["LOV"] = util.ask_LOV(c["name"]) -# if is_inspect_ui: -# emit("done_check", table_content) - -# 
c["tag"]["BioPortal"] = util.ask_BioPortal(c["name"], "class")
-# if is_inspect_ui:
-# emit("done_check", table_content)

-# all_false_rule = [
-# c["tag"]["OLS"] == False,
-# c["tag"]["LOV"] == False,
-# c["tag"]["BioPortal"] == False,
-# ]

-# if all(all_false_rule) and not "Bioschemas" in c["tag"]:
-# table_content["classes_false"].append(c["name"])

-# for p in table_content["properties"]:

-# p["tag"]["OLS"] = util.ask_OLS(p["name"])
-# if is_inspect_ui:
-# emit("done_check", table_content)

-# p["tag"]["LOV"] = util.ask_LOV(p["name"])
-# if is_inspect_ui:
-# emit("done_check", table_content)

-# p["tag"]["BioPortal"] = util.ask_BioPortal(p["name"], "property")
-# if is_inspect_ui:
-# emit("done_check", table_content)

-# all_false_rule = [
-# p["tag"]["OLS"] == False,
-# p["tag"]["LOV"] == False,
-# p["tag"]["BioPortal"] == False,
-# ]
-# if all(all_false_rule) and not "Bioschemas" in p["tag"]:
-# table_content["properties_false"].append(p["name"])

-# table_content["done"] = True
-# if is_inspect_ui:
-# emit("done_check", table_content)
-# return table_content

-
 @socketio.on("check_kg")
 def check_vocabularies(data):
 step = 0
@@ -1727,12 +1682,16 @@ def validate_bioschemas():
 uri = request.args.get("url")
 logging.info(f"Validating Bioschemas markup for {uri}")
- kg = WebResource(uri).get_rdf()
- print(len(kg))
+ eval = Evaluation()
+ eval.set_start_time()
+ eval.set_target_uri(uri)
+ eval.set_reason("bioschemas validation")
+ kg = WebResource(uri).get_rdf()
 results = evaluate_bioschemas_profiles(kg)
- # res, kg = validate_any_from_microdata(input_url=uri)
+ eval.set_end_time()
+ eval.persist(source="UI")
 m = []
 return render_template(
@@ -1747,6 +1706,31 @@ def validate_bioschemas():
 )
+@app.route("/suggest_profile")
+def recommend_profile():
+ url = request.args.get("url")
+
+ eval = Evaluation()
+ eval.set_start_time()
+ eval.set_target_uri(url)
+ eval.set_reason("profile recommendation")
+
+ web_res = WebResource(url)
+ kg = web_res.get_rdf()
+ 
results = suggest_profile(kg) + + eval.set_end_time() + eval.persist(source="UI") + + return render_template( + "profile_reco.html", + title="Which profile should I use ?", + subtitle="suggests the most relevant metadata profiles (beta feature)", + results=results, + url=url, + ) + + ####################################### ####################################### diff --git a/metrics/WebResource.py b/metrics/WebResource.py index 41eb640b..4effcbef 100644 --- a/metrics/WebResource.py +++ b/metrics/WebResource.py @@ -228,11 +228,11 @@ def __init__(self, url, rdf_graph=None) -> None: self.wr_dataset, "http://www.w3.org/1999/xhtml/vocab#" ) - print("HTML: " + str(len(self.kg_html))) - print("LINKS HEADERS: " + str(len(self.kg_links_header))) - print("AUTO: " + str(len(self.kg_auto))) - print("FORMATS_GUESSING: " + str(len(self.kg_brut))) - print("HTML LINKS: " + str(len(self.kg_links_html))) + # print("HTML: " + str(len(self.kg_html))) + # print("LINKS HEADERS: " + str(len(self.kg_links_header))) + # print("AUTO: " + str(len(self.kg_auto))) + # print("FORMATS_GUESSING: " + str(len(self.kg_brut))) + # print("HTML LINKS: " + str(len(self.kg_links_html))) else: self.rdf = rdf_graph diff --git a/metrics/util.py b/metrics/util.py index a7a8afbe..c45d28a1 100644 --- a/metrics/util.py +++ b/metrics/util.py @@ -1,6 +1,6 @@ # from time import time # from SPARQLWrapper import SPARQLWrapper, N3 -from rdflib import Graph, ConjunctiveGraph, URIRef +from rdflib import Graph, ConjunctiveGraph, URIRef, RDF import requests import metrics.statistics as stats @@ -776,3 +776,14 @@ def gen_usage_statistics(): with open("data/usage_stats.json", "w") as outfile: json.dump(stats_dict, outfile) logging.info("Saved stats") + + +def list_all_instances(kg): + # + # list all typed entities in a knowledge graph + # + subjects = [] + for s, p, o in kg.triples((None, RDF.type, None)): + # print(f"{s} is a {o}") + subjects.append(s) + return subjects diff --git a/profile_recomender.py 
b/profile_recomender.py deleted file mode 100644 index fc44ac39..00000000 --- a/profile_recomender.py +++ /dev/null @@ -1,138 +0,0 @@ -from rdflib import ConjunctiveGraph -from rich.console import Console -from rich.table import Table - -# from rich.text import Text -from argparse import ArgumentParser, RawTextHelpFormatter - -from profiles.ProfileFactory import ProfileFactory - -import sys -import time - - -parser = ArgumentParser( - description=""" -profile_recommender helps you in finding the most relevant Bioschemas profile. - -Usage examples : - python profile_recommender.py -url http://bio.tools/jaspar - -Please report any issue to alban.gaignard@univ-nantes.fr -""", - formatter_class=RawTextHelpFormatter, -) - -# parser.add_argument( -# "-u", -# "--update", -# help="download or update the EDAM ontology", -# ) - -parser.add_argument( - "-u", - "--urls", - metavar="urls", - type=str, - nargs="+", - help="input urls", -) - -parser.add_argument( - "-f", - "--files", - metavar="files", - type=str, - nargs="+", - help="input files", -) - - -if __name__ == "__main__": - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - - args = parser.parse_args() - console = Console() - - s1 = time.time() - profiles = ProfileFactory.create_all_profiles() - ts1 = round((time.time() - s1), 2) - console.print(f"{len(profiles)} Bioschemas profiles loaded in {ts1} s") - - if args.urls: - from metrics.WebResource import WebResource - - for url in args.urls: - console.print(f"Which profile is relevant for {url} ?") - web_res = WebResource(url) - kg = web_res.get_rdf() - console.print(f"{len(kg)} loaded RDF triples") - - results = {} - - for p_name in profiles.keys(): - profile = profiles[p_name] - sim = profile.compute_similarity(kg) - results[p_name] = {"score": sim, "ref": ""} - - sorted_results = dict( - sorted(results.items(), key=lambda item: item[1]["score"], reverse=True) - ) - # print(sorted_results) - - table = Table(show_header=True, header_style="bold 
magenta") - table.add_column("Profile", justify="left") - table.add_column("Similarity score", justify="right") - table.add_column("Profile URI", justify="right", style="green") - - for hit in sorted_results.keys(): - table.add_row( - str(hit), - str(sorted_results[hit]["score"]), - f"[link={sorted_results[hit]['ref']}]{sorted_results[hit]['ref']}[/link]", - ) - - console.rule(f"[bold red]Relevent Bioschemas profile for {url}") - console.print(table) - console.print() - - if args.files: - for file in args.files: - console.print(f"Which profile is relevant for {file} ?") - - kg = ConjunctiveGraph() - # kg.parse(file, format="turtle") - kg.parse(file) - console.print(f"{len(kg)} loaded RDF triples") - - results = {} - - for p_name in profiles.keys(): - profile = profiles[p_name] - sim = profile.compute_similarity(kg) - # sim = profile.compute_loose_similarity(kg) - results[p_name] = {"score": sim, "ref": ""} - - sorted_results = dict( - sorted(results.items(), key=lambda item: item[1]["score"], reverse=True) - ) - # print(sorted_results) - - table = Table(show_header=True, header_style="bold magenta") - table.add_column("Profile", justify="left") - table.add_column("Similarity score", justify="right") - table.add_column("Profile URI", justify="right", style="green") - - for hit in sorted_results.keys(): - table.add_row( - str(hit), - str(sorted_results[hit]["score"]), - # str({sorted_results[hit]["ref"]}) - f"[link={sorted_results[hit]['ref']}]{sorted_results[hit]['ref']}[/link]", - ) - - console.rule(f"[bold red]Relevent Bioschemas profile for {file}") - console.print(table) - console.print() diff --git a/profile_recommender.py b/profile_recommender.py new file mode 100644 index 00000000..935031bd --- /dev/null +++ b/profile_recommender.py @@ -0,0 +1,133 @@ +import eventlet + +eventlet.monkey_patch(select=False) + +from rdflib import ConjunctiveGraph, RDF +from rich.console import Console +from rich.table import Table + +# from rich.text import Text +from argparse 
import ArgumentParser, RawTextHelpFormatter
+from profiles.ProfileFactory import ProfileFactory, load_profiles
+
+import sys
+import time
+
+
+parser = ArgumentParser(
+ description="""
+profile_recommender helps you in finding the most relevant Bioschemas profile.
+
+Usage examples :
+ python profile_recommender.py --url http://www.cathdb.info
+
+Please report any issue to alban.gaignard@univ-nantes.fr
+""",
+ formatter_class=RawTextHelpFormatter,
+)
+
+parser.add_argument(
+ "-u",
+ "--urls",
+ metavar="urls",
+ type=str,
+ nargs="+",
+ help="input urls",
+)
+
+# parser.add_argument(
+# "-f",
+# "--files",
+# metavar="files",
+# type=str,
+# nargs="+",
+# help="input files",
+# )
+
+
+def list_all_instances(kg):
+ subjects = []
+ for s, p, o in kg.triples((None, RDF.type, None)):
+ # print(f"{s} is a {o}")
+ subjects.append(s)
+ return subjects
+
+
+if __name__ == "__main__":
+ if len(sys.argv) == 1:
+ parser.print_help(sys.stderr)
+ sys.exit(1)
+
+ args = parser.parse_args()
+ console = Console()
+
+ s1 = time.time()
+ profiles = ProfileFactory.create_all_profiles_from_specifications()
+ ts1 = round((time.time() - s1), 2)
+ console.print(f"{len(profiles)} Bioschemas profiles loaded in {ts1} s")
+
+ if args.urls:
+ from metrics.WebResource import WebResource
+
+ for url in args.urls:
+ results = {}
+
+ console.print(f"Which profile is relevant for {url} ?")
+ web_res = WebResource(url)
+ kg = web_res.get_rdf()
+ console.print(f"{len(kg)} loaded RDF triples")
+ entities = list_all_instances(kg)
+
+ console.print(f"Iterating over {len(entities)} typed entities")
+ table = Table(show_header=True, header_style="bold magenta")
+ table.add_column("Entity", justify="left")
+ table.add_column("Profile name", justify="left")
+ table.add_column("Similarity score", justify="right")
+ table.add_column("Profile URI", justify="right", style="green")
+
+ for e in entities:
+ # print(e)
+ # print(type(e))
+ sub_kg = ConjunctiveGraph()
+ for s, p, o in kg.triples((e, None, None)):
+ 
sub_kg.add((s, p, o)) + + has_matching_profile = False + for p_name in profiles.keys(): + profile = profiles[p_name] + sim = profile.compute_similarity(sub_kg) + # sim = profile.compute_loose_similarity(kg) + results[p_name] = {"score": sim, "ref": profile.get_name()} + if sim > 0: + has_matching_profile = True + + sorted_results = dict( + sorted( + results.items(), + key=lambda item: item[1]["score"], + reverse=True, + ) + ) + + final_results = [] + if has_matching_profile: + for hit in sorted_results.keys(): + if sorted_results[hit]["score"] > 0: + final_results.append( + ( + str(str(e)), + f"[link={sorted_results[hit]['ref']}]{sorted_results[hit]['ref']}[/link]", + str(sorted_results[hit]["score"]), + str(hit), + ) + ) + table.add_row( + str(str(e)), + f"[link={sorted_results[hit]['ref']}]{sorted_results[hit]['ref']}[/link]", + str(sorted_results[hit]["score"]), + str(hit), + ) + + console.rule(f"[bold red]Relevent Bioschemas profile for {url}") + console.print(table) + console.print() diff --git a/profiles/Profile.py b/profiles/Profile.py index 0aa5ca4e..a4ab594b 100755 --- a/profiles/Profile.py +++ b/profiles/Profile.py @@ -14,7 +14,6 @@ class Profile: - # TODO doc class # TODO getters for class attributes @@ -146,7 +145,6 @@ def gen_SHACL_from_profile(self): return shape def validate_shape(self, knowledge_graph, shacl_shape): - r = validate( data_graph=knowledge_graph, data_graph_format="turtle", @@ -188,7 +186,6 @@ def validate_shape(self, knowledge_graph, shacl_shape): warnings = [] errors = [] for r in results: - if "#Warning" in r["severity"]: # print( # f'WARNING: Property {r["path"]} should be provided for {r["node"]}' @@ -221,7 +218,7 @@ def match_sub_kgs_from_profile(self, kg): # print(kg.serialize(format="trig")) # for s, p, o in kg.triples((None, RDF.type, None)): - for (s, p, o, g) in kg.quads((None, RDF.type, None, None)): + for s, p, o, g in kg.quads((None, RDF.type, None, None)): # print(o) # print(o.n3(kg.namespace_manager)) if 
o.n3(kg.namespace_manager).replace("scs:", "sc:") in self.target_classes: @@ -234,7 +231,6 @@ def match_sub_kgs_from_profile(self, kg): ) for x, y, z, g in kg.quads((s, None, None, None)): - # print(f"{x} -> {y} -> {z} -> {g}") # print(i) sub_kg.add((x, y, z)) @@ -247,8 +243,7 @@ def compute_similarity(self, kg) -> float: kg.namespace_manager.bind("sc", URIRef("http://schema.org/")) kg.namespace_manager.bind("bsc", URIRef("https://bioschemas.org/")) kg.namespace_manager.bind("dct", URIRef("http://purl.org/dc/terms/")) - - # print(len(kg)) + # print(str(self.get_name()) + " targeting -> " + str(self.get_target())) # print(kg.serialize(format="turtle")) results = {} @@ -256,15 +251,13 @@ def compute_similarity(self, kg) -> float: # list classes for s, p, o in kg.triples((None, RDF.type, None)): # print() - # print(f"{s.n3(kg.namespace_manager)} is a {o.n3(kg.namespace_manager)}") # print(bs_profiles.keys()) # print(o.n3(kg.namespace_manager)) + # print(self.target_classes) if o.n3(kg.namespace_manager) in self.target_classes: # print() print(f"Trying to validate {s} as a(n) {o} resource") - shacl_shape = self.gen_SHACL_from_profile( - # o.n3(kg.namespace_manager) - ) + shacl_shape = self.gen_SHACL_from_profile() sub_kg = ConjunctiveGraph() for x, y, z in kg.triples((s, None, None)): @@ -278,12 +271,15 @@ def compute_similarity(self, kg) -> float: # print(f"{len(errors)} / {self.nb_min}") # print(f"{len(warnings)} / {self.nb_rec}") - max_points = 2 * self.nb_min + self.nb_rec + weight = 20 + + max_points = weight * self.nb_min + self.nb_rec similarity = ( - max_points - (2 * len(errors) + len(warnings)) + max_points - (weight * len(errors) + len(warnings)) ) / max_points similarity = round(similarity, 2) + print(self.get_name() + ": " + str(similarity)) return similarity return 0.0 @@ -307,9 +303,7 @@ def compute_loose_similarity(self, kg) -> float: # print() print(f"Trying to validate {s} as a(n) {o} resource") - shacl_shape = self.gen_SHACL_from_profile( - # 
o.n3(kg.namespace_manager) - ) + shacl_shape = self.gen_SHACL_from_profile() sub_kg = ConjunctiveGraph() for x, y, z in kg.triples((s, None, None)): diff --git a/templates/profile_reco.html b/templates/profile_reco.html new file mode 100644 index 00000000..e9311717 --- /dev/null +++ b/templates/profile_reco.html @@ -0,0 +1,85 @@ + + +{% extends "layout.html" %} + +{% block nav %} +{% include 'nav.html' %} +{% endblock %} + +{% block body %} + + + +
+
+
+

Recommended profiles

+
+ +

Bioschemas is a community effort aimed at reusing and extending Schema.org for better life
science digital resource findability. Several profiles are defined for each kind of Life Science
resource, specifying minimal, recommended or optional information.

+
+

Here is a list of possibly relevant profiles, based on an analysis of all typed entities described
in your page metadata. The score is computed based on each profile's required and recommended properties.

+ +
+

+

+ {{url}} +
+

+
+ + + + + + + + + + {% for item in results %} + + + + + + {% endfor %} + +
Typed entitiesProfileSimilarity score
{{item["entity"]}} + + {{item["profile_name"]}} + + {{item["score"]}}
+
+
+ +
+ +{% endblock %} \ No newline at end of file