Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updating the extract galaxy tools script to add two more columns: one for reduced EDAM operation and one for reduced EDAM topic #52

Merged
merged 14 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bin/extract_all_tools_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ json_output="results/${1}_tools.json"
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all-tools $output \
--all-tools $tsv_output \
--all-tools-json $json_output \
--planemo-repository-list $1 \
--test
Expand Down
59 changes: 58 additions & 1 deletion bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from github import Github
from github.ContentFile import ContentFile
from github.Repository import Repository
from owlready2 import get_ontology

# Config variables
BIOTOOLS_API_URL = "https://bio.tools"
Expand Down Expand Up @@ -576,6 +577,10 @@ def export_tools_to_tsv(
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])

df["EDAM operation (no superclasses)"] = format_list_column(df["EDAM operation (no superclasses)"])
df["EDAM topic (no superclasses)"] = format_list_column(df["EDAM topic (no superclasses)"])

df["bio.tool ids"] = format_list_column(df["bio.tool ids"])

# the Galaxy tools need to be formatted for the add_instances_to_table to work
Expand Down Expand Up @@ -620,6 +625,48 @@ def filter_tools(
return ts_filtered_tools, filtered_tools


def _has_listed_descendant(cla: Any, candidates: List) -> bool:
    """
    Tell whether any class reachable below `cla` through `subclasses()` is one of `candidates`.

    :cla: ontology class whose subclass graph is explored
    :candidates: ontology classes resolved from the input terms
    """
    # `cla` starts in `seen` so it can never be reported as its own descendant,
    # and the `seen` set keeps the walk finite when a class is reachable via several parents
    seen = {cla}
    stack = list(cla.subclasses())
    while stack:
        sub = stack.pop()
        if sub in seen:
            continue
        seen.add(sub)
        if sub in candidates:
            return True
        stack.extend(sub.subclasses())
    return False


def reduce_ontology_terms(terms: List, ontology: Any) -> List:
    """
    Reduces a list of ontology terms to only those that are not super-classes of one of the other terms.
    In other words, every class that has a (direct or indirect) subclass among the terms is removed.

    Terms for which `ontology.search_one(label=...)` finds no class are dropped from the result.
    The relative order of the remaining terms is preserved.

    :terms: list of term labels from that ontology
    :ontology: ontology object providing `search_one(label=...)`; the classes it returns must
        provide `subclasses()` and a `label` sequence
    :return: list with the first label of each kept class; an empty input is returned unchanged
    """

    # if list is empty do nothing
    if not terms:
        return terms

    # resolve each label to its ontology class, ignoring labels the ontology does not know
    resolved = (ontology.search_one(label=term) for term in terms)
    check_classes = [cla for cla in resolved if cla is not None]

    new_classes = []
    for cla in check_classes:
        try:
            # only keep the class if none of its descendants is also listed;
            # checking direct subclasses alone would let grandparent terms slip through
            if not _has_listed_descendant(cla, check_classes):
                new_classes.append(cla)
        except Exception as e:
            # best effort: a class that cannot be inspected is reported and left out
            print(f"Error processing class {cla}: {e}")

    # convert back to terms
    return [cla.label[0] for cla in new_classes]


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract Galaxy tools from GitHub repositories together with biotools and conda metadata"
Expand Down Expand Up @@ -695,7 +742,7 @@ def filter_tools(
run_test=args.test,
add_extra_repositories=not args.avoid_extra_repositories,
)
# parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file
# parse tools in GitHub repositories to extract metadata, filter by TS categories and export to output file
tools: List[Dict] = []
for r in repo_list:
print("Parsing tools from:", (r))
Expand All @@ -709,6 +756,16 @@ def filter_tools(
f"Error while extracting tools from repo {r}: {e}",
file=sys.stderr,
)

# add additional information to the List[Dict] object
edam_ontology = get_ontology("https://edamontology.org/EDAM_1.25.owl").load()

for tool in tools:
tool["EDAM operation (no superclasses)"] = reduce_ontology_terms(
tool["EDAM operation"], ontology=edam_ontology
)
tool["EDAM topic (no superclasses)"] = reduce_ontology_terms(tool["EDAM topic"], ontology=edam_ontology)

export_tools_to_json(tools, args.all_tools_json)
export_tools_to_tsv(tools, args.all_tools, format_list_col=True, add_usage_stats=True)

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ pyyaml
numpy
Pillow
matplotlib
wordcloud
wordcloud
owlready2
Binary file removed results/microgalaxy/tools_wordcloud.png
Binary file not shown.