Skip to content

Commit

Permalink
Merge list of tools to keep and to exclude in 1 file and add deprecat…
Browse files Browse the repository at this point in the history
…ion status (#102)

* Merge list of tools to keep and to exclude in 1 file and extract ts filtered tools

* Rename keep/exclude to status and add deprecation

* Update bin/extract_galaxy_tools.py

* revert results files

* Fix lint

---------

Co-authored-by: paulzierep <paul.zierep@googlemail.com>
  • Loading branch information
bebatut and paulzierep authored Jun 3, 2024
1 parent 4ef5182 commit 042bcff
Show file tree
Hide file tree
Showing 12 changed files with 1,703 additions and 866 deletions.
17 changes: 11 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,22 @@ The script will generate a TSV file with each tool found in the list of GitHub r

1. Run the extraction as explained before
2. (Optional) Create a text file with ToolShed categories for which tools need to be extracted: 1 ToolShed category per row ([example for microbial data analysis](data/microgalaxy/categories))
3. (Optional) Create a text file with list of tools to exclude: 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_exclude))
4. (Optional) Create a text file with list of tools to really keep (already reviewed): 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_keep))
3. (Optional) Create a TSV (tabular) file with tool status (1 tool suite per row) as 3 columns:
- ToolShed ids of tool suites (one per line)
- Boolean with True to keep and False to exclude
- Boolean with True if deprecated and False if not

[Example for microbial data analysis](data/microgalaxy/tools_to_keep_exclude.tsv)

4. Run the tool extractor script

```
$ python bin/extract_galaxy_tools.py \
$ python bin/extract_galaxy_tools.py filtertools \
--tools <Path to TSV file with all extracted tools> \
--filtered_tools <Path to output CSV file with filtered tools> \
--ts-filtered-tools <Path to output TSV with tools filtered based on ToolShed category> \
--filtered-tools <Path to output TSV with filtered tools based on ToolShed category and manual curation> \
[--categories <Path to ToolShed category file>] \
[--excluded <Path to excluded tool file category file>]\
[--keep <Path to to-keep tool file category file>]
[--status <Path to a TSV file with tool status - 3 columns: ToolShed ids of tool suites, Boolean with True to keep and False to exclude, Boolean with True if deprecated and False if not>]
```

## Development
Expand Down
61 changes: 32 additions & 29 deletions bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,12 +559,11 @@ def export_tools(
:param output_fp: path to output file
:param format_list_col: boolean indicating if list columns should be formatted
"""
df = pd.DataFrame(tools)
df = pd.DataFrame(tools).sort_values("Galaxy wrapper id")
if format_list_col:
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])

df["bio.tool ids"] = format_list_column(df["bio.tool ids"])

# the Galaxy tools need to be formatted for the add_instances_to_table to work
Expand All @@ -580,30 +579,33 @@ def export_tools(
def filter_tools(
tools: List[Dict],
ts_cat: List[str],
excluded_tools: List[str],
keep_tools: List[str],
) -> List[Dict]:
tool_status: Dict,
) -> tuple:
"""
Filter tools for specific ToolShed categories and add information if to keep or to exclude
:param tools: dictionary with tools and their metadata
:param ts_cat: list of ToolShed categories to keep in the extraction
:param excluded_tools: list of tools to skip
:param keep_tools: list of tools to keep
:param tool_status: dictionary with tools and their 2 status: Keep and Deprecated
"""
ts_filtered_tools = []
filtered_tools = []
for tool in tools:
# filter ToolShed categories and leave function if not in expected categories
if check_categories(tool["ToolShed categories"], ts_cat):
name = tool["Galaxy wrapper id"]
tool["Reviewed"] = name in keep_tools or name in excluded_tools
tool["To keep"] = None
if name in keep_tools:
tool["To keep"] = True
elif name in excluded_tools:
tool["To keep"] = False
filtered_tools.append(tool)
return filtered_tools
tool["Reviewed"] = name in tool_status
keep = None
deprecated = None
if name in tool_status:
keep = tool_status[name][1]
deprecated = tool_status[name][2]
tool["Deprecated"] = deprecated
if keep: # only add tools that are manually marked as to keep
filtered_tools.append(tool)
tool["To keep"] = keep
ts_filtered_tools.append(tool)
return ts_filtered_tools, filtered_tools


if __name__ == "__main__":
Expand Down Expand Up @@ -642,30 +644,31 @@ def filter_tools(
filtertools = subparser.add_parser("filtertools", help="Filter tools")
filtertools.add_argument(
"--tools",
"-t",
"-i",
required=True,
help="Filepath to TSV with all extracted tools, generated by extractools command",
)
filtertools.add_argument(
"--ts-filtered-tools",
"-t",
required=True,
help="Filepath to TSV with tools filtered based on ToolShed category",
)
filtertools.add_argument(
"--filtered-tools",
"-f",
required=True,
help="Filepath to TSV with filtered tools",
help="Filepath to TSV with tools filtered based on ToolShed category and manual curation",
)
filtertools.add_argument(
"--categories",
"-c",
help="Path to a file with ToolShed category to keep in the extraction (one per line)",
)
filtertools.add_argument(
"--exclude",
"-e",
help="Path to a file with ToolShed ids of tools to exclude (one per line)",
)
filtertools.add_argument(
"--keep",
"-k",
help="Path to a file with ToolShed ids of tools to keep (one per line)",
"--status",
"-s",
help="Path to a TSV file with tool status - 3 columns: ToolShed ids of tool suites, Boolean with True to keep and False to exclude, Boolean with True if deprecated and False if not",
)
args = parser.parse_args()

Expand Down Expand Up @@ -696,11 +699,11 @@ def filter_tools(
export_tools(tools, args.all_tools, format_list_col=True, add_usage_stats=True)

elif args.command == "filtertools":
tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records")
tools = pd.read_csv(args.tools, sep="\t", keep_default_na=False).to_dict("records")
# get categories and tools to exclude
categories = read_file(args.categories)
excl_tools = read_file(args.exclude)
keep_tools = read_file(args.keep)
status = pd.read_csv(args.status, sep="\t", index_col=0, header=None).to_dict("index")
# filter tool lists
filtered_tools = filter_tools(tools, categories, excl_tools, keep_tools)
ts_filtered_tools, filtered_tools = filter_tools(tools, categories, status)
export_tools(ts_filtered_tools, args.ts_filtered_tools)
export_tools(filtered_tools, args.filtered_tools)
4 changes: 2 additions & 2 deletions bin/get_community_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ for com_data_fp in data/communities/* ; do
python bin/extract_galaxy_tools.py \
filtertools \
--tools "results/all_tools.tsv" \
--ts-filtered-tools "results/$community/tools_filtered_by_ts_categories.tsv" \
--filtered-tools "results/$community/tools.tsv" \
--categories "data/communities/$community/categories" \
--exclude "data/communities/$community/tools_to_exclude" \
--keep "data/communities/$community/tools_to_keep"
--status "data/communities/$community/tool_status.tsv"

python bin/create_interactive_table.py \
--table "results/$community/tools.tsv" \
Expand Down
File renamed without changes.
Empty file.
Loading

0 comments on commit 042bcff

Please sign in to comment.