diff --git a/2.0.0a3/404.html b/2.0.0a3/404.html new file mode 100644 index 00000000..59a14c3b --- /dev/null +++ b/2.0.0a3/404.html @@ -0,0 +1,1414 @@ + + + +
+ + + + + + + + + + + + + + + + +
antismash
+
+
+¶
AntismashBGCLoader
+
+
+¶
+ Bases: BGCLoaderBase
Build a loader for AntiSMASH BGC genbank (.gbk) files.
+ + +AntiSMASH BGC directory must follow the structure below: +
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data_dir |
+
+ str | PathLike
+ |
+
+
+
+ Path to AntiSMASH directory that contains a +collection of AntiSMASH outputs. + |
+ + required + | +
src/nplinker/genomics/antismash/antismash_loader.py
get_bgc_genome_mapping
+
+
+¶Get the mapping from BGC to genome.
+Note that the directory name of the gbk file is treated as genome id.
+ + +Returns:
+Type | +Description | +
---|---|
+ dict[str, str]
+ |
+
+
+
+ The key is BGC name (gbk file name) and value is genome id (the directory name of the + |
+
+ dict[str, str]
+ |
+
+
+
+ gbk file). + |
+
src/nplinker/genomics/antismash/antismash_loader.py
get_files
+
+
+¶
GenomeStatus
+
+
+¶GenomeStatus(
+ original_id: str,
+ resolved_refseq_id: str = "",
+ resolve_attempted: bool = False,
+ bgc_path: str = "",
+)
+
A class to represent the status of a single genome.
+The status of genomes is tracked in a JSON file which has a name defined
+in variable GENOME_STATUS_FILENAME
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
original_id |
+
+ str
+ |
+
+
+
+ The original ID of the genome. + |
+ + required + | +
resolved_refseq_id |
+
+ str
+ |
+
+
+
+ The resolved RefSeq ID of the +genome. Defaults to "". + |
+
+ ''
+ |
+
resolve_attempted |
+
+ bool
+ |
+
+
+
+ A flag indicating whether an +attempt to resolve the RefSeq ID has been made. Defaults to False. + |
+
+ False
+ |
+
bgc_path |
+
+ str
+ |
+
+
+
+ The path to the downloaded BGC file for +the genome. Defaults to "". + |
+
+ ''
+ |
+
src/nplinker/genomics/antismash/podp_antismash_downloader.py
resolved_refseq_id
+
+
+
+ instance-attribute
+
+
+¶resolved_refseq_id = (
+ ""
+ if resolved_refseq_id == "None"
+ else resolved_refseq_id
+)
+
resolve_attempted
+
+
+
+ instance-attribute
+
+
+¶resolve_attempted = resolve_attempted
+
read_json
+
+
+
+ staticmethod
+
+
+¶Get a dict of GenomeStatus objects by loading given genome status file.
+Note that an empty dict is returned if the given file doesn't exist.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to genome status file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, 'GenomeStatus']
+ |
+
+
+
+ Dict keys are genome original id and values are GenomeStatus +objects. An empty dict is returned if the given file doesn't exist. + |
+
src/nplinker/genomics/antismash/podp_antismash_downloader.py
to_json
+
+
+
+ staticmethod
+
+
+¶to_json(
+ genome_status_dict: Mapping[str, "GenomeStatus"],
+ file: str | PathLike | None = None,
+) -> str | None
+
Convert the genome status dictionary to a JSON string.
+If a file path is provided, the JSON string is written to the file. If +the file already exists, it is overwritten.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genome_status_dict |
+
+ Mapping[str, 'GenomeStatus']
+ |
+
+
+
+ A dictionary of genome +status objects. The keys are the original genome IDs and the values +are GenomeStatus objects. + |
+ + required + | +
file |
+
+ str | PathLike | None
+ |
+
+
+
+ The path to the output JSON file. +If None, the JSON string is returned but not written to a file. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ str | None
+ |
+
+
+
+ The JSON string if |
+
src/nplinker/genomics/antismash/podp_antismash_downloader.py
download_and_extract_antismash_data
+
+
+¶download_and_extract_antismash_data(
+ antismash_id: str,
+ download_root: str | PathLike,
+ extract_root: str | PathLike,
+) -> None
+
Download and extract antiSMASH BGC archive for a specified genome.
+The antiSMASH database (https://antismash-db.secondarymetabolites.org/) +is used to download the BGC archive. antiSMASH uses the RefSeq assembly id +of a genome as the id of the archive.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
antismash_id |
+
+ str
+ |
+
+
+
+ The id used to download BGC archive from antiSMASH database. +If the id is versioned (e.g., "GCF_004339725.1") please be sure to +specify the version as well. + |
+ + required + | +
download_root |
+
+ str | PathLike
+ |
+
+
+
+ Path to the directory to place downloaded archive in. + |
+ + required + | +
extract_root |
+
+ str | PathLike
+ |
+
+
+
+ Path to the directory data files will be extracted to.
+Note that an |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ if |
+
Examples:
+>>> download_and_extract_antismash_data("GCF_004339725.1", "/data/download", "/data/extracted")
+
src/nplinker/genomics/antismash/antismash_downloader.py
parse_bgc_genbank
+
+
+¶Parse a single BGC gbk file to BGC object.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to BGC gbk file + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ BGC
+ |
+
+
+
+ BGC object + |
+
Examples:
+>>> bgc = parse_bgc_genbank(
+... "/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk")
+
src/nplinker/genomics/antismash/antismash_loader.py
get_best_available_genome_id
+
+
+¶Get the best available ID from genome_id_data dict.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genome_id_data |
+
+ Mapping[str, str]
+ |
+
+
+
+ dictionary containing information for each genome record present. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ str | None
+ |
+
+
+
+ ID for the genome, if present, otherwise None. + |
+
src/nplinker/genomics/antismash/podp_antismash_downloader.py
podp_download_and_extract_antismash_data
+
+
+¶podp_download_and_extract_antismash_data(
+ genome_records: Sequence[
+ Mapping[str, Mapping[str, str]]
+ ],
+ project_download_root: str | PathLike,
+ project_extract_root: str | PathLike,
+)
+
Download and extract antiSMASH BGC archive for the given genome records.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genome_records |
+
+ Sequence[Mapping[str, Mapping[str, str]]]
+ |
+
+
+
+ list of dicts +representing genome records. The dict of each genome record contains + - key(str): "genome_ID" + - value(dict[str, str]): a dict containing information about genome + type, label and accession ids (RefSeq, GenBank, and/or JGI). + |
+ + required + | +
project_download_root |
+
+ str | PathLike
+ |
+
+
+
+ Path to the directory to place +downloaded archive in. + |
+ + required + | +
project_extract_root |
+
+ str | PathLike
+ |
+
+
+
+ Path to the directory downloaded archive
+will be extracted to.
+Note that an |
+ + required + | +
src/nplinker/genomics/antismash/podp_antismash_downloader.py
127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 |
|
arranger
+
+
+¶
PODP_PROJECT_URL
+
+
+
+ module-attribute
+
+
+¶
DatasetArranger
+
+
+¶Arrange the dataset required by NPLinker.
+This class is used to arrange the datasets required by NPLinker according to the +configuration. The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.
+If self.config.mode
is "local", the datasets are validated.
+If self.config.mode
is "podp", the datasets are downloaded or generated.
Attributes:
+Name | +Type | +Description | +
---|---|---|
config |
+ + | +
+
+
+ A Dynaconf object that contains the configuration settings. Check |
+
root_dir |
+ + | +
+
+
+ The root directory of the datasets. + |
+
downloads_dir |
+ + | +
+
+
+ The directory to store downloaded files. + |
+
mibig_dir |
+ + | +
+
+
+ The directory to store MIBiG metadata. + |
+
gnps_dir |
+ + | +
+
+
+ The directory to store GNPS data. + |
+
antismash_dir |
+ + | +
+
+
+ The directory to store antiSMASH data. + |
+
bigscape_dir |
+ + | +
+
+
+ The directory to store BiG-SCAPE data. + |
+
bigscape_running_output_dir |
+ + | +
+
+
+ The directory to store the running output of BiG-SCAPE. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
config |
+
+ Dynaconf
+ |
+
+
+
+ A Dynaconf object that contains the configuration settings. Check |
+ + required + | +
src/nplinker/arranger.py
downloads_dir
+
+
+
+ instance-attribute
+
+
+¶downloads_dir = root_dir / DOWNLOADS_DIRNAME
+
mibig_dir
+
+
+
+ instance-attribute
+
+
+¶mibig_dir = root_dir / MIBIG_DIRNAME
+
gnps_dir
+
+
+
+ instance-attribute
+
+
+¶gnps_dir = root_dir / GNPS_DIRNAME
+
antismash_dir
+
+
+
+ instance-attribute
+
+
+¶antismash_dir = root_dir / ANTISMASH_DIRNAME
+
bigscape_dir
+
+
+
+ instance-attribute
+
+
+¶bigscape_dir = root_dir / BIGSCAPE_DIRNAME
+
bigscape_running_output_dir
+
+
+
+ instance-attribute
+
+
+¶bigscape_running_output_dir = (
+ bigscape_dir / BIGSCAPE_RUNNING_OUTPUT_DIRNAME
+)
+
arrange
+
+
+¶Arrange the datasets according to the configuration.
+The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.
+ +src/nplinker/arranger.py
arrange_podp_project_json
+
+
+¶Arrange the PODP project JSON file.
+If self.config.mode
is "podp", download the PODP project JSON file if it doesn't exist. Then
+validate the PODP project JSON file if it exists or is downloaded.
The validation is controlled by the json schema schemas/podp_adapted_schema.json
.
src/nplinker/arranger.py
arrange_mibig
+
+
+¶Arrange the MIBiG metadata.
+Always download and extract the MIBiG metadata if self.config.mibig.to_use
is True.
+If the default directory already exists, it will be removed and re-downloaded to ensure
+the latest version is used. So it's not allowed to manually put MIBiG metadata in the
+default directory.
src/nplinker/arranger.py
arrange_gnps
+
+
+¶Arrange the GNPS data.
+If self.config.mode
is "local", validate the GNPS data directory.
+If self.config.mode
is "podp", download the GNPS data if it doesn't exist or remove the
+existing GNPS data and re-download it if it is invalid.
The validation process includes:
+src/nplinker/arranger.py
arrange_antismash
+
+
+¶Arrange the antiSMASH data.
+If self.config.mode
is "local", validate the antiSMASH data directory.
+If self.config.mode
is "podp", download the antiSMASH data if it doesn't exist or remove the
+existing antiSMASH data and re-download it if it is invalid.
The validation process includes: +- Check if the antiSMASH data directory exists. +- Check if the antiSMASH data directory contains at least one sub-directory, and each + sub-directory contains at least one BGC file (with the suffix ".region???.gbk" where ??? + is a number).
+AntiSMASH BGC directory must follow the structure below: +
antismash
+ ├── genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)
+ │ ├── GCF_000514775.1.gbk
+ │ ├── NZ_AZWO01000004.region001.gbk
+ │ └── ...
+ ├── genome_id_2
+ │ ├── ...
+ └── ...
+
src/nplinker/arranger.py
arrange_bigscape
+
+
+¶Arrange the BiG-SCAPE data.
+If self.config.mode
is "local", validate the BiG-SCAPE data directory.
+If self.config.mode
is "podp", run BiG-SCAPE to generate the clustering file if it doesn't
+exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid.
+The running output of BiG-SCAPE will be saved to the directory "bigscape_running_output"
+in the default BiG-SCAPE directory, and the clustering file
+"mix_clustering_c{self.config.bigscape.cutoff}.tsv" will be copied to the default BiG-SCAPE
+directory.
The validation process includes:
+src/nplinker/arranger.py
arrange_strain_mappings
+
+
+¶Arrange the strain mappings file.
+If self.config.mode
is "local", validate the strain mappings file.
+If self.config.mode
is "podp", always generate the strain mappings file and validate it.
The validation checks if the strain mappings file exists and if it is a valid JSON file
+according to the schema defined in schemas/strain_mappings_schema.json
.
src/nplinker/arranger.py
arrange_strains_selected
+
+
+¶Arrange the strains selected file.
+Validate the strains selected file if it exists.
+The validation checks if the strains selected file is a valid JSON file according to the
+schema defined in schemas/user_strains.json
.
src/nplinker/arranger.py
validate_gnps
+
+
+¶Validate the GNPS data directory and its contents.
+The GNPS data directory must contain the following files:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gnps_dir |
+
+ str | PathLike
+ |
+
+
+
+ Path to the GNPS data directory. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ FileNotFoundError
+ |
+
+
+
+ If the GNPS data directory is not found or any of the required files +is not found. + |
+
+ ValueError
+ |
+
+
+
+ If both file_mappings.tsv and file_mapping.csv are found. + |
+
src/nplinker/arranger.py
validate_antismash
+
+
+¶Validate the antiSMASH data directory and its contents.
+The validation only checks the structure of the antiSMASH data directory and file names. +It does not check
+The antiSMASH data directory must exist and contain at least one sub-directory. The name of the +sub-directories must not contain any space. Each sub-directory must contain at least one BGC +file (with the suffix ".region???.gbk" where ??? is the region number).
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
antismash_dir |
+
+ str | PathLike
+ |
+
+
+
+ Path to the antiSMASH data directory. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ FileNotFoundError
+ |
+
+
+
+ If the antiSMASH data directory is not found, or no sub-directories +are found in the antiSMASH data directory, or no BGC files are found in any +sub-directory. + |
+
+ ValueError
+ |
+
+
+
+ If any sub-directory name contains a space. + |
+
src/nplinker/arranger.py
validate_bigscape
+
+
+¶Validate the BiG-SCAPE data directory and its contents.
+The BiG-SCAPE data directory must exist and contain the clustering file +"mix_clustering_c{self.config.bigscape.cutoff}.tsv" where {self.config.bigscape.cutoff} is the +bigscape cutoff value set in the config file.
+Alternatively, the directory can contain the BiG-SCAPE database file generated by BiG-SCAPE v2. +At the moment, all the family assignments in the database will be used, so this database should +contain results from a single run with the desired cutoff.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bigscape_dir |
+
+ str | PathLike
+ |
+
+
+
+ Path to the BiG-SCAPE data directory. + |
+ + required + | +
cutoff |
+
+ str
+ |
+
+
+
+ The BiG-SCAPE cutoff value. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ FileNotFoundError
+ |
+
+
+
+ If the BiG-SCAPE data directory or the clustering file is not found. + |
+
src/nplinker/arranger.py
bigscape
+
+
+¶
BigscapeGCFLoader
+
+
+¶
+ Bases: GCFLoaderBase
Build a loader for BiG-SCAPE GCF cluster file.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
cluster_file |
+
+ str
+ |
+
+
+
+ path to the BiG-SCAPE cluster file. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
cluster_file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the BiG-SCAPE cluster file,
+the filename has a pattern of " |
+ + required + | +
src/nplinker/genomics/bigscape/bigscape_loader.py
cluster_file
+
+
+
+ instance-attribute
+
+
+¶cluster_file: str = str(cluster_file)
+
get_gcfs
+
+
+¶Get all GCF objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_mibig_only |
+
+ bool
+ |
+
+
+
+ True to keep GCFs that contain only MIBiG +BGCs. + |
+
+ False
+ |
+
keep_singleton |
+
+ bool
+ |
+
+
+
+ True to keep singleton GCFs. A singleton GCF +is a GCF that contains only one BGC. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ list[GCF]
+ |
+
+
+
+ A list of GCF objects. + |
+
src/nplinker/genomics/bigscape/bigscape_loader.py
BigscapeV2GCFLoader
+
+
+¶
+ Bases: GCFLoaderBase
Build a loader for BiG-SCAPE v2 database file.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
db_file |
+ + | +
+
+
+ Path to the BiG-SCAPE database file. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
db_file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the BiG-SCAPE v2 database file + |
+ + required + | +
src/nplinker/genomics/bigscape/bigscape_loader.py
get_gcfs
+
+
+¶Get all GCF objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_mibig_only |
+
+ bool
+ |
+
+
+
+ True to keep GCFs that contain only MIBiG +BGCs. + |
+
+ False
+ |
+
keep_singleton |
+
+ bool
+ |
+
+
+
+ True to keep singleton GCFs. A singleton GCF +is a GCF that contains only one BGC. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ list[GCF]
+ |
+
+
+
+ a list of GCF objects. + |
+
src/nplinker/genomics/bigscape/bigscape_loader.py
run_bigscape
+
+
+¶run_bigscape(
+ antismash_path: str | PathLike,
+ output_path: str | PathLike,
+ extra_params: str,
+)
+
src/nplinker/genomics/bigscape/runbigscape.py
run_bigscape
+
+
+¶run_bigscape(
+ antismash_path: str | PathLike,
+ output_path: str | PathLike,
+ extra_params: str,
+)
+
src/nplinker/genomics/bigscape/runbigscape.py
genomics
+
+
+¶
BGC
+
+
+¶Class to model BGC (biosynthetic gene cluster) data.
+BGC data include both annotations and sequence data. This class is +mainly designed to model the annotations or metadata.
+The raw BGC data is stored in GenBank format (.gbk). Additional
+GenBank features
+could be added to the GenBank file to annotate
+BGCs, e.g. antiSMASH has some self-defined features (like region
) in
+its output GenBank files.
The annotations of BGC can be stored in JSON format, which is defined +and used by MIBiG.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
id |
+ + | +
+
+
+ BGC identifier, e.g. MIBiG accession, GenBank accession. + |
+
product_prediction |
+ + | +
+
+
+ A tuple of (predicted) natural
+products or product classes of the BGC.
+For antiSMASH's GenBank data, the feature |
+
mibig_bgc_class |
+
+ tuple[str] | None
+ |
+
+
+
+ A tuple of MIBiG biosynthetic +classes to which the BGC belongs. +Defaults to None. +MIBiG defines 6 major biosynthetic classes for natural products, +including "NRP", "Polyketide", "RiPP", "Terpene", "Saccharide" +and "Alkaloid". Note that natural products created by all other +biosynthetic mechanisms fall under the category "Other". +For more details, see the publication: https://doi.org/10.1186/s40793-018-0318-y. + |
+
description |
+
+ str | None
+ |
+
+
+
+ Brief description of the BGC. +Defaults to None. + |
+
smiles |
+
+ tuple[str] | None
+ |
+
+
+
+ A tuple of SMILES formulas of the BGC's +products. +Defaults to None. + |
+
antismash_file |
+
+ str | None
+ |
+
+
+
+ The path to the antiSMASH GenBank file. +Defaults to None. + |
+
antismash_id |
+
+ str | None
+ |
+
+
+
+ Identifier of the antiSMASH BGC, referring
+to the feature |
+
antismash_region |
+
+ int | None
+ |
+
+
+
+ AntiSMASH BGC region number, referring
+to the feature |
+
parents |
+
+ set[GCF]
+ |
+
+
+
+ The set of GCFs that contain the BGC. + |
+
strain |
+
+ Strain | None
+ |
+
+
+
+ The strain of the BGC. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ BGC identifier, e.g. MIBiG accession, GenBank accession. + |
+ + required + | +
product_prediction |
+
+ str
+ |
+
+
+
+ BGC's (predicted) natural products or product classes. + |
+
+ ()
+ |
+
src/nplinker/genomics/bgc.py
product_prediction
+
+
+
+ instance-attribute
+
+
+¶product_prediction = product_prediction
+
mibig_bgc_class
+
+
+
+ instance-attribute
+
+
+¶
antismash_file
+
+
+
+ instance-attribute
+
+
+¶antismash_file: str | None = None
+
antismash_id
+
+
+
+ instance-attribute
+
+
+¶antismash_id: str | None = None
+
antismash_region
+
+
+
+ instance-attribute
+
+
+¶antismash_region: int | None = None
+
strain
+
+
+
+ property
+ writable
+
+
+¶strain: Strain | None
+
Get the strain of the BGC.
+
bigscape_classes
+
+
+
+ property
+
+
+¶Get BiG-SCAPE's BGC classes.
+BiG-SCAPE's BGC classes are similar to those defined in MIBiG but have +more categories (7 classes). For more details, see: +https://doi.org/10.1038%2Fs41589-019-0400-9.
+
add_parent
+
+
+¶add_parent(gcf: GCF) -> None
+
Add a parent GCF to the BGC.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gcf |
+
+ GCF
+ |
+
+
+
+ gene cluster family + |
+ + required + | +
is_mibig
+
+
+¶is_mibig() -> bool
+
Check if the BGC is a MIBiG reference BGC or not.
+ + +This method evaluates MIBiG BGC based on the pattern that MIBiG +BGC names start with "BGC". It might give false positive results.
+Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if it is a MIBiG reference BGC + |
+
src/nplinker/genomics/bgc.py
GCF
+
+
+¶GCF(id: str)
+
Class to model gene cluster family (GCF).
+A GCF is a group of similar BGCs, generated by clustering BGCs with +tools such as BiG-SCAPE and BiG-SLICE.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
id |
+ + | +
+
+
+ id of the GCF object. + |
+
bgc_ids |
+
+ set[str]
+ |
+
+
+
+ a set of BGC ids that belong to the GCF. + |
+
bigscape_class |
+
+ str | None
+ |
+
+
+
+ BiG-SCAPE's BGC class. +BiG-SCAPE's BGC classes are similar to those defined in MIBiG +but have more categories (7 classes). For more details, see: +https://doi.org/10.1038%2Fs41589-019-0400-9. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ id of the GCF object. + |
+ + required + | +
src/nplinker/genomics/gcf.py
bigscape_class
+
+
+
+ instance-attribute
+
+
+¶bigscape_class: str | None = None
+
strains
+
+
+
+ property
+
+
+¶strains: StrainCollection
+
Get the strains in the GCF.
+
add_bgc
+
+
+¶add_bgc(bgc: BGC) -> None
+
Add a BGC object to the GCF.
+ +src/nplinker/genomics/gcf.py
detach_bgc
+
+
+¶detach_bgc(bgc: BGC) -> None
+
Remove a child BGC object.
+ +src/nplinker/genomics/gcf.py
has_strain
+
+
+¶Check if the given strain exists.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
strain |
+
+ Strain
+ |
+
+
+
+
|
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True when the given strain exists. + |
+
src/nplinker/genomics/gcf.py
has_mibig_only
+
+
+¶has_mibig_only() -> bool
+
Check if the GCF's children are only MIBiG BGCs.
+ + +Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if |
+
src/nplinker/genomics/gcf.py
abc
+
+
+¶
BGCLoaderBase
+
+
+¶
+ Bases: ABC
Abstract base class for BGC loader.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data_dir |
+
+ str | PathLike
+ |
+
+
+
+ Path to directory that contains BGC metadata files +(.json) or full data genbank files (.gbk). + |
+ + required + | +
src/nplinker/genomics/abc.py
get_files
+
+
+
+ abstractmethod
+
+
+¶
GCFLoaderBase
+
+
+¶
+ Bases: ABC
Abstract base class for GCF loader.
+ + + + +
get_gcfs
+
+
+
+ abstractmethod
+
+
+¶Get GCF objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_mibig_only |
+
+ bool
+ |
+
+
+
+ True to keep GCFs that contain only MIBiG +BGCs. + |
+ + required + | +
keep_singleton |
+
+ bool
+ |
+
+
+
+ True to keep singleton GCFs. A singleton GCF +is a GCF that contains only one BGC. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ list[GCF]
+ |
+
+
+
+ A list of GCF objects + |
+
src/nplinker/genomics/abc.py
utils
+
+
+¶
generate_mappings_genome_id_bgc_id
+
+
+¶generate_mappings_genome_id_bgc_id(
+ bgc_dir: str | PathLike,
+ output_file: str | PathLike | None = None,
+) -> None
+
Generate a file that maps genome id to BGC id.
+Note that the output_file
will be overwritten if it already exists.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bgc_dir |
+
+ str | PathLike
+ |
+
+
+
+ The directory has one layer of subfolders and
+each subfolder contains BGC files in |
+ + required + | +
output_file |
+
+ str | PathLike | None
+ |
+
+
+
+ The path to the output file. Note
+that the file will be overwritten if it already exists.
+Defaults to None, in which case the output file will be placed in
+the directory |
+
+ None
+ |
+
src/nplinker/genomics/utils.py
add_strain_to_bgc
+
+
+¶add_strain_to_bgc(
+ strains: StrainCollection, bgcs: Sequence[BGC]
+) -> tuple[list[BGC], list[BGC]]
+
Assign a Strain object to BGC.strain
for input BGCs.
BGC id is used to find the corresponding Strain object. It's possible that +no Strain object is found for a BGC id.
+Note that the input list bgcs
will be changed in place.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
strains |
+
+ StrainCollection
+ |
+
+
+
+ A collection of all strain objects. + |
+ + required + | +
bgcs |
+
+ Sequence[BGC]
+ |
+
+
+
+ A list of BGC objects. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ tuple[list[BGC], list[BGC]]
+ |
+
+
+
+ A tuple of two lists of BGC objects, +
|
+
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ Multiple strain objects found for a BGC id. + |
+
src/nplinker/genomics/utils.py
add_bgc_to_gcf
+
+
+¶add_bgc_to_gcf(
+ bgcs: Sequence[BGC], gcfs: Sequence[GCF]
+) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]
+
Add BGC objects to GCF object based on GCF's BGC ids.
+The attribute of GCF.bgc_ids
contains the ids of BGC objects. These ids
+are used to find BGC objects from the input bgcs
list. The found BGC
+objects are added to the bgcs
attribute of GCF object. It is possible that
+some BGC ids are not found in the input bgcs
list, and so their BGC
+objects are missing in the GCF object.
This method changes the lists bgcs
and gcfs
in place.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
bgcs |
+
+ Sequence[BGC]
+ |
+
+
+
+ A list of BGC objects. + |
+ + required + | +
gcfs |
+
+ Sequence[GCF]
+ |
+
+
+
+ A list of GCF objects. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ tuple[list[GCF], list[GCF], dict[GCF, set[str]]]
+ |
+
+
+
+ A tuple of two lists and a dictionary, +
|
+
src/nplinker/genomics/utils.py
get_mibig_from_gcf
+
+
+¶Get MIBiG BGCs and strains from GCF objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gcfs |
+
+ Sequence[GCF]
+ |
+
+
+
+ A list of GCF objects. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ tuple[list[BGC], StrainCollection]
+ |
+
+
+
+ A tuple of two objects, +
|
+
src/nplinker/genomics/utils.py
extract_mappings_strain_id_original_genome_id
+
+
+¶extract_mappings_strain_id_original_genome_id(
+ podp_project_json_file: str | PathLike,
+) -> dict[str, set[str]]
+
Extract mappings "strain id <-> original genome id".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
podp_project_json_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the PODP project +JSON file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, set[str]]
+ |
+
+
+
+ Key is strain id and value is a set of original genome ids. + |
+
The podp_project_json_file
is the project JSON file downloaded from
+PODP platform. For example, for project MSV000079284, its json file is
+https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.
src/nplinker/genomics/utils.py
extract_mappings_original_genome_id_resolved_genome_id
+
+
+¶extract_mappings_original_genome_id_resolved_genome_id(
+ genome_status_json_file: str | PathLike,
+) -> dict[str, str]
+
Extract mappings "original_genome_id <-> resolved_genome_id".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genome_status_json_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the genome status +JSON file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, str]
+ |
+
+
+
+ Key is original genome id and value is resolved genome id. + |
+
The genome_status_json_file
is usually generated by the
+podp_download_and_extract_antismash_data
function with
+a default file name defined in nplinker.defaults.GENOME_STATUS_FILENAME
.
src/nplinker/genomics/utils.py
extract_mappings_resolved_genome_id_bgc_id
+
+
+¶extract_mappings_resolved_genome_id_bgc_id(
+ genome_bgc_mappings_file: str | PathLike,
+) -> dict[str, set[str]]
+
Extract mappings "resolved_genome_id <-> bgc_id".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
genome_bgc_mappings_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the genome BGC +mappings JSON file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, set[str]]
+ |
+
+
+
+ Key is resolved genome id and value is a set of BGC ids. + |
+
The genome_bgc_mappings_file
is usually generated by the
+generate_mappings_genome_id_bgc_id
function with a default file name
+defined in nplinker.defaults.GENOME_BGC_MAPPINGS_FILENAME
.
src/nplinker/genomics/utils.py
get_mappings_strain_id_bgc_id
+
+
+¶get_mappings_strain_id_bgc_id(
+ mappings_strain_id_original_genome_id: Mapping[
+ str, set[str]
+ ],
+ mappings_original_genome_id_resolved_genome_id: Mapping[
+ str, str
+ ],
+ mappings_resolved_genome_id_bgc_id: Mapping[
+ str, set[str]
+ ],
+) -> dict[str, set[str]]
+
Get mappings "strain_id <-> bgc_id".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
mappings_strain_id_original_genome_id |
+
+ Mapping[str, set[str]]
+ |
+
+
+
+ Mappings +"strain_id <-> original_genome_id". + |
+ + required + | +
mappings_original_genome_id_resolved_genome_id |
+
+ Mapping[str, str]
+ |
+
+
+
+ Mappings +"original_genome_id <-> resolved_genome_id". + |
+ + required + | +
mappings_resolved_genome_id_bgc_id |
+
+ Mapping[str, set[str]]
+ |
+
+
+
+ Mappings +"resolved_genome_id <-> bgc_id". + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, set[str]]
+ |
+
+
+
+ Key is strain id and value is a set of BGC ids. + |
+
extract_mappings_strain_id_original_genome_id
: Extract mappings
+ "strain_id <-> original_genome_id".extract_mappings_original_genome_id_resolved_genome_id
: Extract mappings
+ "original_genome_id <-> resolved_genome_id".extract_mappings_resolved_genome_id_bgc_id
: Extract mappings
+ "resolved_genome_id <-> bgc_id".src/nplinker/genomics/utils.py
gnps
+
+
+¶
GNPSFormat
+
+
+¶
+ Bases: Enum
Enum class for GNPS format (workflow).
+The GNPS format refers to the GNPS workflow. The name of the enum is a +simple short name for the workflow, and the value of the enum is the actual +name of the workflow in the GNPS website.
+ + + + +
SNETS
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
SNETSV2
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
FBMN
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
Unknown
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
GNPSDownloader
+
+
+¶Download GNPS zip archive for the given task id.
+Note that only GNPS workflows listed in the GNPSFormat enum are supported.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
GNPS_DATA_DOWNLOAD_URL |
+
+ str
+ |
+
+
+
+ URL template for downloading GNPS data. + |
+
GNPS_DATA_DOWNLOAD_URL_FBMN |
+
+ str
+ |
+
+
+
+ URL template for downloading GNPS data for FBMN. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
task_id |
+
+ str
+ |
+
+
+
+ GNPS task id, identifying the data to be downloaded. + |
+ + required + | +
download_root |
+
+ str | PathLike
+ |
+
+
+
+ Path where to store the downloaded archive. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ If the given task id does not correspond to a supported +GNPS workflow. + |
+
Examples:
+ + +src/nplinker/metabolomics/gnps/gnps_downloader.py
GNPS_DATA_DOWNLOAD_URL
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶GNPS_DATA_DOWNLOAD_URL: str = (
+ "https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra"
+)
+
GNPS_DATA_DOWNLOAD_URL_FBMN
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶GNPS_DATA_DOWNLOAD_URL_FBMN: str = (
+ "https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_cytoscape_data"
+)
+
gnps_format
+
+
+
+ property
+
+
+¶gnps_format: GNPSFormat
+
Get the GNPS workflow type.
+ + +Returns:
+Type | +Description | +
---|---|
+ GNPSFormat
+ |
+
+
+
+ GNPS workflow type. + |
+
download
+
+
+¶Execute the downloading process.
+Note: GNPS data is downloaded using the POST method (empty payload is OK).
+ +src/nplinker/metabolomics/gnps/gnps_downloader.py
get_download_file
+
+
+¶get_download_file() -> str
+
Get the path to the zip file.
+ + +Returns:
+Type | +Description | +
---|---|
+ str
+ |
+
+
+
+ Download path as string + |
+
get_task_id
+
+
+¶get_task_id() -> str
+
Get the GNPS task id.
+ + +Returns:
+Type | +Description | +
---|---|
+ str
+ |
+
+
+
+ Task id as string. + |
+
get_url
+
+
+¶get_url() -> str
+
Get the full URL linking to GNPS data to be downloaded.
+ + +Returns:
+Type | +Description | +
---|---|
+ str
+ |
+
+
+
+ URL pointing to the GNPS data to be downloaded. + |
+
src/nplinker/metabolomics/gnps/gnps_downloader.py
GNPSExtractor
+
+
+¶Class to extract files from a GNPS molecular networking archive (.zip).
+Four files are extracted and renamed to the following names:
+The files to be extracted are selected based on the GNPS workflow type, +as described below (in the order of the files above):
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the GNPS zip file. + |
+ + required + | +
extract_dir |
+
+ str | PathLike
+ |
+
+
+
+ Path to the directory where the files are extracted to. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ If the given file is an invalid GNPS archive. + |
+
Examples:
+>>> gnps_extractor = GNPSExtractor("path/to/gnps_archive.zip", "path/to/extract_dir")
+>>> gnps_extractor.gnps_format
+<GNPSFormat.SNETS: 'METABOLOMICS-SNETS'>
+>>> gnps_extractor.extract_dir
+'path/to/extract_dir'
+
src/nplinker/metabolomics/gnps/gnps_extractor.py
gnps_format
+
+
+
+ property
+
+
+¶gnps_format: GNPSFormat
+
Get the GNPS workflow type.
+ + +Returns:
+Type | +Description | +
---|---|
+ GNPSFormat
+ |
+
+
+
+ GNPS workflow type. + |
+
GNPSSpectrumLoader
+
+
+¶
+ Bases: SpectrumLoaderBase
Class to load mass spectra from the given GNPS MGF file.
+The MGF file is from the GNPS output archive, as described below +for each GNPS workflow type:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ path to the MGF file. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ Raises ValueError if the file is not valid. + |
+
Examples:
+ + +src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py
GNPSMolecularFamilyLoader
+
+
+¶
+ Bases: MolecularFamilyLoaderBase
Class to load molecular families from GNPS output file.
+The molecular family file is from GNPS output archive, as described below +for each GNPS workflow type:
+The "ComponentIndex" column in the GNPS molecular family's file is treated
+as family id. But for molecular families that have only one member (i.e. spectrum),
+named singleton molecular families, their files have the same value of
+"-1" in the "ComponentIndex" column. To make the family id unique, the
+spectrum id plus a prefix singleton-
is used as the family id of
+singleton molecular families.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the GNPS molecular family file. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ Raises ValueError if the file is not valid. + |
+
Examples:
+>>> loader = GNPSMolecularFamilyLoader("gnps_molecular_families.tsv")
+>>> print(loader.families)
+[<MolecularFamily 1>, <MolecularFamily 2>, ...]
+>>> print(loader.families[0].spectra_ids)
+{'1', '3', '7', ...}
+
src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
get_mfs
+
+
+¶get_mfs(
+ keep_singleton: bool = False,
+) -> list[MolecularFamily]
+
Get MolecularFamily objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_singleton |
+
+ bool
+ |
+
+
+
+ True to keep singleton molecular families. A +singleton molecular family is a molecular family that contains +only one spectrum. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ list[MolecularFamily]
+ |
+
+
+
+ A list of MolecularFamily objects with their spectra ids. + |
+
src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
GNPSAnnotationLoader
+
+
+¶
+ Bases: AnnotationLoaderBase
Load annotations from GNPS output file.
+The annotation file is a .tsv file from GNPS output archive, as described +below for each GNPS workflow type:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ The GNPS annotation file. + |
+ + required + | +
Examples:
+>>> loader = GNPSAnnotationLoader("gnps_annotations.tsv")
+>>> print(loader.annotations["100"])
+{'#Scan#': '100',
+'Adduct': 'M+H',
+'CAS_Number': 'N/A',
+'Charge': '1',
+'Compound_Name': 'MLS002153841-01!Iobenguane sulfate',
+'Compound_Source': 'NIH Pharmacologically Active Library',
+'Data_Collector': 'VP/LMS',
+'ExactMass': '274.992',
+'INCHI': 'N/A',
+'INCHI_AUX': 'N/A',
+'Instrument': 'qTof',
+'IonMode': 'Positive',
+'Ion_Source': 'LC-ESI',
+'LibMZ': '276.003',
+'LibraryName': 'lib-00014.mgf',
+'LibraryQualityString': 'Gold',
+'Library_Class': '1',
+'MQScore': '0.704152',
+'MZErrorPPM': '405416',
+'MassDiff': '111.896',
+'Organism': 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE',
+'PI': 'Dorrestein',
+'Precursor_MZ': '276.003',
+'Pubmed_ID': 'N/A',
+'RT_Query': '795.979',
+'SharedPeaks': '7',
+'Smiles': 'NC(=N)NCc1cccc(I)c1.OS(=O)(=O)O',
+'SpecCharge': '1',
+'SpecMZ': '164.107',
+'SpectrumFile': 'spectra/specs_ms.pklbin',
+'SpectrumID': 'CCMSLIB00000086167',
+'TIC_Query': '986.997',
+'UpdateWorkflowName': 'UPDATE-SINGLE-ANNOTATED-GOLD',
+'tags': ' ',
+'png_url': 'https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
+'json_url': 'https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
+'svg_url': 'https://metabolomics-usi.gnps2.org/svg/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
+'spectrum_url': 'https://metabolomics-usi.gnps2.org/spectrum/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167'}
+
src/nplinker/metabolomics/gnps/gnps_annotation_loader.py
annotations
+
+
+
+ property
+
+
+¶
GNPSFileMappingLoader
+
+
+¶
+ Bases: FileMappingLoaderBase
Class to load file mappings from GNPS output file.
+File mappings refer to the mapping from spectrum id to the files in which +this spectrum occurs.
+The file mappings file is from GNPS output archive, as described below +for each GNPS workflow type:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the GNPS file mappings file. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ Raises ValueError if the file is not valid. + |
+
Examples:
+>>> loader = GNPSFileMappingLoader("gnps_file_mappings.tsv")
+>>> print(loader.mappings["1"])
+['26c.mzXML']
+>>> print(loader.mapping_reversed["26c.mzXML"])
+{'1', '3', '7', ...}
+
src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py
gnps_format_from_archive
+
+
+¶gnps_format_from_archive(
+ zip_file: str | PathLike,
+) -> GNPSFormat
+
Detect GNPS format from a downloaded GNPS zip archive.
+The detection is based on the filename of the zip file and the names of the +files contained in the zip file.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
zip_file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the downloaded GNPS zip file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ GNPSFormat
+ |
+
+
+
+ The format identified in the GNPS zip file. + |
+
Examples:
+>>> gnps_format_from_archive("downloads/ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip") == GNPSFormat.SNETS
+>>> gnps_format_from_archive("downloads/ProteoSAFe-METABOLOMICS-SNETS-V2-189e8bf1-download_clustered_spectra.zip") == GNPSFormat.SNETSV2
+>>> gnps_format_from_archive("downloads/ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-672d0a53-download_cytoscape_data.zip") == GNPSFormat.FBMN
+
src/nplinker/metabolomics/gnps/gnps_format.py
gnps_format_from_file_mapping
+
+
+¶gnps_format_from_file_mapping(
+ file: str | PathLike,
+) -> GNPSFormat
+
Detect GNPS format from the given file mapping file.
+The GNPS file mapping file is located in different folders depending on the +GNPS workflow. Here are the locations in corresponding GNPS zip archives:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the file to peek the format for. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ GNPSFormat
+ |
+
+
+
+ GNPS format identified in the file. + |
+
src/nplinker/metabolomics/gnps/gnps_format.py
gnps_format_from_task_id
+
+
+¶gnps_format_from_task_id(task_id: str) -> GNPSFormat
+
Detect GNPS format for the given task id.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
task_id |
+
+ str
+ |
+
+
+
+ GNPS task id. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ GNPSFormat
+ |
+
+
+
+ The format identified in the GNPS task. + |
+
Examples:
+>>> gnps_format_from_task_id("c22f44b14a3d450eb836d607cb9521bb") == GNPSFormat.SNETS
+>>> gnps_format_from_task_id("189e8bf16af145758b0a900f1c44ff4a") == GNPSFormat.SNETSV2
+>>> gnps_format_from_task_id("92036537c21b44c29e509291e53f6382") == GNPSFormat.FBMN
+>>> gnps_format_from_task_id("0ad6535e34d449788f297e712f43068a") == GNPSFormat.Unknown
+
src/nplinker/metabolomics/gnps/gnps_format.py
loader
+
+
+¶
DatasetLoader
+
+
+¶Class to load all data.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
config |
+ + | +
+
+
+ A Dynaconf object that contains the configuration settings. Check the
+ |
+
bgcs |
+
+ list[BGC]
+ |
+
+
+
+ A list of BGC objects. + |
+
gcfs |
+
+ list[GCF]
+ |
+
+
+
+ A list of GCF objects. + |
+
spectra |
+
+ list[Spectrum]
+ |
+
+
+
+ A list of Spectrum objects. + |
+
mfs |
+
+ list[MolecularFamily]
+ |
+
+
+
+ A list of MolecularFamily objects. + |
+
mibig_bgcs |
+
+ list[BGC]
+ |
+
+
+
+ A list of MIBiG BGC objects. + |
+
mibig_strains_in_use |
+
+ StrainCollection
+ |
+
+
+
+ A StrainCollection object that contains the strains in use from MIBiG. + |
+
product_types |
+
+ list
+ |
+
+
+
+ A list of product types. + |
+
strains |
+
+ StrainCollection
+ |
+
+
+
+ A StrainCollection object that contains all strains. + |
+
class_matches |
+ + | +
+
+
+ A ClassMatches object that contains class match info. + |
+
chem_classes |
+ + | +
+
+
+ A ChemClassPredictions object that contains chemical class predictions. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
config |
+
+ Dynaconf
+ |
+
+
+
+ A Dynaconf object that contains the configuration settings. Check the
+ |
+ + required + | +
src/nplinker/loader.py
RUN_CANOPUS_DEFAULT
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
EXTRA_CANOPUS_PARAMS_DEFAULT
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
OR_CANOPUS
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
OR_MOLNETENHANCER
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
mibig_strains_in_use
+
+
+
+ instance-attribute
+
+
+¶mibig_strains_in_use: StrainCollection = StrainCollection()
+
strains
+
+
+
+ instance-attribute
+
+
+¶strains: StrainCollection = StrainCollection()
+
load
+
+
+¶Load all data.
+ +src/nplinker/loader.py
metabolomics
+
+
+¶
MolecularFamily
+
+
+¶MolecularFamily(id: str)
+
Class to model molecular family.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
id |
+
+ str
+ |
+
+
+
+ Unique id for the molecular family. + |
+
spectra_ids |
+
+ set[str]
+ |
+
+
+
+ Set of spectrum ids in the molecular family. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ Unique id for the molecular family. + |
+ + required + | +
src/nplinker/metabolomics/molecular_family.py
spectra
+
+
+
+ property
+
+
+¶Get Spectrum objects in the molecular family.
+
strains
+
+
+
+ property
+
+
+¶strains: StrainCollection
+
Get strains in the molecular family.
+
add_spectrum
+
+
+¶add_spectrum(spectrum: Spectrum) -> None
+
Add a Spectrum object to the molecular family.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
spectrum |
+
+ Spectrum
+ |
+
+
+
+
|
+ + required + | +
src/nplinker/metabolomics/molecular_family.py
detach_spectrum
+
+
+¶detach_spectrum(spectrum: Spectrum) -> None
+
Remove a Spectrum object from the molecular family.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
spectrum |
+
+ Spectrum
+ |
+
+
+
+
|
+ + required + | +
src/nplinker/metabolomics/molecular_family.py
has_strain
+
+
+¶Check if the given strain exists.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
strain |
+
+ Strain
+ |
+
+
+
+
|
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True when the given strain exists. + |
+
src/nplinker/metabolomics/molecular_family.py
is_singleton
+
+
+¶is_singleton() -> bool
+
Check if the molecular family contains only one spectrum.
+ + +Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True when |
+
src/nplinker/metabolomics/molecular_family.py
Spectrum
+
+
+¶Spectrum(
+ id: str,
+ mz: list[float],
+ intensity: list[float],
+ precursor_mz: float,
+ rt: float = 0,
+ metadata: dict | None = None,
+)
+
Class to model MS/MS Spectrum.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
id |
+ + | +
+
+
+ the spectrum ID. + |
+
mz |
+ + | +
+
+
+ the list of m/z values. + |
+
intensity |
+ + | +
+
+
+ the list of intensity values. + |
+
precursor_mz |
+ + | +
+
+
+ the m/z value of the precursor. + |
+
rt |
+ + | +
+
+
+ the retention time in seconds. + |
+
metadata |
+ + | +
+
+
+ the metadata of the spectrum, i.e. the header information in the MGF +file. + |
+
gnps_annotations |
+
+ dict
+ |
+
+
+
+ the GNPS annotations of the spectrum. + |
+
gnps_id |
+
+ str | None
+ |
+
+
+
+ the GNPS ID of the spectrum. + |
+
strains |
+
+ StrainCollection
+ |
+
+
+
+ the strains that this spectrum belongs to. + |
+
family |
+
+ MolecularFamily | None
+ |
+
+
+
+ the molecular family that this spectrum belongs to. + |
+
peaks |
+
+ ndarray
+ |
+
+
+
+ 2D array of peaks, each row is a peak of (m/z, intensity) values. + |
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ the spectrum ID. + |
+ + required + | +
mz |
+
+ list[float]
+ |
+
+
+
+ the list of m/z values. + |
+ + required + | +
intensity |
+
+ list[float]
+ |
+
+
+
+ the list of intensity values. + |
+ + required + | +
precursor_mz |
+
+ float
+ |
+
+
+
+ the precursor m/z. + |
+ + required + | +
rt |
+
+ float
+ |
+
+
+
+ the retention time in seconds. Defaults to 0. + |
+
+ 0
+ |
+
metadata |
+
+ dict | None
+ |
+
+
+
+ the metadata of the spectrum, i.e. the header information +in the MGF file. + |
+
+ None
+ |
+
src/nplinker/metabolomics/spectrum.py
gnps_annotations
+
+
+
+ instance-attribute
+
+
+¶gnps_annotations: dict = {}
+
strains
+
+
+
+ instance-attribute
+
+
+¶strains: StrainCollection = StrainCollection()
+
family
+
+
+
+ instance-attribute
+
+
+¶family: MolecularFamily | None = None
+
peaks
+
+
+
+ cached
+ property
+
+
+¶peaks: ndarray
+
Get the peaks, a 2D array with each row containing the values of (m/z, intensity).
+
has_strain
+
+
+¶Check if the given strain exists in the spectrum.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
strain |
+
+ Strain
+ |
+
+
+
+
|
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True when the given strain exists in the spectrum. + |
+
src/nplinker/metabolomics/spectrum.py
abc
+
+
+¶
SpectrumLoaderBase
+
+
+¶
+ Bases: ABC
Abstract base class for SpectrumLoader.
+ + + + + + +
MolecularFamilyLoaderBase
+
+
+¶
+ Bases: ABC
Abstract base class for MolecularFamilyLoader.
+ + + + +
get_mfs
+
+
+
+ abstractmethod
+
+
+¶get_mfs(keep_singleton: bool) -> list[MolecularFamily]
+
Get MolecularFamily objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
keep_singleton |
+
+ bool
+ |
+
+
+
+ True to keep singleton molecular families. A +singleton molecular family is a molecular family that contains +only one spectrum. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ list[MolecularFamily]
+ |
+
+
+
+ A sequence of MolecularFamily objects. + |
+
src/nplinker/metabolomics/abc.py
FileMappingLoaderBase
+
+
+¶
+ Bases: ABC
Abstract base class for FileMappingLoader.
+ + + + +
utils
+
+
+¶
add_annotation_to_spectrum
+
+
+¶add_annotation_to_spectrum(
+ annotations: Mapping[str, dict],
+ spectra: Sequence[Spectrum],
+) -> None
+
Add GNPS annotations to the Spectrum.gnps_annotations
attribute for input spectra.
It is possible that some spectra don't have annotations.
+Note that the input spectra
list is changed in place.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
annotations |
+
+ Mapping[str, dict]
+ |
+
+
+
+ A dictionary of GNPS annotations, where the keys are +spectrum ids and the values are GNPS annotations. + |
+ + required + | +
spectra |
+
+ Sequence[Spectrum]
+ |
+
+
+
+ A list of Spectrum objects. + |
+ + required + | +
src/nplinker/metabolomics/utils.py
add_strains_to_spectrum
+
+
+¶add_strains_to_spectrum(
+ strains: StrainCollection, spectra: Sequence[Spectrum]
+) -> tuple[list[Spectrum], list[Spectrum]]
+
Add Strain
objects to the Spectrum.strains
attribute for input spectra.
Note that the input spectra
list is changed in place.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
strains |
+
+ StrainCollection
+ |
+
+
+
+ A collection of strain objects. + |
+ + required + | +
spectra |
+
+ Sequence[Spectrum]
+ |
+
+
+
+ A list of Spectrum objects. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ tuple[list[Spectrum], list[Spectrum]]
+ |
+
+
+
+ A tuple of two lists of Spectrum objects, +
|
+
src/nplinker/metabolomics/utils.py
add_spectrum_to_mf
+
+
+¶add_spectrum_to_mf(
+ spectra: Sequence[Spectrum],
+ mfs: Sequence[MolecularFamily],
+) -> tuple[
+ list[MolecularFamily],
+ list[MolecularFamily],
+ dict[MolecularFamily, set[str]],
+]
+
Add Spectrum objects to MolecularFamily objects.
+The attribute of spectra_ids
of MolecularFamily object contains the ids of Spectrum objects.
+These ids are used to find Spectrum objects from the input spectra
list. The found Spectrum
+objects are added to the spectra
attribute of MolecularFamily object. It is possible that
+some spectrum ids are not found in the input spectra
list, and so their Spectrum objects are
+missing in the MolecularFamily object.
Note that the input mfs
list is changed in place.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
spectra |
+
+ Sequence[Spectrum]
+ |
+
+
+
+ A list of Spectrum objects. + |
+ + required + | +
mfs |
+
+ Sequence[MolecularFamily]
+ |
+
+
+
+ A list of MolecularFamily objects. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ tuple[list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]]
+ |
+
+
+
+ A tuple of three elements, +
|
+
src/nplinker/metabolomics/utils.py
extract_mappings_strain_id_ms_filename
+
+
+¶extract_mappings_strain_id_ms_filename(
+ podp_project_json_file: str | PathLike,
+) -> dict[str, set[str]]
+
Extract mappings "strain_id <-> MS_filename".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
podp_project_json_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the PODP project +JSON file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, set[str]]
+ |
+
+
+
+ Key is strain id and value is a set of MS filenames. + |
+
The podp_project_json_file
is the project JSON file downloaded from
+PODP platform. For example, for project MSV000079284, its json file is
+https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.
src/nplinker/metabolomics/utils.py
extract_mappings_ms_filename_spectrum_id
+
+
+¶extract_mappings_ms_filename_spectrum_id(
+ gnps_file_mappings_file: str | PathLike,
+) -> dict[str, set[str]]
+
Extract mappings "MS_filename <-> spectrum_id".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
gnps_file_mappings_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the GNPS file mappings file (csv or +tsv). + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, set[str]]
+ |
+
+
+
+ Key is MS filename and value is a set of spectrum ids. + |
+
The gnps_file_mappings_file
is generated by GNPS molecular networking. It's downloaded
+from GNPS website to a file with a default name defined in GNPS_FILE_MAPPINGS_FILENAME
.
GNPSFileMappingLoader: A class to load GNPS file mappings file.
+src/nplinker/metabolomics/utils.py
get_mappings_strain_id_spectrum_id
+
+
+¶get_mappings_strain_id_spectrum_id(
+ mappings_strain_id_ms_filename: Mapping[str, set[str]],
+ mappings_ms_filename_spectrum_id: Mapping[
+ str, set[str]
+ ],
+) -> dict[str, set[str]]
+
Get mappings "strain_id <-> spectrum_id".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
mappings_strain_id_ms_filename |
+
+ Mapping[str, set[str]]
+ |
+
+
+
+ Mappings +"strain_id <-> MS_filename". + |
+ + required + | +
mappings_ms_filename_spectrum_id |
+
+ Mapping[str, set[str]]
+ |
+
+
+
+ Mappings +"MS_filename <-> spectrum_id". + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, set[str]]
+ |
+
+
+
+ Key is strain id and value is a set of spectrum ids. + |
+
extract_mappings_strain_id_ms_filename
: Extract mappings
+ "strain_id <-> MS_filename".
+extract_mappings_ms_filename_spectrum_id
: Extract mappings
+ "MS_filename <-> spectrum_id".
src/nplinker/metabolomics/utils.py
mibig
+
+
+¶
MibigLoader
+
+
+¶
+ Bases: BGCLoaderBase
Parse MIBiG metadata files and return BGC objects.
+MIBiG metadata file (json) contains annotations/metadata information +for each BGC. See https://mibig.secondarymetabolites.org/download.
+The MIBiG accession is used as BGC id and strain name. The loaded BGC
+objects have Strain object as their strain attribute (i.e. BGC.strain
).
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data_dir |
+
+ str | PathLike
+ |
+
+
+
+ Path to the directory of MIBiG metadata json files + |
+ + required + | +
src/nplinker/genomics/mibig/mibig_loader.py
get_files
+
+
+¶Get the path of all MIBiG metadata json files.
+ + +Returns:
+Type | +Description | +
---|---|
+ dict[str, str]
+ |
+
+
+
+ The key is metadata file name (BGC accession), and the value is path to the metadata + |
+
+ dict[str, str]
+ |
+
+
+
+ json file + |
+
src/nplinker/genomics/mibig/mibig_loader.py
parse_data_dir
+
+
+
+ staticmethod
+
+
+¶Parse metadata directory and return paths to all metadata json files.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data_dir |
+
+ str | PathLike
+ |
+
+
+
+ path to the directory of MIBiG metadata json files + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ dict[str, str]
+ |
+
+
+
+ The key is metadata file name (BGC accession), and the value is path to the metadata + |
+
+ dict[str, str]
+ |
+
+
+
+ json file + |
+
src/nplinker/genomics/mibig/mibig_loader.py
get_metadata
+
+
+¶get_metadata() -> dict[str, MibigMetadata]
+
Get MibigMetadata objects.
+ + +Returns:
+Type | +Description | +
---|---|
+ dict[str, MibigMetadata]
+ |
+
+
+
+ The key is BGC accession (file name) and the value is MibigMetadata object + |
+
src/nplinker/genomics/mibig/mibig_loader.py
get_bgcs
+
+
+¶Get BGC objects.
+The BGC objects use MIBiG accession as id and have Strain object as
+their strain attribute (i.e. BGC.strain
), where the name of the Strain
+object is also MIBiG accession.
Returns:
+Type | +Description | +
---|---|
+ list[BGC]
+ |
+
+
+
+ A list of BGC objects + |
+
src/nplinker/genomics/mibig/mibig_loader.py
MibigMetadata
+
+
+¶Class to model the BGC metadata/annotations defined in MIBiG.
+MIBiG is a specification of BGC metadata and uses JSON schema to +represent BGC metadata. For more details, see: +https://mibig.secondarymetabolites.org/download.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the json file of MIBiG BGC metadata + |
+ + required + | +
Examples:
+ + +src/nplinker/genomics/mibig/mibig_metadata.py
mibig_accession
+
+
+
+ property
+
+
+¶mibig_accession: str
+
Get the value of metadata item 'mibig_accession'.
+
biosyn_class
+
+
+
+ property
+
+
+¶Get the value of metadata item 'biosyn_class'.
+The 'biosyn_class' is biosynthetic class(es), namely the type of +natural product or secondary metabolite.
+MIBiG defines 6 major biosynthetic classes, including +"NRP", "Polyketide", "RiPP", "Terpene", "Saccharide" and "Alkaloid". +Note that natural products created by all other biosynthetic +mechanisms fall under the category "Other". For more details, see +the publication: https://doi.org/10.1186/s40793-018-0318-y.
+
download_and_extract_mibig_metadata
+
+
+¶download_and_extract_mibig_metadata(
+ download_root: str | PathLike,
+ extract_path: str | PathLike,
+ version: str = "3.1",
+)
+
Download and extract MIBiG metadata json files.
+Note that it does not matter whether the metadata json files are in nested folders or not in the archive,
+all json files will be extracted to the same location, i.e. extract_path
. The nested
+folders will be removed if they exist. So the extract_path
will have only json files.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
download_root |
+
+ str | PathLike
+ |
+
+
+
+ Path to the directory in which to place the downloaded archive. + |
+ + required + | +
extract_path |
+
+ str | PathLike
+ |
+
+
+
+ Path to an empty directory where the json files will be extracted. +The directory must be empty if it exists. If it doesn't exist, the directory will be created. + |
+ + required + | +
version |
+
+ str
+ |
+
+
+
+ The MIBiG version to download. Defaults to "3.1". + |
+
+ '3.1'
+ |
+
Examples:
+ + +src/nplinker/genomics/mibig/mibig_downloader.py
parse_bgc_metadata_json
+
+
+¶Parse MIBiG metadata file and return BGC object.
+Note that the MIBiG accession is used as the BGC id and strain name. The BGC +object has a Strain object as its strain attribute.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the MIBiG metadata json file + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ BGC
+ |
+
+
+
+ BGC object + |
+
src/nplinker/genomics/mibig/mibig_loader.py
nplinker
+
+
+¶
NPLinker
+
+
+¶Main class for the NPLinker application.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
config |
+ + | +
+
+
+ The configuration object for the current NPLinker application. + |
+
root_dir |
+
+ str
+ |
+
+
+
+ The path to the root directory of the current NPLinker application. + |
+
output_dir |
+
+ str
+ |
+
+
+
+ The path to the output directory of the current NPLinker application. + |
+
bgcs |
+
+ list[BGC]
+ |
+
+
+
+ A list of all BGC objects. + |
+
gcfs |
+
+ list[GCF]
+ |
+
+
+
+ A list of all GCF objects. + |
+
spectra |
+
+ list[Spectrum]
+ |
+
+
+
+ A list of all Spectrum objects. + |
+
mfs |
+
+ list[MolecularFamily]
+ |
+
+
+
+ A list of all MolecularFamily objects. + |
+
mibig_bgcs |
+
+ list[BGC]
+ |
+
+
+
+ A list of all MIBiG BGC objects. + |
+
strains |
+
+ StrainCollection
+ |
+
+
+
+ A StrainCollection object containing all Strain objects. + |
+
product_types |
+
+ list[str]
+ |
+
+
+
+ A list of all BiGSCAPE product types. + |
+
scoring_methods |
+
+ list[str]
+ |
+
+
+
+ A list of all valid scoring methods. + |
+
Examples:
+To start an NPLinker application:
+ +To load all data into memory:
+ +To check the number of GCF objects:
+ +To get the links for all GCF objects using the Metcalf scoring method, the result is a +LinkGraph object:
+ +To get the link data between two objects:
+>>> link_data = lg.get_link_data(npl.gcfs[0], npl.spectra[0])
+{"metcalf": Score("metcalf", 1.0, {"cutoff": 0, "standardised": False})}
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
config_file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the configuration file to use. + |
+ + required + | +
src/nplinker/nplinker.py
root_dir
+
+
+
+ property
+
+
+¶root_dir: str
+
Get the path to the root directory of the current NPLinker instance.
+
output_dir
+
+
+
+ property
+
+
+¶output_dir: str
+
Get the path to the output directory of the current NPLinker instance.
+
spectra
+
+
+
+ property
+
+
+¶Get all Spectrum objects.
+
mfs
+
+
+
+ property
+
+
+¶mfs: list[MolecularFamily]
+
Get all MolecularFamily objects.
+
mibig_bgcs
+
+
+
+ property
+
+
+¶Get all MIBiG BGC objects.
+
strains
+
+
+
+ property
+
+
+¶strains: StrainCollection
+
Get all Strain objects.
+
product_types
+
+
+
+ property
+
+
+¶Get all BiGSCAPE product types.
+
chem_classes
+
+
+
+ property
+
+
+¶Returns loaded ChemClassPredictions with the class predictions.
+
class_matches
+
+
+
+ property
+
+
+¶ClassMatches with the matched classes and scoring tables from MIBiG.
+
scoring_methods
+
+
+
+ property
+
+
+¶Get names of all valid scoring methods.
+
load_data
+
+
+¶Load all data from local files into memory.
+This method is a convenience function that calls the DatasetArranger
and DatasetLoader
+classes to load all data from the local filesystem into memory. The loaded data is then
+stored in various private data containers for easy access.
src/nplinker/nplinker.py
get_links
+
+
+¶get_links(
+ objects: (
+ Sequence[BGC]
+ | Sequence[GCF]
+ | Sequence[Spectrum]
+ | Sequence[MolecularFamily]
+ ),
+ scoring_method: str,
+ **scoring_params: Any
+) -> LinkGraph
+
Get the links for the given objects using the specified scoring method and parameters.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
objects |
+
+ Sequence[BGC] | Sequence[GCF] | Sequence[Spectrum] | Sequence[MolecularFamily]
+ |
+
+
+
+ A sequence of objects to get the links for. The objects must be of the same
+type, i.e. |
+ + required + | +
scoring_method |
+
+ str
+ |
+
+
+
+ The scoring method to use. Must be one of the valid scoring methods
+ |
+ + required + | +
scoring_params |
+
+ Any
+ |
+
+
+
+ Parameters to pass to the scoring method. If not provided, the default +parameters for the scoring method will be used. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ LinkGraph
+ |
+
+
+
+ A LinkGraph object containing the links for the given objects. + |
+
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ If input objects are empty or if the scoring method is invalid. + |
+
+ TypeError
+ |
+
+
+
+ If the input objects are not of the same type or if the object type is invalid. + |
+
src/nplinker/nplinker.py
lookup_bgc
+
+
+¶Get the BGC object with the given ID.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ the ID of the BGC to look up. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ BGC | None
+ |
+
+
+
+ The BGC object with the given ID, or None if no such object exists. + |
+
src/nplinker/nplinker.py
lookup_gcf
+
+
+¶Get the GCF object with the given ID.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ the ID of the GCF to look up. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ GCF | None
+ |
+
+
+
+ The GCF object with the given ID, or None if no such object exists. + |
+
src/nplinker/nplinker.py
lookup_spectrum
+
+
+¶Get the Spectrum object with the given ID.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ the ID of the Spectrum to look up. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Spectrum | None
+ |
+
+
+
+ The Spectrum object with the given ID, or None if no such object exists. + |
+
src/nplinker/nplinker.py
lookup_mf
+
+
+¶lookup_mf(id: str) -> MolecularFamily | None
+
Get the MolecularFamily object with the given ID.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ the ID of the MolecularFamily to look up. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ MolecularFamily | None
+ |
+
+
+
+ The MolecularFamily object with the given ID, or None if no such object exists. + |
+
src/nplinker/nplinker.py
save_data
+
+
+¶Pickle data to a file.
+The data to be pickled is a tuple containing the BGCs, GCFs, Spectra, MolecularFamilies,
+StrainCollection and links, i.e. (bgcs, gcfs, spectra, mfs, strains, links)
. If the links
+are not provided, None
will be used.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the pickle file to save the data to. + |
+ + required + | +
links |
+
+ LinkGraph | None
+ |
+
+
+
+ The LinkGraph object to save. + |
+
+ None
+ |
+
src/nplinker/nplinker.py
setup_logging
+
+
+¶Set up logging configuration for the ancestor logger "nplinker".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
level |
+
+ str
+ |
+
+
+
+ The log level, use the logging module's log level constants. Valid levels are: +"NOTSET", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL". + |
+
+ 'INFO'
+ |
+
file |
+
+ str
+ |
+
+
+
+ The file to write the log to. If the file does not exist, it will be created. The log +will be written to the file in append mode. If the file is an empty string (by default), +the log will not be written to a file. + |
+
+ ''
+ |
+
use_console |
+
+ bool
+ |
+
+
+
+ Whether to log to the console. + |
+
+ True
+ |
+
src/nplinker/logger.py
defaults
+
+
+¶
NPLINKER_APP_DATA_DIR
+
+
+
+ module-attribute
+
+
+¶NPLINKER_APP_DATA_DIR: Final = parent / 'data'
+
STRAIN_MAPPINGS_FILENAME
+
+
+
+ module-attribute
+
+
+¶STRAIN_MAPPINGS_FILENAME: Final = 'strain_mappings.json'
+
GENOME_BGC_MAPPINGS_FILENAME
+
+
+
+ module-attribute
+
+
+¶GENOME_BGC_MAPPINGS_FILENAME: Final = (
+ "genome_bgc_mappings.json"
+)
+
GENOME_STATUS_FILENAME
+
+
+
+ module-attribute
+
+
+¶GENOME_STATUS_FILENAME: Final = 'genome_status.json'
+
GNPS_SPECTRA_FILENAME
+
+
+
+ module-attribute
+
+
+¶GNPS_SPECTRA_FILENAME: Final = 'spectra.mgf'
+
GNPS_MOLECULAR_FAMILY_FILENAME
+
+
+
+ module-attribute
+
+
+¶GNPS_MOLECULAR_FAMILY_FILENAME: Final = (
+ "molecular_families.tsv"
+)
+
GNPS_ANNOTATIONS_FILENAME
+
+
+
+ module-attribute
+
+
+¶GNPS_ANNOTATIONS_FILENAME: Final = 'annotations.tsv'
+
GNPS_FILE_MAPPINGS_TSV
+
+
+
+ module-attribute
+
+
+¶GNPS_FILE_MAPPINGS_TSV: Final = 'file_mappings.tsv'
+
GNPS_FILE_MAPPINGS_CSV
+
+
+
+ module-attribute
+
+
+¶GNPS_FILE_MAPPINGS_CSV: Final = 'file_mappings.csv'
+
STRAINS_SELECTED_FILENAME
+
+
+
+ module-attribute
+
+
+¶STRAINS_SELECTED_FILENAME: Final = 'strains_selected.json'
+
DOWNLOADS_DIRNAME
+
+
+
+ module-attribute
+
+
+¶DOWNLOADS_DIRNAME: Final = 'downloads'
+
ANTISMASH_DIRNAME
+
+
+
+ module-attribute
+
+
+¶ANTISMASH_DIRNAME: Final = 'antismash'
+
BIGSCAPE_DIRNAME
+
+
+
+ module-attribute
+
+
+¶BIGSCAPE_DIRNAME: Final = 'bigscape'
+
config
+
+
+¶
CONFIG_VALIDATORS
+
+
+
+ module-attribute
+
+
+¶CONFIG_VALIDATORS = [
+ Validator(
+ "root_dir",
+ required=True,
+ cast=transform_to_full_path,
+        condition=lambda v: v.is_dir(),
+ ),
+ Validator(
+ "mode",
+ required=True,
+        cast=lambda v: v.lower(),
+ is_in=["local", "podp"],
+ ),
+ Validator(
+ "podp_id",
+ required=True,
+ when=Validator("mode", eq="podp"),
+ ),
+ Validator(
+ "podp_id",
+ required=False,
+ when=Validator("mode", eq="local"),
+ ),
+ Validator(
+ "log.level",
+ is_type_of=str,
+        cast=lambda v: v.upper(),
+ is_in=[
+ "NOTSET",
+ "DEBUG",
+ "INFO",
+ "WARNING",
+ "ERROR",
+ "CRITICAL",
+ ],
+ ),
+ Validator("log.file", is_type_of=str),
+ Validator("log.use_console", is_type_of=bool),
+ Validator(
+ "mibig.to_use", required=True, is_type_of=bool
+ ),
+ Validator(
+ "mibig.version",
+ required=True,
+ is_type_of=str,
+ when=Validator("mibig.to_use", eq=True),
+ ),
+ Validator(
+ "bigscape.parameters", required=True, is_type_of=str
+ ),
+ Validator(
+ "bigscape.cutoff", required=True, is_type_of=str
+ ),
+ Validator(
+ "scoring.methods",
+ required=True,
+        cast=lambda v: [i.lower() for i in v],
+ is_type_of=list,
+ len_min=1,
+        condition=lambda v: set(v).issubset(
+ {"metcalf", "rosetta"}
+ ),
+ ),
+]
+
load_config
+
+
+¶Load and validate the configuration file.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
config_file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the configuration file. + |
+ + required + | +
Returns:
+Name | Type | +Description | +
---|---|---|
Dynaconf |
+ Dynaconf
+ |
+
+
+
+ A Dynaconf object containing the configuration settings. + |
+
Raises:
+Type | +Description | +
---|---|
+ FileNotFoundError
+ |
+
+
+
+ If the configuration file does not exist. + |
+
src/nplinker/config.py
schemas
+
+
+¶
PODP_ADAPTED_SCHEMA
+
+
+
+ module-attribute
+
+
+¶PODP_ADAPTED_SCHEMA = load(f)
+
GENOME_STATUS_SCHEMA
+
+
+
+ module-attribute
+
+
+¶GENOME_STATUS_SCHEMA = load(f)
+
GENOME_BGC_MAPPINGS_SCHEMA
+
+
+
+ module-attribute
+
+
+¶GENOME_BGC_MAPPINGS_SCHEMA = load(f)
+
STRAIN_MAPPINGS_SCHEMA
+
+
+
+ module-attribute
+
+
+¶STRAIN_MAPPINGS_SCHEMA = load(f)
+
USER_STRAINS_SCHEMA
+
+
+
+ module-attribute
+
+
+¶USER_STRAINS_SCHEMA = load(f)
+
validate_podp_json
+
+
+¶validate_podp_json(json_data: dict) -> None
+
Validate a dictionary of JSON data against the PODP JSON schema.
+All validation error messages are collected and raised as a single +ValueError.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
json_data |
+
+ dict
+ |
+
+
+
+ The JSON data to validate. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ If the JSON data does not match the schema. + |
+
src/nplinker/schemas/utils.py
scoring
+
+
+¶
LinkGraph
+
+
+¶A class to represent the links between objects in NPLinker.
+This class wraps the networkx.Graph
class to provide a more user-friendly interface for
+working with the links.
The links between objects are stored as edges in a graph, while the objects themselves are +stored as nodes.
+The scoring data for each link (or link data) is stored as the key/value attributes of the edge.
+ + +Examples:
+Create a LinkGraph object:
+ +Add a link between a GCF and a Spectrum object:
+ +Get all links for a given object:
+ +Get all links:
+ +Check if there is a link between two objects:
+ +Get the link data between two objects:
+ + +src/nplinker/scoring/link_graph.py
links
+
+
+
+ property
+
+
+¶links: list[LINK]
+
Get all links.
+ + +Returns:
+Type | +Description | +
---|---|
+ list[LINK]
+ |
+
+
+
+ A list of tuples containing the links between objects. + |
+
add_link
+
+
+¶add_link(u: Entity, v: Entity, **data: Score) -> None
+
Add a link between two objects.
+The objects u
and v
must be different types, i.e. one must be a GCF and the other must be
+a Spectrum or MolecularFamily.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
u |
+
+ Entity
+ |
+
+
+
+ the first object, either a GCF, Spectrum, or MolecularFamily + |
+ + required + | +
v |
+
+ Entity
+ |
+
+
+
+ the second object, either a GCF, Spectrum, or MolecularFamily + |
+ + required + | +
data |
+
+ Score
+ |
+
+
+
+ keyword arguments. At least one scoring method and its data must be provided.
+The key must be the name of the scoring method defined in |
+
+ {}
+ |
+
src/nplinker/scoring/link_graph.py
has_link
+
+
+¶has_link(u: Entity, v: Entity) -> bool
+
Check if there is a link between two objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
u |
+
+ Entity
+ |
+
+
+
+ the first object, either a GCF, Spectrum, or MolecularFamily + |
+ + required + | +
v |
+
+ Entity
+ |
+
+
+
+ the second object, either a GCF, Spectrum, or MolecularFamily + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if there is a link between the two objects, False otherwise + |
+
src/nplinker/scoring/link_graph.py
get_link_data
+
+
+¶Get the data for a link between two objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
u |
+
+ Entity
+ |
+
+
+
+ the first object, either a GCF, Spectrum, or MolecularFamily + |
+ + required + | +
v |
+
+ Entity
+ |
+
+
+
+ the second object, either a GCF, Spectrum, or MolecularFamily + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ LINK_DATA | None
+ |
+
+
+
+ A dictionary of scoring methods and their data for the link between the two objects, or + |
+
+ LINK_DATA | None
+ |
+
+
+
+ None if there is no link between the two objects. + |
+
src/nplinker/scoring/link_graph.py
Score
+
+
+
+ dataclass
+
+
+¶A data class to represent score data.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
name |
+
+ str
+ |
+
+
+
+ the name of the scoring method. See |
+
value |
+
+ float
+ |
+
+
+
+ the score value. + |
+
parameter |
+
+ dict
+ |
+
+
+
+ the parameters used for the scoring method. + |
+
abc
+
+
+¶
ScoringBase
+
+
+¶
+ Bases: ABC
Abstract base class of scoring methods.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
name |
+
+ str
+ |
+
+
+
+ The name of the scoring method. + |
+
npl |
+
+ NPLinker | None
+ |
+
+
+
+ The NPLinker object. + |
+
name
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶name: str = 'ScoringBase'
+
npl
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶npl: NPLinker | None = None
+
get_links
+
+
+
+ abstractmethod
+
+
+¶get_links(*objects, **parameters) -> LinkGraph
+
Get links information for the given objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
objects |
+ + | +
+
+
+ A list of objects to get links for. + |
+
+ ()
+ |
+
parameters |
+ + | +
+
+
+ The parameters used for scoring. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ LinkGraph
+ |
+
+
+
+ The LinkGraph object. + |
+
src/nplinker/scoring/abc.py
scoring
+
+
+¶
ScoringMethod
+
+
+¶
+ Bases: Enum
Enum class for scoring methods.
+ + + + +
METCALF
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
ROSETTA
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
NPLCLASS
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
has_value
+
+
+
+ classmethod
+
+
+¶
MetcalfScoring
+
+
+¶
+ Bases: ScoringBase
Metcalf scoring method.
+ + +Attributes:
+Name | +Type | +Description | +
---|---|---|
name |
+ + | +
+
+
+ The name of this scoring method, set to a fixed value |
+
npl |
+
+ NPLinker | None
+ |
+
+
+
+ The NPLinker object. + |
+
CACHE |
+
+ str
+ |
+
+
+
+ The name of the cache file to use for storing the MetcalfScoring. + |
+
presence_gcf_strain |
+
+ DataFrame
+ |
+
+
+
+ A DataFrame to store presence of gcfs with respect to strains. +The index of the DataFrame are the GCF objects and the columns are Strain objects. +The values are 1 where the gcf occurs in the strain, 0 otherwise. + |
+
presence_spec_strain |
+
+ DataFrame
+ |
+
+
+
+ A DataFrame to store presence of spectra with respect to strains. +The index of the DataFrame are the Spectrum objects and the columns are Strain objects. +The values are 1 where the spectrum occurs in the strain, 0 otherwise. + |
+
presence_mf_strain |
+
+ DataFrame
+ |
+
+
+
+ A DataFrame to store presence of molecular families with respect to strains. +The index of the DataFrame are the MolecularFamily objects and the columns are Strain objects. +The values are 1 where the molecular family occurs in the strain, 0 otherwise. + |
+
raw_score_spec_gcf |
+
+ DataFrame
+ |
+
+
+
+ A DataFrame to store the raw Metcalf scores for spectrum-gcf links. +The columns are "spec", "gcf" and "score": +
|
+
raw_score_mf_gcf |
+
+ DataFrame
+ |
+
+
+
+ A DataFrame to store the raw Metcalf scores for molecular family-gcf links. +The columns are "mf", "gcf" and "score": +
|
+
metcalf_mean |
+
+ ndarray | None
+ |
+
+
+
+ A numpy array to store the mean value used for standardising Metcalf scores. +The array has shape (n_strains+1, n_strains+1), where n_strains is the number of strains. + |
+
metcalf_std |
+
+ ndarray | None
+ |
+
+
+
+ A numpy array to store the standard deviation value used for standardising +Metcalf scores. The array has shape (n_strains+1, n_strains+1), where n_strains is the +number of strains. + |
+
name
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶name = METCALF.value
+
npl
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶npl: NPLinker | None = None
+
CACHE
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶CACHE: str = 'cache_metcalf_scoring.pckl'
+
metcalf_weights
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
presence_gcf_strain
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
presence_spec_strain
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
presence_mf_strain
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
raw_score_spec_gcf
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
raw_score_mf_gcf
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶
metcalf_mean
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶metcalf_mean: ndarray | None = None
+
metcalf_std
+
+
+
+ class-attribute
+ instance-attribute
+
+
+¶metcalf_std: ndarray | None = None
+
setup
+
+
+
+ classmethod
+
+
+¶setup(npl: NPLinker)
+
Set up the MetcalfScoring object.
+This method is only called once to set up the MetcalfScoring object.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
npl |
+
+ NPLinker
+ |
+
+
+
+ The NPLinker object. + |
+ + required + | +
src/nplinker/scoring/metcalf_scoring.py
get_links
+
+
+¶Get links for the given objects.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
objects |
+ + | +
+
+
+ The objects to get links for. All objects must be of the same type, i.e. |
+
+ ()
+ |
+
parameters |
+ + | +
+
+
+ The scoring parameters to use for the links. The parameters are: +
|
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ | +
+
+
+ The |
+
Raises:
+Type | +Description | +
---|---|
+ TypeError
+ |
+
+
+
+ If the input objects are not of the same type or the object type is invalid. + |
+
src/nplinker/scoring/metcalf_scoring.py
145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 |
|
format_data
+
+
+¶
sort
+
+
+¶Sort the objects based on the score.
+ + +
utils
+
+
+¶
get_presence_gcf_strain
+
+
+¶get_presence_gcf_strain(
+ gcfs: Sequence[GCF], strains: StrainCollection
+) -> DataFrame
+
Get the occurrence of strains in gcfs.
+The occurrence is a DataFrame with GCF objects as index and Strain objects as columns, and the +values are 1 if the gcf occurs in the strain, 0 otherwise.
+ +src/nplinker/scoring/utils.py
get_presence_spec_strain
+
+
+¶get_presence_spec_strain(
+ spectra: Sequence[Spectrum], strains: StrainCollection
+) -> DataFrame
+
Get the occurrence of strains in spectra.
+The occurrence is a DataFrame with Spectrum objects as index and Strain objects as columns, and +the values are 1 if the spectrum occurs in the strain, 0 otherwise.
+ +src/nplinker/scoring/utils.py
get_presence_mf_strain
+
+
+¶get_presence_mf_strain(
+ mfs: Sequence[MolecularFamily],
+ strains: StrainCollection,
+) -> DataFrame
+
Get the occurrence of strains in molecular families.
+The occurrence is a DataFrame with MolecularFamily objects as index and Strain objects as +columns, and the values are 1 if the molecular family occurs in the strain, 0 otherwise.
+ +src/nplinker/scoring/utils.py
strain
+
+
+¶
Strain
+
+
+¶Strain(id: str)
+
To model the mapping between strain id and its aliases.
+It's recommended to use NCBI taxonomy strain id or name as the primary +id.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
id |
+
+ str
+ |
+
+
+
+ the representative id of the strain. + |
+ + required + | +
src/nplinker/strain/strain.py
names
+
+
+
+ property
+
+
+¶
aliases
+
+
+
+ property
+
+
+¶
add_alias
+
+
+¶add_alias(alias: str) -> None
+
Add an alias to the list of known aliases.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
alias |
+
+ str
+ |
+
+
+
+ The alias to add to the list of known aliases. + |
+ + required + | +
src/nplinker/strain/strain.py
StrainCollection
+
+
+¶A collection of Strain objects.
+ +src/nplinker/strain/strain_collection.py
add
+
+
+¶add(strain: Strain) -> None
+
Add strain to the collection.
+If the strain already exists, merge the aliases.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
strain |
+
+ Strain
+ |
+
+
+
+ The strain to add. + |
+ + required + | +
src/nplinker/strain/strain_collection.py
remove
+
+
+¶remove(strain: Strain)
+
Remove a strain from the collection.
+It removes the given strain object from the collection by strain id. +If the strain id is not found, raise ValueError.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
strain |
+
+ Strain
+ |
+
+
+
+ The strain to remove. + |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ If the strain is not found in the collection. + |
+
src/nplinker/strain/strain_collection.py
filter
+
+
+¶Remove all strains that are not in strain_set from the strain collection.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
strain_set |
+
+ set[Strain]
+ |
+
+
+
+ Set of strains to keep. + |
+ + required + | +
src/nplinker/strain/strain_collection.py
intersection
+
+
+¶intersection(other: StrainCollection) -> StrainCollection
+
Get the intersection of two strain collections.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
other |
+
+ StrainCollection
+ |
+
+
+
+ The other strain collection to compare. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ StrainCollection
+ |
+
+
+
+ StrainCollection object containing the strains that are in both collections. + |
+
src/nplinker/strain/strain_collection.py
has_name
+
+
+¶Check if the strain collection contains the given strain name (id or alias).
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
name |
+
+ str
+ |
+
+
+
+ Strain name (id or alias) to check. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if the strain name is in the collection, False otherwise. + |
+
src/nplinker/strain/strain_collection.py
lookup
+
+
+¶Lookup a strain by name (id or alias).
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
name |
+
+ str
+ |
+
+
+
+ Strain name (id or alias) to lookup. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ list[Strain]
+ |
+
+
+
+ List of Strain objects with the given name. + |
+
Raises:
+Type | +Description | +
---|---|
+ ValueError
+ |
+
+
+
+ If the strain name is not found. + |
+
src/nplinker/strain/strain_collection.py
read_json
+
+
+
+ staticmethod
+
+
+¶Read a strain mappings JSON file and return a StrainCollection object.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the strain mappings JSON file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ 'StrainCollection'
+ |
+
+
+
+ StrainCollection object. + |
+
src/nplinker/strain/strain_collection.py
to_json
+
+
+¶Convert the StrainCollection object to a JSON string.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike | None
+ |
+
+
+
+ Path to output JSON file. If None, +return the JSON string instead. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ str | None
+ |
+
+
+
+ If |
+
+ str | None
+ |
+
+
+
+ file. + |
+
src/nplinker/strain/strain_collection.py
utils
+
+
+¶
load_user_strains
+
+
+¶Load user specified strains from a JSON file.
+The JSON file must follow the schema defined in schemas/user_strains.json
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
json_file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the JSON file containing user specified strains. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ set[Strain]
+ |
+
+
+
+ A set of user specified strains. + |
+
src/nplinker/strain/utils.py
podp_generate_strain_mappings
+
+
+¶podp_generate_strain_mappings(
+ podp_project_json_file: str | PathLike,
+ genome_status_json_file: str | PathLike,
+ genome_bgc_mappings_file: str | PathLike,
+ gnps_file_mappings_file: str | PathLike,
+ output_json_file: str | PathLike,
+) -> StrainCollection
+
Generate strain mappings JSON file for PODP pipeline.
+To get the strain mappings, we need to combine the following mappings:
+These mappings are extracted from the following files:
+podp_project_json_file
.genome_status_json_file
.genome_bgc_mappings_file
.podp_project_json_file
.gnps_file_mappings_file
.Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
podp_project_json_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the PODP project +JSON file. + |
+ + required + | +
genome_status_json_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the genome status +JSON file. + |
+ + required + | +
genome_bgc_mappings_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the genome BGC +mappings JSON file. + |
+ + required + | +
gnps_file_mappings_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the GNPS file +mappings file (csv or tsv). + |
+ + required + | +
output_json_file |
+
+ str | PathLike
+ |
+
+
+
+ The path to the output JSON file. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ StrainCollection
+ |
+
+
+
+ The strain mappings stored in a StrainCollection object. + |
+
extract_mappings_strain_id_original_genome_id
: Extract mappings
+ "strain_id <-> original_genome_id".extract_mappings_original_genome_id_resolved_genome_id
: Extract mappings
+ "original_genome_id <-> resolved_genome_id".extract_mappings_resolved_genome_id_bgc_id
: Extract mappings
+ "resolved_genome_id <-> bgc_id".get_mappings_strain_id_bgc_id
: Get mappings "strain_id <-> bgc_id".extract_mappings_strain_id_ms_filename
: Extract mappings
+ "strain_id <-> MS_filename".extract_mappings_ms_filename_spectrum_id
: Extract mappings
+ "MS_filename <-> spectrum_id".get_mappings_strain_id_spectrum_id
: Get mappings "strain_id <-> spectrum_id".src/nplinker/strain/utils.py
50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 |
|
utils
+
+
+¶
calculate_md5
+
+
+¶Calculate the MD5 checksum of a file.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fpath |
+
+ str | PathLike
+ |
+
+
+
+ Path to the file. + |
+ + required + | +
chunk_size |
+
+ int
+ |
+
+
+
+ Chunk size for reading the file. Defaults to 1024*1024. + |
+
+ 1024 * 1024
+ |
+
Returns:
+Type | +Description | +
---|---|
+ str
+ |
+
+
+
+ MD5 checksum of the file. + |
+
src/nplinker/utils.py
check_disk_space
+
+
+¶A decorator to check available disk space.
+If the available disk space is less than 500GB, a warning is logged and a warning is raised.
+ +src/nplinker/utils.py
check_md5
+
+
+¶Verify the MD5 checksum of a file.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fpath |
+
+ str | PathLike
+ |
+
+
+
+ Path to the file. + |
+ + required + | +
md5 |
+
+ str
+ |
+
+
+
+ MD5 checksum to verify. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if the MD5 checksum matches, False otherwise. + |
+
src/nplinker/utils.py
download_and_extract_archive
+
+
+¶download_and_extract_archive(
+ url: str,
+ download_root: str | PathLike,
+ extract_root: str | Path | None = None,
+ filename: str | None = None,
+ md5: str | None = None,
+ remove_finished: bool = False,
+) -> None
+
Download a file from url and extract it.
+This method is a wrapper of download_url
and extract_archive
methods.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
url |
+
+ str
+ |
+
+
+
+ URL to download file from + |
+ + required + | +
download_root |
+
+ str | PathLike
+ |
+
+
+
+ Path to the directory to place downloaded +file in. If it doesn't exist, it will be created. + |
+ + required + | +
extract_root |
+
+ str | Path | None
+ |
+
+
+
+ Path to the directory the file
+will be extracted to. The given directory will be created if it does not exist.
+If omitted, the |
+
+ None
+ |
+
filename |
+
+ str | None
+ |
+
+
+
+ Name to save the downloaded file under. +If None, use the basename of the URL + |
+
+ None
+ |
+
md5 |
+
+ str | None
+ |
+
+
+
+ MD5 checksum of the download. If None, do not check + |
+
+ None
+ |
+
remove_finished |
+
+ bool
+ |
+
+
+
+ If |
+
+ False
+ |
+
src/nplinker/utils.py
download_url
+
+
+¶download_url(
+ url: str,
+ root: str | PathLike,
+ filename: str | None = None,
+ md5: str | None = None,
+ http_method: str = "GET",
+ allow_http_redirect: bool = True,
+) -> None
+
Download a file from a url and place it in root.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
url |
+
+ str
+ |
+
+
+
+ URL to download file from + |
+ + required + | +
root |
+
+ str | PathLike
+ |
+
+
+
+ Directory to place downloaded file in. If it doesn't exist, it will be created. + |
+ + required + | +
filename |
+
+ str | None
+ |
+
+
+
+ Name to save the file under. If None, use the +basename of the URL. + |
+
+ None
+ |
+
md5 |
+
+ str | None
+ |
+
+
+
+ MD5 checksum of the download. If None, do not check. + |
+
+ None
+ |
+
http_method |
+
+ str
+ |
+
+
+
+ HTTP request method, e.g. "GET", "POST". +Defaults to "GET". + |
+
+ 'GET'
+ |
+
allow_http_redirect |
+
+ bool
+ |
+
+
+
+ If true, enable following redirects for all HTTP ("http:") methods. + |
+
+ True
+ |
+
src/nplinker/utils.py
129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 |
|
extract_archive
+
+
+¶extract_archive(
+ from_path: str | PathLike,
+ extract_root: str | PathLike | None = None,
+ members: list | None = None,
+ remove_finished: bool = False,
+) -> str
+
Extract an archive.
+The archive type and a possible compression is automatically detected from
+the file name. If the file is compressed but not an archive the call is
+dispatched to the function decompress
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
from_path |
+
+ str | PathLike
+ |
+
+
+
+ Path to the file to be extracted. + |
+ + required + | +
extract_root |
+
+ str | PathLike | None
+ |
+
+
+
+          Path to the directory the file will be extracted to. +The given directory will be created if it does not exist. +If omitted, the directory of the archive file is used. +        |
+
+ None
+ |
+
members |
+
+ list | None
+ |
+
+
+
+ Optional selection of members to extract. If not specified,
+all members are extracted.
+Members must be a subset of the list returned by
+- |
+
+ None
+ |
+
remove_finished |
+
+ bool
+ |
+
+
+
+ If |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ str
+ |
+
+
+
+ Path to the directory the file was extracted to. + |
+
src/nplinker/utils.py
is_file_format
+
+
+¶Check if the file is in the given format.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file |
+
+ str | PathLike
+ |
+
+
+
+ Path to the file to check. + |
+ + required + | +
format |
+
+ str
+ |
+
+
+
+ The format to check for, either "tsv" or "csv". + |
+
+ 'tsv'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+
+
+
+ True if the file is in the given format, False otherwise. + |
+
src/nplinker/utils.py
list_dirs
+
+
+¶List all directories at a given root.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
root |
+
+ str | PathLike
+ |
+
+
+
+ Path to directory whose folders need to be listed + |
+ + required + | +
keep_parent |
+
+ bool
+ |
+
+
+
+ If true, prepends the path to each result, otherwise +only returns the name of the directories found + |
+
+ True
+ |
+
src/nplinker/utils.py
list_files
+
+
+¶list_files(
+ root: str | PathLike,
+ prefix: str | tuple[str, ...] = "",
+ suffix: str | tuple[str, ...] = "",
+ keep_parent: bool = True,
+) -> list[str]
+
List all files at a given root.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
root |
+
+ str | PathLike
+ |
+
+
+
+ Path to directory whose files need to be listed + |
+ + required + | +
prefix |
+
+ str | tuple[str, ...]
+ |
+
+
+
+ Prefix of the file names to match, +Defaults to empty string '""'. + |
+
+ ''
+ |
+
suffix |
+
+ str | tuple[str, ...]
+ |
+
+
+
+ Suffix of the files to match, e.g. ".png" or +(".jpg", ".png"). +Defaults to empty string '""'. + |
+
+ ''
+ |
+
keep_parent |
+
+ bool
+ |
+
+
+
+          If true, prepends the parent path to each +result, otherwise only returns the name of the files found. +Defaults to True. +        |
+
+ True
+ |
+
src/nplinker/utils.py
transform_to_full_path
+
+
+¶Transform a path to a full path.
+The path is expanded (i.e. the ~
will be replaced with actual path) and converted to an
+absolute path (i.e. .
or ..
will be replaced with actual path).
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
p |
+
+ str | PathLike
+ |
+
+
+
+ The path to transform. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Path
+ |
+
+
+
+ The transformed full path. + |
+
src/nplinker/utils.py