Skip to content

Commit

Permalink
add pathbank download
Browse files Browse the repository at this point in the history
  • Loading branch information
cwieder committed Dec 3, 2023
1 parent ac17b04 commit 70e7bee
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 5 deletions.
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ processed_data = (log2_mat - log2_mat.mean(axis=0)) / log2_mat.std(axis=0)
```python
# Pre-loaded pathways
# Reactome v78
reactome_pathways = sspa.process_reactome(organism="Homo sapiens")
reactome_pathways = sspa.process_reactome(organism="Homo sapiens")

# KEGG v98
kegg_human_pathways = sspa.process_kegg(organism="hsa")
kegg_human_pathways = sspa.process_kegg(organism="hsa")
```

Load a custom GMT file (extension .gmt or .csv)
Expand All @@ -96,10 +96,14 @@ Download latest version of pathways
kegg_mouse_latest = sspa.process_kegg("mmu", download_latest=True, filepath=".")

# download Reactome latest
reactome_mouse_latest = sspa.process_reactome("Mus musculus", download_latest=True, filepath=".")
reactome_mouse_latest = sspa.process_reactome("Mus musculus", download_latest=True, filepath=".", omicstype='metabolomics')

# download Pathbank latest
pathbank_human_latest = sspa.process_pathbank("Homo sapiens", download_latest=True, filepath=".", omicstype='metabolomics')
```

## Identifier harmonization
Note: KEGG pathways use KEGG compound IDs, whereas Reactome and PathBank pathways use ChEBI IDs for metabolites and UniProt IDs for proteins.
```python
# download the conversion table
compound_names = processed_data.columns.tolist()
Expand Down Expand Up @@ -200,6 +204,9 @@ We are grateful for our contributors who help develop and maintain py-ssPA:
<details>
<summary>Read more</summary>

### [v1.0.2] - 4/12/23
- Enable download of Pathbank pathways (metabolite and protein) via the `process_pathbank()` function

### [v1.0.0] - 25/08/23
- Add compatibility with scikit-learn by implementing `fit()`, `transform()` and `fit_transform()` methods for all ssPA methods. This allows integration of ssPA transformation with various scikit-learn machine learning utilities such as `Pipeline` and `GridSearchCV`. Specifically, for the `sspa.sspa_ssClustPA`, `sspa.sspa_SVD`, and `sspa.sspa_KPCA` methods, the model can be fit on the training data and the test data is then transformed using the fitted model.
- Fixed ID conversion bug in `sspa.map_identifiers()` due to MetaboAnalyst API URL change
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name='sspa',
version='1.0.0',
version='1.0.2',
packages=['sspa'],
package_dir={'':'src'},
url='https://github.com/cwieder/sspa',
Expand Down
2 changes: 1 addition & 1 deletion src/sspa/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pkg_resources import get_distribution
__version__ = get_distribution('sspa').version

from .process_pathways import process_reactome, process_kegg, process_gmt
from .process_pathways import process_reactome, process_kegg, process_gmt, process_pathbank
from .sspa_cluster import sspa_ssClustPA
from .sspa_kpca import sspa_KPCA
from .sspa_zscore import sspa_zscore
Expand Down
91 changes: 91 additions & 0 deletions src/sspa/download_pathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,94 @@ def download_metexplore(self):

print("Complete!")
return pathways


def download_pathbank(organism, filepath=None, omicstype='metabolomics'):
    '''
    Function for PathBank pathway download
    Args:
        organism (str): PathBank organism name, e.g. 'Homo sapiens'
        filepath (str): directory to save the pathway file to, default is None - save to variable
        omicstype (str): type of omics pathways to download.
            Options are 'metabolomics' (ChEBI identifiers), 'proteomics' (UniProt identifiers),
            or 'multiomics' (ChEBI and UniProt identifiers)
    Returns:
        GMT-like pd.DataFrame indexed by PathBank ID, with 'Pathway_name' as the first
        column followed by one identifier per column
    Raises:
        ValueError: if organism or omicstype is not one of the supported options
    '''
    organisms = ['Homo sapiens', 'Escherichia coli', 'Mus musculus', 'Arabidopsis thaliana',
                 'Saccharomyces cerevisiae', 'Bos taurus', 'Caenorhabditis elegans',
                 'Rattus norvegicus', 'Drosophila melanogaster', 'Pseudomonas aeruginosa']
    if organism not in organisms:
        raise ValueError('Organism must be one of '+ ", ".join(organisms))

    # (download URL, identifier column) for each PathBank table
    metabolites = ('https://pathbank.org/downloads/pathbank_all_metabolites.csv.zip', 'ChEBI ID')
    proteins = ('https://pathbank.org/downloads/pathbank_all_proteins.csv.zip', 'Uniprot ID')
    # omics type -> (tables to combine, suffix used in the saved file name)
    sources = {
        'metabolomics': ([metabolites], 'ChEBI'),
        'proteomics': ([proteins], 'UniProt'),
        'multiomics': ([metabolites, proteins], 'multiomics'),
    }
    # Validate before any download so a typo fails fast instead of after fetching data
    if omicstype not in sources:
        raise ValueError('omicstype must be one of ' + ", ".join(sources))
    tables, file_suffix = sources[omicstype]

    pathway_names = pd.read_csv('https://pathbank.org/downloads/pathbank_all_pathways.csv.zip', compression='zip', sep=',', header=0)
    name_dict = dict(zip(pathway_names['SMPDB ID'], pathway_names['Name']))

    frames = [_pathbank_id_table(url, organism, id_column) for url, id_column in tables]
    # For multiomics the metabolite and protein tables are placed side by side per pathway
    pathways_gmt = frames[0] if len(frames) == 1 else pd.concat(frames, axis=1)

    # Put the human-readable pathway name in the first column
    pathways_gmt['Pathway_name'] = pathways_gmt.index.map(name_dict)
    pathways_gmt.insert(0, 'Pathway_name', pathways_gmt.pop('Pathway_name'))

    if filepath:
        fpath = filepath + "/Pathbank_" + "_".join(organism.split()) + "_pathways_" + file_suffix + ".gmt"
        pathways_gmt.to_csv(fpath, sep="\t", header=False)
        print("Pathbank DB file saved to " + fpath)

    print("Complete!")
    return pathways_gmt


def _pathbank_id_table(url, organism, id_column):
    '''Download one PathBank table and reshape it to one row per pathway, one identifier per column.'''
    table = pd.read_csv(url, compression='zip', sep=',', header=0, dtype=str)
    table = table[table['Species'] == organism]

    # reformat to gmt style such that each row holds the identifiers of one pathway
    grouped = table.groupby(['PathBank ID', 'Pathway Name'])[id_column].apply(list).reset_index()
    return pd.DataFrame(grouped[id_column].values.tolist(), index=grouped['PathBank ID'])
23 changes: 23 additions & 0 deletions src/sspa/process_pathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,29 @@ def process_kegg(organism, infile=None, download_latest=False, filepath=None):
pathways_df = pathways_df.mask(mask, None)

return pathways_df


def process_pathbank(organism, infile=None, download_latest=False, filepath=None, omics_type='metabolomics', omicstype=None):
    '''
    Function to load PathBank pathways
    Args:
        organism (str): PathBank organism name, only used when download_latest is True
        infile (str): default None, provide a saved PathBank pathway file to load as a GMT-style dataframe.
            Files ending in .gmt are read as tab-separated without a header row (the format
            written when downloading with a filepath); any other file is read as a
            comma-separated file with a header row
        download_latest (Bool): Downloads the latest version of PathBank metabolic pathways
        filepath (str): filepath to save pathway file to, default is None - save to variable
        omics_type (str): If using download_latest, specify type of omics pathways to download.
            Options are 'metabolomics', 'proteomics', or 'multiomics'
        omicstype (str): alias for omics_type; takes precedence when provided
    Returns:
        GMT-like pd.DataFrame containing PathBank pathways, or None if neither
        download_latest nor infile is given
    '''
    # Accept both keyword spellings so calls using omicstype= keep working
    if omicstype is not None:
        omics_type = omicstype

    if download_latest:
        return sspa.download_pathways.download_pathbank(organism, filepath, omics_type)

    if not infile:
        print('Set download_latest=True to download latest version of PathBank pathways or provide a saved PathBank pathway .csv/.gmt file to load')
        return None

    if str(infile).lower().endswith('.gmt'):
        # Saved GMT files are tab-separated with no header: pathway ID, pathway name, then identifiers
        pathways_df = pd.read_csv(infile, sep='\t', header=None, index_col=0, dtype=str)
        n_cols = pathways_df.shape[1]
        if n_cols:
            pathways_df.columns = ['Pathway_name'] + list(range(n_cols - 1))
    else:
        pathways_df = pd.read_csv(infile, index_col=0)

    return pathways_df


def process_gmt(infile):
'''
Expand Down

0 comments on commit 70e7bee

Please sign in to comment.