Skip to content

Commit

Permalink
Add caching for data loadinggit add . Complete load_scales docu
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Sep 23, 2023
1 parent 7ab4ba0 commit ca3d92c
Show file tree
Hide file tree
Showing 41 changed files with 582 additions and 151 deletions.
Binary file modified aaanalysis/__pycache__/utils.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc
Binary file not shown.
2 changes: 2 additions & 0 deletions aaanalysis/_utils/_utils_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
STR_SCALE_CAT = "scales_cat" # AAontology
STR_TOP60 = "top60" # AAclustTop60
STR_TOP60_EVAL = "top60_eval" # AAclustTop60 evaluation
NAMES_SCALE_SETS = [STR_SCALES, STR_SCALES_RAW, STR_SCALE_CAT,
STR_SCALES_PC, STR_TOP60, STR_TOP60_EVAL]


# Column names for primary df
Expand Down
Binary file modified aaanalysis/data_loader/__pycache__/data_loader.cpython-39.pyc
Binary file not shown.
239 changes: 142 additions & 97 deletions aaanalysis/data_loader/data_loader.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions aaanalysis/data_loader/data_read_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,7 @@ def read_fasta():
def to_fasta(df_seq=None):
"""Placeholder for writing the sequences in ``df_seq`` to a FASTA file (not implemented; returns None)."""
# TODO implement a writer to fasta from df_seq

def to_df_scales(df=None):
"""Placeholder for converting ``df`` into a scales DataFrame (not implemented; returns None)."""
# TODO implement parser from df to df_seq (remove not necessary columns and adjust naming)
# NOTE(review): the TODO names df_seq, but the function name suggests df_scales is the target -- confirm
17 changes: 16 additions & 1 deletion aaanalysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import os
import platform
from functools import lru_cache

# Import utility functions for specific purposes
from aaanalysis._utils._utils_constants import *
Expand All @@ -16,7 +17,7 @@

# I Folder structure
def _folder_path(super_folder, folder_name):
    """Join ``folder_name`` onto ``super_folder`` and end the result with ``SEP``."""
    # SEP is appended to the folder name before joining so the returned path
    # always ends with the separator.
    folder_with_sep = folder_name + SEP
    return os.path.join(super_folder, folder_with_sep)

Expand All @@ -28,6 +29,20 @@ def _folder_path(super_folder, folder_name):


# II MAIN FUNCTIONS
# Caching for data loading for better performance (data loaded once)
@lru_cache(maxsize=None)
def _read_excel_cached(name, index_col=None):
    """Parse the Excel file once per ``(name, index_col)`` pair and memoize the result."""
    return pd.read_excel(name, index_col=index_col)


def read_excel_cached(name, index_col=None):
    """Load a DataFrame from an Excel file, parsing the file only once.

    The parsed DataFrame is memoized per ``(name, index_col)`` combination.
    Every call returns an independent copy, so in-place modifications made by
    one caller can never leak into the cache and affect later callers.

    Parameters
    ----------
    name
        Path of the Excel file, forwarded to ``pandas.read_excel``.
        Must be hashable because it is part of the cache key.
    index_col
        Forwarded to ``pandas.read_excel`` as ``index_col``.
        Must be hashable because it is part of the cache key.

    Returns
    -------
    pandas.DataFrame
        A copy of the cached DataFrame.
    """
    # DataFrames are mutable: handing out the cached object itself would let
    # any caller silently corrupt the data seen by all subsequent callers.
    return _read_excel_cached(name, index_col).copy()


# Keep the ``functools.lru_cache`` helpers reachable under the public name so
# existing callers of ``cache_info()`` / ``cache_clear()`` continue to work.
read_excel_cached.cache_info = _read_excel_cached.cache_info
read_excel_cached.cache_clear = _read_excel_cached.cache_clear

@lru_cache(maxsize=None)
def _read_csv_cached(name, sep=None):
    """Parse the delimited text file once per ``(name, sep)`` pair and memoize the result."""
    return pd.read_csv(name, sep=sep)


def read_csv_cached(name, sep=None):
    """Load a DataFrame from a delimited text file, parsing the file only once.

    The parsed DataFrame is memoized per ``(name, sep)`` combination.
    Every call returns an independent copy, so in-place modifications made by
    one caller can never leak into the cache and affect later callers.

    Parameters
    ----------
    name
        Path of the file, forwarded to ``pandas.read_csv``.
        Must be hashable because it is part of the cache key.
    sep
        Forwarded to ``pandas.read_csv`` as ``sep``.

    Returns
    -------
    pandas.DataFrame
        A copy of the cached DataFrame.
    """
    # DataFrames are mutable: handing out the cached object itself would let
    # any caller silently corrupt the data seen by all subsequent callers.
    return _read_csv_cached(name, sep).copy()


# Keep the ``functools.lru_cache`` helpers reachable under the public name so
# existing callers of ``cache_info()`` / ``cache_clear()`` continue to work.
read_csv_cached.cache_info = _read_csv_cached.cache_info
read_csv_cached.cache_clear = _read_csv_cached.cache_clear


# Check key dataframes using constants and general checking functions (df_seq, df_parts, df_cat, df_scales, df_feat)
def check_df_seq(df_seq=None, jmd_n_len=None, jmd_c_len=None):
"""Get features from df"""
Expand Down
Binary file modified docs/build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/build/doctrees/generated/aaanalysis.load_dataset.doctree
Binary file not shown.
Binary file modified docs/build/doctrees/generated/aaanalysis.load_scales.doctree
Binary file not shown.
Binary file modified docs/build/doctrees/generated/tutorial2a_data_loader.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified docs/build/doctrees/tutorials.doctree
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Data loading
============

This is a tutorial on loading of protein and amino acid scale datasets.
This is a tutorial on loading of protein benchmark datasets.

Loading of protein benchmarks
-----------------------------
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Scale set loading
=================

This is a tutorial on loading amino acid scale sets, their
classification (AAontology), or their evaluation (AAclust top60).
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Scale set loading
=================

This is a tutorial on loading amino acid scale sets, their
classification (AAontology), or their evaluation (AAclust top60).
1 change: 1 addition & 0 deletions docs/build/html/_sources/tutorials.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ Further Tutorials
:maxdepth: 1

generated/tutorial2a_data_loader
generated/tutorial2b_scales_loader
6 changes: 3 additions & 3 deletions docs/build/html/api.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<meta property="og:description" content="This Application Programming Interface (API) is the public interface for the objects and functions of our AAanalysis Python toolkit, which can be imported by: You can then access all methods and ob..." />
<meta property="og:image:width" content="1146" />
<meta property="og:image:height" content="600" />
<meta property="og:image" content="/_images/social_previews/summary_api_d0528d29.png" />
<meta property="og:image" content="/None" />
<meta property="og:image:alt" content="This Application Programming Interface (API) is the public interface for the objects and functions of our AAanalysis Python toolkit, which can be imported by..." />
<meta name="description" content="This Application Programming Interface (API) is the public interface for the objects and functions of our AAanalysis Python toolkit, which can be imported by: You can then access all methods and ob..." />
<meta name="twitter:card" content="summary_large_image" />
Expand Down Expand Up @@ -42,7 +42,7 @@
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="aaanalysis.load_dataset" href="generated/aaanalysis.load_dataset.html" />
<link rel="prev" title="Data loading" href="generated/tutorial2a_data_loader.html" />
<link rel="prev" title="Scale set loading" href="generated/tutorial2b_scales_loader.html" />
</head>

<body class="wy-body-for-nav">
Expand Down Expand Up @@ -223,7 +223,7 @@ <h2>Plot Utilities<a class="headerlink" href="#plot-utilities" title="Permalink
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="generated/tutorial2a_data_loader.html" class="btn btn-neutral float-left" title="Data loading" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="generated/tutorial2b_scales_loader.html" class="btn btn-neutral float-left" title="Scale set loading" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="generated/aaanalysis.load_dataset.html" class="btn btn-neutral float-right" title="aaanalysis.load_dataset" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>

Expand Down
20 changes: 10 additions & 10 deletions docs/build/html/generated/aaanalysis.load_dataset.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta property="og:site_name" content="AAanalysis" />
<meta property="og:image:width" content="1146" />
<meta property="og:image:height" content="600" />
<meta property="og:image" content="/_images/social_previews/summary_generated_aaanalysis.load_dataset_c5bd8cb0.png" />
<meta property="og:image" content="/None" />
<meta property="og:image:alt" content="" />
<meta name="twitter:card" content="summary_large_image" />

Expand Down Expand Up @@ -126,26 +126,26 @@
<h1>aaanalysis.load_dataset<a class="headerlink" href="#aaanalysis-load-dataset" title="Permalink to this heading"></a></h1>
<dl class="py function">
<dt class="sig sig-object py" id="aaanalysis.load_dataset">
<span class="sig-prename descclassname"><span class="pre">aaanalysis.</span></span><span class="sig-name descname"><span class="pre">load_dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'INFO'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">non_canonical_aa</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'remove'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_len</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_len</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">aa_window_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">9</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/breimanntools/aaanalysis/tree/master/aaanalysis/data_loader/data_loader.py#L147-L249"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#aaanalysis.load_dataset" title="Permalink to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">aaanalysis.</span></span><span class="sig-name descname"><span class="pre">load_dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'INFO'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">non_canonical_aa</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'remove'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_len</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_len</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">aa_window_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">9</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/breimanntools/aaanalysis/tree/master/aaanalysis/data_loader/data_loader.py#L165-L267"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#aaanalysis.load_dataset" title="Permalink to this definition"></a></dt>
<dd><p>Load protein benchmarking datasets.</p>
<p>The benchmarks are categorized into amino acid (‘AA’), domain (‘DOM’), and sequence (‘SEQ’) level datasets.
By default, an overview table is provided (<code class="docutils literal notranslate"><span class="pre">name='INFO'</span></code>). For in-depth details, refer to <a class="reference internal" href="../index/references.html#breimann23a" id="id1"><span>[Breimann23a]</span></a>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a>) – The name of the loaded dataset, from ‘Dataset’ column in overview table.</p></li>
<li><p><strong>name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a>) – The name of the loaded dataset, from the ‘Dataset’ column in the overview table.</p></li>
<li><p><strong>n</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">Optional</span></code></a>[<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">int</span></code></a>]) – Number of proteins per class, selected by index. If None, the whole dataset will be returned.</p></li>
<li><p><strong>random</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">bool</span></code></a>) – If True, <code class="docutils literal notranslate"><span class="pre">n</span></code> random selected proteins per class will be chosen.</p></li>
<li><p><strong>non_canonical_aa</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Literal" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">Literal</span></code></a>[‘remove’, ‘keep’, ‘gap’]) – <p>Options for modifying non-canonical amino acids:</p>
<li><p><strong>random</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">bool</span></code></a>) – If True, <code class="docutils literal notranslate"><span class="pre">n</span></code> randomly selected proteins per class will be chosen.</p></li>
<li><p><strong>non_canonical_aa</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Literal" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">Literal</span></code></a>[‘remove’, ‘keep’, ‘gap’]) – <p>Options for handling non-canonical amino acids:</p>
<ul>
<li><p>’remove’: Remove sequences containing non-canonical amino acids.</p></li>
<li><p>’keep’: Don’t remove sequences containing non-canonical amino acids.</p></li>
<li><p>’gap’: Non-canonical amino acids are replaced by gap symbol (‘X’).</p></li>
<li><p>’gap’: Non-canonical amino acids are replaced by the gap symbol (‘X’).</p></li>
</ul>
</p></li>
<li><p><strong>min_len</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">Optional</span></code></a>[<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">int</span></code></a>]) – Minimum length of sequences for filtering, disabled by default.</p></li>
<li><p><strong>max_len</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">Optional</span></code></a>[<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">int</span></code></a>]) – Maximum length of sequences for filtering, disabled by default.</p></li>
<li><p><strong>aa_window_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">Optional</span></code></a>[<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">int</span></code></a>]) – Length of amino acid window, only used for amino acid dataset level (<code class="docutils literal notranslate"><span class="pre">name='AA_'</span></code>) and if <code class="docutils literal notranslate"><span class="pre">n</span></code> given.</p></li>
<li><p><strong>aa_window_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">Optional</span></code></a>[<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">int</span></code></a>]) – Length of amino acid window, only used for the amino acid dataset level (<code class="docutils literal notranslate"><span class="pre">name='AA_'</span></code>) and if <code class="docutils literal notranslate"><span class="pre">n</span></code> is given.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
Expand All @@ -158,12 +158,12 @@ <h1>aaanalysis.load_dataset<a class="headerlink" href="#aaanalysis-load-dataset"
</dl>
<div class="admonition-notes admonition">
<p class="admonition-title">Notes</p>
<p>The <code class="docutils literal notranslate"><span class="pre">df_seq</span></code> DataFrame includs these columns:</p>
<p><code class="docutils literal notranslate"><span class="pre">df_seq</span></code> includes these columns:</p>
<ul class="simple">
<li><p>‘entry’: Protein identifier, either the UniProt accession number or an id based on index.</p></li>
<li><p>‘sequence’: Amino acid sequence.</p></li>
<li><p>‘label’: Binary classification label (0 for negatives, 1 for positives).</p></li>
<li><p>‘tmd_start’, ‘tmd_stop’: Start and stop positions of TMD (present only at domain level).</p></li>
<li><p>‘tmd_start’, ‘tmd_stop’: Start and stop positions of TMD (present only at the domain level).</p></li>
<li><p>‘jmd_n’, ‘tmd’, ‘jmd_c’: Sequences for JMD_N, TMD, and JMD_C respectively.</p></li>
</ul>
</div>
Expand All @@ -177,8 +177,8 @@ <h1>aaanalysis.load_dataset<a class="headerlink" href="#aaanalysis-load-dataset"
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<ul class="simple">
<li><p>Step-by-step guide in the <a class="reference external" href="tutorial2a_data_loader.html">data loading tutorial</a>.</p></li>
<li><p>Overview of all benchmarks in <a class="reference internal" href="../index/tables.html#t1-overview-benchmarks"><span class="std std-ref">Protein Benchmark Datasets</span></a>.</p></li>
<li><p>Step-by-step guide in the <a class="reference external" href="tutorial2a_data_loader.html">data loading tutorial</a>.</p></li>
</ul>
</div>
</dd></dl>
Expand Down
Loading

0 comments on commit ca3d92c

Please sign in to comment.