diff --git a/.gitignore b/.gitignore
index d42b330..34c79cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,136 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Output data from example script
+output/
+
build/
.vscode/
__pycache__/
diff --git a/README.md b/README.md
index 1a57153..fd19ef8 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,13 @@
# Caribou
Alignment-free bacterial identification and classification in metagenomics sequencing data using machine learning.
+## Proof of Concept
+The Jupyter notebook `workflow_example.ipynb` shows the workflow and its output using example data. In this notebook, the steps are identified for better understanding.
+
+The data used in `workflow_example.ipynb` is located in the `example_data/` folder.
+
+This data was also used to test and debug the Caribou analysis pipeline.
+
## Installation
The Caribou analysis pipeline was developped in python3 and can be easily installed through the python wheel. The repo must be cloned first and then the package can be installed using the following commands lines in the desired folder :
```
diff --git a/example_data/30_genomes.csv b/example_data/30_genomes.csv
new file mode 100644
index 0000000..1652712
--- /dev/null
+++ b/example_data/30_genomes.csv
@@ -0,0 +1,31 @@
+id,species,genus,family,order,class,phylum,domain
+VBOR01000009.1,WS-7 sp005893165,WS-7,SZUA-252,SZUA-252,RBG-16-71-46,Eisenbacteria,Bacteria
+PMOP01000016.1,Palsa-360 sp003161495,Palsa-360,UBA7541,UBA7541,Acidobacteriae,Acidobacteriota,Bacteria
+DHUT01000069.1,Sedimentibacter sp002409285,Sedimentibacter,Sedimentibacteraceae,Tissierellales,Clostridia,Firmicutes_A,Bacteria
+JAAZAC010000025.1,Actinotalea sp012514545,Actinotalea,Cellulomonadaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+URUG01000300.1,Faecalimonas sp900550975,Faecalimonas,Lachnospiraceae,Lachnospirales,Clostridia,Firmicutes_A,Bacteria
+CAJCBY010000033.1,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+JABXKY010000147.1,RBG-16-57-9 sp013619005,RBG-16-57-9,TCS64,TCS64,Bathyarchaeia,Thermoproteota,Archaea
+JABBOX010000109.1,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JAABRC010000419.1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+CAAFRK010000216.1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+URSE01000035.1,Veillonella sp900550455,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+UQEY01000009.1,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+NZ_QEST01000278.1,Streptomyces sp003311645,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JACMKV010000045.1,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+WLHF01000026.1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+DKBA01000026.1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+WBXD01000017.1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+PBSX01000072.1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+JAAYQI010000217.1,JAAYQI01 sp012519385,JAAYQI01,Anaerotignaceae,Lachnospirales,Clostridia,Firmicutes_A,Bacteria
+PMSQ01000054.1,Sulfotelmatobacter sp003168355,Sulfotelmatobacter,Koribacteraceae,Acidobacteriales,Acidobacteriae,Acidobacteriota,Bacteria
+NZ_LCZE01000023.1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+CAIXRL010000197.1,CAIXRL01 sp903921835,CAIXRL01,RBG-16-71-46,RBG-16-71-46,RBG-16-71-46,Eisenbacteria,Bacteria
+JAAYXU010000041.1,JAAYXU01 sp012515725,JAAYXU01,UMGS416,Christensenellales,Clostridia_A,Firmicutes_A,Bacteria
+DHMB01000127.1,CAG-841 sp002405565,CAG-841,CAG-272,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+JACNFQ010000081.1,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+CAJCHR010000269.1,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+QMMC01000579.1,B10-G4 sp003647065,B10-G4,SG8-38,Polyangiales,Polyangia,Myxococcota,Bacteria
diff --git a/example_data/30_genomes.fna.gz b/example_data/30_genomes.fna.gz
new file mode 100644
index 0000000..30087a2
Binary files /dev/null and b/example_data/30_genomes.fna.gz differ
diff --git a/example_data/cucurbita_sample_3.csv b/example_data/cucurbita_sample_3.csv
new file mode 100644
index 0000000..6a700de
--- /dev/null
+++ b/example_data/cucurbita_sample_3.csv
@@ -0,0 +1,4 @@
+id,species,domain
+NW_019663258,Cucurbita,host
+NW_019657536,Cucurbita,host
+NEWN01002765,Cucurbita,host
diff --git a/example_data/cucurbita_sample_3.fna.gz b/example_data/cucurbita_sample_3.fna.gz
new file mode 100644
index 0000000..fe8a36a
Binary files /dev/null and b/example_data/cucurbita_sample_3.fna.gz differ
diff --git a/example_data/metagenome.csv b/example_data/metagenome.csv
new file mode 100644
index 0000000..ce6d1c8
--- /dev/null
+++ b/example_data/metagenome.csv
@@ -0,0 +1,299 @@
+id,species,genus,family,order,class,phylum,domain
+VBOR01000009.1_0_7_1,WS-7 sp005893165,WS-7,SZUA-252,SZUA-252,RBG-16-71-46,Eisenbacteria,Bacteria
+VBOR01000009.1_0_7_2,WS-7 sp005893165,WS-7,SZUA-252,SZUA-252,RBG-16-71-46,Eisenbacteria,Bacteria
+VBOR01000009.1_1_7_1,WS-7 sp005893165,WS-7,SZUA-252,SZUA-252,RBG-16-71-46,Eisenbacteria,Bacteria
+VBOR01000009.1_1_7_2,WS-7 sp005893165,WS-7,SZUA-252,SZUA-252,RBG-16-71-46,Eisenbacteria,Bacteria
+PMOP01000016.1_0_7_1,Palsa-360 sp003161495,Palsa-360,UBA7541,UBA7541,Acidobacteriae,Acidobacteriota,Bacteria
+PMOP01000016.1_0_7_2,Palsa-360 sp003161495,Palsa-360,UBA7541,UBA7541,Acidobacteriae,Acidobacteriota,Bacteria
+PMOP01000016.1_1_7_1,Palsa-360 sp003161495,Palsa-360,UBA7541,UBA7541,Acidobacteriae,Acidobacteriota,Bacteria
+PMOP01000016.1_1_7_2,Palsa-360 sp003161495,Palsa-360,UBA7541,UBA7541,Acidobacteriae,Acidobacteriota,Bacteria
+PMOP01000016.1_2_7_1,Palsa-360 sp003161495,Palsa-360,UBA7541,UBA7541,Acidobacteriae,Acidobacteriota,Bacteria
+PMOP01000016.1_2_7_2,Palsa-360 sp003161495,Palsa-360,UBA7541,UBA7541,Acidobacteriae,Acidobacteriota,Bacteria
+DHUT01000069.1_0_7_1,Sedimentibacter sp002409285,Sedimentibacter,Sedimentibacteraceae,Tissierellales,Clostridia,Firmicutes_A,Bacteria
+DHUT01000069.1_0_7_2,Sedimentibacter sp002409285,Sedimentibacter,Sedimentibacteraceae,Tissierellales,Clostridia,Firmicutes_A,Bacteria
+JAAZAC010000025.1_0_7_1,Actinotalea sp012514545,Actinotalea,Cellulomonadaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JAAZAC010000025.1_0_7_2,Actinotalea sp012514545,Actinotalea,Cellulomonadaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+CAJCBY010000033.1_0_7_1,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_0_7_2,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_1_7_1,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_1_7_2,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_2_7_1,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_2_7_2,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_3_7_1,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_3_7_2,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_4_7_1,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_4_7_2,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_5_7_1,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+CAJCBY010000033.1_5_7_2,Aquirufa sp903960725,Aquirufa,Spirosomaceae,Cytophagales,Bacteroidia,Bacteroidota,Bacteria
+JABXKY010000147.1_0_7_1,RBG-16-57-9 sp013619005,RBG-16-57-9,TCS64,TCS64,Bathyarchaeia,Thermoproteota,Archaea
+JABXKY010000147.1_0_7_2,RBG-16-57-9 sp013619005,RBG-16-57-9,TCS64,TCS64,Bathyarchaeia,Thermoproteota,Archaea
+JABXKY010000147.1_1_7_1,RBG-16-57-9 sp013619005,RBG-16-57-9,TCS64,TCS64,Bathyarchaeia,Thermoproteota,Archaea
+JABXKY010000147.1_1_7_2,RBG-16-57-9 sp013619005,RBG-16-57-9,TCS64,TCS64,Bathyarchaeia,Thermoproteota,Archaea
+JABXKY010000147.1_2_7_1,RBG-16-57-9 sp013619005,RBG-16-57-9,TCS64,TCS64,Bathyarchaeia,Thermoproteota,Archaea
+JABXKY010000147.1_2_7_2,RBG-16-57-9 sp013619005,RBG-16-57-9,TCS64,TCS64,Bathyarchaeia,Thermoproteota,Archaea
+JABBOX010000109.1_0_7_1,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_0_7_2,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_1_7_1,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_1_7_2,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_2_7_1,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_2_7_2,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_3_7_1,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_3_7_2,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_4_7_1,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_4_7_2,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_5_7_1,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_5_7_2,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_6_7_1,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JABBOX010000109.1_6_7_2,UBA4719 sp012927555,UBA4719,Dermatophilaceae,Actinomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JAABRC010000419.1_0_0_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_0_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_1_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_1_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_2_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_2_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_3_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_3_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_4_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_4_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_5_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_5_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_6_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_6_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_7_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_0_7_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_1_7_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_1_7_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_2_7_1,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JAABRC010000419.1_2_7_2,JAABRC01 sp011391115,JAABRC01,Burkholderiaceae,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+CAAFRK010000216.1_0_0_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_0_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_1_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_1_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_2_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_2_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_3_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_3_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_4_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_4_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_5_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_5_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_6_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_6_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_7_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_0_7_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_1_7_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_1_7_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_2_7_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_2_7_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_3_7_1,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+CAAFRK010000216.1_3_7_2,Veillonella sp900765235,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+URSE01000035.1_0_7_1,Veillonella sp900550455,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+URSE01000035.1_0_7_2,Veillonella sp900550455,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+URSE01000035.1_1_7_1,Veillonella sp900550455,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+URSE01000035.1_1_7_2,Veillonella sp900550455,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+URSE01000035.1_2_7_1,Veillonella sp900550455,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+URSE01000035.1_2_7_2,Veillonella sp900550455,Veillonella,Veillonellaceae,Veillonellales,Negativicutes,Firmicutes_C,Bacteria
+UQEY01000009.1_0_7_1,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_0_7_2,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_1_7_1,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_1_7_2,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_2_7_1,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_2_7_2,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_3_7_1,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_3_7_2,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_4_7_1,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_4_7_2,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_5_7_1,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_5_7_2,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_6_7_1,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+UQEY01000009.1_6_7_2,Eubacterium_R sp900540235,Eubacterium_R,Acutalibacteraceae,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+NZ_LRTR01000260.1_0_7_1,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_0_7_2,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_1_7_1,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_1_7_2,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_2_7_1,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_2_7_2,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_3_7_1,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_3_7_2,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_4_7_1,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+NZ_LRTR01000260.1_4_7_2,Streptomyces europaeiscabiei,Streptomyces,Streptomycetaceae,Streptomycetales,Actinomycetia,Actinobacteriota,Bacteria
+JACMKV010000045.1_0_7_1,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_0_7_2,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_1_7_1,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_1_7_2,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_2_7_1,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_2_7_2,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_3_7_1,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_3_7_2,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_4_7_1,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+JACMKV010000045.1_4_7_2,JACMKV01 sp014379915,JACMKV01,JACMKV01,Burkholderiales,Gammaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_0_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_0_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_1_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_1_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_2_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_2_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_3_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_3_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_4_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_4_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_5_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_5_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_6_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_6_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_7_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_0_7_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_1_7_1,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+CACNVV010000042.1_1_7_2,Pelagibacter sp902624015,Pelagibacter,Pelagibacteraceae,Pelagibacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+WLHF01000026.1_0_0_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_0_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_1_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_1_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_2_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_2_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_3_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_3_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_4_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_4_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_5_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_5_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_6_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_6_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_7_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_0_7_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_1_7_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_1_7_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_2_7_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_2_7_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_3_7_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_3_7_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_4_7_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_4_7_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_5_7_1,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+WLHF01000026.1_5_7_2,Planktophila sp009702835,Planktophila,Nanopelagicaceae,Nanopelagicales,Actinomycetia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_0_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_0_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_1_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_1_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_2_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_2_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_3_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_3_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_4_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_4_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_5_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_5_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_6_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_6_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_7_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_0_7_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_1_7_1,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+DKBA01000026.1_1_7_2,UBA6912 sp002450985,UBA6912,UBA5794,UBA5794,Acidimicrobiia,Actinobacteriota,Bacteria
+WBXD01000017.1_0_0_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_0_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_1_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_1_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_2_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_2_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_3_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_3_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_4_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_4_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_5_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_5_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_6_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_6_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_7_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_0_7_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_1_7_1,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+WBXD01000017.1_1_7_2,UBA1315 sp008932935,UBA1315,Akkermansiaceae,Verrucomicrobiales,Verrucomicrobiae,Verrucomicrobiota,Bacteria
+PBSX01000072.1_0_0_1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_0_2,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_1_1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_1_2,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_2_1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_2_2,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_3_1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_3_2,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_4_1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_4_2,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_5_1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_5_2,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_6_1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_6_2,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_7_1,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+PBSX01000072.1_0_7_2,CABZJG01 sp002726375,CABZJG01,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
+JAAYQI010000217.1_0_7_1,JAAYQI01 sp012519385,JAAYQI01,Anaerotignaceae,Lachnospirales,Clostridia,Firmicutes_A,Bacteria
+JAAYQI010000217.1_0_7_2,JAAYQI01 sp012519385,JAAYQI01,Anaerotignaceae,Lachnospirales,Clostridia,Firmicutes_A,Bacteria
+JAAYQI010000217.1_1_7_1,JAAYQI01 sp012519385,JAAYQI01,Anaerotignaceae,Lachnospirales,Clostridia,Firmicutes_A,Bacteria
+JAAYQI010000217.1_1_7_2,JAAYQI01 sp012519385,JAAYQI01,Anaerotignaceae,Lachnospirales,Clostridia,Firmicutes_A,Bacteria
+JAAYQI010000217.1_2_7_1,JAAYQI01 sp012519385,JAAYQI01,Anaerotignaceae,Lachnospirales,Clostridia,Firmicutes_A,Bacteria
+JAAYQI010000217.1_2_7_2,JAAYQI01 sp012519385,JAAYQI01,Anaerotignaceae,Lachnospirales,Clostridia,Firmicutes_A,Bacteria
+NZ_LCZE01000023.1_0_0_1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_0_2,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_1_1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_1_2,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_2_1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_2_2,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_3_1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_3_2,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_4_1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_4_2,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_5_1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_5_2,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_6_1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_6_2,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_7_1,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+NZ_LCZE01000023.1_0_7_2,Pseudomonas_E fluorescens_N,Pseudomonas_E,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_0_7_1,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_0_7_2,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_1_7_1,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_1_7_2,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_2_7_1,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_2_7_2,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_3_7_1,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_3_7_2,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_4_7_1,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_4_7_2,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_5_7_1,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+DNMQ01000225.1_5_7_2,Pseudomonas_A sp003488145,Pseudomonas_A,Pseudomonadaceae,Pseudomonadales,Gammaproteobacteria,Proteobacteria,Bacteria
+CAIXRL010000197.1_0_7_1,CAIXRL01 sp903921835,CAIXRL01,RBG-16-71-46,RBG-16-71-46,RBG-16-71-46,Eisenbacteria,Bacteria
+CAIXRL010000197.1_0_7_2,CAIXRL01 sp903921835,CAIXRL01,RBG-16-71-46,RBG-16-71-46,RBG-16-71-46,Eisenbacteria,Bacteria
+CAIXRL010000197.1_1_7_1,CAIXRL01 sp903921835,CAIXRL01,RBG-16-71-46,RBG-16-71-46,RBG-16-71-46,Eisenbacteria,Bacteria
+CAIXRL010000197.1_1_7_2,CAIXRL01 sp903921835,CAIXRL01,RBG-16-71-46,RBG-16-71-46,RBG-16-71-46,Eisenbacteria,Bacteria
+JAAYXU010000041.1_0_7_1,JAAYXU01 sp012515725,JAAYXU01,UMGS416,Christensenellales,Clostridia_A,Firmicutes_A,Bacteria
+JAAYXU010000041.1_0_7_2,JAAYXU01 sp012515725,JAAYXU01,UMGS416,Christensenellales,Clostridia_A,Firmicutes_A,Bacteria
+DHMB01000127.1_0_7_1,CAG-841 sp002405565,CAG-841,CAG-272,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+DHMB01000127.1_0_7_2,CAG-841 sp002405565,CAG-841,CAG-272,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+DHMB01000127.1_1_7_1,CAG-841 sp002405565,CAG-841,CAG-272,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+DHMB01000127.1_1_7_2,CAG-841 sp002405565,CAG-841,CAG-272,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+DHMB01000127.1_2_7_1,CAG-841 sp002405565,CAG-841,CAG-272,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+DHMB01000127.1_2_7_2,CAG-841 sp002405565,CAG-841,CAG-272,Oscillospirales,Clostridia,Firmicutes_A,Bacteria
+JACNFQ010000081.1_0_7_1,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_0_7_2,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_1_7_1,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_1_7_2,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_2_7_1,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_2_7_2,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_3_7_1,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_3_7_2,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_4_7_1,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_4_7_2,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_5_7_1,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_5_7_2,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_6_7_1,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+JACNFQ010000081.1_6_7_2,NIOZ-UU106 sp014384545,NIOZ-UU106,UBA6624,UBA6624,UBA6624,UBP7,Bacteria
+CAJCHR010000269.1_0_7_1,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_0_7_2,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_1_7_1,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_1_7_2,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_2_7_1,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_2_7_2,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_3_7_1,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_3_7_2,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_4_7_1,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_4_7_2,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_5_7_1,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+CAJCHR010000269.1_5_7_2,Novosphingobium sp903970225,Novosphingobium,Sphingomonadaceae,Sphingomonadales,Alphaproteobacteria,Proteobacteria,Bacteria
+QMMC01000579.1_0_7_1,B10-G4 sp003647065,B10-G4,SG8-38,Polyangiales,Polyangia,Myxococcota,Bacteria
+QMMC01000579.1_0_7_2,B10-G4 sp003647065,B10-G4,SG8-38,Polyangiales,Polyangia,Myxococcota,Bacteria
diff --git a/example_data/metagenome.fna.gz b/example_data/metagenome.fna.gz
new file mode 100644
index 0000000..6dec987
Binary files /dev/null and b/example_data/metagenome.fna.gz differ
diff --git a/setup.cfg b/setup.cfg
index b9acfc7..65ba271 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = Caribou
-version = 1.0.6
+version = 1.1.0
url = https://github.com/bioinfoUQAM/Caribou/wiki
author = Nicolas de Montigny
author_email = de_montigny.nicolas@courrier.uqam.ca
diff --git a/src/Caribou_pipeline.py b/src/Caribou_pipeline.py
index cc1f3ac..299df92 100644
--- a/src/Caribou_pipeline.py
+++ b/src/Caribou_pipeline.py
@@ -90,8 +90,8 @@ def caribou(opt):
verify_boolean(report, 'output in abundance report form')
# Check batch_size
- if multi_classifier in ['cnn','widecnn'] and training_batch_size < 20:
- training_batch_size = 20
+ # if multi_classifier in ['cnn','widecnn'] and training_batch_size < 20:
+ # training_batch_size = 20
# Folders creation for output
outdirs = define_create_outdirs(outdir)
@@ -101,7 +101,7 @@ def caribou(opt):
_system_config = {
'object_spilling_config': json.dumps(
- {'type': 'filesystem', 'params': {'directory_path': str(opt['workdir'])}})
+ {'type': 'filesystem', 'params': {'directory_path': str(workdir)}})
}
)
@@ -204,7 +204,6 @@ def caribou(opt):
metagenome
)
-
# Part 4 - Outputs for biological analysis of bacterial population
################################################################################
diff --git a/src/data/build_data.py b/src/data/build_data.py
index 7b6c32e..022d8ed 100644
--- a/src/data/build_data.py
+++ b/src/data/build_data.py
@@ -28,7 +28,7 @@ def build_load_save_data(file, hostfile, prefix, dataset, host, kmers_list=None,
if os.path.isfile(data_file) and os.path.isfile(data_file_host) and isinstance(hostfile, tuple):
data = load_Xy_data(data_file)
data_host = load_Xy_data(data_file_host)
- elif os.path.isfile(data_file):
+ elif os.path.isfile(data_file) :
data = load_Xy_data(data_file)
else:
# Build Xy_data of database
@@ -53,7 +53,7 @@ def build_load_save_data(file, hostfile, prefix, dataset, host, kmers_list=None,
nb_features_keep = nb_features_keep)
save_Xy_data(data, data_file)
- # Assign kmers_list to variable ater extracting database data
+ # Assign kmers_list to variable after extracting database data
if kmers_list is None:
kmers_list = data['kmers']
diff --git a/src/models/classification.py b/src/models/classification.py
index ec7593e..5cf1bd1 100644
--- a/src/models/classification.py
+++ b/src/models/classification.py
@@ -137,12 +137,11 @@ def _train_model(self, taxa):
self._binary_training(taxa)
else:
self._multiclass_training(taxa)
- if isinstance(self.models[taxa], KerasTFModel):
- for file in glob(os.path.join(self._outdirs['data_dir'], '*sim*')):
- if os.path.isdir(file):
- rmtree(file)
- else:
- os.remove(file)
+ for file in glob(os.path.join(self._outdirs['data_dir'], '*sim*')):
+ if os.path.isdir(file):
+ rmtree(file)
+ else:
+ os.remove(file)
def _binary_training(self, taxa):
print('_binary_training')
@@ -516,7 +515,6 @@ def _sim_4_cv(self, df, kmers_ds, name):
cv_sim = readsSimulation(kmers_ds['fasta'], cls, sim_cls_dct['id'], 'miseq', sim_outdir, name)
sim_data = cv_sim.simulation(self._k, self._database_data['kmers'])
sim_ids = sim_data['ids']
- sim_ids = sim_data['ids']
sim_cls = pd.DataFrame({'sim_id':sim_ids}, dtype = object)
sim_cls['id'] = sim_cls['sim_id'].str.replace('_[0-9]+_[0-9]+_[0-9]+', '', regex=True)
sim_cls = sim_cls.set_index('id').join(cls.set_index('id'))
diff --git a/src/models/kerasTF/ray_keras_tf.py b/src/models/kerasTF/ray_keras_tf.py
index ec8617c..4924f2e 100644
--- a/src/models/kerasTF/ray_keras_tf.py
+++ b/src/models/kerasTF/ray_keras_tf.py
@@ -191,7 +191,6 @@ def _sim_4_val(self, df, kmers_ds, name):
cv_sim = readsSimulation(kmers_ds['fasta'], cls, sim_genomes, 'miseq', sim_outdir, name)
sim_data = cv_sim.simulation(self.k, self.kmers)
sim_ids = sim_data['ids']
- sim_ids = sim_data['ids']
sim_cls = pd.DataFrame({'sim_id':sim_ids}, dtype = object)
sim_cls['id'] = sim_cls['sim_id'].str.replace('_[0-9]+_[0-9]+_[0-9]+', '', regex=True)
sim_cls = sim_cls.set_index('id').join(cls.set_index('id'))
@@ -247,13 +246,6 @@ def _fit_model(self, datasets):
'model': self.classifier
}
- print(f'num_workers : {self._n_workers}')
- print(f'nb_CPU_data : {self._nb_CPU_data}')
- print(f'nb_CPU_training : {self._nb_CPU_training}')
- print(f'nb_GPU : {self._nb_GPU}')
- print(f'nb_CPU_per_worker : {self._nb_CPU_per_worker}')
- print(f'nb_GPU_per_worker : {self._nb_GPU_per_worker}')
-
# Define trainer / tuner
self._trainer = TensorflowTrainer(
train_loop_per_worker = train_func,
@@ -313,13 +305,9 @@ def predict(self, df, threshold=0.8, cv=False):
len(self.kmers)
)
- print('predictions after batch_prediction :', predictions.to_pandas())
-
# Convert predictions to labels
predictions = self._prob_2_cls(predictions, threshold)
- print('predictions after probs_2_cls :', predictions)
-
return self._label_decode(predictions)
else:
raise ValueError('No data to predict')
@@ -440,6 +428,7 @@ def build_model(classifier, nb_cls, nb_kmers):
model = build_wideCNN(nb_kmers, nb_cls)
return model
+"""
def batch_predict_val(checkpoint, batch, clf, batch_size, nb_classes, nb_kmers):
def convert_logits_to_classes(df):
best_class = df["predictions"].map(lambda x: np.array(x).argmax())
@@ -470,6 +459,7 @@ def calculate_prediction_scores(df):
)
return correct_dataset
+"""
def batch_prediction(checkpoint, batch, clf, batch_size, nb_classes, nb_kmers):
predictor = BatchPredictor.from_checkpoint(
diff --git a/workflow_example.ipynb b/workflow_example.ipynb
new file mode 100644
index 0000000..69f1606
--- /dev/null
+++ b/workflow_example.ipynb
@@ -0,0 +1,1672 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "vscode": {
+ "languageId": "shellscript"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "rm -r output/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import gc\n",
+ "import ray\n",
+ "import json\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "\n",
+ "from pathlib import Path\n",
+ "from src.utils import *\n",
+ "\n",
+ "# K-mers\n",
+ "from src.data.seq_collections import SeqCollection\n",
+ "from src.data.kmers_collection import KmersCollection\n",
+ "\n",
+ "# Preprocessing\n",
+ "from src.models.kerasTF.ray_one_hot_tensor import OneHotTensorEncoder\n",
+ "from ray.data.preprocessors import Chain, LabelEncoder\n",
+ "from src.models.ray_tensor_min_max import TensorMinMaxScaler\n",
+ "from src.models.sklearn.ray_sklearn_onesvm_encoder import OneClassSVMLabelEncoder\n",
+ "\n",
+ "# Training\n",
+ "import tensorflow as tf\n",
+ "from ray.air import session\n",
+ "from sklearn.linear_model import SGDOneClassSVM\n",
+ "from ray.air.integrations.keras import Callback\n",
+ "from src.models.kerasTF.build_neural_networks import *\n",
+ "from src.models.sklearn.ray_sklearn_partial_trainer import SklearnPartialTrainer\n",
+ "from ray.train.tensorflow import TensorflowTrainer, TensorflowCheckpoint, prepare_dataset_shard\n",
+ "\n",
+ "# Tuning\n",
+ "from ray.air.config import RunConfig, ScalingConfig, DatasetConfig\n",
+ "\n",
+ "# Predicting\n",
+ "from ray.train.sklearn import SklearnPredictor\n",
+ "from ray.train.tensorflow import TensorflowPredictor\n",
+ "from ray.train.batch_predictor import BatchPredictor\n",
+ "from joblib import Parallel, delayed, parallel_backend\n",
+ "\n",
+ "# Simulation\n",
+ "from src.models.reads_simulation import readsSimulation\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Parameters / global variables"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Definition"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Names\n",
+ "db_name = '30_genomes'\n",
+ "host_name = 'cucurbita_3'\n",
+ "metagenome_name = 'metagenome'\n",
+ "# IO\n",
+ "db_fasta = 'example_data/30_genomes.fna.gz'\n",
+ "db_cls = 'example_data/30_genomes.csv'\n",
+ "host_fasta = 'example_data/cucurbita_sample_3.fna.gz'\n",
+ "host_cls = 'example_data/cucurbita_sample_3.csv'\n",
+ "metagenome_fasta = 'example_data/metagenome.fna.gz'\n",
+ "metagenome_cls = 'example_data/metagenome.csv'\n",
+ "outdir = 'output/'\n",
+ "workdir = '/tmp/spill'\n",
+ "# Settings\n",
+ "klen = 5\n",
+ "batch_size = 1\n",
+ "epochs = 10\n",
+ "classif_threshold = 0.8\n",
+ "features_threshold = np.inf\n",
+ "nb_features = np.inf"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Verification"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# IO\n",
+ "verify_seqfiles(db_fasta,host_fasta)\n",
+ "verify_fasta(metagenome_fasta)\n",
+ "verify_file(db_cls)\n",
+ "verify_file(host_cls)\n",
+ "verify_file(metagenome_cls)\n",
+ "# Settings\n",
+ "verify_positive_int(klen, 'kmers length')\n",
+ "verify_positive_int(batch_size, 'training batch size')\n",
+ "verify_positive_int(epochs, 'number of iterations in neural networks training')\n",
+ "verify_0_1(classif_threshold, 'classification threshold')\n",
+ "# Folders creation for output\n",
+ "if not os.path.isdir(outdir):\n",
+ " os.makedirs(outdir)\n",
+ "outdirs = define_create_outdirs(outdir)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Cluster initialization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-03-13 15:50:18,083\tINFO worker.py:1529 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ "
Ray
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " Python version: | \n",
+ " 3.8.10 | \n",
+ "
\n",
+ " \n",
+ " Ray version: | \n",
+ " 2.2.0 | \n",
+ "
\n",
+ " \n",
+ " Dashboard: | \n",
+ " http://127.0.0.1:8265 | \n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.8.10', ray_version='2.2.0', ray_commit='b6af0887ee5f2e460202133791ad941a41f15beb', address_info={'node_ip_address': '192.168.65.207', 'raylet_ip_address': '192.168.65.207', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2023-03-13_15-50-16_029330_21098/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2023-03-13_15-50-16_029330_21098/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2023-03-13_15-50-16_029330_21098', 'metrics_export_port': 63058, 'gcs_address': '192.168.65.207:65125', 'address': '192.168.65.207:65125', 'dashboard_agent_listen_port': 52365, 'node_id': 'd571557535c8ecc41d103c8ee18b50f1f1cdf8deedd4632d2bd18525'})"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ray.init(\n",
+ " _system_config={\n",
+ " 'object_spilling_config': json.dumps(\n",
+ " {'type': 'filesystem', 'params': {'directory_path': str(workdir)}})\n",
+ " }\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# K-mers"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Local variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "db_kmers_data = {}\n",
+ "db_seqdata = None\n",
+ "db_kmers_collection = None\n",
+ "host_kmers_data = {}\n",
+ "host_seqdata = None\n",
+ "host_kmers_collection = None\n",
+ "metagenome_kmers_data = {}\n",
+ "metagenome_seqdata = None\n",
+ "metagenome_kmers_collection = None"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Filenames"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Filenames\n",
+ "db_profile_file = os.path.join(outdirs['data_dir'], f'profile_genome_{db_name}_data_K{klen}')\n",
+ "host_profile_file = os.path.join(outdirs['data_dir'], f'profile_genome_{host_name}_data_K{klen}')\n",
+ "metagenome_profile_file = os.path.join(outdirs['data_dir'], f'profile_genome_{metagenome_name}_data_K{klen}')"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Seqdata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "db_seqdata = SeqCollection((db_fasta, db_cls))\n",
+ "host_seqdata = SeqCollection((host_fasta, host_cls))\n",
+ "metagenome_seqdata = SeqCollection(metagenome_fasta)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### K-mers collections"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "seen_kmers\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.\n",
+ "[Parallel(n_jobs=-1)]: Done 30 out of 30 | elapsed: 1.7s finished\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "_batch_read_write_seen\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Map_Batches: 100%|██████████| 29/29 [00:00<00:00, 102.58it/s]\n",
+ "Repartition: 100%|██████████| 29/29 [00:00<00:00, 746.53it/s]\n",
+ "Repartition: 100%|██████████| 29/29 [00:00<00:00, 768.93it/s]\n",
+ "Repartition: 100%|██████████| 29/29 [00:00<00:00, 686.86it/s]\n",
+ "Write Progress: 100%|██████████| 29/29 [00:00<00:00, 348.20it/s]\n",
+ "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "given_kmers\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.4s finished\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "_batch_read_write_given\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Repartition: 100%|██████████| 3/3 [00:00<00:00, 351.92it/s]\n",
+ "Repartition: 100%|██████████| 3/3 [00:00<00:00, 462.91it/s]\n",
+ "Repartition: 100%|██████████| 3/3 [00:00<00:00, 410.64it/s]\n",
+ "Write Progress: 100%|██████████| 3/3 [00:00<00:00, 162.88it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "given_kmers\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.\n",
+ "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 2.5s\n",
+ "[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 10.6s\n",
+ "[Parallel(n_jobs=-1)]: Done 298 out of 298 | elapsed: 17.6s finished\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "_batch_read_write_given\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Repartition: 100%|██████████| 298/298 [00:00<00:00, 1246.26it/s]\n",
+ "Repartition: 100%|██████████| 298/298 [00:00<00:00, 1496.60it/s]\n",
+ "Repartition: 100%|██████████| 298/298 [00:00<00:00, 1387.83it/s]\n",
+ "Write Progress: 100%|██████████| 298/298 [00:01<00:00, 246.96it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "db_kmers_collection = KmersCollection(\n",
+ " db_seqdata,\n",
+ " db_profile_file,\n",
+ " klen,\n",
+ " db_name,\n",
+ " None,\n",
+ " features_threshold,\n",
+ " nb_features\n",
+ ")\n",
+ "kmers_list = db_kmers_collection.kmers_list\n",
+ "host_kmers_collection = KmersCollection(\n",
+ " host_seqdata,\n",
+ " host_profile_file,\n",
+ " klen,\n",
+ " host_name,\n",
+ " kmers_list,\n",
+ " np.inf,\n",
+ " np.inf\n",
+ ")\n",
+ "metagenome_kmers_collection = KmersCollection(\n",
+ " metagenome_seqdata,\n",
+ " metagenome_profile_file,\n",
+ " klen,\n",
+ " metagenome_name,\n",
+ " kmers_list\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### K-mers data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "db_kmers_data['profile'] = db_kmers_collection.Xy_file\n",
+ "db_kmers_data['ids'] = db_kmers_collection.ids\n",
+ "db_kmers_data['classes'] = db_kmers_collection.classes\n",
+ "db_kmers_data['kmers'] = kmers_list\n",
+ "db_kmers_data['taxas'] = db_kmers_collection.taxas\n",
+ "db_kmers_data['fasta'] = db_fasta\n",
+ "#\n",
+ "host_kmers_data['profile'] = host_kmers_collection.Xy_file\n",
+ "host_kmers_data['ids'] = host_kmers_collection.ids\n",
+ "host_kmers_data['classes'] = host_kmers_collection.classes\n",
+ "host_kmers_data['kmers'] = kmers_list\n",
+ "host_kmers_data['taxas'] = host_kmers_collection.taxas\n",
+ "host_kmers_data['fasta'] = host_fasta\n",
+ "#\n",
+ "metagenome_kmers_data['profile'] = metagenome_kmers_collection.Xy_file\n",
+ "metagenome_kmers_data['ids'] = metagenome_kmers_collection.ids\n",
+ "metagenome_kmers_data['kmers'] = kmers_list"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Bacteria isolation - OneClassSVM"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### DB + Host merging"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Read progress: 100%|██████████| 19/19 [00:00<00:00, 2026.08it/s]\n",
+ "Write Progress: 100%|██████████| 19/19 [00:00<00:00, 39.44it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Describe the merged (database + host) k-mers dataset and write its profile to disk.\n",
+ "merged_kmers_data = {}\n",
+ "\n",
+ "merged_kmers_data['profile'] = \"{}_host_merged\".format(os.path.splitext(db_kmers_data[\"profile\"])[0]) # Kmers profile\n",
+ "\n",
+ "merged_cls = pd.DataFrame(db_kmers_data[\"classes\"], columns=db_kmers_data[\"taxas\"])\n",
+ "df_cls_host = pd.DataFrame(host_kmers_data[\"classes\"], columns=host_kmers_data[\"taxas\"])\n",
+ "\n",
+ "# Normalise only the 'domain' column: masking the whole frame would also overwrite\n",
+ "# the lower taxonomic ranks (genus, species, ...) with 'bacteria'.\n",
+ "if len(np.unique(merged_cls['domain'])) != 1:\n",
+ "    merged_cls.loc[merged_cls['domain'] != 'bacteria', 'domain'] = 'bacteria'\n",
+ "\n",
+ "# Make the host class table the same length as the host ids.\n",
+ "nb_host_ids = len(host_kmers_data['ids'])\n",
+ "if len(df_cls_host) > nb_host_ids:\n",
+ "    # Too many class rows: drop the surplus rows from the top of the table.\n",
+ "    to_remove = np.arange(len(df_cls_host) - nb_host_ids)\n",
+ "    df_cls_host.drop(to_remove, axis=0, inplace=True)\n",
+ "elif len(df_cls_host) < nb_host_ids:\n",
+ "    # Too few class rows: pad with copies of the first row, in a single concat.\n",
+ "    diff = nb_host_ids - len(df_cls_host)\n",
+ "    row = df_cls_host.iloc[0]\n",
+ "    df_cls_host = pd.concat([df_cls_host] + [row.to_frame().T] * diff, ignore_index=True)\n",
+ "\n",
+ "merged_cls = pd.concat([merged_cls, df_cls_host], ignore_index=True)\n",
+ "merged_kmers_data['classes'] = np.array(merged_cls) # Class labels\n",
+ "merged_kmers_data['ids'] = np.concatenate((db_kmers_data[\"ids\"], host_kmers_data[\"ids\"])) # IDs\n",
+ "merged_kmers_data['kmers'] = db_kmers_data[\"kmers\"] # Features\n",
+ "merged_kmers_data['taxas'] = db_kmers_data[\"taxas\"] # Known taxas for classification\n",
+ "merged_kmers_data['fasta'] = (db_kmers_data['fasta'], host_kmers_data['fasta']) # Fasta file needed for reads simulation\n",
+ "\n",
+ "merged_df = db_df.union(host_df)\n",
+ "merged_df.write_parquet(merged_kmers_data['profile'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Read: 100%|██████████| 19/19 [00:00<00:00, 245.11it/s]\n",
+ "Repartition: 100%|██████████| 32/32 [00:00<00:00, 959.07it/s]\n",
+ "Repartition: 100%|██████████| 32/32 [00:00<00:00, 901.95it/s]\n",
+ "Repartition: 100%|██████████| 19/19 [00:00<00:00, 478.89it/s]\n",
+ "Repartition: 100%|██████████| 19/19 [00:00<00:00, 555.11it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "merged_df = zip_X_y(merged_df, merged_cls)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "encoded = []\n",
+ "labels = []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preprocessor = Chain(\n",
+ " TensorMinMaxScaler(db_kmers_data['kmers']),\n",
+ " OneClassSVMLabelEncoder('domain')\n",
+ ")\n",
+ "encoded = np.array([1,-1], dtype = np.int32)\n",
+ "labels = np.array(['bacteria','unknown'],dtype = object)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Read: 100%|██████████| 16/16 [00:00<00:00, 234.67it/s]\n",
+ "Repartition: 100%|██████████| 29/29 [00:00<00:00, 894.83it/s]\n",
+ "Repartition: 100%|██████████| 29/29 [00:00<00:00, 754.86it/s]\n",
+ "Repartition: 100%|██████████| 16/16 [00:00<00:00, 399.82it/s]\n",
+ "Repartition: 100%|██████████| 16/16 [00:00<00:00, 451.36it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "db_cls = pd.read_csv(db_cls)\n",
+ "if db_df.count() != len(db_cls):\n",
+ " db_ids = []\n",
+ " for row in db_df.iter_rows():\n",
+ " db_ids.append(row['id'])\n",
+ " db_cls = db_cls[db_cls['id'].isin(db_ids)]\n",
+ "for col in db_cls.columns:\n",
+ " db_cls[col] = db_cls[col].str.lower()\n",
+ "db_df = zip_X_y(db_df, db_cls)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 61.55it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 77.11it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 113.17it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "preprocessor.fit(db_df)\n",
+ "db_df = preprocessor.transform(db_df)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "clf = SGDOneClassSVM()\n",
+ "train_params = {\n",
+ " 'nu': 0.1,\n",
+ " 'learning_rate': 'invscaling',\n",
+ " 'eta0': 1000,\n",
+ " 'tol': 1e-4\n",
+ "}\n",
+ "trainer = SklearnPartialTrainer(\n",
+ " estimator = clf,\n",
+ " label_column = 'domain',\n",
+ " labels_list = np.array([0,1], dtype = np.int32),\n",
+ " features_list = db_kmers_data['kmers'],\n",
+ " params = train_params,\n",
+ " datasets = {'train' : ray.put(db_df)},\n",
+ " batch_size = batch_size,\n",
+ " set_estimator_cpus = True,\n",
+ " scaling_config = ScalingConfig(\n",
+ " trainer_resources={\n",
+ " 'CPU': int(os.cpu_count()*0.8)\n",
+ " }\n",
+ " ),\n",
+ " run_config = RunConfig(\n",
+ " name = 'OneClassSVM',\n",
+ " local_dir = workdir\n",
+ " ),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
Tune Status
\n",
+ "
\n",
+ "\n",
+ "Current time: | 2023-03-13 15:50:53 |
\n",
+ "Running for: | 00:00:02.02 |
\n",
+ "Memory: | 6.4/12.4 GiB |
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
System Info
\n",
+ " Using FIFO scheduling algorithm.
Resources requested: 6.0/8 CPUs, 0/0 GPUs, 0.0/4.19 GiB heap, 0.0/2.1 GiB objects\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
Trial Status
\n",
+ "
\n",
+ "\n",
+ "Trial name | status | loc |
\n",
+ "\n",
+ "\n",
+ "SklearnPartialTrainer_5b9c2_00000 | RUNNING | 192.168.65.207:28446 |
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-03-13 15:50:55,186\tINFO tune.py:762 -- Total run time: 3.97 seconds (3.81 seconds for the tuning loop).\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_result = trainer.fit()\n",
+ "model_ckpt = training_result.checkpoint"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Predicting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Read->Map_Batches: 100%|██████████| 16/16 [00:01<00:00, 9.14it/s]\n",
+ "Map Progress (1 actors 1 pending): 100%|██████████| 16/16 [00:02<00:00, 6.15it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Keep metagenome_df unscaled: the WideCNN prediction cell applies its own fitted\n",
+ "# scaler to it, so overwriting it here would scale the data twice.\n",
+ "metagenome_onesvm_df = preprocessor.preprocessors[0].transform(metagenome_df)\n",
+ "predictor = BatchPredictor.from_checkpoint(model_ckpt, SklearnPredictor)\n",
+ "predictions = predictor.predict(metagenome_onesvm_df, batch_size = batch_size)\n",
+ "predictions = np.array(predictions.to_pandas()).reshape(-1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictions_onesvm = pd.Series(np.empty(len(predictions), dtype=object))\n",
+ "predictions_onesvm[predictions == 1] = 'bacteria'\n",
+ "predictions_onesvm[predictions == -1] = 'unknown'\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Bacteria classification - WideCNN"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Metadata Fetch Progress: 0%| | 0/4 [00:00, ?it/s]\n",
+ "Metadata Fetch Progress: 100%|██████████| 4/4 [00:00<00:00, 112.22it/s]id=28541)\u001b[0m \n",
+ "Parquet Files Sample: 100%|██████████| 2/2 [00:00<00:00, 204.86it/s]\n",
+ "Read: 100%|██████████| 16/16 [00:01<00:00, 12.87it/s]\n",
+ "Repartition: 100%|██████████| 29/29 [00:00<00:00, 55.99it/s]\n",
+ "Repartition: 100%|██████████| 29/29 [00:00<00:00, 587.80it/s]\n",
+ "Repartition: 100%|██████████| 16/16 [00:00<00:00, 310.24it/s]\n",
+ "Repartition: 100%|██████████| 16/16 [00:00<00:00, 349.11it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "db_df = ray.data.read_parquet(db_kmers_data['profile'])\n",
+ "if db_df.count() != len(db_cls):\n",
+ " db_ids = []\n",
+ " for row in db_df.iter_rows():\n",
+ " db_ids.append(row['id'])\n",
+ " db_cls = db_cls[db_cls['id'].isin(db_ids)]\n",
+ "for col in db_cls.columns:\n",
+ " db_cls[col] = db_cls[col].str.lower()\n",
+ "db_df = zip_X_y(db_df, db_cls)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "labels = []\n",
+ "nb_cls = 0\n",
+ "for row in db_df.iter_rows():\n",
+ " labels.append(row['genus'])\n",
+ "nb_cls = len(np.unique(labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 64.71it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 162.68it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 140.37it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 182.61it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Chain(TensorMinMaxScaler(columns=['AAAAA', 'AAAAC', 'AAAAG', 'AAAAT', 'AAACA', 'AAACC', 'AAACG', 'AAACT', 'AAAGA', 'AAAGC', 'AAAGG', 'AAAGT', 'AAATA', 'AAATC', 'AAATG', 'AAATT', 'AACAA', 'AACAC', 'AACAG', 'AACAT', 'AACCA', 'AACCC', 'AACCG', 'AACCT', 'AACGA', 'AACGC', 'AACGG', 'AACGT', 'AACTA', 'AACTC', 'AACTG', 'AACTT', 'AAGAA', 'AAGAC', 'AAGAG', 'AAGAT', 'AAGCA', 'AAGCC', 'AAGCG', 'AAGCT', 'AAGGA', 'AAGGC', 'AAGGG', 'AAGGT', 'AAGTA', 'AAGTC', 'AAGTG', 'AATAA', 'AATAC', 'AATAG', 'AATAT', 'AATCA', 'AATCC', 'AATCG', 'AATCT', 'AATGA', 'AATGC', 'AATGG', 'AATGT', 'AATTA', 'AATTC', 'AATTG', 'ACAAA', 'ACAAC', 'ACAAG', 'ACAAT', 'ACACA', 'ACACC', 'ACACG', 'ACACT', 'ACAGA', 'ACAGC', 'ACAGG', 'ACAGT', 'ACATA', 'ACATC', 'ACATG', 'ACCAA', 'ACCAC', 'ACCAG', 'ACCAT', 'ACCCA', 'ACCCC', 'ACCCG', 'ACCCT', 'ACCGA', 'ACCGC', 'ACCGG', 'ACCGT', 'ACCTA', 'ACCTC', 'ACCTG', 'ACGAA', 'ACGAC', 'ACGAG', 'ACGAT', 'ACGCA', 'ACGCC', 'ACGCG', 'ACGCT', 'ACGGA', 'ACGGC', 'ACGGG', 'ACGTA', 'ACGTC', 'ACGTG', 'ACTAA', 'ACTAC', 'ACTAG', 'ACTAT', 'ACTCA', 'ACTCC', 'ACTCG', 'ACTCT', 'ACTGA', 'ACTGC', 'ACTGG', 'ACTTA', 'ACTTC', 'ACTTG', 'AGAAA', 'AGAAC', 'AGAAG', 'AGAAT', 'AGACA', 'AGACC', 'AGACG', 'AGACT', 'AGAGA', 'AGAGC', 'AGAGG', 'AGATA', 'AGATC', 'AGATG', 'AGCAA', 'AGCAC', 'AGCAG', 'AGCAT', 'AGCCA', 'AGCCC', 'AGCCG', 'AGCCT', 'AGCGA', 'AGCGC', 'AGCGG', 'AGCTA', 'AGCTC', 'AGCTG', 'AGGAA', 'AGGAC', 'AGGAG', 'AGGAT', 'AGGCA', 'AGGCC', 'AGGCG', 'AGGGA', 'AGGGC', 'AGGGG', 'AGGTA', 'AGGTC', 'AGGTG', 'AGTAA', 'AGTAC', 'AGTAG', 'AGTAT', 'AGTCA', 'AGTCC', 'AGTCG', 'AGTGA', 'AGTGC', 'AGTGG', 'AGTTA', 'AGTTC', 'AGTTG', 'ATAAA', 'ATAAC', 'ATAAG', 'ATAAT', 'ATACA', 'ATACC', 'ATACG', 'ATAGA', 'ATAGC', 'ATAGG', 'ATATA', 'ATATC', 'ATATG', 'ATCAA', 'ATCAC', 'ATCAG', 'ATCAT', 'ATCCA', 'ATCCC', 'ATCCG', 'ATCGA', 'ATCGC', 'ATCGG', 'ATCTA', 'ATCTC', 'ATCTG', 'ATGAA', 'ATGAC', 'ATGAG', 'ATGCA', 'ATGCC', 'ATGCG', 'ATGGA', 'ATGGC', 'ATGGG', 'ATGTA', 'ATGTC', 'ATGTG', 'ATTAA', 'ATTAC', 'ATTAG', 'ATTCA', 'ATTCC', 'ATTCG', 
'ATTGA', 'ATTGC', 'ATTGG', 'ATTTA', 'ATTTC', 'ATTTG', 'CAAAA', 'CAAAC', 'CAAAG', 'CAACA', 'CAACC', 'CAACG', 'CAAGA', 'CAAGC', 'CAAGG', 'CAATA', 'CAATC', 'CAATG', 'CACAA', 'CACAC', 'CACAG', 'CACCA', 'CACCC', 'CACCG', 'CACGA', 'CACGC', 'CACGG', 'CACTA', 'CACTC', 'CACTG', 'CAGAA', 'CAGAC', 'CAGAG', 'CAGCA', 'CAGCC', 'CAGCG', 'CAGGA', 'CAGGC', 'CAGGG', 'CAGTA', 'CAGTC', 'CATAA', 'CATAC', 'CATAG', 'CATCA', 'CATCC', 'CATCG', 'CATGA', 'CATGC', 'CATGG', 'CATTA', 'CATTC', 'CCAAA', 'CCAAC', 'CCAAG', 'CCACA', 'CCACC', 'CCACG', 'CCAGA', 'CCAGC', 'CCAGG', 'CCATA', 'CCATC', 'CCCAA', 'CCCAC', 'CCCAG', 'CCCCA', 'CCCCC', 'CCCCG', 'CCCGA', 'CCCGC', 'CCCGG', 'CCCTA', 'CCCTC', 'CCGAA', 'CCGAC', 'CCGAG', 'CCGCA', 'CCGCC', 'CCGCG', 'CCGGA', 'CCGGC', 'CCGTA', 'CCGTC', 'CCTAA', 'CCTAC', 'CCTAG', 'CCTCA', 'CCTCC', 'CCTCG', 'CCTGA', 'CCTGC', 'CCTTA', 'CCTTC', 'CGAAA', 'CGAAC', 'CGAAG', 'CGACA', 'CGACC', 'CGACG', 'CGAGA', 'CGAGC', 'CGATA', 'CGATC', 'CGCAA', 'CGCAC', 'CGCAG', 'CGCCA', 'CGCCC', 'CGCCG', 'CGCGA', 'CGCGC', 'CGCTA', 'CGCTC', 'CGGAA', 'CGGAC', 'CGGAG', 'CGGCA', 'CGGCC', 'CGGGA', 'CGGGC', 'CGGTA', 'CGGTC', 'CGTAA', 'CGTAC', 'CGTAG', 'CGTCA', 'CGTCC', 'CGTGA', 'CGTGC', 'CGTTA', 'CGTTC', 'CTAAA', 'CTAAC', 'CTAAG', 'CTACA', 'CTACC', 'CTAGA', 'CTAGC', 'CTATA', 'CTATC', 'CTCAA', 'CTCAC', 'CTCAG', 'CTCCA', 'CTCCC', 'CTCGA', 'CTCGC', 'CTCTA', 'CTCTC', 'CTGAA', 'CTGAC', 'CTGCA', 'CTGCC', 'CTGGA', 'CTGGC', 'CTGTA', 'CTGTC', 'CTTAA', 'CTTAC', 'CTTCA', 'CTTCC', 'CTTGA', 'CTTGC', 'CTTTA', 'CTTTC', 'GAAAA', 'GAAAC', 'GAACA', 'GAACC', 'GAAGA', 'GAAGC', 'GAATA', 'GAATC', 'GACAA', 'GACAC', 'GACCA', 'GACCC', 'GACGA', 'GACGC', 'GACTA', 'GACTC', 'GAGAA', 'GAGAC', 'GAGCA', 'GAGCC', 'GAGGA', 'GAGGC', 'GAGTA', 'GATAA', 'GATAC', 'GATCA', 'GATCC', 'GATGA', 'GATGC', 'GATTA', 'GCAAA', 'GCAAC', 'GCACA', 'GCACC', 'GCAGA', 'GCAGC', 'GCATA', 'GCCAA', 'GCCAC', 'GCCCA', 'GCCCC', 'GCCGA', 'GCCGC', 'GCCTA', 'GCGAA', 'GCGAC', 'GCGCA', 'GCGCC', 'GCGGA', 'GCGTA', 'GCTAA', 'GCTAC', 'GCTCA', 'GCTCC', 'GCTGA', 'GCTTA', 
'GGAAA', 'GGAAC', 'GGACA', 'GGACC', 'GGAGA', 'GGATA', 'GGCAA', 'GGCAC', 'GGCCA', 'GGCCC', 'GGCGA', 'GGCTA', 'GGGAA', 'GGGAC', 'GGGCA', 'GGGGA', 'GGGTA', 'GGTAA', 'GGTAC', 'GGTCA', 'GGTGA', 'GGTTA', 'GTAAA', 'GTAAC', 'GTACA', 'GTAGA', 'GTATA', 'GTCAA', 'GTCAC', 'GTCCA', 'GTCGA', 'GTCTA', 'GTGAA', 'GTGCA', 'GTGGA', 'GTGTA', 'GTTAA', 'GTTCA', 'GTTGA', 'GTTTA', 'TAAAA', 'TAACA', 'TAAGA', 'TAATA', 'TACAA', 'TACCA', 'TACGA', 'TACTA', 'TAGAA', 'TAGCA', 'TAGGA', 'TATAA', 'TATCA', 'TATGA', 'TCAAA', 'TCACA', 'TCAGA', 'TCCAA', 'TCCCA', 'TCCGA', 'TCGAA', 'TCGCA', 'TCTAA', 'TCTCA', 'TGAAA', 'TGACA', 'TGCAA', 'TGCCA', 'TGGAA', 'TGTAA', 'TTAAA', 'TTCAA']), LabelEncoder(label_column='genus'), OneHotTensorEncoder(columns='genus')"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessor = Chain(\n",
+ " TensorMinMaxScaler(db_kmers_data['kmers']),\n",
+ " LabelEncoder('genus'),\n",
+ " OneHotTensorEncoder('genus')\n",
+ ")\n",
+ "preprocessor.fit(db_df)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Validation dataset simulation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 102.95it/s]\n",
+ "INFO:iss.app:Starting iss generate\n",
+ "INFO:iss.app:Using kde ErrorModel\n",
+ "INFO:iss.util:Stitching input files together\n",
+ "INFO:iss.app:Using halfnormal abundance distribution\n",
+ "INFO:iss.app:Using 8 cpus for read generation\n",
+ "INFO:iss.app:Generating 26 reads\n",
+ "INFO:iss.app:Generating reads for record: JAAZAC010000025.1\n",
+ "INFO:iss.app:Generating reads for record: JABXKY010000147.1\n",
+ "INFO:iss.app:Generating reads for record: JAABRC010000419.1\n",
+ "INFO:iss.app:Generating reads for record: UQEY01000009.1\n",
+ "INFO:iss.app:Generating reads for record: CACNVV010000042.1\n",
+ "INFO:iss.app:Generating reads for record: WLHF01000026.1\n",
+ "INFO:iss.app:Generating reads for record: DKBA01000026.1\n",
+ "INFO:iss.app:Generating reads for record: WBXD01000017.1\n",
+ "INFO:iss.app:Generating reads for record: PBSX01000072.1\n",
+ "INFO:iss.app:Generating reads for record: CAIXRL010000197.1\n",
+ "INFO:iss.app:Generating reads for record: DHMB01000127.1\n",
+ "INFO:iss.app:Generating reads for record: JACNFQ010000081.1\n",
+ "INFO:iss.app:Generating reads for record: QMMC01000579.1\n",
+ "INFO:iss.util:Stitching input files together\n",
+ "INFO:iss.util:Stitching input files together\n",
+ "INFO:iss.util:Cleaning up\n",
+ "INFO:iss.app:Read generation complete\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "given_kmers\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.\n",
+ "[Parallel(n_jobs=-1)]: Done 28 out of 28 | elapsed: 1.8s finished\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "_batch_read_write_given\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Repartition: 100%|██████████| 28/28 [00:00<00:00, 59.49it/s]\n",
+ "Repartition: 100%|██████████| 28/28 [00:00<00:00, 1309.14it/s]\n",
+ "Repartition: 100%|██████████| 28/28 [00:00<00:00, 787.18it/s]\n",
+ "Write Progress: 100%|██████████| 28/28 [00:00<00:00, 311.78it/s]\n",
+ "Metadata Fetch Progress: 0%| | 0/4 [00:00, ?it/s]\n",
+ "Metadata Fetch Progress: 100%|██████████| 4/4 [00:00<00:00, 49.17it/s]\n",
+ "Parquet Files Sample: 100%|██████████| 2/2 [00:00<00:00, 137.49it/s]\n",
+ "Read: 100%|██████████| 16/16 [00:00<00:00, 99.24it/s]\n",
+ "Repartition: 100%|██████████| 28/28 [00:00<00:00, 539.30it/s]\n",
+ "Repartition: 100%|██████████| 28/28 [00:00<00:00, 784.09it/s]\n",
+ "Repartition: 100%|██████████| 16/16 [00:00<00:00, 448.23it/s]\n",
+ "Repartition: 100%|██████████| 16/16 [00:00<00:00, 588.52it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "sim_genomes = []\n",
+ "sim_taxas = []\n",
+ "val_df = db_df.random_sample(0.3)\n",
+ "for row in val_df.iter_rows():\n",
+ " sim_genomes.append(row['id'])\n",
+ " sim_taxas.append(row['genus'])\n",
+ "sim_cls = pd.DataFrame({\n",
+ " 'id' : sim_genomes,\n",
+ " 'genus' : sim_taxas\n",
+ "})\n",
+ "sim_outdir = os.path.dirname(db_kmers_data['profile'])\n",
+ "sim_val = readsSimulation(\n",
+ " Path(db_kmers_data['fasta']),\n",
+ " sim_cls,\n",
+ " sim_genomes,\n",
+ " 'miseq',\n",
+ " sim_outdir,\n",
+ " 'validation'\n",
+ ")\n",
+ "val_data = sim_val.simulation(klen, db_kmers_data['kmers'])\n",
+ "val_data\n",
+ "val_ids = val_data['ids']\n",
+ "val_cls = pd.DataFrame({'sim_id' : val_ids}, dtype = object)\n",
+ "val_cls['id'] = val_cls['sim_id'].str.replace('_[0-9]+_[0-9]+_[0-9]+', '', regex=True)\n",
+ "val_cls = val_cls.set_index('id').join(sim_cls.set_index('id'))\n",
+ "val_cls = val_cls.drop(['sim_id'], axis=1)\n",
+ "val_cls = val_cls.reset_index(drop=True)\n",
+ "val_df = ray.data.read_parquet(val_data['profile'])\n",
+ "val_df = zip_X_y(val_df, val_cls)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train_func(config):\n",
+ " # Parameters\n",
+ " batch_size = config.get('batch_size', 128)\n",
+ " epochs = config.get('epochs', 10)\n",
+ " size = config.get('size')\n",
+ " nb_cls = config.get('nb_cls')\n",
+ " model = config.get('model')\n",
+ "\n",
+ " # Model setup\n",
+ " strategy = tf.distribute.MultiWorkerMirroredStrategy()\n",
+ " with strategy.scope():\n",
+ " model = build_wideCNN(size, nb_cls)\n",
+ "\n",
+ " # Load data directly to workers instead of serializing it?\n",
+ " train_data = session.get_dataset_shard('train')\n",
+ " val_data = session.get_dataset_shard('validation')\n",
+ "\n",
+ " def to_tf_dataset(data, batch_size):\n",
+ " def to_tensor_iterator():\n",
+ " for batch in data.iter_tf_batches(\n",
+ " batch_size=batch_size\n",
+ " ):\n",
+ " yield batch['__value__'], batch['labels']\n",
+ "\n",
+ " output_signature = (\n",
+ " tf.TensorSpec(shape=(None, size), dtype=tf.float32),\n",
+ " tf.TensorSpec(shape=(None, nb_cls), dtype=tf.int64),\n",
+ " )\n",
+ " tf_data = tf.data.Dataset.from_generator(\n",
+ " to_tensor_iterator, output_signature=output_signature\n",
+ " )\n",
+ " return prepare_dataset_shard(tf_data)\n",
+ "\n",
+ " batch_val = to_tf_dataset(val_data, batch_size)\n",
+ "\n",
+ " # Fit the model on streaming data\n",
+ " for epoch_train in train_data.iter_epochs(epochs):\n",
+ " batch_train = to_tf_dataset(epoch_train, batch_size)\n",
+ " history = model.fit(\n",
+ " x=batch_train,\n",
+ " validation_data=batch_val,\n",
+ " callbacks=[Callback()],\n",
+ " verbose=0\n",
+ " )\n",
+ " session.report({\n",
+ " 'accuracy': history.history['accuracy'][0],\n",
+ " 'loss': history.history['loss'][0],\n",
+ " 'val_accuracy': history.history['val_accuracy'][0],\n",
+ " 'val_loss': history.history['val_loss'][0],\n",
+ " },\n",
+ " checkpoint=TensorflowCheckpoint.from_model(model)\n",
+ " )\n",
+ " gc.collect()\n",
+ " tf.keras.backend.clear_session()\n",
+ " del model\n",
+ " gc.collect()\n",
+ " tf.keras.backend.clear_session()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the available CPUs: about 20% for data handling, the rest for training.\n",
+ "nb_CPU_data = int(os.cpu_count() * 0.2)\n",
+ "nb_CPU_training = int(os.cpu_count() - nb_CPU_data)\n",
+ "nb_GPU = len(tf.config.list_physical_devices('GPU'))\n",
+ "nb_CPU_per_worker = 0\n",
+ "nb_GPU_per_worker = 0\n",
+ "if nb_GPU > 0:\n",
+ "    # One worker per GPU, training CPUs shared evenly between the workers.\n",
+ "    use_gpu = True\n",
+ "    n_workers = nb_GPU\n",
+ "    nb_CPU_per_worker = int(nb_CPU_training / n_workers)\n",
+ "    nb_GPU_per_worker = 1\n",
+ "else:\n",
+ "    use_gpu = False\n",
+ "    # With fewer than 5 training CPUs the 20% share truncates to 0; keep at least\n",
+ "    # one worker so the division below cannot raise ZeroDivisionError.\n",
+ "    n_workers = max(1, int(nb_CPU_training * 0.2))\n",
+ "    nb_CPU_per_worker = int(int(nb_CPU_training * 0.8) / n_workers)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 136.66it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:01<00:00, 14.63it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 114.35it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 85.52it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 123.06it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 16.76it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 144.19it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 130.22it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "db_df = db_df.drop_columns(['id'])\n",
+ "db_df = preprocessor.transform(db_df)\n",
+ "val_df = val_df.drop_columns(['id'])\n",
+ "val_df = preprocessor.transform(val_df)\n",
+ "datasets = {'train': db_df, 'validation': val_df}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_params = {\n",
+ " 'batch_size': batch_size,\n",
+ " 'epochs': epochs,\n",
+ " 'size': len(db_kmers_data['kmers']),\n",
+ " 'nb_cls': nb_cls,\n",
+ " 'model': 'widecnn'\n",
+ "}\n",
+ "trainer = TensorflowTrainer(\n",
+ " train_loop_per_worker = train_func,\n",
+ " train_loop_config = train_params,\n",
+ " scaling_config = ScalingConfig(\n",
+ " trainer_resources={'CPU': nb_CPU_data},\n",
+ " num_workers = n_workers,\n",
+ " use_gpu = use_gpu,\n",
+ " resources_per_worker={\n",
+ " 'CPU': nb_CPU_per_worker,\n",
+ " 'GPU': nb_GPU_per_worker\n",
+ " }\n",
+ " ),\n",
+ " dataset_config = {\n",
+ " 'train': DatasetConfig(\n",
+ " fit = False,\n",
+ " transform = False,\n",
+ " split = True,\n",
+ " use_stream_api = True\n",
+ " ),\n",
+ " 'validation': DatasetConfig(\n",
+ " fit = False,\n",
+ " transform = False,\n",
+ " split = True,\n",
+ " use_stream_api = False\n",
+ " )\n",
+ " },\n",
+ " run_config = RunConfig(\n",
+ " name = 'WideCNN',\n",
+ " local_dir = workdir,\n",
+ " ),\n",
+ " datasets = datasets,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
Tune Status
\n",
+ "
\n",
+ "\n",
+ "Current time: | 2023-03-13 15:51:19 |
\n",
+ "Running for: | 00:00:03.36 |
\n",
+ "Memory: | 6.4/12.4 GiB |
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
System Info
\n",
+ " Using FIFO scheduling algorithm.
Resources requested: 6.0/8 CPUs, 0/0 GPUs, 0.0/4.19 GiB heap, 0.0/2.1 GiB objects\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
Trial Status
\n",
+ "
\n",
+ "\n",
+ "Trial name | status | loc |
\n",
+ "\n",
+ "\n",
+ "TensorflowTrainer_6ab93_00000 | RUNNING | 192.168.65.207:29527 |
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(TensorflowTrainer pid=29527)\u001b[0m 2023-03-13 15:51:22,846\tINFO dataset.py:3693 -- Created DatasetPipeline with 1 windows: 0.13MiB min, 0.13MiB max, 0.13MiB mean\n",
+ "\u001b[2m\u001b[36m(TensorflowTrainer pid=29527)\u001b[0m 2023-03-13 15:51:22,846\tINFO dataset.py:3703 -- Blocks per window: 16 min, 16 max, 16 mean\n",
+ "\u001b[2m\u001b[36m(TensorflowTrainer pid=29527)\u001b[0m 2023-03-13 15:51:22,848\tINFO dataset.py:3725 -- ✔️ This pipeline's per-window parallelism is high enough to fully utilize the cluster.\n",
+ "\u001b[2m\u001b[36m(TensorflowTrainer pid=29527)\u001b[0m 2023-03-13 15:51:22,848\tINFO dataset.py:3742 -- ✔️ This pipeline's windows likely fit in object store memory without spilling.\n",
+ "Stage 0: 0%| | 0/1 [00:00, ?it/s]=29591)\u001b[0m \n",
+ " 0%| | 0/1 [00:00, ?it/s]\u001b[Aor pid=29591)\u001b[0m \n",
+ "Stage 1: 0%| | 0/1 [00:00, ?it/s]\u001b[A591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 4it [00:03, 1.27it/s] \u001b[A591)\u001b[0m \n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
Trial Progress
\n",
+ "
\n",
+ "\n",
+ "Trial name | _time_this_iter_s | _timestamp | _training_iteration | accuracy | date | done | episodes_total | experiment_id | hostname | iterations_since_restore | loss | node_ip | pid | should_checkpoint | time_since_restore | time_this_iter_s | time_total_s | timestamp | timesteps_since_restore | timesteps_total | training_iteration | trial_id | val_accuracy | val_loss | warmup_time |
\n",
+ "\n",
+ "\n",
+ "TensorflowTrainer_6ab93_00000 | 12.5321 | 1678737113 | 3 | 0.0689655 | 2023-03-13_15-51-53 | False | | c4fd5b07b4aa4d6481b9e7439f6334fb | DESKTOP-TM4J0AE | 3 | 3.33265 | 192.168.65.207 | 29527 | True | 33.5966 | 12.5317 | 33.5966 | 1678737113 | 0 | | 3 | 6ab93_00000 | 0 | 3.33878 | 0.258061 |
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 5it [00:17, 4.29s/it]\u001b[Ainator pid=29591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 6it [00:29, 6.58s/it]\u001b[Ainator pid=29591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 7it [00:42, 8.31s/it]\u001b[Ainator pid=29591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 8it [00:55, 9.67s/it]\u001b[Ainator pid=29591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 9it [01:09, 10.72s/it]\u001b[Ainator pid=29591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 10it [01:21, 11.36s/it][Ainator pid=29591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 11it [01:35, 12.05s/it]\u001b[Anator pid=29591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 12it [01:49, 12.47s/it]\u001b[Anator pid=29591)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(PipelineSplitExecutorCoordinator pid=29591)\u001b[0m \n",
+ "Stage 0: : 13it [02:02, 12.89s/it]\u001b[Anator pid=29591)\u001b[0m \n",
+ "2023-03-13 15:53:43,102\tINFO tune.py:762 -- Total run time: 146.51 seconds (146.39 seconds for the tuning loop).\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_result = trainer.fit()\n",
+ "model_ckpt = training_result.best_checkpoints[0][0]"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Predicting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Map_Batches: 100%|██████████| 16/16 [00:00<00:00, 110.38it/s]\n",
+ "Map_Batches: 100%|██████████| 16/16 [00:02<00:00, 6.19it/s]\n",
+ "Map Progress (7 actors 2 pending): 100%|██████████| 16/16 [00:39<00:00, 2.44s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+ "if len(metagenome_df.schema().names) > 1:\n",
+ " col_2_drop = [col for col in metagenome_df.schema().names if col != '__value__']\n",
+ " metagenome_df = metagenome_df.drop_columns(col_2_drop)\n",
+ "\n",
+ "metagenome_df = preprocessor.preprocessors[0].transform(metagenome_df)\n",
+ "\n",
+ "predictor = BatchPredictor.from_checkpoint(\n",
+ " model_ckpt,\n",
+ " TensorflowPredictor,\n",
+ " model_definition = lambda: build_wideCNN(len(db_kmers_data['kmers']), nb_cls)\n",
+ ")\n",
+ "predictions = predictor.predict(\n",
+ " data = metagenome_df,\n",
+ " batch_size = batch_size\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def map_predicted_label_multiclass(df, threshold):\n",
+ "    \"\"\"Map every row of a prediction batch to its most probable class index.\n",
+ "\n",
+ "    df: DataFrame whose 'predictions' column holds one vector of class scores per row.\n",
+ "    threshold: minimum best score required to keep the predicted class.\n",
+ "\n",
+ "    Returns an int32 numpy array holding, for each row, the index of the highest\n",
+ "    score, or -1 when that score is below threshold.\n",
+ "    \"\"\"\n",
+ "    # Iterate by position rather than by index label, so batches whose index does\n",
+ "    # not run from 0 to len(df) - 1 are handled too.\n",
+ "    probas = [np.asarray(p) for p in df['predictions']]\n",
+ "    predicted_label = np.array([p.argmax() for p in probas], dtype = np.int32)\n",
+ "    best_proba = np.array([p.max() for p in probas])\n",
+ "    predicted_label[best_proba < threshold] = -1\n",
+ "    return predicted_label"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.\n",
+ "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 0.3s\n",
+ "[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 0.9s\n",
+ "[Parallel(n_jobs=-1)]: Done 298 out of 298 | elapsed: 1.3s finished\n"
+ ]
+ }
+ ],
+ "source": [
+ "with parallel_backend('threading'):\n",
+ " predict = Parallel(n_jobs=-1, prefer='threads', verbose=1)(\n",
+ " delayed(map_predicted_label_multiclass)(batch, classif_threshold) for batch in predictions.iter_batches(batch_size = batch_size))\n",
+ "\n",
+ "predictions = np.concatenate(predict)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "encoded = []\n",
+ "encoded.append(-1)\n",
+ "labels = ['unknown']\n",
+ "for k, v in preprocessor.preprocessors[1].stats_['unique_values({})'.format('genus')].items():\n",
+ " encoded.append(v)\n",
+ " labels.append(k)\n",
+ "predictions_widecnn = pd.Series(np.empty(len(predictions), dtype=object))\n",
+ "for label, coded in zip(labels, encoded):\n",
+ " predictions_widecnn[predictions == coded] = label"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " classification | \n",
+ " abundance | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " bacteria | \n",
+ " 298 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " classification abundance\n",
+ "0 bacteria 298"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Build the abundance table directly from the counts, so the result does not\n",
+ "# depend on the name pandas gives to the Series returned by value_counts().\n",
+ "clf_onesvm = (\n",
+ "    predictions_onesvm.value_counts()\n",
+ "    .rename_axis('classification')\n",
+ "    .reset_index(name = 'abundance')\n",
+ ")\n",
+ "clf_onesvm\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " classification | \n",
+ " abundance | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " unknown | \n",
+ " 298 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " classification abundance\n",
+ "0 unknown 298"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Build the abundance table directly from the counts, so the result does not\n",
+ "# depend on the name pandas gives to the Series returned by value_counts().\n",
+ "clf_widecnn = (\n",
+ "    predictions_widecnn.value_counts()\n",
+ "    .rename_axis('classification')\n",
+ "    .reset_index(name = 'abundance')\n",
+ ")\n",
+ "clf_widecnn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAGwCAYAAABPSaTdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAts0lEQVR4nO3de3RTdb7//1coNLYlLfRCLxJKGQsHbPFSHLCjAnKTAQFhhjp45CKOuriMHUAULyMelQIeCiIjHj0cynWAcQRdXikiRURGKDBcREQsAtJaqbVpsZNCu39/+DW/iQXENJD0M8/HWlnL7P1J8t74R59rZyexWZZlCQAAwFBNAj0AAADAxUTsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoTQM9QDCoq6vTiRMn5HA4ZLPZAj0OAAC4AJZlqbKyUklJSWrS5Nznb4gdSSdOnJDT6Qz0GAAAwAfHjh1T69atz7mf2JHkcDgkff+PFRkZGeBpAADAhXC5XHI6nZ6/4+dC7Eiet64iIyOJHQAAGpmfugSFC5QBAIDRiB0AAGA0YgcAABiN2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0QIaOwsXLlTnzp0931x8/fXX66233vLstyxL06dPV1JSksLCwtSjRw/t37/f6zncbrcmTpyo2NhYRUREaNCgQTp+/PilPhQAABCkAho7rVu31syZM7Vjxw7t2LFDN998swYPHuwJmtmzZys3N1cLFizQ9u3blZCQoD59+qiystLzHNnZ2Vq7dq1WrVqlLVu2qKqqSgMHDlRtbW2gDgsAAAQRm2VZVqCH+FfR0dF65plndNdddykpKUnZ2dl68MEHJX1/Fic+Pl6zZs3Svffeq4qKCsXFxWnZsmXKysqS9P//gvmbb76pfv36nfU13G633G635/4PPyRWUVHBb2MBANBIuFwuRUVF/eTf76C5Zqe2tlarVq3SqVOndP3116uoqEglJSXq27evZ43dblf37t21detWSVJhYaFOnz7ttSYpKUlpaWmeNWeTk5OjqKgoz83pdF68AwMAAAEV8NjZu3evmjdvLrvdrvvuu09r165Vp06dVFJSIkmKj4/3Wh8fH+/ZV1JSotDQULVs2fKca85m2rRpqqio8NyOHTvm56MCAADBommgB+jQoYN2796tb7/9Vn/72980atQoFRQUePb/+GfbLcv6yZ9y/6k1drtddru9YYMDAIBGIeCxExoaqiuuuEKS1KVLF23fvl3PPvus5zqdkpISJSYmetaXlpZ6zvYkJCSopqZG5eXlXmd3SktLlZmZeQmPInhkPLA00CMAABqBwmdGBnqESybgb2P9mGVZcrvdSklJUUJCgvLz8z37ampqVFBQ4AmZjIwMNWvWzGtNcXGx9u3b928bOwAAwFtAz+w8/PDD6t+/v5xOpyorK7Vq1Spt2rRJb7/9tmw2m7KzszVjxgylpqYqNTVVM2bMUHh4uEaMGCFJioqK0tixYzV58mTFxMQoOjpaU6ZMUXp6unr37h3IQwMAAEEioLHz1Vdf6c4771RxcbGioqLUuXNnvf322+rTp48kaerUqaqurta4ceNUXl6url27av369XI4HJ7nmDt3rpo2barhw4erurpavXr1Ul5enkJCQgJ1WAAAIIgE3ffsBMKFfk6/MeCaHQDAhTDhmp1G9z07AAAAFwOxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAA
wGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGgBjZ2cnBxdd911cjgcatWqlYYMGaKDBw96rRk9erRsNpvXrVu3bl5r3G63Jk6cqNjYWEVERGjQoEE6fvz4pTwUAAAQpAIaOwUFBRo/fry2bdum/Px8nTlzRn379tWpU6e81t1yyy0qLi723N58802v/dnZ2Vq7dq1WrVqlLVu2qKqqSgMHDlRtbe2lPBwAABCEmgbyxd9++22v+4sXL1arVq1UWFiom266ybPdbrcrISHhrM9RUVGhRYsWadmyZerdu7ckafny5XI6ndqwYYP69etX7zFut1tut9tz3+Vy+eNwAABAEAqqa3YqKiokSdHR0V7bN23apFatWql9+/b6/e9/r9LSUs++wsJCnT59Wn379vVsS0pKUlpamrZu3XrW18nJyVFUVJTn5nQ6L8LRAACAYBA0sWNZliZNmqQbbrhBaWlpnu39+/fXihUrtHHjRs2ZM0fbt2/XzTff7DkzU1JSotDQULVs2dLr+eLj41VSUnLW15o2bZoqKio8t2PHjl28AwMAAAEV0Lex/tWECRO0Z88ebdmyxWt7VlaW57/T0tLUpUsXJScn64033tDQoUPP+XyWZclms511n91ul91u98/gAAAgqAXFmZ2JEyfqtdde03vvvafWrVufd21iYqKSk5N16NAhSVJCQoJqampUXl7uta60tFTx8fEXbWYAANA4BDR2LMvShAkT9Morr2jjxo1KSUn5yceUlZXp2LFjSkxMlCRlZGSoWbNmys/P96wpLi7Wvn37lJmZedFmBwAAjUNA38YaP368Vq5cqVdffVUOh8NzjU1UVJTCwsJUVVWl6dOna9iwYUpMTNSRI0f08MMPKzY2Vrfddptn7dixYzV58mTFxMQoOjpaU6ZMUXp6uufTWQAA4N9XQGNn4cKFkqQePXp4bV+8eLFGjx6tkJAQ7d27V0uXLtW3336rxMRE9ezZU6tXr5bD4fCsnzt3rpo2barhw4erurpavXr1Ul5enkJCQi7l4QAAgCBksyzLCvQQgeZyuRQVFaWKigpFRkYGepwGyXhgaaBHAAA0AoXPjAz0CA12oX+/g+ICZQAAgIuF2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYjdgBAABGI3YAAIDRiB0AAGA0YgcAABiN2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYjdgBAABGI3YAAIDRiB0AAGA0YgcAABiN2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYjdgBAABGI3YAAIDRiB0AAGA0YgcAABiN2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYjdgBAABGI3YAAIDRiB0AAGC0gMZOTk6OrrvuOjkcDrVq1UpDhgzRwYMHvdZYlqXp06crKSlJYWFh6tGjh/bv3++1xu12a+LEiYqNjVVERIQGDRqk48ePX8pDAQAAQSqgsVNQUKDx48dr27Ztys/P15kzZ9S3b1+dOnXKs2b27NnKzc3VggULtH37diUkJKhPnz6qrKz0rMnOztbatWu1atUqbdmyRVVVVRo4cKBqa2sDcVgAACCI2CzLsgI9xA++/vprtWrVSgUFBbrppptkWZaSkpKUnZ2tBx98UNL3Z3Hi4+M1a9Ys3XvvvaqoqFBcXJyWLVumrKwsSdKJEyfkdDr15ptvql+/fvVex+12y+12e+67XC45nU5VVFQoMjLy0hzsRZLxwNJAjwAAaAQKnxkZ6BEazOVyKSoq6if/fgfVNTsVFRWSpOjoaElSUVGRSkpK1LdvX88au92u7t27a+vWrZKkwsJCnT592mtNUlKS
0tLSPGt+LCcnR1FRUZ6b0+m8WIcEAAACLGhix7IsTZo0STfccIPS0tIkSSUlJZKk+Ph4r7Xx8fGefSUlJQoNDVXLli3PuebHpk2bpoqKCs/t2LFj/j4cAAAQJJoGeoAfTJgwQXv27NGWLVvq7bPZbF73Lcuqt+3HzrfGbrfLbrf7PiwAAGg0guLMzsSJE/Xaa6/pvffeU+vWrT3bExISJKneGZrS0lLP2Z6EhATV1NSovLz8nGsAAMC/r4DGjmVZmjBhgl555RVt3LhRKSkpXvtTUlKUkJCg/Px8z7aamhoVFBQoMzNTkpSRkaFmzZp5rSkuLta+ffs8awAAwL+vgL6NNX78eK1cuVKvvvqqHA6H5wxOVFSUwsLCZLPZlJ2drRkzZig1NVWpqamaMWOGwsPDNWLECM/asWPHavLkyYqJiVF0dLSmTJmi9PR09e7dO5CHBwAAgkBAY2fhwoWSpB49enhtX7x4sUaPHi1Jmjp1qqqrqzVu3DiVl5era9euWr9+vRwOh2f93Llz1bRpUw0fPlzV1dXq1auX8vLyFBIScqkOBQAABKmg+p6dQLnQz+k3BnzPDgDgQvA9OwAAAIYgdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYzefYOXz4sB599FH97ne/U2lpqSTp7bff1v79+/02HAAAQEP5FDsFBQVKT0/X3//+d73yyiuqqqqSJO3Zs0ePP/64XwcEAABoCJ9i56GHHtJTTz2l/Px8hYaGerb37NlTH374od+GAwAAaCifYmfv3r267bbb6m2Pi4tTWVlZg4cCAADwF59ip0WLFiouLq63fdeuXbr88ssbPBQAAIC/+BQ7I0aM0IMPPqiSkhLZbDbV1dXpgw8+0JQpUzRyZOP/FVUAAGAOn2Ln6aefVps2bXT55ZerqqpKnTp10k033aTMzEw9+uij/p4RAADAZ019eVCzZs20YsUKPfnkk9q5c6fq6up0zTXXKDU11d/zAQAANIhPsfODdu3aqV27dv6aBQAAwO98ehvrN7/5jWbOnFlv+zPPPKPf/va3DR4KAADAX3z+UsEBAwbU237LLbdo8+bNDR4KAADAX3yKnaqqKq8vE/xBs2bN5HK5GjwUAACAv/gUO2lpaVq9enW97atWrVKnTp0aPBQAAIC/+HSB8mOPPaZhw4bp8OHDuvnmmyVJ7777rv7yl7/or3/9q18HBAAAaAifYmfQoEFat26dZsyYoZdffllhYWHq3LmzNmzYoO7du/t7RgAAAJ/5/NHzAQMGnPUiZQAAgGDSoO/ZqampUWlpqerq6ry2t2nTpkFDAQAA+ItPsXPo0CHddddd2rp1q9d2y7Jks9lUW1vrl+EAAAAayqfYGT16tJo2barXX39diYmJstls/p4LAADAL3yKnd27d6uwsFD/8R//4e95AAAA/Mqn79np1KmTTp486e9ZAAAA/M6n2Jk1a5amTp2qTZs2qaysTC6Xy+sGAAAQLHx6G6t3796SpF69enlt5wJlAAAQbHyKnffee8/fcwAAAFwUPsUO35IMAAAaiwZ9qeB3332no0ePqqamxmt7586dGzQUAACAv/gUO19//bXGjBmjt95666z7uWYHAAAEC58+jZWdna3y8nJt27ZNYWFhevvtt7VkyRKlpqbqtdde8/eMAAAAPvPpzM7GjRv16quv6rrrrlOTJk2UnJysPn36KDIyUjk5OfxAKAAACBo+ndk5deqUWrVqJUmKjo7W119/LUlKT0/Xzp07/TcdAABAA/kUOx06dNDBgwclSVdffbX+53/+R19++aVeeOEFJSYm+nVAAACAhvDpbazs7GwVFxdLkh5//HH169dPK1asUGhoqPLy8vw5HwAAQIP4FDt33HGH57+vueYaHTlyRJ988onatGmj2NhYvw0HAADQUA36np0fhIeH69prr/XHUwEAAPjVBcfOpEmTLvhJc3NzfRoGAADA3y44dnbt2uV1
v7CwULW1terQoYMk6dNPP1VISIgyMjL8OyEAAEADXHDs/OuPf+bm5srhcGjJkiVq2bKlJKm8vFxjxozRjTfe6P8pAQAAfOTTR8/nzJmjnJwcT+hIUsuWLfXUU09pzpw5fhsOAACgoXyKHZfLpa+++qre9tLSUlVWVjZ4KAAAAH/xKXZuu+02jRkzRi+//LKOHz+u48eP6+WXX9bYsWM1dOhQf88IAADgM59i54UXXtCAAQP0n//5n0pOTlZycrLuuOMO9e/fX88///wFP8/mzZt16623KikpSTabTevWrfPaP3r0aNlsNq9bt27dvNa43W5NnDhRsbGxioiI0KBBg3T8+HFfDgsAABjIp9gJDw/X888/r7KyMu3atUs7d+7UN998o+eff14REREX/DynTp3SVVddpQULFpxzzS233KLi4mLP7c033/Tan52drbVr12rVqlXasmWLqqqqNHDgQNXW1vpyaAAAwDAN+lLBiIgIde7c2efH9+/fX/379z/vGrvdroSEhLPuq6io0KJFi7Rs2TL17t1bkrR8+XI5nU5t2LBB/fr183k2AABgBp9i59SpU5o5c6beffddlZaWqq6uzmv/559/7pfhJGnTpk1q1aqVWrRooe7du+vpp5/2/OJ6YWGhTp8+rb59+3rWJyUlKS0tTVu3bj1n7Ljdbrndbs99l8vlt3kBAEBw8Sl27r77bhUUFOjOO+9UYmKibDabv+eS9P2Zn9/+9rdKTk5WUVGRHnvsMd18880qLCyU3W5XSUmJQkNDvT4CL0nx8fEqKSk55/Pm5OToiSeeuCgzAwCA4OJT7Lz11lt644039Ktf/crf83jJysry/HdaWpq6dOmi5ORkvfHGG+f91JdlWecNsGnTpnn9/IXL5ZLT6fTP0AAAIKj4dIFyy5YtFR0d7e9ZflJiYqKSk5N16NAhSVJCQoJqampUXl7uta60tFTx8fHnfB673a7IyEivGwAAMJNPsfPkk0/qT3/6k7777jt/z3NeZWVlOnbsmBITEyVJGRkZatasmfLz8z1riouLtW/fPmVmZl7S2QAAQHDy6W2sOXPm6PDhw4qPj1fbtm3VrFkzr/07d+68oOepqqrSZ5995rlfVFSk3bt3Kzo6WtHR0Zo+fbqGDRumxMREHTlyRA8//LBiY2N12223SZKioqI0duxYTZ48WTExMYqOjtaUKVOUnp7u+XQWAAD49+ZT7AwZMsQvL75jxw717NnTc/+H62hGjRqlhQsXau/evVq6dKm+/fZbJSYmqmfPnlq9erUcDofnMXPnzlXTpk01fPhwVVdXq1evXsrLy1NISIhfZgQAAI2bzbIsK9BDBJrL5VJUVJQqKioa/fU7GQ8sDfQIAIBGoPCZkYEeocEu9O+3T9fsAAAANBY+vY3VpEmT8360m59qAAAAwcKn2Fm7dq3X/dOnT2vXrl1asmQJX9YHAACCik+xM3jw4HrbfvOb3+jKK6/U6tWrNXbs2AYPBgAA4A9+vWana9eu2rBhgz+fEgAAoEH8FjvV1dV67rnn1Lp1a389JQAAQIP59DZWy5YtvS5QtixLlZWVCg8P1/Lly/02HAAAQEP5FDvz5s3zut+kSRPFxcWpa9eu9X6BHAAAIJB8ip1Ro0b5ew4AAICLwqfYkaTy8nItWrRIBw4ckM1mU8eOHTVmzJiA/Bo6AADAufh0gXJBQYHatm2r+fPnq7y8XN98843mz5+vlJQUFRQU+HtGAAAAn/l0Zmf8+PHKysrSwoULPT+4WVtbq3Hjxmn8+PHat2+fX4cEAADwlU9ndg4fPqzJkyd7/bJ4SEiIJk2apMOHD/ttOAAAgIbyKXauvfZaHThwoN72AwcO6Oqrr27oTAAAAH5zwW9j7dmzx/Pff/jDH3T//ffrs88+U7du3SRJ27Zt05///GfNnDnT/1MCAAD4yGZZlnUhC3/4pfOfWm6z2Rrdr567XC5FRUWpoqJCkZGRgR6nQTIeWBroEQAAjUDh
MyMDPUKDXejf7ws+s1NUVOSXwQAAAC6lC46d5OTkets+/vhjHT16VDU1NZ5tNpvtrGsBAAACwaePnn/++ee67bbbtHfvXq+3tn74vazG9jYWAAAwl0+fxrr//vuVkpKir776SuHh4dq3b582b96sLl26aNOmTX4eEQAAwHc+ndn58MMPtXHjRsXFxalJkyYKCQnRDTfcoJycHP3hD3/Qrl27/D0nAACAT3w6s1NbW6vmzZtLkmJjY3XixAlJ31/Xc/DgQf9NBwAA0EA+ndlJS0vTnj171K5dO3Xt2lWzZ89WaGioXnzxRbVr187fMwIAAPjMp9h59NFHderUKUnSU089pYEDB+rGG29UTEyMVq9e7dcBAQAAGsKn2OnXr5/nv9u1a6ePP/5Y33zzjVq2bOn5RBYAAEAw8Cl2ziY6OtpfTwUAAOA3Pl2gDAAA0FgQOwAAwGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoAY2dzZs369Zbb1VSUpJsNpvWrVvntd+yLE2fPl1JSUkKCwtTjx49tH//fq81brdbEydOVGxsrCIiIjRo0CAdP378Eh4FAAAIZgGNnVOnTumqq67SggULzrp/9uzZys3N1YIFC7R9+3YlJCSoT58+qqys9KzJzs7W2rVrtWrVKm3ZskVVVVUaOHCgamtrL9VhAACAINY0kC/ev39/9e/f/6z7LMvSvHnz9Mgjj2jo0KGSpCVLlig+Pl4rV67Uvffeq4qKCi1atEjLli1T7969JUnLly+X0+nUhg0b1K9fv0t2LAAAIDgF7TU7RUVFKikpUd++fT3b7Ha7unfvrq1bt0qSCgsLdfr0aa81SUlJSktL86w5G7fbLZfL5XUDAABmCtrYKSkpkSTFx8d7bY+Pj/fsKykpUWhoqFq2bHnONWeTk5OjqKgoz83pdPp5egAAECyCNnZ+YLPZvO5bllVv24/91Jpp06apoqLCczt27JhfZgUAAMEnaGMnISFBkuqdoSktLfWc7UlISFBNTY3Ky8vPueZs7Ha7IiMjvW4AAMBMQRs7KSkpSkhIUH5+vmdbTU2NCgoKlJmZKUnKyMhQs2bNvNYUFxdr3759njUAAODfW0A/jVVVVaXPPvvMc7+oqEi7d+9WdHS02rRpo+zsbM2YMUOpqalKTU3VjBkzFB4erhEjRkiSoqKiNHbsWE2ePFkxMTGKjo7WlClTlJ6e7vl0FgAA+PcW0NjZsWOHevbs6bk/adIkSdKoUaOUl5enqVOnqrq6WuPGjVN5ebm6du2q9evXy+FweB4zd+5cNW3aVMOHD1d1dbV69eqlvLw8hYSEXPLjAQAAwcdmWZYV6CECzeVyKSoqShUVFY3++p2MB5YGegQAQCNQ+MzIQI/QYBf69ztor9kBAADwB2IHAAAYjdgBAABGI3YAAIDRiB0AAGA0YgcAABiN2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYjdgBAABGI3YAAIDRiB0AAGA0YgcAABiN2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYjdgBAABGI3YAAIDRiB0AAGA0YgcAABiN2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYjdgBAABGI3YAAIDRiB0AAGA0YgcAABiN2AEAAEYjdgAAgNGIHQAAYDRiBwAAGI3YAQAARiN2AACA0YI6dqZPny6bzeZ1S0hI8Oy3LEvTp09XUlKSwsLC1KNHD+3fvz+AEwMAgGAT1LEjSVdeeaWKi4s9t71793r2zZ49W7m5uVqwYIG2b9+uhIQE9enTR5WVlQGc
GAAABJOgj52mTZsqISHBc4uLi5P0/VmdefPm6ZFHHtHQoUOVlpamJUuW6LvvvtPKlSsDPDUAAAgWQR87hw4dUlJSklJSUnT77bfr888/lyQVFRWppKREffv29ay12+3q3r27tm7det7ndLvdcrlcXjcAAGCmoI6drl27aunSpXrnnXf00ksvqaSkRJmZmSorK1NJSYkkKT4+3usx8fHxnn3nkpOTo6ioKM/N6XRetGMAAACBFdSx079/fw0bNkzp6enq3bu33njjDUnSkiVLPGtsNpvXYyzLqrftx6ZNm6aKigrP7dixY/4fHgAABIWgjp0fi4iIUHp6ug4dOuT5VNaPz+KUlpbWO9vzY3a7XZGRkV43AABgpkYVO263WwcOHFBiYqJSUlKUkJCg/Px8z/6amhoVFBQoMzMzgFMCAIBg0jTQA5zPlClTdOutt6pNmzYqLS3VU089JZfLpVGjRslmsyk7O1szZsxQamqqUlNTNWPGDIWHh2vEiBGBHh0AAASJoI6d48eP63e/+51OnjypuLg4devWTdu2bVNycrIkaerUqaqurta4ceNUXl6url27av369XI4HAGeHAAABAubZVlWoIcINJfLpaioKFVUVDT663cyHlga6BEAAI1A4TMjAz1Cg13o3+9Gdc0OAADAz0XsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAwGrEDAACMRuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGjEDgAAMJoxsfP8888rJSVFl112mTIyMvT+++8HeiQAABAEjIid1atXKzs7W4888oh27dqlG2+8Uf3799fRo0cDPRoAAAgwI2InNzdXY8eO1d13362OHTtq3rx5cjqdWrhwYaBHAwAAAdY00AM0VE1NjQoLC/XQQw95be/bt6+2bt161se43W653W7P/YqKCkmSy+W6eINeIrXu6kCPAABoBEz4m/fDMViWdd51jT52Tp48qdraWsXHx3ttj4+PV0lJyVkfk5OToyeeeKLedqfTeVFmBAAg2EQ9d1+gR/CbyspKRUVFnXN/o4+dH9hsNq/7lmXV2/aDadOmadKkSZ77dXV1+uabbxQTE3POxwBonFwul5xOp44dO6bIyMhAjwPAjyzLUmVlpZKSks67rtHHTmxsrEJCQuqdxSktLa13tucHdrtddrvda1uLFi0u1ogAgkBkZCSxAxjofGd0ftDoL1AODQ1VRkaG8vPzvbbn5+crMzMzQFMBAIBg0ejP7EjSpEmTdOedd6pLly66/vrr9eKLL+ro0aO67z5z3o8EAAC+MSJ2srKyVFZWpv/6r/9ScXGx0tLS9Oabbyo5OTnQowEIMLvdrscff7zeW9cA/n3YrJ/6vBYAAEAj1uiv2QEAADgfYgcAABiN2AEAAEYjdgBccj169FB2dnagx7hgbdu21bx58wI9BgAfGfFpLAD4V6NHj9a3336rdevW+eX5tm/froiICL88F4BLj9gBgHOoqalRaGio4uLiAj0KgAbgbSwAAXHmzBlNmDBBLVq0UExMjB599FHPLxcvX75cXbp0kcPhUEJCgkaMGKHS0lKvx+/fv18DBgxQZGSkHA6HbrzxRh0+fFjTp0/XkiVL9Oqrr8pms8lms2nTpk2SpC+//FJZWVlq2bKlYmJiNHjwYB05csTznKNHj9aQIUOUk5OjpKQktW/fXlL9t7Fyc3OVnp6uiIgIOZ1OjRs3TlVV
VRf13wuA74gdAAGxZMkSNW3aVH//+981f/58zZ07V//7v/8r6fszKk8++aT+8Y9/aN26dSoqKtLo0aM9j/3yyy9100036bLLLtPGjRtVWFiou+66S2fOnNGUKVM0fPhw3XLLLSouLlZxcbEyMzP13XffqWfPnmrevLk2b96sLVu2qHnz5rrllltUU1Pjee53331XBw4cUH5+vl5//fWzzt6kSRPNnz9f+/bt05IlS7Rx40ZNnTr1ov57AWgACwAuse7du1sdO3a06urqPNsefPBBq2PHjmdd/9FHH1mSrMrKSsuyLGvatGlWSkqKVVNTc9b1o0aNsgYPHuy1bdGiRVaHDh28XtPtdlthYWHWO++843lcfHy85Xa7vR6bnJxszZ0795zHs2bNGismJuac+wEEFmd2AAREt27dZLPZPPevv/56HTp0SLW1tdq1a5cGDx6s5ORkORwO9ejRQ5J09OhRSdLu3bt14403qlmzZhf8eoWFhfrss8/kcDjUvHlzNW/eXNHR0frnP/+pw4cPe9alp6crNDT0vM/13nvvqU+fPrr88svlcDg0cuRIlZWV6dSpUz/jXwDApcIFygCCyj//+U/17dtXffv21fLlyxUXF6ejR4+qX79+nrebwsLCfvbz1tXVKSMjQytWrKi3718vQP6pT1198cUX+vWvf6377rtPTz75pKKjo7VlyxaNHTtWp0+f/tlzAbj4iB0AAbFt27Z691NTU/XJJ5/o5MmTmjlzppxOpyRpx44dXms7d+6sJUuW6PTp02c9uxMaGqra2lqvbddee61Wr16tVq1aKTIy0ue5d+zYoTNnzmjOnDlq0uT7k+Nr1qzx+fkAXHy8jQUgII4dO6ZJkybp4MGD+stf/qLnnntO999/v9q0aaPQ0FA999xz+vzzz/Xaa6/pySef9HrshAkT5HK5dPvtt2vHjh06dOiQli1bpoMHD0r6/tNTe/bs0cGDB3Xy5EmdPn1ad9xxh2JjYzV48GC9//77KioqUkFBge6//34dP378guf+xS9+oTNnznjmW7ZsmV544QW//tsA8C9iB0BAjBw5UtXV1frlL3+p8ePHa+LEibrnnnsUFxenvLw8/fWvf1WnTp00c+ZM/fd//7fXY2NiYrRx40ZVVVWpe/fuysjI0EsvveQ5y/P73/9eHTp0UJcuXRQXF6cPPvhA4eHh2rx5s9q0aaOhQ4eqY8eOuuuuu1RdXf2zzvRcffXVys3N1axZs5SWlqYVK1YoJyfHr/82APzLZln/74stAAAADMSZHQAAYDRiBwAAGI3YAQAARiN2AACA0YgdAABgNGIHAAAYjdgBAABGI3YAAIDRiB0AfnXkyBHZbDbt3r37or9WXl6eWrRo4bXtxRdflNPpVJMmTTRv3jxNnz5dV1999UWfpW3btpo3b95Ffx0APx/foAzAr44cOaKUlBTt2rXrokdGdXW1Kisr1apVK0mSy+VSbGyscnNzNWzYMEVFRamurk5ut1sxMTF+ec28vDxlZ2fr22+/9dr+9ddfKyIiQuHh4X55HQD+w6+eA2i0wsLCFBYW5rl/9OhRnT59WgMGDFBiYqJne/PmzS/6LHFxcRf9NQD4hrexAPikrq5Os2bN0hVXXCG73a42bdro6aefrreutrZWY8eOVUpKisLCwtShQwc9++yzXms2bdqkX/7yl4qIiFCLFi30q1/9Sl988YUk6R//+Id69uwph8OhyMhIZWRkaMeOHZK838bKy8tTenq6JKldu3ay2Ww6cuTIWd/G+r//+z9deeWVstvtSkxM1IQJEzz7cnNzlZ6eroiICDmdTo0bN05VVVWeOceMGaOKigrZbDbZbDZNnz5dUv23sY4eParBgwerefPmioyM1PDhw/XVV1959v8w17Jly9S2bVtFRUXp9ttvV2Vl5c//nwHgvIgdAD6ZNm2aZs2apccee0wff/yxVq5cqfj4+Hrr6urq1Lp1a61Zs0Yff/yx/vSnP+nhhx/W
mjVrJElnzpzRkCFD1L17d+3Zs0cffvih7rnnHtlsNknSHXfcodatW2v79u0qLCzUQw895Pl183+VlZWlDRs2SJI++ugjFRcXy+l01lu3cOFCjR8/Xvfcc4/27t2r1157TVdccYVnf5MmTTR//nzt27dPS5Ys0caNGzV16lRJUmZmpubNm6fIyEgVFxeruLhYU6ZMqfcalmVpyJAh+uabb1RQUKD8/HwdPnxYWVlZXusOHz6sdevW6fXXX9frr7+ugoICzZw580L/FwC4UBYA/Ewul8uy2+3WSy+9VG9fUVGRJcnatWvXOR8/btw4a9iwYZZlWVZZWZklydq0adNZ1zocDisvL++s+xYvXmxFRUV57u/atcuSZBUVFXm2Pf7449ZVV13luZ+UlGQ98sgj5z64H1mzZo0VExNzztf8QXJysjV37lzLsixr/fr1VkhIiHX06FHP/v3791uSrI8++sgzV3h4uOVyuTxrHnjgAatr164XPBuAC8OZHQA/24EDB+R2u9WrV68LWv/CCy+oS5cuiouLU/PmzfXSSy/p6NGjkqTo6GiNHj1a/fr106233qpnn31WxcXFnsdOmjRJd999t3r37q2ZM2fq8OHDPs9dWlqqEydOnHfu9957T3369NHll18uh8OhkSNHqqysTKdOnbrg1zlw4ICcTqfXmaVOnTqpRYsWOnDggGdb27Zt5XA4PPcTExNVWlr6M48KwE8hdgD8bP96UfBPWbNmjf74xz/qrrvu0vr167V7926NGTNGNTU1njWLFy/Whx9+qMzMTK1evVrt27fXtm3bJH1/bcv+/fs1YMAAbdy4UZ06ddLatWsvytxffPGFfv3rXystLU1/+9vfVFhYqD//+c+SpNOnT1/w61iW5Xkb7nzbf/x2nM1mU11d3QW/DoALQ+wA+NlSU1MVFhamd9999yfXvv/++8rMzNS4ceN0zTXX6Iorrjjr2ZlrrrlG06ZN09atW5WWlqaVK1d69rVv315//OMftX79eg0dOlSLFy/2aW6Hw6G2bduec+4dO3bozJkzmjNnjrp166b27dvrxIkTXmtCQ0NVW1t73tfp1KmTjh49qmPHjnm2ffzxx6qoqFDHjh19mh2A74gdAD/bZZddpgcffFBTp07V0qVLdfjwYW3btk2LFi2qt/aKK67Qjh079M477+jTTz/VY489pu3bt3v2FxUVadq0afrwww/1xRdfaP369fr000/VsWNHVVdXa8KECdq0aZO++OILffDBB9q+fXuDgmH69OmaM2eO5s+fr0OHDmnnzp167rnnJEm/+MUvdObMGT333HP6/PPPtWzZMr3wwgtej2/btq2qqqr07rvv6uTJk/ruu+/qvUbv3r3VuXNn3XHHHdq5c6c++ugjjRw5Ut27d1eXLl18nh2Ab4gdAD557LHHNHnyZP3pT39Sx44dlZWVddbrTe677z4NHTpUWVlZ6tq1q8rKyjRu3DjP/vDwcH3yyScaNmyY2rdvr3vuuUcTJkzQvffeq5CQEJWVlWnkyJFq3769hg8frv79++uJJ57wee5Ro0Zp3rx5ev7553XllVdq4MCBOnTokCTp6quvVm5urmbNmqW0tDStWLFCOTk5Xo/PzMzUfffdp6ysLMXFxWn27Nn1XsNms2ndunVq2bKlbrrpJvXu3Vvt2rXT6tWrfZ4bgO/4BmUAAGA0zuwAAACjETsAAMBoxA4AADAasQMAAIxG7AAAAKMROwAAwGjEDgAAMBqxAwAAjEbsAAAAoxE7AADAaMQOAAAw2v8H2Wb2AyOSoesAAAAASUVORK5CYII=",
+ "text/plain": [
+ "