From b1d422619c4922f0f61949c2995d699207a453b1 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 5 Feb 2020 16:57:11 -0500 Subject: [PATCH 01/35] bump dev version --- bbconf/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbconf/_version.py b/bbconf/_version.py index f102a9c..6892a3d 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.0.1" +__version__ = "0.0.2-dev" From 000f5fcf44bc124a87affbd90552da5ca22dce08 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 5 Feb 2020 16:57:35 -0500 Subject: [PATCH 02/35] make search return all results by default, but parametrize it --- bbconf/bbconf.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index f337895..05bdf41 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -66,23 +66,25 @@ def assert_connection(self): if not hasattr(self, ES_CLIENT_KEY): raise BedBaseConnectionError("No active connection with Elasticsearch") - def _search_index(self, index_name, query, just_data=True): + def _search_index(self, index_name, query, just_data=True, size=None, **kwargs): """ Search selected Elasticsearch index with selected query :param str index_name: name of the Elasticsearch index to search :param dict query: query to search the DB against :param bool just_data: whether just the hits should be returned + :param int size: number of hits to return, all are returned by default :return dict | Iterable[dict]: search results """ self.assert_connection() _LOGGER.debug("Searching index: {}\nQuery: {}".format(index_name, query)) query = {"query": query} if "query" not in query else query - search_results = self[ES_CLIENT_KEY].search(index=index_name, body=query) + size = size or self._count_docs(index=index_name) + search_results = self[ES_CLIENT_KEY].search(index=index_name, body=query, size=size, **kwargs) return [r["_source"] for r in search_results["hits"]["hits"]] \ if just_data 
else search_results - def search_bedfiles(self, query, just_data=True): + def search_bedfiles(self, query, just_data=True, **kwargs): """ Search selected Elasticsearch bedset index with selected query @@ -90,9 +92,9 @@ def search_bedfiles(self, query, just_data=True): :param bool just_data: whether just the hits should be returned :return dict | Iterable[dict]: search results """ - return self._search_index(index_name=BED_INDEX, query=query, just_data=just_data) + return self._search_index(index_name=BED_INDEX, query=query, just_data=just_data, **kwargs) - def search_bedsets(self, query, just_data=True): + def search_bedsets(self, query, just_data=True, **kwargs): """ Search selected Elasticsearch bedfiles index with selected query @@ -100,7 +102,7 @@ def search_bedsets(self, query, just_data=True): :param bool just_data: whether just the hits should be returned :return dict | Iterable[dict]: search results """ - return self._search_index(index_name=BEDSET_INDEX, query=query, just_data=just_data) + return self._search_index(index_name=BEDSET_INDEX, query=query, just_data=just_data, **kwargs) def _insert_data(self, index, data, **kwargs): """ From 929fe4c08feae6dc3d24cdf87f38ebf5e588855b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 5 Feb 2020 17:03:05 -0500 Subject: [PATCH 03/35] update changelog --- docs/changelog.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index a53f6a2..af0bc6c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,11 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. -## [0.0.1] - unreleased +## [0.0.2] - unreleased +### Changed +- make `search_bedfiles` and `search_bedsets` methods return all hits by default instead of just 10. Parametrize it. 
+ + +## [0.0.1] - 2020-02-05 ### Added - initial project release \ No newline at end of file From af0bd6a1d253dcada86c5a44d6ba6275ffebcce0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 11 Feb 2020 16:45:27 -0500 Subject: [PATCH 04/35] add json keys to const --- bbconf/const.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index f218480..3b6f6dc 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -32,6 +32,8 @@ CFG_BED_INDEX_KEY = "bed_index" CFG_BEDSET_INDEX_KEY = "bedset_index" +CFG_KEYS = ["CFG_PATH_KEY", "CFG_SERVER_KEY", "CFG_DATABASE_KEY", "CFG_HOST_KEY", + "CFG_PORT_KEY", "CFG_BEDSTAT_OUTPUT_KEY", "CFG_BED_INDEX_KEY", "CFG_BEDSET_INDEX_KEY"] DEFAULT_SECTION_VALUES = { CFG_DATABASE_KEY: { @@ -47,9 +49,52 @@ IDX_MAP = {CFG_BED_INDEX_KEY: BED_INDEX, CFG_BEDSET_INDEX_KEY: BEDSET_INDEX} -CFG_KEYS = ["CFG_PATH_KEY", "CFG_SERVER_KEY", "CFG_DATABASE_KEY", "CFG_HOST_KEY", - "CFG_PORT_KEY", "CFG_BEDSTAT_OUTPUT_KEY", "CFG_BED_INDEX_KEY", "CFG_BEDSET_INDEX_KEY"] +# JSON bed metadata constants and descriptions +# (the keys are actually established in bedstat/tools/regionstat.R) +JSON_GC_CONTENT_KEY = "gc_content" +JSON_ID_KEY = "id" +JSON_REGIONS_NO_KEY = "regions_no" +JSON_MEAN_ABS_TSS_DIST_KEY = "mean_abs_TSS_dist" +JSON_GEN_PART_KEY = "genomic_partitions" +JSON_MD5SUM_KEY = "md5sum" +JSON_PLOTS_KEY = "plots" +JSON_EXON_FREQUENCY = "exon_frequency" +JSON_INTRON_FREQUENCY = "intron_frequency" +JSON_INTERGENIC_FREQUENCY = "intergenic_frequency" +JSON_PROMOTERCORE_FREQUENCY = "promoterCore_frequency" +JSON_PROMOTERPROX_FREQUENCY = "promoterProx_frequency" +JSON_EXON_PERCENTAGE = "exon_percentage" +JSON_INTRON_PERCENTAGE = "intron_percentage" +JSON_INTERGENIC_PERCENTAGE = "intergenic_percentage" +JSON_PROMOTERCORE_PERCENTAGE = "promoterCore_percentage" +JSON_PROMOTERPROX_PERCENTAGE = "promoterProx_percentage" + +JSON_KEYS = ["JSON_GC_CONTENT_KEY", 
"JSON_ID_KEY", "JSON_REGIONS_NO_KEY", + "JSON_MEAN_ABS_TSS_DIST_KEY", "JSON_GEN_PART_KEY", "JSON_MD5SUM_KEY", + "JSON_PLOTS_KEY", "JSON_EXON_FREQUENCY", "JSON_INTRON_FREQUENCY", + "JSON_INTERGENIC_FREQUENCY", "JSON_PROMOTERCORE_FREQUENCY", "JSON_PROMOTERPROX", + "JSON_EXON_PERCENTAGE", "JSON_INTRON_PERCENTAGE", "JSON_INTERGENIC_PERCENTAGE", + "JSON_PROMOTERCORE_PERCENTAGE", "JSON_PROMOTERPROX"] + +JSON_NUMERIC_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_GEN_PART_KEY", + "JSON_MEAN_ABS_TSS_DIST_KEY"] + +JSON_GC_CONTENT = {JSON_GC_CONTENT_KEY: "GC content"} +JSON_ID = {JSON_ID_KEY: "BED file ID"} +JSON_REGIONS_NO = {JSON_REGIONS_NO_KEY: "Number of regions"} +JSON_MEAN_ABS_TSS_DIST = {JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites"} +JSON_GEN_PART = {JSON_GEN_PART_KEY: "Genomic partitions"} +JSON_MD5SUM = {JSON_MD5SUM_KEY: "BED file md5 checksum"} + +JSON_DICTS_KEY_DESCS = ["JSON_GC_CONTENT", "JSON_ID", "JSON_REGIONS_NO", + "JSON_MEAN_ABS_TSS_DIST", "JSON_GEN_PART", "JSON_MD5SUM"] + +JSON_DICTS_KEY_DESCS = {JSON_GC_CONTENT_KEY: "GC content", JSON_ID_KEY: "BED file ID", + JSON_REGIONS_NO_KEY: "Number of regions", JSON_MD5SUM_KEY: "BED file md5 checksum", + JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites", + JSON_GEN_PART_KEY: "Genomic partitions"} __all__ = ["BED_INDEX", "BEDSET_INDEX", "SEARCH_TERMS", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", - "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES"] + CFG_KEYS + "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES", "JSON_DICTS_KEY_DESCS", + "JSON_KEYS", "JSON_NUMERIC_KEYS"] + CFG_KEYS + JSON_KEYS From 278b9a9d909243370e37d8efb1b71aeeb609721a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 11 Feb 2020 17:58:08 -0500 Subject: [PATCH 05/35] more keys updates --- bbconf/const.py | 45 
++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index 3b6f6dc..230f7b4 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -55,44 +55,39 @@ JSON_ID_KEY = "id" JSON_REGIONS_NO_KEY = "regions_no" JSON_MEAN_ABS_TSS_DIST_KEY = "mean_abs_TSS_dist" -JSON_GEN_PART_KEY = "genomic_partitions" JSON_MD5SUM_KEY = "md5sum" JSON_PLOTS_KEY = "plots" -JSON_EXON_FREQUENCY = "exon_frequency" -JSON_INTRON_FREQUENCY = "intron_frequency" -JSON_INTERGENIC_FREQUENCY = "intergenic_frequency" -JSON_PROMOTERCORE_FREQUENCY = "promoterCore_frequency" -JSON_PROMOTERPROX_FREQUENCY = "promoterProx_frequency" -JSON_EXON_PERCENTAGE = "exon_percentage" -JSON_INTRON_PERCENTAGE = "intron_percentage" -JSON_INTERGENIC_PERCENTAGE = "intergenic_percentage" -JSON_PROMOTERCORE_PERCENTAGE = "promoterCore_percentage" -JSON_PROMOTERPROX_PERCENTAGE = "promoterProx_percentage" - -JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_REGIONS_NO_KEY", - "JSON_MEAN_ABS_TSS_DIST_KEY", "JSON_GEN_PART_KEY", "JSON_MD5SUM_KEY", - "JSON_PLOTS_KEY", "JSON_EXON_FREQUENCY", "JSON_INTRON_FREQUENCY", - "JSON_INTERGENIC_FREQUENCY", "JSON_PROMOTERCORE_FREQUENCY", "JSON_PROMOTERPROX", - "JSON_EXON_PERCENTAGE", "JSON_INTRON_PERCENTAGE", "JSON_INTERGENIC_PERCENTAGE", - "JSON_PROMOTERCORE_PERCENTAGE", "JSON_PROMOTERPROX"] - -JSON_NUMERIC_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_GEN_PART_KEY", - "JSON_MEAN_ABS_TSS_DIST_KEY"] +JSON_EXON_FREQUENCY_KEY = "exon_frequency" +JSON_INTRON_FREQUENCY_KEY = "intron_frequency" +JSON_INTERGENIC_FREQUENCY_KEY = "intergenic_frequency" +JSON_PROMOTERCORE_FREQUENCY_KEY = "promoterCore_frequency" +JSON_PROMOTERPROX_FREQUENCY_KEY = "promoterProx_frequency" +JSON_EXON_PERCENTAGE_KEY = "exon_percentage" +JSON_INTRON_PERCENTAGE_KEY = "intron_percentage" +JSON_INTERGENIC_PERCENTAGE_KEY = "intergenic_percentage" +JSON_PROMOTERCORE_PERCENTAGE_KEY = "promoterCore_percentage" 
+JSON_PROMOTERPROX_PERCENTAGE_KEY = "promoterProx_percentage" + +JSON_NUMERIC_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", + "JSON_EXON_FREQUENCY_KEY", "JSON_INTRON_FREQUENCY_KEY", "JSON_PROMOTERPROX_FREQUENCY_KEY", + "JSON_INTERGENIC_FREQUENCY_KEY", "JSON_PROMOTERCORE_FREQUENCY_KEY", + "JSON_PROMOTERPROX_PERCENTAGE_KEY", "JSON_EXON_PERCENTAGE_KEY", "JSON_INTRON_PERCENTAGE_KEY", + "JSON_INTERGENIC_PERCENTAGE_KEY", "JSON_PROMOTERCORE_PERCENTAGE_KEY"] + +JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY"] + JSON_NUMERIC_KEYS JSON_GC_CONTENT = {JSON_GC_CONTENT_KEY: "GC content"} JSON_ID = {JSON_ID_KEY: "BED file ID"} JSON_REGIONS_NO = {JSON_REGIONS_NO_KEY: "Number of regions"} JSON_MEAN_ABS_TSS_DIST = {JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites"} -JSON_GEN_PART = {JSON_GEN_PART_KEY: "Genomic partitions"} JSON_MD5SUM = {JSON_MD5SUM_KEY: "BED file md5 checksum"} JSON_DICTS_KEY_DESCS = ["JSON_GC_CONTENT", "JSON_ID", "JSON_REGIONS_NO", - "JSON_MEAN_ABS_TSS_DIST", "JSON_GEN_PART", "JSON_MD5SUM"] + "JSON_MEAN_ABS_TSS_DIST", "JSON_MD5SUM"] JSON_DICTS_KEY_DESCS = {JSON_GC_CONTENT_KEY: "GC content", JSON_ID_KEY: "BED file ID", JSON_REGIONS_NO_KEY: "Number of regions", JSON_MD5SUM_KEY: "BED file md5 checksum", - JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites", - JSON_GEN_PART_KEY: "Genomic partitions"} + JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites"} __all__ = ["BED_INDEX", "BEDSET_INDEX", "SEARCH_TERMS", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", From f57e0ceadaae16086b6b2b08bb25b4a93803e57d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 12 Feb 2020 07:47:06 -0500 Subject: [PATCH 06/35] add genomic partitions consts --- bbconf/const.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git 
a/bbconf/const.py b/bbconf/const.py index 230f7b4..436a5a8 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -76,18 +76,21 @@ JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY"] + JSON_NUMERIC_KEYS -JSON_GC_CONTENT = {JSON_GC_CONTENT_KEY: "GC content"} -JSON_ID = {JSON_ID_KEY: "BED file ID"} -JSON_REGIONS_NO = {JSON_REGIONS_NO_KEY: "Number of regions"} -JSON_MEAN_ABS_TSS_DIST = {JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites"} -JSON_MD5SUM = {JSON_MD5SUM_KEY: "BED file md5 checksum"} - -JSON_DICTS_KEY_DESCS = ["JSON_GC_CONTENT", "JSON_ID", "JSON_REGIONS_NO", - "JSON_MEAN_ABS_TSS_DIST", "JSON_MD5SUM"] - +_PERC_TXT = "Percentage of regions in " +_FREQ_TXT = "Frequency of regions in " JSON_DICTS_KEY_DESCS = {JSON_GC_CONTENT_KEY: "GC content", JSON_ID_KEY: "BED file ID", JSON_REGIONS_NO_KEY: "Number of regions", JSON_MD5SUM_KEY: "BED file md5 checksum", - JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites"} + JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites", + JSON_PROMOTERPROX_PERCENTAGE_KEY: _PERC_TXT + "promoter proximity", + JSON_PROMOTERCORE_PERCENTAGE_KEY: _PERC_TXT + "promoter core", + JSON_EXON_PERCENTAGE_KEY: _PERC_TXT + "exons", + JSON_INTRON_PERCENTAGE_KEY: _PERC_TXT + "introns", + JSON_INTERGENIC_PERCENTAGE_KEY: _PERC_TXT + "intergenic", + JSON_PROMOTERPROX_FREQUENCY_KEY: _FREQ_TXT + "promoter proximity", + JSON_PROMOTERCORE_FREQUENCY_KEY: _FREQ_TXT + "promoter core", + JSON_EXON_FREQUENCY_KEY: _FREQ_TXT + "exons", + JSON_INTRON_FREQUENCY_KEY: _FREQ_TXT + "introns", + JSON_INTERGENIC_FREQUENCY_KEY: _FREQ_TXT + "intergenic"} __all__ = ["BED_INDEX", "BEDSET_INDEX", "SEARCH_TERMS", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", From 1f9871e606ace4cdedf537d9947c5b9134cc66b7 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 12 Feb 2020 21:11:06 -0500 Subject: 
[PATCH 07/35] add bedset keys --- bbconf/const.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index 436a5a8..4cae581 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -68,13 +68,31 @@ JSON_PROMOTERCORE_PERCENTAGE_KEY = "promoterCore_percentage" JSON_PROMOTERPROX_PERCENTAGE_KEY = "promoterProx_percentage" -JSON_NUMERIC_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", +JSON_NUMERIC_KEY_NAMES = ["JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", "JSON_EXON_FREQUENCY_KEY", "JSON_INTRON_FREQUENCY_KEY", "JSON_PROMOTERPROX_FREQUENCY_KEY", "JSON_INTERGENIC_FREQUENCY_KEY", "JSON_PROMOTERCORE_FREQUENCY_KEY", "JSON_PROMOTERPROX_PERCENTAGE_KEY", "JSON_EXON_PERCENTAGE_KEY", "JSON_INTRON_PERCENTAGE_KEY", "JSON_INTERGENIC_PERCENTAGE_KEY", "JSON_PROMOTERCORE_PERCENTAGE_KEY"] -JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY"] + JSON_NUMERIC_KEYS +JSON_NUMERIC_KEY_VALUES = [JSON_GC_CONTENT_KEY, JSON_REGIONS_NO_KEY, JSON_MEAN_ABS_TSS_DIST_KEY, + JSON_EXON_FREQUENCY_KEY, JSON_INTRON_FREQUENCY_KEY, JSON_PROMOTERPROX_FREQUENCY_KEY, + JSON_INTERGENIC_FREQUENCY_KEY, JSON_PROMOTERCORE_FREQUENCY_KEY, + JSON_PROMOTERPROX_PERCENTAGE_KEY, JSON_EXON_PERCENTAGE_KEY, JSON_INTRON_PERCENTAGE_KEY, + JSON_INTERGENIC_PERCENTAGE_KEY, JSON_PROMOTERCORE_PERCENTAGE_KEY] + +JSON_BEDSET_MEANS_KEY = "bedset_means" +JSON_BEDSET_SD_KEY = "bedset_standard_deviation" +JSON_BEDSET_TAR_PATH_KEY = "bedset_tar_archive_path" +JSON_BEDSET_BEDFILES_GD_STATS_KEY = "bedset_bedfiles_gd_stats" +JSON_BEDSET_IGD_DB_KEY = "bedset_igd_database_path" +JSON_BEDSET_GD_STATS = "bedset_gd_stats" +JSON_BEDSET_KEY_VALUES = [JSON_BEDSET_MEANS_KEY, JSON_BEDSET_SD_KEY, JSON_BEDSET_TAR_PATH_KEY, + JSON_BEDSET_BEDFILES_GD_STATS_KEY, JSON_BEDSET_IGD_DB_KEY, JSON_BEDSET_GD_STATS] +JSON_BEDSET_KEY_NAMES = ["JSON_BEDSET_MEANS_KEY", "JSON_BEDSET_SD_KEY", 
"JSON_BEDSET_TAR_PATH_KEY", + "JSON_BEDSET_BEDFILES_GD_STATS_KEY", "JSON_BEDSET_IGD_DB_KEY", "JSON_BEDSET_GD_STATS"] + +JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY"] + \ + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES _PERC_TXT = "Percentage of regions in " _FREQ_TXT = "Frequency of regions in " @@ -95,4 +113,5 @@ __all__ = ["BED_INDEX", "BEDSET_INDEX", "SEARCH_TERMS", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES", "JSON_DICTS_KEY_DESCS", - "JSON_KEYS", "JSON_NUMERIC_KEYS"] + CFG_KEYS + JSON_KEYS + "JSON_KEYS", "JSON_NUMERIC_KEY_VALUES", "JSON_NUMERIC_KEY_NAMES", "JSON_BEDSET_KEY_VALUES", + "JSON_BEDSET_KEY_NAMES"] + CFG_KEYS + JSON_KEYS From 42806b9f2c1a778f285297b0182166f1ab419788 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 13 Feb 2020 14:18:59 -0500 Subject: [PATCH 08/35] add bedsets affiliation key --- bbconf/const.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index 4cae581..689bf18 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -54,7 +54,7 @@ JSON_GC_CONTENT_KEY = "gc_content" JSON_ID_KEY = "id" JSON_REGIONS_NO_KEY = "regions_no" -JSON_MEAN_ABS_TSS_DIST_KEY = "mean_abs_TSS_dist" +JSON_MEAN_ABS_TSS_DIST_KEY = "mean_absolute_TSS_dist" JSON_MD5SUM_KEY = "md5sum" JSON_PLOTS_KEY = "plots" JSON_EXON_FREQUENCY_KEY = "exon_frequency" @@ -67,6 +67,7 @@ JSON_INTERGENIC_PERCENTAGE_KEY = "intergenic_percentage" JSON_PROMOTERCORE_PERCENTAGE_KEY = "promoterCore_percentage" JSON_PROMOTERPROX_PERCENTAGE_KEY = "promoterProx_percentage" +JSON_BEDSETS_AFFILIATION_KEY = "bedsets_affiliation" JSON_NUMERIC_KEY_NAMES = ["JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", "JSON_EXON_FREQUENCY_KEY", "JSON_INTRON_FREQUENCY_KEY", "JSON_PROMOTERPROX_FREQUENCY_KEY", @@ -91,7 +92,7 @@ JSON_BEDSET_KEY_NAMES = 
["JSON_BEDSET_MEANS_KEY", "JSON_BEDSET_SD_KEY", "JSON_BEDSET_TAR_PATH_KEY", "JSON_BEDSET_BEDFILES_GD_STATS_KEY", "JSON_BEDSET_IGD_DB_KEY", "JSON_BEDSET_GD_STATS"] -JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY"] + \ +JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY", "JSON_BEDSETS_AFFILIATION_KEY"] + \ JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES _PERC_TXT = "Percentage of regions in " @@ -108,7 +109,15 @@ JSON_PROMOTERCORE_FREQUENCY_KEY: _FREQ_TXT + "promoter core", JSON_EXON_FREQUENCY_KEY: _FREQ_TXT + "exons", JSON_INTRON_FREQUENCY_KEY: _FREQ_TXT + "introns", - JSON_INTERGENIC_FREQUENCY_KEY: _FREQ_TXT + "intergenic"} + JSON_INTERGENIC_FREQUENCY_KEY: _FREQ_TXT + "intergenic", + JSON_BEDSET_MEANS_KEY: "Average bedset statistics", + JSON_BEDSET_SD_KEY: "Standard deviation of bedset statistics", + JSON_BEDSET_TAR_PATH_KEY: "TAR archive", + JSON_BEDSET_BEDFILES_GD_STATS_KEY: "Individual bedfiles statistics CSV", + JSON_BEDSET_IGD_DB_KEY: "Bedset iGD database", + JSON_BEDSET_GD_STATS: "Bedset statistics CSV", + +} __all__ = ["BED_INDEX", "BEDSET_INDEX", "SEARCH_TERMS", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", From 871566d94fa3cdc3b11bb4b9d55e43fd93555915 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 13 Feb 2020 14:21:52 -0500 Subject: [PATCH 09/35] check for index existence before counting docs --- bbconf/bbconf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index 05bdf41..aa252eb 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -165,9 +165,12 @@ def _count_docs(self, index): Get the total number of the documents in a selected index :param str index: index to count the documents for - :return int: number of documents + :return int | None: number of documents """ self.assert_connection() + if not self[ES_CLIENT_KEY].indices.exists(index=index): + _LOGGER.warning("'{}' index 
does not exist".format(index)) + return None return int(self[ES_CLIENT_KEY].cat.count(index, params={"format": "json"})[0]['count']) def count_bedfiles_docs(self): From e29a5575f355896766751afd88f5ad7b20fabd04 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Feb 2020 10:45:56 -0500 Subject: [PATCH 10/35] multiple changes: - update output const name an val - check for index existence prior to searching --- bbconf/bbconf.py | 10 +++++++--- bbconf/const.py | 8 +++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index aa252eb..7c23295 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -31,8 +31,8 @@ def _raise_missing_key(key): # if there's nothing under path key (None) self[CFG_PATH_KEY] = PXAM() - if CFG_BEDSTAT_OUTPUT_KEY not in self[CFG_PATH_KEY]: - _raise_missing_key(CFG_BEDSTAT_OUTPUT_KEY) + if CFG_PIP_OUTPUT_KEY not in self[CFG_PATH_KEY]: + _raise_missing_key(CFG_PIP_OUTPUT_KEY) for section, mapping in DEFAULT_SECTION_VALUES.items(): if section not in self: @@ -74,9 +74,13 @@ def _search_index(self, index_name, query, just_data=True, size=None, **kwargs): :param dict query: query to search the DB against :param bool just_data: whether just the hits should be returned :param int size: number of hits to return, all are returned by default - :return dict | Iterable[dict]: search results + :return dict | Iterable[dict] | NoneType: search results + or None if requested index does not exist """ self.assert_connection() + if not self[ES_CLIENT_KEY].indices.exists(index_name): + _LOGGER.warning("'{}' index does not exist".format(index_name)) + return _LOGGER.debug("Searching index: {}\nQuery: {}".format(index_name, query)) query = {"query": query} if "query" not in query else query size = size or self._count_docs(index=index_name) diff --git a/bbconf/const.py b/bbconf/const.py index 689bf18..e89f443 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -28,12 +28,12 @@ CFG_DATABASE_KEY = 
"database" CFG_HOST_KEY = "host" CFG_PORT_KEY = "port" -CFG_BEDSTAT_OUTPUT_KEY = "bedstat_output" +CFG_PIP_OUTPUT_KEY = "pipelines_output" CFG_BED_INDEX_KEY = "bed_index" CFG_BEDSET_INDEX_KEY = "bedset_index" CFG_KEYS = ["CFG_PATH_KEY", "CFG_SERVER_KEY", "CFG_DATABASE_KEY", "CFG_HOST_KEY", - "CFG_PORT_KEY", "CFG_BEDSTAT_OUTPUT_KEY", "CFG_BED_INDEX_KEY", "CFG_BEDSET_INDEX_KEY"] + "CFG_PORT_KEY", "CFG_PIP_OUTPUT_KEY", "CFG_BED_INDEX_KEY", "CFG_BEDSET_INDEX_KEY"] DEFAULT_SECTION_VALUES = { CFG_DATABASE_KEY: { @@ -115,9 +115,7 @@ JSON_BEDSET_TAR_PATH_KEY: "TAR archive", JSON_BEDSET_BEDFILES_GD_STATS_KEY: "Individual bedfiles statistics CSV", JSON_BEDSET_IGD_DB_KEY: "Bedset iGD database", - JSON_BEDSET_GD_STATS: "Bedset statistics CSV", - -} + JSON_BEDSET_GD_STATS: "Bedset statistics CSV"} __all__ = ["BED_INDEX", "BEDSET_INDEX", "SEARCH_TERMS", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", From ac4c4be1d70f3934dc1435136fbc440b511748ce Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Feb 2020 12:37:21 -0500 Subject: [PATCH 11/35] implement doc id assiging possibility --- bbconf/bbconf.py | 28 +++++++++++++++++++++------- bbconf/const.py | 4 +++- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index 7c23295..4ffe6d9 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -108,34 +108,37 @@ def search_bedsets(self, query, just_data=True, **kwargs): """ return self._search_index(index_name=BEDSET_INDEX, query=query, just_data=just_data, **kwargs) - def _insert_data(self, index, data, **kwargs): + def _insert_data(self, index, data, doc_id, **kwargs): """ Insert data to an index in a Elasticsearch DB or create it and the insert in case it does not exist :param str index: name of the index to insert the data into + :param str doc_id: unique identifier for the document :param dict data: data to insert """ self.assert_connection() - 
self[ES_CLIENT_KEY].index(index=index, body=data, **kwargs) + self[ES_CLIENT_KEY].index(index=index, body=data, id=doc_id, **kwargs) - def insert_bedfiles_data(self, data, **kwargs): + def insert_bedfiles_data(self, data, doc_id=None, **kwargs): """ Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist :param dict data: data to insert + :param str doc_id: unique identifier for the document, optional """ - self._insert_data(index=BED_INDEX, data=data, **kwargs) + self._insert_data(index=BED_INDEX, data=data, doc_id=doc_id, **kwargs) - def insert_bedsets_data(self, data, **kwargs): + def insert_bedsets_data(self, data, doc_id=None, **kwargs): """ Insert data to the bedset index in a Elasticsearch DB or create it and the insert in case it does not exist :param dict data: data to insert + :param str doc_id: unique identifier for the document, optional """ - self._insert_data(index=BEDSET_INDEX, data=data, **kwargs) + self._insert_data(index=BEDSET_INDEX, data=data, doc_id=doc_id, **kwargs) def _get_mapping(self, index, just_data=True, **kwargs): """ @@ -193,12 +196,23 @@ def count_bedsets_docs(self): """ return self._count_docs(index=BEDSET_INDEX) + def _get_all(self, index_name, just_data=False): + """ + Convenience method for index exploration + + :param str index_name: name of the Elasticsearch index to search + :param bool just_data: whether just the hits should be returned + :return: + """ + self.assert_connection() + return self._search_index(index_name=index_name, query=QUERY_ALL, just_data=just_data) + def get_bedbase_cfg(cfg=None): """ Determine path to the bedbase configuration file - The path can be either excplicitly provided + The path can be either explicitly provided or read from a $BEDBASE environment variable :param str cfg: path to the config file. 
diff --git a/bbconf/const.py b/bbconf/const.py index e89f443..7f199e7 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -22,6 +22,8 @@ ES_CLIENT_KEY = "elasticsearch_client" +QUERY_ALL = {"match_all": {}} + # config file constants CFG_PATH_KEY = "path" CFG_SERVER_KEY = "server" @@ -121,4 +123,4 @@ "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES", "JSON_DICTS_KEY_DESCS", "JSON_KEYS", "JSON_NUMERIC_KEY_VALUES", "JSON_NUMERIC_KEY_NAMES", "JSON_BEDSET_KEY_VALUES", - "JSON_BEDSET_KEY_NAMES"] + CFG_KEYS + JSON_KEYS + "JSON_BEDSET_KEY_NAMES", "QUERY_ALL"] + CFG_KEYS + JSON_KEYS From 700ea8c5d906e0692bb76e12d4a2ad4ce6b6e99c Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Feb 2020 12:57:21 -0500 Subject: [PATCH 12/35] implement more sophisticated doc inserting logic. Prevents overwriting see https://github.com/databio/bedbuncher/issues/3 --- bbconf/bbconf.py | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index 4ffe6d9..7ff7310 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -1,4 +1,5 @@ from elasticsearch import Elasticsearch +from elasticsearch.exceptions import ConflictError from logging import getLogger from attmap import PathExAttMap as PXAM @@ -108,23 +109,48 @@ def search_bedsets(self, query, just_data=True, **kwargs): """ return self._search_index(index_name=BEDSET_INDEX, query=query, just_data=just_data, **kwargs) - def _insert_data(self, index, data, doc_id, **kwargs): + def _insert_data(self, index, data, doc_id, force_update=False, **kwargs): """ - Insert data to an index in a Elasticsearch DB - or create it and the insert in case it does not exist + Insert document to an index in a Elasticsearch DB + or create it and the insert in case it does not exist. + + Document ID argument is optional. If not provided, a random ID will be assigned. 
+ If provided the document will be inserted only if no documents with this ID are present in the DB. + However, the document overwriting can be forced if needed. :param str index: name of the index to insert the data into :param str doc_id: unique identifier for the document + :param bool force_update: whether the pre-existing document should be overwritten :param dict data: data to insert """ self.assert_connection() - self[ES_CLIENT_KEY].index(index=index, body=data, id=doc_id, **kwargs) + if doc_id is None: + _LOGGER.info("Inserting document to index '{}' with an " + "automatically-assigned ID".format(index)) + self[ES_CLIENT_KEY].index(index=index, body=data, **kwargs) + else: + try: + self[ES_CLIENT_KEY].create(index=index, body=data, id=doc_id, **kwargs) + except ConflictError: + msg_base = "Document '{}' already exists in index '{}'"\ + .format(doc_id, index) + if force_update: + _LOGGER.info(msg_base + ". Forcing update") + self[ES_CLIENT_KEY].index(index=index, body=data, id=doc_id, **kwargs) + else: + _LOGGER.error("Could not insert data. " + msg_base) + raise def insert_bedfiles_data(self, data, doc_id=None, **kwargs): """ Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist + Document ID argument is optional. If not provided, a random ID will be assigned. + If provided the document will be inserted only if no documents with this ID are present in the DB. + However, the document overwriting can be forced if needed. + + :param dict data: data to insert :param str doc_id: unique identifier for the document, optional """ @@ -135,6 +161,11 @@ def insert_bedsets_data(self, data, doc_id=None, **kwargs): Insert data to the bedset index in a Elasticsearch DB or create it and the insert in case it does not exist + Document ID argument is optional. If not provided, a random ID will be assigned. + If provided the document will be inserted only if no documents with this ID are present in the DB. 
+ However, the document overwriting can be forced if needed. + + :param dict data: data to insert :param str doc_id: unique identifier for the document, optional """ From bc4b8aa04180ed879a7e9b5fccef0e4d27e9f0ff Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Feb 2020 12:59:30 -0500 Subject: [PATCH 13/35] update API docs --- docs/bbc_api.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/bbc_api.md b/docs/bbc_api.md index 62d0d70..1d39f72 100644 --- a/docs/bbc_api.md +++ b/docs/bbc_api.md @@ -107,31 +107,33 @@ Get mapping definitions for the bedsets index ```python -def insert_bedfiles_data(self, data, **kwargs) +def insert_bedfiles_data(self, data, doc_id=None, **kwargs) ``` Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist #### Parameters: - `data` (`dict`): data to insert +- `doc_id` (`str`): unique identifier for the document, optional ```python -def insert_bedsets_data(self, data, **kwargs) +def insert_bedsets_data(self, data, doc_id=None, **kwargs) ``` Insert data to the bedset index in a Elasticsearch DB or create it and the insert in case it does not exist #### Parameters: - `data` (`dict`): data to insert +- `doc_id` (`str`): unique identifier for the document, optional ```python -def search_bedfiles(self, query, just_data=True) +def search_bedfiles(self, query, just_data=True, **kwargs) ``` Search selected Elasticsearch bedset index with selected query @@ -149,7 +151,7 @@ Search selected Elasticsearch bedset index with selected query ```python -def search_bedsets(self, query, just_data=True) +def search_bedsets(self, query, just_data=True, **kwargs) ``` Search selected Elasticsearch bedfiles index with selected query @@ -182,10 +184,13 @@ Return writability flag or None if not set def get_bedbase_cfg(cfg=None) ``` -Read and create the bedbase configuration object +Determine path to the bedbase configuration file + +The path can be either 
explicitly provided +or read from a $BEDBASE environment variable #### Parameters: -- `cfg` (`str`): path to the config file.Optional, the bedbase config env var will be used if not provided +- `cfg` (`str`): path to the config file.Optional, the $BEDBASE config env var will be used if not provided #### Returns: @@ -198,4 +203,4 @@ Read and create the bedbase configuration object -*Version Information: `bbconf` v0.0.1, generated by `lucidoc` v0.4.2* +*Version Information: `bbconf` v0.0.2-dev, generated by `lucidoc` v0.4.2* From 45ba5394bc904821a47eb82dd7089efa8fd6ad6f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Feb 2020 13:02:09 -0500 Subject: [PATCH 14/35] correct argstring so it's correctly parsed by lucidoc --- bbconf/bbconf.py | 2 -- docs/bbc_api.md | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index 7ff7310..2ffb57a 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -113,7 +113,6 @@ def _insert_data(self, index, data, doc_id, force_update=False, **kwargs): """ Insert document to an index in a Elasticsearch DB or create it and the insert in case it does not exist. - Document ID argument is optional. If not provided, a random ID will be assigned. If provided the document will be inserted only if no documents with this ID are present in the DB. However, the document overwriting can be forced if needed. @@ -145,7 +144,6 @@ def insert_bedfiles_data(self, data, doc_id=None, **kwargs): """ Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist - Document ID argument is optional. If not provided, a random ID will be assigned. If provided the document will be inserted only if no documents with this ID are present in the DB. However, the document overwriting can be forced if needed. 
diff --git a/docs/bbc_api.md b/docs/bbc_api.md index 1d39f72..e816d74 100644 --- a/docs/bbc_api.md +++ b/docs/bbc_api.md @@ -1,3 +1,33 @@ +Final targets: BedBaseConf, get_bedbase_cfg + + + + + # Package `bbconf` Documentation ## Class `BedBaseConf` @@ -110,7 +140,7 @@ Get mapping definitions for the bedsets index def insert_bedfiles_data(self, data, doc_id=None, **kwargs) ``` -Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist +Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist Document ID argument is optional. If not provided, a random ID will be assigned. If provided the document will be inserted only if no documents with this ID are present in the DB. However, the document overwriting can be forced if needed. #### Parameters: - `data` (`dict`): data to insert @@ -124,6 +154,10 @@ def insert_bedsets_data(self, data, doc_id=None, **kwargs) ``` Insert data to the bedset index in a Elasticsearch DB or create it and the insert in case it does not exist + +Document ID argument is optional. If not provided, a random ID will be assigned. +If provided the document will be inserted only if no documents with this ID are present in the DB. +However, the document overwriting can be forced if needed. #### Parameters: - `data` (`dict`): data to insert From 4da3c0ccceff6404e0eb8c015e0f23a3f16b6d8e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Feb 2020 13:03:50 -0500 Subject: [PATCH 15/35] update docs --- bbconf/bbconf.py | 6 ++++-- docs/bbc_api.md | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index 2ffb57a..7b291f6 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -113,6 +113,7 @@ def _insert_data(self, index, data, doc_id, force_update=False, **kwargs): """ Insert document to an index in a Elasticsearch DB or create it and the insert in case it does not exist. 
+ Document ID argument is optional. If not provided, a random ID will be assigned. If provided the document will be inserted only if no documents with this ID are present in the DB. However, the document overwriting can be forced if needed. @@ -143,7 +144,8 @@ def _insert_data(self, index, data, doc_id, force_update=False, **kwargs): def insert_bedfiles_data(self, data, doc_id=None, **kwargs): """ Insert data to the bedfile index a Elasticsearch DB - or create it and the insert in case it does not exist + or create it and the insert in case it does not exist. + Document ID argument is optional. If not provided, a random ID will be assigned. If provided the document will be inserted only if no documents with this ID are present in the DB. However, the document overwriting can be forced if needed. @@ -157,7 +159,7 @@ def insert_bedfiles_data(self, data, doc_id=None, **kwargs): def insert_bedsets_data(self, data, doc_id=None, **kwargs): """ Insert data to the bedset index in a Elasticsearch DB - or create it and the insert in case it does not exist + or create it and the insert in case it does not exist. Document ID argument is optional. If not provided, a random ID will be assigned. If provided the document will be inserted only if no documents with this ID are present in the DB. diff --git a/docs/bbc_api.md b/docs/bbc_api.md index e816d74..81b2cec 100644 --- a/docs/bbc_api.md +++ b/docs/bbc_api.md @@ -140,7 +140,11 @@ Get mapping definitions for the bedsets index def insert_bedfiles_data(self, data, doc_id=None, **kwargs) ``` -Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist Document ID argument is optional. If not provided, a random ID will be assigned. If provided the document will be inserted only if no documents with this ID are present in the DB. However, the document overwriting can be forced if needed. 
+Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist. + +Document ID argument is optional. If not provided, a random ID will be assigned. +If provided the document will be inserted only if no documents with this ID are present in the DB. +However, the document overwriting can be forced if needed. #### Parameters: - `data` (`dict`): data to insert @@ -153,7 +157,7 @@ Insert data to the bedfile index a Elasticsearch DB or create it and the insert def insert_bedsets_data(self, data, doc_id=None, **kwargs) ``` -Insert data to the bedset index in a Elasticsearch DB or create it and the insert in case it does not exist +Insert data to the bedset index in a Elasticsearch DB or create it and the insert in case it does not exist. Document ID argument is optional. If not provided, a random ID will be assigned. If provided the document will be inserted only if no documents with this ID are present in the DB. From 4687b7cf766680a5b98f4f93cec1651f2e6c76b1 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 25 Feb 2020 07:05:28 -0500 Subject: [PATCH 16/35] add consts --- bbconf/const.py | 98 ++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index 7f199e7..eec8225 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -9,9 +9,6 @@ CFG_ENV_VARS = ["BEDBASE"] -SEARCH_TERMS = ['cellType', 'cellTypeSubtype', 'antibody', 'mappingGenome', - 'description', 'tissue', 'species', 'protocol', 'genome'] - RAW_BEDFILE_KEY = "raw_bedfile" BEDFILE_PATH_KEY = "bedfile_path" @@ -34,8 +31,9 @@ CFG_BED_INDEX_KEY = "bed_index" CFG_BEDSET_INDEX_KEY = "bedset_index" -CFG_KEYS = ["CFG_PATH_KEY", "CFG_SERVER_KEY", "CFG_DATABASE_KEY", "CFG_HOST_KEY", - "CFG_PORT_KEY", "CFG_PIP_OUTPUT_KEY", "CFG_BED_INDEX_KEY", "CFG_BEDSET_INDEX_KEY"] +CFG_KEYS = [ + "CFG_PATH_KEY", "CFG_SERVER_KEY", "CFG_DATABASE_KEY", "CFG_HOST_KEY", + "CFG_PORT_KEY", 
"CFG_PIP_OUTPUT_KEY", "CFG_BED_INDEX_KEY", "CFG_BEDSET_INDEX_KEY"] DEFAULT_SECTION_VALUES = { CFG_DATABASE_KEY: { @@ -69,19 +67,22 @@ JSON_INTERGENIC_PERCENTAGE_KEY = "intergenic_percentage" JSON_PROMOTERCORE_PERCENTAGE_KEY = "promoterCore_percentage" JSON_PROMOTERPROX_PERCENTAGE_KEY = "promoterProx_percentage" -JSON_BEDSETS_AFFILIATION_KEY = "bedsets_affiliation" - -JSON_NUMERIC_KEY_NAMES = ["JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", - "JSON_EXON_FREQUENCY_KEY", "JSON_INTRON_FREQUENCY_KEY", "JSON_PROMOTERPROX_FREQUENCY_KEY", - "JSON_INTERGENIC_FREQUENCY_KEY", "JSON_PROMOTERCORE_FREQUENCY_KEY", - "JSON_PROMOTERPROX_PERCENTAGE_KEY", "JSON_EXON_PERCENTAGE_KEY", "JSON_INTRON_PERCENTAGE_KEY", - "JSON_INTERGENIC_PERCENTAGE_KEY", "JSON_PROMOTERCORE_PERCENTAGE_KEY"] - -JSON_NUMERIC_KEY_VALUES = [JSON_GC_CONTENT_KEY, JSON_REGIONS_NO_KEY, JSON_MEAN_ABS_TSS_DIST_KEY, - JSON_EXON_FREQUENCY_KEY, JSON_INTRON_FREQUENCY_KEY, JSON_PROMOTERPROX_FREQUENCY_KEY, - JSON_INTERGENIC_FREQUENCY_KEY, JSON_PROMOTERCORE_FREQUENCY_KEY, - JSON_PROMOTERPROX_PERCENTAGE_KEY, JSON_EXON_PERCENTAGE_KEY, JSON_INTRON_PERCENTAGE_KEY, - JSON_INTERGENIC_PERCENTAGE_KEY, JSON_PROMOTERCORE_PERCENTAGE_KEY] +JSON_BEDSET_PEP_KEY = "bedset_pep" +JSON_BEDSET_BED_IDS = "bedset_bed_ids" + +JSON_NUMERIC_KEY_NAMES = [ + "JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", + "JSON_EXON_FREQUENCY_KEY", "JSON_INTRON_FREQUENCY_KEY", "JSON_PROMOTERPROX_FREQUENCY_KEY", + "JSON_INTERGENIC_FREQUENCY_KEY", "JSON_PROMOTERCORE_FREQUENCY_KEY", + "JSON_PROMOTERPROX_PERCENTAGE_KEY", "JSON_EXON_PERCENTAGE_KEY", "JSON_INTRON_PERCENTAGE_KEY", + "JSON_INTERGENIC_PERCENTAGE_KEY", "JSON_PROMOTERCORE_PERCENTAGE_KEY"] + +JSON_NUMERIC_KEY_VALUES = [ + JSON_GC_CONTENT_KEY, JSON_REGIONS_NO_KEY, JSON_MEAN_ABS_TSS_DIST_KEY, + JSON_EXON_FREQUENCY_KEY, JSON_INTRON_FREQUENCY_KEY, JSON_PROMOTERPROX_FREQUENCY_KEY, + JSON_INTERGENIC_FREQUENCY_KEY, JSON_PROMOTERCORE_FREQUENCY_KEY, + 
JSON_PROMOTERPROX_PERCENTAGE_KEY, JSON_EXON_PERCENTAGE_KEY, JSON_INTRON_PERCENTAGE_KEY, + JSON_INTERGENIC_PERCENTAGE_KEY, JSON_PROMOTERCORE_PERCENTAGE_KEY] JSON_BEDSET_MEANS_KEY = "bedset_means" JSON_BEDSET_SD_KEY = "bedset_standard_deviation" @@ -89,37 +90,44 @@ JSON_BEDSET_BEDFILES_GD_STATS_KEY = "bedset_bedfiles_gd_stats" JSON_BEDSET_IGD_DB_KEY = "bedset_igd_database_path" JSON_BEDSET_GD_STATS = "bedset_gd_stats" -JSON_BEDSET_KEY_VALUES = [JSON_BEDSET_MEANS_KEY, JSON_BEDSET_SD_KEY, JSON_BEDSET_TAR_PATH_KEY, - JSON_BEDSET_BEDFILES_GD_STATS_KEY, JSON_BEDSET_IGD_DB_KEY, JSON_BEDSET_GD_STATS] -JSON_BEDSET_KEY_NAMES = ["JSON_BEDSET_MEANS_KEY", "JSON_BEDSET_SD_KEY", "JSON_BEDSET_TAR_PATH_KEY", - "JSON_BEDSET_BEDFILES_GD_STATS_KEY", "JSON_BEDSET_IGD_DB_KEY", "JSON_BEDSET_GD_STATS"] - -JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY", "JSON_BEDSETS_AFFILIATION_KEY"] + \ +JSON_BEDSET_KEY_VALUES = [ + JSON_BEDSET_MEANS_KEY, JSON_BEDSET_SD_KEY, JSON_BEDSET_TAR_PATH_KEY, + JSON_BEDSET_BEDFILES_GD_STATS_KEY, JSON_BEDSET_IGD_DB_KEY, JSON_BEDSET_GD_STATS] +JSON_BEDSET_KEY_NAMES = [ + "JSON_BEDSET_MEANS_KEY", "JSON_BEDSET_SD_KEY", "JSON_BEDSET_TAR_PATH_KEY", + "JSON_BEDSET_BEDFILES_GD_STATS_KEY", "JSON_BEDSET_IGD_DB_KEY", + "JSON_BEDSET_GD_STATS", "JSON_BEDSET_PEP_KEY"] + +JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY"] + \ JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES _PERC_TXT = "Percentage of regions in " _FREQ_TXT = "Frequency of regions in " -JSON_DICTS_KEY_DESCS = {JSON_GC_CONTENT_KEY: "GC content", JSON_ID_KEY: "BED file ID", - JSON_REGIONS_NO_KEY: "Number of regions", JSON_MD5SUM_KEY: "BED file md5 checksum", - JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites", - JSON_PROMOTERPROX_PERCENTAGE_KEY: _PERC_TXT + "promoter proximity", - JSON_PROMOTERCORE_PERCENTAGE_KEY: _PERC_TXT + "promoter core", - JSON_EXON_PERCENTAGE_KEY: _PERC_TXT + "exons", - JSON_INTRON_PERCENTAGE_KEY: _PERC_TXT + "introns", - 
JSON_INTERGENIC_PERCENTAGE_KEY: _PERC_TXT + "intergenic", - JSON_PROMOTERPROX_FREQUENCY_KEY: _FREQ_TXT + "promoter proximity", - JSON_PROMOTERCORE_FREQUENCY_KEY: _FREQ_TXT + "promoter core", - JSON_EXON_FREQUENCY_KEY: _FREQ_TXT + "exons", - JSON_INTRON_FREQUENCY_KEY: _FREQ_TXT + "introns", - JSON_INTERGENIC_FREQUENCY_KEY: _FREQ_TXT + "intergenic", - JSON_BEDSET_MEANS_KEY: "Average bedset statistics", - JSON_BEDSET_SD_KEY: "Standard deviation of bedset statistics", - JSON_BEDSET_TAR_PATH_KEY: "TAR archive", - JSON_BEDSET_BEDFILES_GD_STATS_KEY: "Individual bedfiles statistics CSV", - JSON_BEDSET_IGD_DB_KEY: "Bedset iGD database", - JSON_BEDSET_GD_STATS: "Bedset statistics CSV"} - -__all__ = ["BED_INDEX", "BEDSET_INDEX", "SEARCH_TERMS", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", +JSON_DICTS_KEY_DESCS = { + JSON_GC_CONTENT_KEY: "GC content", JSON_ID_KEY: "BED file ID", + JSON_REGIONS_NO_KEY: "Number of regions", JSON_MD5SUM_KEY: "BED file md5 checksum", + JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites", + JSON_PROMOTERPROX_PERCENTAGE_KEY: _PERC_TXT + "promoter proximity", + JSON_PROMOTERCORE_PERCENTAGE_KEY: _PERC_TXT + "promoter core", + JSON_EXON_PERCENTAGE_KEY: _PERC_TXT + "exons", + JSON_INTRON_PERCENTAGE_KEY: _PERC_TXT + "introns", + JSON_INTERGENIC_PERCENTAGE_KEY: _PERC_TXT + "intergenic", + JSON_PROMOTERPROX_FREQUENCY_KEY: _FREQ_TXT + "promoter proximity", + JSON_PROMOTERCORE_FREQUENCY_KEY: _FREQ_TXT + "promoter core", + JSON_EXON_FREQUENCY_KEY: _FREQ_TXT + "exons", + JSON_INTRON_FREQUENCY_KEY: _FREQ_TXT + "introns", + JSON_INTERGENIC_FREQUENCY_KEY: _FREQ_TXT + "intergenic", + JSON_BEDSET_MEANS_KEY: "Average bedset statistics", + JSON_BEDSET_SD_KEY: "Standard deviation of bedset statistics", + JSON_BEDSET_TAR_PATH_KEY: "TAR archive", + JSON_BEDSET_BEDFILES_GD_STATS_KEY: "Individual bedfiles statistics CSV", + JSON_BEDSET_IGD_DB_KEY: "Bedset iGD database", + JSON_BEDSET_GD_STATS: "Bedset statistics CSV", + JSON_BEDSET_PEP_KEY: "Beset 
PEP", + JSON_BEDSET_BED_IDS: "BED files in this set" +} + +__all__ = ["BED_INDEX", "BEDSET_INDEX", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES", "JSON_DICTS_KEY_DESCS", "JSON_KEYS", "JSON_NUMERIC_KEY_VALUES", "JSON_NUMERIC_KEY_NAMES", "JSON_BEDSET_KEY_VALUES", From 32c6ebc488508061773a317827dd70de29e02cae Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 25 Feb 2020 10:43:06 -0500 Subject: [PATCH 17/35] update key const names --- bbconf/const.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index eec8225..19277b2 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -68,7 +68,7 @@ JSON_PROMOTERCORE_PERCENTAGE_KEY = "promoterCore_percentage" JSON_PROMOTERPROX_PERCENTAGE_KEY = "promoterProx_percentage" JSON_BEDSET_PEP_KEY = "bedset_pep" -JSON_BEDSET_BED_IDS = "bedset_bed_ids" +JSON_BEDSET_BED_IDS_KEY = "bedset_bed_ids" JSON_NUMERIC_KEY_NAMES = [ "JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", @@ -89,14 +89,14 @@ JSON_BEDSET_TAR_PATH_KEY = "bedset_tar_archive_path" JSON_BEDSET_BEDFILES_GD_STATS_KEY = "bedset_bedfiles_gd_stats" JSON_BEDSET_IGD_DB_KEY = "bedset_igd_database_path" -JSON_BEDSET_GD_STATS = "bedset_gd_stats" +JSON_BEDSET_GD_STATS_KEY = "bedset_gd_stats" JSON_BEDSET_KEY_VALUES = [ JSON_BEDSET_MEANS_KEY, JSON_BEDSET_SD_KEY, JSON_BEDSET_TAR_PATH_KEY, - JSON_BEDSET_BEDFILES_GD_STATS_KEY, JSON_BEDSET_IGD_DB_KEY, JSON_BEDSET_GD_STATS] + JSON_BEDSET_BEDFILES_GD_STATS_KEY, JSON_BEDSET_IGD_DB_KEY, JSON_BEDSET_GD_STATS_KEY] JSON_BEDSET_KEY_NAMES = [ "JSON_BEDSET_MEANS_KEY", "JSON_BEDSET_SD_KEY", "JSON_BEDSET_TAR_PATH_KEY", "JSON_BEDSET_BEDFILES_GD_STATS_KEY", "JSON_BEDSET_IGD_DB_KEY", - "JSON_BEDSET_GD_STATS", "JSON_BEDSET_PEP_KEY"] + "JSON_BEDSET_GD_STATS_KEY", "JSON_BEDSET_PEP_KEY", "JSON_BEDSET_BED_IDS_KEY"] JSON_KEYS = 
["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY"] + \ JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES @@ -122,9 +122,9 @@ JSON_BEDSET_TAR_PATH_KEY: "TAR archive", JSON_BEDSET_BEDFILES_GD_STATS_KEY: "Individual bedfiles statistics CSV", JSON_BEDSET_IGD_DB_KEY: "Bedset iGD database", - JSON_BEDSET_GD_STATS: "Bedset statistics CSV", + JSON_BEDSET_GD_STATS_KEY: "Bedset statistics CSV", JSON_BEDSET_PEP_KEY: "Beset PEP", - JSON_BEDSET_BED_IDS: "BED files in this set" + JSON_BEDSET_BED_IDS_KEY: "BED files in this set" } __all__ = ["BED_INDEX", "BEDSET_INDEX", "RAW_BEDFILE_KEY", "CFG_ENV_VARS", From 2c507d96db7c9fe29fc1497ca8267090990dd01e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 25 Feb 2020 12:01:44 -0500 Subject: [PATCH 18/35] export md5 key --- bbconf/const.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index 19277b2..e6b486b 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -98,8 +98,8 @@ "JSON_BEDSET_BEDFILES_GD_STATS_KEY", "JSON_BEDSET_IGD_DB_KEY", "JSON_BEDSET_GD_STATS_KEY", "JSON_BEDSET_PEP_KEY", "JSON_BEDSET_BED_IDS_KEY"] -JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY"] + \ - JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES +JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY", + "JSON_MD5SUM_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES _PERC_TXT = "Percentage of regions in " _FREQ_TXT = "Frequency of regions in " From 862c2473ed16cb691d4b967af7062821e0c6d262 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 26 Feb 2020 06:42:27 -0500 Subject: [PATCH 19/35] implement documents retrieval by ID --- bbconf/bbconf.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index 7b291f6..079a7fa 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -150,7 +150,6 @@ def insert_bedfiles_data(self, data, doc_id=None, **kwargs): If provided the document 
will be inserted only if no documents with this ID are present in the DB. However, the document overwriting can be forced if needed. - :param dict data: data to insert :param str doc_id: unique identifier for the document, optional """ @@ -165,7 +164,6 @@ def insert_bedsets_data(self, data, doc_id=None, **kwargs): If provided the document will be inserted only if no documents with this ID are present in the DB. However, the document overwriting can be forced if needed. - :param dict data: data to insert :param str doc_id: unique identifier for the document, optional """ @@ -198,6 +196,34 @@ def get_bedsets_mapping(self, just_data=True, **kwargs): """ return self._get_mapping(index=BEDSET_INDEX, just_data=just_data, **kwargs) + def _get_doc(self, index, doc_id): + """ + Get a document from an index by its ID + + :param str index: name of the index to search + :param str doc_id: document ID to return + :return Mapping: matched document + """ + return self[ES_CLIENT_KEY].get(index=index, id=doc_id) + + def get_bedfiles_doc(self, doc_id): + """ + Get a document from bedfiles index by its ID + + :param str doc_id: document ID to return + :return Mapping: matched document + """ + return self._get_doc(index=BED_INDEX, doc_id=doc_id) + + def get_bedsets_doc(self, doc_id): + """ + Get a document from bedsets index by its ID + + :param str doc_id: document ID to return + :return Mapping: matched document + """ + return self._get_doc(index=BEDSET_INDEX, doc_id=doc_id) + def _count_docs(self, index): """ Get the total number of the documents in a selected index From ba3492f07965fdecd13d738357006cf69e6e7660 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 27 Feb 2020 13:36:06 -0500 Subject: [PATCH 20/35] add consts required by metadata in bedstat --- bbconf/const.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bbconf/const.py b/bbconf/const.py index e6b486b..ac5f66e 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -70,6 +70,13 @@ 
JSON_BEDSET_PEP_KEY = "bedset_pep" JSON_BEDSET_BED_IDS_KEY = "bedset_bed_ids" +JSON_METADATA = ["genome", "exp_protocol", "cell_type", "antibody", "treatment", + "data_source"] \ + + JSON_ID_KEY + JSON_MD5SUM_KEY + JSON_PLOTS_KEY + BEDFILE_PATH_KEY + +JSON_STATS_SECTION_KEY = "statistics" +JSON_METADATA_SECTION_KEY = "metadata" + JSON_NUMERIC_KEY_NAMES = [ "JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", "JSON_EXON_FREQUENCY_KEY", "JSON_INTRON_FREQUENCY_KEY", "JSON_PROMOTERPROX_FREQUENCY_KEY", @@ -99,7 +106,7 @@ "JSON_BEDSET_GD_STATS_KEY", "JSON_BEDSET_PEP_KEY", "JSON_BEDSET_BED_IDS_KEY"] JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY", - "JSON_MD5SUM_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES + "JSON_MD5SUM_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES + JSON_METADATA _PERC_TXT = "Percentage of regions in " _FREQ_TXT = "Frequency of regions in " From 4d01ecf30ee7aee68d4056600180cb3bd0e75e21 Mon Sep 17 00:00:00 2001 From: "Jose E. Verdezoto Mosquera" Date: Thu, 27 Feb 2020 14:39:38 -0500 Subject: [PATCH 21/35] Update metadata attributes --- bbconf/const.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index ac5f66e..4fe90b5 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -70,8 +70,8 @@ JSON_BEDSET_PEP_KEY = "bedset_pep" JSON_BEDSET_BED_IDS_KEY = "bedset_bed_ids" -JSON_METADATA = ["genome", "exp_protocol", "cell_type", "antibody", "treatment", - "data_source"] \ +JSON_METADATA = ["genome", "exp_protocol", "cell_type", "tissue", "antibody", "treatment", + "data_source", "description"] \ + JSON_ID_KEY + JSON_MD5SUM_KEY + JSON_PLOTS_KEY + BEDFILE_PATH_KEY JSON_STATS_SECTION_KEY = "statistics" From 97a73420066b00ad2d20d1f6b186c52c8c199f9d Mon Sep 17 00:00:00 2001 From: "Jose E. 
Verdezoto Mosquera" Date: Thu, 27 Feb 2020 15:49:28 -0500 Subject: [PATCH 22/35] concat list instead of strings in meta attr --- bbconf/const.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbconf/const.py b/bbconf/const.py index 4fe90b5..5374de0 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -72,7 +72,7 @@ JSON_METADATA = ["genome", "exp_protocol", "cell_type", "tissue", "antibody", "treatment", "data_source", "description"] \ - + JSON_ID_KEY + JSON_MD5SUM_KEY + JSON_PLOTS_KEY + BEDFILE_PATH_KEY + + [JSON_ID_KEY, JSON_MD5SUM_KEY, JSON_PLOTS_KEY, BEDFILE_PATH_KEY] JSON_STATS_SECTION_KEY = "statistics" JSON_METADATA_SECTION_KEY = "metadata" From 00ba24bfc513ec9d1ac7ff49ad086d5cfa3e7097 Mon Sep 17 00:00:00 2001 From: "Jose E. Verdezoto Mosquera" Date: Thu, 27 Feb 2020 16:07:11 -0500 Subject: [PATCH 23/35] add JSON_METADATA to all section --- bbconf/const.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbconf/const.py b/bbconf/const.py index 5374de0..de33aed 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -138,4 +138,4 @@ "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES", "JSON_DICTS_KEY_DESCS", "JSON_KEYS", "JSON_NUMERIC_KEY_VALUES", "JSON_NUMERIC_KEY_NAMES", "JSON_BEDSET_KEY_VALUES", - "JSON_BEDSET_KEY_NAMES", "QUERY_ALL"] + CFG_KEYS + JSON_KEYS + "JSON_BEDSET_KEY_NAMES", "QUERY_ALL", "JSON_METADATA"] + CFG_KEYS + JSON_KEYS From 316ab8e190dc9abe25adb11b7c415d8e86eb421a Mon Sep 17 00:00:00 2001 From: "Jose E. 
Verdezoto Mosquera" Date: Fri, 28 Feb 2020 10:37:45 -0500 Subject: [PATCH 24/35] set JSON keys for metadata --- bbconf/const.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index de33aed..dae75ff 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -53,6 +53,14 @@ # (the keys are actually established in bedstat/tools/regionstat.R) JSON_GC_CONTENT_KEY = "gc_content" JSON_ID_KEY = "id" +JSON_GENOME_KEY = "genome" +JSON_PROTOCOL_KEY = "exp_protocol" +JSON_CELL_TYPE_KEY = "cell_type" +JSON_TISSUE_KEY = "tissue" +JSON_ANTIBODY_KEY = "antibody" +JSON_TREATMENT_KEY = "treatment" +JSON_DATA_SOURCE_KEY = "data_source" +JSON_DESCRIPTION_KEY = "description" JSON_REGIONS_NO_KEY = "regions_no" JSON_MEAN_ABS_TSS_DIST_KEY = "mean_absolute_TSS_dist" JSON_MD5SUM_KEY = "md5sum" @@ -70,9 +78,9 @@ JSON_BEDSET_PEP_KEY = "bedset_pep" JSON_BEDSET_BED_IDS_KEY = "bedset_bed_ids" -JSON_METADATA = ["genome", "exp_protocol", "cell_type", "tissue", "antibody", "treatment", - "data_source", "description"] \ - + [JSON_ID_KEY, JSON_MD5SUM_KEY, JSON_PLOTS_KEY, BEDFILE_PATH_KEY] +JSON_METADATA = [JSON_GENOME_KEY, JSON_PROTOCOL_KEY, JSON_CELL_TYPE_KEY, JSON_TISSUE_KEY, JSON_ANTIBODY_KEY, + JSON_TREATMENT_KEY, JSON_DATA_SOURCE_KEY, JSON_DESCRIPTION_KEY, + JSON_ID_KEY, JSON_MD5SUM_KEY, JSON_PLOTS_KEY, BEDFILE_PATH_KEY] JSON_STATS_SECTION_KEY = "statistics" JSON_METADATA_SECTION_KEY = "metadata" @@ -138,4 +146,4 @@ "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES", "JSON_DICTS_KEY_DESCS", "JSON_KEYS", "JSON_NUMERIC_KEY_VALUES", "JSON_NUMERIC_KEY_NAMES", "JSON_BEDSET_KEY_VALUES", - "JSON_BEDSET_KEY_NAMES", "QUERY_ALL", "JSON_METADATA"] + CFG_KEYS + JSON_KEYS + "JSON_BEDSET_KEY_NAMES", "QUERY_ALL"] + CFG_KEYS + JSON_KEYS From 6ea48c1c76dbc2d4964a069975e16fef0c9f5e10 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 2 Mar 
2020 10:28:15 -0500 Subject: [PATCH 25/35] fix json key consts exporting --- bbconf/const.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index dae75ff..12c0b00 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -78,9 +78,9 @@ JSON_BEDSET_PEP_KEY = "bedset_pep" JSON_BEDSET_BED_IDS_KEY = "bedset_bed_ids" -JSON_METADATA = [JSON_GENOME_KEY, JSON_PROTOCOL_KEY, JSON_CELL_TYPE_KEY, JSON_TISSUE_KEY, JSON_ANTIBODY_KEY, - JSON_TREATMENT_KEY, JSON_DATA_SOURCE_KEY, JSON_DESCRIPTION_KEY, - JSON_ID_KEY, JSON_MD5SUM_KEY, JSON_PLOTS_KEY, BEDFILE_PATH_KEY] +JSON_METADATA = ["JSON_GENOME_KEY", "JSON_PROTOCOL_KEY", "JSON_CELL_TYPE_KEY", "JSON_TISSUE_KEY", "JSON_ANTIBODY_KEY", + "JSON_TREATMENT_KEY", "JSON_DATA_SOURCE_KEY", "JSON_DESCRIPTION_KEY", + "JSON_ID_KEY", "JSON_MD5SUM_KEY", "JSON_PLOTS_KEY", "BEDFILE_PATH_KEY"] JSON_STATS_SECTION_KEY = "statistics" JSON_METADATA_SECTION_KEY = "metadata" @@ -146,4 +146,4 @@ "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES", "JSON_DICTS_KEY_DESCS", "JSON_KEYS", "JSON_NUMERIC_KEY_VALUES", "JSON_NUMERIC_KEY_NAMES", "JSON_BEDSET_KEY_VALUES", - "JSON_BEDSET_KEY_NAMES", "QUERY_ALL"] + CFG_KEYS + JSON_KEYS + "JSON_BEDSET_KEY_NAMES", "QUERY_ALL", "JSON_METADATA"] + CFG_KEYS + JSON_KEYS From 88824dc573d148c36d20c95ff82f11c376748479 Mon Sep 17 00:00:00 2001 From: "Jose E. 
Verdezoto Mosquera" Date: Mon, 2 Mar 2020 11:10:21 -0500 Subject: [PATCH 26/35] include stats and meta sections in JSON keys --- bbconf/const.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bbconf/const.py b/bbconf/const.py index 12c0b00..a095d75 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -114,7 +114,8 @@ "JSON_BEDSET_GD_STATS_KEY", "JSON_BEDSET_PEP_KEY", "JSON_BEDSET_BED_IDS_KEY"] JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY", - "JSON_MD5SUM_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES + JSON_METADATA + "JSON_MD5SUM_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES + JSON_METADATA \ + + JSON_STATS_SECTION_KEY + JSON_METADATA_SECTION_KEY _PERC_TXT = "Percentage of regions in " _FREQ_TXT = "Frequency of regions in " From 9765fd34670c59a7094fee597f508c00507c740e Mon Sep 17 00:00:00 2001 From: "Jose E. Verdezoto Mosquera" Date: Mon, 2 Mar 2020 11:14:22 -0500 Subject: [PATCH 27/35] fix meta and stats concat error --- bbconf/const.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index a095d75..ecee20a 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -114,8 +114,9 @@ "JSON_BEDSET_GD_STATS_KEY", "JSON_BEDSET_PEP_KEY", "JSON_BEDSET_BED_IDS_KEY"] JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY", - "JSON_MD5SUM_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES + JSON_METADATA \ - + JSON_STATS_SECTION_KEY + JSON_METADATA_SECTION_KEY + "JSON_MD5SUM_KEY", "JSON_STATS_SECTION_KEY", + "JSON_METADATA_SECTION_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES + JSON_METADATA + _PERC_TXT = "Percentage of regions in " _FREQ_TXT = "Frequency of regions in " From ce71d585880f9a8154fa4f446d90ffcd58ed40f4 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 2 Mar 2020 13:53:52 -0500 Subject: [PATCH 28/35] get key values in consts --- bbconf/const.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) 
diff --git a/bbconf/const.py b/bbconf/const.py index ecee20a..241696c 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -78,10 +78,12 @@ JSON_BEDSET_PEP_KEY = "bedset_pep" JSON_BEDSET_BED_IDS_KEY = "bedset_bed_ids" -JSON_METADATA = ["JSON_GENOME_KEY", "JSON_PROTOCOL_KEY", "JSON_CELL_TYPE_KEY", "JSON_TISSUE_KEY", "JSON_ANTIBODY_KEY", +JSON_METADATA_NAMES = ["JSON_GENOME_KEY", "JSON_PROTOCOL_KEY", "JSON_CELL_TYPE_KEY", "JSON_TISSUE_KEY", "JSON_ANTIBODY_KEY", "JSON_TREATMENT_KEY", "JSON_DATA_SOURCE_KEY", "JSON_DESCRIPTION_KEY", "JSON_ID_KEY", "JSON_MD5SUM_KEY", "JSON_PLOTS_KEY", "BEDFILE_PATH_KEY"] +JSON_METADATA_VALUES = [eval(x) for x in JSON_METADATA_NAMES] + JSON_STATS_SECTION_KEY = "statistics" JSON_METADATA_SECTION_KEY = "metadata" @@ -92,12 +94,7 @@ "JSON_PROMOTERPROX_PERCENTAGE_KEY", "JSON_EXON_PERCENTAGE_KEY", "JSON_INTRON_PERCENTAGE_KEY", "JSON_INTERGENIC_PERCENTAGE_KEY", "JSON_PROMOTERCORE_PERCENTAGE_KEY"] -JSON_NUMERIC_KEY_VALUES = [ - JSON_GC_CONTENT_KEY, JSON_REGIONS_NO_KEY, JSON_MEAN_ABS_TSS_DIST_KEY, - JSON_EXON_FREQUENCY_KEY, JSON_INTRON_FREQUENCY_KEY, JSON_PROMOTERPROX_FREQUENCY_KEY, - JSON_INTERGENIC_FREQUENCY_KEY, JSON_PROMOTERCORE_FREQUENCY_KEY, - JSON_PROMOTERPROX_PERCENTAGE_KEY, JSON_EXON_PERCENTAGE_KEY, JSON_INTRON_PERCENTAGE_KEY, - JSON_INTERGENIC_PERCENTAGE_KEY, JSON_PROMOTERCORE_PERCENTAGE_KEY] +JSON_NUMERIC_KEY_VALUES = [eval(x) for x in JSON_NUMERIC_KEY_NAMES] JSON_BEDSET_MEANS_KEY = "bedset_means" JSON_BEDSET_SD_KEY = "bedset_standard_deviation" @@ -114,8 +111,8 @@ "JSON_BEDSET_GD_STATS_KEY", "JSON_BEDSET_PEP_KEY", "JSON_BEDSET_BED_IDS_KEY"] JSON_KEYS = ["JSON_GC_CONTENT_KEY", "JSON_ID_KEY", "JSON_PLOTS_KEY", - "JSON_MD5SUM_KEY", "JSON_STATS_SECTION_KEY", - "JSON_METADATA_SECTION_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES + JSON_METADATA + "JSON_MD5SUM_KEY", "JSON_STATS_SECTION_KEY", "JSON_METADATA_VALUES", + "JSON_METADATA_SECTION_KEY"] + JSON_NUMERIC_KEY_NAMES + JSON_BEDSET_KEY_NAMES + JSON_METADATA_NAMES _PERC_TXT = 
"Percentage of regions in " @@ -148,4 +145,4 @@ "ES_CLIENT_KEY", "DB_DEFAULT_HOST", "SERVER_DEFAULT_PORT", "SERVER_DEFAULT_HOST", "PKG_NAME", "IDX_MAP", "BEDFILE_PATH_KEY", "DEFAULT_SECTION_VALUES", "JSON_DICTS_KEY_DESCS", "JSON_KEYS", "JSON_NUMERIC_KEY_VALUES", "JSON_NUMERIC_KEY_NAMES", "JSON_BEDSET_KEY_VALUES", - "JSON_BEDSET_KEY_NAMES", "QUERY_ALL", "JSON_METADATA"] + CFG_KEYS + JSON_KEYS + "JSON_BEDSET_KEY_NAMES", "QUERY_ALL", "JSON_METADATA_NAMES", "JSON_METADATA_VALUES"] + CFG_KEYS + JSON_KEYS From 167a462ea4e341e006df1ae4f7e2d1ad67e75042 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 17 Mar 2020 17:26:26 -0400 Subject: [PATCH 29/35] implement index removal methods; #6 --- bbconf/bbconf.py | 106 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 29 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index 079a7fa..f00cd63 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -51,12 +51,16 @@ def establish_elasticsearch_connection(self, host=None): :return elasticsearch.Elasticsearch: connected client """ if hasattr(self, ES_CLIENT_KEY): - raise BedBaseConnectionError("The connection is already established: {}". - format(str(self[ES_CLIENT_KEY]))) + raise BedBaseConnectionError( + "The connection is already established: {}". + format(str(self[ES_CLIENT_KEY])) + ) hst = host or self[CFG_DATABASE_KEY][CFG_HOST_KEY] self[ES_CLIENT_KEY] = Elasticsearch([{"host": hst}]) - _LOGGER.info("Established connection with Elasticsearch: {}".format(hst)) - _LOGGER.debug("Elasticsearch info:\n{}".format(self[ES_CLIENT_KEY].info())) + _LOGGER.info("Established connection with Elasticsearch: {}". + format(hst)) + _LOGGER.debug("Elasticsearch info:\n{}". 
+ format(self[ES_CLIENT_KEY].info())) def assert_connection(self): """ @@ -65,9 +69,12 @@ def assert_connection(self): :raise BedBaseConnectionError: if there is no active connection """ if not hasattr(self, ES_CLIENT_KEY): - raise BedBaseConnectionError("No active connection with Elasticsearch") + raise BedBaseConnectionError( + "No active connection with Elasticsearch" + ) - def _search_index(self, index_name, query, just_data=True, size=None, **kwargs): + def _search_index(self, index_name, query, just_data=True, size=None, + **kwargs): """ Search selected Elasticsearch index with selected query @@ -82,10 +89,12 @@ def _search_index(self, index_name, query, just_data=True, size=None, **kwargs): if not self[ES_CLIENT_KEY].indices.exists(index_name): _LOGGER.warning("'{}' index does not exist".format(index_name)) return - _LOGGER.debug("Searching index: {}\nQuery: {}".format(index_name, query)) + _LOGGER.debug("Searching index: {}\nQuery: {}". + format(index_name, query)) query = {"query": query} if "query" not in query else query size = size or self._count_docs(index=index_name) - search_results = self[ES_CLIENT_KEY].search(index=index_name, body=query, size=size, **kwargs) + search_results = self[ES_CLIENT_KEY].search( + index=index_name, body=query, size=size, **kwargs) return [r["_source"] for r in search_results["hits"]["hits"]] \ if just_data else search_results @@ -97,7 +106,8 @@ def search_bedfiles(self, query, just_data=True, **kwargs): :param bool just_data: whether just the hits should be returned :return dict | Iterable[dict]: search results """ - return self._search_index(index_name=BED_INDEX, query=query, just_data=just_data, **kwargs) + return self._search_index(index_name=BED_INDEX, query=query, + just_data=just_data, **kwargs) def search_bedsets(self, query, just_data=True, **kwargs): """ @@ -107,20 +117,24 @@ def search_bedsets(self, query, just_data=True, **kwargs): :param bool just_data: whether just the hits should be returned :return dict | 
Iterable[dict]: search results """ - return self._search_index(index_name=BEDSET_INDEX, query=query, just_data=just_data, **kwargs) + return self._search_index(index_name=BEDSET_INDEX, query=query, + just_data=just_data, **kwargs) def _insert_data(self, index, data, doc_id, force_update=False, **kwargs): """ Insert document to an index in a Elasticsearch DB or create it and the insert in case it does not exist. - Document ID argument is optional. If not provided, a random ID will be assigned. - If provided the document will be inserted only if no documents with this ID are present in the DB. - However, the document overwriting can be forced if needed. + Document ID argument is optional. If not provided, a random ID + will be assigned. + If provided the document will be inserted only if no documents with + this ID are present in the DB. However, the document overwriting + can be forced if needed. :param str index: name of the index to insert the data into :param str doc_id: unique identifier for the document - :param bool force_update: whether the pre-existing document should be overwritten + :param bool force_update: whether the pre-existing document + should be overwritten :param dict data: data to insert """ self.assert_connection() @@ -130,13 +144,15 @@ def _insert_data(self, index, data, doc_id, force_update=False, **kwargs): self[ES_CLIENT_KEY].index(index=index, body=data, **kwargs) else: try: - self[ES_CLIENT_KEY].create(index=index, body=data, id=doc_id, **kwargs) + self[ES_CLIENT_KEY].create(index=index, body=data, id=doc_id, + **kwargs) except ConflictError: msg_base = "Document '{}' already exists in index '{}'"\ .format(doc_id, index) if force_update: _LOGGER.info(msg_base + ". Forcing update") - self[ES_CLIENT_KEY].index(index=index, body=data, id=doc_id, **kwargs) + self[ES_CLIENT_KEY].index(index=index, body=data, id=doc_id, + **kwargs) else: _LOGGER.error("Could not insert data. 
" + msg_base) raise @@ -146,9 +162,10 @@ def insert_bedfiles_data(self, data, doc_id=None, **kwargs): Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist. - Document ID argument is optional. If not provided, a random ID will be assigned. - If provided the document will be inserted only if no documents with this ID are present in the DB. - However, the document overwriting can be forced if needed. + Document ID argument is optional. If not provided, a random ID will + be assigned. If provided the document will be inserted only if no + documents with this ID are present in the DB. However, the document + overwriting can be forced if needed. :param dict data: data to insert :param str doc_id: unique identifier for the document, optional @@ -160,14 +177,17 @@ def insert_bedsets_data(self, data, doc_id=None, **kwargs): Insert data to the bedset index in a Elasticsearch DB or create it and the insert in case it does not exist. - Document ID argument is optional. If not provided, a random ID will be assigned. - If provided the document will be inserted only if no documents with this ID are present in the DB. + Document ID argument is optional. If not provided, a random ID will + be assigned. + If provided the document will be inserted only if no documents with + this ID are present in the DB. However, the document overwriting can be forced if needed. 
:param dict data: data to insert :param str doc_id: unique identifier for the document, optional """ - self._insert_data(index=BEDSET_INDEX, data=data, doc_id=doc_id, **kwargs) + self._insert_data(index=BEDSET_INDEX, data=data, doc_id=doc_id, + **kwargs) def _get_mapping(self, index, just_data=True, **kwargs): """ @@ -178,7 +198,8 @@ def _get_mapping(self, index, just_data=True, **kwargs): """ self.assert_connection() mapping = self[ES_CLIENT_KEY].indices.get_mapping(index, **kwargs) - return mapping[index]["mappings"]["properties"] if just_data else mapping + return mapping[index]["mappings"]["properties"] \ + if just_data else mapping def get_bedfiles_mapping(self, just_data=True, **kwargs): """ @@ -194,7 +215,8 @@ def get_bedsets_mapping(self, just_data=True, **kwargs): :return dict: besets mapping definitions """ - return self._get_mapping(index=BEDSET_INDEX, just_data=just_data, **kwargs) + return self._get_mapping(index=BEDSET_INDEX, just_data=just_data, + **kwargs) def _get_doc(self, index, doc_id): """ @@ -235,7 +257,8 @@ def _count_docs(self, index): if not self[ES_CLIENT_KEY].indices.exists(index=index): _LOGGER.warning("'{}' index does not exist".format(index)) return None - return int(self[ES_CLIENT_KEY].cat.count(index, params={"format": "json"})[0]['count']) + return int(self[ES_CLIENT_KEY].cat.count( + index, params={"format": "json"})[0]['count']) def count_bedfiles_docs(self): """ @@ -253,6 +276,27 @@ def count_bedsets_docs(self): """ return self._count_docs(index=BEDSET_INDEX) + def _delete_index(self, index): + """ + Delete selected index from Elasticsearch + + :param str index: name of the index to delete + """ + self.assert_connection() + self[ES_CLIENT_KEY].indices.delete(index=index) + + def delete_bedfiles_index(self): + """ + Delete bedfiles index from Elasticsearch + """ + self._delete_index(index=BED_INDEX) + + def delete_bedsets_index(self): + """ + Delete bedsets index from Elasticsearch + """ + self._delete_index(index=BEDSET_INDEX) + 
def _get_all(self, index_name, just_data=False): """ Convenience method for index exploration @@ -262,7 +306,8 @@ def _get_all(self, index_name, just_data=False): :return: """ self.assert_connection() - return self._search_index(index_name=index_name, query=QUERY_ALL, just_data=just_data) + return self._search_index(index_name=index_name, query=QUERY_ALL, + just_data=just_data) def get_bedbase_cfg(cfg=None): @@ -276,10 +321,13 @@ def get_bedbase_cfg(cfg=None): Optional, the $BEDBASE config env var will be used if not provided :return str: configuration file path """ - selected_cfg = yacman.select_config(config_filepath=cfg, config_env_vars=CFG_ENV_VARS) + selected_cfg = yacman.select_config(config_filepath=cfg, + config_env_vars=CFG_ENV_VARS) if not selected_cfg: - raise BedBaseConnectionError("You must provide a config file or set the {} " - "environment variable".format("or ".join(CFG_ENV_VARS))) + raise BedBaseConnectionError( + "You must provide a config file or set the {} environment variable" + .format("or ".join(CFG_ENV_VARS)) + ) return selected_cfg From 30545f9cc1ce4aa4e28b2cbc0f5355e35323d3c7 Mon Sep 17 00:00:00 2001 From: "Jose E. 
Verdezoto Mosquera" Date: Wed, 1 Apr 2020 18:26:49 -0400 Subject: [PATCH 30/35] add mean region width key --- bbconf/const.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bbconf/const.py b/bbconf/const.py index 241696c..e399ccd 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -63,6 +63,7 @@ JSON_DESCRIPTION_KEY = "description" JSON_REGIONS_NO_KEY = "regions_no" JSON_MEAN_ABS_TSS_DIST_KEY = "mean_absolute_TSS_dist" +JSON_MEAN_REGION_WIDTH = "mean_region_width" JSON_MD5SUM_KEY = "md5sum" JSON_PLOTS_KEY = "plots" JSON_EXON_FREQUENCY_KEY = "exon_frequency" @@ -88,7 +89,7 @@ JSON_METADATA_SECTION_KEY = "metadata" JSON_NUMERIC_KEY_NAMES = [ - "JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", + "JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", "JSON_MEAN_REGION_WIDTH" "JSON_EXON_FREQUENCY_KEY", "JSON_INTRON_FREQUENCY_KEY", "JSON_PROMOTERPROX_FREQUENCY_KEY", "JSON_INTERGENIC_FREQUENCY_KEY", "JSON_PROMOTERCORE_FREQUENCY_KEY", "JSON_PROMOTERPROX_PERCENTAGE_KEY", "JSON_EXON_PERCENTAGE_KEY", "JSON_INTRON_PERCENTAGE_KEY", @@ -121,6 +122,7 @@ JSON_GC_CONTENT_KEY: "GC content", JSON_ID_KEY: "BED file ID", JSON_REGIONS_NO_KEY: "Number of regions", JSON_MD5SUM_KEY: "BED file md5 checksum", JSON_MEAN_ABS_TSS_DIST_KEY: "Mean absolute distance from transcription start sites", + JSON_MEAN_REGION_WIDTH: "Mean width of the regions in the BED file", JSON_PROMOTERPROX_PERCENTAGE_KEY: _PERC_TXT + "promoter proximity", JSON_PROMOTERCORE_PERCENTAGE_KEY: _PERC_TXT + "promoter core", JSON_EXON_PERCENTAGE_KEY: _PERC_TXT + "exons", From 912b42a537a29180e5a080cf950746e7aaad045a Mon Sep 17 00:00:00 2001 From: "Jose E. 
Verdezoto Mosquera" Date: Wed, 1 Apr 2020 19:15:22 -0400 Subject: [PATCH 31/35] add missing comma in JSON numeric key names --- bbconf/const.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbconf/const.py b/bbconf/const.py index e399ccd..909d34a 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -89,7 +89,7 @@ JSON_METADATA_SECTION_KEY = "metadata" JSON_NUMERIC_KEY_NAMES = [ - "JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", "JSON_MEAN_REGION_WIDTH" + "JSON_GC_CONTENT_KEY", "JSON_REGIONS_NO_KEY", "JSON_MEAN_ABS_TSS_DIST_KEY", "JSON_MEAN_REGION_WIDTH", "JSON_EXON_FREQUENCY_KEY", "JSON_INTRON_FREQUENCY_KEY", "JSON_PROMOTERPROX_FREQUENCY_KEY", "JSON_INTERGENIC_FREQUENCY_KEY", "JSON_PROMOTERCORE_FREQUENCY_KEY", "JSON_PROMOTERPROX_PERCENTAGE_KEY", "JSON_EXON_PERCENTAGE_KEY", "JSON_INTRON_PERCENTAGE_KEY", From 31f134b2642a99bb63eb46e9b3fd067f6d1f6762 Mon Sep 17 00:00:00 2001 From: "Jose E. Verdezoto Mosquera" Date: Wed, 8 Apr 2020 17:31:34 -0400 Subject: [PATCH 32/35] replace pip out key with bedstat and bedbuncher out keys --- bbconf/const.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bbconf/const.py b/bbconf/const.py index 909d34a..d228084 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -27,13 +27,15 @@ CFG_DATABASE_KEY = "database" CFG_HOST_KEY = "host" CFG_PORT_KEY = "port" -CFG_PIP_OUTPUT_KEY = "pipelines_output" +CFG_BEDSTAT_OUTPUT_KEY = "bedstat_output" +CFG_BEDBUNCHER_OUTPUT_KEY = "bedbuncher_output" CFG_BED_INDEX_KEY = "bed_index" CFG_BEDSET_INDEX_KEY = "bedset_index" CFG_KEYS = [ "CFG_PATH_KEY", "CFG_SERVER_KEY", "CFG_DATABASE_KEY", "CFG_HOST_KEY", - "CFG_PORT_KEY", "CFG_PIP_OUTPUT_KEY", "CFG_BED_INDEX_KEY", "CFG_BEDSET_INDEX_KEY"] + "CFG_PORT_KEY", "CFG_BEDSTAT_OUTPUT_KEY", "CFG_BEDBUNCHER_OUTPUT_KEY", + "CFG_BED_INDEX_KEY", "CFG_BEDSET_INDEX_KEY"] DEFAULT_SECTION_VALUES = { CFG_DATABASE_KEY: { From a689ff0937605af9789ad17fca7d114740f342c2 Mon Sep 17 00:00:00 2001 From: "Jose 
E. Verdezoto Mosquera" Date: Wed, 8 Apr 2020 18:21:08 -0400 Subject: [PATCH 33/35] add bedstat and bedbuncher out keys to path --- bbconf/bbconf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py index f00cd63..5ef1db8 100644 --- a/bbconf/bbconf.py +++ b/bbconf/bbconf.py @@ -32,9 +32,12 @@ def _raise_missing_key(key): # if there's nothing under path key (None) self[CFG_PATH_KEY] = PXAM() - if CFG_PIP_OUTPUT_KEY not in self[CFG_PATH_KEY]: - _raise_missing_key(CFG_PIP_OUTPUT_KEY) - + if CFG_BEDSTAT_OUTPUT_KEY not in self[CFG_PATH_KEY]: + _raise_missing_key(CFG_BEDSTAT_OUTPUT_KEY) + + if CFG_BEDBUNCHER_OUTPUT_KEY not in self[CFG_PATH_KEY]: + _raise_missing_key(CFG_BEDBUNCHER_OUTPUT_KEY) + for section, mapping in DEFAULT_SECTION_VALUES.items(): if section not in self: self[section] = PXAM() From f8126820f42f612199e290fe8178b8328fe017af Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 28 May 2020 16:33:22 -0400 Subject: [PATCH 34/35] update changelog, update example cfg files --- config.yaml | 2 +- config_min.yaml | 3 ++- docs/bbc_api.md | 65 ++++++++++++++++++++++++++++++++++++++++++----- docs/changelog.md | 10 +++++++- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/config.yaml b/config.yaml index 0769577..4ff5c49 100644 --- a/config.yaml +++ b/config.yaml @@ -1,7 +1,7 @@ # full config example. Refer to bbconf/const.py for key names and default values path: - bedstat_output: $HOME/results_pipeline + pipelines_output: $HOME/results_pipeline database: host: localhost diff --git a/config_min.yaml b/config_min.yaml index aa3ef32..08eb350 100644 --- a/config_min.yaml +++ b/config_min.yaml @@ -1,4 +1,5 @@ # min config example. 
Refer to bbconf/const.py for key names and default values path: - bedstat_output: $HOME/results_pipeline \ No newline at end of file + bedstat_output: $LABROOT/resources/regions/bedstat_output + bedbuncher_output: $LABROOT/resources/regions/bedbuncher_output \ No newline at end of file diff --git a/docs/bbc_api.md b/docs/bbc_api.md index 81b2cec..b31c6dd 100644 --- a/docs/bbc_api.md +++ b/docs/bbc_api.md @@ -88,6 +88,22 @@ Get the total number of the documents in the bedsets index +```python +def delete_bedfiles_index(self) +``` + +Delete bedfiles index from Elasticsearch + + + +```python +def delete_bedsets_index(self) +``` + +Delete bedsets index from Elasticsearch + + + ```python def establish_elasticsearch_connection(self, host=None) ``` @@ -112,6 +128,23 @@ Return the path to the config file or None if not set +```python +def get_bedfiles_doc(self, doc_id) +``` + +Get a document from bedfiles index by its ID +#### Parameters: + +- `doc_id` (`str`): document ID to return + + +#### Returns: + +- `Mapping`: matched document + + + + ```python def get_bedfiles_mapping(self, just_data=True, **kwargs) ``` @@ -124,6 +157,23 @@ Get mapping definitions for the bedfiles index +```python +def get_bedsets_doc(self, doc_id) +``` + +Get a document from bedsets index by its ID +#### Parameters: + +- `doc_id` (`str`): document ID to return + + +#### Returns: + +- `Mapping`: matched document + + + + ```python def get_bedsets_mapping(self, just_data=True, **kwargs) ``` @@ -142,9 +192,10 @@ def insert_bedfiles_data(self, data, doc_id=None, **kwargs) Insert data to the bedfile index a Elasticsearch DB or create it and the insert in case it does not exist. -Document ID argument is optional. If not provided, a random ID will be assigned. -If provided the document will be inserted only if no documents with this ID are present in the DB. -However, the document overwriting can be forced if needed. +Document ID argument is optional. If not provided, a random ID will +be assigned. 
If provided the document will be inserted only if no +documents with this ID are present in the DB. However, the document +overwriting can be forced if needed. #### Parameters: - `data` (`dict`): data to insert @@ -159,8 +210,10 @@ def insert_bedsets_data(self, data, doc_id=None, **kwargs) Insert data to the bedset index in a Elasticsearch DB or create it and the insert in case it does not exist. -Document ID argument is optional. If not provided, a random ID will be assigned. -If provided the document will be inserted only if no documents with this ID are present in the DB. +Document ID argument is optional. If not provided, a random ID will +be assigned. +If provided the document will be inserted only if no documents with +this ID are present in the DB. However, the document overwriting can be forced if needed. #### Parameters: @@ -241,4 +294,4 @@ or read from a $BEDBASE environment variable -*Version Information: `bbconf` v0.0.2-dev, generated by `lucidoc` v0.4.2* +*Version Information: `bbconf` v0.0.2-dev, generated by `lucidoc` v0.4.3* diff --git a/docs/changelog.md b/docs/changelog.md index af0bc6c..30bd0fd 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,9 +2,17 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. -## [0.0.2] - unreleased +## [0.0.2] - 2020-05-28 +### Added +- index deleting methods: + - `delete_bedsets_index` + - `delete_bedfiles_index` +- multiple new keys constants + ### Changed - make `search_bedfiles` and `search_bedsets` methods return all hits by default instead of just 10. Parametrize it. 
+- added more arguments to `insert_bedfiles_data` and `insert_bedsets_data` method interfaces: `doc_id` and `force_update` +- Elasticsearch documents are inserted into the indices more securely, `insert_*` methods prevent document duplication ## [0.0.1] - 2020-02-05 From f1875a52a16e9b7834d7ff39d472a6d1373ae50e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 28 May 2020 16:35:16 -0400 Subject: [PATCH 35/35] update version --- bbconf/_version.py | 2 +- docs/bbc_api.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bbconf/_version.py b/bbconf/_version.py index 6892a3d..3b93d0b 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.0.2-dev" +__version__ = "0.0.2" diff --git a/docs/bbc_api.md b/docs/bbc_api.md index b31c6dd..8de8432 100644 --- a/docs/bbc_api.md +++ b/docs/bbc_api.md @@ -294,4 +294,4 @@ or read from a $BEDBASE environment variable -*Version Information: `bbconf` v0.0.2-dev, generated by `lucidoc` v0.4.3* +*Version Information: `bbconf` v0.0.2, generated by `lucidoc` v0.4.3*