From 133bcb710b22d2d850a94805da259b9dd84fe4df Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 29 Nov 2023 13:01:57 -0500 Subject: [PATCH] Fixed bug in populating description --- docs/changelog.md | 3 ++ geofetch/__init__.py | 7 ++-- geofetch/_version.py | 2 +- geofetch/finder.py | 2 +- geofetch/geofetch.py | 81 +++++++++++++++++++++++++++--------------- geofetch/sraconvert.py | 8 +++-- geofetch/utils.py | 20 +++++------ setup.py | 10 +++--- tests/test_geofetch.py | 21 ++++++++--- 9 files changed, 99 insertions(+), 55 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 66e77b0..2846978 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,8 @@ # Changelog +## [0.12.5] -- 2023-11-29 +- Fixed bug, where description was not populated in PEP + ## [0.12.4] -- 2023-08-01 - Fixed SRA convert - Added how to convert SRA diff --git a/geofetch/__init__.py b/geofetch/__init__.py index 5065195..8e208d1 100644 --- a/geofetch/__init__.py +++ b/geofetch/__init__.py @@ -1,13 +1,14 @@ """ Package-level data """ import logmuse +import coloredlogs -from geofetch.geofetch import * -from geofetch.finder import * +from geofetch.geofetch import Geofetcher +from geofetch.finder import Finder from geofetch._version import __version__ __author__ = ["Oleksandr Khoroshevskyi", "Vince Reuter", "Nathan Sheffield"] -__all__ = ["Finder", "Geofetcher"] +__all__ = ["Finder", "Geofetcher", "__version__"] _LOGGER = logmuse.init_logger("geofetch") coloredlogs.install( diff --git a/geofetch/_version.py b/geofetch/_version.py index 6dd4954..8e377d6 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.12.4" +__version__ = "0.12.5" diff --git a/geofetch/finder.py b/geofetch/finder.py index 11b3bfb..e41405e 100644 --- a/geofetch/finder.py +++ b/geofetch/finder.py @@ -127,7 +127,7 @@ def _run_search_query(url: str) -> list: """ x = requests.get(url) if x.status_code != 200: - _LOGGER.error(f"Request status != 200. Error. Check your request") + _LOGGER.error("Request status != 200. Error. Check your request") return [] try: x_result = xmltodict.parse(x.text)["eSearchResult"] diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index 4703686..932fd74 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -17,7 +17,24 @@ import pandas as pd from geofetch.cli import _parse_cmdl -from geofetch.const import * +from geofetch.const import ( + GSE_PATTERN, + SAMPLE_SUPP_METADATA_FILE, + EXP_SUPP_METADATA_FILE, + NEW_GENOME_COL_NAME, + FILE_RAW_NAME_SAMPLE_PATTERN, + FILE_RAW_NAME_SUBSAMPLE_PATTERN, + CONFIG_RAW_TEMPLATE_NAME, + CONFIG_SRA_TEMPLATE, + CONFIG_PROCESSED_TEMPLATE_NAME, + NUM_RETRIES, + SER_SUPP_FILE_PATTERN, + SUPP_FILE_PATTERN, + PROJECT_PATTERN, + NCBI_EFETCH, + NCBI_ESEARCH, + EXPERIMENT_PATTERN, +) from geofetch.utils import ( Accession, build_prefetch_command, @@ -480,8 +497,8 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje file_gse_content, gsm_metadata, file_sra ) if not srp_list_result: - _LOGGER.info(f"No SRP data, continuing ....") - _LOGGER.warning(f"No raw pep will be created! ....") + _LOGGER.info("No SRP data, continuing ....") + _LOGGER.warning("No raw pep will be created! 
....") # delete current acc if no raw data was found # del metadata_dict[acc_GSE] pass @@ -498,7 +515,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje _LOGGER.info(f"Getting SRR: {run} in ({acc_GSE})") self._download_raw_data(run) else: - _LOGGER.info(f"Dry run, no data will be downloaded") + _LOGGER.info("Dry run, no data will be downloaded") # save one project if self.acc_anno and nkeys > 1: @@ -517,7 +534,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje # Logging cleaning process: if self.discard_soft: - _LOGGER.info(f"Cleaning soft files ...") + _LOGGER.info("Cleaning soft files ...") clean_soft_files(self.metadata_root_full) ####################################################################################### @@ -878,7 +895,7 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): if element_is_list: for n_elem in range(len(metadata_list)): try: - if type(metadata_list[n_elem][dict_key]) is not list: + if not isinstance(metadata_list[n_elem][dict_key], list): metadata_list[n_elem][dict_key] = [ metadata_list[n_elem][dict_key] ] @@ -930,7 +947,7 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): metadata_list[n_elem][dict_key] = this_string else: del metadata_list[n_elem][dict_key] - except KeyError as err: + except KeyError: # _LOGGER.warning( # f"expand_metadata_list: Key Error: {err}, continuing ..." # ) @@ -980,6 +997,7 @@ def _write_processed_annotation( ) -> Union[NoReturn, peppy.Project]: """ Save annotation file by providing list of dictionaries with files metadata + :param list processed_metadata: list of dictionaries with files metadata :param str file_annotation_path: the path to the metadata file that has to be saved :param just_object: True, if you want to get peppy object without saving file @@ -1046,13 +1064,14 @@ def _write_processed_annotation( proj = peppy.Project().from_pandas(pd_value, config=conf) proj_exp_data = conf.get("experiment_metadata") if proj_exp_data: - proj["description"] = proj_exp_data.get("series_title") + proj.description = proj_exp_data.get("series_title") return proj @staticmethod def _find_genome(metadata_list: list) -> list: """ Create new genome column by searching joining few columns + :param metadata_list: list with metadata dict :return: list with metadata dict where genome column was added """ @@ -1080,6 +1099,7 @@ def _write_raw_annotation_new( """ Combine individual accessions into project-level annotations, and writing individual accession files (if requested) + :param name: Name of the run, project, or acc --> will influence name of the folder where project will be created :param metadata_dict: dictionary of sample annotations :param subannot_dict: dictionary of subsample annotations @@ -1128,7 +1148,7 @@ def _write_raw_annotation_new( f"subsample_table: {os.path.basename(proj_root_subsample)}" ) else: - subanot_path_yaml = f"" + subanot_path_yaml = "" template = self._create_config_raw( proj_meta, proj_root_sample, subanot_path_yaml, gse_meta_dict @@ -1166,7 +1186,7 @@ def _write_raw_annotation_new( proj = peppy.Project().from_pandas(meta_df, sub_meta_df, conf) proj_exp_data = conf.get("experiment_metadata") if proj_exp_data: - proj["description"] = proj_exp_data.get("series_title") + proj.description = proj_exp_data.get("series_title") return proj def _create_config_processed( @@ -1177,6 +1197,7 @@ def _create_config_processed( ) -> str: """ Compose and generate config file content + :param file_annotation_path: 
root to the annotation file :param proj_meta: common metadata that has to added to config file :param meta_in_series: @@ -1218,6 +1239,7 @@ def _create_config_raw( ): """ Compose and generate config file content for raw data + :param proj_meta: root to the annotation file :param proj_root_sample: path to sampletable file :param subanot_path_yaml: path to subannotation file @@ -1275,6 +1297,7 @@ def _check_sample_name_standard(metadata_dict: dict) -> dict: """ Standardize sample name and checking if it exists (This function is used for raw data) + :param metadata_dict: metadata dict :return: metadata dict with standardize sample names """ @@ -1300,14 +1323,16 @@ def _separate_common_meta( ) -> tuple: """ Separate experiment(project) metadata from sample metadata + :param list or dict meta_list: list of dictionaries of samples :param int max_len: threshold of the length of the common value that can be stored in the sample table :param int del_limit: threshold of the length of the common value that have to be deleted :param int attr_limit_truncate: max length of the attribute in the sample csv :return set: Return is a set of list, where 1 list (or dict) is - list of samples metadata dictionaries and 2: list of common samples metadata - dictionaries that are linked to the project. + list of samples metadata dictionaries and 2: list of common samples metadata + dictionaries that are linked to the project. """ + # check if meta_list is dict and converting it to list input_is_dict = False if isinstance(meta_list, dict): @@ -1401,6 +1426,7 @@ def _download_SRA_file(self, run_name: str): def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoReturn: """ Convert SRA file to BAM file by using samtools function "sam-dump" + :param str bam_file: path to BAM file that has to be created :param str run_name: SRR number of the SRA file that has to be converted """ @@ -1509,7 +1535,7 @@ def _download_file( full_filepath = os.path.join(data_folder, new_name) if not os.path.exists(full_filepath): - _LOGGER.info(f"\033[38;5;242m") # set color to gray + _LOGGER.info("\033[38;5;242m") # set color to gray # if dir does not exist: if not os.path.exists(data_folder): os.makedirs(data_folder) @@ -1518,7 +1544,7 @@ def _download_file( ) _LOGGER.info(f"\033[38;5;242m{ret}\033[0m") time.sleep(sleep_after) - _LOGGER.info(f"\033[0m") # Reset to default terminal color + _LOGGER.info("\033[0m") # Reset to default terminal color else: _LOGGER.info(f"\033[38;5;242mFile {full_filepath} exists.\033[0m") @@ -1545,7 +1571,7 @@ def _get_list_of_processed_files( pl = parse_SOFT_line(line) file_url = pl[list(pl.keys())[0]].rstrip() filename = os.path.basename(file_url) - _LOGGER.debug(f"Processed GSE file found: %s" % str(file_url)) + _LOGGER.debug(f"Processed GSE file found: {str(file_url)}") # search for tar file: if tar_re.search(filename): @@ -1574,7 +1600,7 @@ def _get_list_of_processed_files( ) else: - raise Exception(f"error in requesting tar_files_list") + raise Exception("error in requesting tar_files_list") else: _LOGGER.info(f"Found previous GSM file: {filelist_path}") filelist_obj = open(filelist_path, "r") @@ -1610,9 +1636,8 @@ def _get_list_of_processed_files( ): meta_processed_samples[nb].update(pl) else: - if ( - type(meta_processed_samples[nb][element_keys]) - is not list + if not isinstance( + meta_processed_samples[nb][element_keys], list ): meta_processed_samples[nb][element_keys] = [ meta_processed_samples[nb][element_keys] @@ -1631,7 +1656,7 @@ def _get_list_of_processed_files( pl = 
parse_SOFT_line(line_gsm) file_url_gsm = pl[list(pl.keys())[0]].rstrip() _LOGGER.debug( - f"Processed GSM file found: %s" % str(file_url_gsm) + f"Processed GSM file found: {str(file_url_gsm)}" ) if file_url_gsm != "NONE": meta_processed_samples[nb]["files"].append(file_url_gsm) @@ -1643,8 +1668,7 @@ def _get_list_of_processed_files( meta_processed_samples = _separate_file_url(meta_processed_samples) _LOGGER.info( - f"\nTotal number of processed SAMPLES files found is: " - f"%s" % str(len(meta_processed_samples)) + f"\nTotal number of processed SAMPLES files found is: {str(len(meta_processed_samples))}" ) # expand meta_processed_samples with information about type and size @@ -1677,21 +1701,21 @@ def _get_list_of_processed_files( if bl_key not in meta_processed_series.keys(): meta_processed_series.update(bl) else: - if type(meta_processed_series[bl_key]) is not list: + if not isinstance(meta_processed_series[bl_key], list): meta_processed_series[bl_key] = [meta_processed_series[bl_key]] meta_processed_series[bl_key].append(bl_value) else: meta_processed_series[bl_key].append(bl_value) except IndexError as ind_err: _LOGGER.debug( - f"IndexError in adding value to meta_processed_series: %s" % ind_err + f"IndexError in adding value to meta_processed_series: {ind_err}" ) meta_processed_series = _separate_list_of_files(meta_processed_series) meta_processed_series = _separate_file_url(meta_processed_series) _LOGGER.info( f"Total number of processed SERIES files found is: " - f"%s" % str(len(meta_processed_series)) + f"{str(len(meta_processed_series))}" ) if self.filter_re: meta_processed_series = self._run_filter(meta_processed_series) @@ -1778,6 +1802,7 @@ def _download_processed_file(self, file_url: str, data_folder: str) -> bool: def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): """ Parse out the SRA project identifier from the GSE file + :param list file_gse_content: list of content of file_sde_content :param dict gsm_metadata: dict of GSM metadata :param str file_sra: full path to SRA.csv metafile that has to be downloaded @@ -1805,7 +1830,7 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): acc_SRP = list(gsm_metadata.keys())[0] _LOGGER.warning( "But the GSM has an SRX number; instead of an " - "SRP, using SRX identifier for this sample: " + acc_SRP + f"SRP, using SRX identifier for this sample: {acc_SRP}" ) except TypeError: _LOGGER.warning("Error in gsm_metadata") @@ -1839,7 +1864,7 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): return [] else: # open existing annotation - _LOGGER.info(f"Found SRA metadata, opening..") + _LOGGER.info("Found SRA metadata, opening..") with open(file_sra, "r") as m_file: reader = csv.reader(m_file) file_list = [] @@ -1869,7 +1894,7 @@ def _get_SRP_list(self, srp_number: str) -> list: :return: list of dicts of SRRs """ if not srp_number: - _LOGGER.info(f"No srp number in this accession found") + _LOGGER.info("No srp number in this accession found") return [] _LOGGER.info(f"Downloading {srp_number} sra metadata") ncbi_esearch = NCBI_ESEARCH.format(SRP_NUMBER=srp_number) diff --git a/geofetch/sraconvert.py b/geofetch/sraconvert.py index d524895..d2dd3bc 100755 --- a/geofetch/sraconvert.py +++ b/geofetch/sraconvert.py @@ -144,12 +144,14 @@ def main(): # for paired-end data, and only *_1.fastq for single-end data. 
outfile = "{fq_prefix}_1.fastq.gz".format(fq_prefix=fq_prefix) cmd = "fasterq-dump {data_source} -O {outfolder}".format( - data_source=infile, outfolder=args.fqfolder, nofail=True + data_source=infile, + outfolder=args.fqfolder, ) elif args.format == "bam": outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam") cmd = "sam-dump -u {data_source} | samtools view -bS - > {outfile}".format( - data_source=infile, outfile=outfile, nofail=True + data_source=infile, + outfile=outfile, ) else: raise KeyError("Unknown format: {}".format(args.format)) @@ -160,7 +162,7 @@ def main(): pm.info("Already completed files: {}".format(failed_files)) try: failed_files.remove(infile) - except: + except Exception: pass elif args.mode == "delete_bam": diff --git a/geofetch/utils.py b/geofetch/utils.py index f7af36c..850a77e 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -8,7 +8,7 @@ import requests from io import StringIO import csv -from typing import * +from typing import Union, List, NoReturn, Dict _LOGGER = logging.getLogger(__name__) @@ -48,7 +48,7 @@ def is_known_type(accn: str = None, typename: str = None): try: prefix, number = split_accn(accn) return prefix.upper() in URL_BY_ACC - except: + except Exception: return False @@ -85,9 +85,9 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=N # Read the Run identifiers to download. run_ids = [] with open(file_sra, "r") as f: - for l in f: - if l.startswith("SRR"): - r_id = l.split(",")[0] + for line in f: + if line.startswith("SRR"): + r_id = line.split(",")[0] run_ids.append(r_id) _LOGGER.info("{} run(s)".format(len(run_ids))) for r_id in run_ids: @@ -132,14 +132,14 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=N return acc_GSE_list -def parse_SOFT_line(l: str) -> dict: +def parse_SOFT_line(line: str) -> dict: """ Parse SOFT formatted line, returning a dictionary with the key-value pair. - :param str l: A SOFT-formatted line to parse ( !key = value ) + :param str line: A SOFT-formatted line to parse ( !key = value ) :return dict[str, str]: A python Dict object representing the key-value. 
""" - elems = l[1:].split("=") + elems = line[1:].split("=") return {elems[0].rstrip(): "=".join(elems[1:]).lstrip()} @@ -543,8 +543,8 @@ def _sanitize_config_string(text: str) -> str: :return: sanitized strings """ new_str = text - new_str = new_str.replace('"', f'\\"') - new_str = new_str.replace("'", f"''") + new_str = new_str.replace('"', '\\"') + new_str = new_str.replace("'", "''") return new_str diff --git a/setup.py b/setup.py index fd88b93..2d180f2 100644 --- a/setup.py +++ b/setup.py @@ -16,11 +16,11 @@ def read_reqs(reqs_name): deps = [] with open(os.path.join(REQDIR, "requirements-{}.txt".format(reqs_name)), "r") as f: - for l in f: - if not l.strip(): + for line in f: + if not line.strip(): continue - # deps.append(l.split("=")[0].rstrip("<>")) - deps.append(l) + # deps.append(line.split("=")[0].rstrip("<>")) + deps.append(line) return deps @@ -69,5 +69,5 @@ def read_reqs(reqs_name): setup_requires=( ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] ), - **extra + **extra, ) diff --git a/tests/test_geofetch.py b/tests/test_geofetch.py index ea4fae0..5e5d70f 100644 --- a/tests/test_geofetch.py +++ b/tests/test_geofetch.py @@ -1,7 +1,8 @@ import peppy import geofetch -from geofetch import parse_accessions, Geofetcher, utils +from geofetch import Geofetcher, utils +from geofetch.utils import parse_accessions import os import pytest import shutil @@ -89,10 +90,10 @@ def initiate_geofetcher(self, tmpdir): def test_file_list( self, gse_numb, soft_gse, soft_gsm, sample_len, series_len, initiate_geofetcher ): - file_gse_content = geofetch.Accession(gse_numb).fetch_metadata( + file_gse_content = geofetch.utils.Accession(gse_numb).fetch_metadata( soft_gse, typename="GSE", clean=False ) - file_gsm_content = geofetch.Accession(gse_numb).fetch_metadata( + file_gsm_content = geofetch.utils.Accession(gse_numb).fetch_metadata( soft_gsm, typename="GSM", clean=False ) ( @@ -358,9 +359,21 @@ def test_creating_processed_peppy(self, initiate_geofetcher): def test_number_of_samples(self, initiate_geofetcher): gse_numb = "GSE189141" p_prop = initiate_geofetcher.get_projects(gse_numb) - a = [d["sample_name"] for d in p_prop[f"{gse_numb}_raw"].samples] + # a = [d["sample_name"] for d in p_prop[f"{gse_numb}_raw"].samples] assert len(p_prop[f"{gse_numb}_raw"].samples) == 16 # it has 16 samples + def test_description_created_correctly_series(self, initiate_geofetcher): + gse_numb = "GSE189141" + p_prop = initiate_geofetcher.get_projects(gse_numb) + peppy_obj = p_prop[f"{gse_numb}_raw"].to_dict(extended=True) + assert peppy_obj["_config"]["description"] is not None + + def test_description_created_correctly_samples(self, initiate_geofetcher): + gse_numb = "GSE189141" + p_prop = initiate_geofetcher.get_projects(gse_numb) + peppy_obj = p_prop[f"{gse_numb}_raw"].to_dict(extended=True) + assert peppy_obj["_config"]["description"] is not None + def test_clean_func(tmpdir): """