From 133bcb710b22d2d850a94805da259b9dd84fe4df Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 29 Nov 2023 13:01:57 -0500 Subject: [PATCH] Fixed bug in populating description --- docs/changelog.md | 3 ++ geofetch/__init__.py | 7 ++-- geofetch/_version.py | 2 +- geofetch/finder.py | 2 +- geofetch/geofetch.py | 81 +++++++++++++++++++++++++++--------------- geofetch/sraconvert.py | 8 +++-- geofetch/utils.py | 20 +++++------ setup.py | 10 +++--- tests/test_geofetch.py | 21 ++++++++--- 9 files changed, 99 insertions(+), 55 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 66e77b0..2846978 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,8 @@ # Changelog +## [0.12.5] -- 2023-11-29 +- Fixed bug, where description was not populated in PEP + ## [0.12.4] -- 2023-08-01 - Fixed SRA convert - Added how to convert SRA diff --git a/geofetch/__init__.py b/geofetch/__init__.py index 5065195..8e208d1 100644 --- a/geofetch/__init__.py +++ b/geofetch/__init__.py @@ -1,13 +1,14 @@ """ Package-level data """ import logmuse +import coloredlogs -from geofetch.geofetch import * -from geofetch.finder import * +from geofetch.geofetch import Geofetcher +from geofetch.finder import Finder from geofetch._version import __version__ __author__ = ["Oleksandr Khoroshevskyi", "Vince Reuter", "Nathan Sheffield"] -__all__ = ["Finder", "Geofetcher"] +__all__ = ["Finder", "Geofetcher", "__version__"] _LOGGER = logmuse.init_logger("geofetch") coloredlogs.install( diff --git a/geofetch/_version.py b/geofetch/_version.py index 6dd4954..8e377d6 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.12.4" +__version__ = "0.12.5" diff --git a/geofetch/finder.py b/geofetch/finder.py index 11b3bfb..e41405e 100644 --- a/geofetch/finder.py +++ b/geofetch/finder.py @@ -127,7 +127,7 @@ def _run_search_query(url: str) -> list: """ x = requests.get(url) if x.status_code != 200: - _LOGGER.error(f"Request status != 200. Error. Check your request") + _LOGGER.error("Request status != 200. Error. Check your request") return [] try: x_result = xmltodict.parse(x.text)["eSearchResult"] diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index 4703686..932fd74 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -17,7 +17,24 @@ import pandas as pd from geofetch.cli import _parse_cmdl -from geofetch.const import * +from geofetch.const import ( + GSE_PATTERN, + SAMPLE_SUPP_METADATA_FILE, + EXP_SUPP_METADATA_FILE, + NEW_GENOME_COL_NAME, + FILE_RAW_NAME_SAMPLE_PATTERN, + FILE_RAW_NAME_SUBSAMPLE_PATTERN, + CONFIG_RAW_TEMPLATE_NAME, + CONFIG_SRA_TEMPLATE, + CONFIG_PROCESSED_TEMPLATE_NAME, + NUM_RETRIES, + SER_SUPP_FILE_PATTERN, + SUPP_FILE_PATTERN, + PROJECT_PATTERN, + NCBI_EFETCH, + NCBI_ESEARCH, + EXPERIMENT_PATTERN, +) from geofetch.utils import ( Accession, build_prefetch_command, @@ -480,8 +497,8 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje file_gse_content, gsm_metadata, file_sra ) if not srp_list_result: - _LOGGER.info(f"No SRP data, continuing ....") - _LOGGER.warning(f"No raw pep will be created! ....") + _LOGGER.info("No SRP data, continuing ....") + _LOGGER.warning("No raw pep will be created! 
....") # delete current acc if no raw data was found # del metadata_dict[acc_GSE] pass @@ -498,7 +515,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje _LOGGER.info(f"Getting SRR: {run} in ({acc_GSE})") self._download_raw_data(run) else: - _LOGGER.info(f"Dry run, no data will be downloaded") + _LOGGER.info("Dry run, no data will be downloaded") # save one project if self.acc_anno and nkeys > 1: @@ -517,7 +534,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje # Logging cleaning process: if self.discard_soft: - _LOGGER.info(f"Cleaning soft files ...") + _LOGGER.info("Cleaning soft files ...") clean_soft_files(self.metadata_root_full) ####################################################################################### @@ -878,7 +895,7 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): if element_is_list: for n_elem in range(len(metadata_list)): try: - if type(metadata_list[n_elem][dict_key]) is not list: + if not isinstance(metadata_list[n_elem][dict_key], list): metadata_list[n_elem][dict_key] = [ metadata_list[n_elem][dict_key] ] @@ -930,7 +947,7 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): metadata_list[n_elem][dict_key] = this_string else: del metadata_list[n_elem][dict_key] - except KeyError as err: + except KeyError: # _LOGGER.warning( # f"expand_metadata_list: Key Error: {err}, continuing ..." # ) @@ -980,6 +997,7 @@ def _write_processed_annotation( ) -> Union[NoReturn, peppy.Project]: """ Save annotation file by providing list of dictionaries with files metadata + :param list processed_metadata: list of dictionaries with files metadata :param str file_annotation_path: the path to the metadata file that has to be saved :param just_object: True, if you want to get peppy object without saving file @@ -1046,13 +1064,14 @@ def _write_processed_annotation( proj = peppy.Project().from_pandas(pd_value, config=conf) proj_exp_data = conf.get("experiment_metadata") if proj_exp_data: - proj["description"] = proj_exp_data.get("series_title") + proj.description = proj_exp_data.get("series_title") return proj @staticmethod def _find_genome(metadata_list: list) -> list: """ Create new genome column by searching joining few columns + :param metadata_list: list with metadata dict :return: list with metadata dict where genome column was added """ @@ -1080,6 +1099,7 @@ def _write_raw_annotation_new( """ Combine individual accessions into project-level annotations, and writing individual accession files (if requested) + :param name: Name of the run, project, or acc --> will influence name of the folder where project will be created :param metadata_dict: dictionary of sample annotations :param subannot_dict: dictionary of subsample annotations @@ -1128,7 +1148,7 @@ def _write_raw_annotation_new( f"subsample_table: {os.path.basename(proj_root_subsample)}" ) else: - subanot_path_yaml = f"" + subanot_path_yaml = "" template = self._create_config_raw( proj_meta, proj_root_sample, subanot_path_yaml, gse_meta_dict @@ -1166,7 +1186,7 @@ def _write_raw_annotation_new( proj = peppy.Project().from_pandas(meta_df, sub_meta_df, conf) proj_exp_data = conf.get("experiment_metadata") if proj_exp_data: - proj["description"] = proj_exp_data.get("series_title") + proj.description = proj_exp_data.get("series_title") return proj def _create_config_processed( @@ -1177,6 +1197,7 @@ def _create_config_processed( ) -> str: """ Compose and generate config file content + :param file_annotation_path: 
root to the annotation file :param proj_meta: common metadata that has to added to config file :param meta_in_series: @@ -1218,6 +1239,7 @@ def _create_config_raw( ): """ Compose and generate config file content for raw data + :param proj_meta: root to the annotation file :param proj_root_sample: path to sampletable file :param subanot_path_yaml: path to subannotation file @@ -1275,6 +1297,7 @@ def _check_sample_name_standard(metadata_dict: dict) -> dict: """ Standardize sample name and checking if it exists (This function is used for raw data) + :param metadata_dict: metadata dict :return: metadata dict with standardize sample names """ @@ -1300,14 +1323,16 @@ def _separate_common_meta( ) -> tuple: """ Separate experiment(project) metadata from sample metadata + :param list or dict meta_list: list of dictionaries of samples :param int max_len: threshold of the length of the common value that can be stored in the sample table :param int del_limit: threshold of the length of the common value that have to be deleted :param int attr_limit_truncate: max length of the attribute in the sample csv :return set: Return is a set of list, where 1 list (or dict) is - list of samples metadata dictionaries and 2: list of common samples metadata - dictionaries that are linked to the project. + list of samples metadata dictionaries and 2: list of common samples metadata + dictionaries that are linked to the project. """ + # check if meta_list is dict and converting it to list input_is_dict = False if isinstance(meta_list, dict): @@ -1401,6 +1426,7 @@ def _download_SRA_file(self, run_name: str): def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoReturn: """ Convert SRA file to BAM file by using samtools function "sam-dump" + :param str bam_file: path to BAM file that has to be created :param str run_name: SRR number of the SRA file that has to be converted """ @@ -1509,7 +1535,7 @@ def _download_file( full_filepath = os.path.join(data_folder, new_name) if not os.path.exists(full_filepath): - _LOGGER.info(f"\033[38;5;242m") # set color to gray + _LOGGER.info("\033[38;5;242m") # set color to gray # if dir does not exist: if not os.path.exists(data_folder): os.makedirs(data_folder) @@ -1518,7 +1544,7 @@ def _download_file( ) _LOGGER.info(f"\033[38;5;242m{ret}\033[0m") time.sleep(sleep_after) - _LOGGER.info(f"\033[0m") # Reset to default terminal color + _LOGGER.info("\033[0m") # Reset to default terminal color else: _LOGGER.info(f"\033[38;5;242mFile {full_filepath} exists.\033[0m") @@ -1545,7 +1571,7 @@ def _get_list_of_processed_files( pl = parse_SOFT_line(line) file_url = pl[list(pl.keys())[0]].rstrip() filename = os.path.basename(file_url) - _LOGGER.debug(f"Processed GSE file found: %s" % str(file_url)) + _LOGGER.debug(f"Processed GSE file found: {str(file_url)}") # search for tar file: if tar_re.search(filename): @@ -1574,7 +1600,7 @@ def _get_list_of_processed_files( ) else: - raise Exception(f"error in requesting tar_files_list") + raise Exception("error in requesting tar_files_list") else: _LOGGER.info(f"Found previous GSM file: {filelist_path}") filelist_obj = open(filelist_path, "r") @@ -1610,9 +1636,8 @@ def _get_list_of_processed_files( ): meta_processed_samples[nb].update(pl) else: - if ( - type(meta_processed_samples[nb][element_keys]) - is not list + if not isinstance( + meta_processed_samples[nb][element_keys], list ): meta_processed_samples[nb][element_keys] = [ meta_processed_samples[nb][element_keys] @@ -1631,7 +1656,7 @@ def _get_list_of_processed_files( pl = 
parse_SOFT_line(line_gsm) file_url_gsm = pl[list(pl.keys())[0]].rstrip() _LOGGER.debug( - f"Processed GSM file found: %s" % str(file_url_gsm) + f"Processed GSM file found: {str(file_url_gsm)}" ) if file_url_gsm != "NONE": meta_processed_samples[nb]["files"].append(file_url_gsm) @@ -1643,8 +1668,7 @@ def _get_list_of_processed_files( meta_processed_samples = _separate_file_url(meta_processed_samples) _LOGGER.info( - f"\nTotal number of processed SAMPLES files found is: " - f"%s" % str(len(meta_processed_samples)) + f"\nTotal number of processed SAMPLES files found is: {str(len(meta_processed_samples))}" ) # expand meta_processed_samples with information about type and size @@ -1677,21 +1701,21 @@ def _get_list_of_processed_files( if bl_key not in meta_processed_series.keys(): meta_processed_series.update(bl) else: - if type(meta_processed_series[bl_key]) is not list: + if not isinstance(meta_processed_series[bl_key], list): meta_processed_series[bl_key] = [meta_processed_series[bl_key]] meta_processed_series[bl_key].append(bl_value) else: meta_processed_series[bl_key].append(bl_value) except IndexError as ind_err: _LOGGER.debug( - f"IndexError in adding value to meta_processed_series: %s" % ind_err + f"IndexError in adding value to meta_processed_series: {ind_err}" ) meta_processed_series = _separate_list_of_files(meta_processed_series) meta_processed_series = _separate_file_url(meta_processed_series) _LOGGER.info( f"Total number of processed SERIES files found is: " - f"%s" % str(len(meta_processed_series)) + f"{str(len(meta_processed_series))}" ) if self.filter_re: meta_processed_series = self._run_filter(meta_processed_series) @@ -1778,6 +1802,7 @@ def _download_processed_file(self, file_url: str, data_folder: str) -> bool: def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): """ Parse out the SRA project identifier from the GSE file + :param list file_gse_content: list of content of file_sde_content :param dict gsm_metadata: dict of GSM metadata :param str file_sra: full path to SRA.csv metafile that has to be downloaded @@ -1805,7 +1830,7 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): acc_SRP = list(gsm_metadata.keys())[0] _LOGGER.warning( "But the GSM has an SRX number; instead of an " - "SRP, using SRX identifier for this sample: " + acc_SRP + f"SRP, using SRX identifier for this sample: {acc_SRP}" ) except TypeError: _LOGGER.warning("Error in gsm_metadata") @@ -1839,7 +1864,7 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): return [] else: # open existing annotation - _LOGGER.info(f"Found SRA metadata, opening..") + _LOGGER.info("Found SRA metadata, opening..") with open(file_sra, "r") as m_file: reader = csv.reader(m_file) file_list = [] @@ -1869,7 +1894,7 @@ def _get_SRP_list(self, srp_number: str) -> list: :return: list of dicts of SRRs """ if not srp_number: - _LOGGER.info(f"No srp number in this accession found") + _LOGGER.info("No srp number in this accession found") return [] _LOGGER.info(f"Downloading {srp_number} sra metadata") ncbi_esearch = NCBI_ESEARCH.format(SRP_NUMBER=srp_number) diff --git a/geofetch/sraconvert.py b/geofetch/sraconvert.py index d524895..d2dd3bc 100755 --- a/geofetch/sraconvert.py +++ b/geofetch/sraconvert.py @@ -144,12 +144,14 @@ def main(): # for paired-end data, and only *_1.fastq for single-end data. 
outfile = "{fq_prefix}_1.fastq.gz".format(fq_prefix=fq_prefix) cmd = "fasterq-dump {data_source} -O {outfolder}".format( - data_source=infile, outfolder=args.fqfolder, nofail=True + data_source=infile, + outfolder=args.fqfolder, ) elif args.format == "bam": outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam") cmd = "sam-dump -u {data_source} | samtools view -bS - > {outfile}".format( - data_source=infile, outfile=outfile, nofail=True + data_source=infile, + outfile=outfile, ) else: raise KeyError("Unknown format: {}".format(args.format)) @@ -160,7 +162,7 @@ def main(): pm.info("Already completed files: {}".format(failed_files)) try: failed_files.remove(infile) - except: + except Exception: pass elif args.mode == "delete_bam": diff --git a/geofetch/utils.py b/geofetch/utils.py index f7af36c..850a77e 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -8,7 +8,7 @@ import requests from io import StringIO import csv -from typing import * +from typing import Union, List, NoReturn, Dict _LOGGER = logging.getLogger(__name__) @@ -48,7 +48,7 @@ def is_known_type(accn: str = None, typename: str = None): try: prefix, number = split_accn(accn) return prefix.upper() in URL_BY_ACC - except: + except Exception: return False @@ -85,9 +85,9 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=N # Read the Run identifiers to download. run_ids = [] with open(file_sra, "r") as f: - for l in f: - if l.startswith("SRR"): - r_id = l.split(",")[0] + for line in f: + if line.startswith("SRR"): + r_id = line.split(",")[0] run_ids.append(r_id) _LOGGER.info("{} run(s)".format(len(run_ids))) for r_id in run_ids: @@ -132,14 +132,14 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=N return acc_GSE_list -def parse_SOFT_line(l: str) -> dict: +def parse_SOFT_line(line: str) -> dict: """ Parse SOFT formatted line, returning a dictionary with the key-value pair. - :param str l: A SOFT-formatted line to parse ( !key = value ) + :param str line: A SOFT-formatted line to parse ( !key = value ) :return dict[str, str]: A python Dict object representing the key-value. 
""" - elems = l[1:].split("=") + elems = line[1:].split("=") return {elems[0].rstrip(): "=".join(elems[1:]).lstrip()} @@ -543,8 +543,8 @@ def _sanitize_config_string(text: str) -> str: :return: sanitized strings """ new_str = text - new_str = new_str.replace('"', f'\\"') - new_str = new_str.replace("'", f"''") + new_str = new_str.replace('"', '\\"') + new_str = new_str.replace("'", "''") return new_str diff --git a/setup.py b/setup.py index fd88b93..2d180f2 100644 --- a/setup.py +++ b/setup.py @@ -16,11 +16,11 @@ def read_reqs(reqs_name): deps = [] with open(os.path.join(REQDIR, "requirements-{}.txt".format(reqs_name)), "r") as f: - for l in f: - if not l.strip(): + for line in f: + if not line.strip(): continue - # deps.append(l.split("=")[0].rstrip("<>")) - deps.append(l) + # deps.append(line.split("=")[0].rstrip("<>")) + deps.append(line) return deps @@ -69,5 +69,5 @@ def read_reqs(reqs_name): setup_requires=( ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] ), - **extra + **extra, ) diff --git a/tests/test_geofetch.py b/tests/test_geofetch.py index ea4fae0..5e5d70f 100644 --- a/tests/test_geofetch.py +++ b/tests/test_geofetch.py @@ -1,7 +1,8 @@ import peppy import geofetch -from geofetch import parse_accessions, Geofetcher, utils +from geofetch import Geofetcher, utils +from geofetch.utils import parse_accessions import os import pytest import shutil @@ -89,10 +90,10 @@ def initiate_geofetcher(self, tmpdir): def test_file_list( self, gse_numb, soft_gse, soft_gsm, sample_len, series_len, initiate_geofetcher ): - file_gse_content = geofetch.Accession(gse_numb).fetch_metadata( + file_gse_content = geofetch.utils.Accession(gse_numb).fetch_metadata( soft_gse, typename="GSE", clean=False ) - file_gsm_content = geofetch.Accession(gse_numb).fetch_metadata( + file_gsm_content = geofetch.utils.Accession(gse_numb).fetch_metadata( soft_gsm, typename="GSM", clean=False ) ( @@ -358,9 +359,21 @@ def test_creating_processed_peppy(self, initiate_geofetcher): def test_number_of_samples(self, initiate_geofetcher): gse_numb = "GSE189141" p_prop = initiate_geofetcher.get_projects(gse_numb) - a = [d["sample_name"] for d in p_prop[f"{gse_numb}_raw"].samples] + # a = [d["sample_name"] for d in p_prop[f"{gse_numb}_raw"].samples] assert len(p_prop[f"{gse_numb}_raw"].samples) == 16 # it has 16 samples + def test_description_created_correctly_series(self, initiate_geofetcher): + gse_numb = "GSE189141" + p_prop = initiate_geofetcher.get_projects(gse_numb) + peppy_obj = p_prop[f"{gse_numb}_raw"].to_dict(extended=True) + assert peppy_obj["_config"]["description"] is not None + + def test_description_created_correctly_samples(self, initiate_geofetcher): + gse_numb = "GSE189141" + p_prop = initiate_geofetcher.get_projects(gse_numb) + peppy_obj = p_prop[f"{gse_numb}_raw"].to_dict(extended=True) + assert peppy_obj["_config"]["description"] is not None + def test_clean_func(tmpdir): """