diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml index 871b7235..68880657 100644 --- a/.github/workflows/pypi_publish.yml +++ b/.github/workflows/pypi_publish.yml @@ -47,42 +47,3 @@ jobs: path: dist/ - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - - github-release: - name: Sign dist with Sigstore and upload to GitHub Release - needs: - - publish-to-pypi - runs-on: ubuntu-latest - permissions: - contents: write - id-token: write - steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Sign the dists with Sigstore - uses: sigstore/gh-action-sigstore-python@v2.1.1 - with: - inputs: >- - ./dist/*.tar.gz - ./dist/*.whl - - name: Create GitHub Release - env: - GITHUB_TOKEN: ${{ github.token }} - run: >- - gh release create - '${{ github.ref_name }}' - --repo '${{ github.repository }}' - --notes "" - - name: Upload artifact signatures to GitHub Release - env: - GITHUB_TOKEN: ${{ github.token }} - # Upload to GitHub Release using the `gh` CLI. - # `dist/` contains the built packages, and the - # sigstore-produced signatures and certificates. - run: >- - gh release upload - '${{ github.ref_name }}' dist/** - --repo '${{ github.repository }}' \ No newline at end of file diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml index 9d043bbd..45dbe4f3 100644 --- a/.github/workflows/python_lint.yml +++ b/.github/workflows/python_lint.yml @@ -21,8 +21,19 @@ jobs: uses: actions/checkout@master - name: Install flake8 run: pip install flake8 + - name: Check for Python file changes + id: file_check + uses: tj-actions/changed-files@v44 + with: + sha: ${{ github.event.pull_request.head.sha }} + files: | + **.py - name: Run flake8 + if: steps.file_check.outputs.any_changed == 'true' run: flake8 --ignore E501,W503,E203,W605 + - name: No Python files changed + if: steps.file_check.outputs.any_changed != 'true' + run: echo "No Python files have been changed." black_lint: runs-on: ubuntu-latest @@ -31,5 +42,15 @@ jobs: uses: actions/checkout@v2 - name: Install black in jupyter run: pip install black[jupyter] + - name: Check for Python file changes + id: file_check + uses: tj-actions/changed-files@v44 + with: + sha: ${{ github.event.pull_request.head.sha }} + files: '**.py' - name: Check code lints with Black + if: steps.file_check.outputs.any_changed == 'true' uses: psf/black@stable + - name: No Python files changed + if: steps.file_check.outputs.any_changed != 'true' + run: echo "No Python files have been changed." 
diff --git a/.github/workflows/test_sftp_handle.yml b/.github/workflows/test_sftp_handle.yml index a43a670f..086d4d42 100644 --- a/.github/workflows/test_sftp_handle.yml +++ b/.github/workflows/test_sftp_handle.yml @@ -1,12 +1,14 @@ name: test_sftp_handle on: - push: - branches: "**" pull_request_target: - types: [opened, reopened, synchronize, closed] + types: [opened, reopened, synchronize] branches: "**" - + +concurrency: + group: ${{ github.repository }}-test_sftp_handle + cancel-in-progress: false + jobs: security_check: runs-on: ubuntu-latest @@ -24,21 +26,10 @@ jobs: echo "Current permission level is ${{ steps.checkAccess.outputs.user-permission }}" echo "Job originally triggered by ${{ github.actor }}" exit 1 - - sleep_to_ensure_concurrency: - needs: security_check - runs-on: ubuntu-latest - steps: - - name: - run: sleep 10s - shell: bash test_sftp_handle: - needs: [security_check, sleep_to_ensure_concurrency] + needs: security_check if: github.repository_owner == 'BU-ISCIII' - concurrency: - group: ${{ github.repository }}-test_sftp_handle - cancel-in-progress: false runs-on: ubuntu-latest strategy: max-parallel: 1 diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c719d95..fd20a0a1 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,29 +4,71 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [1.X.Xdev] - 2024-XX-XX : https://github.com/BU-ISCIII/relecov-tools/releases/tag/1.X.X +## [1.2.0] - 2024-10-11 : https://github.com/BU-ISCIII/relecov-tools/releases/tag/1.2.0 ### Credits -Code contributions to the hotfix: +Code contributions to the release: + +- [Juan Ledesma](https://github.com/juanledesma78) +- [Pablo Mata](https://github.com/Shettland) +- [Sergio Olmos](https://github.com/OPSergio) ### Modules +- Included wrapper module to launch download, read-lab-metadata and validate processes sequentially [#322](https://github.com/BU-ISCIII/relecov-tools/pull/322) +- Changed launch-pipeline name for pipeline-manager when tools are used via CLI [#324](https://github.com/BU-ISCIII/relecov-tools/pull/324) + #### Added enhancements +- Now also check for gzip file integrity after download. 
Moved cleaning process to end of workflow [#313](https://github.com/BU-ISCIII/relecov-tools/pull/313) +- Introduced a decorator in sftp_client.py to reconnect when connection is lost [#313](https://github.com/BU-ISCIII/relecov-tools/pull/313) +- Added Hospital Universitari Doctor Josep Trueta to laboratory_address.json [#316](https://github.com/BU-ISCIII/relecov-tools/pull/316) +- samples_data json file is no longer mandatory as input in read-lab-metadata [#314](https://github.com/BU-ISCIII/relecov-tools/pull/314) +- Included handling of alternative column names to support two distinct headers using the same schema in read-lab-metadata [#314](https://github.com/BU-ISCIII/relecov-tools/pull/314) +- Added a new hospital (Hospital Universitario Araba) to laboratory_address.json [#315](https://github.com/BU-ISCIII/relecov-tools/pull/315) +- More accurate cleaning process, skipping only sequencing files instead of the whole folder [#321](https://github.com/BU-ISCIII/relecov-tools/pull/321) +- Now single log summaries are also created for each folder during download [#321](https://github.com/BU-ISCIII/relecov-tools/pull/321) +- Introduced handling for missing/dup files and more accurate information in prompt for pipeline_manager [#321](https://github.com/BU-ISCIII/relecov-tools/pull/321) +- Included excel resize, brackets removal in messages and handled exceptions in log_summary.py [#322](https://github.com/BU-ISCIII/relecov-tools/pull/322) +- Included processed batches and samples in read-bioinfo-metadata log summary [#324](https://github.com/BU-ISCIII/relecov-tools/pull/324) +- When no samples_data.json is given, read-lab-metadata now creates a new one [#324](https://github.com/BU-ISCIII/relecov-tools/pull/324) +- Handling for missing sample IDs in read-lab-metadata [#324](https://github.com/BU-ISCIII/relecov-tools/pull/324) +- Better logging for download, read-lab-metadata and wrapper [#324](https://github.com/BU-ISCIII/relecov-tools/pull/324) + #### Fixes +- Fixed wrong city name in relecov_tools/conf/laboratory_address.json [#320](https://github.com/BU-ISCIII/relecov-tools/pull/320) +- Fixed wrong single-paired layout detection in metadata due to capital letters [#321](https://github.com/BU-ISCIII/relecov-tools/pull/321) +- Error handling in merge_logs() and create_logs_excel() methods in log_summary.py [#322](https://github.com/BU-ISCIII/relecov-tools/pull/322) +- Included handling of multiple empty rows in metadata xlsx file [#322](https://github.com/BU-ISCIII/relecov-tools/pull/322) + #### Changed +- Renamed and refactored "bioinfo_heading" to "alt_heading_equivalences" in configuration.json [#314](https://github.com/BU-ISCIII/relecov-tools/pull/314) +- Included a few schema fields that were missing or outdated, related to bioinformatics results [#314](https://github.com/BU-ISCIII/relecov-tools/pull/314) +- Updated metadata excel template, moved to relecov_tools/assets [#320](https://github.com/BU-ISCIII/relecov-tools/pull/320) +- Now python lint only triggers when the PR includes python files [#320](https://github.com/BU-ISCIII/relecov-tools/pull/320) +- Moved concurrency to the whole workflow instead of each job in test_sftp-handle.yml [#320](https://github.com/BU-ISCIII/relecov-tools/pull/320) +- Updated test_sftp-handle.yml testing datasets [#320](https://github.com/BU-ISCIII/relecov-tools/pull/320) +- Now download skips folders containing "invalid_samples" in their name [#321](https://github.com/BU-ISCIII/relecov-tools/pull/321) +- read-lab-metadata: Some warnings now include a label.
Also removed trailing spaces [#322](https://github.com/BU-ISCIII/relecov-tools/pull/322) +- Renamed launch-pipeline for pipeline-manager and updated keys in configuration.json [#324](https://github.com/BU-ISCIII/relecov-tools/pull/324) +- Pipeline manager now splits data based on enrichment_panel and version. One folder for each group [#324](https://github.com/BU-ISCIII/relecov-tools/pull/324) + #### Removed +- Removed duplicated tests with pushes after PR was merged in test_sftp-handle [#312](https://github.com/BU-ISCIII/relecov-tools/pull/312) +- Deleted deprecated auto-release in pypi_publish as it does not work with tag pushes anymore [#312](https://github.com/BU-ISCIII/relecov-tools/pull/312) +- Removed first sleep time for reconnection decorator in sftp_client.py, sleep time now increases in the second attempt [#321](https://github.com/BU-ISCIII/relecov-tools/pull/321) + ### Requirements ## [1.1.0] - 2024-09-13 : https://github.com/BU-ISCIII/relecov-tools/releases/tag/1.1.0 ### Credits -Code contributions to the hotfix: +Code contributions to the release: - [Pablo Mata](https://github.com/Shettland) - [Sara Monzón](https://github.com/saramonzon) diff --git a/README.md b/README.md index 8d9b7453..a24351d9 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,8 @@ relecov-tools is a set of helper tools for the assembly of the different element - [upload-to-ena](#upload-to-ena) - [upload-to-gisaid](#upload-to-gisaid) - [update-db](#update-db) - - [launch-pipeline](#launch-pipeline) + - [pipeline-manager](#pipeline-manager) + - [wrapper](#wrapper) - [logs-to-excel](#logs-to-excel) - [build-schema](#build-schema) - [Mandatory Fields](#mandatory-fields) @@ -63,7 +64,7 @@ $ relecov-tools --help \ \ / |__ / |__ | |___ | | | \ / / / \ | \ | | | | | | \ / / |--| | \ |___ |___ |___ |___ |___| \/ -RELECOV-tools version 1.1.0 +RELECOV-tools version 1.2.0 Usage: relecov-tools [OPTIONS] COMMAND [ARGS]... Options: @@ -73,16 +74,19 @@ Options: --help Show this message and exit. Commands: - download Download files located in sftp server. - read-lab-metadata Create the json compliant to the relecov schema from... - read-bioinfo-metadata Create the json compliant to the relecov schema with Bioinfo Metadata. - validate Validate json file against schema. - map Convert data between phage plus schema to ENA,... - upload-to-ena parsed data to create xml files to upload to ena - upload-to-gisaid parsed data to create files to upload to gisaid - update-db feed database with metadata jsons - build-schema Generates and updates JSON Schema files from... - launch-pipeline Create the symbolic links for the samples which... + download Download files located in sftp server. + read-lab-metadata Create the json compliant to the relecov schema... + validate Validate json file against schema. + map Convert data between phage plus schema to ENA,... + upload-to-ena parse data to create xml files to upload to ena + upload-to-gisaid parsed data to create files to upload to gisaid + update-db upload the information included in json file to... + read-bioinfo-metadata Create the json compliant from the Bioinfo... + metadata-homogeneizer Parse institution metadata lab to the one used... + pipeline-manager Create the symbolic links for the samples which... + wrapper Execute download, read-lab-metadata and validate... + build-schema Generates and updates JSON Schema files from... + logs-to-excel Creates a merged xlsx report from all the log... 
``` #### download The command `download` connects to a transfer protocol (currently sftp) and downloads all files in the different available folders in the passed credentials. In addition, it checks if the files in the current folder match the files in the metadata file and also checks if there are md5sum for each file. Else, it creates one before storing in the final repository. @@ -247,10 +251,10 @@ Usage: relecov-tools upload-to-gisaid [OPTIONS] -t, --type Select the type of information to upload to database [sample,bioinfodata,variantdata] -d, --databaseServer Name of the database server receiving the data [iskylims,relecov] -#### launch-pipeline +#### pipeline-manager Create the folder structure to execute the given pipeline for the latest sample batches after executing download, read-lab-metadata and validate modules. This module will create symbolic links for each sample and generate the necessary files for pipeline execution using the information from validated_BATCH-NAME_DATE.json. ``` -Usage: relecov-tools launch-pipeline [OPTIONS] +Usage: relecov-tools pipeline-manager [OPTIONS] Create the symbolic links for the samples which are validated to prepare for bioinformatics pipeline execution. @@ -263,6 +267,19 @@ Options: --help Show this message and exit. ``` +#### wrapper +Execute download, read-lab-metadata and validate sequentially using a config file to fill the arguments for each one. It also creates a global report with all the logs for the three processes in a user-friendly .xlsx format. The config file should include the name of each module that is executed, along with the necessary parameters in YAML format. +``` +Usage: relecov-tools wrapper [OPTIONS] + + Executes the modules in config file sequentially + +Options: + -c, --config_file PATH Path to config file in yaml format [required] + -o, --output_folder PATH Path to folder where global results are saved [required] + --help Show this message and exit. +``` + #### logs-to-excel Creates an xlsx file with all the entries found for a specified laboratory in a given set of log_summary.json files (from log-summary module). The laboratory name must match the name of one of the keys in the provided logs to work. ``` diff --git a/relecov_tools/__main__.py b/relecov_tools/__main__.py index 978d17b9..0ef86437 100755 --- a/relecov_tools/__main__.py +++ b/relecov_tools/__main__.py @@ -24,6 +24,7 @@ import relecov_tools.upload_ena_protocol import relecov_tools.pipeline_manager import relecov_tools.build_schema +import relecov_tools.dataprocess_wrapper log = logging.getLogger() @@ -61,7 +62,7 @@ def run_relecov_tools(): ) # stderr.print("[green] `._,._,'\n", highlight=False) - __version__ = "1.1.0" + __version__ = "1.2.0" stderr.print( "\n" "[grey39] RELECOV-tools version {}".format(__version__), highlight=False ) @@ -476,12 +477,12 @@ def metadata_homogeneizer(institution, directory, output): help="select the template config file", ) @click.option("-o", "--output", type=click.Path(), help="select output folder") -def launch_pipeline(input, template, output, config): +def pipeline_manager(input, template, output, config): """ Create the symbolic links for the samples which are validated to prepare for bioinformatics pipeline execution. 
""" - new_launch = relecov_tools.pipeline_manager.LaunchPipeline( + new_launch = relecov_tools.pipeline_manager.PipelineManager( input, template, output, config ) new_launch.pipeline_exc() @@ -565,7 +566,8 @@ def logs_to_excel(lab_code, output_folder, files): logsum = relecov_tools.log_summary.LogSum(output_location=output_folder) merged_logs = logsum.merge_logs(key_name=lab_code, logs_list=all_logs) final_logs = logsum.prepare_final_logs(logs=merged_logs) - logsum.create_logs_excel(logs=final_logs) + excel_outpath = os.path.join(output_folder, lab_code + "_logs_report.xlsx") + logsum.create_logs_excel(logs=final_logs, excel_outpath=excel_outpath) @relecov_tools_cli.command(help_priority=16) @@ -573,14 +575,14 @@ def logs_to_excel(lab_code, output_folder, files): "-c", "--config_file", type=click.Path(), - help="Path to config file in yaml format", + help="Path to config file in yaml format [required]", required=True, ) @click.option( "-o", "--output_folder", type=click.Path(), - help="Path to the base schema file. This file is used as a reference to compare it with the schema generated using this module. (Default: installed schema in 'relecov-tools/relecov_tools/schema/relecov_schema.json')", + help="Path to folder where global results are saved [required]", required=False, ) def wrapper(config_file, output_folder): diff --git a/relecov_tools/assets/Relecov_metadata_template_v2.0.11.xlsx b/relecov_tools/assets/Relecov_metadata_template_v2.0.11.xlsx new file mode 100644 index 00000000..0ebdda52 Binary files /dev/null and b/relecov_tools/assets/Relecov_metadata_template_v2.0.11.xlsx differ diff --git a/relecov_tools/conf/configuration.json b/relecov_tools/conf/configuration.json index 724811e1..d539d19f 100755 --- a/relecov_tools/conf/configuration.json +++ b/relecov_tools/conf/configuration.json @@ -56,42 +56,28 @@ "Sequence file R1 fastq", "Sequence file R2 fastq" ], - "bioinfo_heading": [ - "Consensus sequence filename", - "VCF filename", - "Variant designation table filename", - "Bioinformatics protocol", - "If bioinformatics protocol Is Other, Specify", - "Bioinformatics protocol version", - "Commercial/Open-source/both", - "Preprocessing software", - "Preprocessing software version", - "If preprocessing Is Other, Specify", - "Preprocessing params", - "Mapping software", - "Mapping software version", - "If mapping Is Other, Specify", - "Mapping params", - "Assembly software", - "Assembly software version", - "If assembly Is Other, Specify", - "Assembly params", - "Variant calling software", - "Variant calling software version", - "If variant calling Is Other, Specify", - "Variant calling params", - "Consensus software", - "Consensus software version", - "If consensus Is Other, Specify", - "Consensus params", - "Clade/Type identification software", - "Clade/Type software version", - "If Clade/Type Is Other, Specify", - "Lineage identification software", - "Lineage software version", - "If lineage identification Is Other, Specify", - "Quality control metrics (sample discard criteria)" - ], + "alt_heading_equivalences": { + "Sample ID": "Sample ID given for sequencing", + "LAB ID" : "Originating Laboratory", + "Sequencing date\n Formato: YYYY-MM-DD" : "Sequencing Date", + "Commercial All-in-one library kit" : "Commercial All-in-one library kit", + "Library preparation kit " : "Library Preparation Kit", + "Enrichment protocol" : "Enrichment Protocol", + "if enrichment protocol is Other, specify" : "If Enrichment Protocol Is Other, Specify", + "Amplicon protocol" : "Enrichment panel/assay", 
+ "if enrichment panel/assay is Other, specify" : "If Enrichment panel/assay Is Other, Specify", + "Amplicon version" : "Enrichment panel/assay version", + "Number Of Samples In Run" : "Number Of Samples In Run", + "Number of variants with effect (missense, frameshit, stop codon)" : "Number of variants with effect", + "Sequencing platforms (Illumina, Nanopore, IonTorrent, PacBio, other)" : "Sequencing Instrument Model", + "Variant designation table filename" : "Lineage analysis file", + "Library preparation kit" : "Library Preparation Kit", + "Library layout" : "Library Layout", + "Read lenght" : "Read length", + "SARS-CoV-2 Lineage designation" : "Lineage designation", + "fastq filename R1" : "Sequence file R1 fastq", + "fastq filename R2" : "Sequence file R2 fastq" + }, "lab_metadata_req_json": { "laboratory_data": { "file": "laboratory_address.json", @@ -240,8 +226,12 @@ "sftp_port": "22" }, "metadata_processing": { + "sample_id_col": "Sample ID given for sequencing", "header_flag": "CAMPO", - "excel_sheet": "METADATA_LAB" + "excel_sheet": "METADATA_LAB", + "alternative_sheet": "5.Viral Characterisation and Se", + "alternative_flag": "LAB ID", + "alternative_sample_id_col": "Sample ID" }, "abort_if_md5_mismatch": "False", "platform_storage_folder": "/tmp/relecov", @@ -442,8 +432,9 @@ ] } }, - "launch_pipeline": { - "analysis_name": "RELECOV_icasas_C", + "pipeline_manager": { + "analysis_group": "RLV", + "analysis_user": "icasas_C", "doc_folder": "DOC", "analysis_folder": "ANALYSIS", "sample_stored_folder": "RAW", diff --git a/relecov_tools/conf/laboratory_address.json b/relecov_tools/conf/laboratory_address.json index 4acbf284..619352b5 100755 --- a/relecov_tools/conf/laboratory_address.json +++ b/relecov_tools/conf/laboratory_address.json @@ -561,6 +561,17 @@ "submitting_institution_address": " Passeig de la Vall d'Hebron, 119-129", "submitting_institution_email": "" }, + "Hospital Universitari Doctor Josep Trueta": { + "collecting_institution_address": "Avinguda de França, S/N,", + "collecting_institution_email": "", + "geo_loc_state": "Cataluña", + "geo_loc_region": "Girona", + "geo_loc_city": "Girona", + "geo_loc_country": "Spain", + "submitting_institution": "Hospital Universitari Vall d'Hebron", + "submitting_institution_address": " Passeig de la Vall d'Hebron, 119-129", + "submitting_institution_email": "" + }, "Hospital Universitari Germans Trias i Pujol (HUGTiP)": { "collecting_institution_address": " Crta del Canyet s/n", "collecting_institution_email": "", @@ -979,6 +990,17 @@ "submitting_institution_address": " Jose Atxotegi Kalea, s/n", "submitting_institution_email": "" }, + "Hospital Universitario Araba": { + "collecting_institution_address": "Calle Jose Atxotegi, 01009", + "collecting_institution_email": "", + "geo_loc_state": "Pais Vasco", + "geo_loc_region": "Alava", + "geo_loc_city": "Gasteiz", + "geo_loc_country": "Spain", + "submitting_institution": "Hospital Universitario Donostia", + "submitting_institution_address": "Begiristain Doktorea Pasealekua, s/n", + "submitting_institution_email": "" + }, "Hospital Universitari Arnau de Vilanova": { "collecting_institution_address": "Av. 
Alcalde Rovira Roure, 80", "collecting_institution_email": "", @@ -1402,7 +1424,7 @@ "collecting_institution_email": "", "geo_loc_state": "Pais Vasco", "geo_loc_region": "Vizcaya", - "geo_loc_city": "Baracaldo", + "geo_loc_city": "Barakaldo", "geo_loc_country": "Spain", "submitting_institution": "Hospital Universitario Cruces", "submitting_institution_address": "Plaza Cruces, s/n", diff --git a/relecov_tools/dataprocess_wrapper.py b/relecov_tools/dataprocess_wrapper.py new file mode 100644 index 00000000..4ba3e262 --- /dev/null +++ b/relecov_tools/dataprocess_wrapper.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python +import logging +import re +import yaml +import os +import sys +from datetime import datetime +import inspect +import rich.console +from relecov_tools.download_manager import DownloadManager +from relecov_tools.read_lab_metadata import RelecovMetadata +from relecov_tools.json_validation import SchemaValidation +import relecov_tools.log_summary +import relecov_tools.utils + +log = logging.getLogger(__name__) +stderr = rich.console.Console( + stderr=True, + style="dim", + highlight=False, + force_terminal=relecov_tools.utils.rich_force_colors(), +) + + +class ProcessWrapper: + """ + Always fill all the arguments for the class in the config file, leave its value + if you dont want to use that argument e.g.(target_folders: ) -> (target_folders = None) + """ + + def __init__(self, config_file: str = None, output_folder: str = None): + if not os.path.isdir(str(output_folder)): + sys.exit(FileNotFoundError(f"Output folder {output_folder} is not valid")) + else: + self.output_folder = output_folder + if not os.path.isfile(str(config_file)): + sys.exit(FileNotFoundError(f"Config file {config_file} is not a file")) + else: + try: + self.config_data = relecov_tools.utils.read_yml_file(config_file) + # Config file should include a key + except yaml.YAMLError as e: + sys.exit(yaml.YAMLError(f"Invalid config file: {e}")) + output_regex = ("out_folder", "output_folder", "output_location") + for key, val in self.config_data.items(): + for arg in output_regex: + if val == arg: + self.config_data[key] = self.output_folder + self.wrapper_logsum = relecov_tools.log_summary.LogSum( + output_location=os.path.join(self.output_folder, "logs") + ) + self.config_data["download"].update({"output_location": output_folder}) + self.download_params = self.clean_module_params( + "DownloadManager", self.config_data["download"] + ) + self.readmeta_params = self.clean_module_params( + "RelecovMetadata", self.config_data["read-lab-metadata"] + ) + self.validate_params = self.clean_module_params( + "SchemaValidation", self.config_data["validate"] + ) + self.date = datetime.today().strftime("%Y%m%d%H%M%S") + + def clean_module_params(self, module, params): + active_module = eval(module) + module_args = inspect.getfullargspec(active_module.__init__)[0] + module_args.remove("self") + module_valid_params = {x: y for x, y in params.items() if x in module_args} + if not module_valid_params: + stderr.print(f"[red]Invalid params for {module} in config file") + sys.exit(1) + return module_valid_params + + def exec_download(self, download_params): + download_manager = DownloadManager(**download_params) + download_manager.execute_process() + finished_folders = download_manager.finished_folders + download_logs = self.wrapper_logsum.prepare_final_logs( + logs=download_manager.logsum.logs + ) + self.download_manager = download_manager + return finished_folders, download_logs + + def exec_read_metadata(self, readmeta_params): + 
read_metadata = RelecovMetadata(**readmeta_params) + read_metadata.create_metadata_json() + read_meta_logs = self.wrapper_logsum.prepare_final_logs( + logs=read_metadata.logsum.logs + ) + return read_meta_logs + + def exec_validation(self, validate_params): + validate_proccess = SchemaValidation(**validate_params) + valid_json_data, invalid_json = validate_proccess.validate() + validate_logs = self.wrapper_logsum.prepare_final_logs( + logs=validate_proccess.logsum.logs + ) + return valid_json_data, invalid_json, validate_logs + + def process_folder(self, finished_folders, key, folder_logs): + """Executes read-lab-metadata and validation process for the given downloaded folder. + Merges all the log summaries generated with the ones from download process, creates + an excel file with custom format and uploads it back to its remote sftp folder. + Also uploads the files that failed validation back to the remote sftp folder. + Finally, it cleans all the remaining remote files if the process was successful. + + Args: + finished_folders (dict(str:list)): Dictionary which includes the names + of the remote folders processed during download and the successful files for each + key (str): Name of the folder to process in remote sftp, same name as the one + included in the log_summary from download process. + folder_logs (dict): Download log_summary corresponding to the processed folder + + Raises: + ValueError: If folder_logs does not include a path to the local folder + ValueError: If the folder is not found in remote sftp, or more than one is found + ValueError: If no samples/files are found for the folder after download + ValueError: If no metadata json file is found after read-lab-metadata + + Returns: + merged_logs (dict): Dictionary which includes the logs from all processes + """ + + def upload_files_from_json(invalid_json, remote_dir): + """Upload the files in a given json with samples metadata""" + for sample in invalid_json: + local_dir = sample.get("r1_fastq_filepath") + # files_keys = [key for key in sample.keys() if "_file_" in key] + sample_files = ( + sample.get("sequence_file_R1_fastq"), + sample.get("sequence_file_R2_fastq"), + ) + ftp_files = self.download_manager.relecov_sftp.get_file_list(remote_dir) + uploaded_files = [] + for file in sample_files: + if not file or file in ftp_files: + continue + loc_path = os.path.join(local_dir, file) + sftp_path = os.path.join(remote_dir, file) + log.info("Uploading %s to remote %s" % (loc_path, remote_dir)) + uploaded = self.download_manager.relecov_sftp.upload_file( + loc_path, sftp_path + ) + if not uploaded: + err = f"Could not upload {loc_path} to {remote_dir}" + self.wrapper_logsum.add_error(sample=sample, entry=err) + else: + uploaded_files.append(file) + return uploaded_files + + local_folder = folder_logs.get("path") + if not local_folder: + raise ValueError(f"Could not find local path for {key} in log after download") + files = [os.path.join(local_folder, file) for file in os.listdir(local_folder)] + try: + metadata_file = [x for x in files if re.search("lab_metadata.*.xlsx", x)][0] + samples_file = [x for x in files if re.search("samples_data.*.json", x)][0] + except IndexError: + raise ValueError("No metadata/samples files found after download") + self.readmeta_params.update( + { + "metadata_file": metadata_file, + "sample_list_file": samples_file, + "output_folder": local_folder, + } + ) + read_meta_logs = self.exec_read_metadata(self.readmeta_params) + metadata_json = [ + x for x in os.listdir(local_folder) if re.search("lab_metadata.*.json", x) + ]
+ if not metadata_json: + raise ValueError("No metadata json found after read-lab-metadata") + self.validate_params.update( + { + "json_data_file": os.path.join(local_folder, metadata_json[0]), + "metadata": metadata_file, + "out_folder": local_folder, + } + ) + valid_json_data, invalid_json, validate_logs = self.exec_validation( + self.validate_params + ) + merged_logs = self.wrapper_logsum.merge_logs( + key_name=key, logs_list=[{key: folder_logs}, read_meta_logs, validate_logs] + ) + stderr.print(f"[green]Merged logs from all processes in {local_folder}") + sftp_dirs = self.download_manager.relecov_sftp.list_remote_folders(key) + sftp_dirs_paths = [os.path.join(key, d) for d in sftp_dirs] + valid_dirs = [d for d in sftp_dirs_paths if d in finished_folders.keys()] + # As all folders are merged into one during download, there should only be 1 folder + if not valid_dirs or len(valid_dirs) >= 2: + # If all samples were valid during download and download_clean is used, the original folder might have been deleted + log.warning("Couldnt find %s folder in remote sftp. Creating new one", key) + remote_dir = os.path.join(key, self.date + "_invalid_samples") + self.download_manager.relecov_sftp.make_dir(remote_dir) + else: + remote_dir = valid_dirs[0] + stderr.print( + f"[blue]Cleaning successfully validated files from {remote_dir}" + ) + log.info("Cleaning successfully validated files from remote dir") + file_fields = ("sequence_file_R1_fastq", "sequence_file_R2_fastq") + valid_sampfiles = [ + f.get(key) for key in file_fields for f in valid_json_data + ] + valid_files = [ + f for f in finished_folders[remote_dir] if f in valid_sampfiles + ] + self.download_manager.delete_remote_files(remote_dir, files=valid_files) + self.download_manager.delete_remote_files(remote_dir, skip_seqs=True) + self.download_manager.clean_remote_folder(remote_dir) + if invalid_json: + logtxt = f"Found {len(invalid_json)} invalid samples in {key}" + self.wrapper_logsum.add_warning(key=key, entry=logtxt) + assets = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets") + metadata_template = [ + x for x in os.listdir(assets) if re.search("metadata_templat.*.xlsx", x) + ][0] + sftp_path = os.path.join(remote_dir, os.path.basename(metadata_template)) + log.info("Uploading invalid files and template to %s", remote_dir) + stderr.print(f"[blue]Uploading invalid files and template to {remote_dir}") + self.download_manager.relecov_sftp.upload_file( + os.path.join(assets, metadata_template), sftp_path + ) + # Upload all the files that failed validation process back to sftp + upload_files_from_json(invalid_json, remote_dir) + else: + log.info("No invalid samples in %s", key) + stderr.print(f"[green]No invalid samples were found for {key} !!!") + log_filepath = os.path.join(local_folder, str(key) + "_metadata_report.json") + self.wrapper_logsum.create_error_summary( + called_module="metadata", + filepath=log_filepath, + logs=merged_logs, + to_excel=True, + ) + xlsx_report_files = [ + f for f in os.listdir(local_folder) if re.search("metadata_report.xlsx", f) + ] + if xlsx_report_files: + log.info("Uploading %s xlsx report to remote %s" % (key, remote_dir)) + local_xlsx = os.path.join(local_folder, xlsx_report_files[0]) + remote_xlsx = os.path.join(remote_dir, xlsx_report_files[0]) + up = self.download_manager.relecov_sftp.upload_file(local_xlsx, remote_xlsx) + if not up: + log.error( + "Could not upload %s report to remote %s" % (key, local_folder) + ) + else: + log.error("Could not find xlsx report for %s in %s" % (key, 
local_folder)) + return merged_logs + + def run_wrapper(self): + """Execute each given process in config file sequentially, starting with download. + Once the download has finished, each downloaded folder is processed using read-lab-metadata + and validation modules. The logs from each module are merged into a single log-summary. + These merged logs are then used to create an excel report of all the processes + """ + finished_folders, download_logs = self.exec_download(self.download_params) + if not finished_folders: + stderr.print("[red]No valid folders found to process") + sys.exit(1) + stderr.print(f"[blue]Processing {len(finished_folders)} downloaded folders...") + counter = 0 + for key, folder_logs in download_logs.items(): + folder = folder_logs.get("path") + if not folder: + log.error(f"Skipped folder {key}. Logs do not include path field") + continue + if not folder_logs.get("valid"): + log.error(f"Folder {key} is set as invalid in logs. Skipped.") + continue + counter += 1 + logtxt = "Processing folder %s/%s: %s with local path %s" + log.info(logtxt % (counter, str(len(finished_folders)), key, folder)) + stderr.print(logtxt % (counter, str(len(finished_folders)), key, folder)) + try: + merged_logs = self.process_folder(finished_folders, key, folder_logs) + except (FileNotFoundError, ValueError) as e: + log.error(f"Could not process folder {key}: {e}") + folder_logs["errors"].append(f"Could not process folder {key}: {e}") + log_filepath = os.path.join( + folder, self.date + "_" + str(key) + "_wrapper_summary.json" + ) + self.wrapper_logsum.create_error_summary( + called_module="metadata", + filepath=log_filepath, + logs={key: folder_logs}, + to_excel=False, + ) + continue + self.wrapper_logsum.logs[key] = merged_logs[key] + self.wrapper_logsum.create_error_summary( + called_module="wrapper", + to_excel=True, + ) + return diff --git a/relecov_tools/download_manager.py b/relecov_tools/download_manager.py index df770f3a..17a13b1e 100755 --- a/relecov_tools/download_manager.py +++ b/relecov_tools/download_manager.py @@ -131,6 +131,7 @@ def __init__( self.relecov_sftp = relecov_tools.sftp_client.SftpRelecov( conf_file, sftp_user, sftp_passwd ) + self.finished_folders = {} def create_local_folder(self, folder): """Create folder to download files in local path using date @@ -267,7 +268,8 @@ def create_files_with_metadata_info( data = {k: v for k, v in data.items() if k not in samples_to_delete} with open(sample_data_path, "w", encoding="utf-8") as fh: fh.write(json.dumps(data, indent=4, sort_keys=True, ensure_ascii=False)) - log.info("Successfully created json file with samples %s", sample_data_path) + # Feed accessible dict with necessary information for wrapper to work + log.info("Successfully created samples json file %s", sample_data_path) return def remove_duplicated_values(self, sample_file_dict): @@ -387,23 +389,22 @@ def get_sample_fastq_file_names(self, local_folder, meta_f_path): except ValueError as e: stderr.print("[red]Unable to convert to string. ", e) continue - if s_name not in sample_file_dict: - sample_file_dict[s_name] = {} - else: + if s_name in sample_file_dict: log_text = f"Found duplicated sample name: {s_name}. Skipped." 
stderr.print(log_text) self.include_warning(log_text, sample=s_name) continue - if row[index_layout] == "paired" and row[index_fastq_r2] is None: + if row[index_layout] == "Paired" and row[index_fastq_r2] is None: error_text = "Sample %s is paired-end, but no R2 given" self.include_error(error_text % str(row[index_sampleID]), s_name) row_complete = False - if row[index_layout] == "single" and row[index_fastq_r2] is not None: + if row[index_layout] == "Single" and row[index_fastq_r2] is not None: error_text = "Sample %s is single-end, but R1&R2 given" self.include_error(error_text % str(row[index_sampleID]), s_name) row_complete = False if row_complete: if row[index_fastq_r1] is not None: + sample_file_dict[s_name] = {} # TODO: move these keys to configuration.json sample_file_dict[s_name]["sequence_file_R1_fastq"] = row[ index_fastq_r1 @@ -416,9 +417,9 @@ def get_sample_fastq_file_names(self, local_folder, meta_f_path): log_text = "Fastq_R1 not defined in Metadata for sample %s" stderr.print(f"[red]{str(log_text % s_name)}") self.include_error(entry=str(log_text % s_name), sample=s_name) - del sample_file_dict[s_name] else: - self.include_warning(entry=f"Row {counter} skipped. No sample ID given") + txt = f"Row {counter} in metadata skipped.No sequencing sample ID given" + self.include_warning(entry=txt) # Remove duplicated files clean_sample_dict = self.remove_duplicated_values(sample_file_dict) return clean_sample_dict @@ -571,23 +572,43 @@ def delete_remote_files(self, remote_folder, files=None, skip_seqs=False): files_to_remove = self.relecov_sftp.get_file_list(remote_folder) else: files_to_remove = files - if any(file.endswith(tuple(self.allowed_file_ext)) for file in files_to_remove): - if skip_seqs is True: - log_text = f"Folder {remote_folder} has sequencing files. Not removed." - self.include_warning(log_text) - return for file in files_to_remove: + if skip_seqs is True: + if file.endswith(tuple(self.allowed_file_ext)): + continue try: self.relecov_sftp.remove_file( os.path.join(remote_folder, os.path.basename(file)) ) log.info("%s Deleted from remote server", file) except (IOError, PermissionError) as e: - self.include_warning(f"Could not delete remote file {file}: {e}") + log.error(f"Could not delete remote file {file}: {e}") stderr.print(f"Could not delete remote file {file}. Error: {e}") return - def delete_remote_folder(self, remote_folder): + def rename_remote_folder(self, remote_folder): + if "tmp_processing" in remote_folder: + new_name = remote_folder.replace("tmp_processing", "invalid_samples") + if new_name == remote_folder: + log.warning("Remote folder %s was already renamed", remote_folder) + return + try: + self.relecov_sftp.rename_file(remote_folder, new_name) + if self.finished_folders.get(remote_folder): + self.finished_folders[new_name] = self.finished_folders.pop( + remote_folder + ) + log.info("Successfully renamed %s to %s" % (remote_folder, new_name)) + except (OSError, PermissionError) as e: + log_text = f"Could not rename remote {remote_folder}. Error: {e}" + log.error(log_text) + else: + log.warning( + "No `tmp_processing` pattern in %s, not renamed" % remote_folder + ) + return + + def clean_remote_folder(self, remote_folder): """Delete a folder from remote sftp, check if it is empty or not first. Args: @@ -603,15 +624,16 @@ def remove_client_dir(remote_folder): log.info("Successfully removed %s", remote_folder) except (OSError, PermissionError) as e: log_text = f"Could not delete remote {remote_folder}. 
Error: {e}" - self.include_warning(log_text) + log.error(log_text) stderr.print(log_text) else: log.info("%s is a top-level folder. Not removed", remote_folder) remote_folder_files = self.relecov_sftp.get_file_list(remote_folder) if remote_folder_files: + self.rename_remote_folder(remote_folder) log_text = f"Remote folder {remote_folder} not empty. Not removed" - self.include_warning(log_text) + log.warning(log_text) else: remove_client_dir(remote_folder) return @@ -715,7 +737,7 @@ def md5_handler(md5sumlist, output_location): if not md5sumlist: error_text = "No md5sum could be found in remote folder %s" stderr.print(f"[yellow]{error_text % folder}") - self.include_warning(error_text) + self.include_warning(error_text % folder) continue folders_with_metadata[folder] = [fi for fi in files if fi not in md5sumlist] try: @@ -842,6 +864,9 @@ def pre_validate_folder(folder, folder_files): log.info("Setting %s remote folders...", str(len(target_folders.keys()))) stderr.print(f"[blue]Setting {len(target_folders.keys())} remote folders...") for folder in sorted(target_folders.keys()): + if "invalid_samples" in folder: + log.warning("Skipped invalid_samples folder %s", folder) + continue self.current_folder = folder # Include the folder in the final process log summary self.include_new_key() @@ -1006,7 +1031,7 @@ def process_filedict( processed(dict{str:str}): Updated valid_filedict """ processed_dict = {} - error_text = "md5 mismatch for %s" + error_text = "corrupted or md5 mismatch for %s" warning_text = "File %s not found in md5sum. Creating hash" for sample, vals in valid_filedict.items(): processed_dict[sample] = {} @@ -1028,13 +1053,12 @@ def process_filedict( del processed_dict[sample] return processed_dict - def download(self, target_folders, option="download"): + def download(self, target_folders): """Manages all the different functions to download files, verify their integrity and create initial json with filepaths and md5 hashes Args: target_folders (dict): dictionary - option (str, optional): Download option. Defaults to "download". 
""" log.info("Initiating download process") main_folder = self.platform_storage_folder @@ -1049,7 +1073,7 @@ def download(self, target_folders, option="download"): # Close previously open connection to avoid timeouts try: self.relecov_sftp.close_connection() - except paramiko.ssh_exception.NoValidConnectionsError: + except paramiko.SSHException: pass # Check if the connection has been closed due to time limit self.relecov_sftp.open_connection() @@ -1137,20 +1161,25 @@ def download(self, target_folders, option="download"): stderr.print(f"[red]{error_text}") self.include_warning(error_text) - clean_fetchlist = [ + seqs_fetchlist = [ fi for fi in fetched_files if fi.endswith(tuple(self.allowed_file_ext)) ] - - clean_fetchlist = [fi for fi in clean_fetchlist if fi not in corrupted] + seqs_fetchlist = [fi for fi in seqs_fetchlist if fi not in corrupted] # Checking for uncompressed files - files_to_compress = [fi for fi in clean_fetchlist if not fi.endswith(".gz")] + files_to_compress = [ + fi + for fi in seqs_fetchlist + if not fi.endswith(".gz") and not fi.endswith(".bam") + ] if files_to_compress: comp_files = str(len(files_to_compress)) log.info("Found %s uncompressed files, compressing...", comp_files) stderr.print(f"Found {comp_files} uncompressed files, compressing...") clean_fetchlist = self.compress_and_update( - clean_fetchlist, files_to_compress, local_folder + seqs_fetchlist, files_to_compress, local_folder ) + else: + clean_fetchlist = seqs_fetchlist clean_pathlist = [os.path.join(local_folder, fi) for fi in clean_fetchlist] not_md5sum = [] if remote_md5sum: @@ -1175,21 +1204,34 @@ def download(self, target_folders, option="download"): relecov_tools.utils.calculate_md5(path) for path in clean_pathlist ] files_md5_dict = dict(zip(clean_fetchlist, md5_hashes)) - + for file in files_md5_dict.keys(): + full_f_path = os.path.join(local_folder, file) + if not relecov_tools.utils.check_gzip_integrity(full_f_path): + corrupted.append(file) + files_md5_dict = { + x: y for x, y in files_md5_dict.items() if x not in corrupted + } processed_filedict = self.process_filedict( valid_filedict, clean_fetchlist, corrupted=corrupted, md5miss=not_md5sum ) self.create_files_with_metadata_info( local_folder, processed_filedict, files_md5_dict, meta_file ) - # If download_option is "download_clean", remove - # sftp folder content after download is finished - if option == "clean": - self.delete_remote_files(folder, files=files_to_download) - self.delete_remote_files(folder, skip_seqs=True) - self.delete_remote_folder(folder) - stderr.print(f"Delete process finished in remote {folder}") + if self.logsum.logs.get(self.current_folder): + self.logsum.logs[self.current_folder].update({"path": local_folder}) + try: + folder_basename = os.path.basename(local_folder.rstrip("/")) + log_name = folder_basename + "_download_log_summary.json" + self.logsum.create_error_summary( + filepath=os.path.join(local_folder, log_name), + logs={ + self.current_folder: self.logsum.logs[self.current_folder] + }, + ) + except Exception as e: + log.error("Could not create logsum for %s: %s" % (folder, str(e))) stderr.print(f"[green]Finished processing {folder}") + self.finished_folders[folder] = list(files_md5_dict.keys()) return def include_new_key(self, sample=None): @@ -1217,19 +1259,11 @@ def execute_process(self): for folder in processed_folders: self.current_folder = folder self.delete_remote_files(folder) - self.delete_remote_folder(folder) + self.clean_remote_folder(folder) stderr.print(f"Delete process finished in {folder}") 
else: target_folders, processed_folders = self.merge_subfolders(target_folders) - if self.download_option == "download_only": - self.download(target_folders, option="download") - if self.download_option == "download_clean": - self.download(target_folders, option="clean") - for folder in processed_folders: - self.current_folder = folder - self.delete_remote_files(folder, skip_seqs=True) - self.delete_remote_folder(folder) - stderr.print(f"Delete process finished in {folder}") + self.download(target_folders) self.relecov_sftp.close_connection() stderr.print(f"Processed {len(processed_folders)} folders: {processed_folders}") @@ -1238,5 +1272,17 @@ def execute_process(self): self.logsum.create_error_summary(called_module="download") else: log.info("Process log summary was empty. Not generated.") + # If download_option is "download_clean", remove + # sftp folder content after download is finished + if self.download_option == "download_clean": + for folder in processed_folders: + self.delete_remote_files(folder, skip_seqs=True) + self.clean_remote_folder(folder) + folders_to_clean = copy.deepcopy(self.finished_folders) + for folder, downloaded_files in folders_to_clean.items(): + self.delete_remote_files(folder, files=downloaded_files) + self.delete_remote_files(folder, skip_seqs=True) + self.clean_remote_folder(folder) + stderr.print(f"Delete process finished in remote {folder}") stderr.print("Finished execution") return diff --git a/relecov_tools/json_validation.py b/relecov_tools/json_validation.py index cb0cd05a..d49d5ed7 100755 --- a/relecov_tools/json_validation.py +++ b/relecov_tools/json_validation.py @@ -60,6 +60,10 @@ def __init__( stderr.print("[blue] Reading the json file") self.json_data = relecov_tools.utils.read_json_file(json_data_file) + if isinstance(self.json_data, dict): + stderr.print(f"[red]Invalid json file content in {json_data_file}.") + stderr.print("Should be a list of dicts. 
Create it with read-lab-metadata") + sys.exit(1) self.metadata = metadata try: self.sample_id_field = self.get_sample_id_field() @@ -94,6 +98,7 @@ def validate_instances(self): # create validator validator = Draft202012Validator(self.json_schema) + schema_props = self.json_schema["properties"] validated_json_data = [] invalid_json = [] @@ -112,18 +117,24 @@ def validate_instances(self): else: # Count error types for error in validator.iter_errors(item_row): + if error.validator == "required": + error_field = [ + f for f in error.validator_value if f in error.message + ][0] + else: + error_field = error.absolute_path[0] try: - error_keys[error.message] = error.absolute_path[0] - except Exception: - error_keys[error.message] = error.message + err_field_label = schema_props[error_field]["label"] + except KeyError: + log.error("Could not extract label for %s" % error_field) + err_field_label = error_field + error.message.replace(error_field, err_field_label) + error_text = f"Error in column {err_field_label}: {error.message}" + error_keys[error.message] = error_field if error.message in errors: errors[error.message] += 1 else: errors[error.message] = 1 - if error_keys[error.message] == error.message: - error_text = error.message - else: - error_text = f"{error_keys[error.message]}:{error.message}" self.logsum.add_error(sample=sample_id_value, entry=error_text) # append row with errors invalid_json.append(item_row) @@ -175,13 +186,22 @@ def create_invalid_metadata(self, invalid_json, metadata, out_folder): wb = openpyxl.load_workbook(metadata) # TODO: Include this as a key in configuration.json ws_sheet = wb["METADATA_LAB"] + tag = "Sample ID given for sequencing" + seq_id_col = [idx for idx, cell in enumerate(ws_sheet[1]) if tag in cell.value] + if seq_id_col: + id_col = seq_id_col[0] row_to_del = [] - for row in ws_sheet.iter_rows(min_row=5, max_row=ws_sheet.max_row): - # if not data on row 1 and 2 assume that no more data are in file - # then start deleting rows - if not row[2].value and not row[1].value: + row_iterator = ws_sheet.iter_rows(min_row=1, max_row=ws_sheet.max_row) + consec_empty_rows = 0 + for row in row_iterator: + # if no data in 10 consecutive rows, break loop + if not any(row[x].value for x in range(10)): + row_to_del.append(row[0].row) + consec_empty_rows += 1 + if consec_empty_rows > 10: break - if str(row[2].value) not in sample_list: + consec_empty_rows = 0 + if str(row[id_col].value) not in sample_list: row_to_del.append(row[0].row) stderr.print("Collected rows to create the excel file") if len(row_to_del) > 0: @@ -238,3 +258,4 @@ def validate(self): self.logsum.add_error(entry=log_text) stderr.print(f"[red]{log_text}") self.logsum.create_error_summary(called_module="validate") + return valid_json_data, invalid_json diff --git a/relecov_tools/log_summary.py b/relecov_tools/log_summary.py index 926b7900..0e92b47f 100755 --- a/relecov_tools/log_summary.py +++ b/relecov_tools/log_summary.py @@ -4,12 +4,14 @@ import os import inspect import copy +import re import openpyxl from rich.console import Console from datetime import datetime from collections import OrderedDict from relecov_tools.utils import rich_force_colors +import relecov_tools.utils log = logging.getLogger(__name__) @@ -28,10 +30,12 @@ def __init__( unique_key: str = None, path: str = None, ): - if not os.path.exists(str(output_location)): - raise FileNotFoundError(f"Output folder {output_location} does not exist") - else: - self.output_location = output_location + if not os.path.isdir(str(output_location)): + 
try: + os.makedirs(output_location, exist_ok=True) + except IOError: + raise IOError(f"Could not create logs output folder {output_location}") + self.output_location = output_location # if unique_key is given, all entries will be saved inside that key by default if unique_key: self.unique_key = unique_key @@ -117,7 +121,7 @@ def prepare_final_logs(self, logs): logs: logs with updated valid field values """ for key in logs.keys(): - if logs[key]["errors"]: + if logs[key].get("errors"): logs[key]["valid"] = False if logs[key].get("samples") is not None: for sample in logs[key]["samples"].keys(): @@ -126,16 +130,23 @@ return logs def merge_logs(self, key_name, logs_list): - """Merge a multiple set of logs without losing information""" - if not logs_list: - return - merged_logs = copy.deepcopy(logs_list[0]) - if not merged_logs["samples"]: - merged_logs["samples"] = {} - for logs in logs_list: - merged_logs["errors"].extend(logs["errors"]) - merged_logs["warnings"].extend(logs["warnings"]) - if "samples" in logs.keys(): + """Merge multiple sets of logs without losing information + + Args: + key_name (str): Name of the final key holding the logs + logs_list (list(dict)): List of logs for different processes, + each log should only include the actual records. + + Returns: + final_logs (dict): Merged list of logs into a single record + """ + + def add_new_logs(merged_logs, logs): + if "errors" not in logs.keys(): + logs = logs.get(list(logs.keys())[0]) + merged_logs["errors"].extend(logs.get("errors")) + merged_logs["warnings"].extend(logs.get("warnings")) + if logs.get("samples"): for sample, vals in logs["samples"].items(): if sample not in merged_logs["samples"].keys(): merged_logs["samples"][sample] = vals @@ -146,34 +157,59 @@ merged_logs["samples"][sample]["warnings"].extend( logs["samples"][sample]["warnings"] ) + return merged_logs + + if not logs_list: + return + merged_logs = OrderedDict({"valid": True, "errors": [], "warnings": []}) + merged_logs["samples"] = {} + for idx, logs in enumerate(logs_list): + if not logs: + continue + try: + merged_logs = add_new_logs(merged_logs, logs) + except (TypeError, KeyError) as e: + err = f"Could not add logs {idx} in list: {e}" + merged_logs["errors"].append(err) + log.error(err) final_logs = {key_name: merged_logs} return final_logs - def create_logs_excel(self, logs, called_module=""): + def create_logs_excel(self, logs, excel_outpath): """Create an excel file with logs information Args: logs (dict, optional): Custom dictionary of logs. Useful to create outputs - called_module (str, optional): Name of the module running this code.
+ excel_outpath (str): Path to output excel file """ + def reg_remover(string, pattern): + """Remove annotation between brackets in logs message""" + string = string.replace("['", "'").replace("']", "'") + string = re.sub(pattern, "", string) + return string.strip() + def translate_fields(samples_logs): # TODO Translate logs to spanish using a local translator model like deepl return - date = datetime.today().strftime("%Y%m%d%-H%M%S") + date = datetime.today().strftime("%Y%m%d%H%M%S") lab_code = list(logs.keys())[0] - if self.unique_key: - excel_filename = "_".join( - [self.unique_key, called_module, date, "report.xlsx"] - ).replace("__", "") - else: - excel_filename = "_".join([lab_code, date, "report.xlsx"]) + if not os.path.exists(os.path.dirname(excel_outpath)): + excel_outpath = os.path.join( + self.output_location, lab_code + "_" + date + "_report.xlsx" + ) + log.warning( + "Given report outpath does not exist, changed to %s" % (excel_outpath) + ) + file_ext = os.path.splitext(excel_outpath)[-1] + excel_outpath = excel_outpath.replace(file_ext, ".xlsx") if not logs.get("samples"): try: - samples_logs = logs[lab_code].get("samples") + samples_logs = logs[lab_code]["samples"] except (KeyError, AttributeError) as e: stderr.print(f"[red]Could not convert log summary to excel: {e}") + log.error("Could not convert log summary to excel: %s" % str(e)) return else: samples_logs = logs.get("samples") @@ -186,18 +222,22 @@ def translate_fields(samples_logs): warnings_sheet = workbook.create_sheet("Other warnings") warnings_headers = ["Sample ID given for sequencing", "Valid", "Warnings"] warnings_sheet.append(warnings_headers) + regex = r"\[.*?\]" # Regex to remove annotation between brackets for sample, logs in samples_logs.items(): - error_row = [sample, str(logs["valid"]), "; ".join(logs["errors"])] + clean_errors = [reg_remover(x, regex) for x in logs["errors"]] + error_row = [sample, str(logs["valid"]), "\n ".join(clean_errors)] main_worksheet.append(error_row) - warning_row = [sample, str(logs["valid"]), "; ".join(logs["warnings"])] + clean_warngs = [reg_remover(x, regex) for x in logs["warnings"]] + warning_row = [sample, str(logs["valid"]), "\n ".join(clean_warngs)] warnings_sheet.append(warning_row) - excel_outpath = os.path.join(self.output_location, excel_filename) + relecov_tools.utils.adjust_sheet_size(main_worksheet) + relecov_tools.utils.adjust_sheet_size(warnings_sheet) workbook.save(excel_outpath) stderr.print(f"[green]Successfully created logs excel in {excel_outpath}") return def create_error_summary( - self, called_module=None, filename=None, logs=None, to_excel=False + self, called_module=None, filepath=None, logs=None, to_excel=False ): """Dump the log summary dictionary into a file with json format. If any of the 'errors' key is not empty, the parent key value 'valid' is set to false. 
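Since `create_logs_excel` now takes an explicit `excel_outpath` instead of `called_module`, callers are expected to build the output path themselves. A minimal usage sketch of the merge-and-report flow, mirroring `logs_to_excel` in `__main__.py` (the lab code, paths and log contents below are illustrative placeholders, not values from the codebase):

```python
# Sketch only: merge per-module logs for one lab and write the xlsx report.
from relecov_tools.log_summary import LogSum

# Illustrative log records following the errors/warnings/samples structure used above
download_logs = {"valid": True, "errors": [], "warnings": ["example warning"], "samples": {}}
validate_logs = {"valid": True, "errors": ["example error"], "warnings": [], "samples": {}}

logsum = LogSum(output_location="logs/")
merged = logsum.merge_logs(key_name="LAB001", logs_list=[download_logs, validate_logs])
final = logsum.prepare_final_logs(logs=merged)
logsum.create_logs_excel(logs=final, excel_outpath="logs/LAB001_logs_report.xlsx")
```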
@@ -223,15 +263,27 @@ def create_error_summary( ][0] except IndexError: called_module = "" - if not filename: + if not filepath: date = datetime.today().strftime("%Y%m%d%-H%M%S") filename = "_".join([date, called_module, "log_summary.json"]) - summary_path = os.path.join(self.output_location, filename) - with open(summary_path, "w", encoding="utf-8") as f: - f.write( - json.dumps(final_logs, indent=4, sort_keys=False, ensure_ascii=False) - ) - stderr.print(f"Process log summary saved in {summary_path}") - if to_excel is True: - self.create_logs_excel(final_logs, called_module) + os.makedirs(self.output_location, exist_ok=True) + filepath = os.path.join(self.output_location, filename) + else: + os.makedirs(os.path.dirname(filepath), exist_ok=True) + with open(filepath, "w", encoding="utf-8") as f: + try: + f.write( + json.dumps( + final_logs, indent=4, sort_keys=False, ensure_ascii=False + ) + ) + stderr.print(f"Process log summary saved in {filepath}") + if to_excel is True: + self.create_logs_excel( + final_logs, filepath.replace("log_summary", "report") + ) + except Exception as e: + stderr.print(f"[red]Error parsing logs to json format: {e}") + log.error("Error parsing logs to json format: %s", str(e)) + f.write(str(final_logs)) return diff --git a/relecov_tools/pipeline_manager.py b/relecov_tools/pipeline_manager.py index b7653289..b1549db0 100644 --- a/relecov_tools/pipeline_manager.py +++ b/relecov_tools/pipeline_manager.py @@ -8,7 +8,6 @@ from collections import Counter import rich.console - import relecov_tools.utils log = logging.getLogger(__name__) @@ -20,7 +19,7 @@ ) -class LaunchPipeline: +class PipelineManager: def __init__( self, input_folder=None, @@ -28,7 +27,7 @@ def __init__( output_folder=None, pipeline_conf_file=None, ): - current_date = datetime.date.today().strftime("%Y%m%d") + self.current_date = datetime.date.today().strftime("%Y%m%d") if input_folder is None: self.input_folder = relecov_tools.utils.prompt_path( msg="Select the folder which contains the fastq file of samples" @@ -63,23 +62,22 @@ def __init__( sys.exit(1) conf_settings = relecov_tools.utils.read_json_file(pipeline_conf_file) try: - data = conf_settings["launch_pipeline"] - # get_topic_data("launch_pipeline", "analysis_name") + config_data = conf_settings["pipeline_manager"] except KeyError: log.error("Invalid pipeline config file %s ", pipeline_conf_file) stderr.print("[red] Invalid pipeline config file " + pipeline_conf_file) if ( - "analysis_name" not in data - or "sample_stored_folder" not in data - or "sample_link_folder" not in data - or "doc_folder" not in data + "analysis_user" not in config_data + or "analysis_group" not in config_data + or "analysis_folder" not in config_data + or "sample_stored_folder" not in config_data + or "sample_link_folder" not in config_data + or "doc_folder" not in config_data ): - log.error("Invalid pipeline config file %s ", self.pipeline_conf_file) - stderr.print( - "[red] Invalid pipeline config file " + self.pipeline_conf_file - ) + log.error("Invalid pipeline config file %s ", pipeline_conf_file) + stderr.print("[red] Invalid pipeline config file " + pipeline_conf_file) sys.exit(1) - + self.config_fata = config_data if output_folder is None: output_folder = relecov_tools.utils.prompt_path( msg="Select the output folder" @@ -93,39 +91,22 @@ def __init__( sys.exit(1) # Update the output folder with the current date and analysis name - self.output_folder = os.path.join( - output_folder, current_date + "_" + data["analysis_name"] - ) - if 
os.path.exists(self.output_folder): - msg = "Analysis folder already exists and it will be deleted. Do you want to continue? Y/N" - confirmation = relecov_tools.utils.prompt_yn_question(msg) - if confirmation is False: - sys.exit(1) - shutil.rmtree(self.output_folder) - - self.analysis_folder = os.path.join(self.output_folder, data["analysis_folder"]) - self.copied_sample_folder = os.path.join( - self.output_folder, data["sample_stored_folder"] - ) - self.linked_sample_folder = os.path.join( - self.analysis_folder, data["sample_link_folder"] - ) - self.doc_folder = data["doc_folder"] + self.output_folder = output_folder + self.out_folder_namevar = f"{self.current_date}_{config_data['analysis_group']}_%s_{config_data['analysis_user']}" + self.analysis_folder = config_data["analysis_folder"] + self.copied_sample_folder = config_data["sample_stored_folder"] + self.linked_sample_folder = config_data["sample_link_folder"] + self.doc_folder = config_data["doc_folder"] def join_valid_items(self): - """Join validated metadata for the latest batches downloaded + """Join validated metadata for the latest batches downloaded into a single one Args: Returns: - sample_data: list(dict) - [ - { - "sequencing_sample_id":XXXX, - "r1_fastq_filepath": XXXX, - "r2_fastq_filepath":XXXX - } - ] + join_validate (list(dict)): List of dictionaries containing all the samples + found in each validated_lab_metadata.json form the scanned folders + latest_date (str): Latest batch date found in the scanned folder """ def get_latest_lab_folder(self): @@ -176,7 +157,6 @@ def get_latest_lab_folder(self): return lab_latest_folders, latest_date upload_lab_folders, latest_date = get_latest_lab_folder(self) - samples_data = [] join_validate = list() for lab, data_folder in upload_lab_folders.items(): lab_code = lab.split("/")[-1] @@ -198,139 +178,252 @@ def get_latest_lab_folder(self): with open(validate_file_path) as fh: data = json.load(fh) join_validate.extend(data) - for item in data: - sample = {} - sample["sequencing_sample_id"] = item["sequencing_sample_id"] - sample["r1_fastq_filepath"] = os.path.join( - item["r1_fastq_filepath"], item["sequence_file_R1_fastq"] - ) - if "r2_fastq_filepath" in item: - sample["r2_fastq_filepath"] = os.path.join( - item["r2_fastq_filepath"], item["sequence_file_R2_fastq"] - ) - samples_data.append(sample) - date_and_time = datetime.datetime.today().strftime("%Y%m%d%-H%M%S") - with open( - os.path.join( - self.output_folder, - self.doc_folder, - f"{date_and_time}_validate_batch.json", - ), - "w", - ) as fo: - json.dump(join_validate, fo, indent=4, ensure_ascii=False) + log.info("Found a total of %s samples", str(len(join_validate))) + stderr.print(f"Found a total of {len(join_validate)} samples") + return join_validate, latest_date - return samples_data - - def pipeline_exc(self): - """Prepare folder for analysis in HPC - Copies template selected as input - Copies RAW data with sequencing id as fastq file names - Creates samples_id.txt + def copy_process(self, samples_data, output_folder): + """Copies all the necessary samples files in the given samples_data list + to the output folder. Also creates symbolic links into the link folder + given in config_file. Args: + samples_data (list(dict)): samples_data from self.create_samples_data() + output_folder (str): Destination folder to copy files Returns: - + samp_errors (dict): Dictionary where keys are sequencing_sample_id and values + the files that received an error while trying to copy. 
""" - - # copy template folder and subfolders in output folder - shutil.copytree(self.template, self.output_folder) - # create the 00_reads folder - os.makedirs(self.linked_sample_folder, exist_ok=True) - # collect json with all validated samples - samples_data = self.join_valid_items() - - # Check for possible duplicates - # Extract the sequencing_sample_id from the list of dictionaries - sample_ids = [item["sequencing_sample_id"] for item in samples_data] - - # Use Counter to count the occurrences of each sequencing_sample_id - id_counts = Counter(sample_ids) - - # Find the sequencing_sample_id values that are duplicated (count > 1) - duplicates = [sample_id for sample_id, count in id_counts.items() if count > 1] - - if duplicates: - log.error(f"There are duplicated samples in your batch: {duplicates}") - stderr.print( - f"[red] There are duplicated samples in your batch: {duplicates}. Please handle manually" - ) - sys.exit() - - # print samples_id file - with open(os.path.join(self.analysis_folder, "samples_id.txt"), "w") as f: - for sample_id in sample_ids: - f.write(f"{sample_id}\n") - - # iterate over the sample_data to copy the fastq files in the output folder - file_errors = [] - copied_samples = 0 - if len(samples_data) == 0: - stderr.print("[yellow] No samples were found. Deleting analysis folder") - shutil.rmtree(self.analysis_folder) - sys.exit(0) - log.info("Samples to copy %s", len(samples_data)) + samp_errors = {} + links_folder = os.path.join( + output_folder, self.analysis_folder, self.linked_sample_folder + ) + os.makedirs(links_folder, exist_ok=True) for sample in samples_data: + sample_id = sample["sequencing_sample_id"] # fetch the file extension ext_found = re.match(r".*(fastq.*|bam)", sample["r1_fastq_filepath"]) + if not ext_found: + log.error("No valid file extension found for %s", sample_id) + samp_errors[sample_id].append(sample["r1_fastq_filepath"]) + continue ext = ext_found.group(1) - sequencing_r1_sample_id = sample["sequencing_sample_id"] + "_R1." + ext - # copy r1 sequencing file into the output folder + seq_r1_sample_id = sample["sequencing_sample_id"] + "_R1." + ext + # copy r1 sequencing file into the output folder self.analysis_folder sample_raw_r1 = os.path.join( - self.analysis_folder, self.copied_sample_folder, sequencing_r1_sample_id + output_folder, self.copied_sample_folder, seq_r1_sample_id ) log.info("Copying sample %s", sample) stderr.print("[blue] Copying sample: ", sample["sequencing_sample_id"]) try: shutil.copy(sample["r1_fastq_filepath"], sample_raw_r1) # create simlink for the r1 - r1_link_path = os.path.join( - self.linked_sample_folder, sequencing_r1_sample_id - ) - r1_link_path_ori = os.path.join("../../RAW", sequencing_r1_sample_id) + r1_link_path = os.path.join(links_folder, seq_r1_sample_id) + r1_link_path_ori = os.path.join("../../RAW", seq_r1_sample_id) os.symlink(r1_link_path_ori, r1_link_path) except FileNotFoundError as e: log.error("File not found %s", e) - file_errors.append(sample["r1_fastq_filepath"]) + samp_errors[sample_id] = [] + samp_errors[sample_id].append(sample["r1_fastq_filepath"]) + if "r2_fastq_filepath" in sample: + samp_errors[sample_id].append(sample["r2_fastq_filepath"]) continue - copied_samples += 1 # check if there is a r2 file if "r2_fastq_filepath" in sample: - sequencing_r2_sample_id = sample["sequencing_sample_id"] + "_R2." + ext + seq_r2_sample_id = sample["sequencing_sample_id"] + "_R2." 
+ ext sample_raw_r2 = os.path.join( - self.analysis_folder, + output_folder, self.copied_sample_folder, - sequencing_r2_sample_id, + seq_r2_sample_id, ) - try: shutil.copy(sample["r2_fastq_filepath"], sample_raw_r2) - r2_link_path = os.path.join( - self.linked_sample_folder, sequencing_r2_sample_id - ) - r2_link_path_ori = os.path.join( - "../../RAW", sequencing_r2_sample_id - ) + r2_link_path = os.path.join(links_folder, seq_r2_sample_id) + r2_link_path_ori = os.path.join("../../RAW", seq_r2_sample_id) os.symlink(r2_link_path_ori, r2_link_path) except FileNotFoundError as e: log.error("File not found %s", e) - file_errors.append(sample["r2_fastq_filepath"]) + if not samp_errors.get(sample_id): + samp_errors[sample_id] = [] + samp_errors[sample_id].append(sample["r2_fastq_filepath"]) continue - if len(file_errors) > 0: + return samp_errors + + def create_samples_data(self, json_data): + """Creates a copy of the json_data but only with relevant keys to copy files. + Here 'r1_fastq_filepath' is created joining the original 'r1_fastq_filepath' + and 'sequence_file_R1_fastq' fields. The same goes for 'r2_fastq_filepath' + + Args: + json_data (list(dict)): Samples metadata in a list of dictionaries + + Returns: + sample_data: list(dict) + [ + { + "sequencing_sample_id":XXXX, + "r1_fastq_filepath": XXXX, + "r2_fastq_filepath":XXXX + } + ] + """ + samples_data = [] + for item in json_data: + sample = {} + sample["sequencing_sample_id"] = item["sequencing_sample_id"] + sample["r1_fastq_filepath"] = os.path.join( + item["r1_fastq_filepath"], item["sequence_file_R1_fastq"] + ) + if "r2_fastq_filepath" in item: + sample["r2_fastq_filepath"] = os.path.join( + item["r2_fastq_filepath"], item["sequence_file_R2_fastq"] + ) + samples_data.append(sample) + return samples_data + + def split_data_by_key(self, json_data, keylist): + """Split a given json data into different lists based on a given list of keys. + From a single list of samples (dicts), the output will now be a list of lists + where each new list is a subset of the original samples with the same values + for the given list of keys + + Args: + json_data (list(dict)): List of dictionaries, one for each sample + keylist (list(str)): List of keys within the given dictionaries to + split data. + + Returns: + + """ + if not keylist: + return [json_data] + + json_split_by_key = {} + new_key = keylist[0] + next_keys = keylist[1:] + + json_uniq_vals = frozenset([x.get(new_key) for x in json_data]) + for val in json_uniq_vals: + grouped_samples = [x for x in json_data if x.get(new_key) == val] + json_split_by_key[val] = grouped_samples + + list_of_jsons_by_key = [] + for group in json_split_by_key.values(): + list_of_jsons_by_key.extend(self.split_data_by_key(group, next_keys)) + return list_of_jsons_by_key + + def pipeline_exc(self): + """Prepare folder for analysis in HPC + Copies template selected as input + Copies RAW data with sequencing id as fastq file names + Creates samples_id.txt + + Args: + + Returns: + + """ + # collect json with all validated samples + join_validate, latest_date = self.join_valid_items() + latest_date = str(latest_date).replace("-", "") + if len(join_validate) == 0: + stderr.print("[yellow]No samples were found. 
Aborting") + sys.exit(0) + keys_to_split = ["enrichment_panel", "enrichment_panel_version"] + stderr.print(f"[blue]Splitting samples based on {keys_to_split}...") + json_split_by_panel = self.split_data_by_key(join_validate, keys_to_split) + stderr.print(f"[blue]Data splitted into {len(json_split_by_panel)} groups") + # iterate over the sample_data to copy the fastq files in the output folder + global_samp_errors = {} + for idx, list_of_samples in enumerate(json_split_by_panel, start=1): + group_tag = f"{latest_date}_PANEL{idx:02d}" + log.info("Processing group %s", group_tag) + stderr.print(f"[blue]Processing group {group_tag}...") + group_outfolder = os.path.join( + self.output_folder, self.out_folder_namevar % group_tag + ) + if os.path.exists(group_outfolder): + msg = f"Analysis folder {group_outfolder} already exists and it will be deleted. Do you want to continue? Y/N" + confirmation = relecov_tools.utils.prompt_yn_question(msg) + if confirmation is False: + continue + shutil.rmtree(group_outfolder) + log.info(f"Folder {group_outfolder} removed") + samples_data = self.create_samples_data(list_of_samples) + # Create a folder for the group of samples and copy the files there + log.info("Creating folder for group %s", group_tag) + stderr.print(f"[blue]Creating folder for group {group_tag}") + # copy template folder and subfolders in output folder + shutil.copytree(self.template, group_outfolder) + # Check for possible duplicates + log.info("Samples to copy %s", len(samples_data)) + # Extract the sequencing_sample_id from the list of dictionaries + sample_ids = [item["sequencing_sample_id"] for item in samples_data] + # Use Counter to count the occurrences of each sequencing_sample_id + id_counts = Counter(sample_ids) + # Find the sequencing_sample_id values that are duplicated (count > 1) + duplicates = [ + sample_id for sample_id, count in id_counts.items() if count > 1 + ] + if duplicates: + log.error( + "There are duplicated samples in group %s: %s" + % ({group_tag}, {duplicates}) + ) + stderr.print( + f"[red] There are duplicated samples in group {group_tag}: {duplicates}. Please handle manually" + ) + continue + + samp_errors = self.copy_process(samples_data, group_outfolder) + if len(samp_errors) > 0: + stderr.print( + f"[red]Unable to copy files from {len(samp_errors)} samples in group {group_tag}" + ) + msg = f"Do you want to delete analysis folder {group_outfolder}? 
Y/N" + confirmation = relecov_tools.utils.prompt_yn_question(msg) + if confirmation: + shutil.rmtree(group_outfolder) + log.info(f"Folder {group_outfolder} removed") + continue + global_samp_errors[group_tag] = samp_errors + samples_copied = len(list_of_samples) - len(samp_errors) + stderr.print( + f"[green]Group {group_tag}: {samples_copied} samples copied out of {len(list_of_samples)}" + ) + final_valid_samples = [ + x + for x in list_of_samples + if x.get("sequencing_sample_id") not in samp_errors + ] + sample_ids = [i for i in sample_ids if i not in samp_errors] + group_analysis_folder = os.path.join(group_outfolder, self.analysis_folder) + # print samples_id file + stderr.print( + f"[blue]Generating sample_id.txt file in {group_analysis_folder}" + ) + with open(os.path.join(group_analysis_folder, "samples_id.txt"), "w") as f: + for sample_id in sample_ids: + f.write(f"{sample_id}\n") + json_filename = os.path.join( + group_outfolder, + self.doc_folder, + f"{group_tag}_validate_batch.json", + ) + relecov_tools.utils.write_json_fo_file(final_valid_samples, json_filename) + log.info("[blue]Successfully created pipeline folder. Ready to launch") stderr.print( - "[red] Files do not found. Unable to copy", - "[red] " + str(len(file_errors)), - "[red]sample files", + f"[blue]Successfully created folder for {group_tag}. Ready to launch" ) - msg = "Do you want to delete analysis folder? Y/N" - confirmation = relecov_tools.utils.prompt_yn_question(msg) - if confirmation: - shutil.rmtree(self.output_folder) - sys.exit(1) - stderr.print("[green] Samples copied: ", copied_samples) - log.info("[blue] Pipeline launched successfully") - stderr.print("[blue] Pipeline launched successfully") + for group, samples in global_samp_errors.items(): + if not samples: + continue + log.error("Group %s received error for samples: %s" % (group, samples)) + if not any(v for v in global_samp_errors.values()): + stderr.print("[green]All samples were copied successfully!!") + log.info("Finished execution") + stderr.print("Finished execution") return diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 27ee09aa..98aa0b9f 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -300,7 +300,6 @@ def handling_tables(self, file_list, conf_tab_name): file_ext = os.path.splitext(conf_tab_name)[1] # Parsing key position sample_idx_colpos = self.get_sample_idx_colpos(self.current_config_key) - # TODO: What if you need to process one table for each sample? 
extdict = {".csv": ",", ".tsv": "\t", ".tab": "\t"} if file_ext in extdict.keys(): data = relecov_tools.utils.read_csv_file_return_dict( @@ -738,6 +737,7 @@ def split_tables_by_batch(self, files_found_dict, batch_data, output_dir): def extract_batch_rows_to_file(file): """Create a new table file only with rows matching samples in batch_data""" + extdict = {".csv": ",", ".tsv": "\t", ".tab": "\t"} file_extension = os.path.splitext(file)[1] file_df = pd.read_csv( file, sep=extdict.get(file_extension), header=header_pos @@ -754,7 +754,6 @@ def extract_batch_rows_to_file(file): method_name = self.split_tables_by_batch.__name__ namekey = "sequencing_sample_id" batch_samples = [row.get(namekey) for row in batch_data] - extdict = {".csv": ",", ".tsv": "\t", ".tab": "\t"} for key, files in files_found_dict.items(): if not self.software_config[key].get("split_by_batch"): continue @@ -793,6 +792,7 @@ def create_bioinfo_file(self): data_by_batch = self.split_data_by_batch(self.j_data) # Add bioinfo metadata to j_data for batch_dir, batch_dict in data_by_batch.items(): + self.log_report.logsum.feed_key(batch_dir) stderr.print(f"[blue]Processing data from {batch_dir}") batch_data = batch_dict["j_data"] stderr.print("[blue]Adding bioinfo metadata to read lab metadata...") @@ -815,6 +815,10 @@ def create_bioinfo_file(self): batch_filename = tag + lab_code + "_" + batch_date + ".json" batch_filepath = os.path.join(batch_dir, batch_filename) relecov_tools.utils.write_json_fo_file(batch_data, batch_filepath) + for sample in batch_data: + self.log_report.logsum.feed_key( + key=batch_dir, sample=sample.get("sequencing_sample_id") + ) log.info("Created output json file: %s" % batch_filepath) stderr.print(f"[green]Created batch json file: {batch_filepath}") stderr.print("[blue]Writting output json file") diff --git a/relecov_tools/read_lab_metadata.py b/relecov_tools/read_lab_metadata.py index 2d4efbda..13e072c8 100755 --- a/relecov_tools/read_lab_metadata.py +++ b/relecov_tools/read_lab_metadata.py @@ -37,19 +37,12 @@ def __init__(self, metadata_file=None, sample_list_file=None, output_folder=None sys.exit(1) if sample_list_file is None: - self.sample_list_file = relecov_tools.utils.prompt_path( - msg="Select the file which contains the sample information" - ) - else: - self.sample_list_file = sample_list_file + stderr.print("[yellow]No samples_data.json file provided") + self.sample_list_file = sample_list_file - if not os.path.exists(self.sample_list_file): - log.error( - "Sample information file %s does not exist ", self.sample_list_file - ) - stderr.print( - "[red] Sample information " + self.sample_list_file + " does not exist" - ) + if sample_list_file is not None and not os.path.exists(sample_list_file): + log.error("Sample information file %s does not exist ", sample_list_file) + stderr.print("[red] Samples file " + sample_list_file + " does not exist") sys.exit(1) if output_folder is None: @@ -84,7 +77,7 @@ def __init__(self, metadata_file=None, sample_list_file=None, output_folder=None "[orange]Property " + prop + " does not have 'label' attribute" ) continue - + self.date = dtime.now().strftime("%Y%m%d%H%M%S") self.json_req_files = config_json.get_topic_data( "lab_metadata", "lab_metadata_req_json" ) @@ -97,6 +90,78 @@ def __init__(self, metadata_file=None, sample_list_file=None, output_folder=None "lab_metadata", "samples_json_fields" ) + def get_samples_files_data(self, clean_metadata_rows): + """Include the fields that would be included in samples_data.json + + Args: + clean_metadata_rows 
(list(dict)): Cleaned list of rows from metadata_lab.xlsx file + + Returns: + j_data (dict(dict)): Dictionary where each key is the sample ID and the values are + its file names, locations and md5 + """ + + def safely_calculate_md5(file): + """Check file md5, but return Not Provided if file does not exist""" + try: + return relecov_tools.utils.calculate_md5(file) + except IOError: + return "Not Provided [GENEPIO:0001668]" + + dir_path = os.path.dirname(os.path.realpath(self.metadata_file)) + md5_checksum_files = [f for f in os.listdir(dir_path) if "md5" in f] + if md5_checksum_files: + skip_list = self.configuration.get_topic_data( + "sftp_handle", "skip_when_found" + ) + md5_dict = relecov_tools.utils.read_md5_checksum( + file_name=md5_checksum_files[0], avoid_chars=skip_list + ) + else: + md5_dict = {} + log.warning("No md5sum file found.") + log.warning("Generating new md5 hashes. This might take a while...") + j_data = {} + no_fastq_error = "No R1 fastq was given for sample %s" + for sample in clean_metadata_rows: + files_dict = {} + r1_file = sample.get("sequence_file_R1_fastq") + r2_file = sample.get("sequence_file_R2_fastq") + if not r1_file: + self.logsum.add_error( + sample=sample.get("sequencing_sample_id"), + entry=no_fastq_error % sample.get("sequencing_sample_id"), + ) + j_data[str(sample.get("sequencing_sample_id"))] = files_dict + continue + r1_md5 = md5_dict.get(r1_file) + r2_md5 = md5_dict.get(r2_file) + files_dict["sequence_file_R1_fastq"] = r1_file + files_dict["r1_fastq_filepath"] = dir_path + if r1_md5: + files_dict["fastq_r1_md5"] = r1_md5 + else: + files_dict["fastq_r1_md5"] = safely_calculate_md5( + os.path.join(dir_path, r1_file) + ) + if r2_file: + files_dict["sequence_file_R2_fastq"] = r2_file + files_dict["r2_fastq_filepath"] = dir_path + if r2_md5: + files_dict["fastq_r2_md5"] = r2_md5 + else: + files_dict["fastq_r2_md5"] = safely_calculate_md5( + os.path.join(dir_path, r2_file) + ) + j_data[str(sample.get("sequencing_sample_id"))] = files_dict + try: + filename = "_".join(["samples_data", self.lab_code, self.date + ".json"]) + file_path = os.path.join(self.output_folder, filename) + relecov_tools.utils.write_json_fo_file(j_data, file_path) + except Exception: + log.error("Could not output samples_data.json file to output folder") + return j_data + def match_to_json(self, valid_metadata_rows): """Keep only the rows from samples present in the input file samples.json @@ -104,20 +169,25 @@ def match_to_json(self, valid_metadata_rows): valid_metadata_rows (list(dict)): List of rows from metadata_lab.xlsx file Returns: - clean_metadata_rows(list(dict)): _description_ - missing_samples(list(str)): + clean_metadata_rows(list(dict)): List of rows matching the samples in samples_data.json + missing_samples(list(str)): List of samples not found in samples_data.json """ - samples_json = relecov_tools.utils.read_json_file(self.sample_list_file) - clean_metadata_rows = [] missing_samples = [] + if not self.sample_list_file: + logtxt = "samples_data.json not provided, all samples will be included" + self.logsum.add_warning(entry=logtxt) + return valid_metadata_rows, missing_samples + else: + samples_json = relecov_tools.utils.read_json_file(self.sample_list_file) + clean_metadata_rows = [] for row in valid_metadata_rows: sample_id = str(row["sequencing_sample_id"]).strip() self.logsum.feed_key(sample=sample_id) if sample_id in samples_json.keys(): clean_metadata_rows.append(row) else: - log_text = "Sample missing in samples data Json file" - 
self.logsum.add_error(sample=sample_id, entry=log_text) + log_text = "Sample in metadata but missing in downloaded samples file" + self.logsum.add_warning(sample=sample_id, entry=log_text) missing_samples.append(sample_id) return clean_metadata_rows, missing_samples @@ -204,6 +274,7 @@ def adding_ontology_to_enum(self, m_data): def process_from_json(self, m_data, json_fields): """Find the labels that are missing in the file to match the given schema.""" map_field = json_fields["map_field"] + col_name = self.relecov_sch_json["properties"].get(map_field).get("label") json_data = json_fields["j_data"] for idx in range(len(m_data)): sample_id = str(m_data[idx].get("sequencing_sample_id")) @@ -214,14 +285,14 @@ def process_from_json(self, m_data, json_fields): clean_error = re.sub("[\[].*?[\]]", "", str(error.args[0])) if str(clean_error).lower().strip() == "not provided": log_text = ( - f"Label {map_field} was not provided in sample " + f"Label {col_name} was not provided in sample " + f"{sample_id}, auto-completing with Not Provided" ) self.logsum.add_warning(sample=sample_id, entry=log_text) else: log_text = ( - f"Unknown map_field value {error} for json data: " - + f"{str(map_field)} in sample {sample_id}. Skipped" + f"Unknown field value {error} for json data: " + + f"{str(col_name)} in sample {sample_id}. Skipped" ) self.logsum.add_warning(sample=sample_id, entry=log_text) continue @@ -251,7 +322,10 @@ def adding_fields(self, metadata): # TODO: Change sequencing_sample_id for some unique ID used in RELECOV database s_json["map_field"] = "sequencing_sample_id" s_json["adding_fields"] = self.samples_json_fields - s_json["j_data"] = relecov_tools.utils.read_json_file(self.sample_list_file) + if self.sample_list_file: + s_json["j_data"] = relecov_tools.utils.read_json_file(self.sample_list_file) + else: + s_json["j_data"] = self.get_samples_files_data(metadata) metadata = self.process_from_json(metadata, s_json) stderr.print("[green]Processed sample data file.") return metadata @@ -275,8 +349,24 @@ def read_metadata_file(self): """ meta_sheet = self.metadata_processing.get("excel_sheet") header_flag = self.metadata_processing.get("header_flag") - ws_metadata_lab, heading_row_number = relecov_tools.utils.read_excel_file( - self.metadata_file, meta_sheet, header_flag, leave_empty=False + sample_id_col = self.metadata_processing.get("sample_id_col") + self.alternative_heading = False + try: + ws_metadata_lab, heading_row_number = relecov_tools.utils.read_excel_file( + self.metadata_file, meta_sheet, header_flag, leave_empty=False + ) + except KeyError: + self.alternative_heading = True + alt_sheet = self.metadata_processing.get("alternative_sheet") + header_flag = self.metadata_processing.get("alternative_flag") + sample_id_col = self.metadata_processing.get("alternative_sample_id_col") + logtxt = f"No excel sheet named {meta_sheet}. 
Using {alt_sheet}" + stderr.print(f"[yellow]{logtxt}") + ws_metadata_lab, heading_row_number = relecov_tools.utils.read_excel_file( + self.metadata_file, alt_sheet, header_flag, leave_empty=False + ) + alt_header_dict = self.configuration.get_topic_data( + "lab_metadata", "alt_heading_equivalences" ) valid_metadata_rows = [] row_number = heading_row_number @@ -284,10 +374,13 @@ def read_metadata_file(self): row_number += 1 property_row = {} try: - sample_id = str(row["Sample ID given for sequencing"]).strip() + sample_id = str(row[sample_id_col]).strip() except KeyError: - log_text = f"Sample ID given for sequencing empty in row {row_number}" - log.error(log_text) + self.logsum.add_error(entry=f"No {sample_id_col} found in excel file") + continue + if not row[sample_id_col] or "Not Provided" in sample_id: + log_text = f"{sample_id_col} not provided in row {row_number}. Skipped" + self.logsum.add_warning(entry=log_text) stderr.print(f"[red]{log_text}") continue for key in row.keys(): @@ -304,7 +397,7 @@ def read_metadata_file(self): if isinstance(row[key], dtime): row[key] = str(row[key].date()) elif re.match(pattern, str(row[key])): - row[key] = row[key].replace("/", "-").replace(".", "-") + row[key] = str(row[key]).replace("/", "-").replace(".", "-") row[key] = re.match(pattern, row[key]).group(0) else: try: @@ -321,16 +414,30 @@ def read_metadata_file(self): else: if isinstance(row[key], float) or isinstance(row[key], int): row[key] = str(row[key]) + if "date" not in key.lower() and isinstance(row[key], dtime): + logtxt = f"Non-date field {key} provided as date. Parsed as int" + self.logsum.add_warning(sample=sample_id, entry=logtxt) + row[key] = str(relecov_tools.utils.excel_date_to_num(row[key])) + if self.alternative_heading: + alt_key = alt_header_dict.get(key) if row[key] is not None or "not provided" not in str(row[key]).lower(): try: - property_row[self.label_prop_dict[key]] = row[key] + property_row[self.label_prop_dict[key]] = str(row[key]).strip() except KeyError as e: + if self.alternative_heading: + try: + property_row[self.label_prop_dict[alt_key]] = str( + row[key] + ).strip() + continue + except KeyError: + pass log_text = f"Error when mapping the label {str(e)}" self.logsum.add_error(sample=sample_id, entry=log_text) stderr.print(f"[red]{log_text}") continue - valid_metadata_rows.append(property_row) + return valid_metadata_rows def create_metadata_json(self): @@ -339,9 +446,8 @@ def create_metadata_json(self): clean_metadata_rows, missing_samples = self.match_to_json(valid_metadata_rows) if missing_samples: num_miss = len(missing_samples) - log.warning( - "%s samples from metadata were not found: %s", num_miss, missing_samples - ) + logtx = "%s samples not found in metadata: %s" % (num_miss, missing_samples) + self.logsum.add_warning(entry=logtx) stderr.print(f"[yellow]{num_miss} samples missing:\n{missing_samples}") # Continue by adding extra information stderr.print("[blue]Including additional information") @@ -356,7 +462,7 @@ def create_metadata_json(self): stderr.print("Metadata was completely empty. 
No output file generated") sys.exit(1) file_code = "lab_metadata_" + self.lab_code + "_" - file_name = file_code + dtime.now().strftime("%Y%m%d%H%M%S") + ".json" + file_name = file_code + self.date + ".json" stderr.print("[blue]Writting output json file") os.makedirs(self.output_folder, exist_ok=True) self.logsum.create_error_summary(called_module="read-lab-metadata") diff --git a/relecov_tools/schema/relecov_schema.json b/relecov_tools/schema/relecov_schema.json index 27b9bbe8..bc605d45 100755 --- a/relecov_tools/schema/relecov_schema.json +++ b/relecov_tools/schema/relecov_schema.json @@ -3,7 +3,7 @@ "id": "https://github.com/BU-ISCIII/relecov-tools/blob/develop/relecov_tools/schema/relecov_schema.json", "title": "RELECOV schema", "description":"Json schema that specifies the structure, content, and validation rules for RELECOV metadata", - "version": "2.0.0", + "version": "2.1.0", "required": [ "collecting_lab_sample_id", "sequencing_sample_id", @@ -1794,7 +1794,18 @@ "type": "string", "description": "number of base pairs per read", "classification": "Sequencing", - "label": "Read Length", + "label": "Read length", + "fill_mode": "batch" + }, + "number_of_reads" : { + "examples": [ + "75" + ], + "ontology": "GENEPIO:0000087", + "type": "string", + "description": "number of reads passing base call filters", + "classification": "Sequencing", + "label": "Read count", "fill_mode": "batch" }, "sequence_file_R1_fastq": { @@ -2346,7 +2357,7 @@ "type": "string", "description": "Percentage of read that pass quality control threshold ", "classification": "Bioinformatics and QC metrics fields", - "label": "%qc filtered", + "label": "%QC filtered", "fill_mode": "batch" }, "per_reads_host": { @@ -2357,7 +2368,7 @@ "type": "string", "description": "Percentage of reads mapped to host", "classification": "Bioinformatics and QC metrics fields", - "label": "%reads host", + "label": "%Reads host", "fill_mode": "batch" }, "per_reads_virus": { @@ -2368,7 +2379,7 @@ "type": "string", "description": "Percentage of reads mapped to virus", "classification": "Bioinformatics and QC metrics fields", - "label": "%reads virus", + "label": "%Reads virus", "fill_mode": "batch" }, "per_unmapped": { @@ -2379,7 +2390,7 @@ "type": "string", "description": "Percentage of reads unmapped to virus or to host", "classification": "Bioinformatics and QC metrics fields", - "label": "%unmapped", + "label": "%Unmapped", "fill_mode": "batch" }, "depth_of_coverage_value": { @@ -2390,7 +2401,7 @@ "type": "string", "description": "The average number of reads representing a given nucleotide in the reconstructed sequence.", "classification": "Bioinformatics and QC metrics fields", - "label": "Depth of coverage value ", + "label": "Depth of coverage Mean value", "fill_mode": "batch" }, "per_genome_greater_10x": { @@ -2401,7 +2412,7 @@ "type": "string", "description": "Percentage of genome with coverage greater than 10x", "classification": "Bioinformatics and QC metrics fields", - "label": "% genome greater 10x", + "label": "% Genome > 10x", "fill_mode": "batch" }, "per_Ns": { @@ -2434,7 +2445,7 @@ "type": "string", "description": "The number of variants found in consensus sequence", "classification": "Bioinformatic Variants", - "label": "Number of variants (AF greater 75%)", + "label": "Number of variants (AF > 75%)", "fill_mode": "batch" }, "number_of_variants_with_effect": { @@ -2467,7 +2478,7 @@ "type": "string", "description": "The name of the clade.", "classification": "Lineage fields", - "label": "Clade designation", + "label": 
"Variant Clade Genotype", "fill_mode": "batch" }, "clade_type_software_name": { @@ -2492,6 +2503,16 @@ "label": "If Clade/Type Is Other, Specify", "fill_mode": "batch" }, + "clade_type_software_version": { + "examples": [ + "Pangolin" + ], + "ontology": "GENEPIO:0001502", + "type": "string", + "description": "The version of the software used to determine the clade/type.", + "classification": "Lineage fields", + "label": "Clade/Type software version" + }, "lineage_analysis_software_name": { "examples": [ "Pangolin" @@ -2544,7 +2565,7 @@ "type": "string", "description": "The version of the scorpio data used to determine the lineage/clade.", "classification": "Lineage fields", - "label": "Lineage/clade analysis software version", + "label": "Scorpio version", "fill_mode": "batch" }, "lineage_analysis_constellation_version": { @@ -2555,7 +2576,7 @@ "type": "string", "description": "The version of the constellations databases used to determine the lineage/clade.", "classification": "Lineage fields", - "label": "Lineage/clade analysis software version", + "label": "Constellations version", "fill_mode": "batch" }, "lineage_analysis_date": { @@ -2570,6 +2591,14 @@ "label": "lineage/clade analysis date", "fill_mode": "batch" }, + "lineage_analysis_file": { + "ontology": "0", + "type": "string", + "description": "File containing results from lineage/clade analysis", + "classification": "Lineage fields", + "label": "Lineage analysis file", + "fill_mode": "batch" + }, "variant_name": { "ontology": "", "type": "string", diff --git a/relecov_tools/sftp_client.py b/relecov_tools/sftp_client.py index d9de82ca..46a30c30 100644 --- a/relecov_tools/sftp_client.py +++ b/relecov_tools/sftp_client.py @@ -4,6 +4,7 @@ import rich.console import stat import sys +import time from relecov_tools.config_json import ConfigJson import relecov_tools.utils @@ -58,6 +59,29 @@ def __init__(self, conf_file=None, username=None, password=None): self.client = paramiko.SSHClient() self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + def reconnect_if_fail(n_times, sleep_time): + def decorator(func): + def retrier(self, *args, **kwargs): + more_sleep_time = 0 + retries = 0 + while retries < n_times: + try: + return func(self, *args, **kwargs) + except Exception: + retries += 1 + log.info("Connection lost. 
Trying to reconnect...") + time.sleep(more_sleep_time) + # Try extending sleep time before reconnecting in each step + more_sleep_time = more_sleep_time + sleep_time + self.open_connection() + else: + log.error("Could not reconnect to remote client") + return func(self, *args, **kwargs) + + return retrier + + return decorator + def open_connection(self): """Establishing sftp connection""" log.info("Setting credentials for SFTP connection with remote server") @@ -78,6 +102,7 @@ def open_connection(self): return False return True + @reconnect_if_fail(n_times=3, sleep_time=30) def list_remote_folders(self, folder_name, recursive=False): """Creates a directories list from the given client remote path @@ -90,7 +115,6 @@ def list_remote_folders(self, folder_name, recursive=False): """ log.info("Listing directories in %s", folder_name) directory_list = [] - self.open_connection() try: content_list = self.sftp.listdir_attr(folder_name) subfolders = any(stat.S_ISDIR(item.st_mode) for item in content_list) @@ -129,6 +153,7 @@ def recursive_list(folder_name): self.close_connection() return directory_list + @reconnect_if_fail(n_times=3, sleep_time=30) def get_file_list(self, folder_name): """Return a tuple with file name and directory path from remote @@ -148,6 +173,7 @@ def get_file_list(self, folder_name): ] return file_list + @reconnect_if_fail(n_times=3, sleep_time=30) def get_from_sftp(self, file, destination, exist_ok=False): """Download a file from remote sftp @@ -169,6 +195,7 @@ def get_from_sftp(self, file, destination, exist_ok=False): log.error("Unable to fetch file %s ", e) return False + @reconnect_if_fail(n_times=3, sleep_time=30) def make_dir(self, folder_name): """Create a new directory in remote sftp @@ -186,6 +213,7 @@ def make_dir(self, folder_name): stderr.print("[red]Directory already exists") return False + @reconnect_if_fail(n_times=3, sleep_time=30) def rename_file(self, old_name, new_name): """Rename a file in remote sftp @@ -199,11 +227,13 @@ def rename_file(self, old_name, new_name): try: self.sftp.rename(old_name, new_name) return True - except FileNotFoundError: - log.error("File %s not found", old_name) - stderr.print("[red]File not found") + except FileNotFoundError as e: + error_txt = f"Could not rename {old_name} to {new_name}: {e}" + log.error(error_txt) + stderr.print(f"[red]{error_txt}") return False + @reconnect_if_fail(n_times=3, sleep_time=30) def remove_file(self, file_name): """Remove a file from remote sftp @@ -221,6 +251,7 @@ def remove_file(self, file_name): stderr.print("[red]File not found") return False + @reconnect_if_fail(n_times=3, sleep_time=30) def remove_dir(self, folder_name): """Remove a directory from remote sftp @@ -238,6 +269,7 @@ def remove_dir(self, folder_name): stderr.print("[red]Directory not found") return False + @reconnect_if_fail(n_times=3, sleep_time=30) def upload_file(self, local_path, remote_file): """Upload a file to remote sftp @@ -256,11 +288,13 @@ def upload_file(self, local_path, remote_file): stderr.print("[red]File not found") return False + @reconnect_if_fail(n_times=3, sleep_time=30) def close_connection(self): log.info("Closing SFTP connection") try: self.sftp.close() except NameError: + log.warning("Could not close sftp connection") return False log.info("SFTP connection closed") return True diff --git a/relecov_tools/templates/Relecov_metadata_template_v2.0.9.xlsx b/relecov_tools/templates/Relecov_metadata_template_v2.0.9.xlsx deleted file mode 100644 index c7472e2d..00000000 Binary files 
a/relecov_tools/templates/Relecov_metadata_template_v2.0.9.xlsx and /dev/null differ diff --git a/relecov_tools/upload_database.py b/relecov_tools/upload_database.py index eb5d5c4f..ffcb673f 100644 --- a/relecov_tools/upload_database.py +++ b/relecov_tools/upload_database.py @@ -259,24 +259,24 @@ def update_database(self, field_values, post_url): if "ERROR" not in result: break if i == 9 and "ERROR" in result: - logtxt = f"Unable to sent the request to {self.platform}" + logtxt = f"Unable to sent the request to {post_url}" self.logsum.add_error(entry=logtxt, sample=req_sample) stderr.print(f"[red]{logtxt}") continue elif "is not defined" in result["ERROR_TEST"].lower(): error_txt = result["ERROR_TEST"] - logtxt = f"Sample {req_sample}: {error_txt}" + logtxt = f"Sample {req_sample} failed in {post_url}: {error_txt}" self.logsum.add_error(entry=logtxt, sample=req_sample) stderr.print(f"[yellow]Warning: {logtxt}") continue elif "already defined" in result["ERROR_TEST"].lower(): - logtxt = f"Request to {self.platform} already defined" + logtxt = f"Request to {post_url} already defined" self.logsum.add_warning(entry=logtxt, sample=req_sample) stderr.print(f"[yellow]{logtxt} for sample {req_sample}") continue else: - logtxt = f"Error {result['ERROR']} in request to {self.platform}" + logtxt = f"Error {result['ERROR']} in request to {post_url}" self.logsum.add_error(entry=logtxt, sample=req_sample) stderr.print(f"[red]{logtxt}") continue @@ -298,7 +298,7 @@ def update_database(self, field_values, post_url): sample=req_sample, ) stderr.print( - f"[yellow]logtxt % {suces_count} {request_count} {self.platform})" + f"[yellow]{logtxt % (suces_count, request_count, self.platform)}" ) return @@ -363,7 +363,7 @@ def update_db(self): self.server_name = "relecov" self.start_api(self.server_name) - for datatype in self.types_of_data: + for datatype in ["sample", "bioinfodata", "variantdata"]: log_text = f"Sending {datatype} data to {self.server_name}" log.info(log_text) stderr.print(log_text) diff --git a/relecov_tools/utils.py b/relecov_tools/utils.py index 0636e4fb..c3a1d5ad 100755 --- a/relecov_tools/utils.py +++ b/relecov_tools/utils.py @@ -19,6 +19,9 @@ from rich.console import Console from datetime import datetime from tabulate import tabulate +import openpyxl.utils +import openpyxl.styles + log = logging.getLogger(__name__) @@ -92,6 +95,14 @@ def read_excel_file(f_name, sheet_name, header_flag, leave_empty=True): return ws_data, heading_row +def excel_date_to_num(date): + """Transform a date object formatted by excel to a numeric value""" + try: + return date.toordinal() - datetime(1899, 12, 30).toordinal() + except AttributeError: + return None + + def read_csv_file_return_dict(file_name, sep=None, key_position=None): """Read csv or tsv file, according to separator, and return a dictionary where the main key is the first column, if key position is None otherwise @@ -480,3 +491,27 @@ def prompt_create_outdir( sys.exit(1) return global_path + + +def adjust_sheet_size(sheet, wrap_text=True, col_width=30): + """Adjust column width and row heights depending on the max number of + characters in each one. + + Args: + sheet (openpyxl.worksheet): active openpyxl worksheet object + wrap_text (bool): Wether to use excel wrap_text function for each cell. Defaults to True + col_width (int): Minimum columns width value. Also used to define maximum + number of characters in each cell when wrap_text is True. Defaults to 30. 
+ """ + dims = {} + for _, row in enumerate(sheet.iter_rows(min_row=2, max_row=sheet.max_row), start=2): + for cell in row: + if wrap_text: + cell.alignment = openpyxl.styles.Alignment(wrapText=True) + if cell.value is not None: + max_length = max((dims.get(cell.column, 0), len(str(cell.value)))) + dims[cell.column] = max_length / col_width + for col_num, value in dims.items(): + if value < col_width: + value = col_width + sheet.column_dimensions[openpyxl.utils.get_column_letter(col_num)].width = value diff --git a/setup.py b/setup.py index b721a1c6..596177f2 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages -version = "1.1.0" +version = "1.2.0" with open("README.md") as f: readme = f.read() diff --git a/tests/data/sftp_handle/datatest1/SAMPLE2_R1.fastq.gz b/tests/data/sftp_handle/datatest1/SAMPLE2_R1.fastq.gz new file mode 100644 index 00000000..ce87295c Binary files /dev/null and b/tests/data/sftp_handle/datatest1/SAMPLE2_R1.fastq.gz differ diff --git a/tests/data/sftp_handle/datatest1/md5sum.txt b/tests/data/sftp_handle/datatest1/md5sum.txt old mode 100755 new mode 100644 index af658a0b..53d39c9b --- a/tests/data/sftp_handle/datatest1/md5sum.txt +++ b/tests/data/sftp_handle/datatest1/md5sum.txt @@ -1,12 +1,12 @@ -c1551cdf3d5e9849b80a082051907a72 SAMPLE1_R1.fastq.gz -449407f85d7ed4388b7d8a51e28d97a8 SAMPLE1_R2.fastq.gz -206ccbe05fff0701bdcf9f899ca3175d SAMPLE2_R1.fastq.gz -BADBADe23a1caf88c11678f10666b931 SAMPLE3_R1.fastq.gz -1be991ce0f9ed442d57c5a359d4c5848 SAMPLE3_R2.fastq.gz -5d838ce4a93bcf12d3031a5f9ffd4acf SAMPLE4_R1.fastq.gz -0a9b0a590b4773e5e4c90c391979a685 SAMPLE4_R2.fastq.gz -d04d32f14ca56f968d07f61d68598639 SAMPLE5_R1.fastq.gz -66a057995ac925e3203ff1afdd5770ec SAMPLE5_R2.fastq.gz -0d55b49614d1c38582b69ccded1b22c4 SAMPLE6_R1.fastq.gz -50afab6751cce654b973bb336cf3b3eb SAMPLE6_R2.fastq.gz -112d5d2ff3845236576732f7c7dc8b06 Singlesamp.fastq.gz +bb35d11ae7f99f22eda3db239435d0e3 SAMPLE1_R1.fastq.gz +ad4883129c29a8e55f102da198b1eb42 SAMPLE1_R2.fastq.gz +db70941c0c3e72325128b02c4bbe5f28 SAMPLE2_R1.fastq.gz +BADBAD35deedza0182d5aacae53fe025 SAMPLE3_R1.fastq.gz +4391785d2fd8a0982d5aacaef3cee057 SAMPLE3_R2.fastq.gz +57114529183e41b8e8e61dfc84e77b33 SAMPLE4_R1.fastq.gz +62918f6d284f23e41f19cf740deca660 SAMPLE4_R2.fastq.gz +87608f9851b25b52d49f6906e5fb23a6 SAMPLE5_R1.fastq.gz +2f379a4972671c691cd1160c8e5abe00 SAMPLE5_R2.fastq.gz +bd22843bd5e36a54f9c1ae1f0e7bdbac SAMPLE6_R1.fastq.gz +eebd6cc938d962ca54b2cc4f3650e039 SAMPLE6_R2.fastq.gz +e04c2abf071e931c2d8f2bab52d3f12b Singlesamp.fastq.gz diff --git a/tests/data/sftp_handle/datatest1/metadata_validation_test.xlsx b/tests/data/sftp_handle/datatest1/metadata_validation_test.xlsx index 46833153..57714e46 100755 Binary files a/tests/data/sftp_handle/datatest1/metadata_validation_test.xlsx and b/tests/data/sftp_handle/datatest1/metadata_validation_test.xlsx differ diff --git a/tests/data/sftp_handle/datatest2/test2_metadata_template_v2.0.1.xlsx b/tests/data/sftp_handle/datatest2/test2_metadata_template_v2.0.1.xlsx index ea564b83..9d4fabc7 100644 Binary files a/tests/data/sftp_handle/datatest2/test2_metadata_template_v2.0.1.xlsx and b/tests/data/sftp_handle/datatest2/test2_metadata_template_v2.0.1.xlsx differ