diff --git a/tools/checkformat/.shed.yml b/tools/checkformat/.shed.yml new file mode 100644 index 000000000..ff863e5d7 --- /dev/null +++ b/tools/checkformat/.shed.yml @@ -0,0 +1,8 @@ +name: checkformat +owner: ethevenot +description: '[W4M][Metabolomics][LC-MS][GC-MS][NMR] Checks the formats of the dataMatrix, sampleMetadata, and variableMetadata files.' +homepage_url: http://workflow4metabolomics.org +long_description: 'For all post-processing steps of the peak table, W4M uses a 3 table format for the data and metadata. This module therefore checks that the formats of the 3 files "dataMatrix.tsv", "sampleMetadata.tsv", and "variableMetadata.tsv" are correct. It can be used before any post-processing step (such as normalization or statistical analysis). Potential warnings or errors in the formats are returned in the "information.txt" output file.' +remote_repository_url: https://github.com/workflow4metabolomics/checkformat.git +categories: +- Metabolomics \ No newline at end of file diff --git a/tools/checkformat/CHANGELOG.md b/tools/checkformat/CHANGELOG.md new file mode 100644 index 000000000..c734b3cc3 --- /dev/null +++ b/tools/checkformat/CHANGELOG.md @@ -0,0 +1,25 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). + +## [3.0.0+galaxy0] - 2025-07-16 +### Changed +- migrated from Gitlab to GitHub + +## [3.0.0] - 2018-03-01 +### Added +- Automated re-ordering (if necessary) of sample and/or variable names from `dataMatrix` based on `sampleMetadata` and `variableMetadata`. +- New argument to make sample and variable names syntactically valid. +- Output of `dataMatrix`, `sampleMetadata`, and `variableMetadata` files, whether they have been modified or not. + +## [2.0.4] - 2017-06-06 +### Changed +- Minor internal modifications. + +## [2.0.2] - 2016-07-30 +### Changed +- Test for R code. +- Planemo running validation. 
## Etienne Thevenot
## CEA, MetaboHUB Paris
## etienne.thevenot@cea.fr


## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files
## and checks the formats
##
## Arguments:
##   datFilC: path to the tab-separated dataMatrix file; it is transposed
##            after reading, so that samples end up as rows of 'datMN'
##   samFilC: path to the tab-separated sampleMetadata file
##   varFilC: path to the tab-separated variableMetadata file
##   makNamL: if TRUE, sample and variable names are passed through
##            make.names(unique = TRUE) before being compared
##
## Value: a list with
##   chkL:  FALSE when sample or variable names do not match between tables
##   newL:  TRUE when names were converted or dataMatrix was re-ordered
##   datMN: numeric matrix (samples x variables)
##   samDF: sampleMetadata data frame
##   varDF: variableMetadata data frame
##
## Stops (call. = FALSE) on duplicated row/column names, on a non-numeric
## dataMatrix, or when sample/variable counts differ between tables.
readAndCheckF <- function(datFilC = "dataMatrix.tsv",
                          samFilC = "sampleMetadata.tsv",
                          varFilC = "variableMetadata.tsv",
                          makNamL = FALSE) {
  ## options

  ## on.exit guarantees the global option is restored even when one of the
  ## stop() calls below is triggered
  optStrAsFacLs <- options(stringsAsFactors = FALSE)
  on.exit(options(optStrAsFacLs), add = TRUE)


  ## checking that the tables have no duplicated row or column names

  filLs <- list(dat = datFilC, sam = samFilC, var = varFilC)
  tabNamVc <- c(
    dat = "dataMatrix",
    sam = "sampleMetadata",
    var = "variableMetadata"
  )

  for (tabC in names(filLs)) {
    tabNamC <- tabNamVc[[tabC]]
    filC <- filLs[[tabC]]

    ## first column read as plain data (no row.names) so that duplicates
    ## can be reported with an explicit message
    rowVc <- read.table(filC,
      check.names = FALSE,
      header = TRUE,
      sep = "\t"
    )[, 1]

    ## header line read as a data row; the first cell (table label) is dropped
    colVc <- unlist(read.table(filC,
      check.names = FALSE,
      nrows = 1,
      sep = "\t"
    ))[-1]

    if (any(duplicated(rowVc))) {
      stop("The following row name(s) is/are duplicated in the ",
        tabNamC,
        " table: '",
        paste(rowVc[duplicated(rowVc)], collapse = "', '"), "'",
        call. = FALSE
      )
    }

    if (any(duplicated(colVc))) {
      stop("The following column name(s) is/are duplicated in the ",
        tabNamC,
        " table: '",
        paste(colVc[duplicated(colVc)], collapse = "', '"), "'",
        call. = FALSE
      )
    }
  }


  ## reading tables

  datMN <- t(as.matrix(read.table(datFilC,
    check.names = FALSE,
    header = TRUE,
    row.names = 1,
    sep = "\t"
  )))

  samDF <- read.table(samFilC,
    check.names = FALSE,
    header = TRUE,
    row.names = 1,
    sep = "\t"
  )

  varDF <- read.table(varFilC,
    check.names = FALSE,
    header = TRUE,
    row.names = 1,
    sep = "\t"
  )


  ## checking that dataMatrix is numeric and that the sample and variable numbers are coherent

  if (mode(datMN) != "numeric") {
    stop("The dataMatrix is not of the 'numeric' type",
      call. = FALSE
    )
  }

  if (nrow(datMN) != nrow(samDF)) {
    if (nrow(datMN) > nrow(samDF)) {
      print(setdiff(rownames(datMN), rownames(samDF)))
      stop("The sample names above from dataMatrix were not found in sampleMetadata",
        call. = FALSE
      )
    } else {
      print(setdiff(rownames(samDF), rownames(datMN)))
      stop("The sample names above from sampleMetadata were not found in dataMatrix",
        call. = FALSE
      )
    }
  }

  if (ncol(datMN) != nrow(varDF)) {
    if (ncol(datMN) > nrow(varDF)) {
      print(setdiff(colnames(datMN), rownames(varDF)))
      stop("The variable names above from dataMatrix were not found in variableMetadata",
        call. = FALSE
      )
    } else {
      print(setdiff(rownames(varDF), colnames(datMN)))
      stop("The variable names above from variableMetadata were not found in dataMatrix",
        call. = FALSE
      )
    }
  }


  ## making sample and variable names (optional)

  newL <- FALSE

  if (makNamL) {
    cat("\n\nMessage: Converting sample and variable names to the standard R format\n")

    rownames(datMN) <- make.names(rownames(datMN), unique = TRUE)
    colnames(datMN) <- make.names(colnames(datMN), unique = TRUE)
    rownames(samDF) <- make.names(rownames(samDF), unique = TRUE)
    rownames(varDF) <- make.names(rownames(varDF), unique = TRUE)

    newL <- TRUE
  }


  ## checking sample and variable names

  chkL <- TRUE

  if (!identical(rownames(datMN), rownames(samDF))) {
    if (identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
      cat("\n\nMessage: Re-ordering dataMatrix sample names to match sampleMetadata\n")
      datMN <- datMN[rownames(samDF), , drop = FALSE]

      ## the re-ordered names must now match exactly (not only as sets)
      stopifnot(identical(rownames(datMN), rownames(samDF)))

      newL <- TRUE
    } else {
      cat("\n\nStop: The sample names of dataMatrix and sampleMetadata do not match:\n")
      print(cbind.data.frame(
        indice = seq_len(nrow(datMN)),
        dataMatrix = rownames(datMN),
        sampleMetadata = rownames(samDF)
      )[rownames(datMN) != rownames(samDF), , drop = FALSE])
      chkL <- FALSE
    }
  }

  if (!identical(colnames(datMN), rownames(varDF))) {
    if (identical(sort(colnames(datMN)), sort(rownames(varDF)))) {
      cat("\n\nMessage: Re-ordering dataMatrix variable names to match variableMetadata\n")
      datMN <- datMN[, rownames(varDF), drop = FALSE]

      ## the re-ordered names must now match exactly (not only as sets)
      stopifnot(identical(colnames(datMN), rownames(varDF)))

      newL <- TRUE
    } else {
      cat("\n\nStop: The variable names of dataMatrix and variableMetadata do not match:\n")
      print(cbind.data.frame(
        indice = seq_len(ncol(datMN)),
        dataMatrix = colnames(datMN),
        variableMetadata = rownames(varDF)
      )[colnames(datMN) != rownames(varDF), , drop = FALSE])
      chkL <- FALSE
    }
  }


  resLs <- list(
    chkL = chkL,
    newL = newL,
    datMN = datMN,
    samDF = samDF,
    varDF = varDF
  )

  return(resLs)
} ## end of readAndCheckF

## Writes a (samples x variables) matrix as a tab-separated dataMatrix file:
## the matrix is transposed back so that variables are rows, and the variable
## names are stored in a leading column labelled 'dataMatrix'
write_dataMatrix <- function(dataMatrix, path) {
  datDF <- cbind.data.frame(
    dataMatrix = colnames(dataMatrix),
    as.data.frame(t(dataMatrix))
  )
  write.table(datDF,
    file = path,
    quote = FALSE,
    row.names = FALSE,
    sep = "\t"
  )
}

## sampleMetadata & variableMetadata
## Writes a metadata data frame as a tab-separated file, with its row names
## stored in a leading column (the column label is always 'sampleMetadata')
write_metadata <- function(sampleMetadata, path) {
  samDF <- cbind.data.frame(
    sampleMetadata = rownames(sampleMetadata),
    sampleMetadata
  )
  write.table(samDF,
    file = path,
    quote = FALSE,
    row.names = FALSE,
    sep = "\t"
  )
}
9991787.291 4208098.147 623970.6499 10904221.26 2171793.936 +HMDB00792 429568.6094 3887629.505 1330692.117 1367446.73 844197.4475 2948090.719 1614157.906 3740009.194 3292251.665 2310688.795 4404239.59 3043289.128 825736.4672 2523241.917 6030501.026 474901.6041 2885792.426 2955990.64 1917716.343 1767962.677 5926203.404 1639065.695 346810.7636 1054776.223 2390258.275 1831346.373 1026696.369 7079792.5 4368341.014 3495986.873 diff --git a/tools/checkformat/test-data/input-sampleMetadata.tsv b/tools/checkformat/test-data/input-sampleMetadata.tsv new file mode 100644 index 000000000..d0c01a6f2 --- /dev/null +++ b/tools/checkformat/test-data/input-sampleMetadata.tsv @@ -0,0 +1,31 @@ +sampleMetadata age ageGrp +HU_017 41 experienced +HU_021 34 junior +HU_027 37 experienced +HU_032 38 experienced +HU_041 28 junior +HU_048 39 experienced +HU_049 50 senior +HU_050 30 junior +HU_052 51 senior +HU_059 81 senior +HU_060 55 senior +HU_066 25 junior +HU_072 47 experienced +HU_077 27 junior +HU_090 46 experienced +HU_109 32 junior +HU_110 50 senior +HU_125 58 senior +HU_126 45 experienced +HU_131 42 experienced +HU_134 48 experienced +HU_149 35 experienced +HU_150 49 experienced +HU_173 55 senior +HU_179 33 junior +HU_180 53 senior +HU_182 43 experienced +HU_202 42 experienced +HU_204 31 junior +HU_209 17.5 junior diff --git a/tools/checkformat/test-data/input-variableMetadata.tsv b/tools/checkformat/test-data/input-variableMetadata.tsv new file mode 100644 index 000000000..42d0ad93f --- /dev/null +++ b/tools/checkformat/test-data/input-variableMetadata.tsv @@ -0,0 +1,4 @@ +variableMetadata name +HMDB01032 Dehydroepiandrosterone sulfate +HMDB03072 Quinic acid +HMDB00792 Sebacic acid diff --git a/tools/checkformat/test-data/output-dataMatrix.tsv b/tools/checkformat/test-data/output-dataMatrix.tsv new file mode 100644 index 000000000..d07105a32 --- /dev/null +++ b/tools/checkformat/test-data/output-dataMatrix.tsv @@ -0,0 +1,4 @@ +dataMatrix X17 HU_021 HU_027 HU_032 HU_041 HU_048 
HU_049 HU_050 HU_052 HU_059 HU_060 HU_066 HU_072 HU_077 HU_090 HU_109 HU_110 HU_125 HU_126 HU_131 HU_134 HU_149 HU_150 HU_173 HU_179 HU_180 HU_182 HU_202 HU_204 HU_209 +HMDB01032 2569204.924 6222035.774 17070707.99 1258838.243 13039543.08 1909391.77 3495.093864 2293521.909 128503.2751 81872.52764 8103557.566 149574887 1544036.41 7103429.539 14138796.5 4970265.58 263054.7306 1671332.3 88433.1945 23602331.29 18648126.52 1554657.988 34152.36464 209372.7128 33187733.37 202438.5916 13581070.09 354170.8107 9120781.49 43419175.41 +HMDB03072 3628416.303 65626.98344 112170.1189 3261804.344 42228.27877 343254.2013 1958217.693 11983270.04 5932111.416 5511385.834 9154521.478 2632133.212 9500411.146 6551644.517 7204319.809 1273412.048 3260583.816 8932005.535 8340827.526 9256460.692 11217839.17 5919262.814 11790077.07 9567977.808 73717.58117 9991787.291 4208098.147 623970.6499 10904221.26 2171793.936 +HMDB00792 429568.6094 3887629.505 1330692.117 1367446.73 844197.4475 2948090.719 1614157.906 3740009.194 3292251.665 2310688.795 4404239.59 3043289.128 825736.4672 2523241.917 6030501.026 474901.6041 2885792.426 2955990.64 1917716.343 1767962.677 5926203.404 1639065.695 346810.7636 1054776.223 2390258.275 1831346.373 1026696.369 7079792.5 4368341.014 3495986.873 diff --git a/tools/checkformat/test-data/output-information.txt b/tools/checkformat/test-data/output-information.txt new file mode 100644 index 000000000..2cd78cbad --- /dev/null +++ b/tools/checkformat/test-data/output-information.txt @@ -0,0 +1,26 @@ + +Start of the 'Check Format' Galaxy module call: Thu 26 Oct 2017 10:27:44 AM + +Table formats are OK; enjoy your analyses! 
+ +End of the 'Check Format' Galaxy module call: Thu 26 Oct 2017 10:27:44 AM + + + +============================================================================ +Additional information about the call: + +1) Parameters: + value +dataMatrix_in "/tmp/tmpolhAXS/files/000/dataset_1.dat" +sampleMetadata_in "/tmp/tmpolhAXS/files/000/dataset_2.dat" +variableMetadata_in "/tmp/tmpolhAXS/files/000/dataset_3.dat" +information "/tmp/tmpolhAXS/files/000/dataset_4.dat" + +2) Session Info: +R version 3.3.2 (2016-10-31) +Main packages: +batch 1.1.4 +Other loaded packages: + +============================================================================