From 2123f25dfad8f6f22d950aafe9b461e4c683cb05 Mon Sep 17 00:00:00 2001 From: Wei Shen Date: Thu, 17 Oct 2024 10:37:55 +0800 Subject: [PATCH] split: support customize output file prefix and subdirectory from prefix of keys. #288 --- CHANGELOG.md | 2 + csvtk/cmd/helper.go | 29 +++++++++++++++ csvtk/cmd/split.go | 47 +++++++++++++++++------- doc/docs/usage.md | 89 ++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 143 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b69ff11..1321fda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ - add flag `--nr-width`. - `csvtk replace`: - fix implementing `{nr}`. [#286](https://github.com/shenwei356/csvtk/issues/286) + - `csvtk split`: + - support customizing the output file prefix and creating subdirectories from key prefixes. [#288](https://github.com/shenwei356/csvtk/issues/288) - `csvtk spread`: - add a new alias "scatter" to "spread". [#265](https://github.com/shenwei356/csvtk/issues/265) - `csvtk grep`: diff --git a/csvtk/cmd/helper.go b/csvtk/cmd/helper.go index 0b1027d..a091fb1 100644 --- a/csvtk/cmd/helper.go +++ b/csvtk/cmd/helper.go @@ -32,7 +32,9 @@ import ( "strconv" "strings" + "github.com/pkg/errors" "github.com/shenwei356/breader" + "github.com/shenwei356/util/pathutil" "github.com/shenwei356/util/stringutil" "github.com/shenwei356/xopen" "github.com/spf13/cobra" @@ -752,3 +754,30 @@ func UniqInts(list []int) []int { } return s } + +func makeOutDir(outDir string, force bool, logname string, verbose bool) { + pwd, _ := os.Getwd() + if outDir != "./" && outDir != "." 
&& pwd != filepath.Clean(outDir) { + existed, err := pathutil.DirExists(outDir) + checkError(errors.Wrap(err, outDir)) + if existed { + empty, err := pathutil.IsEmpty(outDir) + checkError(errors.Wrap(err, outDir)) + if !empty { + if force { + if verbose { + log.Infof("removing old output directory: %s", outDir) + } + checkError(os.RemoveAll(outDir)) + } else { + checkError(fmt.Errorf("%s not empty: %s, use --force to overwrite", logname, outDir)) + } + } else { + checkError(os.RemoveAll(outDir)) + } + } + checkError(os.MkdirAll(outDir, 0777)) + } else { + log.Errorf("%s should not be current directory", logname) + } +} diff --git a/csvtk/cmd/split.go b/csvtk/cmd/split.go index 51c1097..750c3f1 100644 --- a/csvtk/cmd/split.go +++ b/csvtk/cmd/split.go @@ -29,7 +29,6 @@ import ( "strings" "sync" - "github.com/shenwei356/util/pathutil" "github.com/shenwei356/xopen" "github.com/spf13/cobra" ) @@ -42,9 +41,11 @@ var splitCmd = &cobra.Command{ Short: "split CSV/TSV into multiple files according to column values", Long: `split CSV/TSV into multiple files according to column values -Note: +Notes: - 1. flag -o/--out-file can specify out directory for splitted files + 1. flag -o/--out-file can specify out directory for splitted files. + 2. flag -s/--prefix-as-subdir can create subdirectories with prefixes of + keys of length X, to avoid writing too many files in the output directory. 
`, Run: func(cmd *cobra.Command, args []string) { @@ -65,6 +66,9 @@ Note: bufRowsSize := getFlagNonNegativeInt(cmd, "buf-rows") bufGroupsSize := getFlagNonNegativeInt(cmd, "buf-groups") gzipped := getFlagBool(cmd, "out-gzip") + outPrefix := getFlagString(cmd, "out-prefix") + subdirLen := getFlagNonNegativeInt(cmd, "prefix-as-subdir") + force := getFlagBool(cmd, "force") file := files[0] csvReader, err := newCSVReaderByConfig(config, file) @@ -92,15 +96,28 @@ Note: outFileSuffix = outFileSuffix + ".gz" } + outdir := "./" if config.OutFile != "-" { // outdir - outdir := config.OutFile - var existed bool - existed, err = pathutil.DirExists(outdir) - checkError(err) - if !existed { - checkError(os.MkdirAll(outdir, 0775)) + outdir = config.OutFile + makeOutDir(outdir, force, "-o/--outfile", true) + } + + if outPrefix != "" || cmd.Flags().Lookup("out-prefix").Changed { + outFilePrefix = outPrefix + } else { + outFilePrefix += "-" + } + + outfile := func(key string) string { + if subdirLen == 0 { + return filepath.Join(outdir, outFilePrefix+key+outFileSuffix) + } + var subdir string + if len(key) > subdirLen { + subdir = key[:subdirLen] + return filepath.Join(outdir, subdir, outFilePrefix+key+outFileSuffix) } - outFilePrefix = filepath.Join(outdir, filepath.Base(outFilePrefix)) + return filepath.Join(outdir, outFilePrefix+key+outFileSuffix) } var key string @@ -138,7 +155,7 @@ Note: appendRows(config, csvReader, headerRow, - fmt.Sprintf("%s-%s%s", outFilePrefix, key, outFileSuffix), + outfile(key), rowsBuf[key], key, ) @@ -161,7 +178,7 @@ Note: appendRows(config, csvReader, headerRow, - fmt.Sprintf("%s-%s%s", outFilePrefix, key, outFileSuffix), + outfile(key), rows, key, ) @@ -189,7 +206,7 @@ Note: appendRows(config, csvReader, headerRow, - fmt.Sprintf("%s-%s%s", outFilePrefix, key, outFileSuffix), + outfile(key), rows, key, ) @@ -212,7 +229,9 @@ func init() { splitCmd.Flags().BoolP("out-gzip", "G", false, `force output gzipped file`) splitCmd.Flags().IntP("buf-rows", 
"b", 100000, `buffering N rows for every group before writing to file`) splitCmd.Flags().IntP("buf-groups", "g", 100, `buffering N groups before writing to file`) - + splitCmd.Flags().StringP("out-prefix", "p", "", `output file prefix, the default value is the input file. use -p "" to disable outputting prefix`) + splitCmd.Flags().IntP("prefix-as-subdir", "s", 0, `create subdirectories with prefixes of keys of length X, to avoid writing too many files in the output directory`) + splitCmd.Flags().BoolP("force", "", false, `overwrite existing output directory (given by -o).`) } var writtenFiles sync.Map diff --git a/doc/docs/usage.md b/doc/docs/usage.md index 4f278fc..44669f8 100644 --- a/doc/docs/usage.md +++ b/doc/docs/usage.md @@ -2438,22 +2438,29 @@ Usage ```text split CSV/TSV into multiple files according to column values -Note: +Notes: - 1. flag -o/--out-file can specify out directory for splitted files + 1. flag -o/--out-file can specify out directory for splitted files. + 2. flag -s/--prefix-as-subdir can create subdirectories with prefixes of + keys of length X, to avoid writing too many files in the output directory. Usage: csvtk split [flags] Flags: - -g, --buf-groups int buffering N groups before writing to file (default 100) - -b, --buf-rows int buffering N rows for every group before writing to file (default 100000) - -f, --fields string comma separated key fields, column name or index. e.g. -f 1-3 or -f id,id2 or - -F -f "group*" (default "1") - -F, --fuzzy-fields using fuzzy fields, e.g., -F -f "*name" or -F -f "id123*" - -h, --help help for split - -i, --ignore-case ignore case - -G, --out-gzip force output gzipped file + -g, --buf-groups int buffering N groups before writing to file (default 100) + -b, --buf-rows int buffering N rows for every group before writing to file (default 100000) + -f, --fields string comma separated key fields, column name or index. e.g. 
-f 1-3 or -f + id,id2 or -F -f "group*" (default "1") + --force overwrite existing output directory (given by -o). + -F, --fuzzy-fields using fuzzy fields, e.g., -F -f "*name" or -F -f "id123*" + -h, --help help for split + -i, --ignore-case ignore case + -G, --out-gzip force output gzipped file + -p, --out-prefix string output file prefix, the default value is the input file. use -p "" to + disable outputting prefix + -s, --prefix-as-subdir int create subdirectories with prefixes of keys of length X, to avoid writing + too many files in the output directory ``` @@ -2502,6 +2509,27 @@ Examples $ ls result/*.csv | wc -l 10000 +1. Do not output prefix, use `-p ""`. + + $ echo -ne "1,ACGT\n2,GGCA\n3,ACAAC\n" + 1,ACGT + 2,GGCA + 3,ACAAC + + $ echo -ne "1,ACGT\n2,GGCA\n3,ACAAC\n" | csvtk split -H -f 2 -o t -p "" -s 3 --force + + $ tree t + t + ├── ACA + │   └── ACAAC.csv + ├── ACG + │   └── ACGT.csv + └── GGC + └── GGCA.csv + + 4 directories, 3 files + + 1. extreme example 1: lots (1M) of rows in groups $ yes 2 | head -n 10000000 | gzip -c > t.gz @@ -2534,6 +2562,47 @@ Examples $ zcat t2.gz | md5sum 72d4ff27a28afbc066d5804999d5a504 - + Since v0.31.0, the flag `-s/--prefix-as-subdir` can create subdirectories with prefixes of + keys of length X, to avoid writing too many files in the output directory. + + $ memusg -t csvtk -H split t2.gz -o t2 -s 3 + elapsed time: 2.668s + peak rss: 1.79 GB + + $ fd .gz$ t2 | rush 'zcat {}' | sort -k 1,1n | md5sum + 72d4ff27a28afbc066d5804999d5a504 - + + $ tree t2/ | more + t2/ + ├── 100 + │   ├── t2-10000.gz + │   ├── t2-1000.gz + │   ├── t2-1001.gz + │   ├── t2-1002.gz + │   ├── t2-1003.gz + │   ├── t2-1004.gz + │   ├── t2-1005.gz + │   ├── t2-1006.gz + │   ├── t2-1007.gz + │   ├── t2-1008.gz + │   └── t2-1009.gz + ├── 101 + │   ├── t2-1010.gz + │   ├── t2-1011.gz + ... 
+ ├── t2-994.gz + ├── t2-995.gz + ├── t2-996.gz + ├── t2-997.gz + ├── t2-998.gz + ├── t2-999.gz + ├── t2-99.gz + └── t2-9.gz + + 901 directories, 10000 files + + + ## splitxlsx Usage