Skip to content

Commit

Permalink
split: support customize output file prefix and subdirectory from pre…
Browse files Browse the repository at this point in the history
…fix of keys. #288
  • Loading branch information
shenwei356 committed Oct 17, 2024
1 parent cb23c4f commit 2123f25
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 24 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
- add flag `--nr-width`.
- `csvtk replace`:
- fix implementing `{nr}`. [#286](https://github.com/shenwei356/csvtk/issues/286)
- `csvtk split`:
- support customizing the output file prefix and creating subdirectories from prefixes of keys. [#288](https://github.com/shenwei356/csvtk/issues/288)
- `csvtk spread`:
- add a new alias "scatter" to "spread". [#265](https://github.com/shenwei356/csvtk/issues/265)
- `csvtk grep`:
Expand Down
29 changes: 29 additions & 0 deletions csvtk/cmd/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ import (
"strconv"
"strings"

"github.com/pkg/errors"
"github.com/shenwei356/breader"
"github.com/shenwei356/util/pathutil"
"github.com/shenwei356/util/stringutil"
"github.com/shenwei356/xopen"
"github.com/spf13/cobra"
Expand Down Expand Up @@ -752,3 +754,30 @@ func UniqInts(list []int) []int {
}
return s
}

func makeOutDir(outDir string, force bool, logname string, verbose bool) {
pwd, _ := os.Getwd()
if outDir != "./" && outDir != "." && pwd != filepath.Clean(outDir) {
existed, err := pathutil.DirExists(outDir)
checkError(errors.Wrap(err, outDir))
if existed {
empty, err := pathutil.IsEmpty(outDir)
checkError(errors.Wrap(err, outDir))
if !empty {
if force {
if verbose {
log.Infof("removing old output directory: %s", outDir)
}
checkError(os.RemoveAll(outDir))
} else {
checkError(fmt.Errorf("%s not empty: %s, use --force to overwrite", logname, outDir))
}
} else {
checkError(os.RemoveAll(outDir))
}
}
checkError(os.MkdirAll(outDir, 0777))
} else {
log.Errorf("%s should not be current directory", logname)
}
}
47 changes: 33 additions & 14 deletions csvtk/cmd/split.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
"strings"
"sync"

"github.com/shenwei356/util/pathutil"
"github.com/shenwei356/xopen"
"github.com/spf13/cobra"
)
Expand All @@ -42,9 +41,11 @@ var splitCmd = &cobra.Command{
Short: "split CSV/TSV into multiple files according to column values",
Long: `split CSV/TSV into multiple files according to column values
Note:
Notes:
1. flag -o/--out-file can specify out directory for splitted files
1. flag -o/--out-file can specify out directory for splitted files.
2. flag -s/--prefix-as-subdir can create subdirectories with prefixes of
keys of length X, to avoid writing too many files in the output directory.
`,
Run: func(cmd *cobra.Command, args []string) {
Expand All @@ -65,6 +66,9 @@ Note:
bufRowsSize := getFlagNonNegativeInt(cmd, "buf-rows")
bufGroupsSize := getFlagNonNegativeInt(cmd, "buf-groups")
gzipped := getFlagBool(cmd, "out-gzip")
outPrefix := getFlagString(cmd, "out-prefix")
subdirLen := getFlagNonNegativeInt(cmd, "prefix-as-subdir")
force := getFlagBool(cmd, "force")

file := files[0]
csvReader, err := newCSVReaderByConfig(config, file)
Expand Down Expand Up @@ -92,15 +96,28 @@ Note:
outFileSuffix = outFileSuffix + ".gz"
}

outdir := "./"
if config.OutFile != "-" { // outdir
outdir := config.OutFile
var existed bool
existed, err = pathutil.DirExists(outdir)
checkError(err)
if !existed {
checkError(os.MkdirAll(outdir, 0775))
outdir = config.OutFile
makeOutDir(outdir, force, "-o/--outfile", true)
}

if outPrefix != "" || cmd.Flags().Lookup("out-prefix").Changed {
outFilePrefix = outPrefix
} else {
outFilePrefix += "-"
}

outfile := func(key string) string {
if subdirLen == 0 {
return filepath.Join(outdir, outFilePrefix+key+outFileSuffix)
}
var subdir string
if len(key) > subdirLen {
subdir = key[:subdirLen]
return filepath.Join(outdir, subdir, outFilePrefix+key+outFileSuffix)
}
outFilePrefix = filepath.Join(outdir, filepath.Base(outFilePrefix))
return filepath.Join(outdir, outFilePrefix+key+outFileSuffix)
}

var key string
Expand Down Expand Up @@ -138,7 +155,7 @@ Note:
appendRows(config,
csvReader,
headerRow,
fmt.Sprintf("%s-%s%s", outFilePrefix, key, outFileSuffix),
outfile(key),
rowsBuf[key],
key,
)
Expand All @@ -161,7 +178,7 @@ Note:
appendRows(config,
csvReader,
headerRow,
fmt.Sprintf("%s-%s%s", outFilePrefix, key, outFileSuffix),
outfile(key),
rows,
key,
)
Expand Down Expand Up @@ -189,7 +206,7 @@ Note:
appendRows(config,
csvReader,
headerRow,
fmt.Sprintf("%s-%s%s", outFilePrefix, key, outFileSuffix),
outfile(key),
rows,
key,
)
Expand All @@ -212,7 +229,9 @@ func init() {
splitCmd.Flags().BoolP("out-gzip", "G", false, `force output gzipped file`)
splitCmd.Flags().IntP("buf-rows", "b", 100000, `buffering N rows for every group before writing to file`)
splitCmd.Flags().IntP("buf-groups", "g", 100, `buffering N groups before writing to file`)

splitCmd.Flags().StringP("out-prefix", "p", "", `output file prefix, the default value is the input file. use -p "" to disable outputting prefix`)
splitCmd.Flags().IntP("prefix-as-subdir", "s", 0, `create subdirectories with prefixes of keys of length X, to avoid writing too many files in the output directory`)
splitCmd.Flags().BoolP("force", "", false, `overwrite existing output directory (given by -o).`)
}

var writtenFiles sync.Map
Expand Down
89 changes: 79 additions & 10 deletions doc/docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -2438,22 +2438,29 @@ Usage
```text
split CSV/TSV into multiple files according to column values
Note:
Notes:
1. flag -o/--out-file can specify out directory for splitted files
1. flag -o/--out-file can specify out directory for splitted files.
2. flag -s/--prefix-as-subdir can create subdirectories with prefixes of
keys of length X, to avoid writing too many files in the output directory.
Usage:
csvtk split [flags]
Flags:
-g, --buf-groups int buffering N groups before writing to file (default 100)
-b, --buf-rows int buffering N rows for every group before writing to file (default 100000)
-f, --fields string comma separated key fields, column name or index. e.g. -f 1-3 or -f id,id2 or
-F -f "group*" (default "1")
-F, --fuzzy-fields using fuzzy fields, e.g., -F -f "*name" or -F -f "id123*"
-h, --help help for split
-i, --ignore-case ignore case
-G, --out-gzip force output gzipped file
-g, --buf-groups int buffering N groups before writing to file (default 100)
-b, --buf-rows int buffering N rows for every group before writing to file (default 100000)
-f, --fields string comma separated key fields, column name or index. e.g. -f 1-3 or -f
id,id2 or -F -f "group*" (default "1")
--force overwrite existing output directory (given by -o).
-F, --fuzzy-fields using fuzzy fields, e.g., -F -f "*name" or -F -f "id123*"
-h, --help help for split
-i, --ignore-case ignore case
-G, --out-gzip force output gzipped file
-p, --out-prefix string output file prefix, the default value is the input file. use -p "" to
disable outputting prefix
-s, --prefix-as-subdir int create subdirectories with prefixes of keys of length X, to avoid writing
too many files in the output directory
```

Expand Down Expand Up @@ -2502,6 +2509,27 @@ Examples
$ ls result/*.csv | wc -l
10000

1. Do not output prefix, use `-p ""`.

$ echo -ne "1,ACGT\n2,GGCA\n3,ACAAC\n"
1,ACGT
2,GGCA
3,ACAAC

$ echo -ne "1,ACGT\n2,GGCA\n3,ACAAC\n" | csvtk split -H -f 2 -o t -p "" -s 3 --force

$ tree t
t
├── ACA
│   └── ACAAC.csv
├── ACG
│   └── ACGT.csv
└── GGC
└── GGCA.csv

4 directories, 3 files


1. extreme example 1: lots (1M) of rows in groups

$ yes 2 | head -n 10000000 | gzip -c > t.gz
Expand Down Expand Up @@ -2534,6 +2562,47 @@ Examples
$ zcat t2.gz | md5sum
72d4ff27a28afbc066d5804999d5a504 -

Since v0.31.0, the flag `-s/--prefix-as-subdir` can create subdirectories with prefixes of
keys of length X, to avoid writing too many files in the output directory.

$ memusg -t csvtk -H split t2.gz -o t2 -s 3
elapsed time: 2.668s
peak rss: 1.79 GB

$ fd .gz$ t2 | rush 'zcat {}' | sort -k 1,1n | md5sum
72d4ff27a28afbc066d5804999d5a504 -

$ tree t2/ | more
t2/
├── 100
│   ├── t2-10000.gz
│   ├── t2-1000.gz
│   ├── t2-1001.gz
│   ├── t2-1002.gz
│   ├── t2-1003.gz
│   ├── t2-1004.gz
│   ├── t2-1005.gz
│   ├── t2-1006.gz
│   ├── t2-1007.gz
│   ├── t2-1008.gz
│   └── t2-1009.gz
├── 101
│   ├── t2-1010.gz
│   ├── t2-1011.gz
...
├── t2-994.gz
├── t2-995.gz
├── t2-996.gz
├── t2-997.gz
├── t2-998.gz
├── t2-999.gz
├── t2-99.gz
└── t2-9.gz

901 directories, 10000 files



## splitxlsx

Usage
Expand Down

0 comments on commit 2123f25

Please sign in to comment.