Skip to content

Commit

Permalink
create-taxdump: fix delnodes.dmp and merged.dmp shenwei356/gtdb-taxdu…
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Aug 24, 2022
1 parent e842f55 commit b14fdd0
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 36 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
- [TaxonKit v0.12.1](https://github.com/shenwei356/taxonkit/releases/tag/v0.12.1)
[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/taxonkit/v0.12.1/total.svg)](https://github.com/shenwei356/taxonkit/releases/tag/v0.12.1)
- `taxonkit reformat`: do not panic for invalid TaxIds, e.g., the column name, when using `-I/--taxid-field`.
- `taxonkit create-taxdump`: fix merged.dmp and delnodes.dmp. [gtdb-taxdump/issues/2](https://github.com/shenwei356/gtdb-taxdump/issues/2)
- [TaxonKit v0.12.0](https://github.com/shenwei356/taxonkit/releases/tag/v0.12.0)
[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/taxonkit/v0.12.0/total.svg)](https://github.com/shenwei356/taxonkit/releases/tag/v0.12.0)
- `taxonkit create-taxdump`:
Expand Down
90 changes: 54 additions & 36 deletions taxonkit/cmd/create-taxdump.go
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,17 @@ Attentions:
var merged map[uint32]uint32
var ok bool

// ------------------------------- delnodes.dmp -------------------------

fileDelNodes := filepath.Join(outDir, "delnodes.dmp")
outfhDelNodes, err := xopen.Wopen(fileDelNodes)
checkError(err)
defer outfhDelNodes.Close()

var delnodes map[uint32]interface{}

if taxdb != nil {
// --------------------- merged --------------------
merged = make(map[uint32]uint32, len(taxdb.MergeNodes))
var _parent uint32
for child, parent := range tree {
Expand All @@ -719,39 +729,11 @@ Attentions:
}
}

// append old merged.dmp
for from, to := range taxdb.MergeNodes {
if _, ok = merged[from]; !ok {
merged[from] = to
}
}

taxids := make([]uint32, 0, len(merged))
for child := range merged {
taxids = append(taxids, child)
}
sort.Slice(taxids, func(i, j int) bool {
return taxids[i] < taxids[j]
})

for _, child := range taxids {
fmt.Fprintf(outfhMerged, "%d\t|\t%d\t|\n", child, merged[child])
}
}

log.Infof("%d records saved to %s", len(merged), fileMerged)

// ------------------------------- delnodes.dmp -------------------------

fileDelNodes := filepath.Join(outDir, "delnodes.dmp")
outfhDelNodes, err := xopen.Wopen(fileDelNodes)
checkError(err)
defer outfhDelNodes.Close()
// we will handle it later

var delnodes []uint32
// --------------------- delnodes --------------------

if taxdb != nil {
delnodes = make([]uint32, 0, len(taxdb.DelNodes))
delnodes = make(map[uint32]interface{}, len(taxdb.DelNodes))

for child := range taxdb.Nodes {
if child == 1 {
Expand All @@ -766,25 +748,61 @@ Attentions:
continue
}

delnodes = append(delnodes, child)
delnodes[child] = struct{}{}
}

// append old delnodes.dmp
for child := range taxdb.DelNodes {
if _, ok = tree[child]; !ok { // some deleted taxids may be reused
delnodes = append(delnodes, child)
delnodes[child] = struct{}{}
}
}

// --------------------- merged --------------------

// append old merged.dmp
for from, to := range taxdb.MergeNodes {
if _, ok = delnodes[to]; ok { // could not append deleted nodes
delnodes[from] = struct{}{} // if the new taxid has been deleted, mark the old taxid too
continue
}
if _, ok = merged[from]; !ok {
merged[from] = to
}
}

sort.Slice(delnodes, func(i, j int) bool {
return delnodes[i] > delnodes[j]
// --------------------------------- write -----------------------------------

// -------------- write delnodes.dmp ------------

taxids := make([]uint32, 0, len(delnodes))
for child := range delnodes {
taxids = append(taxids, child)
}
sort.Slice(taxids, func(i, j int) bool {
return taxids[i] > taxids[j]
})

for _, child := range delnodes {
for _, child := range taxids {
fmt.Fprintf(outfhDelNodes, "%d\t|\n", child)
}

// -------------- write merged.dmp ------------

taxids = taxids[:0]
for child := range merged {
taxids = append(taxids, child)
}
sort.Slice(taxids, func(i, j int) bool {
return taxids[i] < taxids[j]
})

for _, child := range taxids {
fmt.Fprintf(outfhMerged, "%d\t|\t%d\t|\n", child, merged[child])
}
}

log.Infof("%d records saved to %s", len(merged), fileMerged)
log.Infof("%d records saved to %s", len(delnodes), fileDelNodes)
},
}
Expand Down

0 comments on commit b14fdd0

Please sign in to comment.