This repository has been archived by the owner on Nov 16, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 58
/
dedupe_test.go
107 lines (82 loc) · 2.07 KB
/
dedupe_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
//go:build go1.18
package parquet_test
import (
	"io"
	"sort"
	"testing"

	"github.com/segmentio/parquet-go"
)
// TestDedupeRowReader copies rows containing repeated values through
// parquet.DedupeRowReader and verifies that the destination buffer ends up
// holding each distinct value exactly once, in ascending order.
func TestDedupeRowReader(t *testing.T) {
	type Row struct {
		Value int32 `parquet:"value"`
	}

	// Integer division by 3 makes every value appear in a run of up to three
	// consecutive rows, so the input is already sorted by value.
	rows := make([]Row, 1000)
	for i := range rows {
		rows[i].Value = int32(i / 3)
	}

	// Build the expected output independently of the code under test: collect
	// the distinct rows in a set, then sort them because map iteration order
	// is random.
	dedupeMap := make(map[Row]struct{}, len(rows))
	for _, row := range rows {
		dedupeMap[row] = struct{}{}
	}

	dedupeRows := make([]Row, 0, len(dedupeMap))
	for row := range dedupeMap {
		dedupeRows = append(dedupeRows, row)
	}

	sort.Slice(dedupeRows, func(i, j int) bool {
		return dedupeRows[i].Value < dedupeRows[j].Value
	})

	buffer1 := parquet.NewRowBuffer[Row]()
	// A failed write would otherwise surface later as a confusing mismatch.
	if _, err := buffer1.Write(rows); err != nil {
		t.Fatal(err)
	}

	buffer1Rows := buffer1.Rows()
	defer buffer1Rows.Close()

	// The deduplication happens on the read side: the source rows are wrapped
	// before being copied into buffer2.
	buffer2 := parquet.NewRowBuffer[Row]()
	_, err := parquet.CopyRows(buffer2,
		parquet.DedupeRowReader(buffer1Rows,
			buffer1.Schema().Comparator(parquet.Ascending("value")),
		),
	)
	if err != nil {
		t.Fatal(err)
	}

	reader := parquet.NewGenericRowGroupReader[Row](buffer2)
	defer reader.Close()

	// rows is reused as the destination; it is larger than the expected
	// result, so the reader may signal end-of-data (io.EOF) on this call.
	// Any other error is a real failure.
	n, err := reader.Read(rows)
	if err != nil && err != io.EOF {
		t.Fatal(err)
	}
	assertRowsEqual(t, dedupeRows, rows[:n])
}
// TestDedupeRowWriter copies rows containing repeated values into a buffer
// wrapped by parquet.DedupeRowWriter and verifies that the buffer ends up
// holding each distinct value exactly once, in ascending order.
func TestDedupeRowWriter(t *testing.T) {
	type Row struct {
		Value int32 `parquet:"value"`
	}

	// Integer division by 3 makes every value appear in a run of up to three
	// consecutive rows, so the input is already sorted by value.
	rows := make([]Row, 1000)
	for i := range rows {
		rows[i].Value = int32(i / 3)
	}

	// Build the expected output independently of the code under test: collect
	// the distinct rows in a set, then sort them because map iteration order
	// is random.
	dedupeMap := make(map[Row]struct{}, len(rows))
	for _, row := range rows {
		dedupeMap[row] = struct{}{}
	}

	dedupeRows := make([]Row, 0, len(dedupeMap))
	for row := range dedupeMap {
		dedupeRows = append(dedupeRows, row)
	}

	sort.Slice(dedupeRows, func(i, j int) bool {
		return dedupeRows[i].Value < dedupeRows[j].Value
	})

	buffer1 := parquet.NewRowBuffer[Row]()
	// A failed write would otherwise surface later as a confusing mismatch.
	if _, err := buffer1.Write(rows); err != nil {
		t.Fatal(err)
	}

	buffer1Rows := buffer1.Rows()
	defer buffer1Rows.Close()

	// The deduplication happens on the write side: the destination buffer is
	// wrapped before the source rows are copied into it.
	buffer2 := parquet.NewRowBuffer[Row]()
	_, err := parquet.CopyRows(
		parquet.DedupeRowWriter(buffer2,
			buffer1.Schema().Comparator(parquet.Ascending("value")),
		),
		buffer1Rows,
	)
	if err != nil {
		t.Fatal(err)
	}

	reader := parquet.NewGenericRowGroupReader[Row](buffer2)
	defer reader.Close()

	// rows is reused as the destination; it is larger than the expected
	// result, so the reader may signal end-of-data (io.EOF) on this call.
	// Any other error is a real failure.
	n, err := reader.Read(rows)
	if err != nil && err != io.EOF {
		t.Fatal(err)
	}
	assertRowsEqual(t, dedupeRows, rows[:n])
}