Skip to content

Commit

Permalink
gpbackup incorrectly backs up HyperLogLog statistics (#41)
Browse files Browse the repository at this point in the history
Sometimes gprestore cannot restore such a backup.
This happens if HyperLogLog statistics were collected for a column whose
type cannot be implicitly converted to text, for example INTEGER.
In this case, gpbackup backs up the binary representation of such statistics
after converting it to the column type, which is incorrect, because HyperLogLog
statistics are stored as a binary representation of the BYTEA type.
(https://github.com/arenadata/gpdb/blob/030faa2e79cbeb42ab4c843f8c8475c322506990/src/backend/commands/analyze.c#L1284-L1305)

This patch corrects the described issue by explicitly
specifying the BYTEA type for HyperLogLog statistics.

In the test, the leaf partition is excluded from the check of
the number of statistics, because the current version of gpbackup
does not back up statistics for leaf partitions.
Once this is corrected, this additional condition can be removed.

(cherry picked from commit 85384fa)
  • Loading branch information
RekGRpth committed Jul 19, 2024
1 parent dc8b7d8 commit b23c851
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 4 deletions.
14 changes: 11 additions & 3 deletions backup/statistics.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,10 @@ func generateAttributeSlotsQuery7(attStat AttributeStatistic) string {
AnyValues(attStat.Values2, attStat.Type),
AnyValues(attStat.Values3, attStat.Type),
AnyValues(attStat.Values4, attStat.Type),
AnyValues(attStat.Values5, attStat.Type))
// Hyperloglog data structure for STATISTIC_KIND_HLL and
// STATISTIC_KIND_FULLHLL is converted into a bytea and
// always stored in last slot
AnyValues(attStat.Values5, "bytea"))
}
return attributeQuery
}
Expand Down Expand Up @@ -233,7 +236,10 @@ func generateAttributeSlotsQuery6(attStat AttributeStatistic) string {
AnyValues(attStat.Values2, attStat.Type),
AnyValues(attStat.Values3, attStat.Type),
AnyValues(attStat.Values4, attStat.Type),
AnyValues(attStat.Values5, attStat.Type))
// Hyperloglog data structure for STATISTIC_KIND_HLL and
// STATISTIC_KIND_FULLHLL is converted into a bytea and
// always stored in last slot
AnyValues(attStat.Values5, "bytea"))
}
return attributeQuery
}
Expand Down Expand Up @@ -288,7 +294,9 @@ func generateAttributeSlotsQuery4(attStat AttributeStatistic) string {
AnyValues(attStat.Values1, attStat.Type),
AnyValues(attStat.Values2, attStat.Type),
AnyValues(attStat.Values3, attStat.Type),
AnyValues(attStat.Values4, attStat.Type))
// Hyperloglog data structure for STATISTIC_KIND_HLL is
// converted into a bytea and always stored in last slot
AnyValues(attStat.Values4, "bytea"))
}
return attributeQuery
}
Expand Down
31 changes: 30 additions & 1 deletion end_to_end/end_to_end_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1604,6 +1604,35 @@ var _ = Describe("backup and restore end to end tests", func() {
// gpbackup before version 1.18.0 does not dump pg_class statistics correctly
skipIfOldBackupVersionBefore("1.18.0")

testhelper.AssertQueryRuns(backupConn, `
CREATE TABLE et (
id character varying(13),
flg smallint,
dttm timestamp without time zone,
src character varying(80)
) WITH (appendonly='true', orientation='row', compresstype=zstd, compresslevel='3') DISTRIBUTED BY (id);
CREATE TABLE pt (
id character varying(13),
flg smallint,
dttm timestamp without time zone,
src character varying(80)
) WITH (appendonly='true', orientation='row', compresstype=zstd, compresslevel='3') DISTRIBUTED BY (id) PARTITION BY LIST(src) (
PARTITION src_mdm VALUES('val') WITH (tablename='pt_1_prt_src_mdm', appendonly='true', orientation='row', compresstype=zstd, compresslevel='3' )
);
INSERT INTO pt(id, flg, dttm, src) VALUES (1, 1, now(), 'val');
INSERT INTO et(id, flg, dttm, src) VALUES (2, 2, now(), 'val');
ANALYZE pt;
ANALYZE et;
ANALYZE ROOTPARTITION pt;
ALTER TABLE pt EXCHANGE PARTITION src_mdm WITH TABLE et;
`)

defer testhelper.AssertQueryRuns(backupConn,
`DROP TABLE et CASCADE; DROP TABLE pt CASCADE;`)
outputBkp := gpbackup(gpbackupPath, backupHelperPath,
"--with-stats",
"--backup-dir", backupDir, "--single-backup-dir")
Expand All @@ -1625,7 +1654,7 @@ var _ = Describe("backup and restore end to end tests", func() {
assertPGClassStatsRestored(backupConn, restoreConn, publicSchemaTupleCounts)
assertPGClassStatsRestored(backupConn, restoreConn, schema2TupleCounts)

statsQuery := fmt.Sprintf(`SELECT count(*) AS string FROM pg_statistic st left join pg_class cl on st.starelid = cl.oid left join pg_namespace nm on cl.relnamespace = nm.oid where %s;`, backup.SchemaFilterClause("nm"))
statsQuery := fmt.Sprintf(`SELECT count(*) AS string FROM pg_statistic st left join pg_class cl on st.starelid = cl.oid left join pg_namespace nm on cl.relnamespace = nm.oid where cl.relname != 'pt_1_prt_src_mdm' AND %s;`, backup.SchemaFilterClause("nm"))
backupStatisticCount := dbconn.MustSelectString(backupConn, statsQuery)
restoredStatisticsCount := dbconn.MustSelectString(restoreConn, statsQuery)

Expand Down

0 comments on commit b23c851

Please sign in to comment.