msg555 · msg555 · Oct 10, 2024 · Oct 9, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# v0.4.2
+
+- Refactored temporary table creation to use sqlalchemy constructs
+- Ensured temporary tables are created with a primary key if the source table had one
+- Added ID compaction functionality
+
 # v0.4.1
 
 - Added logic to appropriately adjust auto-increment sequence state for postgresql

diff --git a/subsetter.example.yaml b/subsetter.example.yaml
@@ -300,3 +300,26 @@ sampler:
   # single-column primary key in another table.
   infer_foreign_keys: none # can be 'none', 'schema', or 'all'
 
+  # Compaction refers to removing gaps in the sampled ID space of a specific
+  # column in a table. In most cases this is unnecessary, but sometimes it can
+  # be helpful to keep the IDs in the sampled dataset small. Enabling compaction
+  # can require more tables to be materialized on the source database and can
+  # have some mild performance impacts on sampling.
+  compact:
+    # If set to true, any table that has a single-column, integral primary key
+    # will have its primary key marked for compaction.
+    primary_keys: false
+
+    # If set to true, any table that has a single-column, integral,
+    # auto-increment primary key will have its primary key marked for
+    # compaction.
+    auto_increment_keys: false
+
+    # Mapping of additional columns that should be compacted if needed. Note
+    # that if multiple columns in the same table are compacted, they will end
+    # up having the same value.
+    columns:
+      db1.gizmo: [extra_id]
+
+    # Minimum ID to assign to the first sampled row of a table.
+    start_key: 1
diff --git a/subsetter/config_model.py b/subsetter/config_model.py
@@ -78,11 +78,17 @@ class MultiplicityConfig(ForbidBaseModel):
         extra_columns: Dict[str, List[str]] = {}
         ignore_primary_key_columns: Dict[str, List[str]] = {}
 
+    class CompactConfig(ForbidBaseModel):
+        primary_keys: bool = False
+        auto_increment_keys: bool = False
+        columns: Dict[str, List[str]] = {}
+        start_key: int = 1
+
     output: OutputType = DirectoryOutputConfig(mode="directory", directory="output")
     filters: Dict[str, List[FilterConfig]] = {}  # type: ignore
     multiplicity: MultiplicityConfig = MultiplicityConfig()
     infer_foreign_keys: Literal["none", "schema", "all"] = "none"
-    compact_keys: bool = False
+    compact: CompactConfig = CompactConfig()
 
 
 class SubsetterConfig(ForbidBaseModel):