From a791b5fbd6d3c98ba13eb25e31e4d7bc31594895 Mon Sep 17 00:00:00 2001 From: April M <36110273+aimurphy@users.noreply.github.com> Date: Mon, 1 Dec 2025 16:18:09 -0800 Subject: [PATCH 1/3] remove dsbulk migrator --- antora.yml | 1 + modules/ROOT/nav.adoc | 4 +- modules/ROOT/pages/astra-migration-paths.adoc | 2 +- modules/ROOT/pages/components.adoc | 18 +- modules/ROOT/pages/create-target.adoc | 3 - .../ROOT/pages/deployment-infrastructure.adoc | 12 +- modules/ROOT/pages/dsbulk-migrator.adoc | 846 ------------------ modules/ROOT/pages/faqs.adoc | 6 +- modules/ROOT/pages/index.adoc | 6 +- modules/ROOT/pages/introduction.adoc | 2 +- .../ROOT/pages/migrate-and-validate-data.adoc | 24 +- modules/ROOT/pages/troubleshooting-tips.adoc | 2 +- .../ROOT/pages/zdm-proxy-migration-paths.adoc | 2 +- .../sideloader/pages/sideloader-overview.adoc | 2 +- 14 files changed, 49 insertions(+), 881 deletions(-) delete mode 100644 modules/ROOT/pages/dsbulk-migrator.adoc diff --git a/antora.yml b/antora.yml index 9cea5ad0..1922b88f 100644 --- a/antora.yml +++ b/antora.yml @@ -51,6 +51,7 @@ asciidoc: dsbulk-migrator: 'DSBulk Migrator' dsbulk-migrator-repo: 'https://github.com/datastax/dsbulk-migrator' dsbulk-loader: 'DSBulk Loader' + dsbulk-loader-repo: 'https://github.com/datastax/dsbulk' cass-migrator: 'Cassandra Data Migrator' cass-migrator-short: 'CDM' cass-migrator-repo: 'https://github.com/datastax/cassandra-data-migrator' diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 7457d69a..6119675e 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -43,5 +43,5 @@ * xref:ROOT:cassandra-data-migrator.adoc[] * {cass-migrator-repo}/releases[{cass-migrator-short} release notes] -.{dsbulk-migrator} -* xref:ROOT:dsbulk-migrator.adoc[] \ No newline at end of file +.{dsbulk-loader} +* xref:dsbulk:overview:dsbulk-about.adoc[] \ No newline at end of file diff --git a/modules/ROOT/pages/astra-migration-paths.adoc b/modules/ROOT/pages/astra-migration-paths.adoc index f2095047..abc773fc 100644 --- a/modules/ROOT/pages/astra-migration-paths.adoc +++ b/modules/ROOT/pages/astra-migration-paths.adoc @@ -9,7 +9,7 @@ If you have questions about migrating from a specific source to {astra-db}, cont .Migration tool compatibility [cols="2,1,1,1,1"] |=== -|Origin |{sstable-sideloader} |{cass-migrator} |{product-proxy} |{dsbulk-migrator}/{dsbulk-loader} +|Origin |{sstable-sideloader} |{cass-migrator} |{product-proxy} |{dsbulk-loader} |Aiven for {cass-short} |icon:check[role="text-success",alt="Supported"] diff --git a/modules/ROOT/pages/components.adoc b/modules/ROOT/pages/components.adoc index 78c611f2..7c16edb5 100644 --- a/modules/ROOT/pages/components.adoc +++ b/modules/ROOT/pages/components.adoc @@ -152,15 +152,23 @@ You can use {cass-migrator-short} by itself, with {product-proxy}, or for data v For more information, see xref:ROOT:cassandra-data-migrator.adoc[]. -=== {dsbulk-migrator} +=== {dsbulk-loader} -{dsbulk-migrator} extends {dsbulk-loader} with migration-specific commands: `migrate-live`, `generate-script`, and `generate-ddl`. +{dsbulk-loader} is a high-performance data loading and unloading tool for {cass-short}-based databases. +You can use it to load, unload, and count records. -It is best for smaller migrations or migrations that don't require extensive data validation, aside from post-migration row counts. 
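+
+For example, the following commands sketch the three core operations.
+This is a minimal outline; the host names, keyspace, table, and paths are placeholders that you must replace with your own values:
+
+[source,bash]
+----
+# Unload a table from the origin cluster to local CSV files.
+dsbulk unload -h ORIGIN_HOST -k my_keyspace -t my_table -url /path/to/export
+
+# Load the exported files into the same table on the target cluster.
+dsbulk load -h TARGET_HOST -k my_keyspace -t my_table -url /path/to/export
+
+# Count the rows on the target cluster for a post-migration spot check.
+dsbulk count -h TARGET_HOST -k my_keyspace -t my_table
+----
+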
+Because {dsbulk-loader} doesn't have the same data validation capabilities as {cass-migrator-short}, it is best for migrations that don't require extensive data validation, aside from post-migration row counts. -You can use {dsbulk-migrator} alone or with {product-proxy}. +You can use {dsbulk-loader} alone or with {product-proxy}. -For more information, see xref:ROOT:dsbulk-migrator.adoc[]. +For more information, see xref:dsbulk:overview:dsbulk-about.adoc[]. + +[TIP] +==== +The {dsbulk-migrator} tool, which was an extension of {dsbulk-loader}, is deprecated. +This tool is no longer recommended. +Instead, use {dsbulk-loader}'s unload, load, and count commands, or use another data migration tool, such as {cass-migrator-short}. +==== === Other data migration processes diff --git a/modules/ROOT/pages/create-target.adoc b/modules/ROOT/pages/create-target.adoc index ef2593a4..7e8f2905 100644 --- a/modules/ROOT/pages/create-target.adoc +++ b/modules/ROOT/pages/create-target.adoc @@ -58,8 +58,6 @@ scp -i some-key.pem /path/to/scb.zip user@client-ip-or-host: [IMPORTANT] ==== On your new database, the keyspace names, table names, column names, data types, and primary keys must be identical to the schema on the origin cluster or the migration will fail. - -To help you prepare the schema from the DDL in your origin cluster, consider using the `generate-ddl` functionality in the {dsbulk-migrator-repo}[{dsbulk-migrator}]. ==== + Note the following limitations and exceptions for tables in {astra-db}: @@ -106,7 +104,6 @@ On your new cluster, the keyspace names, table names, column names, data types, ==== + To copy the schema, you can run CQL `DESCRIBE` on the origin cluster to get the schema that is being migrated, and then run the output on your new cluster. -Alternatively, you can use the `generate-ddl` functionality in the {dsbulk-migrator-repo}[{dsbulk-migrator}]. + If your origin cluster is running an earlier version, you might need to edit CQL clauses that are no longer supported in newer versions, such as `COMPACT STORAGE`. For specific changes in each version, see the release notes for your database platform and {cass-short} driver. diff --git a/modules/ROOT/pages/deployment-infrastructure.adoc b/modules/ROOT/pages/deployment-infrastructure.adoc index f545e8af..0c6ee4a8 100644 --- a/modules/ROOT/pages/deployment-infrastructure.adoc +++ b/modules/ROOT/pages/deployment-infrastructure.adoc @@ -49,7 +49,7 @@ In a sidecar deployment, each client application instance would be connecting to [[_machines]] == Hardware requirements -You need a number of machines to run your {product-proxy} instances, plus additional machines for the centralized jumphost, and for running {dsbulk-migrator} or {cass-migrator} (which are recommended data migration and validation tools). +You need a number of machines to run your {product-proxy} instances, plus additional machines for the centralized jumphost, and for running {dsbulk-loader} or {cass-migrator} (which are recommended data migration and validation tools). This section uses the term _machine_ broadly to refer to a cloud instance (on any cloud provider), a VM, or a physical server. 
@@ -83,10 +83,10 @@ The jumphost machine must meet the following specifications: * 200 to 500 GB of storage depending on the amount of metrics history that you want to retain * Equivalent to AWS **c5.2xlarge**, GCP **e2-standard-8**, or Azure **A8 v2** -Data migration tools ({dsbulk-migrator} or {cass-migrator}):: -You need at least one machine to run {dsbulk-migrator} or {cass-migrator} for the data migration and validation (xref:ROOT:migrate-and-validate-data.adoc[Phase 2]). +Data migration tools ({dsbulk-loader} or {cass-migrator}):: +You need at least one machine to run {dsbulk-loader} or {cass-migrator} for the data migration and validation (xref:ROOT:migrate-and-validate-data.adoc[Phase 2]). Even if you plan to use another data migration tool, you might need infrastructure for these tools or your chosen tool. -For example, you can use {dsbulk-migrator} to generate DDL statements to help you recreate your origin cluster's schema on your target cluster, and {cass-migrator} is used for data validation after migrating data with {sstable-sideloader}. +For example, {cass-migrator} is used for data validation after migrating data with {sstable-sideloader}. + {company} recommends that you start with at least one VM that meets the following minimum specifications: + @@ -95,7 +95,7 @@ For example, you can use {dsbulk-migrator} to generate DDL statements to help yo * 64 GB RAM * 200 GB to 2 TB of storage + -If you plan to use {dsbulk-migrator} to unload and load multiple terabytes of data from the origin cluster to the target cluster, consider allocating additional space for data that needs to be staged between unloading and loading. +If you plan to use {dsbulk-loader} to unload and load multiple terabytes of data from the origin cluster to the target cluster, consider allocating additional space for data that needs to be staged between unloading and loading. * Equivalent to AWS **m5.4xlarge**, GCP **e2-standard-16**, or Azure **D16 v5** + @@ -104,7 +104,7 @@ Whether you need additional machines depends on the total amount of data you nee All machines must meet the minimum specifications. For example, if you have 20 TBs of existing data to migrate, you could use 4 VMs to speed up the migration. -Then, you would run {dsbulk-migrator} or {cass-migrator} in parallel on each VM with each one responsible for migrating specific tables or a portion of the data, such as 25% or 5 TB. +Then, you would run {dsbulk-loader} or {cass-migrator} in parallel on each VM with each one responsible for migrating specific tables or a portion of the data, such as 25% or 5 TB. If you have one especially large table, such as 75% of the total data in one table, you can use multiple VMs to migrate that one table. For example, if you have 4 VMs, you can use three VMs in parallel for the large table by splitting the table's full token range into three groups. diff --git a/modules/ROOT/pages/dsbulk-migrator.adoc b/modules/ROOT/pages/dsbulk-migrator.adoc deleted file mode 100644 index 4ff82116..00000000 --- a/modules/ROOT/pages/dsbulk-migrator.adoc +++ /dev/null @@ -1,846 +0,0 @@ -= Use {dsbulk-migrator} with {product-proxy} -:navtitle: Use {dsbulk-migrator} -:description: {dsbulk-migrator} extends {dsbulk-loader} with migration commands. -:page-aliases: ROOT:dsbulk-migrator-overview.adoc - -{dsbulk-migrator} is an extension of xref:dsbulk:overview:dsbulk-about.adoc[{dsbulk-loader}] that adds the following three commands: - -* `migrate-live`: Immediately runs a live data migration using {dsbulk-loader}. 
- -* `generate-script`: Generates a migration script that you can use to run a data migration with a standalone {dsbulk-loader} installation. -This command _doesn't_ trigger the migration; it only generates the migration script. - -* `generate-ddl`: Reads the origin cluster's schema, and then generates CQL files that you can use to recreate the schema on your target cluster in preparation for data migration. - -{dsbulk-migrator} is best for smaller migrations and migrations that don't require extensive data validation aside from post-migration row counts. -You might also use this tool for migrations where you can shard data from large tables into more manageable quantities. - -You can use {dsbulk-migrator} alone or with {product-proxy}. - -== Install {dsbulk-migrator} - -. Install Java 11 and https://maven.apache.org/download.cgi[Maven] 3.9.x. - -. Optional: If you don't want to use the embedded {dsbulk-loader} that is bundled with {dsbulk-migrator}, you must xref:dsbulk:overview:install.adoc[install {dsbulk-loader}] before installing {dsbulk-migrator}. - -. Clone the {dsbulk-migrator-repo}[{dsbulk-migrator} repository]: -+ -[source,bash] ----- -git clone git@github.com:datastax/dsbulk-migrator.git ----- - -. Change to the cloned directory: -+ -[source,bash] ----- -cd dsbulk-migrator ----- - -. Use Maven to build {dsbulk-migrator}: -+ -[source,bash] ----- -mvn clean package ----- -+ -[[dsbulk-jar]]The build produces two distributable fat jars. -You will use one of these jars when you run a {dsbulk-migrator} command. -+ -* `dsbulk-migrator-**VERSION**-embedded-dsbulk.jar`: Contains an embedded {dsbulk-loader} installation and an embedded Java driver. -+ -Supports all {dsbulk-migrator} operations, but it is larger than the other JAR due to the presence of the {dsbulk-loader} classes. -+ -Use this jar if you _don't_ want to use your own {dsbulk-loader} installation. - -* `dsbulk-migrator-**VERSION**-embedded-driver.jar`: Contains an embedded Java driver only. -+ -Suitable for using the `generate-script` and `migrate-live` commands with your own {dsbulk-loader} installation. -+ -You cannot use this jar for `migrate-live` with the embedded {dsbulk-loader} because the required {dsbulk-loader} classes aren't present in this jar. - -. https://github.com/datastax/simulacron[Clone and build Simulacron], which is required for some {dsbulk-migrator} integration tests. -+ -Note the https://github.com/datastax/simulacron?tab=readme-ov-file#prerequisites[prerequisites for Simulacron], particularly for macOS. - -. Run the {dsbulk-migrator} integration tests: -+ -[source,bash] ----- -mvn clean verify ----- - -After you install, build, and test {dsbulk-migrator}, you can run it from the command line, specifying your desired jar, command, and options. - -For a quick test, try the `<>` option. - -For information and examples for each command, see the following: - -* <> -* <> -* <> - -[#get-help-for-dsbulk-migrator] -== Get help for {dsbulk-migrator} - -Use `--help` (`-h`) to get information about {dsbulk-migrator} commands and options: - -* Print the available {dsbulk-migrator} commands: -+ -[source,bash] ----- -java -jar /path/to/dsbulk-migrator.jar --help ----- -+ -Replace `/path/to/dsbulk-migrator.jar` with the path to your <>. - -* Print help for a specific command: -+ -[source,bash,subs="+quotes"] ----- -java -jar /path/to/dsbulk-migrator.jar **COMMAND** --help ----- -+ -Replace the following: -+ -** `/path/to/dsbulk-migrator.jar`: The path to your <>. 
-** `COMMAND`: The command for which you want to get help, one of `migrate-live`, `generate-script`, or `generate-ddl`. - -[#dsbulk-live] -== Run a live migration - -The `migrate-live` command immediately runs a live data migration using the embedded version of {dsbulk-loader} or your own {dsbulk-loader} installation. -A _live migration_ means the data migration starts immediately, and it is handled by the migrator tool through the specified {dsbulk-loader} installation. - -To run the `migrate-live` command, provide the path to your <> followed by `migrate-live` and any options: - -[source,bash,subs="+quotes"] ----- -java -jar /path/to/dsbulk-migrator.jar migrate-live **OPTIONS** ----- - -The following examples show how to use either fat jar to perform a live migration where the target cluster is an {astra-db} database. -The password parameters are left blank so that {dsbulk-migrator} prompts for them interactively during the migration. -All unspecified options use their default values. - -[tabs] -====== -Use the embedded {dsbulk-loader}:: -+ --- -If you want to run the migration with the embedded {dsbulk-loader}, you must use the `dsbulk-migrator-**VERSION**-embedded-dsbulk.jar` fat jar and the `--dsbulk-use-embedded` option: - -[source,bash,subs="+quotes"] ----- - java -jar target/dsbulk-migrator-**VERSION**-embedded-dsbulk.jar migrate-live \ - --data-dir=/path/to/data/dir \ - --dsbulk-use-embedded \ - --dsbulk-log-dir=/path/to/log/dir \ - --export-host=**ORIGIN_CLUSTER_HOSTNAME** \ - --export-username=**ORIGIN_USERNAME** \ - --export-password # Origin password will be prompted \ - --export-dsbulk-option "--connector.csv.maxCharsPerColumn=65536" \ - --export-dsbulk-option "--executor.maxPerSecond=1000" \ - --import-bundle=/path/to/scb.zip \ - --import-username=token \ - --import-password # Application token will be prompted \ - --import-dsbulk-option "--connector.csv.maxCharsPerColumn=65536" \ - --import-dsbulk-option "--executor.maxPerSecond=1000" ----- --- - -Use your own {dsbulk-loader} installation:: -+ --- -If you want to run the migration with your own {dsbulk-loader} installation, use the `dsbulk-migrator-**VERSION**-embedded-driver.jar` fat jar, and use the `--dsbulk-cmd` option to specify the path to your {dsbulk-loader} installation: - -[source,bash,subs="+quotes,macros"] ----- - java -jar target/dsbulk-migrator-**VERSION**-embedded-driver.jar migrate-live \ - --data-dir=/path/to/data/dir \ - --dsbulk-cmd=pass:q[${DSBULK_ROOT}]/bin/dsbulk \ - --dsbulk-log-dir=/path/to/log/dir \ - --export-host=**ORIGIN_CLUSTER_HOSTNAME** \ - --export-username=**ORIGIN_USERNAME** \ - --export-password # Origin password will be prompted \ - --import-bundle=/path/to/scb.zip \ - --import-username=token \ - --import-password # Application token will be prompted ----- - --- -====== - -=== Options for migrate-live - -Options for the `migrate-live` command are used to configure the migration parameters and connect to the origin and target clusters. - -Most options have sensible default values and don't need to be specified unless you want to override the default value. - -[cols="1,3"] -|=== -| Option | Description - -| `--data-dir` (`-d`) -| The directory where data is exported to and imported from. -The directory is created if it doesn't exist. - -The default is a `data` subdirectory in the current working directory. 
- -Tables are exported and imported in subdirectories of the specified data directory: One subdirectory is created for each keyspace, and then one subdirectory is created for each table within each keyspace subdirectory. - -| `--dsbulk-cmd` (`-c`) -| The path to your own external (non-embedded) {dsbulk-loader} installation, such as `--dsbulk-cmd=pass:q[${DSBULK_ROOT}]/bin/dsbulk`. - -The default is `dsbulk`, which assumes that the command is available through the `PATH` variable contents. - -Ignored if the embedded {dsbulk-loader} is used (`--dsbulk-use-embedded`). - -| `--dsbulk-log-dir` (`-l`) -| The path to the directory where you want to store {dsbulk-loader} logs, such as `--dsbulk-log-dir=~/tmp/dsbulk-logs`. -The directory is created if it doesn't exist. - -The default is a `logs` subdirectory in the current working directory. - -Each {dsbulk-loader} operation creates its own subdirectory within the specified log directory. - -This parameter applies whether you use the embedded {dsbulk-loader} or your own external (non-embedded) {dsbulk-loader} installation. - -| `--dsbulk-use-embedded` (`-e`) -| Use the embedded {dsbulk-loader}. -Accepts no arguments; it's either included (enabled) or not (disabled). - -By default, this option is disabled/omitted, and `migrate-live` expects to use an external (non-embedded) {dsbulk-loader} installation. -If disabled/omitted, set the path to your {dsbulk-loader} installation in `--dsbulk-cmd`. - -| `--dsbulk-working-dir` (`-w`) -| The path to the directory where you want to run `dsbulk`, such as `--dsbulk-working-dir=~/tmp/dsbulk-work`. -The default is the current working directory. - -Only applicable when using your own external (non-embedded) {dsbulk-loader} installation with the `--dsbulk-cmd` option. -Ignored if the embedded {dsbulk-loader} is used (`--dsbulk-use-embedded`). - -| `--export-bundle` -| If your origin cluster is an {astra-db} database, provide the path to your database's {scb}, such as `--export-bundle=/path/to/scb.zip`. - -Cannot be used with `--export-host`. - -| `--export-consistency` -| The consistency level to use when exporting data. -The default is `--export-consistency=LOCAL_QUORUM`. - -| `--export-dsbulk-option` -| An additional xref:dsbulk:reference:dsbulk-cmd.adoc#options[{dsbulk-loader} option] to use when exporting data. - -The expected format is `--export-dsbulk-option "--option.full.name=value"`. - -You must use the option's full long form name and leading dashes; short form options will fail. -You must wrap the entire expression in quotes so that it is handled correctly by {dsbulk-migrator}. -This is in addition to any xref:dsbulk:reference:dsbulk-cmd.adoc#escape-and-quote-command-line-arguments[escaping] required for {dsbulk-loader} to process the option correctly. - -To pass multiple additional options, pass each option separately with `--export-dsbulk-option`. -For example: `--export-dsbulk-option "--connector.csv.maxCharsPerColumn=65536" --export-dsbulk-option "--executor.maxPerSecond=1000"`. - -| `--export-host` -a| The origin cluster's host name or IP address, and an optional port for a node in the origin cluster. -The default port is `9042` if not specified. -For example: - -* Hostname with default port: `--export-host=db2.example.com` -* Hostname with custom port: `--export-host=db1.example.com:9001` -* IP address with default port: `--export-host=1.2.3.5` -* IP address with custom port: `--export-host=1.2.3.4:9001` - -This option can be passed multiple times. 
- -If your origin cluster is an {astra-db} database, use `--export-bundle` instead of `--export-host`. - -| `--export-max-concurrent-files` -| The maximum number of concurrent files to write to when exporting data from the origin cluster. - -Can be either `AUTO` (default) or a positive integer, such as `--export-max-concurrent-files=8`. - -| `--export-max-concurrent-queries` -| The maximum number of concurrent queries to execute. - -Can be either `AUTO` (default) or a positive integer, such as `--export-max-concurrent-queries=8`. - -| `--export-max-records` -| The maximum number of records to export for each table. - -The default is `-1`, which exports the entire table (all records). - -To export a fixed number of records, set to a positive integer, such as `--export-max-records=10000`. - -| `--export-password` -| The password for authentication to the origin cluster. - -You can either provide the password directly (`--export-password=pass:q[${ORIGIN_PASSWORD}]`), or pass the option without a value (`--export-password`) to be prompted for the password interactively. - -If set, then `--export-username` is required. - -If the cluster doesn't require authentication, omit both `--export-username` and `--export-password`. - -If your origin cluster is an {astra-db} database, the password is an {astra} application token. - -| `--export-protocol-version` -| The protocol version to use when connecting to the origin cluster, such as `--export-protocol-version=V4`. - -If unspecified, the driver negotiates the highest version supported by both the client and the server. - -Specify only if you want to force the protocol version. - -| `--export-splits` -a| The maximum number of token range queries to generate. - -This is an advanced setting that {company} doesn't recommend modifying unless you have a specific need to do so. - -Can be either of the following: - -* A positive integer, such as `--export-splits=16`. -* A multiple of the number of available cores, specified as `NC` where `N` is the number of cores, such as `--export-splits=8C`. - -The default is `8C` (8 times the number of available cores). - -| `--export-username` -| The username for authentication to the origin cluster. - -If set, then `--export-password` is required. - -If the cluster doesn't require authentication, omit both `--export-username` and `--export-password`. - -If your origin cluster is an {astra-db} database, the username is the literal string `token`, such as `--export-username=token`. - -| `--import-bundle` -| If your target cluster is an {astra-db} database, provide the path to your database's {scb}, such as `--import-bundle=/path/to/scb.zip`. - -Cannot be used with `--import-host`. - -| `--import-consistency` -| The consistency level to use when importing data. -The default is `--import-consistency=LOCAL_QUORUM`. - -| `--import-default-timestamp` -| The default timestamp to use when importing data. -Must be a valid instant in ISO-8601 format. -The default is `--import-default-timestamp=1970-01-01T00:00:00Z`. - -| `--import-dsbulk-option` -| An additional xref:dsbulk:reference:dsbulk-cmd.adoc#options[{dsbulk-loader} option] to use when importing data. - -The expected format is `--import-dsbulk-option "--option.full.name=value"`. - -You must use the option's full long form name and leading dashes; short form options will fail. -You must wrap the entire expression in quotes so that it is handled correctly by {dsbulk-migrator}. 
-This is in addition to any xref:dsbulk:reference:dsbulk-cmd.adoc#escape-and-quote-command-line-arguments[escaping] required for {dsbulk-loader} to process the option correctly. - -To pass multiple additional options, pass each option separately with `--import-dsbulk-option`. -For example: `--import-dsbulk-option "--connector.csv.maxCharsPerColumn=65536" --import-dsbulk-option "--executor.maxPerSecond=1000"`. - -| `--import-host` -a| The target cluster's host name or IP address, and an optional port for a node in the target cluster. -The default port is `9042` if not specified. -For example: - -* Hostname with default port: `--import-host=db2.example.com` -* Hostname with custom port: `--import-host=db1.example.com:9001` -* IP address with default port: `--import-host=1.2.3.5` -* IP address with custom port: `--import-host=1.2.3.4:9001` - -This option can be passed multiple times. - -If your target cluster is an {astra-db} database, use `--import-bundle` instead of `--import-host`. - -| `--import-max-concurrent-files` -| The maximum number of concurrent files to read from when importing data to the target cluster. - -Can be either `AUTO` (default) or a positive integer, such as `--import-max-concurrent-files=8`. - -| `--import-max-concurrent-queries` -| The maximum number of concurrent queries to execute. - -Can be either `AUTO` (default) or a positive integer, such as `--import-max-concurrent-queries=8`. - -| `--import-max-errors` -| The maximum number of failed records to tolerate when importing data. - -Must be a positive integer, such as `--import-max-errors=5000`. -The default is `1000`. - -Failed records are written to a `load.bad` file in the {dsbulk-loader} operation directory. - -| `--import-password` -| The password for authentication to the target cluster. - -You can either provide the password directly (`--import-password=pass:q[${TARGET_PASSWORD}]`), or pass the option without a value (`--import-password`) to be prompted for the password interactively. - -If set, then `--import-username` is required. - -If the cluster doesn't require authentication, omit both `--import-username` and `--import-password`. - -If your target cluster is an {astra-db} database, the password is an {astra} application token. - -| `--import-protocol-version` -| The protocol version to use when connecting to the target cluster, such as `--import-protocol-version=V4`. - -If unspecified, the driver negotiates the highest version supported by both the client and the server. - -Specify only if you want to force the protocol version. - -| `--import-username` -| The username for authentication to the target cluster. - -If set, then `--import-password` is required. - -If the cluster doesn't require authentication, omit both `--import-username` and `--import-password`. - -If your target cluster is an {astra-db} database, the username is the literal string `token`, such as `--import-username=token`. - -| `--keyspaces` (`-k`) -| A regular expression to select keyspaces to migrate, such as `--keyspaces="^(my_keyspace\|anotherKeyspace)$"`. - -The default expression is `^(?!system\|dse\|OpsCenter)\\w+$`, which migrates all keyspaces except system keyspaces, {dse-short}-specific keyspaces, and the OpsCenter keyspace if these are present on the origin cluster. - -Case-sensitive keyspace names must be specified by their exact case. - -| `--max-concurrent-ops` -| The maximum number of concurrent operations (exports and imports) to carry. - -The default is `1`. 
- -Increase this value to allow exports and imports to occur concurrently. -For example, if `--max-concurrent-ops=2`, then each table is imported as soon as it is exported, and the next table immediately begins being exported as soon as the previous table starts importing. - -| `--skip-truncate-confirmation` -| Whether to bypass truncation confirmation before actually truncating counter tables. - -The default is disabled/omitted, which means you must confirm truncation before counter tables are truncated. - -Only applicable when migrating counter tables. -This option is ignored otherwise. - -| `--tables` (`-t`) -| A regular expression to select tables to migrate, such as `--tables="^(table1\|table_two)$"`. - -The default expression is `.{asterisk}`, which migrates all tables in the keyspaces that are selected by the `--keyspaces` option. - -Case-sensitive table names must be specified by their exact case. - -| `--table-types` -a| The table types to migrate: - -* `--table-types=regular`: Migrate only regular tables. -* `--table-types=counter`: Migrate only counter tables. -* `--table-types=all` (default): Migrate both regular and counter tables. - -| `--truncate-before-export` -| Truncate counter tables before exporting them, rather than truncating them afterwards. - -The default is disabled/omitted, which means counter tables are truncated after being exported. - -Only applicable when migrating counter tables. -This option is ignored otherwise. -|=== - -[#dsbulk-script] -== Generate a migration script - -The `generate-script` command generates a migration script that you can use to perform a data migration with your own {dsbulk-loader} installation. -This command _doesn't_ trigger the migration; it only generates the migration script that you must then run. - -If you want to run a migration immediately, or you want to use the embedded {dsbulk-loader}, use the `migrate-live` command instead. - -To run the `generate-script` command, provide the path to your <> followed by `generate-script` and any options: - -[source,bash,subs="+quotes"] ----- -java -jar /path/to/dsbulk-migrator.jar generate-script **OPTIONS** ----- - -The following example generates a migration script where the target cluster is an {astra-db} database. -The `--dsbulk-cmd` option specifies the path to the {dsbulk-loader} installation that you plan to use to run the generated migration script. -All unspecified options use their default values. - -[source,bash,subs="+quotes,macros"] ----- - java -jar target/dsbulk-migrator-**VERSION**-embedded-driver.jar generate-script \ - --data-dir=/path/to/data/dir \ - --dsbulk-cmd=pass:q[${DSBULK_ROOT}]/bin/dsbulk \ - --dsbulk-log-dir=/path/to/log/dir \ - --export-host=**ORIGIN_CLUSTER_HOSTNAME** \ - --export-username=**ORIGIN_USERNAME** \ - --export-password=**ORIGIN_PASSWORD** \ - --import-bundle=/path/to/scb.zip \ - --import-username=token \ - --import-password=**ASTRA_APPLICATION_TOKEN** ----- - -=== Options for generate-script - -The options for the `generate-script` command become options in the generated migration script. -The only exceptions are the origin cluster connection parameters (`export-username`, `export-password`, `export-host`, `export-bundle`), which are used in the migration script _and_ by {dsbulk-migrator} to gather metadata about the tables to migrate. - -Most options have sensible default values and don't need to be specified unless you want to override the default value. 
- -[cols="1,3"] -|=== -| Option | Description - -| `--data-dir` (`-d`) -| The directory where you want the generated migration script files are stored. -The directory is created if it doesn't exist. - -The default is a `data` subdirectory in the current working directory. - -| `--dsbulk-cmd` (`-c`) -| The path to an external (non-embedded) {dsbulk-loader} installation, such as `--dsbulk-cmd=pass:q[${DSBULK_ROOT}]/bin/dsbulk`. - -The default is `dsbulk`, which assumes that the command is available through the `PATH` variable contents. - -| `--dsbulk-log-dir` (`-l`) -| The path to the directory where you want to store {dsbulk-loader} logs, such as `--dsbulk-log-dir=~/tmp/dsbulk-logs`. -The directory is created if it doesn't exist. - -The default is a `logs` subdirectory in the current working directory. - -Each {dsbulk-loader} operation creates its own subdirectory within the specified log directory. - -| `--dsbulk-working-dir` (`-w`) -| The path to the directory where you want to run `dsbulk`, such as `--dsbulk-working-dir=~/tmp/dsbulk-work`. -The default is the current working directory. - -| `--export-bundle` -| If your origin cluster is an {astra-db} database, provide the path to your database's {scb}, such as `--export-bundle=/path/to/scb.zip`. - -Cannot be used with `--export-host`. - -| `--export-consistency` -| The consistency level to use when exporting data. -The default is `--export-consistency=LOCAL_QUORUM`. - -| `--export-dsbulk-option` -| An additional xref:dsbulk:reference:dsbulk-cmd.adoc#options[{dsbulk-loader} option] to use when exporting data. - -The expected format is `--export-dsbulk-option "--option.full.name=value"`. - -You must use the option's full long form name and leading dashes; short form options will fail. -You must wrap the entire expression in quotes so that it is handled correctly by {dsbulk-migrator}. -This is in addition to any xref:dsbulk:reference:dsbulk-cmd.adoc#escape-and-quote-command-line-arguments[escaping] required for {dsbulk-loader} to process the option correctly. - -To pass multiple additional options, pass each option separately with `--export-dsbulk-option`. -For example: `--export-dsbulk-option "--connector.csv.maxCharsPerColumn=65536" --export-dsbulk-option "--executor.maxPerSecond=1000"`. - -| `--export-host` -a| The origin cluster's host name or IP address, and an optional port for a node in the origin cluster. -The default port is `9042` if not specified. -For example: - -* Hostname with default port: `--export-host=db2.example.com` -* Hostname with custom port: `--export-host=db1.example.com:9001` -* IP address with default port: `--export-host=1.2.3.5` -* IP address with custom port: `--export-host=1.2.3.4:9001` - -This option can be passed multiple times. - -If your origin cluster is an {astra-db} database, use `--export-bundle` instead of `--export-host`. - -| `--export-max-concurrent-files` -| The maximum number of concurrent files to write to when exporting data from the origin cluster. - -Can be either `AUTO` (default) or a positive integer, such as `--export-max-concurrent-files=8`. - -| `--export-max-concurrent-queries` -| The maximum number of concurrent queries to execute. - -Can be either `AUTO` (default) or a positive integer, such as `--export-max-concurrent-queries=8`. - -| `--export-max-records` -| The maximum number of records to export for each table. - -The default is `-1`, which exports the entire table (all records). 
- -To export a fixed number of records, set to a positive integer, such as `--export-max-records=10000`. - -| `--export-password` -| The password for authentication to the origin cluster. - -You can either provide the password directly (`--export-password=pass:q[${ORIGIN_PASSWORD}]`), or pass the option without a value (`--export-password`) to be prompted for the password interactively. - -If set, then `--export-username` is required. - -If the cluster doesn't require authentication, omit both `--export-username` and `--export-password`. - -If your origin cluster is an {astra-db} database, the password is an {astra} application token. - -| `--export-protocol-version` -| The protocol version to use when connecting to the origin cluster, such as `--export-protocol-version=V4`. - -If unspecified, the driver negotiates the highest version supported by both the client and the server. - -Specify only if you want to force the protocol version. - -| `--export-splits` -a| The maximum number of token range queries to generate. - -This is an advanced setting that {company} doesn't recommend modifying unless you have a specific need to do so. - -Can be either of the following: - -* A positive integer, such as `--export-splits=16`. -* A multiple of the number of available cores, specified as `NC` where `N` is the number of cores, such as `--export-splits=8C`. - -The default is `8C` (8 times the number of available cores). - -| `--export-username` -| The username for authentication to the origin cluster. - -If set, then `--export-password` is required. - -If the cluster doesn't require authentication, omit both `--export-username` and `--export-password`. - -If your origin cluster is an {astra-db} database, the username is the literal string `token`, such as `--export-username=token`. - -| `--import-bundle` -| If your target cluster is an {astra-db} database, provide the path to your database's {scb}, such as `--import-bundle=/path/to/scb.zip`. - -Cannot be used with `--import-host`. - -| `--import-consistency` -| The consistency level to use when importing data. -The default is `--import-consistency=LOCAL_QUORUM`. - -| `--import-default-timestamp` -| The default timestamp to use when importing data. -Must be a valid instant in ISO-8601 format. -The default is `--import-default-timestamp=1970-01-01T00:00:00Z`. - -| `--import-dsbulk-option` -| An additional xref:dsbulk:reference:dsbulk-cmd.adoc#options[{dsbulk-loader} option] to use when importing data. - -The expected format is `--import-dsbulk-option "--option.full.name=value"`. - -You must use the option's full long form name and leading dashes; short form options will fail. -You must wrap the entire expression in quotes so that it is handled correctly by {dsbulk-migrator}. -This is in addition to any xref:dsbulk:reference:dsbulk-cmd.adoc#escape-and-quote-command-line-arguments[escaping] required for {dsbulk-loader} to process the option correctly. - -To pass multiple additional options, pass each option separately with `--import-dsbulk-option`. -For example: `--import-dsbulk-option "--connector.csv.maxCharsPerColumn=65536" --import-dsbulk-option "--executor.maxPerSecond=1000"`. - -| `--import-host` -a| The target cluster's host name or IP address, and an optional port for a node in the target cluster. -The default port is `9042` if not specified. 
-For example: - -* Hostname with default port: `--import-host=db2.example.com` -* Hostname with custom port: `--import-host=db1.example.com:9001` -* IP address with default port: `--import-host=1.2.3.5` -* IP address with custom port: `--import-host=1.2.3.4:9001` - -This option can be passed multiple times. - -If your target cluster is an {astra-db} database, use `--import-bundle` instead of `--import-host`. - -| `--import-max-concurrent-files` -| The maximum number of concurrent files to read from when importing data to the target cluster. - -Can be either `AUTO` (default) or a positive integer, such as `--import-max-concurrent-files=8`. - -| `--import-max-concurrent-queries` -| The maximum number of concurrent queries to execute. - -Can be either `AUTO` (default) or a positive integer, such as `--import-max-concurrent-queries=8`. - -| `--import-max-errors` -| The maximum number of failed records to tolerate when importing data. - -Must be a positive integer, such as `--import-max-errors=5000`. -The default is `1000`. - -Failed records are written to a `load.bad` file in the {dsbulk-loader} operation directory. - -| `--import-password` -| The password for authentication to the target cluster. - -You can either provide the password directly (`--import-password=pass:q[${TARGET_PASSWORD}]`), or pass the option without a value (`--import-password`) to be prompted for the password interactively. - -If set, then `--import-username` is required. - -If the cluster doesn't require authentication, omit both `--import-username` and `--import-password`. - -If your target cluster is an {astra-db} database, the password is an {astra} application token. - -| `--import-protocol-version` -| The protocol version to use when connecting to the target cluster, such as `--import-protocol-version=V4`. - -If unspecified, the driver negotiates the highest version supported by both the client and the server. - -Specify only if you want to force the protocol version. - -| `--import-username` -| The username for authentication to the target cluster. - -If set, then `--import-password` is required. - -If the cluster doesn't require authentication, omit both `--import-username` and `--import-password`. - -If your target cluster is an {astra-db} database, the username is the literal string `token`, such as `--import-username=token`. - -| `--keyspaces` (`-k`) -| A regular expression to select keyspaces to migrate, such as `--keyspaces="^(my_keyspace\|anotherKeyspace)$"`. - -The default expression is `^(?!system\|dse\|OpsCenter)\\w+$`, which migrates all keyspaces except system keyspaces, {dse-short}-specific keyspaces, and the OpsCenter keyspace if these are present on the origin cluster. - -Case-sensitive keyspace names must be specified by their exact case. - -| `--tables` (`-t`) -| A regular expression to select tables to migrate, such as `--tables="^(table1\|table_two)$"`. - -The default expression is `.{asterisk}`, which migrates all tables in the keyspaces that are selected by the `--keyspaces` option. - -Case-sensitive table names must be specified by their exact case. - -| `--table-types` -a| The table types to migrate: - -* `--table-types=regular`: Migrate only regular tables. -* `--table-types=counter`: Migrate only counter tables. -* `--table-types=all` (default): Migrate both regular and counter tables. -|=== - -=== Unsupported live migration options for migration scripts - -The following `migrate-live` options cannot be set in `generate-script`. 
-If you want to use these options, you must run the migration directly with `migrate-live` instead of generating a script. - -* `--dsbulk-use-embedded`: Not applicable to `generate-script` because the resulting script is intended to be run with your own (non-embedded) {dsbulk-loader} installation. - -* `--max-concurrent-ops`: Cannot be customized in `generate-script`. -Uses the default value of `1`. - -* `--skip-truncate-confirmation`: Cannot be customized in `generate-script`. -Uses the default behavior of requiring confirmation before truncating counter tables. - -* `--truncate-before-export`: Cannot be customized in `generate-script`. -Uses the default behavior of truncating counter tables after exporting them. - -* `--data-dir`: In `generate-script`, this parameter sets the location to store the generated script files. -There is no `generate-script` option to set a custom data directory for the migration's actual import and export operations. -When you run the migration script, the default data directory is used for the data export and import operations, which is a `data` subdirectory in the current working directory. - -[#dsbulk-ddl] -== Generate DDL files - -The `generate-ddl` command reads the origin cluster's schema, and then generates CQL files that you can use to recreate the schema on your target CQL-compatible cluster. - -To run the `generate-ddl` command, provide the path to your <> followed by `generate-ddl` and any options: - -[source,bash,subs="+quotes"] ----- -java -jar /path/to/dsbulk-migrator.jar generate-ddl **OPTIONS** ----- - -The following example generates DDL files that are optimized for recreating the schema on an {astra-db} database: - -[source,bash,subs="+quotes"] ----- - java -jar target/dsbulk-migrator-**VERSION**-embedded-driver.jar generate-ddl \ - --data-dir=/path/to/data/directory \ - --export-host=**ORIGIN_CLUSTER_HOSTNAME** \ - --export-username=**ORIGIN_USERNAME** \ - --export-password=**ORIGIN_PASSWORD** \ - --optimize-for-astra ----- - -=== Options for generate-ddl - -The `generate-ddl` command ignores all `import-{asterisk}` options and {dsbulk-loader}-related options because they aren't relevant to this operation. - -Origin cluster connection details (`export-{asterisk}` options) are required so that {dsbulk-migrator} can access the origin cluster to gather metadata about the keyspaces and tables for the DDL statements. - -Most options have sensible default values and don't need to be specified unless you want to override the default value. - -[cols="1,3"] -|=== -| Option | Description - -| `--data-dir` (`-d`) -| The directory where you want to store the generated CQL files. -The directory is created if it doesn't exist. - -The default is a `data` subdirectory in the current working directory. - -| `--export-bundle` -| If your origin cluster is an {astra-db} database, provide the path to your database's {scb}, such as `--export-bundle=/path/to/scb.zip`. - -Cannot be used with `--export-host`. - -| `--export-host` -a| The origin cluster's host name or IP address, and an optional port for a node in the origin cluster. -The default port is `9042` if not specified. -For example: - -* Hostname with default port: `--export-host=db2.example.com` -* Hostname with custom port: `--export-host=db1.example.com:9001` -* IP address with default port: `--export-host=1.2.3.5` -* IP address with custom port: `--export-host=1.2.3.4:9001` - -This option can be passed multiple times. 
- -If your origin cluster is an {astra-db} database, use `--export-bundle` instead of `--export-host`. - -| `--export-password` -| The password for authentication to the origin cluster. - -You can either provide the password directly (`--export-password=pass:q[${ORIGIN_PASSWORD}]`), or pass the option without a value (`--export-password`) to be prompted for the password interactively. - -If set, then `--export-username` is required. - -If the cluster doesn't require authentication, omit both `--export-username` and `--export-password`. - -If your origin cluster is an {astra-db} database, the password is an {astra} application token. - -| `--export-protocol-version` -| The protocol version to use when connecting to the origin cluster, such as `--export-protocol-version=V4`. - -If unspecified, the driver negotiates the highest version supported by both the client and the server. - -Specify only if you want to force the protocol version. - -| `--export-username` -| The username for authentication to the origin cluster. - -If set, then `--export-password` is required. - -If the cluster doesn't require authentication, omit both `--export-username` and `--export-password`. - -If your origin cluster is an {astra-db} database, the username is the literal string `token`, such as `--export-username=token`. - -| `--keyspaces` (`-k`) -| A regular expression to select keyspaces to include in the generated CQL files, such as `--keyspaces="^(my_keyspace\|anotherKeyspace)$"`. - -The default expression is `^(?!system\|dse\|OpsCenter)\\w+$`, which includes all keyspaces except system keyspaces, {dse-short}-specific keyspaces, and the OpsCenter keyspace if these are present on the origin cluster. - -Case-sensitive keyspace names must be specified by their exact case. - -| `--optimize-for-astra` (`-a`) -| Produce CQL files optimized for {astra-db}. - -xref:astra-db-serverless:cql:develop-with-cql.adoc#unsupported-values-are-ignored[{astra-db} doesn't support all CQL options in DDL statements]. -This option omits forbidden CQL options from the generated CQL files so you can use them to create the schema in your {astra-db} database without producing warnings or errors. - -The default is disabled/omitted, which generates the CQL files as-is without any {astra-db}-specific optimizations. - -| `--tables` (`-t`) -| A regular expression to select tables to include in the generated CQL files, such as `--tables="^(table1\|table_two)$"`. - -The default expression is `.{asterisk}`, which includes all tables in the keyspaces that are selected by the `--keyspaces` option. - -Case-sensitive table names must be specified by their exact case. - -| `--table-types` -a| The table types to include in the generated CQL files: - -* `--table-types=regular`: Include only regular tables. -* `--table-types=counter`: Include only counter tables. -* `--table-types=all` (default): Include both regular and counter tables. -|=== \ No newline at end of file diff --git a/modules/ROOT/pages/faqs.adoc b/modules/ROOT/pages/faqs.adoc index fbf9a3d8..7de2fe0d 100644 --- a/modules/ROOT/pages/faqs.adoc +++ b/modules/ROOT/pages/faqs.adoc @@ -60,7 +60,7 @@ Yes, you can use the xref:ROOT:introduction.adoc#lab[{product-short} interactive The {product-short} tools are {product-proxy}, {product-utility}, and {product-automation}. These tools orchestrate the traffic between your client applications and the origin and target clusters during the migration process. 
-For the actual data migration, there are many tools you can use, such as {sstable-sideloader}, {cass-migrator}, {dsbulk-migrator}, and custom data migration scripts. +For the actual data migration, there are many tools you can use, such as {sstable-sideloader}, {cass-migrator}, {dsbulk-loader}, and custom data migration scripts. For more information, see xref:ROOT:components.adoc[]. @@ -97,7 +97,7 @@ You can have the confidence that you are using tools designed specifically to ha == What is the pricing model? -{product-proxy}, {product-utility}, {product-automation}, {cass-migrator}, and {dsbulk-migrator} are free and open-sourced. +{product-proxy}, {product-utility}, {product-automation}, {cass-migrator}, and {dsbulk-loader} are free and open-sourced. {sstable-sideloader} is part of an {astra-db} *Enterprise* subscription plan, and it incurs costs based on usage. @@ -110,7 +110,7 @@ For any observed problems with {product-proxy} or the other open-source {product * {product-proxy-repo}[{product-proxy} repository] * {product-automation-repo}[{product-automation} repository] (includes {product-automation} and {product-utility}) * {cass-migrator-repo}[{cass-migrator} repository] -* {dsbulk-migrator-repo}[{dsbulk-migrator} repository] +* {dsbulk-loader-repo}[{dsbulk-loader} repository] == Can I contribute to {product-proxy}? diff --git a/modules/ROOT/pages/index.adoc b/modules/ROOT/pages/index.adoc index c00e946b..4b519302 100644 --- a/modules/ROOT/pages/index.adoc +++ b/modules/ROOT/pages/index.adoc @@ -81,12 +81,12 @@ svg::sideloader:astra-migration-toolkit.svg[role="absolute bottom-1/2 translate- svg:common:ROOT:icons/datastax/migrate.svg[role="mx-auto max-w-xs md:mx-0 lg:max-w-none"] -

-{dsbulk-migrator}
+{dsbulk-loader}
 
-{dsbulk-migrator} is an extension of {dsbulk-loader}.
+{dsbulk-loader} can load, unload, and count large volumes of data from your {cass-short}-based clusters.
 
-    xref:ROOT:dsbulk-migrator.adoc[Get started with {dsbulk-migrator}]
+    xref:dsbulk:overview:dsbulk-about.adoc[Get started with {dsbulk-loader}]
diff --git a/modules/ROOT/pages/introduction.adoc b/modules/ROOT/pages/introduction.adoc
index 753dedd5..aa40b575 100644
--- a/modules/ROOT/pages/introduction.adoc
+++ b/modules/ROOT/pages/introduction.adoc
@@ -20,7 +20,7 @@ For example, you might move from self-managed clusters to a cloud-based Database
 * You want to consolidate client applications running on separate clusters onto one shared cluster to minimize sprawl and maintenance.
 ====
 
-The {product-short} process uses {product-proxy}, {product-utility}, and {product-automation} to orchestrate live reads and writes on your databases while you move and validate data with a data migration tool, such as {sstable-sideloader}, {cass-migrator}, or {dsbulk-migrator}.
+The {product-short} process uses {product-proxy}, {product-utility}, and {product-automation} to orchestrate live reads and writes on your databases while you move and validate data with a data migration tool, such as {sstable-sideloader}, {cass-migrator}, or {dsbulk-loader}.
 {product-proxy} keeps your databases in sync at all times through its dual-writes feature, which means you can seamlessly stop or abandon the migration at any point before the last phase of the migration (the final cutover to the new database).
 
 For more information about these tools, see xref:ROOT:components.adoc[].
diff --git a/modules/ROOT/pages/migrate-and-validate-data.adoc b/modules/ROOT/pages/migrate-and-validate-data.adoc
index 5be69633..5df25ea0 100644
--- a/modules/ROOT/pages/migrate-and-validate-data.adoc
+++ b/modules/ROOT/pages/migrate-and-validate-data.adoc
@@ -1,5 +1,5 @@
 = Phase 2: Migrate and validate data
-:page-aliases: ROOT:sideloader-zdm.adoc
+:page-aliases: ROOT:sideloader-zdm.adoc, ROOT:dsbulk-migrator-overview.adoc, ROOT:dsbulk-migrator.adoc
 
 In xref:ROOT:phase1.adoc[Phase 1], you set up {product-proxy} to orchestrate live traffic to your origin and target clusters.
 
@@ -7,7 +7,7 @@ In Phase 2 of {product}, you migrate data from the origin to the target, and the
 
 image::migration-phase2ra.png[In {product-short} Phase 2, you migrate data from the origin cluster to the target cluster]
 
-To move and validate data, you can use a dedicated data migration tool, such as {sstable-sideloader}, {cass-migrator}, or {dsbulk-migrator}, or your can create your own custom data migration script.
+To move and validate data, you can use a dedicated data migration tool, such as {sstable-sideloader}, {cass-migrator}, or {dsbulk-loader}, or you can create your own custom data migration script.
 
 //Migration tool summaries are also on ROOT:components.adoc.
 
@@ -16,7 +16,7 @@ To move and validate data, you can use a dedicated data migration tool, such as
 
 == {sstable-sideloader}
 
 This tool is exclusively for migrations that move data to {astra-db}.
 
 {sstable-sideloader} is a service running in {astra-db} that imports data from snapshots of your existing {cass-reg}-based cluster.
-Because it imports data directly, {sstable-sideloader} can offer several advantages over CQL-based tools like {dsbulk-migrator} and {cass-migrator}, including faster, more cost-effective data loading, and minimal performance impacts on your origin cluster and target database.
+Because it imports data directly, {sstable-sideloader} can offer several advantages over CQL-based tools like {dsbulk-loader} and {cass-migrator}, including faster, more cost-effective data loading, and minimal performance impacts on your origin cluster and target database.
To migrate data with {sstable-sideloader}, you use `nodetool`, a cloud provider's CLI, and the {astra} {devops-api}: @@ -41,15 +41,23 @@ You can use {cass-migrator-short} alone, with {product-proxy}, or for data valid For more information, see xref:ROOT:cassandra-data-migrator.adoc[]. -== {dsbulk-migrator} +== {dsbulk-loader} -{dsbulk-migrator} extends {dsbulk-loader} with migration-specific commands: `migrate-live`, `generate-script`, and `generate-ddl`. +{dsbulk-loader} is a high-performance data loading and unloading tool for {cass-short}-based databases. +You can use it to load, unload, and count records. -It is best for smaller migrations or migrations that don't require extensive data validation, aside from post-migration row counts. +Because {dsbulk-loader} doesn't have the same data validation capabilities as {cass-migrator-short}, it is best for migrations that don't require extensive data validation, aside from post-migration row counts. -You can use {dsbulk-migrator} alone or with {product-proxy}. +You can use {dsbulk-loader} alone or with {product-proxy}. -For more information, see xref:ROOT:dsbulk-migrator.adoc[]. +For more information, see xref:dsbulk:overview:dsbulk-about.adoc[]. + +[TIP] +==== +The {dsbulk-migrator} tool, which was an extension of {dsbulk-loader}, is deprecated. +This tool is no longer recommended. +Instead, use {dsbulk-loader}'s unload, load, and count commands, or use another data migration tool, such as {cass-migrator-short}. +==== == Other data migration processes diff --git a/modules/ROOT/pages/troubleshooting-tips.adoc b/modules/ROOT/pages/troubleshooting-tips.adoc index 54a0bf7a..afc284d2 100644 --- a/modules/ROOT/pages/troubleshooting-tips.adoc +++ b/modules/ROOT/pages/troubleshooting-tips.adoc @@ -682,7 +682,7 @@ To report an issue or get additional support, submit an issue in the {product-sh * {product-proxy-repo}/issues[{product-proxy} repository] * {product-automation-repo}/issues[{product-automation} repository] (includes {product-automation} and {product-utility}) * {cass-migrator-repo}/issues[{cass-migrator} repository] -* {dsbulk-migrator-repo}/issues[{dsbulk-migrator} repository] +* {dsbulk-loader-repo}/issues[{dsbulk-loader} repository] [IMPORTANT] ==== diff --git a/modules/ROOT/pages/zdm-proxy-migration-paths.adoc b/modules/ROOT/pages/zdm-proxy-migration-paths.adoc index 86a070a8..7eed2624 100644 --- a/modules/ROOT/pages/zdm-proxy-migration-paths.adoc +++ b/modules/ROOT/pages/zdm-proxy-migration-paths.adoc @@ -12,7 +12,7 @@ include::ROOT:partial$migration-scenarios.adoc[] If you don't want to use {product-proxy} or your databases don't meet the zero-downtime requirements, you can still complete the migration, but some downtime might be necessary to finish the migration. -If your origin cluster is incompatible with {product-proxy}, {product-utility}, and {product-automation}, you might be able to use standalone xref:ROOT:components.adoc#data-migration-tools[data migration tools] such as {dsbulk-migrator} or a custom data migration script. +If your origin cluster is incompatible with {product-proxy}, {product-utility}, and {product-automation}, you might be able to use standalone xref:ROOT:components.adoc#data-migration-tools[data migration tools] such as {dsbulk-loader} or a custom data migration script. Make sure you transform or prepare the data to comply with the target cluster's schema. 
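+
+For example, if the target table renames a column, you can reshape rows while unloading them with a custom query.
+This is a minimal sketch; the host, keyspace, table, and column names are hypothetical placeholders:
+
+[source,bash]
+----
+# Unload only the columns the target schema expects, renaming one with a CQL alias.
+dsbulk unload -h ORIGIN_HOST \
+  -query "SELECT id, old_name AS new_name FROM my_keyspace.my_table" \
+  -url /path/to/export
+----
+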
For more complex migrations, such as RDBMS-to-NoSQL migrations, it is likely that your migration will require downtime for additional processing, such as extract, transform, and load (ETL) operations.
diff --git a/modules/sideloader/pages/sideloader-overview.adoc b/modules/sideloader/pages/sideloader-overview.adoc index 9765c11d..5ea1f804 100644 --- a/modules/sideloader/pages/sideloader-overview.adoc +++ b/modules/sideloader/pages/sideloader-overview.adoc @@ -115,7 +115,7 @@ include::sideloader:partial$validate.adoc[]

== Use {sstable-sideloader} with {product-proxy}

-If you need to migrate a live database, you can use {sstable-sideloader} instead of {dsbulk-migrator} or {cass-migrator} during of xref:ROOT:migrate-and-validate-data.adoc[Phase 2 of {product}].
+If you need to migrate a live database, you can use {sstable-sideloader} instead of {dsbulk-loader} or {cass-migrator} during xref:ROOT:migrate-and-validate-data.adoc[Phase 2 of {product}].

.Use {sstable-sideloader} with {product-proxy}
svg::sideloader:astra-migration-toolkit.svg[]

From 2b199f92c6d7e6420753d8889f79e953721ec705 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Mon, 1 Dec 2025 16:20:46 -0800
Subject: [PATCH 2/3] add faq

---
 modules/ROOT/pages/faqs.adoc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/ROOT/pages/faqs.adoc b/modules/ROOT/pages/faqs.adoc index 7de2fe0d..a75fed33 100644 --- a/modules/ROOT/pages/faqs.adoc +++ b/modules/ROOT/pages/faqs.adoc @@ -158,6 +158,12 @@ For more information, see xref:components.adoc#how-zdm-proxy-handles-reads-and-w

In the context of the {product-short} process, the terms _cluster_ and _database_ are used interchangeably to refer to the source and destination for the data that you are moving during your migration.

+== What happened to {dsbulk-migrator}?
+
+The {dsbulk-migrator} tool, which was an extension of {dsbulk-loader}, is deprecated.
+This tool is no longer recommended.
+Instead, use xref:dsbulk:overview:dsbulk-about.adoc[{dsbulk-loader}] or use another xref:ROOT:components.adoc[data migration tool].
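+
+For example, a minimal {dsbulk-loader} workflow might unload from the origin, load into the target, and then compare row counts. This is a hypothetical sketch; the hosts, keyspace, table, and staging path are placeholders:
+
+[source,bash]
+----
+# Hypothetical example only: unload from the origin cluster to a staging directory.
+dsbulk unload -h origin-host -k ks -t tbl -url /path/to/staging
+
+# Load the staged data into the target cluster.
+dsbulk load -h target-host -k ks -t tbl -url /path/to/staging
+
+# Compare row counts on both clusters as a rough post-migration check.
+dsbulk count -h origin-host -k ks -t tbl
+dsbulk count -h target-host -k ks -t tbl
+----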
+ == See also * https://github.com/datastax/zdm-proxy/blob/main/faq.md[{product-proxy} FAQ on GitHub] \ No newline at end of file From 82385cb5ab5018e4aa7cc18857ffe07765276969 Mon Sep 17 00:00:00 2001 From: April M <36110273+aimurphy@users.noreply.github.com> Date: Tue, 2 Dec 2025 08:15:58 -0800 Subject: [PATCH 3/3] sme and peer review --- antora.yml | 2 -- modules/ROOT/pages/components.adoc | 4 +--- modules/ROOT/pages/deployment-infrastructure.adoc | 8 ++++---- modules/ROOT/pages/faqs.adoc | 6 ++---- modules/ROOT/pages/migrate-and-validate-data.adoc | 4 +--- modules/ROOT/partials/dsbulk-migrator-deprecation.adoc | 3 +++ 6 files changed, 11 insertions(+), 16 deletions(-) create mode 100644 modules/ROOT/partials/dsbulk-migrator-deprecation.adoc diff --git a/antora.yml b/antora.yml index 1922b88f..1cedfd24 100644 --- a/antora.yml +++ b/antora.yml @@ -48,8 +48,6 @@ asciidoc: product-automation-repo: 'https://github.com/datastax/zdm-proxy-automation' product-automation-shield: 'image:https://img.shields.io/github/v/release/datastax/zdm-proxy-automation?label=latest[alt="Latest zdm-proxy-automation release on GitHub",link="{product-automation-repo}/releases"]' product-demo: 'ZDM Demo Client' - dsbulk-migrator: 'DSBulk Migrator' - dsbulk-migrator-repo: 'https://github.com/datastax/dsbulk-migrator' dsbulk-loader: 'DSBulk Loader' dsbulk-loader-repo: 'https://github.com/datastax/dsbulk' cass-migrator: 'Cassandra Data Migrator' diff --git a/modules/ROOT/pages/components.adoc b/modules/ROOT/pages/components.adoc index 7c16edb5..7dcb5c25 100644 --- a/modules/ROOT/pages/components.adoc +++ b/modules/ROOT/pages/components.adoc @@ -165,9 +165,7 @@ For more information, see xref:dsbulk:overview:dsbulk-about.adoc[]. [TIP] ==== -The {dsbulk-migrator} tool, which was an extension of {dsbulk-loader}, is deprecated. -This tool is no longer recommended. -Instead, use {dsbulk-loader}'s unload, load, and count commands, or use another data migration tool, such as {cass-migrator-short}. +include::ROOT:partial$dsbulk-migrator-deprecation.adoc[] ==== === Other data migration processes diff --git a/modules/ROOT/pages/deployment-infrastructure.adoc b/modules/ROOT/pages/deployment-infrastructure.adoc index 0c6ee4a8..5661859f 100644 --- a/modules/ROOT/pages/deployment-infrastructure.adoc +++ b/modules/ROOT/pages/deployment-infrastructure.adoc @@ -93,9 +93,7 @@ For example, {cass-migrator} is used for data validation after migrating data wi * Ubuntu Linux 20.04 or 22.04, Red Hat Family Linux 7 or newer * 16 vCPUs * 64 GB RAM -* 200 GB to 2 TB of storage -+ -If you plan to use {dsbulk-loader} to unload and load multiple terabytes of data from the origin cluster to the target cluster, consider allocating additional space for data that needs to be staged between unloading and loading. +* 200 GB to 2 TB of storage or more for larger migrations * Equivalent to AWS **m5.4xlarge**, GCP **e2-standard-16**, or Azure **D16 v5** + @@ -112,7 +110,9 @@ Then, each VM migrates one group of tokens, and you use the fourth VM to migrate [IMPORTANT] ==== -Make sure that your origin and target clusters can handle high traffic from your chosen data migration tool in addition to the live traffic from your application. +Regardless of the number of data migration machines or the amount of data you need to migrate, make sure the machines have enough space to stage data between unloading and loading during the migration. 
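+
+For example, you might check the staging volume before and after unloading. The path below is a placeholder:
+
+[source,bash]
+----
+# Check free space on the staging volume before unloading.
+df -h /path/to/staging
+
+# After unloading, check how much space the staged data uses.
+du -sh /path/to/staging
+----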
+ +Additionally, make sure that your origin and target clusters can handle high traffic from your chosen data migration tool in addition to the live traffic from your application. Test migrations in a lower environment before you proceed with production migrations. diff --git a/modules/ROOT/pages/faqs.adoc b/modules/ROOT/pages/faqs.adoc index a75fed33..1f3507ff 100644 --- a/modules/ROOT/pages/faqs.adoc +++ b/modules/ROOT/pages/faqs.adoc @@ -158,11 +158,9 @@ For more information, see xref:components.adoc#how-zdm-proxy-handles-reads-and-w In the context of the {product-short} process, the terms _cluster_ and _database_ are used interchangeably to refer to the source and destination for the data that you are moving during your migration. -== What happened to {dsbulk-migrator}? +== What happened to DSBulk Migrator? -The {dsbulk-migrator} tool, which was an extension of {dsbulk-loader}, is deprecated. -This tool is no longer recommended. -Instead, use xref:dsbulk:overview:dsbulk-about.adoc[{dsbulk-loader}] or use another xref:ROOT:components.adoc[data migration tool]. +include::ROOT:partial$dsbulk-migrator-deprecation.adoc[] == See also diff --git a/modules/ROOT/pages/migrate-and-validate-data.adoc b/modules/ROOT/pages/migrate-and-validate-data.adoc index 5df25ea0..d56b54ab 100644 --- a/modules/ROOT/pages/migrate-and-validate-data.adoc +++ b/modules/ROOT/pages/migrate-and-validate-data.adoc @@ -54,9 +54,7 @@ For more information, see xref:dsbulk:overview:dsbulk-about.adoc[]. [TIP] ==== -The {dsbulk-migrator} tool, which was an extension of {dsbulk-loader}, is deprecated. -This tool is no longer recommended. -Instead, use {dsbulk-loader}'s unload, load, and count commands, or use another data migration tool, such as {cass-migrator-short}. +include::ROOT:partial$dsbulk-migrator-deprecation.adoc[] ==== == Other data migration processes diff --git a/modules/ROOT/partials/dsbulk-migrator-deprecation.adoc b/modules/ROOT/partials/dsbulk-migrator-deprecation.adoc new file mode 100644 index 00000000..29a6396f --- /dev/null +++ b/modules/ROOT/partials/dsbulk-migrator-deprecation.adoc @@ -0,0 +1,3 @@ +The DSBulk Migrator tool, which was an extension of {dsbulk-loader}, is deprecated. +This tool is no longer recommended. +Instead, use {dsbulk-loader}'s unload, load, and count commands, or use another data migration tool, such as {cass-migrator-short}. \ No newline at end of file