forked from scylladb/scylla-migrator
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconfig.yaml
212 lines (205 loc) · 8.67 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# Example configuration for migrating from Cassandra.
# Keys shown commented out below are disabled; uncomment and edit them as needed.
# NOTE(review): host, port, keyspace and table are commented out in this copy -
# confirm the migrator can run with only `type` set.
source:
  type: cassandra
  # host: cassandra-server-01
  # port: 9042
  # Optional; if not specified, None will be used.
  # localDC: <localdc>
  # credentials:
  #   username: <user>
  #   password: <pass>
  # SSL as per https://github.com/scylladb/spark-cassandra-connector/blob/master/doc/reference.md#cassandra-ssl-connection-options
  # sslOptions:
  #   clientAuthEnabled: false
  #   enabled: false
  #   # All keys below are optional (generally just trustStorePassword and
  #   # trustStorePath are needed).
  #   trustStorePassword: <pass>
  #   trustStorePath: <path>
  #   trustStoreType: JKS
  #   keyStorePassword: <pass>
  #   keyStorePath: <path>
  #   keyStoreType: JKS
  #   enabledAlgorithms:
  #     - TLS_RSA_WITH_AES_128_CBC_SHA
  #     - TLS_RSA_WITH_AES_256_CBC_SHA
  #   protocol: TLS
  # keyspace: stocks
  # table: stocks
  # Preserve TTLs and WRITETIMEs of cells in the source database. Note that this
  # option is *incompatible* with copying tables that contain collections
  # (lists, maps, sets).
  # preserveTimestamps: true
  # Number of splits to use - this should be at minimum the number of cores
  # available in the Spark cluster, and optimally more; higher splits will lead
  # to more fine-grained resumes. Aim for 8 * (Spark cores).
  # splitCount: 256
  # Number of connections to use to Cassandra when copying
  # connections: 8
  # Number of rows to fetch in each read
  # fetchSize: 1000
  # Optional condition to filter source table data that will be migrated
  # where: race_start_date = '2015-05-27' AND race_end_date = '2015-05-27'
# Example for loading from Parquet:
# source:
# type: parquet
# path: s3a://bucket-name/path/to/parquet-directory
# # Optional AWS access/secret key for loading from S3.
# # This section can be left out if running on EC2 instances that have instance profiles with the
# # appropriate permissions. Assuming roles is not supported currently.
# credentials:
# accessKey:
# secretKey:
# Example for loading from DynamoDB:
# source:
# type: dynamodb
# table: <table name>
# # Optional - load from a custom endpoint:
# endpoint:
# # Specify the hostname without a protocol
# host: <host>
# port: <port>
#
# # Optional - specify the region:
# # region: <region>
#
# # Optional - static credentials:
# credentials:
# accessKey: <user>
# secretKey: <pass>
#
# # below controls split factor
# scanSegments: 1
#
# # throttling settings, set based on your capacity (or wanted capacity)
# readThroughput: 1
#
# # The value of dynamodb.throughput.read.percent can be between 0.1 and 1.5, inclusively.
# # 0.5 represents the default read rate, meaning that the job will attempt to consume half of the read capacity of the table.
# # If you increase the value above 0.5, spark will increase the request rate; decreasing the value below 0.5 decreases the read request rate.
# # (The actual read rate will vary, depending on factors such as whether there is a uniform key distribution in the DynamoDB table.)
# throughputReadPercent: 1.0
#
# # how many tasks per executor?
# maxMapTasks: 1
#
# # When transferring DynamoDB sources to DynamoDB targets (such as other DynamoDB tables or Alternator tables),
# # the migrator supports transferring live changes occurring on the source table after transferring an initial
# # snapshot. This is done using DynamoDB streams and incurs additional charges due to the Kinesis streams created.
# # Enable this flag to transfer live changes after transferring an initial snapshot. The migrator will continue
# # replicating changes endlessly; it must be stopped manually.
# #
# # NOTE: For the migration to be performed losslessly, the initial snapshot transfer must complete within 24 hours.
# # Otherwise, some captured changes may be lost due to the retention period of the table's stream.
# #
# # NOTE2: The migrator does not currently delete the created Dynamo stream. Delete it manually after ending the
# # migrator run.
# streamChanges: false
# Configuration for the database you're copying into.
# Keys shown commented out below are disabled; uncomment and edit them as needed.
target:
  type: astra
  # NOTE: The destination table must have the same schema as the source table.
  # If you'd like to rename columns, that's ok - see the renames parameter below.
  # keyspace: stocks
  # table: stocks
  # host: scylla
  # port: 9042
  # Optional; if not specified, None will be used.
  # localDC: <localdc>
  # credentials:
  #   username: <user>
  #   password: <pass>
  # SSL as per https://github.com/scylladb/spark-cassandra-connector/blob/master/doc/reference.md#cassandra-ssl-connection-options
  # sslOptions:
  #   clientAuthEnabled: false
  #   enabled: false
  #   # All keys below are optional (generally just trustStorePassword and
  #   # trustStorePath are needed).
  #   trustStorePassword: <pass>
  #   trustStorePath: <path>
  #   trustStoreType: JKS
  #   keyStorePassword: <pass>
  #   keyStorePath: <path>
  #   keyStoreType: JKS
  #   enabledAlgorithms:
  #     - TLS_RSA_WITH_AES_128_CBC_SHA
  #     - TLS_RSA_WITH_AES_256_CBC_SHA
  #   protocol: TLS
  # Number of connections to use to Scylla when copying
  # connections: 16
  # Spark pads decimals with zeros appropriate to their scale. This causes values
  # like '3.5' to be copied as '3.5000000000...' to the target. There's no good way
  # currently to preserve the original value, so this flag can strip trailing zeros
  # on decimal values before they are written.
  stripTrailingZerosForDecimals: false
  # If timestamps cannot be preserved (preserveTimestamps == false), the writer
  # can enforce a single TTL and/or write timestamp for ALL written records.
  # Such a write timestamp can, for example, be set to a time BEFORE dual writes
  # were started, which keeps the migration from overwriting dual-written data,
  # even for collections.
  # ALL rows written will get the same TTL or write timestamp or both
  # (you can uncomment just one of them, or all or none).
  # TTL in seconds (sample 7776000 is 90 days)
  # writeTTLInS: 7776000
  # Write time in microseconds since the Unix epoch
  # (sample 1640998861000000 is Saturday, January 1, 2022 2:01:01 AM GMT+01:00)
  # writeWritetimestampInuS: 1640998861000000
# Example for loading into a DynamoDB target (for example, Scylla's Alternator):
# target:
# type: dynamodb
# table: <table name>
# # Optional - write to a custom endpoint:
# endpoint:
# # If writing to Scylla Alternator, prefix the hostname with 'http://'.
# host: <host>
# port: <port>
#
# # Optional - specify the region:
# # region: <region>
#
# # Optional - static credentials:
# credentials:
# accessKey: <user>
# secretKey: <pass>
#
# # Split factor for reading/writing. This is required for Scylla targets.
# scanSegments: 1
#
# # throttling settings, set based on your capacity (or wanted capacity)
# readThroughput: 1
#
# # The value of dynamodb.throughput.read.percent can be between 0.1 and 1.5, inclusively.
# # 0.5 represents the default read rate, meaning that the job will attempt to consume half of the read capacity of the table.
# # If you increase the value above 0.5, spark will increase the request rate; decreasing the value below 0.5 decreases the read request rate.
# # (The actual read rate will vary, depending on factors such as whether there is a uniform key distribution in the DynamoDB table.)
# throughputReadPercent: 1.0
#
# # how many tasks per executor?
# maxMapTasks: 1
# While it runs, the migrator saves savepoints - configuration files like this
# one - so that token ranges which have already been copied can be skipped.
# This section only applies when copying from Cassandra/Scylla.
savepoints:
  # Directory in which savepoint configurations are stored. This is a path on
  # the host running the Spark driver - usually the Spark master.
  path: /data/savepoints/
  # How often a savepoint is created
  intervalSeconds: 300
# Column renames. To rename columns, replace the empty list with one entry per
# column, like so:
#   - from: source_column_name
#     to: dest_column_name
renames: []
# Token ranges to skip. This normally stays empty; the migrator fills it in
# when it creates a savepoint file.
skipTokenRanges: []
# Settings for the validator. The validator is run manually (see README) and
# currently only supports comparing a Cassandra source to a Scylla target.
validation:
  # Whether WRITETIMEs and TTLs are compared
  compareTimestamps: true
  # Allowed difference between TTLs
  ttlToleranceMillis: 60000
  # Allowed difference between WRITETIMEs
  writetimeToleranceMillis: 1000
  # Number of differences to fetch and print
  failuresToFetch: 100
  # Allowed difference between floating point numbers
  floatingPointTolerance: 0.001
  # Allowed difference, in ms, between timestamps
  timestampMsTolerance: 0