diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 09f68d5..299df72 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,6 +33,7 @@ jobs: env: OS: ${{ matrix.os }} PYTHON: ${{ matrix.python-version }} + TC_KEEPALIVE: true name: Python ${{ matrix.python-version }} on OS ${{ matrix.os }} steps: @@ -56,7 +57,7 @@ jobs: pip install "setuptools>=64" --upgrade # Install package in editable mode. - pip install --use-pep517 --prefer-binary --editable=.[test,develop] + pip install --use-pep517 --prefer-binary --editable=.[all,test,develop] - name: Run linter and software tests run: | diff --git a/CHANGES.md b/CHANGES.md index c1bf04b..10de89b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ # Changelog ## in progress +- Carabas: A subsystem to divert workloads to other people’s computers ## 2024-07-10 v0.0.2 - Initial working version, supporting MQTT, AMQP, and SQLAlchemy/CrateDB diff --git a/doc/backlog.rst b/doc/backlog.rst index f0a5856..88af8f8 100644 --- a/doc/backlog.rst +++ b/doc/backlog.rst @@ -40,6 +40,8 @@ Iteration 2 - [o] Examples: Add ``appsink`` example - [o] Improve inline docs - [o] Release 0.1.0 +- [o] CSV: https://github.com/alan-turing-institute/CleverCSV +- [o] Excel & ODF: https://github.com/dimastbk/python-calamine *********** diff --git a/doc/carabas/backlog.md b/doc/carabas/backlog.md new file mode 100644 index 0000000..e7c455d --- /dev/null +++ b/doc/carabas/backlog.md @@ -0,0 +1,21 @@ +# Carabas Backlog + +## Iteration +1 +- [x] Improve type mapping +- [x] Generalize CDC event -> SQL translator +- [ ] Only optionally display debug output of Docker build process, + [ ] when using `--verbose`. +- [ ] Bring back "Zip" use, for interactive hacking +- [ ] Distill into a Lambda variant +- [ ] Automation! + - [ ] DDL: CREATE TABLE (data OBJECT(DYNAMIC)); + - [ ] Wrap KCL launcher into manager component + +## Iteration +2 +- [ ] Performance improvements (simdjson?) +- [ ] Use SQLAlchemy for generating and submitting SQL statement +- [ ] Improve efficiency by using bulk operations when applicable +- [ ] is in UPDATE_ROLLBACK_COMPLETE_CLEANUP_IN_PROGRESS state and can not be updated +- [ ] is in ROLLBACK_COMPLETE state and can not be updated. +- [ ] Cannot create a publicly accessible DBInstance. The specified VPC has no + internet gateway attached.Update the VPC and then try again diff --git a/doc/carabas/dms/handbook.md b/doc/carabas/dms/handbook.md new file mode 100644 index 0000000..42208f7 --- /dev/null +++ b/doc/carabas/dms/handbook.md @@ -0,0 +1,79 @@ +(aws-dms-handbook)= +# AWS DMS Handbook + +A few useful AWSCLI commands to check the status of the DMS engine and +relevant pipeline elements. You can also use the AWS Web Console to +inspect and commandeer the same details. + + +## Status Checks +Display ARNs of all replication instances. +```shell +aws dms describe-replication-instances | jq -r '.ReplicationInstances[].ReplicationInstanceArn' +``` +Display replication endpoints and relevant connection settings. +```shell +aws dms describe-endpoints +``` +Invoke connection test on given DMS endpoint. +```shell +aws dms test-connection \ + --replication-instance-arn arn:aws:dms:eu-central-1:831394476016:rep:JD2LL6OM35BJZNKZIRSOE2FXIY \ + --endpoint-arn arn:aws:dms:eu-central-1:831394476016:endpoint:3IVDGL6E4RDNBF2LFBYF6DYV3Y +``` +Display connection test results. +```shell +aws dms describe-connections +``` + + +## Operations +Enumerate all configured replication tasks with compact output. 
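The `jq` filter below projects each task onto the fields that matter for monitoring, such as migration type, status, stop reason, and failure messages.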
+```shell +aws dms describe-replication-tasks | \ + jq '.ReplicationTasks[] | {ReplicationTaskIdentifier, ReplicationTaskArn, MigrationType, StartReplicationType, Status, StopReason, FailureMessages, ProvisionData}' +``` +Start replication task with given ARN. +```shell +aws dms start-replication-task \ + --start-replication-task-type start-replication --replication-task-arn \ + arn:aws:dms:eu-central-1:831394476016:task:7QBLNBTPCNDEBG7CHI3WA73YFA +``` +Stop replication task with given ARN. +```shell +aws dms stop-replication-task --replication-task-arn \ + arn:aws:dms:eu-central-1:831394476016:task:7QBLNBTPCNDEBG7CHI3WA73YFA +``` + + +## Logging +To see detailed progress about the replication process, use CloudWatch to +inspect corresponding log output. + +Enumerate all log groups. +```shell +aws logs describe-log-groups +``` + +Get log output history. +```shell +aws logs get-log-events \ + --log-group-name dms-tasks-testdrive-dms-instance \ + --log-stream-name dms-task-7QBLNBTPCNDEBG7CHI3WA73YFA | jq .events[].message +``` + +Start watching the log output using the `start-live-tail` CloudWatch operation. +```shell +aws logs start-live-tail --log-group-identifiers \ + arn:aws:logs:eu-central-1:831394476016:log-group:/aws/rds/instance/testdrive-dms-postgresql-dev-db/postgresql \ + arn:aws:logs:eu-central-1:831394476016:log-group:dms-tasks-testdrive-dms-instance +``` + + +## CloudFormation +When the CloudFormation deployment is stuck, or if you want to start from scratch, +those commands are useful. +```shell +aws cloudformation continue-update-rollback --stack-name testdrive-dms-postgresql-dev +aws cloudformation delete-stack --stack-name testdrive-dms-postgresql-dev +``` diff --git a/doc/carabas/dms/index.md b/doc/carabas/dms/index.md new file mode 100644 index 0000000..7420bbe --- /dev/null +++ b/doc/carabas/dms/index.md @@ -0,0 +1,135 @@ +(aws-dms)= +# Pipelines with AWS DMS + +_AWS DMS to Kinesis to CrateDB._ + +## What's Inside +- [Working with AWS DMS tasks] +- [Using Amazon Kinesis Data Streams as a target for AWS Database Migration Service] +- An IaC driver program based on [AWS CloudFormation] technologies using the + [cottonformation] Python API. It can be used to set up infrastructure on AWS + without much ado. +- DMS: Full load and CDC +- DMS Source: RDS PostgreSQL +- DMS Target: Amazon Kinesis +- CDC Target: CrateDB Cloud + + +## AWS Infrastructure Setup +The following walkthrough describes a full deployment of AWS DMS including +relevant outbound data processors for demonstration purposes. + +In order to run it in production, you are welcome to derive from it and tweak +it for your own purposes. YMMV. If you need support, don't hesitate to ask for +help. + +### Install +Install LorryStream. +```shell +pip install --upgrade 'lorrystream[carabas]' +``` +Acquire IaC driver program. +```shell +wget https://github.com/daq-tools/lorrystream/raw/main/examples/aws/rds_postgresql_kinesis_lambda_oci_cratedb.py +``` + +### Configure +Please configure endpoint and replication settings within the source code +of the IaC program you just acquired, and presented next. + +### Deploy +First, prepare an AWS ECR repository for publishing the OCI image including your +downstream processor element that is consuming the replication data stream from +Amazon Kinesis, and runs it into CrateDB. To learn about how this works, please +visit the documentation section about the [](project:#ecr-repository). + +Configure CrateDB database sink address. 
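The sink address is an SQLAlchemy connection URL using the `crate://` dialect. Replace the credentials and host shown in the example below with the values of your own CrateDB Cloud cluster.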
+```shell +export SINK_SQLALCHEMY_URL='crate://admin:dZ..qB@example.eks1.eu-west-1.aws.cratedb.net:4200/?ssl=true' +``` + +Invoke the IaC driver program in order to deploy relevant resources on AWS. +```shell +python examples/aws/rds_postgresql_kinesis_lambda_oci_cratedb.py +``` + +After deployment succeeded, you will be presented a corresponding response including +relevant information about entrypoints to the software stack you've just created. +```text +Result of CloudFormation deployment: +psql command: psql "postgresql://dynapipe:secret11@testdrive-dms-postgresql-dev-db.czylftvqn1ed.eu-central-1.rds.amazonaws.com:5432/postgres" +RDS Instance ARN: arn:aws:rds:eu-central-1:831394476016:db:testdrive-dms-postgresql-dev-db +Stream ARN: arn:aws:kinesis:eu-central-1:831394476016:stream/testdrive-dms-postgresql-dev-stream +Replication ARN: arn:aws:dms:eu-central-1:831394476016:replication-config:EAM3JEHXGBGZBPN5PLON7NPDEE +``` + +:::{note} +Please note this is a demonstration stack, deviating from typical real-world situations. + +- Contrary to this stack, which includes an RDS PostgreSQL instance, a database instance + will already be up and running, so the remaining task is to just configure the Kinesis + Data Stream and consume it. + +- Contrary to this stack, which uses AWS Lambda to host the downstream processor element, + when aiming for better cost-effectiveness, you will run corresponding code on a dedicated + computing environment. +::: + + +## Operations +Please consult the [](project:#aws-dms-handbook) to learn about commands +suitable for operating the AWS DMS engine. + +:::{toctree} +:hidden: + +handbook +::: + + + +## Usage + +### DMS +AWS DMS provides `full-load` and `full-load-and-cdc` migration types. +For a `full-load-and-cdc` task, AWS DMS migrates table data, and then applies +data changes that occur on the source, automatically establishing continuous +replication. + +When starting a replication task using [StartReplicationTask], you can use those +possible values for `--start-replication-task-type`, see also [start-replication-task]: + +:start-replication: + The only valid value for the first run of the task when the migration type is + `full-load` or `full-load-and-cdc` + +:resume-processing: + Not applicable for any full-load task, because you can't resume partially loaded + tables during the full load phase. Use it to replicate the changes from the last + stop position. + +:reload-target: + For a `full-load-and-cdc` task, load all the tables again, and start capturing + source changes. + + +## Migration by DMS Source +This section enumerates specific information to consider when aiming to use DMS +for your database as a source element. 
+ +:::{toctree} +:maxdepth: 2 + +postgresql +mysql +::: + + + +[AWS CloudFormation]: https://en.wikipedia.org/wiki/AWS_CloudFormation +[cottonformation]: https://pypi.org/project/cottonformation/ +[StartReplicationTask]: https://docs.aws.amazon.com/dms/latest/APIReference/API_StartReplicationTask.html +[start-replication-task]: https://docs.aws.amazon.com/cli/latest/reference/dms/start-replication-task.html +[Using Amazon Kinesis Data Streams as a target for AWS Database Migration Service]: https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Target.Kinesis.html +[Using object mapping to migrate data to a Kinesis data stream]: https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Target.Kinesis.html#CHAP_Target.Kinesis.ObjectMapping +[Working with AWS DMS tasks]: https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Tasks.html diff --git a/doc/carabas/dms/mysql.md b/doc/carabas/dms/mysql.md new file mode 100644 index 0000000..24f0a80 --- /dev/null +++ b/doc/carabas/dms/mysql.md @@ -0,0 +1,4 @@ +(aws-dms-mysql)= +# AWS DMS with MySQL/MariaDB source + +WIP. diff --git a/doc/carabas/dms/postgresql.md b/doc/carabas/dms/postgresql.md new file mode 100644 index 0000000..f804be9 --- /dev/null +++ b/doc/carabas/dms/postgresql.md @@ -0,0 +1,57 @@ +(aws-dms-postgresql)= +# AWS DMS with PostgreSQL source + +## What's Inside +- [Using a PostgreSQL database as an AWS DMS source] + +### Prerequisites +First of all, activate the `pglocical` extension on your RDS PostgreSQL instance. +```sql +CREATE EXTENSION pglogical; +SELECT * FROM pg_catalog.pg_extension WHERE extname='pglogical'; +``` + +```sql +SHOW shared_preload_libraries; +SELECT name, setting FROM pg_settings WHERE name in ('rds.logical_replication','shared_preload_libraries'); +``` + + +### Data in Source +After that, connect to RDS PostgreSQL, and provision a little bunch of data. +```sql +DROP TABLE IF EXISTS foo CASCADE; +CREATE TABLE foo (id INT PRIMARY KEY, name TEXT, age INT, attributes JSONB); +INSERT INTO foo (id, name, age, attributes) VALUES (42, 'John', 30, '{"foo": "bar"}'); +INSERT INTO foo (id, name, age, attributes) VALUES (43, 'Jane', 31, '{"baz": "qux"}'); +``` + +### Data in Target +```sql +cr> SELECT * FROM public.foo; +``` +```postgresql ++---------------------------------------------------------------------+ +| data | ++---------------------------------------------------------------------+ +| {"age": 30, "attributes": {"foo": "bar"}, "id": 42, "name": "John"} | +| {"age": 31, "attributes": {"baz": "qux"}, "id": 43, "name": "Jane"} | ++---------------------------------------------------------------------+ +``` + + + +```sql +UPDATE foo SET age=32 WHERE name='Jane'; +UPDATE foo SET age=33 WHERE id=43; +UPDATE foo SET age=33 WHERE attributes->>'foo'='bar'; +UPDATE foo SET attributes = jsonb_set(attributes, '{last_name}', '"Doe"', true) WHERE name='John'; +``` +```sql +DELETE FROM foo WHERE name='Jane'; +DELETE FROM foo WHERE name='John'; +``` + + + +[Using a PostgreSQL database as an AWS DMS source]: https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Source.PostgreSQL.html diff --git a/doc/carabas/index.md b/doc/carabas/index.md new file mode 100644 index 0000000..65d929b --- /dev/null +++ b/doc/carabas/index.md @@ -0,0 +1,53 @@ +# Carabas + +A subsystem to divert workloads to other people's computers. +Workloads can be whole pipelines or elements of pipelines. +Provides blended computing environments on your fingertips. 
+ +## Etymology +- [Marquis von Carabas] +- [Die Meisterkatze oder der gestiefelte Kater] +- [Le Maître chat ou le Chat botté] +- [Puss in Boots] + +## What's Inside + +### Kinesis KCL v2 +:::{toctree} +:maxdepth: 2 +:glob: +kcl/kinesis +::: + +### DynamoDB -> Kinesis KCL v2 +:::{toctree} +:maxdepth: 2 +:glob: +kcl/dynamodb* +::: + +### DMS -> Kinesis +:::{toctree} +:maxdepth: 2 +dms/index +::: + +### Kinesis -> Lambda +:::{toctree} +:maxdepth: 2 +lambda/index +::: + + +## Development +:::{toctree} +:maxdepth: 2 +backlog +research +::: + + +[Die Meisterkatze oder der gestiefelte Kater]: https://de.frwiki.wiki/wiki/Le_Ma%C3%AEtre_chat_ou_le_Chat_bott%C3%A9 +[Le Maître chat ou le Chat botté]: https://fr.wikipedia.org/wiki/Le_Ma%C3%AEtre_chat_ou_le_Chat_bott%C3%A9 +[Marquis von Carabas]: https://de.frwiki.wiki/wiki/Marquis_de_Carabas +[Puss in Boots]: https://en.wikipedia.org/wiki/Puss_in_Boots diff --git a/doc/carabas/kcl/dynamodb-standalone.md b/doc/carabas/kcl/dynamodb-standalone.md new file mode 100644 index 0000000..2694d48 --- /dev/null +++ b/doc/carabas/kcl/dynamodb-standalone.md @@ -0,0 +1,71 @@ +# DynamoDB CDC to CrateDB using DynamoDB Streams Kinesis Adapter + + +## Introduction +> DynamoDB Streams captures a time-ordered sequence of item-level modification +> in any DynamoDB table and stores this information in a log for up to 24 hours. +> +> Applications can access this log and view the data items as they appeared +> before and after they were modified, in near-real time. +> +> -- [Change data capture for DynamoDB Streams] + + +## About +A [change data capture (CDC)] pipeline made of a DynamoDB +egress CDC processor, sinking data into the CrateDB +OLAP database, using the [DynamoDB Streams Kinesis Adapter] +([GitHub][DynamoDB Streams Kinesis Adapter for Java]). + +> Using the Amazon Kinesis Adapter is the recommended way to +> consume streams from Amazon DynamoDB. +> +> -- [Using the DynamoDB Streams Kinesis adapter to process stream records] + + +## What's Inside + +- On a compute-environment of your choice, supporting Python, a traditional + KCL v2 application using the client-side DynamoDB Streams Kinesis Adapter, + subscribes to a DynamoDB Change Stream, which is pretending to be a Kinesis + Stream, in order to receive published CDC opslog messages. + +- On the egress side, the application re-materializes the items of the + operations log into any database with [SQLAlchemy] support. + + +## Holzweg! + +``` +# HACK + +# Kinesis backend. +multi_lang_daemon_class = "software.amazon.kinesis.multilang.MultiLangDaemon" + +# DynamoDB backend. +# https://github.com/awslabs/dynamodb-streams-kinesis-adapter/issues/46#issuecomment-1260222792 +multi_lang_daemon_class = "com.amazonaws.services.dynamodbv2.streamsadapter.StreamsMultiLangDaemon" +``` +- https://github.com/awslabs/dynamodb-streams-kinesis-adapter/issues/46 + +Q: It looks like the "DynamoDB Streams Kinesis Adapter" project is dead? + +- https://github.com/awslabs/dynamodb-streams-kinesis-adapter/issues/40 +- https://github.com/awslabs/dynamodb-streams-kinesis-adapter/issues/42 + +There would be an option to try this by downgrading to KCL v1. We are not +sure if it is worth to try it, though. + +A: Upgrade to KCLv2 will probably happen at some time in the future. 
+ +- https://github.com/awslabs/dynamodb-streams-kinesis-adapter/issues/22 + + +[change data capture (CDC)]: https://en.wikipedia.org/wiki/Change_data_capture +[Change data capture for DynamoDB Streams]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html +[DynamoDB]: https://aws.amazon.com/dynamodb/ +[DynamoDB Streams Kinesis Adapter]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.KCLAdapter.html +[DynamoDB Streams Kinesis Adapter for Java]: https://github.com/awslabs/dynamodb-streams-kinesis-adapter +[Kinesis]: https://aws.amazon.com/kinesis/ +[SQLAlchemy]: https://www.sqlalchemy.org/ +[Using the DynamoDB Streams Kinesis adapter to process stream records]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.KCLAdapter.html diff --git a/doc/carabas/kcl/dynamodb.md b/doc/carabas/kcl/dynamodb.md new file mode 100644 index 0000000..589bc16 --- /dev/null +++ b/doc/carabas/kcl/dynamodb.md @@ -0,0 +1,244 @@ +# DynamoDB CDC to CrateDB using Kinesis + + +## Introduction +> DynamoDB Streams captures a time-ordered sequence of item-level modification +> in any DynamoDB table and stores this information in a log for up to 24 hours. +> +> Applications can access this log and view the data items as they appeared +> before and after they were modified, in near-real time. +> +> -- [Change data capture for DynamoDB Streams] + + +## About +A [change data capture (CDC)] pipeline made of a DynamoDB +egress CDC processor, sinking data into the CrateDB +OLAP database, using Kinesis. + +> Kinesis Data Streams captures item-level modifications in any DynamoDB +> table and replicates them to a Kinesis data stream. +> +> -- [Using Kinesis Data Streams to capture changes to DynamoDB] + + +## What's Inside + +- Completely on AWS' premises, there is a process which relays CDC data + from a [DynamoDB] table to a [Kinesis] stream, configured using AWS' + APIs. + +- On a compute-environment of your choice, supporting Python, a traditional + KCL v2 application subscribes to the [Kinesis] stream, in order to receive + published CDC opslog messages. + +- On the egress side, the application re-materializes the items of the + operations log into any database with [SQLAlchemy] support. + + +## Setup +Create a database table in DynamoDB, and enable a Kinesis Stream on its +operations log. + +This section reflects configuration settings stored in +[dynamodb_cdc_processor.properties](../../../lorrystream/spike/kcl_dynamodb/dynamodb_cdc_processor.properties). + +We recommend to run through the setup procedure of [](kinesis.md) +beforehand, because it conveys relevant setup instructions about IAM +policies, which are obligatory to permit Kinesis access to DynamoDB for +storing a "lease table". + +### DynamoDB Table +```shell +# Optionally, drop the table. +aws dynamodb delete-table \ + --table-name table-testdrive + +# Create table (DDL). +# - It defines a composite primary key. +# - "device" is the partition key +# - "timestamp" is the sort key +# - It does not define auxiliary field names, +# they can be added dynamically. +aws dynamodb create-table \ + --table-name table-testdrive \ + --key-schema \ + AttributeName=device,KeyType=HASH \ + AttributeName=timestamp,KeyType=RANGE \ + --attribute-definitions \ + AttributeName=device,AttributeType=S \ + AttributeName=timestamp,AttributeType=S \ + --provisioned-throughput \ + ReadCapacityUnits=1,WriteCapacityUnits=1 \ + --table-class STANDARD + +# Display all table names on DynamoDB. 
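# The `table-testdrive` table created above should appear in this list.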
+aws dynamodb list-tables + +# Check table status. +aws dynamodb describe-table --table-name table-testdrive | grep TableStatus +``` + +### CrateDB Table +The destination table name in CrateDB is currently hard-coded. Please use +this command to create the `transactions` table, where the CDC record +processor will re-materialize CDC events into. +```shell +crash -c "CREATE TABLE transactions (data OBJECT(DYNAMIC));" +``` + +### Kinesis Stream +Capture DynamoDB table operations and relay them to a Kinesis stream. +```shell +# Create a Kinesis Data Stream. +aws kinesis create-stream --stream-name dynamodb-cdc --shard-count 4 + +# Check that the Kinesis stream is active. +aws kinesis describe-stream --stream-name dynamodb-cdc + +# Enable Kinesis streaming on the DynamoDB table. +# Replace the `stream-arn` value with the one returned by +# `describe-stream` in the previous step. +STREAM_ARN=$(aws kinesis describe-stream --stream-name dynamodb-cdc | jq -r .StreamDescription.StreamARN) +aws dynamodb enable-kinesis-streaming-destination \ + --table-name table-testdrive \ + --stream-arn "${STREAM_ARN}" \ + --enable-kinesis-streaming-configuration ApproximateCreationDateTimePrecision=MICROSECOND + +# Check if Kinesis streaming is active on the table. +aws dynamodb describe-kinesis-streaming-destination --table-name table-testdrive +``` + +Note that you need to re-run the linking procedure after dropping and +re-creating the DynamoDB table. + +```shell +aws kinesis list-streams +aws kinesis delete-stream --stream-name dynamodb-cdc --enforce-consumer-deletion +``` + +### KCL Stream Processor + +Acquire sources and initialize sandbox. +```shell +git clone https://github.com/daq-tools/lorrystream +cd lorrystream +python3 -m venv .venv +source .venv/bin/activate +pip install --editable='.[carabas]' +``` + +Install dependencies, mainly the [amazon-kclpy] package. +```shell +cd lorrystream/spike/kcl_dynamodb +pip install wheel +pip install --verbose -r requirements.txt +``` + + +## Usage +You will need multiple terminal windows. Within both of them, activate the +virtualenv on the top-level directory. Then, navigate to the playground +directory, and seed AWS credentials. +```shell +source .venv/bin/activate +cd lorrystream/spike/kcl_dynamodb +export AWS_ACCESS_KEY=... +export AWS_SECRET_ACCESS_KEY=... +``` + +Launch the stream processor, subscribing to the DynamoDB CDC operations feed +over a Kinesis stream. +```shell +sh launch.sh dynamodb_cdc_processor.properties +``` + +Watch actions of the CDC processor. +```shell +tail -F dynamodb_cdc_processor.log +``` + +Insert record into database table. +```shell +READING_SQL="{'timestamp': '2024-07-12T01:17:42', 'device': 'foo', 'temperature': 42.42, 'humidity': 84.84}" +aws dynamodb execute-statement --statement \ + "INSERT INTO \"table-testdrive\" VALUE ${READING_SQL};" +``` + +Query database table. +```shell +aws dynamodb execute-statement --statement \ + "SELECT * FROM \"table-testdrive\";" +``` + +Run UPDATE and DELETE statements, in order to sample the two other DML operations. +```shell +aws dynamodb execute-statement --statement \ + "UPDATE \"table-testdrive\" SET temperature=55.55 WHERE \"device\"='foo' AND \"timestamp\"='2024-07-12T01:17:42';" +``` +```shell +aws dynamodb execute-statement --statement \ + "DELETE FROM \"table-testdrive\" WHERE \"device\"='foo' AND \"timestamp\"='2024-07-12T01:17:42';" +``` + +Alternative for displaying table contents. 
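A plain `Scan` operation reads every item in the table, which is acceptable for a small test table like this one.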
+```shell +aws dynamodb scan --table-name table-testdrive +``` + +## Software Tests +```shell +pytest +``` + +## Appendix + +### DynamoDB data types + +The following is a complete list of DynamoDB data type descriptors: + + S – String + N – Number + B – Binary + BOOL – Boolean + NULL – Null + M – Map + L – List + SS – String Set + NS – Number Set + BS – Binary Set + +### Opslog processor samples +``` +01:25:17.632 [dynamodb_cdc_processor] INFO process_record - {"awsRegion":"us-east-1","eventID":"b015b5f0-c095-4b50-8ad0-4279aa3d88c6","eventName":"INSERT","userIdentity":null,"recordFormat":"application/json","tableName":"table-testdrive","dynamodb":{"ApproximateCreationDateTime":1720740233012995,"Keys":{"device":{"S":"qux"},"timestamp":{"S":"2024-07-12T01:17:42"}},"NewImage":{"humidity":{"N":"84.84"},"temperature":{"N":"42.42"},"device":{"S":"qux"},"timestamp":{"S":"2024-07-12T01:17:42"}},"SizeBytes":99,"ApproximateCreationDateTimePrecision":"MICROSECOND"},"eventSource":"aws:dynamodb"} +01:58:22.371 [dynamodb_cdc_processor] INFO process_record - {"awsRegion":"us-east-1","eventID":"24757579-ebfd-480a-956d-a1287d2ef707","eventName":"MODIFY","userIdentity":null,"recordFormat":"application/json","tableName":"table-testdrive","dynamodb":{"ApproximateCreationDateTime":1720742302233719,"Keys":{"device":{"S":"foo"},"timestamp":{"S":"2024-07-12T01:17:42"}},"NewImage":{"humidity":{"N":"84.84"},"temperature":{"N":"55.66"},"device":{"S":"foo"},"timestamp":{"S":"2024-07-12T01:17:42"}},"OldImage":{"humidity":{"N":"84.84"},"temperature":{"N":"42.42"},"device":{"S":"foo"},"timestamp":{"S":"2024-07-12T01:17:42"}},"SizeBytes":161,"ApproximateCreationDateTimePrecision":"MICROSECOND"},"eventSource":"aws:dynamodb"} +01:58:42.510 [dynamodb_cdc_processor] INFO process_record - {"awsRegion":"us-east-1","eventID":"ff4e68ab-0820-4a0c-80b2-38753e8e00e5","eventName":"REMOVE","userIdentity":null,"recordFormat":"application/json","tableName":"table-testdrive","dynamodb":{"ApproximateCreationDateTime":1720742321848352,"Keys":{"device":{"S":"foo"},"timestamp":{"S":"2024-07-12T01:17:42"}},"OldImage":{"humidity":{"N":"84.84"},"temperature":{"N":"55.66"},"device":{"S":"foo"},"timestamp":{"S":"2024-07-12T01:17:42"}},"SizeBytes":99,"ApproximateCreationDateTimePrecision":"MICROSECOND"},"eventSource":"aws:dynamodb"} +``` + + +## Documentation +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/kds_gettingstarted.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/getting-started-step-1.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/getting-started-step-2.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/example_dynamodb_Scenario_GettingStartedMovies_section.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/streamsmain.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.CoreComponents.html#HowItWorks.CoreComponents.PrimaryKey +- https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_CreateTable.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ql-reference.update.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.CoreComponents.html#HowItWorks.CoreComponents.TablesItemsAttributes +- 
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/streamsmain.html + +## Resources +- https://aws.amazon.com/blogs/database/choose-the-right-change-data-capture-strategy-for-your-amazon-dynamodb-applications/ +- https://www.singlestore.com/blog/cdc-data-from-dynamodb-to-singlestore-using-dynamodb-streams/ +- https://medium.com/event-driven-utopia/aws-dynamodb-streams-change-data-capture-for-dynamodb-tables-d4c92f9639d3 + + +[change data capture (CDC)]: https://en.wikipedia.org/wiki/Change_data_capture +[Change data capture for DynamoDB Streams]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html +[DynamoDB]: https://aws.amazon.com/dynamodb/ +[Kinesis]: https://aws.amazon.com/kinesis/ +[SQLAlchemy]: https://www.sqlalchemy.org/ +[Using Kinesis Data Streams to capture changes to DynamoDB]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/kds.html diff --git a/doc/carabas/kcl/kinesis.md b/doc/carabas/kcl/kinesis.md new file mode 100644 index 0000000..1a03a0f --- /dev/null +++ b/doc/carabas/kcl/kinesis.md @@ -0,0 +1,107 @@ +# Kinesis Streams with KCLv2 + +## About +A stream processor component using the [Kinesis Client Library (KCL)]. +It is written in Python, and uses the [amazon-kclpy] Python SDK for KCL +([GitHub][amazon-kclpy-github]). + +## What's Inside +- Publishing and subscribing to [Kinesis] streams, using Python. + +## Setup +Create a Kinesis stream, and set up a Python sandbox for connecting +to it using KCL v2. + +This section reflects configuration settings stored in +[record_processor.properties](../../../lorrystream/spike/kcl_kinesis/record_processor.properties). + +### AWS +Configure a Kinesis Stream, and an IAM policy, because Kinesis needs to create +and maintain a "[leases table]" stored in DynamoDB, so it requires corresponding +permissions to do so. + +- Create a [Kinesis] stream called `testdrive-stream`, per [Kinesis Console]. +- [Create an IAM Policy and User], applying the permissions outlined on this page. + Two example ARN IDs, that address relevant resources in Kinesis and DynamoDB, are: + ```text + arn:aws:kinesis:us-east-1:841394475918:stream/testdrive-stream + arn:aws:dynamodb:us-east-1:841394475918:table/stream-demo + ``` +- The leases table in DynamoDB will be automatically created when the first + stream consumer (the KCL application) becomes active. + +### KCL Stream Processor + +Acquire sources and initialize sandbox. +```shell +git clone https://github.com/daq-tools/lorrystream +cd lorrystream +python3 -m venv .venv +source .venv/bin/activate +pip install --editable='.[carabas]' +``` + +Install dependencies, mainly the [amazon-kclpy] package. +```shell +cd lorrystream/spike/kcl_kinesis +pip install wheel +pip install --verbose -r requirements.txt +``` +Note that the first installation of the [amazon-kclpy] package on your machine +will take a while, because it will download a bunch of JAR files, defined by a +traditional [pom.xml] recipe, before embedding them into the Python package. + +On subsequent installations, as long as you don't switch versions, that package +will install from your local package cache, so it will be much faster. + +Alternative: Use ready-made wheel package. Note to self: Need to provide this to +the colleagues. +```shell +pip install ./dist/amazon_kclpy-2.1.5-py3-none-any.whl +``` + +## Usage +You will need multiple terminal windows. Within both of them, activate the +virtualenv on the top-level directory. Then, navigate to the playground +directory, and seed AWS credentials. 
+```shell +source .venv/bin/activate +cd lorrystream/spike/kcl_kinesis +export AWS_ACCESS_KEY=... +export AWS_SECRET_ACCESS_KEY=... +``` + +Launch the stream processor, subscribing to the stream. +```shell +$(sh launch.sh record_processor.properties) +``` + +Watch actions of the record processor. +```shell +tail -F record_processor.log +``` + +Publish a demo message to the stream. +```shell +python publish.py +``` + +## Documentation +- https://docs.aws.amazon.com/streams/latest/dev/building-consumers.html + +## Resources +- https://dev.solita.fi/2020/05/28/kinesis-streams-part-1.html +- https://dev.solita.fi/2020/12/21/kinesis-streams-part-2.html +- https://github.com/aws-samples/amazon-kinesis-data-processor-aws-fargate + + +[amazon-kclpy]: https://pypi.org/project/amazon-kclpy +[amazon-kclpy-github]: https://github.com/awslabs/amazon-kinesis-client-python +[Create an IAM Policy and User]: https://docs.aws.amazon.com/streams/latest/dev/tutorial-stock-data-kplkcl2-iam.html +[DynamoDB]: https://aws.amazon.com/dynamodb/ +[DynamoDB Console]: https://console.aws.amazon.com/dynamodbv2/ +[Kinesis]: https://aws.amazon.com/kinesis/ +[Kinesis Console]: https://console.aws.amazon.com/kinesis/ +[Kinesis Client Library (KCL)]: https://docs.aws.amazon.com/streams/latest/dev/shared-throughput-kcl-consumers.html +[leases table]: https://aws.amazon.com/blogs/big-data/processing-amazon-dynamodb-streams-using-the-amazon-kinesis-client-library/ +[pom.xml]: https://github.com/awslabs/amazon-kinesis-client-python/blob/v2.1.5/pom.xml diff --git a/doc/carabas/lambda/index.md b/doc/carabas/lambda/index.md new file mode 100644 index 0000000..36d9ce0 --- /dev/null +++ b/doc/carabas/lambda/index.md @@ -0,0 +1,222 @@ +# Pipelines with AWS Lambda + + +## What's inside +- A convenient [Infrastructure as code (IaC)] procedure to define data pipelines on [AWS]. +- Written in Python, using [AWS CloudFormation] stack deployments. To learn + what's behind, see also [How CloudFormation works]. +- Code for running on [AWS Lambda] is packaged into [OCI] images, for efficient + delta transfers, built-in versioning, and testing purposes. + + +## Details +- This specific document has a few general guidelines, and a + a few specifics coming from `examples/aws/dynamodb_kinesis_lambda_oci_cratedb.py`. +- That program defines a pipeline which looks like this: + + DynamoDB CDC -> Kinesis Stream -> Python Lambda via OCI -> CrateDB + + +## OCI image +In order to package code for AWS Lambda functions packages into OCI images, +and use them, you will need to publish them to the AWS ECR container image +registry. + +You will need to authenticate your local Docker environment, and create a +container image repository once for each project using a different runtime +image. + +### Authenticate +Define your AWS ID, region label, and repository name, to be able to use +the templated commands 1:1. +```shell +aws_id=831394476016 +aws_region=eu-central-1 +repository_name=kinesis-cratedb-lambda +``` +```shell +aws ecr get-login-password --region=${aws_region} | \ + docker login --username AWS --password-stdin ${aws_id}.dkr.ecr.${aws_region}.amazonaws.com +``` + +(ecr-repository)= +### ECR Repository +Just once, before proceeding, create an image repository hosting the runtime +code for your Lambda function. 
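The command below reuses the `aws_region` and `repository_name` variables defined in the authentication step above.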
+```shell +aws ecr create-repository --region=${aws_region} \ + --repository-name=${repository_name} --image-tag-mutability=MUTABLE +``` +In order to allow others to pull that image, you will need to define a +[repository policy] using the [set-repository-policy] subcommend of the AWS CLI. +In order to invoke that command, put the [](project:#ecr-repository-policy) +JSON definition into a file called `policy.json`. +```shell +aws ecr set-repository-policy --repository-name=${repository_name} --policy-text file://policy.json +``` + +### Troubleshooting +If you receive such an error message, your session has expired, and you need +to re-run the authentication step. +```text +denied: Your authorization token has expired. Reauthenticate and try again. +``` + +This error message indicates your ECR repository does not exist. The solution +is to create it, using the command shared above. +```text +name unknown: The repository with name 'kinesis-cratedb-lambda' does +not exist in the registry with id '831394476016' +``` + +Get information about Lambda function. +```shell +aws lambda get-function \ + --function-name arn:aws:lambda:eu-central-1:831394476016:function:moll-stack-dynamodb-dev-lambda-processor +``` + + + +## CrateDB Table +The destination table name in CrateDB, where the CDC record +processor will re-materialize CDC events into. +```shell +pip install crash +crash -c "CREATE TABLE transactions (data OBJECT(DYNAMIC));" +``` + + +## Install +In order to exercise the example outlined below, you need to install +LorryStream. +```shell +pip install lorrystream +``` + + +## Usage +For exercising an AWS pipeline, you need two components: The IaC description, +and a record processor implementation for the AWS Lambda. For example, choose +those two variants: + +- IaC driver: [dynamodb_kinesis_lambda_oci_cratedb.py] +- Record processor: [kinesis_cratedb_lambda.py] + +Putting them next to each other into a directory, and adjusting +`LambdaPythonImage(entrypoint_file=...)` should be enough to get you started. +Sure enough, you will also need to configure the `CRATEDB_SQLALCHEMY_URL` +environment variable properly. + +Then, just invoke the IaC program to spin up the defined infrastructure on AWS. + + +## Operations +There are a few utility commands that help you operate the stack, that have not +been absorbed yet. See also [Monitoring and troubleshooting Lambda functions]. + +### Utilities +Check status of Lambda function. +```shell +aws lambda get-function \ + --function-name arn:aws:lambda:eu-central-1:831394476016:function:testdrive-dynamodb-dev-lambda-processor +``` +Check status of stream mapping(s). +```shell +aws lambda list-event-source-mappings +``` +Check logs. +```shell +aws logs describe-log-groups +aws logs start-live-tail --log-group-identifiers arn:aws:logs:eu-central-1:831394476016:log-group:/aws/lambda/DynamoDBCrateDBProcessor +``` + +### Test Flight I +Invoke the Lambda function for testing purposes. +```shell +aws lambda invoke \ + --function-name DynamoDBCrateDBProcessor \ + --payload file://records.json outputfile.txt +``` +Pick `records.json` from [](project:#kinesis-example-event), it is a basic +example of an AWS Kinesis event message. + +:::{note} +On AWS CLI v2, you may need that additional command line option. +```shell +--cli-binary-format raw-in-base64-out +``` +::: + +### Test Flight II +Trigger a real event by running two DML operations on the source database table. 
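Each statement produces a change record on the Kinesis stream, which in turn invokes the Lambda processor and materializes the change into CrateDB.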
+```shell +READING_SQL="{'timestamp': '2024-07-12T01:17:42', 'device': 'foo', 'temperature': 42.42, 'humidity': 84.84}" + +aws dynamodb execute-statement --statement \ + "INSERT INTO \"table-testdrive\" VALUE ${READING_SQL};" + +aws dynamodb execute-statement --statement \ + "UPDATE \"table-testdrive\" SET temperature=43.59 WHERE \"device\"='foo' AND \"timestamp\"='2024-07-12T01:17:42';" +``` + + +## Appendix + +(ecr-repository-policy)= +### ECR Repository Policy +```json +{ + "Version": "2008-10-17", + "Statement": [ + { + "Sid": "allow public pull", + "Effect": "Allow", + "Principal": "*", + "Action": [ + "ecr:BatchCheckLayerAvailability", + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer" + ] + } + ] +} +``` + +(kinesis-example-event)= +### Kinesis Example Event +```json +{ + "Records": [ + { + "kinesis": { + "kinesisSchemaVersion": "1.0", + "partitionKey": "1", + "sequenceNumber": "49590338271490256608559692538361571095921575989136588898", + "data": "SGVsbG8sIHRoaXMgaXMgYSB0ZXN0Lg==", + "approximateArrivalTimestamp": 1545084650.987 + }, + "eventSource": "aws:kinesis", + "eventVersion": "1.0", + "eventID": "shardId-000000000006:49590338271490256608559692538361571095921575989136588898", + "eventName": "aws:kinesis:record", + "invokeIdentityArn": "arn:aws:iam::111122223333:role/lambda-kinesis-role", + "awsRegion": "us-east-2", + "eventSourceARN": "arn:aws:kinesis:us-east-2:111122223333:stream/lambda-stream" + } + ] +} +``` + + +[AWS]: https://en.wikipedia.org/wiki/Amazon_Web_Services +[AWS CloudFormation]: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/Welcome.html +[AWS Lambda]: https://en.wikipedia.org/wiki/AWS_Lambda +[dynamodb_kinesis_lambda_oci_cratedb.py]: https://github.com/daq-tools/lorrystream/blob/main/examples/aws/dynamodb_kinesis_lambda_oci_cratedb.py +[example program]: https://github.com/daq-tools/lorrystream/tree/main/examples/aws +[How CloudFormation works]: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cloudformation-overview.html +[Infrastructure as code (IaC)]: https://en.wikipedia.org/wiki/Infrastructure_as_code +[kinesis_cratedb_lambda.py]: https://github.com/daq-tools/lorrystream/blob/main/lorrystream/process/kinesis_cratedb_lambda.py +[Monitoring and troubleshooting Lambda functions]: https://docs.aws.amazon.com/lambda/latest/dg/lambda-monitoring.html +[OCI]: https://en.wikipedia.org/wiki/Open_Container_Initiative +[repository policy]: https://docs.aws.amazon.com/lambda/latest/dg/images-create.html#gettingstarted-images-permissions +[set-repository-policy]: https://docs.aws.amazon.com/cli/latest/reference/ecr/set-repository-policy.html diff --git a/doc/carabas/research.md b/doc/carabas/research.md new file mode 100644 index 0000000..3625a38 --- /dev/null +++ b/doc/carabas/research.md @@ -0,0 +1,48 @@ +# Carabas Research + +- https://pypi.org/project/core-cdc +- https://github.com/sshd123/pypgoutput +- https://pypi.org/project/pypg-cdc/ +- https://github.com/hcevikGA/dynamo-wrapper +- https://pypi.org/project/dynamo-pandas/ +- https://aws.amazon.com/de/blogs/opensource/announcing-partiql-one-query-language-for-all-your-data/ +- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ql-reference.html +- https://partiql.org/dql/overview.html +- https://github.com/MacHu-GWU/aws_lambda_layer-project +- https://github.com/MacHu-GWU/cottonformation-project +- https://docs.aws.amazon.com/lambda/latest/dg/python-package.html +- https://docs.aws.amazon.com/lambda/latest/dg/python-image.html +- 
https://docs.aws.amazon.com/lambda/latest/dg/lambda-runtimes.html +- https://docs.aws.amazon.com/lambda/latest/dg/file-processing-app.html +- https://www.tinybird.co/docs/guides/migrate-from-rockset#migrate-from-rockset +- https://www.tinybird.co/docs/guides/ingesting-data/ingest-from-dynamodb + +## AWS Lambda +- https://docs.aws.amazon.com/lambda/latest/dg/lambda-runtimes.html +- https://docs.aws.amazon.com/lambda/latest/dg/services-ddb-params.html +- https://docs.aws.amazon.com/lambda/latest/dg/best-practices.html +- https://docs.aws.amazon.com/lambda/latest/api/API_CreateEventSourceMapping.html +- https://aws.amazon.com/blogs/architecture/best-practices-for-developing-on-aws-lambda/ +- https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-lambda-eventsourcemapping.html +- https://docs.aws.amazon.com/lambda/latest/dg/with-kinesis.html + +## RDS +- https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Overview.DBInstance.html +- https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/rds-lambda-tutorial.html +- https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/creating-resources-with-cloudformation.html +- https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-rds-dbinstance.html + +## DMS +- https://stackoverflow.com/questions/77995867/dynamic-tables-via-dms-kinesis-iceberg-transactional-data-lake +- https://aws.amazon.com/blogs/database/tune-replication-performance-with-aws-dms-for-an-amazon-kinesis-data-streams-target-endpoint-part-3/ +- https://www.cockroachlabs.com/docs/stable/aws-dms + +## wal2json +- https://hevodata.com/learn/pg-logical/ +- https://aws.amazon.com/blogs/database/stream-changes-from-amazon-rds-for-postgresql-using-amazon-kinesis-data-streams-and-aws-lambda/ +- https://github.com/eulerto/wal2json +- https://docs.aws.amazon.com/AmazonRDS/latest/PostgreSQLReleaseNotes/postgresql-extensions.html#postgresql-extensions-15x + +## CDC +- https://debezium.io/documentation/reference/stable/postgres-plugins.html +- https://github.com/debezium/postgres-decoderbufs diff --git a/doc/conf.py b/doc/conf.py index fe0429b..24ed80f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -53,12 +53,12 @@ "light_css_variables": { "color-brand-primary": "#CC3333", "color-brand-content": "darkblue", - "color-admonition-background": "orange", + # "color-admonition-background": "orange", }, "dark_css_variables": { "color-brand-primary": "#CC3333", "color-brand-content": "gold", - "color-admonition-background": "orange", + # "color-admonition-background": "orange", }, } diff --git a/doc/index.md b/doc/index.md index 253b0b9..167a008 100644 --- a/doc/index.md +++ b/doc/index.md @@ -29,8 +29,10 @@ :hidden: source/amqp +source/kinesis source/mqtt sink/database +carabas/index ``` ```{toctree} diff --git a/doc/source/kinesis.md b/doc/source/kinesis.md new file mode 100644 index 0000000..75d997b --- /dev/null +++ b/doc/source/kinesis.md @@ -0,0 +1,80 @@ +# Kinesis Source + +This recipe uses the LocalStack AWS environment to run an Amazon Kinesis surrogate. +The walkthrough follows the [Get started with Kinesis on LocalStack] tutorial. + +If you intend to invoke the commands on a real AWS environment, just use `aws` +instead of `awslocal`. + +:::{tip} +LocalStack is a cloud service emulator that runs in a single container on your +laptop or in your CI environment. With LocalStack, you can run your AWS +applications or Lambdas entirely on your local machine without connecting to +a remote cloud provider. 
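The emulated services listen on port 4566, which is why the Python examples below set `AWS_ENDPOINT_URL=http://localhost:4566`.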
+::: + +## Setup +Start the LocalStack service using Docker. +```shell +docker run \ + --rm -it \ + -p 127.0.0.1:4566:4566 \ + -p 127.0.0.1:4510-4559:4510-4559 \ + -v /var/run/docker.sock:/var/run/docker.sock \ + localstack/localstack:3.6 +``` + +Install LorryStream including LocalStack CLI programs. +```shell +pip install --upgrade 'lorrystream[carabas]' +``` + +## Configure +Create a Kinesis Data Stream called `testdrive`. +```shell +awslocal kinesis create-stream \ + --stream-name testdrive \ + --shard-count 1 +``` +Check the status of your streams. +```shell +awslocal kinesis list-streams +``` +```shell +awslocal kinesis describe-stream \ + --stream-name testdrive +``` +Display Stream ARN. +```shell +awslocal kinesis describe-stream --stream-name testdrive | jq -r .StreamDescription.StreamARN +``` + +## Usage +Submit an item to the data stream, using `awslocal`. +```shell +awslocal kinesis put-record \ + --stream-name testdrive \ + --partition-key 1 \ + --data '{"device": "foo", "temperature": 42.42, "humidity": 84.84}' +``` + +Submit an item to the data stream, using Python. +```shell +export AWS_ENDPOINT_URL="http://localhost:4566" +python examples/aws/kinesis_publish.py testdrive +``` + +Consume data stream, printing received payloads to STDOUT. +This is suitable for debugging purposes. +```shell +export AWS_ENDPOINT_URL="http://localhost:4566" +python examples/aws/kinesis_subscribe.py testdrive +``` + +:::{todo} +Demonstrate how to add a processor pipeline element using both either +AWS Lambda, or a dedicated processor instance. +::: + + +[Get started with Kinesis on LocalStack]: https://docs.localstack.cloud/user-guide/aws/kinesis/ diff --git a/examples/aws/dynamodb_kinesis_lambda_oci_cratedb.py b/examples/aws/dynamodb_kinesis_lambda_oci_cratedb.py new file mode 100644 index 0000000..81489f2 --- /dev/null +++ b/examples/aws/dynamodb_kinesis_lambda_oci_cratedb.py @@ -0,0 +1,67 @@ +import logging +from pathlib import Path + +from lorrystream.carabas.aws import DynamoDBKinesisPipe, LambdaFactory, LambdaPythonImage +from lorrystream.util.common import setup_logging + +logger = logging.getLogger(__name__) + + +def main(): + """ + A recipe to deploy a data relay stack to Amazon AWS. + + Pipeline: + - DynamoDB CDC -> Kinesis Stream -> Python Lambda via OCI -> CrateDB + + Ingredients: + - DynamoDB CDC to Kinesis + - Lambda function, shipped per OCI image + - CrateDB Cloud + + Prerequisites: Register an OCI repository. + """ + + # Build and publish OCI image that includes the AWS Lambda function. + python_image = LambdaPythonImage( + name="kinesis-cratedb-lambda", + entrypoint_file=Path("./lorrystream/process/kinesis_cratedb_lambda.py"), + entrypoint_handler="kinesis_cratedb_lambda.handler", + ) + python_image.publish() + + # Define an AWS CloudFormation software stack. + stack = DynamoDBKinesisPipe( + project="testdrive-dynamodb", + stage="dev", + region="eu-central-1", + description="DynamoDB CDC -> Kinesis Stream -> Python Lambda via OCI -> CrateDB", + table_name="table-testdrive", + stream_name="dynamodb-cdc", + environment={ + "SINK_SQLALCHEMY_URL": "crate://admin:dZ..qB@example.eks1.eu-west-1.aws.cratedb.net:4200/?ssl=true", + "SINK_TABLE": "transactions", + }, + ) + + # Add components to the stack. + stack.table().processor( + LambdaFactory( + name="DynamoDBCrateDBProcessor", + oci_uri=python_image.uri, + handler=python_image.entrypoint_handler, + ) + ).connect() + + # Deploy stack. + stack.deploy() + logger.info(f"Deployed stack: {stack}") + + # Refresh the OCI image. 
+ # TODO: Detect when changed. + stack.deploy_processor_image() + + +if __name__ == "__main__": + setup_logging() + main() diff --git a/examples/aws/kinesis_publish.py b/examples/aws/kinesis_publish.py new file mode 100644 index 0000000..6760b0f --- /dev/null +++ b/examples/aws/kinesis_publish.py @@ -0,0 +1,39 @@ +""" +Synopsis, using LocalStack: + + export AWS_ENDPOINT_URL="http://localhost:4566" + python lorrystream/spike/kinesis/publish.py testdrive +""" + +import asyncio +import os +import sys + +from kinesis import Producer + +if "AWS_ACCESS_KEY" in os.environ: + os.environ["AWS_ACCESS_KEY_ID"] = os.environ["AWS_ACCESS_KEY"] +ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL") +try: + STREAM_NAME = sys.argv[1] +except IndexError: + print("ERROR: Please supply stream name as positional argument", file=sys.stderr) # noqa: T201 + sys.exit(2) + +reading = {"device": "foo", "temperature": 42.42, "humidity": 84.84} + + +async def main(): + + # Put item onto queue to be flushed via `put_records()`. + async with Producer( + endpoint_url=ENDPOINT_URL, + stream_name=STREAM_NAME, + # region_name="eu-central-1", + buffer_time=0.01, + ) as producer: + await producer.put(reading) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/aws/kinesis_subscribe.py b/examples/aws/kinesis_subscribe.py new file mode 100644 index 0000000..4627a89 --- /dev/null +++ b/examples/aws/kinesis_subscribe.py @@ -0,0 +1,48 @@ +""" +Synopsis, using LocalStack: + + export AWS_ENDPOINT_URL="http://localhost:4566" + python lorrystream/spike/kinesis/subscribe.py testdrive +""" + +import asyncio +import os +import sys +from pprint import pprint + +from kinesis import Consumer, StringProcessor + +if "AWS_ACCESS_KEY" in os.environ: + os.environ["AWS_ACCESS_KEY_ID"] = os.environ["AWS_ACCESS_KEY"] +ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL") +try: + STREAM_NAME = sys.argv[1] +except IndexError: + print("ERROR: Please supply stream name as positional argument", file=sys.stderr) # noqa: T201 + sys.exit(2) + + +async def main(): + """ + iterator_type: + + LATEST - Read only new records. + TRIM_HORIZON - Process all available records. + AT_TIMESTAMP - Specify a time from which to start reading records. + """ + async with Consumer( + endpoint_url=ENDPOINT_URL, + stream_name=STREAM_NAME, + # region_name="eu-central-1", + # TODO: Make configurable. + iterator_type="TRIM_HORIZON", + sleep_time_no_records=0.2, + processor=StringProcessor(), + ) as consumer: + while True: + async for item in consumer: + pprint(item) # noqa: T203 + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/aws/rds_postgresql_kinesis_lambda_oci_cratedb.py b/examples/aws/rds_postgresql_kinesis_lambda_oci_cratedb.py new file mode 100644 index 0000000..0c2d620 --- /dev/null +++ b/examples/aws/rds_postgresql_kinesis_lambda_oci_cratedb.py @@ -0,0 +1,145 @@ +import logging +import os +from pathlib import Path + +from commons_codec.model import ColumnType, ColumnTypeMapStore, TableAddress + +from lorrystream.carabas.aws import LambdaFactory, LambdaPythonImage, RDSPostgreSQLDMSKinesisPipe +from lorrystream.util.common import setup_logging + +logger = logging.getLogger(__name__) + + +def main(): + """ + A recipe to deploy a data migration stack to Amazon AWS. + + Pipeline: + - RDS PostgreSQL -> DMS -> Kinesis Stream -> Python Lambda via OCI -> CrateDB + + Ingredients: + - DMS, RDS PostgreSQL, Kinesis + - Lambda function, shipped per OCI image + - CrateDB Cloud + + Prerequisites: Register an OCI repository. 
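    Synopsis (the connection string is a placeholder; use your own CrateDB cluster):

        export SINK_SQLALCHEMY_URL='crate://<user>:<password>@<host>:4200/?ssl=true'
        python examples/aws/rds_postgresql_kinesis_lambda_oci_cratedb.py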
+ + Resources: + - https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_VPC.WorkingWithRDSInstanceinaVPC.html + """ + + # Build and publish OCI image that includes the AWS Lambda function. + python_image = LambdaPythonImage( + name="kinesis-cratedb-lambda", + entrypoint_file=Path("./lorrystream/process/kinesis_cratedb_lambda.py"), + entrypoint_handler="kinesis_cratedb_lambda.handler", + ) + python_image.publish() + + # Define an AWS CloudFormation software stack. + stack = RDSPostgreSQLDMSKinesisPipe( + project="testdrive-dms-postgresql", + stage="dev", + region="eu-central-1", + description="RDS PostgreSQL > DMS -> Kinesis Stream -> Python Lambda via OCI -> CrateDB", + db_username="dynapipe", + db_password="secret11", # noqa: S106 + ) + + # Exclusively deploy the VPC elements of the stack. + # Do that on the first invocation, but nothing else. + # Warning: When doing it subsequently, it will currently delete the whole RDS substack. + # Warning: When doing it and directly proceed to RDS creation, it will fail: + # The specified VPC has no internet gateway attached. Update the VPC and then try again. + # TODO: Introduce a little CLI controller for invoking different deployment steps conveniently. + # TODO: Refactor by splitting into different stacks. + # stack.vpc().deploy(); return # noqa: ERA001 + + # Deploy the full RDS+DMS demo stack. + stack.vpc().database().stream().dms() # .deploy(); return + + # Define mapping rules for replication. + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Target.Kinesis.html + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Tasks.CustomizingTasks.TableMapping.html + # TODO: Currently hard-coded to table `public.foo`. + map_to_kinesis = { + "rules": [ + { + "rule-type": "selection", + "rule-id": "1", + "rule-name": "DefaultInclude", + "rule-action": "include", + "object-locator": {"schema-name": "public", "table-name": "foo"}, + "filters": [], + }, + # Using the percent wildcard ("%") in "table-settings" rules is + # not supported for source databases as shown following. + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Tasks.CustomizingTasks.TableMapping.SelectionTransformation.Tablesettings.html#CHAP_Tasks.CustomizingTasks.TableMapping.SelectionTransformation.Tablesettings.Wildcards + # Here: Exact schema and table required when using object mapping rule with '3.5' engine. + { + "rule-type": "object-mapping", + "rule-id": "2", + "rule-name": "DefaultMapToKinesis", + "rule-action": "map-record-to-record", + "object-locator": {"schema-name": "public", "table-name": "foo"}, + "filters": [], + }, + ] + } + + # Define column type mapping for CrateDB processor. + column_types = ColumnTypeMapStore().add( + table=TableAddress(schema="public", table="foo"), + column="attributes", + type_=ColumnType.MAP, + ) + + # Add a DMS replication pipeline element to the stack. + stack.replication(dms_table_mapping=map_to_kinesis) + + # Add custom processing components to the stack. + stack.processor( + factory=LambdaFactory( + name="DMSCrateDBProcessor", + oci_uri=python_image.uri, + handler=python_image.entrypoint_handler, + ), + environment={ + "MESSAGE_FORMAT": "dms", + "COLUMN_TYPES": column_types.to_json(), + "SINK_SQLALCHEMY_URL": os.environ.get("SINK_SQLALCHEMY_URL", "crate://"), + }, + ).connect( + batch_size=2_500, + # - LATEST - Read only new records. + # - TRIM_HORIZON - Process all available records. + # - AT_TIMESTAMP - Specify a time from which to start reading records. 
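        # TRIM_HORIZON replays the full retained stream history on the first run; switch to LATEST to process new records only.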
+ starting_position="TRIM_HORIZON", + # starting_position_timestamp=1722986869.0, # noqa: ERA001 + ) + + # Deploy stack. + stack.deploy() + logger.info(f"Deployed stack: {stack}") + + # Refresh the OCI image. + # TODO: Detect when changed. + stack.deploy_processor_image() + + database_host = stack.get_output_value(stack._bsm, "DatabaseHost") + database_port = stack.get_output_value(stack._bsm, "DatabasePort") + psql_command = ( + f'psql "postgresql://{stack.db_username}:{stack.db_password}@{database_host}:{database_port}/postgres"' + ) + + print("Result of CloudFormation deployment:") + print("psql command:", psql_command) + + print("RDS Instance ARN:", stack.get_output_value(stack._bsm, "RDSInstanceArn")) + print("Stream ARN:", stack.get_output_value(stack._bsm, "StreamArn")) + print("Replication ARN:", stack.get_output_value(stack._bsm, "ReplicationTaskArn")) + + +if __name__ == "__main__": + setup_logging() + main() diff --git a/lorrystream/__init__.py b/lorrystream/__init__.py index c45275e..2b8f2ae 100644 --- a/lorrystream/__init__.py +++ b/lorrystream/__init__.py @@ -1 +1,10 @@ -from .cmd import parse_launch # noqa: F401 +from importlib.metadata import version + +from .cmd import parse_launch + +__appname__ = "lorrystream" +__version__ = version(__appname__) + +__all__ = [ + "parse_launch", +] diff --git a/lorrystream/carabas/__init__.py b/lorrystream/carabas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lorrystream/carabas/aws/__init__.py b/lorrystream/carabas/aws/__init__.py new file mode 100644 index 0000000..7eb061e --- /dev/null +++ b/lorrystream/carabas/aws/__init__.py @@ -0,0 +1,11 @@ +from lorrystream.carabas.aws.function.model import LambdaFactory +from lorrystream.carabas.aws.function.oci import LambdaPythonImage +from lorrystream.carabas.aws.stack.dms import RDSPostgreSQLDMSKinesisPipe +from lorrystream.carabas.aws.stack.dynamodb import DynamoDBKinesisPipe + +__all__ = [ + "DynamoDBKinesisPipe", + "LambdaFactory", + "LambdaPythonImage", + "RDSPostgreSQLDMSKinesisPipe", +] diff --git a/lorrystream/carabas/aws/cf/__init__.py b/lorrystream/carabas/aws/cf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lorrystream/carabas/aws/cf/dms_next.py b/lorrystream/carabas/aws/cf/dms_next.py new file mode 100644 index 0000000..a9ef518 --- /dev/null +++ b/lorrystream/carabas/aws/cf/dms_next.py @@ -0,0 +1,268 @@ +import typing + +import attr +from cottonformation.core.constant import AttrMeta +from cottonformation.core.model import GetAtt, Property, Resource, Tag, TypeCheck, TypeHint +from cottonformation.res.dms import Endpoint as EndpointVanilla +from cottonformation.res.dms import PropEndpointKinesisSettings, ReplicationSubnetGroup, ReplicationInstance, ReplicationTask + + +@attr.s +class Endpoint(EndpointVanilla): + p_Port: TypeHint.intrinsic_int = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(TypeCheck.intrinsic_int_type)), + metadata={ + AttrMeta.PROPERTY_NAME: "Port", + AttrMeta.DATA: { + "Required": False, + "PrimitiveType": 'Integer', + "UpdateType": 'Mutable', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-endpoint.html#cfn-dms-endpoint-port""" + + +@attr.s +class PropReplicationConfigComputeConfig(Property): + """ + AWS Object Type = "AWS::DMS::ReplicationConfig.ComputeConfig" + + Resource Document: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html + + Property Document: 
+ + - ``rp_MaxCapacityUnits``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-maxcapacityunits + - ``p_AvailabilityZone``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-availabilityzone + - ``p_DnsNameServers``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-dnsnameservers + - ``p_KmsKeyId``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-kmskeyid + - ``p_MinCapacityUnits``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-mincapacityunits + - ``p_MultiAZ``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-multiaz + - ``p_PreferredMaintenanceWindow``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-preferredmaintenancewindow + - ``p_ReplicationSubnetGroupId``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-replicationsubnetgroupid + - ``p_VpcSecurityGroupIds``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-vpcsecuritygroupids + """ + AWS_OBJECT_TYPE = "AWS::DMS::ReplicationConfig.ComputeConfig" + + rp_MaxCapacityUnits: int = attr.ib( + default=None, + validator=attr.validators.instance_of(int), + metadata={AttrMeta.PROPERTY_NAME: "MaxCapacityUnits"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-maxcapacityunits""" + p_AvailabilityZone: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(TypeCheck.intrinsic_str_type)), + metadata={AttrMeta.PROPERTY_NAME: "AvailabilityZone"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-availabilityzone""" + p_DnsNameServers: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(TypeCheck.intrinsic_str_type)), + metadata={AttrMeta.PROPERTY_NAME: "DnsNameServers"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-dnsnameservers""" + p_KmsKeyId: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(TypeCheck.intrinsic_str_type)), + metadata={AttrMeta.PROPERTY_NAME: "KmsKeyId"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-kmskeyid""" + p_MinCapacityUnits: int = attr.ib( + default=None, + 
validator=attr.validators.optional(attr.validators.instance_of(int)), + metadata={AttrMeta.PROPERTY_NAME: "MinCapacityUnits"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-mincapacityunits""" + p_MultiAZ: bool = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(bool)), + metadata={AttrMeta.PROPERTY_NAME: "MultiAZ"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-multiaz""" + p_PreferredMaintenanceWindow: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(TypeCheck.intrinsic_str_type)), + metadata={AttrMeta.PROPERTY_NAME: "PreferredMaintenanceWindow"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-preferredmaintenancewindow""" + p_ReplicationSubnetGroupId: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(TypeCheck.intrinsic_str_type)), + metadata={AttrMeta.PROPERTY_NAME: "ReplicationSubnetGroupId"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-replicationsubnetgroupid""" + p_VpcSecurityGroupIds: typing.List[TypeHint.intrinsic_str] = attr.ib( + default=None, + validator=attr.validators.optional( + attr.validators.deep_iterable(member_validator=attr.validators.instance_of(TypeCheck.intrinsic_str_type), + iterable_validator=attr.validators.instance_of(list))), + metadata={AttrMeta.PROPERTY_NAME: "VpcSecurityGroupIds"}, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-dms-replicationconfig-computeconfig.html#cfn-dms-replicationconfig-computeconfig-vpcsecuritygroupids""" + + +@attr.s +class ReplicationConfig(Resource): + """ + AWS Object Type = "AWS::DMS::ReplicationConfig" + + Resource Document: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html + + Property Document: + + - ``rp_ComputeConfig``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-computeconfig + - ``rp_ReplicationConfigIdentifier``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-replicationconfigidentifier + - ``rp_ReplicationType``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-replicationtype + - ``rp_SourceEndpointArn``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-sourceendpointarn + - ``rp_TableMappings``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-tablemappings + - ``rp_TargetEndpointArn``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-targetendpointarn + - ``p_ReplicationSettings``: 
http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-replicationsettings + - ``p_ResourceIdentifier``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-resourceidentifier + - ``p_SupplementalSettings``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-supplementalsettings + - ``p_Tags``: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-tags + """ + AWS_OBJECT_TYPE = "AWS::DMS::ReplicationConfig" + + rp_ComputeConfig: typing.Union['PropReplicationConfigComputeConfig', dict] = attr.ib( + default=None, + converter=PropReplicationConfigComputeConfig.from_dict, + validator=attr.validators.instance_of(PropReplicationConfigComputeConfig), + metadata={ + AttrMeta.PROPERTY_NAME: "ComputeConfig", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": True, + "Type": 'ComputeConfig', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-computeconfig""" + rp_ReplicationConfigIdentifier: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.instance_of(TypeCheck.intrinsic_str_type), + metadata={ + AttrMeta.PROPERTY_NAME: "ReplicationConfigIdentifier", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": True, + "PrimitiveType": 'String', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-replicationconfigidentifier""" + rp_ReplicationType: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.instance_of(TypeCheck.intrinsic_str_type), + metadata={ + AttrMeta.PROPERTY_NAME: "ReplicationType", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": True, + "PrimitiveType": 'String', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-replicationtype""" + rp_SourceEndpointArn: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.instance_of(TypeCheck.intrinsic_str_type), + metadata={ + AttrMeta.PROPERTY_NAME: "SourceEndpointArn", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": True, + "PrimitiveType": 'String', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-sourceendpointarn""" + rp_TableMappings: dict = attr.ib( + default=None, + validator=attr.validators.instance_of(dict), + metadata={ + AttrMeta.PROPERTY_NAME: "TableMappings", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": True, + "PrimitiveType": 'Json', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-tablemappings""" + rp_TargetEndpointArn: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.instance_of(TypeCheck.intrinsic_str_type), + metadata={ + AttrMeta.PROPERTY_NAME: "TargetEndpointArn", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": True, + "PrimitiveType": 'String', + } + }, + ) + """Doc: 
http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-targetendpointarn""" + p_ReplicationSettings: dict = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(dict)), + metadata={ + AttrMeta.PROPERTY_NAME: "ReplicationSettings", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": False, + "PrimitiveType": 'Json', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-replicationsettings""" + p_ResourceIdentifier: TypeHint.intrinsic_str = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(TypeCheck.intrinsic_str_type)), + metadata={ + AttrMeta.PROPERTY_NAME: "ResourceIdentifier", + AttrMeta.DATA: { + "UpdateType": 'Immutable', + "Required": False, + "PrimitiveType": 'String', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-resourceidentifier""" + p_SupplementalSettings: dict = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(dict)), + metadata={ + AttrMeta.PROPERTY_NAME: "SupplementalSettings", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": False, + "PrimitiveType": 'Json', + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-supplementalsettings""" + p_Tags: typing.List[typing.Union[Tag, dict]] = attr.ib( + default=None, + converter=Tag.from_list, + validator=attr.validators.optional( + attr.validators.deep_iterable(member_validator=attr.validators.instance_of(Tag), + iterable_validator=attr.validators.instance_of(list))), + metadata={ + AttrMeta.PROPERTY_NAME: "Tags", + AttrMeta.DATA: { + "UpdateType": 'Mutable', + "Required": False, + "Type": 'List', + "ItemType": 'Tag', + "DuplicatesAllowed": True, + } + }, + ) + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#cfn-dms-replicationconfig-tags""" + + @property + def rv_ReplicationConfigArn(self) -> GetAtt: + """Doc: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-dms-replicationconfig.html#aws-resource-dms-replicationconfig-return-values""" + return GetAtt(resource=self, attr_name="ReplicationConfigArn") diff --git a/lorrystream/carabas/aws/function/__init__.py b/lorrystream/carabas/aws/function/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lorrystream/carabas/aws/function/model.py b/lorrystream/carabas/aws/function/model.py new file mode 100644 index 0000000..1dae83b --- /dev/null +++ b/lorrystream/carabas/aws/function/model.py @@ -0,0 +1,157 @@ +import dataclasses +import logging +import typing as t +from pathlib import Path +from tempfile import TemporaryDirectory + +import attr +import cottonformation as cf +from cottonformation import ResourceGroup +from cottonformation.res import awslambda, iam + +from lorrystream.carabas.aws.model import GenericEnvStack + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class BundleArchive: + """ + Manage a Zip archive. 
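+
+    Usage sketch; the consumer function is hypothetical:
+
+        archive = BundleArchive(name="layer.zip", content=b"PK...")
+        for path in archive.to_file(archive.name):
+            upload(path)  # the temporary file only exists within this loop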
+ """ + + name: str + content: bytes + checksum: t.Optional[str] = None + + def to_file(self, name: str): + with TemporaryDirectory() as tmpdir: + tmppath = Path(tmpdir) + path = tmppath / name + path.write_bytes(self.content) + yield path + + +@attr.s +class LambdaResource: + """ + Manage a Lambda resource. + """ + + group: ResourceGroup = attr.ib() + function: awslambda.Function = attr.ib() + + +@attr.s +class LambdaFactory: + """ + Create a Lambda. + """ + + name: str = attr.ib() + handler: str = attr.ib() + code: str = attr.ib(default=None) + oci_uri: str = attr.ib(default=None) + role_id: str = attr.ib(default="IamRoleForLambdaExecution") + + @property + def function_id(self): + return self.name + + def __attrs_post_init__(self): + self.validate() + + def validate(self): + if self.code is None and self.oci_uri is None: + raise ValueError("Please configure either `code` or `image`") + + def make(self, stack: GenericEnvStack, environment: t.Dict[str, str] = None) -> LambdaResource: + environment = environment or {} + group = ResourceGroup() + + # IAM role for executing the Lambda function. + iam_role_for_lambda = iam.Role( + id=self.role_id, + # you don't need to remember the exact name or syntax for + # trusted entity / assume role policy, cottonformation has a helper for this + rp_AssumeRolePolicyDocument=cf.helpers.iam.AssumeRolePolicyBuilder( + cf.helpers.iam.ServicePrincipal.awslambda() + ).build(), + p_RoleName=cf.Sub("${EnvName}-iam-role-for-lambda", {"EnvName": stack.param_env_name.ref()}), + p_Description="IAM lambda execution role", + # you don't need to remember the exact ARN for aws managed policy. + # cottonformation has a helper for this + p_ManagedPolicyArns=[ + cf.helpers.iam.AwsManagedPolicy.AWSLambdaBasicExecutionRole, + # https://docs.aws.amazon.com/lambda/latest/dg/services-kinesis-create.html + cf.helpers.iam.AwsManagedPolicy.AWSLambdaKinesisExecutionRole, + ], + ) + group.add(iam_role_for_lambda) + + out_lambda_role_arn = cf.Output( + id=f"{self.role_id}Arn", + Description="IAM lambda execution role name", + Value=iam_role_for_lambda.rv_Arn, + ) + group.add(out_lambda_role_arn) + + # Define Lambda function. + """ + - rp_ means "Required Property", it will gives you parameter-hint + for all valid required properties. + - rv_ means "Return Value", allowing you to instantly reference the + attribute. Otherwise, you would need to explicitly invoke `GetAtt`, + to acquire ARNs of previously created resources. + - p_ means "Property". + + aws lambda create-function \ + --function-name hello-world \ + --package-type Image \ + --code ImageUri=111122223333.dkr.ecr.us-east-1.amazonaws.com/hello-world:latest \ + --role arn:aws:iam::111122223333:role/lambda-ex + """ + if self.code: + rp_code = awslambda.PropFunctionCode( + p_ZipFile=self.code, + ) + elif self.oci_uri: + rp_code = awslambda.PropFunctionCode( + p_ImageUri=self.oci_uri, + ) + else: + raise ValueError("Lambda function is invalid without code definition") + + # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-lambda-function.html + # Runtime and Handler are mandatory parameters for functions created with deployment packages + # The Runtime and Handler parameters are not supported for functions created with container images. 
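+        # Sketch of the CloudFormation resource this roughly expands to. The
+        # logical name and image URI are placeholders, and the real template
+        # also carries FunctionName, MemorySize, Timeout, and Environment:
+        #
+        #   ProcessorLambda:
+        #     Type: AWS::Lambda::Function
+        #     Properties:
+        #       PackageType: Image
+        #       Code:
+        #         ImageUri: 111122223333.dkr.ecr.us-east-1.amazonaws.com/hello-world:latest
+        #       Role: !GetAtt IamRoleForLambdaExecution.Arn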
+ lambda_function = awslambda.Function( + id=self.function_id, + p_FunctionName=cf.Sub("${EnvName}-lambda-processor", {"EnvName": stack.param_env_name.ref()}), + rp_Code=rp_code, + p_PackageType="Image", + p_Environment=awslambda.PropFunctionEnvironment(p_Variables=environment), + rp_Role=iam_role_for_lambda.rv_Arn, + p_MemorySize=512, + p_Timeout=3, + ra_DependsOn=iam_role_for_lambda, + ) + + # TODO: Add Zip archive case. + # TODO: Add Python 3.10bis + """ + # p_Runtime=cf.helpers.awslambda.LambdaRuntime.python39, + # p_Runtime="python3.12", + # p_Handler="index.handler", + # p_Handler=self.handler, + """ + group.add(lambda_function) + + out_lambda_func_arn = cf.Output( + id=f"{self.function_id}Arn", + Description="Lambda Function ARN", + Value=lambda_function.rv_Arn, + ) + group.add(out_lambda_func_arn) + + return LambdaResource(group=group, function=lambda_function) diff --git a/lorrystream/carabas/aws/function/oci.py b/lorrystream/carabas/aws/function/oci.py new file mode 100644 index 0000000..90c34f9 --- /dev/null +++ b/lorrystream/carabas/aws/function/oci.py @@ -0,0 +1,263 @@ +import dataclasses +import importlib +import logging +import os +import shlex +import shutil +import subprocess +import typing as t +from pathlib import Path +from tempfile import NamedTemporaryFile, TemporaryDirectory +from textwrap import dedent + +from boto_session_manager import BotoSesManager + +from lorrystream.util.python.bundle import collect_requirements + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class LambdaPythonImage: + """ + Manage + https://docs.aws.amazon.com/lambda/latest/dg/images-create.html + https://docs.aws.amazon.com/lambda/latest/dg/python-image.html + https://aws.amazon.com/blogs/containers/containerizing-lambda-deployments-using-oci-container-images/ + https://docs.aws.amazon.com/prescriptive-guidance/latest/patterns/deploy-lambda-functions-with-container-images.html + """ + + name: str + entrypoint_handler: str + oci_image: t.Union[str, None] = None + oci_version: t.Union[str, None] = None + python_version: str = "3.12" + oci_baseimage: str = "public.ecr.aws/lambda/python" + oci_platform: str = "linux/amd64" + entrypoint_file: t.Union[Path, None] = None + packages: t.List[str] = dataclasses.field(default_factory=list) + requirements_list: t.List[str] = dataclasses.field(default_factory=list) + requirements_file: t.Union[str, Path, None] = None + + _bsm: BotoSesManager = None + + def __post_init__(self): + self._bsm = BotoSesManager() + if self.oci_image is None: + self.oci_image = f"{self._bsm.aws_account_id}.dkr.ecr.{self._bsm.aws_region}.amazonaws.com/{self.name}" + if self.oci_version is None: + self.oci_version = "latest" + self.temporary_requirements_file = NamedTemporaryFile() + + @property + def uri(self) -> str: + """ + The full specification to an OCI image defining the processor element. + """ + return f"{self.oci_image}:{self.oci_version}" + + @property + def image_build(self): + """ + The full qualified name of the image in `build` stage, including tag. + """ + return f"{self.name}:build" + + def find_repository_root(self, package: str): + return self.find_package_root(package).parent + + def find_package_root(self, package: str): + mod = importlib.import_module(package) + return Path(mod.__path__[0]) + + def get_package_folder(self, package): + return f"src/{package}" + + def get_dockerfile(self) -> str: + requirements = "" + entrypoint = "" + packages = "" + + # Populate dependencies from package name. 
+ # This is suitable for building an image including the code on your working tree. + for package in self.packages: + pkg_folder = self.get_package_folder(package) + packages += f"ADD {pkg_folder} /{pkg_folder}" + self.requirements_list.append(f"/{pkg_folder}") + + # Populate dependencies from inline script metadata (PEP 723). + # This is suitable for picking up dependencies from standalone single-file Python programs. + if self.entrypoint_file is not None: + requirements_pep723 = collect_requirements(self.entrypoint_file.read_text()) + self.requirements_list += requirements_pep723 + + # Write list of picked up dependencies into `requirements.txt` file. + if self.requirements_list: + tmpfile = self.temporary_requirements_file + Path(tmpfile.name).write_text("\n".join(self.requirements_list)) + tmpfile.flush() + self.requirements_file = tmpfile.name + + # Render `Dockerfile` snippet to process a `requirements.txt` file. + if self.requirements_file is not None: + requirements = dedent( + """ + # Copy requirements.txt + COPY requirements.txt ${LAMBDA_TASK_ROOT} + + # Install the specified packages + RUN pip install -r requirements.txt + """ + ) + + # Render `Dockerfile` snippet to copy a single-file entrypoint file. + if self.entrypoint_file is not None: + entrypoint = dedent( + f""" + # Copy function code + COPY {self.entrypoint_file.name} ${{LAMBDA_TASK_ROOT}} + """ + ) + + dockerfile = dedent( + f""" + FROM {self.oci_baseimage}:{self.python_version} + + # Install Git, it is needed for installing Python projects from GitHub. + # TODO: Make optional. + # RUN dnf install -y git + + {packages} + + {requirements} + + {entrypoint} + + # Uninstall Git again. + # TODO: Make optional. + # RUN dnf remove -y git + + # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) + CMD [ "{self.entrypoint_handler}" ] + """ + ).strip() + + return dockerfile + + def copy_handler_file(self, target: Path): + module = self.entrypoint_handler.rsplit(".", 1)[0] + mod = importlib.import_module(module) + if mod.__file__ is None: + logger.error(f"Module has no __file__: {module}") + return + path = Path(mod.__file__) + + search = path.name + search = "dynamodb_cdc_lambda.py" + + def ignorefunc(src, names): + ignored = names + if search in names: + names.remove(search) + return ignored + + shutil.copytree(self.find_repository_root("lorrystream"), target / "lorrystream", ignore=ignorefunc) + + def build(self): + """ + docker build --platform linux/amd64 -t docker-image:build . + """ + dockerfile = self.get_dockerfile() + with TemporaryDirectory() as tmpdir: + tmppath = Path(tmpdir) + + # Establish Dockerfile. + (tmppath / "Dockerfile").write_text(dockerfile) + + # Establish Python `requirements.txt` file. + if self.requirements_file: + shutil.copy(self.requirements_file, tmppath / "requirements.txt") + + # Establish single entrypoint file. + if self.entrypoint_file: + shutil.copy(self.entrypoint_file, tmppath) + + # Copier for nested files from packages. + # self.copy_handler_file(tmppath) # noqa: ERA001 + + # Copier for whole development packages. + for package in self.packages: + pkg_folder = self.get_package_folder(package) + + def ignorefunc(src, names): + ignored = ["dist", "tmp"] + for name in names: + if name.startswith(".") and name != ".git": + ignored.append(name) + return ignored + + shutil.copytree(self.find_repository_root(package), tmppath / pkg_folder, ignore=ignorefunc) + + command = f"docker build --platform={self.oci_platform} --tag={self.image_build} ." 
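+            # Run `docker build` inside the temporary build context. BuildKit is
+            # enabled explicitly, and progress output is forced to "plain", so the
+            # full build log shows up in non-interactive environments.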
+ subprocess.run( # noqa: S603 + shlex.split(command), + cwd=tmppath, + env=dict(os.environ) | {"DOCKER_BUILDKIT": "1", "BUILDKIT_PROGRESS": "plain"}, + check=True, + ) + + def test(self): + """ + FIXME: Make it work. + + docker run --platform linux/amd64 -p 9000:8080 docker-image:build + curl "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"payload":"hello world!"}' + """ + """ + command = f"docker run --platform={self.oci_platform} -p 9000:8080 {self.image_build}" + print("test-command:", command) + """ + pass + + def push(self): + """ + Push OCI image of serverless function (AWS Lambda) to container registry (AWS ECR). + + TODO: Use Docker HTTP client wrapper `docker`, instead of shelling out to the `docker` CLI. + + Abstract: + docker tag docker-image:build :latest + docker push .... + + Example: + docker tag docker-image:build 111122223333.dkr.ecr.us-east-1.amazonaws.com/hello-world:latest + docker push 111122223333.dkr.ecr.us-east-1.amazonaws.com/hello-world:latest + """ + + # Ensure the image registry exists. + self.ensure_image_registry() + + # Tag the image with the designated remote image name and version. + command = f"docker tag {self.image_build} {self.oci_image}:{self.oci_version}" + subprocess.run(shlex.split(command), check=True) # noqa: S603 + + # Push to container registry. + command = f"docker push {self.oci_image}:{self.oci_version}" + subprocess.run(shlex.split(command), check=True) # noqa: S603 + + def ensure_image_registry(self): + """ + Make sure ECR container registry exists. It is needed to store OCI images for your Lambda functions. + + aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 111122223333.dkr.ecr.us-east-1.amazonaws.com + aws ecr create-repository --repository-name hello-world --region us-east-1 --image-scanning-configuration scanOnPush=true --image-tag-mutability MUTABLE + """ # noqa: E501 + pass + + def publish(self): + """ + This. + """ + self.build() + self.test() + self.push() diff --git a/lorrystream/carabas/aws/function/zip.py b/lorrystream/carabas/aws/function/zip.py new file mode 100644 index 0000000..7cbdfbd --- /dev/null +++ b/lorrystream/carabas/aws/function/zip.py @@ -0,0 +1,198 @@ +import glob +import shutil +import subprocess +import sys +import typing as T +from pathlib import Path +from tempfile import TemporaryDirectory + +from aws_lambda_layer.context import BuildContext +from aws_lambda_layer.source import build_source_artifacts +from aws_lambda_layer.vendor.better_pathlib import temp_cwd +from aws_lambda_layer.vendor.hashes import hashes + +from lorrystream.carabas.aws.function.model import BundleArchive +from lorrystream.util.python.bundle import collect_requirements + + +# `build_layer_artifacts` from `aws-lambda-layer` package by Sanhe Hu. +# `build_layer_artifacts` improvements to make it platform-agnostic by Andreas Motl. +# https://github.com/MacHu-GWU/aws_lambda_layer-project/blob/546a711401464/aws_lambda_layer/layer.py#L114-L199 +def build_layer_artifacts( + path_requirements: T.Union[str, Path], + dir_build: T.Union[str, Path], + bin_pip: T.Optional[T.Union[str, Path]] = None, + ignore_package_list: T.Optional[T.List[str]] = None, + quiet: bool = False, +) -> str: + """ + Build the AWS Lambda layer artifacts based on the dependencies + specified in the ``path_requirements``. It utilizes ``bin_pip`` to install + the dependencies into the ``${dir_build}/python`` folder. 
Afterwards, + it compresses the ``${dir_build}/python`` folder into ``${dir_build}/layer.zip``. + + Please note that this function is intended to run in an Amazon Linux-like environment, + such as CodeBuild, EC2, or Cloud9, as the Amazon managed Lambda function + also uses Amazon Linux. + + In order to build the layer on Windows or macOS, packages are downloaded from PyPI + using the `manylinux` platform, to avoid compatibility issues with platform-native + libraries / wheel packages including binary code. + + :param path_requirements: example: ``/path/to/requirements.txt``. + :param dir_build: example: ``/path/to/build/lambda``. + :param bin_pip: example: ``/path/to/.venv/bin/pip``. + :param ignore_package_list: a list of package names that you want to ignore + when building the layer. + :param quiet: whether you want to suppress the output of cli commands. + + :return: the layer content sha256, it is sha256 of the requirements.txt file + """ + build_context = BuildContext.new(dir_build=dir_build) + path_requirements = Path(path_requirements).absolute() + if bin_pip: + bin_pip = Path(bin_pip).absolute() + else: + bin_pip = Path(sys.executable).parent.joinpath("pip").absolute() + + # remove existing artifacts and temp folder + build_context.path_layer_zip.unlink(missing_ok=True) + shutil.rmtree(build_context.dir_python, ignore_errors=True) + + # initialize the build/lambda folder + build_context.dir_build.mkdir(parents=True, exist_ok=True) + + # Platform-agnostic `pip install`. + # pip install --platform=manylinux2014_x86_64 --only-binary=:all: \ + # --requirement requirements.txt --target ./build/python/lib/python3.11/site-packages + # https://github.com/MacHu-GWU/aws_lambda_layer-project/issues/1 + # https://docs.aws.amazon.com/lambda/latest/dg/python-layers.html#python-layer-manylinux + # https://github.com/awsdocs/aws-lambda-developer-guide/blob/main/sample-apps/layer-python/layer-numpy/1-install.sh + python_package_path = f"python{sys.version_info.major}.{sys.version_info.minor}" + pkg_relative_path = Path("lib") / python_package_path / "site-packages" + target_path = build_context.dir_python / pkg_relative_path + args = [ + str(bin_pip), + "install", + "--platform=manylinux2014_x86_64", + "--only-binary=:all:", + f"--requirement={path_requirements}", + f"--target={target_path}", + ] + if quiet: + args.append("--disable-pip-version-check") + args.append("--quiet") + subprocess.run(args, check=True) # noqa: S603 + + # zip the layer file + # some packages are pre-installed in AWS Lambda runtime, so we don't need to + # add them to the layer + if ignore_package_list is None: + ignore_package_list = [ + "boto3", + "botocore", + "s3transfer", + "urllib3", + "setuptools", + "pip", + "wheel", + "twine", + "_pytest", + "pytest", + ] + args = [ + "zip", + f"{build_context.path_layer_zip}", + "-r", + "-9", + ] + if quiet: + args.append("-q") + # the glob command and zip command depends on the current working directory + with temp_cwd(build_context.dir_build): + args.extend(glob.glob("*")) + if ignore_package_list: + args.append("-x") + for package in ignore_package_list: + ignore_path = Path(build_context.dir_python.name) / pkg_relative_path + args.append(f"{ignore_path}/{package}*") + subprocess.run(args, check=True) # noqa: S603 + layer_sha256 = hashes.of_bytes(path_requirements.read_bytes()) + return layer_sha256 + + +def build_layer(*artifacts: Path, more_requirements: T.Union[T.List[str], None] = None): + """ + Build an AWS Lambda layer for Python Lamda functions. 
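+
+    Illustrative usage; the entrypoint file name and the extra requirement
+    are hypothetical:
+
+        bundle = build_layer(Path("my_processor.py"), more_requirements=["orjson"])
+        Path("layer.zip").write_bytes(bundle.content)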
+ + https://docs.aws.amazon.com/lambda/latest/dg/python-layers.html#python-layer-manylinux + """ + + # Build list of requirements specifications. + more_requirements = more_requirements or [] + requirements = collect_requirements(*artifacts) + more_requirements + + with TemporaryDirectory() as tmpdir: + # Define build directory. + tmppath = Path(tmpdir) + dir_build = tmppath / "build" + + # Write list of requirements to file. + requirements_file = tmppath.joinpath("requirements.txt") + requirements_file.write_text("\n".join(requirements)) + + # Build AWS Lamda layer Zip archive. + layer_sha256 = build_layer_artifacts( + path_requirements=requirements_file, + dir_build=dir_build, + ) + archive_file = dir_build / "layer.zip" + return BundleArchive(name=archive_file.name, content=archive_file.read_bytes(), checksum=layer_sha256) + + +def build_source(entrypoint_script: Path, *artifacts: Path): + package_name = "common" + with TemporaryDirectory() as tmpdir: + tmppath = Path(tmpdir) + + # Populate source package directory. + dir_build = tmppath / "build" + dir_lib = tmppath / "lib" + pkg_dir = dir_lib / package_name + pkg_dir.mkdir(parents=True, exist_ok=True) + for artifact in artifacts: + shutil.copy(artifact, pkg_dir) + + # Build Zip archive. + dummy_projectfile = dir_lib / "pyproject.toml" + source_sha256, path_source_zip = build_source_artifacts( + path_setup_py_or_pyproject_toml=dummy_projectfile, + package_name=package_name, + path_lambda_function=entrypoint_script, + dir_build=dir_build, + use_pathlib=True, + ) + return BundleArchive(name=path_source_zip.name, content=path_source_zip.read_bytes(), checksum=source_sha256) + + +""" +def upload_source_old(bundle: BundleArchive): + # bsm = BotoSesManager(profile_name="bmt_app_dev_us_east_1") + bsm = BotoSesManager() + with TemporaryDirectory() as tmpdir: + tmppath = Path(tmpdir) + (tmppath / "source.zip").write_bytes(bundle.content) + s3dir_lambda = S3Path( + f"s3://{bsm.aws_account_id}-{bsm.aws_region}-artifacts/projects/{package_name}/lambda/" + ).to_dir() + s3path_source_zip = upload_source_artifacts( + bsm=bsm, + version="0.0.1", + source_sha256=bundle.checksum, + dir_build=tmppath, + s3dir_lambda=s3dir_lambda, + metadata=metadata, + tags=tags, + ) + print("s3path_source_zip:", s3path_source_zip) +""" diff --git a/lorrystream/carabas/aws/model.py b/lorrystream/carabas/aws/model.py new file mode 100644 index 0000000..34b0904 --- /dev/null +++ b/lorrystream/carabas/aws/model.py @@ -0,0 +1,163 @@ +import logging +import typing as t + +import attr +import botocore +import cottonformation as cf +from aws_cloudformation import Parameter +from boto_session_manager import BotoSesManager +from cottonformation.res import kinesis + +if t.TYPE_CHECKING: + from lorrystream.carabas.aws.function.model import LambdaResource + +logger = logging.getLogger(__name__) + + +@attr.s +class GenericEnvStack(cf.Stack): + project: str = attr.ib() + stage: str = attr.ib() + region: str = attr.ib() + description: str = attr.ib() + + _bsm: BotoSesManager + + param_env_name = cf.Parameter( + "EnvName", + Type=cf.Parameter.TypeEnum.String, + ) + + def post_hook(self): + self._bsm = BotoSesManager(region_name=self.region) + self.template.Description = self.description + self.define_parameters() + + def add(self, *things): + """ + A shortcut function to add a component to the current template of this Stack. + """ + for thing in things: + self.template.add(thing) + return self + + @property + def env_name(self): + """ + The environment name is a composite. 
+ + Made from an arbitrary project name, and a name of the stage the Stack is running in. + """ + return f"{self.project}-{self.stage}" + + @property + def stack_name(self): + """ + Stack name equals environment name. + """ + return self.env_name + + def define_parameters(self): + """ + Define Stack parameters. + """ + # Define parameter: Environment name. + self.template.add(self.param_env_name) + + @property + def parameters(self): + """ + Return Stack parameters suitable for deployment. + """ + return [ + Parameter(key="EnvName", value=self.stack_name), + ] + + def deploy(self, respawn: bool = False): + """ + Deploy AWS CloudFormation Stack. + """ + logger.info("Deploying CloudFormation stack") + parameters = self.parameters or [] + + self.template.batch_tagging(dict(ProjectName=self.project, Stage=self.stage), mode_overwrite=True) # noqa: C408 + + env = cf.Env(bsm=self._bsm) + if respawn: + env.delete(stack_name=self.stack_name, skip_prompt=True) + + env.deploy( + template=self.template, + stack_name=self.stack_name, + parameters=parameters, + include_iam=True, + include_named_iam=True, + verbose=True, + skip_prompt=False, + # 300 seconds are not enough to wait for RDS PostgreSQL, for example. + # 500 seconds are not enough for a complete stack including a DMS instance, for example. + # on 110 th attempt, elapsed 555 seconds, remain 445 seconds ... + timeout=750, + ) + return self + + +@attr.s +class GenericProcessorStack(GenericEnvStack): + + _processor: t.Optional["LambdaResource"] = None + + def deploy_processor_image(self): + """ + Make an already running Lambda pick up a newly published OCI image. + + This is an imperative function executed orthogonally to the CloudFormation deployment. + + It follows this procedure: + - Acquire the `Arn` Output of the Stack's core processor Lambda. + - Use it to look up a handle to the actual Lambda information. + - From the information unit, extract the OCI image URI. + - Instruct the machinery to update the Lambda function code, + effectively respawning the container running it. + """ + if not self._processor: + logger.warning("No processor defined, skip deploying processor OCI image") + return None + function_id = self._processor.function.id + + # Inquire Stack Output. + logger.info(f"Discovering Lambda function existence: {function_id}") + output_id = f"{function_id}Arn" + try: + function_arn = self.get_output_value(self._bsm, output_id) + except botocore.exceptions.ClientError as ex: + if "does not exist" not in str(ex): + raise + logger.info(f"Stack not found or incomplete: {self.stack_name}") + return None + except KeyError: + logger.info(f"Stack not found or incomplete. Output not found: {output_id}") + return None + + # Inquire AWS API and eventually update Lambda code. 
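+        # Lambda resolves an image tag to a digest when function code is updated,
+        # so re-submitting the same `ImageUri` makes the function pick up the
+        # image that has just been pushed to ECR.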
+ client = self._bsm.get_client("lambda") + try: + if func := client.get_function(FunctionName=function_arn): + logger.info(f"Found Lambda function: {function_arn}") + oci_uri = func["Code"]["ImageUri"] + logger.info(f"Deploying new OCI image to Lambda function: {oci_uri}") + response = client.update_function_code(FunctionName=function_arn, ImageUri=oci_uri) + last_status_message = response["LastUpdateStatusReason"] + logger.info(f"Lambda update status response: {last_status_message}") + except Exception as ex: + if ex.__class__.__name__ != "ResourceNotFoundException": + raise + logger.info(f"Lambda function to update OCI image not found: {function_arn}") + + return self + + +@attr.s +class KinesisProcessorStack(GenericProcessorStack): + + _stream_source: t.Union[kinesis.Stream, None] = None diff --git a/lorrystream/carabas/aws/stack/__init__.py b/lorrystream/carabas/aws/stack/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lorrystream/carabas/aws/stack/dms.py b/lorrystream/carabas/aws/stack/dms.py new file mode 100644 index 0000000..198f9e4 --- /dev/null +++ b/lorrystream/carabas/aws/stack/dms.py @@ -0,0 +1,685 @@ +import json +import typing as t + +import attr +import cottonformation as cf +from cottonformation import GetAtt +from cottonformation.res import awslambda, ec2, iam, kinesis, rds + +from lorrystream.carabas.aws import LambdaFactory +from lorrystream.carabas.aws.cf import dms_next as dms +from lorrystream.carabas.aws.model import KinesisProcessorStack + + +@attr.s +class RDSPostgreSQLDMSKinesisPipe(KinesisProcessorStack): + """ + A description for an AWS CloudFormation stack for migrating from PostgreSQL. + It is written down in Python, uses OO, and a fluent API. + + It provides elements to implement this kind of pipeline: + + RDS PostgreSQL -> DMS -> Kinesis Stream -> Python Lambda via OCI -> CrateDB + + See also the canonical AWS documentation about relevant topics. 
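+
+    A minimal usage sketch; all parameter values are illustrative:
+
+        stack = RDSPostgreSQLDMSKinesisPipe(
+            project="testdrive-dms", stage="dev", region="eu-central-1",
+            description="RDS PostgreSQL > DMS > Kinesis > Lambda > CrateDB",
+            db_username="dynapipe", db_password="secret11",
+        )
+        stack.vpc().database().stream().dms()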
+ + Documentation: + - https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Serverless.Components.html + - https://docs.aws.amazon.com/dms/latest/userguide/CHAP_VPC_Endpoints.html + - https://docs.aws.amazon.com/dms/latest/userguide/security-iam-awsmanpol.html + - https://docs.aws.amazon.com/dms/latest/userguide/security-iam.html#CHAP_Security.IAMPermissions + + Resources: + - https://aws.amazon.com/blogs/database/orchestrate-an-aws-dms-serverless-replication-task-using-aws-cli/ + - https://aws.amazon.com/blogs/aws/new-aws-dms-serverless-automatically-provisions-and-scales-capacity-for-migration-and-data-replication/ + - https://github.com/aws-cloudformation/aws-cloudformation-templates/blob/main/DMS/DMSAuroraToS3FullLoadAndOngoingReplication.yaml + """ + + db_username: str = attr.ib() + db_password: str = attr.ib() + + _vpc: ec2.VPC = None + _public_subnet1: ec2.Subnet = None + _public_subnet2: ec2.Subnet = None + _db_subnet_group: rds.DBSubnetGroup = None + _db_security_group: ec2.SecurityGroup = None + + _db: rds.DBInstance = None + + _dms_instance: dms.ReplicationInstance = None + _dms_kinesis_access_role: iam.Role = None + + def vpc(self): + group = cf.ResourceGroup() + + self._vpc = ec2.VPC( + "VPCInstance", + p_CidrBlock="10.0.0.0/24", + p_EnableDnsHostnames=True, + p_EnableDnsSupport=True, + p_Tags=cf.Tag.make_many( + Name=cf.Sub.from_params(f"{self.env_name}-vpc"), + Description=cf.Sub.from_params(f"The VPC for {self.env_name}"), + ), + ) + group.add(self._vpc) + + # Even if you are deploying a single-az instance, you have to + # specify multiple availability zones in the DB subnet group. + # https://stackoverflow.com/a/70658040 + # https://stackoverflow.com/a/63975208 + self._public_subnet1 = ec2.Subnet( + "VPCPublicSubnet1", + p_CidrBlock="10.0.0.0/26", + rp_VpcId=self._vpc.ref(), + p_AvailabilityZone=cf.GetAZs.n_th(1), + p_MapPublicIpOnLaunch=False, + p_Tags=cf.Tag.make_many( + Name=cf.Sub.from_params(f"{self.env_name}-vpc-subnet1"), + Description=cf.Sub.from_params(f"The VPC subnet 1 for {self.env_name}"), + ), + ra_DependsOn=self._vpc, + ) + self._public_subnet2 = ec2.Subnet( + "VPCPublicSubnet2", + p_CidrBlock="10.0.0.64/26", + rp_VpcId=self._vpc.ref(), + p_AvailabilityZone=cf.GetAZs.n_th(2), + p_MapPublicIpOnLaunch=False, + p_Tags=cf.Tag.make_many( + Name=cf.Sub.from_params(f"{self.env_name}-vpc-subnet2"), + Description=cf.Sub.from_params(f"The VPC subnet 2 for {self.env_name}"), + ), + ra_DependsOn=self._vpc, + ) + group.add(self._public_subnet1) + group.add(self._public_subnet2) + + # FIXME: Problem: Cannot create a publicly accessible DBInstance. + # The specified VPC has no internet gateway attached. 
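+        # The internet gateway, its VPC attachment, and the default route below
+        # are what make the public subnets, and thus a publicly accessible RDS
+        # instance, reachable from outside the VPC.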
+ gateway = ec2.InternetGateway( + "VPCGateway", + p_Tags=cf.Tag.make_many( + Name=cf.Sub.from_params(f"{self.env_name}-vpc-gateway"), + Description=cf.Sub.from_params(f"The VPC gateway for {self.env_name}"), + ), + ra_DependsOn=self._vpc, + ) + gateway_attachment = ec2.VPCGatewayAttachment( + "VPCGatewayAttachment", + rp_VpcId=self._vpc.ref(), + p_InternetGatewayId=gateway.ref(), + ra_DependsOn=[self._vpc, gateway], + ) + group.add(gateway) + group.add(gateway_attachment) + + route_table = ec2.RouteTable( + "VPCRouteTable", + rp_VpcId=self._vpc.ref(), + p_Tags=cf.Tag.make_many( + Name=cf.Sub.from_params(f"{self.env_name}-vpc-route-table"), + Description=cf.Sub.from_params(f"The VPC routing table for {self.env_name}"), + ), + ) + group.add(route_table) + + default_route = ec2.Route( + "VPCDefaultRoute", + rp_RouteTableId=route_table.ref(), + p_DestinationCidrBlock="0.0.0.0/0", + p_GatewayId=gateway.ref(), + ra_DependsOn=gateway_attachment, + ) + group.add(default_route) + + subnet_route_1 = ec2.SubnetRouteTableAssociation( + "VPCSubnetRoute1", + rp_RouteTableId=route_table.ref(), + rp_SubnetId=self._public_subnet1.ref(), + ra_DependsOn=[route_table, self._public_subnet1], + ) + subnet_route_2 = ec2.SubnetRouteTableAssociation( + "VPCSubnetRoute2", + rp_RouteTableId=route_table.ref(), + rp_SubnetId=self._public_subnet2.ref(), + ra_DependsOn=[route_table, self._public_subnet2], + ) + group.add(subnet_route_1) + group.add(subnet_route_2) + + return self.add(group) + + def database(self): + group = cf.ResourceGroup() + + # https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_VPC.WorkingWithRDSInstanceinaVPC.html + self._db_subnet_group = rds.DBSubnetGroup( + "RDSPostgreSQLDBSubnetGroup", + rp_DBSubnetGroupDescription=f"DB subnet group for {self.env_name}", + rp_SubnetIds=[self._public_subnet1.ref(), self._public_subnet2.ref()], + p_DBSubnetGroupName=f"{self.env_name}-db-subnet-group", + p_Tags=cf.Tag.make_many(Name=cf.Sub.from_params(f"{self.env_name}-db-subnet-group")), + ra_DependsOn=[self._public_subnet1, self._public_subnet2], + ) + group.add(self._db_subnet_group) + + db_security_group_name = f"{self.env_name}-db-security-group" + self._db_security_group = ec2.SecurityGroup( + "RDSPostgreSQLSecurityGroup", + rp_GroupDescription=f"DB security group for {self.env_name}", + p_GroupName=db_security_group_name, + p_VpcId=self._vpc.ref(), + p_SecurityGroupIngress=[ + ec2.PropSecurityGroupIngress( + rp_IpProtocol="TCP", + p_Description="Allow access from VPC", + p_FromPort=5432, + p_ToPort=5432, + p_CidrIp="10.0.0.0/24", + ), + # TODO: Possibly restrict to single provided ClientIP? 
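+                # The 0.0.0.0/0 rule below is intentionally wide open to keep the
+                # demonstration simple; tighten it for real deployments.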
+ ec2.PropSecurityGroupIngress( + rp_IpProtocol="TCP", + p_Description="Allow access from outside", + p_FromPort=5432, + p_ToPort=5432, + p_CidrIp="0.0.0.0/0", + ), + ], + p_SecurityGroupEgress=[ + ec2.PropSecurityGroupEgress( + rp_IpProtocol="-1", + p_Description="Allow any access out", + p_FromPort=-1, + p_ToPort=-1, + p_CidrIp="0.0.0.0/0", + ) + ], + p_Tags=cf.Tag.make_many(Name=cf.Sub.from_params(db_security_group_name)), + ra_DependsOn=[self._vpc], + ) + group.add(self._db_security_group) + + # aws rds describe-db-parameter-groups + # aws rds describe-db-parameters --db-parameter-group-name default.postgres15 + db_parameter_group = rds.DBParameterGroup( + "RDSPostgreSQLParameterGroup", + rp_Family="postgres15", + rp_Description="DMS parameter group for postgres15", + p_DBParameterGroupName="dms-postgres15", + # aws rds describe-db-parameters --db-parameter-group-name default.postgres15 + p_Parameters={ + "log_connections": True, + # List of allowable settings for the pgaudit.log parameter: + # none, all, ddl, function, misc, read, role, write + # https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.pgaudit.html + "pgaudit.log": "none", + "pgaudit.log_statement_once": True, + # `rds.logical_replication is a cluster level setting, not db instance setting? + # https://stackoverflow.com/a/66252465 + "rds.logical_replication": True, + "shared_preload_libraries": "pgaudit,pglogical,pg_stat_statements", + }, + ) + group.add(db_parameter_group) + + db = rds.DBInstance( + "RDSPostgreSQL", + p_DBInstanceClass="db.t3.micro", + p_DBInstanceIdentifier=f"{self.env_name}-db", + p_Engine="postgres", + # PostgreSQL 16 only supported by DMS 3.5.3. + # The current default engine version for AWS DMS is 3.5.2. + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_ReleaseNotes.html + p_EngineVersion="15", + p_DBParameterGroupName="dms-postgres15", + # The parameter AllocatedStorage must be provided and must not be null. + # Invalid storage size for engine name postgres and storage type gp2: 1 + p_AllocatedStorage="5", + # p_StorageType="gp3", # noqa: ERA001 + # Setting this parameter to 0 disables automated backups. + # Disabling automated backups speeds up the provisioning process. + p_BackupRetentionPeriod=0, + # To disable collection of Enhanced Monitoring metrics, specify 0. + p_MonitoringInterval=0, + p_EnablePerformanceInsights=False, + p_MasterUsername=self.db_username, + p_MasterUserPassword=self.db_password, + p_PubliclyAccessible=True, + p_MultiAZ=False, + p_VPCSecurityGroups=[ + self._db_security_group.ref(), + ], + # If there's no DB subnet group, then the DB instance isn't a VPC DB instance. 
+ p_DBSubnetGroupName=self._db_subnet_group.ref(), + p_EnableCloudwatchLogsExports=["postgresql"], + # p_DBName="testdrive", # noqa: ERA001 + p_Tags=cf.Tag.make_many( + Name=cf.Sub.from_params(f"{self.env_name}-db"), + Description=cf.Sub.from_params(f"The DB instance for {self.env_name}"), + ), + ra_DependsOn=[db_parameter_group, self._db_security_group, self._db_subnet_group], + ) + self._db = db + group.add(db) + + rds_arn = cf.Output( + "RDSInstanceArn", + Value=db.rv_DBInstanceArn, + ) + group.add(rds_arn) + + public_endpoint = cf.Output( + "DatabaseHost", + Value=db.rv_EndpointAddress, + ) + group.add(public_endpoint) + + public_db_port = cf.Output( + "DatabasePort", + Value=db.rv_EndpointPort, + ) + group.add(public_db_port) + return self.add(group) + + def stream(self): + group = cf.ResourceGroup() + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Target.Kinesis.html#CHAP_Target.Kinesis.Prerequisites + + self._stream_source = kinesis.Stream( + id="KinesisStream", + p_Name=f"{self.env_name}-stream", + p_StreamModeDetails={"rp_StreamMode": "ON_DEMAND"}, + ) + stream_arn = cf.Output( + "StreamArn", + Value=self._stream_source.rv_Arn, + ) + group.add(self._stream_source) + group.add(stream_arn) + return self.add(group) + + def dms(self): + """ + An AWS DMS Serverless CloudFormation description for demonstration purposes. + + https://docs.aws.amazon.com/dms/latest/userguide/security-iam.html#CHAP_Security.APIRole + + Database Migration Service requires the below IAM Roles to be created before + replication instances can be created. See the DMS Documentation for + additional information: https://docs.aws.amazon.com/dms/latest/userguide/security-iam.html#CHAP_Security.APIRole + * dms-vpc-role + * dms-cloudwatch-logs-role + * dms-access-for-endpoint + + If you use the AWS CLI or the AWS DMS API for your database migration, you must add three IAM roles + to your AWS account before you can use the features of AWS DMS. Two of these are `dms-vpc-role` and + `dms-cloudwatch-logs-role`. + + If you use Amazon Redshift as a target database, you must also add the IAM role + `dms-access-for-endpoint` to your AWS account. + + -- https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/dms_replication_instance.html + -- https://github.com/hashicorp/terraform-provider-aws/issues/19580 + -- https://docs.aws.amazon.com/dms/latest/userguide/security-iam.html#CHAP_Security.APIRole + """ + group = cf.ResourceGroup() + + # Trust policy that is associated with upcoming roles. + # Trust policies define which entities can assume the role. + # You can associate only one trust policy with a role. + trust_policy_dms = cf.helpers.iam.AssumeRolePolicyBuilder( + cf.helpers.iam.ServicePrincipal.dms(), + ).build() + + dms_vpc_role = iam.Role( + id="DMSVPCManagementRole", + rp_AssumeRolePolicyDocument=trust_policy_dms, + # Role name must strictly be `dms-vpc-role`? 
+ # https://stackoverflow.com/q/58542334 + # https://github.com/hashicorp/terraform-provider-aws/issues/7748 + # https://github.com/hashicorp/terraform-provider-aws/issues/11025 + # p_RoleName=cf.Sub("${EnvName}-dms-vpc-role", {"EnvName": self.param_env_name.ref()}), # noqa: ERA001, E501 + p_RoleName="dms-vpc-role", + p_Description="DMS VPC management IAM role", + p_ManagedPolicyArns=[ + cf.helpers.iam.AwsManagedPolicy.AmazonDMSVPCManagementRole, + ], + ) + group.add(dms_vpc_role) + dms_cloudwatch_role = iam.Role( + id="DMSCloudWatchLogsRole", + rp_AssumeRolePolicyDocument=trust_policy_dms, + # Role name must strictly be `dms-cloudwatch-logs-role`? + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Troubleshooting.html#CHAP_Troubleshooting.General.CWL + # p_RoleName=cf.Sub("${EnvName}-dms-cloudwatch-logs-role", {"EnvName": self.param_env_name.ref()}), # noqa: ERA001, E501 + p_RoleName="dms-cloudwatch-logs-role", + p_Description="DMS CloudWatch IAM role", + p_ManagedPolicyArns=[ + cf.helpers.iam.AwsManagedPolicy.AmazonDMSCloudWatchLogsRole, + ], + ) + group.add(dms_cloudwatch_role) + + # Allow DMS accessing the data sink. In this case, Kinesis. + # For Redshift, this role needs to be called `dms-access-for-endpoint`. + self._dms_kinesis_access_role = iam.Role( + id="DMSTargetAccessRole", + rp_AssumeRolePolicyDocument=trust_policy_dms, + p_RoleName=cf.Sub("${EnvName}-dms-target-access-role", {"EnvName": self.param_env_name.ref()}), + p_Description="DMS target access IAM role", + p_ManagedPolicyArns=[ + cf.helpers.iam.AwsManagedPolicy.AmazonKinesisFullAccess, + ], + ra_DependsOn=self._stream_source, + ) + group.add(self._dms_kinesis_access_role) + + # Create a replication subnet group given a list of the subnet IDs in a VPC. + # https://docs.aws.amazon.com/dms/latest/APIReference/API_CreateReplicationSubnetGroup.html + dms_replication_subnet_group = dms.ReplicationSubnetGroup( # type: ignore[call-arg,misc] + "DMSReplicationSubnetGroup", + rp_SubnetIds=[self._public_subnet1.ref(), self._public_subnet2.ref()], + rp_ReplicationSubnetGroupDescription=f"DMS replication subnet group for {self.env_name}", + p_ReplicationSubnetGroupIdentifier=f"{self.env_name}-dms-subnet-group", + ra_DependsOn=[dms_vpc_role], + ) + group.add(dms_replication_subnet_group) + + dms_security_group_name = f"{self.env_name}-dms-security-group" + dms_security_group = ec2.SecurityGroup( + "DMSSecurityGroup", + rp_GroupDescription=f"DMS security group for {self.env_name}", + p_GroupName=dms_security_group_name, + p_VpcId=self._vpc.ref(), + p_SecurityGroupIngress=[ + ec2.PropSecurityGroupIngress( + rp_IpProtocol="-1", + p_Description="Allow access from VPC", + p_FromPort=-1, + p_ToPort=-1, + p_CidrIp="10.0.0.0/24", + ), + # TODO: Possibly restrict to single provided ClientIP? + ec2.PropSecurityGroupIngress( + rp_IpProtocol="-1", + p_Description="Allow access from outside", + p_FromPort=-1, + p_ToPort=-1, + p_CidrIp="0.0.0.0/0", + ), + ], + p_SecurityGroupEgress=[ + ec2.PropSecurityGroupEgress( + rp_IpProtocol="-1", + p_Description="Allow any access out", + p_FromPort=-1, + p_ToPort=-1, + p_CidrIp="0.0.0.0/0", + ) + ], + p_Tags=cf.Tag.make_many(Name=cf.Sub.from_params(dms_security_group_name)), + ra_DependsOn=[self._vpc, dms_replication_subnet_group], + ) + group.add(dms_security_group) + + # The replication instance is the main workhorse. 
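+        # `dms.t3.medium` with 5 GB of allocated storage is deliberately small
+        # for the demonstration scenario; size both up for production workloads.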
+ self._dms_instance = dms.ReplicationInstance( + "DMSReplicationInstance", + rp_ReplicationInstanceClass="dms.t3.medium", + p_ReplicationInstanceIdentifier=f"{self.env_name}-dms-instance", + p_MultiAZ=False, + p_ReplicationSubnetGroupIdentifier=dms_replication_subnet_group.ref(), + p_VpcSecurityGroupIds=[dms_security_group.ref()], + p_EngineVersion="3.5.2", + p_AllocatedStorage=5, + p_PubliclyAccessible=True, + p_AutoMinorVersionUpgrade=False, + p_AllowMajorVersionUpgrade=False, + ra_DependsOn=[ + dms_vpc_role, + dms_cloudwatch_role, + dms_security_group, + dms_replication_subnet_group, + self._dms_kinesis_access_role, + ], + ) + group.add(self._dms_instance) + + # Configuring VPC endpoints as AWS DMS source and target endpoints. + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_VPC_Endpoints.html + vpc_endpoint_stream = ec2.VPCEndpoint( + "KinesisVPCEndpoint", + rp_VpcId=self._vpc.ref(), + rp_ServiceName=f"com.amazonaws.{self.region}.kinesis-streams", + p_SubnetIds=[self._public_subnet1.ref(), self._public_subnet2.ref()], + # TODO: Does it really need _both_ security groups? + p_SecurityGroupIds=[ + self._db_security_group.ref(), + dms_security_group.ref(), + ], + p_VpcEndpointType="Interface", + ) + group.add(vpc_endpoint_stream) + return self.add(group) + + def replication(self, dms_table_mapping: t.Dict[str, t.Any]): + + group = cf.ResourceGroup() + + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Source.PostgreSQL.html#CHAP_Source.PostgreSQL.Advanced + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Source.PostgreSQL.html#CHAP_Source.PostgreSQL.RDSPostgreSQL + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Source.PostgreSQL.html#CHAP_Source.PostgreSQL.ConnectionAttrib + source_endpoint = dms.Endpoint( # type: ignore[call-arg,misc] + "DMSSourceEndpoint", + rp_EndpointType="source", + rp_EngineName="postgres", + p_ServerName=self._db.rv_EndpointAddress, + # NOTE: Needs to be integer, so it requires a patched version of cottonformation's `dms` resource wrappers. + p_Port=self._db.rv_EndpointPort, + p_SslMode="require", + p_Username=self.db_username, + p_Password=self.db_password, + p_DatabaseName="postgres", + p_ExtraConnectionAttributes=json.dumps( + { + "CaptureDdls": True, + "PluginName": "pglogical", + } + ), + p_EndpointIdentifier=f"{self.env_name}-endpoint-source", + ra_DependsOn=[self._db], + ) + target_endpoint = dms.Endpoint( # type: ignore[call-arg,misc] + "DMSTargetEndpoint", + rp_EndpointType="target", + rp_EngineName="kinesis", + p_KinesisSettings=dms.PropEndpointKinesisSettings( + p_StreamArn=self.stream_arn, + p_MessageFormat="json-unformatted", + p_IncludeControlDetails=True, + p_IncludePartitionValue=True, + p_IncludeTransactionDetails=True, + p_IncludeNullAndEmpty=True, + p_IncludeTableAlterOperations=True, + p_PartitionIncludeSchemaTable=True, + # The parameter ServiceAccessRoleArn must be provided and must not be blank. 
+ p_ServiceAccessRoleArn=self._dms_kinesis_access_role.rv_Arn, + ), + p_EndpointIdentifier=f"{self.env_name}-endpoint-target", + ra_DependsOn=[self._stream_source, self._dms_kinesis_access_role], + ) + group.add(source_endpoint) + group.add(target_endpoint) + + replication_settings = { + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Tasks.CustomizingTasks.TaskSettings.BeforeImage.html + "BeforeImageSettings": { + "EnableBeforeImage": True, + "FieldName": "before-image", + "ColumnFilter": "pk-only", + }, + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Tasks.CustomizingTasks.TaskSettings.Logging.html + "Logging": { + "EnableLogging": True, + "EnableLogContext": True, + # ERROR: Feature is not accessible. + # TODO: "LogConfiguration": {"EnableTraceOnError": True}, + "LogComponents": [ + {"Id": "COMMON", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "ADDONS", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "DATA_STRUCTURE", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "COMMUNICATION", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "FILE_TRANSFER", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "FILE_FACTORY", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "METADATA_MANAGER", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "IO", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "PERFORMANCE", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "SORTER", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "SOURCE_CAPTURE", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "SOURCE_UNLOAD", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "TABLES_MANAGER", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "TARGET_APPLY", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "TARGET_LOAD", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "TASK_MANAGER", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "TRANSFORMATION", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + {"Id": "REST_SERVER", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + # Replication Settings document error: Unsupported keys were found: VALIDATOR + # {"Id": "VALIDATOR", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, # noqa: ERA001 + {"Id": "VALIDATOR_EXT", "Severity": "LOGGER_SEVERITY_DETAILED_DEBUG"}, + ], + }, + } + + """ + replication = dms.ReplicationConfig( # type: ignore[call-arg,misc] + "DMSReplicationConfig", + rp_ReplicationConfigIdentifier=f"{self.env_name}-dms-serverless", + # p_ResourceIdentifier=f"{self.env_name}-dms-serverless-resource", # noqa: ERA001 + rp_ReplicationType="full-load-and-cdc", + rp_SourceEndpointArn=source_endpoint.ref(), + rp_TargetEndpointArn=target_endpoint.ref(), + rp_ComputeConfig=dms.PropReplicationConfigComputeConfig( + rp_MaxCapacityUnits=1, + p_MinCapacityUnits=1, + p_MultiAZ=False, + p_ReplicationSubnetGroupId=dms_replication_subnet_group.ref(), + p_VpcSecurityGroupIds=[self._db_security_group.ref(), dms_security_group.ref()], + ), + rp_TableMappings=map_to_kinesis, + p_ReplicationSettings=replication_settings, + ra_DependsOn=[ + dms_replication_subnet_group, + dms_security_group, + dms_vpc_role, + dms_cloudwatch_role, + dms_target_access_role, + source_endpoint, + target_endpoint, + ], + ) + group.add(replication) + + replication_config_arn = cf.Output( + "ReplicationConfigArn", + Value=replication.rv_ReplicationConfigArn, + ) + group.add(replication_config_arn) + return self.add(group) + """ + + replication = dms.ReplicationTask( # type: ignore[call-arg,misc] 
+ "DMSReplicationTask", + # TODO: Use existing replication instance on demand. + # FIXME: Make configurable. + rp_ReplicationInstanceArn=self._dms_instance.ref(), + p_ReplicationTaskIdentifier=f"{self.env_name}-dms-task", + # p_ResourceIdentifier=f"{self.env_name}-dms-serverless-resource", # noqa: ERA001 + rp_MigrationType="full-load-and-cdc", + rp_SourceEndpointArn=source_endpoint.ref(), + rp_TargetEndpointArn=target_endpoint.ref(), + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Tasks.CustomizingTasks.TableMapping.SelectionTransformation.html + rp_TableMappings=json.dumps(dms_table_mapping), + # https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Tasks.CustomizingTasks.TaskSettings.html + p_ReplicationTaskSettings=json.dumps(replication_settings), + ra_DependsOn=[ + self._dms_instance, + source_endpoint, + target_endpoint, + ], + ra_DeletionPolicy="Retain", + ) + group.add(replication) + + replication_task_arn = cf.Output( + "ReplicationTaskArn", + Value=replication.ref(), + ) + group.add(replication_task_arn) + + return self.add(group) + + @property + def stream_arn(self) -> GetAtt: + if self._stream_source is None: + raise ValueError("Kinesis Stream source not defined") + return self._stream_source.rv_Arn + + def processor(self, factory: LambdaFactory, environment: t.Dict[str, str]): + """ + Manifest the main processor component of this pipeline. + """ + self._processor = factory.make(self, environment=environment) + return self.add(self._processor.group) + + def connect( + self, + batch_size: int = 1_000, + starting_position: t.Literal["LATEST", "TRIM_HORIZON", "AT_TIMESTAMP"] = "TRIM_HORIZON", + starting_position_timestamp: float = None, + ): + """ + Connect the event source to the processor Lambda. + + starting_position: + - LATEST - Read only new records. + - TRIM_HORIZON - Process all available records. + - AT_TIMESTAMP - Specify a time from which to start reading records. + + starting_position_timestamp: + With `starting_position` set to `AT_TIMESTAMP`, the time from which to start reading, + in Unix time seconds. `starting_position_timestamp` cannot be in the future. + + https://docs.aws.amazon.com/lambda/latest/dg/services-kinesis-create.html + https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-lambda-eventsourcemapping.html + + aws kinesis register-stream-consumer \ + --consumer-name con1 \ + --stream-arn arn:aws:kinesis:us-east-2:123456789012:stream/lambda-stream + + aws lambda create-event-source-mapping \ + --function-name MyFunction \ + --event-source-arn arn:aws:kinesis:us-east-2:123456789012:stream/lambda-stream \ + --starting-position LATEST \ + --batch-size 100 + """ + if not self._processor: + raise RuntimeError("No processor defined") + if not self._stream_source: + raise RuntimeError("No Kinesis stream defined") + + # Get a handle to the AWS Lambda for dependency management purposes. + awsfunc = self._processor.function + + # Create a mapping and add it to the stack. 
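+        # Note: `StartingPositionTimestamp` is only evaluated when
+        # `StartingPosition` is `AT_TIMESTAMP`, see the docstring above.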
+ mapping = awslambda.EventSourceMapping( + id="KinesisToLambdaMapping", + rp_FunctionName=awsfunc.p_FunctionName, + p_EventSourceArn=self._stream_source.rv_Arn, + p_BatchSize=batch_size, + p_StartingPosition=starting_position, + p_StartingPositionTimestamp=starting_position_timestamp, + ra_DependsOn=awsfunc, + ) + return self.add(mapping) diff --git a/lorrystream/carabas/aws/stack/dynamodb.py b/lorrystream/carabas/aws/stack/dynamodb.py new file mode 100644 index 0000000..cb76fc7 --- /dev/null +++ b/lorrystream/carabas/aws/stack/dynamodb.py @@ -0,0 +1,141 @@ +import logging +import typing as t + +import attr +from cottonformation import ResourceGroup +from cottonformation.res import awslambda, dynamodb, kinesis +from cottonformation.res.dynamodb import PropTableKinesisStreamSpecification + +from lorrystream.carabas.aws.function.model import LambdaFactory +from lorrystream.carabas.aws.model import KinesisProcessorStack + +logger = logging.getLogger(__name__) + + +@attr.s +class DynamoDBKinesisPipe(KinesisProcessorStack): + """ + A description for an AWS CloudFormation stack, relaying DynamoDB CDC information into a sink. + It is written down in Python, uses OO, and a fluent API. + + It provides elements to implement this kind of pipeline: + + DynamoDB CDC -> Kinesis Stream -> Python Lambda via OCI -> CrateDB + + See also the canonical AWS documentation about relevant topics. + + - DynamoDB -> Kinesis: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/kds_gettingstarted.html + - Kinesis -> Lambda: https://docs.aws.amazon.com/lambda/latest/dg/with-kinesis.html + """ + + table_name: str = attr.ib() + stream_name: str = attr.ib() + + environment: t.Dict[str, str] = attr.ib(factory=dict) + + def table(self): + """ + aws dynamodb create-table \ + --table-name table-testdrive \ + --key-schema \ + AttributeName=device,KeyType=HASH \ + AttributeName=timestamp,KeyType=RANGE \ + --attribute-definitions \ + AttributeName=device,AttributeType=S \ + AttributeName=timestamp,AttributeType=S \ + --provisioned-throughput \ + ReadCapacityUnits=1,WriteCapacityUnits=1 \ + --table-class STANDARD + :return: + """ + + group = ResourceGroup() + + table = dynamodb.Table( + id="DynamoDBTable", + p_TableName=self.table_name, + rp_KeySchema=[ + {"rp_AttributeName": "device", "rp_KeyType": "HASH"}, + {"rp_AttributeName": "timestamp", "rp_KeyType": "RANGE"}, + ], + p_AttributeDefinitions=[ + {"rp_AttributeName": "device", "rp_AttributeType": "S"}, + {"rp_AttributeName": "timestamp", "rp_AttributeType": "S"}, + ], + p_TableClass="STANDARD", + p_ProvisionedThroughput={"rp_ReadCapacityUnits": 1, "rp_WriteCapacityUnits": 1}, + # p_KinesisStreamSpecification=PropTableKinesisStreamSpecification(rp_StreamArn=), + ) + + """ + aws kinesis create-stream --stream-name dynamodb-cdc --shard-count 4 + + # Check that the Kinesis stream is active. 
+ aws kinesis describe-stream --stream-name dynamodb-cdc + + STREAM_ARN=$(aws kinesis describe-stream --stream-name dynamodb-cdc | jq -r .StreamDescription.StreamARN) + aws dynamodb enable-kinesis-streaming-destination \ + --table-name table-testdrive \ + --stream-arn "${STREAM_ARN}" \ + --enable-kinesis-streaming-configuration ApproximateCreationDateTimePrecision=MICROSECOND + """ + + # TODO: ShardCount is expected when StreamMode=PROVISIONED + stream = kinesis.Stream( + id="KinesisStream", + p_Name=self.stream_name, + p_StreamModeDetails={"rp_StreamMode": "ON_DEMAND"}, + ) + group.add(stream) + self._event_source = stream + + table.p_KinesisStreamSpecification = PropTableKinesisStreamSpecification(rp_StreamArn=stream.rv_Arn) + group.add(table) + + return self.add(group) + + def processor(self, proc: LambdaFactory): + """ + Manifest the main processor component of this pipeline. + """ + self._processor = proc.make(self, environment=self.environment) + return self.add(self._processor.group) + + def connect(self): + """ + Connect the event source to the processor. + + https://docs.aws.amazon.com/lambda/latest/dg/services-kinesis-create.html + https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-lambda-eventsourcemapping.html#cfn-lambda-eventsourcemapping-startingposition + + aws kinesis register-stream-consumer \ + --consumer-name con1 \ + --stream-arn arn:aws:kinesis:us-east-2:123456789012:stream/lambda-stream + + aws lambda create-event-source-mapping \ + --function-name MyFunction \ + --event-source-arn arn:aws:kinesis:us-east-2:123456789012:stream/lambda-stream \ + --starting-position LATEST \ + --batch-size 100 + """ + if not self._processor: + raise RuntimeError("No processor defined") + if not self._event_source: + raise RuntimeError("No event source defined") + + # Get a handle to the AWS Lambda for dependency management purposes. + awsfunc = self._processor.function + + # Create a mapping and add it to the stack. + mapping = awslambda.EventSourceMapping( + id="EventSourceToLambdaMapping", + rp_FunctionName=awsfunc.p_FunctionName, + p_EventSourceArn=self._event_source.rv_Arn, + p_BatchSize=2500, + # LATEST - Read only new records. + # TRIM_HORIZON - Process all available records. + # AT_TIMESTAMP - Specify a time from which to start reading records. + p_StartingPosition="TRIM_HORIZON", + ra_DependsOn=awsfunc, + ) + return self.add(mapping) diff --git a/lorrystream/process/__init__.py b/lorrystream/process/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lorrystream/process/kinesis_cratedb_lambda.py b/lorrystream/process/kinesis_cratedb_lambda.py new file mode 100644 index 0000000..f7658e5 --- /dev/null +++ b/lorrystream/process/kinesis_cratedb_lambda.py @@ -0,0 +1,151 @@ +# Copyright (c) 2024 The Panodata Developers and contributors. +# Distributed under the terms of the Apache 2 license. +""" +Using an AWS Lambda, consume an AWS Kinesis Stream of CDC data, and relay +into CrateDB, re-materializing the original information into an OBJECT +column `data`. + +Currently supported CDC message formats: + +- AWS DMS +- AWS DynamoDB + +Details: +When using `ON_ERROR = exit`, the processor uses Linux exit codes for +signalling error conditions, see https://stackoverflow.com/a/76187305. 
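+
+Configuration is read from environment variables. Illustrative example; the
+defaults and permitted values are defined further below in this module:
+
+    MESSAGE_FORMAT=dynamodb            # or: dms
+    SINK_SQLALCHEMY_URL=crate://
+    SINK_TABLE=testdrive-dynamodb-cdc
+    COLUMN_TYPES=                      # optional column type map, JSON
+    ON_ERROR=exit                      # or: ignore, raise
+    USE_BATCH_PROCESSING=false
+    SQL_ECHO=false
+    LOG_LEVEL=INFO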
+ +Resources: +- https://docs.aws.amazon.com/lambda/latest/dg/with-kinesis-example.html +- https://docs.aws.amazon.com/lambda/latest/dg/python-logging.html +""" +# In order to run, this module/program needs the following +# 3rd party libraries, defined using inline script metadata. +# +# /// script +# requires-python = ">=3.9" +# dependencies = [ +# "commons-codec==0.0.3", +# "sqlalchemy-cratedb==0.38.0", +# ] +# /// +import base64 +import json +import logging +import os +import sys + +import sqlalchemy as sa +from commons_codec.exception import UnknownOperationError +from commons_codec.model import ColumnTypeMapStore +from commons_codec.transform.aws_dms import DMSTranslatorCrateDB +from commons_codec.transform.dynamodb import DynamoCDCTranslatorCrateDB +from sqlalchemy.util import asbool + +LOG_LEVEL: str = os.environ.get("LOG_LEVEL", "INFO") +USE_BATCH_PROCESSING: bool = asbool(os.environ.get("USE_BATCH_PROCESSING", "false")) +ON_ERROR: str = os.environ.get("ON_ERROR", "exit") +SQL_ECHO: bool = asbool(os.environ.get("SQL_ECHO", "false")) + +MESSAGE_FORMAT: str = os.environ.get("MESSAGE_FORMAT", "unknown") +COLUMN_TYPES: str = os.environ.get("COLUMN_TYPES", "") +SINK_SQLALCHEMY_URL: str = os.environ.get("SINK_SQLALCHEMY_URL", "crate://") +SINK_TABLE: str = os.environ.get("SINK_TABLE", "default") + +logger = logging.getLogger(__name__) +logger.setLevel(LOG_LEVEL) + + +# Sanity checks. +# If any value is invalid, terminate by signalling "22 - Invalid argument". +error_strategies = ["exit", "ignore", "raise"] +message_formats = ["dms", "dynamodb"] +if ON_ERROR not in error_strategies: + message = f"Invalid value for ON_ERROR: {ON_ERROR}. Use one of: {error_strategies}" + logger.fatal(message) + sys.exit(22) +if MESSAGE_FORMAT not in message_formats: + message = f"Invalid value for MESSAGE_FORMAT: {MESSAGE_FORMAT}. Use one of: {message_formats}" + logger.fatal(message) + sys.exit(22) +try: + column_types = ColumnTypeMapStore.from_json(COLUMN_TYPES) +except Exception as ex: + message = f"Invalid value for COLUMN_TYPES: {COLUMN_TYPES}. Reason: {ex}. Use JSON." + logger.fatal(message) + sys.exit(22) + +# TODO: Automatically create destination table. +# TODO: Propagate mapping definitions and other settings. +if MESSAGE_FORMAT == "dms": + cdc = DMSTranslatorCrateDB(column_types=column_types) +elif MESSAGE_FORMAT == "dynamodb": + cdc = DynamoCDCTranslatorCrateDB(table_name=SINK_TABLE) + +# Create the database connection outside the handler to allow +# connections to be re-used by subsequent function invocations. +# TODO: Examine long-running jobs about successful reconnection behavior. +try: + engine = sa.create_engine(SINK_SQLALCHEMY_URL, echo=SQL_ECHO) + connection = engine.connect() + logger.info(f"Connection to sink database succeeded: {SINK_SQLALCHEMY_URL}") +except Exception as ex: + logger.exception(f"Connection to sink database failed: {SINK_SQLALCHEMY_URL}") + if ON_ERROR == "exit": + # Signal "Resource temporarily unavailable" when connection to database fails. + sys.exit(11) + elif ON_ERROR == "ignore": + pass + elif ON_ERROR == "raise": + raise ex + + +def handler(event, context): + """ + Implement partial batch response for Lambda functions that receive events from + a Kinesis stream. The function reports the batch item failures in the response, + signaling to Lambda to retry those messages later. 
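+
+    With `USE_BATCH_PROCESSING` enabled, the return value has the shape
+    `{"batchItemFailures": [{"itemIdentifier": "<sequence number>"}]}`, which
+    corresponds to Lambda's partial batch response contract
+    (`ReportBatchItemFailures`).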
+ """ + + cur_record_sequence_number = "" + logger.debug("context: %s", context) + + for record in event["Records"]: + logger.debug(f"Record: {record}") + event_id = record["eventID"] + try: + + # Log and decode event. + # TODO: Remove log statements for better performance? + logger.debug(f"Processed Kinesis Event - EventID: {event_id}") + record_data = json.loads(base64.b64decode(record["kinesis"]["data"]).decode("utf-8")) + logger.debug(f"Record Data: {record_data}") + + # Process record. + sql = cdc.to_sql(record_data) + connection.execute(sa.text(sql)) + connection.commit() + + # Bookkeeping. + cur_record_sequence_number = record["kinesis"]["sequenceNumber"] + + except UnknownOperationError as ex: + logger.warning(f"Ignoring message. Reason: {ex}. Record: {ex.record}") + + except Exception as ex: + error_message = f"An error occurred processing event: {event_id}" + logger.exception(error_message) + if USE_BATCH_PROCESSING: + # Return failed record's sequence number. + return {"batchItemFailures": [{"itemIdentifier": cur_record_sequence_number}]} + if ON_ERROR == "exit": + # Signal "Input/output error" when error happens while processing data. + sys.exit(5) + elif ON_ERROR == "ignore": + pass + elif ON_ERROR == "raise": + raise ex + + logger.info(f"Successfully processed {len(event['Records'])} records") + if USE_BATCH_PROCESSING: + return {"batchItemFailures": []} + return None diff --git a/lorrystream/spike/__init__.py b/lorrystream/spike/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lorrystream/spike/kcl_dynamodb/.gitignore b/lorrystream/spike/kcl_dynamodb/.gitignore new file mode 100644 index 0000000..397b4a7 --- /dev/null +++ b/lorrystream/spike/kcl_dynamodb/.gitignore @@ -0,0 +1 @@ +*.log diff --git a/lorrystream/spike/kcl_dynamodb/__init__.py b/lorrystream/spike/kcl_dynamodb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lorrystream/spike/kcl_dynamodb/dynamodb_cdc_processor.properties b/lorrystream/spike/kcl_dynamodb/dynamodb_cdc_processor.properties new file mode 100644 index 0000000..fa70839 --- /dev/null +++ b/lorrystream/spike/kcl_dynamodb/dynamodb_cdc_processor.properties @@ -0,0 +1,89 @@ +# Configuration file for Kinesis Client Library (KCLv2). +# https://github.com/awslabs/amazon-kinesis-client/blob/v2.6.0/amazon-kinesis-client-multilang/src/main/java/software/amazon/kinesis/coordinator/KinesisClientLibConfiguration.java#L210-L245 + +# The script that abides by the multi-language protocol. This script will +# be executed by the MultiLangDaemon, which will communicate with this script +# over STDIN and STDOUT according to the multi-language protocol. +executableName = python dynamodb_cdc_processor.py + +# The name of an Amazon Kinesis stream to process. +streamName = dynamodb-cdc + +# Used by the KCL as the name of this application. Will be used as the name +# of an Amazon DynamoDB table which will store the lease and checkpoint +# information for workers with this application name +applicationName = dynamodb-cdc-leases + +# Users can change the credentials provider the KCL will use to retrieve credentials. +# The DefaultAWSCredentialsProviderChain checks several other providers, which is +# described here: +# http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html +AWSCredentialsProvider = DefaultAWSCredentialsProviderChain + +# Appended to the user agent of the KCL. Does not impact the functionality of the +# KCL in any other way. 
+processingLanguage = python/3.11
+
+# Valid options are TRIM_HORIZON or LATEST.
+# See http://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetShardIterator.html#API_GetShardIterator_RequestSyntax
+initialPositionInStream = TRIM_HORIZON
+
+# The following properties are also available for configuring the KCL Worker that is created
+# by the MultiLangDaemon.
+
+# The KCL defaults to us-east-1, this value is overridden by the set_region.py scripts
+regionName = eu-central-1
+
+# Failover time in milliseconds. A worker which does not renew its lease within this time interval
+# will be regarded as having problems, and its shards will be assigned to other workers.
+# For applications that have a large number of shards, this may be set to a higher number to reduce
+# the number of DynamoDB IOPS required for tracking leases.
+#failoverTimeMillis = 10000
+
+# A worker id that uniquely identifies this worker among all workers using the same applicationName.
+# If this isn't provided, a MultiLangDaemon instance will assign a unique workerId to itself.
+#workerId =
+
+# Shard sync interval in milliseconds - e.g. wait for this long between shard sync tasks.
+#shardSyncIntervalMillis = 60000
+
+# Max records to fetch from Kinesis in a single GetRecords call.
+#maxRecords = 10000
+
+# Idle time between record reads in milliseconds.
+#idleTimeBetweenReadsInMillis = 1000
+
+# Enables applications to flush/checkpoint (if they have some data "in progress", but don't get new data for a while).
+#callProcessRecordsEvenForEmptyRecordList = false
+
+# Interval in milliseconds between polling to check for parent shard completion.
+# Polling frequently will take up more DynamoDB IOPS (when there are leases for shards waiting on
+# completion of parent shards).
+#parentShardPollIntervalMillis = 10000
+
+# Clean up leases upon shard completion (don't wait until they expire in Kinesis).
+# Keeping leases takes some tracking/resources (e.g. they need to be renewed, assigned), so by default we try
+# to delete the ones we don't need any longer.
+#cleanupLeasesUponShardCompletion = true
+
+# Backoff time in milliseconds for Amazon Kinesis Client Library tasks (in the event of failures).
+#taskBackoffTimeMillis = 500
+
+# Buffer metrics for at most this long before publishing to CloudWatch.
+#metricsBufferTimeMillis = 10000
+
+# Buffer at most this many metrics before publishing to CloudWatch.
+#metricsMaxQueueSize = 10000
+
+# KCL will validate client-provided sequence numbers with a call to Amazon Kinesis before checkpointing for calls
+# to RecordProcessorCheckpointer#checkpoint(String) by default.
+#validateSequenceNumberBeforeCheckpointing = true
+
+# The maximum number of active threads for the MultiLangDaemon to permit.
+# If a value is provided then a FixedThreadPool is used with the maximum
+# active threads set to the provided value. If a non-positive integer or no
+# value is provided, a CachedThreadPool is used.
+#maxActiveThreads = 0
+
+# Whether to report metrics to CloudWatch.
+metricsLevel = none
diff --git a/lorrystream/spike/kcl_dynamodb/dynamodb_cdc_processor.py b/lorrystream/spike/kcl_dynamodb/dynamodb_cdc_processor.py
new file mode 100644
index 0000000..5ee3b4d
--- /dev/null
+++ b/lorrystream/spike/kcl_dynamodb/dynamodb_cdc_processor.py
@@ -0,0 +1,221 @@
+#!/usr/bin/python3
+
+# Copyright 2014-2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0 + +from __future__ import print_function + +import json +import logging +import logging.handlers as handlers +import os +import time +import typing as t + +from amazon_kclpy import kcl +from amazon_kclpy.v3 import processor +from commons_codec.transform.dynamodb import DynamoCDCTranslatorCrateDB +from cratedb_toolkit.util import DatabaseAdapter + +logger = logging.getLogger(__name__) + +IntOrNone = t.Union[int, None] +FloatOrNone = t.Union[float, None] + + +def setup_logging(logfile: str): + """ + Configure Python logger to write to file, because stdout is used by MultiLangDaemon. + """ + logger.setLevel(logging.INFO) + formatter = logging.Formatter( + "%(asctime)s.%(msecs)03d [%(module)s] %(levelname)s %(funcName)s - %(message)s", "%H:%M:%S" + ) + handler = handlers.RotatingFileHandler(logfile, maxBytes=10**6, backupCount=5) + handler.setLevel(logging.INFO) + handler.setFormatter(formatter) + logger.addHandler(handler) + + +class RecordProcessor(processor.RecordProcessorBase): + """ + Process data from a shard in a stream. Its methods will be called with this pattern: + + * initialize will be called once + * process_records will be called zero or more times + * shutdown will be called if this MultiLangDaemon instance loses the lease to this shard, or the shard ends due + a scaling change. + """ + + def __init__(self, sqlalchemy_url: t.Optional[str], table_name: t.Optional[str]): + self._SLEEP_SECONDS = 5 + self._CHECKPOINT_RETRIES = 5 + self._CHECKPOINT_FREQ_SECONDS = 60 + self._largest_seq: t.Tuple[IntOrNone, IntOrNone] = (None, None) + self._largest_sub_seq = None + self._last_checkpoint_time: FloatOrNone = None + + self.sqlalchemy_url = sqlalchemy_url + self.table_name = table_name + + # Sanity checks. + if self.sqlalchemy_url is None: + raise ValueError("SQLAlchemy URL must not be empty") + if self.table_name is None: + raise ValueError("Target CDC table name must not be empty") + + self.cratedb = DatabaseAdapter(dburi=self.sqlalchemy_url) + self.table_name = self.table_name + self.cdc = DynamoCDCTranslatorCrateDB(table_name=self.table_name) + + def initialize(self, initialize_input): + """ + Called once by a KCLProcess before any calls to process_records + + :param amazon_kclpy.messages.InitializeInput initialize_input: Information about the lease that this record + processor has been assigned. + """ + self._largest_seq = (None, None) + self._last_checkpoint_time = time.time() + + def checkpoint(self, checkpointer, sequence_number=None, sub_sequence_number=None): + """ + Checkpoints with retries on retryable exceptions. + + :param amazon_kclpy.kcl.Checkpointer checkpointer: the checkpointer provided to either process_records + or shutdown + :param str or None sequence_number: the sequence number to checkpoint at. + :param int or None sub_sequence_number: the sub sequence number to checkpoint at. + """ + for n in range(0, self._CHECKPOINT_RETRIES): + try: + checkpointer.checkpoint(sequence_number, sub_sequence_number) + return + except kcl.CheckpointError as e: + if "ShutdownException" == e.value: + # + # A ShutdownException indicates that this record processor should be shutdown. This is due to + # some failover event, e.g. another MultiLangDaemon has taken the lease for this shard. + # + logging.error("Encountered shutdown exception, skipping checkpoint") + return + elif "ThrottlingException" == e.value: + # + # A ThrottlingException indicates that one of our dependencies is is over burdened, e.g. too many + # dynamo writes. 
We will sleep temporarily to let it recover. + # + if self._CHECKPOINT_RETRIES - 1 == n: + logging.error("Failed to checkpoint after {n} attempts, giving up.\n".format(n=n)) + return + else: + logging.info( + "Was throttled while checkpointing, will attempt again in {s} seconds".format( + s=self._SLEEP_SECONDS + ) + ) + elif "InvalidStateException" == e.value: + logging.error("MultiLangDaemon reported an invalid state while checkpointing.\n") + else: # Some other error + logging.error("Encountered an error while checkpointing, error was {e}.\n".format(e=e)) + time.sleep(self._SLEEP_SECONDS) + + def process_record(self, data, partition_key, sequence_number, sub_sequence_number): + """ + Convert record, which is a DynamoDB CDC event item, into an SQL statement, + and submit to downstream database. + + :param str data: The blob of data that was contained in the record. + :param str partition_key: The key associated with this recod. + :param int sequence_number: The sequence number associated with this record. + :param int sub_sequence_number: the sub sequence number associated with this record. + """ + + sql = None + try: + cdc_event = json.loads(data) + logger.info("CDC event: %s", cdc_event) + + sql = self.cdc.to_sql(cdc_event) + logger.info("SQL: %s", sql) + except Exception: + logger.exception("Decoding CDC event failed") + + if not sql: + return + + try: + self.cratedb.run_sql(sql) + except Exception: + logger.exception("Writing CDC event to sink database failed") + + def should_update_sequence(self, sequence_number, sub_sequence_number): + """ + Determines whether a new larger sequence number is available + + :param int sequence_number: the sequence number from the current record + :param int sub_sequence_number: the sub sequence number from the current record + :return boolean: true if the largest sequence should be updated, false otherwise + """ + return ( + self._largest_seq == (None, None) + or sequence_number > self._largest_seq[0] + or (sequence_number == self._largest_seq[0] and sub_sequence_number > self._largest_seq[1]) + ) + + def process_records(self, process_records_input): + """ + Called by a KCLProcess with a list of records to be processed and a checkpointer which accepts sequence numbers + from the records to indicate where in the stream to checkpoint. + + :param amazon_kclpy.messages.ProcessRecordsInput process_records_input: the records, and metadata about the + records. + """ + try: + for record in process_records_input.records: + data = record.binary_data + seq = int(record.sequence_number) + sub_seq = record.sub_sequence_number + key = record.partition_key + self.process_record(data, key, seq, sub_seq) + if self.should_update_sequence(seq, sub_seq): + self._largest_seq = (seq, sub_seq) + + # + # Checkpoints every self._CHECKPOINT_FREQ_SECONDS seconds + # + if self._last_checkpoint_time and time.time() - self._last_checkpoint_time > self._CHECKPOINT_FREQ_SECONDS: + self.checkpoint(process_records_input.checkpointer, str(self._largest_seq[0]), self._largest_seq[1]) + self._last_checkpoint_time = time.time() + + except Exception as e: + logging.error("Encountered an exception while processing records. 
Exception was {e}\n".format(e=e)) + + def lease_lost(self, lease_lost_input): + logging.warn("Lease has been lost") + + def shard_ended(self, shard_ended_input): + logging.warn("Shard has ended checkpointing") + shard_ended_input.checkpointer.checkpoint() + + def shutdown_requested(self, shutdown_requested_input): + logging.warn("Shutdown has been requested, checkpointing.") + shutdown_requested_input.checkpointer.checkpoint() + + +def main(): + # Set up logging. + logfile = os.environ.get("CDC_LOGFILE", "cdc.log") + setup_logging(logfile) + + # Setup processor. + sqlalchemy_url = os.environ.get("CDC_SQLALCHEMY_URL") + table_name = os.environ.get("CDC_TABLE_NAME") + kcl_processor = RecordProcessor(sqlalchemy_url=sqlalchemy_url, table_name=table_name) + + # Invoke machinery. + kcl_process = kcl.KCLProcess(kcl_processor) + kcl_process.run() + + +if __name__ == "__main__": + main() diff --git a/lorrystream/spike/kcl_dynamodb/launch.sh b/lorrystream/spike/kcl_dynamodb/launch.sh new file mode 100644 index 0000000..05d7ca5 --- /dev/null +++ b/lorrystream/spike/kcl_dynamodb/launch.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +# Configure record processor. +export CDC_SQLALCHEMY_URL=crate:// +export CDC_TABLE_NAME=transactions +export CDC_LOGFILE=dynamodb_cdc_processor.log + +# Invoke KCL launcher. +KCLPY_PATH=$(python -c 'import amazon_kclpy; print(amazon_kclpy.__path__[0])') +/usr/bin/java \ + -DstreamName=dynamodb-cdc-nested \ + -cp "${KCLPY_PATH}/jars/*" \ + software.amazon.kinesis.multilang.MultiLangDaemon \ + --properties-file "$1" \ + --log-configuration logback.xml diff --git a/lorrystream/spike/kcl_dynamodb/logback.xml b/lorrystream/spike/kcl_dynamodb/logback.xml new file mode 100644 index 0000000..afaebf8 --- /dev/null +++ b/lorrystream/spike/kcl_dynamodb/logback.xml @@ -0,0 +1,14 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + \ No newline at end of file diff --git a/lorrystream/spike/kcl_dynamodb/requirements.txt b/lorrystream/spike/kcl_dynamodb/requirements.txt new file mode 100644 index 0000000..a8f1c89 --- /dev/null +++ b/lorrystream/spike/kcl_dynamodb/requirements.txt @@ -0,0 +1,3 @@ +amazon-kclpy==2.1.5 +awscli==1.33.* +boto3<1.35 diff --git a/lorrystream/spike/kcl_kinesis/.gitignore b/lorrystream/spike/kcl_kinesis/.gitignore new file mode 100644 index 0000000..397b4a7 --- /dev/null +++ b/lorrystream/spike/kcl_kinesis/.gitignore @@ -0,0 +1 @@ +*.log diff --git a/lorrystream/spike/kcl_kinesis/__init__.py b/lorrystream/spike/kcl_kinesis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lorrystream/spike/kcl_kinesis/amazon_kclpy_helper.py b/lorrystream/spike/kcl_kinesis/amazon_kclpy_helper.py new file mode 100644 index 0000000..9494f6a --- /dev/null +++ b/lorrystream/spike/kcl_kinesis/amazon_kclpy_helper.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python +# Copyright 2014-2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +# ruff: noqa: B006,E501 +""" +This script provides two utility functions: + + ``--print_classpath`` + which prints a java class path. It optionally takes --properties + and any number of --path options. It will generate a java class path which will include + the properties file and paths and the location of the KCL jars based on the location of + the amazon_kclpy.kcl module. + + ``--print_command`` + which prints a command to run an Amazon KCLpy application. 
It requires a --java + and --properties argument and optionally takes any number of --path arguments to prepend + to the classpath that it generates for the command. +""" +from __future__ import print_function + +import argparse +import os +import sys +from glob import glob + +import samples +from amazon_kclpy import kcl + + +def get_dir_of_file(f): + """ + Returns the absolute path to the directory containing the specified file. + + :type f: str + :param f: A path to a file, either absolute or relative + + :rtype: str + :return: The absolute path of the directory represented by the relative path provided. + """ + return os.path.dirname(os.path.abspath(f)) + + +def get_kcl_dir(): + """ + Returns the absolute path to the dir containing the amazon_kclpy.kcl module. + + :rtype: str + :return: The absolute path of the KCL package. + """ + return get_dir_of_file(kcl.__file__) + + +def get_kcl_jar_path(): + """ + Returns the absolute path to the KCL jars needed to run an Amazon KCLpy app. + + :rtype: str + :return: The absolute path of the KCL jar files needed to run the MultiLangDaemon. + """ + return ":".join(glob(os.path.join(get_kcl_dir(), "jars", "*jar"))) + + +def get_kcl_classpath(properties=None, paths=[]): + """ + Generates a classpath that includes the location of the kcl jars, the + properties file and the optional paths. + + :type properties: str + :param properties: Path to properties file. + + :type paths: list + :param paths: List of strings. The paths that will be prepended to the classpath. + + :rtype: str + :return: A java class path that will allow your properties to be found and the MultiLangDaemon and its deps and + any custom paths you provided. + """ + # First make all the user provided paths absolute + paths = [os.path.abspath(p) for p in paths] + # We add our paths after the user provided paths because this permits users to + # potentially inject stuff before our paths (otherwise our stuff would always + # take precedence). + paths.append(get_kcl_jar_path()) + if properties: + # Add the dir that the props file is in + dir_of_file = get_dir_of_file(properties) + paths.append(dir_of_file) + return ":".join([p for p in paths if p != ""]) + + +def get_kcl_app_command(args, multi_lang_daemon_class, properties, log_configuration, paths=[]): + """ + Generates a command to run the MultiLangDaemon. + + :type java: str + :param java: Path to java + + :type multi_lang_daemon_class: str + :param multi_lang_daemon_class: Name of multi language daemon class e.g. com.amazonaws.services.kinesis.multilang.MultiLangDaemon + + :type properties: str + :param properties: Optional properties file to be included in the classpath. + + :type paths: list + :param paths: List of strings. Additional paths to prepend to the classpath. + + :rtype: str + :return: A command that will run the MultiLangDaemon with your properties and custom paths and java. 
+ """ + return "{java} -cp {cp} {daemon} {props} {log_config}".format( + java=args.java, + cp=get_kcl_classpath(args.properties, paths), + daemon=multi_lang_daemon_class, + # Just need the basename because the path is added to the classpath + props=properties, + log_config=log_configuration, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("A script for generating a command to run an Amazon KCLpy app") + parser.add_argument( + "--print_classpath", + dest="print_classpath", + action="store_true", + default=False, + help="Print a java class path.\noptional arguments: --path", + ) + parser.add_argument( + "--print_command", + dest="print_command", + action="store_true", + default=False, + help="Print a command for running an Amazon KCLpy app.\nrequired " + + "args: --java --properties\noptional args: --classpath", + ) + parser.add_argument( + "-j", + "--java", + dest="java", + help="The path to the java executable e.g. /jdk/bin/java", + metavar="PATH_TO_JAVA", + ) + parser.add_argument( + "-p", + "--properties", + "--props", + "--prop", + dest="properties", + help="The path to a properties file (relative to where you are running this script)", + metavar="PATH_TO_PROPERTIES", + ) + parser.add_argument( + "--sample", + "--sample-props", + "--use-sample-properties", + dest="use_sample_props", + help="This will use the sample.properties file included in this package as the properties file.", + action="store_true", + default=False, + ) + parser.add_argument( + "-c", + "--classpath", + "--path", + dest="paths", + action="append", + default=[], + help="Additional path to add to java class path. May be specified any number of times", + metavar="PATH", + ) + parser.add_argument( + "-l", + "--log-configuration", + dest="log_configuration", + help="This will use the logback.xml which will be used by the KCL to log.", + metavar="PATH_TO_LOG_CONFIGURATION", + ) + args = parser.parse_args() + # Possibly replace the properties with the sample. Useful if they just want to run the sample app. 
+ if args.use_sample_props: + if args.properties: + sys.stderr.write("Replacing provided properties with sample properties due to arg --sample\n") + args.properties = os.path.join(get_dir_of_file(samples.__file__), "record_processor.properties") + + # Print what the asked for + if args.print_classpath: + print(get_kcl_classpath(args.properties, args.paths)) + elif args.print_command: + if args.java and args.properties: + multi_lang_daemon_class = "software.amazon.kinesis.multilang.MultiLangDaemon" + properties_argument = "--properties-file {props}".format(props=args.properties) + log_argument = "" + if args.log_configuration is not None: + log_argument = "--log-configuration {log}".format(log=args.log_configuration) + print( + get_kcl_app_command(args, multi_lang_daemon_class, properties_argument, log_argument, paths=args.paths) + ) + else: + sys.stderr.write("Must provide arguments: --java and --properties\n") + parser.print_usage() + else: + parser.print_usage() diff --git a/lorrystream/spike/kcl_kinesis/launch.sh b/lorrystream/spike/kcl_kinesis/launch.sh new file mode 100644 index 0000000..c2b7108 --- /dev/null +++ b/lorrystream/spike/kcl_kinesis/launch.sh @@ -0,0 +1 @@ +python amazon_kclpy_helper.py --print_command --java /usr/bin/java --properties "$1" --log-configuration logback.xml diff --git a/lorrystream/spike/kcl_kinesis/logback.xml b/lorrystream/spike/kcl_kinesis/logback.xml new file mode 100644 index 0000000..afaebf8 --- /dev/null +++ b/lorrystream/spike/kcl_kinesis/logback.xml @@ -0,0 +1,14 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + \ No newline at end of file diff --git a/lorrystream/spike/kcl_kinesis/record_processor.properties b/lorrystream/spike/kcl_kinesis/record_processor.properties new file mode 100644 index 0000000..5294f2a --- /dev/null +++ b/lorrystream/spike/kcl_kinesis/record_processor.properties @@ -0,0 +1,83 @@ +# The script that abides by the multi-language protocol. This script will +# be executed by the MultiLangDaemon, which will communicate with this script +# over STDIN and STDOUT according to the multi-language protocol. +executableName = python record_processor.py + +# The name of an Amazon Kinesis stream to process. +streamName = testdrive-stream + +# Used by the KCL as the name of this application. Will be used as the name +# of an Amazon DynamoDB table which will store the lease and checkpoint +# information for workers with this application name +applicationName = stream-demo + +# Users can change the credentials provider the KCL will use to retrieve credentials. +# The DefaultAWSCredentialsProviderChain checks several other providers, which is +# described here: +# http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html +AWSCredentialsProvider = DefaultAWSCredentialsProviderChain + +# Appended to the user agent of the KCL. Does not impact the functionality of the +# KCL in any other way. +processingLanguage = python/3.11 + +# Valid options at TRIM_HORIZON or LATEST. +# See http://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetShardIterator.html#API_GetShardIterator_RequestSyntax +initialPositionInStream = TRIM_HORIZON + +# The following properties are also available for configuring the KCL Worker that is created +# by the MultiLangDaemon. + +# The KCL defaults to us-east-1, this value is overridden by the set_region.py scripts +regionName = eu-central-1 + +# Fail over time in milliseconds. 
A worker which does not renew it's lease within this time interval +# will be regarded as having problems and it's shards will be assigned to other workers. +# For applications that have a large number of shards, this msy be set to a higher number to reduce +# the number of DynamoDB IOPS required for tracking leases +#failoverTimeMillis = 10000 + +# A worker id that uniquely identifies this worker among all workers using the same applicationName +# If this isn't provided a MultiLangDaemon instance will assign a unique workerId to itself. +#workerId = + +# Shard sync interval in milliseconds - e.g. wait for this long between shard sync tasks. +#shardSyncIntervalMillis = 60000 + +# Max records to fetch from Kinesis in a single GetRecords call. +#maxRecords = 10000 + +# Idle time between record reads in milliseconds. +#idleTimeBetweenReadsInMillis = 1000 + +# Enables applications flush/checkpoint (if they have some data "in progress", but don't get new data for while) +#callProcessRecordsEvenForEmptyRecordList = false + +# Interval in milliseconds between polling to check for parent shard completion. +# Polling frequently will take up more DynamoDB IOPS (when there are leases for shards waiting on +# completion of parent shards). +#parentShardPollIntervalMillis = 10000 + +# Cleanup leases upon shards completion (don't wait until they expire in Kinesis). +# Keeping leases takes some tracking/resources (e.g. they need to be renewed, assigned), so by default we try +# to delete the ones we don't need any longer. +#cleanupLeasesUponShardCompletion = true + +# Backoff time in milliseconds for Amazon Kinesis Client Library tasks (in the event of failures). +#taskBackoffTimeMillis = 500 + +# Buffer metrics for at most this long before publishing to CloudWatch. +#metricsBufferTimeMillis = 10000 + +# Buffer at most this many metrics before publishing to CloudWatch. +#metricsMaxQueueSize = 10000 + +# KCL will validate client provided sequence numbers with a call to Amazon Kinesis before checkpointing for calls +# to RecordProcessorCheckpointer#checkpoint(String) by default. +#validateSequenceNumberBeforeCheckpointing = true + +# The maximum number of active threads for the MultiLangDaemon to permit. +# If a value is provided then a FixedThreadPool is used with the maximum +# active threads set to the provided value. If a non-positive integer or no +# value is provided a CachedThreadPool is used. +#maxActiveThreads = 0 diff --git a/lorrystream/spike/kcl_kinesis/record_processor.py b/lorrystream/spike/kcl_kinesis/record_processor.py new file mode 100644 index 0000000..8bebbe2 --- /dev/null +++ b/lorrystream/spike/kcl_kinesis/record_processor.py @@ -0,0 +1,171 @@ +#!/usr/bin/python3 + +# Copyright 2014-2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +from __future__ import print_function + +import logging +import logging.handlers as handlers +import time +import typing as t + +from amazon_kclpy import kcl +from amazon_kclpy.v3 import processor + +# Logger writes to file because stdout is used by MultiLangDaemon +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +formatter = logging.Formatter( + "%(asctime)s.%(msecs)03d [%(module)s] %(levelname)s %(funcName)s - %(message)s", "%H:%M:%S" +) +handler = handlers.RotatingFileHandler("record_processor.log", maxBytes=10**6, backupCount=5) +handler.setLevel(logging.INFO) +handler.setFormatter(formatter) +logger.addHandler(handler) + + +IntOrNone = t.Union[int, None] + + +class RecordProcessor(processor.RecordProcessorBase): + """ + A RecordProcessor processes data from a shard in a stream. Its methods will be called with this pattern: + + * initialize will be called once + * process_records will be called zero or more times + * shutdown will be called if this MultiLangDaemon instance loses the lease to this shard, or the shard ends due + a scaling change. + """ + + def __init__(self): + self._SLEEP_SECONDS = 5 + self._CHECKPOINT_RETRIES = 5 + self._CHECKPOINT_FREQ_SECONDS = 60 + self._largest_seq: t.Tuple[IntOrNone, IntOrNone] = (None, None) + self._largest_sub_seq = None + self._last_checkpoint_time = None + + def initialize(self, initialize_input): + """ + Called once by a KCLProcess before any calls to process_records + + :param amazon_kclpy.messages.InitializeInput initialize_input: Information about the lease that this record + processor has been assigned. + """ + self._largest_seq = (None, None) + self._last_checkpoint_time = time.time() + + def checkpoint(self, checkpointer, sequence_number=None, sub_sequence_number=None): + """ + Checkpoints with retries on retryable exceptions. + + :param amazon_kclpy.kcl.Checkpointer checkpointer: the checkpointer provided to either process_records + or shutdown + :param str or None sequence_number: the sequence number to checkpoint at. + :param int or None sub_sequence_number: the sub sequence number to checkpoint at. + """ + for n in range(0, self._CHECKPOINT_RETRIES): + try: + checkpointer.checkpoint(sequence_number, sub_sequence_number) + return + except kcl.CheckpointError as e: + if "ShutdownException" == e.value: + # + # A ShutdownException indicates that this record processor should be shutdown. This is due to + # some failover event, e.g. another MultiLangDaemon has taken the lease for this shard. + # + logging.error("Encountered shutdown exception, skipping checkpoint") + return + elif "ThrottlingException" == e.value: + # + # A ThrottlingException indicates that one of our dependencies is is over burdened, e.g. too many + # dynamo writes. We will sleep temporarily to let it recover. + # + if self._CHECKPOINT_RETRIES - 1 == n: + logging.error("Failed to checkpoint after {n} attempts, giving up.\n".format(n=n)) + return + else: + logging.info( + "Was throttled while checkpointing, will attempt again in {s} seconds".format( + s=self._SLEEP_SECONDS + ) + ) + elif "InvalidStateException" == e.value: + logging.error("MultiLangDaemon reported an invalid state while checkpointing.\n") + else: # Some other error + logging.error("Encountered an error while checkpointing, error was {e}.\n".format(e=e)) + time.sleep(self._SLEEP_SECONDS) + + def process_record(self, data, partition_key, sequence_number, sub_sequence_number): + """ + Called for each record that is passed to process_records. 
+ + :param str data: The blob of data that was contained in the record. + :param str partition_key: The key associated with this recod. + :param int sequence_number: The sequence number associated with this record. + :param int sub_sequence_number: the sub sequence number associated with this record. + """ + #################################### + # Insert your processing logic here + #################################### + + logger.info(data.decode("UTF-8")) + + def should_update_sequence(self, sequence_number, sub_sequence_number): + """ + Determines whether a new larger sequence number is available + + :param int sequence_number: the sequence number from the current record + :param int sub_sequence_number: the sub sequence number from the current record + :return boolean: true if the largest sequence should be updated, false otherwise + """ + return ( + self._largest_seq == (None, None) + or sequence_number > self._largest_seq[0] + or (sequence_number == self._largest_seq[0] and sub_sequence_number > self._largest_seq[1]) + ) + + def process_records(self, process_records_input): + """ + Called by a KCLProcess with a list of records to be processed and a checkpointer which accepts sequence numbers + from the records to indicate where in the stream to checkpoint. + + :param amazon_kclpy.messages.ProcessRecordsInput process_records_input: the records, and metadata about the + records. + """ + try: + for record in process_records_input.records: + data = record.binary_data + seq = int(record.sequence_number) + sub_seq = record.sub_sequence_number + key = record.partition_key + self.process_record(data, key, seq, sub_seq) + if self.should_update_sequence(seq, sub_seq): + self._largest_seq = (seq, sub_seq) + + # + # Checkpoints every self._CHECKPOINT_FREQ_SECONDS seconds + # + if self._last_checkpoint_time and time.time() - self._last_checkpoint_time > self._CHECKPOINT_FREQ_SECONDS: + self.checkpoint(process_records_input.checkpointer, str(self._largest_seq[0]), self._largest_seq[1]) + self._last_checkpoint_time = time.time() + + except Exception as e: + logging.error("Encountered an exception while processing records. 
Exception was {e}\n".format(e=e))
+
+    def lease_lost(self, lease_lost_input):
+        logging.warn("Lease has been lost")
+
+    def shard_ended(self, shard_ended_input):
+        logging.warn("Shard has ended checkpointing")
+        shard_ended_input.checkpointer.checkpoint()
+
+    def shutdown_requested(self, shutdown_requested_input):
+        logging.warn("Shutdown has been requested, checkpointing.")
+        shutdown_requested_input.checkpointer.checkpoint()
+
+
+if __name__ == "__main__":
+    kcl_process = kcl.KCLProcess(RecordProcessor())
+    kcl_process.run()
diff --git a/lorrystream/spike/kcl_kinesis/requirements.txt b/lorrystream/spike/kcl_kinesis/requirements.txt
new file mode 100644
index 0000000..65e8999
--- /dev/null
+++ b/lorrystream/spike/kcl_kinesis/requirements.txt
@@ -0,0 +1 @@
+amazon-kclpy==2.1.5
diff --git a/lorrystream/spike/kinesis/__init__.py b/lorrystream/spike/kinesis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lorrystream/util/common.py b/lorrystream/util/common.py
index f245e1e..6ff5a40 100644
--- a/lorrystream/util/common.py
+++ b/lorrystream/util/common.py
@@ -23,7 +23,7 @@ def setup_logging_basic(level=logging.INFO):
 
 def setup_logging(level=logging.INFO):
     reset = escape_codes["reset"]
-    log_format = f"%(asctime)-15s [%(name)-28s] %(log_color)s%(levelname)-8s:{reset} %(message)s"
+    log_format = f"%(asctime)-15s [%(name)-30s] %(log_color)s%(levelname)-8s:{reset} %(message)s"
 
     handler = colorlog.StreamHandler()
     handler.setFormatter(colorlog.ColoredFormatter(log_format))
diff --git a/lorrystream/util/python/__init__.py b/lorrystream/util/python/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lorrystream/util/python/bundle.py b/lorrystream/util/python/bundle.py
new file mode 100644
index 0000000..a67a4cd
--- /dev/null
+++ b/lorrystream/util/python/bundle.py
@@ -0,0 +1,20 @@
+import typing as t
+from pathlib import Path
+
+from lorrystream.util.python.pep723 import read_inline_script_metadata
+
+
+def collect_requirements(*artifacts: t.Union[str, Path]):
+    """
+    Collect dependencies from script metadata, as per PEP 723.
+    """
+    dependencies: t.List[str] = []
+    for artifact in artifacts:
+        if isinstance(artifact, Path):
+            payload = artifact.read_text()
+        else:
+            payload = artifact
+        metadata = read_inline_script_metadata(payload)
+        if isinstance(metadata, dict):
+            dependencies += metadata.get("dependencies", [])
+    return dependencies
diff --git a/lorrystream/util/python/pep723.py b/lorrystream/util/python/pep723.py
new file mode 100644
index 0000000..9eaf2be
--- /dev/null
+++ b/lorrystream/util/python/pep723.py
@@ -0,0 +1,27 @@
+import re
+import typing as t
+
+import tomli
+
+PEP_723_REGEX = r"(?m)^# /// (?P<type>[a-zA-Z0-9-]+)$\s(?P<content>(^#(| .*)$\s)+)^# ///$"
+
+
+def read_inline_script_metadata(script: str) -> t.Dict[str, t.Any]:
+    """
+    Reference implementation to read inline script metadata (PEP 723).
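+
+    Illustrative example: a script containing the metadata block
+
+        # /// script
+        # dependencies = ["tomli"]
+        # ///
+
+    yields `{"dependencies": ["tomli"]}`.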
+ + https://packaging.python.org/en/latest/specifications/inline-script-metadata/ + https://peps.python.org/pep-0723/ + """ + name = "script" + matches = list(filter(lambda m: m.group("type") == name, re.finditer(PEP_723_REGEX, script))) + if len(matches) > 1: + raise ValueError(f"Multiple {name} blocks found") + if len(matches) == 1: + content = "".join( + line[2:] if line.startswith("# ") else line[1:] + for line in matches[0].group("content").splitlines(keepends=True) + ) + return tomli.loads(content) + else: + return {} diff --git a/pyproject.toml b/pyproject.toml index db80ae6..55eb989 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ keywords = [ "data", "export", "import", + "kinesis", "mqtt", "pandas", "rdbms", @@ -86,6 +87,7 @@ dependencies = [ "click<9", "colorama<1", "colorlog", + "commons-codec==0.0.3", "dask", "funcy", "influxdb", @@ -93,11 +95,24 @@ dependencies = [ "paho-mqtt", "pandas<2.3", "pika<1.4", + "simplejson<4", "sqlalchemy==2.0.*", "sqlalchemy-cratedb==0.38.0", "streamz", + "tomli", "toolz", ] +optional-dependencies.all = [ + "lorrystream[carabas]", +] +optional-dependencies.carabas = [ + "aiobotocore==2.13.*", # for async-kinesis + "async-kinesis<1.2", + "aws-lambda-layer<0.6", + "boto3==1.34.*", # for async-kinesis + "cottonformation<1.2", + "localstack[base-runtime]<3.7", +] optional-dependencies.develop = [ "black<25", "mypy<1.12", @@ -121,11 +136,18 @@ optional-dependencies.release = [ "twine<6", ] optional-dependencies.test = [ + # Problem: Breaks with requests 2.32.0: Not supported URL scheme http+docker. + # Solution: Pin `docker` and `requests` packages. + # https://github.com/docker/docker-py/issues/3256#issuecomment-2126888985 "cratedb-toolkit[testing]==0.0.15", + "docker<7", + "localstack-utils<1.1", "pytest<9", - "pytest-asyncio-cooperative", + "pytest-asyncio-cooperative<0.30", "pytest-cov<6", + "pytest-mock<4", "pytest-mqtt>=0.4.2,<0.5", + "requests==2.28.1", "testcontainer-python-rabbitmq==0.4.*", ] urls.Changelog = "https://lorrystream.readthedocs.io/changes.html" @@ -147,13 +169,18 @@ namespaces = false [tool.black] line-length = 120 -extend-exclude = "lorrystream/streamz/amqp.py" +force-exclude = ''' + lorrystream/streamz/amqp.py +| lorrystream/carabas/aws/cf/.*\.py +''' [tool.ruff] line-length = 120 extend-exclude = [ "amqp-to-mqtt.py", + "dms_next\\.py$", + "lorrystream/carabas/aws/cf/*.py", "lorrystream/streamz/amqp_async.py", "lorrystream/streamz/amqp_blocking.py", "workbench.py", @@ -198,13 +225,19 @@ lint.extend-ignore = [ "RET505", ] +lint.per-file-ignores."amazon_kclpy_helper.py" = [ "T201" ] # Allow `print` lint.per-file-ignores."examples/*" = [ "T201" ] # Allow `print` lint.per-file-ignores."lorrystream/util/about.py" = [ "T201" ] # Allow `print` +lint.per-file-ignores."test_*.py" = [ "S101" ] # Use of `assert` detected lint.per-file-ignores."tests/*" = [ "S101" ] # Use of `assert` detected [tool.pytest.ini_options] +# Because synchronous and asynchronous tests are mixed, +# and maybe because of woes with pytest fixtures, the +# test suite must turn off concurrency. addopts = """ -rA --verbosity=3 + --max-asyncio-tasks=1 --asyncio-task-timeout=30 --cov --cov-report=term-missing --cov-report=xml """ minversion = "2.0" @@ -224,6 +257,10 @@ markers = [ branch = false omit = [ "tests/*", + "lorrystream/carabas/aws/function/zip.py", + "lorrystream/spike/*", + # It is tested, but code coverage tracking does not work well. 
+ "lorrystream/process/kinesis_cratedb_lambda.py", ] source = [ "lorrystream" ] @@ -234,6 +271,7 @@ show_missing = true [tool.mypy] packages = [ "lorrystream" ] exclude = [ + "dms_next.py", "lorrystream/streamz/amqp_async.py", "lorrystream/streamz/amqp_blocking.py", ] @@ -243,10 +281,18 @@ implicit_optional = true install_types = true non_interactive = true +[[tool.mypy.overrides]] +module = "lorrystream.carabas.aws.cf.*" +follow_imports = "silent" + [tool.versioningit.vcs] method = "git" default-tag = "0.0.0" +# =================== +# Tasks configuration +# =================== + [tool.poe.tasks] check = [ diff --git a/release/oci/Dockerfile b/release/oci/Dockerfile index 182bbde..69b7180 100644 --- a/release/oci/Dockerfile +++ b/release/oci/Dockerfile @@ -21,7 +21,7 @@ COPY . /src # Install package. RUN --mount=type=cache,id=pip,target=/root/.cache/pip \ - pip install --use-pep517 --prefer-binary '/src' + pip install --use-pep517 --prefer-binary '/src[all]' # Uninstall Git again. RUN apt-get --yes remove --purge git && apt-get --yes autoremove diff --git a/tests/carabas/__init__.py b/tests/carabas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/carabas/test_dms.py b/tests/carabas/test_dms.py new file mode 100644 index 0000000..e311208 --- /dev/null +++ b/tests/carabas/test_dms.py @@ -0,0 +1,10 @@ +def test_endpoint_port_integer(): + """ + Verify p_Port is defined as an integer. + + TODO: Does not perform the validation yet. How? + """ + from lorrystream.carabas.aws.cf.dms_next import Endpoint + + ep = Endpoint("foobar", rp_EndpointType="foo", rp_EngineName="bar") + assert hasattr(ep, "p_Port") diff --git a/tests/carabas/test_function.py b/tests/carabas/test_function.py new file mode 100644 index 0000000..d721942 --- /dev/null +++ b/tests/carabas/test_function.py @@ -0,0 +1,43 @@ +from pathlib import Path + +import pytest +from cottonformation.res import awslambda + +from lorrystream.carabas.aws import LambdaFactory, LambdaPythonImage +from lorrystream.carabas.aws.model import GenericEnvStack + + +@pytest.mark.skip(reason="Needs adjustments for LocalStack-only operations") +def test_python_dockerfile(): + python_image = LambdaPythonImage( + name="kinesis-cratedb-lambda", + entrypoint_file=Path("./lorrystream/process/kinesis_cratedb_lambda.py"), + entrypoint_handler="kinesis_cratedb_lambda.handler", + ) + dockerfile = python_image.get_dockerfile() + assert "FROM public.ecr.aws/lambda/python:" in dockerfile + assert "COPY kinesis_cratedb_lambda.py ${LAMBDA_TASK_ROOT}" in dockerfile + + +@pytest.mark.skip(reason="Needs adjustments for LocalStack-only operations") +def test_lambda_python(): + python_image = LambdaPythonImage( + name="kinesis-cratedb-lambda", + entrypoint_file=Path("./lorrystream/process/kinesis_cratedb_lambda.py"), + entrypoint_handler="kinesis_cratedb_lambda.handler", + ) + lf = LambdaFactory( + name="FoobarProcessor", + oci_uri=python_image.uri, + handler=python_image.entrypoint_handler, + ) + assert "kinesis-cratedb-lambda:latest" in lf.oci_uri + + stack = GenericEnvStack( + project="testdrive", + stage="test", + region="eu-central-1", + description="Foobar Pipeline", + ) + lambda_function = lf.make(stack) + assert isinstance(lambda_function.function, awslambda.Function) diff --git a/tests/conftest.py b/tests/conftest.py index 039ad4a..d44706b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,13 +6,16 @@ from lorrystream.util.common import setup_logging from .fixtures.amqp import rabbitmq, rabbitmq_service # noqa: F401 +from 
.fixtures.localstack import boto3_configure_localstack, boto3_session, localstack, localstack_service # noqa: F401 @pytest.fixture def cratedb(cratedb_service): cratedb_service.reset( [ + "public.foo", "testdrive-amqp", + "testdrive-dynamodb-cdc", "testdrive-mqtt", ] ) diff --git a/tests/fixtures/localstack.py b/tests/fixtures/localstack.py new file mode 100644 index 0000000..01d100b --- /dev/null +++ b/tests/fixtures/localstack.py @@ -0,0 +1,64 @@ +import os +import socket +import time + +import boto3 +import botocore +import pytest +from localstack_utils.localstack import startup_localstack, stop_localstack + +from lorrystream.util.data import asbool + +TEST_STREAMS = [ + "test", + "testdrive", +] + + +def isUp(host, port): + """ + Test if a host is up. + + https://github.com/lovelysystems/lovely.testlayers/blob/0.7.0/src/lovely/testlayers/util.py#L6-L13 + """ + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ex = s.connect_ex((host, port)) + if ex == 0: + s.close() + return True + return False + + +@pytest.fixture(scope="session") +def localstack_service(): + if not isUp("localhost", 4566): + startup_localstack(tag="3.6") + yield + if not asbool(os.environ.get("TC_KEEPALIVE")): + stop_localstack() + + +@pytest.fixture(scope="function") +def localstack(localstack_service, boto3_session): + kinesis = boto3_session.client("kinesis") + for stream_name in TEST_STREAMS: + try: + kinesis.delete_stream(StreamName=stream_name) + except botocore.exceptions.ClientError as error: + if error.response["Error"]["Code"] != "ResourceNotFoundException": + raise + time.sleep(0.5) + + +@pytest.fixture(scope="session", autouse=True) +def boto3_configure_localstack(): + os.environ["AWS_ENDPOINT_URL"] = "http://localhost:4566" + + +@pytest.fixture(scope="session") +def boto3_session(): + return boto3.Session( + region_name="us-east-1", + aws_access_key_id="foo", + aws_secret_access_key="bar", # noqa: S106 + ) diff --git a/tests/test_kinesis.py b/tests/test_kinesis.py new file mode 100644 index 0000000..08ee02a --- /dev/null +++ b/tests/test_kinesis.py @@ -0,0 +1,26 @@ +""" +Verify connectivity with Amazon Kinesis. + +- https://en.wikipedia.org/wiki/Amazon_Kinesis +- https://docs.localstack.cloud/user-guide/aws/kinesis/ +- https://docs.localstack.cloud/user-guide/tools/testing-utils/ +""" + +import logging +import time + +import pytest + +logger = logging.getLogger(__name__) + + +@pytest.mark.skip(reason="Does not stop at all on GHA, thus blocking the build") +def test_kinesis_stream_operations(localstack, boto3_session): + kinesis = boto3_session.client("kinesis") + + kinesis.create_stream(StreamName="test", ShardCount=1) + time.sleep(0.1) + + response = kinesis.list_streams() + assert response["StreamNames"] == ["test"] + time.sleep(0.1) diff --git a/tests/test_process.py b/tests/test_process.py new file mode 100644 index 0000000..badea58 --- /dev/null +++ b/tests/test_process.py @@ -0,0 +1,126 @@ +import json +import os +import sys + +import pytest +from commons_codec.model import ColumnType, ColumnTypeMapStore, TableAddress + + +@pytest.fixture +def reset_handler(): + try: + del sys.modules["lorrystream.process.kinesis_cratedb_lambda"] + except KeyError: + pass + + +def test_kinesis_dynamodb_cratedb_lambda_basic(mocker, cratedb, reset_handler): + """ + Test AWS Lambda processing Kinesis DynamoDB CDC event, converging to CrateDB. + """ + + # Read event payload. + with open("tests/testdata/kinesis_dynamodb.json") as fp: + event = json.load(fp) + + # Configure. 
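+    # Note: the handler module reads its configuration from the environment at
+    # import time, so the variables are patched here before the module is
+    # imported further below (see also the `reset_handler` fixture).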
+ handler_environment = { + "MESSAGE_FORMAT": "dynamodb", + "SINK_SQLALCHEMY_URL": cratedb.get_connection_url(), + "SINK_TABLE": "testdrive-dynamodb-cdc", + } + mocker.patch.dict(os.environ, handler_environment) + + # Provision CrateDB. + cratedb.database.run_sql('CREATE TABLE "testdrive-dynamodb-cdc" (data OBJECT(DYNAMIC));') + + # Invoke Lambda handler. + from lorrystream.process.kinesis_cratedb_lambda import handler + + handler(event, None) + + # Verify record exists in CrateDB. + cratedb.database.run_sql('REFRESH TABLE "testdrive-dynamodb-cdc";') + assert cratedb.database.count_records("testdrive-dynamodb-cdc") == 1 + + records = cratedb.database.run_sql('SELECT * FROM "testdrive-dynamodb-cdc";', records=True) + assert records[0] == { + "data": {"temperature": 42.42, "humidity": 84.84, "device": "foo", "timestamp": "2024-07-12T01:17:42"} + } + + +def test_kinesis_dynamodb_cratedb_lambda_batch(mocker, cratedb, reset_handler): + """ + Test AWS Lambda processing Kinesis DynamoDB CDC event, converging to CrateDB. + This time, using batch processing on Kinesis. + """ + + # Read event payload. + with open("tests/testdata/kinesis_dynamodb.json") as fp: + event = json.load(fp) + + # Configure. + handler_environment = { + "MESSAGE_FORMAT": "dynamodb", + "SINK_SQLALCHEMY_URL": cratedb.get_connection_url(), + "SINK_TABLE": "testdrive-dynamodb-cdc", + "USE_BATCH_PROCESSING": "true", + } + mocker.patch.dict(os.environ, handler_environment) + + # Provision CrateDB. + cratedb.database.run_sql('CREATE TABLE "testdrive-dynamodb-cdc" (data OBJECT(DYNAMIC));') + + # Invoke Lambda handler. + from lorrystream.process.kinesis_cratedb_lambda import handler + + outcome = handler(event, None) + assert outcome == {"batchItemFailures": []} + + # Verify record exists in CrateDB. + cratedb.database.run_sql('REFRESH TABLE "testdrive-dynamodb-cdc";') + assert cratedb.database.count_records("testdrive-dynamodb-cdc") == 1 + + records = cratedb.database.run_sql('SELECT * FROM "testdrive-dynamodb-cdc";', records=True) + assert records[0] == { + "data": {"temperature": 42.42, "humidity": 84.84, "device": "foo", "timestamp": "2024-07-12T01:17:42"} + } + + +def test_kinesis_dms_cratedb_lambda_basic(mocker, cratedb, reset_handler): + """ + Test AWS Lambda processing AWS DMS events, converging to CrateDB. + """ + + # Read event payload. + with open("tests/testdata/kinesis_dms.json") as fp: + event = json.load(fp) + + # Define column type mapping for CrateDB processor. + column_types = ColumnTypeMapStore().add( + table=TableAddress(schema="public", table="foo"), + column="attributes", + type_=ColumnType.MAP, + ) + + # Configure environment variables. + handler_environment = { + "MESSAGE_FORMAT": "dms", + "COLUMN_TYPES": column_types.to_json(), + "SINK_SQLALCHEMY_URL": cratedb.get_connection_url(), + } + mocker.patch.dict(os.environ, handler_environment) + + # Invoke Lambda handler. + from lorrystream.process.kinesis_cratedb_lambda import handler + + handler(event, None) + + # Verify record exists in CrateDB. 
+ cratedb.database.run_sql('REFRESH TABLE "public"."foo";') + assert cratedb.database.count_records("public.foo") == 1 + + records = cratedb.database.run_sql('SELECT * FROM "public"."foo";', records=True) + assert records[0] == { + "data": {"id": 46, "name": "Jane", "age": 31, "attributes": {"baz": "qux"}}, + } diff --git a/tests/testdata/kinesis_dms.json b/tests/testdata/kinesis_dms.json new file mode 100644 index 0000000..83bdd27 --- /dev/null +++ b/tests/testdata/kinesis_dms.json @@ -0,0 +1,36 @@ +{ + "Records": [ + { + "kinesis": { + "kinesisSchemaVersion": "1.0", + "partitionKey": "1", + "sequenceNumber": "49590338271490256608559692538361571095921575989136588898", + "data": "eyJjb250cm9sIjogeyJ0YWJsZS1kZWYiOiB7ImNvbHVtbnMiOiB7ImFnZSI6IHsibnVsbGFibGUiOiB0cnVlLCAidHlwZSI6ICJJTlQzMiJ9LCAiYXR0cmlidXRlcyI6IHsibnVsbGFibGUiOiB0cnVlLCAidHlwZSI6ICJTVFJJTkcifSwgImlkIjogeyJudWxsYWJsZSI6IGZhbHNlLCAidHlwZSI6ICJJTlQzMiJ9LCAibmFtZSI6IHsibnVsbGFibGUiOiB0cnVlLCAidHlwZSI6ICJTVFJJTkcifX0sICJwcmltYXJ5LWtleSI6IFsiaWQiXX19LCAibWV0YWRhdGEiOiB7Im9wZXJhdGlvbiI6ICJjcmVhdGUtdGFibGUiLCAicGFydGl0aW9uLWtleS10eXBlIjogInRhc2staWQiLCAicGFydGl0aW9uLWtleS12YWx1ZSI6ICJzZXJ2LXJlcy1pZC0xNzIyMTk1MzU4ODc4LXlocnUiLCAicmVjb3JkLXR5cGUiOiAiY29udHJvbCIsICJzY2hlbWEtbmFtZSI6ICJwdWJsaWMiLCAidGFibGUtbmFtZSI6ICJmb28iLCAidGltZXN0YW1wIjogIjIwMjQtMDctMjlUMDA6MzA6NDcuMjY2NTgxWiJ9fQ==", + "approximateArrivalTimestamp": 1545084650.987 + }, + "eventSource": "aws:kinesis", + "eventVersion": "1.0", + "eventID": "shardId-000000000006:49590338271490256608559692538361571095921575989136588898", + "eventName": "aws:kinesis:record", + "invokeIdentityArn": "arn:aws:iam::111122223333:role/lambda-kinesis-role", + "awsRegion": "eu-central-1", + "eventSourceARN": "arn:aws:kinesis:eu-central-1:111122223333:stream/lambda-stream" + }, + { + "kinesis": { + "kinesisSchemaVersion": "1.0", + "partitionKey": "1", + "sequenceNumber": "49590338271490256608559692538361571095921575989136588899", + "data": "eyJkYXRhIjogeyJhZ2UiOiAzMSwgImF0dHJpYnV0ZXMiOiAie1wiYmF6XCI6IFwicXV4XCJ9IiwgImlkIjogNDYsICJuYW1lIjogIkphbmUifSwgIm1ldGFkYXRhIjogeyJjb21taXQtdGltZXN0YW1wIjogIjIwMjQtMDctMjlUMDA6NTg6MTcuOTc0MzQwWiIsICJvcGVyYXRpb24iOiAiaW5zZXJ0IiwgInBhcnRpdGlvbi1rZXktdHlwZSI6ICJzY2hlbWEtdGFibGUiLCAicmVjb3JkLXR5cGUiOiAiZGF0YSIsICJzY2hlbWEtbmFtZSI6ICJwdWJsaWMiLCAic3RyZWFtLXBvc2l0aW9uIjogIjAwMDAwMDAyLzdDMDA3MTc4LjMuMDAwMDAwMDIvN0MwMDcxNzgiLCAidGFibGUtbmFtZSI6ICJmb28iLCAidGltZXN0YW1wIjogIjIwMjQtMDctMjlUMDA6NTg6MTcuOTgzNjcwWiIsICJ0cmFuc2FjdGlvbi1pZCI6IDExMzksICJ0cmFuc2FjdGlvbi1yZWNvcmQtaWQiOiAxfX0=", + "approximateArrivalTimestamp": 1545084650.998 + }, + "eventSource": "aws:kinesis", + "eventVersion": "1.0", + "eventID": "shardId-000000000006:49590338271490256608559692538361571095921575989136588898", + "eventName": "aws:kinesis:record", + "invokeIdentityArn": "arn:aws:iam::111122223333:role/lambda-kinesis-role", + "awsRegion": "eu-central-1", + "eventSourceARN": "arn:aws:kinesis:eu-central-1:111122223333:stream/lambda-stream" + } + ] +} diff --git a/tests/testdata/kinesis_dynamodb.json b/tests/testdata/kinesis_dynamodb.json new file mode 100644 index 0000000..1aa5723 --- /dev/null +++ b/tests/testdata/kinesis_dynamodb.json @@ -0,0 +1,20 @@ +{ + "Records": [ + { + "kinesis": { + "kinesisSchemaVersion": "1.0", + "partitionKey": "1", + "sequenceNumber": "49590338271490256608559692538361571095921575989136588898", + "data": 
"eyJhd3NSZWdpb24iOiAidXMtZWFzdC0xIiwgImV2ZW50SUQiOiAiYjAxNWI1ZjAtYzA5NS00YjUwLThhZDAtNDI3OWFhM2Q4OGM2IiwgImV2ZW50TmFtZSI6ICJJTlNFUlQiLCAidXNlcklkZW50aXR5IjogbnVsbCwgInJlY29yZEZvcm1hdCI6ICJhcHBsaWNhdGlvbi9qc29uIiwgInRhYmxlTmFtZSI6ICJmb28iLCAiZHluYW1vZGIiOiB7IkFwcHJveGltYXRlQ3JlYXRpb25EYXRlVGltZSI6IDE3MjA3NDAyMzMwMTI5OTUsICJLZXlzIjogeyJkZXZpY2UiOiB7IlMiOiAiZm9vIn0sICJ0aW1lc3RhbXAiOiB7IlMiOiAiMjAyNC0wNy0xMlQwMToxNzo0MiJ9fSwgIk5ld0ltYWdlIjogeyJodW1pZGl0eSI6IHsiTiI6ICI4NC44NCJ9LCAidGVtcGVyYXR1cmUiOiB7Ik4iOiAiNDIuNDIifSwgImRldmljZSI6IHsiUyI6ICJmb28ifSwgInRpbWVzdGFtcCI6IHsiUyI6ICIyMDI0LTA3LTEyVDAxOjE3OjQyIn19LCAiU2l6ZUJ5dGVzIjogOTksICJBcHByb3hpbWF0ZUNyZWF0aW9uRGF0ZVRpbWVQcmVjaXNpb24iOiAiTUlDUk9TRUNPTkQifSwgImV2ZW50U291cmNlIjogImF3czpkeW5hbW9kYiJ9", + "approximateArrivalTimestamp": 1545084650.987 + }, + "eventSource": "aws:kinesis", + "eventVersion": "1.0", + "eventID": "shardId-000000000006:49590338271490256608559692538361571095921575989136588898", + "eventName": "aws:kinesis:record", + "invokeIdentityArn": "arn:aws:iam::111122223333:role/lambda-kinesis-role", + "awsRegion": "us-east-2", + "eventSourceARN": "arn:aws:kinesis:us-east-2:111122223333:stream/lambda-stream" + } + ] +}