diff --git a/.github/workflows/azure-build-push.yml b/.github/workflows/azure-build-push.yml new file mode 100644 index 0000000..d334cb5 --- /dev/null +++ b/.github/workflows/azure-build-push.yml @@ -0,0 +1,34 @@ +on: + push: + branches: + - master + paths: + - 'src/data_generator/**' + - 'src/modules/**' + +name: tag_build_and_push + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + with: + fetch-depth: '0' + - name: "Bump version and push tag" + id: bumptag + uses: anothrNick/github-tag-action@master + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DEFAULT_BUMP: patch + WITH_V: true + - name: 'Build and push image' + uses: azure/docker-login@v1 + with: + login-server: ${{ secrets.REGISTRY_LOGIN_SERVER }} + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + - run: | + docker build -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/data-generator:${{ steps.bumptag.outputs.new_tag }} . + docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/data-generator:${{ steps.bumptag.outputs.new_tag }} + diff --git a/.github/workflows/code_quality.yml b/.github/workflows/code_quality.yml new file mode 100644 index 0000000..dda9b7a --- /dev/null +++ b/.github/workflows/code_quality.yml @@ -0,0 +1,66 @@ +on: + pull_request: + branches: + - master + paths: + - 'src/**' + +name: code_quality + +jobs: + static_code_analysis: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r src/test/requirements.txt + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 ./src/ --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. + flake8 ./src/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + unit_tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r src/test/requirements.txt + - name: Test with pytest + run: | + pytest ./src/test/ + code_coverage: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r src/test/requirements.txt + - name: Measure code coverage + run: | + percentage=$(pytest --cov=./src/modules/ --cov-report term ./src/test | grep -E '[TOTAL]' | awk '{print $4}' | tr -d % ) + if [ -z "$percentage" ];then + echo "pytest failed, no code coverage result" + exit -1; + fi; + if (("$percentage" < 95));then + echo "Coverage failed because percentage is lower than 95%: $percentage%" + exit -1; + fi; \ No newline at end of file diff --git a/DATA_GENERATOR.md b/DATA_GENERATOR.md new file mode 100644 index 0000000..48b0bf5 --- /dev/null +++ b/DATA_GENERATOR.md @@ -0,0 +1,1071 @@ +# Data Generator + +The Data Generator evolved as a standalone tool which can be used independently of the holistic comparison use case. This file describes how the Data Generator can be adapted and the functionality as well as explain different example use cases. 
+ +## Table of Contents + +- [Data Generator](#data-generator) + * [General Information](#general-information) + + [About](#about) + + [How To](#how-to) + + [Supported Databases](#supported-databases) + - [CrateDB](#cratedb) + * [Client Library](#client-library) + * [Table Setup](#table-setup) + * [Insert](#insert) + * [Specifics](#specifics) + - [InfluxDB](#influxdb) + * [Client Library](#client-library-1) + * [Bucket Setup](#bucket-setup) + * [Insert](#insert-1) + * [Specifics](#specifics-1) + - [TimescaleDB](#timescaledb) + * [Client Library](#client-library-2) + * [Table Setup](#table-setup-1) + * [Insert](#insert-2) + * [Specifics](#specifics-2) + - [MongoDB](#mongodb) + * [Client Library](#client-library-3) + * [Collection Setup](#collection-setup) + * [Insert](#insert-3) + * [Specifics](#specifics-3) + - [PostgreSQL](#postgresql) + * [Client Library](#client-library-4) + * [Table Setup](#table-setup-2) + * [Insert](#insert-4) + * [Specifics](#specifics-4) + * [Data Generator Configuration](#data-generator-configuration) + + [Environment variables configuring the behaviour of the Data Generator](#environment-variables-configuring-the-behaviour-of-the-data-generator) + - [ID_START](#id-start) + - [ID_END](#id-end) + - [INGEST_MODE](#ingest-mode) + * [INGEST_MODE 0](#ingest-mode-0) + * [INGEST_MODE 1](#ingest-mode-1) + - [INGEST_SIZE](#ingest-size) + - [INGEST_TS](#ingest-ts) + - [INGEST_DELTA](#ingest-delta) + - [MODEL_PATH](#model-path) + - [BATCH_SIZE](#batch-size) + - [DATABASE](#database) + - [STAT_DELTA](#stat-delta) + + [Environment variables used to configure different databases](#environment-variables-used-to-configure-different-databases) + - [HOST](#host) + - [USERNAME](#username) + - [PASSWORD](#password) + - [DB_NAME](#db-name) + - [TABLE_NAME](#table-name) + - [PARTITION](#partition) + + [Environment variables used to configure CrateDB](#environment-variables-used-to-configure-cratedb) + - [SHARDS](#shards) + - [REPLICAS](#replicas) + + [Environment variables used to configure TimescaleDB](#environment-variables-used-to-configure-timescaledb) + - [PORT](#port) + + [Environment variables used to configure InfluxDB](#environment-variables-used-to-configure-influxdb) + - [TOKEN](#token) + - [ORG](#org) + * [Data Generator Models](#data-generator-models) + + [Structure](#structure) + + [Complex Model Example](#complex-model-example) + + [Sensor Types](#sensor-types) + - [Float Sensor](#float-sensor) + * [Model](#model) + - [Bool Sensor](#bool-sensor) + * [Model](#model-1) + * [Batch-Size-Automator](#batch-size-automator) + + [Setup](#setup) + + [Modes](#modes) + - [Finding best batch size](#finding-best-batch-size) + - [Surveillance mode](#surveillance-mode) + * [Prometheus Metrics](#prometheus-metrics) + * [Example Use Cases](#example-use-cases) + + [Single Type of Edge](#single-type-of-edge) + - [Setup](#setup-1) + - [Running the example](#running-the-example) + + [Multiple Types of Edges](#multiple-types-of-edges) + - [Setup](#setup-2) + - [Running the example](#running-the-example-1) + * [Alternative data generators](#alternative-data-generators) + + [Why use this data generator over the alternatives?](#why-use-this-data-generator-over-the-alternatives-) + + [cr8 + mkjson](#cr8---mkjson) + + [tsbs data generator](#tsbs-data-generator) + * [Glossary](#glossary) + +## General Information + +This chapter covers general information about the Data Generator, e.g. the supported databases and the basic workflow. 
+ +### About + +The Data Generator is a tool to generate timeseries data which adheres to a [statistical model](#data-generator-models). It can be used both for [populating a database](#ingest_mode) and for [continuously inserting](#ingest_mode) timeseries data. + +### How To + +The easiest way to use the Data Generator is to build the Docker image: + ++ navigate to the root directory of this repository ++ build the docker image with `docker build -t data_gen -f src/data_generator/Dockerfile .` ++ adapt one of the example docker-compose files in the [examples folder](src/data_generator/examples) ++ start (e.g. the CrateDB example) with `docker-compose -f src/data_generator/examples/docker-compose_crate.yml up` + +For an explanation of how to set the environment variables see [Environment variables](#data-generator-configuration). +For example use cases see [Example use cases](#example-use-cases). + +### Supported Databases + +Currently 5 databases are supported: ++ [CrateDB](https://crate.io/) ++ [InfluxDB V2](https://www.influxdata.com/) ++ [TimescaleDB](https://www.timescale.com/) ++ [MongoDB](https://www.mongodb.com/) ++ [PostgreSQL](https://www.postgresql.org/) + +Databases can run either locally or in the cloud; both use cases are supported. Support for additional databases depends on the demand for them. + +The following chapters give an overview of the specific implementation for each database. + +#### CrateDB + +##### Client Library + +For CrateDB the [crate](https://pypi.org/project/crate/) library is used. To connect to CrateDB the following environment variables must be set: + ++ [HOST](#host): hostname including port, e.g. `localhost:4200` ++ [USERNAME](#username): CrateDB username. ++ [PASSWORD](#password): password for the CrateDB user. + +##### Table Setup + +A table gets its name either from the provided [model](#data-generator-models) or from the environment variable [TABLE_NAME](#table_name). + +A table for CrateDB has three columns: + ++ `ts`: column containing a timestamp (occurrence of the payload) ++ `g_ts_'interval'`: column containing the `ts` value truncated to the value set with [PARTITION](#partition). It is used to partition the table and is generated by the database. ++ `payload`: column containing the values; it is of type [OBJECT](https://crate.io/docs/crate/reference/en/latest/general/ddl/data-types.html#object) (DYNAMIC). The concrete sub-columns are defined by the provided [model](#data-generator-models). + +Additional table configuration: + ++ with [SHARDS](#shards) the number of shards for the table can be configured ++ with [REPLICAS](#replicas) the number of replicas for the table can be configured + +##### Insert + +Insert is done using the [unnest](https://crate.io/docs/crate/reference/en/latest/general/builtins/table-functions.html?#unnest-array-array) function of CrateDB. + +##### Specifics + ++ All columns and sub-columns are automatically indexed. ++ Using an object column makes it possible to insert the values for multiple models into a single table (similar to the schema-less approach of a NoSQL database). ++ Using `unnest` for the insert makes it possible to take the generated values without modification and insert them directly into the table.
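For illustration, the table DDL and the `unnest`-based batch insert described above could look roughly like the following sketch using the `crate` client. Table name, column names, and the exact statements are only an approximation of what the Data Generator builds internally (here assuming `PARTITION=week`, `SHARDS=4`, `REPLICAS=0`).

```python
from crate import client

# HOST includes the port for CrateDB, e.g. "localhost:4200".
conn = client.connect("localhost:4200", username="crate", password="")
cursor = conn.cursor()

# Illustrative table layout: timestamp, generated partition column, dynamic payload object.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS timeseries (
        ts TIMESTAMP WITH TIME ZONE,
        g_ts_week TIMESTAMP WITH TIME ZONE GENERATED ALWAYS AS date_trunc('week', ts),
        payload OBJECT(DYNAMIC)
    ) CLUSTERED INTO 4 SHARDS
      PARTITIONED BY (g_ts_week)
      WITH (number_of_replicas = 0)
""")

# Batch insert via unnest: one array of timestamps and one array of payload objects,
# so the generated values can be handed over without modification.
timestamps = [1600000000000, 1600000000500]
payloads = [{"sensor_id": 1, "button_pressed": False},
            {"sensor_id": 2, "button_pressed": True}]
cursor.execute(
    "INSERT INTO timeseries (ts, payload) (SELECT col1, col2 FROM unnest(?, ?))",
    (timestamps, payloads),
)
```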
#### InfluxDB + +##### Client Library + +For InfluxDB the [influxdb-client](https://pypi.org/project/influxdb-client/) library is used, as the Data Generator only supports InfluxDB V2. To connect to InfluxDB the following environment variables must be set: + ++ [HOST](#host): hostname ++ [TOKEN](#token): InfluxDB read/write token ++ [ORG](#org): InfluxDB organization + +##### Bucket Setup + +A bucket gets its name either from the provided [model](#data-generator-models) or from the environment variable [DB_NAME](#db_name). + +If a bucket with the same name already exists on the given host, this bucket is used to insert the data; otherwise a new bucket is created without retention rules (data is saved indefinitely). + +##### Insert + +Insert into InfluxDB is done using the `Point` type from the `influxdb_client` library. All tags from the [model](#data-generator-models) are added to `Point.tag` (InfluxDB creates indices for tags). Metrics are saved to `Point.field`. The timestamp is added to `Point.time`. Multiple Points are then inserted in a batch. + +##### Specifics + ++ All tags are automatically indexed. ++ Insert of multiple models into a single bucket is possible due to InfluxDB being a NoSQL database. ++ When using InfluxDB V2 with a usage-based plan, inserts are limited to 300 MB per 5 minutes, which is about 15,000 metrics per second; additional metrics are dropped by InfluxDB and the client is not informed.
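For illustration, writing a batch of points with this client could look roughly like the following sketch; the URL, token, organization, and bucket values are placeholders for the settings described above.

```python
from datetime import datetime, timezone

from influxdb_client import InfluxDBClient, Point
from influxdb_client.client.write_api import SYNCHRONOUS

client = InfluxDBClient(url="http://localhost:8086", token="my-token", org="my-org")
write_api = client.write_api(write_options=SYNCHRONOUS)

# One Point per generated value: tags become indexed tags, metrics become fields.
point = (
    Point("sensor_values")
    .tag("sensor_id", "1")
    .field("temperature", 23.4)
    .field("button_pressed", False)
    .time(datetime.now(timezone.utc))
)
write_api.write(bucket="sensor_values", record=[point])
```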
#### TimescaleDB + +##### Client Library + +For TimescaleDB the [psycopg2](https://pypi.org/project/psycopg2/) library is used. As psycopg2 does not have the best insert performance for TimescaleDB (see [here](https://docs.timescale.com/latest/tutorials/quickstart-python#insert_rows)), it is advised to split [IDs](#id_start) over multiple Data Generator instances when inserting a lot of metrics. + +To connect with TimescaleDB the following environment variables must be set: + ++ [HOST](#host): hostname ++ [PORT](#port): port ++ [USERNAME](#username): username of the TimescaleDB user ++ [PASSWORD](#password): password of the TimescaleDB user ++ [DB_NAME](#db_name): the database name with which to connect + +##### Table Setup + +A table gets its name either from the provided [model](#data-generator-models) or from the environment variable [TABLE_NAME](#table_name). + +A table for TimescaleDB consists of the following columns: + ++ `ts`: column containing a timestamp (occurrence of the payload) ++ `ts_'interval'`: column containing the `ts` value truncated to the value set with [PARTITION](#partition). ++ a column for each entry in `tags` and `metrics`: + + `tags` are of type `INTEGER` when using numbers and of type `TEXT` when using list notation + + `metrics` are of the type defined in the [model](#data-generator-models) + +**If a table with the same name already exists but does not have the expected structure, the Data Generator will fail when inserting values.** + +From this table a TimescaleDB hypertable is created, partitioned by the `ts` and `ts_'interval'` columns. + +##### Insert + +Insert is done in batches. + +##### Specifics + ++ No index is created; to query data, indices must be created manually. ++ Insert of multiple models into a single table is not possible as the table schema is only created once. ++ psycopg2 does not have the best insert performance for TimescaleDB (see [here](https://docs.timescale.com/latest/tutorials/quickstart-python#insert_rows)); to insert a lot of metrics it is advised to split [IDs](#id_start) over multiple Data Generator instances. + +#### MongoDB + +##### Client Library + +For MongoDB the [pymongo](https://pypi.org/project/pymongo/) library and its `MongoClient` are used. + +To connect with MongoDB the following environment variables must be set: + ++ [HOST](#host): hostname (can optionally include a port, e.g. `localhost:27017`) ++ [USERNAME](#username): username of the MongoDB user ++ [PASSWORD](#password): password of the MongoDB user ++ [DB_NAME](#db_name): the name of the MongoDB database that will be used + +##### Collection Setup + +A collection gets its name from the provided [model](#data-generator-models). + +A document in the collection consists of the following elements: + ++ `measurement`: same as the collection name ++ `date`: the timestamp of the measurement in datetime format ++ `tags`: tags that were defined in the [model](#data-generator-models) ++ `metrics`: metrics that were defined in the [model](#data-generator-models) + +##### Insert + +Insert is done using the `insert_many` function of the collection to insert documents in batches. + +##### Specifics + ++ MongoDB only creates a default index; other indices have to be created manually. ++ Insert of multiple models into a single collection is possible, but it is advised to use a different collection for each model (using the same database is fine).
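For illustration, a batched insert with `pymongo` could look roughly like the following sketch. The connection string, database, and collection names are placeholders; the document layout follows the description above.

```python
from datetime import datetime, timezone

from pymongo import MongoClient

client = MongoClient("mongodb://username:password@localhost:27017")
collection = client["sensor_data"]["sensor_values"]

# One document per generated value, following the layout described above.
documents = [
    {
        "measurement": "sensor_values",
        "date": datetime.now(timezone.utc),
        "tags": {"sensor": sensor_id},
        "metrics": {"temperature": 23.4, "button_pressed": False},
    }
    for sensor_id in range(1, 101)
]
collection.insert_many(documents)
```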
#### PostgreSQL + +##### Client Library + +For PostgreSQL the [psycopg2](https://pypi.org/project/psycopg2/) library is used. + +To connect with PostgreSQL the following environment variables must be set: + ++ [HOST](#host): hostname ++ [PORT](#port): port ++ [USERNAME](#username): username of the PostgreSQL user ++ [PASSWORD](#password): password of the PostgreSQL user ++ [DB_NAME](#db_name): the database name with which to connect + +##### Table Setup + +A table gets its name either from the provided [model](#data-generator-models) or from the environment variable [TABLE_NAME](#table_name). + +A table for PostgreSQL consists of the following columns: + ++ `ts`: column containing a timestamp (occurrence of the payload) ++ `ts_'interval'`: column containing the `ts` value truncated to the value set with [PARTITION](#partition). ++ a column for each entry in `tags` and `metrics`: + + `tags` are of type `INTEGER` when using numbers and of type `TEXT` when using list notation + + `metrics` are of the type defined in the [model](#data-generator-models) + +**If a table with the same name already exists but does not have the expected structure, the Data Generator will fail when inserting values.** + +##### Insert + +Insert is done in batches. + +##### Specifics + ++ No index is created; to query data, indices must be created manually. ++ Insert of multiple models into a single table is not possible as the table schema is only created once.
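For both TimescaleDB and PostgreSQL, a batched insert with psycopg2 could look roughly like the following sketch. The connection parameters, table name, and columns are placeholders matching the table layout described above, not the exact statements the Data Generator builds.

```python
import psycopg2
from psycopg2.extras import execute_values

conn = psycopg2.connect(host="localhost", port=5432, user="postgres",
                        password="password", dbname="data_generator")
cursor = conn.cursor()

# One row per generated value: timestamp, truncated partition timestamp, tags, metrics.
rows = [
    ("2020-09-14 10:00:00+00", "2020-09-14 00:00:00+00", 1, 23.4, False),
    ("2020-09-14 10:00:00+00", "2020-09-14 00:00:00+00", 2, 24.1, True),
]
execute_values(
    cursor,
    "INSERT INTO timeseries (ts, ts_week, sensor, temperature, vibration) VALUES %s",
    rows,
)
conn.commit()
```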
## Data Generator Configuration + +The Data Generator is mostly configured by setting environment variables. This chapter lists all available environment variables and explains their use in the Data Generator. + +### Environment variables configuring the behaviour of the Data Generator + +The environment variables in this chapter are used to configure the behaviour of the Data Generator. + +#### ID_START + +Type: Integer + +Value: A positive number. Must be smaller than [ID_END](#id_end). + +Default: 1 + +The Data Generator will create `(ID_END + 1) - ID_START` edges. + +#### ID_END + +Type: Integer + +Value: A positive number. Must be greater than [ID_START](#id_start). + +Default: 500 + +The Data Generator will create `(ID_END + 1) - ID_START` edges. + +#### INGEST_MODE + +Type: Integer + +Values: 0 or 1 + +Default: 1 + +##### INGEST_MODE 0 + +When `INGEST_MODE` is set to `0` the Data Generator goes into "steady load"-mode. This means that for all edges controlled by the Data Generator an insert is performed every [INGEST_DELTA](#ingest_delta) seconds. + +**Note: If one Data Generator instance controls so many edges that an insert cannot be performed within the timeframe set by INGEST_DELTA, it is advised to split the IDs over separate Data Generator instances, e.g. one instance uses `ID_START=1, ID_END=500` and the other `ID_START=501, ID_END=1000`.** + +With this configuration the [Batch Size Automator](#batch-size-automator) is disabled. Therefore the [Prometheus metrics](#prometheus-metrics) g_insert_time, g_rows_per_second, g_best_batch_size, and g_best_batch_rps will stay at 0. + +##### INGEST_MODE 1 + +When `INGEST_MODE` is set to `1` the Data Generator goes into "burst insert"-mode. This means it tries to insert as many values as possible. This mode is used to populate a database and can be used to measure insert performance. + +Using this mode results in values being inserted into the database at a faster rate than defined by [INGEST_DELTA](#ingest_delta), but the timestamp values will still adhere to this value and stay in the defined time interval. This means that if [INGEST_TS](#ingest_ts) is not set to a specific value, timestamps will point to the future. By adjusting INGEST_TS to a timestamp in the past, in combination with a limited [INGEST_SIZE](#ingest_size), the range of timestamps can be controlled. + +When [BATCH_SIZE](#batch_size) is set to a value smaller than or equal to 0 the [Batch Size Automator](#batch-size-automator) is activated. This means that the insert performance is supervised and the batch size adjusted to achieve a fast insert speed. If the value is greater than 0 the batch size will be fixed at this value and the Batch Size Automator will be disabled. + +#### INGEST_SIZE + +Type: Integer + +Values: A positive number (or 0 for endless mode) + +Default: 100 + +`INGEST_SIZE` defines how many values will be created for each edge. When setting `INGEST_SIZE` to `0` an endless amount of values is created until the Data Generator is terminated. + +Example: + ++ ID_START: 1 ++ ID_END: 500 ++ INGEST_SIZE: 1000 + +We have 500 edges and for each edge 1000 values are generated, therefore we will have 500,000 values in total. + +**Note: a value contains all the information for a single edge, including the defined `tags` and `metrics`. See [Data Generator Models](#data-generator-models) for more information about tags and metrics.** + +#### INGEST_TS + +Type: Integer + +Values: A valid UNIX timestamp + +Default: timestamp at the time the Data Generator was started + +This variable defines the first timestamp used for the generated values. When using [INGEST_MODE 1](#ingest_mode-1) each following timestamp is offset from the previous one by the value of [INGEST_DELTA](#ingest_delta). When using [INGEST_MODE 0](#ingest_mode-0) the second insert happens when `INGEST_TS + INGEST_DELTA` is equal to or greater than the current (real) timestamp. This means that if INGEST_TS is set to a point in the future, no inserts will happen until the `INGEST_TS + INGEST_DELTA` timestamp is reached. + +#### INGEST_DELTA + +Type: Float + +Values: Any positive number + +Default: 0.5 + +The value of `INGEST_DELTA` defines the interval between timestamps of the generated values. + +#### MODEL_PATH + +Type: String + +Values: Either a relative or an absolute path to a model in JSON format (see [Data Generator Models](#data-generator-models) for more information on models). + +Default: empty string + +When using a relative path with the Docker image, check the [Dockerfile](Dockerfile) to make sure the correct path is used. + +#### BATCH_SIZE + +Type: Integer + +Values: Any number. + +Default: -1 + +`BATCH_SIZE` is only taken into account when [INGEST_MODE](#ingest_mode) is set to `1`. The value of `BATCH_SIZE` defines how many rows will be inserted with one insert statement. If the value is smaller than or equal to `0` the [Batch Size Automator](#batch-size-automator) will take control over the batch size and dynamically adjust it to get the best insert performance. + +#### DATABASE + +Type: Integer + +Values: 0..4 + +Default: 0 + +The value defines which database is used: ++ 0: CrateDB ++ 1: TimescaleDB ++ 2: InfluxDB ++ 3: MongoDB ++ 4: PostgreSQL + +#### STAT_DELTA + +Type: Integer + +Values: A positive number + +Default: 30 + +Prints statistics of average function execution time every `STAT_DELTA` seconds. + +### Environment variables used to configure different databases + +The environment variables in this chapter are used by different databases to connect to and configure them. Each entry lists the databases for which it is used and example values. For the collected information for a single database see that database's chapter: ++ [CrateDB](#cratedb) ++ [InfluxDB](#influxdb) ++ [TimescaleDB](#timescaledb) ++ [MongoDB](#mongodb) ++ [PostgreSQL](#postgresql) + +#### HOST + +Type: String + +Values: hostname according to database client requirements + +Default: localhost + +used with all databases. + +**CrateDB:** + +host must include the port, e.g.: `"localhost:4200"` + +**TimescaleDB / PostgreSQL:** + +host must be the hostname excluding the port, e.g.: `"localhost"` + +**MongoDB:** + +host can be either without port (e.g. `"localhost"`) or with port (e.g. `"localhost:27017"`) + +#### USERNAME + +Type: String + +Values: username of the user used for authentication against the database + +Default: None + +used with CrateDB, TimescaleDB, MongoDB, and PostgreSQL. + +#### PASSWORD + +Type: String + +Values: password of the user used for authentication against the database + +Default: None + +used with CrateDB, TimescaleDB, MongoDB, and PostgreSQL. + +#### DB_NAME + +Type: String + +Values: Name of the database where the table will be created + +Default: empty string + +used with InfluxDB, TimescaleDB, MongoDB, and PostgreSQL. + +**InfluxDB:** +This is an optional parameter for InfluxDB. If it is set, the bucket where the values are inserted will use the value of `DB_NAME` as its name. If `DB_NAME` is an empty string, then the name of the model (see [Data Generator models](#data-generator-models) for more information) will be used as the bucket name. + +**TimescaleDB:** +The value of `DB_NAME` is used when connecting to TimescaleDB. This database must already exist in your TimescaleDB instance and must have already been initialized with `CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;`. + +**MongoDB:** +The value of `DB_NAME` is used as the database parameter of MongoDB. + +#### TABLE_NAME + +Type: String + +Values: Name of the table where values are stored + +Default: empty string + +used with CrateDB, TimescaleDB, and PostgreSQL. It is an optional parameter to override the default table name defined in the model (see [Data Generator models](#data-generator-models)). + +#### PARTITION + +Type: String + +Values: second, minute, hour, day, week, month, quarter, year + +Default: week + +used with CrateDB, TimescaleDB, and PostgreSQL. It is used to define an additional column to partition the table, e.g. when using `week` an additional column is created (CrateDB: `g_ts_week`, TimescaleDB: `ts_week`) and the value from the `ts` column is truncated to its week value.
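The truncation itself can be pictured with the `datetime_truncate` package the Data Generator depends on (see `requirements.txt`); the snippet below is only meant to show how a `ts` value maps to the value stored in the partition column.

```python
from datetime import datetime

from datetime_truncate import truncate

ts = datetime(2020, 9, 17, 14, 35, 12)

# Truncate the timestamp to the interval configured via PARTITION.
print(truncate(ts, "week"))     # 2020-09-14 00:00:00 (Monday of that week)
print(truncate(ts, "day"))      # 2020-09-17 00:00:00
print(truncate(ts, "quarter"))  # 2020-07-01 00:00:00
```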
### Environment variables used to configure CrateDB + +The environment variables in this chapter are only used to configure CrateDB. + +#### SHARDS + +Type: Integer + +Values: positive number + +Default: 4 + +Defines how many [shards](https://crate.io/docs/crate/reference/en/latest/general/ddl/sharding.html) will be used. + +#### REPLICAS + +Type: Integer + +Values: positive number + +Default: 0 + +Defines how many [replicas](https://crate.io/docs/crate/reference/en/latest/general/ddl/replication.html) of the table will be created. + +### Environment variables used to configure TimescaleDB + +The environment variables in this chapter are only used to configure TimescaleDB. + +#### PORT + +Type: Integer + +Values: positive number + +Default: 5432 + +Defines the port number on which TimescaleDB (or PostgreSQL) is reachable. + +### Environment variables used to configure InfluxDB + +The environment variables in this chapter are only used to configure InfluxDB. + +#### TOKEN + +Type: String + +Values: token obtained from InfluxDB V2 + +Default: empty string + +InfluxDB V2 uses [token](https://v2.docs.influxdata.com/v2.0/security/tokens/view-tokens/)-based authentication. + +#### ORG + +Type: String + +Values: organization ID obtained from InfluxDB V2 + +Default: empty string + +InfluxDB V2 uses [organizations](https://v2.docs.influxdata.com/v2.0/organizations/) to manage buckets. + +## Data Generator Models + +The Data Generator uses models to determine what kind of values to generate. These models are described in JSON files. This chapter explains how to write models based on examples. + +### Structure + +A Data Generator model is a JSON file which must contain one object (a second key `description` can be used to explain the model). The key of this object will be the default value of [TABLE_NAME](#table_name). For example, the following JSON object contains a model called `button_sensor`: + +```JSON +{ + "button_sensor": {"..."} +} +``` + +This top-level object must have two sub-level objects, `tags` and `metrics`. `tags` are all values that identify the thing where the measured values come from. `metrics` are all measured values of a thing. For our `button_sensor` example, a sensor is identified by a `sensor_id` (we keep it simple for this first example) and has a single metric, `button_press`: + +```JSON +{ + "button_sensor": { + "tags": { + "sensor_id": "id" + }, + "metrics": { + "button_press": {"..."} + } + } +} +``` + +As you can see, the `sensor_id` tag gets the value `"id"`; this means it is the bottom-level tag directly associated with the values between `ID_START` and `ID_END`. +The `button_press` metric is another object describing how the value of this metric should be calculated and saved to the database. + +```JSON +{ + "button_sensor": { + "tags": { + "sensor_id": "id" + }, + "metrics": { + "button_press": { + "key": { + "value": "button_pressed" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.001 + } + } + } + } +} +``` + +The `button_press` metric is of type `BOOL` and has a true_ratio of `0.001`, which means it is true in 1 out of 1000 cases. Go to [Sensor Types](#sensor-types) for a more detailed overview of the different Sensor types.
Or look at [motor.json](src/data_generator/examples/motor.json) or [temperature.json](src/data_generator/examples/temperature.json) for examples containing descriptions. + +This is the basic structure of a Data Generator model. It can contain any amount of tags and metrics, but row/document size increases with each add value, as well as calculation time with each metric. + +**Note: `tags` are ordered by their hierarchic structure, e.g. the upper tag contains the lower tags. This means the `"id"` tag must be the last one.** + +`tags` can also be defined as a list of values, e.g. `["AT", "BE", "CH", "DE", "ES", "FR", "GB", "HU", "IT", "JP"]`. This then uses the values in the array to setup the tags. + +### Complex Model Example + +**Use Case:** + ++ We want to model a certain edge in **50 different plants** ++ Each plant contains of **5 lines** ++ Each line consists of **10 edges** ++ Each edge has **5 different metrics**: + + voltage + + current + + temperature + + power + + vibration + +**Solution:** + +The first thing we need to identify how many edges we have in total: `50 plants * 5 lines * 10 edges = 2500 edges total`. Now we know that we have to use 2500 IDs so we set `ID_START=1` and `ID_END=2500`. Then we create our model: + +```JSON +{ + "example": { + "tags": { + "plant": 50, + "line": 5, + "edge": "id" + }, + "metrics": { + "voltage": {"..."}, + "current": {"..."}, + "temperature": {"..."}, + "power": {"..."}, + "vibration": {"..."} + } + } +} +``` + +As you see even a complex model isn't that complicated to write. The main limitation of the Data Generator is that one instance can only take a single model. But when combining multiple instances it is easily possible to simulate complex setups spanning multiple factories with multiple machinery. + +### Sensor Types + +This chapter describes the available Sensor types, what values they use and the projected output. + +#### Float Sensor + +The Float Sensor produces floating point numbers defined by a statistical model. The generated values follow a normal distribution. + +##### Model + +We describe all the keys and the corresponding values a model for a Float Sensor must contain. All elements are associated with the key under the metrics object (see [here](#structure)). So for example we already have this model: + +```JSON +{ + "example": { + "tags": { + "plant": 50, + "line": 5, + "edge": "id" + }, + "metrics": { + "voltage": {"..."}, + "current": {"..."}, + "temperature": {"..."}, + "power": {"..."}, + "vibration": {"..."} + } + } +} +``` + +Now we decide that `voltage` is a Float Sensor and describe it in the model like this: + +```JSON +{ + "key": { + "value": "voltage" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 200 + }, + "max": { + "value": 260 + }, + "mean": { + "value": 230 + }, + "stdev": { + "value": 10 + }, + "variance": { + "value": 0.03 + }, + "error_rate": { + "value": 0.001 + }, + "error_length": { + "value": 2.5 + } +} +``` + +Let's explain what this means. We take the normal voltage output of a european socket: + ++ key: the column/sub-column/field name where the value will be written ++ type: the type of sensor to use (currently `FLOAT` and `BOOL` are supported) ++ mean: the average output is 230V ++ min/max: the voltage can change between ± 30V. So min is 200V and max is 260V ++ stdev: we set the stdev to 10 so most values will be between 220V and 240V ++ variance: limits the step size between values. 
In this example the value changes by at most 0.03 V ++ error_rate: the sensor has a 1:1000 chance to malfunction and report wrong values ++ error_length: when the sensor malfunctions and reports a wrong value, on average the next 2.5 values are also wrong + +Using this model to generate 100,000 values, the curve will look something like this (the spikes are the errors): + +![Image of docu_example model curve](src/data_generator/examples/pictures/curve.png) + +And the value distribution of this curve will look something like this: + +![Image of docu_example model value distribution](src/data_generator/examples/pictures/distribution.png) + +#### Bool Sensor + +The Bool Sensor produces boolean values according to a given ratio. + +##### Model + +The model for a Bool Sensor is pretty simple; we look at the example we created when describing the [Data Generator Model](#structure): + +```JSON +"button_press": { + "key": { + "value": "button_pressed" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.001 + } +} +``` + ++ key: the column/sub-column/field name where the value will be written ++ type: the type of sensor to use (currently `FLOAT` and `BOOL` are supported) ++ true_ratio: the ratio of how often the Bool Sensor generates the value `True`. E.g. if the value is `1` the sensor will output `True` every time; if the value is `0.5` the output will be 50% `True` and 50% `False`. + +## Batch-Size-Automator + +The Batch Size Automator (BSA) is used to improve insert performance by adjusting the batch size dynamically. + +### Setup + +The BSA is only active when [INGEST_MODE](#ingest_mode) is set to `1` and [BATCH_SIZE](#batch_size) has a value smaller than or equal to `0`. When activated, everything else is done automatically. + +### Modes + +The BSA consists of two modes: ++ finding the best batch size ++ surveillance + +#### Finding best batch size + +1. The BSA calculates how many rows were inserted per second during the last test cycle (20 inserts) +2. The BSA compares whether the current result is better than the best so far + 2.a. If the current result was better, the batch size is adjusted by the step size + 2.b. If the current result was worse, the batch size is adjusted in the opposite direction of the last adjustment and the step size is reduced +3. Repeat steps 1 and 2 until the step size is below a threshold (this means we entered 2.b often and should have found our optimal batch size) +4. Change to surveillance mode + +#### Surveillance mode + +1. The BSA increases the length of the test cycle to 1000 inserts +2. After 1000 inserts the BSA calculates whether performance has gotten worse + 2.a. If performance is worse, the test cycle length is set back to 20 and the BSA switches back to finding the best batch size + 2.b. If performance is the same or better, steps 1 and 2 are repeated
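The following is a simplified sketch of the search idea described above, not the actual `BatchSizeAutomator` implementation; the function and parameter names as well as the start, step, and threshold values are made up for illustration.

```python
def find_best_batch_size(measure_rows_per_second, start_size=2500, step=500, threshold=100):
    """Hill-climb the batch size until the step size falls below a threshold."""
    best_size = start_size
    best_rps = measure_rows_per_second(best_size)  # avg rows/s over one test cycle
    direction = 1
    while step >= threshold:
        candidate = max(1, best_size + direction * step)
        rps = measure_rows_per_second(candidate)
        if rps > best_rps:
            # current cycle was better: keep the new size and search direction
            best_size, best_rps = candidate, rps
        else:
            # current cycle was worse: reverse direction and reduce the step size
            direction *= -1
            step //= 2
    return best_size
```

Once the step size falls below the threshold, the batch size is kept and the search only restarts when the longer surveillance cycles show degraded performance.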
+ +## Prometheus Metrics + +This chapter gives an overview of the available Prometheus metrics and what they represent: + ++ generated_values: how many values have been generated ++ inserted_values: how many values have been inserted ++ insert_percentage: inserted_values divided by ([INGEST_SIZE](#ingest_size) times the number of IDs) ++ batch_size: the currently used batch size (only available with [BSA](#batch-size-automator)) ++ insert_time: the average time it took to insert the current batch into the database (only available with [BSA](#batch-size-automator)) ++ rows_per_second: the average number of rows per second with the latest batch_size (only available with [BSA](#batch-size-automator)) ++ best_batch_size: the best batch size found so far by the batch_size_automator (only available with [BSA](#batch-size-automator)) ++ best_batch_rps: the rows per second for the best batch size found so far (only available with [BSA](#batch-size-automator)) + +## Example Use Cases + +This chapter gives examples of how the Data Generator can be used. Files for these examples can be found [here](src/data_generator/examples). + +### Single Type of Edge + +We want to simulate two factories, each with ten lines, and each line with five sensors. Each sensor measures three metrics: ++ speed (float) in m/s ++ temperature (float) in °C ++ vibration (bool) above or below threshold + +Every 5 seconds each sensor reports a value, and we want our simulation to run for one hour. + +#### Setup + +The resulting JSON model could look like this (you can find it as a file [here](src/data_generator/examples/SingleType/example_model.json)): + +```JSON +{ + "sensor_values": { + "tags": { + "factory": 2, + "line": 10, + "sensor": "id" + }, + "metrics": { + "speed": { "key": {"value": "speed"}, + "type": {"value": "FLOAT"}, + "min": {"value": 0}, + "max": {"value": 3}, + "mean": {"value": 1.5}, + "stdev": {"value": 0.3}, + "variance": {"value": 0.01}, + "error_rate": {"value": 0.0001}, + "error_length": {"value": 1.04}}, + "temperature": {"key": {"value": "temperature"}, + "type": {"value": "FLOAT"}, + "min": {"value": 0}, + "max": {"value": 200}, + "mean": {"value": 50}, + "stdev": {"value": 5}, + "variance": {"value": 0.01}, + "error_rate": {"value": 0.00001}, + "error_length": {"value": 2.07}}, + "vibration": { "key": {"value": "vibration"}, + "type": {"value": "BOOL"}, + "true_ratio": {"value": 0.01}} + } + } +} +``` + +As we have five sensors on ten lines in two factories, we have 100 sensors in total, so for our docker-compose file we set the following environment variables: ++ ID_START: 1 ++ ID_END: 100 + +As we want to use CrateDB running on localhost, we set the following environment variables: ++ DATABASE: 0 ++ HOST: "host.docker.internal:4200" (this is the host when trying to access localhost from inside a docker container) ++ USERNAME: "aValidUsername" ++ PASSWORD: "PasswordForTheValidUsername" + +As we want to have a consistent insert every 5 seconds for one hour, we set the following environment variables: ++ INGEST_MODE: 0 ++ INGEST_SIZE: 720 (an hour has 3600 seconds; divided by 5 seconds this gives 720 inserts) ++ INGEST_DELTA: 5 + +And finally we want to use the model we just created: ++ MODEL_PATH: "/example_model.json" + +The resulting yml file could look like this: + +```YML +version: "2.3" +services: + datagen: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 100 + HOST: "host.docker.internal:4200" + USERNAME: "" + PASSWORD: "" + INGEST_MODE: 0 +
INGEST_SIZE: 720 + INGEST_DELTA: 5 + MODEL_PATH: "/example_model.json" + DATABASE: 0 + volumes: + - ./example_model.json:/example_model.json +``` + +#### Running the example + +To run this example follow the following steps: + ++ navigate to root directory of this repository ++ build docker image with `docker build -t data_gen -f src/data_generator/Dockerfile .` ++ start an instance of CrateDB on localhost with `docker run -p "4200:4200" crate` ++ Enter USERNAME and PASSWORD in the [docker-compose file](src/data_generator/examples/SingleType/docker-compose_example_crate.yml) + + If no user was created you can just delete both environment variables (crate will use a default user) ++ start the docker-compose file with `docker-compose -f src/data_generator/examples/SingleType/docker-compose_example_crate.yml up` + +You can now navigate to localhost:4200 to look at CrateDB or to localhost:8000 to look at the metrics of the Data Generator. + +### Multiple Types of Edges + +Note the provided examples for this use-case are supposed to run with CrateDB. To run it with other Databases changes according to the documentation have to be made. This use-case is not possible to run with TimescaleDB as it uses a fixed schema and adding columns to existing tables is not yet supported. + +We want to simulate ten factories in ten different countries (defined by country code in the `tag` value) with ten lines per factory. The lines are grouped into upper lines and lower lines, with five lines per group. The different combinations have multiple sensors reporting at different time intervals: + ++ All lines in all factories have five sensors reporting a `speed` and `vibration` metric each second ++ All lines in the factories `["AT", "BE", "CH", "DE", "ES"]` have five sensors reporting a `voltage` and `current` metric each second ++ All lines in all factories have two sensors reporting a `temperature` metric every ten seconds. The temperature values are different depending on which group the lines are in. + +#### Setup + +As we actually use four different models (temperature metric is different for upper and lower lines) we also have four different model-files. You can find all the model-files [here](src/data_generator/examples/MultiType). 
+ +To run this use-case we have to write a more complex docker-compose [file](src/data_generator/examples/MultiType/docker-compose_multitype_example.yml): + +```YML +version: "2.3" +services: + datagen_base1: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 500 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + INGEST_DELTA: 1 + MODEL_PATH: "/example_complex_base1.json" + DATABASE: 0 + volumes: + - ./example_complex_base1.json:/example_complex_base1.json + datagen_base2: + image: data_gen + ports: + - 8001:8000 + environment: + ID_START: 501 + ID_END: 1000 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + INGEST_DELTA: 1 + MODEL_PATH: "/example_complex_base2.json" + DATABASE: 0 + volumes: + - ./example_complex_base2.json:/example_complex_base2.json + datagen_H: + image: data_gen + ports: + - 8002:8000 + environment: + ID_START: 1001 + ID_END: 1100 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 100 + INGEST_DELTA: 10 + MODEL_PATH: "/example_complex_U.json" + DATABASE: 0 + volumes: + - ./example_complex_U.json:/example_complex_U.json + datagen_L: + image: data_gen + ports: + - 8003:8000 + environment: + ID_START: 1101 + ID_END: 1200 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 100 + INGEST_DELTA: 10 + MODEL_PATH: "/example_complex_L.json" + DATABASE: 0 + volumes: + - ./example_complex_L.json:/example_complex_L.json +``` + +**Note we use `INGEST_MODE: 1` to insert data fast. To keep data-size small we only insert 1000 seconds worth of data, this can obviously be adjusted to create a bigger dataset.** + +#### Running the example + +To run this example follow the following steps: + ++ navigate to root directory of this repository ++ build docker image with `docker build -t data_gen -f src/data_generator/Dockerfile .` ++ start an instance of CrateDB on localhost with `docker run -p "4200:4200" crate` ++ Add USERNAME and PASSWORD in the [docker-compose file](src/data_generator/examples/MultiType/docker-compose_multitype_example.yml) + + If no user was created you can just ignore both environment variables (crate will use a default user) ++ start the docker-compose file with `docker-compose -f src/data_generator/examples/MultiType/docker-compose_multitype_example.yml up` + +You can now navigate to localhost:4200 to look at CrateDB or to localhost:8000 to look at the metrics of the Data Generator. + +## Alternative data generators + +### Why use this data generator over the alternatives? + +* Generate random data which follows a statistical model to better reflect real world scenarios (real world data is almost never truly random) +* The "steady load"-mode can simulate a constant load of a defined number of messages per second +* Ready made to deploy and scale data generators with Docker containers +* When you need a data generator for one the supported databases +* Uses Prometheus server to gather metrics + +### cr8 + mkjson + +`mkjson` combined with `cr8 insert-json` makes it easy to generate random entries into a table. Can only be used on databases which are compatible with `cr8`. + +See [this blog post](https://zignar.net/2020/05/01/generating-data-sets-using-mkjson/) for an example how to use `cr8` with `mkjson`. + +### tsbs data generator + +The [Time Series Benchmark Suite](https://github.com/timescale/tsbs) includes a data generator but is mainly a single client benchmark tool. 
+ +The Data Generator has the following advantages compared to tsbs (when used as data generator): +* Easier to define your own [data model](#data-generator-models) +* Scale out to multiple clients is a core concept +* Huge sets of data can be inserted without creating files as intermediate storage. +* Full control on how many values will be inserted + +## Glossary + +EDGE: A thing that collects and reports measured values. \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..2583dea --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,8 @@ +crate +prometheus_client +urllib3 +datetime_truncate +psycopg2-binary +influxdb_client +pymongo +requests \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..747752f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +crate==0.25.0 +prometheus_client==0.7.1 +urllib3==1.25.10 +datetime_truncate==1.1.0 +psycopg2-binary==2.8.6 +influxdb_client==1.7.0 +pymongo==3.11.0 \ No newline at end of file diff --git a/src/data_generator/data_generator.py b/src/data_generator/data_generator.py new file mode 100644 index 0000000..7649f6b --- /dev/null +++ b/src/data_generator/data_generator.py @@ -0,0 +1,223 @@ +import json +import queue +import urllib3 +import time +import logging +from modules import helper +from modules.edge import Edge +from modules.crate_db_writer import CrateDbWriter +from modules.timescale_db_writer import TimescaleDbWriter +from modules.influx_db_writer import InfluxDbWriter +from modules.mongo_db_writer import MongoDbWriter +from modules.postgres_db_writer import PostgresDbWriter +from modules.batch_size_automator import BatchSizeAutomator +from modules.config import DataGeneratorConfig +from threading import Thread +from prometheus_client import start_http_server, Gauge, Counter + + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# load an validate the configuration +config = DataGeneratorConfig() +valid_config = config.validate_config() +if not valid_config: + logging.error(f"invalid configuration: {config.invalid_configs}") + exit(-1) + +stop_thread = False +# load the model we want to use and create the metrics +f = open(config.model_path, "r") +model = json.load(f) + +# initialize the db_writer based on environment variable +if config.database == 0: # crate + db_writer = CrateDbWriter(config.host, config.username, config.password, model, + config.table_name, config.shards, config.replicas, config.partition) +elif config.database == 1: # timescale + db_writer = TimescaleDbWriter(config.host, config.port, config.username, config.password, + config.db_name, model, config.table_name, config.partition) +elif config.database == 2: # influx + db_writer = InfluxDbWriter(config.host, config.token, config.organization, model, config.db_name) +elif config.database == 3: # mongo + db_writer = MongoDbWriter(config.host, config.username, config.password, config.db_name, model) +elif config.database == 4: # postgres + db_writer = PostgresDbWriter(config.host, config.port, config.username, config.password, + config.db_name, model, config.table_name, config.partition) + +else: + db_writer = None + +batch_size_automator = BatchSizeAutomator(config.batch_size, config.ingest_mode, config.id_end - config.id_start + 1) + +runtime_metrics = {"rows": 0, "metrics": 0, "batch_size": config.batch_size} +edges = {} +current_values 
= queue.Queue(5000) + +c_generated_values = Counter("data_gen_generated_values", "How many values have been generated") +c_inserted_values = Counter("data_gen_inserted_values", "How many values have been inserted") +g_insert_percentage = Gauge("data_gen_insert_percentage", "Percentage of values that have been inserted") +if batch_size_automator.auto_batch_mode: + g_batch_size = Gauge("data_gen_batch_size", "The currently used batch size") + g_insert_time = Gauge("data_gen_insert_time", "The average time it took to insert the current batch into the " + "database") + g_rows_per_second = Gauge("data_gen_rows_per_second", "The average number of rows per second with the latest " + "batch_size") + g_best_batch_size = Gauge("data_gen_best_batch_size", "The up to now best batch size found by the " + "batch_size_automator") + g_best_batch_rps = Gauge("data_gen_best_batch_rps", "The rows per second for the up to now best batch size") + + +def create_edges(): + # this function creates metric objects in the given range [id_start, id_end] + for i in range(config.id_start, config.id_end + 1): + edges[i] = Edge(i, get_sub_element("tags"), get_sub_element("metrics")) + + +def get_sub_element(sub): + element = {} + for key in model.keys(): + if key != "description": + element = model[key][sub] + if "description" in element: + element.pop("description") + return element + + +def get_next_value(): + # for each edge in the edges list all next values are calculated and saved to the edge_value list + # this list is then added to the FIFO queue, so each entry of the FIFO queue contains all next values for each edge + # in the edge list + edge_values = [] + for edge in edges.values(): + c_generated_values.inc() + edge_values.append(edge.calculate_next_value()) + current_values.put(edge_values) + + +def write_to_db(): + global db_writer + last_insert = config.ingest_ts + last_stat_ts = time.time() + # while the queue is not empty and the value creation has not yet finished the loop will call the insert_operation + # function. 
Because at the beginning of the script the queue is empty we use the stop_thread variable but because + # once the value creation is done we still ned to insert the outstanding values in the queue we check if the queue + # is empty + while not current_values.empty() or not stop_thread: + if not current_values.empty(): + # we calculate the time delta from the last insert to the current timestamp + insert_delta = time.time() - last_insert + # if we use the ingest_mode the delta can be ignored + # otherwise delta needs to be bigger than ingest_delta + # if delta is smaller than ingest_delta the time difference is waited (as we want an insert + # every half second (default) + if config.ingest_mode == 1 or insert_delta > config.ingest_delta: + last_insert = helper.execute_timed_function(insert_routine, last_insert) + else: + time.sleep(config.ingest_delta - insert_delta) + if time.time() - last_stat_ts >= config.stat_delta: + for key, value in helper.tic_toc_delta.items(): + print(f"""average time for {key}: {(sum(value) / len(value))}""") + helper.reset_delta() + last_stat_ts = time.time() + db_writer.close_connection() + + +def insert_routine(last_ts): + global db_writer + batch = [] + timestamps = [] + start = None + + if batch_size_automator.auto_batch_mode: + runtime_metrics["batch_size"] = batch_size_automator.get_next_batch_size() + g_batch_size.set(runtime_metrics["batch_size"]) + start = time.time() + + if config.ingest_mode == 1: + # during ingest mode execution time increase with the amount of queries we execute therefor we insert as + # many batches as possible at a time (batch size of 10000 was empirically the best) + while len(batch) < runtime_metrics["batch_size"]: + if not current_values.empty(): + c_inserted_values.inc(config.id_end - config.id_start + 1) + ts = last_ts + config.ingest_delta + next_batch = current_values.get() + batch.extend(next_batch) + factor = 1 / config.ingest_delta + last_ts = round(ts * factor) / factor + timestamps.extend([int(last_ts * 1000)] * len(next_batch)) + else: + break + g_insert_percentage.set((c_inserted_values._value.get() / + (config.ingest_size * (config.id_end - config.id_start + 1))) * 100) + else: + c_inserted_values.inc(config.id_end - config.id_start + 1) + batch = current_values.get() + ts = time.time() + # we want the same timestamp for each value this timestamp should be the same even if the data_generator + # runs in multiple containers therefor we round the timestamp to match ingest_delta this is done by multiplying + # by ingest_delta and then dividing the result by ingest_delta + delta_factor = 1 / config.ingest_delta + last_ts = round(ts * delta_factor) / delta_factor + timestamps = [int(last_ts * 1000)] * len(batch) + + runtime_metrics["rows"] += len(batch) + runtime_metrics["metrics"] += len(batch) * len(get_sub_element("metrics").keys()) + helper.execute_timed_function(db_writer.insert_stmt, timestamps, batch) + + if batch_size_automator.auto_batch_mode: + duration = time.time() - start + g_insert_time.set(duration) + g_rows_per_second.set(len(batch)/duration) + g_best_batch_size.set(batch_size_automator.batch_times["best"]["size"]) + g_best_batch_rps.set(batch_size_automator.batch_times["best"]["batch_per_second"]) + batch_size_automator.insert_batch_time(duration) + + # we return the timestamp so we know when the last insert happened + return last_ts + + +def main(): + # prepare the database (create the table/bucket/collection if not existing) + db_writer.prepare_database() + + # start the thread that writes to the db + 
db_writer_thread = Thread(target=write_to_db) + db_writer_thread.start() + try: + helper.execute_timed_function(create_edges) + + # TODO: this should not have an endless loop for now stop with ctrl+C + # we are either in endless mode or have a certain amount of values to create + while_count = 0 + while config.ingest_size == 0 or while_count < config.ingest_size: + while_count += 1 + helper.execute_timed_function(get_next_value) + except Exception as e: + logging.exception(e) + finally: + # once value creation is finished we signal the db_writer thread to stop and wait for it to join + global stop_thread + stop_thread = True + db_writer_thread.join() + + +if __name__ == '__main__': + # start prometheus server + start_http_server(8000) + + helper.execute_timed_function(main) + main = 0 + # we analyze the runtime of the different function + for k, v in helper.tic_toc.items(): + if k == "main": + main = sum(v) / len(v) + print(f"""average time for {k}: {(sum(v) / len(v))}""") + + print(f"""rows per second: {runtime_metrics["rows"] / main}""") + print(f"""metrics per second: {runtime_metrics["metrics"] / main}""") + print(f"""batch_size: {batch_size_automator.batch_times["best"]["size"]}""") + print(f"""batches/second: {batch_size_automator.batch_times["best"]["batch_per_second"]}""") + + # finished diff --git a/src/data_generator/examples/MultiType/docker-compose_multitype_example.yml b/src/data_generator/examples/MultiType/docker-compose_multitype_example.yml new file mode 100644 index 0000000..c1dccc1 --- /dev/null +++ b/src/data_generator/examples/MultiType/docker-compose_multitype_example.yml @@ -0,0 +1,62 @@ +version: "2.3" +services: + datagen_base1: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 500 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + INGEST_DELTA: 1 + MODEL_PATH: "/example_complex_base1.json" + DATABASE: 0 + volumes: + - ./example_complex_base1.json:/example_complex_base1.json + datagen_base2: + image: data_gen + ports: + - 8001:8000 + environment: + ID_START: 501 + ID_END: 1000 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + INGEST_DELTA: 1 + MODEL_PATH: "/example_complex_base2.json" + DATABASE: 0 + volumes: + - ./example_complex_base2.json:/example_complex_base2.json + datagen_H: + image: data_gen + ports: + - 8002:8000 + environment: + ID_START: 1001 + ID_END: 1100 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 100 + INGEST_DELTA: 10 + MODEL_PATH: "/example_complex_U.json" + DATABASE: 0 + volumes: + - ./example_complex_U.json:/example_complex_U.json + datagen_L: + image: data_gen + ports: + - 8003:8000 + environment: + ID_START: 1101 + ID_END: 1200 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 100 + INGEST_DELTA: 10 + MODEL_PATH: "/example_complex_L.json" + DATABASE: 0 + volumes: + - ./example_complex_L.json:/example_complex_L.json \ No newline at end of file diff --git a/src/data_generator/examples/MultiType/example_complex_L.json b/src/data_generator/examples/MultiType/example_complex_L.json new file mode 100644 index 0000000..3fcbdc8 --- /dev/null +++ b/src/data_generator/examples/MultiType/example_complex_L.json @@ -0,0 +1,40 @@ +{ + "sensor_values": { + "tags": { + "factory": ["AT", "BE", "CH", "DE", "ES", "FR", "GB", "HU", "IT", "JP"], + "line": ["L1", "L2", "L3", "L4", "L5"], + "sensor": "id" + }, + "metrics": { + "temperature": { + "key": { + "value": "temperature" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 0 + }, 
+ "max": { + "value": 200 + }, + "mean": { + "value": 50 + }, + "stdev": { + "value": 5 + }, + "variance": { + "value": 0.01 + }, + "error_rate": { + "value": 0.00001 + }, + "error_length": { + "value": 2.07 + } + } + } + } +} \ No newline at end of file diff --git a/src/data_generator/examples/MultiType/example_complex_U.json b/src/data_generator/examples/MultiType/example_complex_U.json new file mode 100644 index 0000000..7a4ea48 --- /dev/null +++ b/src/data_generator/examples/MultiType/example_complex_U.json @@ -0,0 +1,40 @@ +{ + "sensor_values": { + "tags": { + "factory": ["AT", "BE", "CH", "DE", "ES", "FR", "GB", "HU", "IT", "JP"], + "line": ["U1", "U2", "U3", "U4", "U5"], + "sensor": "id" + }, + "metrics": { + "temperature": { + "key": { + "value": "temperature" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 50 + }, + "max": { + "value": 150 + }, + "mean": { + "value": 100 + }, + "stdev": { + "value": 2 + }, + "variance": { + "value": 0.01 + }, + "error_rate": { + "value": 0.00001 + }, + "error_length": { + "value": 4.07 + } + } + } + } +} \ No newline at end of file diff --git a/src/data_generator/examples/MultiType/example_complex_base1.json b/src/data_generator/examples/MultiType/example_complex_base1.json new file mode 100644 index 0000000..63cc9cb --- /dev/null +++ b/src/data_generator/examples/MultiType/example_complex_base1.json @@ -0,0 +1,51 @@ +{ + "sensor_values": { + "tags": { + "factory": ["AT", "BE", "CH", "DE", "ES", "FR", "GB", "HU", "IT", "JP"], + "line": ["L1", "L2", "L3", "L4", "L5", "U1", "U2", "U3", "U4", "U5"], + "sensor": "id" + }, + "metrics": { + "speed": { + "key": { + "value": "speed" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 0 + }, + "max": { + "value": 3 + }, + "mean": { + "value": 1.5 + }, + "stdev": { + "value": 0.3 + }, + "variance": { + "value": 0.01 + }, + "error_rate": { + "value": 0.0001 + }, + "error_length": { + "value": 1.04 + } + }, + "vibration": { + "key": { + "value": "vibration" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.01 + } + } + } + } +} \ No newline at end of file diff --git a/src/data_generator/examples/MultiType/example_complex_base2.json b/src/data_generator/examples/MultiType/example_complex_base2.json new file mode 100644 index 0000000..7e4b11e --- /dev/null +++ b/src/data_generator/examples/MultiType/example_complex_base2.json @@ -0,0 +1,69 @@ +{ + "sensor_values": { + "tags": { + "factory": ["AT", "BE", "CH", "DE", "ES"], + "line": ["L1", "L2", "L3", "L4", "L5", "U1", "U2", "U3", "U4", "U5"], + "sensor": "id" + }, + "metrics": { + "voltage": { + "key": { + "value": "voltage" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 120 + }, + "max": { + "value": 280 + }, + "mean": { + "value": 240 + }, + "stdev": { + "value": 5 + }, + "variance": { + "value": 0.1 + }, + "error_rate": { + "value": 0.0001 + }, + "error_length": { + "value": 50 + } + }, + "current": { + "key": { + "value": "current" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 0.5 + }, + "max": { + "value": 2.0 + }, + "mean": { + "value": 1.0 + }, + "stdev": { + "value": 0.01 + }, + "variance": { + "value": 0.001 + }, + "error_rate": { + "value": 0.00001 + }, + "error_length": { + "value": 100.04 + } + } + } + } +} \ No newline at end of file diff --git a/src/data_generator/examples/SingleType/docker-compose_example_crate.yml b/src/data_generator/examples/SingleType/docker-compose_example_crate.yml new file mode 100644 index 0000000..8e80702 --- /dev/null +++ 
b/src/data_generator/examples/SingleType/docker-compose_example_crate.yml @@ -0,0 +1,19 @@ +version: "2.3" +services: + datagen: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 100 + HOST: "host.docker.internal:4200" + # USERNAME: "" + # PASSWORD: "" + INGEST_MODE: 0 + INGEST_SIZE: 720 + INGEST_DELTA: 5 + MODEL_PATH: "/example_model.json" + DATABASE: 0 + volumes: + - ./example_model.json:/example_model.json \ No newline at end of file diff --git a/src/data_generator/examples/SingleType/example_model.json b/src/data_generator/examples/SingleType/example_model.json new file mode 100644 index 0000000..dc293f5 --- /dev/null +++ b/src/data_generator/examples/SingleType/example_model.json @@ -0,0 +1,80 @@ +{ + "sensor_values": { + "tags": { + "factory": 2, + "line": 10, + "sensor": "id" + }, + "metrics": { + "speed": { + "key": { + "value": "speed" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 0 + }, + "max": { + "value": 3 + }, + "mean": { + "value": 1.5 + }, + "stdev": { + "value": 0.3 + }, + "variance": { + "value": 0.01 + }, + "error_rate": { + "value": 0.0001 + }, + "error_length": { + "value": 1.04 + } + }, + "temperature": { + "key": { + "value": "temperature" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 0 + }, + "max": { + "value": 200 + }, + "mean": { + "value": 50 + }, + "stdev": { + "value": 5 + }, + "variance": { + "value": 0.01 + }, + "error_rate": { + "value": 0.00001 + }, + "error_length": { + "value": 2.07 + } + }, + "vibration": { + "key": { + "value": "vibration" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.01 + } + } + } + } +} \ No newline at end of file diff --git a/src/data_generator/examples/docker-compose_crate.yml b/src/data_generator/examples/docker-compose_crate.yml new file mode 100644 index 0000000..3caf992 --- /dev/null +++ b/src/data_generator/examples/docker-compose_crate.yml @@ -0,0 +1,35 @@ +# example docker-compose when inserting 1000 values for 1000 edges into CrateDB using the temperature model +# prometheus metrics for ID 1 - 500 on port 8000, for ID 501 - 1000 on port 8001 +# assuming CrateDB runs on localhost and has user crate without password +version: "2.3" +services: + datagen1: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 500 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + MODEL_PATH: "/temperature.json" + TABLE_NAME: "timeseries" + DATABASE: 0 + volumes: + - ./temperature.json:/temperature.json + datagen2: + image: data_gen + ports: + - 8001:8000 + environment: + ID_START: 501 + ID_END: 1000 + HOST: "host.docker.internal:4200" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + MODEL_PATH: "/temperature.json" + TABLE_NAME: "timeseries" + DATABASE: 0 + volumes: + - ./temperature.json:/temperature.json \ No newline at end of file diff --git a/src/data_generator/examples/docker-compose_influx.yml b/src/data_generator/examples/docker-compose_influx.yml new file mode 100644 index 0000000..fb6d45f --- /dev/null +++ b/src/data_generator/examples/docker-compose_influx.yml @@ -0,0 +1,40 @@ +# example docker-compose when inserting 1000 values for 1000 edges into InfluxDB using the temperature model +# prometheus metrics for ID 1 - 500 on port 8000, for ID 501 - 1000 on port 8001 +# assuming InfluxDB runs on localhost +# Token and Organization have to be inserted before running +version: "2.3" +services: + datagen1: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 500 + HOST: 
"host.docker.internal" + PORT: 8086 + TOKEN: "" + ORG: "" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + MODEL_PATH: "/temperature.json" + DATABASE: 2 + volumes: + - ./temperature.json:/temperature.json + datagen2: + image: data_gen + ports: + - 8001:8000 + environment: + ID_START: 501 + ID_END: 1000 + HOST: "host.docker.internal" + PORT: 8086 + TOKEN: "" + ORG: "" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + MODEL_PATH: "/temperature.json" + DATABASE: 2 + volumes: + - ./temperature.json:/temperature.json \ No newline at end of file diff --git a/src/data_generator/examples/docker-compose_mongo.yml b/src/data_generator/examples/docker-compose_mongo.yml new file mode 100644 index 0000000..ebf2a29 --- /dev/null +++ b/src/data_generator/examples/docker-compose_mongo.yml @@ -0,0 +1,40 @@ +# example docker-compose when inserting 1000 values for 1000 edges into MongoDB using the temperature model +# prometheus metrics for ID 1 - 500 on port 8000, for ID 501 - 1000 on port 8001 +# assuming MongoDB runs on localhost +# Username and Password have to be inserted before running +version: "2.3" +services: + datagen1: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 500 + HOST: "host.docker.internal" + USERNAME: "" + PASSWORD: "" + DB_NAME: "temperature" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + MODEL_PATH: "/temperature.json" + DATABASE: 3 + volumes: + - ./temperature.json:/temperature.json + datagen2: + image: data_gen + ports: + - 8001:8000 + environment: + ID_START: 501 + ID_END: 1000 + HOST: "host.docker.internal" + USERNAME: "" + PASSWORD: "" + DB_NAME: "temperature" + INGEST_MODE: 1 + INGEST_SIZE: 1000 + MODEL_PATH: "/temperature.json" + DATABASE: 3 + volumes: + - ./temperature.json:/temperature.json \ No newline at end of file diff --git a/src/data_generator/examples/docker-compose_timescale.yml b/src/data_generator/examples/docker-compose_timescale.yml new file mode 100644 index 0000000..9dc760e --- /dev/null +++ b/src/data_generator/examples/docker-compose_timescale.yml @@ -0,0 +1,44 @@ +# example docker-compose when inserting 1000 values for 1000 edges into CrateDB using the temperature model +# prometheus metrics for ID 1 - 500 on port 8000, for ID 501 - 1000 on port 8001 +# assuming TimescaleDB runs on localhost and an existing TimescaleDB Table named data_generator. 
+# Username and Password have to be inserted before running +version: "2.3" +services: + datagen1: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 500 + INGEST_MODE: 1 + INGEST_SIZE: 1000 + TABLE_NAME: "timeseries" + HOST: "host.docker.internal" + PORT: 5432 + dbUser: "" + dbPassword: "" + DATABASE: 1 + MODEL_PATH: "/temperature.json" + DB_NAME: "data_generator" + volumes: + - ./temperature.json:/temperature.json + datagen2: + image: data_gen + ports: + - 8001:8000 + environment: + ID_START: 501 + ID_END: 1000 + INGEST_MODE: 1 + INGEST_SIZE: 1000 + TABLE_NAME: "timeseries" + HOST: "host.docker.internal" + PORT: 5432 + USERNAME: "" + PASSWORD: "" + DATABASE: 1 + MODEL_PATH: "/temperature.json" + DB_NAME: "data_generator" + volumes: + - ./temperature.json:/temperature.json \ No newline at end of file diff --git a/src/data_generator/examples/docu_example.json b/src/data_generator/examples/docu_example.json new file mode 100644 index 0000000..fca5909 --- /dev/null +++ b/src/data_generator/examples/docu_example.json @@ -0,0 +1,40 @@ +{ + "example": { + "tags": { + "plant": 50, + "line": 5, + "edge": "id" + }, + "metrics": { + "voltage": { + "key": { + "value": "voltage" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 200 + }, + "max": { + "value": 260 + }, + "mean": { + "value": 230 + }, + "stdev": { + "value": 10 + }, + "variance": { + "value": 0.5 + }, + "error_rate": { + "value": 0.001 + }, + "error_length": { + "value": 2.5 + } + } + } + } +} \ No newline at end of file diff --git a/src/data_generator/examples/grafana_demo_live/.gitignore b/src/data_generator/examples/grafana_demo_live/.gitignore new file mode 100644 index 0000000..10fd06c --- /dev/null +++ b/src/data_generator/examples/grafana_demo_live/.gitignore @@ -0,0 +1 @@ +manifest.yaml diff --git a/src/data_generator/examples/grafana_demo_live/README.md b/src/data_generator/examples/grafana_demo_live/README.md new file mode 100644 index 0000000..3c3c1f9 --- /dev/null +++ b/src/data_generator/examples/grafana_demo_live/README.md @@ -0,0 +1,10 @@ +# Deployment + +In order for K8s to access the data generator Docker image, the `image-pull-cr8` secret is needed. This is replicated from the `templates` namespace. + +The data generator uses the `admin` user and access the corresponding password from the K8s secret. 
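+Once the manifest has been applied (commands below), ingestion can be sanity-checked with a short
+query against the demo cluster. This is only an illustrative sketch; it assumes the `crate` Python
+client used elsewhere in this repository and the table name configured in the Job environment
+(`doc.grafana_demo_values`):
+
+```python
+# Illustrative sketch: count the rows the demo job has written so far.
+# The host is the demo cluster URI from grafana_config_map.yml; replace the
+# password placeholder with the value stored in the referenced K8s secret.
+from crate import client
+
+conn = client.connect(
+    "https://grafana-demo-1.westeurope.azure.cratedb.net:4200",
+    username="admin",
+    password="<password from the K8s secret>",
+)
+cursor = conn.cursor()
+cursor.execute("SELECT COUNT(*) FROM doc.grafana_demo_values")
+print(cursor.fetchone())
+cursor.close()
+conn.close()
+```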
+ +```console +$ j2cli -c grafana_config_map.yml k8s_deploy_grafana_demo_data.yml > manifest.yaml +$ kubectl --context k8s.westeurope.azure apply -f manifest.yaml +``` diff --git a/src/data_generator/examples/grafana_demo_live/docker-compose_grafana_demo.yml b/src/data_generator/examples/grafana_demo_live/docker-compose_grafana_demo.yml new file mode 100644 index 0000000..84a89f9 --- /dev/null +++ b/src/data_generator/examples/grafana_demo_live/docker-compose_grafana_demo.yml @@ -0,0 +1,19 @@ +version: "2.3" +services: + datagen: + image: data_gen + ports: + - 8000:8000 + environment: + ID_START: 1 + ID_END: 150 + HOST: "host.docker.internal:4200" + # USERNAME: "" + # PASSWORD: "" + INGEST_MODE: 0 + INGEST_SIZE: 720 # 1h, 3600secs / 5 + INGEST_DELTA: 5 # every 5 seconds one insert + MODEL_PATH: "/grafana_demo_model.json" + DATABASE: 0 + volumes: + - ./grafana_demo_model.json:/grafana_demo_model.json diff --git a/src/data_generator/examples/grafana_demo_live/grafana_config_map.yml b/src/data_generator/examples/grafana_demo_live/grafana_config_map.yml new file mode 100644 index 0000000..c2b164f --- /dev/null +++ b/src/data_generator/examples/grafana_demo_live/grafana_config_map.yml @@ -0,0 +1,15 @@ +kubernetes: + name: grafana-demo-data + namespace: 00768a8e-14ea-4daa-bc38-592bf1486d85 + +version: v0.1.2 + +ID_START: 1 +ID_END: 150 + +db: + crateuri: https://grafana-demo-1.westeurope.azure.cratedb.net:4200 + +credentials: + username: admin + password_k8s_secret: user-password-35e75e5f-5308-4e98-999d-2be9d2fa47af-0 diff --git a/src/data_generator/examples/grafana_demo_live/k8s_deploy_grafana_demo_data.yml b/src/data_generator/examples/grafana_demo_live/k8s_deploy_grafana_demo_data.yml new file mode 100644 index 0000000..afd4bc2 --- /dev/null +++ b/src/data_generator/examples/grafana_demo_live/k8s_deploy_grafana_demo_data.yml @@ -0,0 +1,240 @@ +--- +kind: ConfigMap +apiVersion: v1 +metadata: + labels: + k8s-app: datagenerator + app.kubernetes.io/name: "{{ kubernetes.name }}" + name: "datamodel" + namespace: "{{ kubernetes.namespace }}" +data: + grafana_demo_model.json: | + { + "grafana_demo_values": { + "tags": { + "factory": 2, + "machine": 10, + "sensor": "id" + }, + "metrics": { + "temperature": { + "key": { + "value": "temperature" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 50 + }, + "max": { + "value": 200 + }, + "mean": { + "value": 125 + }, + "stdev": { + "value": 5 + }, + "variance": { + "value": 0.5 + }, + "error_rate": { + "value": 0.00001 + }, + "error_length": { + "value": 2.07 + } + }, + "fluid_level": { + "key": { + "value": "fluid_level" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 85 + }, + "max": { + "value": 100 + }, + "mean": { + "value": 92 + }, + "stdev": { + "value": 1 + }, + "variance": { + "value": 0.01 + }, + "error_rate": { + "value": 0.0001 + }, + "error_length": { + "value": 2.07 + } + }, + "humidity": { + "key": { + "value": "humidity" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 30 + }, + "max": { + "value": 60 + }, + "mean": { + "value": 45 + }, + "stdev": { + "value": 1 + }, + "variance": { + "value": 0.01 + }, + "error_rate": { + "value": 0.0001 + }, + "error_length": { + "value": 2.07 + } + }, + "pressure": { + "key": { + "value": "pressure" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 0 + }, + "max": { + "value": 50 + }, + "mean": { + "value": 37 + }, + "stdev": { + "value": 1 + }, + "variance": { + "value": 0.01 + }, + "error_rate": { + "value": 0.0001 + }, + "error_length": { + 
"value": 5 + } + }, + "broken": { + "key": { + "value": "broken" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.01 + } + } + } + } + } + +--- +apiVersion: v1 +kind: Secret +metadata: + name: image-pull-cr8 + namespace: "{{ kubernetes.namespace }}" + labels: + app.kubernetes.io/name: image-pull-cr8 + app.kubernetes.io/part-of: cratedb + app.kubernetes.io/managed-by: replicator + annotations: + replicator.v1.mittwald.de/replicate-from: "templates/image-pull-cr8" +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: "e30K" # empty {} + +--- +apiVersion: batch/v1 +kind: Job +metadata: + labels: + k8s-app: datagenerator + app.kubernetes.io/name: "{{ kubernetes.name }}" + name: "dg-{{ ID_START }}" + namespace: "{{ kubernetes.namespace }}" +spec: + backoffLimit: 0 + template: + metadata: + annotations: + prometheus.io/port: "8000" + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" + labels: + k8s-app: datagenerator + app.kubernetes.io/name: "{{ kubernetes.name }}" + spec: + imagePullSecrets: + - name: image-pull-cr8 + containers: + - name: datagenerator + image: "registry.cr8.net/data-generator:{{ version }}" + ports: + - containerPort: 8000 + protocol: TCP + resources: + requests: + cpu: "500m" + memory: "8196Mi" + limits: + cpu: "4000m" + memory: "8196Mi" + volumeMounts: + - name: datamodel + mountPath: "/grafana_demo_model.json" + subPath: grafana_demo_model.json + env: + - name: ID_START + value: "{{ ID_START }}" + - name: ID_END + value: "{{ ID_END }}" + - name: HOST + value: "{{ db.crateuri }}" + - name: INGEST_MODE + value: "0" + - name: INGEST_SIZE + value: "241920" + - name: MODEL_PATH + value: "/grafana_demo_model.json" + - name: INGEST_DELTA + value: "5" + - name: TABLE_NAME + value: "doc.grafana_demo_values" + - name: SHARDS + value: "4" + - name: USERNAME + value: "{{ credentials.username }}" + - name: PASSWORD + valueFrom: + secretKeyRef: + name: "{{ credentials.password_k8s_secret }}" + key: password + restartPolicy: Never + volumes: + - name: datamodel + configMap: + name: datamodel + items: + - key: grafana_demo_model.json + path: grafana_demo_model.json diff --git a/src/data_generator/examples/motor.json b/src/data_generator/examples/motor.json new file mode 100644 index 0000000..94ac3e5 --- /dev/null +++ b/src/data_generator/examples/motor.json @@ -0,0 +1,182 @@ +{ + "description": "this json file contains a tables and it's structure and how the values in this structure are calculated. The key represents the table name (in case the database uses tables)", + "motor": { + "tags": { + "description": "describes the database structure, tags are indexed columns for most dbs containing information about the 'sensor' reporting values but not containing measured values", + "plant": 100, + "line": 5, + "drive_id": "id" + }, + "metrics": { + "description": "metrics contain all measured values associated with a tag", + "temperature": { + "description": "the temperature of a drive", + "key": { + "description": "key used in the database to store this measurement", + "value": "temperature" + }, + "type": { + "description": "some databases (e.g. 
timescaleDB) need a column type because they don't have a schema less approach, calculation also differs for different types", + "value": "FLOAT" + }, + "min": { + "value": 70.0, + "unit": "°C", + "description": "the minimum value reported by a temperature sensor" + }, + "max": { + "value": 120.0, + "unit": "°C", + "description": "the maximum value reported by a temperature sensor" + }, + "mean": { + "value": 80.0, + "unit": "°C", + "description": "the average value reported by a temperature sensor" + }, + "stdev": { + "value": 5.0, + "description": "the standard deviation in a set of temperature measurements" + }, + "variance": { + "value": 0.03, + "description": "the expectation of the squared deviation of a temperature measurement from its mean" + }, + "error_rate": { + "value": 0.001, + "description": "the chance that the value is outside of the expected range" + }, + "error_length": { + "value": 300, + "description": "the average consecutive values that are outside the stdev" + } + }, + "voltage": { + "description": "the voltage input of a drive", + "key": { + "description": "key used in the database to store this measurement", + "value": "voltage" + }, + "type": { + "description": "some databases (e.g. timescaleDB) need a column type because they don't have a schema less approach, calculation also differs for different types", + "value": "FLOAT" + }, + "min": { + "value": 100.0, + "unit": "V", + "description": "the minimum value reported by a temperature sensor" + }, + "max": { + "value": 250.0, + "unit": "V", + "description": "the maximum value reported by a temperature sensor" + }, + "mean": { + "value": 240.0, + "unit": "V", + "description": "the average value reported by a temperature sensor" + }, + "stdev": { + "value": 10.0, + "description": "the standard deviation in a set of temperature measurements" + }, + "variance": { + "value": 0.01, + "description": "the expectation of the squared deviation of a temperature measurement from its mean" + }, + "error_rate": { + "value": 0.00001, + "description": "the chance that the value is outside of the expected range" + }, + "error_length": { + "value": 200, + "description": "the average consecutive values that are outside the stdev" + } + }, + "current": { + "description": "the current input of a drive", + "key": { + "description": "key used in the database to store this measurement", + "value": "current" + }, + "type": { + "description": "some databases (e.g. 
timescaleDB) need a column type because they don't have a schema less approach, calculation also differs for different types", + "value": "FLOAT" + }, + "min": { + "value": 0.5, + "unit": "A", + "description": "the minimum value reported by a temperature sensor" + }, + "max": { + "value": 2.0, + "unit": "A", + "description": "the maximum value reported by a temperature sensor" + }, + "mean": { + "value": 1.0, + "unit": "A", + "description": "the average value reported by a temperature sensor" + }, + "stdev": { + "value": 0.1, + "description": "the standard deviation in a set of temperature measurements" + }, + "variance": { + "value": 0.0001, + "description": "the expectation of the squared deviation of a temperature measurement from its mean" + }, + "error_rate": { + "value": 0.00001, + "description": "the chance that the value is outside of the expected range" + }, + "error_length": { + "value": 200, + "description": "the average consecutive values that are outside the stdev" + } + }, + "power": { + "description": "the power output of a drive", + "key": { + "description": "key used in the database to store this measurement", + "value": "power" + }, + "type": { + "description": "some databases (e.g. timescaleDB) need a column type because they don't have a schema less approach, calculation also differs for different types", + "value": "FLOAT" + }, + "min": { + "value": 0.0, + "unit": "W", + "description": "the minimum value reported by a temperature sensor" + }, + "max": { + "value": 300.0, + "unit": "W", + "description": "the maximum value reported by a temperature sensor" + }, + "mean": { + "value": 240.0, + "unit": "W", + "description": "the average value reported by a temperature sensor" + }, + "stdev": { + "value": 5.0, + "description": "the standard deviation in a set of temperature measurements" + }, + "variance": { + "value": 0.001, + "description": "the expectation of the squared deviation of a temperature measurement from its mean" + }, + "error_rate": { + "value": 0.00001, + "description": "the chance that the value is outside of the expected range" + }, + "error_length": { + "value": 200, + "description": "the average consecutive values that are outside the stdev" + } + } + } + } +} \ No newline at end of file diff --git a/src/data_generator/examples/pictures/curve.png b/src/data_generator/examples/pictures/curve.png new file mode 100644 index 0000000..6a41e58 Binary files /dev/null and b/src/data_generator/examples/pictures/curve.png differ diff --git a/src/data_generator/examples/pictures/distribution.png b/src/data_generator/examples/pictures/distribution.png new file mode 100644 index 0000000..c3f8645 Binary files /dev/null and b/src/data_generator/examples/pictures/distribution.png differ diff --git a/src/data_generator/examples/temperature.json b/src/data_generator/examples/temperature.json new file mode 100644 index 0000000..5b748d9 --- /dev/null +++ b/src/data_generator/examples/temperature.json @@ -0,0 +1,71 @@ +{ + "description": "this json file contains a tables and it's structure and how the values in this structure are calculated. 
The key represents the table name (in case the database uses tables)", + "temperature": { + "tags": { + "description": "describes the database structure, tags are indexed columns for most dbs containing information about the 'edge' reporting values but not containing measured values.", + "plant": 100, + "line": 5, + "sensor_id": "id" + }, + "metrics": { + "description": "metrics contain all measured values associated with a tag", + "temperature": { + "description": "temperature measured in a fridge", + "key": { + "description": "key used in the database to store this measurement", + "value": "value" + }, + "type": { + "description": "some databases (e.g. timescaleDB) need a column type because they don't have a schema less approach, calculation also differs for different types", + "value": "FLOAT" + }, + "min": { + "value": 6.0, + "unit": "°C", + "description": "the minimum value reported by a temperature sensor" + }, + "max": { + "value": 7.4, + "unit": "°C", + "description": "the maximum value reported by a temperature sensor" + }, + "mean": { + "value": 6.4, + "unit": "°C", + "description": "the average value reported by a temperature sensor" + }, + "stdev": { + "value": 0.2, + "description": "the standard deviation in a set of temperature measurements" + }, + "variance": { + "value": 0.03, + "description": "the expectation of the squared deviation of a temperature measurement from its mean" + }, + "error_rate": { + "value": 0.005, + "description": "the chance that the value is outside of the expected range" + }, + "error_length": { + "value": 1.08, + "description": "the average consecutive values that are outside the stdev" + } + }, + "button": { + "description": "indicating a pressed button", + "key": { + "description": "key used in the database to store this measurement", + "value": "button_press" + }, + "type": { + "description": "some databases (e.g. timescaleDB) need a column type because they don't have a schema less approach, calculation also differs for different types", + "value": "BOOL" + }, + "true_ratio": { + "description": "how high is the change of this boolean to be true, e.g. 
0,001 means it is on average true for one in a thousand values, or 1 means it's true every time", + "value": 0.001 + } + } + } + } +} \ No newline at end of file diff --git a/src/data_generator/timeseries.sql b/src/data_generator/timeseries.sql new file mode 100644 index 0000000..e613d6f --- /dev/null +++ b/src/data_generator/timeseries.sql @@ -0,0 +1,25 @@ +CREATE TABLE "doc"."timeseries" ( + "ts" TIMESTAMP WITH TIME ZONE, + "g_ts_week" TIMESTAMP WITH TIME ZONE GENERATED ALWAYS AS date_trunc('week', "ts"), + "payload" OBJECT(DYNAMIC) AS ( + "button_press" INT, + "line" INT, + "plant" INT, + "sensor_id" INT, + "value" REAL + ) +) +CLUSTERED INTO 21 SHARDS +PARTITIONED BY ("g_ts_week") +WITH ( + number_of_replicas = 1 +) + +--timescale create table statement +CREATE TABLE timeseries( + ts TIMESTAMP NOT NULL, + ts_week TIMESTAMP NOT NULL, + payload jsonb NOT NULL +); + +SELECT create_hypertable('timeseries', 'ts', 'ts_week', 10); \ No newline at end of file diff --git a/src/modules/__init__.py b/src/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/modules/batch_size_automator.py b/src/modules/batch_size_automator.py new file mode 100644 index 0000000..5d9b7ed --- /dev/null +++ b/src/modules/batch_size_automator.py @@ -0,0 +1,103 @@ +import statistics + +factors = [-1, 1] + + +class BatchSizeAutomator: + def __init__(self, batch_size, ingest_mode, data_batch_size=1): + self.auto_batch_mode = batch_size <= 0 and ingest_mode == 1 + self.data_batch_size = data_batch_size # the size the data set has at minimum for the batch operation + self._set_batch_size((batch_size, 2500)[batch_size <= 0]) + self.batch_times = {"current": { + "size": self.batch_size, + "times": [], + "avg_time": -1, + "batch_per_second": 0.0}, + "best": { + "size": self.batch_size, + "times": [], + "avg_time": -1, + "batch_per_second": 0.0}} + # batch_mode is only run when ingest_mode is set to 1 + self.alpha = 1.0 + # smaller steps than data_batch_size make no sense at this is the smallest batch_size + self.step_size = 500 if 500 > self.data_batch_size else self.data_batch_size + self.test_size = 20 + self.bigger_batch_size = True + self.surveillance_mode = False + + def get_next_batch_size(self): + return self.batch_size + + def insert_batch_time(self, duration): + self.batch_times["current"]["times"].append(duration) + if len(self.batch_times["current"]["times"]) >= self.test_size: + self._calc_better_batch_time() + + def _set_batch_size(self, batch_size): + if batch_size < self.data_batch_size: + # if batch_size goes down to a number smaller than data_batch_size it is set to 1 and the direction + # turned upwards + self.bigger_batch_size = not self.bigger_batch_size + self.batch_size = self.data_batch_size + else: + # the batch_size must always be a multitude of self.data_batch_size + if batch_size % self.data_batch_size != 0: + batch_size = self.data_batch_size * round(batch_size/self.data_batch_size) + self.batch_size = batch_size + + def _calc_better_batch_time(self): + if self._is_current_batch_size_better(): + # if during surveillance_mode the performance changes quit surveillance + # and calculate better batch_size faster + self._adjust_batch_size(True) + else: + # if we were in surveillance mode and the performance got worse we want to readjust the batch_size, + # this means we stop surveillance mode and do a retest over 10 batches with slightly adjusted batch_size. 
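+            # note: in the surveillance branch below, _adjust_batch_size(True) re-baselines "best"
+            # to the just-measured (worse) cycle, so tuning restarts from the current performance
+            # rather than the stale optimum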
+ if self.surveillance_mode: + self._stop_surveillance_mode() + self._adjust_batch_size(True) + else: + self.alpha -= self.alpha / 10 # reduce step_size by 10% each calculation + # if step_size is smaller than 100 no more adjustment necessary. batch_size_automator will go into + # surveillance mode and only check periodically if batch_performance has gotten worse + if self.step_size * self.alpha < 100: + self._start_surveillance_mode() + else: + # if we didn't change into surveillance mode we change the direction of the batch_size adjustment + # and let the automator run with a new batch_size + self.bigger_batch_size = not self.bigger_batch_size # change direction of batch_size adjustment + self._adjust_batch_size(False) + # reset current batch_times for next batch_size + self.batch_times["current"] = {"size": self.batch_size, "times": [], "avg_time": -1} + + def _adjust_batch_size(self, take_current): + if take_current: + self.batch_times["best"] = self.batch_times["current"] + batch_size_change = factors[self.bigger_batch_size] * self.alpha * self.step_size + # there would be no change in real batch size if the change was less than data_batch_size + if abs(batch_size_change) < self.data_batch_size: + batch_size_change = self.data_batch_size + self._set_batch_size(round(self.batch_times["best"]["size"] + batch_size_change)) + + def _start_surveillance_mode(self): + self.test_size = 1000 + self._set_batch_size(self.batch_times["best"]["size"]) + self.surveillance_mode = True + + def _stop_surveillance_mode(self): + self.surveillance_mode = False + self.test_size = 20 + self.alpha = 0.5 + + def _is_current_batch_size_better(self): + current_avg = statistics.mean(self.batch_times["current"]["times"]) + self.batch_times["current"]["avg_time"] = current_avg + best_avg = self.batch_times["best"]["avg_time"] + best_per_second = self.batch_times["best"]["size"] / best_avg + current_per_second = self.batch_times["current"]["size"] / current_avg + self.batch_times["best"]["batch_per_second"] = best_per_second # this is not really necessary + self.batch_times["current"]["batch_per_second"] = current_per_second + current_was_better = current_per_second > best_per_second + # if best_avg is -1 no best batch_size has been calculated yet + return best_avg == -1 or current_was_better diff --git a/src/modules/config.py b/src/modules/config.py new file mode 100644 index 0000000..385bc51 --- /dev/null +++ b/src/modules/config.py @@ -0,0 +1,71 @@ +import os +import time +import os.path + + +class DataGeneratorConfig: + def __init__(self): + # environment variables describing how the data_generator behaves + self.id_start = int(os.getenv("ID_START", 1)) + self.id_end = int(os.getenv("ID_END", 500)) + self.ingest_mode = int(os.getenv("INGEST_MODE", 1)) + self.ingest_size = int(os.getenv("INGEST_SIZE", 100)) + self.ingest_ts = float(os.getenv("INGEST_TS", time.time())) + self.ingest_delta = float(os.getenv("INGEST_DELTA", 0.5)) + self.model_path = os.getenv("MODEL_PATH", "") + self.batch_size = int(os.getenv("BATCH_SIZE", -1)) + self.database = int(os.getenv("DATABASE", 0)) # 0:crate, 1:timescale, 2:influx, 3:mongo + self.stat_delta = int(os.getenv("STAT_DELTA", 30)) + + # environment variables used by multiple database clients + self.host = os.getenv("HOST", "localhost") + self.username = os.getenv("USERNAME", None) + self.password = os.getenv("PASSWORD", None) + self.db_name = os.getenv("DB_NAME", "") + self.table_name = os.getenv("TABLE_NAME", "") + self.partition = os.getenv("PARTITION", "week") + + # 
environment variables to configure cratedb + self.shards = int(os.getenv("SHARDS", 4)) + self.replicas = int(os.getenv("REPLICAS", 0)) + + # environment variables to configure timescaledb + self.port = os.getenv("PORT", "5432") + + # environment variables to connect to influxdb + self.token = os.getenv("TOKEN", "") + self.organization = os.getenv("ORG", "") + self.invalid_configs = [] + + def validate_config(self) -> bool: + if self.id_start < 0: + self.invalid_configs.append(f"ID_START: {self.id_start} < 0") + if self.id_end < 0: + self.invalid_configs.append(f"ID_END: {self.id_end} < 0") + if self.id_end < self.id_start: + self.invalid_configs.append(f"ID_START: {self.id_start} > ID_END: {self.id_end}") + if self.ingest_mode not in [0, 1]: + self.invalid_configs.append(f"INGEST_MODE: {self.ingest_mode} not 0 or 1") + if self.ingest_size < 0: + self.invalid_configs.append(f"INGEST_SIZE: {self.ingest_size} < 0") + if self.ingest_ts < 0: + self.invalid_configs.append(f"INGEST_TS: {self.ingest_ts} < 0") + if self.ingest_delta <= 0: + self.invalid_configs.append(f"INGEST_DELTA: {self.ingest_delta} <= 0") + if not os.path.isfile(self.model_path): + self.invalid_configs.append(f"MODEL_PATH: {self.model_path} does not exist") + if self.database not in [0, 1, 2, 3, 4]: + self.invalid_configs.append(f"DATABASE: {self.database} not 0, 1, 2, 3 or 4") + if self.stat_delta <= 0: + self.invalid_configs.append(f"STAT_DELTA: {self.stat_delta} <= 0") + if self.partition.lower() not in ["second", "minute", "hour", "day", "week", "month", "quarter", "year"]: + self.invalid_configs.append(f"PARTITION: {self.partition} not one of second, minute, hour, day, week, " + f"month, quarter or year") + if self.shards <= 0: + self.invalid_configs.append(f"SHARDS: {self.shards} <= 0") + if self.replicas < 0: + self.invalid_configs.append(f"REPLICAS: {self.replicas} < 0") + if int(self.port) <= 0: + self.invalid_configs.append(f"PORT: {self.port} <= 0") + + return len(self.invalid_configs) == 0 diff --git a/src/modules/crate_db_writer.py b/src/modules/crate_db_writer.py new file mode 100644 index 0000000..b0463e9 --- /dev/null +++ b/src/modules/crate_db_writer.py @@ -0,0 +1,40 @@ +from modules.db_writer import DbWriter +from crate import client + + +class CrateDbWriter(DbWriter): + def __init__(self, host, username, password, model, table_name=None, shards=None, replicas=None, partition="week"): + super().__init__() + self.conn = client.connect(host, username=username, password=password) + self.cursor = self.conn.cursor() + self.model = model + self.table_name = (table_name, self._get_model_table_name())[table_name is None or table_name == ""] + self.shards = (shards, 21)[shards is None] + self.replicas = (replicas, 1)[replicas is None] + self.partition = partition + + def close_connection(self): + self.cursor.close() + self.conn.close() + + def prepare_database(self): + stmt = f"""CREATE TABLE IF NOT EXISTS {self.table_name} ("ts" TIMESTAMP WITH TIME ZONE, +"g_ts_{self.partition}" TIMESTAMP WITH TIME ZONE GENERATED ALWAYS AS date_trunc('{self.partition}', "ts"), +"payload" OBJECT(DYNAMIC)) +CLUSTERED INTO {self.shards} SHARDS +PARTITIONED BY ("g_ts_{self.partition}") +WITH (number_of_replicas = {self.replicas})""" + self.cursor.execute(stmt) + + def insert_stmt(self, timestamps, batch): + stmt = f"""INSERT INTO {self.table_name} (ts, payload) (SELECT col1, col2 FROM UNNEST(?,?))""" + self.cursor.execute(stmt, (timestamps, batch)) + + def execute_query(self, query): + self.cursor.execute(query) + return 
self.cursor.fetchall() + + def _get_model_table_name(self): + for key in self.model.keys(): + if key != "description": + return key diff --git a/src/modules/db_writer.py b/src/modules/db_writer.py new file mode 100644 index 0000000..42929ea --- /dev/null +++ b/src/modules/db_writer.py @@ -0,0 +1,18 @@ +class DbWriter: + def __init__(self): + pass + + def connect_to_database(self): + pass + + def close_connection(self): + pass + + def prepare_database(self): + pass + + def insert_stmt(self, timestamps, batch): + pass + + def execute_query(self, query): + pass diff --git a/src/modules/edge.py b/src/modules/edge.py new file mode 100644 index 0000000..89a518a --- /dev/null +++ b/src/modules/edge.py @@ -0,0 +1,165 @@ +import random + + +factors = [-1, 1] + + +class Edge: + def __init__(self, identifier, tags, edge_model): + self.id = identifier + self.tags = tags + self.edge_model = edge_model + self.sensors = [] + self.payload = {} + self._init_sensors() + + def _init_sensors(self): + for key, value in self.edge_model.items(): + sensor_type = value["type"]["value"].lower() + if sensor_type == "float": + self.sensors.append(FloatSensor(value)) + elif sensor_type == "bool": + self.sensors.append(BoolSensor(value)) + else: + raise NotImplementedError("only FLOAT and BOOL Type have been implemented") + + def calculate_next_value(self): + if self.payload == {}: + self._assign_tag_values() + + for sensor in self.sensors: + self.payload[sensor.get_key()] = sensor.calculate_next_value() + + # a copy of payload is returned so we don't overwrite the previously returned values + return dict(self.payload) + + def _assign_tag_values(self): + items = list(self.tags.items()) + elements_identifier = 0 + for i in range(len(items) - 1, -1, -1): + key = items[i][0] + value = items[i][1] + if value == "id": + self.payload[key] = self.id + else: + if isinstance(value, list): + identifiers = value + else: + identifiers = list(range(0, value)) + + if elements_identifier == 0: + self.payload[key] = identifiers[(self.id - 1) % len(identifiers)] + else: + self.payload[key] = identifiers[int((self.id - 1) / elements_identifier) % len(identifiers)] + + elements_identifier += len(identifiers) + + +class Sensor: + def __init__(self, model): + self.model = model + + def get_key(self): + return self.model["key"]["value"] + + +class FloatSensor(Sensor): + def __init__(self, model): + super().__init__(model) + self.value_count = 0 + self.error_count = 0 + self.last_none_error_value = 0 + self.mean = model["mean"]["value"] + self.minimum = model["min"]["value"] + self.maximum = model["max"]["value"] + self.standard_deviation = model["stdev"]["value"] + self.error_rate = model["error_rate"]["value"] + self.error_length = model["error_length"]["value"] + self.variance = model["variance"]["value"] + self.current_error = False + self.value = round(random.uniform(self.mean - self.variance, self.mean + self.variance), 2) + + def calculate_next_value(self): + # if the last value has been an error value we first calculate if the next value is also an error + # the chance for that is stored in the error_length variable which is a percentage value of how big + # the chance is for the next value to be an error + # each time the length is reduced by one, smallest chance is set to 0.1 + if self.current_error: + self.error_length -= 1 + self.error_rate = self.error_length + if self.error_rate < 0.01: + self.error_rate = 0.01 + + # this calculates if the next value is an error it takes the percentage of the error_rate variable and + # 
multiplies it by 1000 and then checks if a random value in range 0 - 1000 is below the resulting value + is_error = random.randint(0, 1000) < (self.error_rate * 1000) + + # if the next value is not an error the new value is calculated and the error variables reset + # otherwise a new error is calculated + if not is_error: + self._new_value() + if self.current_error: + self.current_error = False + self.error_rate = self.model["error_rate"]["value"] + self.error_length = self.model["error_length"]["value"] + else: + # to continue the good values where they ended the last time we save the last good value + if not self.current_error: + self.last_none_error_value = self.value + self._new_error_value() + + return self.value + + def _new_value(self): + self.value_count += 1 + + # value change is calculated by adding a value within the variance range to the current value + # by multiplying `factors[random.randint(0,1)]` to the value_change variable it is either + # added or subtracted from the last value + value_change = random.uniform(0, self.variance) + + # chance of going up or down is also influenced how far from the mean we are + factor = factors[self._decide_factor()] + # last value has been an error + if self.current_error: + self.value = self.last_none_error_value + (value_change * factor) + else: + self.value += value_change * factor + + def _decide_factor(self): + if self.value > self.mean: + distance = self.value - self.mean + continue_direction = 1 + change_direction = 0 + else: + distance = self.mean - self.value + continue_direction = 0 + change_direction = 1 + chance = (50 * self.standard_deviation) - distance + + return continue_direction if random.randint(0, (100 * self.standard_deviation)) < chance else change_direction + + def _new_error_value(self): + self.error_count += 1 + + # if the next value is a consecutive error it is basically calculated in a similar way to a new value but + # within the bounds of the respective error margin (upper or lower error) + # otherwise a new error is calculated and chosen randomly from the upper or lower values + if not self.current_error: + if self.value < self.mean: + self.value = round(random.uniform(self.minimum, self.mean - self.standard_deviation), 2) + else: + self.value = round(random.uniform(self.mean + self.standard_deviation, self.maximum), 2) + self.current_error = True + else: + value_change = round(random.uniform(0, self.variance), 2) + self.value += value_change * factors[random.randint(0, 1)] + + +class BoolSensor(Sensor): + def __init__(self, model): + super().__init__(model) + self.true_ratio = self.model["true_ratio"]["value"] + + def calculate_next_value(self): + return random.randint(0, (1 / self.true_ratio)) < 1 diff --git a/src/modules/helper.py b/src/modules/helper.py new file mode 100644 index 0000000..b007d5b --- /dev/null +++ b/src/modules/helper.py @@ -0,0 +1,35 @@ +import time + + +tic_toc = {} +tic_toc_delta = {} + + +def execute_timed_function(function, *args, do_print=False): + # this helper function measures the execution time of a function and saves the value to a dictionary + # all execution times are stored in a list accessible by the function name, this enables to calculate an + # average function runtime + tic = time.time() # starting time measurement + function_return = function(*args) + toc = time.time() - tic # stopping time measurement + + if function.__name__ not in tic_toc: + tic_toc[function.__name__] = [] + + tic_toc[function.__name__].append(toc) + + if function.__name__ not in tic_toc_delta: + 
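+        # tic_toc_delta mirrors tic_toc, but reset_delta() clears it again, so timings can also be
+        # evaluated for the most recent interval only instead of the whole run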
tic_toc_delta[function.__name__] = [] + + tic_toc_delta[function.__name__].append(toc) + + if do_print: + # print(function.__name__ + " took: " + str(toc) + " seconds") + print("average time for " + function.__name__ + ": " + + str(sum(tic_toc[function.__name__]) / len(tic_toc[function.__name__]))) + + return function_return + + +def reset_delta(): + tic_toc_delta.clear() diff --git a/src/modules/influx_db_writer.py b/src/modules/influx_db_writer.py new file mode 100644 index 0000000..1a87dbe --- /dev/null +++ b/src/modules/influx_db_writer.py @@ -0,0 +1,72 @@ +from modules import helper +from modules.db_writer import DbWriter +from influxdb_client import InfluxDBClient, Bucket +from influxdb_client.client.write_api import ASYNCHRONOUS, Point +from datetime import datetime + + +class InfluxDbWriter(DbWriter): + def __init__(self, host, token, org, model, database_name=None): + super().__init__() + self.client = InfluxDBClient(url=host, token=token) + self.write_api = self.client.write_api(write_options=ASYNCHRONOUS) + self.query_api = self.client.query_api() + self.org = org + self.model = model + self.bucket = None + self.database_name = (database_name, self._get_model_database_name())[database_name is None or database_name == ""] + + def close_connection(self): + self.client.close() + + def prepare_database(self): + buckets = self.client.buckets_api().find_buckets() + for bucket in buckets.buckets: + if bucket.name == self.database_name: + self.bucket = bucket + + if self.bucket is None: + bucket = Bucket(name=self.database_name, + org_id=self.org, + retention_rules=[]) + self.bucket = self.client.buckets_api().create_bucket(bucket) + + def insert_stmt(self, timestamps, batch): + data = helper.execute_timed_function(self._prepare_influx_stmt, timestamps, batch) + self.write_api.write(bucket=self.database_name, org=self.org, record=data) + + def _prepare_influx_stmt(self, timestamps, batch): + data = [] + tags, metrics = self._get_tags_and_metrics() + for i in range(0, len(batch)): + t = datetime.fromtimestamp(timestamps[i] / 1000) + point = Point(self.database_name).time(t) + for tag in tags: + point.tag(tag, batch[i][tag]) + for metric in metrics: + point.field(metric, batch[i][metric]) + data.append(point) + + return data + + def execute_query(self, query): + return self.query_api.query(query) + + def _get_tags_and_metrics(self): + key = self._get_model_database_name() + tags_ = self.model[key]["tags"] + metrics_ = self.model[key]["metrics"] + tags = [] + metrics = [] + for key, value in tags_.items(): + if key != "description": + tags.append(key) + for key, value in metrics_.items(): + if key != "description": + metrics.append(value["key"]["value"]) + return tags, metrics + + def _get_model_database_name(self): + for key in self.model.keys(): + if key != "description": + return key diff --git a/src/modules/mongo_db_writer.py b/src/modules/mongo_db_writer.py new file mode 100644 index 0000000..821a0ae --- /dev/null +++ b/src/modules/mongo_db_writer.py @@ -0,0 +1,65 @@ +from modules import helper +from modules.db_writer import DbWriter +from pymongo import MongoClient +from pymongo import CursorType +from datetime import datetime + + +class MongoDbWriter(DbWriter): + def __init__(self, host, username, password, database_name, model): + super().__init__() + self.model = model + if host == "localhost": + connection_string = f"""mongodb://{username}:{password}@{host}""" + else: + connection_string = f"""mongodb+srv://{username}:{password}@{host}""" + + self.client = 
MongoClient(connection_string) + self.db = self.client[database_name] + self.collection = self.db[self._get_model_collection_name()] + + def close_connection(self): + self.client.close() + + def insert_stmt(self, timestamps, batch): + data = helper.execute_timed_function(self._prepare_mongo_stmt, timestamps, batch) + self.collection.insert_many(data) + + def _prepare_mongo_stmt(self, timestamps, batch): + data = [] + tags, metrics = self._get_tags_and_metrics() + for i in range(0, len(batch)): + t = datetime.fromtimestamp(timestamps[i] / 1000) + document = {"measurement": self._get_model_collection_name(), + "date": t, + "tags": {}, + "metrics": {}} + for tag in tags: + document["tags"][tag] = batch[i][tag] + for metric in metrics: + document["metrics"][metric] = batch[i][metric] + data.append(document) + return data + + def execute_query(self, query): + cursor = self.collection.find(query, cursor_type=CursorType.EXHAUST) + return list(cursor) + + def _get_tags_and_metrics(self): + key = self._get_model_collection_name() + tags_ = self.model[key]["tags"] + metrics_ = self.model[key]["metrics"] + tags = [] + metrics = [] + for key, value in tags_.items(): + if key != "description": + tags.append(key) + for key, value in metrics_.items(): + if key != "description": + metrics.append(value["key"]["value"]) + return tags, metrics + + def _get_model_collection_name(self): + for key in self.model.keys(): + if key != "description": + return key diff --git a/src/modules/postgres_db_writer.py b/src/modules/postgres_db_writer.py new file mode 100644 index 0000000..2effbb1 --- /dev/null +++ b/src/modules/postgres_db_writer.py @@ -0,0 +1,77 @@ +import psycopg2 +import psycopg2.extras +from modules.db_writer import DbWriter +from modules import helper +from datetime import datetime +from datetime_truncate import truncate + + +class PostgresDbWriter(DbWriter): + def __init__(self, host, port, username, password, db_name, model, table_name=None, partition="week"): + super().__init__() + self.conn = psycopg2.connect(dbname=db_name, user=username, password=password, host=host, port=port) + self.cursor = self.conn.cursor() + self.model = model + self.table_name = (table_name, self._get_model_table_name())[table_name is None or table_name == ""] + self.partition = partition + + def close_connection(self): + self.cursor.close() + self.conn.close() + + def prepare_database(self): + columns = self._get_tags_and_metrics() + stmt = f"""CREATE TABLE IF NOT EXISTS {self.table_name} ( +ts TIMESTAMP NOT NULL, +ts_{self.partition} TIMESTAMP NOT NULL, +""" + for key, value in columns.items(): + stmt += f"""{key} {value},""" + stmt = stmt.rstrip(",") + ");" + + self.cursor.execute(stmt) + self.conn.commit() + + def insert_stmt(self, timestamps, batch): + stmt = helper.execute_timed_function(self._prepare_postgres_stmt, timestamps, batch) + self.cursor.execute(stmt) + self.conn.commit() + + def _prepare_postgres_stmt(self, timestamps, batch): + columns = self._get_tags_and_metrics().keys() + stmt = f"""INSERT INTO {self.table_name} (ts, ts_{self.partition},""" + for column in columns: + stmt += f"""{column}, """ + + stmt = stmt.rstrip(", ") + ") VALUES" + for i in range(0, len(batch)): + t = datetime.fromtimestamp(timestamps[i] / 1000) + trunc = truncate(t, self.partition) + stmt = f"""{stmt} ('{t}', '{trunc}', """ + for column in columns: + stmt += f"""'{batch[i][column]}',""" + stmt = stmt.rstrip(",") + ")," + stmt = stmt.rstrip(",") + return stmt + + def execute_query(self, query): + self.cursor.execute(query) + return 
self.cursor.fetchall() + + def _get_tags_and_metrics(self): + key = self._get_model_table_name() + tags = self.model[key]["tags"] + metrics = self.model[key]["metrics"] + columns = {} + for key, value in tags.items(): + if key != "description": + columns[key] = "INTEGER" + for key, value in metrics.items(): + if key != "description": + columns[value["key"]["value"]] = value["type"]["value"] + return columns + + def _get_model_table_name(self): + for key in self.model.keys(): + if key != "description": + return key diff --git a/src/modules/timescale_db_writer.py b/src/modules/timescale_db_writer.py new file mode 100644 index 0000000..c923be1 --- /dev/null +++ b/src/modules/timescale_db_writer.py @@ -0,0 +1,83 @@ +import psycopg2 +import psycopg2.extras +from modules.db_writer import DbWriter +from modules import helper +from datetime import datetime +from datetime_truncate import truncate + + +class TimescaleDbWriter(DbWriter): + def __init__(self, host, port, username, password, ts_db_name, model, table_name=None, partition="week"): + super().__init__() + self.conn = psycopg2.connect(dbname=ts_db_name, user=username, password=password, host=host, port=port) + self.cursor = self.conn.cursor() + self.model = model + self.table_name = (table_name, self._get_model_table_name())[table_name is None or table_name == ""] + self.partition = partition + + def close_connection(self): + self.cursor.close() + self.conn.close() + + def prepare_database(self): + columns = self._get_tags_and_metrics() + stmt = f"""CREATE TABLE IF NOT EXISTS {self.table_name} ( +ts TIMESTAMP NOT NULL, +ts_{self.partition} TIMESTAMP NOT NULL, +""" + for key, value in columns.items(): + stmt += f"""{key} {value},""" + stmt = stmt.rstrip(",") + ");" + + self.cursor.execute(stmt) + self.conn.commit() + stmt = f"""SELECT create_hypertable('{self.table_name}', 'ts', 'ts_{self.partition}', 10, if_not_exists => true); """ + self.cursor.execute(stmt) + self.conn.commit() + + def insert_stmt(self, timestamps, batch): + stmt = helper.execute_timed_function(self._prepare_timescale_stmt, timestamps, batch) + self.cursor.execute(stmt) + self.conn.commit() + + def _prepare_timescale_stmt(self, timestamps, batch): + columns = self._get_tags_and_metrics().keys() + stmt = f"""INSERT INTO {self.table_name} (ts, ts_{self.partition},""" + for column in columns: + stmt += f"""{column}, """ + + stmt = stmt.rstrip(", ") + ") VALUES" + for i in range(0, len(batch)): + t = datetime.fromtimestamp(timestamps[i] / 1000) + trunc = truncate(t, self.partition) + stmt = f"""{stmt} ('{t}', '{trunc}', """ + for column in columns: + stmt += f"""'{batch[i][column]}',""" + stmt = stmt.rstrip(",") + ")," + stmt = stmt.rstrip(",") + return stmt + + def execute_query(self, query): + self.cursor.execute(query) + return self.cursor.fetchall() + + def _get_tags_and_metrics(self): + key = self._get_model_table_name() + tags = self.model[key]["tags"] + metrics = self.model[key]["metrics"] + columns = {} + for key, value in tags.items(): + if key != "description": + if type(value) == "list": + columns[key] = "TEXT" + else: + columns[key] = "INTEGER" + for key, value in metrics.items(): + if key != "description": + columns[value["key"]["value"]] = value["type"]["value"] + return columns + + def _get_model_table_name(self): + for key in self.model.keys(): + if key != "description": + return key diff --git a/src/query_timer/query_timer.py b/src/query_timer/query_timer.py new file mode 100644 index 0000000..9331b98 --- /dev/null +++ b/src/query_timer/query_timer.py @@ -0,0 
+1,119 @@ +import os +import urllib3 +import statistics +import time +import shutil +import json +import modules.helper as helper +from modules.crate_db_writer import CrateDbWriter +from modules.timescale_db_writer import TimescaleDbWriter +from modules.influx_db_writer import InfluxDbWriter +from modules.mongo_db_writer import MongoDbWriter +from threading import Thread + + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +host = os.getenv("HOST", "localhost:4200") +port = os.getenv("PORT", "5432") +username = os.getenv("dbUser", None) +password = os.getenv("dbPassword", None) +table_name = os.getenv("TABLE_NAME", "timeseries") +database = int(os.getenv("DATABASE", 0)) # 0:crate, 1:timescale, 2:influx +db_name = os.getenv("DB_NAME", "") +token = os.getenv("TOKEN", "") +concurrency = os.getenv("CONCURRENCY", 20) +iterations = os.getenv("ITERATIONS", 50) +quantile_list = os.getenv("QUANTILES", "50,60,75,90,99") +query = os.getenv("QUERY", "SELECT COUNT(*) FROM timeseries;") + +query_results = [] +total_queries = 0 +stop_thread = False +start_time = time.time() +terminal_size = shutil.get_terminal_size() + +if database == 0: # crate + db_writer = CrateDbWriter(host, username, password, table_name) +elif database == 1: # timescale + db_writer = TimescaleDbWriter(host, username, password, table_name, db_name, port) +elif database == 2: # influx + db_writer = InfluxDbWriter(host, token, db_name) +elif database == 3: # mongo + db_writer = MongoDbWriter(host, username, password, db_name) + query = json.loads(query) # mongo uses queries in json format +else: + db_writer = None + + +def print_progressbar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█', print_end="\r"): + """ + Call in a loop to create terminal progress bar + @params: + iteration - Required : current iteration (Int) + total - Required : total iterations (Int) + prefix - Optional : prefix string (Str) + suffix - Optional : suffix string (Str) + decimals - Optional : positive number of decimals in percent complete (Int) + length - Optional : character length of bar (Int) + fill - Optional : bar fill character (Str) + printEnd - Optional : end character (e.g. "\r", "\r\n") (Str) + """ + now = time.time() + percent = ("{0:." 
+ str(decimals) + "f}").format(100 * (iteration / float(total))) + filled_length = int(length * iteration // total) + bar = fill * filled_length + '-' * (length - filled_length) + print('\r%s |%s| %s%% %s %ss' % (prefix, bar, percent, suffix, round(now-start_time, 2)), end=print_end) + # Print New Line on Complete + if iteration == total: + print() + os.get_terminal_size() + + +def start_query_run(): + global total_queries + for x in range(0, iterations): + result = helper.execute_timed_function(db_writer.execute_query, query) + total_queries += 1 + terminal_size_thread = shutil.get_terminal_size() + print_progressbar(total_queries, concurrency * iterations, + prefix='Progress:', suffix='Complete', length=(terminal_size_thread.columns - 40)) + query_results.append(result) + + +def main(): + global start_time + start_time = time.time() + threads = [] + for y in range(0, concurrency): + thread = Thread(target=start_query_run) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + +if __name__ == '__main__': + if concurrency * iterations < 100: + raise ValueError("query_timer needs at least 100 queries (concurrent * iterations) to work properly") + + print(f"""concurrency: {concurrency}\niterations : {iterations}""") + print_progressbar(0, concurrency * iterations, + prefix='Progress:', suffix='Complete', length=(terminal_size.columns - 40)) + + main() + q_list = quantile_list.split(",") + for k, v in helper.tic_toc.items(): + # TODO: define rate, currently it is: iterations per average duration per second times concurrency + print(f"""rate : {round(((iterations * concurrency) / (statistics.mean(v)*1000)) * concurrency, 3)}qs/s""") + print(f"""mean : {round(statistics.mean(v)*1000, 3)}ms""") + print(f"""stdev: {round(statistics.stdev(v)*1000, 3)}ms""") + print(f"""min : {round(min(v)*1000, 3)}ms""") + print(f"""max : {round(max(v)*1000, 3)}ms""") + qus = statistics.quantiles(v, n=100, method="inclusive") + for i in range(0, len(qus)): + if str(i + 1) in q_list: + print(f"""p{i+1} : {round(qus[i]*1000, 3)}ms""") + + # finished diff --git a/src/test/__init__.py b/src/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test/requirements.txt b/src/test/requirements.txt new file mode 100644 index 0000000..1a2177e --- /dev/null +++ b/src/test/requirements.txt @@ -0,0 +1,13 @@ +numpy==1.19.0 +mock==4.0.2 +pytest==5.4.3 +pytest-cov==2.10.0 +dotmap==1.3.17 +crate==0.24.0 +psycopg2-binary==2.8.5 +influxdb_client==1.7.0 +prometheus_client==0.7.1 +urllib3==1.25.9 +datetime_truncate==1.1.0 +pymongo==3.10.1 +flake8==3.8.3 \ No newline at end of file diff --git a/src/test/test_batch_size_automator.py b/src/test/test_batch_size_automator.py new file mode 100644 index 0000000..caecffa --- /dev/null +++ b/src/test/test_batch_size_automator.py @@ -0,0 +1,222 @@ +from modules.batch_size_automator import BatchSizeAutomator + + +def test_is_auto_batch_mode(): + """ + This function tests if the auto_batch_mode of the BatchSizeAutomator is correctly caluclated. 
+ + Test Case 1: batch_size is bigger than 0 and => auto_batch_mode is False + batch_size: 2000 + ingest_mode: 1 + -> auto_batch_mode: False + + Test Case 2: batch_size is 0 and ingest_mode is 1 => auto_batch_mode is True + batch_size: 0 + ingest_mode: 1 + -> auto_batch_mode: True + + Test Case 3: batch_size is 2000 and ingest_mode is 0 => auto_batch_mode is False + batch_size: 2000 + ingest_mode: 0 + -> auto_batch_mode: False + + Test Case 4: batch_size is 0 and ingest_mode is 0 => auto_batch_mode is False + batch_size: 0 + ingest_mode: 0 + -> auto_batch_mode: False + + Test Case 5: batch_size is -1000 and ingest_mode is 1 => auto_batch_mode is True + batch_size: -1000 + ingest_mode: 1 + -> auto_batch_mode: True + """ + # Test Case 1: + batch_size_automator = BatchSizeAutomator(2000, 1) + assert not batch_size_automator.auto_batch_mode + # Test Case 2: + batch_size_automator = BatchSizeAutomator(0, 1) + assert batch_size_automator.auto_batch_mode + # Test Case 3: + batch_size_automator = BatchSizeAutomator(2000, 0) + assert not batch_size_automator.auto_batch_mode + # Test Case 4: + batch_size_automator = BatchSizeAutomator(0, 0) + assert not batch_size_automator.auto_batch_mode + # Test Case 5: + batch_size_automator = BatchSizeAutomator(-1000, 1) + assert batch_size_automator.auto_batch_mode + + +def test_insert_batch_time_normal(): + """ + This function tests for the correct behavior of the BatchSizeAutomator when using the insert_batch_time function + + Pre-Condition: A BatchSizeAutomator is created with batch_size 0 and ingest_mode 1 -> auto_batch_mode = True + get_next_batch_size() returns 2500 + + Test Case 1: For one full test_cycle (test_size) insert_batch_time is called with duration 10 + duration: 10 + times: test_size + -> batch_times["best"]["avg_time"]: 10 + -> batch_times["best"]["batch_per_second"]: 250 + -> get_next_batch_size(): 3000 + + Test Case 2: For one full test_cycle (test_size) insert_batch_time is called with duration 5 + because batch_size is calculated with a reduced step_size each cycle we calculate it as well + as we want to test if the direction of the step is correct + duration: 5 + times: test_size + -> batch_times["best"]["avg_time"]: 5 + -> batch_times["best"]["batch_per_second"]: 600 + -> get_next_batch_size(): increased by step_size * alpha + + Test Case 3: For one full test_cycle (test_size) insert_batch_time is called with duration 7 + avg_time and batch_per_second for best stays the same but new batch_size is decreased from the + last batch_size + duration: 5 + times: test_size + -> batch_times["best"]["avg_time"]: 5 + -> batch_times["best"]["batch_per_second"]: 600 + -> get_next_batch_size(): decreased by step_size * alpha + + Test Case 4: For one full test_cycle (test_size) insert_batch_time is called with duration 4 + avg_time and batch_per_second for best are updated and batch_size is further decreased + duration: 5 + times: test_size + -> batch_times["best"]["avg_time"]: 4 + -> batch_times["best"]["batch_per_second"]: batch_size / 4 + -> get_next_batch_size(): decreased by step_size * alpha + + Test Case 5: Until BatchSizeAutomator switches to surveillance mode insert_batch_time is called with duration 5 + Afterwards for one full test_cycle (test_size) insert_batch_time is called with duration 3 + As performance is better we stay in surveillance mode + duration: 5 + times: until surveillance_mode True + duration: 3 + times: test_size + -> batch_times["best"]["avg_time"]: 3 + -> batch_times["best"]["batch_per_second"]: batch_size / 3 + -> 
surveillance_mode: True + + Test Case 6: For one full test_cycle (test_size) insert_batch_time is called with duration 5 + duration: 5 + times: test_size + -> surveillance_mode: True + """ + # Pre-Condition: + batch_size_automator = BatchSizeAutomator(0, 1) + initial_batch_size = 2500 + assert batch_size_automator.get_next_batch_size() == initial_batch_size + + # Test Case 1: + # ten will be the first best time + duration = 10 + for i in range(0, batch_size_automator.test_size): + batch_size_automator.insert_batch_time(duration) + + assert batch_size_automator.batch_times["best"]["avg_time"] == duration + assert batch_size_automator.batch_times["best"]["batch_per_second"] == initial_batch_size / duration + # because for the next iteration batch_size is increased by 500 + initial_step_size = 500 + assert batch_size_automator.get_next_batch_size() == initial_batch_size + initial_step_size + + # Test Case 2: + # five will be the first best time making 3000 the best batchsize + duration = 5 + for i in range(0, batch_size_automator.test_size): + batch_size_automator.insert_batch_time(duration) + current_batch_size = initial_batch_size + initial_step_size + current_batch_per_second = current_batch_size / duration + + assert batch_size_automator.batch_times["best"]["avg_time"] == duration + assert batch_size_automator.batch_times["best"]["batch_per_second"] == current_batch_per_second + batch_size = 3000 + batch_size_automator.step_size * batch_size_automator.alpha + assert batch_size_automator.get_next_batch_size() == batch_size + + # Test Case 3: + # next is worse + duration = 7 + for i in range(0, batch_size_automator.test_size): + batch_size_automator.insert_batch_time(duration) + + assert batch_size_automator.batch_times["best"]["avg_time"] == 5 # last duration + assert batch_size_automator.batch_times["best"]["batch_per_second"] == current_batch_per_second + # batch_size is decreased this time because no better value was found and calculated based on best_size + batch_size = batch_size_automator.batch_times["best"]["size"] - batch_size_automator.step_size * batch_size_automator.alpha + assert batch_size_automator.get_next_batch_size() == batch_size + + # Test Case 4: + # next is best + duration = 4 + for i in range(0, batch_size_automator.test_size): + batch_size_automator.insert_batch_time(duration) + + assert batch_size_automator.batch_times["best"]["avg_time"] == duration + assert batch_size_automator.batch_times["best"]["batch_per_second"] == batch_size_automator.batch_times["best"]["size"] / duration + # batch_size is further decreased + batch_size = batch_size - batch_size_automator.step_size * batch_size_automator.alpha + assert batch_size_automator.get_next_batch_size() == batch_size + + # Test Case 5: + # we insert batch times until we switch to surveillance mode + duration = 5 + while not batch_size_automator.surveillance_mode: + batch_size_automator.insert_batch_time(duration) + + # for the next surveillance period we have no change in best batch size + duration = 3 + for i in range(0, batch_size_automator.test_size): + batch_size_automator.insert_batch_time(duration) + + assert batch_size_automator.batch_times["best"]["avg_time"] == duration + assert batch_size_automator.batch_times["best"]["batch_per_second"] == batch_size_automator.batch_times["best"]["size"] / duration + assert batch_size_automator.surveillance_mode + + # Test Case 6: + # performance is worse so we switch out of surveillance mode and try to find best batch size again + duration = 5 + for i in range(0, 
batch_size_automator.test_size): + batch_size_automator.insert_batch_time(duration) + + assert not batch_size_automator.surveillance_mode + + +def test_insert_batch_time_smallest_batch(): + """ + This function tests if batch_size is correctly adjusted before getting smaller than 1 + + Pre Condition: BatchSizeAutomator with batch_size 0 and ingest_mode 1 -> auto_batch_mode is True + + Test Case 1: At first we create a baseline for our test where a batch takes 10000 on average + For the next cycle a batch takes 100000 on average so the batch_size adjustment is reversed to produce smaller + batch sizes each cycle + Next we decrease the duration so the average goes down each test cycle and batch_size is decreased further + We stop when batch_size is set to 1, which means we went below 0 and the next batch_size should be increased + -> bigger_batch_size: True + """ + # Pre Condition: + batch_size_automator = BatchSizeAutomator(0, 1) + + # Test Case 1: + # batch size will allways get smaller until we are under 1 + # first we have a baseline + long_duration = 10000 + for i in range(0, batch_size_automator.test_size): + batch_size_automator.insert_batch_time(long_duration) + + # then we get worse so direction of optimization is reversed + worse_duration = 100000 + for i in range(0, batch_size_automator.test_size): + batch_size_automator.insert_batch_time(worse_duration) + + # now we get better each time until we reach batch_size 1 + duration = (batch_size_automator.batch_size / batch_size_automator.batch_times["best"]["batch_per_second"]) - 10 + batch_size = batch_size_automator.batch_size + while batch_size_automator.batch_size != 1: + if batch_size != batch_size_automator.batch_size: + duration = (batch_size_automator.batch_size / batch_size_automator.batch_times["best"]["batch_per_second"]) - 10 + batch_size = batch_size_automator.batch_size + batch_size_automator.insert_batch_time(duration) + + # once we reached a negative batch_size and it was reset to 1 the batch_size should get bigger again + assert batch_size_automator.bigger_batch_size diff --git a/src/test/test_config.py b/src/test/test_config.py new file mode 100644 index 0000000..0af5310 --- /dev/null +++ b/src/test/test_config.py @@ -0,0 +1,376 @@ +import pytest +import time +import os +import mock +import os.path +from modules.config import DataGeneratorConfig + + +def test_config_constructor_no_env_set(): + config = DataGeneratorConfig() + assert config.id_start == 1 + assert config.id_end == 500 + assert config.ingest_mode == 1 + assert config.ingest_size == 100 + assert config.ingest_ts == pytest.approx(time.time(), abs=0.3) + assert config.ingest_delta == 0.5 + assert config.model_path == "" + assert config.batch_size == -1 + assert config.database == 0 + assert config.stat_delta == 30 + + assert config.host == "localhost" + assert config.username == None + assert config.password == None + assert config.db_name == "" + assert config.table_name == "" + assert config.partition == "week" + + assert config.shards == 4 + assert config.replicas == 0 + + assert config.port == "5432" + + assert config.token == "" + assert config.organization == "" + + +def test_config_constructor_env_id_start_set(): + test_id_start = 10 + os.environ["ID_START"] = str(test_id_start) + config = DataGeneratorConfig() + assert config.id_start == test_id_start + del os.environ["ID_START"] + + +def test_config_constructor_env_id_end_set(): + test_id_end = 50 + os.environ["ID_END"] = str(test_id_end) + config = DataGeneratorConfig() + assert config.id_end == 
test_id_end + del os.environ["ID_END"] + + +def test_config_constructor_env_ingest_mode_set(): + test_ingest_mode = 0 + os.environ["INGEST_MODE"] = str(test_ingest_mode) + config = DataGeneratorConfig() + assert config.ingest_mode == test_ingest_mode + del os.environ["INGEST_MODE"] + + +def test_config_constructor_env_ingest_size_set(): + test_ingest_size = 1000 + os.environ["INGEST_SIZE"] = str(test_ingest_size) + config = DataGeneratorConfig() + assert config.ingest_size == test_ingest_size + del os.environ["INGEST_SIZE"] + + +def test_config_constructor_env_ingest_ts_set(): + ts = time.time() + os.environ["INGEST_TS"] = str(ts) + config = DataGeneratorConfig() + assert config.ingest_ts == ts + del os.environ["INGEST_TS"] + + +def test_config_constructor_env_ingest_delta_set(): + test_ingest_delta = 10 + os.environ["INGEST_DELTA"] = str(test_ingest_delta) + config = DataGeneratorConfig() + assert config.ingest_delta == test_ingest_delta + del os.environ["INGEST_DELTA"] + + +def test_config_constructor_env_model_path_set(): + test_path = "test/path" + os.environ["MODEL_PATH"] = test_path + config = DataGeneratorConfig() + assert config.model_path == test_path + del os.environ["MODEL_PATH"] + + +def test_config_constructor_env_batch_size_set(): + test_batch_size = 100 + os.environ["BATCH_SIZE"] = str(test_batch_size) + config = DataGeneratorConfig() + assert config.batch_size == test_batch_size + del os.environ["BATCH_SIZE"] + + +def test_config_constructor_env_database_set(): + test_database = 3 + os.environ["DATABASE"] = str(test_database) + config = DataGeneratorConfig() + assert config.database == test_database + del os.environ["DATABASE"] + + +def test_config_constructor_env_stat_delta_set(): + test_stat_delta = 60 + os.environ["STAT_DELTA"] = str(test_stat_delta) + config = DataGeneratorConfig() + assert config.stat_delta == test_stat_delta + del os.environ["STAT_DELTA"] + + +def test_config_constructor_env_host_set(): + test_host = "test/host" + os.environ["HOST"] = test_host + config = DataGeneratorConfig() + assert config.host == test_host + del os.environ["HOST"] + + +def test_config_constructor_env_username_set(): + test_username = "testUsername" + os.environ["USERNAME"] = test_username + config = DataGeneratorConfig() + assert config.username == test_username + del os.environ["USERNAME"] + + +def test_config_constructor_env_password_set(): + test_password = "password" + os.environ["PASSWORD"] = test_password + config = DataGeneratorConfig() + assert config.password == test_password + del os.environ["PASSWORD"] + + +def test_config_constructor_env_db_name_set(): + test_db_name = "dbName" + os.environ["DB_NAME"] = test_db_name + config = DataGeneratorConfig() + assert config.db_name == test_db_name + del os.environ["DB_NAME"] + + +def test_config_constructor_env_table_name_set(): + test_table_name = "testTableName" + os.environ["TABLE_NAME"] = test_table_name + config = DataGeneratorConfig() + assert config.table_name == test_table_name + del os.environ["TABLE_NAME"] + + +def test_config_constructor_env_partition_set(): + test_partition = "day" + os.environ["PARTITION"] = test_partition + config = DataGeneratorConfig() + assert config.partition == test_partition + del os.environ["PARTITION"] + + +def test_config_constructor_env_shards_set(): + test_shards = 4 + os.environ["SHARDS"] = str(test_shards) + config = DataGeneratorConfig() + assert config.shards == test_shards + del os.environ["SHARDS"] + + +def test_config_constructor_env_replicas_set(): + test_replicas = 2 + 
os.environ["REPLICAS"] = str(test_replicas) + config = DataGeneratorConfig() + assert config.replicas == test_replicas + del os.environ["REPLICAS"] + + +def test_config_constructor_env_port_set(): + test_port = "1234" + os.environ["PORT"] = test_port + config = DataGeneratorConfig() + assert config.port == test_port + del os.environ["PORT"] + + +def test_config_constructor_env_token_set(): + test_token = "testToken" + os.environ["TOKEN"] = test_token + config = DataGeneratorConfig() + assert config.token == test_token + del os.environ["TOKEN"] + + +def test_config_constructor_env_organization_set(): + test_organization = "testOrganization" + os.environ["ORG"] = test_organization + config = DataGeneratorConfig() + assert config.organization == test_organization + del os.environ["ORG"] + + +@mock.patch("os.path.isfile") +def test_validate_config_default_true(mock_isfile): + mock_isfile.return_value = True + config = DataGeneratorConfig() + assert config.validate_config() + + +@mock.patch("os.path.isfile") +def test_validate_config_id_start_false(mock_isfile): + mock_isfile.return_value = True + test_id_start = -1 + config = DataGeneratorConfig() + config.id_start = test_id_start + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "ID_START" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_id_end_false(mock_isfile): + mock_isfile.return_value = True + test_id_end = -1 + config = DataGeneratorConfig() + config.id_end = test_id_end + assert not config.validate_config() + assert len(config.invalid_configs) == 2 # id_end is also smaller than id_start + assert "ID_END" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_id_start_id_end_false(mock_isfile): + mock_isfile.return_value = True + test_id_start = 100 + test_id_end = 50 + config = DataGeneratorConfig() + config.id_start = test_id_start + config.id_end = test_id_end + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "ID_START" in config.invalid_configs[0] + assert "ID_END" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_ingest_mode_false(mock_isfile): + mock_isfile.return_value = True + test_ingest_mode = -1 + config = DataGeneratorConfig() + config.ingest_mode = test_ingest_mode + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "INGEST_MODE" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_ingest_size_false(mock_isfile): + mock_isfile.return_value = True + test_ingest_size = -1 + config = DataGeneratorConfig() + config.ingest_size = test_ingest_size + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "INGEST_SIZE" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_ingest_ts_false(mock_isfile): + mock_isfile.return_value = True + test_ingest_ts = -1 + config = DataGeneratorConfig() + config.ingest_ts = test_ingest_ts + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "INGEST_TS" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_ingest_delta_false(mock_isfile): + mock_isfile.return_value = True + test_ingest_delta = -1 + config = DataGeneratorConfig() + config.ingest_delta = test_ingest_delta + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "INGEST_DELTA" in 
config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_model_path_false(mock_isfile): + mock_isfile.return_value = False + config = DataGeneratorConfig() + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "MODEL_PATH" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_database_false(mock_isfile): + mock_isfile.return_value = True + test_database = -1 + config = DataGeneratorConfig() + config.database = test_database + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "DATABASE" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_stat_delta_false(mock_isfile): + mock_isfile.return_value = True + test_stat_delta = -1 + config = DataGeneratorConfig() + config.stat_delta = test_stat_delta + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "STAT_DELTA" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_partition_false(mock_isfile): + mock_isfile.return_value = True + test_partition = "invalid_partition" + config = DataGeneratorConfig() + config.partition = test_partition + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "PARTITION" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_id_start_false(mock_isfile): + mock_isfile.return_value = True + test_id_start = -1 + config = DataGeneratorConfig() + config.id_start = test_id_start + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "ID_START" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_shards_false(mock_isfile): + mock_isfile.return_value = True + test_shards = 0 + config = DataGeneratorConfig() + config.shards = test_shards + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "SHARDS" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_replicas_false(mock_isfile): + mock_isfile.return_value = True + test_replicas = -1 + config = DataGeneratorConfig() + config.replicas = test_replicas + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "REPLICAS" in config.invalid_configs[0] + + +@mock.patch("os.path.isfile") +def test_validate_config_port_false(mock_isfile): + mock_isfile.return_value = True + test_port = "0" + config = DataGeneratorConfig() + config.port = test_port + assert not config.validate_config() + assert len(config.invalid_configs) == 1 + assert "PORT" in config.invalid_configs[0] diff --git a/src/test/test_crate_db_writer.py b/src/test/test_crate_db_writer.py new file mode 100644 index 0000000..69cee38 --- /dev/null +++ b/src/test/test_crate_db_writer.py @@ -0,0 +1,170 @@ +import mock +from crate import client +from modules.crate_db_writer import CrateDbWriter +from test.test_models import test_model, test_model2 + + +@mock.patch.object(client, 'connect', autospec=True) +def test_close_connection(mock_connect): + """ + This function tests if the .close() functions of the self.conn and self.cursor objects is called + + Pre Condition: crate.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + CrateDbWriter is called. 
+ -> mock_connect is called by CrateDbWriter with the values given to the constructor + + Test Case 1: + when calling CrateDbWriter.close_connection() conn.close() and cursor.close() are called + + :param mock_connect: mocked function call from crate.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = CrateDbWriter("localhost:4200", "crate", "password", test_model) + mock_connect.assert_called_with("localhost:4200", username="crate", password="password") + # Test Case 1: + db_writer.close_connection() + conn.close.assert_called() + cursor.close.assert_called() + + +@mock.patch.object(client, 'connect', autospec=True) +def test_prepare_database1(mock_connect): + """ + This function tests if the .prepare_database() function of the db_writer uses the correct values when + creating the database tables + + Pre Condition: crate.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + CrateDbWriter is called. + -> mock_connect is called by CrateDbWriter with the values given to the constructor + + Test Case 1: + when calling CrateDbWriter.prepare_database() the statement given to cursor.execute() contains the correct values: + -> "temperature" is used in stmt as table name + -> "g_ts_week" is used as partitioning column + -> "21 SHARDS" are configured for table + -> "number_of_replicas = 1" is set for table + + :param mock_connect: mocked function call from crate.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = CrateDbWriter("localhost:4200", "crate2", "password2", test_model) + mock_connect.assert_called_with("localhost:4200", username="crate2", password="password2") + # Test Case 1: + db_writer.prepare_database() + stmt = cursor.execute.call_args.args[0] + # table name is in stmt + assert "temperature" in stmt + # partition is default + assert "g_ts_week" in stmt + # shards is default + assert "21 SHARDS" in stmt + # replicas is default + assert "number_of_replicas = 1" in stmt + + +@mock.patch.object(client, 'connect', autospec=True) +def test_prepare_database2(mock_connect): + """ + This function tests if the .prepare_database() function of the db_writer uses the correct values when + creating the database tables + + Pre Condition: crate.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + CrateDbWriter is called.
+ -> mock_connect is called by CrateDbWriter with values given the constructor + + Test Case 1: + A new CrateDbWriter is initialized that overwrites default values used in prepare_database() + -> "table_name" is used in stmt as table name + -> "g_ts_day" is used as partitioning column + -> "3 SHARDS" are configured for table + -> "number_of_replicas = 0" is set for table + + :param mock_connect: mocked function call from crate.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = CrateDbWriter("localhost:4200", "crate3", "password3", test_model2, "table_name", 3, 0, "day") + db_writer.prepare_database() + # Test Case 1: + stmt = cursor.execute.call_args.args[0] + # table name is in stmt + assert "table_name" in stmt + # partition is default + assert "g_ts_day" in stmt + # shards is default + assert "3 SHARDS" in stmt + # replicas is default + assert "number_of_replicas = 0" in stmt + + +@mock.patch.object(client, 'connect', autospec=True) +def test_insert_stmt(mock_connect): + """ + This function tests if the .insert_stmt() functions of CrateDbWriter uses the correct table name and arguments + + Pre Condition: crate.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + CrateDbWriter is called. + + Test Case 1: + when calling CrateDbWriter.insert_stmt() the correct values are used for cursor.execute() + -> stmt contains the correct table name + -> values are equal to the insert_stmt arguments + + :param mock_connect: mocked function call from crate.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = CrateDbWriter("localhost:4200", "crate2", "password2", test_model) + # Test Case 1: + db_writer.insert_stmt([1586327807000], [{"plant": 1, "line": 1, "sensor_id": 1, "value": 6.7, "button_press": False}]) + call_arguments = cursor.execute.call_args.args + stmt = call_arguments[0] + values = call_arguments[1] + assert stmt == "INSERT INTO temperature (ts, payload) (SELECT col1, col2 FROM UNNEST(?,?))" + assert values == ([1586327807000], [{"plant": 1, "line": 1, "sensor_id": 1, "value": 6.7, "button_press": False}]) + + +@mock.patch.object(client, 'connect', autospec=True) +def test_execute_query(mock_connect): + """ + This function tests if the .execute_query() functions of CrateDbWriter uses the correct query + + Pre Condition: crate.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + CrateDbWriter is called. 
+ + Test Case 1: + when calling CrateDbWriter.execute_query() the correct values are used for cursor.execute() + -> cursor.execute is called with argument from execute_query + -> fetchall is called + + :param mock_connect: mocked function call from crate.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = CrateDbWriter("localhost:4200", "crate2", "password2", test_model) + # Test Case 1: + db_writer.execute_query("SELECT * FROM temperature;") + cursor.execute.assert_called_with("SELECT * FROM temperature;") + cursor.fetchall.assert_called() diff --git a/src/test/test_edge.py b/src/test/test_edge.py new file mode 100644 index 0000000..9236184 --- /dev/null +++ b/src/test/test_edge.py @@ -0,0 +1,137 @@ +import pytest +import numpy +import statistics +from modules.edge import Edge, FloatSensor, BoolSensor +from test.test_models import metrics_model_float1_bool1, metrics_model_string, \ + tag_model_plant100_line5_sensorId, float_model, bool_model + + +def test_init_sensors(): + """ + This function tests if the Edge Object is correctly initialized + + Test Case 1: + Edge is initialized with: + id: 1 + tags: valid tag model + metrics: metrics model containing one float and one bool sensor + -> FloatSensor must be in sensor_types + -> BoolSensor must be in sensor_types + + Test Case 2: + Edge is initialized with: + id: 1 + tags: valid tag model + metrics: metrics model containing one string sensor + -> Constructor raises NotImplementedError + """ + # Test Case 1: + edge = Edge(1, tag_model_plant100_line5_sensorId, metrics_model_float1_bool1) + sensor_types = [] + for sensor in edge.sensors: + sensor_types.append(sensor.__class__.__name__) + assert "FloatSensor" in sensor_types + assert "BoolSensor" in sensor_types + + # Test Case 2: + with pytest.raises(NotImplementedError): + Edge(1, tag_model_plant100_line5_sensorId, metrics_model_string) + + +def test_calculate_next_value_edge(): + """ + This function tests if the Edge Object correctly calculates the next value of it's sensors + + Pre Condition: Edge Object created with id 1, a valid tag model and a the metrics_model_float1_bool1 model + + Test Case 1: the first value of the edge object is calculated + -> "plant" tag is in batch + -> "plant" values is 1 + -> "line" tag is in batch + -> "line" value is 1 + -> "sensor_id" tag is in batch + -> "sensor_id" value is 1 + -> "value" metric is in batch + -> "value" value is 6.3 +-0.3 + -> "button_press" is in batch + there is a 1:100 chance that button_press is true based on the model that's why it's value is not tested + + Test Case 2: another 1000 values for the edge object are calculated to see if button_press is at least True once + and value is not the same value each time + -> True is contained in button_press array + -> length of unique values in values array is bigger than 1 + """ + # Pre Condition + edge = Edge(1, tag_model_plant100_line5_sensorId, metrics_model_float1_bool1) + results = [] + # Test Case 1. 
+ batch = edge.calculate_next_value() + assert "plant" in batch + assert batch["plant"] == 0 + assert "line" in batch + assert batch["line"] == 0 + assert "sensor_id" in batch + assert batch["sensor_id"] == 1 + assert "value" in batch + assert batch["value"] == pytest.approx(6.3, abs=0.3) + assert "button_press" in batch + results.append(batch) + + # Test Case 2: + # because button_press has a probability of 1:100 to be True, we do a thousand operations to get True for sure + for i in range(0, 1000): + results.append(edge.calculate_next_value()) + + button_press = [] + values = [] + for result in results: + button_press.append(result["button_press"]) + values.append(result["value"]) + assert True in button_press + assert len(numpy.unique(values)) > 1 + + +def test_calculate_next_value_float(): + """ + This function tests if the FloatSensor produces values that match the model + + Pre Condition: FloatSensor initialized with float_model + + Test Case 1: 10000 values for FloatSensor are created + -> mean of generated values == mean of model +- 0.5 + -> stdev of generated values == stdev of model +- 0.1 + -> error_rate of generated values == error_rate of model +- 0.001 + """ + # Pre Condition: + float_sensor = FloatSensor(float_model) + # we want to test if the values we create match the model we used to initialize the sensor + results = [] + # Test Case 1: + for i in range(0, 100000): + results.append(float_sensor.calculate_next_value()) + mean = statistics.mean(results) + stdev = statistics.stdev(results) + error_rate = float_sensor.error_count / (float_sensor.value_count + float_sensor.error_count) + assert mean == pytest.approx(float_model["mean"]["value"], abs=0.5) + assert stdev == pytest.approx(float_model["stdev"]["value"], abs=0.1) + assert error_rate == pytest.approx(float_model["error_rate"]["value"], abs=0.001) + + +def test_calculate_next_value_bool(): + """ + This function tests if the BoolSensor produces values that match the model + + Pre Condition: BoolSensor initialized with bool_model + + Test Case 1: 10000 values for BoolSensor are created + -> true_ratio of generated values == true_ratio of model +- 0.001 + """ + # Pre Condition: + bool_sensor = BoolSensor(bool_model) + results = [] + # Test Case 1: + for i in range(0, 10000): + results.append(bool_sensor.calculate_next_value()) + sum_true = sum(results) + true_ratio = sum_true / len(results) + assert true_ratio == pytest.approx(bool_model["true_ratio"]["value"], abs=0.001) diff --git a/src/test/test_influx_db_writer.py b/src/test/test_influx_db_writer.py new file mode 100644 index 0000000..d2db02c --- /dev/null +++ b/src/test/test_influx_db_writer.py @@ -0,0 +1,155 @@ +import mock +from dotmap import DotMap +from influxdb_client import Bucket +from influxdb_client.client.write_api import Point +from modules.influx_db_writer import InfluxDbWriter +from test.test_models import test_model + + +@mock.patch("modules.influx_db_writer.InfluxDBClient", autospec=True) +def test_close_connection(mock_client): + """ + This function tests if the .close_connection() function of InfluxDbWriter calls the close() function of self.client + + Pre Condition: InfluxDBClient() returns a Mock Object client + InfluxDbWriter is called. 
+ -> Parameters of InfluxDbWriter match constructor parameters + + Test Case 1: + when calling InfluxDbWriter.close_connection() self.client.close() is called + -> client.close() is called + + :param mock_client: mocked InfluxDBClient class + """ + # Pre Condition: + client = mock.Mock() + mock_client.return_value = client + db_writer = InfluxDbWriter("localhost", "token", "org", test_model) + mock_client.assert_called_with("localhost", token="token") + # Test Case 1 + db_writer.close_connection() + client.close.assert_called() + + +@mock.patch("modules.influx_db_writer.InfluxDBClient", autospec=True) +def test_prepare_database_bucket_exists(mock_client): + """ + This function tests if the .prepare_database() function of InfluxDbWriter loads the correct bucket + + Pre Condition: InfluxDBClient() returns a Mock Object client + client.buckets_api() returns a Mock Object buckets_api + buckets_api.find_buckets() returns a DotMap Object where .buckets returns a list of influx Buckets + InfluxDbWriter is called. + -> Parameters of InfluxDbWriter match constructor parameters + + Test Case 1: + calling InfluxDbWriter.prepare_database() with a already existing Bucket in the Influx DB + -> buckets_api.create_bucket() is not called + + :param mock_client: mocked InfluxDBClient class + """ + # Pre Condition: + client = mock.Mock() + buckets_api = mock.Mock() + client.buckets_api.return_value = buckets_api + mock_client.return_value = client + db_writer = InfluxDbWriter("localhost", "token1", "org1", test_model) + mock_client.assert_called_with("localhost", token="token1") + bucket_list = DotMap() + bucket_list.buckets = [Bucket(name="", retention_rules=[]), Bucket(name="temperature", retention_rules=[])] + buckets_api.find_buckets.return_value = bucket_list + # Test Case 1: + db_writer.prepare_database() + buckets_api.create_bucket.assert_not_called() + + +@mock.patch("modules.influx_db_writer.InfluxDBClient", autospec=True) +def test_prepare_database_bucket_not_exists(mock_client): + """ + This function tests if the .prepare_database() function of InfluxDbWriter loads the correct bucket + + Pre Condition: InfluxDBClient() returns a Mock Object client + client.buckets_api() returns a Mock Object buckets_api + buckets_api.find_buckets() returns a DotMap Object where .buckets returns a list of influx Buckets + InfluxDbWriter is called. 
+ -> Parameters of InfluxDbWriter match constructor parameters + + Test Case 1: + calling InfluxDbWriter.prepare_database() with the matching Bucket not in bucket_list + -> buckets_api.create_bucket() is called + + :param mock_client: mocked InfluxDBClient class + """ + # Pre Condition: + client = mock.Mock() + buckets_api = mock.Mock() + client.buckets_api.return_value = buckets_api + mock_client.return_value = client + db_writer = InfluxDbWriter("localhost", "token2", "org2", test_model) + mock_client.assert_called_with("localhost", token="token2") + bucket_list = DotMap() + bucket_list.buckets = [Bucket(name="x", retention_rules=[]), Bucket(name="y", retention_rules=[])] + buckets_api.find_buckets.return_value = bucket_list + # Test Case 1: + db_writer.prepare_database() + buckets_api.create_bucket.assert_called() + + +@mock.patch("modules.influx_db_writer.InfluxDBClient", autospec=True) +def test_insert_stmt(mock_client): + """ + This function tests if the .insert_stmt() function of InfluxDbWriter uses the correct arguments for write_api.write + + Pre Condition: InfluxDBClient() returns a Mock Object client + client.write_api() returns a Mock Object write_api + InfluxDbWriter is called. + + Test Case 1: + calling InfluxDbWriter.insert_stmt() with one timestamp and one batch and check write parameters + -> org is "org" + -> data is of type list + -> data is of length 1 + -> element in data is of type influxdb_client.Point + + :param mock_client: mocked InfluxDBClient class + """ + # Pre Condition: + client = mock.Mock() + write_api = mock.Mock() + mock_client.return_value = client + client.write_api.return_value = write_api + db_writer = InfluxDbWriter("localhost", "token", "org", test_model) + # Test Case 1: + db_writer.insert_stmt([1586327807000], + [{"plant": 2, "line": 2, "sensor_id": 2, "value": 6.7, "button_press": False}]) + call_arguments = write_api.write.call_args[1] + org = call_arguments["org"] + data = call_arguments["record"] + assert org == "org" + assert isinstance(data, list) + assert len(data) == 1 + assert isinstance(data[0], Point) + + +@mock.patch("modules.influx_db_writer.InfluxDBClient", autospec=True) +def test_execute_query(mock_client): + """ + This function tests if the .execute_query() function of InfluxDbWriter uses the correct arguments + + Pre Condition: InfluxDBClient() returns a Mock Object client + client.query_api() returns a Mock Object query_api + InfluxDbWriter is called. 
+ + Test Case 1: + calling InfluxDbWriter.execute_query() + -> query_api.query is called with the same query given execute_query + + :param mock_client: mocked InfluxDBClient class + """ + client = mock.Mock() + query_api = mock.Mock() + mock_client.return_value = client + client.query_api.return_value = query_api + db_writer = InfluxDbWriter("localhost", "token", "org", test_model) + db_writer.execute_query("SELECT * FROM temperature;") + query_api.query.assert_called_with("SELECT * FROM temperature;") diff --git a/src/test/test_models.py b/src/test/test_models.py new file mode 100644 index 0000000..2f7c6e9 --- /dev/null +++ b/src/test/test_models.py @@ -0,0 +1,196 @@ +test_model = { + "temperature": { + "tags": { + "plant": 100, + "line": 5, + "sensor_id": "id" + }, + "metrics": { + "temperature": { + "key": { + "value": "value" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 6.0, + }, + "max": { + "value": 7.4, + }, + "mean": { + "value": 6.4, + }, + "median": { + "value": 6.3, + }, + "mode": { + "value": 6.3, + }, + "stdev": { + "value": 0.2, + }, + "variance": { + "value": 0.03, + }, + "error_rate": { + "value": 0.005, + }, + "error_length": { + "value": 1.08, + } + }, + "button": { + "key": { + "value": "button_press" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.001 + } + } + } + } +} +test_model2 = { + "booleans": { + "tags": { + "plant": 100, + "line": 5, + "sensor_id": "id" + }, + "metrics": { + "switch": { + "key": { + "value": "switch_on" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.9 + } + }, + "button": { + "key": { + "value": "button_press" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.001 + } + } + } + } +} +metrics_model_float1_bool1 = { + "temperature": { + "key": { + "value": "value" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 6.0, + }, + "max": { + "value": 7.4, + }, + "mean": { + "value": 6.4, + }, + "median": { + "value": 6.3, + }, + "stdev": { + "value": 0.2, + }, + "variance": { + "value": 0.03, + }, + "error_rate": { + "value": 0.005, + }, + "error_length": { + "value": 1.08, + } + }, + "button": { + "key": { + "value": "button_press" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.01 + } + } +} +metrics_model_string = { + "string": { + "key": { + "value": "not_implemented_sensor_type" + }, + "type": { + "value": "STRING" + }, + "true_ratio": { + "value": "not_implemented" + } + } +} +tag_model_plant100_line5_sensorId = { + "plant": 100, + "line": 5, + "sensor_id": "id" +} +float_model = { + "key": { + "value": "test" + }, + "type": { + "value": "FLOAT" + }, + "min": { + "value": 6.0, + }, + "max": { + "value": 7.4, + }, + "mean": { + "value": 6.4, + }, + "median": { + "value": 6.3, + }, + "stdev": { + "value": 0.2, + }, + "variance": { + "value": 0.03, + }, + "error_rate": { + "value": 0.001, + }, + "error_length": { + "value": 1.08, + } +} +bool_model = { + "key": { + "value": "button_press" + }, + "type": { + "value": "BOOL" + }, + "true_ratio": { + "value": 0.001 + } +} diff --git a/src/test/test_mongo_db_writer.py b/src/test/test_mongo_db_writer.py new file mode 100644 index 0000000..116655c --- /dev/null +++ b/src/test/test_mongo_db_writer.py @@ -0,0 +1,89 @@ +import mock +from datetime import datetime +from modules.mongo_db_writer import MongoDbWriter +from test.test_models import test_model + + +@mock.patch("modules.mongo_db_writer.MongoClient", autospec=True) +def test_close_connection(mock_client): + """ + This function 
tests if the .close_connection() function of MongoDbWriter calls the close() function of self.client + + Pre Condition: MongoDBClient() returns a Mock Object client + MongoDbWriter is called. + -> Parameters of MongoDbWriter match constructor parameters + + Test Case 1: + when calling MongoDbWriter.close_connection() self.client.close() is called + -> client.close() is called + + :param mock_client: mocked MongoDBClient class + """ + # Pre Condition: + client = mock.MagicMock() + mock_client.return_value = client + db_writer = MongoDbWriter("localhost", "mongo", "password", "db_name", test_model) + mock_client.assert_called_with("mongodb://mongo:password@localhost") + # Test Case 1: + db_writer.close_connection() + client.close.assert_called() + + +@mock.patch("modules.mongo_db_writer.MongoClient", autospec=True) +def test_insert_stmt(mock_client): + """ + This function tests if the .insert_stmt() function of MongoDbWriter creates the correct JSON object + + Pre Condition: MongoDBClient() returns a Mock Object client + MongoDbWriter is called. + -> Parameters of MongoDbWriter match constructor parameters + + Test Case 1: + calling MongoDbWriter.insert_stmt() + -> the function call has one argument + -> the argument is the same as the document + + :param mock_client: mocked MongoDBClient class + """ + client = mock.MagicMock() + mock_client.return_value = client + db_writer = MongoDbWriter("srvhost", "mongo", "password", "db_name", test_model) + mock_client.assert_called_with("mongodb+srv://mongo:password@srvhost") + # Test Case 1: + db_writer.insert_stmt([1586327807000], + [{"plant": 2, "line": 2, "sensor_id": 2, "value": 6.7, "button_press": False}]) + document = {"measurement": "temperature", + "date": datetime.fromtimestamp(1586327807), + "tags": {"plant": 2, "line": 2, "sensor_id": 2}, + "metrics": {"value": 6.7, "button_press": False}} + # [2] because there have been 2 prior function calls on client (getting the db and the collection) + args = client.mock_calls[2].args + assert len(args) == 1 + assert args[0] == [document] + + +@mock.patch("modules.mongo_db_writer.MongoClient", autospec=True) +def test_execute_query(mock_client): + """ + This function tests if the .execute_query() function of MongoDbWriter uses the correct argument + + Pre Condition: MongoDBClient() returns a Mock Object client + MongoDbWriter is called.
+ -> Parameters of MongoDbWriter match constructor parameters + + Test Case 1: + calling MongoDbWriter.execute_query() + -> the function call has on argument + -> the argument is the same as execute_query argument + + :param mock_client: mocked MongoDBClient class + """ + client = mock.MagicMock() + mock_client.return_value = client + db_writer = MongoDbWriter("srvhost", "mongo", "password", "db_name", test_model) + # Test Case 1: + db_writer.execute_query({"plant": 1}) + # [2] because there have be 2 prior function calls on client (getting db and collection) + args = client.mock_calls[2].args + assert len(args) == 1 + assert args[0] == {"plant": 1} diff --git a/src/test/test_postgres_db_writer.py b/src/test/test_postgres_db_writer.py new file mode 100644 index 0000000..4d3bd4e --- /dev/null +++ b/src/test/test_postgres_db_writer.py @@ -0,0 +1,162 @@ +import mock +import psycopg2.extras +from modules.postgres_db_writer import PostgresDbWriter +from test.test_models import test_model, test_model2 + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_close_connection(mock_connect): + """ + This function tests if the .close() functions of the self.conn and self.cursor objects is called + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. + -> mock_connect is called by TimescaleDbWriter with values given the constructor + + Test Case 1: + when calling TimescaleDbWriter.close_connection() conn.close() and cursor.close() are called + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = PostgresDbWriter("localhost", 4200, "timescale", "password", "test", test_model) + mock_connect.assert_called_with(dbname="test", user="timescale", password="password", host="localhost", port=4200) + # Test Case 1: + db_writer.close_connection() + conn.close.assert_called() + cursor.close.assert_called() + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_prepare_database1(mock_connect): + """ + This function tests if the .prepare_database() function uses the correct statment to create the database table + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. 
+ -> mock_connect is called by TimescaleDbWriter with values given the constructor + + Test Case 1: calling TimescaleDbWriter.prepare_database() + -> "temperature" is in stmt (table name) + -> "ts_week" is in stmt (partitioning of hyper_table) + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = PostgresDbWriter("localhost", 4200, "timescale", "password2", "test", test_model) + mock_connect.assert_called_with(dbname="test", user="timescale", password="password2", host="localhost", port=4200) + # Test Case 1: + db_writer.prepare_database() + stmt = cursor.execute.call_args.args[0] + # table name is in stmt + assert "temperature" in stmt + # partition is default + assert "ts_week" in stmt + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_prepare_database2(mock_connect): + """ + This function tests if the .prepare_database() function uses the correct statment to create the database table + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. + + Test Case 1: calling TimescaleDbWriter.prepare_database() with default values overwritten by constructor arguments + -> "table_name" is in stmt (table name) + -> "ts_day is in stmt (partitioning of hyper_table) + -> conn.commit function has been called + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = PostgresDbWriter("localhost", 4200, "timescale3", "password3", + "test", test_model2, "table_name", "day") + # Test Case 1: + db_writer.prepare_database() + stmt = cursor.execute.call_args.args[0] + # table name is in stmt + assert "table_name" in stmt + # partition is correctly set + assert "ts_day" in stmt + conn.commit.assert_called() + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_insert_stmt(mock_connect): + """ + This function tests if the .insert_stmt() function uses the correct statement to insert values + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. 
+ + Test Case 1: calling TimescaleDbWriter.insert_stmt() + -> "plant" is in stmt + -> "line" is in stmt + -> "sensor_id" is in stmt + -> "value" is in stmt + -> "button_press" is in stmt + -> conn.commit() function has been called + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = PostgresDbWriter("localhost", 4200, "timescale", "password", "test", test_model) + # Test Case 1: + db_writer.insert_stmt([1586327807000], + [{"plant": 1, "line": 1, "sensor_id": 1, "value": 6.7, "button_press": False}]) + call_arguments = cursor.execute.call_args.args + stmt = call_arguments[0] + # all properties must be in statement + assert "plant" in stmt + assert "line" in stmt + assert "sensor_id" in stmt + assert "value" in stmt + assert "button_press" in stmt + conn.commit.assert_called() + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_execute_query(mock_connect): + """ + This function tests if the .execute_query() function uses the given query + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. + + Test Case 1: calling TimescaleDbWriter.execute_query() + -> cursor.execute function is called with given argument + -> cursor.fetchall function is called + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = PostgresDbWriter("localhost", 4200, "timescale", "password", "test", test_model) + db_writer.execute_query("SELECT * FROM temperature;") + cursor.execute.assert_called_with("SELECT * FROM temperature;") + cursor.fetchall.assert_called() diff --git a/src/test/test_timescale_db_writer.py b/src/test/test_timescale_db_writer.py new file mode 100644 index 0000000..c3e1ff9 --- /dev/null +++ b/src/test/test_timescale_db_writer.py @@ -0,0 +1,162 @@ +import mock +import psycopg2.extras +from modules.timescale_db_writer import TimescaleDbWriter +from test.test_models import test_model, test_model2 + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_close_connection(mock_connect): + """ + This function tests if the .close() functions of the self.conn and self.cursor objects is called + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. 
+ -> mock_connect is called by TimescaleDbWriter with values given the constructor + + Test Case 1: + when calling TimescaleDbWriter.close_connection() conn.close() and cursor.close() are called + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = TimescaleDbWriter("localhost", 4200, "timescale", "password", "test", test_model) + mock_connect.assert_called_with(dbname="test", user="timescale", password="password", host="localhost", port=4200) + # Test Case 1: + db_writer.close_connection() + conn.close.assert_called() + cursor.close.assert_called() + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_prepare_database1(mock_connect): + """ + This function tests if the .prepare_database() function uses the correct statment to create the database table + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. + -> mock_connect is called by TimescaleDbWriter with values given the constructor + + Test Case 1: calling TimescaleDbWriter.prepare_database() + -> "temperature" is in stmt (table name) + -> "ts_week" is in stmt (partitioning of hyper_table) + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = TimescaleDbWriter("localhost", 4200, "timescale", "password2", "test", test_model) + mock_connect.assert_called_with(dbname="test", user="timescale", password="password2", host="localhost", port=4200) + # Test Case 1: + db_writer.prepare_database() + stmt = cursor.execute.call_args.args[0] + # table name is in stmt + assert "temperature" in stmt + # partition is default + assert "ts_week" in stmt + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_prepare_database2(mock_connect): + """ + This function tests if the .prepare_database() function uses the correct statment to create the database table + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. 
+ + Test Case 1: calling TimescaleDbWriter.prepare_database() with default values overwritten by constructor arguments + -> "table_name" is in stmt (table name) + -> "ts_day is in stmt (partitioning of hyper_table) + -> conn.commit function has been called + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = TimescaleDbWriter("localhost", 4200, "timescale3", "password3", + "test", test_model2, "table_name", "day") + # Test Case 1: + db_writer.prepare_database() + stmt = cursor.execute.call_args.args[0] + # table name is in stmt + assert "table_name" in stmt + # partition is correctly set + assert "ts_day" in stmt + conn.commit.assert_called() + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_insert_stmt(mock_connect): + """ + This function tests if the .insert_stmt() function uses the correct statement to insert values + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. + + Test Case 1: calling TimescaleDbWriter.insert_stmt() + -> "plant" is in stmt + -> "line" is in stmt + -> "sensor_id" is in stmt + -> "value" is in stmt + -> "button_press" is in stmt + -> conn.commit() function has been called + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = TimescaleDbWriter("localhost", 4200, "timescale", "password", "test", test_model) + # Test Case 1: + db_writer.insert_stmt([1586327807000], + [{"plant": 1, "line": 1, "sensor_id": 1, "value": 6.7, "button_press": False}]) + call_arguments = cursor.execute.call_args.args + stmt = call_arguments[0] + # all properties must be in statement + assert "plant" in stmt + assert "line" in stmt + assert "sensor_id" in stmt + assert "value" in stmt + assert "button_press" in stmt + conn.commit.assert_called() + + +@mock.patch.object(psycopg2, 'connect', autospec=True) +def test_execute_query(mock_connect): + """ + This function tests if the .execute_query() function uses the given query + + Pre Condition: psycopg2.client.connect() returns a Mock Object conn which returns a Mock Object + cursor when its .cursor() function is called. + TimescaleDbWriter is called. + + Test Case 1: calling TimescaleDbWriter.execute_query() + -> cursor.execute function is called with given argument + -> cursor.fetchall function is called + + :param mock_connect: mocked function call from psycopg2.client.connect() + """ + # Pre Condition: + conn = mock.Mock() + cursor = mock.Mock() + mock_connect.return_value = conn + conn.cursor.return_value = cursor + db_writer = TimescaleDbWriter("localhost", 4200, "timescale", "password", "test", test_model) + db_writer.execute_query("SELECT * FROM temperature;") + cursor.execute.assert_called_with("SELECT * FROM temperature;") + cursor.fetchall.assert_called()
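The throughput expectations in test_insert_batch_time_normal follow directly from batch size divided by batch duration. A small, self-contained sketch of that arithmetic, assuming only the initial batch size of 2500 and the step size of 500 that the test itself asserts:

# Sketch of the arithmetic behind test_insert_batch_time_normal; the constants
# mirror the values asserted in that test, nothing is read from the implementation.
initial_batch_size = 2500
initial_step_size = 500

# Test Case 1: a batch of 2500 inserted in 10 s -> 250 per second
assert initial_batch_size / 10 == 250

# after the first cycle the batch size grows by one step to 3000
next_batch_size = initial_batch_size + initial_step_size

# Test Case 2: a batch of 3000 inserted in 5 s -> 600 per second, the new best value
assert next_batch_size / 5 == 600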
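The percentile output at the end of the query timer maps qus[i] to p(i+1) because statistics.quantiles(..., n=100) returns 99 cut points. A minimal sketch of that mapping, assuming Python 3.8+ and using made-up durations in place of the measured helper.tic_toc values:

import statistics

durations = [0.01 * (i + 1) for i in range(100)]  # stand-in for measured query durations in seconds
q_list = ["50", "90", "99"]                       # stand-in for the configured quantile list

qus = statistics.quantiles(durations, n=100, method="inclusive")  # 99 cut points
for i in range(len(qus)):
    if str(i + 1) in q_list:
        print(f"p{i + 1} : {round(qus[i] * 1000, 3)}ms")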
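For running the test suite added here locally with coverage over src/modules, one possible convenience script using pytest's programmatic entry point; it assumes the packages from src/test/requirements.txt are installed and that it is started from the repository root (run_tests.py is a hypothetical name, not a file in this change):

# run_tests.py -- illustrative only
import sys
import pytest

exit_code = pytest.main([
    "--cov=./src/modules/",   # measure coverage of the modules package
    "--cov-report=term",      # print the coverage table to the terminal
    "./src/test/",            # the test directory added in this change
])
sys.exit(exit_code)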