Adding OpenTelemetry OTLP exporter support #54

Open · wants to merge 19 commits into main
Changes from 8 commits
2 changes: 1 addition & 1 deletion .gitignore
@@ -8,7 +8,7 @@ __pycache__/

# Distribution / packaging
.Python
-build/
+*/build/
develop-eggs/
dist/
downloads/
3 changes: 3 additions & 0 deletions Makefile
@@ -91,3 +91,6 @@ docker-build-load-test: ## Build the AOAI Simulated API Load Test as a docker image
erase-recording: ## Erase all *.recording files
rm -rf "${makefile_dir}.recording"

+start-telemetry:
+	-docker-compose -f build/telemetry-docker-compose.yaml down
+	docker-compose -f ./build/telemetry-docker-compose.yaml up
8 changes: 8 additions & 0 deletions build/telemetry-docker-compose.yaml
@@ -0,0 +1,8 @@
+services:
+  grafana-all-in-one: # https://grafana.com/blog/2024/03/13/an-opentelemetry-backend-in-a-docker-image-introducing-grafana/otel-lgtm/
+    image: grafana/otel-lgtm
+    container_name: otel-lgtm
+    ports:
+      - "3000:3000" # Grafana Web UI
+      - "4317:4317" # OTLP gRPC receiver
+      - "4318:4318" # OTLP HTTP receiver
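
With this stack running (e.g. via `make start-telemetry`), a quick way to confirm the OTLP receiver is accepting data is to send a single test span from Python. This is a minimal sketch, assuming the opentelemetry-sdk and opentelemetry-exporter-otlp packages are installed; the service name is arbitrary:

```python
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# point the exporter at the OTLP gRPC receiver published on port 4317
provider = TracerProvider(resource=Resource(attributes={"service.name": "otlp-smoke-test"}))
provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317", insecure=True)))
trace.set_tracer_provider(provider)

with trace.get_tracer(__name__).start_as_current_span("smoke-test"):
    pass  # the span should appear in the Grafana UI on http://localhost:3000

provider.shutdown()  # flush pending spans before exiting
```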
2 changes: 2 additions & 0 deletions docs/config.md
@@ -107,6 +107,8 @@ The simulator supports a set of basic Open Telemetry configuration options. Thes
| ----------------------------- | ----------------------------------------------------------------------------------------------- |
| `OTEL_SERVICE_NAME` | Sets the value of the service name reported to Open Telemetry. Defaults to `aoai-api-simulator` |
| `OTEL_METRIC_EXPORT_INTERVAL` | The time interval (in milliseconds) between the start of two export attempts. |
+| `APPLICATIONINSIGHTS_CONNECTION_STRING` | Sets the Application Insights connection string used to export telemetry to Azure Monitor |
+| `OTEL_EXPORTER_OTLP_ENDPOINT` | Sets the OpenTelemetry OTLP exporter endpoint. This can be customised further using the environment variables described [here](https://opentelemetry.io/docs/specs/otel/protocol/exporter/), e.g. `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` or `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` |
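
As a rough sketch of how the OTLP endpoint setting is consumed (the endpoint URL below is illustrative, and the standard OpenTelemetry Python SDK is assumed): when no endpoint argument is passed, the OTLP exporters fall back to the environment.

```python
import os

from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

# illustrative value; in practice this is set in the simulator's environment
os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4317"

# with no endpoint argument, the exporter reads OTEL_EXPORTER_OTLP_ENDPOINT
# (or a signal-specific variant such as OTEL_EXPORTER_OTLP_TRACES_ENDPOINT)
span_exporter = OTLPSpanExporter()
```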

## Config API Endpoint

8 changes: 4 additions & 4 deletions infra/main.bicep
@@ -170,7 +170,7 @@ resource azureOpenAIKeySecret 'Microsoft.KeyVault/vaults/secrets@2023-07-01' = {
}
resource appInsightsConnectionStringSecret 'Microsoft.KeyVault/vaults/secrets@2023-07-01' = {
parent: vault
-name: 'app-insights-connection-string'
+name: 'applicationinsights-connection-string'
properties: {
value: appInsights.properties.ConnectionString
}
@@ -208,8 +208,8 @@ resource apiSim 'Microsoft.App/containerApps@2023-05-01' = {
identity: managedIdentity.id
}
{
-name: 'app-insights-connection-string'
-keyVaultUrl: '${keyVaultUri}secrets/app-insights-connection-string'
+name: 'applicationinsights-connection-string'
+keyVaultUrl: '${keyVaultUri}secrets/applicationinsights-connection-string'
identity: managedIdentity.id
}
{
@@ -243,7 +243,7 @@
{ name: 'AZURE_OPENAI_KEY', secretRef: 'azure-openai-key' }
{ name: 'OPENAI_DEPLOYMENT_CONFIG_PATH', value: '/mnt/deployment-config/simulator_deployment_config.json' }
{ name: 'LOG_LEVEL', value: logLevel }
-{ name: 'APPLICATIONINSIGHTS_CONNECTION_STRING', secretRef: 'app-insights-connection-string' }
+{ name: 'APPLICATIONINSIGHTS_CONNECTION_STRING', secretRef: 'applicationinsights-connection-string' }
// Ensure cloudRoleName is set in telemetry
// https://opentelemetry-python.readthedocs.io/en/latest/sdk/environment_variables.html#opentelemetry.sdk.environment_variables.OTEL_SERVICE_NAME
{ name: 'OTEL_SERVICE_NAME', value: apiSimulatorName }
3 changes: 2 additions & 1 deletion loadtest/common/config.py
@@ -1,7 +1,8 @@
import os

api_key = os.getenv("API_KEY", os.getenv("SIMULATOR_API_KEY"))
-app_insights_connection_string = os.getenv("APP_INSIGHTS_CONNECTION_STRING")
+opentelemetry_exporter_otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
+applicationinsights_connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
log_analytics_workspace_id = os.getenv("LOG_ANALYTICS_WORKSPACE_ID")
log_analytics_workspace_name = os.getenv("LOG_ANALYTICS_WORKSPACE_NAME")
tenant_id = os.getenv("TENANT_ID")
10 changes: 5 additions & 5 deletions loadtest/common/locust_app_insights.py
@@ -1,18 +1,18 @@
import logging
-from opentelemetry import metrics

from azure.monitor.opentelemetry import configure_azure_monitor
+from opentelemetry import metrics

from .config import (
-app_insights_connection_string,
+applicationinsights_connection_string,
)


histogram_request_latency: metrics.Histogram

-if app_insights_connection_string:
+if applicationinsights_connection_string:
# Options: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry#usage
logging.getLogger("azure").setLevel(logging.WARNING)
-configure_azure_monitor(connection_string=app_insights_connection_string)
+configure_azure_monitor(connection_string=applicationinsights_connection_string)
histogram_request_latency = metrics.get_meter(__name__).create_histogram(
"locust.request_latency", "Request latency", "s"
)
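For context, a Locust request listener that records into this histogram could look roughly like the following (a sketch only; the actual report_request_metric implementation is not shown in this hunk):

```python
def report_request_metric(request_type, name, response_time, exception, **kwargs):
    # Locust reports response_time in milliseconds; the histogram records seconds
    histogram_request_latency.record(
        response_time / 1000,
        attributes={"request_type": request_type, "name": name, "success": exception is None},
    )
```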
8 changes: 4 additions & 4 deletions loadtest/loadtest_chat_completions_1s_latency.py
@@ -1,13 +1,13 @@
import logging
import os
-from locust import HttpUser, task, constant, events
-from locust.env import Environment

-from common.config import api_key, app_insights_connection_string
+from common.config import api_key, applicationinsights_connection_string
from common.latency import set_simulator_chat_completions_latency
from common.locust_app_insights import (
report_request_metric,
)
+from locust import HttpUser, constant, events, task
+from locust.env import Environment

max_tokens = 100
deployment_name = os.getenv("DEPLOYMENT_NAME", None)
@@ -21,7 +21,7 @@ def on_locust_init(environment: Environment, **_):
"""
Configure test
"""
-if app_insights_connection_string:
+if applicationinsights_connection_string:
logging.info("App Insights connection string found - enabling request metrics")
environment.events.request.add_listener(report_request_metric)
else:
8 changes: 4 additions & 4 deletions loadtest/loadtest_chat_completions_no_added_latency.py
@@ -1,13 +1,13 @@
import logging
import os
-from locust import HttpUser, task, constant, events
-from locust.env import Environment

-from common.config import api_key, app_insights_connection_string
+from common.config import api_key, applicationinsights_connection_string
from common.latency import set_simulator_chat_completions_latency
from common.locust_app_insights import (
report_request_metric,
)
+from locust import HttpUser, constant, events, task
+from locust.env import Environment

max_tokens = int(os.getenv("MAX_TOKENS", "100"))
deployment_name = os.getenv("DEPLOYMENT_NAME", None)
@@ -25,7 +25,7 @@ def on_locust_init(environment: Environment, **_):
"""
Configure test
"""
-if app_insights_connection_string:
+if applicationinsights_connection_string:
logging.info("App Insights connection string found - enabling request metrics")
environment.events.request.add_listener(report_request_metric)
else:
2 changes: 2 additions & 0 deletions sample.env
@@ -23,6 +23,8 @@ AZURE_FORM_RECOGNIZER_KEY=
# Open Telemetry Config (used within the simulator)
OTEL_SERVICE_NAME=aoai-api-simulator-local-dev
OTEL_METRIC_EXPORT_INTERVAL=10000
+OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317
+APPLICATIONINSIGHTS_CONNECTION_STRING=


# Test Client Config (used to direct the tests and test clients)
6 changes: 3 additions & 3 deletions scripts/_run-load-test-aca.sh
@@ -84,8 +84,8 @@ if [[ -z "${key_vault_name}" ]]; then
echo "Key Vault Name not found in output.json"
exit 1
fi
-app_insights_connection_string=$(az keyvault secret show --vault-name "$key_vault_name" --name app-insights-connection-string --query value --output tsv)
-if [[ -z "${app_insights_connection_string}" ]]; then
+applicationinsights_connection_string=$(az keyvault secret show --vault-name "$key_vault_name" --name applicationinsights-connection-string --query value --output tsv)
+if [[ -z "${applicationinsights_connection_string}" ]]; then
echo "App Insights Connection String not found in Key Vault"
exit 1
fi
@@ -127,7 +127,7 @@ az containerapp job create \
--cpu "1" \
--memory "2Gi" \
--command "locust" \
---env-vars "LOCUST_LOCUSTFILE=$TEST_FILE" "LOCUST_HOST=https://${api_fqdn}/" "LOCUST_USERS=$LOCUST_USERS" "LOCUST_SPAWN_RATE=$LOCUST_SPAWN_RATE" "LOCUST_AUTOSTART=true" "LOCUST_RUN_TIME=$LOCUST_RUN_TIME" "LOCUST_AUTOQUIT=10" "SIMULATOR_API_KEY=${SIMULATOR_API_KEY}" "APP_INSIGHTS_CONNECTION_STRING=${app_insights_connection_string}" "MAX_TOKENS=${MAX_TOKENS}" "DEPLOYMENT_NAME=${DEPLOYMENT_NAME}" ALLOW_429_RESPONSES=${ALLOW_429_RESPONSES} 1>&2
+--env-vars "LOCUST_LOCUSTFILE=$TEST_FILE" "LOCUST_HOST=https://${api_fqdn}/" "LOCUST_USERS=$LOCUST_USERS" "LOCUST_SPAWN_RATE=$LOCUST_SPAWN_RATE" "LOCUST_AUTOSTART=true" "LOCUST_RUN_TIME=$LOCUST_RUN_TIME" "LOCUST_AUTOQUIT=10" "SIMULATOR_API_KEY=${SIMULATOR_API_KEY}" "APPLICATIONINSIGHTS_CONNECTION_STRING=${applicationinsights_connection_string}" "MAX_TOKENS=${MAX_TOKENS}" "DEPLOYMENT_NAME=${DEPLOYMENT_NAME}" ALLOW_429_RESPONSES=${ALLOW_429_RESPONSES} 1>&2


start_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
4 changes: 4 additions & 0 deletions src/aoai-api-simulator/requirements.txt
@@ -2,9 +2,13 @@ fastapi==0.109.2
uvicorn[standard]==0.27.0.post1
gunicorn==22.0.0
requests==2.32.0
+opentelemetry-instrumentation-requests==0.48b0
PyYAML==6.0.1
tiktoken==0.6.0
nanoid==2.0.0
limits==3.8.0
+opentelemetry-api==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-exporter-otlp==1.27.0
azure-monitor-opentelemetry==1.3.0
pydantic-settings==2.2.1
68 changes: 63 additions & 5 deletions src/aoai-api-simulator/src/aoai_api_simulator/main.py
@@ -1,33 +1,91 @@
import logging
import os

from aoai_api_simulator.app_builder import app as builder_app
from aoai_api_simulator.app_builder import apply_config
from aoai_api_simulator.config_loader import get_config_from_env_vars, set_config
from azure.monitor.opentelemetry import configure_azure_monitor
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

log_level = os.getenv("LOG_LEVEL") or "INFO"

logger = logging.getLogger(__name__)
logging.basicConfig(level=log_level)
logging.getLogger("azure").setLevel(logging.WARNING)

opentelemetry_exporter_otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
application_insights_connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")

using_azure_monitor: bool

if application_insights_connection_string:
    logger.info("🚀 Configuring Azure Monitor telemetry")

    # Options: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry#usage
    configure_azure_monitor(connection_string=application_insights_connection_string)
    using_azure_monitor = True
else:
    using_azure_monitor = False
    logger.info("🚀 Azure Monitor telemetry not configured (set APPLICATIONINSIGHTS_CONNECTION_STRING)")

if opentelemetry_exporter_otlp_endpoint:
    logger.info("🚀 Configuring OTLP telemetry")

    # shared resource so traces, metrics and logs all report the same service name
    resource = Resource(attributes={"service.name": os.getenv("OTEL_SERVICE_NAME", "aoai-api-simulator")})

    # tracing
    trace.set_tracer_provider(TracerProvider(resource=resource))
    span_processor = BatchSpanProcessor(OTLPSpanExporter())
    trace.get_tracer_provider().add_span_processor(span_processor)

    # metrics
    reader = PeriodicExportingMetricReader(OTLPMetricExporter())
    meter_provider = MeterProvider(resource=resource, metric_readers=[reader])
    metrics.set_meter_provider(meter_provider)

    # logging
    logger_provider = LoggerProvider(resource=resource)
    logger_provider.add_log_record_processor(BatchLogRecordProcessor(OTLPLogExporter()))

    # attach an OTLP handler to the root logger
    handler = LoggingHandler(level=os.getenv("OTEL_LOG_LEVEL", "INFO"), logger_provider=logger_provider)
    logging.getLogger().addHandler(handler)
else:
    logger.info("🚀 OTLP telemetry exporter not configured (set OTEL_EXPORTER_OTLP_ENDPOINT)")

config = get_config_from_env_vars(logger)
set_config(config)


apply_config()

app = builder_app # expose to gunicorn

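# configure_azure_monitor applies its own instrumentation (including the
# requests library), so the manual instrumentor is only added otherwise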
if not using_azure_monitor:
    RequestsInstrumentor().instrument()
FastAPIInstrumentor.instrument_app(app)
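
For a sense of how application code picks up the providers configured above, here is a minimal, illustrative sketch (the span and metric names are hypothetical, not part of this PR):

```python
from opentelemetry import metrics, trace

tracer = trace.get_tracer(__name__)
meter = metrics.get_meter(__name__)

# hypothetical metric, for illustration only
token_histogram = meter.create_histogram("aoai.simulator.tokens", unit="{token}", description="Tokens per request")

with tracer.start_as_current_span("simulate-completion"):  # hypothetical span name
    token_histogram.record(42, attributes={"deployment": "gpt-35-turbo"})
```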