From 74870443dee722568f66db0b6fba394fe496847f Mon Sep 17 00:00:00 2001 From: zhaolei36 Date: Mon, 8 Dec 2025 09:49:50 +0000 Subject: [PATCH] [Feature]support trace part1 --- docs/observability/README.md | 173 ++ docs/observability/trace.md | 202 ++ docs/zh/observability/README.md | 149 + docs/zh/observability/trace.md | 150 ++ examples/observability/docker-compose.yaml | 52 + .../grafana/dashboards/config/dashboard.yaml | 11 + .../dashboards/json/fastdeploy-dashboard.json | 2397 +++++++++++++++++ .../grafana/datasources/datasource.yaml | 9 + .../observability/metrics/prometheus.yaml | 10 + .../metrics/prometheus_compose.yaml | 30 + .../observability/tracing/opentelemetry.yaml | 38 + .../tracing/tracing_compose.yaml | 21 + fastdeploy/engine/common_engine.py | 34 +- fastdeploy/engine/engine.py | 3 + fastdeploy/engine/request.py | 7 +- fastdeploy/entrypoints/cli/tokenizer.py | 2 +- fastdeploy/entrypoints/engine_client.py | 9 +- fastdeploy/entrypoints/openai/api_server.py | 40 +- fastdeploy/entrypoints/openai/protocol.py | 2 + fastdeploy/entrypoints/openai/serving_chat.py | 32 + .../entrypoints/openai/serving_completion.py | 33 + fastdeploy/envs.py | 2 + fastdeploy/metrics/trace.py | 777 ++++++ fastdeploy/metrics/trace_util.py | 262 -- fastdeploy/output/token_processor.py | 26 + mkdocs.yml | 4 + tests/metrics/test_trace.py | 615 +++++ tests/metrics/test_trace_util.py | 193 -- tests/output/test_process_batch_output.py | 1 + 29 files changed, 4808 insertions(+), 476 deletions(-) create mode 100644 docs/observability/README.md create mode 100644 docs/observability/trace.md create mode 100644 docs/zh/observability/README.md create mode 100644 docs/zh/observability/trace.md create mode 100644 examples/observability/docker-compose.yaml create mode 100644 examples/observability/metrics/grafana/dashboards/config/dashboard.yaml create mode 100644 examples/observability/metrics/grafana/dashboards/json/fastdeploy-dashboard.json create mode 100644 examples/observability/metrics/grafana/datasources/datasource.yaml create mode 100644 examples/observability/metrics/prometheus.yaml create mode 100644 examples/observability/metrics/prometheus_compose.yaml create mode 100644 examples/observability/tracing/opentelemetry.yaml create mode 100644 examples/observability/tracing/tracing_compose.yaml create mode 100644 fastdeploy/metrics/trace.py delete mode 100644 fastdeploy/metrics/trace_util.py create mode 100644 tests/metrics/test_trace.py delete mode 100644 tests/metrics/test_trace_util.py diff --git a/docs/observability/README.md b/docs/observability/README.md new file mode 100644 index 00000000000..10246f29041 --- /dev/null +++ b/docs/observability/README.md @@ -0,0 +1,173 @@ +## Observability Example Configuration (`examples/observability`) + +This directory provides a complete, Docker Compose–based observability example environment, including: + +* **Prometheus**: Metrics collection +* **Grafana**: Metrics visualization +* **OpenTelemetry Collector**: Distributed tracing data ingestion and processing + +Developers can use this example to **launch a local monitoring and tracing system with a single command**. 
+ +--- + +### Prerequisites + +Please make sure the following components are installed in advance: + +* Docker +* Docker Compose (or a newer Docker CLI version that supports `docker compose`) + +--- + +### Usage + +#### Start All Services + +Enter the directory: + +```bash +cd examples/observability +``` + +Run the following command to start the complete monitoring and tracing stack: + +```bash +docker compose -f docker-compose.yaml up -d +``` + +After startup, you can access: + +* **Prometheus**: [http://localhost:9090](http://localhost:9090) +* **Grafana**: [http://localhost:3000](http://localhost:3000) +* **OTLP receiver**: Applications should send traces to the default ports of the OTel Collector (usually `4317` or `4318`) + + * gRPC: `4317` + * HTTP: `4318` +* **Jaeger UI**: [http://localhost:16886](http://localhost:16886) + +**Notes:** + +* Update the Prometheus scrape targets to match your actual application endpoints. +* Map Grafana’s service port to a port that is accessible on your machine. +* Map the Jaeger UI port to a port that is accessible on your machine. +* When starting the full stack, there is no need to start individual sub-services separately. + +--- + +#### Start Metrics Services Only + +Enter the directory: + +```bash +cd examples/observability/metrics +``` + +Run the following command: + +```bash +docker compose -f prometheus_compose.yaml up -d +``` + +After startup, you can access: + +* **Grafana**: [http://localhost:3000](http://localhost:3000) + +--- + +#### Start Tracing Services Only + +Enter the directory: + +```bash +cd examples/observability/tracing +``` + +Run the following command: + +```bash +docker compose -f tracing_compose.yaml up -d +``` + +After startup, you can access: + +* **OTLP receiver**: Applications should send traces to the default ports of the OTel Collector (usually `4317` or `4318`) + + * gRPC: `4317` + * HTTP: `4318` +* **Jaeger UI**: [http://localhost:16886](http://localhost:16886) + +--- + +### Directory Structure and File Descriptions + +#### Core Startup File + +| File Name | Purpose | Description | +| --------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docker-compose.yaml` | Main entry | Defines and starts the full observability stack (Prometheus, Grafana, OTel Collector, and Jaeger). This is the single entry point to launch the entire environment. | + +--- + +#### Metrics and Monitoring Configuration + +| File / Directory | Purpose | Description | +| --------------------------------------------------- | ------------------------ | ------------------------------------------------------------------------------------------------------------------------- | +| `metrics` | Metrics root directory | Contains all Prometheus- and metrics-related configurations. | +| `prometheus.yaml` | Prometheus main config | Defines scrape targets, global scrape parameters, and optional recording rules. All monitored endpoints are defined here. | +| `prometheus_compose.yaml` | Prometheus Docker config | Defines the Prometheus container, volume mounts, and network settings. | +| `grafana/datasources/datasource.yaml` | Datasource configuration | Configures how Grafana connects to Prometheus. | +| `grafana/dashboards/config/dashboard.yaml` | Dashboard provisioning | Specifies the locations of dashboard JSON files to be loaded. 

---

#### Distributed Tracing Configuration

| File / Directory | Purpose | Description |
| ---------------- | ------- | ----------- |
| `tracing` | Tracing root directory | Contains all configurations related to distributed tracing. |
| `opentelemetry.yaml` | OTel Collector config | Defines the Collector data pipelines:<br>• **receivers**: receive OTLP data (traces, metrics, logs)<br>• **processors**: data processing and batching<br>• **exporters**: export data to tracing backends (such as Jaeger) or files<br>• **extensions**: health check, pprof, and zpages<br>• **pipelines**: define complete processing flows for traces, metrics, and logs |
| `tracing_compose.yaml` | Tracing Docker config | Defines the container configuration for the OTel Collector and Jaeger. |

---

### Customization

#### 4.1 Modify Metrics Scrape Targets

If your application’s metrics endpoint, port, or path changes, edit:

```plain
metrics/prometheus.yaml
```

---

#### 4.2 Adjust Tracing Sampling Rate or Processing Logic

Edit:

```plain
tracing/opentelemetry.yaml
```

---

#### 4.3 Add Custom Grafana Dashboards

1. Add the new dashboard JSON file to:

```plain
grafana/dashboards/json/
```

2. Register the dashboard so Grafana can load it automatically by editing:

```plain
grafana/dashboards/config/dashboard.yaml
```
diff --git a/docs/observability/trace.md b/docs/observability/trace.md
new file mode 100644
index 00000000000..ca7501ab0b3
--- /dev/null
+++ b/docs/observability/trace.md
@@ -0,0 +1,202 @@
+# FastDeploy Tracing with OpenTelemetry

**FastDeploy** exports request tracing data through the **OpenTelemetry Collector**.
Tracing can be enabled when starting the server using the `--trace-enable` flag, and the OpenTelemetry Collector endpoint can be configured via `--otlp-traces-endpoint`.

---

## Setup Guide

### 1. Install Dependencies

```bash
# Manual installation
pip install opentelemetry-sdk \
    opentelemetry-api \
    opentelemetry-exporter-otlp \
    opentelemetry-exporter-otlp-proto-grpc
```

---

### 2. Start OpenTelemetry Collector and Jaeger

```bash
docker compose -f examples/observability/tracing/tracing_compose.yaml up -d
```

---

### 3. Start FastDeploy Server with Tracing Enabled

#### Configure FastDeploy Environment Variables

```shell
# Enable tracing
"TRACES_ENABLE": "true",

# Service name
"FD_SERVICE_NAME": "FastDeploy",

# Instance name
"FD_HOST_NAME": "trace_test",

# Exporter type
"TRACES_EXPORTER": "otlp",

# OTLP endpoint:
# gRPC: 4317
# HTTP: 4318
"EXPORTER_OTLP_ENDPOINT": "http://localhost:4317",

# Optional headers
"EXPORTER_OTLP_HEADERS": "Authentication=Txxxxx",

# Export protocol
"OTEL_EXPORTER_OTLP_TRACES_PROTOCOL": "grpc",
```

#### Start FastDeploy

Start the FastDeploy server with the above configuration and ensure that tracing is enabled.

---

### 4. Send Requests and View Traces

* Open the **Jaeger UI** in your browser (port `16686`) to visualize request traces.
+* The OpenTelemetry Collector will also export the trace data to a local file: + +```plain +/tmp/otel_trace.json +``` + +--- + +## Adding Tracing to Your Own Code + +FastDeploy already inserts tracing points at most critical execution stages. +Developers can use the APIs provided in `trace.py` to add more fine-grained tracing. + +--- + +### 4.1 Initialize Tracing + +Each **process** involved in tracing must call: + +```python +process_tracing_init() +``` + +Each **thread** that participates in a traced request must call: + +```python +trace_set_thread_info("thread_label", tp_rank, dp_rank) +``` + +* `thread_label`: identifier used for visual distinction of threads. +* `tp_rank` / `dp_rank`: optional values to label tensor parallelism or data parallelism ranks. + +--- + +### 4.2 Mark Request Start and Finish + +```python +trace_req_start(rid, bootstrap_room, ts, role) +trace_req_finish(rid, ts, attrs) +``` + +* Creates both a **Bootstrap Room Span** and a **Root Span**. +* Supports inheritance from spans created by the **FastAPI Instrumentor** (context copying). +* `attrs` can be used to attach additional attributes to the request span. + +--- + +### 4.3 Add Tracing for Slices + +#### Standard Slice + +```python +trace_slice_start("slice_name", rid) +trace_slice_end("slice_name", rid) +``` + +#### Mark Thread Completion + +The last slice in a thread can mark the thread span as finished: + +```python +trace_slice_end("slice_name", rid, thread_finish_flag=True) +``` + +--- + +### 4.4 Trace Context Propagation Across Threads + +#### Sender Side (ZMQ) + +```python +trace_context = trace_get_proc_propagate_context(rid) +req.trace_context = trace_context +``` + +#### Receiver Side (ZMQ) + +```python +trace_set_proc_propagate_context(rid, req.trace_context) +``` + +--- + +### 4.5 Add Events and Attributes + +#### Events (recorded on the current slice) + +```python +trace_event("event_name", rid, ts, attrs) +``` + +#### Attributes (attached to the current slice) + +```python +trace_slice_add_attr(rid, attrs) +``` + +--- + +## Extending the Tracing Framework + +### 5.1 Trace Context Hierarchy + +* Two levels of Trace Context: + + * **`TraceReqContext`** – request-level context + * **`TraceThreadContext`** – thread-level context + +* Three-level Span hierarchy: + + * `req_root_span` + * `thread_span` + * `slice_span` + +--- + +### 5.2 Available Span Name Enum (`TraceSpanName`) + +```python +FASTDEPLOY +PREPROCESS +SCHEDULE +PREFILL +DECODE +POSTPROCESS +``` + +* These enums can be used when creating slices to ensure consistent naming. + +--- + +### 5.3 Important Notes + +1. Each **thread span must be closed** when the final slice of that thread finishes. +2. Spans created by **FastAPI Instrumentor** are automatically inherited by the internal tracing context. 
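
### 5.4 Putting It Together (Sketch)

As a minimal end-to-end sketch, the snippet below strings the APIs above together for a single request handled on one thread. The call names and the `fastdeploy.metrics.trace` import path come from this patch; the request id, the `None` placeholders for `bootstrap_room` and `role`, the slice-name strings, and the positional argument usage are illustrative assumptions rather than a definitive recipe:

```python
import time

# Import path as introduced by this patch (fastdeploy/metrics/trace.py).
from fastdeploy.metrics.trace import (
    process_tracing_init,
    trace_set_thread_info,
    trace_req_start,
    trace_req_finish,
    trace_slice_start,
    trace_slice_end,
    trace_event,
)

process_tracing_init()                 # once per process
trace_set_thread_info("worker", 0, 0)  # once per thread: label, tp_rank, dp_rank

rid = "req-0001"  # illustrative request id

# Open the request-level spans (bootstrap_room/role as used by your deployment).
trace_req_start(rid, None, time.time(), None)

# Wrap one unit of work in a slice; the name mirrors the TraceSpanName enum.
trace_slice_start("PREPROCESS", rid)
trace_event("tokenized", rid, time.time(), {"num_tokens": 128})
trace_slice_end("PREPROCESS", rid)

# The final slice on this thread also closes the thread span.
trace_slice_start("POSTPROCESS", rid)
trace_slice_end("POSTPROCESS", rid, thread_finish_flag=True)

# Close the request and attach final attributes.
trace_req_finish(rid, time.time(), {"status": "ok"})
```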
diff --git a/docs/zh/observability/README.md b/docs/zh/observability/README.md
new file mode 100644
index 00000000000..43de520ccf6
--- /dev/null
+++ b/docs/zh/observability/README.md
@@ -0,0 +1,149 @@
+## Observability 示例配置 (`examples/observability`)

该目录提供了一套完整的、基于 Docker Compose 的可观测性(Observability)示例,包括:

- Prometheus:指标收集
- Grafana:指标可视化
- OpenTelemetry Collector:分布式追踪数据接收与处理

开发者可以使用此示例环境一键启动本地监控与追踪系统。

### 先决条件

需要确保提前安装以下组件:

- Docker
- Docker Compose(或支持 `docker compose` 的新版 Docker CLI)

### 使用方法

#### 整体启动

进入目录:

```shell
cd examples/observability
```

在 `examples/observability` 目录下执行以下命令即可启动完整的监控和追踪服务:

```bash
docker compose -f docker-compose.yaml up -d
```

启动完成后可访问:

- Prometheus 访问: http://localhost:9090
- Grafana 访问: http://localhost:3000
- OTLP 接收端: 应用程序应将 Traces 发送到 OTel Collector 的默认端口(通常是 `4317` 或 `4318`)。
  - grpc: 4317端口
  - http: 4318端口
- Jaeger 访问:http://localhost:16686

【注意事项】:

- Prometheus 的抓取地址换成自己的地址
- Grafana 的展示端口映射成自己可以访问的端口
- Jaeger 的展示端口映射成自己可以访问的端口
- 如果启动了整体服务就不需要再单独去启动子服务了

#### metrics 启动

进入目录:

```shell
cd examples/observability/metrics
```

在 `examples/observability/metrics` 目录下执行以下命令即可单独启动指标监控服务:

```bash
docker compose -f prometheus_compose.yaml up -d
```

启动完成后可访问:

- Grafana 访问: http://localhost:3000

#### trace 启动

进入目录:

```shell
cd examples/observability/tracing
```

在 `examples/observability/tracing` 目录下执行以下命令即可单独启动追踪服务:

```bash
docker compose -f tracing_compose.yaml up -d
```

启动完成后可访问:

- OTLP 接收端:应用程序应将 Traces 发送到 OTel Collector 的默认端口(通常是 `4317` 或 `4318`)。
  - grpc: 4317端口
  - http: 4318端口
- Jaeger 访问:http://localhost:16686

### 目录结构与文件说明

#### 核心启动文件

| 文件名 | 作用 | 详情 |
| ------------------- | ---------- | ---- |
| docker-compose.yaml | 主启动文件 | 定义并启动完整的可观测性组件(Prometheus、Grafana、OTel Collector、Jaeger)。这是启动整个 Observability 环境的唯一入口。 |

#### 指标 (Metrics) 与监控配置

| 文件/目录 | 作用 | 详情 |
| ------------------------------------------------- | ---------------------- | ---- |
| metrics | 指标配置根目录 | 包含所有与指标收集和 Prometheus 相关的配置。 |
| prometheus.yaml | Prometheus 主配置 | 定义抓取目标(scrape targets)、全局采集参数,并可选地配置记录规则(recording rules)。所有监控端点都在此定义。 |
| prometheus_compose.yaml | Prometheus Docker 配置 | 定义 Prometheus 容器、卷挂载和网络设置。 |
| grafana/datasources/datasource.yaml | 数据源配置 | 定义 Grafana 连接 Prometheus 的方式。 |
| grafana/dashboards/config/dashboard.yaml | 仪表板加载配置 | 指定仪表板 JSON 文件所在路径。 |
| grafana/dashboards/json/fastdeploy-dashboard.json | 仪表板 | 包含 `fastdeploy` 监控指标的可视化布局与查询定义。 |

#### 分布式追踪 (Tracing) 配置

| 文件/目录 | 作用 | 详情 |
| -------------------- | ------------------- | ---- |
| tracing | 追踪配置根目录 | 包含所有与分布式追踪相关的配置。 |
| opentelemetry.yaml | OTel Collector 配置 | 定义 Collector 的数据管道:<br>• receivers:接收 OTLP 数据(traces, metrics, logs)<br>• processors:处理与批次化数据<br>• exporters:将数据导出到追踪后端(如 Jaeger)或文件<br>• extensions:健康检查、pprof 和 zpages<br>• pipelines:定义 traces、metrics 和 logs 的完整处理流程 |
| tracing_compose.yaml | Tracing Docker 配置 | 定义 OTel Collector 和 Jaeger 的容器配置。 |

### 4. 如何定制

#### 4.1 修改指标抓取目标

若应用程序端口、路径更改,请编辑:

```plain
metrics/prometheus.yaml
```

#### 4.2 调整追踪采样率或处理逻辑

编辑:

```plain
tracing/opentelemetry.yaml
```

#### 4.3 添加自定义 Grafana 仪表盘

1. 新增 JSON 仪表盘至:

```plain
grafana/dashboards/json/
```

2. 在下方文件中注册该仪表盘,使 Grafana 自动加载:

```plain
grafana/dashboards/config/dashboard.yaml
```
diff --git a/docs/zh/observability/trace.md b/docs/zh/observability/trace.md
new file mode 100644
index 00000000000..87d4651d50e
--- /dev/null
+++ b/docs/zh/observability/trace.md
@@ -0,0 +1,150 @@
+**FastDeploy** 基于 **OpenTelemetry Collector** 导出请求追踪数据。
可通过在启动服务器时添加 `--trace-enable` 来开启追踪,并使用 `--otlp-traces-endpoint` 配置 OpenTelemetry Collector 接收端点。

## 配置指南(Setup Guide)

### 1. 安装依赖和工具

```bash
# 手动安装
pip install opentelemetry-sdk opentelemetry-api opentelemetry-exporter-otlp opentelemetry-exporter-otlp-proto-grpc
```

### 2. 启动 OpenTelemetry Collector 和 Jaeger

```bash
docker compose -f examples/observability/tracing/tracing_compose.yaml up -d
```

### 3. 启动带追踪功能的 FastDeploy 服务器

- FastDeploy 设置环境变量

```shell
# 开启 Trace
"TRACES_ENABLE": "true",
# 服务名称
"FD_SERVICE_NAME": "FastDeploy",
# 实例名称
"FD_HOST_NAME": "trace_test",
# 导出器类型
"TRACES_EXPORTER": "otlp",
# grpc 方式导出端口为 4317,http 方式导出端口为 4318
"EXPORTER_OTLP_ENDPOINT": "http://localhost:4317",
# 可选请求头
"EXPORTER_OTLP_HEADERS": "Authentication=Txxxxx",
# 导出方式
"OTEL_EXPORTER_OTLP_TRACES_PROTOCOL": "grpc",
```

- 按上述配置启动 FastDeploy,并确认追踪已开启

### 4. 发送请求并观察追踪数据

- 在浏览器访问 Jaeger UI(端口 `16686`)可视化请求追踪。

- Collector 同时会将追踪数据导出为 `/tmp/otel_trace.json`。

## 如何为自己的代码添加追踪

FastDeploy 已在主要节点插入了追踪点。开发者可使用 `trace.py` 提供的 API 进行更精细的追踪。

### 4.1 初始化追踪

每个涉及追踪的**进程**执行:

```python
process_tracing_init()
```

请求涉及到的每个**线程**执行:

```python
trace_set_thread_info("thread_label", tp_rank, dp_rank)
```

- `thread_label` 用于线程区分,可视化显示
- `tp_rank`/`dp_rank` 可选,标记张量并行或数据并行 rank

### 4.2 标记请求开始和结束

```python
trace_req_start(rid, bootstrap_room, ts, role)
trace_req_finish(rid, ts, attrs)
```

- 会创建 Bootstrap Room Span 与 Root Span
- 支持 FastAPI Instrumentor 已创建 Span 的继承(context copy)
- `attrs` 可添加额外属性

### 4.3 为 Slice 添加追踪

普通 Slice:

```python
trace_slice_start("slice_name", rid)
trace_slice_end("slice_name", rid)
```

- 最后一个 Slice 可标记线程结束:

```python
trace_slice_end("slice_name", rid, thread_finish_flag=True)
```

### 4.4 请求跨线程 Trace Context 传播

发送端(ZMQ):

```python
trace_context = trace_get_proc_propagate_context(rid)
req.trace_context = trace_context
```

接收端(ZMQ):

```python
trace_set_proc_propagate_context(rid, req.trace_context)
```

### 4.5 添加事件和属性

事件(记录到当前 Slice):

```python
trace_event("event_name", rid, ts, attrs)
```

属性(添加到当前 Slice):

```python
trace_slice_add_attr(rid, attrs)
```

## 扩展追踪框架

### 5.1 Trace Context 层级

- 两级 Trace Context:
  - `TraceReqContext` → 请求级上下文
  - `TraceThreadContext` → 线程级上下文
- 三级 Span 结构:
  - `req_root_span`
  - `thread_span`
  - `slice_span`

### 5.2 可用的 Span 名枚举(`TraceSpanName`)

```python
FASTDEPLOY
PREPROCESS
SCHEDULE
PREFILL
DECODE
POSTPROCESS
```

- 在创建 slice 时可使用枚举,保证命名规范化

### 5.3 注意事项

1. 每个线程 Span 必须在最后一个 Slice 结束时关闭。
2.
FastAPI Instrumentor 已创建的 Span 会被继承到内部追踪上下文。 diff --git a/examples/observability/docker-compose.yaml b/examples/observability/docker-compose.yaml new file mode 100644 index 00000000000..ec15e634121 --- /dev/null +++ b/examples/observability/docker-compose.yaml @@ -0,0 +1,52 @@ +version: '1.0' +services: + prometheus: + image: prom/prometheus:latest + container_name: prometheus + ports: + - "9090:9090" + volumes: + - ./metrics/prometheus.yaml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "3000:3000" + volumes: + - ./metrics/grafana/datasources:/etc/grafana/provisioning/datasources + - ./metrics/grafana/dashboards/config:/etc/grafana/provisioning/dashboards + - ./metrics/grafana/dashboards/json:/var/lib/grafana/dashboards + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer + - GF_AUTH_BASIC_ENABLED=false + - GF_USERS_ALLOW_SIGN_UP=false + - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/fastdeploy-dashboard.json + depends_on: + - prometheus + + jaeger: + image: jaegertracing/all-in-one + container_name: jaeger + ports: + - "16686:16686" + environment: + - COLLECTOR_OTLP_ENABLED=true + restart: unless-stopped + + otel-collector: + image: docker.io/otel/opentelemetry-collector + volumes: + - ./tracing/opentelemetry.yaml:/etc/otelcol/config.yaml + - /tmp:/tmp + ports: + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + depends_on: + - jaeger + - prometheus + restart: unless-stopped diff --git a/examples/observability/metrics/grafana/dashboards/config/dashboard.yaml b/examples/observability/metrics/grafana/dashboards/config/dashboard.yaml new file mode 100644 index 00000000000..5d347a84420 --- /dev/null +++ b/examples/observability/metrics/grafana/dashboards/config/dashboard.yaml @@ -0,0 +1,11 @@ +apiVersion: 1 +providers: + - name: 'FastDeploy' + orgId: 1 + folder: 'FastDeploy Monitoring' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards diff --git a/examples/observability/metrics/grafana/dashboards/json/fastdeploy-dashboard.json b/examples/observability/metrics/grafana/dashboards/json/fastdeploy-dashboard.json new file mode 100644 index 00000000000..eaff9cb5e2b --- /dev/null +++ b/examples/observability/metrics/grafana/dashboards/json/fastdeploy-dashboard.json @@ -0,0 +1,2397 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 4, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "df2i7osj6pssge" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "df2i7osj6pssge" + }, + "editorMode": "code", + "expr": "rate(fastdeploy:time_to_first_token_seconds_sum[5m]) / rate(fastdeploy:time_to_first_token_seconds_count[5m])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "首Token时延", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "df2i7osj6pssge" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "df2i7osj6pssge" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95,sum(rate(fastdeploy:time_to_first_token_seconds_bucket[5m])) by (le))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "首Token时延95分位", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + 
] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:num_requests_running", + "refId": "A" + } + ], + "title": "当前运行请求数", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:num_requests_waiting", + "refId": "A" + } + ], + "title": "当前等待请求数", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "rate(fastdeploy:requests_number_total[1m])", + "refId": "A" + } + ], + "title": "总请求数 (增量)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "df2i7osj6pssge" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "editorMode": "code", + "expr": "fastdeploy:request_success_total", + "range": true, + "refId": "A" + } + ], + "title": "成功请求总数 (增量)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "rate(fastdeploy:generation_tokens_total[1m])", + "refId": "A" + } + ], + "title": "生成 token 总数 (增量)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 6, + "options": { + 
"legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "rate(fastdeploy:prompt_tokens_total[1m])", + "refId": "A" + } + ], + "title": "Prompt token 总数 (增量)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 45 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:first_token_latency", + "refId": "A" + } + ], + "title": "首 token 延迟 (秒)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 45 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:infer_latency", + "refId": "A" + } + ], + "title": "单 token 推理延迟 (秒)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:batch_size", + "refId": "A" + } + ], + "title": "当前 batch size", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:max_batch_size", + "refId": "A" + } + ], + "title": "最大 batch size", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 67 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": 
"fastdeploy:available_batch_size", + "refId": "A" + } + ], + "title": "Decode 阶段可插入请求数", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 67 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:gpu_cache_usage_perc", + "refId": "A" + } + ], + "title": "GPU KV-cache 使用率", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 80 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:available_gpu_resource", + "refId": "A" + } + ], + "title": "可用 GPU 资源百分比", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 80 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:available_gpu_block_num", + "refId": "A" + } + ], + "title": "可用 GPU block 数", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 93 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:hit_req_rate", + "refId": "A" + } + ], + "title": "请求级缓存命中率", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 93 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:hit_token_rate", + "refId": "A" + } + ], + "title": "Token 级缓存命中率", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 105 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:cpu_hit_token_rate", + "refId": "A" + } + ], + "title": "CPU 缓存命中率", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 105 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "fastdeploy:gpu_hit_token_rate", + "refId": "A" + } + ], + "title": "GPU 缓存命中率", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + 
"w": 12, + "x": 0, + "y": 117 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(fastdeploy:request_queue_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(fastdeploy:request_queue_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(fastdeploy:request_queue_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "队列等待时间 (秒)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 117 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(fastdeploy:request_prefill_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(fastdeploy:request_prefill_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(fastdeploy:request_prefill_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Prefill 阶段耗时 (秒)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, 
+ "x": 0, + "y": 128 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(fastdeploy:request_decode_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(fastdeploy:request_decode_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(fastdeploy:request_decode_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Decode 阶段耗时 (秒)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 129 + }, + "id": 22, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(fastdeploy:request_inference_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(fastdeploy:request_inference_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(fastdeploy:request_inference_time_seconds_bucket[1m])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Inference 阶段耗时 (秒)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + 
"w": 12, + "x": 0, + "y": 140 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(fastdeploy:request_generation_tokens_bucket[1m])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(fastdeploy:request_generation_tokens_bucket[1m])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(fastdeploy:request_generation_tokens_bucket[1m])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "单请求生成 token 数", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 141 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(fastdeploy:request_prompt_tokens_bucket[1m])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(fastdeploy:request_prompt_tokens_bucket[1m])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(fastdeploy:request_prompt_tokens_bucket[1m])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "单请求 prefill token 数", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "auto", + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "FastDeploy 全指标监控", + "uid": "55071365-a765-4f8e-915c-336c8c35abac", + "version": 11 +} diff --git a/examples/observability/metrics/grafana/datasources/datasource.yaml b/examples/observability/metrics/grafana/datasources/datasource.yaml new file mode 100644 index 00000000000..12eb5b8dbc1 --- /dev/null +++ b/examples/observability/metrics/grafana/datasources/datasource.yaml @@ -0,0 +1,9 @@ +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + # url: http://localhost:9090 + url: http://prometheus:9090 + isDefault: true + editable: false diff --git a/examples/observability/metrics/prometheus.yaml b/examples/observability/metrics/prometheus.yaml new file mode 100644 index 00000000000..6df97e64446 --- /dev/null +++ 
b/examples/observability/metrics/prometheus.yaml
@@ -0,0 +1,10 @@
+# prometheus.yaml
+global:
+  scrape_interval: 5s
+  evaluation_interval: 30s
+
+scrape_configs:
+  - job_name: 'fastdeploy'
+    static_configs:
+      # list all your targets here
+      - targets: ['127.0.0.1:30000']
diff --git a/examples/observability/metrics/prometheus_compose.yaml b/examples/observability/metrics/prometheus_compose.yaml
new file mode 100644
index 00000000000..7659db3b5aa
--- /dev/null
+++ b/examples/observability/metrics/prometheus_compose.yaml
@@ -0,0 +1,30 @@
+version: '3.8'
+services:
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yml
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+
+  grafana:
+    image: grafana/grafana:latest
+    container_name: grafana
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./grafana/datasources:/etc/grafana/provisioning/datasources
+      - ./grafana/dashboards/config:/etc/grafana/provisioning/dashboards
+      - ./grafana/dashboards/json:/var/lib/grafana/dashboards
+    environment:
+      - GF_AUTH_ANONYMOUS_ENABLED=true
+      - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
+      - GF_AUTH_BASIC_ENABLED=false
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/fastdeploy-dashboard.json
+    depends_on:
+      - prometheus
diff --git a/examples/observability/tracing/opentelemetry.yaml b/examples/observability/tracing/opentelemetry.yaml
new file mode 100644
index 00000000000..8593d9182e1
--- /dev/null
+++ b/examples/observability/tracing/opentelemetry.yaml
@@ -0,0 +1,38 @@
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+processors:
+  batch:
+
+exporters:
+  otlp:
+    endpoint: jaeger:4317
+    tls:
+      insecure: true
+  file:
+    path: /tmp/otel_trace.json
+
+extensions:
+  health_check:
+  pprof:
+  zpages:
+
+service:
+  extensions: [health_check, pprof, zpages]
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlp, file]
+    metrics:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlp]
+    logs:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlp]
diff --git a/examples/observability/tracing/tracing_compose.yaml b/examples/observability/tracing/tracing_compose.yaml
new file mode 100644
index 00000000000..7ed1ecdda37
--- /dev/null
+++ b/examples/observability/tracing/tracing_compose.yaml
@@ -0,0 +1,21 @@
+services:
+  otel-collector:
+    image: docker.io/otel/opentelemetry-collector
+    volumes:
+      - ./opentelemetry.yaml:/etc/otelcol/config.yaml
+      - /tmp:/tmp
+    ports:
+      - "4317:4317" # OTLP gRPC
+      - "4318:4318" # OTLP HTTP
+    depends_on:
+      - jaeger
+    restart: unless-stopped
+
+  jaeger:
+    image: jaegertracing/all-in-one
+    container_name: jaeger
+    ports:
+      - "16686:16686"
+    environment:
+      - COLLECTOR_OTLP_ENABLED=true
+    restart: unless-stopped
diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index 9a370f44353..7a689abec33 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -35,9 +35,9 @@
 import paddle
 import requests
 import zmq
-from opentelemetry import trace
 from tqdm import tqdm

+import fastdeploy.metrics.trace as tracing
 from fastdeploy.engine.request import Request, RequestOutput, RequestType
 from fastdeploy.engine.resource_manager import ResourceManager
 from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
@@ -51,7 +51,6 @@
     ZmqTcpServer,
 )
 from
fastdeploy.metrics.metrics import main_process_metrics -from fastdeploy.metrics.trace_util import start_span, start_span_request from fastdeploy.model_executor.guided_decoding import schema_checker from fastdeploy.plugins.token_processor import load_token_processor_plugins from fastdeploy.router.utils import check_service_health @@ -417,13 +416,16 @@ def insert_tasks(self, tasks: List[Request], current_id=-1): """ if not isinstance(tasks, list): tasks = [tasks] - for task in tasks: - start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER) self.resource_manager.check_and_free_block_tables() need_delete_tasks = [] for task in tasks: + rid = task.request_id.split("_")[0] + trace_carrier = task.trace_carrier + if trace_carrier: + tracing.trace_set_proc_propagate_context(rid, trace_carrier) + task.trace_carrier = tracing.trace_get_proc_propagate_context(rid) if self.cfg.scheduler_config.splitwise_role != "mixed": status, msg = self.split_connector.check_decode_allocated(task) if status: @@ -447,6 +449,7 @@ def insert_tasks(self, tasks: List[Request], current_id=-1): for item in tasks: trace_print(LoggingEventName.RESOURCE_ALLOCATE_START, item.request_id, getattr(item, "user", "")) + available_batch = np.sum(self.resource_manager.stop_flags) if len(tasks) > available_batch: self.llm_logger.error(f"Inserting batch:{len(tasks)} exceeds the available batch:{available_batch}.") @@ -484,6 +487,13 @@ def insert_tasks(self, tasks: List[Request], current_id=-1): self.llm_logger.info(f"Tasks are sent to engine, req_ids={req_ids}") for task in tasks: task.metrics.inference_start_time = time.time() + tracing.trace_report_span( + tracing.TraceSpanName.SCHEDULE, + task.request_id.split("_")[0], + int(task.metrics.scheduler_recv_req_time * 1e9), + int(task.metrics.inference_start_time * 1e9), + thread_finish_flag=True, + ) trace_print(LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", "")) trace_print(LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", "")) trace_print(LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", "")) @@ -694,6 +704,7 @@ def _schedule_request_to_worker(self): Insert task to engine thread, monitor scheduler request queue. if the engine has resource, insert task to engine """ + tracing.trace_set_thread_info("Scheduler Task to Work") current_id = 0 while getattr(self, "running", True): try: @@ -764,6 +775,7 @@ def _schedule_request_to_worker_v1(self): """ Insert tasks to worker with scheduler v1 (ENABLE_V1_KVCACHE_SCHEDULER=1). 
""" + tracing.trace_set_thread_info("Scheduler Task to Work") get_request_pool = ThreadPoolExecutor(max_workers=1) is_fetching = False @@ -982,6 +994,18 @@ def _fetch_request(): self.resource_manager.get_real_bsz() for task in tasks: if task.task_type == RequestType.PREFILL: + rid = task.request_id.split("_")[0] + trace_carrier = task.trace_carrier + tracing.trace_set_proc_propagate_context(rid, trace_carrier) + trace_carrier = tracing.trace_get_proc_propagate_context(rid) + task.trace_carrier = trace_carrier + tracing.trace_report_span( + tracing.TraceSpanName.SCHEDULE, + rid, + int(task.metrics.scheduler_recv_req_time * 1e9), + int(time.time() * 1e9), + thread_finish_flag=True, + ) trace_print( LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", "") ) @@ -1039,6 +1063,7 @@ def start_zmq_service(self, api_server_pid=None): self.receive_output_thread.start() def _insert_zmq_task_to_scheduler(self): + tracing.trace_set_thread_info("Insert Task to Scheduler") added_requests: Dict[str, int] = dict() if envs.FD_ENABLE_INTERNAL_ADAPTER: if self.cfg.scheduler_config.splitwise_role == "decode": @@ -1068,7 +1093,6 @@ def _insert_zmq_task_to_scheduler(self): try: request = Request.from_dict(data) request.metrics.scheduler_recv_req_time = time.time() - start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER) main_process_metrics.requests_number.inc() trace_print(LoggingEventName.PREPROCESSING_END, data["request_id"], data.get("user", "")) trace_print(LoggingEventName.REQUEST_SCHEDULE_START, data["request_id"], data.get("user", "")) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index e1209872ead..49eccfcc59e 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -34,6 +34,7 @@ import paddle from tqdm import tqdm +import fastdeploy.metrics.trace as tracing from fastdeploy.engine.args_utils import EngineArgs from fastdeploy.engine.common_engine import EngineService from fastdeploy.engine.expert_service import start_data_parallel_service @@ -97,6 +98,8 @@ def __init__(self, cfg): main_process_metrics.set_cache_config_info(obj=self.cfg.cache_config) + tracing.trace_set_thread_info("engine") + def start(self, api_server_pid=None): """ Initializes the engine and starts its sub-services. 
diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py
index c4080a4cc6b..b89e8f016c8 100644
--- a/fastdeploy/engine/request.py
+++ b/fastdeploy/engine/request.py
@@ -610,6 +610,7 @@ def __init__(
         # for internal adapter
         ic_req_data: Optional[dict] = None,
         prompt_token_ids_len: Optional[int] = 0,
+        trace_carrier: Optional[dict] = None,
     ) -> None:
         self.request_id = request_id
         self.prompt = prompt
@@ -626,6 +627,7 @@ def __init__(
         self.error_msg = error_msg
         self.ic_req_data = ic_req_data
         self.prompt_token_ids_len = prompt_token_ids_len
+        self.trace_carrier = trace_carrier if trace_carrier is not None else {}

         if prompt_token_ids is None:
             self.prompt_token_ids = []
@@ -674,6 +676,7 @@ def __repr__(self) -> str:
             f"metrics={self.metrics}, "
             f"error_code={self.error_code}, "
             f"error_msg={self.error_msg},"
+            f" trace_carrier={self.trace_carrier}"
         )

     @classmethod
@@ -689,7 +692,8 @@ def from_dict(cls, d: dict):
         else:
             d.pop("metrics", None)
             metrics = None
-        return RequestOutput(**d, outputs=completion_output, metrics=metrics)
+        trace_carrier = d.pop("trace_carrier", {})
+        return RequestOutput(**d, outputs=completion_output, metrics=metrics, trace_carrier=trace_carrier)

     def to_dict(self):
         """convert RequestOutput into a serializable dict"""
@@ -710,6 +714,7 @@ def to_dict(self):
             "error_msg": self.error_msg,
             "ic_req_data": self.ic_req_data,
             "prompt_token_ids_len": self.prompt_token_ids_len,
             "trace_carrier": self.trace_carrier,
         }
diff --git a/fastdeploy/entrypoints/cli/tokenizer.py b/fastdeploy/entrypoints/cli/tokenizer.py
index 3012fd1f6c6..17e22bb1181 100644
--- a/fastdeploy/entrypoints/cli/tokenizer.py
+++ b/fastdeploy/entrypoints/cli/tokenizer.py
@@ -196,7 +196,7 @@ def print_separator(title=""):

     # 检查参数
     if not any([args.encode, args.decode, args.vocab_size, args.info, args.vocab_export]):
-        print("请至少指定一个参数:--encode, --decode, --vocab-size, --info, --export-vocab")
+        print("请至少指定一个参数:--encode, --decode, --vocab-size, --info, --vocab-export")
         return

     # 初始化tokenizer
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 7d387acc609..a75db36f48c 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -25,6 +25,7 @@
 import numpy as np
 from filelock import FileLock

+import fastdeploy.metrics.trace as tracing
 from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.entrypoints.openai.utils import DealerConnectionManager
@@ -288,6 +289,8 @@ async def add_requests(self, task):
         """
         task["preprocess_start_time"] = time.time()
+        request_id = task.get("request_id").split("_")[0]
+        tracing.trace_slice_start(tracing.TraceSpanName.PREPROCESSING, request_id)
         trace_print(LoggingEventName.PREPROCESSING_START, task["request_id"], task.get("user", ""))
         try:
             chat_template_kwargs = task.get("chat_template_kwargs") or {}
@@ -307,7 +310,6 @@ async def add_requests(self, task):
                     "The current service does not support processing requests containing multimodal data when prefix cache is enabled.
Please send only text-based requests or disable prefix cache", error_code=400, ) - task["prompt_token_ids_len"] = len(task["prompt_token_ids"]) input_ids_len = task["prompt_token_ids_len"] @@ -376,10 +378,15 @@ async def add_requests(self, task): else: request_id = parts[0] index = int(parts[1]) + trace_carrier = tracing.trace_get_proc_propagate_context(request_id) + task["trace_carrier"] = trace_carrier for i in range(index * n, (index + 1) * n): child_task = copy(task) child_task["request_id"] = f"{request_id}_{i}" self._send_task(child_task) + tracing.trace_slice_end( + tracing.TraceSpanName.PREPROCESSING, task.get("request_id").split("_")[0], thread_finish_flag=True + ) except Exception as e: api_server_logger.error(f"zmq_client send task error: {e}, {str(traceback.format_exc())}") raise EngineError(str(e), error_code=400) diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 8da77548951..2744c9388c0 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -30,7 +30,10 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse from gunicorn.app.base import BaseApplication from opentelemetry import trace +from opentelemetry.propagate import extract +import fastdeploy.metrics.trace as tracing +from fastdeploy import envs from fastdeploy.engine.args_utils import EngineArgs from fastdeploy.engine.engine import LLMEngine from fastdeploy.engine.expert_service import ExpertService @@ -58,12 +61,6 @@ from fastdeploy.entrypoints.openai.utils import UVICORN_CONFIG, make_arg_parser from fastdeploy.envs import environment_variables from fastdeploy.metrics.metrics import get_filtered_metrics -from fastdeploy.metrics.trace_util import ( - fd_start_span, - inject_to_metadata, - instrument, - lable_span, -) from fastdeploy.utils import ( ExceptionHandler, FlexibleArgumentParser, @@ -74,6 +71,8 @@ retrive_model_from_server, ) +tracing.process_tracing_init() + parser = make_arg_parser(FlexibleArgumentParser()) args = parser.parse_args() @@ -246,7 +245,6 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) app.add_exception_handler(RequestValidationError, ExceptionHandler.handle_request_validation_exception) app.add_exception_handler(Exception, ExceptionHandler.handle_exception) -instrument(app) env_api_key_func = environment_variables.get("FD_API_KEY") @@ -367,19 +365,23 @@ async def wrapped_generator(): @app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest): +async def create_chat_completion(request: ChatCompletionRequest, req: Request): """ Create a chat completion for the provided prompt and parameters. 
""" api_server_logger.debug(f"Chat Received request: {request.model_dump_json()}") + if envs.TRACES_ENABLE: + if req.headers: + headers = dict(req.headers) + trace_context = extract(headers) + request.trace_context = trace_context if app.state.dynamic_load_weight: status, msg = app.state.engine_client.is_workers_alive() if not status: return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304) try: async with connection_manager(): - inject_to_metadata(request) - lable_span(request) + tracing.label_span(request) generator = await app.state.chat_handler.create_chat_completion(request) if isinstance(generator, ErrorResponse): api_server_logger.debug(f"release: {connection_semaphore.status()}") @@ -399,18 +401,23 @@ async def create_chat_completion(request: ChatCompletionRequest): @app.post("/v1/completions") -async def create_completion(request: CompletionRequest): +async def create_completion(request: CompletionRequest, req: Request): """ Create a completion for the provided prompt and parameters. """ api_server_logger.info(f"Completion Received request: {request.model_dump_json()}") + if envs.TRACES_ENABLE: + if req.headers: + headers = dict(req.headers) + trace_context = extract(headers) + request.trace_context = trace_context if app.state.dynamic_load_weight: status, msg = app.state.engine_client.is_workers_alive() if not status: return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304) try: async with connection_manager(): - lable_span(request) + tracing.label_span(request) generator = await app.state.completion_handler.create_completion(request) if isinstance(generator, ErrorResponse): connection_semaphore.release() @@ -471,6 +478,7 @@ async def create_embedding(request: EmbeddingRequest): @app.get("/update_model_weight") +@tracing.trace_span("update_model_weight") def update_model_weight(request: Request) -> Response: """ update model weight @@ -485,6 +493,7 @@ def update_model_weight(request: Request) -> Response: @app.get("/clear_load_weight") +@tracing.trace_span("clear_load_weight") def clear_load_weight(request: Request) -> Response: """ clear model weight @@ -499,6 +508,7 @@ def clear_load_weight(request: Request) -> Response: @app.post("/rearrange_experts") +@tracing.trace_span("rearrange_experts") async def rearrange_experts(request: Request): """ rearrange experts @@ -509,6 +519,7 @@ async def rearrange_experts(request: Request): @app.post("/get_per_expert_tokens_stats") +@tracing.trace_span("get_per_expert_tokens_stats") async def get_per_expert_tokens_stats(request: Request): """ get per expert tokens stats @@ -519,6 +530,7 @@ async def get_per_expert_tokens_stats(request: Request): @app.post("/check_redundant") +@tracing.trace_span("check_redundant") async def check_redundant(request: Request): """ check redundant @@ -537,7 +549,7 @@ def launch_api_server() -> None: api_server_logger.info(f"launch Fastdeploy api server... port: {args.port}") api_server_logger.info(f"args: {args.__dict__}") - fd_start_span("FD_START") + # fd_start_span("FD_START") options = { "bind": f"{args.host}:{args.port}", @@ -565,6 +577,7 @@ def launch_api_server() -> None: @metrics_app.get("/metrics") +@tracing.trace_span("metrics") async def metrics(): """ metrics @@ -574,6 +587,7 @@ async def metrics(): @metrics_app.get("/config-info") +@tracing.trace_span("config-info") def config_info() -> Response: """ Get the current configuration of the API server. 
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index a401e4db82c..40aa6239349 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -500,6 +500,7 @@ class CompletionRequest(BaseModel): mm_hashes: Optional[list] = None # doc: end-completion-extra-params + trace_context: Optional[str] = None collect_metrics: Optional[bool] = False @@ -675,6 +676,7 @@ class ChatCompletionRequest(BaseModel): mm_hashes: Optional[list] = None completion_token_ids: Optional[List[int]] = None # doc: end-chat-completion-extra-params + trace_context: Optional[str] = None collect_metrics: Optional[bool] = False diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index f8fcca31e75..6d872e126f0 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -24,6 +24,7 @@ import numpy as np +import fastdeploy.metrics.trace as tracing from fastdeploy.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -103,6 +104,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest): """ Create a new chat completion using the specified parameters. """ + tracing.trace_set_thread_info("API Server") if not self._check_master(): err_msg = ( f"Only master node can accept completion request, please send request to master node: {self.master_ip}" @@ -134,6 +136,8 @@ async def create_chat_completion(self, request: ChatCompletionRequest): request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}" else: request_id = f"chatcmpl-{uuid.uuid4()}" + tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy") + del request.trace_context api_server_logger.info(f"create chat completion request: {request_id}") prompt_tokens = None max_tokens = None @@ -414,6 +418,19 @@ async def chat_completion_stream_generator( arrival_time=arrival_time, ) if res["finished"]: + trace_carrier = res.get("trace_carrier") + if trace_carrier: + tracing.trace_set_proc_propagate_context(request_id, trace_carrier) + start_time = res["metrics"]["engine_recv_latest_token_time"] + tracing.trace_report_span( + tracing.TraceSpanName.POSTPROCESSING, + request_id, + int(start_time * 1e9), + int(time.time() * 1e9), + thread_finish_flag=True, + ) + if "trace_carrier" in res: + del res["trace_carrier"] num_choices -= 1 main_process_metrics.e2e_request_latency.observe( time.time() - res["metrics"]["request_start_time"] @@ -487,6 +504,7 @@ async def chat_completion_stream_generator( ) yield f"data: {error_data}\n\n" finally: + tracing.trace_req_finish(request_id) await self.engine_client.connection_manager.cleanup_request(request_id) self.engine_client.semaphore.release() trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", "")) @@ -605,6 +623,19 @@ async def chat_completion_full_generator( if prompt_logprobs_res: prompt_logprobs_res_list[idx].extend(clamp_prompt_logprobs(prompt_logprobs_res)) if data["finished"]: + trace_carrier = data.get("trace_carrier") + if trace_carrier: + tracing.trace_set_proc_propagate_context(request_id, trace_carrier) + start_time = data["metrics"]["engine_recv_latest_token_time"] + tracing.trace_report_span( + tracing.TraceSpanName.POSTPROCESSING, + request_id, + int(start_time * 1e9), + int(time.time() * 1e9), + thread_finish_flag=True, + ) + if "trace_carrier" in data: + del data["trace_carrier"] num_choices -= 1 
reasoning_num_tokens[idx] = data["outputs"].get("reasoning_token_num", 0) if data["outputs"].get("image_token_num"): @@ -629,6 +660,7 @@ async def chat_completion_full_generator( ) choices.append(choice) finally: + tracing.trace_req_finish(request_id) await self.engine_client.connection_manager.cleanup_request(request_id) self.engine_client.semaphore.release() api_server_logger.info(f"release {self.engine_client.semaphore.status()}") diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index e02fa22be5a..a323e289e60 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -24,6 +24,7 @@ import numpy as np +import fastdeploy.metrics.trace as tracing from fastdeploy.engine.request import RequestOutput from fastdeploy.entrypoints.openai.protocol import ( CompletionLogprobs, @@ -82,6 +83,7 @@ async def create_completion(self, request: CompletionRequest): """ Create a completion for the given prompt. """ + tracing.trace_set_thread_info("API Server") if not self._check_master(): err_msg = ( f"Only master node can accept completion request, please send request to master node: {self.master_ip}" @@ -106,6 +108,8 @@ async def create_completion(self, request: CompletionRequest): else: request_id = f"cmpl-{uuid.uuid4()}" api_server_logger.info(f"Initialize request {request_id}: {request}") + tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy") + del request.trace_context request_prompt_ids = None request_prompts = None @@ -316,6 +320,19 @@ async def completion_full_generator( output_tokens[rid] += len(data["outputs"]["token_ids"]) completion_batched_token_ids[rid].extend(data["outputs"]["token_ids"]) if data.get("finished", False): + trace_carrier = data.get("trace_carrier") + if trace_carrier: + tracing.trace_set_proc_propagate_context(request_id, trace_carrier) + start_time = data["metrics"]["engine_recv_latest_token_time"] + tracing.trace_report_span( + tracing.TraceSpanName.POSTPROCESSING, + request_id, + int(start_time * 1e9), + int(time.time() * 1e9), + thread_finish_flag=True, + ) + if "trace_carrier" in data: + del data["trace_carrier"] data["output_token_ids"] = output_tokens[rid] data["outputs"]["top_logprobs"] = aggregated_top_logprobs[rid] data["outputs"]["draft_top_logprobs"] = aggregated_draft_top_logprobs[rid] @@ -340,6 +357,7 @@ async def completion_full_generator( except Exception as e: api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True) finally: + tracing.trace_req_finish(request_id) trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", "")) self.engine_client.semaphore.release() if dealer is not None: @@ -568,6 +586,19 @@ async def completion_stream_generator( choices = [] if res["finished"]: + trace_carrier = res.get("trace_carrier") + if trace_carrier: + tracing.trace_set_proc_propagate_context(request_id, trace_carrier) + start_time = res["metrics"]["engine_recv_latest_token_time"] + tracing.trace_report_span( + tracing.TraceSpanName.POSTPROCESSING, + request_id, + int(start_time * 1e9), + int(time.time() * 1e9), + thread_finish_flag=True, + ) + if "trace_carrier" in res: + del res["trace_carrier"] num_choices -= 1 if getattr(request, "stream_options", None) and request.stream_options.include_usage: usage_chunk = CompletionStreamResponse( @@ -598,6 +629,8 @@ async def completion_stream_generator( api_server_logger.error(f"Error in 
completion_stream_generator: {e}, {str(traceback.format_exc())}") yield f"data: {ErrorResponse(error=ErrorInfo(message=str(e), code='400', type=ErrorType.INTERNAL_ERROR)).model_dump_json(exclude_unset=True)}\n\n" finally: + + tracing.trace_req_finish(request_id) trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", "")) del request if dealer is not None: diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index dc734af5eea..1d6f7a3ac53 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -152,6 +152,8 @@ "FD_HPU_CHUNK_SIZE": lambda: int(os.getenv("FD_HPU_CHUNK_SIZE", "64")), "FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS": lambda: int(os.getenv("FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS", "30")), "FMQ_CONFIG_JSON": lambda: os.getenv("FMQ_CONFIG_JSON", None), + "FD_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS": lambda: int(os.getenv("FD_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS", "500")), + "FD_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE": lambda: int(os.getenv("FD_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE", "64")), } diff --git a/fastdeploy/metrics/trace.py b/fastdeploy/metrics/trace.py new file mode 100644 index 00000000000..5c60b4e98da --- /dev/null +++ b/fastdeploy/metrics/trace.py @@ -0,0 +1,777 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +# This file is modified from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/tracing/trace.py + +from __future__ import annotations + +import inspect +import os +import random +import threading +import time +import uuid +from dataclasses import dataclass +from enum import Enum, unique +from functools import wraps +from typing import Any, Dict, List, Optional + +from fastdeploy import envs +from fastdeploy.utils import api_server_logger as logger + +opentelemetry_imported = False +tracing_enabled = False + +try: + from opentelemetry import context, propagate, trace + from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, + ) + from opentelemetry.sdk.resources import Resource + from opentelemetry.sdk.trace import SpanProcessor, TracerProvider, id_generator + from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter + + opentelemetry_imported = True +except ImportError as e: + print(f"Failed to import opentelemetry, tracing disabled.{e}") + logger.error(f"Failed to import opentelemetry, tracing disabled.{e}") + + class id_generator: + class IdGenerator: + pass + + logger.info("opentelemetry package is not installed, tracing disabled") + + +class FilteringSpanProcessor(SpanProcessor): + def __init__(self, exporter: SpanExporter, **kwargs): + self._processor = BatchSpanProcessor(exporter, **kwargs) + + def on_start(self, span, parent_context=None): + parent_span = trace.get_current_span() + if parent_span and parent_span.is_recording(): + stream_attr = parent_span.attributes.get("stream") + if stream_attr is not None: + span.set_attribute("stream", stream_attr) + self._processor.on_start(span, parent_context) + + def on_end(self, span): + # asgi_event_type = span.attributes.get("asgi.event.type") + # stream = span.attributes.get("stream") + span_name = span.name or "" + + if "http" in span_name: + return + + self._processor.on_end(span) + + def shutdown(self): + self._processor.shutdown() + + def force_flush(self, timeout_millis=None): + self._processor.force_flush(timeout_millis) + + +def label_span(request): + if request.stream: + span = trace.get_current_span() + if span is not None and span.is_recording(): + span.set_attribute("stream", "true") + + +@dataclass +class TraceThreadInfo: + host_id: str + pid: int + thread_label: str + tp_rank: int + dp_rank: int + tracer: trace.Tracer + + +@dataclass +class TraceSliceContext: + slice_name: str + span: Optional[trace.span.Span] = None + # When True, defers slice_name assignment until trace_slice_end() + anonymous: bool = False + + +@dataclass +class TraceThreadContext: + thread_info: TraceThreadInfo + cur_slice_stack: List[TraceSliceContext] + thread_span: Optional[trace.span.Span] = None + # Record the most recently completed span as the previous span for the next span to be created. + last_span_context: Optional[trace.span.SpanContext] = None + + +@dataclass +class TraceReqContext: + rid: str + start_time_ns: int + threads_context: Dict[int, TraceThreadContext] + + # Indicates whether this instance is a replica from the main process. + # When True, root_span is None and only root_span_context is preserved. 
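+    # (copies reconstructed from a propagated carrier hold no root_span; the
+    # process that created the real root span ends it in trace_req_finish())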
+ is_copy: bool = False + root_span: Optional[trace.span.Span] = None + root_span_context: Optional[context.Context] = None + + +@dataclass +class TracePropagateContext: + root_span_context: context.Context + prev_span_context: Optional[trace.span.SpanContext] + + def to_dict(self): + carrier: dict[str, str] = {} + propagate.inject(carrier, context=self.root_span_context) + + if self.prev_span_context: + return { + "root_span": carrier, + "prev_span": { + "span_id": self.prev_span_context.span_id, + "trace_id": self.prev_span_context.trace_id, + }, + } + else: + return {"root_span": carrier, "prev_span": "None"} + + @classmethod + def instance_from_dict(cls, d): + if "root_span" not in d or "prev_span" not in d: + return None + + carrier = d["root_span"] + root_span_context = propagate.extract(carrier) + + if d["prev_span"] == "None": + prev_span_context = None + else: + prev_span_context = trace.span.SpanContext( + trace_id=d["prev_span"]["trace_id"], + span_id=d["prev_span"]["span_id"], + is_remote=True, + ) + + return cls(root_span_context, prev_span_context) + + +class TraceCustomIdGenerator(id_generator.IdGenerator): + """ + The default IdGenerator may produce duplicate trace IDs across multiple TP scheduler processes, + hence a custom IdGenerator is implemented. + """ + + def __init__(self): + super().__init__() + self.local_random = random.Random() + self.local_random.seed(time.time()) + + def generate_trace_id(self) -> int: + return self.local_random.getrandbits(64) + + def generate_span_id(self) -> int: + return self.local_random.getrandbits(64) + + +# global variables +remote_trace_contexts: Dict[str, TracePropagateContext] = {} +threads_info: Dict[int, TraceThreadInfo] = {} +reqs_context: Dict[str, TraceReqContext] = {} + +__get_cur_time_ns = lambda: int(time.time() * 1e9) + + +def __get_host_id() -> str: + """ + In distributed tracing systems, obtain a unique node identifier + and inject it into all subsequently generated spans + to prevent PID conflicts between threads on different nodes. + """ + if envs.FD_HOST_NAME: + return envs.FD_HOST_NAME + paths = ["/etc/machine-id", "/var/lib/dbus/machine-id"] + for path in paths: + try: + with open(path, "r") as f: + val = f.read().strip() + if val: + return val + except Exception: + continue + + mac = uuid.getnode() + if mac != 0: + return uuid.UUID(int=mac).hex + + try: + unique_id = uuid.uuid4().hex + "-" + str(os.getpid()) + return unique_id + except Exception: + return "unknown" + + +# Should be called by each tracked process. 
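+#
+# A sketch of the environment that turns tracing on (the endpoint and host
+# values are illustrative; the variable names are the ones read below and in
+# fastdeploy/envs.py, with 500/64 being the envs.py defaults):
+#
+#   export TRACES_ENABLE=true
+#   export EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4317
+#   export FD_SERVICE_NAME=fastdeploy
+#   export FD_HOST_NAME=$(hostname)
+#   export FD_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS=500
+#   export FD_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE=64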
+def process_tracing_init():
+    global tracing_enabled
+    global __get_cur_time_ns
+    tracing_enabled = envs.TRACES_ENABLE.lower() == "true"
+
+    if not tracing_enabled:
+        logger.warning("Opentelemetry is DISABLED.")
+        return
+
+    if not opentelemetry_imported:
+        tracing_enabled = False
+        return
+
+    try:
+        # --- read env ---
+        service_name = envs.FD_SERVICE_NAME
+        host_name = envs.FD_HOST_NAME
+        resource_attributes = {"service.name": service_name}
+        if host_name:
+            resource_attributes["host.name"] = host_name
+        resource = Resource(attributes=resource_attributes)
+        endpoint = envs.EXPORTER_OTLP_ENDPOINT
+        headers = envs.EXPORTER_OTLP_HEADERS
+        headers = dict(item.split("=") for item in headers.split(",")) if headers else None
+
+        otlp_exporter = get_otlp_span_exporter(endpoint, headers)
+
+        schedule_delay_millis = envs.FD_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS
+        max_export_batch_size = envs.FD_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE
+        processor = FilteringSpanProcessor(
+            otlp_exporter,
+            schedule_delay_millis=schedule_delay_millis,
+            max_export_batch_size=max_export_batch_size,
+        )
+        tracer_provider = TracerProvider(resource=resource, id_generator=TraceCustomIdGenerator())
+
+        tracer_provider.add_span_processor(processor)
+        # tracer_provider.add_span_processor(
+        #     SimpleSpanProcessor(ConsoleSpanExporter())
+        # )
+        trace.set_tracer_provider(tracer_provider)
+    except Exception as e:
+        logger.error(f"failed to initialize opentelemetry: {e}")
+        logger.warning("please set a correct OTLP endpoint")
+        tracing_enabled = False
+        return
+
+    if hasattr(time, "time_ns"):
+        __get_cur_time_ns = lambda: int(time.time_ns())
+
+    tracing_enabled = True
+
+
+def get_otlp_span_exporter(endpoint, headers):
+    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
+        OTLPSpanExporter as GRPCSpanExporter,
+    )
+    from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+        OTLPSpanExporter as HTTPSpanExporter,
+    )
+
+    protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc")
+    supported_protocols = {"grpc", "http/protobuf"}
+
+    if protocol not in supported_protocols:
+        raise ValueError(
+            f"Unsupported OTLP protocol '{protocol}' configured. "
+            f"Supported protocols are: {', '.join(sorted(supported_protocols))}"
+        )
+
+    if protocol == "grpc":
+        return GRPCSpanExporter(endpoint=endpoint, insecure=True)
+    elif protocol == "http/protobuf":
+        return HTTPSpanExporter(endpoint=endpoint, headers=headers)
+
+
+# Should be called by each tracked thread.
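+#
+# Usage sketch (the labels used elsewhere in this patch include "engine",
+# "API Server" and "Token Processor"; ranks are optional and only decorate
+# the per-thread span name and attributes):
+#
+#   tracing.trace_set_thread_info("Token Processor", tp_rank=0, dp_rank=0)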
+def trace_set_thread_info(thread_label: str, tp_rank: Optional[int] = None, dp_rank: Optional[int] = None): + if not tracing_enabled: + return + + pid = threading.get_native_id() + if pid in threads_info: + return + + threads_info[pid] = TraceThreadInfo( + host_id=__get_host_id(), + pid=pid, + thread_label=thread_label, + tp_rank=tp_rank, + dp_rank=dp_rank, + tracer=trace.get_tracer("fastdeploy server"), + ) + + +def __create_thread_context(pid, req_span_context, ts: Optional[int] = None): + if pid not in threads_info: + trace_set_thread_info("unknown") + + thread_info = threads_info[pid] + thread_context = TraceThreadContext( + thread_info=thread_info, + cur_slice_stack=[], + ) + + thread_name = f"{thread_info.thread_label}" + if thread_info.tp_rank is not None: + thread_name += f" [TP {thread_info.tp_rank}] " + thread_name += f"(host:{thread_info.host_id} | pid:{pid})" + ts = ts or __get_cur_time_ns() + thread_context.thread_span = thread_context.thread_info.tracer.start_span( + name=thread_name, + start_time=ts, + context=req_span_context, + ) + + if thread_info.tp_rank is not None: + thread_context.thread_span.set_attributes({"tp_rank": thread_info.tp_rank}) + + thread_context.thread_span.set_attributes( + { + "host_id": thread_info.host_id, + "pid": thread_info.pid, + "thread_label": thread_info.thread_label, + } + ) + + return thread_context + + +def trace_get_proc_propagate_context(rid) -> Optional[Dict[str, Any]]: + if not tracing_enabled: + return None + + rid = str(rid) + if rid not in reqs_context or not reqs_context[rid].root_span_context: + return None + + pid = threading.get_native_id() + prev_span_context = None + thread_context = reqs_context[rid].threads_context[pid] + if thread_context.cur_slice_stack: + cur_slice_info = thread_context.cur_slice_stack[0] + prev_span_context = cur_slice_info.span.get_span_context() + elif thread_context.last_span_context: + prev_span_context = thread_context.last_span_context + + root_span_context = reqs_context[rid].root_span_context + + trace_context = TracePropagateContext(root_span_context, prev_span_context) + return trace_context.to_dict() + + +def trace_set_proc_propagate_context(rid, trace_context: Optional[Dict[str, Any]], ts: Optional[int] = None): + if not tracing_enabled: + return + if not trace_context: + return + + trace_context = TracePropagateContext.instance_from_dict(trace_context) + if not trace_context: + return + + rid = str(rid) + # Create a copy of the request context + if rid not in reqs_context: + reqs_context[rid] = TraceReqContext( + rid=rid, + start_time_ns=ts or __get_cur_time_ns(), + threads_context={}, + root_span_context=trace_context.root_span_context, + is_copy=True, + ) + + pid = threading.get_native_id() + + if pid in reqs_context[rid].threads_context: + return + + # Create new thread context. + reqs_context[rid].threads_context[pid] = __create_thread_context( + pid, + trace_context.root_span_context, + reqs_context[rid].start_time_ns, + ) + + reqs_context[rid].threads_context[pid].last_span_context = trace_context.prev_span_context + + +def trace_req_start( + rid: str, + trace_content: str, + ts: Optional[int] = None, + role: Optional[str] = "null", +): + if not tracing_enabled: + return + + rid = str(rid) + + ts = ts or __get_cur_time_ns() + + pid = threading.get_native_id() + if pid not in threads_info: + return + + tracer = threads_info[pid].tracer + + upstream_context = trace_content + + # 1. 
Check if there is already an active Span (from FastAPI Instrumentor) + active_span = trace.get_current_span() + if active_span is not None and active_span.is_recording(): + active_span.set_attribute("rid", rid) + new_span_name = active_span.name + f" (Req: {rid})" + active_span.update_name(new_span_name) + + active_span_context = active_span.get_span_context() + + if active_span_context.is_valid and active_span_context.trace_id != 0: + # Scenario: FastAPIInstrumentor has created the top-level Span + + if rid in reqs_context: + return + + logger.info(f"Using existing active span from context as root for RID: {rid}") + + # Inject the FastAPI Span Context as the root Span Context into the internal structure + reqs_context[rid] = TraceReqContext( + rid=rid, + start_time_ns=ts, + threads_context={}, + root_span=active_span, + root_span_context=context.get_current(), + is_copy=True, + ) + # Thread context is necessary so that trace_slice_start can find the tracer + if pid not in reqs_context[rid].threads_context: + reqs_context[rid].threads_context[pid] = __create_thread_context( + pid, + context.get_current(), + ts, + ) + # No need to manually end req/bootstrap room span, this is handled by FastAPIInstrumentor + return + + parent_context = None + + use_upstream = False + if upstream_context: + ctx_span = trace.get_current_span(upstream_context) + if ctx_span.get_span_context().is_valid: + use_upstream = True + + if use_upstream: + logger.info(f"Continuing upstream trace for RID={rid}") + parent_context = upstream_context + + reqs_context[rid] = TraceReqContext( + rid=rid, + start_time_ns=ts, + threads_context={}, + is_copy=True, + ) + + else: + reqs_context[rid] = TraceReqContext( + rid=rid, + start_time_ns=ts, + threads_context={}, + is_copy=False, + ) + + orig_rid = rid.split("_")[0] + role = "" if role == "null" else role + attrs = {"rid": orig_rid} + + root_span = tracer.start_span( + name=f"{role} Req {orig_rid}".strip(), + start_time=ts, + context=parent_context, + kind=trace.SpanKind.SERVER, + attributes=attrs, + ) + + root_span.set_attributes( + { + "rid": rid, + } + ) + + # Consistently populate the Root Span information in reqs_context + reqs_context[rid].root_span = root_span + reqs_context[rid].root_span_context = trace.set_span_in_context(root_span) + + # create thread context and thread span + reqs_context[rid].threads_context[pid] = __create_thread_context( + pid, + reqs_context[rid].root_span_context, + ts, + ) + + +def trace_req_finish(rid: str, ts: Optional[int] = None, attrs: Optional[Dict[str, Any]] = None): + if not tracing_enabled: + return + + rid = str(rid) + if rid not in reqs_context: + return + + req_context = reqs_context[rid] + ts = ts or __get_cur_time_ns() + + # End all unclosed thread spans. 
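+    # (a thread span is still open here if its thread never called
+    # trace_slice_end(..., thread_finish_flag=True) for this request)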
+    for thread_context in req_context.threads_context.values():
+        thread_context.thread_span.end(end_time=ts)
+
+    # Only end the root_span if it was manually created
+    if req_context.root_span:
+        if attrs:
+            req_context.root_span.set_attributes(attrs)
+        req_context.root_span.end(end_time=ts)
+
+    del reqs_context[rid]
+
+
+def trace_slice_start(
+    name: str,
+    rid: str,
+    ts: Optional[int] = None,
+    anonymous: bool = False,
+):
+    if not tracing_enabled:
+        return
+
+    rid = str(rid)
+    if rid not in reqs_context:
+        return
+
+    pid = threading.get_native_id()
+    if pid not in reqs_context[rid].threads_context:
+        return
+
+    thread_context = reqs_context[rid].threads_context[pid]
+
+    ts = ts or __get_cur_time_ns()
+
+    slice_info = TraceSliceContext(
+        slice_name=name,
+        anonymous=anonymous,
+    )
+
+    # find prev slice
+    prev_span_context = None
+    if not thread_context.cur_slice_stack:
+        if thread_context.last_span_context:
+            prev_span_context = thread_context.last_span_context
+
+    parent_span = thread_context.thread_span
+    if thread_context.cur_slice_stack:
+        parent_span = thread_context.cur_slice_stack[-1].span
+
+    parent_span_context = trace.set_span_in_context(parent_span)
+    span = thread_context.thread_info.tracer.start_span(
+        name=slice_info.slice_name,
+        start_time=ts,
+        context=parent_span_context,
+    )
+
+    if prev_span_context:
+        span.add_link(prev_span_context)
+
+    slice_info.span = span
+
+    thread_context.cur_slice_stack.append(slice_info)
+
+
+def trace_slice_end(
+    name: str,
+    rid: str,
+    ts: Optional[int] = None,
+    attrs: Optional[Dict[str, Any]] = None,
+    auto_next_anon: bool = False,
+    thread_finish_flag: bool = False,
+):
+    if not tracing_enabled:
+        return
+
+    rid = str(rid)
+    if rid not in reqs_context:
+        return
+
+    pid = threading.get_native_id()
+    if pid not in reqs_context[rid].threads_context:
+        return
+
+    thread_context = reqs_context[rid].threads_context[pid]
+
+    if not thread_context.cur_slice_stack:
+        logger.warning(f"trace_slice_end('{name}') has no matching trace_slice_start; ignored.")
+        return
+
+    ts = ts or __get_cur_time_ns()
+    slice_info = thread_context.cur_slice_stack[-1]
+    span = slice_info.span
+
+    if slice_info.anonymous:
+        span.update_name(name)
+    else:
+        if slice_info.slice_name != name:
+            span.set_status(trace.Status(trace.StatusCode.ERROR))
+            logger.warning(f"Slice name mismatch: {name} != {slice_info.slice_name}")
+
+    if attrs:
+        span.set_attributes(attrs)
+
+    span.end(end_time=ts)
+
+    thread_context.cur_slice_stack.pop()
+    if len(thread_context.cur_slice_stack) == 0:
+        thread_context.last_span_context = span.get_span_context()
+
+    # If this is the last slice in the thread,
+    # release the thread context and check whether to release the request context.
+    if thread_finish_flag:
+        thread_context.thread_span.end(end_time=ts)
+        del reqs_context[rid].threads_context[pid]
+        if reqs_context[rid].is_copy and not reqs_context[rid].threads_context:
+            del reqs_context[rid]
+        return
+
+    if auto_next_anon:
+        trace_slice_start("", rid, ts, True)
+
+
+# alias
+trace_slice = trace_slice_end
+
+
+def trace_report_span(
+    name: str,
+    rid: str,
+    start_time_ns: int,
+    end_time_ns: int,
+    attrs: Optional[Dict[str, Any]] = None,
+    thread_finish_flag: bool = False,
+):
+    if not tracing_enabled:
+        return
+    trace_slice_start(name, rid, start_time_ns)
+    trace_slice_end(name, rid, end_time_ns, attrs, False, thread_finish_flag)
+
+
+# Add event to the current slice on the same thread with the same rid.
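+#
+# Usage sketch (the event name and attrs are illustrative):
+#
+#   tracing.trace_event("first_token", rid, attrs={"token_index": 0})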
+def trace_event(name: str, rid: str, ts: Optional[int] = None, attrs: Dict[str, Any] = None): + if not tracing_enabled: + return + + rid = str(rid) + if rid not in reqs_context: + return + + pid = threading.get_native_id() + if pid not in reqs_context[rid].threads_context: + return + + thread_context = reqs_context[rid].threads_context[pid] + + if not thread_context.cur_slice_stack: + logger.warning("No slice is currently being traced.") + return + + ts = ts or __get_cur_time_ns() + + slice_info = thread_context.cur_slice_stack[-1] + slice_info.span.add_event(name=name, timestamp=ts, attributes=attrs) + + +# Add attrs to the current slice on the same thread with the same rid. +def trace_slice_add_attr(rid: str, attrs: Dict[str, Any]): + if not tracing_enabled: + return + + rid = str(rid) + if rid not in reqs_context: + return + + pid = threading.get_native_id() + if pid not in reqs_context[rid].threads_context: + return + + thread_context = reqs_context[rid].threads_context[pid] + + if not thread_context.cur_slice_stack: + logger.warning("No slice is currently being traced.") + return + + slice_info = thread_context.cur_slice_stack[-1] + slice_info.span.set_attributes(attrs) + + +def trace_span(span_name: str = None): + + def decorator(func): + if not tracing_enabled: + return func + + pid = threading.get_native_id() + if pid not in threads_info: + trace_set_thread_info("FastDeploy") + + tracer = threads_info[pid].tracer + + name = span_name or func.__name__ + + if inspect.iscoroutinefunction(func): + + @wraps(func) + async def async_wrapper(*args, **kwargs): + with tracer.start_as_current_span(name): + return await func(*args, **kwargs) + + return async_wrapper + + else: + + @wraps(func) + def sync_wrapper(*args, **kwargs): + with tracer.start_as_current_span(name): + return func(*args, **kwargs) + + return sync_wrapper + + return decorator + + +@unique +class TraceSpanName(str, Enum): + + FASTDEPLOY = "FASTDEPLOY" + PREPROCESSING = "PREPROCESSING" + SCHEDULE = "SCHEDULE" + PREFILL = "PREFILL" + DECODE = "DECODE" + POSTPROCESSING = "POSTPROCESSING" diff --git a/fastdeploy/metrics/trace_util.py b/fastdeploy/metrics/trace_util.py deleted file mode 100644 index 111c2c85343..00000000000 --- a/fastdeploy/metrics/trace_util.py +++ /dev/null @@ -1,262 +0,0 @@ -import json -import os - -from fastapi import FastAPI -from opentelemetry import trace -from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor -from opentelemetry.instrumentation.logging import LoggingInstrumentor -from opentelemetry.propagate import extract, inject -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import SpanProcessor, TracerProvider -from opentelemetry.sdk.trace.export import ( - BatchSpanProcessor, - ConsoleSpanExporter, - SpanExporter, -) - -from fastdeploy import envs -from fastdeploy.utils import llm_logger - -# OpenTelemetry Trace context store in metadata -TRACE_CARRIER = "trace_carrier" - -traces_enable = False -tracer = trace.get_tracer(__name__) - - -class FilteringSpanProcessor(SpanProcessor): - def __init__(self, exporter: SpanExporter): - self._processor = BatchSpanProcessor(exporter) - - # 父span属性继承逻辑 - def on_start(self, span, parent_context=None): - parent_span = trace.get_current_span() - if parent_span and parent_span.is_recording(): - stream_attr = parent_span.attributes.get("stream") - if stream_attr is not None: - span.set_attribute("stream", stream_attr) - 
self._processor.on_start(span, parent_context) - - # span导出时的过滤逻辑 - def on_end(self, span): - asgi_event_type = span.attributes.get("asgi.event.type") - stream = span.attributes.get("stream") - span_name = span.name or "" - - if stream and asgi_event_type == "http.response.body" and "http send" in span_name: - return - - self._processor.on_end(span) - - def shutdown(self): - self._processor.shutdown() - - def force_flush(self, timeout_millis=None): - self._processor.force_flush(timeout_millis) - - -# 标记函数 -def lable_span(request): - if request.stream: - span = trace.get_current_span() - if span is not None and span.is_recording(): - span.set_attribute("stream", "true") - - -def set_up(): - try: - # when TRACES_ENABLED=true start trace - global traces_enable - traces_enable = envs.TRACES_ENABLE.lower() == "true" - if not traces_enable: - llm_logger.warning("Opentelemetry is DISABLED.") - return - - llm_logger.info("Opentelemetry is ENABLED, configuring...") - # --- read env --- - service_name = envs.FD_SERVICE_NAME - host_name = envs.FD_HOST_NAME - # --- set attributes (Service Name, Host Name, etc.) --- - resource_attributes = {"service.name": service_name} - if host_name: - resource_attributes["host.name"] = host_name - - resource = Resource(attributes=resource_attributes) - - # --- set Exporter --- - exporter_type = envs.TRACES_EXPORTER.lower() - if exporter_type == "otlp": - endpoint = envs.EXPORTER_OTLP_ENDPOINT # should be set - headers = envs.EXPORTER_OTLP_HEADERS # e.g., "Authentication=***,k2=v2" - - otlp_exporter = OTLPSpanExporter( - endpoint=endpoint, - headers=(dict(item.split("=") for item in headers.split(",")) if headers else None), - ) - processor = FilteringSpanProcessor(otlp_exporter) - llm_logger.info(f"Using OTLP Exporter, sending to {endpoint} with headers {headers}") - else: # default console - processor = FilteringSpanProcessor(ConsoleSpanExporter()) - llm_logger.info("Using Console Exporter.") - - # --- set Tracer Provider --- - provider = TracerProvider(resource=resource) - provider.add_span_processor(processor) - trace.set_tracer_provider(provider) - global tracer - tracer = trace.get_tracer(__name__) - except: - llm_logger.error("set_up failed") - pass - - -def instrument(app: FastAPI): - try: - set_up() - if traces_enable: - llm_logger.info("Applying instrumentors...") - FastAPIInstrumentor.instrument_app(app) - try: - LoggingInstrumentor().instrument(set_logging_format=True) - except Exception: - pass - except: - llm_logger.info("instrument failed") - pass - - -def inject_to_metadata(request, metadata_attr="metadata"): - """ - Inject OpenTelemetry trace context into the metadata field of the request. - - Parameters: - request: can be a dict or object, with metadata attributes or fields. - metadata_attr: the field name of metadata, default is 'metadata'. - - Operation: - - If metadata does not exist, create a new one and mount it on the request. - - Inject the current trace context as a JSON string and store it in metadata. - - Use the key TRACE_CARRIER to store the injected content. - - Note: - - This function is a non-blocking operation, and errors are silently ignored. 
- - If there is no metadata attribute in the request, an empty dict will be created for it as its attribute - """ - try: - if request is None or not traces_enable: - return - - metadata = request.get(metadata_attr) if isinstance(request, dict) else getattr(request, metadata_attr, None) - if metadata is None: - metadata = {} - if isinstance(request, dict): - request[metadata_attr] = metadata - else: - setattr(request, metadata_attr, metadata) - - trace_carrier = {} - inject(trace_carrier) - trace_carrier_json_string = json.dumps(trace_carrier) - metadata[TRACE_CARRIER] = trace_carrier_json_string - except: - pass - - -def extract_from_metadata(request, metadata_attr="metadata"): - """ - Extract trace context from metadata of request object (dict or class instance). - - Parameters: - request: can be a dictionary or any object, containing metadata attributes or fields. - metadata_attr: metadata field name, default is 'metadata'. - - Returns: - - Extraction success: returns OpenTelemetry context object (Context) - - Extraction failure or exception: returns None - """ - try: - metadata = request.get(metadata_attr) if isinstance(request, dict) else getattr(request, metadata_attr, None) - if metadata is None: - return None - - trace_carrier_json_string = metadata.get(TRACE_CARRIER) - if trace_carrier_json_string is None: - return None - - trace_carrier = json.loads(trace_carrier_json_string) - ctx = extract(trace_carrier) - return ctx - except: - return None - - -def extract_from_request(request): - """ - Extract trace context from trace_carrier of request object (dict or class instance). - - Parameters: - request: can be a dictionary or any object, containing metadata attributes or fields. - metadata_attr: metadata field name, default is 'metadata'. - - Returns: - - Extraction success: returns OpenTelemetry context object (Context) - - Extraction failure or exception: returns None - """ - try: - trace_carrier_info = getattr(request, TRACE_CARRIER, None) - - if trace_carrier_info is None: - return None - - trace_carrier = json.loads(trace_carrier_info) - ctx = extract(trace_carrier) - return ctx - except: - return None - - -def start_span(span_name, request, kind=trace.SpanKind.CLIENT): - """ - just start a new span in request trace context - """ - try: - if not traces_enable: - return - # extract Trace context from request.metadata.trace_carrier - ctx = extract_from_metadata(request) - with tracer.start_as_current_span(span_name, context=ctx, kind=kind) as span: - span.set_attribute("job_id", os.getenv("FD_JOB_ID", default="null")) - pass - except: - pass - - -def fd_start_span(span_name, kind=trace.SpanKind.CLIENT): - """ - when fd start, start a new span show start success - """ - try: - if not traces_enable: - return - with tracer.start_as_current_span(span_name, kind=kind) as span: - span.set_attribute("job_id", os.getenv("FD_JOB_ID", default="null")) - pass - except: - pass - - -def start_span_request(span_name, request, kind=trace.SpanKind.CLIENT): - """ - just start a new span in request trace context - """ - try: - if not traces_enable: - return - # extract Trace context from request.metadata.trace_carrier - ctx = extract_from_request(request) - with tracer.start_as_current_span(span_name, context=ctx, kind=kind) as span: - span.set_attribute("job_id", os.getenv("FD_JOB_ID", default="null")) - pass - except: - pass diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 6286451d9eb..3cf9869fa27 100644 --- a/fastdeploy/output/token_processor.py +++ 
@@ -27,6 +27,7 @@
 import paddle
 import zmq
 
+import fastdeploy.metrics.trace as tracing
 from fastdeploy import envs
 from fastdeploy.engine.request import (
     CompletionOutput,
@@ -339,6 +340,7 @@ def process_sampling_results(self):
         """
         read tokens from paddle inference engine and process
         """
+        tracing.trace_set_thread_info("Token Processor")
 
         if current_platform.is_xpu():
             from fastdeploy.model_executor.ops.xpu import (
@@ -641,6 +643,10 @@ def _process_batch_output(self):
 
             is_prefill = task.disaggregate_info is not None and self.cfg.scheduler_config.splitwise_role == "prefill"
             is_decode = task.disaggregate_info is not None and self.cfg.scheduler_config.splitwise_role == "decode"
+            rid = task_id.split("_")[0]
+            trace_carrier = task.trace_carrier
+            ts = int(task.metrics.inference_start_time * 1e9)
+            tracing.trace_set_proc_propagate_context(rid, trace_carrier, ts)
             if self.cfg.speculative_config.method:
                 if accept_num[i] == -3:
                     recovery_stop = True
@@ -684,11 +690,21 @@
             self.total_step += 1
             current_time = time.time()
+            trace_carrier = None
             if self.tokens_counter[task_id] == 0:
                 task.metrics.record_recv_first_token()
                 task.metrics.cal_cost_time()
                 metrics = copy.copy(task.metrics)
                 self._record_first_token_metrics(task, current_time)
+
+                tracing.trace_report_span(
+                    name=tracing.TraceSpanName.PREFILL,
+                    rid=rid,
+                    start_time_ns=int(task.metrics.inference_start_time * 1e9),
+                    end_time_ns=int(time.time() * 1e9),
+                    thread_finish_flag=False,
+                )
+
             else:
                 task.metrics.record_recv_token()
                 if self.tokens_counter[task_id] == 1 and self.cfg.scheduler_config.splitwise_role == "decode":
@@ -710,6 +726,7 @@
                 metrics=metrics,
                 ic_req_data=task.ic_req_data,
                 prompt_token_ids_len=task.prompt_token_ids_len,
+                trace_carrier=trace_carrier,
             )
             if self.tokens_counter[task_id] == 0:
                 if task.messages is not None:
@@ -764,6 +781,15 @@
             if token_id in task.eos_token_ids or is_prefill or recovery_stop:
                 result.finished = True
+                trace_carrier = tracing.trace_get_proc_propagate_context(rid=rid)
+                result.trace_carrier = trace_carrier
+                tracing.trace_report_span(
+                    name=tracing.TraceSpanName.DECODE,
+                    rid=rid,
+                    start_time_ns=int(task.metrics.inference_start_time * 1e9),
+                    end_time_ns=int(time.time() * 1e9),
+                    thread_finish_flag=True,
+                )
                 if recovery_stop:
                     result.error_msg = "Recover is not supported, the result is incomplete!"
                     llm_logger.info(
diff --git a/mkdocs.yml b/mkdocs.yml
index 4a97f7f1ba7..ac4c81f85a2 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -66,6 +66,8 @@ plugins:
         Scheduler: 调度器
         Graceful Shutdown: 服务优雅关闭
         Offline Inference: 离线推理
+        Observability: 可观测性
+        Trace: Trace服务
         CLI: CLI 使用说明
         Chat: Chat命令
         Complete: Complete命令
@@ -173,3 +175,5 @@ nav:
       - Bench: cli/bench.md
      - Run Batch: cli/run-batch.md
      - Tokenizer: cli/tokenizer.md
+  - Observability:
+      - Trace: observability/trace.md
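The hunks above wire per-token tracing into the token processor. As a reading aid only (not part of the patch), the sketch below summarizes that flow; `consume_one_request` is a hypothetical wrapper, while the `tracing.*` calls and their arguments are copied from the hunks:

```python
# Illustrative sketch of the tracing flow added above; not code from this patch.
import time

import fastdeploy.metrics.trace as tracing


def consume_one_request(rid: str, trace_carrier: dict, inference_start_time: float) -> dict:
    tracing.trace_set_thread_info("Token Processor")  # once per consumer thread

    # Re-attach the request's propagated span context in this process.
    start_ns = int(inference_start_time * 1e9)
    tracing.trace_set_proc_propagate_context(rid, trace_carrier, start_ns)

    # The first generated token closes the PREFILL phase...
    tracing.trace_report_span(
        name=tracing.TraceSpanName.PREFILL,
        rid=rid,
        start_time_ns=start_ns,
        end_time_ns=int(time.time() * 1e9),
        thread_finish_flag=False,
    )

    # ...and the final token closes DECODE and hands the carrier downstream.
    carrier = tracing.trace_get_proc_propagate_context(rid=rid)
    tracing.trace_report_span(
        name=tracing.TraceSpanName.DECODE,
        rid=rid,
        start_time_ns=start_ns,
        end_time_ns=int(time.time() * 1e9),
        thread_finish_flag=True,
    )
    return carrier
```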
diff --git a/tests/metrics/test_trace.py b/tests/metrics/test_trace.py
new file mode 100644
index 00000000000..10e7cfa3a52
--- /dev/null
+++ b/tests/metrics/test_trace.py
@@ -0,0 +1,615 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import os
+import threading
+import time
+import unittest
+from unittest import mock
+from unittest.mock import MagicMock, patch
+
+import pytest
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+
+from fastdeploy.metrics import trace
+from fastdeploy.metrics.trace import FilteringSpanProcessor, label_span
+
+
+class TestFilteringSpanProcessor(unittest.TestCase):
+    """Test cases for FilteringSpanProcessor class"""
+
+    def setUp(self):
+        """Set up test fixtures"""
+        self.exporter = ConsoleSpanExporter()
+        self.processor = FilteringSpanProcessor(self.exporter)
+
+    def test_initialization(self):
+        """Test that FilteringSpanProcessor is properly initialized"""
+        self.assertIsInstance(self.processor._processor, BatchSpanProcessor)
+        self.assertEqual(self.processor._processor.span_exporter, self.exporter)
+
+    def test_on_start_with_parent_span(self):
+        """Test on_start method with parent span containing stream attribute"""
+        # Mock span and parent context
+        mock_span = MagicMock()
+        mock_parent_span = MagicMock()
+        mock_parent_span.is_recording.return_value = True
+        mock_parent_span.attributes.get.return_value = "test_stream"
+
+        # Mock trace.get_current_span to return parent span
+        with patch("fastdeploy.metrics.trace.trace.get_current_span", return_value=mock_parent_span):
+            with patch.object(self.processor._processor, "on_start") as mock_parent_on_start:
+                self.processor.on_start(mock_span, parent_context=None)
+
+                # Verify stream attribute is set on child span
+                mock_span.set_attribute.assert_called_once_with("stream", "test_stream")
+                mock_parent_on_start.assert_called_once_with(mock_span, None)
+
+    def test_on_start_without_parent_span(self):
+        """Test on_start method without parent span"""
+        mock_span = MagicMock()
+
+        # Mock trace.get_current_span to return None
+        with patch("fastdeploy.metrics.trace.trace.get_current_span", return_value=None):
+            with patch.object(self.processor._processor, "on_start") as mock_parent_on_start:
+                self.processor.on_start(mock_span, parent_context=None)
+
+                # Verify no attributes are set
+                mock_span.set_attribute.assert_not_called()
+                mock_parent_on_start.assert_called_once_with(mock_span, None)
+
+    def test_on_start_with_non_recording_parent_span(self):
+        """Test on_start method with non-recording parent span"""
+        mock_span = MagicMock()
+        mock_parent_span = MagicMock()
+        mock_parent_span.is_recording.return_value = False
+
+        with patch("fastdeploy.metrics.trace.trace.get_current_span", return_value=mock_parent_span):
+            with patch.object(self.processor._processor, "on_start") as mock_parent_on_start:
+                self.processor.on_start(mock_span, parent_context=None)
+
+                # Verify no attributes are set
+                mock_span.set_attribute.assert_not_called()
+                mock_parent_on_start.assert_called_once_with(mock_span, None)
+
+    def test_on_end_filter_stream_http_response(self):
+        """Test on_end method filters out stream http response spans"""
+        mock_span = MagicMock()
+        mock_span.attributes.get.side_effect = lambda key: {
+            "asgi.event.type": "http.response.body",
+            "stream": "true",
+        }.get(key)
+        mock_span.name = "http send request"
request" + + with patch.object(self.processor._processor, "on_end") as mock_parent_on_end: + self.processor.on_end(mock_span) + + # Verify parent on_end is NOT called (span is filtered out) + mock_parent_on_end.assert_not_called() + + def test_on_end_keep_spans_without_http_send(self): + """Test on_end method keeps spans without 'http send' in name""" + mock_span = MagicMock() + mock_span.attributes.get.side_effect = lambda key: { + "asgi.event.type": "http.response.body", + "stream": "true", + }.get(key) + mock_span.name = "other operation" + + with patch.object(self.processor._processor, "on_end") as mock_parent_on_end: + self.processor.on_end(mock_span) + + # Verify parent on_end is called + mock_parent_on_end.assert_called_once_with(mock_span) + + def test_shutdown(self): + """Test shutdown method""" + with patch.object(self.processor._processor, "shutdown") as mock_shutdown: + self.processor.shutdown() + mock_shutdown.assert_called_once() + + def test_force_flush(self): + """Test force_flush method""" + with patch.object(self.processor._processor, "force_flush") as mock_force_flush: + self.processor.force_flush(timeout_millis=5000) + mock_force_flush.assert_called_once_with(5000) + + +class TestLableSpan(unittest.TestCase): + """Test cases for label_span function""" + + def test_lable_span_with_stream_request(self): + """Test label_span function with streaming request""" + mock_request = MagicMock() + mock_request.stream = True + + mock_span = MagicMock() + mock_span.is_recording.return_value = True + + with patch("fastdeploy.metrics.trace.trace.get_current_span", return_value=mock_span): + label_span(mock_request) + + # Verify stream attribute is set + mock_span.set_attribute.assert_called_once_with("stream", "true") + + def test_lable_span_without_stream_request(self): + """Test label_span function with non-streaming request""" + mock_request = MagicMock() + mock_request.stream = False + + mock_span = MagicMock() + mock_span.is_recording.return_value = True + + with patch("fastdeploy.metrics.trace.trace.get_current_span", return_value=mock_span): + label_span(mock_request) + + # Verify no attributes are set + mock_span.set_attribute.assert_not_called() + + def test_lable_span_without_current_span(self): + """Test label_span function when no current span exists""" + mock_request = MagicMock() + mock_request.stream = True + + with patch("fastdeploy.metrics.trace.trace.get_current_span", return_value=None): + # Should not raise any exception + label_span(mock_request) + + def test_lable_span_with_non_recording_span(self): + """Test label_span function with non-recording span""" + mock_request = MagicMock() + mock_request.stream = True + + mock_span = MagicMock() + mock_span.is_recording.return_value = False + + with patch("fastdeploy.metrics.trace.trace.get_current_span", return_value=mock_span): + label_span(mock_request) + + # Verify no attributes are set + mock_span.set_attribute.assert_not_called() + + +class TestTraceComprehensive: + """Comprehensive tests for tracing functionality""" + + def setup_method(self): + """Setup test environment""" + # Mock environment variables + self.original_env = os.environ.copy() + os.environ["TRACES_ENABLE"] = "true" + os.environ["FD_SERVICE_NAME"] = "test_service" + os.environ["FD_HOST_NAME"] = "test_host" + os.environ["EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4317" + os.environ["EXPORTER_OTLP_HEADERS"] = "key1=value1,key2=value2" + os.environ["FD_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS"] = "1000" + 
os.environ["FD_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE"] = "512" + + # Reset global state + trace.remote_trace_contexts = {} + trace.threads_info = {} + trace.reqs_context = {} + trace.tracing_enabled = False + + def teardown_method(self): + """Restore environment""" + os.environ = self.original_env + + def test_process_tracing_init_with_different_scenarios(self): + """Test tracing initialization under different scenarios""" + # Test normal initialization + trace.process_tracing_init() + assert trace.tracing_enabled is True + + # Test with tracing disabled + os.environ["TRACES_ENABLE"] = "false" + trace.process_tracing_init() + assert trace.tracing_enabled is False + + # Test with invalid endpoint + os.environ["TRACES_ENABLE"] = "true" + os.environ["EXPORTER_OTLP_ENDPOINT"] = "" + + # Test with different protocols + for protocol in ["grpc", "http/protobuf"]: + os.environ["OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"] = protocol + trace.process_tracing_init() + assert trace.tracing_enabled is True + + # Test with unsupported protocol + os.environ["OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"] = "unsupported" + with pytest.raises(ValueError): + trace.get_otlp_span_exporter("http://localhost:4317", None) + + def test_thread_info_with_different_ranks(self): + """Test thread info with TP and DP ranks""" + # Test with TP rank + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread_tp", tp_rank=0, dp_rank=1) + + pid = threading.get_native_id() + info = trace.threads_info[pid] + assert info.tp_rank == 0 + assert info.dp_rank == 1 + + # Test with None ranks + trace.trace_set_thread_info("test_thread_no_ranks") + info = trace.threads_info[pid] # Should still be the same thread + assert info.tp_rank == 0 # Should preserve previous values + + def test_advanced_request_scenarios(self): + """Test advanced request tracing scenarios""" + # Test request with timestamp + rid = "test_request_timestamp" + ts = int(time.time() * 1e9) - 1000 # 1 microsecond ago + + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + + trace.trace_req_start(rid, "", ts=ts) + assert rid in trace.reqs_context + assert trace.reqs_context[rid].start_time_ns == ts + + trace.trace_req_finish(rid, ts=ts + 2000) + + # Test request with attributes + rid2 = "test_request_attrs" + trace.trace_req_start(rid2, "") + attrs = {"attr1": "value1", "attr2": 123} + trace.trace_req_finish(rid2, attrs=attrs) + + def test_complex_slice_scenarios(self): + """Test complex slice operations""" + rid = "test_complex_slices" + + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + trace.trace_req_start(rid, "") + + # Test nested slices + trace.trace_slice_start("outer", rid) + trace.trace_slice_start("inner", rid) + trace.trace_slice_end("inner", rid) + trace.trace_slice_end("outer", rid) + + # Test anonymous slices + trace.trace_slice_start("", rid, anonymous=True) + trace.trace_slice_end("anonymous_test", rid) + + trace.trace_req_finish(rid) + + def test_trace_report_span_function(self): + """Test the trace_report_span convenience function""" + rid = "test_report_span" + + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + trace.trace_req_start(rid, "") + + # Test trace_report_span + start_time = int(time.time() * 1e9) + end_time = start_time + 1000000 # 1ms later + attrs = {"test_attr": "test_value"} + + trace.trace_report_span("report_test", rid, start_time, end_time, attrs) + + trace.trace_req_finish(rid) + + def test_propagation_advanced_scenarios(self): + """Test advanced context 
propagation scenarios""" + rid = "test_advanced_propagation" + + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + trace.trace_req_start(rid, "") + + # Create slices to get a non-null prev_span_context + trace.trace_slice_start("slice1", rid) + trace.trace_slice_end("slice1", rid) + + # Get context with prev_span_context + context_dict = trace.trace_get_proc_propagate_context(rid) + assert context_dict is not None + assert "prev_span" in context_dict + + # Test propagation with timestamp + new_rid = "test_propagated" + ts = int(time.time() * 1e9) + trace.trace_set_proc_propagate_context(new_rid, context_dict, ts=ts) + + assert new_rid in trace.reqs_context + assert trace.reqs_context[new_rid].is_copy is True + assert trace.reqs_context[new_rid].start_time_ns == ts + + # Test with empty or invalid context + trace.trace_set_proc_propagate_context("invalid_rid", None) + trace.trace_set_proc_propagate_context("invalid_rid", {}) + trace.trace_set_proc_propagate_context("invalid_rid", {"invalid": "data"}) + + trace.trace_req_finish(rid) + trace.trace_req_finish(new_rid) + + def test_multiple_threads_same_request(self): + """Test tracing with multiple threads on same request""" + rid = "test_multi_thread" + + trace.process_tracing_init() + + # Setup main thread + trace.trace_set_thread_info("main_thread") + trace.trace_req_start(rid, "") + + # Create worker thread + def worker_thread(): + trace.trace_set_thread_info("worker_thread") + trace.trace_slice_start("worker_task", rid) + time.sleep(0.001) # Simulate work + trace.trace_slice_end("worker_task", rid) + + thread = threading.Thread(target=worker_thread) + thread.start() + thread.join() + + # Main thread continues + trace.trace_slice_start("main_task", rid) + trace.trace_slice_end("main_task", rid) + + trace.trace_req_finish(rid) + + def test_trace_span_enum(self): + """Test TraceSpanName enum values""" + assert trace.TraceSpanName.FASTDEPLOY == "FASTDEPLOY" + assert trace.TraceSpanName.PREPROCESSING == "PREPROCESSING" + assert trace.TraceSpanName.SCHEDULE == "SCHEDULE" + assert trace.TraceSpanName.PREFILL == "PREFILL" + assert trace.TraceSpanName.DECODE == "DECODE" + assert trace.TraceSpanName.POSTPROCESSING == "POSTPROCESSING" + + # Test all enum members exist + expected_spans = ["FASTDEPLOY", "PREPROCESSING", "SCHEDULE", "PREFILL", "DECODE", "POSTPROCESSING"] + for span_name in expected_spans: + assert hasattr(trace.TraceSpanName, span_name) + + def test_host_id_generation(self): + """Test host ID generation logic""" + # Test with environment variable (most reliable) + os.environ["FD_HOST_NAME"] = "env-host-id" + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + pid = threading.get_native_id() + assert pid in trace.threads_info + assert trace.threads_info[pid].host_id == "env-host-id" + + # Test fallback (when env var is not set) + os.environ.pop("FD_HOST_NAME", None) + trace.threads_info.clear() # Reset to trigger re-calculation + trace.trace_set_thread_info("test_thread2") + pid2 = threading.get_native_id() + assert pid2 in trace.threads_info + # Should generate some kind of host ID + assert trace.threads_info[pid2].host_id is not None + assert len(trace.threads_info[pid2].host_id) > 0 + + def test_edge_case_operations(self): + """Test edge case operations""" + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + + # Test operations on empty stack + rid = "test_edge_cases" + trace.trace_req_start(rid, "") + + # Try to end a slice that doesn't exist + 
trace.trace_slice_end("non_existent", rid) + + # Try to add event to non-existent slice + trace.trace_event("test_event", rid) + + trace.trace_req_finish(rid) + + # Test repeated operations on finished request + trace.trace_slice_start("test", rid) + trace.trace_slice_end("test", rid) + trace.trace_event("test", rid) + + def test_timing_functions(self): + """Test timing-related functions""" + # Test that time_ns is used if available + if hasattr(time, "time_ns"): + trace.process_tracing_init() + # Test that timing works correctly by checking timestamps + ts1 = int(time.time() * 1e9) + time.sleep(0.001) # 1ms + ts2 = int(time.time() * 1e9) + assert ts2 > ts1 + assert ts2 - ts1 >= 1000000 # At least 1ms in nanoseconds + + def test_request_start_with_trace_content(self): + """Test request start with trace content (upstream context)""" + rid = "test_upstream_context" + + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + + # Test with empty upstream context (valid case) + trace_content = "" + trace.trace_req_start(rid, trace_content, role="test_role") + + # Verify the request was created + assert rid in trace.reqs_context + + trace.trace_req_finish(rid) + + def test_span_linking_logic(self): + """Test span linking functionality""" + rid = "test_span_linking" + + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + trace.trace_req_start(rid, "") + + # Create first slice + trace.trace_slice_start("first_slice", rid) + trace.trace_slice_end("first_slice", rid) + + # Create second slice (should be linked to first) + trace.trace_slice_start("second_slice", rid) + trace.trace_slice_end("second_slice", rid) + + trace.trace_req_finish(rid) + + @mock.patch("fastdeploy.metrics.trace.trace") + def test_active_span_handling(self, mock_trace): + """Test handling of active spans from FastAPI Instrumentor""" + rid = "test_active_span" + + # Mock an active span + mock_span = mock.MagicMock() + mock_span.is_recording.return_value = True + mock_span.name = "fastapi_request" + mock_span.get_span_context.return_value = mock.MagicMock(is_valid=True, trace_id=1234567890) + mock_trace.get_current_span.return_value = mock_span + mock_trace.set_span_in_context.return_value = "mock_context" + + trace.process_tracing_init() + trace.trace_set_thread_info("test_thread") + + trace.trace_req_start(rid, "") + + # Verify the active span was used + assert rid in trace.reqs_context + assert trace.reqs_context[rid].is_copy is True + mock_span.set_attribute.assert_called_with("rid", rid) + mock_span.update_name.assert_called_with("fastapi_request (Req: test_active_span)") + + trace.trace_req_finish(rid) + + def test_lable_span_functionality(self): + """Test the label_span function with different scenarios""" + + # Create mock request and span + class MockRequest: + def __init__(self, stream): + self.stream = stream + + mock_span = mock.MagicMock() + mock_span.is_recording.return_value = True + + with mock.patch("fastdeploy.metrics.trace.trace.get_current_span", return_value=mock_span): + # Test with stream=True + request_stream = MockRequest(True) + trace.label_span(request_stream) + mock_span.set_attribute.assert_called_with("stream", "true") + + # Test with stream=False + request_no_stream = MockRequest(False) + trace.label_span(request_no_stream) + # Should not set stream attribute for False + + # Test with no active span + with mock.patch( + "fastdeploy.metrics.trace.trace.get_current_span", return_value=mock.MagicMock(is_recording=False) + ): + request_no_stream = 
+            trace.label_span(request_no_stream)
+            # Should not set stream attribute for False
+            # Should not crash
+
+    def test_error_handling_and_logging(self):
+        """Test error handling and logging scenarios"""
+        trace.process_tracing_init()
+        trace.trace_set_thread_info("test_thread")
+
+        with mock.patch("fastdeploy.metrics.trace.logger") as mock_logger:
+            # Test operations on non-existent request
+            rid = "non_existent"
+            trace.trace_slice_start("test", rid)
+            trace.trace_slice_end("test", rid)
+            trace.trace_event("test", rid)
+            trace.trace_slice_add_attr(rid, {"test": "value"})
+
+            # Should log warnings but not crash
+            # Check if warning was called (may not always be called depending on implementation)
+
+        # Test slice name mismatch warning
+        rid = "test_mismatch_warning"
+        trace.trace_req_start(rid, "")
+
+        with mock.patch("fastdeploy.metrics.trace.logger") as mock_logger:
+            trace.trace_slice_start("start_name", rid)
+            trace.trace_slice_end("different_name", rid)
+            assert mock_logger.warning.called
+
+        trace.trace_req_finish(rid)
+
+
+class TestPerformanceAndConcurrency:
+    """Performance and concurrency tests"""
+
+    def test_concurrent_requests(self):
+        """Test handling of concurrent requests"""
+        trace.process_tracing_init()
+
+        def process_request(request_id, results_list):
+            """Process a single request"""
+            trace.trace_set_thread_info(f"thread_{request_id}")
+            trace.trace_req_start(request_id, "")
+            trace.trace_slice_start("process", request_id)
+            time.sleep(0.001)  # Simulate work
+            trace.trace_slice_end("process", request_id)
+            trace.trace_req_finish(request_id)
+            result = f"request_{request_id}_completed"
+            results_list.append(result)
+            return result
+
+        # Process multiple requests concurrently
+        results = []
+        threads = []
+
+        for i in range(10):
+            thread = threading.Thread(target=process_request, args=(f"req_{i}", results))
+            threads.append(thread)
+            thread.start()
+
+        for thread in threads:
+            thread.join()
+
+        # Verify all requests were processed
+        assert len([r for r in results if r.endswith("_completed")]) == 10
+
+    def test_memory_cleanup(self):
+        """Test proper memory cleanup"""
+        trace.process_tracing_init()
+        trace.trace_set_thread_info("test_thread")
+
+        # Create and finish multiple requests
+        for i in range(5):
+            rid = f"test_request_{i}"
+            trace.trace_req_start(rid, "")
+            trace.trace_slice_start("test", rid)
+            trace.trace_slice_end("test", rid)
+            trace.trace_req_finish(rid)
+
+        # Verify cleanup
+        assert len(trace.reqs_context) == 0
+
+        # Thread info should persist
+        pid = threading.get_native_id()
+        assert pid in trace.threads_info
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
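For orientation only (not part of the patch), the request-level lifecycle the suite above exercises can be summarized by the following minimal sketch; the rid and attribute values are made up, and the TRACES_ENABLE gating is inferred from the tests:

```python
import fastdeploy.metrics.trace as trace

trace.process_tracing_init()               # honors TRACES_ENABLE, per the tests above
trace.trace_set_thread_info("api_thread")  # once per participating thread

rid = "demo_request"
trace.trace_req_start(rid, "")             # "" = no upstream trace carrier
trace.trace_slice_start("tokenize", rid)
trace.trace_event("first_token", rid)
trace.trace_slice_end("tokenize", rid)
trace.trace_req_finish(rid, attrs={"model": "demo"})
```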
-""" - -import unittest -from unittest.mock import MagicMock, patch - -from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter - -from fastdeploy.metrics.trace_util import FilteringSpanProcessor, lable_span - - -class TestFilteringSpanProcessor(unittest.TestCase): - """Test cases for FilteringSpanProcessor class""" - - def setUp(self): - """Set up test fixtures""" - self.exporter = ConsoleSpanExporter() - self.processor = FilteringSpanProcessor(self.exporter) - - def test_initialization(self): - """Test that FilteringSpanProcessor is properly initialized""" - self.assertIsInstance(self.processor._processor, BatchSpanProcessor) - self.assertEqual(self.processor._processor.span_exporter, self.exporter) - - def test_on_start_with_parent_span(self): - """Test on_start method with parent span containing stream attribute""" - # Mock span and parent context - mock_span = MagicMock() - mock_parent_span = MagicMock() - mock_parent_span.is_recording.return_value = True - mock_parent_span.attributes.get.return_value = "test_stream" - - # Mock trace.get_current_span to return parent span - with patch("fastdeploy.metrics.trace_util.trace.get_current_span", return_value=mock_parent_span): - with patch.object(self.processor._processor, "on_start") as mock_parent_on_start: - self.processor.on_start(mock_span, parent_context=None) - - # Verify stream attribute is set on child span - mock_span.set_attribute.assert_called_once_with("stream", "test_stream") - mock_parent_on_start.assert_called_once_with(mock_span, None) - - def test_on_start_without_parent_span(self): - """Test on_start method without parent span""" - mock_span = MagicMock() - - # Mock trace.get_current_span to return None - with patch("fastdeploy.metrics.trace_util.trace.get_current_span", return_value=None): - with patch.object(self.processor._processor, "on_start") as mock_parent_on_start: - self.processor.on_start(mock_span, parent_context=None) - - # Verify no attributes are set - mock_span.set_attribute.assert_not_called() - mock_parent_on_start.assert_called_once_with(mock_span, None) - - def test_on_start_with_non_recording_parent_span(self): - """Test on_start method with non-recording parent span""" - mock_span = MagicMock() - mock_parent_span = MagicMock() - mock_parent_span.is_recording.return_value = False - - with patch("fastdeploy.metrics.trace_util.trace.get_current_span", return_value=mock_parent_span): - with patch.object(self.processor._processor, "on_start") as mock_parent_on_start: - self.processor.on_start(mock_span, parent_context=None) - - # Verify no attributes are set - mock_span.set_attribute.assert_not_called() - mock_parent_on_start.assert_called_once_with(mock_span, None) - - def test_on_end_filter_stream_http_response(self): - """Test on_end method filters out stream http response spans""" - mock_span = MagicMock() - mock_span.attributes.get.side_effect = lambda key: { - "asgi.event.type": "http.response.body", - "stream": "true", - }.get(key) - mock_span.name = "http send request" - - with patch.object(self.processor._processor, "on_end") as mock_parent_on_end: - self.processor.on_end(mock_span) - - # Verify parent on_end is NOT called (span is filtered out) - mock_parent_on_end.assert_not_called() - - def test_on_end_keep_non_stream_spans(self): - """Test on_end method keeps non-stream spans""" - mock_span = MagicMock() - mock_span.attributes.get.side_effect = lambda key: {"asgi.event.type": "http.request", "stream": None}.get(key) - mock_span.name = "http receive request" - - with 
-            self.processor.on_end(mock_span)
-
-            # Verify parent on_end is called
-            mock_parent_on_end.assert_called_once_with(mock_span)
-
-    def test_on_end_keep_spans_without_http_send(self):
-        """Test on_end method keeps spans without 'http send' in name"""
-        mock_span = MagicMock()
-        mock_span.attributes.get.side_effect = lambda key: {
-            "asgi.event.type": "http.response.body",
-            "stream": "true",
-        }.get(key)
-        mock_span.name = "other operation"
-
-        with patch.object(self.processor._processor, "on_end") as mock_parent_on_end:
-            self.processor.on_end(mock_span)
-
-            # Verify parent on_end is called
-            mock_parent_on_end.assert_called_once_with(mock_span)
-
-    def test_shutdown(self):
-        """Test shutdown method"""
-        with patch.object(self.processor._processor, "shutdown") as mock_shutdown:
-            self.processor.shutdown()
-            mock_shutdown.assert_called_once()
-
-    def test_force_flush(self):
-        """Test force_flush method"""
-        with patch.object(self.processor._processor, "force_flush") as mock_force_flush:
-            self.processor.force_flush(timeout_millis=5000)
-            mock_force_flush.assert_called_once_with(5000)
-
-
-class TestLableSpan(unittest.TestCase):
-    """Test cases for lable_span function"""
-
-    def test_lable_span_with_stream_request(self):
-        """Test lable_span function with streaming request"""
-        mock_request = MagicMock()
-        mock_request.stream = True
-
-        mock_span = MagicMock()
-        mock_span.is_recording.return_value = True
-
-        with patch("fastdeploy.metrics.trace_util.trace.get_current_span", return_value=mock_span):
-            lable_span(mock_request)
-
-            # Verify stream attribute is set
-            mock_span.set_attribute.assert_called_once_with("stream", "true")
-
-    def test_lable_span_without_stream_request(self):
-        """Test lable_span function with non-streaming request"""
-        mock_request = MagicMock()
-        mock_request.stream = False
-
-        mock_span = MagicMock()
-        mock_span.is_recording.return_value = True
-
-        with patch("fastdeploy.metrics.trace_util.trace.get_current_span", return_value=mock_span):
-            lable_span(mock_request)
-
-            # Verify no attributes are set
-            mock_span.set_attribute.assert_not_called()
-
-    def test_lable_span_without_current_span(self):
-        """Test lable_span function when no current span exists"""
-        mock_request = MagicMock()
-        mock_request.stream = True
-
-        with patch("fastdeploy.metrics.trace_util.trace.get_current_span", return_value=None):
-            # Should not raise any exception
-            lable_span(mock_request)
-
-    def test_lable_span_with_non_recording_span(self):
-        """Test lable_span function with non-recording span"""
-        mock_request = MagicMock()
-        mock_request.stream = True
-
-        mock_span = MagicMock()
-        mock_span.is_recording.return_value = False
-
-        with patch("fastdeploy.metrics.trace_util.trace.get_current_span", return_value=mock_span):
-            lable_span(mock_request)
-
-            # Verify no attributes are set
-            mock_span.set_attribute.assert_not_called()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/output/test_process_batch_output.py b/tests/output/test_process_batch_output.py
index 9a1d06db78b..a5ef051241a 100644
--- a/tests/output/test_process_batch_output.py
+++ b/tests/output/test_process_batch_output.py
@@ -66,6 +66,7 @@ def __init__(self):
         self.llm_engine_recv_req_timestamp = time.time()
         self.ic_req_data = {}
         self.prompt_token_ids_len = 0
+        self.trace_carrier = {}
 
         now = time.time()
         self.metrics = RequestMetrics(