Skip to content

Commit

Permalink
peaks-pipeline
Browse files Browse the repository at this point in the history
Signed-off-by: Parul Singh <parsingh@redhat.com>
  • Loading branch information
husky-parul committed Sep 5, 2024
1 parent 1de3894 commit e78bf16
Show file tree
Hide file tree
Showing 16 changed files with 275 additions and 0 deletions.
Empty file.
Empty file.
25 changes: 25 additions & 0 deletions benchmarks/peaks-pipeline/prometheus-query/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Use an official Python runtime as a parent image
FROM python:3.9-alpine

# Set the working directory in the container
WORKDIR /app

# Copy the Python script and any necessary files into the container
COPY prometheus_query.py /app/

# Install necessary Python libraries (--no-cache-dir keeps the pip cache
# out of the image layer)
RUN pip install --no-cache-dir requests pandas

# Set environment variables for configurable properties (these will be
# overridden by the Kubernetes ConfigMap at deploy time)
ENV PROMETHEUS_URL="http://localhost:9090/api/v1/query_range"
ENV METRICS_LIST='kepler_node_dram_joules_total,kepler_node_other_joules_total,kepler_node_package_joules_total,kepler_node_platform_joules_total,node_cpu_seconds_total'
# Default must live under the declared VOLUME (/mnt/pvc) so output lands on
# the mounted PVC; the previous /tmp/pvc/data default was inconsistent with
# the VOLUME below, the ConfigMap, and the script's own fallback.
ENV CSV_DIRECTORY="/mnt/pvc/data"
ENV QUERY_INTERVAL="60"
ENV START_TIME="2024-09-05T10:50:00Z"
ENV END_TIME="2024-09-05T11:56:00Z"

# Expose the working directory (in case files need to be written to the mounted PVC)
VOLUME ["/mnt/pvc"]

# Run the Python script to query Prometheus
CMD ["python", "prometheus_query.py"]
10 changes: 10 additions & 0 deletions benchmarks/peaks-pipeline/prometheus-query/prometheus-pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# PersistentVolumeClaim backing the prometheus-query client: the CSV files
# produced by prometheus_query.py are written here so they survive pod
# restarts.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-pvc
spec:
  accessModes:
    # Single-node read-write; sufficient for the 1-replica Deployment that
    # mounts this claim.
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Runtime configuration for the prometheus-query client; each key is injected
# into the container as an environment variable via configMapKeyRef in the
# Deployment, overriding the image's baked-in ENV defaults.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-query-config
data:
  # Prometheus server URL
  PROMETHEUS_URL: "http://prometheus-server:9090/api/v1/query_range"

  # Start and end times for querying Prometheus
  START_TIME: "2024-09-04T00:00:00Z"
  END_TIME: "2024-09-04T12:00:00Z"

  # Directory to store the CSV files (mounted from PVC)
  CSV_DIRECTORY: "/mnt/pvc/data"

  # List of metrics to query, as a comma-separated string
  METRICS_LIST: 'kepler_node_dram_joules_total,kepler_node_other_joules_total,kepler_node_package_joules_total,kepler_node_platform_joules_total,node_cpu_seconds_total'

  # Optional: Query interval (time between queries in seconds)
  QUERY_INTERVAL: "60"
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Single-replica Deployment running the prometheus-query client. All runtime
# settings come from the prometheus-query-config ConfigMap; query results are
# written to the prometheus-pvc volume mounted at /mnt/pvc.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-query-deployment
  labels:
    app: prometheus-query
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-query
  template:
    metadata:
      labels:
        app: prometheus-query
    spec:
      containers:
        - name: prometheus-query-client
          image: quay.io/husky_parul/prometheus-query-client:latest
          imagePullPolicy: Always
          volumeMounts:
            # PVC mount point; CSV_DIRECTORY (/mnt/pvc/data) lives under it
            - mountPath: "/mnt/pvc"
              name: prometheus-pvc-volume
          # Inject the environment variables from the ConfigMap
          env:
            - name: PROMETHEUS_URL
              valueFrom:
                configMapKeyRef:
                  name: prometheus-query-config
                  key: PROMETHEUS_URL
            - name: CSV_DIRECTORY
              valueFrom:
                configMapKeyRef:
                  name: prometheus-query-config
                  key: CSV_DIRECTORY
            - name: START_TIME
              valueFrom:
                configMapKeyRef:
                  name: prometheus-query-config
                  key: START_TIME
            - name: END_TIME
              valueFrom:
                configMapKeyRef:
                  name: prometheus-query-config
                  key: END_TIME
            - name: QUERY_INTERVAL
              valueFrom:
                configMapKeyRef:
                  name: prometheus-query-config
                  key: QUERY_INTERVAL
            - name: METRICS_LIST
              valueFrom:
                configMapKeyRef:
                  name: prometheus-query-config
                  key: METRICS_LIST
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
      volumes:
        - name: prometheus-pvc-volume
          persistentVolumeClaim:
            claimName: prometheus-pvc
78 changes: 78 additions & 0 deletions benchmarks/peaks-pipeline/prometheus-query/prometheus_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import requests
import pandas as pd
import time

# Get configuration from environment variables (normally injected by the
# Kubernetes ConfigMap prometheus-query-config).
PROMETHEUS_URL = os.getenv('PROMETHEUS_URL')
CSV_DIRECTORY = os.getenv('CSV_DIRECTORY', '/mnt/pvc/data')
START_TIME = os.getenv('START_TIME')
END_TIME = os.getenv('END_TIME')
QUERY_INTERVAL = int(os.getenv('QUERY_INTERVAL', 60))  # Default to 60 seconds

# Read the metrics list from environment variable (comma-separated).
# Default to '' so a missing variable yields an empty list instead of an
# AttributeError from None.split(','); strip whitespace and drop empty
# entries so trailing commas don't produce blank queries.
METRICS_LIST = [m.strip() for m in os.getenv('METRICS_LIST', '').split(',') if m.strip()]

# Function to query Prometheus with a time range
def query_prometheus(query, start_time, end_time, step='60s', timeout=30):
    """Run a range query against the Prometheus HTTP API.

    Args:
        query: PromQL expression (here, a bare metric name).
        start_time: RFC3339 timestamp for the start of the range.
        end_time: RFC3339 timestamp for the end of the range.
        step: Query resolution step width (default '60s').
        timeout: Per-request timeout in seconds; without it a hung
            Prometheus endpoint would block this client forever.

    Returns:
        The 'result' list from the API response, or [] on any error.
        Errors are printed rather than raised so one failing metric does
        not stop the remaining queries in main().
    """
    try:
        params = {
            'query': query,
            'start': start_time,
            'end': end_time,
            'step': step,
        }
        response = requests.get(PROMETHEUS_URL, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()['data']['result']
    except Exception as e:
        print(f"Error querying Prometheus: {e}")
        return []

# Function to save the results to a CSV file
def save_to_csv(data, filename):
    """Write a list of flat record dicts to CSV_DIRECTORY/filename.

    Creates CSV_DIRECTORY first if it does not exist — a freshly mounted
    PVC starts empty, so the data subdirectory may be missing and
    DataFrame.to_csv would otherwise fail.

    Args:
        data: List of dicts (one per sample) as built by main().
        filename: Bare file name; joined onto CSV_DIRECTORY.
    """
    os.makedirs(CSV_DIRECTORY, exist_ok=True)
    df = pd.DataFrame(data)
    csv_file_path = os.path.join(CSV_DIRECTORY, filename)
    df.to_csv(csv_file_path, index=False)
    print(f"Data saved to {csv_file_path}")

# Main function to query Prometheus for multiple metrics
def main():
    """Query each configured metric over [START_TIME, END_TIME] and write
    the samples to one timestamped CSV file per metric, sleeping
    QUERY_INTERVAL seconds between metrics."""
    for metric in METRICS_LIST:
        # Sanitize the metric expression into a filesystem-safe file stem.
        metric_name = metric.replace('(', '').replace(')', '').replace('[', '').replace(']', '').replace(',', '_')
        print(f"Querying metric: {metric}")

        # Query Prometheus for the current metric
        result = query_prometheus(metric, START_TIME, END_TIME)
        # NOTE: the raw API payload is intentionally no longer printed here —
        # a leftover debug print dumped the entire result set to the pod log.

        # Check if the result contains data
        if result:
            # Flatten each series: one row per sample, carrying the series'
            # labels plus the sample timestamp and value.
            data = []
            for item in result:
                for sample_ts, sample_value in item['values']:
                    flat_metric = dict(item['metric'])
                    flat_metric['timestamp'] = sample_ts
                    flat_metric['value'] = sample_value
                    data.append(flat_metric)

            # Save the result to a CSV file named after the metric, suffixed
            # with the wall-clock time so repeated runs do not overwrite.
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            filename = f"{metric_name}_data_{timestamp}.csv"
            save_to_csv(data, filename)
        else:
            print(f"No data found for the metric: {metric}")

        # Wait for the query interval before the next query
        time.sleep(QUERY_INTERVAL)

if __name__ == "__main__":
    main()
5 changes: 5 additions & 0 deletions benchmarks/peaks-pipeline/stress-ng/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
FROM alpine:latest

# --no-cache fetches a fresh index and avoids baking a stale apk cache into
# the image layer (replaces the separate `apk update` step).
RUN apk add --no-cache stress-ng bash

# Install the incremental CPU-load driver script
COPY stress-script.sh /usr/local/bin/stress-script.sh
RUN chmod +x /usr/local/bin/stress-script.sh
ENTRYPOINT ["/usr/local/bin/stress-script.sh"]
9 changes: 9 additions & 0 deletions benchmarks/peaks-pipeline/stress-ng/stress-ng-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Tuning knobs for the stress-ng DaemonSet; injected as environment variables
# and consumed by stress-script.sh.
apiVersion: v1
kind: ConfigMap
metadata:
  name: stress-ng-config
data:
  # Configurable parameters for Stress-ng
  # Target CPU load ceiling, in percent of the node's cores
  MAX_CPU_LOAD: "100"
  # Load increment per iteration, in percent
  STEP: "10"
  # Seconds to hold each load level
  DURATION: "60"
45 changes: 45 additions & 0 deletions benchmarks/peaks-pipeline/stress-ng/stress-ng-daemonset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# DaemonSet that runs the stress-ng load generator on every Linux node;
# ramp parameters come from the stress-ng-config ConfigMap.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: stress-ng-daemonset
  labels:
    app: stress-ng
spec:
  selector:
    matchLabels:
      app: stress-ng
  template:
    metadata:
      labels:
        app: stress-ng
    spec:
      containers:
        - name: stress-ng
          image: quay.io/husky_parul/stress-ng-container:latest
          # No extra args: the image ENTRYPOINT script drives stress-ng
          args: []
          env:
            - name: MAX_CPU_LOAD
              valueFrom:
                configMapKeyRef:
                  name: stress-ng-config
                  key: MAX_CPU_LOAD
            - name: STEP
              valueFrom:
                configMapKeyRef:
                  name: stress-ng-config
                  key: STEP
            - name: DURATION
              valueFrom:
                configMapKeyRef:
                  name: stress-ng-config
                  key: DURATION
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
      nodeSelector:
        kubernetes.io/os: linux
      restartPolicy: Always
17 changes: 17 additions & 0 deletions benchmarks/peaks-pipeline/stress-ng/stress-script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# Incrementally ramp CPU load with stress-ng: step from STEP% up to
# MAX_CPU_LOAD% of the node's cores, holding each level for DURATION seconds.

# Read configuration values from environment variables (provided via ConfigMap)
MAX_CPU_LOAD=${MAX_CPU_LOAD:-100} # Default to 100% if not provided
STEP=${STEP:-10}                  # Default to 10% increment if not provided
DURATION=${DURATION:-60}          # Default to 60 seconds if not provided
CPU_COUNT=$(nproc)                # Number of CPU cores available

# Run the stress-ng workload in incremental steps
for i in $(seq "$STEP" "$STEP" "$MAX_CPU_LOAD"); do
    WORKERS=$((i * CPU_COUNT / 100))
    # Integer division can yield 0 at low percentages (e.g. 10% of 4 cores),
    # and stress-ng interprets `--cpu 0` as "one worker per CPU" — the
    # opposite of a low-load step. Clamp to at least one worker.
    if [ "$WORKERS" -lt 1 ]; then
        WORKERS=1
    fi
    echo "Stressing CPU with $WORKERS workers for $DURATION seconds"
    stress-ng --cpu "$WORKERS" --timeout "${DURATION}s"
    sleep 10
done

echo "CPU stress test completed."
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.

0 comments on commit e78bf16

Please sign in to comment.