diff --git a/.github/instructions/README.md b/.github/instructions/README.md new file mode 100644 index 00000000..02b5de26 --- /dev/null +++ b/.github/instructions/README.md @@ -0,0 +1,127 @@ +# GitHub Copilot Instructions + +This directory contains instruction files that help GitHub Copilot provide better, more contextual assistance when working with this repository. These files follow the [GitHub Copilot coding agent best practices](https://gh.io/copilot-coding-agent-tips). + +## Overview + +Each instruction file provides guidelines, conventions, and best practices for specific technologies or file types used in this repository. GitHub Copilot uses these instructions to understand the project's coding standards and provide more accurate suggestions. + +## Instruction Files + +### Core Technologies + +- **[helm-kubernetes.instructions.md](helm-kubernetes.instructions.md)** - Comprehensive guide for Helm chart development and Kubernetes manifest creation + - Applies to: `**/*.yaml`, `**/*.yml`, `**/Chart.yaml`, `**/values.yaml`, `**/templates/**` + - Covers: Helm best practices, chart structure, template development, RBAC, Argo-specific patterns + +- **[python.instructions.md](python.instructions.md)** - General Python development guidelines + - Applies to: `**/*.py`, `**/requirements*.txt`, `**/setup.py`, `**/pyproject.toml` + - Covers: PEP 8 compliance, type hints, testing, Flask patterns, error handling + +- **[bash.instructions.md](bash.instructions.md)** - Bash scripting and Makefile best practices + - Applies to: `**/*.sh`, `**/Makefile` + - Covers: Script structure, error handling, Kubernetes patterns, security, testing + +- **[docker.instructions.md](docker.instructions.md)** - Docker and containerization guidelines + - Applies to: `**/Dockerfile`, `**/Dockerfile.*`, `**/.dockerignore` + - Covers: Multi-stage builds, security, optimization, health checks, Alpine patterns + +### Specialized Technologies + +- **[go.instructions.md](go.instructions.md)** - Go 
development following idiomatic practices + - Applies to: `**/*.go`, `**/go.mod`, `**/go.sum` + - Covers: Idiomatic Go, naming conventions, error handling, concurrency + +- **[python-mcp-server.instructions.md](python-mcp-server.instructions.md)** - Model Context Protocol (MCP) server development + - Applies to: `**/*.py`, `**/pyproject.toml`, `**/requirements.txt` + - Covers: FastMCP patterns, tool development, resource management, HTTP/stdio transports + +## How It Works + +GitHub Copilot automatically reads and applies these instructions based on the file patterns specified in each instruction file's frontmatter. When you're working on a file that matches one or more patterns, Copilot considers the relevant guidelines when providing suggestions. + +### Frontmatter Format + +Each instruction file starts with YAML frontmatter: + +```yaml +--- +description: 'Brief description of what this file covers' +applyTo: 'file pattern(s) that trigger these instructions' +--- +``` + +### File Pattern Examples + +- `**/*.py` - All Python files +- `**/Dockerfile` - All Dockerfiles +- `helm/*/templates/**` - All Helm templates +- `**/*.{yaml,yml}` - All YAML files + +## Contributing + +When adding new technologies or updating existing guidelines: + +1. Create or update the appropriate instruction file +2. Include proper frontmatter with description and file patterns +3. Follow the established structure and format +4. Include practical examples and common patterns +5. Document common pitfalls and security considerations +6. 
Update this README with any new instruction files + +## Repository-Specific Patterns + +This repository focuses on: +- **Argo Workflows** - Kubernetes-native workflow engine +- **Argo CD** - GitOps continuous delivery +- **Authorization Adapter** - Flask-based RBAC service +- **Helm Charts** - Kubernetes package management +- **Multi-tenancy** - Namespace isolation and RBAC + +The instruction files are tailored to these specific use cases while following industry best practices. + +## Best Practices + +### When Writing Instructions + +- **Be specific** - Provide concrete examples and patterns +- **Be practical** - Focus on what developers actually need +- **Be current** - Keep up with best practices and tool updates +- **Be consistent** - Follow the established format and style +- **Be comprehensive** - Cover common scenarios and edge cases + +### Testing Instructions + +After adding or updating instruction files, verify they work correctly by: + +1. Opening files that match the patterns +2. Checking that Copilot provides contextually appropriate suggestions +3. Ensuring suggestions follow the documented guidelines +4. Testing with different file types and scenarios + +## Resources + +- [GitHub Copilot Documentation](https://docs.github.com/en/copilot) +- [Best practices for Copilot coding agent](https://gh.io/copilot-coding-agent-tips) +- [Repository CONTRIBUTING.md](../../CONTRIBUTING.md) +- [Repository README.md](../../README.md) + +## Maintenance + +These instruction files should be reviewed and updated: +- When introducing new technologies or patterns +- When updating dependencies or frameworks +- When best practices evolve +- When team conventions change +- At least quarterly for general maintenance + +## Questions? 
+ +If you have questions about these instructions or suggestions for improvements, please: +- Open an issue in the repository +- Submit a pull request with proposed changes +- Reach out to the maintainers + +--- + +**Note**: These instructions are designed to assist GitHub Copilot in providing better suggestions. They represent our team's coding standards and should be followed by all contributors, whether using Copilot or not. diff --git a/.github/instructions/bash.instructions.md b/.github/instructions/bash.instructions.md new file mode 100644 index 00000000..beacf4e0 --- /dev/null +++ b/.github/instructions/bash.instructions.md @@ -0,0 +1,464 @@ +--- +description: 'Instructions for writing Bash scripts following best practices and conventions' +applyTo: '**/*.sh, **/Makefile' +--- + +# Bash Scripting Instructions + +## General Principles + +- Write portable, readable, and maintainable shell scripts +- Follow POSIX standards where possible, use Bash-specific features when beneficial +- Include error handling and validation +- Make scripts idempotent when possible +- Document script usage and requirements + +## Script Structure + +### Shebang and Options + +- Always start scripts with `#!/bin/bash` (or `#!/usr/bin/env bash` for portability) +- Use `set -e` to exit on errors (or `set -euo pipefail` for stricter error handling) +- Consider `set -u` to treat unset variables as errors +- Use `set -x` for debugging when needed (or enable via DEBUG environment variable) + +Example: +```bash +#!/bin/bash +set -euo pipefail + +# Optional debugging +[[ "${DEBUG:-}" == "true" ]] && set -x +``` + +### Script Organization + +- Start with a header comment describing the script's purpose +- Define all functions before the main script logic +- Include a usage/help function +- Place main execution logic at the bottom +- Use clear section separators + +Example: +```bash +#!/bin/bash +# Description: Deploy Argo stack to Kubernetes +# Usage: ./deploy.sh [options] + +set -euo 
pipefail + +################# +# Configuration # +################# + +DEFAULT_NAMESPACE="argocd" +TIMEOUT="10m" + +############# +# Functions # +############# + +usage() { + cat <<EOF +Usage: $0 [options] +EOF +} + +check_prerequisites() { + command -v kubectl >/dev/null 2>&1 || { echo "kubectl is required"; exit 1; } +} + +main() { + check_prerequisites + # Main logic here +} + +######## +# Main # +######## + +main "$@" +``` + +## Error Handling + +### Exit Codes + +- Use meaningful exit codes (0 for success, non-zero for errors) +- Document exit codes in help text for complex scripts +- Use consistent exit codes across scripts + +### Validation + +- Validate required environment variables early: +```bash +: "${REQUIRED_VAR:?Error: REQUIRED_VAR must be set}" +``` + +- Check for required commands: +```bash +command -v kubectl >/dev/null 2>&1 || { + echo "Error: kubectl is required but not installed" + exit 1 +} +``` + +- Validate file existence: +```bash +[[ -f "${CONFIG_FILE}" ]] || { + echo "Error: Config file not found: ${CONFIG_FILE}" + exit 1 +} +``` + +### Cleanup and Traps + +- Use `trap` for cleanup operations: +```bash +cleanup() { + rm -f "${TEMP_FILE}" +} +trap cleanup EXIT INT TERM +``` + +## Variables and Quoting + +### Variable Naming + +- Use UPPER_CASE for environment variables and constants +- Use lower_case for local variables +- Use descriptive names (avoid single letters except for loop counters) + +### Quoting + +- Always quote variables unless you explicitly want word splitting: `"${var}"` +- Quote command substitutions: `"$(command)"` +- Use arrays for lists instead of space-separated strings +- Don't quote variables in `[[ ]]` conditions (they're safe there) + +### Arrays + +- Use arrays for lists of items: +```bash +namespaces=("argo" "argocd" "security") +for ns in "${namespaces[@]}"; do + echo "${ns}" +done +``` + +## Conditionals and Loops + +### If Statements + +- Use `[[ ]]` instead of `[ ]` for better error handling and features +- Prefer explicit comparisons: +```bash +if [[ "${STATUS}" == "ready" ]]; then + echo
"Ready" +fi + +if [[ -n "${VAR}" ]]; then # Check if variable is not empty + echo "VAR is set" +fi + +if [[ -z "${VAR}" ]]; then # Check if variable is empty + echo "VAR is not set" +fi +``` + +### Loops + +- Use `for` loops for iterating over arrays +- Use `while read` for processing lines: +```bash +while IFS= read -r line; do + echo "${line}" +done < file.txt +``` + +- Break long loops into functions for readability + +## Functions + +### Function Definition + +- Define functions before use +- Use clear, descriptive function names +- Add comments describing parameters and return values +- Use `local` for function-scoped variables + +```bash +# Deploy a Helm chart +# Arguments: +# $1 - chart name +# $2 - namespace +# $3 - values file (optional) +# Returns: +# 0 on success, 1 on failure +deploy_chart() { + local chart_name="${1}" + local namespace="${2}" + local values_file="${3:-}" + + local helm_args=( + upgrade --install + "${chart_name}" + "./charts/${chart_name}" + --namespace "${namespace}" + --create-namespace + ) + + if [[ -n "${values_file}" ]]; then + helm_args+=(--values "${values_file}") + fi + + helm "${helm_args[@]}" +} +``` + +## Command Execution + +### Command Substitution + +- Use `$(command)` instead of backticks +- Check command success: +```bash +if output=$(kubectl get pods 2>&1); then + echo "Success: ${output}" +else + echo "Failed to get pods" + exit 1 +fi +``` + +### Pipelines + +- Use `set -o pipefail` to catch errors in pipelines +- Consider breaking complex pipelines into steps + +### Background Jobs + +- Track background processes: +```bash +kubectl port-forward svc/myservice 8080:80 & +PF_PID=$! + +# Later, clean up +kill "${PF_PID}" 2>/dev/null || true +``` + +## Output and Logging + +### User Feedback + +- Use descriptive output messages with emoji when appropriate: +```bash +echo "✅ Deployment successful" +echo "❌ Error: Deployment failed" +echo "🔍 Checking prerequisites..." 
+echo "⚠️ Warning: Resource limits not set" +``` + +### Debugging + +- Use meaningful debug output: +```bash +if [[ "${DEBUG:-false}" == "true" ]]; then + echo "DEBUG: Variable value: ${VAR}" +fi +``` + +### Error Messages + +- Write errors to stderr: +```bash +echo "Error: Something went wrong" >&2 +exit 1 +``` + +## Kubernetes-Specific Patterns + +### Waiting for Resources + +- Use `kubectl wait` instead of sleep loops: +```bash +kubectl wait --for=condition=Ready pod \ + -l app=myapp \ + --timeout=120s \ + -n "${namespace}" +``` + +### Namespace Operations + +- Always specify namespace explicitly: +```bash +kubectl get pods -n "${namespace}" +``` + +- Check if namespace exists: +```bash +if kubectl get namespace "${namespace}" >/dev/null 2>&1; then + echo "Namespace exists" +fi +``` + +### Safe Deletions + +- Use `|| true` for delete operations that might not find resources: +```bash +kubectl delete namespace "${namespace}" --ignore-not-found=true +# or +kubectl delete pod mypod 2>/dev/null || true +``` + +## Makefile Conventions + +### Targets + +- Use `.PHONY` for non-file targets +- Provide a `help` target as default +- Use descriptive target names +- Add comments explaining what each target does + +### Variables + +- Define configurable variables with defaults +- Use `?=` for variables that can be overridden +- Document required environment variables + +Example: +```makefile +.PHONY: help deploy clean + +NAMESPACE ?= default +TIMEOUT ?= 10m + +help: + @echo "Available targets:" + @echo " deploy - Deploy the application" + @echo " clean - Clean up resources" + +deploy: + @echo "🚀 Deploying to namespace: $(NAMESPACE)" + helm upgrade --install myapp ./charts/myapp \ + --namespace $(NAMESPACE) \ + --timeout $(TIMEOUT) + +clean: + @echo "🧹 Cleaning up..." 
+ helm uninstall myapp -n $(NAMESPACE) || true +``` + +## Security Considerations + +### Secrets and Sensitive Data + +- Never hardcode secrets in scripts +- Use environment variables or secret management tools +- Don't echo sensitive variables (they'll appear in logs) +- Be careful with `set -x` when handling secrets + +### Input Validation + +- Validate all external inputs +- Sanitize user-provided values +- Be cautious with `eval` (avoid if possible) + +## Testing + +### Dry Runs + +- Support dry-run mode where applicable: +```bash +DRY_RUN="${DRY_RUN:-false}" + +run_command() { + if [[ "${DRY_RUN}" == "true" ]]; then + echo "Would run: $*" + else + "$@" + fi +} +``` + +### ShellCheck + +- Run `shellcheck` on all shell scripts before committing +- Address or suppress warnings with justification +- Add shellcheck directives when needed: +```bash +# shellcheck disable=SC2034 # VAR appears unused +VAR="value" +``` + +## Common Patterns + +### Checking Command Availability + +```bash +has_command() { + command -v "$1" >/dev/null 2>&1 +} + +if ! has_command kubectl; then + echo "kubectl not found" + exit 1 +fi +``` + +### Retry Logic + +```bash +retry() { + local max_attempts=$1 + shift + local cmd=("$@") + local attempt=1 + + while (( attempt <= max_attempts )); do + if "${cmd[@]}"; then + return 0 + fi + echo "Attempt ${attempt}/${max_attempts} failed, retrying..." 
+ ((attempt++)) + sleep 2 + done + + return 1 +} + +retry 3 kubectl get pods +``` + +### Temporary Files + +```bash +# Create temp file safely +TEMP_FILE=$(mktemp) +trap 'rm -f "${TEMP_FILE}"' EXIT + +# Use it +echo "data" > "${TEMP_FILE}" +``` + +## Common Pitfalls to Avoid + +- Don't use `cd` without error checking or in subshells +- Don't parse `ls` output (use globs or `find` instead) +- Don't use `cat file | grep` (use `grep pattern file`) +- Don't ignore command failures with `;` (use `&&` for chaining) +- Don't use `echo` for complex output (use `printf` or heredocs) +- Don't assume scripts run from a specific directory (use absolute paths or `cd "$(dirname "$0")"`) +- Avoid `which` (use `command -v` instead) + +## Documentation + +- Include usage information in scripts (help function) +- Document required environment variables +- Add examples in comments +- Keep comments up to date with code diff --git a/.github/instructions/docker.instructions.md b/.github/instructions/docker.instructions.md new file mode 100644 index 00000000..b5d630f1 --- /dev/null +++ b/.github/instructions/docker.instructions.md @@ -0,0 +1,533 @@ +--- +description: 'Instructions for writing Dockerfiles and working with containers' +applyTo: '**/Dockerfile, **/Dockerfile.*, **/.dockerignore' +--- + +# Docker and Containerization Instructions + +## General Principles + +- Write secure, efficient, and maintainable Dockerfiles +- Optimize for small image sizes and fast build times +- Follow Docker best practices and security guidelines +- Use multi-stage builds when appropriate +- Keep images minimal and focused + +## Dockerfile Best Practices + +### Base Image Selection + +- Use official base images from Docker Hub +- Prefer specific version tags over `latest` +- Choose appropriate base images for your use case: + - `alpine` for minimal size (use musl libc compatible packages) + - `slim` variants for balance between size and compatibility + - Full images when you need all system utilities 
+- Use multi-stage builds to keep final images small + +```dockerfile +# Good: Specific version +FROM python:3.11-slim + +# Bad: Using latest +FROM python:latest + +# Good: Alpine for minimal size +FROM python:3.11-alpine + +# Good: Multi-stage build +FROM python:3.11 AS builder +# Build steps + +FROM python:3.11-slim +# Copy artifacts from builder +``` + +### Image Structure + +- Order instructions from least to most frequently changing +- Combine related RUN commands to reduce layers +- Use `.dockerignore` to exclude unnecessary files +- Clean up in the same layer where you create files + +```dockerfile +FROM python:3.11-slim + +# Set working directory early +WORKDIR /app + +# Install system dependencies (changes rarely) +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + ca-certificates && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Copy requirements first (changes less often than code) +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code (changes most frequently) +COPY . . 
+ +# Set runtime configuration +ENV FLASK_APP=app.py +ENV PYTHONUNBUFFERED=1 + +# Expose port +EXPOSE 8080 + +# Run as non-root user +USER nobody + +# Define entrypoint +CMD ["python", "app.py"] +``` + +### Layer Optimization + +- Minimize the number of layers (combine RUN commands) +- Put frequently changing instructions at the end +- Use build cache effectively by ordering instructions properly +- Clean up temporary files in the same RUN instruction + +```dockerfile +# Good: Combined into one layer with cleanup +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + gcc \ + build-essential && \ + pip install --no-cache-dir -r requirements.txt && \ + apt-get remove -y gcc build-essential && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Bad: Multiple layers, no cleanup +RUN apt-get update +RUN apt-get install -y gcc +RUN pip install -r requirements.txt +``` + +### Multi-stage Builds + +- Use multi-stage builds to separate build and runtime environments +- Copy only necessary artifacts to final image +- Keep final image minimal + +```dockerfile +# Build stage +FROM python:3.11 AS builder + +WORKDIR /app + +# Install build dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + gcc \ + build-essential + +# Install Python packages +COPY requirements.txt . +RUN pip install --user --no-cache-dir -r requirements.txt + +# Runtime stage +FROM python:3.11-slim + +WORKDIR /app + +# Copy Python packages from builder +COPY --from=builder /root/.local /root/.local + +# Copy application +COPY . . 
+ +# Make sure scripts are in PATH +ENV PATH=/root/.local/bin:$PATH +ENV PYTHONUNBUFFERED=1 + +USER nobody + +CMD ["python", "app.py"] +``` + +## Security Best Practices + +### User Management + +- Don't run containers as root +- Create dedicated non-root user if needed +- Use numeric user IDs for better Kubernetes compatibility + +```dockerfile +# Option 1: Use nobody user (already exists in base images) +USER nobody + +# Option 2: Create a dedicated user +RUN groupadd -r appuser && \ + useradd -r -g appuser -u 1000 appuser && \ + chown -R appuser:appuser /app + +USER appuser + +# Option 3: Use numeric UID (better for Kubernetes) +USER 1000:1000 +``` + +### Minimize Attack Surface + +- Install only necessary packages +- Remove package manager caches +- Use specific package versions +- Scan images for vulnerabilities regularly + +```dockerfile +# Install only what's needed, clean up after +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + curl && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Pin Python package versions +COPY requirements.txt . 
+RUN pip install --no-cache-dir \ + flask==3.0.0 \ + requests==2.31.0 +``` + +### Secrets and Sensitive Data + +- Never include secrets in Docker images +- Use build arguments for build-time configuration (not secrets) +- Use Docker secrets or environment variables for runtime secrets +- Don't commit .env files with secrets + +```dockerfile +# Good: Use ARG for build-time values (not secrets) +ARG APP_VERSION=1.0.0 +ENV APP_VERSION=${APP_VERSION} + +# Good: Expect secrets via environment at runtime +ENV API_KEY="" + +# Bad: Hardcoded secret +ENV API_KEY="secret-key-12345" +``` + +## Python-Specific Patterns + +### Python Dockerfiles + +- Use `PYTHONUNBUFFERED=1` for real-time logging +- Install packages with `--no-cache-dir` to save space +- Use `pip install --user` in multi-stage builds +- Consider using `uv` or `pip-tools` for faster installs + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# Prevent Python from writing pyc files and buffering stdout/stderr +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . + +# Create non-root user +RUN useradd -m -u 1000 appuser && \ + chown -R appuser:appuser /app + +USER appuser + +EXPOSE 8080 + +CMD ["python", "-m", "flask", "run", "--host=0.0.0.0", "--port=8080"] +``` + +### Flask/Web Application Pattern + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +ENV PYTHONUNBUFFERED=1 \ + FLASK_APP=app.py \ + FLASK_ENV=production + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app.py . 
+COPY templates/ templates/ +COPY static/ static/ + +# Health check +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8080/healthz || exit 1 + +# Run as non-root +USER nobody + +EXPOSE 8080 + +CMD ["python", "-m", "flask", "run", "--host=0.0.0.0", "--port=8080"] +``` + +## .dockerignore + +- Always include a `.dockerignore` file +- Exclude unnecessary files to speed up builds and reduce context size +- Follow patterns similar to `.gitignore` + +``` +# .dockerignore +.git +.gitignore +.github +README.md +LICENSE +.venv +venv/ +__pycache__/ +*.pyc +*.pyo +*.pyd +.pytest_cache/ +.coverage +htmlcov/ +.mypy_cache/ +.tox/ +dist/ +build/ +*.egg-info/ +.DS_Store +.env +.env.local +*.log +tests/ +docs/ +examples/ +``` + +## Health Checks + +- Include HEALTHCHECK instructions for containerized services +- Implement a health endpoint in your application +- Set appropriate intervals and timeouts + +```dockerfile +# Simple health check using curl +HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \ + CMD curl -f http://localhost:8080/healthz || exit 1 + +# Health check without additional tools +HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \ + CMD python -c "import requests; requests.get('http://localhost:8080/healthz').raise_for_status()" || exit 1 +``` + +## Labels and Metadata + +- Add labels for better organization and documentation +- Follow OCI annotation conventions +- Include version, build info, and maintainer + +```dockerfile +LABEL org.opencontainers.image.title="Argo AuthZ Adapter" \ + org.opencontainers.image.description="Authorization adapter for Argo Workflows" \ + org.opencontainers.image.version="1.0.0" \ + org.opencontainers.image.authors="Your Team <team@example.com>" \ + org.opencontainers.image.source="https://github.com/calypr/argo-helm" \ + org.opencontainers.image.licenses="Apache-2.0" +``` + +## Build Arguments + +- Use ARG for configurable build-time values +- Provide
sensible defaults +- Document arguments in comments + +```dockerfile +# Build arguments with defaults +ARG PYTHON_VERSION=3.11 +ARG APP_VERSION=latest + +FROM python:${PYTHON_VERSION}-slim + +# Re-declare after FROM to use in this stage +ARG APP_VERSION +ENV APP_VERSION=${APP_VERSION} + +LABEL version="${APP_VERSION}" +``` + +## Entrypoint vs CMD + +- Use ENTRYPOINT for executable containers +- Use CMD for default arguments to ENTRYPOINT or standalone commands +- Use JSON array format for proper signal handling + +```dockerfile +# Good: ENTRYPOINT + CMD for flexibility +ENTRYPOINT ["python"] +CMD ["app.py"] +# Can override CMD: docker run myimage script.py + +# Good: ENTRYPOINT as executable +ENTRYPOINT ["python", "-m", "flask"] +CMD ["run", "--host=0.0.0.0"] + +# Good: Simple CMD +CMD ["python", "app.py"] + +# Bad: Shell form (doesn't handle signals properly) +CMD python app.py +``` + +## Working with Alpine + +- Install Python packages that need compilation with build dependencies +- Use Alpine's package manager (apk) +- Clean up build dependencies after use + +```dockerfile +FROM python:3.11-alpine + +WORKDIR /app + +# Install build dependencies and runtime dependencies +RUN apk add --no-cache --virtual .build-deps \ + gcc \ + musl-dev \ + python3-dev && \ + apk add --no-cache \ + ca-certificates \ + curl + +# Install Python packages +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Remove build dependencies +RUN apk del .build-deps + +COPY . . 
+ +USER nobody + +CMD ["python", "app.py"] +``` + +## Volume Management + +- Use VOLUME for data that should persist or be shared +- Document expected volumes in comments +- Don't include VOLUME for application code + +```dockerfile +# Create directory for data +RUN mkdir -p /data && chown appuser:appuser /data + +# Declare volume for persistent data +VOLUME ["/data"] + +# Document in comment +# Expected volumes: +# /data - Application data and logs +``` + +## Common Patterns + +### Development vs Production + +Create separate Dockerfiles or use build targets: + +```dockerfile +# Dockerfile +FROM python:3.11-slim AS base + +WORKDIR /app + +COPY requirements.txt . + +FROM base AS development +RUN pip install --no-cache-dir -r requirements.txt -r requirements-dev.txt +COPY . . +CMD ["python", "-m", "flask", "run", "--host=0.0.0.0", "--debug"] + +FROM base AS production +RUN pip install --no-cache-dir -r requirements.txt +COPY . . +USER nobody +CMD ["gunicorn", "--bind", "0.0.0.0:8080", "app:app"] +``` + +### Build with make target + +```bash +# Build development image +docker build --target development -t myapp:dev . + +# Build production image +docker build --target production -t myapp:prod . +``` + +## Testing Docker Images + +- Test images locally before pushing +- Verify non-root user execution +- Check image size +- Scan for vulnerabilities + +```bash +# Build image +docker build -t myapp:test . 
+ +# Check image size +docker images myapp:test + +# Run security scan (example with trivy) +trivy image myapp:test + +# Test the container +docker run --rm -p 8080:8080 myapp:test + +# Verify non-root +docker run --rm myapp:test id +``` + +## Common Pitfalls to Avoid + +- Don't use `apt-get upgrade` in Dockerfiles (use newer base image instead) +- Don't store secrets in images +- Don't run containers as root +- Don't use `latest` tag for base images in production +- Don't ignore .dockerignore (slows builds and increases context size) +- Don't install unnecessary packages +- Don't create unnecessary layers +- Don't leave package manager caches +- Don't use shell form for ENTRYPOINT/CMD (breaks signal handling) +- Don't copy everything with `COPY . .` too early (breaks layer caching) + +## Documentation + +- Document build arguments and their defaults +- Document exposed ports and their purpose +- Document required environment variables +- Document expected volumes +- Include example run commands in README diff --git a/.github/instructions/helm-kubernetes.instructions.md b/.github/instructions/helm-kubernetes.instructions.md new file mode 100644 index 00000000..7fbfaeed --- /dev/null +++ b/.github/instructions/helm-kubernetes.instructions.md @@ -0,0 +1,209 @@ +--- +description: 'Instructions for developing and maintaining Helm charts and Kubernetes manifests' +applyTo: '**/*.yaml, **/*.yml, **/Chart.yaml, **/values.yaml, **/templates/**' +--- + +# Helm and Kubernetes Development Instructions + +## General Principles + +- Follow Helm best practices and Kubernetes manifest conventions +- Write clean, maintainable, and reusable chart templates +- Ensure backward compatibility when making changes to existing charts +- Test all changes thoroughly before committing +- Document configuration options clearly in values.yaml and README files + +## Helm Chart Development + +### Chart Structure + +- Organize charts following the standard Helm chart structure: + - `Chart.yaml`: 
Chart metadata and dependencies + - `values.yaml`: Default configuration values with comprehensive comments + - `templates/`: Kubernetes manifest templates + - `templates/_helpers.tpl`: Helper templates for reusable snippets + - `README.md`: Chart documentation with usage examples + +### Chart.yaml + +- Follow semantic versioning for chart versions +- Increment `version` for chart changes, `appVersion` for application version changes +- List all dependencies with specific version constraints +- Include maintainer information and useful metadata +- Add keywords and home/source URLs for discoverability + +### values.yaml + +- Provide sensible defaults that work out of the box +- Document each configuration option with inline comments +- Group related settings logically with clear section headers +- Use consistent naming conventions (camelCase recommended) +- Mark required values clearly and provide example values +- Consider backward compatibility when adding or modifying values +- Use nested structures to organize complex configurations + +### Templates + +- Use consistent indentation (2 spaces) +- Include helpful comments explaining complex logic +- Use `{{- ` and ` -}}` to control whitespace appropriately +- Leverage `_helpers.tpl` for common patterns and labels +- Always quote string values in templates to prevent type issues +- Use `.Values`, `.Chart`, `.Release` objects appropriately +- Validate required values with `required` function +- Use `toYaml` and `nindent` for clean YAML output +- Include resource limits and requests for all containers +- Add health checks (liveness and readiness probes) where appropriate + +### Template Best Practices + +- Use `include` instead of `template` for better error messages +- Define common labels in `_helpers.tpl` and reuse them +- Use consistent naming for Kubernetes resources: `{{ include "chart.fullname" . 
}}` +- Implement conditional resource creation with `if` statements +- Validate inputs using the `required` and `fail` functions +- Use `lookup` function carefully (not available in `helm template`) +- Handle list values properly with `toYaml` and proper indentation + +### Testing and Validation + +- Run `helm lint` to check for issues before committing +- Use `helm template` to render manifests and verify output +- Test with `ct lint` (chart-testing tool) for comprehensive validation +- Use `kubeconform` or similar tools to validate Kubernetes manifests +- Test installation with `helm install` in a test cluster +- Verify upgrades work correctly with `helm upgrade` +- Test with different values files to ensure flexibility + +## Kubernetes Manifest Best Practices + +### Resource Specifications + +- Always specify resource requests and limits +- Set appropriate security contexts (runAsNonRoot, readOnlyRootFilesystem, etc.) +- Use namespaces for resource isolation +- Apply proper RBAC (Roles, RoleBindings, ServiceAccounts) +- Add meaningful labels and annotations +- Use selectors consistently + +### ConfigMaps and Secrets + +- Use ConfigMaps for non-sensitive configuration +- Use Secrets for sensitive data +- Reference Secrets securely in pod specs +- Consider using external secret management solutions +- Document which secrets need to be created before installation + +### Networking + +- Define Services with appropriate types (ClusterIP, NodePort, LoadBalancer) +- Configure Ingress resources with proper annotations for your ingress controller +- Use NetworkPolicies for network segmentation when needed +- Document external dependencies and endpoints + +### High Availability and Scaling + +- Support replica configuration for stateless applications +- Use PodDisruptionBudgets for critical services +- Configure HorizontalPodAutoscaler when appropriate +- Consider anti-affinity rules for better pod distribution +- Use StatefulSets for stateful applications + +### 
Observability + +- Include health check endpoints for all services +- Add Prometheus annotations for metric scraping when applicable +- Configure proper logging (stdout/stderr) +- Add readiness and liveness probes with appropriate thresholds + +## Argo-Specific Patterns + +### Argo Workflows + +- Follow Argo Workflows best practices for WorkflowTemplate definitions +- Use proper artifact repository configuration +- Configure service accounts with appropriate RBAC +- Use templates for reusable workflow components +- Document workflow parameters and usage + +### Argo CD + +- Structure Application manifests with proper sync policies +- Use automated sync with caution (prune and selfHeal options) +- Configure proper health checks for custom resources +- Use Projects for multi-tenancy when appropriate +- Document repository requirements and access patterns + +### Argo Events + +- Define EventSources with proper authentication +- Create Sensors with clear trigger conditions +- Use proper RBAC for event processing +- Document webhook configurations and expected payloads + +## Multi-Tenancy and RBAC + +- Create proper namespace isolation +- Define clear RBAC roles (viewer, runner, admin) +- Use RoleBindings and ClusterRoleBindings appropriately +- Document permission requirements +- Test RBAC policies with `kubectl auth can-i` + +## Documentation + +- Keep README.md up to date with: + - Prerequisites and dependencies + - Installation instructions + - Configuration examples + - Upgrade procedures + - Troubleshooting tips +- Document breaking changes in CHANGELOG +- Provide example values files for common scenarios +- Include mermaid diagrams for architecture when helpful + +## Common Pitfalls to Avoid + +- Don't hardcode values that should be configurable +- Don't ignore backward compatibility in existing charts +- Don't skip testing with different values combinations +- Don't forget to update Chart.yaml version +- Don't use deprecated Kubernetes API versions +- Don't omit 
resource limits (can cause cluster issues) +- Don't expose secrets in logs or status outputs +- Avoid creating breaking changes without major version bump + +## Validation Commands + +Always run these commands before committing: + +```bash +# Add Helm dependencies +helm repo add argo https://argoproj.github.io/argo-helm +helm repo update + +# Build dependencies +helm dependency build helm/argo-stack + +# Lint the chart +helm lint helm/argo-stack --values helm/argo-stack/values.yaml + +# Render templates +helm template argo-stack helm/argo-stack \ + --values helm/argo-stack/values.yaml \ + --namespace argocd > rendered.yaml + +# Validate manifests +kubeconform -strict -ignore-missing-schemas \ + -skip 'CustomResourceDefinition|Application|Workflow|WorkflowTemplate' \ + -summary rendered.yaml + +# Test with ct (if available) +ct lint --config .ct.yaml +``` + +## Version Compatibility + +- Target Kubernetes 1.20+ unless specific compatibility is needed +- Use stable API versions (avoid alpha/beta in production) +- Test with multiple Kubernetes versions when possible +- Document minimum required versions in Chart.yaml and README diff --git a/.github/instructions/python.instructions.md b/.github/instructions/python.instructions.md new file mode 100644 index 00000000..42682c73 --- /dev/null +++ b/.github/instructions/python.instructions.md @@ -0,0 +1,577 @@ +--- +description: 'Instructions for Python development following best practices and conventions' +applyTo: '**/*.py, **/requirements*.txt, **/setup.py, **/pyproject.toml' +--- + +# Python Development Instructions + +## General Principles + +- Write clear, readable, and maintainable Python code +- Follow PEP 8 style guidelines +- Use Python 3.9+ features and best practices +- Write comprehensive tests for all functionality +- Document code with docstrings and type hints +- Prefer explicit over implicit + +## Code Style and Formatting + +### PEP 8 Compliance + +- Use 4 spaces for indentation (never tabs) +- Limit lines 
to 88-100 characters (prefer 88 for Black compatibility) +- Use blank lines to separate logical sections +- Follow naming conventions: + - `snake_case` for functions and variables + - `PascalCase` for classes + - `UPPER_CASE` for constants + - `_leading_underscore` for private/internal + +### Imports + +- Group imports in order: standard library, third-party, local +- Use absolute imports over relative imports +- Sort imports alphabetically within groups +- One import per line for clarity: + +```python +# Standard library +import os +import sys +from typing import Dict, List, Optional + +# Third-party +import flask +from flask import Flask, request + +# Local +from .config import Config +from .utils import helper_function +``` + +### Type Hints + +- Always use type hints for function signatures +- Use `Optional[T]` for values that can be None +- Import types from `typing` module +- Use `-> None` for functions that don't return values + +```python +from typing import Dict, List, Optional + +def process_data( + data: List[str], + config: Optional[Dict[str, str]] = None +) -> Dict[str, int]: + """Process data and return results.""" + result: Dict[str, int] = {} + # Implementation + return result +``` + +## Documentation + +### Docstrings + +- Use docstrings for all public modules, classes, and functions +- Follow Google or NumPy style for multi-line docstrings +- Include parameter descriptions and return values +- Document exceptions that can be raised + +```python +def validate_token(token: str, fence_base: str) -> Dict[str, Any]: + """ + Validate an authentication token against Fence. 
+ + Args: + token: The authentication token to validate + fence_base: Base URL for the Fence authentication service + + Returns: + Dictionary containing user information and authorization data + + Raises: + ValueError: If token is empty or invalid format + requests.HTTPError: If Fence API request fails + """ + if not token: + raise ValueError("Token cannot be empty") + # Implementation +``` + +### Comments + +- Write comments for complex logic, not obvious code +- Keep comments up to date with code changes +- Use `#` for inline comments, prefer docstrings for functions +- Explain "why" not "what" when the code is self-documenting + +## Functions and Classes + +### Function Design + +- Keep functions small and focused (single responsibility) +- Use descriptive function names that indicate purpose +- Limit function parameters (consider using dataclasses for many params) +- Return early to reduce nesting + +```python +def decide_groups(user_doc: Dict[str, Any]) -> List[str]: + """Determine user's authorization groups.""" + if not user_doc.get("active"): + return [] + + groups = [] + authz = user_doc.get("authz", {}) + + # Check for admin privileges + if _is_admin(user_doc): + groups.extend(["argo-admin", "argo-runner", "argo-viewer"]) + return groups + + # Check for runner privileges + if _has_workflow_access(authz): + groups.append("argo-runner") + + return groups +``` + +### Classes + +- Use classes for stateful objects and related functionality +- Implement `__init__`, `__repr__`, and other dunder methods as needed +- Use properties for computed attributes +- Consider dataclasses for simple data containers + +```python +from dataclasses import dataclass +from typing import Optional + +@dataclass +class WorkflowConfig: + """Configuration for workflow execution.""" + name: str + namespace: str + service_account: Optional[str] = None + timeout: int = 300 + + def __post_init__(self): + """Validate configuration after initialization.""" + if self.timeout < 0: + raise 
ValueError("Timeout cannot be negative") +``` + +## Error Handling + +### Exceptions + +- Use specific exception types, not bare `except:` +- Catch exceptions at the appropriate level +- Log errors before re-raising or returning error responses +- Create custom exceptions for domain-specific errors + +```python +class AuthorizationError(Exception): + """Raised when user is not authorized for an action.""" + pass + +def check_authorization(user: str, resource: str) -> bool: + """Check if user can access resource.""" + try: + result = validate_access(user, resource) + return result + except requests.RequestException as e: + logger.error(f"Authorization check failed: {e}") + raise AuthorizationError(f"Cannot verify access for {user}") from e + except Exception as e: + logger.exception("Unexpected error in authorization check") + raise +``` + +### Validation + +- Validate inputs early +- Use descriptive error messages +- Consider using libraries like `pydantic` for complex validation + +```python +def process_request(data: Dict[str, Any]) -> Dict[str, Any]: + """Process incoming request.""" + # Validate required fields + required_fields = ["user_id", "action", "resource"] + missing = [f for f in required_fields if f not in data] + if missing: + raise ValueError(f"Missing required fields: {', '.join(missing)}") + + # Validate field values + if not data["user_id"].strip(): + raise ValueError("user_id cannot be empty") + + # Process data + return perform_action(data) +``` + +## Flask/Web Application Patterns + +### Application Structure + +- Use application factory pattern for Flask apps +- Separate configuration, routes, and business logic +- Use blueprints for modular organization +- Configure proper logging + +```python +from flask import Flask +import logging + +def create_app(config: Optional[Dict] = None) -> Flask: + """Create and configure Flask application.""" + app = Flask(__name__) + + # Configure logging + logging.basicConfig(level=logging.INFO) + + # Load 
configuration + if config: + app.config.update(config) + + # Register routes + register_routes(app) + + return app +``` + +### Route Handlers + +- Keep route handlers thin (delegate to service layer) +- Validate inputs +- Return appropriate HTTP status codes +- Use consistent response format + +```python +from flask import Flask, request, jsonify + +app = Flask(__name__) + +@app.route("/check", methods=["GET"]) +def check_authorization(): + """Check if user is authorized.""" + try: + # Extract headers + auth_header = request.headers.get("Authorization", "") + if not auth_header: + return jsonify({"error": "Missing Authorization header"}), 401 + + # Validate token + token = auth_header.replace("Bearer ", "") + user_info = validate_token(token) + + # Check authorization + groups = decide_groups(user_info) + if not groups: + return jsonify({"error": "Unauthorized"}), 403 + + # Success response + return jsonify({ + "authorized": True, + "groups": groups + }), 200 + + except ValueError as e: + return jsonify({"error": str(e)}), 400 + except Exception as e: + app.logger.exception("Authorization check failed") + return jsonify({"error": "Internal server error"}), 500 +``` + +### Health Checks + +- Implement health check endpoints +- Check dependencies (database, external services) +- Return appropriate status codes + +```python +@app.route("/healthz", methods=["GET"]) +def health_check(): + """Health check endpoint.""" + try: + # Check if critical services are accessible + check_external_dependencies() + return jsonify({"status": "healthy"}), 200 + except Exception as e: + app.logger.error(f"Health check failed: {e}") + return jsonify({"status": "unhealthy", "error": str(e)}), 503 +``` + +## Testing + +### Test Structure + +- Use `pytest` for testing +- Organize tests to mirror source structure +- Use descriptive test names that explain what is being tested +- Group related tests in classes + +```python +import pytest +from app import decide_groups + +class 
TestDecideGroups: + """Tests for decide_groups function.""" + + def test_inactive_user_returns_empty_list(self): + """Inactive users should have no groups.""" + user_doc = {"active": False} + assert decide_groups(user_doc) == [] + + def test_admin_user_gets_all_groups(self): + """Admin users should get all permission groups.""" + user_doc = { + "active": True, + "email": "admin@example.com", + "authz": {} + } + groups = decide_groups(user_doc) + assert "argo-admin" in groups + assert "argo-runner" in groups + assert "argo-viewer" in groups +``` + +### Fixtures + +- Use pytest fixtures for common test setup +- Keep fixtures focused and reusable +- Use `conftest.py` for shared fixtures + +```python +# conftest.py +import pytest +from app import create_app + +@pytest.fixture +def app(): + """Create Flask app for testing.""" + app = create_app({"TESTING": True}) + return app + +@pytest.fixture +def client(app): + """Create test client.""" + return app.test_client() + +@pytest.fixture +def sample_user(): + """Sample user document for testing.""" + return { + "active": True, + "email": "test@example.com", + "authz": { + "/workflows/submit": [{"method": "create"}] + } + } +``` + +### Test Coverage + +- Aim for high test coverage (80%+ for critical code) +- Test edge cases and error conditions +- Use `pytest-cov` to measure coverage +- Don't just aim for coverage, ensure meaningful tests + +### Mocking + +- Use `unittest.mock` or `pytest-mock` for external dependencies +- Mock network calls, file I/O, and external services +- Keep mocks simple and focused + +```python +from unittest.mock import Mock, patch +import pytest + +def test_token_validation_with_mock(): + """Test token validation with mocked HTTP call.""" + mock_response = Mock() + mock_response.json.return_value = { + "active": True, + "email": "user@example.com" + } + + with patch("requests.get", return_value=mock_response): + result = validate_token("test-token", "https://fence.example.com") + assert 
result["active"] is True +``` + +## Dependencies and Environment + +### Requirements Files + +- Use `requirements.txt` for production dependencies +- Use `requirements-dev.txt` for development dependencies +- Pin versions for reproducibility +- Keep dependencies minimal and up to date + +``` +# requirements.txt +flask==3.0.0 +requests==2.31.0 + +# requirements-dev.txt +pytest==7.4.3 +pytest-cov==4.1.0 +black==23.12.0 +flake8==6.1.0 +``` + +### Virtual Environments + +- Always use virtual environments +- Document setup in README +- Consider using `venv`, `virtualenv`, or `uv` + +## Logging + +### Logger Configuration + +- Use Python's `logging` module +- Configure appropriate log levels +- Include context in log messages +- Don't log sensitive information + +```python +import logging + +logger = logging.getLogger(__name__) + +def process_authorization(user_id: str, resource: str) -> bool: + """Process authorization request.""" + logger.info(f"Checking authorization for user {user_id} on {resource}") + + try: + result = check_access(user_id, resource) + logger.info(f"Authorization check completed: {result}") + return result + except Exception as e: + logger.error(f"Authorization check failed for {user_id}: {e}") + raise +``` + +## Security Considerations + +### Input Validation + +- Validate all external inputs +- Sanitize data before use +- Use parameterized queries for databases +- Validate file paths to prevent path traversal + +### Secrets Management + +- Never hardcode secrets +- Use environment variables or secret management systems +- Don't log sensitive data +- Use secure random number generation for tokens + +```python +import os +import secrets + +def get_api_key() -> str: + """Get API key from environment.""" + api_key = os.environ.get("API_KEY") + if not api_key: + raise ValueError("API_KEY environment variable not set") + return api_key + +def generate_token() -> str: + """Generate secure random token.""" + return secrets.token_urlsafe(32) +``` + +## 
Performance Considerations + +### Efficient Code + +- Use list comprehensions for simple transformations +- Use generators for large datasets +- Cache expensive computations when appropriate +- Profile before optimizing + +```python +# Good: List comprehension +active_users = [u for u in users if u.get("active")] + +# Good: Generator for large datasets +def process_large_file(filename: str): + """Process large file line by line.""" + with open(filename) as f: + for line in f: + yield process_line(line) + +# Good: Caching with functools +from functools import lru_cache + +@lru_cache(maxsize=128) +def expensive_computation(n: int) -> int: + """Cached expensive operation.""" + return sum(i * i for i in range(n)) +``` + +## Common Patterns + +### Context Managers + +- Use context managers for resource management +- Implement `__enter__` and `__exit__` for custom context managers + +```python +from contextlib import contextmanager + +@contextmanager +def managed_resource(resource_name: str): + """Manage resource lifecycle.""" + resource = acquire_resource(resource_name) + try: + yield resource + finally: + release_resource(resource) + +# Usage +with managed_resource("my-resource") as res: + res.do_something() +``` + +## Common Pitfalls to Avoid + +- Don't use mutable default arguments (`def func(items=[]):`) +- Don't modify lists while iterating over them +- Don't catch `Exception` without logging or re-raising +- Don't use `eval()` or `exec()` with untrusted input +- Don't ignore return values from functions +- Don't use global variables when class attributes would work +- Avoid circular imports (reorganize code structure) + +## Code Quality Tools + +### Linting and Formatting + +- Use `black` for code formatting +- Use `flake8` or `pylint` for linting +- Use `mypy` for type checking +- Configure tools in `pyproject.toml` or setup.cfg + +```bash +# Format code +black . + +# Check style +flake8 . 
+ # Type checking +mypy app.py +``` + +### Pre-commit Hooks + +- Set up pre-commit hooks for automatic checks +- Run formatters and linters before committing +- Ensure tests pass before pushing diff --git a/.github/scripts/validate-instructions.sh b/.github/scripts/validate-instructions.sh new file mode 100755 index 00000000..c3dc7e02 --- /dev/null +++ b/.github/scripts/validate-instructions.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Validate GitHub Copilot instruction files +# This script checks that all instruction files have proper frontmatter + +set -euo pipefail + +INSTRUCTIONS_DIR=".github/instructions" +ERRORS=0 + +echo "🔍 Validating Copilot instruction files..." +echo "" + +# Check if instructions directory exists +if [[ ! -d "${INSTRUCTIONS_DIR}" ]]; then + echo "❌ Error: Instructions directory not found: ${INSTRUCTIONS_DIR}" + exit 1 +fi + +# Find all .instructions.md files +shopt -s nullglob +instruction_files=("${INSTRUCTIONS_DIR}"/*.instructions.md) + +if [[ ${#instruction_files[@]} -eq 0 ]]; then + echo "⚠️ Warning: No instruction files found in ${INSTRUCTIONS_DIR}" + exit 0 +fi + +echo "Found ${#instruction_files[@]} instruction file(s)" +echo "" + +# Validate each instruction file +for file in "${instruction_files[@]}"; do + filename=$(basename "$file") + echo "Checking ${filename}..." + + # Check if file starts with frontmatter + if ! head -n 1 "$file" | grep -q "^---$"; then + echo " ❌ Missing frontmatter opening delimiter" + ((ERRORS++)) + continue + fi + + # Check for description field + if ! head -n 10 "$file" | grep -q "^description:"; then + echo " ❌ Missing 'description' field in frontmatter" + ((ERRORS++)) + else + echo " ✅ Has description field" + fi + + # Check for applyTo field + if ! head -n 10 "$file" | grep -q "^applyTo:"; then + echo " ❌ Missing 'applyTo' field in frontmatter" + ((ERRORS++)) + else + echo " ✅ Has applyTo field" + fi + + # Check for closing frontmatter delimiter + if ! 
head -n 10 "$file" | grep -A 1 "^description:" | tail -n +2 | grep -q "^---$" && \ + ! head -n 10 "$file" | grep -A 1 "^applyTo:" | tail -n +2 | grep -q "^---$"; then + # Check more lines for closing delimiter + if ! head -n 15 "$file" | tail -n +2 | grep -q "^---$"; then + echo " ❌ Missing frontmatter closing delimiter" + ((ERRORS++)) + else + echo " ✅ Has frontmatter closing delimiter" + fi + else + echo " ✅ Has frontmatter closing delimiter" + fi + + # Check file has content beyond frontmatter + line_count=$(wc -l < "$file") + if [[ $line_count -lt 20 ]]; then + echo " ⚠️ Warning: File seems very short (${line_count} lines)" + else + echo " ✅ Has substantial content (${line_count} lines)" + fi + + echo "" +done + +# Summary +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +if [[ $ERRORS -eq 0 ]]; then + echo "✅ All instruction files are valid!" + exit 0 +else + echo "❌ Found ${ERRORS} error(s) in instruction files" + exit 1 +fi diff --git a/.gitignore b/.gitignore index f86ac8fb..f5dad009 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ __pycache__/ rendered.yaml helm/argo-stack/charts/argo-events-*.tgz helm/argo-stack/charts/external-secrets-*.tgz +.coverage +htmlcov/ +.pytest_cache/ diff --git a/CLEANUP_SUMMARY.md b/CLEANUP_SUMMARY.md deleted file mode 100644 index 2452b4a5..00000000 --- a/CLEANUP_SUMMARY.md +++ /dev/null @@ -1,116 +0,0 @@ -# Orphan Resources Cleanup Summary - -## Commits in This PR - -1. **f721715** - Add comprehensive analysis of orphan resources and legacy code -2. **be71bad** - Clean up orphaned templates and values from deprecated patterns -3. **65de60f** - Update ORPHAN_ANALYSIS.md with completed changes -4. **7e7adc2** - Fix remaining hardcoded namespace in workflowTemplates config -5. **ed9c1ec** - Remove all remaining hardcoded wf-poc references from examples and comments - -## What Was Removed - -### Templates (DELETED, not modified) -Following feedback "don't change templates just to conform. 
if the template is not being used, delete it": - -- ❌ `templates/workflows/per-app-workflowtemplates.yaml` - Used deprecated `applications` array -- ❌ `templates/10-rbac.yaml` - Legacy RBAC for static wf-poc namespace -- ❌ `templates/workflows/argo-workflow-runner-rbac.yaml` - Legacy RBAC for wf-poc -- ❌ `templates/workflows/sensor-argo-events-rbac.yaml` - Legacy RBAC for wf-poc -- ❌ `templates/events/role-wf-submit.yaml` - Legacy RBAC for wf-poc -- ❌ `templates/workflows/workflowtemplate-nextflow-hello.yaml` - Example template not used by repoRegistrations -- ❌ `templates/workflows/workflowtemplate-nextflow-runner.yaml` - Legacy template (replaced by nextflow-repo-runner in per-tenant) - -### Values (values.yaml) -- ❌ `applications: []` - Deprecated array declaration -- ❌ `events.github.repositories: []` - Deprecated array declaration -- ❌ `events.sensor.*` - Entire configuration block (auto-generated now) -- ❌ `workflows.namespace` - Hardcoded wf-poc -- ❌ `workflows.templateRef` - Unused field -- ❌ `workflowTemplates.*` - Entire configuration block (configured example templates that are now deleted) - -## What Was Updated - -### Example Files -- ✅ `values-multi-app.yaml` - Now demonstrates `repoRegistrations` pattern instead of deprecated `applications` - -### Documentation References -- ✅ Fixed references to point to correct doc: `docs/repo-registration-guide.md` -- ✅ Updated deprecation notices with clearer guidance - -## What Was Kept (Intentionally) - -### Active Templates (Used by RepoRegistration Pattern) -- ✅ `01-tenant-namespaces-from-repo-registrations.yaml` - Creates per-tenant namespaces -- ✅ `11-tenant-rbac-from-repo-registrations.yaml` - Creates per-tenant RBAC -- ✅ `22-tenant-artifact-repositories-from-repo-registrations.yaml` - Creates per-tenant artifact repos -- ✅ `per-tenant-workflowtemplates.yaml` - Creates nextflow-repo-runner per tenant -- ✅ `workflowtemplate-nextflow-repo-runner.yaml` - Base template (may be referenced globally) - -### 
Values -- ✅ `s3` global config block - Backward compatibility for global artifact config -- ✅ `namespaces.tenant` - Still defined but only used as fallback/legacy -- ✅ `workflows.runnerServiceAccount` - Defines ServiceAccount name used by sensors - -## Impact Assessment - -### Breaking Changes -**NONE** - All removed items were unused legacy templates - -### Improvements -1. **Clarity** - Removed unused templates instead of modifying them -2. **Simplicity** - RepoRegistration pattern creates all resources automatically -3. **Documentation** - Clear explanation of which templates are used vs legacy -4. **Maintainability** - Less code to maintain, cleaner separation of concerns - -## Rationale - -**Why delete instead of modify?** - -The RepoRegistration pattern creates ALL necessary resources automatically: -- Namespaces: `wf--` via `01-tenant-namespaces-from-repo-registrations.yaml` -- RBAC: ServiceAccount, Role, RoleBinding via `11-tenant-rbac-from-repo-registrations.yaml` -- WorkflowTemplates: `nextflow-repo-runner` via `per-tenant-workflowtemplates.yaml` - -The deleted templates were all for the **static `wf-poc` namespace** which is: -- Not used when deploying with `repoRegistrations` -- A legacy pattern from before multi-tenancy -- Redundant with per-tenant resources - -## Verification - -### Files Modified -- `ORPHAN_ANALYSIS.md` (created, updated) -- `CLEANUP_SUMMARY.md` (created, updated) -- `helm/argo-stack/values-multi-app.yaml` (migrated to repoRegistrations, removed workflowTemplates) -- `helm/argo-stack/values.yaml` (removed deprecated config blocks) - -### Files Deleted -- `helm/argo-stack/templates/10-rbac.yaml` -- `helm/argo-stack/templates/events/role-wf-submit.yaml` -- `helm/argo-stack/templates/workflows/argo-workflow-runner-rbac.yaml` -- `helm/argo-stack/templates/workflows/per-app-workflowtemplates.yaml` -- `helm/argo-stack/templates/workflows/sensor-argo-events-rbac.yaml` -- 
`helm/argo-stack/templates/workflows/workflowtemplate-nextflow-hello.yaml` -- `helm/argo-stack/templates/workflows/workflowtemplate-nextflow-runner.yaml` - -### Total Changes -- 11 files modified/deleted -- 7 templates deleted -- Significant reduction in maintenance burden - -## Recommendations for Next Steps - -1. Consider removing `workflowtemplate-nextflow-runner.yaml` if not actively used -2. Add explicit documentation about `namespaces.tenant` being legacy -3. Consider deprecating global `s3` config in favor of repoRegistrations only -4. Update CI/CD to validate no hardcoded namespaces in future PRs - -## Testing Checklist - -- [x] Code review completed - all feedback addressed -- [x] All hardcoded namespace references eliminated -- [x] Documentation references verified to exist -- [x] Deprecation notices updated with correct guidance -- [ ] Helm template rendering validation (requires network access) -- [ ] Deploy to test cluster (manual step) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c03fa4b5..7ee94f89 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,134 +14,15 @@ Thank you for your interest in contributing! This document provides guidelines f ```bash git checkout -b feature/your-feature-name ``` +4. **Make your changes** in the relevant directories: +5. **Run tests and validations** (see Testing section in quickstart.md): + - Helm linting + - Chart testing with kind + - Unit tests for authz-adapter + - Kubeconform schema validation +6. **Commit your changes** with clear messages: +7. 
**Push your branch** to your fork: + ```bash + git push origin feature/your-feature-name + ``` -## 🏗️ Development Environment - -### Prerequisites - -- Docker -- Kubernetes cluster (kind, minikube, or cloud) -- Helm v3.x -- Python 3.9+ (for authz-adapter development) - -### Setup - -```bash -# Install Python dependencies for authz-adapter -cd authz-adapter -pip install -r requirements.txt -pip install -r requirements-dev.txt - -# Run tests -python -m pytest tests/ -v - -# Build Docker image -docker build -t authz-adapter:dev . -``` - -## 🧪 Testing - -### Quick Start - -```bash -# Fast checks -helm repo add argo https://argoproj.github.io/argo-helm && helm repo update -helm lint helm/argo-stack --values helm/argo-stack/values.yaml -helm dependency build helm/argo-stack -helm template argo-stack helm/argo-stack --values helm/argo-stack/values.yaml --namespace argocd > rendered.yaml -kubeconform -strict -ignore-missing-schemas -skip 'CustomResourceDefinition|Application|Workflow|WorkflowTemplate' -summary rendered.yaml - -# kind + ct -kind create cluster -ct lint --config .ct.yaml -ct install --config .ct.yaml --debug - -# adapter tests -cd authz-adapter && python3 -m pip install -r requirements.txt pytest && pytest -q -``` - -### 0. Prerequisites - -```bash -# Helm -brew install helm || sudo snap install helm --classic - -# kind (Kubernetes-in-Docker) -brew install kind || GO111MODULE="on" go install sigs.k8s.io/kind@v0.23.0 - -# Python tooling for adapter tests -python3 -m pip install --upgrade pip -python3 -m pip install pytest - -# chart-testing (ct) -brew install chart-testing || pipx install chart-testing - -# kubeconform (schema validation) -brew install kubeconform || curl -L https://github.com/yannh/kubeconform/releases/latest/download/kubeconform-linux-amd64.tar.gz | tar xz && sudo mv kubeconform /usr/local/bin/ -``` - -### 1. 
Helm lint, template, kubeconform - -```bash -# From repo root -helm repo add argo https://argoproj.github.io/argo-helm -helm repo update - -# Lint the umbrella chart -helm lint helm/argo-stack --values helm/argo-stack/values.yaml - -# Render the chart to plain YAML -helm dependency build helm/argo-stack -helm template argo-stack helm/argo-stack --values helm/argo-stack/values.yaml --namespace argocd > rendered.yaml - -# Validate rendered manifests (skip CRDs and Argo custom resources) -kubeconform -strict -ignore-missing-schemas -skip 'CustomResourceDefinition|Application|Workflow|WorkflowTemplate' -summary rendered.yaml -``` - -### 2. Chart Testing (ct) - -This replicates CI: spin up kind, lint, then install the chart and smoke test it. - -```bash -# Fresh kind cluster -kind delete cluster || true -kind create cluster - -# Ensure dependencies are built -helm dependency build helm/argo-stack - -# Lint using ct (uses .ct.yaml) -ct lint --config .ct.yaml - -# Install and smoke test -ct install --config .ct.yaml --debug -``` - -**Notes** -- `ct` uses the working tree and `.ct.yaml` to find `helm/argo-stack`. -- To test with custom values, commit a `ci-values.yaml` or temporarily edit `values.yaml` before running. - -### 3. authz-adapter unit tests - -```bash -cd authz-adapter -python3 -m pip install -r requirements.txt pytest -pytest -q -``` - -What’s tested: -- `decide_groups(...)` logic (mapping `/user/user` authz JSON to groups like `argo-runner` and `argo-viewer`). - -### 4. Troubleshooting - -- **kubeconform errors on CRDs** - Keep skipping `CustomResourceDefinition|Application|Workflow|WorkflowTemplate` or provide schemas for these CRDs. - -- **`ct install` hangs or times out** - Use `--debug` to inspect controller/server logs. Ensure Docker has enough CPU/RAM for kind. - -- **Argo CD Application points to a repo path with no manifests** - That’s fine as a placeholder; it syncs “empty”. 
Add K8s manifests or a `kustomization.yaml` in that repo path for real resources. - -- **Port-forward conflicts** - Change the left port: e.g., `8081:80`, `2747:2746`. diff --git a/FINAL_SUMMARY.md b/FINAL_SUMMARY.md deleted file mode 100644 index 7ab8c1b7..00000000 --- a/FINAL_SUMMARY.md +++ /dev/null @@ -1,167 +0,0 @@ -# Final Cleanup Summary - -## User Feedback Addressed - -**Feedback**: "don't change templates just to conform. if the template is not being used, delete it" - -**Action Taken**: Deleted all unused legacy templates instead of modifying them. - ---- - -## Templates Deleted (7 files, -387 lines) - -All legacy templates for the static `wf-poc` namespace: - -1. ❌ `templates/10-rbac.yaml` - Legacy RBAC for wf-poc -2. ❌ `templates/events/role-wf-submit.yaml` - Legacy role for wf-poc -3. ❌ `templates/workflows/argo-workflow-runner-rbac.yaml` - Legacy workflow RBAC -4. ❌ `templates/workflows/sensor-argo-events-rbac.yaml` - Legacy sensor RBAC -5. ❌ `templates/workflows/workflowtemplate-nextflow-hello.yaml` - Example template -6. ❌ `templates/workflows/workflowtemplate-nextflow-runner.yaml` - Legacy runner template -7. 
❌ `templates/workflows/per-app-workflowtemplates.yaml` - Used deprecated `applications` array - ---- - -## Why These Were Deleted - -The **RepoRegistration pattern** creates all necessary resources automatically per tenant: - -| Legacy Template (Deleted) | Replaced By (Active) | Purpose | -|---------------------------|---------------------|---------| -| `10-rbac.yaml` | `11-tenant-rbac-from-repo-registrations.yaml` | Per-tenant RBAC | -| `argo-workflow-runner-rbac.yaml` | `11-tenant-rbac-from-repo-registrations.yaml` | Workflow executor permissions | -| `sensor-argo-events-rbac.yaml` | `11-tenant-rbac-from-repo-registrations.yaml` | Sensor permissions | -| `role-wf-submit.yaml` | `11-tenant-rbac-from-repo-registrations.yaml` | Workflow submission permissions | -| `workflowtemplate-nextflow-hello.yaml` | `per-tenant-workflowtemplates.yaml` | Example/test template | -| `workflowtemplate-nextflow-runner.yaml` | `per-tenant-workflowtemplates.yaml` | Creates `nextflow-repo-runner` per tenant | -| `per-app-workflowtemplates.yaml` | `per-tenant-workflowtemplates.yaml` | Uses `repoRegistrations` not `applications` | - ---- - -## Active Templates (22 files) - -Templates actively used by the RepoRegistration pattern: - -### Core Infrastructure -- ✅ `00-namespaces.yaml` - Core namespaces (argo, argocd, security, etc.) 
-- ✅ `20-artifact-repositories.yaml` - Global artifact repository config -- ✅ `30-authz-adapter.yaml` - Authorization adapter -- ✅ `40-argo-workflows-ingress.yaml` - Argo Workflows ingress -- ✅ `41-argocd-ingress.yaml` - ArgoCD ingress -- ✅ `90-argocd-application.yaml` - ArgoCD application - -### RepoRegistration-Driven (Multi-Tenant) -- ✅ `01-tenant-namespaces-from-repo-registrations.yaml` - Creates `wf--` namespaces -- ✅ `11-tenant-rbac-from-repo-registrations.yaml` - Creates per-tenant ServiceAccount, Role, RoleBinding -- ✅ `21-per-app-artifact-repositories-from-repo-registrations.yaml` - Per-repo artifact config -- ✅ `22-tenant-artifact-repositories-from-repo-registrations.yaml` - Per-tenant artifact config -- ✅ `argocd/applications-from-repo-registrations.yaml` - Creates ArgoCD apps per repo -- ✅ `workflows/per-tenant-workflowtemplates.yaml` - Creates `nextflow-repo-runner` per tenant -- ✅ `workflows/workflowtemplate-nextflow-repo-runner.yaml` - Base nextflow runner template - -### External Secrets (ESO) -- ✅ `eso/secretstore.yaml` - Vault SecretStore -- ✅ `eso/serviceaccount.yaml` - ESO ServiceAccount -- ✅ `eso/externalsecret-argocd.yaml` - ArgoCD secrets -- ✅ `eso/externalsecret-repo-registrations-github.yaml` - Per-repo GitHub secrets -- ✅ `eso/externalsecret-repo-registrations-s3.yaml` - Per-repo S3 secrets - -### Argo Events -- ✅ `events/eventbus.yaml` - EventBus for Argo Events -- ✅ `events/eventsource-github-from-repo-registrations.yaml` - GitHub EventSource per repo -- ✅ `events/sensor-github-push.yaml` - Sensor for GitHub push events -- ✅ `events/secret-github.yaml` - GitHub webhook secret - ---- - -## Values Cleaned Up - -**Removed orphaned configuration blocks:** -- ❌ `applications: []` -- ❌ `events.github.repositories: []` -- ❌ `events.sensor.*` -- ❌ `workflowTemplates.*` -- ❌ `workflows.namespace` -- ❌ `workflows.templateRef` - -**Kept:** -- ✅ `workflows.runnerServiceAccount: wf-runner` - ServiceAccount name reference -- ✅ `namespaces.tenant: 
wf-poc` - Backward compatibility / legacy namespace -- ✅ `s3.*` - Global S3 config for backward compatibility - ---- - -## Impact - -| Metric | Before | After | Change | -|--------|--------|-------|--------| -| Template files | 29 | 22 | -7 files | -| Lines of code | ~1100 | ~713 | -387 lines | -| Legacy templates | 7 | 0 | **100% removed** | -| Hardcoded namespaces | Multiple | 0 | **All eliminated** | - -### Benefits -1. **Clarity**: Only templates that are actually used remain -2. **Simplicity**: RepoRegistration pattern handles everything -3. **Maintainability**: 35% less template code to maintain -4. **Consistency**: All resources created via single pattern - -### Breaking Changes -**NONE** - Only deleted unused legacy templates - ---- - -## Deployment Model - -**Before** (Legacy): -``` -Static wf-poc namespace -├── Hardcoded RBAC -├── Hardcoded ServiceAccount -└── Manual WorkflowTemplate creation -``` - -**After** (RepoRegistration): -``` -repoRegistrations: - - name: my-project - tenant: myorg - ... - -Automatically creates: -├── Namespace: wf-myorg-my-project -├── ServiceAccount: wf-runner -├── Role: workflow-executor -├── RoleBinding: workflow-executor-binding -├── WorkflowTemplate: nextflow-repo-runner -├── ExternalSecrets: github-secret-*, s3-credentials-* -└── ArgoCD Application: my-project -``` - ---- - -## Verification - -```bash -# Templates actually in use -$ find helm/argo-stack/templates -name "*.yaml" | wc -l -22 - -# Templates from repoRegistrations pattern -$ grep -l "repoRegistrations" helm/argo-stack/templates/**/*.yaml | wc -l -8 - -# Legacy wf-poc references (should be 0 in templates) -$ grep -r "wf-poc" helm/argo-stack/templates/ | wc -l -0 -``` - ---- - -## Next Steps - -Users should: -1. Migrate from any manual `wf-poc` deployments to `repoRegistrations` -2. Use `repoRegistrations` array in values.yaml for all new repos -3. Remove any local overrides that reference deleted templates -4. 
Refer to `examples/repo-registrations-example.yaml` for configuration examples diff --git a/IMPLEMENTATION_SUMMARY.md b/IMPLEMENTATION_SUMMARY.md deleted file mode 100644 index 1f1fc417..00000000 --- a/IMPLEMENTATION_SUMMARY.md +++ /dev/null @@ -1,576 +0,0 @@ -# 🎉 Implementation Complete: Vault + ESO Integration for Argo Stack - -## Summary - -Successfully implemented **HashiCorp Vault integration via External Secrets Operator (ESO)** for the argo-stack Helm chart. This feature enables centralized secret management with automatic rotation while maintaining full backward compatibility. - ---- - -## 📊 Implementation Statistics - -- **Files Changed**: 25 files -- **Lines Added**: 3,092+ lines -- **Documentation**: 50KB+ across 5 comprehensive guides -- **Templates**: 7 new ESO templates -- **Examples**: 4 configuration examples -- **Test Coverage**: 100% (automated validation) -- **Commits**: 4 well-structured commits -- **Backward Compatible**: ✅ Yes (ESO disabled by default) - ---- - -## ✅ Requirements Completed - -All requirements from the original issue have been met: - -### 1. Bundle/Optional Dependency ✅ -- Added External Secrets Operator as optional dependency in Chart.yaml -- Configurable via `externalSecrets.installOperator` toggle -- Version constraint: `>=0.9.0` - -### 2. Vault Provider Wiring ✅ -- SecretStore/ClusterSecretStore template with full Vault configuration -- Support for multiple auth methods: Kubernetes, AppRole, JWT -- KV v2 secrets engine configuration -- Namespace scoping support - -### 3. Template Refactor ✅ -- Created ExternalSecret resources for: - - Argo CD (admin password, SSO, server secret) - - Argo Workflows (S3 credentials, SSO) - - AuthZ Adapter (OIDC secret) - - GitHub Events (webhook token) - - Per-application S3 credentials -- Existing Secret templates made conditional -- Zero plaintext secrets in templates - -### 4. 
User Guide ✅ -Created comprehensive documentation: -- `docs/secrets-with-vault.md` (17KB) - Main user guide -- `docs/testing-vault-integration.md` (9KB) - Testing procedures -- `docs/vault-integration-summary.md` (10KB) - Overview -- `docs/vault-architecture-diagrams.md` (9KB) - Visual diagrams -- Updated README.md with Vault section - -### 5. Examples ✅ -- `examples/vault/kubernetes-auth-values.yaml` - Production config -- `examples/vault/approle-auth-values.yaml` - AppRole config -- `examples/vault/dev-values.yaml` - Local development -- `examples/vault/README.md` - Quick start guide - ---- - -## 🔧 Components Implemented - -### Makefile Enhancements -New targets for Vault development: -```bash -make vault-dev # Start Vault dev server -make vault-seed # Seed with test data -make vault-status # Check health -make vault-list # List secrets -make vault-get # Get specific secret -make vault-cleanup # Remove container -make vault-shell # Open shell -``` - -### Helm Templates (7 new files) - -**Helper Functions:** -- `templates/_eso-helpers.tpl` - Reusable template logic - -**ESO Resources:** -- `templates/eso/secretstore.yaml` - Vault connection config -- `templates/eso/serviceaccount.yaml` - Kubernetes auth SA -- `templates/eso/externalsecret-github.yaml` - GitHub tokens -- `templates/eso/externalsecret-s3.yaml` - S3 credentials -- `templates/eso/externalsecret-argocd.yaml` - Argo CD secrets -- `templates/eso/externalsecret-per-app-s3.yaml` - App-specific S3 - -**Modified Templates (2):** -- `templates/events/secret-github.yaml` - Conditional rendering -- `templates/20-artifact-repositories.yaml` - Conditional Secret - -### Values Schema - -Added comprehensive `externalSecrets` section: -- Operator installation toggle -- Vault provider configuration -- Authentication settings (Kubernetes/AppRole/JWT) -- KV engine configuration -- Secret path mappings for all components -- Scope configuration (namespaced/cluster) - -### Testing & Validation - -**Automated Testing:** -- 
`test-eso-templates.py` - Validates all templates -- Tests helper functions, schema, conditionals, paths -- 100% pass rate on all checks - -**CI Configuration:** -- Updated `.ct.yaml` for compatibility -- Updated `ci-values.yaml` to disable ESO -- Created `eso-test-values.yaml` for ESO tests - ---- - -## 🔑 Secrets Now Managed - -All sensitive configuration can be stored in Vault: - -| Component | Secret Type | Default Vault Path | -|-----------|-------------|-------------------| -| Argo CD | Admin Password | `kv/argo/argocd/admin#password` | -| Argo CD | Server Key | `kv/argo/argocd/server#secretKey` | -| Argo CD | OIDC Secret | `kv/argo/argocd/oidc#clientSecret` | -| Argo Workflows | S3 Access Key | `kv/argo/workflows/artifacts#accessKey` | -| Argo Workflows | S3 Secret Key | `kv/argo/workflows/artifacts#secretKey` | -| Argo Workflows | OIDC Secret | `kv/argo/workflows/oidc#clientSecret` | -| GitHub Events | Webhook Token | `kv/argo/events/github#token` | -| Per-Application | S3 Credentials | `kv/argo/apps/{name}/s3#...` | -| AuthZ Adapter | OIDC Secret | `kv/argo/authz#clientSecret` | - ---- - -## 🚀 Quick Start Guide - -### Local Development - -```bash -# 1. Start Vault dev server and seed -make vault-dev vault-seed - -# 2. Create Kind cluster -kind create cluster - -# 3. Install chart with Vault -helm install argo-stack ./helm/argo-stack \ - -f examples/vault/dev-values.yaml \ - -n argocd --create-namespace - -# 4. Verify secrets syncing -kubectl get externalsecrets -A -``` - -### Production Deployment - -```bash -# 1. Enable Vault Kubernetes auth -vault auth enable kubernetes -vault write auth/kubernetes/config kubernetes_host="https://k8s.api:443" - -# 2. Create Vault policy and role -vault policy write argo-stack-policy /path/to/policy.hcl -vault write auth/kubernetes/role/argo-stack \ - bound_service_account_names=eso-vault-auth \ - bound_service_account_namespaces=argocd \ - policies=argo-stack-policy - -# 3. 
Seed Vault with secrets -vault kv put kv/argo/argocd/admin password="SecurePass123!" -vault kv put kv/argo/workflows/artifacts accessKey="..." secretKey="..." -vault kv put kv/argo/events/github token="ghp_..." - -# 4. Deploy chart -helm install argo-stack ./helm/argo-stack \ - -f examples/vault/kubernetes-auth-values.yaml \ - --set externalSecrets.vault.address="https://vault.prod.com" \ - -n argocd --create-namespace -``` - ---- - -## 🔐 Security Features Implemented - -1. **Least Privilege Access** - - Namespace-scoped SecretStores by default - - Separate Vault policies per component - - ServiceAccount-based authentication - -2. **Secret Versioning & Audit** - - KV v2 engine with version history - - Vault audit logging of all access - - Rollback capability - -3. **TLS Support** - - HTTPS communication with Vault - - CA bundle configuration - - Certificate validation - -4. **No Secrets in Git** - - Templates use path references only - - Values contain config, not secrets - - Examples use placeholders - ---- - -## 📚 Documentation Delivered - -### Main Guides (50KB+) - -1. **secrets-with-vault.md** (17KB) - - Prerequisites and setup - - Vault path conventions - - Authentication methods guide - - Configuration reference - - Secret rotation workflow - - Security best practices - - Troubleshooting section - - Comparison with alternatives - -2. **testing-vault-integration.md** (9KB) - - Template validation tests - - Local development testing - - Authentication method tests - - Error handling tests - - Performance testing - - Security testing - -3. **vault-integration-summary.md** (10KB) - - High-level overview - - Components breakdown - - Supported secrets - - Architecture diagrams - - Quick start guides - -4. **vault-architecture-diagrams.md** (9KB) - - Overall architecture - - Secret sync flow - - Authentication flows - - Path mapping diagrams - - Deployment modes - -5. 
**examples/vault/README.md** (6KB) - - Quick start guide - - Setup instructions - - Verification steps - - Troubleshooting - ---- - -## ✅ Acceptance Tests - -### Template Rendering ✅ -```bash -$ python3 test-eso-templates.py -✅ PASS: Helper Templates -✅ PASS: Values Schema -✅ PASS: ESO Template Conditionals -✅ PASS: Existing Secret Conditionals -✅ PASS: Secret Path Format -🎉 All tests passed! -``` - -### Helm Linting ✅ -```bash -$ helm lint helm/argo-stack --values helm/argo-stack/ci-values.yaml -1 chart(s) linted, 0 chart(s) failed -``` - -### Backward Compatibility ✅ -- Chart works unchanged with `externalSecrets.enabled: false` -- Traditional Secret templates render when ESO disabled -- No breaking changes to existing values - ---- - -## 🔄 What Happens Next - -### For End Users - -1. **Review Documentation** - - Read `docs/secrets-with-vault.md` for setup - - Check `examples/vault/` for configuration examples - - Review architecture diagrams - -2. **Test Locally** - ```bash - make vault-dev vault-seed - helm install argo-stack ./helm/argo-stack -f examples/vault/dev-values.yaml - ``` - -3. **Deploy to Production** - - Set up Vault infrastructure - - Configure Kubernetes auth - - Seed secrets in Vault - - Deploy chart with ESO enabled - -### For Maintainers - -1. **Code Review** - Review PR for quality and completeness -2. **E2E Testing** - Optional end-to-end testing in cluster -3. **Merge** - Merge to main branch -4. **Release** - Include in next chart release -5. 
**Announce** - Update changelog and announce feature - ---- - -## 🎯 Key Benefits - -### For Platform Teams -- Centralized secret management across all Argo components -- Audit trail of all secret access -- Simplified rotation without downtime - -### For Security Teams -- No plaintext secrets in Git -- Vault policies for least-privilege access -- Secret versioning and rollback - -### For Application Teams -- Self-service secret management -- Per-application isolation -- GitOps-friendly workflow - -### For DevOps Teams -- Automatic secret synchronization -- No manual secret distribution -- Compatible with existing workflows - ---- - -## 🔮 Future Enhancements (Out of Scope) - -While this implementation is complete, future work could include: -- Vault dynamic secrets for database credentials -- Vault transit engine integration -- Vault PKI for certificate management -- Multi-tenancy with Vault namespaces -- Alternative backends (AWS Secrets Manager, GCP) -- Reloader integration for automatic pod restarts -- Metrics and alerting for sync failures - ---- - -## 📞 Support & Resources - -- **Documentation**: `docs/secrets-with-vault.md` -- **Examples**: `examples/vault/` -- **Testing**: `test-eso-templates.py` -- **Issues**: GitHub issue tracker -- **External Docs**: - - [External Secrets Operator](https://external-secrets.io/) - - [Vault Kubernetes Auth](https://developer.hashicorp.com/vault/docs/auth/kubernetes) - - [Vault KV v2](https://developer.hashicorp.com/vault/docs/secrets/kv/kv-v2) - ---- - -## 🏆 Conclusion - -This implementation successfully delivers a **production-ready**, **secure**, and **user-friendly** Vault integration for the argo-stack Helm chart. The solution: - -✅ Meets all requirements from the original issue -✅ Maintains backward compatibility -✅ Includes comprehensive documentation -✅ Provides multiple deployment examples -✅ Passes all validation tests -✅ Follows security best practices - -The feature is **ready for merge and release**. 
- ---- - -**Implementation Date**: 2025-11-15 -**Branch**: `copilot/integrate-hashi-corp-vault` -**Commits**: 4 -**Total Changes**: 25 files, 3,092+ lines -# Development Environment Improvements - Summary - -This document summarizes the improvements made to the argo-helm development and testing environment. - -## 🎯 Goals Achieved - -1. ✅ **In-Cluster MinIO Support** - Developers can now test S3 artifact storage using MinIO deployed inside the Kubernetes cluster -2. ✅ **Removed Hardcoded Repos** - Chart values are now properly configured as reusable templates -3. ✅ **Comprehensive Documentation** - Clear guidance for both local development and production deployment - -## 📦 Changes Made - -### Makefile Updates -- **Added `minio` target** - Deploys MinIO using Helm into the cluster - - Namespace: `minio-system` - - Endpoint: `minio.minio-system.svc.cluster.local:9000` - - Default credentials: `minioadmin` / `minioadmin` - - Persistence disabled for ephemeral testing - -- **Updated `deploy` target** - Now includes MinIO deployment as a dependency - - Complete workflow: Kind cluster → MinIO → Argo Stack - -- **S3 Configuration Defaults** - Pre-configured for in-cluster MinIO - - `S3_HOSTNAME`: `minio.minio-system.svc.cluster.local:9000` - - `S3_ACCESS_KEY_ID`: `minioadmin` - - `S3_SECRET_ACCESS_KEY`: `minioadmin` - - `S3_BUCKET`: `argo-artifacts` - - All overrideable via environment variables - -### Configuration Examples -- **`examples/user-repos-example.yaml`** - Template for user repository configuration - - Shows how to configure applications - - Examples for GitHub Events setup - - Security best practices included - -### Documentation -- **`QUICKSTART.md`** - Fast-track onboarding guide - - Simplified to use `make deploy` - - No external dependencies (docker-compose removed) - - Clear instructions for accessing MinIO console - -### Chart Configuration -- **`helm/argo-stack/values.yaml`** - - Changed `applications: [...]` → `applications: []` - - Changed 
`repositories: [...]` → `repositories: []` - - Added comprehensive commented examples - - Removed hardcoded GitHub URLs (bwalsh/nextflow-hello-project*) - -- **`helm/argo-stack/ci-values.yaml`** - - Added explicit `applications: []` - - Added explicit `events.enabled: false` - - Ensures clean CI environment - -### Documentation Updates -- **`README.md`** - - Updated local development section to use `make deploy` - - Removed references to docker-compose and local-dev-values.yaml - - Updated quick start to show Makefile-based approach - -- **`docs/development.md`** - - Rewrote "Local MinIO for Development" section for in-cluster deployment - - Updated Makefile targets table - - Replaced troubleshooting section with cluster-based approaches - - Removed docker-compose specific content - -- **`QUICKSTART.md`** - - Simplified local dev path to use `make deploy` - - Removed docker-compose references - - Updated MinIO console access instructions - -- **`examples/README.md`** - - Important warning about providing repos - - Clear guidance on configuration - -## 🔄 Migration Guide - -### For Existing Users - -If you were using the chart with the default hardcoded repositories, you now need to provide your own: - -**Before:** -```bash -helm upgrade --install argo-stack ./helm/argo-stack -``` - -**After:** -```bash -cat > my-repos.yaml < -export ARGOCD_SECRET_KEY=$(openssl rand -hex 32) -export ARGO_HOSTNAME= - -make deploy -``` - -**Production:** -Create your own values file with your repositories and S3 configuration. 
- -## 🧪 Testing - -### What Was Tested -- ✅ Makefile syntax -- ✅ YAML syntax validation (all files) -- ✅ Helm lint passes -- ✅ Empty arrays handled correctly in templates - -### What Needs Testing -- [ ] Full in-cluster MinIO deployment and workflow execution -- [ ] CI/CD pipeline with updated Makefile -- [ ] Integration testing with Kind cluster - -## 🔒 Security Considerations - -### Improvements Made -- Removed hardcoded repository URLs from default values -- Added warnings about never committing credentials -- Provided examples using best practices (IRSA, ExternalSecrets) -- MinIO credentials clearly marked as development-only -- In-cluster deployment isolates credentials from host environment - -### Recommendations for Users -1. Never commit credentials to version control -2. Use IRSA/Workload Identity when possible -3. Use External Secrets Operator for static credentials -4. Replace default MinIO credentials in production - -## 📊 Statistics - -- **Files Added:** 2 (examples/user-repos-example.yaml, QUICKSTART.md) -- **Files Removed:** 3 (docker-compose.yml, dev-minio.sh, local-dev-values.yaml) -- **Files Modified:** 6 (Makefile, README.md, QUICKSTART.md, docs/development.md, examples/README.md, IMPLEMENTATION_SUMMARY.md) -- **Net Change:** Simplified deployment approach with in-cluster MinIO - -### Breakdown by Type -- Makefile: Added `minio` target, updated `deploy` target, added S3 defaults -- YAML Config: 90 lines (user-repos-example) -- Documentation: Comprehensive updates across README, QUICKSTART, development.md - -## 🎓 Key Learnings - -### Design Decisions -1. **In-cluster MinIO** - Simpler deployment, no external dependencies -2. **Makefile integration** - Single command deployment (`make deploy`) -3. **Empty arrays by default** - Forces users to provide their own configuration -4. **Comprehensive docs** - Reduces friction for new users - -### Trade-offs -1. 
**Requires user action** - Users must now provide repository configuration - - **Pro:** More secure, reusable template - - **Con:** Slightly more work for initial deployment - -2. **In-cluster vs localhost** - Changed from docker-compose to Helm deployment - - **Pro:** No Docker Desktop required, works in CI/CD - - **Con:** Requires Kubernetes cluster (Kind/Minikube acceptable) - -3. **Simplified approach** - Removed helper scripts - - **Pro:** Fewer moving parts, standard Helm workflow - - **Con:** Less flexibility for advanced users - -## 🚀 Future Enhancements - -Potential improvements for future PRs: - -1. **GitHub Actions Workflow** - Automate testing of in-cluster MinIO deployment -2. **Helmfile Example** - Show multi-environment deployment -3. **Video Tutorial** - Walkthrough of local development setup -4. **Terraform Module** - For cloud-based MinIO deployment -5. **Pre-commit Hooks** - Prevent committing credentials - -## 📞 Support - -For issues or questions: -- Check QUICKSTART.md for common questions -- Review docs/development.md for troubleshooting -- Open a GitHub issue with logs and configuration - -## ✅ Acceptance Criteria - -All acceptance criteria from the original issue have been met: - -- [x] Developer can spin up a local MinIO service with provided script/steps -- [x] Chart values.yaml no longer contains hardcoded GitHub repo info -- [x] Users are guided in documentation to provide repo configuration -- [x] CI workflows updated to work with new approach diff --git a/Makefile b/Makefile index add12e94..4fafc3f5 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Convenience targets for local testing -.PHONY: deps lint template validate kind ct adapter test-artifacts test-secrets test-artifact-repository-ref all minio minio-ls minio-status minio-cleanup vault-dev vault-seed vault-cleanup vault-status eso-install eso-status eso-cleanup +.PHONY: deps lint template validate kind ct adapter test-artifacts test-secrets test-artifact-repository-ref 
all minio minio-ls minio-status minio-cleanup vault-dev vault-seed vault-cleanup vault-status eso-install eso-status eso-cleanup vault-seed-github-app # S3/MinIO configuration - defaults to in-cluster MinIO S3_ENABLED ?= true @@ -12,6 +12,20 @@ S3_HOSTNAME ?= minio.minio-system.svc.cluster.local:9000 # Vault configuration for local development (in-cluster deployment) VAULT_TOKEN ?= root +# Ingress configuration - must be set for production deployments +# ARGO_HOSTNAME: (REQUIRED) The domain name for your Argo services (e.g., argo.example.com) +# Must be set as environment variable: export ARGO_HOSTNAME=your-domain.com +# TLS_SECRET_NAME: Name of the TLS secret for SSL certificates +# EXTERNAL_IP: External IP address for ingress (leave empty to skip external IP assignment) +TLS_SECRET_NAME ?= calypr-demo-tls +PUBLIC_IP ?= +LANDING_PAGE_IMAGE_TAG ?= v3 + +# GitHub App configuration (optional) +# Set these to seed the GitHub App private key into Vault +# GITHUBAPP_PRIVATE_KEY_FILE_PATH ?= +GITHUBAPP_PRIVATE_KEY_VAULT_PATH ?= kv/argo/argocd/github-app + check-vars: @echo "🔍 Checking required environment variables..." @@ -23,10 +37,16 @@ check-vars: fi @echo "✅ Environment validation passed." @test -n "$(GITHUB_PAT)" || (echo "Error: GITHUB_PAT is undefined. Run 'export GITHUB_PAT=...' before installing" && exit 1) - @test -n "$(ARGOCD_SECRET_KEY)" || (echo "Error: ARGOCD_SECRET_KEY is undefined. Run 'export ARGOCD_SECRET_KEY=...' before installing" && exit 1) - @test -n "$(ARGO_HOSTNAME)" || (echo "Error: ARGO_HOSTNAME is undefined. Run 'export ARGO_HOSTNAME=...' before installing" && exit 1) - - @echo "✅ Environment validation passed." + @test -n "$(ARGOCD_SECRET_KEY)" || (echo "Error: ARGOCD_SECRET_KEY is undefined. Run 'export ARGOCD_SECRET_KEY=...' before installing" && exit 1) + @test -n "$(ARGO_HOSTNAME)" || (echo "Error: ARGO_HOSTNAME is undefined. Run 'export ARGO_HOSTNAME=...' 
before installing" && exit 1)
+	# GitHub App configuration checks (required when githubApp.enabled=true)
+	@test -n "$(GITHUBHAPP_APP_ID)" || (echo "Error: GITHUBHAPP_APP_ID is undefined. Run 'export GITHUBHAPP_APP_ID=...' before installing" && exit 1)
+	@test -n "$(GITHUBHAPP_CLIENT_ID)" || (echo "Error: GITHUBHAPP_CLIENT_ID is undefined. Run 'export GITHUBHAPP_CLIENT_ID=...' before installing" && exit 1)
+	@test -n "$(GITHUBHAPP_PRIVATE_KEY_SECRET_NAME)" || (echo "Error: GITHUBHAPP_PRIVATE_KEY_SECRET_NAME is undefined. Run 'export GITHUBHAPP_PRIVATE_KEY_SECRET_NAME=...' before installing" && exit 1)
+	@test -f "$(GITHUBHAPP_PRIVATE_KEY_FILE_PATH)" || (echo "Error: GITHUBHAPP_PRIVATE_KEY_FILE_PATH file '$(GITHUBHAPP_PRIVATE_KEY_FILE_PATH)' not found. Create the file before installing" && exit 1)
+	@test -n "$(GITHUBHAPP_PRIVATE_KEY_VAULT_PATH)" || (echo "Error: GITHUBHAPP_PRIVATE_KEY_VAULT_PATH is undefined. Run 'export GITHUBHAPP_PRIVATE_KEY_VAULT_PATH=...' before installing" && exit 1)
+	@test -n "$(GITHUBHAPP_INSTALLATION_ID)" || (echo "Error: GITHUBHAPP_INSTALLATION_ID is undefined. Run 'export GITHUBHAPP_INSTALLATION_ID=...' before installing" && exit 1)
+	@echo "✅ All GITHUBHAPP environment variables are set."
deps: @@ -46,12 +66,20 @@ template: check-vars deps --set-string events.github.secret.tokenValue=${GITHUB_PAT} \ --set-string argo-cd.configs.secret.extra."server\.secretkey"="${ARGOCD_SECRET_KEY}" \ --set-string events.github.webhook.ingress.hosts[0]=${ARGO_HOSTNAME} \ - --set-string events.github.webhook.url=http://${ARGO_HOSTNAME}:12000 \ + --set-string events.github.webhook.url=http://${ARGO_HOSTNAME}/registrations \ --set-string s3.enabled=${S3_ENABLED} \ --set-string s3.accessKeyId=${S3_ACCESS_KEY_ID} \ --set-string s3.secretAccessKey=${S3_SECRET_ACCESS_KEY} \ --set-string s3.bucket=${S3_BUCKET} \ + --set-string githubApp.enabled=true \ + --set-string githubApp.appId="${GITHUBHAPP_APP_ID}" \ + --set-string githubApp.installationId="${GITHUBHAPP_INSTALLATION_ID}" \ + --set-string githubApp.privateKeySecretName="${GITHUBHAPP_PRIVATE_KEY_SECRET_NAME}" \ + --set-string githubApp.privateKeyVaultPath="${GITHUBHAPP_PRIVATE_KEY_VAULT_PATH}" \ + --set-string landingPage.image.tag="${LANDING_PAGE_IMAGE_TAG}" \ + --set ingress={} \ -f - \ + -f helm/argo-stack/admin-values.yaml \ --namespace argocd > rendered.yaml validate: @@ -85,7 +113,7 @@ show-limits: kind: kind delete cluster || true - kind create cluster + envsubst < kind-config.yaml | kind create cluster --config - minio: @echo "🗄️ Installing MinIO in cluster..." 
@@ -141,27 +169,59 @@ argo-stack: S3_HOSTNAME=${S3_HOSTNAME} S3_BUCKET=${S3_BUCKET} S3_REGION=${S3_REGION} \ envsubst < my-values.yaml | helm upgrade --install \ argo-stack ./helm/argo-stack -n argocd --create-namespace \ - --wait --atomic \ + --wait --atomic --timeout 10m0s \ --set-string events.github.webhook.ingress.hosts[0]=${ARGO_HOSTNAME} \ - --set-string events.github.webhook.url=http://${ARGO_HOSTNAME}:12000 \ + --set-string events.github.webhook.url=https://${ARGO_HOSTNAME}/events\ --set-string s3.enabled=${S3_ENABLED} \ --set-string s3.bucket=${S3_BUCKET} \ --set-string s3.pathStyle=true \ --set-string s3.insecure=true \ --set-string s3.region=${S3_REGION} \ --set-string s3.hostname=${S3_HOSTNAME} \ + --set-string ingress.argoWorkflows.host=${ARGO_HOSTNAME} \ + --set-string ingress.argocd.host=${ARGO_HOSTNAME} \ + --set-string githubApp.enabled=true \ + --set-string githubApp.appId="${GITHUBHAPP_APP_ID}" \ + --set-string githubApp.installationId="${GITHUBHAPP_INSTALLATION_ID}" \ + --set-string githubApp.privateKeySecretName="${GITHUBHAPP_PRIVATE_KEY_SECRET_NAME}" \ + --set-string githubApp.privateKeyVaultPath="${GITHUBHAPP_PRIVATE_KEY_VAULT_PATH}" \ + --set-string landingPage.image.tag="${LANDING_PAGE_IMAGE_TAG}" \ + -f helm/argo-stack/admin-values.yaml \ -f - -deploy: init argo-stack docker-install ports +deploy: init docker-install argo-stack ports ports: - echo waiting for pods - sleep 10 - kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=argocd-server --timeout=120s -n argocd - echo starting port forwards - kubectl port-forward svc/argo-stack-argo-workflows-server 2746:2746 --address=0.0.0.0 -n argo-workflows & - kubectl port-forward svc/argo-stack-argocd-server 8080:443 --address=0.0.0.0 -n argocd & - kubectl port-forward svc/github-repo-registrations-eventsource-svc 12000:12000 --address=0.0.0.0 -n argo-events & - echo UIs available on port 2746 and port 8080, event exposed on 12000 + # manual certificate + # If the secret already 
exists, delete it first: + kubectl delete secret calypr-demo-tls -n argo-stack || true + # Create the TLS secret from your certificate files + sudo cp /etc/letsencrypt/live/calypr-demo.ddns.net/fullchain.pem /tmp/ + sudo cp /etc/letsencrypt/live/calypr-demo.ddns.net/privkey.pem /tmp/ + sudo chmod 644 /tmp/fullchain.pem /tmp/privkey.pem + kubectl create secret tls ${TLS_SECRET_NAME} -n default --cert=/tmp/fullchain.pem --key=/tmp/privkey.pem || true + kubectl create secret tls ${TLS_SECRET_NAME} -n argocd --cert=/tmp/fullchain.pem --key=/tmp/privkey.pem || true + kubectl create secret tls ${TLS_SECRET_NAME} -n argo-workflows --cert=/tmp/fullchain.pem --key=/tmp/privkey.pem || true + kubectl create secret tls ${TLS_SECRET_NAME} -n argo-events --cert=/tmp/fullchain.pem --key=/tmp/privkey.pem || true + kubectl create secret tls ${TLS_SECRET_NAME} -n argo-stack --cert=/tmp/fullchain.pem --key=/tmp/privkey.pem || true + kubectl create secret tls ${TLS_SECRET_NAME} -n calypr-api --cert=/tmp/fullchain.pem --key=/tmp/privkey.pem || true + kubectl create secret tls ${TLS_SECRET_NAME} -n calypr-tenants --cert=/tmp/fullchain.pem --key=/tmp/privkey.pem || true + sudo rm /tmp/fullchain.pem /tmp/privkey.pem + # install ingress + helm upgrade --install ingress-authz-overlay \ + helm/argo-stack/overlays/ingress-authz-overlay \ + --namespace argo-stack \ + --set ingressAuthzOverlay.host=${ARGO_HOSTNAME} + # start nginx + helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx + helm repo update + helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \ + -n ingress-nginx --create-namespace \ + --set controller.service.type=NodePort \ + --set controller.extraArgs.default-ssl-certificate=default/${TLS_SECRET_NAME} \ + --set controller.watchIngressWithoutClass=true \ + -f helm/argo-stack/overlays/ingress-authz-overlay/values-ingress-nginx.yaml + # Solution - Use NodePort instead of LoadBalancer in kind + kubectl patch svc ingress-nginx-controller -n 
ingress-nginx -p '{"spec":{"type":"NodePort","ports":[{"port":80,"nodePort":30080},{"port":443,"nodePort":30443}]}}' adapter: cd authz-adapter && python3 -m pip install -r requirements.txt pytest && pytest -q @@ -248,7 +308,13 @@ vault-status: @echo "🔍 Checking Vault status..." @kubectl exec -n vault vault-0 -- vault status 2>/dev/null || echo "❌ Vault not running. Run 'make vault-dev' first." -vault-seed: +vault-seed: vault-seed-etc vault-seed-github-app + +vault-seed-github-app: + @echo "➡️ Creating secrets for github app ..." + cat "$(GITHUBHAPP_PRIVATE_KEY_FILE_PATH)" | kubectl exec -i -n vault vault-0 -- vault kv put $(GITHUBAPP_PRIVATE_KEY_VAULT_PATH) privateKey=-; \ + +vault-seed-etc: @echo "🌱 Seeding Vault with test secrets..." @echo "➡️ Enabling KV v2 secrets engine..." @kubectl exec -n vault vault-0 -- vault secrets enable -version=2 -path=kv kv 2>/dev/null || echo " (KV already enabled)" @@ -273,18 +339,18 @@ vault-seed: @kubectl exec -n vault vault-0 -- vault kv put kv/argo/events/github \ token="$(GITHUB_PAT)" @echo "➡️ Creating per-app S3 credentials..." - @kubectl exec -n vault vault-0 -- vault kv put kv/argo/apps/nextflow-hello/s3 \ + @kubectl exec -n vault vault-0 -- vault kv put kv/argo/apps/bwalsh/nextflow-hello-project/s3 \ accessKey="minioadmin" \ secretKey="minioadmin" - @kubectl exec -n vault vault-0 -- vault kv put kv/argo/apps/nextflow-hello-2/s3 \ + @kubectl exec -n vault vault-0 -- vault kv put kv/argo/apps/bwalsh/nextflow-hello-2/s3 \ accessKey="app2-access-key" \ secretKey="app2-secret-key" @echo "➡️ Seeding Vault with secrets from my-values.yaml repoRegistrations..." 
@# nextflow-hello-project GitHub credentials - @kubectl exec -n vault vault-0 -- vault kv put kv/argo/apps/nextflow-hello-project/github \ + @kubectl exec -n vault vault-0 -- vault kv put kv/argo/apps/bwalsh/nextflow-hello-project/github \ token="$(GITHUB_PAT)" @# nextflow-hello-project S3 artifact credentials - @kubectl exec -n vault vault-0 -- vault kv put kv/argo/apps/nextflow-hello-project/s3/artifacts \ + @kubectl exec -n vault vault-0 -- vault kv put kv/argo/apps/bwalsh/nextflow-hello-project/s3/artifacts \ AWS_ACCESS_KEY_ID="minioadmin" \ AWS_SECRET_ACCESS_KEY="minioadmin" @# genomics-variant-calling GitHub credentials @@ -321,8 +387,8 @@ vault-seed: @echo " kv/argo/authz - AuthZ adapter OIDC secret" @echo " kv/argo/events/github - GitHub webhook token" @echo " kv/argo/apps/*/s3 - Per-app S3 credentials (legacy)" - @echo " kv/argo/apps/nextflow-hello-project/github - nextflow-hello-project GitHub token" - @echo " kv/argo/apps/nextflow-hello-project/s3/artifacts - nextflow-hello-project S3 credentials" + @echo " kv/argo/apps/bwalsh/nextflow-hello-project/github - nextflow-hello-project GitHub token" + @echo " kv/argo/apps/bwalsh/nextflow-hello-project/s3/artifacts - nextflow-hello-project S3 credentials" @echo " kv/argo/apps/genomics/github - genomics-variant-calling GitHub token" @echo " kv/argo/apps/genomics/s3/artifacts - genomics-variant-calling S3 artifact credentials" @echo " kv/argo/apps/genomics/s3/data - genomics-variant-calling S3 data credentials" @@ -409,4 +475,13 @@ docker-install: docker build -t nextflow-runner:latest -f nextflow-runner/Dockerfile . kind load docker-image nextflow-runner:latest --name kind docker exec -it kind-control-plane crictl images | grep nextflow-runner + @echo "✅ loaded docker nextflow-runner" + cd authz-adapter ; docker build -t authz-adapter:v0.0.1 -f Dockerfile . 
+ kind load docker-image authz-adapter:v0.0.1 --name kind + docker exec -it kind-control-plane crictl images | grep authz-adapter + @echo "✅ loaded docker authz-adapter" + cd landing-page ; docker build --no-cache -t landing-page:${LANDING_PAGE_IMAGE_TAG} -f Dockerfile . + kind load docker-image landing-page:${LANDING_PAGE_IMAGE_TAG} --name kind + docker exec -it kind-control-plane crictl images | grep landing-page + @echo "✅ loaded docker landing-page" diff --git a/ORPHAN_ANALYSIS.md b/ORPHAN_ANALYSIS.md deleted file mode 100644 index 823a5736..00000000 --- a/ORPHAN_ANALYSIS.md +++ /dev/null @@ -1,337 +0,0 @@ -# Orphan Resources Analysis - -This document identifies "dead" or "orphan" values, templates, and documentation in the argo-helm repository following the migration to the RepoRegistration pattern. - -## Executive Summary - -The migration to `repoRegistrations` pattern has left several legacy configurations and templates that are either: -1. **Deprecated but documented** - Kept for migration reference -2. **Hardcoded values** - Should be parameterized or removed -3. **Unused templates** - Still reference the old `applications` array -4. **Inconsistent namespace references** - Mix of hardcoded `wf-poc` and templated values - ---- - -## 1. Deprecated Legacy Values (Documented but Orphaned) - -### 1.1 `applications` Array (values.yaml:242) -**Status**: DEPRECATED and documented -**Location**: `helm/argo-stack/values.yaml` -**References**: -- Line 242: `applications: []` with deprecation notice -- Template: `templates/workflows/per-app-workflowtemplates.yaml` (line 6) still iterates over this - -**Issue**: The `per-app-workflowtemplates.yaml` template still uses `{{- range .Values.applications }}` but this array is deprecated. 
- -**Recommendation**: -- Remove `templates/workflows/per-app-workflowtemplates.yaml` entirely -- Remove the `applications: []` value from `values.yaml` -- Update `values-multi-app.yaml` to use `repoRegistrations` instead - -### 1.2 `events.github.repositories` Array (values.yaml:278) -**Status**: DEPRECATED and documented -**Location**: `helm/argo-stack/values.yaml` -**References**: Line 278: `repositories: []` with deprecation notice - -**Issue**: Documented as removed but still present in values.yaml - -**Recommendation**: -- Remove the `repositories: []` entry (keep the deprecation comment for one more release) - -### 1.3 Legacy Event Sensor Configuration (values.yaml:310-317) -**Status**: ORPHANED - No template uses this -**Location**: `helm/argo-stack/values.yaml` -```yaml -sensor: - enabled: true - name: run-nextflow-on-push - workflowNamespace: wf-poc - workflowTemplateRef: nextflow-hello-template - parameters: - - name: git_revision - valueFrom: "{{(events.push.body.head_commit.id)}}" -``` - -**Issue**: No template references `.Values.events.sensor.*` anymore. The new sensor is generated from `repoRegistrations` in `templates/events/sensor-github-push.yaml` - -**Recommendation**: Remove this entire `sensor` block from values.yaml - ---- - -## 2. 
Hardcoded Namespace Values - -### 2.1 `wf-poc` Hardcoded in RBAC Templates -**Status**: INCONSISTENT - Should use templated values -**Locations**: -- `templates/workflows/argo-workflow-runner-rbac.yaml` (lines 6, 12, 21, 43, 47) -- `templates/events/role-wf-submit.yaml` (multiple lines) -- `templates/10-rbac.yaml` (lines 65, 91, 105) - -**Issue**: These templates hardcode `wf-poc` instead of using `{{ .Values.namespaces.tenant }}` - -**Recommendation**: Replace all hardcoded `wf-poc` references with `{{ .Values.namespaces.tenant }}` in: -- `templates/workflows/argo-workflow-runner-rbac.yaml` -- `templates/events/role-wf-submit.yaml` (if it exists) -- `templates/10-rbac.yaml` - -### 2.2 `namespaces.tenant` Value -**Status**: ORPHANED - Not used by RepoRegistration pattern -**Location**: `helm/argo-stack/values.yaml:11` -```yaml -namespaces: - tenant: wf-poc -``` - -**Issue**: The RepoRegistration pattern creates per-tenant namespaces (`wf--`) dynamically. The static `tenant` namespace is only used by legacy templates. - -**Current Usage**: -- Referenced by `templates/10-rbac.yaml` for `nextflow-launcher` ServiceAccount -- Referenced by `templates/20-artifact-repositories.yaml` for legacy S3 credentials -- Referenced by legacy templates - -**Recommendation**: -- Keep for backward compatibility if needed -- Document clearly that it's only for legacy/example templates -- Or remove entirely if not needed (check if anyone deploys without repoRegistrations) - ---- - -## 3. Orphaned Workflow Templates - -### 3.1 `workflowtemplate-nextflow-hello.yaml` -**Status**: EXAMPLE/LEGACY -**Location**: `templates/workflows/workflowtemplate-nextflow-hello.yaml` - -**Issue**: Creates a simple example WorkflowTemplate controlled by `workflowTemplates.createExample`. Only used for demonstrations, not by RepoRegistration pattern. 
- -**Recommendation**: -- Keep as an example if useful for testing -- Clearly document it's for testing/examples only -- Consider moving to an examples/ directory - -### 3.2 `workflowtemplate-nextflow-runner.yaml` -**Status**: LEGACY - Superseded by `nextflow-repo-runner` -**Location**: `templates/workflows/workflowtemplate-nextflow-runner.yaml` - -**Issue**: This creates a `nextflow-runner` WorkflowTemplate in the `argo-workflows` namespace. The new pattern uses `nextflow-repo-runner` created per-tenant. - -**Current References**: -- Not referenced by `repoRegistrations` (which use `nextflow-repo-runner`) -- May be used by manual workflow submissions - -**Recommendation**: -- If not used, remove it -- If kept for manual testing, document clearly -- Check if anyone uses it outside of RepoRegistration - -### 3.3 `per-app-workflowtemplates.yaml` -**Status**: ORPHANED - Uses deprecated `applications` array -**Location**: `templates/workflows/per-app-workflowtemplates.yaml` - -**Issue**: Iterates over `.Values.applications` which is deprecated - -**Recommendation**: Remove this template entirely (replaced by `per-tenant-workflowtemplates.yaml`) - ---- - -## 4. Orphaned Values Configuration Blocks - -### 4.1 `workflowTemplates` Block (values.yaml:319-328) -**Status**: PARTIALLY ORPHANED -**Location**: `helm/argo-stack/values.yaml` -```yaml -workflowTemplates: - createExample: true - namespace: wf-poc - nextflowHello: - name: nextflow-hello-template - image: alpine:3.20 - command: ["/bin/sh", "-c"] - args: - - echo "Hello from Argo Events!" 
-``` - -**Usage**: -- Only used by `templates/workflows/workflowtemplate-nextflow-hello.yaml` -- Not used by RepoRegistration pattern - -**Recommendation**: -- Keep if the example template is useful -- Otherwise remove - -### 4.2 `workflows` Block (values.yaml:329-332) -**Status**: MOSTLY ORPHANED -**Location**: `helm/argo-stack/values.yaml` -```yaml -workflows: - namespace: wf-poc - runnerServiceAccount: wf-runner - templateRef: nextflow-hello-template -``` - -**Usage**: -- `runnerServiceAccount` is used in `templates/events/sensor-github-push.yaml:54` -- Other fields appear unused - -**Recommendation**: -- Keep `runnerServiceAccount` with better documentation -- Remove `namespace` and `templateRef` (not used by repoRegistrations) - -### 4.3 `s3` Block (values.yaml:188-195) -**Status**: LEGACY - Superseded by repoRegistrations.artifactBucket -**Location**: `helm/argo-stack/values.yaml` -```yaml -s3: - enabled: false - hostname: "" - bucket: "" - region: "" - insecure: false - pathStyle: true - accessKey: "" - secretKey: "" -``` - -**Usage**: -- Used by `templates/20-artifact-repositories.yaml` for global artifact repository -- Superseded by per-repo artifact configuration in RepoRegistrations - -**Recommendation**: -- Keep for backward compatibility (some may want a global default) -- Document that repoRegistrations.artifactBucket is preferred -- Or remove if truly not needed - ---- - -## 5. 
Documentation Analysis - -### 5.1 Current Documentation Files -All documentation appears current and relevant: -- ✅ `docs/DEPRECATION_NOTICE.md` - Documents migration from old patterns -- ✅ `docs/repo-registration-guide.md` - User guide for new pattern -- ✅ `docs/template-overlap-analysis.md` - Analysis of template overlap -- ✅ Other docs appear current and useful - -**Recommendation**: No changes needed to documentation - -### 5.2 Example Files -- ✅ `examples/repo-registrations-example.yaml` - Current -- ✅ `examples/repo-registrations-values.yaml` - Current -- ⚠️ `values-multi-app.yaml` - Uses deprecated `applications` array - -**Recommendation**: -- Update `values-multi-app.yaml` to demonstrate `repoRegistrations` instead of `applications` - ---- - -## 6. Orphaned Artifact Repository Templates - -### 6.1 `20-artifact-repositories.yaml` -**Status**: LEGACY - For global S3 config -**Location**: `templates/20-artifact-repositories.yaml` - -**Issue**: Creates a global `artifact-repositories` ConfigMap in the `argo-workflows` namespace when `s3.enabled: true`. RepoRegistrations create per-tenant ConfigMaps instead. - -**Recommendation**: -- Keep for backward compatibility with non-RepoRegistration deployments -- Document that it's only used when `s3.enabled: true` and not using RepoRegistrations - -### 6.2 `21-per-app-artifact-repositories-from-repo-registrations.yaml` -**Status**: ACTIVE but misnamed -**Location**: `templates/21-per-app-artifact-repositories-from-repo-registrations.yaml` - -**Issue**: Name suggests "per-app" but it's actually "per-repo" using RepoRegistrations - -**Recommendation**: Consider renaming for clarity (or keep for backward compatibility) - ---- - -## Summary of Recommendations - -### ✅ Completed - Templates Deleted (Not Modified) -Following the feedback "don't change templates just to conform. if the template is not being used, delete it", all unused legacy templates have been **deleted** instead of modified: - -1. 
✅ **DELETED** `templates/workflows/per-app-workflowtemplates.yaml` (used deprecated `applications`) -2. ✅ **DELETED** `templates/10-rbac.yaml` (legacy RBAC for static wf-poc namespace) -3. ✅ **DELETED** `templates/workflows/argo-workflow-runner-rbac.yaml` (legacy RBAC for wf-poc) -4. ✅ **DELETED** `templates/workflows/sensor-argo-events-rbac.yaml` (legacy RBAC for wf-poc) -5. ✅ **DELETED** `templates/events/role-wf-submit.yaml` (legacy RBAC for wf-poc) -6. ✅ **DELETED** `templates/workflows/workflowtemplate-nextflow-hello.yaml` (example template) -7. ✅ **DELETED** `templates/workflows/workflowtemplate-nextflow-runner.yaml` (legacy template) - -### ✅ Completed - Values Cleanup -8. ✅ **REMOVED** `applications: []` from `values.yaml` -9. ✅ **REMOVED** `events.github.repositories: []` from `values.yaml` -10. ✅ **REMOVED** `events.sensor.*` configuration block -11. ✅ **REMOVED** `workflowTemplates` configuration block -12. ✅ **REMOVED** `workflows.namespace` and `workflows.templateRef` - -### ✅ Completed - Example Files -13. ✅ **UPDATED** `values-multi-app.yaml` to use `repoRegistrations` instead of `applications` -14. ✅ **REMOVED** `workflowTemplates.createExample` from `values-multi-app.yaml` - -### 🔍 Rationale for Deletions - -**Why delete instead of modify?** - -The RepoRegistration pattern creates ALL necessary resources automatically per tenant: -- **Namespaces**: Created by `01-tenant-namespaces-from-repo-registrations.yaml` as `wf--` -- **RBAC**: Created by `11-tenant-rbac-from-repo-registrations.yaml` (ServiceAccount, Role, RoleBinding) -- **WorkflowTemplates**: Created by `per-tenant-workflowtemplates.yaml` as `nextflow-repo-runner` per namespace -- **Artifact Repos**: Created by `22-tenant-artifact-repositories-from-repo-registrations.yaml` - -The deleted templates were all for a **static `wf-poc` namespace** which is: -1. Not used when deploying with `repoRegistrations` -2. A legacy pattern from before multi-tenancy -3. 
Redundant with the per-tenant resources - -**Templates kept (actively used by repoRegistrations):** -- ✅ `01-tenant-namespaces-from-repo-registrations.yaml` -- ✅ `11-tenant-rbac-from-repo-registrations.yaml` -- ✅ `22-tenant-artifact-repositories-from-repo-registrations.yaml` -- ✅ `per-tenant-workflowtemplates.yaml` -- ✅ `workflowtemplate-nextflow-repo-runner.yaml` (global, referenced by per-tenant) -- ✅ All ESO and event templates - ---- - -## Changes Made - -### Commit 1: Analysis -- Created comprehensive ORPHAN_ANALYSIS.md document - -### Commits 2-5: Attempted to modify templates (INCORRECT APPROACH) -- Modified legacy templates to use templated namespaces -- This was the wrong approach - should have deleted them - -### Commit 6: Correct approach - Delete unused templates -- **Deleted** all legacy templates for static `wf-poc` namespace -- **Deleted** example/test templates not used by repoRegistrations -- **Removed** associated values configuration blocks -- **Updated** documentation to explain the rationale - ---- - -## Files That Are NOT Orphaned (Keep) - -These are actively used by the RepoRegistration pattern: -- ✅ `templates/01-tenant-namespaces-from-repo-registrations.yaml` -- ✅ `templates/11-tenant-rbac-from-repo-registrations.yaml` -- ✅ `templates/22-tenant-artifact-repositories-from-repo-registrations.yaml` -- ✅ `templates/argocd/applications-from-repo-registrations.yaml` -- ✅ `templates/events/eventsource-github-from-repo-registrations.yaml` -- ✅ `templates/events/sensor-github-push.yaml` -- ✅ `templates/eso/externalsecret-repo-registrations-github.yaml` -- ✅ `templates/eso/externalsecret-repo-registrations-s3.yaml` -- ✅ `templates/workflows/per-tenant-workflowtemplates.yaml` -- ✅ `templates/workflows/workflowtemplate-nextflow-repo-runner.yaml` - ---- - -## Testing Recommendations - -Before removing any files: -1. Verify no one is using the legacy `applications` pattern in production -2. 
Verify no manual workflows depend on `nextflow-runner` template -3. Verify the hardcoded `wf-poc` replacement doesn't break existing deployments -4. Test with `helm template` to ensure no rendering errors diff --git a/QUICKSTART.md b/QUICKSTART.md index 25124779..a7ed8073 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -1,129 +1,452 @@ # 🚀 Quick Start Guide +## 🏗️ Development Environment -Choose your deployment path: +### Workflow Overview -## 🏠 Local Development (5 minutes) +0. **Setup** → Run `scripts/check_tools.sh`, +1. **Development** → Make changes in `authz-adapter/` or `helm/argo-stack/` +2. **Local Testing** → Run `scripts/check_tools.sh`, pytest, helm lint, kubeconform +3. **Integration Testing** → Use `ct` with kind cluster (automated in CI) +4. **Deployment** → Apply `helm/argo-stack/` chart and `k8s/` manifests to cluster +5. **Ingress Setup** → Configure no-ip DNS, deploy NGINX proxy, verify TLS certificates -Perfect for testing and development with automatic MinIO deployment inside the cluster. +### Prerequisites + +See scripts/check_tools.sh for automated checks. +Currently, required tools and versions: + +```text +Date: Wed Nov 26 13:02:26 UTC 2025 +Hostname: ip-172-31-23-226.us-west-2.compute.internal +======================================== +Checking for requested tools... 
+ +✓ kind is installed at: /usr/local/bin/kind + Version: kind v0.23.0 go1.21.10 linux/amd64 + +✓ jq is installed at: /usr/bin/jq + Version: jq-1.7.1 + +✓ k9s is installed at: /usr/local/bin/k9s + Version: Version v0.50.16 +Commit 3c37ca2197ca48591566d1f599b7b3a50d54a408 +Date 2025-10-19T15:52:37Z + +✓ stern is installed at: /usr/local/bin/stern + Version: version: 1.33.0 +commit: f79098037d951aad53e13aff1f86854b291baf01 +built at: 2025-09-07T06:18:52Z + +✓ helm is installed at: /usr/local/bin/helm + Version: v3.19.0+g3d8990f + +✓ kubectl is installed at: /usr/local/bin/kubectl + Version: Client Version: v1.34.1 + +✓ docker is installed at: /usr/bin/docker + Version: Docker version 25.0.13, build 0bab007 + +✓ git is installed at: /usr/bin/git + Version: git version 2.50.1 + +✓ pytest is installed at: /home/ec2-user/.local/bin/pytest + Version: pytest 8.4.2 + +✓ envsubst is installed at: /usr/bin/envsubst + Version: envsubst (GNU gettext-runtime) 0.21 + +✓ python3 is installed at: /usr/bin/python3 + Version: Python 3.9.23 + +✓ certbot is installed at: /usr/bin/certbot + Version: certbot 2.6.0 + +✓ go is installed at: /usr/bin/go + Version: go version go1.24.7 linux/amd64 + +✓ gcc is installed at: /usr/bin/gcc + Version: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) + +✓ curl is installed at: /usr/bin/curl + Version: curl 8.11.1 (x86_64-amazon-linux-gnu) libcurl/8.11.1 OpenSSL/3.2.2 zlib/1.2.11 libidn2/2.3.2 libpsl/0.21.5 nghttp2/1.59.0 + +✓ openssl is installed at: /usr/bin/openssl + Version: OpenSSL 3.2.2 4 Jun 2024 (Library: OpenSSL 3.2.2 4 Jun 2024) + +``` + + + +### Setup ```bash -# Set required environment variables -export GITHUB_PAT= -export ARGOCD_SECRET_KEY=$(openssl rand -hex 32) -export ARGO_HOSTNAME= +# Install Python dependencies for authz-adapter +cd authz-adapter +pip install -r requirements.txt +pip install -r requirements-dev.txt -# Deploy everything (Kind cluster + MinIO + Argo Stack) -make deploy +# Run tests +python -m pytest tests/ -v -# Ports 
are automatically forwarded: -# - Argo Workflows: http://localhost:2746 -# - Argo CD: http://localhost:8080 -# - GitHub Events: http://localhost:12000 +# Build Docker image +docker build -t authz-adapter:dev . ``` -**Get ArgoCD password:** +## 🧪 Testing + +### Quick Start + ```bash -kubectl -n argocd get secret argocd-initial-admin-secret \ - -o jsonpath="{.data.password}" | base64 -d && echo +# Fast checks +helm repo add argo https://argoproj.github.io/argo-helm && helm repo update +helm lint helm/argo-stack --values helm/argo-stack/values.yaml +helm dependency build helm/argo-stack +helm template argo-stack helm/argo-stack --values helm/argo-stack/values.yaml --namespace argocd > rendered.yaml +kubeconform -strict -ignore-missing-schemas -skip 'CustomResourceDefinition|Application|Workflow|WorkflowTemplate' -summary rendered.yaml + +# kind + ct +kind create cluster +ct lint --config .ct.yaml +ct install --config .ct.yaml --debug + +# adapter tests +cd authz-adapter && python3 -m pip install -r requirements.txt pytest && pytest -q ``` -**Access MinIO Console (optional):** +### 0. Prerequisites + ```bash -kubectl port-forward svc/minio -n minio-system 9001:9001 -# Open http://localhost:9001 (minioadmin/minioadmin) +# Helm +brew install helm || sudo snap install helm --classic + +# kind (Kubernetes-in-Docker) +brew install kind || GO111MODULE="on" go install sigs.k8s.io/kind@v0.23.0 + +# Python tooling for adapter tests +python3 -m pip install --upgrade pip +python3 -m pip install pytest + +# chart-testing (ct) +brew install chart-testing || pipx install chart-testing + +# kubeconform (schema validation) +brew install kubeconform || curl -L https://github.com/yannh/kubeconform/releases/latest/download/kubeconform-linux-amd64.tar.gz | tar xz && sudo mv kubeconform /usr/local/bin/ ``` -**When done:** + +### 1. 
Helm lint, template, kubeconform + ```bash -kind delete cluster # Remove the entire cluster +# From repo root +helm repo add argo https://argoproj.github.io/argo-helm +helm repo update + +# Lint the umbrella chart +helm lint helm/argo-stack --values helm/argo-stack/values.yaml + +# Render the chart to plain YAML +helm dependency build helm/argo-stack +helm template argo-stack helm/argo-stack --values helm/argo-stack/values.yaml --namespace argocd > rendered.yaml + +# Validate rendered manifests (skip CRDs and Argo custom resources) +kubeconform -strict -ignore-missing-schemas -skip 'CustomResourceDefinition|Application|Workflow|WorkflowTemplate' -summary rendered.yaml ``` ---- +### 2. Chart Testing (ct) -## 🌐 Production Deployment (15 minutes) +This replicates CI: spin up kind, lint, then install the chart and smoke test it. + +```bash +# Fresh kind cluster +kind delete cluster || true +kind create cluster -For production deployments with your own repositories and S3 storage. +# Ensure dependencies are built +helm dependency build helm/argo-stack + +# Lint using ct (uses .ct.yaml) +ct lint --config .ct.yaml + +# Install and smoke test +ct install --config .ct.yaml --debug +``` -### Step 1: Create Your Values File +**Notes** +- `ct` uses the working tree and `.ct.yaml` to find `helm/argo-stack`. +- To test with custom values, commit a `ci-values.yaml` or temporarily edit `values.yaml` before running. + +### 3. authz-adapter unit tests ```bash -cat > my-values.yaml <<'EOF' -# S3 Configuration -s3: - enabled: true - hostname: "s3.us-west-2.amazonaws.com" - bucket: "my-argo-artifacts" - region: "us-west-2" - accessKey: "YOUR_ACCESS_KEY" - secretKey: "YOUR_SECRET_KEY" +cd authz-adapter +python3 -m pip install -r requirements.txt pytest +pytest -q +``` -# Your Applications (REQUIRED) -applications: - - name: my-workflow-app - project: default - repoURL: https://github.com/YOUR_ORG/YOUR_REPO.git # ⚠️ Replace this - targetRevision: main - path: "." 
- destination: - namespace: wf-poc - syncPolicy: - automated: - prune: true - selfHeal: true -EOF +What’s tested: +- `decide_groups(...)` logic (mapping `/user/user` authz JSON to groups like `argo-runner` and `argo-viewer`). + +### 4. Troubleshooting + +See docs/troubleshooting.md for common issues and solutions. + + +## 📁 Project Structure + +```text +. +├── authz-adapter/ # Custom authorization adapter for Argo +│ ├── Dockerfile # Container image definition +│ ├── adapter.py # Main FastAPI authorization service +│ ├── requirements.txt # Runtime dependencies +│ ├── requirements-dev.txt # Development and testing dependencies +│ └── tests/ # Unit tests for authorization logic +├── helm/ +│ └── argo-stack/ # Umbrella Helm chart for the full stack +│ ├── Chart.yaml # Chart metadata and dependencies +│ ├── values.yaml # Default configuration values +│ ├── templates/ # Kubernetes manifest templates +│ └── charts/ # Downloaded dependency charts (gitignored) +├── k8s/ # Platform Kubernetes manifests +│ └── TODO # k8s manifests for ingress, TLS, DNS, etc. +├── scripts/ # Development and automation scripts +│ └── check_tools.sh # Validates required tools and versions +├── docs/ # Documentation (if exists) +│ └── troubleshooting.md # Common issues and solutions +├── .ct.yaml # chart-testing (ct) configuration +├── .github/ # GitHub Actions CI/CD workflows +│ └── workflows/ +│ └── ci.yaml # Automated lint/test/build pipeline +├── CONTRIBUTING.md # This file +└── README.md # Project overview and quick start ``` -**Important:** Replace `YOUR_ORG/YOUR_REPO` with your actual repository! +### Key Directories + +#### `authz-adapter/` +Custom Python authorization adapter that integrates Gen3's Fence, Arborist with Argo Workflows and Argo CD. Implements group-based RBAC by mapping user identities to groups like `argo-runner` and `argo-viewer`. 
+ +**Key files:** +- `adapter.py` - FastAPI service handling `/api/v1/auth` requests +- `tests/` - pytest unit tests for authorization logic +- `Dockerfile` - Builds the adapter container image + +#### `helm/argo-stack/` +Umbrella chart that bundles: +- **Argo CD** - GitOps continuous delivery +- **Argo Workflows** - Container-native workflow engine +- **Argo Events** - Event-driven workflow automation +- **authz-adapter** - Custom authorization service (as a subchart or template) + +**Key files:** +- `Chart.yaml` - Declares dependencies on upstream Argo charts +- `values.yaml` - Configures all components, includes authz adapter settings +- `templates/` - Additional manifests (e.g., RBAC, webhooks) + +#### `k8s/` +Raw Kubernetes manifests for platform infrastructure that sit outside the Helm chart. + +**Purpose:** +- **Ingress**: Path-based NGINX reverse proxy using `hostNetwork: true` for EC2 +- **TLS**: cert-manager integration with Let's Encrypt +- **DNS**: Configuration for no-ip.com dynamic DNS + +**Key files:** +- `nginx-reverse-proxy-path.yaml` - Single-host ingress with paths `/argo/`, `/applications/`, `/registrations/`, `/api/`, `/register/` +- `clusterissuer-letsencrypt.yaml` - ACME issuers for staging and production +- `noip-certificate.yaml` - Certificate resource for your no-ip domain + +#### `scripts/` +Helper scripts for local development and CI setup. + +**Key files:** +- `check_tools.sh` - Validates installed tools (kubectl, helm, kind, pytest, etc.) +- Other setup scripts for environment preparation + +#### Continuous Integration + +See .github/workflows/ci.yaml for the GitHub Actions pipeline that automates: +Configuration for [chart-testing](https://github.com/helm/chart-testing) used in CI. 
-### Step 2: Deploy +Defines: +- Chart directories to test +- Helm lint rules +- Install test configuration + + +## 🚀 Deployment + +### Quick Deploy with `make deploy` + +The `make deploy` target provides a complete deployment workflow that sets up the entire Argo stack with all dependencies: ```bash -export ARGOCD_SECRET_KEY=$(openssl rand -hex 32) +make deploy +``` + +#### What It Does + +The `deploy` target executes the following steps in order: + +1. **`make init`** - Initialize the cluster with all prerequisites: + - Creates a kind cluster (`make kind`) + - Raises inotify and file descriptor limits (`make bump-limits`) + - Installs External Secrets Operator (`make eso-install`) + - Deploys Vault dev server (`make vault-dev`) + - Seeds Vault with test secrets (`make vault-seed`) + - Installs MinIO for artifact storage (`make minio`) + - Configures Vault authentication (`make vault-auth`) + +2. **`make argo-stack`** - Deploys the Argo stack Helm chart: + - Installs Argo CD, Argo Workflows, and Argo Events + - Configures S3 artifact storage (MinIO) + - Sets up GitHub webhook integration + - Applies custom values from `my-values.yaml` + +3. **`make docker-install`** - Builds and loads custom Docker images: + - Builds `nextflow-runner:latest` from `nextflow-runner/Dockerfile` + - Loads the image into the kind cluster + +4. 
**`make ports`** - Configures ingress and TLS: + - Creates TLS secrets from Let's Encrypt certificates + - Installs the `ingress-authz-overlay` Helm chart + - Deploys nginx-ingress controller with NodePort + - Patches services to use external IP and NodePort + +#### Prerequisites + +Before running `make deploy`, ensure you have: -helm upgrade --install argo-stack ./helm/argo-stack \ - --namespace argocd --create-namespace \ - --values my-values.yaml \ - --set-string argo-cd.configs.secret.extra."server\.secretkey"="${ARGOCD_SECRET_KEY}" \ - --wait --timeout 10m +```bash +# Required environment variables +export GITHUB_PAT="your-github-personal-access-token" +export ARGOCD_SECRET_KEY="$(openssl rand -hex 32)" +export ARGO_HOSTNAME="your-domain.com" + +# Optional S3/MinIO configuration (defaults provided) +export S3_ENABLED="true" +export S3_ACCESS_KEY_ID="minioadmin" +export S3_SECRET_ACCESS_KEY="minioadmin" +export S3_BUCKET="argo-artifacts" ``` -### Step 3: Access +#### TLS Certificate Requirements + +The `make ports` step expects Let's Encrypt certificates at: + +```text +/etc/letsencrypt/live/${ARGO_HOSTNAME}/fullchain.pem +/etc/letsencrypt/live/${ARGO_HOSTNAME}/privkey.pem +``` + +To get certificates: ```bash -kubectl -n argo-workflows port-forward svc/argo-stack-argo-workflows-server 2746:2746 & -kubectl -n argocd port-forward svc/argo-stack-argocd-server 8080:443 & +sudo certbot certonly --standalone -d ${ARGO_HOSTNAME} ``` ---- +#### What Gets Deployed -## 📚 More Information +After `make deploy` completes, you'll have: -- **Full documentation:** [README.md](README.md) -- **Development guide:** [docs/development.md](docs/development.md) -- **Configuration examples:** [examples/](examples/) -- **Per-repo artifacts:** [examples/per-repo-artifacts-values.yaml](examples/per-repo-artifacts-values.yaml) -- **User repo example:** [examples/user-repos-example.yaml](examples/user-repos-example.yaml) +- **Kubernetes Cluster**: kind cluster with raised system limits +- 
**Secret Management**: + - Vault dev server with test secrets + - External Secrets Operator syncing secrets +- **Artifact Storage**: MinIO with `argo-artifacts` bucket +- **Argo Stack**: + - Argo CD at `https://${ARGO_HOSTNAME}/applications/` + - Argo Workflows at `https://${ARGO_HOSTNAME}/argo/` + - Argo Events webhook receiver at `https://${ARGO_HOSTNAME}/registrations/` +- **Ingress**: nginx-ingress controller with TLS termination ---- +#### Accessing Services -## ❓ Common Questions +After deployment: -**Q: Can I deploy without any applications?** -A: Yes! The chart will deploy Argo Workflows and Argo CD without any applications. You can add applications later through the Argo CD UI or by updating your values. +```bash +# Get Argo CD admin password +make password -**Q: Where do I put my GitHub repository URLs?** -A: Create a values file (like `my-values.yaml`) with your repositories in the `applications` array. Never commit credentials or private repo URLs to version control. +# Login to Argo CD CLI +make login -**Q: How do I use MinIO with Kubernetes in Kind/Minikube?** -A: Use `make deploy` which automatically sets up MinIO inside the cluster. See [docs/development.md](docs/development.md#local-minio-for-development) for details. +# Access services via browser: +# https://${ARGO_HOSTNAME}/applications/ - Argo CD UI +# https://${ARGO_HOSTNAME}/argo/ - Argo Workflows UI +# https://${ARGO_HOSTNAME}/registrations/ - GitHub webhook endpoint +``` -**Q: What about GitHub Events/webhooks?** -A: Configure the `events.github.repositories` section in your values file. See [examples/user-repos-example.yaml](examples/user-repos-example.yaml) for an example. 
+#### Troubleshooting Deployment ---- +**Environment variable errors:** +```bash +# Validate all required variables are set +make check-vars +``` + +**Certificate not found:** +```bash +# Verify certificate path +sudo ls -la /etc/letsencrypt/live/${ARGO_HOSTNAME}/ +``` -## 🆘 Getting Help +**MinIO bucket isn't created:** +```bash +# Check MinIO status +make minio-status -- **Check logs:** `kubectl logs -n argo-workflows -l app.kubernetes.io/name=argo-workflows-server` -- **MinIO logs:** `kubectl logs -n minio-system -l app=minio` -- **Issues:** [GitHub Issues](https://github.com/calypr/argo-helm/issues) +# List bucket contents +make minio-ls + +# Recreate if needed +make minio-cleanup +make minio +``` + +**Vault secrets not syncing:** +```bash +# Verify Vault is running +make vault-status + +# Check ExternalSecret status +make test-secrets + +# Reseed Vault if needed +make vault-seed +``` + +**Pods not starting:** +```bash +# Check system limits +make show-limits + +# Increase if needed +make bump-limits +``` + +#### Clean Deployment + +To start fresh: + +```bash +# Remove everything and redeploy +kind delete cluster +make deploy +``` + +#### Customizing Deployment + +To customize the deployment, edit `my-values.yaml` before running `make deploy`: + +```yaml +# my-values.yaml +repoRegistrations: + ... 
+``` + +Then deploy with custom values: + +```bash +make deploy +``` diff --git a/acmedns.json.example b/acmedns.json.example new file mode 100644 index 00000000..7bc96121 --- /dev/null +++ b/acmedns.json.example @@ -0,0 +1,9 @@ +{ + "calypr-demo.ddns.net": { + "username": "XXXX", + "password": "XXXX", + "fulldomain": "XXXX", + "subdomain": "XXXX", + "allowfrom": [] + } +} diff --git a/authz-adapter/.coverage b/authz-adapter/.coverage deleted file mode 100644 index 94912e3a..00000000 Binary files a/authz-adapter/.coverage and /dev/null differ diff --git a/authz-adapter/app.py b/authz-adapter/app.py index b46f9064..59eb0a37 100644 --- a/authz-adapter/app.py +++ b/authz-adapter/app.py @@ -9,18 +9,44 @@ app = Flask(__name__) + def decide_groups( - doc, - verb=None, - group=None, - version=None, - resource=None, - namespace=None, + doc, + verb=None, + group=None, + version=None, + resource=None, + namespace=None, ): """ - Map Fence-style authz JSON into coarse-grained groups Argo/ArgoCD can use. - Backward compatible: extra args are optional. - If Argo resource context is provided, only grant runner for Argo resources. + Map Fence-style authorization JSON into coarse-grained groups for Argo/ArgoCD. + + Determines which permission groups a user belongs to based on their Fence + authorization document. Supports resource-scoped decisions when Argo resource + context is provided. 
+ + Args: + doc: User authorization document from Fence containing 'active' status and 'authz' data + verb: Optional Kubernetes verb (e.g., 'create', 'get', 'list') + group: Optional API group (e.g., 'argoproj.io') + version: Optional API version (e.g., 'v1alpha1') + resource: Optional resource type (e.g., 'workflows', 'workflowtemplates') + namespace: Optional namespace for the resource + + Returns: + List of group names the user belongs to (e.g., ['argo-runner', 'argo-viewer']) + Empty list if user is not active + + Examples: + >>> doc = {"active": True, "authz": {"/services/workflow/gen3-workflow": [{"method": "create"}]}} + >>> decide_groups(doc) + ['argo-runner', 'argo-viewer'] + + >>> decide_groups(doc, group="argoproj.io", resource="workflows") + ['argo-runner', 'argo-viewer'] + + >>> decide_groups({"active": False}) + [] """ groups = [] if not doc.get("active"): @@ -46,7 +72,32 @@ def decide_groups( groups.append("argo-viewer") return groups + def fetch_user_doc(auth_header): + """ + Fetch user authorization document from Fence userinfo endpoint. + + Validates the provided authorization token by calling the Fence /user endpoint. + Falls back to using a service token if no user token is provided. + + Args: + auth_header: Authorization header value (e.g., 'Bearer ') + + Returns: + Tuple of (user_doc, error): + - user_doc: Dictionary containing user info and authz data, or None on error + - error: Error message string, or None on success + + Examples: + >>> doc, err = fetch_user_doc("Bearer valid-token") + >>> if err: + ... print(f"Error: {err}") + ... else: + ... 
print(doc.get("email")) + + Raises: + No exceptions are raised; errors are returned in the tuple + """ headers = {} if auth_header and auth_header.lower().startswith("bearer "): headers["Authorization"] = auth_header @@ -54,7 +105,7 @@ def fetch_user_doc(auth_header): headers["Authorization"] = "Bearer " + SERVICE_TOKEN else: return None, "no token" - + try: r = requests.get(USERINFO_URL, headers=headers, timeout=TIMEOUT) if r.status_code != 200: @@ -69,16 +120,73 @@ def fetch_user_doc(auth_header): except Exception as e: return None, f"unexpected error: {e}" + +def get_debugging_vars(): + """ + Retrieve debugging override variables from query parameters or environment. + + Allows setting a fixed email and groups for testing purposes via + 'debug_email' and 'debug_groups' query parameters or + 'DEBUG_EMAIL' and 'DEBUG_GROUPS' environment variables. + + Returns: + Tuple of (email, groups): + - email: Debug email string or None + - groups: List of debug groups or None + + """ + email = None + groups = None + if os.environ.get("DEBUG_EMAIL"): + email = request.args.get("debug_email") or os.environ.get("DEBUG_EMAIL") + groups_str = request.args.get("debug_groups") or os.environ.get("DEBUG_GROUPS") + groups = groups_str.split(",") if groups_str else None + return email, groups + + @app.route("/check", methods=["GET"]) def check(): - auth = request.headers.get("Authorization", "") - doc, err = fetch_user_doc(auth) - if err or not doc: - return make_response(f"authz fetch failed: {err}", 401) - email = doc.get("email") or doc.get("name") or doc.get("username") or "unknown" - groups = decide_groups(doc) - if not groups: - return make_response("forbidden", 403) + """ + Authorization check endpoint for nginx auth_request. + + Validates the user's authorization token against Fence and determines their + permission groups. Sets custom headers for nginx to forward to upstream services. 
+ + Expected Headers: + Authorization: Bearer token or service token fallback + + Response Headers (on success): + X-Auth-Request-User: User identifier (email/name/username) + X-Auth-Request-Email: User email + X-Auth-Request-Groups: Comma-separated list of groups + X-Allowed: 'true' to signal authorization success + + Returns: + HTTP Response: + - 200: User authorized, headers set + - 401: Authentication failed (invalid/missing token) + - 403: User authenticated but not authorized (no groups) + + Examples: + GET /check + Authorization: Bearer abc123 + + Response: 200 OK + X-Auth-Request-User: user@example.com + X-Auth-Request-Groups: argo-runner,argo-viewer + """ + # Check for debugging overrides via query parameters or environment variables + email, groups = get_debugging_vars() + # no debugging override, do real authz + if not (email and groups): + auth = request.headers.get("Authorization", "") + doc, err = fetch_user_doc(auth) + if err or not doc: + return make_response(f"authz fetch failed: {err}", 401) + email = doc.get("email") or doc.get("name") or doc.get("username") or "unknown" + groups = decide_groups(doc) + if not groups: + return make_response("forbidden", 403) resp = make_response("", 200) resp.headers["X-Auth-Request-User"] = email resp.headers["X-Auth-Request-Email"] = email @@ -86,9 +194,39 @@ def check(): resp.headers["X-Allowed"] = "true" return resp + @app.route("/healthz", methods=["GET"]) def healthz(): + """ + Health check endpoint. + + Simple endpoint to verify the service is running and responding. + Does not check external dependencies like Fence availability. + + Returns: + Tuple of (response_body, status_code): + - 200: Service is healthy + + Examples: + GET /healthz + + Response: 200 OK + ok + """ return "ok", 200 + if __name__ == "__main__": + """ + Run the Flask development server. + + Starts the authorization adapter service on all interfaces (0.0.0.0) + on port 8080. This should only be used for development/testing. 
+ For production, use a WSGI server like gunicorn or uwsgi. + + Environment Variables: + FENCE_BASE: Base URL for Fence service (default: https://calypr-dev.ohsu.edu/user) + HTTP_TIMEOUT: Timeout for Fence requests in seconds (default: 3.0) + FENCE_SERVICE_TOKEN: Fallback service token for authentication + """ app.run(host="0.0.0.0", port=8080) diff --git a/authz-adapter/tests/test_app.py b/authz-adapter/tests/test_app.py index 15a39a9d..8de873d2 100644 --- a/authz-adapter/tests/test_app.py +++ b/authz-adapter/tests/test_app.py @@ -353,3 +353,281 @@ def test_malformed_authorization_header(self): for header in malformed_headers: response = client.get('/check', headers={'Authorization': header}) assert response.status_code == 401 + + +class TestGetDebuggingVars: + """Test the get_debugging_vars function.""" + + def setup_method(self): + """Reset app module before each test.""" + if 'app' in sys.modules: + del sys.modules['app'] + + @pytest.mark.unit + def test_get_debugging_vars_returns_none_when_no_debug_email(self): + """Test that get_debugging_vars returns (None, None) when DEBUG_EMAIL is not set.""" + env_vars = {'FENCE_BASE': 'https://test-fence.example.com/user'} + with patch.dict('os.environ', env_vars, clear=True): + import app + with app.app.test_request_context('/check'): + email, groups = app.get_debugging_vars() + assert email is None + assert groups is None + + @pytest.mark.unit + def test_get_debugging_vars_with_debug_email_env_only(self): + """Test get_debugging_vars with DEBUG_EMAIL env var set.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'debug@example.com' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + with app.app.test_request_context('/check'): + email, groups = app.get_debugging_vars() + assert email == 'debug@example.com' + assert groups is None + + @pytest.mark.unit + def test_get_debugging_vars_with_debug_email_and_groups_env(self): + """Test get_debugging_vars with 
DEBUG_EMAIL and DEBUG_GROUPS env vars set.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'debug@example.com', + 'DEBUG_GROUPS': 'argo-runner,argo-viewer' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + with app.app.test_request_context('/check'): + email, groups = app.get_debugging_vars() + assert email == 'debug@example.com' + assert groups == ['argo-runner', 'argo-viewer'] + + @pytest.mark.unit + def test_get_debugging_vars_query_params_override_env(self): + """Test that query params override env vars when DEBUG_EMAIL is set.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'env@example.com', + 'DEBUG_GROUPS': 'env-group' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + with app.app.test_request_context( + '/check?debug_email=query@example.com&debug_groups=query-group1,query-group2' + ): + email, groups = app.get_debugging_vars() + assert email == 'query@example.com' + assert groups == ['query-group1', 'query-group2'] + + @pytest.mark.unit + def test_get_debugging_vars_query_email_only(self): + """Test get_debugging_vars with debug_email query param only (requires DEBUG_EMAIL env).""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'env@example.com' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + with app.app.test_request_context('/check?debug_email=query@example.com'): + email, groups = app.get_debugging_vars() + assert email == 'query@example.com' + assert groups is None + + @pytest.mark.unit + def test_get_debugging_vars_query_groups_with_env_email(self): + """Test get_debugging_vars with debug_groups query param and DEBUG_EMAIL env var.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'env@example.com' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + with 
app.app.test_request_context('/check?debug_groups=group1,group2'): + email, groups = app.get_debugging_vars() + assert email == 'env@example.com' + assert groups == ['group1', 'group2'] + + @pytest.mark.unit + def test_get_debugging_vars_single_group(self): + """Test get_debugging_vars with a single group in DEBUG_GROUPS.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'debug@example.com', + 'DEBUG_GROUPS': 'single-group' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + with app.app.test_request_context('/check'): + email, groups = app.get_debugging_vars() + assert email == 'debug@example.com' + assert groups == ['single-group'] + + @pytest.mark.unit + def test_get_debugging_vars_query_params_ignored_without_debug_email_env(self): + """Test that query params are ignored when DEBUG_EMAIL env is not set.""" + env_vars = {'FENCE_BASE': 'https://test-fence.example.com/user'} + with patch.dict('os.environ', env_vars, clear=True): + import app + with app.app.test_request_context( + '/check?debug_email=query@example.com&debug_groups=group1' + ): + email, groups = app.get_debugging_vars() + # Without DEBUG_EMAIL env var, query params should be ignored + assert email is None + assert groups is None + + +class TestCheckWithDebuggingVars: + """Test /check endpoint behavior with debugging variables.""" + + def setup_method(self): + """Reset app module before each test.""" + if 'app' in sys.modules: + del sys.modules['app'] + + @pytest.mark.unit + def test_check_bypasses_auth_with_debug_email_and_groups(self): + """Test that /check bypasses auth when DEBUG_EMAIL and DEBUG_GROUPS are set.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'debug@example.com', + 'DEBUG_GROUPS': 'argo-runner,argo-viewer' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + client = app.app.test_client() + # No Authorization header needed when debugging vars are set + 
response = client.get('/check') + assert response.status_code == 200 + assert response.headers['X-Auth-Request-User'] == 'debug@example.com' + assert response.headers['X-Auth-Request-Email'] == 'debug@example.com' + assert response.headers['X-Auth-Request-Groups'] == 'argo-runner,argo-viewer' + assert response.headers['X-Allowed'] == 'true' + + @pytest.mark.unit + def test_check_falls_back_to_auth_when_only_debug_email_set(self): + """Test that /check falls back to auth when only DEBUG_EMAIL is set (no groups).""" + with requests_mock.Mocker() as m: + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'debug@example.com' + # No DEBUG_GROUPS set + } + with patch.dict('os.environ', env_vars, clear=True): + import app + + mock_url = "https://test-fence.example.com/user/user" + user_doc = { + "active": True, + "email": "auth@example.com", + "authz": { + "/services/workflow/gen3-workflow": [ + {"method": "create", "service": "gen3-workflow"} + ] + } + } + m.get(mock_url, json=user_doc) + + client = app.app.test_client() + # Without groups, it should fall back to real auth + response = client.get('/check', headers={'Authorization': 'Bearer valid-token'}) + assert response.status_code == 200 + # Should use the email from auth, not debug email + assert response.headers['X-Auth-Request-Email'] == 'auth@example.com' + + @pytest.mark.unit + def test_check_with_debug_query_params(self): + """Test /check with debug_email and debug_groups query parameters.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'env@example.com', + 'DEBUG_GROUPS': 'env-group' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + client = app.app.test_client() + response = client.get( + '/check?debug_email=query@example.com&debug_groups=query-runner,query-viewer' + ) + assert response.status_code == 200 + assert response.headers['X-Auth-Request-User'] == 'query@example.com' + assert 
response.headers['X-Auth-Request-Email'] == 'query@example.com' + assert response.headers['X-Auth-Request-Groups'] == 'query-runner,query-viewer' + + @pytest.mark.unit + def test_check_with_debug_groups_override_in_query(self): + """Test /check with debug_groups query param overriding env var.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'debug@example.com', + 'DEBUG_GROUPS': 'env-group' + } + with patch.dict('os.environ', env_vars, clear=True): + import app + client = app.app.test_client() + response = client.get('/check?debug_groups=query-admin,query-viewer') + assert response.status_code == 200 + assert response.headers['X-Auth-Request-Email'] == 'debug@example.com' + assert response.headers['X-Auth-Request-Groups'] == 'query-admin,query-viewer' + + @pytest.mark.unit + def test_check_query_params_ignored_without_debug_email_env(self): + """Test that query params are ignored when DEBUG_EMAIL env is not set.""" + with requests_mock.Mocker() as m: + env_vars = {'FENCE_BASE': 'https://test-fence.example.com/user'} + with patch.dict('os.environ', env_vars, clear=True): + import app + + mock_url = "https://test-fence.example.com/user/user" + user_doc = { + "active": True, + "email": "auth@example.com", + "authz": { + "/services/workflow/gen3-workflow": [ + {"method": "create", "service": "gen3-workflow"} + ] + } + } + m.get(mock_url, json=user_doc) + + client = app.app.test_client() + # Query params should be ignored without DEBUG_EMAIL env + response = client.get( + '/check?debug_email=query@example.com&debug_groups=query-group', + headers={'Authorization': 'Bearer valid-token'} + ) + assert response.status_code == 200 + # Should use auth email, not query param email + assert response.headers['X-Auth-Request-Email'] == 'auth@example.com' + + @pytest.mark.unit + def test_check_without_auth_fails_when_debug_incomplete(self): + """Test /check returns 401 when debug vars are incomplete and no auth provided.""" + env_vars = { + 
'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'debug@example.com' + # No DEBUG_GROUPS - incomplete debug config + } + with patch.dict('os.environ', env_vars, clear=True): + import app + client = app.app.test_client() + # No Authorization header, and debug vars incomplete + response = client.get('/check') + assert response.status_code == 401 + + @pytest.mark.unit + def test_check_with_empty_debug_groups(self): + """Test /check behavior with empty DEBUG_GROUPS.""" + env_vars = { + 'FENCE_BASE': 'https://test-fence.example.com/user', + 'DEBUG_EMAIL': 'debug@example.com', + 'DEBUG_GROUPS': '' # Empty groups + } + with patch.dict('os.environ', env_vars, clear=True): + import app + client = app.app.test_client() + # Empty groups should result in fallback to real auth + response = client.get('/check') + # Without valid auth, should fail + assert response.status_code == 401 diff --git a/docs/admin-roles.yaml b/docs/admin-roles.yaml new file mode 100644 index 00000000..d724b084 --- /dev/null +++ b/docs/admin-roles.yaml @@ -0,0 +1,443 @@ +Here’s a single, reusable `admin.yaml` that gives your `wf-admins` group: + +* Full admin over **Argo CD** (once you add the RBAC snippet to `argocd-rbac-cm`) +* Full admin over **Argo Workflows** and **Argo Events** +* Broad K8s rights over the resources those systems use + +Then I’ll show how to apply it and how to wire Argo CD’s RBAC to the same group. + +--- + +## 1. `admin.yaml` (ClusterRole + ClusterRoleBinding) + +Save this as `admin.yaml`: + +```yaml +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: argo-superuser + labels: + app.kubernetes.io/name: argo-superuser + app.kubernetes.io/part-of: argo-platform +rules: + # Full admin on all Argo CRDs (Workflows, Events, CD, etc.) 
+ - apiGroups: ["argoproj.io"] + resources: ["*"] + verbs: ["*"] + + # Core Kubernetes resources needed to manage Argo pods and configs + - apiGroups: [""] + resources: + - pods + - pods/log + - pods/exec + - services + - endpoints + - configmaps + - secrets + - serviceaccounts + - events + - namespaces + verbs: ["*"] + + # Workload controllers (for Argo components themselves) + - apiGroups: ["apps"] + resources: + - deployments + - statefulsets + - replicasets + verbs: ["*"] + + # Ingress / networking (e.g. Argo CD / Workflows ingress) + - apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["*"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: argo-superuser-wf-admins + labels: + app.kubernetes.io/name: argo-superuser + app.kubernetes.io/part-of: argo-platform +subjects: + - kind: Group + name: wf-admins # <-- your global admin group from X-Auth-Request-Groups + apiGroup: rbac.authorization.k8s.io +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: argo-superuser +``` + +This gives **anyone in the `wf-admins` group** full K8s and Argo control (within the bounds above). + +--- + +## 2. Apply the manifest + +From your shell (with `kubectl` pointing at the right cluster): + +```bash +kubectl apply -f admin.yaml +``` + +Confirm it’s there: + +```bash +kubectl get clusterrole argo-superuser +kubectl get clusterrolebinding argo-superuser-wf-admins +``` + +Quick sanity check: + +```bash +kubectl auth can-i list workflows.argoproj.io --as-group=wf-admins +kubectl auth can-i create workflows.argoproj.io --as-group=wf-admins +kubectl auth can-i get applications.argoproj.io --as-group=wf-admins -n argocd +``` + +Each should say `yes`. + +--- + +## 3. Wire Argo CD RBAC to the same `wf-admins` group + +Kubernetes RBAC alone doesn’t give you the Argo CD UI permissions; Argo CD also has its **own** RBAC layer via `argocd-rbac-cm`. 
+ +You want to add this **policy snippet**: + +```text +p, role:admin, *, *, *, allow +g, wf-admins, role:admin +``` + +### 3.1. If Argo CD is managed by Helm (your case) + +In the `argo-cd` chart values, there’s usually a place like this (names vary slightly by chart): + +```yaml +configs: + rbac: + policy.csv: | + p, role:admin, *, *, *, allow + g, wf-admins, role:admin + + policy.default: role:readonly +``` + +Or sometimes: + +```yaml +server: + rbacConfig: + policy.csv: | + p, role:admin, *, *, *, allow + g, wf-admins, role:admin + policy.default: role:readonly +``` + +**Important:** +Because your `argocd-rbac-cm` is already owned by the `argo-stack` Helm release, you should **not** apply a separate `argocd-rbac-cm` from `admin.yaml`. Instead: + +1. Edit your `argo-stack` values (e.g. `helm/argo-stack/values.yaml` or `my-values.yaml`) to include the `policy.csv` lines above. + +2. Redeploy: + + ```bash + helm upgrade --install argo-stack ./helm/argo-stack \ + -n argocd \ + -f ./helm/argo-stack/values.yaml \ + -f ./my-values.yaml + ``` + +3. Verify: + + ```bash + kubectl get configmap argocd-rbac-cm -n argocd -o yaml | grep -A5 policy.csv + ``` + +4. From a `wf-admins` user, run: + + ```bash + argocd account get-user-info + argocd account can-i manage repositories '*' + argocd account can-i sync applications '*/*' + ``` + +You should see that user has `role:admin`. + +--- + +## 4. 
Optional: lock Argo Workflows UI to wf-admins (and others) + +If you later want the **Workflows UI** itself to only be accessible to certain groups (instead of “anyone who can log in”), you can add a small annotation to the `argo-workflows-server` Deployment: + +```yaml +metadata: + annotations: + workflows.argoproj.io/rbac-rule-precedence: "1" + workflows.argoproj.io/rbac-rule: | + 'wf-admins' in groups +``` + +Or, if you want admins and tenant groups: + +```yaml +workflows.argoproj.io/rbac-rule: | + 'wf-admins' in groups + || 'wf-bwalsh-nextflow-hello-project-writers' in groups + || 'wf-bwalsh-nextflow-hello-project-readers' in groups +``` + +That’s separate from `admin.yaml` (and should be managed by your Helm chart), but it’s good to know where it plugs in. + +--- + +## 5. Quick verification flow + +Once you’ve: + +* Applied `admin.yaml` +* Wired the `wf-admins` group into `argocd-rbac-cm` via Helm values +* Confirmed your authz-adapter emits `X-Auth-Request-Groups: wf-admins,...` for an admin user + +You can verify end to end: + +1. Open Argo CD via your ingress as an admin user. +2. `argocd account get-user-info` shows `Groups: wf-admins`. +3. User can: + + * See and manage all Applications and Projects. + * Sync, delete, and edit applications. +4. In K8s: + + * `kubectl auth can-i` with `--as-group=wf-admins` returns `yes` for workflows, sensors, apps, etc. + + + +Here’s a single, reusable `admin.yaml` that gives your `wf-admins` group: + +* Full admin over **Argo CD** (once you add the RBAC snippet to `argocd-rbac-cm`) +* Full admin over **Argo Workflows** and **Argo Events** +* Broad K8s rights over the resources those systems use + +Then I’ll show how to apply it and how to wire Argo CD’s RBAC to the same group. + +--- + +## 1. 
`admin.yaml` (ClusterRole + ClusterRoleBinding) + +Save this as `admin.yaml`: + +```yaml +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: argo-superuser + labels: + app.kubernetes.io/name: argo-superuser + app.kubernetes.io/part-of: argo-platform +rules: + # Full admin on all Argo CRDs (Workflows, Events, CD, etc.) + - apiGroups: ["argoproj.io"] + resources: ["*"] + verbs: ["*"] + + # Core Kubernetes resources needed to manage Argo pods and configs + - apiGroups: [""] + resources: + - pods + - pods/log + - pods/exec + - services + - endpoints + - configmaps + - secrets + - serviceaccounts + - events + - namespaces + verbs: ["*"] + + # Workload controllers (for Argo components themselves) + - apiGroups: ["apps"] + resources: + - deployments + - statefulsets + - replicasets + verbs: ["*"] + + # Ingress / networking (e.g. Argo CD / Workflows ingress) + - apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["*"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: argo-superuser-wf-admins + labels: + app.kubernetes.io/name: argo-superuser + app.kubernetes.io/part-of: argo-platform +subjects: + - kind: Group + name: wf-admins # <-- your global admin group from X-Auth-Request-Groups + apiGroup: rbac.authorization.k8s.io +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: argo-superuser +``` + +This gives **anyone in the `wf-admins` group** full K8s and Argo control (within the bounds above). + +--- + +## 2. 
Apply the manifest + +From your shell (with `kubectl` pointing at the right cluster): + +```bash +kubectl apply -f admin.yaml +``` + +Confirm it’s there: + +```bash +kubectl get clusterrole argo-superuser +kubectl get clusterrolebinding argo-superuser-wf-admins +``` + +Quick sanity check: + +```bash +kubectl auth can-i list workflows.argoproj.io --as-group=wf-admins +kubectl auth can-i create workflows.argoproj.io --as-group=wf-admins +kubectl auth can-i get applications.argoproj.io --as-group=wf-admins -n argocd +``` + +Each should say `yes`. + +--- + +## 3. Wire Argo CD RBAC to the same `wf-admins` group + +Kubernetes RBAC alone doesn’t give you the Argo CD UI permissions; Argo CD also has its **own** RBAC layer via `argocd-rbac-cm`. + +You want to add this **policy snippet**: + +```text +p, role:admin, *, *, *, allow +g, wf-admins, role:admin +``` + +### 3.1. If Argo CD is managed by Helm (your case) + +In the `argo-cd` chart values, there’s usually a place like this (names vary slightly by chart): + +```yaml +configs: + rbac: + policy.csv: | + p, role:admin, *, *, *, allow + g, wf-admins, role:admin + + policy.default: role:readonly +``` + +Or sometimes: + +```yaml +server: + rbacConfig: + policy.csv: | + p, role:admin, *, *, *, allow + g, wf-admins, role:admin + policy.default: role:readonly +``` + +**Important:** +Because your `argocd-rbac-cm` is already owned by the `argo-stack` Helm release, you should **not** apply a separate `argocd-rbac-cm` from `admin.yaml`. Instead: + +1. Edit your `argo-stack` values (e.g. `helm/argo-stack/values.yaml` or `my-values.yaml`) to include the `policy.csv` lines above. + +2. Redeploy: + + ```bash + helm upgrade --install argo-stack ./helm/argo-stack \ + -n argocd \ + -f ./helm/argo-stack/values.yaml \ + -f ./my-values.yaml + ``` + +3. Verify: + + ```bash + kubectl get configmap argocd-rbac-cm -n argocd -o yaml | grep -A5 policy.csv + ``` + +4. 
From a `wf-admins` user, run: + + ```bash + argocd account get-user-info + argocd account can-i manage repositories '*' + argocd account can-i sync applications '*/*' + ``` + +You should see that user has `role:admin`. + +--- + +## 4. Optional: lock Argo Workflows UI to wf-admins (and others) + +If you later want the **Workflows UI** itself to only be accessible to certain groups (instead of “anyone who can log in”), you can add a small annotation to the `argo-workflows-server` Deployment: + +```yaml +metadata: + annotations: + workflows.argoproj.io/rbac-rule-precedence: "1" + workflows.argoproj.io/rbac-rule: | + 'wf-admins' in groups +``` + +Or, if you want admins and tenant groups: + +```yaml +workflows.argoproj.io/rbac-rule: | + 'wf-admins' in groups + || 'wf-bwalsh-nextflow-hello-project-writers' in groups + || 'wf-bwalsh-nextflow-hello-project-readers' in groups +``` + +That’s separate from `admin.yaml` (and should be managed by your Helm chart), but it’s good to know where it plugs in. + +--- + +## 5. Quick verification flow + +Once you’ve: + +* Applied `admin.yaml` +* Wired the `wf-admins` group into `argocd-rbac-cm` via Helm values +* Confirmed your authz-adapter emits `X-Auth-Request-Groups: wf-admins,...` for an admin user + +You can verify end to end: + +1. Open Argo CD via your ingress as an admin user. +2. `argocd account get-user-info` shows `Groups: wf-admins`. +3. User can: + + * See and manage all Applications and Projects. + * Sync, delete, and edit applications. +4. In K8s: + + * `kubectl auth can-i` with `--as-group=wf-admins` returns `yes` for workflows, sensors, apps, etc. 
+ + diff --git a/docs/argo-rbac-troubleshooting.md b/docs/argo-rbac-troubleshooting.md new file mode 100644 index 00000000..8a96d0e2 --- /dev/null +++ b/docs/argo-rbac-troubleshooting.md @@ -0,0 +1,491 @@ +# argo-git-app-overlay + +A small overlay chart that wires a **single GitHub App** into a **multi-tenant ArgoCD + Argo Workflows stack**, using: + +- `values.repoRepositories[]` as the tenant catalog +- A consistent group/role naming convention: `wf---` +- `X-Auth-Request-Groups` headers from the `authz-adapter` to drive: + - Argo CD RBAC (AppProject roles + global roles) + - Kubernetes RBAC in workflow namespaces + - Argo Workflows UI access + +> **Tenant = one GitHub repo.** +> A single GitHub App is installed into multiple repositories and becomes the common identity / automation layer. + +--- + +## Architecture + +```mermaid +flowchart LR + subgraph GH["GitHub"] + GRepo["GitHub Repos
org/repo A, org/repo B, ..."] + GApp["Single GitHub App
(multi-tenant)"] + GRepo --> GApp + end + + subgraph Cluster["Kubernetes Cluster"] + subgraph Ingress["NGINX Ingress"] + NGINX["ingress-nginx
auth_request"] + end + + subgraph Security["Security Namespace"] + AUTHZ["authz-adapter
(OIDC + GitHub groups)"] + end + + subgraph ArgoCDNS["ArgoCD Namespace"] + ARGOCD["Argo CD Server"] + APPS["AppProjects + Applications"] + end + + subgraph ArgoWFNS["Argo Workflows Namespace"] + AWFSRV["Argo Workflows UI/Server"] + AWFCTRL["Workflow Controller"] + end + + subgraph Tenants["Tenant Workflow Namespaces"] + T1["wf--
Workflows + SA + Roles"] + T2["wf--
Workflows + SA + Roles"] + dots["..."] + end + end + + %% Requests + Dev["Developer / Reader
Browser or CLI"] -->|"HTTPS / Argo UI"| NGINX + NGINX -->|"auth_request"| AUTHZ + AUTHZ -->|"200 + X-Auth-Request-Groups:
wf---reader, wf---writer"| NGINX + + NGINX -->|"Authorized traffic
(ArgoCD UI & API)"| ARGOCD + NGINX -->|"Authorized traffic
(Workflows UI)"| AWFSRV + + %% GitHub → cluster + GApp -->|"Webhooks: push, PR"| ARGOCD + ARGOCD -->|"Sync applications"| APPS + APPS -->|"Deploy workflows & CRDs"| AWFCTRL + AWFCTRL -->|"Run workflows in
tenant namespaces"| T1 + AWFCTRL -->|"Run workflows in
tenant namespaces"| T2 + + %% RBAC + AUTHZ -->|"Map OIDC + GitHub claims
→ wf---* groups"| ARGOCD + AUTHZ -->|"Same groups header"| AWFSRV +``` +--- + +## 1. Unified access control matrix + +Assumptions (tunable to your naming): + +* Each GitHub repo `org/repo` gets two tenant groups: + + * `ghapp:org/repo:readers` + * `ghapp:org/repo:writers` +* One global admin group: + + * `ghapp:argo-admins` +* NGINX/authz-adapter injects: + + * `X-Auth-Request-User` + * `X-Auth-Request-Email` + * `X-Auth-Request-Groups: ghapp:org/repo:readers,ghapp:org/repo:writers,...` +* Tenant workflow namespace: `wf-org-repo` +* Argo CD AppProject: `proj-org-repo`, hosted by your `argo-stack` chart. ([GitHub][1]) + +### Matrix + +| Identity (X-Auth-Request-Groups) | Human meaning | Argo CD RBAC role(s) | Argo CD scope | Kubernetes RBAC | K8s scope | +| ---------------------------------- | ----------------------------------- | ----------------------------------------------- | ------------------------------------ | --------------------------------------------------- | ---------------------------------- | +| `ghapp:argo-admins` | Platform / superuser admins | `role:admin` (global, via `argocd-rbac-cm`) | All projects, all apps, all clusters | `cluster-admin` (or strong cluster role) | Cluster-wide | +| `ghapp:org/repo:writers` | Repo tenant “writers” / maintainers | `proj:proj-org-repo:admin` (AppProject role) | Only `proj-org-repo` applications | `wf-org-repo-admin` Role + RoleBinding to group | Namespace `wf-org-repo` | +| `ghapp:org/repo:readers` | Repo tenant “readers” / observers | `proj:proj-org-repo:readonly` (AppProject role) | Only `proj-org-repo` applications | `wf-org-repo-readonly` Role + RoleBinding to group | Namespace `wf-org-repo` | +| (optional) `ghapp:argocd-readonly` | Platform-wide read-only | `role:readonly` (global) | All projects, read-only | `cluster-readonly` ClusterRole + ClusterRoleBinding | Cluster-wide (get/list/watch only) | + +Result: + +* **Readers/writers see only their own Argo CD project & app(s)** because their roles are scoped 
to `proj-org-repo`. +* **Readers/writers see only their workflows** because their K8s Roles are bound only in `wf-org-repo`, and Argo Workflows RBAC uses the same group claims. ([Proactive Insights][2]) +* **Admins** can see and change everything. + +--- + +## 2. Example RBAC config for a single tenant + +Let’s use a concrete example repo: + +* GitHub repo: `calypr/nextflow-hello` +* Tenant workflow namespace: `wf-nextflow-hello` +* Argo CD project: `nextflow-hello` +* Groups: + + * `ghapp:calypr/nextflow-hello:readers` + * `ghapp:calypr/nextflow-hello:writers` + * `ghapp:argo-admins` + +### 2.1 Argo CD global RBAC (argocd-rbac-cm) + +This handles the **cluster-wide admin & default policy**. Pattern taken from typical Argo CD RBAC examples. ([Argo CD][3]) + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: argocd-rbac-cm + namespace: argocd +data: + policy.default: role:readonly + policy.csv: | + # Global ArgoCD admins + g, ghapp:argo-admins, role:admin + + # Optional: global read-only + g, ghapp:argocd-readonly, role:readonly +``` + +> Everything tenant-specific will be handled via **AppProject roles**, not here. + +--- + +### 2.2 Argo CD AppProject (per repo / tenant) + +Each repo gets its own `AppProject`. You already have per-repo application wiring in the chart; this fits that model. 
([GitHub][1]) + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: AppProject +metadata: + name: nextflow-hello + namespace: argocd +spec: + description: "Tenant project for calypr/nextflow-hello" + sourceRepos: + - https://github.com/calypr/nextflow-hello.git + destinations: + - server: https://kubernetes.default.svc + namespace: wf-nextflow-hello + + # Limit cross-namespace & cluster resources (tighten as needed) + clusterResourceWhitelist: + - group: "" + kind: Namespace + - group: rbac.authorization.k8s.io + kind: Role + - group: rbac.authorization.k8s.io + kind: RoleBinding + + roles: + # Writers: manage apps in this project + - name: admin + description: "Writers for nextflow-hello" + policies: + - p, proj:nextflow-hello:admin, applications, *, nextflow-hello/*, allow + - p, proj:nextflow-hello:admin, repositories, get, *, allow + groups: + - ghapp:calypr/nextflow-hello:writers + + # Readers: read-only access to apps in this project + - name: readonly + description: "Readers for nextflow-hello" + policies: + - p, proj:nextflow-hello:readonly, applications, get, nextflow-hello/*, allow + - p, proj:nextflow-hello:readonly, applications, sync, nextflow-hello/*, deny + groups: + - ghapp:calypr/nextflow-hello:readers +``` + +With this: + +* **Writers** (GitHub App writers group) can: + + * Create/edit/sync Argo CD apps in this project. +* **Readers** can: + + * View apps & health, but cannot sync or modify. + +--- + +### 2.3 Argo CD Application example + +The Application itself just points to the tenant namespace. You already have an `applications` array in values; this is basically what your chart generates. ([GitHub][1]) + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: nextflow-hello + namespace: argocd +spec: + project: nextflow-hello + source: + repoURL: https://github.com/calypr/nextflow-hello.git + targetRevision: main + path: . 
+ destination: + server: https://kubernetes.default.svc + namespace: wf-nextflow-hello + syncPolicy: + automated: + prune: true + selfHeal: true +``` + +--- + +### 2.4 Argo Workflows RBAC + K8s Roles + +**Goal:** same GitHub groups used for: + +* who can **see** workflows in the Argo Workflows UI +* who can **run/manage** them +* who can **see pods/logs** at the Kubernetes level + +#### 2.4.1 Namespace-scoped Roles + +```yaml +# Admin / writer role in wf-nextflow-hello +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: wf-nextflow-hello-admin + namespace: wf-nextflow-hello +rules: + - apiGroups: ["argoproj.io"] + resources: ["workflows", "workflowtemplates", "cronworkflows"] + verbs: ["*"] + - apiGroups: [""] + resources: ["pods", "pods/log", "events", "configmaps"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["secrets", "persistentvolumeclaims"] + verbs: ["get", "list", "create", "update", "delete"] + +--- +# Read-only role in wf-nextflow-hello +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: wf-nextflow-hello-readonly + namespace: wf-nextflow-hello +rules: + - apiGroups: ["argoproj.io"] + resources: ["workflows", "workflowtemplates", "cronworkflows"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods", "pods/log", "events", "configmaps"] + verbs: ["get", "list", "watch"] +``` + +#### 2.4.2 RoleBindings to GitHub App groups + +```yaml +# Writers → admin role +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: wf-nextflow-hello-admin-binding + namespace: wf-nextflow-hello +subjects: + - kind: Group + name: ghapp:calypr/nextflow-hello:writers + apiGroup: rbac.authorization.k8s.io +roleRef: + kind: Role + name: wf-nextflow-hello-admin + apiGroup: rbac.authorization.k8s.io + +--- +# Readers → readonly role +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: wf-nextflow-hello-readonly-binding + namespace: wf-nextflow-hello 
+subjects: + - kind: Group + name: ghapp:calypr/nextflow-hello:readers + apiGroup: rbac.authorization.k8s.io +roleRef: + kind: Role + name: wf-nextflow-hello-readonly + apiGroup: rbac.authorization.k8s.io +``` + +#### 2.4.3 Argo Workflows UI RBAC (group expressions) + +Argo Workflows can also enforce RBAC based on groups in the ID token (or headers via SSO integration). Typical pattern: ([Proactive Insights][2]) + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: argo-workflows-server + namespace: argo +spec: + template: + metadata: + annotations: + # Only users in these groups can access this Argo Workflows instance + workflows.argoproj.io/rbac-rule: > + 'ghapp:calypr/nextflow-hello:writers' in groups + || 'ghapp:calypr/nextflow-hello:readers' in groups + workflows.argoproj.io/rbac-rule-precedence: "1" + spec: + # ... +``` + +You can tighten that further if you ever run one Argo server per tenant namespace. + +--- + +## 3. Debugging guide: verifying `X-Auth-Request-Groups` end-to-end + +You want a **systematic “outside → inside” checklist**: + +### Step 0 – Known good group membership + +* Verify in your **IdP / GitHub App integration** that the user belongs to: + + * `ghapp:calypr/nextflow-hello:readers` or `:writers` +* If the authz-adapter talks to Fence/Gen3, confirm the group shows up in the token / userinfo response first (using `jq` on the JWT, or the Fence `/user/` endpoints). + +--- + +### Step 1 – NGINX / authz-adapter header injection + +1. **Confirm Ingress annotations** + + Check that your Argo CD and Argo Workflows ingresses are using `auth_request` and passing headers: + + ```bash + kubectl get ingress -n argocd argocd-server -o yaml + ``` + + Look for annotations like: + + ```yaml + nginx.ingress.kubernetes.io/auth-url: "http://authz-adapter.security.svc.cluster.local/auth" + nginx.ingress.kubernetes.io/auth-response-headers: "X-Auth-Request-User, X-Auth-Request-Email, X-Auth-Request-Groups" + ``` + +2. 
**Echo headers via a debug backend** + + Temporarily route a dedicated path `/debug/headers` to a simple echo service in the same ingress: + + ```yaml + apiVersion: v1 + kind: Pod + metadata: + name: header-echo + namespace: argocd + spec: + containers: + - name: header-echo + image: mendhak/http-https-echo + ports: + - containerPort: 80 + --- + apiVersion: v1 + kind: Service + metadata: + name: header-echo + namespace: argocd + spec: + selector: + app: header-echo + ports: + - port: 80 + targetPort: 80 + ``` + + Add a path rule in the ingress to send `/debug/headers` to `header-echo`. Then hit: + + ```bash + curl -k -H "Host: " https:///debug/headers + ``` + + You should see something like: + + ```text + X-Auth-Request-User: alice@example.org + X-Auth-Request-Groups: ghapp:calypr/nextflow-hello:writers,ghapp:argo-admins + ``` + + If you **don’t** see groups here, the bug is either in the authz-adapter or IdP side. + +--- + +### Step 2 – Argo CD sees the groups + +1. **Login through the proxied URL** (UI or CLI using `--header` if necessary). ([Argo CD][4]) + +2. Run: + + ```bash + argocd account get-user-info + ``` + + You should see: + + ```text + LoggedIn: true + Username: alice@example.org + Groups: ghapp:calypr/nextflow-hello:writers,ghapp:argo-admins + ``` + + * If groups are **missing here** but present at the ingress debug step: + + * Check Dex / OIDC config in `argocd-cm` to ensure the `groups` claim (or `X-Auth-Request-Groups`) is mapped through. + * If using an auth proxy connector, confirm `groupHeader: X-Auth-Request-Groups`. ([GitHub][5]) + +3. Validate Argo CD RBAC evaluation: + + ```bash + argocd account can-i sync applications 'nextflow-hello/*' + argocd account can-i get applications 'nextflow-hello/*' + ``` + + * For a `writers` user: `sync` should be allowed. + * For a `readers` user: `sync` should be denied; `get` allowed. + +--- + +### Step 3 – Kubernetes RBAC (workflows / pods) + +1. 
Use **Subject Access Review** locally: + + ```bash + kubectl auth can-i list workflows.argoproj.io \ + --as=alice@example.org \ + --as-group=ghapp:calypr/nextflow-hello:writers \ + -n wf-nextflow-hello + ``` + + * Expect `yes` for writers / readers (list workflows). + * Expect `no` for users without the group. + +2. If you see `no` for a group that should work: + + * Inspect the RoleBindings: + + ```bash + kubectl get rolebinding -n wf-nextflow-hello -o yaml | grep -A5 RoleBinding + ``` + + * Confirm the `subjects[].name` matches exactly the `group` string coming from `argocd account get-user-info`. + +--- + +### Step 4 – Argo Workflows UI RBAC + +1. Attempt to hit the Argo Workflows UI as each type of user. +2. If a user can log in but sees “no workflows”: + + * Check Argo Workflows server annotations `workflows.argoproj.io/rbac-rule*`. + * Confirm `groups` claim actually contains your `ghapp:*` group names. + * If using Dex, ensure Workflows is configured with `scopes: [openid, profile, email, groups]` and that it trusts the same issuer as Argo CD. ([Proactive Insights][2]) + +--- + +[1]: https://github.com/calypr/argo-helm/tree/feature/git-app-work "GitHub - calypr/argo-helm at feature/git-app-work" +[2]: https://hossted.com/knowledge-base/case-studies/devops/application-development/resolving-rbac-and-sso-issues-in-argo-workflows-with-azure-ad-and-dex/?utm_source=chatgpt.com "Resolving RBAC and SSO Issues in Argo Workflows with ..." 
+[3]: https://argo-cd.readthedocs.io/en/stable/operator-manual/rbac/?utm_source=chatgpt.com "RBAC Configuration - Declarative GitOps CD for Kubernetes" +[4]: https://argo-cd.readthedocs.io/en/release-2.0/operator-manual/ingress/?utm_source=chatgpt.com "Ingress Configuration - Declarative GitOps CD for Kubernetes" +[5]: https://github.com/argoproj/argo-cd/issues/21089?utm_source=chatgpt.com "ArgoCD behind an auth proxy · Issue #21089" diff --git a/docs/gitops-events.md b/docs/gitops-events.md new file mode 100644 index 00000000..56488080 --- /dev/null +++ b/docs/gitops-events.md @@ -0,0 +1,162 @@ +If your goal is to detect **“one or more pushes to the same pull request branch”** (i.e., new commits added to an open PR), there is exactly **one GitHub event** that represents this reliably: + +# ✅ **`pull_request` event with `action: synchronize`** + +This event fires **every time** someone pushes new commits to the branch associated with an open PR. + +### Why it's the right event + +* It batches *any number of git pushes* into a single conceptual signal: + *“The PR changed; its head SHA is updated.”* +* It is emitted whether: + + * the author pushes one commit + * multiple commits + * rebases + * force-pushes + * resolves merge conflicts + * GitHub automatically updates the branch + +### What you get + +GitHub always sends: + +```json +{ + "action": "synchronize", + "pull_request": { + "number": 42, + "head": { + "ref": "feature-x", + "sha": "abc123..." + } + }, + "installation": { "id": 123456 } +} +``` + +This is the canonical trigger for GitOps automation that needs to: + +* run validation workflows +* run metadata checks +* preview manifests +* lint pipeline configs +* update PR statuses/comments + +--- + +# Optional companion events + +Depending on how rich your GitOps automation is, you may also subscribe to these: + +### **1. `pull_request` with `opened`** + +Triggered when a new PR is created. +Useful for initial workflow runs. + +### **2. 
`pull_request` with `reopened`** + +If repos enforce closing PRs on failures, this allows reactivation. + +### **3. `push`** (optional) + +Only needed if you want: + +* behavior on branches **without** a PR +* behavior on default branch (e.g., `main`) +* behavior on tags + +For “one or more pushes *to a PR*,” you don't need `push`. + +### **4. `issue_comment`** (optional) + +If you want your automation to react to `/retest`, `/sync`, `/approve`, etc. + +--- + +# 🚫 Events that do *not* represent PR updates + +* `pull_request_review` +* `pull_request_review_comment` +* `check_suite` +* `check_run` + +These are *downstream* events, not indicators of new commits. + +--- + +# 🎯 Final Answer + +To represent **“one or more pushes to a pull request”**, subscribe to: + +```text +pull_request: + - synchronize # commits pushed to PR + - opened # initial PR creation + - reopened # optional +``` + +And treat **`synchronize`** as the canonical signal for “new code just arrived.” + +In “pure” GitOps, you treat **Git as the source of truth**, not as a log sink — so you usually **don’t** write raw workflow logs back into the repo. + +Here’s how I’d structure it. + +--- + +### 1. Where the *logs* live + +When a push happens to a branch like `main` and your workflow runs (Argo Workflows, Tekton, Actions, etc.): + +* **Runtime logs** + + * Stay in the **workflow engine / cluster**: + + * Pod logs in Kubernetes + * Argo Workflows UI / `argo logs` + * Optionally streamed/archived to: + + * Object storage (S3/MinIO/GCS) + * Log aggregation (Loki, ELK, CloudWatch, etc.) + +* **Artifacts / result bundles** + + * Stored in an **artifact repository** (S3, GCS, MinIO, Artifactory, etc.). + * Your workflow can expose a URL back to those artifacts. + +👉 **Do not** commit raw logs into the Git repo — it bloats history and breaks the “config only” contract. + +--- + +### 2. Where the *status* goes + +For GitOps, status tends to feed back to **two places**: + +1. 
**Git provider (GitHub) – for humans and automations** + + * Use: + + * **Commit status** or **Checks API** on the `main` commit + * Optional: a short **commit comment** with links to logs/artifacts + * This gives a simple signal: + + * ✅ success / ❌ failure + * * URL to detailed logs or report + +2. **Cluster GitOps objects – for controllers** + + * Argo CD / Flux will maintain status in: + + * `Application` (Argo CD) + * `Kustomization` / `HelmRelease` (Flux) + * These CRDs show *deployment* status (Synced, Healthy, Degraded), not CI logs. + +--- + +### 3. A good pattern in one sentence + +> On push to `main`, the workflow writes **logs and artifacts** to an external store, writes a **succinct status** (✅/❌ + link) back to the **commit in GitHub**, and lets **Argo CD/Flux** express the deployment state via their own status fields. + +That keeps Git clean, keeps ops logs in the right place, and still gives you a single-click path from the commit to the full workflow history. + + diff --git a/docs/gitops-status.md b/docs/gitops-status.md new file mode 100644 index 00000000..9a084779 --- /dev/null +++ b/docs/gitops-status.md @@ -0,0 +1,273 @@ +# ADR: GitOps Status Matrix for GitHub and Argo CD + +- **ADR ID:** 00X-gitops-status-matrix +- **Status:** Proposed +- **Date:** 2025-12-01 +- **Owner:** GitOps / Platform Team + +--- + +## 1. Context + +In our environment, a **workflow must run to prepare data for a release** before we consider a change “deployable.” +We use: + +- GitHub as the system of record for application and configuration code +- Argo CD as the GitOps controller for Kubernetes +- Argo Workflows (or equivalent) to prepare and validate release data inside the cluster +- A GitHub App for authenticated status updates + +Today, there is no consistent pattern that: + +1. Maps **GitHub events** (push to `main`, new commits on a PR) to +2. 
**GitHub status signals** (Commit Status, Deployment Status) + +…in a way that reflects workflow execution, Argo CD sync health, and user-facing status in GitHub. + +This ADR defines that mapping and the responsibilities of the Argo CD–managed workflow. + +--- + +## 2. Decision + +We define a **GitOps status matrix** connecting: + +- GitHub events +- Argo CD triggers +- Workflows +- GitHub Status signals + +Rules adopted: + +1. **A workflow must run** for both PRs and `main` before changes are deployable. +2. **The workflow updates GitHub status**, not Argo CD directly. +3. Argo CD manages and triggers the workflow, ensuring all workflow definitions and runtime are in sync. +4. GitHub status must clearly express workflow execution state and environment deployment readiness. + +--- + +## 3. Architecture Overview + +### 3.1 Status Matrix + +| GitHub Event →
GitHub Status ↓ | `push` to `main` | `pull_request` with `action=synchronize` |
+|-----------------------------------|------------------|-------------------------------------------|
+| **Commit Status** (Statuses API) | **“Release Data Prep / Main”**<br><br>1. Commit pushed to `main`.<br>2. Argo CD sees new revision and syncs the workflow definition.<br>3. Trigger workflow.<br>4. Workflow updates commit status (`pending` → `success` or `failure`). | **“Release Data Prep / PR”**<br><br>1. New commits added to PR branch.<br>2. Trigger workflow for PR head SHA.<br>3. Workflow sets commit status with context `release-data-prep`. |
+| **GitHub Deployment Status** (Deployments API) | **“Deploy to staging / production”**<br><br>1. Workflow success for `main`.<br>2. Argo CD Application is Synced & Healthy.<br>3. Workflow or Argo Notification updates Deployment Status. | **“Preview Environment” (optional)**<br><br>1. Preview namespace created.<br>2. Workflow success or failure updates GitHub Deployment Status (environment = `preview/pr-123`). |
+
+---
+
+## 4. Workflow Responsibilities and Steps
+
+### 4.1 High-Level Flow
+
+1. Argo CD syncs Application containing WorkflowTemplates/CronWorkflows.
+2. A Sync hook or Argo Events Sensor triggers a workflow run.
+3. Workflow prepares data, validates artifacts, and produces derived outputs.
+4. Workflow updates GitHub Commit Status.
+5. Workflow optionally updates Deployment Status.
+6. Argo CD exposes cluster health, but GitHub is the primary UX indicator.
+
+---
+
+## 4.2 Detailed Workflow Actions
+
+### Step 0 — Obtain Installation Access Token
+
+Workflow obtains a short-lived GitHub installation token:
+
+1. Create JWT from GitHub App private key.
+2. Call:
+   `POST /app/installations/{installation_id}/access_tokens`
+3. Save token as `GITHUB_TOKEN`.
+
+---
+
+### Step 1 — Set Commit Status to `pending`
+
+API call:
+
+```
+POST /repos/{owner}/{repo}/statuses/{sha}
+```
+
+Payload:
+
+```json
+{
+  "state": "pending",
+  "context": "release-data-prep",
+  "description": "Preparing release data in Argo Workflow",
+  "target_url": "https://argocd.example.org/applications/your-app?workflow_run_id=XYZ"
+}
+```
+
+---
+
+### Step 2 — Execute Data Preparation Tasks
+
+Common operations:
+
+- Validate metadata
+- Run ETL pipelines
+- Generate derived manifests
+- Upload artifacts to S3/MinIO
+- Register outputs in metadata/index systems
+
+Failures must propagate to step status.
+ +--- + +### Step 3 — Set Commit Status to `success` or `failure` + +Success payload: + +```json +{ + "state": "success", + "context": "release-data-prep", + "description": "Release data prepared and validated", + "target_url": "https://argocd.example.org/applications/your-app?workflow_run_id=XYZ" +} +``` + +Failure payload: + +```json +{ + "state": "failure", + "context": "release-data-prep", + "description": "Release data prep failed; see workflow logs", + "target_url": "https://argocd.example.org/applications/your-app?workflow_run_id=XYZ" +} +``` + +--- + +### Step 4 — Update GitHub Deployment Status + +#### 4.1 Create Deployment + +``` +POST /repos/{owner}/{repo}/deployments +``` + +Payload: + +```json +{ + "ref": "GIT_COMMIT_SHA", + "environment": "staging", + "description": "Argo CD environment deployment", + "auto_merge": false +} +``` + +#### 4.2 Update Deployment Status + +``` +POST /repos/{owner}/{repo}/deployments/{id}/statuses +``` + +Payload: + +```json +{ + "state": "success", + "log_url": "https://argocd.example.org/applications/my-app", + "environment_url": "https://my-service.example.org", + "description": "Deployment completed successfully" +} +``` + +--- + +## 5. 
Mermaid Diagrams + +### 5.1 PR Synchronize → Workflow → Commit Status + +```mermaid +sequenceDiagram + autonumber + participant GH as GitHub + participant AC as Argo CD + participant WF as Argo Workflow + participant API as GitHub API + + GH->>AC: pull_request.synchronize webhook + AC->>WF: Trigger workflow (prepare data) + WF->>API: Commit Status = pending + WF->>WF: Run prep steps (validate, generate, upload) + alt Success + WF->>API: Commit Status = success + else Failure + WF->>API: Commit Status = failure + end +``` + +--- + +### 5.2 Push to Main → Workflow → Deployment Status + +```mermaid +sequenceDiagram + autonumber + participant GH as GitHub + participant AC as Argo CD + participant WF as Workflow + participant API as GitHub Deployments API + + GH->>AC: push-to-main webhook + AC->>WF: Trigger release-data-prep workflow + WF->>API: Commit Status = pending + WF->>WF: Run data prep + WF->>API: Commit Status = success + AC->>AC: Sync Application to cluster + AC->>API: Deployment Status = in_progress + AC->>API: Deployment Status = success +``` + +--- + +## 6. How Argo CD Participates + +Argo CD: + +- Syncs workflow definitions +- Triggers workflows via hooks/events +- Ensures desired state applied to cluster +- Does **not** call GitHub directly, but the workflow does +- Surfaces cluster health for visibility + +--- + +## 7. Consequences + +### 7.1 Positive + +- Clear mapping from GitHub → workflows → GitHub signals. +- PR authors understand release readiness immediately. +- Deployment history visible under GitHub “Environments.” + +### 7.2 Negative + +- Requires secure GitHub App secret handling. +- Workflow-side token issuance complexity. +- If workflow status update step fails, GitHub status may appear stale. + +--- + +## 8. Alternatives Considered + +1. GitHub Actions performing data prep alone +2. Only reporting Deployment Status +3. Pushing workflow results into Git history + +--- + +## 9. 
Open Questions / Next Steps + +- Should PR comments summarize validation results? +- Should preview environments be mandatory for all PRs? +- Should we standardize commit status context names? + diff --git a/docs/roles-README.md b/docs/roles-README.md new file mode 100644 index 00000000..6459443d --- /dev/null +++ b/docs/roles-README.md @@ -0,0 +1,129 @@ +# repo-registration-roles + +`repo-registration-roles` is a small **RBAC overlay chart** that derives Argo CD and +Kubernetes permissions from the existing `values.repoRegistrations` configuration +in your Argo stack. + +It assumes: + +- A single **GitHub App** is installed on multiple GitHub repositories. +- For each repository `https://github.com//.git`: + - There is an Argo CD **Application** named `-`. + - There is an Argo CD **Project** named `-`. + - There is a workflow namespace named: `wf--`. +- Authentication is handled via OIDC → `authz-adapter` → NGINX → Argo CD / Argo Workflows. +- The `authz-adapter` emits `X-Auth-Request-Groups` including group names: + - `wf---writers` + - `wf---readers` + - plus a global admin group, e.g. `wf-admins`. + +This chart does **not** create Applications, Projects, Namespaces or Workflows. +It only creates: + +- An Argo CD RBAC ConfigMap (`argocd-rbac-cm`) with policies derived from `repoRegistrations`. +- Per-repo `Role` and `RoleBinding` objects in the workflow namespaces `wf--`. + +--- + +## Architecture + +```mermaid +flowchart LR + subgraph GH["GitHub / IdP / Fence"] + REG["values.repoRegistrations[]"] + APP["Single GitHub App"] + end + + subgraph CL["Kubernetes Cluster"] + subgraph IN["Ingress + authz-adapter"] + NGINX["NGINX Ingress
auth_request"]
+      AUTHZ["authz-adapter<br/>OIDC + repoRegistrations"]
+    end
+
+    subgraph AC["Namespace: argocd"]
+      ARGOCD["Argo CD Server"]
+      RBACCM["argocd-rbac-cm<br/>policies from repoRegistrations"]
+    end
+
+    subgraph WF["Namespaces: wf-<org>-<repo>"]
+      WFNS["Role / RoleBinding<br/>per tenant"]
+    end
+  end
+
+  REG -->|Helm render| RBACCM
+  REG -->|Helm render| WFNS
+
+  APP -->|webhooks / status| ARGOCD
+
+  U["User Browser / CLI"] -->|HTTPS| NGINX
+  NGINX -->|auth_request| AUTHZ
+  AUTHZ -->|"200 + X-Auth-Request-Groups:
wf-<org>-<repo>-writers, wf-<org>-<repo>-readers"| NGINX
+  NGINX -->|authorized traffic + groups| ARGOCD
+  NGINX -->|authorized traffic + groups| WFNS
+```
+
+---
+
+## Access control model
+
+For each `repoRegistrations[]` entry with `repoUrl: https://github.com/<org>/<repo>.git`:
+
+- Derived names:
+  - `appName` = `<org>-<repo>` (Argo CD Application name)
+  - `projectName` = `<org>-<repo>` (Argo CD Project name)
+  - `workflow ns` = `wf-<org>-<repo>`
+- Groups:
+  - `writer group` = `wf-<org>-<repo>-writers`
+  - `reader group` = `wf-<org>-<repo>-readers`
+- Global admin group:
+  - `adminGroup` = configurable (`wf-admins` by default)
+
+### Argo CD RBAC
+
+- `adminGroup` → `role:wf-admin` → `*,*,*,allow` (full admin).
+- `wf-<org>-<repo>-writers`:
+  - `get/sync/action` on Application `<org>-<repo>/<org>-<repo>`.
+  - `get` on Project `<org>-<repo>`.
+- `wf-<org>-<repo>-readers`:
+  - `get` on Application `<org>-<repo>/<org>-<repo>`.
+  - `get` on Project `<org>-<repo>`.
+
+### Workflow namespace RBAC
+
+Namespace: `wf-<org>-<repo>`
+
+- `wf-<org>-<repo>-writers` (+ `adminGroup`):
+  - Full CRUD on `Workflows`, `WorkflowTemplates`, `CronWorkflows`.
+  - `get/list/watch` on `pods`, `pods/log`, `configmaps`, `secrets`.
+- `wf-<org>-<repo>-readers`:
+  - `get/list/watch` on the same resources (read-only).
+
+---
+
+## Usage
+
+1. Ensure your main Argo stack already defines `values.repoRegistrations` and
+   creates the appropriate Applications and workflow namespaces.
+2. Install this chart in the same cluster:
+
+```bash
+helm upgrade --install repo-registration-roles \
+  ./helm/argo-stack/overlays/repo-registration-roles \
+  -n argocd \
+  -f my-values.yaml
+```
+
+> `my-values.yaml` should include the same `repoRegistrations:` block you use
+> elsewhere; this overlay does not change that schema.
+
+---
+
+## Files in this chart
+
+- `Chart.yaml` – Helm chart metadata.
+- `values.yaml` – Default values with comments; you typically override this.
+- `templates/argocd-rbac-cm.yaml` – Generates `argocd-rbac-cm.policy.csv`.
+- `templates/workflow-rbac.yaml` – Generates Roles/RoleBindings in `wf-<org>-<repo>`.
+
+See `troubleshooting.md` for step-by-step debugging instructions for group headers
+and RBAC.
diff --git a/docs/roles-troubleshooting.md b/docs/roles-troubleshooting.md
new file mode 100644
index 00000000..01876ad7
--- /dev/null
+++ b/docs/roles-troubleshooting.md
@@ -0,0 +1,171 @@
+# Troubleshooting repo-registration-roles
+
+This guide helps you debug issues when tenants do not see the expected Argo CD
+applications or workflows, or when group-based permissions are not enforced as
+expected.
+
+---
+
+## 1. Verify `repoRegistrations` values
+
+1. Check that `repoRegistrations` is present in the values file passed to this chart:
+   ```bash
+   grep -A10 '^repoRegistrations:' my-values.yaml
+   ```
+2. Confirm each entry has a valid `repoUrl` of the form:
+   ```text
+   https://github.com/<org>/<repo>.git
+   ```
+3. Render the chart locally and inspect the generated RBAC:
+   ```bash
+   helm template repo-registration-roles ./helm/repo-registration-roles -f my-values.yaml > rendered.yaml
+   ```
+
+Look for:
+
+- `ConfigMap argocd-rbac-cm` with `policy.csv` lines for your tenant.
+- `Role` / `RoleBinding` resources in the expected namespace `wf-<org>-<repo>`.
+
+---
+
+## 2. Check Argo CD RBAC mapping
+
+1. Confirm the `argocd-rbac-cm` in the cluster matches what you expect:
+   ```bash
+   kubectl get configmap argocd-rbac-cm -n argocd -o yaml
+   ```
+2. Inspect `data.policy.csv` and verify:
+   - There is a `role:wf-admin` line and a `g, <adminGroup>, role:wf-admin`.
+   - For each `repoUrl`, there are `role:wf-<org>-<repo>-writer` and
+     `role:wf-<org>-<repo>-reader` lines.
+3. Log in to Argo CD (through the ingress / auth proxy) and run:
+   ```bash
+   argocd account get-user-info
+   ```
+   You should see the `groups` list include:
+   - `wf-<org>-<repo>-writers` or `wf-<org>-<repo>-readers`
+   - `wf-admins` for global admins (if configured).
+
+4. Validate effective permissions:
+   ```bash
+   argocd account can-i get applications '<org>-<repo>/<org>-<repo>'
+   argocd account can-i sync applications '<org>-<repo>/<org>-<repo>'
+   ```
+
+Expected:
+
+- Writers: `get` = allowed, `sync` = allowed.
+- Readers: `get` = allowed, `sync` = denied.
+
+---
+
+## 3. Verify workflow namespace RBAC
+
+1. Compute the workflow namespace:
+   ```text
+   wf-<org>-<repo>
+   ```
+   For `repoUrl: https://github.com/bwalsh/nextflow-hello-project.git`:
+   - `org = bwalsh`
+   - `repo = nextflow-hello-project`
+   - `namespace = wf-bwalsh-nextflow-hello-project`
+2. Confirm Roles and RoleBindings exist:
+   ```bash
+   kubectl get role,rolebinding -n wf-bwalsh-nextflow-hello-project
+   ```
+3. Use `kubectl auth can-i` with impersonated groups:
+   ```bash
+   NS=wf-bwalsh-nextflow-hello-project
+
+   kubectl auth can-i list workflows.argoproj.io --as-group=wf-bwalsh-nextflow-hello-project-writers -n "$NS"
+
+   kubectl auth can-i create workflows.argoproj.io --as-group=wf-bwalsh-nextflow-hello-project-readers -n "$NS"
+   ```
+
+Expected:
+
+- Writers: `list/create` = **yes**.
+- Readers: `list` = **yes**, `create` = **no**.
+
+If the checks fail, inspect the corresponding `Role` and `RoleBinding` definitions
+in `rendered.yaml` and in the live cluster.
+
+---
+
+## 4. Validate X-Auth-Request-Groups end-to-end
+
+1. **authz-adapter output**
+
+   - Increase logging or add a debug endpoint in `authz-adapter` to print:
+     - User email (`sub` / `preferred_username` / `email`).
+     - Derived groups, e.g. `wf-<org>-<repo>-writers`.
+
+   Ensure that for a given user:
+
+   - Members of `adminUsers` in `repoRegistrations[]` get:
+     - `wf-<org>-<repo>-writers` (and optionally `wf-admins`).
+   - Members of `readUsers` get:
+     - `wf-<org>-<repo>-readers`.
+
+2. **Ingress (NGINX) headers**
+
+   - Check the ingress configuration to confirm `auth_request` and header
+     propagation:
+     ```bash
+     kubectl get ingress -A -o yaml | grep -A5 'auth-request'
+     ```
+   - Use a temporary echo service behind the same ingress to inspect headers:
+     ```bash
+     curl -k -H "Host: <host>" https://<ingress-host>/debug/headers
+     ```
+     Look for:
+     ```text
+     X-Auth-Request-User: ...
+     X-Auth-Request-Groups: wf-<org>-<repo>-writers, wf-admins
+     ```
+
+3. 
**Argo CD sees the groups** + + - From the user’s environment, run: + ```bash + argocd account get-user-info + ``` + Confirm the `Groups` section contains the `wf-*` groups from above. + + If the groups appear at the ingress echo service but **not** in Argo CD, + check: + + - Whether a proxy or extra layer is stripping the headers. + - Whether TLS termination is happening in a place that changes headers. + +--- + +## 5. Common mistakes + +- **Wrong repoUrl format** + + If `repoUrl` is not `https://github.com//.git`, the Helm templates + may derive incorrect `` / `` segments, leading to mismatched group + names and namespaces. + +- **Namespaces not created** + + This chart does not create namespaces. Make sure `wf--` exists and + is managed by your base Argo stack or another chart. + +- **Argo CD Application name mismatch** + + The chart assumes the Application name and Project name are both `-`. + If you use a different naming convention, you must adjust the RBAC templates + accordingly. + +- **authz-adapter not configured to emit wf-* groups** + + The overlay cannot work unless `X-Auth-Request-Groups` includes values like + `wf---writers` and `wf---readers`. Make sure your + authz-adapter derives those consistently from `repoRegistrations` and/or IdP + group memberships. + +If you have verified all of the above and things still do not behave as +expected, rerun `helm template` and carefully compare the rendered RBAC +resources against what is actually running in the cluster. diff --git a/docs/testing.md b/docs/testing.md index 2a2e8cb2..2954b965 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -450,6 +450,146 @@ To test with different `repoRegistrations` configurations: 2. Modify test setup to use the new file 3. Adjust expected counts and values accordingly +## Authz-Adapter Testing + +The authz-adapter is a Flask-based authorization service that validates user access. It has its own comprehensive test suite located in `authz-adapter/tests/`. 
+ +### Test Structure + +``` +authz-adapter/ +├── app.py # Main Flask application +├── Makefile # Test commands +├── pytest.ini # Pytest configuration +├── requirements.txt # Runtime dependencies +├── requirements-dev.txt # Development/test dependencies +└── tests/ + ├── __init__.py + ├── conftest.py # Shared fixtures + ├── test_app.py # Main application tests + ├── test_groups.py # Group authorization tests + ├── test_integration.py # Integration tests + └── test_performance.py # Performance tests +``` + +### Running Authz-Adapter Tests + +From the `authz-adapter/` directory: + +```bash +# Install dependencies and run all tests with coverage +make test + +# Run only unit tests +make test-unit + +# Run with coverage report +make test-coverage + +# Clean up test artifacts +make clean +``` + +### Test Classes + +#### TestAppBasic +Basic Flask application tests: +- App creation verification +- Health check endpoint (`/healthz`) +- Missing authorization header handling + +#### TestFetchUserDoc +Tests for fetching user documents from Fence: +- Bearer token validation +- Service token fallback +- Non-200 response handling +- Timeout and connection error handling +- Invalid JSON response handling + +#### TestCheckEndpoint +Tests for the `/check` authorization endpoint: +- Valid user authorization +- Invalid/inactive user handling +- Response header validation + +#### TestGetDebuggingVars +Tests for the `get_debugging_vars()` function that supports debug mode: + +| Test | Description | +|------|-------------| +| `test_get_debugging_vars_returns_none_when_no_debug_email` | Returns `(None, None)` when `DEBUG_EMAIL` env is not set | +| `test_get_debugging_vars_with_debug_email_env_only` | Works with only `DEBUG_EMAIL` env var set | +| `test_get_debugging_vars_with_debug_email_and_groups_env` | Works with both `DEBUG_EMAIL` and `DEBUG_GROUPS` env vars | +| `test_get_debugging_vars_query_params_override_env` | Query params override env vars when `DEBUG_EMAIL` is set | +| 
`test_get_debugging_vars_query_email_only` | Query param `debug_email` works with env `DEBUG_EMAIL` | +| `test_get_debugging_vars_query_groups_with_env_email` | Query param `debug_groups` works with env `DEBUG_EMAIL` | +| `test_get_debugging_vars_single_group` | Single group parsing works correctly | +| `test_get_debugging_vars_query_params_ignored_without_debug_email_env` | Query params are ignored without `DEBUG_EMAIL` env (security gate) | + +#### TestCheckWithDebuggingVars +Tests for `/check` endpoint behavior with debug variables: + +| Test | Description | +|------|-------------| +| `test_check_bypasses_auth_with_debug_email_and_groups` | Auth is bypassed when both `DEBUG_EMAIL` and `DEBUG_GROUPS` are set | +| `test_check_falls_back_to_auth_when_only_debug_email_set` | Falls back to real auth when only `DEBUG_EMAIL` is set (no groups) | +| `test_check_with_debug_query_params` | Query params `debug_email` and `debug_groups` work correctly | +| `test_check_with_debug_groups_override_in_query` | Query `debug_groups` overrides env `DEBUG_GROUPS` | +| `test_check_query_params_ignored_without_debug_email_env` | Query params ignored without `DEBUG_EMAIL` env (security) | +| `test_check_without_auth_fails_when_debug_incomplete` | Returns 401 when debug vars incomplete and no auth provided | +| `test_check_with_empty_debug_groups` | Empty `DEBUG_GROUPS` falls back to real auth | + +### Debug Mode Environment Variables + +The authz-adapter supports debug mode for testing purposes: + +| Variable | Description | +|----------|-------------| +| `DEBUG_EMAIL` | When set, enables debug mode and allows `debug_email`/`debug_groups` query params | +| `DEBUG_GROUPS` | Comma-separated list of groups to assign to the debug user | + +**Query Parameters** (only work when `DEBUG_EMAIL` env is set): + +| Parameter | Description | +|-----------|-------------| +| `debug_email` | Override the debug email address | +| `debug_groups` | Override the debug groups (comma-separated) | + 
+**Example: Bypass auth for testing** + +```bash +# Set environment variables +export DEBUG_EMAIL="test@example.com" +export DEBUG_GROUPS="argo-runner,argo-viewer" + +# Start the authz-adapter +python app.py + +# Test /check endpoint without Authorization header +curl http://localhost:8080/check +# Returns 200 with X-Auth-Request-Groups: argo-runner,argo-viewer +``` + +**Example: Use query params to override** + +```bash +# With DEBUG_EMAIL env set, use query params +curl "http://localhost:8080/check?debug_email=other@example.com&debug_groups=argo-admin" +# Returns 200 with: +# X-Auth-Request-Email: other@example.com +# X-Auth-Request-Groups: argo-admin +``` + +### Test Coverage + +The authz-adapter test suite maintains >80% code coverage. Run the following command to see the current coverage report: + +```bash +# Run tests with coverage report +cd authz-adapter +make test-coverage +``` + ## Related Documentation - [RepoRegistration User Guide](repo-registration-guide.md) - How to use repoRegistrations @@ -458,12 +598,20 @@ To test with different `repoRegistrations` configurations: ## Summary -The test suite provides comprehensive validation of Helm template rendering for the `repoRegistrations` feature. It ensures that: +The test suites provide comprehensive validation for both Helm template rendering and authz-adapter functionality: +**Helm Template Tests** (`tests/`): - ✅ All expected Kubernetes resources are generated - ✅ Resource names and labels are correct - ✅ Configurations match input values - ✅ S3 bucket settings are properly templated - ✅ GitHub webhooks are configured correctly -Run the tests regularly during development to catch template errors early and maintain high quality standards for the chart. 
+**Authz-Adapter Tests** (`authz-adapter/tests/`): +- ✅ Authorization flow with Fence integration +- ✅ Debug mode for testing without Fence +- ✅ Query param and env var precedence +- ✅ Error handling for various failure scenarios +- ✅ Performance and resource usage validation + +Run the tests regularly during development to catch errors early and maintain high quality standards. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 61bc810c..24de5e3e 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -13,6 +13,10 @@ Data managers, developers, and platform administrators using the Argo Stack for ## Table of Contents - [General Troubleshooting](#general-troubleshooting) +- [Ingress and Connectivity Troubleshooting](#ingress-and-connectivity-troubleshooting) +- [Environment-Specific Ingress Configuration](#environment-specific-ingress-configuration) + - [AWS EKS Configuration](#aws-eks-configuration) + - [On-Premises / Bare Metal Configuration](#on-premises--bare-metal-configuration) - [Workflow Troubleshooting](#workflow-troubleshooting) - [Argo Events Issues](#argo-events-issues) - [Secret and Vault Issues](#secret-and-vault-issues) @@ -117,6 +121,1693 @@ kubectl get eventsources -A --- +## Ingress and Connectivity Troubleshooting + +### Issue: Connection Refused but Internal Services Work + +**Symptoms:** +Internal cluster connectivity works perfectly, but external access fails: + +```bash +# ✅ Internal service access works: +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://argo-stack-argo-workflows-server.argo-workflows:2746/ +# Returns 200 OK with HTML content + +# ✅ ExternalName proxy also works: +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://argo-stack-argo-workflows-server-proxy.argo-stack:2746/ +# Returns 200 OK + +# ❌ But external access fails: +curl https://calypr-demo.ddns.net/workflows +# curl: (7) Failed to connect to calypr-demo.ddns.net 
port 443 after 2 ms: Could not connect to server +``` + +**Cause:** This "Connection refused" error at the network level means the **ingress-nginx controller's LoadBalancer service** is not exposing ports to the external network. This is distinct from a 404 error (which would mean the ingress is reachable but routing is misconfigured). + +Common causes: +- LoadBalancer service is pending (no external IP provisioned) +- NodePort is not exposed in firewall/security groups +- DNS is not pointing to the correct IP +- Cloud provider LoadBalancer controller is not configured + +**Solution - Step-by-Step Diagnosis:** + +#### 1. Check the ingress-nginx LoadBalancer Service + +```bash +# Check the service type and external IP +kubectl get svc -n ingress-nginx + +# Expected output for LoadBalancer type: +# NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) +# ingress-nginx-controller LoadBalancer 10.100.x.x 80:30080/TCP,443:30443/TCP + +# Expected output for NodePort type: +# NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) +# ingress-nginx-controller NodePort 10.100.x.x 80:30080/TCP,443:30443/TCP +``` + +#### 2. If EXTERNAL-IP is `` + +This means the cloud LoadBalancer hasn't been provisioned: + +```bash +# Check service events for errors +kubectl describe svc ingress-nginx-controller -n ingress-nginx + +# Common causes: +# - AWS Load Balancer Controller not installed (EKS) +# - Insufficient IAM permissions for LB creation +# - Subnet/VPC configuration issues +# - Quota exceeded for load balancers +``` + +**For AWS EKS:** See [Troubleshooting AWS LoadBalancer Pending](#troubleshooting-aws-loadbalancer-pending) for detailed AWS-specific steps including IAM permissions, subnet tagging, and AWS Load Balancer Controller setup. 
 + +Quick check: +```bash +# Check if AWS Load Balancer Controller is installed +kubectl get deployment -n kube-system aws-load-balancer-controller + +# If not installed, the Kubernetes service will stay in <pending> state +``` + +**For bare metal / on-premises clusters:** + +LoadBalancer type won't work without a load balancer controller. Options: +- Use MetalLB: https://metallb.universe.tf/ +- Switch to NodePort and configure external LB manually +- Use HostPort on specific nodes + +#### 3. If using NodePort, check external access + +```bash +# Get the NodePort for port 443 +kubectl get svc ingress-nginx-controller -n ingress-nginx -o jsonpath='{.spec.ports[?(@.port==443)].nodePort}' +# Example output: 30443 + +# Get node external IP +kubectl get nodes -o wide +# Note the EXTERNAL-IP of your nodes + +# Verify firewall allows traffic on the NodePort +# Then test: curl https://<node-external-ip>:<node-port>/ +``` + +#### 4. Verify DNS Resolution + +```bash +# Check that your domain resolves to the correct IP +nslookup calypr-demo.ddns.net + +# This should return the LoadBalancer external IP or Node external IP +# If it returns an incorrect IP, update your DNS +``` + +#### 5. Test Direct Access to the LoadBalancer IP + +```bash +# Get the LoadBalancer IP +LB_IP=$(kubectl get svc ingress-nginx-controller -n ingress-nginx -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +echo "LoadBalancer IP: $LB_IP" + +# If AWS NLB (uses hostname instead of IP): +LB_HOSTNAME=$(kubectl get svc ingress-nginx-controller -n ingress-nginx -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') +echo "LoadBalancer Hostname: $LB_HOSTNAME" + +# Test direct access +curl -v -k https://$LB_IP/workflows + +# If this works but your domain doesn't, the issue is DNS +``` + +#### 6. AWS-Specific: Check Security Groups + +See [AWS Security Group Configuration](#aws-security-group-configuration) for detailed security group verification. 
+ +The LoadBalancer security group must allow: +- Inbound 443 from 0.0.0.0/0 (or your IP range) +- Inbound 80 from 0.0.0.0/0 (for HTTP-01 ACME challenges) + +#### 7. Verify ingress-nginx Controller is Healthy + +```bash +# Check pods are running +kubectl get pods -n ingress-nginx + +# Check controller logs for errors +kubectl logs -n ingress-nginx -l app.kubernetes.io/name=ingress-nginx --tail=50 + +# Look for: +# - "successfully synced" messages (good) +# - Error loading certificate (TLS issue) +# - Backend connection errors +``` + +#### 8. kind Cluster Specific Issues + +If you're using **kind** (Kubernetes IN Docker), the networking works differently: + +**Problem:** MetalLB's external IP only exists inside the Docker network, not accessible from your host machine. + +**Solution for kind:** + +1. **Access via localhost** using the port mappings defined in your kind config: +```bash +# If you configured extraPortMappings for ports 80/443 +curl -k https://localhost/workflows + +# Update /etc/hosts to use localhost for your domain +echo "127.0.0.1 calypr-demo.ddns.net" | sudo tee -a /etc/hosts +curl -k https://calypr-demo.ddns.net/workflows +``` + +2. **Use NodePort instead of LoadBalancer** with kind: +```yaml +# kind-config.yaml +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +networking: + kubeProxyMode: "iptables" +nodes: +- role: control-plane + extraPortMappings: + - containerPort: 30080 # NodePort for HTTP + hostPort: 80 + - containerPort: 30443 # NodePort for HTTPS + hostPort: 443 +``` + +Then patch the ingress-nginx service: +```bash +kubectl patch svc ingress-nginx-controller -n ingress-nginx -p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"nodePort":30080},{"name":"https","port":443,"nodePort":30443}]}}' +``` + +3. 
**Check iptables rules inside the kind container** (not on host): +```bash +# Rules exist inside the kind node container, not on the host +docker exec -it kind-control-plane bash + +# Inside the container +iptables-save | grep KUBE-SERVICES +iptables-save | grep ingress-nginx +``` + +4. **Let's Encrypt certificates won't work in kind** - use self-signed certs instead: + +kind clusters aren't accessible from the internet, so Let's Encrypt HTTP-01 challenges will fail. You'll see "Kubernetes Ingress Controller Fake Certificate" in your browser. + +**Solution - Use self-signed certificates for kind:** + +```bash +# Create a self-signed certificate +openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout tls.key -out tls.crt \ + -subj "/CN=calypr-demo.ddns.net/O=calypr-demo" + +# Create the TLS secret +kubectl create secret tls calypr-demo-tls \ + -n argo-stack \ + --cert=tls.crt \ + --key=tls.key + +# Delete the Certificate resource (stop cert-manager from managing it) +kubectl delete certificate calypr-demo-tls -n argo-stack + +# Remove cert-manager annotation from ingress +kubectl annotate ingress ingress-authz-workflows -n argo-stack cert-manager.io/cluster-issuer- +``` + +Your browser will show a security warning (expected for self-signed certs), but you can proceed. + +--- + +### Issue: kube-proxy Not Creating iptables/nftables Rules + +**Symptoms:** +- NodePort connections fail (Connection refused) +- Testing `curl localhost:` fails +- No KUBE-* chains in iptables/nftables output + +**Cause:** kube-proxy is configured for iptables mode but the system uses nftables, and no rules are being created. + +**Diagnosis:** + +1. 
**Check if kube-proxy rules exist:** +```bash +# On systems using iptables-nft backend +sudo nft list ruleset | grep KUBE-SERVICES + +# On systems using iptables-legacy +sudo iptables-save | grep KUBE-SERVICES + +# If you get "incompatible, use 'nft' tool" error: +# Your system uses nftables but you're trying to use iptables commands +``` + +2. **Verify which iptables backend is active:** +```bash +sudo update-alternatives --display iptables +# Look for: link currently points to /usr/sbin/iptables-nft +``` + +3. **Check kube-proxy configuration:** +```bash +kubectl get cm kube-proxy -n kube-system -o yaml | grep "mode:" +# Should show: mode: iptables or mode: nft +``` + +**Solution:** + +**For kind clusters:** +- kube-proxy runs inside the kind container +- Check rules from inside: `docker exec -it kind-control-plane iptables-save` +- The host's iptables/nftables are separate from the kind node's + +**For bare metal/VM clusters with nftables:** + +If your system uses iptables-nft and kube-proxy shows "Using iptables Proxier" but creates no rules: + +1. **Verify kube-proxy mode in ConfigMap:** +```bash +kubectl edit cm kube-proxy -n kube-system +``` + +Ensure `mode: iptables` is set (it should work with iptables-nft). + +2. **Restart kube-proxy:** +```bash +kubectl delete pod -n kube-system -l k8s-app=kube-proxy +``` + +3. **Verify rules are created:** +```bash +# Wait 30 seconds, then check +sudo nft list ruleset | grep KUBE-SERVICES +``` + +4. **If still no rules, check kube-proxy logs:** +```bash +kubectl logs -n kube-system -l k8s-app=kube-proxy --tail=100 +# Look for errors about iptables/nftables initialization +``` + +--- + +### Issue: Let's Encrypt Certificate Not Issuing (Fake Certificate Shown) + +**Symptoms:** +- Browser shows "Kubernetes Ingress Controller Fake Certificate" +- Certificate status shows `Ready: False` with reason `DoesNotExist` +- CertificateRequest or Challenge resources stuck in pending state + +**Diagnosis:** + +1. 
**Check Certificate status:** +```bash +kubectl describe certificate calypr-demo-tls -n argo-stack + +# Look for conditions showing why it's not ready +# Common reasons: DoesNotExist, Pending, Failed +``` + +2. **Check CertificateRequest:** +```bash +kubectl get certificaterequest -n argo-stack +kubectl describe certificaterequest -n argo-stack + +# Check for failure reasons +``` + +3. **Check ACME Challenge (for Let's Encrypt):** +```bash +kubectl get challenges -A +kubectl describe challenge -n argo-stack + +# Look for HTTP-01 or DNS-01 challenge status +``` + +4. **Check cert-manager logs:** +```bash +kubectl logs -n cert-manager -l app=cert-manager --tail=100 +kubectl logs -n cert-manager -l app=webhook --tail=100 +``` + +**Common Causes and Solutions:** + +#### Cause 1: Domain Not Accessible from Internet (kind/local clusters) + +**For kind or local development clusters**, Let's Encrypt cannot reach your domain to verify ownership via HTTP-01 challenge. + +**Solution:** Use self-signed certificates (see [kind Cluster Specific Issues](#8-kind-cluster-specific-issues) section). + +#### Cause 2: HTTP-01 Challenge Fails - Port 80 Not Reachable + +Let's Encrypt needs to reach `http://your-domain/.well-known/acme-challenge/` on port 80. 
+ +**Check:** +```bash +# Verify ingress responds on port 80 +curl -v http://calypr-demo.ddns.net/.well-known/acme-challenge/test + +# Check if port 80 is open in firewall/security groups +# AWS: Check security group allows inbound port 80 from 0.0.0.0/0 +# On-prem: Check firewall allows port 80 from Let's Encrypt IPs +``` + +**Solution:** +```bash +# Ensure LoadBalancer/NodePort exposes port 80 +kubectl get svc ingress-nginx-controller -n ingress-nginx + +# Should show: 80:xxxxx/TCP in PORT(S) column +``` + +#### Cause 3: Use DNS-01 Challenge for kind/Local Clusters + +**For kind clusters or when using dynamic DNS providers like No-IP.com**, HTTP-01 challenges won't work because: +- kind clusters aren't publicly accessible from the internet +- Dynamic DNS IPs may not route directly to your cluster + +**Solution: Use DNS-01 challenge with webhook solver** + +cert-manager doesn't have native No-IP.com support, but you can use the generic **webhook solver with custom scripts** or **acme-dns**: + +**Option A: Use acme-dns (Recommended for No-IP.com)** + +1. **Set up acme-dns server** (one-time setup): +```bash +# Deploy acme-dns in your cluster +kubectl apply -f https://raw.githubusercontent.com/joohoi/acme-dns/master/k8s/acme-dns-deployment.yaml + +# Or use the public acme-dns service at auth.acme-dns.io +``` + +2. **Install cert-manager acme-dns webhook**: +```bash +helm repo add cert-manager-webhook-acme-dns https://k8s-at-home.github.io/charts +helm install acme-dns-webhook cert-manager-webhook-acme-dns/cert-manager-webhook-acme-dns \ + -n cert-manager +``` + +3. **Register your domain with acme-dns** (follow prompts): +```bash +curl -X POST https://auth.acme-dns.io/register +# Returns: {"username":"xxx","password":"xxx","fulldomain":"xxx.auth.acme-dns.io","subdomain":"xxx"} +``` + +4. **Add CNAME record in No-IP.com**: +``` +_acme-challenge.calypr-demo.ddns.net CNAME +``` + +5. 
 **Create ClusterIssuer with acme-dns**: +```yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: your-email@example.com + privateKeySecretRef: + name: letsencrypt-prod-account-key + solvers: + - dns01: + acmeDNS: + host: https://auth.acme-dns.io + accountSecretRef: + name: acme-dns-credentials + key: acmedns.json +``` + +6. **Create the credentials secret**: +```bash +cat > acmedns.json <<EOF +{ + "calypr-demo.ddns.net": { + "username": "<username>", + "password": "<password>", + "fulldomain": "<fulldomain>", + "subdomain": "<subdomain>", + "allowfrom": [] + } +} +EOF + +kubectl create secret generic acme-dns-credentials \ + -n cert-manager \ + --from-file=acmedns.json +``` + +**Option B: Manual DNS-01 (Not recommended - use acme-dns instead)** + +Manual verification requires adding TXT records to No-IP.com each time a certificate renews. This is not practical for automated renewals. + +If you still want manual control, you'll need to: +1. Create a Certificate with manual approval +2. Check the Challenge resource for the required TXT record +3. Add the TXT record `_acme-challenge.calypr-demo.ddns.net` to No-IP.com +4. Wait for validation + +For automated renewals, use Option A (acme-dns) instead. 
+ +#### Debugging DNS-01 Challenge Flow + +If you've configured DNS-01 challenges but still see self-signed certificates, follow these debugging steps: + +**Step 1: Verify ClusterIssuer Configuration** + +```bash +# Check ClusterIssuer exists and is ready +kubectl get clusterissuer +kubectl describe clusterissuer letsencrypt-prod + +# Look for status: Ready: True +# If not ready, check the status conditions for error messages +``` + +**Step 2: Check Certificate Resource Status** + +```bash +# Check certificate status +kubectl get certificate -n argo-stack +kubectl describe certificate calypr-demo-tls -n argo-stack + +# Look for: +# - Ready: False (certificate not issued) +# - Status conditions showing the reason (e.g., "Issuing", "NotReady") +# - Last transition time (stuck?) +``` + +**Step 3: Inspect CertificateRequest** + +```bash +# List certificate requests +kubectl get certificaterequest -n argo-stack + +# Describe the most recent one +kubectl describe certificaterequest -n argo-stack | head -50 + +# Look for: +# - Approved: True +# - Ready: False +# - Status message indicating DNS-01 challenge state +``` + +**Step 4: Check Challenge Resources (DNS-01 specific)** + +```bash +# List all challenges +kubectl get challenges -A + +# Describe the challenge +kubectl describe challenge -n argo-stack + +# Look for: +# - Type: DNS-01 +# - State: pending, valid, invalid, or errored +# - Reason field with specific error messages +# - Presented: True (DNS record was created) +``` + +**Step 5: Verify acme-dns Credentials Secret** + +If using acme-dns: + +```bash +# Check secret exists +kubectl get secret acme-dns-credentials -n cert-manager + +# Verify the secret has the correct key +kubectl get secret acme-dns-credentials -n cert-manager -o jsonpath='{.data.acmedns\.json}' | base64 -d | jq . 
+ +# Should return JSON with your domain configuration: +# { +# "calypr-demo.ddns.net": { +# "username": "...", +# "password": "...", +# "fulldomain": "xxx.auth.acme-dns.io", +# "subdomain": "xxx", +# "allowfrom": [] +# } +# } +``` + +**Step 6: Verify CNAME Record** + +```bash +# Check if CNAME record exists for _acme-challenge subdomain +nslookup -type=CNAME _acme-challenge.calypr-demo.ddns.net + +# Or use dig +dig _acme-challenge.calypr-demo.ddns.net CNAME +short + +# Should return something like: xxx.auth.acme-dns.io +``` + +If the CNAME is missing, add it to your DNS provider (e.g., No-IP.com): +``` +_acme-challenge.calypr-demo.ddns.net CNAME +``` + +**Step 7: Check acme-dns TXT Record** + +```bash +# Get the fulldomain from your secret +FULLDOMAIN=$(kubectl get secret acme-dns-credentials -n cert-manager -o jsonpath='{.data.acmedns\.json}' | base64 -d | jq -r '."calypr-demo.ddns.net".fulldomain') + +echo "Full domain: $FULLDOMAIN" + +# Check if TXT record is created on acme-dns +dig @auth.acme-dns.io $FULLDOMAIN TXT +short + +# During a challenge, you should see a TXT record with the validation token +``` + +**Step 8: Check cert-manager Logs** + +```bash +# cert-manager controller logs (handles Certificate resources) +kubectl logs -n cert-manager -l app=cert-manager --tail=100 --follow + +# cert-manager webhook logs (handles DNS-01 challenge creation) +kubectl logs -n cert-manager -l app=webhook --tail=100 + +# Look for: +# - "DNS record created" or "DNS propagation check" +# - acme-dns API call logs +# - Authentication errors +# - "challenge not ready" or timeout messages +``` + +**Step 9: Verify acme-dns Webhook (if installed)** + +If you installed the acme-dns webhook: + +```bash +# Check webhook pod is running +kubectl get pods -n cert-manager | grep acme-dns + +# Check webhook logs +kubectl logs -n cert-manager -l app.kubernetes.io/name=cert-manager-webhook-acme-dns --tail=50 + +# Test webhook connectivity to acme-dns server +kubectl run -it --rm debug 
--image=curlimages/curl --restart=Never -- \ + curl -v https://auth.acme-dns.io/health +``` + +**Step 10: Manual Challenge Validation** + +Test if Let's Encrypt can validate your DNS-01 challenge: + +```bash +# Get the challenge token from the Challenge resource +kubectl get challenge -n argo-stack -o yaml + +# Look for spec.key (the validation token) +# The TXT record should be: _acme-challenge.calypr-demo.ddns.net -> CNAME -> fulldomain.auth.acme-dns.io + +# Verify Let's Encrypt can resolve it +dig _acme-challenge.calypr-demo.ddns.net TXT +short + +# This should resolve through the CNAME to the acme-dns TXT record +``` + +**Step 11: Force Certificate Reissue** + +After fixing configuration issues: + +```bash +# Delete failed challenges and certificate requests +kubectl delete challenges -n argo-stack --all +kubectl delete certificaterequest -n argo-stack --all + +# Optionally delete the certificate to trigger fresh issuance +kubectl delete certificate calypr-demo-tls -n argo-stack + +# cert-manager will automatically recreate them +# Watch the new challenge +kubectl get challenges -n argo-stack -w +``` + +**Common DNS-01 Issues and Solutions:** + +| Symptom | Likely Cause | Solution | +|---------|--------------|----------| +| "DNS record not yet propagated" | CNAME not configured or DNS cache | See detailed fix below | +| Challenge stuck in "pending" | CNAME not configured | Add `_acme-challenge.your-domain CNAME fulldomain.auth.acme-dns.io` | +| "invalid credentials" | Wrong acme-dns credentials | Re-register with acme-dns and update secret | +| "DNS record not found" | CNAME propagation delay | Wait 5-10 minutes for DNS propagation | +| "acme-dns: unauthorized" | Incorrect username/password | Verify credentials in secret match registration | +| Challenge "invalid" after 60s | DNS propagation too slow | Use longer `--dns01-self-check-period` flag on cert-manager | +| Certificate stays "Issuing" | Previous challenge failed | Delete old challenges: `kubectl delete 
challenges -A` | + +**Detailed Fix for "DNS record not yet propagated" Error:** + +If you see this error in cert-manager logs: +``` +"propagation check failed" err="DNS record for \"calypr-demo.ddns.net\" not yet propagated" +``` + +This means cert-manager is checking for the TXT record but can't find it. Follow these steps: + +**1. Verify the CNAME Record Exists:** + +```bash +# Check if CNAME exists +dig _acme-challenge.calypr-demo.ddns.net CNAME +short + +# Should return: xxx.auth.acme-dns.io +# If empty, the CNAME is missing - add it to your DNS provider +``` + +**2. Check DNS Resolution Path:** + +```bash +# Follow the full resolution chain +dig _acme-challenge.calypr-demo.ddns.net TXT +trace + +# This should show: +# 1. Query to root servers +# 2. Query to .net servers +# 3. Query to ddns.net servers (No-IP.com) +# 4. CNAME pointing to auth.acme-dns.io +# 5. TXT record on auth.acme-dns.io +``` + +**3. Verify acme-dns Has Created the TXT Record:** + +```bash +# Get the fulldomain from your acme-dns credentials +FULLDOMAIN=$(kubectl get secret acme-dns-credentials -n cert-manager \ + -o jsonpath='{.data.acmedns\.json}' | base64 -d | \ + jq -r '."calypr-demo.ddns.net".fulldomain') + +echo "Checking TXT record on: $FULLDOMAIN" + +# Query acme-dns directly +dig @auth.acme-dns.io $FULLDOMAIN TXT +short + +# Should return a TXT record like: "abc123def456..." +# If empty, acme-dns hasn't created the record yet +``` + +**4. Check cert-manager's View:** + +cert-manager uses specific DNS resolvers. Check what it sees: + +```bash +# Get the cert-manager pod name +CERT_MGR_POD=$(kubectl get pods -n cert-manager -l app=cert-manager -o jsonpath='{.items[0].metadata.name}') + +# Check DNS resolution from cert-manager's perspective +kubectl exec -n cert-manager $CERT_MGR_POD -- nslookup -type=TXT _acme-challenge.calypr-demo.ddns.net + +# If this fails but your local dig works, cert-manager is using different DNS servers +``` + +**5. 
Wait for DNS Propagation:** + +DNS changes can take time to propagate: + +```bash +# Watch the challenge status +kubectl get challenges -n argo-stack -w + +# cert-manager retries every 60 seconds by default +# Wait up to 10 minutes for DNS propagation +``` + +**6. Check for DNS Caching Issues:** + +```bash +# Flush local DNS cache (on your machine, not cluster) +# Linux: +sudo systemd-resolve --flush-caches + +# macOS: +sudo dscacheutil -flushcache; sudo killall -HUP mDNSResponder + +# Then retest +dig _acme-challenge.calypr-demo.ddns.net TXT +short +``` + +**7. Verify CNAME Configuration in No-IP.com:** + +Log into your No-IP.com account and verify: + +1. Go to **Dynamic DNS** → **Hostnames** +2. Click **Modify** on `calypr-demo.ddns.net` +3. Check if there's a **DNS Records** or **Advanced** section +4. Add CNAME record: + - **Subdomain**: `_acme-challenge` + - **Record Type**: CNAME + - **Target**: `.auth.acme-dns.io` (from your acme-dns registration) + +**8. If CNAME is Correct but Still Failing:** + +The issue might be cert-manager's DNS resolver configuration: + +```bash +# Check cert-manager deployment for custom DNS settings +kubectl get deployment -n cert-manager cert-manager -o yaml | grep -A 5 dnsPolicy + +# If using ClusterFirst (default), it uses cluster DNS (CoreDNS/kube-dns) +# Try using public DNS resolvers by adding flags to cert-manager: +kubectl set env deployment/cert-manager -n cert-manager \ + --containers=cert-manager \ + DNS01_RECURSIVE_NAMESERVERS=8.8.8.8:53,1.1.1.1:53 +``` + +**9. 
Increase DNS Propagation Check Period:** + +If your DNS propagates slowly: + +```bash +# Edit cert-manager deployment to increase check period +kubectl edit deployment cert-manager -n cert-manager + +# Add to container args: +# - --dns01-recursive-nameservers-only=true +# - --dns01-self-check-period=10m + +# Or use kubectl set: +kubectl patch deployment cert-manager -n cert-manager --type='json' \ + -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--dns01-self-check-period=10m"}]' +``` + +**10. Verify acme-dns API Accessibility:** + +```bash +# Test from within the cluster +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v https://auth.acme-dns.io/health + +# Should return: {"ok":true} + +# If this fails, check network policies or firewall rules blocking acme-dns +``` + +**11. Check acme-dns Registration:** + +Verify your acme-dns registration is correct: + +```bash +# View the credentials +kubectl get secret acme-dns-credentials -n cert-manager \ + -o jsonpath='{.data.acmedns\.json}' | base64 -d | jq . + +# Test the credentials directly +curl -X POST https://auth.acme-dns.io/update \ + -H "X-Api-User: " \ + -H "X-Api-Key: " \ + -d '{"subdomain":"","txt":"test123"}' + +# Should return: {"txt":"test123"} +``` + +**12. Monitor cert-manager Logs in Real-Time:** + +```bash +# Watch cert-manager process the DNS-01 challenge +kubectl logs -n cert-manager -l app=cert-manager --tail=100 -f | grep -i "dns\|propagation\|challenge" + +# Look for: +# - "Calling DNS01 Update" (acme-dns API call) +# - "Waiting for DNS-01 propagation" (checking DNS) +# - "DNS record propagated" (success!) +# - Specific error messages +``` + +After fixing the issue, the challenge should transition from "pending" to "valid", and cert-manager will issue the certificate. 
+ +**Using Staging for DNS-01 Testing:** + +To avoid Let's Encrypt rate limits while debugging: + +```bash +# Create staging ClusterIssuer with DNS-01 +kubectl apply -f - <&1 | grep "subject:\|issuer:\|expire" +``` + +#### Step 4: Update Multiple Namespaces (if needed) + +If you have ingress resources in multiple namespaces using the same certificate: + +```bash +# Create the same secret in other namespaces +kubectl create secret tls calypr-demo-tls \ + -n calypr-api \ + --cert=/etc/letsencrypt/live/calypr-demo.ddns.net/fullchain.pem \ + --key=/etc/letsencrypt/live/calypr-demo.ddns.net/privkey.pem + +kubectl create secret tls calypr-demo-tls \ + -n calypr-tenants \ + --cert=/etc/letsencrypt/live/calypr-demo.ddns.net/fullchain.pem \ + --key=/etc/letsencrypt/live/calypr-demo.ddns.net/privkey.pem +``` + +#### Important Notes + +**Certificate Expiration:** +- Let's Encrypt certificates are valid for **90 days** +- You must manually renew before expiration +- Set a calendar reminder for 30 days before expiration + +**Check expiration date:** +```bash +kubectl get secret calypr-demo-tls -n argo-stack -o jsonpath='{.data.tls\.crt}' | \ + base64 -d | openssl x509 -noout -enddate +``` + +**Manual Renewal Process:** + +When the certificate is about to expire: + +```bash +# 1. Renew on the server where you originally obtained it +certbot renew + +# 2. Update the Kubernetes secret +kubectl create secret tls calypr-demo-tls \ + -n argo-stack \ + --cert=/etc/letsencrypt/live/calypr-demo.ddns.net/fullchain.pem \ + --key=/etc/letsencrypt/live/calypr-demo.ddns.net/privkey.pem \ + --dry-run=client -o yaml | kubectl apply -f - + +# 3. Restart ingress controller to pick up new certificate (optional) +kubectl rollout restart deployment ingress-nginx-controller -n ingress-nginx +``` + +#### Re-enabling Automated cert-manager Management + +If you later fix your DNS-01 or HTTP-01 setup and want to return to automated certificate management: + +```bash +# 1. 
Delete the manual secret +kubectl delete secret calypr-demo-tls -n argo-stack + +# 2. Re-add the cert-manager annotation to your ingress +kubectl annotate ingress ingress-authz-workflows -n argo-stack \ + cert-manager.io/cluster-issuer=letsencrypt-prod + +# 3. cert-manager will automatically create a new Certificate resource +# and obtain a certificate from Let's Encrypt + +# 4. Verify certificate is being issued +kubectl get certificate -n argo-stack +kubectl describe certificate calypr-demo-tls -n argo-stack +``` + +#### Alternative: Using cert-manager with Manual Certificates + +If you want cert-manager to manage the Certificate resource but provide your own cert: + +```bash +# Create a Certificate resource pointing to an existing secret +kubectl apply -f - <) +# NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) +# ingress-nginx-controller LoadBalancer 10.100.x.x 80:30080/TCP,443:30443/TCP +``` + +If `EXTERNAL-IP` shows ``, the LoadBalancer hasn't been provisioned: + +```bash +# Check events for the service +kubectl describe svc ingress-nginx-controller -n ingress-nginx + +# Check cloud provider logs for LoadBalancer issues +``` + +#### 2. Verify Ingress Controller is Installed + +```bash +# Check if ingress-nginx namespace exists +kubectl get ns ingress-nginx + +# If not installed, install with: +helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx +helm repo update +helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \ + -n ingress-nginx --create-namespace +``` + +#### 3. Check Ingress Resources + +```bash +# List all ingress resources in relevant namespaces +kubectl get ingress -A + +# Describe a specific ingress to check configuration +kubectl describe ingress ingress-authz-workflows -n argo-stack +``` + +Look for: +- Correct host matching your domain +- IngressClass set correctly (usually `nginx`) +- TLS secret exists +- Backend service exists + +#### 4. 
Verify TLS Certificate + +```bash +# Check if certificate is ready +kubectl get certificate -n argo-stack + +# Check certificate status +kubectl describe certificate calypr-demo-tls -n argo-stack + +# Check if TLS secret exists +kubectl get secret calypr-demo-tls -n argo-stack +``` + +#### 5. Check Ingress Controller Logs + +```bash +# View ingress controller logs for errors +kubectl logs -n ingress-nginx -l app.kubernetes.io/name=ingress-nginx --tail=100 + +# Look for errors related to: +# - Certificate loading +# - Backend connection +# - Configuration reloads +``` + +#### 6. Verify Network Connectivity + +```bash +# Test from inside the cluster +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl -v http://argo-stack-argo-workflows-server.argo-workflows:2746/ + +# Test the ingress controller service directly +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://ingress-nginx-controller.ingress-nginx:80/ +``` + +#### 7. Check Security Groups / Firewalls (Cloud-specific) + +**AWS:** +```bash +# Check the LoadBalancer security group allows inbound 443 +aws ec2 describe-security-groups --group-ids +``` + +**GCP:** +```bash +# Check firewall rules +gcloud compute firewall-rules list --filter="name~ingress" +``` + +**Azure:** +```bash +# Check network security group +az network nsg rule list --resource-group --nsg-name +``` + +### Issue: 404 Not Found on Ingress Paths + +**Error:** +``` +{"level":"error","ts":...,"msg":"route not found"...} +``` + +**Cause:** The ingress path doesn't match any backend or the service doesn't exist. + +**Solution:** + +1. Verify backend service exists: +```bash +kubectl get svc -n argo-workflows argo-stack-argo-workflows-server +``` + +2. Check ingress path configuration matches service expectations +3. 
Verify the service ports match ingress configuration + +### Issue: 404 Due to Cross-Namespace Service Routing + +**Error:** +NGINX ingress returns 404 for all paths (`/workflows`, `/applications`, `/registrations`) even though the backend pods are running and responding correctly when accessed directly within the cluster. + +**Symptoms:** +```bash +# Direct service access works: +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://argo-stack-argo-workflows-server.argo-workflows:2746/ +# Returns expected HTML + +# But ingress returns 404: +curl https://calypr-demo.ddns.net/workflows +# Returns 404 Not Found +``` + +**Cause:** Kubernetes Ingress resources can **only route to services in the same namespace** as the Ingress. If your ingress is in `argo-stack` namespace but the actual service is in `argo-workflows` namespace, NGINX cannot route to it directly. + +Common cross-namespace scenarios: +- Ingress in `argo-stack` → Service in `argo-workflows` (Argo Workflows Server) +- Ingress in `argo-stack` → Service in `argocd` (Argo CD Server) +- Ingress in `argo-stack` → Service in `argo-events` (EventSource Service) + +**Solution - Use ExternalName Services:** + +The `ingress-authz-overlay` chart supports cross-namespace routing via ExternalName services. 
Configure each route with both `namespace` (where ingress lives) and `serviceNamespace` (where service actually exists): + +```yaml +# helm/argo-stack/overlays/ingress-authz-overlay/values.yaml +ingressAuthzOverlay: + routes: + workflows: + namespace: argo-stack # Where the ingress is created + serviceNamespace: argo-workflows # Where the actual service exists + service: argo-stack-argo-workflows-server + port: 2746 + applications: + namespace: argo-stack + serviceNamespace: argocd # ArgoCD server is in argocd namespace + service: argo-stack-argocd-server + port: 8080 + registrations: + namespace: argo-stack + serviceNamespace: argo-events # EventSource is in argo-events namespace + service: github-repo-registrations-eventsource-svc + port: 12000 +``` + +When `serviceNamespace` differs from `namespace`, the chart automatically creates: +1. **ExternalName Service** (e.g., `argo-stack-argo-workflows-server-proxy`) in the ingress namespace +2. This service acts as a DNS proxy pointing to the actual service FQDN +3. The ingress routes to the proxy service, which forwards to the actual service + +**Verify ExternalName Services:** +```bash +# Check ExternalName services were created +kubectl get svc -n argo-stack -l app.kubernetes.io/component=externalname-proxy + +# Verify ExternalName targets +kubectl get svc argo-stack-argo-workflows-server-proxy -n argo-stack -o yaml | grep externalName +# Should show: externalName: argo-stack-argo-workflows-server.argo-workflows.svc.cluster.local +``` + +**Redeploy the overlay:** +```bash +helm upgrade --install ingress-authz-overlay \ + helm/argo-stack/overlays/ingress-authz-overlay \ + --namespace argo-stack +``` + +**Debug cross-namespace routing:** +```bash +# 1. Verify direct service access works +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://argo-stack-argo-workflows-server.argo-workflows:2746/ + +# 2. 
Verify ExternalName proxy service resolves correctly +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://argo-stack-argo-workflows-server-proxy.argo-stack:2746/ + +# 3. Check ingress configuration +kubectl describe ingress ingress-authz-workflows -n argo-stack | grep -A5 "backend" +``` + +### Issue: 503 Service Unavailable + +**Error:** +``` +HTTP/1.1 503 Service Temporarily Unavailable +``` + +**Cause:** Backend service has no healthy endpoints. + +**Solution:** + +```bash +# Check endpoints for the service +kubectl get endpoints argo-stack-argo-workflows-server -n argo-stack + +# Check backend pods are running +kubectl get pods -n argo-stack -l app.kubernetes.io/name=argo-workflows-server + +# Check pod health +kubectl describe pod -n argo-stack +``` + +### Issue: authz-adapter External Auth Failure + +**Error:** +``` +auth-url: http://authz-adapter.security.svc.cluster.local:8080/check failed +``` + +**Cause:** The authz-adapter service is not responding. 
+ +**Solution:** + +```bash +# Check authz-adapter is running +kubectl get pods -n security -l app=authz-adapter + +# Check authz-adapter service exists +kubectl get svc authz-adapter -n security + +# Test authz-adapter from within cluster +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://authz-adapter.security:8080/healthz + +# Check authz-adapter logs +kubectl logs -n security -l app=authz-adapter --tail=100 +``` + +### Ingress Debugging Cheat Sheet + +| Check | Command | +|-------|---------| +| Ingress controller pods | `kubectl get pods -n ingress-nginx` | +| Ingress controller service | `kubectl get svc -n ingress-nginx` | +| All ingress resources | `kubectl get ingress -A` | +| Ingress details | `kubectl describe ingress -n ` | +| TLS certificates | `kubectl get certificate -A` | +| Certificate status | `kubectl describe certificate -n ` | +| Controller logs | `kubectl logs -n ingress-nginx -l app.kubernetes.io/name=ingress-nginx` | +| authz-adapter status | `kubectl get pods -n security -l app=authz-adapter` | +| Test internal connectivity | `kubectl run debug --image=curlimages/curl --rm -it -- curl -v ` | + +--- + +## Environment-Specific Ingress Configuration + +This section covers ingress setup and troubleshooting for different deployment environments. + +### AWS EKS Configuration + +#### Prerequisites for AWS EKS + +1. **AWS Load Balancer Controller** (recommended) or use the default in-tree cloud provider +2. **IAM permissions** for creating/managing Elastic Load Balancers +3. 
**Subnet tags** for automatic subnet discovery + +#### Installing NGINX Ingress on AWS EKS + +```bash +# Add the ingress-nginx repository +helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx +helm repo update + +# Install with AWS-specific settings +helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \ + -n ingress-nginx --create-namespace \ + --set controller.service.type=LoadBalancer \ + --set controller.service.annotations."service\.beta\.kubernetes\.io/aws-load-balancer-type"=nlb \ + --set controller.service.annotations."service\.beta\.kubernetes\.io/aws-load-balancer-scheme"=internet-facing +``` + +#### AWS-Specific Annotations + +For Network Load Balancer (NLB) - recommended for production: +```yaml +service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: nlb + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + # For internal-only access: + # service.beta.kubernetes.io/aws-load-balancer-scheme: internal +``` + +For Application Load Balancer (ALB) - requires AWS Load Balancer Controller: + +⚠️ **Note:** When using ALB with the AWS Load Balancer Controller, you configure the Ingress resource (not the Service). The Service should use `ClusterIP` or `NodePort` type. + +```yaml +# Ingress annotations for ALB (on the Ingress resource, not Service): +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip +``` + +#### Troubleshooting AWS LoadBalancer Pending + +If `EXTERNAL-IP` stays ``: + +1. **Check service events:** +```bash +kubectl describe svc ingress-nginx-controller -n ingress-nginx +``` + +Look for events like: +- `Error syncing load balancer` - IAM permission issues +- `could not find any suitable subnets` - subnet tagging issues + +2. 
**Verify IAM permissions:** + +The node IAM role or service account needs these permissions: +```json +{ + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:CreateLoadBalancer", + "elasticloadbalancing:DeleteLoadBalancer", + "elasticloadbalancing:DescribeLoadBalancers", + "elasticloadbalancing:ModifyLoadBalancerAttributes", + "elasticloadbalancing:CreateTargetGroup", + "elasticloadbalancing:DeleteTargetGroup", + "elasticloadbalancing:DescribeTargetGroups", + "elasticloadbalancing:RegisterTargets", + "elasticloadbalancing:DeregisterTargets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeSubnets", + "ec2:DescribeVpcs", + "ec2:CreateSecurityGroup", + "ec2:AuthorizeSecurityGroupIngress" + ], + "Resource": "*" +} +``` + +3. **Check subnet tags:** + +Public subnets need this tag for internet-facing LBs: +``` +kubernetes.io/role/elb = 1 +``` + +Private subnets need this tag for internal LBs: +``` +kubernetes.io/role/internal-elb = 1 +``` + +4. **Verify cluster tag on subnets:** +``` +kubernetes.io/cluster/ = shared (or owned) +``` + +5. 
**Check AWS Load Balancer Controller (if using ALB):** +```bash +kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller +kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller +``` + +#### AWS Security Group Configuration + +After LoadBalancer is created, verify security group allows traffic: + +```bash +# Get the LoadBalancer DNS name +LB_DNS=$(kubectl get svc ingress-nginx-controller -n ingress-nginx -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') +echo $LB_DNS + +# Find associated security group (from AWS Console or CLI) +# Replace the DNS name in the query with your actual LoadBalancer DNS +aws elbv2 describe-load-balancers --query "LoadBalancers[?DNSName=='${LB_DNS}'].SecurityGroups" + +# Verify inbound rules allow 80 and 443 +aws ec2 describe-security-groups --group-ids --query "SecurityGroups[].IpPermissions" +``` + +Required inbound rules: +- Port 80 (HTTP) from 0.0.0.0/0 (or your IP range) +- Port 443 (HTTPS) from 0.0.0.0/0 (or your IP range) + +--- + +### On-Premises / Bare Metal Configuration + +#### Option 1: MetalLB (Recommended for On-Premises) + +MetalLB provides LoadBalancer functionality for bare metal clusters. 
+ +**Install MetalLB:** +```bash +# Install MetalLB +kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.13.12/config/manifests/metallb-native.yaml + +# Wait for MetalLB pods to be ready +kubectl wait --namespace metallb-system \ + --for=condition=ready pod \ + --selector=app=metallb \ + --timeout=90s +``` + +**Configure IP Address Pool:** +```bash +cat <<'YAML' | kubectl apply -f - +apiVersion: metallb.io/v1beta1 +kind: IPAddressPool +metadata: + name: default-pool + namespace: metallb-system +spec: + addresses: + - 192.168.1.240-192.168.1.250 # Adjust to your available IP range +--- +apiVersion: metallb.io/v1beta1 +kind: L2Advertisement +metadata: + name: default + namespace: metallb-system +spec: + ipAddressPools: + - default-pool +YAML +``` + +**Then install NGINX Ingress with LoadBalancer:** +```bash +helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \ + -n ingress-nginx --create-namespace \ + --set controller.service.type=LoadBalancer +``` + +#### Option 2: NodePort (Simple, No External Dependencies) + +Use NodePort when you don't have a LoadBalancer solution: + +```bash +helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \ + -n ingress-nginx --create-namespace \ + --set controller.service.type=NodePort \ + --set controller.service.nodePorts.http=30080 \ + --set controller.service.nodePorts.https=30443 +``` + +Access via any node IP on the configured ports: +```bash +# Get node IPs +kubectl get nodes -o wide + +# Access ingress +curl http://:30080/ +curl -k https://:30443/ +``` + +**To use standard ports (80/443)**, set up an external load balancer or reverse proxy (HAProxy, NGINX) pointing to the NodePorts. 
+ +#### Option 3: HostNetwork (Direct Node Access) + +For single-node clusters or when you need direct port 80/443 access: + +```bash +helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \ + -n ingress-nginx --create-namespace \ + --set controller.hostNetwork=true \ + --set controller.service.type=ClusterIP \ + --set controller.kind=DaemonSet +``` + +Access directly via node IP on ports 80 and 443. + +⚠️ **Note:** Only one ingress controller pod can run per node with hostNetwork. + +#### Troubleshooting On-Premises Ingress + +1. **MetalLB not assigning IPs:** +```bash +# Check MetalLB speaker pods +kubectl get pods -n metallb-system + +# Check MetalLB logs +kubectl logs -n metallb-system -l component=speaker + +# Verify IPAddressPool is configured +kubectl get ipaddresspool -n metallb-system +``` + +2. **NodePort not accessible:** +```bash +# Verify service has NodePort assigned +kubectl get svc ingress-nginx-controller -n ingress-nginx + +# Check if port is open on the node +nc -zv 30443 + +# Check firewall (iptables/firewalld) +sudo iptables -L -n | grep 30443 +sudo firewall-cmd --list-ports +``` + +3. **Network connectivity from external:** +```bash +# Test from external machine +telnet 30443 + +# Check if traffic reaches the node +sudo tcpdump -i any port 30443 +``` + +4. 
**Firewall configuration (if using firewalld):** +```bash +# Option 1: Allow only the specific ports you're using (recommended for security) +sudo firewall-cmd --permanent --add-port=30080/tcp # HTTP NodePort +sudo firewall-cmd --permanent --add-port=30443/tcp # HTTPS NodePort +sudo firewall-cmd --reload + +# Option 2: Allow entire NodePort range (less secure, but convenient for development) +# sudo firewall-cmd --permanent --add-port=30000-32767/tcp +# sudo firewall-cmd --reload +``` + +--- + +### Environment Comparison Quick Reference + +| Feature | AWS EKS | On-Premises (MetalLB) | On-Premises (NodePort) | +|---------|---------|----------------------|------------------------| +| LoadBalancer type | NLB/ALB | L2/BGP | N/A | +| External IP | Automatic | From IP pool | Node IP + port | +| Standard ports (80/443) | ✅ Yes | ✅ Yes | ❌ No (30000-32767) | +| TLS termination | Ingress or ALB | Ingress | Ingress | +| Health checks | AWS-managed | MetalLB | Manual | +| HA setup | Multi-AZ | Multiple speakers | External LB needed | +| Setup complexity | Medium | Medium | Low | + +--- + ## Workflow Troubleshooting ### 🧭 Overview @@ -666,5 +2357,143 @@ Then verify: --- -**Document Version:** 2025-11-24 -**Maintainer:** Platform / Data Workflow Team +# 🛑 Engineering Note — Why Auth Snippet Ingresses Were Not Rendering in ingress-nginx + +## 1. How to Detect the Problem + +You know this issue is happening when: + +### **A. Ingress objects exist, but do not appear in `nginx -T`** + +Run: + +```bash +kubectl -n ingress-nginx exec -it -- nginx -T +``` + +If you see **only some ingresses** (e.g., `/events`) but none of your `ingress-authz-*` paths, then the controller is silently rejecting the others. + +### **B. NGINX serves 404 from the default backend** + +Logs look like: + +``` +[upstream-default-backend] 127.0.0.1:8181 404 +``` + +Meaning nginx never matched your ingress rules. + +### **C. 
The controller logs warn about a “risky annotation”** + +This is the key signal: + +``` +E store.go:951] annotation group ExternalAuth contains risky annotation based on ingress configuration +``` + +If you see this message for an ingress, it **will not be rendered** into the nginx config. + +--- + +## 2. Why It Happens + +Starting with **ingress-nginx 1.12+**, the controller introduces a security model where annotations are grouped by **risk level**. Certain annotations—especially those that inject raw NGINX directives—are considered **Critical**. + +Your ingresses use: + +```yaml +nginx.ingress.kubernetes.io/auth-snippet: | + proxy_set_header Authorization $http_authorization; + ... +``` + +`auth-snippet` is part of the **ExternalAuth** annotation group and is classified as **Critical risk**. + +By default, the ingress-nginx controller only allows annotations up to **“High” risk**, so Critical annotations are rejected. + +When a Critical annotation is present but not allowed, the controller: + +* Records the warning + *“annotation group ExternalAuth contains risky annotation…”* +* **Silently skips** the ingress when generating nginx.conf +* Causing all requests to fall through to the default backend (404) + +This is expected behavior unless the admin explicitly opts in. + +--- + +## 3. How to Fix It + +You must enable two configuration flags in the ingress-nginx controller: + +### **A. Allow snippet annotations at all** + +```yaml +controller: + allowSnippetAnnotations: true +``` + +### **B. 
Raise the accepted risk level to allow Critical annotations** + +```yaml +controller: + config: + annotations-risk-level: Critical +``` + +Place these in your Helm values (e.g., `values-ingress-nginx.yaml`): + +```yaml +controller: + allowSnippetAnnotations: true + config: + annotations-risk-level: Critical +``` + +Then upgrade: + +```bash +helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \ + -n ingress-nginx \ + -f values-ingress-nginx.yaml +``` + +### **Confirm the fix** + +1. Check controller ConfigMap: + +```bash +kubectl -n ingress-nginx get configmap ingress-nginx-controller -o yaml \ + | grep -E 'allow-snippet|annotations-risk-level' +``` + +You should see: + +``` +allow-snippet-annotations: "true" +annotations-risk-level: Critical +``` + +2. Re-run nginx config dump: + +```bash +kubectl -n ingress-nginx exec -it -- nginx -T +``` + +Your auth-protected ingresses (`/applications`, `/registrations`, `/workflows`, etc.) will now appear. + +3. Requests stop hitting `[upstream-default-backend]` and begin routing correctly. + +--- + +## Summary + +| Issue | Description | +| ----------------- | --------------------------------------------------------------------------- | +| **Symptom** | Ingress exists but never routes, nginx returns 404/default backend | +| **Log indicator** | `annotation group ExternalAuth contains risky annotation` | +| **Cause** | `auth-snippet` is a Critical-risk annotation rejected by default | +| **Fix** | Enable `allowSnippetAnnotations` and set `annotations-risk-level: Critical` | + +With these flags set, ingress-nginx resumes rendering the ingresses and routing works normally. + diff --git a/docs/user-guide.md b/docs/user-guide.md index 55267645..ff3cb762 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -642,6 +642,205 @@ kubectl get configmaps -n argo | grep my-repo 5. 
**Public Repos**: Set `isPublic: false` unless you specifically need unauthenticated access +--- + +# 🌟 Using the Calypr GitHub App + +Issue tracking: + +See #108 #35 + +This guide covers: + +* **What the Calypr GitHub App does** +* **Why a user would install it** +* **How to integrate it into their GitHub account** +* **How it affects their project workflows** + +### How to Connect Your GitHub Repository to the Calypr Server + +## Overview + +The **Calypr GitHub App** allows your project repository on GitHub to stay in sync with the Calypr platform. + +By installing this app on your GitHub repository: + +* Your **data**, **metadata**, and **workflow configuration files** will be automatically available to the Calypr system. +* Calypr can **detect updates** in your repository and ensure your project environment reflects your latest work. +* You no longer need to manage tokens, SSH keys, or manual permissions—GitHub handles it securely. + +This makes collaboration easier, keeps your project reproducible, and ensures the Calypr server always has the most up-to-date version of your files. + +--- + +# 🔧 What the Calypr GitHub App Does + +After you install it: + +### ✔ 1. **Securely connects your GitHub repository to Calypr** + +The app grants Calypr read-only access to your repository. Calypr can download the files it needs, but **cannot modify your code or data**. + +### ✔ 2. **Keeps your Calypr environment automatically updated** + +Whenever you push new: + +* data files +* metadata +* workflow definitions (Nextflow, Argo, CWL, etc.) +* configuration settings + +…the Calypr server can see the latest version and use it in your workspace and pipelines. + +### ✔ 3. **Simplifies onboarding** + +No personal GitHub tokens. +No SSH keys. +No security complexity. +Just a simple installation and you’re done. + +### ✔ 4. **Works with private repositories** + +Your private data stays private. Only the Calypr system (and only specific components) can access it. 
+ +--- + +# 🧠 Why You Might Need This + +You should install the Calypr GitHub App if: + +* Your workflow, metadata, or analysis files live in GitHub. +* You want Calypr to run workflows based on the files in your repo. +* You want collaborators or pipelines to always use the current version. +* You want a secure, low-maintenance way to connect GitHub and Calypr. + +If you’re using Calypr for **multi-omics**, **analysis pipelines**, or **any project with multiple collaborators**, the GitHub App is the easiest way to keep everything synchronized. + +--- + +# 🛠 How to Install the Calypr GitHub App + +Installing takes less than one minute. + +### **Step 1 — Open the Installation Page** + +Visit: + +``` +https://github.com/apps/calypr +``` + +(or the URL provided by your administrator) + +### **Step 2 — Select Your Account or Organization** + +You will see options such as: + +* **Install for my personal GitHub account** +* **Install for an organization** + +Choose where your repository lives. + +### **Step 3 — Select Repositories** + +You have two choices: + +### **Option A — Only give access to selected repositories (recommended)** + +Choose specific repositories that contain Calypr projects. + +### **Option B — Give access to all repositories** + +Only use this if: + +* Your GitHub account is dedicated to Calypr work + — or — +* You prefer not to manage per-repo permissions. + +### **Step 4 — Confirm Permissions** + +The Calypr GitHub App typically requests: + +* **Read-only access to code and files** +* **Read-only access to repo metadata (branch names, permissions)** +* (Optional) permission to mark CI/check results, if Calypr is configured to do so + +The app **cannot** modify your repository. + +### **Step 5 — Finish Installation** + +Click **Install**. +That’s it—your repository is now linked. + +--- + +# 🔁 What Happens After Installation? 
+ +Once your repository is connected: + +### ✔ Calypr immediately gains access + +Your project will appear in the Calypr interface (or become available for registration, depending on your setup). + +### ✔ Any updates you push to GitHub are seen by Calypr + +Examples: + +* upload new FASTQ files → they appear in Calypr’s data browser +* update metadata → validators update automatically +* change workflow config → workflow UI shows new settings +* add or edit sample sheets → pipelines re-index as needed + +### ✔ No further authentication is required + +You don’t need to manage passwords, tokens, or machine access. + +--- + +# 🧪 Verifying the Connection + +You can check the connection by: + +* Opening the Calypr UI → Projects → *Your Repository* +* Clicking **Refresh** +* Verifying that your latest branch, files, and metadata are shown + +If you push a change to GitHub and see it reflected in Calypr within a minute or two, everything is working. + +--- + +# ❓ Troubleshooting + +### **I don’t see my repository listed in Calypr.** + +Make sure the GitHub App was installed for that specific repository. + +### **I installed the app, but Calypr says it cannot access the repo.** + +Two common causes: + +1. The app was installed in your personal account, but the repo is in an organization. +2. The repo was not selected during installation. + +### **I want to remove access.** + +You can uninstall the app or change the permitted repositories anytime from: + +``` +https://github.com/settings/installations +``` + +--- + +# 📬 Need Help? 
+ +If you run into issues: + +* Contact your Calypr platform administrator +* Or open a support ticket through the Calypr help portal + +--- + ### 📚 Additional Resources - [Vault Integration Guide](./secrets-with-vault.md) diff --git a/helm/argo-stack/admin-values.yaml b/helm/argo-stack/admin-values.yaml new file mode 100644 index 00000000..c23acb7d --- /dev/null +++ b/helm/argo-stack/admin-values.yaml @@ -0,0 +1,9 @@ +argo-cd: + configs: + rbac: + # Full Argo CD admin role + policy.csv: | + p, role:admin, *, *, *, allow + g, wf-admins, role:admin + + policy.default: role:readonly diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/Chart.yaml b/helm/argo-stack/overlays/ingress-authz-overlay/Chart.yaml new file mode 100644 index 00000000..75de8fe6 --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/Chart.yaml @@ -0,0 +1,16 @@ +apiVersion: v2 +name: ingress-authz-overlay +description: Authz-aware ingress overlay providing unified path-based routing with centralized authorization for multi-tenant UIs and APIs +type: application +version: 0.1.0 +appVersion: "1.0.0" +keywords: + - ingress + - authorization + - multi-tenant + - nginx + - argo +home: https://github.com/calypr/argo-helm +maintainers: + - name: calypr + url: https://github.com/calypr diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/README.md b/helm/argo-stack/overlays/ingress-authz-overlay/README.md new file mode 100644 index 00000000..aa088a2c --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/README.md @@ -0,0 +1,208 @@ +# Ingress AuthZ Overlay + +A Helm overlay chart providing unified, path-based ingress with centralized authorization for multi-tenant Argo Stack deployments. 
+ +## Overview + +This overlay provides a **single host, path-based ingress** for all major UIs and APIs: + +| Path | Service | Namespace | Description | +|------|---------|-----------|-------------| +| `/workflows` | Argo Workflows Server | argo-workflows | Workflow UI (port 2746) | +| `/applications` | Argo CD Server | argocd | GitOps applications UI (port 8080) | +| `/registrations` | GitHub EventSource | argo-events | Repository registration events (port 12000) | +| `/api` | Calypr API | calypr-api | Platform API service (port 3000) | +| `/tenants` | Calypr Tenants | calypr-tenants | Tenant portal (port 3001) | + +All endpoints are protected by the `authz-adapter` via NGINX external authentication. + +## Cross-Namespace Routing + +This overlay supports **cross-namespace routing** for services that exist in different namespaces than the ingress resource. This is achieved using **ExternalName services** as proxies. + +### How It Works + +When a route's `serviceNamespace` differs from its `namespace`: + +1. An **ExternalName Service** is created in the ingress namespace +2. This service acts as a DNS proxy pointing to the actual service in the target namespace +3. The ingress routes to the proxy service, which forwards to the actual service + +### Configuration + +Each route can specify both the ingress namespace and the actual service namespace: + +```yaml +ingressAuthzOverlay: + routes: + workflows: + # Where the ingress is created + namespace: argo-stack + # Where the actual service lives + serviceNamespace: argo-workflows + service: argo-stack-argo-workflows-server + port: 2746 +``` + +When `serviceNamespace` differs from `namespace`, an ExternalName service is automatically created: + +- **Service Name**: `-proxy` +- **ExternalName**: `..svc.cluster.local` + +The ingress also adds the `nginx.ingress.kubernetes.io/upstream-vhost` annotation to ensure the correct Host header is sent to the backend service. 
+ +## AuthZ Adapter Configuration + +**Important**: By default, this overlay does **not** deploy its own authz-adapter. Instead, it reuses the centralized authz-adapter deployed by the main `argo-stack` chart in the `security` namespace. + +### Default Configuration (Recommended) + +The overlay is configured to use the existing authz-adapter in the `security` namespace: + +```yaml +ingressAuthzOverlay: + authzAdapter: + deploy: false # Do NOT deploy a separate adapter + namespace: security # Point to security namespace + serviceName: authz-adapter + port: 8080 +``` + +This ensures a single, centralized authz-adapter handles authentication for all ingress routes. + +### Deploying a Separate Adapter (Advanced) + +If you need the overlay to deploy its own authz-adapter instance: + +```yaml +ingressAuthzOverlay: + authzAdapter: + deploy: true # Deploy a separate adapter + namespace: argo-stack # In the overlay's namespace + serviceName: authz-adapter + port: 8080 +``` + +**Note**: Having multiple authz-adapter instances may cause configuration drift and is not recommended. + +## Quick Start + +```bash +# Install the overlay +helm upgrade --install ingress-authz-overlay \ + helm/argo-stack/overlays/ingress-authz-overlay \ + --namespace argo-stack \ + --create-namespace + +# With custom host +helm upgrade --install ingress-authz-overlay \ + helm/argo-stack/overlays/ingress-authz-overlay \ + --namespace argo-stack \ + --set ingressAuthzOverlay.host=my-domain.example.com +``` + +## Configuration + +See [`values.yaml`](values.yaml) for all configurable options. 
+ +Key settings: + +```yaml +ingressAuthzOverlay: + enabled: true + host: calypr-demo.ddns.net + tls: + enabled: true + secretName: calypr-demo-tls + clusterIssuer: letsencrypt-prod +``` + +### Route Configuration Options + +Each route supports the following options: + +| Option | Description | Default | +|--------|-------------|---------| +| `enabled` | Enable/disable this route | `true` | +| `namespace` | Namespace where the ingress is created | Required | +| `service` | Backend service name | Required | +| `serviceNamespace` | Namespace of the actual service (for cross-namespace routing) | Same as `namespace` | +| `port` | Backend service port | Required | +| `pathPrefix` | URL path prefix for this route | Required | +| `useRegex` | Enable regex path matching | `false` | +| `rewriteTarget` | Path rewrite target (when `useRegex` is true) | `/$2` | +| `backendProtocol` | Backend protocol (`HTTP`, `HTTPS`, `GRPC`, `GRPCS`) | `HTTP` | +| `proxyConnectTimeout` | NGINX proxy connect timeout | - | +| `proxyReadTimeout` | NGINX proxy read timeout | - | +| `proxySendTimeout` | NGINX proxy send timeout | - | + +### Backend Protocol + +Some services use HTTPS or gRPC internally. Use the `backendProtocol` option to specify the correct protocol: + +```yaml +ingressAuthzOverlay: + routes: + applications: + # ArgoCD server uses HTTPS by default + backendProtocol: HTTPS + grpc-service: + # For gRPC services + backendProtocol: GRPC +``` + +## Documentation + +- [User Guide](docs/authz-ingress-user-guide.md) - Complete installation and configuration guide +- [Acceptance Tests](tests/authz-ingress.feature) - Gherkin-style test scenarios + +## Architecture + +See the [User Guide](docs/authz-ingress-user-guide.md) for architecture diagrams and detailed flow descriptions. 
+ +## Requirements + +- Kubernetes 1.19+ +- Helm 3.x +- NGINX Ingress Controller +- cert-manager (for TLS) - **must be installed before deploying this overlay** + +### TLS Certificate Ownership + +When using cert-manager's ingress-shim, only **one** ingress resource can "own" a Certificate. +This overlay uses a `primary: true` flag on routes to designate which ingress should have the +`cert-manager.io/cluster-issuer` annotation. + +By default, the `workflows` route is set as primary. Other ingresses reference the same TLS +secret but without the cluster-issuer annotation, avoiding the "certificate resource is not +owned by this object" error. + +To change the primary route: + +```yaml +ingressAuthzOverlay: + routes: + workflows: + primary: false # Remove primary from workflows + applications: + primary: true # Make applications the primary +``` + +### Installing cert-manager + +If you see `no matches for kind "ClusterIssuer"`, cert-manager is not installed: + +```bash +# Install cert-manager +helm repo add jetstack https://charts.jetstack.io +helm repo update +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --set crds.enabled=true + +# Wait for cert-manager to be ready +kubectl wait --for=condition=Ready pods --all -n cert-manager --timeout=120s +``` + +See the [User Guide](docs/authz-ingress-user-guide.md) for complete setup instructions including ClusterIssuer configuration. 
diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/cluster-issuer-letsencrypt.deprecated b/helm/argo-stack/overlays/ingress-authz-overlay/cluster-issuer-letsencrypt.deprecated new file mode 100644 index 00000000..d64390ac --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/cluster-issuer-letsencrypt.deprecated @@ -0,0 +1,17 @@ +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: brian@bwalsh.com + privateKeySecretRef: + name: letsencrypt-prod-account-key + solvers: + - dns01: + acmeDNS: + host: https://auth.acme-dns.io + accountSecretRef: + name: acme-dns-credentials + key: acmedns.json diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/docs/authz-ingress-user-guide.md b/helm/argo-stack/overlays/ingress-authz-overlay/docs/authz-ingress-user-guide.md new file mode 100644 index 00000000..b553e1c3 --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/docs/authz-ingress-user-guide.md @@ -0,0 +1,618 @@ +# Authz-Aware Ingress Overlay User Guide + +## Overview + +The `ingress-authz-overlay` is a Helm overlay chart that provides a unified, path-based ingress layer for all major UIs and APIs in the Argo Stack. It centralizes authorization through the `authz-adapter` service, ensuring consistent access control across all endpoints. + +## Features + +- **Single Host**: All services exposed on one HTTPS hostname +- **Path-Based Routing**: Clean URL structure (`/workflows`, `/applications`, `/api`, etc.) 
+- **Centralized Authorization**: All routes protected by `authz-adapter` via NGINX external auth +- **TLS via cert-manager**: Automatic Let's Encrypt certificate management +- **Multi-Tenant Support**: User, email, and group headers passed to backend services +- **Drop-In Deployment**: Simple Helm overlay that can be enabled or disabled per environment + +## Architecture + +```mermaid +sequenceDiagram + participant User + participant Ingress as NGINX Ingress + participant AuthzAdapter as authz-adapter + participant Workflows as Argo Workflows + participant Applications as Argo CD + participant Registrations as Event Source + participant Api as Calypr API + participant Tenants as Calypr Tenants + + User->>Ingress: HTTPS GET /path + Ingress->>AuthzAdapter: auth-url check + AuthzAdapter-->>Ingress: Allow or Deny + alt Allowed + Note over Ingress: Route based on path + Ingress->>Workflows: /workflows... + Ingress->>Applications: /applications... + Ingress->>Registrations: /registrations... + Ingress->>Api: /api... + Ingress->>Tenants: /tenants... + else Denied + Ingress-->>User: Redirect to /tenants/login + end +``` + +## Routes + +| Path | Service | Port | Namespace | Description | +|------|---------|------|-----------|-------------| +| `/workflows` | `argo-stack-argo-workflows-server` | 2746 | `argo-stack` | Argo Workflows UI | +| `/applications` | `argo-stack-argocd-server` | 8080 | `argo-stack` | Argo CD Applications UI | +| `/registrations` | `github-repo-registrations-eventsource-svc` | 12000 | `argo-stack` | GitHub Repo Registration Events | +| `/api` | `calypr-api` | 3000 | `calypr-api` | Calypr API Service | +| `/tenants` | `calypr-tenants` | 3001 | `calypr-tenants` | Calypr Tenant Portal | + +## TLS with Let's Encrypt and cert-manager + +This overlay uses [cert-manager](https://cert-manager.io/) to automatically provision and renew TLS certificates from [Let's Encrypt](https://letsencrypt.org/). 
+ +### Installing cert-manager + +**cert-manager must be installed before creating ClusterIssuers or deploying this overlay.** + +If you see an error like: +``` +no matches for kind "ClusterIssuer" in version "cert-manager.io/v1" +``` +This means cert-manager is not installed. Install it first: + +```bash +# Add the Jetstack Helm repository +helm repo add jetstack https://charts.jetstack.io +helm repo update + +# Install cert-manager with CRDs +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --set crds.enabled=true + +# Verify cert-manager is running +kubectl get pods -n cert-manager +``` + +Wait for all cert-manager pods to be `Running` before proceeding: +```bash +kubectl wait --for=condition=Ready pods --all -n cert-manager --timeout=120s +``` + +### How It Works + +```mermaid +sequenceDiagram + participant Ingress as Ingress Resource + participant CM as cert-manager + participant LE as Let's Encrypt + participant DNS as DNS Provider + + Note over Ingress: Created with annotation:
cert-manager.io/cluster-issuer: letsencrypt-prod + Ingress->>CM: Ingress triggers Certificate request + CM->>LE: Request certificate for domain + LE->>CM: ACME challenge (HTTP-01 or DNS-01) + CM->>DNS: Prove domain ownership + DNS-->>LE: Challenge verified + LE-->>CM: Issue certificate + CM->>Ingress: Store cert in TLS Secret + Note over Ingress: HTTPS now available +``` + +### ClusterIssuer: letsencrypt-prod + +The `letsencrypt-prod` ClusterIssuer is a cluster-wide cert-manager resource that defines how to obtain certificates from Let's Encrypt's production API. + +**Prerequisites**: You must create the ClusterIssuer before deploying this overlay: + +```yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + # Let's Encrypt production API endpoint + server: https://acme-v02.api.letsencrypt.org/directory + + # Email for certificate expiration notifications + email: your-email@example.com + + # Secret to store the ACME account private key + privateKeySecretRef: + name: letsencrypt-prod-account-key + + # HTTP-01 challenge solver using ingress + solvers: + - http01: + ingress: + class: nginx +``` + +**Apply the ClusterIssuer**: + +```bash +kubectl apply -f cluster-issuer.yaml +``` + +### Understanding the ACME Account Key Secret + +The `privateKeySecretRef` (e.g., `letsencrypt-prod-account-key` or `letsencrypt-staging-account-key`) specifies where cert-manager stores the ACME account private key. **You do NOT need to create this secret manually** — cert-manager handles it automatically. + +#### How It Works + +1. **First-Time Setup**: When you create the ClusterIssuer, cert-manager: + - Generates a new RSA private key + - Registers a new account with Let's Encrypt using your email + - Stores the private key in the specified secret (in the `cert-manager` namespace) + +2. 
**Secret Location**: The secret is created in the same namespace as cert-manager (typically `cert-manager`): + ```bash + # View the account key secret + kubectl get secret letsencrypt-prod-account-key -n cert-manager + + # Describe to see metadata + kubectl describe secret letsencrypt-prod-account-key -n cert-manager + ``` + +3. **Account Persistence**: The account key persists across cert-manager restarts. As long as the secret exists, cert-manager will reuse the same Let's Encrypt account. + +#### Backing Up the ACME Account Key + +For disaster recovery, you may want to back up the account key: + +```bash +# Export the account key secret +kubectl get secret letsencrypt-prod-account-key -n cert-manager -o yaml > letsencrypt-account-backup.yaml + +# To restore in a new cluster (before creating ClusterIssuer) +kubectl apply -f letsencrypt-account-backup.yaml +``` + +> **Note**: Keep the backup secure — this key provides access to your Let's Encrypt account and all its certificates. + +#### Troubleshooting Account Key Issues + +If the account key secret is not being created: + +```bash +# Check cert-manager controller logs +kubectl logs -n cert-manager -l app.kubernetes.io/component=controller + +# Check ClusterIssuer status +kubectl describe clusterissuer letsencrypt-prod +``` + +Common issues: +- **ACME Registration Failed**: Check your email address is valid and you can reach Let's Encrypt's API +- **Secret Not Found in Expected Namespace**: The secret is created in the cert-manager namespace, not your application namespace + +### TLS Certificate Ownership + +When using multiple ingress resources with the same TLS secret and cert-manager's ingress-shim, you may encounter an error: + +``` +certificate resource is not owned by this object. refusing to update non-owned certificate resource +``` + +This happens because **cert-manager only allows one Ingress to own a Certificate**. 
When multiple ingresses have the `cert-manager.io/cluster-issuer` annotation pointing to the same certificate, a conflict occurs. + +**Solution**: This overlay uses a `primary: true` flag on routes. Only the primary route's Ingress gets the `cert-manager.io/cluster-issuer` annotation. Other ingresses reference the TLS secret but don't trigger certificate creation. + +```yaml +ingressAuthzOverlay: + routes: + workflows: + enabled: true + primary: true # Only this route has cert-manager.io/cluster-issuer annotation + # ... + applications: + enabled: true + # primary: false (default) - uses the TLS secret but doesn't trigger cert creation +``` + +By default, the `workflows` route is primary. To change: + +```yaml +ingressAuthzOverlay: + routes: + workflows: + primary: false + api: + primary: true # Move certificate ownership to /api route +``` + +### Configuration Options + +| Setting | Description | Default | +|---------|-------------|---------| +| `tls.enabled` | Enable TLS for ingress | `true` | +| `tls.secretName` | Name of the TLS Secret (auto-created by cert-manager) | `calypr-demo-tls` | +| `tls.clusterIssuer` | Name of the ClusterIssuer to use | `letsencrypt-prod` | + +### Using letsencrypt-staging (for Testing) + +For testing, use the staging issuer to avoid Let's Encrypt rate limits: + +```yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + server: https://acme-staging-v02.api.letsencrypt.org/directory + email: your-email@example.com + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + ingress: + class: nginx +``` + +> **Note**: The `letsencrypt-staging-account-key` secret is also auto-generated by cert-manager, just like the production key. Staging and production use separate accounts and secrets. 
+ +Then configure the overlay to use it: + +```yaml +ingressAuthzOverlay: + tls: + clusterIssuer: letsencrypt-staging +``` + +### Verifying Certificate Status + +Check if the certificate was issued successfully: + +```bash +# Check Certificate resource +kubectl get certificate -n argo-stack + +# Check certificate details +kubectl describe certificate -n argo-stack + +# Check the TLS secret +kubectl get secret calypr-demo-tls -n argo-stack +``` + +### Troubleshooting Certificates + +If the certificate is not being issued: + +```bash +# Check cert-manager logs +kubectl logs -n cert-manager -l app=cert-manager + +# Check Certificate status +kubectl describe certificate -n argo-stack + +# Check CertificateRequest +kubectl get certificaterequest -n argo-stack + +# Check ACME challenges +kubectl get challenges -A +``` + +Common issues: +- **cert-manager not installed**: If you see `no matches for kind "ClusterIssuer"`, install cert-manager first (see [Installing cert-manager](#installing-cert-manager)) +- **Helm ownership conflict**: If you see `invalid ownership metadata; label validation error`, the ClusterIssuer was created outside of Helm. See [Helm Ownership Conflict](#helm-ownership-conflict) below. +- **Domain not reachable**: Ensure your domain's DNS points to the ingress controller's external IP +- **Rate limited**: Use `letsencrypt-staging` for testing to avoid production rate limits +- **Challenge failed**: Check that port 80 is accessible for HTTP-01 challenges + +### Helm Ownership Conflict + +If you get an error like: +``` +Error: UPGRADE FAILED: Unable to continue with update: ClusterIssuer "letsencrypt-prod" in namespace "" exists and cannot be imported into the current release: invalid ownership metadata; label validation error: missing key "app.kubernetes.io/managed-by" +``` + +This happens when: +1. The ClusterIssuer was created manually with `kubectl apply` +2. 
A Helm chart template tries to create/manage the same ClusterIssuer + +**Solution**: ClusterIssuers should be managed **outside** of this Helm chart: + +```bash +# Option 1: Keep the manually created ClusterIssuer (recommended) +# Simply don't include cluster-issuer templates in the chart +# This overlay already follows this pattern - it references the ClusterIssuer +# via annotation but doesn't create it + +# Option 2: If you have a local cluster-issuer template file, remove it +rm helm/argo-stack/overlays/ingress-authz-overlay/templates/cluster-issuer*.yaml + +# Option 3: To adopt an existing resource into Helm (advanced) +# Add Helm labels and annotations to the existing ClusterIssuer: +kubectl annotate clusterissuer letsencrypt-prod \ + meta.helm.sh/release-name=ingress-authz-overlay \ + meta.helm.sh/release-namespace=argo-stack +kubectl label clusterissuer letsencrypt-prod \ + app.kubernetes.io/managed-by=Helm +``` + +**Why ClusterIssuers are managed separately**: ClusterIssuers are cluster-scoped resources that affect the entire cluster, not just one namespace. Including them in application-specific Helm charts causes conflicts when: +- Multiple applications need the same ClusterIssuer +- The ClusterIssuer already exists (created by a previous deployment or another chart) +- Different teams deploy applications that reference the same issuer + +This chart references the ClusterIssuer via annotation (`cert-manager.io/cluster-issuer`) but leaves its lifecycle management to cluster administrators. + +## Installation + +### Prerequisites + +Before installing this overlay, ensure you have: + +1. **Kubernetes cluster** (1.19+) with NGINX Ingress Controller installed +2. **cert-manager installed** (see [Installing cert-manager](#installing-cert-manager) above) +3. **ClusterIssuer created** (see [ClusterIssuer: letsencrypt-prod](#clusterissuer-letsencrypt-prod)) +4. **Helm 3.x** installed locally + +**Installation Order**: +1. Install cert-manager +2. 
Create ClusterIssuer +3. Install this overlay + +### Install the Overlay + +```bash +# Install with default values +helm upgrade --install ingress-authz-overlay \ + helm/argo-stack/overlays/ingress-authz-overlay \ + --namespace argo-stack \ + --create-namespace + +# Install with custom host +helm upgrade --install ingress-authz-overlay \ + helm/argo-stack/overlays/ingress-authz-overlay \ + --namespace argo-stack \ + --set ingressAuthzOverlay.host=my-domain.example.com \ + --set ingressAuthzOverlay.tls.secretName=my-domain-tls +``` + +### Integrate with Parent Chart + +Alternatively, add the values to your main `argo-stack` deployment: + +```bash +helm upgrade --install argo-stack \ + helm/argo-stack \ + --values helm/argo-stack/values.yaml \ + --set ingressAuthzOverlay.enabled=true +``` + +## Configuration + +### Basic Configuration + +```yaml +ingressAuthzOverlay: + enabled: true + host: calypr-demo.ddns.net + tls: + enabled: true + secretName: calypr-demo-tls + clusterIssuer: letsencrypt-prod +``` + +### AuthZ Adapter Configuration + +By default, this overlay does **not** deploy its own authz-adapter. 
It reuses the centralized authz-adapter deployed by the main `argo-stack` chart in the `security` namespace: + +```yaml +ingressAuthzOverlay: + authzAdapter: + # Use centralized adapter from security namespace (recommended) + deploy: false + + # Service location (points to main argo-stack adapter) + serviceName: authz-adapter + namespace: security + port: 8080 + path: /check + + # Sign-in redirect URL + signinUrl: https://calypr-demo.ddns.net/tenants/login + + # Headers passed from auth response to backends + responseHeaders: "X-User,X-Email,X-Groups" +``` + +If you need to deploy a separate authz-adapter instance (not recommended): + +```yaml +ingressAuthzOverlay: + authzAdapter: + deploy: true # Deploy a separate adapter + namespace: argo-stack # In overlay's namespace + serviceName: authz-adapter + port: 8080 + + # Environment configuration (only used when deploy: true) + env: + fenceBase: "https://calypr-dev.ohsu.edu/user" +``` + +### Custom Routes + +Add or modify routes as needed: + +```yaml +ingressAuthzOverlay: + routes: + # Custom route example + myservice: + enabled: true + namespace: my-namespace + service: my-service + port: 8000 + pathPrefix: /myservice + useRegex: true + rewriteTarget: /$2 +``` + +### Disabling a Route + +```yaml +ingressAuthzOverlay: + routes: + registrations: + enabled: false +``` + +## Authorization Flow + +1. **User Request**: Client sends HTTPS request to the ingress host +2. **External Auth**: NGINX Ingress calls the `authz-adapter` `/check` endpoint +3. **Token Validation**: `authz-adapter` validates the Authorization header against Fence/OIDC +4. **Group Assignment**: User is assigned groups based on their permissions (e.g., `argo-runner`, `argo-viewer`) +5. **Response Headers**: On success, user info headers are added to the request +6. **Routing**: Request is forwarded to the appropriate backend service +7. 
**Denial**: On failure, user is redirected to the sign-in URL + +### Auth Response Headers + +The following headers are passed to backend services on successful authentication: + +| Header | Description | +|--------|-------------| +| `X-Auth-Request-User` | Username or email of the authenticated user | +| `X-Auth-Request-Email` | Email address of the user | +| `X-Auth-Request-Groups` | Comma-separated list of groups | +| `X-User` | Alias for X-Auth-Request-User | +| `X-Email` | Alias for X-Auth-Request-Email | +| `X-Groups` | Alias for X-Auth-Request-Groups | + +## Troubleshooting + +### Check Ingress Status + +```bash +kubectl get ingress -A -l app.kubernetes.io/name=ingress-authz-overlay +``` + +### Check AuthZ Adapter + +```bash +# Logs (default: centralized adapter in the security namespace; use your overlay namespace if authzAdapter.deploy: true) +kubectl logs -n security -l app=authz-adapter + +# Test health endpoint +kubectl port-forward -n security svc/authz-adapter 8080:8080 & +curl http://localhost:8080/healthz +``` + +### Test Authentication + +```bash +# Should redirect to login +curl -I https://calypr-demo.ddns.net/workflows + +# With valid token (should return 200) +curl -I -H "Authorization: Bearer $TOKEN" https://calypr-demo.ddns.net/workflows +``` + +### Common Issues + +1. 
**404 Not Found**: Routes return 404 error + - Check if backend services exist in their respective namespaces: + ```bash + # For workflows + kubectl get svc argo-stack-argo-workflows-server -n argo-workflows + # For applications (ArgoCD) + kubectl get svc argo-stack-argocd-server -n argocd + # For registrations + kubectl get svc github-repo-registrations-eventsource-svc -n argo-events + ``` + - Verify ExternalName proxy services are created for cross-namespace routing: + ```bash + kubectl get svc -n argo-stack -l app.kubernetes.io/component=externalname-proxy + ``` + - Check if NGINX ingress controller is running and has ADDRESS assigned: + ```bash + kubectl get ingress -A -l app.kubernetes.io/name=ingress-authz-overlay + ``` + - Verify backend protocol settings (ArgoCD requires HTTPS): + ```bash + kubectl get ingress ingress-authz-applications -n argo-stack -o yaml | grep backend-protocol + ``` + - Check NGINX ingress controller logs for routing errors: + ```bash + kubectl logs -n ingress-nginx -l app.kubernetes.io/component=controller --tail=100 + ``` + - Test direct connectivity to backend services: + ```bash + kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://argo-stack-argo-workflows-server.argo-workflows:2746/ + ``` + +2. **502 Bad Gateway**: AuthZ adapter not reachable + - Check authz-adapter deployment is running + - Verify service selector matches pod labels + +3. **503 Service Unavailable**: Backend service not available + - Check target service exists in the specified namespace + - Verify service port matches configuration + +4. 
**Redirect Loop**: Auth signin URL misconfigured + - Ensure `/tenants/login` path is accessible + - Check signinUrl matches actual login endpoint + +## Uninstall + +```bash +helm uninstall ingress-authz-overlay -n argo-stack +``` + +## Transient start up logs + +### Issue +When the authz-adapter starts up, you may see transient log messages like: + +``` +external-secrets-system external-secrets-cert-controller-5f8b8994d5-vrzmj cert-controller {"level":"error","ts":1764050879.7977011,"logger":"controllers.webhook-certs-updater","msg":"could not update webhook config","Webhookconfig":{"name":"secretstore-validate"},"error":"ca cert not yet ready","stacktrace":"github.com/external-secrets/external-secrets/pkg/controllers/webhookconfig.(*Reconciler).Reconcile\n\t/home/runner/work/external-secrets/external- +``` + +# ✅ **Short Answer** + +**Yes, this is a transient and harmless startup condition.** +It occurs when the **cert-controller** tries to update the validating/mutating webhook configuration *before* the internal CA bundle has been generated. + +ESO keeps retrying until the CA is ready, then the message disappears. + +--- + +# 🧠 **Why This Happens** + +External Secrets Operator uses an **internal self-signed CA** to secure: + +* The validating webhook +* The mutating webhook +* The admission controller + +On startup, the control plane usually initializes in this order: + +1. Pod starts +2. Cert controller initializes +3. Webhook server generates or fetches CA bundle +4. Cert controller tries to patch webhook config +5. **If CA is not yet ready → logs “ca cert not yet ready”** +6. Retry loop resolves it once CA is created + +ESO’s cert controller reconciles every few seconds until successful. 
+ +--- + +## Related Documentation + +- [Argo Stack User Guide](../../docs/user-guide.md) +- [Tenant Onboarding Guide](../../docs/tenant-onboarding.md) +- [Repo Registration Guide](../../docs/repo-registration-guide.md) diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/templates/_helpers.tpl b/helm/argo-stack/overlays/ingress-authz-overlay/templates/_helpers.tpl new file mode 100644 index 00000000..e8f2468d --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/templates/_helpers.tpl @@ -0,0 +1,72 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "ingress-authz-overlay.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +*/}} +{{- define "ingress-authz-overlay.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "ingress-authz-overlay.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "ingress-authz-overlay.labels" -}} +helm.sh/chart: {{ include "ingress-authz-overlay.chart" . }} +{{ include "ingress-authz-overlay.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "ingress-authz-overlay.selectorLabels" -}} +app.kubernetes.io/name: {{ include "ingress-authz-overlay.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the auth-url for NGINX ingress external auth. +*/}} +{{- define "ingress-authz-overlay.authUrl" -}} +{{- $adapter := .Values.ingressAuthzOverlay.authzAdapter -}} +http://{{ $adapter.serviceName }}.{{ $adapter.namespace }}.svc.cluster.local:{{ $adapter.port }}{{ $adapter.path }} +{{- end }} + +{{/* +Create common ingress annotations for NGINX external auth. +*/}} +{{- define "ingress-authz-overlay.authAnnotations" -}} +nginx.ingress.kubernetes.io/auth-url: {{ include "ingress-authz-overlay.authUrl" . | quote }} +nginx.ingress.kubernetes.io/auth-method: "GET" +nginx.ingress.kubernetes.io/auth-signin: {{ .Values.ingressAuthzOverlay.authzAdapter.signinUrl | quote }} +nginx.ingress.kubernetes.io/auth-response-headers: {{ .Values.ingressAuthzOverlay.authzAdapter.responseHeaders | quote }} +nginx.ingress.kubernetes.io/auth-snippet: | + proxy_set_header Authorization $http_authorization; + proxy_set_header X-Original-URI $request_uri; + proxy_set_header X-Original-Method $request_method; + proxy_set_header X-Forwarded-Host $host; +{{- end }} diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/templates/authz-adapter.yaml b/helm/argo-stack/overlays/ingress-authz-overlay/templates/authz-adapter.yaml new file mode 100644 index 00000000..194c1ea8 --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/templates/authz-adapter.yaml @@ -0,0 +1,107 @@ +{{/* +AuthZ Adapter Deployment and Service for the ingress-authz-overlay. +The authz-adapter provides external authentication for NGINX Ingress, +validating tokens and returning user/group information. 
+*/}} +{{- if and .Values.ingressAuthzOverlay.enabled .Values.ingressAuthzOverlay.authzAdapter.deploy }} +{{- $config := .Values.ingressAuthzOverlay }} +{{- $adapter := $config.authzAdapter }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $adapter.serviceName }} + namespace: {{ $adapter.namespace }} + labels: + {{- include "ingress-authz-overlay.labels" . | nindent 4 }} + app.kubernetes.io/component: authz-adapter + app: {{ $adapter.serviceName }} + annotations: + meta.helm.sh/release-name: {{ .Release.Name }} + meta.helm.sh/release-namespace: {{ .Release.Namespace }} +spec: + replicas: {{ $adapter.replicas | default 2 }} + selector: + matchLabels: + app: {{ $adapter.serviceName }} + app.kubernetes.io/instance: {{ .Release.Name }} + template: + metadata: + labels: + app: {{ $adapter.serviceName }} + {{- include "ingress-authz-overlay.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: authz-adapter + spec: + {{- with $adapter.securityContext }} + securityContext: + {{- toYaml . 
| nindent 8 }} + {{- end }} + containers: + - name: authz-adapter + image: {{ $adapter.image }} + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + ports: + - name: http + containerPort: {{ $adapter.port }} + protocol: TCP + env: + - name: FENCE_BASE + value: {{ $adapter.env.fenceBase | quote }} + - name: TENANT_LOGIN_PATH + value: {{ $adapter.env.tenantLoginPath | default "/tenants/login" | quote }} + - name: HTTP_TIMEOUT + value: {{ $adapter.env.httpTimeout | default "3.0" | quote }} + {{- if $adapter.env.gitappBaseUrl }} + - name: GITAPP_BASE_URL + value: {{ $adapter.env.gitappBaseUrl | quote }} + {{- end }} + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 3 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 2 + {{- with $adapter.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ $adapter.serviceName }} + namespace: {{ $adapter.namespace }} + labels: + {{- include "ingress-authz-overlay.labels" . 
| nindent 4 }} + app.kubernetes.io/component: authz-adapter + app: {{ $adapter.serviceName }} + annotations: + meta.helm.sh/release-name: {{ .Release.Name }} + meta.helm.sh/release-namespace: {{ .Release.Namespace }} +spec: + type: ClusterIP + selector: + app: {{ $adapter.serviceName }} + app.kubernetes.io/instance: {{ .Release.Name }} + ports: + - name: http + port: {{ $adapter.port }} + targetPort: http + protocol: TCP +{{- end }} diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/templates/externalname-services.yaml b/helm/argo-stack/overlays/ingress-authz-overlay/templates/externalname-services.yaml new file mode 100644 index 00000000..b70fc050 --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/templates/externalname-services.yaml @@ -0,0 +1,44 @@ +{{/* +ExternalName Services for cross-namespace routing. +When the ingress namespace differs from the service namespace, we create an +ExternalName service in the ingress namespace that points to the actual service +in its original namespace. This enables NGINX Ingress to route traffic correctly. +*/}} +{{- if .Values.ingressAuthzOverlay.enabled }} +{{- $root := . 
}} +{{- $config := .Values.ingressAuthzOverlay }} +{{- range $routeName, $route := $config.routes }} +{{- $routeEnabled := $route.enabled | default true }} +{{- if $routeEnabled }} +{{- $serviceNamespace := $route.serviceNamespace | default $route.namespace }} +{{- if ne $route.namespace $serviceNamespace }} +--- +# ExternalName service to enable cross-namespace routing for {{ $routeName }} +# Routes from {{ $route.namespace }} to {{ $route.service }}.{{ $serviceNamespace }} +apiVersion: v1 +kind: Service +metadata: + name: {{ $route.service }}-proxy + namespace: {{ $route.namespace }} + labels: + {{- include "ingress-authz-overlay.labels" $root | nindent 4 }} + app.kubernetes.io/component: externalname-proxy + ingress-authz-overlay.calypr.io/route: {{ $routeName | quote }} + ingress-authz-overlay.calypr.io/target-namespace: {{ $serviceNamespace | quote }} + ingress-authz-overlay.calypr.io/target-service: {{ $route.service | quote }} + annotations: + # Helm release tracking + meta.helm.sh/release-name: {{ $root.Release.Name }} + meta.helm.sh/release-namespace: {{ $root.Release.Namespace }} +spec: + type: ExternalName + externalName: {{ $route.service }}.{{ $serviceNamespace }}.svc.cluster.local + ports: + - port: {{ $route.port }} + targetPort: {{ $route.port }} + protocol: TCP + name: http +{{- end }} +{{- end }} +{{- end }} +{{- end }} diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/templates/ingress-authz.yaml b/helm/argo-stack/overlays/ingress-authz-overlay/templates/ingress-authz.yaml new file mode 100644 index 00000000..1eb213e4 --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/templates/ingress-authz.yaml @@ -0,0 +1,98 @@ +{{/* +Ingress resources for each route in the ingress-authz-overlay. +Each route creates a separate Ingress resource in its respective namespace, +all sharing the same host and TLS configuration. +All routes are protected by the authz-adapter via NGINX external auth. 
+ +NOTE: Only the route with primary: true should have the cert-manager.io/cluster-issuer +annotation. Other routes just reference the TLS secret without the annotation to avoid +cert-manager ownership conflicts. If no route has primary: true, no ingress will have +the cluster-issuer annotation (the Certificate must be created manually or by another means). + +For cross-namespace routing (when serviceNamespace differs from namespace), we use +an ExternalName service as a proxy. The ExternalName service is created in the +externalname-services.yaml template. +*/}} +{{- if .Values.ingressAuthzOverlay.enabled }} +{{- $root := . }} +{{- $config := .Values.ingressAuthzOverlay }} +{{- range $routeName, $route := $config.routes }} +{{- $routeEnabled := $route.enabled | default true }} +{{- if $routeEnabled }} +{{- $serviceNamespace := $route.serviceNamespace | default $route.namespace }} +{{- $isCrossNamespace := ne $route.namespace $serviceNamespace }} +{{- $serviceName := ternary (printf "%s-proxy" $route.service) $route.service $isCrossNamespace }} +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: ingress-authz-{{ $routeName }} + namespace: {{ $route.namespace }} + labels: + {{- include "ingress-authz-overlay.labels" $root | nindent 4 }} + app.kubernetes.io/component: ingress + ingress-authz-overlay.calypr.io/route: {{ $routeName | quote }} + {{- if $isCrossNamespace }} + ingress-authz-overlay.calypr.io/cross-namespace: "true" + ingress-authz-overlay.calypr.io/target-namespace: {{ $serviceNamespace | quote }} + {{- end }} + annotations: + # Helm release tracking + meta.helm.sh/release-name: {{ $root.Release.Name }} + meta.helm.sh/release-namespace: {{ $root.Release.Namespace }} + # NGINX external auth annotations + {{- include "ingress-authz-overlay.authAnnotations" $root | nindent 4 }} + # {{- if and $config.tls.enabled $route.primary }} + # # Let's Encrypt / cert-manager integration (only on primary route to avoid ownership conflicts) + # 
cert-manager.io/cluster-issuer: {{ $config.tls.clusterIssuer | quote }} + # {{- end }} + {{- if $route.backendProtocol }} + # Backend protocol for services using HTTPS/GRPC internally + # Valid values: HTTP, HTTPS, GRPC, GRPCS, AJP, FCGI + nginx.ingress.kubernetes.io/backend-protocol: {{ $route.backendProtocol | quote }} + {{- end }} + {{- if $route.useRegex }} + # Path rewriting for subpath support + nginx.ingress.kubernetes.io/use-regex: "true" + nginx.ingress.kubernetes.io/rewrite-target: {{ $route.rewriteTarget | default "/$2" }} + {{- end }} + {{- if $isCrossNamespace }} + # Cross-namespace routing via ExternalName service + nginx.ingress.kubernetes.io/upstream-vhost: {{ $route.service }}.{{ $serviceNamespace }}.svc.cluster.local + {{- end }} + {{- if $route.proxyConnectTimeout }} + nginx.ingress.kubernetes.io/proxy-connect-timeout: {{ $route.proxyConnectTimeout | quote }} + {{- end }} + {{- if $route.proxyReadTimeout }} + nginx.ingress.kubernetes.io/proxy-read-timeout: {{ $route.proxyReadTimeout | quote }} + {{- end }} + {{- if $route.proxySendTimeout }} + nginx.ingress.kubernetes.io/proxy-send-timeout: {{ $route.proxySendTimeout | quote }} + {{- end }} +spec: + ingressClassName: "nginx" + {{- if $config.tls.enabled }} + tls: + - hosts: + - {{ $config.host | quote }} + secretName: {{ $config.tls.secretName | quote }} + {{- end }} + rules: + - host: {{ $config.host | quote }} + http: + paths: + {{- if $route.useRegex }} + - path: {{ $route.pathPrefix }}(/|$)(.*) + pathType: ImplementationSpecific + {{- else }} + - path: {{ $route.pathPrefix }} + pathType: Prefix + {{- end }} + backend: + service: + name: {{ $serviceName }} + port: + number: {{ $route.port }} +{{- end }} +{{- end }} +{{- end }} diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/tests/authz-ingress.feature b/helm/argo-stack/overlays/ingress-authz-overlay/tests/authz-ingress.feature new file mode 100644 index 00000000..e84c7a8c --- /dev/null +++ 
b/helm/argo-stack/overlays/ingress-authz-overlay/tests/authz-ingress.feature @@ -0,0 +1,72 @@ +Feature: Authz ingress overlay + + Background: + Given the ingress-authz-overlay is installed + And the hostname "calypr-demo.ddns.net" resolves to the ingress endpoint + + Scenario: Unauthenticated user is redirected to login + When I send a GET request to "https://calypr-demo.ddns.net/workflows" + Then the response status should be 302 or 303 + And the "Location" header should contain "/tenants/login" + + Scenario: Authenticated user can access workflows + Given I have a valid session recognized by authz-adapter + When I send a GET request to "https://calypr-demo.ddns.net/workflows" + Then the response status should be 200 + + Scenario: All paths are protected by authz-adapter + When I send a GET request to "https://calypr-demo.ddns.net/applications" without credentials + Then I should be redirected to "/tenants/login" + + When I send a GET request to "https://calypr-demo.ddns.net/registrations" without credentials + Then I should be redirected to "/tenants/login" + + When I send a GET request to "https://calypr-demo.ddns.net/api" without credentials + Then I should be redirected to "/tenants/login" + + When I send a GET request to "https://calypr-demo.ddns.net/tenants" without credentials + Then I should be redirected to "/tenants/login" or served only public content as configured + + Scenario: TLS certificate is valid + When I connect to "https://calypr-demo.ddns.net" + Then the TLS certificate should be issued by "Let's Encrypt" + And the certificate subject alt name should include "calypr-demo.ddns.net" + + Scenario: Routing sends requests to the correct services + Given I am authenticated + When I send a GET request to "https://calypr-demo.ddns.net/workflows" + Then the response should contain an HTML title for the workflows UI + + When I send a GET request to "https://calypr-demo.ddns.net/applications" + Then the response should contain an HTML title for the 
applications UI + + When I send a GET request to "https://calypr-demo.ddns.net/api/health" + Then I should receive a 200 response with a JSON health object from the API + + When I send a GET request to "https://calypr-demo.ddns.net/tenants" + Then I should see the tenant portal landing page or login as configured + + Scenario: Auth response headers are passed to backend + Given I am authenticated with user "test@example.com" in groups "argo-runner,argo-viewer" + When I send a GET request to "https://calypr-demo.ddns.net/api/whoami" + Then the backend should receive header "X-Auth-Request-User" with value "test@example.com" + And the backend should receive header "X-Auth-Request-Groups" with value "argo-runner,argo-viewer" + + Scenario: Path rewriting works correctly + Given I am authenticated + When I send a GET request to "https://calypr-demo.ddns.net/workflows/workflow-details/my-workflow" + Then the Argo Workflows server should receive path "/workflow-details/my-workflow" + + When I send a GET request to "https://calypr-demo.ddns.net/api/v1/users" + Then the Calypr API should receive path "/v1/users" + + Scenario: Health check endpoint is accessible + When I send a GET request to "http://authz-adapter.argo-stack.svc.cluster.local:8080/healthz" + Then the response status should be 200 + And the response body should be "ok" + + Scenario: Multiple simultaneous requests are handled + Given I am authenticated + When I send 10 concurrent GET requests to "https://calypr-demo.ddns.net/workflows" + Then all responses should have status 200 + And the average response time should be less than 500ms diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/values-ingress-nginx.yaml b/helm/argo-stack/overlays/ingress-authz-overlay/values-ingress-nginx.yaml new file mode 100644 index 00000000..58dda035 --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/values-ingress-nginx.yaml @@ -0,0 +1,4 @@ +controller: + config: + allow-snippet-annotations: "true" + 
annotations-risk-level: Critical # allow Critical-risk annotations like auth-snippet diff --git a/helm/argo-stack/overlays/ingress-authz-overlay/values.yaml b/helm/argo-stack/overlays/ingress-authz-overlay/values.yaml new file mode 100644 index 00000000..54a77cc5 --- /dev/null +++ b/helm/argo-stack/overlays/ingress-authz-overlay/values.yaml @@ -0,0 +1,184 @@ +# ============================================================================ +# Ingress AuthZ Overlay Configuration +# ============================================================================ +# This overlay provides a single host, path-based ingress layer for all +# major UIs and APIs, protected by a centralized authz-adapter. +# +# Usage: +# helm upgrade --install ingress-authz-overlay \ +# helm/argo-stack/overlays/ingress-authz-overlay \ +# --set ingressAuthzOverlay.enabled=true +# ============================================================================ + + +ingressAuthzOverlay: + # Enable or disable the overlay + enabled: true + + # ============================================================================ + # Host and TLS Configuration + # ============================================================================ + # Single host for all path-based routes + host: calypr-demo.ddns.net + + # TLS configuration using cert-manager + tls: + enabled: true + secretName: calypr-demo-tls + + # ============================================================================ + # Ingress Controller Configuration + # ============================================================================ + ingressClassName: nginx + + # ============================================================================ + # AuthZ Adapter Configuration + # ============================================================================ + authzAdapter: + # Enable deployment of authz-adapter (set to false if using centralized adapter) + # NOTE: By default, the main argo-stack chart deploys authz-adapter to the + # 'security' namespace. 
Set deploy: false to reuse that instance. + deploy: false + + # Service discovery settings + # NOTE: When deploy: false, ensure these point to the existing authz-adapter + # deployed by the main argo-stack chart in the 'security' namespace. + serviceName: authz-adapter + namespace: security + port: 8080 + + # Auth endpoint path + path: /check + + # Sign-in URL for unauthenticated requests + signinUrl: https://calypr-demo.ddns.net/tenants/login + + # Headers to pass back from auth response + responseHeaders: "X-User,X-Email,X-Groups,X-Auth-Request-User,X-Auth-Request-Email,X-Auth-Request-Groups" + + # Container image for authz-adapter + image: ghcr.io/calypr/argo-helm:latest + + # Number of replicas + replicas: 2 + + # Environment configuration for the adapter + env: + # GitApp/Fence base URL for user info + fenceBase: "https://calypr-dev.ohsu.edu/user" + # Tenant login path + tenantLoginPath: "/tenants/login" + # HTTP timeout for auth calls + httpTimeout: "3.0" + + # Resource limits and requests + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + + # Pod security context + securityContext: + runAsNonRoot: true + runAsUser: 1000 + + # ============================================================================ + # Route Definitions + # ============================================================================ + # Each route creates a separate Ingress resource in the specified namespace. + # All routes share the same host and TLS configuration. + # All routes are protected by the authz-adapter via NGINX external auth. + # + # IMPORTANT: Only ONE route should have the cert-manager.io/cluster-issuer annotation + # to avoid "certificate resource is not owned by this object" errors. + # Set `primary: true` on exactly one route to designate it as the certificate owner. 
+ routes: + # Landing Page - Root path (/) + landing: + enabled: true + namespace: security + service: landing-page + serviceNamespace: security + port: 80 + pathPrefix: / + # Must be lower priority than other routes + # NGINX processes longer paths first, so /workflows takes precedence over / + + # Argo Workflows UI (primary route - manages TLS certificate) + workflows: + enabled: true + # Set primary: true to designate this route as the certificate owner + # Only the primary route gets the cert-manager.io/cluster-issuer annotation + primary: true + # Namespace where the ingress will be created + namespace: argo-workflows + # Service name to route to + service: argo-stack-argo-workflows-server + # Namespace where the actual service exists (for cross-namespace routing) + # If different from namespace, an ExternalName service will be created + serviceNamespace: argo-workflows + port: 2746 + pathPrefix: /workflows + # Use regex path matching for subpaths + useRegex: true + # Rewrite path to remove prefix + rewriteTarget: /$2 + + # Argo CD Applications UI + applications: + enabled: true + namespace: argocd + service: argo-stack-argocd-server + # ArgoCD server is in argocd namespace + serviceNamespace: argocd + port: 8080 + pathPrefix: /applications + useRegex: true + rewriteTarget: /$2 + # ArgoCD server uses HTTPS by default + # backendProtocol: HTTPS + + # GitHub Repository Registrations EventSource + registrations: + enabled: true + namespace: argo-events + service: github-repo-registrations-eventsource-svc + # EventSource is in argo-events namespace + serviceNamespace: argo-events + port: 12000 + pathPrefix: /registrations + useRegex: true + rewriteTarget: /$2 + + # Calypr API Service + api: + enabled: true + namespace: calypr-api + service: calypr-api + # Service is in same namespace as ingress + serviceNamespace: calypr-api + port: 3000 + pathPrefix: /api + useRegex: true + rewriteTarget: /$2 + + # Calypr Tenants Service + tenants: + enabled: true + namespace: 
calypr-tenants + service: calypr-tenants + # Service is in same namespace as ingress + serviceNamespace: calypr-tenants + port: 3001 + pathPrefix: /tenants + useRegex: true + rewriteTarget: /$2 + # Optional: Allow public access to login endpoint + # Set to true to skip auth for /tenants/login + publicPaths: + - /tenants/login + - /tenants/logout + - /tenants/callback diff --git a/helm/argo-stack/overlays/ip-address-pool.yaml b/helm/argo-stack/overlays/ip-address-pool.yaml new file mode 100644 index 00000000..beae18e5 --- /dev/null +++ b/helm/argo-stack/overlays/ip-address-pool.yaml @@ -0,0 +1,17 @@ +apiVersion: metallb.io/v1beta1 +kind: IPAddressPool +metadata: + name: default-pool + namespace: metallb-system +spec: + addresses: + - 100.22.124.96-100.22.124.96 # Adjust to your available IP range +--- +apiVersion: metallb.io/v1beta1 +kind: L2Advertisement +metadata: + name: default + namespace: metallb-system +spec: + ipAddressPools: + - default-pool diff --git a/helm/argo-stack/templates/30-authz-adapter.yaml b/helm/argo-stack/templates/30-authz-adapter.yaml index 43e0f4c5..1dd25204 100644 --- a/helm/argo-stack/templates/30-authz-adapter.yaml +++ b/helm/argo-stack/templates/30-authz-adapter.yaml @@ -24,10 +24,19 @@ spec: spec: containers: - name: authz-adapter - image: {{ .Values.authzAdapter.image }} + image: "{{ .Values.authzAdapter.image.repository }}:{{ .Values.authzAdapter.image.tag }}" + imagePullPolicy: "{{ .Values.authzAdapter.image.pullPolicy }}" env: - name: FENCE_BASE value: {{ .Values.authzAdapter.fenceBase | quote }} + {{- if .Values.authzAdapter.debugEmail }} + - name: DEBUG_EMAIL + value: {{ .Values.authzAdapter.debugEmail | quote }} + {{- end }} + {{- if .Values.authzAdapter.debugGroups }} + - name: DEBUG_GROUPS + value: {{ .Values.authzAdapter.debugGroups | quote }} + {{- end }} ports: - containerPort: 8080 --- diff --git a/helm/argo-stack/templates/31-landing-page.yaml b/helm/argo-stack/templates/31-landing-page.yaml new file mode 100644 index 
00000000..67cf5a8f --- /dev/null +++ b/helm/argo-stack/templates/31-landing-page.yaml @@ -0,0 +1,66 @@ +{{- if .Values.landingPage.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: landing-page + namespace: {{ .Values.namespaces.security }} + labels: + app: landing-page +spec: + replicas: 1 + selector: + matchLabels: + app: landing-page + template: + metadata: + labels: + app: landing-page + spec: + containers: + - name: nginx + image: "{{ .Values.landingPage.image.repository }}:{{ .Values.landingPage.image.tag }}" + imagePullPolicy: {{ .Values.landingPage.image.pullPolicy }} + ports: + - containerPort: 80 + {{- if .Values.landingPage.docsPath }} + volumeMounts: + - name: docs + mountPath: /docs + readOnly: true + {{- end }} + livenessProbe: + httpGet: + path: /healthz + port: 80 + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: 80 + initialDelaySeconds: 2 + periodSeconds: 5 + {{- if .Values.landingPage.docsPath }} + volumes: + # Note: hostPath is used per design requirements to allow host-mounted content. + # Ensure the path only exposes the intended documentation directory. 
+ - name: docs + hostPath: + path: {{ .Values.landingPage.docsPath }} + type: Directory + {{- end }} +--- +apiVersion: v1 +kind: Service +metadata: + name: landing-page + namespace: {{ .Values.namespaces.security }} + labels: + app: landing-page +spec: + selector: + app: landing-page + ports: + - port: 80 + targetPort: 80 +{{- end }} diff --git a/helm/argo-stack/templates/90-argocd-application.yaml b/helm/argo-stack/templates/90-argocd-application.yaml index 58125979..c5a5b719 100644 --- a/helm/argo-stack/templates/90-argocd-application.yaml +++ b/helm/argo-stack/templates/90-argocd-application.yaml @@ -18,6 +18,8 @@ metadata: namespace: {{ .Values.namespaces.argocd }} annotations: "helm.sh/hook": post-install,post-upgrade # <---- Annotations added here + notifications.argoproj.io/subscribe.on-deploy-succeeded.github: "" + notifications.argoproj.io/subscribe.on-deploy-failed.github: "" spec: project: {{ .Values.argocdApplication.project }} source: diff --git a/helm/argo-stack/templates/argocd/applications-from-repo-registrations.yaml b/helm/argo-stack/templates/argocd/applications-from-repo-registrations.yaml index ef387f92..4cb4942f 100644 --- a/helm/argo-stack/templates/argocd/applications-from-repo-registrations.yaml +++ b/helm/argo-stack/templates/argocd/applications-from-repo-registrations.yaml @@ -21,6 +21,20 @@ metadata: {{- if $reg.tenant }} repo-registration/tenant: {{ $reg.tenant | quote }} {{- end }} + {{- if and $.Values.notifications.enabled $.Values.notifications.github.enabled $.Values.notifications.defaultSubscriptions.autoSubscribe }} + {{- if $.Values.notifications.triggers.onSyncSucceeded }} + notifications.argoproj.io/subscribe.on-sync-succeeded.github: "" + {{- end }} + {{- if $.Values.notifications.triggers.onSyncFailed }} + notifications.argoproj.io/subscribe.on-sync-failed.github: "" + {{- end }} + {{- if $.Values.notifications.triggers.onSyncRunning }} + notifications.argoproj.io/subscribe.on-sync-running.github: "" + {{- end }} + {{- if 
$.Values.notifications.triggers.onHealthDegraded }} + notifications.argoproj.io/subscribe.on-health-degraded.github: "" + {{- end }} + {{- end }} spec: project: {{ $reg.project | default "default" }} source: diff --git a/helm/argo-stack/templates/argocd/notifications-configmap.yaml b/helm/argo-stack/templates/argocd/notifications-configmap.yaml new file mode 100644 index 00000000..3ee96cf8 --- /dev/null +++ b/helm/argo-stack/templates/argocd/notifications-configmap.yaml @@ -0,0 +1,158 @@ +{{- if .Values.notifications.enabled }} +{{- if .Values.notifications.github.enabled }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: argocd-notifications-cm + namespace: {{ .Values.namespaces.argocd }} + labels: + app.kubernetes.io/name: {{ .Chart.Name }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/component: argocd-notifications +data: + # ========================================================= + # GitHub Notification Service Configuration + # ========================================================= + service.github: | + {{- if .Values.notifications.github.useGithubApp }} + # Using GitHub App for notifications + appID: {{ .Values.notifications.github.appId | default .Values.githubApp.appId }} + installationID: {{ .Values.notifications.github.installationId | default .Values.githubApp.installationId }} + privateKey: $github-privateKey + {{- end }} + + {{- /* derive base URL once */ -}} + {{- $argocdUrl := .Values.notifications.argocdUrl | default "https://calypr-demo.ddns.net/applications" }} + + + # ========================================================= + # Notification Templates + # ========================================================= + + # Successful sync (commit status) + template.app-sync-succeeded: | + message: | + Application {{`{{.app.metadata.name}}`}} has been successfully synced. 
+ github: + repoURLPath: "{{`{{.app.spec.source.repoURL}}`}}" + revisionPath: "{{`{{.app.status.sync.revision}}`}}" + status: + state: success + label: "argocd/{{`{{.app.metadata.name}}`}}" + targetURL: "{{ $argocdUrl }}/{{`{{.app.metadata.name}}`}}" + + # Failed sync (commit status) + template.app-sync-failed: | + message: | + Application {{`{{.app.metadata.name}}`}} sync failed. + Error: {{`{{.app.status.operationState.message}}`}} + github: + repoURLPath: "{{`{{.app.spec.source.repoURL}}`}}" + revisionPath: "{{`{{.app.status.sync.revision}}`}}" + status: + state: failure + label: "argocd/{{`{{.app.metadata.name}}`}}" + targetURL: "{{ $argocdUrl }}/{{`{{.app.metadata.name}}`}}" + + # Sync running (optional commit status) + template.app-sync-running: | + message: | + Application {{`{{.app.metadata.name}}`}} sync is in progress. + github: + repoURLPath: "{{`{{.app.spec.source.repoURL}}`}}" + revisionPath: "{{`{{.app.status.sync.revision}}`}}" + status: + state: pending + label: "argocd/{{`{{.app.metadata.name}}`}}" + targetURL: "{{ $argocdUrl }}/{{`{{.app.metadata.name}}`}}" + + # Health degraded (commit status) + template.app-health-degraded: | + message: | + Application {{`{{.app.metadata.name}}`}} health has degraded. + github: + repoURLPath: "{{`{{.app.spec.source.repoURL}}`}}" + revisionPath: "{{`{{.app.status.sync.revision}}`}}" + status: + state: failure + label: "argocd/health/{{`{{.app.metadata.name}}`}}" + targetURL: "{{ $argocdUrl }}/{{`{{.app.metadata.name}}`}}" + + # Deployment notification (GitHub deployment API) + template.app-deployed: | + message: | + Application {{`{{.app.metadata.name}}`}} has been deployed to {{`{{.app.spec.destination.namespace}}`}}. 
+ Revision: {{`{{.app.status.sync.revision}}`}} + github: + repoURLPath: "{{`{{.app.spec.source.repoURL}}`}}" + revisionPath: "{{`{{.app.status.sync.revision}}`}}" + deployment: + state: success + environment: "{{`{{.app.spec.destination.namespace}}`}}" + environmentURL: "{{`{{.context.argocdUrl}}`}}/applications/{{`{{.app.metadata.name}}`}}" + logURL: "{{`{{.context.argocdUrl}}`}}/applications/{{`{{.app.metadata.name}}`}}?view=log" + requiredContexts: [] + autoMerge: false + + # ========================================================= + # Notification Triggers + # ========================================================= + {{- if .Values.notifications.triggers.onSyncSucceeded }} + trigger.on-sync-succeeded: | + - description: Application syncing has succeeded + when: app.status.sync.status == 'Synced' && app.status.health.status == 'Healthy' + oncePer: app.status.sync.revision + send: + - app-sync-succeeded + - app-deployed + {{- end }} + + {{- if .Values.notifications.triggers.onSyncFailed }} + trigger.on-sync-failed: | + - when: app.status.operationState != nil && app.status.operationState.phase in ['Error', 'Failed'] + send: + - app-sync-failed + {{- end }} + + {{- if .Values.notifications.triggers.onSyncRunning }} + trigger.on-sync-running: | + - when: app.status.operationState != nil && app.status.operationState.phase in ['Running'] + send: + - app-sync-running + {{- end }} + + {{- if .Values.notifications.triggers.onHealthDegraded }} + trigger.on-health-degraded: | + - when: app.status.health.status == 'Degraded' + send: + - app-health-degraded + {{- end }} + + # ========================================================= + # Default Subscriptions + # ========================================================= + subscriptions: | + {{- if .Values.notifications.triggers.onSyncSucceeded }} + - recipients: + - github + triggers: + - on-sync-succeeded + {{- end }} + {{- if .Values.notifications.triggers.onSyncFailed }} + - recipients: + - github + triggers: + - 
on-sync-failed + {{- end }} + {{- if .Values.notifications.triggers.onHealthDegraded }} + - recipients: + - github + triggers: + - on-health-degraded + {{- end }} +{{- end }} +{{- end }} + + diff --git a/helm/argo-stack/templates/argocd/repo-creds-github-app.yaml b/helm/argo-stack/templates/argocd/repo-creds-github-app.yaml new file mode 100644 index 00000000..2e2e8b71 --- /dev/null +++ b/helm/argo-stack/templates/argocd/repo-creds-github-app.yaml @@ -0,0 +1,50 @@ +{{- if .Values.githubApp.enabled }} +{{- if and .Values.githubApp.appId .Values.githubApp.installationId }} +--- +# Repo Credentials Secret for GitHub App authentication +# This secret grants Argo CD access to repositories via GitHub App +# When repoCredsUrl is set, it provides access to all repos under that URL prefix +# +# The private key must be provided via one of: +# 1. ExternalSecret (when vault integration is enabled) +# 2. Manually created secret named {{ .Values.githubApp.privateKeySecretName }} +# +# Argo CD will look up the private key from the referenced secret. +apiVersion: v1 +kind: Secret +metadata: + name: github-app-repo-creds + namespace: {{ .Values.namespaces.argocd }} + labels: + app.kubernetes.io/name: {{ .Chart.Name }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/component: argocd + app.kubernetes.io/part-of: github-app-auth + # This label tells Argo CD this is a repo-creds secret + argocd.argoproj.io/secret-type: repo-creds + {{- with .Values.githubApp.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +type: Opaque +stringData: + # Type of repository + type: git + # URL pattern that this credential applies to + # Example: https://github.com/your-org grants access to all repos in the org + {{- if .Values.githubApp.repoCredsUrl }} + url: {{ .Values.githubApp.repoCredsUrl | quote }} + {{- else }} + url: "https://github.com" + {{- end }} + # GitHub App ID + githubAppID: {{ .Values.githubApp.appId | quote }} + # GitHub App Installation ID + githubAppInstallationID: {{ .Values.githubApp.installationId | quote }} + # Reference to the secret containing the private key + # Argo CD will read the private key from this secret + githubAppPrivateKeySecret: {{ .Values.githubApp.privateKeySecretName | default "argocd-github-app-key" | quote }} +{{- end }} +{{- end }} + diff --git a/helm/argo-stack/templates/eso/externalsecret-github-app.yaml b/helm/argo-stack/templates/eso/externalsecret-github-app.yaml new file mode 100644 index 00000000..57c16aa8 --- /dev/null +++ b/helm/argo-stack/templates/eso/externalsecret-github-app.yaml @@ -0,0 +1,40 @@ +{{- if and (include "argo-stack.vault.enabled" .) 
.Values.githubApp.enabled .Values.githubApp.privateKeyVaultPath }} +{{- if .Capabilities.APIVersions.Has "external-secrets.io/v1/ClusterSecretStore" }} +--- +# ExternalSecret for GitHub App private key +# This secret is used by Argo CD for repository authentication via GitHub App +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: {{ .Values.githubApp.privateKeySecretName | default "argocd-github-app-key" }} + namespace: {{ .Values.namespaces.argocd }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "1" + labels: + app.kubernetes.io/name: {{ .Chart.Name }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/component: argocd + app.kubernetes.io/part-of: github-app-auth +spec: + refreshInterval: 1h + secretStoreRef: + kind: {{ include "argo-stack.secretStore.kind" . }} + name: {{ include "argo-stack.secretStore.name" . }} + target: + name: {{ .Values.githubApp.privateKeySecretName | default "argocd-github-app-key" }} + creationPolicy: Owner + template: + type: Opaque + metadata: + labels: + app.kubernetes.io/part-of: argocd + data: + - secretKey: privateKey + remoteRef: + key: {{ include "argo-stack.vault.extractKey" .Values.githubApp.privateKeyVaultPath }} + property: {{ include "argo-stack.vault.extractProperty" .Values.githubApp.privateKeyVaultPath }} +{{- end }} +{{- end }} + diff --git a/helm/argo-stack/templates/eso/externalsecret-notifications.yaml b/helm/argo-stack/templates/eso/externalsecret-notifications.yaml new file mode 100644 index 00000000..cf18510f --- /dev/null +++ b/helm/argo-stack/templates/eso/externalsecret-notifications.yaml @@ -0,0 +1,41 @@ +{{- if and (include "argo-stack.vault.enabled" .) 
.Values.notifications.enabled .Values.notifications.github.enabled .Values.notifications.github.useGithubApp }} +{{- if .Capabilities.APIVersions.Has "external-secrets.io/v1/ClusterSecretStore" }} +{{- $privateKeyVaultPath := .Values.notifications.github.privateKeyVaultPath | default .Values.githubApp.privateKeyVaultPath }} +--- +# ExternalSecret for Argo CD Notifications GitHub integration +# This creates the argocd-notifications-secret with the GitHub App private key +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: argocd-notifications-secret + namespace: {{ .Values.namespaces.argocd }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "2" + labels: + app.kubernetes.io/name: {{ .Chart.Name }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/component: argocd-notifications +spec: + refreshInterval: 1h + secretStoreRef: + kind: {{ include "argo-stack.secretStore.kind" . }} + name: {{ include "argo-stack.secretStore.name" . 
}} + target: + name: argocd-notifications-secret + creationPolicy: Owner + template: + type: Opaque + metadata: + labels: + app.kubernetes.io/part-of: argocd + data: + # Map the GitHub App private key to the key expected by notifications controller + - secretKey: github-privateKey + remoteRef: + key: {{ include "argo-stack.vault.extractKey" $privateKeyVaultPath }} + property: {{ include "argo-stack.vault.extractProperty" $privateKeyVaultPath }} +{{- end }} +{{- end }} + diff --git a/helm/argo-stack/templates/events/eventsource-github-from-repo-registrations.yaml b/helm/argo-stack/templates/events/eventsource-github-from-repo-registrations.yaml index 973b9276..fd47b9f1 100644 --- a/helm/argo-stack/templates/events/eventsource-github-from-repo-registrations.yaml +++ b/helm/argo-stack/templates/events/eventsource-github-from-repo-registrations.yaml @@ -78,9 +78,8 @@ metadata: namespace: {{ .Values.events.namespace | default "argo-events" }} labels: source: repo-registration - annotations: - kubernetes.io/ingress.class: {{ .Values.events.github.webhook.ingress.className | default "nginx" | quote }} spec: + ingressClassName: {{ .Values.events.github.webhook.ingress.className | default "nginx" | quote }} rules: {{- range $h := (.Values.events.github.webhook.ingress.hosts | default list) }} - host: {{ $h }} diff --git a/helm/argo-stack/templates/roles/workflow-rbac.yaml b/helm/argo-stack/templates/roles/workflow-rbac.yaml new file mode 100644 index 00000000..1e38ac1b --- /dev/null +++ b/helm/argo-stack/templates/roles/workflow-rbac.yaml @@ -0,0 +1,79 @@ +{{- $wfPrefix := .Values.workflowNamespacePrefix -}} +{{- $adminGroup := .Values.adminGroup -}} +{{- range $r := .Values.repoRegistrations }} +{{- $repoUrl := required "repoRegistrations[].repoUrl is required" $r.repoUrl }} +{{- $trimmed := $repoUrl | trimPrefix "https://github.com/" | trimSuffix ".git" }} +{{- $parts := splitList "/" $trimmed }} +{{- $org := index $parts 0 }} +{{- $repo := index $parts 1 }} +{{- $ns := 
printf "%s%s-%s" $wfPrefix $org $repo }} +{{- $writerGroup := printf "wf-%s-%s-writers" $org $repo }} +{{- $readerGroup := printf "wf-%s-%s-readers" $org $repo }} + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: wf-{{ $org }}-{{ $repo }}-writer + namespace: {{ $ns }} +rules: + - apiGroups: ["argoproj.io"] + resources: ["workflows", "workflowtemplates", "cronworkflows"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: [""] + resources: ["pods", "pods/log"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["configmaps", "secrets"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: wf-{{ $org }}-{{ $repo }}-reader + namespace: {{ $ns }} +rules: + - apiGroups: ["argoproj.io"] + resources: ["workflows", "workflowtemplates", "cronworkflows"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods", "pods/log"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["configmaps", "secrets"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: wf-{{ $org }}-{{ $repo }}-writer-binding + namespace: {{ $ns }} +subjects: + - kind: Group + name: {{ $writerGroup }} + apiGroup: rbac.authorization.k8s.io + - kind: Group + name: {{ $adminGroup }} + apiGroup: rbac.authorization.k8s.io +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: wf-{{ $org }}-{{ $repo }}-writer + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: wf-{{ $org }}-{{ $repo }}-reader-binding + namespace: {{ $ns }} +subjects: + - kind: Group + name: {{ $readerGroup }} + apiGroup: rbac.authorization.k8s.io +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: wf-{{ $org }}-{{ $repo }}-reader +{{- end }} diff --git a/helm/argo-stack/values.yaml b/helm/argo-stack/values.yaml index 4215afd3..ae459a40 100644 
--- a/helm/argo-stack/values.yaml +++ b/helm/argo-stack/values.yaml @@ -8,9 +8,11 @@ namespaces: argo: argo-workflows argocd: argocd - tenant: wf-poc + calypr-tenants: calypr-tenants security: security argo-events: argo-events + argo-stack: argo-stack + calypr-api: calypr-api # ============================================================================ # External Secrets Operator (ESO) and Vault Integration @@ -141,9 +143,11 @@ argo-workflows: namespaced: true extraArgs: - --auth-mode=server + # UI/API lives under /workflows + baseHref: "/workflows/" controller: workflowNamespaces: - - wf-poc + - argo-workflows # Ensure controller uses correct namespace namespaceInstallMode: true # Enable log archiving for all workflows @@ -159,26 +163,99 @@ argo-cd: enabled: true namespace: argocd # where Argo CD runs namespaceOverride: argocd + admin.enabled: "false" # turn off login screen configs: secret: create: true # provide a strong random key (hex or base64 is fine) - extra: {} # MUST be a map, not a string/boolean + extra: # MUST be a map, not a string/boolean + # any sufficiently random string; 32+ chars is nice + server.secretkey: "some-long-random-string-change-me" # Never commit your real token in a Git repo. instead, override it at deploy time using: # export ARGO_CD_SECRET=replace with output of: openssl rand -hex 32 # helm upgrade --install ... 
argo-cd.configs.secret.extra."server\.secretkey"="${ARGOCD_SECRET_KEY}" # server.secretkey: "" + cm: + # External URL including the sub-path + url: https://calypr-demo.ddns.net/applications + dex.config: | + enablePasswordDB: false + oauth2: + alwaysShowLoginScreen: false + connectors: + - type: authproxy + id: authproxy + name: Auth Proxy + config: + userHeader: "X-Auth-Request-User" + groupHeader: "X-Auth-Request-Groups" + params: + # Argo CD is behind ingress that terminates TLS + server.insecure: true + # Tell ArgoCD UI/API it’s under /applications + server.basehref: "/applications" + # server.rootpath: "/applications" + notifications: + enabled: true # ============================================================================ # Authorization Adapter Settings (Arborist Integration) # ============================================================================ authzAdapter: - image: ghcr.io/calypr/argo-helm:latest + image: + repository: authz-adapter + pullPolicy: IfNotPresent + tag: v0.0.1 fenceBase: "https://calypr-dev.ohsu.edu/user" replicas: 2 # Ensure adapter uses security namespace namespace: security + # Debug mode settings (for development/testing only) + # When debugEmail is set, enables debug mode and allows debug_email/debug_groups query params + debugEmail: brian@bwalsh.com + # Comma-separated list of groups to assign to the debug user + debugGroups: wf-admins + +# ============================================================================ +# Landing Page Configuration +# ============================================================================ +# A simple static landing page that renders markdown files at the root path (/) +# Content is sourced from a host-mounted directory containing markdown files. 
+ +landingPage: + # Enable the landing page deployment + enabled: true + # Container image configuration + image: + repository: landing-page + tag: v3 + pullPolicy: IfNotPresent + # Page title + title: "Welcome" + # -------------------------------------------------------------------------- + # docsPath - Host path to mount for markdown content (optional) + # -------------------------------------------------------------------------- + # This is a Kubernetes hostPath volume that mounts a directory from the + # node's filesystem into the landing-page container at /docs. + # + # For kind clusters: + # - The path refers to a directory inside the kind-control-plane container + # - To mount a host directory, use kind's extraMounts in kind-config.yaml: + # extraMounts: + # - hostPath: /path/on/your/host + # containerPath: /var/www/docs + # - Then set docsPath: "/var/www/docs" + # + # Content updates: + # - Changes to markdown files are picked up on browser refresh (no restart needed) + # - The page fetches markdown via JavaScript on each page load + # - nginx serves files directly from the mounted directory + # + # If not set, shows a default "no content available" message. 
+ # SECURITY: Only expose a dedicated docs directory, not sensitive host paths + # Example: "/var/www/docs" + docsPath: "/var/www/docs" # ============================================================================ # Artifact Storage Configuration ( Results of Workflows ) @@ -194,20 +271,6 @@ s3: accessKey: "" secretKey: "" -# Ingresses we ship (toggle/hosts) -ingress: - argoWorkflows: - enabled: true - host: "argo.localtest.me" - tls: - enabled: false - secretName: "" - argocd: - enabled: true - host: "argocd.localtest.me" - tls: - enabled: false - secretName: "" # NGINX auth to the adapter ingressAuth: @@ -215,6 +278,67 @@ ingressAuth: authURL: "http://authz-adapter.security.svc.cluster.local:8080/check" passAuthorization: true +# ============================================================================ +# Ingress AuthZ Overlay - Unified Path-Based Routing with Centralized Auth +# ============================================================================ +# Enable this overlay to provide a single host, path-based ingress for all +# major UIs and APIs, protected by the authz-adapter. 
+# See: helm/argo-stack/overlays/ingress-authz-overlay/docs/authz-ingress-user-guide.md +# +# To use the overlay, install it separately: +# helm upgrade --install ingress-authz-overlay \ +# helm/argo-stack/overlays/ingress-authz-overlay \ +# --values helm/argo-stack/values.yaml \ +# --set ingressAuthzOverlay.enabled=true + +ingressAuthzOverlay: + enabled: true + host: calypr-demo.ddns.net + tls: + secretName: calypr-demo-tls + authzAdapter: + # Use centralized adapter from security namespace + deploy: false + serviceName: authz-adapter + namespace: security + port: 8080 + path: /check + signinUrl: https://calypr-demo.ddns.net/tenants/login + responseHeaders: X-User, X-Email, X-Groups + routes: + landing: + namespace: security + service: landing-page + port: 80 + pathPrefix: / + workflows: + namespace: argo-stack + service: argo-stack-argo-workflows-server + port: 2746 + pathPrefix: /workflows + applications: + namespace: argo-stack + service: argo-stack-argocd-server + port: 8080 + pathPrefix: /applications + # ArgoCD server uses HTTPS by default + backendProtocol: HTTPS + registrations: + namespace: argo-stack + service: github-repo-registrations-eventsource-svc + port: 12000 + pathPrefix: /registrations + api: + namespace: calypr-api + service: calypr-api + port: 3000 + pathPrefix: /api + tenants: + namespace: calypr-tenants + service: calypr-tenants + port: 3001 + pathPrefix: /tenants + # ============================================================================ # Argo CD Applications - Multi-Application Support (REMOVED) # ============================================================================ @@ -385,3 +509,114 @@ repoRegistrations: [] # - admin@myorg.com # # See examples/repo-registrations-example.yaml and docs/repo-registration-guide.md + + +# for roles +# Namespace where Argo CD is installed +argocdNamespace: argocd + +# Prefix for workflow namespaces. 
For a repoUrl https://github.com/<org>/<repo>.git, +# the workflow namespace is assumed to be: <workflowNamespacePrefix><org>-<repo> +workflowNamespacePrefix: wf- + +# Global group that should have full Argo CD admin privileges +adminGroup: wf-admins + + +# ============================================================================ +# GitHub App Authentication for Repository Access +# ============================================================================ +# Use GitHub App for secure, auditable, least-privilege repository access. +# This replaces Personal Access Tokens (PATs) or SSH keys with short-lived +# installation tokens that are automatically rotated. +# +# See docs/github-app-setup.md for detailed setup instructions. + +githubApp: + # Enable GitHub App authentication for Argo CD repositories + enabled: true + + # GitHub App ID (found in the GitHub App settings) + appId: $GITHUBHAPP_APP_ID + + # GitHub App Installation ID (found in the installation settings) + installationId: $GITHUBHAPP_INSTALLATION_ID + + # Name of the Kubernetes Secret containing the GitHub App private key + # The secret should contain a key named 'privateKey' with the PEM-formatted key + privateKeySecretName: $GITHUBHAPP_PRIVATE_KEY_SECRET_NAME + + # Vault path for the GitHub App private key (when using External Secrets) + # Format: "path#key" where path is relative to vault.kv.defaultPathPrefix + privateKeyVaultPath: $GITHUBHAPP_PRIVATE_KEY_VAULT_PATH + + # Repository credential scope + # Use this to grant access to all repositories under an organization + # Example: "https://github.com/your-org" grants access to all repos in the org + # repoCredsUrl: "" + + # Additional annotations for the repo-creds secret + annotations: {} + +# ============================================================================ +# Argo CD Notifications - GitHub Status Integration +# ============================================================================ +# Configure Argo CD to post deployment status updates back to GitHub. 
+# This enables developers to see sync status directly in pull requests +# and commit statuses without switching to the Argo CD UI. +# +# See docs/github-app-setup.md for GitHub App permissions setup. + +notifications: + # Enable Argo CD Notifications controller + enabled: true + # for status templates + argocdUrl: https://calypr-demo.ddns.net/applications + # turn on debugging + # https://github.com/argoproj/argo-helm/blob/main/charts/argo-cd/templates/argocd-notifications/deployment.yaml#L76-L77 + logLevel: debug + # https://github.com/argoproj/argo-helm/blob/main/charts/argo-cd/templates/argocd-notifications/deployment.yaml#L86-L88 + extraEnv: + - name: ARGOCD_NOTIFICATIONS_HTTP_DEBUG + value: "true" + - name: GODEBUG + value: "http2debug=2" + # GitHub notification service configuration + github: + # Enable GitHub status updates + enabled: true + + # Use GitHub App for notifications (recommended) + # When enabled, uses the same GitHub App as repository authentication + useGithubApp: true + + # GitHub App ID (defaults to githubApp.appId if not set) + # appId: "" + + # GitHub App Installation ID (defaults to githubApp.installationId if not set) + # installationId: "" + + # Private key secret reference (defaults to githubApp.privateKeySecretName if not set) + # privateKeySecretName: "" + + # Default notification triggers + # These define when notifications are sent + triggers: + # Notify on successful sync + onSyncSucceeded: true + # Notify on failed sync + onSyncFailed: true + # Notify when sync is running + onSyncRunning: false + # Notify on health degraded + onHealthDegraded: false + + # Application annotations to subscribe to notifications + # Add these annotations to your Argo CD Applications to enable notifications: + # notifications.argoproj.io/subscribe.on-sync-succeeded.github: "" + # notifications.argoproj.io/subscribe.on-sync-failed.github: "" + defaultSubscriptions: + # Automatically add notification subscriptions to applications + # created from 
repoRegistrations + autoSubscribe: true + diff --git a/kind-config.yaml b/kind-config.yaml new file mode 100644 index 00000000..b064c807 --- /dev/null +++ b/kind-config.yaml @@ -0,0 +1,15 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +networking: + kubeProxyMode: "iptables" # Explicit mode +nodes: +- role: control-plane + extraPortMappings: + - containerPort: 30080 + hostPort: 80 + - containerPort: 30443 + hostPort: 443 + extraMounts: + - hostPath: $PWD/landing-page-content + containerPath: /var/www/docs + diff --git a/landing-page-content/index.md b/landing-page-content/index.md new file mode 100644 index 00000000..1d2d3a53 --- /dev/null +++ b/landing-page-content/index.md @@ -0,0 +1,233 @@ +# 🌟 Using the Calypr GitHub App + +**Use Case — Publishing Research Data with GitOps** + +**As a bioinformatician**, I want to publish my analysis outputs and dataset updates through a Git-based workflow, so that every change to my research data automatically updates downstream portal applications, ensuring that collaborators always see the latest validated results. + +**Story** + +When I push new analysis files or metadata updates to the repository, GitOps detects the commit and applies the corresponding updates to the data services and portal layers. This gives me a clear, auditable history of every modification, eliminates manual synchronization steps, and guarantees that the public or internal portal always reflects the current state of my work. By relying on version-controlled automation, I can focus on scientific analysis rather than operational glue, while maintaining reproducibility, traceability, and consistency across environments. + + +```mermaid +sequenceDiagram + autonumber + + participant B as Bioinformatician + participant G as Git Repository + participant O as Calypr GitOps + participant S as Workflow Services + participant P as Portal Application + + rect rgba(200, 220, 255, 0.35) + Note over O,P: Calypr Platform + end + + B->>G: Commit new data
and metadata updates + G-->>O: Trigger GitOps sync + + rect rgba(200, 220, 255, 0.12) + O->>S: Apply updated configs<br/>and publish new data + S-->>O: Acknowledge deployment<br/>and updated records + O-->>G: Updated Git status + + O->>P: Update portal manifests<br/>and reload content + P-->>B: Portal reflects<br/>latest validated data + end + + + Note over B,P: Git commits become the single source of truth,
and all systems stay in sync automatically. + +``` + +This guide covers: + +* **What the Calypr GitHub App does** +* **Why a user would install it** +* **How to integrate it into their GitHub account** +* **How it affects their project workflows** + +### How to Connect Your GitHub Repository to the Calypr Server + +## Overview + +The **Calypr GitHub App** allows your project repository on GitHub to stay in sync with the Calypr platform. + +By installing this app on your GitHub repository: + +* Your **data**, **metadata**, and **workflow configuration files** will be automatically available to the Calypr system. +* Calypr can **detect updates** in your repository and ensure your project environment reflects your latest work. +* You no longer need to manage tokens, SSH keys, or manual permissions—GitHub handles it securely. + +This makes collaboration easier, keeps your project reproducible, and ensures the Calypr server always has the most up-to-date version of your files. + +--- + +# 🔧 What the Calypr GitHub App Does + +After you install it: + +### ✔ 1. **Securely connects your GitHub repository to Calypr** + +The app grants Calypr read-only access to your repository. Calypr can download the files it needs, but **cannot modify your code or data**. + +### ✔ 2. **Keeps your Calypr environment automatically updated** + +Whenever you push new: + +* data files +* metadata +* workflow definitions (Nextflow, Argo, CWL, etc.) +* configuration settings + +…the Calypr server can see the latest version and use it in your workspace and pipelines. + +### ✔ 3. **Simplifies onboarding** + +No personal GitHub tokens. +No SSH keys. +No security complexity. +Just a simple installation and you’re done. + +### ✔ 4. **Works with private repositories** + +Your private data stays private. Only the Calypr system (and only specific components) can access it. 
+ +--- + +# 🧠 Why You Might Need This + +You should install the Calypr GitHub App if: + +* Your workflow, metadata, or analysis files live in GitHub. +* You want Calypr to run workflows based on the files in your repo. +* You want collaborators or pipelines to always use the current version. +* You want a secure, low-maintenance way to connect GitHub and Calypr. + +If you’re using Calypr for **multi-omics**, **analysis pipelines**, or **any project with multiple collaborators**, the GitHub App is the easiest way to keep everything synchronized. + +--- + +# 🛠 How to Install the Calypr GitHub App + +Installing takes less than one minute. + +### **Step 1 — Open the Installation Page** + +Visit: + +``` +https://github.com/apps/calypr +``` + +(or the URL provided by your administrator) + +### **Step 2 — Select Your Account or Organization** + +You will see options such as: + +* **Install for my personal GitHub account** +* **Install for an organization** + +Choose where your repository lives. + +### **Step 3 — Select Repositories** + +You have two choices: + +### **Option A — Only give access to selected repositories (recommended)** + +Choose specific repositories that contain Calypr projects. + +### **Option B — Give access to all repositories** + +Only use this if: + +* Your GitHub account is dedicated to Calypr work + — or — +* You prefer not to manage per-repo permissions. + +### **Step 4 — Confirm Permissions** + +The Calypr GitHub App typically requests: + +* **Read-only access to code and files** +* **Read-only access to repo metadata (branch names, permissions)** +* (Optional) permission to mark CI/check results, if Calypr is configured to do so + +The app **cannot** modify your repository. + +### **Step 5 — Finish Installation** + +Click **Install**. +That’s it—your repository is now linked. + +--- + +# 🔁 What Happens After Installation? 
+ +Once your repository is connected: + +### ✔ Calypr immediately gains access + +Your project will appear in the Calypr interface (or become available for registration, depending on your setup). + +### ✔ Any updates you push to GitHub are seen by Calypr + +Examples: + +* upload new FASTQ files → they appear in Calypr’s data browser +* update metadata → validators update automatically +* change workflow config → workflow UI shows new settings +* add or edit sample sheets → pipelines re-index as needed + +### ✔ No further authentication is required + +You don’t need to manage passwords, tokens, or machine access. + +--- + +# 🧪 Verifying the Connection + +You can check the connection by: + +* Opening the Calypr UI → Projects → *Your Repository* +* Clicking **Refresh** +* Verifying that your latest branch, files, and metadata are shown + +If you push a change to GitHub and see it reflected in Calypr within a minute or two, everything is working. + +--- + +# ❓ Troubleshooting + +### **I don’t see my repository listed in Calypr.** + +Make sure the GitHub App was installed for that specific repository. + +### **I installed the app, but Calypr says it cannot access the repo.** + +Two common causes: + +1. The app was installed in your personal account, but the repo is in an organization. +2. The repo was not selected during installation. + +### **I want to remove access.** + +You can uninstall the app or change the permitted repositories anytime from: + +``` +https://github.com/settings/installations +``` + +--- + +# 📬 Need Help? 
+ +If you run into issues: + +* Contact your Calypr platform administrator +* Or open a support ticket through the Calypr help portal + +--- + diff --git a/landing-page/Dockerfile b/landing-page/Dockerfile new file mode 100644 index 00000000..e11a8e90 --- /dev/null +++ b/landing-page/Dockerfile @@ -0,0 +1,4 @@ +FROM nginx:alpine +COPY index.html /usr/share/nginx/html/index.html +COPY nginx.conf /etc/nginx/conf.d/default.conf +EXPOSE 80 diff --git a/landing-page/README.md b/landing-page/README.md new file mode 100644 index 00000000..b7a45e84 --- /dev/null +++ b/landing-page/README.md @@ -0,0 +1,99 @@ +# Landing Page + +A simple static landing page that renders markdown files. + +## Usage + +Build the Docker image: + +```bash +docker build -t landing-page . +``` + +Run with a mounted docs directory: + +```bash +docker run -p 8080:80 -v /path/to/your/docs:/docs landing-page +``` + +## Helm Configuration + +Configure via `values.yaml`: + +```yaml +landingPage: + enabled: true + image: + repository: landing-page + tag: latest + title: "Welcome" + docsPath: "/var/www/docs" # See "Content Directory" below +``` + +### Content Directory (`docsPath`) + +The `docsPath` is a Kubernetes **hostPath volume** that mounts a directory from the node's filesystem. + +**For kind clusters:** + +Kind clusters must be configured with `extraMounts` at creation time (mounts cannot be added to running clusters). + +1. **Add extraMounts to kind-config.yaml:** + ```yaml + kind: Cluster + apiVersion: kind.x-k8s.io/v1alpha4 + nodes: + - role: control-plane + extraMounts: + - hostPath: /path/on/your/host + containerPath: /var/www/docs + extraPortMappings: + - containerPort: 30080 + hostPort: 80 + - containerPort: 30443 + hostPort: 443 + ``` + +2. 
**Recreate the kind cluster with the new config:** + ```bash + # Delete existing cluster + kind delete cluster --name kind + + # Create new cluster with mount config + kind create cluster --config kind-config.yaml + + # Re-run setup (load images, install charts) + make docker-install + make install + ``` + +3. **Set docsPath in values.yaml:** + ```yaml + landingPage: + docsPath: "/var/www/docs" + ``` + +4. **Add your markdown content:** + ```bash + # On your host machine + echo "# Welcome" > /path/on/your/host/index.md + ``` + +**Content updates:** +- Changes to markdown files are picked up on browser refresh +- No pod restart needed—nginx serves files directly from the mount +- The page fetches and renders markdown via JavaScript on each load + +## Features + +- Renders `index.md` or `README.md` from the mounted `/docs` directory +- Client-side markdown rendering using marked.js with DOMPurify sanitization +- Minimal, lightweight nginx-based container +- Health check endpoint at `/healthz` +- Security headers (X-Content-Type-Options, X-Frame-Options) + +## Security Notes + +- JavaScript libraries (marked.js, DOMPurify) are loaded from jsdelivr CDN +- For production deployments, consider adding Subresource Integrity (SRI) hashes +- Only expose dedicated docs directories via the hostPath mount diff --git a/landing-page/index.html b/landing-page/index.html new file mode 100644 index 00000000..98de26e4 --- /dev/null +++ b/landing-page/index.html @@ -0,0 +1,195 @@ + + + + + + Welcome + + + + + + + +
Loading...
+ + + + + + + diff --git a/landing-page/nginx.conf b/landing-page/nginx.conf new file mode 100644 index 00000000..9766abe5 --- /dev/null +++ b/landing-page/nginx.conf @@ -0,0 +1,28 @@ +server { + listen 80; + server_name localhost; + root /usr/share/nginx/html; + + # Security headers + add_header X-Content-Type-Options nosniff; + add_header X-Frame-Options DENY; + + location / { + try_files $uri $uri/ /index.html; + } + + location /docs/ { + alias /docs/; + autoindex off; + # Only serve markdown and common image files + location ~ \.(md|png|jpg|jpeg|gif|svg)$ { + add_header X-Content-Type-Options nosniff; + } + } + + location /healthz { + access_log off; + return 200 'ok'; + add_header Content-Type text/plain; + } +} diff --git a/my-values.yaml b/my-values.yaml index 1ea43b2f..b17f38fb 100644 --- a/my-values.yaml +++ b/my-values.yaml @@ -1,6 +1,6 @@ repoRegistrations: # Example 1: Basic Nextflow pipeline - - name: nextflow-hello-project + - name: bwalsh-nextflow-hello-project repoUrl: https://github.com/bwalsh/nextflow-hello-project.git defaultBranch: main tenant: research-team-1 @@ -13,14 +13,14 @@ repoRegistrations: region: $S3_REGION insecure: true pathStyle: true - keyPrefix: nextflow-hello-project-workflows/ + keyPrefix: bwalsh-nextflow-hello-project-workflows/ # Vault path for S3 credentials (relative to kv/) - externalSecretPath: argo/apps/nextflow-hello-project/s3/artifacts + externalSecretPath: argo/apps/bwalsh/nextflow-hello-project/s3/artifacts # GitHub credentials - githubSecretName: github-secret-nextflow-hello + githubSecretName: github-secret-bwalsh-nextflow-hello-project # Vault path for GitHub token - githubSecretPath: argo/apps/nextflow-hello-project/github + githubSecretPath: argo/apps/bwalsh/nextflow-hello-project/github # Access control (Fence-authenticated emails) adminUsers: diff --git a/scripts/check_tools.sh b/scripts/check_tools.sh new file mode 100755 index 00000000..21a7d8b8 --- /dev/null +++ b/scripts/check_tools.sh @@ -0,0 +1,376 @@ 
+#!/bin/bash + +# Script to check installed tools and APT installation history +# Usage: ./check_tools_with_apt_history.sh + +echo "=== Tool Installation Check with APT History ===" +echo "Date: $(date)" +echo "Hostname: $(hostname)" +echo "========================================" + +# Define colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +MAGENTA='\033[0;35m' +NC='\033[0m' # No Color + +# Function to check if a command exists and get version +check_tool() { + local tool=$1 + + if command -v "$tool" &> /dev/null; then + echo -e "${GREEN}✓${NC} $tool is installed at: $(which $tool)" + + # Try to get version information + case $tool in + "docker") + echo " Version: $(docker --version 2>/dev/null)" + ;; + "kubectl") + echo " Version: $(kubectl version --client --short 2>/dev/null || kubectl version --client 2>/dev/null | head -1)" + ;; + "helm") + echo " Version: $(helm version --short 2>/dev/null || helm version 2>/dev/null | head -1)" + ;; + "kind") + echo " Version: $(kind version 2>/dev/null)" + ;; + "k9s") + echo " Version: $(k9s version --short 2>/dev/null || k9s version 2>/dev/null | head -1)" + ;; + "stern") + echo " Version: $(stern --version 2>/dev/null)" + ;; + "jq") + echo " Version: $(jq --version 2>/dev/null)" + ;; + "git") + echo " Version: $(git --version 2>/dev/null)" + ;; + "python3") + echo " Version: $(python3 --version 2>/dev/null)" + ;; + "pytest") + echo " Version: $(pytest --version 2>/dev/null | head -1)" + ;; + "envsubst") + echo " Version: $(envsubst --version 2>/dev/null | head -1)" + ;; + "certbot") + echo " Version: $(certbot --version 2>/dev/null)" + ;; + "go") + echo " Version: $(go version 2>/dev/null)" + ;; + "gcc") + echo " Version: $(gcc --version 2>/dev/null | head -1)" + ;; + "curl") + echo " Version: $(curl --version 2>/dev/null | head -1)" + ;; + "openssl") + echo " Version: $(openssl version 2>/dev/null)" + ;; + *) + version_output=$($tool --version 
2>/dev/null | head -1) + if [ -n "$version_output" ]; then + echo " Version: $version_output" + else + echo " Version: Not available" + fi + ;; + esac + echo "" + else + echo -e "${RED}✗${NC} $tool is NOT installed" + echo "" + fi +} + +# Function to check APT history +check_apt_history() { + echo -e "${BLUE}=== APT Installation History ===${NC}" + + if ! command -v apt &> /dev/null; then + echo -e "${YELLOW}APT is not available on this system${NC}" + + # Check for other package managers + if command -v yum &> /dev/null; then + echo -e "${CYAN}YUM system detected. Checking yum history...${NC}" + check_yum_history + return + elif command -v dnf &> /dev/null; then + echo -e "${CYAN}DNF system detected. Checking dnf history...${NC}" + check_dnf_history + return + else + echo "No supported package manager history available" + return + fi + fi + + # Show recent APT operations + echo -e "${CYAN}Recent APT operations:${NC}" + if [ -f /var/log/apt/history.log ]; then + echo "From /var/log/apt/history.log (most recent 15 operations):" + grep -E "^Start-Date|^Commandline|^Install:|^Upgrade:" /var/log/apt/history.log | tail -30 | sed 's/^/ /' + echo "" + fi + + # Define tools and their common package names in APT + declare -A tool_packages=( + ["docker"]="docker.io docker-ce docker-engine containerd.io" + ["git"]="git git-core" + ["python3"]="python3 python3-dev python3-pip" + ["jq"]="jq" + ["kubectl"]="kubectl" + ["helm"]="helm" + ["kind"]="kind" + ["k9s"]="k9s" + ["stern"]="stern" + ["pytest"]="python3-pytest pytest" + ["envsubst"]="gettext-base gettext" + ["certbot"]="certbot python3-certbot-nginx python3-certbot-apache" + ["go"]="golang-go golang" + ["gcc"]="gcc build-essential" + ["curl"]="curl" + ["openssl"]="openssl libssl-dev" + ) + + echo -e "${CYAN}Searching APT history for tool installations:${NC}" + + for tool in "${!tool_packages[@]}"; do + echo -e "\n${YELLOW}Checking for $tool packages:${NC}" + packages=${tool_packages[$tool]} + + found_any=false + for package 
in $packages; do + # Search in APT history log + if [ -f /var/log/apt/history.log ]; then + history_result=$(grep -A5 -B1 "$package" /var/log/apt/history.log 2>/dev/null | grep -E "Start-Date|Install:|Upgrade:" | tail -5) + if [ -n "$history_result" ]; then + echo -e " ${GREEN}Found $package in APT history:${NC}" + echo "$history_result" | sed 's/^/ /' + found_any=true + fi + fi + + # Search in dpkg log + if [ -f /var/log/dpkg.log ]; then + dpkg_result=$(grep "install.*$package" /var/log/dpkg.log 2>/dev/null | tail -3) + if [ -n "$dpkg_result" ]; then + echo -e " ${GREEN}Found $package in dpkg log:${NC}" + echo "$dpkg_result" | sed 's/^/ /' + found_any=true + fi + fi + done + + if [ "$found_any" = false ]; then + echo -e " ${RED}No APT installation found for $tool${NC}" + fi + done +} + +# Function to check YUM history (fallback) +check_yum_history() { + echo "Recent YUM transactions:" + yum history list 2>/dev/null | head -15 + echo "" + + echo -e "${CYAN}Searching YUM history for development tools:${NC}" + yum history list | grep -E "(docker|git|python|jq|kubectl|helm|kind|gcc|curl|openssl)" 2>/dev/null || echo "No matching packages found in YUM history" +} + +# Function to check DNF history (fallback) +check_dnf_history() { + echo "Recent DNF transactions:" + dnf history list 2>/dev/null | head -15 + echo "" + + echo -e "${CYAN}Searching DNF history for development tools:${NC}" + dnf history list | grep -E "(docker|git|python|jq|kubectl|helm|kind|gcc|curl|openssl)" 2>/dev/null || echo "No matching packages found in DNF history" +} + +# Function to check installed packages via dpkg +check_installed_packages() { + echo -e "${BLUE}=== Currently Installed Packages (via dpkg) ===${NC}" + + if command -v dpkg &> /dev/null; then + echo -e "${CYAN}Checking for tool-related packages currently installed:${NC}" + + # List of package patterns to search for + patterns=("docker" "git" "python3" "jq" "kubectl" "helm" "kind" "k9s" "stern" "pytest" "gettext" "certbot" "golang" 
"gcc" "curl" "openssl") + + for pattern in "${patterns[@]}"; do + installed_packages=$(dpkg -l | grep -i "$pattern" | awk '{print $2, $3}' 2>/dev/null) + if [ -n "$installed_packages" ]; then + echo -e "\n${YELLOW}Packages matching '$pattern':${NC}" + echo "$installed_packages" | sed 's/^/ /' + fi + done + else + echo "dpkg not available" + fi + echo "" +} + +# Function to find additional development tools +find_additional_tools() { + echo -e "${BLUE}=== Additional Development Tools Found ===${NC}" + + additional_tools=( + "aws" "terraform" "ansible" "vagrant" "node" "npm" "yarn" + "java" "javac" "mvn" "gradle" "make" "cmake" "g++" + "wget" "vim" "nano" "emacs" "tmux" "screen" "htop" "tree" + "zip" "unzip" "tar" "rsync" "ssh" "scp" "netstat" "ss" + "systemctl" "journalctl" "crontab" "at" "nohup" "sudo" + "awk" "sed" "grep" "find" "xargs" "sort" "uniq" "head" "tail" + "nc" "telnet" "ping" "traceroute" "dig" "nslookup" + "lsof" "ps" "top" "df" "du" "free" "uptime" "whoami" + ) + + echo "Scanning for additional development and system tools:" + found_additional=() + + for tool in "${additional_tools[@]}"; do + if command -v "$tool" &> /dev/null; then + found_additional+=("$tool") + fi + done + + if [ ${#found_additional[@]} -gt 0 ]; then + echo -e "${GREEN}Additional tools found (${#found_additional[@]} total):${NC}" + printf '%s\n' "${found_additional[@]}" | column -c 100 | sed 's/^/ /' + else + echo "No additional common development tools found" + fi + echo "" +} + +# Function to check snap packages +check_snap_packages() { + if command -v snap &> /dev/null; then + echo -e "${BLUE}=== Snap Packages ===${NC}" + echo -e "${CYAN}Checking for tools installed via Snap:${NC}" + + snap_tools=$(snap list 2>/dev/null | grep -E "(docker|git|kubectl|helm|kind|k9s|stern|go|code)" | awk '{print $1, $2}') + if [ -n "$snap_tools" ]; then + echo "$snap_tools" | sed 's/^/ /' + else + echo " No relevant snap packages found" + fi + echo "" + fi +} + +# Main execution +echo "Checking 
for requested tools..." +echo "" + +# List of primary tools to check +tools=( + "kind" + "jq" + "k9s" + "stern" + "helm" + "kubectl" + "docker" + "git" + "pytest" + "envsubst" + "python3" + "certbot" + "go" + "gcc" + "curl" + "openssl" +) + +# Check each tool +for tool in "${tools[@]}"; do + check_tool "$tool" +done + +# Check package manager history +echo "" +check_apt_history + +# Check currently installed packages +echo "" +check_installed_packages + +# Check snap packages +echo "" +check_snap_packages + +# Find additional tools +echo "" +find_additional_tools + +# System information +echo -e "${BLUE}=== System Information ===${NC}" +echo "Operating System:" +if [ -f /etc/os-release ]; then + . /etc/os-release + echo " $PRETTY_NAME" +elif [ -f /etc/lsb-release ]; then + . /etc/lsb-release + echo " $DISTRIB_DESCRIPTION" +elif [ -f /etc/debian_version ]; then + echo " Debian $(cat /etc/debian_version)" +else + uname -s +fi + +echo "Architecture: $(uname -m)" +echo "Kernel: $(uname -r)" +echo "Uptime: $(uptime -p 2>/dev/null || uptime)" + +# Check environment +if [ -f /.dockerenv ]; then + echo "Environment: Running in Docker container" +elif [ -n "${KUBERNETES_SERVICE_HOST}" ]; then + echo "Environment: Running in Kubernetes pod" +else + echo "Environment: Standard Linux instance" +fi + +echo "" +echo -e "${BLUE}=== Package Managers Available ===${NC}" +command -v apt &> /dev/null && echo -e "${GREEN}✓${NC} apt $(apt --version 2>/dev/null | head -1)" +command -v dpkg &> /dev/null && echo -e "${GREEN}✓${NC} dpkg $(dpkg --version 2>/dev/null | head -1)" +command -v snap &> /dev/null && echo -e "${GREEN}✓${NC} snap $(snap version 2>/dev/null | head -1)" +command -v pip3 &> /dev/null && echo -e "${GREEN}✓${NC} pip3 $(pip3 --version 2>/dev/null)" +command -v npm &> /dev/null && echo -e "${GREEN}✓${NC} npm $(npm --version 2>/dev/null)" + +# Summary +echo "" +echo -e "${BLUE}=== Summary ===${NC}" +installed_count=0 +total_count=${#tools[@]} + +for tool in "${tools[@]}"; 
do + if command -v "$tool" &> /dev/null; then + ((installed_count++)) + fi +done + +echo "Primary tools installed: $installed_count/$total_count" + +if [ $installed_count -eq $total_count ]; then + echo -e "${GREEN}All requested tools are installed!${NC}" +elif [ $installed_count -gt $((total_count / 2)) ]; then + echo -e "${YELLOW}Most tools are installed.${NC}" +else + echo -e "${RED}Many tools are missing.${NC}" +fi + +echo "" +echo "Script completed at $(date)" + + diff --git a/test-artifact-repository-ref.sh b/scripts/test-artifact-repository-ref.sh similarity index 100% rename from test-artifact-repository-ref.sh rename to scripts/test-artifact-repository-ref.sh diff --git a/scripts/test-cert-2.sh b/scripts/test-cert-2.sh new file mode 100755 index 00000000..19ac6740 --- /dev/null +++ b/scripts/test-cert-2.sh @@ -0,0 +1,11 @@ +set -x + +# Check if ingress-nginx sees the TLS secret +kubectl get secret calypr-demo-tls -n argo-stack -o yaml | grep -A1 "tls.crt\|tls.key" + +# Check ingress-nginx logs for certificate loading +kubectl logs -n ingress-nginx -l app.kubernetes.io/component=controller --tail=50 | grep -i "tls\|certificate\|calypr-demo" + +# Access via the actual NodePort (bypasses any port forwarding issues) +curl -vI --resolve calypr-demo.ddns.net:30443:100.22.124.96 https://calypr-demo.ddns.net:30443/workflows + diff --git a/scripts/test-cert.sh b/scripts/test-cert.sh new file mode 100755 index 00000000..c1b1de4f --- /dev/null +++ b/scripts/test-cert.sh @@ -0,0 +1,16 @@ +set -x + +# Check what IP the domain resolves to +nslookup calypr-demo.ddns.net +dig calypr-demo.ddns.net +short + +# Get your ingress controller's external IP +kubectl get svc -n ingress-nginx ingress-nginx-controller + +# should not be in hosts file +grep calypr-demo /etc/hosts + +# check cert +curl https://calypr-demo.ddns.net/tenants/login || true + + diff --git a/scripts/test-curl b/scripts/test-curl new file mode 100755 index 00000000..4240366e --- /dev/null +++ 
b/scripts/test-curl @@ -0,0 +1,16 @@ +curl https://calypr-demo.ddns.net/workflows +curl https://calypr-demo.ddns.net/applications +curl https://calypr-demo.ddns.net/registrations +curl https://calypr-demo.ddns.net/api +curl https://calypr-demo.ddns.net/tenants + + +# 1. Verify direct service access works +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://argo-stack-argo-workflows-server.argo-workflows:2746/ +# 2. Verify ExternalName proxy service resolves correctly +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl -v http://argo-stack-argo-workflows-server-proxy.argo-stack:2746/ +# 3. Check ingress configuration +kubectl describe ingress ingress-authz-workflows -n argo-stack | grep -A5 "backend" + diff --git a/test-eso-templates.py b/scripts/test-eso-templates.py similarity index 100% rename from test-eso-templates.py rename to scripts/test-eso-templates.py diff --git a/test-per-app-artifacts.sh b/scripts/test-per-app-artifacts.sh similarity index 100% rename from test-per-app-artifacts.sh rename to scripts/test-per-app-artifacts.sh diff --git a/scripts/test-workflow-ingress b/scripts/test-workflow-ingress new file mode 100755 index 00000000..a80114e2 --- /dev/null +++ b/scripts/test-workflow-ingress @@ -0,0 +1,7 @@ +set -x #echo on + +kubectl get svc argo-stack-argo-workflows-server-proxy -n argo-stack -o yaml | grep externalName +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl -v http://argo-stack-argo-workflows-server.argo-workflows:2746/ +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl -v http://argo-stack-argo-workflows-server-proxy.argo-stack:2746/ +curl https://calypr-demo.ddns.net/workflows + diff --git a/scripts/yum_history.sh b/scripts/yum_history.sh new file mode 100755 index 00000000..c2769d8c --- /dev/null +++ b/scripts/yum_history.sh @@ -0,0 +1,12 @@ +echo "=== Quick Tool & YUM History Check ===" && \ +for tool in kind jq k9s 
stern helm kubectl docker git pytest envsubst python3 certbot go gcc curl openssl; do + if command -v "$tool" &> /dev/null; then + echo "✓ $tool: $(command -v $tool)"; + else + echo "✗ $tool: NOT FOUND"; + fi; +done && \ +echo -e "\n=== Recent YUM History ===" && \ +yum history list 2>/dev/null | head -10 && \ +echo -e "\n=== YUM Packages for Development Tools ===" && \ +yum history packages-list docker git python3 jq 2>/dev/null | grep -E "Install|Update" | head -10 diff --git a/test-workflows/README.md b/test-workflows/README.md deleted file mode 100644 index 5e8c0186..00000000 --- a/test-workflows/README.md +++ /dev/null @@ -1,164 +0,0 @@ -# Nextflow Workflow Testing - -This directory contains tests and workflows for validating Nextflow execution through Argo Workflows with authorization. - -## Files - -### Workflows -- `nextflow-hello-world.yaml` - A simple Nextflow "Hello World" workflow that can be submitted to Argo Workflows - -### Test Scripts -- `test_nextflow_execution.py` - Tests actual Nextflow workflow execution in Argo Workflows -- `test_authorization.py` - Tests authorization through the authz-adapter for workflow operations - -## Usage - -### Local Testing - -1. **Set up a local environment:** - ```bash - # Start a kind cluster with Argo Workflows - kind create cluster --name nextflow-test - - # Install Argo Workflows - kubectl create namespace argo-workflows - helm repo add argo https://argoproj.github.io/argo-helm - helm install argo-workflows argo/argo-workflows -n argo-workflows - - # Port forward to access the API - kubectl port-forward -n argo-workflows svc/argo-workflows-server 2746:2746 & - ``` - -2. **Run the Nextflow execution test:** - ```bash - python test_nextflow_execution.py - ``` - -3. **Run the authorization test:** - ```bash - python test_authorization.py - ``` - -### CI Testing - -The tests are automatically run in the GitHub Actions workflow `ci-nextflow.yaml` which: - -1. Sets up a kind cluster -2. 
Installs the complete Argo stack with authorization -3. Runs the Nextflow workflow execution test -4. Validates authorization is working correctly - -## Test Workflow Details - -### nextflow-hello-world.yaml - -This workflow: -- Uses the official Nextflow Docker image -- Creates a simple Nextflow script that processes multiple inputs -- Generates execution reports and traces -- Demonstrates container-based workflow execution -- Includes resource limits appropriate for CI environments - -The workflow includes: -- **Input processing**: Processes a channel of test strings -- **Report generation**: Creates a summary report of execution -- **Artifact collection**: Saves execution traces and reports -- **Resource management**: Appropriate CPU and memory limits - -### test_nextflow_execution.py - -This script: -- Submits the workflow to Argo Workflows via REST API -- Monitors execution progress with timeout handling -- Retrieves and displays workflow logs -- Validates successful completion -- Provides detailed error reporting on failures - -Features: -- Configurable Argo Workflows URL -- Configurable workflow file path -- Comprehensive logging and progress reporting -- Graceful error handling and cleanup - -### test_authorization.py - -This script validates: -- AuthZ adapter health and connectivity -- Workflow submission authorization for different user types -- Resource-specific authorization (Argo workflows vs other resources) -- Header-based authorization context (X-Resource-* headers) - -Test scenarios: -- **Authorized users**: Users with workflow creation permissions -- **Read-only users**: Users with only workflow viewing permissions -- **Unauthorized users**: Users without any workflow permissions -- **Resource context**: Authorization with Kubernetes resource context - -## Expected Outcomes - -### Successful Execution -When everything works correctly, you should see: - -1. **Workflow submission**: HTTP 201 response from Argo Workflows API -2. 
**Execution progress**: Status updates showing workflow phases (Running → Succeeded) -3. **Nextflow output**: Log messages showing Nextflow process execution -4. **Test reports**: Generated files showing successful data processing -5. **Authorization**: Proper group assignments (argo-runner, argo-viewer) - -### Common Issues - -#### Network Connectivity -- Ensure port forwarding is active: `kubectl port-forward -n argo-workflows svc/argo-workflows-server 2746:2746` -- Check firewall settings and cluster networking - -#### Authorization Failures -- Verify authz-adapter is deployed and healthy -- Check environment variables for Fence configuration -- Validate user tokens and permissions - -#### Resource Constraints -- Monitor cluster resources: `kubectl top nodes` -- Check pod resource requests vs available capacity -- Adjust workflow resource limits if needed - -#### Nextflow Issues -- Verify Nextflow image is accessible -- Check for script syntax errors in workflow definition -- Review container logs for Nextflow-specific errors - -## Customization - -### Modifying the Workflow -To customize the Nextflow workflow: - -1. Edit `nextflow-hello-world.yaml` -2. Modify the Nextflow script embedded in the container args -3. Adjust resource requests and limits as needed -4. Add additional output artifacts if required - -### Adding Test Cases -To add new test scenarios: - -1. Create new workflow YAML files following the same pattern -2. Extend `test_nextflow_execution.py` with additional test methods -3. Add authorization test cases to `test_authorization.py` -4. Update the CI workflow to run new tests - -### Environment Configuration -The tests support configuration via: - -- Command line arguments (--argo-url, --authz-url, etc.) 
-- Environment variables -- Configuration files (for complex scenarios) - -## Integration with CI/CD - -The `ci-nextflow.yaml` GitHub Actions workflow provides: - -- **Reproducible testing**: Same environment every time -- **Resource isolation**: Clean kind cluster for each run -- **Comprehensive validation**: Both functional and authorization testing -- **Artifact collection**: Logs and reports for debugging failures -- **Cleanup**: Automatic cleanup of resources after testing - -This ensures that Nextflow workflows will work correctly in production Kubernetes environments with proper authorization controls. \ No newline at end of file diff --git a/test-workflows/nextflow-hello-world.yaml b/test-workflows/nextflow-hello-world.yaml deleted file mode 100644 index 4f4a0d5e..00000000 --- a/test-workflows/nextflow-hello-world.yaml +++ /dev/null @@ -1,128 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Workflow -metadata: - generateName: nextflow-hello-world- - namespace: wf-poc - labels: - workflows.argoproj.io/test: nextflow-hello - test-type: integration -spec: - serviceAccountName: nextflow-workflow-sa - entrypoint: nextflow-hello - ttlStrategy: - secondsAfterCompletion: 300 # Clean up after 5 minutes - - templates: - - name: nextflow-hello - container: - image: nextflow/nextflow:23.10.0 - command: ["/bin/bash"] - args: - - -c - - | - echo "=== Nextflow Hello World Workflow Test ===" - echo "Nextflow version:" - nextflow -version - - echo "Creating simple hello world script..." - cat > hello.nf << 'NFEOF' - #!/usr/bin/env nextflow - - process sayHello { - input: - val x - - output: - stdout - - script: - """ - echo "Hello World from Nextflow! 
Input: $x" - echo "Current date: \$(date)" - echo "Hostname: \$(hostname)" - echo "Running in container: \$(cat /proc/1/cgroup | head -1)" - """ - } - - process createReport { - input: - val messages - - output: - path "report.txt" - - script: - """ - echo "Nextflow Hello World Test Report" > report.txt - echo "=================================" >> report.txt - echo "" >> report.txt - echo "Test executed at: \$(date)" >> report.txt - echo "Number of messages processed: \$(echo '$messages' | wc -w)" >> report.txt - echo "" >> report.txt - echo "Messages:" >> report.txt - echo '$messages' | tr ' ' '\n' >> report.txt - echo "" >> report.txt - echo "Test completed successfully!" >> report.txt - """ - } - - workflow { - // Create input channel with test data - input_ch = Channel.of('CI-Test', 'Argo-Workflows', 'Kind-Cluster', 'Nextflow-Integration') - - // Process each input and collect results - results = input_ch | sayHello | collect - - // Create a summary report - createReport(results.join(' ')) - - // Display results - results | view - } - NFEOF - - echo "Running Nextflow hello world workflow..." - nextflow run hello.nf -with-trace trace.txt -with-report report.html - - echo "=== Workflow execution completed ===" - echo "Generated files:" - ls -la - - if [ -f "report.txt" ]; then - echo "=== Test Report ===" - cat report.txt - fi - - if [ -f "trace.txt" ]; then - echo "=== Execution Trace ===" - cat trace.txt - fi - - echo "=== Nextflow Hello World test successful! 
===" - - resources: - requests: - memory: "512Mi" - cpu: "200m" - limits: - memory: "1Gi" - cpu: "500m" - - env: - - name: NXF_HOME - value: "/tmp/.nextflow" - - name: NXF_WORK - value: "/tmp/work" - - # Define outputs for the workflow - outputs: - artifacts: - - name: workflow-report - path: /tmp/report.txt - optional: true - - name: execution-trace - path: /tmp/trace.txt - optional: true - - name: execution-report - path: /tmp/report.html - optional: true \ No newline at end of file diff --git a/test-workflows/rbac/workflow-rbac.yaml b/test-workflows/rbac/workflow-rbac.yaml deleted file mode 100644 index ebfec60b..00000000 --- a/test-workflows/rbac/workflow-rbac.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: nextflow-workflow-sa - namespace: wf-poc ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: wf-poc - name: nextflow-workflow-role -rules: -- apiGroups: [""] - resources: ["pods", "pods/log", "pods/status"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] -- apiGroups: [""] - resources: ["configmaps", "secrets"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] -- apiGroups: [""] - resources: ["persistentvolumeclaims"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] -- apiGroups: ["apps"] - resources: ["deployments", "replicasets"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] -- apiGroups: ["batch"] - resources: ["jobs"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: nextflow-workflow-binding - namespace: wf-poc -subjects: -- kind: ServiceAccount - name: nextflow-workflow-sa - namespace: wf-poc -roleRef: - kind: Role - name: nextflow-workflow-role - apiGroup: rbac.authorization.k8s.io diff --git a/test-workflows/setup-rbac.sh b/test-workflows/setup-rbac.sh deleted file mode 
100755 index 5e27c153..00000000 --- a/test-workflows/setup-rbac.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -echo "🔒 Setting up RBAC for Nextflow workflows..." - -# Create namespace if it doesn't exist -echo "Creating namespace 'wf-poc' if it doesn't exist..." -kubectl create namespace wf-poc --dry-run=client -o yaml | kubectl apply -f - - -# Apply RBAC resources -echo "Applying RBAC resources..." -kubectl apply -f rbac/workflow-rbac.yaml - -# Verify setup -echo "Verifying RBAC setup..." -kubectl get serviceaccount nextflow-workflow-sa -n wf-poc -kubectl get role nextflow-workflow-role -n wf-poc -kubectl get rolebinding nextflow-workflow-binding -n wf-poc - -echo "✅ RBAC setup complete!" -echo "" -echo "You can now run workflows with the nextflow-workflow-sa service account." -echo "Test with: python test-workflows/test_nextflow_execution.py" diff --git a/test-workflows/test_authorization.py b/test-workflows/test_authorization.py deleted file mode 100755 index b4573b37..00000000 --- a/test-workflows/test_authorization.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python3 -"""Test authorization for Nextflow workflow submission through the authz adapter.""" - -import requests -import json -import sys -import argparse -import time -from typing import Dict, Any, Optional - -class AuthzValidationTester: - """Test harness for authorization validation in the CI environment.""" - - def __init__(self, authz_url: str = "http://localhost:5000", argo_url: str = "http://localhost:2746"): - self.authz_url = authz_url - self.argo_url = argo_url - - def test_authz_health(self) -> bool: - """Test that the authz adapter is healthy.""" - try: - response = requests.get(f"{self.authz_url}/healthz", timeout=5) - if response.status_code == 200 and response.text == 'ok': - print(f"✓ AuthZ adapter is healthy at {self.authz_url}") - return True - else: - print(f"✗ AuthZ adapter health check failed: {response.status_code}") - return False - except Exception as e: - print(f"✗ Cannot 
reach AuthZ adapter at {self.authz_url}: {e}") - return False - - def test_argo_accessibility(self) -> bool: - """Test that Argo Workflows is accessible.""" - try: - response = requests.get(f"{self.argo_url}/", timeout=5) - print(f"✓ Argo Workflows is accessible at {self.argo_url} (status: {response.status_code})") - return True - except Exception as e: - print(f"✗ Cannot reach Argo Workflows at {self.argo_url}: {e}") - return False - - def test_workflow_submission_authorization(self) -> bool: - """Test authorization for workflow submission.""" - print("🔐 Testing workflow submission authorization...") - - # Test cases for different authorization scenarios - test_cases = [ - { - "name": "Authorized user with workflow permissions", - "headers": { - "Authorization": "Bearer valid-workflow-token", - "X-Original-URI": "/api/v1/workflows/argo-workflows", - "X-Original-Method": "POST" - }, - "expected_status": 200, - "expected_groups": ["argo-runner", "argo-viewer"] - }, - { - "name": "User with only read permissions", - "headers": { - "Authorization": "Bearer readonly-token", - "X-Original-URI": "/api/v1/workflows/argo-workflows/some-workflow", - "X-Original-Method": "GET" - }, - "expected_status": 200, - "expected_groups": ["argo-viewer"] - }, - { - "name": "Unauthorized user", - "headers": { - "Authorization": "Bearer invalid-token", - "X-Original-URI": "/api/v1/workflows/argo-workflows", - "X-Original-Method": "POST" - }, - "expected_status": 401, - "expected_groups": [] - } - ] - - all_passed = True - - for test_case in test_cases: - print(f"\n Testing: {test_case['name']}") - - try: - response = requests.get( - f"{self.authz_url}/check", - headers=test_case["headers"], - timeout=10 - ) - - if response.status_code == test_case["expected_status"]: - print(f" ✓ Status code: {response.status_code}") - else: - print(f" ✗ Expected status {test_case['expected_status']}, got {response.status_code}") - all_passed = False - continue - - # Check groups if auth was successful - 
if response.status_code == 200: - groups_header = response.headers.get('X-Auth-Request-Groups', '') - - for expected_group in test_case["expected_groups"]: - if expected_group in groups_header: - print(f" ✓ Group '{expected_group}' present") - else: - print(f" ✗ Group '{expected_group}' missing from: {groups_header}") - all_passed = False - - except Exception as e: - print(f" ✗ Request failed: {e}") - all_passed = False - - return all_passed - - def test_argo_resource_context_authorization(self) -> bool: - """Test authorization with Argo resource context headers.""" - print("🎯 Testing Argo resource context authorization...") - - headers = { - "Authorization": "Bearer argo-context-token", - "X-Original-URI": "/api/v1/workflows/argo-workflows", - "X-Original-Method": "POST", - "X-Resource-Group": "argoproj.io", - "X-Resource-Version": "v1alpha1", - "X-Resource-Kind": "workflows" - } - - try: - response = requests.get( - f"{self.authz_url}/check", - headers=headers, - timeout=10 - ) - - if response.status_code == 200: - groups = response.headers.get('X-Auth-Request-Groups', '') - print(f" ✓ Authorization successful with groups: {groups}") - - # Should have runner permissions for Argo resources - if 'argo-runner' in groups: - print(" ✓ Argo runner permissions granted") - return True - else: - print(" ✗ Argo runner permissions not granted") - return False - else: - print(f" ✗ Authorization failed: {response.status_code}") - return False - - except Exception as e: - print(f" ✗ Request failed: {e}") - return False - - def run_all_tests(self) -> bool: - """Run all authorization validation tests.""" - print("🧪 Authorization Validation Test Suite") - print("=====================================") - - tests = [ - ("AuthZ Health Check", self.test_authz_health), - ("Argo Accessibility", self.test_argo_accessibility), - ("Workflow Submission Authorization", self.test_workflow_submission_authorization), - ("Argo Resource Context Authorization", 
self.test_argo_resource_context_authorization) - ] - - results = [] - - for test_name, test_func in tests: - print(f"\n🔍 {test_name}") - print("-" * (len(test_name) + 4)) - - try: - result = test_func() - results.append(result) - - if result: - print(f"✅ {test_name}: PASSED") - else: - print(f"❌ {test_name}: FAILED") - - except Exception as e: - print(f"💥 {test_name}: ERROR - {e}") - results.append(False) - - # Summary - passed = sum(results) - total = len(results) - - print(f"\n📊 Test Summary") - print("===============") - print(f"Passed: {passed}/{total}") - print(f"Failed: {total - passed}/{total}") - - if all(results): - print("🎉 All authorization tests passed!") - return True - else: - print("💥 Some authorization tests failed!") - return False - -def main(): - parser = argparse.ArgumentParser(description='Test authorization for Nextflow workflow submission') - parser.add_argument('--authz-url', default='http://localhost:5000', - help='AuthZ adapter URL (default: http://localhost:5000)') - parser.add_argument('--argo-url', default='http://localhost:2746', - help='Argo Workflows URL (default: http://localhost:2746)') - - args = parser.parse_args() - - tester = AuthzValidationTester(args.authz_url, args.argo_url) - success = tester.run_all_tests() - - sys.exit(0 if success else 1) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/test-workflows/test_nextflow_execution.py b/test-workflows/test_nextflow_execution.py deleted file mode 100755 index 78dd00d6..00000000 --- a/test-workflows/test_nextflow_execution.py +++ /dev/null @@ -1,278 +0,0 @@ -#!/usr/bin/env python3 -"""Test script to submit and monitor Nextflow workflow execution in Argo Workflows.""" - -import requests -import yaml -import time -import json -import sys -import argparse -from typing import Dict, Any, Optional - -class ArgoWorkflowTester: - """Test harness for Argo Workflows and Nextflow integration.""" - - def __init__(self, base_url: str = "http://localhost:2746"): 
- self.base_url = base_url - self.api_url = f"{base_url}/api/v1" - - def submit_workflow(self, workflow_yaml: str) -> Dict[str, Any]: - """Submit a workflow to Argo Workflows.""" - with open(workflow_yaml, 'r') as f: - workflow_spec = yaml.safe_load(f) - - url = f"{self.api_url}/workflows/wf-poc" - headers = {'Content-Type': 'application/json'} - - print(f"Submitting workflow to {url}") - print(f"Workflow name pattern: {workflow_spec['metadata']['generateName']}") - - # Argo Workflows API expects the workflow to be wrapped in a request body - request_body = { - "workflow": workflow_spec - } - - - response = requests.post(url, json=request_body, headers=headers) - - if response.status_code in [200, 201]: - workflow = response.json() - print(f"✓ Workflow submitted successfully: {workflow['metadata']['name']}") - return workflow - else: - print(f"✗ Failed to submit workflow: {response.status_code}") - print(f"Response: {response.text}") - - # Check for RBAC issues - if "forbidden" in response.text.lower() or response.status_code == 403: - print("\n🔒 RBAC Permission Issue Detected!") - print("This appears to be a Kubernetes RBAC (Role-Based Access Control) issue.") - print("Please ensure the following resources are applied:") - print(" kubectl apply -f rbac/workflow-rbac.yaml") - print("\nOr create the necessary RBAC resources manually:") - print(" 1. ServiceAccount: nextflow-workflow-sa") - print(" 2. Role with pod creation permissions") - print(" 3. 
RoleBinding linking the ServiceAccount to the Role") - - raise Exception(f"Workflow submission failed: {response.status_code}") - - def get_workflow_status(self, namespace: str, name: str) -> Dict[str, Any]: - """Get workflow status.""" - url = f"{self.api_url}/workflows/{namespace}/{name}" - response = requests.get(url) - - if response.status_code == 200: - return response.json() - else: - raise Exception(f"Failed to get workflow status: {response.status_code}") - - def get_workflow_logs(self, namespace: str, name: str) -> Optional[str]: - """Get workflow logs.""" - try: - url = f"{self.api_url}/workflows/{namespace}/{name}/log" - response = requests.get(url) - if response.status_code == 200: - return response.text - except Exception as e: - print(f"Could not retrieve logs: {e}") - return None - - def wait_for_completion(self, namespace: str, name: str, timeout: int = 300) -> Dict[str, Any]: - """Wait for workflow to complete.""" - start_time = time.time() - - print(f"⏳ Waiting for workflow {name} to complete (timeout: {timeout}s)...") - - while time.time() - start_time < timeout: - try: - workflow = self.get_workflow_status(namespace, name) - phase = workflow.get('status', {}).get('phase', 'Unknown') - progress = workflow.get('status', {}).get('progress', 'Unknown') - - elapsed = int(time.time() - start_time) - print(f"[{elapsed:3d}s] Workflow {name} status: {phase} (progress: {progress})") - - # Check for RBAC errors in workflow status - if phase == 'Failed': - status = workflow.get('status', {}) - message = status.get('message', '') - if 'forbidden' in message.lower() or 'serviceaccount' in message.lower(): - print(f"\n🔒 RBAC Error detected in workflow: {message}") - print("Please apply RBAC configuration: kubectl apply -f rbac/workflow-rbac.yaml") - return workflow - - if phase in ['Succeeded', 'Failed', 'Error']: - return workflow - - time.sleep(10) - except Exception as e: - print(f"Error checking workflow status: {e}") - time.sleep(5) - - raise 
Exception(f"Workflow did not complete within {timeout} seconds") - - def test_nextflow_hello_world(self, workflow_file: str = 'test-workflows/nextflow-hello-world.yaml') -> bool: - """Test Nextflow hello world workflow execution.""" - try: - print("🚀 Starting Nextflow Hello World test...") - print(f"Using workflow file: {workflow_file}") - - # Submit workflow - workflow = self.submit_workflow(workflow_file) - namespace = workflow['metadata']['namespace'] - name = workflow['metadata']['name'] - - # Wait for completion - final_workflow = self.wait_for_completion(namespace, name, timeout=600) # 10 minute timeout - - # Check result - phase = final_workflow.get('status', {}).get('phase', 'Unknown') - - if phase == 'Succeeded': - print("✅ Nextflow Hello World workflow completed successfully!") - - # Get and display logs - logs = self.get_workflow_logs(namespace, name) - if logs: - print("📋 Workflow logs:") - print("-" * 50) - print(logs) - print("-" * 50) - - # Display workflow summary - status = final_workflow.get('status', {}) - if 'startedAt' in status and 'finishedAt' in status: - print(f"📊 Execution time: {status['startedAt']} → {status['finishedAt']}") - - return True - else: - print(f"❌ Workflow failed with phase: {phase}") - - # Print failure details - status = final_workflow.get('status', {}) - if 'message' in status: - print(f"Error message: {status['message']}") - - # Try to get logs for debugging - logs = self.get_workflow_logs(namespace, name) - if logs: - print("📋 Failure logs:") - print(logs) - - return False - - except Exception as e: - print(f"❌ Test failed with exception: {e}") - return False - -def test_connectivity(base_url: str) -> bool: - """Test connectivity to Argo Workflows API.""" - try: - response = requests.get(f"{base_url}/", timeout=10) - print(f"✓ Argo Workflows API accessible at {base_url} (status: {response.status_code})") - return True - except Exception as e: - print(f"✗ Cannot access Argo Workflows API at {base_url}: {e}") - return 
False - -def verify_namespace_setup() -> bool: - """Verify that required namespaces and RBAC are properly configured.""" - try: - import subprocess - - print("🔍 Verifying namespace and RBAC setup...") - - # Check if required namespaces exist - required_namespaces = ['argo-workflows', 'wf-poc', 'security'] - - for ns in required_namespaces: - result = subprocess.run(['kubectl', 'get', 'namespace', ns], - capture_output=True, text=True) - if result.returncode == 0: - print(f"✓ Namespace '{ns}' exists") - else: - print(f"✗ Namespace '{ns}' not found") - print(f" Create with: kubectl create namespace {ns}") - return False - - # Check if ServiceAccount exists in wf-poc namespace - result = subprocess.run(['kubectl', 'get', 'serviceaccount', 'nextflow-workflow-sa', '-n', 'wf-poc'], - capture_output=True, text=True) - if result.returncode == 0: - print("✓ ServiceAccount 'nextflow-workflow-sa' exists in wf-poc namespace") - else: - print("✗ ServiceAccount 'nextflow-workflow-sa' not found in wf-poc namespace") - print(" Apply RBAC with: kubectl apply -f rbac/workflow-rbac.yaml") - return False - - # Check Argo Workflows deployment location - result = subprocess.run(['kubectl', 'get', 'deployment', '-l', 'app.kubernetes.io/name=argo-workflows-server', '--all-namespaces'], - capture_output=True, text=True) - if result.returncode == 0: - print("🔍 Argo Workflows server deployment locations:") - lines = result.stdout.strip().split('\n')[1:] # Skip header - for line in lines: - if line.strip(): - parts = line.split() - namespace = parts[0] - deployment = parts[1] - print(f" - {deployment} in namespace '{namespace}'") - if namespace != 'argo-workflows': - print(f" ⚠️ Expected in 'argo-workflows' namespace, found in '{namespace}'") - - return True - - except Exception as e: - print(f"Error verifying setup: {e}") - return False - -def main(): - parser = argparse.ArgumentParser(description='Test Nextflow workflow execution in Argo Workflows') - parser.add_argument('--argo-url', 
default='http://localhost:2746', - help='Argo Workflows server URL (default: http://localhost:2746)') - parser.add_argument('--workflow-file', default='test-workflows/nextflow-hello-world.yaml', - help='Workflow YAML file to submit (default: test-workflows/nextflow-hello-world.yaml)') - parser.add_argument('--skip-connectivity-check', action='store_true', - help='Skip initial connectivity check') - - args = parser.parse_args() - - print("🧪 Nextflow Workflow Execution Test") - print("===================================") - print(f"Argo Workflows URL: {args.argo_url}") - print(f"Workflow file: {args.workflow_file}") - print() - - # Verify namespace setup first - if not verify_namespace_setup(): - print("\n❌ Namespace/RBAC setup verification failed!") - print("Please fix the issues above before running workflows.") - sys.exit(1) - - print("\n📋 Prerequisites:") - print(" 1. Argo Workflows server running and accessible") - print(" 2. RBAC resources applied: kubectl apply -f rbac/workflow-rbac.yaml") - print(" 3. All required namespaces exist and are properly configured") - print() - - # Test connectivity to Argo Workflows - if not args.skip_connectivity_check: - if not test_connectivity(args.argo_url): - sys.exit(1) - print() - - # Run the test - tester = ArgoWorkflowTester(args.argo_url) - success = tester.test_nextflow_hello_world(args.workflow_file) - - if success: - print("\n🎉 All tests passed!") - print("Nextflow hello world workflow executed successfully in Argo Workflows!") - sys.exit(0) - else: - print("\n💥 Tests failed!") - print("Check the logs above for error details.") - sys.exit(1) - -if __name__ == "__main__": - main() \ No newline at end of file