From af9c77dba6565ed38873293cbfa01b67e9c293f4 Mon Sep 17 00:00:00 2001 From: Joe Licata Date: Sun, 28 Dec 2025 19:27:51 +0000 Subject: [PATCH 1/8] refactor: Update process limits in configuration and container management - Replaced `max_processes` with `max_pids` and updated its description to clarify its role in preventing fork bombs. - Adjusted `max_processes` to reflect per-UID nproc ulimit, ensuring it is set high since `max_pids` manages per-container limits. - Added environment variable settings in `ContainerManager` to limit OpenBLAS/OpenMP threads, preventing nproc exhaustion. - Configured `pids_limit` in container settings to utilize the new `max_pids` value for enhanced process management. --- src/config/__init__.py | 13 ++++++++++++- src/config/resources.py | 13 ++++++++++++- src/services/container/manager.py | 11 +++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/config/__init__.py b/src/config/__init__.py index 510d2c5..3e72121 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -142,7 +142,18 @@ class Settings(BaseSettings): max_cpu_quota: int = Field( default=50000, ge=10000, le=100000 ) # Deprecated, use max_cpus - max_processes: int = Field(default=32, ge=1, le=128) + max_pids: int = Field( + default=512, + ge=64, + le=4096, + description="Per-container process limit (cgroup pids_limit). Prevents fork bombs.", + ) + max_processes: int = Field( + default=512, + ge=64, + le=4096, + description="Per-UID nproc ulimit. 
Set high since max_pids handles per-container limiting.", + ) max_open_files: int = Field(default=1024, ge=64, le=4096) # Resource Limits - Files diff --git a/src/config/resources.py b/src/config/resources.py index 357d6a6..d99a68d 100644 --- a/src/config/resources.py +++ b/src/config/resources.py @@ -19,7 +19,18 @@ class ResourcesConfig(BaseSettings): max_cpu_quota: int = Field( default=50000, ge=10000, le=100000 ) # Deprecated, use max_cpus - max_processes: int = Field(default=32, ge=1, le=128) + max_pids: int = Field( + default=512, + ge=64, + le=4096, + description="Per-container process limit (cgroup pids_limit). Prevents fork bombs.", + ) + max_processes: int = Field( + default=512, + ge=64, + le=4096, + description="Per-UID nproc ulimit. Set high since max_pids handles per-container limiting.", + ) max_open_files: int = Field(default=1024, ge=64, le=4096) # File Limits diff --git a/src/services/container/manager.py b/src/services/container/manager.py index 7d5bb29..40e34d9 100644 --- a/src/services/container/manager.py +++ b/src/services/container/manager.py @@ -173,6 +173,12 @@ def create_container( if repl_mode: env["REPL_MODE"] = "true" + # Limit OpenBLAS/OpenMP threads to prevent nproc exhaustion + # All containers run as same user and share the nproc limit + env.setdefault("OPENBLAS_NUM_THREADS", "1") + env.setdefault("OMP_NUM_THREADS", "1") + env.setdefault("MKL_NUM_THREADS", "1") + # Determine network configuration use_wan_access = settings.enable_wan_access @@ -253,7 +259,12 @@ def create_container( "cap_add": ["CHOWN", "DAC_OVERRIDE", "FOWNER", "SETGID", "SETUID"], "read_only": False, "tmpfs": {"/tmp": "noexec,nosuid,size=100m"}, + # pids_limit: cgroup-based per-container process limit (prevents fork bombs) + # This is truly per-container unlike nproc which is per-UID + "pids_limit": settings.max_pids, "ulimits": [ + # nproc: per-UID limit (shared across containers with same user) + # Set high since pids_limit handles per-container limiting 
docker.types.Ulimit( name="nproc", soft=settings.max_processes, From a8909b9d35e24eaaf63160bafb07afc9cc91cfc6 Mon Sep 17 00:00:00 2001 From: Joe Licata Date: Sun, 28 Dec 2025 19:35:54 +0000 Subject: [PATCH 2/8] refactor: Update process limits and configuration for container management - Replaced `MAX_PROCESSES` with `MAX_PIDS` in configuration files to clarify its role in managing per-container process limits. - Updated `security-profiles.json` to reflect new `pidsLimit` values and removed deprecated `nproc` settings. - Adjusted documentation to include `MAX_PIDS` and its description, ensuring clarity on resource limits. - Cleaned up `ContainerManager` to remove references to `nproc`, focusing on `pids_limit` for enhanced process management. --- .env.example | 2 +- docker/security-profiles.json | 39 ++++--------------------------- docs/CONFIGURATION.md | 14 +++++------ src/config/__init__.py | 6 ----- src/config/resources.py | 6 ----- src/services/container/manager.py | 14 ----------- 6 files changed, 13 insertions(+), 68 deletions(-) diff --git a/.env.example b/.env.example index bd92b13..d8a27b2 100644 --- a/.env.example +++ b/.env.example @@ -62,7 +62,7 @@ MAX_EXECUTION_TIME=30 MAX_MEMORY_MB=512 MAX_CPUS=1 MAX_CPU_QUOTA=50000 #Deprecated -MAX_PROCESSES=32 +MAX_PIDS=512 MAX_OPEN_FILES=1024 # Resource Limits - Files diff --git a/docker/security-profiles.json b/docker/security-profiles.json index a17398a..9e4ea37 100644 --- a/docker/security-profiles.json +++ b/docker/security-profiles.json @@ -10,12 +10,8 @@ "tmpfs": { "/tmp": "rw,noexec,nosuid,size=100m" }, + "pidsLimit": 512, "ulimits": [ - { - "name": "nproc", - "soft": 32, - "hard": 32 - }, { "name": "nofile", "soft": 1024, @@ -42,12 +38,8 @@ "tmpfs": { "/tmp": "rw,noexec,nosuid,size=50m" }, + "pidsLimit": 256, "ulimits": [ - { - "name": "nproc", - "soft": 16, - "hard": 16 - }, { "name": "nofile", "soft": 512, @@ -67,36 +59,15 @@ "languageProfiles": { "python": { "inherits": "defaultProfile", - "description": 
"Security profile optimized for Python execution", - "ulimits": [ - { - "name": "nproc", - "soft": 64, - "hard": 64 - } - ] + "description": "Security profile optimized for Python execution" }, "nodejs": { "inherits": "defaultProfile", - "description": "Security profile optimized for Node.js execution", - "ulimits": [ - { - "name": "nproc", - "soft": 48, - "hard": 48 - } - ] + "description": "Security profile optimized for Node.js execution" }, "java": { "inherits": "defaultProfile", - "description": "Security profile optimized for Java execution", - "ulimits": [ - { - "name": "nproc", - "soft": 128, - "hard": 128 - } - ] + "description": "Security profile optimized for Java execution" }, "go": { "inherits": "defaultProfile", diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index f277221..8c10c31 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -184,13 +184,13 @@ Docker is used for secure code execution in containers. #### Execution Limits -| Variable | Default | Description | -| -------------------- | ------- | ------------------------------------- | -| `MAX_EXECUTION_TIME` | `30` | Maximum code execution time (seconds) | -| `MAX_MEMORY_MB` | `512` | Maximum memory per execution (MB) | -| `MAX_CPU_QUOTA` | `50000` | CPU quota (100000 = 1 CPU) | -| `MAX_PROCESSES` | `32` | Maximum processes per container | -| `MAX_OPEN_FILES` | `1024` | Maximum open files per container | +| Variable | Default | Description | +| -------------------- | ------- | ---------------------------------------------------------------- | +| `MAX_EXECUTION_TIME` | `30` | Maximum code execution time (seconds) | +| `MAX_MEMORY_MB` | `512` | Maximum memory per execution (MB) | +| `MAX_CPU_QUOTA` | `50000` | CPU quota (100000 = 1 CPU) | +| `MAX_PIDS` | `512` | Per-container process limit (cgroup pids_limit, prevents fork bombs) | +| `MAX_OPEN_FILES` | `1024` | Maximum open files per container | #### File Limits diff --git a/src/config/__init__.py b/src/config/__init__.py 
index 3e72121..3287b70 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -148,12 +148,6 @@ class Settings(BaseSettings): le=4096, description="Per-container process limit (cgroup pids_limit). Prevents fork bombs.", ) - max_processes: int = Field( - default=512, - ge=64, - le=4096, - description="Per-UID nproc ulimit. Set high since max_pids handles per-container limiting.", - ) max_open_files: int = Field(default=1024, ge=64, le=4096) # Resource Limits - Files diff --git a/src/config/resources.py b/src/config/resources.py index d99a68d..3b4479e 100644 --- a/src/config/resources.py +++ b/src/config/resources.py @@ -25,12 +25,6 @@ class ResourcesConfig(BaseSettings): le=4096, description="Per-container process limit (cgroup pids_limit). Prevents fork bombs.", ) - max_processes: int = Field( - default=512, - ge=64, - le=4096, - description="Per-UID nproc ulimit. Set high since max_pids handles per-container limiting.", - ) max_open_files: int = Field(default=1024, ge=64, le=4096) # File Limits diff --git a/src/services/container/manager.py b/src/services/container/manager.py index 40e34d9..3bc93b8 100644 --- a/src/services/container/manager.py +++ b/src/services/container/manager.py @@ -173,12 +173,6 @@ def create_container( if repl_mode: env["REPL_MODE"] = "true" - # Limit OpenBLAS/OpenMP threads to prevent nproc exhaustion - # All containers run as same user and share the nproc limit - env.setdefault("OPENBLAS_NUM_THREADS", "1") - env.setdefault("OMP_NUM_THREADS", "1") - env.setdefault("MKL_NUM_THREADS", "1") - # Determine network configuration use_wan_access = settings.enable_wan_access @@ -260,16 +254,8 @@ def create_container( "read_only": False, "tmpfs": {"/tmp": "noexec,nosuid,size=100m"}, # pids_limit: cgroup-based per-container process limit (prevents fork bombs) - # This is truly per-container unlike nproc which is per-UID "pids_limit": settings.max_pids, "ulimits": [ - # nproc: per-UID limit (shared across containers with same user) - # 
Set high since pids_limit handles per-container limiting - docker.types.Ulimit( - name="nproc", - soft=settings.max_processes, - hard=settings.max_processes, - ), docker.types.Ulimit( name="nofile", soft=settings.max_open_files, From f83df4fee2f6adae3d7de00e35cd8f0bc00f48ff Mon Sep 17 00:00:00 2001 From: Joe Licata Date: Sun, 28 Dec 2025 19:43:20 +0000 Subject: [PATCH 3/8] refactor: Update container configuration settings for read-only and tmpfs options - Replaced hardcoded values for `read_only` and `tmpfs` in `ContainerManager` with settings from the configuration to enhance flexibility and maintainability. - This change allows for easier adjustments to container settings without modifying the code directly. --- src/services/container/manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/services/container/manager.py b/src/services/container/manager.py index 3bc93b8..b0027ae 100644 --- a/src/services/container/manager.py +++ b/src/services/container/manager.py @@ -251,8 +251,8 @@ def create_container( "security_opt": ["no-new-privileges:true"], "cap_drop": ["ALL"], "cap_add": ["CHOWN", "DAC_OVERRIDE", "FOWNER", "SETGID", "SETUID"], - "read_only": False, - "tmpfs": {"/tmp": "noexec,nosuid,size=100m"}, + "read_only": settings.docker_read_only, + "tmpfs": settings.docker_tmpfs, # pids_limit: cgroup-based per-container process limit (prevents fork bombs) "pids_limit": settings.max_pids, "ulimits": [ From e47e2a097f33f92fd923d745ef4d3b3476439bac Mon Sep 17 00:00:00 2001 From: Joe Licata Date: Sun, 28 Dec 2025 20:39:11 +0000 Subject: [PATCH 4/8] feat: Implement seccomp profile for enhanced container security - Added a new seccomp profile (`seccomp-sandbox.json`) to block dangerous syscalls, enhancing container security. - Updated `docker-compose.yml` to mount the seccomp profile into the container. - Modified configuration settings to allow specification of the seccomp profile path. 
- Enhanced `ContainerManager` to load and apply the seccomp profile during container creation. - Added integration tests to verify that the seccomp profile correctly blocks the `ptrace` syscall and that the profile file is valid. --- docker-compose.yml | 1 + docker/seccomp-sandbox.json | 176 ++++++++++++++++++ docker/security-profiles.json | 2 +- src/config/__init__.py | 4 + src/config/docker.py | 7 +- src/services/container/manager.py | 48 ++++- tests/integration/test_container_hardening.py | 112 +++++++++++ 7 files changed, 341 insertions(+), 9 deletions(-) create mode 100644 docker/seccomp-sandbox.json diff --git a/docker-compose.yml b/docker-compose.yml index 1f3703d..0331295 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,6 +40,7 @@ services: - ${SSL_CERTS_PATH:-./ssl}:/app/ssl - ./dashboard:/app/dashboard - ./src:/app/src + - ./docker:/app/docker:ro # Seccomp profile for container security depends_on: - redis - minio diff --git a/docker/seccomp-sandbox.json b/docker/seccomp-sandbox.json new file mode 100644 index 0000000..ee476ad --- /dev/null +++ b/docker/seccomp-sandbox.json @@ -0,0 +1,176 @@ +{ + "defaultAction": "SCMP_ACT_ALLOW", + "defaultErrnoRet": 1, + "architectures": [ + "SCMP_ARCH_X86_64", + "SCMP_ARCH_X86", + "SCMP_ARCH_X32", + "SCMP_ARCH_AARCH64", + "SCMP_ARCH_ARM" + ], + "syscalls": [ + { + "names": [ + "ptrace" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block process tracing - caused container hang with PTRACE_TRACEME" + }, + { + "names": [ + "process_vm_readv", + "process_vm_writev" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block cross-process memory access" + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block ASLR disabling" + }, + { + "names": [ + "mount", + "umount", + "umount2" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block filesystem manipulation (defense-in-depth, also blocked by CAP_SYS_ADMIN)" 
+ }, + { + "names": [ + "pivot_root", + "chroot" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block container escape vectors (defense-in-depth, also blocked by CAP_SYS_CHROOT)" + }, + { + "names": [ + "reboot", + "kexec_load", + "kexec_file_load" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block system disruption (defense-in-depth, also blocked by CAP_SYS_BOOT)" + }, + { + "names": [ + "init_module", + "finit_module", + "delete_module" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block kernel module manipulation (defense-in-depth, also blocked by CAP_SYS_MODULE)" + }, + { + "names": [ + "acct" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block process accounting manipulation (defense-in-depth, also blocked by CAP_SYS_PACCT)" + }, + { + "names": [ + "swapon", + "swapoff" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block swap manipulation (defense-in-depth, also blocked by CAP_SYS_ADMIN)" + }, + { + "names": [ + "sethostname", + "setdomainname" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block host identity changes (defense-in-depth, also blocked by CAP_SYS_ADMIN)" + }, + { + "names": [ + "clock_settime", + "clock_adjtime", + "settimeofday", + "adjtimex" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block time manipulation (defense-in-depth, also blocked by CAP_SYS_TIME)" + }, + { + "names": [ + "iopl", + "ioperm" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block direct I/O port access (defense-in-depth, also blocked by CAP_SYS_RAWIO)" + }, + { + "names": [ + "create_module", + "get_kernel_syms", + "query_module" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block legacy kernel module syscalls" + }, + { + "names": [ + "unshare", + "setns" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block namespace manipulation" + }, + { + "names": [ + "userfaultfd" + ], + 
"action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block userfaultfd - can be used in exploits" + }, + { + "names": [ + "bpf" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block BPF - powerful and often exploited" + }, + { + "names": [ + "perf_event_open" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block perf events - can leak kernel info" + }, + { + "names": [ + "add_key", + "keyctl", + "request_key" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 1, + "comment": "Block kernel keyring manipulation" + } + ] +} diff --git a/docker/security-profiles.json b/docker/security-profiles.json index 9e4ea37..c2341d1 100644 --- a/docker/security-profiles.json +++ b/docker/security-profiles.json @@ -30,7 +30,7 @@ "restrictedProfile": { "name": "code-interpreter-restricted", "description": "Highly restricted security profile for untrusted code", - "securityOpt": ["no-new-privileges:true", "seccomp:unconfined"], + "securityOpt": ["no-new-privileges:true"], "capDrop": ["ALL"], "capAdd": [], "readOnly": true, diff --git a/src/config/__init__.py b/src/config/__init__.py index 3287b70..cddff4f 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -129,6 +129,10 @@ class Settings(BaseSettings): docker_tmpfs: Dict[str, str] = Field( default_factory=lambda: {"/tmp": "rw,noexec,nosuid,size=100m"} ) + docker_seccomp_profile: Optional[str] = Field( + default="docker/seccomp-sandbox.json", + description="Path to seccomp profile JSON file (None to disable)", + ) # Resource Limits - Execution max_execution_time: int = Field(default=30, ge=1, le=300) diff --git a/src/config/docker.py b/src/config/docker.py index 5329dd1..c4c4a5f 100644 --- a/src/config/docker.py +++ b/src/config/docker.py @@ -1,6 +1,6 @@ """Docker configuration.""" -from typing import Dict, List +from typing import Dict, List, Optional from pydantic import Field from pydantic_settings import BaseSettings @@ -22,6 +22,11 @@ class DockerConfig(BaseSettings): 
default_factory=lambda: {"/tmp": "rw,noexec,nosuid,size=100m"}, alias="docker_tmpfs", ) + seccomp_profile: Optional[str] = Field( + default="docker/seccomp-sandbox.json", + alias="docker_seccomp_profile", + description="Path to seccomp profile JSON file (None to disable)", + ) # Container lifecycle container_ttl_minutes: int = Field(default=5, ge=1, le=1440) diff --git a/src/services/container/manager.py b/src/services/container/manager.py index b0027ae..be91ed1 100644 --- a/src/services/container/manager.py +++ b/src/services/container/manager.py @@ -2,9 +2,11 @@ import asyncio import io +import json import tarfile import uuid from datetime import datetime +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import structlog @@ -233,11 +235,43 @@ def create_container( except Exception: pass + # Build security options with seccomp profile + security_opts = list(settings.docker_security_opt) + if settings.docker_seccomp_profile: + # Resolve profile path (relative to project root or absolute) + profile_path = Path(settings.docker_seccomp_profile) + if not profile_path.is_absolute(): + # Relative to project root (4 levels up from this file) + project_root = Path(__file__).parent.parent.parent.parent + profile_path = project_root / profile_path + if profile_path.exists(): + try: + with open(profile_path) as f: + seccomp_data = json.load(f) + # docker-py accepts inline JSON via seccomp= + security_opts.append(f"seccomp={json.dumps(seccomp_data)}") + logger.debug( + "Loaded seccomp profile", + profile=str(profile_path), + blocked_syscalls=len(seccomp_data.get("syscalls", [])), + ) + except Exception as e: + logger.warning( + "Failed to load seccomp profile, using default", + profile=str(profile_path), + error=str(e), + ) + else: + logger.warning( + "Seccomp profile not found, using default", + profile=str(profile_path), + ) + # Build container config # Security hardening applied: - # - ulimits: nproc and nofile limits to prevent fork bombs and FD 
exhaustion - # Note: /proc/kallsyms and /proc/modules masking would require MaskedPaths - # (not supported by docker-py) or a custom seccomp profile. + # - seccomp profile: blocks dangerous syscalls (ptrace, etc.) + # - ulimits: nofile limits to prevent FD exhaustion + # - pids_limit: prevents fork bombs container_config: Dict[str, Any] = { "image": image, "name": container_name, @@ -248,10 +282,11 @@ def create_container( "mem_limit": f"{settings.max_memory_mb}m", "memswap_limit": f"{settings.max_memory_mb}m", "nano_cpus": int(settings.max_cpus * 1e9), - "security_opt": ["no-new-privileges:true"], + "security_opt": security_opts, "cap_drop": ["ALL"], "cap_add": ["CHOWN", "DAC_OVERRIDE", "FOWNER", "SETGID", "SETUID"], - "read_only": settings.docker_read_only, + # read_only must be False to allow file uploads to /mnt/data + "read_only": False, "tmpfs": settings.docker_tmpfs, # pids_limit: cgroup-based per-container process limit (prevents fork bombs) "pids_limit": settings.max_pids, @@ -263,8 +298,7 @@ def create_container( ), ], # Note: /proc/kallsyms and /proc/modules masking requires MaskedPaths - # which docker-py doesn't support. Bind mounts to /proc are blocked by runc. - # Alternative: use a custom seccomp profile or accept the limitation. + # which docker-py doesn't support. These paths are read-only by default. "environment": env, "labels": labels, "hostname": settings.container_generic_hostname, diff --git a/tests/integration/test_container_hardening.py b/tests/integration/test_container_hardening.py index ba0728d..5ddbaed 100644 --- a/tests/integration/test_container_hardening.py +++ b/tests/integration/test_container_hardening.py @@ -355,3 +355,115 @@ def test_resolv_conf_no_internal_domains(self, client, auth_headers): assert "internal" not in stdout.lower() finally: app.dependency_overrides.clear() + + def test_ptrace_blocked_by_seccomp(self, client, auth_headers): + """Verify ptrace syscall is blocked by seccomp profile. 
+ + This test verifies that the seccomp profile blocks ptrace, + which prevents process tracing attacks that can cause containers + to become unresponsive to stop signals. + """ + session_id = "hardening-ptrace-test" + mock_session = create_session(session_id) + + # Mock execution that attempts ptrace - should return -1 (EPERM) + # When seccomp blocks ptrace, it returns EPERM (-1) + mock_execution = CodeExecution( + execution_id="exec-ptrace", + session_id=session_id, + code=""" +import ctypes +libc = ctypes.CDLL("libc.so.6") +result = libc.ptrace(0, 0, 0, 0) # PTRACE_TRACEME +print(f"ptrace result: {result}") +""", + language="py", + status=ExecutionStatus.COMPLETED, + exit_code=0, + outputs=[ + ExecutionOutput( + type=OutputType.STDOUT, + content="ptrace result: -1\n", + timestamp=datetime.now(timezone.utc), + ) + ], + ) + + mock_session_service = AsyncMock() + mock_session_service.create_session.return_value = mock_session + mock_session_service.get_session.return_value = mock_session + + mock_execution_service = AsyncMock() + mock_execution_service.execute_code.return_value = ( + mock_execution, + None, + None, + [], + "pool_hit", + ) + + mock_file_service = AsyncMock() + mock_file_service.list_files.return_value = [] + + from src.dependencies.services import ( + get_session_service, + get_execution_service, + get_file_service, + ) + + app.dependency_overrides[get_session_service] = lambda: mock_session_service + app.dependency_overrides[get_execution_service] = lambda: mock_execution_service + app.dependency_overrides[get_file_service] = lambda: mock_file_service + + try: + response = client.post( + "/exec", + json={ + "code": """ +import ctypes +libc = ctypes.CDLL("libc.so.6") +result = libc.ptrace(0, 0, 0, 0) +print(f"ptrace result: {result}") +""", + "lang": "py", + }, + headers=auth_headers, + ) + + assert response.status_code == 200 + data = response.json() + stdout = data.get("stdout", "") + # When seccomp blocks ptrace, it returns -1 (EPERM) + assert 
"ptrace result: -1" in stdout + finally: + app.dependency_overrides.clear() + + def test_seccomp_profile_config_exists(self): + """Verify seccomp profile configuration is set.""" + from src.config import settings + + assert settings.docker_seccomp_profile == "docker/seccomp-sandbox.json" + + def test_seccomp_profile_file_exists(self): + """Verify seccomp profile file exists and is valid JSON.""" + import json + from pathlib import Path + + profile_path = Path("docker/seccomp-sandbox.json") + assert profile_path.exists(), "Seccomp profile file should exist" + + with open(profile_path) as f: + profile = json.load(f) + + # Verify structure + assert "defaultAction" in profile + assert "syscalls" in profile + assert isinstance(profile["syscalls"], list) + + # Verify ptrace is blocked + blocked_syscalls = [] + for rule in profile["syscalls"]: + if rule.get("action") == "SCMP_ACT_ERRNO": + blocked_syscalls.extend(rule.get("names", [])) + + assert "ptrace" in blocked_syscalls, "ptrace should be blocked by seccomp" From 109eea5ad4311d828d561c75fe6c4f9993ae977b Mon Sep 17 00:00:00 2001 From: Joe Licata Date: Sun, 28 Dec 2025 21:01:30 +0000 Subject: [PATCH 5/8] chore: Update configuration and tests for file handling and resource limits - Increased `MAX_EXECUTION_TIME` from 30 to 120 seconds in `.env.example` for extended processing capabilities. - Enabled `ENABLE_ORPHAN_MINIO_CLEANUP` in `.env.example` to allow periodic cleanup of orphaned MinIO objects. - Disabled documentation generation by setting `ENABLE_DOCS` to false in `.env.example`. - Added new integration tests in `test_file_handling.py` to validate the complete workflow of uploading, analyzing, and downloading CSV and JSON files, as well as processing images with OpenCV. 
--- .env.example | 6 +- .gitignore | 2 +- tests/integration/test_file_handling.py | 360 ++++++++++++++++++++++++ 3 files changed, 364 insertions(+), 4 deletions(-) diff --git a/.env.example b/.env.example index d8a27b2..dd360bf 100644 --- a/.env.example +++ b/.env.example @@ -58,7 +58,7 @@ DOCKER_NETWORK_MODE=none DOCKER_READ_ONLY=true # Resource Limits - Execution -MAX_EXECUTION_TIME=30 +MAX_EXECUTION_TIME=120 MAX_MEMORY_MB=512 MAX_CPUS=1 MAX_CPU_QUOTA=50000 #Deprecated @@ -84,7 +84,7 @@ SESSION_ID_LENGTH=32 # MinIO Orphan Cleanup (optional) # Enable periodic pruning of MinIO objects older than TTL with missing Redis sessions -ENABLE_ORPHAN_MINIO_CLEANUP=false +ENABLE_ORPHAN_MINIO_CLEANUP=true # Container Pool Configuration CONTAINER_POOL_ENABLED=true @@ -174,4 +174,4 @@ HEALTH_CHECK_TIMEOUT=5 # Development Configuration ENABLE_CORS=false # CORS_ORIGINS=http://localhost:3000,http://localhost:8080 -ENABLE_DOCS=true \ No newline at end of file +ENABLE_DOCS=false \ No newline at end of file diff --git a/.gitignore b/.gitignore index 93037e9..2b5f249 100644 --- a/.gitignore +++ b/.gitignore @@ -58,7 +58,7 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ - +baseline_complexity.json # Translations *.mo *.pot diff --git a/tests/integration/test_file_handling.py b/tests/integration/test_file_handling.py index 251e79e..224f198 100644 --- a/tests/integration/test_file_handling.py +++ b/tests/integration/test_file_handling.py @@ -336,3 +336,363 @@ async def test_large_file_generation(self, ssl_context, headers): f"Large file too small: {len(content)} bytes" ) assert content[:8] == b'\x89PNG\r\n\x1a\n', "Invalid PNG" + + +class TestUploadAnalyzeDownload: + """Test complete workflow: upload file → analyze with pandas → download results.""" + + @pytest.mark.asyncio + async def test_upload_csv_analyze_download_results(self, ssl_context, headers): + """Test uploading a CSV, performing pandas analysis, and downloading the results.""" + connector = 
aiohttp.TCPConnector(ssl=ssl_context) + async with aiohttp.ClientSession(connector=connector) as session: + import time + entity_id = f"test-upload-analyze-{int(time.time())}" + + # Step 1: Upload a CSV file + csv_content = "product,quantity,price\nWidget A,100,9.99\nWidget B,250,14.99\nWidget C,75,24.99\nWidget D,300,4.99\nWidget E,150,19.99" + + form_data = aiohttp.FormData() + form_data.add_field('files', csv_content.encode(), + filename='sales_data.csv', + content_type='text/csv') + form_data.add_field('entity_id', entity_id) + + upload_headers = {"X-API-Key": API_KEY} + + async with session.post( + f"{API_URL}/upload", + data=form_data, + headers=upload_headers, + ssl=ssl_context + ) as resp: + assert resp.status == 200, f"Upload failed: {await resp.text()}" + upload_result = await resp.json() + + session_id = upload_result.get("session_id") + uploaded_files = upload_result.get("files", []) + assert len(uploaded_files) >= 1, "No files in upload response" + + uploaded_file = uploaded_files[0] + file_id = uploaded_file.get("id") or uploaded_file.get("fileId") + assert file_id is not None, "No file ID returned" + + # Step 2: Execute analysis code that reads the uploaded file and creates a report + from textwrap import dedent + analysis_code = dedent(""" + import pandas as pd + + # Read the uploaded CSV (files are placed in /mnt/data/) + df = pd.read_csv('/mnt/data/sales_data.csv') + + # Perform analysis + df['total_value'] = df['quantity'] * df['price'] + summary = df.describe() + + # Create a summary report + report = f'''Sales Analysis Report + ===================== + Total Products: {len(df)} + Total Revenue: ${df["total_value"].sum():.2f} + Average Price: ${df["price"].mean():.2f} + Top Product by Quantity: {df.loc[df["quantity"].idxmax(), "product"]} + Top Product by Value: {df.loc[df["total_value"].idxmax(), "product"]} + ''' + + # Save the analysis results + df.to_csv('/mnt/data/analyzed_sales.csv', index=False) + + with open('/mnt/data/sales_report.txt', 
'w') as f: + f.write(report) + + print(report) + """).strip() + + exec_payload = { + "lang": "py", + "code": analysis_code, + "entity_id": entity_id, + "files": [{ + "id": file_id, + "session_id": session_id, + "name": "sales_data.csv" + }] + } + + async with session.post( + f"{API_URL}/exec", + json=exec_payload, + headers=headers, + ssl=ssl_context + ) as resp: + assert resp.status == 200, f"Exec failed: {await resp.text()}" + exec_result = await resp.json() + + # Verify execution succeeded + stdout = exec_result.get("stdout", "") + stderr = exec_result.get("stderr", "") + assert "Sales Analysis Report" in stdout, f"Analysis failed. stdout: {stdout}, stderr: {stderr}" + + # Find generated files + files = exec_result.get("files", []) + assert len(files) >= 2, f"Expected 2 output files, got {len(files)}" + + csv_output = next((f for f in files if "analyzed_sales.csv" in f.get("name", "")), None) + txt_output = next((f for f in files if "sales_report.txt" in f.get("name", "")), None) + + assert csv_output is not None, "analyzed_sales.csv not found in output" + assert txt_output is not None, "sales_report.txt not found in output" + + # Use session_id from exec result for downloading generated files + exec_session_id = exec_result.get("session_id") + + # Step 3: Download and verify the analyzed CSV + download_url = f"{API_URL}/download/{exec_session_id}/{csv_output['id']}" + async with session.get(download_url, headers=upload_headers, ssl=ssl_context) as resp: + assert resp.status == 200, f"CSV download failed: {resp.status}" + csv_result = await resp.text() + + # Verify the analysis added the total_value column + assert "total_value" in csv_result, "Analysis column not found in output CSV" + assert "Widget A" in csv_result + # Widget A: 100 * 9.99 = 999.0 + assert "999" in csv_result, "Calculated total_value not found" + + # Step 4: Download and verify the text report + download_url = f"{API_URL}/download/{exec_session_id}/{txt_output['id']}" + async with 
session.get(download_url, headers=upload_headers, ssl=ssl_context) as resp: + assert resp.status == 200, f"Report download failed: {resp.status}" + report_content = await resp.text() + + assert "Sales Analysis Report" in report_content + assert "Total Products: 5" in report_content + assert "Total Revenue:" in report_content + assert "Top Product by Quantity:" in report_content + + @pytest.mark.asyncio + async def test_upload_image_process_download(self, ssl_context, headers): + """Test uploading an image, processing with OpenCV, and downloading the result.""" + connector = aiohttp.TCPConnector(ssl=ssl_context) + async with aiohttp.ClientSession(connector=connector) as session: + import time + entity_id = f"test-image-process-{int(time.time())}" + + # Step 1: Create and upload a simple PNG image (100x100 red square) + # PNG header for a minimal valid image is complex, so we'll generate one with code + # First, execute code to create a test image, then use it + + # Create a test image via execution first + create_image_code = """ +import cv2 +import numpy as np + +# Create a simple test image (100x100 with colored squares) +img = np.zeros((100, 100, 3), dtype=np.uint8) +img[0:50, 0:50] = [255, 0, 0] # Blue (BGR) +img[0:50, 50:100] = [0, 255, 0] # Green +img[50:100, 0:50] = [0, 0, 255] # Red +img[50:100, 50:100] = [255, 255, 0] # Cyan + +cv2.imwrite('/mnt/data/test_input.png', img) +print(f'Created test image: {img.shape}') +""" + + async with session.post( + f"{API_URL}/exec", + json={"lang": "py", "code": create_image_code, "entity_id": entity_id}, + headers=headers, + ssl=ssl_context + ) as resp: + assert resp.status == 200 + result = await resp.json() + session_id = result.get("session_id") + + # Get the created image file + files = result.get("files", []) + input_image = next((f for f in files if "test_input.png" in f.get("name", "")), None) + assert input_image is not None, "Test image not created" + + # Step 2: Process the image (apply blur and edge detection) 
+ process_code = """ +import cv2 +import numpy as np + +# Read the input image (files are placed in /mnt/data/) +img = cv2.imread('/mnt/data/test_input.png') +print(f'Input shape: {img.shape}') + +# Apply Gaussian blur +blurred = cv2.GaussianBlur(img, (5, 5), 0) + +# Apply Canny edge detection +gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY) +edges = cv2.Canny(gray, 50, 150) + +# Save results +cv2.imwrite('/mnt/data/blurred.png', blurred) +cv2.imwrite('/mnt/data/edges.png', edges) + +print(f'Processed images saved. Edges shape: {edges.shape}') +""" + + exec_payload = { + "lang": "py", + "code": process_code, + "entity_id": entity_id, + "files": [{ + "id": input_image['id'], + "session_id": session_id, + "name": "test_input.png" + }] + } + + async with session.post( + f"{API_URL}/exec", + json=exec_payload, + headers=headers, + ssl=ssl_context + ) as resp: + assert resp.status == 200, f"Processing failed: {await resp.text()}" + result = await resp.json() + + stdout = result.get("stdout", "") + stderr = result.get("stderr", "") + assert "Processed images saved" in stdout, f"Processing failed. 
stderr: {stderr}" + + files = result.get("files", []) + blurred_file = next((f for f in files if "blurred.png" in f.get("name", "")), None) + edges_file = next((f for f in files if "edges.png" in f.get("name", "")), None) + + assert blurred_file is not None, "blurred.png not found" + assert edges_file is not None, "edges.png not found" + + # Step 3: Download and verify the processed images + upload_headers = {"X-API-Key": API_KEY} + + # Download blurred image + download_url = f"{API_URL}/download/{session_id}/{blurred_file['id']}" + async with session.get(download_url, headers=upload_headers, ssl=ssl_context) as resp: + assert resp.status == 200 + content = await resp.read() + assert len(content) > 100, f"Blurred image too small: {len(content)}" + assert content[:4] == b'\x89PNG', "Blurred output is not a valid PNG" + + # Download edges image + download_url = f"{API_URL}/download/{session_id}/{edges_file['id']}" + async with session.get(download_url, headers=upload_headers, ssl=ssl_context) as resp: + assert resp.status == 200 + content = await resp.read() + assert len(content) > 100, f"Edges image too small: {len(content)}" + assert content[:4] == b'\x89PNG', "Edges output is not a valid PNG" + + @pytest.mark.asyncio + async def test_upload_json_transform_download(self, ssl_context, headers): + """Test uploading JSON data, transforming it, and downloading the result.""" + connector = aiohttp.TCPConnector(ssl=ssl_context) + async with aiohttp.ClientSession(connector=connector) as session: + import time + import json + entity_id = f"test-json-transform-{int(time.time())}" + + # Step 1: Upload JSON data + json_data = { + "users": [ + {"name": "Alice", "age": 30, "department": "Engineering"}, + {"name": "Bob", "age": 25, "department": "Marketing"}, + {"name": "Charlie", "age": 35, "department": "Engineering"}, + {"name": "Diana", "age": 28, "department": "Sales"}, + {"name": "Eve", "age": 32, "department": "Engineering"} + ] + } + + form_data = aiohttp.FormData() + 
form_data.add_field('files', json.dumps(json_data).encode(), + filename='users.json', + content_type='application/json') + form_data.add_field('entity_id', entity_id) + + upload_headers = {"X-API-Key": API_KEY} + + async with session.post( + f"{API_URL}/upload", + data=form_data, + headers=upload_headers, + ssl=ssl_context + ) as resp: + assert resp.status == 200, f"Upload failed: {await resp.text()}" + upload_result = await resp.json() + session_id = upload_result.get("session_id") + uploaded_file = upload_result.get("files", [])[0] + file_id = uploaded_file.get("id") or uploaded_file.get("fileId") + + # Step 2: Transform the data + from textwrap import dedent + transform_code = dedent(""" + import json + import pandas as pd + + # Read uploaded JSON (files are placed in /mnt/data/) + with open('/mnt/data/users.json') as f: + data = json.load(f) + + # Convert to DataFrame and analyze + df = pd.DataFrame(data['users']) + + # Group by department + dept_summary = df.groupby('department').agg({ + 'name': 'count', + 'age': 'mean' + }).rename(columns={'name': 'count', 'age': 'avg_age'}) + + # Create output + output = { + 'total_users': len(df), + 'avg_age': df['age'].mean(), + 'department_breakdown': dept_summary.to_dict('index') + } + + # Save transformed data + with open('/mnt/data/analysis.json', 'w') as f: + json.dump(output, f, indent=2) + + print(json.dumps(output, indent=2)) + """).strip() + + exec_payload = { + "lang": "py", + "code": transform_code, + "entity_id": entity_id, + "files": [{ + "id": file_id, + "session_id": session_id, + "name": "users.json" + }] + } + + async with session.post( + f"{API_URL}/exec", + json=exec_payload, + headers=headers, + ssl=ssl_context + ) as resp: + assert resp.status == 200 + result = await resp.json() + + files = result.get("files", []) + json_output = next((f for f in files if "analysis.json" in f.get("name", "")), None) + assert json_output is not None, "analysis.json not found" + + # Use session_id from exec result for 
downloading + exec_session_id = result.get("session_id") + + # Step 3: Download and verify the result + download_url = f"{API_URL}/download/{exec_session_id}/{json_output['id']}" + async with session.get(download_url, headers=upload_headers, ssl=ssl_context) as resp: + assert resp.status == 200 + content = await resp.text() + + result_data = json.loads(content) + assert result_data['total_users'] == 5 + assert 'department_breakdown' in result_data + assert 'Engineering' in result_data['department_breakdown'] + assert result_data['department_breakdown']['Engineering']['count'] == 3 From 8193e1be4ab10548b94703c2fffbeb7707f49ea3 Mon Sep 17 00:00:00 2001 From: Joe Licata Date: Sun, 28 Dec 2025 21:05:05 +0000 Subject: [PATCH 6/8] refactor: Rename max_processes to max_pids in configuration settings - Updated the configuration to replace `max_processes` with `max_pids` for clarity in managing per-container process limits. --- src/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config/__init__.py b/src/config/__init__.py index cddff4f..168f084 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -547,7 +547,7 @@ def resources(self) -> ResourcesConfig: max_memory_mb=self.max_memory_mb, max_cpus=self.max_cpus, max_cpu_quota=self.max_cpu_quota, - max_processes=self.max_processes, + max_pids=self.max_pids, max_open_files=self.max_open_files, max_file_size_mb=self.max_file_size_mb, max_total_file_size_mb=self.max_total_file_size_mb, From fa0d182bf9b3438964f54eced71b6cc564046714 Mon Sep 17 00:00:00 2001 From: Joe Licata Date: Sun, 28 Dec 2025 21:12:33 +0000 Subject: [PATCH 7/8] feat: Expand allowed file extensions in security configuration - Added a comprehensive list of allowed file extensions for various categories including text, Microsoft Office, OpenDocument formats, data formats, images, web files, scripts, and archives to enhance file handling capabilities. 
--- src/config/security.py | 61 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/src/config/security.py b/src/config/security.py index 26cc9c2..0430582 100644 --- a/src/config/security.py +++ b/src/config/security.py @@ -17,7 +17,48 @@ class SecurityConfig(BaseSettings): # File Security allowed_file_extensions: List[str] = Field( default_factory=lambda: [ + # Text and documentation ".txt", + ".md", + ".rtf", + ".pdf", + # Microsoft Office + ".doc", + ".docx", + ".dotx", + ".xls", + ".xlsx", + ".xltx", + ".ppt", + ".pptx", + ".potx", + ".ppsx", + # OpenDocument formats + ".odt", + ".ods", + ".odp", + ".odg", + # Data formats + ".json", + ".csv", + ".xml", + ".yaml", + ".yml", + ".sql", + # Images + ".png", + ".jpg", + ".jpeg", + ".gif", + ".svg", + ".bmp", + ".webp", + ".ico", + # Web + ".html", + ".htm", + ".css", + # Code files ".py", ".js", ".ts", @@ -32,18 +73,24 @@ class SecurityConfig(BaseSettings): ".r", ".f90", ".d", - ".json", - ".csv", - ".xml", - ".yaml", - ".yml", - ".md", - ".sql", + # Scripts and config ".sh", ".bat", ".ps1", ".dockerfile", ".makefile", + ".ini", + ".cfg", + ".conf", + ".log", + # Archives + ".zip", + # Email and calendar + ".eml", + ".msg", + ".mbox", + ".ics", + ".vcf", ] ) blocked_file_patterns: List[str] = Field( From 8d6c5d8af9f45bc00c1772bc75ceace92ff67312 Mon Sep 17 00:00:00 2001 From: Joe Licata Date: Sun, 28 Dec 2025 21:32:56 +0000 Subject: [PATCH 8/8] feat: Implement filename sanitization for file uploads and listings - Added `sanitize_filename` method in `OutputProcessor` to ensure filenames are safe for container use by replacing non-alphanumeric characters and handling edge cases. - Updated `upload_file` and `list_files` functions to utilize the new sanitization logic, ensuring consistent filename formatting across file operations. - Introduced unit tests for `sanitize_filename` to validate its functionality and edge case handling. 
--- src/api/files.py | 15 ++-- src/services/execution/output.py | 58 ++++++++++++-- tests/unit/test_output_processor.py | 119 ++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 12 deletions(-) create mode 100644 tests/unit/test_output_processor.py diff --git a/src/api/files.py b/src/api/files.py index 7b38146..6f436a0 100644 --- a/src/api/files.py +++ b/src/api/files.py @@ -15,6 +15,7 @@ # Local application imports from ..config import settings from ..dependencies import FileServiceDep +from ..services.execution.output import OutputProcessor from ..utils.id_generator import generate_session_id @@ -119,13 +120,13 @@ async def upload_file( content_type=file.content_type, ) - # Get file info for complete details - file_info = await file_service.get_file_info(session_id, file_id) + # Sanitize filename to match what will be used in container + sanitized_name = OutputProcessor.sanitize_filename(file.filename) uploaded_files.append( { "id": file_id, - "name": file.filename, + "name": sanitized_name, "session_id": session_id, "content": None, # LibreChat doesn't return content in upload response "size": len(content), @@ -207,10 +208,12 @@ async def list_files( # Return simple file information simple_files = [] for file_info in files: + # Return sanitized filename to match container + sanitized_name = OutputProcessor.sanitize_filename(file_info.filename) simple_files.append( { "id": file_info.file_id, - "name": file_info.filename, + "name": sanitized_name, "path": file_info.path, } ) @@ -219,9 +222,11 @@ async def list_files( # Return full file details - LibreChat format detailed_files = [] for file_info in files: + # Return sanitized filename to match container + sanitized_name = OutputProcessor.sanitize_filename(file_info.filename) detailed_files.append( { - "name": file_info.filename, + "name": sanitized_name, "id": file_info.file_id, "session_id": session_id, "content": None, # Not returned in list diff --git a/src/services/execution/output.py 
b/src/services/execution/output.py index 71a97f7..8bcd108 100644 --- a/src/services/execution/output.py +++ b/src/services/execution/output.py @@ -1,6 +1,8 @@ """Output processing and validation for code execution.""" +import os import re +import secrets from pathlib import Path from typing import Any, Dict @@ -221,13 +223,55 @@ def format_error_message(cls, exit_code: int, stderr: str) -> str: return f"Execution failed (exit code {exit_code}):\n{stderr_clean}" @classmethod - def normalize_filename(cls, filename: str) -> str: - """Normalize filename for container use: replace spaces with underscores. + def sanitize_filename(cls, input_name: str) -> str: + """Sanitize filename to match LibreChat's sanitization logic. + + Replaces all non-alphanumeric characters (except '.' and '-') with + underscores. This ensures filenames on disk match what LibreChat + reports in the system prompt. - Important: we deliberately KEEP non-ASCII characters (e.g., Japanese) - so that user-visible filenames aren't transliterated. + Args: + input_name: Original filename (may include path components) + + Returns: + Sanitized filename safe for container use """ + if not input_name: + return "_" + try: - return filename.replace(" ", "_") if filename else filename - except Exception: - return filename + # Remove any directory components (path traversal prevention) + name = os.path.basename(input_name) + + # Replace any non-alphanumeric characters except for '.' 
and '-' + name = re.sub(r"[^a-zA-Z0-9.-]", "_", name) + + # Ensure the name doesn't start with a dot (hidden file in Unix) + if name.startswith(".") or name == "": + name = "_" + name + + # Limit the length of the filename + max_length = 255 + if len(name) > max_length: + ext = os.path.splitext(name)[1] + name_without_ext = os.path.splitext(name)[0] + random_suffix = secrets.token_hex(3) + truncate_len = max_length - len(ext) - 7 + if truncate_len < 1: + truncate_len = 1 + name = name_without_ext[:truncate_len] + "-" + random_suffix + ext + + return name + + except Exception as e: + logger.error(f"Failed to sanitize filename: {e}") + return "_" + + @classmethod + def normalize_filename(cls, filename: str) -> str: + """Deprecated: Use sanitize_filename instead. + + This method is kept for backward compatibility but delegates to + sanitize_filename which matches LibreChat's sanitization logic. + """ + return cls.sanitize_filename(filename) diff --git a/tests/unit/test_output_processor.py b/tests/unit/test_output_processor.py new file mode 100644 index 0000000..6b78c60 --- /dev/null +++ b/tests/unit/test_output_processor.py @@ -0,0 +1,119 @@ +"""Unit tests for the OutputProcessor.""" + +import pytest +from src.services.execution.output import OutputProcessor + + +class TestSanitizeFilename: + """Tests for the sanitize_filename method.""" + + def test_spaces_replaced_with_underscores(self): + """Test that spaces are replaced with underscores.""" + result = OutputProcessor.sanitize_filename("file with spaces.txt") + assert result == "file_with_spaces.txt" + + def test_parentheses_replaced_with_underscores(self): + """Test that parentheses are replaced with underscores.""" + result = OutputProcessor.sanitize_filename("manufacturing_analysis (v2).xlsx") + assert result == "manufacturing_analysis__v2_.xlsx" + + def test_special_characters_replaced(self): + """Test that special characters are replaced with underscores.""" + result = 
OutputProcessor.sanitize_filename("special@chars#here!.pdf") + assert result == "special_chars_here_.pdf" + + def test_already_valid_unchanged(self): + """Test that already valid filenames are unchanged.""" + result = OutputProcessor.sanitize_filename("already-valid.txt") + assert result == "already-valid.txt" + + def test_uppercase_preserved(self): + """Test that uppercase letters are preserved.""" + result = OutputProcessor.sanitize_filename("UPPERCASE.TXT") + assert result == "UPPERCASE.TXT" + + def test_numbers_preserved(self): + """Test that numbers are preserved.""" + result = OutputProcessor.sanitize_filename("123numbers.doc") + assert result == "123numbers.doc" + + def test_hidden_file_prefixed(self): + """Test that hidden files (starting with dot) get underscore prefix.""" + result = OutputProcessor.sanitize_filename(".hidden") + assert result == "_.hidden" + + def test_empty_string_returns_underscore(self): + """Test that empty string returns underscore.""" + result = OutputProcessor.sanitize_filename("") + assert result == "_" + + def test_none_returns_underscore(self): + """Test that None returns underscore.""" + result = OutputProcessor.sanitize_filename(None) + assert result == "_" + + def test_path_traversal_stripped(self): + """Test that path traversal attempts are stripped.""" + result = OutputProcessor.sanitize_filename("../../../etc/passwd") + assert result == "passwd" + + def test_absolute_path_stripped(self): + """Test that absolute paths are stripped to basename.""" + result = OutputProcessor.sanitize_filename("/absolute/path/file.txt") + assert result == "file.txt" + + def test_unicode_characters_replaced(self): + """Test that non-ASCII characters are replaced.""" + result = OutputProcessor.sanitize_filename("résumé.docx") + assert result == "r_sum_.docx" + + def test_brackets_replaced(self): + """Test that brackets are replaced with underscores.""" + result = OutputProcessor.sanitize_filename("[brackets].txt") + assert result == 
"_brackets_.txt"
+
+    def test_leading_parenthesis_prefixed(self):
+        """Test that filename starting with parenthesis is handled."""
+        result = OutputProcessor.sanitize_filename("(parentheses).txt")
+        assert result == "_parentheses_.txt"
+
+    def test_dashes_preserved(self):
+        """Test that dashes are preserved."""
+        result = OutputProcessor.sanitize_filename("file-name.with-dashes.txt")
+        assert result == "file-name.with-dashes.txt"
+
+    def test_dots_preserved(self):
+        """Test that dots in filename are preserved."""
+        result = OutputProcessor.sanitize_filename("file.name.multiple.dots.txt")
+        assert result == "file.name.multiple.dots.txt"
+
+    def test_simple_filename_unchanged(self):
+        """Test that simple alphanumeric filename is unchanged."""
+        result = OutputProcessor.sanitize_filename("SimpleFile123.pdf")
+        assert result == "SimpleFile123.pdf"
+
+    def test_long_filename_truncated(self):
+        """Test that filenames over 255 chars are truncated with a random suffix."""
+        long_name = "a" * 300 + ".txt"
+        result = OutputProcessor.sanitize_filename(long_name)
+        # Should be 255 chars or less
+        assert len(result) <= 255
+        # Should end with .txt
+        assert result.endswith(".txt")
+        # Should have a random suffix before extension
+        assert "-" in result
+
+
+class TestNormalizeFilename:
+    """Tests for the deprecated normalize_filename method."""
+
+    def test_delegates_to_sanitize_filename(self):
+        """Test that normalize_filename delegates to sanitize_filename."""
+        result = OutputProcessor.normalize_filename("file with spaces.txt")
+        expected = OutputProcessor.sanitize_filename("file with spaces.txt")
+        assert result == expected
+
+    def test_parentheses_now_replaced(self):
+        """Test that normalize_filename now also replaces parentheses."""
+        result = OutputProcessor.normalize_filename("file (v2).xlsx")
+        assert result == "file__v2_.xlsx"