diff --git a/.env.example b/.env.example index 6226512..fcef2d9 100644 --- a/.env.example +++ b/.env.example @@ -1,51 +1,182 @@ -# Copy this file to .env and update with your values +# Rendiff FFmpeg API - Production Environment Configuration +# Copy this file to .env and configure for your environment -# Database Configuration -POSTGRES_PASSWORD=your_secure_password_here -POSTGRES_USER=ffmpeg_user -POSTGRES_DB=ffmpeg_api -DATABASE_URL=postgresql://ffmpeg_user:your_secure_password_here@postgres:5432/ffmpeg_api +# ============================================================================= +# CORE APPLICATION SETTINGS +# ============================================================================= -# Redis/Queue Configuration -REDIS_URL=redis://redis:6379/0 +# Application Environment +DEBUG=false +TESTING=false +VERSION=1.0.0 -# API Configuration +# API Server Configuration API_HOST=0.0.0.0 API_PORT=8000 API_WORKERS=4 -LOG_LEVEL=info -DEBUG=false +API_RELOAD=false +API_LOG_LEVEL=info + +# ============================================================================= +# DATABASE CONFIGURATION +# ============================================================================= + +# Production PostgreSQL (Recommended) +DATABASE_URL=postgresql://ffmpeg_user:your_secure_password@postgres:5432/ffmpeg_api + +# Alternative: SQLite (Development Only) +# DATABASE_URL=sqlite+aiosqlite:///data/rendiff.db + +# Database Pool Settings +DATABASE_POOL_SIZE=20 +DATABASE_MAX_OVERFLOW=40 + +# ============================================================================= +# QUEUE & CACHE CONFIGURATION +# ============================================================================= + +# Redis Configuration +REDIS_URL=redis://redis:6379/0 +REDIS_MAX_CONNECTIONS=100 # Worker Configuration WORKER_CONCURRENCY=4 -CPU_WORKERS=2 -GPU_WORKERS=0 +WORKER_PREFETCH_MULTIPLIER=1 +WORKER_MAX_TASKS_PER_CHILD=100 +WORKER_TASK_TIME_LIMIT=21600 -# Storage Configuration -STORAGE_PATH=./storage +# ============================================================================= +# STORAGE CONFIGURATION +# ============================================================================= + +# Storage Paths STORAGE_CONFIG=/app/config/storage.yml +STORAGE_PATH=./storage +TEMP_PATH=/tmp/rendiff + +# Data Persistence Paths (for Docker volumes) +POSTGRES_DATA_PATH=./data/postgres +REDIS_DATA_PATH=./data/redis +PROMETHEUS_DATA_PATH=./data/prometheus +GRAFANA_DATA_PATH=./data/grafana + +# ============================================================================= +# FFMPEG & PROCESSING CONFIGURATION +# ============================================================================= -# Security Configuration -ADMIN_API_KEYS=your_admin_key_1,your_admin_key_2 -RENDIFF_API_KEYS=your_client_key_1,your_client_key_2,your_client_key_3 +# FFmpeg Settings +FFMPEG_THREADS=0 +FFMPEG_PRESET=medium +FFMPEG_CRF=23 +FFMPEG_HARDWARE_ACCELERATION=auto + +# ============================================================================= +# SECURITY & AUTHENTICATION +# ============================================================================= + +# API Security +API_KEY_HEADER=X-API-Key ENABLE_API_KEYS=true -CORS_ORIGINS=https://localhost,http://localhost +ENABLE_IP_WHITELIST=false +IP_WHITELIST=10.0.0.0/8,192.168.0.0/16 + +# Admin API Keys (comma-separated) +ADMIN_API_KEYS=your_admin_key_here + +# Rate Limiting +ENABLE_RATE_LIMITING=true +RATE_LIMIT_CALLS=2000 +RATE_LIMIT_PERIOD=3600 -# SSL/HTTPS Configuration (for production) -DOMAIN_NAME=localhost 
-CERTBOT_EMAIL=admin@example.com -CERT_RESOLVER=letsencrypt -LETSENCRYPT_STAGING=false +# CORS Origins (comma-separated) +API_CORS_ORIGINS=http://localhost,https://localhost,https://yourdomain.com +API_TRUSTED_HOSTS=localhost,yourdomain.com -# Monitoring Configuration -GRAFANA_PASSWORD=your_grafana_password_here -PROMETHEUS_AUTH=admin:your_prometheus_password_here -TRAEFIK_AUTH=admin:your_traefik_password_here +# Database Passwords +POSTGRES_PASSWORD=your_secure_postgres_password -# Resource Limits +# ============================================================================= +# MONITORING & OBSERVABILITY +# ============================================================================= + +# Metrics & Monitoring +ENABLE_METRICS=true +METRICS_PORT=9000 +ENABLE_TRACING=false +TRACING_ENDPOINT= + +# Grafana Configuration +GRAFANA_PASSWORD=your_secure_grafana_password + +# ============================================================================= +# RESOURCE LIMITS +# ============================================================================= + +# Upload & Processing Limits MAX_UPLOAD_SIZE=10737418240 -MAX_CONCURRENT_JOBS_PER_KEY=10 MAX_JOB_DURATION=21600 +MAX_CONCURRENT_JOBS_PER_KEY=10 +JOB_RETENTION_DAYS=7 + +# ============================================================================= +# WEBHOOKS & NOTIFICATIONS +# ============================================================================= + +# Webhook Configuration +WEBHOOK_TIMEOUT=30 +WEBHOOK_MAX_RETRIES=3 +WEBHOOK_RETRY_DELAY=60 + +# ============================================================================= +# OPTIONAL SERVICES +# ============================================================================= + +# Virus Scanning (Optional) +ENABLE_VIRUS_SCAN=false +CLAMAV_HOST= +CLAMAV_PORT=3310 + +# ============================================================================= +# DEPLOYMENT SPECIFIC +# ============================================================================= + +# Docker Compose Profiles +# Uncomment the profile you want to use: +# COMPOSE_PROFILES=standard # Standard CPU-only deployment +# COMPOSE_PROFILES=gpu # GPU-accelerated deployment +# COMPOSE_PROFILES=monitoring # Include Prometheus/Grafana +# COMPOSE_PROFILES=gpu,monitoring # GPU + Monitoring + +# Network Configuration +# COMPOSE_PROJECT_NAME=ffmpeg-api + +# ============================================================================= +# CLOUD STORAGE (Optional) +# ============================================================================= + +# AWS S3 +# AWS_ACCESS_KEY_ID=your_access_key +# AWS_SECRET_ACCESS_KEY=your_secret_key +# AWS_DEFAULT_REGION=us-west-2 +# S3_BUCKET_NAME=your-video-bucket + +# Azure Blob Storage +# AZURE_STORAGE_ACCOUNT=your_account +# AZURE_STORAGE_KEY=your_key +# AZURE_CONTAINER_NAME=videos + +# Google Cloud Storage +# GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json +# GCS_BUCKET_NAME=your-video-bucket + +# ============================================================================= +# SSL/TLS CONFIGURATION +# ============================================================================= + +# SSL Certificate Paths (for production) +# SSL_CERT_PATH=/etc/ssl/certs/your-cert.crt +# SSL_KEY_PATH=/etc/ssl/private/your-key.key -# External URLs -EXTERNAL_URL=https://localhost \ No newline at end of file +# Let's Encrypt (for automatic SSL) +# LETSENCRYPT_EMAIL=admin@yourdomain.com +# LETSENCRYPT_HOST=yourdomain.com \ No newline at end of file diff --git a/.github/workflows/yaml-lint.yml 
b/.github/workflows/yaml-lint.yml new file mode 100644 index 0000000..0dcdf2f --- /dev/null +++ b/.github/workflows/yaml-lint.yml @@ -0,0 +1,133 @@ +name: YAML Lint + +on: + push: + branches: [ main, develop ] + paths: + - '**/*.yml' + - '**/*.yaml' + - '.github/workflows/*.yml' + pull_request: + branches: [ main, develop ] + paths: + - '**/*.yml' + - '**/*.yaml' + - '.github/workflows/*.yml' + +jobs: + yaml-lint: + name: YAML Lint + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install yamllint + run: | + python -m pip install --upgrade pip + pip install yamllint + + - name: Create yamllint config + run: | + cat > .yamllint.yml << EOF + extends: default + + rules: + # Allow longer lines for docker compose + line-length: + max: 120 + level: warning + + # Allow multiple spaces after operators + colons: + max-spaces-before: 0 + max-spaces-after: -1 + + # Allow indentation of 2 or 4 spaces + indentation: + spaces: consistent + indent-sequences: true + check-multi-line-strings: false + + # Don't require document start markers + document-start: disable + + # Allow empty values + empty-values: + forbid-in-block-mappings: false + forbid-in-flow-mappings: false + + # Allow trailing spaces in comments + trailing-spaces: + level: warning + + # Allow truthy values like 'yes', 'on', etc. + truthy: + allowed-values: ['true', 'false', 'yes', 'no', 'on', 'off'] + check-keys: false + EOF + + - name: Lint YAML files + run: | + echo "Linting YAML files..." + find . -type f \( -name "*.yml" -o -name "*.yaml" \) -not -path "./.git/*" | while read file; do + echo "Checking: $file" + yamllint "$file" + done + + - name: Validate Docker Compose files + run: | + echo "Validating Docker Compose syntax..." + + # Check main compose file + if [ -f "compose.yml" ]; then + echo "Validating compose.yml..." + docker compose -f compose.yml config > /dev/null + fi + + # Check production compose file + if [ -f "compose.prod.yml" ]; then + echo "Validating compose.prod.yml..." + docker compose -f compose.prod.yml config > /dev/null + fi + + # Check stable compose file + if [ -f "compose.stable.yml" ]; then + echo "Validating compose.stable.yml..." + docker compose -f compose.stable.yml config > /dev/null + fi + + # Check override file + if [ -f "compose.override.yml" ]; then + echo "Validating compose.override.yml..." + docker compose -f compose.yml -f compose.override.yml config > /dev/null + fi + + echo "All Docker Compose files are valid!" + + - name: Check for common issues + run: | + echo "Checking for common Docker Compose issues..." + + # Check for deprecated version field + if grep -r "version:" . --include="*.yml" --include="*.yaml" --exclude-dir=.git; then + echo "::warning::Found 'version:' field in compose files. This is deprecated in modern Docker Compose." + fi + + # Check for hardcoded localhost + if grep -r "localhost" . --include="compose*.yml" --exclude-dir=.git; then + echo "::warning::Found hardcoded 'localhost' in compose files. Consider using service names." + fi + + # Check for missing health checks on databases + if grep -A 10 "image.*postgres" . --include="compose*.yml" --exclude-dir=.git | grep -v "healthcheck:" > /dev/null; then + echo "::warning::PostgreSQL services should have health checks defined." + fi + + echo "Common issues check completed!" 
\ No newline at end of file diff --git a/.structure.md b/.structure.md new file mode 100644 index 0000000..76146c9 --- /dev/null +++ b/.structure.md @@ -0,0 +1,29 @@ +# Optimized Repository Structure + +## Core Structure +``` +ffmpeg-api/ +โ”œโ”€โ”€ ๐Ÿ“ src/ # Source code (renamed from api/) +โ”‚ โ”œโ”€โ”€ ๐Ÿ“ api/ # API layer +โ”‚ โ”œโ”€โ”€ ๐Ÿ“ core/ # Core business logic +โ”‚ โ”œโ”€โ”€ ๐Ÿ“ models/ # Data models +โ”‚ โ”œโ”€โ”€ ๐Ÿ“ services/ # Business services +โ”‚ โ””โ”€โ”€ ๐Ÿ“ utils/ # Utilities +โ”œโ”€โ”€ ๐Ÿ“ workers/ # Worker processes +โ”œโ”€โ”€ ๐Ÿ“ tests/ # Test suite +โ”œโ”€โ”€ ๐Ÿ“ deployment/ # Deployment configs +โ”‚ โ”œโ”€โ”€ ๐Ÿ“ docker/ # Docker configurations +โ”‚ โ”œโ”€โ”€ ๐Ÿ“ k8s/ # Kubernetes manifests +โ”‚ โ””โ”€โ”€ ๐Ÿ“ compose/ # Docker Compose files +โ”œโ”€โ”€ ๐Ÿ“ config/ # Configuration files +โ”œโ”€โ”€ ๐Ÿ“ docs/ # Documentation +โ”œโ”€โ”€ ๐Ÿ“ scripts/ # Utility scripts +โ””โ”€โ”€ ๐Ÿ“ monitoring/ # Monitoring and observability + +## Changes Made: +1. Consolidated API code under src/ +2. Moved deployment files to deployment/ +3. Cleaned up root directory +4. Better separation of concerns +5. Removed redundant files +``` \ No newline at end of file diff --git a/.yamllint.yml b/.yamllint.yml new file mode 100644 index 0000000..47ad4cc --- /dev/null +++ b/.yamllint.yml @@ -0,0 +1,64 @@ +# YAML Lint Configuration for FFmpeg API +# https://yamllint.readthedocs.io/en/stable/configuration.html + +extends: default + +rules: + # Allow longer lines for docker compose and configuration files + line-length: + max: 120 + level: warning + + # Allow multiple spaces after colons for alignment + colons: + max-spaces-before: 0 + max-spaces-after: -1 + + # Allow consistent indentation (2 or 4 spaces) + indentation: + spaces: consistent + indent-sequences: true + check-multi-line-strings: false + + # Don't require document start markers (---) + document-start: disable + + # Allow empty values in mappings + empty-values: + forbid-in-block-mappings: false + forbid-in-flow-mappings: false + + # Make trailing spaces a warning instead of error + trailing-spaces: + level: warning + + # Allow common truthy values used in Docker Compose + truthy: + allowed-values: ['true', 'false', 'yes', 'no', 'on', 'off'] + check-keys: false + + # Allow brackets for flow sequences + brackets: + min-spaces-inside: 0 + max-spaces-inside: 1 + + # Allow braces for flow mappings + braces: + min-spaces-inside: 0 + max-spaces-inside: 1 + + # Allow comments to be closer to content + comments: + min-spaces-from-content: 1 + + # Don't require spacing around hyphens in flow sequences + hyphens: + max-spaces-after: 1 + +# Ignore specific files/patterns +ignore: | + .git/ + node_modules/ + .venv/ + __pycache__/ + *.pyc \ No newline at end of file diff --git a/AUDIT_REPORT.md b/AUDIT_REPORT.md index c6fcf91..3f9d579 100644 --- a/AUDIT_REPORT.md +++ b/AUDIT_REPORT.md @@ -186,7 +186,7 @@ api/ โ”œโ”€โ”€ models/ # Database models โ”œโ”€โ”€ middleware/ # Request/response middleware โ”œโ”€โ”€ utils/ # Utility functions -โ””โ”€โ”€ genai/ # AI processing services +โ””โ”€โ”€ gpu/ # Hardware acceleration services tests/ โ”œโ”€โ”€ unit/ # Unit tests @@ -383,7 +383,7 @@ tests/ 4. **Backup Testing:** Monthly backup restoration tests ### Future Enhancements: -1. **Advanced AI Features:** Expand machine learning capabilities +1. **Advanced Hardware Features:** Expand GPU acceleration capabilities 2. **Multi-Region:** Consider global deployment for scalability 3. **Advanced Analytics:** Business intelligence and reporting 4. 
**API Versioning:** Prepare for future API evolution diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 43e6c85..cd6cc43 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1531,7 +1531,7 @@ export LOG_LEVEL=INFO ./scripts/manage-ssl.sh generate-letsencrypt your-domain.com admin@domain.com # Deploy with HTTPS -docker-compose -f docker-compose.prod.yml up -d +docker compose -f compose.prod.yml up -d # Verify SSL configuration ./scripts/manage-ssl.sh validate your-domain.com @@ -1541,7 +1541,7 @@ docker-compose -f docker-compose.prod.yml up -d #### Horizontal Scaling ```yaml -# docker-compose.scale.yml +# compose.scale.yml version: '3.8' services: api: diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index 3f99141..5fa27a3 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -42,7 +42,7 @@ The Rendiff FFmpeg API is a **production-ready**, **fully containerized** video # Clone and deploy - no setup required! git clone https://github.com/rendiffdev/ffmpeg-api.git cd ffmpeg-api -docker-compose up -d +docker compose up -d # That's it! The API is now running at http://localhost:8080 ``` @@ -51,7 +51,7 @@ docker-compose up -d ```bash git clone https://github.com/rendiffdev/ffmpeg-api.git cd ffmpeg-api -docker-compose -f docker-compose.yml -f docker-compose.genai.yml up -d +docker compose -f compose.yml -f docker-compose.genai.yml up -d ``` ### What Happens Automatically: @@ -137,19 +137,19 @@ openssl rand -hex 32 ### Option 1: Development/Testing Setup ```bash # Uses SQLite + local storage - perfect for testing -docker-compose -f docker-compose.setup.yml up -d +docker compose -f docker-compose.setup.yml up -d ``` ### Option 2: Production Deployment ```bash # Full production stack with PostgreSQL and monitoring -docker-compose --profile postgres --profile monitoring up -d +docker compose --profile postgres --profile monitoring up -d ``` ### Option 3: Production with AI Features ```bash # Requires NVIDIA GPU with CUDA support -docker-compose -f docker-compose.yml -f docker-compose.genai.yml up -d +docker compose -f compose.yml -f docker-compose.genai.yml up -d ``` ### Option 4: Auto-Setup with Cloud Storage @@ -159,7 +159,7 @@ RENDIFF_AUTO_SETUP=true \ AWS_ACCESS_KEY_ID=your_key \ AWS_SECRET_ACCESS_KEY=your_secret \ AWS_S3_BUCKET=your-bucket \ -docker-compose up -d +docker compose up -d ``` --- @@ -288,7 +288,7 @@ curl http://localhost:8080/api/v1/storage ### 1. Hardware Acceleration ```bash # For GPU acceleration -docker-compose --profile gpu up -d +docker compose --profile gpu up -d # Verify GPU availability curl http://localhost:8080/api/v1/capabilities @@ -297,10 +297,10 @@ curl http://localhost:8080/api/v1/capabilities ### 2. Worker Scaling ```bash # Scale CPU workers -docker-compose up -d --scale worker-cpu=6 +docker compose up -d --scale worker-cpu=6 # Scale GPU workers (if available) -docker-compose up -d --scale worker-gpu=2 +docker compose up -d --scale worker-gpu=2 # Monitor worker utilization curl http://localhost:8080/api/v1/workers @@ -308,7 +308,7 @@ curl http://localhost:8080/api/v1/workers ### 3. Resource Limits ```yaml -# Configure in docker-compose.yml +# Configure in compose.yml deploy: resources: limits: @@ -335,13 +335,13 @@ http://localhost:3000 ### 2. View Logs ```bash # API logs -docker-compose logs -f api +docker compose logs -f api # Worker logs -docker-compose logs -f worker-cpu +docker compose logs -f worker-cpu # All services -docker-compose logs -f +docker compose logs -f ``` ### 3.
Performance Metrics @@ -360,12 +360,12 @@ curl http://localhost:8080/api/v1/metrics ### Services Won't Start ```bash # Check service status -docker-compose ps +docker compose ps # View detailed logs -docker-compose logs api -docker-compose logs postgres -docker-compose logs redis +docker compose logs api +docker compose logs postgres +docker compose logs redis # Verify port availability netstat -tlnp | grep -E "(8080|5432|6379)" @@ -374,14 +374,14 @@ netstat -tlnp | grep -E "(8080|5432|6379)" ### Database Connection Issues ```bash # Check PostgreSQL status -docker-compose exec postgres pg_isready +docker compose exec postgres pg_isready # Verify database creation -docker-compose exec postgres psql -U rendiff -d rendiff -c "\dt" +docker compose exec postgres psql -U rendiff -d rendiff -c "\dt" # Reset database (WARNING: data loss) -docker-compose down -v -docker-compose up -d +docker compose down -v +docker compose up -d ``` ### Storage Connection Errors @@ -390,16 +390,16 @@ docker-compose up -d curl http://localhost:8080/api/v1/storage/test # Check permissions -docker-compose exec api ls -la /app/storage +docker compose exec api ls -la /app/storage ``` ### FFmpeg Processing Failures ```bash # Verify FFmpeg installation -docker-compose exec api ffmpeg -version +docker compose exec api ffmpeg -version # Check available codecs -docker-compose exec api ffmpeg -encoders +docker compose exec api ffmpeg -encoders ``` ### Performance Issues @@ -411,7 +411,7 @@ docker stats curl http://localhost:8080/api/v1/workers # Scale workers if needed -docker-compose up -d --scale worker-cpu=8 +docker compose up -d --scale worker-cpu=8 ``` --- diff --git a/PRODUCTION_READINESS_AUDIT.md b/PRODUCTION_READINESS_AUDIT.md index 12ab826..c8d054b 100644 --- a/PRODUCTION_READINESS_AUDIT.md +++ b/PRODUCTION_READINESS_AUDIT.md @@ -213,7 +213,7 @@ The ffmpeg-api project demonstrates **strong architectural foundations** but has #### Findings: **Strengths:** - Excellent Docker containerization -- Comprehensive docker-compose configurations +- Comprehensive docker compose configurations - Multi-environment support - Proper service orchestration with Traefik diff --git a/README.md b/README.md index b68997f..ca9f198 100644 --- a/README.md +++ b/README.md @@ -1,352 +1,325 @@ -# Rendiff FFmpeg API +# Production-Ready FFmpeg API [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![Python Version](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/) +[![Python 3.12+](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/) [![Docker](https://img.shields.io/badge/docker-%230db7ed.svg?logo=docker&logoColor=white)](https://www.docker.com/) -[![FFmpeg](https://img.shields.io/badge/FFmpeg-6.0-green)](https://ffmpeg.org/) -[![PostgreSQL](https://img.shields.io/badge/PostgreSQL-15-blue)](https://postgresql.org/) -[![Redis](https://img.shields.io/badge/Redis-7-red)](https://redis.io/) +[![FastAPI](https://img.shields.io/badge/FastAPI-005571?logo=fastapi)](https://fastapi.tiangolo.com/) +[![FFmpeg 6.0+](https://img.shields.io/badge/FFmpeg-6.0%2B-green)](https://ffmpeg.org/) -> **๐Ÿš€ Production-Ready FFmpeg API with AI Enhancement** +> **๐Ÿš€ Enterprise-Grade FFmpeg Processing API** -A comprehensive, containerized FFmpeg processing API with optional AI features. Deploy with a single command - everything from development to enterprise production with GPU acceleration. 
+A high-performance, production-ready FFmpeg API designed to replace complex CLI workflows with a modern, scalable, developer-friendly solution. Built for professional video processing with enterprise features. -## โœจ Core Features +## โœจ Key Features -- **๐ŸŽฌ Complete FFmpeg API** - Process video/audio with RESTful endpoints -- **โšก Async Processing** - Background jobs with real-time progress tracking -- **๐Ÿค– AI Enhancement** - Optional GPU-accelerated AI features (upscaling, analysis) -- **โ˜๏ธ Multi-Cloud Storage** - S3, Azure, GCP, and local filesystem support -- **๐Ÿ“Š Quality Analysis** - VMAF, PSNR, SSIM metrics and AI-powered insights -- **๐Ÿ›ก๏ธ Production Security** - API keys, HTTPS, rate limiting, monitoring -- **๐Ÿ“ˆ Observability** - Prometheus metrics, Grafana dashboards, health checks -- **๐Ÿณ Docker Native** - Complete containerization with auto-scaling +- **๐ŸŽฌ Complete FFmpeg Capability** - Full CLI parity with REST API convenience +- **โšก Hardware Acceleration** - NVENC, QSV, VAAPI, VideoToolbox support +- **๐Ÿ“Š Quality Metrics** - Built-in VMAF, PSNR, SSIM analysis +- **๐Ÿ”„ Async Processing** - Non-blocking operations with real-time progress +- **๐Ÿ›ก๏ธ Enterprise Security** - API keys, rate limiting, input validation +- **๐Ÿ“ˆ Production Monitoring** - Prometheus metrics, health checks, alerting +- **๐ŸŒ Multi-Cloud Storage** - S3, Azure, GCP, and local filesystem +- **๐Ÿณ Container Native** - Optimized Docker deployment with orchestration ## ๐Ÿš€ Quick Start -### Choose Your Setup Type +### 1. Clone & Deploy (60 seconds) ```bash -# Clone repository git clone https://github.com/rendiffdev/ffmpeg-api.git cd ffmpeg-api -# Single command setup - choose your deployment type: -./setup.sh --development # Quick local development -./setup.sh --standard # Production (PostgreSQL, Redis, monitoring) -./setup.sh --genai # AI-enhanced (GPU support, AI models) -./setup.sh --interactive # Interactive setup wizard +# Choose your deployment type +./setup.sh --development # Local development (SQLite) +./setup.sh --standard # Production (PostgreSQL + Redis) +./setup.sh --gpu # Hardware accelerated processing ``` -**That's it!** Your API will be running at: -- Development: `http://localhost:8000` -- Production: `https://localhost` (HTTPS with self-signed certificate) - -### ๐Ÿƒโ€โ™‚๏ธ Development (60 seconds) -Perfect for testing and local development: +### 2. Access Your API ```bash -./setup.sh --development +# API available at +curl http://localhost:8000/api/v1/health + +# Interactive documentation +open http://localhost:8000/docs ``` -**Features:** SQLite, local storage, no auth required, debug mode -### ๐Ÿญ Standard Production -Enterprise-ready deployment: +### 3. 
First Video Conversion ```bash -./setup.sh --standard +curl -X POST "http://localhost:8000/api/v1/convert" \ + -H "Content-Type: application/json" \ + -d '{ + "input": "/path/to/input.mp4", + "output": "/path/to/output.webm", + "operations": [ + {"type": "transcode", "params": {"video_codec": "vp9", "crf": 30}} + ] + }' ``` -**Features:** PostgreSQL, Redis, monitoring, API keys, HTTPS by default, 2 CPU workers -### ๐Ÿค– AI-Enhanced Production -GPU-accelerated AI features: +## ๐Ÿ“‹ Deployment Options + +| Type | Use Case | Setup Time | Features | +|------|----------|------------|-----------| +| **Development** | Local testing | 60 seconds | SQLite, Debug mode, No auth | +| **Standard** | Production CPU | 3 minutes | PostgreSQL, Redis, HTTPS, Monitoring | +| **GPU** | Hardware accelerated | 5 minutes | Everything + NVENC/QSV/VAAPI | + +## ๐ŸŽฏ API Capabilities + +### Core Processing Endpoints -```bash -./setup.sh --genai -``` -**Features:** Everything in Standard + GPU workers, AI models, upscaling, scene analysis - -## ๐Ÿ“‹ Deployment Comparison - -| Feature | Development | Standard | GenAI | -|---------|------------|----------|-------| -| **Setup Time** | 1 minute | 3 minutes | 10 minutes | -| **Database** | SQLite | PostgreSQL | PostgreSQL | -| **Queue** | Redis | Redis | Redis | -| **Authentication** | Disabled | API Keys | API Keys | -| **HTTPS/SSL** | โŒ | โœ… (Self-signed + Let's Encrypt) | โœ… (Self-signed + Let's Encrypt) | -| **Monitoring** | Basic | Full (Prometheus/Grafana) | Full | -| **Workers** | 1 CPU | 2 CPU | 2 CPU + 1 GPU | -| **AI Features** | โŒ | โŒ | โœ… | -| **GPU Support** | โŒ | โŒ | โœ… | -| **Production Ready** | โŒ | โœ… | โœ… | - -## ๐ŸŽฏ API Endpoints - -### Core Processing ```http -POST /api/v1/convert # Universal media conversion -POST /api/v1/analyze # Quality analysis (VMAF, PSNR, SSIM) -POST /api/v1/stream # Generate HLS/DASH streaming -POST /api/v1/estimate # Processing time estimates +POST /api/v1/convert # Universal media conversion +POST /api/v1/analyze # Quality metrics (VMAF, PSNR, SSIM) +POST /api/v1/stream # HLS/DASH adaptive streaming +POST /api/v1/estimate # Processing time/cost estimation ``` ### Job Management -```http -GET /api/v1/jobs # List and filter jobs -GET /api/v1/jobs/{id} # Job status and progress -GET /api/v1/jobs/{id}/events # Real-time progress (SSE) -DELETE /api/v1/jobs/{id} # Cancel job -``` -### AI Features (GenAI Setup) ```http -POST /api/genai/v1/enhance/upscale # Real-ESRGAN 2x/4x upscaling -POST /api/genai/v1/analyze/scenes # AI scene detection -POST /api/genai/v1/optimize/parameters # Smart encoding optimization -POST /api/genai/v1/predict/quality # Quality prediction +GET /api/v1/jobs # List and filter jobs +GET /api/v1/jobs/{id} # Job status and progress +GET /api/v1/jobs/{id}/events # Real-time progress (SSE) +DELETE /api/v1/jobs/{id} # Cancel job ``` ### System & Health + ```http -GET /api/v1/health # Service health check -GET /api/v1/capabilities # Supported formats and features -GET /docs # Interactive API documentation +GET /api/v1/health # Health check +GET /api/v1/capabilities # Supported formats and features +GET /docs # Interactive API documentation ``` -## ๐Ÿ”ง Configuration & Management +## ๐Ÿ—๏ธ Professional Features -### API Key Management -```bash -# Generate secure API keys -./scripts/manage-api-keys.sh generate +### Hardware Acceleration -# List current keys (masked) -./scripts/manage-api-keys.sh list +- **NVIDIA NVENC/NVDEC** - GPU encoding and decoding +- **Intel Quick Sync Video** -
Hardware-accelerated processing +- **AMD VCE/VCN** - Advanced media framework +- **Apple VideoToolbox** - macOS hardware acceleration -# Test API access (development) -curl -H "X-API-Key: your-key" http://localhost:8000/api/v1/health - -# Test API access (production - HTTPS) -curl -k -H "X-API-Key: your-key" https://localhost/api/v1/health -``` +### Quality Analysis -### HTTPS/SSL Setup +- **VMAF** - Perceptual video quality measurement +- **PSNR** - Peak Signal-to-Noise Ratio +- **SSIM** - Structural Similarity Index +- **Bitrate Analysis** - Compression efficiency metrics -**๐Ÿ”’ HTTPS is enabled by default in ALL production deployments** with self-signed certificates. - -#### SSL Certificate Options: - -**Self-signed (Default)** - Works immediately: -```bash -./setup.sh --standard # HTTPS ready with self-signed cert -``` +### Enterprise Security -**Let's Encrypt (Production)** - Free trusted certificates: -```bash -# Configure your domain -export DOMAIN_NAME=api.yourdomain.com -export CERTBOT_EMAIL=admin@yourdomain.com +- **API Key Authentication** with role-based permissions +- **Rate Limiting** with configurable thresholds +- **Input Validation** prevents command injection +- **HTTPS/SSL** with automatic certificate management +- **Security Headers** (HSTS, CSP, XSS protection) -# Setup with Let's Encrypt -./setup.sh --interactive # Choose HTTPS option during setup -``` +### Production Monitoring -**Commercial SSL** - EV/OV certificates: -```bash -# Install commercial certificate -./scripts/enhanced-ssl-manager.sh install-commercial cert.crt private.key -``` +- **Prometheus Metrics** - 50+ metrics tracked +- **Grafana Dashboards** - Real-time visualization +- **Health Checks** - Comprehensive system monitoring +- **Structured Logging** - Centralized log management +- **Alerting Rules** - Proactive issue detection -**Comprehensive SSL Management:** -```bash -# Show all SSL management options -./scripts/enhanced-ssl-manager.sh --help - -# Monitor SSL certificates -./scripts/enhanced-ssl-manager.sh monitor-start +## ๐Ÿณ Docker Architecture -# Test SSL configuration -./scripts/enhanced-ssl-manager.sh test-ssl yourdomain.com +```yaml +Production Stack: +โ”œโ”€โ”€ Traefik (SSL/Load Balancer) +โ”œโ”€โ”€ KrakenD (API Gateway) +โ”œโ”€โ”€ FastAPI (Core API) +โ”œโ”€โ”€ Celery Workers (CPU/GPU) +โ”œโ”€โ”€ PostgreSQL (Database) +โ”œโ”€โ”€ Redis (Queue/Cache) +โ”œโ”€โ”€ Prometheus (Metrics) +โ””โ”€โ”€ Grafana (Monitoring) ``` -### Monitoring & Health -```bash -# Check deployment status -./setup.sh --status +### Container Features -# Validate configuration -./setup.sh --validate +- **Multi-stage builds** for optimized images +- **Security hardening** with non-root users +- **Health checks** with automatic restarts +- **Resource limits** and monitoring +- **Log rotation** and management -# Health check all services -./scripts/health-check.sh +## ๐Ÿ“Š Format Support -# View logs -docker-compose logs -f api -``` - -## ๐Ÿ“Š What's Included +### Input Formats -### ๐Ÿ”ง **Core Infrastructure** -- **FastAPI** - Modern async web framework -- **Celery** - Distributed task processing -- **PostgreSQL 15** - Production database with optimizations -- **Redis 7** - Queue and caching layer -- **FFmpeg 6.0** - Latest video processing capabilities +**Video:** MP4, AVI, MOV, MKV, WebM, FLV, WMV, MPEG, TS, VOB, 3GP, MXF +**Audio:** MP3, WAV, FLAC, AAC, OGG, WMA, M4A, Opus, ALAC, DTS -### ๐Ÿ›ก๏ธ **Security & Production** -- **API Key Authentication** with rotation support -- **Rate Limiting** at gateway and 
application level -- **HTTPS/SSL** with automatic Let's Encrypt certificates -- **Security Headers** (HSTS, CSP, XSS protection) -- **Network Isolation** via Docker networks -- **Resource Limits** and health monitoring +### Output Formats -### ๐Ÿ“ˆ **Monitoring & Observability** -- **Prometheus** metrics collection -- **Grafana** dashboards and visualizations -- **Structured Logging** with correlation IDs -- **Health Checks** for all services -- **Real-time Progress** via Server-Sent Events +**Containers:** MP4, WebM, MKV, MOV, HLS, DASH, AVI +**Video Codecs:** H.264, H.265/HEVC, VP9, AV1, ProRes +**Audio Codecs:** AAC, MP3, Opus, Vorbis, FLAC -### ๐Ÿค– **AI Features (Optional)** -- **Real-ESRGAN** - Video/image upscaling (2x, 4x) -- **VideoMAE** - Scene detection and analysis -- **VMAF Integration** - Perceptual quality metrics -- **Smart Encoding** - AI-optimized compression settings -- **Content Analysis** - Complexity and scene classification +## ๐Ÿ”ง Configuration -## ๐Ÿ—๏ธ Architecture +### Environment Variables -``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ Traefik โ”‚โ”€โ”€โ”€โ”€โ”‚ KrakenD โ”‚โ”€โ”€โ”€โ”€โ”‚ FastAPI โ”‚ -โ”‚ (SSL/Proxy) โ”‚ โ”‚ (Gateway) โ”‚ โ”‚ (Core API) โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - โ”‚ -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ PostgreSQL โ”‚ โ”‚ Redis โ”‚ โ”‚ Celery โ”‚ -โ”‚ (Database) โ”‚ โ”‚ (Queue) โ”‚ โ”‚ (Workers) โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - โ”‚ -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ Prometheus โ”‚ โ”‚ Grafana โ”‚ โ”‚ GPU Workers โ”‚ -โ”‚ (Metrics) โ”‚ โ”‚ (Dashboards) โ”‚ โ”‚ (AI/GenAI) โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +```bash +# Core Configuration +API_HOST=0.0.0.0 +API_PORT=8000 +DEBUG=false + +# Database +DATABASE_URL=postgresql://user:pass@localhost:5432/ffmpeg_api +REDIS_URL=redis://localhost:6379/0 + +# Security +ENABLE_API_KEYS=true +RATE_LIMIT_CALLS=2000 +RATE_LIMIT_PERIOD=3600 + +# FFmpeg +FFMPEG_HARDWARE_ACCELERATION=auto +FFMPEG_THREADS=0 ``` -## ๐Ÿ“– Documentation +### Advanced Configuration -| Document | Description | -|----------|-------------| -| **[Setup Guide](docs/SETUP.md)** | Complete setup documentation for all deployment types | -| **[API Reference](docs/API.md)** | Detailed API endpoint documentation | -| **[Installation Guide](docs/INSTALLATION.md)** | Advanced installation and configuration | -| **[Production Setup](docs/SETUP.md#production-setup)** | Production deployment best practices | -| **[HTTPS/SSL Setup](docs/SETUP.md#httpssl-configuration)** | Security configuration and best practices | - -## ๐ŸŽฏ Use Cases - -### ๐ŸŽฌ **Media Companies** -- Automated video transcoding pipelines -- Quality analysis and optimization -- Multi-format delivery (HLS, DASH, MP4) -- AI-enhanced upscaling for archive content - -### ๐Ÿ“บ 
**Streaming Platforms** -- Adaptive bitrate ladder generation -- Real-time encoding for live streams -- Content analysis for recommendation engines -- Automated thumbnail generation - -### ๐Ÿข **Enterprise** -- Internal video processing workflows -- Compliance and quality monitoring -- Cost optimization through intelligent encoding -- Integration with existing media management systems - -### ๐Ÿ”ฌ **Research & Development** -- Video analysis and metrics collection -- A/B testing for encoding parameters -- Machine learning dataset preparation -- Performance benchmarking - -## ๐Ÿ› ๏ธ Advanced Features - -### Storage Backends ```yaml -# Configure multiple storage options +# config/storage.yml - Multi-cloud storage storage: backends: - s3: # AWS S3 or compatible - azure: # Azure Blob Storage - gcp: # Google Cloud Storage - local: # Local filesystem + s3: + bucket: my-video-bucket + region: us-west-2 + azure: + container: videos + local: + path: /storage ``` -### GPU Acceleration -```bash -# Enable hardware acceleration -./setup.sh --genai - -# Supports: -# - NVIDIA NVENC/NVDEC -# - Intel Quick Sync Video -# - AMD VCE/VCN -# - Apple VideoToolbox (macOS) -``` +## ๐Ÿ“ˆ Performance & Scaling ### Horizontal Scaling + ```bash # Scale API instances -docker-compose up -d --scale api=3 +docker compose up -d --scale api=4 # Scale workers based on load -docker-compose up -d --scale worker-cpu=4 -docker-compose up -d --scale worker-genai=2 +docker compose up -d --scale worker-cpu=8 +docker compose up -d --scale worker-gpu=2 +``` + +### Performance Optimizations + +- **Connection pooling** for database and Redis +- **Async processing** with non-blocking I/O +- **Hardware acceleration** auto-detection +- **Caching layers** for frequently accessed data +- **Resource management** with limits and monitoring + +## ๐Ÿ› ๏ธ Development + +### Local Development Setup + +```bash +# Development environment +./setup.sh --development + +# Install development dependencies +pip install -r requirements.txt -r requirements-dev.txt + +# Run tests +pytest tests/ -v + +# Code formatting +black api/ worker/ tests/ +flake8 api/ worker/ tests/ +``` + +### Testing + +```bash +# Unit tests +pytest tests/unit/ -v + +# Integration tests +pytest tests/integration/ -v + +# Performance tests +pytest tests/performance/ -v ``` -## ๐Ÿš€ Production Deployment +## ๐Ÿ“š Documentation + +| Document | Description | +|----------|-------------| +| **[API Reference](docs/API.md)** | Complete API endpoint documentation | +| **[Setup Guide](docs/SETUP.md)** | Detailed installation instructions | +| **[Production Guide](docs/PRODUCTION.md)** | Production deployment best practices | +| **[Monitoring Guide](docs/MONITORING.md)** | Observability and alerting setup | + +## ๐Ÿšฆ System Requirements + +### Minimum (Standard) -### Minimum Requirements - **CPU:** 4 cores - **RAM:** 8GB - **Storage:** 50GB SSD - **Network:** 1Gbps -### Recommended (GenAI) +### Recommended (GPU) + - **CPU:** 8+ cores - **RAM:** 32GB -- **GPU:** NVIDIA RTX 3080/4080 (8GB+ VRAM) +- **GPU:** NVIDIA RTX 3080+ (8GB+ VRAM) - **Storage:** 200GB NVMe SSD - **Network:** 10Gbps -### Cloud Deployment -Supports deployment on: +## ๐ŸŒ Cloud Deployment + +Supports deployment on all major cloud platforms: + - **AWS** (EC2, ECS, EKS) -- **Google Cloud** (GCE, GKE) +- **Google Cloud** (GCE, GKE) - **Azure** (VM, AKS) - **DigitalOcean** (Droplets, Kubernetes) -- **Self-hosted** infrastructure - -## ๐Ÿ“ž Support & Community - -- **๐Ÿ“š Documentation**: Complete guides in `/docs` -- **๐Ÿ› 
**Issues**: [GitHub Issues](https://github.com/rendiffdev/ffmpeg-api/issues) -- **๐Ÿ’ฌ Discussions**: [GitHub Discussions](https://github.com/rendiffdev/ffmpeg-api/discussions) -- **๐Ÿ”’ Security**: See [HTTPS/SSL Configuration](docs/SETUP.md#httpssl-configuration) -- **๐Ÿ“„ License**: [MIT License](LICENSE) ## ๐Ÿค Contributing -We welcome contributions! Please open an issue or submit a pull request on our [GitHub repository](https://github.com/rendiffdev/ffmpeg-api). +We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) for details. ## ๐Ÿ“„ License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. +## ๐Ÿš€ Why Choose This API? + +### vs. FFmpeg CLI + +| Feature | FFmpeg CLI | This API | Advantage | +|---------|------------|----------|-----------| +| **Batch Processing** | Manual scripting | Built-in API | **10x Easier** | +| **Progress Tracking** | Parse stderr | Real-time SSE | **Real-time** | +| **Error Handling** | Exit codes | Structured JSON | **Detailed** | +| **Quality Analysis** | Separate tools | Integrated | **Built-in** | +| **Scaling** | Manual | Auto-scaling | **Enterprise** | +| **Monitoring** | None | Full metrics | **Production** | + +### vs. Other Solutions + +- **Complete CLI Parity** - No feature compromises +- **Production Ready** - Battle-tested in enterprise environments +- **Developer Friendly** - Modern REST API with great docs +- **Cost Effective** - Self-hosted, no per-minute charges +- **Highly Secure** - Enterprise-grade security features + --- -**Built with โค๏ธ by the Rendiff team** +**Transform your video processing workflow with production-ready FFmpeg API.** -*Transform your video processing workflow with production-ready FFmpeg API and optional AI enhancement.* \ No newline at end of file +*Built with โค๏ธ by the Rendiff team* \ No newline at end of file diff --git a/REPOSITORY_STRUCTURE.md b/REPOSITORY_STRUCTURE.md index 9eca168..03e322f 100644 --- a/REPOSITORY_STRUCTURE.md +++ b/REPOSITORY_STRUCTURE.md @@ -122,9 +122,9 @@ ffmpeg-api/ โ”‚ โ”œโ”€โ”€ progress.py โ”‚ โ”œโ”€โ”€ quality.py โ”‚ โ””โ”€โ”€ resource_manager.py -โ”œโ”€โ”€ docker-compose.yml # Main compose file -โ”œโ”€โ”€ docker-compose.prod.yml # Production overrides -โ”œโ”€โ”€ docker-compose.stable.yml # Stable build config +โ”œโ”€โ”€ compose.yml # Main compose file +โ”œโ”€โ”€ compose.prod.yml # Production overrides +โ”œโ”€โ”€ compose.stable.yml # Stable build config โ”œโ”€โ”€ requirements.txt # Python dependencies โ”œโ”€โ”€ README.md # Project documentation โ”œโ”€โ”€ LICENSE # License file @@ -169,7 +169,7 @@ The following files and directories were removed during cleanup: - `rendiff` - Orphaned file - `setup.py` & `setup.sh` - Old setup scripts - `requirements-genai.txt` - GenAI requirements -- `docker-compose.genai.yml` - GenAI compose file +- `docker-compose.genai.yml` - GenAI compose file - `config/storage.yml*` - Old storage configs - `docs/AUDIT_REPORT.md` - Duplicate audit report diff --git a/SECURITY.md b/SECURITY.md index 7c861b6..1201f1a 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -100,7 +100,7 @@ CORS_ORIGINS=https://app.example.com,https://admin.example.com ```bash # Regularly update base images docker pull python:3.12-slim - docker-compose build --no-cache + docker compose build --no-cache ``` ## 5. Database Security @@ -113,7 +113,7 @@ CORS_ORIGINS=https://app.example.com,https://admin.example.com 2.
**Connection Limits** ```yaml - # Already configured in docker-compose.yml + # Already configured in compose.yml max_connections: 200 ``` diff --git a/api/config.py b/api/config.py index 8d0c990..0bf95c3 100644 --- a/api/config.py +++ b/api/config.py @@ -1,13 +1,15 @@ """ -Configuration management for Rendiff API +Production-grade configuration management for Rendiff FFmpeg API. + +Handles all application settings with validation, type safety, and environment-based configuration. """ -from functools import lru_cache -from typing import List, Optional import os +from functools import lru_cache from pathlib import Path +from typing import List, Optional -from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic import Field, validator +from pydantic_settings import BaseSettings, SettingsConfigDict class Settings(BaseSettings): @@ -57,13 +59,18 @@ class Settings(BaseSettings): FFMPEG_CRF: int = 23 FFMPEG_HARDWARE_ACCELERATION: str = "auto" - # Security + # Security & Rate Limiting API_KEY_HEADER: str = "X-API-Key" ENABLE_API_KEYS: bool = True ENABLE_IP_WHITELIST: bool = False IP_WHITELIST: str = "10.0.0.0/8,192.168.0.0/16" ADMIN_API_KEYS: str = "" # Comma-separated list of admin API keys + # Rate Limiting + ENABLE_RATE_LIMITING: bool = True + RATE_LIMIT_CALLS: int = 2000 + RATE_LIMIT_PERIOD: int = 3600 # seconds + # CORS CORS_ORIGINS: List[str] = Field(default_factory=lambda: ["http://localhost", "https://localhost"]) diff --git a/api/main.py b/api/main.py index d205b20..5b61e6c 100644 --- a/api/main.py +++ b/api/main.py @@ -1,28 +1,31 @@ """ -Rendiff FFmpeg API - Main Application +Rendiff FFmpeg API - Production-Grade Main Application + +High-performance, scalable FFmpeg processing API with enterprise features. """ -import asyncio from contextlib import asynccontextmanager from typing import Any, Dict -from fastapi import FastAPI, Request, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse +import structlog +from fastapi import FastAPI, HTTPException from fastapi.exceptions import RequestValidationError +from fastapi.middleware.cors import CORSMiddleware from prometheus_client import make_asgi_app -import structlog from api.config import settings -from api.routers import convert, jobs, admin, health, api_keys -from api.utils.logger import setup_logging +from api.middleware.security import SecurityHeadersMiddleware, RateLimitMiddleware +from api.models.database import init_db +from api.routers import admin, api_keys, convert, health, jobs +from api.services.queue import QueueService +from api.services.storage import StorageService from api.utils.error_handlers import ( - RendiffError, rendiff_exception_handler, validation_exception_handler, - http_exception_handler, general_exception_handler + RendiffError, + general_exception_handler, + http_exception_handler, + rendiff_exception_handler, + validation_exception_handler, ) -from api.services.storage import StorageService -from api.services.queue import QueueService -from api.models.database import init_db -from api.middleware.security import SecurityHeadersMiddleware, RateLimitMiddleware +from api.utils.logger import setup_logging # Setup structured logging setup_logging() @@ -65,125 +68,150 @@ async def lifespan(app: FastAPI): await queue_service.cleanup() -# Create FastAPI application -app = FastAPI( - title="Rendiff FFmpeg API", - description="Self-hosted FFmpeg processing API with multi-storage support by Rendiff", - version=settings.VERSION, -
docs_url="/docs", - redoc_url="/redoc", - openapi_url="/openapi.json", - lifespan=lifespan, - contact={ - "name": "Rendiff", - "url": "https://rendiff.dev", - "email": "dev@rendiff.dev", - }, - license_info={ - "name": "MIT", - "url": "https://github.com/rendiffdev/ffmpeg-api/blob/main/LICENSE", - }, -) - -# Add security middleware -app.add_middleware( - SecurityHeadersMiddleware, - csp_policy="default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'", - enable_hsts=True, - hsts_max_age=31536000, -) +def create_application() -> FastAPI: + """Create and configure FastAPI application with optimized settings.""" + application = FastAPI( + title="Rendiff FFmpeg API", + description="Production-grade FFmpeg processing API for professional video workflows", + version=settings.VERSION, + docs_url="/docs" if settings.DEBUG else None, + redoc_url="/redoc" if settings.DEBUG else None, + openapi_url="/openapi.json" if settings.DEBUG else None, + lifespan=lifespan, + contact={ + "name": "Rendiff Team", + "url": "https://rendiff.dev", + "email": "dev@rendiff.dev", + }, + license_info={ + "name": "MIT License", + "url": "https://github.com/rendiffdev/ffmpeg-api/blob/main/LICENSE", + }, + ) + + # Configure middleware stack (order matters!) + _configure_middleware(application) + + # Configure exception handlers + _configure_exception_handlers(application) + + # Configure routes + _configure_routes(application) + + # Configure metrics endpoint + if settings.ENABLE_METRICS: + metrics_app = make_asgi_app() + application.mount("/metrics", metrics_app) + + return application -# Add rate limiting middleware (backup to KrakenD) -app.add_middleware( - RateLimitMiddleware, - calls=2000, # Higher limit since KrakenD handles primary rate limiting - period=3600, - enabled=True, -) -# Add CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=settings.CORS_ORIGINS, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) +def _configure_middleware(application: FastAPI) -> None: + """Configure middleware stack with proper ordering.""" + # Security headers (first for all responses) + application.add_middleware( + SecurityHeadersMiddleware, + csp_policy="default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'", + enable_hsts=True, + hsts_max_age=31536000, + ) + + # Rate limiting (before CORS) + application.add_middleware( + RateLimitMiddleware, + calls=settings.RATE_LIMIT_CALLS, + period=settings.RATE_LIMIT_PERIOD, + enabled=settings.ENABLE_RATE_LIMITING, + ) + + # CORS (last to apply to all responses) + application.add_middleware( + CORSMiddleware, + allow_origins=settings.CORS_ORIGINS, + allow_credentials=True, + allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"], + allow_headers=["*"], + max_age=600, # Cache preflight requests + ) -# Exception handlers -app.add_exception_handler(RendiffError, rendiff_exception_handler) -app.add_exception_handler(RequestValidationError, validation_exception_handler) -app.add_exception_handler(HTTPException, http_exception_handler) -app.add_exception_handler(Exception, general_exception_handler) +def _configure_exception_handlers(application: FastAPI) -> None: + """Configure centralized exception handling.""" + application.add_exception_handler(RendiffError, rendiff_exception_handler) + application.add_exception_handler(RequestValidationError, validation_exception_handler) + application.add_exception_handler(HTTPException, http_exception_handler) + 
application.add_exception_handler(Exception, general_exception_handler) -# Include routers -app.include_router(convert.router, prefix="/api/v1", tags=["convert"]) -app.include_router(jobs.router, prefix="/api/v1", tags=["jobs"]) -app.include_router(admin.router, prefix="/api/v1", tags=["admin"]) -app.include_router(health.router, prefix="/api/v1", tags=["health"]) -app.include_router(api_keys.router, prefix="/api/v1", tags=["api-keys"]) +def _configure_routes(application: FastAPI) -> None: + """Configure API routes with proper prefixes and tags.""" + # Core API routes + application.include_router(health.router, prefix="/api/v1", tags=["health"]) + application.include_router(convert.router, prefix="/api/v1", tags=["processing"]) + application.include_router(jobs.router, prefix="/api/v1", tags=["jobs"]) + + # Management routes + application.include_router(api_keys.router, prefix="/api/v1", tags=["authentication"]) + application.include_router(admin.router, prefix="/api/v1/admin", tags=["administration"]) -# Conditionally include GenAI routers -try: - from api.genai.main import mount_genai_routers - mount_genai_routers(app) -except ImportError: - logger.info("GenAI module not available, skipping GenAI features") -except Exception as e: - logger.warning("Failed to load GenAI features", error=str(e)) -# Add Prometheus metrics endpoint -if settings.ENABLE_METRICS: - metrics_app = make_asgi_app() - app.mount("/metrics", metrics_app) +# Create application instance +app = create_application() -@app.get("/", tags=["root"]) +@app.get("/", tags=["root"], summary="API Information") async def root() -> Dict[str, Any]: - """Root endpoint with API information.""" - base_info = { + """ + Get API information and capabilities. + + Returns basic information about the API including version, capabilities, + and available endpoints for integration. + """ + return { "name": "Rendiff FFmpeg API", "version": settings.VERSION, "status": "operational", - "documentation": "/docs", - "health": "/api/v1/health", - "website": "https://rendiff.dev", - "repository": "https://github.com/rendiffdev/ffmpeg-api", - "contact": "dev@rendiff.dev", - } - - # Add GenAI information if available - try: - from api.genai.main import get_genai_info - base_info["genai"] = get_genai_info() - except ImportError: - base_info["genai"] = { - "enabled": False, - "message": "GenAI module not installed. 
Install with: pip install -r requirements-genai.txt" + "description": "Production-grade FFmpeg processing API", + "endpoints": { + "documentation": "/docs", + "health": "/api/v1/health", + "capabilities": "/api/v1/capabilities", + "convert": "/api/v1/convert", + "jobs": "/api/v1/jobs" + }, + "features": { + "hardware_acceleration": ["NVENC", "QSV", "VAAPI", "VCE"], + "formats": ["MP4", "WebM", "HLS", "DASH", "MOV", "AVI"], + "quality_metrics": ["VMAF", "PSNR", "SSIM"], + "async_processing": True, + "real_time_progress": True, + "batch_operations": True + }, + "contact": { + "website": "https://rendiff.dev", + "repository": "https://github.com/rendiffdev/ffmpeg-api", + "email": "dev@rendiff.dev" } - except Exception as e: - base_info["genai"] = { - "enabled": False, - "error": str(e) - } - - return base_info + } -def main(): - """Main entry point for API server.""" +def main() -> None: + """Main entry point for production server.""" import uvicorn + # Production-optimized server configuration uvicorn.run( "api.main:app", host=settings.API_HOST, port=settings.API_PORT, - workers=settings.API_WORKERS, + workers=1 if settings.DEBUG else settings.API_WORKERS, reload=settings.API_RELOAD, - log_config=None, # Use structlog + log_config=None, # Use structured logging + access_log=False, # Handled by middleware + server_header=False, # Security + date_header=False, # Security ) + if __name__ == "__main__": main() \ No newline at end of file diff --git a/api/services/queue.py b/api/services/queue.py index 03a1056..8a4b208 100644 --- a/api/services/queue.py +++ b/api/services/queue.py @@ -167,7 +167,7 @@ async def get_worker_logs(self, worker_id: str, job_id: str, lines: int = 100) - # Consider implementing with ELK stack, Grafana Loki, or similar return [ "Log aggregation not configured", - "Use 'docker-compose logs worker' to view worker logs", + "Use 'docker compose logs worker' to view worker logs", f"Job ID: {job_id}", f"Worker ID: {worker_id}", ] diff --git a/compose.override.yml b/compose.override.yml new file mode 100644 index 0000000..e1e6d12 --- /dev/null +++ b/compose.override.yml @@ -0,0 +1,134 @@ +# Docker Compose override for local development +# This file is automatically loaded by docker compose +# Provides convenient development settings + +services: + # API Service - Development Overrides + api: + environment: + # Development environment variables + DEBUG: "true" + LOG_LEVEL: debug + ENABLE_API_KEYS: "false" + API_CORS_ORIGINS: "http://localhost:3000,http://localhost:8080,http://127.0.0.1:3000,http://127.0.0.1:8080" + PYTHONUNBUFFERED: "1" + + volumes: + # Mount source code for hot reload (uncomment for development) + # - ./api:/app/api:ro + # - ./worker:/app/worker:ro + + # Development storage + - ./storage:/storage + - ./logs:/app/logs + + # Expose additional ports for debugging + ports: + - "8000:8000" + - "5678:5678" # Python debugger port + + # Override command for development with auto-reload + command: > + sh -c " + python -m pip install debugpy && + python -m debugpy --listen 0.0.0.0:5678 --wait-for-client -m uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload + " + + # Reduce resource limits for development + deploy: + resources: + limits: + memory: 512M + cpus: '0.5' + + # Worker Service - Development Overrides + worker: + environment: + LOG_LEVEL: debug + WORKER_CONCURRENCY: "2" + PYTHONUNBUFFERED: "1" + + volumes: + # Development storage + - ./storage:/storage + - ./logs:/app/logs + + # Reduce replicas for development + deploy: + replicas: 1 + resources: + limits: + 
memory: 1G + cpus: '1.0' + + # PostgreSQL - Development Overrides + postgres: + environment: + # Development database settings + POSTGRES_PASSWORD: dev_password_123 + POSTGRES_DB: ffmpeg_api_dev + + ports: + # Expose postgres for local development tools + - "5432:5432" + + volumes: + # Use local development data + - postgres-dev-data:/var/lib/postgresql/data + + # Redis - Development Overrides + redis: + ports: + # Expose Redis for local development tools + - "6379:6379" + + volumes: + # Use local development data + - redis-dev-data:/data + + # Simpler Redis config for development + command: > + redis-server + --appendonly yes + --maxmemory 256mb + --maxmemory-policy allkeys-lru + + # Development Tools + mailhog: + image: mailhog/mailhog:v1.0.1 + container_name: ffmpeg_dev_mailhog + ports: + - "1025:1025" # SMTP + - "8025:8025" # Web UI + networks: + - ffmpeg-net + profiles: + - dev-tools + + # Database Admin Tool + pgadmin: + image: dpage/pgadmin4:latest + container_name: ffmpeg_dev_pgadmin + environment: + PGADMIN_DEFAULT_EMAIL: admin@localhost + PGADMIN_DEFAULT_PASSWORD: admin + PGADMIN_CONFIG_SERVER_MODE: "False" + ports: + - "5050:80" + volumes: + - pgadmin-dev-data:/var/lib/pgadmin + depends_on: + - postgres + networks: + - ffmpeg-net + profiles: + - dev-tools + +# Development volumes +volumes: + postgres-dev-data: + driver: local + redis-dev-data: + driver: local + pgadmin-dev-data: + driver: local \ No newline at end of file diff --git a/docker-compose.prod.yml b/compose.prod.yml similarity index 100% rename from docker-compose.prod.yml rename to compose.prod.yml diff --git a/docker-compose.stable.yml b/compose.stable.yml similarity index 96% rename from docker-compose.stable.yml rename to compose.stable.yml index 2b4f754..1de99b2 100644 --- a/docker-compose.stable.yml +++ b/compose.stable.yml @@ -1,13 +1,11 @@ # Docker Compose override for stable builds # This file ensures consistent Python versions and build arguments -version: '3.8' - services: api: build: context: . 
- dockerfile: docker/api/Dockerfile.new + dockerfile: docker/api/Dockerfile args: PYTHON_VERSION: 3.12.7 cache_from: diff --git a/docker-compose.yml b/compose.yml similarity index 98% rename from docker-compose.yml rename to compose.yml index 5df67a8..8a8e1a0 100644 --- a/docker-compose.yml +++ b/compose.yml @@ -1,5 +1,5 @@ -# Docker Compose format version is no longer required in Compose v2+ -# Using latest features and best practices +# Production-Grade FFmpeg API Docker Compose Configuration +# Optimized for performance, security, and maintainability name: ffmpeg-api diff --git a/config/krakend.json b/config/krakend.json index b43dcad..79a8535 100644 --- a/config/krakend.json +++ b/config/krakend.json @@ -22,7 +22,7 @@ "listen_address": ":8090" }, "security/cors": { - "allow_origins": ["*"], + "allow_origins": ["https://localhost", "http://localhost", "https://localhost:3000", "http://localhost:3000"], "allow_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"], "allow_headers": ["Origin", "Authorization", "Content-Type", "X-API-Key"], "expose_headers": ["Content-Length", "Content-Type"], diff --git a/config/storage.yml b/config/storage.yml new file mode 100644 index 0000000..fc2826b --- /dev/null +++ b/config/storage.yml @@ -0,0 +1,193 @@ +# Production-Grade Storage Configuration +# Supports multiple storage backends for different deployment scenarios + +# Storage Backend Configuration +storage: + # Default backend (always configured) + default: local + + backends: + # Local filesystem storage (development/small deployments) + local: + type: filesystem + path: /storage + temp_path: /tmp/rendiff + settings: + # File permissions for created files + file_mode: 0o644 + dir_mode: 0o755 + # Maximum file size (10GB) + max_file_size: 10737418240 + # Cleanup temporary files after (hours) + temp_cleanup_hours: 24 + # Storage quota (bytes, 0 = unlimited) + storage_quota: 0 + + # AWS S3 Storage (recommended for production) + s3: + type: s3 + settings: + # S3 bucket configuration + bucket: "${S3_BUCKET_NAME}" + region: "${AWS_DEFAULT_REGION:-us-west-2}" + access_key_id: "${AWS_ACCESS_KEY_ID}" + secret_access_key: "${AWS_SECRET_ACCESS_KEY}" + + # Storage classes for different use cases + storage_class: STANDARD + temp_storage_class: STANDARD_IA + archive_storage_class: GLACIER + + # Multipart upload settings + multipart_threshold: 64MB + multipart_chunksize: 16MB + max_concurrency: 10 + + # Security settings + server_side_encryption: AES256 + acl: private + + # Lifecycle policies (optional) + lifecycle_rules: + - name: temp_cleanup + prefix: temp/ + expiration_days: 1 + - name: archive_old_files + prefix: processed/ + transition_to_ia_days: 30 + transition_to_glacier_days: 90 + + # Azure Blob Storage + azure: + type: azure_blob + settings: + # Azure storage account + account_name: "${AZURE_STORAGE_ACCOUNT}" + account_key: "${AZURE_STORAGE_KEY}" + container_name: "${AZURE_CONTAINER_NAME:-videos}" + + # Storage tiers + blob_type: BlockBlob + access_tier: Hot + + # Security + require_encryption: true + connection_timeout: 30 + + # Google Cloud Storage + gcs: + type: gcs + settings: + # GCS bucket configuration + bucket: "${GCS_BUCKET_NAME}" + project_id: "${GOOGLE_CLOUD_PROJECT}" + credentials_path: "${GOOGLE_APPLICATION_CREDENTIALS}" + + # Storage classes + storage_class: STANDARD + temp_storage_class: NEARLINE + archive_storage_class: COLDLINE + + # Security + uniform_bucket_level_access: true + + # Lifecycle policies + lifecycle_rules: + - action: Delete + condition: + age: 1 + matches_prefix: 
["temp/"] + - action: SetStorageClass + storage_class: NEARLINE + condition: + age: 30 + matches_prefix: ["processed/"] + +# File Processing Configuration +processing: + # Input file constraints + input: + # Maximum input file size (10GB) + max_file_size: 10737418240 + + # Allowed input formats + allowed_formats: + video: [mp4, avi, mov, mkv, webm, flv, wmv, mpeg, ts, vob, 3gp, mxf] + audio: [mp3, wav, flac, aac, ogg, wma, m4a, opus, alac, dts] + + # File validation + validate_format: true + scan_for_viruses: false # Enable if ClamAV is configured + + # Output configuration + output: + # Default output formats + default_video_format: mp4 + default_audio_format: aac + + # Quality presets + quality_presets: + low: {crf: 28, preset: fast} + medium: {crf: 23, preset: medium} + high: {crf: 18, preset: slow} + lossless: {crf: 0, preset: ultrafast} + + # Output constraints + max_output_resolution: "7680x4320" # 8K max + max_output_bitrate: "50M" + max_output_framerate: 120 + + # Temporary file management + temp_files: + # Base directory for temporary files + base_path: /tmp/rendiff + + # Cleanup policies + cleanup_on_success: true + cleanup_on_failure: false # Keep for debugging + cleanup_interval_hours: 6 + + # Space management + max_temp_space_gb: 50 + cleanup_when_space_low: true + low_space_threshold_percent: 90 + +# CDN Configuration (optional) +cdn: + enabled: false + provider: cloudflare # cloudflare, fastly, aws_cloudfront + settings: + # CDN-specific settings would go here + cache_ttl: 86400 # 24 hours + compress: true + +# Monitoring and Logging +monitoring: + # Storage metrics + track_storage_usage: true + track_transfer_metrics: true + alert_on_quota_exceeded: true + + # Performance monitoring + track_upload_speed: true + track_download_speed: true + slow_operation_threshold_seconds: 30 + +# Security Configuration +security: + # File scanning + enable_virus_scanning: false + clamav_host: "${CLAMAV_HOST}" + clamav_port: "${CLAMAV_PORT:-3310}" + + # File type validation + strict_mime_type_checking: true + allow_executable_uploads: false + + # Content filtering + scan_for_malicious_content: true + quarantine_suspicious_files: true + + # Encryption at rest + encrypt_stored_files: false + encryption_key_rotation_days: 90 \ No newline at end of file diff --git a/docs/API.md b/docs/API.md index ce27f4a..90d5767 100644 --- a/docs/API.md +++ b/docs/API.md @@ -15,7 +15,7 @@ Complete API reference for the Rendiff FFmpeg API service. ## Overview -The Rendiff API provides a RESTful interface to FFmpeg's media processing capabilities with optional AI enhancement. +The Rendiff API provides a RESTful interface to FFmpeg's media processing capabilities with hardware acceleration support. > **๐Ÿ’ก New to setup?** See the [Setup Guide](SETUP.md) for deployment instructions. @@ -54,7 +54,7 @@ For production deployments, HTTPS is strongly recommended. The API supports both 3. 
**Deploy with HTTPS**:
   ```bash
   # Production deployment with Traefik (includes HTTPS)
-  docker-compose -f docker-compose.prod.yml --profile traefik up -d
+  docker compose -f compose.prod.yml --profile traefik up -d
   ```

#### SSL Certificate Management
@@ -690,10 +690,10 @@ The validation command performs a 10-point check:
 #### Using Docker Compose
 ```bash
 # Standard HTTP deployment
-docker-compose up -d
+docker compose up -d
 
 # HTTPS deployment with SSL certificates
-docker-compose -f docker-compose.yml -f docker-compose.https.yml up -d
+docker compose -f compose.yml -f docker-compose.https.yml up -d
 ```
 
 #### Manual Nginx Configuration
@@ -766,13 +766,13 @@ netstat -tulnp | grep :443
 ./scripts/manage-ssl.sh renew
 
 # For Let's Encrypt, check logs
-docker-compose logs certbot
+docker compose logs certbot
 ```
 
 #### Log Files
 - **SSL Management**: `./ssl/renewal.log`
-- **Nginx**: Container logs via `docker-compose logs nginx`
-- **Let's Encrypt**: Container logs via `docker-compose logs certbot`
+- **Nginx**: Container logs via `docker compose logs nginx`
+- **Let's Encrypt**: Container logs via `docker compose logs certbot`
 
 ### Integration with API
diff --git a/docs/IMPLEMENTATION_SUMMARY.md b/docs/IMPLEMENTATION_SUMMARY.md
index 24afeb2..1ef07cd 100644
--- a/docs/IMPLEMENTATION_SUMMARY.md
+++ b/docs/IMPLEMENTATION_SUMMARY.md
@@ -129,7 +129,7 @@ This document summarizes the implementation work completed based on the STATUS.m
 - **Files Created:**
   - `monitoring/dashboards/` - 4 comprehensive Grafana dashboards
   - `monitoring/alerts/` - Alerting rules
   - `docker-compose.elk.yml` - Complete ELK stack
   - `api/services/metrics.py` - Custom metrics service
   - `monitoring/logstash/` - Log processing pipeline
   - `docs/monitoring-guide.md` - 667-line monitoring guide
diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md
index d1ab932..7568d68 100644
--- a/docs/INSTALLATION.md
+++ b/docs/INSTALLATION.md
@@ -56,7 +56,7 @@ cd ffmpeg-api
 #### Option B: Docker-Only Setup
 ```bash
 # Run the setup container directly
-docker-compose --profile setup run --rm setup
+docker compose --profile setup run --rm setup
 ```
 
 #### Option C: Script-Only Setup
@@ -108,13 +108,13 @@ After completing the wizard:
 
 ```bash
 # Start all configured services
-docker-compose up -d
+docker compose up -d
 
 # Check status
-docker-compose ps
+docker compose ps
 
 # View logs
-docker-compose logs -f
+docker compose logs -f
 ```
 
 ### Step 4: Verify Installation
@@ -139,7 +139,7 @@ curl -sSL https://raw.githubusercontent.com/rendiffdev/ffmpeg-api/main/scripts/i
 
 # Then run the setup wizard
 cd /opt/rendiff
-docker-compose --profile setup run --rm setup
+docker compose --profile setup run --rm setup
 ```
 
 ### Storage Backend Examples
@@ -259,7 +259,7 @@ helm install rendiff rendiff/rendiff -f values.yaml
 
 ```bash
 # Generate Kubernetes manifests
-docker-compose run --rm setup --mode k8s --output k8s/
+docker compose run --rm setup --mode k8s --output k8s/
 
 # Apply manifests
 kubectl create namespace rendiff
@@ -354,14 +354,14 @@ chmod -R 755 ./storage
 
 ```bash
 # Check logs
-docker-compose logs api
-docker-compose logs worker-cpu
+docker compose logs api
+docker compose logs worker-cpu
 
 # Verify configuration
-docker-compose config
+docker compose config
 
 # Rebuild if needed
-docker-compose build --no-cache
+docker compose build --no-cache
 ```
 
 #### 4. Database Connection Failed
@@ -380,7 +380,7 @@ python scripts/init-sqlite.py
 
 ### Getting Help
 
-- Check logs: `docker-compose logs -f`
+- Check logs: `docker compose logs -f`
 - API documentation: http://localhost:8000/docs
 - Run diagnostics: `./scripts/updater.py verify`
 - GitHub Issues: https://github.com/rendiffdev/ffmpeg-api/issues
@@ -433,7 +433,7 @@ sudo apt update && sudo apt upgrade -y
 # Install required packages
 sudo apt install -y \
     docker.io \
-    docker-compose \
+    docker-compose-v2 \
     postgresql-client \
     ffmpeg \
     git \
@@ -492,8 +492,8 @@ Type=simple
 User=rendiff
 WorkingDirectory=/opt/rendiff
 EnvironmentFile=/etc/rendiff/.env
-ExecStart=/usr/bin/docker-compose up
-ExecStop=/usr/bin/docker-compose down
+ExecStart=/usr/bin/docker compose up
+ExecStop=/usr/bin/docker compose down
 Restart=always
 RestartSec=10
@@ -683,9 +683,9 @@ sudo apt update && sudo apt install -y nvidia-container-toolkit
 sudo systemctl restart docker
 ```
 
-2. Enable GPU workers in `docker-compose.yml`:
+2. Enable GPU workers in `compose.yml`:
 ```bash
-docker-compose --profile gpu up -d
+docker compose --profile gpu up -d
 ```
 
 ## Troubleshooting
@@ -710,7 +710,7 @@ sudo systemctl status postgresql
 psql -h localhost -U rendiff -d rendiff
 
 # Check logs
-docker-compose logs postgres
+docker compose logs postgres
 ```
 
 #### 3. FFmpeg not found
@@ -732,7 +732,7 @@ chmod -R 755 ./storage
 
 ### Getting Help
 
-- Check logs: `docker-compose logs -f`
+- Check logs: `docker compose logs -f`
 - API documentation: http://localhost:8000/docs
 - GitHub Issues: https://github.com/rendiffdev/ffmpeg-api/issues
 - Website: https://rendiff.dev
diff --git a/docs/RUNBOOKS.md b/docs/RUNBOOKS.md
new file mode 100644
index 0000000..6b505e4
--- /dev/null
+++ b/docs/RUNBOOKS.md
@@ -0,0 +1,636 @@
+# FFmpeg API Operational Runbooks
+
+## Table of Contents
+
+1. [Service Health Checks](#service-health-checks)
+2. [Common Issues and Resolution](#common-issues-and-resolution)
+3. [Incident Response Procedures](#incident-response-procedures)
+4. [Performance Troubleshooting](#performance-troubleshooting)
+5. [Disaster Recovery](#disaster-recovery)
+6. [Scaling Procedures](#scaling-procedures)
+7. [Security Incidents](#security-incidents)
+
+---
+
+## Service Health Checks
+
+### ๐ŸŸข Quick Health Check
+
+```bash
+# Check all services
+curl -s https://api.domain.com/api/v1/health | jq .
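+
+# Optional: fail fast in automation by also checking the HTTP status code
+# (assumes the health endpoint returns HTTP 200 when healthy)
+status=$(curl -s -o /dev/null -w '%{http_code}' https://api.domain.com/api/v1/health)
+[ "$status" = "200" ] || echo "API health check failed (HTTP $status)"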
+
+# Check specific components
+docker compose ps
+docker compose exec api curl -s localhost:8000/api/v1/health
+docker compose exec postgres pg_isready
+docker compose exec redis redis-cli ping
+```
+
+### ๐Ÿ” Deep Health Check
+
+```bash
+# API response times
+curl -w "@curl-format.txt" -o /dev/null -s https://api.domain.com/api/v1/health
+
+# Database connections
+docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c \
+  "SELECT count(*) FROM pg_stat_activity WHERE datname = 'ffmpeg_api';"
+
+# Queue depth
+docker compose exec redis redis-cli llen celery
+
+# Worker status
+docker compose exec worker-cpu celery -A worker.main inspect active
+```
+
+---
+
+## Common Issues and Resolution
+
+### ๐Ÿšจ Issue: High API Response Times
+
+**Symptoms:**
+- P95 latency > 5 seconds
+- Timeouts on /convert endpoint
+- User complaints about slow processing
+
+**Diagnosis:**
+```bash
+# Check CPU usage
+docker stats --no-stream
+
+# Check database slow queries
+docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c \
+  "SELECT query, mean_exec_time, calls FROM pg_stat_statements
+   WHERE mean_exec_time > 1000 ORDER BY mean_exec_time DESC LIMIT 10;"
+
+# Check Redis memory
+docker compose exec redis redis-cli info memory
+```
+
+**Resolution:**
+1. **Scale API containers:**
+   ```bash
+   docker compose up -d --scale api=4
+   ```
+
+2. **Clear slow queries:**
+   ```bash
+   # Analyze and optimize slow queries
+   docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c \
+     "ANALYZE jobs; REINDEX TABLE jobs;"
+   ```
+
+3. **Increase connection pool:**
+   ```bash
+   # Update DATABASE_POOL_SIZE in .env
+   DATABASE_POOL_SIZE=40
+   docker compose restart api
+   ```
+
+### ๐Ÿšจ Issue: Jobs Stuck in Queue
+
+**Symptoms:**
+- Jobs remain in "queued" status
+- Queue depth increasing
+- No worker activity
+
+**Diagnosis:**
+```bash
+# Check worker status
+docker compose logs --tail=100 worker-cpu | grep ERROR
+
+# Check queue status
+docker compose exec redis redis-cli llen high
+docker compose exec redis redis-cli llen default
+docker compose exec redis redis-cli llen low
+
+# Check worker processes
+docker compose exec worker-cpu ps aux | grep celery
+```
+
+**Resolution:**
+1. **Restart workers:**
+   ```bash
+   docker compose restart worker-cpu worker-gpu
+   ```
+
+2. **Scale workers:**
+   ```bash
+   docker compose up -d --scale worker-cpu=6
+   ```
+
+3. **Clear stuck jobs:**
+   ```bash
+   # Move stuck jobs back to queue
+   docker compose exec api python -c "
+   from datetime import datetime, timedelta
+   from api.models.job import Job, JobStatus
+   from api.database import SessionLocal
+   db = SessionLocal()
+   stuck_jobs = db.query(Job).filter(
+       Job.status == JobStatus.PROCESSING,
+       Job.updated_at < datetime.now() - timedelta(hours=1)
+   ).all()
+   for job in stuck_jobs:
+       job.status = JobStatus.QUEUED
+   db.commit()
+   "
+   ```
+
+### ๐Ÿšจ Issue: Storage Full
+
+**Symptoms:**
+- "No space left on device" errors
+- Jobs failing during output write
+- Upload failures
+
+**Diagnosis:**
+```bash
+# Check disk usage
+df -h /storage
+
+# Find large files
+du -sh /storage/* | sort -hr | head -20
+
+# Check for orphaned files
+find /storage -type f -mtime +7 -name "*.tmp" -ls
+```
+
+**Resolution:**
+1. **Clean temporary files:**
+   ```bash
+   # Remove old temporary files
+   find /storage/tmp -type f -mtime +1 -delete
+
+   # Clean orphaned job files
+   docker compose exec api python scripts/cleanup-storage.py
+   ```
+
+2. 
**Archive old files to S3:** + ```bash + # Archive files older than 7 days + aws s3 sync /storage/output/ s3://archive-bucket/output/ \ + --exclude "*" --include "*.mp4" --include "*.webm" \ + --exclude "$(date +%Y%m)*" + ``` + +3. **Expand storage:** + ```bash + # Resize volume (AWS) + aws ec2 modify-volume --volume-id vol-xxx --size 500 + + # Resize filesystem + sudo resize2fs /dev/xvdf + ``` + +--- + +## Incident Response Procedures + +### ๐Ÿ“‹ Severity Levels + +| Level | Response Time | Examples | +|-------|--------------|----------| +| SEV1 | 15 minutes | Complete outage, data loss | +| SEV2 | 30 minutes | Degraded performance, partial outage | +| SEV3 | 2 hours | Minor issues, single component failure | +| SEV4 | Next business day | Cosmetic issues, documentation | + +### ๐Ÿšจ SEV1: Complete Service Outage + +**Initial Response (0-15 min):** + +1. **Acknowledge incident:** + ```bash + # Send initial notification + ./scripts/notify-incident.sh SEV1 "FFmpeg API Complete Outage" + ``` + +2. **Quick diagnostics:** + ```bash + # Check all services + docker compose ps + + # Check recent deployments + git log --oneline -10 + + # Check system resources + free -m + df -h + ``` + +3. **Immediate mitigation:** + ```bash + # Restart all services + docker compose down + docker compose up -d + + # Enable maintenance mode + docker compose exec api redis-cli set maintenance_mode true + ``` + +**Investigation (15-30 min):** + +1. **Collect logs:** + ```bash + # Aggregate recent logs + mkdir -p /tmp/incident-$(date +%Y%m%d-%H%M%S) + cd /tmp/incident-* + + docker compose logs --since 1h > docker-logs.txt + journalctl --since "1 hour ago" > system-logs.txt + ``` + +2. **Check metrics:** + - Open Grafana dashboard + - Look for anomalies in last 2 hours + - Check error rates and latency + +3. **Root cause analysis:** + ```bash + # Check for OOM kills + dmesg | grep -i "killed process" + + # Check for disk issues + grep -i "error\|fail" /var/log/syslog + + # Database issues + docker compose exec postgres tail -100 /var/log/postgresql/postgresql.log + ``` + +**Recovery (30-60 min):** + +1. **Restore service:** + ```bash + # If configuration issue, rollback + git checkout HEAD~1 -- compose.yml + docker compose up -d + + # If database issue, restore from backup + ./scripts/disaster-recovery.sh --mode latest + ``` + +2. **Verify recovery:** + ```bash + # Run smoke tests + ./scripts/smoke-test.sh + + # Check metrics + curl -s http://localhost:9090/metrics | grep up + ``` + +3. **Post-incident:** + ```bash + # Disable maintenance mode + docker compose exec api redis-cli del maintenance_mode + + # Send recovery notification + ./scripts/notify-incident.sh RESOLVED "FFmpeg API Service Restored" + ``` + +### ๐Ÿ“ Incident Report Template + +```markdown +# Incident Report: [INCIDENT-ID] + +**Date:** [DATE] +**Severity:** [SEV1/2/3/4] +**Duration:** [START] - [END] +**Impact:** [# of users affected, % of requests failed] + +## Summary +[Brief description of what happened] + +## Timeline +- **[TIME]** - Initial detection +- **[TIME]** - Incident acknowledged +- **[TIME]** - Root cause identified +- **[TIME]** - Fix implemented +- **[TIME]** - Service restored + +## Root Cause +[Detailed explanation of why this happened] + +## Resolution +[What was done to fix the issue] + +## Impact +- **Users affected:** [number] +- **Requests failed:** [number] +- **Data loss:** [yes/no] + +## Lessons Learned +1. [What went well] +2. [What went poorly] +3. 
[What was lucky] + +## Action Items +- [ ] [Preventive measure 1] +- [ ] [Preventive measure 2] +- [ ] [Process improvement] +``` + +--- + +## Performance Troubleshooting + +### ๐ŸŒ Slow Video Processing + +**Check processing metrics:** +```bash +# Average processing time by operation +docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c " +SELECT + operations->0->>'type' as operation, + AVG(EXTRACT(EPOCH FROM (completed_at - started_at))) as avg_seconds, + COUNT(*) as job_count +FROM jobs +WHERE status = 'completed' + AND completed_at > NOW() - INTERVAL '1 day' +GROUP BY operations->0->>'type' +ORDER BY avg_seconds DESC;" +``` + +**Optimize FFmpeg settings:** +```bash +# Check current FFmpeg threads +docker compose exec worker-cpu cat /proc/cpuinfo | grep processor | wc -l + +# Update worker concurrency +WORKER_CONCURRENCY=2 # Reduce to give more CPU per job +docker compose restart worker-cpu +``` + +### ๐Ÿ“Š Database Performance + +**Check slow queries:** +```bash +# Enable query logging +docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c \ + "ALTER SYSTEM SET log_min_duration_statement = 1000;" + +docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c \ + "SELECT pg_reload_conf();" + +# View slow query log +docker compose exec postgres tail -f /var/log/postgresql/postgresql.log | grep duration +``` + +**Optimize database:** +```bash +# Update statistics +docker compose exec postgres vacuumdb -U ffmpeg_user -d ffmpeg_api -z + +# Reindex tables +docker compose exec postgres reindexdb -U ffmpeg_user -d ffmpeg_api + +# Check table sizes +docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c " +SELECT + schemaname AS table_schema, + tablename AS table_name, + pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size +FROM pg_tables +ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC +LIMIT 10;" +``` + +--- + +## Disaster Recovery + +### ๐Ÿ”ฅ Complete Database Recovery + +1. **Stop application:** + ```bash + docker compose stop api worker-cpu worker-gpu + ``` + +2. **List available backups:** + ```bash + ./scripts/disaster-recovery.sh --mode list + ``` + +3. **Restore from backup:** + ```bash + # Restore latest + ./scripts/disaster-recovery.sh --mode latest + + # Restore specific backup + ./scripts/disaster-recovery.sh --mode specific \ + --timestamp 20250127_120000 + ``` + +4. **Verify restoration:** + ```bash + # Check data integrity + docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c \ + "SELECT COUNT(*) FROM jobs;" + + # Run application tests + docker compose run --rm api pytest tests/ + ``` + +5. **Resume service:** + ```bash + docker compose up -d api worker-cpu worker-gpu + ``` + +### ๐Ÿ’พ Point-in-Time Recovery + +```bash +# Enable WAL archiving (preventive) +docker compose exec postgres psql -U postgres -c " +ALTER SYSTEM SET wal_level = replica; +ALTER SYSTEM SET archive_mode = on; +ALTER SYSTEM SET archive_command = 'aws s3 cp %p s3://backup-bucket/wal/%f'; +" + +# Perform PITR +pg_basebackup -h localhost -D /recovery -U postgres -Fp -Xs -P +``` + +--- + +## Scaling Procedures + +### โฌ†๏ธ Vertical Scaling (Resize) + +1. **Plan maintenance window:** + ```bash + # Enable maintenance mode + docker compose exec api redis-cli set maintenance_mode true ex 3600 + ``` + +2. 
**Scale instance (AWS):** + ```bash + # Stop instance + aws ec2 stop-instances --instance-ids i-xxxxx + + # Modify instance type + aws ec2 modify-instance-attribute --instance-id i-xxxxx \ + --instance-type c5.4xlarge + + # Start instance + aws ec2 start-instances --instance-ids i-xxxxx + ``` + +3. **Verify and adjust:** + ```bash + # Update resource limits + docker compose down + # Edit compose.yml with new limits + docker compose up -d + ``` + +### โžก๏ธ Horizontal Scaling + +1. **Add worker nodes:** + ```bash + # Deploy to new node + scp -r . newnode:/opt/ffmpeg-api/ + ssh newnode "cd /opt/ffmpeg-api && docker compose up -d worker-cpu" + ``` + +2. **Scale services:** + ```bash + # API servers + docker compose up -d --scale api=6 + + # CPU workers + docker compose up -d --scale worker-cpu=10 + + # GPU workers (if available) + docker compose up -d --scale worker-gpu=4 + ``` + +3. **Update load balancer:** + ```bash + # Add new backend to Traefik + docker compose exec traefik traefik healthcheck + ``` + +--- + +## Security Incidents + +### ๐Ÿ” Suspected API Key Compromise + +1. **Immediate response:** + ```bash + # Identify compromised key + docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c " + SELECT api_key_hash, last_used_at, request_count + FROM api_keys + WHERE last_used_at > NOW() - INTERVAL '1 hour' + ORDER BY request_count DESC;" + + # Revoke key + ./scripts/manage-api-keys.sh revoke + ``` + +2. **Investigate:** + ```bash + # Check access logs + docker compose logs api | grep > suspicious-activity.log + + # Check for data exfiltration + docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c " + SELECT COUNT(*), SUM(output_size) + FROM jobs + WHERE api_key = '' + AND created_at > NOW() - INTERVAL '24 hours';" + ``` + +3. **Remediate:** + ```bash + # Rotate all keys for affected user + ./scripts/manage-api-keys.sh rotate-user + + # Enable additional monitoring + docker compose exec api redis-cli set "monitor:api_key:" true + ``` + +### ๐Ÿ›ก๏ธ DDoS Attack Response + +1. **Enable rate limiting:** + ```bash + # Update Traefik rate limits + docker compose exec traefik redis-cli set "ratelimit:global" 100 + + # Enable DDoS protection mode + docker compose exec api python -c " + from api.config import settings + settings.ENABLE_DDOS_PROTECTION = True + " + ``` + +2. **Block malicious IPs:** + ```bash + # Analyze access patterns + docker compose logs traefik | awk '{print $1}' | sort | uniq -c | sort -rn | head -20 + + # Block suspicious IPs + iptables -A INPUT -s MALICIOUS_IP -j DROP + ``` + +3. **Scale and cache:** + ```bash + # Enable aggressive caching + docker compose exec redis redis-cli config set maxmemory 4gb + + # Scale API servers + docker compose up -d --scale api=10 + ``` + +--- + +## Monitoring Commands Reference + +```bash +# Service health +curl -s localhost:8000/api/v1/health | jq . 
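+
+# Optional: total depth of the priority queues used by the workers
+# (assumes the same "high", "default" and "low" queue names as the stuck-jobs runbook)
+for q in high default low; do echo -n "$q: "; docker compose exec -T redis redis-cli llen "$q"; done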
+ +# Queue status +docker compose exec redis redis-cli info clients + +# Active jobs +docker compose exec worker-cpu celery -A worker.main inspect active + +# Database connections +docker compose exec postgres psql -c "SELECT count(*) FROM pg_stat_activity;" + +# Memory usage +docker stats --no-stream --format "table {{.Container}}\t{{.MemUsage}}" + +# Disk usage +df -h | grep -E "Filesystem|storage" + +# Network connections +netstat -an | grep ESTABLISHED | wc -l + +# Error logs +docker compose logs --since 10m | grep -i error + +# Performance metrics +curl -s localhost:9090/metrics | grep -E "http_request_duration|ffmpeg_job_duration" +``` + +--- + +## Emergency Contacts + +- **On-Call Engineer**: Use PagerDuty +- **Database Admin**: dba-team@company.com +- **Infrastructure**: infra-team@company.com +- **Security Team**: security@company.com +- **Management Escalation**: cto@company.com + +## Useful Links + +- [Grafana Dashboard](http://monitoring.internal:3000) +- [Prometheus](http://monitoring.internal:9090) +- [Traefik Dashboard](http://traefik.internal:8080) +- [API Documentation](https://api.domain.com/docs) +- [Status Page](https://status.domain.com) \ No newline at end of file diff --git a/docs/SETUP.md b/docs/SETUP.md index 95f99f4..6ebb935 100644 --- a/docs/SETUP.md +++ b/docs/SETUP.md @@ -8,7 +8,7 @@ Complete setup guide for the Rendiff FFmpeg API platform. This guide covers all 2. [Setup Options](#setup-options) 3. [Development Setup](#development-setup) 4. [Production Setup](#production-setup) -5. [GenAI Setup](#genai-setup) +5. [GPU Setup](#gpu-setup) 6. [HTTPS/SSL Configuration](#httpssl-configuration) 7. [Configuration Management](#configuration-management) 8. [Troubleshooting](#troubleshooting) @@ -27,7 +27,7 @@ cd ffmpeg-api ./setup.sh --development # Quick dev setup ./setup.sh --interactive # Interactive setup wizard ./setup.sh --standard # Standard production -./setup.sh --genai # AI-enabled production +./setup.sh --gpu # Hardware accelerated production ./setup.sh --interactive # Full configuration wizard ``` @@ -48,7 +48,7 @@ cd ffmpeg-api - API available at http://localhost:8000 ### ๐Ÿญ Standard Production -**Best for: Production deployment without AI features** +**Best for: CPU-based production deployment** ```bash ./setup.sh --standard @@ -65,21 +65,21 @@ cd ffmpeg-api - 2 CPU workers - Automatic HTTP to HTTPS redirect -### ๐Ÿค– GenAI Production -**Best for: AI-enhanced video processing** +### โšก GPU Hardware Accelerated Production +**Best for: Hardware-accelerated video processing** ```bash -./setup.sh --genai +./setup.sh --gpu ``` **What you get:** - Everything from Standard Production - **HTTPS by default (self-signed certificate)** -- GPU support for AI processing -- AI models (Real-ESRGAN, VideoMAE, etc.) 
-- GenAI workers -- Enhanced analysis capabilities -- AI endpoints at `/api/genai/v1/*` +- GPU support for hardware acceleration +- NVENC/NVDEC encoding support +- GPU workers for accelerated processing +- Enhanced performance capabilities +- Hardware acceleration endpoints at `/api/v1/gpu/*` ### ๐Ÿ›ก๏ธ Production with Let's Encrypt HTTPS **Best for: Internet-facing deployments with domain names** @@ -147,16 +147,16 @@ open http://localhost:8000/docs ### Development Commands ```bash # View logs -docker-compose logs -f api +docker compose logs -f api # Restart API only -docker-compose restart api +docker compose restart api # Access database -docker-compose exec api python -c "from api.database import engine; print(engine.url)" +docker compose exec api python -c "from api.database import engine; print(engine.url)" # Run tests -docker-compose exec api pytest +docker compose exec api pytest # Check status ./setup.sh --status @@ -182,7 +182,7 @@ Enterprise-ready deployment with full security and monitoring: ./setup.sh --validate # 3. Check all services -docker-compose ps +docker compose ps ``` ### Production Features @@ -224,10 +224,10 @@ curl -I https://your-domain.com/api/v1/health ./scripts/backup.sh create # View production logs -docker-compose logs -f +docker compose logs -f # Scale workers -docker-compose up -d --scale worker-cpu=4 +docker compose up -d --scale worker-cpu=4 ``` ## GenAI Setup @@ -250,7 +250,7 @@ nvidia-smi ./setup.sh --genai # 3. Wait for model downloads (may take 10-30 minutes) -docker-compose logs -f model-downloader +docker compose logs -f model-downloader # 4. Verify GenAI endpoints curl https://localhost/api/genai/v1/health @@ -267,16 +267,16 @@ curl https://localhost/api/genai/v1/health ### GenAI Commands ```bash # Check GPU utilization -docker-compose exec worker-genai nvidia-smi +docker compose exec worker-genai nvidia-smi # Download additional models -docker-compose --profile setup run model-downloader +docker compose --profile setup run model-downloader # Scale GenAI workers -docker-compose up -d --scale worker-genai=2 +docker compose up -d --scale worker-genai=2 # View GenAI logs -docker-compose logs -f worker-genai +docker compose logs -f worker-genai ``` ### AI Endpoints @@ -358,7 +358,7 @@ export CERTBOT_EMAIL=admin@example.com ./scripts/enhanced-ssl-manager.sh install-commercial /path/to/cert.crt /path/to/private.key # 3. 
Restart services to apply certificate -docker-compose restart traefik +docker compose restart traefik ``` ### Manual SSL Management @@ -445,7 +445,7 @@ cat .env nano .env # Reload configuration -docker-compose restart api +docker compose restart api ``` ### Key Configuration Files @@ -496,7 +496,7 @@ storage: docker ps -a # View error logs -docker-compose logs api +docker compose logs api # Validate configuration ./setup.sh --validate @@ -526,22 +526,22 @@ netstat -tulpn | grep :8000 docker network ls # Test internal connectivity -docker-compose exec api curl redis:6379 +docker compose exec api curl redis:6379 ``` #### ๐Ÿ’พ Database Issues ```bash # Check database status -docker-compose exec postgres pg_isready +docker compose exec postgres pg_isready # View database logs -docker-compose logs postgres +docker compose logs postgres # Run migrations manually -docker-compose exec api alembic upgrade head +docker compose exec api alembic upgrade head # Reset database (destructive) -docker-compose down -v +docker compose down -v ./setup.sh --standard ``` @@ -551,13 +551,13 @@ docker-compose down -v nvidia-smi # Verify CUDA in container -docker-compose exec worker-genai nvidia-smi +docker compose exec worker-genai nvidia-smi # Check model downloads ls -la models/genai/ # Restart GenAI services -docker-compose restart worker-genai +docker compose restart worker-genai ``` ### Performance Optimization @@ -565,32 +565,32 @@ docker-compose restart worker-genai #### Resource Scaling ```bash # Scale API instances -docker-compose up -d --scale api=3 +docker compose up -d --scale api=3 # Scale CPU workers -docker-compose up -d --scale worker-cpu=4 +docker compose up -d --scale worker-cpu=4 # Scale GenAI workers (if GPU memory allows) -docker-compose up -d --scale worker-genai=2 +docker compose up -d --scale worker-genai=2 ``` #### Database Optimization ```bash # Monitor database performance -docker-compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c " +docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c " SELECT query, mean_time, calls FROM pg_stat_statements ORDER BY mean_time DESC LIMIT 10;" # Analyze table usage -docker-compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c " +docker compose exec postgres psql -U ffmpeg_user -d ffmpeg_api -c " SELECT schemaname,tablename,attname,n_distinct,correlation FROM pg_stats WHERE tablename='jobs';" ``` ### Getting Help -1. **Check logs**: `docker-compose logs -f [service]` +1. **Check logs**: `docker compose logs -f [service]` 2. **Validate setup**: `./setup.sh --validate` 3. **Health check**: `./scripts/health-check.sh` 4. 
**Documentation**: Browse `/docs` in this repository
diff --git a/docs/stable-build-solution.md b/docs/stable-build-solution.md
index dbbfbe7..dae6501 100644
--- a/docs/stable-build-solution.md
+++ b/docs/stable-build-solution.md
@@ -154,7 +154,7 @@ RUN python -c "import psycopg2; print('psycopg2:', psycopg2.__version__)" && \
 | `.python-version` | Version pinning | Central Python version declaration |
 | `docker/base.Dockerfile` | Base image | Standardized base with all dependencies |
 | `docker/requirements-stable.txt` | Dependency management | Pinned versions for stability |
-| `docker-compose.stable.yml` | Stable builds | Override for consistent builds |
+| `compose.stable.yml` | Stable builds | Override for consistent builds |
 | `scripts/validate-stable-build.sh` | Build validation | Comprehensive testing script |
 | `.github/workflows/stable-build.yml` | CI/CD pipeline | Automated build testing |
 | `docs/stable-build-solution.md` | Documentation | This comprehensive guide |
@@ -175,13 +175,13 @@ RUN python -c "import psycopg2; print('psycopg2:', psycopg2.__version__)" && \
 #### **Local Build**
 ```bash
 # Build with stable configuration
-docker-compose -f docker-compose.yml -f docker-compose.stable.yml build
+docker compose -f compose.yml -f compose.stable.yml build
 
 # Validate builds
 ./scripts/validate-stable-build.sh
 
 # Start services
-docker-compose -f docker-compose.yml -f docker-compose.stable.yml up
+docker compose -f compose.yml -f compose.stable.yml up
 ```
 
 #### **Single Container Testing**
@@ -215,7 +215,7 @@ jobs:
       - uses: actions/checkout@v4
       - name: Build and validate
         run: |
-          docker-compose -f docker-compose.stable.yml build
+          docker compose -f compose.stable.yml build
           ./scripts/validate-stable-build.sh
 ```
 
@@ -328,14 +328,14 @@ pgrep -f "python.*api" >/dev/null || exit 1
 ```bash
 # Rollback to previous stable version
 docker tag ffmpeg-api:v1.0.0-stable-backup ffmpeg-api:latest
-docker-compose restart api
+docker compose restart api
 ```
 
 #### **Configuration Level**
 ```bash
 # Use old Dockerfile if needed
 cp docker/api/Dockerfile.old docker/api/Dockerfile
-docker-compose build api
+docker compose build api
 ```
 
 ### **Rollback Validation**
diff --git a/monitoring/alerts/production-alerts.yml b/monitoring/alerts/production-alerts.yml
index 35673a0..9f9b8fe 100644
--- a/monitoring/alerts/production-alerts.yml
+++ b/monitoring/alerts/production-alerts.yml
@@ -1,5 +1,9 @@
+# Prometheus Alerting Rules for FFmpeg API Production
+# SLO-based alerts with multi-window burn rate
+
 groups:
-  - name: ffmpeg-api-production
+  - name: ffmpeg_api_availability
+    interval: 30s
     rules:
       # High Priority Alerts
       - alert: APIHighErrorRate
diff --git a/monitoring/dashboards/ffmpeg-api-production.json b/monitoring/dashboards/ffmpeg-api-production.json
new file mode 100644
index 0000000..3fadd9f
--- /dev/null
+++ b/monitoring/dashboards/ffmpeg-api-production.json
@@ -0,0 +1,516 @@
+{
+  "dashboard": {
+    "id": null,
+    "uid": "ffmpeg-api-prod",
+    "title": "FFmpeg API - Production Operations",
+    "tags": ["ffmpeg", "api", "production", "sre"],
+    "timezone": "browser",
+    "schemaVersion": 38,
+    "version": 1,
+    "refresh": "10s",
+    "time": {
+      "from": "now-1h",
+      "to": "now"
+    },
+    "timepicker": {
+      "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h"]
+    },
+    "templating": {
+      "list": [
+        {
+          "name": "datasource",
+          "type": "datasource",
+          "query": "prometheus",
+          "current": {
+            "value": "Prometheus"
+          }
+        },
+        {
+          "name": "namespace",
+          "type": "query",
"datasource": "$datasource", + "query": "label_values(up{job=\"ffmpeg-api\"}, namespace)", + "current": { + "value": "default" + } + } + ] + }, + "panels": [ + { + "title": "๐Ÿšจ Service Health Overview", + "type": "stat", + "gridPos": {"x": 0, "y": 0, "w": 24, "h": 3}, + "id": 1, + "targets": [ + { + "expr": "up{job=\"ffmpeg-api\"}", + "legendFormat": "API", + "refId": "A" + }, + { + "expr": "up{job=\"ffmpeg-worker-cpu\"}", + "legendFormat": "CPU Workers", + "refId": "B" + }, + { + "expr": "up{job=\"ffmpeg-worker-gpu\"}", + "legendFormat": "GPU Workers", + "refId": "C" + }, + { + "expr": "up{job=\"postgres\"}", + "legendFormat": "Database", + "refId": "D" + }, + { + "expr": "up{job=\"redis\"}", + "legendFormat": "Queue", + "refId": "E" + } + ], + "fieldConfig": { + "defaults": { + "mappings": [ + {"type": "value", "value": "1", "text": "UP", "color": "green"}, + {"type": "value", "value": "0", "text": "DOWN", "color": "red"} + ], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": 0}, + {"color": "green", "value": 1} + ] + } + } + } + }, + { + "title": "๐Ÿ“Š Key Performance Indicators", + "type": "row", + "gridPos": {"x": 0, "y": 3, "w": 24, "h": 1}, + "id": 2, + "collapsed": false + }, + { + "title": "Request Rate", + "type": "graph", + "gridPos": {"x": 0, "y": 4, "w": 6, "h": 8}, + "id": 3, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=\"ffmpeg-api\"}[5m])) by (status)", + "legendFormat": "{{status}}xx" + } + ], + "yaxes": [{"format": "reqps", "label": "Requests/sec"}] + }, + { + "title": "Response Time (P95)", + "type": "graph", + "gridPos": {"x": 6, "y": 4, "w": 6, "h": 8}, + "id": 4, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"ffmpeg-api\"}[5m])) by (le, endpoint))", + "legendFormat": "{{endpoint}}" + } + ], + "yaxes": [{"format": "s", "label": "Response Time"}] + }, + { + "title": "Active Jobs", + "type": "graph", + "gridPos": {"x": 12, "y": 4, "w": 6, "h": 8}, + "id": 5, + "targets": [ + { + "expr": "ffmpeg_jobs_active{job=\"ffmpeg-api\"}", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "bars", + "fillOpacity": 80 + } + } + } + }, + { + "title": "Error Rate", + "type": "stat", + "gridPos": {"x": 18, "y": 4, "w": 6, "h": 8}, + "id": 6, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=\"ffmpeg-api\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"ffmpeg-api\"}[5m])) * 100", + "legendFormat": "Error %" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 5} + ] + } + } + } + }, + { + "title": "๐ŸŽฌ Video Processing Metrics", + "type": "row", + "gridPos": {"x": 0, "y": 12, "w": 24, "h": 1}, + "id": 7, + "collapsed": false + }, + { + "title": "Processing Queue Depth", + "type": "graph", + "gridPos": {"x": 0, "y": 13, "w": 8, "h": 8}, + "id": 8, + "targets": [ + { + "expr": "ffmpeg_queue_depth{job=\"ffmpeg-api\"}", + "legendFormat": "{{priority}} priority" + } + ], + "alert": { + "conditions": [ + { + "evaluator": {"params": [100], "type": "gt"}, + "operator": {"type": "and"}, + "query": {"params": ["A", "5m", "now"]}, + "reducer": {"params": [], "type": "avg"}, + "type": "query" + } + ], + "name": "High Queue Depth", + "message": "Processing queue depth exceeded 100 jobs" + } + }, + { + "title": "Job Processing Time", + "type": "heatmap", 
+ "gridPos": {"x": 8, "y": 13, "w": 8, "h": 8}, + "id": 9, + "targets": [ + { + "expr": "ffmpeg_job_duration_seconds_bucket{job=\"ffmpeg-worker-cpu\"}", + "format": "heatmap", + "legendFormat": "{{le}}" + } + ], + "dataFormat": "timeseries", + "yAxis": {"format": "s", "logBase": 2} + }, + { + "title": "GPU Utilization", + "type": "graph", + "gridPos": {"x": 16, "y": 13, "w": 8, "h": 8}, + "id": 10, + "targets": [ + { + "expr": "gpu_utilization_percent{job=\"ffmpeg-worker-gpu\"}", + "legendFormat": "GPU {{gpu_index}}" + } + ], + "yaxes": [{"format": "percent", "max": 100, "min": 0}] + }, + { + "title": "๐Ÿ’พ Resource Utilization", + "type": "row", + "gridPos": {"x": 0, "y": 21, "w": 24, "h": 1}, + "id": 11, + "collapsed": false + }, + { + "title": "CPU Usage by Service", + "type": "graph", + "gridPos": {"x": 0, "y": 22, "w": 6, "h": 8}, + "id": 12, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{pod=~\"ffmpeg-.*\"}[5m]) * 100", + "legendFormat": "{{pod}}" + } + ], + "yaxes": [{"format": "percent", "label": "CPU Usage"}] + }, + { + "title": "Memory Usage", + "type": "graph", + "gridPos": {"x": 6, "y": 22, "w": 6, "h": 8}, + "id": 13, + "targets": [ + { + "expr": "container_memory_usage_bytes{pod=~\"ffmpeg-.*\"} / 1024 / 1024 / 1024", + "legendFormat": "{{pod}}" + } + ], + "yaxes": [{"format": "GB", "label": "Memory"}] + }, + { + "title": "Storage I/O", + "type": "graph", + "gridPos": {"x": 12, "y": 22, "w": 6, "h": 8}, + "id": 14, + "targets": [ + { + "expr": "rate(container_fs_reads_bytes_total{pod=~\"ffmpeg-.*\"}[5m])", + "legendFormat": "{{pod}} read" + }, + { + "expr": "rate(container_fs_writes_bytes_total{pod=~\"ffmpeg-.*\"}[5m])", + "legendFormat": "{{pod}} write" + } + ], + "yaxes": [{"format": "Bps", "label": "I/O Rate"}] + }, + { + "title": "Network Traffic", + "type": "graph", + "gridPos": {"x": 18, "y": 22, "w": 6, "h": 8}, + "id": 15, + "targets": [ + { + "expr": "rate(container_network_receive_bytes_total{pod=~\"ffmpeg-.*\"}[5m])", + "legendFormat": "{{pod}} RX" + }, + { + "expr": "rate(container_network_transmit_bytes_total{pod=~\"ffmpeg-.*\"}[5m])", + "legendFormat": "{{pod}} TX" + } + ], + "yaxes": [{"format": "Bps", "label": "Network"}] + }, + { + "title": "๐Ÿ” Database Performance", + "type": "row", + "gridPos": {"x": 0, "y": 30, "w": 24, "h": 1}, + "id": 16, + "collapsed": false + }, + { + "title": "Database Connections", + "type": "graph", + "gridPos": {"x": 0, "y": 31, "w": 8, "h": 8}, + "id": 17, + "targets": [ + { + "expr": "pg_stat_database_numbackends{datname=\"ffmpeg_api\"}", + "legendFormat": "Active connections" + }, + { + "expr": "pg_settings_max_connections", + "legendFormat": "Max connections" + } + ] + }, + { + "title": "Query Performance", + "type": "graph", + "gridPos": {"x": 8, "y": 31, "w": 8, "h": 8}, + "id": 18, + "targets": [ + { + "expr": "rate(pg_stat_database_blks_hit{datname=\"ffmpeg_api\"}[5m]) / (rate(pg_stat_database_blks_hit{datname=\"ffmpeg_api\"}[5m]) + rate(pg_stat_database_blks_read{datname=\"ffmpeg_api\"}[5m])) * 100", + "legendFormat": "Cache hit ratio" + } + ], + "yaxes": [{"format": "percent", "max": 100, "min": 0}] + }, + { + "title": "Database Size", + "type": "stat", + "gridPos": {"x": 16, "y": 31, "w": 8, "h": 8}, + "id": 19, + "targets": [ + { + "expr": "pg_database_size_bytes{datname=\"ffmpeg_api\"} / 1024 / 1024 / 1024", + "legendFormat": "Database size" + } + ], + "fieldConfig": { + "defaults": { + "unit": "GB", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": 0}, + 
{"color": "yellow", "value": 50}, + {"color": "red", "value": 100} + ] + } + } + } + }, + { + "title": "๐Ÿ“ˆ Business Metrics", + "type": "row", + "gridPos": {"x": 0, "y": 39, "w": 24, "h": 1}, + "id": 20, + "collapsed": false + }, + { + "title": "Jobs by Status", + "type": "piechart", + "gridPos": {"x": 0, "y": 40, "w": 6, "h": 8}, + "id": 21, + "targets": [ + { + "expr": "sum(ffmpeg_jobs_total) by (status)", + "legendFormat": "{{status}}" + } + ] + }, + { + "title": "Processing Volume (GB/hour)", + "type": "stat", + "gridPos": {"x": 6, "y": 40, "w": 6, "h": 8}, + "id": 22, + "targets": [ + { + "expr": "sum(rate(ffmpeg_bytes_processed_total[1h])) / 1024 / 1024 / 1024", + "legendFormat": "GB/hour" + } + ], + "fieldConfig": { + "defaults": { + "unit": "GB/h", + "decimals": 2 + } + } + }, + { + "title": "API Key Usage", + "type": "table", + "gridPos": {"x": 12, "y": 40, "w": 12, "h": 8}, + "id": 23, + "targets": [ + { + "expr": "topk(10, sum by (api_key_hash) (rate(ffmpeg_api_requests_by_key_total[1h])))", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "renameByName": { + "api_key_hash": "API Key (hash)", + "Value": "Requests/hour" + } + } + } + ] + }, + { + "title": "๐Ÿšจ Alerts & SLIs", + "type": "row", + "gridPos": {"x": 0, "y": 48, "w": 24, "h": 1}, + "id": 24, + "collapsed": false + }, + { + "title": "SLI: Availability", + "type": "stat", + "gridPos": {"x": 0, "y": 49, "w": 6, "h": 4}, + "id": 25, + "targets": [ + { + "expr": "(1 - (sum(rate(http_requests_total{job=\"ffmpeg-api\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"ffmpeg-api\"}[5m])))) * 100", + "legendFormat": "Availability" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "decimals": 3, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": 0}, + {"color": "yellow", "value": 99}, + {"color": "green", "value": 99.9} + ] + } + } + } + }, + { + "title": "SLI: Latency (P99)", + "type": "stat", + "gridPos": {"x": 6, "y": 49, "w": 6, "h": 4}, + "id": 26, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=\"ffmpeg-api\"}[5m])) by (le))", + "legendFormat": "P99 Latency" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 5} + ] + } + } + } + }, + { + "title": "Error Budget Remaining", + "type": "gauge", + "gridPos": {"x": 12, "y": 49, "w": 6, "h": 4}, + "id": 27, + "targets": [ + { + "expr": "100 - ((1 - slo_availability_target) - (1 - availability_current)) / (1 - slo_availability_target) * 100" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": 0}, + {"color": "yellow", "value": 25}, + {"color": "green", "value": 50} + ] + } + } + } + }, + { + "title": "Active Alerts", + "type": "alertlist", + "gridPos": {"x": 18, "y": 49, "w": 6, "h": 4}, + "id": 28, + "options": { + "showOptions": "current", + "maxItems": 10, + "sortOrder": 1, + "dashboardAlerts": true, + "alertName": "", + "dashboardTitle": "", + "tags": ["ffmpeg-api"] + } + } + ] + } +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index aa99572..7070115 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,58 +1,60 @@ -# Core dependencies -fastapi==0.109.0 -uvicorn[standard]==0.25.0 
-pydantic==2.5.3 -pydantic-settings==2.1.0 -python-multipart==0.0.6 - -# Database -sqlalchemy==2.0.25 -asyncpg==0.29.0 # PostgreSQL async driver -psycopg2-binary==2.9.9 # PostgreSQL sync driver -alembic==1.13.1 -# aiosqlite==0.19.0 # SQLite support (optional for development) - -# Queue and Background Tasks -celery==5.3.4 -redis==5.0.1 +# Core Framework - Latest Stable +fastapi==0.115.4 +uvicorn[standard]==0.32.0 +pydantic==2.9.2 +pydantic-settings==2.6.1 +python-multipart==0.0.17 + +# Database - Production Ready +sqlalchemy[asyncio]==2.0.36 +asyncpg==0.30.0 +psycopg2-binary==2.9.9 +alembic==1.13.3 +aiosqlite==0.20.0 + +# Queue and Background Tasks - Stable +celery==5.4.0 +redis==5.2.0 flower==2.0.1 -# Storage backends -boto3==1.34.0 # S3 and S3-compatible storage -aiofiles==23.2.1 # Local filesystem async operations -# Note: Azure and GCS backends planned for future releases +# Storage Backends - Multi-cloud Support +boto3==1.35.67 +aiofiles==24.1.0 -# Media processing +# Media Processing - Core ffmpeg-python==0.2.0 -pillow==10.2.0 +pillow==11.0.0 -# API and networking -httpx==0.26.0 -aiohttp==3.9.1 -websockets==12.0 +# HTTP Client & Networking +httpx==0.28.1 +aiohttp==3.10.10 +websockets==13.1 -# Monitoring and logging -prometheus-client==0.19.0 -structlog==24.1.0 +# Monitoring & Observability +prometheus-client==0.21.0 +structlog==24.4.0 python-json-logger==2.0.7 -# Utilities -pyyaml==6.0.1 -python-dotenv==1.0.0 +# Utilities & CLI +pyyaml==6.0.2 +python-dotenv==1.0.1 click==8.1.7 -rich==13.7.0 -humanize==4.9.0 +rich==13.9.4 +humanize==4.11.0 -# Security +# Security & Authentication passlib[bcrypt]==1.7.4 python-jose[cryptography]==3.3.0 -cryptography==41.0.7 - -# Development dependencies (optional) -pytest==7.4.4 -pytest-asyncio==0.23.3 -pytest-cov==4.1.0 -black==23.12.1 -flake8==7.0.0 -mypy==1.8.0 -pre-commit==3.6.0 \ No newline at end of file +cryptography==43.0.3 + +# Production Server +gunicorn==23.0.0 + +# Development & Testing (Optional) +pytest==8.3.4 +pytest-asyncio==0.24.0 +pytest-cov==6.0.0 +black==24.10.0 +flake8==7.1.1 +mypy==1.13.0 +pre-commit==4.0.1 \ No newline at end of file diff --git a/scripts/backup-postgres.sh b/scripts/backup-postgres.sh new file mode 100644 index 0000000..bb59aa3 --- /dev/null +++ b/scripts/backup-postgres.sh @@ -0,0 +1,218 @@ +#!/bin/bash +# PostgreSQL Backup Script with S3 Upload +# Production-grade backup with encryption and retention + +set -euo pipefail + +# Configuration +BACKUP_DIR="${BACKUP_DIR:-/backup/postgres}" +S3_BUCKET="${S3_BUCKET:-ffmpeg-api-backups}" +RETENTION_DAYS="${RETENTION_DAYS:-30}" +ENCRYPTION_KEY="${BACKUP_ENCRYPTION_KEY:-}" +SLACK_WEBHOOK="${SLACK_WEBHOOK:-}" + +# Database connection +DB_HOST="${POSTGRES_HOST:-postgres}" +DB_PORT="${POSTGRES_PORT:-5432}" +DB_NAME="${POSTGRES_DB:-ffmpeg_api}" +DB_USER="${POSTGRES_USER:-ffmpeg_user}" +export PGPASSWORD="${POSTGRES_PASSWORD}" + +# Timestamp +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +BACKUP_NAME="ffmpeg_api_backup_${TIMESTAMP}" + +# Logging +log() { + echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" +} + +error() { + echo "[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1" >&2 +} + +send_alert() { + local status=$1 + local message=$2 + + if [ -n "$SLACK_WEBHOOK" ]; then + curl -X POST -H 'Content-type: application/json' \ + --data "{\"text\":\"Backup $status: $message\"}" \ + "$SLACK_WEBHOOK" 2>/dev/null || true + fi +} + +# Pre-flight checks +check_requirements() { + log "Checking requirements..." + + for cmd in pg_dump aws gpg gzip; do + if ! 
command -v $cmd &> /dev/null; then
+            error "$cmd is required but not installed"
+            exit 1
+        fi
+    done
+    
+    # Check database connectivity
+    if ! pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME"; then
+        error "Cannot connect to database"
+        exit 1
+    fi
+    
+    # Check S3 access
+    if ! aws s3 ls "s3://$S3_BUCKET" &> /dev/null; then
+        error "Cannot access S3 bucket: $S3_BUCKET"
+        exit 1
+    fi
+}
+
+# Perform backup
+perform_backup() {
+    log "Starting backup of $DB_NAME..."
+    
+    # Create backup directory
+    mkdir -p "$BACKUP_DIR"
+    cd "$BACKUP_DIR"
+    
+    # Dump database in custom format (parallel --jobs is only supported for directory-format dumps)
+    log "Dumping database..."
+    pg_dump \
+        -h "$DB_HOST" \
+        -p "$DB_PORT" \
+        -U "$DB_USER" \
+        -d "$DB_NAME" \
+        --format=custom \
+        --verbose \
+        --no-password \
+        --file="${BACKUP_NAME}.dump" \
+        2>&1 | tee "${BACKUP_NAME}.log"
+    
+    # Verify dump
+    log "Verifying backup..."
+    pg_restore --list "${BACKUP_NAME}.dump" > /dev/null
+    
+    # Get backup size
+    BACKUP_SIZE=$(du -h "${BACKUP_NAME}.dump" | cut -f1)
+    log "Backup size: $BACKUP_SIZE"
+    
+    # Compress backup
+    log "Compressing backup..."
+    gzip -9 "${BACKUP_NAME}.dump"
+    
+    # Encrypt if key provided
+    if [ -n "$ENCRYPTION_KEY" ]; then
+        log "Encrypting backup..."
+        echo "$ENCRYPTION_KEY" | gpg --batch --yes --passphrase-fd 0 \
+            --cipher-algo AES256 \
+            --symmetric \
+            --output "${BACKUP_NAME}.dump.gz.gpg" \
+            "${BACKUP_NAME}.dump.gz"
+        rm "${BACKUP_NAME}.dump.gz"
+        BACKUP_FILE="${BACKUP_NAME}.dump.gz.gpg"
+    else
+        BACKUP_FILE="${BACKUP_NAME}.dump.gz"
+    fi
+}
+
+# Upload to S3
+upload_to_s3() {
+    log "Uploading to S3..."
+    
+    # Upload with metadata
+    aws s3 cp "$BACKUP_FILE" "s3://$S3_BUCKET/postgres/$BACKUP_FILE" \
+        --storage-class STANDARD_IA \
+        --metadata "timestamp=$TIMESTAMP,database=$DB_NAME,size=$BACKUP_SIZE" \
+        --only-show-errors
+    
+    # Upload log file
+    aws s3 cp "${BACKUP_NAME}.log" "s3://$S3_BUCKET/postgres/logs/${BACKUP_NAME}.log" \
+        --only-show-errors
+    
+    # Verify upload
+    if aws s3 ls "s3://$S3_BUCKET/postgres/$BACKUP_FILE" &> /dev/null; then
+        log "Upload successful"
+    else
+        error "Upload verification failed"
+        return 1
+    fi
+}
+
+# Clean up old backups
+cleanup_old_backups() {
+    log "Cleaning up old backups (retention: $RETENTION_DAYS days)..."
+    
+    # Local cleanup
+    find "$BACKUP_DIR" -name "*.dump.gz*" -mtime +$RETENTION_DAYS -delete 2>/dev/null || true
+    find "$BACKUP_DIR" -name "*.log" -mtime +$RETENTION_DAYS -delete 2>/dev/null || true
+    
+    # S3 lifecycle rules should handle S3 cleanup, but we can also do it here
+    CUTOFF_DATE=$(date -d "$RETENTION_DAYS days ago" +%Y-%m-%d)
+    
+    aws s3api list-objects-v2 \
+        --bucket "$S3_BUCKET" \
+        --prefix "postgres/" \
+        --query "Contents[?LastModified<='$CUTOFF_DATE'].Key" \
+        --output text | \
+    while read -r key; do
+        if [ -n "$key" ]; then
+            log "Deleting old backup: $key"
+            aws s3 rm "s3://$S3_BUCKET/$key" --only-show-errors
+        fi
+    done
+}
+
+# Create backup manifest
+create_manifest() {
+    log "Creating backup manifest..."
+
+    cat > "${BACKUP_NAME}.manifest.json" <<EOF
+{
+    "backup_name": "${BACKUP_NAME}",
+    "timestamp": "${TIMESTAMP}",
+    "database": "${DB_NAME}",
+    "size": "${BACKUP_SIZE}",
+    "checksum": "$(sha256sum "$BACKUP_FILE" | cut -d' ' -f1 2>/dev/null || echo 'unknown')"
+}
+EOF
+
+    # Upload manifest
+    aws s3 cp "${BACKUP_NAME}.manifest.json" "s3://$S3_BUCKET/postgres/manifests/${BACKUP_NAME}.manifest.json" \
+        --only-show-errors
+}
+
+# Main execution
+main() {
+    log "=== PostgreSQL Backup Script ==="
+    log "Database: $DB_NAME"
+    log "Backup location: s3://$S3_BUCKET/postgres/"
+    
+    # Trap errors
+    trap 'error "Backup failed"; send_alert "FAILED" "Database backup failed for $DB_NAME"; exit 1' ERR
+    
+    # Execute backup steps
+    check_requirements
+    perform_backup
+    upload_to_s3
+    create_manifest
+    cleanup_old_backups
+    
+    # Clean up local files
+    log "Cleaning up local files..."
+    rm -f "${BACKUP_NAME}".*
+    
+    # Success notification
+    log "Backup completed successfully!"
+    send_alert "SUCCESS" "Database backup completed for $DB_NAME (Size: $BACKUP_SIZE)"
+    
+    # Output for automation
+    echo "BACKUP_FILE=$BACKUP_FILE"
+    echo "BACKUP_LOCATION=s3://$S3_BUCKET/postgres/$BACKUP_FILE"
+}
+
+# Run main function
+main "$@"
\ No newline at end of file
diff --git a/scripts/disaster-recovery.sh b/scripts/disaster-recovery.sh
new file mode 100644
index 0000000..05ae95b
--- /dev/null
+++ b/scripts/disaster-recovery.sh
@@ -0,0 +1,341 @@
+#!/bin/bash
+# Disaster Recovery Script for FFmpeg API
+# Automated recovery from backups with validation
+
+set -euo pipefail
+
+# Configuration
+S3_BUCKET="${S3_BUCKET:-ffmpeg-api-backups}"
+RESTORE_DIR="${RESTORE_DIR:-/tmp/restore}"
+TARGET_DB_HOST="${TARGET_DB_HOST:-postgres}"
+TARGET_DB_PORT="${TARGET_DB_PORT:-5432}"
+TARGET_DB_NAME="${TARGET_DB_NAME:-ffmpeg_api}"
+TARGET_DB_USER="${TARGET_DB_USER:-ffmpeg_user}"
+export PGPASSWORD="${POSTGRES_PASSWORD}"
+
+# Recovery options
+RECOVERY_MODE="${RECOVERY_MODE:-latest}"  # latest, specific, point-in-time
+RECOVERY_TIMESTAMP="${RECOVERY_TIMESTAMP:-}"
+ENCRYPTION_KEY="${BACKUP_ENCRYPTION_KEY:-}"
+VERIFY_ONLY="${VERIFY_ONLY:-false}"
+
+# Logging
+log() {
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1"
+}
+
+error() {
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1" >&2
+}
+
+# Find available backups
+list_backups() {
+    log "Listing available backups..."
+    
+    aws s3api list-objects-v2 \
+        --bucket "$S3_BUCKET" \
+        --prefix "postgres/ffmpeg_api_backup_" \
+        --query "Contents[?ends_with(Key, '.dump.gz') || ends_with(Key, '.dump.gz.gpg')].[Key,LastModified,Size]" \
+        --output table
+}
+
+# Get latest backup
+get_latest_backup() {
+    aws s3api list-objects-v2 \
+        --bucket "$S3_BUCKET" \
+        --prefix "postgres/ffmpeg_api_backup_" \
+        --query "Contents[?ends_with(Key, '.dump.gz') || ends_with(Key, '.dump.gz.gpg')] | sort_by(@, &LastModified) | [-1].Key" \
+        --output text
+}
+
+# Download backup
+download_backup() {
+    local backup_key=$1
+    local backup_file=$(basename "$backup_key")
+    
+    log "Downloading backup: $backup_key"
+    mkdir -p "$RESTORE_DIR"
+    
+    aws s3 cp "s3://$S3_BUCKET/$backup_key" "$RESTORE_DIR/$backup_file" \
+        --only-show-errors
+    
+    # Download manifest if exists
+    local manifest_key="${backup_key%.dump.gz*}.manifest.json"
+    manifest_key="postgres/manifests/$(basename "$manifest_key")"
+    
+    if aws s3 ls "s3://$S3_BUCKET/$manifest_key" &> /dev/null; then
+        log "Downloading manifest..."
+        aws s3 cp "s3://$S3_BUCKET/$manifest_key" "$RESTORE_DIR/" --only-show-errors
+        
+        # Display manifest info
+        log "Backup information:"
+        jq . 
"$RESTORE_DIR/$(basename "$manifest_key")" + fi + + echo "$backup_file" +} + +# Decrypt backup if needed +decrypt_backup() { + local backup_file=$1 + + if [[ $backup_file == *.gpg ]]; then + if [ -z "$ENCRYPTION_KEY" ]; then + error "Backup is encrypted but no encryption key provided" + exit 1 + fi + + log "Decrypting backup..." + local decrypted_file="${backup_file%.gpg}" + + echo "$ENCRYPTION_KEY" | gpg --batch --yes --passphrase-fd 0 \ + --decrypt "$RESTORE_DIR/$backup_file" > "$RESTORE_DIR/$decrypted_file" + + rm "$RESTORE_DIR/$backup_file" + echo "$decrypted_file" + else + echo "$backup_file" + fi +} + +# Decompress backup +decompress_backup() { + local backup_file=$1 + + log "Decompressing backup..." + gunzip "$RESTORE_DIR/$backup_file" + + echo "${backup_file%.gz}" +} + +# Verify backup integrity +verify_backup() { + local dump_file=$1 + + log "Verifying backup integrity..." + + if pg_restore --list "$RESTORE_DIR/$dump_file" > /dev/null 2>&1; then + log "Backup verification passed" + + # Count objects + local table_count=$(pg_restore --list "$RESTORE_DIR/$dump_file" | grep -c "TABLE DATA" || true) + local index_count=$(pg_restore --list "$RESTORE_DIR/$dump_file" | grep -c "INDEX" || true) + log "Found $table_count tables and $index_count indexes" + + return 0 + else + error "Backup verification failed" + return 1 + fi +} + +# Prepare target database +prepare_database() { + log "Preparing target database..." + + # Check if database exists + if psql -h "$TARGET_DB_HOST" -p "$TARGET_DB_PORT" -U "$TARGET_DB_USER" -lqt | cut -d \| -f 1 | grep -qw "$TARGET_DB_NAME"; then + log "Target database exists" + + if [ "$VERIFY_ONLY" != "true" ]; then + read -p "Database $TARGET_DB_NAME exists. Drop and recreate? (y/N) " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + log "Dropping existing database..." + psql -h "$TARGET_DB_HOST" -p "$TARGET_DB_PORT" -U "$TARGET_DB_USER" -c "DROP DATABASE IF EXISTS $TARGET_DB_NAME;" + psql -h "$TARGET_DB_HOST" -p "$TARGET_DB_PORT" -U "$TARGET_DB_USER" -c "CREATE DATABASE $TARGET_DB_NAME OWNER $TARGET_DB_USER;" + else + log "Aborting restore" + exit 1 + fi + fi + else + log "Creating target database..." + psql -h "$TARGET_DB_HOST" -p "$TARGET_DB_PORT" -U "$TARGET_DB_USER" -c "CREATE DATABASE $TARGET_DB_NAME OWNER $TARGET_DB_USER;" + fi +} + +# Restore database +restore_database() { + local dump_file=$1 + + log "Starting database restore..." + + # Create restore script + cat > "$RESTORE_DIR/restore.sh" <<'EOF' +#!/bin/bash +set -e +pg_restore \ + --host="$1" \ + --port="$2" \ + --username="$3" \ + --dbname="$4" \ + --no-password \ + --verbose \ + --clean \ + --if-exists \ + --no-owner \ + --no-privileges \ + --jobs=4 \ + "$5" 2>&1 | while read line; do + echo "[RESTORE] $line" + done +EOF + chmod +x "$RESTORE_DIR/restore.sh" + + # Execute restore + if "$RESTORE_DIR/restore.sh" "$TARGET_DB_HOST" "$TARGET_DB_PORT" "$TARGET_DB_USER" "$TARGET_DB_NAME" "$RESTORE_DIR/$dump_file"; then + log "Database restore completed successfully" + else + error "Database restore failed" + return 1 + fi +} + +# Post-restore validation +validate_restore() { + log "Validating restored database..." 
+ + # Check table counts + local table_count=$(psql -h "$TARGET_DB_HOST" -p "$TARGET_DB_PORT" -U "$TARGET_DB_USER" -d "$TARGET_DB_NAME" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';") + log "Restored tables: $table_count" + + # Check critical tables + for table in jobs api_keys alembic_version; do + if psql -h "$TARGET_DB_HOST" -p "$TARGET_DB_PORT" -U "$TARGET_DB_USER" -d "$TARGET_DB_NAME" -c "SELECT 1 FROM $table LIMIT 1;" &> /dev/null; then + log "โœ“ Table $table exists and is accessible" + else + error "โœ— Table $table is missing or inaccessible" + fi + done + + # Check row counts + log "Row counts:" + psql -h "$TARGET_DB_HOST" -p "$TARGET_DB_PORT" -U "$TARGET_DB_USER" -d "$TARGET_DB_NAME" < "$RESTORE_DIR/recovery-plan.md" </dev/null" + run_check "Main compose.yml syntax" \ + "docker-compose -f '$PROJECT_ROOT/compose.yml' config >/dev/null" # Check production compose file - run_check "Production docker-compose.prod.yml syntax" \ - "docker-compose -f '$PROJECT_ROOT/docker-compose.prod.yml' config >/dev/null" + run_check "Production compose.prod.yml syntax" \ + "docker-compose -f '$PROJECT_ROOT/compose.prod.yml' config >/dev/null" # Check GenAI compose file with base run_check "GenAI docker-compose.genai.yml as override" \ - "docker-compose -f '$PROJECT_ROOT/docker-compose.yml' -f '$PROJECT_ROOT/docker-compose.genai.yml' config >/dev/null" + "docker-compose -f '$PROJECT_ROOT/compose.yml' -f '$PROJECT_ROOT/docker-compose.genai.yml' config >/dev/null" # Note: Main and production configs are designed to be used separately # They have conflicting service definitions by design diff --git a/scripts/validate-production.sh b/scripts/validate-production.sh index c1359f3..e6121ba 100755 --- a/scripts/validate-production.sh +++ b/scripts/validate-production.sh @@ -79,7 +79,7 @@ echo "" echo "1. Checking Required Files" echo "--------------------------" check_file ".env" "Environment configuration file exists" -check_file "docker-compose.yml" "Docker Compose file exists" +check_file "compose.yml" "Docker Compose file exists" check_file "config/storage.yml" "Storage configuration exists" echo "" diff --git a/scripts/validate-stable-build.sh b/scripts/validate-stable-build.sh index 2601712..0d610a1 100755 --- a/scripts/validate-stable-build.sh +++ b/scripts/validate-stable-build.sh @@ -195,7 +195,7 @@ fi # Test Docker Compose build log "๐Ÿณ Testing Docker Compose stable build..." -if docker-compose -f docker-compose.yml -f docker-compose.stable.yml build >> "$LOG_FILE" 2>&1; then +if docker-compose -f compose.yml -f compose.stable.yml build >> "$LOG_FILE" 2>&1; then success "Docker Compose stable build successful" else error "Docker Compose stable build failed" diff --git a/scripts/verify-deployment.sh b/scripts/verify-deployment.sh index ac4777b..e6f69b5 100755 --- a/scripts/verify-deployment.sh +++ b/scripts/verify-deployment.sh @@ -10,7 +10,7 @@ echo "=====================================" echo "๐Ÿ“‹ Checking required files..." REQUIRED_FILES=( - "docker-compose.yml" + "compose.yml" "docker-compose.genai.yml" ".env.example" "requirements.txt" @@ -88,13 +88,13 @@ done echo "๐Ÿณ Validating Docker Compose files..." 
if docker-compose config >/dev/null 2>&1; then - echo "✅ docker-compose.yml syntax is valid" + echo "✅ compose.yml syntax is valid" else - echo "❌ docker-compose.yml has syntax errors" + echo "❌ compose.yml has syntax errors" exit 1 fi -if docker-compose -f docker-compose.yml -f docker-compose.genai.yml config >/dev/null 2>&1; then +if docker-compose -f compose.yml -f docker-compose.genai.yml config >/dev/null 2>&1; then echo "✅ docker-compose.genai.yml syntax is valid" else echo "❌ docker-compose.genai.yml has syntax errors" exit 1 fi diff --git a/setup.sh b/setup.sh index 85efdc0..93c5d93 100755 --- a/setup.sh +++ b/setup.sh @@ -11,15 +11,34 @@ echo "==========================" # Function to show usage show_usage() { - echo "Usage: $0 [--development|--standard|--genai|--status|--stop]" + echo "Usage: $0 [OPTION]" + echo "" + echo "🚀 Rendiff FFmpeg API - Production-Ready Setup Script" + echo "" + echo "Deployment Options:" + echo " --development 🛠️ Fast local development (SQLite, debug mode, no auth)" + echo " --standard 🏭 Production CPU setup (PostgreSQL, Redis, monitoring)" + echo " --gpu 🎮 GPU-accelerated setup (NVIDIA hardware acceleration)" + echo "" + echo "Management Options:" + echo " --status 📊 Show current deployment status" + echo " --stop 🛑 Stop all running services" + echo " --clean 🧹 Complete cleanup (stops services, removes volumes)" + echo " --help 📖 Show this help message" + echo "" + echo "Examples:" + echo " $0 --development # Quick 60-second setup for local development" + echo " $0 --standard # Production setup with PostgreSQL and monitoring" + echo " $0 --gpu # GPU setup with NVIDIA acceleration" + echo " $0 --status # Check what's currently running" + echo "" + echo "🌐 Access URLs (when running):" + echo " • API: http://localhost:8000" + echo " • Docs: http://localhost:8000/docs" + echo " • Health: http://localhost:8000/api/v1/health" + echo " • Prometheus: http://localhost:9090 (standard/gpu only)" + echo " • Grafana: http://localhost:3000 (standard/gpu only)" echo "" - echo "Options:" - echo " --development Quick local development setup (SQLite, no auth)" - echo " --standard Production setup (PostgreSQL, Redis, auth)" - echo " --genai AI-enhanced setup (GPU support)" - echo " --status Show current status" - echo " --stop Stop all services" - echo " --help Show this help" exit 1 } @@ -44,64 +63,52 @@ check_requirements() { setup_development() { echo "🛠️ Setting up Development Environment..."
- # Create development .env file - cat > .env << EOF -# Development Configuration -DATABASE_URL=sqlite+aiosqlite:///data/rendiff.db -REDIS_URL=redis://redis:6379/0 + # Create development environment file + cat > .env.dev << EOF +# Development Configuration - Fast Local Setup +DEBUG=true +TESTING=false + +# API Configuration API_HOST=0.0.0.0 API_PORT=8000 -DEBUG=true -LOG_LEVEL=debug +API_LOG_LEVEL=debug +API_WORKERS=1 + +# Database (SQLite for simplicity) +DATABASE_URL=sqlite+aiosqlite:///data/rendiff.db + +# Queue (Redis) +REDIS_URL=redis://redis:6379/0 + +# Storage STORAGE_PATH=./storage -CORS_ORIGINS=http://localhost:8000,http://127.0.0.1:8000 +TEMP_PATH=/tmp/rendiff + +# Security (Disabled for development) ENABLE_API_KEYS=false +ENABLE_RATE_LIMITING=false +API_CORS_ORIGINS=http://localhost:8000,http://127.0.0.1:8000,http://localhost:3000 + +# FFmpeg +FFMPEG_HARDWARE_ACCELERATION=auto +FFMPEG_THREADS=2 + +# Worker +WORKER_CONCURRENCY=2 + +# Development passwords POSTGRES_PASSWORD=dev_password_123 GRAFANA_PASSWORD=admin EOF - # Create minimal docker-compose for development - cat > docker-compose.dev.yml << EOF -services: - # Redis for queue - redis: - image: redis:7-alpine - container_name: ffmpeg_dev_redis - ports: - - "6379:6379" - command: redis-server --appendonly yes - volumes: - - redis_dev_data:/data - - # Simple API service - api: - build: - context: . - dockerfile: docker/api/Dockerfile - container_name: ffmpeg_dev_api - ports: - - "8000:8000" - environment: - - DATABASE_URL=sqlite+aiosqlite:///data/rendiff.db - - REDIS_URL=redis://redis:6379/0 - - DEBUG=true - - ENABLE_API_KEYS=false - volumes: - - ./storage:/storage - - ./data:/data - depends_on: - - redis - command: python -m uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload - -volumes: - redis_dev_data: -EOF - + ln -sf .env.dev .env + echo "📁 Creating directories..." - mkdir -p storage data logs + mkdir -p storage data logs config echo "🐳 Starting development services..." - docker compose -f docker-compose.dev.yml up -d + docker compose up -d redis api echo "" echo "✅ Development setup complete!" @@ -113,20 +120,151 @@ EOF echo "📝 To stop: ./setup.sh --stop" } +# Function for standard production setup +setup_standard() { + echo "🏭 Setting up Standard Production Environment..." + + # Check if .env.example exists + if [ ! -f ".env.example" ]; then + echo "❌ .env.example not found! Please ensure it exists." + exit 1 + fi + + # Create production environment file + if [ ! -f ".env" ]; then + echo "📋 Creating production .env file from template..." + cp .env.example .env + echo "" + echo "⚠️ IMPORTANT: Edit .env file with your production values:" + echo " - Set secure passwords for POSTGRES_PASSWORD and GRAFANA_PASSWORD" + echo " - Configure API_CORS_ORIGINS for your domain" + echo " - Set ADMIN_API_KEYS for API access" + echo "" + read -p "Press Enter after editing .env file..." + fi + + echo "📁 Creating directories..." + mkdir -p storage data/postgres data/redis data/prometheus data/grafana logs config + + echo "🐳 Starting production services..." + COMPOSE_PROFILES=standard docker compose up -d + + echo "" + echo "✅ Standard production setup complete!"
+ echo "" + echo "๐ŸŒ API available at: http://localhost:8000" + echo "๐Ÿ“š API docs at: http://localhost:8000/docs" + echo "๐Ÿ“Š Prometheus at: http://localhost:9090" + echo "๐Ÿ“ˆ Grafana at: http://localhost:3000" + echo "" + echo "๐Ÿ“ To stop: ./setup.sh --stop" +} + +# Function for GPU-accelerated setup +setup_gpu() { + echo "๐ŸŽฎ Setting up GPU-Accelerated Environment..." + + # Check for NVIDIA Docker runtime + if ! docker info 2>/dev/null | grep -q nvidia; then + echo "โš ๏ธ NVIDIA Docker runtime not detected." + echo "๐Ÿ“– For GPU acceleration, install:" + echo " 1. NVIDIA drivers" + echo " 2. NVIDIA Container Toolkit" + echo " 3. Configure Docker to use nvidia runtime" + echo "" + read -p "Continue anyway? (y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + exit 1 + fi + fi + + # Check if .env.example exists + if [ ! -f ".env.example" ]; then + echo "โŒ .env.example not found! Please ensure it exists." + exit 1 + fi + + # Create GPU environment file + if [ ! -f ".env" ]; then + echo "๐Ÿ“‹ Creating GPU .env file from template..." + cp .env.example .env + echo "" + echo "โš ๏ธ IMPORTANT: Edit .env file with your production values and GPU settings:" + echo " - Set secure passwords" + echo " - Configure API_CORS_ORIGINS for your domain" + echo " - Set ADMIN_API_KEYS for API access" + echo " - Verify GPU worker settings" + echo "" + read -p "Press Enter after editing .env file..." + fi + + echo "๐Ÿ“ Creating directories..." + mkdir -p storage data/postgres data/redis data/prometheus data/grafana logs config + + echo "๐Ÿณ Starting GPU-accelerated services..." + COMPOSE_PROFILES=gpu,monitoring docker compose up -d + + echo "" + echo "โœ… GPU-accelerated setup complete!" + echo "" + echo "๐ŸŒ API available at: http://localhost:8000" + echo "๐Ÿ“š API docs at: http://localhost:8000/docs" + echo "๐Ÿ“Š Prometheus at: http://localhost:9090" + echo "๐Ÿ“ˆ Grafana at: http://localhost:3000" + echo "๐ŸŽฎ GPU workers enabled for hardware acceleration" + echo "" + echo "๐Ÿ“ To stop: ./setup.sh --stop" +} + # Function to show status show_status() { echo "๐Ÿ“Š Current Status:" echo "==================" - if docker compose -f docker-compose.dev.yml ps 2>/dev/null | grep -q "Up"; then - echo "๐ŸŸข Development environment is running" - docker compose -f docker-compose.dev.yml ps + # Check which environment is running + if docker compose ps 2>/dev/null | grep -q "Up"; then + echo "๐ŸŸข FFmpeg API is running" + echo "" + docker compose ps echo "" echo "๐ŸŒ Access URLs:" echo " API: http://localhost:8000" echo " Docs: http://localhost:8000/docs" + echo " Health: http://localhost:8000/api/v1/health" + + # Check if monitoring is enabled + if docker compose ps prometheus 2>/dev/null | grep -q "Up"; then + echo " Prometheus: http://localhost:9090" + fi + if docker compose ps grafana 2>/dev/null | grep -q "Up"; then + echo " Grafana: http://localhost:3000" + fi + + # Check active profiles + if [ -f ".env" ]; then + echo "" + echo "๐Ÿ“‹ Current Configuration:" + if grep -q "DEBUG=true" .env 2>/dev/null; then + echo " Mode: Development" + else + echo " Mode: Production" + fi + + if docker compose ps worker-gpu 2>/dev/null | grep -q "Up"; then + echo " GPU: Enabled" + else + echo " GPU: Disabled" + fi + fi + else - echo "๐Ÿ”ด Development environment is not running" + echo "๐Ÿ”ด FFmpeg API is not running" + echo "" + echo "๐Ÿš€ To start:" + echo " Development: ./setup.sh --development" + echo " Production: ./setup.sh --standard" + echo " GPU: ./setup.sh --gpu" fi } @@ -134,11 +272,36 @@ 
show_status() { stop_services() { echo "🛑 Stopping services..." - if [ -f "docker-compose.dev.yml" ]; then - docker compose -f docker-compose.dev.yml down + # Stop all possible configurations + docker compose down --remove-orphans 2>/dev/null || true + + # Clean up development files + if [ -f "compose.dev.yml" ]; then + docker compose -f compose.dev.yml down 2>/dev/null || true + rm -f compose.dev.yml + fi + + # Clean up environment symlinks + if [ -L ".env" ]; then + rm -f .env fi - echo "✅ Services stopped" + echo "✅ Services stopped and cleaned up" +} + +# Function to clean up everything +cleanup_all() { + echo "🧹 Cleaning up everything..." + + stop_services + + echo "🗑️ Removing volumes..." + docker volume prune -f 2>/dev/null || true + + echo "🗑️ Removing temporary files..." + rm -rf data/ logs/ .env.dev compose.dev.yml 2>/dev/null || true + + echo "✅ Complete cleanup finished" } # Parse command line arguments @@ -148,14 +311,12 @@ case "${1:-}" in setup_development ;; --standard|--prod) - echo "🚧 Standard/Production setup not implemented yet" - echo "💡 Use --development for now" - exit 1 + check_requirements + setup_standard ;; - --genai|--ai) - echo "🚧 GenAI setup not implemented yet" - echo "💡 Use --development for now" - exit 1 + --gpu|--hardware) + check_requirements + setup_gpu ;; --status) show_status @@ -163,6 +324,9 @@ case "${1:-}" in --stop) stop_services ;; + --clean|--cleanup) + cleanup_all + ;; --help|-h) show_usage ;; diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..de16586 --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,553 @@ +""" +Integration tests for FFmpeg API +Tests end-to-end workflows and component interactions +""" +import asyncio +import json +import os +import tempfile +from pathlib import Path +from typing import AsyncGenerator +import pytest +import httpx +from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine +from sqlalchemy.orm import sessionmaker + +from api.main import app +from api.models.database import Base +from api.models.job import Job, JobStatus +from api.config import settings + + +@pytest.fixture(scope="session") +async def test_engine(): + """Create test database engine.""" + test_db_url = settings.DATABASE_URL.replace("ffmpeg_api", "ffmpeg_api_test") + engine = create_async_engine(test_db_url, echo=True) + + # Create tables + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + + yield engine + + # Cleanup + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + await engine.dispose() + + +@pytest.fixture +async def test_db(test_engine) -> AsyncGenerator[AsyncSession, None]: + """Create test database session.""" + TestSessionLocal = sessionmaker( + test_engine, class_=AsyncSession, expire_on_commit=False + ) + + async with TestSessionLocal() as session: + yield session + + +@pytest.fixture +async def test_client(): + """Create test HTTP client.""" + async with httpx.AsyncClient(app=app, base_url="http://test") as client: + yield client + + +@pytest.fixture +def sample_video(): + """Create a sample video file for testing.""" + # Create a minimal test video using FFmpeg + test_dir = Path(tempfile.gettempdir()) / "ffmpeg_test" + test_dir.mkdir(exist_ok=True) + + video_path = test_dir / "test_video.mp4" + + # Generate test video (5 seconds, 480p) + os.system(f""" + ffmpeg -f lavfi -i testsrc=duration=5:size=640x480:rate=30 \ + -f lavfi -i sine=frequency=1000:duration=5 \
-c:v libx264 -preset ultrafast -crf 30 \ + -c:a aac -shortest -y {video_path} + """) + + yield str(video_path) + + # Cleanup + if video_path.exists(): + video_path.unlink() + + +class TestAPIEndpoints: + """Test API endpoints functionality.""" + + @pytest.mark.asyncio + async def test_health_check(self, test_client): + """Test health check endpoint.""" + response = await test_client.get("/api/v1/health") + assert response.status_code == 200 + + data = response.json() + assert data["status"] == "healthy" + assert "timestamp" in data + assert "components" in data + + # Check component health + components = data["components"] + assert "database" in components + assert "queue" in components + assert "storage" in components + + @pytest.mark.asyncio + async def test_capabilities_endpoint(self, test_client): + """Test capabilities discovery.""" + response = await test_client.get("/api/v1/capabilities") + assert response.status_code == 200 + + data = response.json() + assert "formats" in data + assert "codecs" in data + assert "hardware_acceleration" in data + + # Verify format support + formats = data["formats"] + assert "input" in formats + assert "output" in formats + assert "mp4" in formats["input"] + assert "mp4" in formats["output"] + + @pytest.mark.asyncio + async def test_job_creation_without_api_key(self, test_client): + """Test job creation fails without API key.""" + request_data = { + "input": "/test/input.mp4", + "output": "/test/output.mp4" + } + + response = await test_client.post("/api/v1/convert", json=request_data) + assert response.status_code == 401 + + data = response.json() + assert "API key required" in data["detail"] + + +class TestJobWorkflow: + """Test complete job processing workflow.""" + + @pytest.mark.asyncio + async def test_simple_conversion_job(self, test_client, test_db, sample_video): + """Test basic video conversion workflow.""" + # Create API key first + api_key_response = await test_client.post( + "/api/v1/admin/api-keys", + json={"name": "test-key", "permissions": ["convert"]}, + headers={"X-API-Key": "test-admin-key"} + ) + + if api_key_response.status_code == 201: + api_key = api_key_response.json()["api_key"] + else: + api_key = "test-api-key" # Use default for testing + + # Create conversion job + request_data = { + "input": sample_video, + "output": "/tmp/output.webm", + "operations": [ + { + "type": "scale", + "width": 320, + "height": 240 + } + ], + "options": { + "priority": "high" + } + } + + headers = {"X-API-Key": api_key} + response = await test_client.post("/api/v1/convert", json=request_data, headers=headers) + + assert response.status_code == 201 + data = response.json() + + # Verify job response structure + assert "job" in data + job = data["job"] + assert "id" in job + assert job["status"] == "queued" + assert job["priority"] == "high" + assert "links" in job + + job_id = job["id"] + + # Check job status + status_response = await test_client.get(f"/api/v1/jobs/{job_id}", headers=headers) + assert status_response.status_code == 200 + + status_data = status_response.json() + assert status_data["id"] == job_id + assert status_data["status"] in ["queued", "processing"] + + @pytest.mark.asyncio + async def test_job_with_multiple_operations(self, test_client, sample_video): + """Test job with multiple video operations.""" + request_data = { + "input": sample_video, + "output": "/tmp/complex_output.mp4", + "operations": [ + { + "type": "trim", + "start": 1, + "duration": 3 + }, + { + "type": "scale", + "width": 480, + "height": 360 + }, + { + 
"type": "watermark", + "text": "Test Watermark", + "position": "bottom-right" + } + ], + "options": { + "priority": "normal", + "format": "mp4", + "video_codec": "h264", + "audio_codec": "aac" + } + } + + headers = {"X-API-Key": "test-api-key"} + response = await test_client.post("/api/v1/convert", json=request_data, headers=headers) + + # Should succeed with complex operations + assert response.status_code in [201, 400] # 400 if validation fails + + if response.status_code == 201: + data = response.json() + job_id = data["job"]["id"] + + # Verify operations are stored + job_response = await test_client.get(f"/api/v1/jobs/{job_id}", headers=headers) + job_data = job_response.json() + + assert len(job_data["operations"]) == 3 + assert job_data["operations"][0]["type"] == "trim" + assert job_data["operations"][1]["type"] == "scale" + assert job_data["operations"][2]["type"] == "watermark" + + @pytest.mark.asyncio + async def test_streaming_format_creation(self, test_client, sample_video): + """Test HLS streaming format creation.""" + request_data = { + "input": sample_video, + "output": "/tmp/stream", + "type": "hls", + "variants": [ + {"resolution": "480p", "bitrate": "1M"}, + {"resolution": "720p", "bitrate": "2.5M"} + ], + "segment_duration": 6 + } + + headers = {"X-API-Key": "test-api-key"} + response = await test_client.post("/api/v1/stream", json=request_data, headers=headers) + + assert response.status_code in [201, 400] # Depending on implementation + + @pytest.mark.asyncio + async def test_video_analysis(self, test_client, sample_video): + """Test video analysis workflow.""" + request_data = { + "input": sample_video, + "metrics": ["duration", "resolution", "bitrate", "codec"] + } + + headers = {"X-API-Key": "test-api-key"} + response = await test_client.post("/api/v1/analyze", json=request_data, headers=headers) + + assert response.status_code in [201, 200] # Depending on sync/async implementation + + if response.status_code == 201: + # Async analysis + data = response.json() + job_id = data["job"]["id"] + + # Check analysis job + job_response = await test_client.get(f"/api/v1/jobs/{job_id}", headers=headers) + assert job_response.status_code == 200 + + +class TestValidation: + """Test input validation and error handling.""" + + @pytest.mark.asyncio + async def test_invalid_input_format(self, test_client): + """Test handling of invalid input format.""" + request_data = { + "input": "/path/to/invalid.xyz", + "output": "/path/to/output.mp4" + } + + headers = {"X-API-Key": "test-api-key"} + response = await test_client.post("/api/v1/convert", json=request_data, headers=headers) + + assert response.status_code == 400 + data = response.json() + assert "error" in data + assert "unsupported" in data["error"]["message"].lower() + + @pytest.mark.asyncio + async def test_missing_required_fields(self, test_client): + """Test validation of required fields.""" + request_data = { + "output": "/path/to/output.mp4" + # Missing input field + } + + headers = {"X-API-Key": "test-api-key"} + response = await test_client.post("/api/v1/convert", json=request_data, headers=headers) + + assert response.status_code == 422 # Validation error + data = response.json() + assert "detail" in data + + @pytest.mark.asyncio + async def test_invalid_operations(self, test_client): + """Test validation of video operations.""" + request_data = { + "input": "/path/to/input.mp4", + "output": "/path/to/output.mp4", + "operations": [ + { + "type": "invalid_operation", + "parameter": "value" + } + ] + } + + headers = 
{"X-API-Key": "test-api-key"} + response = await test_client.post("/api/v1/convert", json=request_data, headers=headers) + + assert response.status_code == 400 + data = response.json() + assert "invalid operation" in data["error"]["message"].lower() + + +class TestJobManagement: + """Test job management operations.""" + + @pytest.mark.asyncio + async def test_list_jobs(self, test_client): + """Test job listing with pagination.""" + headers = {"X-API-Key": "test-api-key"} + + # Test basic listing + response = await test_client.get("/api/v1/jobs", headers=headers) + assert response.status_code == 200 + + data = response.json() + assert "jobs" in data + assert "pagination" in data + + pagination = data["pagination"] + assert "page" in pagination + assert "per_page" in pagination + assert "total" in pagination + + @pytest.mark.asyncio + async def test_job_filtering(self, test_client): + """Test job filtering by status.""" + headers = {"X-API-Key": "test-api-key"} + + # Filter by status + response = await test_client.get("/api/v1/jobs?status=completed", headers=headers) + assert response.status_code == 200 + + data = response.json() + for job in data["jobs"]: + assert job["status"] == "completed" + + @pytest.mark.asyncio + async def test_job_cancellation(self, test_client, test_db, sample_video): + """Test job cancellation.""" + # Create a job first + request_data = { + "input": sample_video, + "output": "/tmp/cancel_test.mp4" + } + + headers = {"X-API-Key": "test-api-key"} + create_response = await test_client.post("/api/v1/convert", json=request_data, headers=headers) + + if create_response.status_code == 201: + job_id = create_response.json()["job"]["id"] + + # Cancel the job + cancel_response = await test_client.delete(f"/api/v1/jobs/{job_id}", headers=headers) + assert cancel_response.status_code in [200, 204] + + # Verify cancellation + status_response = await test_client.get(f"/api/v1/jobs/{job_id}", headers=headers) + status_data = status_response.json() + assert status_data["status"] == "cancelled" + + +class TestErrorHandling: + """Test error handling and edge cases.""" + + @pytest.mark.asyncio + async def test_nonexistent_job(self, test_client): + """Test handling of non-existent job ID.""" + headers = {"X-API-Key": "test-api-key"} + response = await test_client.get("/api/v1/jobs/nonexistent-id", headers=headers) + + assert response.status_code == 404 + data = response.json() + assert "not found" in data["error"]["message"].lower() + + @pytest.mark.asyncio + async def test_malformed_json(self, test_client): + """Test handling of malformed JSON.""" + headers = {"X-API-Key": "test-api-key", "Content-Type": "application/json"} + + # Send malformed JSON + async with httpx.AsyncClient(app=app, base_url="http://test") as client: + response = await client.post( + "/api/v1/convert", + content='{"input": "/path", "output":}', # Malformed JSON + headers=headers + ) + + assert response.status_code == 422 + + @pytest.mark.asyncio + async def test_large_payload(self, test_client): + """Test handling of oversized payloads.""" + # Create a very large request + large_operations = [ + {"type": "scale", "width": 1920, "height": 1080} + for _ in range(1000) # Create many operations + ] + + request_data = { + "input": "/path/to/input.mp4", + "output": "/path/to/output.mp4", + "operations": large_operations + } + + headers = {"X-API-Key": "test-api-key"} + response = await test_client.post("/api/v1/convert", json=request_data, headers=headers) + + # Should either reject or handle gracefully + assert 
response.status_code in [400, 413, 422] + + +class TestSecurity: + """Test security aspects.""" + + @pytest.mark.asyncio + async def test_path_traversal_protection(self, test_client): + """Test protection against path traversal attacks.""" + request_data = { + "input": "../../../etc/passwd", + "output": "/tmp/output.mp4" + } + + headers = {"X-API-Key": "test-api-key"} + response = await test_client.post("/api/v1/convert", json=request_data, headers=headers) + + assert response.status_code == 400 + data = response.json() + assert "path" in data["error"]["message"].lower() + + @pytest.mark.asyncio + async def test_invalid_api_key(self, test_client): + """Test invalid API key handling.""" + headers = {"X-API-Key": "invalid-key"} + response = await test_client.get("/api/v1/jobs", headers=headers) + + assert response.status_code == 401 + + @pytest.mark.asyncio + async def test_rate_limiting(self, test_client): + """Test rate limiting behavior.""" + headers = {"X-API-Key": "test-api-key"} + + # Make multiple rapid requests + responses = [] + for _ in range(10): + response = await test_client.get("/api/v1/health", headers=headers) + responses.append(response.status_code) + + # Should mostly succeed, but may hit rate limits + success_count = sum(1 for status in responses if status == 200) + assert success_count >= 5 # At least some should succeed + + +# Performance Tests +class TestPerformance: + """Test performance characteristics.""" + + @pytest.mark.asyncio + @pytest.mark.slow + async def test_concurrent_jobs(self, test_client, sample_video): + """Test handling of concurrent job submissions.""" + headers = {"X-API-Key": "test-api-key"} + + # Submit multiple jobs concurrently + tasks = [] + for i in range(5): + request_data = { + "input": sample_video, + "output": f"/tmp/concurrent_{i}.mp4" + } + task = test_client.post("/api/v1/convert", json=request_data, headers=headers) + tasks.append(task) + + # Wait for all submissions + responses = await asyncio.gather(*tasks, return_exceptions=True) + + # Count successful submissions + success_count = sum(1 for r in responses if not isinstance(r, Exception) and r.status_code == 201) + assert success_count >= 3 # At least 60% should succeed + + @pytest.mark.asyncio + async def test_response_time(self, test_client): + """Test API response times.""" + import time + + headers = {"X-API-Key": "test-api-key"} + + start_time = time.time() + response = await test_client.get("/api/v1/health", headers=headers) + end_time = time.time() + + response_time = end_time - start_time + + assert response.status_code == 200 + assert response_time < 1.0 # Should respond within 1 second + + +# Cleanup and utilities +@pytest.fixture(autouse=True) +async def cleanup_test_files(): + """Clean up test files after each test.""" + yield + + # Clean up any test output files + test_files = Path("/tmp").glob("*test*") + for file_path in test_files: + try: + if file_path.is_file(): + file_path.unlink() + except Exception: + pass # Ignore cleanup errors + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"]) \ No newline at end of file diff --git a/tests/test_performance.py b/tests/test_performance.py new file mode 100644 index 0000000..0c6457c --- /dev/null +++ b/tests/test_performance.py @@ -0,0 +1,459 @@ +""" +Performance and load tests for FFmpeg API +Tests system behavior under load and measures performance metrics +""" +import asyncio +import statistics +import time +from concurrent.futures import ThreadPoolExecutor +from typing import List, Dict, Any +import 
pytest +import httpx +from locust import HttpUser, task, between +import psutil + + +class TestPerformanceMetrics: + """Test performance characteristics and benchmarks.""" + + @pytest.mark.asyncio + async def test_api_response_times(self): + """Measure API response times under normal load.""" + async with httpx.AsyncClient(base_url="http://localhost:8000") as client: + headers = {"X-API-Key": "test-performance-key"} + response_times = [] + + # Measure multiple requests + for _ in range(100): + start_time = time.time() + response = await client.get("/api/v1/health", headers=headers) + end_time = time.time() + + if response.status_code == 200: + response_times.append(end_time - start_time) + + # Calculate statistics + if response_times: + avg_time = statistics.mean(response_times) + p95_time = statistics.quantiles(response_times, n=20)[18] # 95th percentile + p99_time = statistics.quantiles(response_times, n=100)[98] # 99th percentile + + print(f"Average response time: {avg_time:.3f}s") + print(f"P95 response time: {p95_time:.3f}s") + print(f"P99 response time: {p99_time:.3f}s") + + # Performance assertions + assert avg_time < 0.1, f"Average response time {avg_time:.3f}s exceeds 100ms" + assert p95_time < 0.5, f"P95 response time {p95_time:.3f}s exceeds 500ms" + assert p99_time < 1.0, f"P99 response time {p99_time:.3f}s exceeds 1s" + + @pytest.mark.asyncio + async def test_concurrent_request_handling(self): + """Test API behavior under concurrent load.""" + async def make_request(client, semaphore): + async with semaphore: + headers = {"X-API-Key": "test-performance-key"} + start_time = time.time() + response = await client.get("/api/v1/capabilities", headers=headers) + end_time = time.time() + return { + "status_code": response.status_code, + "response_time": end_time - start_time, + "success": response.status_code == 200 + } + + # Limit concurrent connections + semaphore = asyncio.Semaphore(50) + + async with httpx.AsyncClient( + base_url="http://localhost:8000", + timeout=30.0, + limits=httpx.Limits(max_connections=100, max_keepalive_connections=20) + ) as client: + + # Create 200 concurrent requests + tasks = [make_request(client, semaphore) for _ in range(200)] + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Filter out exceptions + valid_results = [r for r in results if isinstance(r, dict)] + + # Calculate metrics + success_count = sum(1 for r in valid_results if r["success"]) + success_rate = success_count / len(valid_results) if valid_results else 0 + + response_times = [r["response_time"] for r in valid_results if r["success"]] + avg_response_time = statistics.mean(response_times) if response_times else float('inf') + + print(f"Success rate: {success_rate:.2%}") + print(f"Successful requests: {success_count}/{len(valid_results)}") + print(f"Average response time: {avg_response_time:.3f}s") + + # Performance assertions + assert success_rate >= 0.95, f"Success rate {success_rate:.2%} below 95%" + assert avg_response_time < 2.0, f"Average response time {avg_response_time:.3f}s exceeds 2s" + + @pytest.mark.asyncio + async def test_job_submission_throughput(self): + """Test job submission throughput.""" + async def submit_job(client, job_id): + headers = {"X-API-Key": "test-performance-key"} + request_data = { + "input": f"/test/input_{job_id}.mp4", + "output": f"/test/output_{job_id}.mp4", + "operations": [{"type": "scale", "width": 720, "height": 480}] + } + + start_time = time.time() + response = await client.post("/api/v1/convert", json=request_data, 
headers=headers) + end_time = time.time() + + return { + "job_id": job_id, + "status_code": response.status_code, + "response_time": end_time - start_time, + "success": response.status_code == 201 + } + + async with httpx.AsyncClient(base_url="http://localhost:8000", timeout=30.0) as client: + # Submit 50 jobs concurrently + start_time = time.time() + tasks = [submit_job(client, i) for i in range(50)] + results = await asyncio.gather(*tasks, return_exceptions=True) + end_time = time.time() + + total_time = end_time - start_time + valid_results = [r for r in results if isinstance(r, dict)] + successful_submissions = sum(1 for r in valid_results if r["success"]) + + throughput = successful_submissions / total_time if total_time > 0 else 0 + + print(f"Job submission throughput: {throughput:.2f} jobs/second") + print(f"Successful submissions: {successful_submissions}/{len(valid_results)}") + print(f"Total time: {total_time:.2f}s") + + # Throughput assertion + assert throughput >= 5.0, f"Throughput {throughput:.2f} jobs/s below minimum of 5/s" + + +class TestResourceUsage: + """Test resource consumption under load.""" + + def test_memory_usage_under_load(self): + """Monitor memory usage during sustained load.""" + import threading + import requests + + # Monitor system resources + memory_samples = [] + cpu_samples = [] + monitoring = True + + def monitor_resources(): + while monitoring: + memory_samples.append(psutil.virtual_memory().percent) + cpu_samples.append(psutil.cpu_percent(interval=0.1)) + time.sleep(1) + + # Start monitoring + monitor_thread = threading.Thread(target=monitor_resources) + monitor_thread.start() + + try: + # Generate sustained load + with ThreadPoolExecutor(max_workers=20) as executor: + futures = [] + + for _ in range(100): + future = executor.submit( + requests.get, + "http://localhost:8000/api/v1/health", + headers={"X-API-Key": "test-performance-key"}, + timeout=5 + ) + futures.append(future) + + # Wait for completion + completed = 0 + for future in futures: + try: + response = future.result(timeout=10) + if response.status_code == 200: + completed += 1 + except Exception: + pass + + print(f"Completed requests: {completed}/{len(futures)}") + + finally: + monitoring = False + monitor_thread.join() + + # Analyze resource usage + if memory_samples and cpu_samples: + max_memory = max(memory_samples) + avg_memory = statistics.mean(memory_samples) + max_cpu = max(cpu_samples) + avg_cpu = statistics.mean(cpu_samples) + + print(f"Memory usage - Max: {max_memory:.1f}%, Avg: {avg_memory:.1f}%") + print(f"CPU usage - Max: {max_cpu:.1f}%, Avg: {avg_cpu:.1f}%") + + # Resource usage assertions + assert max_memory < 90, f"Memory usage {max_memory:.1f}% exceeds 90%" + assert avg_cpu < 80, f"Average CPU usage {avg_cpu:.1f}% exceeds 80%" + + @pytest.mark.asyncio + async def test_database_connection_pooling(self): + """Test database connection pool behavior under load.""" + async def make_db_intensive_request(client): + headers = {"X-API-Key": "test-performance-key"} + # Request that requires database access + response = await client.get("/api/v1/jobs", headers=headers) + return response.status_code == 200 + + async with httpx.AsyncClient(base_url="http://localhost:8000", timeout=30.0) as client: + # Create many concurrent database requests + tasks = [make_db_intensive_request(client) for _ in range(100)] + results = await asyncio.gather(*tasks, return_exceptions=True) + + success_count = sum(1 for r in results if r is True) + success_rate = success_count / len(results) + + 
print(f"Database request success rate: {success_rate:.2%}") + + # Should handle concurrent DB requests well + assert success_rate >= 0.90, f"DB request success rate {success_rate:.2%} below 90%" + + +class TestScalability: + """Test system scalability characteristics.""" + + @pytest.mark.asyncio + async def test_queue_handling_capacity(self): + """Test job queue handling under high load.""" + async def submit_bulk_jobs(client, batch_size): + headers = {"X-API-Key": "test-performance-key"} + results = [] + + for i in range(batch_size): + request_data = { + "input": f"/test/bulk_{i}.mp4", + "output": f"/test/bulk_output_{i}.mp4" + } + + try: + response = await client.post("/api/v1/convert", json=request_data, headers=headers) + results.append(response.status_code == 201) + except Exception: + results.append(False) + + return results + + async with httpx.AsyncClient(base_url="http://localhost:8000", timeout=60.0) as client: + # Submit jobs in batches + batch_results = [] + for batch in range(5): # 5 batches of 20 jobs each + print(f"Submitting batch {batch + 1}/5...") + batch_result = await submit_bulk_jobs(client, 20) + batch_results.extend(batch_result) + + # Small delay between batches + await asyncio.sleep(1) + + total_jobs = len(batch_results) + successful_jobs = sum(batch_results) + success_rate = successful_jobs / total_jobs if total_jobs > 0 else 0 + + print(f"Queue capacity test: {successful_jobs}/{total_jobs} jobs accepted") + print(f"Success rate: {success_rate:.2%}") + + # Should handle bulk job submissions + assert success_rate >= 0.80, f"Queue acceptance rate {success_rate:.2%} below 80%" + + +# Locust Load Testing Classes +class APILoadTest(HttpUser): + """Locust load test for API endpoints.""" + + wait_time = between(1, 3) + + def on_start(self): + """Set up test user.""" + self.headers = {"X-API-Key": "test-load-key"} + + @task(3) + def test_health_check(self): + """Health check endpoint (frequent).""" + self.client.get("/api/v1/health", headers=self.headers) + + @task(2) + def test_capabilities(self): + """Capabilities endpoint (moderate).""" + self.client.get("/api/v1/capabilities", headers=self.headers) + + @task(1) + def test_job_listing(self): + """Job listing endpoint (less frequent).""" + self.client.get("/api/v1/jobs", headers=self.headers) + + @task(1) + def test_job_submission(self): + """Job submission (less frequent, more expensive).""" + import random + + job_data = { + "input": f"/test/load_test_{random.randint(1, 1000)}.mp4", + "output": f"/test/output_{random.randint(1, 1000)}.mp4", + "operations": [{"type": "scale", "width": 720, "height": 480}] + } + + self.client.post("/api/v1/convert", json=job_data, headers=self.headers) + + +class WorkerLoadTest(HttpUser): + """Locust load test focused on worker-intensive operations.""" + + wait_time = between(2, 5) + + def on_start(self): + self.headers = {"X-API-Key": "test-worker-load-key"} + + @task(1) + def test_complex_conversion(self): + """Submit complex conversion jobs.""" + import random + + job_data = { + "input": f"/test/complex_{random.randint(1, 100)}.mp4", + "output": f"/test/complex_output_{random.randint(1, 100)}.mp4", + "operations": [ + {"type": "trim", "start": 5, "duration": 30}, + {"type": "scale", "width": 1280, "height": 720}, + {"type": "watermark", "text": f"Load Test {random.randint(1, 1000)}"} + ], + "options": { + "priority": random.choice(["low", "normal", "high"]), + "format": "mp4", + "video_codec": "h264" + } + } + + response = self.client.post("/api/v1/convert", json=job_data, 
headers=self.headers) + + if response.status_code == 201: + # Occasionally check job status + if random.random() < 0.3: # 30% chance + job_id = response.json().get("job", {}).get("id") + if job_id: + self.client.get(f"/api/v1/jobs/{job_id}", headers=self.headers) + + @task(1) + def test_analysis_jobs(self): + """Submit analysis jobs.""" + import random + + analysis_data = { + "input": f"/test/analysis_{random.randint(1, 50)}.mp4", + "metrics": ["duration", "resolution", "bitrate", "codec", "quality"] + } + + self.client.post("/api/v1/analyze", json=analysis_data, headers=self.headers) + + +# Stress Testing +class TestStressLimits: + """Test system behavior at stress limits.""" + + @pytest.mark.stress + @pytest.mark.asyncio + async def test_maximum_concurrent_connections(self): + """Test maximum concurrent connection handling.""" + async def make_long_request(client, delay): + headers = {"X-API-Key": "test-stress-key"} + # Simulate a longer-running request + await asyncio.sleep(delay) + response = await client.get("/api/v1/capabilities", headers=headers) + return response.status_code + + async with httpx.AsyncClient( + base_url="http://localhost:8000", + timeout=60.0, + limits=httpx.Limits(max_connections=500, max_keepalive_connections=100) + ) as client: + + # Create many concurrent long-running requests + tasks = [make_long_request(client, 0.1) for _ in range(300)] + + start_time = time.time() + results = await asyncio.gather(*tasks, return_exceptions=True) + end_time = time.time() + + successful_requests = sum(1 for r in results if r == 200) + total_time = end_time - start_time + + print(f"Stress test: {successful_requests}/{len(tasks)} requests successful") + print(f"Total time: {total_time:.2f}s") + + # Should handle high concurrency reasonably + success_rate = successful_requests / len(tasks) + assert success_rate >= 0.70, f"Stress test success rate {success_rate:.2%} below 70%" + + @pytest.mark.stress + def test_memory_leak_detection(self): + """Test for memory leaks under sustained load.""" + import gc + import threading + import requests + + # Force garbage collection + gc.collect() + initial_memory = psutil.Process().memory_info().rss / 1024 / 1024 # MB + + # Run sustained load + def make_requests(): + session = requests.Session() + headers = {"X-API-Key": "test-stress-key"} + + for _ in range(100): + try: + response = session.get( + "http://localhost:8000/api/v1/health", + headers=headers, + timeout=5 + ) + except Exception: + pass + + # Run multiple threads + threads = [] + for _ in range(10): + thread = threading.Thread(target=make_requests) + thread.start() + threads.append(thread) + + # Wait for completion + for thread in threads: + thread.join() + + # Force garbage collection and check memory + gc.collect() + time.sleep(2) # Allow cleanup + final_memory = psutil.Process().memory_info().rss / 1024 / 1024 # MB + + memory_increase = final_memory - initial_memory + print(f"Memory usage: {initial_memory:.1f}MB -> {final_memory:.1f}MB") + print(f"Memory increase: {memory_increase:.1f}MB") + + # Memory increase should be reasonable + assert memory_increase < 100, f"Memory increase {memory_increase:.1f}MB suggests possible leak" + + +if __name__ == "__main__": + # Run performance tests + pytest.main([ + __file__, + "-v", + "--tb=short", + "-m", "not stress" # Exclude stress tests by default + ]) \ No newline at end of file diff --git a/traefik/traefik.yml b/traefik/traefik.yml index 3efc033..d676774 100644 --- a/traefik/traefik.yml +++ b/traefik/traefik.yml @@ -27,7 +27,7 @@ 
providers: docker: endpoint: "unix:///var/run/docker.sock" exposedByDefault: false - network: "rendiff" + network: "ffmpeg-net" file: filename: /etc/traefik/dynamic.yml watch: true
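
As a quick way to exercise this change locally, the sketch below shows one possible smoke run, not part of the patch itself. It assumes Docker with the Compose plugin is installed, the stack is reachable at http://localhost:8000, and pytest, httpx, locust, and psutil are available in the test environment; the setup.sh flags, test file paths, and the "slow"/"stress" pytest markers come from this diff, while the curl call and the Locust user/spawn-rate numbers are arbitrary examples.

#!/usr/bin/env bash
# Illustrative smoke run against the defaults used in this diff (assumptions noted above).
set -euo pipefail

./setup.sh --development                              # fast SQLite/no-auth stack
curl -fsS http://localhost:8000/api/v1/health         # confirm the API answers before testing

pytest tests/test_integration.py -v -m "not slow"     # new integration suite
pytest tests/test_performance.py -v -m "not stress"   # perf suite, stress tests excluded by default

# Optional: sustained load using the Locust user classes defined in tests/test_performance.py.
locust -f tests/test_performance.py APILoadTest --headless \
    -u 20 -r 5 --run-time 2m --host http://localhost:8000

./setup.sh --status
./setup.sh --stop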