From 6eb0e7bfed9c61edc46b3c3947bb2331e7d6cdd9 Mon Sep 17 00:00:00 2001 From: mickael Date: Fri, 18 Jul 2025 14:11:40 +0200 Subject: [PATCH 01/12] feat: implement S3 integration for storing and retrieving digest files - Add utility functions for S3 configuration, URL generation, and file uploads. - Enhance ingestion flow to optionally upload digests to S3 if enabled. - Modify API endpoints to redirect downloads to S3 if files are stored there. - Extend `IngestResponse` schema to include S3 URL when applicable. - Introduce `get_current_commit_hash` utility to retrieve commit SHA in ingestion. - add Docker Compose configuration for dev/prod environments with documented usage details - integrate MinIO S3-compatible storage for local development, including bucket auto-setup and app credentials - add S3 storage toggle, test service in Docker Compose, and boto3 dependency - enforce UUID type for ingest_id, resolve comments - Implement `JSONFormatter` and methods for structured logging. - Integrate logging into S3 client creation, uploads, and URL lookups. - Enhance logging with extra fields for better traceability. 
- add optional S3 directory prefix support - remove unused test service from Docker Compose configuration - improve `get_s3_config` to handle optional environment variables more robustly - add centralized JSON logging and integrate into S3 utilities Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> --- .docker/minio/setup.sh | 33 ++ .env.example | 23 ++ .github/workflows/codeql.yml | 2 +- .pre-commit-config.yaml | 2 + README.md | 85 +++++ compose.yml | 110 ++++++ pyproject.toml | 2 +- requirements.txt | 1 + src/gitingest/query_parser.py | 6 +- src/gitingest/schemas/ingestion.py | 8 +- src/gitingest/utils/logging_config.py | 111 ++++++ src/gitingest/utils/s3_utils.py | 356 +++++++++++++++++++ src/server/query_processor.py | 37 +- src/server/routers/ingest.py | 32 +- tests/conftest.py | 3 +- tests/query_parser/test_git_host_agnostic.py | 2 +- tests/server/test_flow_integration.py | 12 +- 17 files changed, 795 insertions(+), 30 deletions(-) create mode 100755 .docker/minio/setup.sh create mode 100644 compose.yml create mode 100644 src/gitingest/utils/logging_config.py create mode 100644 src/gitingest/utils/s3_utils.py diff --git a/.docker/minio/setup.sh b/.docker/minio/setup.sh new file mode 100755 index 00000000..3b1b6fb2 --- /dev/null +++ b/.docker/minio/setup.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +# Simple script to set up MinIO bucket and user +# Based on example from MinIO issues + +# Format bucket name to ensure compatibility +BUCKET_NAME=$(echo "${S3_BUCKET_NAME}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') + +# Configure MinIO client +mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} + +# Remove bucket if it exists (for clean setup) +mc rm -r --force myminio/${BUCKET_NAME} || true + +# Create bucket +mc mb myminio/${BUCKET_NAME} + +# Set bucket policy to allow downloads +mc anonymous set download myminio/${BUCKET_NAME} + +# Create user with access and secret keys +mc admin user add myminio 
${S3_ACCESS_KEY} ${S3_SECRET_KEY} || echo "User already exists" + +# Create policy for the bucket +echo '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*"],"Resource":["arn:aws:s3:::'${BUCKET_NAME}'/*","arn:aws:s3:::'${BUCKET_NAME}'"]}]}' > /tmp/policy.json + +# Apply policy +mc admin policy create myminio gitingest-policy /tmp/policy.json || echo "Policy already exists" +mc admin policy attach myminio gitingest-policy --user ${S3_ACCESS_KEY} + +echo "MinIO setup completed successfully" +echo "Bucket: ${BUCKET_NAME}" +echo "Access via console: http://localhost:9001" diff --git a/.env.example b/.env.example index 8d98ebba..aabdbf5a 100644 --- a/.env.example +++ b/.env.example @@ -33,3 +33,26 @@ GITINGEST_SENTRY_PROFILE_LIFECYCLE=trace GITINGEST_SENTRY_SEND_DEFAULT_PII=true # Environment name for Sentry (default: "") GITINGEST_SENTRY_ENVIRONMENT=development + +# MinIO Configuration (for development) +# Root user credentials for MinIO admin access +MINIO_ROOT_USER=minioadmin +MINIO_ROOT_PASSWORD=minioadmin + +# S3 Configuration (for application) +# Set to "true" to enable S3 storage for digests +# S3_ENABLED=true +# Endpoint URL for the S3 service (MinIO in development) +S3_ENDPOINT=http://minio:9000 +# Access key for the S3 bucket (created automatically in development) +S3_ACCESS_KEY=gitingest +# Secret key for the S3 bucket (created automatically in development) +S3_SECRET_KEY=gitingest123 +# Name of the S3 bucket (created automatically in development) +S3_BUCKET_NAME=gitingest-bucket +# Region for the S3 bucket (default for MinIO) +S3_REGION=us-east-1 +# Public URL/CDN for accessing S3 resources +S3_ALIAS_HOST=127.0.0.1:9000/gitingest-bucket +# Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value) +# S3_DIRECTORY_PREFIX=my-prefix diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 04f00a55..ffc62c6c 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -35,7 +35,7 
@@ jobs: strategy: fail-fast: false matrix: - language: ["javascript", "python"] + language: ["javascript", "python", "actions", "javascript-typescript"] # CodeQL supports [ $supported-codeql-languages ] # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4aa5f0e1..529d352a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -113,6 +113,7 @@ repos: files: ^src/ additional_dependencies: [ + boto3>=1.28.0, click>=8.0.0, 'fastapi[standard]>=0.109.1', httpx, @@ -138,6 +139,7 @@ repos: - --rcfile=tests/.pylintrc additional_dependencies: [ + boto3>=1.28.0, click>=8.0.0, 'fastapi[standard]>=0.109.1', httpx, diff --git a/README.md b/README.md index 501753e2..a31c780a 100644 --- a/README.md +++ b/README.md @@ -204,6 +204,8 @@ This is because Jupyter notebooks are asynchronous by default. ## 🐳 Self-host +### Using Docker + 1. Build the image: ``` bash @@ -239,6 +241,89 @@ The application can be configured using the following environment variables: - **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0) - **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace") - **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true") +- **S3_ALIAS_HOST**: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket") +- **S3_DIRECTORY_PREFIX**: Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value) + +### Using Docker Compose + +The project includes a `compose.yml` file that allows you to easily run the application in both development and production environments. 
+ +#### Compose File Structure + +The `compose.yml` file uses YAML anchoring with `&app-base` and `<<: *app-base` to define common configuration that is shared between services: + +```yaml +# Common base configuration for all services +x-app-base: &app-base + build: + context: . + dockerfile: Dockerfile + ports: + - "${APP_WEB_BIND:-8000}:8000" # Main application port + - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port + # ... other common configurations +``` + +#### Services + +The file defines three services: + +1. **app**: Production service configuration + - Uses the `prod` profile + - Sets the Sentry environment to "production" + - Configured for stable operation with `restart: unless-stopped` + +2. **app-dev**: Development service configuration + - Uses the `dev` profile + - Enables debug mode + - Mounts the source code for live development + - Uses hot reloading for faster development + +3. **minio**: S3-compatible object storage for development + - Uses the `dev` profile (only available in development mode) + - Provides S3-compatible storage for local development + - Accessible via: + - API: Port 9000 ([localhost:9000](http://localhost:9000)) + - Web Console: Port 9001 ([localhost:9001](http://localhost:9001)) + - Default admin credentials: + - Username: `minioadmin` + - Password: `minioadmin` + - Configurable via environment variables: + - `MINIO_ROOT_USER`: Custom admin username (default: minioadmin) + - `MINIO_ROOT_PASSWORD`: Custom admin password (default: minioadmin) + - Includes persistent storage via Docker volume + - Auto-creates a bucket and application-specific credentials: + - Bucket name: `gitingest-bucket` (configurable via `S3_BUCKET_NAME`) + - Access key: `gitingest` (configurable via `S3_ACCESS_KEY`) + - Secret key: `gitingest123` (configurable via `S3_SECRET_KEY`) + - These credentials are automatically passed to the app-dev service via environment variables: + - `S3_ENDPOINT`: URL of the MinIO 
server + - `S3_ACCESS_KEY`: Access key for the S3 bucket + - `S3_SECRET_KEY`: Secret key for the S3 bucket + - `S3_BUCKET_NAME`: Name of the S3 bucket + - `S3_REGION`: Region for the S3 bucket (default: us-east-1) + - `S3_ALIAS_HOST`: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket") + +#### Usage Examples + +To run the application in development mode: + +```bash +docker compose --profile dev up +``` + +To run the application in production mode: + +```bash +docker compose --profile prod up -d +``` + +To build and run the application: + +```bash +docker compose --profile prod build +docker compose --profile prod up -d +``` ## 🤝 Contributing diff --git a/compose.yml b/compose.yml new file mode 100644 index 00000000..defe28cd --- /dev/null +++ b/compose.yml @@ -0,0 +1,110 @@ +# Common base configuration for all services +x-app-base: &app-base + ports: + - "${APP_WEB_BIND:-8000}:8000" # Main application port + - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port + environment: + # Python Configuration + - PYTHONUNBUFFERED=1 + - PYTHONDONTWRITEBYTECODE=1 + # Host Configuration + - ALLOWED_HOSTS=${ALLOWED_HOSTS:-gitingest.com,*.gitingest.com,localhost,127.0.0.1} + # Metrics Configuration + - GITINGEST_METRICS_ENABLED=${GITINGEST_METRICS_ENABLED:-true} + - GITINGEST_METRICS_HOST=${GITINGEST_METRICS_HOST:-127.0.0.1} + - GITINGEST_METRICS_PORT=${GITINGEST_METRICS_PORT:-9090} + # Sentry Configuration + - GITINGEST_SENTRY_ENABLED=${GITINGEST_SENTRY_ENABLED:-false} + - GITINGEST_SENTRY_DSN=${GITINGEST_SENTRY_DSN:-} + - GITINGEST_SENTRY_TRACES_SAMPLE_RATE=${GITINGEST_SENTRY_TRACES_SAMPLE_RATE:-1.0} + - GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE=${GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE:-1.0} + - GITINGEST_SENTRY_PROFILE_LIFECYCLE=${GITINGEST_SENTRY_PROFILE_LIFECYCLE:-trace} + - GITINGEST_SENTRY_SEND_DEFAULT_PII=${GITINGEST_SENTRY_SEND_DEFAULT_PII:-true} + user: "1000:1000" + command: ["python", 
"-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"] + +services: + # Production service configuration + app: + <<: *app-base + image: ghcr.io/coderamp-labs/gitingest:latest + profiles: + - prod + environment: + - GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-production} + restart: unless-stopped + + # Development service configuration + app-dev: + <<: *app-base + build: + context: . + dockerfile: Dockerfile + profiles: + - dev + environment: + - DEBUG=true + - GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-development} + # S3 Configuration + - S3_ENABLED=true + - S3_ENDPOINT=http://minio:9000 + - S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest} + - S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123} + # Use lowercase bucket name to ensure compatibility with MinIO + - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket} + - S3_REGION=${S3_REGION:-us-east-1} + # Public URL for S3 resources + - S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}} + volumes: + # Mount source code for live development + - ./src:/app:ro + # Use --reload flag for hot reloading during development + command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] + depends_on: + minio-setup: + condition: service_completed_successfully + + # MinIO S3-compatible object storage for development + minio: + image: minio/minio:latest + profiles: + - dev + ports: + - "9000:9000" # API port + - "9001:9001" # Console port + environment: + - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin} + volumes: + - minio-data:/data + command: server /data --console-address ":9001" + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 30s + start_period: 30s + start_interval: 1s + + # MinIO setup service to create bucket and user + minio-setup: + image: 
minio/mc + profiles: + - dev + depends_on: + minio: + condition: service_healthy + environment: + - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin} + - S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest} + - S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123} + - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket} + volumes: + - ./.docker/minio/setup.sh:/setup.sh:ro + entrypoint: sh + command: -c /setup.sh + +volumes: + minio-data: + driver: local diff --git a/pyproject.toml b/pyproject.toml index 334140dc..639f8a77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,6 @@ description="CLI tool to analyze and create text dumps of codebases for LLMs" readme = {file = "README.md", content-type = "text/markdown" } requires-python = ">= 3.8" dependencies = [ - "click>=8.0.0", "httpx", "pathspec>=0.12.1", "pydantic", @@ -44,6 +43,7 @@ dev = [ ] server = [ + "boto3>=1.28.0", # AWS SDK for S3 support "fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38) "prometheus-client", "sentry-sdk[fastapi]", diff --git a/requirements.txt b/requirements.txt index 712360e9..bdefb957 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +boto3>=1.28.0 # AWS SDK for S3 support click>=8.0.0 fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38 httpx diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 65b3f065..6262f0db 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -44,9 +44,9 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ host = parsed_url.netloc user, repo = _get_user_and_repo_from_path(parsed_url.path) - _id = str(uuid.uuid4()) + _id = uuid.uuid4() slug = f"{user}-{repo}" - local_path = TMP_BASE_PATH / _id / slug + local_path = TMP_BASE_PATH / str(_id) / slug url = f"https://{host}/{user}/{repo}" query = IngestionQuery( @@ -132,7 +132,7 @@ def 
parse_local_dir_path(path_str: str) -> IngestionQuery: """ path_obj = Path(path_str).resolve() slug = path_obj.name if path_str == "." else path_str.strip("/") - return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4())) + return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4()) async def _configure_branch_or_tag( diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index 97e98804..92572aeb 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -3,6 +3,7 @@ from __future__ import annotations from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) +from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) from pydantic import BaseModel, Field @@ -27,7 +28,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes The URL of the repository. slug : str The slug of the repository. - id : str + id : UUID The ID of the repository. subpath : str The subpath to the repository or file (default: ``"/"``). @@ -47,6 +48,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes The patterns to include. include_submodules : bool Whether to include all Git submodules within the repository. (default: ``False``) + s3_url : str | None + The S3 URL where the digest is stored if S3 is enabled. 
""" @@ -56,7 +59,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes local_path: Path url: str | None = None slug: str - id: str + id: UUID subpath: str = Field(default="/") type: str | None = None branch: str | None = None @@ -66,6 +69,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes ignore_patterns: set[str] = Field(default_factory=set) # TODO: ssame type for ignore_* and include_* patterns include_patterns: set[str] | None = None include_submodules: bool = Field(default=False) + s3_url: str | None = None def extract_clone_config(self) -> CloneConfig: """Extract the relevant fields for the CloneConfig object. diff --git a/src/gitingest/utils/logging_config.py b/src/gitingest/utils/logging_config.py new file mode 100644 index 00000000..a5aae312 --- /dev/null +++ b/src/gitingest/utils/logging_config.py @@ -0,0 +1,111 @@ +"""Centralized logging configuration for JSON logging in k8s environments.""" + +from __future__ import annotations + +import json +import logging +import sys + + +class JSONFormatter(logging.Formatter): + """Custom JSON formatter for structured logging.""" + + def format(self, record: logging.LogRecord) -> str: + """Format log record as JSON.""" + log_entry = { + "timestamp": self.formatTime(record, self.datefmt), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + "module": record.module, + "function": record.funcName, + "line": record.lineno, + } + + # Add exception info if present + if record.exc_info: + log_entry["exception"] = self.formatException(record.exc_info) + + # Add extra fields if present + if hasattr(record, "extra_fields"): + log_entry.update(record.extra_fields) + + return json.dumps(log_entry) + + +def configure_json_logging(level: str = "INFO") -> None: + """Configure JSON logging for the application. 
+ + Parameters + ---------- + level : str + Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + + """ + # Convert string level to logging constant + numeric_level = getattr(logging, level.upper(), logging.INFO) + + # Create JSON formatter + formatter = JSONFormatter(datefmt="%Y-%m-%dT%H:%M:%S") + + # Configure root logger + root_logger = logging.getLogger() + root_logger.setLevel(numeric_level) + + # Remove existing handlers to avoid duplicates + for handler in root_logger.handlers[:]: + root_logger.removeHandler(handler) + + # Create console handler for stdout + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(numeric_level) + console_handler.setFormatter(formatter) + + # Add handler to root logger + root_logger.addHandler(console_handler) + + +def get_logger(name: str) -> logging.Logger: + """Get a logger instance with the given name. + + Parameters + ---------- + name : str + Logger name (typically __name__) + + Returns + ------- + logging.Logger + Configured logger instance + + """ + return logging.getLogger(name) + + +def log_with_extra(logger: logging.Logger, level: str, message: str, **extra_fields: str | int | bool | None) -> None: + """Log a message with extra fields. 
+ + Parameters + ---------- + logger : logging.Logger + Logger instance + level : str + Log level (debug, info, warning, error, critical) + message : str + Log message + **extra_fields : str | int | bool | None + Additional fields to include in the log entry + + """ + # Create a LogRecord with extra fields + record = logger.makeRecord( + logger.name, + getattr(logging, level.upper()), + "", + 0, + message, + (), + None, + ) + record.extra_fields = extra_fields + logger.handle(record) diff --git a/src/gitingest/utils/s3_utils.py b/src/gitingest/utils/s3_utils.py new file mode 100644 index 00000000..9da94fbf --- /dev/null +++ b/src/gitingest/utils/s3_utils.py @@ -0,0 +1,356 @@ +"""S3 utility functions for uploading and managing digest files.""" + +from __future__ import annotations + +import hashlib +import os +from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) + +import boto3 +from botocore.client import BaseClient +from botocore.exceptions import ClientError + +from gitingest.utils.logging_config import get_logger, log_with_extra + +# Initialize logger for this module +logger = get_logger(__name__) + + +class S3UploadError(Exception): + """Custom exception for S3 upload failures.""" + + +def is_s3_enabled() -> bool: + """Check if S3 is enabled via environment variables.""" + return os.getenv("S3_ENABLED", "false").lower() == "true" + + +def get_s3_config() -> dict[str, str | None]: + """Get S3 configuration from environment variables.""" + config = {} + + # Only include endpoint_url if it's set (for custom S3-compatible services) + endpoint_url = os.getenv("S3_ENDPOINT") + if endpoint_url: + config["endpoint_url"] = endpoint_url + + # Only include credentials if they're explicitly set + access_key = os.getenv("S3_ACCESS_KEY") + if access_key: + config["aws_access_key_id"] = access_key + + secret_key = os.getenv("S3_SECRET_KEY") + if secret_key: + config["aws_secret_access_key"] = secret_key + + # For 
region, check S3_REGION first, then fall back to AWS_REGION + region = os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1") + config["region_name"] = region + + return config + + +def get_s3_bucket_name() -> str: + """Get S3 bucket name from environment variables.""" + return os.getenv("S3_BUCKET_NAME", "gitingest-bucket") + + +def get_s3_alias_host() -> str | None: + """Get S3 alias host for public URLs.""" + return os.getenv("S3_ALIAS_HOST") + + +def generate_s3_file_path( + source: str, + user_name: str, + repo_name: str, + branch: str | None, + commit: str, + include_patterns: set[str] | None, + ignore_patterns: set[str], +) -> str: + """Generate S3 file path with proper naming convention. + + The file path is formatted as: + [<prefix>/]ingest/<source>/<user_name>/<repo_name>/<branch>/<commit>/<patterns_hash>.txt + + If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path. + The commit-ID is always included in the URL. + If no specific commit is provided, the actual commit hash from the cloned repository is used. + + Parameters + ---------- + source : str + Git host (e.g., github, gitlab, bitbucket, etc.). + user_name : str + Repository owner or user. + repo_name : str + Repository name. + branch : str | None + Branch name (if available). + commit : str + Commit hash. + include_patterns : set[str] | None + Set of patterns specifying which files to include. + ignore_patterns : set[str] + Set of patterns specifying which files to exclude. + + Returns + ------- + str + S3 file path string. 
+ + """ + # Extract source from URL or default to "unknown" + if "github.com" in source: + git_source = "github" + elif "gitlab.com" in source: + git_source = "gitlab" + elif "bitbucket.org" in source: + git_source = "bitbucket" + else: + git_source = "unknown" + + # Use branch, fallback to "main" if neither branch nor commit + branch_name = branch or "main" + + # Create hash of exclude/include patterns for uniqueness + patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}" + patterns_str += f"exclude:{sorted(ignore_patterns)}" + + patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16] + + # Build the base path + base_path = f"ingest/{git_source}/{user_name}/{repo_name}/{branch_name}/{commit}/{patterns_hash}.txt" + + # Check for S3_DIRECTORY_PREFIX environment variable + s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX") + if s3_directory_prefix: + # Remove trailing slash if present and add the prefix + s3_directory_prefix = s3_directory_prefix.rstrip("/") + return f"{s3_directory_prefix}/{base_path}" + + return base_path + + +def create_s3_client() -> BaseClient: + """Create and return an S3 client with configuration from environment.""" + config = get_s3_config() + + # Log S3 client creation with configuration details (excluding sensitive info) + log_config = {k: v for k, v in config.items() if k not in ["aws_access_key_id", "aws_secret_access_key"]} + + extra_fields = { + "s3_config": log_config, + "has_credentials": bool(config.get("aws_access_key_id")), + } + log_with_extra(logger=logger, level="debug", message="Creating S3 client", **extra_fields) + + return boto3.client("s3", **config) + + +def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: + """Upload content to S3 and return the public URL. + + This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file. + The ingest ID is stored as an S3 object tag. 
+ + Parameters + ---------- + content : str + The digest content to upload. + s3_file_path : str + The S3 file path where the content will be stored. + ingest_id : UUID + The ingest ID to store as an S3 object tag. + + Returns + ------- + str + Public URL to access the uploaded file. + + Raises + ------ + ValueError + If S3 is not enabled. + S3UploadError + If the upload to S3 fails. + + """ + if not is_s3_enabled(): + msg = "S3 is not enabled" + raise ValueError(msg) + + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + extra_fields = { + "bucket_name": bucket_name, + "s3_file_path": s3_file_path, + "ingest_id": str(ingest_id), + "content_size": len(content), + } + + # Log upload attempt + log_with_extra(logger=logger, level="debug", message="Starting S3 upload", **extra_fields) + + try: + # Upload the content with ingest_id as tag + s3_client.put_object( + Bucket=bucket_name, + Key=s3_file_path, + Body=content.encode("utf-8"), + ContentType="text/plain", + Tagging=f"ingest_id={ingest_id!s}", + ) + except ClientError as e: + # Log upload failure + log_with_extra( + logger=logger, + level="error", + message="S3 upload failed", + bucket_name=bucket_name, + s3_file_path=s3_file_path, + ingest_id=str(ingest_id), + error_code=e.response.get("Error", {}).get("Code"), + error_message=str(e), + ) + msg = f"Failed to upload to S3: {e}" + raise S3UploadError(msg) from e + + # Generate public URL + alias_host = get_s3_alias_host() + if alias_host: + # Use alias host if configured + public_url = f"{alias_host.rstrip('/')}/{s3_file_path}" + else: + # Fallback to direct S3 URL + endpoint = get_s3_config().get("endpoint_url") + if endpoint: + public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{s3_file_path}" + else: + public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}" + + # Log successful upload + log_with_extra( + logger=logger, + level="debug", + message="S3 upload completed successfully", + 
bucket_name=bucket_name, + s3_file_path=s3_file_path, + ingest_id=str(ingest_id), + public_url=public_url, + ) + + return public_url + + +def _build_s3_url(key: str) -> str: + """Build S3 URL for a given key.""" + alias_host = get_s3_alias_host() + if alias_host: + return f"{alias_host.rstrip('/')}/{key}" + endpoint = get_s3_config()["endpoint_url"] + if endpoint: + bucket_name = get_s3_bucket_name() + return f"{endpoint.rstrip('/')}/{bucket_name}/{key}" + bucket_name = get_s3_bucket_name() + return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{key}" + + +def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool: + """Check if an S3 object has the matching ingest_id tag.""" + try: + tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key) + tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])} + return tags.get("ingest_id") == str(target_ingest_id) + except ClientError: + return False + + +def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: + """Get S3 URL for a given ingest ID if it exists. + + Search for files in S3 using object tags to find the matching ingest_id and returns the S3 URL if found. + Used by the download endpoint to redirect to S3 if available. + + Parameters + ---------- + ingest_id : UUID + The ingest ID to search for in S3 object tags. + + Returns + ------- + str | None + S3 URL if file exists, None otherwise. 
+ + """ + if not is_s3_enabled(): + logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id) + return None + + extra_fields = {"ingest_id": str(ingest_id)} + log_with_extra(logger=logger, level="debug", message="Starting S3 URL lookup for ingest ID", **extra_fields) + + try: + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + # List all objects in the ingest/ prefix and check their tags + paginator = s3_client.get_paginator("list_objects_v2") + page_iterator = paginator.paginate(Bucket=bucket_name, Prefix="ingest/") + + objects_checked = 0 + for page in page_iterator: + if "Contents" not in page: + continue + + for obj in page["Contents"]: + key = obj["Key"] + objects_checked += 1 + if _check_object_tags( + s3_client=s3_client, + bucket_name=bucket_name, + key=key, + target_ingest_id=ingest_id, + ): + s3_url = _build_s3_url(key) + extra_fields = { + "ingest_id": str(ingest_id), + "s3_key": key, + "s3_url": s3_url, + "objects_checked": objects_checked, + } + log_with_extra( + logger=logger, + level="debug", + message="Found S3 object for ingest ID", + **extra_fields + ) + return s3_url + + extra_fields = { + "ingest_id": str(ingest_id), + "objects_checked": objects_checked, + } + log_with_extra( + logger=logger, + level="debug", + message="No S3 object found for ingest ID", + **extra_fields + ) + + except ClientError as err: + extra_fields = { + "ingest_id": str(ingest_id), + "error_code": err.response.get("Error", {}).get("Code"), + "error_message": str(err), + } + log_with_extra( + logger=logger, + level="error", + message="Error during S3 URL lookup", + **extra_fields + ) + + return None diff --git a/src/server/query_processor.py b/src/server/query_processor.py index a7b60f61..4841afc6 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -10,7 +10,10 @@ from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import validate_github_token from 
gitingest.utils.pattern_utils import process_patterns +from gitingest.utils.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType + + from server.server_config import MAX_DISPLAY_SIZE from server.server_utils import Colors, log_slider_to_size @@ -59,7 +62,6 @@ async def process_query( return IngestErrorResponse(error=str(exc)) query.url = cast("str", query.url) - query.host = cast("str", query.host) query.max_file_size = max_file_size query.ignore_patterns, query.include_patterns = process_patterns( exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None, @@ -71,13 +73,36 @@ async def process_query( short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "/" for the page title + # The commit hash should always be available at this point + if not query.commit: + raise RuntimeError("Unexpected error: no commit hash found") + try: summary, tree, content = ingest_query(query) - # TODO: why are we writing the tree and content to a file here? 
- local_txt_file = Path(clone_config.local_path).with_suffix(".txt") - with local_txt_file.open("w", encoding="utf-8") as f: - f.write(tree + "\n" + content) + # Prepare the digest content (tree + content) + digest_content = tree + "\n" + content + + # Store digest based on S3 configuration + if is_s3_enabled(): + # Upload to S3 instead of storing locally + s3_file_path = generate_s3_file_path( + source=query.url, + user_name=cast("str", query.user_name), + repo_name=cast("str", query.repo_name), + branch=query.branch, + commit=query.commit, + include_patterns=query.include_patterns, + ignore_patterns=query.ignore_patterns, + ) + s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) + # Store S3 URL in query for later use + query.s3_url = s3_url + else: + # Store locally + local_txt_file = Path(clone_config.local_path).with_suffix(".txt") + with local_txt_file.open("w", encoding="utf-8") as f: + f.write(digest_content) except Exception as exc: _print_error(query.url, exc, max_file_size, pattern_type, pattern) @@ -101,7 +126,7 @@ async def process_query( repo_url=input_text, short_repo_url=short_repo_url, summary=summary, - ingest_id=query.id, + ingest_id=str(query.id), tree=tree, content=content, default_max_file_size=slider_position, diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index 521b7de0..0a9334d4 100644 --- a/src/server/routers/ingest.py +++ b/src/server/routers/ingest.py @@ -1,10 +1,14 @@ """Ingest endpoint for the API.""" +from typing import Union +from uuid import UUID + from fastapi import APIRouter, HTTPException, Request, status -from fastapi.responses import FileResponse, JSONResponse +from fastapi.responses import FileResponse, JSONResponse, RedirectResponse from prometheus_client import Counter from gitingest.config import TMP_BASE_PATH +from gitingest.utils.s3_utils import get_s3_url_for_ingest_id, is_s3_enabled from server.models import IngestRequest from server.routers_utils 
import COMMON_INGEST_RESPONSES, _perform_ingestion from server.server_config import MAX_DISPLAY_SIZE @@ -39,7 +43,7 @@ async def api_ingest( response = await _perform_ingestion( input_text=ingest_request.input_text, max_file_size=ingest_request.max_file_size, - pattern_type=ingest_request.pattern_type, + pattern_type=ingest_request.pattern_type.value, pattern=ingest_request.pattern, token=ingest_request.token, ) @@ -90,21 +94,24 @@ async def api_ingest_get( return response -@router.get("/api/download/file/{ingest_id}", response_class=FileResponse) -async def download_ingest(ingest_id: str) -> FileResponse: +@router.get("/api/download/file/{ingest_id}", response_model=None) +async def download_ingest( + ingest_id: UUID, +) -> Union[RedirectResponse, FileResponse]: # noqa: FA100 (future-rewritable-type-annotation) (pydantic) """Download the first text file produced for an ingest ID. **This endpoint retrieves the first ``*.txt`` file produced during the ingestion process** - and returns it as a downloadable file. The file is streamed with media type ``text/plain`` - and prompts the browser to download it. + and returns it as a downloadable file. If S3 is enabled and the file is stored in S3, + it redirects to the S3 URL. Otherwise, it serves the local file. 
**Parameters** - - **ingest_id** (`str`): Identifier that the ingest step emitted + - **ingest_id** (`UUID`): Identifier that the ingest step emitted **Returns** - - **FileResponse**: Streamed response with media type ``text/plain`` + - **RedirectResponse**: Redirect to S3 URL if S3 is enabled and file exists in S3 + - **FileResponse**: Streamed response with media type ``text/plain`` for local files **Raises** @@ -112,8 +119,15 @@ async def download_ingest(ingest_id: str) -> FileResponse: - **HTTPException**: **403** - the process lacks permission to read the directory or file """ + # Check if S3 is enabled and file exists in S3 + if is_s3_enabled(): + s3_url = get_s3_url_for_ingest_id(ingest_id) + if s3_url: + return RedirectResponse(url=s3_url, status_code=302) + + # Fall back to local file serving # Normalize and validate the directory path - directory = (TMP_BASE_PATH / ingest_id).resolve() + directory = (TMP_BASE_PATH / str(ingest_id)).resolve() if not str(directory).startswith(str(TMP_BASE_PATH.resolve())): raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}") diff --git a/tests/conftest.py b/tests/conftest.py index 0e279726..8c747356 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ from __future__ import annotations import json +import uuid import sys from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict @@ -62,7 +63,7 @@ def sample_query() -> IngestionQuery: repo_name="test_repo", local_path=Path("/tmp/test_repo").resolve(), slug="test_user/test_repo", - id="id", + id=uuid.uuid4(), branch="main", max_file_size=1_000_000, ignore_patterns={"*.pyc", "__pycache__", ".git"}, diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index 342d9882..ce95aa9b 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -55,7 +55,7 @@ async def test_parse_query_without_host( 
query = await parse_remote_repo(url) # Compare against the canonical dict while ignoring unpredictable fields. - actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"}) + actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns", "s3_url"}) assert "commit" in actual assert _is_valid_git_commit_hash(actual["commit"]) diff --git a/tests/server/test_flow_integration.py b/tests/server/test_flow_integration.py index 2129c0d9..31c474dd 100644 --- a/tests/server/test_flow_integration.py +++ b/tests/server/test_flow_integration.py @@ -50,7 +50,7 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> Non client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "243", + "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", @@ -75,7 +75,7 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None: client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/nonexistent/repo", - "max_file_size": "243", + "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", @@ -97,7 +97,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None: # TODO: ingesting a large repo take too much time (eg: godotengine/godot repository) form_data = { "input_text": "https://github.com/octocat/hello-world", - "max_file_size": "10", + "max_file_size": 10, "pattern_type": "exclude", "pattern": "", "token": "", @@ -122,7 +122,7 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None: def make_request() -> None: form_data = { "input_text": "https://github.com/octocat/hello-world", - "max_file_size": "243", + "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", @@ -149,7 +149,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None: client = request.getfixturevalue("test_client") 
form_data = { "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "1", + "max_file_size": 1, "pattern_type": "exclude", "pattern": "", "token": "", @@ -172,7 +172,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "243", + "max_file_size": 243, "pattern_type": "include", "pattern": "*.md", "token": "", From 416254c4576fc53125c7a87f711ab983c8c9b727 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 25 Jul 2025 15:46:55 +0200 Subject: [PATCH 02/12] rebase --- src/gitingest/utils/logging_config.py | 111 ----------- src/server/query_processor.py | 13 +- src/server/routers/ingest.py | 2 +- src/{gitingest/utils => server}/s3_utils.py | 209 +++++++++----------- tests/conftest.py | 2 +- 5 files changed, 107 insertions(+), 230 deletions(-) delete mode 100644 src/gitingest/utils/logging_config.py rename src/{gitingest/utils => server}/s3_utils.py (63%) diff --git a/src/gitingest/utils/logging_config.py b/src/gitingest/utils/logging_config.py deleted file mode 100644 index a5aae312..00000000 --- a/src/gitingest/utils/logging_config.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Centralized logging configuration for JSON logging in k8s environments.""" - -from __future__ import annotations - -import json -import logging -import sys - - -class JSONFormatter(logging.Formatter): - """Custom JSON formatter for structured logging.""" - - def format(self, record: logging.LogRecord) -> str: - """Format log record as JSON.""" - log_entry = { - "timestamp": self.formatTime(record, self.datefmt), - "level": record.levelname, - "logger": record.name, - "message": record.getMessage(), - "module": record.module, - "function": record.funcName, - "line": record.lineno, - } - - # Add exception info if present - if record.exc_info: - 
log_entry["exception"] = self.formatException(record.exc_info) - - # Add extra fields if present - if hasattr(record, "extra_fields"): - log_entry.update(record.extra_fields) - - return json.dumps(log_entry) - - -def configure_json_logging(level: str = "INFO") -> None: - """Configure JSON logging for the application. - - Parameters - ---------- - level : str - Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) - - """ - # Convert string level to logging constant - numeric_level = getattr(logging, level.upper(), logging.INFO) - - # Create JSON formatter - formatter = JSONFormatter(datefmt="%Y-%m-%dT%H:%M:%S") - - # Configure root logger - root_logger = logging.getLogger() - root_logger.setLevel(numeric_level) - - # Remove existing handlers to avoid duplicates - for handler in root_logger.handlers[:]: - root_logger.removeHandler(handler) - - # Create console handler for stdout - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setLevel(numeric_level) - console_handler.setFormatter(formatter) - - # Add handler to root logger - root_logger.addHandler(console_handler) - - -def get_logger(name: str) -> logging.Logger: - """Get a logger instance with the given name. - - Parameters - ---------- - name : str - Logger name (typically __name__) - - Returns - ------- - logging.Logger - Configured logger instance - - """ - return logging.getLogger(name) - - -def log_with_extra(logger: logging.Logger, level: str, message: str, **extra_fields: str | int | bool | None) -> None: - """Log a message with extra fields. 
- - Parameters - ---------- - logger : logging.Logger - Logger instance - level : str - Log level (debug, info, warning, error, critical) - message : str - Log message - **extra_fields : str | int | bool | None - Additional fields to include in the log entry - - """ - # Create a LogRecord with extra fields - record = logger.makeRecord( - logger.name, - getattr(logging, level.upper()), - "", - 0, - message, - (), - None, - ) - record.extra_fields = extra_fields - logger.handle(record) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 4841afc6..b443e65d 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -10,10 +10,8 @@ from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import validate_github_token from gitingest.utils.pattern_utils import process_patterns -from gitingest.utils.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType - - +from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3 from server.server_config import MAX_DISPLAY_SIZE from server.server_utils import Colors, log_slider_to_size @@ -48,6 +46,11 @@ async def process_query( IngestResponse A union type, corresponding to IngestErrorResponse or IngestSuccessResponse + Raises + ------ + RuntimeError + If the commit hash is not found (should never happen). 
+ """ if token: validate_github_token(token) @@ -75,7 +78,8 @@ async def process_query( # The commit hash should always be available at this point if not query.commit: - raise RuntimeError("Unexpected error: no commit hash found") + msg = "Unexpected error: no commit hash found" + raise RuntimeError(msg) try: summary, tree, content = ingest_query(query) @@ -90,7 +94,6 @@ async def process_query( source=query.url, user_name=cast("str", query.user_name), repo_name=cast("str", query.repo_name), - branch=query.branch, commit=query.commit, include_patterns=query.include_patterns, ignore_patterns=query.ignore_patterns, diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index 0a9334d4..fac7e20c 100644 --- a/src/server/routers/ingest.py +++ b/src/server/routers/ingest.py @@ -8,9 +8,9 @@ from prometheus_client import Counter from gitingest.config import TMP_BASE_PATH -from gitingest.utils.s3_utils import get_s3_url_for_ingest_id, is_s3_enabled from server.models import IngestRequest from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion +from server.s3_utils import get_s3_url_for_ingest_id, is_s3_enabled from server.server_config import MAX_DISPLAY_SIZE from server.server_utils import limiter diff --git a/src/gitingest/utils/s3_utils.py b/src/server/s3_utils.py similarity index 63% rename from src/gitingest/utils/s3_utils.py rename to src/server/s3_utils.py index 9da94fbf..07ebdbe4 100644 --- a/src/gitingest/utils/s3_utils.py +++ b/src/server/s3_utils.py @@ -3,17 +3,20 @@ from __future__ import annotations import hashlib +import logging import os +from typing import TYPE_CHECKING +from urllib.parse import urlparse from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) import boto3 -from botocore.client import BaseClient from botocore.exceptions import ClientError -from gitingest.utils.logging_config import get_logger, log_with_extra +if TYPE_CHECKING: + from botocore.client 
import BaseClient # Initialize logger for this module -logger = get_logger(__name__) +logger = logging.getLogger(__name__) class S3UploadError(Exception): @@ -27,27 +30,13 @@ def is_s3_enabled() -> bool: def get_s3_config() -> dict[str, str | None]: """Get S3 configuration from environment variables.""" - config = {} - - # Only include endpoint_url if it's set (for custom S3-compatible services) - endpoint_url = os.getenv("S3_ENDPOINT") - if endpoint_url: - config["endpoint_url"] = endpoint_url - - # Only include credentials if they're explicitly set - access_key = os.getenv("S3_ACCESS_KEY") - if access_key: - config["aws_access_key_id"] = access_key - - secret_key = os.getenv("S3_SECRET_KEY") - if secret_key: - config["aws_secret_access_key"] = secret_key - - # For region, check S3_REGION first, then fall back to AWS_REGION - region = os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1") - config["region_name"] = region - - return config + config = { + "endpoint_url": os.getenv("S3_ENDPOINT"), + "aws_access_key_id": os.getenv("S3_ACCESS_KEY"), + "aws_secret_access_key": os.getenv("S3_SECRET_KEY"), + "region_name": os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1"), + } + return {k: v for k, v in config.items() if v is not None} def get_s3_bucket_name() -> str: @@ -64,7 +53,6 @@ def generate_s3_file_path( source: str, user_name: str, repo_name: str, - branch: str | None, commit: str, include_patterns: set[str] | None, ignore_patterns: set[str], @@ -86,8 +74,6 @@ def generate_s3_file_path( Repository owner or user. repo_name : str Repository name. - branch : str | None - Branch name (if available). commit : str Commit hash. include_patterns : set[str] | None @@ -100,52 +86,57 @@ def generate_s3_file_path( str S3 file path string. + Raises + ------ + ValueError + If the source URL is invalid. 
+ """ - # Extract source from URL or default to "unknown" - if "github.com" in source: - git_source = "github" - elif "gitlab.com" in source: - git_source = "gitlab" - elif "bitbucket.org" in source: - git_source = "bitbucket" - else: - git_source = "unknown" + hostname = urlparse(source).hostname + if hostname is None: + msg = "Invalid source URL" + logger.error(msg) + raise ValueError(msg) - # Use branch, fallback to "main" if neither branch nor commit - branch_name = branch or "main" + # Extract source from URL or default to "unknown" + git_source = { + "github.com": "github", + "gitlab.com": "gitlab", + "bitbucket.org": "bitbucket", + }.get(hostname, "unknown") # Create hash of exclude/include patterns for uniqueness patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}" patterns_str += f"exclude:{sorted(ignore_patterns)}" - patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16] # Build the base path - base_path = f"ingest/{git_source}/{user_name}/{repo_name}/{branch_name}/{commit}/{patterns_hash}.txt" + base_path = f"ingest/{git_source}/{user_name}/{repo_name}/{commit}/{patterns_hash}.txt" # Check for S3_DIRECTORY_PREFIX environment variable s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX") - if s3_directory_prefix: - # Remove trailing slash if present and add the prefix - s3_directory_prefix = s3_directory_prefix.rstrip("/") - return f"{s3_directory_prefix}/{base_path}" - return base_path + if not s3_directory_prefix: + return base_path + + # Remove trailing slash if present and add the prefix + s3_directory_prefix = s3_directory_prefix.rstrip("/") + return f"{s3_directory_prefix}/{base_path}" def create_s3_client() -> BaseClient: """Create and return an S3 client with configuration from environment.""" config = get_s3_config() - - # Log S3 client creation with configuration details (excluding sensitive info) - log_config = {k: v for k, v in config.items() if k not in ["aws_access_key_id", 
"aws_secret_access_key"]} - - extra_fields = { - "s3_config": log_config, - "has_credentials": bool(config.get("aws_access_key_id")), - } - log_with_extra(logger=logger, level="debug", message="Creating S3 client", **extra_fields) - + # Log S3 client creation (excluding sensitive info) + log_config = config.copy() + has_credentials = bool(log_config.pop("aws_access_key_id", None) or log_config.pop("aws_secret_access_key", None)) + logger.debug( + msg="Creating S3 client", + extra={ + "s3_config": log_config, + "has_credentials": has_credentials, + }, + ) return boto3.client("s3", **config) @@ -179,6 +170,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: """ if not is_s3_enabled(): msg = "S3 is not enabled" + logger.error(msg) raise ValueError(msg) s3_client = create_s3_client() @@ -192,7 +184,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: } # Log upload attempt - log_with_extra(logger=logger, level="debug", message="Starting S3 upload", **extra_fields) + logger.debug("Starting S3 upload", extra=extra_fields) try: # Upload the content with ingest_id as tag @@ -203,20 +195,20 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: ContentType="text/plain", Tagging=f"ingest_id={ingest_id!s}", ) - except ClientError as e: + except ClientError as err: # Log upload failure - log_with_extra( - logger=logger, - level="error", - message="S3 upload failed", - bucket_name=bucket_name, - s3_file_path=s3_file_path, - ingest_id=str(ingest_id), - error_code=e.response.get("Error", {}).get("Code"), - error_message=str(e), + logger.exception( + "S3 upload failed", + extra={ + "bucket_name": bucket_name, + "s3_file_path": s3_file_path, + "ingest_id": str(ingest_id), + "error_code": err.response.get("Error", {}).get("Code"), + "error_message": str(err), + }, ) - msg = f"Failed to upload to S3: {e}" - raise S3UploadError(msg) from e + msg = f"Failed to upload to S3: {err}" + raise S3UploadError(msg) 
from err # Generate public URL alias_host = get_s3_alias_host() @@ -232,14 +224,14 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}" # Log successful upload - log_with_extra( - logger=logger, - level="debug", - message="S3 upload completed successfully", - bucket_name=bucket_name, - s3_file_path=s3_file_path, - ingest_id=str(ingest_id), - public_url=public_url, + logger.debug( + "S3 upload completed successfully", + extra={ + "bucket_name": bucket_name, + "s3_file_path": s3_file_path, + "ingest_id": str(ingest_id), + "public_url": public_url, + }, ) return public_url @@ -250,12 +242,15 @@ def _build_s3_url(key: str) -> str: alias_host = get_s3_alias_host() if alias_host: return f"{alias_host.rstrip('/')}/{key}" - endpoint = get_s3_config()["endpoint_url"] + + bucket_name = get_s3_bucket_name() + config = get_s3_config() + + endpoint = config["endpoint_url"] if endpoint: - bucket_name = get_s3_bucket_name() return f"{endpoint.rstrip('/')}/{bucket_name}/{key}" - bucket_name = get_s3_bucket_name() - return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{key}" + + return f"https://{bucket_name}.s3.{config['region_name']}.amazonaws.com/{key}" def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool: @@ -289,8 +284,7 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id) return None - extra_fields = {"ingest_id": str(ingest_id)} - log_with_extra(logger=logger, level="debug", message="Starting S3 URL lookup for ingest ID", **extra_fields) + logger.debug(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)}) try: s3_client = create_s3_client() @@ -315,42 +309,33 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: target_ingest_id=ingest_id, ): 
s3_url = _build_s3_url(key) - extra_fields = { - "ingest_id": str(ingest_id), - "s3_key": key, - "s3_url": s3_url, - "objects_checked": objects_checked, - } - log_with_extra( - logger=logger, - level="debug", - message="Found S3 object for ingest ID", - **extra_fields + logger.debug( + msg="Found S3 object for ingest ID", + extra={ + "ingest_id": str(ingest_id), + "s3_key": key, + "s3_url": s3_url, + "objects_checked": objects_checked, + }, ) return s3_url - extra_fields = { - "ingest_id": str(ingest_id), - "objects_checked": objects_checked, - } - log_with_extra( - logger=logger, - level="debug", - message="No S3 object found for ingest ID", - **extra_fields + logger.debug( + msg="No S3 object found for ingest ID", + extra={ + "ingest_id": str(ingest_id), + "objects_checked": objects_checked, + }, ) except ClientError as err: - extra_fields = { - "ingest_id": str(ingest_id), - "error_code": err.response.get("Error", {}).get("Code"), - "error_message": str(err), - } - log_with_extra( - logger=logger, - level="error", - message="Error during S3 URL lookup", - **extra_fields + logger.exception( + msg="Error during S3 URL lookup", + extra={ + "ingest_id": str(ingest_id), + "error_code": err.response.get("Error", {}).get("Code"), + "error_message": str(err), + }, ) return None diff --git a/tests/conftest.py b/tests/conftest.py index 8c747356..fc97551f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,8 +7,8 @@ from __future__ import annotations import json -import uuid import sys +import uuid from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict from unittest.mock import AsyncMock From 3de3e70e721411027cbfca934ef2e6db866330d4 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 17:00:57 +0200 Subject: [PATCH 03/12] rebase --- src/server/s3_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index 07ebdbe4..f7d70b9a 100644 --- 
a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -130,7 +130,7 @@ def create_s3_client() -> BaseClient: # Log S3 client creation (excluding sensitive info) log_config = config.copy() has_credentials = bool(log_config.pop("aws_access_key_id", None) or log_config.pop("aws_secret_access_key", None)) - logger.debug( + logger.info( msg="Creating S3 client", extra={ "s3_config": log_config, @@ -184,7 +184,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: } # Log upload attempt - logger.debug("Starting S3 upload", extra=extra_fields) + logger.info("Starting S3 upload", extra=extra_fields) try: # Upload the content with ingest_id as tag @@ -224,7 +224,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}" # Log successful upload - logger.debug( + logger.info( "S3 upload completed successfully", extra={ "bucket_name": bucket_name, @@ -281,10 +281,10 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: """ if not is_s3_enabled(): - logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id) + logger.info("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id) return None - logger.debug(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)}) + logger.info(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)}) try: s3_client = create_s3_client() @@ -309,7 +309,7 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: target_ingest_id=ingest_id, ): s3_url = _build_s3_url(key) - logger.debug( + logger.info( msg="Found S3 object for ingest ID", extra={ "ingest_id": str(ingest_id), @@ -320,7 +320,7 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: ) return s3_url - logger.debug( + logger.info( msg="No S3 object found for ingest ID", extra={ "ingest_id": str(ingest_id), From 
f99859c9e59de3047ea8c281fc6b028005447c03 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 17:20:33 +0200 Subject: [PATCH 04/12] rebase --- src/server/routers/ingest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index fac7e20c..42852ea6 100644 --- a/src/server/routers/ingest.py +++ b/src/server/routers/ingest.py @@ -1,5 +1,6 @@ """Ingest endpoint for the API.""" +import logging from typing import Union from uuid import UUID @@ -94,6 +95,8 @@ async def api_ingest_get( return response +logger = logging.getLogger(__name__) + @router.get("/api/download/file/{ingest_id}", response_model=None) async def download_ingest( ingest_id: UUID, @@ -120,9 +123,12 @@ async def download_ingest( """ # Check if S3 is enabled and file exists in S3 + logger.info(f"Checking if S3 is enabled and file exists in S3 for ingest ID: {ingest_id}") if is_s3_enabled(): + logger.info(f"S3 is enabled, checking if file exists in S3 for ingest ID: {ingest_id}") s3_url = get_s3_url_for_ingest_id(ingest_id) if s3_url: + logger.info(f"File exists in S3, redirecting to S3 URL: {s3_url}") return RedirectResponse(url=s3_url, status_code=302) # Fall back to local file serving From 1cc54dd0da913e8a9799d5c756d8b871cfd6f3a2 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 19:07:17 +0200 Subject: [PATCH 05/12] idk --- .vscode/launch.json | 18 +++++++++++++----- Dockerfile | 3 ++- compose.yml | 11 ++++++++--- src/server/routers/ingest.py | 10 ++++++++-- src/server/s3_utils.py | 2 +- 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index a0565651..7eedb3f6 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,12 +1,20 @@ { + "version": "0.2.0", "configurations": [ { - "name": "Python Debugger: Module", + "name": "Python: Attach to Docker", "type": "debugpy", - "request": "launch", - "module": "uvicorn", - "args": ["server.main:app", 
"--host", "0.0.0.0", "--port", "8000"], - "cwd": "${workspaceFolder}/src" + "request": "attach", + "connect": { + "host": "localhost", + "port": 5678 + }, + "pathMappings": [ + { + "localRoot": "${workspaceFolder}/src", + "remoteRoot": "/app" + } + ] } ] } diff --git a/Dockerfile b/Dockerfile index 05f6e44c..d5fea299 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,8 @@ COPY src/ ./src/ RUN set -eux; \ pip install --no-cache-dir --upgrade pip; \ - pip install --no-cache-dir --timeout 1000 .[server] + pip install --no-cache-dir --timeout 1000 .[server]; \ + pip install --no-cache-dir debugpy # Stage 2: Runtime image FROM python:3.13.5-slim@sha256:4c2cf9917bd1cbacc5e9b07320025bdb7cdf2df7b0ceaccb55e9dd7e30987419 diff --git a/compose.yml b/compose.yml index defe28cd..f903b75a 100644 --- a/compose.yml +++ b/compose.yml @@ -54,12 +54,17 @@ services: - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket} - S3_REGION=${S3_REGION:-us-east-1} # Public URL for S3 resources - - S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}} + - S3_ALIAS_HOST=https://d3pi02tcpwm9y3.cloudfront.net volumes: # Mount source code for live development - ./src:/app:ro - # Use --reload flag for hot reloading during development - command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] + # Expose debugpy port for remote debugging + ports: + - "5678:5678" + - "${APP_WEB_BIND:-8000}:8000" + - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" + # Start with debugpy for remote attach + command: ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] depends_on: minio-setup: condition: service_completed_successfully diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index 42852ea6..25918210 100644 --- a/src/server/routers/ingest.py +++ b/src/server/routers/ingest.py @@ -19,6 +19,12 
@@ router = APIRouter() +# Configure basic logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) @router.post("/api/ingest", responses=COMMON_INGEST_RESPONSES) @limiter.limit("10/minute") @@ -95,7 +101,6 @@ async def api_ingest_get( return response -logger = logging.getLogger(__name__) @router.get("/api/download/file/{ingest_id}", response_model=None) async def download_ingest( @@ -122,6 +127,7 @@ async def download_ingest( - **HTTPException**: **403** - the process lacks permission to read the directory or file """ + logger = logging.getLogger(__name__) # Check if S3 is enabled and file exists in S3 logger.info(f"Checking if S3 is enabled and file exists in S3 for ingest ID: {ingest_id}") if is_s3_enabled(): @@ -145,7 +151,7 @@ async def download_ingest( except StopIteration as exc: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"No .txt file found for digest {ingest_id!r}", + detail=f"No .txt file found for digest {ingest_id!r}, s3_enabled: {is_s3_enabled()}" ) from exc try: diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index f7d70b9a..6fbf3df9 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -25,8 +25,8 @@ class S3UploadError(Exception): def is_s3_enabled() -> bool: """Check if S3 is enabled via environment variables.""" - return os.getenv("S3_ENABLED", "false").lower() == "true" + return os.getenv("S3_ENABLED", "false").lower() == "true" def get_s3_config() -> dict[str, str | None]: """Get S3 configuration from environment variables.""" From 1e791c62adaec2bad40f69cb5d236deaca52669a Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 19:20:17 +0200 Subject: [PATCH 06/12] idk --- src/server/s3_utils.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index 6fbf3df9..6bba1ded 100644 --- 
a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -310,32 +310,19 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: ): s3_url = _build_s3_url(key) logger.info( - msg="Found S3 object for ingest ID", - extra={ - "ingest_id": str(ingest_id), - "s3_key": key, - "s3_url": s3_url, - "objects_checked": objects_checked, - }, + msg=f"Found S3 object for ingest ID: {ingest_id}, s3_key: {key}, s3_url: {s3_url}, objects_checked: {objects_checked}", ) return s3_url logger.info( - msg="No S3 object found for ingest ID", - extra={ - "ingest_id": str(ingest_id), - "objects_checked": objects_checked, - }, + msg=f"No S3 obj for ingest_id={ingest_id} (checked {objects_checked})", ) except ClientError as err: logger.exception( - msg="Error during S3 URL lookup", - extra={ - "ingest_id": str(ingest_id), - "error_code": err.response.get("Error", {}).get("Code"), - "error_message": str(err), - }, + f"Error during S3 URL lookup for ingest_id={ingest_id}, " + f"error_code={err.response.get('Error', {}).get('Code')}, " + f"error_message={err}" ) return None From 087c9e84e5d28144e9d4d36fbbc668e1a96b087a Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 19:29:08 +0200 Subject: [PATCH 07/12] idk --- src/server/s3_utils.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index 6bba1ded..2f4fd12d 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -131,11 +131,7 @@ def create_s3_client() -> BaseClient: log_config = config.copy() has_credentials = bool(log_config.pop("aws_access_key_id", None) or log_config.pop("aws_secret_access_key", None)) logger.info( - msg="Creating S3 client", - extra={ - "s3_config": log_config, - "has_credentials": has_credentials, - }, + msg=f"Creating S3 client with config: {log_config}, has_credentials: {has_credentials}" ) return boto3.client("s3", **config) @@ -315,7 +311,7 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: 
return s3_url logger.info( - msg=f"No S3 obj for ingest_id={ingest_id} (checked {objects_checked})", + msg=f"No S3 obj for ingest_id={ingest_id} (checked {objects_checked}, bucket_name={bucket_name})", ) except ClientError as err: From e696407b85195187fc7d309a45b6c3897ebd89e4 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 19:36:54 +0200 Subject: [PATCH 08/12] Revert "idk" This reverts commit 1cc54dd0da913e8a9799d5c756d8b871cfd6f3a2. --- Dockerfile | 3 +-- compose.yml | 11 +++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index d5fea299..05f6e44c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,8 +13,7 @@ COPY src/ ./src/ RUN set -eux; \ pip install --no-cache-dir --upgrade pip; \ - pip install --no-cache-dir --timeout 1000 .[server]; \ - pip install --no-cache-dir debugpy + pip install --no-cache-dir --timeout 1000 .[server] # Stage 2: Runtime image FROM python:3.13.5-slim@sha256:4c2cf9917bd1cbacc5e9b07320025bdb7cdf2df7b0ceaccb55e9dd7e30987419 diff --git a/compose.yml b/compose.yml index f903b75a..defe28cd 100644 --- a/compose.yml +++ b/compose.yml @@ -54,17 +54,12 @@ services: - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket} - S3_REGION=${S3_REGION:-us-east-1} # Public URL for S3 resources - - S3_ALIAS_HOST=https://d3pi02tcpwm9y3.cloudfront.net + - S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}} volumes: # Mount source code for live development - ./src:/app:ro - # Expose debugpy port for remote debugging - ports: - - "5678:5678" - - "${APP_WEB_BIND:-8000}:8000" - - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" - # Start with debugpy for remote attach - command: ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] + # Use --reload flag for hot reloading during development + command: ["python", "-m", "uvicorn", "server.main:app", 
"--host", "0.0.0.0", "--port", "8000", "--reload"] depends_on: minio-setup: condition: service_completed_successfully From c4ee8b3570ad57e250e114197260fbc12c2cdb96 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 22:23:07 +0200 Subject: [PATCH 09/12] idk --- src/server/routers/ingest.py | 103 +++++++++++++++++++-- src/server/s3_utils.py | 175 +++++++++++++++++++++++++---------- 2 files changed, 221 insertions(+), 57 deletions(-) diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index 25918210..dacf8f49 100644 --- a/src/server/routers/ingest.py +++ b/src/server/routers/ingest.py @@ -128,36 +128,127 @@ async def download_ingest( """ logger = logging.getLogger(__name__) + + logger.info("Download request received", extra={ + "ingest_id": str(ingest_id), + "s3_enabled": is_s3_enabled() + }) + # Check if S3 is enabled and file exists in S3 - logger.info(f"Checking if S3 is enabled and file exists in S3 for ingest ID: {ingest_id}") if is_s3_enabled(): - logger.info(f"S3 is enabled, checking if file exists in S3 for ingest ID: {ingest_id}") - s3_url = get_s3_url_for_ingest_id(ingest_id) - if s3_url: - logger.info(f"File exists in S3, redirecting to S3 URL: {s3_url}") - return RedirectResponse(url=s3_url, status_code=302) + logger.info("S3 is enabled, attempting S3 URL lookup", extra={"ingest_id": str(ingest_id)}) + + try: + s3_url = get_s3_url_for_ingest_id(ingest_id) + if s3_url: + logger.info("File found in S3, redirecting", extra={ + "ingest_id": str(ingest_id), + "s3_url": s3_url, + "redirect_status": 302 + }) + return RedirectResponse(url=s3_url, status_code=302) + else: + logger.info("File not found in S3, falling back to local file", extra={ + "ingest_id": str(ingest_id) + }) + except Exception as s3_err: + logger.error("Error during S3 URL lookup, falling back to local file", extra={ + "ingest_id": str(ingest_id), + "error_type": type(s3_err).__name__, + "error_message": str(s3_err) + }) + else: + logger.info("S3 is 
disabled, serving local file", extra={"ingest_id": str(ingest_id)}) # Fall back to local file serving + logger.info("Attempting local file serving", extra={"ingest_id": str(ingest_id)}) + # Normalize and validate the directory path directory = (TMP_BASE_PATH / str(ingest_id)).resolve() + + logger.debug("Local directory path resolved", extra={ + "ingest_id": str(ingest_id), + "directory_path": str(directory), + "tmp_base_path": str(TMP_BASE_PATH.resolve()) + }) + if not str(directory).startswith(str(TMP_BASE_PATH.resolve())): + logger.error("Invalid ingest ID - path traversal attempt", extra={ + "ingest_id": str(ingest_id), + "directory_path": str(directory), + "tmp_base_path": str(TMP_BASE_PATH.resolve()) + }) raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}") if not directory.is_dir(): + logger.error("Digest directory not found", extra={ + "ingest_id": str(ingest_id), + "directory_path": str(directory), + "directory_exists": directory.exists(), + "is_directory": directory.is_dir() if directory.exists() else False + }) raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Digest {ingest_id!r} not found") try: + # List all txt files for debugging + txt_files = list(directory.glob("*.txt")) + logger.debug("Found txt files in directory", extra={ + "ingest_id": str(ingest_id), + "directory_path": str(directory), + "txt_files_count": len(txt_files), + "txt_files": [f.name for f in txt_files] + }) + first_txt_file = next(directory.glob("*.txt")) + + logger.info("Selected txt file for download", extra={ + "ingest_id": str(ingest_id), + "selected_file": first_txt_file.name, + "file_path": str(first_txt_file), + "file_size": first_txt_file.stat().st_size if first_txt_file.exists() else "unknown" + }) + except StopIteration as exc: + # List all files in directory for debugging + all_files = list(directory.glob("*")) + logger.error("No txt file found in digest directory", extra={ + "ingest_id": str(ingest_id), 
+ "directory_path": str(directory), + "all_files_count": len(all_files), + "all_files": [f.name for f in all_files], + "s3_enabled": is_s3_enabled() + }) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"No .txt file found for digest {ingest_id!r}, s3_enabled: {is_s3_enabled()}" ) from exc try: + logger.info("Serving local file", extra={ + "ingest_id": str(ingest_id), + "file_name": first_txt_file.name, + "file_path": str(first_txt_file), + "media_type": "text/plain" + }) return FileResponse(path=first_txt_file, media_type="text/plain", filename=first_txt_file.name) except PermissionError as exc: + logger.error("Permission denied accessing file", extra={ + "ingest_id": str(ingest_id), + "file_path": str(first_txt_file), + "error_message": str(exc) + }) raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail=f"Permission denied for {first_txt_file}", ) from exc + except Exception as exc: + logger.error("Unexpected error serving local file", extra={ + "ingest_id": str(ingest_id), + "file_path": str(first_txt_file), + "error_type": type(exc).__name__, + "error_message": str(exc) + }) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error serving file for digest {ingest_id!r}", + ) from exc diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index 2f4fd12d..4d427ef9 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -25,8 +25,9 @@ class S3UploadError(Exception): def is_s3_enabled() -> bool: """Check if S3 is enabled via environment variables.""" - - return os.getenv("S3_ENABLED", "false").lower() == "true" + s3_enabled = os.getenv("S3_ENABLED", "false").lower() == "true" + logger.info(f"S3 enabled check: S3_ENABLED={os.getenv('S3_ENABLED', 'false')}, result={s3_enabled}") + return s3_enabled def get_s3_config() -> dict[str, str | None]: """Get S3 configuration from environment variables.""" @@ -36,17 +37,34 @@ def get_s3_config() -> dict[str, str | None]: 
"aws_secret_access_key": os.getenv("S3_SECRET_KEY"), "region_name": os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1"), } - return {k: v for k, v in config.items() if v is not None} + + # Log config validation (without sensitive data) + config_status = { + "endpoint_url": "SET" if config["endpoint_url"] else "NOT_SET", + "aws_access_key_id": "SET" if config["aws_access_key_id"] else "NOT_SET", + "aws_secret_access_key": "SET" if config["aws_secret_access_key"] else "NOT_SET", + "region_name": config["region_name"] + } + logger.info(f"S3 config status: {config_status}") + + filtered_config = {k: v for k, v in config.items() if v is not None} + logger.info(f"S3 config keys present: {list(filtered_config.keys())}") + + return filtered_config def get_s3_bucket_name() -> str: """Get S3 bucket name from environment variables.""" - return os.getenv("S3_BUCKET_NAME", "gitingest-bucket") + bucket_name = os.getenv("S3_BUCKET_NAME", "gitingest-bucket") + logger.info(f"S3 bucket name: {bucket_name}") + return bucket_name def get_s3_alias_host() -> str | None: """Get S3 alias host for public URLs.""" - return os.getenv("S3_ALIAS_HOST") + alias_host = os.getenv("S3_ALIAS_HOST") + logger.info(f"S3 alias host: {'SET' if alias_host else 'NOT_SET'}") + return alias_host def generate_s3_file_path( @@ -126,14 +144,43 @@ def generate_s3_file_path( def create_s3_client() -> BaseClient: """Create and return an S3 client with configuration from environment.""" - config = get_s3_config() - # Log S3 client creation (excluding sensitive info) - log_config = config.copy() - has_credentials = bool(log_config.pop("aws_access_key_id", None) or log_config.pop("aws_secret_access_key", None)) - logger.info( - msg=f"Creating S3 client with config: {log_config}, has_credentials: {has_credentials}" - ) - return boto3.client("s3", **config) + try: + config = get_s3_config() + + # Log S3 client creation (excluding sensitive info) + log_config = config.copy() + has_access_key = 
bool(log_config.pop("aws_access_key_id", None)) + has_secret_key = bool(log_config.pop("aws_secret_access_key", None)) + + logger.info( + f"Creating S3 client - endpoint: {log_config.get('endpoint_url', 'DEFAULT_AWS')}, " + f"region: {log_config.get('region_name', 'us-east-1')}, " + f"has_access_key: {has_access_key}, has_secret_key: {has_secret_key}, " + f"credentials_provided: {has_access_key and has_secret_key}" + ) + + client = boto3.client("s3", **config) + + # Test client by attempting to list buckets (this will fail if credentials are wrong) + try: + # This is a lightweight test of the client credentials + client.list_buckets() + logger.info("S3 client created successfully and credentials validated") + except ClientError as test_err: + logger.warning( + f"S3 client created but credential validation failed - " + f"error_code: {test_err.response.get('Error', {}).get('Code')}, " + f"error_message: {str(test_err)}" + ) + + return client + + except Exception as err: + logger.error( + f"Failed to create S3 client - error_type: {type(err).__name__}, " + f"error_message: {str(err)}" + ) + raise def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: @@ -172,15 +219,8 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: s3_client = create_s3_client() bucket_name = get_s3_bucket_name() - extra_fields = { - "bucket_name": bucket_name, - "s3_file_path": s3_file_path, - "ingest_id": str(ingest_id), - "content_size": len(content), - } - # Log upload attempt - logger.info("Starting S3 upload", extra=extra_fields) + logger.info(f"Starting S3 upload - bucket: {bucket_name}, path: {s3_file_path}, ingest_id: {ingest_id}, content_size: {len(content)}") try: # Upload the content with ingest_id as tag @@ -193,15 +233,9 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: ) except ClientError as err: # Log upload failure - logger.exception( - "S3 upload failed", - extra={ - "bucket_name": bucket_name, - 
"s3_file_path": s3_file_path, - "ingest_id": str(ingest_id), - "error_code": err.response.get("Error", {}).get("Code"), - "error_message": str(err), - }, + logger.error( + f"S3 upload failed - bucket: {bucket_name}, path: {s3_file_path}, ingest_id: {ingest_id}, " + f"error_code: {err.response.get('Error', {}).get('Code')}, error_message: {str(err)}" ) msg = f"Failed to upload to S3: {err}" raise S3UploadError(msg) from err @@ -220,15 +254,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}" # Log successful upload - logger.info( - "S3 upload completed successfully", - extra={ - "bucket_name": bucket_name, - "s3_file_path": s3_file_path, - "ingest_id": str(ingest_id), - "public_url": public_url, - }, - ) + logger.info(f"S3 upload completed successfully - bucket: {bucket_name}, path: {s3_file_path}, ingest_id: {ingest_id}, public_url: {public_url}") return public_url @@ -252,10 +278,30 @@ def _build_s3_url(key: str) -> str: def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool: """Check if an S3 object has the matching ingest_id tag.""" try: + logger.info(f"Checking tags for S3 object: {key}, target_ingest_id: {target_ingest_id}") + tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key) tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])} - return tags.get("ingest_id") == str(target_ingest_id) - except ClientError: + + logger.info(f"S3 object tags retrieved - key: {key}, target_ingest_id: {target_ingest_id}, tags: {tags}") + + match_found = tags.get("ingest_id") == str(target_ingest_id) + if match_found: + logger.info(f"Tag match found for {key}, target_ingest_id: {target_ingest_id}") + + return match_found + + except ClientError as err: + logger.warning( + f"Failed to get object tags - key: {key}, target_ingest_id: {target_ingest_id}, " + 
f"error_code: {err.response.get('Error', {}).get('Code')}, error_message: {str(err)}" + ) + return False + except Exception as err: + logger.warning( + f"Unexpected error checking object tags - key: {key}, target_ingest_id: {target_ingest_id}, " + f"error_type: {type(err).__name__}, error_message: {str(err)}" + ) return False @@ -277,27 +323,47 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: """ if not is_s3_enabled(): - logger.info("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id) + logger.info(f"S3 not enabled, skipping URL lookup for ingest_id: {ingest_id}") return None - logger.info(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)}) + logger.info(f"Starting S3 URL lookup for ingest_id: {ingest_id}") try: s3_client = create_s3_client() bucket_name = get_s3_bucket_name() + + logger.info(f"S3 lookup initialized - ingest_id: {ingest_id}, bucket_name: {bucket_name}") # List all objects in the ingest/ prefix and check their tags - paginator = s3_client.get_paginator("list_objects_v2") - page_iterator = paginator.paginate(Bucket=bucket_name, Prefix="ingest/") + try: + paginator = s3_client.get_paginator("list_objects_v2") + page_iterator = paginator.paginate(Bucket=bucket_name, Prefix="ingest/") + + logger.info(f"S3 paginator created, starting object scan - ingest_id: {ingest_id}, bucket_name: {bucket_name}, prefix: ingest/") + except ClientError as paginator_err: + logger.error( + f"Failed to create S3 paginator - ingest_id: {ingest_id}, bucket_name: {bucket_name}, " + f"error_code: {paginator_err.response.get('Error', {}).get('Code')}, error_message: {str(paginator_err)}" + ) + return None objects_checked = 0 + pages_processed = 0 + for page in page_iterator: + pages_processed += 1 + logger.info(f"Processing S3 page {pages_processed} - ingest_id: {ingest_id}") + if "Contents" not in page: + logger.info(f"S3 page {pages_processed} has no contents - ingest_id: {ingest_id}") continue for obj in 
page["Contents"]: key = obj["Key"] objects_checked += 1 + + logger.info(f"Checking S3 object {objects_checked}: {key} - ingest_id: {ingest_id}") + if _check_object_tags( s3_client=s3_client, bucket_name=bucket_name, @@ -306,19 +372,26 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: ): s3_url = _build_s3_url(key) logger.info( - msg=f"Found S3 object for ingest ID: {ingest_id}, s3_key: {key}, s3_url: {s3_url}, objects_checked: {objects_checked}", + f"Found matching S3 object for ingest ID - ingest_id: {ingest_id}, s3_key: {key}, " + f"s3_url: {s3_url}, objects_checked: {objects_checked}, pages_processed: {pages_processed}" ) return s3_url logger.info( - msg=f"No S3 obj for ingest_id={ingest_id} (checked {objects_checked}, bucket_name={bucket_name})", + f"No matching S3 object found for ingest ID - ingest_id: {ingest_id}, objects_checked: {objects_checked}, " + f"pages_processed: {pages_processed}, bucket_name: {bucket_name}" ) except ClientError as err: - logger.exception( - f"Error during S3 URL lookup for ingest_id={ingest_id}, " - f"error_code={err.response.get('Error', {}).get('Code')}, " - f"error_message={err}" + logger.error( + f"S3 client error during URL lookup - ingest_id: {ingest_id}, " + f"error_code: {err.response.get('Error', {}).get('Code')}, error_message: {str(err)}, " + f"bucket_name: {get_s3_bucket_name()}" + ) + except Exception as err: + logger.error( + f"Unexpected error during S3 URL lookup - ingest_id: {ingest_id}, " + f"error_type: {type(err).__name__}, error_message: {str(err)}" ) return None From d2ccfb892cb2269bc93c552502f5630772fc1033 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 22:27:01 +0200 Subject: [PATCH 10/12] idk --- src/server/routers/ingest.py | 88 +++++++----------------------------- 1 file changed, 16 insertions(+), 72 deletions(-) diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index dacf8f49..d54b834c 100644 --- a/src/server/routers/ingest.py +++ 
b/src/server/routers/ingest.py @@ -129,125 +129,69 @@ async def download_ingest( """ logger = logging.getLogger(__name__) - logger.info("Download request received", extra={ - "ingest_id": str(ingest_id), - "s3_enabled": is_s3_enabled() - }) + logger.info(f"Download request received - ingest_id: {ingest_id}, s3_enabled: {is_s3_enabled()}") # Check if S3 is enabled and file exists in S3 if is_s3_enabled(): - logger.info("S3 is enabled, attempting S3 URL lookup", extra={"ingest_id": str(ingest_id)}) + logger.info(f"S3 is enabled, attempting S3 URL lookup - ingest_id: {ingest_id}") try: s3_url = get_s3_url_for_ingest_id(ingest_id) if s3_url: - logger.info("File found in S3, redirecting", extra={ - "ingest_id": str(ingest_id), - "s3_url": s3_url, - "redirect_status": 302 - }) + logger.info(f"File found in S3, redirecting - ingest_id: {ingest_id}, s3_url: {s3_url}, redirect_status: 302") return RedirectResponse(url=s3_url, status_code=302) else: - logger.info("File not found in S3, falling back to local file", extra={ - "ingest_id": str(ingest_id) - }) + logger.info(f"File not found in S3, falling back to local file - ingest_id: {ingest_id}") except Exception as s3_err: - logger.error("Error during S3 URL lookup, falling back to local file", extra={ - "ingest_id": str(ingest_id), - "error_type": type(s3_err).__name__, - "error_message": str(s3_err) - }) + logger.error(f"Error during S3 URL lookup, falling back to local file - ingest_id: {ingest_id}, error_type: {type(s3_err).__name__}, error_message: {str(s3_err)}") else: - logger.info("S3 is disabled, serving local file", extra={"ingest_id": str(ingest_id)}) + logger.info(f"S3 is disabled, serving local file - ingest_id: {ingest_id}") # Fall back to local file serving - logger.info("Attempting local file serving", extra={"ingest_id": str(ingest_id)}) + logger.info(f"Attempting local file serving - ingest_id: {ingest_id}") # Normalize and validate the directory path directory = (TMP_BASE_PATH / str(ingest_id)).resolve() 
- logger.debug("Local directory path resolved", extra={ - "ingest_id": str(ingest_id), - "directory_path": str(directory), - "tmp_base_path": str(TMP_BASE_PATH.resolve()) - }) + logger.info(f"Local directory path resolved - ingest_id: {ingest_id}, directory_path: {str(directory)}, tmp_base_path: {str(TMP_BASE_PATH.resolve())}") if not str(directory).startswith(str(TMP_BASE_PATH.resolve())): - logger.error("Invalid ingest ID - path traversal attempt", extra={ - "ingest_id": str(ingest_id), - "directory_path": str(directory), - "tmp_base_path": str(TMP_BASE_PATH.resolve()) - }) + logger.error(f"Invalid ingest ID - path traversal attempt - ingest_id: {ingest_id}, directory_path: {str(directory)}, tmp_base_path: {str(TMP_BASE_PATH.resolve())}") raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}") if not directory.is_dir(): - logger.error("Digest directory not found", extra={ - "ingest_id": str(ingest_id), - "directory_path": str(directory), - "directory_exists": directory.exists(), - "is_directory": directory.is_dir() if directory.exists() else False - }) + logger.error(f"Digest directory not found - ingest_id: {ingest_id}, directory_path: {str(directory)}, directory_exists: {directory.exists()}, is_directory: {directory.is_dir() if directory.exists() else False}") raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Digest {ingest_id!r} not found") try: # List all txt files for debugging txt_files = list(directory.glob("*.txt")) - logger.debug("Found txt files in directory", extra={ - "ingest_id": str(ingest_id), - "directory_path": str(directory), - "txt_files_count": len(txt_files), - "txt_files": [f.name for f in txt_files] - }) + logger.info(f"Found txt files in directory - ingest_id: {ingest_id}, directory_path: {str(directory)}, txt_files_count: {len(txt_files)}, txt_files: {[f.name for f in txt_files]}") first_txt_file = next(directory.glob("*.txt")) - logger.info("Selected txt file for download", 
extra={ - "ingest_id": str(ingest_id), - "selected_file": first_txt_file.name, - "file_path": str(first_txt_file), - "file_size": first_txt_file.stat().st_size if first_txt_file.exists() else "unknown" - }) + logger.info(f"Selected txt file for download - ingest_id: {ingest_id}, selected_file: {first_txt_file.name}, file_path: {str(first_txt_file)}, file_size: {first_txt_file.stat().st_size if first_txt_file.exists() else 'unknown'}") except StopIteration as exc: # List all files in directory for debugging all_files = list(directory.glob("*")) - logger.error("No txt file found in digest directory", extra={ - "ingest_id": str(ingest_id), - "directory_path": str(directory), - "all_files_count": len(all_files), - "all_files": [f.name for f in all_files], - "s3_enabled": is_s3_enabled() - }) + logger.error(f"No txt file found in digest directory - ingest_id: {ingest_id}, directory_path: {str(directory)}, all_files_count: {len(all_files)}, all_files: {[f.name for f in all_files]}, s3_enabled: {is_s3_enabled()}") raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"No .txt file found for digest {ingest_id!r}, s3_enabled: {is_s3_enabled()}" ) from exc try: - logger.info("Serving local file", extra={ - "ingest_id": str(ingest_id), - "file_name": first_txt_file.name, - "file_path": str(first_txt_file), - "media_type": "text/plain" - }) + logger.info(f"Serving local file - ingest_id: {ingest_id}, file_name: {first_txt_file.name}, file_path: {str(first_txt_file)}, media_type: text/plain") return FileResponse(path=first_txt_file, media_type="text/plain", filename=first_txt_file.name) except PermissionError as exc: - logger.error("Permission denied accessing file", extra={ - "ingest_id": str(ingest_id), - "file_path": str(first_txt_file), - "error_message": str(exc) - }) + logger.error(f"Permission denied accessing file - ingest_id: {ingest_id}, file_path: {str(first_txt_file)}, error_message: {str(exc)}") raise HTTPException( 
status_code=status.HTTP_403_FORBIDDEN, detail=f"Permission denied for {first_txt_file}", ) from exc except Exception as exc: - logger.error("Unexpected error serving local file", extra={ - "ingest_id": str(ingest_id), - "file_path": str(first_txt_file), - "error_type": type(exc).__name__, - "error_message": str(exc) - }) + logger.error(f"Unexpected error serving local file - ingest_id: {ingest_id}, file_path: {str(first_txt_file)}, error_type: {type(exc).__name__}, error_message: {str(exc)}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error serving file for digest {ingest_id!r}", From 83aafe5e7de0349d92130b4f94b123d6ea63e4a2 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 22:48:21 +0200 Subject: [PATCH 11/12] idk --- src/server/s3_utils.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index 4d427ef9..74da6c0a 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -153,8 +153,8 @@ def create_s3_client() -> BaseClient: has_secret_key = bool(log_config.pop("aws_secret_access_key", None)) logger.info( - f"Creating S3 client - endpoint: {log_config.get('endpoint_url', 'DEFAULT_AWS')}, " - f"region: {log_config.get('region_name', 'us-east-1')}, " + f"Creating S3 client - endpoint: {log_config.get('endpoint_url', 'NOT_SET')}, " + f"region: {log_config.get('region_name', 'NOT_SET')}, " f"has_access_key: {has_access_key}, has_secret_key: {has_secret_key}, " f"credentials_provided: {has_access_key and has_secret_key}" ) @@ -335,11 +335,20 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: logger.info(f"S3 lookup initialized - ingest_id: {ingest_id}, bucket_name: {bucket_name}") # List all objects in the ingest/ prefix and check their tags + # Include S3_DIRECTORY_PREFIX if set + search_prefix = "ingest/" + s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX") + if s3_directory_prefix: + search_prefix = 
f"{s3_directory_prefix.rstrip('/')}/ingest/" + logger.info(f"Using S3 directory prefix for search - ingest_id: {ingest_id}, directory_prefix: {s3_directory_prefix}") + else: + logger.info(f"No S3 directory prefix set, using default search - ingest_id: {ingest_id}") + try: paginator = s3_client.get_paginator("list_objects_v2") - page_iterator = paginator.paginate(Bucket=bucket_name, Prefix="ingest/") + page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=search_prefix) - logger.info(f"S3 paginator created, starting object scan - ingest_id: {ingest_id}, bucket_name: {bucket_name}, prefix: ingest/") + logger.info(f"S3 paginator created, starting object scan - ingest_id: {ingest_id}, bucket_name: {bucket_name}, prefix: {search_prefix}") except ClientError as paginator_err: logger.error( f"Failed to create S3 paginator - ingest_id: {ingest_id}, bucket_name: {bucket_name}, " From f3fb41f84528878673261049b88c066399d2a39d Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 22:57:10 +0200 Subject: [PATCH 12/12] idk --- src/server/s3_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index 74da6c0a..5ac74ac3 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -220,7 +220,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: bucket_name = get_s3_bucket_name() # Log upload attempt - logger.info(f"Starting S3 upload - bucket: {bucket_name}, path: {s3_file_path}, ingest_id: {ingest_id}, content_size: {len(content)}") + logger.info(f"Starting S3 upload - ingest_id: {ingest_id}, EXACT_UPLOAD: s3://{bucket_name}/{s3_file_path}, content_size: {len(content)}") try: # Upload the content with ingest_id as tag @@ -348,7 +348,7 @@ def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: paginator = s3_client.get_paginator("list_objects_v2") page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=search_prefix) - logger.info(f"S3 
paginator created, starting object scan - ingest_id: {ingest_id}, bucket_name: {bucket_name}, prefix: {search_prefix}") + logger.info(f"S3 paginator created, starting object scan - ingest_id: {ingest_id}, EXACT_SEARCH: s3://{bucket_name}/{search_prefix}") except ClientError as paginator_err: logger.error( f"Failed to create S3 paginator - ingest_id: {ingest_id}, bucket_name: {bucket_name}, "