diff --git a/.env.example b/.env.example index aabdbf5a..53c3314e 100644 --- a/.env.example +++ b/.env.example @@ -56,3 +56,5 @@ S3_REGION=us-east-1 S3_ALIAS_HOST=127.0.0.1:9000/gitingest-bucket # Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value) # S3_DIRECTORY_PREFIX=my-prefix + +LOG_FORMAT=JSON diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25732f5d..03a603f6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -124,6 +124,7 @@ repos: pytest-asyncio, pytest-mock, python-dotenv, + python-json-logger, 'sentry-sdk[fastapi]', slowapi, starlette>=0.40.0, diff --git a/pyproject.toml b/pyproject.toml index ffbf6504..98e063e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "pathspec>=0.12.1", "pydantic", "python-dotenv", + "python-json-logger", "starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw) "strenum; python_version < '3.11'", "tiktoken>=0.7.0", # Support for o200k_base encoding diff --git a/requirements.txt b/requirements.txt index bdefb957..ee3226f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ pathspec>=0.12.1 prometheus-client pydantic python-dotenv +python-json-logger sentry-sdk[fastapi] slowapi starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw diff --git a/src/gitingest/__main__.py b/src/gitingest/__main__.py index e14ed681..8c417bcc 100644 --- a/src/gitingest/__main__.py +++ b/src/gitingest/__main__.py @@ -4,6 +4,7 @@ from __future__ import annotations import asyncio +import logging from typing import TypedDict import click @@ -11,6 +12,11 @@ from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME from gitingest.entrypoint import ingest_async +from gitingest.logging_config import setup_logging + +setup_logging() + +logger = logging.getLogger(__name__) class _CLIArgs(TypedDict): @@ -163,9 +169,9 @@ async def _async_main( output_target = output if output is not None else OUTPUT_FILE_NAME if output_target == "-": - click.echo("Analyzing source, preparing output for stdout...", err=True) + logger.debug("Analyzing source, preparing output for stdout...") else: - click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True) + logger.debug("Analyzing source, output will be written to '%s'...", output_target) summary, _, _ = await ingest_async( source, @@ -180,18 +186,15 @@ async def _async_main( ) except Exception as exc: # Convert any exception into Click.Abort so that exit status is non-zero - click.echo(f"Error: {exc}", err=True) + logger.exception("Ingest failed.", exc_info=exc) raise click.Abort from exc - if output_target == "-": # stdout - click.echo("\n--- Summary ---", err=True) - click.echo(summary, err=True) - click.echo("--- End Summary ---", err=True) + if output_target == "-": + click.echo(f"--- Summary ---\n{summary}\n--- End Summary ---", err=True) click.echo("Analysis complete! Output sent to stdout.", err=True) - else: # file + else: click.echo(f"Analysis complete! 
Output written to: {output_target}") - click.echo("\nSummary:") - click.echo(summary) + click.echo(f"Summary:\n{summary}") if __name__ == "__main__": diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 321e1b3e..444c6969 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -4,10 +4,10 @@ import asyncio import errno +import logging import shutil import stat import sys -import warnings from contextlib import asynccontextmanager from pathlib import Path from typing import TYPE_CHECKING, AsyncGenerator, Callable @@ -28,6 +28,8 @@ from gitingest.schemas import IngestionQuery +logger = logging.getLogger(__name__) + async def ingest_async( source: str, @@ -209,19 +211,19 @@ def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str """ if tag and query.tag and tag != query.tag: msg = f"Warning: The specified tag '{tag}' overrides the tag found in the URL '{query.tag}'." - warnings.warn(msg, RuntimeWarning, stacklevel=3) + logger.warning(msg) query.tag = tag or query.tag if branch and query.branch and branch != query.branch: msg = f"Warning: The specified branch '{branch}' overrides the branch found in the URL '{query.branch}'." - warnings.warn(msg, RuntimeWarning, stacklevel=3) + logger.warning(msg) query.branch = branch or query.branch if tag and branch: msg = "Warning: Both tag and branch are specified. The tag will be used." - warnings.warn(msg, RuntimeWarning, stacklevel=3) + logger.warning(msg) # Tag wins over branch if both supplied if query.tag: @@ -300,22 +302,17 @@ def _handle_remove_readonly( async def _write_output(tree: str, content: str, target: str | None) -> None: - """Write combined output to ``target`` (``"-"`` ⇒ stdout). - - Parameters - ---------- - tree : str - The tree-like string representation of the file structure. - content : str - The content of the files in the repository or directory. - target : str | None - The path to the output file. If ``None``, the results are not written to a file. - - """ + """Write combined output to ``target`` (``"-"`` ⇒ stdout).""" data = f"{tree}\n{content}" loop = asyncio.get_running_loop() - if target == "-": - await loop.run_in_executor(None, sys.stdout.write, data) - await loop.run_in_executor(None, sys.stdout.flush) - elif target is not None: - await loop.run_in_executor(None, Path(target).write_text, data, "utf-8") + try: + if target == "-": + logger.debug("Writing output to stdout.") + await loop.run_in_executor(None, sys.stdout.write, data) + await loop.run_in_executor(None, sys.stdout.flush) + elif target is not None: + logger.debug("Writing output to file: %s", target) + await loop.run_in_executor(None, Path(target).write_text, data, "utf-8") + except Exception as exc: + logger.exception("Failed to write output to %s.", target, exc_info=exc) + raise diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 489a41a4..a2fb19b6 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -2,6 +2,7 @@ from __future__ import annotations +import logging from pathlib import Path from typing import TYPE_CHECKING @@ -13,6 +14,8 @@ if TYPE_CHECKING: from gitingest.schemas import IngestionQuery +logger = logging.getLogger(__name__) + def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: """Run the ingestion process for a parsed query. 
@@ -111,7 +114,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
             _process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
         elif sub_path.is_file():
             if sub_path.stat().st_size > query.max_file_size:
-                print(f"Skipping file {sub_path}: would exceed max file size limit")
+                logger.debug("Skipping file %s: would exceed max file size limit", sub_path)
                 continue
             _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
         elif sub_path.is_dir():
@@ -133,7 +136,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
             node.file_count += child_directory_node.file_count
             node.dir_count += 1 + child_directory_node.dir_count
         else:
-            print(f"Warning: {sub_path} is an unknown file type, skipping")
+            logger.warning("%s is an unknown file type, skipping", sub_path)

     node.sort_children()

@@ -186,12 +189,12 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
     """
     if stats.total_files + 1 > MAX_FILES:
-        print(f"Maximum file limit ({MAX_FILES}) reached")
+        logger.warning("Maximum file limit (%i) reached", MAX_FILES)
         return

     file_size = path.stat().st_size
     if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES:
-        print(f"Skipping file {path}: would exceed total size limit")
+        logger.debug("Skipping file %s: would exceed total size limit", path)
         return

     stats.total_files += 1
@@ -232,15 +235,15 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:
     """
     if depth > MAX_DIRECTORY_DEPTH:
-        print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached")
+        logger.warning("Maximum depth limit (%i) reached", MAX_DIRECTORY_DEPTH)
         return True

     if stats.total_files >= MAX_FILES:
-        print(f"Maximum file limit ({MAX_FILES}) reached")
+        logger.warning("Maximum file limit (%i) reached", MAX_FILES)
         return True  # TODO: end recursion

     if stats.total_size >= MAX_TOTAL_SIZE_BYTES:
-        print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached")
+        logger.warning("Maximum total size limit (%.1fMB) reached", MAX_TOTAL_SIZE_BYTES / 1024 / 1024)
         return True  # TODO: end recursion

     return False
diff --git a/src/gitingest/logging_config.py b/src/gitingest/logging_config.py
new file mode 100644
index 00000000..e1cde977
--- /dev/null
+++ b/src/gitingest/logging_config.py
@@ -0,0 +1,28 @@
+"""Global logger configuration."""
+
+import logging
+import os
+
+from pythonjsonlogger import jsonlogger
+
+
+def setup_logging(level: int = logging.INFO) -> None:
+    """Configure logger for the whole gitingest module.
+ + Selects formatter based on LOG_FORMAT env variable: + - 'json': JSON formatter (time/level/msg, then extras) + - any other value or unset: default formatter + """ + logger = logging.getLogger() + logger.setLevel(level) + log_handler = logging.StreamHandler() + + log_format = os.getenv("LOG_FORMAT", "default").lower() + if log_format == "json": + formatter = jsonlogger.JsonFormatter( + "%(asctime)s %(levelname)s %(message)s %(name)s %(module)s %(funcName)s %(lineno)d", + ) + else: + formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s") + log_handler.setFormatter(formatter) + logger.handlers = [log_handler] diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 2a9957b2..094db230 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -2,12 +2,13 @@ from __future__ import annotations -import ssl +import logging import warnings +from ssl import SSLError from typing import TYPE_CHECKING -import requests.exceptions import tiktoken +from requests.exceptions import RequestException from gitingest.schemas import FileSystemNode, FileSystemNodeType from gitingest.utils.compat_func import readlink @@ -15,6 +16,8 @@ if TYPE_CHECKING: from gitingest.schemas import IngestionQuery +logger = logging.getLogger(__name__) + _TOKEN_THRESHOLDS: list[tuple[int, str]] = [ (1_000_000, "M"), (1_000, "k"), @@ -195,7 +198,7 @@ def _format_token_count(text: str) -> str | None: except (ValueError, UnicodeEncodeError) as exc: warnings.warn(f"Failed to estimate token size: {exc}", RuntimeWarning, stacklevel=3) return None - except (requests.exceptions.RequestException, ssl.SSLError) as exc: + except (RequestException, SSLError) as exc: # If network errors, skip token count estimation instead of erroring out warnings.warn(f"Failed to download tiktoken model: {exc}", RuntimeWarning, stacklevel=3) return None diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 6262f0db..371a6352 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -2,8 +2,8 @@ from __future__ import annotations +import logging import uuid -import warnings from pathlib import Path from typing import Literal @@ -18,6 +18,8 @@ _normalise_source, ) +logger = logging.getLogger(__name__) + async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery: """Parse a repository URL and return an ``IngestionQuery`` object. @@ -71,16 +73,19 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ # TODO: Handle issues and pull requests if query.type in {PathKind.ISSUES, PathKind.PULL}: msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root." + logger.warning(msg) return await _fallback_to_root(query, token=token, warn_msg=msg) # If no extra path parts, just return if not path_parts: msg = f"Warning: No extra path parts: {url}. Returning repository root." + logger.warning(msg) return await _fallback_to_root(query, token=token, warn_msg=msg) if query.type not in {PathKind.TREE, PathKind.BLOB}: # TODO: Handle other types msg = f"Warning: Type '{query.type}' is not yet supported: {url}. Returning repository root." + logger.warning(msg) return await _fallback_to_root(query, token=token, warn_msg=msg) # Commit, branch, or tag @@ -169,7 +174,7 @@ async def _configure_branch_or_tag( except RuntimeError as exc: # If remote discovery fails, we optimistically treat the first path segment as the branch/tag. 
msg = f"Warning: Failed to fetch {_ref_type}: {exc}" - warnings.warn(msg, RuntimeWarning, stacklevel=2) + logger.warning(msg) return path_parts.pop(0) if path_parts else None # Iterate over the path components and try to find a matching branch/tag diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index 21369075..98d954e9 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -2,6 +2,7 @@ from __future__ import annotations +import logging from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) @@ -10,6 +11,8 @@ from gitingest.config import MAX_FILE_SIZE from gitingest.schemas.cloning import CloneConfig +logger = logging.getLogger(__name__) + class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes """Pydantic model to store the parsed details of the repository or file path. @@ -72,21 +75,18 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes s3_url: str | None = None def extract_clone_config(self) -> CloneConfig: - """Extract the relevant fields for the CloneConfig object. - - Returns - ------- - CloneConfig - A CloneConfig object containing the relevant fields. - - Raises - ------ - ValueError - If the ``url`` parameter is not provided. - - """ + """Extract the relevant fields for the CloneConfig object.""" + logger.debug( + "Extracting CloneConfig for url=%s, local_path=%s, branch=%s, tag=%s, commit=%s", + self.url, + self.local_path, + self.branch, + self.tag, + self.commit, + ) if not self.url: msg = "The 'url' parameter is required." + logger.error(msg) raise ValueError(msg) return CloneConfig( diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index a094e944..c3d82173 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -4,6 +4,7 @@ import asyncio import base64 +import logging import re import sys from pathlib import Path @@ -15,11 +16,12 @@ from gitingest.utils.compat_func import removesuffix from gitingest.utils.exceptions import InvalidGitHubTokenError -from server.server_utils import Colors if TYPE_CHECKING: from gitingest.schemas import CloneConfig +logger = logging.getLogger(__name__) + # GitHub Personal-Access tokens (classic + fine-grained). # - ghp_ / gho_ / ghu_ / ghs_ / ghr_ → 36 alphanumerics # - github_pat_ → 22 alphanumerics + "_" + 59 alphanumerics @@ -97,13 +99,12 @@ async def ensure_git_installed() -> None: try: stdout, _ = await run_command("git", "config", "core.longpaths") if stdout.decode().strip().lower() != "true": - print( - f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}Git clone may fail on Windows " - f"due to long file paths:{Colors.END}", + logger.warning( + """Git clone may fail on Windows due to long file paths: +To avoid this issue, consider enabling long path support with: + git config --global core.longpaths true +Note: This command may require administrator privileges.""", ) - print(f"{Colors.RED}To avoid this issue, consider enabling long path support with:{Colors.END}") - print(f"{Colors.RED} git config --global core.longpaths true{Colors.END}") - print(f"{Colors.RED}Note: This command may require administrator privileges.{Colors.END}") except RuntimeError: # Ignore if checking 'core.longpaths' fails. 
pass diff --git a/src/gitingest/utils/notebook.py b/src/gitingest/utils/notebook.py index cfa09238..96776966 100644 --- a/src/gitingest/utils/notebook.py +++ b/src/gitingest/utils/notebook.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import logging import warnings from itertools import chain from typing import TYPE_CHECKING, Any @@ -12,6 +13,8 @@ if TYPE_CHECKING: from pathlib import Path +logger = logging.getLogger(__name__) + def process_notebook(file: Path, *, include_output: bool = True) -> str: """Process a Jupyter notebook file and return an executable Python script as a string. @@ -39,6 +42,7 @@ def process_notebook(file: Path, *, include_output: bool = True) -> str: notebook: dict[str, Any] = json.load(f) except json.JSONDecodeError as exc: msg = f"Invalid JSON in notebook: {file}" + logger.exception(msg) raise InvalidNotebookError(msg) from exc # Check if the notebook contains worksheets @@ -127,24 +131,7 @@ def _process_cell(cell: dict[str, Any], *, include_output: bool) -> str | None: def _extract_output(output: dict[str, Any]) -> list[str]: - """Extract the output from a Jupyter notebook cell. - - Parameters - ---------- - output : dict[str, Any] - The output dictionary from a Jupyter notebook cell. - - Returns - ------- - list[str] - The output as a list of strings. - - Raises - ------ - ValueError - If an unknown output type is encountered. - - """ + """Extract the output from a Jupyter notebook cell.""" output_type = output["output_type"] if output_type == "stream": @@ -157,4 +144,5 @@ def _extract_output(output: dict[str, Any]) -> list[str]: return [f"Error: {output['ename']}: {output['evalue']}"] msg = f"Unknown output type: {output_type}" + logger.error(msg) raise ValueError(msg) diff --git a/src/gitingest/utils/query_parser_utils.py b/src/gitingest/utils/query_parser_utils.py index 41dc7ada..80234bf2 100644 --- a/src/gitingest/utils/query_parser_utils.py +++ b/src/gitingest/utils/query_parser_utils.py @@ -2,8 +2,8 @@ from __future__ import annotations +import logging import string -import warnings from typing import TYPE_CHECKING, cast from urllib.parse import ParseResult, unquote, urlparse @@ -13,6 +13,7 @@ if TYPE_CHECKING: from gitingest.schemas import IngestionQuery +logger = logging.getLogger(__name__) HEX_DIGITS: set[str] = set(string.hexdigits) @@ -56,7 +57,7 @@ async def _fallback_to_root(query: IngestionQuery, token: str | None, warn_msg: url = cast("str", query.url) query.commit = await _resolve_ref_to_sha(url, pattern="HEAD", token=token) if warn_msg: - warnings.warn(warn_msg, RuntimeWarning, stacklevel=3) + logger.warning(warn_msg) return query diff --git a/src/server/main.py b/src/server/main.py index 2a07773a..30003c75 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -14,6 +14,7 @@ from slowapi.errors import RateLimitExceeded from starlette.middleware.trustedhost import TrustedHostMiddleware +from gitingest.logging_config import setup_logging from server.metrics_server import start_metrics_server from server.routers import dynamic, index, ingest from server.server_config import templates @@ -22,6 +23,9 @@ # Load environment variables from .env file load_dotenv() +# Setup logging based on LOG_FORMAT env variable +setup_logging() + # Initialize Sentry SDK if enabled if os.getenv("GITINGEST_SENTRY_ENABLED") is not None: sentry_dsn = os.getenv("GITINGEST_SENTRY_DSN") diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 7a55bfd1..e28e2347 100644 --- a/src/server/query_processor.py +++ 
b/src/server/query_processor.py @@ -22,7 +22,8 @@ upload_to_s3, ) from server.server_config import MAX_DISPLAY_SIZE -from server.server_utils import Colors + +logger = logging.getLogger(__name__) if TYPE_CHECKING: from gitingest.schemas.cloning import CloneConfig @@ -70,14 +71,7 @@ async def _check_s3_cache( clone_config = query.extract_clone_config() query.commit = await resolve_commit(clone_config, token=token) # Generate S3 file path using the resolved commit - s3_file_path = generate_s3_file_path( - source=query.url, - user_name=cast("str", query.user_name), - repo_name=cast("str", query.repo_name), - commit=query.commit, - include_patterns=query.include_patterns, - ignore_patterns=query.ignore_patterns, - ) + s3_file_path = generate_s3_file_path(query) # Check if file exists on S3 if check_s3_object_exists(s3_file_path): @@ -147,14 +141,7 @@ def _store_digest_content( """ if is_s3_enabled(): # Upload to S3 instead of storing locally - s3_file_path = generate_s3_file_path( - source=query.url, - user_name=cast("str", query.user_name), - repo_name=cast("str", query.repo_name), - commit=query.commit, - include_patterns=query.include_patterns, - ignore_patterns=query.ignore_patterns, - ) + s3_file_path = generate_s3_file_path(query) s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) # Also upload metadata JSON for caching @@ -244,14 +231,22 @@ async def process_query( If the commit hash is not found (should never happen). """ + logger.debug( + "Processing query: input_text=%s, max_file_size=%s, pattern_type=%s, pattern=%s", + input_text, + max_file_size, + pattern_type, + pattern, + ) if token: validate_github_token(token) try: + logger.debug("Parsing remote repo.") query = await parse_remote_repo(input_text, token=token) + logger.debug("Parsed query: url=%s, user=%s, repo=%s", query.url, query.user_name, query.repo_name) except Exception as exc: - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print(f"{Colors.RED}{exc}{Colors.END}") + logger.exception("Failed to parse remote repo.") return IngestErrorResponse(error=str(exc)) query.url = cast("str", query.url) @@ -274,6 +269,7 @@ async def process_query( return s3_response clone_config = query.extract_clone_config() + logger.debug("Cloning repo with config: %r", clone_config) await clone_repo(clone_config, token=token) short_repo_url = f"{query.user_name}/{query.repo_name}" @@ -288,22 +284,37 @@ async def process_query( digest_content = tree + "\n" + content _store_digest_content(query, clone_config, digest_content, summary, tree, content) except Exception as exc: - _print_error(query.url, exc, max_file_size, pattern_type, pattern) + logger.exception( + "Error processing query for URL %s (max_file_size=%s, pattern_type=%s, pattern=%s).", + query.url, + max_file_size, + pattern_type, + pattern, + exc_info=exc, + ) return IngestErrorResponse(error=str(exc)) if len(content) > MAX_DISPLAY_SIZE: + logger.info( + "Content cropped to %sk characters for display.", + int(MAX_DISPLAY_SIZE / 1_000), + ) # Important: user-facing truncation content = ( f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] ) - _print_success( - url=query.url, - max_file_size=max_file_size, - pattern_type=pattern_type, - pattern=pattern, - summary=summary, - ) + logger.info( + "Query processed successfully for URL %s (max_file_size=%s, pattern_type=%s, pattern=%s)", + query.url, + max_file_size, + pattern_type, 
+ pattern, + ) # Important: successful query + estimated_tokens = None + if "Estimated tokens:" in summary: + estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] + logger.info("Estimated tokens: %s", estimated_tokens) # Important: token estimation digest_url = _generate_digest_url(query) @@ -318,76 +329,3 @@ async def process_query( pattern_type=pattern_type, pattern=pattern, ) - - -def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: - """Print a formatted summary of the query details for debugging. - - Parameters - ---------- - url : str - The URL associated with the query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - - """ - default_max_file_kb = 50 - print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") - if int(max_file_size / 1024) != default_max_file_kb: - print( - f" | {Colors.YELLOW}Size: {int(max_file_size / 1024)}kB{Colors.END}", - end="", - ) - if pattern_type == "include" and pattern != "": - print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") - elif pattern_type == "exclude" and pattern != "": - print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") - - -def _print_error(url: str, exc: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: - """Print a formatted error message for debugging. - - Parameters - ---------- - url : str - The URL associated with the query that caused the error. - exc : Exception - The exception raised during the query or process. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - - """ - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - _print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.RED}{exc}{Colors.END}") - - -def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: - """Print a formatted success message for debugging. - - Parameters - ---------- - url : str - The URL associated with the successful query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - summary : str - A summary of the query result, including details like estimated tokens. 
- - """ - estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] - print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") - _print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index ce9e6512..ace848b6 100644 --- a/src/server/routers/ingest.py +++ b/src/server/routers/ingest.py @@ -1,5 +1,6 @@ """Ingest endpoint for the API.""" +import logging from typing import Union from uuid import UUID @@ -15,6 +16,7 @@ from server.server_utils import limiter ingest_counter = Counter("gitingest_ingest_total", "Number of ingests", ["status", "url"]) +logger = logging.getLogger(__name__) router = APIRouter() @@ -40,6 +42,13 @@ async def api_ingest( - **JSONResponse**: Success response with ingestion results or error response with appropriate HTTP status code """ + logger.debug( + "POST /api/ingest called with input_text=%s, max_file_size=%s, pattern_type=%s, pattern=%s", + ingest_request.input_text, + ingest_request.max_file_size, + ingest_request.pattern_type, + ingest_request.pattern, + ) response = await _perform_ingestion( input_text=ingest_request.input_text, max_file_size=ingest_request.max_file_size, @@ -47,7 +56,12 @@ async def api_ingest( pattern=ingest_request.pattern, token=ingest_request.token, ) - # limit URL to 255 characters + logger.info( + "Ingest POST result: status_code=%s, url=%s", + response.status_code, + ingest_request.input_text[:255], + ) # Important event: ingestion result + ingest_counter.labels(status=response.status_code, url=ingest_request.input_text[:255]).inc() return response @@ -82,6 +96,16 @@ async def api_ingest_get( **Returns** - **JSONResponse**: Success response with ingestion results or error response with appropriate HTTP status code """ + logger.debug( + "GET /api/%s/%s called with user=%s, repository=%s, max_file_size=%s, pattern_type=%s, pattern=%s", + user, + repository, + user, + repository, + max_file_size, + pattern_type, + pattern, + ) response = await _perform_ingestion( input_text=f"{user}/{repository}", max_file_size=max_file_size, @@ -89,7 +113,13 @@ async def api_ingest_get( pattern=pattern, token=token or None, ) - # limit URL to 255 characters + logger.info( + "Ingest GET result: status_code=%s, url=%s/%s", + response.status_code, + user, + repository, + ) # Important event: ingestion result + ingest_counter.labels(status=response.status_code, url=f"{user}/{repository}"[:255]).inc() return response @@ -131,22 +161,28 @@ async def download_ingest( # Normalize and validate the directory path directory = (TMP_BASE_PATH / str(ingest_id)).resolve() if not str(directory).startswith(str(TMP_BASE_PATH.resolve())): + logger.error("Invalid ingest ID: %s (directory traversal attempt)", ingest_id) raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}") if not directory.is_dir(): + logger.error("Digest %s not found (directory does not exist)", ingest_id) raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Digest {ingest_id!r} not found") try: first_txt_file = next(directory.glob("*.txt")) + logger.debug("Found .txt file for download: %s", first_txt_file) except StopIteration as exc: + logger.exception("No .txt file found for digest %s", ingest_id, exc_info=exc) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"No .txt file found for digest {ingest_id!r}", + detail=f"No .txt file found for 
digest {ingest_id}", ) from exc try: + logger.info("Returning FileResponse for %s", first_txt_file) # Important event: file download return FileResponse(path=first_txt_file, media_type="text/plain", filename=first_txt_file.name) except PermissionError as exc: + logger.exception("Permission denied for %s", first_txt_file, exc_info=exc) raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail=f"Permission denied for {first_txt_file}", diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index f7f14ad0..6d508c9f 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -18,6 +18,8 @@ if TYPE_CHECKING: from botocore.client import BaseClient + from gitingest.schemas.ingestion import IngestionQuery + # Initialize logger for this module logger = logging.getLogger(__name__) @@ -58,12 +60,7 @@ def get_s3_alias_host() -> str | None: def generate_s3_file_path( - source: str, - user_name: str, - repo_name: str, - commit: str, - include_patterns: set[str] | None, - ignore_patterns: set[str], + query: IngestionQuery, ) -> str: """Generate S3 file path with proper naming convention. @@ -77,18 +74,8 @@ def generate_s3_file_path( Parameters ---------- - source : str - Git host (e.g., github, gitlab, bitbucket, etc.). - user_name : str - Repository owner or user. - repo_name : str - Repository name. - commit : str - Commit hash. - include_patterns : set[str] | None - Set of patterns specifying which files to include. - ignore_patterns : set[str] - Set of patterns specifying which files to exclude. + query : IngestionQuery + The query object containing repository information. Returns ------- @@ -101,19 +88,32 @@ def generate_s3_file_path( If the source URL is invalid. """ - hostname = urlparse(source).hostname + if query.host is None: + msg = "Source is None" + logger.error(msg) + raise ValueError(msg) + + if query.commit is None: + msg = "commit is None" + logger.error(msg) + raise ValueError(msg) + + hostname = urlparse(query.host).hostname if hostname is None: msg = "Invalid source URL" logger.error(msg) raise ValueError(msg) # Create hash of exclude/include patterns for uniqueness - patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}" - patterns_str += f"exclude:{sorted(ignore_patterns)}" + patterns_str = f"include:{sorted(query.include_patterns) if query.include_patterns else []}" + patterns_str += f"exclude:{sorted(query.ignore_patterns)}" patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16] # Build the base path using hostname directly - base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt" + base_path = ( + f"ingest/{hostname}/{query.user_name}/{query.repo_name}/{query.commit}/" + f"{patterns_hash}/{query.user_name}-{query.repo_name}.txt" + ) # Check for S3_DIRECTORY_PREFIX environment variable s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX") diff --git a/src/server/server_utils.py b/src/server/server_utils.py index ee6f9eca..d44ae884 100644 --- a/src/server/server_utils.py +++ b/src/server/server_utils.py @@ -1,6 +1,7 @@ """Utility functions for the server.""" import asyncio +import logging import shutil import time from contextlib import asynccontextmanager, suppress @@ -19,6 +20,8 @@ # Initialize a rate limiter limiter = Limiter(key_func=get_remote_address) +logger = logging.getLogger(__name__) + async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: """Handle rate-limiting errors with a custom exception handler. 
@@ -103,8 +106,8 @@ async def _remove_old_repositories( await _process_folder(folder) - except (OSError, PermissionError) as exc: - print(f"Error in _remove_old_repositories: {exc}") + except (OSError, PermissionError): + logger.exception("Exception in _remove_old_repositories") await asyncio.sleep(scan_interval) @@ -133,16 +136,16 @@ async def _process_folder(folder: Path) -> None: owner, repo = filename.split("-", 1) repo_url = f"{owner}/{repo}" await loop.run_in_executor(None, _append_line, history_file, repo_url) - except (OSError, PermissionError) as exc: - print(f"Error logging repository URL for {folder}: {exc}") + except (OSError, PermissionError): + logger.exception("Exception raised while processing folder %s", folder) # Delete the cloned repo try: await loop.run_in_executor(None, shutil.rmtree, folder) - except PermissionError as exc: - print(f"No permission to delete {folder}: {exc}") - except OSError as exc: - print(f"Could not delete {folder}: {exc}") + except PermissionError: + logger.exception("No permission to delete %s", folder) + except OSError: + logger.exception("Could not delete %s", folder) def _append_line(path: Path, line: str) -> None: diff --git a/tests/test_summary.py b/tests/test_summary.py index ac32394a..9d9d25ed 100644 --- a/tests/test_summary.py +++ b/tests/test_summary.py @@ -55,7 +55,7 @@ def test_ingest_summary(path_type: str, path: str, ref_type: str, ref: str) -> N is_blob = path_type == "blob" expected_lines = _calculate_expected_lines(ref_type, is_main_branch=is_main_branch) expected_non_empty_lines = expected_lines - 1 - + print(f"https://github.com/{REPO}/{path_type}/{ref}{path}") summary, _, _ = ingest(f"https://github.com/{REPO}/{path_type}/{ref}{path}") lines = summary.splitlines() parsed_lines = dict(line.split(": ", 1) for line in lines if ": " in line)
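
Reviewer note (illustrative, not part of the patch): a minimal sketch of how the new setup_logging() is expected to behave with this change installed. It assumes python-json-logger is available and that LOG_FORMAT is read when setup_logging() is called, as in src/gitingest/logging_config.py; the "demo" logger name and the repository string are placeholders.

    import logging
    import os

    # Select the JSON formatter; leave LOG_FORMAT unset (or set to any other
    # value) to get the plain "%(asctime)s %(levelname)s %(message)s" format.
    os.environ["LOG_FORMAT"] = "json"

    from gitingest.logging_config import setup_logging

    setup_logging()  # sets the root logger to INFO and installs a single StreamHandler

    logger = logging.getLogger("demo")  # hypothetical module name
    logger.info("cloning repository", extra={"repo": "octocat/hello-world"})
    # With LOG_FORMAT=json, each record is emitted as one JSON object containing
    # asctime/levelname/message plus name, module, funcName, and lineno;
    # python-json-logger also merges `extra` fields such as "repo" into the object.
    # With the default formatter, the same call prints a single plain-text line.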