Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ repos:
pytest-asyncio,
pytest-mock,
python-dotenv,
python-json-logger,
'sentry-sdk[fastapi]',
slowapi,
starlette>=0.40.0,
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies = [
"pathspec>=0.12.1",
"pydantic",
"python-dotenv",
"python-json-logger",
"starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw)
"strenum; python_version < '3.11'",
"tiktoken>=0.7.0", # Support for o200k_base encoding
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pathspec>=0.12.1
prometheus-client
pydantic
python-dotenv
python-json-logger
sentry-sdk[fastapi]
slowapi
starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw
Expand Down
6 changes: 6 additions & 0 deletions src/gitingest/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,19 @@
from __future__ import annotations

import asyncio
import logging
from typing import TypedDict

import click
from typing_extensions import Unpack

from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
from gitingest.entrypoint import ingest_async
from gitingest.logging_config import setup_json_logging

setup_json_logging()

logger = logging.getLogger(__name__)


class _CLIArgs(TypedDict):
Expand Down
10 changes: 6 additions & 4 deletions src/gitingest/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

import asyncio
import errno
import logging
import shutil
import stat
import sys
import warnings
from contextlib import asynccontextmanager
from pathlib import Path
from typing import TYPE_CHECKING, AsyncGenerator, Callable
Expand All @@ -28,6 +28,8 @@

from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)


async def ingest_async(
source: str,
Expand Down Expand Up @@ -209,19 +211,19 @@ def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str
"""
if tag and query.tag and tag != query.tag:
msg = f"Warning: The specified tag '{tag}' overrides the tag found in the URL '{query.tag}'."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

query.tag = tag or query.tag

if branch and query.branch and branch != query.branch:
msg = f"Warning: The specified branch '{branch}' overrides the branch found in the URL '{query.branch}'."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

query.branch = branch or query.branch

if tag and branch:
msg = "Warning: Both tag and branch are specified. The tag will be used."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

# Tag wins over branch if both supplied
if query.tag:
Expand Down
17 changes: 10 additions & 7 deletions src/gitingest/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import logging
from pathlib import Path
from typing import TYPE_CHECKING

Expand All @@ -13,6 +14,8 @@
if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)


def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
"""Run the ingestion process for a parsed query.
Expand Down Expand Up @@ -111,7 +114,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
_process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
elif sub_path.is_file():
if sub_path.stat().st_size > query.max_file_size:
print(f"Skipping file {sub_path}: would exceed max file size limit")
logger.info("Skipping file %s: would exceed max file size limit", sub_path)
continue
_process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
elif sub_path.is_dir():
Expand All @@ -133,7 +136,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
node.file_count += child_directory_node.file_count
node.dir_count += 1 + child_directory_node.dir_count
else:
print(f"Warning: {sub_path} is an unknown file type, skipping")
logger.warning("Warning: %s is an unknown file type, skipping", sub_path)

node.sort_children()

Expand Down Expand Up @@ -186,12 +189,12 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat

"""
if stats.total_files + 1 > MAX_FILES:
print(f"Maximum file limit ({MAX_FILES}) reached")
logger.warning("Maximum file limit (%i) reached", MAX_FILES)
return

file_size = path.stat().st_size
if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES:
print(f"Skipping file {path}: would exceed total size limit")
logger.info("Skipping file %s: would exceed total size limit", path)
return

stats.total_files += 1
Expand Down Expand Up @@ -232,15 +235,15 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:

"""
if depth > MAX_DIRECTORY_DEPTH:
print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached")
logger.warning("Maximum depth limit (%i) reached", MAX_DIRECTORY_DEPTH)
return True

if stats.total_files >= MAX_FILES:
print(f"Maximum file limit ({MAX_FILES}) reached")
logger.warning("Maximum file limit (%i) reached", MAX_FILES)
return True # TODO: end recursion

if stats.total_size >= MAX_TOTAL_SIZE_BYTES:
print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached")
logger.warning("Maxumum total size limit (%.1fMB) reached", MAX_TOTAL_SIZE_BYTES / 1024 / 1024)
return True # TODO: end recursion

return False
16 changes: 16 additions & 0 deletions src/gitingest/logging_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Global logger configuration."""

import logging
from typing import Literal

from pythonjsonlogger import jsonlogger


def setup_json_logging(level: int = logging.INFO) -> None:
    """Configure JSON logging for the whole top-level package this module belongs to.

    A single stream handler with a JSON formatter is installed on the package's
    top-level logger (``gitingest`` when this module is imported as
    ``gitingest.logging_config``). Loggers created elsewhere in the package via
    ``logging.getLogger(__name__)`` are children of that logger, so their records
    propagate to this handler and inherit ``level``.

    Calling this function again replaces the previously installed handler(s)
    rather than stacking duplicates.

    Parameters
    ----------
    level : int
        Minimum severity to emit, e.g. ``logging.INFO`` (the default).

    """
    # Use the top-level package name, NOT ``__name__`` itself: ``__name__`` is
    # "<package>.logging_config", a sibling of the other module loggers, so a
    # handler/level set there would never be seen by them.
    package_name = __name__.partition(".")[0]
    package_logger = logging.getLogger(package_name)
    package_logger.setLevel(level)

    json_handler = logging.StreamHandler()
    json_handler.setFormatter(
        jsonlogger.JsonFormatter("%(asctime)s %(levelname)s %(name)s %(message)s"),
    )

    # Assign (instead of addHandler) so repeated setup calls stay idempotent.
    package_logger.handlers = [json_handler]
7 changes: 5 additions & 2 deletions src/gitingest/output_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import tiktoken
Expand All @@ -12,6 +13,8 @@
if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)

_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
(1_000_000, "M"),
(1_000, "k"),
Expand Down Expand Up @@ -189,8 +192,8 @@ def _format_token_count(text: str) -> str | None:
try:
encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini
total_tokens = len(encoding.encode(text, disallowed_special=()))
except (ValueError, UnicodeEncodeError) as exc:
print(exc)
except (ValueError, UnicodeEncodeError):
logger.exception()
return None

for threshold, suffix in _TOKEN_THRESHOLDS:
Expand Down
9 changes: 7 additions & 2 deletions src/gitingest/query_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from __future__ import annotations

import logging
import uuid
import warnings
from pathlib import Path
from typing import Literal

Expand All @@ -18,6 +18,8 @@
_normalise_source,
)

logger = logging.getLogger(__name__)


async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
"""Parse a repository URL and return an ``IngestionQuery`` object.
Expand Down Expand Up @@ -71,16 +73,19 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
# TODO: Handle issues and pull requests
if query.type in {PathKind.ISSUES, PathKind.PULL}:
msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

# If no extra path parts, just return
if not path_parts:
msg = f"Warning: No extra path parts: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

if query.type not in {PathKind.TREE, PathKind.BLOB}:
# TODO: Handle other types
msg = f"Warning: Type '{query.type}' is not yet supported: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

# Commit, branch, or tag
Expand Down Expand Up @@ -169,7 +174,7 @@ async def _configure_branch_or_tag(
except RuntimeError as exc:
# If remote discovery fails, we optimistically treat the first path segment as the branch/tag.
msg = f"Warning: Failed to fetch {_ref_type}: {exc}"
warnings.warn(msg, RuntimeWarning, stacklevel=2)
logger.warning(msg)
return path_parts.pop(0) if path_parts else None

# Iterate over the path components and try to find a matching branch/tag
Expand Down
15 changes: 7 additions & 8 deletions src/gitingest/utils/git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import asyncio
import base64
import logging
import re
import sys
from pathlib import Path
Expand All @@ -15,11 +16,12 @@

from gitingest.utils.compat_func import removesuffix
from gitingest.utils.exceptions import InvalidGitHubTokenError
from server.server_utils import Colors

if TYPE_CHECKING:
from gitingest.schemas import CloneConfig

logger = logging.getLogger(__name__)

# GitHub Personal-Access tokens (classic + fine-grained).
# - ghp_ / gho_ / ghu_ / ghs_ / ghr_ → 36 alphanumerics
# - github_pat_ → 22 alphanumerics + "_" + 59 alphanumerics
Expand Down Expand Up @@ -97,13 +99,10 @@ async def ensure_git_installed() -> None:
try:
stdout, _ = await run_command("git", "config", "core.longpaths")
if stdout.decode().strip().lower() != "true":
print(
f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}Git clone may fail on Windows "
f"due to long file paths:{Colors.END}",
)
print(f"{Colors.RED}To avoid this issue, consider enabling long path support with:{Colors.END}")
print(f"{Colors.RED} git config --global core.longpaths true{Colors.END}")
print(f"{Colors.RED}Note: This command may require administrator privileges.{Colors.END}")
logger.warning("WARN: Git clone may fail on Windows due to long file paths:")
logger.warning("To avoid this issue, consider enabling long path support with:")
logger.warning(" git config --global core.longpaths true")
logger.warning("Note: This command may require administrator privileges.")
except RuntimeError:
# Ignore if checking 'core.longpaths' fails.
pass
Expand Down
12 changes: 5 additions & 7 deletions src/gitingest/utils/notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

import json
import warnings
import logging
from itertools import chain
from typing import TYPE_CHECKING, Any

Expand All @@ -12,6 +12,8 @@
if TYPE_CHECKING:
from pathlib import Path

logger = logging.getLogger(__name__)


def process_notebook(file: Path, *, include_output: bool = True) -> str:
"""Process a Jupyter notebook file and return an executable Python script as a string.
Expand Down Expand Up @@ -44,20 +46,16 @@ def process_notebook(file: Path, *, include_output: bool = True) -> str:
# Check if the notebook contains worksheets
worksheets = notebook.get("worksheets")
if worksheets:
warnings.warn(
logger.warning(
"Worksheets are deprecated as of IPEP-17. Consider updating the notebook. "
"(See: https://github.com/jupyter/nbformat and "
"https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets "
"for more information.)",
DeprecationWarning,
stacklevel=2,
)

if len(worksheets) > 1:
warnings.warn(
logger.warning(
"Multiple worksheets detected. Combining all worksheets into a single script.",
UserWarning,
stacklevel=2,
)

cells = list(chain.from_iterable(ws["cells"] for ws in worksheets))
Expand Down
5 changes: 3 additions & 2 deletions src/gitingest/utils/query_parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from __future__ import annotations

import logging
import string
import warnings
from typing import TYPE_CHECKING, cast
from urllib.parse import ParseResult, unquote, urlparse

Expand All @@ -13,6 +13,7 @@
if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)

HEX_DIGITS: set[str] = set(string.hexdigits)

Expand Down Expand Up @@ -56,7 +57,7 @@ async def _fallback_to_root(query: IngestionQuery, token: str | None, warn_msg:
url = cast("str", query.url)
query.commit = await _resolve_ref_to_sha(url, pattern="HEAD", token=token)
if warn_msg:
warnings.warn(warn_msg, RuntimeWarning, stacklevel=3)
logger.warning(warn_msg)
return query


Expand Down
Loading
Loading