Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,4 @@ Caddyfile

# ignore default output directory
tmp/*
digest.txt
8 changes: 4 additions & 4 deletions src/gitingest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .ingest_from_query import ingest_from_query
from .clone import clone_repo
from .parse_query import parse_query
from .ingest import ingest
from .parse_query import parse_query
from .clone import clone_repo
from .ingest_from_query import ingest_from_query

__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"]
__all__ = ['ingest', 'parse_query', 'clone_repo', 'ingest_from_query']
55 changes: 46 additions & 9 deletions src/gitingest/cli.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import os
import pathlib
import click
import sys

from gitingest.ingest import ingest
from gitingest.ingest_from_query import MAX_FILE_SIZE
from gitingest.parse_query import DEFAULT_IGNORE_PATTERNS
from .encoding import setup_encoding

# Setup encoding first
setup_encoding()

# Define constants
DEFAULT_IGNORE_PATTERNS = []

def normalize_pattern(pattern: str) -> str:
pattern = pattern.strip()
Expand All @@ -15,21 +21,52 @@ def normalize_pattern(pattern: str) -> str:

@click.command()
@click.argument('source', type=str, required=True)
@click.option('--output', '-o', default=None, help='Output file path (default: <repo_name>.txt in current directory)')
@click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes')
@click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude')
@click.option('--include-pattern', '-i', multiple=True, help='Patterns to include')
@click.option('--output', '-o', default=None,
help='Output file path (default: <repo_name>.txt in current directory)')
@click.option('--max-size', '-s', default=MAX_FILE_SIZE,
help='Maximum file size to process in bytes')
@click.option('--exclude-pattern', '-e', multiple=True,
help='Patterns to exclude')
@click.option('--include-pattern', '-i', multiple=True,
help='Patterns to include')
def main(source, output, max_size, exclude_pattern, include_pattern):
"""Analyze a directory and create a text dump of its contents."""
try:
# Combine default and custom ignore patterns
from gitingest.ingest import ingest

# Convert paths to absolute with proper encoding
source = str(pathlib.Path(source).resolve())

# Handle patterns
exclude_patterns = list(exclude_pattern)
include_patterns = list(set(include_pattern))

# Set default output name
if not output:
output = "digest.txt"
summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
output = str(pathlib.Path(output).resolve())

# Call ingest with encoding awareness
summary, tree, content = ingest(
source,
max_size,
include_patterns,
exclude_patterns,
output=output
)

# Write output with explicit encoding
with open(output, 'w', encoding='utf-8', errors='replace') as f:
if isinstance(summary, bytes):
summary = summary.decode('utf-8', errors='replace')
if isinstance(tree, bytes):
tree = tree.decode('utf-8', errors='replace')
if isinstance(content, bytes):
content = content.decode('utf-8', errors='replace')

f.write(f"{summary}\n\n{tree}\n\n{content}")

# Print messages with encoding handling
click.echo(f"Analysis complete! Output written to: {output}")
click.echo("\nSummary:")
click.echo(summary)
Expand All @@ -39,4 +76,4 @@ def main(source, output, max_size, exclude_pattern, include_pattern):
raise click.Abort()

if __name__ == '__main__':
main()
main()
17 changes: 17 additions & 0 deletions src/gitingest/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import sys
import io
import codecs

def setup_encoding():
    """Switch ``sys.stdout`` and ``sys.stderr`` to UTF-8 output.

    Each stream whose current encoding is not UTF-8 is changed to encode as
    UTF-8 with ``errors='replace'``, so characters that cannot be encoded are
    substituted instead of raising ``UnicodeEncodeError``.

    Streams that are ``None`` or that expose neither ``reconfigure()`` nor a
    ``buffer`` attribute (for example an ``io.StringIO`` stand-in) are left
    untouched rather than raising ``AttributeError``.
    """
    for stream_name in ('stdout', 'stderr'):
        stream = getattr(sys, stream_name, None)
        if stream is None:
            continue

        # Normalise the codec name so 'UTF-8', 'utf8' and 'utf_8' all count
        # as UTF-8; a plain string comparison would re-wrap them needlessly.
        declared = getattr(stream, 'encoding', None)
        try:
            is_utf8 = bool(declared) and codecs.lookup(declared).name == 'utf-8'
        except LookupError:
            is_utf8 = False
        if is_utf8:
            continue

        # Prefer in-place reconfiguration: it keeps the existing stream object
        # (and its buffering settings) instead of stacking a second wrapper
        # on the same underlying buffer.
        reconfigure = getattr(stream, 'reconfigure', None)
        if callable(reconfigure):
            reconfigure(encoding='utf-8', errors='replace')
            continue

        # Fallback for text streams without reconfigure(): wrap the raw
        # binary buffer, if there is one.
        raw = getattr(stream, 'buffer', None)
        if raw is not None:
            setattr(sys, stream_name, io.TextIOWrapper(
                raw,
                encoding='utf-8',
                errors='replace',
            ))
62 changes: 54 additions & 8 deletions src/gitingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,40 @@
import shutil
from typing import Union, List
from pathlib import Path
import io
import sys

from .ingest_from_query import ingest_from_query
from .clone import clone_repo
from .parse_query import parse_query
# Import other modules from the package
from gitingest.parse_query import parse_query
from gitingest.clone import clone_repo
from gitingest.ingest_from_query import ingest_from_query

def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: Union[List[str], str] = None, exclude_patterns: Union[List[str], str] = None, output: str = None) -> str:
def setup_encoding():
    """Make ``sys.stdout`` and ``sys.stderr`` encode their output as UTF-8.

    A stream that does not already report a UTF-8 encoding is switched to
    UTF-8 with ``errors='replace'``. Streams that are ``None``, or that offer
    neither ``reconfigure()`` nor a ``buffer`` attribute, are skipped so the
    call never fails on replaced or detached standard streams.
    """
    for attr in ('stdout', 'stderr'):
        target = getattr(sys, attr, None)
        if target is None:
            continue

        # Compare case-insensitively and accept the common alias spellings.
        current = (getattr(target, 'encoding', None) or '').lower().replace('_', '-')
        if current in ('utf-8', 'utf8'):
            continue

        if hasattr(target, 'reconfigure'):
            # In-place switch keeps the same stream object and its buffering.
            target.reconfigure(encoding='utf-8', errors='replace')
        elif getattr(target, 'buffer', None) is not None:
            # Older-style text stream: wrap its binary buffer instead.
            setattr(sys, attr, io.TextIOWrapper(target.buffer, encoding='utf-8', errors='replace'))

def ingest(source: str, max_file_size: int = 10 * 1024 * 1024,
include_patterns: Union[List[str], str] = None,
exclude_patterns: Union[List[str], str] = None,
output: str = None) -> tuple[str, str, str]:
"""
Analyze and create a text dump of source contents.

Args:
source: Path to source directory or git URL
max_file_size: Maximum file size to process in bytes
include_patterns: Patterns to include in analysis
exclude_patterns: Patterns to exclude from analysis
output: Output file path

Returns:
Tuple of (summary, tree, content)
"""
setup_encoding()
query = None

try:
query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns)
if query['url']:
Expand All @@ -16,13 +44,31 @@ def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns:
summary, tree, content = ingest_from_query(query)

if output:
with open(f"{output}", "w") as f:
f.write(tree + "\n" + content)
# Write with explicit UTF-8 encoding
with open(output, "w", encoding='utf-8', errors='replace') as f:
# Ensure all content is properly encoded
tree = tree.encode('utf-8', errors='replace').decode('utf-8') if isinstance(tree, str) else tree
content = content.encode('utf-8', errors='replace').decode('utf-8') if isinstance(content, str) else content
f.write(f"{tree}\n{content}")

return summary, tree, content

except UnicodeEncodeError as e:
# Handle encoding errors specifically
error_msg = f"Encoding error while processing {source}: {str(e)}"
raise RuntimeError(error_msg)

except Exception as e:
# Handle other errors
error_msg = f"Error while processing {source}: {str(e)}"
raise RuntimeError(error_msg)

finally:
# Clean up the temporary directory if it was created
if query['url']:
if query and query.get('url'):
# Get parent directory two levels up from local_path (../tmp)
cleanup_path = str(Path(query['local_path']).parents[1])
shutil.rmtree(cleanup_path, ignore_errors=True)
try:
shutil.rmtree(cleanup_path, ignore_errors=True)
except Exception as e:
print(f"Warning: Could not clean up temporary directory: {str(e)}", file=sys.stderr)
24 changes: 20 additions & 4 deletions src/gitingest/ingest_from_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,28 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo
return include

def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool:
rel_path = path.replace(base_path, "").lstrip(os.sep)
"""
Check if a file or directory should be ignored.

Args:
path (str): Path to check.
base_path (str): Root base path.
ignore_patterns (List[str]): List of patterns to ignore.

Returns:
bool: True if the path should be ignored.
"""
rel_path = os.path.relpath(path, base_path).replace("\\", "/")
for pattern in ignore_patterns:
if pattern == '':
continue
if fnmatch(rel_path, pattern):
if fnmatch(rel_path, pattern) or fnmatch(os.path.basename(path), pattern):
return True
# Special case for directories ending with /
if os.path.isdir(path) and fnmatch(rel_path + '/', pattern):
return True
return False



def is_safe_symlink(symlink_path: str, base_path: str) -> bool:
"""Check if a symlink points to a location within the base directory."""
try:
Expand Down Expand Up @@ -96,8 +110,10 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int =
try:
for item in os.listdir(path):
item_path = os.path.join(path, item)
print(f"Checking path: {path}")

if should_exclude(item_path, base_path, ignore_patterns):
print(f"Checking path: {path}")
continue

is_file = os.path.isfile(item_path)
Expand Down
Loading