diff --git a/.gitignore b/.gitignore index e98f538f..1cad9b5e 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,4 @@ Caddyfile # ignore default output directory tmp/* +digest.txt \ No newline at end of file diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index ed84b214..07417b94 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,6 +1,6 @@ -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query from .ingest import ingest +from .parse_query import parse_query +from .clone import clone_repo +from .ingest_from_query import ingest_from_query -__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] \ No newline at end of file +__all__ = ['ingest', 'parse_query', 'clone_repo', 'ingest_from_query'] \ No newline at end of file diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 81823e63..6db5602a 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,10 +1,16 @@ import os import pathlib import click +import sys -from gitingest.ingest import ingest from gitingest.ingest_from_query import MAX_FILE_SIZE -from gitingest.parse_query import DEFAULT_IGNORE_PATTERNS +from .encoding import setup_encoding + +# Setup encoding first +setup_encoding() + +# Define constants +DEFAULT_IGNORE_PATTERNS = [] def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() @@ -15,21 +21,52 @@ def normalize_pattern(pattern: str) -> str: @click.command() @click.argument('source', type=str, required=True) -@click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') -@click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') -@click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') -@click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') +@click.option('--output', '-o', default=None, + help='Output file path (default: .txt in current directory)') +@click.option('--max-size', '-s', default=MAX_FILE_SIZE, + help='Maximum file size to process in bytes') +@click.option('--exclude-pattern', '-e', multiple=True, + help='Patterns to exclude') +@click.option('--include-pattern', '-i', multiple=True, + help='Patterns to include') def main(source, output, max_size, exclude_pattern, include_pattern): """Analyze a directory and create a text dump of its contents.""" try: - # Combine default and custom ignore patterns + from gitingest.ingest import ingest + + # Convert paths to absolute with proper encoding + source = str(pathlib.Path(source).resolve()) + + # Handle patterns exclude_patterns = list(exclude_pattern) include_patterns = list(set(include_pattern)) + # Set default output name if not output: output = "digest.txt" - summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) + output = str(pathlib.Path(output).resolve()) + + # Call ingest with encoding awareness + summary, tree, content = ingest( + source, + max_size, + include_patterns, + exclude_patterns, + output=output + ) + + # Write output with explicit encoding + with open(output, 'w', encoding='utf-8', errors='replace') as f: + if isinstance(summary, bytes): + summary = summary.decode('utf-8', errors='replace') + if isinstance(tree, bytes): + tree = tree.decode('utf-8', errors='replace') + if isinstance(content, bytes): + content = content.decode('utf-8', errors='replace') + + f.write(f"{summary}\n\n{tree}\n\n{content}") + # Print messages with encoding handling click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) @@ -39,4 +76,4 @@ def main(source, output, max_size, exclude_pattern, include_pattern): raise click.Abort() if __name__ == '__main__': - main() \ No newline at end of file + main() \ No newline at end of file diff --git a/src/gitingest/encoding.py b/src/gitingest/encoding.py new file mode 100644 index 00000000..f4e10578 --- /dev/null +++ b/src/gitingest/encoding.py @@ -0,0 +1,17 @@ +import sys +import io +import codecs + +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper( + sys.stdout.buffer, + encoding='utf-8', + errors='replace' + ) + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper( + sys.stderr.buffer, + encoding='utf-8', + errors='replace' + ) \ No newline at end of file diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index eac20818..6b3e957a 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -2,12 +2,40 @@ import shutil from typing import Union, List from pathlib import Path +import io +import sys -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query +# Import other modules from the package +from gitingest.parse_query import parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query -def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: Union[List[str], str] = None, exclude_patterns: Union[List[str], str] = None, output: str = None) -> str: +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') + +def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str] = None, + exclude_patterns: Union[List[str], str] = None, + output: str = None) -> tuple[str, str, str]: + """ + Analyze and create a text dump of source contents. + + Args: + source: Path to source directory or git URL + max_file_size: Maximum file size to process in bytes + include_patterns: Patterns to include in analysis + exclude_patterns: Patterns to exclude from analysis + output: Output file path + + Returns: + Tuple of (summary, tree, content) + """ + setup_encoding() + query = None + try: query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) if query['url']: @@ -16,13 +44,31 @@ def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: summary, tree, content = ingest_from_query(query) if output: - with open(f"{output}", "w") as f: - f.write(tree + "\n" + content) + # Write with explicit UTF-8 encoding + with open(output, "w", encoding='utf-8', errors='replace') as f: + # Ensure all content is properly encoded + tree = tree.encode('utf-8', errors='replace').decode('utf-8') if isinstance(tree, str) else tree + content = content.encode('utf-8', errors='replace').decode('utf-8') if isinstance(content, str) else content + f.write(f"{tree}\n{content}") return summary, tree, content + + except UnicodeEncodeError as e: + # Handle encoding errors specifically + error_msg = f"Encoding error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + + except Exception as e: + # Handle other errors + error_msg = f"Error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + finally: # Clean up the temporary directory if it was created - if query['url']: + if query and query.get('url'): # Get parent directory two levels up from local_path (../tmp) cleanup_path = str(Path(query['local_path']).parents[1]) - shutil.rmtree(cleanup_path, ignore_errors=True) \ No newline at end of file + try: + shutil.rmtree(cleanup_path, ignore_errors=True) + except Exception as e: + print(f"Warning: Could not clean up temporary directory: {str(e)}", file=sys.stderr) \ No newline at end of file diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 4e7d5e78..7bd1bc7b 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -19,14 +19,28 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo return include def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: - rel_path = path.replace(base_path, "").lstrip(os.sep) + """ + Check if a file or directory should be ignored. + + Args: + path (str): Path to check. + base_path (str): Root base path. + ignore_patterns (List[str]): List of patterns to ignore. + + Returns: + bool: True if the path should be ignored. + """ + rel_path = os.path.relpath(path, base_path).replace("\\", "/") for pattern in ignore_patterns: - if pattern == '': - continue - if fnmatch(rel_path, pattern): + if fnmatch(rel_path, pattern) or fnmatch(os.path.basename(path), pattern): + return True + # Special case for directories ending with / + if os.path.isdir(path) and fnmatch(rel_path + '/', pattern): return True return False + + def is_safe_symlink(symlink_path: str, base_path: str) -> bool: """Check if a symlink points to a location within the base directory.""" try: @@ -96,8 +110,10 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = try: for item in os.listdir(path): item_path = os.path.join(path, item) + print(f"Checking path: {path}") if should_exclude(item_path, base_path, ignore_patterns): + print(f"Checking path: {path}") continue is_file = os.path.isfile(item_path) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 8b8f97a8..572e2571 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -93,40 +93,81 @@ def parse_url(url: str) -> dict: parsed["subpath"] = "/" + "/".join(path_parts[4:]) return parsed +### šŸ“ **Normalize Pattern** def normalize_pattern(pattern: str) -> str: + """ + Normalize a pattern by stripping and formatting. + + Args: + pattern (str): The ignore pattern. + + Returns: + str: Normalized pattern. + """ pattern = pattern.strip() pattern = pattern.lstrip(os.sep) if pattern.endswith(os.sep): pattern += "*" return pattern +### šŸ“ **Parse Patterns** def parse_patterns(pattern: Union[List[str], str]) -> List[str]: + """ + Parse and validate patterns. + + Args: + pattern (Union[List[str], str]): Patterns to parse. + + Returns: + List[str]: Parsed patterns. + """ if isinstance(pattern, list): pattern = ",".join(pattern) for p in pattern.split(","): if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): - raise ValueError(f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed.") - patterns = [normalize_pattern(p) for p in pattern.split(",")] - return patterns + raise ValueError( + f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." + ) + return [normalize_pattern(p) for p in pattern.split(",")] +### šŸ“ **Override Ignore Patterns** def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: + """ + Remove include patterns from ignore patterns. + + Args: + ignore_patterns (List[str]): Ignore patterns. + include_patterns (List[str]): Include patterns. + + Returns: + List[str]: Updated ignore patterns. + """ for pattern in include_patterns: if pattern in ignore_patterns: ignore_patterns.remove(pattern) return ignore_patterns +### šŸ“ **Parse Path** def parse_path(path: str) -> dict: - - query = { + """ + Parse a local file path. + + Args: + path (str): File path. + + Returns: + dict: Parsed path details. + """ + return { "local_path": os.path.abspath(path), "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), "subpath": "/", "id": str(uuid.uuid4()), "url": None, } - return query + def parse_query(source: str, max_file_size: int, from_web: bool, include_patterns: Union[List[str], str] = None, ignore_patterns: Union[List[str], str] = None) -> dict: if from_web: @@ -154,3 +195,89 @@ def parse_query(source: str, max_file_size: int, from_web: bool, include_pattern return query +### šŸ“ **Parse .gitignore** +def parse_gitignore(gitignore_path: str) -> List[str]: + """ + Parse .gitignore and return ignore patterns. + + Args: + gitignore_path (str): Path to the .gitignore file. + + Returns: + List[str]: List of ignore patterns. + """ + ignore_patterns = [] + if os.path.exists(gitignore_path): + with open(gitignore_path, 'r', encoding='utf-8') as file: + for line in file: + line = line.strip() + if line and not line.startswith('#'): + # Ensure directory patterns end with '/' + if os.path.isdir(os.path.join(os.path.dirname(gitignore_path), line)): + line = line.rstrip('/') + '/' + ignore_patterns.append(line) + return ignore_patterns + + +### šŸ“ **Parse Query** +def parse_query(source: str, max_file_size: int, from_web: bool, + include_patterns: Union[List[str], str] = None, + ignore_patterns: Union[List[str], str] = None) -> dict: + """ + Parse the query and apply ignore patterns. + + Args: + source (str): Source path or URL. + max_file_size (int): Maximum file size. + from_web (bool): Web source or local. + include_patterns (Union[List[str], str]): Include patterns. + ignore_patterns (Union[List[str], str]): Ignore patterns. + + Returns: + dict: Query object with patterns. + """ + if from_web: + query = parse_url(source) + else: + query = parse_path(source) + + query['max_file_size'] = max_file_size + + # Start with default ignore patterns + final_ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy() + + # Load from .gitignore + gitignore_path = os.path.join(query['local_path'], '.gitignore') + print(f"find .gitignore on project --> {gitignore_path}") + + if os.path.exists(gitignore_path): + gitignore_patterns = parse_gitignore(gitignore_path) + final_ignore_patterns.extend(gitignore_patterns) + print(f"\nšŸ›”ļø Patterns from: {gitignore_path}") + for pattern in gitignore_patterns: + print(f" - {pattern}") + # Add user-defined ignore patterns + if ignore_patterns: + final_ignore_patterns.extend(parse_patterns(ignore_patterns)) + + # Handle include patterns + if include_patterns: + include_patterns = parse_patterns(include_patterns) + final_ignore_patterns = override_ignore_patterns(final_ignore_patterns, include_patterns) + + query['ignore_patterns'] = final_ignore_patterns + query['include_patterns'] = include_patterns + # šŸ–Øļø Print patterns to the console + print("\nšŸ›”ļø Applied Ignore Patterns:") + for pattern in final_ignore_patterns: + print(f" - {pattern}") + + if include_patterns: + print("\nāœ… Included Patterns:") + for pattern in include_patterns: + print(f" - {pattern}") + else: + print("\nāœ… Included Patterns: None") + + return query + return query \ No newline at end of file