Skip to content

Commit 789be9b

Browse files
authored
fix: traverse directories to allow pattern matching of files within them (#259)
* fix: traverse directories to allow pattern matching of files within them
1 parent bf5d760 commit 789be9b

File tree

4 files changed

+223
-7
lines changed

4 files changed

+223
-7
lines changed

src/gitingest/cli.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,34 @@
1313

1414
@click.command()
1515
@click.argument("source", type=str, default=".")
16-
@click.option("--output", "-o", default=None, help="Output file path (default: <repo_name>.txt in current directory)")
17-
@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes")
18-
@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude")
19-
@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include")
16+
@click.option(
17+
"--output",
18+
"-o",
19+
default=None,
20+
help="Output file path (default: <repo_name>.txt in current directory)",
21+
)
22+
@click.option(
23+
"--max-size",
24+
"-s",
25+
default=MAX_FILE_SIZE,
26+
help="Maximum file size to process in bytes",
27+
)
28+
@click.option(
29+
"--exclude-pattern",
30+
"-e",
31+
multiple=True,
32+
help="""Patterns to exclude. Handles python's arbitrary subset of Unix
33+
shell-style wildcards. See:
34+
https://docs.python.org/3/library/fnmatch.html""",
35+
)
36+
@click.option(
37+
"--include-pattern",
38+
"-i",
39+
multiple=True,
40+
help="""Patterns to include. Handles python's arbitrary subset of Unix
41+
shell-style wildcards. See:
42+
https://docs.python.org/3/library/fnmatch.html""",
43+
)
2044
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
2145
def main(
2246
source: str,
@@ -27,7 +51,7 @@ def main(
2751
branch: Optional[str],
2852
):
2953
"""
30-
Main entry point for the CLI. This function is called when the CLI is run as a script.
54+
Main entry point for the CLI. This function is called when the CLI is run as a script.
3155
3256
It calls the async main function to run the command.
3357

src/gitingest/ingestion.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,10 @@ def _process_node(
202202
query=query,
203203
stats=stats,
204204
)
205+
206+
if not child_directory_node.children:
207+
continue
208+
205209
node.children.append(child_directory_node)
206210
node.size += child_directory_node.size
207211
node.file_count += child_directory_node.file_count

src/gitingest/utils/ingestion_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) ->
3333
return False
3434

3535
rel_str = str(rel_path)
36+
37+
# Always include directories so they get traversed; the files inside them are matched against the include patterns individually
3638
if path.is_dir():
37-
rel_str += "/"
39+
return True
3840

3941
for pattern in include_patterns:
4042
if fnmatch(rel_str, pattern):

tests/test_ingestion.py

Lines changed: 187 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@
55
including filtering patterns and subpaths.
66
"""
77

8+
import re
89
from pathlib import Path
10+
from typing import Set, TypedDict
11+
12+
import pytest
913

1014
from gitingest.ingestion import ingest_query
1115
from gitingest.query_parsing import IngestionQuery
@@ -42,5 +46,187 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) ->
4246
# TODO: Additional tests:
4347
# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"].
4448
# - Edge cases with weird file names or deep subdirectory structures.
45-
# TODO : def test_include_txt_pattern
4649
# TODO : def test_include_nonexistent_extension
50+
51+
52+
class PatternScenario(TypedDict):
53+
include_patterns: Set[str]
54+
ignore_patterns: Set[str]
55+
expected_num_files: int
56+
expected_content: Set[str]
57+
expected_structure: Set[str]
58+
expected_not_structure: Set[str]
59+
60+
61+
@pytest.mark.parametrize(
62+
"pattern_scenario",
63+
[
64+
pytest.param(
65+
PatternScenario(
66+
{
67+
"include_patterns": {"file2.py", "dir2/file_dir2.txt"},
68+
"ignore_patterns": {*()},
69+
"expected_num_files": 2,
70+
"expected_content": {"file2.py", "dir2/file_dir2.txt"},
71+
"expected_structure": {"test_repo/", "dir2/"},
72+
"expected_not_structure": {"src/", "subdir/", "dir1/"},
73+
}
74+
),
75+
id="include-explicit-files",
76+
),
77+
pytest.param(
78+
PatternScenario(
79+
{
80+
"include_patterns": {
81+
"file1.txt",
82+
"file2.py",
83+
"file_dir1.txt",
84+
"*/file_dir2.txt",
85+
},
86+
"ignore_patterns": {*()},
87+
"expected_num_files": 3,
88+
"expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"},
89+
"expected_structure": {"test_repo/", "dir2/"},
90+
"expected_not_structure": {"src/", "subdir/", "dir1/"},
91+
}
92+
),
93+
id="include-wildcard-directory",
94+
),
95+
pytest.param(
96+
PatternScenario(
97+
{
98+
"include_patterns": {"*.py"},
99+
"ignore_patterns": {*()},
100+
"expected_num_files": 3,
101+
"expected_content": {
102+
"file2.py",
103+
"src/subfile2.py",
104+
"src/subdir/file_subdir.py",
105+
},
106+
"expected_structure": {"test_repo/", "src/", "subdir/"},
107+
"expected_not_structure": {"dir1/", "dir2/"},
108+
}
109+
),
110+
id="include-wildcard-files",
111+
),
112+
pytest.param(
113+
PatternScenario(
114+
{
115+
"include_patterns": {"**/file_dir2.txt", "src/**/*.py"},
116+
"ignore_patterns": {*()},
117+
"expected_num_files": 2,
118+
"expected_content": {
119+
"dir2/file_dir2.txt",
120+
"src/subdir/file_subdir.py",
121+
},
122+
"expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"},
123+
"expected_not_structure": {"dir1/"},
124+
}
125+
),
126+
id="include-recursive-wildcard",
127+
),
128+
pytest.param(
129+
PatternScenario(
130+
{
131+
"include_patterns": {*()},
132+
"ignore_patterns": {"file2.py", "dir2/file_dir2.txt"},
133+
"expected_num_files": 6,
134+
"expected_content": {
135+
"file1.txt",
136+
"src/subfile1.txt",
137+
"src/subfile2.py",
138+
"src/subdir/file_subdir.txt",
139+
"src/subdir/file_subdir.py",
140+
"dir1/file_dir1.txt",
141+
},
142+
"expected_structure": {"test_repo/", "src/", "subdir/", "dir1/"},
143+
"expected_not_structure": {"dir2/"},
144+
}
145+
),
146+
id="exclude-explicit-files",
147+
),
148+
pytest.param(
149+
PatternScenario(
150+
{
151+
"include_patterns": {*()},
152+
"ignore_patterns": {"file1.txt", "file2.py", "*/file_dir1.txt"},
153+
"expected_num_files": 5,
154+
"expected_content": {
155+
"src/subfile1.txt",
156+
"src/subfile2.py",
157+
"src/subdir/file_subdir.txt",
158+
"src/subdir/file_subdir.py",
159+
"dir2/file_dir2.txt",
160+
},
161+
"expected_structure": {"test_repo/", "src/", "subdir/", "dir2/"},
162+
"expected_not_structure": {"dir1/"},
163+
}
164+
),
165+
id="exclude-wildcard-directory",
166+
),
167+
pytest.param(
168+
PatternScenario(
169+
{
170+
"include_patterns": {*()},
171+
"ignore_patterns": {"src/**/*.py"},
172+
"expected_num_files": 7,
173+
"expected_content": {
174+
"file1.txt",
175+
"file2.py",
176+
"src/subfile1.txt",
177+
"src/subfile2.py",
178+
"src/subdir/file_subdir.txt",
179+
"dir1/file_dir1.txt",
180+
"dir2/file_dir2.txt",
181+
},
182+
"expected_structure": {
183+
"test_repo/",
184+
"dir1/",
185+
"dir2/",
186+
"src/",
187+
"subdir/",
188+
},
189+
"expected_not_structure": {*()},
190+
}
191+
),
192+
id="exclude-recursive-wildcard",
193+
),
194+
],
195+
)
196+
def test_include_ignore_patterns(
197+
temp_directory: Path,
198+
sample_query: IngestionQuery,
199+
pattern_scenario: PatternScenario,
200+
) -> None:
201+
"""
202+
Test `ingest_query` to ensure included and ignored paths are included and ignored respectively.
203+
204+
Given a directory with .txt and .py files, and a set of include patterns or a set of ignore patterns:
205+
When `ingest_query` is invoked,
206+
Then the summary should report the expected number of files, the content should contain the expected files, and the structure should list the expected directories while omitting the excluded ones.
207+
"""
208+
209+
sample_query.local_path = temp_directory
210+
sample_query.subpath = "/"
211+
sample_query.type = None
212+
sample_query.include_patterns = pattern_scenario["include_patterns"] or None
213+
sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] or None
214+
215+
summary, structure, content = ingest_query(sample_query)
216+
217+
assert "Repository: test_user/test_repo" in summary
218+
num_files_regex = re.compile(r"^Files analyzed: (\d+)$", re.MULTILINE)
219+
assert (num_files_match := num_files_regex.search(summary)) is not None
220+
assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"]
221+
222+
# Check presence of key files in the content
223+
for expected_content_item in pattern_scenario["expected_content"]:
224+
assert expected_content_item in content
225+
226+
# Check that the expected directories are present in the structure
227+
for expected_structure_item in pattern_scenario["expected_structure"]:
228+
assert expected_structure_item in structure
229+
230+
# Check that excluded directories are absent from the structure
231+
for expected_not_structure_item in pattern_scenario["expected_not_structure"]:
232+
assert expected_not_structure_item not in structure

0 commit comments

Comments
 (0)