Skip to content

Commit 1dd133c

Browse files
feat: add private-repo support to CLI & core (UI coming next) (#282)
* fix: split sparse-checkout & commit checkout when cloning; refresh docs/CLI * Run `git sparse-checkout set …` and `git checkout <sha>` as two calls—matches Git’s CLI rules and fixes failures. * Tidy clone path creation via _ensure_directory; use DEFAULT_TIMEOUT. * Clarify CLI/help strings and schema docstrings. * Update tests for the new two-step checkout flow. * feat(auth): support private GitHub repos & correct sparse-checkout flow * CLI: new `--token/-t` flag (fallback to `GITHUB_TOKEN`) * clone_repo: * injects Basic-auth header when a PAT is supplied * validates PAT format (`github_pat_*`) * git_utils: * `create_git_auth_header`, `validate_github_token`, `create_git_command` * `_check_github_repo_exists` & branch-listing now work with tokens * os_utils.ensure_directory extracted for reuse * tests updated to reflect new call signatures * allow git PAT to start with gth_ * fix GITHUB_PAT_PATTERN and add instructions to README * fix gph_ to ghp_ * docs: add GITHUB_TOKEN env var example to README * add GITHUB_TOKEN environment variable also in code
1 parent 789be9b commit 1dd133c

File tree

10 files changed

+334
-74
lines changed

10 files changed

+334
-74
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ You can also replace `hub` with `ingest` in any GitHub URL to access the corresp
2929
## 📚 Requirements
3030

3131
- Python 3.7+
32+
- For private repositories: A GitHub Personal Access Token (PAT). You can generate one at [https://github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) (Profile → Settings → Developer Settings → Personal Access Tokens → Fine-grained Tokens)
3233

3334
### 📦 Installation
3435

@@ -83,6 +84,14 @@ gitingest /path/to/directory
8384
# From URL
8485
gitingest https://github.com/cyclotruc/gitingest
8586

87+
# For private repositories, use the --token option
88+
# Get your token from https://github.com/settings/personal-access-tokens
89+
gitingest https://github.com/username/private-repo --token github_pat_...
90+
91+
# Or set it as an environment variable
92+
export GITHUB_TOKEN=github_pat_...
93+
gitingest https://github.com/username/private-repo
94+
8695
# See more options
8796
gitingest --help
8897
```

src/gitingest/cli.py

Lines changed: 65 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -29,26 +29,39 @@
2929
"--exclude-pattern",
3030
"-e",
3131
multiple=True,
32-
help="""Patterns to exclude. Handles python's arbitrary subset of Unix
33-
shell-style wildcards. See:
34-
https://docs.python.org/3/library/fnmatch.html""",
32+
help=(
33+
"Patterns to exclude. Handles Python's arbitrary subset of Unix shell-style "
34+
"wildcards. See: https://docs.python.org/3/library/fnmatch.html"
35+
),
3536
)
3637
@click.option(
3738
"--include-pattern",
3839
"-i",
3940
multiple=True,
40-
help="""Patterns to include. Handles python's arbitrary subset of Unix
41-
shell-style wildcards. See:
42-
https://docs.python.org/3/library/fnmatch.html""",
41+
help=(
42+
"Patterns to include. Handles Python's arbitrary subset of Unix shell-style "
43+
"wildcards. See: https://docs.python.org/3/library/fnmatch.html"
44+
),
4345
)
4446
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
47+
@click.option(
48+
"--token",
49+
"-t",
50+
envvar="GITHUB_TOKEN",
51+
default=None,
52+
help=(
53+
"GitHub personal access token for accessing private repositories. "
54+
"If omitted, the CLI will look for the GITHUB_TOKEN environment variable."
55+
),
56+
)
4557
def main(
4658
source: str,
4759
output: Optional[str],
4860
max_size: int,
4961
exclude_pattern: Tuple[str, ...],
5062
include_pattern: Tuple[str, ...],
5163
branch: Optional[str],
64+
token: Optional[str],
5265
):
5366
"""
5467
Main entry point for the CLI. This function is called when the CLI is run as a script.
@@ -58,21 +71,33 @@ def main(
5871
Parameters
5972
----------
6073
source : str
61-
The source directory or repository to analyze.
74+
A directory path or a Git repository URL.
6275
output : str, optional
63-
The path where the output file will be written. If not specified, the output will be written
64-
to a file named `<repo_name>.txt` in the current directory.
76+
Output file path. Defaults to `<repo_name>.txt`.
6577
max_size : int
66-
The maximum file size to process, in bytes. Files larger than this size will be ignored.
78+
Maximum file size (in bytes) to consider.
6779
exclude_pattern : Tuple[str, ...]
68-
A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
80+
Glob patterns for pruning the file set.
6981
include_pattern : Tuple[str, ...]
70-
A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
82+
Glob patterns for including files in the output.
7183
branch : str, optional
72-
The branch to clone (optional).
84+
Specific branch to ingest (defaults to the repository's default).
85+
token : str, optional
86+
GitHub personal-access token (PAT). Needed when *source* refers to a
87+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
7388
"""
74-
# Main entry point for the CLI. This function is called when the CLI is run as a script.
75-
asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern, branch))
89+
90+
asyncio.run(
91+
_async_main(
92+
source=source,
93+
output=output,
94+
max_size=max_size,
95+
exclude_pattern=exclude_pattern,
96+
include_pattern=include_pattern,
97+
branch=branch,
98+
token=token,
99+
)
100+
)
76101

77102

78103
async def _async_main(
@@ -82,6 +107,7 @@ async def _async_main(
82107
exclude_pattern: Tuple[str, ...],
83108
include_pattern: Tuple[str, ...],
84109
branch: Optional[str],
110+
token: Optional[str],
85111
) -> None:
86112
"""
87113
Analyze a directory or repository and create a text dump of its contents.
@@ -92,40 +118,53 @@ async def _async_main(
92118
Parameters
93119
----------
94120
source : str
95-
The source directory or repository to analyze.
121+
A directory path or a Git repository URL.
96122
output : str, optional
97-
The path where the output file will be written. If not specified, the output will be written
98-
to a file named `<repo_name>.txt` in the current directory.
123+
Output file path. Defaults to `<repo_name>.txt`.
99124
max_size : int
100-
The maximum file size to process, in bytes. Files larger than this size will be ignored.
125+
Maximum file size (in bytes) to consider.
101126
exclude_pattern : Tuple[str, ...]
102-
A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
127+
Glob patterns for pruning the file set.
103128
include_pattern : Tuple[str, ...]
104-
A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
129+
Glob patterns for including files in the output.
105130
branch : str, optional
106-
The branch to clone (optional).
131+
Specific branch to ingest (defaults to the repository's default).
132+
token : str, optional
133+
GitHub personal-access token (PAT). Needed when *source* refers to a
134+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
107135
108136
Raises
109137
------
110138
Abort
111139
If there is an error during the execution of the command, this exception is raised to abort the process.
112140
"""
113141
try:
114-
# Combine default and custom ignore patterns
142+
# Normalise pattern containers (the ingest layer expects sets)
115143
exclude_patterns = set(exclude_pattern)
116144
include_patterns = set(include_pattern)
117145

118-
if not output:
146+
# Choose a default output path if none provided
147+
if output is None:
119148
output = OUTPUT_FILE_NAME
120-
summary, _, _ = await ingest_async(source, max_size, include_patterns, exclude_patterns, branch, output=output)
149+
150+
summary, _, _ = await ingest_async(
151+
source=source,
152+
max_file_size=max_size,
153+
include_patterns=include_patterns,
154+
exclude_patterns=exclude_patterns,
155+
branch=branch,
156+
output=output,
157+
token=token,
158+
)
121159

122160
click.echo(f"Analysis complete! Output written to: {output}")
123161
click.echo("\nSummary:")
124162
click.echo(summary)
125163

126164
except Exception as exc:
165+
# Convert any exception into click.Abort so that the exit status is non-zero
127166
click.echo(f"Error: {exc}", err=True)
128-
raise click.Abort()
167+
raise click.Abort() from exc
129168

130169

131170
if __name__ == "__main__":

src/gitingest/cloning.py

Lines changed: 44 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,24 @@
11
"""This module contains functions for cloning a Git repository to a local path."""
22

3-
import os
43
from pathlib import Path
54
from typing import Optional
65

6+
from gitingest.config import DEFAULT_TIMEOUT
77
from gitingest.schemas import CloneConfig
8-
from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command
8+
from gitingest.utils.git_utils import (
9+
check_repo_exists,
10+
create_git_auth_header,
11+
create_git_command,
12+
ensure_git_installed,
13+
run_command,
14+
validate_github_token,
15+
)
16+
from gitingest.utils.os_utils import ensure_directory
917
from gitingest.utils.timeout_wrapper import async_timeout
1018

11-
TIMEOUT: int = 60
1219

13-
14-
@async_timeout(TIMEOUT)
15-
async def clone_repo(config: CloneConfig) -> None:
20+
@async_timeout(DEFAULT_TIMEOUT)
21+
async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None:
1622
"""
1723
Clone a repository to a local path based on the provided configuration.
1824
@@ -24,13 +30,15 @@ async def clone_repo(config: CloneConfig) -> None:
2430
----------
2531
config : CloneConfig
2632
The configuration for cloning the repository.
33+
token : str, optional
34+
GitHub personal-access token (PAT). Needed when *source* refers to a
35+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
36+
Must start with 'github_pat_' or 'ghp_' for GitHub repositories.
2737
2838
Raises
2939
------
3040
ValueError
31-
If the repository is not found or if the provided URL is invalid.
32-
OSError
33-
If an error occurs while creating the parent directory for the repository.
41+
If the repository is not found, if the provided URL is invalid, or if the token format is invalid.
3442
"""
3543
# Extract and validate query parameters
3644
url: str = config.url
@@ -39,19 +47,23 @@ async def clone_repo(config: CloneConfig) -> None:
3947
branch: Optional[str] = config.branch
4048
partial_clone: bool = config.subpath != "/"
4149

50+
# Validate token if provided
51+
if token and url.startswith("https://github.com"):
52+
validate_github_token(token)
53+
4254
# Create parent directory if it doesn't exist
43-
parent_dir = Path(local_path).parent
44-
try:
45-
os.makedirs(parent_dir, exist_ok=True)
46-
except OSError as exc:
47-
raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc
55+
await ensure_directory(Path(local_path).parent)
4856

4957
# Check if the repository exists
50-
if not await check_repo_exists(url):
51-
raise ValueError("Repository not found, make sure it is public")
58+
if not await check_repo_exists(url, token=token):
59+
raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.")
5260

53-
clone_cmd = ["git", "clone", "--single-branch"]
54-
# TODO re-enable --recurse-submodules
61+
clone_cmd = ["git"]
62+
if token and url.startswith("https://github.com"):
63+
clone_cmd += ["-c", create_git_auth_header(token)]
64+
65+
clone_cmd += ["clone", "--single-branch"]
66+
# TODO: Re-enable --recurse-submodules when submodule support is needed
5567

5668
if partial_clone:
5769
clone_cmd += ["--filter=blob:none", "--sparse"]
@@ -67,19 +79,17 @@ async def clone_repo(config: CloneConfig) -> None:
6779
await ensure_git_installed()
6880
await run_command(*clone_cmd)
6981

70-
if commit or partial_clone:
71-
checkout_cmd = ["git", "-C", local_path]
72-
73-
if partial_clone:
74-
subpath = config.subpath.lstrip("/")
75-
if config.blob:
76-
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
77-
subpath = str(Path(subpath).parent.as_posix())
78-
79-
checkout_cmd += ["sparse-checkout", "set", subpath]
80-
81-
if commit:
82-
checkout_cmd += ["checkout", commit]
83-
84-
# Check out the specific commit and/or subpath
85-
await run_command(*checkout_cmd)
82+
# Checkout the subpath if it is a partial clone
83+
if partial_clone:
84+
subpath = config.subpath.lstrip("/")
85+
if config.blob:
86+
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
87+
subpath = str(Path(subpath).parent.as_posix())
88+
89+
checkout_cmd = create_git_command(["git"], local_path, url, token)
90+
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
91+
92+
# Checkout the commit if it is provided
93+
if commit:
94+
checkout_cmd = create_git_command(["git"], local_path, url, token)
95+
await run_command(*checkout_cmd, "checkout", commit)

src/gitingest/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal
88
MAX_FILES = 10_000 # Maximum number of files to process
99
MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB
10+
DEFAULT_TIMEOUT = 60 # seconds
1011

1112
OUTPUT_FILE_NAME = "digest.txt"
1213

src/gitingest/entrypoint.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import asyncio
44
import inspect
5+
import os
56
import shutil
67
from typing import Optional, Set, Tuple, Union
78

@@ -17,6 +18,7 @@ async def ingest_async(
1718
include_patterns: Optional[Union[str, Set[str]]] = None,
1819
exclude_patterns: Optional[Union[str, Set[str]]] = None,
1920
branch: Optional[str] = None,
21+
token: Optional[str] = None,
2022
output: Optional[str] = None,
2123
) -> Tuple[str, str, str]:
2224
"""
@@ -39,6 +41,9 @@ async def ingest_async(
3941
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
4042
branch : str, optional
4143
The branch to clone and ingest. If `None`, the default branch is used.
44+
token : str, optional
45+
GitHub personal-access token (PAT). Needed when *source* refers to a
46+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
4247
output : str, optional
4348
File path where the summary and content should be written. If `None`, the results are not written to a file.
4449
@@ -57,6 +62,9 @@ async def ingest_async(
5762
"""
5863
repo_cloned = False
5964

65+
if not token:
66+
token = os.getenv("GITHUB_TOKEN")
67+
6068
try:
6169
query: IngestionQuery = await parse_query(
6270
source=source,
@@ -71,7 +79,7 @@ async def ingest_async(
7179
query.branch = selected_branch
7280

7381
clone_config = query.extract_clone_config()
74-
clone_coroutine = clone_repo(clone_config)
82+
clone_coroutine = clone_repo(clone_config, token=token)
7583

7684
if inspect.iscoroutine(clone_coroutine):
7785
if asyncio.get_event_loop().is_running():
@@ -102,6 +110,7 @@ def ingest(
102110
include_patterns: Optional[Union[str, Set[str]]] = None,
103111
exclude_patterns: Optional[Union[str, Set[str]]] = None,
104112
branch: Optional[str] = None,
113+
token: Optional[str] = None,
105114
output: Optional[str] = None,
106115
) -> Tuple[str, str, str]:
107116
"""
@@ -124,6 +133,9 @@ def ingest(
124133
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
125134
branch : str, optional
126135
The branch to clone and ingest. If `None`, the default branch is used.
136+
token : str, optional
137+
GitHub personal-access token (PAT). Needed when *source* refers to a
138+
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
127139
output : str, optional
128140
File path where the summary and content should be written. If `None`, the results are not written to a file.
129141
@@ -146,6 +158,7 @@ def ingest(
146158
include_patterns=include_patterns,
147159
exclude_patterns=exclude_patterns,
148160
branch=branch,
161+
token=token,
149162
output=output,
150163
)
151164
)

0 commit comments

Comments
 (0)