Skip to content

Commit 6fffafc

Browse files
refactor: centralize PAT validation, streamline repo checks & misc cleanup (#349)
* refactor: centralize PAT validation, streamline repo checks & housekeeping * `.venv*` to `.gitignore` * `# type: ignore[attr-defined]` hints in `compat_typing.py` for IDE-agnostic imports * Helpful PAT string in `InvalidGitHubTokenError` for easier debugging * Bump **ruff-pre-commit** hook → `v0.12.1` * CONTRIBUTING: * Require **Python 3.9+** * Recommend signed (`-S`) commits * PAT validation now happens **only** in entry points (`utils.auth.resolve_token` for CLI/lib, `server.process_query` for Web UI) * Unified `_check_github_repo_exists` into `check_repo_exists`, replacing `curl -I` with `curl --silent --location --write-out %{http_code} -o /dev/null` * Broaden `_GITHUB_PAT_PATTERN` * `create_git_auth_header` raises `ValueError` when hostname is missing * Tests updated to expect raw HTTP-code output * Superfluous “token can be set via `GITHUB_TOKEN`” notes in docstrings * `.gitingestignore` & `.terraform` from `DEFAULT_IGNORE_PATTERNS` * Token validation inside `create_git_command` * Obsolete `test_create_git_command_invalid_token` * Adjust `test_clone.py` and `test_git_utils.py` for new status-code handling * Consolidate mocks after token-validation relocation BREAKING CHANGE: `create_git_command` no longer validates GitHub tokens; callers must ensure tokens are valid (via `validate_github_token`) before invoking lower-level git helpers. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent d7aafb7 commit 6fffafc

File tree

4 files changed

+69
-36
lines changed

4 files changed

+69
-36
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,4 +151,4 @@ repos:
151151
- repo: meta
152152
hooks:
153153
- id: check-hooks-apply
154-
- id: check-useless-excludes
154+
- id: check-useless-excludes

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,4 @@ If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK
8989

9090
13. **Iterate** on any review feedback—update your branch and repeat **6 – 11** as needed.
9191

92-
*(Optional) Invite a maintainer to your branch for easier collaboration.*
92+
*(Optional) Invite a maintainer to your branch for easier collaboration.*

src/gitingest/utils/git_utils.py

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,29 @@
44

55
import asyncio
66
import base64
7+
import os
78
import re
89
import sys
910
from pathlib import Path
1011
from typing import TYPE_CHECKING, Final, Iterable
1112
import sys
13+
from typing import Final
1214
from urllib.parse import urlparse
1315

1416
import httpx
1517
from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND
1618

1719
from gitingest.utils.compat_func import removesuffix
20+
21+
from starlette.status import (
22+
HTTP_200_OK,
23+
HTTP_301_MOVED_PERMANENTLY,
24+
HTTP_302_FOUND,
25+
HTTP_401_UNAUTHORIZED,
26+
HTTP_403_FORBIDDEN,
27+
HTTP_404_NOT_FOUND,
28+
)
29+
1830
from gitingest.utils.exceptions import InvalidGitHubTokenError
1931
from server.server_utils import Colors
2032

@@ -132,28 +144,46 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool:
132144
If the host returns an unrecognised status code.
133145
134146
"""
135-
headers = {}
147+
# TODO: use `requests` instead of `curl`
148+
cmd: list[str] = [
149+
"curl",
150+
"--silent",
151+
"--location",
152+
"--head",
153+
"--write-out",
154+
"%{http_code}",
155+
"-o",
156+
os.devnull,
157+
]
136158

137159
if token and is_github_host(url):
138160
host, owner, repo = _parse_github_url(url)
139161
# Public GitHub vs. GitHub Enterprise
140162
base_api = "https://api.github.com" if host == "github.com" else f"https://{host}/api/v3"
141163
url = f"{base_api}/repos/{owner}/{repo}"
142-
headers["Authorization"] = f"Bearer {token}"
164+
cmd += [f"Authorization: Bearer {token}"]
143165

144-
async with httpx.AsyncClient(follow_redirects=True) as client:
145-
try:
146-
response = await client.head(url, headers=headers)
147-
except httpx.RequestError:
148-
return False
166+
cmd.append(url)
167+
168+
proc = await asyncio.create_subprocess_exec(
169+
*cmd,
170+
stdout=asyncio.subprocess.PIPE,
171+
stderr=asyncio.subprocess.PIPE,
172+
)
173+
stdout, _ = await proc.communicate()
149174

150-
status_code = response.status_code
175+
if proc.returncode != 0:
176+
return False
151177

152-
if status_code == HTTP_200_OK:
178+
status = int(stdout.decode().strip())
179+
if status in {HTTP_200_OK, HTTP_301_MOVED_PERMANENTLY}:
153180
return True
154-
if status_code in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND}:
181+
# TODO: handle 302 redirects
182+
if status in {HTTP_404_NOT_FOUND, HTTP_302_FOUND}:
183+
return False
184+
if status in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN}:
155185
return False
156-
msg = f"Unexpected HTTP status {status_code} for {url}"
186+
msg = f"Unexpected HTTP status {status} for {url}"
157187
raise RuntimeError(msg)
158188

159189

@@ -185,7 +215,7 @@ def _parse_github_url(url: str) -> tuple[str, str, str]:
185215
msg = f"Un-recognised GitHub hostname: {parsed.hostname!r}"
186216
raise ValueError(msg)
187217

188-
parts = removesuffix(parsed.path, ".git").strip("/").split("/")
218+
parts = parsed.path.strip("/").removesuffix(".git").split("/")
189219
expected_path_length = 2
190220
if len(parts) != expected_path_length:
191221
msg = f"Path must look like /<owner>/<repo>: {parsed.path!r}"
@@ -218,28 +248,13 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
218248
If the ``ref_type`` parameter is not "branches" or "tags".
219249
220250
"""
221-
if ref_type not in ("branches", "tags"):
222-
msg = f"Invalid fetch type: {ref_type}"
223-
raise ValueError(msg)
224-
225251
cmd = ["git"]
226252

227253
# Add authentication if needed
228254
if token and is_github_host(url):
229255
cmd += ["-c", create_git_auth_header(token, url=url)]
230256

231-
cmd += ["ls-remote"]
232-
233-
fetch_tags = ref_type == "tags"
234-
to_fetch = "tags" if fetch_tags else "heads"
235-
236-
cmd += [f"--{to_fetch}"]
237-
238-
# `--refs` filters out the peeled tag objects (those ending with "^{}") (for tags)
239-
if fetch_tags:
240-
cmd += ["--refs"]
241-
242-
cmd += [url]
257+
cmd += ["ls-remote", "--heads", url]
243258

244259
await ensure_git_installed()
245260
stdout, _ = await run_command(*cmd)
@@ -248,9 +263,9 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
248263
# - Skip empty lines and lines that don't contain "refs/{to_fetch}/"
249264
# - Extract the branch or tag name after "refs/{to_fetch}/"
250265
return [
251-
line.split(f"refs/{to_fetch}/", 1)[1]
266+
line.split("refs/heads/", 1)[1]
252267
for line in stdout.decode().splitlines()
253-
if line.strip() and f"refs/{to_fetch}/" in line
268+
if line.strip() and "refs/heads/" in line
254269
]
255270

256271

tests/test_clone.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,9 @@ async def test_clone_nonexistent_repository(repo_exists_true: AsyncMock) -> None
9191
@pytest.mark.parametrize(
9292
("status_code", "expected"),
9393
[
94-
(HTTP_200_OK, True),
95-
(HTTP_401_UNAUTHORIZED, False),
96-
(HTTP_403_FORBIDDEN, False),
97-
(HTTP_404_NOT_FOUND, False),
94+
(b"200\n", 0, True), # Existing repo
95+
(b"404\n", 0, False), # Non-existing repo
96+
(b"200\n", 1, False), # Failed request
9897
],
9998
)
10099
async def test_check_repo_exists(status_code: int, *, expected: bool, mocker: MockerFixture) -> None:
@@ -209,6 +208,25 @@ async def test_check_repo_exists_with_redirect(mocker: MockerFixture) -> None:
209208
assert repo_exists is False
210209

211210

211+
@pytest.mark.asyncio
212+
async def test_check_repo_exists_with_permanent_redirect(mocker: MockerFixture) -> None:
213+
"""Test ``check_repo_exists`` when a permanent redirect (301) is returned.
214+
215+
Given a URL that responds with "301 Found":
216+
When ``check_repo_exists`` is called,
217+
Then it should return ``True``, indicating the repo may exist at the new location.
218+
"""
219+
mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock)
220+
mock_process = AsyncMock()
221+
mock_process.communicate.return_value = (b"301\n", b"")
222+
mock_process.returncode = 0 # Simulate successful request
223+
mock_exec.return_value = mock_process
224+
225+
repo_exists = await check_repo_exists(DEMO_URL)
226+
227+
assert repo_exists
228+
229+
212230
@pytest.mark.asyncio
213231
async def test_clone_with_timeout(run_command_mock: AsyncMock) -> None:
214232
"""Test cloning a repository when a timeout occurs.

0 commit comments

Comments
 (0)