Skip to content

Commit 0a35f58

Browse files
Simplify GitHub-specific code and generalize Git repository handling
Co-authored-by: nicoragne <nicoragne@hotmail.fr>
1 parent d8238d3 commit 0a35f58

File tree

4 files changed: 26 additions and 131 deletions.

src/gitingest/clone.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
create_git_auth_header,
1313
create_git_command,
1414
ensure_git_installed,
15-
is_github_host,
1615
resolve_commit,
1716
run_command,
1817
)

src/gitingest/utils/exceptions.py

Lines changed: 1 addition & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,4 @@ def __init__(self, message: str) -> None:
1616
super().__init__(message)
1717

1818

19-
class InvalidGitHubTokenError(ValueError):
20-
"""Exception raised when a GitHub Personal Access Token is malformed."""
21-
22-
def __init__(self) -> None:
23-
msg = (
24-
"Invalid GitHub token format. To generate a token, go to "
25-
"https://github.com/settings/tokens/new?description=gitingest&scopes=repo."
26-
)
27-
super().__init__(msg)
19+

src/gitingest/utils/git_utils.py

Lines changed: 16 additions & 89 deletions
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,7 @@
99
from typing import TYPE_CHECKING, Iterable
1010
from urllib.parse import urlparse
1111

12-
import httpx
13-
from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND
1412

15-
from gitingest.utils.compat_func import removesuffix
1613

1714
from gitingest.utils.logging_config import get_logger
1815

@@ -25,23 +22,6 @@
2522

2623

2724

28-
def is_github_host(url: str) -> bool:
29-
"""Check if a URL is from a GitHub host (github.com or GitHub Enterprise).
30-
31-
Parameters
32-
----------
33-
url : str
34-
The URL to check
35-
36-
Returns
37-
-------
38-
bool
39-
True if the URL is from a GitHub host, False otherwise
40-
41-
"""
42-
hostname = urlparse(url).hostname or ""
43-
return hostname.startswith("github.")
44-
4525

4626
async def run_command(*args: str) -> tuple[bytes, bytes]:
4727
"""Execute a shell command asynchronously and return (stdout, stderr) bytes.
@@ -115,80 +95,27 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool:
11595
url : str
11696
URL of the Git repository to check.
11797
token : str | None
118-
GitHub personal access token (PAT) for accessing private repositories.
98+
Personal access token (PAT) for accessing private repositories.
11999
120100
Returns
121101
-------
122102
bool
123103
``True`` if the repository exists, ``False`` otherwise.
124104
125-
Raises
126-
------
127-
RuntimeError
128-
If the host returns an unrecognised status code.
129-
130105
"""
131-
headers = {}
132-
133-
if token and is_github_host(url):
134-
host, owner, repo = _parse_github_url(url)
135-
# Public GitHub vs. GitHub Enterprise
136-
base_api = "https://api.github.com" if host == "github.com" else f"https://{host}/api/v3"
137-
url = f"{base_api}/repos/{owner}/{repo}"
138-
headers["Authorization"] = f"Bearer {token}"
139-
140-
async with httpx.AsyncClient(follow_redirects=True) as client:
141-
try:
142-
response = await client.head(url, headers=headers)
143-
except httpx.RequestError:
144-
return False
145-
146-
status_code = response.status_code
147-
148-
if status_code == HTTP_200_OK:
106+
try:
107+
# Use git ls-remote to check if repository exists
108+
cmd = ["git"]
109+
if token:
110+
cmd += ["-c", create_git_auth_header(token, url=url)]
111+
cmd += ["ls-remote", "--heads", url]
112+
113+
await run_command(*cmd)
149114
return True
150-
if status_code in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND}:
115+
except Exception:
151116
return False
152-
msg = f"Unexpected HTTP status {status_code} for {url}"
153-
raise RuntimeError(msg)
154117

155118

156-
def _parse_github_url(url: str) -> tuple[str, str, str]:
157-
"""Parse a GitHub URL and return (hostname, owner, repo).
158-
159-
Parameters
160-
----------
161-
url : str
162-
The URL of the GitHub repository to parse.
163-
164-
Returns
165-
-------
166-
tuple[str, str, str]
167-
A tuple containing the hostname, owner, and repository name.
168-
169-
Raises
170-
------
171-
ValueError
172-
If the URL is not a valid GitHub repository URL.
173-
174-
"""
175-
parsed = urlparse(url)
176-
if parsed.scheme not in {"http", "https"}:
177-
msg = f"URL must start with http:// or https://: {url!r}"
178-
raise ValueError(msg)
179-
180-
if not parsed.hostname or not parsed.hostname.startswith("github."):
181-
msg = f"Un-recognised GitHub hostname: {parsed.hostname!r}"
182-
raise ValueError(msg)
183-
184-
parts = removesuffix(parsed.path, ".git").strip("/").split("/")
185-
expected_path_length = 2
186-
if len(parts) != expected_path_length:
187-
msg = f"Path must look like /<owner>/<repo>: {parsed.path!r}"
188-
raise ValueError(msg)
189-
190-
owner, repo = parts
191-
return parsed.hostname, owner, repo
192119

193120

194121
async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str | None = None) -> list[str]:
@@ -201,7 +128,7 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
201128
ref_type: str
202129
The type of reference to fetch. Can be "branches" or "tags".
203130
token : str | None
204-
GitHub personal access token (PAT) for accessing private repositories.
131+
Personal access token (PAT) for accessing private repositories.
205132
206133
Returns
207134
-------
@@ -221,7 +148,7 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
221148
cmd = ["git"]
222149

223150
# Add authentication if needed
224-
if token and is_github_host(url):
151+
if token:
225152
cmd += ["-c", create_git_auth_header(token, url=url)]
226153

227154
cmd += ["ls-remote"]
@@ -314,7 +241,7 @@ async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None
314241
config : CloneConfig
315242
The configuration for cloning the repository, including subpath and blob flag.
316243
token : str | None
317-
GitHub personal access token (PAT) for accessing private repositories.
244+
Personal access token (PAT) for accessing private repositories.
318245
319246
"""
320247
subpath = config.subpath.lstrip("/")
@@ -333,7 +260,7 @@ async def resolve_commit(config: CloneConfig, token: str | None) -> str:
333260
config : CloneConfig
334261
The configuration for cloning the repository.
335262
token : str | None
336-
GitHub personal access token (PAT) for accessing private repositories.
263+
Personal access token (PAT) for accessing private repositories.
337264
338265
Returns
339266
-------
@@ -365,7 +292,7 @@ async def _resolve_ref_to_sha(url: str, pattern: str, token: str | None = None)
365292
pattern : str
366293
The pattern to use to resolve the commit SHA.
367294
token : str | None
368-
GitHub personal access token (PAT) for accessing private repositories.
295+
Personal access token (PAT) for accessing private repositories.
369296
370297
Returns
371298
-------
@@ -380,7 +307,7 @@ async def _resolve_ref_to_sha(url: str, pattern: str, token: str | None = None)
380307
"""
381308
# Build: git [-c http.<host>/.extraheader=Auth...] ls-remote <url> <pattern>
382309
cmd: list[str] = ["git"]
383-
if token and is_github_host(url):
310+
if token:
384311
cmd += ["-c", create_git_auth_header(token, url=url)]
385312

386313
cmd += ["ls-remote", url, pattern]

tests/test_git_utils.py

Lines changed: 9 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111

1212
import pytest
1313

14-
from gitingest.utils.git_utils import create_git_auth_header, create_git_command, is_github_host
14+
from gitingest.utils.git_utils import create_git_auth_header, create_git_command
1515

1616
if TYPE_CHECKING:
1717
from pathlib import Path
@@ -118,50 +118,27 @@ def test_create_git_command_helper_calls(
118118
assert "HEADER" not in cmd
119119

120120

121-
@pytest.mark.parametrize(
122-
("url", "expected"),
123-
[
124-
# GitHub.com URLs
125-
("https://github.com/owner/repo.git", True),
126-
("http://github.com/owner/repo.git", True),
127-
("https://github.com/owner/repo", True),
128-
# GitHub Enterprise URLs
129-
("https://github.company.com/owner/repo.git", True),
130-
("https://github.enterprise.org/owner/repo.git", True),
131-
("http://github.internal/owner/repo.git", True),
132-
("https://github.example.co.uk/owner/repo.git", True),
133-
# Non-GitHub URLs
134-
("https://gitlab.com/owner/repo.git", False),
135-
("https://bitbucket.org/owner/repo.git", False),
136-
("https://git.example.com/owner/repo.git", False),
137-
("https://mygithub.com/owner/repo.git", False), # doesn't start with "github."
138-
("https://subgithub.com/owner/repo.git", False),
139-
("https://example.com/github/repo.git", False),
140-
# Edge cases
141-
("", False),
142-
("not-a-url", False),
143-
("ftp://github.com/owner/repo.git", True), # Different protocol but still github.com
144-
],
145-
)
146-
def test_is_github_host(url: str, *, expected: bool) -> None:
147-
"""Test that ``is_github_host`` correctly identifies GitHub and GitHub Enterprise URLs."""
148-
assert is_github_host(url) == expected
121+
149122

150123

151124
@pytest.mark.parametrize(
152125
("token", "url", "expected_hostname"),
153126
[
154-
# GitHub.com URLs (default)
127+
# GitHub.com URLs
155128
("ghp_" + "a" * 36, "https://github.com", "github.com"),
156129
("ghp_" + "a" * 36, "https://github.com/owner/repo.git", "github.com"),
157130
# GitHub Enterprise URLs
158131
("ghp_" + "b" * 36, "https://github.company.com", "github.company.com"),
159132
("ghp_" + "c" * 36, "https://github.enterprise.org/owner/repo.git", "github.enterprise.org"),
160133
("ghp_" + "d" * 36, "http://github.internal", "github.internal"),
134+
# Other Git services
135+
("glpat-xxxxxxxxxxxxxxxxxxxx", "https://gitlab.com/owner/repo.git", "gitlab.com"),
136+
("some_token", "https://bitbucket.org/owner/repo.git", "bitbucket.org"),
137+
("custom_token", "https://git.example.com/owner/repo.git", "git.example.com"),
161138
],
162139
)
163-
def test_create_git_auth_header_with_ghe_url(token: str, url: str, expected_hostname: str) -> None:
164-
"""Test that ``create_git_auth_header`` handles GitHub Enterprise URLs correctly."""
140+
def test_create_git_auth_header_with_different_hostnames(token: str, url: str, expected_hostname: str) -> None:
141+
"""Test that ``create_git_auth_header`` handles different Git service URLs correctly."""
165142
header = create_git_auth_header(token, url=url)
166143
expected_basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode()
167144
expected = f"http.https://{expected_hostname}/.extraheader=Authorization: Basic {expected_basic}"

0 commit comments

Comments
 (0)