Skip to content

Commit c5c9d5d

Browse files
call resolve_commit in parse_remote_repo instead of in clone_repo
1 parent 8a2c6cb commit c5c9d5d

File tree

16 files changed

+294
-156
lines changed

16 files changed

+294
-156
lines changed

.pre-commit-config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,12 @@ repos:
122122
pytest-asyncio,
123123
pytest-mock,
124124
python-dotenv,
125+
'sentry-sdk[fastapi]',
125126
slowapi,
126127
starlette>=0.40.0,
128+
strenum; python_version < '3.11',
127129
tiktoken>=0.7.0,
130+
typing_extensions>= 4.0.0; python_version < '3.10',
128131
uvicorn>=0.11.7,
129132
]
130133

@@ -144,9 +147,12 @@ repos:
144147
pytest-asyncio,
145148
pytest-mock,
146149
python-dotenv,
150+
'sentry-sdk[fastapi]',
147151
slowapi,
148152
starlette>=0.40.0,
153+
strenum; python_version < '3.11',
149154
tiktoken>=0.7.0,
155+
typing_extensions>= 4.0.0; python_version < '3.10',
150156
uvicorn>=0.11.7,
151157
]
152158

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK
3232
```bash
3333
python -m venv .venv
3434
source .venv/bin/activate
35-
pip install -e ".[dev]"
35+
pip install -e ".[dev,server]"
3636
pre-commit install
3737
```
3838

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,14 @@ You can install it using `pip`:
6666
pip install gitingest
6767
```
6868

69+
or
70+
71+
```bash
72+
pip install gitingest[server]
73+
```
74+
75+
to include server dependencies for self-hosting.
76+
6977
However, it might be a good idea to use `pipx` to install it.
7078
You can install `pipx` using your preferred package manager.
7179

pyproject.toml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,14 @@ readme = {file = "README.md", content-type = "text/markdown" }
66
requires-python = ">= 3.8"
77
dependencies = [
88
"click>=8.0.0",
9-
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
109
"httpx",
1110
"pathspec>=0.12.1",
1211
"pydantic",
1312
"python-dotenv",
14-
"slowapi",
1513
"starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw)
14+
"strenum; python_version < '3.11'",
1615
"tiktoken>=0.7.0", # Support for o200k_base encoding
1716
"typing_extensions>= 4.0.0; python_version < '3.10'",
18-
"uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)
19-
"prometheus-client",
2017
]
2118

2219
license = {file = "LICENSE"}
@@ -46,6 +43,14 @@ dev = [
4643
"pytest-mock",
4744
]
4845

46+
server = [
47+
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
48+
"prometheus-client",
49+
"sentry-sdk[fastapi]",
50+
"slowapi",
51+
"uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)
52+
]
53+
4954
[project.scripts]
5055
gitingest = "gitingest.__main__:main"
5156

src/gitingest/output_formatter.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,14 @@ def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False)
8282
# Local scenario
8383
parts.append(f"Directory: {query.slug}")
8484

85-
if query.commit:
86-
parts.append(f"Commit: {query.commit}")
87-
elif query.tag:
85+
if query.tag:
8886
parts.append(f"Tag: {query.tag}")
8987
elif query.branch and query.branch not in ("main", "master"):
9088
parts.append(f"Branch: {query.branch}")
9189

90+
if query.commit:
91+
parts.append(f"Commit: {query.commit}")
92+
9293
if query.subpath != "/" and not single_file:
9394
parts.append(f"Subpath: {query.subpath}")
9495

src/gitingest/query_parser.py

Lines changed: 48 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,17 @@
55
import uuid
66
import warnings
77
from pathlib import Path
8-
from urllib.parse import unquote, urlparse
8+
from typing import Literal
99

1010
from gitingest.config import TMP_BASE_PATH
1111
from gitingest.schemas import IngestionQuery
12-
from gitingest.utils.git_utils import check_repo_exists, fetch_remote_branches_or_tags
12+
from gitingest.utils.git_utils import fetch_remote_branches_or_tags, resolve_commit
1313
from gitingest.utils.query_parser_utils import (
14-
KNOWN_GIT_HOSTS,
14+
PathKind,
15+
_fallback_to_root,
1516
_get_user_and_repo_from_path,
1617
_is_valid_git_commit_hash,
17-
_validate_host,
18-
_validate_url_scheme,
18+
_normalise_source,
1919
)
2020

2121

@@ -40,80 +40,59 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
4040
An ``IngestionQuery`` object containing the parsed details of the repository.
4141
4242
"""
43-
source = unquote(source)
44-
45-
# Attempt to parse
46-
parsed_url = urlparse(source)
47-
48-
if parsed_url.scheme:
49-
_validate_url_scheme(parsed_url.scheme)
50-
_validate_host(parsed_url.netloc.lower())
51-
52-
else: # Will be of the form 'host/user/repo' or 'user/repo'
53-
tmp_host = source.split("/")[0].lower()
54-
if "." in tmp_host:
55-
_validate_host(tmp_host)
56-
else:
57-
# No scheme, no domain => user typed "user/repo", so we'll guess the domain.
58-
host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source), token=token)
59-
source = f"{host}/{source}"
60-
61-
source = "https://" + source
62-
parsed_url = urlparse(source)
63-
64-
host = parsed_url.netloc.lower()
65-
user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path)
43+
parsed_url = await _normalise_source(source, token=token)
44+
host = parsed_url.netloc
45+
user, repo = _get_user_and_repo_from_path(parsed_url.path)
6646

6747
_id = str(uuid.uuid4())
68-
slug = f"{user_name}-{repo_name}"
48+
slug = f"{user}-{repo}"
6949
local_path = TMP_BASE_PATH / _id / slug
70-
url = f"https://{host}/{user_name}/{repo_name}"
50+
url = f"https://{host}/{user}/{repo}"
7151

7252
query = IngestionQuery(
7353
host=host,
74-
user_name=user_name,
75-
repo_name=repo_name,
54+
user_name=user,
55+
repo_name=repo,
7656
url=url,
7757
local_path=local_path,
7858
slug=slug,
7959
id=_id,
8060
)
8161

82-
remaining_parts = parsed_url.path.strip("/").split("/")[2:]
83-
84-
if not remaining_parts:
85-
return query
62+
path_parts = parsed_url.path.strip("/").split("/")[2:]
8663

87-
possible_type = remaining_parts.pop(0) # e.g. 'issues', 'pull', 'tree', 'blob'
64+
# No path beyond <user>/<repo>: fall back to the repository root
65+
if not path_parts:
66+
return await _fallback_to_root(query, token=token)
8867

89-
# If no extra path parts, just return
90-
if not remaining_parts:
91-
return query
68+
kind = PathKind(path_parts.pop(0)) # may raise ValueError
69+
query.type = kind
9270

93-
# If this is an issues page or pull requests, return early without processing subpath
9471
# TODO: Handle issues and pull requests
95-
if remaining_parts and possible_type in {"issues", "pull"}:
72+
if query.type in {PathKind.ISSUES, PathKind.PULL}:
9673
msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root."
97-
warnings.warn(msg, RuntimeWarning, stacklevel=2)
98-
return query
74+
return await _fallback_to_root(query, token=token, warn_msg=msg)
9975

100-
if possible_type not in {"tree", "blob"}:
101-
# TODO: Handle other types
102-
msg = f"Warning: Type '{possible_type}' is not yet supported: {url}. Returning repository root."
103-
warnings.warn(msg, RuntimeWarning, stacklevel=2)
104-
return query
76+
# If no extra path parts, just return
77+
if not path_parts:
78+
msg = f"Warning: No extra path parts: {url}. Returning repository root."
79+
return await _fallback_to_root(query, token=token, warn_msg=msg)
10580

106-
query.type = possible_type
81+
if query.type not in {PathKind.TREE, PathKind.BLOB}:
82+
# TODO: Handle other types
83+
msg = f"Warning: Type '{query.type}' is not yet supported: {url}. Returning repository root."
84+
return await _fallback_to_root(query, token=token, warn_msg=msg)
10785

10886
# Commit, branch, or tag
109-
commit_or_branch_or_tag = remaining_parts[0]
110-
if _is_valid_git_commit_hash(commit_or_branch_or_tag): # Commit
111-
query.commit = commit_or_branch_or_tag
112-
remaining_parts.pop(0) # Consume the commit hash
87+
ref = path_parts[0]
88+
89+
if _is_valid_git_commit_hash(ref): # Commit
90+
query.commit = ref
91+
path_parts.pop(0) # Consume the commit hash
11392
else: # Branch or tag
11493
# Try to resolve a tag
11594
query.tag = await _configure_branch_or_tag(
116-
remaining_parts,
95+
path_parts,
11796
url=url,
11897
ref_type="tags",
11998
token=token,
@@ -122,15 +101,17 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
122101
# If no tag found, try to resolve a branch
123102
if not query.tag:
124103
query.branch = await _configure_branch_or_tag(
125-
remaining_parts,
104+
path_parts,
126105
url=url,
127106
ref_type="branches",
128107
token=token,
129108
)
130109

131110
# Only configure subpath if we have identified a commit, branch, or tag.
132-
if remaining_parts and (query.commit or query.branch or query.tag):
133-
query.subpath += "/".join(remaining_parts)
111+
if path_parts and (query.commit or query.branch or query.tag):
112+
query.subpath += "/".join(path_parts)
113+
114+
query.commit = await resolve_commit(query.extract_clone_config(), token=token)
134115

135116
return query
136117

@@ -155,21 +136,21 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery:
155136

156137

157138
async def _configure_branch_or_tag(
158-
remaining_parts: list[str],
139+
path_parts: list[str],
159140
*,
160141
url: str,
161-
ref_type: str,
142+
ref_type: Literal["branches", "tags"],
162143
token: str | None = None,
163144
) -> str | None:
164145
"""Configure the branch or tag based on the remaining parts of the URL.
165146
166147
Parameters
167148
----------
168-
remaining_parts : list[str]
169-
The remaining parts of the URL path.
149+
path_parts : list[str]
150+
The path parts of the URL.
170151
url : str
171152
The URL of the repository.
172-
ref_type : str
153+
ref_type : Literal["branches", "tags"]
173154
The type of reference to configure. Can be "branches" or "tags".
174155
token : str | None
175156
GitHub personal access token (PAT) for accessing private repositories.
@@ -179,16 +160,7 @@ async def _configure_branch_or_tag(
179160
str | None
180161
The branch or tag name if found, otherwise ``None``.
181162
182-
Raises
183-
------
184-
ValueError
185-
If the ``ref_type`` parameter is not "branches" or "tags".
186-
187163
"""
188-
if ref_type not in ("branches", "tags"):
189-
msg = f"Invalid reference type: {ref_type}"
190-
raise ValueError(msg)
191-
192164
_ref_type = "tags" if ref_type == "tags" else "branches"
193165

194166
try:
@@ -198,50 +170,18 @@ async def _configure_branch_or_tag(
198170
# If remote discovery fails, we optimistically treat the first path segment as the branch/tag.
199171
msg = f"Warning: Failed to fetch {_ref_type}: {exc}"
200172
warnings.warn(msg, RuntimeWarning, stacklevel=2)
201-
return remaining_parts.pop(0) if remaining_parts else None
173+
return path_parts.pop(0) if path_parts else None
202174

203175
# Iterate over the path components and try to find a matching branch/tag
204176
candidate_parts: list[str] = []
205177

206-
for part in remaining_parts:
178+
for part in path_parts:
207179
candidate_parts.append(part)
208180
candidate_name = "/".join(candidate_parts)
209181
if candidate_name in branches_or_tags:
210182
# We found a match — now consume exactly the parts that form the branch/tag
211-
del remaining_parts[: len(candidate_parts)]
183+
del path_parts[: len(candidate_parts)]
212184
return candidate_name
213185

214-
# No match found; leave remaining_parts intact
186+
# No match found; leave path_parts intact
215187
return None
216-
217-
218-
async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: str | None = None) -> str:
219-
"""Attempt to find a valid repository host for the given ``user_name`` and ``repo_name``.
220-
221-
Parameters
222-
----------
223-
user_name : str
224-
The username or owner of the repository.
225-
repo_name : str
226-
The name of the repository.
227-
token : str | None
228-
GitHub personal access token (PAT) for accessing private repositories.
229-
230-
Returns
231-
-------
232-
str
233-
The domain of the valid repository host.
234-
235-
Raises
236-
------
237-
ValueError
238-
If no valid repository host is found for the given ``user_name`` and ``repo_name``.
239-
240-
"""
241-
for domain in KNOWN_GIT_HOSTS:
242-
candidate = f"https://{domain}/{user_name}/{repo_name}"
243-
if await check_repo_exists(candidate, token=token if domain.startswith("github.") else None):
244-
return domain
245-
246-
msg = f"Could not find a valid repository host for '{user_name}/{repo_name}'."
247-
raise ValueError(msg)
Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
"""Compatibility layer for typing."""
22

3+
try:
4+
from enum import StrEnum # type: ignore[attr-defined] # Py ≥ 3.11
5+
except ImportError:
6+
from strenum import StrEnum # type: ignore[import-untyped] # Py ≤ 3.10
7+
38
try:
49
from typing import ParamSpec, TypeAlias # type: ignore[attr-defined] # Py ≥ 3.10
510
except ImportError:
6-
from typing_extensions import ParamSpec, TypeAlias # type: ignore[attr-defined] # Py 3.8 / 3.9
11+
from typing_extensions import ParamSpec, TypeAlias # type: ignore[attr-defined] # Py ≤ 3.9
712

813
try:
914
from typing import Annotated # type: ignore[attr-defined] # Py ≥ 3.9
1015
except ImportError:
11-
from typing_extensions import Annotated # type: ignore[attr-defined] # Py 3.8
16+
from typing_extensions import Annotated # type: ignore[attr-defined] # Py 3.8
1217

13-
__all__ = ["Annotated", "ParamSpec", "TypeAlias"]
18+
__all__ = ["Annotated", "ParamSpec", "StrEnum", "TypeAlias"]

0 commit comments

Comments
 (0)