55import uuid
66import warnings
77from pathlib import Path
8- from urllib . parse import unquote , urlparse
8+ from typing import Literal
99
1010from gitingest .config import TMP_BASE_PATH
1111from gitingest .schemas import IngestionQuery
12- from gitingest .utils .git_utils import check_repo_exists , fetch_remote_branches_or_tags
12+ from gitingest .utils .git_utils import fetch_remote_branches_or_tags , resolve_commit
1313from gitingest .utils .query_parser_utils import (
14- KNOWN_GIT_HOSTS ,
14+ PathKind ,
15+ _fallback_to_root ,
1516 _get_user_and_repo_from_path ,
1617 _is_valid_git_commit_hash ,
17- _validate_host ,
18- _validate_url_scheme ,
18+ _normalise_source ,
1919)
2020
2121
@@ -40,80 +40,59 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
4040 A dictionary containing the parsed details of the repository.
4141
4242 """
43- source = unquote (source )
44-
45- # Attempt to parse
46- parsed_url = urlparse (source )
47-
48- if parsed_url .scheme :
49- _validate_url_scheme (parsed_url .scheme )
50- _validate_host (parsed_url .netloc .lower ())
51-
52- else : # Will be of the form 'host/user/repo' or 'user/repo'
53- tmp_host = source .split ("/" )[0 ].lower ()
54- if "." in tmp_host :
55- _validate_host (tmp_host )
56- else :
57- # No scheme, no domain => user typed "user/repo", so we'll guess the domain.
58- host = await try_domains_for_user_and_repo (* _get_user_and_repo_from_path (source ), token = token )
59- source = f"{ host } /{ source } "
60-
61- source = "https://" + source
62- parsed_url = urlparse (source )
63-
64- host = parsed_url .netloc .lower ()
65- user_name , repo_name = _get_user_and_repo_from_path (parsed_url .path )
43+ parsed_url = await _normalise_source (source , token = token )
44+ host = parsed_url .netloc
45+ user , repo = _get_user_and_repo_from_path (parsed_url .path )
6646
6747 _id = str (uuid .uuid4 ())
68- slug = f"{ user_name } -{ repo_name } "
48+ slug = f"{ user } -{ repo } "
6949 local_path = TMP_BASE_PATH / _id / slug
70- url = f"https://{ host } /{ user_name } /{ repo_name } "
50+ url = f"https://{ host } /{ user } /{ repo } "
7151
7252 query = IngestionQuery (
7353 host = host ,
74- user_name = user_name ,
75- repo_name = repo_name ,
54+ user_name = user ,
55+ repo_name = repo ,
7656 url = url ,
7757 local_path = local_path ,
7858 slug = slug ,
7959 id = _id ,
8060 )
8161
82- remaining_parts = parsed_url .path .strip ("/" ).split ("/" )[2 :]
83-
84- if not remaining_parts :
85- return query
62+ path_parts = parsed_url .path .strip ("/" ).split ("/" )[2 :]
8663
87- possible_type = remaining_parts .pop (0 ) # e.g. 'issues', 'pull', 'tree', 'blob'
64+ # main branch
65+ if not path_parts :
66+ return await _fallback_to_root (query , token = token )
8867
89- # If no extra path parts, just return
90- if not remaining_parts :
91- return query
68+ kind = PathKind (path_parts .pop (0 )) # may raise ValueError
69+ query .type = kind
9270
93- # If this is an issues page or pull requests, return early without processing subpath
9471 # TODO: Handle issues and pull requests
95- if remaining_parts and possible_type in {"issues" , "pull" }:
72+ if query . type in {PathKind . ISSUES , PathKind . PULL }:
9673 msg = f"Warning: Issues and pull requests are not yet supported: { url } . Returning repository root."
97- warnings .warn (msg , RuntimeWarning , stacklevel = 2 )
98- return query
74+ return await _fallback_to_root (query , token = token , warn_msg = msg )
9975
100- if possible_type not in {"tree" , "blob" }:
101- # TODO: Handle other types
102- msg = f"Warning: Type '{ possible_type } ' is not yet supported: { url } . Returning repository root."
103- warnings .warn (msg , RuntimeWarning , stacklevel = 2 )
104- return query
76+ # If no extra path parts, just return
77+ if not path_parts :
78+ msg = f"Warning: No extra path parts: { url } . Returning repository root."
79+ return await _fallback_to_root (query , token = token , warn_msg = msg )
10580
106- query .type = possible_type
81+ if query .type not in {PathKind .TREE , PathKind .BLOB }:
82+ # TODO: Handle other types
83+ msg = f"Warning: Type '{ query .type } ' is not yet supported: { url } . Returning repository root."
84+ return await _fallback_to_root (query , token = token , warn_msg = msg )
10785
10886 # Commit, branch, or tag
109- commit_or_branch_or_tag = remaining_parts [0 ]
110- if _is_valid_git_commit_hash (commit_or_branch_or_tag ): # Commit
111- query .commit = commit_or_branch_or_tag
112- remaining_parts .pop (0 ) # Consume the commit hash
87+ ref = path_parts [0 ]
88+
89+ if _is_valid_git_commit_hash (ref ): # Commit
90+ query .commit = ref
91+ path_parts .pop (0 ) # Consume the commit hash
11392 else : # Branch or tag
11493 # Try to resolve a tag
11594 query .tag = await _configure_branch_or_tag (
116- remaining_parts ,
95+ path_parts ,
11796 url = url ,
11897 ref_type = "tags" ,
11998 token = token ,
@@ -122,15 +101,17 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
122101 # If no tag found, try to resolve a branch
123102 if not query .tag :
124103 query .branch = await _configure_branch_or_tag (
125- remaining_parts ,
104+ path_parts ,
126105 url = url ,
127106 ref_type = "branches" ,
128107 token = token ,
129108 )
130109
131110 # Only configure subpath if we have identified a commit, branch, or tag.
132- if remaining_parts and (query .commit or query .branch or query .tag ):
133- query .subpath += "/" .join (remaining_parts )
111+ if path_parts and (query .commit or query .branch or query .tag ):
112+ query .subpath += "/" .join (path_parts )
113+
114+ query .commit = await resolve_commit (query .extract_clone_config (), token = token )
134115
135116 return query
136117
@@ -155,21 +136,21 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery:
155136
156137
157138async def _configure_branch_or_tag (
158- remaining_parts : list [str ],
139+ path_parts : list [str ],
159140 * ,
160141 url : str ,
161- ref_type : str ,
142+ ref_type : Literal [ "branches" , "tags" ] ,
162143 token : str | None = None ,
163144) -> str | None :
164145 """Configure the branch or tag based on the remaining parts of the URL.
165146
166147 Parameters
167148 ----------
168- remaining_parts : list[str]
169- The remaining parts of the URL path .
149+ path_parts : list[str]
150+ The path parts of the URL.
170151 url : str
171152 The URL of the repository.
172- ref_type : str
153+ ref_type : Literal["branches", "tags"]
173154 The type of reference to configure. Can be "branches" or "tags".
174155 token : str | None
175156 GitHub personal access token (PAT) for accessing private repositories.
@@ -179,16 +160,7 @@ async def _configure_branch_or_tag(
179160 str | None
180161 The branch or tag name if found, otherwise ``None``.
181162
182- Raises
183- ------
184- ValueError
185- If the ``ref_type`` parameter is not "branches" or "tags".
186-
187163 """
188- if ref_type not in ("branches" , "tags" ):
189- msg = f"Invalid reference type: { ref_type } "
190- raise ValueError (msg )
191-
192164 _ref_type = "tags" if ref_type == "tags" else "branches"
193165
194166 try :
@@ -198,50 +170,18 @@ async def _configure_branch_or_tag(
198170 # If remote discovery fails, we optimistically treat the first path segment as the branch/tag.
199171 msg = f"Warning: Failed to fetch { _ref_type } : { exc } "
200172 warnings .warn (msg , RuntimeWarning , stacklevel = 2 )
201- return remaining_parts .pop (0 ) if remaining_parts else None
173+ return path_parts .pop (0 ) if path_parts else None
202174
203175 # Iterate over the path components and try to find a matching branch/tag
204176 candidate_parts : list [str ] = []
205177
206- for part in remaining_parts :
178+ for part in path_parts :
207179 candidate_parts .append (part )
208180 candidate_name = "/" .join (candidate_parts )
209181 if candidate_name in branches_or_tags :
210182 # We found a match — now consume exactly the parts that form the branch/tag
211- del remaining_parts [: len (candidate_parts )]
183+ del path_parts [: len (candidate_parts )]
212184 return candidate_name
213185
214- # No match found; leave remaining_parts intact
186+ # No match found; leave path_parts intact
215187 return None
216-
217-
218- async def try_domains_for_user_and_repo (user_name : str , repo_name : str , token : str | None = None ) -> str :
219- """Attempt to find a valid repository host for the given ``user_name`` and ``repo_name``.
220-
221- Parameters
222- ----------
223- user_name : str
224- The username or owner of the repository.
225- repo_name : str
226- The name of the repository.
227- token : str | None
228- GitHub personal access token (PAT) for accessing private repositories.
229-
230- Returns
231- -------
232- str
233- The domain of the valid repository host.
234-
235- Raises
236- ------
237- ValueError
238- If no valid repository host is found for the given ``user_name`` and ``repo_name``.
239-
240- """
241- for domain in KNOWN_GIT_HOSTS :
242- candidate = f"https://{ domain } /{ user_name } /{ repo_name } "
243- if await check_repo_exists (candidate , token = token if domain .startswith ("github." ) else None ):
244- return domain
245-
246- msg = f"Could not find a valid repository host for '{ user_name } /{ repo_name } '."
247- raise ValueError (msg )
0 commit comments