 
 from __future__ import annotations
 
+import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
-from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors
 
+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+logger = logging.getLogger(__name__)
+
+
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    max_file_size: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check if digest already exists on S3 and return response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        Original input text.
+    max_file_size : int
+        Maximum file size in KB.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        Response if file exists on S3, None otherwise.
+
+    """
+    if not is_s3_enabled():
+        return None
+
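+    # Any failure below is non-fatal: log it and fall back to a normal clone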
+    try:
+        # Use git ls-remote to get commit SHA without cloning
+        clone_config = query.extract_clone_config()
+        query.commit = await resolve_commit(clone_config, token=token)
+        # Generate S3 file path using the resolved commit
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=query.commit,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+
+        # Check if file exists on S3
+        if check_s3_object_exists(s3_file_path):
+            # File exists on S3, serve it directly without cloning
+            s3_url = _build_s3_url(s3_file_path)
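+            # Remember the digest's S3 location on the query object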
+            query.s3_url = s3_url
+
+            short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+            # Try to get cached metadata
+            metadata = get_metadata_from_s3(s3_file_path)
+
+            if metadata:
+                # Use cached metadata if available
+                summary = metadata.summary
+                tree = metadata.tree
+                content = metadata.content
+            else:
+                # Fall back to placeholder messages if metadata is not available
+                summary = "Digest served from cache (S3). Download the full digest to see content details."
+                tree = "Digest served from cache. Download the full digest to see the file tree."
+                content = "Digest served from cache. Download the full digest to see the content."
+
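+            # Cache hit: answer without cloning; the full digest is downloadable from S3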
+            return IngestSuccessResponse(
+                repo_url=input_text,
+                short_repo_url=short_repo_url,
+                summary=summary,
+                digest_url=s3_url,
+                tree=tree,
+                content=content,
+                default_max_file_size=max_file_size,
+                pattern_type=pattern_type,
+                pattern=pattern,
+            )
+    except Exception as exc:
+        # Log the exception but don't fail the entire request
+        logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+    return None
+
+
+def _store_digest_content(
+    query: IngestionQuery,
+    clone_config: CloneConfig,
+    digest_content: str,
+    summary: str,
+    tree: str,
+    content: str,
+) -> None:
+    """Store digest content either to S3 or locally based on configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+    clone_config : CloneConfig
+        The clone configuration object.
+    digest_content : str
+        The complete digest content to store.
+    summary : str
+        The summary content for metadata.
+    tree : str
+        The tree content for metadata.
+    content : str
+        The file content for metadata.
+
+    """
+    if is_s3_enabled():
+        # Upload to S3 instead of storing locally
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=query.commit,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+        s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+        # Also upload metadata JSON for caching
+        metadata = S3Metadata(
+            summary=summary,
+            tree=tree,
+            content=content,
+        )
+        try:
+            upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+            logger.debug("Successfully uploaded metadata to S3")
+        except Exception as metadata_exc:
+            # Log the error but don't fail the entire request
+            logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+        # Store S3 URL in query for later use
+        query.s3_url = s3_url
+    else:
+        # Store locally
+        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        with local_txt_file.open("w", encoding="utf-8") as f:
+            f.write(digest_content)
+
+
+def _generate_digest_url(query: IngestionQuery) -> str:
+    """Generate the digest URL based on S3 configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+
+    Returns
+    -------
+    str
+        The digest URL.
+
+    Raises
+    ------
+    RuntimeError
+        If S3 is enabled but no S3 URL was generated.
+
+    """
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+        return digest_url
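+    # S3 disabled: the digest is stored locally and served by the download endpoint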
+    return f"/api/download/file/{query.id}"
+
 
 async def process_query(
     input_text: str,
@@ -69,10 +261,22 @@ async def process_query(
         include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
     )
 
+    # Check if digest already exists on S3 before cloning
+    s3_response = await _check_s3_cache(
+        query=query,
+        input_text=input_text,
+        max_file_size=max_file_size,
+        pattern_type=pattern_type.value,
+        pattern=pattern,
+        token=token,
+    )
+    if s3_response:
+        return s3_response
+
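+    # Cache miss: clone the repository and build the digest from scratch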
     clone_config = query.extract_clone_config()
     await clone_repo(clone_config, token=token)
 
-    short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+    short_repo_url = f"{query.user_name}/{query.repo_name}"
 
     # The commit hash should always be available at this point
     if not query.commit:
@@ -81,30 +285,8 @@ async def process_query(
 
     try:
         summary, tree, content = ingest_query(query)
-
-        # Prepare the digest content (tree + content)
         digest_content = tree + "\n" + content
-
-        # Store digest based on S3 configuration
-        if is_s3_enabled():
-            # Upload to S3 instead of storing locally
-            s3_file_path = generate_s3_file_path(
-                source=query.url,
-                user_name=cast("str", query.user_name),
-                repo_name=cast("str", query.repo_name),
-                commit=query.commit,
-                include_patterns=query.include_patterns,
-                ignore_patterns=query.ignore_patterns,
-            )
-            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-            # Store S3 URL in query for later use
-            query.s3_url = s3_url
-        else:
-            # Store locally
-            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-            with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(digest_content)
-
+        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)
         return IngestErrorResponse(error=str(exc))
@@ -123,15 +305,7 @@ async def process_query(
         summary=summary,
     )
 
-    # Generate digest_url based on S3 configuration
-    if is_s3_enabled():
-        digest_url = getattr(query, "s3_url", None)
-        if not digest_url:
-            # This should not happen if S3 upload was successful
-            msg = "S3 is enabled but no S3 URL was generated"
-            raise RuntimeError(msg)
-    else:
-        digest_url = f"/api/download/file/{query.id}"
+    digest_url = _generate_digest_url(query)
 
     return IngestSuccessResponse(
         repo_url=input_text,