Skip to content

Commit 6b606bf

Browse files
committed
wip
1 parent 6bcbac5 commit 6b606bf

File tree

8 files changed

+300
-171
lines changed

8 files changed

+300
-171
lines changed

src/gitingest/ingestion.py

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,6 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
4141
If the path cannot be found, is not a file, or the file has no content.
4242
4343
"""
44-
logger.info(
45-
"Starting file ingestion",
46-
extra={
47-
"slug": query.slug,
48-
"subpath": query.subpath,
49-
"local_path": str(query.local_path),
50-
"max_file_size": query.max_file_size,
51-
},
52-
)
5344

5445
subpath = Path(query.subpath.strip("/")).as_posix()
5546
path = query.local_path / subpath
@@ -84,13 +75,6 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
8475
msg = f"File {file_node.name} has no content"
8576
raise ValueError(msg)
8677

87-
logger.info(
88-
"Single file processing completed",
89-
extra={
90-
"file_name": file_node.name,
91-
"file_size": file_node.size,
92-
},
93-
)
9478
return format_node(file_node, query=query)
9579

9680
logger.info("Processing directory", extra={"directory_path": str(path)})
@@ -106,17 +90,6 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
10690

10791
_process_node(node=root_node, query=query, stats=stats)
10892

109-
logger.info(
110-
"Directory processing completed",
111-
extra={
112-
"total_files": root_node.file_count,
113-
"total_directories": root_node.dir_count,
114-
"total_size_bytes": root_node.size,
115-
"stats_total_files": stats.total_files,
116-
"stats_total_size": stats.total_size,
117-
},
118-
)
119-
12093
return format_node(root_node, query=query)
12194

12295

src/gitingest/output_formatter.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,19 @@ def _create_tree_structure(
168168
display_name += "/"
169169
elif node.type == FileSystemNodeType.SYMLINK:
170170
display_name += " -> " + readlink(node.path).name
171+
172+
# Add likelihood score if this file was selected by AI (score > 0)
173+
if node.likelihood_score > 0:
174+
# Color code based on score
175+
if node.likelihood_score >= 80:
176+
score_indicator = f" [🟢 {node.likelihood_score}%]"
177+
elif node.likelihood_score >= 60:
178+
score_indicator = f" [🟡 {node.likelihood_score}%]"
179+
elif node.likelihood_score >= 40:
180+
score_indicator = f" [🟠 {node.likelihood_score}%]"
181+
else:
182+
score_indicator = f" [🔴 {node.likelihood_score}%]"
183+
display_name += score_indicator
171184

172185
tree_str += f"{prefix}{current_prefix}{display_name}\n"
173186

src/gitingest/schemas/filesystem.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
if TYPE_CHECKING:
1515
from pathlib import Path
1616

17+
from pathlib import PurePath
18+
1719
SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48
1820

1921

@@ -49,6 +51,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
4951
dir_count: int = 0
5052
depth: int = 0
5153
children: list[FileSystemNode] = field(default_factory=list)
54+
likelihood_score: int = 0 # AI likelihood score (0-100) for file selection, 0 = default/not AI-selected
5255

5356
def sort_children(self) -> None:
5457
"""Sort the children nodes of a directory according to a specific order.
@@ -83,6 +86,61 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]:
8386

8487
self.children.sort(key=_sort_key)
8588

89+
def map(self, func) -> None:
    """Call ``func`` on this node and then on every descendant, parents first.

    Parameters
    ----------
    func : callable
        Callback invoked once per node with the ``FileSystemNode`` as its only
        argument. Its return value is ignored.

    Example
    -------
    >>> def print_file_names(node):
    ...     if node.type == FileSystemNodeType.FILE:
    ...         print(node.name)
    >>> root_node.map(print_file_names)
    """
    # Visit the current node before any of its descendants (pre-order).
    func(self)

    # Nothing to descend into unless this is a directory that has children.
    if self.type != FileSystemNodeType.DIRECTORY or not self.children:
        return

    for descendant in self.children:
        descendant.map(func)
111+
112+
def __getitem__(self, path: str) -> "FileSystemNode | None":
    """Return the first node in this subtree whose ``path_str`` equals ``path``.

    The subtree is searched depth-first, parents before children and children
    in list order. Paths are compared as ``PurePath`` objects, so redundant
    ``./`` components and repeated separators do not affect the match. The
    comparison follows the path rules of the platform the code runs on.

    Parameters
    ----------
    path : str
        The path string to search for.

    Returns
    -------
    FileSystemNode | None
        The matching node, or ``None`` when no node matches. Unlike a mapping,
        a missing path does not raise ``KeyError``.

    Example
    -------
    >>> file_node = root_node["src/main.py"]
    >>> if file_node is not None:
    ...     file_node.likelihood_score = 95
    """
    # Parse the requested path once instead of once per visited node.
    target = PurePath(path)

    # Explicit stack instead of recursion: same pre-order visit sequence,
    # without re-entering __getitem__ (and re-parsing ``path``) for each child.
    pending = [self]
    while pending:
        node = pending.pop()

        # Nodes with an empty/missing path_str can never match.
        if node.path_str and PurePath(node.path_str) == target:
            return node

        if node.type == FileSystemNodeType.DIRECTORY and node.children:
            # Push in reverse so the first child is popped (visited) first.
            pending.extend(reversed(node.children))

    return None
143+
86144
@property
87145
def content_string(self) -> str:
88146
"""Return the content of the node as a string, including path and content.

src/server/ai_file_selector.py

Lines changed: 91 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@
2323
class FileSelectionResponse(BaseModel):
    """Response model for AI file selection."""

    # File paths selected by the AI.
    selected_files: list[str]
    # Optional per-file details keyed by file path (e.g. score and reasoning).
    # Defaults to None so callers that only supply `selected_files` and
    # `reasoning` keep validating; without a default the field is required.
    selected_files_detailed: dict[str, dict] | None = None
    # Overall explanation of the selection.
    reasoning: str
2829

2930

@@ -137,21 +138,29 @@ def _create_selection_prompt(
137138
{content_sample}
138139
139140
CONSTRAINTS:
140-
- Target exactly {context_size_tokens:,} tokens in the final output
141+
- The output will be trimmed down to {context_size_tokens:,} tokens in the end.
141142
- Prioritize files that are most relevant to the user's request
142143
- Include key architectural files (main entry points, configuration, core modules)
143144
- Balance breadth (overview) with depth (important details)
144145
- Avoid redundant or duplicate content
145146
- Consider file dependencies and relationships
147+
- When in doubt, include the file
146148
147149
RESPONSE FORMAT:
150+
For every file, include a level of "likelihood of being relevant" from 1 to 100.
151+
Multiple files can have the same likelihood.
148152
Return a JSON object with this exact structure:
149153
{{
150-
"selected_files": [
151-
"path/to/file1.py",
152-
"path/to/file2.js",
153-
"path/to/file3.md"
154-
],
154+
"selected_files": {{
155+
"path/to/file1.py": {{
156+
"score": 90,
157+
"reasoning": "Brief explanation of why this file has this score"
158+
}},
159+
"path/to/file2.py": {{
160+
"score": 80,
161+
"reasoning": "Brief explanation of why this file has this score"
162+
}}
163+
}},
155164
"reasoning": "Brief explanation of why these files were selected and how they serve the user's request."
156165
}}
157166
@@ -219,14 +228,42 @@ async def select_files(
219228
logger.warning("Failed to parse JSON response, attempting to extract files manually")
220229
# Fallback: try to extract file paths from response
221230
file_paths = re.findall(r'"([^"]+\.[a-zA-Z]+)"', response_text)
231+
# Convert to new dict format with default scores
232+
file_dict = {path: {"score": 50, "reasoning": "Default score"} for path in file_paths}
222233
parsed_response = {
223-
"selected_files": file_paths,
234+
"selected_files": file_dict,
224235
"reasoning": "Extracted files from AI response (JSON parsing failed)"
225236
}
226237

238+
# Extract selected files and scores from AI response
239+
selected_files_data = parsed_response.get("selected_files", {})
240+
reasoning = parsed_response.get("reasoning", "No reasoning provided")
241+
242+
# Convert new format to scores dict and preserve detailed info
243+
selected_files_dict = {}
244+
detailed_files = {}
245+
for file_path, file_data in selected_files_data.items():
246+
if isinstance(file_data, dict) and "score" in file_data:
247+
selected_files_dict[file_path] = file_data["score"]
248+
detailed_files[file_path] = file_data
249+
else:
250+
# Fallback for old format or malformed data
251+
selected_files_dict[file_path] = file_data if isinstance(file_data, int) else 50
252+
detailed_files[file_path] = {"score": file_data if isinstance(file_data, int) else 50}
253+
254+
logger.info("Applying AI scores to tree", extra={
255+
"files_with_scores": len(selected_files_dict),
256+
"sample_scores": dict(list(selected_files_dict.items())[:3]) if selected_files_dict else {}
257+
})
258+
259+
# Update tree nodes with likelihood scores
260+
self._update_tree_scores(root_node, selected_files_dict)
261+
262+
# Return the actual file paths for frontend display
227263
selection = FileSelectionResponse(
228-
selected_files=parsed_response.get("selected_files", []),
229-
reasoning=parsed_response.get("reasoning", "No reasoning provided")
264+
selected_files=list(selected_files_dict.keys()),
265+
selected_files_detailed=detailed_files if detailed_files else None,
266+
reasoning=reasoning
230267
)
231268

232269
logger.info("AI file selection completed", extra={
@@ -238,11 +275,13 @@ async def select_files(
238275

239276
except Exception as e:
240277
logger.error("AI file selection failed", extra={"error": str(e)})
241-
# Fallback: return all files up to a reasonable limit
242-
all_files = self._extract_all_files(root_node)
278+
# Set fallback scores directly on tree nodes
279+
self._set_fallback_scores(root_node)
280+
243281
return FileSelectionResponse(
244-
selected_files=all_files[:50], # Limit to 50 files as fallback
245-
reasoning=f"AI selection failed ({str(e)}), using fallback selection of key files"
282+
selected_files=[],
283+
selected_files_detailed=None,
284+
reasoning=f"AI selection failed ({str(e)}), using fallback scoring"
246285
)
247286

248287
def _extract_all_files(self, node: FileSystemNode, files: list[str] | None = None) -> list[str]:
@@ -258,6 +297,44 @@ def _extract_all_files(self, node: FileSystemNode, files: list[str] | None = Non
258297

259298
return files
260299

300+
def _update_tree_scores(self, root_node: FileSystemNode, selected_files_dict: dict[str, int]) -> None:
    """Copy AI likelihood scores onto the matching nodes of the tree.

    Parameters
    ----------
    root_node : FileSystemNode
        Root of the tree whose nodes should receive scores.
    selected_files_dict : dict[str, int]
        Mapping of file path to score. The values originate from a model
        response, so they are validated here: anything that cannot be read as
        a number is skipped, and numeric values are truncated to ``int`` and
        clamped to the 0-100 range.

    Notes
    -----
    Paths that do not match any node are ignored. When several nodes share a
    path, the first one in depth-first order receives the score.
    """
    # Local import keeps this method self-contained.
    from pathlib import PurePath

    if not selected_files_dict:
        return

    # Index the tree once so each lookup is a dict access rather than a full
    # tree search per selected path.
    nodes_by_path: dict[PurePath, FileSystemNode] = {}

    def _register(node: FileSystemNode) -> None:
        if node.path_str:
            # setdefault keeps the first node seen for a given path.
            nodes_by_path.setdefault(PurePath(node.path_str), node)

    root_node.map(_register)

    for path, raw_score in selected_files_dict.items():
        node = nodes_by_path.get(PurePath(path))
        if node is None:
            continue

        try:
            score = int(float(raw_score))
        except (TypeError, ValueError, OverflowError):
            # A non-numeric score would break later numeric comparisons on
            # likelihood_score, so leave the node untouched instead.
            logger.warning("Ignoring invalid AI score", extra={
                "path": path,
                "score": repr(raw_score)
            })
            continue

        score = max(0, min(100, score))
        node.likelihood_score = score
        logger.debug("Updated node score", extra={
            "path": path,
            "score": score
        })
310+
311+
def _set_fallback_scores(self, root_node: FileSystemNode) -> None:
    """Assign heuristic likelihood scores to every file node in the tree.

    Used when no AI scores are available. Only file nodes are scored; other
    node types are left unchanged.

    Scoring, first matching rule wins:

    * 90 - file name contains ``readme``, ``main``, ``index``, ``app`` or ``server``
    * 70 - source extensions (``py``, ``js``, ``ts``, ``java``, ``cpp``, ``c``, ``go``, ``rs``)
    * 60 - config extensions (``json``, ``yaml``, ``yml``, ``toml``, ``ini``, ``env``)
    * 50 - documentation extensions (``md``, ``txt``, ``rst``)
    * 30 - everything else

    Parameters
    ----------
    root_node : FileSystemNode
        Root of the tree to score; traversed with ``root_node.map``.
    """
    # Built once per call instead of once per visited node.
    high_priority_markers = ("readme", "main", "index", "app", "server")
    source_extensions = frozenset({"py", "js", "ts", "java", "cpp", "c", "go", "rs"})
    config_extensions = frozenset({"json", "yaml", "yml", "toml", "ini", "env"})
    doc_extensions = frozenset({"md", "txt", "rst"})

    def set_fallback_score(node: FileSystemNode) -> None:
        # Compare the enum member *name* rather than its value, so the check
        # does not depend on how the enum's values are defined.
        # NOTE(review): assumes the file member is named FILE, as in
        # `FileSystemNodeType.FILE` - confirm against the enum definition.
        if node.type.name != "FILE":
            return

        file_name = node.name.lower()
        # Extension = text after the last dot of the file name ("" if no dot).
        # A dot-file such as ".env" yields "env".
        _, dot, suffix = file_name.rpartition(".")
        file_ext = suffix if dot else ""

        # NOTE(review): substring match, so e.g. "mapper.py" also hits "app".
        if any(marker in file_name for marker in high_priority_markers):
            node.likelihood_score = 90
        elif file_ext in source_extensions:
            node.likelihood_score = 70
        elif file_ext in config_extensions:
            node.likelihood_score = 60
        elif file_ext in doc_extensions:
            node.likelihood_score = 50
        else:
            node.likelihood_score = 30

    # Visit every node in the tree; non-file nodes are skipped by the callback.
    root_node.map(set_fallback_score)
337+
261338

262339
def get_ai_file_selector() -> AIFileSelector | None:
263340
"""Get AI file selector instance, return None if not configured."""

0 commit comments

Comments
 (0)