|
| 1 | +""" Define the schema for the filesystem representation. """ |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import os |
| 6 | +from dataclasses import dataclass, field |
| 7 | +from enum import Enum, auto |
| 8 | +from pathlib import Path |
| 9 | + |
| 10 | +from gitingest.exceptions import InvalidNotebookError |
| 11 | +from gitingest.utils.ingestion_utils import _get_encoding_list |
| 12 | +from gitingest.utils.notebook_utils import process_notebook |
| 13 | +from gitingest.utils.textfile_checker_utils import is_textfile |
| 14 | + |
# Horizontal rule (48 "=" characters plus a newline) that frames the
# "File: ..." header emitted by FileSystemNode.content_string.
SEPARATOR = f"{'=' * 48}\n"
| 16 | + |
| 17 | + |
class FileSystemNodeType(Enum):
    """Kinds of entries a file system node can represent."""

    DIRECTORY = auto()  # node is a directory
    FILE = auto()  # node is a file
| 23 | + |
| 24 | + |
@dataclass
class FileSystemStats:
    """Mutable counters accumulated while traversing a file system tree."""

    # Paths recorded as visited; every instance gets its own fresh set.
    visited: set[Path] = field(default_factory=set)
    # Running totals; both start at zero.
    total_files: int = 0
    total_size: int = 0
| 32 | + |
| 33 | + |
@dataclass
class FileSystemNode:  # pylint: disable=too-many-instance-attributes
    """
    Class representing a node in the file system (either a file or directory).

    This class has more than the recommended number of attributes because it needs to
    track various properties of files and directories for comprehensive analysis.

    Attributes
    ----------
    name : str
        Name of the entry; used for ordering and for hidden-entry detection (leading dot).
    type : FileSystemNodeType
        Whether the node is a file or a directory.
    path_str : str
        Path shown in the ``File:`` header produced by ``content_string``.
    path : Path
        Location on disk that ``content`` reads from.
    size, file_count, dir_count, depth : int
        Numeric properties defaulting to 0; they are filled in by the code that builds
        the tree (not part of this class).
    children : list[FileSystemNode]
        Child nodes; a fresh list is created for every instance.
    """

    name: str
    type: FileSystemNodeType  # FileSystemNodeType.DIRECTORY or FileSystemNodeType.FILE
    path_str: str
    path: Path
    size: int = 0
    file_count: int = 0
    dir_count: int = 0
    depth: int = 0
    children: list[FileSystemNode] = field(default_factory=list)  # default_factory avoids a shared list

    def sort_children(self) -> None:
        """
        Sort the children nodes of a directory according to a specific order.

        Order of sorting:
          1. README.md first (case-insensitive match, original relative order kept)
          2. Regular files (not starting with dot)
          3. Hidden files (starting with dot)
          4. Regular directories (not starting with dot)
          5. Hidden directories (starting with dot)
        Groups 2-5 are sorted by name (case-sensitive) within themselves.

        Every child is kept: a node that is not a FILE is ordered together with the
        directories instead of being dropped from the tree.
        """

        def _sort_key(child: FileSystemNode) -> tuple[int, str]:
            is_file = child.type == FileSystemNodeType.FILE
            if is_file and child.name.lower() == "readme.md":
                # Constant key: the stable sort keeps README files in their current order.
                return (0, "")
            hidden = child.name.startswith(".")
            # Files occupy groups 1 (regular) and 2 (hidden); everything else 3 and 4.
            group = (1 if is_file else 3) + int(hidden)
            return (group, child.name)

        # Rebind (rather than sort in place) so a list previously handed out is not mutated.
        self.children = sorted(self.children, key=_sort_key)

    @property
    def content_string(self) -> str:
        """
        Return the content of the node as a string.

        The result is a separator line, a ``File: <path>`` line, another separator line,
        the node's ``content`` and two trailing newlines.

        Returns
        -------
        str
            A string representation of the node's content.
        """
        # Use forward slashes in output paths regardless of the platform separator.
        display_path = str(self.path_str).replace(os.sep, "/")
        return f"{SEPARATOR}File: {display_path}\n{SEPARATOR}{self.content}\n\n"

    @property
    def content(self) -> str:  # pylint: disable=too-many-return-statements
        """
        Read the content of a file.

        Files that ``is_textfile`` rejects yield the placeholder ``"[Non-text file]"``.
        Paths ending in ``.ipynb`` are passed to ``process_notebook``. Any other path is
        decoded by trying each encoding from ``_get_encoding_list()`` in turn and
        returning the first successful read. Failures are reported as strings instead of
        being raised.

        Returns
        -------
        str
            The content of the file, or an error message if the file could not be read.
        """
        if self.type == FileSystemNodeType.FILE and not is_textfile(self.path):
            return "[Non-text file]"

        try:
            if self.path.suffix == ".ipynb":
                try:
                    return process_notebook(self.path)
                except Exception as exc:
                    # Deliberately broad: notebook conversion is best-effort and any
                    # failure is reported in the output rather than aborting.
                    return f"Error processing notebook: {exc}"

            for encoding in _get_encoding_list():
                try:
                    with self.path.open(encoding=encoding) as f:
                        return f.read()
                except UnicodeDecodeError:
                    # Wrong guess; fall through to the next candidate encoding.
                    continue
                except OSError as exc:
                    return f"Error reading file: {exc}"

            return "Error: Unable to decode file with available encodings"

        except (OSError, InvalidNotebookError) as exc:
            # Safety net for errors raised outside the inner handlers above.
            return f"Error reading file: {exc}"
0 commit comments