Skip to content

Commit cdeadf5

Browse files
refactor: rework how symlinks are processed (no longer resolve) (#248)
Some changes to how we handle symlinks. We no longer resolve them, which should reduce the complexity quite a bit. We also now show the target name in the output. I also added a launch.json file for debugging because it took me a while to figure out how to get the debugger to work. Yeah, that's it. Please test before merging because I'm a bit of a dingus sometimes.
1 parent 8be6f56 commit cdeadf5

File tree

4 files changed

+55
-32
lines changed

4 files changed

+55
-32
lines changed

.vscode/launch.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"configurations": [
3+
{
4+
"name": "Python Debugger: Module",
5+
"type": "debugpy",
6+
"request": "launch",
7+
"module": "uvicorn",
8+
"args": ["server.main:app", "--host", "0.0.0.0", "--port", "8000"],
9+
"cwd": "${workspaceFolder}/src"
10+
}
11+
]
12+
}

src/gitingest/ingestion.py

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from gitingest.query_parsing import IngestionQuery
1010
from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats
1111
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
12-
from gitingest.utils.path_utils import _is_safe_symlink
1312

1413
try:
1514
import tomllib # type: ignore[import]
@@ -171,40 +170,22 @@ def _process_node(
171170
The parsed query object containing information about the repository and query parameters.
172171
stats : FileSystemStats
173172
Statistics tracking object for the total file count and size.
174-
175-
Raises
176-
------
177-
ValueError
178-
If an unexpected error occurs during processing.
179173
"""
180174

181175
if limit_exceeded(stats, node.depth):
182176
return
183177

184178
for sub_path in node.path.iterdir():
185179

186-
symlink_path = None
187-
if sub_path.is_symlink():
188-
if not _is_safe_symlink(sub_path, query.local_path):
189-
print(f"Skipping unsafe symlink: {sub_path}")
190-
continue
191-
192-
symlink_path = sub_path
193-
sub_path = sub_path.resolve()
194-
195-
if sub_path in stats.visited:
196-
print(f"Skipping already visited path: {sub_path}")
197-
continue
198-
199-
stats.visited.add(sub_path)
200-
201180
if query.ignore_patterns and _should_exclude(sub_path, query.local_path, query.ignore_patterns):
202181
continue
203182

204183
if query.include_patterns and not _should_include(sub_path, query.local_path, query.include_patterns):
205184
continue
206185

207-
if sub_path.is_file():
186+
if sub_path.is_symlink():
187+
_process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
188+
elif sub_path.is_file():
208189
_process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
209190
elif sub_path.is_dir():
210191

@@ -216,11 +197,6 @@ def _process_node(
216197
depth=node.depth + 1,
217198
)
218199

219-
# rename the subdir to reflect the symlink name
220-
if symlink_path:
221-
child_directory_node.name = symlink_path.name
222-
child_directory_node.path_str = str(symlink_path)
223-
224200
_process_node(
225201
node=child_directory_node,
226202
query=query,
@@ -230,13 +206,41 @@ def _process_node(
230206
node.size += child_directory_node.size
231207
node.file_count += child_directory_node.file_count
232208
node.dir_count += 1 + child_directory_node.dir_count
233-
234209
else:
235-
raise ValueError(f"Unexpected error: {sub_path} is neither a file nor a directory")
210+
print(f"Warning: {sub_path} is an unknown file type, skipping")
236211

237212
node.sort_children()
238213

239214

215+
def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None:
216+
"""
217+
Process a symlink in the file system.
218+
219+
This function records the symlink itself as a child node of the parent; the symlink's target is not resolved or followed.
220+
221+
Parameters
222+
----------
223+
path : Path
224+
The full path of the symlink.
225+
parent_node : FileSystemNode
226+
The parent directory node.
227+
stats : FileSystemStats
228+
Statistics tracking object for the total file count and size.
229+
local_path : Path
230+
The base path of the repository or directory being processed.
231+
"""
232+
child = FileSystemNode(
233+
name=path.name,
234+
type=FileSystemNodeType.SYMLINK,
235+
path_str=str(path.relative_to(local_path)),
236+
path=path,
237+
depth=parent_node.depth + 1,
238+
)
239+
stats.total_files += 1
240+
parent_node.children.append(child)
241+
parent_node.file_count += 1
242+
243+
240244
def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None:
241245
"""
242246
Process a file in the file system.

src/gitingest/output_formatters.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str,
3131

3232
if node.type == FileSystemNodeType.DIRECTORY:
3333
summary += f"Files analyzed: {node.file_count}\n"
34-
else:
34+
elif node.type == FileSystemNodeType.FILE:
3535
summary += f"File: {node.name}\n"
3636
summary += f"Lines: {len(node.content.splitlines()):,}\n"
3737

@@ -101,7 +101,7 @@ def _gather_file_contents(node: FileSystemNode) -> str:
101101
str
102102
The concatenated content of all files under the given node.
103103
"""
104-
if node.type == FileSystemNodeType.FILE:
104+
if node.type != FileSystemNodeType.DIRECTORY:
105105
return node.content_string
106106

107107
# Recursively gather contents of all files under the current directory
@@ -142,6 +142,8 @@ def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix:
142142
display_name = node.name
143143
if node.type == FileSystemNodeType.DIRECTORY:
144144
display_name += "/"
145+
elif node.type == FileSystemNodeType.SYMLINK:
146+
display_name += " -> " + node.path.readlink().name
145147

146148
tree_str += f"{prefix}{current_prefix}{display_name}\n"
147149

src/gitingest/schemas/filesystem_schema.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class FileSystemNodeType(Enum):
1818

1919
DIRECTORY = auto()
2020
FILE = auto()
21+
SYMLINK = auto()
2122

2223

2324
@dataclass
@@ -91,7 +92,8 @@ def content_string(self) -> str:
9192
"""
9293
parts = [
9394
SEPARATOR,
94-
f"File: {str(self.path_str).replace(os.sep, '/')}",
95+
f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}"
96+
+ (f" -> {self.path.readlink().name}" if self.type == FileSystemNodeType.SYMLINK else ""),
9597
SEPARATOR,
9698
f"{self.content}",
9799
]
@@ -116,6 +118,9 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
116118
if self.type == FileSystemNodeType.DIRECTORY:
117119
raise ValueError("Cannot read content of a directory node")
118120

121+
if self.type == FileSystemNodeType.SYMLINK:
122+
return ""
123+
119124
if not is_text_file(self.path):
120125
return "[Non-text file]"
121126

0 commit comments

Comments
 (0)