From e6d0dc2d3ce097dee23c1437244267d616400f8d Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 7 Mar 2025 16:42:08 +0100 Subject: [PATCH 01/34] Refactor ingestion logic to unify single-file and directory output, remove unused exceptions, and fix partial clone subpath handling. - Consolidate `format_directory` and `format_single_file` into a single `format_node` function - Remove unused exceptions (`MaxFilesReachedError`, `MaxFileSizeReachedError`, `AlreadyVisitedError`) - Update partial clone logic to correctly handle single-file paths by stripping the filename from subpath when `blob` is True - Improve docstrings and clean up code for better readability --- src/gitingest/cloning.py | 9 +- src/gitingest/exceptions.py | 21 ---- src/gitingest/filesystem_schema.py | 125 ++++++++++---------- src/gitingest/ingestion.py | 12 +- src/gitingest/output_formatters.py | 182 +++++++++++++---------------- 5 files changed, 151 insertions(+), 198 deletions(-) diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index ffd933c1..e24d5230 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -100,11 +100,12 @@ async def clone_repo(config: CloneConfig) -> None: checkout_cmd = ["git", "-C", local_path] if partial_clone: + subpath = config.subpath.lstrip("/") if config.blob: - # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name - checkout_cmd += ["sparse-checkout", "set", Path(config.subpath.lstrip("/")).parent] - else: - checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")] + # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. 
+ subpath = str(Path(subpath).parent.as_posix()) + + checkout_cmd += ["sparse-checkout", "set", subpath] if commit: checkout_cmd += ["checkout", commit] diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py index 3b01018d..241baf00 100644 --- a/src/gitingest/exceptions.py +++ b/src/gitingest/exceptions.py @@ -30,27 +30,6 @@ class AsyncTimeoutError(Exception): """ -class MaxFilesReachedError(Exception): - """Exception raised when the maximum number of files is reached.""" - - def __init__(self, max_files: int) -> None: - super().__init__(f"Maximum number of files ({max_files}) reached.") - - -class MaxFileSizeReachedError(Exception): - """Exception raised when the maximum file size is reached.""" - - def __init__(self, max_size: int): - super().__init__(f"Maximum file size limit ({max_size/1024/1024:.1f}MB) reached.") - - -class AlreadyVisitedError(Exception): - """Exception raised when a symlink target has already been visited.""" - - def __init__(self, path: str) -> None: - super().__init__(f"Symlink target already visited: {path}") - - class InvalidNotebookError(Exception): """Exception raised when a Jupyter notebook is invalid or cannot be processed.""" diff --git a/src/gitingest/filesystem_schema.py b/src/gitingest/filesystem_schema.py index 169830ba..77d0e464 100644 --- a/src/gitingest/filesystem_schema.py +++ b/src/gitingest/filesystem_schema.py @@ -7,12 +7,11 @@ from enum import Enum, auto from pathlib import Path -from gitingest.exceptions import InvalidNotebookError from gitingest.utils.ingestion_utils import _get_encoding_list from gitingest.utils.notebook_utils import process_notebook from gitingest.utils.textfile_checker_utils import is_textfile -SEPARATOR = "=" * 48 + "\n" +SEPARATOR = "=" * 48 class FileSystemNodeType(Enum): @@ -36,108 +35,104 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes """ Class representing a node in the file system (either a file or directory). 
- This class has more than the recommended number of attributes because it needs to - track various properties of files and directories for comprehensive analysis. + Tracks properties of files/directories for comprehensive analysis. """ name: str - type: FileSystemNodeType # e.g., "directory" or "file" + type: FileSystemNodeType path_str: str path: Path size: int = 0 file_count: int = 0 dir_count: int = 0 depth: int = 0 - children: list[FileSystemNode] = field(default_factory=list) # Using default_factory instead of empty list + children: list[FileSystemNode] = field(default_factory=list) def sort_children(self) -> None: """ Sort the children nodes of a directory according to a specific order. Order of sorting: - 1. README.md first - 2. Regular files (not starting with dot) - 3. Hidden files (starting with dot) - 4. Regular directories (not starting with dot) - 5. Hidden directories (starting with dot) - All groups are sorted alphanumerically within themselves. - """ - # Separate files and directories - files = [child for child in self.children if child.type == FileSystemNodeType.FILE] - directories = [child for child in self.children if child.type == FileSystemNodeType.DIRECTORY] + 2. Regular files (not starting with dot) + 3. Hidden files (starting with dot) + 4. Regular directories (not starting with dot) + 5. Hidden directories (starting with dot) - # Find README.md - readme_files = [f for f in files if f.name.lower() == "readme.md"] - other_files = [f for f in files if f.name.lower() != "readme.md"] + All groups are sorted alphanumerically within themselves. - # Separate hidden and regular files/directories - regular_files = [f for f in other_files if not f.name.startswith(".")] - hidden_files = [f for f in other_files if f.name.startswith(".")] - regular_dirs = [d for d in directories if not d.name.startswith(".")] - hidden_dirs = [d for d in directories if d.name.startswith(".")] + Raises + ------ + ValueError + If the node is not a directory. 
+ """ + if self.type != FileSystemNodeType.DIRECTORY: + raise ValueError("Cannot sort children of a non-directory node") - # Sort each group alphanumerically - regular_files.sort(key=lambda x: x.name) - hidden_files.sort(key=lambda x: x.name) - regular_dirs.sort(key=lambda x: x.name) - hidden_dirs.sort(key=lambda x: x.name) + def _sort_key(child: FileSystemNode) -> tuple[int, str]: + # Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir + name = child.name.lower() + if child.type == FileSystemNodeType.FILE: + if name == "readme.md": + return (0, name) + return (1 if not name.startswith(".") else 2, name) + return (3 if not name.startswith(".") else 4, name) - self.children = readme_files + regular_files + hidden_files + regular_dirs + hidden_dirs + self.children.sort(key=_sort_key) @property def content_string(self) -> str: """ - Return the content of the node as a string. - - This property returns the content of the node as a string, including the path and content. + Return the content of the node as a string, including path and content. Returns ------- str A string representation of the node's content. """ - content_repr = SEPARATOR + parts = [ + SEPARATOR, + f"File: {str(self.path_str).replace(os.sep, '/')}", + SEPARATOR, + f"{self.content}", + ] - # Use forward slashes in output paths - content_repr += f"File: {str(self.path_str).replace(os.sep, '/')}\n" - content_repr += SEPARATOR - content_repr += f"{self.content}\n\n" - return content_repr + return "\n".join(parts) + "\n\n" @property def content(self) -> str: # pylint: disable=too-many-return-statements """ - Read the content of a file. - - This function attempts to open a file and read its contents using UTF-8 encoding. - If an error occurs during reading (e.g., file is not found or permission error), - it returns an error message. + Read the content of a file if it's text (or a notebook). Return an error message otherwise. 
Returns ------- str The content of the file, or an error message if the file could not be read. + + Raises + ------ + ValueError + If the node is a directory. """ - if self.type == FileSystemNodeType.FILE and not is_textfile(self.path): + if self.type == FileSystemNodeType.DIRECTORY: + raise ValueError("Cannot read content of a directory node") + + if not is_textfile(self.path): return "[Non-text file]" - try: - if self.path.suffix == ".ipynb": - try: - return process_notebook(self.path) - except Exception as exc: - return f"Error processing notebook: {exc}" - - for encoding in _get_encoding_list(): - try: - with self.path.open(encoding=encoding) as f: - return f.read() - except UnicodeDecodeError: - continue - except OSError as exc: - return f"Error reading file: {exc}" - - return "Error: Unable to decode file with available encodings" - - except (OSError, InvalidNotebookError) as exc: - return f"Error reading file: {exc}" + if self.path.suffix == ".ipynb": + try: + return process_notebook(self.path) + except Exception as exc: + return f"Error processing notebook: {exc}" + + # Try multiple encodings + for encoding in _get_encoding_list(): + try: + with self.path.open(encoding=encoding) as f: + return f.read() + except UnicodeDecodeError: + continue + except OSError as exc: + return f"Error reading file: {exc}" + + return "Error: Unable to decode file with available encodings" diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 24b65b39..bdfbdbf6 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -6,7 +6,7 @@ from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats -from gitingest.output_formatters import format_directory, format_single_file +from gitingest.output_formatters import format_node from gitingest.query_parsing import ParsedQuery from gitingest.utils.ingestion_utils import _should_exclude, 
_should_include from gitingest.utils.path_utils import _is_safe_symlink @@ -38,7 +38,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: Raises ------ ValueError - If the specified path cannot be found or if the file is not a text file. + If the path cannot be found, is not a file, or the file has no content. """ subpath = Path(query.subpath.strip("/")).as_posix() path = query.local_path / subpath @@ -63,7 +63,11 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: path_str=str(relative_path), path=path, ) - return format_single_file(file_node, query) + + if not file_node.content: + raise ValueError(f"File {file_node.name} has no content") + + return format_node(file_node, query) root_node = FileSystemNode( name=path.name, @@ -80,7 +84,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: stats=stats, ) - return format_directory(root_node, query) + return format_node(root_node, query) def apply_gitingest_file(path: Path, query: ParsedQuery) -> None: diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index c9228361..8d5a278c 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -1,4 +1,4 @@ -""" Functions to ingest and analyze a codebase directory or single file. """ +"""Functions to ingest and analyze a codebase directory or single file.""" from typing import Optional, Tuple @@ -8,105 +8,109 @@ from gitingest.query_parsing import ParsedQuery -def _create_summary_string(query: ParsedQuery, node: FileSystemNode) -> str: +def format_node(node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str]: """ - Create a summary string with file counts and content size. + Generate a summary, directory structure, and file contents for a given file system node. - This function generates a summary of the repository's contents, including the number - of files analyzed, the total content size, and other relevant details based on the query parameters. 
+ If the node represents a directory, the function will recursively process its contents. Parameters ---------- + node : FileSystemNode + The file system node to be summarized. query : ParsedQuery The parsed query object containing information about the repository and query parameters. - node : FileSystemNode - The root node representing the directory structure, including file and directory counts. Returns ------- - str - Summary string containing details such as repository name, file count, and other query-specific information. + Tuple[str, str, str] + A tuple containing the summary, directory structure, and file contents. """ - if query.user_name: - summary = f"Repository: {query.user_name}/{query.repo_name}\n" + is_single_file = node.type == FileSystemNodeType.FILE + summary = _create_summary_prefix(query, single_file=is_single_file) + + if node.type == FileSystemNodeType.DIRECTORY: + summary += f"Files analyzed: {node.file_count}\n" else: - # Local scenario - summary = f"Directory: {query.slug}\n" + summary += f"File: {node.name}\n" + summary += f"Lines: {len(node.content.splitlines()):,}\n" - if query.commit: - summary += f"Commit: {query.commit}\n" - elif query.branch and query.branch not in ("main", "master"): - summary += f"Branch: {query.branch}\n" + tree = "Directory structure:\n" + _create_tree_structure(query, node) + _create_tree_structure(query, node) - if query.subpath != "/": - summary += f"Subpath: {query.subpath}\n" + content = _gather_file_contents(node) - summary += f"Files analyzed: {node.file_count}\n" - # TODO: Do we want to add the total number of lines? 
+ token_estimate = _format_token_count(tree + content) + if token_estimate: + summary += f"\nEstimated tokens: {token_estimate}" - return summary + return summary, tree, content -def format_single_file(file_node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str]: +def _create_summary_prefix(query: ParsedQuery, single_file: bool = False) -> str: """ - Format a single file for display. + Create a prefix string for summarizing a repository or local directory. - This function generates a summary, tree structure, and content for a single file. - It includes information such as the repository name, commit/branch, file name, - line count, and estimated token count. + Includes repository name (if provided), commit/branch details, and subpath if relevant. Parameters ---------- - file_node : FileSystemNode - The node representing the file to format. query : ParsedQuery The parsed query object containing information about the repository and query parameters. + single_file : bool + A flag indicating whether the summary is for a single file, by default False. Returns ------- - Tuple[str, str, str] - A tuple containing the summary, tree structure, and file content. - - Raises - ------ - ValueError - If the file has no content. + str + A summary prefix string containing repository, commit, branch, and subpath details. 
""" - if not file_node.content: - raise ValueError(f"File {file_node.name} has no content") + parts = [] - summary = f"Repository: {query.user_name}/{query.repo_name}\n" + if query.user_name: + parts.append(f"Repository: {query.user_name}/{query.repo_name}") + else: + # Local scenario + parts.append(f"Directory: {query.slug}") if query.commit: - summary += f"Commit: {query.commit}\n" + parts.append(f"Commit: {query.commit}") elif query.branch and query.branch not in ("main", "master"): - summary += f"Branch: {query.branch}\n" + parts.append(f"Branch: {query.branch}") - summary += f"File: {file_node.name}\n" - summary += f"Lines: {len(file_node.content.splitlines()):,}\n" + if query.subpath != "/" and not single_file: + parts.append(f"Subpath: {query.subpath}") - files_content = file_node.content_string + return "\n".join(parts) + "\n" - tree = "Directory structure:\n└── " + file_node.name - formatted_tokens = _generate_token_string(files_content) - if formatted_tokens: - summary += f"\nEstimated tokens: {formatted_tokens}" +def _gather_file_contents(node: FileSystemNode) -> str: + """ + Recursively gather contents of all files under the given node. - return summary, tree, files_content + This function recursively processes a directory node and gathers the contents of all files + under that node. It returns the concatenated content of all files as a single string. + Parameters + ---------- + node : FileSystemNode + The current directory or file node being processed. -def _get_files_content(node: FileSystemNode) -> str: + Returns + ------- + str + The concatenated content of all files under the given node. 
+ """ if node.type == FileSystemNodeType.FILE: return node.content_string - if node.type == FileSystemNodeType.DIRECTORY: - return "\n".join(_get_files_content(child) for child in node.children) - return "" + + # Recursively gather contents of all files under the current directory + return "\n".join(_gather_file_contents(child) for child in node.children) def _create_tree_structure(query: ParsedQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: """ - Create a tree-like string representation of the file structure. + Generate a tree-like string representation of the file structure. This function generates a string representation of the directory structure, formatted as a tree with appropriate indentation for nested directories and files. @@ -127,36 +131,36 @@ def _create_tree_structure(query: ParsedQuery, node: FileSystemNode, prefix: str str A string representing the directory structure formatted as a tree. """ - tree = "" - if not node.name: + # If no name is present, use the slug as the top-level directory name node.name = query.slug - if node.name: - current_prefix = "└── " if is_last else "├── " - name = node.name + "/" if node.type == FileSystemNodeType.DIRECTORY else node.name - tree += prefix + current_prefix + name + "\n" + tree_str = "" + current_prefix = "└── " if is_last else "├── " + # Indicate directories with a trailing slash + display_name = node.name if node.type == FileSystemNodeType.DIRECTORY: - # Adjust prefix only if we added a node name - new_prefix = prefix + (" " if is_last else "│ ") if node.name else prefix - children = node.children - for i, child in enumerate(children): - tree += _create_tree_structure(query, node=child, prefix=new_prefix, is_last=i == len(children) - 1) + display_name += "/" + + tree_str += f"{prefix}{current_prefix}{display_name}\n" - return tree + if node.type == FileSystemNodeType.DIRECTORY and node.children: + prefix += " " if is_last else "│ " + for i, child in enumerate(node.children): + 
tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1) + return tree_str -def _generate_token_string(context_string: str) -> Optional[str]: +def _format_token_count(text: str) -> Optional[str]: """ - Return the number of tokens in a text string. + Return a human-readable string representing the token count of the given text. - This function estimates the number of tokens in a given text string using the `tiktoken` - library. It returns the number of tokens in a human-readable format (e.g., '1.2k', '1.2M'). + E.g., '120' -> '120', '1200' -> '1.2k', '1200000' -> '1.2M'. Parameters ---------- - context_string : str + text : str The text string for which the token count is to be estimated. Returns @@ -166,45 +170,15 @@ def _generate_token_string(context_string: str) -> Optional[str]: """ try: encoding = tiktoken.get_encoding("cl100k_base") - total_tokens = len(encoding.encode(context_string, disallowed_special=())) + total_tokens = len(encoding.encode(text, disallowed_special=())) except (ValueError, UnicodeEncodeError) as exc: print(exc) return None - if total_tokens > 1_000_000: + if total_tokens >= 1_000_000: return f"{total_tokens / 1_000_000:.1f}M" - if total_tokens > 1_000: + if total_tokens >= 1_000: return f"{total_tokens / 1_000:.1f}k" return str(total_tokens) - - -def format_directory(root_node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str]: - """ - Ingest an entire directory and return its summary, directory structure, and file contents. - - This function processes a directory, extracts its contents, and generates a summary, - directory structure, and file content. It recursively processes subdirectories as well. - - Parameters - ---------- - root_node : FileSystemNode - The root node representing the directory to process. - query : ParsedQuery - The parsed query object containing information about the repository and query parameters. 
- - Returns - ------- - Tuple[str, str, str] - A tuple containing the summary, directory structure, and file contents. - """ - summary = _create_summary_string(query, node=root_node) - tree = "Directory structure:\n" + _create_tree_structure(query, root_node) - files_content = _get_files_content(root_node) - - formatted_tokens = _generate_token_string(tree + files_content) - if formatted_tokens: - summary += f"\nEstimated tokens: {formatted_tokens}" - - return summary, tree, files_content From 2c593bf8d14155b8c86e6c17ab2653e57610302a Mon Sep 17 00:00:00 2001 From: cyclotruc Date: Fri, 7 Mar 2025 20:33:57 +0000 Subject: [PATCH 02/34] add comments --- src/gitingest/filesystem_schema.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gitingest/filesystem_schema.py b/src/gitingest/filesystem_schema.py index 77d0e464..61f60a95 100644 --- a/src/gitingest/filesystem_schema.py +++ b/src/gitingest/filesystem_schema.py @@ -11,7 +11,7 @@ from gitingest.utils.notebook_utils import process_notebook from gitingest.utils.textfile_checker_utils import is_textfile -SEPARATOR = "=" * 48 +SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48 class FileSystemNodeType(Enum): @@ -69,6 +69,7 @@ def sort_children(self) -> None: raise ValueError("Cannot sort children of a non-directory node") def _sort_key(child: FileSystemNode) -> tuple[int, str]: + # returns the priority order for the sort function, 0 is first # Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir name = child.name.lower() if child.type == FileSystemNodeType.FILE: From b098bb453477648f2999f58afe38a4d510795600 Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Tue, 11 Mar 2025 00:56:58 +0100 Subject: [PATCH 03/34] Refactor/pydantic(#226) --- pyproject.toml | 1 + src/gitingest/__init__.py | 6 +- src/gitingest/cli.py | 2 +- src/gitingest/cloning.py | 34 +---- .../{repository_ingest.py => entrypoint.py} | 22 +-- 
src/gitingest/ingestion.py | 14 +- src/gitingest/ingestion_schema.py | 90 ++++++++++++ src/gitingest/output_formatters.py | 14 +- src/gitingest/query_parsing.py | 96 +++--------- src/server/query_processor.py | 22 +-- tests/conftest.py | 14 +- tests/query_parser/test_git_host_agnostic.py | 20 +-- tests/query_parser/test_query_parser.py | 138 +++++++++--------- tests/test_ingestion.py | 4 +- tests/test_repository_clone.py | 72 ++++----- 15 files changed, 281 insertions(+), 268 deletions(-) rename src/gitingest/{repository_ingest.py => entrypoint.py} (88%) create mode 100644 src/gitingest/ingestion_schema.py diff --git a/pyproject.toml b/pyproject.toml index 50a746cb..6eb4cedc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,5 +76,6 @@ pythonpath = ["src"] testpaths = ["tests/"] python_files = "test_*.py" asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" python_classes = "Test*" python_functions = "test_*" diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index c291fd1b..684ec14f 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,8 +1,8 @@ """ Gitingest: A package for ingesting data from Git repositories. 
""" -from gitingest.cloning import clone_repo +from gitingest.cloning import clone +from gitingest.entrypoint import ingest, ingest_async from gitingest.ingestion import ingest_query from gitingest.query_parsing import parse_query -from gitingest.repository_ingest import ingest, ingest_async -__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"] +__all__ = ["ingest_query", "clone", "parse_query", "ingest", "ingest_async"] diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 73b49b67..d5c5c4f5 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -8,7 +8,7 @@ import click from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME -from gitingest.repository_ingest import ingest_async +from gitingest.entrypoint import ingest_async @click.command() diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index e24d5230..8c717b38 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -2,47 +2,17 @@ import asyncio import os -from dataclasses import dataclass from pathlib import Path from typing import List, Optional, Tuple +from gitingest.ingestion_schema import CloneConfig from gitingest.utils.timeout_wrapper import async_timeout TIMEOUT: int = 60 -@dataclass -class CloneConfig: - """ - Configuration for cloning a Git repository. - - This class holds the necessary parameters for cloning a repository to a local path, including - the repository's URL, the target local path, and optional parameters for a specific commit or branch. - - Attributes - ---------- - url : str - The URL of the Git repository to clone. - local_path : str - The local directory where the repository will be cloned. - commit : str, optional - The specific commit hash to check out after cloning (default is None). - branch : str, optional - The branch to clone (default is None). - subpath : str - The subpath to clone from the repository (default is "/"). 
- """ - - url: str - local_path: str - commit: Optional[str] = None - branch: Optional[str] = None - subpath: str = "/" - blob: bool = False - - @async_timeout(TIMEOUT) -async def clone_repo(config: CloneConfig) -> None: +async def clone(config: CloneConfig) -> None: """ Clone a repository to a local path based on the provided configuration. diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/entrypoint.py similarity index 88% rename from src/gitingest/repository_ingest.py rename to src/gitingest/entrypoint.py index f30d6001..776a6397 100644 --- a/src/gitingest/repository_ingest.py +++ b/src/gitingest/entrypoint.py @@ -5,10 +5,10 @@ import shutil from typing import Optional, Set, Tuple, Union -from gitingest.cloning import clone_repo +from gitingest.cloning import clone from gitingest.config import TMP_BASE_PATH from gitingest.ingestion import ingest_query -from gitingest.query_parsing import ParsedQuery, parse_query +from gitingest.query_parsing import IngestionQuery, parse_query async def ingest_async( @@ -53,12 +53,12 @@ async def ingest_async( Raises ------ TypeError - If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type. + If `clone` does not return a coroutine, or if the `source` is of an unsupported type. 
""" repo_cloned = False try: - parsed_query: ParsedQuery = await parse_query( + query: IngestionQuery = await parse_query( source=source, max_file_size=max_file_size, from_web=False, @@ -66,12 +66,12 @@ async def ingest_async( ignore_patterns=exclude_patterns, ) - if parsed_query.url: - selected_branch = branch if branch else parsed_query.branch # prioritize branch argument - parsed_query.branch = selected_branch + if query.url: + selected_branch = branch if branch else query.branch # prioritize branch argument + query.branch = selected_branch - clone_config = parsed_query.extact_clone_config() - clone_coroutine = clone_repo(clone_config) + clone_config = query.extract_clone_config() + clone_coroutine = clone(clone_config) if inspect.iscoroutine(clone_coroutine): if asyncio.get_event_loop().is_running(): @@ -79,11 +79,11 @@ async def ingest_async( else: asyncio.run(clone_coroutine) else: - raise TypeError("clone_repo did not return a coroutine as expected.") + raise TypeError("clone did not return a coroutine as expected.") repo_cloned = True - summary, tree, content = ingest_query(parsed_query) + summary, tree, content = ingest_query(query) if output is not None: with open(output, "w", encoding="utf-8") as f: diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index bdfbdbf6..ec5eb754 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -7,7 +7,7 @@ from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.output_formatters import format_node -from gitingest.query_parsing import ParsedQuery +from gitingest.query_parsing import IngestionQuery from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.path_utils import _is_safe_symlink @@ -17,7 +17,7 @@ import tomli as tomllib -def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: +def 
ingest_query(query: IngestionQuery) -> Tuple[str, str, str]: """ Run the ingestion process for a parsed query. @@ -27,7 +27,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: Parameters ---------- - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. Returns @@ -87,7 +87,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: return format_node(root_node, query) -def apply_gitingest_file(path: Path, query: ParsedQuery) -> None: +def apply_gitingest_file(path: Path, query: IngestionQuery) -> None: """ Apply the .gitingest file to the query object. @@ -98,7 +98,7 @@ def apply_gitingest_file(path: Path, query: ParsedQuery) -> None: ---------- path : Path The path of the directory to ingest. - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. It should have an attribute `ignore_patterns` which is either None or a set of strings. """ @@ -154,7 +154,7 @@ def apply_gitingest_file(path: Path, query: ParsedQuery) -> None: def _process_node( node: FileSystemNode, - query: ParsedQuery, + query: IngestionQuery, stats: FileSystemStats, ) -> None: """ @@ -167,7 +167,7 @@ def _process_node( ---------- node : FileSystemNode The current directory or file node being processed. - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. stats : FileSystemStats Statistics tracking object for the total file count and size. diff --git a/src/gitingest/ingestion_schema.py b/src/gitingest/ingestion_schema.py new file mode 100644 index 00000000..e28f6470 --- /dev/null +++ b/src/gitingest/ingestion_schema.py @@ -0,0 +1,90 @@ +""" This module contains the dataclasses for the ingestion process. 
""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Set + +from pydantic import BaseModel, Field + +from gitingest.config import MAX_FILE_SIZE + + +@dataclass +class CloneConfig: + """ + Configuration for cloning a Git repository. + + This class holds the necessary parameters for cloning a repository to a local path, including + the repository's URL, the target local path, and optional parameters for a specific commit or branch. + + Attributes + ---------- + url : str + The URL of the Git repository to clone. + local_path : str + The local directory where the repository will be cloned. + commit : str, optional + The specific commit hash to check out after cloning (default is None). + branch : str, optional + The branch to clone (default is None). + subpath : str + The subpath to clone from the repository (default is "/"). + """ + + url: str + local_path: str + commit: Optional[str] = None + branch: Optional[str] = None + subpath: str = "/" + blob: bool = False + + +class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes + """ + Pydantic model to store the parsed details of the repository or file path. + """ + + user_name: Optional[str] = None + repo_name: Optional[str] = None + local_path: Path + url: Optional[str] = None + slug: str + id: str + subpath: str = "/" + type: Optional[str] = None + branch: Optional[str] = None + commit: Optional[str] = None + max_file_size: int = Field(default=MAX_FILE_SIZE) + ignore_patterns: Optional[Set[str]] = None + include_patterns: Optional[Set[str]] = None + + class Config: + """Pydantic model configuration.""" + + arbitrary_types_allowed = True + + def extract_clone_config(self) -> CloneConfig: + """ + Extract the relevant fields for the CloneConfig object. + + Returns + ------- + CloneConfig + A CloneConfig object containing the relevant fields. + + Raises + ------ + ValueError + If the 'url' parameter is not provided. 
+ """ + if not self.url: + raise ValueError("The 'url' parameter is required.") + + return CloneConfig( + url=self.url, + local_path=str(self.local_path), + commit=self.commit, + branch=self.branch, + subpath=self.subpath, + blob=self.type == "blob", + ) diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index 8d5a278c..5f747387 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -5,10 +5,10 @@ import tiktoken from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType -from gitingest.query_parsing import ParsedQuery +from gitingest.query_parsing import IngestionQuery -def format_node(node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str]: +def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, str]: """ Generate a summary, directory structure, and file contents for a given file system node. @@ -18,7 +18,7 @@ def format_node(node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str ---------- node : FileSystemNode The file system node to be summarized. - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. Returns @@ -47,7 +47,7 @@ def format_node(node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str return summary, tree, content -def _create_summary_prefix(query: ParsedQuery, single_file: bool = False) -> str: +def _create_summary_prefix(query: IngestionQuery, single_file: bool = False) -> str: """ Create a prefix string for summarizing a repository or local directory. @@ -55,7 +55,7 @@ def _create_summary_prefix(query: ParsedQuery, single_file: bool = False) -> str Parameters ---------- - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. single_file : bool A flag indicating whether the summary is for a single file, by default False. 
@@ -108,7 +108,7 @@ def _gather_file_contents(node: FileSystemNode) -> str: return "\n".join(_gather_file_contents(child) for child in node.children) -def _create_tree_structure(query: ParsedQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: +def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: """ Generate a tree-like string representation of the file structure. @@ -117,7 +117,7 @@ def _create_tree_structure(query: ParsedQuery, node: FileSystemNode, prefix: str Parameters ---------- - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. node : FileSystemNode The current directory or file node being processed. diff --git a/src/gitingest/query_parsing.py b/src/gitingest/query_parsing.py index e2b0e0cf..434220ef 100644 --- a/src/gitingest/query_parsing.py +++ b/src/gitingest/query_parsing.py @@ -3,14 +3,14 @@ import re import uuid import warnings -from dataclasses import dataclass from pathlib import Path from typing import List, Optional, Set, Union from urllib.parse import unquote, urlparse -from gitingest.cloning import CloneConfig, _check_repo_exists, fetch_remote_branch_list -from gitingest.config import MAX_FILE_SIZE, TMP_BASE_PATH +from gitingest.cloning import _check_repo_exists, fetch_remote_branch_list +from gitingest.config import TMP_BASE_PATH from gitingest.exceptions import InvalidPatternError +from gitingest.ingestion_schema import IngestionQuery from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.utils.query_parser_utils import ( KNOWN_GIT_HOSTS, @@ -23,61 +23,13 @@ ) -@dataclass -class ParsedQuery: # pylint: disable=too-many-instance-attributes - """ - Dataclass to store the parsed details of the repository or file path. 
- """ - - user_name: Optional[str] - repo_name: Optional[str] - local_path: Path - url: Optional[str] - slug: str - id: str - subpath: str = "/" - type: Optional[str] = None - branch: Optional[str] = None - commit: Optional[str] = None - max_file_size: int = MAX_FILE_SIZE - ignore_patterns: Optional[Set[str]] = None - include_patterns: Optional[Set[str]] = None - pattern_type: Optional[str] = None - - def extact_clone_config(self) -> CloneConfig: - """ - Extract the relevant fields for the CloneConfig object. - - Returns - ------- - CloneConfig - A CloneConfig object containing the relevant fields. - - Raises - ------ - ValueError - If the 'url' parameter is not provided. - """ - if not self.url: - raise ValueError("The 'url' parameter is required.") - - return CloneConfig( - url=self.url, - local_path=str(self.local_path), - commit=self.commit, - branch=self.branch, - subpath=self.subpath, - blob=self.type == "blob", - ) - - async def parse_query( source: str, max_file_size: int, from_web: bool, include_patterns: Optional[Union[str, Set[str]]] = None, ignore_patterns: Optional[Union[str, Set[str]]] = None, -) -> ParsedQuery: +) -> IngestionQuery: """ Parse the input source (URL or path) to extract relevant details for the query. @@ -100,17 +52,17 @@ async def parse_query( Returns ------- - ParsedQuery + IngestionQuery A dataclass object containing the parsed details of the repository or file path. 
""" # Determine the parsing method based on the source type if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): # We either have a full URL or a domain-less slug - parsed_query = await _parse_remote_repo(source) + query = await _parse_remote_repo(source) else: # Local path scenario - parsed_query = _parse_local_dir_path(source) + query = _parse_local_dir_path(source) # Combine default ignore patterns + custom patterns ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy() @@ -125,24 +77,24 @@ async def parse_query( else: parsed_include = None - return ParsedQuery( - user_name=parsed_query.user_name, - repo_name=parsed_query.repo_name, - url=parsed_query.url, - subpath=parsed_query.subpath, - local_path=parsed_query.local_path, - slug=parsed_query.slug, - id=parsed_query.id, - type=parsed_query.type, - branch=parsed_query.branch, - commit=parsed_query.commit, + return IngestionQuery( + user_name=query.user_name, + repo_name=query.repo_name, + url=query.url, + subpath=query.subpath, + local_path=query.local_path, + slug=query.slug, + id=query.id, + type=query.type, + branch=query.branch, + commit=query.commit, max_file_size=max_file_size, ignore_patterns=ignore_patterns_set, include_patterns=parsed_include, ) -async def _parse_remote_repo(source: str) -> ParsedQuery: +async def _parse_remote_repo(source: str) -> IngestionQuery: """ Parse a repository URL into a structured query dictionary. @@ -158,7 +110,7 @@ async def _parse_remote_repo(source: str) -> ParsedQuery: Returns ------- - ParsedQuery + IngestionQuery A dictionary containing the parsed details of the repository. 
""" source = unquote(source) @@ -190,7 +142,7 @@ async def _parse_remote_repo(source: str) -> ParsedQuery: local_path = TMP_BASE_PATH / _id / slug url = f"https://{host}/{user_name}/{repo_name}" - parsed = ParsedQuery( + parsed = IngestionQuery( user_name=user_name, repo_name=repo_name, url=url, @@ -307,7 +259,7 @@ def _parse_patterns(pattern: Union[str, Set[str]]) -> Set[str]: return {_normalize_pattern(p) for p in parsed_patterns} -def _parse_local_dir_path(path_str: str) -> ParsedQuery: +def _parse_local_dir_path(path_str: str) -> IngestionQuery: """ Parse the given file path into a structured query dictionary. @@ -318,12 +270,12 @@ def _parse_local_dir_path(path_str: str) -> ParsedQuery: Returns ------- - ParsedQuery + IngestionQuery A dictionary containing the parsed details of the file path. """ path_obj = Path(path_str).resolve() slug = path_obj.name if path_str == "." else path_str.strip("/") - return ParsedQuery( + return IngestionQuery( user_name=None, repo_name=None, url=None, diff --git a/src/server/query_processor.py b/src/server/query_processor.py index f6cdcea2..2e751479 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -5,9 +5,9 @@ from fastapi import Request from starlette.templating import _TemplateResponse -from gitingest.cloning import clone_repo +from gitingest.cloning import clone from gitingest.ingestion import ingest_query -from gitingest.query_parsing import ParsedQuery, parse_query +from gitingest.query_parsing import IngestionQuery, parse_query from server.server_config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE, templates from server.server_utils import Colors, log_slider_to_size @@ -74,25 +74,25 @@ async def process_query( } try: - parsed_query: ParsedQuery = await parse_query( + query: IngestionQuery = await parse_query( source=input_text, max_file_size=max_file_size, from_web=True, include_patterns=include_patterns, ignore_patterns=exclude_patterns, ) - if not parsed_query.url: + if not query.url: raise 
ValueError("The 'url' parameter is required.") - clone_config = parsed_query.extact_clone_config() - await clone_repo(clone_config) - summary, tree, content = ingest_query(parsed_query) + clone_config = query.extract_clone_config() + await clone(clone_config) + summary, tree, content = ingest_query(query) with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: f.write(tree + "\n" + content) except Exception as exc: # hack to print error message when query is not defined - if "query" in locals() and parsed_query is not None and isinstance(parsed_query, dict): - _print_error(parsed_query["url"], exc, max_file_size, pattern_type, pattern) + if "query" in locals() and query is not None and isinstance(query, dict): + _print_error(query["url"], exc, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{exc}{Colors.END}") @@ -111,7 +111,7 @@ async def process_query( ) _print_success( - url=parsed_query.url, + url=query.url, max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, @@ -124,7 +124,7 @@ async def process_query( "summary": summary, "tree": tree, "content": content, - "ingest_id": parsed_query.id, + "ingest_id": query.id, } ) diff --git a/tests/conftest.py b/tests/conftest.py index 86925005..33cf4df3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,24 +11,24 @@ import pytest -from gitingest.query_parsing import ParsedQuery +from gitingest.query_parsing import IngestionQuery WriteNotebookFunc = Callable[[str, Dict[str, Any]], Path] @pytest.fixture -def sample_query() -> ParsedQuery: +def sample_query() -> IngestionQuery: """ - Provide a default `ParsedQuery` object for use in tests. + Provide a default `IngestionQuery` object for use in tests. - This fixture returns a `ParsedQuery` pre-populated with typical fields and some default ignore patterns. 
+ This fixture returns a `IngestionQuery` pre-populated with typical fields and some default ignore patterns. Returns ------- - ParsedQuery - The sample `ParsedQuery` object. + IngestionQuery + The sample `IngestionQuery` object. """ - return ParsedQuery( + return IngestionQuery( user_name="test_user", repo_name="test_repo", url=None, diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index 61fb512e..0039d220 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -82,14 +82,14 @@ async def test_parse_query_without_host( Then the parser should correctly identify the user, repo, canonical URL, and other default fields. """ for url in urls: - parsed_query = await parse_query(url, max_file_size=50, from_web=True) + query = await parse_query(url, max_file_size=50, from_web=True) - assert parsed_query.user_name == expected_user - assert parsed_query.repo_name == expected_repo - assert parsed_query.url == expected_url - assert parsed_query.slug == f"{expected_user}-{expected_repo}" - assert parsed_query.id is not None - assert parsed_query.subpath == "/" - assert parsed_query.branch is None - assert parsed_query.commit is None - assert parsed_query.type is None + assert query.user_name == expected_user + assert query.repo_name == expected_repo + assert query.url == expected_url + assert query.slug == f"{expected_user}-{expected_repo}" + assert query.id is not None + assert query.subpath == "/" + assert query.branch is None + assert query.commit is None + assert query.type is None diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index 51beb8d5..a01b5e0f 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -32,11 +32,11 @@ async def test_parse_url_valid_https() -> None: "https://gist.github.com/user/repo", ] for url in test_cases: - parsed_query = await 
_parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.url == url + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.url == url @pytest.mark.asyncio @@ -57,11 +57,11 @@ async def test_parse_url_valid_http() -> None: "http://gist.github.com/user/repo", ] for url in test_cases: - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.slug == "user-repo" + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.slug == "user-repo" @pytest.mark.asyncio @@ -88,13 +88,13 @@ async def test_parse_query_basic(url): When `parse_query` is called, Then user/repo, URL, and ignore patterns should be parsed correctly. """ - parsed_query = await parse_query(source=url, max_file_size=50, from_web=True, ignore_patterns="*.txt") + query = await parse_query(source=url, max_file_size=50, from_web=True, ignore_patterns="*.txt") - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.url == url - assert parsed_query.ignore_patterns - assert "*.txt" in parsed_query.ignore_patterns + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.url == url + assert query.ignore_patterns + assert "*.txt" in query.ignore_patterns @pytest.mark.asyncio @@ -107,10 +107,10 @@ async def test_parse_query_mixed_case() -> None: Then the user and repo names should be normalized to lowercase. 
""" url = "Https://GitHub.COM/UsEr/rEpO" - parsed_query = await parse_query(url, max_file_size=50, from_web=True) + query = await parse_query(url, max_file_size=50, from_web=True) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" + assert query.user_name == "user" + assert query.repo_name == "repo" @pytest.mark.asyncio @@ -123,10 +123,10 @@ async def test_parse_query_include_pattern() -> None: Then the include pattern should be set, and default ignore patterns remain applied. """ url = "https://github.com/user/repo" - parsed_query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py") + query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py") - assert parsed_query.include_patterns == {"*.py"} - assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + assert query.include_patterns == {"*.py"} + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @pytest.mark.asyncio @@ -157,12 +157,12 @@ async def test_parse_url_with_subpaths() -> None: mock_run_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") with patch("gitingest.cloning.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.branch == "main" - assert parsed_query.subpath == "/subdir/file" + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.branch == "main" + assert query.subpath == "/subdir/file" @pytest.mark.asyncio @@ -216,10 +216,10 @@ async def test_parse_query_with_large_file_size() -> None: Then `max_file_size` should be set correctly and default ignore patterns remain unchanged. 
""" url = "https://github.com/user/repo" - parsed_query = await parse_query(url, max_file_size=10**9, from_web=True) + query = await parse_query(url, max_file_size=10**9, from_web=True) - assert parsed_query.max_file_size == 10**9 - assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + assert query.max_file_size == 10**9 + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @pytest.mark.asyncio @@ -232,10 +232,10 @@ async def test_parse_query_empty_patterns() -> None: Then include_patterns becomes None and default ignore patterns apply. """ url = "https://github.com/user/repo" - parsed_query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") + query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") - assert parsed_query.include_patterns is None - assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + assert query.include_patterns is None + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @pytest.mark.asyncio @@ -248,7 +248,7 @@ async def test_parse_query_include_and_ignore_overlap() -> None: Then "*.py" should be removed from ignore patterns. 
""" url = "https://github.com/user/repo" - parsed_query = await parse_query( + query = await parse_query( url, max_file_size=50, from_web=True, @@ -256,10 +256,10 @@ async def test_parse_query_include_and_ignore_overlap() -> None: ignore_patterns={"*.py", "*.txt"}, ) - assert parsed_query.include_patterns == {"*.py"} - assert parsed_query.ignore_patterns is not None - assert "*.py" not in parsed_query.ignore_patterns - assert "*.txt" in parsed_query.ignore_patterns + assert query.include_patterns == {"*.py"} + assert query.ignore_patterns is not None + assert "*.py" not in query.ignore_patterns + assert "*.txt" in query.ignore_patterns @pytest.mark.asyncio @@ -272,12 +272,12 @@ async def test_parse_query_local_path() -> None: Then the local path should be set, id generated, and slug formed accordingly. """ path = "/home/user/project" - parsed_query = await parse_query(path, max_file_size=100, from_web=False) + query = await parse_query(path, max_file_size=100, from_web=False) tail = Path("home/user/project") - assert parsed_query.local_path.parts[-len(tail.parts) :] == tail.parts - assert parsed_query.id is not None - assert parsed_query.slug == "home/user/project" + assert query.local_path.parts[-len(tail.parts) :] == tail.parts + assert query.id is not None + assert query.slug == "home/user/project" @pytest.mark.asyncio @@ -290,11 +290,11 @@ async def test_parse_query_relative_path() -> None: Then local_path resolves relatively, and slug ends with "project". 
""" path = "./project" - parsed_query = await parse_query(path, max_file_size=100, from_web=False) + query = await parse_query(path, max_file_size=100, from_web=False) tail = Path("project") - assert parsed_query.local_path.parts[-len(tail.parts) :] == tail.parts - assert parsed_query.slug.endswith("project") + assert query.local_path.parts[-len(tail.parts) :] == tail.parts + assert query.slug.endswith("project") @pytest.mark.asyncio @@ -336,11 +336,11 @@ async def test_parse_url_branch_and_commit_distinction(url: str, expected_branch with patch("gitingest.cloning.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) # Verify that `branch` and `commit` match our expectations - assert parsed_query.branch == expected_branch - assert parsed_query.commit == expected_commit + assert query.branch == expected_branch + assert query.commit == expected_commit @pytest.mark.asyncio @@ -353,10 +353,10 @@ async def test_parse_query_uuid_uniqueness() -> None: Then each call should produce a different query id. """ path = "/home/user/project" - parsed_query_1 = await parse_query(path, max_file_size=100, from_web=False) - parsed_query_2 = await parse_query(path, max_file_size=100, from_web=False) + query_1 = await parse_query(path, max_file_size=100, from_web=False) + query_2 = await parse_query(path, max_file_size=100, from_web=False) - assert parsed_query_1.id != parsed_query_2.id + assert query_1.id != query_2.id @pytest.mark.asyncio @@ -369,11 +369,11 @@ async def test_parse_url_with_query_and_fragment() -> None: Then those parts should be stripped, leaving a clean user/repo URL. 
""" url = "https://github.com/user/repo?arg=value#fragment" - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.url == "https://github.com/user/repo" # URL should be cleaned + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.url == "https://github.com/user/repo" # URL should be cleaned @pytest.mark.asyncio @@ -400,17 +400,17 @@ async def test_parse_query_with_branch() -> None: Then the branch should be identified, subpath set, and commit remain None. """ url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" - parsed_query = await parse_query(url, max_file_size=10**9, from_web=True) + query = await parse_query(url, max_file_size=10**9, from_web=True) - assert parsed_query.user_name == "pandas-dev" - assert parsed_query.repo_name == "pandas" - assert parsed_query.url == "https://github.com/pandas-dev/pandas" - assert parsed_query.slug == "pandas-dev-pandas" - assert parsed_query.id is not None - assert parsed_query.subpath == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" - assert parsed_query.branch == "2.2.x" - assert parsed_query.commit is None - assert parsed_query.type == "blob" + assert query.user_name == "pandas-dev" + assert query.repo_name == "pandas" + assert query.url == "https://github.com/pandas-dev/pandas" + assert query.slug == "pandas-dev-pandas" + assert query.id is not None + assert query.subpath == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" + assert query.branch == "2.2.x" + assert query.commit is None + assert query.type == "blob" @pytest.mark.asyncio @@ -439,10 +439,10 @@ async def test_parse_repo_source_with_failed_git_command(url, expected_branch, e "git ls-remote --heads https://github.com/user/repo", ): - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - 
assert parsed_query.branch == expected_branch - assert parsed_query.subpath == expected_subpath + assert query.branch == expected_branch + assert query.subpath == expected_subpath @pytest.mark.asyncio @@ -473,7 +473,7 @@ async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, ) mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch"] - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.branch == expected_branch - assert parsed_query.subpath == expected_subpath + assert query.branch == expected_branch + assert query.subpath == expected_subpath diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 901646d1..3e991f8f 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -8,10 +8,10 @@ from pathlib import Path from gitingest.ingestion import ingest_query -from gitingest.query_parsing import ParsedQuery +from gitingest.query_parsing import IngestionQuery -def test_run_ingest_query(temp_directory: Path, sample_query: ParsedQuery) -> None: +def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> None: """ Test `ingest_query` to ensure it processes the directory and returns expected results. diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index fcf61631..54f9f986 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -12,17 +12,17 @@ import pytest -from gitingest.cloning import CloneConfig, _check_repo_exists, clone_repo +from gitingest.cloning import CloneConfig, _check_repo_exists, clone from gitingest.exceptions import AsyncTimeoutError @pytest.mark.asyncio -async def test_clone_repo_with_commit() -> None: +async def test_clone_with_commit() -> None: """ Test cloning a repository with a specific commit hash. 
Given a valid URL and a commit hash: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned and checked out at that commit. """ clone_config = CloneConfig( @@ -38,19 +38,19 @@ async def test_clone_repo_with_commit() -> None: mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process - await clone_repo(clone_config) + await clone(clone_config) mock_check.assert_called_once_with(clone_config.url) assert mock_exec.call_count == 2 # Clone and checkout calls @pytest.mark.asyncio -async def test_clone_repo_without_commit() -> None: +async def test_clone_without_commit() -> None: """ Test cloning a repository when no commit hash is provided. Given a valid URL and no commit hash: - When `clone_repo` is called, + When `clone` is called, Then only the clone operation should be performed (no checkout). """ query = CloneConfig( @@ -66,19 +66,19 @@ async def test_clone_repo_without_commit() -> None: mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process - await clone_repo(query) + await clone(query) mock_check.assert_called_once_with(query.url) assert mock_exec.call_count == 1 # Only clone call @pytest.mark.asyncio -async def test_clone_repo_nonexistent_repository() -> None: +async def test_clone_nonexistent_repository() -> None: """ Test cloning a nonexistent repository URL. Given an invalid or nonexistent URL: - When `clone_repo` is called, + When `clone` is called, Then a ValueError should be raised with an appropriate error message. 
""" clone_config = CloneConfig( @@ -89,7 +89,7 @@ async def test_clone_repo_nonexistent_repository() -> None: ) with patch("gitingest.cloning._check_repo_exists", return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): - await clone_repo(clone_config) + await clone(clone_config) mock_check.assert_called_once_with(clone_config.url) @@ -126,18 +126,18 @@ async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: @pytest.mark.asyncio -async def test_clone_repo_with_custom_branch() -> None: +async def test_clone_with_custom_branch() -> None: """ Test cloning a repository with a specified custom branch. Given a valid URL and a branch: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned shallowly to that branch. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", branch="feature-branch") with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) mock_exec.assert_called_once_with( "git", @@ -157,7 +157,7 @@ async def test_git_command_failure() -> None: Test cloning when the Git command fails during execution. Given a valid URL, but `_run_command` raises a RuntimeError: - When `clone_repo` is called, + When `clone` is called, Then a RuntimeError should be raised with the correct message. 
""" clone_config = CloneConfig( @@ -167,16 +167,16 @@ async def test_git_command_failure() -> None: with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", side_effect=RuntimeError("Git command failed")): with pytest.raises(RuntimeError, match="Git command failed"): - await clone_repo(clone_config) + await clone(clone_config) @pytest.mark.asyncio -async def test_clone_repo_default_shallow_clone() -> None: +async def test_clone_default_shallow_clone() -> None: """ Test cloning a repository with the default shallow clone options. Given a valid URL and no branch or commit: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned with `--depth=1` and `--single-branch`. """ clone_config = CloneConfig( @@ -186,7 +186,7 @@ async def test_clone_repo_default_shallow_clone() -> None: with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) mock_exec.assert_called_once_with( "git", @@ -199,12 +199,12 @@ async def test_clone_repo_default_shallow_clone() -> None: @pytest.mark.asyncio -async def test_clone_repo_commit_without_branch() -> None: +async def test_clone_commit_without_branch() -> None: """ Test cloning when a commit hash is provided but no branch is specified. Given a valid URL and a commit hash (but no branch): - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned and checked out at that commit. 
""" clone_config = CloneConfig( @@ -214,7 +214,7 @@ async def test_clone_repo_commit_without_branch() -> None: ) with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) assert mock_exec.call_count == 2 # Clone and checkout calls mock_exec.assert_any_call("git", "clone", "--single-branch", clone_config.url, clone_config.local_path) @@ -264,12 +264,12 @@ async def test_check_repo_exists_with_permanent_redirect() -> None: @pytest.mark.asyncio -async def test_clone_repo_with_timeout() -> None: +async def test_clone_with_timeout() -> None: """ Test cloning a repository when a timeout occurs. Given a valid URL, but `_run_command` times out: - When `clone_repo` is called, + When `clone` is called, Then an `AsyncTimeoutError` should be raised to indicate the operation exceeded time limits. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo") @@ -278,7 +278,7 @@ async def test_clone_repo_with_timeout() -> None: with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: mock_exec.side_effect = asyncio.TimeoutError with pytest.raises(AsyncTimeoutError, match="Operation timed out after"): - await clone_repo(clone_config) + await clone(clone_config) @pytest.mark.asyncio @@ -287,7 +287,7 @@ async def test_clone_specific_branch(tmp_path): Test cloning a specific branch of a repository. Given a valid repository URL and a branch name: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned and checked out at that branch. 
""" repo_url = "https://github.com/cyclotruc/gitingest.git" @@ -295,7 +295,7 @@ async def test_clone_specific_branch(tmp_path): local_path = tmp_path / "gitingest" config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) - await clone_repo(config) + await clone(config) # Assertions assert local_path.exists(), "The repository was not cloned successfully." @@ -312,7 +312,7 @@ async def test_clone_branch_with_slashes(tmp_path): Test cloning a branch with slashes in the name. Given a valid repository URL and a branch name with slashes: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned and checked out at that branch. """ repo_url = "https://github.com/user/repo" @@ -322,7 +322,7 @@ async def test_clone_branch_with_slashes(tmp_path): clone_config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) mock_exec.assert_called_once_with( "git", @@ -337,12 +337,12 @@ async def test_clone_branch_with_slashes(tmp_path): @pytest.mark.asyncio -async def test_clone_repo_creates_parent_directory(tmp_path: Path) -> None: +async def test_clone_creates_parent_directory(tmp_path: Path) -> None: """ - Test that clone_repo creates parent directories if they don't exist. + Test that clone creates parent directories if they don't exist. Given a local path with non-existent parent directories: - When `clone_repo` is called, + When `clone` is called, Then it should create the parent directories before attempting to clone. 
""" nested_path = tmp_path / "deep" / "nested" / "path" / "repo" @@ -353,7 +353,7 @@ async def test_clone_repo_creates_parent_directory(tmp_path: Path) -> None: with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) # Verify parent directory was created assert nested_path.parent.exists() @@ -375,14 +375,14 @@ async def test_clone_with_specific_subpath() -> None: Test cloning a repository with a specific subpath. Given a valid repository URL and a specific subpath: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned with sparse checkout enabled and the specified subpath. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", subpath="src/docs") with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) # Verify the clone command includes sparse checkout flags mock_exec.assert_any_call( @@ -408,7 +408,7 @@ async def test_clone_with_commit_and_subpath() -> None: Test cloning a repository with both a specific commit and subpath. Given a valid repository URL, commit hash, and subpath: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned with sparse checkout enabled, checked out at the specific commit, and only include the specified subpath. 
""" @@ -421,7 +421,7 @@ async def test_clone_with_commit_and_subpath() -> None: with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) # Verify the clone command includes sparse checkout flags mock_exec.assert_any_call( From 89d2dc6d24d49af4c8cb78742dae45a54eba5d5d Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Wed, 12 Mar 2025 09:32:05 -0700 Subject: [PATCH 04/34] chore: bump dependencies to address security vulnerabilities (#227) --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 15765e71..629d6f47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ chardet click>=8.0.0 -fastapi[standard] +fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38 python-dotenv slowapi -starlette +starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw tiktoken tomli -uvicorn +uvicorn>=0.11.7 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150 From 945129915a9ea76791b74dd9a7674e76b7d4289b Mon Sep 17 00:00:00 2001 From: StepSecurity Bot Date: Wed, 12 Mar 2025 18:29:50 -0700 Subject: [PATCH 05/34] [StepSecurity] ci: Harden GitHub Actions (#229) --- .github/workflows/ci.yml | 3 +++ .github/workflows/publish.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 710b2561..587b776d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,9 @@ on: pull_request: branches: [ main ] +permissions: + contents: read + jobs: test: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index db4ce3d4..b9403985 100644 --- a/.github/workflows/publish.yml +++ 
b/.github/workflows/publish.yml @@ -5,6 +5,9 @@ on: types: [created] workflow_dispatch: +permissions: + contents: read + jobs: release-build: runs-on: ubuntu-latest From 3cee6725d35f070d967ced6dc5144dc4e55dff82 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Thu, 13 Mar 2025 02:35:18 +0100 Subject: [PATCH 06/34] Remove unused pattern_type parameter from IngestionQuery fixture (#228) --- tests/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 33cf4df3..307b705d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -40,7 +40,6 @@ def sample_query() -> IngestionQuery: max_file_size=1_000_000, ignore_patterns={"*.pyc", "__pycache__", ".git"}, include_patterns=None, - pattern_type="exclude", ) From 31484298b575d938fe4eba1990c3f845a94e6d00 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Thu, 13 Mar 2025 13:04:21 +0100 Subject: [PATCH 07/34] chore: remove unused dependencies and pin versions to address vulnerabilities - Remove chardet and fastapi-analytics references from .pre-commit-config.yaml and requirements - Pin fastapi, starlette, and uvicorn to versions fixing known vulnerabilities - Add pydantic to requirements - Update ingestion_schema to use pydantic's new ConfigDict --- .pre-commit-config.yaml | 23 +++++++++++------------ pyproject.toml | 6 ++++++ requirements.txt | 2 +- src/gitingest/ingestion_schema.py | 7 ++----- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f258f160..800728c9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,16 +95,16 @@ repos: files: ^src/ additional_dependencies: [ - chardet, - click, - fastapi-analytics, + click>=8.0.0, + "fastapi[standard]>=0.109.1", + pydantic, pytest-asyncio, python-dotenv, slowapi, - starlette, + starlette>=0.40.0, tiktoken, tomli, - 
uvicorn, + uvicorn>=0.11.7, ] - id: pylint name: pylint for tests @@ -113,17 +113,16 @@ repos: - --rcfile=tests/.pylintrc additional_dependencies: [ - chardet, - click, - fastapi-analytics, - pytest, + click>=8.0.0, + "fastapi[standard]>=0.109.1", + pydantic, pytest-asyncio, python-dotenv, slowapi, - starlette, - tomli, + starlette>=0.40.0, tiktoken, - uvicorn, + tomli, + uvicorn>=0.11.7, ] - repo: meta diff --git a/pyproject.toml b/pyproject.toml index 6eb4cedc..f280d4a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,9 +6,15 @@ readme = {file = "README.md", content-type = "text/markdown" } requires-python = ">= 3.8" dependencies = [ "click>=8.0.0", + "fastapi[standard]>=0.109.1", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38 + "pydantic", + "python-dotenv", + "slowapi", + "starlette>=0.40.0", # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw "tiktoken", "tomli", "typing_extensions; python_version < '3.10'", + "uvicorn>=0.11.7", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150 ] license = {file = "LICENSE"} diff --git a/requirements.txt b/requirements.txt index 629d6f47..5f8657ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -chardet click>=8.0.0 fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38 +pydantic python-dotenv slowapi starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw diff --git a/src/gitingest/ingestion_schema.py b/src/gitingest/ingestion_schema.py index e28f6470..08efff3c 100644 --- a/src/gitingest/ingestion_schema.py +++ b/src/gitingest/ingestion_schema.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Optional, Set -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from gitingest.config import MAX_FILE_SIZE @@ -58,10 +58,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes ignore_patterns: Optional[Set[str]] = None 
include_patterns: Optional[Set[str]] = None - class Config: - """Pydantic model configuration.""" - - arbitrary_types_allowed = True + model_config = ConfigDict(arbitrary_types_allowed=True) def extract_clone_config(self) -> CloneConfig: """ From 7923fab077433f5d0f3ccfaed8f1d4f3ae87bc30 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 21 Mar 2025 13:12:00 +0100 Subject: [PATCH 08/34] chore: run pre-commit autoupdate --- .pre-commit-config.yaml | 8 ++++---- src/gitingest/__init__.py | 2 +- src/gitingest/cli.py | 2 +- src/gitingest/cloning.py | 2 +- src/gitingest/config.py | 2 +- src/gitingest/entrypoint.py | 2 +- src/gitingest/exceptions.py | 2 +- src/gitingest/filesystem_schema.py | 2 +- src/gitingest/ingestion.py | 2 +- src/gitingest/ingestion_schema.py | 2 +- src/gitingest/query_parsing.py | 2 +- src/gitingest/utils/ignore_patterns.py | 2 +- src/gitingest/utils/ingestion_utils.py | 2 +- src/gitingest/utils/notebook_utils.py | 2 +- src/gitingest/utils/path_utils.py | 2 +- src/gitingest/utils/query_parser_utils.py | 2 +- src/gitingest/utils/textfile_checker_utils.py | 2 +- src/gitingest/utils/timeout_wrapper.py | 2 +- src/server/main.py | 2 +- src/server/query_processor.py | 2 +- src/server/routers/__init__.py | 2 +- src/server/routers/download.py | 2 +- src/server/routers/dynamic.py | 2 +- src/server/routers/index.py | 2 +- src/server/server_config.py | 2 +- src/server/server_utils.py | 2 +- tests/test_cli.py | 2 +- tests/test_flow_integration.py | 5 +---- 28 files changed, 31 insertions(+), 34 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 800728c9..1a70d007 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,7 @@ repos: description: "Automatically convert relative imports to absolute. 
(Use `args: [--never]` to revert.)" - repo: https://github.com/psf/black - rev: 24.10.0 + rev: 25.1.0 hooks: - id: black @@ -61,7 +61,7 @@ repos: description: "Enforce that python3.6+ type annotations are used instead of type comments." - repo: https://github.com/PyCQA/isort - rev: 5.13.2 + rev: 6.0.1 hooks: - id: isort description: "Sort imports alphabetically, and automatically separated into sections and by type." @@ -73,7 +73,7 @@ repos: - id: djlint-reformat-jinja - repo: https://github.com/igorshubovych/markdownlint-cli - rev: v0.43.0 + rev: v0.44.0 hooks: - id: markdownlint description: "Lint markdown files." @@ -88,7 +88,7 @@ repos: files: ^src/ - repo: https://github.com/pycqa/pylint - rev: v3.3.3 + rev: v3.3.6 hooks: - id: pylint name: pylint for source diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index 684ec14f..6cde44c3 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,4 +1,4 @@ -""" Gitingest: A package for ingesting data from Git repositories. """ +"""Gitingest: A package for ingesting data from Git repositories.""" from gitingest.cloning import clone from gitingest.entrypoint import ingest, ingest_async diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index d5c5c4f5..b691fd7f 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,4 +1,4 @@ -""" Command-line interface for the Gitingest package. """ +"""Command-line interface for the Gitingest package.""" # pylint: disable=no-value-for-parameter diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index 8c717b38..fc2b787f 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -1,4 +1,4 @@ -""" This module contains functions for cloning a Git repository to a local path. 
""" +"""This module contains functions for cloning a Git repository to a local path.""" import asyncio import os diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 93a1d7d7..9740713c 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -1,4 +1,4 @@ -""" Configuration file for the project. """ +"""Configuration file for the project.""" import tempfile from pathlib import Path diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 776a6397..adb83cf2 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -1,4 +1,4 @@ -""" Main entry point for ingesting a source and processing its contents. """ +"""Main entry point for ingesting a source and processing its contents.""" import asyncio import inspect diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py index 241baf00..aade9418 100644 --- a/src/gitingest/exceptions.py +++ b/src/gitingest/exceptions.py @@ -1,4 +1,4 @@ -""" Custom exceptions for the Gitingest package. """ +"""Custom exceptions for the Gitingest package.""" class InvalidPatternError(ValueError): diff --git a/src/gitingest/filesystem_schema.py b/src/gitingest/filesystem_schema.py index 61f60a95..b19c9121 100644 --- a/src/gitingest/filesystem_schema.py +++ b/src/gitingest/filesystem_schema.py @@ -1,4 +1,4 @@ -""" Define the schema for the filesystem representation. """ +"""Define the schema for the filesystem representation.""" from __future__ import annotations diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index ec5eb754..46810e3b 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -1,4 +1,4 @@ -""" Functions to ingest and analyze a codebase directory or single file. 
""" +"""Functions to ingest and analyze a codebase directory or single file.""" import warnings from pathlib import Path diff --git a/src/gitingest/ingestion_schema.py b/src/gitingest/ingestion_schema.py index 08efff3c..02b1c678 100644 --- a/src/gitingest/ingestion_schema.py +++ b/src/gitingest/ingestion_schema.py @@ -1,4 +1,4 @@ -""" This module contains the dataclasses for the ingestion process. """ +"""This module contains the dataclasses for the ingestion process.""" from dataclasses import dataclass from pathlib import Path diff --git a/src/gitingest/query_parsing.py b/src/gitingest/query_parsing.py index 434220ef..2f925729 100644 --- a/src/gitingest/query_parsing.py +++ b/src/gitingest/query_parsing.py @@ -1,4 +1,4 @@ -""" This module contains functions to parse and validate input sources and patterns. """ +"""This module contains functions to parse and validate input sources and patterns.""" import re import uuid diff --git a/src/gitingest/utils/ignore_patterns.py b/src/gitingest/utils/ignore_patterns.py index 3e389117..8928c66d 100644 --- a/src/gitingest/utils/ignore_patterns.py +++ b/src/gitingest/utils/ignore_patterns.py @@ -1,4 +1,4 @@ -""" Default ignore patterns for Gitingest. """ +"""Default ignore patterns for Gitingest.""" from typing import Set diff --git a/src/gitingest/utils/ingestion_utils.py b/src/gitingest/utils/ingestion_utils.py index a9a46613..51b57395 100644 --- a/src/gitingest/utils/ingestion_utils.py +++ b/src/gitingest/utils/ingestion_utils.py @@ -1,4 +1,4 @@ -""" Utility functions for the ingestion process. """ +"""Utility functions for the ingestion process.""" import locale import platform diff --git a/src/gitingest/utils/notebook_utils.py b/src/gitingest/utils/notebook_utils.py index 82bb2a28..5ef0123d 100644 --- a/src/gitingest/utils/notebook_utils.py +++ b/src/gitingest/utils/notebook_utils.py @@ -1,4 +1,4 @@ -""" Utilities for processing Jupyter notebooks. 
""" +"""Utilities for processing Jupyter notebooks.""" import json import warnings diff --git a/src/gitingest/utils/path_utils.py b/src/gitingest/utils/path_utils.py index cb4a4bdf..c6edd501 100644 --- a/src/gitingest/utils/path_utils.py +++ b/src/gitingest/utils/path_utils.py @@ -1,4 +1,4 @@ -""" Utility functions for working with file paths. """ +"""Utility functions for working with file paths.""" import os import platform diff --git a/src/gitingest/utils/query_parser_utils.py b/src/gitingest/utils/query_parser_utils.py index c1ce5ba7..c008f15d 100644 --- a/src/gitingest/utils/query_parser_utils.py +++ b/src/gitingest/utils/query_parser_utils.py @@ -1,4 +1,4 @@ -""" Utility functions for parsing and validating query parameters. """ +"""Utility functions for parsing and validating query parameters.""" import os import string diff --git a/src/gitingest/utils/textfile_checker_utils.py b/src/gitingest/utils/textfile_checker_utils.py index 37ffd9ec..00470e9d 100644 --- a/src/gitingest/utils/textfile_checker_utils.py +++ b/src/gitingest/utils/textfile_checker_utils.py @@ -1,4 +1,4 @@ -""" Utility functions for checking whether a file is likely a text file or a binary file. """ +"""Utility functions for checking whether a file is likely a text file or a binary file.""" from pathlib import Path diff --git a/src/gitingest/utils/timeout_wrapper.py b/src/gitingest/utils/timeout_wrapper.py index 27d60934..cf45e6b5 100644 --- a/src/gitingest/utils/timeout_wrapper.py +++ b/src/gitingest/utils/timeout_wrapper.py @@ -1,4 +1,4 @@ -""" Utility functions for the Gitingest package. """ +"""Utility functions for the Gitingest package.""" import asyncio import functools diff --git a/src/server/main.py b/src/server/main.py index a71f5391..d78b3c54 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -1,4 +1,4 @@ -""" Main module for the FastAPI application. 
""" +"""Main module for the FastAPI application.""" import os from pathlib import Path diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 2e751479..e4a755a7 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -1,4 +1,4 @@ -""" Process a query by parsing input, cloning a repository, and generating a summary. """ +"""Process a query by parsing input, cloning a repository, and generating a summary.""" from functools import partial diff --git a/src/server/routers/__init__.py b/src/server/routers/__init__.py index ae6666b1..a1159830 100644 --- a/src/server/routers/__init__.py +++ b/src/server/routers/__init__.py @@ -1,4 +1,4 @@ -""" This module contains the routers for the FastAPI application. """ +"""This module contains the routers for the FastAPI application.""" from server.routers.download import router as download from server.routers.dynamic import router as dynamic diff --git a/src/server/routers/download.py b/src/server/routers/download.py index b868444d..e2b405ea 100644 --- a/src/server/routers/download.py +++ b/src/server/routers/download.py @@ -1,4 +1,4 @@ -""" This module contains the FastAPI router for downloading a digest file. """ +"""This module contains the FastAPI router for downloading a digest file.""" from fastapi import APIRouter, HTTPException from fastapi.responses import Response diff --git a/src/server/routers/dynamic.py b/src/server/routers/dynamic.py index 74febf8d..bfa31f68 100644 --- a/src/server/routers/dynamic.py +++ b/src/server/routers/dynamic.py @@ -1,4 +1,4 @@ -""" This module defines the dynamic router for handling dynamic path requests. 
""" +"""This module defines the dynamic router for handling dynamic path requests.""" from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse diff --git a/src/server/routers/index.py b/src/server/routers/index.py index 5b08a244..01b84730 100644 --- a/src/server/routers/index.py +++ b/src/server/routers/index.py @@ -1,4 +1,4 @@ -""" This module defines the FastAPI router for the home page of the application. """ +"""This module defines the FastAPI router for the home page of the application.""" from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse diff --git a/src/server/server_config.py b/src/server/server_config.py index 1f9d22d9..0f910623 100644 --- a/src/server/server_config.py +++ b/src/server/server_config.py @@ -1,4 +1,4 @@ -""" Configuration for the server. """ +"""Configuration for the server.""" from typing import Dict, List diff --git a/src/server/server_utils.py b/src/server/server_utils.py index e124eaa1..9972c9ba 100644 --- a/src/server/server_utils.py +++ b/src/server/server_utils.py @@ -1,4 +1,4 @@ -""" Utility functions for the server. """ +"""Utility functions for the server.""" import asyncio import math diff --git a/tests/test_cli.py b/tests/test_cli.py index 0fec4612..7eadea46 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,4 +1,4 @@ -""" Tests for the gitingest cli """ +"""Tests for the gitingest cli.""" import os diff --git a/tests/test_flow_integration.py b/tests/test_flow_integration.py index 99ea35af..da12ca82 100644 --- a/tests/test_flow_integration.py +++ b/tests/test_flow_integration.py @@ -1,7 +1,4 @@ -""" -Integration tests for GitIngest. -These tests cover core functionalities, edge cases, and concurrency handling. 
-""" +"""Integration tests covering core functionalities, edge cases, and concurrency handling.""" import shutil from concurrent.futures import ThreadPoolExecutor From 8be6f5620fca7d82866c7131478b4f3be6e20ef0 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 22 Mar 2025 18:56:39 +0100 Subject: [PATCH 09/34] refactor: rename clone to clone_repo and consolidate schema & utility modules (#237) * refactor: rename clone to clone_repo and consolidate schema & utility modules --- src/gitingest/__init__.py | 4 +- src/gitingest/cloning.py | 159 +----------------- src/gitingest/entrypoint.py | 8 +- src/gitingest/ingestion.py | 2 +- src/gitingest/output_formatters.py | 2 +- src/gitingest/query_parsing.py | 8 +- src/gitingest/schemas/__init__.py | 6 + .../{ => schemas}/filesystem_schema.py | 7 +- .../{ => schemas}/ingestion_schema.py | 0 src/gitingest/{ => utils}/exceptions.py | 0 src/gitingest/utils/file_utils.py | 72 ++++++++ src/gitingest/utils/git_utils.py | 118 +++++++++++++ src/gitingest/utils/ingestion_utils.py | 25 +-- src/gitingest/utils/notebook_utils.py | 2 +- src/gitingest/utils/textfile_checker_utils.py | 48 ------ src/gitingest/utils/timeout_wrapper.py | 2 +- src/server/query_processor.py | 4 +- tests/query_parser/test_query_parser.py | 20 ++- tests/test_repository_clone.py | 125 +++++++------- 19 files changed, 300 insertions(+), 312 deletions(-) create mode 100644 src/gitingest/schemas/__init__.py rename src/gitingest/{ => schemas}/filesystem_schema.py (95%) rename src/gitingest/{ => schemas}/ingestion_schema.py (100%) rename src/gitingest/{ => utils}/exceptions.py (100%) create mode 100644 src/gitingest/utils/file_utils.py create mode 100644 src/gitingest/utils/git_utils.py delete mode 100644 src/gitingest/utils/textfile_checker_utils.py diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index 6cde44c3..46ea09ab 100644 --- a/src/gitingest/__init__.py +++ 
b/src/gitingest/__init__.py @@ -1,8 +1,8 @@ """Gitingest: A package for ingesting data from Git repositories.""" -from gitingest.cloning import clone +from gitingest.cloning import clone_repo from gitingest.entrypoint import ingest, ingest_async from gitingest.ingestion import ingest_query from gitingest.query_parsing import parse_query -__all__ = ["ingest_query", "clone", "parse_query", "ingest", "ingest_async"] +__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"] diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index fc2b787f..79b97cb9 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -1,18 +1,18 @@ """This module contains functions for cloning a Git repository to a local path.""" -import asyncio import os from pathlib import Path -from typing import List, Optional, Tuple +from typing import Optional -from gitingest.ingestion_schema import CloneConfig +from gitingest.schemas import CloneConfig +from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command from gitingest.utils.timeout_wrapper import async_timeout TIMEOUT: int = 60 @async_timeout(TIMEOUT) -async def clone(config: CloneConfig) -> None: +async def clone_repo(config: CloneConfig) -> None: """ Clone a repository to a local path based on the provided configuration. 
@@ -47,7 +47,7 @@ async def clone(config: CloneConfig) -> None: raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc # Check if the repository exists - if not await _check_repo_exists(url): + if not await check_repo_exists(url): raise ValueError("Repository not found, make sure it is public") clone_cmd = ["git", "clone", "--single-branch"] @@ -64,7 +64,8 @@ async def clone(config: CloneConfig) -> None: clone_cmd += [url, local_path] # Clone the repository - await _run_command(*clone_cmd) + await ensure_git_installed() + await run_command(*clone_cmd) if commit or partial_clone: checkout_cmd = ["git", "-C", local_path] @@ -81,148 +82,4 @@ async def clone(config: CloneConfig) -> None: checkout_cmd += ["checkout", commit] # Check out the specific commit and/or subpath - await _run_command(*checkout_cmd) - - -async def _check_repo_exists(url: str) -> bool: - """ - Check if a Git repository exists at the provided URL. - - Parameters - ---------- - url : str - The URL of the Git repository to check. - Returns - ------- - bool - True if the repository exists, False otherwise. - - Raises - ------ - RuntimeError - If the curl command returns an unexpected status code. - """ - proc = await asyncio.create_subprocess_exec( - "curl", - "-I", - url, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, _ = await proc.communicate() - - if proc.returncode != 0: - return False - - response = stdout.decode() - status_code = _get_status_code(response) - - if status_code in (200, 301): - return True - - if status_code in (404, 302): - return False - - raise RuntimeError(f"Unexpected status code: {status_code}") - - -async def fetch_remote_branch_list(url: str) -> List[str]: - """ - Fetch the list of branches from a remote Git repository. - Parameters - ---------- - url : str - The URL of the Git repository to fetch branches from. - Returns - ------- - List[str] - A list of branch names available in the remote repository. 
- """ - fetch_branches_command = ["git", "ls-remote", "--heads", url] - stdout, _ = await _run_command(*fetch_branches_command) - stdout_decoded = stdout.decode() - - return [ - line.split("refs/heads/", 1)[1] - for line in stdout_decoded.splitlines() - if line.strip() and "refs/heads/" in line - ] - - -async def _run_command(*args: str) -> Tuple[bytes, bytes]: - """ - Execute a command asynchronously and captures its output. - - Parameters - ---------- - *args : str - The command and its arguments to execute. - - Returns - ------- - Tuple[bytes, bytes] - A tuple containing the stdout and stderr of the command. - - Raises - ------ - RuntimeError - If command exits with a non-zero status. - """ - await check_git_installed() - - # Execute the requested command - proc = await asyncio.create_subprocess_exec( - *args, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - if proc.returncode != 0: - error_message = stderr.decode().strip() - raise RuntimeError(f"Command failed: {' '.join(args)}\nError: {error_message}") - - return stdout, stderr - - -async def check_git_installed() -> None: - """ - Check if Git is installed and accessible on the system. - - Raises - ------ - RuntimeError - If Git is not installed or if the Git command exits with a non-zero status. - """ - try: - proc = await asyncio.create_subprocess_exec( - "git", - "--version", - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - _, stderr = await proc.communicate() - if proc.returncode != 0: - error_message = stderr.decode().strip() if stderr else "Git command not found" - raise RuntimeError(f"Git is not installed or not accessible: {error_message}") - - except FileNotFoundError as exc: - raise RuntimeError("Git is not installed. Please install Git before proceeding.") from exc - - -def _get_status_code(response: str) -> int: - """ - Extract the status code from an HTTP response. 
- - Parameters - ---------- - response : str - The HTTP response string. - - Returns - ------- - int - The status code of the response - """ - status_line = response.splitlines()[0].strip() - status_code = int(status_line.split(" ", 2)[1]) - return status_code + await run_command(*checkout_cmd) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index adb83cf2..0af4a4ba 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -5,7 +5,7 @@ import shutil from typing import Optional, Set, Tuple, Union -from gitingest.cloning import clone +from gitingest.cloning import clone_repo from gitingest.config import TMP_BASE_PATH from gitingest.ingestion import ingest_query from gitingest.query_parsing import IngestionQuery, parse_query @@ -53,7 +53,7 @@ async def ingest_async( Raises ------ TypeError - If `clone` does not return a coroutine, or if the `source` is of an unsupported type. + If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type. 
""" repo_cloned = False @@ -71,7 +71,7 @@ async def ingest_async( query.branch = selected_branch clone_config = query.extract_clone_config() - clone_coroutine = clone(clone_config) + clone_coroutine = clone_repo(clone_config) if inspect.iscoroutine(clone_coroutine): if asyncio.get_event_loop().is_running(): @@ -79,7 +79,7 @@ async def ingest_async( else: asyncio.run(clone_coroutine) else: - raise TypeError("clone did not return a coroutine as expected.") + raise TypeError("clone_repo did not return a coroutine as expected.") repo_cloned = True diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 46810e3b..72e11c4f 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -5,9 +5,9 @@ from typing import Tuple from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES -from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.output_formatters import format_node from gitingest.query_parsing import IngestionQuery +from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.path_utils import _is_safe_symlink diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index 5f747387..7169d5c9 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -4,8 +4,8 @@ import tiktoken -from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType from gitingest.query_parsing import IngestionQuery +from gitingest.schemas import FileSystemNode, FileSystemNodeType def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, str]: diff --git a/src/gitingest/query_parsing.py b/src/gitingest/query_parsing.py index 2f925729..5d547356 100644 --- a/src/gitingest/query_parsing.py +++ b/src/gitingest/query_parsing.py @@ -7,10 +7,10 @@ from typing import List, Optional, 
Set, Union from urllib.parse import unquote, urlparse -from gitingest.cloning import _check_repo_exists, fetch_remote_branch_list from gitingest.config import TMP_BASE_PATH -from gitingest.exceptions import InvalidPatternError -from gitingest.ingestion_schema import IngestionQuery +from gitingest.schemas import IngestionQuery +from gitingest.utils.exceptions import InvalidPatternError +from gitingest.utils.git_utils import check_repo_exists, fetch_remote_branch_list from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.utils.query_parser_utils import ( KNOWN_GIT_HOSTS, @@ -308,6 +308,6 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: """ for domain in KNOWN_GIT_HOSTS: candidate = f"https://{domain}/{user_name}/{repo_name}" - if await _check_repo_exists(candidate): + if await check_repo_exists(candidate): return domain raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.") diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py new file mode 100644 index 00000000..c3869864 --- /dev/null +++ b/src/gitingest/schemas/__init__.py @@ -0,0 +1,6 @@ +"""This module contains the schemas for the Gitingest package.""" + +from gitingest.schemas.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats +from gitingest.schemas.ingestion_schema import CloneConfig, IngestionQuery + +__all__ = ["FileSystemNode", "FileSystemNodeType", "FileSystemStats", "CloneConfig", "IngestionQuery"] diff --git a/src/gitingest/filesystem_schema.py b/src/gitingest/schemas/filesystem_schema.py similarity index 95% rename from src/gitingest/filesystem_schema.py rename to src/gitingest/schemas/filesystem_schema.py index b19c9121..fdd3e338 100644 --- a/src/gitingest/filesystem_schema.py +++ b/src/gitingest/schemas/filesystem_schema.py @@ -7,9 +7,8 @@ from enum import Enum, auto from pathlib import Path -from gitingest.utils.ingestion_utils import 
_get_encoding_list +from gitingest.utils.file_utils import get_preferred_encodings, is_text_file from gitingest.utils.notebook_utils import process_notebook -from gitingest.utils.textfile_checker_utils import is_textfile SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48 @@ -117,7 +116,7 @@ def content(self) -> str: # pylint: disable=too-many-return-statements if self.type == FileSystemNodeType.DIRECTORY: raise ValueError("Cannot read content of a directory node") - if not is_textfile(self.path): + if not is_text_file(self.path): return "[Non-text file]" if self.path.suffix == ".ipynb": @@ -127,7 +126,7 @@ def content(self) -> str: # pylint: disable=too-many-return-statements return f"Error processing notebook: {exc}" # Try multiple encodings - for encoding in _get_encoding_list(): + for encoding in get_preferred_encodings(): try: with self.path.open(encoding=encoding) as f: return f.read() diff --git a/src/gitingest/ingestion_schema.py b/src/gitingest/schemas/ingestion_schema.py similarity index 100% rename from src/gitingest/ingestion_schema.py rename to src/gitingest/schemas/ingestion_schema.py diff --git a/src/gitingest/exceptions.py b/src/gitingest/utils/exceptions.py similarity index 100% rename from src/gitingest/exceptions.py rename to src/gitingest/utils/exceptions.py diff --git a/src/gitingest/utils/file_utils.py b/src/gitingest/utils/file_utils.py new file mode 100644 index 00000000..055b9ca7 --- /dev/null +++ b/src/gitingest/utils/file_utils.py @@ -0,0 +1,72 @@ +"""Utility functions for working with files and directories.""" + +import locale +import platform +from pathlib import Path +from typing import List + +try: + locale.setlocale(locale.LC_ALL, "") +except locale.Error: + locale.setlocale(locale.LC_ALL, "C") + + +def get_preferred_encodings() -> List[str]: + """ + Get list of encodings to try, prioritized for the current platform. 
+ + Returns + ------- + List[str] + List of encoding names to try in priority order, starting with the + platform's default encoding followed by common fallback encodings. + """ + encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"] + if platform.system() == "Windows": + encodings += ["cp1252", "iso-8859-1"] + return encodings + + +def is_text_file(path: Path) -> bool: + """ + Determine if the file is likely a text file by trying to decode a small chunk + with multiple encodings, and checking for common binary markers. + + Parameters + ---------- + path : Path + The path to the file to check. + + Returns + ------- + bool + True if the file is likely textual; False if it appears to be binary. + """ + + # Attempt to read a portion of the file in binary mode + try: + with path.open("rb") as f: + chunk = f.read(1024) + except OSError: + return False + + # If file is empty, treat as text + if not chunk: + return True + + # Check obvious binary bytes + if b"\x00" in chunk or b"\xff" in chunk: + return False + + # Attempt multiple encodings + for enc in get_preferred_encodings(): + try: + with path.open(encoding=enc) as f: + f.read() + return True + except UnicodeDecodeError: + continue + except OSError: + return False + + return False diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py new file mode 100644 index 00000000..9ed7c645 --- /dev/null +++ b/src/gitingest/utils/git_utils.py @@ -0,0 +1,118 @@ +"""Utility functions for interacting with Git repositories.""" + +import asyncio +from typing import List, Tuple + + +async def run_command(*args: str) -> Tuple[bytes, bytes]: + """ + Execute a shell command asynchronously and return (stdout, stderr) bytes. + + Parameters + ---------- + *args : str + The command and its arguments to execute. + + Returns + ------- + Tuple[bytes, bytes] + A tuple containing the stdout and stderr of the command. 
+ + Raises + ------ + RuntimeError + If command exits with a non-zero status. + """ + # Execute the requested command + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + error_message = stderr.decode().strip() + raise RuntimeError(f"Command failed: {' '.join(args)}\nError: {error_message}") + + return stdout, stderr + + +async def ensure_git_installed() -> None: + """ + Ensure Git is installed and accessible on the system. + + Raises + ------ + RuntimeError + If Git is not installed or not accessible. + """ + try: + await run_command("git", "--version") + except RuntimeError as exc: + raise RuntimeError("Git is not installed or not accessible. Please install Git first.") from exc + + +async def check_repo_exists(url: str) -> bool: + """ + Check if a Git repository exists at the provided URL. + + Parameters + ---------- + url : str + The URL of the Git repository to check. + Returns + ------- + bool + True if the repository exists, False otherwise. + + Raises + ------ + RuntimeError + If the curl command returns an unexpected status code. + """ + proc = await asyncio.create_subprocess_exec( + "curl", + "-I", + url, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + + if proc.returncode != 0: + return False # likely unreachable or private + + response = stdout.decode() + status_line = response.splitlines()[0].strip() + parts = status_line.split(" ") + if len(parts) >= 2: + status_code_str = parts[1] + if status_code_str in ("200", "301"): + return True + if status_code_str in ("302", "404"): + return False + raise RuntimeError(f"Unexpected status line: {status_line}") + + +async def fetch_remote_branch_list(url: str) -> List[str]: + """ + Fetch the list of branches from a remote Git repository. 
+ Parameters + ---------- + url : str + The URL of the Git repository to fetch branches from. + Returns + ------- + List[str] + A list of branch names available in the remote repository. + """ + fetch_branches_command = ["git", "ls-remote", "--heads", url] + await ensure_git_installed() + stdout, _ = await run_command(*fetch_branches_command) + stdout_decoded = stdout.decode() + + return [ + line.split("refs/heads/", 1)[1] + for line in stdout_decoded.splitlines() + if line.strip() and "refs/heads/" in line + ] diff --git a/src/gitingest/utils/ingestion_utils.py b/src/gitingest/utils/ingestion_utils.py index 51b57395..b4bb552c 100644 --- a/src/gitingest/utils/ingestion_utils.py +++ b/src/gitingest/utils/ingestion_utils.py @@ -1,31 +1,8 @@ """Utility functions for the ingestion process.""" -import locale -import platform from fnmatch import fnmatch from pathlib import Path -from typing import List, Set - -try: - locale.setlocale(locale.LC_ALL, "") -except locale.Error: - locale.setlocale(locale.LC_ALL, "C") - - -def _get_encoding_list() -> List[str]: - """ - Get list of encodings to try, prioritized for the current platform. - - Returns - ------- - List[str] - List of encoding names to try in priority order, starting with the - platform's default encoding followed by common fallback encodings. 
- """ - encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"] - if platform.system() == "Windows": - encodings += ["cp1252", "iso-8859-1"] - return encodings +from typing import Set def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool: diff --git a/src/gitingest/utils/notebook_utils.py b/src/gitingest/utils/notebook_utils.py index 5ef0123d..bae62064 100644 --- a/src/gitingest/utils/notebook_utils.py +++ b/src/gitingest/utils/notebook_utils.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional -from gitingest.exceptions import InvalidNotebookError +from gitingest.utils.exceptions import InvalidNotebookError def process_notebook(file: Path, include_output: bool = True) -> str: diff --git a/src/gitingest/utils/textfile_checker_utils.py b/src/gitingest/utils/textfile_checker_utils.py deleted file mode 100644 index 00470e9d..00000000 --- a/src/gitingest/utils/textfile_checker_utils.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Utility functions for checking whether a file is likely a text file or a binary file.""" - -from pathlib import Path - -from gitingest.utils.ingestion_utils import _get_encoding_list - - -def is_textfile(path: Path) -> bool: - """ - Determine whether a file is likely a text file or a binary file using various heuristics. - - Parameters - ---------- - path : Path - The path to the file to check. - - Returns - ------- - bool - True if the file is likely textual; False if it appears to be binary. - """ - # Attempt to read a small portion (up to 1024 bytes) of the file in binary mode. - try: - with path.open("rb") as f: - chunk = f.read(1024) - except OSError: - # If we cannot read the file for any reason, treat it as non-textual. - return False - - # If the file is empty, we treat it as text. - if not chunk: - return True - - # Look for obvious binary indicators such as null (0x00) or 0xFF bytes. 
- if b"\x00" in chunk or b"\xff" in chunk: - return False - - for encoding in _get_encoding_list(): - try: - with path.open(encoding=encoding) as f: - f.read() - return True - except UnicodeDecodeError: - continue - except OSError: - return False - - return False diff --git a/src/gitingest/utils/timeout_wrapper.py b/src/gitingest/utils/timeout_wrapper.py index cf45e6b5..7d1d5f91 100644 --- a/src/gitingest/utils/timeout_wrapper.py +++ b/src/gitingest/utils/timeout_wrapper.py @@ -4,7 +4,7 @@ import functools from typing import Any, Awaitable, Callable, TypeVar -from gitingest.exceptions import AsyncTimeoutError +from gitingest.utils.exceptions import AsyncTimeoutError T = TypeVar("T") diff --git a/src/server/query_processor.py b/src/server/query_processor.py index e4a755a7..00b1c640 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -5,7 +5,7 @@ from fastapi import Request from starlette.templating import _TemplateResponse -from gitingest.cloning import clone +from gitingest.cloning import clone_repo from gitingest.ingestion import ingest_query from gitingest.query_parsing import IngestionQuery, parse_query from server.server_config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE, templates @@ -85,7 +85,7 @@ async def process_query( raise ValueError("The 'url' parameter is required.") clone_config = query.extract_clone_config() - await clone(clone_config) + await clone_repo(clone_config) summary, tree, content = ingest_query(query) with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: f.write(tree + "\n" + content) diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index a01b5e0f..b7f15f22 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -153,9 +153,11 @@ async def test_parse_url_with_subpaths() -> None: Then user, repo, branch, and subpath should be identified correctly. 
""" url = "https://github.com/user/repo/tree/main/subdir/file" - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_run_command: + with patch("gitingest.utils.git_utils.run_command", new_callable=AsyncMock) as mock_run_command: mock_run_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") - with patch("gitingest.cloning.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: + with patch( + "gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock + ) as mock_fetch_branches: mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] query = await _parse_remote_repo(url) @@ -330,10 +332,12 @@ async def test_parse_url_branch_and_commit_distinction(url: str, expected_branch When `_parse_remote_repo` is called with branch fetching, Then the function should correctly set `branch` or `commit` based on the URL content. """ - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_run_command: + with patch("gitingest.utils.git_utils.run_command", new_callable=AsyncMock) as mock_run_command: # Mocking the return value to include 'main' and some additional branches mock_run_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") - with patch("gitingest.cloning.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: + with patch( + "gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock + ) as mock_fetch_branches: mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] query = await _parse_remote_repo(url) @@ -430,7 +434,7 @@ async def test_parse_repo_source_with_failed_git_command(url, expected_branch, e When `_parse_remote_repo` is called, Then it should fall back to path components for branch identification. 
""" - with patch("gitingest.cloning.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: + with patch("gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: mock_fetch_branches.side_effect = Exception("Failed to fetch branch list") with pytest.warns( @@ -465,8 +469,10 @@ async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, When `_parse_remote_repo` is called with remote branch fetching, Then the correct branch/subpath should be set or None if unmatched. """ - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_run_command: - with patch("gitingest.cloning.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: + with patch("gitingest.utils.git_utils.run_command", new_callable=AsyncMock) as mock_run_command: + with patch( + "gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock + ) as mock_fetch_branches: mock_run_command.return_value = ( b"refs/heads/feature/fix1\nrefs/heads/main\nrefs/heads/feature-branch\nrefs/heads/fix\n", b"", diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index 54f9f986..b614d5a4 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -12,8 +12,9 @@ import pytest -from gitingest.cloning import CloneConfig, _check_repo_exists, clone -from gitingest.exceptions import AsyncTimeoutError +from gitingest.cloning import check_repo_exists, clone_repo +from gitingest.schemas import CloneConfig +from gitingest.utils.exceptions import AsyncTimeoutError @pytest.mark.asyncio @@ -22,7 +23,7 @@ async def test_clone_with_commit() -> None: Test cloning a repository with a specific commit hash. Given a valid URL and a commit hash: - When `clone` is called, + When `clone_repo` is called, Then the repository should be cloned and checked out at that commit. 
""" clone_config = CloneConfig( @@ -32,13 +33,13 @@ async def test_clone_with_commit() -> None: branch="main", ) - with patch("gitingest.cloning._check_repo_exists", return_value=True) as mock_check: - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: + with patch("gitingest.cloning.check_repo_exists", return_value=True) as mock_check: + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process - await clone(clone_config) + await clone_repo(clone_config) mock_check.assert_called_once_with(clone_config.url) assert mock_exec.call_count == 2 # Clone and checkout calls @@ -50,8 +51,8 @@ async def test_clone_without_commit() -> None: Test cloning a repository when no commit hash is provided. Given a valid URL and no commit hash: - When `clone` is called, - Then only the clone operation should be performed (no checkout). + When `clone_repo` is called, + Then only the clone_repo operation should be performed (no checkout). """ query = CloneConfig( url="https://github.com/user/repo", @@ -60,13 +61,13 @@ async def test_clone_without_commit() -> None: branch="main", ) - with patch("gitingest.cloning._check_repo_exists", return_value=True) as mock_check: - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: + with patch("gitingest.cloning.check_repo_exists", return_value=True) as mock_check: + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process - await clone(query) + await clone_repo(query) mock_check.assert_called_once_with(query.url) assert mock_exec.call_count == 1 # Only clone call @@ -78,7 +79,7 @@ async def test_clone_nonexistent_repository() -> None: Test cloning a nonexistent repository URL. 
Given an invalid or nonexistent URL: - When `clone` is called, + When `clone_repo` is called, Then a ValueError should be raised with an appropriate error message. """ clone_config = CloneConfig( @@ -87,9 +88,9 @@ async def test_clone_nonexistent_repository() -> None: commit=None, branch="main", ) - with patch("gitingest.cloning._check_repo_exists", return_value=False) as mock_check: + with patch("gitingest.cloning.check_repo_exists", return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): - await clone(clone_config) + await clone_repo(clone_config) mock_check.assert_called_once_with(clone_config.url) @@ -120,7 +121,7 @@ async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: mock_process.returncode = return_code mock_exec.return_value = mock_process - repo_exists = await _check_repo_exists(url) + repo_exists = await check_repo_exists(url) assert repo_exists is expected @@ -131,13 +132,13 @@ async def test_clone_with_custom_branch() -> None: Test cloning a repository with a specified custom branch. Given a valid URL and a branch: - When `clone` is called, + When `clone_repo` is called, Then the repository should be cloned shallowly to that branch. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", branch="feature-branch") - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone(clone_config) + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) mock_exec.assert_called_once_with( "git", @@ -156,18 +157,18 @@ async def test_git_command_failure() -> None: """ Test cloning when the Git command fails during execution. 
- Given a valid URL, but `_run_command` raises a RuntimeError: - When `clone` is called, + Given a valid URL, but `run_command` raises a RuntimeError: + When `clone_repo` is called, Then a RuntimeError should be raised with the correct message. """ clone_config = CloneConfig( url="https://github.com/user/repo", local_path="/tmp/repo", ) - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", side_effect=RuntimeError("Git command failed")): + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", side_effect=RuntimeError("Git command failed")): with pytest.raises(RuntimeError, match="Git command failed"): - await clone(clone_config) + await clone_repo(clone_config) @pytest.mark.asyncio @@ -176,7 +177,7 @@ async def test_clone_default_shallow_clone() -> None: Test cloning a repository with the default shallow clone options. Given a valid URL and no branch or commit: - When `clone` is called, + When `clone_repo` is called, Then the repository should be cloned with `--depth=1` and `--single-branch`. """ clone_config = CloneConfig( @@ -184,9 +185,9 @@ async def test_clone_default_shallow_clone() -> None: local_path="/tmp/repo", ) - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone(clone_config) + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) mock_exec.assert_called_once_with( "git", @@ -204,7 +205,7 @@ async def test_clone_commit_without_branch() -> None: Test cloning when a commit hash is provided but no branch is specified. 
Given a valid URL and a commit hash (but no branch): - When `clone` is called, + When `clone_repo` is called, Then the repository should be cloned and checked out at that commit. """ clone_config = CloneConfig( @@ -212,9 +213,9 @@ async def test_clone_commit_without_branch() -> None: local_path="/tmp/repo", commit="a" * 40, # Simulating a valid commit hash ) - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone(clone_config) + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) assert mock_exec.call_count == 2 # Clone and checkout calls mock_exec.assert_any_call("git", "clone", "--single-branch", clone_config.url, clone_config.local_path) @@ -224,10 +225,10 @@ async def test_clone_commit_without_branch() -> None: @pytest.mark.asyncio async def test_check_repo_exists_with_redirect() -> None: """ - Test `_check_repo_exists` when a redirect (302) is returned. + Test `check_repo_exists` when a redirect (302) is returned. Given a URL that responds with "302 Found": - When `_check_repo_exists` is called, + When `check_repo_exists` is called, Then it should return `False`, indicating the repo is inaccessible. """ url = "https://github.com/user/repo" @@ -237,7 +238,7 @@ async def test_check_repo_exists_with_redirect() -> None: mock_process.returncode = 0 # Simulate successful request mock_exec.return_value = mock_process - repo_exists = await _check_repo_exists(url) + repo_exists = await check_repo_exists(url) assert repo_exists is False @@ -245,10 +246,10 @@ async def test_check_repo_exists_with_redirect() -> None: @pytest.mark.asyncio async def test_check_repo_exists_with_permanent_redirect() -> None: """ - Test `_check_repo_exists` when a permanent redirect (301) is returned. 
+ Test `check_repo_exists` when a permanent redirect (301) is returned. Given a URL that responds with "301 Found": - When `_check_repo_exists` is called, + When `check_repo_exists` is called, Then it should return `True`, indicating the repo may exist at the new location. """ url = "https://github.com/user/repo" @@ -258,7 +259,7 @@ async def test_check_repo_exists_with_permanent_redirect() -> None: mock_process.returncode = 0 # Simulate successful request mock_exec.return_value = mock_process - repo_exists = await _check_repo_exists(url) + repo_exists = await check_repo_exists(url) assert repo_exists @@ -268,17 +269,17 @@ async def test_clone_with_timeout() -> None: """ Test cloning a repository when a timeout occurs. - Given a valid URL, but `_run_command` times out: - When `clone` is called, + Given a valid URL, but `run_command` times out: + When `clone_repo` is called, Then an `AsyncTimeoutError` should be raised to indicate the operation exceeded time limits. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo") - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: mock_exec.side_effect = asyncio.TimeoutError with pytest.raises(AsyncTimeoutError, match="Operation timed out after"): - await clone(clone_config) + await clone_repo(clone_config) @pytest.mark.asyncio @@ -287,7 +288,7 @@ async def test_clone_specific_branch(tmp_path): Test cloning a specific branch of a repository. Given a valid repository URL and a branch name: - When `clone` is called, + When `clone_repo` is called, Then the repository should be cloned and checked out at that branch. 
""" repo_url = "https://github.com/cyclotruc/gitingest.git" @@ -295,7 +296,7 @@ async def test_clone_specific_branch(tmp_path): local_path = tmp_path / "gitingest" config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) - await clone(config) + await clone_repo(config) # Assertions assert local_path.exists(), "The repository was not cloned successfully." @@ -312,7 +313,7 @@ async def test_clone_branch_with_slashes(tmp_path): Test cloning a branch with slashes in the name. Given a valid repository URL and a branch name with slashes: - When `clone` is called, + When `clone_repo` is called, Then the repository should be cloned and checked out at that branch. """ repo_url = "https://github.com/user/repo" @@ -320,9 +321,9 @@ async def test_clone_branch_with_slashes(tmp_path): local_path = tmp_path / "gitingest" clone_config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone(clone_config) + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) mock_exec.assert_called_once_with( "git", @@ -339,10 +340,10 @@ async def test_clone_branch_with_slashes(tmp_path): @pytest.mark.asyncio async def test_clone_creates_parent_directory(tmp_path: Path) -> None: """ - Test that clone creates parent directories if they don't exist. + Test that clone_repo creates parent directories if they don't exist. Given a local path with non-existent parent directories: - When `clone` is called, + When `clone_repo` is called, Then it should create the parent directories before attempting to clone. 
""" nested_path = tmp_path / "deep" / "nested" / "path" / "repo" @@ -351,9 +352,9 @@ async def test_clone_creates_parent_directory(tmp_path: Path) -> None: local_path=str(nested_path), ) - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone(clone_config) + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) # Verify parent directory was created assert nested_path.parent.exists() @@ -375,14 +376,14 @@ async def test_clone_with_specific_subpath() -> None: Test cloning a repository with a specific subpath. Given a valid repository URL and a specific subpath: - When `clone` is called, + When `clone_repo` is called, Then the repository should be cloned with sparse checkout enabled and the specified subpath. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", subpath="src/docs") - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone(clone_config) + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) # Verify the clone command includes sparse checkout flags mock_exec.assert_any_call( @@ -408,7 +409,7 @@ async def test_clone_with_commit_and_subpath() -> None: Test cloning a repository with both a specific commit and subpath. Given a valid repository URL, commit hash, and subpath: - When `clone` is called, + When `clone_repo` is called, Then the repository should be cloned with sparse checkout enabled, checked out at the specific commit, and only include the specified subpath. 
""" @@ -419,9 +420,9 @@ async def test_clone_with_commit_and_subpath() -> None: subpath="src/docs", ) - with patch("gitingest.cloning._check_repo_exists", return_value=True): - with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone(clone_config) + with patch("gitingest.cloning.check_repo_exists", return_value=True): + with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) # Verify the clone command includes sparse checkout flags mock_exec.assert_any_call( From cdeadf510d0946e975d783d63c63ecdd7b3806b7 Mon Sep 17 00:00:00 2001 From: Nicolas Iragne Date: Wed, 2 Apr 2025 01:35:20 +0200 Subject: [PATCH 10/34] refactor: rework how symlinks are processed (no longer resolve) (#248) Some changes to how we handle symlinks. We no longer resolve them, which should reduce the complexity by a nice bit. We also now show the target name in the output. I also added a launch.json file for debugging because it took me a while to figure out how to get the debugger to work. Yeah, that's it. 
Please test before merging because I'm a bit of a dingus sometimes --- .vscode/launch.json | 12 +++++ src/gitingest/ingestion.py | 62 ++++++++++++---------- src/gitingest/output_formatters.py | 6 ++- src/gitingest/schemas/filesystem_schema.py | 7 ++- 4 files changed, 55 insertions(+), 32 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..a0565651 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,12 @@ +{ + "configurations": [ + { + "name": "Python Debugger: Module", + "type": "debugpy", + "request": "launch", + "module": "uvicorn", + "args": ["server.main:app", "--host", "0.0.0.0", "--port", "8000"], + "cwd": "${workspaceFolder}/src" + } + ] +} diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 72e11c4f..d3005250 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -9,7 +9,6 @@ from gitingest.query_parsing import IngestionQuery from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include -from gitingest.utils.path_utils import _is_safe_symlink try: import tomllib # type: ignore[import] @@ -171,11 +170,6 @@ def _process_node( The parsed query object containing information about the repository and query parameters. stats : FileSystemStats Statistics tracking object for the total file count and size. - - Raises - ------ - ValueError - If an unexpected error occurs during processing. 
""" if limit_exceeded(stats, node.depth): @@ -183,28 +177,15 @@ def _process_node( for sub_path in node.path.iterdir(): - symlink_path = None - if sub_path.is_symlink(): - if not _is_safe_symlink(sub_path, query.local_path): - print(f"Skipping unsafe symlink: {sub_path}") - continue - - symlink_path = sub_path - sub_path = sub_path.resolve() - - if sub_path in stats.visited: - print(f"Skipping already visited path: {sub_path}") - continue - - stats.visited.add(sub_path) - if query.ignore_patterns and _should_exclude(sub_path, query.local_path, query.ignore_patterns): continue if query.include_patterns and not _should_include(sub_path, query.local_path, query.include_patterns): continue - if sub_path.is_file(): + if sub_path.is_symlink(): + _process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) + elif sub_path.is_file(): _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) elif sub_path.is_dir(): @@ -216,11 +197,6 @@ def _process_node( depth=node.depth + 1, ) - # rename the subdir to reflect the symlink name - if symlink_path: - child_directory_node.name = symlink_path.name - child_directory_node.path_str = str(symlink_path) - _process_node( node=child_directory_node, query=query, @@ -230,13 +206,41 @@ def _process_node( node.size += child_directory_node.size node.file_count += child_directory_node.file_count node.dir_count += 1 + child_directory_node.dir_count - else: - raise ValueError(f"Unexpected error: {sub_path} is neither a file nor a directory") + print(f"Warning: {sub_path} is an unknown file type, skipping") node.sort_children() +def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: + """ + Process a symlink in the file system. + + This function checks the symlink's target. + + Parameters + ---------- + path : Path + The full path of the symlink. + parent_node : FileSystemNode + The parent directory node. 
+ stats : FileSystemStats + Statistics tracking object for the total file count and size. + local_path : Path + The base path of the repository or directory being processed. + """ + child = FileSystemNode( + name=path.name, + type=FileSystemNodeType.SYMLINK, + path_str=str(path.relative_to(local_path)), + path=path, + depth=parent_node.depth + 1, + ) + stats.total_files += 1 + parent_node.children.append(child) + parent_node.file_count += 1 + + def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: """ Process a file in the file system. diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index 7169d5c9..5bacba22 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -31,7 +31,7 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, if node.type == FileSystemNodeType.DIRECTORY: summary += f"Files analyzed: {node.file_count}\n" - else: + elif node.type == FileSystemNodeType.FILE: summary += f"File: {node.name}\n" summary += f"Lines: {len(node.content.splitlines()):,}\n" @@ -101,7 +101,7 @@ def _gather_file_contents(node: FileSystemNode) -> str: str The concatenated content of all files under the given node. 
""" - if node.type == FileSystemNodeType.FILE: + if node.type != FileSystemNodeType.DIRECTORY: return node.content_string # Recursively gather contents of all files under the current directory @@ -142,6 +142,8 @@ def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: display_name = node.name if node.type == FileSystemNodeType.DIRECTORY: display_name += "/" + elif node.type == FileSystemNodeType.SYMLINK: + display_name += " -> " + node.path.readlink().name tree_str += f"{prefix}{current_prefix}{display_name}\n" diff --git a/src/gitingest/schemas/filesystem_schema.py b/src/gitingest/schemas/filesystem_schema.py index fdd3e338..6bb4569a 100644 --- a/src/gitingest/schemas/filesystem_schema.py +++ b/src/gitingest/schemas/filesystem_schema.py @@ -18,6 +18,7 @@ class FileSystemNodeType(Enum): DIRECTORY = auto() FILE = auto() + SYMLINK = auto() @dataclass @@ -91,7 +92,8 @@ def content_string(self) -> str: """ parts = [ SEPARATOR, - f"File: {str(self.path_str).replace(os.sep, '/')}", + f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}" + + (f" -> {self.path.readlink().name}" if self.type == FileSystemNodeType.SYMLINK else ""), SEPARATOR, f"{self.content}", ] @@ -116,6 +118,9 @@ def content(self) -> str: # pylint: disable=too-many-return-statements if self.type == FileSystemNodeType.DIRECTORY: raise ValueError("Cannot read content of a directory node") + if self.type == FileSystemNodeType.SYMLINK: + return "" + if not is_text_file(self.path): return "[Non-text file]" From 2c8c8e7a1d071463ba8ad5d669fea98daf456e1f Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Wed, 2 Apr 2025 19:56:27 +0200 Subject: [PATCH 11/34] Update CONTRIBUTING.md (#251) --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0a87d2b1..3ece5d35 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -48,9 +48,9 @@ Thanks for your interest in contributing to Gitingest! 
🚀 Gitingest aims to be pytest ``` -8. Navigate to src folder +8. Run the local web server - 1. Build the Docker image + 1. Navigate to src folder ``` bash cd src From 688c1d0b1d418dfae29c1b0c520cdc9003eaf7b1 Mon Sep 17 00:00:00 2001 From: Nicolas Iragne Date: Thu, 3 Apr 2025 11:24:26 +0200 Subject: [PATCH 12/34] fix: Skip files where decoding raises an exception (#250) --- src/gitingest/schemas/filesystem_schema.py | 2 ++ src/gitingest/utils/file_utils.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/gitingest/schemas/filesystem_schema.py b/src/gitingest/schemas/filesystem_schema.py index 6bb4569a..22cff569 100644 --- a/src/gitingest/schemas/filesystem_schema.py +++ b/src/gitingest/schemas/filesystem_schema.py @@ -137,6 +137,8 @@ def content(self) -> str: # pylint: disable=too-many-return-statements return f.read() except UnicodeDecodeError: continue + except UnicodeError: + continue except OSError as exc: return f"Error reading file: {exc}" diff --git a/src/gitingest/utils/file_utils.py b/src/gitingest/utils/file_utils.py index 055b9ca7..28c3d4eb 100644 --- a/src/gitingest/utils/file_utils.py +++ b/src/gitingest/utils/file_utils.py @@ -66,6 +66,8 @@ def is_text_file(path: Path) -> bool: return True except UnicodeDecodeError: continue + except UnicodeError: + continue except OSError: return False From b4d87b5ebb954268d9d0f658ad01cfecb993f6a0 Mon Sep 17 00:00:00 2001 From: Tanner Woody Date: Fri, 4 Apr 2025 11:48:22 -0700 Subject: [PATCH 13/34] add installation instructions --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 38f235f6..c8e000fd 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,31 @@ Issues and feature requests are welcome to the repo. ## 💡 Command line usage +### Installation: Non mac + +```bash +pip install gitingest +``` + +### Installation: Mac + +99% of mac users use `brew` as a local package manger. 
+If Python and pip have been installed with `brew`, it is recommended to stay in this ecosystem with `pipx`. +**If `pipx` does not exist and you are using `brew`, first install the following:** + +```bash +brew install pipx +pipx ensurepath +``` + +Finally, install `gitingest`: + +```bash +pipx install gitingest +``` + +### Usage + The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. ```bash From d36b3a08d317d16e015ec4f1d07736022825d750 Mon Sep 17 00:00:00 2001 From: Alex Tyrode Date: Mon, 7 Apr 2025 18:10:53 +0200 Subject: [PATCH 14/34] fix: adding missing suggested changes from #252 (#256) Co-authored-by: Nicolas IRAGNE --- README.md | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index c8e000fd..b4d28ebf 100644 --- a/README.md +++ b/README.md @@ -30,50 +30,49 @@ You can also replace `hub` with `ingest` in any GitHub URL to access the corresp - Python 3.7+ -## 📦 Installation +### 📦 Installation -``` bash +Gitingest is available on [PyPI](https://pypi.org/project/gitingest/). +You can install it using `pip`: + +```bash pip install gitingest ``` -## 🧩 Browser Extension Usage - - -Available in the Chrome Web Store -Get The Add-on for Firefox -Get from the Edge Add-ons - - -The extension is open source at [lcandy2/gitingest-extension](https://github.com/lcandy2/gitingest-extension). - -Issues and feature requests are welcome to the repo. - -## 💡 Command line usage - -### Installation: Non mac +However, it might be a good idea to use `pipx` to install it. +You can install `pipx` using your preferred package manager. ```bash -pip install gitingest +brew install pipx +apt install pipx +scoop install pipx +... ``` -### Installation: Mac - -99% of mac users use `brew` as a local package manger. -If Python and pip have been installed with `brew`, it is recommended to stay in this ecosystem with `pipx`. 
-**If `pipx` does not exist and you are using `brew`, first install the following:** +If you are using pipx for the first time, run: ```bash -brew install pipx pipx ensurepath ``` -Finally, install `gitingest`: - ```bash +# install gitingest pipx install gitingest ``` -### Usage +## 🧩 Browser Extension Usage + + +Available in the Chrome Web Store +Get The Add-on for Firefox +Get from the Edge Add-ons + + +The extension is open source at [lcandy2/gitingest-extension](https://github.com/lcandy2/gitingest-extension). + +Issues and feature requests are welcome to the repo. + +## 💡 Command line usage The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. From bf5d76036deaf7f5db957b3edf32d652cdca676d Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Mon, 21 Apr 2025 03:02:59 +0200 Subject: [PATCH 15/34] Update footer.jinja (#262) * Update footer.jinja Add a link to pad.ws in footer * Fix layout --- src/server/templates/components/footer.jinja | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/server/templates/components/footer.jinja b/src/server/templates/components/footer.jinja index 1a8f3e6e..81032ad7 100644 --- a/src/server/templates/components/footer.jinja +++ b/src/server/templates/components/footer.jinja @@ -16,14 +16,22 @@ -
+
made with ❤️ by - @rom2
+
+ Check out my + latest project +
From 789be9b339f80e215505bf07b48383cccc6041c5 Mon Sep 17 00:00:00 2001 From: Aaron Date: Fri, 13 Jun 2025 09:30:49 -0600 Subject: [PATCH 16/34] fix: traverse directories to allow pattern matching of files within them (#259) * fix: traverse directories to allow pattern matching of files within them --- src/gitingest/cli.py | 34 ++++- src/gitingest/ingestion.py | 4 + src/gitingest/utils/ingestion_utils.py | 4 +- tests/test_ingestion.py | 188 ++++++++++++++++++++++++- 4 files changed, 223 insertions(+), 7 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index b691fd7f..c7f07d9b 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -13,10 +13,34 @@ @click.command() @click.argument("source", type=str, default=".") -@click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") -@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") -@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude") -@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include") +@click.option( + "--output", + "-o", + default=None, + help="Output file path (default: .txt in current directory)", +) +@click.option( + "--max-size", + "-s", + default=MAX_FILE_SIZE, + help="Maximum file size to process in bytes", +) +@click.option( + "--exclude-pattern", + "-e", + multiple=True, + help="""Patterns to exclude. Handles python's arbitrary subset of Unix + shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html""", +) +@click.option( + "--include-pattern", + "-i", + multiple=True, + help="""Patterns to include. Handles python's arbitrary subset of Unix + shell-style wildcards. 
See: + https://docs.python.org/3/library/fnmatch.html""", +) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") def main( source: str, @@ -27,7 +51,7 @@ def main( branch: Optional[str], ): """ - Main entry point for the CLI. This function is called when the CLI is run as a script. + Main entry point for the CLI. This function is called when the CLI is run as a script. It calls the async main function to run the command. diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index d3005250..ec378978 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -202,6 +202,10 @@ def _process_node( query=query, stats=stats, ) + + if not child_directory_node.children: + continue + node.children.append(child_directory_node) node.size += child_directory_node.size node.file_count += child_directory_node.file_count diff --git a/src/gitingest/utils/ingestion_utils.py b/src/gitingest/utils/ingestion_utils.py index b4bb552c..9ce2ae72 100644 --- a/src/gitingest/utils/ingestion_utils.py +++ b/src/gitingest/utils/ingestion_utils.py @@ -33,8 +33,10 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> return False rel_str = str(rel_path) + + # if path is a directory, include it by default if path.is_dir(): - rel_str += "/" + return True for pattern in include_patterns: if fnmatch(rel_str, pattern): diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 3e991f8f..3d829b4a 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -5,7 +5,11 @@ including filtering patterns and subpaths. """ +import re from pathlib import Path +from typing import Set, TypedDict + +import pytest from gitingest.ingestion import ingest_query from gitingest.query_parsing import IngestionQuery @@ -42,5 +46,187 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> # TODO: Additional tests: # - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. 
# - Edge cases with weird file names or deep subdirectory structures. -# TODO : def test_include_txt_pattern # TODO : def test_include_nonexistent_extension + + +class PatternScenario(TypedDict): + include_patterns: Set[str] + ignore_patterns: Set[str] + expected_num_files: int + expected_content: Set[str] + expected_structure: Set[str] + expected_not_structure: Set[str] + + +@pytest.mark.parametrize( + "pattern_scenario", + [ + pytest.param( + PatternScenario( + { + "include_patterns": {"file2.py", "dir2/file_dir2.txt"}, + "ignore_patterns": {*()}, + "expected_num_files": 2, + "expected_content": {"file2.py", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/", "dir1/"}, + } + ), + id="include-explicit-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": { + "file1.txt", + "file2.py", + "file_dir1.txt", + "*/file_dir2.txt", + }, + "ignore_patterns": {*()}, + "expected_num_files": 3, + "expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/", "dir1/"}, + } + ), + id="include-wildcard-directory", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {"*.py"}, + "ignore_patterns": {*()}, + "expected_num_files": 3, + "expected_content": { + "file2.py", + "src/subfile2.py", + "src/subdir/file_subdir.py", + }, + "expected_structure": {"test_repo/", "src/", "subdir/"}, + "expected_not_structure": {"dir1/", "dir2/"}, + } + ), + id="include-wildcard-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {"**/file_dir2.txt", "src/**/*.py"}, + "ignore_patterns": {*()}, + "expected_num_files": 2, + "expected_content": { + "dir2/file_dir2.txt", + "src/subdir/file_subdir.py", + }, + "expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"}, + "expected_not_structure": {"dir1/"}, + } + ), + id="include-recursive-wildcard", + ), + pytest.param( + 
PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"file2.py", "dir2/file_dir2.txt"}, + "expected_num_files": 6, + "expected_content": { + "file1.txt", + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "src/subdir/file_subdir.py", + "dir1/file_dir1.txt", + }, + "expected_structure": {"test_repo/", "src/", "subdir/", "dir1/"}, + "expected_not_structure": {"dir2/"}, + } + ), + id="exclude-explicit-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"file1.txt", "file2.py", "*/file_dir1.txt"}, + "expected_num_files": 5, + "expected_content": { + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "src/subdir/file_subdir.py", + "dir2/file_dir2.txt", + }, + "expected_structure": {"test_repo/", "src/", "subdir/", "dir2/"}, + "expected_not_structure": {"dir1/"}, + } + ), + id="exclude-wildcard-directory", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"src/**/*.py"}, + "expected_num_files": 7, + "expected_content": { + "file1.txt", + "file2.py", + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "dir1/file_dir1.txt", + "dir2/file_dir2.txt", + }, + "expected_structure": { + "test_repo/", + "dir1/", + "dir2/", + "src/", + "subdir/", + }, + "expected_not_structure": {*()}, + } + ), + id="exclude-recursive-wildcard", + ), + ], +) +def test_include_ignore_patterns( + temp_directory: Path, + sample_query: IngestionQuery, + pattern_scenario: PatternScenario, +) -> None: + """ + Test `ingest_query` to ensure included and ignored paths are included and ignored respectively. + + Given a directory with .txt and .py files, and a set of include patterns or a set of ignore patterns: + When `ingest_query` is invoked, + Then it should produce a summary string listing the files analyzed and a combined content string. 
+ """ + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = pattern_scenario["include_patterns"] or None + sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] or None + + summary, structure, content = ingest_query(sample_query) + + assert "Repository: test_user/test_repo" in summary + num_files_regex = re.compile(r"^Files analyzed: (\d+)$", re.MULTILINE) + assert (num_files_match := num_files_regex.search(summary)) is not None + assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"] + + # Check presence of key files in the content + for expected_content_item in pattern_scenario["expected_content"]: + assert expected_content_item in content + + # check presence of included directories in structure + for expected_structure_item in pattern_scenario["expected_structure"]: + assert expected_structure_item in structure + + # check non-presence of non-included directories in structure + for expected_not_structure_item in pattern_scenario["expected_not_structure"]: + assert expected_not_structure_item not in structure From 1dd133c3e02b899ff035a9863c6071af61a3479f Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 15 Jun 2025 23:30:46 +0200 Subject: [PATCH 17/34] feat: add private-repo support to CLI & core (UI coming next) (#282) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: split sparse-checkout & commit checkout when cloning; refresh docs/CLI * Run `git sparse-checkout set …` and `git checkout ` as two calls—matches Git’s CLI rules and fixes failures. * Tidy clone path creation via _ensure_directory; use DEFAULT_TIMEOUT. * Clarify CLI/help strings and schema docstrings. * Update tests for the new two-step checkout flow. 
* feat(auth): support private GitHub repos & correct sparse-checkout flow * CLI: new `--token/-t` flag (fallback to `GITHUB_TOKEN`) * clone_repo: * injects Basic-auth header when a PAT is supplied * validates PAT format (`github_pat_*`) * git_utils: * `create_git_auth_header`, `validate_github_token`, `create_git_command` * `_check_github_repo_exists` & branch-listing now work with tokens * os_utils.ensure_directory extracted for reuse * tests updated to reflect new call signatures * allow git PAT to start with gth_ * fix GITHUB_PAT_PATTERN and add instructions to README * fix gph_ to ghp_ * docs: add GITHUB_TOKEN env var example to README * add GITHUB_TOKEN environment variable also in code --- README.md | 9 ++ src/gitingest/cli.py | 91 +++++++++---- src/gitingest/cloning.py | 78 ++++++----- src/gitingest/config.py | 1 + src/gitingest/entrypoint.py | 15 ++- src/gitingest/query_parsing.py | 14 +- src/gitingest/schemas/ingestion_schema.py | 2 + src/gitingest/utils/git_utils.py | 156 +++++++++++++++++++++- src/gitingest/utils/os_utils.py | 24 ++++ tests/test_repository_clone.py | 18 ++- 10 files changed, 334 insertions(+), 74 deletions(-) create mode 100644 src/gitingest/utils/os_utils.py diff --git a/README.md b/README.md index b4d28ebf..ba69b0a9 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ You can also replace `hub` with `ingest` in any GitHub URL to access the corresp ## 📚 Requirements - Python 3.7+ +- For private repositories: A GitHub Personal Access Token (PAT). 
You can generate one at [https://github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) (Profile → Settings → Developer Settings → Personal Access Tokens → Fine-grained Tokens) ### 📦 Installation @@ -83,6 +84,14 @@ gitingest /path/to/directory # From URL gitingest https://github.com/cyclotruc/gitingest +# For private repositories, use the --token option +# Get your token from https://github.com/settings/personal-access-tokens +gitingest https://github.com/username/private-repo --token github_pat_... + +# Or set it as an environment variable +export GITHUB_TOKEN=github_pat_... +gitingest https://github.com/username/private-repo + # See more options gitingest --help ``` diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index c7f07d9b..a7b5de98 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -29,19 +29,31 @@ "--exclude-pattern", "-e", multiple=True, - help="""Patterns to exclude. Handles python's arbitrary subset of Unix - shell-style wildcards. See: - https://docs.python.org/3/library/fnmatch.html""", + help=( + "Patterns to exclude. Handles Python's arbitrary subset of Unix shell-style " + "wildcards. See: https://docs.python.org/3/library/fnmatch.html" + ), ) @click.option( "--include-pattern", "-i", multiple=True, - help="""Patterns to include. Handles python's arbitrary subset of Unix - shell-style wildcards. See: - https://docs.python.org/3/library/fnmatch.html""", + help=( + "Patterns to include. Handles Python's arbitrary subset of Unix shell-style " + "wildcards. See: https://docs.python.org/3/library/fnmatch.html" + ), ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") +@click.option( + "--token", + "-t", + envvar="GITHUB_TOKEN", + default=None, + help=( + "GitHub personal access token for accessing private repositories. " + "If omitted, the CLI will look for the GITHUB_TOKEN environment variable." 
+ ), +) def main( source: str, output: Optional[str], @@ -49,6 +61,7 @@ def main( exclude_pattern: Tuple[str, ...], include_pattern: Tuple[str, ...], branch: Optional[str], + token: Optional[str], ): """ Main entry point for the CLI. This function is called when the CLI is run as a script. @@ -58,21 +71,33 @@ def main( Parameters ---------- source : str - The source directory or repository to analyze. + A directory path or a Git repository URL. output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + Output file path. Defaults to `.txt`. max_size : int - The maximum file size to process, in bytes. Files larger than this size will be ignored. + Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] - A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. + Glob patterns for pruning the file set. include_pattern : Tuple[str, ...] - A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + Glob patterns for including files in the output. branch : str, optional - The branch to clone (optional). + Specific branch to ingest (defaults to the repository's default). + token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. """ - # Main entry point for the CLI. This function is called when the CLI is run as a script. 
- asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern, branch)) + + asyncio.run( + _async_main( + source=source, + output=output, + max_size=max_size, + exclude_pattern=exclude_pattern, + include_pattern=include_pattern, + branch=branch, + token=token, + ) + ) async def _async_main( @@ -82,6 +107,7 @@ async def _async_main( exclude_pattern: Tuple[str, ...], include_pattern: Tuple[str, ...], branch: Optional[str], + token: Optional[str], ) -> None: """ Analyze a directory or repository and create a text dump of its contents. @@ -92,18 +118,20 @@ async def _async_main( Parameters ---------- source : str - The source directory or repository to analyze. + A directory path or a Git repository URL. output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + Output file path. Defaults to `.txt`. max_size : int - The maximum file size to process, in bytes. Files larger than this size will be ignored. + Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] - A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. + Glob patterns for pruning the file set. include_pattern : Tuple[str, ...] - A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + Glob patterns for including files in the output. branch : str, optional - The branch to clone (optional). + Specific branch to ingest (defaults to the repository's default). + token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. Raises ------ @@ -111,21 +139,32 @@ async def _async_main( If there is an error during the execution of the command, this exception is raised to abort the process. 
""" try: - # Combine default and custom ignore patterns + # Normalise pattern containers (the ingest layer expects sets) exclude_patterns = set(exclude_pattern) include_patterns = set(include_pattern) - if not output: + # Choose a default output path if none provided + if output is None: output = OUTPUT_FILE_NAME - summary, _, _ = await ingest_async(source, max_size, include_patterns, exclude_patterns, branch, output=output) + + summary, _, _ = await ingest_async( + source=source, + max_file_size=max_size, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + branch=branch, + output=output, + token=token, + ) click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) except Exception as exc: + # Convert any exception into Click.Abort so that exit status is non-zero click.echo(f"Error: {exc}", err=True) - raise click.Abort() + raise click.Abort() from exc if __name__ == "__main__": diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index 79b97cb9..284b353e 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -1,18 +1,24 @@ """This module contains functions for cloning a Git repository to a local path.""" -import os from pathlib import Path from typing import Optional +from gitingest.config import DEFAULT_TIMEOUT from gitingest.schemas import CloneConfig -from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command +from gitingest.utils.git_utils import ( + check_repo_exists, + create_git_auth_header, + create_git_command, + ensure_git_installed, + run_command, + validate_github_token, +) +from gitingest.utils.os_utils import ensure_directory from gitingest.utils.timeout_wrapper import async_timeout -TIMEOUT: int = 60 - -@async_timeout(TIMEOUT) -async def clone_repo(config: CloneConfig) -> None: +@async_timeout(DEFAULT_TIMEOUT) +async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None: """ Clone a repository to a 
local path based on the provided configuration. @@ -24,13 +30,15 @@ async def clone_repo(config: CloneConfig) -> None: ---------- config : CloneConfig The configuration for cloning the repository. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + Must start with 'github_pat_' or 'ghp_' for GitHub repositories. Raises ------ ValueError - If the repository is not found or if the provided URL is invalid. - OSError - If an error occurs while creating the parent directory for the repository. + If the repository is not found, if the provided URL is invalid, or if the token format is invalid. """ # Extract and validate query parameters url: str = config.url @@ -39,19 +47,23 @@ async def clone_repo(config: CloneConfig) -> None: branch: Optional[str] = config.branch partial_clone: bool = config.subpath != "/" + # Validate token if provided + if token and url.startswith("https://github.com"): + validate_github_token(token) + # Create parent directory if it doesn't exist - parent_dir = Path(local_path).parent - try: - os.makedirs(parent_dir, exist_ok=True) - except OSError as exc: - raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc + await ensure_directory(Path(local_path).parent) # Check if the repository exists - if not await check_repo_exists(url): - raise ValueError("Repository not found, make sure it is public") + if not await check_repo_exists(url, token=token): + raise ValueError("Repository not found.
Make sure it is public or that you have provided a valid token.") - clone_cmd = ["git", "clone", "--single-branch"] - # TODO re-enable --recurse-submodules + clone_cmd = ["git"] + if token and url.startswith("https://github.com"): + clone_cmd += ["-c", create_git_auth_header(token)] + + clone_cmd += ["clone", "--single-branch"] + # TODO: Re-enable --recurse-submodules when submodule support is needed if partial_clone: clone_cmd += ["--filter=blob:none", "--sparse"] @@ -67,19 +79,17 @@ async def clone_repo(config: CloneConfig) -> None: await ensure_git_installed() await run_command(*clone_cmd) - if commit or partial_clone: - checkout_cmd = ["git", "-C", local_path] - - if partial_clone: - subpath = config.subpath.lstrip("/") - if config.blob: - # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. - subpath = str(Path(subpath).parent.as_posix()) - - checkout_cmd += ["sparse-checkout", "set", subpath] - - if commit: - checkout_cmd += ["checkout", commit] - - # Check out the specific commit and/or subpath - await run_command(*checkout_cmd) + # Checkout the subpath if it is a partial clone + if partial_clone: + subpath = config.subpath.lstrip("/") + if config.blob: + # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. 
+ subpath = str(Path(subpath).parent.as_posix()) + + checkout_cmd = create_git_command(["git"], local_path, url, token) + await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) + + # Checkout the commit if it is provided + if commit: + checkout_cmd = create_git_command(["git"], local_path, url, token) + await run_command(*checkout_cmd, "checkout", commit) diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 9740713c..3f4e3724 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -7,6 +7,7 @@ MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal MAX_FILES = 10_000 # Maximum number of files to process MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB +DEFAULT_TIMEOUT = 60 # seconds OUTPUT_FILE_NAME = "digest.txt" diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 0af4a4ba..cfabb461 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -2,6 +2,7 @@ import asyncio import inspect +import os import shutil from typing import Optional, Set, Tuple, Union @@ -17,6 +18,7 @@ async def ingest_async( include_patterns: Optional[Union[str, Set[str]]] = None, exclude_patterns: Optional[Union[str, Set[str]]] = None, branch: Optional[str] = None, + token: Optional[str] = None, output: Optional[str] = None, ) -> Tuple[str, str, str]: """ @@ -39,6 +41,9 @@ async def ingest_async( Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. output : str, optional File path where the summary and content should be written. If `None`, the results are not written to a file. 
@@ -57,6 +62,9 @@ async def ingest_async( """ repo_cloned = False + if not token: + token = os.getenv("GITHUB_TOKEN") + try: query: IngestionQuery = await parse_query( source=source, @@ -71,7 +79,7 @@ async def ingest_async( query.branch = selected_branch clone_config = query.extract_clone_config() - clone_coroutine = clone_repo(clone_config) + clone_coroutine = clone_repo(clone_config, token=token) if inspect.iscoroutine(clone_coroutine): if asyncio.get_event_loop().is_running(): @@ -102,6 +110,7 @@ def ingest( include_patterns: Optional[Union[str, Set[str]]] = None, exclude_patterns: Optional[Union[str, Set[str]]] = None, branch: Optional[str] = None, + token: Optional[str] = None, output: Optional[str] = None, ) -> Tuple[str, str, str]: """ @@ -124,6 +133,9 @@ def ingest( Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. output : str, optional File path where the summary and content should be written. If `None`, the results are not written to a file. @@ -146,6 +158,7 @@ def ingest( include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, + token=token, output=output, ) ) diff --git a/src/gitingest/query_parsing.py b/src/gitingest/query_parsing.py index 5d547356..d391e184 100644 --- a/src/gitingest/query_parsing.py +++ b/src/gitingest/query_parsing.py @@ -94,7 +94,7 @@ async def parse_query( ) -async def _parse_remote_repo(source: str) -> IngestionQuery: +async def _parse_remote_repo(source: str, token: Optional[str] = None) -> IngestionQuery: """ Parse a repository URL into a structured query dictionary. 
@@ -107,6 +107,9 @@ async def _parse_remote_repo(source: str) -> IngestionQuery: ---------- source : str The URL or domain-less slug to parse. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. Returns ------- @@ -128,7 +131,7 @@ async def _parse_remote_repo(source: str) -> IngestionQuery: _validate_host(tmp_host) else: # No scheme, no domain => user typed "user/repo", so we'll guess the domain. - host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source)) + host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source), token=token) source = f"{host}/{source}" source = "https://" + source @@ -285,7 +288,7 @@ def _parse_local_dir_path(path_str: str) -> IngestionQuery: ) -async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: +async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: Optional[str] = None) -> str: """ Attempt to find a valid repository host for the given user_name and repo_name. @@ -295,6 +298,9 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: The username or owner of the repository. repo_name : str The name of the repository. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. 
Returns ------- @@ -308,6 +314,6 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: """ for domain in KNOWN_GIT_HOSTS: candidate = f"https://{domain}/{user_name}/{repo_name}" - if await check_repo_exists(candidate): + if await check_repo_exists(candidate, token=token if domain == "github.com" else None): return domain raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.") diff --git a/src/gitingest/schemas/ingestion_schema.py b/src/gitingest/schemas/ingestion_schema.py index 02b1c678..43ea6c42 100644 --- a/src/gitingest/schemas/ingestion_schema.py +++ b/src/gitingest/schemas/ingestion_schema.py @@ -29,6 +29,8 @@ class CloneConfig: The branch to clone (default is None). subpath : str The subpath to clone from the repository (default is "/"). + blob: bool + Whether the repository is a blob (default is False). """ url: str diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index 9ed7c645..b3346996 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -1,7 +1,11 @@ """Utility functions for interacting with Git repositories.""" import asyncio -from typing import List, Tuple +import base64 +import re +from typing import List, Optional, Tuple + +GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$" async def run_command(*args: str) -> Tuple[bytes, bytes]: @@ -52,7 +56,7 @@ async def ensure_git_installed() -> None: raise RuntimeError("Git is not installed or not accessible. Please install Git first.") from exc -async def check_repo_exists(url: str) -> bool: +async def check_repo_exists(url: str, token: Optional[str] = None) -> bool: """ Check if a Git repository exists at the provided URL. @@ -60,6 +64,10 @@ async def check_repo_exists(url: str) -> bool: ---------- url : str The URL of the Git repository to check. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. 
Can also be set via the ``GITHUB_TOKEN`` env var. + Returns ------- bool @@ -70,6 +78,9 @@ async def check_repo_exists(url: str) -> bool: RuntimeError If the curl command returns an unexpected status code. """ + if token and "github.com" in url: + return await _check_github_repo_exists(url, token) + proc = await asyncio.create_subprocess_exec( "curl", "-I", @@ -94,19 +105,93 @@ async def check_repo_exists(url: str) -> bool: raise RuntimeError(f"Unexpected status line: {status_line}") -async def fetch_remote_branch_list(url: str) -> List[str]: +async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bool: + """ + Return True iff the authenticated user can see `url`. + + Parameters + ---------- + url : str + The URL of the GitHub repository to check. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Returns + ------- + bool + True if the repository exists, False otherwise. + + Raises + ------ + ValueError + If the URL is not a valid GitHub repository URL. + RuntimeError + If the token is invalid or lacks permissions (HTTP 401/403), or if the GitHub API returns an unexpected HTTP status.
+ """ + m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url) + if not m: + raise ValueError(f"Un-recognised GitHub URL: {url!r}") + owner, repo = m.groups() + + api = f"https://api.github.com/repos/{owner}/{repo}" + cmd = [ + "curl", + "--silent", + "--location", + "--write-out", + "%{http_code}", + "-o", + "/dev/null", + "-H", + "Accept: application/vnd.github+json", + ] + if token: + cmd += ["-H", f"Authorization: Bearer {token}"] + cmd.append(api) + + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + status = stdout.decode()[-3:] # just the %{http_code} + + if status == "200": + return True + if status == "404": + return False + if status in ("401", "403"): + raise RuntimeError("Token invalid or lacks permissions") + raise RuntimeError(f"GitHub API returned unexpected HTTP {status}") + + +async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> List[str]: """ Fetch the list of branches from a remote Git repository. + Parameters ---------- url : str The URL of the Git repository to fetch branches from. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + Returns ------- List[str] A list of branch names available in the remote repository. 
""" - fetch_branches_command = ["git", "ls-remote", "--heads", url] + fetch_branches_command = ["git"] + + # Add authentication if needed + if token and "github.com" in url: + fetch_branches_command += ["-c", create_git_auth_header(token)] + + fetch_branches_command += ["ls-remote", "--heads", url] + await ensure_git_installed() stdout, _ = await run_command(*fetch_branches_command) stdout_decoded = stdout.decode() @@ -116,3 +201,66 @@ async def fetch_remote_branch_list(url: str) -> List[str]: for line in stdout_decoded.splitlines() if line.strip() and "refs/heads/" in line ] + + +def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Optional[str] = None) -> List[str]: + """Create a git command with authentication if needed. + + Parameters + ---------- + base_cmd : List[str] + The base git command to start with + local_path : str + The local path where the git command should be executed + url : str + The repository URL to check if it's a GitHub repository + token : Optional[str] + GitHub personal access token for authentication + + Returns + ------- + List[str] + The git command with authentication if needed + """ + cmd = base_cmd + ["-C", local_path] + if token and url.startswith("https://github.com"): + validate_github_token(token) + cmd += ["-c", create_git_auth_header(token)] + return cmd + + +def create_git_auth_header(token: str) -> str: + """Create a Basic authentication header for GitHub git operations. + + Parameters + ---------- + token : str + GitHub personal access token + + Returns + ------- + str + The git config command for setting the authentication header + """ + basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() + return f"http.https://github.com/.extraheader=Authorization: Basic {basic}" + + +def validate_github_token(token: str) -> None: + """Validate the format of a GitHub Personal Access Token. 
+ + Parameters + ---------- + token : str + The GitHub token to validate + + Raises + ------ + ValueError + If the token format is invalid + """ + if not re.match(GITHUB_PAT_PATTERN, token): + raise ValueError( + "Invalid GitHub token format. Token should start with 'github_pat_' or 'ghp_' " + "followed by at least 36 characters of letters, numbers, and underscores." + ) diff --git a/src/gitingest/utils/os_utils.py b/src/gitingest/utils/os_utils.py new file mode 100644 index 00000000..a2d49916 --- /dev/null +++ b/src/gitingest/utils/os_utils.py @@ -0,0 +1,24 @@ +"""Utility functions for working with the operating system.""" + +import os +from pathlib import Path + + +async def ensure_directory(path: Path) -> None: + """ + Ensure the directory exists, creating it if necessary. + + Parameters + ---------- + path : Path + The path to ensure exists + + Raises + ------ + OSError + If the directory cannot be created + """ + try: + os.makedirs(path, exist_ok=True) + except OSError as exc: + raise OSError(f"Failed to create directory {path}: {exc}") from exc diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index b614d5a4..b57d737e 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -12,9 +12,10 @@ import pytest -from gitingest.cloning import check_repo_exists, clone_repo +from gitingest.cloning import clone_repo from gitingest.schemas import CloneConfig from gitingest.utils.exceptions import AsyncTimeoutError +from gitingest.utils.git_utils import check_repo_exists @pytest.mark.asyncio @@ -41,7 +42,7 @@ async def test_clone_with_commit() -> None: await clone_repo(clone_config) - mock_check.assert_called_once_with(clone_config.url) + mock_check.assert_called_once_with(clone_config.url, token=None) assert mock_exec.call_count == 2 # Clone and checkout calls @@ -69,7 +70,7 @@ async def test_clone_without_commit() -> None: await clone_repo(query) - mock_check.assert_called_once_with(query.url) + 
mock_check.assert_called_once_with(query.url, token=None) assert mock_exec.call_count == 1 # Only clone call @@ -435,7 +436,7 @@ async def test_clone_with_commit_and_subpath() -> None: clone_config.local_path, ) - # Verify the sparse-checkout command sets the correct path + # Verify sparse-checkout set mock_exec.assert_any_call( "git", "-C", @@ -443,8 +444,15 @@ async def test_clone_with_commit_and_subpath() -> None: "sparse-checkout", "set", "src/docs", + ) + + # Verify checkout commit + mock_exec.assert_any_call( + "git", + "-C", + clone_config.local_path, "checkout", clone_config.commit, ) - assert mock_exec.call_count == 2 + assert mock_exec.call_count == 3 From 2dea7c886530ef8a04d24f0901bfb56a7442fb62 Mon Sep 17 00:00:00 2001 From: Amgad Hasan <109704569+AmgadHasan@users.noreply.github.com> Date: Wed, 18 Jun 2025 12:57:34 +0300 Subject: [PATCH 18/34] Use gpt-4o's tokenizer (#258) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit feat: switch to o200k_base, require tiktoken ≥ 0.7.0, drop Python 3.7 Context ------- Token counting now uses **o200k_base** (native to GPT-4o / 4o-mini). That encoding ships only with **tiktoken ≥ 0.7.0**, whose wheels need Python 3.8+. CI already tests 3.8-3.13, so we align our documented minimums. Changes ------- * src/gitingest/output_formatters.py – `cl100k_base` → `o200k_base` * README.md – “Python 3.7+” → “Python 3.8+” * pyproject.toml * `tiktoken` → `tiktoken>=0.7.0` (o200k support) * remove classifier *Programming Language :: Python :: 3.7* * requirements.txt – same `tiktoken` bump Impact ------ * **Breaking** for users pinned to Python 3.7 → upgrade to 3.8+. * Environments on `tiktoken==0.6.*` must `pip install -U tiktoken>=0.7.0`. * No other runtime deps added. 
Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> --- README.md | 2 +- pyproject.toml | 3 +-- requirements.txt | 2 +- src/gitingest/output_formatters.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ba69b0a9..9ed8318b 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ You can also replace `hub` with `ingest` in any GitHub URL to access the corresp ## 📚 Requirements -- Python 3.7+ +- Python 3.8+ - For private repositories: A GitHub Personal Access Token (PAT). You can generate one at [https://github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) (Profile → Settings → Developer Settings → Personal Access Tokens → Fine-grained Tokens) ### 📦 Installation diff --git a/pyproject.toml b/pyproject.toml index f280d4a4..f6d39290 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "python-dotenv", "slowapi", "starlette>=0.40.0", # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw - "tiktoken", + "tiktoken>=0.7.0", # Support for o200k_base encoding "tomli", "typing_extensions; python_version < '3.10'", "uvicorn>=0.11.7", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150 @@ -23,7 +23,6 @@ classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", diff --git a/requirements.txt b/requirements.txt index 5f8657ed..aa8ff03b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,6 @@ pydantic python-dotenv slowapi starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw -tiktoken +tiktoken>=0.7.0 # Support for o200k_base encoding tomli uvicorn>=0.11.7 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150 diff --git 
a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index 5bacba22..9ca3d474 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -171,7 +171,7 @@ def _format_token_count(text: str) -> Optional[str]: The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. """ try: - encoding = tiktoken.get_encoding("cl100k_base") + encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini total_tokens = len(encoding.encode(text, disallowed_special=())) except (ValueError, UnicodeEncodeError) as exc: print(exc) From c656635f6d6d22142e3b735172f727d11bd641f9 Mon Sep 17 00:00:00 2001 From: Casey West Date: Thu, 19 Jun 2025 09:21:13 +0200 Subject: [PATCH 19/34] Add option to output digest to stdout (#264) * Add option to output digest to stdout This change introduces the ability for users to direct the output of the gitingest tool to standard output (stdout) instead of writing to a file. This is useful for piping the output to other commands or viewing it directly in the terminal. Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> --- .pre-commit-config.yaml | 6 +-- README.md | 31 +++++++++-- src/gitingest/cli.py | 32 +++++++---- src/gitingest/entrypoint.py | 8 ++- tests/test_cli.py | 105 ++++++++++++++++++++++++------------ 5 files changed, 130 insertions(+), 52 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1a70d007..b8b3f228 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,7 @@ repos: - id: black - repo: https://github.com/asottile/pyupgrade - rev: v3.19.1 + rev: v3.20.0 hooks: - id: pyupgrade description: "Automatically upgrade syntax for newer versions." @@ -73,7 +73,7 @@ repos: - id: djlint-reformat-jinja - repo: https://github.com/igorshubovych/markdownlint-cli - rev: v0.44.0 + rev: v0.45.0 hooks: - id: markdownlint description: "Lint markdown files." 
@@ -88,7 +88,7 @@ repos: files: ^src/ - repo: https://github.com/pycqa/pylint - rev: v3.3.6 + rev: v3.3.7 hooks: - id: pylint name: pylint for source diff --git a/README.md b/README.md index 9ed8318b..f62ea417 100644 --- a/README.md +++ b/README.md @@ -78,26 +78,35 @@ Issues and feature requests are welcome to the repo. The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. ```bash -# Basic usage +# Basic usage (writes to digest.txt by default) gitingest /path/to/directory # From URL gitingest https://github.com/cyclotruc/gitingest +``` + +For private repositories, use the `--token/-t` option. -# For private repositories, use the --token option +```bash # Get your token from https://github.com/settings/personal-access-tokens gitingest https://github.com/username/private-repo --token github_pat_... # Or set it as an environment variable export GITHUB_TOKEN=github_pat_... gitingest https://github.com/username/private-repo +``` -# See more options +By default, the digest is written to a text file (`digest.txt`) in your current working directory. You can customize the output in two ways: + +- Use `--output/-o ` to write to a specific file. +- Use `--output/-o -` to output directly to `STDOUT` (useful for piping to other tools). + +See more options and usage details with: + +```bash gitingest --help ``` -This will write the digest in a text file (default `digest.txt`) in your current working directory. - ## 🐍 Python package usage ```python @@ -110,6 +119,18 @@ summary, tree, content = ingest("path/to/directory") summary, tree, content = ingest("https://github.com/cyclotruc/gitingest") ``` +For private repositories, you can pass a token: + +```python +# Using token parameter +summary, tree, content = ingest("https://github.com/username/private-repo", token="github_pat_...") + +# Or set it as an environment variable +import os +os.environ["GITHUB_TOKEN"] = "github_pat_..." 
+summary, tree, content = ingest("https://github.com/username/private-repo") +``` + By default, this won't write a file but can be enabled with the `output` argument. ```python diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index a7b5de98..fb4e584e 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -73,7 +73,8 @@ def main( source : str A directory path or a Git repository URL. output : str, optional - Output file path. Defaults to `.txt`. + The path where the output file will be written. If not specified, the output will be written + to a file named `.txt` in the current directory. Use '-' to output to stdout. max_size : int Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] @@ -113,14 +114,16 @@ async def _async_main( Analyze a directory or repository and create a text dump of its contents. This command analyzes the contents of a specified source directory or repository, applies custom include and - exclude patterns, and generates a text summary of the analysis which is then written to an output file. + exclude patterns, and generates a text summary of the analysis which is then written to an output file + or printed to stdout. Parameters ---------- source : str A directory path or a Git repository URL. output : str, optional - Output file path. Defaults to `.txt`. + The path where the output file will be written. If not specified, the output will be written + to a file named `.txt` in the current directory. Use '-' to output to stdout. max_size : int Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] 
@@ -143,9 +146,12 @@ async def _async_main( exclude_patterns = set(exclude_pattern) include_patterns = set(include_pattern) - # Choose a default output path if none provided - if output is None: - output = OUTPUT_FILE_NAME + output_target = output if output is not None else OUTPUT_FILE_NAME + + if output_target == "-": + click.echo("Analyzing source, preparing output for stdout...", err=True) + else: + click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True) summary, _, _ = await ingest_async( source=source, @@ -153,13 +159,19 @@ async def _async_main( include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, - output=output, + output=output_target, token=token, ) - click.echo(f"Analysis complete! Output written to: {output}") - click.echo("\nSummary:") - click.echo(summary) + if output_target == "-": # stdout + click.echo("\n--- Summary ---", err=True) + click.echo(summary, err=True) + click.echo("--- End Summary ---", err=True) + click.echo("Analysis complete! Output sent to stdout.", err=True) + else: # file + click.echo(f"Analysis complete! 
Output written to: {output_target}") + click.echo("\nSummary:") + click.echo(summary) except Exception as exc: # Convert any exception into Click.Abort so that exit status is non-zero diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index cfabb461..13dc8170 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -4,6 +4,7 @@ import inspect import os import shutil +import sys from typing import Optional, Set, Tuple, Union from gitingest.cloning import clone_repo @@ -93,7 +94,12 @@ async def ingest_async( summary, tree, content = ingest_query(query) - if output is not None: + if output == "-": + loop = asyncio.get_running_loop() + output_data = tree + "\n" + content + await loop.run_in_executor(None, sys.stdout.write, output_data) + await loop.run_in_executor(None, sys.stdout.flush) + elif output is not None: with open(output, "w", encoding="utf-8") as f: f.write(tree + "\n" + content) diff --git a/tests/test_cli.py b/tests/test_cli.py index 7eadea46..a7758f04 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,41 +1,80 @@ -"""Tests for the gitingest cli.""" +"""Tests for the Gitingest CLI.""" import os +from inspect import signature +from pathlib import Path +from typing import List -from click.testing import CliRunner +import pytest +from _pytest.monkeypatch import MonkeyPatch +from click.testing import CliRunner, Result from gitingest.cli import main from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME -def test_cli_with_default_options(): - runner = CliRunner() - result = runner.invoke(main, ["./"]) - output_lines = result.output.strip().split("\n") - assert f"Analysis complete! 
Output written to: {OUTPUT_FILE_NAME}" in output_lines - assert os.path.exists(OUTPUT_FILE_NAME), f"Output file was not created at {OUTPUT_FILE_NAME}" - - os.remove(OUTPUT_FILE_NAME) - - -def test_cli_with_options(): - runner = CliRunner() - result = runner.invoke( - main, - [ - "./", - "--output", - str(OUTPUT_FILE_NAME), - "--max-size", - str(MAX_FILE_SIZE), - "--exclude-pattern", - "tests/", - "--include-pattern", - "src/", - ], - ) - output_lines = result.output.strip().split("\n") - assert f"Analysis complete! Output written to: {OUTPUT_FILE_NAME}" in output_lines - assert os.path.exists(OUTPUT_FILE_NAME), f"Output file was not created at {OUTPUT_FILE_NAME}" - - os.remove(OUTPUT_FILE_NAME) +@pytest.mark.parametrize( + "cli_args, expect_file", + [ + pytest.param(["./"], True, id="default-options"), + pytest.param( + [ + "./", + "--output", + str(OUTPUT_FILE_NAME), + "--max-size", + str(MAX_FILE_SIZE), + "--exclude-pattern", + "tests/", + "--include-pattern", + "src/", + ], + True, + id="custom-options", + ), + ], +) +def test_cli_writes_file(tmp_path: Path, monkeypatch: MonkeyPatch, cli_args: List[str], expect_file: bool) -> None: + """Run the CLI and verify that the output file is created (or not).""" + # Work inside an isolated temp directory + monkeypatch.chdir(tmp_path) + + result = _invoke_isolated_cli_runner(cli_args) + + assert result.exit_code == 0, result.stderr + + # Summary line should be on STDOUT + stdout_lines = result.stdout.splitlines() + assert f"Analysis complete! 
Output written to: {OUTPUT_FILE_NAME}" in stdout_lines + + # File side-effect + sarif_file = tmp_path / OUTPUT_FILE_NAME + assert sarif_file.exists() is expect_file, f"{OUTPUT_FILE_NAME} existence did not match expectation" + + +def test_cli_with_stdout_output() -> None: + """Test CLI invocation with output directed to STDOUT.""" + result = _invoke_isolated_cli_runner(["./", "--output", "-", "--exclude-pattern", "tests/"]) + + # ─── core expectations (stdout) ────────────────────────────────────- + assert result.exit_code == 0, f"CLI exited with code {result.exit_code}, stderr: {result.stderr}" + assert "---" in result.stdout, "Expected file separator '---' not found in STDOUT" + assert "src/gitingest/cli.py" in result.stdout, "Expected content (e.g., src/gitingest/cli.py) not found in STDOUT" + assert not os.path.exists(OUTPUT_FILE_NAME), f"Output file {OUTPUT_FILE_NAME} was unexpectedly created." + + # ─── the summary must *not* pollute STDOUT, must appear on STDERR ─── + summary = "Analysis complete! Output sent to stdout." 
+ stdout_lines = result.stdout.splitlines() + stderr_lines = result.stderr.splitlines() + assert summary not in stdout_lines, "Unexpected summary message found in STDOUT" + assert summary in stderr_lines, "Expected summary message not found in STDERR" + assert f"Output written to: {OUTPUT_FILE_NAME}" not in stderr_lines + + +def _invoke_isolated_cli_runner(args: List[str]) -> Result: + """Return a CliRunner that keeps stderr apart on Click 8.0-8.1.""" + kwargs = {} + if "mix_stderr" in signature(CliRunner.__init__).parameters: + kwargs["mix_stderr"] = False # Click 8.0–8.1 + runner = CliRunner(**kwargs) + return runner.invoke(main, args) From 3869aa32e30c794b1fb07721d42a541a7c14d394 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 21 Jun 2025 20:19:16 +0200 Subject: [PATCH 20/34] feat(web-ui): add private-GitHub ingestion via PAT (#286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(web-ui, backend): allow ingesting private GitHub repos with PAT authentication * Accept a GitHub personal access token (PAT) from the UI and forward it through - `git_form.jinja` → new “Private Repository” checkbox + PAT field - routers (`index.py`, `dynamic.py`) and `query_processor.py` * Propagate `token` throughout the ingestion stack - `gitingest.entrypoint.parse_query` - `query_parsing` (including `try_domains_for_user_and_repo`) so we can infer the host when the user enters a bare “user/repo” slug * Tests - Added `"token": ""` to the `form_data` dict in the tests in `tests/test_flow_integration.py` **Limitation:** This PR enables PAT-protected cloning **only for GitHub**; other hosts (GitLab, Gitea, etc.) remain public-only for now. 
* help link to generate PAT * pre-commit hooks --------- Co-authored-by: cyclotruc --- src/gitingest/entrypoint.py | 1 + src/gitingest/query_parsing.py | 8 +- src/server/query_processor.py | 9 +- src/server/routers/dynamic.py | 7 +- src/server/routers/index.py | 7 +- .../templates/components/git_form.jinja | 202 ++++++++++++------ tests/test_flow_integration.py | 6 + tests/test_repository_clone.py | 19 +- 8 files changed, 176 insertions(+), 83 deletions(-) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 13dc8170..f9e65dde 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -73,6 +73,7 @@ async def ingest_async( from_web=False, include_patterns=include_patterns, ignore_patterns=exclude_patterns, + token=token, ) if query.url: diff --git a/src/gitingest/query_parsing.py b/src/gitingest/query_parsing.py index d391e184..089a6f96 100644 --- a/src/gitingest/query_parsing.py +++ b/src/gitingest/query_parsing.py @@ -29,6 +29,7 @@ async def parse_query( from_web: bool, include_patterns: Optional[Union[str, Set[str]]] = None, ignore_patterns: Optional[Union[str, Set[str]]] = None, + token: Optional[str] = None, ) -> IngestionQuery: """ Parse the input source (URL or path) to extract relevant details for the query. @@ -49,7 +50,10 @@ async def parse_query( Patterns to include, by default None. Can be a set of strings or a single string. ignore_patterns : Union[str, Set[str]], optional Patterns to ignore, by default None. Can be a set of strings or a single string. - + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + Must start with 'github_pat_' or 'ghp_' for GitHub repositories. 
Returns ------- IngestionQuery @@ -59,7 +63,7 @@ async def parse_query( # Determine the parsing method based on the source type if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): # We either have a full URL or a domain-less slug - query = await _parse_remote_repo(source) + query = await _parse_remote_repo(source, token=token) else: # Local path scenario query = _parse_local_dir_path(source) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 00b1c640..1440a5e5 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -1,6 +1,7 @@ """Process a query by parsing input, cloning a repository, and generating a summary.""" from functools import partial +from typing import Optional from fastapi import Request from starlette.templating import _TemplateResponse @@ -19,6 +20,7 @@ async def process_query( pattern_type: str = "exclude", pattern: str = "", is_index: bool = False, + token: Optional[str] = None, ) -> _TemplateResponse: """ Process a query by parsing input, cloning a repository, and generating a summary. @@ -40,6 +42,9 @@ async def process_query( Pattern to include or exclude in the query, depending on the pattern type. is_index : bool Flag indicating whether the request is for the index page (default is False). + token : str, optional + GitHub personal-access token (PAT). Needed when *input_text* refers to a + **private** repository. 
Returns ------- @@ -71,6 +76,7 @@ async def process_query( "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": pattern, + "token": token, } try: @@ -80,12 +86,13 @@ async def process_query( from_web=True, include_patterns=include_patterns, ignore_patterns=exclude_patterns, + token=token, ) if not query.url: raise ValueError("The 'url' parameter is required.") clone_config = query.extract_clone_config() - await clone_repo(clone_config) + await clone_repo(clone_config, token=token) summary, tree, content = ingest_query(query) with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: f.write(tree + "\n" + content) diff --git a/src/server/routers/dynamic.py b/src/server/routers/dynamic.py index bfa31f68..57a54a56 100644 --- a/src/server/routers/dynamic.py +++ b/src/server/routers/dynamic.py @@ -50,6 +50,7 @@ async def process_catch_all( max_file_size: int = Form(...), pattern_type: str = Form(...), pattern: str = Form(...), + token: str = Form(...), ) -> HTMLResponse: """ Process the form submission with user input for query parameters. @@ -69,13 +70,16 @@ async def process_catch_all( The type of pattern used for the query, specified by the user. pattern : str The pattern string used in the query, specified by the user. - + token : str + GitHub personal-access token (PAT). Needed when *input_text* refers to a + **private** repository. Returns ------- HTMLResponse An HTML response generated after processing the form input and query logic, which will be rendered and returned to the user. 
""" + resolved_token = None if token == "" else token return await process_query( request, input_text, @@ -83,4 +87,5 @@ async def process_catch_all( pattern_type, pattern, is_index=False, + token=resolved_token, ) diff --git a/src/server/routers/index.py b/src/server/routers/index.py index 01b84730..8c11aaa8 100644 --- a/src/server/routers/index.py +++ b/src/server/routers/index.py @@ -47,6 +47,7 @@ async def index_post( max_file_size: int = Form(...), pattern_type: str = Form(...), pattern: str = Form(...), + token: str = Form(...), ) -> HTMLResponse: """ Process the form submission with user input for query parameters. @@ -67,13 +68,16 @@ async def index_post( The type of pattern used for the query, specified by the user. pattern : str The pattern string used in the query, specified by the user. - + token : str + GitHub personal-access token (PAT). Needed when *input_text* refers to a + **private** repository. Returns ------- HTMLResponse An HTML response containing the results of processing the form input and query logic, which will be rendered and returned to the user. """ + resolved_token = None if token == "" else token return await process_query( request, input_text, @@ -81,4 +85,5 @@ async def index_post( pattern_type, pattern, is_index=True, + token=resolved_token, ) diff --git a/src/server/templates/components/git_form.jinja b/src/server/templates/components/git_form.jinja index 764fff70..b45d0f92 100644 --- a/src/server/templates/components/git_form.jinja +++ b/src/server/templates/components/git_form.jinja @@ -17,88 +17,156 @@ element.classList.toggle('hover:text-gray-500'); }); } + + function toggleAccessSettings() { + const container = document.getElementById('accessSettingsContainer'); + const checkbox = document.getElementById('showAccessSettings'); + const row = document.getElementById('controlsRow'); + const show = checkbox.checked; + container.classList.toggle('hidden', !show); + row.classList.toggle('mb-8', show); + }
-
+ -
-
- -
-
-
- + +
+ +
+
+ +
+ +
+
+ +
+ - -
- -
-
-
-
-
- - - - + +
+ +
+ +
+
+
+
+ +
+ + + + +
+ + +
+
+
+ +
+ + +
+
+ +
+ +
+ + +
+ +
+
+
+
+ +
+
+ + -
-
- - -
-
+ + {% if show_examples %} -

Try these example repositories:

diff --git a/tests/test_flow_integration.py b/tests/test_flow_integration.py index da12ca82..c85f63ae 100644 --- a/tests/test_flow_integration.py +++ b/tests/test_flow_integration.py @@ -63,6 +63,7 @@ async def test_remote_repository_analysis(request): "max_file_size": "243", "pattern_type": "exclude", "pattern": "", + "token": "", } response = client.post("/", data=form_data) @@ -79,6 +80,7 @@ async def test_invalid_repository_url(request): "max_file_size": "243", "pattern_type": "exclude", "pattern": "", + "token": "", } response = client.post("/", data=form_data) @@ -95,6 +97,7 @@ async def test_large_repository(request): "max_file_size": "243", "pattern_type": "exclude", "pattern": "", + "token": "", } response = client.post("/", data=form_data) @@ -113,6 +116,7 @@ def make_request(): "max_file_size": "243", "pattern_type": "exclude", "pattern": "", + "token": "", } response = client.post("/", data=form_data) assert response.status_code == 200, f"Request failed: {response.text}" @@ -133,6 +137,7 @@ async def test_large_file_handling(request): "max_file_size": "1", "pattern_type": "exclude", "pattern": "", + "token": "", } response = client.post("/", data=form_data) @@ -149,6 +154,7 @@ async def test_repository_with_patterns(request): "max_file_size": "243", "pattern_type": "include", "pattern": "*.md", + "token": "", } response = client.post("/", data=form_data) diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index b57d737e..787456b1 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -55,7 +55,7 @@ async def test_clone_without_commit() -> None: When `clone_repo` is called, Then only the clone_repo operation should be performed (no checkout). 
""" - query = CloneConfig( + clone_config = CloneConfig( url="https://github.com/user/repo", local_path="/tmp/repo", commit=None, @@ -68,9 +68,9 @@ async def test_clone_without_commit() -> None: mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process - await clone_repo(query) + await clone_repo(clone_config) - mock_check.assert_called_once_with(query.url, token=None) + mock_check.assert_called_once_with(clone_config.url, token=None) assert mock_exec.call_count == 1 # Only clone call @@ -107,10 +107,10 @@ async def test_clone_nonexistent_repository() -> None: ) async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: bool) -> None: """ - Test the `_check_repo_exists` function with different Git HTTP responses. + Test the `check_repo_exists` function with different Git HTTP responses. Given various stdout lines and return codes: - When `_check_repo_exists` is called, + When `check_repo_exists` is called, Then it should correctly indicate whether the repository exists. """ url = "https://github.com/user/repo" @@ -296,8 +296,8 @@ async def test_clone_specific_branch(tmp_path): branch_name = "main" local_path = tmp_path / "gitingest" - config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) - await clone_repo(config) + clone_config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) + await clone_repo(clone_config) # Assertions assert local_path.exists(), "The repository was not cloned successfully." @@ -348,10 +348,7 @@ async def test_clone_creates_parent_directory(tmp_path: Path) -> None: Then it should create the parent directories before attempting to clone. 
""" nested_path = tmp_path / "deep" / "nested" / "path" / "repo" - clone_config = CloneConfig( - url="https://github.com/user/repo", - local_path=str(nested_path), - ) + clone_config = CloneConfig(url="https://github.com/user/repo", local_path=str(nested_path)) with patch("gitingest.cloning.check_repo_exists", return_value=True): with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: From 95009bdf15ac6f1f7142ec104ea76f23cdeee186 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 21 Jun 2025 21:26:29 +0200 Subject: [PATCH 21/34] test: add pytest-mock, introduce fixtures & type hints (#290) * Added pytest-mock to dev dependencies and pre-commit hooks * Introduced InvalidGitHubTokenError for clearer token-validation failures * Refactored tests: * Replaced ad-hoc mocks with reusable fixtures * Parametrised URL/branch matrices to cut duplication * Added type hints throughout * New coverage: * validate_github_token (happy & error paths) * create_git_command / create_git_auth_header --- .pre-commit-config.yaml | 2 + requirements-dev.txt | 1 + src/gitingest/utils/exceptions.py | 10 + src/gitingest/utils/git_utils.py | 9 +- tests/conftest.py | 55 ++- tests/query_parser/test_git_host_agnostic.py | 123 +++--- tests/query_parser/test_query_parser.py | 255 ++++++------ tests/test_flow_integration.py | 50 ++- tests/test_git_utils.py | 142 +++++++ tests/test_repository_clone.py | 398 ++++++++----------- 10 files changed, 581 insertions(+), 464 deletions(-) create mode 100644 tests/test_git_utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b8b3f228..c8dce118 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -99,6 +99,7 @@ repos: "fastapi[standard]>=0.109.1", pydantic, pytest-asyncio, + pytest-mock, python-dotenv, slowapi, starlette>=0.40.0, @@ -117,6 +118,7 @@ repos: "fastapi[standard]>=0.109.1", pydantic, pytest-asyncio, + pytest-mock, 
python-dotenv, slowapi, starlette>=0.40.0, diff --git a/requirements-dev.txt b/requirements-dev.txt index eb733ff3..b8fd868a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,4 @@ pre-commit pylint pytest pytest-asyncio +pytest-mock diff --git a/src/gitingest/utils/exceptions.py b/src/gitingest/utils/exceptions.py index aade9418..5b9f33b4 100644 --- a/src/gitingest/utils/exceptions.py +++ b/src/gitingest/utils/exceptions.py @@ -35,3 +35,13 @@ class InvalidNotebookError(Exception): def __init__(self, message: str) -> None: super().__init__(message) + + +class InvalidGitHubTokenError(ValueError): + """Exception raised when a GitHub Personal Access Token is malformed.""" + + def __init__(self) -> None: + super().__init__( + "Invalid GitHub token format. Token should start with 'github_pat_' or 'ghp_' " + "followed by at least 36 characters of letters, numbers, and underscores." + ) diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index b3346996..7d18499e 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -5,6 +5,8 @@ import re from typing import List, Optional, Tuple +from gitingest.utils.exceptions import InvalidGitHubTokenError + GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$" @@ -256,11 +258,8 @@ def validate_github_token(token: str) -> None: Raises ------ - ValueError + InvalidGitHubTokenError If the token format is invalid """ if not re.match(GITHUB_PAT_PATTERN, token): - raise ValueError( - "Invalid GitHub token format. Token should start with 'github_pat_' or 'ghp_' " - "followed by at least 36 characters of letters, numbers, and underscores." 
- ) + raise InvalidGitHubTokenError() diff --git a/tests/conftest.py b/tests/conftest.py index 307b705d..50a5a90d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,14 +7,19 @@ import json from pathlib import Path -from typing import Any, Callable, Dict +from typing import Any, Callable, Dict, List +from unittest.mock import AsyncMock import pytest +from pytest_mock import MockerFixture from gitingest.query_parsing import IngestionQuery WriteNotebookFunc = Callable[[str, Dict[str, Any]], Path] +DEMO_URL = "https://github.com/user/repo" +LOCAL_REPO_PATH = "/tmp/repo" + @pytest.fixture def sample_query() -> IngestionQuery: @@ -129,3 +134,51 @@ def _write_notebook(name: str, content: Dict[str, Any]) -> Path: return notebook_path return _write_notebook + + +@pytest.fixture +def stub_branches(mocker: MockerFixture) -> Callable[[List[str]], None]: + """Return a function that stubs git branch discovery to *branches*.""" + + def _factory(branches: List[str]) -> None: + mocker.patch( + "gitingest.utils.git_utils.run_command", + new_callable=AsyncMock, + return_value=("\n".join(f"refs/heads/{b}" for b in branches).encode() + b"\n", b""), + ) + mocker.patch( + "gitingest.utils.git_utils.fetch_remote_branch_list", + new_callable=AsyncMock, + return_value=branches, + ) + + return _factory + + +@pytest.fixture +def repo_exists_true(mocker: MockerFixture) -> AsyncMock: + """Patch `gitingest.cloning.check_repo_exists` to always return ``True``. + + Many cloning-related tests assume that the remote repository exists. This fixture centralises + that behaviour so individual tests no longer need to repeat the same ``mocker.patch`` call. + The mock object is returned so that tests can make assertions on how it was used or override + its behaviour when needed. 
+ """ + return mocker.patch("gitingest.cloning.check_repo_exists", return_value=True) + + +@pytest.fixture +def run_command_mock(mocker: MockerFixture) -> AsyncMock: + """Patch `gitingest.cloning.run_command` with an ``AsyncMock``. + + The mocked function returns a dummy process whose ``communicate`` method yields generic + *stdout* / *stderr* bytes. Tests can still access / tweak the mock via the fixture argument. + """ + mock_exec = mocker.patch("gitingest.cloning.run_command", new_callable=AsyncMock) + + # Provide a default dummy process so most tests don't have to create one. + dummy_process = AsyncMock() + dummy_process.communicate.return_value = (b"output", b"error") + mock_exec.return_value = dummy_process + + return mock_exec diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index 0039d220..a4c3fe3c 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -5,91 +5,60 @@ Bitbucket, Gitea, and Codeberg, even if the host is omitted. 
""" -from typing import List +from typing import List, Tuple import pytest from gitingest.query_parsing import parse_query +# Repository matrix: (host, user, repo) +_REPOS: List[Tuple[str, str, str]] = [ + ("github.com", "tiangolo", "fastapi"), + ("gitlab.com", "gitlab-org", "gitlab-runner"), + ("bitbucket.org", "na-dna", "llm-knowledge-share"), + ("gitea.com", "xorm", "xorm"), + ("codeberg.org", "forgejo", "forgejo"), +] -@pytest.mark.parametrize( - "urls, expected_user, expected_repo, expected_url", - [ - ( - [ - "https://github.com/tiangolo/fastapi", - "github.com/tiangolo/fastapi", - "tiangolo/fastapi", - ], - "tiangolo", - "fastapi", - "https://github.com/tiangolo/fastapi", - ), - ( - [ - "https://gitlab.com/gitlab-org/gitlab-runner", - "gitlab.com/gitlab-org/gitlab-runner", - "gitlab-org/gitlab-runner", - ], - "gitlab-org", - "gitlab-runner", - "https://gitlab.com/gitlab-org/gitlab-runner", - ), - ( - [ - "https://bitbucket.org/na-dna/llm-knowledge-share", - "bitbucket.org/na-dna/llm-knowledge-share", - "na-dna/llm-knowledge-share", - ], - "na-dna", - "llm-knowledge-share", - "https://bitbucket.org/na-dna/llm-knowledge-share", - ), - ( - [ - "https://gitea.com/xorm/xorm", - "gitea.com/xorm/xorm", - "xorm/xorm", - ], - "xorm", - "xorm", - "https://gitea.com/xorm/xorm", - ), - ( - [ - "https://codeberg.org/forgejo/forgejo", - "codeberg.org/forgejo/forgejo", - "forgejo/forgejo", - ], - "forgejo", - "forgejo", - "https://codeberg.org/forgejo/forgejo", - ), - ], -) + +# Generate cartesian product of repository tuples with URL variants. +@pytest.mark.parametrize("host, user, repo", _REPOS, ids=[f"{h}:{u}/{r}" for h, u, r in _REPOS]) +@pytest.mark.parametrize("variant", ["full", "noscheme", "slug"]) @pytest.mark.asyncio async def test_parse_query_without_host( - urls: List[str], - expected_user: str, - expected_repo: str, - expected_url: str, + host: str, + user: str, + repo: str, + variant: str, ) -> None: - """ - Test `parse_query` for Git host agnosticism. 
+ """Verify that `parse_query` handles URLs, host-omitted URLs and raw slugs.""" + + # Build the input URL based on the selected variant + if variant == "full": + url = f"https://{host}/{user}/{repo}" + elif variant == "noscheme": + url = f"{host}/{user}/{repo}" + else: # "slug" + url = f"{user}/{repo}" + + expected_url = f"https://{host}/{user}/{repo}" + + query = await parse_query(url, max_file_size=50, from_web=True) + + # Compare against the canonical dict while ignoring unpredictable fields. + actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"}) - Given multiple URL variations for the same user/repo on different Git hosts (with or without host names): - When `parse_query` is called with each variation, - Then the parser should correctly identify the user, repo, canonical URL, and other default fields. - """ - for url in urls: - query = await parse_query(url, max_file_size=50, from_web=True) + expected = { + "user_name": user, + "repo_name": repo, + "url": expected_url, + "slug": f"{user}-{repo}", + "subpath": "/", + "type": None, + "branch": None, + "commit": None, + "max_file_size": 50, + "include_patterns": None, + } - assert query.user_name == expected_user - assert query.repo_name == expected_repo - assert query.url == expected_url - assert query.slug == f"{expected_user}-{expected_repo}" - assert query.id is not None - assert query.subpath == "/" - assert query.branch is None - assert query.commit is None - assert query.type is None + assert actual == expected diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index b7f15f22..9c2af01c 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -6,62 +6,43 @@ """ from pathlib import Path -from unittest.mock import AsyncMock, patch +from typing import Callable, List, Optional +from unittest.mock import AsyncMock import pytest +from pytest_mock import MockerFixture from gitingest.query_parsing import 
_parse_patterns, _parse_remote_repo, parse_query +from gitingest.schemas.ingestion_schema import IngestionQuery from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS +from tests.conftest import DEMO_URL +URLS_HTTPS: List[str] = [ + DEMO_URL, + "https://gitlab.com/user/repo", + "https://bitbucket.org/user/repo", + "https://gitea.com/user/repo", + "https://codeberg.org/user/repo", + "https://gist.github.com/user/repo", +] -@pytest.mark.asyncio -async def test_parse_url_valid_https() -> None: - """ - Test `_parse_remote_repo` with valid HTTPS URLs. - - Given various HTTPS URLs on supported platforms: - When `_parse_remote_repo` is called, - Then user name, repo name, and the URL should be extracted correctly. - """ - test_cases = [ - "https://github.com/user/repo", - "https://gitlab.com/user/repo", - "https://bitbucket.org/user/repo", - "https://gitea.com/user/repo", - "https://codeberg.org/user/repo", - "https://gist.github.com/user/repo", - ] - for url in test_cases: - query = await _parse_remote_repo(url) - - assert query.user_name == "user" - assert query.repo_name == "repo" - assert query.url == url +URLS_HTTP: List[str] = [url.replace("https://", "http://") for url in URLS_HTTPS] +@pytest.mark.parametrize("url", URLS_HTTPS, ids=lambda u: u) @pytest.mark.asyncio -async def test_parse_url_valid_http() -> None: - """ - Test `_parse_remote_repo` with valid HTTP URLs. +async def test_parse_url_valid_https(url: str) -> None: + """Valid HTTPS URLs parse correctly and `query.url` equals the input.""" + query = await _assert_basic_repo_fields(url) - Given various HTTP URLs on supported platforms: - When `_parse_remote_repo` is called, - Then user name, repo name, and the slug should be extracted correctly. 
- """ - test_cases = [ - "http://github.com/user/repo", - "http://gitlab.com/user/repo", - "http://bitbucket.org/user/repo", - "http://gitea.com/user/repo", - "http://codeberg.org/user/repo", - "http://gist.github.com/user/repo", - ] - for url in test_cases: - query = await _parse_remote_repo(url) + assert query.url == url # HTTPS: canonical URL should equal input - assert query.user_name == "user" - assert query.repo_name == "repo" - assert query.slug == "user-repo" + +@pytest.mark.parametrize("url", URLS_HTTP, ids=lambda u: u) +@pytest.mark.asyncio +async def test_parse_url_valid_http(url: str) -> None: + """Valid HTTP URLs parse correctly (slug check only).""" + await _assert_basic_repo_fields(url) @pytest.mark.asyncio @@ -74,13 +55,14 @@ async def test_parse_url_invalid() -> None: Then a ValueError should be raised indicating an invalid repository URL. """ url = "https://github.com" + with pytest.raises(ValueError, match="Invalid repository URL"): await _parse_remote_repo(url) @pytest.mark.asyncio -@pytest.mark.parametrize("url", ["https://github.com/user/repo", "https://gitlab.com/user/repo"]) -async def test_parse_query_basic(url): +@pytest.mark.parametrize("url", [DEMO_URL, "https://gitlab.com/user/repo"]) +async def test_parse_query_basic(url: str) -> None: """ Test `parse_query` with a basic valid repository URL. @@ -122,8 +104,7 @@ async def test_parse_query_include_pattern() -> None: When `parse_query` is called, Then the include pattern should be set, and default ignore patterns remain applied. 
""" - url = "https://github.com/user/repo" - query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py") + query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="*.py") assert query.include_patterns == {"*.py"} assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @@ -138,13 +119,12 @@ async def test_parse_query_invalid_pattern() -> None: When `parse_query` is called, Then a ValueError should be raised indicating invalid characters. """ - url = "https://github.com/user/repo" with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") + await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") @pytest.mark.asyncio -async def test_parse_url_with_subpaths() -> None: +async def test_parse_url_with_subpaths(stub_branches: Callable[[List[str]], None]) -> None: """ Test `_parse_remote_repo` with a URL containing branch and subpath. @@ -152,19 +132,16 @@ async def test_parse_url_with_subpaths() -> None: When `_parse_remote_repo` is called with remote branch fetching, Then user, repo, branch, and subpath should be identified correctly. 
""" - url = "https://github.com/user/repo/tree/main/subdir/file" - with patch("gitingest.utils.git_utils.run_command", new_callable=AsyncMock) as mock_run_command: - mock_run_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") - with patch( - "gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock - ) as mock_fetch_branches: - mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] - query = await _parse_remote_repo(url) + url = DEMO_URL + "/tree/main/subdir/file" + + stub_branches(["main", "dev", "feature-branch"]) - assert query.user_name == "user" - assert query.repo_name == "repo" - assert query.branch == "main" - assert query.subpath == "/subdir/file" + query = await _assert_basic_repo_fields(url) + + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.branch == "main" + assert query.subpath == "/subdir/file" @pytest.mark.asyncio @@ -177,6 +154,7 @@ async def test_parse_url_invalid_repo_structure() -> None: Then a ValueError should be raised indicating an invalid repository URL. """ url = "https://github.com/user" + with pytest.raises(ValueError, match="Invalid repository URL"): await _parse_remote_repo(url) @@ -204,6 +182,7 @@ def test_parse_patterns_invalid_characters() -> None: Then a ValueError should be raised indicating invalid pattern syntax. """ patterns = "*.py;rm -rf" + with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): _parse_patterns(patterns) @@ -217,8 +196,7 @@ async def test_parse_query_with_large_file_size() -> None: When `parse_query` is called, Then `max_file_size` should be set correctly and default ignore patterns remain unchanged. 
""" - url = "https://github.com/user/repo" - query = await parse_query(url, max_file_size=10**9, from_web=True) + query = await parse_query(DEMO_URL, max_file_size=10**9, from_web=True) assert query.max_file_size == 10**9 assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @@ -233,8 +211,7 @@ async def test_parse_query_empty_patterns() -> None: When `parse_query` is called, Then include_patterns becomes None and default ignore patterns apply. """ - url = "https://github.com/user/repo" - query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") + query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") assert query.include_patterns is None assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @@ -249,9 +226,8 @@ async def test_parse_query_include_and_ignore_overlap() -> None: When `parse_query` is called, Then "*.py" should be removed from ignore patterns. """ - url = "https://github.com/user/repo" query = await parse_query( - url, + DEMO_URL, max_file_size=50, from_web=True, include_patterns="*.py", @@ -308,23 +284,26 @@ async def test_parse_query_empty_source() -> None: When `parse_query` is called, Then a ValueError should be raised indicating an invalid repository URL. 
""" + url = "" + with pytest.raises(ValueError, match="Invalid repository URL"): - await parse_query("", max_file_size=100, from_web=True) + await parse_query(url, max_file_size=100, from_web=True) @pytest.mark.asyncio @pytest.mark.parametrize( - "url, expected_branch, expected_commit", + "path, expected_branch, expected_commit", [ - ("https://github.com/user/repo/tree/main", "main", None), - ( - "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234", - None, - "abcd1234abcd1234abcd1234abcd1234abcd1234", - ), + ("/tree/main", "main", None), + ("/tree/abcd1234abcd1234abcd1234abcd1234abcd1234", None, "abcd1234abcd1234abcd1234abcd1234abcd1234"), ], ) -async def test_parse_url_branch_and_commit_distinction(url: str, expected_branch: str, expected_commit: str) -> None: +async def test_parse_url_branch_and_commit_distinction( + path: str, + expected_branch: str, + expected_commit: str, + stub_branches: Callable[[List[str]], None], +) -> None: """ Test `_parse_remote_repo` distinguishing branch vs. commit hash. @@ -332,19 +311,13 @@ async def test_parse_url_branch_and_commit_distinction(url: str, expected_branch When `_parse_remote_repo` is called with branch fetching, Then the function should correctly set `branch` or `commit` based on the URL content. 
""" - with patch("gitingest.utils.git_utils.run_command", new_callable=AsyncMock) as mock_run_command: - # Mocking the return value to include 'main' and some additional branches - mock_run_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") - with patch( - "gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock - ) as mock_fetch_branches: - mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] + stub_branches(["main", "dev", "feature-branch"]) - query = await _parse_remote_repo(url) + url = DEMO_URL + path + query = await _assert_basic_repo_fields(url) - # Verify that `branch` and `commit` match our expectations - assert query.branch == expected_branch - assert query.commit == expected_commit + assert query.branch == expected_branch + assert query.commit == expected_commit @pytest.mark.asyncio @@ -372,12 +345,12 @@ async def test_parse_url_with_query_and_fragment() -> None: When `_parse_remote_repo` is called, Then those parts should be stripped, leaving a clean user/repo URL. """ - url = "https://github.com/user/repo?arg=value#fragment" + url = DEMO_URL + "?arg=value#fragment" query = await _parse_remote_repo(url) assert query.user_name == "user" assert query.repo_name == "repo" - assert query.url == "https://github.com/user/repo" # URL should be cleaned + assert query.url == DEMO_URL # URL should be cleaned @pytest.mark.asyncio @@ -390,6 +363,7 @@ async def test_parse_url_unsupported_host() -> None: Then a ValueError should be raised for the unknown domain. 
""" url = "https://only-domain.com" + with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"): await _parse_remote_repo(url) @@ -419,14 +393,19 @@ async def test_parse_query_with_branch() -> None: @pytest.mark.asyncio @pytest.mark.parametrize( - "url, expected_branch, expected_subpath", + "path, expected_branch, expected_subpath", [ - ("https://github.com/user/repo/tree/main/src", "main", "/src"), - ("https://github.com/user/repo/tree/fix1", "fix1", "/"), - ("https://github.com/user/repo/tree/nonexistent-branch/src", "nonexistent-branch", "/src"), + ("/tree/main/src", "main", "/src"), + ("/tree/fix1", "fix1", "/"), + ("/tree/nonexistent-branch/src", "nonexistent-branch", "/src"), ], ) -async def test_parse_repo_source_with_failed_git_command(url, expected_branch, expected_subpath): +async def test_parse_repo_source_with_failed_git_command( + path: str, + expected_branch: str, + expected_subpath: str, + mocker: MockerFixture, +) -> None: """ Test `_parse_remote_repo` when git fetch fails. @@ -434,52 +413,62 @@ async def test_parse_repo_source_with_failed_git_command(url, expected_branch, e When `_parse_remote_repo` is called, Then it should fall back to path components for branch identification. 
""" - with patch("gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: - mock_fetch_branches.side_effect = Exception("Failed to fetch branch list") + url = DEMO_URL + path - with pytest.warns( - RuntimeWarning, - match="Warning: Failed to fetch branch list: Command failed: " - "git ls-remote --heads https://github.com/user/repo", - ): + mock_fetch_branches = mocker.patch("gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock) + mock_fetch_branches.side_effect = Exception("Failed to fetch branch list") - query = await _parse_remote_repo(url) + with pytest.warns( + RuntimeWarning, + match="Warning: Failed to fetch branch list: Command failed: " + "git ls-remote --heads https://github.com/user/repo", + ): + query = await _parse_remote_repo(url) - assert query.branch == expected_branch - assert query.subpath == expected_subpath + assert query.branch == expected_branch + assert query.subpath == expected_subpath @pytest.mark.asyncio @pytest.mark.parametrize( - "url, expected_branch, expected_subpath", + ("path", "expected_branch", "expected_subpath"), [ - ("https://github.com/user/repo/tree/feature/fix1/src", "feature/fix1", "/src"), - ("https://github.com/user/repo/tree/main/src", "main", "/src"), - ("https://github.com/user/repo", None, "/"), # No - ("https://github.com/user/repo/tree/nonexistent-branch/src", None, "/"), # Non-existent branch - ("https://github.com/user/repo/tree/fix", "fix", "/"), - ("https://github.com/user/repo/blob/fix/page.html", "fix", "/page.html"), + ("/tree/feature/fix1/src", "feature/fix1", "/src"), + ("/tree/main/src", "main", "/src"), + ("", None, "/"), + ("/tree/nonexistent-branch/src", None, "/"), + ("/tree/fix", "fix", "/"), + ("/blob/fix/page.html", "fix", "/page.html"), ], ) -async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, expected_subpath): +async def test_parse_repo_source_with_various_url_patterns( + path: str, + expected_branch: 
Optional[str], + expected_subpath: str, + stub_branches: Callable[[List[str]], None], +) -> None: """ - Test `_parse_remote_repo` with various URL patterns. + `_parse_remote_repo` should detect (or reject) a branch and resolve the + sub-path for various GitHub-style URL permutations. - Given multiple branch/blob patterns (including nonexistent branches): - When `_parse_remote_repo` is called with remote branch fetching, - Then the correct branch/subpath should be set or None if unmatched. - """ - with patch("gitingest.utils.git_utils.run_command", new_callable=AsyncMock) as mock_run_command: - with patch( - "gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock - ) as mock_fetch_branches: - mock_run_command.return_value = ( - b"refs/heads/feature/fix1\nrefs/heads/main\nrefs/heads/feature-branch\nrefs/heads/fix\n", - b"", - ) - mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch"] - - query = await _parse_remote_repo(url) - - assert query.branch == expected_branch - assert query.subpath == expected_subpath + Branch discovery is stubbed so that only names passed to `stub_branches` are considered "remote". 
+ """ + stub_branches(["feature/fix1", "main", "feature-branch", "fix"]) + + url = DEMO_URL + path + query = await _assert_basic_repo_fields(url) + + assert query.branch == expected_branch + assert query.subpath == expected_subpath + + +async def _assert_basic_repo_fields(url: str) -> IngestionQuery: + """Run _parse_remote_repo and assert user, repo and slug are parsed.""" + + query = await _parse_remote_repo(url) + + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.slug == "user-repo" + + return query diff --git a/tests/test_flow_integration.py b/tests/test_flow_integration.py index c85f63ae..7821b60a 100644 --- a/tests/test_flow_integration.py +++ b/tests/test_flow_integration.py @@ -3,10 +3,12 @@ import shutil from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from unittest.mock import patch +from typing import Generator import pytest from fastapi.testclient import TestClient +from pytest import FixtureRequest +from pytest_mock import MockerFixture from src.server.main import app @@ -15,30 +17,33 @@ @pytest.fixture(scope="module") -def test_client(): +def test_client() -> Generator[TestClient, None, None]: """Create a test client fixture.""" with TestClient(app) as client_instance: client_instance.headers.update({"Host": "localhost"}) yield client_instance -@pytest.fixture(scope="module", autouse=True) -def mock_static_files(): +@pytest.fixture(autouse=True) +def mock_static_files(mocker: MockerFixture) -> Generator[None, None, None]: """Mock the static file mount to avoid directory errors.""" - with patch("src.server.main.StaticFiles") as mock_static: - mock_static.return_value = None # Mocks the StaticFiles response - yield mock_static + mock_static = mocker.patch("src.server.main.StaticFiles", autospec=True) + mock_static.return_value = None + yield mock_static -@pytest.fixture(scope="module", autouse=True) -def mock_templates(): +@pytest.fixture(autouse=True) +def mock_templates(mocker: 
MockerFixture) -> Generator[None, None, None]: """Mock Jinja2 template rendering to bypass actual file loading.""" - with patch("starlette.templating.Jinja2Templates.TemplateResponse") as mock_template: - mock_template.return_value = "Mocked Template Response" - yield mock_template + mock_template = mocker.patch("starlette.templating.Jinja2Templates.TemplateResponse", autospec=True) + mock_template.return_value = "Mocked Template Response" + yield mock_template -def cleanup_temp_directories(): +@pytest.fixture(scope="module", autouse=True) +def cleanup_tmp_dir() -> Generator[None, None, None]: + """Remove /tmp/gitingest after this test-module is done.""" + yield # run tests temp_dir = Path("/tmp/gitingest") if temp_dir.exists(): try: @@ -47,15 +52,8 @@ def cleanup_temp_directories(): print(f"Error cleaning up {temp_dir}: {exc}") -@pytest.fixture(scope="module", autouse=True) -def cleanup(): - """Cleanup temporary directories after tests.""" - yield - cleanup_temp_directories() - - @pytest.mark.asyncio -async def test_remote_repository_analysis(request): +async def test_remote_repository_analysis(request: FixtureRequest) -> None: """Test the complete flow of analyzing a remote repository.""" client = request.getfixturevalue("test_client") form_data = { @@ -72,7 +70,7 @@ async def test_remote_repository_analysis(request): @pytest.mark.asyncio -async def test_invalid_repository_url(request): +async def test_invalid_repository_url(request: FixtureRequest) -> None: """Test handling of an invalid repository URL.""" client = request.getfixturevalue("test_client") form_data = { @@ -89,7 +87,7 @@ async def test_invalid_repository_url(request): @pytest.mark.asyncio -async def test_large_repository(request): +async def test_large_repository(request: FixtureRequest) -> None: """Simulate analysis of a large repository with nested folders.""" client = request.getfixturevalue("test_client") form_data = { @@ -106,7 +104,7 @@ async def test_large_repository(request): 
@pytest.mark.asyncio -async def test_concurrent_requests(request): +async def test_concurrent_requests(request: FixtureRequest) -> None: """Test handling of multiple concurrent requests.""" client = request.getfixturevalue("test_client") @@ -129,7 +127,7 @@ def make_request(): @pytest.mark.asyncio -async def test_large_file_handling(request): +async def test_large_file_handling(request: FixtureRequest) -> None: """Test handling of repositories with large files.""" client = request.getfixturevalue("test_client") form_data = { @@ -146,7 +144,7 @@ async def test_large_file_handling(request): @pytest.mark.asyncio -async def test_repository_with_patterns(request): +async def test_repository_with_patterns(request: FixtureRequest) -> None: """Test repository analysis with include/exclude patterns.""" client = request.getfixturevalue("test_client") form_data = { diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py new file mode 100644 index 00000000..9d4e842d --- /dev/null +++ b/tests/test_git_utils.py @@ -0,0 +1,142 @@ +""" +Tests for the `git_utils` module. + +These tests validate the `validate_github_token` function, which ensures that +GitHub personal access tokens (PATs) are properly formatted. 
+""" + +import base64 + +import pytest + +from gitingest.utils.exceptions import InvalidGitHubTokenError +from gitingest.utils.git_utils import ( + create_git_auth_header, + create_git_command, + validate_github_token, +) + + +@pytest.mark.parametrize( + "token", + [ + # Valid tokens: correct prefixes and at least 36 allowed characters afterwards + "github_pat_" + "a" * 36, + "ghp_" + "A" * 36, + "github_pat_1234567890abcdef1234567890abcdef1234", + ], +) +def test_validate_github_token_valid(token): + """validate_github_token should accept properly-formatted tokens.""" + # Should not raise any exception + validate_github_token(token) + + +@pytest.mark.parametrize( + "token", + [ + "github_pat_short", # Too short after prefix + "ghp_" + "b" * 35, # one character short + "invalidprefix_" + "c" * 36, # Wrong prefix + "github_pat_" + "!" * 36, # Disallowed characters + "", # Empty string + ], +) +def test_validate_github_token_invalid(token): + """validate_github_token should raise ValueError on malformed tokens.""" + with pytest.raises(InvalidGitHubTokenError): + validate_github_token(token) + + +@pytest.mark.parametrize( + "base_cmd, local_path, url, token, expected_suffix", + [ + ( + ["git", "clone"], + "/some/path", + "https://github.com/owner/repo.git", + None, + [], # No auth header expected when token is None + ), + ( + ["git", "clone"], + "/some/path", + "https://github.com/owner/repo.git", + "ghp_" + "d" * 36, + [ + "-c", + create_git_auth_header("ghp_" + "d" * 36), + ], # Auth header expected for GitHub URL + token + ), + ( + ["git", "clone"], + "/some/path", + "https://gitlab.com/owner/repo.git", + "ghp_" + "e" * 36, + [], # No auth header for non-GitHub URL even if token provided + ), + ], +) +def test_create_git_command(base_cmd, local_path, url, token, expected_suffix): + """create_git_command should build the correct command list based on inputs.""" + cmd = create_git_command(base_cmd, local_path, url, token) + + # The command should start with base_cmd 
and the -C option + expected_prefix = base_cmd + ["-C", local_path] + assert cmd[: len(expected_prefix)] == expected_prefix + + # The suffix (anything after prefix) should match expected + assert cmd[len(expected_prefix) :] == expected_suffix + + +def test_create_git_command_invalid_token(): + """Supplying an invalid token for a GitHub URL should raise ValueError.""" + with pytest.raises(InvalidGitHubTokenError): + create_git_command( + ["git", "clone"], + "/some/path", + "https://github.com/owner/repo.git", + "invalid_token", + ) + + +@pytest.mark.parametrize( + "token", + [ + "ghp_abcdefghijklmnopqrstuvwxyz012345", # typical ghp_ token + "github_pat_1234567890abcdef1234567890abcdef1234", + ], +) +def test_create_git_auth_header(token): + """create_git_auth_header should produce correct base64-encoded header.""" + header = create_git_auth_header(token) + expected_basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() + expected = f"http.https://github.com/.extraheader=Authorization: Basic {expected_basic}" + assert header == expected + + +@pytest.mark.parametrize( + "url, token, should_call", + [ + ("https://github.com/foo/bar.git", "ghp_" + "f" * 36, True), + ("https://github.com/foo/bar.git", None, False), + ("https://gitlab.com/foo/bar.git", "ghp_" + "g" * 36, False), + ], +) +def test_create_git_command_helper_calls(mocker, url, token, should_call): + """Verify validate_github_token & create_git_auth_header are invoked only when appropriate.""" + + validate_mock = mocker.patch("gitingest.utils.git_utils.validate_github_token") + header_mock = mocker.patch("gitingest.utils.git_utils.create_git_auth_header", return_value="HEADER") + + cmd = create_git_command(["git", "clone"], "/tmp", url, token) + + if should_call: + validate_mock.assert_called_once_with(token) + header_mock.assert_called_once_with(token) + assert "HEADER" in cmd + else: + validate_mock.assert_not_called() + header_mock.assert_not_called() + # HEADER should not be included in 
command list + assert "HEADER" not in cmd diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index 787456b1..d5d395c8 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -8,18 +8,24 @@ import asyncio import os from pathlib import Path -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock import pytest +from pytest_mock import MockerFixture from gitingest.cloning import clone_repo from gitingest.schemas import CloneConfig from gitingest.utils.exceptions import AsyncTimeoutError from gitingest.utils.git_utils import check_repo_exists +from tests.conftest import DEMO_URL, LOCAL_REPO_PATH + +# All cloning-related tests assume (unless explicitly overridden) that the repository exists. +# Apply the check-repo patch automatically so individual tests don't need to repeat it. +pytestmark = pytest.mark.usefixtures("repo_exists_true") @pytest.mark.asyncio -async def test_clone_with_commit() -> None: +async def test_clone_with_commit(repo_exists_true: AsyncMock, run_command_mock: AsyncMock) -> None: """ Test cloning a repository with a specific commit hash. @@ -28,26 +34,20 @@ async def test_clone_with_commit() -> None: Then the repository should be cloned and checked out at that commit. 
""" clone_config = CloneConfig( - url="https://github.com/user/repo", - local_path="/tmp/repo", + url=DEMO_URL, + local_path=LOCAL_REPO_PATH, commit="a" * 40, # Simulating a valid commit hash branch="main", ) - with patch("gitingest.cloning.check_repo_exists", return_value=True) as mock_check: - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - mock_process = AsyncMock() - mock_process.communicate.return_value = (b"output", b"error") - mock_exec.return_value = mock_process - - await clone_repo(clone_config) + await clone_repo(clone_config) - mock_check.assert_called_once_with(clone_config.url, token=None) - assert mock_exec.call_count == 2 # Clone and checkout calls + repo_exists_true.assert_called_once_with(clone_config.url, token=None) + assert run_command_mock.call_count == 2 # Clone and checkout calls @pytest.mark.asyncio -async def test_clone_without_commit() -> None: +async def test_clone_without_commit(repo_exists_true: AsyncMock, run_command_mock: AsyncMock) -> None: """ Test cloning a repository when no commit hash is provided. @@ -55,27 +55,16 @@ async def test_clone_without_commit() -> None: When `clone_repo` is called, Then only the clone_repo operation should be performed (no checkout). 
""" - clone_config = CloneConfig( - url="https://github.com/user/repo", - local_path="/tmp/repo", - commit=None, - branch="main", - ) + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit=None, branch="main") - with patch("gitingest.cloning.check_repo_exists", return_value=True) as mock_check: - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - mock_process = AsyncMock() - mock_process.communicate.return_value = (b"output", b"error") - mock_exec.return_value = mock_process - - await clone_repo(clone_config) + await clone_repo(clone_config) - mock_check.assert_called_once_with(clone_config.url, token=None) - assert mock_exec.call_count == 1 # Only clone call + repo_exists_true.assert_called_once_with(clone_config.url, token=None) + assert run_command_mock.call_count == 1 # Only clone call @pytest.mark.asyncio -async def test_clone_nonexistent_repository() -> None: +async def test_clone_nonexistent_repository(repo_exists_true: AsyncMock) -> None: """ Test cloning a nonexistent repository URL. 
@@ -85,15 +74,17 @@ async def test_clone_nonexistent_repository() -> None: """ clone_config = CloneConfig( url="https://github.com/user/nonexistent-repo", - local_path="/tmp/repo", + local_path=LOCAL_REPO_PATH, commit=None, branch="main", ) - with patch("gitingest.cloning.check_repo_exists", return_value=False) as mock_check: - with pytest.raises(ValueError, match="Repository not found"): - await clone_repo(clone_config) + # Override the default fixture behaviour for this test + repo_exists_true.return_value = False + + with pytest.raises(ValueError, match="Repository not found"): + await clone_repo(clone_config) - mock_check.assert_called_once_with(clone_config.url) + repo_exists_true.assert_called_once_with(clone_config.url, token=None) @pytest.mark.asyncio @@ -105,7 +96,7 @@ async def test_clone_nonexistent_repository() -> None: (b"HTTP/1.1 200 OK\n", 1, False), # Failed request ], ) -async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: bool) -> None: +async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: bool, mocker: MockerFixture) -> None: """ Test the `check_repo_exists` function with different Git HTTP responses. @@ -113,22 +104,19 @@ async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: When `check_repo_exists` is called, Then it should correctly indicate whether the repository exists. 
""" - url = "https://github.com/user/repo" + mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) + mock_process = AsyncMock() + mock_process.communicate.return_value = (mock_stdout, b"") + mock_process.returncode = return_code + mock_exec.return_value = mock_process - with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec: - mock_process = AsyncMock() - # Mock the subprocess output - mock_process.communicate.return_value = (mock_stdout, b"") - mock_process.returncode = return_code - mock_exec.return_value = mock_process + repo_exists = await check_repo_exists(DEMO_URL) - repo_exists = await check_repo_exists(url) - - assert repo_exists is expected + assert repo_exists is expected @pytest.mark.asyncio -async def test_clone_with_custom_branch() -> None: +async def test_clone_with_custom_branch(run_command_mock: AsyncMock) -> None: """ Test cloning a repository with a specified custom branch. @@ -136,25 +124,24 @@ async def test_clone_with_custom_branch() -> None: When `clone_repo` is called, Then the repository should be cloned shallowly to that branch. 
""" - clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", branch="feature-branch") - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, branch="feature-branch") - mock_exec.assert_called_once_with( - "git", - "clone", - "--single-branch", - "--depth=1", - "--branch", - "feature-branch", - clone_config.url, - clone_config.local_path, - ) + await clone_repo(clone_config) + + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--depth=1", + "--branch", + "feature-branch", + clone_config.url, + clone_config.local_path, + ) @pytest.mark.asyncio -async def test_git_command_failure() -> None: +async def test_git_command_failure(run_command_mock: AsyncMock) -> None: """ Test cloning when the Git command fails during execution. @@ -162,18 +149,16 @@ async def test_git_command_failure() -> None: When `clone_repo` is called, Then a RuntimeError should be raised with the correct message. """ - clone_config = CloneConfig( - url="https://github.com/user/repo", - local_path="/tmp/repo", - ) - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", side_effect=RuntimeError("Git command failed")): - with pytest.raises(RuntimeError, match="Git command failed"): - await clone_repo(clone_config) + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) + + run_command_mock.side_effect = RuntimeError("Git command failed") + + with pytest.raises(RuntimeError, match="Git command failed"): + await clone_repo(clone_config) @pytest.mark.asyncio -async def test_clone_default_shallow_clone() -> None: +async def test_clone_default_shallow_clone(run_command_mock: AsyncMock) -> None: """ Test cloning a repository with the default shallow clone options. 
@@ -181,27 +166,22 @@ async def test_clone_default_shallow_clone() -> None: When `clone_repo` is called, Then the repository should be cloned with `--depth=1` and `--single-branch`. """ - clone_config = CloneConfig( - url="https://github.com/user/repo", - local_path="/tmp/repo", - ) + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone_repo(clone_config) - mock_exec.assert_called_once_with( - "git", - "clone", - "--single-branch", - "--depth=1", - clone_config.url, - clone_config.local_path, - ) + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--depth=1", + clone_config.url, + clone_config.local_path, + ) @pytest.mark.asyncio -async def test_clone_commit_without_branch() -> None: +async def test_clone_commit_without_branch(run_command_mock: AsyncMock) -> None: """ Test cloning when a commit hash is provided but no branch is specified. @@ -209,22 +189,18 @@ async def test_clone_commit_without_branch() -> None: When `clone_repo` is called, Then the repository should be cloned and checked out at that commit. 
""" - clone_config = CloneConfig( - url="https://github.com/user/repo", - local_path="/tmp/repo", - commit="a" * 40, # Simulating a valid commit hash - ) - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + # Simulating a valid commit hash + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit="a" * 40) - assert mock_exec.call_count == 2 # Clone and checkout calls - mock_exec.assert_any_call("git", "clone", "--single-branch", clone_config.url, clone_config.local_path) - mock_exec.assert_any_call("git", "-C", clone_config.local_path, "checkout", clone_config.commit) + await clone_repo(clone_config) + + assert run_command_mock.call_count == 2 # Clone and checkout calls + run_command_mock.assert_any_call("git", "clone", "--single-branch", clone_config.url, clone_config.local_path) + run_command_mock.assert_any_call("git", "-C", clone_config.local_path, "checkout", clone_config.commit) @pytest.mark.asyncio -async def test_check_repo_exists_with_redirect() -> None: +async def test_check_repo_exists_with_redirect(mocker: MockerFixture) -> None: """ Test `check_repo_exists` when a redirect (302) is returned. @@ -232,20 +208,19 @@ async def test_check_repo_exists_with_redirect() -> None: When `check_repo_exists` is called, Then it should return `False`, indicating the repo is inaccessible. 
""" - url = "https://github.com/user/repo" - with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec: - mock_process = AsyncMock() - mock_process.communicate.return_value = (b"HTTP/1.1 302 Found\n", b"") - mock_process.returncode = 0 # Simulate successful request - mock_exec.return_value = mock_process + mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) + mock_process = AsyncMock() + mock_process.communicate.return_value = (b"HTTP/1.1 302 Found\n", b"") + mock_process.returncode = 0 # Simulate successful request + mock_exec.return_value = mock_process - repo_exists = await check_repo_exists(url) + repo_exists = await check_repo_exists(DEMO_URL) - assert repo_exists is False + assert repo_exists is False @pytest.mark.asyncio -async def test_check_repo_exists_with_permanent_redirect() -> None: +async def test_check_repo_exists_with_permanent_redirect(mocker: MockerFixture) -> None: """ Test `check_repo_exists` when a permanent redirect (301) is returned. @@ -253,20 +228,19 @@ async def test_check_repo_exists_with_permanent_redirect() -> None: When `check_repo_exists` is called, Then it should return `True`, indicating the repo may exist at the new location. 
""" - url = "https://github.com/user/repo" - with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec: - mock_process = AsyncMock() - mock_process.communicate.return_value = (b"HTTP/1.1 301 Found\n", b"") - mock_process.returncode = 0 # Simulate successful request - mock_exec.return_value = mock_process + mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) + mock_process = AsyncMock() + mock_process.communicate.return_value = (b"HTTP/1.1 301 Found\n", b"") + mock_process.returncode = 0 # Simulate successful request + mock_exec.return_value = mock_process - repo_exists = await check_repo_exists(url) + repo_exists = await check_repo_exists(DEMO_URL) - assert repo_exists + assert repo_exists @pytest.mark.asyncio -async def test_clone_with_timeout() -> None: +async def test_clone_with_timeout(run_command_mock: AsyncMock) -> None: """ Test cloning a repository when a timeout occurs. @@ -274,17 +248,16 @@ async def test_clone_with_timeout() -> None: When `clone_repo` is called, Then an `AsyncTimeoutError` should be raised to indicate the operation exceeded time limits. """ - clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo") + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) + + run_command_mock.side_effect = asyncio.TimeoutError - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - mock_exec.side_effect = asyncio.TimeoutError - with pytest.raises(AsyncTimeoutError, match="Operation timed out after"): - await clone_repo(clone_config) + with pytest.raises(AsyncTimeoutError, match="Operation timed out after"): + await clone_repo(clone_config) @pytest.mark.asyncio -async def test_clone_specific_branch(tmp_path): +async def test_clone_specific_branch(tmp_path: Path) -> None: """ Test cloning a specific branch of a repository. 
@@ -295,21 +268,18 @@ async def test_clone_specific_branch(tmp_path): repo_url = "https://github.com/cyclotruc/gitingest.git" branch_name = "main" local_path = tmp_path / "gitingest" - clone_config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) + await clone_repo(clone_config) - # Assertions assert local_path.exists(), "The repository was not cloned successfully." assert local_path.is_dir(), "The cloned repository path is not a directory." - - # Check the current branch current_branch = os.popen(f"git -C {local_path} branch --show-current").read().strip() assert current_branch == branch_name, f"Expected branch '{branch_name}', got '{current_branch}'." @pytest.mark.asyncio -async def test_clone_branch_with_slashes(tmp_path): +async def test_clone_branch_with_slashes(tmp_path: Path, run_command_mock: AsyncMock) -> None: """ Test cloning a branch with slashes in the name. @@ -317,29 +287,26 @@ async def test_clone_branch_with_slashes(tmp_path): When `clone_repo` is called, Then the repository should be cloned and checked out at that branch. 
""" - repo_url = "https://github.com/user/repo" branch_name = "fix/in-operator" local_path = tmp_path / "gitingest" + clone_config = CloneConfig(url=DEMO_URL, local_path=str(local_path), branch=branch_name) - clone_config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) - - mock_exec.assert_called_once_with( - "git", - "clone", - "--single-branch", - "--depth=1", - "--branch", - "fix/in-operator", - clone_config.url, - clone_config.local_path, - ) + await clone_repo(clone_config) + + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--depth=1", + "--branch", + "fix/in-operator", + clone_config.url, + clone_config.local_path, + ) @pytest.mark.asyncio -async def test_clone_creates_parent_directory(tmp_path: Path) -> None: +async def test_clone_creates_parent_directory(tmp_path: Path, run_command_mock: AsyncMock) -> None: """ Test that clone_repo creates parent directories if they don't exist. @@ -348,28 +315,23 @@ async def test_clone_creates_parent_directory(tmp_path: Path) -> None: Then it should create the parent directories before attempting to clone. 
""" nested_path = tmp_path / "deep" / "nested" / "path" / "repo" - clone_config = CloneConfig(url="https://github.com/user/repo", local_path=str(nested_path)) - - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + clone_config = CloneConfig(url=DEMO_URL, local_path=str(nested_path)) - # Verify parent directory was created - assert nested_path.parent.exists() + await clone_repo(clone_config) - # Verify git clone was called with correct parameters - mock_exec.assert_called_once_with( - "git", - "clone", - "--single-branch", - "--depth=1", - clone_config.url, - str(nested_path), - ) + assert nested_path.parent.exists() + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--depth=1", + clone_config.url, + str(nested_path), + ) @pytest.mark.asyncio -async def test_clone_with_specific_subpath() -> None: +async def test_clone_with_specific_subpath(run_command_mock: AsyncMock) -> None: """ Test cloning a repository with a specific subpath. @@ -377,32 +339,30 @@ async def test_clone_with_specific_subpath() -> None: When `clone_repo` is called, Then the repository should be cloned with sparse checkout enabled and the specified subpath. 
""" - clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", subpath="src/docs") + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, subpath="src/docs") - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone_repo(clone_config) - # Verify the clone command includes sparse checkout flags - mock_exec.assert_any_call( - "git", - "clone", - "--single-branch", - "--filter=blob:none", - "--sparse", - "--depth=1", - clone_config.url, - clone_config.local_path, - ) + # Verify the clone command includes sparse checkout flags + run_command_mock.assert_any_call( + "git", + "clone", + "--single-branch", + "--filter=blob:none", + "--sparse", + "--depth=1", + clone_config.url, + clone_config.local_path, + ) - # Verify the sparse-checkout command sets the correct path - mock_exec.assert_any_call("git", "-C", clone_config.local_path, "sparse-checkout", "set", "src/docs") + # Verify the sparse-checkout command sets the correct path + run_command_mock.assert_any_call("git", "-C", clone_config.local_path, "sparse-checkout", "set", "src/docs") - assert mock_exec.call_count == 2 + assert run_command_mock.call_count == 2 @pytest.mark.asyncio -async def test_clone_with_commit_and_subpath() -> None: +async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> None: """ Test cloning a repository with both a specific commit and subpath. @@ -411,45 +371,39 @@ async def test_clone_with_commit_and_subpath() -> None: Then the repository should be cloned with sparse checkout enabled, checked out at the specific commit, and only include the specified subpath. 
""" - clone_config = CloneConfig( - url="https://github.com/user/repo", - local_path="/tmp/repo", - commit="a" * 40, # Simulating a valid commit hash - subpath="src/docs", + # Simulating a valid commit hash + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit="a" * 40, subpath="src/docs") + + await clone_repo(clone_config) + + # Verify the clone command includes sparse checkout flags + run_command_mock.assert_any_call( + "git", + "clone", + "--single-branch", + "--filter=blob:none", + "--sparse", + clone_config.url, + clone_config.local_path, + ) + + # Verify sparse-checkout set + run_command_mock.assert_any_call( + "git", + "-C", + clone_config.local_path, + "sparse-checkout", + "set", + "src/docs", + ) + + # Verify checkout commit + run_command_mock.assert_any_call( + "git", + "-C", + clone_config.local_path, + "checkout", + clone_config.commit, ) - with patch("gitingest.cloning.check_repo_exists", return_value=True): - with patch("gitingest.cloning.run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) - - # Verify the clone command includes sparse checkout flags - mock_exec.assert_any_call( - "git", - "clone", - "--single-branch", - "--filter=blob:none", - "--sparse", - clone_config.url, - clone_config.local_path, - ) - - # Verify sparse-checkout set - mock_exec.assert_any_call( - "git", - "-C", - clone_config.local_path, - "sparse-checkout", - "set", - "src/docs", - ) - - # Verify checkout commit - mock_exec.assert_any_call( - "git", - "-C", - clone_config.local_path, - "checkout", - clone_config.commit, - ) - - assert mock_exec.call_count == 3 + assert run_command_mock.call_count == 3 From 52966287c463a8b179b772200c0a67d2e42eb94e Mon Sep 17 00:00:00 2001 From: Pokey Rule <755842+pokey@users.noreply.github.com> Date: Sun, 22 Jun 2025 00:06:32 +0100 Subject: [PATCH 22/34] Add subdirectory URL syntax documentation to README (#254) * Add subdirectory URL syntax documentation to README Document how to access 
specific subdirectories using GitHub tree URLs with gitingest repo examples. Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index f62ea417..6b9eba3b 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,9 @@ gitingest /path/to/directory # From URL gitingest https://github.com/cyclotruc/gitingest + +# or from specific subdirectory +gitingest https://github.com/cyclotruc/gitingest/tree/main/src/gitingest/utils ``` For private repositories, use the `--token/-t` option. @@ -117,6 +120,9 @@ summary, tree, content = ingest("path/to/directory") # or from URL summary, tree, content = ingest("https://github.com/cyclotruc/gitingest") + +# or from a specific subdirectory +summary, tree, content = ingest("https://github.com/cyclotruc/gitingest/tree/main/src/gitingest/utils") ``` For private repositories, you can pass a token: From 327958eae8377bdc7b97a49624dd27b3e2abf7c1 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 22 Jun 2025 09:56:04 +0200 Subject: [PATCH 23/34] fix(ui): use proper decimal prefixes (kB / MB) in file-size selector (#294) --- src/server/templates/components/git_form.jinja | 2 +- src/static/js/utils.js | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/server/templates/components/git_form.jinja b/src/server/templates/components/git_form.jinja index b45d0f92..2a694adb 100644 --- a/src/server/templates/components/git_form.jinja +++ b/src/server/templates/components/git_form.jinja @@ -107,7 +107,7 @@
= 1024) { - return Math.round(sizeInKB / 1024) + 'mb'; + return Math.round(sizeInKB / 1024) + 'MB'; } - return Math.round(sizeInKB) + 'kb'; + return Math.round(sizeInKB) + 'kB'; } // Initialize slider on page load From 3c5384322c6ba79e3e1ff4f2cab27c931c2b5ed4 Mon Sep 17 00:00:00 2001 From: Carlos Uriel Date: Sun, 22 Jun 2025 08:12:27 -0600 Subject: [PATCH 24/34] fix(ui): update directory-picker logic to compute full file paths (#295) --- src/server/templates/components/result.jinja | 38 ++++++++++++++++---- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/src/server/templates/components/result.jinja b/src/server/templates/components/result.jinja index 151bc02f..55c1f533 100644 --- a/src/server/templates/components/result.jinja +++ b/src/server/templates/components/result.jinja @@ -1,22 +1,48 @@
From 1545dc8f4270f94b56d5ca2735f7b770a9a36d27 Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Mon, 23 Jun 2025 01:30:22 +0200 Subject: [PATCH 28/34] feat: add /llm.txt (#307) --- src/server/main.py | 13 + .../templates/components/badge_new.jinja | 1 + .../templates/components/git_form.jinja | 1 + src/server/templates/components/navbar.jinja | 5 + src/static/llm.txt | 362 ++++++++++++++++++ 5 files changed, 382 insertions(+) create mode 100644 src/server/templates/components/badge_new.jinja create mode 100644 src/static/llm.txt diff --git a/src/server/main.py b/src/server/main.py index d78b3c54..f314a3ad 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -104,6 +104,19 @@ async def robots() -> FileResponse: return FileResponse("static/robots.txt") +@app.get("/llm.txt") +async def llm_txt() -> FileResponse: + """ + Serve the `llm.txt` file to provide information about the site to LLMs. + + Returns + ------- + FileResponse + The `llm.txt` file located in the static directory. + """ + return FileResponse("static/llm.txt") + + # Include routers for modular endpoints app.include_router(index) app.include_router(download) diff --git a/src/server/templates/components/badge_new.jinja b/src/server/templates/components/badge_new.jinja new file mode 100644 index 00000000..dc6dfcad --- /dev/null +++ b/src/server/templates/components/badge_new.jinja @@ -0,0 +1 @@ +NEW diff --git a/src/server/templates/components/git_form.jinja b/src/server/templates/components/git_form.jinja index 5e58280e..bf18804d 100644 --- a/src/server/templates/components/git_form.jinja +++ b/src/server/templates/components/git_form.jinja @@ -129,6 +129,7 @@ onchange="toggleAccessSettings()" {% if token %}checked{% endif %}> + {% include "components/badge_new.jinja" %}