Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add validation for GitHub repo paths in Codebase constructor #529

Open
wants to merge 6 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions src/codegen/sdk/core/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING, Generic, Literal, TypeVar, Unpack, overload
from urllib.parse import urlparse

import plotly.graph_objects as go
import rich.repr
Expand Down Expand Up @@ -177,16 +178,41 @@

# Initialize project with repo_path if projects is None
if repo_path is not None:
# Add validation to detect GitHub repo paths and URLs
# Clean up the path - remove angle brackets and whitespace
cleaned_path = repo_path.strip("<> \t\n")

if not os.path.exists(cleaned_path):
# Parse URL to handle various GitHub URL formats
parsed_url = urlparse(cleaned_path)
path_parts = parsed_url.path.strip("/").split("/")

# Check for GitHub URLs (e.g., https://github.com/owner/repo)
if parsed_url.netloc == "github.com" or parsed_url.path.startswith("github.com/"):
owner_repo = "/".join(path_parts[-2:]) if len(path_parts) >= 2 else ""
msg = f"Path '{repo_path}' is a GitHub URL. To create a Codebase from a GitHub repo, use Codebase.from_repo('{owner_repo}') instead."
raise ValueError(msg)
# Check for GitHub repo paths (e.g., owner/repo)
if len(path_parts) == 2 and "/" in cleaned_path and not parsed_url.scheme and not parsed_url.netloc:
msg = f"Path '{repo_path}' looks like a GitHub repository path. To create a Codebase from a GitHub repo, use Codebase.from_repo('{cleaned_path}') instead."
raise ValueError(msg)

# For non-GitHub paths that don't exist, provide a clearer error
if os.path.isabs(cleaned_path):
msg = f"Local path '{repo_path}' does not exist. Please provide a valid local directory path."
else:
msg = f"Local path '{repo_path}' does not exist. Please provide a valid local directory path (relative paths like '{cleaned_path}' are allowed if they exist)."
raise ValueError(msg)
main_project = ProjectConfig.from_path(repo_path, programming_language=ProgrammingLanguage(language.upper()) if language else None)
projects = [main_project]
else:
main_project = projects[0]

Check failure on line 209 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: Value of type "list[ProjectConfig] | None" is not indexable [index]

# Initialize codebase
self._op = main_project.repo_operator
self.viz = VisualizationManager(op=self._op)
self.repo_path = Path(self._op.repo_path)
self.ctx = CodebaseContext(projects, config=config, io=io, progress=progress)

Check failure on line 215 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "CodebaseContext" has incompatible type "list[ProjectConfig] | None"; expected "list[ProjectConfig]" [arg-type]
self.console = Console(record=True, soft_wrap=True)

@noapidoc
Expand All @@ -202,7 +228,7 @@
yield "nodes", len(self.ctx.nodes)
yield "edges", len(self.ctx.edges)

__rich_repr__.angular = ANGULAR_STYLE

Check failure on line 231 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: "Callable[[Codebase[TSourceFile, TDirectory, TSymbol, TClass, TFunction, TImport, TGlobalVar, TInterface, TTypeAlias, TParameter, TCodeBlock]], Iterable[Any | tuple[Any] | tuple[str, Any] | tuple[str, Any, Any]]]" has no attribute "angular" [attr-defined]

@property
@deprecated("Please do not use the local repo operator directly")
Expand Down Expand Up @@ -244,8 +270,8 @@

@noapidoc
def _symbols(self, symbol_type: SymbolType | None = None) -> list[TSymbol | TClass | TFunction | TGlobalVar]:
matches: list[Symbol] = self.ctx.get_nodes(NodeType.SYMBOL)

Check failure on line 273 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible types in assignment (expression has type "list[Importable[Any]]", variable has type "list[Symbol[Any, Any]]") [assignment]
return [x for x in matches if x.is_top_level and (symbol_type is None or x.symbol_type == symbol_type)]

Check failure on line 274 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: List comprehension has incompatible type List[Symbol[Any, Any]]; expected List[TSymbol | TClass | TFunction | TGlobalVar] [misc]

# =====[ Node Types ]=====
@overload
Expand All @@ -254,7 +280,7 @@
def files(self, *, extensions: Literal["*"]) -> list[File]: ...
@overload
def files(self, *, extensions: None = ...) -> list[TSourceFile]: ...
@proxy_property

Check failure on line 283 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: "cached_property[ProxyProperty[[Codebase[TSourceFile, TDirectory, TSymbol, TClass, TFunction, TImport, TGlobalVar, TInterface, TTypeAlias, TParameter, TCodeBlock], DefaultNamedArg(list[str] | Literal['*'] | None, 'extensions')], list[TSourceFile] | list[File]]]" not callable [operator]
def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[TSourceFile] | list[File]:
"""A list property that returns all files in the codebase.

Expand Down Expand Up @@ -284,15 +310,15 @@
return sort_editables(files, alphabetical=True, dedupe=False)

@cached_property
def codeowners(self) -> list["CodeOwner[TSourceFile]"]:

Check failure on line 313 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: "CodeOwner" expects 7 type arguments, but 1 given [type-arg]
"""List all CodeOwners in the codebase.

Returns:
list[CodeOwners]: A list of CodeOwners objects in the codebase.
"""
if self.G.codeowners_parser is None:

Check failure on line 319 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: "Codebase[TSourceFile, TDirectory, TSymbol, TClass, TFunction, TImport, TGlobalVar, TInterface, TTypeAlias, TParameter, TCodeBlock]" has no attribute "G" [attr-defined]
return []
return CodeOwner.from_parser(self.G.codeowners_parser, lambda *args, **kwargs: self.files(*args, **kwargs))

Check failure on line 321 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: "Codebase[TSourceFile, TDirectory, TSymbol, TClass, TFunction, TImport, TGlobalVar, TInterface, TTypeAlias, TParameter, TCodeBlock]" has no attribute "G" [attr-defined]

@property
def directories(self) -> list[TDirectory]:
Expand All @@ -304,7 +330,7 @@
Returns:
list[TDirectory]: A list of Directory objects in the codebase.
"""
return list(self.ctx.directories.values())

Check failure on line 333 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "list" has incompatible type "dict_values[Path, Directory[Any, Any, Any, Any, Any, Any, Any]]"; expected "Iterable[TDirectory]" [arg-type]

@property
def imports(self) -> list[TImport]:
Expand Down
65 changes: 65 additions & 0 deletions tests/unit/python/codebase/test_codebase_github_repo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os

import pytest

from codegen.sdk.core.codebase import Codebase


def test_codebase_github_repo_path() -> None:
    """Reject an ``owner/repo`` shorthand passed as ``repo_path``."""
    expected_message = "looks like a GitHub repository path"
    with pytest.raises(ValueError, match=expected_message):
        Codebase(repo_path="fastapi/fastapi")


def test_codebase_github_url_formats() -> None:
    """Check that GitHub URLs in several spellings are rejected by the constructor.

    Each entry is passed as ``repo_path`` and must raise ``ValueError`` with a
    message containing "is a GitHub URL".
    """
    urls = [
        "https://github.com/fastapi/fastapi",
        "https://github.com/fastapi/fastapi.git",
        "http://github.com/fastapi/fastapi",
        # Scheme-less form.
        "github.com/fastapi/fastapi",
        # Wrapped in angle brackets, as when pasted from chat or markdown.
        "<https://github.com/fastapi/fastapi>",
        # scp-style SSH remote.
        # NOTE(review): this form has no "//", so urlparse presumably reports
        # neither a scheme nor a netloc for it - confirm the constructor
        # classifies it as a GitHub URL rather than as an owner/repo path.
        "git@github.com:fastapi/fastapi.git",
    ]
    for url in urls:
        with pytest.raises(ValueError, match="is a GitHub URL"):
            Codebase(repo_path=url)


def test_codebase_github_url_with_path() -> None:
    """Reject a GitHub URL that carries extra path components after the repo."""
    deep_url = "https://github.com/fastapi/fastapi/tree/main"
    with pytest.raises(ValueError, match="is a GitHub URL"):
        Codebase(repo_path=deep_url)


def test_codebase_nonexistent_local_paths() -> None:
    """Check the ``ValueError`` messages raised for nonexistent local paths."""
    # Absolute path.
    with pytest.raises(ValueError, match="Local path .* does not exist"):
        Codebase(repo_path="/nonexistent/path")

    # Relative paths: first a literal, then the value formerly held in a
    # separate string variable. Both must mention the relative-path hint.
    relative_hint = "relative paths like"
    for missing in ("path/to/file", "some/code/here"):
        with pytest.raises(ValueError, match=relative_hint):
            Codebase(repo_path=missing)


def test_codebase_valid_path_with_slash(tmp_path) -> None:
    """Check that an existing nested path (one containing slashes) is accepted."""
    import subprocess

    # Turn tmp_path into a git repository.
    subprocess.run(["git", "init"], cwd=str(tmp_path), check=True)

    nested_dir = tmp_path / "some/nested/path"
    os.makedirs(nested_dir)
    # A Python source file is needed so language detection works.
    (nested_dir / "test.py").write_text("# Test file")

    codebase = Codebase(repo_path=str(nested_dir))

    # A Codebase created from a path inside a git repo uses the repo root.
    assert str(codebase.repo_path) == str(tmp_path)
Loading