diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000..1fc55ad --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,26 @@ +# Configures GitHub's auto-generated release notes (the "What's Changed" section +# appended by `generate_release_notes` in .github/workflows/release.yml). Merged +# PRs are grouped under these emoji headings by label, mirroring the emoji +# categories used by the codeanalyzer-typescript backend. +changelog: + exclude: + authors: + - dependabot + - github-actions + categories: + - title: 🚀 Features + labels: [enhancement, kind/feature] + - title: 🐛 Fixes + labels: [bug, fix] + - title: ♻️ Refactoring + labels: [refactoring] + - title: ⚡️ Performance + labels: [performance] + - title: 📚 Documentation + labels: [documentation, doc] + - title: 🚦 Tests + labels: [test] + - title: 🚨 Breaking Changes + labels: [breaking] + - title: 🛠 Other Changes + labels: ["*"] diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 32fa884..ab14353 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -87,42 +87,97 @@ jobs: echo "current_version=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT shell: bash - - name: Read Changelog Entry - id: changelog_reader - uses: mindsers/changelog-reader-action@v2 - with: - validation_level: warn - version: ${{ steps.tag_name.outputs.current_version }} - path: ./CHANGELOG.md - - - name: Build changelog - id: gen_changelog - continue-on-error: true # auto-PR-diff is best-effort; CHANGELOG.md is the source of truth - uses: mikepenz/release-changelog-builder-action@v5 - with: - failOnError: "false" - configuration: .github/workflows/release_config.json + # cargo-dist-style notes: install one-liners + a download table. The categorized + # "What's Changed" (merged PRs/issues grouped under emoji headings via + # .github/release.yml) is appended by generate_release_notes below. Indented code + # blocks avoid backticks in the heredoc. 
+ - name: Compose release notes header (install + download) env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VERSION: ${{ steps.tag_name.outputs.current_version }} + run: | + REPO="codellm-devkit/codeanalyzer-python" + BASE="https://github.com/$REPO/releases/download/v$VERSION" + cat > "$RUNNER_TEMP/RELEASE_BODY.md" <> "$GITHUB_OUTPUT" + + - name: Generate Homebrew formula + env: + REPO: ${{ github.repository }} + VERSION: ${{ steps.ver.outputs.version }} + run: | + chmod +x packaging/homebrew/generate_formula.sh + # The release job just published the sdist as a Release asset; hash the + # exact bytes users will download so the formula checksum always matches (pipefail: a failed download aborts the step instead of hashing empty input). + sdist="https://github.com/${REPO}/releases/download/v${VERSION}/codeanalyzer_python-${VERSION}.tar.gz" + SHA256="$(set -o pipefail; curl -fLsS "$sdist" | shasum -a 256 | cut -d' ' -f1)" + REPO="$REPO" VERSION="$VERSION" SHA256="$SHA256" \ + ./packaging/homebrew/generate_formula.sh > codeanalyzer-python.rb + cat codeanalyzer-python.rb + + - name: Push formula to codellm-devkit/homebrew-tap + env: + TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} # PAT with write access to homebrew-tap + VERSION: ${{ steps.ver.outputs.version }} + run: | + git clone "https://x-access-token:${TAP_TOKEN}@github.com/codellm-devkit/homebrew-tap.git" tap + mkdir -p tap/Formula + cp codeanalyzer-python.rb tap/Formula/codeanalyzer-python.rb + cd tap + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add Formula/codeanalyzer-python.rb + git commit -m "codeanalyzer-python ${VERSION}" || { echo "no formula change"; exit 0; } + git push diff --git a/.github/workflows/release_config.json b/.github/workflows/release_config.json deleted file mode 100644 index abb8698..0000000 --- a/.github/workflows/release_config.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "categories": [ - { - "title": "## ✨ Release", - "labels": [ - "release" - ] - }, - { - "title": "## 🚀 Features", - "labels": [ - 
"kind/feature", - "enhancement" - ] - }, - { - "title": "## 🐛 Fixes", - "labels": [ - "fix", - "bug" - ] - }, - { - "title": "## ♻️ Refactoring", - "labels": [ - "refactoring" - ] - }, - { - "title": "## ⚡️ Performance Improvements", - "labels": [ - "performance" - ] - }, - { - "title": "## \uD83D\uDCDA Documentation", - "labels": [ - "documentation", - "doc" - ] - }, - { - "title": "## \uD83D\uDEA6 Tests", - "labels": [ - "test" - ] - }, - { - "title": "## \uD83D\uDEE0 Other Updates", - "labels": [ - "other", - "kind/dependency-change" - ] - }, - { - "title": "## 🚨 Breaking Changes", - "labels": [ - "breaking" - ] - } - ], - "ignore_labels": [ - "ignore" - ] -} diff --git a/CHANGELOG.md b/CHANGELOG.md index 6200df6..15afc94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,21 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.1] - 2026-06-22 + +### Added +- **Homebrew tap** — `brew install codellm-devkit/tap/codeanalyzer-python`. The release workflow auto-generates a formula (`packaging/homebrew/generate_formula.sh`) that installs the pinned PyPI release as an isolated `uv` tool, and pushes it to `codellm-devkit/homebrew-tap`. Because the package is pure-Python with heavy native dependencies (`ray`, `pandas`, `numpy`), the formula depends on `uv` and runs the release via `uvx` rather than vendoring every transitive dependency as a Homebrew resource. +- **First-class external symbols** — `PyApplication.external_symbols` (a `{signature → PyExternalSymbol{name, module}}` map) records call-graph targets outside the analyzed project, mirroring the `codeanalyzer-typescript` backend. 
`analysis.json` now carries external info that was previously only a bare target string, and the Neo4j projection emits `:PyExternal` authoritatively from it ([#44](https://github.com/codellm-devkit/codeanalyzer-python/issues/44)). +- **`--no-venv` / `--venv` flag** — skip virtualenv creation and dependency installation and resolve imports against the ambient Python interpreter. Useful in CI / containers where the project's dependencies are already installed, for sandboxed runs without network, and for speed ([#46](https://github.com/codellm-devkit/codeanalyzer-python/issues/46)). + +### Changed +- The per-project analysis virtualenv is now installed with **`uv`** (parallel downloads + a shared global cache; falls back to `pip`), and is now **wired to Jedi** — previously `self.virtualenv` stayed `None`, so the install was never used by the symbol-table builder ([#47](https://github.com/codellm-devkit/codeanalyzer-python/issues/47)). +- Neo4j `:PyExternal` gains a `module` property; `SCHEMA_VERSION` bumped `1.0.0 → 1.1.0` (additive) ([#44](https://github.com/codellm-devkit/codeanalyzer-python/issues/44)). + +### Fixed +- `--emit neo4j` no longer drops call edges whose target is a bare imported module name (e.g. `os`, `re`, `json`): a `:PyPackage` name can no longer shadow a call target's `:PySymbol` signature, and the node-identity tracking is keyed by `(label, value)` so deferred `PY_EXTENDS` / `PY_RESOLVES_TO` edges can't be shadowed either ([#44](https://github.com/codellm-devkit/codeanalyzer-python/issues/44)). +- `--emit neo4j` (Bolt) full-run orphan prune is now scoped to the `:PyApplication` anchor, so a full-run push for one application no longer deletes another application's modules from a shared database ([#45](https://github.com/codellm-devkit/codeanalyzer-python/issues/45)). 
+ ## [0.2.0] - 2026-06-20 ### Added diff --git a/README.md b/README.md index 8c85783..17cea55 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ [![PyPI](https://img.shields.io/pypi/v/codeanalyzer-python?style=for-the-badge&logo=pypi&logoColor=white)](https://pypi.org/project/codeanalyzer-python/) [![Python](https://img.shields.io/pypi/pyversions/codeanalyzer-python?style=for-the-badge&logo=python&logoColor=white)](https://pypi.org/project/codeanalyzer-python/) -[![Release](https://img.shields.io/github/actions/workflow/status/codellm-devkit/codeanalyzer-python/release.yml?style=for-the-badge&label=release&logo=github)](https://github.com/codellm-devkit/codeanalyzer-python/actions/workflows/release.yml) +[![GitHub release](https://img.shields.io/github/v/release/codellm-devkit/codeanalyzer-python?style=for-the-badge&logo=github&label=GitHub&color=2dba4e)](https://github.com/codellm-devkit/codeanalyzer-python/releases/latest) +[![Release](https://img.shields.io/github/actions/workflow/status/codellm-devkit/codeanalyzer-python/release.yml?style=for-the-badge&label=release&logo=githubactions&logoColor=white)](https://github.com/codellm-devkit/codeanalyzer-python/actions/workflows/release.yml) [![License](https://img.shields.io/badge/License-Apache%202.0-blue?style=for-the-badge)](./LICENSE) @@ -35,6 +36,7 @@ and merges them with the Jedi-derived edges, also backfilling callees Jedi could - [Prerequisites](#prerequisites) - [Install via pip (PyPI)](#install-via-pip-pypi) - [Install via shell script](#install-via-shell-script) + - [Install via Homebrew](#install-via-homebrew) - [Build from source](#build-from-source) - [Usage](#usage) - [Options](#options) @@ -101,6 +103,15 @@ Install the CLI as an isolated tool with the one-line installer (provisions via curl --proto '=https' --tlsv1.2 -LsSf https://github.com/codellm-devkit/codeanalyzer-python/releases/latest/download/canpy-installer.sh | sh ``` +### Install via Homebrew + +```sh +brew install 
codellm-devkit/tap/codeanalyzer-python +``` + +The formula depends on [uv](https://docs.astral.sh/uv/) and installs `canpy` as an isolated, +version-pinned uv tool (the package and its dependencies are resolved and cached on first run). + ### Build from source This project uses [uv](https://docs.astral.sh/uv/) for dependency management. @@ -133,102 +144,64 @@ $ canpy --help Static Analysis on Python source code using Jedi, CodeQL and Tree sitter. -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --input -i PATH Path to the │ -│ project root │ -│ directory (not │ -│ required for │ -│ --emit schema). │ -│ --output -o PATH Output directory │ -│ for artifacts. │ -│ --format -f [json|msgpack] Output format for │ -│ --emit json: json │ -│ or msgpack. │ -│ [default: json] │ -│ --emit [json|neo4j|sche Output target: │ -│ ma] json │ -│ (analysis.json, │ -│ default) | neo4j │ -│ (graph.cypher or │ -│ live Bolt push) | │ -│ schema (the Neo4j │ -│ schema.json │ -│ contract). │ -│ [default: json] │ -│ --app-name TEXT Logical │ -│ application name │ -│ for the graph │ -│ :PyApplication │ -│ anchor (default: │ -│ input dir name). │ -│ --neo4j-uri TEXT Push the graph to │ -│ a live Neo4j over │ -│ Bolt │ -│ (incremental); │ -│ omit to write │ -│ graph.cypher. │ -│ [env var: │ -│ NEO4J_URI] │ -│ --neo4j-user TEXT Neo4j username. │ -│ [env var: │ -│ NEO4J_USERNAME] │ -│ [default: neo4j] │ -│ --neo4j-password TEXT Neo4j password. │ -│ Prefer the env │ -│ var over the flag │ -│ (the flag is │ -│ visible in shell │ -│ history / process │ -│ list). │ -│ [env var: │ -│ NEO4J_PASSWORD] │ -│ [default: neo4j] │ -│ --neo4j-database TEXT Neo4j database │ -│ name (default: │ -│ server default). │ -│ [env var: │ -│ NEO4J_DATABASE] │ -│ --codeql --no-codeql Enable │ -│ CodeQL-based │ -│ analysis. │ -│ [default: │ -│ no-codeql] │ -│ --ray --no-ray Enable Ray for │ -│ distributed │ -│ analysis. 
│ -│ [default: no-ray] │ -│ --eager --lazy Enable eager or │ -│ lazy analysis. │ -│ Defaults to lazy. │ -│ [default: lazy] │ -│ --skip-tests --include-tests Skip test files │ -│ in analysis. │ -│ [default: │ -│ skip-tests] │ -│ --file-name PATH Analyze only the │ -│ specified file │ -│ (relative to │ -│ input directory). │ -│ --cache-dir -c PATH Directory to │ -│ store analysis │ -│ cache. Defaults │ -│ to │ -│ '.codeanalyzer' │ -│ in the input │ -│ directory. │ -│ --clear-cache --keep-cache Clear cache after │ -│ analysis. By │ -│ default, cache is │ -│ retained. │ -│ [default: │ -│ keep-cache] │ -│ -v INTEGER Increase │ -│ verbosity: -v, │ -│ -vv, -vvv │ -│ [default: 0] │ -│ --help Show this message │ -│ and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --input -i PATH Path to the project root directory │ +│ (not required for --emit schema). │ +│ --output -o PATH Output directory for artifacts. │ +│ --format -f [json|msgpack] Output format for --emit json: │ +│ json or msgpack. │ +│ [default: json] │ +│ --emit [json|neo4j|schema] Output target: json │ +│ (analysis.json, default) | neo4j │ +│ (graph.cypher or live Bolt push) | │ +│ schema (the Neo4j schema.json │ +│ contract). │ +│ [default: json] │ +│ --app-name TEXT Logical application name for the │ +│ graph :PyApplication anchor │ +│ (default: input dir name). │ +│ --neo4j-uri TEXT Push the graph to a live Neo4j │ +│ over Bolt (incremental); omit to │ +│ write graph.cypher. │ +│ [env var: NEO4J_URI] │ +│ --neo4j-user TEXT Neo4j username. │ +│ [env var: NEO4J_USERNAME] │ +│ [default: neo4j] │ +│ --neo4j-password TEXT Neo4j password. Prefer the env var │ +│ over the flag (the flag is visible │ +│ in shell history / process list). │ +│ [env var: NEO4J_PASSWORD] │ +│ [default: neo4j] │ +│ --neo4j-database TEXT Neo4j database name (default: │ +│ server default). 
│ +│ [env var: NEO4J_DATABASE] │ +│ --codeql --no-codeql Enable CodeQL-based analysis. │ +│ [default: no-codeql] │ +│ --ray --no-ray Enable Ray for distributed │ +│ analysis. │ +│ [default: no-ray] │ +│ --eager --lazy Enable eager or lazy analysis. │ +│ Defaults to lazy. │ +│ [default: lazy] │ +│ --skip-tests --include-tests Skip test files in analysis. │ +│ [default: skip-tests] │ +│ --no-venv --venv Skip virtualenv creation and │ +│ dependency installation; resolve │ +│ imports against the ambient Python │ +│ environment instead. │ +│ [default: venv] │ +│ --file-name PATH Analyze only the specified file │ +│ (relative to input directory). │ +│ --cache-dir -c PATH Directory to store analysis cache. │ +│ Defaults to '.codeanalyzer' in the │ +│ input directory. │ +│ --clear-cache --keep-cache Clear cache after analysis. By │ +│ default, cache is retained. │ +│ [default: keep-cache] │ +│ -v INTEGER Increase verbosity: -v, -vv, -vvv │ +│ [default: 0] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` diff --git a/codeanalyzer/__main__.py b/codeanalyzer/__main__.py index d386d3b..d7f4ab3 100644 --- a/codeanalyzer/__main__.py +++ b/codeanalyzer/__main__.py @@ -104,6 +104,14 @@ def main( help="Skip test files in analysis.", ), ] = True, + no_venv: Annotated[ + bool, + typer.Option( + "--no-venv/--venv", + help="Skip virtualenv creation and dependency installation; resolve " + "imports against the ambient Python environment instead.", + ), + ] = False, file_name: Annotated[ Optional[Path], typer.Option( @@ -144,6 +152,7 @@ def main( using_ray=using_ray, rebuild_analysis=rebuild_analysis, skip_tests=skip_tests, + no_venv=no_venv, file_name=file_name, cache_dir=cache_dir, clear_cache=clear_cache, diff --git a/codeanalyzer/core.py b/codeanalyzer/core.py index b8cfcca..9b5f538 100644 --- a/codeanalyzer/core.py +++ b/codeanalyzer/core.py @@ -8,7 +8,13 @@ import ray from codeanalyzer.utils import logger -from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json +from codeanalyzer.schema import ( + PyApplication, + PyExternalSymbol, + PyModule, + model_dump_json, + model_validate_json, +) from codeanalyzer.schema.py_schema import PyCallEdge from codeanalyzer.semantic_analysis.call_graph import ( jedi_call_graph_edges, @@ -60,6 +66,7 @@ def __init__(self, options: AnalysisOptions) -> None: self.skip_tests = options.skip_tests self.using_codeql = options.using_codeql self.rebuild_analysis = options.rebuild_analysis + self.no_venv = options.no_venv self.cache_dir = ( options.cache_dir.resolve() if options.cache_dir is not None else self.project_dir ) / ".codeanalyzer" @@ -226,13 +233,41 @@ def _get_base_interpreter() -> Path: f"a working Python interpreter that can create virtual environments." 
) + @staticmethod + def _uv_bin() -> Optional[str]: + """Path to a uv binary: the one bundled with the ``uv`` PyPI package (a + dependency, so normally always present -- including inside a Docker image), + else a uv on PATH, else ``None`` (callers fall back to pip).""" + try: + from uv import find_uv_bin + + return str(find_uv_bin()) + except Exception: + return shutil.which("uv") + + def _install_into_venv(self, venv_python: Path, args: List[str]) -> None: + """Install packages into the target venv, preferring uv for speed (parallel + downloads + a shared global cache) and falling back to the venv's own pip + when uv is unavailable.""" + uv = self._uv_bin() + if uv: + cmd = [uv, "pip", "install", "--python", str(venv_python), *args] + else: + cmd = [str(venv_python), "-m", "pip", "install", *args] + self._cmd_exec_helper(cmd, cwd=self.project_dir, check=True) + def __enter__(self) -> "Codeanalyzer": # If no virtualenv is provided, try to create one using requirements.txt or pyproject.toml venv_path = self.cache_dir / self.project_dir.name / "virtualenv" # Ensure the cache directory exists for this project venv_path.parent.mkdir(parents=True, exist_ok=True) + if self.no_venv: + logger.info( + "--no-venv: using the ambient Python environment " + "(skipping virtualenv creation and dependency installation)" + ) # Create the virtual environment if it does not exist - if not venv_path.exists() or self.rebuild_analysis: + if not self.no_venv and (not venv_path.exists() or self.rebuild_analysis): logger.info(f"(Re-)creating virtual environment at {venv_path}") self._cmd_exec_helper( [str(self._get_base_interpreter()), "-m", "venv", str(venv_path)], @@ -249,24 +284,19 @@ def __enter__(self) -> "Codeanalyzer": ("test-requirements.txt", ["-r"]), ] - for dep_file, pip_args in dependency_files: + for dep_file, _ in dependency_files: if (self.project_dir / dep_file).exists(): logger.info(f"Installing dependencies from {dep_file}") - self._cmd_exec_helper( - [str(venv_python), 
"-m", "pip", "install", "-U"] + pip_args + [str(self.project_dir / dep_file)], - cwd=self.project_dir, - check=True, + self._install_into_venv( + venv_python, + ["--upgrade", "-r", str(self.project_dir / dep_file)], ) # Handle Pipenv files if (self.project_dir / "Pipfile").exists(): logger.info("Installing dependencies from Pipfile") # Note: This would require pipenv to be installed - self._cmd_exec_helper( - [str(venv_python), "-m", "pip", "install", "pipenv"], - cwd=self.project_dir, - check=True, - ) + self._install_into_venv(venv_python, ["pipenv"]) self._cmd_exec_helper( ["pipenv", "install", "--dev"], cwd=self.project_dir, @@ -289,14 +319,18 @@ def __enter__(self) -> "Codeanalyzer": if any((self.project_dir / file).exists() for file in package_definition_files): logger.info("Installing project in editable mode") - self._cmd_exec_helper( - [str(venv_python), "-m", "pip", "install", "-e", str(self.project_dir)], - cwd=self.project_dir, - check=True, - ) + self._install_into_venv(venv_python, ["-e", str(self.project_dir)]) else: logger.warning("No package definition files found, skipping editable installation") + # Point Jedi at the analysis venv so it resolves the project's third-party + # imports. This runs on both a fresh build and a lazy reuse of an existing + # venv -- previously self.virtualenv stayed None, so the install above was + # never actually used by the symbol-table builder. With --no-venv we leave + # it None so Jedi resolves against the ambient interpreter instead. 
+ if not self.no_venv and venv_path.exists(): + self.virtualenv = venv_path + if self.using_codeql: logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}") @@ -358,6 +392,43 @@ def __exit__(self, *args, **kwargs) -> None: logger.info(f"Clearing cache directory: {self.cache_dir}") shutil.rmtree(self.cache_dir) + @staticmethod + def _compute_external_symbols(symbol_table, call_graph): + """Build the external-symbol map: every call-graph endpoint whose signature + is not a declared class/callable in the symbol table is an external (an + imported library or builtin member). ``name``/``module`` are derived from + the signature (best effort: split on the last dot).""" + declared = set() + + def walk_callable(c): + declared.add(c.signature) + for ic in (c.inner_callables or {}).values(): + walk_callable(ic) + for cl in (c.inner_classes or {}).values(): + walk_class(cl) + + def walk_class(cl): + declared.add(cl.signature) + for m in (cl.methods or {}).values(): + walk_callable(m) + for ic in (cl.inner_classes or {}).values(): + walk_class(ic) + + for mod in symbol_table.values(): + for c in (mod.functions or {}).values(): + walk_callable(c) + for cl in (mod.classes or {}).values(): + walk_class(cl) + + externals: Dict[str, PyExternalSymbol] = {} + for edge in call_graph: + for sig in (edge.source, edge.target): + if sig in declared or sig in externals: + continue + module, name = sig.rsplit(".", 1) if "." in sig else (sig, sig) + externals[sig] = PyExternalSymbol(name=name, module=module) + return externals + def analyze(self) -> PyApplication: """Analyze the project and return a PyApplication with symbol table. 
@@ -397,8 +468,19 @@ def analyze(self) -> PyApplication: jedi_edges = jedi_call_graph_edges(symbol_table) call_graph = merge_edges(jedi_edges, codeql_edges) + # Classify call-graph endpoints that are not declared in the symbol table + # (imported library / builtin members) once, so the JSON and Neo4j backends + # share one authoritative external-symbol set. + external_symbols = self._compute_external_symbols(symbol_table, call_graph) + # Recreate pyapplication - app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build() + app = ( + PyApplication.builder() + .symbol_table(symbol_table) + .call_graph(call_graph) + .external_symbols(external_symbols) + .build() + ) # Save to cache self._save_analysis_cache(app, cache_file) diff --git a/codeanalyzer/neo4j/bolt.py b/codeanalyzer/neo4j/bolt.py index 4ae102b..dc60986 100644 --- a/codeanalyzer/neo4j/bolt.py +++ b/codeanalyzer/neo4j/bolt.py @@ -77,6 +77,13 @@ def session(): for stmt in [*CONSTRAINTS, *INDEXES]: s.run(stmt) + # The application anchor (a shared node) — used to scope the orphan prune + # so it never touches modules belonging to a different :PyApplication. + app_name = next( + (n.value for n in rows.nodes if n.labels and n.labels[0] == "PyApplication"), + None, + ) + # Partition nodes by owning module; shared nodes have no _module. by_module: Dict[str, List[NodeRow]] = {} shared: List[NodeRow] = [] @@ -135,13 +142,17 @@ def _purge(tx, module=m, node_keys=keys): _upsert_edges(session, neo4j, edges) # 6. orphan prune — only safe on a full run (a targeted run can't tell deleted from untargeted). - if full_run: + # Scope to THIS application's anchor so a full run for application B never + # deletes application A's modules from a shared database. 
+ if full_run and app_name is not None: present = list(by_module.keys()) with session() as s: res = s.run( - "MATCH (m:PyModule) WHERE NOT m.file_key IN $present " + "MATCH (:PyApplication {name: $app})-[:PY_HAS_MODULE]->(m:PyModule) " + "WHERE NOT m.file_key IN $present " f"OPTIONAL MATCH (m)-{DESCENDANTS}->(x) DETACH DELETE x, m " "RETURN count(m) AS pruned", + app=app_name, present=present, ) pruned = res.single() diff --git a/codeanalyzer/neo4j/catalog.py b/codeanalyzer/neo4j/catalog.py index 37f8a1a..155d86a 100644 --- a/codeanalyzer/neo4j/catalog.py +++ b/codeanalyzer/neo4j/catalog.py @@ -34,7 +34,7 @@ from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES -SCHEMA_VERSION = "1.0.0" +SCHEMA_VERSION = "1.1.0" # PropType ∈ {"string", "integer", "float", "boolean", "string[]", "integer[]"}. @@ -119,7 +119,7 @@ class RelType: "PyExternal", "PySymbol", "signature", - {"signature": "string", "name": "string"}, + {"signature": "string", "name": "string", "module": "string"}, ), NodeLabel("PyPackage", "PyPackage", "name", {"name": "string"}), NodeLabel( diff --git a/codeanalyzer/neo4j/project.py b/codeanalyzer/neo4j/project.py index 4878cda..7c4deb7 100644 --- a/codeanalyzer/neo4j/project.py +++ b/codeanalyzer/neo4j/project.py @@ -60,11 +60,12 @@ def project(app: PyApplication, app_name: str) -> GraphRows: b.edge("PY_HAS_MODULE", app_ref, mod_ref) _project_module_body(b, file_key, mod_ref, mod) - # The aggregated :PY_CALLS twin. Endpoints not present in the symbol table become - # :PyExternal ghost nodes (the analyzer already preserves them as ghost nodes). + # The aggregated :PY_CALLS twin. Endpoints listed in app.external_symbols become + # :PyExternal ghost nodes; the rest are declared :PySymbol nodes already emitted. 
+ externals = app.external_symbols or {} for e in app.call_graph: - src = _call_endpoint(b, e.source) - tgt = _call_endpoint(b, e.target) + src = _call_endpoint(b, e.source, externals) + tgt = _call_endpoint(b, e.target, externals) b.edge("PY_CALLS", src, tgt, _call_edge_props(e.weight, list(e.provenance or []))) return b.finish() @@ -74,13 +75,20 @@ def _sym(signature: str) -> NodeRef: return NodeRef("PySymbol", "signature", signature) -def _call_endpoint(b: RowBuilder, signature: str) -> NodeRef: - """A call-graph endpoint: a known callable already emitted, or a phantom - :PyExternal symbol materialized on demand for a ghost target.""" - if b.has_key(signature): +def _call_endpoint(b: RowBuilder, signature: str, externals: dict) -> NodeRef: + """A call-graph endpoint: a declared callable already emitted, or an external + symbol (imported library / builtin member) materialized as a :PyExternal ghost. + + Classification is authoritative -- it comes from ``app.external_symbols``, not a + "present in the graph" heuristic -- so an imported module name (which exists only + as a :PyPackage) can never shadow the call target. A small fallback still + materializes an external for any endpoint that is neither declared nor listed.""" + ext = externals.get(signature) + if ext is None and b.has_key("PySymbol", signature): return _sym(signature) - name = signature.rsplit(".", 1)[-1] if "." in signature else signature - return b.node(["PySymbol", "PyExternal"], "signature", signature, {"name": name}) + name = ext.name if ext is not None else (signature.rsplit(".", 1)[-1] if "." 
in signature else signature) + module = ext.module if ext is not None else None + return b.node(["PySymbol", "PyExternal"], "signature", signature, prune({"name": name, "module": module})) # ---------------------------------------------------------------------------------------------- diff --git a/codeanalyzer/neo4j/rows.py b/codeanalyzer/neo4j/rows.py index 9edecde..cbc381f 100644 --- a/codeanalyzer/neo4j/rows.py +++ b/codeanalyzer/neo4j/rows.py @@ -83,7 +83,11 @@ def __init__(self) -> None: self._nodes: Dict[str, NodeRow] = {} # key: f"{labels[0]} {value}" self._edges: List[EdgeRow] = [] self._deferred: List[EdgeRow] = [] # edges gated against node existence at finish() - self._keys: set = set() # every node value seen, for resolved-gating + # (merge_label, value) of every node seen, for resolved-gating. Keyed by + # label too so a :PyPackage name can't shadow a :PySymbol signature (and + # vice versa) — otherwise a call to an imported module name like ``os`` + # resolves to a :PySymbol node that was never created and the edge is lost. + self._keys: set = set() def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> NodeRef: """Upsert a node. 
Re-seeing the same ``(labels[0], value)`` merges props @@ -98,7 +102,7 @@ def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> No existing.labels.append(label) else: self._nodes[node_id] = NodeRow(list(labels), key_prop, value, dict(props)) - self._keys.add(value) + self._keys.add((labels[0], value)) return NodeRef(labels[0], key_prop, value) def edge(self, type_: str, from_ref: NodeRef, to_ref: NodeRef, props: Optional[Props] = None) -> None: @@ -121,12 +125,13 @@ def edge_to_symbol( ) ) - def has_key(self, value: str) -> bool: - return value in self._keys + def has_key(self, label: str, value: str) -> bool: + """Whether a node with this ``(merge_label, value)`` identity was emitted.""" + return (label, value) in self._keys def finish(self) -> GraphRows: for e in self._deferred: - if e.to_ref.value in self._keys: + if (e.to_ref.label, e.to_ref.value) in self._keys: self._edges.append(e) nodes = sorted(self._nodes.values(), key=lambda n: f"{n.labels[0]} {n.value}") edges = sorted(self._edges, key=lambda e: f"{e.type} {e.from_ref.value} {e.to_ref.value}") diff --git a/codeanalyzer/options/options.py b/codeanalyzer/options/options.py index 541fb85..e314c5e 100644 --- a/codeanalyzer/options/options.py +++ b/codeanalyzer/options/options.py @@ -38,6 +38,7 @@ class AnalysisOptions: using_ray: bool = False rebuild_analysis: bool = False skip_tests: bool = True + no_venv: bool = False file_name: Optional[Path] = None cache_dir: Optional[Path] = None clear_cache: bool = False diff --git a/codeanalyzer/schema/__init__.py b/codeanalyzer/schema/__init__.py index 8853909..bcfa976 100644 --- a/codeanalyzer/schema/__init__.py +++ b/codeanalyzer/schema/__init__.py @@ -8,6 +8,7 @@ PyClass, PyClassAttribute, PyComment, + PyExternalSymbol, PyImport, PyModule, PyVariableDeclaration, @@ -15,6 +16,7 @@ __all__ = [ "PyApplication", + "PyExternalSymbol", "PyImport", "PyComment", "PyModule", diff --git a/codeanalyzer/schema/py_schema.py 
b/codeanalyzer/schema/py_schema.py index 8bef391..c69e5fb 100644 --- a/codeanalyzer/schema/py_schema.py +++ b/codeanalyzer/schema/py_schema.py @@ -358,6 +358,17 @@ class PyCallEdge(BaseModel): provenance: List[Literal["jedi", "codeql", "joern"]] = [] +@builder +@msgpk +class PyExternalSymbol(BaseModel): + """A call-graph target outside the analyzed project -- an imported library or + builtin member. Mirrors codeanalyzer-typescript's ``TSExternalSymbol`` and is + keyed in ``PyApplication.external_symbols`` by its call-graph signature.""" + + name: str # the member/short name, e.g. "get" for "requests.get" + module: Optional[str] = None # best-effort owning module, e.g. "requests" + + @builder @msgpk class PyApplication(BaseModel): @@ -365,3 +376,7 @@ class PyApplication(BaseModel): symbol_table: Dict[str, PyModule] call_graph: List[PyCallEdge] = [] + # Call-graph endpoints not declared in the symbol table (imported library / + # builtin members), keyed by signature. Populated by the analyzer so every + # backend (JSON and Neo4j) shares one authoritative external-symbol set. + external_symbols: Dict[str, PyExternalSymbol] = {} diff --git a/packaging/homebrew/generate_formula.sh b/packaging/homebrew/generate_formula.sh new file mode 100755 index 0000000..70846cf --- /dev/null +++ b/packaging/homebrew/generate_formula.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# +# Generate the Homebrew formula for codeanalyzer-python (the `canpy` CLI). +# +# Unlike the codeanalyzer-typescript sibling -- which ships a single self-contained +# binary that the formula just downloads -- codeanalyzer-python is a pure-Python +# package published to PyPI with heavy native dependencies (ray, pandas, numpy). +# Vendoring every transitive dependency as a Homebrew `resource` is impractical +# (ray is not buildable from an sdist), and pip-installing at build time is blocked +# by Homebrew's network sandbox. 
+# +# So the formula stays tiny: it depends on `uv` and installs version-pinned wrapper +# scripts that run the published PyPI release via `uvx` (uv resolves and caches the +# isolated environment on first run). This keeps `brew install` sandbox-safe (no +# network at build time) while pinning the exact released version. +# +# Homebrew requires every formula to declare a source `url` + `sha256` for its +# stable spec, so we point at the released sdist (byte-identical to the PyPI one). +# The install method ignores the unpacked source and just writes uv wrappers, but +# the url anchors the version and satisfies Homebrew's spec requirement. +# +# Usage: +# REPO=codellm-devkit/codeanalyzer-python VERSION=0.2.0 SHA256=<sha256> \ +# ./generate_formula.sh > codeanalyzer-python.rb +# +set -euo pipefail + +REPO="${REPO:?set REPO, e.g. codellm-devkit/codeanalyzer-python}" +VERSION="${VERSION:?set VERSION, e.g. 0.2.0}" +SHA256="${SHA256:?set SHA256 of the released sdist}" +SDIST_URL="https://github.com/${REPO}/releases/download/v${VERSION}/codeanalyzer_python-${VERSION}.tar.gz" + +cat <=2.10.0,<3.0.0; python_version >= '3.11'", "packaging>=25.0", + # uv -- installs the analyzed project's deps into the analysis venv quickly. + # Shipped as a self-contained binary in its wheel, so it's available wherever + # canpy is pip-installed (incl. Docker); core.py falls back to pip without it. 
+ "uv>=0.5.0", ] [project.optional-dependencies] diff --git a/schema.neo4j.json b/schema.neo4j.json index ffccf29..de5d100 100644 --- a/schema.neo4j.json +++ b/schema.neo4j.json @@ -1,5 +1,5 @@ { - "schema_version": "1.0.0", + "schema_version": "1.1.0", "generator": "codeanalyzer-python", "marker_labels": [], "node_labels": [ @@ -67,7 +67,8 @@ "key": "signature", "properties": { "signature": "string", - "name": "string" + "name": "string", + "module": "string" } }, { diff --git a/scripts/update_readme.py b/scripts/update_readme.py index 75cb8f0..e424448 100644 --- a/scripts/update_readme.py +++ b/scripts/update_readme.py @@ -31,6 +31,16 @@ def render_help() -> str: os.environ["TERM"] = "dumb" os.environ["NO_COLOR"] = "1" + # Typer caps help width at rich_utils.MAX_WIDTH (default 80) regardless of + # COLUMNS, so CI renders the box narrower than a dev machine. Pin it to WIDTH + # so the rendered help is wide and byte-identical everywhere. + try: + import typer.rich_utils as _ru + + _ru.MAX_WIDTH = WIDTH + except Exception: # pragma: no cover - defensive across Typer versions + pass + from click.testing import CliRunner from typer.main import get_command diff --git a/test/sample_graph_app.py b/test/sample_graph_app.py index 11124f4..b4232b9 100644 --- a/test/sample_graph_app.py +++ b/test/sample_graph_app.py @@ -14,6 +14,7 @@ PyClass, PyClassAttribute, PyComment, + PyExternalSymbol, PyImport, PyModule, PyVariableDeclaration, @@ -149,4 +150,7 @@ def make_sample_app() -> PyApplication: return PyApplication( symbol_table={"src/service.py": service_mod, "src/util.py": util_mod}, call_graph=call_graph, + # The ghost edge's target (requests.get) is a library member, recorded as a + # first-class external symbol so the projection emits a :PyExternal for it. 
+ external_symbols={"requests.get": PyExternalSymbol(name="get", module="requests")}, ) diff --git a/test/test_cli.py b/test/test_cli.py index b4ba50d..11a5490 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -38,6 +38,27 @@ def test_cli_call_symbol_table_with_json(cli_runner, whole_applications__xarray) assert len(json_obj["symbol_table"]) > 0, "Symbol table should not be empty" +def test_no_venv_skips_virtualenv( + cli_runner, single_functionalities__stuff_nested_in_functions, tmp_path +): + """#46: --no-venv must skip virtualenv creation/installation and still analyze.""" + out = tmp_path / "out" + cache = tmp_path / "cache" + result = cli_runner.invoke( + app, + [ + "--input", str(single_functionalities__stuff_nested_in_functions), + "--output", str(out), + "--cache-dir", str(cache), + "--no-venv", "--no-codeql", "--no-ray", + ], + env={"NO_COLOR": "1", "TERM": "dumb"}, + ) + assert result.exit_code == 0, result.output + assert (out / "analysis.json").exists(), "analysis.json should still be produced with --no-venv" + assert not list(cache.rglob("virtualenv")), "--no-venv must not create a virtualenv" + + def test_single_file(cli_runner, single_functionalities__stuff_nested_in_functions): """Must be able to run the CLI with single file analysis using --file-name flag.""" output_dir = single_functionalities__stuff_nested_in_functions.joinpath(".output") diff --git a/test/test_neo4j_bolt.py b/test/test_neo4j_bolt.py index ee84e01..6f02bd8 100644 --- a/test/test_neo4j_bolt.py +++ b/test/test_neo4j_bolt.py @@ -15,9 +15,24 @@ from codeanalyzer.neo4j import project from codeanalyzer.neo4j.bolt import BoltConfig, bolt_writer +from codeanalyzer.schema import PyApplication, PyCallable, PyModule from sample_graph_app import make_sample_app + +def _single_module_app(file_key: str = "appb/main.py") -> PyApplication: + """A minimal second application with its own (distinct) module file_key.""" + fn = PyCallable( + name="main", path=file_key, signature="appb.main", 
return_type="None", + code="def main():\n ...", start_line=1, end_line=2, + code_start_line=1, cyclomatic_complexity=1, + ) + mod = PyModule( + file_path=file_key, module_name="appb.main", functions={"main": fn}, + content_hash="h-b", last_modified=1.0, file_size=10, + ) + return PyApplication(symbol_table={file_key: mod}, call_graph=[]) + pytestmark = pytest.mark.skipif( not os.environ.get("RUN_CONTAINER_TESTS"), reason="opt-in: set RUN_CONTAINER_TESTS=1 (needs Docker/Podman) to run the Neo4j bolt test", @@ -105,6 +120,21 @@ def test_full_push_materializes_the_whole_graph_and_schema(driver, cfg): assert _num(driver, "MATCH (e:PyExternal) RETURN count(e)") >= 1 +def test_full_run_does_not_prune_another_applications_modules(driver, cfg): + """Regression for #45: a full-run push for one application must not prune the + modules of a *different* application sharing the database.""" + bolt_writer(project(make_sample_app(), "app-a"), cfg, full_run=True) + before = _num(driver, "MATCH (:PyApplication {name:'app-a'})-[:PY_HAS_MODULE]->(m) RETURN count(m)") + assert before > 0 + + # A full-run push for a different application must leave app-a untouched. 
+ bolt_writer(project(_single_module_app(), "app-b"), cfg, full_run=True) + + after = _num(driver, "MATCH (:PyApplication {name:'app-a'})-[:PY_HAS_MODULE]->(m) RETURN count(m)") + assert after == before, "full-run push for app-b pruned app-a's modules (#45)" + assert _num(driver, "MATCH (:PyApplication {name:'app-b'})-[:PY_HAS_MODULE]->(m) RETURN count(m)") == 1 + + def test_re_pushing_identical_analysis_is_idempotent(driver, cfg): rows = project(make_sample_app(), "sample-app") bolt_writer(rows, cfg, full_run=True) diff --git a/test/test_neo4j_schema.py b/test/test_neo4j_schema.py index 401b465..bba6336 100644 --- a/test/test_neo4j_schema.py +++ b/test/test_neo4j_schema.py @@ -12,6 +12,8 @@ from codeanalyzer.neo4j import NODE_LABELS, REL_TYPES, build_schema_document, project from codeanalyzer.neo4j.catalog import MARKER_LABELS from codeanalyzer.neo4j.cypher import render_cypher +from codeanalyzer.schema import PyApplication, PyCallable, PyImport, PyModule +from codeanalyzer.schema.py_schema import PyCallEdge from sample_graph_app import make_sample_app @@ -87,6 +89,38 @@ def test_render_cypher_is_deterministic_and_self_contained(): assert "MERGE (n:PySymbol {signature: row.k})" in a +def test_call_edge_to_imported_module_name_is_not_dropped(): + """Regression for #44: a call whose target is a bare module name that is also + imported (e.g. ``os``) must not be dropped. 
The import creates a :PyPackage + named ``os``; that must not shadow the call target's :PySymbol signature.""" + caller = PyCallable( + name="caller", path="m.py", signature="m.caller", return_type="None", + code="def caller():\n os.getcwd()", start_line=1, end_line=2, + code_start_line=1, cyclomatic_complexity=1, + ) + mod = PyModule( + file_path="m.py", module_name="m", + imports=[PyImport(module="os", name="getcwd")], + functions={"caller": caller}, + content_hash="h", last_modified=1.0, file_size=10, + ) + app = PyApplication( + symbol_table={"m.py": mod}, + call_graph=[PyCallEdge(source="m.caller", target="os", weight=1, provenance=["jedi"])], + ) + rows = project(app, "app") + + calls_to_os = [e for e in rows.edges if e.type == "PY_CALLS" and e.to_ref.value == "os"] + assert len(calls_to_os) == 1, "PY_CALLS edge to imported module name 'os' was dropped" + + # 'os' is materialized as a :PyExternal symbol (the call target) ... + assert any(n.value == "os" and "PyExternal" in n.labels for n in rows.nodes), \ + ":PyExternal ghost for the call target 'os' is missing" + # ... distinct from the :PyPackage 'os' created by the import. + assert any(n.value == "os" and "PyPackage" in n.labels for n in rows.nodes), \ + ":PyPackage for the import 'os' is missing" + + def test_checked_in_schema_matches_catalog(): """Run `canpy --emit schema > schema.neo4j.json` if this fails.""" on_disk_path = Path(__file__).resolve().parents[1] / "schema.neo4j.json"