From c93c3f5d82c2c4a727da2c757230f4331bf5dfef Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Sat, 20 Jun 2026 16:04:13 -0400 Subject: [PATCH 1/9] ci(release): categorized GitHub-native release notes (install + download + What's Changed) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mikepenz changelog builder relies on GitHub's compare API, which 404s ("no common ancestor") for this repo's tag range after the history rewrites, so it produced an empty changelog. Replace it with GitHub's native generated notes, which list merged PRs/issues without needing the compare merge-base: - Compose an install + download-table header (cargo-dist style, like the codeanalyzer-typescript release notes). - softprops/action-gh-release@v2 with generate_release_notes: true appends a categorized "What's Changed" section. - .github/release.yml groups PRs under emoji headings by label (Features, Fixes, Docs, Tests, Breaking, …), mirroring the old mikepenz categories. - Drop the unused mindsers/mikepenz steps and release_config.json. --- .github/release.yml | 26 +++++++++++ .github/workflows/release.yml | 64 +++++++++++++++----------- .github/workflows/release_config.json | 65 --------------------------- 3 files changed, 63 insertions(+), 92 deletions(-) create mode 100644 .github/release.yml delete mode 100644 .github/workflows/release_config.json diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000..1fc55ad --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,26 @@ +# Configures GitHub's auto-generated release notes (the "What's Changed" section +# appended by `generate_release_notes` in .github/workflows/release.yml). Merged +# PRs are grouped under these emoji headings by label, mirroring the emoji +# categories used by the codeanalyzer-typescript backend. +changelog: + exclude: + authors: + - dependabot + - github-actions + categories: + - title: 🚀 Features + labels: [enhancement, kind/feature] + - title: 🐛 Fixes + labels: [bug, fix] + - title: ♻️ Refactoring + labels: [refactoring] + - title: ⚡️ Performance + labels: [performance] + - title: 📚 Documentation + labels: [documentation, doc] + - title: 🚦 Tests + labels: [test] + - title: 🚨 Breaking Changes + labels: [breaking] + - title: 🛠 Other Changes + labels: ["*"] diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 32fa884..5002cb9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -87,40 +87,50 @@ jobs: echo "current_version=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT shell: bash - - name: Read Changelog Entry - id: changelog_reader - uses: mindsers/changelog-reader-action@v2 - with: - validation_level: warn - version: ${{ steps.tag_name.outputs.current_version }} - path: ./CHANGELOG.md - - - name: Build changelog - id: gen_changelog - continue-on-error: true # auto-PR-diff is best-effort; CHANGELOG.md is the source of truth - uses: mikepenz/release-changelog-builder-action@v5 - with: - failOnError: "false" - configuration: .github/workflows/release_config.json + # cargo-dist-style notes: install one-liners + a download table. The categorized + # "What's Changed" (merged PRs/issues grouped under emoji headings via + # .github/release.yml) is appended by generate_release_notes below. Indented code + # blocks avoid backticks in the heredoc. + - name: Compose release notes header (install + download) env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VERSION: ${{ steps.tag_name.outputs.current_version }} + run: | + REPO="codellm-devkit/codeanalyzer-python" + BASE="https://github.com/$REPO/releases/download/v$VERSION" + cat > "$RUNNER_TEMP/RELEASE_BODY.md" < Date: Sat, 20 Jun 2026 16:08:43 -0400 Subject: [PATCH 2/9] docs(readme): widen the --help block and add a GitHub release badge - update_readme.py: pin typer.rich_utils.MAX_WIDTH to WIDTH (100). Typer caps help at MAX_WIDTH=80 regardless of COLUMNS, so CI rendered the box much narrower than a dev machine and the release doc-sync kept shrinking it. Pinning it makes the rendered help wide and byte-identical local vs CI. - README: regenerate the (now 100-wide) help block; add a GitHub release badge (github/v/release) alongside the PyPI / workflow / license badges. --- README.md | 152 ++++++++++++++------------------------- scripts/update_readme.py | 10 +++ 2 files changed, 65 insertions(+), 97 deletions(-) diff --git a/README.md b/README.md index 8c85783..8211145 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ [![PyPI](https://img.shields.io/pypi/v/codeanalyzer-python?style=for-the-badge&logo=pypi&logoColor=white)](https://pypi.org/project/codeanalyzer-python/) [![Python](https://img.shields.io/pypi/pyversions/codeanalyzer-python?style=for-the-badge&logo=python&logoColor=white)](https://pypi.org/project/codeanalyzer-python/) -[![Release](https://img.shields.io/github/actions/workflow/status/codellm-devkit/codeanalyzer-python/release.yml?style=for-the-badge&label=release&logo=github)](https://github.com/codellm-devkit/codeanalyzer-python/actions/workflows/release.yml) +[![GitHub release](https://img.shields.io/github/v/release/codellm-devkit/codeanalyzer-python?style=for-the-badge&logo=github&label=GitHub&color=2dba4e)](https://github.com/codellm-devkit/codeanalyzer-python/releases/latest) +[![Release](https://img.shields.io/github/actions/workflow/status/codellm-devkit/codeanalyzer-python/release.yml?style=for-the-badge&label=release&logo=githubactions&logoColor=white)](https://github.com/codellm-devkit/codeanalyzer-python/actions/workflows/release.yml) [![License](https://img.shields.io/badge/License-Apache%202.0-blue?style=for-the-badge)](./LICENSE) @@ -133,102 +134,59 @@ $ canpy --help Static Analysis on Python source code using Jedi, CodeQL and Tree sitter. -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --input -i PATH Path to the │ -│ project root │ -│ directory (not │ -│ required for │ -│ --emit schema). │ -│ --output -o PATH Output directory │ -│ for artifacts. │ -│ --format -f [json|msgpack] Output format for │ -│ --emit json: json │ -│ or msgpack. │ -│ [default: json] │ -│ --emit [json|neo4j|sche Output target: │ -│ ma] json │ -│ (analysis.json, │ -│ default) | neo4j │ -│ (graph.cypher or │ -│ live Bolt push) | │ -│ schema (the Neo4j │ -│ schema.json │ -│ contract). │ -│ [default: json] │ -│ --app-name TEXT Logical │ -│ application name │ -│ for the graph │ -│ :PyApplication │ -│ anchor (default: │ -│ input dir name). │ -│ --neo4j-uri TEXT Push the graph to │ -│ a live Neo4j over │ -│ Bolt │ -│ (incremental); │ -│ omit to write │ -│ graph.cypher. │ -│ [env var: │ -│ NEO4J_URI] │ -│ --neo4j-user TEXT Neo4j username. │ -│ [env var: │ -│ NEO4J_USERNAME] │ -│ [default: neo4j] │ -│ --neo4j-password TEXT Neo4j password. │ -│ Prefer the env │ -│ var over the flag │ -│ (the flag is │ -│ visible in shell │ -│ history / process │ -│ list). │ -│ [env var: │ -│ NEO4J_PASSWORD] │ -│ [default: neo4j] │ -│ --neo4j-database TEXT Neo4j database │ -│ name (default: │ -│ server default). │ -│ [env var: │ -│ NEO4J_DATABASE] │ -│ --codeql --no-codeql Enable │ -│ CodeQL-based │ -│ analysis. │ -│ [default: │ -│ no-codeql] │ -│ --ray --no-ray Enable Ray for │ -│ distributed │ -│ analysis. │ -│ [default: no-ray] │ -│ --eager --lazy Enable eager or │ -│ lazy analysis. │ -│ Defaults to lazy. │ -│ [default: lazy] │ -│ --skip-tests --include-tests Skip test files │ -│ in analysis. │ -│ [default: │ -│ skip-tests] │ -│ --file-name PATH Analyze only the │ -│ specified file │ -│ (relative to │ -│ input directory). │ -│ --cache-dir -c PATH Directory to │ -│ store analysis │ -│ cache. Defaults │ -│ to │ -│ '.codeanalyzer' │ -│ in the input │ -│ directory. │ -│ --clear-cache --keep-cache Clear cache after │ -│ analysis. By │ -│ default, cache is │ -│ retained. │ -│ [default: │ -│ keep-cache] │ -│ -v INTEGER Increase │ -│ verbosity: -v, │ -│ -vv, -vvv │ -│ [default: 0] │ -│ --help Show this message │ -│ and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --input -i PATH Path to the project root directory │ +│ (not required for --emit schema). │ +│ --output -o PATH Output directory for artifacts. │ +│ --format -f [json|msgpack] Output format for --emit json: │ +│ json or msgpack. │ +│ [default: json] │ +│ --emit [json|neo4j|schema] Output target: json │ +│ (analysis.json, default) | neo4j │ +│ (graph.cypher or live Bolt push) | │ +│ schema (the Neo4j schema.json │ +│ contract). │ +│ [default: json] │ +│ --app-name TEXT Logical application name for the │ +│ graph :PyApplication anchor │ +│ (default: input dir name). │ +│ --neo4j-uri TEXT Push the graph to a live Neo4j │ +│ over Bolt (incremental); omit to │ +│ write graph.cypher. │ +│ [env var: NEO4J_URI] │ +│ --neo4j-user TEXT Neo4j username. │ +│ [env var: NEO4J_USERNAME] │ +│ [default: neo4j] │ +│ --neo4j-password TEXT Neo4j password. Prefer the env var │ +│ over the flag (the flag is visible │ +│ in shell history / process list). │ +│ [env var: NEO4J_PASSWORD] │ +│ [default: neo4j] │ +│ --neo4j-database TEXT Neo4j database name (default: │ +│ server default). │ +│ [env var: NEO4J_DATABASE] │ +│ --codeql --no-codeql Enable CodeQL-based analysis. │ +│ [default: no-codeql] │ +│ --ray --no-ray Enable Ray for distributed │ +│ analysis. │ +│ [default: no-ray] │ +│ --eager --lazy Enable eager or lazy analysis. │ +│ Defaults to lazy. │ +│ [default: lazy] │ +│ --skip-tests --include-tests Skip test files in analysis. │ +│ [default: skip-tests] │ +│ --file-name PATH Analyze only the specified file │ +│ (relative to input directory). │ +│ --cache-dir -c PATH Directory to store analysis cache. │ +│ Defaults to '.codeanalyzer' in the │ +│ input directory. │ +│ --clear-cache --keep-cache Clear cache after analysis. By │ +│ default, cache is retained. │ +│ [default: keep-cache] │ +│ -v INTEGER Increase verbosity: -v, -vv, -vvv │ +│ [default: 0] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` diff --git a/scripts/update_readme.py b/scripts/update_readme.py index 75cb8f0..e424448 100644 --- a/scripts/update_readme.py +++ b/scripts/update_readme.py @@ -31,6 +31,16 @@ def render_help() -> str: os.environ["TERM"] = "dumb" os.environ["NO_COLOR"] = "1" + # Typer caps help width at rich_utils.MAX_WIDTH (default 80) regardless of + # COLUMNS, so CI renders the box narrower than a dev machine. Pin it to WIDTH + # so the rendered help is wide and byte-identical everywhere. + try: + import typer.rich_utils as _ru + + _ru.MAX_WIDTH = WIDTH + except Exception: # pragma: no cover - defensive across Typer versions + pass + from click.testing import CliRunner from typer.main import get_command From cdaa793127b800c17af169b20a8e5509dc1bc8c6 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Sat, 20 Jun 2026 16:16:26 -0400 Subject: [PATCH 3/9] feat(homebrew): add a Homebrew tap formula + release job Add `brew install codellm-devkit/tap/codeanalyzer-python`: - packaging/homebrew/generate_formula.sh emits a formula that depends on `uv` and installs version-pinned wrappers running the published PyPI release via `uvx`. The package is pure-Python with heavy native deps (ray, pandas, numpy), so vendoring every transitive dependency as a Homebrew resource is impractical and pip-at-build-time is blocked by the sandbox; the uv-tool approach keeps the formula tiny and `brew install` sandbox-safe. - release.yml gains a `homebrew` job (needs: release) that regenerates the formula on each tag and pushes it to codellm-devkit/homebrew-tap (requires the HOMEBREW_TAP_TOKEN secret, same as the codeanalyzer-typescript release). - README: Install via Homebrew section; CHANGELOG: Unreleased entry. --- .github/workflows/release.yml | 40 ++++++++++++++++++ CHANGELOG.md | 5 +++ README.md | 10 +++++ packaging/homebrew/generate_formula.sh | 56 ++++++++++++++++++++++++++ 4 files changed, 111 insertions(+) create mode 100755 packaging/homebrew/generate_formula.sh diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5002cb9..d1e0a33 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -136,3 +136,43 @@ jobs: - name: Publish to PyPI via Trusted Publishing run: uv publish + + # Regenerate the Homebrew formula and push it to the shared tap. Split into its + # own job (needs: release) so a tap-push failure -- e.g. a missing + # HOMEBREW_TAP_TOKEN -- is isolated from the PyPI and GitHub Release steps above. + # The non-Rust equivalent of what cargo-dist does for you. + homebrew: + needs: release + if: startsWith(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v4 + + - name: Derive version from tag + id: ver + run: echo "version=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT" + + - name: Generate Homebrew formula + env: + REPO: ${{ github.repository }} + VERSION: ${{ steps.ver.outputs.version }} + run: | + chmod +x packaging/homebrew/generate_formula.sh + ./packaging/homebrew/generate_formula.sh > codeanalyzer-python.rb + cat codeanalyzer-python.rb + + - name: Push formula to codellm-devkit/homebrew-tap + env: + TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} # PAT with write access to homebrew-tap + VERSION: ${{ steps.ver.outputs.version }} + run: | + git clone "https://x-access-token:${TAP_TOKEN}@github.com/codellm-devkit/homebrew-tap.git" tap + mkdir -p tap/Formula + cp codeanalyzer-python.rb tap/Formula/codeanalyzer-python.rb + cd tap + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add Formula/codeanalyzer-python.rb + git commit -m "codeanalyzer-python ${VERSION}" || { echo "no formula change"; exit 0; } + git push diff --git a/CHANGELOG.md b/CHANGELOG.md index 6200df6..7a36b1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added +- **Homebrew tap** — `brew install codellm-devkit/tap/codeanalyzer-python`. The release workflow auto-generates a formula (`packaging/homebrew/generate_formula.sh`) that installs the pinned PyPI release as an isolated `uv` tool, and pushes it to `codellm-devkit/homebrew-tap`. Because the package is pure-Python with heavy native dependencies (`ray`, `pandas`, `numpy`), the formula depends on `uv` and runs the release via `uvx` rather than vendoring every transitive dependency as a Homebrew resource. + ## [0.2.0] - 2026-06-20 ### Added diff --git a/README.md b/README.md index 8211145..9f87640 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ and merges them with the Jedi-derived edges, also backfilling callees Jedi could - [Prerequisites](#prerequisites) - [Install via pip (PyPI)](#install-via-pip-pypi) - [Install via shell script](#install-via-shell-script) + - [Install via Homebrew](#install-via-homebrew) - [Build from source](#build-from-source) - [Usage](#usage) - [Options](#options) @@ -102,6 +103,15 @@ Install the CLI as an isolated tool with the one-line installer (provisions via curl --proto '=https' --tlsv1.2 -LsSf https://github.com/codellm-devkit/codeanalyzer-python/releases/latest/download/canpy-installer.sh | sh ``` +### Install via Homebrew + +```sh +brew install codellm-devkit/tap/codeanalyzer-python +``` + +The formula depends on [uv](https://docs.astral.sh/uv/) and installs `canpy` as an isolated, +version-pinned uv tool (the package and its dependencies are resolved and cached on first run). + ### Build from source This project uses [uv](https://docs.astral.sh/uv/) for dependency management. diff --git a/packaging/homebrew/generate_formula.sh b/packaging/homebrew/generate_formula.sh new file mode 100755 index 0000000..458f436 --- /dev/null +++ b/packaging/homebrew/generate_formula.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# +# Generate the Homebrew formula for codeanalyzer-python (the `canpy` CLI). +# +# Unlike the codeanalyzer-typescript sibling -- which ships a single self-contained +# binary that the formula just downloads -- codeanalyzer-python is a pure-Python +# package published to PyPI with heavy native dependencies (ray, pandas, numpy). +# Vendoring every transitive dependency as a Homebrew `resource` is impractical +# (ray is not buildable from an sdist), and pip-installing at build time is blocked +# by Homebrew's network sandbox. +# +# So the formula stays tiny: it depends on `uv` and installs version-pinned wrapper +# scripts that run the published PyPI release via `uvx` (uv resolves and caches the +# isolated environment on first run). This keeps `brew install` sandbox-safe (no +# network at build time) while pinning the exact released version. +# +# Usage: +# REPO=codellm-devkit/codeanalyzer-python VERSION=0.2.0 \ +# ./generate_formula.sh > codeanalyzer-python.rb +# +set -euo pipefail + +REPO="${REPO:?set REPO, e.g. codellm-devkit/codeanalyzer-python}" +VERSION="${VERSION:?set VERSION, e.g. 0.2.0}" + +cat < Date: Sat, 20 Jun 2026 16:26:33 -0400 Subject: [PATCH 4/9] fix(homebrew): formula must declare url + sha256 `brew install` failed with "formula requires at least a URL" because the formula had only version + depends_on. Point url at the released sdist and add its sha256 (byte-identical to the PyPI sdist); the install method still just writes the uv wrappers. The release `homebrew` job now hashes the published sdist and passes SHA256 to generate_formula.sh. --- .github/workflows/release.yml | 7 ++++++- packaging/homebrew/generate_formula.sh | 11 ++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d1e0a33..ab14353 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -159,7 +159,12 @@ jobs: VERSION: ${{ steps.ver.outputs.version }} run: | chmod +x packaging/homebrew/generate_formula.sh - ./packaging/homebrew/generate_formula.sh > codeanalyzer-python.rb + # The release job just published the sdist as a Release asset; hash the + # exact bytes users will download so the formula checksum always matches. + sdist="https://github.com/${REPO}/releases/download/v${VERSION}/codeanalyzer_python-${VERSION}.tar.gz" + SHA256="$(curl -fLsS "$sdist" | shasum -a 256 | cut -d' ' -f1)" + REPO="$REPO" VERSION="$VERSION" SHA256="$SHA256" \ + ./packaging/homebrew/generate_formula.sh > codeanalyzer-python.rb cat codeanalyzer-python.rb - name: Push formula to codellm-devkit/homebrew-tap diff --git a/packaging/homebrew/generate_formula.sh b/packaging/homebrew/generate_formula.sh index 458f436..70846cf 100755 --- a/packaging/homebrew/generate_formula.sh +++ b/packaging/homebrew/generate_formula.sh @@ -14,14 +14,21 @@ # isolated environment on first run). This keeps `brew install` sandbox-safe (no # network at build time) while pinning the exact released version. # +# Homebrew requires every formula to declare a source `url` + `sha256` for its +# stable spec, so we point at the released sdist (byte-identical to the PyPI one). +# The install method ignores the unpacked source and just writes uv wrappers, but +# the url anchors the version and satisfies Homebrew's spec requirement. +# # Usage: -# REPO=codellm-devkit/codeanalyzer-python VERSION=0.2.0 \ +# REPO=codellm-devkit/codeanalyzer-python VERSION=0.2.0 SHA256= \ # ./generate_formula.sh > codeanalyzer-python.rb # set -euo pipefail REPO="${REPO:?set REPO, e.g. codellm-devkit/codeanalyzer-python}" VERSION="${VERSION:?set VERSION, e.g. 0.2.0}" +SHA256="${SHA256:?set SHA256 of the released sdist}" +SDIST_URL="https://github.com/${REPO}/releases/download/v${VERSION}/codeanalyzer_python-${VERSION}.tar.gz" cat < Date: Mon, 22 Jun 2026 16:33:23 -0400 Subject: [PATCH 5/9] feat(analysis): install the analysis venv with uv and wire it to Jedi Closes #47 The per-project analysis venv was built and populated but never used: __init__ left self.virtualenv = None and never reassigned it, so SymbolTableBuilder got virtualenv=None and Jedi resolved against the default environment, ignoring the installed dependencies. Set self.virtualenv to the venv path on both a fresh build and a lazy reuse so Jedi resolves the project's third-party imports. Also install dependencies with uv (uv pip install --python ) instead of pip: uv resolves and downloads in parallel with a shared global cache, which is dramatically faster for large dependency trees (e.g. Odoo). uv ships as a self-contained binary in its wheel, so it is present wherever canpy is installed (including Docker); fall back to python -m pip when uv cannot be located. --- codeanalyzer/core.py | 51 +++++++++++++++++++++++++++++++------------- pyproject.toml | 4 ++++ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/codeanalyzer/core.py b/codeanalyzer/core.py index b8cfcca..dd27b72 100644 --- a/codeanalyzer/core.py +++ b/codeanalyzer/core.py @@ -226,6 +226,29 @@ def _get_base_interpreter() -> Path: f"a working Python interpreter that can create virtual environments." ) + @staticmethod + def _uv_bin() -> Optional[str]: + """Path to a uv binary: the one bundled with the ``uv`` PyPI package (a + dependency, so normally always present -- including inside a Docker image), + else a uv on PATH, else ``None`` (callers fall back to pip).""" + try: + from uv import find_uv_bin + + return str(find_uv_bin()) + except Exception: + return shutil.which("uv") + + def _install_into_venv(self, venv_python: Path, args: List[str]) -> None: + """Install packages into the target venv, preferring uv for speed (parallel + downloads + a shared global cache) and falling back to the venv's own pip + when uv is unavailable.""" + uv = self._uv_bin() + if uv: + cmd = [uv, "pip", "install", "--python", str(venv_python), *args] + else: + cmd = [str(venv_python), "-m", "pip", "install", *args] + self._cmd_exec_helper(cmd, cwd=self.project_dir, check=True) + def __enter__(self) -> "Codeanalyzer": # If no virtualenv is provided, try to create one using requirements.txt or pyproject.toml venv_path = self.cache_dir / self.project_dir.name / "virtualenv" @@ -249,24 +272,19 @@ def __enter__(self) -> "Codeanalyzer": ("test-requirements.txt", ["-r"]), ] - for dep_file, pip_args in dependency_files: + for dep_file, _ in dependency_files: if (self.project_dir / dep_file).exists(): logger.info(f"Installing dependencies from {dep_file}") - self._cmd_exec_helper( - [str(venv_python), "-m", "pip", "install", "-U"] + pip_args + [str(self.project_dir / dep_file)], - cwd=self.project_dir, - check=True, + self._install_into_venv( + venv_python, + ["--upgrade", "-r", str(self.project_dir / dep_file)], ) # Handle Pipenv files if (self.project_dir / "Pipfile").exists(): logger.info("Installing dependencies from Pipfile") # Note: This would require pipenv to be installed - self._cmd_exec_helper( - [str(venv_python), "-m", "pip", "install", "pipenv"], - cwd=self.project_dir, - check=True, - ) + self._install_into_venv(venv_python, ["pipenv"]) self._cmd_exec_helper( ["pipenv", "install", "--dev"], cwd=self.project_dir, @@ -289,14 +307,17 @@ def __enter__(self) -> "Codeanalyzer": if any((self.project_dir / file).exists() for file in package_definition_files): logger.info("Installing project in editable mode") - self._cmd_exec_helper( - [str(venv_python), "-m", "pip", "install", "-e", str(self.project_dir)], - cwd=self.project_dir, - check=True, - ) + self._install_into_venv(venv_python, ["-e", str(self.project_dir)]) else: logger.warning("No package definition files found, skipping editable installation") + # Point Jedi at the analysis venv so it resolves the project's third-party + # imports. This runs on both a fresh build and a lazy reuse of an existing + # venv -- previously self.virtualenv stayed None, so the install above was + # never actually used by the symbol-table builder. + if venv_path.exists(): + self.virtualenv = venv_path + if self.using_codeql: logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}") diff --git a/pyproject.toml b/pyproject.toml index 4b2b57c..acc00a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,10 @@ dependencies = [ "ray==2.0.0; python_version < '3.11'", "ray>=2.10.0,<3.0.0; python_version >= '3.11'", "packaging>=25.0", + # uv -- installs the analyzed project's deps into the analysis venv quickly. + # Shipped as a self-contained binary in its wheel, so it's available wherever + # canpy is pip-installed (incl. Docker); core.py falls back to pip without it. + "uv>=0.5.0", ] [project.optional-dependencies] From df0eae935269910917969d3b422449223d3800af Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Mon, 22 Jun 2026 17:02:48 -0400 Subject: [PATCH 6/9] feat(schema,neo4j): first-class external_symbols; fix dropped call edges (#44) Closes #44 Adopt the model codeanalyzer-typescript uses: external call targets are now first-class in the IR instead of being re-derived ad hoc during Neo4j projection. - schema: add PyExternalSymbol{name, module} and PyApplication.external_symbols, keyed by signature (mirrors TSExternalSymbol). - core: _compute_external_symbols() classifies every call-graph endpoint not declared in the symbol table as an external (name/module from the signature), so analysis.json carries external info that was previously a bare target string. - neo4j: :PyExternal gains a `module` property (SCHEMA_VERSION 1.0.0 -> 1.1.0, additive). project()'s _call_endpoint classifies authoritatively from external_symbols rather than a "present in the graph" heuristic, so an imported module name (a :PyPackage) can no longer shadow a call target and silently drop the PY_CALLS edge. - rows: track node identity by (merge_label, value) so deferred PY_EXTENDS / PY_RESOLVES_TO edges can't be shadowed either. Fixes the ~3.7% of call edges (e.g. targets os/re/json) that were dropped from the emitted graph. Adds a regression test and exercises external_symbols in the sample app; regenerates schema.neo4j.json. --- codeanalyzer/core.py | 58 ++++++++++++++++++++++++++++++-- codeanalyzer/neo4j/catalog.py | 4 +-- codeanalyzer/neo4j/project.py | 28 +++++++++------ codeanalyzer/neo4j/rows.py | 15 ++++++--- codeanalyzer/schema/__init__.py | 2 ++ codeanalyzer/schema/py_schema.py | 15 +++++++++ schema.neo4j.json | 5 +-- test/sample_graph_app.py | 4 +++ test/test_neo4j_schema.py | 34 +++++++++++++++++++ 9 files changed, 144 insertions(+), 21 deletions(-) diff --git a/codeanalyzer/core.py b/codeanalyzer/core.py index dd27b72..314b5ec 100644 --- a/codeanalyzer/core.py +++ b/codeanalyzer/core.py @@ -8,7 +8,13 @@ import ray from codeanalyzer.utils import logger -from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json +from codeanalyzer.schema import ( + PyApplication, + PyExternalSymbol, + PyModule, + model_dump_json, + model_validate_json, +) from codeanalyzer.schema.py_schema import PyCallEdge from codeanalyzer.semantic_analysis.call_graph import ( jedi_call_graph_edges, @@ -379,6 +385,43 @@ def __exit__(self, *args, **kwargs) -> None: logger.info(f"Clearing cache directory: {self.cache_dir}") shutil.rmtree(self.cache_dir) + @staticmethod + def _compute_external_symbols(symbol_table, call_graph): + """Build the external-symbol map: every call-graph endpoint whose signature + is not a declared class/callable in the symbol table is an external (an + imported library or builtin member). ``name``/``module`` are derived from + the signature (best effort: split on the last dot).""" + declared = set() + + def walk_callable(c): + declared.add(c.signature) + for ic in (c.inner_callables or {}).values(): + walk_callable(ic) + for cl in (c.inner_classes or {}).values(): + walk_class(cl) + + def walk_class(cl): + declared.add(cl.signature) + for m in (cl.methods or {}).values(): + walk_callable(m) + for ic in (cl.inner_classes or {}).values(): + walk_class(ic) + + for mod in symbol_table.values(): + for c in (mod.functions or {}).values(): + walk_callable(c) + for cl in (mod.classes or {}).values(): + walk_class(cl) + + externals: Dict[str, PyExternalSymbol] = {} + for edge in call_graph: + for sig in (edge.source, edge.target): + if sig in declared or sig in externals: + continue + module, name = sig.rsplit(".", 1) if "." in sig else (sig, sig) + externals[sig] = PyExternalSymbol(name=name, module=module) + return externals + def analyze(self) -> PyApplication: """Analyze the project and return a PyApplication with symbol table. @@ -418,8 +461,19 @@ def analyze(self) -> PyApplication: jedi_edges = jedi_call_graph_edges(symbol_table) call_graph = merge_edges(jedi_edges, codeql_edges) + # Classify call-graph endpoints that are not declared in the symbol table + # (imported library / builtin members) once, so the JSON and Neo4j backends + # share one authoritative external-symbol set. + external_symbols = self._compute_external_symbols(symbol_table, call_graph) + # Recreate pyapplication - app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build() + app = ( + PyApplication.builder() + .symbol_table(symbol_table) + .call_graph(call_graph) + .external_symbols(external_symbols) + .build() + ) # Save to cache self._save_analysis_cache(app, cache_file) diff --git a/codeanalyzer/neo4j/catalog.py b/codeanalyzer/neo4j/catalog.py index 37f8a1a..155d86a 100644 --- a/codeanalyzer/neo4j/catalog.py +++ b/codeanalyzer/neo4j/catalog.py @@ -34,7 +34,7 @@ from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES -SCHEMA_VERSION = "1.0.0" +SCHEMA_VERSION = "1.1.0" # PropType ∈ {"string", "integer", "float", "boolean", "string[]", "integer[]"}. @@ -119,7 +119,7 @@ class RelType: "PyExternal", "PySymbol", "signature", - {"signature": "string", "name": "string"}, + {"signature": "string", "name": "string", "module": "string"}, ), NodeLabel("PyPackage", "PyPackage", "name", {"name": "string"}), NodeLabel( diff --git a/codeanalyzer/neo4j/project.py b/codeanalyzer/neo4j/project.py index 4878cda..7c4deb7 100644 --- a/codeanalyzer/neo4j/project.py +++ b/codeanalyzer/neo4j/project.py @@ -60,11 +60,12 @@ def project(app: PyApplication, app_name: str) -> GraphRows: b.edge("PY_HAS_MODULE", app_ref, mod_ref) _project_module_body(b, file_key, mod_ref, mod) - # The aggregated :PY_CALLS twin. Endpoints not present in the symbol table become - # :PyExternal ghost nodes (the analyzer already preserves them as ghost nodes). + # The aggregated :PY_CALLS twin. Endpoints listed in app.external_symbols become + # :PyExternal ghost nodes; the rest are declared :PySymbol nodes already emitted. + externals = app.external_symbols or {} for e in app.call_graph: - src = _call_endpoint(b, e.source) - tgt = _call_endpoint(b, e.target) + src = _call_endpoint(b, e.source, externals) + tgt = _call_endpoint(b, e.target, externals) b.edge("PY_CALLS", src, tgt, _call_edge_props(e.weight, list(e.provenance or []))) return b.finish() @@ -74,13 +75,20 @@ def _sym(signature: str) -> NodeRef: return NodeRef("PySymbol", "signature", signature) -def _call_endpoint(b: RowBuilder, signature: str) -> NodeRef: - """A call-graph endpoint: a known callable already emitted, or a phantom - :PyExternal symbol materialized on demand for a ghost target.""" - if b.has_key(signature): +def _call_endpoint(b: RowBuilder, signature: str, externals: dict) -> NodeRef: + """A call-graph endpoint: a declared callable already emitted, or an external + symbol (imported library / builtin member) materialized as a :PyExternal ghost. + + Classification is authoritative -- it comes from ``app.external_symbols``, not a + "present in the graph" heuristic -- so an imported module name (which exists only + as a :PyPackage) can never shadow the call target. A small fallback still + materializes an external for any endpoint that is neither declared nor listed.""" + ext = externals.get(signature) + if ext is None and b.has_key("PySymbol", signature): return _sym(signature) - name = signature.rsplit(".", 1)[-1] if "." in signature else signature - return b.node(["PySymbol", "PyExternal"], "signature", signature, {"name": name}) + name = ext.name if ext is not None else (signature.rsplit(".", 1)[-1] if "." in signature else signature) + module = ext.module if ext is not None else None + return b.node(["PySymbol", "PyExternal"], "signature", signature, prune({"name": name, "module": module})) # ---------------------------------------------------------------------------------------------- diff --git a/codeanalyzer/neo4j/rows.py b/codeanalyzer/neo4j/rows.py index 9edecde..cbc381f 100644 --- a/codeanalyzer/neo4j/rows.py +++ b/codeanalyzer/neo4j/rows.py @@ -83,7 +83,11 @@ def __init__(self) -> None: self._nodes: Dict[str, NodeRow] = {} # key: f"{labels[0]} {value}" self._edges: List[EdgeRow] = [] self._deferred: List[EdgeRow] = [] # edges gated against node existence at finish() - self._keys: set = set() # every node value seen, for resolved-gating + # (merge_label, value) of every node seen, for resolved-gating. Keyed by + # label too so a :PyPackage name can't shadow a :PySymbol signature (and + # vice versa) — otherwise a call to an imported module name like ``os`` + # resolves to a :PySymbol node that was never created and the edge is lost. + self._keys: set = set() def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> NodeRef: """Upsert a node. Re-seeing the same ``(labels[0], value)`` merges props @@ -98,7 +102,7 @@ def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> No existing.labels.append(label) else: self._nodes[node_id] = NodeRow(list(labels), key_prop, value, dict(props)) - self._keys.add(value) + self._keys.add((labels[0], value)) return NodeRef(labels[0], key_prop, value) def edge(self, type_: str, from_ref: NodeRef, to_ref: NodeRef, props: Optional[Props] = None) -> None: @@ -121,12 +125,13 @@ def edge_to_symbol( ) ) - def has_key(self, value: str) -> bool: - return value in self._keys + def has_key(self, label: str, value: str) -> bool: + """Whether a node with this ``(merge_label, value)`` identity was emitted.""" + return (label, value) in self._keys def finish(self) -> GraphRows: for e in self._deferred: - if e.to_ref.value in self._keys: + if (e.to_ref.label, e.to_ref.value) in self._keys: self._edges.append(e) nodes = sorted(self._nodes.values(), key=lambda n: f"{n.labels[0]} {n.value}") edges = sorted(self._edges, key=lambda e: f"{e.type} {e.from_ref.value} {e.to_ref.value}") diff --git a/codeanalyzer/schema/__init__.py b/codeanalyzer/schema/__init__.py index 8853909..bcfa976 100644 --- a/codeanalyzer/schema/__init__.py +++ b/codeanalyzer/schema/__init__.py @@ -8,6 +8,7 @@ PyClass, PyClassAttribute, PyComment, + PyExternalSymbol, PyImport, PyModule, PyVariableDeclaration, @@ -15,6 +16,7 @@ __all__ = [ "PyApplication", + "PyExternalSymbol", "PyImport", "PyComment", "PyModule", diff --git a/codeanalyzer/schema/py_schema.py b/codeanalyzer/schema/py_schema.py index 8bef391..c69e5fb 100644 --- a/codeanalyzer/schema/py_schema.py +++ b/codeanalyzer/schema/py_schema.py @@ -358,6 +358,17 @@ class PyCallEdge(BaseModel): provenance: List[Literal["jedi", "codeql", "joern"]] = [] +@builder +@msgpk +class PyExternalSymbol(BaseModel): + """A call-graph target outside the analyzed project -- an imported library or + builtin member. Mirrors codeanalyzer-typescript's ``TSExternalSymbol`` and is + keyed in ``PyApplication.external_symbols`` by its call-graph signature.""" + + name: str # the member/short name, e.g. "get" for "requests.get" + module: Optional[str] = None # best-effort owning module, e.g. "requests" + + @builder @msgpk class PyApplication(BaseModel): @@ -365,3 +376,7 @@ class PyApplication(BaseModel): symbol_table: Dict[str, PyModule] call_graph: List[PyCallEdge] = [] + # Call-graph endpoints not declared in the symbol table (imported library / + # builtin members), keyed by signature. Populated by the analyzer so every + # backend (JSON and Neo4j) shares one authoritative external-symbol set. + external_symbols: Dict[str, PyExternalSymbol] = {} diff --git a/schema.neo4j.json b/schema.neo4j.json index ffccf29..de5d100 100644 --- a/schema.neo4j.json +++ b/schema.neo4j.json @@ -1,5 +1,5 @@ { - "schema_version": "1.0.0", + "schema_version": "1.1.0", "generator": "codeanalyzer-python", "marker_labels": [], "node_labels": [ @@ -67,7 +67,8 @@ "key": "signature", "properties": { "signature": "string", - "name": "string" + "name": "string", + "module": "string" } }, { diff --git a/test/sample_graph_app.py b/test/sample_graph_app.py index 11124f4..b4232b9 100644 --- a/test/sample_graph_app.py +++ b/test/sample_graph_app.py @@ -14,6 +14,7 @@ PyClass, PyClassAttribute, PyComment, + PyExternalSymbol, PyImport, PyModule, PyVariableDeclaration, @@ -149,4 +150,7 @@ def make_sample_app() -> PyApplication: return PyApplication( symbol_table={"src/service.py": service_mod, "src/util.py": util_mod}, call_graph=call_graph, + # The ghost edge's target (requests.get) is a library member, recorded as a + # first-class external symbol so the projection emits a :PyExternal for it. + external_symbols={"requests.get": PyExternalSymbol(name="get", module="requests")}, ) diff --git a/test/test_neo4j_schema.py b/test/test_neo4j_schema.py index 401b465..bba6336 100644 --- a/test/test_neo4j_schema.py +++ b/test/test_neo4j_schema.py @@ -12,6 +12,8 @@ from codeanalyzer.neo4j import NODE_LABELS, REL_TYPES, build_schema_document, project from codeanalyzer.neo4j.catalog import MARKER_LABELS from codeanalyzer.neo4j.cypher import render_cypher +from codeanalyzer.schema import PyApplication, PyCallable, PyImport, PyModule +from codeanalyzer.schema.py_schema import PyCallEdge from sample_graph_app import make_sample_app @@ -87,6 +89,38 @@ def test_render_cypher_is_deterministic_and_self_contained(): assert "MERGE (n:PySymbol {signature: row.k})" in a +def test_call_edge_to_imported_module_name_is_not_dropped(): + """Regression for #44: a call whose target is a bare module name that is also + imported (e.g. ``os``) must not be dropped. The import creates a :PyPackage + named ``os``; that must not shadow the call target's :PySymbol signature.""" + caller = PyCallable( + name="caller", path="m.py", signature="m.caller", return_type="None", + code="def caller():\n os.getcwd()", start_line=1, end_line=2, + code_start_line=1, cyclomatic_complexity=1, + ) + mod = PyModule( + file_path="m.py", module_name="m", + imports=[PyImport(module="os", name="getcwd")], + functions={"caller": caller}, + content_hash="h", last_modified=1.0, file_size=10, + ) + app = PyApplication( + symbol_table={"m.py": mod}, + call_graph=[PyCallEdge(source="m.caller", target="os", weight=1, provenance=["jedi"])], + ) + rows = project(app, "app") + + calls_to_os = [e for e in rows.edges if e.type == "PY_CALLS" and e.to_ref.value == "os"] + assert len(calls_to_os) == 1, "PY_CALLS edge to imported module name 'os' was dropped" + + # 'os' is materialized as a :PyExternal symbol (the call target) ... + assert any(n.value == "os" and "PyExternal" in n.labels for n in rows.nodes), \ + ":PyExternal ghost for the call target 'os' is missing" + # ... distinct from the :PyPackage 'os' created by the import. + assert any(n.value == "os" and "PyPackage" in n.labels for n in rows.nodes), \ + ":PyPackage for the import 'os' is missing" + + def test_checked_in_schema_matches_catalog(): """Run `canpy --emit schema > schema.neo4j.json` if this fails.""" on_disk_path = Path(__file__).resolve().parents[1] / "schema.neo4j.json" From de41937cbe60851d55f3c26952002ff41cba9628 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Mon, 22 Jun 2026 17:08:36 -0400 Subject: [PATCH 7/9] fix(neo4j): scope bolt full-run orphan prune to the application anchor Closes #45 The full-run prune deleted any :PyModule whose file_key was not in the current emit across the ENTIRE database -- not just the application being written -- so a full-run push for application B wiped application A's modules, leaving an orphaned :PyApplication with zero PY_HAS_MODULE edges. A single Neo4j database therefore could not hold multiple applications via full-run --emit neo4j. Anchor the prune to the :PyApplication {name} being emitted (MATCH (:PyApplication {name:$app})-[:PY_HAS_MODULE]->(m:PyModule) WHERE NOT m.file_key IN $present ...), so it only removes that application's vanished modules. Adds a container regression test (app-b push leaves app-a intact). --- codeanalyzer/neo4j/bolt.py | 15 +++++++++++++-- test/test_neo4j_bolt.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/codeanalyzer/neo4j/bolt.py b/codeanalyzer/neo4j/bolt.py index 4ae102b..dc60986 100644 --- a/codeanalyzer/neo4j/bolt.py +++ b/codeanalyzer/neo4j/bolt.py @@ -77,6 +77,13 @@ def session(): for stmt in [*CONSTRAINTS, *INDEXES]: s.run(stmt) + # The application anchor (a shared node) — used to scope the orphan prune + # so it never touches modules belonging to a different :PyApplication. + app_name = next( + (n.value for n in rows.nodes if n.labels and n.labels[0] == "PyApplication"), + None, + ) + # Partition nodes by owning module; shared nodes have no _module. by_module: Dict[str, List[NodeRow]] = {} shared: List[NodeRow] = [] @@ -135,13 +142,17 @@ def _purge(tx, module=m, node_keys=keys): _upsert_edges(session, neo4j, edges) # 6. orphan prune — only safe on a full run (a targeted run can't tell deleted from untargeted). - if full_run: + # Scope to THIS application's anchor so a full run for application B never + # deletes application A's modules from a shared database. + if full_run and app_name is not None: present = list(by_module.keys()) with session() as s: res = s.run( - "MATCH (m:PyModule) WHERE NOT m.file_key IN $present " + "MATCH (:PyApplication {name: $app})-[:PY_HAS_MODULE]->(m:PyModule) " + "WHERE NOT m.file_key IN $present " f"OPTIONAL MATCH (m)-{DESCENDANTS}->(x) DETACH DELETE x, m " "RETURN count(m) AS pruned", + app=app_name, present=present, ) pruned = res.single() diff --git a/test/test_neo4j_bolt.py b/test/test_neo4j_bolt.py index ee84e01..6f02bd8 100644 --- a/test/test_neo4j_bolt.py +++ b/test/test_neo4j_bolt.py @@ -15,9 +15,24 @@ from codeanalyzer.neo4j import project from codeanalyzer.neo4j.bolt import BoltConfig, bolt_writer +from codeanalyzer.schema import PyApplication, PyCallable, PyModule from sample_graph_app import make_sample_app + +def _single_module_app(file_key: str = "appb/main.py") -> PyApplication: + """A minimal second application with its own (distinct) module file_key.""" + fn = PyCallable( + name="main", path=file_key, signature="appb.main", return_type="None", + code="def main():\n ...", start_line=1, end_line=2, + code_start_line=1, cyclomatic_complexity=1, + ) + mod = PyModule( + file_path=file_key, module_name="appb.main", functions={"main": fn}, + content_hash="h-b", last_modified=1.0, file_size=10, + ) + return PyApplication(symbol_table={file_key: mod}, call_graph=[]) + pytestmark = pytest.mark.skipif( not os.environ.get("RUN_CONTAINER_TESTS"), reason="opt-in: set RUN_CONTAINER_TESTS=1 (needs Docker/Podman) to run the Neo4j bolt test", @@ -105,6 +120,21 @@ def test_full_push_materializes_the_whole_graph_and_schema(driver, cfg): assert _num(driver, "MATCH (e:PyExternal) RETURN count(e)") >= 1 +def test_full_run_does_not_prune_another_applications_modules(driver, cfg): + """Regression for #45: a full-run push for one application must not prune the + modules of a *different* application sharing the database.""" + bolt_writer(project(make_sample_app(), "app-a"), cfg, full_run=True) + before = _num(driver, "MATCH (:PyApplication {name:'app-a'})-[:PY_HAS_MODULE]->(m) RETURN count(m)") + assert before > 0 + + # A full-run push for a different application must leave app-a untouched. + bolt_writer(project(_single_module_app(), "app-b"), cfg, full_run=True) + + after = _num(driver, "MATCH (:PyApplication {name:'app-a'})-[:PY_HAS_MODULE]->(m) RETURN count(m)") + assert after == before, "full-run push for app-b pruned app-a's modules (#45)" + assert _num(driver, "MATCH (:PyApplication {name:'app-b'})-[:PY_HAS_MODULE]->(m) RETURN count(m)") == 1 + + def test_re_pushing_identical_analysis_is_idempotent(driver, cfg): rows = project(make_sample_app(), "sample-app") bolt_writer(rows, cfg, full_run=True) From 63cf46fbaa507d6f2e0a0d169b8e38df36bbfbd1 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Mon, 22 Jun 2026 17:12:38 -0400 Subject: [PATCH 8/9] feat(cli): add --no-venv to skip virtualenv creation and use the ambient env Closes #46 Add a --no-venv flag (AnalysisOptions.no_venv) that skips virtualenv creation and dependency installation and resolves imports against the ambient interpreter (self.virtualenv stays None, so Jedi uses the default environment). Useful in CI / containers where the project's dependencies are already installed, for sandboxed runs where network installs are disallowed, and for speed. Tradeoff: import / call-resolution quality then depends on what is installed in the ambient env. Regenerates the README --help block; adds a CLI regression test (no virtualenv is created and analysis.json is still produced). --- README.md | 5 +++++ codeanalyzer/__main__.py | 9 +++++++++ codeanalyzer/core.py | 13 ++++++++++--- codeanalyzer/options/options.py | 1 + test/test_cli.py | 21 +++++++++++++++++++++ 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9f87640..17cea55 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,11 @@ $ canpy --help │ [default: lazy] │ │ --skip-tests --include-tests Skip test files in analysis. │ │ [default: skip-tests] │ +│ --no-venv --venv Skip virtualenv creation and │ +│ dependency installation; resolve │ +│ imports against the ambient Python │ +│ environment instead. │ +│ [default: venv] │ │ --file-name PATH Analyze only the specified file │ │ (relative to input directory). │ │ --cache-dir -c PATH Directory to store analysis cache. │ diff --git a/codeanalyzer/__main__.py b/codeanalyzer/__main__.py index d386d3b..d7f4ab3 100644 --- a/codeanalyzer/__main__.py +++ b/codeanalyzer/__main__.py @@ -104,6 +104,14 @@ def main( help="Skip test files in analysis.", ), ] = True, + no_venv: Annotated[ + bool, + typer.Option( + "--no-venv/--venv", + help="Skip virtualenv creation and dependency installation; resolve " + "imports against the ambient Python environment instead.", + ), + ] = False, file_name: Annotated[ Optional[Path], typer.Option( @@ -144,6 +152,7 @@ def main( using_ray=using_ray, rebuild_analysis=rebuild_analysis, skip_tests=skip_tests, + no_venv=no_venv, file_name=file_name, cache_dir=cache_dir, clear_cache=clear_cache, diff --git a/codeanalyzer/core.py b/codeanalyzer/core.py index 314b5ec..9b5f538 100644 --- a/codeanalyzer/core.py +++ b/codeanalyzer/core.py @@ -66,6 +66,7 @@ def __init__(self, options: AnalysisOptions) -> None: self.skip_tests = options.skip_tests self.using_codeql = options.using_codeql self.rebuild_analysis = options.rebuild_analysis + self.no_venv = options.no_venv self.cache_dir = ( options.cache_dir.resolve() if options.cache_dir is not None else self.project_dir ) / ".codeanalyzer" @@ -260,8 +261,13 @@ def __enter__(self) -> "Codeanalyzer": venv_path = self.cache_dir / self.project_dir.name / "virtualenv" # Ensure the cache directory exists for this project venv_path.parent.mkdir(parents=True, exist_ok=True) + if self.no_venv: + logger.info( + "--no-venv: using the ambient Python environment " + "(skipping virtualenv creation and dependency installation)" + ) # Create the virtual environment if it does not exist - if not venv_path.exists() or self.rebuild_analysis: + if not self.no_venv and (not venv_path.exists() or self.rebuild_analysis): logger.info(f"(Re-)creating virtual environment at {venv_path}") self._cmd_exec_helper( [str(self._get_base_interpreter()), "-m", "venv", str(venv_path)], @@ -320,8 +326,9 @@ def __enter__(self) -> "Codeanalyzer": # Point Jedi at the analysis venv so it resolves the project's third-party # imports. This runs on both a fresh build and a lazy reuse of an existing # venv -- previously self.virtualenv stayed None, so the install above was - # never actually used by the symbol-table builder. - if venv_path.exists(): + # never actually used by the symbol-table builder. With --no-venv we leave + # it None so Jedi resolves against the ambient interpreter instead. + if not self.no_venv and venv_path.exists(): self.virtualenv = venv_path if self.using_codeql: diff --git a/codeanalyzer/options/options.py b/codeanalyzer/options/options.py index 541fb85..e314c5e 100644 --- a/codeanalyzer/options/options.py +++ b/codeanalyzer/options/options.py @@ -38,6 +38,7 @@ class AnalysisOptions: using_ray: bool = False rebuild_analysis: bool = False skip_tests: bool = True + no_venv: bool = False file_name: Optional[Path] = None cache_dir: Optional[Path] = None clear_cache: bool = False diff --git a/test/test_cli.py b/test/test_cli.py index b4ba50d..11a5490 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -38,6 +38,27 @@ def test_cli_call_symbol_table_with_json(cli_runner, whole_applications__xarray) assert len(json_obj["symbol_table"]) > 0, "Symbol table should not be empty" +def test_no_venv_skips_virtualenv( + cli_runner, single_functionalities__stuff_nested_in_functions, tmp_path +): + """#46: --no-venv must skip virtualenv creation/installation and still analyze.""" + out = tmp_path / "out" + cache = tmp_path / "cache" + result = cli_runner.invoke( + app, + [ + "--input", str(single_functionalities__stuff_nested_in_functions), + "--output", str(out), + "--cache-dir", str(cache), + "--no-venv", "--no-codeql", "--no-ray", + ], + env={"NO_COLOR": "1", "TERM": "dumb"}, + ) + assert result.exit_code == 0, result.output + assert (out / "analysis.json").exists(), "analysis.json should still be produced with --no-venv" + assert not list(cache.rglob("virtualenv")), "--no-venv must not create a virtualenv" + + def test_single_file(cli_runner, single_functionalities__stuff_nested_in_functions): """Must be able to run the CLI with single file analysis using --file-name flag.""" output_dir = single_functionalities__stuff_nested_in_functions.joinpath(".output") From c02b92da97cfbd6628bc2336e6ce4025d29f5e51 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Mon, 22 Jun 2026 17:28:45 -0400 Subject: [PATCH 9/9] chore(release): 0.2.1 --- CHANGELOG.md | 12 +++++++++++- pyproject.toml | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a36b1b..15afc94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,10 +5,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.2.1] - 2026-06-22 ### Added - **Homebrew tap** — `brew install codellm-devkit/tap/codeanalyzer-python`. The release workflow auto-generates a formula (`packaging/homebrew/generate_formula.sh`) that installs the pinned PyPI release as an isolated `uv` tool, and pushes it to `codellm-devkit/homebrew-tap`. Because the package is pure-Python with heavy native dependencies (`ray`, `pandas`, `numpy`), the formula depends on `uv` and runs the release via `uvx` rather than vendoring every transitive dependency as a Homebrew resource. +- **First-class external symbols** — `PyApplication.external_symbols` (a `{signature → PyExternalSymbol{name, module}}` map) records call-graph targets outside the analyzed project, mirroring the `codeanalyzer-typescript` backend. `analysis.json` now carries external info that was previously only a bare target string, and the Neo4j projection emits `:PyExternal` authoritatively from it ([#44](https://github.com/codellm-devkit/codeanalyzer-python/issues/44)). +- **`--no-venv` / `--venv` flag** — skip virtualenv creation and dependency installation and resolve imports against the ambient Python interpreter. Useful in CI / containers where the project's dependencies are already installed, for sandboxed runs without network, and for speed ([#46](https://github.com/codellm-devkit/codeanalyzer-python/issues/46)). + +### Changed +- The per-project analysis virtualenv is now installed with **`uv`** (parallel downloads + a shared global cache; falls back to `pip`), and is now **wired to Jedi** — previously `self.virtualenv` stayed `None`, so the install was never used by the symbol-table builder ([#47](https://github.com/codellm-devkit/codeanalyzer-python/issues/47)). +- Neo4j `:PyExternal` gains a `module` property; `SCHEMA_VERSION` bumped `1.0.0 → 1.1.0` (additive) ([#44](https://github.com/codellm-devkit/codeanalyzer-python/issues/44)). + +### Fixed +- `--emit neo4j` no longer drops call edges whose target is a bare imported module name (e.g. `os`, `re`, `json`): a `:PyPackage` name can no longer shadow a call target's `:PySymbol` signature, and the node-identity tracking is keyed by `(label, value)` so deferred `PY_EXTENDS` / `PY_RESOLVES_TO` edges can't be shadowed either ([#44](https://github.com/codellm-devkit/codeanalyzer-python/issues/44)). +- `--emit neo4j` (Bolt) full-run orphan prune is now scoped to the `:PyApplication` anchor, so a full-run push for one application no longer deletes another application's modules from a shared database ([#45](https://github.com/codellm-devkit/codeanalyzer-python/issues/45)). ## [0.2.0] - 2026-06-20 diff --git a/pyproject.toml b/pyproject.toml index acc00a7..d7f2514 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "codeanalyzer-python" -version = "0.2.0" +version = "0.2.1" description = "Static Analysis on Python source code using Jedi, CodeQL and Treesitter — emits analysis.json or a Neo4j property graph." readme = "README.md" authors = [