diff --git a/.github/actions/codeclone/README.md b/.github/actions/codeclone/README.md index 1889dcd..bc51388 100644 --- a/.github/actions/codeclone/README.md +++ b/.github/actions/codeclone/README.md @@ -1,11 +1,178 @@ # CodeClone GitHub Action -Runs CodeClone to detect architectural code duplication in Python projects. +Baseline-aware structural code quality analysis for Python with: -## Usage +- configurable CI gating +- SARIF upload for GitHub Code Scanning +- PR summary comments +- deterministic JSON report generation + +This action is designed for PR and CI workflows where you want CodeClone to act +as a non-LLM review bot: run analysis, upload SARIF, post a concise summary, +and propagate the real gate result. + +## What it does + +The v2 action flow is: + +1. set up Python +2. install `codeclone` +3. optionally require a committed baseline +4. run CodeClone with JSON + optional SARIF output +5. optionally upload SARIF to GitHub Code Scanning +6. optionally post or update a PR summary comment +7. return the real CodeClone exit code as the job result + +When the action is used from the checked-out CodeClone repository itself +(`uses: ./.github/actions/codeclone`), it installs CodeClone from the repo +source under test. Remote consumers still install from PyPI. + +## Basic usage ```yaml -- uses: orenlab/codeclone/.github/actions/codeclone@v1 +- uses: orenlab/codeclone/.github/actions/codeclone@main with: - path: . - fail-on-new: true + fail-on-new: "true" +``` + +For released references, prefer pinning to a major version tag such as `@v2` +or to an immutable commit SHA. 
+ +## PR workflow example + +```yaml +name: CodeClone + +on: + pull_request: + types: [ opened, synchronize, reopened ] + paths: [ "**/*.py" ] + +permissions: + contents: read + security-events: write + pull-requests: write + +jobs: + codeclone: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: orenlab/codeclone/.github/actions/codeclone@main + with: + fail-on-new: "true" + fail-health: "60" + sarif: "true" + pr-comment: "true" +``` + +## Inputs + +| Input | Default | Purpose | +|-------------------------|---------------------------------|-------------------------------------------------------------------------------------------------------------------| +| `python-version` | `3.13` | Python version used to run the action | +| `package-version` | `""` | CodeClone version from PyPI for remote installs; ignored when the action runs from the checked-out CodeClone repo | +| `path` | `.` | Project root to analyze | +| `json-path` | `.cache/codeclone/report.json` | JSON report output path | +| `sarif` | `true` | Generate SARIF and try to upload it | +| `sarif-path` | `.cache/codeclone/report.sarif` | SARIF output path | +| `pr-comment` | `true` | Post or update a PR summary comment | +| `fail-on-new` | `true` | Fail if new clone groups are detected | +| `fail-on-new-metrics` | `false` | Fail if metrics regress vs baseline | +| `fail-threshold` | `-1` | Max allowed function+block clone groups | +| `fail-complexity` | `-1` | Max cyclomatic complexity | +| `fail-coupling` | `-1` | Max coupling CBO | +| `fail-cohesion` | `-1` | Max cohesion LCOM4 | +| `fail-cycles` | `false` | Fail on dependency cycles | +| `fail-dead-code` | `false` | Fail on high-confidence dead code | +| `fail-health` | `-1` | Minimum health score | +| `require-baseline` | `true` | Fail early if the baseline file is missing | +| `baseline-path` | `codeclone.baseline.json` | Baseline path passed to CodeClone | +| `metrics-baseline-path` | 
`codeclone.baseline.json` | Metrics baseline path passed to CodeClone | +| `extra-args` | `""` | Additional CodeClone CLI arguments | +| `no-progress` | `true` | Disable progress output | + +For numeric gate inputs, `-1` means "disabled". + +## Outputs + +| Output | Meaning | +|-----------------|------------------------------------------------------------| +| `exit-code` | CodeClone process exit code | +| `json-path` | Resolved JSON report path | +| `sarif-path` | Resolved SARIF report path | +| `pr-comment-id` | PR comment id when the action updated or created a comment | + +## Exit behavior + +The action propagates the real CodeClone exit code at the end: + +- `0` — success +- `2` — contract error +- `3` — gating failure +- `5` — internal error + +SARIF upload and PR comment posting are treated as additive integrations. The +final job result is still driven by the CodeClone analysis exit code. + +## Permissions + +Recommended permissions: + +```yaml +permissions: + contents: read + security-events: write + pull-requests: write +``` + +Notes: + +- `security-events: write` is required for SARIF upload +- `pull-requests: write` is required for PR comments +- if you only want gating and JSON output, you can disable `sarif` and + `pr-comment` + +## Stable vs prerelease installs + +Stable: + +```yaml +with: + package-version: "" +``` + +Explicit prerelease: + +```yaml +with: + package-version: "2.0.0b3" +``` + +Local/self-repo validation: + +- `uses: ./.github/actions/codeclone` installs CodeClone from the checked-out + repository source, so beta branches and unreleased commits do not depend on + PyPI publication. + +## Notes and limitations + +- For private repositories without GitHub Advanced Security, SARIF upload may + not be available. In that case, set `sarif: "false"` and rely on the PR + comment + exit code. +- The baseline file must exist in the repository when `require-baseline: true`. 
+- The action always generates a canonical JSON report, even if SARIF is + disabled. +- PR comments are updated in place using a hidden marker, so repeated runs do + not keep adding duplicate comments. +- Analysis has a 10-minute timeout. For very large repositories, consider + using `extra-args: "--skip-metrics"` or narrowing the scan scope. + +## See also + +- [CodeClone repository](https://github.com/orenlab/codeclone) +- [Documentation](https://orenlab.github.io/codeclone/) +- [SARIF integration](https://orenlab.github.io/codeclone/sarif/) diff --git a/.github/actions/codeclone/_action_impl.py b/.github/actions/codeclone/_action_impl.py new file mode 100644 index 0000000..b4d52b9 --- /dev/null +++ b/.github/actions/codeclone/_action_impl.py @@ -0,0 +1,263 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import json +import shlex +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import Literal + +COMMENT_MARKER = "" + + +@dataclass(frozen=True, slots=True) +class ActionInputs: + path: str + json_path: str + sarif: bool + sarif_path: str + fail_on_new: bool + fail_on_new_metrics: bool + fail_threshold: int | None + fail_complexity: int | None + fail_coupling: int | None + fail_cohesion: int | None + fail_cycles: bool + fail_dead_code: bool + fail_health: int | None + baseline_path: str + metrics_baseline_path: str + extra_args: str + no_progress: bool + + +@dataclass(frozen=True, slots=True) +class RunResult: + exit_code: int + json_path: str + json_exists: bool + sarif_path: str + sarif_exists: bool + + +@dataclass(frozen=True, slots=True) +class InstallTarget: + requirement: str + source: Literal["repo", "pypi-version", "pypi-latest"] + + +def 
parse_bool(value: str) -> bool: + return value.strip().lower() == "true" + + +def parse_optional_int(value: str) -> int | None: + normalized = value.strip() + if normalized in {"", "-1"}: + return None + return int(normalized) + + +def build_codeclone_args(inputs: ActionInputs) -> list[str]: + args = [inputs.path, "--json", inputs.json_path] + if inputs.sarif: + args.extend(["--sarif", inputs.sarif_path]) + if inputs.no_progress: + args.append("--no-progress") + if inputs.fail_on_new: + args.append("--fail-on-new") + if inputs.fail_on_new_metrics: + args.append("--fail-on-new-metrics") + if inputs.fail_threshold is not None: + args.extend(["--fail-threshold", str(inputs.fail_threshold)]) + if inputs.fail_complexity is not None: + args.extend(["--fail-complexity", str(inputs.fail_complexity)]) + if inputs.fail_coupling is not None: + args.extend(["--fail-coupling", str(inputs.fail_coupling)]) + if inputs.fail_cohesion is not None: + args.extend(["--fail-cohesion", str(inputs.fail_cohesion)]) + if inputs.fail_cycles: + args.append("--fail-cycles") + if inputs.fail_dead_code: + args.append("--fail-dead-code") + if inputs.fail_health is not None: + args.extend(["--fail-health", str(inputs.fail_health)]) + if inputs.baseline_path.strip(): + args.extend(["--baseline", inputs.baseline_path]) + if inputs.metrics_baseline_path.strip(): + args.extend(["--metrics-baseline", inputs.metrics_baseline_path]) + if inputs.extra_args.strip(): + args.extend(shlex.split(inputs.extra_args)) + return args + + +def ensure_parent_dir(path_text: str) -> None: + Path(path_text).parent.mkdir(parents=True, exist_ok=True) + + +def write_outputs(path: str, values: dict[str, str]) -> None: + with open(path, "a", encoding="utf-8") as handle: + for key, value in values.items(): + handle.write(f"{key}={value}\n") + + +# codeclone: ignore[dead-code] +def resolve_install_target( + *, + action_path: str, + workspace: str, + package_version: str, +) -> InstallTarget: + action_root = 
Path(action_path).resolve().parents[2] + workspace_root = Path(workspace).resolve() + if action_root == workspace_root: + return InstallTarget(requirement=str(action_root), source="repo") + + normalized_version = package_version.strip() + if normalized_version: + return InstallTarget( + requirement=f"codeclone=={normalized_version}", + source="pypi-version", + ) + return InstallTarget(requirement="codeclone", source="pypi-latest") + + +def run_codeclone(inputs: ActionInputs) -> RunResult: + ensure_parent_dir(inputs.json_path) + if inputs.sarif: + ensure_parent_dir(inputs.sarif_path) + argv = ["codeclone", *build_codeclone_args(inputs)] + try: + completed = subprocess.run(argv, check=False, timeout=600) + except subprocess.TimeoutExpired: + print("::error::CodeClone analysis timed out after 10 minutes") + return RunResult( + exit_code=5, + json_path=inputs.json_path, + json_exists=Path(inputs.json_path).exists(), + sarif_path=inputs.sarif_path, + sarif_exists=inputs.sarif and Path(inputs.sarif_path).exists(), + ) + return RunResult( + exit_code=completed.returncode, + json_path=inputs.json_path, + json_exists=Path(inputs.json_path).exists(), + sarif_path=inputs.sarif_path, + sarif_exists=inputs.sarif and Path(inputs.sarif_path).exists(), + ) + + +def _mapping(value: object) -> dict[str, object]: + return value if isinstance(value, dict) else {} + + +def _int(value: object, default: int = 0) -> int: + return value if isinstance(value, int) else default + + +def _str(value: object, default: str = "") -> str: + return value if isinstance(value, str) else default + + +def render_pr_comment(report: dict[str, object], *, exit_code: int) -> str: + meta = _mapping(report.get("meta")) + findings = _mapping(report.get("findings")) + findings_summary = _mapping(findings.get("summary")) + clone_summary = _mapping(findings_summary.get("clones")) + families = _mapping(findings_summary.get("families")) + metrics = _mapping(report.get("metrics")) + metrics_summary = 
_mapping(metrics.get("summary")) + health = _mapping(metrics_summary.get("health")) + baseline = _mapping(meta.get("baseline")) + cache = _mapping(meta.get("cache")) + + health_score = _int(health.get("score"), default=-1) + health_grade = _str(health.get("grade"), default="?") + baseline_status = _str(baseline.get("status"), default="unknown") + cache_used = bool(cache.get("used")) + codeclone_version = _str(meta.get("codeclone_version"), default="?") + + status_icon = "white_check_mark" + status_label = "Passed" + if exit_code == 3: + status_icon = "x" + status_label = "Failed (gating)" + elif exit_code != 0: + status_icon = "warning" + status_label = "Error" + + lines = [ + COMMENT_MARKER, + "## :microscope: CodeClone Report", + "", + "| Metric | Value |", + "|--------|-------|", + f"| Health | **{health_score}/100 ({health_grade})** |", + f"| Status | :{status_icon}: {status_label} |", + f"| Baseline | `{baseline_status}` |", + f"| Cache | `{'used' if cache_used else 'not used'}` |", + f"| Version | `{codeclone_version}` |", + "", + "### Findings", + "```text", + _clone_summary_line(clone_summary=clone_summary, families=families), + f"Structural: {_int(families.get('structural'))}", + f"Dead code: {_int(families.get('dead_code'))}", + f"Design: {_int(families.get('design'))}", + "```", + "", + ":robot: Generated by " + 'CodeClone', + ] + return "\n".join(lines) + + +def write_step_summary(path: str, body: str) -> None: + with open(path, "a", encoding="utf-8") as handle: + handle.write(body) + handle.write("\n") + + +def load_report(path: str) -> dict[str, object]: + with open(path, encoding="utf-8") as handle: + loaded = json.load(handle) + return loaded if isinstance(loaded, dict) else {} + + +def build_inputs_from_env(env: dict[str, str]) -> ActionInputs: + return ActionInputs( + path=env["INPUT_PATH"], + json_path=env["INPUT_JSON_PATH"], + sarif=parse_bool(env["INPUT_SARIF"]), + sarif_path=env["INPUT_SARIF_PATH"], + 
fail_on_new=parse_bool(env["INPUT_FAIL_ON_NEW"]), + fail_on_new_metrics=parse_bool(env["INPUT_FAIL_ON_NEW_METRICS"]), + fail_threshold=parse_optional_int(env["INPUT_FAIL_THRESHOLD"]), + fail_complexity=parse_optional_int(env["INPUT_FAIL_COMPLEXITY"]), + fail_coupling=parse_optional_int(env["INPUT_FAIL_COUPLING"]), + fail_cohesion=parse_optional_int(env["INPUT_FAIL_COHESION"]), + fail_cycles=parse_bool(env["INPUT_FAIL_CYCLES"]), + fail_dead_code=parse_bool(env["INPUT_FAIL_DEAD_CODE"]), + fail_health=parse_optional_int(env["INPUT_FAIL_HEALTH"]), + baseline_path=env["INPUT_BASELINE_PATH"], + metrics_baseline_path=env["INPUT_METRICS_BASELINE_PATH"], + extra_args=env["INPUT_EXTRA_ARGS"], + no_progress=parse_bool(env["INPUT_NO_PROGRESS"]), + ) + + +def _clone_summary_line( + *, + clone_summary: dict[str, object], + families: dict[str, object], +) -> str: + return ( + f"Clones: {_int(families.get('clones'))} " + f"({_int(clone_summary.get('new'))} new, " + f"{_int(clone_summary.get('known'))} known)" + ) diff --git a/.github/actions/codeclone/action.yml b/.github/actions/codeclone/action.yml index efb63f2..7cc9975 100644 --- a/.github/actions/codeclone/action.yml +++ b/.github/actions/codeclone/action.yml @@ -1,7 +1,7 @@ name: CodeClone description: > - Structural code quality analysis for Python with - CI-friendly baseline enforcement. + Structural code health analysis for Python with baseline-aware CI gating, + SARIF upload, and PR-friendly summaries. author: OrenLab @@ -11,35 +11,124 @@ branding: inputs: python-version: - description: "Python version to use" + description: "Python version" required: false default: "3.13" package-version: - description: "CodeClone version from PyPI (empty = latest)" + description: "CodeClone version from PyPI for remote installs (ignored when the action runs from the checked-out CodeClone repo)" required: false default: "" path: - description: "Path to the project root" + description: "Project root" required: false default: "." 
- fail-on-new: - description: "Fail if new code clones are detected" + json-path: + description: "Canonical JSON report output path" + required: false + default: ".cache/codeclone/report.json" + + sarif: + description: "Generate SARIF and upload to Code Scanning" required: false default: "true" - no-progress: - description: "Disable progress output" + sarif-path: + description: "SARIF output path" + required: false + default: ".cache/codeclone/report.sarif" + + pr-comment: + description: "Post or update a PR summary comment" + required: false + default: "true" + + fail-on-new: + description: "Fail if new clone groups are detected" required: false default: "true" + fail-on-new-metrics: + description: "Fail if metrics regress vs baseline" + required: false + default: "false" + + fail-threshold: + description: "Max allowed function+block clone groups (-1 = disabled)" + required: false + default: "-1" + + fail-complexity: + description: "Max cyclomatic complexity (-1 = disabled)" + required: false + default: "-1" + + fail-coupling: + description: "Max coupling CBO (-1 = disabled)" + required: false + default: "-1" + + fail-cohesion: + description: "Max cohesion LCOM4 (-1 = disabled)" + required: false + default: "-1" + + fail-cycles: + description: "Fail if dependency cycles are detected" + required: false + default: "false" + + fail-dead-code: + description: "Fail if high-confidence dead code is detected" + required: false + default: "false" + + fail-health: + description: "Minimum health score (-1 = disabled)" + required: false + default: "-1" + require-baseline: - description: "Fail if codeclone.baseline.json is missing" + description: "Fail if the baseline file is missing" required: false default: "true" + baseline-path: + description: "Baseline path passed to CodeClone" + required: false + default: "codeclone.baseline.json" + + metrics-baseline-path: + description: "Metrics baseline path passed to CodeClone" + required: false + default: "codeclone.baseline.json" + 
+ extra-args: + description: "Additional CodeClone CLI arguments" + required: false + default: "" + + no-progress: + description: "Disable progress output" + required: false + default: "true" + +outputs: + exit-code: + description: "CodeClone process exit code" + value: ${{ steps.analysis.outputs.exit-code }} + json-path: + description: "Resolved JSON report path" + value: ${{ steps.analysis.outputs.json-path }} + sarif-path: + description: "Resolved SARIF report path" + value: ${{ steps.analysis.outputs.sarif-path }} + pr-comment-id: + description: "Updated PR comment id when a PR comment was posted" + value: ${{ steps.post-pr-comment.outputs.comment-id }} + runs: using: composite steps: @@ -49,31 +138,170 @@ runs: python-version: ${{ inputs.python-version }} cache: pip + - name: Resolve CodeClone install target + id: resolve-install + shell: bash + env: + CODECLONE_VERSION: ${{ inputs.package-version }} + run: | + python - <<'PY' + import os + import sys + + sys.path.insert(0, os.environ["GITHUB_ACTION_PATH"]) + + from _action_impl import resolve_install_target, write_outputs + + target = resolve_install_target( + action_path=os.environ["GITHUB_ACTION_PATH"], + workspace=os.environ["GITHUB_WORKSPACE"], + package_version=os.environ.get("CODECLONE_VERSION", ""), + ) + print(f"Resolved CodeClone install source: {target.source} ({target.requirement})") + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + write_outputs( + github_output, + { + "install-spec": target.requirement, + "install-source": target.source, + }, + ) + PY + - name: Install CodeClone shell: bash + env: + INSTALL_SPEC: ${{ steps.resolve-install.outputs.install-spec }} + INSTALL_SOURCE: ${{ steps.resolve-install.outputs.install-source }} run: | python -m pip install --upgrade pip - if [ -n "${{ inputs.package-version }}" ]; then - pip install "codeclone==${{ inputs.package-version }}" - else - pip install codeclone - fi + echo "Installing CodeClone from ${INSTALL_SOURCE}: 
${INSTALL_SPEC}" + python -m pip install "${INSTALL_SPEC}" - - name: Verify baseline exists + - name: Verify baseline if: ${{ inputs.require-baseline == 'true' }} shell: bash + env: + INPUT_PROJECT_PATH: ${{ inputs.path }} + INPUT_BASELINE_PATH: ${{ inputs.baseline-path }} run: | - test -f "${{ inputs.path }}/codeclone.baseline.json" + python - <<'PY' + import os + import sys + from pathlib import Path + + project_root = Path(os.environ["INPUT_PROJECT_PATH"]) + baseline_path = Path(os.environ["INPUT_BASELINE_PATH"]) + target = baseline_path if baseline_path.is_absolute() else project_root / baseline_path + if not target.exists(): + print(f"Missing required CodeClone baseline: {target}", file=sys.stderr) + raise SystemExit(1) + PY - name: Run CodeClone + id: analysis shell: bash + env: + INPUT_PATH: ${{ inputs.path }} + INPUT_JSON_PATH: ${{ inputs.json-path }} + INPUT_SARIF: ${{ inputs.sarif }} + INPUT_SARIF_PATH: ${{ inputs.sarif-path }} + INPUT_FAIL_ON_NEW: ${{ inputs.fail-on-new }} + INPUT_FAIL_ON_NEW_METRICS: ${{ inputs.fail-on-new-metrics }} + INPUT_FAIL_THRESHOLD: ${{ inputs.fail-threshold }} + INPUT_FAIL_COMPLEXITY: ${{ inputs.fail-complexity }} + INPUT_FAIL_COUPLING: ${{ inputs.fail-coupling }} + INPUT_FAIL_COHESION: ${{ inputs.fail-cohesion }} + INPUT_FAIL_CYCLES: ${{ inputs.fail-cycles }} + INPUT_FAIL_DEAD_CODE: ${{ inputs.fail-dead-code }} + INPUT_FAIL_HEALTH: ${{ inputs.fail-health }} + INPUT_BASELINE_PATH: ${{ inputs.baseline-path }} + INPUT_METRICS_BASELINE_PATH: ${{ inputs.metrics-baseline-path }} + INPUT_EXTRA_ARGS: ${{ inputs.extra-args }} + INPUT_NO_PROGRESS: ${{ inputs.no-progress }} run: | - extra="" - if [ "${{ inputs.no-progress }}" = "true" ]; then - extra="--no-progress" - fi - if [ "${{ inputs.fail-on-new }}" = "true" ]; then - codeclone "${{ inputs.path }}" --fail-on-new $extra - else - codeclone "${{ inputs.path }}" $extra + python "${{ github.action_path }}/run_codeclone.py" + + - name: Render PR summary + id: render-pr-comment + if: ${{ 
inputs.pr-comment == 'true' && github.event_name == 'pull_request' && steps.analysis.outputs.json-exists == 'true' }} + shell: bash + env: + REPORT_PATH: ${{ steps.analysis.outputs.json-path }} + ANALYSIS_EXIT_CODE: ${{ steps.analysis.outputs.exit-code }} + COMMENT_OUTPUT_PATH: ${{ runner.temp }}/codeclone-pr-comment.md + run: | + python "${{ github.action_path }}/render_pr_comment.py" + + - name: Upload SARIF + if: ${{ always() && inputs.sarif == 'true' && steps.analysis.outputs.sarif-exists == 'true' }} + continue-on-error: true + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: ${{ steps.analysis.outputs.sarif-path }} + category: codeclone + + - name: Post or update PR comment + id: post-pr-comment + if: ${{ always() && inputs.pr-comment == 'true' && github.event_name == 'pull_request' && steps.render-pr-comment.outputs.comment-exists == 'true' }} + continue-on-error: true + uses: actions/github-script@v7 + env: + COMMENT_BODY_PATH: ${{ steps.render-pr-comment.outputs.comment-body-path }} + COMMENT_MARKER: "" + with: + script: | + const fs = require("fs"); + const body = fs.readFileSync(process.env.COMMENT_BODY_PATH, "utf8"); + const marker = process.env.COMMENT_MARKER; + const issue_number = context.issue.number; + const { owner, repo } = context.repo; + + const comments = await github.paginate( + github.rest.issues.listComments, + { + owner, + repo, + issue_number, + per_page: 100, + }, + ); + + const existing = comments.find( + (comment) => + comment.user && + comment.user.type === "Bot" && + comment.body && + comment.body.includes(marker), + ); + + let result; + if (existing) { + result = await github.rest.issues.updateComment({ + owner, + repo, + comment_id: existing.id, + body, + }); + } else { + result = await github.rest.issues.createComment({ + owner, + repo, + issue_number, + body, + }); + } + + core.setOutput("comment-id", String(result.data.id)); + + - name: Gate result + if: ${{ always() }} + shell: bash + run: | + status="${{ 
steps.analysis.outputs.exit-code }}" + if [ -z "${status}" ]; then + echo "CodeClone analysis did not produce an exit code." >&2 + exit 2 fi + exit "${status}" diff --git a/.github/actions/codeclone/render_pr_comment.py b/.github/actions/codeclone/render_pr_comment.py new file mode 100644 index 0000000..f08668e --- /dev/null +++ b/.github/actions/codeclone/render_pr_comment.py @@ -0,0 +1,58 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import os + +from _action_impl import ( + load_report, + render_pr_comment, + write_outputs, + write_step_summary, +) + + +def main() -> int: + report_path = os.environ["REPORT_PATH"] + output_path = os.environ["COMMENT_OUTPUT_PATH"] + exit_code = int(os.environ["ANALYSIS_EXIT_CODE"]) + + if not os.path.exists(report_path): + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + write_outputs( + github_output, + { + "comment-exists": "false", + "comment-body-path": output_path, + }, + ) + return 0 + + body = render_pr_comment(load_report(report_path), exit_code=exit_code) + with open(output_path, "w", encoding="utf-8") as handle: + handle.write(body) + handle.write("\n") + + step_summary = os.environ.get("GITHUB_STEP_SUMMARY") + if step_summary: + write_step_summary(step_summary, body) + + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + write_outputs( + github_output, + { + "comment-exists": "true", + "comment-body-path": output_path, + }, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.github/actions/codeclone/run_codeclone.py b/.github/actions/codeclone/run_codeclone.py new file mode 100644 index 0000000..b253289 --- /dev/null +++ b/.github/actions/codeclone/run_codeclone.py @@ -0,0 
+1,32 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import os + +from _action_impl import build_inputs_from_env, run_codeclone, write_outputs + + +def main() -> int: + result = run_codeclone(build_inputs_from_env(dict(os.environ))) + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + write_outputs( + github_output, + { + "exit-code": str(result.exit_code), + "json-path": result.json_path, + "json-exists": str(result.json_exists).lower(), + "sarif-path": result.sarif_path, + "sarif-exists": str(result.sarif_exists).lower(), + }, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.github/workflows/codeclone.yml b/.github/workflows/codeclone.yml new file mode 100644 index 0000000..f7ddc23 --- /dev/null +++ b/.github/workflows/codeclone.yml @@ -0,0 +1,33 @@ +name: CodeClone + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: ["**/*.py"] + +permissions: + contents: read + security-events: write + pull-requests: write + +concurrency: + group: codeclone-${{ github.ref }} + cancel-in-progress: true + +jobs: + codeclone: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run CodeClone + uses: ./.github/actions/codeclone + with: + python-version: "3.13" + fail-on-new: "true" + fail-health: "60" + sarif: "true" + pr-comment: "true" diff --git a/AGENTS.md b/AGENTS.md index 91c606c..c52de58 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -60,6 +60,9 @@ Key artifacts: - `codeclone.baseline.json` — trusted baseline snapshot (for CI comparisons) - `.cache/codeclone/cache.json` — analysis cache (integrity-checked) - 
`.cache/codeclone/report.html|report.json|report.md|report.sarif|report.txt` — reports +- `codeclone-mcp` — optional read-only MCP server (install via `codeclone[mcp]`) +- MCP runs are in-memory only; review markers are session-local and must never + leak into baseline/cache/report artifacts - `docs/`, `mkdocs.yml`, `.github/workflows/docs.yml` — published documentation site and docs build pipeline --- @@ -79,6 +82,12 @@ If you touched `docs/`, `mkdocs.yml`, docs publishing workflow, or sample-report uv run --with mkdocs --with mkdocs-material mkdocs build --strict ``` +If you touched the MCP surface, also run: + +```bash +uv run pytest -q tests/test_mcp_service.py tests/test_mcp_server.py +``` + --- ## 4) Baseline contract (v2, stable) @@ -161,6 +170,10 @@ Reports come in: - SARIF (`--sarif`) - Text (`--text`) +MCP is a separate optional interface, not a report format. It must remain a +read-only agent layer over the same canonical report/baseline/cache contracts. +Session review markers are allowed only as ephemeral MCP process state. + ### Report invariants - Ordering must be deterministic (stable sort keys). @@ -169,6 +182,10 @@ Reports come in: - baseline fingerprint + schema versions - baseline generator version - cache path / cache used +- SARIF `partialFingerprints.primaryLocationLineHash` must remain stable across + line-only shifts for the same finding identity. +- SARIF `automationDetails.id` must be unique per run; result `kind` should be + explicit when emitted. ### Explainability contract (core owns facts) @@ -246,6 +263,13 @@ Agents must preserve these semantics: - **3** — analysis gating failure (e.g., `--fail-threshold` exceeded or new clones in `--ci` as designed) - **5** — internal error (unexpected exception escaped top-level CLI handling) +Changed-scope flags are contract-sensitive: + +- `--changed-only` keeps the canonical analysis/report full, but applies clone + summary/threshold evaluation to the changed-files projection. 
+- `--diff-against` requires `--changed-only`. +- `--paths-from-git-diff` implies `--changed-only`. + If you introduce a new exit reason, document it and add tests. --- @@ -273,6 +297,9 @@ Before cutting a release: - Don’t add project-root hashes or unstable machine-local fields to baseline. - Don’t embed suppressions into baseline unless explicitly designed as a versioned contract. - Don’t introduce nondeterministic ordering (dict iteration, set ordering, filesystem traversal without sort). +- Don’t make the base `codeclone` install depend on optional MCP runtime packages. +- Don’t let MCP mutate baselines, source files, or repo state. +- Don’t let MCP re-synthesize design findings from raw metrics; read canonical `findings.groups.design` only. --- @@ -296,6 +323,8 @@ Architecture is layered, but grounded in current code (not aspirational diagrams `codeclone/templates.py`) renders views from report/meta facts. - **Documentation/publishing surface** (`docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, `scripts/build_docs_example_report.py`) publishes contract docs and the live sample report. +- **MCP agent interface** (`codeclone/mcp_service.py`, `codeclone/mcp_server.py`) exposes the current pipeline as a + deterministic, read-only MCP server for AI agents and MCP-capable clients. - **Tests-as-spec** (`tests/`) lock behavior, contracts, determinism, and architecture boundaries. Non-negotiable interpretation: @@ -303,6 +332,7 @@ Non-negotiable interpretation: - Core produces facts; renderers present facts. - Baseline/cache are persistence contracts, not analysis truth. - UI/report must not invent gating semantics. +- MCP reuses pipeline/report contracts and must not create a second analysis truth path. ## 13) Module map @@ -333,6 +363,13 @@ Use this map to route changes to the right owner module. change belongs here. 
- `codeclone/report/*.py` (other modules) — deterministic projections/format transforms ( text/markdown/sarif/derived/findings/suggestions); avoid injecting new analysis heuristics here. +- `codeclone/mcp_service.py` — typed, in-process MCP service adapter over the current pipeline/report contracts; keep + it deterministic; allow only session-local in-memory state such as reviewed markers, and never move shell UX or + `sys.exit` behavior here. +- `codeclone/mcp_server.py` — optional MCP launcher/server wiring, transport config, and MCP tool/resource + registration; keep dependency loading lazy so base installs/CI do not require MCP runtime packages. +- `tests/test_mcp_service.py`, `tests/test_mcp_server.py` — MCP contract and integration tests; run these when + touching any MCP surface. - `codeclone/html_report.py` — public HTML facade/re-export surface; preserve backward-compatible imports here; do not grow section/layout logic in this module. - `codeclone/_html_report/*` — actual HTML assembly, context shaping, tabs, sections, and overview/navigation behavior; @@ -365,6 +402,7 @@ Operational rules: - CLI helper modules (`_cli_*`) must orchestrate/format, not own domain semantics. - Persistence semantics (baseline/cache trust/integrity) must stay in persistence/domain modules, not in render/UI layers. +- MCP may depend on pipeline/report/contracts, but core/persistence/report layers must not depend on MCP modules. ## 15) Suppression policy @@ -389,15 +427,16 @@ Prefer explicit inline suppressions for runtime/dynamic false positives instead If you change a contract-sensitive zone, route docs/tests/approval deliberately. 
-| Change zone | Must update docs | Must update tests | Explicit approval required when | Contract-change trigger | -|-------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------| -| Baseline schema/trust/integrity (`codeclone/baseline.py`) | `docs/book/06-baseline.md`, `docs/book/14-compatibility-and-versioning.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_baseline.py`, CI/CLI behavior tests (`tests/test_cli_inprocess.py`, `tests/test_cli_unit.py`) | schema/trust semantics, compatibility windows, payload integrity logic change | baseline key layout/status semantics/compat rules change | -| Cache schema/profile/integrity (`codeclone/cache.py`) | `docs/book/07-cache.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_cache.py`, pipeline/CLI cache integration tests | cache schema/status/profile compatibility semantics change | cache payload/version/status semantics change | -| Canonical report JSON shape (`codeclone/report/json_contract.py`, report projections) | `docs/book/08-report.md` (+ `docs/book/10-html-render.md` if rendering contract impacted), `docs/sarif.md` when SARIF changes, `CHANGELOG.md` | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py`, relevant report-format tests | finding/meta/summary schema changes | stable JSON fields/meaning/order guarantees change | -| CLI flags/help/exit 
behavior (`codeclone/cli.py`, `_cli_*`, `contracts.py`) | `docs/book/09-cli.md`, `docs/book/03-contracts-exit-codes.md`, `README.md`, `CHANGELOG.md` | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py`, `tests/test_cli_smoke.py` | exit-code semantics, script-facing behavior, flag contracts change | user-visible CLI contract changes | -| Fingerprint-adjacent analysis (`extractor/cfg/normalize/grouping`) | `docs/book/05-core-pipeline.md`, `docs/cfg.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_fingerprint.py`, `tests/test_extractor.py`, `tests/test_cfg.py`, golden tests (`tests/test_detector_golden.py`, `tests/test_golden_v2.py`) | always (see Section 1.6) | clone identity / NEW-vs-KNOWN / fingerprint inputs change | -| Suppression semantics/reporting (`suppressions`, extractor dead-code wiring, report/UI counters) | `docs/book/19-inline-suppressions.md`, `docs/book/16-dead-code-contract.md`, `docs/book/08-report.md`, and interface docs if surfaced (`09-cli`, `10-html-render`) | `tests/test_suppressions.py`, `tests/test_extractor.py`, `tests/test_metrics_modules.py`, `tests/test_pipeline_metrics.py`, report/html/cli tests | declaration scope semantics, rule effect, or contract-visible counters/fields change | suppression changes alter active finding output or contract-visible report payload | -| Docs site / sample report publication (`docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, `scripts/build_docs_example_report.py`) | `docs/README.md`, `docs/publishing.md`, `docs/examples/report.md`, and any contract pages surfaced by the change, `CHANGELOG.md` when user-visible behavior changes | `mkdocs build --strict`, sample-report generation smoke path, and relevant report/html tests if generated examples or embeds change | published docs navigation, sample-report generation, or Pages workflow semantics change | published documentation behavior or sample-report generation contract changes | +| Change zone | Must update docs | 
Must update tests | Explicit approval required when | Contract-change trigger | +|-------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------| +| Baseline schema/trust/integrity (`codeclone/baseline.py`) | `docs/book/06-baseline.md`, `docs/book/14-compatibility-and-versioning.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_baseline.py`, CI/CLI behavior tests (`tests/test_cli_inprocess.py`, `tests/test_cli_unit.py`) | schema/trust semantics, compatibility windows, payload integrity logic change | baseline key layout/status semantics/compat rules change | +| Cache schema/profile/integrity (`codeclone/cache.py`) | `docs/book/07-cache.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_cache.py`, pipeline/CLI cache integration tests | cache schema/status/profile compatibility semantics change | cache payload/version/status semantics change | +| Canonical report JSON shape (`codeclone/report/json_contract.py`, report projections) | `docs/book/08-report.md` (+ `docs/book/10-html-render.md` if rendering contract impacted), `docs/sarif.md` when SARIF changes, `CHANGELOG.md` | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py`, relevant report-format tests | finding/meta/summary schema changes | stable JSON fields/meaning/order guarantees change | +| CLI flags/help/exit behavior (`codeclone/cli.py`, 
`_cli_*`, `contracts.py`) | `docs/book/09-cli.md`, `docs/book/03-contracts-exit-codes.md`, `README.md`, `CHANGELOG.md` | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py`, `tests/test_cli_smoke.py` | exit-code semantics, script-facing behavior, flag contracts change | user-visible CLI contract changes | +| Fingerprint-adjacent analysis (`extractor/cfg/normalize/grouping`) | `docs/book/05-core-pipeline.md`, `docs/cfg.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_fingerprint.py`, `tests/test_extractor.py`, `tests/test_cfg.py`, golden tests (`tests/test_detector_golden.py`, `tests/test_golden_v2.py`) | always (see Section 1.6) | clone identity / NEW-vs-KNOWN / fingerprint inputs change | +| Suppression semantics/reporting (`suppressions`, extractor dead-code wiring, report/UI counters) | `docs/book/19-inline-suppressions.md`, `docs/book/16-dead-code-contract.md`, `docs/book/08-report.md`, and interface docs if surfaced (`09-cli`, `10-html-render`) | `tests/test_suppressions.py`, `tests/test_extractor.py`, `tests/test_metrics_modules.py`, `tests/test_pipeline_metrics.py`, report/html/cli tests | declaration scope semantics, rule effect, or contract-visible counters/fields change | suppression changes alter active finding output or contract-visible report payload | +| MCP interface (`codeclone/mcp_service.py`, `codeclone/mcp_server.py`, packaging extra/launcher) | `README.md`, `docs/book/20-mcp-interface.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_mcp_service.py`, `tests/test_mcp_server.py`, plus CLI/package tests if launcher/install semantics change | tool/resource shapes, read-only semantics, optional-dependency packaging behavior change | public MCP tool names, resource URIs, launcher/install behavior, or response semantics change | +| Docs site / sample report publication (`docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, 
`scripts/build_docs_example_report.py`) | `docs/README.md`, `docs/publishing.md`, `docs/examples/report.md`, and any contract pages surfaced by the change, `CHANGELOG.md` when user-visible behavior changes | `mkdocs build --strict`, sample-report generation smoke path, and relevant report/html tests if generated examples or embeds change | published docs navigation, sample-report generation, or Pages workflow semantics change | published documentation behavior or sample-report generation contract changes | Golden rule: do not “fix” failures by snapshot refresh unless the underlying contract change is intentional, documented, and approved. @@ -431,6 +470,8 @@ Policy: - Cache schema/status/profile compatibility/integrity (`CACHE_VERSION` contract family). - Canonical report JSON schema/payload semantics (`REPORT_SCHEMA_VERSION` contract family). - Documented report projections and their machine/user-facing semantics (HTML/Markdown/SARIF/Text). +- Documented MCP launcher/install behavior, tool names, resource URIs, and read-only semantics. +- Session-local MCP review state semantics (`mark_finding_reviewed`, `exclude_reviewed`) as documented public behavior. - Documented finding families/kinds/ids and suppression-facing report fields. - Metrics baseline schema/compatibility where used by CI/gating. - Benchmark schema/outputs if consumed as a reproducible contract surface. diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b97621..f88bd7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,62 @@ # Changelog +## [2.0.0b3] + +2.0.0b3 is the release where CodeClone stops looking like "a strong analyzer with extras" and starts looking like a +coherent platform: canonical-report-first, agent-facing, CI-native, and product-grade. + +### Licensing & packaging + +- Re-license source code to MPL-2.0 while keeping documentation under MIT. +- Ship dual `LICENSE` / `LICENSE-docs` files and sync SPDX headers. 
+ +### MCP server (new) + +- Add optional `codeclone[mcp]` extra with `codeclone-mcp` launcher (`stdio` and `streamable-http`). +- Introduce a read-only MCP surface with 20 tools, fixed resources, and run-scoped URIs for analysis, changed-files + review, run comparison, findings / hotspots / remediation, granular checks, and gate preview. +- Add bounded run retention (`--history-limit`), `--allow-remote` guard, and reject `cache_policy=refresh` to preserve + read-only semantics. +- Optimize MCP payloads for agents with short ids, compact summaries/cards, bounded `metrics_detail`, and slim + changed-files / compare-runs responses — without changing the canonical report contract. +- Make MCP explicitly triage-first and budget-aware: clients are guided toward summary/triage → hotspots / `check_*` → + single-finding drill-down instead of broad early listing. +- Add `cache.freshness` marker and `get_production_triage` / `codeclone://latest/triage` for compact production-first + overview. +- Improve run-comparison honesty: `compare_runs` now reports `mixed` / `incomparable`, and `clones_only` runs surface + `health: unavailable` instead of placeholder values. +- Harden repository safety: MCP analysis now requires an absolute repository root and rejects relative roots like `.` + to avoid analyzing the wrong directory. +- Fix hotlist key resolution for `production_hotspots` and `test_fixture_hotspots`. +- Bump cache schema to `2.3` (stale metric entries rebuilt, not reused). + +### Report contract + +- Bump canonical report schema to `2.2`. +- Add canonical `meta.analysis_thresholds.design_findings` provenance and move threshold-aware design findings fully + into the canonical report, so MCP and HTML read the same design-finding universe. +- Add `derived.overview.directory_hotspots` and render it in the HTML Overview tab as `Hotspots by Directory`. 
+ +### CLI + +- Add `--changed-only`, `--diff-against`, and `--paths-from-git-diff` for changed-scope review and gating with + first-class summary output. + +### SARIF + +- Stabilize `primaryLocationLineHash` (line numbers excluded), add run-unique `automationDetails.id` / + `startTimeUtc`, set explicit `kind: "fail"`, and move ancillary fields to `properties`. + +### HTML report + +- Add `Hotspots by Directory` to the Overview tab, surfacing directory-level concentration for `all`, `clones`, and low-cohesion findings with scope-aware badges and compact counts. +- Add IDE picker (PyCharm, IDEA, VS Code, Cursor, Fleet, Zed) with persistent selection. +- Add clickable file-path deep links across all tabs and stable `finding-{id}` anchors. + +### GitHub Action + +- Ship Composite Action v2 with configurable quality gates, SARIF upload to Code Scanning, and PR summary comments. + ## [2.0.0b2] ### Dependencies diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ff63f06..6a92890 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,7 +3,7 @@ Thank you for your interest in contributing to **CodeClone**. CodeClone provides **structural code quality analysis** for Python, including clone detection, -quality metrics, and baseline-aware CI governance. +quality metrics, baseline-aware CI governance, and an optional MCP agent interface. Contributions are welcome — especially those that improve **signal quality**, **CFG semantics**, and **real-world CI usability**. 
@@ -31,8 +31,11 @@ We especially welcome contributions in the following areas: - Control Flow Graph (CFG) construction and semantics - AST normalization improvements - Segment-level clone detection and reporting +- Quality metrics (complexity, coupling, cohesion, dead-code, dependencies) - False-positive reduction - HTML report UX improvements +- MCP server tools and agent workflows +- GitHub Action improvements - Performance optimizations - Documentation and real-world examples @@ -51,6 +54,8 @@ When reporting issues related to clone detection, include: - AST-related, - CFG-related, - normalization-related, + - metrics-related, + - MCP-related, - reporting / UI-related. Screenshots alone are usually insufficient for analysis. @@ -73,8 +78,6 @@ Well-argued false-positive reports are valuable and appreciated. ## CFG Semantics Discussions -CFG behavior in CodeClone is intentionally conservative in the 1.x series. - If proposing changes to CFG semantics, include: - a description of the current behavior; @@ -98,15 +101,13 @@ Such changes often require design-level discussion and may be staged across vers ## Baseline & CI -### Baseline contract (v1) +### Baseline contract (v2) -- The baseline schema is versioned (`meta.schema_version`). +- The baseline schema is versioned (`meta.schema_version`, currently `2.0`). - Compatibility/trust gates include `schema_version`, `fingerprint_version`, `python_tag`, and `meta.generator.name`. -- Integrity is tamper-evident via `meta.payload_sha256` over canonical payload: - `clones.functions`, `clones.blocks`, `meta.fingerprint_version`, `meta.python_tag`. - `meta.schema_version`, `meta.generator.name`, `meta.generator.version`, and `created_at` - are excluded from payload hashing. +- Integrity is tamper-evident via `meta.payload_sha256` over canonical payload. +- The baseline may embed a `metrics` section for metrics-baseline-aware CI gating. 
### When baseline regeneration is required @@ -131,12 +132,55 @@ Such changes often require design-level discussion and may be staged across vers --- +## Versioned schemas + +CodeClone maintains several versioned schema contracts: + +| Schema | Current version | Owner | +|------------------|-----------------|-------------------------------------| +| Baseline | `2.0` | `codeclone/baseline.py` | +| Report | `2.2` | `codeclone/report/json_contract.py` | +| Cache | `2.3` | `codeclone/cache.py` | +| Metrics baseline | `1.0` | `codeclone/metrics_baseline.py` | + +Any change to schema shape or semantics requires version review, documentation, and tests. + +--- + +## MCP Interface + +CodeClone includes an optional **read-only MCP server** (`codeclone[mcp]`) for AI agents. + +When contributing to MCP: + +- MCP must remain **read-only** — it must never mutate baselines, source files, or repo state. +- Session-local review markers are the only allowed mutable state (in-memory, ephemeral). +- MCP reuses pipeline/report contracts — do not create a second analysis truth path. +- Tool names, resource URIs, and response shapes are public surfaces — changes require tests and docs. + +See `docs/mcp.md` and `docs/book/20-mcp-interface.md` for details. + +--- + +## GitHub Action + +CodeClone ships a composite GitHub Action (`.github/actions/codeclone/`). + +When contributing to the Action: + +- Never inline `${{ inputs.* }}` in shell scripts — pass through `env:` variables. + +- Prefer major-tag pinning for actions (e.g., `actions/setup-python@v5`). +- Add timeouts to all `subprocess.run` calls. + +--- + ## Development Setup + +```bash git clone https://github.com/orenlab/codeclone.git cd codeclone uv sync --all-extras --dev +uv run pre-commit install ``` Run tests: @@ -148,16 +192,26 @@ uv run pytest Static checks: ```bash
+uv run pre-commit run --all-files +``` + +Build documentation (if you touched `docs/` or `mkdocs.yml`): + +```bash +uv run --with mkdocs --with mkdocs-material mkdocs build --strict +``` + +Run MCP tests (if you touched `mcp_service.py` or `mcp_server.py`): + +```bash +uv run pytest -q tests/test_mcp_service.py tests/test_mcp_server.py ``` --- ## Code Style -- Python **3.10–3.14** +- Python **3.10 – 3.14** - Type annotations are required - `Any` should be minimized; prefer precise types and small typed helpers - `mypy` must pass @@ -182,5 +236,7 @@ and may require a `fingerprint_version` bump (and thus baseline regeneration). ## License -By contributing to CodeClone, you agree that your contributions will be licensed -under the **MIT License**. +By contributing code to CodeClone, you agree that your contributions will be +licensed under **MPL-2.0**. + +Documentation contributions are licensed under **MIT**. diff --git a/LICENSE b/LICENSE index fdcac7c..df9d84d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,23 +1,373 @@ -MIT License +Mozilla Public License Version 2.0 +================================== -Copyright (c) 2024 Denis Rozhnovskiy +1. Definitions +-------------- -The name “CodeClone” refers to the official project distribution. +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. 
-The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. +1.3. "Contribution" + means Covered Software of a particular Contributor. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. 
"Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted from a particular Contributor are +reinstated (a) provisionally, unless and until such Contributor +explicitly and finally terminates Your grants, and (b) on an ongoing +basis, if such Contributor fails to notify You of the non-compliance by +some reasonable means prior to 60 days after You have come back into +compliance. 
Moreover, Your grants from a particular Contributor are +reinstated on an ongoing basis if such Contributor notifies You of the +non-compliance by some reasonable means, this is the first time You have +received notice of non-compliance with this License from such +Contributor, and You become compliant prior to 30 days after Your +receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/LICENSE-docs b/LICENSE-docs new file mode 100644 index 0000000..66b3e88 --- /dev/null +++ b/LICENSE-docs @@ -0,0 +1,25 @@ +MIT License + +Copyright (c) 2024 Denis Rozhnovskiy + +This license applies to documentation in this repository, including the +`docs/` tree and Markdown documentation files, unless a file states +otherwise. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this documentation and associated files (the "Documentation"), to deal +in the Documentation without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Documentation, and to permit persons to whom the +Documentation is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Documentation. + +THE DOCUMENTATION IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE DOCUMENTATION OR THE USE OR OTHER DEALINGS IN +THE DOCUMENTATION. diff --git a/README.md b/README.md index 38bcb00..dac69e6 100644 --- a/README.md +++ b/README.md @@ -12,44 +12,70 @@ Tests Benchmark Python - codeclone 81 (B) - License + codeclone 85 (B) + License

--- -CodeClone provides comprehensive structural code quality analysis for Python. It detects architectural -duplication via normalized AST and Control Flow Graphs, computes quality metrics, and enforces CI gates — -all with baseline-aware governance that separates **known** technical debt from **new** regressions. +CodeClone provides deterministic structural code quality analysis for Python. +It detects architectural duplication, computes quality metrics, and enforces CI gates — all with **baseline-aware +governance** that separates **known** technical debt from **new** regressions. +An optional MCP interface exposes the same canonical analysis pipeline to AI agents and IDEs. Docs: [orenlab.github.io/codeclone](https://orenlab.github.io/codeclone/) · Live sample report: [orenlab.github.io/codeclone/examples/report/](https://orenlab.github.io/codeclone/examples/report/) +> [!NOTE] +> This README and docs site track the in-development `v2.0.x` line from `main`. +> For the latest stable CodeClone documentation (`v1.4.4`), see the +> [`v1.4.4` README](https://github.com/orenlab/codeclone/blob/v1.4.4/README.md) +> and the +> [`v1.4.4` docs tree](https://github.com/orenlab/codeclone/tree/v1.4.4/docs). 
+ ## Features - **Clone detection** — function (CFG fingerprint), block (statement windows), and segment (report-only) clones - **Structural findings** — duplicated branch families, clone guard/exit divergence and clone-cohort drift (report-only) - **Quality metrics** — cyclomatic complexity, coupling (CBO), cohesion (LCOM4), dependency cycles, dead code, health score -- **Baseline governance** — known debt stays accepted; CI blocks only new clones and metric regressions +- **Baseline governance** — separates accepted **legacy** debt from **new regressions** and lets CI fail **only** on + what changed - **Reports** — interactive HTML, deterministic JSON/TXT plus Markdown and SARIF projections from one canonical report +- **MCP server** — optional read-only MCP surface for AI agents and IDEs, designed as a budget-aware guided control + surface for agentic development - **CI-first** — deterministic output, stable ordering, exit code contract, pre-commit support -- **Fast*** — incremental caching, parallel processing, warm-run optimization, and reproducible benchmark coverage +- **Fast** — incremental caching, parallel processing, warm-run optimization, and reproducible benchmark coverage ## Quick Start ```bash -pip install codeclone # or: uv tool install codeclone - -codeclone . # analyze current directory -codeclone . --html # generate HTML report -codeclone . --html --open-html-report # generate and open HTML report -codeclone . --json --md --sarif --text # generate machine-readable reports -codeclone . --html --json --timestamped-report-paths # keep timestamped report snapshots -codeclone . --ci # CI mode (--fail-on-new --no-color --quiet) +pip install codeclone # or: uv tool install codeclone + +codeclone . # analyze +codeclone . --html # HTML report +codeclone . --html --open-html-report # open in browser +codeclone . --json --md --sarif --text # all formats +codeclone . --ci # CI mode ``` +
+More examples + +```bash +# timestamped report snapshots +codeclone . --html --json --timestamped-report-paths + +# changed-scope gating against git diff +codeclone . --changed-only --diff-against main + +# shorthand: diff source for changed-scope review +codeclone . --paths-from-git-diff HEAD~1 +``` + +
+
Run without install @@ -69,9 +95,34 @@ codeclone . --update-baseline codeclone . --ci ``` -The `--ci` preset equals `--fail-on-new --no-color --quiet`. +
+What --ci enables +The --ci preset equals --fail-on-new --no-color --quiet. When a trusted metrics baseline is loaded, CI mode also enables -`--fail-on-new-metrics`. +--fail-on-new-metrics. +
+ +### GitHub Action + +CodeClone also ships a composite GitHub Action for PR and CI workflows: + +```yaml +- uses: orenlab/codeclone/.github/actions/codeclone@main + with: + fail-on-new: "true" + sarif: "true" + pr-comment: "true" +``` + +It can: + +- run baseline-aware gating +- generate JSON and SARIF reports +- upload SARIF to GitHub Code Scanning +- post or update a PR summary comment + +Action docs: +[.github/actions/codeclone/README.md](https://github.com/orenlab/codeclone/blob/main/.github/actions/codeclone/README.md) ### Quality Gates @@ -101,6 +152,36 @@ repos: types: [ python ] ``` +## MCP Server + +CodeClone ships an optional read-only MCP server for AI agents and IDE clients. + +```bash +# install the MCP extra +pip install "codeclone[mcp]" + +# local agents (Claude Code, Codex, Copilot, Gemini CLI) +codeclone-mcp --transport stdio + +# remote / HTTP-only clients +codeclone-mcp --transport streamable-http --port 8000 +``` + +20 tools + 10 resources — deterministic, baseline-aware, and read-only. +Never mutates source files, baselines, or repo state. + +Payloads are optimized for LLM context: compact summaries by default, full detail on demand. +The cheapest useful path is also the most obvious path: first-pass triage stays compact, and deeper detail is explicit. + +Recommended agent flow: +`analyze_repository` or `analyze_changed_paths` → `get_run_summary` or `get_production_triage` → +`list_hotspots` or `check_*` → `get_finding` → `get_remediation` + +Docs: +[MCP usage guide](https://orenlab.github.io/codeclone/mcp/) +· +[MCP interface contract](https://orenlab.github.io/codeclone/book/20-mcp-interface/) + ## Configuration CodeClone can load project-level configuration from `pyproject.toml`: @@ -163,8 +244,7 @@ All report formats are rendered from one canonical JSON report document. - `--timestamped-report-paths` appends a UTC timestamp to default report filenames for bare report flags such as `--html` or `--json`. 
Explicit report paths are not rewritten. -The published docs site also includes a live example HTML/JSON/SARIF report -generated from the current `codeclone` repository during the docs build. +The docs site also includes live example HTML/JSON/SARIF reports generated from the current `codeclone` repository. Structural findings include: @@ -191,16 +271,21 @@ class Middleware: # codeclone: ignore[dead-code] Dynamic/runtime false positives are resolved via explicit inline suppressions, not via broad heuristics.
-JSON report shape (v2.1) +Canonical JSON report shape (v2.2) ```json { - "report_schema_version": "2.1", + "report_schema_version": "2.2", "meta": { - "codeclone_version": "2.0.0b2", + "codeclone_version": "2.0.0b3", "project_name": "...", "scan_root": ".", "report_mode": "full", + "analysis_thresholds": { + "design_findings": { + "...": "..." + } + }, "baseline": { "...": "..." }, @@ -211,6 +296,7 @@ Dynamic/runtime false positives are resolved via explicit inline suppressions, n "...": "..." }, "runtime": { + "analysis_started_at_utc": "...", "report_generated_at_utc": "..." } }, @@ -257,7 +343,8 @@ Dynamic/runtime false positives are resolved via explicit inline suppressions, n "families": {}, "top_risks": [], "source_scope_breakdown": {}, - "health_snapshot": {} + "health_snapshot": {}, + "directory_hotspots": {} }, "hotlists": { "most_actionable_ids": [], @@ -300,20 +387,20 @@ CFG semantics: [CFG semantics](https://orenlab.github.io/codeclone/cfg/) ## Documentation -| Topic | Link | -|----------------------------|----------------------------------------------------------------------------------------------------| -| Contract book (start here) | [Contracts and guarantees](https://orenlab.github.io/codeclone/book/00-intro/) | -| Exit codes | [Exit codes and failure policy](https://orenlab.github.io/codeclone/book/03-contracts-exit-codes/) | -| Configuration | [Config and defaults](https://orenlab.github.io/codeclone/book/04-config-and-defaults/) | -| Baseline contract | [Baseline contract](https://orenlab.github.io/codeclone/book/06-baseline/) | -| Cache contract | [Cache contract](https://orenlab.github.io/codeclone/book/07-cache/) | -| Report contract | [Report contract](https://orenlab.github.io/codeclone/book/08-report/) | +| Topic | Link | +|----------------------------|-----------------------------------------------------------------------------------------------------| +| Contract book (start here) | [Contracts and 
guarantees](https://orenlab.github.io/codeclone/book/00-intro/) | +| Exit codes | [Exit codes and failure policy](https://orenlab.github.io/codeclone/book/03-contracts-exit-codes/) | +| Configuration | [Config and defaults](https://orenlab.github.io/codeclone/book/04-config-and-defaults/) | +| Baseline contract | [Baseline contract](https://orenlab.github.io/codeclone/book/06-baseline/) | +| Cache contract | [Cache contract](https://orenlab.github.io/codeclone/book/07-cache/) | +| Report contract | [Report contract](https://orenlab.github.io/codeclone/book/08-report/) | | Metrics & quality gates | [Metrics and quality gates](https://orenlab.github.io/codeclone/book/15-metrics-and-quality-gates/) | -| Dead code | [Dead-code contract](https://orenlab.github.io/codeclone/book/16-dead-code-contract/) | -| Docker benchmark contract | [Benchmarking contract](https://orenlab.github.io/codeclone/book/18-benchmarking/) | -| Determinism | [Determinism policy](https://orenlab.github.io/codeclone/book/12-determinism/) | +| Dead code | [Dead-code contract](https://orenlab.github.io/codeclone/book/16-dead-code-contract/) | +| Docker benchmark contract | [Benchmarking contract](https://orenlab.github.io/codeclone/book/18-benchmarking/) | +| Determinism | [Determinism policy](https://orenlab.github.io/codeclone/book/12-determinism/) | -## * Benchmarking +## Benchmarking Notes
Reproducible Docker Benchmark @@ -337,8 +424,15 @@ in [Benchmarking contract](https://orenlab.github.io/codeclone/book/18-benchmark
+## License + +- **Code:** MPL-2.0 +- **Documentation:** MIT + +Versions released before this change remain under their original license terms. + ## Links - **Issues:** - **PyPI:** -- **License:** MIT +- **Licenses:** [MPL-2.0](LICENSE) · [MIT docs](LICENSE-docs) diff --git a/SECURITY.md b/SECURITY.md index aca157b..333de2d 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -27,36 +27,74 @@ CodeClone operates purely on static input and follows a conservative execution m - Performs analysis in-process with explicit resource limits - Generates static HTML reports without external dependencies -Potential risk areas include: +### Core analysis -- malformed or adversarial source files -- extremely large inputs leading to resource exhaustion -- HTML report generation and embedding - -These areas are explicitly tested and hardened, but are still the primary focus of -ongoing security review. +- Scanner traversal is root-confined and prevents symlink-based path escape. +- Temporary files use unpredictable names (`tempfile.NamedTemporaryFile` with `delete=False`) + and atomic replacement (`os.replace`) to prevent predictable-path attacks. -Additional safeguards: +### HTML reports - HTML report content is escaped in both text and attribute contexts to prevent script injection. - Reports are static and do not execute analyzed code. -- Report explainability fields are generated in Python core; UI is rendering-only and does not infer semantics. -- Scanner traversal is root-confined and prevents symlink-based path escape. +- Report explainability fields are generated in Python core; UI is rendering-only and does not + infer semantics. + +### Baseline and cache integrity + - Baseline files are schema/type validated with size limits and tamper-evident integrity fields (`meta.generator` as trust gate, `meta.payload_sha256` as integrity hash in baseline schema `2.0`). - Baseline integrity is tamper-evident (audit signal), not tamper-proof cryptographic signing. 
An actor who can rewrite baseline content and recompute `payload_sha256` can still alter it. -- Baseline hash covers canonical payload only (`clones.functions`, `clones.blocks`, +- Baseline hash covers canonical clone payload (`clones.functions`, `clones.blocks`, `meta.fingerprint_version`, `meta.python_tag`). - Baseline hash excludes non-semantic metadata (`created_at`, `meta.generator.version`). - `meta.schema_version` and `meta.generator.name` are validated as compatibility/trust gates and are intentionally excluded from `payload_sha256`. +- Metrics baseline (`MetricsBaseline`) maintains a separate integrity hash over its own payload, + independent of the clone baseline hash. - In `--ci` (or explicit `--fail-on-new`), untrusted baseline states fail fast; otherwise baseline is ignored with explicit warning and comparison proceeds against an empty baseline. - Cache files are integrity-signed with canonical payload hashing (constant-time comparison), size-limited, and ignored on mismatch. - Legacy cache secret files (`.cache/codeclone/.cache_secret`) are obsolete and should be removed. +### MCP server + +CodeClone includes an optional read-only MCP server (`codeclone[mcp]`) that exposes +analysis results over JSON-RPC (stdio transport). + +- The MCP server is **read-only**: it never mutates baselines, source files, cache, or repo state. +- Session-local review markers are in-memory only and discarded on process exit. +- Tool arguments that accept git refs (`git_diff_ref`) are validated against a strict regex + to prevent command injection via `subprocess` calls. +- The MCP run store is bounded (`history_limit`) with FIFO eviction to prevent unbounded + memory growth from repeated analysis calls. +- MCP is an optional extra (`codeclone[mcp]`); its runtime dependencies are never loaded + by the base install or CLI. + +### GitHub Action + +CodeClone ships a composite GitHub Action (`.github/actions/codeclone/`). 
+ +- All `${{ inputs.* }}` values are passed through `env:` variables, never inlined in shell + scripts, to prevent script injection from untrusted PR authors. +- External subprocess calls use explicit timeouts (`timeout=600` for analysis, + `timeout=30` for git commands) to prevent hanging CI runners. + +### Potential risk areas + +Potential risk areas include: + +- malformed or adversarial source files +- extremely large inputs leading to resource exhaustion +- HTML report generation and embedding +- MCP tool arguments from untrusted agent contexts +- GitHub Action inputs from untrusted PR authors + +These areas are explicitly tested and hardened, but remain the primary focus of +ongoing security review. + --- ## Reporting a Vulnerability diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index c9b7135..ba12356 100755 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone.baseline.json b/codeclone.baseline.json index 4492b81..8b28ea2 100644 --- a/codeclone.baseline.json +++ b/codeclone.baseline.json @@ -2,14 +2,14 @@ "meta": { "generator": { "name": "codeclone", - "version": "2.0.0b2" + "version": "2.0.0b3" }, "schema_version": "2.0", "fingerprint_version": "1", "python_tag": "cp313", - "created_at": "2026-03-26T16:36:17Z", + "created_at": "2026-03-29T16:19:22Z", "payload_sha256": "691c6cedd10e2a51d6038780f3ae9dffe763356dd2aba742b3980f131b79f217", - "metrics_payload_sha256": "f18db9aa4573517b0babb31e4e995208209895ea6b8a1957087c0f3b6f1f5434" + "metrics_payload_sha256": "878d5169c9ffd6d73eb0ce3ce55166df3d080b85ed835091f33ff53d2779b9ac" }, "clones": { "functions": [ @@ -30,16 +30,14 @@ "high_risk_functions": [], "max_coupling": 10, "high_coupling_classes": [], - "max_cohesion": 5, + "max_cohesion": 4, "low_cohesion_classes": [ - "codeclone.baseline:Baseline", - "codeclone.metrics_baseline:MetricsBaseline", "tests.test_golden_v2:_DummyExecutor" ], "dependency_cycles": [], - "dependency_max_depth": 10, + "dependency_max_depth": 11, "dead_code_items": [], - "health_score": 81, + "health_score": 85, "health_grade": "B" } } diff --git a/codeclone/__init__.py b/codeclone/__init__.py index b52ea47..cb9ce1d 100644 --- a/codeclone/__init__.py +++ b/codeclone/__init__.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from importlib.metadata import PackageNotFoundError, version diff --git a/codeclone/_cli_args.py b/codeclone/_cli_args.py index d2796b9..17a2a2f 100644 --- a/codeclone/_cli_args.py +++ b/codeclone/_cli_args.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -130,6 +133,23 @@ def build_parser(version: str) -> _ArgumentParser: default=DEFAULT_PROCESSES, help=ui.HELP_PROCESSES, ) + _add_bool_optional_argument( + analysis_group, + flag="--changed-only", + help_text=ui.HELP_CHANGED_ONLY, + ) + analysis_group.add_argument( + "--diff-against", + default=None, + metavar="GIT_REF", + help=ui.HELP_DIFF_AGAINST, + ) + analysis_group.add_argument( + "--paths-from-git-diff", + default=None, + metavar="GIT_REF", + help=ui.HELP_PATHS_FROM_GIT_DIFF, + ) _add_optional_path_argument( analysis_group, flag="--cache-path", diff --git a/codeclone/_cli_baselines.py b/codeclone/_cli_baselines.py index 64a187c..ed415f7 100644 --- a/codeclone/_cli_baselines.py +++ b/codeclone/_cli_baselines.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_cli_config.py b/codeclone/_cli_config.py index b31d9b1..22efec1 100644 --- a/codeclone/_cli_config.py +++ b/codeclone/_cli_config.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_cli_gating.py b/codeclone/_cli_gating.py index d6d100f..5a5ae7d 100644 --- a/codeclone/_cli_gating.py +++ b/codeclone/_cli_gating.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_cli_meta.py b/codeclone/_cli_meta.py index 6d893ec..f112d8d 100644 --- a/codeclone/_cli_meta.py +++ b/codeclone/_cli_meta.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -8,6 +11,11 @@ from typing import TYPE_CHECKING, TypedDict from .baseline import Baseline, current_python_tag +from .contracts import ( + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, +) if TYPE_CHECKING: from pathlib import Path @@ -67,6 +75,10 @@ class ReportMeta(TypedDict): health_grade: str | None analysis_mode: str metrics_computed: list[str] + design_complexity_threshold: int + design_coupling_threshold: int + design_cohesion_threshold: int + analysis_started_at_utc: str | None report_generated_at_utc: str @@ -91,6 +103,10 @@ def _build_report_meta( health_grade: str | None, analysis_mode: str, metrics_computed: tuple[str, ...], + design_complexity_threshold: int = DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + design_coupling_threshold: int = DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + design_cohesion_threshold: int = DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + analysis_started_at_utc: str | None, report_generated_at_utc: str, ) -> ReportMeta: project_name = scan_root.name or str(scan_root) @@ -133,5 +149,9 @@ def _build_report_meta( "health_grade": health_grade, "analysis_mode": analysis_mode, "metrics_computed": list(metrics_computed), + "design_complexity_threshold": design_complexity_threshold, + "design_coupling_threshold": design_coupling_threshold, + "design_cohesion_threshold": design_cohesion_threshold, + "analysis_started_at_utc": analysis_started_at_utc, "report_generated_at_utc": report_generated_at_utc, } diff --git a/codeclone/_cli_paths.py b/codeclone/_cli_paths.py index 2fb6d11..3577dc0 100644 --- a/codeclone/_cli_paths.py +++ b/codeclone/_cli_paths.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_cli_reports.py b/codeclone/_cli_reports.py index f1ffea6..126879c 100644 --- a/codeclone/_cli_reports.py +++ b/codeclone/_cli_reports.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_cli_rich.py b/codeclone/_cli_rich.py index 506a6ce..88f9d00 100644 --- a/codeclone/_cli_rich.py +++ b/codeclone/_cli_rich.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_cli_runtime.py b/codeclone/_cli_runtime.py index b7e315e..616057b 100644 --- a/codeclone/_cli_runtime.py +++ b/codeclone/_cli_runtime.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_cli_summary.py b/codeclone/_cli_summary.py index d1d2369..69b30da 100644 --- a/codeclone/_cli_summary.py +++ b/codeclone/_cli_summary.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -25,6 +28,14 @@ class MetricsSnapshot: suppressed_dead_code_count: int = 0 +@dataclass(frozen=True, slots=True) +class ChangedScopeSnapshot: + paths_count: int + findings_total: int + findings_new: int + findings_known: int + + class _Printer(Protocol): def print(self, *objects: object, **kwargs: object) -> None: ... @@ -149,3 +160,34 @@ def _print_metrics( suppressed=metrics.suppressed_dead_code_count, ) ) + + +def _print_changed_scope( + *, + console: _Printer, + quiet: bool, + changed_scope: ChangedScopeSnapshot, +) -> None: + if quiet: + console.print( + ui.fmt_changed_scope_compact( + paths=changed_scope.paths_count, + findings=changed_scope.findings_total, + new=changed_scope.findings_new, + known=changed_scope.findings_known, + ) + ) + return + + from rich.rule import Rule + + console.print() + console.print(Rule(title=ui.CHANGED_SCOPE_TITLE, style="dim", characters="\u2500")) + console.print(ui.fmt_changed_scope_paths(count=changed_scope.paths_count)) + console.print( + ui.fmt_changed_scope_findings( + total=changed_scope.findings_total, + new=changed_scope.findings_new, + known=changed_scope.findings_known, + ) + ) diff --git a/codeclone/_coerce.py b/codeclone/_coerce.py index e4c07bd..9017c6a 100644 --- a/codeclone/_coerce.py +++ b/codeclone/_coerce.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject 
to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_html_badges.py b/codeclone/_html_badges.py index f35dc17..dc06b15 100644 --- a/codeclone/_html_badges.py +++ b/codeclone/_html_badges.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Shared HTML badge, label, and visual helpers for the report UI layer. diff --git a/codeclone/_html_css.py b/codeclone/_html_css.py index 3accd98..8923410 100644 --- a/codeclone/_html_css.py +++ b/codeclone/_html_css.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """CSS design system for the HTML report — tokens, components, layout.""" @@ -167,12 +170,14 @@ background:var(--bg-surface);border:1px solid var(--border);border-radius:var(--radius-lg); overflow-x:auto;scrollbar-width:none;-webkit-overflow-scrolling:touch} .main-tabs::-webkit-scrollbar{display:none} -.main-tab{position:relative;flex:1;text-align:center;padding:var(--sp-2) var(--sp-3); - background:none;border:none;cursor:pointer;font-size:.85rem;font-weight:500; - color:var(--text-muted);white-space:nowrap;border-radius:var(--radius-md); - transition:all var(--dur-fast) var(--ease)} +.main-tab{position:relative;flex:1;display:inline-flex;align-items:center;justify-content:center; + gap:var(--sp-1);text-align:center;padding:var(--sp-2) var(--sp-3);background:none; + border:none;cursor:pointer;font-size:.85rem;font-weight:500;color:var(--text-muted); + white-space:nowrap;border-radius:var(--radius-md);transition:all var(--dur-fast) var(--ease)} .main-tab:hover{color:var(--text-primary);background:var(--bg-raised)} .main-tab[aria-selected="true"]{color:var(--accent-primary);background:var(--accent-muted)} +.main-tab-icon{flex-shrink:0;opacity:.72} +.main-tab-label{display:inline-flex;align-items:center} .tab-count{display:inline-flex;align-items:center;justify-content:center;min-width:18px; height:18px;padding:0 5px;font-size:.7rem;font-weight:700;border-radius:9px; background:var(--bg-overlay);color:var(--text-muted);margin-left:var(--sp-1)} @@ -653,6 +658,23 @@ .breakdown-bar-track{height:6px;border-radius:3px;background:var(--bg-raised);overflow:hidden} .breakdown-bar-fill{display:block;height:100%;border-radius:3px; background:var(--accent-primary);transition:width .6s var(--ease)} +/* Directory hotspot entries */ +.dir-hotspot-list{display:flex;flex-direction:column;gap:0} +.dir-hotspot-entry{padding:var(--sp-2) 0;border-bottom:1px solid color-mix(in srgb,var(--border) 50%,transparent)} 
+.dir-hotspot-entry:last-child{border-bottom:none;padding-bottom:0} +.dir-hotspot-entry:first-child{padding-top:0} +.dir-hotspot-path{display:flex;align-items:center;gap:var(--sp-2);margin-bottom:4px;min-width:0} +.dir-hotspot-path code{font-size:.78rem;font-weight:600;color:var(--text-primary);line-height:1.3} +.dir-hotspot-bar-row{display:flex;align-items:center;gap:var(--sp-2);margin-bottom:3px} +.dir-hotspot-bar-track{flex:1;height:4px;border-radius:2px;background:var(--bg-raised); + overflow:hidden;display:flex} +.dir-hotspot-bar-prev{height:100%;background:var(--text-muted);opacity:.18} +.dir-hotspot-bar-cur{height:100%;background:var(--accent-primary);opacity:.7} +.dir-hotspot-pct{font-size:.7rem;font-weight:600;font-variant-numeric:tabular-nums; + color:var(--text-muted);min-width:3.2em;text-align:right} +.dir-hotspot-meta{display:flex;flex-wrap:wrap;gap:6px;font-size:.68rem;color:var(--text-muted)} +.dir-hotspot-meta span{font-variant-numeric:tabular-nums} +.dir-hotspot-meta-sep{opacity:.3} /* Health radar chart */ .health-radar{display:flex;justify-content:center;padding:var(--sp-3) 0} .health-radar svg{width:100%;max-width:520px;height:auto;overflow:visible} @@ -778,10 +800,10 @@ .suggestion-sev-inline{font-size:.72rem;font-weight:600;padding:1px var(--sp-1); border-radius:var(--radius-sm)} .suggestion-title{font-weight:600;font-size:.85rem;color:var(--text-primary);flex:1;min-width:0} -.suggestion-meta{display:flex;align-items:center;gap:var(--sp-1);flex-shrink:0;flex-wrap:wrap} -.suggestion-meta-badge{font-size:.68rem;font-family:var(--font-mono);font-weight:500; - padding:1px var(--sp-2);border-radius:var(--radius-sm);background:var(--bg-overlay); - color:var(--text-muted);white-space:nowrap} +.suggestion-meta{display:flex;align-items:center;gap:var(--sp-2);flex-shrink:0;flex-wrap:wrap} +.suggestion-meta-badge{font-size:.68rem;font-weight:600;padding:2px var(--sp-2); + border-radius:999px;background:var(--bg-overlay);color:var(--text-muted); + 
white-space:nowrap;line-height:1.2;font-variant-numeric:tabular-nums} .suggestion-effort--easy{color:var(--success);background:var(--success-muted, rgba(34,197,94,.1))} .suggestion-effort--moderate{color:var(--warning);background:var(--warning-muted)} .suggestion-effort--hard{color:var(--error);background:var(--error-muted)} @@ -1075,6 +1097,11 @@ .theme-toggle{font-size:0;gap:0;width:32px;height:32px; padding:0;align-items:center;justify-content:center} .theme-toggle svg{width:16px;height:16px} + .ide-picker-btn{font-size:0;gap:0;width:32px;height:32px; + padding:0;align-items:center;justify-content:center} + .ide-picker-btn svg{width:16px;height:16px} + .ide-picker-label{display:none} + .ide-menu{right:0;min-width:140px} .main-tabs-wrap{position:sticky;top:0;z-index:90;padding:var(--sp-2) 0 0} .main-tabs{padding:var(--sp-1);gap:2px; background: @@ -1084,6 +1111,7 @@ linear-gradient(to left,rgba(0,0,0,.12),transparent) right center / 10px 100% no-repeat scroll, var(--bg-surface)} .main-tab{flex:none;padding:var(--sp-1) var(--sp-2);font-size:.78rem} + .main-tab-icon{width:13px;height:13px} } @media(max-width:480px){ .overview-kpi-grid{grid-template-columns:1fr} @@ -1091,10 +1119,41 @@ .brand-logo{width:28px;height:28px} } +/* IDE link */ +.ide-link{color:inherit;text-decoration:none;cursor:default} +[data-ide]:not([data-ide=""]) .ide-link{cursor:pointer;color:var(--accent-primary); + text-decoration-line:underline;text-decoration-style:dotted;text-underline-offset:2px} +[data-ide]:not([data-ide=""]) .ide-link:hover{text-decoration-style:solid} + +/* IDE picker dropdown */ +.ide-picker{position:relative;display:inline-flex} +.ide-picker-btn{display:inline-flex;align-items:center;gap:var(--sp-1); + padding:var(--sp-1) var(--sp-3);background:none;border:1px solid var(--border); + border-radius:var(--radius-md);cursor:pointer;color:var(--text-muted);font-size:.85rem; + font-weight:500;font-family:inherit;transition:all var(--dur-fast) var(--ease); + white-space:nowrap} 
+.ide-picker-btn:hover{color:var(--text-primary);background:var(--bg-raised);border-color:var(--border-strong)} +.ide-picker-btn svg{width:16px;height:16px;flex-shrink:0} +.ide-picker-btn[aria-expanded="true"]{color:var(--accent-primary);border-color:var(--accent-primary)} +.ide-menu{display:none;position:absolute;top:100%;right:0;margin-top:var(--sp-1); + min-width:160px;background:var(--bg-surface);border:1px solid var(--border); + border-radius:var(--radius);box-shadow:0 4px 12px rgba(0,0,0,.15); + z-index:100;padding:var(--sp-1) 0;list-style:none} +.ide-menu[data-open]{display:block} +.ide-menu li{padding:0} +.ide-menu button{display:flex;align-items:center;gap:var(--sp-2);width:100%; + padding:var(--sp-1) var(--sp-3);background:none;border:none;color:var(--text-primary); + font-size:.8rem;font-family:var(--font-sans);cursor:pointer;text-align:left} +.ide-menu button:hover{background:var(--bg-alt)} +.ide-menu button[aria-checked="true"]{color:var(--accent-primary);font-weight:600} +.ide-menu button[aria-checked="true"]::before{content:'\\2713';font-size:.7rem; + width:14px;text-align:center;flex-shrink:0} +.ide-menu button[aria-checked="false"]::before{content:'';width:14px;flex-shrink:0} + /* Print */ @media print{ .topbar,.toolbar,.pagination,.theme-toggle,.toast-container, - .novelty-tabs,.clear-btn,.btn{display:none!important} + .novelty-tabs,.clear-btn,.btn,.ide-picker{display:none!important} .tab-panel{display:block!important;break-inside:avoid} .group-body{display:block!important} body{background:#fff;color:#000} diff --git a/codeclone/_html_data_attrs.py b/codeclone/_html_data_attrs.py index cf10e4b..74b2f8b 100644 --- a/codeclone/_html_data_attrs.py +++ b/codeclone/_html_data_attrs.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Unified data-attribute builder for HTML elements.""" diff --git a/codeclone/_html_escape.py b/codeclone/_html_escape.py index b12a3b8..63b1a7e 100644 --- a/codeclone/_html_escape.py +++ b/codeclone/_html_escape.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_html_filters.py b/codeclone/_html_filters.py index 980cf91..dd9bbf3 100644 --- a/codeclone/_html_filters.py +++ b/codeclone/_html_filters.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Data-driven filter dropdown renderer for report toolbars.""" diff --git a/codeclone/_html_js.py b/codeclone/_html_js.py index 12ad40e..0d07299 100644 --- a/codeclone/_html_js.py +++ b/codeclone/_html_js.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """JavaScript for the HTML report — modular IIFE with feature blocks.""" @@ -567,6 +570,105 @@ _LAZY_HIGHLIGHT = "" +# --------------------------------------------------------------------------- +# IDE links +# --------------------------------------------------------------------------- + +_IDE_LINKS = r""" +(function initIdeLinks(){ + const KEY='codeclone-ide'; + const root=document.documentElement; + var scanRoot=root.getAttribute('data-scan-root')||''; + var projectName=scanRoot.replace(/\/$/,'').split('/').pop()||''; + + function relPath(abs){ + var r=scanRoot.replace(/\/$/,'')+'/'; + if(abs.indexOf(r)===0)return abs.substring(r.length); + return abs; + } + + const SCHEMES={ + pycharm:{label:'PyCharm', + url:function(f,l){return 'jetbrains://pycharm/navigate/reference?project='+encodeURIComponent(projectName)+'&path='+encodeURIComponent(relPath(f))+':'+l}}, + idea:{label:'IntelliJ IDEA', + url:function(f,l){return 'jetbrains://idea/navigate/reference?project='+encodeURIComponent(projectName)+'&path='+encodeURIComponent(relPath(f))+':'+l}}, + vscode:{label:'VS Code', + url:function(f,l){return 'vscode://file'+f+':'+l}}, + cursor:{label:'Cursor', + url:function(f,l){return 'cursor://file'+f+':'+l}}, + fleet:{label:'Fleet', + url:function(f,l){return 'fleet://open?file='+encodeURIComponent(f)+'&line='+l}}, + zed:{label:'Zed', + url:function(f,l){return 'zed://file'+f+':'+l}}, + '': {label:'None',url:null} + }; + + var current=localStorage.getItem(KEY)||''; + root.setAttribute('data-ide',current); + + const btn=$('.ide-picker-btn'); + const menu=$('.ide-menu'); + const label=$('.ide-picker-label'); + if(!btn||!menu)return; + + function updateLabel(){ + if(!label)return; + var s=SCHEMES[current]; + label.textContent=s&¤t?s.label:'IDE'; + } + + function setChecked(){ + menu.querySelectorAll('button').forEach(function(b){ + b.setAttribute('aria-checked',b.dataset.ide===current?'true':'false'); + 
}); + } + + function applyHrefs(){ + var s=SCHEMES[current]; + $$('.ide-link[data-file]').forEach(function(a){ + if(!current||!s||!s.url){a.removeAttribute('href');return} + var f=a.getAttribute('data-file'),l=a.getAttribute('data-line')||'1'; + if(!f)return; + a.setAttribute('href',s.url(f,l)); + }); + } + + setChecked(); + updateLabel(); + applyHrefs(); + + // Reapply hrefs when new content becomes visible (tab switch) + var mo=new MutationObserver(function(){applyHrefs()}); + document.querySelectorAll('.tab-panel').forEach(function(p){ + mo.observe(p,{attributes:true,attributeFilter:['class']}); + }); + + btn.addEventListener('click',function(e){ + e.stopPropagation(); + var open=menu.hasAttribute('data-open'); + if(open){menu.removeAttribute('data-open');btn.setAttribute('aria-expanded','false')} + else{menu.setAttribute('data-open','');btn.setAttribute('aria-expanded','true')} + }); + + document.addEventListener('click',function(){ + menu.removeAttribute('data-open');btn.setAttribute('aria-expanded','false'); + }); + + menu.addEventListener('click',function(e){ + e.stopPropagation(); + var b=e.target.closest('button[data-ide]'); + if(!b)return; + current=b.dataset.ide; + localStorage.setItem(KEY,current); + root.setAttribute('data-ide',current); + setChecked(); + updateLabel(); + applyHrefs(); + menu.removeAttribute('data-open');btn.setAttribute('aria-expanded','false'); + }); + +})(); +""" # --------------------------------------------------------------------------- # Public API @@ -589,6 +691,7 @@ _TABLE_SORT, _SCOPE_COUNTERS, _LAZY_HIGHLIGHT, + _IDE_LINKS, ) diff --git a/codeclone/_html_report/__init__.py b/codeclone/_html_report/__init__.py index fbbfff7..69b89c1 100644 --- a/codeclone/_html_report/__init__.py +++ b/codeclone/_html_report/__init__.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """New HTML report package — component-based architecture.""" diff --git a/codeclone/_html_report/_assemble.py b/codeclone/_html_report/_assemble.py index 91172af..29017d4 100644 --- a/codeclone/_html_report/_assemble.py +++ b/codeclone/_html_report/_assemble.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Orchestrator: build_context → render all sections → template.substitute.""" @@ -10,7 +13,7 @@ from .. import __version__, _coerce from .._html_css import build_css -from .._html_escape import _escape_html +from .._html_escape import _escape_attr, _escape_html from .._html_js import build_js from .._html_snippets import _FileCache, _pygments_css from ..contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL @@ -18,7 +21,7 @@ from ..structural_findings import normalize_structural_findings from ..templates import FONT_CSS_URL, REPORT_TEMPLATE from ._context import _meta_pick, build_context -from ._icons import BRAND_LOGO, ICONS +from ._icons import BRAND_LOGO, ICONS, section_icon_html from ._sections._clones import render_clones_panel from ._sections._coupling import render_quality_panel from ._sections._dead_code import render_dead_code_panel @@ -116,6 +119,15 @@ def _tab_badge(count: int) -> str: return f'{count}' # -- Main tab navigation -- + tab_icon_keys: dict[str, str] = { + "overview": "overview", + "clones": "clones", + "quality": "quality", + "dependencies": "dependencies", + "dead-code": "dead-code", + "suggestions": "suggestions", + "structural-findings": "structural-findings", + } tab_defs = [ 
("overview", "Overview", overview_html, ""), ("clones", "Clones", clones_html, _tab_badge(ctx.clone_groups_total)), @@ -148,10 +160,15 @@ def _tab_badge(count: int) -> str: extra = tab_extra_attrs.get(tab_id, "") if extra: extra = " " + extra + tab_icon = section_icon_html( + tab_icon_keys.get(tab_id, ""), + class_name="main-tab-icon", + size=15, + ) tab_buttons.append( f'" + f'{tab_icon}{tab_label}{badge}' ) active = " active" if idx == 0 else "" tab_panels.append( @@ -185,6 +202,22 @@ def _tab_badge(count: int) -> str: else: prov_dot_cls = "dot-neutral" + # -- IDE picker menu -- + ide_options = [ + ("pycharm", "PyCharm"), + ("idea", "IntelliJ IDEA"), + ("vscode", "VS Code"), + ("cursor", "Cursor"), + ("fleet", "Fleet"), + ("zed", "Zed"), + ("", "None"), + ] + ide_menu_items = "".join( + f'
  • ' + for ide_id, label in ide_options + ) + # -- Topbar -- topbar_html = ( '
    ' @@ -195,6 +228,11 @@ def _tab_badge(count: int) -> str: f'
    {ctx.brand_meta}
    ' "
    " '
    ' + '
    ' + '' + f'
    ' f'' f'
    " @@ -411,6 +419,7 @@ def _render_group_html( section_novelty: Mapping[str, str], ) -> str: group_id = f"{section_id}-{group_index}" + finding_id = clone_group_id(_clone_kind_for_section(section_id), group_key) search_parts: list[str] = [str(group_key)] for item in items: search_parts.append(str(item.get("qualname", ""))) @@ -463,8 +472,10 @@ def _render_group_html( explanation_html = _render_group_explanation(block_meta) if block_meta else "" return ( - f'
    ' diff --git a/codeclone/_html_report/_sections/_coupling.py b/codeclone/_html_report/_sections/_coupling.py index 224e8cc..cfc7bac 100644 --- a/codeclone/_html_report/_sections/_coupling.py +++ b/codeclone/_html_report/_sections/_coupling.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Coupling + Cohesion panel renderer (unified Quality tab).""" diff --git a/codeclone/_html_report/_sections/_dead_code.py b/codeclone/_html_report/_sections/_dead_code.py index ca87f42..1823128 100644 --- a/codeclone/_html_report/_sections/_dead_code.py +++ b/codeclone/_html_report/_sections/_dead_code.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Dead Code panel renderer.""" diff --git a/codeclone/_html_report/_sections/_dependencies.py b/codeclone/_html_report/_sections/_dependencies.py index 67d5917..3258f9d 100644 --- a/codeclone/_html_report/_sections/_dependencies.py +++ b/codeclone/_html_report/_sections/_dependencies.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Dependencies panel renderer (SVG graph + tables).""" diff --git a/codeclone/_html_report/_sections/_meta.py b/codeclone/_html_report/_sections/_meta.py index 6c0fcde..a29e494 100644 --- a/codeclone/_html_report/_sections/_meta.py +++ b/codeclone/_html_report/_sections/_meta.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Report Provenance / metadata panel renderer.""" diff --git a/codeclone/_html_report/_sections/_overview.py b/codeclone/_html_report/_sections/_overview.py index be3b811..341a054 100644 --- a/codeclone/_html_report/_sections/_overview.py +++ b/codeclone/_html_report/_sections/_overview.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Overview panel renderer.""" @@ -6,10 +9,11 @@ from __future__ import annotations import math +from collections.abc import Mapping from typing import TYPE_CHECKING from ... 
import _coerce -from ..._html_badges import _stat_card +from ..._html_badges import _source_kind_badge_html, _stat_card from ..._html_escape import _escape_html from .._components import ( Tone, @@ -28,6 +32,36 @@ _as_mapping = _coerce.as_mapping _as_sequence = _coerce.as_sequence +_DIRECTORY_BUCKET_LABELS: dict[str, str] = { + "all": "All Findings", + "clones": "Clone Groups", + "structural": "Structural Findings", + "complexity": "High Complexity", + "cohesion": "Low Cohesion", + "coupling": "High Coupling", + "dead_code": "Dead Code", + "dependency": "Dependency Cycles", +} +_DIRECTORY_BUCKET_ORDER: tuple[str, ...] = ( + "all", + "clones", + "structural", + "complexity", + "cohesion", + "coupling", + "dead_code", + "dependency", +) +_DIRECTORY_KIND_LABELS: dict[str, str] = { + "clones": "clones", + "structural": "structural", + "dead_code": "dead code", + "complexity": "complexity", + "cohesion": "cohesion", + "coupling": "coupling", + "dependency": "dependency", +} + def _health_gauge_html( score: float, grade: str, *, health_delta: int | None = None @@ -344,6 +378,130 @@ def _issue_breakdown_html( return '
    ' + "".join(parts) + "
    " +def _dir_meta_span(val: int, label: str) -> str: + return f"{val} {_escape_html(label)}" + + +_DIR_META_SEP = '\u00b7' + + +def _directory_kind_meta_parts( + kind_breakdown: Mapping[str, object], + *, + total_groups: int, +) -> list[str]: + kind_rows = [ + (str(kind), _as_int(count)) + for kind, count in kind_breakdown.items() + if _as_int(count) > 0 + ] + kind_rows.sort(key=lambda item: (-item[1], item[0])) + if len(kind_rows) <= 1: + return [] + parts: list[str] = [] + for kind, count in kind_rows[:2]: + parts.append(_dir_meta_span(count, _DIRECTORY_KIND_LABELS.get(kind, kind))) + return parts + + +def _directory_hotspot_bucket_body(bucket: str, payload: Mapping[str, object]) -> str: + items = list(map(_as_mapping, _as_sequence(payload.get("items")))) + if not items: + return "" + returned = _as_int(payload.get("returned")) + total_directories = _as_int(payload.get("total_directories")) + has_more = bool(payload.get("has_more")) + subtitle_html = "" + if has_more and returned > 0 and total_directories > returned: + subtitle_html = ( + '
    ' + f"top {returned} of {total_directories} directories" + "
    " + ) + rows: list[str] = [] + cumulative = 0.0 + for item in items: + path = str(item.get("path", ".")).strip() or "." + source_scope = _as_mapping(item.get("source_scope")) + dominant_kind = ( + str(source_scope.get("dominant_kind", "other")).strip() or "other" + ) + share_pct = _as_float(item.get("share_pct")) + groups = _as_int(item.get("finding_groups")) + affected = _as_int(item.get("affected_items")) + files = _as_int(item.get("files")) + + meta_parts = [ + _dir_meta_span(groups, "groups"), + _dir_meta_span(affected, "items"), + _dir_meta_span(files, "files"), + ] + if bucket == "all": + meta_parts.extend( + _directory_kind_meta_parts( + _as_mapping(item.get("kind_breakdown")), + total_groups=groups, + ) + ) + + path_html = _escape_html(path).replace("/", "/") + + prev_pct = min(cumulative, 100.0) + cur_pct = min(share_pct, 100.0 - prev_pct) + cumulative += share_pct + + bar_html = ( + '' + f'' + f'' + "" + ) + + rows.append( + '
    ' + '
    ' + f"{path_html}" + f" {_source_kind_badge_html(dominant_kind)}" + "
    " + f'
    {bar_html}' + f'{share_pct:.1f}%' + "
    " + f'
    {_DIR_META_SEP.join(meta_parts)}
    ' + "
    " + ) + return subtitle_html + '
    ' + "".join(rows) + "
    " + + +def _directory_hotspots_section(ctx: ReportContext) -> str: + directory_hotspots = _as_mapping(ctx.overview_data.get("directory_hotspots")) + if not directory_hotspots: + return "" + cards: list[str] = [] + for bucket in _DIRECTORY_BUCKET_ORDER: + payload = _as_mapping(directory_hotspots.get(bucket)) + body_html = _directory_hotspot_bucket_body(bucket, payload) + if not body_html: + continue + cards.append( + overview_summary_item_html( + label=_DIRECTORY_BUCKET_LABELS.get(bucket, bucket), + body_html=body_html, + ) + ) + if not cards: + return "" + return ( + '
    ' + + overview_cluster_header( + "Hotspots by Directory", + "Directories with the highest concentration of findings by category.", + ) + + '
    ' + + "".join(cards) + + "
    " + ) + + def render_overview_panel(ctx: ReportContext) -> str: """Build the Overview tab panel HTML.""" complexity_summary = _as_mapping(ctx.complexity_map.get("summary")) @@ -619,6 +777,7 @@ def _baselined_detail( + "
    " + "" + executive + + _directory_hotspots_section(ctx) + _analytics_section(ctx) ) diff --git a/codeclone/_html_report/_sections/_structural.py b/codeclone/_html_report/_sections/_structural.py index 4f09a52..9a9f5c8 100644 --- a/codeclone/_html_report/_sections/_structural.py +++ b/codeclone/_html_report/_sections/_structural.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Structural Findings panel — thin wrapper delegating to report/findings.py.""" diff --git a/codeclone/_html_report/_sections/_suggestions.py b/codeclone/_html_report/_sections/_suggestions.py index a643229..be1e33b 100644 --- a/codeclone/_html_report/_sections/_suggestions.py +++ b/codeclone/_html_report/_sections/_suggestions.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Suggestions panel renderer.""" @@ -11,7 +14,7 @@ from ... 
import _coerce from ..._html_badges import _tab_empty from ..._html_data_attrs import _build_data_attrs -from ..._html_escape import _escape_html +from ..._html_escape import _escape_attr, _escape_html from ..._html_filters import SPREAD_OPTIONS, _render_select from ...domain.findings import ( CATEGORY_CLONE, @@ -34,6 +37,11 @@ from .._context import ReportContext _as_int = _coerce.as_int +_CLONE_KIND_CHIP_LABELS: dict[str, str] = { + "function": "Function", + "block": "Block", + "segment": "Segment", +} def _render_fact_summary(raw: str) -> str: @@ -71,6 +79,34 @@ def _format_source_breakdown( return " \u00b7 ".join(f"{source_kind_label(k)} {c}" for k, c in rows if c > 0) +def _suggestion_context_labels(s: Suggestion) -> tuple[str, ...]: + labels: list[str] = [] + source_label = source_kind_label(s.source_kind) + if source_label: + labels.append(source_label) + if s.category == CATEGORY_CLONE: + kind_label = _CLONE_KIND_CHIP_LABELS.get(s.finding_kind.strip().lower()) + if kind_label: + labels.append(kind_label) + if s.clone_type: + labels.append(s.clone_type) + return tuple(labels) + category_label = s.category.replace("_", " ").title() + if category_label: + labels.append(category_label) + return tuple(labels) + + +def _priority_badge_label(priority: float) -> str: + return f"Priority {priority:g}" + + +def _spread_label(*, spread_functions: int, spread_files: int) -> str: + function_word = "function" if spread_functions == 1 else "functions" + file_word = "file" if spread_files == 1 else "files" + return f"{spread_functions} {function_word} \u00b7 {spread_files} {file_word}" + + def _render_card(s: Suggestion, ctx: ReportContext) -> str: actionable = "true" if s.severity != "info" else "false" spread_bucket = "high" if s.spread_files > 1 or s.spread_functions > 1 else "low" @@ -78,18 +114,11 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: facts_source = _escape_html(breakdown_text or source_kind_label(s.source_kind)) facts_location = 
_escape_html(s.location_label or s.location) - # Context chips — more visible than a single muted line - ctx_chips: list[str] = [] - sk = source_kind_label(s.source_kind) - if sk: - ctx_chips.append(f'{_escape_html(sk)}') - cat = s.category.replace("_", " ") - if cat: - ctx_chips.append(f'{_escape_html(cat)}') - if s.clone_type: - ctx_chips.append( - f'{_escape_html(s.clone_type)}' - ) + # Context chips stay compact and specific: source scope first, then kind. + ctx_chips = [ + f'{_escape_html(label)}' + for label in _suggestion_context_labels(s) + ] ctx_html = f'
    {"".join(ctx_chips)}
    ' # Next step — primary actionable CTA @@ -106,18 +135,22 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: # Effort badge — color-coded effort_cls = f" suggestion-effort--{_escape_html(s.effort)}" - - # Priority — clean display (drop trailing zeros) - priority_str = f"{s.priority:g}" + effort_label = s.effort.title() + priority_label = _priority_badge_label(s.priority) + spread_label = _spread_label( + spread_functions=s.spread_functions, + spread_files=s.spread_files, + ) # Locations inside details locs_html = "" if s.representative_locations: locs_items = "".join( '
  • ' + f'' f"{_escape_html(loc.relative_path)}" f':{loc.start_line}\u2013{loc.end_line}' - "" + "" f'{_escape_html(ctx.bare_qualname(loc.qualname, loc.filepath))}' "
  • " for loc in s.representative_locations @@ -151,9 +184,9 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: f'{_escape_html(s.severity)}' f'{_escape_html(s.title)}' '' - f'{_escape_html(s.effort)}' - f'P{priority_str}' - f'{s.spread_functions} fn / {s.spread_files} files' + f'{_escape_html(effort_label)}' + f'{_escape_html(priority_label)}' + f'{_escape_html(spread_label)}' "" # -- body -- '
    ' @@ -170,7 +203,7 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: '
    Facts
    ' '
    ' f"
    Finding
    {_escape_html(s.fact_kind or s.category)}
    " - f"
    Spread
    {s.spread_functions} fn / {s.spread_files} files
    " + f"
    Spread
    {_escape_html(spread_label)}
    " f"
    Source
    {facts_source}
    " f"
    Scope
    {facts_location}
    " "
    " @@ -179,7 +212,7 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: '
    ' f"
    Severity
    {sev_dd}
    " f"
    Confidence
    {_escape_html(s.confidence)}
    " - f"
    Priority
    {priority_str}
    " + f"
    Priority
    {_escape_html(priority_label)}
    " f"
    Family
    {_escape_html(s.finding_family)}
    " "
    " "" diff --git a/codeclone/_html_report/_tables.py b/codeclone/_html_report/_tables.py index 8d8a1fd..14bf7aa 100644 --- a/codeclone/_html_report/_tables.py +++ b/codeclone/_html_report/_tables.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Generic table renderer for metric/finding tables.""" @@ -103,7 +106,9 @@ def _td(col_idx: int, cell: str) -> str: if h in _PATH_HEADERS and ctx is not None: short = ctx.relative_path(cell) return ( - f'{_escape_html(short)}' + f'' + f'' + f"{_escape_html(short)}" ) return f"{_escape_html(cell)}" diff --git a/codeclone/_html_report/_tabs.py b/codeclone/_html_report/_tabs.py index 54870ca..d9241a8 100644 --- a/codeclone/_html_report/_tabs.py +++ b/codeclone/_html_report/_tabs.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Tab/subtab rendering helpers.""" diff --git a/codeclone/_html_snippets.py b/codeclone/_html_snippets.py index 9ae7e40..dac7eec 100644 --- a/codeclone/_html_snippets.py +++ b/codeclone/_html_snippets.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/_schema_validation.py b/codeclone/_schema_validation.py index 43280c0..e90404f 100644 --- a/codeclone/_schema_validation.py +++ b/codeclone/_schema_validation.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/baseline.py b/codeclone/baseline.py index c249539..53d4a37 100644 --- a/codeclone/baseline.py +++ b/codeclone/baseline.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -9,6 +12,7 @@ import os import re import sys +import tempfile from datetime import datetime, timezone from enum import Enum from pathlib import Path @@ -23,7 +27,7 @@ from .errors import BaselineValidationError if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import Collection, Mapping # Any: baseline JSON parsing/serialization boundary. Values are validated # and narrowed before entering compatibility/integrity checks. 
@@ -416,13 +420,21 @@ def diff( def _atomic_write_json(path: Path, payload: dict[str, Any]) -> None: - tmp_path = path.with_name(f"{path.name}.tmp") data = json.dumps(payload, indent=2, ensure_ascii=False) + "\n" - with tmp_path.open("wb") as tmp_file: - tmp_file.write(data.encode("utf-8")) - tmp_file.flush() - os.fsync(tmp_file.fileno()) - os.replace(tmp_path, path) + fd_num, tmp_name = tempfile.mkstemp( + dir=path.parent, + suffix=".tmp", + ) + tmp_path = Path(tmp_name) + try: + with os.fdopen(fd_num, "wb") as fd: + fd.write(data.encode("utf-8")) + fd.flush() + os.fsync(fd.fileno()) + os.replace(tmp_path, path) + except BaseException: + tmp_path.unlink(missing_ok=True) + raise def _safe_stat_size(path: Path) -> int: @@ -574,8 +586,8 @@ def _baseline_payload( sorted_functions = sorted(functions) sorted_blocks = sorted(blocks) payload_sha256 = _compute_payload_sha256( - functions=set(sorted_functions), - blocks=set(sorted_blocks), + functions=sorted_functions, + blocks=sorted_blocks, fingerprint_version=resolved_fingerprint, python_tag=resolved_python_tag, ) @@ -601,8 +613,8 @@ def _baseline_payload( def _compute_payload_sha256( *, - functions: set[str], - blocks: set[str], + functions: Collection[str], + blocks: Collection[str], fingerprint_version: str, python_tag: str, ) -> str: diff --git a/codeclone/blockhash.py b/codeclone/blockhash.py deleted file mode 100644 index 5eb8bcc..0000000 --- a/codeclone/blockhash.py +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import ast -import hashlib -from typing import TYPE_CHECKING - -from .normalize import AstNormalizer, NormalizationConfig - -if TYPE_CHECKING: - from collections.abc import Sequence - - -def _normalized_stmt_dump(stmt: ast.stmt, normalizer: AstNormalizer) -> str: - normalized = normalizer.visit(stmt) - assert isinstance(normalized, ast.AST) - return ast.dump(normalized, annotate_fields=True, 
include_attributes=False) - - -def stmt_hashes(statements: Sequence[ast.stmt], cfg: NormalizationConfig) -> list[str]: - normalizer = AstNormalizer(cfg) - return [ - hashlib.sha1( - _normalized_stmt_dump(stmt, normalizer).encode("utf-8") - ).hexdigest() - for stmt in statements - ] diff --git a/codeclone/blocks.py b/codeclone/blocks.py index 2ccad47..9089ff1 100644 --- a/codeclone/blocks.py +++ b/codeclone/blocks.py @@ -1,13 +1,16 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations from typing import TYPE_CHECKING -from .blockhash import stmt_hashes from .fingerprint import sha1 from .models import BlockUnit, SegmentUnit +from .normalize import stmt_hashes if TYPE_CHECKING: import ast @@ -49,10 +52,11 @@ def extract_blocks( for i in range(len(stmt_hash_rows) - block_size + 1): start = getattr(body[i], "lineno", None) end = getattr(body[i + block_size - 1], "end_lineno", None) - if not start or not end: - continue - - if last_start is not None and start - last_start < min_line_distance: + if ( + not start + or not end + or (last_start is not None and start - last_start < min_line_distance) + ): continue bh = "|".join(stmt_hash_rows[i : i + block_size]) diff --git a/codeclone/cache.py b/codeclone/cache.py index 18b9b44..3bee46f 100644 --- a/codeclone/cache.py +++ b/codeclone/cache.py @@ -1,18 +1,48 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations -import hashlib -import hmac -import json import os from collections.abc import Collection from enum import Enum +from json import JSONDecodeError from pathlib import Path from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, TypeVar, cast from .baseline import current_python_tag +from .cache_io import ( + as_int_or_none as _cache_as_int, +) +from .cache_io import ( + as_object_list as _cache_as_list, +) +from .cache_io import ( + as_str_dict as _cache_as_str_dict, +) +from .cache_io import ( + as_str_or_none as _cache_as_str, +) +from .cache_io import ( + read_json_document, + sign_cache_payload, + verify_cache_payload_signature, + write_json_document_atomically, +) +from .cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime +from .cache_segments import ( + SegmentReportProjection as _SegmentReportProjection, +) +from .cache_segments import ( + build_segment_report_projection as _build_segment_report_projection, +) +from .cache_segments import ( + decode_segment_report_projection, + encode_segment_report_projection, +) from .contracts import BASELINE_FINGERPRINT_VERSION, CACHE_VERSION from .errors import CacheError from .models import ( @@ -34,6 +64,13 @@ if TYPE_CHECKING: from collections.abc import Callable, Mapping, Sequence +SegmentReportProjection = _SegmentReportProjection +build_segment_report_projection = _build_segment_report_projection +_as_str = _cache_as_str +_as_int = _cache_as_int +_as_list = _cache_as_list +_as_str_dict = _cache_as_str_dict + MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 LEGACY_CACHE_SECRET_FILENAME = ".cache_secret" _DEFAULT_WIRE_UNIT_FLOW_PROFILES = ( @@ -163,67 +200,6 @@ class CacheData(TypedDict): files: dict[str, CacheEntry] -class SegmentReportProjection(TypedDict): - digest: str - suppressed: int - groups: dict[str, list[SegmentDict]] - - -def build_segment_report_projection( - *, - digest: 
str, - suppressed: int, - groups: Mapping[str, Sequence[Mapping[str, object]]], -) -> SegmentReportProjection: - normalized_groups: dict[str, list[SegmentDict]] = {} - for group_key in sorted(groups): - normalized_items: list[SegmentDict] = [] - for raw_item in sorted( - groups[group_key], - key=lambda item: ( - str(item.get("filepath", "")), - str(item.get("qualname", "")), - _as_int(item.get("start_line")) or 0, - _as_int(item.get("end_line")) or 0, - ), - ): - segment_hash = _as_str(raw_item.get("segment_hash")) - segment_sig = _as_str(raw_item.get("segment_sig")) - filepath = _as_str(raw_item.get("filepath")) - qualname = _as_str(raw_item.get("qualname")) - start_line = _as_int(raw_item.get("start_line")) - end_line = _as_int(raw_item.get("end_line")) - size = _as_int(raw_item.get("size")) - if ( - segment_hash is None - or segment_sig is None - or filepath is None - or qualname is None - or start_line is None - or end_line is None - or size is None - ): - continue - normalized_items.append( - SegmentGroupItem( - segment_hash=segment_hash, - segment_sig=segment_sig, - filepath=filepath, - qualname=qualname, - start_line=start_line, - end_line=end_line, - size=size, - ) - ) - if normalized_items: - normalized_groups[group_key] = normalized_items - return { - "digest": digest, - "suppressed": max(0, int(suppressed)), - "groups": normalized_groups, - } - - def _normalize_cached_structural_group( group: StructuralFindingGroupDict, *, @@ -421,12 +397,6 @@ def _reject_version_mismatch(self, version: str) -> CacheData | None: schema_version=version, ) - @staticmethod - def _sign_data(data: Mapping[str, object]) -> str: - """Create deterministic SHA-256 signature for canonical payload data.""" - canonical = _canonical_json(data) - return hashlib.sha256(canonical.encode("utf-8")).hexdigest() - def load(self) -> None: try: exists = self.path.exists() @@ -455,7 +425,7 @@ def load(self) -> None: ) return - raw_obj: object = json.loads(self.path.read_text("utf-8")) + 
raw_obj = read_json_document(self.path) parsed = self._load_and_validate(raw_obj) if parsed is None: return @@ -470,7 +440,7 @@ def load(self) -> None: f"Cache unreadable; ignoring cache: {e}", status=CacheStatus.UNREADABLE, ) - except json.JSONDecodeError: + except JSONDecodeError: self._ignore_cache( "Cache corrupted; ignoring cache.", status=CacheStatus.INVALID_JSON, @@ -499,8 +469,7 @@ def _load_and_validate(self, raw_obj: object) -> CacheData | None: if sig is None or payload is None: return self._reject_invalid_cache_format(schema_version=version) - expected_sig = self._sign_data(payload) - if not hmac.compare_digest(sig, expected_sig): + if not verify_cache_payload_signature(payload, sig): return self._reject_cache_load( "Cache signature mismatch; ignoring cache.", status=CacheStatus.INTEGRITY_FAILED, @@ -556,13 +525,14 @@ def _load_and_validate(self, raw_obj: object) -> CacheData | None: parsed_files: dict[str, CacheEntry] = {} for wire_path, file_entry_obj in files_dict.items(): - runtime_path = self._runtime_filepath_from_wire(wire_path) + runtime_path = runtime_filepath_from_wire(wire_path, root=self.root) parsed_entry = self._decode_entry(file_entry_obj, runtime_path) if parsed_entry is None: return self._reject_invalid_cache_format(schema_version=version) parsed_files[runtime_path] = _canonicalize_cache_entry(parsed_entry) - self.segment_report_projection = self._decode_segment_report_projection( - payload.get("sr") + self.segment_report_projection = decode_segment_report_projection( + payload.get("sr"), + root=self.root, ) self.cache_schema_version = version @@ -578,10 +548,10 @@ def save(self) -> None: if not self._dirty: return try: - self.path.parent.mkdir(parents=True, exist_ok=True) wire_files: dict[str, object] = {} wire_map = { - rp: self._wire_filepath_from_runtime(rp) for rp in self.data["files"] + rp: wire_filepath_from_runtime(rp, root=self.root) + for rp in self.data["files"] } for runtime_path in sorted(self.data["files"], 
key=wire_map.__getitem__): entry = self.get_file_entry(runtime_path) @@ -595,22 +565,18 @@ def save(self) -> None: "ap": self.analysis_profile, "files": wire_files, } - segment_projection = self._encode_segment_report_projection() + segment_projection = encode_segment_report_projection( + self.segment_report_projection, + root=self.root, + ) if segment_projection is not None: payload["sr"] = segment_projection signed_doc = { "v": self._CACHE_VERSION, "payload": payload, - "sig": self._sign_data(payload), + "sig": sign_cache_payload(payload), } - - tmp_path = self.path.with_name(f"{self.path.name}.tmp") - data = _canonical_json(signed_doc).encode("utf-8") - with tmp_path.open("wb") as tmp_file: - tmp_file.write(data) - tmp_file.flush() - os.fsync(tmp_file.fileno()) - os.replace(tmp_path, self.path) + write_json_document_atomically(self.path, signed_doc) self._dirty = False self.data["version"] = self._CACHE_VERSION @@ -629,131 +595,6 @@ def _decode_entry(value: object, filepath: str) -> CacheEntry | None: def _encode_entry(entry: CacheEntry) -> dict[str, object]: return _encode_wire_file_entry(entry) - def _wire_filepath_from_runtime(self, runtime_filepath: str) -> str: - runtime_path = Path(runtime_filepath) - if self.root is None: - return runtime_path.as_posix() - - try: - relative = runtime_path.relative_to(self.root) - return relative.as_posix() - except ValueError: - pass - - try: - relative = runtime_path.resolve().relative_to(self.root.resolve()) - return relative.as_posix() - except OSError: - return runtime_path.as_posix() - except ValueError: - return runtime_path.as_posix() - - def _runtime_filepath_from_wire(self, wire_filepath: str) -> str: - wire_path = Path(wire_filepath) - if self.root is None or wire_path.is_absolute(): - return str(wire_path) - - combined = self.root / wire_path - try: - return str(combined.resolve(strict=False)) - except OSError: - return str(combined) - - def _decode_segment_report_projection( - self, - value: object, - ) -> 
SegmentReportProjection | None: - obj = _as_str_dict(value) - if obj is None: - return None - digest = _as_str(obj.get("d")) - suppressed = _as_int(obj.get("s")) - groups_raw = _as_list(obj.get("g")) - if digest is None or suppressed is None or groups_raw is None: - return None - groups: dict[str, list[SegmentDict]] = {} - for group_row in groups_raw: - group_list = _as_list(group_row) - if group_list is None or len(group_list) != 2: - return None - group_key = _as_str(group_list[0]) - items_raw = _as_list(group_list[1]) - if group_key is None or items_raw is None: - return None - items: list[SegmentDict] = [] - for item_raw in items_raw: - item_list = _as_list(item_raw) - if item_list is None or len(item_list) != 7: - return None - wire_filepath = _as_str(item_list[0]) - qualname = _as_str(item_list[1]) - start_line = _as_int(item_list[2]) - end_line = _as_int(item_list[3]) - size = _as_int(item_list[4]) - segment_hash = _as_str(item_list[5]) - segment_sig = _as_str(item_list[6]) - if ( - wire_filepath is None - or qualname is None - or start_line is None - or end_line is None - or size is None - or segment_hash is None - or segment_sig is None - ): - return None - items.append( - SegmentGroupItem( - segment_hash=segment_hash, - segment_sig=segment_sig, - filepath=self._runtime_filepath_from_wire(wire_filepath), - qualname=qualname, - start_line=start_line, - end_line=end_line, - size=size, - ) - ) - groups[group_key] = items - return { - "digest": digest, - "suppressed": max(0, suppressed), - "groups": groups, - } - - def _encode_segment_report_projection(self) -> dict[str, object] | None: - projection = self.segment_report_projection - if projection is None: - return None - groups_rows: list[list[object]] = [] - for group_key in sorted(projection["groups"]): - items = sorted( - projection["groups"][group_key], - key=lambda item: ( - item["filepath"], - item["qualname"], - item["start_line"], - item["end_line"], - ), - ) - encoded_items = [ - [ - 
self._wire_filepath_from_runtime(item["filepath"]), - item["qualname"], - item["start_line"], - item["end_line"], - item["size"], - item["segment_hash"], - item["segment_sig"], - ] - for item in items - ] - groups_rows.append([group_key, encoded_items]) - return { - "d": projection["digest"], - "s": max(0, int(projection["suppressed"])), - "g": groups_rows, - } - def _store_canonical_file_entry( self, *, @@ -772,8 +613,8 @@ def get_file_entry(self, filepath: str) -> CacheEntry | None: runtime_lookup_key = filepath entry_obj = self.data["files"].get(runtime_lookup_key) if entry_obj is None: - wire_key = self._wire_filepath_from_runtime(filepath) - runtime_lookup_key = self._runtime_filepath_from_wire(wire_key) + wire_key = wire_filepath_from_runtime(filepath, root=self.root) + runtime_lookup_key = runtime_filepath_from_wire(wire_key, root=self.root) entry_obj = self.data["files"].get(runtime_lookup_key) if entry_obj is None: @@ -858,8 +699,9 @@ def put_file_entry( file_metrics: FileMetrics | None = None, structural_findings: list[StructuralFindingGroup] | None = None, ) -> None: - runtime_path = self._runtime_filepath_from_wire( - self._wire_filepath_from_runtime(filepath) + runtime_path = runtime_filepath_from_wire( + wire_filepath_from_runtime(filepath, root=self.root), + root=self.root, ) unit_rows = [_unit_dict_from_model(unit, runtime_path) for unit in units] @@ -953,22 +795,6 @@ def _empty_cache_data( ) -def _canonical_json(data: object) -> str: - return json.dumps(data, sort_keys=True, separators=(",", ":"), ensure_ascii=False) - - -def _as_str(value: object) -> str | None: - return value if isinstance(value, str) else None - - -def _as_int(value: object) -> int | None: - return value if isinstance(value, int) else None - - -def _as_list(value: object) -> list[object] | None: - return value if isinstance(value, list) else None - - def _as_risk_literal(value: object) -> Literal["low", "medium", "high"] | None: match value: case "low": @@ -1181,6 +1007,13 @@ 
def _as_typed_string_list(value: object) -> list[str] | None: return _as_typed_list(value, predicate=lambda item: isinstance(item, str)) +def _normalized_optional_string_list(value: object) -> list[str] | None: + items = _as_typed_string_list(value) + if not items: + return None + return sorted(set(items)) + + def _is_canonical_cache_entry(value: object) -> TypeGuard[CacheEntry]: return isinstance(value, dict) and _has_cache_entry_container_shape(value) @@ -1311,15 +1144,6 @@ def _decode_wire_qualname_span_size( return qualname, start_line, end_line, size -def _as_str_dict(value: object) -> dict[str, object] | None: - if not isinstance(value, dict): - return None - for key in value: - if not isinstance(key, str): - return None - return value - - def _as_analysis_profile(value: object) -> AnalysisProfile | None: obj = _as_str_dict(value) if obj is None: @@ -2135,6 +1959,15 @@ def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: ), ) if class_metrics: + coupled_classes_rows: list[list[object]] = [] + + def _append_coupled_classes_row(metric: ClassMetricsDict) -> None: + coupled_classes = _normalized_optional_string_list( + metric.get("coupled_classes", []) + ) + if coupled_classes: + coupled_classes_rows.append([metric["qualname"], coupled_classes]) + wire["cm"] = [ [ metric["qualname"], @@ -2149,15 +1982,8 @@ def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: ] for metric in class_metrics ] - coupled_classes_rows = [] for metric in class_metrics: - coupled_classes_raw = metric.get("coupled_classes", []) - if not _is_string_list(coupled_classes_raw): - continue - coupled_classes = sorted(set(coupled_classes_raw)) - if not coupled_classes: - continue - coupled_classes_rows.append([metric["qualname"], coupled_classes]) + _append_coupled_classes_row(metric) if coupled_classes_rows: wire["cc"] = coupled_classes_rows @@ -2199,10 +2025,9 @@ def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: candidate["kind"], ] 
suppressed_rules = candidate.get("suppressed_rules", []) - if _is_string_list(suppressed_rules): - normalized_rules = sorted(set(suppressed_rules)) - if normalized_rules: - encoded.append(normalized_rules) + normalized_rules = _normalized_optional_string_list(suppressed_rules) + if normalized_rules: + encoded.append(normalized_rules) encoded_dead_candidates.append(encoded) wire["dc"] = encoded_dead_candidates diff --git a/codeclone/cache_io.py b/codeclone/cache_io.py new file mode 100644 index 0000000..ecffc83 --- /dev/null +++ b/codeclone/cache_io.py @@ -0,0 +1,71 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import hashlib +import hmac +import json +import os +import tempfile +from collections.abc import Mapping +from pathlib import Path + + +def as_str_or_none(value: object) -> str | None: + return value if isinstance(value, str) else None + + +def as_int_or_none(value: object) -> int | None: + return value if isinstance(value, int) else None + + +def as_object_list(value: object) -> list[object] | None: + return value if isinstance(value, list) else None + + +def as_str_dict(value: object) -> dict[str, object] | None: + if not isinstance(value, dict): + return None + if not all(isinstance(key, str) for key in value): + return None + return value + + +def canonical_json(data: object) -> str: + return json.dumps(data, sort_keys=True, separators=(",", ":"), ensure_ascii=False) + + +def sign_cache_payload(data: Mapping[str, object]) -> str: + canonical = canonical_json(data) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest() + + +def verify_cache_payload_signature( + payload: Mapping[str, object], + signature: str, +) -> bool: + return hmac.compare_digest(signature, 
sign_cache_payload(payload)) + + +def read_json_document(path: Path) -> object: + return json.loads(path.read_text("utf-8")) + + +def write_json_document_atomically(path: Path, document: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + data = canonical_json(document).encode("utf-8") + fd_num, tmp_name = tempfile.mkstemp(dir=path.parent, suffix=".tmp") + tmp_path = Path(tmp_name) + try: + with os.fdopen(fd_num, "wb") as fd: + fd.write(data) + fd.flush() + os.fsync(fd.fileno()) + os.replace(tmp_path, path) + except BaseException: + tmp_path.unlink(missing_ok=True) + raise diff --git a/codeclone/cache_paths.py b/codeclone/cache_paths.py new file mode 100644 index 0000000..8de7c63 --- /dev/null +++ b/codeclone/cache_paths.py @@ -0,0 +1,49 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from pathlib import Path + + +def wire_filepath_from_runtime( + runtime_filepath: str, + *, + root: Path | None, +) -> str: + runtime_path = Path(runtime_filepath) + if root is None: + return runtime_path.as_posix() + + try: + relative = runtime_path.relative_to(root) + return relative.as_posix() + except ValueError: + pass + + try: + relative = runtime_path.resolve().relative_to(root.resolve()) + return relative.as_posix() + except OSError: + return runtime_path.as_posix() + except ValueError: + return runtime_path.as_posix() + + +def runtime_filepath_from_wire( + wire_filepath: str, + *, + root: Path | None, +) -> str: + wire_path = Path(wire_filepath) + if root is None or wire_path.is_absolute(): + return str(wire_path) + + combined = root / wire_path + try: + return str(combined.resolve(strict=False)) + except OSError: + return str(combined) diff --git a/codeclone/cache_segments.py 
b/codeclone/cache_segments.py new file mode 100644 index 0000000..a771e51 --- /dev/null +++ b/codeclone/cache_segments.py @@ -0,0 +1,184 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from pathlib import Path +from typing import TypedDict + +from .cache_io import ( + as_int_or_none, + as_object_list, + as_str_dict, + as_str_or_none, +) +from .cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime +from .models import SegmentGroupItem + +SegmentDict = SegmentGroupItem + + +class SegmentReportProjection(TypedDict): + digest: str + suppressed: int + groups: dict[str, list[SegmentDict]] + + +def build_segment_report_projection( + *, + digest: str, + suppressed: int, + groups: Mapping[str, Sequence[Mapping[str, object]]], +) -> SegmentReportProjection: + normalized_groups: dict[str, list[SegmentDict]] = {} + for group_key in sorted(groups): + normalized_items: list[SegmentDict] = [] + for raw_item in sorted( + groups[group_key], + key=lambda item: ( + str(item.get("filepath", "")), + str(item.get("qualname", "")), + as_int_or_none(item.get("start_line")) or 0, + as_int_or_none(item.get("end_line")) or 0, + ), + ): + segment_hash = as_str_or_none(raw_item.get("segment_hash")) + segment_sig = as_str_or_none(raw_item.get("segment_sig")) + filepath = as_str_or_none(raw_item.get("filepath")) + qualname = as_str_or_none(raw_item.get("qualname")) + start_line = as_int_or_none(raw_item.get("start_line")) + end_line = as_int_or_none(raw_item.get("end_line")) + size = as_int_or_none(raw_item.get("size")) + if ( + segment_hash is None + or segment_sig is None + or filepath is None + or qualname is None + or start_line is None + or end_line is None 
+ or size is None + ): + continue + normalized_items.append( + SegmentGroupItem( + segment_hash=segment_hash, + segment_sig=segment_sig, + filepath=filepath, + qualname=qualname, + start_line=start_line, + end_line=end_line, + size=size, + ) + ) + if normalized_items: + normalized_groups[group_key] = normalized_items + return { + "digest": digest, + "suppressed": max(0, int(suppressed)), + "groups": normalized_groups, + } + + +def decode_segment_report_projection( + value: object, + *, + root: Path | None, +) -> SegmentReportProjection | None: + obj = as_str_dict(value) + if obj is None: + return None + digest = as_str_or_none(obj.get("d")) + suppressed = as_int_or_none(obj.get("s")) + groups_raw = as_object_list(obj.get("g")) + if digest is None or suppressed is None or groups_raw is None: + return None + groups: dict[str, list[SegmentDict]] = {} + for group_row in groups_raw: + group_list = as_object_list(group_row) + if group_list is None or len(group_list) != 2: + return None + group_key = as_str_or_none(group_list[0]) + items_raw = as_object_list(group_list[1]) + if group_key is None or items_raw is None: + return None + items: list[SegmentDict] = [] + for item_raw in items_raw: + item_list = as_object_list(item_raw) + if item_list is None or len(item_list) != 7: + return None + wire_filepath = as_str_or_none(item_list[0]) + qualname = as_str_or_none(item_list[1]) + start_line = as_int_or_none(item_list[2]) + end_line = as_int_or_none(item_list[3]) + size = as_int_or_none(item_list[4]) + segment_hash = as_str_or_none(item_list[5]) + segment_sig = as_str_or_none(item_list[6]) + if ( + wire_filepath is None + or qualname is None + or start_line is None + or end_line is None + or size is None + or segment_hash is None + or segment_sig is None + ): + return None + items.append( + SegmentGroupItem( + segment_hash=segment_hash, + segment_sig=segment_sig, + filepath=runtime_filepath_from_wire(wire_filepath, root=root), + qualname=qualname, + start_line=start_line, + 
end_line=end_line, + size=size, + ) + ) + groups[group_key] = items + return { + "digest": digest, + "suppressed": max(0, suppressed), + "groups": groups, + } + + +def encode_segment_report_projection( + projection: SegmentReportProjection | None, + *, + root: Path | None, +) -> dict[str, object] | None: + if projection is None: + return None + groups_rows: list[list[object]] = [] + for group_key in sorted(projection["groups"]): + items = sorted( + projection["groups"][group_key], + key=lambda item: ( + item["filepath"], + item["qualname"], + item["start_line"], + item["end_line"], + ), + ) + encoded_items = [ + [ + wire_filepath_from_runtime(item["filepath"], root=root), + item["qualname"], + item["start_line"], + item["end_line"], + item["size"], + item["segment_hash"], + item["segment_sig"], + ] + for item in items + ] + groups_rows.append([group_key, encoded_items]) + return { + "d": projection["digest"], + "s": max(0, int(projection["suppressed"])), + "g": groups_rows, + } diff --git a/codeclone/cfg.py b/codeclone/cfg.py index 097a216..e16be39 100644 --- a/codeclone/cfg.py +++ b/codeclone/cfg.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/cfg_model.py b/codeclone/cfg_model.py index bb5fba2..4361e04 100644 --- a/codeclone/cfg_model.py +++ b/codeclone/cfg_model.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/cli.py b/codeclone/cli.py index 4de107c..d06a3e4 100644 --- a/codeclone/cli.py +++ b/codeclone/cli.py @@ -1,16 +1,21 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations import os +import subprocess import sys import time +from collections.abc import Mapping, Sequence from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Literal, Protocol, cast -from . import __version__ +from . import __version__, _coerce from . import ui_messages as ui from ._cli_args import build_parser from ._cli_baselines import ( @@ -80,10 +85,22 @@ from ._cli_runtime import ( validate_numeric_args as _validate_numeric_args_impl, ) -from ._cli_summary import MetricsSnapshot, _print_metrics, _print_summary +from ._cli_summary import ( + ChangedScopeSnapshot, + MetricsSnapshot, + _print_changed_scope, + _print_metrics, + _print_summary, +) from .baseline import Baseline from .cache import Cache, CacheStatus, build_segment_report_projection -from .contracts import ISSUES_URL, ExitCode +from .contracts import ( + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ISSUES_URL, + ExitCode, +) from .errors import CacheError if TYPE_CHECKING: @@ -132,6 +149,8 @@ "report", ] +# Lazy singleton for pipeline module — deferred import to keep CLI startup fast. +# Tests monkeypatch this via _pipeline_module() to inject mocks. 
_PIPELINE_MODULE: ModuleType | None = None @@ -171,6 +190,214 @@ class ProcessingResult: structural_findings: list[object] | None = None +@dataclass(frozen=True, slots=True) +class ChangedCloneGate: + changed_paths: tuple[str, ...] + new_func: frozenset[str] + new_block: frozenset[str] + total_clone_groups: int + findings_total: int + findings_new: int + findings_known: int + + +_as_mapping = _coerce.as_mapping +_as_sequence = _coerce.as_sequence + + +def _validate_changed_scope_args(*, args: Namespace) -> str | None: + if args.diff_against and args.paths_from_git_diff: + console.print( + ui.fmt_contract_error( + "Use --diff-against or --paths-from-git-diff, not both." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + if args.paths_from_git_diff: + args.changed_only = True + return str(args.paths_from_git_diff) + if args.diff_against and not args.changed_only: + console.print(ui.fmt_contract_error("--diff-against requires --changed-only.")) + sys.exit(ExitCode.CONTRACT_ERROR) + if args.changed_only and not args.diff_against: + console.print( + ui.fmt_contract_error( + "--changed-only requires --diff-against or --paths-from-git-diff." 
+ ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + return str(args.diff_against) if args.diff_against else None + + +def _normalize_changed_paths( + *, + root_path: Path, + paths: Sequence[str], +) -> tuple[str, ...]: + normalized: set[str] = set() + for raw_path in paths: + candidate = raw_path.strip() + if not candidate: + continue + candidate_path = Path(candidate) + try: + absolute_path = ( + candidate_path.resolve() + if candidate_path.is_absolute() + else (root_path / candidate_path).resolve() + ) + except OSError as exc: + console.print( + ui.fmt_contract_error( + f"Unable to resolve changed path '{candidate}': {exc}" + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + try: + relative_path = absolute_path.relative_to(root_path) + except ValueError: + console.print( + ui.fmt_contract_error( + f"Changed path '{candidate}' is outside the scan root." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + cleaned = str(relative_path).replace("\\", "/").strip("/") + if cleaned: + normalized.add(cleaned) + return tuple(sorted(normalized)) + + +def _git_diff_changed_paths(*, root_path: Path, git_diff_ref: str) -> tuple[str, ...]: + if git_diff_ref.startswith("-"): + console.print( + ui.fmt_contract_error( + f"Invalid git diff ref '{git_diff_ref}': must not start with '-'." 
+ ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + try: + completed = subprocess.run( + ["git", "diff", "--name-only", git_diff_ref, "--"], + cwd=str(root_path), + check=True, + capture_output=True, + text=True, + timeout=30, + ) + except ( + FileNotFoundError, + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + ) as exc: + console.print( + ui.fmt_contract_error( + "Unable to resolve changed files from git diff ref " + f"'{git_diff_ref}': {exc}" + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + lines = [line.strip() for line in completed.stdout.splitlines() if line.strip()] + return _normalize_changed_paths(root_path=root_path, paths=lines) + + +def _path_matches(relative_path: str, changed_paths: Sequence[str]) -> bool: + return any( + relative_path == candidate or relative_path.startswith(candidate + "/") + for candidate in changed_paths + ) + + +def _flatten_report_findings( + report_document: Mapping[str, object], +) -> list[dict[str, object]]: + findings = _as_mapping(report_document.get("findings")) + groups = _as_mapping(findings.get("groups")) + clone_groups = _as_mapping(groups.get("clones")) + return [ + *[ + dict(_as_mapping(item)) + for item in _as_sequence(clone_groups.get("functions")) + ], + *[dict(_as_mapping(item)) for item in _as_sequence(clone_groups.get("blocks"))], + *[ + dict(_as_mapping(item)) + for item in _as_sequence(clone_groups.get("segments")) + ], + *[ + dict(_as_mapping(item)) + for item in _as_sequence( + _as_mapping(groups.get("structural")).get("groups") + ) + ], + *[ + dict(_as_mapping(item)) + for item in _as_sequence(_as_mapping(groups.get("dead_code")).get("groups")) + ], + *[ + dict(_as_mapping(item)) + for item in _as_sequence(_as_mapping(groups.get("design")).get("groups")) + ], + ] + + +def _finding_touches_changed_paths( + finding: Mapping[str, object], + *, + changed_paths: Sequence[str], +) -> bool: + for item in _as_sequence(finding.get("items")): + relative_path = str(_as_mapping(item).get("relative_path", 
"")).strip() + if relative_path and _path_matches(relative_path, changed_paths): + return True + return False + + +def _changed_clone_gate_from_report( + report_document: Mapping[str, object], + *, + changed_paths: Sequence[str], +) -> ChangedCloneGate: + findings = [ + finding + for finding in _flatten_report_findings(report_document) + if _finding_touches_changed_paths(finding, changed_paths=changed_paths) + ] + clone_findings = [ + finding + for finding in findings + if str(finding.get("family", "")).strip() == "clone" + and str(finding.get("category", "")).strip() in {"function", "block"} + ] + new_func = frozenset( + str(finding.get("id", "")) + for finding in clone_findings + if str(finding.get("category", "")).strip() == "function" + and str(finding.get("novelty", "")).strip() == "new" + ) + new_block = frozenset( + str(finding.get("id", "")) + for finding in clone_findings + if str(finding.get("category", "")).strip() == "block" + and str(finding.get("novelty", "")).strip() == "new" + ) + findings_new = sum( + 1 for finding in findings if str(finding.get("novelty", "")).strip() == "new" + ) + findings_known = sum( + 1 for finding in findings if str(finding.get("novelty", "")).strip() == "known" + ) + return ChangedCloneGate( + changed_paths=tuple(changed_paths), + new_func=new_func, + new_block=new_block, + total_clone_groups=len(clone_findings), + findings_total=len(findings), + findings_new=findings_new, + findings_known=findings_known, + ) + + def process_file( filepath: str, root: str, @@ -262,6 +489,7 @@ def report( new_block: set[str], html_builder: Callable[..., str] | None = None, metrics_diff: MetricsDiff | None = None, + include_report_document: bool = False, ) -> ReportArtifacts: return cast( "ReportArtifacts", @@ -275,6 +503,7 @@ def report( new_block=new_block, html_builder=html_builder, metrics_diff=metrics_diff, + include_report_document=include_report_document, ), ) @@ -757,6 +986,7 @@ def _enforce_gating( new_block: set[str], metrics_diff: 
MetricsDiff | None, html_report_path: str | None, + clone_threshold_total: int | None = None, ) -> None: if source_read_contract_failure: console.print( @@ -791,6 +1021,25 @@ def _enforce_gating( new_block=new_block, metrics_diff=metrics_diff, ) + if clone_threshold_total is not None: + reasons = [ + reason + for reason in gate_result.reasons + if not reason.startswith("clone:threshold:") + ] + if 0 <= args.fail_threshold < clone_threshold_total: + reasons.append( + f"clone:threshold:{clone_threshold_total}:{args.fail_threshold}" + ) + gate_result = cast( + "GatingResult", + _pipeline_module().GatingResult( + exit_code=( + int(ExitCode.GATING_FAILURE) if reasons else int(ExitCode.SUCCESS) + ), + reasons=tuple(reasons), + ), + ) metric_reasons = [ reason[len("metric:") :] @@ -867,6 +1116,7 @@ def _main_impl() -> None: run_started_at = time.monotonic() from ._cli_meta import _build_report_meta, _current_report_timestamp_utc + analysis_started_at_utc = _current_report_timestamp_utc() ap = build_parser(__version__) def _prepare_run_inputs() -> tuple[ @@ -879,6 +1129,8 @@ def _prepare_run_inputs() -> tuple[ OutputPaths, Path, dict[str, object] | None, + tuple[str, ...], + str, str, ]: global console @@ -920,6 +1172,12 @@ def _prepare_run_inputs() -> tuple[ config_values=pyproject_config, explicit_cli_dests=explicit_cli_dests, ) + git_diff_ref = _validate_changed_scope_args(args=args) + changed_paths = ( + _git_diff_changed_paths(root_path=root_path, git_diff_ref=git_diff_ref) + if git_diff_ref is not None + else () + ) if args.debug: os.environ["CODECLONE_DEBUG"] = "1" @@ -1028,6 +1286,8 @@ def _prepare_run_inputs() -> tuple[ output_paths, cache_path, shared_baseline_payload, + changed_paths, + analysis_started_at_utc, report_generated_at_utc, ) @@ -1041,6 +1301,8 @@ def _prepare_run_inputs() -> tuple[ output_paths, cache_path, shared_baseline_payload, + changed_paths, + analysis_started_at_utc, report_generated_at_utc, ) = _prepare_run_inputs() @@ -1142,6 +1404,10 @@ 
def _prepare_run_inputs() -> tuple[ ), analysis_mode=("clones_only" if args.skip_metrics else "full"), metrics_computed=_metrics_computed(args), + design_complexity_threshold=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + design_coupling_threshold=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + design_cohesion_threshold=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + analysis_started_at_utc=analysis_started_at_utc, report_generated_at_utc=report_generated_at_utc, ) @@ -1214,7 +1480,27 @@ def _prepare_run_inputs() -> tuple[ new_block=new_block, html_builder=build_html_report, metrics_diff=metrics_diff, + include_report_document=bool(changed_paths), + ) + changed_clone_gate = ( + _changed_clone_gate_from_report( + report_artifacts.report_document or {}, + changed_paths=changed_paths, + ) + if args.changed_only and report_artifacts.report_document is not None + else None ) + if changed_clone_gate is not None: + _print_changed_scope( + console=cast("_PrinterLike", console), + quiet=args.quiet, + changed_scope=ChangedScopeSnapshot( + paths_count=len(changed_clone_gate.changed_paths), + findings_total=changed_clone_gate.findings_total, + findings_new=changed_clone_gate.findings_new, + findings_known=changed_clone_gate.findings_known, + ), + ) html_report_path = _write_report_outputs( args=args, output_paths=output_paths, @@ -1230,13 +1516,27 @@ def _prepare_run_inputs() -> tuple[ source_read_contract_failure=source_read_contract_failure, baseline_failure_code=baseline_state.failure_code, metrics_baseline_failure_code=metrics_baseline_state.failure_code, - new_func=new_func, - new_block=new_block, + new_func=set(changed_clone_gate.new_func) if changed_clone_gate else new_func, + new_block=( + set(changed_clone_gate.new_block) if changed_clone_gate else new_block + ), metrics_diff=metrics_diff, html_report_path=html_report_path, + clone_threshold_total=( + changed_clone_gate.total_clone_groups if changed_clone_gate else None + ), ) - if not args.update_baseline and not 
args.fail_on_new and new_clones_count > 0: + notice_new_clones_count = ( + len(changed_clone_gate.new_func) + len(changed_clone_gate.new_block) + if changed_clone_gate is not None + else new_clones_count + ) + if ( + not args.update_baseline + and not args.fail_on_new + and notice_new_clones_count > 0 + ): console.print(ui.WARN_NEW_CLONES_WITHOUT_FAIL) if not args.quiet: diff --git a/codeclone/contracts.py b/codeclone/contracts.py index fdb09de..a75c22d 100644 --- a/codeclone/contracts.py +++ b/codeclone/contracts.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -9,13 +12,16 @@ BASELINE_SCHEMA_VERSION: Final = "2.0" BASELINE_FINGERPRINT_VERSION: Final = "1" -CACHE_VERSION: Final = "2.2" -REPORT_SCHEMA_VERSION: Final = "2.1" +CACHE_VERSION: Final = "2.3" +REPORT_SCHEMA_VERSION: Final = "2.2" METRICS_BASELINE_SCHEMA_VERSION: Final = "1.0" DEFAULT_COMPLEXITY_THRESHOLD: Final = 20 DEFAULT_COUPLING_THRESHOLD: Final = 10 DEFAULT_COHESION_THRESHOLD: Final = 4 +DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD: Final = 20 +DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD: Final = 10 +DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD: Final = 4 DEFAULT_HEALTH_THRESHOLD: Final = 60 COMPLEXITY_RISK_LOW_MAX: Final = 10 diff --git a/codeclone/domain/__init__.py b/codeclone/domain/__init__.py index 59fc066..61cd04f 100644 --- a/codeclone/domain/__init__.py +++ b/codeclone/domain/__init__.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from .findings import ( @@ -29,6 +32,7 @@ STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE, STRUCTURAL_KIND_DUPLICATED_BRANCHES, SYMBOL_KIND_CLASS, + SYMBOL_KIND_FUNCTION, SYMBOL_KIND_IMPORT, SYMBOL_KIND_METHOD, ) @@ -127,6 +131,7 @@ "STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE", "STRUCTURAL_KIND_DUPLICATED_BRANCHES", "SYMBOL_KIND_CLASS", + "SYMBOL_KIND_FUNCTION", "SYMBOL_KIND_IMPORT", "SYMBOL_KIND_METHOD", ] diff --git a/codeclone/domain/findings.py b/codeclone/domain/findings.py index 37928b2..66f4851 100644 --- a/codeclone/domain/findings.py +++ b/codeclone/domain/findings.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -9,6 +12,7 @@ CLONE_KIND_BLOCK: Final = "block" CLONE_KIND_SEGMENT: Final = "segment" +SYMBOL_KIND_FUNCTION: Final = "function" SYMBOL_KIND_CLASS: Final = "class" SYMBOL_KIND_METHOD: Final = "method" SYMBOL_KIND_IMPORT: Final = "import" @@ -69,6 +73,7 @@ "STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE", "STRUCTURAL_KIND_DUPLICATED_BRANCHES", "SYMBOL_KIND_CLASS", + "SYMBOL_KIND_FUNCTION", "SYMBOL_KIND_IMPORT", "SYMBOL_KIND_METHOD", ] diff --git a/codeclone/domain/quality.py b/codeclone/domain/quality.py index 6d03baa..cca64c2 100644 --- a/codeclone/domain/quality.py +++ b/codeclone/domain/quality.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/domain/source_scope.py b/codeclone/domain/source_scope.py index ddfd3ea..578b3e3 100644 --- a/codeclone/domain/source_scope.py +++ b/codeclone/domain/source_scope.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/errors.py b/codeclone/errors.py index 77371c4..7b9331f 100644 --- a/codeclone/errors.py +++ b/codeclone/errors.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/extractor.py b/codeclone/extractor.py index 116731a..be39896 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -14,7 +17,7 @@ from hashlib import sha1 as _sha1 from typing import TYPE_CHECKING, Literal, NamedTuple -from .blockhash import stmt_hashes +from . 
import qualnames as _qualnames from .blocks import extract_blocks, extract_segments from .cfg import CFGBuilder from .errors import ParseError @@ -42,6 +45,7 @@ AstNormalizer, NormalizationConfig, normalized_ast_dump_from_list, + stmt_hashes, ) from .paths import is_test_filepath from .structural_findings import scan_function_structure @@ -60,7 +64,6 @@ __all__ = [ "Unit", - "_QualnameCollector", "extract_units_and_stats_from_source", ] @@ -75,8 +78,9 @@ class _ParseTimeoutError(Exception): pass -FunctionNode = ast.FunctionDef | ast.AsyncFunctionDef -_NamedDeclarationNode = FunctionNode | ast.ClassDef +# Any named declaration: function, async function, or class. +_NamedDeclarationNode = _qualnames.FunctionNode | ast.ClassDef +# Unique key for a declaration's token index: (start_line, end_line, qualname). _DeclarationTokenIndexKey = tuple[int, int, str] @@ -267,57 +271,13 @@ def _declaration_end_line( return _fallback_declaration_end_line(node, start_line=start_line) -class _QualnameCollector(ast.NodeVisitor): - __slots__ = ( - "class_count", - "class_nodes", - "funcs", - "function_count", - "method_count", - "stack", - "units", - ) - - def __init__(self) -> None: - self.stack: list[str] = [] - self.units: list[tuple[str, FunctionNode]] = [] - self.class_nodes: list[tuple[str, ast.ClassDef]] = [] - self.funcs: dict[str, FunctionNode] = {} - self.class_count = 0 - self.function_count = 0 - self.method_count = 0 - - def visit_ClassDef(self, node: ast.ClassDef) -> None: - self.class_count += 1 - class_qualname = ".".join([*self.stack, node.name]) if self.stack else node.name - self.class_nodes.append((class_qualname, node)) - self.stack.append(node.name) - self.generic_visit(node) - self.stack.pop() - - def _register_function(self, node: FunctionNode) -> None: - name = ".".join([*self.stack, node.name]) if self.stack else node.name - if self.stack: - self.method_count += 1 - else: - self.function_count += 1 - self.units.append((name, node)) - self.funcs[name] = node - 
- def visit_FunctionDef(self, node: ast.FunctionDef) -> None: - self._register_function(node) - - def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: - self._register_function(node) - - # ========================= # CFG fingerprinting # ========================= def _cfg_fingerprint_and_complexity( - node: FunctionNode, + node: _qualnames.FunctionNode, cfg: NormalizationConfig, qualname: str, ) -> tuple[str, int]: @@ -511,7 +471,7 @@ def _is_protocol_class( return False -def _is_non_runtime_candidate(node: FunctionNode) -> bool: +def _is_non_runtime_candidate(node: _qualnames.FunctionNode) -> bool: for decorator in node.decorator_list: name = _dotted_expr_name(decorator) if name is None: @@ -530,13 +490,66 @@ def _node_line_span(node: ast.AST) -> tuple[int, int] | None: return start, end +def _eligible_unit_shape( + node: _qualnames.FunctionNode, + *, + min_loc: int, + min_stmt: int, +) -> tuple[int, int, int, int] | None: + span = _node_line_span(node) + if span is None: + return None + start, end = span + if end < start: + return None + loc = end - start + 1 + stmt_count = _stmt_count(node) + if loc < min_loc or stmt_count < min_stmt: + return None + return start, end, loc, stmt_count + + +def _class_metrics_for_node( + *, + module_name: str, + class_qualname: str, + class_node: ast.ClassDef, + filepath: str, + module_import_names: set[str], + module_class_names: set[str], +) -> ClassMetrics | None: + span = _node_line_span(class_node) + if span is None: + return None + start, end = span + cbo, coupled_classes = compute_cbo( + class_node, + module_import_names=module_import_names, + module_class_names=module_class_names, + ) + lcom4, method_count, instance_var_count = compute_lcom4(class_node) + return ClassMetrics( + qualname=f"{module_name}:{class_qualname}", + filepath=filepath, + start_line=start, + end_line=end, + cbo=cbo, + lcom4=lcom4, + method_count=method_count, + instance_var_count=instance_var_count, + 
risk_coupling=coupling_risk(cbo), + risk_cohesion=cohesion_risk(lcom4), + coupled_classes=coupled_classes, + ) + + def _dead_candidate_kind(local_name: str) -> Literal["function", "method"]: return "method" if "." in local_name else "function" def _should_skip_dead_candidate( local_name: str, - node: FunctionNode, + node: _qualnames.FunctionNode, *, protocol_class_qualnames: set[str], ) -> bool: @@ -584,7 +597,7 @@ def _dead_candidate_for_unit( *, module_name: str, local_name: str, - node: FunctionNode, + node: _qualnames.FunctionNode, filepath: str, suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]], protocol_class_qualnames: set[str], @@ -628,7 +641,7 @@ def _collect_load_reference_node( def _resolve_referenced_qualnames( *, module_name: str, - collector: _QualnameCollector, + collector: _qualnames.QualnameCollector, state: _ModuleWalkState, ) -> frozenset[str]: top_level_class_by_name = { @@ -649,18 +662,18 @@ def _resolve_referenced_qualnames( for attr_node in state.attr_nodes: base = attr_node.value - if not isinstance(base, ast.Name): - continue - imported_module = state.imported_module_aliases.get(base.id) - if imported_module is not None: - resolved.add(f"{imported_module}:{attr_node.attr}") - continue - class_qualname = top_level_class_by_name.get(base.id) - if class_qualname is None: - continue - local_method_qualname = f"{module_name}:{class_qualname}.{attr_node.attr}" - if local_method_qualname in local_method_qualnames: - resolved.add(local_method_qualname) + if isinstance(base, ast.Name): + imported_module = state.imported_module_aliases.get(base.id) + if imported_module is not None: + resolved.add(f"{imported_module}:{attr_node.attr}") + else: + class_qualname = top_level_class_by_name.get(base.id) + if class_qualname is not None: + local_method_qualname = ( + f"{module_name}:{class_qualname}.{attr_node.attr}" + ) + if local_method_qualname in local_method_qualnames: + resolved.add(local_method_qualname) return frozenset(resolved) @@ 
-678,7 +691,7 @@ def _collect_module_walk_data( *, tree: ast.AST, module_name: str, - collector: _QualnameCollector, + collector: _qualnames.QualnameCollector, collect_referenced_names: bool, ) -> _ModuleWalkResult: """Single ast.walk that collects imports, deps, names, qualnames & protocol aliases. @@ -694,16 +707,14 @@ def _collect_module_walk_data( state=state, collect_referenced_names=collect_referenced_names, ) - continue - if isinstance(node, ast.ImportFrom): + elif isinstance(node, ast.ImportFrom): _collect_import_from_node( node=node, module_name=module_name, state=state, collect_referenced_names=collect_referenced_names, ) - continue - if collect_referenced_names: + elif collect_referenced_names: _collect_load_reference_node(node=node, state=state) deps_sorted = tuple( @@ -736,7 +747,7 @@ def _collect_dead_candidates( *, filepath: str, module_name: str, - collector: _QualnameCollector, + collector: _qualnames.QualnameCollector, protocol_symbol_aliases: frozenset[str] = frozenset({"Protocol"}), protocol_module_aliases: frozenset[str] = frozenset( {"typing", "typing_extensions"} @@ -767,27 +778,25 @@ def _collect_dead_candidates( suppression_index=suppression_index, protocol_class_qualnames=protocol_class_qualnames, ) - if candidate is None: - continue - candidates.append(candidate) + if candidate is not None: + candidates.append(candidate) for class_qualname, class_node in collector.class_nodes: span = _node_line_span(class_node) - if span is None: - continue - start, end = span - candidates.append( - _build_dead_candidate( - module_name=module_name, - local_name=class_qualname, - node=class_node, - filepath=filepath, - kind="class", - suppression_index=suppression_index, - start_line=start, - end_line=end, + if span is not None: + start, end = span + candidates.append( + _build_dead_candidate( + module_name=module_name, + local_name=class_qualname, + node=class_node, + filepath=filepath, + kind="class", + suppression_index=suppression_index, + 
start_line=start, + end_line=end, + ) ) - ) return tuple( sorted( @@ -806,7 +815,7 @@ def _collect_declaration_targets( *, filepath: str, module_name: str, - collector: _QualnameCollector, + collector: _qualnames.QualnameCollector, source_tokens: tuple[tokenize.TokenInfo, ...] = (), source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, include_inline_lines: bool = False, @@ -885,7 +894,7 @@ def _build_suppression_index_for_source( source: str, filepath: str, module_name: str, - collector: _QualnameCollector, + collector: _qualnames.QualnameCollector, ) -> Mapping[SuppressionTargetKey, tuple[str, ...]]: suppression_directives = extract_suppression_directives(source) if not suppression_directives: @@ -947,7 +956,7 @@ def extract_units_and_stats_from_source( except SyntaxError as e: raise ParseError(f"Failed to parse {filepath}: {e}") from e - collector = _QualnameCollector() + collector = _qualnames.QualnameCollector() collector.visit(tree) source_lines = source.splitlines() source_line_count = len(source_lines) @@ -985,17 +994,14 @@ def extract_units_and_stats_from_source( structural_findings: list[StructuralFindingGroup] = [] for local_name, node in collector.units: - start = getattr(node, "lineno", None) - end = getattr(node, "end_lineno", None) - - if not start or not end or end < start: - continue - - loc = end - start + 1 - stmt_count = _stmt_count(node) - - if loc < min_loc or stmt_count < min_stmt: + unit_shape = _eligible_unit_shape( + node, + min_loc=min_loc, + min_stmt=min_stmt, + ) + if unit_shape is None: continue + start, end, loc, stmt_count = unit_shape qualname = f"{module_name}:{local_name}" fingerprint, complexity = _cfg_fingerprint_and_complexity(node, cfg, qualname) @@ -1009,7 +1015,6 @@ def extract_units_and_stats_from_source( risk = risk_level(complexity) raw_hash = _raw_source_hash_for_range(source_lines, start, end) - # Function-level unit (including __init__) units.append( Unit( qualname=qualname, @@ -1037,7 +1042,6 @@ 
def extract_units_and_stats_from_source( ) ) - # Block-level and segment-level units share statement hashes needs_blocks = ( not local_name.endswith("__init__") and loc >= block_min_loc @@ -1077,36 +1081,20 @@ def extract_units_and_stats_from_source( ) ) - # Structural findings extraction (report-only, no re-parse) if collect_structural_findings: structural_findings.extend(structure_facts.structural_findings) for class_qualname, class_node in collector.class_nodes: - start = int(getattr(class_node, "lineno", 0)) - end = int(getattr(class_node, "end_lineno", 0)) - if start <= 0 or end <= 0: - continue - cbo, coupled_classes = compute_cbo( - class_node, + class_metric = _class_metrics_for_node( + module_name=module_name, + class_qualname=class_qualname, + class_node=class_node, + filepath=filepath, module_import_names=module_import_names, module_class_names=module_class_names, ) - lcom4, method_count, instance_var_count = compute_lcom4(class_node) - class_metrics.append( - ClassMetrics( - qualname=f"{module_name}:{class_qualname}", - filepath=filepath, - start_line=start, - end_line=end, - cbo=cbo, - lcom4=lcom4, - method_count=method_count, - instance_var_count=instance_var_count, - risk_coupling=coupling_risk(cbo), - risk_cohesion=cohesion_risk(lcom4), - coupled_classes=coupled_classes, - ) - ) + if class_metric is not None: + class_metrics.append(class_metric) dead_candidates = _collect_dead_candidates( filepath=filepath, diff --git a/codeclone/fingerprint.py b/codeclone/fingerprint.py index d47b8fc..72adaee 100644 --- a/codeclone/fingerprint.py +++ b/codeclone/fingerprint.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/grouping.py b/codeclone/grouping.py index 583e62a..a5ac7db 100644 --- a/codeclone/grouping.py +++ b/codeclone/grouping.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/html_report.py b/codeclone/html_report.py index 3783b23..16ceab5 100644 --- a/codeclone/html_report.py +++ b/codeclone/html_report.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Public facade for HTML report generation. diff --git a/codeclone/mcp_server.py b/codeclone/mcp_server.py new file mode 100644 index 0000000..3848ea0 --- /dev/null +++ b/codeclone/mcp_server.py @@ -0,0 +1,916 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import argparse +import ipaddress +import sys +from collections.abc import Callable +from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast + +from . 
import __version__ +from .contracts import DOCS_URL +from .mcp_service import ( + DEFAULT_MCP_HISTORY_LIMIT, + MAX_MCP_HISTORY_LIMIT, + CodeCloneMCPService, + MCPAnalysisRequest, + MCPGateRequest, + _validated_history_limit, +) + +if TYPE_CHECKING: + from mcp.server.fastmcp import FastMCP + from mcp.types import ToolAnnotations + +_SERVER_INSTRUCTIONS = ( + "CodeClone MCP is a deterministic, baseline-aware, read-only analysis server " + "for Python repositories. Use analyze_repository first for full runs or " + "analyze_changed_paths for PR-style review, then prefer get_run_summary or " + "get_production_triage for the first pass. Use list_hotspots or focused " + "check_* tools before broader list_findings calls, then drill into one " + "finding with get_finding or get_remediation. Use " + "get_report_section(section='metrics_detail', family=..., limit=...) for " + "bounded metrics drill-down, and prefer generate_pr_summary(format='markdown') " + "unless machine JSON is required. Pass an absolute repository root to " + "analysis tools. This server never updates baselines and never mutates " + "source files." +) +_MCP_INSTALL_HINT = ( + "CodeClone MCP support requires the optional 'mcp' extra. 
" + "Install it with: pip install 'codeclone[mcp]'" +) + + +class MCPDependencyError(RuntimeError): + """Raised when the optional MCP runtime dependency is unavailable.""" + + +MCPCallable = TypeVar("MCPCallable", bound=Callable[..., object]) + + +def _load_mcp_runtime() -> tuple[type[FastMCP], ToolAnnotations, ToolAnnotations]: + try: + from mcp.server.fastmcp import FastMCP as runtime_fastmcp + from mcp.types import ToolAnnotations as runtime_tool_annotations + except ImportError as exc: + raise MCPDependencyError(_MCP_INSTALL_HINT) from exc + return ( + cast("type[FastMCP]", runtime_fastmcp), + runtime_tool_annotations( + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=False, + ), + runtime_tool_annotations( + readOnlyHint=False, + destructiveHint=False, + idempotentHint=True, + openWorldHint=False, + ), + ) + + +def build_mcp_server( + *, + history_limit: int = DEFAULT_MCP_HISTORY_LIMIT, + host: str = "127.0.0.1", + port: int = 8000, + json_response: bool = False, + stateless_http: bool = False, + debug: bool = False, + log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO", +) -> FastMCP: + runtime_fastmcp, read_only_tool, session_tool = _load_mcp_runtime() + service = CodeCloneMCPService(history_limit=_validated_history_limit(history_limit)) + mcp = runtime_fastmcp( + name="CodeClone", + instructions=_SERVER_INSTRUCTIONS, + website_url=DOCS_URL, + host=host, + port=port, + json_response=json_response, + stateless_http=stateless_http, + debug=debug, + log_level=log_level, + dependencies=(f"codeclone=={__version__}",), + ) + + def tool(*args: Any, **kwargs: Any) -> Callable[[MCPCallable], MCPCallable]: + return cast( + "Callable[[MCPCallable], MCPCallable]", + mcp.tool(*args, **kwargs), + ) + + def resource( + *args: Any, + **kwargs: Any, + ) -> Callable[[MCPCallable], MCPCallable]: + return cast( + "Callable[[MCPCallable], MCPCallable]", + mcp.resource(*args, **kwargs), + ) + + @tool( + title="Analyze 
Repository", + description=( + "Run a deterministic CodeClone analysis for a repository and register " + "the result as the latest MCP run. Pass an absolute repository root: " + "relative roots like '.' are rejected in MCP. Then prefer " + "get_run_summary or get_production_triage for the first pass. Tip: " + "set cache_policy='off' to bypass cache and get fully fresh results." + ), + annotations=session_tool, + structured_output=True, + ) + def analyze_repository( + root: str, + analysis_mode: str = "full", + respect_pyproject: bool = True, + changed_paths: list[str] | None = None, + git_diff_ref: str | None = None, + processes: int | None = None, + min_loc: int | None = None, + min_stmt: int | None = None, + block_min_loc: int | None = None, + block_min_stmt: int | None = None, + segment_min_loc: int | None = None, + segment_min_stmt: int | None = None, + complexity_threshold: int | None = None, + coupling_threshold: int | None = None, + cohesion_threshold: int | None = None, + baseline_path: str | None = None, + metrics_baseline_path: str | None = None, + max_baseline_size_mb: int | None = None, + cache_policy: str = "reuse", + cache_path: str | None = None, + max_cache_size_mb: int | None = None, + ) -> dict[str, object]: + return service.analyze_repository( + MCPAnalysisRequest( + root=root, + analysis_mode=analysis_mode, # type: ignore[arg-type] + respect_pyproject=respect_pyproject, + changed_paths=tuple(changed_paths or ()), + git_diff_ref=git_diff_ref, + processes=processes, + min_loc=min_loc, + min_stmt=min_stmt, + block_min_loc=block_min_loc, + block_min_stmt=block_min_stmt, + segment_min_loc=segment_min_loc, + segment_min_stmt=segment_min_stmt, + complexity_threshold=complexity_threshold, + coupling_threshold=coupling_threshold, + cohesion_threshold=cohesion_threshold, + baseline_path=baseline_path, + metrics_baseline_path=metrics_baseline_path, + max_baseline_size_mb=max_baseline_size_mb, + cache_policy=cache_policy, # type: ignore[arg-type] + 
cache_path=cache_path, + max_cache_size_mb=max_cache_size_mb, + ) + ) + + @tool( + title="Analyze Changed Paths", + description=( + "Run a deterministic CodeClone analysis and return a changed-files " + "projection using explicit paths or a git diff ref. Pass an absolute " + "repository root: relative roots like '.' are rejected in MCP. Then " + "prefer get_report_section(section='changed') or get_production_triage " + "before broader finding lists. Tip: set cache_policy='off' to bypass " + "cache and get fully fresh results." + ), + annotations=session_tool, + structured_output=True, + ) + def analyze_changed_paths( + root: str, + changed_paths: list[str] | None = None, + git_diff_ref: str | None = None, + analysis_mode: str = "full", + respect_pyproject: bool = True, + processes: int | None = None, + min_loc: int | None = None, + min_stmt: int | None = None, + block_min_loc: int | None = None, + block_min_stmt: int | None = None, + segment_min_loc: int | None = None, + segment_min_stmt: int | None = None, + complexity_threshold: int | None = None, + coupling_threshold: int | None = None, + cohesion_threshold: int | None = None, + baseline_path: str | None = None, + metrics_baseline_path: str | None = None, + max_baseline_size_mb: int | None = None, + cache_policy: str = "reuse", + cache_path: str | None = None, + max_cache_size_mb: int | None = None, + ) -> dict[str, object]: + return service.analyze_changed_paths( + MCPAnalysisRequest( + root=root, + changed_paths=tuple(changed_paths or ()), + git_diff_ref=git_diff_ref, + analysis_mode=analysis_mode, # type: ignore[arg-type] + respect_pyproject=respect_pyproject, + processes=processes, + min_loc=min_loc, + min_stmt=min_stmt, + block_min_loc=block_min_loc, + block_min_stmt=block_min_stmt, + segment_min_loc=segment_min_loc, + segment_min_stmt=segment_min_stmt, + complexity_threshold=complexity_threshold, + coupling_threshold=coupling_threshold, + cohesion_threshold=cohesion_threshold, + 
baseline_path=baseline_path, + metrics_baseline_path=metrics_baseline_path, + max_baseline_size_mb=max_baseline_size_mb, + cache_policy=cache_policy, # type: ignore[arg-type] + cache_path=cache_path, + max_cache_size_mb=max_cache_size_mb, + ) + ) + + @tool( + title="Get Run Summary", + description=( + "Return the stored compact MCP summary for the latest or specified " + "run. Start here when you want the cheapest run-level snapshot." + ), + annotations=read_only_tool, + structured_output=True, + ) + def get_run_summary(run_id: str | None = None) -> dict[str, object]: + return service.get_run_summary(run_id) + + @tool( + title="Get Production Triage", + description=( + "Return a production-first triage view over a stored run: health, " + "cache freshness, production hotspots, and production suggestions, " + "while keeping global source-kind counters visible. Use this as the " + "default first-pass review on noisy repositories." + ), + annotations=read_only_tool, + structured_output=True, + ) + def get_production_triage( + run_id: str | None = None, + max_hotspots: int = 3, + max_suggestions: int = 3, + ) -> dict[str, object]: + return service.get_production_triage( + run_id=run_id, + max_hotspots=max_hotspots, + max_suggestions=max_suggestions, + ) + + @tool( + title="Evaluate Gates", + description=( + "Evaluate CodeClone gate conditions against an existing MCP run without " + "modifying baselines or exiting the process." 
+ ), + annotations=session_tool, + structured_output=True, + ) + def evaluate_gates( + run_id: str | None = None, + fail_on_new: bool = False, + fail_threshold: int = -1, + fail_complexity: int = -1, + fail_coupling: int = -1, + fail_cohesion: int = -1, + fail_cycles: bool = False, + fail_dead_code: bool = False, + fail_health: int = -1, + fail_on_new_metrics: bool = False, + ) -> dict[str, object]: + return service.evaluate_gates( + MCPGateRequest( + run_id=run_id, + fail_on_new=fail_on_new, + fail_threshold=fail_threshold, + fail_complexity=fail_complexity, + fail_coupling=fail_coupling, + fail_cohesion=fail_cohesion, + fail_cycles=fail_cycles, + fail_dead_code=fail_dead_code, + fail_health=fail_health, + fail_on_new_metrics=fail_on_new_metrics, + ) + ) + + @tool( + title="Get Report Section", + description=( + "Return a canonical CodeClone report section for the latest or " + "specified MCP run. Prefer specific sections instead of 'all' unless " + "you truly need the full canonical report. The 'metrics' section " + "returns only the summary, while 'metrics_detail' returns paginated " + "item slices or summary+hint when unfiltered." + ), + annotations=read_only_tool, + structured_output=True, + ) + def get_report_section( + run_id: str | None = None, + section: str = "all", + family: str | None = None, + path: str | None = None, + offset: int = 0, + limit: int = 50, + ) -> dict[str, object]: + return service.get_report_section( + run_id=run_id, + section=section, # type: ignore[arg-type] + family=family, # type: ignore[arg-type] + path=path, + offset=offset, + limit=limit, + ) + + @tool( + title="List Findings", + description=( + "List canonical finding groups with deterministic ordering, optional " + "filters, pagination, and compact summary cards by default. Prefer " + "list_hotspots or focused check_* tools for first-pass triage; use " + "this when you need a broader filtered list." 
+ ), + annotations=read_only_tool, + structured_output=True, + ) + def list_findings( + run_id: str | None = None, + family: str = "all", + category: str | None = None, + severity: str | None = None, + source_kind: str | None = None, + novelty: str = "all", + sort_by: str = "default", + detail_level: str = "summary", + changed_paths: list[str] | None = None, + git_diff_ref: str | None = None, + exclude_reviewed: bool = False, + offset: int = 0, + limit: int = 50, + max_results: int | None = None, + ) -> dict[str, object]: + return service.list_findings( + run_id=run_id, + family=family, # type: ignore[arg-type] + category=category, + severity=severity, + source_kind=source_kind, + novelty=novelty, # type: ignore[arg-type] + sort_by=sort_by, # type: ignore[arg-type] + detail_level=detail_level, # type: ignore[arg-type] + changed_paths=tuple(changed_paths or ()), + git_diff_ref=git_diff_ref, + exclude_reviewed=exclude_reviewed, + offset=offset, + limit=limit, + max_results=max_results, + ) + + @tool( + title="Get Finding", + description=( + "Return a single canonical finding group by short or full id. " + "Normal detail is the default. Use this after list_hotspots, " + "list_findings, or check_* instead of requesting larger lists at " + "higher detail." + ), + annotations=read_only_tool, + structured_output=True, + ) + def get_finding( + finding_id: str, + run_id: str | None = None, + detail_level: str = "normal", + ) -> dict[str, object]: + return service.get_finding( + finding_id=finding_id, + run_id=run_id, + detail_level=detail_level, # type: ignore[arg-type] + ) + + @tool( + title="Get Remediation", + description=( + "Return actionable remediation guidance for a single finding. " + "Normal detail is the default. Use this when you need the fix packet " + "for one finding without pulling larger detail lists." 
+ ), + annotations=read_only_tool, + structured_output=True, + ) + def get_remediation( + finding_id: str, + run_id: str | None = None, + detail_level: str = "normal", + ) -> dict[str, object]: + return service.get_remediation( + finding_id=finding_id, + run_id=run_id, + detail_level=detail_level, # type: ignore[arg-type] + ) + + @tool( + title="List Hotspots", + description=( + "Return one of the derived CodeClone hotlists for the latest or " + "specified MCP run, using compact summary cards by default. Prefer " + "this for first-pass triage before broader list_findings calls." + ), + annotations=read_only_tool, + structured_output=True, + ) + def list_hotspots( + kind: str, + run_id: str | None = None, + detail_level: str = "summary", + changed_paths: list[str] | None = None, + git_diff_ref: str | None = None, + exclude_reviewed: bool = False, + limit: int = 10, + max_results: int | None = None, + ) -> dict[str, object]: + return service.list_hotspots( + kind=kind, # type: ignore[arg-type] + run_id=run_id, + detail_level=detail_level, # type: ignore[arg-type] + changed_paths=tuple(changed_paths or ()), + git_diff_ref=git_diff_ref, + exclude_reviewed=exclude_reviewed, + limit=limit, + max_results=max_results, + ) + + @tool( + title="Compare Runs", + description=( + "Compare two registered CodeClone MCP runs by finding ids and " + "run-to-run health. Returns 'incomparable' when roots or effective " + "analysis settings differ." + ), + annotations=read_only_tool, + structured_output=True, + ) + def compare_runs( + run_id_before: str, + run_id_after: str | None = None, + focus: str = "all", + ) -> dict[str, object]: + return service.compare_runs( + run_id_before=run_id_before, + run_id_after=run_id_after, + focus=focus, # type: ignore[arg-type] + ) + + @tool( + title="Check Complexity", + description=( + "Return complexity hotspots from a compatible stored run. " + "Use analyze_repository first if no full run is available. 
When " + "filtering by root without run_id, pass an absolute root. Prefer " + "this narrower tool instead of list_findings when you only need " + "complexity hotspots." + ), + annotations=read_only_tool, + structured_output=True, + ) + def check_complexity( + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + min_complexity: int | None = None, + max_results: int = 10, + detail_level: str = "summary", + ) -> dict[str, object]: + return service.check_complexity( + run_id=run_id, + root=root, + path=path, + min_complexity=min_complexity, + max_results=max_results, + detail_level=detail_level, # type: ignore[arg-type] + ) + + @tool( + title="Check Clones", + description=( + "Return clone findings from a compatible stored run. " + "Use analyze_repository first if no compatible run is available. " + "When filtering by root without run_id, pass an absolute root. " + "Prefer this narrower tool instead of list_findings when you only " + "need clone findings." + ), + annotations=read_only_tool, + structured_output=True, + ) + def check_clones( + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + clone_type: str | None = None, + source_kind: str | None = None, + max_results: int = 10, + detail_level: str = "summary", + ) -> dict[str, object]: + return service.check_clones( + run_id=run_id, + root=root, + path=path, + clone_type=clone_type, + source_kind=source_kind, + max_results=max_results, + detail_level=detail_level, # type: ignore[arg-type] + ) + + @tool( + title="Check Coupling", + description=( + "Return coupling hotspots from a compatible stored run. " + "Use analyze_repository first if no full run is available. When " + "filtering by root without run_id, pass an absolute root. Prefer " + "this narrower tool instead of list_findings when you only need " + "coupling hotspots." 
+ ), + annotations=read_only_tool, + structured_output=True, + ) + def check_coupling( + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + max_results: int = 10, + detail_level: str = "summary", + ) -> dict[str, object]: + return service.check_coupling( + run_id=run_id, + root=root, + path=path, + max_results=max_results, + detail_level=detail_level, # type: ignore[arg-type] + ) + + @tool( + title="Check Cohesion", + description=( + "Return cohesion hotspots from a compatible stored run. " + "Use analyze_repository first if no full run is available. When " + "filtering by root without run_id, pass an absolute root. Prefer " + "this narrower tool instead of list_findings when you only need " + "cohesion hotspots." + ), + annotations=read_only_tool, + structured_output=True, + ) + def check_cohesion( + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + max_results: int = 10, + detail_level: str = "summary", + ) -> dict[str, object]: + return service.check_cohesion( + run_id=run_id, + root=root, + path=path, + max_results=max_results, + detail_level=detail_level, # type: ignore[arg-type] + ) + + @tool( + title="Check Dead Code", + description=( + "Return dead-code findings from a compatible stored run. " + "Use analyze_repository first if no full run is available. When " + "filtering by root without run_id, pass an absolute root. Prefer " + "this narrower tool instead of list_findings when you only need " + "dead-code findings." 
+ ), + annotations=read_only_tool, + structured_output=True, + ) + def check_dead_code( + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + min_severity: str | None = None, + max_results: int = 10, + detail_level: str = "normal", + ) -> dict[str, object]: + return service.check_dead_code( + run_id=run_id, + root=root, + path=path, + min_severity=min_severity, + max_results=max_results, + detail_level=detail_level, # type: ignore[arg-type] + ) + + @tool( + title="Generate PR Summary", + description=( + "Generate a PR-friendly CodeClone summary for changed files. Prefer " + "format='markdown' for compact LLM-facing output; use 'json' only " + "for machine post-processing." + ), + annotations=read_only_tool, + structured_output=True, + ) + def generate_pr_summary( + run_id: str | None = None, + changed_paths: list[str] | None = None, + git_diff_ref: str | None = None, + format: str = "markdown", + ) -> dict[str, object]: + return service.generate_pr_summary( + run_id=run_id, + changed_paths=tuple(changed_paths or ()), + git_diff_ref=git_diff_ref, + format=format, # type: ignore[arg-type] + ) + + @tool( + title="Mark Finding Reviewed", + description="Mark a finding as reviewed in the current in-memory MCP session.", + annotations=session_tool, + structured_output=True, + ) + def mark_finding_reviewed( + finding_id: str, + run_id: str | None = None, + note: str | None = None, + ) -> dict[str, object]: + return service.mark_finding_reviewed( + finding_id=finding_id, + run_id=run_id, + note=note, + ) + + @tool( + title="List Reviewed Findings", + description=( + "List in-memory reviewed findings for the current or specified run." 
+ ), + annotations=read_only_tool, + structured_output=True, + ) + def list_reviewed_findings(run_id: str | None = None) -> dict[str, object]: + return service.list_reviewed_findings(run_id=run_id) + + @tool( + title="Clear Session Runs", + description=( + "Clear all in-memory MCP analysis runs and ephemeral session state " + "for this server process." + ), + annotations=session_tool, + structured_output=True, + ) + def clear_session_runs() -> dict[str, object]: + return service.clear_session_runs() + + @resource( + "codeclone://latest/summary", + title="Latest Run Summary", + description="Canonical JSON summary for the latest CodeClone MCP run.", + mime_type="application/json", + ) + def latest_summary_resource() -> str: + return service.read_resource("codeclone://latest/summary") + + @resource( + "codeclone://latest/report.json", + title="Latest Canonical Report", + description="Canonical JSON report for the latest CodeClone MCP run.", + mime_type="application/json", + ) + def latest_report_resource() -> str: + return service.read_resource("codeclone://latest/report.json") + + @resource( + "codeclone://latest/health", + title="Latest Health Snapshot", + description="Health score and dimensions for the latest CodeClone MCP run.", + mime_type="application/json", + ) + def latest_health_resource() -> str: + return service.read_resource("codeclone://latest/health") + + @resource( + "codeclone://latest/gates", + title="Latest Gate Evaluation", + description="Last gate evaluation result produced by this MCP session.", + mime_type="application/json", + ) + def latest_gates_resource() -> str: + return service.read_resource("codeclone://latest/gates") + + @resource( + "codeclone://latest/changed", + title="Latest Changed Findings", + description=( + "Changed-files projection for the latest diff-aware CodeClone MCP run." 
+ ), + mime_type="application/json", + ) + def latest_changed_resource() -> str: + return service.read_resource("codeclone://latest/changed") + + @resource( + "codeclone://latest/triage", + title="Latest Production Triage", + description=("Production-first triage view for the latest CodeClone MCP run."), + mime_type="application/json", + ) + def latest_triage_resource() -> str: + return service.read_resource("codeclone://latest/triage") + + @resource( + "codeclone://schema", + title="CodeClone Report Schema", + description="JSON schema-style descriptor for the canonical CodeClone report.", + mime_type="application/json", + ) + def schema_resource() -> str: + return service.read_resource("codeclone://schema") + + @resource( + "codeclone://runs/{run_id}/summary", + title="Run Summary", + description="Canonical JSON summary for a specific CodeClone MCP run.", + mime_type="application/json", + ) + def run_summary_resource(run_id: str) -> str: + return service.read_resource(f"codeclone://runs/{run_id}/summary") + + @resource( + "codeclone://runs/{run_id}/report.json", + title="Run Canonical Report", + description="Canonical JSON report for a specific CodeClone MCP run.", + mime_type="application/json", + ) + def run_report_resource(run_id: str) -> str: + return service.read_resource(f"codeclone://runs/{run_id}/report.json") + + @resource( + "codeclone://runs/{run_id}/findings/{finding_id}", + title="Run Finding", + description="Canonical JSON finding group for a specific CodeClone MCP run.", + mime_type="application/json", + ) + def run_finding_resource(run_id: str, finding_id: str) -> str: + return service.read_resource(f"codeclone://runs/{run_id}/findings/{finding_id}") + + return mcp + + +def _history_limit_arg(value: str) -> int: + try: + parsed = int(value) + except ValueError as exc: + raise argparse.ArgumentTypeError( + f"history limit must be an integer between 1 and {MAX_MCP_HISTORY_LIMIT}." 
+ ) from exc + try: + return _validated_history_limit(parsed) + except ValueError as exc: + raise argparse.ArgumentTypeError(str(exc)) from exc + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="codeclone-mcp", + description=( + "CodeClone MCP server for deterministic, baseline-aware, read-only " + "analysis of Python repositories." + ), + ) + parser.add_argument( + "--transport", + choices=("stdio", "streamable-http"), + default="stdio", + help="MCP transport to run. Defaults to stdio.", + ) + parser.add_argument( + "--host", + default="127.0.0.1", + help="Host to bind when using streamable-http.", + ) + parser.add_argument( + "--allow-remote", + action=argparse.BooleanOptionalAction, + default=False, + help=( + "Allow binding streamable-http to a non-loopback host. " + "Disabled by default because CodeClone MCP has no built-in authentication." + ), + ) + parser.add_argument( + "--port", + type=int, + default=8000, + help="Port to bind when using streamable-http.", + ) + parser.add_argument( + "--history-limit", + type=_history_limit_arg, + default=DEFAULT_MCP_HISTORY_LIMIT, + help=( + "Maximum number of in-memory analysis runs retained by the server " + f"(1-{MAX_MCP_HISTORY_LIMIT}, default: {DEFAULT_MCP_HISTORY_LIMIT})." 
+ ), + ) + parser.add_argument( + "--json-response", + action=argparse.BooleanOptionalAction, + default=True, + help="Use JSON responses for streamable-http transport.", + ) + parser.add_argument( + "--stateless-http", + action=argparse.BooleanOptionalAction, + default=True, + help="Use stateless Streamable HTTP mode when transport is streamable-http.", + ) + parser.add_argument( + "--debug", + action=argparse.BooleanOptionalAction, + default=False, + help="Enable FastMCP debug mode.", + ) + parser.add_argument( + "--log-level", + choices=("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), + default="INFO", + help="FastMCP server log level.", + ) + return parser + + +def _host_is_loopback(host: str) -> bool: + cleaned = host.strip().strip("[]") + if not cleaned: + return False + if cleaned.lower() == "localhost": + return True + try: + return ipaddress.ip_address(cleaned).is_loopback + except ValueError: + return False + + +def main() -> None: + args = build_parser().parse_args() + if ( + args.transport == "streamable-http" + and not args.allow_remote + and not _host_is_loopback(args.host) + ): + print( + ( + "Refusing to bind CodeClone MCP streamable-http to non-loopback " + f"host '{args.host}' without --allow-remote. " + "The server has no built-in authentication." 
+ ), + file=sys.stderr, + ) + raise SystemExit(2) + try: + server = build_mcp_server( + history_limit=args.history_limit, + host=args.host, + port=args.port, + json_response=args.json_response, + stateless_http=args.stateless_http, + debug=args.debug, + log_level=args.log_level, + ) + except MCPDependencyError as exc: + print(str(exc), file=sys.stderr) + raise SystemExit(2) from exc + try: + server.run(transport=args.transport) + except KeyboardInterrupt: + return + + +__all__ = [ + "MCPDependencyError", + "build_mcp_server", + "build_parser", + "main", +] diff --git a/codeclone/mcp_service.py b/codeclone/mcp_service.py new file mode 100644 index 0000000..fd86ee4 --- /dev/null +++ b/codeclone/mcp_service.py @@ -0,0 +1,3862 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import hashlib +import json +import subprocess +from argparse import Namespace +from collections import OrderedDict +from collections.abc import Iterable, Mapping, Sequence +from dataclasses import dataclass +from pathlib import Path +from threading import RLock +from typing import Final, Literal, cast + +from . 
import __version__ +from ._cli_args import ( + DEFAULT_BASELINE_PATH, + DEFAULT_BLOCK_MIN_LOC, + DEFAULT_BLOCK_MIN_STMT, + DEFAULT_MAX_BASELINE_SIZE_MB, + DEFAULT_MAX_CACHE_SIZE_MB, + DEFAULT_MIN_LOC, + DEFAULT_MIN_STMT, + DEFAULT_SEGMENT_MIN_LOC, + DEFAULT_SEGMENT_MIN_STMT, +) +from ._cli_baselines import ( + CloneBaselineState, + MetricsBaselineState, + probe_metrics_baseline_section, + resolve_clone_baseline_state, + resolve_metrics_baseline_state, +) +from ._cli_config import ConfigValidationError, load_pyproject_config +from ._cli_meta import _build_report_meta, _current_report_timestamp_utc +from ._cli_runtime import ( + resolve_cache_path, + resolve_cache_status, + validate_numeric_args, +) +from ._coerce import as_float as _as_float +from ._coerce import as_int as _as_int +from .baseline import Baseline +from .cache import Cache, CacheStatus +from .contracts import ( + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + REPORT_SCHEMA_VERSION, + ExitCode, +) +from .domain.findings import ( + CATEGORY_CLONE, + CATEGORY_COHESION, + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_DEAD_CODE, + CATEGORY_DEPENDENCY, + CATEGORY_STRUCTURAL, + CLONE_KIND_SEGMENT, + FAMILY_CLONE, + FAMILY_CLONES, + FAMILY_DEAD_CODE, + FAMILY_DESIGN, + FAMILY_STRUCTURAL, +) +from .domain.quality import ( + CONFIDENCE_HIGH, + CONFIDENCE_LOW, + CONFIDENCE_MEDIUM, + EFFORT_EASY, + EFFORT_HARD, + EFFORT_MODERATE, + SEVERITY_CRITICAL, + SEVERITY_INFO, + SEVERITY_WARNING, +) +from .domain.source_scope import ( + SOURCE_KIND_FIXTURES, + SOURCE_KIND_MIXED, + SOURCE_KIND_ORDER, + SOURCE_KIND_OTHER, + SOURCE_KIND_PRODUCTION, + SOURCE_KIND_TESTS, +) +from .models import MetricsDiff, ProjectMetrics, Suggestion +from .pipeline import ( + GatingResult, + MetricGateConfig, + OutputPaths, + analyze, + bootstrap, + discover, + metric_gate_reasons, + process, + report, +) +from .report.json_contract import ( + 
clone_group_id, + dead_code_group_id, + design_group_id, + structural_group_id, +) + +AnalysisMode = Literal["full", "clones_only"] +CachePolicy = Literal["reuse", "refresh", "off"] +FreshnessKind = Literal["fresh", "mixed", "reused"] +HotlistKind = Literal[ + "most_actionable", + "highest_spread", + "highest_priority", + "production_hotspots", + "test_fixture_hotspots", +] +FindingFamilyFilter = Literal["all", "clone", "structural", "dead_code", "design"] +FindingNoveltyFilter = Literal["all", "new", "known"] +FindingSort = Literal["default", "priority", "severity", "spread"] +DetailLevel = Literal["summary", "normal", "full"] +ComparisonFocus = Literal["all", "clones", "structural", "metrics"] +PRSummaryFormat = Literal["markdown", "json"] +MetricsDetailFamily = Literal[ + "complexity", + "coupling", + "cohesion", + "dependencies", + "dead_code", + "health", +] +ReportSection = Literal[ + "all", + "meta", + "inventory", + "findings", + "metrics", + "metrics_detail", + "derived", + "changed", + "integrity", +] + +_LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser() +_REPORT_DUMMY_PATH = Path(".cache/codeclone/report.json") +_MCP_CONFIG_KEYS = frozenset( + { + "min_loc", + "min_stmt", + "block_min_loc", + "block_min_stmt", + "segment_min_loc", + "segment_min_stmt", + "processes", + "cache_path", + "max_cache_size_mb", + "baseline", + "max_baseline_size_mb", + "metrics_baseline", + } +) +_RESOURCE_SECTION_MAP: Final[dict[str, ReportSection]] = { + "report.json": "all", + "summary": "meta", + "health": "metrics", + "changed": "changed", + "overview": "derived", +} +_SEVERITY_WEIGHT: Final[dict[str, float]] = { + SEVERITY_CRITICAL: 1.0, + SEVERITY_WARNING: 0.6, + SEVERITY_INFO: 0.2, +} +_EFFORT_WEIGHT: Final[dict[str, float]] = { + EFFORT_EASY: 1.0, + EFFORT_MODERATE: 0.6, + EFFORT_HARD: 0.3, +} +_NOVELTY_WEIGHT: Final[dict[str, float]] = {"new": 1.0, "known": 0.5} +_RUNTIME_WEIGHT: Final[dict[str, float]] = { + "production": 1.0, + "mixed": 0.8, + 
"tests": 0.4, + "fixtures": 0.2, + "other": 0.5, +} +_CONFIDENCE_WEIGHT: Final[dict[str, float]] = { + CONFIDENCE_HIGH: 1.0, + CONFIDENCE_MEDIUM: 0.7, + CONFIDENCE_LOW: 0.3, +} +# Canonical report groups use FAMILY_CLONES ("clones"), while individual finding +# payloads use FAMILY_CLONE ("clone"). +_VALID_ANALYSIS_MODES = frozenset({"full", "clones_only"}) +_VALID_CACHE_POLICIES = frozenset({"reuse", "refresh", "off"}) +_VALID_FINDING_FAMILIES = frozenset( + {"all", "clone", "structural", "dead_code", "design"} +) +_VALID_FINDING_NOVELTY = frozenset({"all", "new", "known"}) +_VALID_FINDING_SORT = frozenset({"default", "priority", "severity", "spread"}) +_VALID_DETAIL_LEVELS = frozenset({"summary", "normal", "full"}) +_VALID_COMPARISON_FOCUS = frozenset({"all", "clones", "structural", "metrics"}) +_VALID_PR_SUMMARY_FORMATS = frozenset({"markdown", "json"}) +DEFAULT_MCP_HISTORY_LIMIT = 4 +MAX_MCP_HISTORY_LIMIT = 10 +_VALID_REPORT_SECTIONS = frozenset( + { + "all", + "meta", + "inventory", + "findings", + "metrics", + "metrics_detail", + "derived", + "changed", + "integrity", + } +) +_VALID_HOTLIST_KINDS = frozenset( + { + "most_actionable", + "highest_spread", + "highest_priority", + "production_hotspots", + "test_fixture_hotspots", + } +) +_VALID_SEVERITIES = frozenset({SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO}) +_SOURCE_KIND_BREAKDOWN_ORDER: Final[tuple[str, ...]] = ( + SOURCE_KIND_PRODUCTION, + SOURCE_KIND_TESTS, + SOURCE_KIND_FIXTURES, + SOURCE_KIND_MIXED, + SOURCE_KIND_OTHER, +) +_HOTLIST_REPORT_KEYS: Final[dict[str, str]] = { + "most_actionable": "most_actionable_ids", + "highest_spread": "highest_spread_ids", + "production_hotspots": "production_hotspot_ids", + "test_fixture_hotspots": "test_fixture_hotspot_ids", +} +_CHECK_TO_DIMENSION: Final[dict[str, str]] = { + "cohesion": "cohesion", + "coupling": "coupling", + "dead_code": "dead_code", + "complexity": "complexity", + "clones": "clones", +} +_VALID_METRICS_DETAIL_FAMILIES = frozenset( + { + 
"complexity", + "coupling", + "cohesion", + "dependencies", + "dead_code", + "health", + } +) +_SHORT_RUN_ID_LENGTH = 8 +_SHORT_HASH_ID_LENGTH = 6 + + +def _suggestion_finding_id_payload(suggestion: object) -> str: + if not hasattr(suggestion, "finding_family"): + return "" + family = str(getattr(suggestion, "finding_family", "")).strip() + if family == FAMILY_CLONES: + kind = str(getattr(suggestion, "finding_kind", "")).strip() + subject_key = str(getattr(suggestion, "subject_key", "")).strip() + return clone_group_id(kind or CLONE_KIND_SEGMENT, subject_key) + if family == FAMILY_STRUCTURAL: + return structural_group_id( + str(getattr(suggestion, "finding_kind", "")).strip() or CATEGORY_STRUCTURAL, + str(getattr(suggestion, "subject_key", "")).strip(), + ) + category = str(getattr(suggestion, "category", "")).strip() + subject_key = str(getattr(suggestion, "subject_key", "")).strip() + if category == CATEGORY_DEAD_CODE: + return dead_code_group_id(subject_key) + return design_group_id( + category, + subject_key or str(getattr(suggestion, "title", "")), + ) + + +@dataclass(frozen=True, slots=True) +class _CloneShortIdEntry: + canonical_id: str + alias: str + token: str + suffix: str + + def render(self, prefix_length: int) -> str: + if prefix_length <= 0: + prefix_length = len(self.token) + return f"{self.alias}:{self.token[:prefix_length]}{self.suffix}" + + +def _partitioned_short_id(alias: str, remainder: str) -> str: + first, _, rest = remainder.partition(":") + return f"{alias}:{first}:{rest}" if rest else f"{alias}:{first}" + + +def _clone_short_id_entry_payload(canonical_id: str) -> _CloneShortIdEntry: + _prefix, _, remainder = canonical_id.partition(":") + clone_kind, _, group_key = remainder.partition(":") + hashes = [part for part in group_key.split("|") if part] + if clone_kind == "function": + fingerprint = hashes[0] if hashes else group_key + bucket = "" + if "|" in group_key: + bucket = "|" + group_key.split("|")[-1] + return _CloneShortIdEntry( + 
canonical_id=canonical_id, + alias="fn", + token=fingerprint, + suffix=bucket, + ) + alias = {"block": "blk", "segment": "seg"}.get(clone_kind, "clone") + combined = "|".join(hashes) if hashes else group_key + token = hashlib.sha256(combined.encode()).hexdigest() + return _CloneShortIdEntry( + canonical_id=canonical_id, + alias=alias, + token=token, + suffix=f"|x{len(hashes) or 1}", + ) + + +def _disambiguated_clone_short_ids_payload( + canonical_ids: Sequence[str], +) -> dict[str, str]: + clone_entries = [ + _clone_short_id_entry_payload(canonical_id) for canonical_id in canonical_ids + ] + max_token_length = max((len(entry.token) for entry in clone_entries), default=0) + for prefix_length in range(_SHORT_HASH_ID_LENGTH + 2, max_token_length + 1, 2): + candidates = { + entry.canonical_id: entry.render(prefix_length) for entry in clone_entries + } + if len(set(candidates.values())) == len(candidates): + return candidates + return { + entry.canonical_id: entry.render(max_token_length) for entry in clone_entries + } + + +def _leaf_symbol_name_payload(value: object) -> str: + text = str(value).strip() + if not text: + return "" + if ":" in text: + text = text.rsplit(":", maxsplit=1)[-1] + if "." 
in text: + text = text.rsplit(".", maxsplit=1)[-1] + return text + + +def _base_short_finding_id_payload(canonical_id: str) -> str: + prefix, _, remainder = canonical_id.partition(":") + if prefix == "clone": + return _clone_short_id_entry_payload(canonical_id).render(_SHORT_HASH_ID_LENGTH) + if prefix == "structural": + finding_kind, _, finding_key = remainder.partition(":") + return f"struct:{finding_kind}:{finding_key[:_SHORT_HASH_ID_LENGTH]}" + if prefix == "dead_code": + return f"dead:{_leaf_symbol_name_payload(remainder)}" + if prefix == "design": + category, _, subject_key = remainder.partition(":") + return f"design:{category}:{_leaf_symbol_name_payload(subject_key)}" + return canonical_id + + +def _disambiguated_short_finding_id_payload(canonical_id: str) -> str: + prefix, _, remainder = canonical_id.partition(":") + if prefix == "clone": + return _clone_short_id_entry_payload(canonical_id).render(0) + if prefix == "structural": + return _partitioned_short_id("struct", remainder) + if prefix == "dead_code": + return f"dead:{remainder}" + if prefix == "design": + return _partitioned_short_id("design", remainder) + return canonical_id + + +def _json_text_payload( + payload: object, + *, + sort_keys: bool = True, +) -> str: + return json.dumps( + payload, + ensure_ascii=False, + indent=2, + sort_keys=sort_keys, + ) + + +def _git_diff_lines_payload( + *, + root_path: Path, + git_diff_ref: str, +) -> tuple[str, ...]: + if git_diff_ref.startswith("-"): + raise MCPGitDiffError( + f"Invalid git diff ref '{git_diff_ref}': must not start with '-'." + ) + try: + completed = subprocess.run( + ["git", "diff", "--name-only", git_diff_ref, "--"], + cwd=root_path, + check=True, + capture_output=True, + text=True, + timeout=30, + ) + except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc: + raise MCPGitDiffError( + f"Unable to resolve changed paths from git diff ref '{git_diff_ref}'." 
+ ) from exc + return tuple( + sorted({line.strip() for line in completed.stdout.splitlines() if line.strip()}) + ) + + +def _load_report_document_payload(report_json: str) -> dict[str, object]: + try: + payload = json.loads(report_json) + except json.JSONDecodeError as exc: + raise MCPServiceError( + f"Generated canonical report is not valid JSON: {exc}" + ) from exc + if not isinstance(payload, dict): + raise MCPServiceError("Generated canonical report must be a JSON object.") + return dict(payload) + + +def _validated_history_limit(history_limit: int) -> int: + if not 1 <= history_limit <= MAX_MCP_HISTORY_LIMIT: + raise ValueError( + f"history_limit must be between 1 and {MAX_MCP_HISTORY_LIMIT}." + ) + return history_limit + + +class MCPServiceError(RuntimeError): + """Base class for CodeClone MCP service errors.""" + + +class MCPServiceContractError(MCPServiceError): + """Raised when an MCP request violates the CodeClone service contract.""" + + +class MCPRunNotFoundError(MCPServiceError): + """Raised when a requested MCP run is not available in the in-memory registry.""" + + +class MCPFindingNotFoundError(MCPServiceError): + """Raised when a requested finding id is not present in the selected run.""" + + +class MCPGitDiffError(MCPServiceError): + """Raised when changed paths cannot be resolved from a git ref.""" + + +class _BufferConsole: + def __init__(self) -> None: + self.messages: list[str] = [] + + def print(self, *objects: object, **_kwargs: object) -> None: + text = " ".join(str(obj) for obj in objects).strip() + if text: + self.messages.append(text) + + +@dataclass(frozen=True, slots=True) +class MCPAnalysisRequest: + root: str | None = None + analysis_mode: AnalysisMode = "full" + respect_pyproject: bool = True + changed_paths: tuple[str, ...] 
= () + git_diff_ref: str | None = None + processes: int | None = None + min_loc: int | None = None + min_stmt: int | None = None + block_min_loc: int | None = None + block_min_stmt: int | None = None + segment_min_loc: int | None = None + segment_min_stmt: int | None = None + complexity_threshold: int | None = None + coupling_threshold: int | None = None + cohesion_threshold: int | None = None + baseline_path: str | None = None + metrics_baseline_path: str | None = None + max_baseline_size_mb: int | None = None + cache_policy: CachePolicy = "reuse" + cache_path: str | None = None + max_cache_size_mb: int | None = None + + +@dataclass(frozen=True, slots=True) +class MCPGateRequest: + run_id: str | None = None + fail_on_new: bool = False + fail_threshold: int = -1 + fail_complexity: int = -1 + fail_coupling: int = -1 + fail_cohesion: int = -1 + fail_cycles: bool = False + fail_dead_code: bool = False + fail_health: int = -1 + fail_on_new_metrics: bool = False + + +@dataclass(frozen=True, slots=True) +class MCPRunRecord: + run_id: str + root: Path + request: MCPAnalysisRequest + comparison_settings: tuple[object, ...] + report_document: dict[str, object] + summary: dict[str, object] + changed_paths: tuple[str, ...] + changed_projection: dict[str, object] | None + warnings: tuple[str, ...] + failures: tuple[str, ...] + func_clones_count: int + block_clones_count: int + project_metrics: ProjectMetrics | None + suggestions: tuple[Suggestion, ...] 
+ new_func: frozenset[str] + new_block: frozenset[str] + metrics_diff: MetricsDiff | None + + +class CodeCloneMCPRunStore: + def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None: + self._history_limit = _validated_history_limit(history_limit) + self._lock = RLock() + self._records: OrderedDict[str, MCPRunRecord] = OrderedDict() + self._latest_run_id: str | None = None + + def register(self, record: MCPRunRecord) -> MCPRunRecord: + with self._lock: + self._records.pop(record.run_id, None) + self._records[record.run_id] = record + self._records.move_to_end(record.run_id) + self._latest_run_id = record.run_id + while len(self._records) > self._history_limit: + self._records.popitem(last=False) + return record + + def get(self, run_id: str | None = None) -> MCPRunRecord: + with self._lock: + resolved_run_id = self._resolve_run_id(run_id) + if resolved_run_id is None: + raise MCPRunNotFoundError("No matching MCP analysis run is available.") + return self._records[resolved_run_id] + + def _resolve_run_id(self, run_id: str | None) -> str | None: + if run_id is None: + return self._latest_run_id + if run_id in self._records: + return run_id + matches = [ + candidate for candidate in self._records if candidate.startswith(run_id) + ] + if len(matches) == 1: + return matches[0] + if len(matches) > 1: + raise MCPServiceContractError( + f"Run id '{run_id}' is ambiguous in this MCP session." 
+ ) + return None + + def records(self) -> tuple[MCPRunRecord, ...]: + with self._lock: + return tuple(self._records.values()) + + def clear(self) -> tuple[str, ...]: + with self._lock: + removed_run_ids = tuple(self._records.keys()) + self._records.clear() + self._latest_run_id = None + return removed_run_ids + + +class CodeCloneMCPService: + def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None: + self._runs = CodeCloneMCPRunStore(history_limit=history_limit) + self._state_lock = RLock() + self._review_state: dict[str, OrderedDict[str, str | None]] = {} + self._last_gate_results: dict[str, dict[str, object]] = {} + self._spread_max_cache: dict[str, int] = {} + + def analyze_repository(self, request: MCPAnalysisRequest) -> dict[str, object]: + self._validate_analysis_request(request) + root_path = self._resolve_root(request.root) + analysis_started_at_utc = _current_report_timestamp_utc() + changed_paths = self._resolve_request_changed_paths( + root_path=root_path, + changed_paths=request.changed_paths, + git_diff_ref=request.git_diff_ref, + ) + args = self._build_args(root_path=root_path, request=request) + ( + baseline_path, + baseline_exists, + metrics_baseline_path, + metrics_baseline_exists, + shared_baseline_payload, + ) = self._resolve_baseline_inputs(root_path=root_path, args=args) + cache_path = self._resolve_cache_path(root_path=root_path, args=args) + cache = self._build_cache( + root_path=root_path, + args=args, + cache_path=cache_path, + policy=request.cache_policy, + ) + console = _BufferConsole() + + boot = bootstrap( + args=args, + root=root_path, + output_paths=OutputPaths(json=_REPORT_DUMMY_PATH), + cache_path=cache_path, + ) + discovery_result = discover(boot=boot, cache=cache) + processing_result = process(boot=boot, discovery=discovery_result, cache=cache) + analysis_result = analyze( + boot=boot, + discovery=discovery_result, + processing=processing_result, + ) + + clone_baseline_state = resolve_clone_baseline_state( + 
args=args, + baseline_path=baseline_path, + baseline_exists=baseline_exists, + func_groups=analysis_result.func_groups, + block_groups=analysis_result.block_groups, + codeclone_version=__version__, + console=console, + shared_baseline_payload=( + shared_baseline_payload + if metrics_baseline_path == baseline_path + else None + ), + ) + metrics_baseline_state = resolve_metrics_baseline_state( + args=args, + metrics_baseline_path=metrics_baseline_path, + metrics_baseline_exists=metrics_baseline_exists, + baseline_updated_path=clone_baseline_state.updated_path, + project_metrics=analysis_result.project_metrics, + console=console, + shared_baseline_payload=( + shared_baseline_payload + if metrics_baseline_path == baseline_path + else None + ), + ) + + cache_status, cache_schema_version = resolve_cache_status(cache) + report_meta = _build_report_meta( + codeclone_version=__version__, + scan_root=root_path, + baseline_path=baseline_path, + baseline=clone_baseline_state.baseline, + baseline_loaded=clone_baseline_state.loaded, + baseline_status=clone_baseline_state.status.value, + cache_path=cache_path, + cache_used=cache_status == CacheStatus.OK, + cache_status=cache_status.value, + cache_schema_version=cache_schema_version, + files_skipped_source_io=len(processing_result.source_read_failures), + metrics_baseline_path=metrics_baseline_path, + metrics_baseline=metrics_baseline_state.baseline, + metrics_baseline_loaded=metrics_baseline_state.loaded, + metrics_baseline_status=metrics_baseline_state.status.value, + health_score=( + analysis_result.project_metrics.health.total + if analysis_result.project_metrics is not None + else None + ), + health_grade=( + analysis_result.project_metrics.health.grade + if analysis_result.project_metrics is not None + else None + ), + analysis_mode=request.analysis_mode, + metrics_computed=self._metrics_computed(request.analysis_mode), + design_complexity_threshold=_as_int( + getattr( + args, + "design_complexity_threshold", + 
DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ), + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ), + design_coupling_threshold=_as_int( + getattr( + args, + "design_coupling_threshold", + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ), + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ), + design_cohesion_threshold=_as_int( + getattr( + args, + "design_cohesion_threshold", + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ), + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ), + analysis_started_at_utc=analysis_started_at_utc, + report_generated_at_utc=_current_report_timestamp_utc(), + ) + + baseline_for_diff = ( + clone_baseline_state.baseline + if clone_baseline_state.trusted_for_diff + else Baseline(baseline_path) + ) + new_func, new_block = baseline_for_diff.diff( + analysis_result.func_groups, + analysis_result.block_groups, + ) + metrics_diff = None + if ( + analysis_result.project_metrics is not None + and metrics_baseline_state.trusted_for_diff + ): + metrics_diff = metrics_baseline_state.baseline.diff( + analysis_result.project_metrics + ) + + report_artifacts = report( + boot=boot, + discovery=discovery_result, + processing=processing_result, + analysis=analysis_result, + report_meta=report_meta, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + ) + report_json = report_artifacts.json + if report_json is None: + raise MCPServiceError("CodeClone MCP expected a canonical JSON report.") + report_document = self._load_report_document(report_json) + run_id = self._report_digest(report_document) + + warning_items = set(console.messages) + if cache.load_warning: + warning_items.add(cache.load_warning) + warning_items.update(discovery_result.skipped_warnings) + warnings = tuple(sorted(warning_items)) + failures = tuple( + sorted( + { + *processing_result.failed_files, + *processing_result.source_read_failures, + } + ) + ) + + base_summary = self._build_run_summary_payload( + run_id=run_id, + root_path=root_path, + request=request, + 
report_document=report_document, + baseline_state=clone_baseline_state, + metrics_baseline_state=metrics_baseline_state, + cache_status=cache_status, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + warnings=warnings, + failures=failures, + ) + provisional_record = MCPRunRecord( + run_id=run_id, + root=root_path, + request=request, + comparison_settings=self._comparison_settings(args=args, request=request), + report_document=report_document, + summary=base_summary, + changed_paths=changed_paths, + changed_projection=None, + warnings=warnings, + failures=failures, + func_clones_count=analysis_result.func_clones_count, + block_clones_count=analysis_result.block_clones_count, + project_metrics=analysis_result.project_metrics, + suggestions=analysis_result.suggestions, + new_func=frozenset(new_func), + new_block=frozenset(new_block), + metrics_diff=metrics_diff, + ) + changed_projection = self._build_changed_projection(provisional_record) + summary = self._augment_summary_with_changed( + summary=base_summary, + changed_paths=changed_paths, + changed_projection=changed_projection, + ) + record = MCPRunRecord( + run_id=run_id, + root=root_path, + request=request, + comparison_settings=self._comparison_settings(args=args, request=request), + report_document=report_document, + summary=summary, + changed_paths=changed_paths, + changed_projection=changed_projection, + warnings=warnings, + failures=failures, + func_clones_count=analysis_result.func_clones_count, + block_clones_count=analysis_result.block_clones_count, + project_metrics=analysis_result.project_metrics, + suggestions=analysis_result.suggestions, + new_func=frozenset(new_func), + new_block=frozenset(new_block), + metrics_diff=metrics_diff, + ) + self._runs.register(record) + self._prune_session_state() + return self._summary_payload(record.summary, record=record) + + def analyze_changed_paths(self, request: MCPAnalysisRequest) -> dict[str, object]: + if not request.changed_paths and 
request.git_diff_ref is None: + raise MCPServiceContractError( + "analyze_changed_paths requires changed_paths or git_diff_ref." + ) + analysis_summary = self.analyze_repository(request) + record = self._runs.get(str(analysis_summary.get("run_id", "")) or None) + return self._changed_analysis_payload(record) + + def get_run_summary(self, run_id: str | None = None) -> dict[str, object]: + record = self._runs.get(run_id) + return self._summary_payload(record.summary, record=record) + + def compare_runs( + self, + *, + run_id_before: str, + run_id_after: str | None = None, + focus: ComparisonFocus = "all", + ) -> dict[str, object]: + validated_focus = cast( + "ComparisonFocus", + self._validate_choice("focus", focus, _VALID_COMPARISON_FOCUS), + ) + before = self._runs.get(run_id_before) + after = self._runs.get(run_id_after) + before_findings = self._comparison_index(before, focus=validated_focus) + after_findings = self._comparison_index(after, focus=validated_focus) + before_ids = set(before_findings) + after_ids = set(after_findings) + regressions = sorted(after_ids - before_ids) + improvements = sorted(before_ids - after_ids) + common = before_ids & after_ids + health_before = self._summary_health_score(before.summary) + health_after = self._summary_health_score(after.summary) + comparability = self._comparison_scope(before=before, after=after) + comparable = bool(comparability["comparable"]) + health_delta = ( + health_after - health_before + if comparable and health_before is not None and health_after is not None + else None + ) + verdict = ( + self._comparison_verdict( + regressions=len(regressions), + improvements=len(improvements), + health_delta=health_delta, + ) + if comparable + else "incomparable" + ) + regressions_payload = ( + [ + self._comparison_finding_card( + after, + after_findings[finding_id], + ) + for finding_id in regressions + ] + if comparable + else [] + ) + improvements_payload = ( + [ + self._comparison_finding_card( + before, + 
before_findings[finding_id], + ) + for finding_id in improvements + ] + if comparable + else [] + ) + payload: dict[str, object] = { + "before": { + "run_id": self._short_run_id(before.run_id), + "health": health_before, + }, + "after": { + "run_id": self._short_run_id(after.run_id), + "health": health_after, + }, + "comparable": comparable, + "health_delta": health_delta, + "verdict": verdict, + "regressions": regressions_payload, + "improvements": improvements_payload, + "unchanged": len(common) if comparable else None, + "summary": self._comparison_summary_text( + comparable=comparable, + comparability_reason=str(comparability["reason"]), + regressions=len(regressions), + improvements=len(improvements), + health_delta=health_delta, + ), + } + if not comparable: + payload["reason"] = comparability["reason"] + return payload + + def evaluate_gates(self, request: MCPGateRequest) -> dict[str, object]: + record = self._runs.get(request.run_id) + gate_result = self._evaluate_gate_snapshot(record=record, request=request) + result = { + "run_id": self._short_run_id(record.run_id), + "would_fail": gate_result.exit_code != 0, + "exit_code": gate_result.exit_code, + "reasons": list(gate_result.reasons), + "config": { + "fail_on_new": request.fail_on_new, + "fail_threshold": request.fail_threshold, + "fail_complexity": request.fail_complexity, + "fail_coupling": request.fail_coupling, + "fail_cohesion": request.fail_cohesion, + "fail_cycles": request.fail_cycles, + "fail_dead_code": request.fail_dead_code, + "fail_health": request.fail_health, + "fail_on_new_metrics": request.fail_on_new_metrics, + }, + } + with self._state_lock: + self._last_gate_results[record.run_id] = dict(result) + return result + + def _evaluate_gate_snapshot( + self, + *, + record: MCPRunRecord, + request: MCPGateRequest, + ) -> GatingResult: + reasons: list[str] = [] + if record.project_metrics is not None: + metric_reasons = metric_gate_reasons( + project_metrics=record.project_metrics, + 
metrics_diff=record.metrics_diff, + config=MetricGateConfig( + fail_complexity=request.fail_complexity, + fail_coupling=request.fail_coupling, + fail_cohesion=request.fail_cohesion, + fail_cycles=request.fail_cycles, + fail_dead_code=request.fail_dead_code, + fail_health=request.fail_health, + fail_on_new_metrics=request.fail_on_new_metrics, + ), + ) + reasons.extend(f"metric:{reason}" for reason in metric_reasons) + + if request.fail_on_new and (record.new_func or record.new_block): + reasons.append("clone:new") + + total_clone_groups = record.func_clones_count + record.block_clones_count + if 0 <= request.fail_threshold < total_clone_groups: + reasons.append( + f"clone:threshold:{total_clone_groups}:{request.fail_threshold}" + ) + + if reasons: + return GatingResult( + exit_code=int(ExitCode.GATING_FAILURE), + reasons=tuple(reasons), + ) + return GatingResult(exit_code=int(ExitCode.SUCCESS), reasons=()) + + def get_report_section( + self, + *, + run_id: str | None = None, + section: ReportSection = "all", + family: MetricsDetailFamily | None = None, + path: str | None = None, + offset: int = 0, + limit: int = 50, + ) -> dict[str, object]: + validated_section = cast( + "ReportSection", + self._validate_choice("section", section, _VALID_REPORT_SECTIONS), + ) + record = self._runs.get(run_id) + report_document = record.report_document + if validated_section == "all": + return dict(report_document) + if validated_section == "changed": + if record.changed_projection is None: + raise MCPServiceContractError( + "Report section 'changed' is not available in this run." 
+ ) + return dict(record.changed_projection) + if validated_section == "metrics": + metrics = self._as_mapping(report_document.get("metrics")) + return {"summary": dict(self._as_mapping(metrics.get("summary")))} + if validated_section == "metrics_detail": + metrics = self._as_mapping(report_document.get("metrics")) + if not metrics: + raise MCPServiceContractError( + "Report section 'metrics_detail' is not available in this run." + ) + validated_family = cast( + "MetricsDetailFamily | None", + self._validate_optional_choice( + "family", + family, + _VALID_METRICS_DETAIL_FAMILIES, + ), + ) + return self._metrics_detail_payload( + metrics=metrics, + family=validated_family, + path=path, + offset=offset, + limit=limit, + ) + if validated_section == "derived": + return self._derived_section_payload(record) + payload = report_document.get(validated_section) + if not isinstance(payload, Mapping): + raise MCPServiceContractError( + f"Report section '{validated_section}' is not available in this run." 
+ ) + return dict(payload) + + def list_findings( + self, + *, + run_id: str | None = None, + family: FindingFamilyFilter = "all", + category: str | None = None, + severity: str | None = None, + source_kind: str | None = None, + novelty: FindingNoveltyFilter = "all", + sort_by: FindingSort = "default", + detail_level: DetailLevel = "summary", + changed_paths: Sequence[str] = (), + git_diff_ref: str | None = None, + exclude_reviewed: bool = False, + offset: int = 0, + limit: int = 50, + max_results: int | None = None, + ) -> dict[str, object]: + validated_family = cast( + "FindingFamilyFilter", + self._validate_choice("family", family, _VALID_FINDING_FAMILIES), + ) + validated_novelty = cast( + "FindingNoveltyFilter", + self._validate_choice("novelty", novelty, _VALID_FINDING_NOVELTY), + ) + validated_sort = cast( + "FindingSort", + self._validate_choice("sort_by", sort_by, _VALID_FINDING_SORT), + ) + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + validated_severity = self._validate_optional_choice( + "severity", + severity, + _VALID_SEVERITIES, + ) + record = self._runs.get(run_id) + paths_filter = self._resolve_query_changed_paths( + record=record, + changed_paths=changed_paths, + git_diff_ref=git_diff_ref, + ) + normalized_limit = max( + 1, + min(max_results if max_results is not None else limit, 200), + ) + filtered = self._query_findings( + record=record, + family=validated_family, + category=category, + severity=validated_severity, + source_kind=source_kind, + novelty=validated_novelty, + sort_by=validated_sort, + detail_level=validated_detail, + changed_paths=paths_filter, + exclude_reviewed=exclude_reviewed, + ) + total = len(filtered) + normalized_offset = max(0, offset) + items = filtered[normalized_offset : normalized_offset + normalized_limit] + next_offset = normalized_offset + len(items) + return { + "run_id": self._short_run_id(record.run_id), + "detail_level": 
validated_detail, + "sort_by": validated_sort, + "changed_paths": list(paths_filter), + "offset": normalized_offset, + "limit": normalized_limit, + "returned": len(items), + "total": total, + "next_offset": next_offset if next_offset < total else None, + "items": items, + } + + def get_finding( + self, + *, + finding_id: str, + run_id: str | None = None, + detail_level: DetailLevel = "normal", + ) -> dict[str, object]: + record = self._runs.get(run_id) + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + canonical_id = self._resolve_canonical_finding_id(record, finding_id) + for finding in self._base_findings(record): + if str(finding.get("id")) == canonical_id: + return self._decorate_finding( + record, + finding, + detail_level=validated_detail, + ) + raise MCPFindingNotFoundError( + f"Finding id '{finding_id}' was not found in run " + f"'{self._short_run_id(record.run_id)}'." + ) + + def get_remediation( + self, + *, + finding_id: str, + run_id: str | None = None, + detail_level: DetailLevel = "normal", + ) -> dict[str, object]: + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + record = self._runs.get(run_id) + canonical_id = self._resolve_canonical_finding_id(record, finding_id) + finding = self.get_finding( + finding_id=canonical_id, + run_id=record.run_id, + detail_level="full", + ) + remediation = self._as_mapping(finding.get("remediation")) + if not remediation: + raise MCPFindingNotFoundError( + f"Finding id '{finding_id}' does not expose remediation guidance." 
+ ) + return { + "run_id": self._short_run_id(record.run_id), + "finding_id": self._short_finding_id(record, canonical_id), + "detail_level": validated_detail, + "remediation": self._project_remediation( + remediation, + detail_level=validated_detail, + ), + } + + def list_hotspots( + self, + *, + kind: HotlistKind, + run_id: str | None = None, + detail_level: DetailLevel = "summary", + changed_paths: Sequence[str] = (), + git_diff_ref: str | None = None, + exclude_reviewed: bool = False, + limit: int = 10, + max_results: int | None = None, + ) -> dict[str, object]: + validated_kind = cast( + "HotlistKind", + self._validate_choice("kind", kind, _VALID_HOTLIST_KINDS), + ) + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + record = self._runs.get(run_id) + paths_filter = self._resolve_query_changed_paths( + record=record, + changed_paths=changed_paths, + git_diff_ref=git_diff_ref, + ) + rows = self._hotspot_rows( + record=record, + kind=validated_kind, + detail_level=validated_detail, + changed_paths=paths_filter, + exclude_reviewed=exclude_reviewed, + ) + normalized_limit = max( + 1, + min(max_results if max_results is not None else limit, 50), + ) + return { + "run_id": self._short_run_id(record.run_id), + "kind": validated_kind, + "detail_level": validated_detail, + "changed_paths": list(paths_filter), + "returned": min(len(rows), normalized_limit), + "total": len(rows), + "items": [dict(self._as_mapping(item)) for item in rows[:normalized_limit]], + } + + def get_production_triage( + self, + *, + run_id: str | None = None, + max_hotspots: int = 3, + max_suggestions: int = 3, + ) -> dict[str, object]: + record = self._runs.get(run_id) + summary = self._summary_payload(record.summary, record=record) + findings = self._base_findings(record) + findings_breakdown = self._source_kind_breakdown( + self._finding_source_kind(finding) for finding in findings + ) + suggestion_rows = 
self._triage_suggestion_rows(record) + suggestion_breakdown = self._source_kind_breakdown( + row.get("source_kind") for row in suggestion_rows + ) + hotspot_limit = max(1, min(max_hotspots, 10)) + suggestion_limit = max(1, min(max_suggestions, 10)) + production_hotspots = self._hotspot_rows( + record=record, + kind="production_hotspots", + detail_level="summary", + changed_paths=(), + exclude_reviewed=False, + ) + production_suggestions = [ + dict(row) + for row in suggestion_rows + if str(row.get("source_kind", "")) == SOURCE_KIND_PRODUCTION + ] + return { + "run_id": self._short_run_id(record.run_id), + "health": dict(self._summary_health_payload(summary)), + "cache": dict(self._as_mapping(summary.get("cache"))), + "findings": { + "total": len(findings), + "by_source_kind": findings_breakdown, + "outside_focus": len(findings) + - findings_breakdown[SOURCE_KIND_PRODUCTION], + }, + "top_hotspots": { + "kind": "production_hotspots", + "available": len(production_hotspots), + "returned": min(len(production_hotspots), hotspot_limit), + "items": [ + dict(self._as_mapping(item)) + for item in production_hotspots[:hotspot_limit] + ], + }, + "suggestions": { + "total": len(suggestion_rows), + "by_source_kind": suggestion_breakdown, + "outside_focus": len(suggestion_rows) + - suggestion_breakdown[SOURCE_KIND_PRODUCTION], + }, + "top_suggestions": { + "available": len(production_suggestions), + "returned": min(len(production_suggestions), suggestion_limit), + "items": production_suggestions[:suggestion_limit], + }, + } + + def generate_pr_summary( + self, + *, + run_id: str | None = None, + changed_paths: Sequence[str] = (), + git_diff_ref: str | None = None, + format: PRSummaryFormat = "markdown", + ) -> dict[str, object]: + output_format = cast( + "PRSummaryFormat", + self._validate_choice("format", format, _VALID_PR_SUMMARY_FORMATS), + ) + record = self._runs.get(run_id) + paths_filter = self._resolve_query_changed_paths( + record=record, + changed_paths=changed_paths, + 
git_diff_ref=git_diff_ref, + prefer_record_paths=True, + ) + changed_items = self._query_findings( + record=record, + detail_level="summary", + changed_paths=paths_filter, + ) + previous = self._previous_run_for_root(record) + resolved: list[dict[str, object]] = [] + if previous is not None: + compare_payload = self.compare_runs( + run_id_before=previous.run_id, + run_id_after=record.run_id, + focus="all", + ) + resolved = cast("list[dict[str, object]]", compare_payload["improvements"]) + with self._state_lock: + gate_result = dict( + self._last_gate_results.get( + record.run_id, + {"would_fail": False, "reasons": []}, + ) + ) + verdict = self._changed_verdict( + changed_projection={ + "total": len(changed_items), + "new": sum( + 1 for item in changed_items if str(item.get("novelty", "")) == "new" + ), + }, + health_delta=self._summary_health_delta(record.summary), + ) + payload: dict[str, object] = { + "run_id": self._short_run_id(record.run_id), + "changed_files": len(paths_filter), + "health": self._summary_health_payload(record.summary), + "health_delta": self._summary_health_delta(record.summary), + "verdict": verdict, + "new_findings_in_changed_files": changed_items, + "resolved": resolved, + "blocking_gates": list(cast(Sequence[str], gate_result.get("reasons", []))), + } + if output_format == "json": + return payload + return { + "run_id": self._short_run_id(record.run_id), + "format": output_format, + "content": self._render_pr_summary_markdown(payload), + } + + def mark_finding_reviewed( + self, + *, + finding_id: str, + run_id: str | None = None, + note: str | None = None, + ) -> dict[str, object]: + record = self._runs.get(run_id) + canonical_id = self._resolve_canonical_finding_id(record, finding_id) + self.get_finding( + finding_id=canonical_id, + run_id=record.run_id, + detail_level="normal", + ) + with self._state_lock: + review_map = self._review_state.setdefault(record.run_id, OrderedDict()) + review_map[canonical_id] = ( + note.strip() if 
isinstance(note, str) and note.strip() else None + ) + review_map.move_to_end(canonical_id) + return { + "run_id": self._short_run_id(record.run_id), + "finding_id": self._short_finding_id(record, canonical_id), + "reviewed": True, + "note": review_map[canonical_id], + "reviewed_count": len(review_map), + } + + def list_reviewed_findings( + self, + *, + run_id: str | None = None, + ) -> dict[str, object]: + record = self._runs.get(run_id) + with self._state_lock: + review_items = tuple( + self._review_state.get(record.run_id, OrderedDict()).items() + ) + items = [] + for finding_id, note in review_items: + try: + finding = self.get_finding(finding_id=finding_id, run_id=record.run_id) + except MCPFindingNotFoundError: + continue + items.append( + { + "finding_id": self._short_finding_id(record, finding_id), + "note": note, + "finding": self._project_finding_detail( + record, + finding, + detail_level="summary", + ), + } + ) + return { + "run_id": self._short_run_id(record.run_id), + "reviewed_count": len(items), + "items": items, + } + + def clear_session_runs(self) -> dict[str, object]: + removed_run_ids = self._runs.clear() + with self._state_lock: + cleared_review_entries = sum( + len(entries) for entries in self._review_state.values() + ) + cleared_gate_results = len(self._last_gate_results) + cleared_spread_cache_entries = len(self._spread_max_cache) + self._review_state.clear() + self._last_gate_results.clear() + self._spread_max_cache.clear() + return { + "cleared_runs": len(removed_run_ids), + "cleared_run_ids": [ + self._short_run_id(run_id) for run_id in removed_run_ids + ], + "cleared_review_entries": cleared_review_entries, + "cleared_gate_results": cleared_gate_results, + "cleared_spread_cache_entries": cleared_spread_cache_entries, + } + + def check_complexity( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + min_complexity: int | None = None, + max_results: int = 10, + detail_level: DetailLevel = 
"summary", + ) -> dict[str, object]: + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + record = self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="full", + ) + findings = self._query_findings( + record=record, + family="design", + category=CATEGORY_COMPLEXITY, + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + if min_complexity is not None: + findings = [ + finding + for finding in findings + if _as_int( + self._as_mapping(finding.get("facts")).get( + "cyclomatic_complexity", + 0, + ) + ) + >= min_complexity + ] + return self._granular_payload( + record=record, + check="complexity", + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + ) + + def check_clones( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + clone_type: str | None = None, + source_kind: str | None = None, + max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + record = self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="clones_only", + ) + findings = self._query_findings( + record=record, + family="clone", + source_kind=source_kind, + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + if clone_type is not None: + findings = [ + finding + for finding in findings + if str(finding.get("clone_type", "")).strip() == clone_type + ] + return self._granular_payload( + record=record, + check="clones", + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + ) + + def check_coupling( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + 
max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + record = self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="full", + ) + findings = self._query_findings( + record=record, + family="design", + category=CATEGORY_COUPLING, + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + return self._granular_payload( + record=record, + check="coupling", + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + ) + + def check_cohesion( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + record = self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="full", + ) + findings = self._query_findings( + record=record, + family="design", + category=CATEGORY_COHESION, + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + return self._granular_payload( + record=record, + check="cohesion", + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + ) + + def check_dead_code( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + min_severity: str | None = None, + max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + validated_detail = cast( + "DetailLevel", + self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS), + ) + validated_min_severity = self._validate_optional_choice( + "min_severity", + min_severity, + _VALID_SEVERITIES, + ) + record = 
self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="full", + ) + findings = self._query_findings( + record=record, + family="dead_code", + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + if validated_min_severity is not None: + findings = [ + finding + for finding in findings + if self._severity_rank(str(finding.get("severity", ""))) + >= self._severity_rank(validated_min_severity) + ] + return self._granular_payload( + record=record, + check="dead_code", + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + ) + + def read_resource(self, uri: str) -> str: + if uri == "codeclone://schema": + return _json_text_payload(self._schema_resource_payload()) + if uri == "codeclone://latest/triage": + latest = self._runs.get() + return _json_text_payload(self.get_production_triage(run_id=latest.run_id)) + latest_prefix = "codeclone://latest/" + run_prefix = "codeclone://runs/" + if uri.startswith(latest_prefix): + latest = self._runs.get() + suffix = uri[len(latest_prefix) :] + return self._render_resource(latest, suffix) + if not uri.startswith(run_prefix): + raise MCPServiceContractError(f"Unsupported CodeClone resource URI: {uri}") + remainder = uri[len(run_prefix) :] + run_id, sep, suffix = remainder.partition("/") + if not sep: + raise MCPServiceContractError(f"Unsupported CodeClone resource URI: {uri}") + record = self._runs.get(run_id) + return self._render_resource(record, suffix) + + def _render_resource(self, record: MCPRunRecord, suffix: str) -> str: + if suffix == "summary": + return _json_text_payload( + self._summary_payload(record.summary, record=record) + ) + if suffix == "triage": + raise MCPServiceContractError( + "Production triage is exposed only as codeclone://latest/triage." 
+ ) + if suffix == "health": + return _json_text_payload(self._summary_health_payload(record.summary)) + if suffix == "gates": + with self._state_lock: + gate_result = self._last_gate_results.get(record.run_id) + if gate_result is None: + raise MCPServiceContractError( + "No gate evaluation result is available in this MCP session." + ) + return _json_text_payload(gate_result) + if suffix == "changed": + if record.changed_projection is None: + raise MCPServiceContractError( + "Changed-findings projection is not available in this run." + ) + return _json_text_payload(record.changed_projection) + if suffix == "schema": + return _json_text_payload(self._schema_resource_payload()) + if suffix == "report.json": + return _json_text_payload(record.report_document, sort_keys=False) + if suffix == "overview": + return _json_text_payload( + self.list_hotspots(kind="highest_spread", run_id=record.run_id) + ) + finding_prefix = "findings/" + if suffix.startswith(finding_prefix): + finding_id = suffix[len(finding_prefix) :] + return _json_text_payload( + self.get_finding(run_id=record.run_id, finding_id=finding_id) + ) + raise MCPServiceContractError( + f"Unsupported CodeClone resource suffix '{suffix}'." + ) + + def _resolve_request_changed_paths( + self, + *, + root_path: Path, + changed_paths: Sequence[str], + git_diff_ref: str | None, + ) -> tuple[str, ...]: + if changed_paths and git_diff_ref is not None: + raise MCPServiceContractError( + "Provide changed_paths or git_diff_ref, not both." 
+ ) + if git_diff_ref is not None: + return self._git_diff_paths(root_path=root_path, git_diff_ref=git_diff_ref) + if not changed_paths: + return () + return self._normalize_changed_paths(root_path=root_path, paths=changed_paths) + + def _resolve_query_changed_paths( + self, + *, + record: MCPRunRecord, + changed_paths: Sequence[str], + git_diff_ref: str | None, + prefer_record_paths: bool = False, + ) -> tuple[str, ...]: + if changed_paths or git_diff_ref is not None: + return self._resolve_request_changed_paths( + root_path=record.root, + changed_paths=changed_paths, + git_diff_ref=git_diff_ref, + ) + if prefer_record_paths: + return record.changed_paths + return () + + def _normalize_changed_paths( + self, + *, + root_path: Path, + paths: Sequence[str], + ) -> tuple[str, ...]: + normalized: set[str] = set() + for raw_path in paths: + candidate = Path(str(raw_path)).expanduser() + if candidate.is_absolute(): + try: + relative = candidate.resolve().relative_to(root_path) + except (OSError, ValueError) as exc: + raise MCPServiceContractError( + f"Changed path '{raw_path}' is outside root '{root_path}'." 
+ ) from exc + normalized.add(relative.as_posix()) + continue + cleaned = self._normalize_relative_path(candidate.as_posix()) + if cleaned: + normalized.add(cleaned) + return tuple(sorted(normalized)) + + def _git_diff_paths( + self, + *, + root_path: Path, + git_diff_ref: str, + ) -> tuple[str, ...]: + lines = _git_diff_lines_payload( + root_path=root_path, + git_diff_ref=git_diff_ref, + ) + return self._normalize_changed_paths(root_path=root_path, paths=lines) + + def _prune_session_state(self) -> None: + active_run_ids = {record.run_id for record in self._runs.records()} + with self._state_lock: + for state_map in ( + self._review_state, + self._last_gate_results, + self._spread_max_cache, + ): + stale_run_ids = [ + run_id for run_id in state_map if run_id not in active_run_ids + ] + for run_id in stale_run_ids: + state_map.pop(run_id, None) + + def _summary_health_score(self, summary: Mapping[str, object]) -> int | None: + health = self._summary_health_payload(summary) + if health.get("available") is False: + return None + score = health.get("score", 0) + return _as_int(score, 0) + + def _summary_health_delta(self, summary: Mapping[str, object]) -> int | None: + if self._summary_health_payload(summary).get("available") is False: + return None + metrics_diff = self._as_mapping(summary.get("metrics_diff")) + value = metrics_diff.get("health_delta", 0) + return _as_int(value, 0) + + def _summary_health_payload( + self, + summary: Mapping[str, object], + ) -> dict[str, object]: + if str(summary.get("analysis_mode", "")) == "clones_only": + return {"available": False, "reason": "metrics_skipped"} + health = dict(self._as_mapping(summary.get("health"))) + if health: + return health + return {"available": False, "reason": "unavailable"} + + @staticmethod + def _short_run_id(run_id: str) -> str: + return run_id[:_SHORT_RUN_ID_LENGTH] + + def _finding_id_maps( + self, + record: MCPRunRecord, + ) -> tuple[dict[str, str], dict[str, str]]: + canonical_ids = sorted( + 
str(finding.get("id", "")) + for finding in self._base_findings(record) + if str(finding.get("id", "")) + ) + base_ids = { + canonical_id: self._base_short_finding_id(canonical_id) + for canonical_id in canonical_ids + } + grouped: dict[str, list[str]] = {} + for canonical_id, short_id in base_ids.items(): + grouped.setdefault(short_id, []).append(canonical_id) + canonical_to_short: dict[str, str] = {} + short_to_canonical: dict[str, str] = {} + for short_id, group in grouped.items(): + if len(group) == 1: + canonical_id = group[0] + canonical_to_short[canonical_id] = short_id + short_to_canonical[short_id] = canonical_id + continue + disambiguated_ids = self._disambiguated_short_finding_ids(group) + for canonical_id, disambiguated in disambiguated_ids.items(): + canonical_to_short[canonical_id] = disambiguated + short_to_canonical[disambiguated] = canonical_id + return canonical_to_short, short_to_canonical + + def _base_short_finding_id(self, canonical_id: str) -> str: + return _base_short_finding_id_payload(canonical_id) + + def _disambiguated_short_finding_id(self, canonical_id: str) -> str: + return _disambiguated_short_finding_id_payload(canonical_id) + + def _disambiguated_short_finding_ids( + self, + canonical_ids: Sequence[str], + ) -> dict[str, str]: + clone_ids = [ + canonical_id + for canonical_id in canonical_ids + if canonical_id.startswith("clone:") + ] + if len(clone_ids) == len(canonical_ids): + clone_short_ids = _disambiguated_clone_short_ids_payload(clone_ids) + if len(set(clone_short_ids.values())) == len(clone_short_ids): + return clone_short_ids + return { + canonical_id: self._disambiguated_short_finding_id(canonical_id) + for canonical_id in canonical_ids + } + + def _short_finding_id( + self, + record: MCPRunRecord, + canonical_id: str, + ) -> str: + canonical_to_short, _short_to_canonical = self._finding_id_maps(record) + return canonical_to_short.get(canonical_id, canonical_id) + + def _resolve_canonical_finding_id( + self, + record: 
MCPRunRecord, + finding_id: str, + ) -> str: + canonical_to_short, short_to_canonical = self._finding_id_maps(record) + if finding_id in canonical_to_short: + return finding_id + canonical = short_to_canonical.get(finding_id) + if canonical is not None: + return canonical + raise MCPFindingNotFoundError( + f"Finding id '{finding_id}' was not found in run " + f"'{self._short_run_id(record.run_id)}'." + ) + + def _leaf_symbol_name(self, value: object) -> str: + return _leaf_symbol_name_payload(value) + + @staticmethod + def _comparison_settings( + *, + args: Namespace, + request: MCPAnalysisRequest, + ) -> tuple[object, ...]: + return ( + request.analysis_mode, + _as_int(args.min_loc, DEFAULT_MIN_LOC), + _as_int(args.min_stmt, DEFAULT_MIN_STMT), + _as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC), + _as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT), + _as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC), + _as_int(args.segment_min_stmt, DEFAULT_SEGMENT_MIN_STMT), + _as_int( + args.design_complexity_threshold, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ), + _as_int( + args.design_coupling_threshold, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ), + _as_int( + args.design_cohesion_threshold, + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ), + ) + + @staticmethod + def _comparison_scope( + *, + before: MCPRunRecord, + after: MCPRunRecord, + ) -> dict[str, object]: + same_root = before.root == after.root + same_analysis_settings = before.comparison_settings == after.comparison_settings + if same_root and same_analysis_settings: + reason = "comparable" + elif not same_root and not same_analysis_settings: + reason = "different_root_and_analysis_settings" + elif not same_root: + reason = "different_root" + else: + reason = "different_analysis_settings" + return { + "comparable": same_root and same_analysis_settings, + "same_root": same_root, + "same_analysis_settings": same_analysis_settings, + "reason": reason, + } + + @staticmethod + def 
_severity_rank(severity: str) -> int: + return { + SEVERITY_CRITICAL: 3, + SEVERITY_WARNING: 2, + SEVERITY_INFO: 1, + }.get(severity, 0) + + def _path_filter_tuple(self, path: str | None) -> tuple[str, ...]: + if not path: + return () + cleaned = self._normalize_relative_path(Path(path).as_posix()) + return (cleaned,) if cleaned else () + + def _normalize_relative_path(self, path: str) -> str: + cleaned = path.strip() + if cleaned == ".": + return "" + if cleaned.startswith("./"): + cleaned = cleaned[2:] + return cleaned.rstrip("/") + + def _previous_run_for_root(self, record: MCPRunRecord) -> MCPRunRecord | None: + previous: MCPRunRecord | None = None + for item in self._runs.records(): + if item.run_id == record.run_id: + return previous + if item.root == record.root: + previous = item + return None + + @staticmethod + def _record_supports_analysis_mode( + record: MCPRunRecord, + *, + analysis_mode: AnalysisMode, + ) -> bool: + record_mode = record.request.analysis_mode + if analysis_mode == "clones_only": + return record_mode in {"clones_only", "full"} + return record_mode == "full" + + def _latest_compatible_record( + self, + *, + analysis_mode: AnalysisMode, + root_path: Path | None = None, + ) -> MCPRunRecord | None: + for item in reversed(self._runs.records()): + if root_path is not None and item.root != root_path: + continue + if self._record_supports_analysis_mode( + item, + analysis_mode=analysis_mode, + ): + return item + return None + + def _resolve_granular_record( + self, + *, + run_id: str | None, + root: str | None, + analysis_mode: AnalysisMode, + ) -> MCPRunRecord: + if run_id is not None: + record = self._runs.get(run_id) + if self._record_supports_analysis_mode(record, analysis_mode=analysis_mode): + return record + raise MCPServiceContractError( + "Selected MCP run is not compatible with this check. " + f"Call analyze_repository(root='{record.root}', " + "analysis_mode='full') first." 
+ ) + root_path = self._resolve_optional_root(root) + latest_record = self._latest_compatible_record( + analysis_mode=analysis_mode, + root_path=root_path, + ) + if latest_record is not None: + return latest_record + if root_path is not None: + raise MCPRunNotFoundError( + f"No compatible MCP analysis run is available for root: {root_path}. " + f"Call analyze_repository(root='{root_path}') or " + f"analyze_changed_paths(root='{root_path}', changed_paths=[...]) first." + ) + raise MCPRunNotFoundError( + "No compatible MCP analysis run is available. " + "Call analyze_repository(root='/path/to/repo') or " + "analyze_changed_paths(root='/path/to/repo', changed_paths=[...]) first." + ) + + def _base_findings(self, record: MCPRunRecord) -> list[dict[str, object]]: + report_document = record.report_document + findings = self._as_mapping(report_document.get("findings")) + groups = self._as_mapping(findings.get("groups")) + clone_groups = self._as_mapping(groups.get(FAMILY_CLONES)) + return [ + *self._dict_list(clone_groups.get("functions")), + *self._dict_list(clone_groups.get("blocks")), + *self._dict_list(clone_groups.get("segments")), + *self._dict_list( + self._as_mapping(groups.get(FAMILY_STRUCTURAL)).get("groups") + ), + *self._dict_list( + self._as_mapping(groups.get(FAMILY_DEAD_CODE)).get("groups") + ), + *self._dict_list(self._as_mapping(groups.get(FAMILY_DESIGN)).get("groups")), + ] + + def _query_findings( + self, + *, + record: MCPRunRecord, + family: FindingFamilyFilter = "all", + category: str | None = None, + severity: str | None = None, + source_kind: str | None = None, + novelty: FindingNoveltyFilter = "all", + sort_by: FindingSort = "default", + detail_level: DetailLevel = "normal", + changed_paths: Sequence[str] = (), + exclude_reviewed: bool = False, + ) -> list[dict[str, object]]: + findings = self._base_findings(record) + max_spread_value = max( + (self._spread_value(finding) for finding in findings), + default=0, + ) + with self._state_lock: + 
self._spread_max_cache[record.run_id] = max_spread_value + filtered = [ + finding + for finding in findings + if self._matches_finding_filters( + finding=finding, + family=family, + category=category, + severity=severity, + source_kind=source_kind, + novelty=novelty, + ) + and ( + not changed_paths + or self._finding_touches_paths( + finding=finding, + changed_paths=changed_paths, + ) + ) + and (not exclude_reviewed or not self._finding_is_reviewed(record, finding)) + ] + remediation_map = { + str(finding.get("id", "")): self._remediation_for_finding(record, finding) + for finding in filtered + } + priority_map = { + str(finding.get("id", "")): self._priority_score( + record, + finding, + remediation=remediation_map[str(finding.get("id", ""))], + max_spread_value=max_spread_value, + ) + for finding in filtered + } + ordered = self._sort_findings( + record=record, + findings=filtered, + sort_by=sort_by, + priority_map=priority_map, + ) + return [ + self._decorate_finding( + record, + finding, + detail_level=detail_level, + remediation=remediation_map[str(finding.get("id", ""))], + priority_payload=priority_map[str(finding.get("id", ""))], + max_spread_value=max_spread_value, + ) + for finding in ordered + ] + + def _sort_findings( + self, + *, + record: MCPRunRecord, + findings: Sequence[Mapping[str, object]], + sort_by: FindingSort, + priority_map: Mapping[str, Mapping[str, object]] | None = None, + ) -> list[dict[str, object]]: + finding_rows = [dict(finding) for finding in findings] + if sort_by == "default": + return finding_rows + if sort_by == "severity": + finding_rows.sort( + key=lambda finding: ( + -self._severity_rank(str(finding.get("severity", ""))), + str(finding.get("id", "")), + ) + ) + elif sort_by == "spread": + finding_rows.sort( + key=lambda finding: ( + -self._spread_value(finding), + -_as_float(finding.get("priority", 0.0), 0.0), + str(finding.get("id", "")), + ) + ) + else: + finding_rows.sort( + key=lambda finding: ( + -_as_float( + 
self._as_mapping( + (priority_map or {}).get(str(finding.get("id", ""))) + ).get("score", 0.0), + 0.0, + ) + if priority_map is not None + else -_as_float( + self._priority_score(record, finding)["score"], + 0.0, + ), + -self._severity_rank(str(finding.get("severity", ""))), + str(finding.get("id", "")), + ) + ) + return finding_rows + + def _decorate_finding( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + detail_level: DetailLevel, + remediation: Mapping[str, object] | None = None, + priority_payload: Mapping[str, object] | None = None, + max_spread_value: int | None = None, + ) -> dict[str, object]: + resolved_remediation = ( + remediation + if remediation is not None + else self._remediation_for_finding(record, finding) + ) + resolved_priority_payload = ( + dict(priority_payload) + if priority_payload is not None + else self._priority_score( + record, + finding, + remediation=resolved_remediation, + max_spread_value=max_spread_value, + ) + ) + payload = dict(finding) + payload["priority_score"] = resolved_priority_payload["score"] + payload["priority_factors"] = resolved_priority_payload["factors"] + payload["locations"] = self._locations_for_finding( + record, + finding, + include_uri=detail_level == "full", + ) + payload["html_anchor"] = f"finding-{finding.get('id', '')}" + if resolved_remediation is not None: + payload["remediation"] = resolved_remediation + return self._project_finding_detail( + record, + payload, + detail_level=detail_level, + ) + + def _project_finding_detail( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + detail_level: DetailLevel, + ) -> dict[str, object]: + if detail_level == "full": + full_payload = dict(finding) + full_payload["id"] = self._short_finding_id( + record, + str(finding.get("id", "")), + ) + return full_payload + payload: dict[str, object] = { + "id": self._short_finding_id(record, str(finding.get("id", ""))), + "kind": self._finding_kind_label(finding), + "severity": 
str(finding.get("severity", "")), + "novelty": str(finding.get("novelty", "")), + "scope": self._finding_source_kind(finding), + "count": _as_int(finding.get("count", 0), 0), + "spread": dict(self._as_mapping(finding.get("spread"))), + "priority": round(_as_float(finding.get("priority_score", 0.0), 0.0), 2), + } + clone_type = str(finding.get("clone_type", "")).strip() + if clone_type: + payload["type"] = clone_type + locations = [ + self._as_mapping(item) + for item in self._as_sequence(finding.get("locations")) + ] + if detail_level == "summary": + remediation = self._as_mapping(finding.get("remediation")) + if remediation: + payload["effort"] = str(remediation.get("effort", "")) + payload["locations"] = [ + summary_location + for summary_location in ( + self._summary_location_string(location) for location in locations + ) + if summary_location + ] + return payload + remediation = self._as_mapping(finding.get("remediation")) + if remediation: + payload["remediation"] = self._project_remediation( + remediation, + detail_level="normal", + ) + payload["locations"] = [ + projected + for projected in ( + self._normal_location_payload(location) for location in locations + ) + if projected + ] + return payload + + def _finding_summary_card( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> dict[str, object]: + return self._finding_summary_card_payload( + record, + self._decorate_finding(record, finding, detail_level="full"), + ) + + def _finding_summary_card_payload( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> dict[str, object]: + return self._project_finding_detail(record, finding, detail_level="summary") + + def _comparison_finding_card( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> dict[str, object]: + summary_card = self._finding_summary_card(record, finding) + return { + "id": summary_card.get("id"), + "kind": summary_card.get("kind"), + "severity": summary_card.get("severity"), + } + + 
@staticmethod + def _finding_kind_label(finding: Mapping[str, object]) -> str: + family = str(finding.get("family", "")).strip() + kind = str(finding.get("kind", finding.get("category", ""))).strip() + if family == FAMILY_CLONE: + clone_kind = str( + finding.get("clone_kind", finding.get("category", kind)) + ).strip() + return f"{clone_kind}_clone" if clone_kind else "clone" + if family == FAMILY_DEAD_CODE: + return "dead_code" + return kind or family + + @staticmethod + def _summary_location_string(location: Mapping[str, object]) -> str: + path = str(location.get("file", "")).strip() + line = _as_int(location.get("line", 0), 0) + if not path: + return "" + return f"{path}:{line}" if line > 0 else path + + def _normal_location_payload( + self, + location: Mapping[str, object], + ) -> dict[str, object]: + path = str(location.get("file", "")).strip() + if not path: + return {} + payload: dict[str, object] = { + "path": path, + "line": _as_int(location.get("line", 0), 0), + "end_line": _as_int(location.get("end_line", 0), 0), + } + symbol = self._leaf_symbol_name(location.get("symbol")) + if symbol: + payload["symbol"] = symbol + return payload + + def _matches_finding_filters( + self, + *, + finding: Mapping[str, object], + family: FindingFamilyFilter, + category: str | None = None, + severity: str | None, + source_kind: str | None, + novelty: FindingNoveltyFilter, + ) -> bool: + finding_family = str(finding.get("family", "")).strip() + if family != "all" and finding_family != family: + return False + if ( + category is not None + and str(finding.get("category", "")).strip() != category + ): + return False + if ( + severity is not None + and str(finding.get("severity", "")).strip() != severity + ): + return False + dominant_kind = str( + self._as_mapping(finding.get("source_scope")).get("dominant_kind", "") + ).strip() + if source_kind is not None and dominant_kind != source_kind: + return False + return novelty == "all" or str(finding.get("novelty", "")).strip() == 
novelty + + def _finding_touches_paths( + self, + *, + finding: Mapping[str, object], + changed_paths: Sequence[str], + ) -> bool: + normalized_paths = tuple(changed_paths) + for item in self._as_sequence(finding.get("items")): + relative_path = str(self._as_mapping(item).get("relative_path", "")).strip() + if relative_path and self._path_matches(relative_path, normalized_paths): + return True + return False + + @staticmethod + def _path_matches(relative_path: str, changed_paths: Sequence[str]) -> bool: + for candidate in changed_paths: + if relative_path == candidate or relative_path.startswith(candidate + "/"): + return True + return False + + def _finding_is_reviewed( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> bool: + with self._state_lock: + review_map = self._review_state.get(record.run_id, OrderedDict()) + return str(finding.get("id", "")) in review_map + + def _include_hotspot_finding( + self, + *, + record: MCPRunRecord, + finding: Mapping[str, object], + changed_paths: Sequence[str], + exclude_reviewed: bool, + ) -> bool: + if changed_paths and not self._finding_touches_paths( + finding=finding, + changed_paths=changed_paths, + ): + return False + return not exclude_reviewed or not self._finding_is_reviewed(record, finding) + + def _priority_score( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + remediation: Mapping[str, object] | None = None, + max_spread_value: int | None = None, + ) -> dict[str, object]: + spread_weight = self._spread_weight( + record, + finding, + max_spread_value=max_spread_value, + ) + factors = { + "severity_weight": _SEVERITY_WEIGHT.get( + str(finding.get("severity", "")), + 0.2, + ), + "effort_weight": _EFFORT_WEIGHT.get( + ( + str(remediation.get("effort", EFFORT_MODERATE)) + if remediation is not None + else EFFORT_MODERATE + ), + 0.6, + ), + "novelty_weight": _NOVELTY_WEIGHT.get( + str(finding.get("novelty", "")), + 0.7, + ), + "runtime_weight": _RUNTIME_WEIGHT.get( + str( + 
self._as_mapping(finding.get("source_scope")).get( + "dominant_kind", + "other", + ) + ), + 0.5, + ), + "spread_weight": spread_weight, + "confidence_weight": _CONFIDENCE_WEIGHT.get( + str(finding.get("confidence", CONFIDENCE_MEDIUM)), + 0.7, + ), + } + product = 1.0 + for value in factors.values(): + product *= max(_as_float(value, 0.01), 0.01) + score = product ** (1.0 / max(len(factors), 1)) + return { + "score": round(score, 4), + "factors": { + key: round(_as_float(value, 0.0), 4) for key, value in factors.items() + }, + } + + def _spread_weight( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + max_spread_value: int | None = None, + ) -> float: + spread_value = self._spread_value(finding) + if max_spread_value is None: + with self._state_lock: + max_spread_value = self._spread_max_cache.get(record.run_id) + if max_spread_value is None: + max_spread_value = max( + (self._spread_value(item) for item in self._base_findings(record)), + default=0, + ) + with self._state_lock: + self._spread_max_cache[record.run_id] = max_spread_value + max_value = max_spread_value + if max_value <= 0: + return 0.3 + return max(0.2, min(1.0, spread_value / max_value)) + + def _spread_value(self, finding: Mapping[str, object]) -> int: + spread = self._as_mapping(finding.get("spread")) + files = _as_int(spread.get("files", 0), 0) + functions = _as_int(spread.get("functions", 0), 0) + count = _as_int(finding.get("count", 0), 0) + return max(files, functions, count, 1) + + def _locations_for_finding( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + include_uri: bool = True, + ) -> list[dict[str, object]]: + locations: list[dict[str, object]] = [] + for item in self._as_sequence(finding.get("items")): + item_map = self._as_mapping(item) + relative_path = str(item_map.get("relative_path", "")).strip() + if not relative_path: + continue + line = _as_int(item_map.get("start_line", 0) or 0, 0) + end_line = _as_int(item_map.get("end_line", 0) or 
0, 0) + symbol = str(item_map.get("qualname", item_map.get("module", ""))).strip() + location: dict[str, object] = { + "file": relative_path, + "line": line, + "end_line": end_line, + "symbol": symbol, + } + if include_uri: + absolute_path = (record.root / relative_path).resolve() + uri = absolute_path.as_uri() + if line > 0: + uri = f"{uri}#L{line}" + location["uri"] = uri + locations.append(location) + deduped: list[dict[str, object]] = [] + seen: set[tuple[str, int, str]] = set() + for location in locations: + key = ( + str(location.get("file", "")), + _as_int(location.get("line", 0), 0), + str(location.get("symbol", "")), + ) + if key not in seen: + seen.add(key) + deduped.append(location) + return deduped + + @staticmethod + def _suggestion_finding_id(suggestion: object) -> str: + return _suggestion_finding_id_payload(suggestion) + + def _remediation_for_finding( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> dict[str, object] | None: + suggestion = self._suggestion_for_finding(record, str(finding.get("id", ""))) + if suggestion is None: + return None + source_kind = str(getattr(suggestion, "source_kind", "other")) + spread_files = _as_int(getattr(suggestion, "spread_files", 0), 0) + spread_functions = _as_int(getattr(suggestion, "spread_functions", 0), 0) + title = str(getattr(suggestion, "title", "")).strip() + severity = str(finding.get("severity", "")).strip() + novelty = str(finding.get("novelty", "known")).strip() + count = _as_int( + getattr(suggestion, "fact_count", 0) or finding.get("count", 0) or 0, + 0, + ) + safe_refactor_shape = self._safe_refactor_shape(suggestion) + effort = str(getattr(suggestion, "effort", EFFORT_MODERATE)) + confidence = str(getattr(suggestion, "confidence", CONFIDENCE_MEDIUM)) + risk_level = self._risk_level_for_effort(effort) + return { + "effort": effort, + "priority": _as_float(getattr(suggestion, "priority", 0.0), 0.0), + "confidence": confidence, + "safe_refactor_shape": safe_refactor_shape, + 
"steps": list(getattr(suggestion, "steps", ())), + "risk_level": risk_level, + "why_now": self._why_now_text( + title=title, + severity=severity, + novelty=novelty, + count=count, + source_kind=source_kind, + spread_files=spread_files, + spread_functions=spread_functions, + effort=effort, + ), + "blast_radius": { + "files": spread_files, + "functions": spread_functions, + "is_production": source_kind == "production", + }, + } + + def _suggestion_for_finding( + self, + record: MCPRunRecord, + finding_id: str, + ) -> object | None: + for suggestion in record.suggestions: + if self._suggestion_finding_id(suggestion) == finding_id: + return suggestion + return None + + @staticmethod + def _safe_refactor_shape(suggestion: object) -> str: + category = str(getattr(suggestion, "category", "")).strip() + clone_type = str(getattr(suggestion, "clone_type", "")).strip() + title = str(getattr(suggestion, "title", "")).strip() + if category == CATEGORY_CLONE and clone_type == "Type-1": + return "Keep one canonical implementation and route callers through it." + if category == CATEGORY_CLONE and clone_type == "Type-2": + return "Extract shared implementation with explicit parameters." + if category == CATEGORY_CLONE and "Block" in title: + return "Extract the repeated statement sequence into a helper." + if category == CATEGORY_STRUCTURAL: + return "Extract the repeated branch family into a named helper." + if category == CATEGORY_COMPLEXITY: + return "Split the function into smaller named steps." + if category == CATEGORY_COUPLING: + return "Isolate responsibilities and invert unnecessary dependencies." + if category == CATEGORY_COHESION: + return "Split the class by responsibility boundary." + if category == CATEGORY_DEAD_CODE: + return "Delete the unused symbol or document intentional reachability." + if category == CATEGORY_DEPENDENCY: + return "Break the cycle by moving shared abstractions to a lower layer." 
+ return "Extract the repeated logic into a shared, named abstraction." + + @staticmethod + def _risk_level_for_effort(effort: str) -> str: + return { + EFFORT_EASY: "low", + EFFORT_MODERATE: "medium", + EFFORT_HARD: "high", + }.get(effort, "medium") + + @staticmethod + def _why_now_text( + *, + title: str, + severity: str, + novelty: str, + count: int, + source_kind: str, + spread_files: int, + spread_functions: int, + effort: str, + ) -> str: + novelty_text = "new regression" if novelty == "new" else "known debt" + context = ( + "production code" + if source_kind == "production" + else source_kind or "mixed scope" + ) + spread_text = f"{spread_files} files / {spread_functions} functions" + count_text = f"{count} instances" if count > 0 else "localized issue" + return ( + f"{severity.upper()} {title} in {context} — {count_text}, " + f"{spread_text}, {effort} fix, {novelty_text}." + ) + + def _project_remediation( + self, + remediation: Mapping[str, object], + *, + detail_level: DetailLevel, + ) -> dict[str, object]: + if detail_level == "full": + return dict(remediation) + projected = { + "effort": remediation.get("effort"), + "risk": remediation.get("risk_level"), + "shape": remediation.get("safe_refactor_shape"), + "why_now": remediation.get("why_now"), + } + if detail_level == "summary": + return projected + projected["steps"] = list(self._as_sequence(remediation.get("steps"))) + return projected + + def _hotspot_rows( + self, + *, + record: MCPRunRecord, + kind: HotlistKind, + detail_level: DetailLevel, + changed_paths: Sequence[str], + exclude_reviewed: bool, + ) -> list[dict[str, object]]: + findings = self._base_findings(record) + finding_index = {str(finding.get("id", "")): finding for finding in findings} + max_spread_value = max( + (self._spread_value(finding) for finding in findings), + default=0, + ) + with self._state_lock: + self._spread_max_cache[record.run_id] = max_spread_value + remediation_map = { + str(finding.get("id", "")): 
self._remediation_for_finding(record, finding) + for finding in findings + } + priority_map = { + str(finding.get("id", "")): self._priority_score( + record, + finding, + remediation=remediation_map[str(finding.get("id", ""))], + max_spread_value=max_spread_value, + ) + for finding in findings + } + derived = self._as_mapping(record.report_document.get("derived")) + hotlists = self._as_mapping(derived.get("hotlists")) + if kind == "highest_priority": + ordered_ids = [ + str(finding.get("id", "")) + for finding in self._sort_findings( + record=record, + findings=findings, + sort_by="priority", + priority_map=priority_map, + ) + ] + else: + hotlist_key = _HOTLIST_REPORT_KEYS.get(kind) + if hotlist_key is None: + return [] + ordered_ids = [ + str(item) + for item in self._as_sequence(hotlists.get(hotlist_key)) + if str(item) + ] + rows: list[dict[str, object]] = [] + for finding_id in ordered_ids: + finding = finding_index.get(finding_id) + if finding is None or not self._include_hotspot_finding( + record=record, + finding=finding, + changed_paths=changed_paths, + exclude_reviewed=exclude_reviewed, + ): + continue + finding_id_key = str(finding.get("id", "")) + rows.append( + self._decorate_finding( + record, + finding, + detail_level=detail_level, + remediation=remediation_map[finding_id_key], + priority_payload=priority_map[finding_id_key], + max_spread_value=max_spread_value, + ) + ) + return rows + + def _build_changed_projection( + self, + record: MCPRunRecord, + ) -> dict[str, object] | None: + if not record.changed_paths: + return None + items = self._query_findings( + record=record, + detail_level="summary", + changed_paths=record.changed_paths, + ) + new_count = sum(1 for item in items if str(item.get("novelty", "")) == "new") + known_count = sum( + 1 for item in items if str(item.get("novelty", "")) == "known" + ) + health_delta = self._summary_health_delta(record.summary) + return { + "run_id": self._short_run_id(record.run_id), + "changed_paths": 
list(record.changed_paths), + "total": len(items), + "new": new_count, + "known": known_count, + "items": items, + "health": dict(self._summary_health_payload(record.summary)), + "health_delta": health_delta, + "verdict": self._changed_verdict( + changed_projection={"new": new_count, "total": len(items)}, + health_delta=health_delta, + ), + } + + def _changed_analysis_payload( + self, + record: MCPRunRecord, + ) -> dict[str, object]: + changed_projection = self._as_mapping(record.changed_projection) + health = self._summary_health_payload(record.summary) + health_payload = ( + { + "score": health.get("score"), + "grade": health.get("grade"), + } + if health.get("available") is not False + else dict(health) + ) + return { + "run_id": self._short_run_id(record.run_id), + "changed_files": len(record.changed_paths), + "health": health_payload, + "health_delta": ( + _as_int(changed_projection.get("health_delta", 0), 0) + if changed_projection.get("health_delta") is not None + else None + ), + "verdict": str(changed_projection.get("verdict", "stable")), + "new_findings": _as_int(changed_projection.get("new", 0), 0), + "resolved_findings": 0, + "changed_findings": [], + } + + def _augment_summary_with_changed( + self, + *, + summary: Mapping[str, object], + changed_paths: Sequence[str], + changed_projection: Mapping[str, object] | None, + ) -> dict[str, object]: + payload = dict(summary) + if changed_paths: + payload["changed_paths"] = list(changed_paths) + if changed_projection is not None: + payload["changed_findings"] = { + "total": _as_int(changed_projection.get("total", 0), 0), + "new": _as_int(changed_projection.get("new", 0), 0), + "known": _as_int(changed_projection.get("known", 0), 0), + "items": [ + dict(self._as_mapping(item)) + for item in self._as_sequence(changed_projection.get("items"))[:10] + ], + } + payload["health_delta"] = ( + _as_int(changed_projection.get("health_delta", 0), 0) + if changed_projection.get("health_delta") is not None + else None + ) + 
payload["verdict"] = str(changed_projection.get("verdict", "stable")) + return payload + + @staticmethod + def _changed_verdict( + *, + changed_projection: Mapping[str, object], + health_delta: int | None, + ) -> str: + if _as_int(changed_projection.get("new", 0), 0) > 0 or ( + health_delta is not None and health_delta < 0 + ): + return "regressed" + if ( + _as_int(changed_projection.get("total", 0), 0) == 0 + and health_delta is not None + and health_delta > 0 + ): + return "improved" + return "stable" + + def _comparison_index( + self, + record: MCPRunRecord, + *, + focus: ComparisonFocus, + ) -> dict[str, dict[str, object]]: + findings = self._base_findings(record) + if focus == "clones": + findings = [f for f in findings if str(f.get("family", "")) == FAMILY_CLONE] + elif focus == "structural": + findings = [ + f for f in findings if str(f.get("family", "")) == FAMILY_STRUCTURAL + ] + elif focus == "metrics": + findings = [ + f + for f in findings + if str(f.get("family", "")) in {FAMILY_DESIGN, FAMILY_DEAD_CODE} + ] + return {str(finding.get("id", "")): dict(finding) for finding in findings} + + @staticmethod + def _comparison_verdict( + *, + regressions: int, + improvements: int, + health_delta: int | None, + ) -> str: + has_negative_signal = regressions > 0 or ( + health_delta is not None and health_delta < 0 + ) + has_positive_signal = improvements > 0 or ( + health_delta is not None and health_delta > 0 + ) + if has_negative_signal and has_positive_signal: + return "mixed" + if has_negative_signal: + return "regressed" + if has_positive_signal: + return "improved" + return "stable" + + @staticmethod + def _comparison_summary_text( + *, + comparable: bool, + comparability_reason: str, + regressions: int, + improvements: int, + health_delta: int | None, + ) -> str: + if not comparable: + reason_text = { + "different_root": "different roots", + "different_analysis_settings": "different analysis settings", + "different_root_and_analysis_settings": ( + 
"different roots and analysis settings" + ), + }.get(comparability_reason, "incomparable runs") + return f"Finding and run health deltas omitted ({reason_text})" + if health_delta is None: + return ( + f"{improvements} findings resolved, {regressions} new regressions; " + "run health delta omitted (metrics unavailable)" + ) + return ( + f"{improvements} findings resolved, {regressions} new regressions, " + f"run health delta {health_delta:+d}" + ) + + def _render_pr_summary_markdown(self, payload: Mapping[str, object]) -> str: + health = self._as_mapping(payload.get("health")) + score = health.get("score", "n/a") + grade = health.get("grade", "n/a") + delta = _as_int(payload.get("health_delta", 0), 0) + changed_items = [ + self._as_mapping(item) + for item in self._as_sequence(payload.get("new_findings_in_changed_files")) + ] + resolved = [ + self._as_mapping(item) + for item in self._as_sequence(payload.get("resolved")) + ] + blocking_gates = [ + str(item) + for item in self._as_sequence(payload.get("blocking_gates")) + if str(item) + ] + health_line = ( + f"Health: {score}/100 ({grade}) | Delta: {delta:+d} | " + f"Verdict: {payload.get('verdict', 'stable')}" + if payload.get("health_delta") is not None + else ( + f"Health: {score}/100 ({grade}) | Delta: n/a | " + f"Verdict: {payload.get('verdict', 'stable')}" + ) + ) + lines = [ + "## CodeClone Summary", + "", + health_line, + "", + f"### New findings in changed files ({len(changed_items)})", + ] + if not changed_items: + lines.append("- None") + else: + lines.extend( + [ + ( + f"- **{str(item.get('severity', 'info')).upper()}** " + f"{item.get('kind', 'finding')} in " + f"`{self._finding_display_location(item)}`" + ) + for item in changed_items[:10] + ] + ) + lines.extend(["", f"### Resolved ({len(resolved)})"]) + if not resolved: + lines.append("- None") + else: + lines.extend( + [ + ( + f"- {item.get('kind', 'finding')} in " + f"`{self._finding_display_location(item)}`" + ) + for item in resolved[:10] + ] + ) 
+ lines.extend(["", "### Blocking gates"]) + if not blocking_gates: + lines.append("- none") + else: + lines.extend([f"- `{reason}`" for reason in blocking_gates]) + return "\n".join(lines) + + def _finding_display_location(self, finding: Mapping[str, object]) -> str: + locations = self._as_sequence(finding.get("locations")) + if not locations: + return "(unknown)" + first = locations[0] + if isinstance(first, str): + return first + location = self._as_mapping(first) + path = str(location.get("path", location.get("file", ""))).strip() + line = _as_int(location.get("line", 0), 0) + if not path: + return "(unknown)" + return f"{path}:{line}" if line > 0 else path + + def _granular_payload( + self, + *, + record: MCPRunRecord, + check: str, + items: Sequence[Mapping[str, object]], + detail_level: DetailLevel, + max_results: int, + path: str | None, + ) -> dict[str, object]: + bounded_items = [dict(item) for item in items[: max(1, max_results)]] + full_health = dict(self._as_mapping(record.summary.get("health"))) + dimensions = self._as_mapping(full_health.get("dimensions")) + relevant_dimension = _CHECK_TO_DIMENSION.get(check) + slim_dimensions = ( + {relevant_dimension: dimensions.get(relevant_dimension)} + if relevant_dimension and relevant_dimension in dimensions + else dict(dimensions) + ) + return { + "run_id": self._short_run_id(record.run_id), + "check": check, + "detail_level": detail_level, + "path": path, + "returned": len(bounded_items), + "total": len(items), + "health": { + "score": full_health.get("score"), + "grade": full_health.get("grade"), + "dimensions": slim_dimensions, + }, + "items": bounded_items, + } + + @staticmethod + def _normalized_source_kind(value: object) -> str: + normalized = str(value).strip().lower() + if normalized in SOURCE_KIND_ORDER: + return normalized + return SOURCE_KIND_OTHER + + def _finding_source_kind(self, finding: Mapping[str, object]) -> str: + source_scope = self._as_mapping(finding.get("source_scope")) + return 
self._normalized_source_kind(source_scope.get("dominant_kind")) + + def _source_kind_breakdown( + self, + source_kinds: Iterable[object], + ) -> dict[str, int]: + breakdown = dict.fromkeys(_SOURCE_KIND_BREAKDOWN_ORDER, 0) + for value in source_kinds: + breakdown[self._normalized_source_kind(value)] += 1 + return breakdown + + def _triage_suggestion_rows(self, record: MCPRunRecord) -> list[dict[str, object]]: + derived = self._as_mapping(record.report_document.get("derived")) + canonical_rows = self._dict_list(derived.get("suggestions")) + suggestion_source_kinds = { + self._suggestion_finding_id(suggestion): self._normalized_source_kind( + getattr(suggestion, "source_kind", SOURCE_KIND_OTHER) + ) + for suggestion in record.suggestions + } + rows: list[dict[str, object]] = [] + for row in canonical_rows: + canonical_finding_id = str(row.get("finding_id", "")) + action = self._as_mapping(row.get("action")) + try: + finding_id = self._short_finding_id( + record, + self._resolve_canonical_finding_id(record, canonical_finding_id), + ) + except MCPFindingNotFoundError: + finding_id = self._base_short_finding_id(canonical_finding_id) + rows.append( + { + "id": f"suggestion:{finding_id}", + "finding_id": finding_id, + "title": str(row.get("title", "")), + "summary": str(row.get("summary", "")), + "effort": str(action.get("effort", "")), + "steps": list(self._as_sequence(action.get("steps"))), + "source_kind": suggestion_source_kinds.get( + canonical_finding_id, + SOURCE_KIND_OTHER, + ), + } + ) + return rows + + def _derived_section_payload(self, record: MCPRunRecord) -> dict[str, object]: + derived = self._as_mapping(record.report_document.get("derived")) + if not derived: + raise MCPServiceContractError( + "Report section 'derived' is not available in this run." 
+ ) + suggestions = self._triage_suggestion_rows(record) + canonical_to_short, _ = self._finding_id_maps(record) + hotlists = self._as_mapping(derived.get("hotlists")) + projected_hotlists: dict[str, list[str]] = {} + for hotlist_key, hotlist_ids in hotlists.items(): + projected_hotlists[hotlist_key] = [ + canonical_to_short.get( + str(finding_id), + self._base_short_finding_id(str(finding_id)), + ) + for finding_id in self._as_sequence(hotlist_ids) + if str(finding_id) + ] + return { + "suggestions": suggestions, + "hotlists": projected_hotlists, + } + + @staticmethod + def _schema_resource_payload() -> dict[str, object]: + return { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "CodeCloneCanonicalReport", + "type": "object", + "required": [ + "report_schema_version", + "meta", + "inventory", + "findings", + "derived", + "integrity", + ], + "properties": { + "report_schema_version": { + "type": "string", + "const": REPORT_SCHEMA_VERSION, + }, + "meta": {"type": "object"}, + "inventory": {"type": "object"}, + "findings": {"type": "object"}, + "metrics": {"type": "object"}, + "derived": {"type": "object"}, + "integrity": {"type": "object"}, + }, + } + + def _validate_analysis_request(self, request: MCPAnalysisRequest) -> None: + self._validate_choice( + "analysis_mode", + request.analysis_mode, + _VALID_ANALYSIS_MODES, + ) + self._validate_choice( + "cache_policy", + request.cache_policy, + _VALID_CACHE_POLICIES, + ) + if request.cache_policy == "refresh": + raise MCPServiceContractError( + "cache_policy='refresh' is not supported by the read-only " + "CodeClone MCP server. Use 'reuse' or 'off'." + ) + + @staticmethod + def _validate_choice( + name: str, + value: str, + allowed: Sequence[str] | frozenset[str], + ) -> str: + if value not in allowed: + allowed_list = ", ".join(sorted(allowed)) + raise MCPServiceContractError( + f"Invalid value for {name}: {value!r}. Expected one of: {allowed_list}." 
+ ) + return value + + def _validate_optional_choice( + self, + name: str, + value: str | None, + allowed: Sequence[str] | frozenset[str], + ) -> str | None: + if value is None: + return None + return self._validate_choice(name, value, allowed) + + @staticmethod + def _resolve_root(root: str | None) -> Path: + cleaned_root = "" if root is None else str(root).strip() + if not cleaned_root: + raise MCPServiceContractError( + "MCP analysis requires an absolute repository root. " + "Omitted or relative roots are unsafe because the MCP server " + "working directory may not match the client workspace." + ) + candidate = Path(cleaned_root).expanduser() + if not candidate.is_absolute(): + raise MCPServiceContractError( + f"MCP requires an absolute repository root; got relative root " + f"{cleaned_root!r}. Relative roots like '.' are unsafe because " + "the MCP server working directory may not match the client " + "workspace." + ) + try: + root_path = candidate.resolve() + except OSError as exc: + raise MCPServiceContractError( + f"Invalid root path '{cleaned_root}': {exc}" + ) from exc + if not root_path.exists(): + raise MCPServiceContractError(f"Root path does not exist: {root_path}") + if not root_path.is_dir(): + raise MCPServiceContractError(f"Root path is not a directory: {root_path}") + return root_path + + def _resolve_optional_root(self, root: str | None) -> Path | None: + cleaned_root = "" if root is None else str(root).strip() + if not cleaned_root: + return None + return self._resolve_root(cleaned_root) + + def _build_args(self, *, root_path: Path, request: MCPAnalysisRequest) -> Namespace: + args = Namespace( + root=str(root_path), + min_loc=DEFAULT_MIN_LOC, + min_stmt=DEFAULT_MIN_STMT, + block_min_loc=DEFAULT_BLOCK_MIN_LOC, + block_min_stmt=DEFAULT_BLOCK_MIN_STMT, + segment_min_loc=DEFAULT_SEGMENT_MIN_LOC, + segment_min_stmt=DEFAULT_SEGMENT_MIN_STMT, + processes=None, + cache_path=None, + max_cache_size_mb=DEFAULT_MAX_CACHE_SIZE_MB, + 
baseline=DEFAULT_BASELINE_PATH, + max_baseline_size_mb=DEFAULT_MAX_BASELINE_SIZE_MB, + update_baseline=False, + fail_on_new=False, + fail_threshold=-1, + ci=False, + fail_complexity=-1, + fail_coupling=-1, + fail_cohesion=-1, + fail_cycles=False, + fail_dead_code=False, + fail_health=-1, + fail_on_new_metrics=False, + design_complexity_threshold=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + design_coupling_threshold=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + design_cohesion_threshold=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + update_metrics_baseline=False, + metrics_baseline=DEFAULT_BASELINE_PATH, + skip_metrics=False, + skip_dead_code=False, + skip_dependencies=False, + html_out=None, + json_out=None, + md_out=None, + sarif_out=None, + text_out=None, + no_progress=True, + no_color=True, + quiet=True, + verbose=False, + debug=False, + open_html_report=False, + timestamped_report_paths=False, + ) + if request.respect_pyproject: + try: + config_values = load_pyproject_config(root_path) + except ConfigValidationError as exc: + raise MCPServiceContractError(str(exc)) from exc + for key in sorted(_MCP_CONFIG_KEYS.intersection(config_values)): + setattr(args, key, config_values[key]) + + self._apply_request_overrides(args=args, root_path=root_path, request=request) + + if request.analysis_mode == "clones_only": + args.skip_metrics = True + args.skip_dead_code = True + args.skip_dependencies = True + else: + args.skip_metrics = False + args.skip_dead_code = False + args.skip_dependencies = False + + if not validate_numeric_args(args): + raise MCPServiceContractError( + "Numeric analysis settings must be non-negative and thresholds " + "must be >= -1." 
+ ) + + return args + + def _apply_request_overrides( + self, + *, + args: Namespace, + root_path: Path, + request: MCPAnalysisRequest, + ) -> None: + override_map: dict[str, object | None] = { + "processes": request.processes, + "min_loc": request.min_loc, + "min_stmt": request.min_stmt, + "block_min_loc": request.block_min_loc, + "block_min_stmt": request.block_min_stmt, + "segment_min_loc": request.segment_min_loc, + "segment_min_stmt": request.segment_min_stmt, + "max_baseline_size_mb": request.max_baseline_size_mb, + "max_cache_size_mb": request.max_cache_size_mb, + "design_complexity_threshold": request.complexity_threshold, + "design_coupling_threshold": request.coupling_threshold, + "design_cohesion_threshold": request.cohesion_threshold, + } + for key, value in override_map.items(): + if value is not None: + setattr(args, key, value) + + if request.baseline_path is not None: + args.baseline = str( + self._resolve_optional_path(request.baseline_path, root_path) + ) + if request.metrics_baseline_path is not None: + args.metrics_baseline = str( + self._resolve_optional_path(request.metrics_baseline_path, root_path) + ) + if request.cache_path is not None: + args.cache_path = str( + self._resolve_optional_path(request.cache_path, root_path) + ) + + @staticmethod + def _resolve_optional_path(value: str, root_path: Path) -> Path: + candidate = Path(value).expanduser() + resolved = candidate if candidate.is_absolute() else root_path / candidate + try: + return resolved.resolve() + except OSError as exc: + raise MCPServiceContractError( + f"Invalid path '{value}' relative to '{root_path}': {exc}" + ) from exc + + def _resolve_baseline_inputs( + self, + *, + root_path: Path, + args: Namespace, + ) -> tuple[Path, bool, Path, bool, dict[str, object] | None]: + baseline_path = self._resolve_optional_path(str(args.baseline), root_path) + baseline_exists = baseline_path.exists() + + metrics_baseline_arg_path = self._resolve_optional_path( + str(args.metrics_baseline), + 
root_path, + ) + shared_baseline_payload: dict[str, object] | None = None + if metrics_baseline_arg_path == baseline_path: + probe = probe_metrics_baseline_section(metrics_baseline_arg_path) + metrics_baseline_exists = probe.has_metrics_section + shared_baseline_payload = probe.payload + else: + metrics_baseline_exists = metrics_baseline_arg_path.exists() + + return ( + baseline_path, + baseline_exists, + metrics_baseline_arg_path, + metrics_baseline_exists, + shared_baseline_payload, + ) + + @staticmethod + def _resolve_cache_path(*, root_path: Path, args: Namespace) -> Path: + return resolve_cache_path( + root_path=root_path, + args=args, + from_args=bool(args.cache_path), + legacy_cache_path=_LEGACY_CACHE_PATH, + console=_BufferConsole(), + ) + + @staticmethod + def _build_cache( + *, + root_path: Path, + args: Namespace, + cache_path: Path, + policy: CachePolicy, + ) -> Cache: + cache = Cache( + cache_path, + root=root_path, + max_size_bytes=_as_int(args.max_cache_size_mb, 0) * 1024 * 1024, + min_loc=_as_int(args.min_loc, DEFAULT_MIN_LOC), + min_stmt=_as_int(args.min_stmt, DEFAULT_MIN_STMT), + block_min_loc=_as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC), + block_min_stmt=_as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT), + segment_min_loc=_as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC), + segment_min_stmt=_as_int( + args.segment_min_stmt, + DEFAULT_SEGMENT_MIN_STMT, + ), + ) + if policy != "off": + cache.load() + return cache + + @staticmethod + def _metrics_computed(analysis_mode: AnalysisMode) -> tuple[str, ...]: + return ( + () + if analysis_mode == "clones_only" + else ( + "complexity", + "coupling", + "cohesion", + "health", + "dependencies", + "dead_code", + ) + ) + + @staticmethod + def _load_report_document(report_json: str) -> dict[str, object]: + return _load_report_document_payload(report_json) + + def _report_digest(self, report_document: Mapping[str, object]) -> str: + integrity = self._as_mapping(report_document.get("integrity")) + 
digest = self._as_mapping(integrity.get("digest")) + value = digest.get("value") + if not isinstance(value, str) or not value: + raise MCPServiceError("Canonical report digest is missing.") + return value + + def _build_run_summary_payload( + self, + *, + run_id: str, + root_path: Path, + request: MCPAnalysisRequest, + report_document: Mapping[str, object], + baseline_state: CloneBaselineState, + metrics_baseline_state: MetricsBaselineState, + cache_status: CacheStatus, + new_func: Sequence[str] | set[str], + new_block: Sequence[str] | set[str], + metrics_diff: MetricsDiff | None, + warnings: Sequence[str], + failures: Sequence[str], + ) -> dict[str, object]: + meta = self._as_mapping(report_document.get("meta")) + meta_baseline = self._as_mapping(meta.get("baseline")) + meta_metrics_baseline = self._as_mapping(meta.get("metrics_baseline")) + meta_cache = self._as_mapping(meta.get("cache")) + inventory = self._as_mapping(report_document.get("inventory")) + findings = self._as_mapping(report_document.get("findings")) + metrics = self._as_mapping(report_document.get("metrics")) + metrics_summary = self._as_mapping(metrics.get("summary")) + summary = self._as_mapping(findings.get("summary")) + payload = { + "run_id": run_id, + "root": str(root_path), + "analysis_mode": request.analysis_mode, + "codeclone_version": meta.get("codeclone_version", __version__), + "report_schema_version": report_document.get( + "report_schema_version", + REPORT_SCHEMA_VERSION, + ), + "baseline": { + "path": meta_baseline.get( + "path", + str(root_path / DEFAULT_BASELINE_PATH), + ), + "loaded": bool(meta_baseline.get("loaded", baseline_state.loaded)), + "status": str(meta_baseline.get("status", baseline_state.status.value)), + "trusted_for_diff": baseline_state.trusted_for_diff, + }, + "metrics_baseline": { + "path": meta_metrics_baseline.get( + "path", + str(root_path / DEFAULT_BASELINE_PATH), + ), + "loaded": bool( + meta_metrics_baseline.get( + "loaded", + metrics_baseline_state.loaded, 
+ ) + ), + "status": str( + meta_metrics_baseline.get( + "status", + metrics_baseline_state.status.value, + ) + ), + "trusted_for_diff": metrics_baseline_state.trusted_for_diff, + }, + "cache": { + "path": meta_cache.get("path"), + "status": str(meta_cache.get("status", cache_status.value)), + "used": bool(meta_cache.get("used", False)), + "schema_version": meta_cache.get("schema_version"), + }, + "inventory": dict(inventory), + "findings_summary": dict(summary), + "health": dict(self._as_mapping(metrics_summary.get("health"))), + "baseline_diff": { + "new_function_clone_groups": len(new_func), + "new_block_clone_groups": len(new_block), + "new_clone_groups_total": len(new_func) + len(new_block), + }, + "metrics_diff": self._metrics_diff_payload(metrics_diff), + "warnings": list(warnings), + "failures": list(failures), + } + payload["cache"] = self._summary_cache_payload(payload) + payload["health"] = self._summary_health_payload(payload) + return payload + + def _summary_payload( + self, + summary: Mapping[str, object], + *, + record: MCPRunRecord | None = None, + ) -> dict[str, object]: + inventory = self._as_mapping(summary.get("inventory")) + if ( + not summary.get("run_id") + and not record + and "inventory" in summary + and not summary.get("baseline") + ): + return { + "inventory": self._summary_inventory_payload(inventory), + "health": self._summary_health_payload(summary), + } + resolved_run_id = ( + record.run_id if record is not None else str(summary.get("run_id", "")) + ) + payload: dict[str, object] = { + "run_id": self._short_run_id(resolved_run_id) if resolved_run_id else "", + "version": str(summary.get("codeclone_version", __version__)), + "schema": str(summary.get("report_schema_version", REPORT_SCHEMA_VERSION)), + "mode": str(summary.get("analysis_mode", "")), + "baseline": self._summary_baseline_payload(summary), + "metrics_baseline": self._summary_metrics_baseline_payload(summary), + "cache": self._summary_cache_payload(summary), + "inventory": 
self._summary_inventory_payload(inventory), + "health": self._summary_health_payload(summary), + "findings": self._summary_findings_payload(summary, record=record), + "diff": self._summary_diff_payload(summary), + "warnings": list(self._as_sequence(summary.get("warnings"))), + "failures": list(self._as_sequence(summary.get("failures"))), + } + return payload + + def _summary_baseline_payload( + self, + summary: Mapping[str, object], + ) -> dict[str, object]: + baseline = self._as_mapping(summary.get("baseline")) + return { + "loaded": bool(baseline.get("loaded", False)), + "status": str(baseline.get("status", "")), + "trusted": bool(baseline.get("trusted_for_diff", False)), + } + + def _summary_metrics_baseline_payload( + self, + summary: Mapping[str, object], + ) -> dict[str, object]: + baseline = self._as_mapping(summary.get("metrics_baseline")) + return { + "loaded": bool(baseline.get("loaded", False)), + "status": str(baseline.get("status", "")), + "trusted": bool(baseline.get("trusted_for_diff", False)), + } + + def _summary_cache_payload( + self, + summary: Mapping[str, object], + ) -> dict[str, object]: + cache = dict(self._as_mapping(summary.get("cache"))) + if not cache: + return {} + return { + "used": bool(cache.get("used", False)), + "freshness": self._effective_freshness(summary), + } + + def _effective_freshness( + self, + summary: Mapping[str, object], + ) -> FreshnessKind: + inventory = self._as_mapping(summary.get("inventory")) + files = self._as_mapping(inventory.get("files")) + analyzed = max(0, _as_int(files.get("analyzed", 0), 0)) + cached = max(0, _as_int(files.get("cached", 0), 0)) + cache = self._as_mapping(summary.get("cache")) + cache_used = bool(cache.get("used")) + if cache_used and cached > 0 and analyzed == 0: + return "reused" + if cache_used and cached > 0 and analyzed > 0: + return "mixed" + return "fresh" + + def _summary_inventory_payload( + self, + inventory: Mapping[str, object], + ) -> dict[str, object]: + if not inventory: + 
return {} + files = self._as_mapping(inventory.get("files")) + code = self._as_mapping(inventory.get("code")) + total_files = _as_int( + files.get( + "total_found", + files.get( + "analyzed", + len( + self._as_sequence( + self._as_mapping(inventory.get("file_registry")).get( + "items" + ) + ) + ), + ), + ), + 0, + ) + functions = _as_int(code.get("functions", 0), 0) + _as_int( + code.get("methods", 0), + 0, + ) + return { + "files": total_files, + "lines": _as_int(code.get("parsed_lines", 0), 0), + "functions": functions, + "classes": _as_int(code.get("classes", 0), 0), + } + + def _summary_findings_payload( + self, + summary: Mapping[str, object], + *, + record: MCPRunRecord | None, + ) -> dict[str, object]: + findings_summary = self._as_mapping(summary.get("findings_summary")) + if record is None: + return { + "total": _as_int(findings_summary.get("total", 0), 0), + "new": 0, + "known": 0, + "by_family": {}, + "production": 0, + } + findings = self._base_findings(record) + by_family: dict[str, int] = { + "clones": 0, + "structural": 0, + "dead_code": 0, + "design": 0, + } + new_count = 0 + known_count = 0 + production_count = 0 + for finding in findings: + family = str(finding.get("family", "")).strip() + family_key = "clones" if family == FAMILY_CLONE else family + if family_key in by_family: + by_family[family_key] += 1 + if str(finding.get("novelty", "")).strip() == "new": + new_count += 1 + else: + known_count += 1 + if self._finding_source_kind(finding) == SOURCE_KIND_PRODUCTION: + production_count += 1 + return { + "total": len(findings), + "new": new_count, + "known": known_count, + "by_family": {key: value for key, value in by_family.items() if value > 0}, + "production": production_count, + } + + def _summary_diff_payload( + self, + summary: Mapping[str, object], + ) -> dict[str, object]: + baseline_diff = self._as_mapping(summary.get("baseline_diff")) + metrics_diff = self._as_mapping(summary.get("metrics_diff")) + return { + "new_clones": 
_as_int(baseline_diff.get("new_clone_groups_total", 0), 0), + "health_delta": ( + _as_int(metrics_diff.get("health_delta", 0), 0) + if metrics_diff + and self._summary_health_payload(summary).get("available") is not False + else None + ), + } + + def _metrics_detail_payload( + self, + *, + metrics: Mapping[str, object], + family: MetricsDetailFamily | None, + path: str | None, + offset: int, + limit: int, + ) -> dict[str, object]: + summary = dict(self._as_mapping(metrics.get("summary"))) + families = self._as_mapping(metrics.get("families")) + normalized_path = self._normalize_relative_path(path or "") + if family is None and not normalized_path: + return { + "summary": summary, + "_hint": "Use family and/or path parameters to access per-item detail.", + } + normalized_offset = max(0, offset) + normalized_limit = max(1, min(limit, 200)) + family_names: Sequence[str] = ( + (family,) if family is not None else tuple(sorted(families)) + ) + items: list[dict[str, object]] = [] + for family_name in family_names: + family_payload = self._as_mapping(families.get(family_name)) + for item in self._as_sequence(family_payload.get("items")): + item_map = self._as_mapping(item) + if normalized_path and not self._metric_item_matches_path( + item_map, + normalized_path, + ): + continue + compact_item = self._compact_metrics_item(item_map) + if family is None: + compact_item = {"family": family_name, **compact_item} + items.append(compact_item) + items.sort( + key=lambda item: ( + str(item.get("family", family or "")), + str(item.get("path", "")), + str(item.get("qualname", "")), + _as_int(item.get("start_line", 0), 0), + ) + ) + page = items[normalized_offset : normalized_offset + normalized_limit] + return { + "family": family, + "path": normalized_path or None, + "offset": normalized_offset, + "limit": normalized_limit, + "returned": len(page), + "total": len(items), + "has_more": normalized_offset + len(page) < len(items), + "items": page, + } + + def 
_metric_item_matches_path( + self, + item: Mapping[str, object], + normalized_path: str, + ) -> bool: + path_value = ( + str(item.get("relative_path", "")).strip() + or str(item.get("path", "")).strip() + or str(item.get("filepath", "")).strip() + or str(item.get("file", "")).strip() + ) + if not path_value: + return False + return self._path_matches(path_value, (normalized_path,)) + + @staticmethod + def _compact_metrics_item( + item: Mapping[str, object], + ) -> dict[str, object]: + compact: dict[str, object] = {} + path_value = ( + str(item.get("relative_path", "")).strip() + or str(item.get("path", "")).strip() + or str(item.get("filepath", "")).strip() + or str(item.get("file", "")).strip() + ) + if path_value: + compact["path"] = path_value + for key, value in item.items(): + if key in {"relative_path", "path", "filepath", "file"}: + continue + if value in ("", None, [], {}, ()): + continue + compact[str(key)] = value + return compact + + @staticmethod + def _metrics_diff_payload( + metrics_diff: MetricsDiff | None, + ) -> dict[str, object] | None: + if metrics_diff is None: + return None + new_high_risk_functions = tuple( + cast(Sequence[str], getattr(metrics_diff, "new_high_risk_functions", ())) + ) + new_high_coupling_classes = tuple( + cast(Sequence[str], getattr(metrics_diff, "new_high_coupling_classes", ())) + ) + new_cycles = tuple( + cast(Sequence[object], getattr(metrics_diff, "new_cycles", ())) + ) + new_dead_code = tuple( + cast(Sequence[str], getattr(metrics_diff, "new_dead_code", ())) + ) + health_delta = getattr(metrics_diff, "health_delta", 0) + return { + "new_high_risk_functions": len(new_high_risk_functions), + "new_high_coupling_classes": len(new_high_coupling_classes), + "new_cycles": len(new_cycles), + "new_dead_code": len(new_dead_code), + "health_delta": _as_int(health_delta, 0), + } + + def _dict_list(self, value: object) -> list[dict[str, object]]: + return [dict(self._as_mapping(item)) for item in self._as_sequence(value)] + + 
@staticmethod + def _as_mapping(value: object) -> Mapping[str, object]: + return value if isinstance(value, Mapping) else {} + + @staticmethod + def _as_sequence(value: object) -> Sequence[object]: + if isinstance(value, Sequence) and not isinstance( + value, + (str, bytes, bytearray), + ): + return value + return () diff --git a/codeclone/meta_markers.py b/codeclone/meta_markers.py index ec0d390..3f527d6 100644 --- a/codeclone/meta_markers.py +++ b/codeclone/meta_markers.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/metrics/__init__.py b/codeclone/metrics/__init__.py index e9c1afe..bf64509 100644 --- a/codeclone/metrics/__init__.py +++ b/codeclone/metrics/__init__.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/metrics/cohesion.py b/codeclone/metrics/cohesion.py index f0b2cc1..c8a389b 100644 --- a/codeclone/metrics/cohesion.py +++ b/codeclone/metrics/cohesion.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -65,16 +68,14 @@ def compute_lcom4(class_node: ast.ClassDef) -> tuple[int, int, int]: components = 0 for method_name in method_names: - if method_name in visited: - continue - components += 1 - stack = [method_name] - while stack: - current = stack.pop() - if current in visited: - continue - visited.add(current) - stack.extend(sorted(adjacency[current] - visited)) + if method_name not in visited: + components += 1 + stack = [method_name] + while stack: + current = stack.pop() + if current not in visited: + visited.add(current) + stack.extend(sorted(adjacency[current] - visited)) instance_vars = set().union(*method_to_attrs.values()) if method_to_attrs else set() return components, len(method_names), len(instance_vars) diff --git a/codeclone/metrics/complexity.py b/codeclone/metrics/complexity.py index 2e6919e..4573da5 100644 --- a/codeclone/metrics/complexity.py +++ b/codeclone/metrics/complexity.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/metrics/coupling.py b/codeclone/metrics/coupling.py index 07752b7..8a34037 100644 --- a/codeclone/metrics/coupling.py +++ b/codeclone/metrics/coupling.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -41,34 +44,28 @@ def compute_cbo( """ couplings: set[str] = set() - for base in class_node.bases: - candidate = _annotation_name(base) + def _add_annotation_coupling(node: ast.AST | None) -> None: + if node is None: + return + candidate = _annotation_name(node) if candidate: couplings.add(candidate) + for base in class_node.bases: + _add_annotation_coupling(base) + for node in ast.walk(class_node): if isinstance(node, ast.Name): couplings.add(node.id) - continue - if isinstance(node, ast.Attribute): - if isinstance(node.value, ast.Name) and node.value.id in {"self", "cls"}: - continue - couplings.add(node.attr) - continue - if isinstance(node, ast.Call): - candidate = _annotation_name(node.func) - if candidate: - couplings.add(candidate) - continue - if isinstance(node, ast.AnnAssign) and node.annotation is not None: - candidate = _annotation_name(node.annotation) - if candidate: - couplings.add(candidate) - continue - if isinstance(node, ast.arg) and node.annotation is not None: - candidate = _annotation_name(node.annotation) - if candidate: - couplings.add(candidate) + elif isinstance(node, ast.Attribute): + if not ( + isinstance(node.value, ast.Name) and node.value.id in {"self", "cls"} + ): + couplings.add(node.attr) + elif isinstance(node, ast.Call): + _add_annotation_coupling(node.func) + elif isinstance(node, (ast.AnnAssign, ast.arg)): + _add_annotation_coupling(node.annotation) filtered = { name diff --git a/codeclone/metrics/dead_code.py b/codeclone/metrics/dead_code.py index eeccc81..3b64c97 100644 --- a/codeclone/metrics/dead_code.py +++ b/codeclone/metrics/dead_code.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -6,7 +9,7 @@ from dataclasses import replace from typing import Literal -from ..domain.findings import CLONE_KIND_FUNCTION, SYMBOL_KIND_METHOD +from ..domain.findings import SYMBOL_KIND_FUNCTION, SYMBOL_KIND_METHOD from ..domain.quality import CONFIDENCE_HIGH, CONFIDENCE_MEDIUM from ..models import DeadCandidate, DeadItem from ..paths import is_test_filepath @@ -104,7 +107,7 @@ def _is_non_actionable_candidate(symbol: DeadCandidate) -> bool: return True # Module-level dynamic hooks (PEP 562) are invoked by import/runtime lookup. - if symbol.kind == CLONE_KIND_FUNCTION: + if symbol.kind == SYMBOL_KIND_FUNCTION: return symbol.local_name in _MODULE_RUNTIME_HOOK_NAMES # Magic methods and visitor callbacks are invoked by runtime dispatch. if symbol.kind == SYMBOL_KIND_METHOD: diff --git a/codeclone/metrics/dependencies.py b/codeclone/metrics/dependencies.py index caa32d9..48ba032 100644 --- a/codeclone/metrics/dependencies.py +++ b/codeclone/metrics/dependencies.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/metrics/health.py b/codeclone/metrics/health.py index 9886ae9..9f0ab67 100644 --- a/codeclone/metrics/health.py +++ b/codeclone/metrics/health.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/metrics_baseline.py b/codeclone/metrics_baseline.py index d7aa592..3522a05 100644 --- a/codeclone/metrics_baseline.py +++ b/codeclone/metrics_baseline.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -7,6 +10,7 @@ import hmac import json import os +import tempfile from datetime import datetime, timezone from enum import Enum from pathlib import Path @@ -452,13 +456,21 @@ def diff(self, current: ProjectMetrics) -> MetricsDiff: def _atomic_write_json(path: Path, payload: dict[str, object]) -> None: - tmp_path = path.with_name(f"{path.name}.tmp") data = json.dumps(payload, indent=2, ensure_ascii=False) + "\n" - with tmp_path.open("wb") as tmp_file: - tmp_file.write(data.encode("utf-8")) - tmp_file.flush() - os.fsync(tmp_file.fileno()) - os.replace(tmp_path, path) + fd_num, tmp_name = tempfile.mkstemp( + dir=path.parent, + suffix=".tmp", + ) + tmp_path = Path(tmp_name) + try: + with os.fdopen(fd_num, "wb") as fd: + fd.write(data.encode("utf-8")) + fd.flush() + os.fsync(fd.fileno()) + os.replace(tmp_path, path) + except BaseException: + tmp_path.unlink(missing_ok=True) + raise def _load_json_object(path: Path) -> dict[str, Any]: diff --git a/codeclone/models.py b/codeclone/models.py index f882d37..eaae21a 100644 --- a/codeclone/models.py +++ b/codeclone/models.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/normalize.py b/codeclone/normalize.py index b3e0243..3109ef6 100644 --- a/codeclone/normalize.py +++ b/codeclone/normalize.py @@ -1,10 +1,14 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations import ast import copy +import hashlib from ast import AST from dataclasses import dataclass from typing import TYPE_CHECKING, cast @@ -214,15 +218,33 @@ def normalized_ast_dump_from_list( ) -> str: """ Dump a list of AST nodes after normalization. - WARNING: This modifies the AST nodes in-place for performance. + + The normalizer works on deep-copied nodes so callers can safely reuse + the original AST for downstream metrics and reporting passes. """ active_normalizer = normalizer or AstNormalizer(cfg) dumps: list[str] = [] for node in nodes: # Fingerprints ignore location attributes, so we skip location repair. 
- new_node = active_normalizer.visit(node) + new_node = active_normalizer.visit(copy.deepcopy(node)) assert isinstance(new_node, ast.AST) dumps.append(ast.dump(new_node, annotate_fields=True, include_attributes=False)) return ";".join(dumps) + + +def _normalized_stmt_dump(stmt: ast.stmt, normalizer: AstNormalizer) -> str: + normalized = normalizer.visit(stmt) + assert isinstance(normalized, ast.AST) + return ast.dump(normalized, annotate_fields=True, include_attributes=False) + + +def stmt_hashes(statements: Sequence[ast.stmt], cfg: NormalizationConfig) -> list[str]: + normalizer = AstNormalizer(cfg) + return [ + hashlib.sha1( + _normalized_stmt_dump(stmt, normalizer).encode("utf-8") + ).hexdigest() + for stmt in statements + ] diff --git a/codeclone/paths.py b/codeclone/paths.py index 551d2be..c9a33a6 100644 --- a/codeclone/paths.py +++ b/codeclone/paths.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/pipeline.py b/codeclone/pipeline.py index a3701bb..527fa6b 100644 --- a/codeclone/pipeline.py +++ b/codeclone/pipeline.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -10,7 +13,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal, cast -from . 
import _coerce +from ._coerce import as_int, as_str from .cache import ( Cache, CacheEntry, @@ -74,6 +77,10 @@ DEFAULT_BATCH_SIZE = 100 PARALLEL_MIN_FILES_PER_WORKER = 8 PARALLEL_MIN_FILES_FLOOR = 16 +DEFAULT_RUNTIME_PROCESSES = 4 + +_as_int = as_int +_as_str = as_str @dataclass(frozen=True, slots=True) @@ -190,6 +197,7 @@ class ReportArtifacts: text: str | None = None md: str | None = None sarif: str | None = None + report_document: dict[str, object] | None = None @dataclass(frozen=True, slots=True) @@ -203,10 +211,6 @@ class MetricGateConfig: fail_on_new_metrics: bool -_as_int = _coerce.as_int -_as_str = _coerce.as_str - - def _as_sorted_str_tuple(value: object) -> tuple[str, ...]: if not isinstance(value, list): return () @@ -338,6 +342,12 @@ def _parallel_min_files(processes: int) -> int: return max(PARALLEL_MIN_FILES_FLOOR, processes * PARALLEL_MIN_FILES_PER_WORKER) +def _resolve_process_count(processes: object) -> int: + if processes is None: + return DEFAULT_RUNTIME_PROCESSES + return max(1, _as_int(processes, DEFAULT_RUNTIME_PROCESSES)) + + def _should_collect_structural_findings(output_paths: OutputPaths) -> bool: return any( path is not None @@ -816,7 +826,9 @@ def process( failed_files: list[str] = [] source_read_failures: list[str] = [] root_str = str(boot.root) - processes = max(1, int(boot.args.processes)) + # Keep process-count fallback in the core runtime so non-CLI callers such as + # the MCP service do not need to guess or mirror parallelism policy. 
+ processes = _resolve_process_count(boot.args.processes) min_loc = int(boot.args.min_loc) min_stmt = int(boot.args.min_stmt) block_min_loc = int(boot.args.block_min_loc) @@ -1428,6 +1440,18 @@ def analyze( ) +def _load_markdown_report_renderer() -> Callable[..., str]: + from .report.markdown import to_markdown_report + + return to_markdown_report + + +def _load_sarif_report_renderer() -> Callable[..., str]: + from .report.sarif import to_sarif_report + + return to_sarif_report + + def report( *, boot: BootstrapResult, @@ -1439,6 +1463,7 @@ def report( new_block: Collection[str], html_builder: Callable[..., str] | None = None, metrics_diff: object | None = None, + include_report_document: bool = False, ) -> ReportArtifacts: contents: dict[str, str | None] = { "html": None, @@ -1466,13 +1491,17 @@ def report( "file_list": list(discovery.all_file_paths), } report_document: dict[str, object] | None = None - needs_report_document = boot.output_paths.html is not None or any( - path is not None - for path in ( - boot.output_paths.json, - boot.output_paths.md, - boot.output_paths.sarif, - boot.output_paths.text, + needs_report_document = ( + include_report_document + or boot.output_paths.html is not None + or any( + path is not None + for path in ( + boot.output_paths.json, + boot.output_paths.md, + boot.output_paths.sarif, + boot.output_paths.text, + ) ) ) @@ -1525,10 +1554,11 @@ def report( if boot.output_paths.json and report_document is not None: contents["json"] = render_json_report_document(report_document) - if boot.output_paths.md and report_document is not None: - from .report.markdown import to_markdown_report - - contents["md"] = to_markdown_report( + def _render_projection_artifact( + renderer: Callable[..., str], + ) -> str: + assert report_document is not None + return renderer( report_document=report_document, meta=report_meta, inventory=report_inventory, @@ -1544,24 +1574,12 @@ def report( structural_findings=sf, ) - if boot.output_paths.sarif and 
report_document is not None: - from .report.sarif import to_sarif_report - - contents["sarif"] = to_sarif_report( - report_document=report_document, - meta=report_meta, - inventory=report_inventory, - func_groups=analysis.func_groups, - block_groups=analysis.block_groups_report, - segment_groups=analysis.segment_groups, - block_facts=analysis.block_group_facts, - new_function_group_keys=new_func, - new_block_group_keys=new_block, - new_segment_group_keys=set(analysis.segment_groups.keys()), - metrics=analysis.metrics_payload, - suggestions=analysis.suggestions, - structural_findings=sf, - ) + for key, output_path, loader in ( + ("md", boot.output_paths.md, _load_markdown_report_renderer), + ("sarif", boot.output_paths.sarif, _load_sarif_report_renderer), + ): + if output_path and report_document is not None: + contents[key] = _render_projection_artifact(loader()) if boot.output_paths.text and report_document is not None: contents["text"] = render_text_report_document(report_document) @@ -1572,6 +1590,7 @@ def report( md=contents["md"], sarif=contents["sarif"], text=contents["text"], + report_document=report_document, ) diff --git a/codeclone/qualnames.py b/codeclone/qualnames.py new file mode 100644 index 0000000..a63229b --- /dev/null +++ b/codeclone/qualnames.py @@ -0,0 +1,57 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import ast + +__all__ = ["FunctionNode", "QualnameCollector"] + +FunctionNode = ast.FunctionDef | ast.AsyncFunctionDef + + +class QualnameCollector(ast.NodeVisitor): + __slots__ = ( + "class_count", + "class_nodes", + "funcs", + "function_count", + "method_count", + "stack", + "units", + ) + + def __init__(self) -> None: + self.stack: list[str] = [] + self.units: list[tuple[str, FunctionNode]] = [] + self.class_nodes: list[tuple[str, ast.ClassDef]] = [] + self.funcs: dict[str, FunctionNode] = {} + self.class_count = 0 + self.function_count = 0 + self.method_count = 0 + + def visit_ClassDef(self, node: ast.ClassDef) -> None: + self.class_count += 1 + class_qualname = ".".join([*self.stack, node.name]) if self.stack else node.name + self.class_nodes.append((class_qualname, node)) + self.stack.append(node.name) + self.generic_visit(node) + self.stack.pop() + + def _register_function(self, node: FunctionNode) -> None: + name = ".".join([*self.stack, node.name]) if self.stack else node.name + if self.stack: + self.method_count += 1 + else: + self.function_count += 1 + self.units.append((name, node)) + self.funcs[name] = node + + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + self._register_function(node) + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: + self._register_function(node) diff --git a/codeclone/report/__init__.py b/codeclone/report/__init__.py index 08f4da3..79da57b 100644 --- a/codeclone/report/__init__.py +++ b/codeclone/report/__init__.py @@ -1,9 +1,11 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations -from ..extractor import _QualnameCollector from ..grouping import build_block_groups, build_groups, build_segment_groups from .blocks import merge_block_items as _merge_block_items from .blocks import prepare_block_report_groups @@ -48,7 +50,6 @@ "_FORBIDDEN_STMTS", "GroupItem", "GroupMap", - "_QualnameCollector", "_SegmentAnalysis", "_analyze_segment_statements", "_assign_targets_attribute_only", diff --git a/codeclone/report/_formatting.py b/codeclone/report/_formatting.py index 9b3cffb..4e9362b 100644 --- a/codeclone/report/_formatting.py +++ b/codeclone/report/_formatting.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/report/_source_kinds.py b/codeclone/report/_source_kinds.py index 4e9dee8..15aab14 100644 --- a/codeclone/report/_source_kinds.py +++ b/codeclone/report/_source_kinds.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/report/blocks.py b/codeclone/report/blocks.py index 7e1b592..8ecaf5a 100644 --- a/codeclone/report/blocks.py +++ b/codeclone/report/blocks.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/report/derived.py b/codeclone/report/derived.py index cfa8fd1..521c317 100644 --- a/codeclone/report/derived.py +++ b/codeclone/report/derived.py @@ -1,13 +1,20 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations from collections import Counter -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast -from .. import _coerce +from .._coerce import as_int as _as_int from ..domain.source_scope import ( + IMPACT_SCOPE_MIXED, + IMPACT_SCOPE_NON_RUNTIME, + IMPACT_SCOPE_RUNTIME, + SOURCE_KIND_BREAKDOWN_KEYS, SOURCE_KIND_FIXTURES, SOURCE_KIND_MIXED, SOURCE_KIND_OTHER, @@ -35,6 +42,8 @@ "report_location_from_structural_occurrence", "representative_locations", "source_kind_breakdown", + "source_scope_from_counts", + "source_scope_from_locations", ] SOURCE_KIND_ORDER: dict[SourceKind, int] = { @@ -45,8 +54,6 @@ SOURCE_KIND_OTHER: _SOURCE_KIND_ORDER[SOURCE_KIND_OTHER], } -_as_int = _coerce.as_int - def _normalize_path(value: str) -> str: return value.replace("\\", "/").strip() @@ -115,6 +122,64 @@ def combine_source_kinds( return SOURCE_KIND_MIXED +def normalized_source_kind(value: object) -> SourceKind: + source_kind_text = str(value).strip().lower() or SOURCE_KIND_OTHER + if source_kind_text == SOURCE_KIND_PRODUCTION: + return SOURCE_KIND_PRODUCTION + if source_kind_text == SOURCE_KIND_TESTS: + return SOURCE_KIND_TESTS + if source_kind_text == SOURCE_KIND_FIXTURES: + return SOURCE_KIND_FIXTURES + return SOURCE_KIND_OTHER + + +def 
source_scope_from_counts( + counts: Mapping[SourceKind, int] | Mapping[str, int], +) -> dict[str, object]: + normalized_counts = cast("Mapping[str, int]", counts) + + def _count(kind: str) -> int: + value = normalized_counts.get(kind, 0) + return int(value) + + breakdown = {kind: _count(kind) for kind in SOURCE_KIND_BREAKDOWN_KEYS} + present = tuple(kind for kind in SOURCE_KIND_BREAKDOWN_KEYS if breakdown[kind] > 0) + dominant_kind = ( + present[0] + if len(present) == 1 + else combine_source_kinds(present) + if present + else SOURCE_KIND_OTHER + ) + production_count = breakdown[SOURCE_KIND_PRODUCTION] + non_runtime_count = ( + breakdown[SOURCE_KIND_TESTS] + + breakdown[SOURCE_KIND_FIXTURES] + + breakdown[SOURCE_KIND_OTHER] + ) + match (production_count > 0, non_runtime_count == 0, production_count == 0): + case (True, True, _): + impact_scope = IMPACT_SCOPE_RUNTIME + case (_, _, True): + impact_scope = IMPACT_SCOPE_NON_RUNTIME + case _: + impact_scope = IMPACT_SCOPE_MIXED + return { + "dominant_kind": dominant_kind, + "breakdown": breakdown, + "impact_scope": impact_scope, + } + + +def source_scope_from_locations( + locations: Sequence[Mapping[str, object]], +) -> dict[str, object]: + counts: Counter[SourceKind] = Counter() + for location in locations: + counts[normalized_source_kind(location.get("source_kind"))] += 1 + return source_scope_from_counts(counts) + + def report_location_from_group_item( item: Mapping[str, object], *, diff --git a/codeclone/report/explain.py b/codeclone/report/explain.py index 5673e84..73605b0 100644 --- a/codeclone/report/explain.py +++ b/codeclone/report/explain.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -9,7 +12,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from .. import _coerce +from .._coerce import as_int from .explain_contract import ( BLOCK_HINT_ASSERT_ONLY, BLOCK_HINT_ASSERT_ONLY_LABEL, @@ -37,14 +40,13 @@ class _StatementRecord: _StatementIndex = tuple[tuple[_StatementRecord, ...], tuple[int, ...]] _EMPTY_ASSERT_RANGE_STATS = (0, 0, 0) +_as_int = as_int + def signature_parts(group_key: str) -> list[str]: return [part for part in group_key.split("|") if part] -_as_int = _coerce.as_int - - def parsed_file_tree( filepath: str, *, ast_cache: dict[str, ast.AST | None] ) -> ast.AST | None: diff --git a/codeclone/report/explain_contract.py b/codeclone/report/explain_contract.py index 2169ff2..fccbd5f 100644 --- a/codeclone/report/explain_contract.py +++ b/codeclone/report/explain_contract.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/report/findings.py b/codeclone/report/findings.py index b8745eb..1c7d93d 100644 --- a/codeclone/report/findings.py +++ b/codeclone/report/findings.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """CodeClone — structural code quality analysis for Python. 
@@ -27,6 +30,7 @@ relative_report_path, report_location_from_structural_occurrence, ) +from .json_contract import structural_group_id if TYPE_CHECKING: from collections.abc import Sequence @@ -117,8 +121,10 @@ def _rows_for(entries: Sequence[StructuralFindingOccurrence]) -> str: short_path = relative_report_path(item.file_path, scan_root=scan_root) rows.append( "" - f'' - f"{_escape_html(short_path)}" + f'' + f'' + f"{_escape_html(short_path)}" f'{_source_kind_badge_html(location.source_kind)} ' f"{_escape_html(item.qualname)}" f'{item.start}-{item.end}' @@ -182,8 +188,8 @@ def _finding_reason_list_html( items: Sequence[StructuralFindingOccurrence], ) -> str: spread = _spread(items) - if group.finding_kind == STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: - reasons = [ + clone_cohort_reasons = { + STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: [ ( f"{len(items)} divergent clone members were detected after " "stable sorting and deduplication." @@ -203,20 +209,20 @@ def _finding_reason_list_html( f"{spread['files']} {'file' if spread['files'] == 1 else 'files'}." ), "This is a report-only finding and does not affect clone gating.", - ] - return _render_reason_list_html(reasons) - if group.finding_kind == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: - reasons = [ + ], + STRUCTURAL_KIND_CLONE_COHORT_DRIFT: [ f"{len(items)} clone members diverge from the cohort majority profile.", f"Drift fields: {group.signature.get('drift_fields', 'n/a')}.", ( f"Cohort id: {group.signature.get('cohort_id', 'unknown')} with " f"arity {group.signature.get('cohort_arity', 'n/a')}." 
), - ("Majority profile is compared deterministically with lexical tie-breaks."), + "Majority profile is compared deterministically with lexical tie-breaks.", "This is a report-only finding and does not affect clone gating.", - ] - return _render_reason_list_html(reasons) + ], + } + if group.finding_kind in clone_cohort_reasons: + return _render_reason_list_html(clone_cohort_reasons[group.finding_kind]) stmt_seq = group.signature.get("stmt_seq", "n/a") terminal = group.signature.get("terminal", "n/a") @@ -256,20 +262,20 @@ def _finding_matters_html( ) -> str: spread = _spread(items) count = len(items) - if group.finding_kind == STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: - message = ( + special_messages = { + STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: ( "Members of one function-clone cohort diverged in guard/exit behavior. " "This often points to a partial fix where one path was updated and " "other siblings were left unchanged." - ) - return _finding_matters_paragraph(message) - if group.finding_kind == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: - message = ( + ), + STRUCTURAL_KIND_CLONE_COHORT_DRIFT: ( "Members of one function-clone cohort drifted from a stable majority " "profile (terminal, guard, try/finally, side-effect order). Review " "whether divergence is intentional." - ) - return _finding_matters_paragraph(message) + ), + } + if group.finding_kind in special_messages: + return _finding_matters_paragraph(special_messages[group.finding_kind]) terminal = str(group.signature.get("terminal", "")).strip() stmt_seq = str(group.signature.get("stmt_seq", "")).strip() @@ -279,23 +285,26 @@ def _finding_matters_html( f"{spread['files']} files, so the same branch policy may be copied " "between multiple code paths." ) - elif terminal == "raise": - message = ( - "This group points to repeated guard or validation exits inside one " - "function. Consolidating the shared exit policy usually reduces " - "branch noise." 
- ) - elif terminal == "return": - message = ( - "This group points to repeated return-path logic inside one function. " - "A helper can often keep the branch predicate local while sharing " - "the emitted behavior." - ) else: - message = ( - f"This group reports {count} branches with the same local shape " - f"({stmt_seq or 'unknown signature'}). Review whether the shared " - "branch body should stay duplicated or become a helper." + terminal_messages = { + "raise": ( + "This group points to repeated guard or validation exits inside one " + "function. Consolidating the shared exit policy usually reduces " + "branch noise." + ), + "return": ( + "This group points to repeated return-path logic inside one function. " + "A helper can often keep the branch predicate local while sharing " + "the emitted behavior." + ), + } + message = terminal_messages.get( + terminal, + ( + f"This group reports {count} branches with the same local shape " + f"({stmt_seq or 'unknown signature'}). Review whether the shared " + "branch body should stay duplicated or become a helper." + ), ) return _finding_matters_paragraph(message) @@ -449,9 +458,12 @@ def _render_finding_card( # Scope text — concise spread summary scope_text = _finding_scope_text(deduped_items) + finding_id = structural_group_id(g.finding_kind, g.finding_key) return ( f'
    ' diff --git a/codeclone/report/json_contract.py b/codeclone/report/json_contract.py index 330f92f..4cecd99 100644 --- a/codeclone/report/json_contract.py +++ b/codeclone/report/json_contract.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -9,8 +12,16 @@ from hashlib import sha256 from typing import TYPE_CHECKING, Literal -from .. import _coerce -from ..contracts import REPORT_SCHEMA_VERSION +from .._coerce import as_float as _as_float +from .._coerce import as_int as _as_int +from .._coerce import as_mapping as _as_mapping +from .._coerce import as_sequence as _as_sequence +from ..contracts import ( + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + REPORT_SCHEMA_VERSION, +) from ..domain.findings import ( CATEGORY_COHESION, CATEGORY_COMPLEXITY, @@ -55,12 +66,18 @@ from ..structural_findings import normalize_structural_findings from ..suppressions import INLINE_CODECLONE_SUPPRESSION_SOURCE from .derived import ( - combine_source_kinds, group_spread, relative_report_path, report_location_from_group_item, report_location_from_structural_occurrence, ) +from .derived import ( + source_scope_from_counts as _report_source_scope_from_counts, +) +from .derived import ( + source_scope_from_locations as _report_source_scope_from_locations, +) +from .overview import build_directory_hotspots from .suggestions import classify_clone_type if TYPE_CHECKING: @@ -80,18 +97,6 @@ "structural_group_id", ] -_as_int = _coerce.as_int -_as_float = _coerce.as_float -_as_mapping = _coerce.as_mapping -_as_sequence = _coerce.as_sequence - -_SOURCE_BREAKDOWN_KEYS_TYPED: tuple[SourceKind, ...] 
= ( - SOURCE_KIND_PRODUCTION, - SOURCE_KIND_TESTS, - SOURCE_KIND_FIXTURES, - SOURCE_KIND_OTHER, -) - def _optional_str(value: object) -> str | None: if value is None: @@ -100,6 +105,45 @@ def _optional_str(value: object) -> str | None: return text or None +def _coerced_nonnegative_threshold(value: object, *, default: int) -> int: + threshold = _as_int(value, default) + return threshold if threshold >= 0 else default + + +def _design_findings_thresholds_payload( + raw_meta: Mapping[str, object] | None, +) -> dict[str, object]: + meta = dict(raw_meta or {}) + return { + "design_findings": { + CATEGORY_COMPLEXITY: { + "metric": "cyclomatic_complexity", + "operator": ">", + "value": _coerced_nonnegative_threshold( + meta.get("design_complexity_threshold"), + default=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ), + }, + CATEGORY_COUPLING: { + "metric": "cbo", + "operator": ">", + "value": _coerced_nonnegative_threshold( + meta.get("design_coupling_threshold"), + default=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ), + }, + CATEGORY_COHESION: { + "metric": "lcom4", + "operator": ">=", + "value": _coerced_nonnegative_threshold( + meta.get("design_cohesion_threshold"), + default=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ), + }, + } + } + + def _normalize_path(value: str) -> str: return value.replace("\\", "/").strip() @@ -269,44 +313,17 @@ def _normalized_source_kind(value: object) -> SourceKind: def _source_scope_from_counts( counts: Mapping[SourceKind, int], ) -> dict[str, object]: - breakdown = {kind: counts[kind] for kind in _SOURCE_BREAKDOWN_KEYS_TYPED} - present = tuple( - kind for kind in _SOURCE_BREAKDOWN_KEYS_TYPED if breakdown[kind] > 0 - ) - dominant_kind = ( - present[0] - if len(present) == 1 - else combine_source_kinds(present) - if present - else SOURCE_KIND_OTHER - ) - production_count = breakdown[SOURCE_KIND_PRODUCTION] - non_runtime_count = ( - breakdown[SOURCE_KIND_TESTS] - + breakdown[SOURCE_KIND_FIXTURES] - + breakdown[SOURCE_KIND_OTHER] - ) - 
match (production_count > 0, non_runtime_count == 0, production_count == 0): - case (True, True, _): - impact_scope = IMPACT_SCOPE_RUNTIME - case (_, _, True): - impact_scope = IMPACT_SCOPE_NON_RUNTIME - case _: - impact_scope = IMPACT_SCOPE_MIXED - return { - "dominant_kind": dominant_kind, - "breakdown": breakdown, - "impact_scope": impact_scope, - } + return _report_source_scope_from_counts(counts) def _source_scope_from_locations( locations: Sequence[Mapping[str, object]], ) -> dict[str, object]: - counts: Counter[SourceKind] = Counter() - for location in locations: - counts[_normalized_source_kind(location.get("source_kind"))] += 1 - return _source_scope_from_counts(counts) + normalized_locations = [ + {"source_kind": _normalized_source_kind(location.get("source_kind"))} + for location in locations + ] + return _report_source_scope_from_locations(normalized_locations) def _collect_paths_from_metrics(metrics: Mapping[str, object]) -> set[str]: @@ -849,6 +866,7 @@ def _build_meta_payload( "analysis_mode": str(meta.get("analysis_mode", "full") or "full"), "report_mode": str(meta.get("report_mode", "full") or "full"), "computed_metric_families": metrics_computed, + "analysis_thresholds": _design_findings_thresholds_payload(meta), "baseline": { "path": baseline_path, "path_scope": baseline_path_scope, @@ -889,6 +907,9 @@ def _build_meta_payload( ), }, "runtime": { + "analysis_started_at_utc": _optional_str( + meta.get("analysis_started_at_utc") + ), "report_generated_at_utc": _optional_str( meta.get("report_generated_at_utc") ), @@ -1418,10 +1439,11 @@ def _design_singleton_group( def _complexity_design_group( item_map: Mapping[str, object], *, + threshold: int, scan_root: str, ) -> dict[str, object] | None: cc = _as_int(item_map.get("cyclomatic_complexity"), 1) - if cc <= 20: + if cc <= threshold: return None qualname = str(item_map.get("qualname", "")) filepath = str(item_map.get("relative_path", "")) @@ -1451,10 +1473,11 @@ def _complexity_design_group( def 
_coupling_design_group( item_map: Mapping[str, object], *, + threshold: int, scan_root: str, ) -> dict[str, object] | None: cbo = _as_int(item_map.get("cbo")) - if cbo <= 10: + if cbo <= threshold: return None qualname = str(item_map.get("qualname", "")) filepath = str(item_map.get("relative_path", "")) @@ -1483,10 +1506,11 @@ def _coupling_design_group( def _cohesion_design_group( item_map: Mapping[str, object], *, + threshold: int, scan_root: str, ) -> dict[str, object] | None: lcom4 = _as_int(item_map.get("lcom4")) - if lcom4 <= 3: + if lcom4 < threshold: return None qualname = str(item_map.get("qualname", "")) filepath = str(item_map.get("relative_path", "")) @@ -1562,26 +1586,52 @@ def _dependency_design_group( def _build_design_groups( metrics_payload: Mapping[str, object], *, + design_thresholds: Mapping[str, object] | None = None, scan_root: str, ) -> list[dict[str, object]]: families = _as_mapping(metrics_payload.get("families")) + thresholds = _as_mapping(design_thresholds) + complexity_threshold = _coerced_nonnegative_threshold( + _as_mapping(thresholds.get(CATEGORY_COMPLEXITY)).get("value"), + default=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ) + coupling_threshold = _coerced_nonnegative_threshold( + _as_mapping(thresholds.get(CATEGORY_COUPLING)).get("value"), + default=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ) + cohesion_threshold = _coerced_nonnegative_threshold( + _as_mapping(thresholds.get(CATEGORY_COHESION)).get("value"), + default=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ) groups: list[dict[str, object]] = [] complexity = _as_mapping(families.get(CATEGORY_COMPLEXITY)) for item in _as_sequence(complexity.get("items")): - group = _complexity_design_group(_as_mapping(item), scan_root=scan_root) + group = _complexity_design_group( + _as_mapping(item), + threshold=complexity_threshold, + scan_root=scan_root, + ) if group is not None: groups.append(group) coupling = _as_mapping(families.get(CATEGORY_COUPLING)) for item in 
_as_sequence(coupling.get("items")): - group = _coupling_design_group(_as_mapping(item), scan_root=scan_root) + group = _coupling_design_group( + _as_mapping(item), + threshold=coupling_threshold, + scan_root=scan_root, + ) if group is not None: groups.append(group) cohesion = _as_mapping(families.get(CATEGORY_COHESION)) for item in _as_sequence(cohesion.get("items")): - group = _cohesion_design_group(_as_mapping(item), scan_root=scan_root) + group = _cohesion_design_group( + _as_mapping(item), + threshold=cohesion_threshold, + scan_root=scan_root, + ) if group is not None: groups.append(group) @@ -1907,6 +1957,7 @@ def _build_derived_overview( if dominant_kind_counts[key] > 0 }, "health_snapshot": _health_snapshot(metrics_payload), + "directory_hotspots": build_directory_hotspots(findings=findings), } hotlists: dict[str, object] = { "most_actionable_ids": _sort_flat_finding_ids( @@ -2043,6 +2094,7 @@ def _build_findings_payload( new_function_group_keys: Collection[str] | None, new_block_group_keys: Collection[str] | None, new_segment_group_keys: Collection[str] | None, + design_thresholds: Mapping[str, object] | None, scan_root: str, ) -> dict[str, object]: clone_functions = _build_clone_groups( @@ -2089,6 +2141,7 @@ def _build_findings_payload( ) design_groups = _build_design_groups( metrics_payload, + design_thresholds=design_thresholds, scan_root=scan_root, ) return { @@ -2215,6 +2268,9 @@ def build_report_document( report_schema_version = REPORT_SCHEMA_VERSION scan_root = str(_as_mapping(meta).get("scan_root", "")) meta_payload = _build_meta_payload(meta, scan_root=scan_root) + design_thresholds = _as_mapping( + _as_mapping(meta_payload.get("analysis_thresholds")).get("design_findings") + ) metrics_payload = _build_metrics_payload(metrics, scan_root=scan_root) file_list = _collect_report_file_list( inventory=inventory, @@ -2241,6 +2297,7 @@ def build_report_document( new_function_group_keys=new_function_group_keys, new_block_group_keys=new_block_group_keys, 
new_segment_group_keys=new_segment_group_keys, + design_thresholds=design_thresholds, scan_root=scan_root, ) overview_payload, hotlists_payload = _build_derived_overview( diff --git a/codeclone/report/markdown.py b/codeclone/report/markdown.py index 71e1eef..07a66cd 100644 --- a/codeclone/report/markdown.py +++ b/codeclone/report/markdown.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -6,7 +9,7 @@ from collections.abc import Collection, Mapping, Sequence from typing import TYPE_CHECKING -from .. import _coerce +from .._coerce import as_float, as_int, as_mapping, as_sequence from ..domain.findings import FAMILY_CLONE, FAMILY_DEAD_CODE, FAMILY_STRUCTURAL from ._formatting import format_spread_text from .json_contract import build_report_document @@ -19,6 +22,11 @@ _MAX_FINDING_LOCATIONS = 5 _MAX_METRIC_ITEMS = 10 +_as_int = as_int +_as_float = as_float +_as_mapping = as_mapping +_as_sequence = as_sequence + _ANCHORS: tuple[tuple[str, str, int], ...] = ( ("overview", "Overview", 2), ("inventory", "Inventory", 2), @@ -41,11 +49,6 @@ ("integrity", "Integrity", 2), ) -_as_int = _coerce.as_int -_as_float = _coerce.as_float -_as_mapping = _coerce.as_mapping -_as_sequence = _coerce.as_sequence - def _text(value: object) -> str: if value is None: diff --git a/codeclone/report/merge.py b/codeclone/report/merge.py index fc59e9e..2ba2331 100644 --- a/codeclone/report/merge.py +++ b/codeclone/report/merge.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/report/overview.py b/codeclone/report/overview.py index 14fac90..cc0efda 100644 --- a/codeclone/report/overview.py +++ b/codeclone/report/overview.py @@ -1,13 +1,19 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations from collections import Counter from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING +from pathlib import PurePosixPath +from typing import TYPE_CHECKING, cast -from .. import _coerce +from .._coerce import as_int as _as_int +from .._coerce import as_mapping as _as_mapping +from .._coerce import as_sequence as _as_sequence from ..domain.findings import ( CATEGORY_COHESION, CATEGORY_COMPLEXITY, @@ -17,6 +23,7 @@ CLONE_KIND_BLOCK, CLONE_KIND_FUNCTION, CLONE_KIND_SEGMENT, + FAMILY_CLONE, FAMILY_CLONES, FAMILY_DEAD_CODE, FAMILY_DESIGN, @@ -36,21 +43,22 @@ BLOCK_HINT_ASSERT_ONLY, BLOCK_PATTERN_REPEATED_STMT_HASH, ) -from .derived import format_spread_location_label +from .derived import ( + classify_source_kind, + format_spread_location_label, + source_scope_from_locations, +) if TYPE_CHECKING: from ..models import Suggestion __all__ = [ + "build_directory_hotspots", "build_report_overview", "materialize_report_overview", "serialize_suggestion_card", ] -_as_int = _coerce.as_int -_as_mapping = _coerce.as_mapping -_as_sequence = _coerce.as_sequence - def serialize_suggestion_card(suggestion: Suggestion) -> dict[str, object]: return { @@ -94,6 +102,260 @@ def _flatten_findings(findings: Mapping[str, object]) -> list[Mapping[str, objec ] +_DIRECTORY_HOTSPOT_BUCKETS: tuple[str, ...] 
= ( + "all", + "clones", + "structural", + CATEGORY_COMPLEXITY, + CATEGORY_COHESION, + CATEGORY_COUPLING, + CATEGORY_DEAD_CODE, + CATEGORY_DEPENDENCY, +) +_DIRECTORY_KIND_BREAKDOWN_KEYS: tuple[str, ...] = ( + "clones", + "structural", + CATEGORY_DEAD_CODE, + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_COHESION, + CATEGORY_DEPENDENCY, +) + + +def _directory_bucket_keys(group: Mapping[str, object]) -> tuple[str, ...]: + family = str(group.get("family", "")).strip() + category = str(group.get("category", "")).strip() + if family == FAMILY_CLONE: + return ("all", "clones") + if family == FAMILY_STRUCTURAL: + return ("all", "structural") + if family == FAMILY_DEAD_CODE: + return ("all", CATEGORY_DEAD_CODE) + if family == FAMILY_DESIGN and category in { + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_COHESION, + CATEGORY_DEPENDENCY, + }: + return ("all", category) + return ("all",) + + +def _directory_kind_breakdown_key(group: Mapping[str, object]) -> str | None: + family = str(group.get("family", "")).strip() + category = str(group.get("category", "")).strip() + if family == FAMILY_CLONE: + return "clones" + if family == FAMILY_STRUCTURAL: + return "structural" + if family == FAMILY_DEAD_CODE: + return CATEGORY_DEAD_CODE + if family == FAMILY_DESIGN and category in { + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_COHESION, + CATEGORY_DEPENDENCY, + }: + return category + return None + + +def _directory_relative_path(item: Mapping[str, object]) -> str | None: + relative_path = str(item.get("relative_path", "")).replace("\\", "/").strip() + if not relative_path: + module = str(item.get("module", "")).strip() + if module: + relative_path = module.replace(".", "/") + ".py" + return relative_path or None + + +def _directory_path_label(relative_path: str) -> str: + parent = PurePosixPath(relative_path).parent.as_posix() + return parent if parent not in {"", "/"} else "." 
+ + +def _directory_scope_root_label( + relative_path: str, + *, + source_kind: str, +) -> str | None: + parts = tuple( + part for part in PurePosixPath(relative_path).parts if part not in {"", "."} + ) + if not parts: + return None + tests_idx = next( + (index for index, part in enumerate(parts) if part == SOURCE_KIND_TESTS), + None, + ) + if tests_idx is None: + return None + if ( + source_kind == SOURCE_KIND_FIXTURES + and tests_idx + 1 < len(parts) + and parts[tests_idx + 1] == SOURCE_KIND_FIXTURES + ): + return "/".join(parts[: tests_idx + 2]) + if source_kind == SOURCE_KIND_TESTS: + return "/".join(parts[: tests_idx + 1]) + return None + + +def _overview_directory_label( + relative_path: str, + *, + source_kind: str, +) -> str: + scope_root = _directory_scope_root_label( + relative_path, + source_kind=source_kind, + ) + if scope_root: + return scope_root + return _directory_path_label(relative_path) + + +def _directory_contributions( + group: Mapping[str, object], +) -> dict[str, dict[str, object]]: + contributions: dict[str, dict[str, object]] = {} + for item in map(_as_mapping, _as_sequence(group.get("items"))): + relative_path = _directory_relative_path(item) + if relative_path is None: + continue + source_kind = str(item.get("source_kind", "")).strip() or classify_source_kind( + relative_path + ) + directory = _overview_directory_label(relative_path, source_kind=source_kind) + entry = contributions.setdefault( + directory, + { + "affected_items": 0, + "files": set(), + "locations": [], + }, + ) + entry["affected_items"] = _as_int(entry.get("affected_items")) + 1 + cast(set[str], entry["files"]).add(relative_path) + cast(list[dict[str, object]], entry["locations"]).append( + {"source_kind": source_kind} + ) + return contributions + + +def _directory_group_data( + group: Mapping[str, object], +) -> tuple[str, dict[str, dict[str, object]]] | None: + group_id = str(group.get("id", "")).strip() + if not group_id: + return None + contributions = 
_directory_contributions(group) + if not contributions: + return None + return group_id, contributions + + +def build_directory_hotspots( + *, + findings: Mapping[str, object], + limit: int = 5, +) -> dict[str, object]: + normalized_limit = max(1, _as_int(limit, 5)) + bucket_rows: dict[str, dict[str, dict[str, object]]] = { + bucket: {} for bucket in _DIRECTORY_HOTSPOT_BUCKETS + } + bucket_totals: Counter[str] = Counter() + + for group in _flatten_findings(findings): + group_data = _directory_group_data(group) + if group_data is None: + continue + group_id, contributions = group_data + bucket_keys = _directory_bucket_keys(group) + kind_key = _directory_kind_breakdown_key(group) + for bucket in bucket_keys: + rows = bucket_rows[bucket] + for directory, contribution in contributions.items(): + row = rows.setdefault( + directory, + { + "path": directory, + "finding_ids": set(), + "affected_items": 0, + "files": set(), + "locations": [], + "kind_breakdown_ids": { + key: set() for key in _DIRECTORY_KIND_BREAKDOWN_KEYS + }, + }, + ) + cast(set[str], row["finding_ids"]).add(group_id) + row["affected_items"] = _as_int(row.get("affected_items")) + _as_int( + contribution.get("affected_items") + ) + cast(set[str], row["files"]).update( + cast(set[str], contribution["files"]) + ) + cast(list[dict[str, object]], row["locations"]).extend( + cast(list[dict[str, object]], contribution["locations"]) + ) + if bucket == "all" and kind_key is not None: + kind_rows = cast( + dict[str, set[str]], + row["kind_breakdown_ids"], + ) + kind_rows[kind_key].add(group_id) + bucket_totals[bucket] += _as_int(contribution.get("affected_items")) + + def _row_sort_key(row: Mapping[str, object]) -> tuple[int, int, int, str]: + return ( + -len(cast(set[str], row["finding_ids"])), + -_as_int(row.get("affected_items")), + -len(cast(set[str], row["files"])), + str(row.get("path", "")), + ) + + hotspots: dict[str, object] = {} + for bucket in _DIRECTORY_HOTSPOT_BUCKETS: + bucket_items = 
sorted(bucket_rows[bucket].values(), key=_row_sort_key) + total_directories = len(bucket_items) + total_affected_items = bucket_totals[bucket] + items: list[dict[str, object]] = [] + for row in bucket_items[:normalized_limit]: + finding_groups = len(cast(set[str], row["finding_ids"])) + affected_items = _as_int(row.get("affected_items")) + files = len(cast(set[str], row["files"])) + item = { + "path": str(row.get("path", ".")), + "finding_groups": finding_groups, + "affected_items": affected_items, + "files": files, + "share_pct": round( + (affected_items / total_affected_items) * 100.0, + 1, + ) + if total_affected_items > 0 + else 0.0, + "source_scope": source_scope_from_locations( + cast(list[dict[str, object]], row["locations"]) + ), + } + if bucket == "all": + item["kind_breakdown"] = { + key: len(cast(dict[str, set[str]], row["kind_breakdown_ids"])[key]) + for key in _DIRECTORY_KIND_BREAKDOWN_KEYS + } + items.append(item) + hotspots[bucket] = { + "total_directories": total_directories, + "returned": len(items), + "has_more": total_directories > len(items), + "items": items, + } + return hotspots + + def _clone_fact_kind(kind: str) -> str: return { CLONE_KIND_FUNCTION: "Function clone group", diff --git a/codeclone/report/sarif.py b/codeclone/report/sarif.py index c6bd6ff..d0e8554 100644 --- a/codeclone/report/sarif.py +++ b/codeclone/report/sarif.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -10,12 +13,16 @@ from pathlib import Path from typing import TYPE_CHECKING, cast -from .. 
import _coerce +from .._coerce import as_float as _as_float +from .._coerce import as_int as _as_int +from .._coerce import as_mapping as _as_mapping +from .._coerce import as_sequence as _as_sequence from ..contracts import DOCS_URL, REPOSITORY_URL from ..domain.findings import ( CATEGORY_COHESION, CATEGORY_COMPLEXITY, CATEGORY_COUPLING, + CATEGORY_DEPENDENCY, CLONE_KIND_BLOCK, CLONE_KIND_FUNCTION, FAMILY_CLONE, @@ -32,6 +39,7 @@ STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE, STRUCTURAL_KIND_DUPLICATED_BRANCHES, SYMBOL_KIND_CLASS, + SYMBOL_KIND_FUNCTION, SYMBOL_KIND_METHOD, ) from ..domain.quality import ( @@ -63,12 +71,6 @@ class _RuleSpec: precision: str -_as_int = _coerce.as_int -_as_float = _coerce.as_float -_as_mapping = _coerce.as_mapping -_as_sequence = _coerce.as_sequence - - def _text(value: object) -> str: if value is None: return "" @@ -79,26 +81,12 @@ def _severity_to_level(severity: str) -> str: if severity == SEVERITY_CRITICAL: return "error" if severity == SEVERITY_WARNING: - return SEVERITY_WARNING + return "warning" return "note" -def _slug(text: str) -> str: - slug_chars: list[str] = [] - prev_dash = False - for char in text.lower(): - if char.isalnum(): - slug_chars.append(char) - prev_dash = False - continue - if not prev_dash: - slug_chars.append("-") - prev_dash = True - return "".join(slug_chars).strip("-") or "finding" - - def _rule_name(spec: _RuleSpec) -> str: - return f"codeclone.{_slug(spec.short_description)}" + return f"codeclone.{spec.rule_id}" def _rule_remediation(spec: _RuleSpec) -> str: @@ -285,7 +273,7 @@ def _structural_rule_spec(kind: str) -> _RuleSpec: def _dead_code_rule_spec(category: str) -> _RuleSpec: - if category == CLONE_KIND_FUNCTION: + if category == SYMBOL_KIND_FUNCTION: return _RuleSpec( "CDEAD001", "Unused function", @@ -416,7 +404,7 @@ def _structural_result_message( ) if signature_family == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: drift_fields = _as_sequence(signature.get("drift_fields")) - drift_label = 
",".join(_text(item) for item in drift_fields) or "profile" + drift_label = ", ".join(_text(item) for item in drift_fields) or "profile" cohort_id = _text(signature.get("cohort_id")) return ( f"Clone cohort drift ({drift_label}), " @@ -439,7 +427,7 @@ def _dead_code_result_message( ) -> str: confidence = _text(group.get("confidence")) or "reported" target = qualname or relative_path - return f"Unused {category} with {confidence} confidence: {target}" + return f"Unused {category} with {confidence} confidence: {target}." def _design_result_message( @@ -451,15 +439,15 @@ def _design_result_message( ) -> str: if category == CATEGORY_COHESION: lcom4 = _as_int(facts.get("lcom4")) - return f"Low cohesion class (LCOM4={lcom4}): {qualname}" + return f"Low cohesion class (LCOM4={lcom4}): {qualname}." if category == CATEGORY_COMPLEXITY: cc = _as_int(facts.get("cyclomatic_complexity")) - return f"High complexity function (CC={cc}): {qualname}" + return f"High complexity function (CC={cc}): {qualname}." if category == CATEGORY_COUPLING: cbo = _as_int(facts.get("cbo")) - return f"High coupling class (CBO={cbo}): {qualname}" + return f"High coupling class (CBO={cbo}): {qualname}." modules = [_text(item.get("module")) for item in items if _text(item.get("module"))] - return f"Dependency cycle ({len(modules)} modules): {' -> '.join(modules)}" + return f"Dependency cycle ({len(modules)} modules): {' -> '.join(modules)}." 
def _result_message(group: Mapping[str, object]) -> str: @@ -515,13 +503,7 @@ def _location_message( ) -> str: family = _text(group.get("family")) category = _text(group.get("category")) - if family == FAMILY_CLONE: - return ( - "Representative occurrence" - if related_id is None - else f"Related occurrence #{related_id}" - ) - if family == FAMILY_STRUCTURAL: + if family in {FAMILY_CLONE, FAMILY_STRUCTURAL}: return ( "Representative occurrence" if related_id is None @@ -533,7 +515,7 @@ def _location_message( if related_id is None else f"Related declaration #{related_id}" ) - if category == "dependency": + if category == CATEGORY_DEPENDENCY: return ( "Cycle member" if related_id is None @@ -688,8 +670,6 @@ def _result_properties(group: Mapping[str, object]) -> dict[str, object]: props, facts=_as_mapping(group.get("facts")), ) - if family == FAMILY_DEAD_CODE: - props["confidence"] = _text(group.get("confidence")) return props @@ -703,15 +683,6 @@ def _partial_fingerprints( path = _text(primary_item.get("relative_path")) qualname = _text(primary_item.get("qualname")) start_line = _as_int(primary_item.get("start_line")) - end_line = _as_int(primary_item.get("end_line")) - fingerprints = { - "rule": rule_id, - "path": path, - } - if qualname: - fingerprints["qualname"] = qualname - if start_line > 0: - fingerprints["region"] = f"{start_line}-{end_line or start_line}" if path and start_line > 0: fingerprint_material = "\0".join( ( @@ -719,16 +690,32 @@ def _partial_fingerprints( finding_id, path, qualname, - str(start_line), - str(end_line or start_line), ) ) - fingerprints["primaryLocationLineHash"] = ( - f"{hashlib.sha256(fingerprint_material.encode('utf-8')).hexdigest()[:16]}" - f":{start_line}" - ) - fingerprints["finding"] = finding_id - return fingerprints + return { + "primaryLocationLineHash": ( + f"{hashlib.sha256(fingerprint_material.encode('utf-8')).hexdigest()[:16]}" + f":{start_line}" + ) + } + return {} + + +def _primary_location_properties( + primary_item: 
Mapping[str, object], +) -> dict[str, object]: + path = _text(primary_item.get("relative_path")) + qualname = _text(primary_item.get("qualname")) + start_line = _as_int(primary_item.get("start_line")) + end_line = _as_int(primary_item.get("end_line")) + props: dict[str, object] = {} + if path: + props["primaryPath"] = path + if qualname: + props["primaryQualname"] = qualname + if start_line > 0: + props["primaryRegion"] = f"{start_line}-{end_line or start_line}" + return props def _baseline_state(group: Mapping[str, object]) -> str: @@ -763,6 +750,7 @@ def _result_entry( result: dict[str, object] = { "ruleId": rule_id, "ruleIndex": rule_index, + "kind": "fail", "level": _severity_to_level(_text(group.get("severity"))), "message": { "text": _result_message(group), @@ -778,6 +766,9 @@ def _result_entry( ), "properties": _result_properties(group), } + if primary_item: + properties = cast(dict[str, object], result["properties"]) + properties.update(_primary_location_properties(primary_item)) baseline_state = _baseline_state(group) if baseline_state: result["baselineState"] = baseline_state @@ -802,6 +793,7 @@ def _result_entry( def render_sarif_report_document(payload: Mapping[str, object]) -> str: meta = _as_mapping(payload.get("meta")) runtime = _as_mapping(meta.get("runtime")) + analysis_started_at = _text(runtime.get("analysis_started_at_utc")) generated_at = _text(runtime.get("report_generated_at_utc")) analysis_mode = _text(meta.get("analysis_mode")) or "full" findings = sorted( @@ -837,6 +829,7 @@ def render_sarif_report_document(payload: Mapping[str, object]) -> str: ] invocation: dict[str, object] = { "executionSuccessful": True, + **({"startTimeUtc": analysis_started_at} if analysis_started_at else {}), **({"endTimeUtc": generated_at} if generated_at else {}), } if scan_root_uri: @@ -846,7 +839,6 @@ def render_sarif_report_document(payload: Mapping[str, object]) -> str: "driver": { "name": "codeclone", "version": _text(meta.get("codeclone_version")), - 
"semanticVersion": _text(meta.get("codeclone_version")), "informationUri": REPOSITORY_URL, "rules": [ { @@ -869,7 +861,20 @@ def render_sarif_report_document(payload: Mapping[str, object]) -> str: } }, "automationDetails": { - "id": f"codeclone/{analysis_mode}", + "id": "/".join( + part + for part in ( + "codeclone", + analysis_mode, + generated_at + or _text( + _as_mapping( + _as_mapping(payload.get("integrity")).get("digest") + ).get("value") + )[:12], + ) + if part + ), }, **( { @@ -898,7 +903,6 @@ def render_sarif_report_document(payload: Mapping[str, object]) -> str: ), **({"reportGeneratedAtUtc": generated_at} if generated_at else {}), }, - "columnKind": "utf16CodeUnits", } return json.dumps( { diff --git a/codeclone/report/segments.py b/codeclone/report/segments.py index ba5ec9a..7f46502 100644 --- a/codeclone/report/segments.py +++ b/codeclone/report/segments.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -9,7 +12,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from ..extractor import _QualnameCollector +from ..qualnames import FunctionNode, QualnameCollector from .merge import coerce_positive_int, merge_overlapping_items if TYPE_CHECKING: @@ -52,7 +55,7 @@ def merge_segment_items(items: GroupItemsLike) -> list[GroupItem]: def collect_file_functions( filepath: str, -) -> dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None: +) -> dict[str, FunctionNode] | None: try: source = Path(filepath).read_text("utf-8") except OSError: @@ -62,13 +65,13 @@ def collect_file_functions( except SyntaxError: return None - collector = _QualnameCollector() + collector = QualnameCollector() collector.visit(tree) return collector.funcs def segment_statements( - func_node: ast.FunctionDef | ast.AsyncFunctionDef, start_line: int, end_line: int + func_node: FunctionNode, start_line: int, end_line: int ) -> list[ast.stmt]: body = getattr(func_node, "body", None) if not isinstance(body, list): @@ -137,7 +140,7 @@ def analyze_segment_statements(statements: list[ast.stmt]) -> _SegmentAnalysis | def _analyze_segment_item( item: GroupItemLike, *, - file_cache: dict[str, dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None], + file_cache: dict[str, dict[str, FunctionNode] | None], ) -> _SegmentAnalysis | None: filepath = str(item.get("filepath", "")) qualname = str(item.get("qualname", "")) @@ -164,7 +167,7 @@ def _analyze_segment_item( def _analyze_segment_group( items: Sequence[GroupItemLike], *, - file_cache: dict[str, dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None], + file_cache: dict[str, dict[str, FunctionNode] | None], ) -> list[_SegmentAnalysis] | None: analyses: list[_SegmentAnalysis] = [] for item in items: @@ -182,7 +185,7 @@ def prepare_segment_report_groups(segment_groups: GroupMapLike) -> tuple[GroupMa """ suppressed = 0 filtered: GroupMap = 
{} - file_cache: dict[str, dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None] = {} + file_cache: dict[str, dict[str, FunctionNode] | None] = {} for key, items in segment_groups.items(): merged_items = merge_segment_items(items) diff --git a/codeclone/report/serialize.py b/codeclone/report/serialize.py index f074cd3..a4c93b9 100644 --- a/codeclone/report/serialize.py +++ b/codeclone/report/serialize.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -6,13 +9,13 @@ import json from collections.abc import Mapping, Sequence -from .. import _coerce +from .._coerce import as_int, as_mapping, as_sequence from ..domain.source_scope import IMPACT_SCOPE_NON_RUNTIME, SOURCE_KIND_OTHER from ._formatting import format_spread_text -_as_int = _coerce.as_int -_as_mapping = _coerce.as_mapping -_as_sequence = _coerce.as_sequence +_as_int = as_int +_as_mapping = as_mapping +_as_sequence = as_sequence def render_json_report_document(payload: Mapping[str, object]) -> str: diff --git a/codeclone/report/suggestions.py b/codeclone/report/suggestions.py index f1277e2..798eeec 100644 --- a/codeclone/report/suggestions.py +++ b/codeclone/report/suggestions.py @@ -1,11 +1,14 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations from typing import TYPE_CHECKING, Literal -from .. 
import _coerce +from .._coerce import as_int, as_str from ..domain.findings import ( CATEGORY_CLONE, CATEGORY_COHESION, @@ -75,8 +78,8 @@ "dependency", ] -_as_int = _coerce.as_int -_as_str = _coerce.as_str +_as_int = as_int +_as_str = as_str def _priority(severity: Severity, effort: Effort) -> float: diff --git a/codeclone/report/types.py b/codeclone/report/types.py index 42bd16d..6824e6b 100644 --- a/codeclone/report/types.py +++ b/codeclone/report/types.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/scanner.py b/codeclone/scanner.py index 42ed7f7..a9c65a9 100644 --- a/codeclone/scanner.py +++ b/codeclone/scanner.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/structural_findings.py b/codeclone/structural_findings.py index a6aaaf1..d0e3d78 100644 --- a/codeclone/structural_findings.py +++ b/codeclone/structural_findings.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """CodeClone — structural code quality analysis for Python. 
@@ -228,35 +231,55 @@ def normalize_structural_findings( return tuple(normalized) -def _summarize_branch(body: list[ast.stmt]) -> dict[str, str] | None: - """Build deterministic structural signature for a meaningful branch body.""" - if not body or all(isinstance(stmt, ast.Pass) for stmt in body): - return None +_TRY_STAR_TYPE = getattr(ast, "TryStar", None) +_NESTED_SCOPE_TYPES = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef) +_LOOP_TYPES = (ast.For, ast.While, ast.AsyncFor) + + +def _walk_branch_stats(body: Sequence[ast.stmt]) -> _BranchWalkStats: + """Collect branch body facts without descending into nested scopes.""" + call_count = 0 + raise_count = 0 + has_nested_if = False + has_loop = False + has_try = False + stack: list[ast.AST] = [ast.Module(body=list(body), type_ignores=[])] + + while stack: + node = stack.pop() + if isinstance(node, _NESTED_SCOPE_TYPES): + continue - call_count = raise_count = 0 - has_nested_if, has_loop, has_try = False, False, False - try_star = getattr(ast, "TryStar", None) - for node in ast.walk(ast.Module(body=body, type_ignores=[])): if isinstance(node, ast.Call): call_count += 1 elif isinstance(node, ast.Raise): raise_count += 1 elif isinstance(node, ast.If): has_nested_if = True - elif isinstance(node, (ast.For, ast.While, ast.AsyncFor)): + elif isinstance(node, _LOOP_TYPES): has_loop = True elif isinstance(node, ast.Try) or ( - try_star is not None and isinstance(node, try_star) + _TRY_STAR_TYPE is not None and isinstance(node, _TRY_STAR_TYPE) ): has_try = True - stats = _BranchWalkStats( + stack.extend(reversed(list(ast.iter_child_nodes(node)))) + + return _BranchWalkStats( call_count=call_count, raise_count=raise_count, has_nested_if=has_nested_if, has_loop=has_loop, has_try=has_try, ) + + +def _summarize_branch(body: list[ast.stmt]) -> dict[str, str] | None: + """Build deterministic structural signature for a meaningful branch body.""" + if not body or all(isinstance(stmt, ast.Pass) for stmt in body): + return None + 
+ stats = _walk_branch_stats(body) signature = { "stmt_seq": _stmt_type_sequence(body), "terminal": _terminal_kind(body), diff --git a/codeclone/suppressions.py b/codeclone/suppressions.py index 2a9984d..9b2b149 100644 --- a/codeclone/suppressions.py +++ b/codeclone/suppressions.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/codeclone/templates.py b/codeclone/templates.py index eed9082..bc3d493 100644 --- a/codeclone/templates.py +++ b/codeclone/templates.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy """Minimal HTML skeleton template for the report. @@ -20,7 +23,7 @@ REPORT_TEMPLATE = Template( r""" - + diff --git a/codeclone/ui_messages.py b/codeclone/ui_messages.py index c95a9f3..7aca82e 100644 --- a/codeclone/ui_messages.py +++ b/codeclone/ui_messages.py @@ -1,4 +1,7 @@ -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations @@ -31,6 +34,18 @@ HELP_MIN_LOC = "Minimum Lines of Code (LOC) required for clone analysis.\nDefault: 10." HELP_MIN_STMT = "Minimum AST statement count required for clone analysis.\nDefault: 6." HELP_PROCESSES = "Number of parallel worker processes.\nDefault: 4." 
+HELP_CHANGED_ONLY = ( + "Limit clone gating and changed-scope summaries to findings that touch\n" + "files from a git diff selection." +) +HELP_DIFF_AGAINST = ( + "Resolve changed files from `git diff --name-only `.\n" + "Use together with --changed-only." +) +HELP_PATHS_FROM_GIT_DIFF = ( + "Shorthand for --changed-only using `git diff --name-only `.\n" + "Useful for PR and CI review flows." +) HELP_CACHE_PATH = ( "Path to the cache file.\n" "If FILE is omitted, uses /.cache/codeclone/cache.json." @@ -134,6 +149,7 @@ SUMMARY_TITLE = "Summary" METRICS_TITLE = "Metrics" +CHANGED_SCOPE_TITLE = "Changed Scope" CLI_LAYOUT_MAX_WIDTH = 80 @@ -164,6 +180,9 @@ " lcom4={lcom_avg}/{lcom_max} cycles={cycles} dead_code={dead}" " health={health}({grade})" ) +SUMMARY_COMPACT_CHANGED_SCOPE = ( + "Changed paths={paths} findings={findings} new={new} known={known}" +) WARN_SUMMARY_ACCOUNTING_MISMATCH = ( "Summary accounting mismatch: " @@ -516,6 +535,35 @@ def fmt_metrics_dead_code(count: int, *, suppressed: int = 0) -> str: ) +def fmt_changed_scope_paths(*, count: int) -> str: + return f" {'Paths':<{_L}}{_v(count, 'bold cyan')} from git diff" + + +def fmt_changed_scope_findings(*, total: int, new: int, known: int) -> str: + parts = [ + f"{_v(total, 'bold')} total", + f"{_v(new, 'bold cyan')} new", + f"{_v(known)} known", + ] + separator = " \u00b7 " + return f" {'Findings':<{_L}}{separator.join(parts)}" + + +def fmt_changed_scope_compact( + *, + paths: int, + findings: int, + new: int, + known: int, +) -> str: + return SUMMARY_COMPACT_CHANGED_SCOPE.format( + paths=paths, + findings=findings, + new=new, + known=known, + ) + + def fmt_pipeline_done(elapsed: float) -> str: return f" [dim]Pipeline done in {elapsed:.2f}s[/dim]" diff --git a/docs/README.md b/docs/README.md index b46ffd7..b08f1d1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -3,6 +3,13 @@ This site is built with MkDocs and published to [orenlab.github.io/codeclone](https://orenlab.github.io/codeclone/). +!!! 
note "Version Notice" + This site currently documents the in-development `v2.0.x` line from `main`. + For the latest stable CodeClone documentation (`v1.4.4`), see the + [`v1.4.4` README](https://github.com/orenlab/codeclone/blob/v1.4.4/README.md) + and the + [`v1.4.4` docs tree](https://github.com/orenlab/codeclone/tree/v1.4.4/docs). + It has two documentation layers: - [Contracts Book](book/README.md): **contract-first** documentation. This is the canonical @@ -31,12 +38,13 @@ repository build: - [Config and defaults](book/04-config-and-defaults.md) - [Core pipeline and invariants](book/05-core-pipeline.md) - [Baseline contract (schema v2.0)](book/06-baseline.md) -- [Cache contract (schema v2.2)](book/07-cache.md) -- [Report contract (schema v2.1)](book/08-report.md) +- [Cache contract (schema v2.3)](book/07-cache.md) +- [Report contract (schema v2.2)](book/08-report.md) ## Interfaces - [CLI behavior, modes, and UX](book/09-cli.md) +- [MCP interface contract](book/20-mcp-interface.md) - [HTML report rendering contract](book/10-html-render.md) ## System Properties @@ -58,6 +66,7 @@ repository build: - [Architecture narrative](architecture.md) - [CFG design and semantics](cfg.md) +- [MCP integration for AI agents and clients](mcp.md) - [SARIF integration for IDE/code-scanning use](sarif.md) - [Docs publishing and Pages workflow](publishing.md) diff --git a/docs/architecture.md b/docs/architecture.md index 1cce692..4cea0a2 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -144,7 +144,7 @@ gating decisions. Detected findings can be rendered as: - interactive HTML (`--html`), -- canonical JSON (`--json`, schema `2.1`), +- canonical JSON (`--json`, schema `2.2`), - deterministic text projection (`--text`), - deterministic Markdown projection (`--md`), - deterministic SARIF projection (`--sarif`). 
@@ -158,6 +158,7 @@ Reporting uses a layered model: Provenance is carried through `meta` and includes: - runtime/context (`codeclone_version`, `python_version`, `python_tag`, `analysis_mode`, `report_mode`) +- analysis thresholds (`meta.analysis_thresholds.design_findings`) - baseline status block (`meta.baseline.*`) - cache status block (`meta.cache.*`) - metrics-baseline status block (`meta.metrics_baseline.*`) @@ -171,6 +172,61 @@ Explainability contract (v1): --- +## 9. MCP Agent Interface + +CodeClone also exposes an optional MCP layer for AI agents and MCP-capable +clients. + +Current shape: + +- install via the optional `codeclone[mcp]` extra +- launch via `codeclone-mcp` +- transports: + - `stdio` + - `streamable-http` +- semantics: + - read-only + - baseline-aware + - built on the same pipeline/report contracts as the CLI + - bounded in-memory run history + +Operational note: + +- `codeclone/mcp_server.py` is only a thin launcher/registration layer. +- The optional MCP runtime is imported lazily so the base `codeclone` install + and normal CI paths do not require MCP packages. +- `codeclone/mcp_service.py` is the in-process adapter over the existing + pipeline/report contracts. + +The MCP layer is intentionally thin. It does not add a separate analysis engine; +it adapts the existing pipeline into tools/resources such as: + +- analyze repository +- analyze changed paths +- get run summary +- compare runs +- list findings +- inspect one finding +- project remediation payloads +- list hotspots +- generate PR summary +- preview gate outcomes +- keep session-local reviewed markers + +This keeps agent integrations deterministic and aligned with the same canonical +report document used by JSON/HTML/SARIF. + +Security boundaries: + +- Read-only by design — no tool mutates source files, baselines, or repo state. +- `--allow-remote` guard required for non-local transports; default is `stdio`. +- `cache_policy=refresh` rejected to preserve read-only semantics. 
+- Review markers are session-local in-memory state, never persisted. +- Run history bounded by `--history-limit` to prevent unbounded memory growth. +- `git_diff_ref` validated against strict regex to prevent injection. + +--- + ## CI Integration Baseline comparison allows CI to fail **only on new clones**, diff --git a/docs/book/01-architecture-map.md b/docs/book/01-architecture-map.md index a7c0114..1c92481 100644 --- a/docs/book/01-architecture-map.md +++ b/docs/book/01-architecture-map.md @@ -8,26 +8,28 @@ Document current module boundaries and ownership in CodeClone v2.x. Main ownership layers: -- Core detection pipeline: `scanner` -> `extractor` -> `cfg/normalize` -> `grouping`. +- Core detection pipeline: `scanner` -> `extractor` -> `cfg/normalize/blocks` -> `grouping`. - Quality metrics pipeline: complexity/coupling/cohesion/dependencies/dead-code/health. - Contracts and persistence: baseline, metrics baseline, cache, exit semantics. -- Report model and serialization: deterministic JSON/TXT + explainability facts. +- Report model and projections: canonical JSON + deterministic TXT/Markdown/SARIF + explainability facts. +- MCP agent surface: read-only server layer over the same pipeline/report contracts. - Render layer: HTML rendering and template assets. 
## Data model -| Layer | Modules | Responsibility | -|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| -| Contracts | `codeclone/contracts.py`, `codeclone/errors.py` | Shared schema versions, URLs, exit-code enum, typed exceptions | -| Domain models | `codeclone/models.py`, `codeclone/domain/*.py` | Typed dataclasses/enums plus centralized finding/scope/severity taxonomies | -| Discovery + parsing | `codeclone/scanner.py`, `codeclone/extractor.py` | Enumerate files, parse AST, extract function/block/segment units | -| Structural analysis | `codeclone/cfg.py`, `codeclone/normalize.py`, `codeclone/blockhash.py`, `codeclone/fingerprint.py`, `codeclone/blocks.py` | CFG, normalization, statement hashes, block/segment windows | -| Grouping | `codeclone/grouping.py` | Build function/block/segment groups | -| Metrics | `codeclone/metrics/*` | Compute complexity/coupling/cohesion/dependency/dead-code/health signals | -| Report core | `codeclone/report/*`, `codeclone/_cli_meta.py` | Merge windows, explainability facts, deterministic JSON/TXT schema + shared metadata | -| Persistence | `codeclone/baseline.py`, `codeclone/metrics_baseline.py`, `codeclone/cache.py` | Baseline/cache trust/compat/integrity and atomic persistence | -| Runtime orchestration | `codeclone/pipeline.py`, `codeclone/cli.py`, `codeclone/_cli_args.py`, `codeclone/_cli_paths.py`, `codeclone/_cli_summary.py`, `codeclone/_cli_config.py`, `codeclone/ui_messages.py` | CLI UX, stage orchestration, status handling, outputs, error markers | -| Rendering | `codeclone/html_report.py`, `codeclone/_html_report/*`, `codeclone/_html_badges.py`, `codeclone/_html_js.py`, `codeclone/_html_escape.py`, `codeclone/_html_snippets.py`, `codeclone/templates.py` | HTML-only 
view layer over report data | +| Layer | Modules | Responsibility | +|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------| +| Contracts | `codeclone/contracts.py`, `codeclone/errors.py` | Shared schema versions, URLs, exit-code enum, typed exceptions | +| Domain models | `codeclone/models.py`, `codeclone/domain/*.py` | Typed dataclasses/enums plus centralized finding/scope/severity taxonomies | +| Discovery + parsing | `codeclone/scanner.py`, `codeclone/extractor.py` | Enumerate files, parse AST, extract function/block/segment units | +| Structural analysis | `codeclone/cfg.py`, `codeclone/normalize.py`, `codeclone/fingerprint.py`, `codeclone/blocks.py` | CFG, normalization, statement hashes, block/segment windows | +| Grouping | `codeclone/grouping.py` | Build function/block/segment groups | +| Metrics | `codeclone/metrics/*` | Compute complexity/coupling/cohesion/dependency/dead-code/health signals | +| Report core | `codeclone/report/*`, `codeclone/_cli_meta.py` | Canonical report building, deterministic projections, explainability facts, and shared metadata | +| Persistence | `codeclone/baseline.py`, `codeclone/metrics_baseline.py`, `codeclone/cache.py` | Baseline/cache trust/compat/integrity and atomic persistence | +| Runtime orchestration | `codeclone/pipeline.py`, `codeclone/cli.py`, `codeclone/_cli_args.py`, `codeclone/_cli_paths.py`, `codeclone/_cli_summary.py`, `codeclone/_cli_config.py`, `codeclone/ui_messages.py` | CLI UX, stage orchestration, status handling, outputs, error markers | +| MCP agent interface | `codeclone/mcp_service.py`, `codeclone/mcp_server.py` | Read-only MCP tools/resources over canonical analysis and report layers | +| Rendering | `codeclone/html_report.py`, 
`codeclone/_html_report/*`, `codeclone/_html_badges.py`, `codeclone/_html_js.py`, `codeclone/_html_escape.py`, `codeclone/_html_snippets.py`, `codeclone/templates.py` | HTML-only view layer over report data | Refs: @@ -39,6 +41,17 @@ Refs: - Core analysis modules do not depend on render/UI modules. - HTML renderer receives already-computed report data/facts and does not recompute detection semantics. +- MCP layer reuses current pipeline/report semantics and must not introduce a + separate analysis truth path. +- MCP may ship task-specific slim projections (for example, summary-only metrics + or inventory counts) as long as canonical report data remains the source of + truth and richer detail stays reachable through dedicated tools/sections. +- The same rule applies to summary cache convenience fields such as + `freshness` and to production-first triage projections built from + canonical hotlists/suggestions. +- MCP finding lists may also expose short run/finding ids and slimmer relative + location projections, while keeping `get_finding(detail_level="full")` as the + richer per-finding inspection path. - Baseline, metrics baseline, and cache are validated before being trusted. 
Refs: @@ -106,6 +119,7 @@ Refs: | Clone baseline trust/compat/integrity | [06-baseline.md](06-baseline.md) | | Cache trust and fail-open behavior | [07-cache.md](07-cache.md) | | Report schema and provenance | [08-report.md](08-report.md), [10-html-render.md](10-html-render.md) | +| MCP agent surface | [20-mcp-interface.md](20-mcp-interface.md) | | Metrics gates and metrics baseline | [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) | | Dead-code liveness policy | [16-dead-code-contract.md](16-dead-code-contract.md) | | Suggestions and clone typing | [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) | diff --git a/docs/book/02-terminology.md b/docs/book/02-terminology.md index 73a29e1..8feab3d 100644 --- a/docs/book/02-terminology.md +++ b/docs/book/02-terminology.md @@ -29,6 +29,25 @@ Define terms exactly as used by code and tests. - report schema (`report_schema_version`) for report format compatibility. - **payload_sha256**: canonical baseline semantic hash. - **trusted baseline**: baseline loaded + status `ok`. +- **source_kind**: file classification — `production`, `tests`, `fixtures`, `other` — + determined by scanner path rules. Drives source-scope breakdown and + hotspot attribution. +- **health score**: weighted blend of seven dimension scores (0–100). + Dimensions: clones 25%, complexity 20%, cohesion 15%, coupling 10%, + dead code 10%, dependencies 10%, coverage 10%. + Grade bands: A ≥90, B ≥75, C ≥60, D ≥40, F <40. +- **design finding**: metric-driven finding (complexity/coupling/cohesion) + emitted by the canonical report builder when a class or function exceeds + the report-level design threshold. Thresholds are stored in + `meta.analysis_thresholds.design_findings`. +- **suggestion**: advisory recommendation card derived from clones, structural + findings, or metric violations. Advisory only — never gates CI. 
+- **production_hotspot**: finding group whose items are concentrated in + production source scope (`source_kind=production`). +- **freshness**: MCP cache indicator (`fresh` / `mixed` / `reused`) + reflecting how much of the analysis was recomputed vs cache-served. +- **directory_hotspot**: derived aggregation in `derived.overview` showing + which directories concentrate the most findings by category. Refs: @@ -37,12 +56,20 @@ Refs: - `codeclone/blocks.py:extract_segments` - `codeclone/baseline.py:current_python_tag` - `codeclone/baseline.py:Baseline.verify_compatibility` +- `codeclone/scanner.py:classify_source_kind` +- `codeclone/metrics/health.py:compute_health` +- `codeclone/report/json_contract.py:_design_findings_thresholds_payload` +- `codeclone/report/suggestions.py:generate_suggestions` +- `codeclone/report/overview.py:build_directory_hotspots` ## Contracts - New/known classification is key-based, not item-heuristic-based. - Baseline trust is status-driven. - Cache trust is status-driven and independent from baseline trust. +- Design finding universe is determined solely by the canonical report builder; + MCP and HTML read, never resynthesize. +- Suggestions are advisory and never affect exit code. Refs: @@ -87,7 +114,7 @@ Refs: ## Locked by tests - `tests/test_baseline.py::test_baseline_id_lists_must_be_sorted_and_unique` -- `tests/test_report.py::test_report_json_group_order_is_lexicographic` +- `tests/test_report.py::test_report_json_group_order_is_deterministic_by_count_then_id` - `tests/test_cache.py::test_cache_version_mismatch_warns` ## Non-guarantees diff --git a/docs/book/05-core-pipeline.md b/docs/book/05-core-pipeline.md index 81ed0ee..1640dd1 100644 --- a/docs/book/05-core-pipeline.md +++ b/docs/book/05-core-pipeline.md @@ -33,6 +33,30 @@ Stages: 6. Structural report findings: - duplicated branch families from per-function AST structure facts - clone cohort drift families built from existing function groups (no rescan) +7. 
Metrics computation (full mode only): + - per-function cyclomatic complexity + - per-class coupling (CBO) and cohesion (LCOM4) + - dead-code analysis: declaration-only, qualname-based liveness + - dependency graph and cycle detection +8. Health scoring: + - seven dimension scores: clones, complexity, coupling, cohesion, + dead code, dependencies, coverage + - weighted blend → composite score (0–100) and grade (A–F) +9. Design finding extraction: + - threshold-aware findings for complexity, coupling, cohesion + - thresholds recorded in `meta.analysis_thresholds.design_findings` +10. Suggestion generation: + - advisory cards from clone groups, structural findings, metric violations + - deterministic priority sort, never gates CI +11. Derived overview and hotlists: + - overview families, top risks, source breakdown, health snapshot + - directory hotspots by category (`derived.overview.directory_hotspots`) + - hotlists: most actionable, highest spread, production/test-fixture hotspots +12. Gate evaluation: + - clone-baseline diff (NEW vs KNOWN) + - metric threshold gates (`--fail-complexity`, `--fail-coupling`, etc.) + - metric regression gates (`--fail-on-new-metrics`) + - gate reasons emitted in deterministic order Refs: @@ -40,6 +64,11 @@ Refs: - `codeclone/extractor.py:extract_units_and_stats_from_source` - `codeclone/report/blocks.py:prepare_block_report_groups` - `codeclone/report/segments.py:prepare_segment_report_groups` +- `codeclone/metrics/health.py:compute_health` +- `codeclone/report/json_contract.py:_build_design_groups` +- `codeclone/report/suggestions.py:generate_suggestions` +- `codeclone/report/overview.py:build_directory_hotspots` +- `codeclone/pipeline.py:metric_gate_reasons` ## Contracts diff --git a/docs/book/07-cache.md b/docs/book/07-cache.md index 3690c7c..1e2fe51 100644 --- a/docs/book/07-cache.md +++ b/docs/book/07-cache.md @@ -2,7 +2,7 @@ ## Purpose -Define cache schema v2.2, integrity verification, and fail-open behavior. 
+Define cache schema v2.3, integrity verification, and fail-open behavior. ## Public surface @@ -13,7 +13,7 @@ Define cache schema v2.2, integrity verification, and fail-open behavior. ## Data model -On-disk schema (`v == "2.2"`): +On-disk schema (`v == "2.3"`): - Top-level: `v`, `payload`, `sig` - `payload` keys: `py`, `fp`, `ap`, `files`, optional `sr` @@ -58,6 +58,9 @@ Refs: (`min_loc`, `min_stmt`, `block_min_loc`, `block_min_stmt`, `segment_min_loc`, `segment_min_stmt`) - `sig` equals deterministic hash of canonical payload +- Cache schema must also be bumped when cached analysis semantics change in a + way that could leave syntactically valid but semantically stale per-file + entries accepted by runtime compatibility checks. Refs: diff --git a/docs/book/08-report.md b/docs/book/08-report.md index 3f37d6e..f893b51 100644 --- a/docs/book/08-report.md +++ b/docs/book/08-report.md @@ -2,7 +2,7 @@ ## Purpose -Define report contracts in `2.0.0b2`: canonical JSON (`report_schema_version=2.1`) +Define report contracts in `2.0.0b3`: canonical JSON (`report_schema_version=2.2`) plus deterministic TXT/Markdown/SARIF projections. ## Public surface @@ -16,7 +16,7 @@ plus deterministic TXT/Markdown/SARIF projections. ## Data model -JSON report top-level (v2.1): +JSON report top-level (v2.2): - `report_schema_version` - `meta` @@ -26,6 +26,12 @@ JSON report top-level (v2.1): - `derived` - `integrity` +Canonical provenance additions: + +- `meta.analysis_thresholds.design_findings` records the effective report-level + thresholds used to materialize canonical design findings for that run + (`complexity > N`, `coupling > N`, `cohesion >= N`). 
+ Canonical vs non-canonical split: - Canonical: `report_schema_version`, `meta`, `inventory`, `findings`, `metrics` @@ -41,6 +47,7 @@ Derived projection layer: - `top_risks` - `source_scope_breakdown` - `health_snapshot` + - `directory_hotspots` - `derived.hotlists` — deterministic lists of canonical finding IDs: - `most_actionable_ids` - `highest_spread_ids` @@ -73,19 +80,34 @@ Per-group common axes (family-specific fields may extend): - JSON is source of truth for report semantics. - Markdown and SARIF are deterministic projections from the same report document. +- MCP summary/finding/hotlist/report-section queries are deterministic views over + the same canonical report document. - SARIF is an IDE/code-scanning-oriented projection: - repo-relative result paths are anchored via `%SRCROOT%` - referenced files are listed under `run.artifacts` - clone results carry `baselineState` when clone novelty is known - Derived layer (`suggestions`, `overview`, `hotlists`) does not replace canonical findings/metrics. +- Design findings are built once in the canonical report using the effective + threshold policy recorded in `meta.analysis_thresholds.design_findings`; MCP + and HTML must not re-synthesize them post-hoc from raw metric rows. - HTML overview cards are materialized from canonical findings plus `derived.overview` + `derived.hotlists`; pre-expanded overview card payloads are not part of the report contract. +- `derived.overview.directory_hotspots` is a deterministic report-layer + aggregation over canonical findings; HTML must render it as-is or omit it on + compatibility paths without a canonical report document. +- `derived.overview.directory_hotspots[*].path` is an overview-oriented + directory key: runtime findings keep their parent directory, while test-only + and fixture-only findings collapse to the corresponding source-scope roots + (`.../tests` or `.../tests/fixtures`) to avoid duplicating the same hotspot + across leaf fixture paths. 
- Overview hotspot/source-breakdown sections must resolve from canonical report data or deterministic derived IDs; HTML must not silently substitute stale placeholders such as `n/a` or empty-state cards when canonical data exists. -- `report_generated_at_utc` is carried in `meta.runtime` and reused by UI/renderers. +- `analysis_started_at_utc` and `report_generated_at_utc` are carried in + `meta.runtime`; renderers/projections may use them for provenance but must not + reinterpret them as semantic analysis data. - Canonical `meta.scan_root` is normalized to `"."`; absolute runtime paths are exposed under `meta.runtime.*_absolute`. - `clone_type` and `novelty` are group-level properties inside clone groups. @@ -148,6 +170,7 @@ Refs: - [07-cache.md](07-cache.md) - [09-cli.md](09-cli.md) - [10-html-render.md](10-html-render.md) +- [20-mcp-interface.md](20-mcp-interface.md) - [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) - [../sarif.md](../sarif.md) - [../examples/report.md](../examples/report.md) diff --git a/docs/book/09-cli.md b/docs/book/09-cli.md index f5b15c1..0d059cb 100644 --- a/docs/book/09-cli.md +++ b/docs/book/09-cli.md @@ -42,6 +42,13 @@ Refs: - `--open-html-report` is a local UX action layered on top of `--html`; it does not implicitly enable HTML output. - `--timestamped-report-paths` only rewrites default report paths requested via bare report flags; explicit FILE values stay unchanged. +- Changed-scope clone review uses: + - `--changed-only` + - `--diff-against GIT_REF` + - `--paths-from-git-diff GIT_REF` + Typical usage: + - `codeclone . --changed-only --diff-against main` + - `codeclone . --paths-from-git-diff HEAD~1` - Contract errors are prefixed by `CONTRACT ERROR:`. - Gating failures are prefixed by `GATING FAILURE:`. - Internal errors use `fmt_internal_error` with optional debug details. @@ -65,9 +72,15 @@ Refs: `.cache/codeclone/`. - `--open-html-report` requires `--html`; invalid combination is a contract error. 
- `--timestamped-report-paths` requires at least one requested report output; invalid combination is a contract error. +- `--changed-only` requires either `--diff-against` or `--paths-from-git-diff`. +- `--diff-against` requires `--changed-only`. +- `--diff-against` and `--paths-from-git-diff` are mutually exclusive. - Browser-open failure after a successful HTML write is warning-only and does not change the process exit code. - Baseline update write failure is contract error. - In gating mode, unreadable source files are contract errors with higher priority than clone gating failure. +- Changed-scope flags do not create a second canonical report: they project clone + summary/threshold decisions over the changed-files subset after the normal full + analysis completes. Refs: @@ -82,6 +95,9 @@ Refs: | Invalid output extension/path | contract | 2 | | `--open-html-report` without `--html` | contract | 2 | | `--timestamped-report-paths` without reports | contract | 2 | +| `--changed-only` without diff source | contract | 2 | +| `--diff-against` without `--changed-only` | contract | 2 | +| `--diff-against` + `--paths-from-git-diff` | contract | 2 | | Baseline untrusted in CI/gating | contract | 2 | | Unreadable source in CI/gating | contract | 2 | | New clones with `--fail-on-new` | gating | 3 | @@ -93,6 +109,8 @@ Refs: - Summary metric ordering is fixed. - Compact summary mode (`--quiet`) is fixed-format text. - Help epilog is generated from static constants. +- `git diff --name-only` input is normalized to sorted repo-relative paths before + changed-scope projection is applied. 
Refs: @@ -115,5 +133,6 @@ Refs: ## See also - [04-config-and-defaults.md](04-config-and-defaults.md) +- [20-mcp-interface.md](20-mcp-interface.md) - [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) - [16-dead-code-contract.md](16-dead-code-contract.md) diff --git a/docs/book/10-html-render.md b/docs/book/10-html-render.md index e93161f..33151b9 100644 --- a/docs/book/10-html-render.md +++ b/docs/book/10-html-render.md @@ -44,10 +44,23 @@ Refs: - KPI cards with baseline-aware tone (`✓ baselined` / `+N` regression) - Health gauge with baseline delta arc (improvement/degradation) - Executive Summary: issue breakdown (sorted bars) + source breakdown + - Hotspots by Directory: render-only view over `derived.overview.directory_hotspots` - Health Profile: full-width radar chart of dimension scores - Get Badge modal: grade-only / score+grade variants with shields.io embed - Dead-code UI is a single top-level `Dead Code` tab with deterministic split sub-tabs: `Active` and `Suppressed`. +- IDE deep links: + - An IDE picker in the topbar lets users choose their IDE. The selection is + persisted in `localStorage` (key `codeclone-ide`). + - Supported IDEs: PyCharm, IntelliJ IDEA, VS Code, Cursor, Fleet, Zed. + - File paths across Clones, Quality, Suggestions, Dead Code, and Findings + tabs are rendered as `<a>` elements with `data-file` + (absolute path) and `data-line` attributes. + - JetBrains IDEs use `jetbrains://` protocol (requires Toolbox); others use + native URL schemes (`vscode://`, `cursor://`, `fleet://`, `zed://`). + - The scan root is embedded as `data-scan-root` on `<body>` so that + JetBrains links can derive the project name and relative path. + - When no IDE is selected, links are inert (no `href`, default cursor).
Refs: @@ -55,6 +68,8 @@ Refs: - `codeclone/report/overview.py:materialize_report_overview` - `codeclone/_html_report/_sections/_clones.py:_render_group_explanation` - `codeclone/_html_report/_sections/_meta.py:render_meta_panel` +- `codeclone/_html_js.py:_IDE_LINKS` +- `codeclone/_html_report/_assemble.py` (IDE picker topbar widget) ## Invariants (MUST) @@ -63,12 +78,16 @@ Refs: - Novelty controls reflect baseline trust split note and per-group novelty flags. - Suppressed dead-code rows are rendered only from report dead-code suppression payloads and do not become active dead-code findings in UI tables. +- IDE link `data-file` and `data-line` attributes are escaped via + `_escape_attr` before insertion into HTML. Refs: - `codeclone/_html_escape.py:_escape_attr` - `codeclone/_html_snippets.py:_render_code_block` - `codeclone/_html_report/_sections/_clones.py:render_clones_panel` +- `codeclone/_html_report/_tables.py` (path cell IDE links) +- `codeclone/report/findings.py` (structural findings IDE links) ## Failure modes @@ -104,7 +123,12 @@ Refs: ## Non-guarantees - CSS/visual system and interaction details may evolve without schema bump. -- HTML-only interaction affordances (theme toggle, provenance modal, badge - modal, radar chart) are not baseline/cache/report contracts. +- HTML-only interaction affordances (theme toggle, IDE picker, provenance modal, + badge modal, radar chart) are not baseline/cache/report contracts. +- IDE deep link behavior depends on the user's local IDE installation and + protocol handler registration (e.g. JetBrains Toolbox for `jetbrains://`). - Overview layout (KPI grid, executive summary, analytics) is a pure view concern; only the underlying data identity and ordering are contract-sensitive. +- Direct `build_html_report(...)` compatibility paths without a canonical + `report_document` may omit `directory_hotspots`; HTML must not approximate + directory aggregates from suggestion cards. 
diff --git a/docs/book/11-security-model.md b/docs/book/11-security-model.md index d6a271a..a9c917f 100644 --- a/docs/book/11-security-model.md +++ b/docs/book/11-security-model.md @@ -10,6 +10,7 @@ Describe implemented protections and explicit security boundaries. - File read limits and parser limits: `codeclone/cli.py:process_file`, `codeclone/extractor.py:_parse_limits` - Baseline/cache validation: `codeclone/baseline.py`, `codeclone/cache.py` - HTML escaping: `codeclone/_html_escape.py`, `codeclone/html_report.py` +- MCP read-only enforcement: `codeclone/mcp_service.py`, `codeclone/mcp_server.py` ## Data model @@ -25,6 +26,17 @@ Security-relevant input classes: - Sensitive root directories are blocked by scanner policy. - Symlink traversal outside root is skipped. - HTML report escapes text and attribute contexts before embedding. +- MCP server is read-only by design: no tool mutates source files, baselines, + cache, or report artifacts. +- `--allow-remote` guard must be passed explicitly for non-local transports; + default is local-only (`stdio`). +- `cache_policy=refresh` is rejected — MCP cannot trigger cache invalidation. +- Review markers (`mark_finding_reviewed`) are session-local in-memory state; + they are never persisted to disk or leaked into baselines/reports. +- `git_diff_ref` parameter is validated against a strict regex to prevent + command injection via shell-interpreted git arguments. +- Run history is bounded by `--history-limit` (default `4`, maximum `10`) to prevent + unbounded memory growth.
Refs: @@ -54,6 +66,9 @@ Refs: | Oversized baseline | Baseline rejected | | Oversized cache | Cache ignored | | HTML-injected payload in metadata/source | Escaped output | +| `--allow-remote` not passed for HTTP | Transport rejected | +| `cache_policy=refresh` requested | Policy rejected | +| `git_diff_ref` fails regex | Parameter rejected | ## Determinism / canonicalization @@ -74,6 +89,8 @@ Refs: - `tests/test_security.py::test_html_report_escapes_user_content` - `tests/test_html_report.py::test_html_report_escapes_script_breakout_payload` - `tests/test_cache.py::test_cache_too_large_warns` +- `tests/test_mcp_service.py::test_cache_policy_refresh_rejected` +- `tests/test_mcp_server.py::test_allow_remote_guard` ## Non-guarantees diff --git a/docs/book/13-testing-as-spec.md b/docs/book/13-testing-as-spec.md index ac46762..fe83446 100644 --- a/docs/book/13-testing-as-spec.md +++ b/docs/book/13-testing-as-spec.md @@ -29,14 +29,14 @@ Test classes by role: The following matrix is treated as executable contract: -| Contract | Tests | -|--------------------------------------------|---------------------------------------------------------------------------------------------------------------| -| Baseline schema/integrity/compat gates | `tests/test_baseline.py` | -| Cache v2.2 fail-open + status mapping | `tests/test_cache.py`, `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag` | -| Exit code categories and markers | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py` | -| Report schema v2.1 canonical/derived/integrity + JSON/TXT/MD/SARIF projections | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py` | -| HTML render-only explainability + escaping | `tests/test_html_report.py` | -| Scanner traversal safety | `tests/test_scanner_extra.py`, `tests/test_security.py` | +| Contract | Tests | 
+|--------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------| +| Baseline schema/integrity/compat gates | `tests/test_baseline.py` | +| Cache v2.3 fail-open + status mapping | `tests/test_cache.py`, `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag` | +| Exit code categories and markers | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py` | +| Report schema v2.2 canonical/derived/integrity + JSON/TXT/MD/SARIF projections | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py` | +| HTML render-only explainability + escaping | `tests/test_html_report.py` | +| Scanner traversal safety | `tests/test_scanner_extra.py`, `tests/test_security.py` | ## Invariants (MUST) diff --git a/docs/book/14-compatibility-and-versioning.md b/docs/book/14-compatibility-and-versioning.md index 563ff7d..3ad9656 100644 --- a/docs/book/14-compatibility-and-versioning.md +++ b/docs/book/14-compatibility-and-versioning.md @@ -12,6 +12,7 @@ compatibility is enforced. - Metrics baseline compatibility checks: `codeclone/metrics_baseline.py:MetricsBaseline.verify_compatibility` - Cache compatibility checks: `codeclone/cache.py:Cache.load` - Report schema assignment: `codeclone/report/json_contract.py:build_report_document` +- MCP public surface: `codeclone/mcp_server.py`, `codeclone/mcp_service.py` ## Data model @@ -19,8 +20,8 @@ Current contract versions: - `BASELINE_SCHEMA_VERSION = "2.0"` - `BASELINE_FINGERPRINT_VERSION = "1"` -- `CACHE_VERSION = "2.2"` -- `REPORT_SCHEMA_VERSION = "2.1"` +- `CACHE_VERSION = "2.3"` +- `REPORT_SCHEMA_VERSION = "2.2"` - `METRICS_BASELINE_SCHEMA_VERSION = "1.0"` (standalone metrics-baseline file) Refs: @@ -33,10 +34,29 @@ Version bump rules: - Bump **baseline schema** only for baseline JSON layout/type changes. 
- Bump **fingerprint version** when clone key semantics change. -- Bump **cache schema** for cache wire-format/validation changes. +- Bump **cache schema** for cache wire-format/validation changes and for + cached-analysis semantic changes that would otherwise leave stale cache + entries looking compatible to runtime validation. - Bump **report schema** for canonical report document contract changes (`report_schema_version`, consumed by JSON/TXT/Markdown/SARIF and HTML provenance/view). - Bump **metrics-baseline schema** only for standalone metrics-baseline payload changes. +- MCP does not currently define a separate schema/version constant; tool names, + resource shapes, and documented request/response semantics are therefore + package-versioned public surface and must be documented/tested when changed. +- Slimming or splitting MCP-only projections (for example, summary payloads or + `metrics` vs `metrics_detail`) does not change `report_schema_version` as long + as the canonical report document and finding identities remain unchanged. +- The same rule applies to finding-level MCP projection changes such as + short MCP ids, slim summary locations, or omitting `priority_factors` + outside `detail_level="full"`. +- Additive MCP-only convenience fields/projections such as + `cache.freshness` or production-first triage also do not change + `report_schema_version` when they are derived from unchanged canonical report + and summary data. +- Canonical report changes such as `meta.analysis_thresholds.design_findings` + or threshold-aware design finding materialization do change + `report_schema_version` because they alter canonical report semantics and + integrity payload. 
Baseline compatibility rules: @@ -84,8 +104,8 @@ Refs: ## Locked by tests -- `tests/test_baseline.py::test_baseline_verify_schema_too_new` -- `tests/test_baseline.py::test_baseline_verify_schema_major_mismatch` +- `tests/test_baseline.py::test_baseline_verify_schema_incompatibilities[schema_too_new]` +- `tests/test_baseline.py::test_baseline_verify_schema_incompatibilities[schema_major_mismatch]` - `tests/test_baseline.py::test_baseline_verify_fingerprint_mismatch` - `tests/test_cache.py::test_cache_v_field_version_mismatch_warns` - `tests/test_report.py::test_report_json_compact_v21_contract` diff --git a/docs/book/15-metrics-and-quality-gates.md b/docs/book/15-metrics-and-quality-gates.md index ed9d483..7f9f760 100644 --- a/docs/book/15-metrics-and-quality-gates.md +++ b/docs/book/15-metrics-and-quality-gates.md @@ -52,8 +52,8 @@ Refs: runtime auto-enables clone-only mode (`skip_metrics=true`). - In clone-only mode: `skip_dead_code=true`, `skip_dependencies=true`. -- `--fail-dead-code` forces dead-code analysis on. -- `--fail-cycles` forces dependency analysis on. +- `--fail-dead-code` forces dead-code analysis on (even if metrics are skipped). +- `--fail-cycles` forces dependency analysis on (even if metrics are skipped). - `--update-baseline` in full mode implies metrics-baseline update in the same run. 
- If metrics baseline path equals clone baseline path and clone baseline file is diff --git a/docs/book/17-suggestions-and-clone-typing.md b/docs/book/17-suggestions-and-clone-typing.md index 5befb4f..eac9246 100644 --- a/docs/book/17-suggestions-and-clone-typing.md +++ b/docs/book/17-suggestions-and-clone-typing.md @@ -21,6 +21,8 @@ Suggestion shape: - `severity`: `critical|warning|info` - `category`: `clone|structural|complexity|coupling|cohesion|dead_code|dependency` +- `source_kind`: source classification of the primary location + (`production` / `tests` / `fixtures` / `other`) - `title`, `location`, `steps`, `effort`, `priority` Clone typing: @@ -93,7 +95,7 @@ Refs: - `tests/test_report_suggestions.py::test_classify_clone_type_all_modes` - `tests/test_report_suggestions.py::test_generate_suggestions_covers_clone_metrics_and_dependency_categories` - `tests/test_report_suggestions.py::test_generate_suggestions_covers_skip_branches_for_optional_rules` -- `tests/test_html_report.py::test_html_report_suggestions_headers_include_help_tips` +- `tests/test_html_report.py::test_html_report_suggestions_cards_split_facts_assessment_and_action` ## Non-guarantees diff --git a/docs/book/20-mcp-interface.md b/docs/book/20-mcp-interface.md new file mode 100644 index 0000000..ac3cc56 --- /dev/null +++ b/docs/book/20-mcp-interface.md @@ -0,0 +1,296 @@ +# 20. MCP Interface + +## Purpose + +Define the current public MCP surface in the `2.0` beta line. + +This interface is **optional** and is installed via the `mcp` extra. It does +not replace the CLI or the canonical JSON report contract. Instead, it exposes +the existing deterministic analysis pipeline as a **read-only MCP server** for +AI agents and MCP-capable clients. +It is intentionally budget-aware and triage-first: the MCP surface is shaped as +guided control flow for agentic development, not as a flat dump of report data. 
+ +## Public surface + +- Package extra: `codeclone[mcp]` +- MCP launcher: `codeclone-mcp` +- MCP server: `codeclone/mcp_server.py` +- MCP service adapter: `codeclone/mcp_service.py` + +## Data model + +Current server characteristics: + +- optional dependency; base `codeclone` install does not require `mcp` +- transports: + - `stdio` + - `streamable-http` +- run storage: + - in-memory only + - bounded history (`--history-limit`, default `4`, maximum `10`) + - latest-run pointer for `codeclone://latest/...` resources + - the `latest` pointer moves whenever a newer `analyze_*` call registers a run +- run identity: + - canonical run identity is derived from the canonical report integrity digest + - MCP payloads expose a short `run_id` handle (first 8 hex chars) + - MCP tools/resources accept both short and full run ids + - MCP finding ids are compact by default and may lengthen when needed to + stay unique within a run +- analysis modes: + - `full` + - `clones_only` +- process-count policy: + - `processes` is an optional override + - when omitted, MCP defers to the core CodeClone runtime +- root contract: + - analysis tools require an absolute repository root + - relative roots such as `.` are rejected in MCP because server cwd may + differ from the client workspace + - granular `check_*` tools may omit `root` and use the latest compatible + stored run; if `root` is provided, it must also be absolute +- cache policies: + - `reuse` + - `off` + - `refresh` is rejected in MCP because the server is read-only.
+- summary payload: + - `run_id`, `version`, `schema`, `mode` + - `baseline`, `metrics_baseline`, `cache` + - `cache.freshness` classifies summary cache reuse as `fresh`, `mixed`, + or `reused` + - flattened `inventory` (`files`, `lines`, `functions`, `classes`) + - flattened `findings` (`total`, `new`, `known`, `by_family`, `production`) + - flattened `diff` (`new_clones`, `health_delta`) + - `warnings`, `failures` + - `analyze_changed_paths` is intentionally more compact than `get_run_summary`: + it returns `changed_files`, `health`, `health_delta`, `verdict`, + `new_findings`, `resolved_findings`, and an empty `changed_findings` + placeholder, while detailed changed payload stays in + `get_report_section(section="changed")` +- workflow guidance: + - the MCP surface is intentionally agent-guiding rather than list-first + - the cheapest useful path is designed to be the most obvious path: + `get_run_summary` / `get_production_triage` first, then `list_hotspots` + or `check_*`, then `get_finding` / `get_remediation` +- finding-list payloads: + - MCP finding ids are compact projection ids; canonical report ids are unchanged + - `detail_level="summary"` is the default for list/check/hotspot tools + - `detail_level="summary"` keeps compact relative `"path:line"` locations + - `detail_level="normal"` keeps structured `{path, line, end_line, symbol}` + locations plus remediation + - `detail_level="full"` keeps the compatibility-oriented payload, + including `priority_factors`, `items`, and per-location `uri` + +The MCP layer does not introduce a separate analysis engine. It calls the +current CodeClone pipeline and reuses the canonical report document already +produced by the report contract. 
+ +## Tools + +Current tool set: + +| Tool | Key parameters | Purpose / notes | +|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `analyze_repository` | absolute `root`, `analysis_mode`, `changed_paths`, `git_diff_ref`, inline thresholds, cache/baseline paths | Run deterministic CodeClone analysis, register the latest run, and return a compact MCP summary. The intended next step is `get_run_summary` or `get_production_triage`, not broad listing by default | +| `analyze_changed_paths` | absolute `root`, `changed_paths` or `git_diff_ref`, `analysis_mode`, inline thresholds | Diff-aware fast path: analyze a repo, attach a changed-files projection, and return a compact changed-files snapshot. The intended next step is `get_report_section(section="changed")` or `get_production_triage` | +| `get_run_summary` | `run_id` | Return the stored summary for the latest or specified run, with slim inventory counts instead of the full file registry; this is the cheapest run-level snapshot and `health` becomes explicit `available=false` when metrics were skipped | +| `get_production_triage` | `run_id`, `max_hotspots`, `max_suggestions` | Return a compact production-first MCP projection: health, cache `freshness`, production hotspots, production suggestions, and global source-kind counters. 
This is the default first-pass view for large or noisy repositories | +| `compare_runs` | `run_id_before`, `run_id_after`, `focus` | Compare two registered runs by finding ids and run-to-run health delta; MCP returns short run ids, compact regression/improvement cards, `mixed` for conflicting signals, and `incomparable` with top-level `reason`, empty comparison cards, and `health_delta=null` when roots/settings differ | +| `evaluate_gates` | `run_id`, gate thresholds/booleans | Evaluate CI/gating conditions against an existing run without exiting the process | +| `get_report_section` | `run_id`, `section`, `family`, `path`, `offset`, `limit` | Return a canonical report section. Prefer targeted sections instead of `section="all"` unless the client truly needs the full canonical report. `metrics` is summary-only; `metrics_detail` is paginated/bounded and falls back to summary+hint when unfiltered | +| `list_findings` | `family`, `category`, `severity`, `source_kind`, `novelty`, `sort_by`, `detail_level`, `changed_paths`, `git_diff_ref`, `exclude_reviewed`, pagination | Return deterministically ordered finding groups with filtering and pagination; compact summary detail is the default. Intended for broader filtered review after hotspots or `check_*`, not as the cheapest first-pass call | +| `get_finding` | `finding_id`, `run_id`, `detail_level` | Return one finding by id; defaults to `normal` detail and accepts MCP short ids. Use this after `list_hotspots`, `list_findings`, or `check_*` instead of raising detail on larger lists | +| `get_remediation` | `finding_id`, `run_id`, `detail_level` | Return just the remediation/explainability packet for one finding. 
Use this when the client needs the fix packet without pulling broader detail payloads | +| `list_hotspots` | `kind`, `run_id`, `detail_level`, `changed_paths`, `git_diff_ref`, `exclude_reviewed`, `limit`, `max_results` | Return one derived hotlist (`most_actionable`, `highest_spread`, `highest_priority`, `production_hotspots`, `test_fixture_hotspots`) with compact summary cards. This is the preferred first-pass triage surface before broader `list_findings` calls | +| `check_clones` | `run_id`, `root`, `path`, `clone_type`, `source_kind`, `max_results`, `detail_level` | Return clone findings from a compatible stored run; `health.dimensions` includes only `clones`. Prefer this narrower tool over `list_findings` when only clone debt is needed | +| `check_complexity` | `run_id`, `root`, `path`, `min_complexity`, `max_results`, `detail_level` | Return complexity hotspots from a compatible stored run; `health.dimensions` includes only `complexity`. Prefer this narrower tool over `list_findings` when only complexity is needed | +| `check_coupling` | `run_id`, `root`, `path`, `max_results`, `detail_level` | Return coupling hotspots from a compatible stored run; `health.dimensions` includes only `coupling`. Prefer this narrower tool over `list_findings` when only coupling is needed | +| `check_cohesion` | `run_id`, `root`, `path`, `max_results`, `detail_level` | Return cohesion hotspots from a compatible stored run; `health.dimensions` includes only `cohesion`. Prefer this narrower tool over `list_findings` when only cohesion is needed | +| `check_dead_code` | `run_id`, `root`, `path`, `min_severity`, `max_results`, `detail_level` | Return dead-code findings from a compatible stored run; `health.dimensions` includes only `dead_code`. Prefer this narrower tool over `list_findings` when only dead code is needed | +| `generate_pr_summary` | `run_id`, `changed_paths`, `git_diff_ref`, `format` | Build a PR-friendly changed-files summary in markdown or JSON. 
Prefer `markdown` for compact LLM-facing output and reserve `json` for machine post-processing | +| `mark_finding_reviewed` | `finding_id`, `run_id`, `note` | Mark a finding as reviewed in the in-memory MCP session | +| `list_reviewed_findings` | `run_id` | Return the current reviewed findings for the selected run | +| `clear_session_runs` | none | Clear all stored in-memory runs plus ephemeral review/gate/session caches for the current server process | + +All analysis/report tools are read-only with respect to repo state. The only +mutable MCP tools are `mark_finding_reviewed` and `clear_session_runs`, and +their effects are session-local and in-memory only. `analyze_repository`, +`analyze_changed_paths`, and `evaluate_gates` are +sessionful and may populate or reuse in-memory run state. The granular +`check_*` tools are read-only over stored runs: use `analyze_repository` or +`analyze_changed_paths` first, then query the latest run or pass a specific +`run_id`. + +Budget-aware workflow is intentional: + +- first pass: `get_run_summary` or `get_production_triage` +- targeted triage: `list_hotspots` or the relevant `check_*` +- single-finding drill-down: `get_finding`, then `get_remediation` +- bounded metrics drill-down: `get_report_section(section="metrics_detail", family=..., limit=...)` +- PR output: `generate_pr_summary(format="markdown")` unless machine JSON is explicitly needed + +## Resources + +Current fixed resources: + +| Resource | Payload | Availability | +|----------------------------------|-------------------------------------------------------|-------------------------------------------------------| +| `codeclone://latest/summary` | latest run summary projection | always after at least one run | +| `codeclone://latest/triage` | latest production-first triage projection | always after at least one run | +| `codeclone://latest/report.json` | latest canonical report document | always after at least one run | +| `codeclone://latest/health` | latest 
health score + dimensions | always after at least one run | +| `codeclone://latest/gates` | latest gate evaluation result | only after `evaluate_gates` in current server process | +| `codeclone://latest/changed` | latest changed-files projection | only for a diff-aware latest run | +| `codeclone://schema` | schema-style descriptor for canonical report sections | always available | + +Current run-scoped URI templates: + +| URI template | Payload | Availability | +|---------------------------------------------------|--------------------------------------|-----------------------------------------| +| `codeclone://runs/{run_id}/summary` | run-specific summary projection | for any stored run | +| `codeclone://runs/{run_id}/report.json` | run-specific canonical report | for any stored run | +| `codeclone://runs/{run_id}/findings/{finding_id}` | run-specific canonical finding group | for an existing finding in a stored run | + +Fixed resources and URI templates are convenience views over already +registered runs. They do not trigger fresh analysis by themselves. +If a client needs the freshest truth, it must start a fresh analysis run first +(typically with `cache_policy="off"`), rather than relying on older session +state behind `codeclone://latest/...`. + +## Contracts + +- MCP is **read-only**: + - no source-file mutation + - no baseline update + - no metrics-baseline update + - no cache refresh writes +- Session review markers are **ephemeral only**: + - stored in memory per server process + - never written to baseline, cache, or report artifacts +- `streamable-http` defaults to loopback binding. + Non-loopback hosts require explicit `--allow-remote` because the server has + no built-in authentication. 
+- MCP must reuse current: + - pipeline stages + - baseline trust semantics + - cache semantics + - canonical report contract +- Inline MCP design-threshold parameters (`complexity_threshold`, + `coupling_threshold`, `cohesion_threshold`) define the canonical design + finding universe of that run and are recorded in + `meta.analysis_thresholds.design_findings`. +- `get_run_summary` is a deterministic convenience projection derived from the + canonical report (`meta`, `inventory`, `findings.summary`, + `metrics.summary.health`) plus baseline-diff/gate/changed-files context. +- `get_production_triage` is also a deterministic MCP projection over the same + canonical run state (`summary`, `derived.hotlists`, `derived.suggestions`, + and canonical finding source scope). It must not create a second analysis or + remediation truth path. +- Canonical JSON remains the source of truth for report semantics. +- `list_findings` and `list_hotspots` are deterministic projections over the + canonical report, not a separate analysis branch. +- `get_remediation` is a deterministic MCP projection over existing + suggestions/explainability data, not a second remediation engine. +- `analysis_mode="clones_only"` must mirror the same metric/dependency + skip-semantics as the regular pipeline. +- Missing optional MCP dependency is handled explicitly by the launcher with a + user-facing install hint and exit code `2`. + +## Invariants (MUST) + +- Tool names are stable public surface. +- Resource URI shapes are stable public surface. +- Read-only vs session-local tool annotations remain accurate. +- `analyze_repository` always registers exactly one latest run. +- `analyze_changed_paths` requires `changed_paths` or `git_diff_ref`. +- `analyze_repository` and `analyze_changed_paths` require an absolute `root`; + relative roots like `.` are rejected. +- `changed_paths` is a structured `list[str]` of repo-relative paths, not a + comma-separated string payload. 
+- `analyze_changed_paths` may return the same `run_id` as a previous run when + the canonical report digest is unchanged; changed-files state is an overlay, + not a second canonical report. +- `get_run_summary` with no `run_id` resolves to the latest stored run. +- `codeclone://latest/...` resources always resolve to the latest stored run in + the current MCP server process, not to a globally fresh analysis state. +- Summary-style MCP payloads expose `cache.freshness` as a derived convenience + marker; canonical cache metadata remains available only through canonical + report/meta surfaces. +- `get_report_section(section="all")` returns the full canonical report document. +- `get_report_section(section="metrics")` returns only `metrics.summary`. +- `get_report_section(section="metrics_detail")` is intentionally bounded: + without filters it returns `summary` plus a hint; with `family` and/or `path` + it returns a paginated item slice. +- `get_report_section(section="changed")` is available only for diff-aware runs. +- MCP short `run_id` values are session handles over the canonical digest of + that run. +- MCP summary/normal finding/location payloads use relative paths only and do + not expose absolute `file://` URIs. +- Finding `locations` and `html_anchor` values are stable projections over the + current run and do not invent non-canonical ids. +- For the same finding id, `source_kind` remains consistent across + `list_findings`, `list_hotspots`, and `get_finding`. +- `get_finding(detail_level="full")` remains the compatibility-preserving + full-detail endpoint: `priority_factors` and location `uri` are still + available there. +- `compare_runs` is only semantically meaningful when both runs use comparable + repository scope/root and analysis settings. +- `compare_runs` exposes top-level `comparable` plus optional `reason`. 
When + roots or effective analysis settings differ, `regressions` and + `improvements` become empty lists, `unchanged` and `health_delta` become + `null`, and `verdict` becomes `incomparable`. +- `compare_runs.health_delta` is `after.health - before.health` between the two + selected comparable runs. It is independent of baseline or metrics-baseline + drift. +- `compare_runs.verdict` is intentionally conservative but not one-dimensional: + it returns `mixed` when run-to-run finding deltas and `health_delta` disagree. +- `analysis_mode="clones_only"` keeps clone findings fully usable, but MCP + surfaces mark `health` as unavailable instead of fabricating zeroed metrics. +- `codeclone://latest/triage` is a latest-only resource; run-specific triage is + available via the tool, not via a `codeclone://runs/{run_id}/...` resource URI. + +## Failure modes + +| Condition | Behavior | +|--------------------------------------------|---------------------------------------------------| +| `mcp` extra not installed | `codeclone-mcp` prints install hint and exits `2` | +| Invalid root path / invalid numeric config | service raises contract error | +| Requested run missing | service raises run-not-found error | +| Requested finding missing | service raises finding-not-found error | +| Unsupported report section/resource suffix | service raises contract error | + +## Determinism / canonicalization + +- MCP run identity is derived from canonical report integrity digest. +- Finding order is inherited from canonical report ordering. +- Hotlists are derived from canonical report data and deterministic derived ids. +- No MCP-only heuristics may change analysis or gating semantics. +- MCP must not re-synthesize design findings from raw metrics after the run; + threshold-aware design findings belong to the canonical report document. 
+ +## Locked by tests + +- `tests/test_mcp_service.py::test_mcp_service_analyze_repository_registers_latest_run` +- `tests/test_mcp_service.py::test_mcp_service_lists_findings_and_hotspots` +- `tests/test_mcp_service.py::test_mcp_service_changed_runs_remediation_and_review_flow` +- `tests/test_mcp_service.py::test_mcp_service_granular_checks_pr_summary_and_resources` +- `tests/test_mcp_service.py::test_mcp_service_evaluate_gates_on_existing_run` +- `tests/test_mcp_service.py::test_mcp_service_resources_expose_latest_summary_and_report` +- `tests/test_mcp_server.py::test_mcp_server_exposes_expected_read_only_tools` +- `tests/test_mcp_server.py::test_mcp_server_tool_roundtrip_and_resources` +- `tests/test_mcp_server.py::test_mcp_server_main_reports_missing_optional_dependency` + +## Non-guarantees + +- There is currently no standalone `mcp_api_version` constant. +- In-memory run history does not survive process restart. +- `clear_session_runs` resets the in-memory run registry and related session + caches, but does not mutate baseline/cache/report artifacts on disk. +- Client-specific UI/approval behavior is not part of the CodeClone contract. 
+ +## See also + +- [09-cli.md](09-cli.md) +- [08-report.md](08-report.md) +- [14-compatibility-and-versioning.md](14-compatibility-and-versioning.md) +- [../mcp.md](../mcp.md) diff --git a/docs/book/README.md b/docs/book/README.md index d2024cd..e995d74 100644 --- a/docs/book/README.md +++ b/docs/book/README.md @@ -29,6 +29,7 @@ If a statement is not enforced by code/tests, it is explicitly marked as non-con ### Interfaces - [09-cli.md](09-cli.md) +- [20-mcp-interface.md](20-mcp-interface.md) - [10-html-render.md](10-html-render.md) ### System properties diff --git a/docs/book/appendix/b-schema-layouts.md b/docs/book/appendix/b-schema-layouts.md index fcb2388..bf2734d 100644 --- a/docs/book/appendix/b-schema-layouts.md +++ b/docs/book/appendix/b-schema-layouts.md @@ -2,14 +2,14 @@ ## Purpose -Compact structural layouts for baseline/cache/report contracts in `2.0.0b2`. +Compact structural layouts for baseline/cache/report contracts in `2.0.0b3`. ## Baseline schema (`2.0`) ```json { "meta": { - "generator": { "name": "codeclone", "version": "2.0.0b2" }, + "generator": { "name": "codeclone", "version": "2.0.0b3" }, "schema_version": "2.0", "fingerprint_version": "1", "python_tag": "cp313", @@ -25,11 +25,11 @@ Compact structural layouts for baseline/cache/report contracts in `2.0.0b2`. } ``` -## Cache schema (`2.2`) +## Cache schema (`2.3`) ```json { - "v": "2.2", + "v": "2.3", "payload": { "py": "cp313", "fp": "1", @@ -77,17 +77,24 @@ Notes: - `u` row decoder accepts both legacy 11-column rows and canonical 17-column rows (legacy rows map new structural fields to neutral defaults). 
-## Report schema (`2.1`) +## Report schema (`2.2`) ```json { - "report_schema_version": "2.1", + "report_schema_version": "2.2", "meta": { - "codeclone_version": "2.0.0b2", + "codeclone_version": "2.0.0b3", "project_name": "codeclone", "scan_root": ".", "analysis_mode": "full", "report_mode": "full", + "analysis_thresholds": { + "design_findings": { + "complexity": { "metric": "cyclomatic_complexity", "operator": ">", "value": 20 }, + "coupling": { "metric": "cbo", "operator": ">", "value": 10 }, + "cohesion": { "metric": "lcom4", "operator": ">=", "value": 4 } + } + }, "baseline": { "...": "..." }, @@ -98,6 +105,7 @@ Notes: "...": "..." }, "runtime": { + "analysis_started_at_utc": "2026-03-11T08:36:29Z", "report_generated_at_utc": "2026-03-11T08:36:32Z" } }, @@ -198,6 +206,9 @@ Notes: "health_snapshot": { "score": 100, "grade": "A" + }, + "directory_hotspots": { + "...": "..." } }, "hotlists": { @@ -233,7 +244,7 @@ Notes: ```text # CodeClone Report - Markdown schema: 1.0 -- Source report schema: 2.1 +- Source report schema: 2.2 ... 
## Overview ## Inventory @@ -264,11 +275,11 @@ Notes: "tool": { "driver": { "name": "codeclone", - "version": "2.0.0b2", + "version": "2.0.0b3", "rules": [ { "id": "CCLONE001", - "name": "codeclone.function-clone-group", + "name": "codeclone.CCLONE001", "shortDescription": { "text": "Function clone group" }, @@ -297,6 +308,9 @@ Notes: ] } }, + "automationDetails": { + "id": "codeclone/full/2026-03-11T08:36:32Z" + }, "artifacts": [ { "location": { @@ -308,18 +322,19 @@ Notes: "invocations": [ { "executionSuccessful": true, + "startTimeUtc": "2026-03-11T08:36:29Z", "workingDirectory": { "uri": "file:///repo/project/" } } ], - "columnKind": "utf16CodeUnits", "properties": { "profileVersion": "1.0", - "reportSchemaVersion": "2.1" + "reportSchemaVersion": "2.2" }, "results": [ { + "kind": "fail", "ruleId": "CCLONE001", "ruleIndex": 0, "baselineState": "new", @@ -349,6 +364,11 @@ Notes: } } ], + "properties": { + "primaryPath": "codeclone/report/sarif.py", + "primaryQualname": "codeclone.report.sarif:render_sarif_report_document", + "primaryRegion": "1:10" + }, "relatedLocations": [], "partialFingerprints": { "primaryLocationLineHash": "0123456789abcdef:1" diff --git a/docs/book/appendix/c-error-catalog.md b/docs/book/appendix/c-error-catalog.md index 24115c7..f9545dd 100644 --- a/docs/book/appendix/c-error-catalog.md +++ b/docs/book/appendix/c-error-catalog.md @@ -71,9 +71,24 @@ Refs: - `codeclone/cli.py:_main_impl` +## MCP interface errors + +| Condition | Behavior | +|-----------|----------| +| Optional `mcp` extra missing | `codeclone-mcp` prints install hint and exits `2` | +| Invalid root path / invalid numeric config | MCP service contract error | +| Missing run or finding id | MCP service request error | +| Unsupported MCP resource URI / report section | MCP service contract error | + +Refs: + +- `codeclone/mcp_server.py:main` +- `codeclone/mcp_service.py` + ## Locked by tests - `tests/test_cli_inprocess.py::test_cli_report_write_error_is_contract_error` - 
`tests/test_cli_inprocess.py::test_cli_update_baseline_write_error_is_contract_error` - `tests/test_cli_inprocess.py::test_cli_unreadable_source_fails_in_ci_with_contract_error` - `tests/test_cli_unit.py::test_cli_internal_error_marker` +- `tests/test_mcp_server.py::test_mcp_server_main_reports_missing_optional_dependency` diff --git a/docs/mcp.md b/docs/mcp.md new file mode 100644 index 0000000..1278758 --- /dev/null +++ b/docs/mcp.md @@ -0,0 +1,319 @@ +# MCP Usage Guide + +CodeClone MCP is a **read-only, baseline-aware** analysis server for AI agents +and MCP-capable clients. It exposes the existing deterministic pipeline without +mutating source files, baselines, cache, or on-disk report artifacts. Only +session-local review/run state is mutable in memory. +It is not only bounded in payload shape — it actively guides agents toward +low-cost, high-signal workflows. + +MCP is a **client integration surface**, not a model-specific feature. It works +with any MCP-capable client regardless of the backend model. +In practice, the cheapest useful path is also the most obvious one: summary or +triage first, then hotspots or focused checks, then single-finding drill-down. + +## Install + +```bash +pip install "codeclone[mcp]" # add MCP extra +# or +uv tool install "codeclone[mcp]" # install as a standalone tool +``` + +## Start the server + +**Local agents** (Claude Code, Codex, Copilot Chat, Gemini CLI): + +```bash +codeclone-mcp --transport stdio +``` + +MCP analysis tools require an absolute repository root. Relative roots such as +`.` are rejected, because the server process working directory may differ from +the client workspace. The same absolute-path rule applies to `check_*` tools +when a `root` filter is provided. + +**Remote / HTTP-only clients:** + +```bash +codeclone-mcp --transport streamable-http --host 127.0.0.1 --port 8000 +``` + +Non-loopback hosts require `--allow-remote` (no built-in auth). 
+Run retention is bounded: default `4`, max `10` (`--history-limit`). +If a tool request omits `processes`, MCP defers process-count policy to the +core CodeClone runtime. + +## Tool surface + +| Tool | Purpose | +|--------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `analyze_repository` | Full analysis → register as latest run and return a compact MCP summary; then prefer `get_run_summary` or `get_production_triage` for the first pass | +| `analyze_changed_paths` | Diff-aware analysis with `changed_paths` or `git_diff_ref`; returns a compact changed-files snapshot; then prefer `get_report_section(section="changed")` or `get_production_triage` before broader list calls | +| `get_run_summary` | Cheapest run-level snapshot: compact health/findings/baseline summary with slim inventory counts; `health` is explicit `available=false` when metrics were skipped | +| `get_production_triage` | Compact production-first view: health, cache freshness, production hotspots, production suggestions; best default first pass on noisy repos | +| `compare_runs` | Regressions, improvements, and run-to-run health delta between comparable runs; returns `mixed` for conflicting signals and `incomparable` when roots/settings differ, with empty comparison cards and `health_delta=null` in that case | +| `list_findings` | Filtered, paginated finding groups with compact summary payloads by default; use after hotspots or `check_*` when you need a broader filtered list | +| `get_finding` | Deep inspection of one finding by id; defaults to normal detail and accepts `detail_level`; use after `list_hotspots`, `list_findings`, or `check_*` | +| `get_remediation` | Structured remediation payload for one finding; defaults to normal detail; use when you only need the fix packet for a single 
finding | +| `list_hotspots` | Derived views: highest priority, production hotspots, spread, etc., with compact summary cards; preferred first-pass triage before broader listing | +| `get_report_section` | Read canonical report sections; prefer specific sections over `section="all"`; `metrics` is summary-only, `metrics_detail` is paginated/bounded | +| `evaluate_gates` | Preview CI/gating decisions without exiting | +| `check_clones` | Clone findings from a stored run; cheaper and narrower than `list_findings` when you only need clone debt | +| `check_complexity` | Complexity hotspots from a stored run; cheaper and narrower than `list_findings` when you only need complexity | +| `check_coupling` | Coupling hotspots from a stored run; cheaper and narrower than `list_findings` when you only need coupling | +| `check_cohesion` | Cohesion hotspots from a stored run; cheaper and narrower than `list_findings` when you only need cohesion | +| `check_dead_code` | Dead-code findings from a stored run; cheaper and narrower than `list_findings` when you only need dead code | +| `generate_pr_summary` | PR-friendly markdown or JSON summary; prefer `markdown` for compact LLM-facing output and `json` for machine post-processing | +| `mark_finding_reviewed` | Session-local review marker (in-memory only) | +| `list_reviewed_findings` | List reviewed findings for a run | +| `clear_session_runs` | Reset all in-memory runs and session caches | + +> `check_*` tools query stored runs only. Call `analyze_repository` or +> `analyze_changed_paths` first. + +`check_*` responses keep `health.score` and `health.grade`, but slim +`health.dimensions` down to the one dimension relevant to that tool. +List-style finding responses now use short MCP finding ids and compact relative +locations by default; `normal` keeps structured `{path, line, end_line, symbol}` +locations, while `full` keeps the richer compatibility payload including `uri`. 
+Summary-style MCP cache payloads expose `freshness` (`fresh`, `mixed`, `reused`). +Inline design-threshold parameters on `analyze_repository` / +`analyze_changed_paths` become part of the canonical run: they are recorded in +`meta.analysis_thresholds.design_findings` and define that run's canonical +design findings. + +Run ids in MCP payloads are short session handles (first 8 hex chars of the +canonical digest). MCP tools and run-scoped resources accept both short and full +run ids. Finding ids follow the same rule: MCP responses use compact ids, while +the canonical `report.json` keeps full finding ids unchanged. When a short +finding id would collide within a run, MCP lengthens it just enough to keep it +unique. + +## Resource surface + +Fixed resources: + +| Resource | Content | +|----------------------------------|--------------------------------------------| +| `codeclone://latest/summary` | Latest run summary | +| `codeclone://latest/triage` | Latest production-first triage | +| `codeclone://latest/report.json` | Full canonical report | +| `codeclone://latest/health` | Health score and dimensions | +| `codeclone://latest/gates` | Last gate evaluation result | +| `codeclone://latest/changed` | Changed-files projection (diff-aware runs) | +| `codeclone://schema` | Canonical report shape descriptor | + +Run-scoped resource templates: + +| URI template | Content | +|---------------------------------------------------|---------------------------------| +| `codeclone://runs/{run_id}/summary` | Summary for a specific run | +| `codeclone://runs/{run_id}/report.json` | Report for a specific run | +| `codeclone://runs/{run_id}/findings/{finding_id}` | One finding from a specific run | + +Resources and URI templates are read-only views over stored runs; they do not +trigger analysis. + +`codeclone://latest/*` always resolves to the most recent run registered in the +current MCP server session. 
A later `analyze_repository` or +`analyze_changed_paths` call moves that pointer. +`mark_finding_reviewed` and `clear_session_runs` mutate only in-memory session +state. They never touch source files, baselines, cache, or report artifacts. + +## Recommended workflows + +### Budget-aware first pass + +``` +analyze_repository → get_run_summary or get_production_triage +→ list_hotspots or check_* → get_finding → get_remediation +``` + +### Full repository review + +``` +analyze_repository → get_production_triage +→ list_hotspots(kind="highest_priority") → get_finding → evaluate_gates +``` + +### Changed-files review (PR / patch) + +``` +analyze_changed_paths → get_report_section(section="changed") +→ list_findings(changed_paths=..., sort_by="priority") → get_remediation → generate_pr_summary +``` + +### Session-based review loop + +``` +list_findings → get_finding → mark_finding_reviewed +→ list_findings(exclude_reviewed=true) → … → clear_session_runs +``` + +## Prompt patterns + +Good prompts include **scope**, **goal**, and **constraint**: + +### Health check + +```text +Use codeclone MCP to analyze this repository. Give me a concise structural health summary +and explain which findings are worth looking at first. +``` + +### Clone triage (production only) + +```text +Analyze through codeclone MCP, filter to clone findings in production code only, +and show me the top 3 clone groups worth fixing first. +``` + +### Changed-files review + +```text +Use codeclone MCP in changed-files mode for my latest edits. +Focus only on findings that touch changed files and rank them by priority. +``` + +### Dead-code review + +```text +Use codeclone MCP to review dead-code findings. Separate actionable items from +likely framework false positives. Do not add suppressions automatically. +``` + +### Gate preview + +```text +Run codeclone through MCP and preview gating with fail_on_new plus a zero clone threshold. +Explain the exact reasons. Do not change any files. 
+``` + +### AI-generated code check + +```text +I added code with an AI agent. Use codeclone MCP to check for new structural drift: +clone groups, dead code, duplicated branches, design hotspots. +Separate accepted baseline debt from new regressions. +``` + +### Safe refactor planning + +```text +Use codeclone MCP to pick one production finding that looks safe to refactor. +Explain why it is a good candidate and outline a minimal plan. +``` + +### Run comparison + +```text +Compare the latest CodeClone MCP run against the previous one. +Show regressions, resolved findings, and health delta. +``` + +**Tips:** + +- Use `analyze_changed_paths` for PRs, not full analysis. +- Prefer `get_run_summary` or `get_production_triage` for the first pass on a + new run. +- Prefer `list_hotspots` or the narrow `check_*` tools before broad + `list_findings` calls. +- Use `get_finding` / `get_remediation` for one finding instead of raising + `detail_level` on larger lists. +- Set `cache_policy="off"` when you need the freshest truth from a new analysis + run, not whatever older session state currently sits behind `latest/*`. +- Pass an absolute `root` to `analyze_repository` / `analyze_changed_paths`. + MCP intentionally rejects relative roots like `.` to avoid analyzing the + wrong directory when server cwd and client workspace differ. +- Prefer `generate_pr_summary(format="markdown")` for agent-facing output; use + `json` only when another machine step needs it. +- Avoid `get_report_section(section="all")` unless you truly need the full + canonical report document. +- Use `get_report_section(section="metrics_detail", family=..., limit=...)` for + metrics drill-down; the unfiltered call is intentionally bounded. +- Use `"production-only"` / `source_kind` filters to cut test/fixture noise. +- Use `mark_finding_reviewed` + `exclude_reviewed=true` in long sessions. +- Ask the agent to separate baseline debt from new regressions. 
+ +## Client configuration + +All clients use the same CodeClone server — only the registration differs. + +### Claude Code / Anthropic + +```json +{ + "mcpServers": { + "codeclone": { + "command": "codeclone-mcp", + "args": [ + "--transport", + "stdio" + ] + } + } +} +``` + +### Codex / OpenAI (command-based) + +```toml +[mcp_servers.codeclone] +enabled = true +command = "codeclone-mcp" +args = ["--transport", "stdio"] +``` + +For the Responses API or remote-only OpenAI clients, use `streamable-http`. + +### GitHub Copilot Chat + +```json +{ + "mcpServers": { + "codeclone": { + "command": "codeclone-mcp", + "args": [ + "--transport", + "stdio" + ] + } + } +} +``` + +### Gemini CLI + +Same `stdio` registration. If the client only accepts remote URLs, use +`streamable-http` and point to the `/mcp` endpoint. + +### Other clients + +- `stdio` for local analysis +- `streamable-http` for remote/HTTP-only clients + +If `codeclone-mcp` is not on `PATH`, use an absolute path to the launcher. + +## Security + +- Read-only by design: no source mutation, no baseline/cache writes. +- Run history and review markers are in-memory only — lost on process stop. +- Repository access is limited to what the server process can read locally. +- `streamable-http` binds to loopback by default; `--allow-remote` is explicit opt-in. 
+ +## Troubleshooting + +| Problem | Fix | +|-----------------------------------------------------------|--------------------------------------------------------------------------------| +| `CodeClone MCP support requires the optional 'mcp' extra` | `pip install "codeclone[mcp]"` | +| Client cannot find `codeclone-mcp` | `uv tool install "codeclone[mcp]"` or use absolute path | +| Client only accepts remote MCP | Use `streamable-http` transport | +| Agent reads stale results | Call `analyze_repository` again; `latest` always points to the most recent run | +| `changed_paths` rejected | Pass a `list[str]` of repo-relative paths, not a comma-separated string | + +## See also + +- [book/20-mcp-interface.md](book/20-mcp-interface.md) — formal interface contract +- [book/08-report.md](book/08-report.md) — canonical report contract +- [book/09-cli.md](book/09-cli.md) — CLI reference diff --git a/docs/sarif.md b/docs/sarif.md index e62d4b8..3f3b7b1 100644 --- a/docs/sarif.md +++ b/docs/sarif.md @@ -38,7 +38,10 @@ Current behavior: - `artifactLocation.uri` uses repository-relative paths - `artifactLocation.index` aligns locations with artifacts for stable linking - `run.invocations[*].workingDirectory` mirrors the scan root URI when available -- `run.columnKind` is fixed to `utf16CodeUnits` +- `run.invocations[*].startTimeUtc` is emitted when analysis start time is + available in canonical runtime meta +- `run.automationDetails.id` is unique per run so code-scanning systems can + correlate uploads reliably This helps consumers resolve results back to workspace files consistently. 
@@ -53,6 +56,10 @@ Current SARIF output includes: human-readable role labels such as `Representative occurrence` - `relatedLocations[*]` when the result has multiple relevant locations - `partialFingerprints.primaryLocationLineHash` for stable per-location identity + without encoding line numbers into the hash digest +- result `properties` with stable identity/context fields such as primary path, + qualname, and region +- explicit `kind: "fail"` on results For clone results, CodeClone also carries novelty-aware metadata when known: @@ -68,6 +75,7 @@ Rule records are intentionally richer than a minimal SARIF export. They include: - stable rule IDs +- stable rule names derived from `ruleId` - display name - help text / markdown - tags diff --git a/mkdocs.yml b/mkdocs.yml index fae6e1d..cd68046 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -70,6 +70,7 @@ nav: - Report: book/08-report.md - Interfaces: - CLI: book/09-cli.md + - MCP Interface: book/20-mcp-interface.md - HTML Render: book/10-html-render.md - System Properties: - Security Model: book/11-security-model.md @@ -89,6 +90,7 @@ nav: - Deep Dives: - Architecture Narrative: architecture.md - CFG Semantics: cfg.md + - MCP for AI Agents: mcp.md - SARIF for IDEs: sarif.md - Publishing and Docs Site: publishing.md - Examples: diff --git a/pyproject.toml b/pyproject.toml index 28e465b..96fe2bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta" [project] name = "codeclone" -version = "2.0.0b2" +version = "2.0.0b3" description = "Structural code quality analysis for Python" readme = { file = "README.md", content-type = "text/markdown" } -license = "MIT" -license-files = ["LICENSE"] +license = "MPL-2.0 AND MIT" +license-files = ["LICENSE", "LICENSE-docs"] authors = [ { name = "Den Rozhnovskiy", email = "pytelemonbot@mail.ru" } @@ -20,7 +20,7 @@ maintainers = [ requires-python = ">=3.10" dependencies = [ - "pygments>=2.19.2", + "pygments>=2.20.0", "rich>=14.3.2", 
"tomli>=2.0.1; python_version < '3.11'", ] @@ -61,6 +61,9 @@ Changelog = "https://github.com/orenlab/codeclone/releases" Documentation = "https://orenlab.github.io/codeclone/" [project.optional-dependencies] +mcp = [ + "mcp>=1.26.0,<2", +] dev = [ "pytest>=9.0.0", "pytest-cov>=7.1.0", @@ -73,6 +76,7 @@ dev = [ [project.scripts] codeclone = "codeclone.cli:main" +codeclone-mcp = "codeclone.mcp_server:main" [tool.setuptools] packages = [ diff --git a/scripts/build_docs_example_report.py b/scripts/build_docs_example_report.py index 5254c59..b003fd2 100644 --- a/scripts/build_docs_example_report.py +++ b/scripts/build_docs_example_report.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 -# SPDX-License-Identifier: MIT +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy from __future__ import annotations diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..9135843 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,5 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/tests/_assertions.py b/tests/_assertions.py index 619e882..1f4dd4a 100644 --- a/tests/_assertions.py +++ b/tests/_assertions.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations from collections.abc import Mapping diff --git a/tests/_ast_helpers.py b/tests/_ast_helpers.py index ce123be..574d026 100644 --- a/tests/_ast_helpers.py +++ b/tests/_ast_helpers.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations import ast diff --git a/tests/_report_access.py b/tests/_report_access.py index 9eeb760..91fb068 100644 --- a/tests/_report_access.py +++ b/tests/_report_access.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations from collections.abc import Mapping diff --git a/tests/_report_fixtures.py b/tests/_report_fixtures.py index 73e68e4..6bbe126 100644 --- a/tests/_report_fixtures.py +++ b/tests/_report_fixtures.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations from pathlib import Path diff --git a/tests/conftest.py b/tests/conftest.py index 7647800..a497dcf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations from collections.abc import Callable diff --git a/tests/fixtures/golden_project/LICENSE b/tests/fixtures/golden_project/LICENSE new file mode 100644 index 0000000..d2b21b1 --- /dev/null +++ b/tests/fixtures/golden_project/LICENSE @@ -0,0 +1,9 @@ +Fixture source files in this directory are covered by the Mozilla Public +License, v. 2.0. + +Per Mozilla MPL header guidance, the notice is provided in this directory +instead of modifying the fixture files themselves. + +This Source Code Form is subject to the terms of the Mozilla Public +License, v. 2.0. If a copy of the MPL was not distributed with this +directory, You can obtain one at https://mozilla.org/MPL/2.0/. diff --git a/tests/fixtures/golden_v2/clone_metrics_cycle/pkg/LICENSE b/tests/fixtures/golden_v2/clone_metrics_cycle/pkg/LICENSE new file mode 100644 index 0000000..d2b21b1 --- /dev/null +++ b/tests/fixtures/golden_v2/clone_metrics_cycle/pkg/LICENSE @@ -0,0 +1,9 @@ +Fixture source files in this directory are covered by the Mozilla Public +License, v. 2.0. + +Per Mozilla MPL header guidance, the notice is provided in this directory +instead of modifying the fixture files themselves. + +This Source Code Form is subject to the terms of the Mozilla Public +License, v. 2.0. If a copy of the MPL was not distributed with this +directory, You can obtain one at https://mozilla.org/MPL/2.0/. 
diff --git a/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json b/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json index f202dbe..dc98485 100644 --- a/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json +++ b/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json @@ -2,7 +2,7 @@ "meta": { "python_tag": "cp313" }, - "report_schema_version": "2.1", + "report_schema_version": "2.2", "project_name": "pyproject_defaults", "scan_root": ".", "baseline_status": "missing", diff --git a/tests/fixtures/golden_v2/pyproject_defaults/pkg/LICENSE b/tests/fixtures/golden_v2/pyproject_defaults/pkg/LICENSE new file mode 100644 index 0000000..d2b21b1 --- /dev/null +++ b/tests/fixtures/golden_v2/pyproject_defaults/pkg/LICENSE @@ -0,0 +1,9 @@ +Fixture source files in this directory are covered by the Mozilla Public +License, v. 2.0. + +Per Mozilla MPL header guidance, the notice is provided in this directory +instead of modifying the fixture files themselves. + +This Source Code Form is subject to the terms of the Mozilla Public +License, v. 2.0. If a copy of the MPL was not distributed with this +directory, You can obtain one at https://mozilla.org/MPL/2.0/. diff --git a/tests/fixtures/golden_v2/test_only_usage/pkg/LICENSE b/tests/fixtures/golden_v2/test_only_usage/pkg/LICENSE new file mode 100644 index 0000000..d2b21b1 --- /dev/null +++ b/tests/fixtures/golden_v2/test_only_usage/pkg/LICENSE @@ -0,0 +1,9 @@ +Fixture source files in this directory are covered by the Mozilla Public +License, v. 2.0. + +Per Mozilla MPL header guidance, the notice is provided in this directory +instead of modifying the fixture files themselves. + +This Source Code Form is subject to the terms of the Mozilla Public +License, v. 2.0. If a copy of the MPL was not distributed with this +directory, You can obtain one at https://mozilla.org/MPL/2.0/. 
diff --git a/tests/fixtures/golden_v2/test_only_usage/pkg/tests/LICENSE b/tests/fixtures/golden_v2/test_only_usage/pkg/tests/LICENSE new file mode 100644 index 0000000..d2b21b1 --- /dev/null +++ b/tests/fixtures/golden_v2/test_only_usage/pkg/tests/LICENSE @@ -0,0 +1,9 @@ +Fixture source files in this directory are covered by the Mozilla Public +License, v. 2.0. + +Per Mozilla MPL header guidance, the notice is provided in this directory +instead of modifying the fixture files themselves. + +This Source Code Form is subject to the terms of the Mozilla Public +License, v. 2.0. If a copy of the MPL was not distributed with this +directory, You can obtain one at https://mozilla.org/MPL/2.0/. diff --git a/tests/test_architecture.py b/tests/test_architecture.py index 34101e9..6454b52 100644 --- a/tests/test_architecture.py +++ b/tests/test_architecture.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations import ast diff --git a/tests/test_baseline.py b/tests/test_baseline.py index 127af92..302040e 100644 --- a/tests/test_baseline.py +++ b/tests/test_baseline.py @@ -1,6 +1,13 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import json from collections.abc import Callable from pathlib import Path +from typing import Any, cast import pytest @@ -219,6 +226,16 @@ def test_baseline_load_legacy_payload(tmp_path: Path) -> None: assert exc.value.status == "missing_fields" +def test_baseline_load_rejects_non_object_preloaded_payload(tmp_path: Path) -> None: + baseline_path = tmp_path / "baseline.json" + _write_payload(baseline_path, _trusted_payload()) + baseline = Baseline(baseline_path) + + with pytest.raises(BaselineValidationError, match="must be an object") as exc: + baseline.load(preloaded_payload=cast(Any, [])) + assert exc.value.status == "invalid_type" + + def test_baseline_load_missing_top_level_key(tmp_path: Path) -> None: baseline_path = tmp_path / "baseline.json" _write_payload(baseline_path, {"meta": {}}) @@ -778,6 +795,25 @@ def _boom_stat(self: Path) -> object: assert exc.value.status == "invalid_type" +def test_baseline_atomic_write_json_cleans_up_temp_file_on_replace_failure( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + path = tmp_path / "baseline.json" + temp_holder: dict[str, Path] = {} + + def _boom_replace(src: str | Path, dst: str | Path) -> None: + temp_holder["path"] = Path(src) + raise OSError("replace failed") + + monkeypatch.setattr("codeclone.baseline.os.replace", _boom_replace) + + with pytest.raises(OSError, match="replace failed"): + baseline_mod._atomic_write_json(path, _trusted_payload()) + + assert temp_holder["path"].exists() is False + + def test_baseline_load_json_read_error( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -812,6 +848,22 @@ def test_baseline_optional_str_paths(tmp_path: Path) -> None: assert exc.value.status == "invalid_type" +def test_baseline_require_utc_iso8601_z_rejects_invalid_calendar_date( + tmp_path: Path, +) -> None: + path = tmp_path / "baseline.json" + with pytest.raises( + BaselineValidationError, + 
match="'created_at' must be UTC ISO-8601 with Z", + ) as exc: + baseline_mod._require_utc_iso8601_z( + {"created_at": "2026-02-31T00:00:00Z"}, + "created_at", + path=path, + ) + assert exc.value.status == "invalid_type" + + def test_baseline_load_legacy_codeclone_version_alias(tmp_path: Path) -> None: baseline_path = tmp_path / "baseline.json" payload = _trusted_payload(generator_version="1.4.0") diff --git a/tests/test_blockhash.py b/tests/test_blockhash.py deleted file mode 100644 index 003f120..0000000 --- a/tests/test_blockhash.py +++ /dev/null @@ -1,11 +0,0 @@ -import ast - -from codeclone.blockhash import stmt_hashes -from codeclone.normalize import NormalizationConfig - - -def test_stmt_hash_normalizes_names() -> None: - cfg = NormalizationConfig() - s1 = ast.parse("a = b + 1").body[0] - s2 = ast.parse("x = y + 2").body[0] - assert stmt_hashes([s1], cfg)[0] == stmt_hashes([s2], cfg)[0] diff --git a/tests/test_blocks.py b/tests/test_blocks.py index d551666..a875635 100644 --- a/tests/test_blocks.py +++ b/tests/test_blocks.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import ast from codeclone.blocks import extract_blocks diff --git a/tests/test_cache.py b/tests/test_cache.py index e0c2cf3..788709e 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations import json @@ -11,6 +17,8 @@ import codeclone.cache as cache_mod from codeclone.blocks import BlockUnit, SegmentUnit from codeclone.cache import Cache, CacheStatus +from codeclone.cache_io import sign_cache_payload +from codeclone.cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime from codeclone.errors import CacheError from codeclone.extractor import Unit @@ -161,7 +169,7 @@ def test_cache_load_normalizes_stale_structural_findings(tmp_path: Path) -> None cache, files={"x.py": cache_mod._encode_wire_file_entry(entry)}, ) - signature = cache._sign_data(payload) + signature = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": signature}), "utf-8", @@ -289,7 +297,7 @@ def test_cache_v13_missing_optional_sections_default_empty(tmp_path: Path) -> No cache_path = tmp_path / "cache.json" cache = Cache(cache_path) payload = _analysis_payload(cache, files={"x.py": {"st": [1, 2]}}) - signature = cache._sign_data(payload) + signature = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": signature}), "utf-8", @@ -393,9 +401,8 @@ def test_cache_signature_mismatch_warns(tmp_path: Path) -> None: def test_cache_version_mismatch_warns(tmp_path: Path) -> None: cache_path = tmp_path / "cache.json" - cache = Cache(cache_path) data = {"version": "0.0", "files": {}} - signature = cache._sign_data(data) + signature = sign_cache_payload(data) cache_path.write_text( json.dumps({**data, "_signature": signature}, ensure_ascii=False, indent=2), "utf-8", @@ -411,13 +418,14 @@ def test_cache_version_mismatch_warns(tmp_path: Path) -> None: assert loaded.cache_schema_version == "0.0" -def test_cache_v_field_version_mismatch_warns(tmp_path: Path) -> None: +@pytest.mark.parametrize("version", ["0.0", "2.2"]) +def 
test_cache_v_field_version_mismatch_warns(tmp_path: Path, version: str) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) payload = _analysis_payload(cache, files={}) - signature = cache._sign_data(payload) + signature = sign_cache_payload(payload) cache_path.write_text( - json.dumps({"v": "0.0", "payload": payload, "sig": signature}), "utf-8" + json.dumps({"v": version, "payload": payload, "sig": signature}), "utf-8" ) loaded = Cache(cache_path) @@ -426,7 +434,7 @@ def test_cache_v_field_version_mismatch_warns(tmp_path: Path) -> None: assert "version mismatch" in loaded.load_warning assert loaded.data["files"] == {} assert loaded.load_status == CacheStatus.VERSION_MISMATCH - assert loaded.cache_schema_version == "0.0" + assert loaded.cache_schema_version == version def test_cache_too_large_warns(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: @@ -737,7 +745,7 @@ def test_cache_load_invalid_files_type(tmp_path: Path) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) payload = _analysis_payload(cache, files=[]) - signature = cache._sign_data(payload) + signature = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": signature}), "utf-8", @@ -838,7 +846,7 @@ def test_cache_load_missing_v_field(tmp_path: Path) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) payload = _analysis_payload(cache, files={}) - sig = cache._sign_data(payload) + sig = sign_cache_payload(payload) cache_path.write_text(json.dumps({"payload": payload, "sig": sig}), "utf-8") cache.load() assert cache.load_warning is not None @@ -871,7 +879,7 @@ def test_cache_load_rejects_missing_required_payload_fields( cache_path = tmp_path / "cache.json" cache = Cache(cache_path) payload = payload_factory(cache) - sig = cache._sign_data(payload) + sig = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": 
sig}), "utf-8" ) @@ -889,7 +897,7 @@ def test_cache_load_python_tag_mismatch(tmp_path: Path) -> None: "ap": cache.data["analysis_profile"], "files": {}, } - sig = cache._sign_data(payload) + sig = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8" ) @@ -907,7 +915,7 @@ def test_cache_load_fingerprint_version_mismatch(tmp_path: Path) -> None: "ap": cache.data["analysis_profile"], "files": {}, } - sig = cache._sign_data(payload) + sig = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8" ) @@ -940,7 +948,7 @@ def test_cache_load_missing_analysis_profile_in_payload(tmp_path: Path) -> None: "fp": cache.data["fingerprint_version"], "files": {}, } - sig = cache._sign_data(payload) + sig = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8" ) @@ -971,7 +979,7 @@ def test_cache_load_invalid_analysis_profile_payload( "ap": bad_analysis_profile, "files": {}, } - sig = cache._sign_data(payload) + sig = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8" ) @@ -988,7 +996,7 @@ def test_cache_load_invalid_wire_file_entry(tmp_path: Path) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) payload = _analysis_payload(cache, files={"x.py": {"st": "bad"}}) - sig = cache._sign_data(payload) + sig = sign_cache_payload(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8" ) @@ -1028,7 +1036,9 @@ def test_wire_filepath_outside_root_falls_back_to_runtime_path(tmp_path: Path) - root.mkdir() cache = Cache(tmp_path / "cache.json", root=root) outside = tmp_path / "outside.py" - assert cache._wire_filepath_from_runtime(str(outside)) == outside.as_posix() + assert ( + 
wire_filepath_from_runtime(str(outside), root=cache.root) == outside.as_posix() + ) def test_wire_filepath_resolve_oserror_falls_back_to_runtime_path( @@ -1046,7 +1056,9 @@ def _resolve_with_error(self: Path, *, strict: bool = False) -> Path: return original_resolve(self, strict=strict) monkeypatch.setattr(Path, "resolve", _resolve_with_error) - assert cache._wire_filepath_from_runtime(str(runtime)) == runtime.as_posix() + assert ( + wire_filepath_from_runtime(str(runtime), root=cache.root) == runtime.as_posix() + ) def test_wire_filepath_resolve_relative_success_path( @@ -1067,7 +1079,7 @@ def _resolve_with_mapping(self: Path, *, strict: bool = False) -> Path: return original_resolve(self, strict=strict) monkeypatch.setattr(Path, "resolve", _resolve_with_mapping) - assert cache._wire_filepath_from_runtime(str(runtime)) == "pkg/module.py" + assert wire_filepath_from_runtime(str(runtime), root=cache.root) == "pkg/module.py" def test_runtime_filepath_from_wire_resolve_oserror( @@ -1085,7 +1097,7 @@ def _resolve_with_error(self: Path, *, strict: bool = False) -> Path: return original_resolve(self, strict=strict) monkeypatch.setattr(Path, "resolve", _resolve_with_error) - assert cache._runtime_filepath_from_wire("pkg/module.py") == str(combined) + assert runtime_filepath_from_wire("pkg/module.py", root=cache.root) == str(combined) def test_as_str_dict_rejects_non_string_keys() -> None: @@ -1265,6 +1277,73 @@ def test_decode_wire_file_entry_optional_source_stats() -> None: ) +def test_cache_helpers_cover_invalid_analysis_profile_and_source_stats_shapes() -> None: + assert ( + cache_mod._decode_wire_qualname_span_size(["pkg.mod:fn", 1, 2, "bad"]) is None + ) + assert cache_mod._decode_wire_qualname_span_size([None, 1, 2, 4]) is None + assert ( + cache_mod._as_analysis_profile( + { + "min_loc": 1, + "min_stmt": 1, + "block_min_loc": 2, + "block_min_stmt": "bad", + "segment_min_loc": 3, + "segment_min_stmt": 4, + } + ) + is None + ) + assert ( + 
cache_mod._decode_optional_wire_source_stats(obj={"ss": [1, 2, "bad", 0]}) + is None + ) + + +def test_canonicalize_cache_entry_skips_invalid_dead_candidate_suppression_shape() -> ( + None +): + normalized = cache_mod._canonicalize_cache_entry( + cast( + Any, + { + "stat": {"mtime_ns": 1, "size": 2}, + "units": [], + "blocks": [], + "segments": [], + "class_metrics": [], + "module_deps": [], + "dead_candidates": [ + { + "qualname": "pkg.mod:unused", + "local_name": "unused", + "filepath": "pkg/mod.py", + "start_line": 1, + "end_line": 2, + "kind": "function", + "suppressed_rules": "dead-code", + } + ], + "referenced_names": [], + "referenced_qualnames": [], + "import_names": [], + "class_names": [], + }, + ) + ) + assert normalized["dead_candidates"] == [ + { + "qualname": "pkg.mod:unused", + "local_name": "unused", + "filepath": "pkg/mod.py", + "start_line": 1, + "end_line": 2, + "kind": "function", + } + ] + + def test_decode_optional_wire_coupled_classes_rejects_non_string_qualname() -> None: assert ( cache_mod._decode_optional_wire_coupled_classes( @@ -1550,6 +1629,18 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: assert cache_mod._is_class_metrics_dict([]) is False assert cache_mod._is_module_dep_dict([]) is False assert cache_mod._is_dead_candidate_dict([]) is False + assert ( + cache_mod._is_dead_candidate_dict( + { + "qualname": "pkg.mod:broken", + "local_name": "broken", + "filepath": "pkg/mod.py", + "start_line": 1, + "end_line": 2, + } + ) + is False + ) assert ( cache_mod._is_dead_candidate_dict( { diff --git a/tests/test_cfg.py b/tests/test_cfg.py index b0c4955..a111819 100644 --- a/tests/test_cfg.py +++ b/tests/test_cfg.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import ast from textwrap import dedent diff --git a/tests/test_cfg_model.py b/tests/test_cfg_model.py index 216c9a8..36c4eee 100644 --- a/tests/test_cfg_model.py +++ b/tests/test_cfg_model.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from codeclone.cfg_model import CFG, Block diff --git a/tests/test_cli_config.py b/tests/test_cli_config.py index 4fdfcd4..ecc60a0 100644 --- a/tests/test_cli_config.py +++ b/tests/test_cli_config.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations import argparse diff --git a/tests/test_cli_inprocess.py b/tests/test_cli_inprocess.py index 1317065..8651b3f 100644 --- a/tests/test_cli_inprocess.py +++ b/tests/test_cli_inprocess.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations import json @@ -1869,16 +1875,37 @@ def test_cli_too_large_baseline_fails_in_ci( _assert_report_baseline_meta(payload, status="too_large", loaded=False) +@pytest.mark.parametrize( + ("mutator", "expected_message", "expected_status", "expected_schema_version"), + [ + ( + lambda data: data.__setitem__("sig", "bad"), + "signature", + "integrity_failed", + CACHE_VERSION, + ), + ( + lambda data: data.__setitem__("v", "2.2"), + "Cache version mismatch", + "version_mismatch", + "2.2", + ), + ], +) def test_cli_reports_cache_used_false_on_warning( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], + mutator: Callable[[dict[str, object]], None], + expected_message: str, + expected_status: str, + expected_schema_version: object, ) -> None: src, cache_path, cache = _prepare_single_source_cache(tmp_path) cache.put_file_entry(str(src), {"mtime_ns": 1, "size": 10}, [], [], []) cache.save() data = json.loads(cache_path.read_text("utf-8")) - data["sig"] = "bad" + mutator(data) cache_path.write_text(json.dumps(data), "utf-8") baseline_path = _write_current_python_baseline(tmp_path / "baseline.json") @@ -1893,12 +1920,12 @@ def test_cli_reports_cache_used_false_on_warning( ], ) out = capsys.readouterr().out - assert "signature" in out + assert expected_message in out _assert_report_cache_meta( payload, used=False, - status="integrity_failed", - schema_version=CACHE_VERSION, + status=expected_status, + schema_version=expected_schema_version, ) diff --git a/tests/test_cli_main_guard.py b/tests/test_cli_main_guard.py index 3570d68..904c2b3 100644 --- a/tests/test_cli_main_guard.py +++ b/tests/test_cli_main_guard.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import os import subprocess import sys diff --git a/tests/test_cli_main_guard_runpy.py b/tests/test_cli_main_guard_runpy.py index f130c4e..9685a8a 100644 --- a/tests/test_cli_main_guard_runpy.py +++ b/tests/test_cli_main_guard_runpy.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import runpy import sys diff --git a/tests/test_cli_smoke.py b/tests/test_cli_smoke.py index 97093f2..ea3399f 100644 --- a/tests/test_cli_smoke.py +++ b/tests/test_cli_smoke.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import os import subprocess import sys diff --git a/tests/test_cli_unit.py b/tests/test_cli_unit.py index 89c3c26..37338c9 100644 --- a/tests/test_cli_unit.py +++ b/tests/test_cli_unit.py @@ -1,14 +1,23 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import json import os +import subprocess import sys import webbrowser from argparse import Namespace from collections.abc import Callable from pathlib import Path -from typing import cast +from types import SimpleNamespace +from typing import Any, cast import pytest +import codeclone._cli_meta as cli_meta_mod import codeclone._cli_reports as cli_reports import codeclone._cli_summary as cli_summary import codeclone.baseline as baseline_mod @@ -19,6 +28,7 @@ from codeclone import ui_messages as ui from codeclone._cli_args import build_parser from codeclone._cli_config import ConfigValidationError +from codeclone.cache import Cache from codeclone.cli import process_file from codeclone.contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL from codeclone.errors import BaselineValidationError @@ -151,6 +161,9 @@ def test_cli_help_text_consistency( "Structural code quality analysis for Python.", "Target:", "Analysis:", + "--changed-only", + "--diff-against GIT_REF", + "--paths-from-git-diff GIT_REF", "Baselines and CI:", "Quality gates:", "Analysis stages:", @@ -206,6 +219,16 @@ def test_report_path_origins_distinguish_bare_and_explicit_flags() -> None: } +def test_report_path_origins_stops_at_double_dash() -> None: + assert cli._report_path_origins(("--json=out.json", "--", "--html")) == { + "html": None, + "json": "explicit", + "md": None, + "sarif": None, + "text": None, + } + + def test_timestamped_report_path_appends_utc_slug() -> None: path = Path("/tmp/report.html") assert cli._timestamped_report_path( @@ -231,6 +254,16 @@ def test_open_html_report_in_browser_raises_without_handler( cli_reports._open_html_report_in_browser(path=report_path) +def test_open_html_report_in_browser_succeeds_when_handler_exists( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + report_path = tmp_path / "report.html" + report_path.write_text("", encoding="utf-8") + monkeypatch.setattr(webbrowser, 
"open_new_tab", lambda _uri: True) + cli_reports._open_html_report_in_browser(path=report_path) + + def test_cli_plain_console_status_context() -> None: plain = cli._make_plain_console() with plain.status("noop"): @@ -304,6 +337,502 @@ def test_argument_parser_contract_error_marker_for_invalid_args( assert "CONTRACT ERROR:" in err +def test_validate_changed_scope_args_requires_diff_source() -> None: + cli.console = cli._make_console(no_color=True) + args = Namespace( + changed_only=True, + diff_against=None, + paths_from_git_diff=None, + ) + with pytest.raises(SystemExit) as exc: + cli._validate_changed_scope_args(args=args) + assert exc.value.code == 2 + + +def test_validate_changed_scope_args_requires_changed_only_for_diff_against() -> None: + cli.console = cli._make_console(no_color=True) + args = Namespace( + changed_only=False, + diff_against="main", + paths_from_git_diff=None, + ) + with pytest.raises(SystemExit) as exc: + cli._validate_changed_scope_args(args=args) + assert exc.value.code == 2 + + +def test_validate_changed_scope_args_promotes_paths_from_git_diff() -> None: + args = Namespace( + changed_only=False, + diff_against=None, + paths_from_git_diff="HEAD~1", + ) + assert cli._validate_changed_scope_args(args=args) == "HEAD~1" + assert args.changed_only is True + + +def test_validate_changed_scope_args_rejects_conflicting_diff_sources() -> None: + cli.console = cli._make_console(no_color=True) + args = Namespace( + changed_only=True, + diff_against="HEAD~1", + paths_from_git_diff="HEAD~2", + ) + with pytest.raises(SystemExit) as exc: + cli._validate_changed_scope_args(args=args) + assert exc.value.code == 2 + + +def test_normalize_changed_paths_relativizes_dedupes_and_sorts(tmp_path: Path) -> None: + root_path = tmp_path.resolve() + pkg_dir = root_path / "pkg" + pkg_dir.mkdir() + first = pkg_dir / "b.py" + second = pkg_dir / "a.py" + first.write_text("pass\n", "utf-8") + second.write_text("pass\n", "utf-8") + + assert cli._normalize_changed_paths( + 
root_path=root_path, + paths=("pkg/b.py", str(second), " pkg/b.py ", ""), + ) == ("pkg/a.py", "pkg/b.py") + + +def test_normalize_changed_paths_skips_empty_relative_results( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + root_path = tmp_path.resolve() + candidate = root_path / "marker.py" + candidate.write_text("pass\n", encoding="utf-8") + original_relative_to = Path.relative_to + + def _fake_relative_to(self: Path, *other: str | Path) -> Path: + if self == candidate: + return Path("/") + return original_relative_to(self, *other) + + monkeypatch.setattr(Path, "relative_to", _fake_relative_to) + assert ( + cli._normalize_changed_paths(root_path=root_path, paths=(str(candidate),)) == () + ) + + +def test_normalize_changed_paths_reports_unresolvable_path( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + cli.console = cli._make_console(no_color=True) + root_path = tmp_path.resolve() + original_resolve = Path.resolve + + def _broken_resolve(self: Path, strict: bool = False) -> Path: + if self.name == "broken.py": + raise OSError("boom") + return original_resolve(self, strict=strict) + + monkeypatch.setattr(Path, "resolve", _broken_resolve) + with pytest.raises(SystemExit) as exc: + cli._normalize_changed_paths(root_path=root_path, paths=("broken.py",)) + assert exc.value.code == 2 + + +def test_normalize_changed_paths_rejects_outside_root(tmp_path: Path) -> None: + cli.console = cli._make_console(no_color=True) + root_path = tmp_path.resolve() + outside_dir = tmp_path.parent / f"{tmp_path.name}-outside" + outside_dir.mkdir() + outside_path = outside_dir / "external.py" + outside_path.write_text("pass\n", "utf-8") + + with pytest.raises(SystemExit) as exc: + cli._normalize_changed_paths(root_path=root_path, paths=(str(outside_path),)) + assert exc.value.code == 2 + + +def test_git_diff_changed_paths_normalizes_subprocess_output( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + root_path = tmp_path.resolve() + pkg_dir = 
root_path / "pkg" + pkg_dir.mkdir() + (pkg_dir / "a.py").write_text("pass\n", "utf-8") + (pkg_dir / "b.py").write_text("pass\n", "utf-8") + + def _run(*args: object, **kwargs: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=["git", "diff", "--name-only", "HEAD~1", "--"], + returncode=0, + stdout="pkg/b.py\npkg/a.py\n\n", + stderr="", + ) + + monkeypatch.setattr(subprocess, "run", _run) + assert cli._git_diff_changed_paths(root_path=root_path, git_diff_ref="HEAD~1") == ( + "pkg/a.py", + "pkg/b.py", + ) + + +def test_git_diff_changed_paths_reports_subprocess_errors( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + cli.console = cli._make_console(no_color=True) + + def _run(*args: object, **kwargs: object) -> subprocess.CompletedProcess[str]: + raise subprocess.TimeoutExpired(cmd="git diff", timeout=30) + + monkeypatch.setattr(subprocess, "run", _run) + with pytest.raises(SystemExit) as exc: + cli._git_diff_changed_paths(root_path=tmp_path.resolve(), git_diff_ref="HEAD~1") + assert exc.value.code == 2 + + +def test_git_diff_changed_paths_rejects_option_like_ref(tmp_path: Path) -> None: + cli.console = cli._make_console(no_color=True) + with pytest.raises(SystemExit) as exc: + cli._git_diff_changed_paths( + root_path=tmp_path.resolve(), git_diff_ref="--cached" + ) + assert exc.value.code == 2 + + +def test_report_path_origins_ignores_unrelated_equals_tokens() -> None: + assert cli._report_path_origins(("--unknown=value", "--json=out.json")) == { + "html": None, + "json": "explicit", + "md": None, + "sarif": None, + "text": None, + } + + +def test_changed_clone_gate_from_report_filters_changed_scope() -> None: + gate = cli._changed_clone_gate_from_report( + { + "findings": { + "groups": { + "clones": { + "functions": [ + { + "id": "clone:function:new", + "family": "clone", + "category": "function", + "novelty": "new", + "items": [{"relative_path": "pkg/dup.py"}], + }, + { + "id": "clone:function:known", + 
"family": "clone", + "category": "function", + "novelty": "known", + "items": [{"relative_path": "pkg/other.py"}], + }, + ], + "blocks": [ + { + "id": "clone:block:known", + "family": "clone", + "category": "block", + "novelty": "known", + "items": [{"relative_path": "pkg/dup.py"}], + } + ], + "segments": [], + }, + "structural": { + "groups": [ + { + "id": "structural:changed", + "family": "structural", + "novelty": "new", + "items": [{"relative_path": "pkg/dup.py"}], + } + ] + }, + "dead_code": {"groups": []}, + "design": {"groups": []}, + } + } + }, + changed_paths=("pkg/dup.py",), + ) + assert gate.changed_paths == ("pkg/dup.py",) + assert gate.total_clone_groups == 2 + assert gate.new_func == frozenset({"clone:function:new"}) + assert gate.new_block == frozenset() + assert gate.findings_total == 3 + assert gate.findings_new == 2 + assert gate.findings_known == 1 + + +def test_run_analysis_stages_requires_rich_console_when_progress_ui_is_enabled( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + cli.console = cli._make_plain_console() + monkeypatch.setattr( + cli, + "discover", + lambda **_kwargs: SimpleNamespace( + skipped_warnings=(), files_to_process=("x.py",) + ), + ) + + with pytest.raises(RuntimeError, match="Rich console is required"): + cli._run_analysis_stages( + args=Namespace(quiet=False, no_progress=False), + boot=cast(Any, object()), + cache=Cache(tmp_path / "cache.json"), + ) + + +def test_run_analysis_stages_prints_source_read_failures_when_failed_files_are_empty( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + cli.console = cli._make_plain_console() + printed: list[tuple[object, ...]] = [] + monkeypatch.setattr( + cli, + "_print_failed_files", + lambda failures: printed.append(tuple(failures)), + ) + monkeypatch.setattr( + cli, + "discover", + lambda **_kwargs: SimpleNamespace(skipped_warnings=(), files_to_process=()), + ) + monkeypatch.setattr( + cli, + "process", + lambda **_kwargs: SimpleNamespace( + 
failed_files=(), + source_read_failures=("pkg/mod.py: unreadable",), + ), + ) + monkeypatch.setattr(cli, "analyze", lambda **_kwargs: SimpleNamespace()) + monkeypatch.setattr( + cli, + "_cache_update_segment_projection", + lambda *_args, **_kwargs: None, + ) + monkeypatch.setattr(Cache, "save", lambda self: None) + + cli._run_analysis_stages( + args=Namespace(quiet=False, no_progress=True), + boot=cast(Any, object()), + cache=Cache(tmp_path / "cache.json"), + ) + + assert printed == [(), ("pkg/mod.py: unreadable",)] + + +def test_enforce_gating_rewrites_clone_threshold_for_changed_scope( + monkeypatch: pytest.MonkeyPatch, +) -> None: + cli.console = cli._make_console(no_color=True) + observed: dict[str, object] = {} + + monkeypatch.setattr( + cli, + "gate", + lambda **_kwargs: pipeline.GatingResult( + exit_code=3, + reasons=("clone:threshold:8:1",), + ), + ) + monkeypatch.setattr( + cli, + "_print_gating_failure_block", + lambda *, code, entries, args: observed.update( + {"code": code, "entries": tuple(entries), "threshold": args.fail_threshold} + ), + ) + + with pytest.raises(SystemExit) as exc: + cli._enforce_gating( + args=Namespace(fail_threshold=1, verbose=False), + boot=cast("pipeline.BootstrapResult", object()), + analysis=cast("pipeline.AnalysisResult", object()), + processing=cast(Any, Namespace(source_read_failures=[])), + source_read_contract_failure=False, + baseline_failure_code=None, + metrics_baseline_failure_code=None, + new_func=set(), + new_block=set(), + metrics_diff=None, + html_report_path=None, + clone_threshold_total=2, + ) + + assert exc.value.code == 3 + assert observed["code"] == "threshold" + assert observed["entries"] == ( + ("clone_groups_total", 2), + ("clone_groups_limit", 1), + ) + + +def test_enforce_gating_drops_rewritten_threshold_when_changed_scope_is_within_limit( + monkeypatch: pytest.MonkeyPatch, +) -> None: + cli.console = cli._make_console(no_color=True) + observed: dict[str, object] = {} + + monkeypatch.setattr( + cli, + 
"gate", + lambda **_kwargs: pipeline.GatingResult( + exit_code=3, + reasons=("clone:threshold:8:1",), + ), + ) + monkeypatch.setattr( + cli, + "_print_gating_failure_block", + lambda **kwargs: observed.update(kwargs), + ) + + cli._enforce_gating( + args=Namespace(fail_threshold=5, verbose=False), + boot=cast("pipeline.BootstrapResult", object()), + analysis=cast("pipeline.AnalysisResult", object()), + processing=cast(Any, Namespace(source_read_failures=[])), + source_read_contract_failure=False, + baseline_failure_code=None, + metrics_baseline_failure_code=None, + new_func=set(), + new_block=set(), + metrics_diff=None, + html_report_path=None, + clone_threshold_total=2, + ) + + assert observed == {} + + +def test_main_impl_prints_changed_scope_when_changed_projection_is_available( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + baseline_path = tmp_path / "baseline.json" + metrics_path = tmp_path / "metrics.json" + cache_path = tmp_path / "cache.json" + monkeypatch.setattr( + sys, + "argv", + [ + "codeclone", + str(tmp_path), + "--quiet", + "--changed-only", + "--diff-against", + "HEAD~1", + "--baseline", + str(baseline_path), + "--metrics-baseline", + str(metrics_path), + "--cache-path", + str(cache_path), + ], + ) + observed: dict[str, object] = {} + + monkeypatch.setattr(cli, "load_pyproject_config", lambda _root: {}) + monkeypatch.setattr( + cli, + "apply_pyproject_config_overrides", + lambda **_kwargs: None, + ) + monkeypatch.setattr( + cli, + "_git_diff_changed_paths", + lambda **_kwargs: ("pkg/dup.py",), + ) + monkeypatch.setattr(cli, "_validate_report_ui_flags", lambda **_kwargs: None) + monkeypatch.setattr(cli, "bootstrap", lambda **_kwargs: cast(Any, object())) + monkeypatch.setattr( + cli, + "_run_analysis_stages", + lambda **_kwargs: ( + SimpleNamespace(files_found=1, cache_hits=0), + SimpleNamespace( + files_analyzed=1, + files_skipped=0, + analyzed_lines=10, + analyzed_functions=1, + analyzed_methods=0, + analyzed_classes=0, + 
source_read_failures=(), + ), + SimpleNamespace( + func_groups={}, + block_groups={}, + func_clones_count=0, + block_clones_count=0, + segment_clones_count=0, + suppressed_segment_groups=0, + project_metrics=None, + ), + ), + ) + monkeypatch.setattr( + cli, + "_resolve_clone_baseline_state", + lambda **_kwargs: SimpleNamespace( + baseline=baseline_mod.Baseline(baseline_path), + loaded=False, + status=baseline_mod.BaselineStatus.MISSING, + trusted_for_diff=False, + updated_path=None, + failure_code=None, + ), + ) + monkeypatch.setattr( + cli, + "_resolve_metrics_baseline_state", + lambda **_kwargs: SimpleNamespace( + baseline=metrics_baseline_mod.MetricsBaseline(metrics_path), + loaded=False, + status=metrics_baseline_mod.MetricsBaselineStatus.MISSING, + trusted_for_diff=False, + failure_code=None, + ), + ) + monkeypatch.setattr(cli_meta_mod, "_build_report_meta", lambda **_kwargs: {}) + monkeypatch.setattr(cli, "_print_summary", lambda **_kwargs: None) + monkeypatch.setattr( + cli, "report", lambda **_kwargs: SimpleNamespace(report_document={}) + ) + monkeypatch.setattr( + cli, + "_changed_clone_gate_from_report", + lambda _report, changed_paths: cli.ChangedCloneGate( + changed_paths=tuple(changed_paths), + new_func=frozenset(), + new_block=frozenset(), + total_clone_groups=0, + findings_total=3, + findings_new=1, + findings_known=2, + ), + ) + monkeypatch.setattr( + cli, + "_print_changed_scope", + lambda **kwargs: observed.update(kwargs), + ) + monkeypatch.setattr(cli, "_write_report_outputs", lambda **_kwargs: None) + monkeypatch.setattr(cli, "_enforce_gating", lambda **_kwargs: None) + + cli._main_impl() + + changed_scope = cast(Any, observed["changed_scope"]) + assert observed["quiet"] is True + assert changed_scope.paths_count == 1 + assert changed_scope.findings_total == 3 + + def test_make_console_caps_width_to_layout_limit( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -415,6 +944,65 @@ def test_ui_summary_formatters_cover_optional_branches() -> None: 
clean_with_suppressed = ui.fmt_metrics_dead_code(0, suppressed=9) assert "✔ clean" in clean_with_suppressed assert "(9 suppressed)" in clean_with_suppressed + changed_paths = ui.fmt_changed_scope_paths(count=45) + assert "45" in changed_paths + assert "from git diff" in changed_paths + changed_findings = ui.fmt_changed_scope_findings(total=7, new=2, known=5) + assert "total" in changed_findings + assert "new" in changed_findings + assert "5 known" in changed_findings + changed_compact = ui.fmt_changed_scope_compact( + paths=45, + findings=7, + new=2, + known=5, + ) + assert "Changed" in changed_compact + assert "paths=45" in changed_compact + assert "findings=7" in changed_compact + + +def test_print_changed_scope_uses_dedicated_block( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) + cli_summary._print_changed_scope( + console=cast("cli_summary._Printer", cli.console), + quiet=False, + changed_scope=cli_summary.ChangedScopeSnapshot( + paths_count=45, + findings_total=7, + findings_new=2, + findings_known=5, + ), + ) + out = capsys.readouterr().out + assert "Changed Scope" in out + assert "Paths" in out + assert "Findings" in out + assert "from git diff" in out + + +def test_print_changed_scope_uses_compact_line_in_quiet_mode( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) + cli_summary._print_changed_scope( + console=cast("cli_summary._Printer", cli.console), + quiet=True, + changed_scope=cli_summary.ChangedScopeSnapshot( + paths_count=45, + findings_total=7, + findings_new=2, + findings_known=5, + ), + ) + out = capsys.readouterr().out + assert "Changed" in out + assert "paths=45" in out + assert "findings=7" in out + assert "new=2" in out + assert "known=5" in out def test_configure_metrics_mode_rejects_skip_metrics_with_metrics_flags( diff --git 
a/tests/test_coerce.py b/tests/test_coerce.py index 0112504..9b7b0c0 100644 --- a/tests/test_coerce.py +++ b/tests/test_coerce.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations from collections.abc import Mapping, Sequence diff --git a/tests/test_core_branch_coverage.py b/tests/test_core_branch_coverage.py index 43407e0..888f8dc 100644 --- a/tests/test_core_branch_coverage.py +++ b/tests/test_core_branch_coverage.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations from argparse import Namespace @@ -26,6 +32,7 @@ _is_dead_candidate_dict, build_segment_report_projection, ) +from codeclone.cache_segments import decode_segment_report_projection from codeclone.errors import CacheError from codeclone.grouping import build_segment_groups from codeclone.models import ( @@ -413,29 +420,41 @@ def test_cache_segment_report_projection_filters_invalid_items(tmp_path: Path) - def test_cache_decode_segment_projection_invalid_shapes(tmp_path: Path) -> None: cache = Cache(tmp_path / "cache.json", root=tmp_path.resolve()) assert ( - cache._decode_segment_report_projection({"d": "x", "s": 0, "g": "bad"}) is None + decode_segment_report_projection( + {"d": "x", "s": 0, "g": "bad"}, + root=cache.root, + ) + is None ) assert ( - cache._decode_segment_report_projection({"d": "x", "s": 0, "g": [["k"]]}) + decode_segment_report_projection( + {"d": "x", "s": 0, "g": [["k"]]}, + root=cache.root, + ) is None ) assert ( - 
cache._decode_segment_report_projection({"d": "x", "s": 0, "g": [[1, []]]}) + decode_segment_report_projection( + {"d": "x", "s": 0, "g": [[1, []]]}, + root=cache.root, + ) is None ) assert ( - cache._decode_segment_report_projection( - {"d": "x", "s": 0, "g": [["k", ["bad-item"]]]} + decode_segment_report_projection( + {"d": "x", "s": 0, "g": [["k", ["bad-item"]]]}, + root=cache.root, ) is None ) assert ( - cache._decode_segment_report_projection( + decode_segment_report_projection( { "d": "x", "s": 0, "g": [["k", [["a.py", "q", 1, 2, 3, "h", None]]]], - } + }, + root=cache.root, ) is None ) diff --git a/tests/test_detector_golden.py b/tests/test_detector_golden.py index a270bb8..d03e103 100644 --- a/tests/test_detector_golden.py +++ b/tests/test_detector_golden.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations import json diff --git a/tests/test_extractor.py b/tests/test_extractor.py index aeb8161..133b0df 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import ast import os import signal @@ -9,11 +15,12 @@ import pytest -from codeclone import extractor +from codeclone import extractor, qualnames from codeclone.errors import ParseError from codeclone.metrics import find_unused -from codeclone.models import BlockUnit, ModuleDep, SegmentUnit +from codeclone.models import BlockUnit, ClassMetrics, ModuleDep, SegmentUnit from codeclone.normalize import NormalizationConfig +from codeclone.qualnames import FunctionNode, QualnameCollector def extract_units_from_source( @@ -52,9 +59,9 @@ def extract_units_from_source( def _parse_tree_and_collector( source: str, -) -> tuple[ast.Module, extractor._QualnameCollector]: +) -> tuple[ast.Module, QualnameCollector]: tree = ast.parse(source) - collector = extractor._QualnameCollector() + collector = QualnameCollector() collector.visit(tree) return tree, collector @@ -64,7 +71,7 @@ def _collect_module_walk( *, module_name: str = "pkg.mod", collect_referenced_names: bool = True, -) -> tuple[ast.Module, extractor._QualnameCollector, extractor._ModuleWalkResult]: +) -> tuple[ast.Module, QualnameCollector, extractor._ModuleWalkResult]: tree, collector = _parse_tree_and_collector(source) walk = extractor._collect_module_walk_data( tree=tree, @@ -134,6 +141,75 @@ def test_declaration_token_index_uses_prebuilt_index() -> None: ) +def test_declaration_helpers_cover_async_found_tokens_and_eof_scan() -> None: + async_node = ast.parse( + """ +async def demo(): + return 1 +""" + ).body[0] + assert isinstance(async_node, ast.AsyncFunctionDef) + assert extractor._declaration_token_name(async_node) == "async" + + tokens = extractor._source_tokens("def demo():\n return 1\n") + assert ( + extractor._declaration_token_index( + source_tokens=tokens, + start_line=1, + start_col=0, + declaration_token="def", + ) + == 0 + ) + + nested_tokens = extractor._source_tokens( + "def demo(arg: tuple[int, int]) -> tuple[int, int]:\n return 
arg\n" + ) + assert ( + extractor._scan_declaration_colon_line( + source_tokens=nested_tokens, + start_index=0, + ) + == 1 + ) + + default_tokens = extractor._source_tokens( + "def demo(arg=(1, [2])):\n return arg\n" + ) + assert ( + extractor._scan_declaration_colon_line( + source_tokens=default_tokens, + start_index=0, + ) + == 1 + ) + + eof_tokens = ( + tokenize.TokenInfo(tokenize.NAME, "def", (1, 0), (1, 3), "def demo("), + tokenize.TokenInfo(tokenize.NAME, "demo", (1, 4), (1, 8), "def demo("), + tokenize.TokenInfo(tokenize.OP, "(", (1, 8), (1, 9), "def demo("), + ) + assert ( + extractor._scan_declaration_colon_line( + source_tokens=eof_tokens, + start_index=0, + ) + is None + ) + + unmatched_close_tokens = ( + tokenize.TokenInfo(tokenize.NAME, "def", (1, 0), (1, 3), "def demo)"), + tokenize.TokenInfo(tokenize.OP, ")", (1, 8), (1, 9), "def demo)"), + ) + assert ( + extractor._scan_declaration_colon_line( + source_tokens=unmatched_close_tokens, + start_index=0, + ) + is None + ) + + def test_scan_declaration_colon_line_returns_none_when_header_is_incomplete() -> None: tokens = extractor._source_tokens("def broken\n") assert ( @@ -168,6 +244,40 @@ def broken(): assert extractor._declaration_end_line(node, source_tokens=()) == 0 +def test_declaration_fallback_helpers_cover_empty_and_same_line_bodies() -> None: + empty_body_node = ast.parse( + """ +def demo(): + return 1 +""" + ).body[0] + assert isinstance(empty_body_node, ast.FunctionDef) + empty_body_node.body = [] + assert extractor._fallback_declaration_end_line(empty_body_node, start_line=2) == 2 + + inline_body_node = ast.parse( + """ +def demo(): + return 1 +""" + ).body[0] + assert isinstance(inline_body_node, ast.FunctionDef) + inline_body_node.body[0].lineno = 2 + assert extractor._fallback_declaration_end_line(inline_body_node, start_line=2) == 2 + + no_colon_tokens = ( + tokenize.TokenInfo(tokenize.NAME, "def", (2, 0), (2, 3), "def demo"), + tokenize.TokenInfo(tokenize.NAME, "demo", (2, 4), (2, 8), 
"def demo"), + ) + assert ( + extractor._declaration_end_line( + inline_body_node, + source_tokens=no_colon_tokens, + ) + == 2 + ) + + def test_init_function_is_ignored_for_blocks() -> None: src = """ class A: @@ -557,7 +667,7 @@ def test_collect_module_walk_data_imports_and_references() -> None: obj.method() """.strip() ) - collector = extractor._QualnameCollector() + collector = QualnameCollector() collector.visit(tree) walk = extractor._collect_module_walk_data( tree=tree, @@ -597,7 +707,7 @@ def test_collect_module_walk_data_imports_and_references() -> None: def test_collect_module_walk_data_edge_branches() -> None: tree = ast.parse("from .... import parent") - collector = extractor._QualnameCollector() + collector = QualnameCollector() collector.visit(tree) walk = extractor._collect_module_walk_data( tree=tree, @@ -610,7 +720,7 @@ def test_collect_module_walk_data_edge_branches() -> None: assert walk.referenced_names == frozenset() lambda_call_tree = ast.parse("(lambda x: x)(1)") - lambda_collector = extractor._QualnameCollector() + lambda_collector = QualnameCollector() lambda_collector.visit(lambda_call_tree) lambda_walk = extractor._collect_module_walk_data( tree=lambda_call_tree, @@ -629,7 +739,7 @@ def test_collect_module_walk_data_without_referenced_name_collection() -> None: from .... 
import parent """.strip() ) - collector = extractor._QualnameCollector() + collector = QualnameCollector() collector.visit(tree) walk = extractor._collect_module_walk_data( tree=tree, @@ -725,7 +835,7 @@ class B(te.Protocol[int]): pass """.strip() ) - collector = extractor._QualnameCollector() + collector = QualnameCollector() collector.visit(tree) walk = extractor._collect_module_walk_data( tree=tree, @@ -828,6 +938,92 @@ def hook(self) -> int: assert "pkg.helpers:decorate" not in walk.referenced_qualnames +def test_extractor_private_helper_branches_cover_invalid_protocol_and_declarations( + monkeypatch: pytest.MonkeyPatch, +) -> None: + expr = ast.Attribute( + value=ast.Call( + func=ast.Name(id="factory", ctx=ast.Load()), + args=[], + keywords=[], + ), + attr="method", + ctx=ast.Load(), + ) + assert extractor._dotted_expr_name(expr) is None + + protocol_class = ast.parse( + """ +class Demo(Unknown, alias.Protocol): + pass +""" + ).body[0] + assert isinstance(protocol_class, ast.ClassDef) + assert ( + extractor._is_protocol_class( + protocol_class, + protocol_symbol_aliases=frozenset({"Protocol"}), + protocol_module_aliases=frozenset({"typing"}), + ) + is False + ) + + bad_span_node = ast.parse( + """ +def demo(): + return 1 +""" + ).body[0] + assert isinstance(bad_span_node, ast.FunctionDef) + bad_span_node.lineno = 3 + bad_span_node.end_lineno = 2 + assert extractor._eligible_unit_shape(bad_span_node, min_loc=1, min_stmt=1) is None + + _, missing_method_collector, missing_method_walk = _collect_module_walk( + """ +class Service: + def real(self) -> int: + return 1 + +handler = Service.missing +""" + ) + assert "pkg.mod:Service.missing" not in missing_method_walk.referenced_qualnames + assert missing_method_collector.class_nodes[0][0] == "Service" + + _, declaration_collector = _parse_tree_and_collector( + """ +class Demo: + def work(self) -> int: + return 1 +""" + ) + declaration_collector.units[0][1].end_lineno = 0 + 
declaration_collector.class_nodes[0][1].end_lineno = 0 + assert ( + extractor._collect_declaration_targets( + filepath="pkg/mod.py", + module_name="pkg.mod", + collector=declaration_collector, + ) + == () + ) + + suppression_source = """ +def demo(): # codeclone: ignore[dead-code] + return 1 +""" + _, suppression_collector = _parse_tree_and_collector(suppression_source) + monkeypatch.setattr(extractor, "_source_tokens", lambda _source: ()) + suppression_index = extractor._build_suppression_index_for_source( + source=suppression_source, + filepath="pkg/mod.py", + module_name="pkg.mod", + collector=suppression_collector, + ) + assert tuple(suppression_index.values()) == (("dead-code",),) + + def test_extract_stats_drops_referenced_names_for_test_filepaths() -> None: src = """ from pkg.mod import live @@ -855,6 +1051,55 @@ def test_extract_stats_drops_referenced_names_for_test_filepaths() -> None: assert "live" in regular_metrics.referenced_names +def test_extract_stats_keeps_class_cohesion_metrics_after_unit_fingerprinting() -> None: + src = """ +class Service: + def __init__(self): + self.path = "x" + self.data = {} + + def load(self): + if self.path: + return self.data + return {} + + def save(self): + if self.path: + self.data["saved"] = True + return self.data + + def verify(self): + return bool(self.path) and bool(self.data) + + @staticmethod + def make(): + return Service() +""" + _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source( + source=src, + filepath="pkg/service.py", + module_name="pkg.service", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + + assert file_metrics.class_metrics == ( + ClassMetrics( + qualname="pkg.service:Service", + filepath="pkg/service.py", + start_line=2, + end_line=22, + cbo=0, + lcom4=2, + method_count=5, + instance_var_count=2, + risk_coupling="low", + risk_cohesion="medium", + ), + ) + + def test_dead_code_marks_symbol_dead_when_referenced_only_by_tests() -> None: src_prod = """ def 
orphan(): @@ -1042,7 +1287,7 @@ def orphan(self) -> int: def test_collect_dead_candidates_and_extract_skip_classes_without_lineno( monkeypatch: pytest.MonkeyPatch, ) -> None: - collector = extractor._QualnameCollector() + collector = QualnameCollector() collector.visit( ast.parse( """ @@ -1070,7 +1315,7 @@ def used(): class _CollectorNoClassMetrics: def __init__(self) -> None: - self.units: list[tuple[str, extractor.FunctionNode]] = [] + self.units: list[tuple[str, FunctionNode]] = [] self.class_nodes = [("Broken", broken_class)] self.function_count = 0 self.method_count = 0 @@ -1079,7 +1324,7 @@ def __init__(self) -> None: def visit(self, _tree: ast.AST) -> None: return None - monkeypatch.setattr(extractor, "_QualnameCollector", _CollectorNoClassMetrics) + monkeypatch.setattr(qualnames, "QualnameCollector", _CollectorNoClassMetrics) _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source( source="class Broken:\n pass\n", filepath="pkg/mod.py", diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index dffb176..a785d15 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from codeclone.fingerprint import bucket_loc, sha1 diff --git a/tests/test_github_action_helpers.py b/tests/test_github_action_helpers.py new file mode 100644 index 0000000..d9a240d --- /dev/null +++ b/tests/test_github_action_helpers.py @@ -0,0 +1,194 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path +from types import ModuleType +from typing import Any, cast + + +def _load_action_impl() -> ModuleType: + path = ( + Path(__file__).resolve().parents[1] + / ".github" + / "actions" + / "codeclone" + / "_action_impl.py" + ) + spec = importlib.util.spec_from_file_location("codeclone_action_impl", path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def _assert_contains_all(text: str, expected_parts: tuple[str, ...]) -> None: + for expected in expected_parts: + assert expected in text + + +def _resolve_install_target( + *, + action_path: Path, + workspace: Path, + package_version: str, +) -> Any: + action_impl = _load_action_impl() + return action_impl.resolve_install_target( + action_path=str(action_path), + workspace=str(workspace), + package_version=package_version, + ) + + +def test_build_codeclone_args_includes_enabled_gates_and_paths() -> None: + action_impl = _load_action_impl() + inputs = action_impl.ActionInputs( + path=".", + json_path=".cache/codeclone/report.json", + sarif=True, + sarif_path=".cache/codeclone/report.sarif", + fail_on_new=True, + fail_on_new_metrics=True, + fail_threshold=5, + fail_complexity=20, + fail_coupling=10, + fail_cohesion=4, + fail_cycles=True, + fail_dead_code=True, + fail_health=60, + baseline_path="codeclone.baseline.json", + metrics_baseline_path="codeclone.baseline.json", + extra_args="--no-color --quiet", + no_progress=True, + ) + + args = cast(list[str], action_impl.build_codeclone_args(inputs)) + + assert args[:5] == [ + ".", + "--json", + ".cache/codeclone/report.json", + "--sarif", + ".cache/codeclone/report.sarif", + ] + _assert_contains_all( + " ".join(args), + ( + "--fail-on-new", + 
"--fail-on-new-metrics", + "--fail-cycles", + "--fail-dead-code", + "--no-progress", + "--baseline", + "--metrics-baseline", + "--no-color", + "--quiet", + ), + ) + + +def test_render_pr_comment_uses_canonical_report_summary() -> None: + action_impl = _load_action_impl() + report = { + "meta": { + "codeclone_version": "2.0.0b3", + "baseline": {"status": "ok"}, + "cache": {"used": True}, + }, + "findings": { + "summary": { + "families": { + "clones": 8, + "structural": 15, + "dead_code": 0, + "design": 3, + }, + "clones": { + "new": 1, + "known": 7, + }, + } + }, + "metrics": { + "summary": { + "health": { + "score": 81, + "grade": "B", + } + } + }, + } + + body = cast(str, action_impl.render_pr_comment(report, exit_code=3)) + + _assert_contains_all( + body, + ( + "", + "CodeClone Report", + "**81/100 (B)**", + ":x: Failed (gating)", + "Clones: 8 (1 new, 7 known)", + "Structural: 15", + "Dead code: 0", + "Design: 3", + "`2.0.0b3`", + ), + ) + + +def test_resolve_install_target_uses_repo_source_for_local_action_checkout( + tmp_path: Path, +) -> None: + repo_root = tmp_path / "codeclone" + action_path = repo_root / ".github" / "actions" / "codeclone" + action_path.mkdir(parents=True) + + target = _resolve_install_target( + action_path=action_path, + workspace=repo_root, + package_version="2.0.0b3", + ) + + assert target.source == "repo" + assert target.requirement == str(repo_root.resolve()) + + +def test_resolve_install_target_uses_pypi_for_remote_checkout(tmp_path: Path) -> None: + workspace_root = tmp_path / "consumer" + action_repo = tmp_path / "_actions" / "orenlab" / "codeclone" / "main" + action_path = action_repo / ".github" / "actions" / "codeclone" + action_path.mkdir(parents=True) + workspace_root.mkdir() + + pinned = _resolve_install_target( + action_path=action_path, + workspace=workspace_root, + package_version="2.0.0b3", + ) + latest = _resolve_install_target( + action_path=action_path, + workspace=workspace_root, + package_version="", + ) + + assert ( 
+ pinned.source, + pinned.requirement, + latest.source, + latest.requirement, + ) == ( + "pypi-version", + "codeclone==2.0.0b3", + "pypi-latest", + "codeclone", + ) diff --git a/tests/test_golden_v2.py b/tests/test_golden_v2.py index 3a95188..fcce405 100644 --- a/tests/test_golden_v2.py +++ b/tests/test_golden_v2.py @@ -1,3 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + from __future__ import annotations import json diff --git a/tests/test_html_report.py b/tests/test_html_report.py index eae09b9..e6f52fe 100644 --- a/tests/test_html_report.py +++ b/tests/test_html_report.py @@ -1,12 +1,24 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + import importlib import json from collections.abc import Callable from pathlib import Path -from typing import Any +from typing import Any, cast import pytest -from codeclone.contracts import CACHE_VERSION, DOCS_URL, ISSUES_URL, REPOSITORY_URL +from codeclone.contracts import ( + CACHE_VERSION, + DOCS_URL, + ISSUES_URL, + REPORT_SCHEMA_VERSION, + REPOSITORY_URL, +) from codeclone.errors import FileProcessingError from codeclone.html_report import ( _FileCache, @@ -23,7 +35,11 @@ Suggestion, ) from codeclone.report import build_block_group_facts -from codeclone.report.json_contract import build_report_document +from codeclone.report.json_contract import ( + build_report_document, + clone_group_id, + structural_group_id, +) from codeclone.report.serialize import render_json_report_document from tests._report_fixtures import ( REPEATED_ASSERT_SOURCE, @@ -552,6 +568,78 @@ def test_html_report_structural_findings_why_modal_renders_examples( assert needle in html +def test_html_report_finding_cards_expose_stable_anchor_ids(tmp_path: Path) -> None: + f1 = tmp_path / "a.py" + f2 = tmp_path / "b.py" + f1.write_text("def alpha():\n return 1\n", "utf-8") + f2.write_text("def beta():\n return 1\n", "utf-8") + clone_key = "pkg.mod:dup" + finding_key = "anchor-key" + html = build_html_report( + func_groups={ + clone_key: [ + { + "qualname": "pkg.mod:alpha", + "filepath": str(f1), + "start_line": 1, + "end_line": 2, + }, + { + "qualname": "pkg.mod:beta", + "filepath": str(f2), + "start_line": 1, + "end_line": 2, + }, + ] + }, + block_groups={}, + segment_groups={}, + structural_findings=[ + StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key=finding_key, + signature={ + "calls": "1", + "has_loop": "0", + "has_try": "0", + "nested_if": "0", + "raises": "0", + "stmt_seq": "Expr,Return", + "terminal": "return_const", + }, + items=( + StructuralFindingOccurrence( + 
finding_kind="duplicated_branches", + finding_key=finding_key, + file_path=str(f1), + qualname="pkg.mod:alpha", + start=1, + end=2, + signature={"stmt_seq": "Expr,Return"}, + ), + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key=finding_key, + file_path=str(f2), + qualname="pkg.mod:beta", + start=1, + end=2, + signature={"stmt_seq": "Expr,Return"}, + ), + ), + ) + ], + ) + clone_id = clone_group_id("function", clone_key) + finding_id = structural_group_id("duplicated_branches", finding_key) + _assert_html_contains( + html, + f'id="finding-{clone_id}"', + f'id="finding-{finding_id}"', + f'data-finding-id="{finding_id}"', + ) + + def test_html_report_block_group_includes_match_basis_and_compact_key() -> None: group_key = _REPEATED_BLOCK_GROUP_KEY html = build_html_report( @@ -875,7 +963,7 @@ def test_html_report_provenance_summary_uses_card_like_badges( 'class="prov-badge prov-badge--neutral"', 'verified', 'Baseline', - '2.1', + f'{REPORT_SCHEMA_VERSION}', 'Schema', '1', 'Fingerprint', @@ -1530,7 +1618,12 @@ def test_html_report_metrics_risk_branches() -> None: assert 'stroke="var(--error)"' in html assert "Cycles: 1; max dependency depth: 4." in html assert "5 candidates total; 2 high-confidence items; 0 suppressed." in html - assert 'Dead Code2' in html + assert '