# Zivro/Report/append_sources_to_report.py

#!/usr/bin/env python3
"""
Append source files to a markdown report and save as a new file.

Example:
    python3 Report/append_sources_to_report.py \
      --input Report/zivro-open-project-report.md \
      --output Report/zivro-open-project-report-with-code.md \
      --base .
"""

from __future__ import annotations

import argparse
from pathlib import Path
from typing import Iterable


# File suffixes (lower-case, leading dot included) that are attached by
# default; parse_args() uses the sorted set as the default of --extensions.
DEFAULT_EXTENSIONS = {
    ".zig",
    ".zon",
    ".json",
    ".toml",
    ".yaml",
    ".yml",
    ".md",
    ".txt",
    ".py",
    ".puml",
}

# Directory names skipped during the scan by default; parse_args() uses the
# sorted set as the default of --exclude-dir.
DEFAULT_EXCLUDE_DIRS = {
    ".git",
    "zig-out",
    "zig-cache",
    ".zig-cache",
    ".cursor",
    "mcps",
}

# File names that are never attached. iter_files() compares them with the
# bare file name; there is no command-line option that changes this set.
DEFAULT_EXCLUDE_FILES = {
    ".DS_Store",
}


def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Returns:
        Namespace with ``input``, ``output``, ``base``, ``include``,
        ``extensions``, ``exclude_dir`` and ``max_bytes`` attributes.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Adds source code listings to the end of a markdown report and "
            "writes the result to a new markdown file."
        )
    )

    # One (flag, keyword-arguments) entry per option, in the order the
    # options appear in --help.
    option_specs = [
        ("--input", dict(required=True, help="Path to source markdown report")),
        ("--output", dict(required=True, help="Path to output markdown report")),
        (
            "--base",
            dict(
                default=".",
                help="Project root to scan for source files (default: current directory)",
            ),
        ),
        (
            "--include",
            dict(
                nargs="*",
                default=["src", "build.zig", "build.zig.zon"],
                help=(
                    "Files/directories (relative to --base) to include in "
                    "appendix scan. Default: src build.zig build.zig.zon"
                ),
            ),
        ),
        (
            "--extensions",
            dict(
                nargs="*",
                default=sorted(DEFAULT_EXTENSIONS),
                help=(
                    "Allowed file extensions (e.g. .zig .md). If empty, all "
                    "file extensions are allowed."
                ),
            ),
        ),
        (
            "--exclude-dir",
            dict(
                nargs="*",
                default=sorted(DEFAULT_EXCLUDE_DIRS),
                help="Directory names to exclude recursively",
            ),
        ),
        (
            "--max-bytes",
            dict(
                type=int,
                default=1_000_000,
                help="Skip files larger than this size in bytes (default: 1_000_000)",
            ),
        ),
    ]
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()


def is_text_file(path: Path) -> bool:
    """Report whether ``path`` is readable and contains no NUL byte.

    The file is read in full. A path that cannot be read (any ``OSError``)
    is reported as not being a text file.
    """
    try:
        payload = path.read_bytes()
    except OSError:
        return False
    else:
        # A NUL byte is taken as the marker of binary content.
        return b"\x00" not in payload


def iter_files(
    base: Path,
    include_paths: Iterable[str],
    extensions: set[str],
    exclude_dirs: set[str],
    max_bytes: int,
) -> list[Path]:
    """Collect the files to attach, de-duplicated and in a stable order.

    Args:
        base: Project root; every entry of ``include_paths`` is joined to it.
        include_paths: Files or directories to scan. Entries that do not
            exist are skipped. A file entry is considered directly, a
            directory entry is walked recursively.
        extensions: Allowed lower-case suffixes (with leading dot). An empty
            set allows every suffix.
        exclude_dirs: Names that exclude a path found by the recursive walk
            when they occur among its components below ``base``.
        max_bytes: Files larger than this many bytes are skipped.

    Returns:
        Unique paths sorted by their POSIX path relative to ``base``. A path
        that is not located under ``base`` is sorted by its absolute path.
    """
    root = base.resolve()
    files: list[Path] = []

    def add_file(path: Path) -> None:
        # Apply the per-file filters: kind, name, suffix, size, content.
        if not path.is_file():
            return
        if path.name in DEFAULT_EXCLUDE_FILES:
            return
        if extensions and path.suffix.lower() not in extensions:
            return
        try:
            size = path.stat().st_size
        except OSError:
            return
        if size > max_bytes:
            return
        if not is_text_file(path):
            return
        files.append(path)

    def is_excluded(path: Path, scan_root: Path) -> bool:
        # Only components below the project root may exclude a path.
        # Testing the absolute path would drop every file whenever the
        # project itself is checked out under a directory that happens to
        # carry an excluded name (for example ".../mcps/project").
        try:
            parts = path.relative_to(root).parts
        except ValueError:
            # The include entry resolved outside the project root.
            parts = path.relative_to(scan_root).parts
        return any(part in exclude_dirs for part in parts)

    def sort_key(path: Path) -> str:
        # relative_to() raises ValueError for a path outside the project
        # root; fall back to the absolute path so sorting cannot fail.
        try:
            return path.relative_to(root).as_posix()
        except ValueError:
            return path.as_posix()

    for rel in include_paths:
        item = (root / rel).resolve()
        if not item.exists():
            continue
        if item.is_file():
            add_file(item)
            continue
        for path in item.rglob("*"):
            if is_excluded(path, item):
                continue
            add_file(path)

    return sorted(set(files), key=sort_key)


def language_for(path: Path) -> str:
    """Return the code-fence language tag for ``path``.

    The lookup uses the lower-cased file suffix. Suffixes without an entry
    in the table yield an empty string, i.e. an untagged fence.
    """
    fence_languages = {
        ".zig": "zig",
        ".py": "python",
        ".yaml": "yaml",
        ".yml": "yaml",
        ".json": "json",
        ".toml": "toml",
        ".md": "markdown",
    }
    return fence_languages.get(path.suffix.lower(), "")


def _fence_for(code: str) -> str:
    """Return a backtick fence that nothing inside ``code`` can close."""
    fence = "```"
    # A run of k backticks contains every shorter run, so growing the fence
    # until it no longer occurs in the text ends one backtick past the
    # longest run present (and never below the minimum of three).
    while fence in code:
        fence += "`"
    return fence


def main() -> int:
    """Append the source listings to the report and write the new file.

    Reads the input report, collects the files selected by the command-line
    options, renders one fenced listing per file and writes report plus
    appendix to the output path. The input report is not modified.

    Returns:
        ``0`` on success.

    Raises:
        FileNotFoundError: The input report does not exist.
        ValueError: ``--input`` and ``--output`` resolve to the same file.
    """
    args = parse_args()

    input_path = Path(args.input).resolve()
    output_path = Path(args.output).resolve()
    base_path = Path(args.base).resolve()

    if not input_path.exists():
        raise FileNotFoundError(f"Input report not found: {input_path}")
    if input_path == output_path:
        raise ValueError("--input and --output must be different files")

    report_text = input_path.read_text(encoding="utf-8")

    # Accept extensions given with or without the leading dot.
    extensions = {e.lower() if e.startswith(".") else f".{e.lower()}" for e in args.extensions}
    exclude_dirs = set(args.exclude_dir)

    files = iter_files(
        base=base_path,
        include_paths=args.include,
        extensions=extensions,
        exclude_dirs=exclude_dirs,
        max_bytes=args.max_bytes,
    )
    # The output of an earlier run may lie inside a scanned directory; the
    # file being written must never be embedded into itself.
    files = [path for path in files if path != output_path]

    appendix_lines: list[str] = [
        "",
        "---",
        "",
        "## Приложение A. Исходные тексты",
        "",
        f"Сформировано автоматически скриптом `Report/append_sources_to_report.py` "
        f"(файлов: {len(files)}).",
        "",
    ]

    for idx, path in enumerate(files, start=1):
        try:
            rel = path.relative_to(base_path).as_posix()
        except ValueError:
            # Not located under --base: label the listing with the absolute
            # path instead of aborting the whole report.
            rel = path.as_posix()
        lang = language_for(path)
        code = path.read_text(encoding="utf-8", errors="replace")
        # Attached files (markdown in particular) may contain ``` fences of
        # their own; a fixed three-backtick fence would be closed by them
        # and break the rest of the report.
        fence = _fence_for(code)

        appendix_lines.append(f"### A.{idx}. `{rel}`")
        appendix_lines.append("")
        appendix_lines.append(f"{fence}{lang}")
        appendix_lines.append(code.rstrip("\n"))
        appendix_lines.append(fence)
        appendix_lines.append("")

    output_text = report_text.rstrip() + "\n" + "\n".join(appendix_lines)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(output_text, encoding="utf-8")

    print(f"Created: {output_path}")
    print(f"Input report preserved: {input_path}")
    print(f"Attached files: {len(files)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())