draugnorak/scripts/audit_untranslated_strings.py

#!/usr/bin/env python3
from __future__ import annotations

import re
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple

# Repository root: the directory one level above the directory holding this script.
ROOT = Path(__file__).resolve().parents[1]
# Text file of finding keys to suppress; read by load_allowlist().
ALLOWLIST_PATH = ROOT / "scripts" / "i18n_audit_allowlist.txt"

# Any path with one of these names among its components (relative to ROOT) is
# excluded when walking the source roots.
# NOTE(review): "libstorm-nvgt" is also used as a source root in
# iter_nvgt_files(); with this entry present every file under it is skipped --
# confirm which of the two is intended.
SKIP_DIR_NAMES = {".git", "bloodshed", "docs", "skills", "nvgt-git", "libstorm-nvgt"}
# File names excluded when walking the source roots.
SKIP_FILE_NAMES = {"crash.log"}

# An `insert_last` call is only audited when the lower-cased name of its
# receiver contains at least one of these substrings.
INSERT_LAST_CONTEXT_HINTS = (
    "option",
    "label",
    "line",
    "prompt",
    "instruction",
    "intro",
    "reward",
    "message",
    "title",
    "menu",
)

# Call prefixes (function name plus opening parenthesis) whose presence in an
# expression marks that expression as already translated.
TRANSLATION_WRAPPERS = (
    "tr(",
    "trf(",
    "trn(",
    "i18n_translate_speech_message(",
    "i18n_lookup_key_with_fallback(",
    "speech_history_transform_message(",
    "get_barricade_option_text(",
    "i18n_text(",
)

# Function call checks for call arguments that must be translation-wrapped when they
# contain literals. Keep this conservative and focused on user-facing text paths.
# Maps a function name to the zero-based indexes of the arguments to inspect.
ARG_CHECKS: Dict[str, List[int]] = {
    "screen_reader_speak": [0],
    "menu_run_simple": [0],
    "text_reader": [0, 1],
    "text_reader_lines": [1],
    "text_reader_file": [1],
    "file_viewer": [0, 1],
    "file_viewer_lines": [1],
    "file_viewer_file": [1],
}

# Variable names whose assigned expressions are audited for raw literals by
# check_assignment_literals().
ASSIGNMENT_LHS_CHECKS = (
    "intro_text",
)


class Finding:
    """One suspected untranslated string.

    Attributes:
        path: File in which the expression was found.
        line: 1-based line number of the call or assignment.
        context: Short label describing where the literal appeared.
        expression: Offending expression with surrounding whitespace removed.
    """

    def __init__(self, path: Path, line: int, context: str, expression: str):
        # The expression is normalised once here so every consumer sees the
        # trimmed form.
        trimmed = expression.strip()
        self.path, self.line = path, line
        self.context, self.expression = context, trimmed

    def key(self) -> str:
        """Return the ``relative/path:line:context`` identifier used by the allowlist."""
        relative = self.path.relative_to(ROOT).as_posix()
        return f"{relative}:{self.line}:{self.context}"


def iter_nvgt_files() -> List[Path]:
    """Return the sorted, de-duplicated list of ``.nvgt`` files to audit.

    Two fixed entry-point files are included when they exist (they are not
    subject to the skip lists). Every ``*.nvgt`` file below the source roots is
    added unless one of its path components (relative to ROOT) is in
    SKIP_DIR_NAMES or its file name is in SKIP_FILE_NAMES.

    NOTE(review): "libstorm-nvgt" is both a source root here and a member of
    SKIP_DIR_NAMES, so nothing below it is ever collected -- confirm intent.
    """
    collected: Set[Path] = set()

    for entry in (ROOT / "draugnorak.nvgt", ROOT / "src" / "sound_settings.nvgt"):
        if entry.exists():
            collected.add(entry)

    for source_root in (ROOT / "src", ROOT / "libstorm-nvgt"):
        if not source_root.exists():
            continue
        collected.update(
            candidate
            for candidate in source_root.rglob("*.nvgt")
            if candidate.name not in SKIP_FILE_NAMES
            and not any(part in SKIP_DIR_NAMES for part in candidate.relative_to(ROOT).parts)
        )

    return sorted(collected)


def load_allowlist() -> Set[str]:
    """Read the allowlist file and return its entries.

    Each line is stripped; blank lines and lines starting with ``#`` are
    ignored. A missing allowlist file yields an empty set.
    """
    if not ALLOWLIST_PATH.exists():
        return set()

    content = ALLOWLIST_PATH.read_text(encoding="utf-8", errors="replace")
    stripped_lines = (raw_line.strip() for raw_line in content.splitlines())
    return {entry for entry in stripped_lines if entry and not entry.startswith("#")}


def is_identifier_char(ch: str) -> bool:
    """Return True when *ch* is an underscore or is alphanumeric."""
    if ch == "_":
        return True
    return ch.isalnum()


def read_identifier_backward(text: str, before_index: int) -> str:
    """Return the identifier that ends at or before *before_index*.

    Whitespace immediately to the left of *before_index* (inclusive) is
    skipped first; the run of identifier characters found after that is
    returned. An empty string is returned when no identifier is there.
    """
    last = before_index
    while last >= 0 and text[last].isspace():
        last -= 1

    first = last
    while first >= 0 and is_identifier_char(text[first]):
        first -= 1

    # `first` now sits one position left of the identifier (or equals `last`
    # when there is none, which makes the slice empty).
    return text[first + 1 : last + 1]


def find_matching_paren(text: str, open_index: int) -> int:
    """Return the index of the ``)`` that balances the ``(`` at *open_index*.

    Parentheses inside double-quoted string literals are ignored; a backslash
    inside a literal escapes the character after it. Returns -1 when the end of
    *text* is reached without the nesting depth returning to zero.
    """
    nesting = 0
    pos = open_index
    limit = len(text)

    while pos < limit:
        symbol = text[pos]
        if symbol == '"':
            # Jump over the whole literal; a backslash consumes two characters.
            pos += 1
            while pos < limit and text[pos] != '"':
                pos += 2 if text[pos] == "\\" else 1
        elif symbol == "(":
            nesting += 1
        elif symbol == ")":
            nesting -= 1
            if nesting == 0:
                return pos
        pos += 1

    return -1


def split_top_level(expr: str, delimiter: str) -> List[str]:
    """Split *expr* on *delimiter* wherever it occurs outside any nesting.

    A delimiter is ignored while inside a double-quoted string literal or
    inside ``()``, ``[]`` or ``{}``. Each bracket kind keeps its own depth
    counter, and a stray closer never drives a counter below zero. The result
    always contains at least one element (the whole input when nothing splits).
    """
    depth = {"(": 0, "[": 0, "{": 0}
    opener_for = {")": "(", "]": "[", "}": "{"}
    pieces: List[str] = []
    piece_start = 0
    inside_string = False
    escaped = False

    for position, symbol in enumerate(expr):
        if inside_string:
            if escaped:
                escaped = False
            elif symbol == "\\":
                escaped = True
            elif symbol == '"':
                inside_string = False
        elif symbol == '"':
            inside_string = True
        elif symbol in depth:
            depth[symbol] += 1
        elif symbol in opener_for:
            opener = opener_for[symbol]
            depth[opener] = max(0, depth[opener] - 1)
        elif symbol == delimiter and not any(depth.values()):
            pieces.append(expr[piece_start:position])
            piece_start = position + 1

    pieces.append(expr[piece_start:])
    return pieces


def line_number_for_index(text: str, index: int) -> int:
    """Return the 1-based line number of the character at *index* in *text*."""
    newlines_before = text.count("\n", 0, index)
    return 1 + newlines_before


def has_string_literal(expr: str) -> bool:
    """Return True when *expr* contains at least one terminated string literal."""
    return bool(extract_string_literals(expr))


def extract_string_literals(expr: str) -> List[str]:
    """Return the contents of every terminated double-quoted literal in *expr*.

    Inside a literal a backslash is dropped and the character after it is kept
    verbatim (so ``\\"`` yields ``"`` and ``\\n`` yields ``n``). A literal that
    is still open when *expr* ends is discarded.
    """
    found: List[str] = []
    size = len(expr)
    pos = 0

    while pos < size:
        if expr[pos] != '"':
            pos += 1
            continue

        pos += 1  # step past the opening quote
        pieces: List[str] = []
        closed = False
        while pos < size:
            symbol = expr[pos]
            if symbol == "\\":
                # Keep the escaped character, drop the backslash itself.
                if pos + 1 < size:
                    pieces.append(expr[pos + 1])
                pos += 2
                continue
            pos += 1
            if symbol == '"':
                closed = True
                break
            pieces.append(symbol)

        if closed:
            found.append("".join(pieces))

    return found


def has_meaningful_literal(expr: str) -> bool:
    """Return True when *expr* holds a literal that looks like user-facing copy.

    A literal is ignored when it is empty, consists only of whitespace and
    punctuation, or consists only of lower-case letters, digits, ``_``, ``.``
    and ``-`` (treated as a translation key or identifier).
    """

    def looks_like_copy(literal: str) -> bool:
        if literal == "":
            return False
        if re.fullmatch(r"[\s:,.!?;()\-\[\]/+]*", literal):
            return False
        # Translation keys and identifiers are not user-facing copy.
        return re.fullmatch(r"[a-z0-9_.-]+", literal) is None

    return any(looks_like_copy(literal) for literal in extract_string_literals(expr))


def is_translated_expression(expr: str) -> bool:
    """Return True when *expr* contains a call to one of the translation wrappers.

    Each entry of TRANSLATION_WRAPPERS is a function name followed by ``(``.
    The name must start at an identifier boundary, so ``substr(`` or ``str(``
    no longer count as a call to ``tr(``. Whitespace (including newlines)
    between the name and the opening parenthesis is allowed.

    The match is purely textual: wrapper-like text inside a string literal or
    a method call such as ``obj.tr(`` is still accepted.
    """
    names = [wrapper.rstrip("(").strip() for wrapper in TRANSLATION_WRAPPERS]
    alternatives = "|".join(re.escape(name) for name in names if name)
    if not alternatives:
        return False

    # (?<!\w) rejects matches that are merely the tail of a longer identifier.
    wrapper_call = rf"(?<!\w)(?:{alternatives})\s*\("
    return re.search(wrapper_call, expr) is not None


def should_check_insert_last(receiver: str) -> bool:
    """Return True when the receiver name suggests an array of user-facing text.

    The comparison is case-insensitive and succeeds when any entry of
    INSERT_LAST_CONTEXT_HINTS occurs inside the receiver name. An empty
    receiver is never checked.
    """
    lowered = receiver.lower()
    if lowered == "":
        return False
    for hint in INSERT_LAST_CONTEXT_HINTS:
        if hint in lowered:
            return True
    return False


def add_finding(findings: List[Finding], path: Path, line: int, context: str, expr: str) -> None:
    """Append a new Finding built from the given location and expression to *findings*."""
    record = Finding(path=path, line=line, context=context, expression=expr)
    findings.append(record)


def check_call_args(path: Path, line: int, function_name: str, receiver: str, args: List[str], findings: List[Finding],
                    translated_arrays: Set[str]) -> None:
    """Record a finding for each audited argument that holds a raw literal.

    ``insert_last`` calls are audited on their first argument, but only when
    the receiver name passes should_check_insert_last() and the receiver is not
    listed in *translated_arrays*. Any other function is audited at the
    argument indexes registered for it in ARG_CHECKS; unregistered functions
    are ignored. An argument is reported when it contains a meaningful literal
    and no translation wrapper.
    """
    # Collect (context label, argument expression) pairs first, then apply the
    # same literal test to all of them.
    if function_name == "insert_last":
        auditable = (
            should_check_insert_last(receiver)
            and receiver not in translated_arrays
            and bool(args)
        )
        candidates = [(f"{receiver}.insert_last", args[0])] if auditable else []
    else:
        indexes = ARG_CHECKS.get(function_name) or []
        candidates = [
            (f"{function_name}[{arg_index}]", args[arg_index])
            for arg_index in indexes
            if arg_index < len(args)
        ]

    for context, expr in candidates:
        if has_meaningful_literal(expr) and not is_translated_expression(expr):
            add_finding(findings, path, line, context, expr)


def _assignment_rhs(text: str, start: int) -> Optional[str]:
    """Return the text from *start* up to the first ``;`` outside a string literal.

    The scan is limited to a single line: None is returned when a newline or
    the end of *text* is reached before a terminating ``;``.
    """
    in_string = False
    escape = False
    for index in range(start, len(text)):
        ch = text[index]
        if ch == "\n":
            return None
        if in_string:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == ";":
            return text[start:index]
    return None


def check_assignment_literals(path: Path, text: str, findings: List[Finding]) -> None:
    """Report assignments to audited variables whose value holds a raw literal.

    For every name in ASSIGNMENT_LHS_CHECKS, each ``name = <expr>;`` found in
    *text* is reported (context ``assign:<name>``) when ``<expr>`` contains a
    meaningful literal and no translation wrapper.

    Args:
        path: File the text was read from; stored on each finding.
        text: Full source text to scan.
        findings: List that new findings are appended to.
    """
    for lhs in ASSIGNMENT_LHS_CHECKS:
        # (?!=) keeps `==` comparisons from being read as assignments.
        pattern = re.compile(rf"\b{re.escape(lhs)}\s*=(?!=)\s*")
        for match in pattern.finditer(text):
            # Read the right-hand side with string awareness so that a `;`
            # inside a literal does not cut the expression short.
            expr = _assignment_rhs(text, match.end())
            if expr is None:
                continue
            if not has_meaningful_literal(expr):
                continue
            if is_translated_expression(expr):
                continue
            line = line_number_for_index(text, match.start())
            add_finding(findings, path, line, f"assign:{lhs}", expr)


def scan_file(path: Path) -> List[Finding]:
    """Scan one source file and return every untranslated-string finding in it.

    The file is read as UTF-8 (undecodable bytes are replaced). Audited
    assignments are checked first; the text is then walked looking for
    ``name(`` patterns, and each one that is not followed by ``{`` is passed to
    check_call_args() together with its receiver (the identifier before a
    leading ``.``, if any) and its top-level arguments.

    The walk continues *inside* every parenthesised span, so calls nested in
    ``if (...)`` / ``while (...)`` conditions or in another call's arguments
    are audited too. An opening parenthesis with no match is skipped instead of
    ending the scan, so one unbalanced construct cannot hide the rest of the
    file.

    Known limitation: outside a parenthesised span the walk is not aware of
    string literals, and comments are never recognised.
    """
    findings: List[Finding] = []
    text = path.read_text(encoding="utf-8", errors="replace")
    # Arrays passed to the in-place translation helper anywhere in this file
    # are exempt from the insert_last check.
    translated_arrays = set(re.findall(r"i18n_translate_string_array_in_place\(\s*([A-Za-z_][A-Za-z0-9_]*)\s*\)", text))

    check_assignment_literals(path, text, findings)

    length = len(text)
    # Index of the closing parenthesis of the outermost span currently being
    # walked, or -1 when the walk is outside any span.
    span_end = -1
    i = 0
    while i < length:
        ch = text[i]

        if ch == '"' and i < span_end:
            # Inside a span the string boundaries are the ones
            # find_matching_paren() already used, so literals can be skipped
            # with the same escape rule instead of being read as code.
            i += 1
            while i < length and text[i] != '"':
                i += 2 if text[i] == "\\" else 1
            i += 1
            continue

        if not is_identifier_char(ch):
            i += 1
            continue

        start = i
        while i < length and is_identifier_char(text[i]):
            i += 1
        name = text[start:i]

        j = i
        while j < length and text[j].isspace():
            j += 1

        if j >= length or text[j] != "(":
            continue

        receiver = ""
        k = start - 1
        while k >= 0 and text[k].isspace():
            k -= 1
        if k >= 0 and text[k] == ".":
            receiver = read_identifier_backward(text, k - 1)

        close = find_matching_paren(text, j)
        if close < 0:
            # Unbalanced parenthesis: step past it and keep scanning.
            i = j + 1
            continue

        tail = close + 1
        while tail < length and text[tail].isspace():
            tail += 1
        # `name(...) {` is a function/method declaration or a block statement
        # header, not a call site, so its own arguments are not checked.
        followed_by_block = tail < length and text[tail] == "{"

        if not followed_by_block:
            args = split_top_level(text[j + 1 : close], ",")
            line = line_number_for_index(text, start)
            check_call_args(path, line, name, receiver, args, findings, translated_arrays)

        # Descend into the parenthesised text so nested calls are seen.
        span_end = max(span_end, close)
        i = j + 1

    return findings


def summarize_expression(expr: str) -> str:
    """Collapse whitespace runs in *expr* to single spaces and cap it at 120 characters.

    Longer results are cut to 117 characters followed by ``...``.
    """
    words = expr.split()
    flattened = " ".join(words)
    return flattened if len(flattened) <= 120 else flattened[:117] + "..."


def main() -> int:
    """Run the audit over every source file and print the result.

    Returns:
        0 when no finding remains after applying the allowlist, otherwise 1.
    """
    allowlist = load_allowlist()

    # Gather findings file by file, drop allowlisted ones, and order the rest
    # by path, line and context.
    reportable = sorted(
        (
            finding
            for nvgt_file in iter_nvgt_files()
            for finding in scan_file(nvgt_file)
            if finding.key() not in allowlist
        ),
        key=lambda item: (item.path.as_posix(), item.line, item.context),
    )

    if not reportable:
        print("No untranslated-string violations found.")
        return 0

    print(f"Found {len(reportable)} untranslated-string violations:")
    for finding in reportable:
        location = finding.path.relative_to(ROOT).as_posix()
        summary = summarize_expression(finding.expression)
        print(f"{location}:{finding.line}: {finding.context}: {summary}")

    print("\nAdd approved exceptions to scripts/i18n_audit_allowlist.txt if needed.")
    return 1


# When run as a script, execute the audit and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())