| author | Alan Dipert <alan@tailrecursion.com> 2025-12-29 17:01:34 UTC |
| committer | Alan Dipert <alan@tailrecursion.com> 2025-12-29 17:01:34 UTC |
| parent | 7e4f5123b0e836f9f4a386823c1e55b889f55b9c |
| tools/build_page.py | +38 | -7 |
diff --git a/tools/build_page.py b/tools/build_page.py index 7c18a8e..63a1bde 100755 --- a/tools/build_page.py +++ b/tools/build_page.py @@ -6,12 +6,12 @@ from __future__ import annotations import argparse import html import os -import re import shlex import subprocess import sys +import xml.etree.ElementTree as ET from pathlib import Path -from typing import Dict, List, Set +from typing import Dict, Iterable, List, Set TOOLS_DIR = Path(__file__).resolve().parent if str(TOOLS_DIR) not in sys.path: @@ -36,7 +36,8 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -_LINK_PATTERN = re.compile(r"\]\(([^)]+)\)") +NS_URI = "http://commonmark.org/xml/1.0" +NS = f"{{{NS_URI}}}" def find_md_root(input_md: Path) -> Path: @@ -53,8 +54,36 @@ def is_temp_path(path: Path) -> bool: return False -def extract_links(md_path: Path) -> List[str]: - return _LINK_PATTERN.findall(md_path.read_text()) +def _strip_to_flags() -> List[str]: + env_flags = shlex.split(os.environ.get("CMARK_FLAGS", "")) + + flags: list[str] = [] + skip_next = False + for flag in env_flags: + if skip_next: + skip_next = False + continue + if flag in {"--to", "-t"}: + skip_next = True + continue + flags.append(flag) + return flags + + +def run_cmark_ast(md_path: Path, md2html: str) -> ET.Element: + flags = ["--to", "xml", "--extension", "table", "--validate-utf8", *_strip_to_flags()] + cmd = [md2html, *flags, str(md_path)] + result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True) + return ET.fromstring(result.stdout) + + +def iter_link_destinations(node: ET.Element) -> Iterable[str]: + if node.tag == f"{NS}link": + dest = node.attrib.get("destination", "") + if dest: + yield dest + for child in node: + yield from iter_link_destinations(child) def normalize_target(current_md: Path, raw_url: str, md_root: Path) -> Path | None: @@ -92,11 +121,13 @@ def normalize_target(current_md: Path, raw_url: str, md_root: Path) -> Path | No def compute_backlinks(md_root: Path) -> Dict[str, 
List[str]]: backlinks: Dict[str, Set[str]] = {} + md2html = os.environ.get("MD2HTML", "/usr/bin/cmark-gfm") for md_path in md_root.rglob("*.md"): rel_source = md_path.relative_to(md_root) if is_temp_path(rel_source): continue - for raw_url in extract_links(md_path): + ast_root = run_cmark_ast(md_path, md2html) + for raw_url in iter_link_destinations(ast_root): target = normalize_target(md_path, raw_url, md_root) if target is None or target == rel_source: continue @@ -178,7 +209,7 @@ def main() -> None: items.append(f"<li><a href=\"{href}\">{display}</a></li>") backlinks_html = ( '<div class="backlinks">\n' - " <h3>Links to this page</h3>\n" + " <h3>Backlinks</h3>\n" " <ul>\n" f" {'\n '.join(items)}\n" " </ul>\n"