git » homepage.git » commit ce7c549

Parse backlinks via cmark AST and label as Backlinks

author Alan Dipert
2025-12-29 17:01:34 UTC
committer Alan Dipert
2025-12-29 17:01:34 UTC
parent 7e4f5123b0e836f9f4a386823c1e55b889f55b9c

Parse backlinks via cmark AST and label as Backlinks

tools/build_page.py +38 -7

diff --git a/tools/build_page.py b/tools/build_page.py
index 7c18a8e..63a1bde 100755
--- a/tools/build_page.py
+++ b/tools/build_page.py
@@ -6,12 +6,12 @@ from __future__ import annotations
 import argparse
 import html
 import os
-import re
 import shlex
 import subprocess
 import sys
+import xml.etree.ElementTree as ET
 from pathlib import Path
-from typing import Dict, List, Set
+from typing import Dict, Iterable, List, Set
 
 TOOLS_DIR = Path(__file__).resolve().parent
 if str(TOOLS_DIR) not in sys.path:
@@ -36,7 +36,8 @@ def parse_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
-_LINK_PATTERN = re.compile(r"\]\(([^)]+)\)")
+# Namespace URI used to match elements of the XML AST, and the "{uri}" prefix
+# form that ElementTree puts in front of namespaced tag names.
+NS_URI = "http://commonmark.org/xml/1.0"
+NS = f"{{{NS_URI}}}"
 
 
 def find_md_root(input_md: Path) -> Path:
@@ -53,8 +54,36 @@ def is_temp_path(path: Path) -> bool:
     return False
 
 
-def extract_links(md_path: Path) -> List[str]:
-    return _LINK_PATTERN.findall(md_path.read_text())
+def _strip_to_flags() -> List[str]:
+    """Return the ``CMARK_FLAGS`` tokens with every output-format option removed.
+
+    The environment variable (empty when unset) is split with ``shlex.split``.
+    Each ``--to`` / ``-t`` token is dropped together with the token that
+    follows it; all other tokens are kept in their original order.
+    """
+    tokens = iter(shlex.split(os.environ.get("CMARK_FLAGS", "")))
+    kept: list[str] = []
+    for token in tokens:
+        if token in ("--to", "-t"):
+            # Consume the option's value as well; a trailing option with no
+            # value is simply dropped.
+            next(tokens, None)
+        else:
+            kept.append(token)
+    return kept
+
+
+def run_cmark_ast(md_path: Path, md2html: str) -> ET.Element:
+    """Render *md_path* to XML with the *md2html* executable and parse it.
+
+    The command is ``md2html --to xml --extension table --validate-utf8``
+    followed by the flags returned by ``_strip_to_flags()`` and the file path.
+
+    Args:
+        md_path: Markdown file passed to the converter as its last argument.
+        md2html: Path or name of the converter executable.
+
+    Returns:
+        The root element of the parsed XML output.
+
+    Raises:
+        subprocess.CalledProcessError: The converter exited with a non-zero
+            status (``check=True``).
+        xml.etree.ElementTree.ParseError: The output is not well-formed XML.
+    """
+    flags = ["--to", "xml", "--extension", "table", "--validate-utf8", *_strip_to_flags()]
+    cmd = [md2html, *flags, str(md_path)]
+    # Capture raw bytes rather than text: with text=True the output would be
+    # decoded using the locale's preferred encoding, which can corrupt or
+    # reject non-ASCII content. Given bytes, the XML parser determines the
+    # encoding from the document itself.
+    result = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
+    return ET.fromstring(result.stdout)
+
+
+def iter_link_destinations(node: ET.Element) -> Iterable[str]:
+    """Yield the ``destination`` of every link element at or below *node*.
+
+    Elements are visited in document order, *node* itself included. Links
+    whose ``destination`` attribute is missing or empty are skipped.
+    """
+    # Element.iter walks the subtree itself, so no Python-level recursion is
+    # needed and deeply nested documents cannot exhaust the recursion limit.
+    for link in node.iter(f"{NS}link"):
+        dest = link.get("destination", "")
+        if dest:
+            yield dest
 
 
 def normalize_target(current_md: Path, raw_url: str, md_root: Path) -> Path | None:
@@ -92,11 +121,13 @@ def normalize_target(current_md: Path, raw_url: str, md_root: Path) -> Path | No
 
 def compute_backlinks(md_root: Path) -> Dict[str, List[str]]:
     backlinks: Dict[str, Set[str]] = {}
+    md2html = os.environ.get("MD2HTML", "/usr/bin/cmark-gfm")
     for md_path in md_root.rglob("*.md"):
         rel_source = md_path.relative_to(md_root)
         if is_temp_path(rel_source):
             continue
-        for raw_url in extract_links(md_path):
+        ast_root = run_cmark_ast(md_path, md2html)
+        for raw_url in iter_link_destinations(ast_root):
             target = normalize_target(md_path, raw_url, md_root)
             if target is None or target == rel_source:
                 continue
@@ -178,7 +209,7 @@ def main() -> None:
             items.append(f"<li><a href=\"{href}\">{display}</a></li>")
         backlinks_html = (
             '<div class="backlinks">\n'
-            "  <h3>Links to this page</h3>\n"
+            "  <h3>Backlinks</h3>\n"
             "  <ul>\n"
             f"    {'\n    '.join(items)}\n"
             "  </ul>\n"