git » homepage.git » commit 915276a

Parse Markdown links via cmark

author Alan Dipert
2025-10-08 06:35:39 UTC
committer Alan Dipert
2025-10-08 06:35:39 UTC
parent 008e9d1151f0e127d9cff9d458c52a08ee1be4ae

Parse Markdown links via cmark

tools/check_links.py +53 -4

diff --git a/tools/check_links.py b/tools/check_links.py
index 55bec55..da0a6f2 100755
--- a/tools/check_links.py
+++ b/tools/check_links.py
@@ -3,19 +3,68 @@
 from __future__ import annotations
 
 import pathlib
-import re
 import subprocess
 import sys
+import xml.etree.ElementTree as ET
 from collections import defaultdict
+from typing import Iterable
 
 ROOT = pathlib.Path(__file__).resolve().parent.parent
 MD_DIR = ROOT / "md"
-URL_RE = re.compile(r"https?://[\w\-./?%#=&+:~]+", re.IGNORECASE)
+# Candidate cmark executables, iterated in order by _cmark_cmd().
+# pathlib.Path entries are explicit file locations; plain strings are bare
+# command names.
+CMARK_CANDIDATES: Iterable[pathlib.Path | str] = (
+    ROOT / "bin" / "cmark",
+    pathlib.Path("/usr/bin/cmark-gfm"),
+    pathlib.Path("/usr/bin/cmark"),
+    "cmark-gfm",
+    "cmark",
+)
+
+
+def _cmark_cmd() -> list[str]:
+    """Return the argv prefix for the first usable cmark executable.
+
+    Entries of ``CMARK_CANDIDATES`` are tried in order.  ``pathlib.Path``
+    entries must exist as regular files; bare command names must resolve
+    on ``PATH``.
+
+    Raises:
+        RuntimeError: If none of the candidates is available.
+    """
+    # Imported locally so this function does not depend on the module's
+    # top-level import block being changed.
+    import shutil
+
+    for candidate in CMARK_CANDIDATES:
+        if isinstance(candidate, pathlib.Path):
+            if candidate.is_file():
+                return [str(candidate)]
+            continue
+        # A bare name is only usable when it resolves on PATH.  Returning it
+        # unchecked would make the remaining candidates and the RuntimeError
+        # below unreachable, and a missing binary would instead surface as a
+        # FileNotFoundError from subprocess.run in the caller.
+        resolved = shutil.which(candidate)
+        if resolved is not None:
+            return [resolved]
+    raise RuntimeError("Unable to locate cmark-gfm executable")
 
 
 def collect_links(md_path: pathlib.Path) -> set[str]:
-    text = md_path.read_text(encoding="utf-8")
-    return set(URL_RE.findall(text))
+    """Return the distinct http(s) link and image destinations in *md_path*.
+
+    The file's bytes are piped to the executable chosen by ``_cmark_cmd``
+    with ``--to xml``, and the resulting tree is scanned for ``link`` and
+    ``image`` nodes whose ``destination`` attribute is an http(s) URL.
+
+    Raises:
+        RuntimeError: If cmark exits with a non-zero status or its output
+            cannot be parsed as XML.
+    """
+    proc = subprocess.run(
+        _cmark_cmd()
+        + [
+            "--to",
+            "xml",
+            "--extension",
+            "table",
+            "--unsafe",
+        ],
+        input=md_path.read_bytes(),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        check=False,
+    )
+    if proc.returncode != 0:
+        raise RuntimeError(
+            f"cmark failed for {md_path}: {proc.stderr.decode('utf-8', 'ignore')}"
+        )
+
+    try:
+        root = ET.fromstring(proc.stdout.decode("utf-8"))
+    except ET.ParseError as exc:  # pragma: no cover
+        raise RuntimeError(
+            f"Unable to parse cmark output for {md_path}: {exc}"
+        ) from exc
+
+    urls: set[str] = set()
+    for node in root.iter():
+        # ElementTree reports namespaced tags as "{uri}name"; keep only the
+        # local name so the comparison works with or without a namespace.
+        tag = node.tag.split('}')[-1]
+        if tag in {"link", "image"}:
+            dest = node.attrib.get("destination", "").strip()
+            # URI schemes are case-insensitive, so "HTTP://..." must be
+            # collected as well; the destination itself is stored unchanged.
+            if dest.lower().startswith(("http://", "https://")):
+                urls.add(dest)
+    return urls
 
 
 def check_url(url: str) -> tuple[bool, str, str]: