author | Alan Dipert
<alan@dipert.org> 2025-10-08 06:35:39 UTC |
committer | Alan Dipert
<alan@dipert.org> 2025-10-08 06:35:39 UTC |
parent | 008e9d1151f0e127d9cff9d458c52a08ee1be4ae |
tools/check_links.py | +53 | -4 |
import pathlib
import shutil
import subprocess
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict
from typing import Iterable

ROOT = pathlib.Path(__file__).resolve().parent.parent
MD_DIR = ROOT / "md"

# Candidate cmark executables, tried in order. ``pathlib.Path`` entries are
# explicit filesystem locations; plain strings are command names that are
# looked up on PATH. The annotation is quoted so it is never evaluated at
# import time.
CMARK_CANDIDATES: "Iterable[pathlib.Path | str]" = (
    ROOT / "bin" / "cmark",
    pathlib.Path("/usr/bin/cmark-gfm"),
    pathlib.Path("/usr/bin/cmark"),
    "cmark-gfm",
    "cmark",
)


def _cmark_cmd() -> list[str]:
    """Return the argv prefix for the first usable cmark executable.

    Candidates from ``CMARK_CANDIDATES`` are tried in order. A
    ``pathlib.Path`` candidate is accepted when it is an existing file; a
    string candidate is accepted only when ``shutil.which`` finds it on PATH.

    Returns:
        A one-element list holding the executable to run.

    Raises:
        RuntimeError: If none of the candidates can be found.
    """
    for candidate in CMARK_CANDIDATES:
        if isinstance(candidate, pathlib.Path):
            if candidate.is_file():
                return [str(candidate)]
            continue
        # Bare command names must be verified too; returning them unchecked
        # would shadow every later candidate and make the error below
        # unreachable.
        resolved = shutil.which(candidate)
        if resolved is not None:
            return [resolved]
    raise RuntimeError("Unable to locate cmark-gfm executable")


def collect_links(md_path: pathlib.Path) -> set[str]:
    """Collect the absolute http(s) link and image URLs of a Markdown file.

    The file is rendered to XML by cmark and the ``destination`` attribute of
    every ``link`` and ``image`` node is inspected.

    Args:
        md_path: Path of the Markdown file to scan.

    Returns:
        The set of destinations that start with ``http://`` or ``https://``.

    Raises:
        RuntimeError: If no cmark executable is found, cmark exits with a
            non-zero status, or its output is not well-formed XML.
    """
    # NOTE(review): ``--extension table`` is a cmark-gfm option; confirm that
    # the plain ``cmark`` candidates accept it.
    proc = subprocess.run(
        _cmark_cmd()
        + [
            "--to",
            "xml",
            "--extension",
            "table",
            "--unsafe",
        ],
        input=md_path.read_bytes(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"cmark failed for {md_path}: {proc.stderr.decode('utf-8', 'ignore')}"
        )

    try:
        root = ET.fromstring(proc.stdout.decode("utf-8"))
    except ET.ParseError as exc:  # pragma: no cover
        raise RuntimeError(
            f"Unable to parse cmark output for {md_path}: {exc}"
        ) from exc

    urls: set[str] = set()
    for node in root.iter():
        # Drop a "{namespace}" prefix so only the local tag name is compared.
        tag = node.tag.split('}')[-1]
        if tag in {"link", "image"}:
            dest = node.attrib.get("destination", "").strip()
            if dest.startswith(("http://", "https://")):
                urls.add(dest)
    return urls