git » homepage.git » commit 0e190f6

Add Updates RSS feed generation

author Alan Dipert
2025-10-20 04:15:27 UTC
committer Alan Dipert
2025-10-20 04:15:27 UTC
parent 98a937688ad1078ea714be5bb49d71f1e60cd24c

Add Updates RSS feed generation

Makefile +8 -1
md/Home.md +2 -0
tools/gen_updates_rss.py +287 -0
tools/validate_rss.py +54 -0
tpl/head.html +1 -0

diff --git a/Makefile b/Makefile
index 647dc2f..f014fcf 100644
--- a/Makefile
+++ b/Makefile
@@ -13,11 +13,14 @@ BUILD_PAGE := tools/build_page.py
 GEN_INDEX := tools/gen_index.py
 INDEX_HTML := $(OUT)/Index.html
 DEPLOY_HOST ?= arsien23i2@dreamhost:tailrecursion.com/~alan
+SITE_BASE_URL ?= https://tailrecursion.com/~alan/
 
 MD_FILES := $(shell find $(SRC) -type f -name '*.md' | LC_ALL=C sort)
 HTML := $(patsubst $(SRC)/%.md,$(OUT)/%.html,$(MD_FILES))
+HOME_MD := md/Home.md
+UPDATES_FEED := $(OUT)/updates.xml
 
-all: assets $(OUT)/style.css $(HTML) $(INDEX_HTML)
+all: assets $(OUT)/style.css $(HTML) $(INDEX_HTML) $(UPDATES_FEED)
 
 $(OUT)/style.css: $(CSS)
 	mkdir -p $(OUT)
@@ -37,6 +40,10 @@ assets:
 $(INDEX_HTML): $(HTML) $(GEN_INDEX) $(HEAD) $(FOOT) $(BUILDINFO) | $(OUT)
 	$(PYTHON) $(GEN_INDEX) $@ $(SRC)
 
+# Generate the Updates RSS feed from $(HOME_MD), then run validate_rss.py on
+# the result so that a malformed or empty feed fails the build.
+$(UPDATES_FEED): $(HOME_MD) tools/gen_updates_rss.py tools/mdlink2html.py tools/validate_rss.py | $(OUT)
+	SITE_BASE_URL="$(SITE_BASE_URL)" $(PYTHON) tools/gen_updates_rss.py "$(HOME_MD)" "$@"
+	$(PYTHON) tools/validate_rss.py "$@"
+
 deploy: check-git-clean assets all
 	@if [ -z "$(DEPLOY_HOST)" ]; then \
 		echo "DEPLOY_HOST is not set"; \
diff --git a/md/Home.md b/md/Home.md
index 6d7eb36..90a0b07 100644
--- a/md/Home.md
+++ b/md/Home.md
@@ -20,6 +20,8 @@ The best way to contact me is by email at [alan@tailrecursion.com](mailto:alan@t
 
 Updates
 -------
+[Subscribe to the Updates RSS feed](./updates.xml).
+
 | Date       | Note                                                                                                                                       |
 |:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------|
 | 2025-10-08 | Expanded [WellReadUndergrad](./WellReadUndergrad.md) with linked references to the primary works, terms, and quotations in the canon my dad assembled. |
diff --git a/tools/gen_updates_rss.py b/tools/gen_updates_rss.py
new file mode 100644
index 0000000..7e1622e
--- /dev/null
+++ b/tools/gen_updates_rss.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""Generate an RSS feed from the Updates section of Home.md using the cmark AST."""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import html
+import os
+import re
+import shlex
+import subprocess
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from email.utils import format_datetime
+from pathlib import Path
+from typing import Iterable, Iterator, List, Optional
+from urllib.parse import urljoin
+from xml.etree import ElementTree as ET
+from xml.sax.saxutils import escape as xml_escape
+
+# Default for --base-url; taken from the SITE_BASE_URL environment variable when set.
+DEFAULT_BASE_URL = os.environ.get("SITE_BASE_URL", "https://tailrecursion.com/~alan/")
+# Defaults for --feed-title and --feed-description.
+DEFAULT_FEED_TITLE = "Alan Dipert — Updates"
+DEFAULT_FEED_DESCRIPTION = "Recent updates from Alan's homepage."
+
+# Namespace of the XML parsed by run_cmark_ast(); NS is the "{uri}" prefix that
+# ElementTree places in front of every namespaced tag name.
+NS_URI = "http://commonmark.org/xml/1.0"
+NS = f"{{{NS_URI}}}"
+# Path suffixes that rewrite_destination() leaves unchanged.
+KNOWN_EXTENSIONS = (".html", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".css", ".js", ".zip")
+# Matches "<...>" runs; html_to_text() deletes every match.
+TAG_PATTERN = re.compile(r"<[^>]+>")
+
+
+@dataclass
+class Update:
+    """One row of the Updates table, in the form render_feed() writes as an RSS <item>."""
+
+    date: datetime  # written as the item's <pubDate>
+    title: str  # written (XML-escaped) as the item's <title>
+    link: str  # written (XML-escaped) as the item's <link>
+    html: str  # written inside a CDATA section as the item's <description>
+    guid: str  # written as the item's <guid isPermaLink="false">
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("home_markdown", help="Path to md/Home.md (or equivalent source)")
+    parser.add_argument("output", help="Destination RSS file (e.g. out/updates.xml)")
+    parser.add_argument("--base-url", default=DEFAULT_BASE_URL, help="Public site root used to absolutize links")
+    parser.add_argument("--feed-title", default=DEFAULT_FEED_TITLE, help="Title element for the RSS channel")
+    parser.add_argument(
+        "--feed-description",
+        default=DEFAULT_FEED_DESCRIPTION,
+        help="Description element for the RSS channel",
+    )
+    parser.add_argument(
+        "--channel-link",
+        default=None,
+        help="Homepage URL for the channel link element (defaults to base-url + Home.html)",
+    )
+    return parser.parse_args()
+
+
+def run_cmark_ast(home_md: Path) -> ET.Element:
+    md2html = os.environ.get("MD2HTML", "/usr/bin/cmark-gfm")
+    env_flags = shlex.split(os.environ.get("CMARK_FLAGS", ""))
+
+    flags: list[str] = []
+    skip_next = False
+    for flag in env_flags:
+        if skip_next:
+            skip_next = False
+            continue
+        if flag in {"--to", "-t"}:
+            skip_next = True
+            continue
+        flags.append(flag)
+
+    cmd = [md2html, "--to", "xml", "--extension", "table", "--validate-utf8", *flags, str(home_md)]
+    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
+    return ET.fromstring(result.stdout)
+
+
+def local_name(tag: str) -> str:
+    return tag.split("}", 1)[-1]
+
+
+def find_updates_table(root: ET.Element) -> ET.Element:
+    children = list(root)
+    for idx, child in enumerate(children):
+        if child.tag == f"{NS}heading":
+            text = "".join(c.text or "" for c in child.findall(f"{NS}text"))
+            if text.strip().lower() == "updates":
+                for sibling in children[idx + 1 :]:
+                    if sibling.tag == f"{NS}table":
+                        return sibling
+                break
+    raise RuntimeError("Unable to locate Updates table in Home.md")
+
+
+def iter_rows(table: ET.Element) -> Iterator[tuple[ET.Element, ET.Element]]:
+    for node in table.findall(f"{NS}table_row"):
+        cells = node.findall(f"{NS}table_cell")
+        if len(cells) != 2:
+            continue
+        yield cells[0], cells[1]
+
+
+def collect_links(node: ET.Element, base_url: str) -> Iterable[str]:
+    if node.tag == f"{NS}link":
+        dest = node.attrib.get("destination", "")
+        yield absolutize_url(rewrite_destination(dest), base_url)
+    for child in node:
+        yield from collect_links(child, base_url)
+
+
+def absolutize_url(url: str, base_url: str) -> str:
+    lowered = url.lower()
+    if "://" in url or lowered.startswith("mailto:") or url.startswith("#"):
+        return url
+    return urljoin(base_url, url)
+
+
+def rewrite_destination(url: str) -> str:
+    if not url:
+        return url
+    frag = ""
+    if "#" in url:
+        path, frag = url.split("#", 1)
+        frag = "#" + frag
+    else:
+        path = url
+
+    lowered = path.lower()
+    if lowered.endswith(".md"):
+        path = path[:-3] + ".html"
+    elif lowered.endswith(KNOWN_EXTENSIONS):
+        pass
+    elif "." not in path and path and not path.endswith("/"):
+        path = f"{path}.html"
+
+    return f"{path}{frag}"
+
+
+def render_children(node: ET.Element, base_url: str) -> str:
+    parts: list[str] = []
+    if node.text and node.text.strip():
+        parts.append(html.escape(node.text))
+    for child in node:
+        parts.append(render_node(child, base_url))
+        if child.tail and child.tail.strip():
+            parts.append(html.escape(child.tail))
+    return "".join(parts)
+
+
+def render_node(node: ET.Element, base_url: str) -> str:
+    name = local_name(node.tag)
+
+    if name == "text":
+        return html.escape(node.text or "")
+    if name in {"softbreak", "linebreak"}:
+        return "<br />"
+    if name == "code":
+        return f"<code>{html.escape(node.text or '')}</code>"
+    if name == "strong":
+        return f"<strong>{render_children(node, base_url)}</strong>"
+    if name == "emph":
+        return f"<em>{render_children(node, base_url)}</em>"
+    if name == "link":
+        href = absolutize_url(rewrite_destination(node.attrib.get("destination", "")), base_url)
+        inner = render_children(node, base_url)
+        title = node.attrib.get("title", "")
+        title_attr = f' title="{html.escape(title)}"' if title else ""
+        return f'<a href="{html.escape(href)}"{title_attr}>{inner}</a>'
+    if name == "code_span":
+        return f"<code>{html.escape(node.text or '')}</code>"
+    if name == "paragraph":
+        return f"<p>{render_children(node, base_url)}</p>"
+
+    return render_children(node, base_url)
+
+
+def render_cell_html(cell: ET.Element, base_url: str) -> str:
+    paragraphs = [child for child in cell if local_name(child.tag) == "paragraph"]
+    if paragraphs:
+        return "".join(render_node(p, base_url) for p in paragraphs)
+
+    body = render_children(cell, base_url)
+    return f"<p>{body}</p>" if body else "<p></p>"
+
+
+def parse_date(cell: ET.Element) -> datetime:
+    text = "".join(part.strip() for part in cell.itertext()).strip()
+    try:
+        return datetime.strptime(text, "%Y-%m-%d").replace(tzinfo=timezone.utc)
+    except ValueError as exc:
+        raise ValueError(f"Invalid date in Updates table: {text!r}") from exc
+
+
+def build_updates(home_md: Path, base_url: str) -> List[Update]:
+    ast_root = run_cmark_ast(home_md)
+    table = find_updates_table(ast_root)
+
+    updates: list[Update] = []
+    for date_cell, note_cell in iter_rows(table):
+        pub_date = parse_date(date_cell)
+        html_body = render_cell_html(note_cell, base_url)
+        plain = html_to_text(html_body)
+        guid_source = f"{pub_date.date().isoformat()}\n{plain}".encode("utf-8")
+        guid = f"urn:homepage-updates:{hashlib.sha1(guid_source).hexdigest()}"
+
+        links = list(collect_links(note_cell, base_url))
+        link = links[0] if links else urljoin(base_url, "Home.html")
+
+        updates.append(Update(pub_date, plain, link, html_body, guid))
+
+    return updates
+
+
+def render_feed(
+    updates: List[Update],
+    base_url: str,
+    feed_title: str,
+    feed_description: str,
+    channel_link: Optional[str],
+) -> str:
+    if not updates:
+        raise RuntimeError("No updates found; RSS feed would be empty.")
+
+    updates.sort(key=lambda u: u.date, reverse=True)
+
+    last_modified = updates[0].date
+    resolved_channel_link = channel_link or urljoin(base_url, "Home.html")
+
+    lines = [
+        '<?xml version="1.0" encoding="UTF-8"?>',
+        '<rss version="2.0">',
+        "  <channel>",
+        f"    <title>{xml_escape(feed_title)}</title>",
+        f"    <link>{xml_escape(resolved_channel_link)}</link>",
+        f"    <description>{xml_escape(feed_description)}</description>",
+        f"    <lastBuildDate>{format_datetime(last_modified)}</lastBuildDate>",
+    ]
+
+    for update in updates:
+        lines.extend(
+            [
+                "    <item>",
+                f"      <title>{xml_escape(update.title)}</title>",
+                f"      <link>{xml_escape(update.link)}</link>",
+                f"      <guid isPermaLink=\"false\">{xml_escape(update.guid)}</guid>",
+                f"      <pubDate>{format_datetime(update.date)}</pubDate>",
+                "      <description><![CDATA[",
+                f"{update.html}",
+                "      ]]></description>",
+                "    </item>",
+            ]
+        )
+
+    lines.append("  </channel>")
+    lines.append("</rss>")
+    lines.append("")
+    return "\n".join(lines)
+
+
+def html_to_text(fragment: str) -> str:
+    stripped = TAG_PATTERN.sub("", fragment)
+    return " ".join(stripped.split())
+
+
+def main() -> None:
+    args = parse_args()
+    base_url = args.base_url.rstrip("/") + "/"
+
+    home_md = Path(args.home_markdown)
+    updates = build_updates(home_md, base_url)
+    feed_xml = render_feed(
+        updates,
+        base_url,
+        args.feed_title,
+        args.feed_description,
+        args.channel_link,
+    )
+
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(feed_xml, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/validate_rss.py b/tools/validate_rss.py
new file mode 100644
index 0000000..6462173
--- /dev/null
+++ b/tools/validate_rss.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+"""Ensure an RSS feed is well-formed and has the expected top-level structure."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("feed", help="Path to the RSS XML file to validate")
+    return parser.parse_args()
+
+
+def validate_feed(path: Path) -> None:
+    try:
+        tree = ET.parse(path)
+    except ET.ParseError as exc:
+        raise SystemExit(f"{path}: XML parse error: {exc}") from exc
+
+    root = tree.getroot()
+    tag = root.tag.lower()
+    if not tag.endswith("rss"):
+        raise SystemExit(f"{path}: expected root <rss>, found <{root.tag}>")
+
+    channel = None
+    for child in root:
+        child_tag = child.tag.lower()
+        if child_tag.endswith("channel"):
+            channel = child
+            break
+
+    if channel is None:
+        raise SystemExit(f"{path}: missing <channel> element under <rss>")
+
+    items = [elem for elem in channel if elem.tag.lower().endswith("item")]
+    if not items:
+        raise SystemExit(f"{path}: RSS feed has no <item> entries")
+
+
+def main() -> None:
+    args = parse_args()
+    path = Path(args.feed)
+    if not path.is_file():
+        raise SystemExit(f"{path}: file not found")
+
+    validate_feed(path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tpl/head.html b/tpl/head.html
index 7d09603..ec5b3ca 100644
--- a/tpl/head.html
+++ b/tpl/head.html
@@ -5,6 +5,7 @@
   <meta name="viewport" content="width=device-width,initial-scale=1">
   <title>@TITLE@</title>
   <link rel="stylesheet" href="@ROOT@style.css">
+  <link rel="alternate" type="application/rss+xml" title="Alan Dipert — Updates" href="@ROOT@updates.xml">
   <script async src="https://www.googletagmanager.com/gtag/js?id=G-XCMVL5K44X"></script>
   <script>
     window.dataLayer = window.dataLayer || [];