| author | Alan Dipert
<alan@tailrecursion.com> 2025-10-20 04:15:27 UTC |
| committer | Alan Dipert
<alan@tailrecursion.com> 2025-10-20 04:15:27 UTC |
| parent | 98a937688ad1078ea714be5bb49d71f1e60cd24c |
| Makefile | +8 | -1 |
| md/Home.md | +2 | -0 |
| tools/gen_updates_rss.py | +287 | -0 |
| tools/validate_rss.py | +54 | -0 |
| tpl/head.html | +1 | -0 |
diff --git a/Makefile b/Makefile index 647dc2f..f014fcf 100644 --- a/Makefile +++ b/Makefile @@ -13,11 +13,14 @@ BUILD_PAGE := tools/build_page.py GEN_INDEX := tools/gen_index.py INDEX_HTML := $(OUT)/Index.html DEPLOY_HOST ?= arsien23i2@dreamhost:tailrecursion.com/~alan +SITE_BASE_URL ?= https://tailrecursion.com/~alan/ MD_FILES := $(shell find $(SRC) -type f -name '*.md' | LC_ALL=C sort) HTML := $(patsubst $(SRC)/%.md,$(OUT)/%.html,$(MD_FILES)) +HOME_MD := md/Home.md +UPDATES_FEED := $(OUT)/updates.xml -all: assets $(OUT)/style.css $(HTML) $(INDEX_HTML) +all: assets $(OUT)/style.css $(HTML) $(INDEX_HTML) $(UPDATES_FEED) $(OUT)/style.css: $(CSS) mkdir -p $(OUT) @@ -37,6 +40,10 @@ assets: $(INDEX_HTML): $(HTML) $(GEN_INDEX) $(HEAD) $(FOOT) $(BUILDINFO) | $(OUT) $(PYTHON) $(GEN_INDEX) $@ $(SRC) +$(UPDATES_FEED): $(HOME_MD) tools/gen_updates_rss.py tools/mdlink2html.py tools/validate_rss.py | $(OUT) + SITE_BASE_URL="$(SITE_BASE_URL)" $(PYTHON) tools/gen_updates_rss.py "$(HOME_MD)" "$@" + $(PYTHON) tools/validate_rss.py "$@" + deploy: check-git-clean assets all @if [ -z "$(DEPLOY_HOST)" ]; then \ echo "DEPLOY_HOST is not set"; \ diff --git a/md/Home.md b/md/Home.md index 6d7eb36..90a0b07 100644 --- a/md/Home.md +++ b/md/Home.md @@ -20,6 +20,8 @@ The best way to contact me is by email at [alan@tailrecursion.com](mailto:alan@t Updates ------- +[Subscribe to the Updates RSS feed](./updates.xml). + | Date | Note | |:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------| | 2025-10-08 | Expanded [WellReadUndergrad](./WellReadUndergrad.md) with linked references to the primary works, terms, and quotations in the canon my dad assembled. 
|
diff --git a/tools/gen_updates_rss.py b/tools/gen_updates_rss.py
new file mode 100644
index 0000000..7e1622e
--- /dev/null
+++ b/tools/gen_updates_rss.py
@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+"""Generate an RSS feed from the Updates section of Home.md using the cmark AST."""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import html
+import os
+import re
+import shlex
+import subprocess
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from email.utils import format_datetime
+from pathlib import Path
+from typing import Iterable, Iterator, List, Optional
+from urllib.parse import urljoin
+from xml.etree import ElementTree as ET
+from xml.sax.saxutils import escape as xml_escape
+
+DEFAULT_BASE_URL = os.environ.get("SITE_BASE_URL", "https://tailrecursion.com/~alan/")
+DEFAULT_FEED_TITLE = "Alan Dipert — Updates"
+DEFAULT_FEED_DESCRIPTION = "Recent updates from Alan's homepage."
+
+NS_URI = "http://commonmark.org/xml/1.0"
+NS = f"{{{NS_URI}}}"
+KNOWN_EXTENSIONS = (".html", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".css", ".js", ".zip")
+TAG_PATTERN = re.compile(r"<[^>]+>")
+
+
+@dataclass
+class Update:
+    """One row of the Updates table, ready to be emitted as an RSS item."""
+
+    date: datetime
+    title: str
+    link: str
+    html: str
+    guid: str
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line arguments of the feed generator."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("home_markdown", help="Path to md/Home.md (or equivalent source)")
+    parser.add_argument("output", help="Destination RSS file (e.g. out/updates.xml)")
+    parser.add_argument("--base-url", default=DEFAULT_BASE_URL, help="Public site root used to absolutize links")
+    parser.add_argument("--feed-title", default=DEFAULT_FEED_TITLE, help="Title element for the RSS channel")
+    parser.add_argument(
+        "--feed-description",
+        default=DEFAULT_FEED_DESCRIPTION,
+        help="Description element for the RSS channel",
+    )
+    parser.add_argument(
+        "--channel-link",
+        default=None,
+        help="Homepage URL for the channel link element (defaults to base-url + Home.html)",
+    )
+    return parser.parse_args()
+
+
+def run_cmark_ast(home_md: Path) -> ET.Element:
+    """Run cmark on *home_md* and return the root element of its XML AST.
+
+    The binary is taken from ``MD2HTML`` and extra flags from ``CMARK_FLAGS``;
+    any ``--to``/``-t`` flag and its value are dropped so the output stays XML.
+    """
+    md2html = os.environ.get("MD2HTML", "/usr/bin/cmark-gfm")
+    env_flags = shlex.split(os.environ.get("CMARK_FLAGS", ""))
+
+    flags: list[str] = []
+    skip_next = False
+    for flag in env_flags:
+        if skip_next:
+            skip_next = False
+            continue
+        if flag in {"--to", "-t"}:
+            skip_next = True
+            continue
+        flags.append(flag)
+
+    cmd = [md2html, "--to", "xml", "--extension", "table", "--validate-utf8", *flags, str(home_md)]
+    # Decode explicitly as UTF-8 (input is validated above) instead of trusting the locale.
+    result = subprocess.run(cmd, stdout=subprocess.PIPE, encoding="utf-8", check=True)
+    return ET.fromstring(result.stdout)
+
+
+def local_name(tag: str) -> str:
+    """Return *tag* without its ``{namespace}`` prefix."""
+    return tag.split("}", 1)[-1]
+
+
+def find_updates_table(root: ET.Element) -> ET.Element:
+    """Return the first table after the "Updates" heading, or raise RuntimeError."""
+    children = list(root)
+    for idx, child in enumerate(children):
+        if child.tag == f"{NS}heading":
+            text = "".join(c.text or "" for c in child.findall(f"{NS}text"))
+            if text.strip().lower() == "updates":
+                for sibling in children[idx + 1 :]:
+                    if sibling.tag == f"{NS}table":
+                        return sibling
+                break
+    raise RuntimeError("Unable to locate Updates table in Home.md")
+
+
+def iter_rows(table: ET.Element) -> Iterator[tuple[ET.Element, ET.Element]]:
+    """Yield the (date, note) cells of every two-cell ``table_row`` in *table*."""
+    for node in table.findall(f"{NS}table_row"):
+        cells = node.findall(f"{NS}table_cell")
+        if len(cells) != 2:
+            continue
+        yield cells[0], cells[1]
+
+
+def collect_links(node: ET.Element, base_url: str) -> Iterable[str]:
+    """Yield the absolutized destination of every link under *node*, in document order."""
+    if node.tag == f"{NS}link":
+        dest = node.attrib.get("destination", "")
+        yield absolutize_url(rewrite_destination(dest), base_url)
+    for child in node:
+        yield from collect_links(child, base_url)
+
+
+def absolutize_url(url: str, base_url: str) -> str:
+    """Join *url* onto *base_url* unless it is absolute, a mailto link, or a bare fragment."""
+    lowered = url.lower()
+    if "://" in url or lowered.startswith("mailto:") or url.startswith("#"):
+        return url
+    return urljoin(base_url, url)
+
+
+def rewrite_destination(url: str) -> str:
+    """Map a site-relative Markdown link onto its built ``.html`` page, keeping any fragment."""
+    if not url:
+        return url
+    # Absolute and mailto URLs are not pages of this site; never touch their paths.
+    if "://" in url or url.lower().startswith("mailto:"):
+        return url
+    frag = ""
+    if "#" in url:
+        path, frag = url.split("#", 1)
+        frag = "#" + frag
+    else:
+        path = url
+
+    lowered = path.lower()
+    if lowered.endswith(".md"):
+        path = path[:-3] + ".html"
+    elif lowered.endswith(KNOWN_EXTENSIONS):
+        pass
+    elif "." not in path and path and not path.endswith("/"):
+        path = f"{path}.html"
+
+    return f"{path}{frag}"
+
+
+def render_children(node: ET.Element, base_url: str) -> str:
+    """Render the non-blank text and all child nodes of *node* as an HTML fragment."""
+    parts: list[str] = []
+    if node.text and node.text.strip():
+        parts.append(html.escape(node.text))
+    for child in node:
+        parts.append(render_node(child, base_url))
+        if child.tail and child.tail.strip():
+            parts.append(html.escape(child.tail))
+    return "".join(parts)
+
+
+def render_node(node: ET.Element, base_url: str) -> str:
+    """Render one AST node as HTML; unrecognized node types render only their children."""
+    name = local_name(node.tag)
+
+    if name == "text":
+        return html.escape(node.text or "")
+    if name in {"softbreak", "linebreak"}:
+        return "<br />"
+    if name == "code":
+        return f"<code>{html.escape(node.text or '')}</code>"
+    if name == "strong":
+        return f"<strong>{render_children(node, base_url)}</strong>"
+    if name == "emph":
+        return f"<em>{render_children(node, base_url)}</em>"
+    if name == "link":
+        href = absolutize_url(rewrite_destination(node.attrib.get("destination", "")), base_url)
+        inner = render_children(node, base_url)
+        title = node.attrib.get("title", "")
+        title_attr = f' title="{html.escape(title)}"' if title else ""
+        return f'<a href="{html.escape(href)}"{title_attr}>{inner}</a>'
+    if name == "code_span":
+        return f"<code>{html.escape(node.text or '')}</code>"
+    if name == "paragraph":
+        return f"<p>{render_children(node, base_url)}</p>"
+
+    return render_children(node, base_url)
+
+
+def render_cell_html(cell: ET.Element, base_url: str) -> str:
+    """Render a table cell as one or more ``<p>`` elements (an empty ``<p>`` if it has no content)."""
+    paragraphs = [child for child in cell if local_name(child.tag) == "paragraph"]
+    if paragraphs:
+        return "".join(render_node(p, base_url) for p in paragraphs)
+
+    body = render_children(cell, base_url)
+    return f"<p>{body}</p>" if body else "<p></p>"
+
+
+def parse_date(cell: ET.Element) -> datetime:
+    """Parse the cell text as ``YYYY-MM-DD`` into a UTC datetime, or raise ValueError."""
+    text = "".join(part.strip() for part in cell.itertext()).strip()
+    try:
+        return datetime.strptime(text, "%Y-%m-%d").replace(tzinfo=timezone.utc)
+    except ValueError as exc:
+        raise ValueError(f"Invalid date in Updates table: {text!r}") from exc
+
+
+def build_updates(home_md: Path, base_url: str) -> List[Update]:
+    """Build one Update per row of the Updates table found in *home_md*."""
+    ast_root = run_cmark_ast(home_md)
+    table = find_updates_table(ast_root)
+
+    updates: list[Update] = []
+    for date_cell, note_cell in iter_rows(table):
+        pub_date = parse_date(date_cell)
+        html_body = render_cell_html(note_cell, base_url)
+        plain = html_to_text(html_body)
+        guid_source = f"{pub_date.date().isoformat()}\n{plain}".encode("utf-8")
+        guid = f"urn:homepage-updates:{hashlib.sha1(guid_source).hexdigest()}"
+
+        links = list(collect_links(note_cell, base_url))
+        link = links[0] if links else urljoin(base_url, "Home.html")
+
+        # plain still carries HTML entities; decode them for the title, which
+        # render_feed XML-escapes again. The GUID input above stays unchanged.
+        updates.append(Update(pub_date, html.unescape(plain), link, html_body, guid))
+
+    return updates
+
+
+def render_feed(
+    updates: List[Update],
+    base_url: str,
+    feed_title: str,
+    feed_description: str,
+    channel_link: Optional[str],
+) -> str:
+    """Serialize *updates*, newest first, as an RSS 2.0 document; raise RuntimeError if empty."""
+    if not updates:
+        raise RuntimeError("No updates found; RSS feed would be empty.")
+
+    # Sort a copy so the caller's list is left untouched.
+    updates = sorted(updates, key=lambda u: u.date, reverse=True)
+
+    last_modified = updates[0].date
+    resolved_channel_link = channel_link or urljoin(base_url, "Home.html")
+
+    lines = [
+        '<?xml version="1.0" encoding="UTF-8"?>',
+        '<rss version="2.0">',
+        " <channel>",
+        f" <title>{xml_escape(feed_title)}</title>",
+        f" <link>{xml_escape(resolved_channel_link)}</link>",
+        f" <description>{xml_escape(feed_description)}</description>",
+        f" <lastBuildDate>{format_datetime(last_modified)}</lastBuildDate>",
+    ]
+
+    for update in updates:
+        lines.extend(
+            [
+                " <item>",
+                f" <title>{xml_escape(update.title)}</title>",
+                f" <link>{xml_escape(update.link)}</link>",
+                f" <guid isPermaLink=\"false\">{xml_escape(update.guid)}</guid>",
+                f" <pubDate>{format_datetime(update.date)}</pubDate>",
+                " <description><![CDATA[",
+                f"{update.html}",
+                " ]]></description>",
+                " </item>",
+            ]
+        )
+
+    lines.append(" </channel>")
+    lines.append("</rss>")
+    lines.append("")
+    return "\n".join(lines)
+
+
+def html_to_text(fragment: str) -> str:
+    """Strip tags from *fragment* and collapse whitespace; HTML entities are left encoded."""
+    stripped = TAG_PATTERN.sub("", fragment)
+    return " ".join(stripped.split())
+
+
+def main() -> None:
+    """Build the feed from the Markdown source and write it to the output path."""
+    args = parse_args()
+    base_url = args.base_url.rstrip("/") + "/"
+
+    home_md = Path(args.home_markdown)
+    updates = build_updates(home_md, base_url)
+    feed_xml = render_feed(
+        updates,
+        base_url,
+        args.feed_title,
+        args.feed_description,
+        args.channel_link,
+    )
+
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(feed_xml, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/validate_rss.py b/tools/validate_rss.py
new file mode 100644
index 0000000..6462173
--- /dev/null
+++ b/tools/validate_rss.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""Ensure an RSS feed is well-formed and has the expected top-level structure."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line arguments of the validator."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("feed", help="Path to the RSS XML file to validate")
+    return parser.parse_args()
+
+
+def validate_feed(path: Path) -> None:
+    """Exit with a message unless *path* parses as XML with an rss root, a channel, and an item."""
+    try:
+        tree = ET.parse(path)
+    except ET.ParseError as exc:
+        raise SystemExit(f"{path}: XML parse error: {exc}") from exc
+
+    root = tree.getroot()
+    tag = root.tag.lower()
+    if not tag.endswith("rss"):
+        raise SystemExit(f"{path}: expected root <rss>, found <{root.tag}>")
+
+    channel = None
+    for child in root:
+        child_tag = child.tag.lower()
+        if child_tag.endswith("channel"):
+            channel = child
+            break
+
+    if channel is None:
+        raise SystemExit(f"{path}: missing <channel> element under <rss>")
+
+    items = [elem for elem in channel if elem.tag.lower().endswith("item")]
+    if not items:
+        raise SystemExit(f"{path}: RSS feed has no <item> entries")
+
+
+def main() -> None:
+    """Validate the feed file named on the command line, exiting non-zero on failure."""
+    args = parse_args()
+    path = Path(args.feed)
+    if not path.is_file():
+        raise SystemExit(f"{path}: file not found")
+
+    validate_feed(path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tpl/head.html b/tpl/head.html index 7d09603..ec5b3ca 100644 --- a/tpl/head.html +++ b/tpl/head.html @@ -5,6 +5,7 @@ <meta name="viewport" content="width=device-width,initial-scale=1"> <title>@TITLE@</title> <link rel="stylesheet" href="@ROOT@style.css"> + <link rel="alternate" type="application/rss+xml" title="Alan Dipert — Updates" href="@ROOT@updates.xml"> <script async src="https://www.googletagmanager.com/gtag/js?id=G-XCMVL5K44X"></script> <script> window.dataLayer = window.dataLayer || [];