| author | Alan Dipert
<alan@dipert.org> 2025-10-09 00:05:57 UTC |
| committer | Alan Dipert
<alan@dipert.org> 2025-10-09 00:05:57 UTC |
| parent | 530e5d1cce264daa9fecf8c731360c3a71066c5e |
| tools/xml2md.py | +334 | -0 |
#!/usr/bin/env python3
"""Convert CommonMark XML (from cmark-gfm --to xml) back into Markdown."""

from __future__ import annotations

import argparse
import re
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Iterable, List

NAMESPACE = "{http://commonmark.org/xml/1.0}"

INLINE_NEEDS_ESCAPES = re.compile(r'[\\`*_{}\[\]()#+\-!>~|\"]')

# Subset of the characters matched by INLINE_NEEDS_ESCAPES that is actually
# backslash-escaped; the remaining matches are emitted unchanged.
_ESCAPED_CHARS = "\\`*_{}[]()"

_BACKTICK_RUN = re.compile(r"`+")


def strip_tag(tag: str) -> str:
    """Return *tag* without its leading ``{namespace}`` prefix, if any."""
    if tag.startswith("{"):
        return tag.split("}", 1)[1]
    return tag


def _children_named(parent: ET.Element, name: str) -> List[ET.Element]:
    """Return the direct children of *parent* whose local tag name is *name*.

    Matching goes through ``strip_tag`` so namespaced and un-namespaced
    documents are treated the same way as everywhere else in this module.
    """
    return [child for child in parent if strip_tag(child.tag) == name]


def _escape_match(match: re.Match[str]) -> str:
    """Backslash-escape the matched character when it is in ``_ESCAPED_CHARS``."""
    char = match.group(0)
    return f"\\{char}" if char in _ESCAPED_CHARS else char


def escape_text(text: str) -> str:
    """Escape characters in *text* that frequently cause formatting when raw."""
    if not text:
        return ""
    return INLINE_NEEDS_ESCAPES.sub(_escape_match, text)


def _longest_backtick_run(text: str) -> int:
    """Return the length of the longest run of backticks in *text* (0 if none)."""
    return max((len(run.group(0)) for run in _BACKTICK_RUN.finditer(text)), default=0)


def fence_for_code(text: str) -> str:
    """Return an inline-code delimiter one backtick longer than any run in *text*."""
    return "`" * (_longest_backtick_run(text) + 1)


def _render_code_span(code_text: str) -> str:
    """Render *code_text* as an inline code span.

    The content is padded with one space on each side when it starts or ends
    with a backtick (otherwise it would merge with the delimiter), or when it
    both starts and ends with a space without being all spaces (a parser
    strips one such space from each side).
    """
    fence = fence_for_code(code_text)
    touches_fence = code_text.startswith("`") or code_text.endswith("`")
    strippable_spaces = (
        code_text.startswith(" ")
        and code_text.endswith(" ")
        and bool(code_text.strip(" "))
    )
    pad = " " if touches_fence or strippable_spaces else ""
    return f"{fence}{pad}{code_text}{pad}{fence}"


def _parens_balanced(text: str) -> bool:
    """Return True when every ``(`` in *text* is closed by a later ``)``."""
    depth = 0
    for char in text:
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth < 0:
                return False
    return depth == 0


def _link_target(destination: str, title: str) -> str:
    """Return the text that goes between the parentheses of a link or image.

    The destination is wrapped in ``<...>`` when it contains whitespace or
    unbalanced parentheses, or when it is empty but a title follows.
    Backslashes and double quotes in the title are backslash-escaped.
    """
    needs_brackets = (
        any(char.isspace() for char in destination)
        or not _parens_balanced(destination)
        or (not destination and bool(title))
    )
    if needs_brackets:
        escaped = destination.replace("<", "\\<").replace(">", "\\>")
        destination = f"<{escaped}>"
    if not title:
        return destination
    escaped_title = title.replace("\\", "\\\\").replace('"', '\\"')
    return f'{destination} "{escaped_title}"'


def render_inline(node: ET.Element) -> str:
    """Render a single inline element (text, emphasis, link, ...) as Markdown."""
    tag = strip_tag(node.tag)

    if tag == "text":
        return escape_text(node.text or "")
    if tag == "softbreak":
        return "\n"
    if tag == "linebreak":
        # A hard line break needs at least two trailing spaces.
        return "  \n"
    if tag in ("code", "code_span"):
        # "code_span" is a compatibility alias for "code".
        return _render_code_span(node.text or "")
    if tag in ("link", "image"):
        label = render_inlines(node)
        target = _link_target(
            node.attrib.get("destination", ""),
            node.attrib.get("title", ""),
        )
        prefix = "!" if tag == "image" else ""
        return f"{prefix}[{label}]({target})"
    if tag == "strong":
        return f"**{render_inlines(node)}**"
    if tag == "emph":
        return f"*{render_inlines(node)}*"
    if tag == "strikethrough":
        return f"~~{render_inlines(node)}~~"
    if tag == "html_inline":
        return node.text or ""
    if tag == "emoji":
        return node.attrib.get("alias", "")

    # For any unhandled inline, render its children directly.
    parts: List[str] = []
    if node.text:
        parts.append(node.text)
    for child in node:
        parts.append(render_inline(child))
        tail = child.tail or ""
        if tail.strip():
            parts.append(tail)
    return "".join(parts)


def render_inlines(parent: ET.Element) -> str:
    """Render all inline children of *parent* and concatenate the results.

    Whitespace-only text between child elements is ignored; any other stray
    text is escaped and kept.
    """
    parts: List[str] = []
    if parent.text and parent.text.strip():
        parts.append(escape_text(parent.text))
    for child in parent:
        parts.append(render_inline(child))
        tail = child.tail or ""
        if tail.strip():
            parts.append(escape_text(tail))
    return "".join(parts)


def render_code_block(node: ET.Element) -> str:
    """Render a ``code_block`` element as a fenced code block.

    Only the first word of the ``info`` attribute is kept. The fence is at
    least three backticks and always longer than any backtick run in the code.
    """
    info = (node.attrib.get("info") or "").strip()
    info = info.split()[0] if info else ""

    code_text = node.text or ""
    # Keep the content as-is (including trailing blank lines) and only make
    # sure the closing fence starts on its own line.
    if code_text and not code_text.endswith("\n"):
        code_text += "\n"

    fence = "`" * max(3, _longest_backtick_run(code_text) + 1)
    return f"{fence}{info}\n{code_text}{fence}"


def render_html_block(node: ET.Element) -> str:
    """Return the raw content of an ``html_block`` with trailing whitespace removed."""
    parts: List[str] = []
    if node.text:
        parts.append(node.text)
    for child in node:
        parts.append(ET.tostring(child, encoding="unicode"))
        if child.tail:
            parts.append(child.tail)
    return "".join(parts).rstrip()


def render_blockquote(node: ET.Element) -> str:
    """Render the children of *node* as blocks and prefix every line with ``>``."""
    content = render_children_as_blocks(node)
    return "\n".join(
        f"> {line}" if line.strip() else ">" for line in content.splitlines()
    )


def render_list(node: ET.Element, indent: str = "") -> str:
    """Render a ``list`` element as a bullet or numbered Markdown list.

    Reads the ``type``, ``tight``, ``start`` and ``delimiter`` attributes.
    *indent* is prepended to every non-blank line that is emitted.
    """
    list_type = node.attrib.get("type", "bullet")
    tight = node.attrib.get("tight", "false") == "true"
    index = int(node.attrib.get("start", "1"))
    suffix = ")" if node.attrib.get("delimiter", "period") == "paren" else "."
    # Blocks of a tight item sit on consecutive lines; joining them with a
    # space would glue a nested list onto the preceding paragraph.
    separator = "\n" if tight else "\n\n"

    lines: List[str] = []
    for item in _children_named(node, "item"):
        marker = "- " if list_type == "bullet" else f"{index}{suffix} "
        if list_type == "ordered":
            index += 1

        item_blocks = [block for block in map(render_block, item) if block]
        if tight:
            item_blocks = [block.strip() for block in item_blocks]
        content = separator.join(item_blocks).strip()
        content_lines = content.splitlines() or [""]

        continuation = indent + " " * len(marker)
        lines.append(f"{indent}{marker}{content_lines[0]}")
        # Leave blank lines empty instead of emitting whitespace-only lines.
        lines.extend(
            f"{continuation}{extra}" if extra else "" for extra in content_lines[1:]
        )

        if not tight:
            lines.append("")

    # Drop the blank separator that follows the last item of a loose list.
    if not tight and lines and lines[-1] == "":
        lines.pop()

    return "\n".join(lines)


def render_children_as_blocks(parent: ET.Element) -> str:
    """Render each child of *parent* as a block, separated by blank lines."""
    blocks = [block.rstrip() for block in map(render_block, parent) if block]
    return "\n\n".join(blocks)


def render_block(node: ET.Element) -> str:
    """Render one block-level element; unknown containers render their children."""
    tag = strip_tag(node.tag)

    if tag == "paragraph":
        return render_inlines(node).strip("\n")
    if tag == "heading":
        level = int(node.attrib.get("level", "1"))
        content = render_inlines(node).strip()
        hashes = "#" * level
        return f"{hashes} {content}" if content else hashes
    if tag == "code_block":
        return render_code_block(node)
    if tag == "html_block":
        return render_html_block(node)
    if tag == "block_quote":
        return render_blockquote(node)
    if tag == "thematic_break":
        return "---"
    if tag == "list":
        return render_list(node)
    if tag == "custom_block":
        parts = [node.attrib.get("on_enter", ""), node.text or ""]
        for child in node:
            parts.append(ET.tostring(child, encoding="unicode"))
            parts.append(child.tail or "")
        parts.append(node.attrib.get("on_exit", ""))
        return "".join(parts).strip()
    if tag == "table":
        return render_table(node)

    # Fallback: treat as container and render children.
    if len(node):
        return render_children_as_blocks(node)
    return node.text or ""


def _align_marker(align: str, width: int) -> str:
    """Return the delimiter-row cell for *align*, at least three characters wide."""
    dash_count = max(width, 3)
    if align == "left":
        return ":" + "-" * (dash_count - 1)
    if align == "right":
        return "-" * (dash_count - 1) + ":"
    if align == "center":
        return ":" + "-" * (dash_count - 2) + ":"
    return "-" * dash_count


def _format_table_line(cells: List[str]) -> str:
    """Join already-padded *cells* into one pipe-delimited table line."""
    return "| " + " | ".join(cells) + " |"


def render_table(node: ET.Element) -> str:
    """Render a ``table`` element as a pipe table.

    Rows shorter than the widest row are padded with empty cells, and a table
    without a header row gets an empty one. Returns an empty string when the
    table has no cells at all.
    """
    headers: List[str] = []
    aligns: List[str] = []
    rows: List[List[str]] = []

    for child in node:
        tag = strip_tag(child.tag)
        if tag == "table_header":
            headers, aligns = _render_table_row(child)
        elif tag == "table_body":
            rows.extend(
                _render_table_row(row)[0]
                for row in _children_named(child, "table_row")
            )
        elif tag == "table_row":
            rows.append(_render_table_row(child)[0])

    column_count = max((len(cells) for cells in [headers, *rows]), default=0)
    if column_count == 0:
        return ""

    def pad(cells: List[str]) -> List[str]:
        return cells + [""] * (column_count - len(cells))

    headers = pad(headers)
    aligns = pad(aligns)
    rows = [pad(row) for row in rows]
    widths = [max(len(cell) for cell in column) for column in zip(headers, *rows)]

    lines = [
        _format_table_line(
            [cell.ljust(width) for cell, width in zip(headers, widths)]
        ),
        _format_table_line(
            [_align_marker(align, width) for align, width in zip(aligns, widths)]
        ),
    ]
    lines.extend(
        _format_table_line([cell.ljust(width) for cell, width in zip(row, widths)])
        for row in rows
    )
    return "\n".join(lines).rstrip()


def _render_table_row(row: ET.Element) -> tuple[list[str], list[str]]:
    """Return the rendered cells of *row* and each cell's ``align`` attribute."""
    cells: List[str] = []
    aligns: List[str] = []
    for cell in _children_named(row, "table_cell"):
        cells.append(render_table_cell(cell))
        aligns.append(cell.attrib.get("align", ""))
    return cells, aligns


def render_table_cell(cell: ET.Element) -> str:
    """Render the inline content of *cell*, escaping ``|`` so it stays in the cell."""
    content = render_inlines(cell).strip()
    return content.replace("|", "\\|")


def render_document(root: ET.Element) -> str:
    """Render every block under *root*; the result ends with exactly one newline."""
    blocks = [block.rstrip() for block in map(render_block, root) if block]
    markdown = "\n\n".join(blocks)
    return markdown.rstrip() + "\n"


def parse_args() -> argparse.Namespace:
    """Parse the optional positional ``input`` and ``output`` path arguments."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input", nargs="?", help="XML file to read (defaults to stdin)")
    parser.add_argument("output", nargs="?", help="Destination markdown file (defaults to stdout)")
    return parser.parse_args()


def main() -> None:
    """Read XML from a file or stdin and write Markdown to a file or stdout."""
    args = parse_args()
    if args.input:
        # Read and write files as UTF-8 explicitly instead of relying on the
        # platform default encoding.
        xml_data = Path(args.input).read_text(encoding="utf-8")
    else:
        xml_data = sys.stdin.read()

    root = ET.fromstring(xml_data)
    if strip_tag(root.tag) != "document":
        # Accept a wrapper element that contains the document as a child.
        root = next(
            (child for child in root if strip_tag(child.tag) == "document"),
            root,
        )

    markdown = render_document(root)

    if args.output:
        Path(args.output).write_text(markdown, encoding="utf-8")
    else:
        sys.stdout.write(markdown)


if __name__ == "__main__":
    main()