git » homepage.git » commit 6917e12

Add CommonMark XML to Markdown converter

author Alan Dipert
2025-10-09 00:05:57 UTC
committer Alan Dipert
2025-10-09 00:05:57 UTC
parent 530e5d1cce264daa9fecf8c731360c3a71066c5e

Add CommonMark XML to Markdown converter

tools/xml2md.py +334 -0

diff --git a/tools/xml2md.py b/tools/xml2md.py
new file mode 100755
index 0000000..28d449a
--- /dev/null
+++ b/tools/xml2md.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python3
+"""Convert CommonMark XML (from cmark-gfm --to xml) back into Markdown."""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from typing import Iterable, List
+
+# CommonMark XML namespace in ElementTree's "{uri}" prefix form; prepended to
+# local tag names when searching for child elements.
+NAMESPACE = "{http://commonmark.org/xml/1.0}"
+
+# Characters inspected by escape_text(). The pattern matches a superset of the
+# characters that escape_text() actually prefixes with a backslash.
+INLINE_NEEDS_ESCAPES = re.compile(r'[\\`*_{}\[\]()#+\-!>~|\"]')
+
+
+def strip_tag(tag: str) -> str:
+    """Return *tag* without its leading ``{namespace}`` qualifier, if any."""
+    if not tag.startswith("{"):
+        return tag
+    # ElementTree spells qualified names as "{uri}local"; keep the local part.
+    return tag.split("}", 1)[1]
+
+
+def escape_text(text: str) -> str:
+    """Backslash-escape Markdown-significant characters in literal text.
+
+    Returns an empty string for empty (or otherwise falsy) input.
+    """
+    if not text:
+        return ""
+
+    def backslash_if_needed(found: re.Match[str]) -> str:
+        # The pattern matches a superset of characters; only the ones listed
+        # here receive a backslash, the rest pass through unchanged.
+        hit = found.group(0)
+        return f"\\{hit}" if hit in "\\`*_{}[]()" else hit
+
+    return INLINE_NEEDS_ESCAPES.sub(backslash_if_needed, text)
+
+
+def fence_for_code(text: str) -> str:
+    """Return a backtick delimiter that cannot collide with *text*.
+
+    The delimiter is one backtick longer than the longest run of backticks
+    found in *text*, so a single backtick is returned when there are none.
+    """
+    longest = max((len(run) for run in re.findall(r"`+", text)), default=0)
+    # ``longest + 1`` is always at least 1, so no fallback length is needed.
+    return "`" * (longest + 1)
+
+
+def _code_span(code_text: str) -> str:
+    """Wrap literal code text in backticks so that it re-parses unchanged."""
+    fence = fence_for_code(code_text)
+    # A backtick touching the delimiter would lengthen it, and a Markdown
+    # parser strips one space from each side of a space-wrapped span; pad with
+    # one space per side in both situations.
+    touches_fence = code_text.startswith("`") or code_text.endswith("`")
+    space_wrapped = (
+        code_text.startswith(" ")
+        and code_text.endswith(" ")
+        and code_text.strip(" ") != ""
+    )
+    if touches_fence or space_wrapped:
+        code_text = f" {code_text} "
+    return f"{fence}{code_text}{fence}"
+
+
+def _link_target(node: ET.Element) -> str:
+    """Build the ``destination "title"`` part of a link or image."""
+    destination = node.attrib.get("destination", "")
+    title = node.attrib.get("title", "")
+
+    # Bare destinations cannot hold whitespace or stray brackets, and an empty
+    # destination followed by a title would be misread; use the <...> form.
+    if any(ch in destination for ch in " \t()<>") or (title and not destination):
+        inner = destination.replace("<", "\\<").replace(">", "\\>")
+        destination = f"<{inner}>"
+
+    if not title:
+        return destination
+    # The title is emitted inside double quotes, so quotes (and the escape
+    # character itself) must be backslash-escaped.
+    quoted = title.replace("\\", "\\\\").replace('"', '\\"')
+    return f'{destination} "{quoted}"'
+
+
+def render_inline(node: ET.Element) -> str:
+    """Render one inline element of the CommonMark XML tree as Markdown.
+
+    Known tags (text, breaks, code, link, image, emphasis variants, inline
+    HTML, emoji) get dedicated handling. Any other element is rendered by
+    concatenating its own text, its rendered children and their non-blank
+    tails.
+    """
+    tag = strip_tag(node.tag)
+
+    if tag == "text":
+        return escape_text(node.text or "")
+    if tag == "softbreak":
+        return "\n"
+    if tag == "linebreak":
+        return "  \n"
+    if tag in ("code", "code_span"):
+        # "code_span" is accepted as a compatibility alias for "code".
+        return _code_span(node.text or "")
+    if tag == "link":
+        return f"[{render_inlines(node)}]({_link_target(node)})"
+    if tag == "image":
+        return f"![{render_inlines(node)}]({_link_target(node)})"
+    if tag == "strong":
+        return f"**{render_inlines(node)}**"
+    if tag == "emph":
+        return f"*{render_inlines(node)}*"
+    if tag == "strikethrough":
+        return f"~~{render_inlines(node)}~~"
+    if tag == "html_inline":
+        # Raw HTML is passed through without escaping.
+        return node.text or ""
+    if tag == "emoji":
+        return node.attrib.get("alias", "")
+
+    # For any unhandled inline, render its children directly.
+    parts: List[str] = []
+    if node.text:
+        parts.append(node.text)
+    for child in node:
+        parts.append(render_inline(child))
+        tail = child.tail or ""
+        if tail.strip():
+            parts.append(tail)
+    return "".join(parts)
+
+
+def render_inlines(parent: ET.Element) -> str:
+    """Concatenate the Markdown for all inline content inside *parent*.
+
+    Whitespace-only text and tails are skipped; any other direct text is
+    escaped before being emitted.
+    """
+    pieces: List[str] = []
+    leading = parent.text
+    if leading and leading.strip():
+        pieces.append(escape_text(leading))
+    for inline in parent:
+        pieces.append(render_inline(inline))
+        trailing = inline.tail or ""
+        if trailing.strip():
+            pieces.append(escape_text(trailing))
+    return "".join(pieces)
+
+
+def render_code_block(node: ET.Element) -> str:
+    """Render a ``code_block`` element as a fenced code block.
+
+    Only the first word of the ``info`` attribute is kept as the fence's info
+    string. The fence is at least three backticks and always longer than any
+    backtick run inside the code, so the content cannot close the block
+    early. Trailing blank lines of the code are preserved.
+    """
+    info = (node.attrib.get("info") or "").strip()
+    language = info.split()[0] if info else ""
+
+    code_text = node.text or ""
+    # The closing fence must start on its own line.
+    if code_text and not code_text.endswith("\n"):
+        code_text += "\n"
+
+    longest_run = max((len(run) for run in re.findall(r"`+", code_text)), default=0)
+    fence = "`" * max(3, longest_run + 1)
+    return f"{fence}{language}\n{code_text}{fence}"
+
+
+def _markup_without_tail(element: ET.Element) -> str:
+    """Serialize *element* alone, excluding the tail text that follows it."""
+    # ET.tostring() appends (and XML-escapes) the element's tail, so detach it
+    # for the duration of the call and put it back afterwards.
+    tail, element.tail = element.tail, None
+    try:
+        return ET.tostring(element, encoding="unicode")
+    finally:
+        element.tail = tail
+
+
+def render_html_block(node: ET.Element) -> str:
+    """Return the raw HTML carried by an ``html_block`` element.
+
+    The element's text, the serialized form of any child elements and their
+    tails are concatenated in document order; trailing whitespace is removed.
+    """
+    parts = [node.text or ""]
+    for child in node:
+        parts.append(_markup_without_tail(child))
+        # Emit the tail exactly once, unescaped.
+        parts.append(child.tail or "")
+    return "".join(parts).rstrip()
+
+
+def render_blockquote(node: ET.Element) -> str:
+    """Render a ``block_quote`` element by prefixing each content line with ``>``."""
+    quoted = render_children_as_blocks(node)
+    # Blank lines get a bare ">" so the quote is not interrupted.
+    return "\n".join(
+        f"> {line}" if line.strip() else ">" for line in quoted.splitlines()
+    )
+
+
+def render_list(node: ET.Element, indent: str = "") -> str:
+    """Render a ``list`` element as a bullet or ordered Markdown list.
+
+    Args:
+        node: The ``list`` element; its ``type``, ``tight``, ``start`` and
+            ``delimiter`` attributes select the marker style and spacing.
+        indent: Prefix placed before every marker line.
+
+    Returns:
+        The list as Markdown, without a trailing newline.
+    """
+    list_type = node.attrib.get("type", "bullet")
+    tight = node.attrib.get("tight", "false") == "true"
+    start = int(node.attrib.get("start", "1"))
+    delimiter = node.attrib.get("delimiter", "period")
+    suffix = ")" if delimiter == "paren" else "."
+    # Blocks inside one item must each start on their own line; otherwise a
+    # nested list would be glued onto the end of the preceding paragraph.
+    block_separator = "\n" if tight else "\n\n"
+
+    lines: List[str] = []
+    index = start
+
+    for item in node.findall(NAMESPACE + "item"):
+        marker = "- " if list_type == "bullet" else f"{index}{suffix} "
+        if list_type == "ordered":
+            index += 1
+
+        item_blocks = [block for block in map(render_block, item) if block]
+        content = block_separator.join(item_blocks).strip()
+        content_lines = content.splitlines() if content else [""]
+
+        # Continuation lines are aligned with the text after the marker.
+        continuation = indent + " " * len(marker)
+        lines.append(f"{indent}{marker}{content_lines[0]}")
+        for extra in content_lines[1:]:
+            # Leave blank lines empty rather than emitting trailing whitespace.
+            lines.append(f"{continuation}{extra}" if extra else "")
+
+        if not tight:
+            lines.append("")
+
+    # Drop the separator that follows the last item of a loose list.
+    if not tight and lines and lines[-1] == "":
+        lines.pop()
+
+    return "\n".join(lines)
+
+
+def render_children_as_blocks(parent: ET.Element) -> str:
+    """Render every child of *parent* as a block, separated by blank lines.
+
+    Children that render to an empty string are skipped, and trailing
+    whitespace is removed from each rendered block.
+    """
+    rendered = (render_block(child) for child in parent)
+    return "\n\n".join(block.rstrip() for block in rendered if block)
+
+
+def _custom_block_literal(node: ET.Element) -> str:
+    """Reassemble the literal content of a ``custom_block`` element."""
+    pieces = [node.attrib.get("on_enter", ""), node.text or ""]
+    for child in node:
+        # ET.tostring() would append (and XML-escape) the child's tail, so
+        # detach it while serializing and emit it once, unescaped.
+        tail, child.tail = child.tail, None
+        try:
+            pieces.append(ET.tostring(child, encoding="unicode"))
+        finally:
+            child.tail = tail
+        pieces.append(tail or "")
+    pieces.append(node.attrib.get("on_exit", ""))
+    return "".join(pieces).strip()
+
+
+def render_block(node: ET.Element) -> str:
+    """Render one block-level element of the CommonMark XML tree as Markdown.
+
+    Dispatches on the element's local tag name. Unknown elements are treated
+    as containers when they have children, and otherwise contribute their
+    text (or an empty string).
+    """
+    tag = strip_tag(node.tag)
+
+    if tag == "paragraph":
+        return render_inlines(node).strip("\n")
+    if tag == "heading":
+        level = int(node.attrib.get("level", "1"))
+        content = render_inlines(node).strip()
+        hashes = "#" * level
+        return f"{hashes} {content}" if content else hashes
+    if tag == "code_block":
+        return render_code_block(node)
+    if tag == "html_block":
+        return render_html_block(node)
+    if tag == "block_quote":
+        return render_blockquote(node)
+    if tag == "thematic_break":
+        return "---"
+    if tag == "list":
+        return render_list(node)
+    if tag == "custom_block":
+        return _custom_block_literal(node)
+    if tag == "table":
+        return render_table(node)
+
+    # Fallback: treat as container and render children.
+    if list(node):
+        return render_children_as_blocks(node)
+    return node.text or ""
+
+
+def render_table(node: ET.Element) -> str:
+    """Render a ``table`` element as a pipe table with aligned columns.
+
+    The header row supplies the column alignments. A table without a header
+    gets a blank one, since the pipe-table output always starts with a header
+    and a delimiter row. Rows shorter than the widest row are padded with
+    empty cells. Returns an empty string when the table has no cells at all.
+    """
+    headers: List[str] = []
+    aligns: List[str] = []
+    rows: List[List[str]] = []
+
+    for child in node:
+        tag = strip_tag(child.tag)
+        if tag == "table_header":
+            headers, aligns = _render_table_row(child)
+        elif tag == "table_body":
+            for row in child.findall(NAMESPACE + "table_row"):
+                rows.append(_render_table_row(row)[0])
+        elif tag == "table_row":
+            rows.append(_render_table_row(child)[0])
+
+    column_count = max([len(headers)] + [len(row) for row in rows])
+    if column_count == 0:
+        return ""
+
+    def pad(cells: List[str]) -> List[str]:
+        # Give every row the same number of cells so indexing never fails.
+        return cells + [""] * (column_count - len(cells))
+
+    headers = pad(headers)
+    aligns = pad(aligns)
+    rows = [pad(row) for row in rows]
+
+    # Works for header-only tables too: the header is always one of the lines
+    # measured, so max() never receives an empty sequence.
+    widths = [
+        max(len(line[i]) for line in [headers] + rows)
+        for i in range(column_count)
+    ]
+
+    def align_marker(align: str, width: int) -> str:
+        # A delimiter cell is at least three characters wide.
+        dashes = "-" * max(width, 3)
+        if align == "left":
+            return ":" + dashes[1:]
+        if align == "right":
+            return dashes[:-1] + ":"
+        if align == "center":
+            return ":" + dashes[1:-1] + ":"
+        return dashes
+
+    def format_row(cells: List[str]) -> str:
+        padded = (cell.ljust(width) for cell, width in zip(cells, widths))
+        return "| " + " | ".join(padded) + " |"
+
+    delimiter_cells = (align_marker(a, w) for a, w in zip(aligns, widths))
+    lines = [format_row(headers), "| " + " | ".join(delimiter_cells) + " |"]
+    lines.extend(format_row(row) for row in rows)
+
+    return "\n".join(lines).rstrip()
+
+
+def _render_table_row(row: ET.Element) -> tuple[list[str], list[str]]:
+    """Return the rendered cells of *row* and each cell's ``align`` attribute.
+
+    Cells without an ``align`` attribute contribute an empty string.
+    """
+    found = row.findall(NAMESPACE + "table_cell")
+    cells: List[str] = [render_table_cell(cell) for cell in found]
+    aligns: List[str] = [cell.attrib.get("align", "") for cell in found]
+    return cells, aligns
+
+
+def render_table_cell(cell: ET.Element) -> str:
+    """Render a table cell's inline content, escaping literal pipes."""
+    rendered = render_inlines(cell)
+    trimmed = rendered.strip()
+    # An unescaped "|" would be read as a column separator.
+    return "\\|".join(trimmed.split("|"))
+
+
+def render_document(root: ET.Element) -> str:
+    """Render the whole document tree as Markdown ending in one newline."""
+    # The top-level blocks are joined exactly like any other block container.
+    body = render_children_as_blocks(root)
+    return f"{body.rstrip()}\n"
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line: optional input path, then optional output path."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    optional_paths = (
+        ("input", "XML file to read (defaults to stdin)"),
+        ("output", "Destination markdown file (defaults to stdout)"),
+    )
+    for dest, help_text in optional_paths:
+        cli.add_argument(dest, nargs="?", help=help_text)
+    return cli.parse_args()
+
+
+def main() -> None:
+    """Read CommonMark XML, convert it, and write the resulting Markdown.
+
+    Input comes from the file named on the command line or from stdin;
+    output goes to the named file (as UTF-8) or to stdout.
+    """
+    args = parse_args()
+    if args.input:
+        # Hand the parser raw bytes so it honours the document's own encoding
+        # declaration instead of the platform's default text encoding.
+        xml_data = Path(args.input).read_bytes()
+    else:
+        xml_data = sys.stdin.read()
+
+    root = ET.fromstring(xml_data)
+    if strip_tag(root.tag) != "document":
+        # Accept a wrapper element whose direct child is the document.
+        root = next(
+            (child for child in root if strip_tag(child.tag) == "document"),
+            root,
+        )
+
+    markdown = render_document(root)
+
+    if args.output:
+        Path(args.output).write_text(markdown, encoding="utf-8")
+    else:
+        sys.stdout.write(markdown)
+
+
+# Run the converter only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()