# ArcaSuite/services/mcp-docugen/src/mcp_docugen/docx_renderer.py

from __future__ import annotations

import asyncio
import logging
import re
from dataclasses import dataclass
from pathlib import Path

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Matches one complete `<style ...>...</style>` element. DOTALL lets `.` span
# newlines so multi-line blocks match, the lazy `.*?` stops at the first
# closing tag, and IGNORECASE accepts any letter case in the tag name.
_STYLE_BLOCK_RE = re.compile(r"<style\b[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE)
# Three-hyphen delimiter used to detect the YAML frontmatter at the start of
# the Markdown input.
_FRONTMATTER_DELIM = "---"


class DocxRenderError(Exception):
    """Raised when a Markdown document cannot be rendered to DOCX."""


@dataclass(frozen=True)
class DocxRenderResult:
    """Immutable value object describing one rendered DOCX document."""

    # Raw binary content of the generated .docx file.
    docx_bytes: bytes
    # Size of the document in bytes, as reported by the renderer.
    size_bytes: int


def _strip_style_blocks(markdown_text: str) -> str:
    """Return *markdown_text* with every `<style>...</style>` block deleted.

    CSS carries no meaning in a DOCX, and Pandoc would otherwise embed the
    block as raw text.
    """
    without_css = re.sub(_STYLE_BLOCK_RE, "", markdown_text)
    return without_css


def _strip_frontmatter(markdown_text: str) -> str:
    """Remove a leading YAML frontmatter block from *markdown_text*.

    Frontmatter values were meant for the PDF renderer; left in place they
    would show up in the body of the DOCX.

    The block is recognised when the very first line is exactly the
    delimiter (`---`, trailing whitespace allowed) and a later line is
    either `---` or `...` — the two closing markers Pandoc accepts for a
    YAML metadata block. Both LF and CRLF line endings are handled, and the
    closing delimiter may be the last line of the input without a trailing
    newline.

    Returns:
        The text after the closing delimiter with leading whitespace
        removed, or the input unchanged when there is no complete
        frontmatter block.
    """
    opener, newline, rest = markdown_text.partition("\n")
    # The opener must be a full line on its own: `----` or `---title` is not
    # a frontmatter delimiter. rstrip() also swallows the "\r" of CRLF input.
    if not newline or opener.rstrip() != _FRONTMATTER_DELIM:
        return markdown_text

    closers = (_FRONTMATTER_DELIM, "...")
    # Offset of the first character after the line currently being examined.
    body_start = len(opener) + 1
    for line in rest.split("\n"):
        body_start += len(line) + 1
        if line.rstrip() in closers:
            # Slicing past the end (closer at EOF without newline) yields "".
            return markdown_text[body_start:].lstrip()
    # Unterminated block: leave the text untouched rather than guess.
    return markdown_text


def _preprocess(markdown_text: str) -> str:
    """Prepare Markdown for Pandoc: drop the frontmatter first, then any
    `<style>` blocks that remain."""
    body = _strip_frontmatter(markdown_text)
    return _strip_style_blocks(body)


async def render_markdown_to_docx(
    markdown_text: str, reference_doc: Path | None = None
) -> DocxRenderResult:
    """Convert Markdown to a DOCX file via a Pandoc subprocess.

    Pandoc reads from stdin and writes the binary DOCX on stdout, so no
    intermediate temp file is needed. The optional `reference_doc` is a
    `.docx` whose styles (heading colors, fonts, header/footer, page size)
    Pandoc will inherit — this is the path to add Tielogic branding to the
    Word output later.

    Args:
        markdown_text: Markdown source; frontmatter and `<style>` blocks are
            stripped before it is handed to Pandoc.
        reference_doc: Optional path to a reference `.docx`. If the path is
            not an existing file it is ignored (with a warning) and Pandoc's
            default styles are used.

    Returns:
        The generated document bytes together with their length.

    Raises:
        DocxRenderError: If the input is empty (before or after
            preprocessing), Pandoc cannot be started, or Pandoc exits with a
            non-zero status.
    """
    if not markdown_text.strip():
        raise DocxRenderError("empty markdown input")

    cleaned = _preprocess(markdown_text)
    if not cleaned.strip():
        raise DocxRenderError("nothing to render after stripping frontmatter/style")

    args = ["pandoc", "-f", "markdown+raw_html-implicit_figures", "-t", "docx"]
    if reference_doc is not None:
        if reference_doc.is_file():
            args += ["--reference-doc", str(reference_doc)]
        else:
            # Best-effort: keep rendering, but make the fallback visible.
            logger.warning(
                "reference doc %s is not a file; using Pandoc default styles",
                reference_doc,
            )
    # "-o -" makes Pandoc write the binary DOCX to stdout.
    args += ["-o", "-"]

    try:
        proc = await asyncio.create_subprocess_exec(
            *args,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except OSError as exc:
        # E.g. the pandoc executable is missing or not executable.
        raise DocxRenderError(f"could not start pandoc: {exc}") from exc

    try:
        stdout, stderr = await proc.communicate(cleaned.encode("utf-8"))
    except asyncio.CancelledError:
        # The caller gave up: don't leave the Pandoc child running.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                # The process exited between the check and the kill.
                pass
            await proc.wait()
        raise

    if proc.returncode != 0:
        raise DocxRenderError(
            f"pandoc exit {proc.returncode}: {stderr.decode('utf-8', errors='replace')}"
        )

    return DocxRenderResult(docx_bytes=stdout, size_bytes=len(stdout))