from __future__ import annotations

import asyncio
import logging
import re
from dataclasses import dataclass
from pathlib import Path

logger = logging.getLogger(__name__)

# Matches a complete HTML <style ...>...</style> element, across lines and in
# any letter case. `\b` keeps look-alike tags such as <styles> from matching.
_STYLE_BLOCK_RE = re.compile(
    r"<style\b[^>]*>.*?</style\s*>", re.DOTALL | re.IGNORECASE
)
_FRONTMATTER_DELIM = "---"


class DocxRenderError(Exception):
    """Raised when the Markdown input cannot be rendered to DOCX."""


@dataclass(frozen=True)
class DocxRenderResult:
    """Immutable result of a successful Markdown-to-DOCX conversion."""

    # Raw bytes of the generated DOCX, exactly as Pandoc wrote them to stdout.
    docx_bytes: bytes
    # Length of `docx_bytes`, stored alongside it for convenience.
    size_bytes: int


def _strip_style_blocks(markdown_text: str) -> str:
    """Remove `<style>` blocks: they're meaningless in DOCX and Pandoc
    would otherwise embed them as raw text."""
    return _STYLE_BLOCK_RE.sub("", markdown_text)


def _strip_frontmatter(markdown_text: str) -> str:
    """Remove the YAML frontmatter so it doesn't appear as a body table in
    the DOCX. Frontmatter values were meant for the PDF renderer.

    The frontmatter is recognised only when the very first line is exactly
    `---` (trailing whitespace allowed) and a later line closes it with the
    same delimiter. Both LF and CRLF line endings are accepted, and the
    closing delimiter may be the last line of the text without a trailing
    newline. Text with no frontmatter, or with an unterminated one, is
    returned unchanged.
    """
    # Cheap rejection before splitting the whole document.
    if not markdown_text.startswith(_FRONTMATTER_DELIM):
        return markdown_text

    lines = markdown_text.split("\n")
    # rstrip() drops a trailing "\r" (CRLF input) and stray spaces. Requiring
    # the whole line to equal the delimiter keeps a leading horizontal rule
    # such as "----" from being mistaken for frontmatter.
    if lines[0].rstrip() != _FRONTMATTER_DELIM:
        return markdown_text

    for index in range(1, len(lines)):
        if lines[index].rstrip() == _FRONTMATTER_DELIM:
            return "\n".join(lines[index + 1 :]).lstrip()

    # Opening delimiter without a closing one: leave the text alone.
    return markdown_text


def _preprocess(markdown_text: str) -> str:
    """Drop the frontmatter first, then any `<style>` blocks."""
    return _strip_style_blocks(_strip_frontmatter(markdown_text))


async def render_markdown_to_docx(
    markdown_text: str, reference_doc: Path | None = None
) -> DocxRenderResult:
    """Convert Markdown to a DOCX file via Pandoc subprocess.

    Pandoc reads from stdin and writes the binary DOCX on stdout, so no
    intermediate temp file is needed.

    The optional `reference_doc` is a `.docx` whose styles (heading colors,
    fonts, header/footer, page size) Pandoc will inherit — this is the path
    to add Tielogic branding to the Word output later. If it is given but
    is not an existing file, a warning is logged and Pandoc's default styles
    are used instead.

    Args:
        markdown_text: Markdown source; frontmatter and `<style>` blocks are
            stripped before it is handed to Pandoc.
        reference_doc: Optional path to a reference `.docx`.

    Returns:
        A `DocxRenderResult` holding the DOCX bytes and their length.

    Raises:
        DocxRenderError: If the input is blank, if nothing is left after
            preprocessing, or if Pandoc exits with a non-zero status.

    Note:
        Errors raised while launching the subprocess itself (for example
        when the `pandoc` executable cannot be found) are not wrapped and
        propagate to the caller unchanged.
    """
    if not markdown_text.strip():
        raise DocxRenderError("empty markdown input")

    cleaned = _preprocess(markdown_text)
    if not cleaned.strip():
        raise DocxRenderError("nothing to render after stripping frontmatter/style")

    args = [
        "pandoc",
        "-f",
        "markdown+raw_html-implicit_figures",
        "-t",
        "docx",
    ]
    if reference_doc is not None:
        if reference_doc.is_file():
            args += ["--reference-doc", str(reference_doc)]
        else:
            # Best effort: a missing template must not block the export,
            # but the fallback should be visible in the logs.
            logger.warning(
                "reference doc %s is not a file; using Pandoc default styles",
                reference_doc,
            )
    # "-o -" makes Pandoc write the binary DOCX to stdout.
    args += ["-o", "-"]

    proc = await asyncio.create_subprocess_exec(
        *args,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate(cleaned.encode("utf-8"))
    if proc.returncode != 0:
        raise DocxRenderError(
            f"pandoc exit {proc.returncode}: {stderr.decode('utf-8', errors='replace')}"
        )
    return DocxRenderResult(docx_bytes=stdout, size_bytes=len(stdout))