from __future__ import annotations
import asyncio
import logging
import re
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)

# Matches one embedded HTML ``<style ...>...</style>`` element, including any
# attributes on the opening tag and CSS that spans several lines.
#   * DOTALL lets ``.`` cross newlines inside the block.
#   * IGNORECASE accepts ``<STYLE>`` / ``</Style>``.
#   * The lazy ``.*?`` stops at the first closing tag, so two separate blocks
#     are never merged into one match (which would swallow the text between).
#   * ``\b`` keeps look-alike tags such as ``<styles>`` from matching.
# An empty pattern here would make every substitution a no-op.
_STYLE_BLOCK_RE = re.compile(
    r"<style\b[^>]*>.*?</style\s*>", re.DOTALL | re.IGNORECASE
)

# Line that opens and closes the YAML frontmatter at the top of a document.
_FRONTMATTER_DELIM = "---"
class DocxRenderError(Exception):
    """Signals that Markdown could not be rendered to DOCX.

    Raised by the renderer for blank input, for input that is empty once
    frontmatter and style blocks are removed, and for a failed Pandoc run.
    """
@dataclass(frozen=True)
class DocxRenderResult:
    """Immutable outcome of a successful Markdown-to-DOCX conversion."""

    # Raw contents of the generated .docx file.
    docx_bytes: bytes
    # Size of the document in bytes; the renderer sets it to len(docx_bytes).
    size_bytes: int
def _strip_style_blocks(markdown_text: str) -> str:
    """Delete every span matched by ``_STYLE_BLOCK_RE`` from the Markdown.

    Embedded style blocks are meaningless in DOCX, and Pandoc would
    otherwise embed them as raw text.
    """
    without_styles = _STYLE_BLOCK_RE.sub("", markdown_text)
    return without_styles
def _strip_frontmatter(markdown_text: str) -> str:
    """Remove a leading YAML frontmatter block from the Markdown.

    Frontmatter values were meant for the PDF renderer; left in place they
    would appear as a body table in the DOCX.

    A block is recognised only when the very first line is exactly the
    delimiter and some later line is exactly the delimiter again. Trailing
    whitespace (including a carriage return) on those lines is ignored, so
    CRLF input works, and the closing delimiter may be the last line of the
    text without a final newline. A first line that merely starts with the
    delimiter (for example a ``----`` rule) is not treated as frontmatter.

    Returns:
        The text following the closing delimiter with leading whitespace
        removed, or the input unchanged when no complete block is present.
    """
    # Splitting on "\n" only (not splitlines) keeps the join below lossless;
    # any "\r" stays attached to its line and is ignored via rstrip().
    lines = markdown_text.split("\n")
    if lines[0].rstrip() != _FRONTMATTER_DELIM:
        return markdown_text
    for closing_index, line in enumerate(lines[1:], start=1):
        if line.rstrip() == _FRONTMATTER_DELIM:
            return "\n".join(lines[closing_index + 1 :]).lstrip()
    # Opening delimiter without a matching close: leave the text alone.
    return markdown_text
def _preprocess(markdown_text: str) -> str:
    """Prepare Markdown for Pandoc: drop frontmatter first, then style blocks."""
    body = _strip_frontmatter(markdown_text)
    return _strip_style_blocks(body)
async def render_markdown_to_docx(
    markdown_text: str, reference_doc: Path | None = None
) -> DocxRenderResult:
    """Convert Markdown to a DOCX file via a Pandoc subprocess.

    Pandoc reads the Markdown from stdin and writes the binary DOCX on
    stdout, so no intermediate temp file is needed.

    Args:
        markdown_text: Markdown source. It is passed through ``_preprocess``
            before being handed to Pandoc.
        reference_doc: Optional ``.docx`` whose styles (heading colors,
            fonts, header/footer, page size) Pandoc will inherit — this is
            the path to add Tielogic branding to the Word output later.
            Silently ignored when the path is not an existing file.

    Returns:
        The generated DOCX bytes together with their length.

    Raises:
        DocxRenderError: If the input is blank, if nothing is left after
            preprocessing, or if Pandoc exits with a non-zero status (the
            message carries Pandoc's stderr).
        OSError: Propagated unchanged if the ``pandoc`` executable cannot
            be started (for example when it is not installed).
    """
    if not markdown_text.strip():
        raise DocxRenderError("empty markdown input")
    cleaned = _preprocess(markdown_text)
    if not cleaned.strip():
        raise DocxRenderError("nothing to render after stripping frontmatter/style")

    args = ["pandoc", "-f", "markdown+raw_html-implicit_figures", "-t", "docx"]
    if reference_doc is not None and reference_doc.is_file():
        args += ["--reference-doc", str(reference_doc)]
    # "-o -" forces the binary DOCX onto stdout.
    args += ["-o", "-"]

    proc = await asyncio.create_subprocess_exec(
        *args,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await proc.communicate(cleaned.encode("utf-8"))
    finally:
        # If the await was cancelled or otherwise interrupted, Pandoc may
        # still be running; kill and reap it so it is not left orphaned.
        # After a normal communicate() the return code is set and this is
        # a no-op.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # Pandoc exited on its own in the meantime.
            await proc.wait()

    if proc.returncode != 0:
        raise DocxRenderError(
            f"pandoc exit {proc.returncode}: {stderr.decode('utf-8', errors='replace')}"
        )
    return DocxRenderResult(docx_bytes=stdout, size_bytes=len(stdout))