feat(mcp-docugen): output Word (.docx) via Pandoc con reference Tielogic

Aggiunge la generazione di documenti Word coerenti con l'identità
visiva Tielogic, in parallelo al render PDF già esistente. Il flusso
completo è ora `bullet input → Markdown formattato → PDF e/o DOCX`
in una singola chiamata MCP.

- docx_renderer.py: subprocess Pandoc che legge il Markdown da stdin,
  emette il binario .docx su stdout. Strippa il YAML frontmatter e i
  blocchi `<style>` (presenti per il PDF, irrilevanti in DOCX) prima
  della conversione.
- mcp_tools.py: nuovo tool `document_to_docx(markdown)` che ritorna
  `{docx_b64, size_bytes}`; `document_generate` esteso con
  `output_format ∈ {md, pdf, docx, all}`. La firma di
  `build_mcp_server` accetta ora `docx_reference_path` opzionale.
- config.py: `Settings.docx_reference_path` (default
  /app/themes/tielogic-reference.docx).
- main.py: passa la nuova setting a `build_mcp_server`.
- mcp-docugen.Dockerfile: installazione di pandoc accanto alle libs
  Chromium.
- themes/tielogic-reference.docx: reference Word (10 KB) con stili
  Tielogic — heading colors blu/dark, font Inter, dimensioni allineate
  al CSS web. Generato da `scripts/build-reference-docx.py` che parte
  dal reference.docx di default di Pandoc e riscrive `word/styles.xml`
  con regex sui blocchi `<w:style>`. Pandoc lo applica in automatico
  agli output DOCX prodotti dal servizio.
- 9 nuovi test unit per docx_renderer (strip frontmatter/style,
  preprocess combinato, error empty input, smoke skippato in
  ambienti senza Pandoc): 92 test totali.

Smoke E2E via MCP: una sola chiamata `document_generate` con
`output_format=all` produce MD (14 KB), PDF (137 KB, 4 pagine A4) e
DOCX (12.7 KB) coerenti tra loro.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-26 11:13:11 +02:00
parent 725190010c
commit c783fff040
9 changed files with 405 additions and 9 deletions
@@ -17,6 +17,7 @@ class Settings(BaseSettings):
data_dir: Path = Path("/data")
templates_seed_dir: Path = Path("/app/services/mcp-docugen/templates_seed")
inline_stylesheet_path: Path | None = Path("/app/themes/tielogic.css")
docx_reference_path: Path | None = Path("/app/themes/tielogic-reference.docx")
asset_ttl_days: int = 30
max_image_size_mb: int = 10
llm_timeout_seconds: int = 60
@@ -0,0 +1,89 @@
from __future__ import annotations
import asyncio
import logging
import re
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)
_STYLE_BLOCK_RE = re.compile(r"<style\b[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE)
_FRONTMATTER_DELIM = "---"
class DocxRenderError(Exception):
    """Raised when a Markdown document cannot be turned into a DOCX file.

    Used both for input that leaves nothing to render and for a Pandoc
    run that ends with a non-zero exit status.
    """
@dataclass(frozen=True)
class DocxRenderResult:
    """Immutable outcome of a Markdown-to-DOCX conversion."""

    # Raw bytes of the generated .docx file.
    docx_bytes: bytes
    # Payload length in bytes, as reported by the renderer.
    size_bytes: int
def _strip_style_blocks(markdown_text: str) -> str:
    """Return *markdown_text* with every `<style>...</style>` element removed.

    The embedded stylesheet is there for the PDF output; it is dropped here
    so that it is never handed to Pandoc for the DOCX conversion.
    """
    without_styles = re.sub(_STYLE_BLOCK_RE, "", markdown_text)
    return without_styles
def _strip_frontmatter(markdown_text: str) -> str:
"""Remove the YAML frontmatter so it doesn't appear as a body table in
the DOCX. Frontmatter values were meant for the PDF renderer."""
if not markdown_text.startswith(_FRONTMATTER_DELIM):
return markdown_text
end_marker = f"\n{_FRONTMATTER_DELIM}\n"
idx = markdown_text.find(end_marker, len(_FRONTMATTER_DELIM))
if idx == -1:
return markdown_text
return markdown_text[idx + len(end_marker) :].lstrip()
def _preprocess(markdown_text: str) -> str:
    """Prepare Markdown for Pandoc: drop the frontmatter, then `<style>` blocks."""
    body = _strip_frontmatter(markdown_text)
    return _strip_style_blocks(body)
async def render_markdown_to_docx(
    markdown_text: str, reference_doc: Path | None = None
) -> DocxRenderResult:
    """Convert Markdown to a DOCX file via a Pandoc subprocess.

    Pandoc reads the Markdown from stdin and writes the binary DOCX to
    stdout, so no intermediate temp file is needed. YAML frontmatter and
    `<style>` blocks are stripped from the input first.

    Args:
        markdown_text: The Markdown document to convert.
        reference_doc: Optional `.docx` passed to Pandoc as
            `--reference-doc` so the output inherits its styles. A path
            that is not an existing file is ignored (with a warning) and
            Pandoc's default styles are used.

    Returns:
        The DOCX bytes together with their length.

    Raises:
        DocxRenderError: If the input is blank, nothing is left after
            preprocessing, Pandoc cannot be started, or Pandoc exits with a
            non-zero status.
    """
    if not markdown_text.strip():
        raise DocxRenderError("empty markdown input")

    cleaned = _preprocess(markdown_text)
    if not cleaned.strip():
        raise DocxRenderError("nothing to render after stripping frontmatter/style")

    # Build the command line in order instead of splicing by index, so the
    # option layout stays correct if flags are added or removed later.
    args = ["pandoc", "-f", "markdown+raw_html-implicit_figures", "-t", "docx"]
    if reference_doc is not None:
        if reference_doc.is_file():
            args += ["--reference-doc", str(reference_doc)]
        else:
            logger.warning(
                "DOCX reference doc %s not found; using Pandoc default styles",
                reference_doc,
            )
    args += ["-o", "-"]

    try:
        proc = await asyncio.create_subprocess_exec(
            *args,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except OSError as exc:
        # Typically FileNotFoundError when pandoc is not installed; surface
        # it as the module's own error type so callers handle one exception.
        raise DocxRenderError(f"cannot start pandoc: {exc}") from exc

    try:
        stdout, stderr = await proc.communicate(cleaned.encode("utf-8"))
    except asyncio.CancelledError:
        # The caller gave up: do not leave an orphaned pandoc running.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                # The process exited between the check and the kill.
                pass
        raise

    if proc.returncode != 0:
        raise DocxRenderError(
            f"pandoc exit {proc.returncode}: {stderr.decode('utf-8', errors='replace')}"
        )
    return DocxRenderResult(docx_bytes=stdout, size_bytes=len(stdout))
+3 -1
View File
@@ -51,7 +51,9 @@ async def build_app(settings: Settings | None = None) -> FastAPI:
inline_stylesheet_path=settings.inline_stylesheet_path,
)
mcp = build_mcp_server(template_store, renderer)
mcp = build_mcp_server(
template_store, renderer, docx_reference_path=settings.docx_reference_path
)
mcp_asgi = mcp.streamable_http_app()
@asynccontextmanager
@@ -1,10 +1,12 @@
from __future__ import annotations
import base64
from pathlib import Path
from typing import Literal
from mcp.server.fastmcp import FastMCP
from mcp_docugen.docx_renderer import render_markdown_to_docx
from mcp_docugen.models import TemplateFrontmatter
from mcp_docugen.pdf_renderer import render_markdown_to_pdf
from mcp_docugen.renderer import Renderer
@@ -12,9 +14,16 @@ from mcp_docugen.template_store import TemplateStore
def build_mcp_server(
template_store: TemplateStore, renderer: Renderer
template_store: TemplateStore,
renderer: Renderer,
docx_reference_path: Path | None = None,
) -> FastMCP:
mcp = FastMCP("mcp-docugen")
docx_ref = (
docx_reference_path
if docx_reference_path is not None and docx_reference_path.is_file()
else None
)
@mcp.tool()
async def template_create(
@@ -74,14 +83,15 @@ def build_mcp_server(
content_md: str,
variables: dict,
instructions: str | None = None,
output_format: Literal["md", "pdf", "both"] = "md",
output_format: Literal["md", "pdf", "docx", "all"] = "md",
) -> dict:
"""Generate a document from a template, content, and variables.
output_format:
- "md" → returns the generated Markdown only (default)
- "pdf" → also renders the Markdown to PDF (base64 encoded)
- "both"same as "pdf" (kept for symmetry; PDF includes the MD)
- "md" → returns the generated Markdown only (default)
- "pdf" → also renders the Markdown to PDF (base64 encoded)
- "docx"also renders the Markdown to a Word DOCX (base64)
- "all" → emits both PDF and DOCX alongside the Markdown
"""
result = await renderer.generate(
template_name=template_name,
@@ -91,11 +101,21 @@ def build_mcp_server(
)
out = result.model_dump(mode="json")
if output_format in ("pdf", "both"):
want_pdf = output_format in ("pdf", "all")
want_docx = output_format in ("docx", "all")
if want_pdf:
pdf = await render_markdown_to_pdf(result.markdown)
out["pdf_b64"] = base64.b64encode(pdf.pdf_bytes).decode("ascii")
out["pdf_size_bytes"] = pdf.size_bytes
if want_docx:
docx = await render_markdown_to_docx(
result.markdown, reference_doc=docx_ref
)
out["docx_b64"] = base64.b64encode(docx.docx_bytes).decode("ascii")
out["docx_size_bytes"] = docx.size_bytes
return out
@mcp.tool()
@@ -112,6 +132,22 @@ def build_mcp_server(
"size_bytes": pdf.size_bytes,
}
@mcp.tool()
async def document_to_docx(markdown: str) -> dict:
    """Convert an arbitrary Markdown document into a Word DOCX file.

    YAML frontmatter and inline `<style>` blocks (meaningful only for
    the PDF renderer) are stripped before conversion. Tables, headings,
    bold/italic/code, and inline HTML survive as native Word elements.
    Styling driven by the stripped CSS is NOT carried over to the DOCX;
    instead the server's reference .docx, when one is configured and
    present, is passed to the renderer as the style reference.

    Returns a dict with `docx_b64` (base64-encoded file content) and
    `size_bytes`.
    """
    docx = await render_markdown_to_docx(markdown, reference_doc=docx_ref)
    return {
        "docx_b64": base64.b64encode(docx.docx_bytes).decode("ascii"),
        "size_bytes": docx.size_bytes,
    }
mcp.tools = {
"template_create": template_create,
"template_update": template_update,
@@ -120,6 +156,7 @@ def build_mcp_server(
"template_list": template_list,
"document_generate": document_generate,
"document_to_pdf": document_to_pdf,
"document_to_docx": document_to_docx,
}
return mcp
@@ -0,0 +1,99 @@
from __future__ import annotations
import shutil
import pytest
from mcp_docugen.docx_renderer import (
DocxRenderError,
_preprocess,
_strip_frontmatter,
_strip_style_blocks,
render_markdown_to_docx,
)
def test_strip_style_blocks_removes_simple_block():
    """A single-line `<style>` element disappears together with its CSS."""
    source = "Pre\n<style>h1 { color: red; }</style>\nPost"
    cleaned = _strip_style_blocks(source)
    for fragment in ("<style>", "color: red"):
        assert fragment not in cleaned
def test_strip_style_blocks_removes_multiline_block_with_attrs():
    """A multi-line `<style>` element with attributes is removed; neighbours stay."""
    source = (
        "Before\n"
        "<style type=\"text/css\">\n"
        " body { font-family: Inter; }\n"
        " h1 { color: blue; }\n"
        "</style>\n"
        "After"
    )
    result = _strip_style_blocks(source)
    # Neither the tag nor its CSS payload may survive.
    for removed in ("<style", "Inter"):
        assert removed not in result
    # Text surrounding the block must be left in place.
    for kept in ("Before", "After"):
        assert kept in result
def test_strip_style_blocks_no_style_is_noop():
    """Markdown without any `<style>` element comes back unchanged."""
    plain = "# Just markdown\n\nNo style here."
    result = _strip_style_blocks(plain)
    assert result == plain
def test_strip_frontmatter_removes_block():
    """A closed frontmatter block is cut and the body starts the result."""
    source = "---\nfoo: bar\npdf_options:\n format: A4\n---\n\n# Body\n"
    body = _strip_frontmatter(source)
    assert "pdf_options" not in body
    assert body.startswith("# Body")
def test_strip_frontmatter_no_frontmatter_returns_input_unchanged():
    """Text that does not open with a frontmatter fence is returned as-is."""
    source = "# Body only\n\nText."
    result = _strip_frontmatter(source)
    assert result == source
def test_strip_frontmatter_unclosed_returns_input_unchanged():
    """An opening fence without a closing one leaves the text untouched."""
    source = "---\nfoo: bar\nno closing delim"
    result = _strip_frontmatter(source)
    assert result == source
def test_preprocess_strips_both_frontmatter_and_style():
    """`_preprocess` removes frontmatter and `<style>` blocks, keeping the body."""
    source = (
        "---\nfoo: bar\n---\n\n"
        "<style>body { color: red; }</style>\n\n"
        "# Body\n\nContent.\n"
    )
    cleaned = _preprocess(source)
    for removed in ("foo: bar", "<style>"):
        assert removed not in cleaned
    assert "# Body" in cleaned
@pytest.mark.skipif(
    shutil.which("pandoc") is None,
    reason="pandoc binary not available; runs in container or CI with pandoc installed",
)
async def test_render_markdown_to_docx_roundtrip():
    """Smoke test: a real Pandoc run yields a plausible DOCX archive."""
    source = (
        "---\nfoo: bar\n---\n\n"
        "<style>h1 { color: red; }</style>\n\n"
        "# Hello\n\n"
        "| A | B |\n|---|---|\n| 1 | 2 |\n\n"
        "**bold** and *italic*.\n"
    )
    rendered = await render_markdown_to_docx(source)
    # A DOCX file is a ZIP container, so it opens with the local-file-header magic.
    zip_magic = b"PK\x03\x04"
    assert rendered.docx_bytes.startswith(zip_magic)
    assert rendered.size_bytes > 1000
async def test_render_empty_markdown_raises():
    """Empty and whitespace-only input are both rejected before Pandoc runs."""
    for blank in ("", " \n\n "):
        with pytest.raises(DocxRenderError):
            await render_markdown_to_docx(blank)
async def test_render_only_frontmatter_and_style_raises():
    """Input that is blank once frontmatter and styles are stripped is rejected."""
    only_metadata = "---\nfoo: bar\n---\n\n<style>h1{}</style>\n\n \n"
    with pytest.raises(DocxRenderError):
        await render_markdown_to_docx(only_metadata)