feat(mcp-docugen): nuovo tool MCP document_to_pdf via Playwright/Chromium
Aggiunge la possibilità di convertire un documento Markdown in PDF
direttamente lato server, senza richiedere al chiamante di avere
md-to-pdf, pandoc o altri tool sull'host. Il PDF è restituito come
stringa base64 nella risposta JSON-RPC, pronto a essere salvato,
allegato o spedito al cliente.
- pdf_renderer.py: nuovo modulo che analizza il frontmatter YAML del
Markdown (incluso il blocco pdf_options stile Puppeteer/md-to-pdf),
rende il body in HTML via markdown-it-py (supporta tabelle e
HTML inline) e produce il PDF tramite Chromium headless gestito
da Playwright. Le pdf_options camelCase (printBackground,
displayHeaderFooter, headerTemplate, ...) vengono tradotte negli
argomenti snake_case di page.pdf().
- mcp_tools.py: nuovo tool `document_to_pdf(markdown)` che ritorna
`{pdf_b64, size_bytes}`; `document_generate` esteso con il
parametro `output_format ∈ {md, pdf, both}` per emettere il PDF
contestualmente alla generazione del Markdown.
- pyproject.toml + uv.lock: aggiunte le dipendenze playwright>=1.48
e markdown-it-py[plugins]>=3.0.
- mcp-docugen.Dockerfile: nuova fase di runtime che installa le
librerie native richieste da Chromium (libnss3, libgbm1, ecc.) e
scarica il binario Chromium di Playwright in /opt/ms-playwright.
- 7 nuovi test unit (83 totali) che coprono lo split del frontmatter,
il rendering Markdown→HTML con tabelle, la traduzione delle
pdf_options camelCase→snake_case e l'errore su YAML invalido. Il
test E2E che richiede Chromium è marcato come skip nella suite unit; lo smoke test
via MCP conferma generazione PDF da 134 KB / 4 pagine.
README aggiornato con le tre strade di conversione (server-side,
client-side, bundling) e la stima del nuovo costo immagine.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,8 @@ dependencies = [
|
||||
"pyyaml>=6.0",
|
||||
"pillow>=11.0",
|
||||
"python-multipart>=0.0.9",
|
||||
"playwright>=1.48",
|
||||
"markdown-it-py[plugins]>=3.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from typing import Literal
|
||||
|
||||
from mcp.server.fastmcp import FastMCP
|
||||
|
||||
from mcp_docugen.models import TemplateFrontmatter
|
||||
from mcp_docugen.pdf_renderer import render_markdown_to_pdf
|
||||
from mcp_docugen.renderer import Renderer
|
||||
from mcp_docugen.template_store import TemplateStore
|
||||
|
||||
@@ -70,15 +74,43 @@ def build_mcp_server(
|
||||
content_md: str,
|
||||
variables: dict,
|
||||
instructions: str | None = None,
|
||||
output_format: Literal["md", "pdf", "both"] = "md",
|
||||
) -> dict:
|
||||
"""Generate a Markdown document from a template, content, and variables."""
|
||||
"""Generate a document from a template, content, and variables.
|
||||
|
||||
output_format:
|
||||
- "md" → returns the generated Markdown only (default)
|
||||
- "pdf" → also renders the Markdown to PDF (base64 encoded)
|
||||
- "both" → same as "pdf" (kept for symmetry; PDF includes the MD)
|
||||
"""
|
||||
result = await renderer.generate(
|
||||
template_name=template_name,
|
||||
content_md=content_md,
|
||||
variables=variables,
|
||||
instructions=instructions,
|
||||
)
|
||||
return result.model_dump(mode="json")
|
||||
out = result.model_dump(mode="json")
|
||||
|
||||
if output_format in ("pdf", "both"):
|
||||
pdf = await render_markdown_to_pdf(result.markdown)
|
||||
out["pdf_b64"] = base64.b64encode(pdf.pdf_bytes).decode("ascii")
|
||||
out["pdf_size_bytes"] = pdf.size_bytes
|
||||
|
||||
return out
|
||||
|
||||
@mcp.tool()
async def document_to_pdf(markdown: str) -> dict:
    """Render a Markdown document to PDF and return it base64-encoded.

    The Markdown text is handed unchanged to ``render_markdown_to_pdf``,
    so it may carry YAML frontmatter (including ``pdf_options:``) and
    inline ``<style>`` blocks.

    Returns a dict with two keys:
      - ``pdf_b64``: the PDF bytes encoded as an ASCII base64 string
      - ``size_bytes``: the size reported by the renderer result
    """
    rendered = await render_markdown_to_pdf(markdown)
    encoded = base64.b64encode(rendered.pdf_bytes).decode("ascii")
    return {"pdf_b64": encoded, "size_bytes": rendered.size_bytes}
|
||||
|
||||
mcp.tools = {
|
||||
"template_create": template_create,
|
||||
@@ -87,6 +119,7 @@ def build_mcp_server(
|
||||
"template_get": template_get,
|
||||
"template_list": template_list,
|
||||
"document_generate": document_generate,
|
||||
"document_to_pdf": document_to_pdf,
|
||||
}
|
||||
|
||||
return mcp
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
from markdown_it import MarkdownIt
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_FRONTMATTER_DELIM = "---"
|
||||
|
||||
|
||||
class PdfRenderError(Exception):
    """Error raised by the PDF renderer for input it cannot process.

    Used in this module for empty Markdown input and for YAML frontmatter
    that fails to parse.
    """
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PdfRenderResult:
    """Immutable outcome of a Markdown-to-PDF rendering."""

    # Raw bytes of the generated PDF document.
    pdf_bytes: bytes
    # Page count; None because Playwright doesn't expose it reliably
    # after rendering.
    pages: int | None
    # Length of the generated PDF in bytes.
    size_bytes: int
|
||||
|
||||
|
||||
def _split_frontmatter(markdown_text: str) -> tuple[dict, str]:
    """Split a Markdown document into its YAML frontmatter and its body.

    Frontmatter is recognised only when the very first line is exactly the
    delimiter (``---``, trailing whitespace / ``\\r`` tolerated) and a later
    line is again exactly the delimiter. Matching whole lines means that:

    * text such as ``----`` or ``---title`` on the first line is no longer
      mistaken for an opening delimiter;
    * a closing delimiter on the last line of the file, without a trailing
      newline, is found;
    * documents with CRLF line endings are handled.

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is the parsed mapping, or
        ``{}`` when there is no frontmatter or the YAML is not a mapping.
        ``body`` is the text after the closing delimiter, or the whole input
        when no frontmatter block was found.

    Raises:
        PdfRenderError: if the frontmatter block is not valid YAML.
    """
    lines = markdown_text.split("\n")
    if lines[0].rstrip() != _FRONTMATTER_DELIM:
        return {}, markdown_text

    # Index of the first later line that consists solely of the delimiter.
    closing_idx = next(
        (
            idx
            for idx, line in enumerate(lines[1:], start=1)
            if line.rstrip() == _FRONTMATTER_DELIM
        ),
        None,
    )
    if closing_idx is None:
        # Unterminated block: treat the whole document as body.
        return {}, markdown_text

    fm_text = "\n".join(lines[1:closing_idx])
    body = "\n".join(lines[closing_idx + 1 :])

    try:
        data = yaml.safe_load(fm_text) or {}
    except yaml.YAMLError as exc:
        raise PdfRenderError(f"invalid YAML frontmatter: {exc}") from exc

    # A scalar or list at the top level is not usable as frontmatter.
    if not isinstance(data, dict):
        data = {}
    return data, body
|
||||
|
||||
|
||||
def _md_to_html(body: str) -> str:
    """Convert a Markdown body to an HTML fragment.

    Uses the CommonMark preset with raw HTML passthrough and the ``linkify``
    option switched on, plus the ``table`` rule enabled.
    """
    parser = MarkdownIt("commonmark", {"html": True, "linkify": True})
    parser.enable("table")
    return parser.render(body)
|
||||
|
||||
|
||||
def _build_full_html(body_html: str) -> str:
    """Wrap an HTML fragment in a minimal, complete HTML document.

    The document declares ``lang="it"`` and UTF-8 encoding; ``body_html`` is
    inserted verbatim inside ``<body>``.
    """
    document_lines = [
        "<!DOCTYPE html>",
        '<html lang="it">',
        "<head>",
        '<meta charset="utf-8">',
        "</head>",
        "<body>",
        body_html,
        "</body>",
        "</html>",
    ]
    # Every line, including the last, is newline-terminated.
    return "\n".join(document_lines) + "\n"
|
||||
|
||||
|
||||
def _normalize_pdf_options(opts: dict[str, Any]) -> dict[str, Any]:
    """Map md-to-pdf / Puppeteer pdf_options to Playwright page.pdf() kwargs.

    Playwright (Python) uses snake_case kwargs; Puppeteer/md-to-pdf YAML uses
    camelCase. Translate the keys we care about, drop the rest.

    Options whose value is ``None`` are skipped. A YAML key with an empty
    value is loaded as ``None``; converting it would otherwise yield the
    literal string ``"None"`` for templates and margins, or a ``TypeError``
    for ``scale``, and a ``format: null`` entry would hide the missing format
    from callers that apply a default.

    Args:
        opts: the ``pdf_options`` mapping from the frontmatter. Any non-dict
            value is treated as "no options".

    Returns:
        A new dict containing only recognised options, keyed by the
        Playwright keyword-argument name. ``margin`` keeps only the
        ``top`` / ``right`` / ``bottom`` / ``left`` entries, as strings.

    Raises:
        ValueError, TypeError: if ``scale`` cannot be converted to ``float``.
    """
    if not isinstance(opts, dict):
        return {}

    # (source key, Playwright kwarg, converter applied to the value)
    scalar_rules = (
        ("format", "format", lambda value: value),
        ("landscape", "landscape", bool),
        ("printBackground", "print_background", bool),
        ("displayHeaderFooter", "display_header_footer", bool),
        ("headerTemplate", "header_template", str),
        ("footerTemplate", "footer_template", str),
        ("scale", "scale", float),
    )
    margin_sides = ("top", "right", "bottom", "left")

    out: dict[str, Any] = {}
    for source_key, target_key, convert in scalar_rules:
        value = opts.get(source_key)
        if value is not None:
            out[target_key] = convert(value)

    margin = opts.get("margin")
    if isinstance(margin, dict):
        # Only the four page sides are meaningful; anything else is dropped.
        out["margin"] = {
            side: str(margin[side])
            for side in margin_sides
            if margin.get(side) is not None
        }
    return out
|
||||
|
||||
|
||||
async def render_markdown_to_pdf(markdown_text: str) -> PdfRenderResult:
    """Render a Markdown document to PDF bytes via headless Chromium.

    The input may start with YAML frontmatter; its ``pdf_options`` entry is
    translated into ``page.pdf()`` keyword arguments, with the page format
    defaulting to ``"A4"`` when none is given. The remaining Markdown is
    converted to HTML, wrapped in a full document and printed by a freshly
    launched Chromium instance that is closed again before returning.

    Returns:
        A ``PdfRenderResult`` with the PDF bytes and their length; ``pages``
        is always ``None``.

    Raises:
        PdfRenderError: if ``markdown_text`` is empty or whitespace only.
    """
    if not markdown_text.strip():
        raise PdfRenderError("empty markdown input")

    meta, md_body = _split_frontmatter(markdown_text)
    page_kwargs = _normalize_pdf_options(meta.get("pdf_options") or {})
    page_kwargs.setdefault("format", "A4")

    document = _build_full_html(_md_to_html(md_body))

    async with async_playwright() as playwright:
        # NOTE(review): Chromium is launched with --no-sandbox while the page
        # content comes from the caller's Markdown — confirm this is
        # acceptable for the deployment environment.
        chromium = await playwright.chromium.launch(args=["--no-sandbox"])
        try:
            ctx = await chromium.new_context()
            tab = await ctx.new_page()
            # NOTE(review): "networkidle" waits for network activity started
            # by the document to settle — presumably so external assets are
            # loaded before printing; verify whether outbound fetches are
            # intended here.
            await tab.set_content(document, wait_until="networkidle")
            rendered = await tab.pdf(**page_kwargs)
        finally:
            # Always release the browser, even when rendering fails.
            await chromium.close()

    return PdfRenderResult(
        pdf_bytes=bytes(rendered),
        pages=None,
        size_bytes=len(rendered),
    )
|
||||
@@ -0,0 +1,86 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from mcp_docugen.pdf_renderer import (
|
||||
PdfRenderError,
|
||||
_md_to_html,
|
||||
_normalize_pdf_options,
|
||||
_split_frontmatter,
|
||||
)
|
||||
|
||||
|
||||
def test_split_frontmatter_extracts_yaml_dict():
    """Frontmatter is parsed to a dict and the body follows the closing delimiter."""
    source = "---\ntitle: Foo\npdf_options:\n format: A4\n---\n\n# Body\n"
    expected_meta = {"title": "Foo", "pdf_options": {"format": "A4"}}

    meta, remainder = _split_frontmatter(source)

    assert meta == expected_meta
    assert remainder.lstrip().startswith("# Body")
|
||||
|
||||
|
||||
def test_split_frontmatter_no_frontmatter_returns_empty_dict():
    """A document without frontmatter yields an empty dict and the untouched text."""
    source = "# Just body\nNo YAML here.\n"

    meta, remainder = _split_frontmatter(source)

    assert (meta, remainder) == ({}, source)
|
||||
|
||||
|
||||
def test_split_frontmatter_invalid_yaml_raises():
    """Malformed YAML in the frontmatter surfaces as PdfRenderError."""
    broken_document = "---\ntitle: : : broken\n---\nbody"

    with pytest.raises(PdfRenderError):
        _split_frontmatter(broken_document)
|
||||
|
||||
|
||||
def test_md_to_html_renders_table_and_html_passthrough():
    """Pipe tables become <table> markup and raw HTML is passed through."""
    source = "| A | B |\n|---|---|\n| 1 | 2 |\n\n<div class=\"x\">raw</div>\n"

    rendered = _md_to_html(source)

    for fragment in ("<table>", "<th>A</th>", '<div class="x">raw</div>'):
        assert fragment in rendered
|
||||
|
||||
|
||||
def test_normalize_pdf_options_translates_camelcase_to_snake():
    """Every supported camelCase option is emitted under its snake_case name."""
    raw_options = {
        "format": "A4",
        "printBackground": True,
        "displayHeaderFooter": True,
        "headerTemplate": "<div>H</div>",
        "footerTemplate": "<div>F</div>",
        "margin": {"top": "18mm", "bottom": "18mm"},
        "landscape": False,
        "scale": 0.95,
    }

    normalized = _normalize_pdf_options(raw_options)

    assert normalized["format"] == "A4"
    # Boolean flags must come back as real bools, not merely truthy values.
    for flag in ("print_background", "display_header_footer"):
        assert normalized[flag] is True
    assert normalized["header_template"] == "<div>H</div>"
    assert normalized["footer_template"] == "<div>F</div>"
    assert normalized["margin"]["top"] == "18mm"
    assert normalized["landscape"] is False
    assert normalized["scale"] == pytest.approx(0.95)
|
||||
|
||||
|
||||
def test_normalize_pdf_options_drops_unknown_keys():
    """Keys the normalizer does not recognise are left out of the result."""
    noisy_options = {"format": "A4", "weird": 123, "viewportWidth": 1200}

    assert _normalize_pdf_options(noisy_options) == {"format": "A4"}
|
||||
|
||||
|
||||
def test_normalize_pdf_options_handles_non_dict():
    """Non-dict input is normalized to an empty option set."""
    for bogus in (None, "not a dict"):
        assert _normalize_pdf_options(bogus) == {}
|
||||
|
||||
|
||||
# NOTE(review): this skip is unconditional, so the test does not run even in
# an environment where Chromium is installed — consider gating it on an
# explicit condition instead.
@pytest.mark.skip(
    reason="requires Chromium binary; runs only in container/CI with Playwright installed"
)
async def test_render_markdown_to_pdf_smoke():
    """End-to-end check: a small document renders to a non-trivial PDF."""
    from mcp_docugen.pdf_renderer import render_markdown_to_pdf

    frontmatter = "---\npdf_options:\n format: A4\n---\n\n"
    content = "<style>h1 { color: red; }</style>\n\n# Hello\n\nBody text.\n"

    outcome = await render_markdown_to_pdf(frontmatter + content)

    # Every PDF file starts with the "%PDF-" magic header.
    assert outcome.pdf_bytes.startswith(b"%PDF-")
    assert outcome.size_bytes > 1000
|
||||
Reference in New Issue
Block a user