diff --git a/README.md b/README.md index a6465ea..2298636 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Due pezzi, stesso repo: | Servizio | Stato | Funzione | |---|---|---| -| `mcp-docugen` | Implementato, 83 test verde, deploy Docker via gateway Caddy (porta 8090), **7 tool MCP** esposti (CRUD template + `document_generate` + `document_to_pdf`), template seed versionati, CSS Tielogic iniettato inline, **render PDF server-side** via Chromium/Playwright | Genera Markdown formale da template + LLM (OpenRouter) e converte in PDF. Vedi [`docs/mcp-docugen-design.md`](docs/mcp-docugen-design.md) + [`docs/mcp-docugen-implementation.md`](docs/mcp-docugen-implementation.md). | +| `mcp-docugen` | Implementato, 92 test verde, deploy Docker via gateway Caddy (porta 8090), **8 tool MCP** esposti (CRUD template + `document_generate` + `document_to_pdf` + `document_to_docx`), template seed versionati, CSS Tielogic iniettato inline, render server-side **PDF** via Chromium/Playwright e **DOCX** via Pandoc con reference `tielogic-reference.docx` | Genera Markdown formale da template + LLM (OpenRouter) e converte in PDF o Word. Vedi [`docs/mcp-docugen-design.md`](docs/mcp-docugen-design.md) + [`docs/mcp-docugen-implementation.md`](docs/mcp-docugen-implementation.md). | | `mcp-convert` | Da progettare | Conversione Markdown → PDF / DOCX / HTML (pandoc/typst backend). | | `mcp-inbox` | Da progettare | Ingest da Telegram (+ STT opzionale via Whisper) verso draft inbox consumati da Claude Code desktop. | @@ -96,6 +96,8 @@ Conversione Markdown→PDF: tre strade, in ordine di comodità. Il CSS Tielogic non viene mai referenziato come path esterno nel Markdown prodotto dal servizio: il `Renderer` lo legge da `themes/tielogic.css` (copiato nell'immagine Docker in `/app/themes/`) e lo inietta come blocco `", re.DOTALL | re.IGNORECASE) +_FRONTMATTER_DELIM = "---" + + +class DocxRenderError(Exception): + pass + + +@dataclass(frozen=True) +class DocxRenderResult: + docx_bytes: bytes + size_bytes: int + + +def _strip_style_blocks(markdown_text: str) -> str: + """Remove `` blocks: they're meaningless in DOCX and + Pandoc would otherwise embed them as raw text.""" + return _STYLE_BLOCK_RE.sub("", markdown_text) + + +def _strip_frontmatter(markdown_text: str) -> str: + """Remove the YAML frontmatter so it doesn't appear as a body table in + the DOCX. Frontmatter values were meant for the PDF renderer.""" + if not markdown_text.startswith(_FRONTMATTER_DELIM): + return markdown_text + end_marker = f"\n{_FRONTMATTER_DELIM}\n" + idx = markdown_text.find(end_marker, len(_FRONTMATTER_DELIM)) + if idx == -1: + return markdown_text + return markdown_text[idx + len(end_marker) :].lstrip() + + +def _preprocess(markdown_text: str) -> str: + return _strip_style_blocks(_strip_frontmatter(markdown_text)) + + +async def render_markdown_to_docx( + markdown_text: str, reference_doc: Path | None = None +) -> DocxRenderResult: + """Convert Markdown to a DOCX file via Pandoc subprocess. + + Pandoc reads from stdin and writes the binary DOCX on stdout, so no + intermediate temp file is needed. The optional `reference_doc` is a + `.docx` whose styles (heading colors, fonts, header/footer, page size) + Pandoc will inherit — this is the path to add Tielogic branding to the + Word output later. + """ + if not markdown_text.strip(): + raise DocxRenderError("empty markdown input") + + cleaned = _preprocess(markdown_text) + if not cleaned.strip(): + raise DocxRenderError("nothing to render after stripping frontmatter/style") + + args = [ + "pandoc", + "-f", + "markdown+raw_html-implicit_figures", + "-t", + "docx", + "-o", + "-", + ] + if reference_doc is not None and reference_doc.is_file(): + args[5:5] = ["--reference-doc", str(reference_doc)] + + proc = await asyncio.create_subprocess_exec( + *args, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate(cleaned.encode("utf-8")) + if proc.returncode != 0: + raise DocxRenderError( + f"pandoc exit {proc.returncode}: {stderr.decode('utf-8', errors='replace')}" + ) + + return DocxRenderResult(docx_bytes=stdout, size_bytes=len(stdout)) diff --git a/services/mcp-docugen/src/mcp_docugen/main.py b/services/mcp-docugen/src/mcp_docugen/main.py index b032079..b5f5ce9 100644 --- a/services/mcp-docugen/src/mcp_docugen/main.py +++ b/services/mcp-docugen/src/mcp_docugen/main.py @@ -51,7 +51,9 @@ async def build_app(settings: Settings | None = None) -> FastAPI: inline_stylesheet_path=settings.inline_stylesheet_path, ) - mcp = build_mcp_server(template_store, renderer) + mcp = build_mcp_server( + template_store, renderer, docx_reference_path=settings.docx_reference_path + ) mcp_asgi = mcp.streamable_http_app() @asynccontextmanager diff --git a/services/mcp-docugen/src/mcp_docugen/mcp_tools.py b/services/mcp-docugen/src/mcp_docugen/mcp_tools.py index 562b8b7..c3b9899 100644 --- a/services/mcp-docugen/src/mcp_docugen/mcp_tools.py +++ b/services/mcp-docugen/src/mcp_docugen/mcp_tools.py @@ -1,10 +1,12 @@ from __future__ import annotations import base64 +from pathlib import Path from typing import Literal from mcp.server.fastmcp import FastMCP +from mcp_docugen.docx_renderer import render_markdown_to_docx from mcp_docugen.models import TemplateFrontmatter from mcp_docugen.pdf_renderer import render_markdown_to_pdf from mcp_docugen.renderer import Renderer @@ -12,9 +14,16 @@ from mcp_docugen.template_store import TemplateStore def build_mcp_server( - template_store: TemplateStore, renderer: Renderer + template_store: TemplateStore, + renderer: Renderer, + docx_reference_path: Path | None = None, ) -> FastMCP: mcp = FastMCP("mcp-docugen") + docx_ref = ( + docx_reference_path + if docx_reference_path is not None and docx_reference_path.is_file() + else None + ) @mcp.tool() async def template_create( @@ -74,14 +83,15 @@ def build_mcp_server( content_md: str, variables: dict, instructions: str | None = None, - output_format: Literal["md", "pdf", "both"] = "md", + output_format: Literal["md", "pdf", "docx", "all"] = "md", ) -> dict: """Generate a document from a template, content, and variables. output_format: - - "md" → returns the generated Markdown only (default) - - "pdf" → also renders the Markdown to PDF (base64 encoded) - - "both" → same as "pdf" (kept for symmetry; PDF includes the MD) + - "md" → returns the generated Markdown only (default) + - "pdf" → also renders the Markdown to PDF (base64 encoded) + - "docx" → also renders the Markdown to a Word DOCX (base64) + - "all" → emits both PDF and DOCX alongside the Markdown """ result = await renderer.generate( template_name=template_name, @@ -91,11 +101,21 @@ def build_mcp_server( ) out = result.model_dump(mode="json") - if output_format in ("pdf", "both"): + want_pdf = output_format in ("pdf", "all") + want_docx = output_format in ("docx", "all") + + if want_pdf: pdf = await render_markdown_to_pdf(result.markdown) out["pdf_b64"] = base64.b64encode(pdf.pdf_bytes).decode("ascii") out["pdf_size_bytes"] = pdf.size_bytes + if want_docx: + docx = await render_markdown_to_docx( + result.markdown, reference_doc=docx_ref + ) + out["docx_b64"] = base64.b64encode(docx.docx_bytes).decode("ascii") + out["docx_size_bytes"] = docx.size_bytes + return out @mcp.tool() @@ -112,6 +132,22 @@ def build_mcp_server( "size_bytes": pdf.size_bytes, } + @mcp.tool() + async def document_to_docx(markdown: str) -> dict: + """Convert an arbitrary Markdown document into a Word DOCX file. + + YAML frontmatter and inline `\nPost" + assert "\n" + "After" + ) + cleaned = _strip_style_blocks(md) + assert "body { color: red; }\n\n" + "# Body\n\nContent.\n" + ) + out = _preprocess(md) + assert "foo: bar" not in out + assert "\n\n" + "# Hello\n\n" + "| A | B |\n|---|---|\n| 1 | 2 |\n\n" + "**bold** and *italic*.\n" + ) + result = await render_markdown_to_docx(md) + # DOCX is a ZIP archive; signature: PK\x03\x04 + assert result.docx_bytes.startswith(b"PK\x03\x04") + assert result.size_bytes > 1000 + + +async def test_render_empty_markdown_raises(): + with pytest.raises(DocxRenderError): + await render_markdown_to_docx("") + with pytest.raises(DocxRenderError): + await render_markdown_to_docx(" \n\n ") + + +async def test_render_only_frontmatter_and_style_raises(): + md = "---\nfoo: bar\n---\n\n\n\n \n" + with pytest.raises(DocxRenderError): + await render_markdown_to_docx(md) diff --git a/themes/tielogic-reference.docx b/themes/tielogic-reference.docx new file mode 100644 index 0000000..b2f9815 Binary files /dev/null and b/themes/tielogic-reference.docx differ