Files
ArcaSuite/scripts/build-reference-docx.py
Adriano c783fff040 feat(mcp-docugen): output Word (.docx) via Pandoc con reference Tielogic
Aggiunge la generazione di documenti Word coerenti con l'identità
visiva Tielogic, in parallelo al render PDF già esistente. Il flusso
completo è ora `bullet input → Markdown formattato → PDF e/o DOCX`
in una singola chiamata MCP.

- docx_renderer.py: subprocess Pandoc che legge il Markdown da stdin,
  emette il binario .docx su stdout. Strippa il YAML frontmatter e i
  blocchi `<style>` (presenti per il PDF, irrilevanti in DOCX) prima
  della conversione.
- mcp_tools.py: nuovo tool `document_to_docx(markdown)` che ritorna
  `{docx_b64, size_bytes}`; `document_generate` esteso con
  `output_format ∈ {md, pdf, docx, all}`. La firma di
  `build_mcp_server` accetta ora `docx_reference_path` opzionale.
- config.py: `Settings.docx_reference_path` (default
  /app/themes/tielogic-reference.docx).
- main.py: passa la nuova setting a `build_mcp_server`.
- mcp-docugen.Dockerfile: installazione di pandoc accanto alle libs
  Chromium.
- themes/tielogic-reference.docx: reference Word (10 KB) con stili
  Tielogic — heading colors blu/dark, font Inter, dimensioni allineate
  al CSS web. Generato da `scripts/build-reference-docx.py` che parte
  dal reference.docx di default di Pandoc e riscrive `word/styles.xml`
  con regex sui blocchi `<w:style>`. Pandoc lo applica in automatico
  agli output DOCX prodotti dal servizio.
- 9 nuovi test unit per docx_renderer (strip frontmatter/style,
  preprocess combinato, error empty input, smoke skippato in
  ambienti senza Pandoc): 92 test totali.

Smoke E2E via MCP: una sola chiamata `document_generate` con
`output_format=all` produce MD (14 KB), PDF (137 KB, 4 pagine A4) e
DOCX (12.7 KB) coerenti tra loro.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 11:13:11 +02:00

165 lines
6.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""Build themes/tielogic-reference.docx by applying the Tielogic identity
to Pandoc's default reference.docx.

Pandoc uses reference.docx as the style template (Heading1, Heading2,
Normal, Table, ...) for `-t docx` output. This script:

1. extracts Pandoc's default reference.docx
2. edits word/styles.xml: Inter font, Tielogic blue (#2767d8) colors,
   sizes and paragraph attributes consistent with the CSS theme
3. edits word/header*.xml and word/footer*.xml with generic Tielogic text
4. rewrites the zip to themes/tielogic-reference.docx
"""
from __future__ import annotations
import argparse
import io
import re
import shutil
import subprocess
import sys
import zipfile
from pathlib import Path
# Repository root: the parent of the directory that contains this script.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Default destination of the generated reference document (see --output).
DEFAULT_OUTPUT = REPO_ROOT / "themes" / "tielogic-reference.docx"
# Brand colors as hex RGB without the leading '#', the form written into
# <w:color w:val="..."/> by _override_style.
TIELOGIC_BLUE = "2767D8"
TIELOGIC_DARK = "0D1B2A"
# Font family written into the <w:rFonts> attributes.
INTER = "Inter"
# NOTE(review): not referenced by any code visible in this file; presumably
# meant for code/verbatim styles — confirm before removing or wiring it in.
INTER_MONO = "JetBrains Mono"
def get_pandoc_default_reference() -> bytes:
    """Return the raw bytes of Pandoc's built-in ``reference.docx``.

    The document is obtained by running
    ``pandoc --print-default-data-file reference.docx`` and capturing stdout.

    Raises:
        SystemExit: if no ``pandoc`` executable is found on PATH.
        subprocess.CalledProcessError: if pandoc exits with a non-zero status.
    """
    pandoc_available = shutil.which("pandoc") is not None
    if not pandoc_available:
        sys.exit("pandoc not found in PATH; install pandoc and rerun")

    command = ["pandoc", "--print-default-data-file", "reference.docx"]
    completed = subprocess.run(command, capture_output=True, check=True)
    return completed.stdout
def patch_styles_xml(xml: str) -> str:
    """Return the text of ``styles.xml`` restyled with the Tielogic identity.

    Works purely with regular expressions on the XML text so that no extra
    dependency (e.g. python-docx) is needed. Two passes are applied:

    1. every self-closing ``<w:rFonts .../>`` element is rewritten by
       ``_patch_rfonts`` so that it points at the Inter family;
    2. the run properties of Heading1-3, Title and Subtitle are forced via
       ``_override_style``.

    NOTE(review): pass 1 is global, so it also rewrites the fonts of any
    code/verbatim styles present in the input; INTER_MONO is not applied
    here.
    """
    # Pass 1: font substitution on every <w:rFonts .../> tag.
    patched = re.sub(
        r'<w:rFonts\s+[^/]*?/>',
        lambda match: _patch_rfonts(match.group(0)),
        xml,
    )

    # Pass 2: per-style overrides as (styleId, w:sz value, color, caps, bold).
    style_overrides = (
        ("Heading1", "44", TIELOGIC_DARK, False, True),
        ("Heading2", "28", TIELOGIC_BLUE, True, True),
        ("Heading3", "23", TIELOGIC_DARK, False, True),
        ("Title", "52", TIELOGIC_DARK, True, True),
        ("Subtitle", "26", "5A6478", True, False),
    )
    for style_id, size, color, caps, bold in style_overrides:
        patched = _override_style(
            patched, style_id, size=size, color=color, caps=caps, bold=bold
        )
    return patched
def _patch_rfonts(tag: str) -> str:
    """Point a self-closing ``<w:rFonts .../>`` tag at the Inter family.

    * ``w:ascii`` and ``w:hAnsi`` are each set to Inter: replaced when
      present, appended when missing. They are handled independently so an
      attribute is never emitted twice (a duplicate attribute would make the
      XML malformed).
    * ``w:asciiTheme`` / ``w:hAnsiTheme`` are removed: in WordprocessingML a
      theme reference takes precedence over the explicit font name, so
      leaving them in place would make the Inter assignment ineffective.
    * ``w:cs`` is rewritten to Inter only when it is already present.
    * Every other attribute (``w:eastAsia``, ``w:eastAsiaTheme``,
      ``w:cstheme``, ``w:hint``, ...) is preserved.

    Args:
        tag: the full text of one ``<w:rFonts .../>`` element.

    Returns:
        The rewritten tag text.
    """
    # Drop the theme references that would override the explicit names.
    patched = re.sub(r'\s+w:(?:asciiTheme|hAnsiTheme)="[^"]*"', "", tag)

    for attr in ("ascii", "hAnsi"):
        attr_re = rf'\bw:{attr}="[^"]*"'
        assignment = f'w:{attr}="{INTER}"'
        if re.search(attr_re, patched):
            # Callable replacement: the font name is inserted literally, with
            # no backslash/group-reference interpretation.
            patched = re.sub(attr_re, lambda _m, a=assignment: a, patched)
        else:
            # Attribute missing: append it just before the closing "/>".
            patched = re.sub(
                r"\s*/>\s*$", lambda _m, a=assignment: f" {a}/>", patched
            )

    patched = re.sub(r'\bw:cs="[^"]*"', lambda _m: f'w:cs="{INTER}"', patched)
    return patched
def _override_style(xml: str, style_id: str, *, size: str, color: str, caps: bool, bold: bool) -> str:
    """Force the run properties of the style whose ``w:styleId`` is ``style_id``.

    The style's ``<w:rPr>`` is replaced *as a whole* (any property it
    previously carried is dropped) by a block that sets the Inter font, bold,
    caps, color and size. When the style has no ``<w:rPr>`` the block is
    inserted where the style schema expects it: right after ``<w:pPr>`` if
    there is one, otherwise before any table-property element, otherwise at
    the end of the style body.

    Bold and caps are always written explicitly, including the "off" state
    (``w:val="0"``), so that a style inheriting through ``w:basedOn`` from a
    bold/caps parent really ends up with the requested setting.

    Args:
        xml: text of ``styles.xml``.
        style_id: value of the ``w:styleId`` attribute to look for.
        size: value for ``w:sz`` / ``w:szCs`` (WordprocessingML half-points).
        color: hex RGB for ``w:color`` (no leading ``#``).
        caps: whether the style renders in all caps.
        bold: whether the style renders bold.

    Returns:
        The XML text with the style patched; unchanged if the style id does
        not occur.
    """
    style_re = re.compile(
        r'(<w:style\s+[^>]*w:styleId="' + re.escape(style_id) + r'"[^>]*>)(.*?)(</w:style>)',
        re.DOTALL,
    )
    existing_rpr_re = re.compile(r"<w:rPr\s*/>|<w:rPr>.*?</w:rPr>", re.DOTALL)
    ppr_end_re = re.compile(r"</w:pPr>|<w:pPr\s*/>")
    table_props_re = re.compile(r"<w:(?:tblPr|trPr|tcPr|tblStylePr)\b")

    bold_xml = "<w:b/><w:bCs/>" if bold else '<w:b w:val="0"/><w:bCs w:val="0"/>'
    caps_xml = "<w:caps/>" if caps else '<w:caps w:val="0"/>'
    # Child order follows the rPr schema sequence: rFonts, b, bCs, caps,
    # color, sz, szCs.
    rpr_block = (
        "<w:rPr>"
        f'<w:rFonts w:ascii="{INTER}" w:hAnsi="{INTER}" w:cs="{INTER}"/>'
        f"{bold_xml}"
        f"{caps_xml}"
        f'<w:color w:val="{color}"/>'
        f'<w:sz w:val="{size}"/>'
        f'<w:szCs w:val="{size}"/>'
        "</w:rPr>"
    )

    def repl(m: re.Match[str]) -> str:
        head, body, tail = m.group(1), m.group(2), m.group(3)
        if existing_rpr_re.search(body):
            # Replace the first rPr (full or self-closing form). A callable
            # replacement keeps the block literal.
            body = existing_rpr_re.sub(lambda _m: rpr_block, body, count=1)
        else:
            ppr_end = ppr_end_re.search(body)
            if ppr_end is not None:
                insert_at = ppr_end.end()
            else:
                # No pPr: rPr must still follow name/basedOn/... and precede
                # any table properties, so never prepend it to the body.
                table_props = table_props_re.search(body)
                insert_at = table_props.start() if table_props else len(body)
            body = body[:insert_at] + rpr_block + body[insert_at:]
        return head + body + tail

    return style_re.sub(repl, xml)
def patch_header_footer_xml(name: str, xml: str) -> str:
    """Replace the visible text of a header/footer part with the Tielogic tagline.

    The first ``<w:t>`` element receives the tagline and every further
    ``<w:t>`` element is emptied, so text that Word split across several runs
    yields the tagline exactly once instead of once per run. Attributes on
    ``<w:t>`` (e.g. ``xml:space``) are dropped; all other nodes — run
    formatting, tabs, field codes — are left untouched.

    Args:
        name: archive member name (e.g. ``word/header1.xml``). Headers and
            footers currently receive the same text, so it does not affect
            the result; it is kept for interface compatibility.
        xml: text of the header/footer part.

    Returns:
        The patched XML text; unchanged if it contains no ``<w:t>`` element.
    """
    replacement = "Tielogic — Soluzioni Software Industriali"
    tagline_written = False

    def repl(_match: "re.Match[str]") -> str:
        nonlocal tagline_written
        if tagline_written:
            return "<w:t></w:t>"
        tagline_written = True
        return f"<w:t>{replacement}</w:t>"

    # "(?:\s[^>]*)?" restricts the match to <w:t> itself (optionally with
    # attributes) and excludes other elements whose name starts with "w:t".
    return re.sub(r"<w:t(?:\s[^>]*)?>[^<]*</w:t>", repl, xml)
def main() -> None:
    """Command-line entry point: build the Tielogic reference.docx.

    Parses ``--output`` (default: DEFAULT_OUTPUT), creates the destination
    directory if needed, fetches Pandoc's default reference.docx and copies
    every archive member into a new in-memory zip, patching
    ``word/styles.xml`` and any ``word/header*.xml`` / ``word/footer*.xml``
    on the way. The result is written to the output path and a one-line
    summary is printed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    target = cli.parse_args().output.resolve()
    target.parent.mkdir(parents=True, exist_ok=True)

    reference_bytes = get_pandoc_default_reference()
    patched_archive = io.BytesIO()
    with zipfile.ZipFile(io.BytesIO(reference_bytes), "r") as zin:
        with zipfile.ZipFile(patched_archive, "w", zipfile.ZIP_DEFLATED) as zout:
            for entry in zin.infolist():
                member = entry.filename
                payload = zin.read(member)
                if member == "word/styles.xml":
                    styles = patch_styles_xml(payload.decode("utf-8"))
                    payload = styles.encode("utf-8")
                elif re.match(r"word/(header|footer)\d*\.xml$", member):
                    part = patch_header_footer_xml(member, payload.decode("utf-8"))
                    payload = part.encode("utf-8")
                # Passing the original ZipInfo keeps the member's metadata.
                zout.writestr(entry, payload)

    target.write_bytes(patched_archive.getvalue())
    print(f"OK: {target} ({target.stat().st_size} bytes)")
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()