fix(mcp-docugen): preprocessor HTML→Markdown per output Word leggibile

Il DOCX prodotto dalla versione precedente emetteva i div Tielogic
(`<div class="cover">`, `<div class="info-col">`, `<div class="acceptance">`,
`<div class="status-card">`) come testo grezzo: Pandoc non sa
interpretare il CSS-flavoured HTML del PDF e li copia letteralmente
nel documento Word. Anche le tabelle `<table class="financial">`
finivano spezzate cella per cella.

Il fix introduce un preprocessor dedicato che riscrive tutta la
HTML Tielogic-flavoured in Markdown nativo prima di passare il
documento a Pandoc.

- docx_preprocessor.py: nuovo modulo basato su BeautifulSoup. Strippa
  frontmatter e <style>, poi rewrite di:
    * <div class="cover"> → titoli H1/H2, paragrafi, tabella pipe
      2-col FORNITORE/CLIENTE, validità in italic, \newpage finale
    * <table class="financial"> → tabella pipe Markdown con riga
      total-row in **bold**
    * <div class="acceptance"> → heading H2 + intro + tabella pipe
      con riga firma `_____________________` + luogo/data
    * <div class="status-card"> → paragrafo "**name** — descrizione"
    * <span class="badge ..."> → testo **bold**
    * <div class="page-break"> → \newpage Pandoc-friendly
- docx_renderer.py: delega tutto il preprocessing al nuovo modulo
  (più compatto, niente regex sparse).
- pyproject.toml + uv.lock: aggiunta dipendenza beautifulsoup4>=4.12.
- 9 nuovi test unit per il preprocessor (strip di frontmatter/style,
  cover, tabelle, status-card, badge, badge standalone, acceptance,
  idempotenza, niente div residui). Adattati i test esistenti agli
  import dal nuovo modulo. 101 test verdi.

Smoke E2E via MCP: l'offerta TieMeasureFlow esce in DOCX leggibile
con tabelle Word native, heading colorati Tielogic e firme in tabella.
This commit is contained in:
2026-04-26 11:26:52 +02:00
parent c783fff040
commit 54bf41efd6
7 changed files with 464 additions and 28 deletions
@@ -0,0 +1,167 @@
from __future__ import annotations
import textwrap
from mcp_docugen.docx_preprocessor import preprocess_for_docx
# Shared fixture for the tests below. It contains, in order: a YAML front
# matter block, a <style> block, a cover div (brand, titles, two info
# columns, validity line), plain Markdown headings, a "financial" table with
# a total row, a status card holding a badge span, and an acceptance section
# with a signature grid and a place/date line.
SAMPLE_DOC = textwrap.dedent(
"""\
---
pdf_options:
format: A4
---
<style>body { color: red; }</style>
<div class="cover">
<div class="brand">TIELOGIC</div>
<div class="brand-tagline">Soluzioni Software Industriali</div>
<div class="brand-divider"></div>
<div class="doc-title">OFFERTA PRODOTTO E INTEGRAZIONE</div>
<div class="doc-product">TieMeasureFlow</div>
<div class="doc-ref">Rif. OFF-2026-022 | 23 marzo 2026</div>
<div class="info-box">
<div class="info-col">
<div class="info-label">FORNITORE</div>
<div class="info-name">Tielogic SRL</div>
<div>Via Villanova 39, 36020 Solagna (VI)</div>
<div>P.IVA / C.F. 03954890244</div>
</div>
<div class="info-col">
<div class="info-label">CLIENTE</div>
<div class="info-name">Ricerca e Misure s.r.l.</div>
<div>Via Brigata Julia 21, 35020 Pernumia (PD)</div>
<div>Rif. Menoncin</div>
</div>
</div>
<div class="doc-validity">Validità offerta: 23 aprile 2026</div>
</div>
# TieMeasureFlow
Sistema web SPC.
## Costo di setup iniziale
<table class="financial">
<thead><tr><th>Voce</th><th class="num">Importo</th></tr></thead>
<tbody>
<tr><td>Setup</td><td class="num">€ 3.500,00</td></tr>
<tr class="total-row"><td>TOTALE SETUP</td><td class="num">€ 3.500,00</td></tr>
</tbody>
</table>
<div class="status-card drift">
<div class="name">TEST Z +50MM <span class="badge badge-drift">DRIFT</span></div>
<div>Errore cumulativo da 7.8mm a 11.5mm.</div>
</div>
<div class="acceptance">
<h2 class="acceptance-title">ACCETTAZIONE</h2>
<div class="acceptance-intro">Per accettazione, restituire copia firmata.</div>
<div class="signature-grid"><div class="sig-col"><div class="sig-party">Per Tielogic SRL</div><div class="sig-line">Firma e timbro</div></div><div class="sig-col"><div class="sig-party">Per Ricerca e Misure s.r.l.</div><div class="sig-line">Firma e timbro</div></div></div>
<div class="place-date">Luogo e data: ____________ 23 marzo 2026</div>
</div>
"""
)
def test_preprocessor_strips_style_and_frontmatter():
    """Expect no <style> tag and no front matter in the preprocessed output."""
    rendered = preprocess_for_docx(SAMPLE_DOC)
    for leftover in ("<style>", "pdf_options"):
        assert leftover not in rendered
    # The output must not open with a front matter delimiter.
    assert not rendered.startswith("---")
def test_preprocessor_converts_cover_to_markdown():
    """Expect the cover div to be replaced by Markdown headings, emphasis and a table."""
    rendered = preprocess_for_docx(SAMPLE_DOC)
    assert '<div class="cover">' not in rendered

    expected_fragments = (
        "# TIELOGIC",
        "*Soluzioni Software Industriali*",
        "## OFFERTA PRODOTTO E INTEGRAZIONE",
        "**TieMeasureFlow**",
        "Rif. OFF-2026-022 | 23 marzo 2026",
        # Two-column supplier/customer info table.
        "| **FORNITORE** | **CLIENTE** |",
        "**Tielogic SRL**",
        "**Ricerca e Misure s.r.l.**",
        "Via Villanova 39, 36020 Solagna (VI)",
        "*Validità offerta: 23 aprile 2026*",
        # Page-break marker for the cover; only its presence is checked.
        "\\newpage",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_preprocessor_converts_financial_table_to_pipe():
    """Expect the financial HTML table to become pipe-table rows."""
    rendered = preprocess_for_docx(SAMPLE_DOC)
    assert '<table class="financial">' not in rendered

    expected_rows = (
        "| Voce | Importo |",
        "| Setup | € 3.500,00 |",
        # The total row is expected with every cell in bold.
        "| **TOTALE SETUP** | **€ 3.500,00** |",
    )
    for row in expected_rows:
        assert row in rendered
def test_preprocessor_converts_status_card_to_paragraph():
    """Expect the status-card markup to vanish while its name and text remain."""
    rendered = preprocess_for_docx(SAMPLE_DOC)
    assert 'class="status-card' not in rendered
    # Only the opening of the bold name is matched, so trailing badge text
    # inside the same bold run does not break the check.
    for fragment in ("**TEST Z +50MM", "Errore cumulativo da 7.8mm a 11.5mm"):
        assert fragment in rendered
def test_preprocessor_converts_badges_to_bold():
    """Expect badge spans in the sample to be removed while their text survives."""
    rendered = preprocess_for_docx(SAMPLE_DOC)
    # The badge sits inside the status-card name, so this test only checks
    # that its text is still present somewhere; the bold markup itself is
    # not asserted here.
    assert "DRIFT" in rendered
    for markup in ("badge-drift", "<span"):
        assert markup not in rendered
def test_preprocessor_converts_standalone_badges_to_bold():
    """Expect a badge span outside any widget to be rewritten as bold text."""
    source = 'Verdetto: <span class="badge badge-fattibile">FATTIBILE</span>'
    rendered = preprocess_for_docx(source)
    assert "<span" not in rendered
    assert "**FATTIBILE**" in rendered
def test_preprocessor_converts_acceptance_to_table():
    """Expect the acceptance section as a heading, intro text and signature table."""
    rendered = preprocess_for_docx(SAMPLE_DOC)
    assert 'class="acceptance"' not in rendered

    expected_fragments = (
        "## ACCETTAZIONE",
        "Per accettazione, restituire copia firmata.",
        # Signature table: party names, blank signature line, caption.
        "| **Per Tielogic SRL** | **Per Ricerca e Misure s.r.l.** |",
        "| _____________________ | _____________________ |",
        "| Firma e timbro | Firma e timbro |",
        "Luogo e data",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_preprocessor_idempotent_on_clean_markdown():
    """Expect plain Markdown to keep its heading and table row after preprocessing.

    Despite the name, this runs the preprocessor once and checks that the
    content passes through; it does not apply it twice and compare results.
    """
    source = "\n".join(
        [
            "# Title",
            "",
            "A paragraph.",
            "",
            "| a | b |",
            "|---|---|",
            "| 1 | 2 |",
            "",
        ]
    )
    rendered = preprocess_for_docx(source)
    # The input has no front matter and no widgets, so nothing is rewritten.
    for fragment in ("# Title", "| a | b |"):
        assert fragment in rendered
    assert "<style" not in rendered
def test_preprocessor_no_div_classes_left_in_output():
    """Expect none of the widget div openings to remain in the output."""
    rendered = preprocess_for_docx(SAMPLE_DOC)

    # Classes matched as an exact, single class attribute value.
    exact_classes = (
        "cover",
        "info-box",
        "info-col",
        "info-label",
        "info-name",
        "doc-title",
        "doc-product",
        "doc-ref",
        "doc-validity",
        "acceptance",
        "signature-grid",
        "sig-col",
        "sig-party",
        "sig-line",
        "place-date",
    )
    forbidden_fragments = [f'<div class="{name}"' for name in exact_classes]
    # status-card is matched as a prefix because the sample pairs it with a
    # second class ("status-card drift").
    forbidden_fragments.append('<div class="status-card')

    for fragment in forbidden_fragments:
        assert fragment not in rendered, f"{fragment!r} still present in output"
@@ -4,11 +4,13 @@ import shutil
import pytest
from mcp_docugen.docx_renderer import (
DocxRenderError,
_preprocess,
from mcp_docugen.docx_preprocessor import (
_strip_frontmatter,
_strip_style_blocks,
preprocess_for_docx as _preprocess,
)
from mcp_docugen.docx_renderer import (
DocxRenderError,
render_markdown_to_docx,
)