fix(mcp-docugen): preprocessor HTML→Markdown per output Word leggibile

Il DOCX prodotto dalla versione precedente emetteva i div Tielogic (`<div class="cover">`, `<div class="info-col">`, `<div class="acceptance">`, `<div class="status-card">`) come testo grezzo: Pandoc non sa interpretare l'HTML con classi CSS pensato per il PDF e lo copia letteralmente nel documento Word. Anche le tabelle `<table class="financial">` finivano spezzate cella per cella.

Il fix introduce un preprocessor dedicato che riscrive tutto l'HTML Tielogic-flavoured in Markdown nativo prima di passare il documento a Pandoc.

- docx_preprocessor.py: nuovo modulo basato su BeautifulSoup. Rimuove frontmatter e <style>, poi riscrive:
  * <div class="cover"> → titoli H1/H2, paragrafi, tabella pipe 2-col FORNITORE/CLIENTE, validità in italic, \newpage finale
  * <table class="financial"> → tabella pipe Markdown con riga total-row in **bold**
  * <div class="acceptance"> → heading H2 + intro + tabella pipe con riga firma `_____________________` + luogo/data
  * <div class="status-card"> → paragrafo "**name** — descrizione"
  * <span class="badge ..."> → testo **bold**
  * <div class="page-break"> → \newpage Pandoc-friendly
- docx_renderer.py: delega tutto il preprocessing al nuovo modulo (più compatto, niente regex sparse).
- pyproject.toml + uv.lock: aggiunta dipendenza beautifulsoup4>=4.12.
- 9 nuovi test unit per il preprocessor (strip di style/frontmatter, cover, tabelle, status card, badge, badge standalone, acceptance, idempotenza, niente div residui). Adattati i test esistenti agli import dal nuovo modulo.

Suite: 101 test verdi. Smoke E2E via MCP: l'offerta TieMeasureFlow esce in DOCX leggibile con tabelle Word native, heading colorati Tielogic e firme in tabella.
2026-04-26 11:26:52 +02:00
parent c783fff040
commit 54bf41efd6
7 changed files with 464 additions and 28 deletions
@@ -17,6 +17,7 @@ dependencies = [
    "python-multipart>=0.0.9",
    "playwright>=1.48",
    "markdown-it-py[plugins]>=3.0",
+    "beautifulsoup4>=4.12",
 ]

 [project.optional-dependencies]
@@ -0,0 +1,252 @@
+"""Convert Tielogic-flavoured HTML inside the generated Markdown into
+native Markdown elements before passing the document to Pandoc.
+
+The PDF pipeline depends on `<div class="cover">`, `<table class="financial">`,
+`<div class="acceptance">`, etc. — these are CSS-styled in Chromium but
+Pandoc has no idea what to do with them and would otherwise emit them as
+raw text in the DOCX. This module rewrites those structures as headings,
+pipe tables and paragraphs so the Word output is readable and structured.
+"""
+
+from __future__ import annotations
+
+import re
+
+from bs4 import BeautifulSoup, Tag
+
+# Marker that opens and closes the YAML frontmatter block.
+_FRONTMATTER_DELIM = "---"
+# Matches a whole `<style ...>...</style>` element: DOTALL lets the body span
+# several lines, IGNORECASE also accepts `<STYLE>`, and the lazy `.*?` stops
+# at the first closing tag so separate style blocks are matched one by one.
+_STYLE_BLOCK_RE = re.compile(r"<style\b[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE)
+
+
+def _strip_style_blocks(text: str) -> str:
+    return _STYLE_BLOCK_RE.sub("", text)
+
+
+def _strip_frontmatter(text: str) -> str:
+    if not text.startswith(_FRONTMATTER_DELIM):
+        return text
+    end_marker = f"\n{_FRONTMATTER_DELIM}\n"
+    idx = text.find(end_marker, len(_FRONTMATTER_DELIM))
+    if idx == -1:
+        return text
+    return text[idx + len(end_marker) :].lstrip()
+
+
+def _text(el: Tag | None) -> str:
+    return el.get_text(" ", strip=True) if el is not None else ""
+
+
+def _info_col_lines(col: Tag) -> list[str]:
+    """Extract the rows of an info-col block (FORNITORE/CLIENTE), skipping
+    the label (used as table header) and bolding the company name."""
+    lines: list[str] = []
+    for child in col.find_all("div", recursive=False):
+        classes = set(child.get("class") or [])
+        if "info-label" in classes:
+            continue
+        txt = child.get_text(" ", strip=True)
+        if not txt:
+            continue
+        if "info-name" in classes:
+            lines.append(f"**{txt}**")
+        else:
+            lines.append(txt)
+    return lines
+
+
+def _convert_cover(soup: BeautifulSoup) -> None:
+    cover = soup.find("div", class_="cover")
+    if not isinstance(cover, Tag):
+        return
+
+    brand = _text(cover.find(class_="brand"))
+    tagline = _text(cover.find(class_="brand-tagline"))
+    title = _text(cover.find(class_="doc-title"))
+    product = _text(cover.find(class_="doc-product"))
+    ref = _text(cover.find(class_="doc-ref"))
+    validity = _text(cover.find(class_="doc-validity"))
+
+    info_box = cover.find(class_="info-box")
+    info_cols = (
+        info_box.find_all("div", class_="info-col") if isinstance(info_box, Tag) else []
+    )
+
+    blocks: list[str] = []
+    if brand:
+        blocks.append(f"# {brand}")
+    if tagline:
+        blocks.append(f"*{tagline}*")
+    blocks.append("---")
+    if title:
+        blocks.append(f"## {title}")
+    if product:
+        blocks.append(f"**{product}**")
+    if ref:
+        blocks.append(ref)
+
+    if len(info_cols) == 2:
+        col_a, col_b = info_cols
+        label_a = _text(col_a.find(class_="info-label")) or "FORNITORE"
+        label_b = _text(col_b.find(class_="info-label")) or "CLIENTE"
+        rows_a = _info_col_lines(col_a)
+        rows_b = _info_col_lines(col_b)
+        height = max(len(rows_a), len(rows_b))
+        rows_a += [""] * (height - len(rows_a))
+        rows_b += [""] * (height - len(rows_b))
+
+        table_lines = [
+            f"| **{label_a}** | **{label_b}** |",
+            "|---|---|",
+        ]
+        for a, b in zip(rows_a, rows_b):
+            table_lines.append(f"| {a} | {b} |")
+        blocks.append("\n".join(table_lines))
+
+    if validity:
+        blocks.append(f"*{validity}*")
+
+    replacement = "\n\n".join(blocks) + "\n\n\\newpage\n"
+    cover.replace_with(BeautifulSoup(replacement, "html.parser"))
+
+
+def _convert_acceptance(soup: BeautifulSoup) -> None:
+    acceptance = soup.find("div", class_="acceptance")
+    if not isinstance(acceptance, Tag):
+        return
+
+    title_el = acceptance.find(class_="acceptance-title")
+    intro_el = acceptance.find(class_="acceptance-intro")
+    sig_grid = acceptance.find(class_="signature-grid")
+    place_date = acceptance.find(class_="place-date")
+
+    title = _text(title_el) or "ACCETTAZIONE"
+    intro = _text(intro_el)
+
+    blocks = [f"## {title}"]
+    if intro:
+        blocks.append(intro)
+
+    if isinstance(sig_grid, Tag):
+        cols = sig_grid.find_all("div", class_="sig-col")
+        if len(cols) == 2:
+            party_a = _text(cols[0].find(class_="sig-party"))
+            party_b = _text(cols[1].find(class_="sig-party"))
+            line_a = _text(cols[0].find(class_="sig-line")) or "Firma e timbro"
+            line_b = _text(cols[1].find(class_="sig-line")) or "Firma e timbro"
+            blocks.append(
+                "\n".join(
+                    [
+                        f"| **{party_a}** | **{party_b}** |",
+                        "|---|---|",
+                        "| _____________________ | _____________________ |",
+                        f"| {line_a} | {line_b} |",
+                    ]
+                )
+            )
+
+    if isinstance(place_date, Tag):
+        blocks.append(_text(place_date))
+
+    replacement = "\n\n".join(blocks)
+    acceptance.replace_with(BeautifulSoup(replacement, "html.parser"))
+
+
+def _convert_status_cards(soup: BeautifulSoup) -> None:
+    for card in soup.find_all("div", class_="status-card"):
+        if not isinstance(card, Tag):
+            continue
+        name_el = card.find(class_="name")
+        name = _text(name_el)
+        # remaining text (sibling divs after the name)
+        body_parts: list[str] = []
+        for child in card.find_all("div", recursive=False):
+            if child is name_el:
+                continue
+            txt = child.get_text(" ", strip=True)
+            if txt:
+                body_parts.append(txt)
+        body = " ".join(body_parts)
+        block = f"**{name}** — {body}" if body else f"**{name}**"
+        card.replace_with(BeautifulSoup(block, "html.parser"))
+
+
+def _convert_badges(soup: BeautifulSoup) -> None:
+    for span in soup.find_all("span", class_="badge"):
+        if not isinstance(span, Tag):
+            continue
+        txt = span.get_text(" ", strip=True)
+        span.replace_with(f"**{txt}**" if txt else "")
+
+
+def _convert_page_breaks(soup: BeautifulSoup) -> None:
+    for el in soup.find_all("div", class_="page-break"):
+        if not isinstance(el, Tag):
+            continue
+        el.replace_with(BeautifulSoup("\n\n\\newpage\n\n", "html.parser"))
+
+
+def _convert_financial_tables(soup: BeautifulSoup) -> None:
+    """Rewrite `<table class="financial">` (with custom td/tr classes) as
+    a clean pipe-table Markdown block. Pandoc handles raw HTML <table>
+    inconsistently when extra attributes/classes are present."""
+    for table in soup.find_all("table", class_="financial"):
+        if not isinstance(table, Tag):
+            continue
+
+        header_cells: list[str] = []
+        thead = table.find("thead")
+        if isinstance(thead, Tag):
+            for th in thead.find_all("th"):
+                header_cells.append(th.get_text(" ", strip=True))
+
+        rows: list[list[str]] = []
+        body = table.find("tbody") or table
+        for tr in body.find_all("tr"):
+            classes = set(tr.get("class") or [])
+            cells = [td.get_text(" ", strip=True) for td in tr.find_all(["td", "th"])]
+            if not cells:
+                continue
+            if "total-row" in classes:
+                cells = [f"**{c}**" for c in cells]
+            # Skip if this row was already pulled in via thead
+            if cells == header_cells:
+                continue
+            rows.append(cells)
+
+        if not header_cells and rows:
+            # Use the first row as header if no thead provided.
+            header_cells, rows = rows[0], rows[1:]
+
+        if not header_cells:
+            continue
+
+        ncols = len(header_cells)
+        rows = [r + [""] * (ncols - len(r)) for r in rows]
+
+        lines = ["| " + " | ".join(header_cells) + " |"]
+        lines.append("|" + "|".join(["---"] * ncols) + "|")
+        for r in rows:
+            lines.append("| " + " | ".join(r) + " |")
+
+        block = "\n".join(lines)
+        table.replace_with(BeautifulSoup(block, "html.parser"))
+
+
+def preprocess_for_docx(markdown_text: str) -> str:
+    """Apply the full pipeline of transformations needed to render the
+    Tielogic Markdown documents in DOCX via Pandoc."""
+    text = _strip_style_blocks(markdown_text)
+    text = _strip_frontmatter(text)
+
+    soup = BeautifulSoup(text, "html.parser")
+    _convert_cover(soup)
+    _convert_acceptance(soup)
+    _convert_status_cards(soup)
+    _convert_financial_tables(soup)
+    _convert_badges(soup)
+    _convert_page_breaks(soup)
+
+    out = str(soup)
+    # Collapse 3+ blank lines into 2 to keep the document tidy.
+    out = re.sub(r"\n{3,}", "\n\n", out)
+    return out.strip() + "\n"
@@ -2,14 +2,12 @@ from __future__ import annotations

 import asyncio
 import logging
-import re
 from dataclasses import dataclass
 from pathlib import Path

-logger = logging.getLogger(__name__)
+from mcp_docugen.docx_preprocessor import preprocess_for_docx

-_STYLE_BLOCK_RE = re.compile(r"<style\b[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE)
-_FRONTMATTER_DELIM = "---"
+logger = logging.getLogger(__name__)


 class DocxRenderError(Exception):
@@ -22,26 +20,13 @@ class DocxRenderResult:
    size_bytes: int


-def _strip_style_blocks(markdown_text: str) -> str:
-    """Remove `<style>...</style>` blocks: they're meaningless in DOCX and
-    Pandoc would otherwise embed them as raw text."""
-    return _STYLE_BLOCK_RE.sub("", markdown_text)
-
-
-def _strip_frontmatter(markdown_text: str) -> str:
-    """Remove the YAML frontmatter so it doesn't appear as a body table in
-    the DOCX. Frontmatter values were meant for the PDF renderer."""
-    if not markdown_text.startswith(_FRONTMATTER_DELIM):
-        return markdown_text
-    end_marker = f"\n{_FRONTMATTER_DELIM}\n"
-    idx = markdown_text.find(end_marker, len(_FRONTMATTER_DELIM))
-    if idx == -1:
-        return markdown_text
-    return markdown_text[idx + len(end_marker) :].lstrip()
-
-
 def _preprocess(markdown_text: str) -> str:
-    return _strip_style_blocks(_strip_frontmatter(markdown_text))
+    """Strip the bits of the document that only make sense in the PDF
+    pipeline (YAML frontmatter, inline `<style>`) and rewrite the
+    Tielogic-flavoured HTML widgets (cover, status cards, financial
+    tables, signatures, badges) as native Markdown so Pandoc can produce a
+    clean DOCX."""
+    return preprocess_for_docx(markdown_text)


 async def render_markdown_to_docx(
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import textwrap
+
+from mcp_docugen.docx_preprocessor import preprocess_for_docx
+
+
+# Miniature document covering every construct the preprocessor handles:
+# YAML frontmatter, an inline <style> block, the cover (with a two-column
+# info box), a financial table with a total row, a status card containing a
+# badge, and the acceptance (signature) block.
+SAMPLE_DOC = textwrap.dedent(
+    """\
+    ---
+    pdf_options:
+      format: A4
+    ---
+
+    <style>body { color: red; }</style>
+
+    <div class="cover">
+      <div class="brand">TIELOGIC</div>
+      <div class="brand-tagline">Soluzioni Software Industriali</div>
+      <div class="brand-divider"></div>
+      <div class="doc-title">OFFERTA PRODOTTO E INTEGRAZIONE</div>
+      <div class="doc-product">TieMeasureFlow</div>
+      <div class="doc-ref">Rif. OFF-2026-022 | 23 marzo 2026</div>
+      <div class="info-box">
+        <div class="info-col">
+          <div class="info-label">FORNITORE</div>
+          <div class="info-name">Tielogic SRL</div>
+          <div>Via Villanova 39, 36020 Solagna (VI)</div>
+          <div>P.IVA / C.F. 03954890244</div>
+        </div>
+        <div class="info-col">
+          <div class="info-label">CLIENTE</div>
+          <div class="info-name">Ricerca e Misure s.r.l.</div>
+          <div>Via Brigata Julia 21, 35020 Pernumia (PD)</div>
+          <div>Rif. Menoncin</div>
+        </div>
+      </div>
+      <div class="doc-validity">Validità offerta: 23 aprile 2026</div>
+    </div>
+
+    # TieMeasureFlow
+
+    Sistema web SPC.
+
+    ## Costo di setup iniziale
+
+    <table class="financial">
+    <thead><tr><th>Voce</th><th class="num">Importo</th></tr></thead>
+    <tbody>
+    <tr><td>Setup</td><td class="num">€ 3.500,00</td></tr>
+    <tr class="total-row"><td>TOTALE SETUP</td><td class="num">€ 3.500,00</td></tr>
+    </tbody>
+    </table>
+
+    <div class="status-card drift">
+      <div class="name">TEST Z +50MM <span class="badge badge-drift">DRIFT</span></div>
+      <div>Errore cumulativo da 7.8mm a 11.5mm.</div>
+    </div>
+
+    <div class="acceptance">
+    <h2 class="acceptance-title">ACCETTAZIONE</h2>
+    <div class="acceptance-intro">Per accettazione, restituire copia firmata.</div>
+    <div class="signature-grid"><div class="sig-col"><div class="sig-party">Per Tielogic SRL</div><div class="sig-line">Firma e timbro</div></div><div class="sig-col"><div class="sig-party">Per Ricerca e Misure s.r.l.</div><div class="sig-line">Firma e timbro</div></div></div>
+    <div class="place-date">Luogo e data: ____________ 23 marzo 2026</div>
+    </div>
+    """
+)
+
+
+def test_preprocessor_strips_style_and_frontmatter():
+    out = preprocess_for_docx(SAMPLE_DOC)
+    assert "<style>" not in out
+    assert "pdf_options" not in out
+    assert not out.startswith("---")
+
+
+def test_preprocessor_converts_cover_to_markdown():
+    out = preprocess_for_docx(SAMPLE_DOC)
+    assert '<div class="cover">' not in out
+    assert "# TIELOGIC" in out
+    assert "*Soluzioni Software Industriali*" in out
+    assert "## OFFERTA PRODOTTO E INTEGRAZIONE" in out
+    assert "**TieMeasureFlow**" in out
+    assert "Rif. OFF-2026-022 | 23 marzo 2026" in out
+    # Info table 2-col
+    assert "| **FORNITORE** | **CLIENTE** |" in out
+    assert "**Tielogic SRL**" in out
+    assert "**Ricerca e Misure s.r.l.**" in out
+    assert "Via Villanova 39, 36020 Solagna (VI)" in out
+    assert "*Validità offerta: 23 aprile 2026*" in out
+    # newpage after cover
+    assert "\\newpage" in out
+
+
+def test_preprocessor_converts_financial_table_to_pipe():
+    out = preprocess_for_docx(SAMPLE_DOC)
+    assert '<table class="financial">' not in out
+    assert "| Voce | Importo |" in out
+    assert "| Setup | € 3.500,00 |" in out
+    # total row bolded
+    assert "| **TOTALE SETUP** | **€ 3.500,00** |" in out
+
+
+def test_preprocessor_converts_status_card_to_paragraph():
+    out = preprocess_for_docx(SAMPLE_DOC)
+    assert 'class="status-card' not in out
+    assert "**TEST Z +50MM" in out
+    assert "Errore cumulativo da 7.8mm a 11.5mm" in out
+
+
+def test_preprocessor_converts_badges_to_bold():
+    out = preprocess_for_docx(SAMPLE_DOC)
+    assert "badge-drift" not in out
+    # Badge inside status-card name gets absorbed into the bold name string;
+    # checking that the badge text survives somewhere inside a bold span.
+    assert "DRIFT" in out
+    assert "<span" not in out
+
+
+def test_preprocessor_converts_standalone_badges_to_bold():
+    md = "Verdetto: <span class=\"badge badge-fattibile\">FATTIBILE</span>"
+    out = preprocess_for_docx(md)
+    assert "<span" not in out
+    assert "**FATTIBILE**" in out
+
+
+def test_preprocessor_converts_acceptance_to_table():
+    out = preprocess_for_docx(SAMPLE_DOC)
+    assert 'class="acceptance"' not in out
+    assert "## ACCETTAZIONE" in out
+    assert "Per accettazione, restituire copia firmata." in out
+    assert "| **Per Tielogic SRL** | **Per Ricerca e Misure s.r.l.** |" in out
+    assert "| _____________________ | _____________________ |" in out
+    assert "| Firma e timbro | Firma e timbro |" in out
+    assert "Luogo e data" in out
+
+
+def test_preprocessor_idempotent_on_clean_markdown():
+    md = "# Title\n\nA paragraph.\n\n| a | b |\n|---|---|\n| 1 | 2 |\n"
+    out = preprocess_for_docx(md)
+    # No frontmatter to strip, no Tielogic widgets to rewrite.
+    assert "# Title" in out
+    assert "| a | b |" in out
+    assert "<style" not in out
+
+
+def test_preprocessor_no_div_classes_left_in_output():
+    out = preprocess_for_docx(SAMPLE_DOC)
+    for forbidden in (
+        "<div class=\"cover\"",
+        "<div class=\"info-box\"",
+        "<div class=\"info-col\"",
+        "<div class=\"info-label\"",
+        "<div class=\"info-name\"",
+        "<div class=\"doc-title\"",
+        "<div class=\"doc-product\"",
+        "<div class=\"doc-ref\"",
+        "<div class=\"doc-validity\"",
+        "<div class=\"acceptance\"",
+        "<div class=\"signature-grid\"",
+        "<div class=\"sig-col\"",
+        "<div class=\"sig-party\"",
+        "<div class=\"sig-line\"",
+        "<div class=\"place-date\"",
+        "<div class=\"status-card",
+    ):
+        assert forbidden not in out, f"{forbidden!r} still present in output"
@@ -4,11 +4,13 @@ import shutil

 import pytest

-from mcp_docugen.docx_renderer import (
-    DocxRenderError,
-    _preprocess,
+from mcp_docugen.docx_preprocessor import (
    _strip_frontmatter,
    _strip_style_blocks,
+    preprocess_for_docx as _preprocess,
+)
+from mcp_docugen.docx_renderer import (
+    DocxRenderError,
    render_markdown_to_docx,
 )