From 54bf41efd6edfa0ba696728d680d55a601483f91 Mon Sep 17 00:00:00 2001 From: AdrianoDev Date: Sun, 26 Apr 2026 11:26:52 +0200 Subject: [PATCH] =?UTF-8?q?fix(mcp-docugen):=20preprocessor=20HTML?= =?UTF-8?q?=E2=86=92Markdown=20per=20output=20Word=20leggibile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Il DOCX prodotto dalla versione precedente emetteva i div Tielogic (`
<div class="cover">`, `
<div class="acceptance">`, `
<div class="status-card">`, `<div class="page-break">
`) come testo grezzo: Pandoc non sa interpretare il CSS-flavoured HTML del PDF e li copia letteralmente nel documento Word. Anche le tabelle `` finivano spezzate cella per cella. Il fix introduce un preprocessor dedicato che riscrive tutta la HTML Tielogic-flavoured in Markdown nativo prima di passare il documento a Pandoc. - docx_preprocessor.py: nuovo modulo basato su BeautifulSoup. Strippa frontmatter e ", re.DOTALL | re.IGNORECASE) + + +def _strip_style_blocks(text: str) -> str: + return _STYLE_BLOCK_RE.sub("", text) + + +def _strip_frontmatter(text: str) -> str: + if not text.startswith(_FRONTMATTER_DELIM): + return text + end_marker = f"\n{_FRONTMATTER_DELIM}\n" + idx = text.find(end_marker, len(_FRONTMATTER_DELIM)) + if idx == -1: + return text + return text[idx + len(end_marker) :].lstrip() + + +def _text(el: Tag | None) -> str: + return el.get_text(" ", strip=True) if el is not None else "" + + +def _info_col_lines(col: Tag) -> list[str]: + """Extract the rows of an info-col block (FORNITORE/CLIENTE), skipping + the label (used as table header) and bolding the company name.""" + lines: list[str] = [] + for child in col.find_all("div", recursive=False): + classes = set(child.get("class") or []) + if "info-label" in classes: + continue + txt = child.get_text(" ", strip=True) + if not txt: + continue + if "info-name" in classes: + lines.append(f"**{txt}**") + else: + lines.append(txt) + return lines + + +def _convert_cover(soup: BeautifulSoup) -> None: + cover = soup.find("div", class_="cover") + if not isinstance(cover, Tag): + return + + brand = _text(cover.find(class_="brand")) + tagline = _text(cover.find(class_="brand-tagline")) + title = _text(cover.find(class_="doc-title")) + product = _text(cover.find(class_="doc-product")) + ref = _text(cover.find(class_="doc-ref")) + validity = _text(cover.find(class_="doc-validity")) + + info_box = cover.find(class_="info-box") + info_cols = ( + info_box.find_all("div", class_="info-col") if isinstance(info_box, 
Tag) else [] + ) + + blocks: list[str] = [] + if brand: + blocks.append(f"# {brand}") + if tagline: + blocks.append(f"*{tagline}*") + blocks.append("---") + if title: + blocks.append(f"## {title}") + if product: + blocks.append(f"**{product}**") + if ref: + blocks.append(ref) + + if len(info_cols) == 2: + col_a, col_b = info_cols + label_a = _text(col_a.find(class_="info-label")) or "FORNITORE" + label_b = _text(col_b.find(class_="info-label")) or "CLIENTE" + rows_a = _info_col_lines(col_a) + rows_b = _info_col_lines(col_b) + height = max(len(rows_a), len(rows_b)) + rows_a += [""] * (height - len(rows_a)) + rows_b += [""] * (height - len(rows_b)) + + table_lines = [ + f"| **{label_a}** | **{label_b}** |", + "|---|---|", + ] + for a, b in zip(rows_a, rows_b): + table_lines.append(f"| {a} | {b} |") + blocks.append("\n".join(table_lines)) + + if validity: + blocks.append(f"*{validity}*") + + replacement = "\n\n".join(blocks) + "\n\n\\newpage\n" + cover.replace_with(BeautifulSoup(replacement, "html.parser")) + + +def _convert_acceptance(soup: BeautifulSoup) -> None: + acceptance = soup.find("div", class_="acceptance") + if not isinstance(acceptance, Tag): + return + + title_el = acceptance.find(class_="acceptance-title") + intro_el = acceptance.find(class_="acceptance-intro") + sig_grid = acceptance.find(class_="signature-grid") + place_date = acceptance.find(class_="place-date") + + title = _text(title_el) or "ACCETTAZIONE" + intro = _text(intro_el) + + blocks = [f"## {title}"] + if intro: + blocks.append(intro) + + if isinstance(sig_grid, Tag): + cols = sig_grid.find_all("div", class_="sig-col") + if len(cols) == 2: + party_a = _text(cols[0].find(class_="sig-party")) + party_b = _text(cols[1].find(class_="sig-party")) + line_a = _text(cols[0].find(class_="sig-line")) or "Firma e timbro" + line_b = _text(cols[1].find(class_="sig-line")) or "Firma e timbro" + blocks.append( + "\n".join( + [ + f"| **{party_a}** | **{party_b}** |", + "|---|---|", + "| 
_____________________ | _____________________ |", + f"| {line_a} | {line_b} |", + ] + ) + ) + + if isinstance(place_date, Tag): + blocks.append(_text(place_date)) + + replacement = "\n\n".join(blocks) + acceptance.replace_with(BeautifulSoup(replacement, "html.parser")) + + +def _convert_status_cards(soup: BeautifulSoup) -> None: + for card in soup.find_all("div", class_="status-card"): + if not isinstance(card, Tag): + continue + name_el = card.find(class_="name") + name = _text(name_el) + # remaining text (sibling divs after the name) + body_parts: list[str] = [] + for child in card.find_all("div", recursive=False): + if child is name_el: + continue + txt = child.get_text(" ", strip=True) + if txt: + body_parts.append(txt) + body = " ".join(body_parts) + block = f"**{name}** — {body}" if body else f"**{name}**" + card.replace_with(BeautifulSoup(block, "html.parser")) + + +def _convert_badges(soup: BeautifulSoup) -> None: + for span in soup.find_all("span", class_="badge"): + if not isinstance(span, Tag): + continue + txt = span.get_text(" ", strip=True) + span.replace_with(f"**{txt}**" if txt else "") + + +def _convert_page_breaks(soup: BeautifulSoup) -> None: + for el in soup.find_all("div", class_="page-break"): + if not isinstance(el, Tag): + continue + el.replace_with(BeautifulSoup("\n\n\\newpage\n\n", "html.parser")) + + +def _convert_financial_tables(soup: BeautifulSoup) -> None: + """Rewrite `
` (with custom td/tr classes) as + a clean pipe-table Markdown block. Pandoc handles raw HTML
+ inconsistently when extra attributes/classes are present.""" + for table in soup.find_all("table", class_="financial"): + if not isinstance(table, Tag): + continue + + header_cells: list[str] = [] + thead = table.find("thead") + if isinstance(thead, Tag): + for th in thead.find_all("th"): + header_cells.append(th.get_text(" ", strip=True)) + + rows: list[list[str]] = [] + body = table.find("tbody") or table + for tr in body.find_all("tr"): + classes = set(tr.get("class") or []) + cells = [td.get_text(" ", strip=True) for td in tr.find_all(["td", "th"])] + if not cells: + continue + if "total-row" in classes: + cells = [f"**{c}**" for c in cells] + # Skip if this row was already pulled in via thead + if cells == header_cells: + continue + rows.append(cells) + + if not header_cells and rows: + # Use the first row as header if no thead provided. + header_cells, rows = rows[0], rows[1:] + + if not header_cells: + continue + + ncols = len(header_cells) + rows = [r + [""] * (ncols - len(r)) for r in rows] + + lines = ["| " + " | ".join(header_cells) + " |"] + lines.append("|" + "|".join(["---"] * ncols) + "|") + for r in rows: + lines.append("| " + " | ".join(r) + " |") + + block = "\n".join(lines) + table.replace_with(BeautifulSoup(block, "html.parser")) + + +def preprocess_for_docx(markdown_text: str) -> str: + """Apply the full pipeline of transformations needed to render the + Tielogic Markdown documents in DOCX via Pandoc.""" + text = _strip_style_blocks(markdown_text) + text = _strip_frontmatter(text) + + soup = BeautifulSoup(text, "html.parser") + _convert_cover(soup) + _convert_acceptance(soup) + _convert_status_cards(soup) + _convert_financial_tables(soup) + _convert_badges(soup) + _convert_page_breaks(soup) + + out = str(soup) + # Collapse 3+ blank lines into 2 to keep the document tidy. 
+ out = re.sub(r"\n{3,}", "\n\n", out) + return out.strip() + "\n" diff --git a/services/mcp-docugen/src/mcp_docugen/docx_renderer.py b/services/mcp-docugen/src/mcp_docugen/docx_renderer.py index c12519e..a5c3f95 100644 --- a/services/mcp-docugen/src/mcp_docugen/docx_renderer.py +++ b/services/mcp-docugen/src/mcp_docugen/docx_renderer.py @@ -2,14 +2,12 @@ from __future__ import annotations import asyncio import logging -import re from dataclasses import dataclass from pathlib import Path -logger = logging.getLogger(__name__) +from mcp_docugen.docx_preprocessor import preprocess_for_docx -_STYLE_BLOCK_RE = re.compile(r"]*>.*?", re.DOTALL | re.IGNORECASE) -_FRONTMATTER_DELIM = "---" +logger = logging.getLogger(__name__) class DocxRenderError(Exception): @@ -22,26 +20,13 @@ class DocxRenderResult: size_bytes: int -def _strip_style_blocks(markdown_text: str) -> str: - """Remove `` blocks: they're meaningless in DOCX and - Pandoc would otherwise embed them as raw text.""" - return _STYLE_BLOCK_RE.sub("", markdown_text) - - -def _strip_frontmatter(markdown_text: str) -> str: - """Remove the YAML frontmatter so it doesn't appear as a body table in - the DOCX. Frontmatter values were meant for the PDF renderer.""" - if not markdown_text.startswith(_FRONTMATTER_DELIM): - return markdown_text - end_marker = f"\n{_FRONTMATTER_DELIM}\n" - idx = markdown_text.find(end_marker, len(_FRONTMATTER_DELIM)) - if idx == -1: - return markdown_text - return markdown_text[idx + len(end_marker) :].lstrip() - - def _preprocess(markdown_text: str) -> str: - return _strip_style_blocks(_strip_frontmatter(markdown_text)) + """Strip the bits of the document that only make sense in the PDF + pipeline (YAML frontmatter, inline ` + +
+
TIELOGIC
+
Soluzioni Software Industriali
+
+
OFFERTA PRODOTTO E INTEGRAZIONE
+
TieMeasureFlow
+
Rif. OFF-2026-022 | 23 marzo 2026
+
+
+
FORNITORE
+
Tielogic SRL
+
Via Villanova 39, 36020 Solagna (VI)
+
P.IVA / C.F. 03954890244
+
+
+
CLIENTE
+
Ricerca e Misure s.r.l.
+
Via Brigata Julia 21, 35020 Pernumia (PD)
+
Rif. Menoncin
+
+
+
Validità offerta: 23 aprile 2026
+
+ + # TieMeasureFlow + + Sistema web SPC. + + ## Costo di setup iniziale + +
+ + + + + +
VoceImporto
Setup€ 3.500,00
TOTALE SETUP€ 3.500,00
+ +
+
TEST Z +50MM DRIFT
+
Errore cumulativo da 7.8mm a 11.5mm.
+
+ +
+

ACCETTAZIONE

+
Per accettazione, restituire copia firmata.
+
Per Tielogic SRL
Firma e timbro
Per Ricerca e Misure s.r.l.
Firma e timbro
+
Luogo e data: ____________ 23 marzo 2026
+
+ """ +) + + +def test_preprocessor_strips_style_and_frontmatter(): + out = preprocess_for_docx(SAMPLE_DOC) + assert "