54bf41efd6
Il DOCX prodotto dalla versione precedente emetteva i div Tielogic
(`<div class="cover">`, `<div class="info-col">`, `<div class="acceptance">`,
`<div class="status-card">`) come testo grezzo: Pandoc non sa
interpretare il CSS-flavoured HTML del PDF e li copia letteralmente
nel documento Word. Anche le tabelle `<table class="financial">`
finivano spezzate cella per cella.
Il fix introduce un preprocessor dedicato che riscrive tutto
l'HTML Tielogic-flavoured in Markdown nativo prima di passare il
documento a Pandoc.
- docx_preprocessor.py: nuovo modulo basato su BeautifulSoup. Rimuove
frontmatter e blocchi <style>, poi riscrive:
* <div class="cover"> → titoli H1/H2, paragrafi, tabella pipe
2-col FORNITORE/CLIENTE, validità in italic, \newpage finale
* <table class="financial"> → tabella pipe Markdown con riga
total-row in **bold**
* <div class="acceptance"> → heading H2 + intro + tabella pipe
con riga firma `_____________________` + luogo/data
* <div class="status-card"> → paragrafo "**name** — descrizione"
* <span class="badge ..."> → testo **bold**
* <div class="page-break"> → \newpage Pandoc-friendly
- docx_renderer.py: delega tutto il preprocessing al nuovo modulo
(più compatto, niente regex sparse).
- pyproject.toml + uv.lock: aggiunta dipendenza beautifulsoup4>=4.12.
- 8 nuovi test unit per il preprocessor (cover, tabelle, badge,
acceptance, idempotenza, niente div residui, badge standalone).
Adattati i test esistenti agli import dal nuovo modulo. 101 test verdi.
Smoke E2E via MCP: l'offerta TieMeasureFlow esce in DOCX leggibile
con tabelle Word native, heading colorati Tielogic e firme in tabella.
102 lines
2.8 KiB
Python
102 lines
2.8 KiB
Python
from __future__ import annotations
|
|
|
|
import shutil
|
|
|
|
import pytest
|
|
|
|
from mcp_docugen.docx_preprocessor import (
|
|
_strip_frontmatter,
|
|
_strip_style_blocks,
|
|
preprocess_for_docx as _preprocess,
|
|
)
|
|
from mcp_docugen.docx_renderer import (
|
|
DocxRenderError,
|
|
render_markdown_to_docx,
|
|
)
|
|
|
|
|
|
def test_strip_style_blocks_removes_simple_block():
    """Check that a one-line ``<style>`` element and its CSS are gone after stripping."""
    source = "Pre\n<style>h1 { color: red; }</style>\nPost"
    # Neither the opening tag nor the CSS rule may survive.
    for fragment in ("<style>", "color: red"):
        assert fragment not in _strip_style_blocks(source)
|
|
|
|
|
|
def test_strip_style_blocks_removes_multiline_block_with_attrs():
    """Check that a multi-line ``<style>`` element with attributes is stripped.

    The text before and after the element is expected to remain.
    """
    source = "\n".join(
        [
            "Before",
            "<style type=\"text/css\">",
            " body { font-family: Inter; }",
            " h1 { color: blue; }",
            "</style>",
            "After",
        ]
    )
    stripped = _strip_style_blocks(source)
    # The tag (matched without its closing bracket, since it has attributes)
    # and the CSS payload must both be gone.
    assert "<style" not in stripped
    assert "Inter" not in stripped
    # Surrounding content is untouched.
    assert "Before" in stripped
    assert "After" in stripped
|
|
|
|
|
|
def test_strip_style_blocks_no_style_is_noop():
    """Check that text without a ``<style>`` element is returned unchanged."""
    plain = "# Just markdown\n\nNo style here."
    result = _strip_style_blocks(plain)
    assert result == plain
|
|
|
|
|
|
def test_strip_frontmatter_removes_block():
    """Check that a leading ``---`` delimited block is removed and the body comes first."""
    source = (
        "---\n"
        "foo: bar\n"
        "pdf_options:\n"
        " format: A4\n"
        "---\n"
        "\n"
        "# Body\n"
    )
    body = _strip_frontmatter(source)
    assert body.startswith("# Body")
    assert "pdf_options" not in body
|
|
|
|
|
|
def test_strip_frontmatter_no_frontmatter_returns_input_unchanged():
    """Check that text with no frontmatter block is returned as-is."""
    plain = "# Body only\n\nText."
    result = _strip_frontmatter(plain)
    assert result == plain
|
|
|
|
|
|
def test_strip_frontmatter_unclosed_returns_input_unchanged():
    """Check that an opening ``---`` with no closing delimiter leaves the text alone."""
    unterminated = "---\nfoo: bar\nno closing delim"
    result = _strip_frontmatter(unterminated)
    assert result == unterminated
|
|
|
|
|
|
def test_preprocess_strips_both_frontmatter_and_style():
    """Check that the preprocessor drops frontmatter and ``<style>`` but keeps the body."""
    frontmatter = "---\nfoo: bar\n---\n\n"
    style_block = "<style>body { color: red; }</style>\n\n"
    body = "# Body\n\nContent.\n"

    processed = _preprocess(frontmatter + style_block + body)

    assert "foo: bar" not in processed
    assert "<style>" not in processed
    assert "# Body" in processed
|
|
|
|
|
|
@pytest.mark.skipif(
    shutil.which("pandoc") is None,
    reason="pandoc binary not available; runs in container or CI with pandoc installed",
)
async def test_render_markdown_to_docx_roundtrip():
    """Render a small document and check the result looks like a DOCX file.

    Skipped when no ``pandoc`` executable is found on PATH.
    """
    sections = [
        "---\nfoo: bar\n---\n\n",
        "<style>h1 { color: red; }</style>\n\n",
        "# Hello\n\n",
        "| A | B |\n|---|---|\n| 1 | 2 |\n\n",
        "**bold** and *italic*.\n",
    ]
    rendered = await render_markdown_to_docx("".join(sections))

    # A DOCX file is a ZIP container, so it must open with the ZIP
    # local-file-header magic bytes.
    assert rendered.docx_bytes.startswith(b"PK\x03\x04")
    assert rendered.size_bytes > 1000
|
|
|
|
|
|
async def test_render_empty_markdown_raises():
    """Check that empty and whitespace-only input are rejected with ``DocxRenderError``."""
    for blank in ("", " \n\n "):
        with pytest.raises(DocxRenderError):
            await render_markdown_to_docx(blank)
|
|
|
|
|
|
async def test_render_only_frontmatter_and_style_raises():
    """Check that input made only of frontmatter, ``<style>`` and whitespace is rejected."""
    frontmatter = "---\nfoo: bar\n---\n\n"
    style_block = "<style>h1{}</style>\n\n"
    trailing_blank = " \n"
    source = frontmatter + style_block + trailing_blank
    with pytest.raises(DocxRenderError):
        await render_markdown_to_docx(source)
|