feat(agents): hypothesis retry-with-error-feedback (max 1 retry)

HypothesisAgent.propose ora riprova una volta in caso di parse o validation error: il prompt user del retry include l'output precedente (troncato a 800 char) e il messaggio di errore, così l'LLM può auto-correggersi. Configurabile via max_retries (default 1). Cambia il modello dati di HypothesisProposal: completion (singolare) diventa completions: list[CompletionResult], accompagnato da n_attempts (numero di tentativi effettuati). L'orchestrator itera su completions per registrare il costo di ogni chiamata LLM, inclusi i retry. Phase 1 v4 mostrava il 64% di parse failure recuperabili: il retry punta a tagliare quel tasso senza inflazionare i token oltre 2x nel worst case. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 21:20:47 +02:00
parent 44eb6436c1
commit d4fcb42fc5
3 changed files with 193 additions and 47 deletions
@@ -60,7 +60,8 @@ def test_hypothesis_agent_calls_llm_and_parses(mocker):  # type: ignore[no-untyp
    agent = HypothesisAgent(llm=fake_llm)
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is not None
-    assert proposal.completion.input_tokens == 200
+    assert proposal.completions[0].input_tokens == 200
+    assert proposal.n_attempts == 1
    fake_llm.complete.assert_called_once()


@@ -73,10 +74,12 @@ def test_hypothesis_agent_returns_none_on_parse_error(mocker):  # type: ignore[n
        tier=ModelTier.C,
        model="qwen",
    )
-    agent = HypothesisAgent(llm=fake_llm)
+    agent = HypothesisAgent(llm=fake_llm, max_retries=0)
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is None
    assert proposal.parse_error is not None
+    assert proposal.n_attempts == 1
+    assert fake_llm.complete.call_count == 1


 def test_hypothesis_agent_extracts_json_from_markdown_fence(mocker):  # type: ignore[no-untyped-def]
@@ -123,8 +126,91 @@ def test_hypothesis_agent_returns_error_on_invalid_strategy(mocker):  # type: ig
        tier=ModelTier.C,
        model="qwen",
    )
-    agent = HypothesisAgent(llm=fake_llm)
+    agent = HypothesisAgent(llm=fake_llm, max_retries=0)
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is None
    assert proposal.parse_error is not None
    assert "wibble" in proposal.parse_error or "unknown" in proposal.parse_error
+
+
+def test_hypothesis_agent_retries_on_parse_error_and_succeeds(mocker):  # type: ignore[no-untyped-def]
+    """Primo output malformato → secondo output valido → strategia accettata."""
+    fake_llm = mocker.MagicMock()
+    fake_llm.complete.side_effect = [
+        CompletionResult(
+            text="this is not JSON at all",
+            input_tokens=200,
+            output_tokens=80,
+            tier=ModelTier.C,
+            model="qwen",
+        ),
+        CompletionResult(
+            text="```json\n" + VALID_STRATEGY_JSON + "\n```",
+            input_tokens=300,
+            output_tokens=120,
+            tier=ModelTier.C,
+            model="qwen",
+        ),
+    ]
+    agent = HypothesisAgent(llm=fake_llm, max_retries=1)
+    proposal = agent.propose(make_genome(), make_summary())
+    assert proposal.strategy is not None
+    assert proposal.n_attempts == 2
+    assert len(proposal.completions) == 2
+    assert proposal.completions[0].input_tokens == 200
+    assert proposal.completions[1].input_tokens == 300
+    assert fake_llm.complete.call_count == 2
+    # Il secondo prompt user deve contenere il marker corrective.
+    second_call_kwargs = fake_llm.complete.call_args_list[1].kwargs
+    assert "TENTATIVO PRECEDENTE FALLITO" in second_call_kwargs["user"]
+    assert "this is not JSON at all" in second_call_kwargs["user"]
+
+
+def test_hypothesis_agent_gives_up_after_max_retries(mocker):  # type: ignore[no-untyped-def]
+    """Entrambi i tentativi falliscono → strategy None, errori concatenati."""
+    fake_llm = mocker.MagicMock()
+    fake_llm.complete.side_effect = [
+        CompletionResult(
+            text="garbage attempt 1",
+            input_tokens=200,
+            output_tokens=50,
+            tier=ModelTier.C,
+            model="qwen",
+        ),
+        CompletionResult(
+            text="garbage attempt 2",
+            input_tokens=250,
+            output_tokens=60,
+            tier=ModelTier.C,
+            model="qwen",
+        ),
+    ]
+    agent = HypothesisAgent(llm=fake_llm, max_retries=1)
+    proposal = agent.propose(make_genome(), make_summary())
+    assert proposal.strategy is None
+    assert proposal.n_attempts == 2
+    assert len(proposal.completions) == 2
+    assert fake_llm.complete.call_count == 2
+    assert proposal.parse_error is not None
+    assert "attempt 1" in proposal.parse_error
+    assert "attempt 2" in proposal.parse_error
+    # raw_text deve riflettere l'ULTIMO output (non il primo).
+    assert proposal.raw_text == "garbage attempt 2"
+
+
+def test_hypothesis_agent_no_retry_when_first_succeeds(mocker):  # type: ignore[no-untyped-def]
+    """Primo tentativo OK → nessun retry, anche con max_retries=1 di default."""
+    fake_llm = mocker.MagicMock()
+    fake_llm.complete.return_value = CompletionResult(
+        text=VALID_STRATEGY_JSON,
+        input_tokens=200,
+        output_tokens=80,
+        tier=ModelTier.C,
+        model="qwen",
+    )
+    agent = HypothesisAgent(llm=fake_llm)  # default max_retries=1
+    proposal = agent.propose(make_genome(), make_summary())
+    assert proposal.strategy is not None
+    assert proposal.n_attempts == 1
+    assert len(proposal.completions) == 1
+    assert fake_llm.complete.call_count == 1