feat(agents): hypothesis retry-with-error-feedback (max 1 retry)

HypothesisAgent.propose ora riprova una volta in caso di parse o validation error: il prompt user del retry include l'output precedente (troncato a 800 char) e il messaggio di errore, così l'LLM può auto-correggersi. Configurabile via max_retries (default 1). Cambia il modello dati di HypothesisProposal: completion (singolare) diventa completions: list[CompletionResult] con n_attempts. L'orchestrator itera su completions per registrare il costo di ogni chiamata LLM, incluse le retry. Phase 1 v4 mostrava 64% di parse failure recuperabili: il retry punta a tagliare quel tasso senza inflazionare i token oltre 2x worst-case. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 21:20:47 +02:00
parent 44eb6436c1
commit d4fcb42fc5
3 changed files with 193 additions and 47 deletions
@@ -1,7 +1,7 @@
 from __future__ import annotations
 import re
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from ..genome.hypothesis import HypothesisAgentGenome
 from ..llm.client import CompletionResult, LLMClient
@@ -23,10 +23,20 @@ class MarketSummary:
@dataclass(frozen=True)
 class HypothesisProposal:
    """Risultato di una propose() del HypothesisAgent.
    ``completions`` contiene SEMPRE almeno un elemento: il primo tentativo.
    Se il primo tentativo fallisce e c'e' budget di retry, vengono accodate
    le completions successive, una per ogni retry effettuato.
    ``n_attempts == len(completions)``. ``raw_text`` riflette l'ULTIMO output
    LLM osservato (quello che ha prodotto strategy o l'ultimo parse_error).
    """
    strategy: Strategy | None
    raw_text: str
-    completion: CompletionResult
+    completions: list[CompletionResult] = field(default_factory=list)
    parse_error: str | None = None
    n_attempts: int = 1
 SYSTEM_TEMPLATE = """\
@@ -113,7 +123,7 @@ USER_TEMPLATE = """\
 Mercato: {symbol} timeframe {timeframe}, {n_bars} barre osservate.
 Statistiche return: mean={return_mean:.5f}, std={return_std:.5f}, \
 skew={skew:.3f}, kurt={kurtosis:.3f}.
-Regime volatilitÃ : {volatility_regime}.
+Regime volatilità: {volatility_regime}.
 Feature accessibili dal tuo genoma: {feature_access}.
 Lookback massimo che puoi usare nel ragionamento: {lookback_window} barre.
@@ -122,6 +132,21 @@ Genera una strategia che cerchi anomalie sfruttabili in questo regime.
 """
 _RETRY_TEMPLATE = """\
 {original_user}
 --- TENTATIVO PRECEDENTE FALLITO ---
 Output: {previous_raw}
 Errore: {previous_error}
 ---
 Correggi l'errore e rispondi di nuovo con un singolo oggetto JSON valido
 dentro fence ```json...```, seguendo strettamente lo schema fornito nel
 SYSTEM message.
 """
 _RETRY_RAW_TRUNCATE = 800
 _JSON_FENCE_RE = re.compile(
    r"```(?:json)?\s*(\{[\s\S]*\})\s*```",
    re.MULTILINE,
@@ -175,9 +200,25 @@ def _extract_json(text: str) -> str | None:
    return _balance_braces(stripped)
 def _try_parse(text: str) -> tuple[Strategy | None, str | None]:
    """Estrai+parsea+valida. Ritorna (strategy, error). Esattamente uno e' None."""
    payload = _extract_json(text)
    if payload is None:
        return None, "no JSON object found in output"
    try:
        ast = parse_strategy(payload)
        validate_strategy(ast)
    except (ParseError, ValidationError) as e:
        return None, str(e)
    return ast, None
 class HypothesisAgent:
-    def __init__(self, llm: LLMClient):
+    def __init__(self, llm: LLMClient, max_retries: int = 1):
        if max_retries < 0:
            raise ValueError("max_retries must be >= 0")
        self._llm = llm
        self._max_retries = max_retries
    def propose(
        self,
@@ -188,7 +229,7 @@ class HypothesisAgent:
            cognitive_style=genome.cognitive_style,
            system_prompt=genome.system_prompt,
        )
-        user = USER_TEMPLATE.format(
+        original_user = USER_TEMPLATE.format(
            symbol=market.symbol,
            timeframe=market.timeframe,
            n_bars=market.n_bars,
@@ -201,28 +242,45 @@ class HypothesisAgent:
            lookback_window=genome.lookback_window,
        )
-        completion = self._llm.complete(genome, system=system, user=user)
+        completions: list[CompletionResult] = []
        errors: list[str] = []
        last_raw = ""
        max_attempts = 1 + self._max_retries
-        payload = _extract_json(completion.text)
+        for attempt in range(max_attempts):
-        if payload is None:
+            if attempt == 0:
-            return HypothesisProposal(
+                user = original_user
-                strategy=None,
+            else:
-                raw_text=completion.text,
+                truncated = last_raw[:_RETRY_RAW_TRUNCATE]
-                completion=completion,
+                user = _RETRY_TEMPLATE.format(
-                parse_error="no JSON object found in output",
+                    original_user=original_user,
-            )
+                    previous_raw=truncated,
-        try:
+                    previous_error=errors[-1],
-            ast = parse_strategy(payload)
+                )
-            validate_strategy(ast)
+
-            return HypothesisProposal(
+            completion = self._llm.complete(genome, system=system, user=user)
-                strategy=ast,
+            completions.append(completion)
-                raw_text=completion.text,
+            last_raw = completion.text
-                completion=completion,
+
-            )
+            strategy, err = _try_parse(completion.text)
-        except (ParseError, ValidationError) as e:
+            if strategy is not None:
-            return HypothesisProposal(
+                return HypothesisProposal(
-                strategy=None,
+                    strategy=strategy,
-                raw_text=completion.text,
+                    raw_text=completion.text,
-                completion=completion,
+                    completions=completions,
-                parse_error=str(e),
+                    parse_error=None,
-            )
+                    n_attempts=len(completions),
                )
            assert err is not None
            errors.append(err)
        chained = " | ".join(
            f"attempt {i + 1}: {e}" for i, e in enumerate(errors)
        )
        return HypothesisProposal(
            strategy=None,
            raw_text=last_raw,
            completions=completions,
            parse_error=chained,
            n_attempts=len(completions),
        )
@@ -99,21 +99,23 @@ def run_phase1(
                    continue  # elite gia' valutata in generazione precedente
                repo.save_genome(run_id=run_id, generation_idx=gen, genome=genome)
                proposal = hypothesis_agent.propose(genome, market)
-                cost_record = cost_tracker.record(
+                # Registra costo per OGNI completion (incluse retry).
-                    input_tokens=proposal.completion.input_tokens,
+                for completion in proposal.completions:
-                    output_tokens=proposal.completion.output_tokens,
+                    cost_record = cost_tracker.record(
-                    tier=proposal.completion.tier,
+                        input_tokens=completion.input_tokens,
-                    run_id=run_id,
+                        output_tokens=completion.output_tokens,
-                    agent_id=genome.id,
+                        tier=completion.tier,
-                )
+                        run_id=run_id,
-                repo.save_cost_record(
+                        agent_id=genome.id,
-                    run_id=run_id,
+                    )
-                    agent_id=genome.id,
+                    repo.save_cost_record(
-                    tier=cost_record.tier.value,
+                        run_id=run_id,
-                    input_tokens=cost_record.input_tokens,
+                        agent_id=genome.id,
-                    output_tokens=cost_record.output_tokens,
+                        tier=cost_record.tier.value,
-                    cost_usd=cost_record.cost_usd,
+                        input_tokens=cost_record.input_tokens,
-                )
+                        output_tokens=cost_record.output_tokens,
                        cost_usd=cost_record.cost_usd,
                    )
                if proposal.strategy is None:
                    repo.save_evaluation(
@@ -60,7 +60,8 @@ def test_hypothesis_agent_calls_llm_and_parses(mocker):  # type: ignore[no-untyp
    agent = HypothesisAgent(llm=fake_llm)
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is not None
-    assert proposal.completion.input_tokens == 200
+    assert proposal.completions[0].input_tokens == 200
    assert proposal.n_attempts == 1
    fake_llm.complete.assert_called_once()
@@ -73,10 +74,12 @@ def test_hypothesis_agent_returns_none_on_parse_error(mocker):  # type: ignore[n
        tier=ModelTier.C,
        model="qwen",
    )
-    agent = HypothesisAgent(llm=fake_llm)
+    agent = HypothesisAgent(llm=fake_llm, max_retries=0)
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is None
    assert proposal.parse_error is not None
    assert proposal.n_attempts == 1
    assert fake_llm.complete.call_count == 1
 def test_hypothesis_agent_extracts_json_from_markdown_fence(mocker):  # type: ignore[no-untyped-def]
@@ -123,8 +126,91 @@ def test_hypothesis_agent_returns_error_on_invalid_strategy(mocker):  # type: ig
        tier=ModelTier.C,
        model="qwen",
    )
-    agent = HypothesisAgent(llm=fake_llm)
+    agent = HypothesisAgent(llm=fake_llm, max_retries=0)
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is None
    assert proposal.parse_error is not None
    assert "wibble" in proposal.parse_error or "unknown" in proposal.parse_error
 def test_hypothesis_agent_retries_on_parse_error_and_succeeds(mocker):  # type: ignore[no-untyped-def]
    """Primo output malformato → secondo output valido → strategia accettata."""
    fake_llm = mocker.MagicMock()
    fake_llm.complete.side_effect = [
        CompletionResult(
            text="this is not JSON at all",
            input_tokens=200,
            output_tokens=80,
            tier=ModelTier.C,
            model="qwen",
        ),
        CompletionResult(
            text="```json\n" + VALID_STRATEGY_JSON + "\n```",
            input_tokens=300,
            output_tokens=120,
            tier=ModelTier.C,
            model="qwen",
        ),
    ]
    agent = HypothesisAgent(llm=fake_llm, max_retries=1)
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is not None
    assert proposal.n_attempts == 2
    assert len(proposal.completions) == 2
    assert proposal.completions[0].input_tokens == 200
    assert proposal.completions[1].input_tokens == 300
    assert fake_llm.complete.call_count == 2
    # Il secondo prompt user deve contenere il marker corrective.
    second_call_kwargs = fake_llm.complete.call_args_list[1].kwargs
    assert "TENTATIVO PRECEDENTE FALLITO" in second_call_kwargs["user"]
    assert "this is not JSON at all" in second_call_kwargs["user"]
 def test_hypothesis_agent_gives_up_after_max_retries(mocker):  # type: ignore[no-untyped-def]
    """Entrambi i tentativi falliscono → strategy None, errori concatenati."""
    fake_llm = mocker.MagicMock()
    fake_llm.complete.side_effect = [
        CompletionResult(
            text="garbage attempt 1",
            input_tokens=200,
            output_tokens=50,
            tier=ModelTier.C,
            model="qwen",
        ),
        CompletionResult(
            text="garbage attempt 2",
            input_tokens=250,
            output_tokens=60,
            tier=ModelTier.C,
            model="qwen",
        ),
    ]
    agent = HypothesisAgent(llm=fake_llm, max_retries=1)
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is None
    assert proposal.n_attempts == 2
    assert len(proposal.completions) == 2
    assert fake_llm.complete.call_count == 2
    assert proposal.parse_error is not None
    assert "attempt 1" in proposal.parse_error
    assert "attempt 2" in proposal.parse_error
    # raw_text deve riflettere l'ULTIMO output (non il primo).
    assert proposal.raw_text == "garbage attempt 2"
 def test_hypothesis_agent_no_retry_when_first_succeeds(mocker):  # type: ignore[no-untyped-def]
    """Primo tentativo OK → nessun retry, anche con max_retries=1 di default."""
    fake_llm = mocker.MagicMock()
    fake_llm.complete.return_value = CompletionResult(
        text=VALID_STRATEGY_JSON,
        input_tokens=200,
        output_tokens=80,
        tier=ModelTier.C,
        model="qwen",
    )
    agent = HypothesisAgent(llm=fake_llm)  # default max_retries=1
    proposal = agent.propose(make_genome(), make_summary())
    assert proposal.strategy is not None
    assert proposal.n_attempts == 1
    assert len(proposal.completions) == 1
    assert fake_llm.complete.call_count == 1