feat(agents): hypothesis retry-with-error-feedback (max 1 retry)

HypothesisAgent.propose ora riprova una volta in caso di parse o
validation error: il prompt user del retry include l'output precedente
(troncato a 800 char) e il messaggio di errore, così l'LLM può
auto-correggersi. Configurabile via max_retries (default 1).

Cambia il modello dati di HypothesisProposal: completion (singolare)
diventa completions: list[CompletionResult] con n_attempts. L'orchestrator
itera su completions per registrare il costo di ogni chiamata LLM,
incluse le retry.

Phase 1 v4 mostrava il 64% di parse failure recuperabili: il retry punta
a ridurre quel tasso senza gonfiare il consumo di token oltre 2x nel caso peggiore.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-10 21:20:47 +02:00
parent 44eb6436c1
commit d4fcb42fc5
3 changed files with 193 additions and 47 deletions
+87 -29
View File
@@ -1,7 +1,7 @@
from __future__ import annotations from __future__ import annotations
import re import re
from dataclasses import dataclass from dataclasses import dataclass, field
from ..genome.hypothesis import HypothesisAgentGenome from ..genome.hypothesis import HypothesisAgentGenome
from ..llm.client import CompletionResult, LLMClient from ..llm.client import CompletionResult, LLMClient
@@ -23,10 +23,20 @@ class MarketSummary:
@dataclass(frozen=True) @dataclass(frozen=True)
class HypothesisProposal: class HypothesisProposal:
"""Risultato di una propose() del HypothesisAgent.
``completions`` contiene SEMPRE almeno un elemento: il primo tentativo.
Se il primo tentativo fallisce e c'e' budget di retry, vengono accodate
le completions successive, una per ogni retry effettuato.
``n_attempts == len(completions)``. ``raw_text`` riflette l'ULTIMO output
LLM osservato (quello che ha prodotto strategy o l'ultimo parse_error).
"""
strategy: Strategy | None strategy: Strategy | None
raw_text: str raw_text: str
completion: CompletionResult completions: list[CompletionResult] = field(default_factory=list)
parse_error: str | None = None parse_error: str | None = None
n_attempts: int = 1
SYSTEM_TEMPLATE = """\ SYSTEM_TEMPLATE = """\
@@ -113,7 +123,7 @@ USER_TEMPLATE = """\
Mercato: {symbol} timeframe {timeframe}, {n_bars} barre osservate. Mercato: {symbol} timeframe {timeframe}, {n_bars} barre osservate.
Statistiche return: mean={return_mean:.5f}, std={return_std:.5f}, \ Statistiche return: mean={return_mean:.5f}, std={return_std:.5f}, \
skew={skew:.3f}, kurt={kurtosis:.3f}. skew={skew:.3f}, kurt={kurtosis:.3f}.
Regime volatilità : {volatility_regime}. Regime volatilità: {volatility_regime}.
Feature accessibili dal tuo genoma: {feature_access}. Feature accessibili dal tuo genoma: {feature_access}.
Lookback massimo che puoi usare nel ragionamento: {lookback_window} barre. Lookback massimo che puoi usare nel ragionamento: {lookback_window} barre.
@@ -122,6 +132,21 @@ Genera una strategia che cerchi anomalie sfruttabili in questo regime.
""" """
# User prompt used for a retry attempt: the original user prompt, followed by
# the previous (failed) output and its error message, then an instruction to
# answer again with a single valid JSON object inside a ```json fence.
# Placeholders: {original_user}, {previous_raw}, {previous_error}.
_RETRY_TEMPLATE = """\
{original_user}
--- TENTATIVO PRECEDENTE FALLITO ---
Output: {previous_raw}
Errore: {previous_error}
---
Correggi l'errore e rispondi di nuovo con un singolo oggetto JSON valido
dentro fence ```json...```, seguendo strettamente lo schema fornito nel
SYSTEM message.
"""
# Maximum number of characters of the previous raw output that are copied into
# the retry prompt (the previous output is sliced to this length).
_RETRY_RAW_TRUNCATE = 800
_JSON_FENCE_RE = re.compile( _JSON_FENCE_RE = re.compile(
r"```(?:json)?\s*(\{[\s\S]*\})\s*```", r"```(?:json)?\s*(\{[\s\S]*\})\s*```",
re.MULTILINE, re.MULTILINE,
@@ -175,9 +200,25 @@ def _extract_json(text: str) -> str | None:
return _balance_braces(stripped) return _balance_braces(stripped)
def _try_parse(text: str) -> tuple[Strategy | None, str | None]:
    """Extract, parse and validate a strategy from raw LLM output.

    Returns a ``(strategy, error)`` pair in which exactly one element is
    ``None``: the validated strategy on success, otherwise the error message.
    """
    json_payload = _extract_json(text)
    if json_payload is None:
        return None, "no JSON object found in output"

    try:
        candidate = parse_strategy(json_payload)
        validate_strategy(candidate)
    except (ParseError, ValidationError) as exc:
        # Parse and validation failures are reported to the caller as text.
        return None, str(exc)
    else:
        return candidate, None
class HypothesisAgent: class HypothesisAgent:
def __init__(self, llm: LLMClient): def __init__(self, llm: LLMClient, max_retries: int = 1):
if max_retries < 0:
raise ValueError("max_retries must be >= 0")
self._llm = llm self._llm = llm
self._max_retries = max_retries
def propose( def propose(
self, self,
@@ -188,7 +229,7 @@ class HypothesisAgent:
cognitive_style=genome.cognitive_style, cognitive_style=genome.cognitive_style,
system_prompt=genome.system_prompt, system_prompt=genome.system_prompt,
) )
user = USER_TEMPLATE.format( original_user = USER_TEMPLATE.format(
symbol=market.symbol, symbol=market.symbol,
timeframe=market.timeframe, timeframe=market.timeframe,
n_bars=market.n_bars, n_bars=market.n_bars,
@@ -201,28 +242,45 @@ class HypothesisAgent:
lookback_window=genome.lookback_window, lookback_window=genome.lookback_window,
) )
completion = self._llm.complete(genome, system=system, user=user) completions: list[CompletionResult] = []
errors: list[str] = []
last_raw = ""
max_attempts = 1 + self._max_retries
payload = _extract_json(completion.text) for attempt in range(max_attempts):
if payload is None: if attempt == 0:
return HypothesisProposal( user = original_user
strategy=None, else:
raw_text=completion.text, truncated = last_raw[:_RETRY_RAW_TRUNCATE]
completion=completion, user = _RETRY_TEMPLATE.format(
parse_error="no JSON object found in output", original_user=original_user,
) previous_raw=truncated,
try: previous_error=errors[-1],
ast = parse_strategy(payload) )
validate_strategy(ast)
return HypothesisProposal( completion = self._llm.complete(genome, system=system, user=user)
strategy=ast, completions.append(completion)
raw_text=completion.text, last_raw = completion.text
completion=completion,
) strategy, err = _try_parse(completion.text)
except (ParseError, ValidationError) as e: if strategy is not None:
return HypothesisProposal( return HypothesisProposal(
strategy=None, strategy=strategy,
raw_text=completion.text, raw_text=completion.text,
completion=completion, completions=completions,
parse_error=str(e), parse_error=None,
) n_attempts=len(completions),
)
assert err is not None
errors.append(err)
chained = " | ".join(
f"attempt {i + 1}: {e}" for i, e in enumerate(errors)
)
return HypothesisProposal(
strategy=None,
raw_text=last_raw,
completions=completions,
parse_error=chained,
n_attempts=len(completions),
)
+17 -15
View File
@@ -99,21 +99,23 @@ def run_phase1(
continue # elite gia' valutata in generazione precedente continue # elite gia' valutata in generazione precedente
repo.save_genome(run_id=run_id, generation_idx=gen, genome=genome) repo.save_genome(run_id=run_id, generation_idx=gen, genome=genome)
proposal = hypothesis_agent.propose(genome, market) proposal = hypothesis_agent.propose(genome, market)
cost_record = cost_tracker.record( # Registra costo per OGNI completion (incluse retry).
input_tokens=proposal.completion.input_tokens, for completion in proposal.completions:
output_tokens=proposal.completion.output_tokens, cost_record = cost_tracker.record(
tier=proposal.completion.tier, input_tokens=completion.input_tokens,
run_id=run_id, output_tokens=completion.output_tokens,
agent_id=genome.id, tier=completion.tier,
) run_id=run_id,
repo.save_cost_record( agent_id=genome.id,
run_id=run_id, )
agent_id=genome.id, repo.save_cost_record(
tier=cost_record.tier.value, run_id=run_id,
input_tokens=cost_record.input_tokens, agent_id=genome.id,
output_tokens=cost_record.output_tokens, tier=cost_record.tier.value,
cost_usd=cost_record.cost_usd, input_tokens=cost_record.input_tokens,
) output_tokens=cost_record.output_tokens,
cost_usd=cost_record.cost_usd,
)
if proposal.strategy is None: if proposal.strategy is None:
repo.save_evaluation( repo.save_evaluation(
+89 -3
View File
@@ -60,7 +60,8 @@ def test_hypothesis_agent_calls_llm_and_parses(mocker): # type: ignore[no-untyp
agent = HypothesisAgent(llm=fake_llm) agent = HypothesisAgent(llm=fake_llm)
proposal = agent.propose(make_genome(), make_summary()) proposal = agent.propose(make_genome(), make_summary())
assert proposal.strategy is not None assert proposal.strategy is not None
assert proposal.completion.input_tokens == 200 assert proposal.completions[0].input_tokens == 200
assert proposal.n_attempts == 1
fake_llm.complete.assert_called_once() fake_llm.complete.assert_called_once()
@@ -73,10 +74,12 @@ def test_hypothesis_agent_returns_none_on_parse_error(mocker): # type: ignore[n
tier=ModelTier.C, tier=ModelTier.C,
model="qwen", model="qwen",
) )
agent = HypothesisAgent(llm=fake_llm) agent = HypothesisAgent(llm=fake_llm, max_retries=0)
proposal = agent.propose(make_genome(), make_summary()) proposal = agent.propose(make_genome(), make_summary())
assert proposal.strategy is None assert proposal.strategy is None
assert proposal.parse_error is not None assert proposal.parse_error is not None
assert proposal.n_attempts == 1
assert fake_llm.complete.call_count == 1
def test_hypothesis_agent_extracts_json_from_markdown_fence(mocker): # type: ignore[no-untyped-def] def test_hypothesis_agent_extracts_json_from_markdown_fence(mocker): # type: ignore[no-untyped-def]
@@ -123,8 +126,91 @@ def test_hypothesis_agent_returns_error_on_invalid_strategy(mocker): # type: ig
tier=ModelTier.C, tier=ModelTier.C,
model="qwen", model="qwen",
) )
agent = HypothesisAgent(llm=fake_llm) agent = HypothesisAgent(llm=fake_llm, max_retries=0)
proposal = agent.propose(make_genome(), make_summary()) proposal = agent.propose(make_genome(), make_summary())
assert proposal.strategy is None assert proposal.strategy is None
assert proposal.parse_error is not None assert proposal.parse_error is not None
assert "wibble" in proposal.parse_error or "unknown" in proposal.parse_error assert "wibble" in proposal.parse_error or "unknown" in proposal.parse_error
def test_hypothesis_agent_retries_on_parse_error_and_succeeds(mocker):  # type: ignore[no-untyped-def]
    """Malformed first output, valid second output: the strategy is accepted."""
    malformed = CompletionResult(
        text="this is not JSON at all",
        input_tokens=200,
        output_tokens=80,
        tier=ModelTier.C,
        model="qwen",
    )
    fenced_valid = CompletionResult(
        text="```json\n" + VALID_STRATEGY_JSON + "\n```",
        input_tokens=300,
        output_tokens=120,
        tier=ModelTier.C,
        model="qwen",
    )
    fake_llm = mocker.MagicMock()
    fake_llm.complete.side_effect = [malformed, fenced_valid]

    agent = HypothesisAgent(llm=fake_llm, max_retries=1)
    proposal = agent.propose(make_genome(), make_summary())

    assert proposal.strategy is not None
    assert proposal.n_attempts == 2
    assert len(proposal.completions) == 2
    first_completion, second_completion = proposal.completions
    assert first_completion.input_tokens == 200
    assert second_completion.input_tokens == 300
    assert fake_llm.complete.call_count == 2

    # The user prompt of the second call must carry the corrective marker
    # and the previous raw output.
    retry_prompt = fake_llm.complete.call_args_list[1].kwargs["user"]
    assert "TENTATIVO PRECEDENTE FALLITO" in retry_prompt
    assert "this is not JSON at all" in retry_prompt
def test_hypothesis_agent_gives_up_after_max_retries(mocker):  # type: ignore[no-untyped-def]
    """Both attempts fail: strategy is None and the errors are chained."""
    first_failure = CompletionResult(
        text="garbage attempt 1",
        input_tokens=200,
        output_tokens=50,
        tier=ModelTier.C,
        model="qwen",
    )
    second_failure = CompletionResult(
        text="garbage attempt 2",
        input_tokens=250,
        output_tokens=60,
        tier=ModelTier.C,
        model="qwen",
    )
    fake_llm = mocker.MagicMock()
    fake_llm.complete.side_effect = [first_failure, second_failure]

    agent = HypothesisAgent(llm=fake_llm, max_retries=1)
    proposal = agent.propose(make_genome(), make_summary())

    assert proposal.strategy is None
    assert proposal.n_attempts == 2
    assert len(proposal.completions) == 2
    assert fake_llm.complete.call_count == 2

    chained_error = proposal.parse_error
    assert chained_error is not None
    for label in ("attempt 1", "attempt 2"):
        assert label in chained_error

    # raw_text must reflect the LAST output, not the first one.
    assert proposal.raw_text == "garbage attempt 2"
def test_hypothesis_agent_no_retry_when_first_succeeds(mocker):  # type: ignore[no-untyped-def]
    """First attempt succeeds: no retry, even with the default max_retries=1."""
    valid_completion = CompletionResult(
        text=VALID_STRATEGY_JSON,
        input_tokens=200,
        output_tokens=80,
        tier=ModelTier.C,
        model="qwen",
    )
    fake_llm = mocker.MagicMock()
    fake_llm.complete.return_value = valid_completion

    # No max_retries argument: the default (1) is in effect.
    agent = HypothesisAgent(llm=fake_llm)
    proposal = agent.propose(make_genome(), make_summary())

    assert proposal.strategy is not None
    assert proposal.n_attempts == 1
    assert len(proposal.completions) == 1
    assert fake_llm.complete.call_count == 1