feat(agents): hypothesis retry-with-error-feedback (max 1 retry)

HypothesisAgent.propose ora riprova una volta in caso di parse o
validation error: il prompt user del retry include l'output precedente
(troncato a 800 char) e il messaggio di errore, così l'LLM può
auto-correggersi. Configurabile via max_retries (default 1).

Cambia il modello dati di HypothesisProposal: il campo completion (singolare)
diventa completions: list[CompletionResult], affiancato dal nuovo campo
n_attempts (default 1). L'orchestrator itera su completions per registrare
il costo di ogni chiamata LLM, inclusi i retry.

Phase 1 v4 mostrava 64% di parse failure recuperabili: il retry punta
a tagliare quel tasso senza inflazionare i token oltre 2x worst-case.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-10 21:20:47 +02:00
parent 44eb6436c1
commit d4fcb42fc5
3 changed files with 193 additions and 47 deletions
+87 -29
View File
@@ -1,7 +1,7 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from dataclasses import dataclass, field
from ..genome.hypothesis import HypothesisAgentGenome
from ..llm.client import CompletionResult, LLMClient
@@ -23,10 +23,20 @@ class MarketSummary:
@dataclass(frozen=True)
class HypothesisProposal:
    """Result of a single ``propose()`` call of the HypothesisAgent.

    ``completions`` ALWAYS contains at least one element: the first attempt.
    If the first attempt fails and there is retry budget left, the following
    completions are appended, one for each retry performed.

    ``n_attempts == len(completions)``. ``raw_text`` reflects the LAST LLM
    output observed (the one that produced ``strategy`` or the last
    parse error).
    """
    # Parsed strategy, or None when no attempt produced a valid one.
    strategy: Strategy | None
    # Text of the last LLM output observed.
    raw_text: str
    # NOTE(review): this singular field looks like the removed side of the
    # diff hunk (superseded by ``completions`` below) -- confirm before merging.
    completion: CompletionResult
    # One entry per LLM call, in call order (first attempt, then retries).
    # NOTE(review): the default is an empty list, which does not satisfy the
    # "at least one element" invariant stated in the docstring -- callers are
    # expected to always pass it explicitly; confirm.
    completions: list[CompletionResult] = field(default_factory=list)
    # Error message(s) when ``strategy`` is None; None on success.
    parse_error: str | None = None
    # NOTE(review): stored independently of ``completions``; with both defaults
    # (empty list, 1) the documented equality does not hold.
    n_attempts: int = 1
SYSTEM_TEMPLATE = """\
@@ -113,7 +123,7 @@ USER_TEMPLATE = """\
Mercato: {symbol} timeframe {timeframe}, {n_bars} barre osservate.
Statistiche return: mean={return_mean:.5f}, std={return_std:.5f}, \
skew={skew:.3f}, kurt={kurtosis:.3f}.
Regime volatilità : {volatility_regime}.
Regime volatilità: {volatility_regime}.
Feature accessibili dal tuo genoma: {feature_access}.
Lookback massimo che puoi usare nel ragionamento: {lookback_window} barre.
@@ -122,6 +132,21 @@ Genera una strategia che cerchi anomalie sfruttabili in questo regime.
"""
# User prompt used for retry attempts: the original user prompt followed by
# the previous (failed) output and its error message, then an instruction to
# answer again with a single fenced JSON object. Placeholders:
# {original_user}, {previous_raw}, {previous_error}.
_RETRY_TEMPLATE = """\
{original_user}
--- TENTATIVO PRECEDENTE FALLITO ---
Output: {previous_raw}
Errore: {previous_error}
---
Correggi l'errore e rispondi di nuovo con un singolo oggetto JSON valido
dentro fence ```json...```, seguendo strettamente lo schema fornito nel
SYSTEM message.
"""
# Maximum number of characters of the previous raw output echoed back in the
# retry prompt (the output is sliced to this length before formatting).
_RETRY_RAW_TRUNCATE = 800
_JSON_FENCE_RE = re.compile(
r"```(?:json)?\s*(\{[\s\S]*\})\s*```",
re.MULTILINE,
@@ -175,9 +200,25 @@ def _extract_json(text: str) -> str | None:
return _balance_braces(stripped)
def _try_parse(text: str) -> tuple[Strategy | None, str | None]:
    """Extract, parse and validate a strategy from raw LLM output.

    Returns a ``(strategy, error)`` pair: ``(strategy, None)`` when the text
    yields a strategy that parses and validates, ``(None, message)`` when no
    JSON object is found or when parsing/validation raises.
    """
    extracted = _extract_json(text)
    if extracted is None:
        return None, "no JSON object found in output"

    try:
        candidate = parse_strategy(extracted)
        validate_strategy(candidate)
    except (ParseError, ValidationError) as exc:
        # Only the exception message is surfaced to the caller.
        failure = str(exc)
        return None, failure
    else:
        return candidate, None
class HypothesisAgent:
def __init__(self, llm: LLMClient):
def __init__(self, llm: LLMClient, max_retries: int = 1):
if max_retries < 0:
raise ValueError("max_retries must be >= 0")
self._llm = llm
self._max_retries = max_retries
def propose(
self,
@@ -188,7 +229,7 @@ class HypothesisAgent:
cognitive_style=genome.cognitive_style,
system_prompt=genome.system_prompt,
)
user = USER_TEMPLATE.format(
original_user = USER_TEMPLATE.format(
symbol=market.symbol,
timeframe=market.timeframe,
n_bars=market.n_bars,
@@ -201,28 +242,45 @@ class HypothesisAgent:
lookback_window=genome.lookback_window,
)
completion = self._llm.complete(genome, system=system, user=user)
completions: list[CompletionResult] = []
errors: list[str] = []
last_raw = ""
max_attempts = 1 + self._max_retries
payload = _extract_json(completion.text)
if payload is None:
return HypothesisProposal(
strategy=None,
raw_text=completion.text,
completion=completion,
parse_error="no JSON object found in output",
)
try:
ast = parse_strategy(payload)
validate_strategy(ast)
return HypothesisProposal(
strategy=ast,
raw_text=completion.text,
completion=completion,
)
except (ParseError, ValidationError) as e:
return HypothesisProposal(
strategy=None,
raw_text=completion.text,
completion=completion,
parse_error=str(e),
)
for attempt in range(max_attempts):
if attempt == 0:
user = original_user
else:
truncated = last_raw[:_RETRY_RAW_TRUNCATE]
user = _RETRY_TEMPLATE.format(
original_user=original_user,
previous_raw=truncated,
previous_error=errors[-1],
)
completion = self._llm.complete(genome, system=system, user=user)
completions.append(completion)
last_raw = completion.text
strategy, err = _try_parse(completion.text)
if strategy is not None:
return HypothesisProposal(
strategy=strategy,
raw_text=completion.text,
completions=completions,
parse_error=None,
n_attempts=len(completions),
)
assert err is not None
errors.append(err)
chained = " | ".join(
f"attempt {i + 1}: {e}" for i, e in enumerate(errors)
)
return HypothesisProposal(
strategy=None,
raw_text=last_raw,
completions=completions,
parse_error=chained,
n_attempts=len(completions),
)
+17 -15
View File
@@ -99,21 +99,23 @@ def run_phase1(
continue # elite gia' valutata in generazione precedente
repo.save_genome(run_id=run_id, generation_idx=gen, genome=genome)
proposal = hypothesis_agent.propose(genome, market)
cost_record = cost_tracker.record(
input_tokens=proposal.completion.input_tokens,
output_tokens=proposal.completion.output_tokens,
tier=proposal.completion.tier,
run_id=run_id,
agent_id=genome.id,
)
repo.save_cost_record(
run_id=run_id,
agent_id=genome.id,
tier=cost_record.tier.value,
input_tokens=cost_record.input_tokens,
output_tokens=cost_record.output_tokens,
cost_usd=cost_record.cost_usd,
)
# Registra costo per OGNI completion (incluse retry).
for completion in proposal.completions:
cost_record = cost_tracker.record(
input_tokens=completion.input_tokens,
output_tokens=completion.output_tokens,
tier=completion.tier,
run_id=run_id,
agent_id=genome.id,
)
repo.save_cost_record(
run_id=run_id,
agent_id=genome.id,
tier=cost_record.tier.value,
input_tokens=cost_record.input_tokens,
output_tokens=cost_record.output_tokens,
cost_usd=cost_record.cost_usd,
)
if proposal.strategy is None:
repo.save_evaluation(
+89 -3
View File
@@ -60,7 +60,8 @@ def test_hypothesis_agent_calls_llm_and_parses(mocker): # type: ignore[no-untyp
agent = HypothesisAgent(llm=fake_llm)
proposal = agent.propose(make_genome(), make_summary())
assert proposal.strategy is not None
assert proposal.completion.input_tokens == 200
assert proposal.completions[0].input_tokens == 200
assert proposal.n_attempts == 1
fake_llm.complete.assert_called_once()
@@ -73,10 +74,12 @@ def test_hypothesis_agent_returns_none_on_parse_error(mocker): # type: ignore[n
tier=ModelTier.C,
model="qwen",
)
agent = HypothesisAgent(llm=fake_llm)
agent = HypothesisAgent(llm=fake_llm, max_retries=0)
proposal = agent.propose(make_genome(), make_summary())
assert proposal.strategy is None
assert proposal.parse_error is not None
assert proposal.n_attempts == 1
assert fake_llm.complete.call_count == 1
def test_hypothesis_agent_extracts_json_from_markdown_fence(mocker): # type: ignore[no-untyped-def]
@@ -123,8 +126,91 @@ def test_hypothesis_agent_returns_error_on_invalid_strategy(mocker): # type: ig
tier=ModelTier.C,
model="qwen",
)
agent = HypothesisAgent(llm=fake_llm)
agent = HypothesisAgent(llm=fake_llm, max_retries=0)
proposal = agent.propose(make_genome(), make_summary())
assert proposal.strategy is None
assert proposal.parse_error is not None
assert "wibble" in proposal.parse_error or "unknown" in proposal.parse_error
def test_hypothesis_agent_retries_on_parse_error_and_succeeds(mocker):  # type: ignore[no-untyped-def]
    """Malformed first output, valid second output: the strategy is accepted."""
    malformed_reply = CompletionResult(
        text="this is not JSON at all",
        input_tokens=200,
        output_tokens=80,
        tier=ModelTier.C,
        model="qwen",
    )
    valid_reply = CompletionResult(
        text="```json\n" + VALID_STRATEGY_JSON + "\n```",
        input_tokens=300,
        output_tokens=120,
        tier=ModelTier.C,
        model="qwen",
    )
    llm_stub = mocker.MagicMock()
    llm_stub.complete.side_effect = [malformed_reply, valid_reply]

    result = HypothesisAgent(llm=llm_stub, max_retries=1).propose(
        make_genome(), make_summary()
    )

    assert result.strategy is not None
    assert result.n_attempts == 2
    assert len(result.completions) == 2
    first_completion, second_completion = result.completions
    assert first_completion.input_tokens == 200
    assert second_completion.input_tokens == 300
    assert llm_stub.complete.call_count == 2

    # The retry's user prompt must carry the corrective marker and echo the
    # previous (failed) output.
    retry_prompt = llm_stub.complete.call_args_list[1].kwargs["user"]
    assert "TENTATIVO PRECEDENTE FALLITO" in retry_prompt
    assert "this is not JSON at all" in retry_prompt
def test_hypothesis_agent_gives_up_after_max_retries(mocker):  # type: ignore[no-untyped-def]
    """Both attempts fail: strategy is None and the errors are chained."""
    scripted_replies = [
        CompletionResult(
            text=reply_text,
            input_tokens=tokens_in,
            output_tokens=tokens_out,
            tier=ModelTier.C,
            model="qwen",
        )
        for reply_text, tokens_in, tokens_out in (
            ("garbage attempt 1", 200, 50),
            ("garbage attempt 2", 250, 60),
        )
    ]
    llm_stub = mocker.MagicMock()
    llm_stub.complete.side_effect = scripted_replies

    result = HypothesisAgent(llm=llm_stub, max_retries=1).propose(
        make_genome(), make_summary()
    )

    assert result.strategy is None
    assert result.n_attempts == 2
    assert len(result.completions) == 2
    assert llm_stub.complete.call_count == 2

    chained_error = result.parse_error
    assert chained_error is not None
    assert "attempt 1" in chained_error
    assert "attempt 2" in chained_error

    # raw_text must reflect the LAST output, not the first one.
    assert result.raw_text == "garbage attempt 2"
def test_hypothesis_agent_no_retry_when_first_succeeds(mocker):  # type: ignore[no-untyped-def]
    """First attempt succeeds: no retry happens, even with the default max_retries=1."""
    successful_reply = CompletionResult(
        text=VALID_STRATEGY_JSON,
        input_tokens=200,
        output_tokens=80,
        tier=ModelTier.C,
        model="qwen",
    )
    llm_stub = mocker.MagicMock()
    llm_stub.complete.return_value = successful_reply

    # Constructed without max_retries on purpose: exercises the default (1).
    default_agent = HypothesisAgent(llm=llm_stub)
    result = default_agent.propose(make_genome(), make_summary())

    assert result.strategy is not None
    assert result.n_attempts == 1
    assert len(result.completions) == 1
    assert llm_stub.complete.call_count == 1