diff --git a/.env.example b/.env.example index 0740ef0..6092605 100644 --- a/.env.example +++ b/.env.example @@ -9,8 +9,11 @@ OPENROUTER_API_KEY= ANTHROPIC_API_KEY= # LLM models (override Phase 1 defaults if needed) -LLM_MODEL_TIER_C=qwen/qwen-2.5-72b-instruct +LLM_MODEL_TIER_S=claude-opus-4-7 +LLM_MODEL_TIER_A=claude-sonnet-4-6 LLM_MODEL_TIER_B=claude-sonnet-4-6 +LLM_MODEL_TIER_C=qwen/qwen-2.5-72b-instruct +LLM_MODEL_TIER_D=meta-llama/llama-3.3-70b-instruct OPENROUTER_BASE_URL=https://openrouter.ai/api/v1 # Run config diff --git a/scripts/run_phase1.py b/scripts/run_phase1.py index 4d380e7..99ebd47 100644 --- a/scripts/run_phase1.py +++ b/scripts/run_phase1.py @@ -48,8 +48,11 @@ def main() -> None: settings.anthropic_api_key.get_secret_value() if settings.anthropic_api_key else None ), - model_tier_c=settings.llm_model_tier_c, + model_tier_s=settings.llm_model_tier_s, + model_tier_a=settings.llm_model_tier_a, model_tier_b=settings.llm_model_tier_b, + model_tier_c=settings.llm_model_tier_c, + model_tier_d=settings.llm_model_tier_d, openrouter_base_url=settings.openrouter_base_url, ) diff --git a/src/multi_swarm/config.py b/src/multi_swarm/config.py index 01e0026..8fce44e 100644 --- a/src/multi_swarm/config.py +++ b/src/multi_swarm/config.py @@ -26,8 +26,11 @@ class Settings(BaseSettings): openrouter_api_key: SecretStr anthropic_api_key: SecretStr | None = None - llm_model_tier_c: str = "qwen/qwen-2.5-72b-instruct" + llm_model_tier_s: str = "claude-opus-4-7" + llm_model_tier_a: str = "claude-sonnet-4-6" llm_model_tier_b: str = "claude-sonnet-4-6" + llm_model_tier_c: str = "qwen/qwen-2.5-72b-instruct" + llm_model_tier_d: str = "meta-llama/llama-3.3-70b-instruct" openrouter_base_url: str = "https://openrouter.ai/api/v1" run_name: str = "phase1-spike-001" diff --git a/src/multi_swarm/genome/hypothesis.py b/src/multi_swarm/genome/hypothesis.py index 8b1a4e5..8dc1686 100644 --- a/src/multi_swarm/genome/hypothesis.py +++ b/src/multi_swarm/genome/hypothesis.py @@ -8,8 +8,11 @@ from typing import Any class ModelTier(StrEnum): + S = "S" # top-tier reasoning (Opus / equivalent) via Anthropic + A = "A" # premium override via Anthropic B = "B" # Sonnet 4.6 via Anthropic C = "C" # Qwen 2.5 72B via OpenRouter + D = "D" # ultra-economic (Llama / cheap models) via OpenRouter @dataclass diff --git a/src/multi_swarm/llm/client.py b/src/multi_swarm/llm/client.py index 7e558de..4673822 100644 --- a/src/multi_swarm/llm/client.py +++ b/src/multi_swarm/llm/client.py @@ -16,8 +16,11 @@ from tenacity import ( from ..genome.hypothesis import HypothesisAgentGenome, ModelTier # Modelli configurati per Phase 1 -MODEL_TIER_C = "qwen/qwen-2.5-72b-instruct" # via OpenRouter +MODEL_TIER_S = "claude-opus-4-7" # via Anthropic +MODEL_TIER_A = "claude-sonnet-4-6" # via Anthropic (premium override) MODEL_TIER_B = "claude-sonnet-4-6" # via Anthropic +MODEL_TIER_C = "qwen/qwen-2.5-72b-instruct" # via OpenRouter +MODEL_TIER_D = "meta-llama/llama-3.3-70b-instruct" # via OpenRouter OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" # Errori transient: retry. RateLimit/Auth/InvalidRequest: NO retry. @@ -41,17 +44,33 @@ class CompletionResult: class LLMClient: + _ANTHROPIC_TIERS: tuple[ModelTier, ...] = (ModelTier.S, ModelTier.A, ModelTier.B) + _OPENROUTER_TIERS: tuple[ModelTier, ...] = (ModelTier.C, ModelTier.D) + def __init__( self, openrouter_api_key: str, anthropic_api_key: str | None = None, - model_tier_c: str = MODEL_TIER_C, + model_tier_s: str = MODEL_TIER_S, + model_tier_a: str = MODEL_TIER_A, model_tier_b: str = MODEL_TIER_B, + model_tier_c: str = MODEL_TIER_C, + model_tier_d: str = MODEL_TIER_D, openrouter_base_url: str = OPENROUTER_BASE_URL, ) -> None: - self.model_tier_c = model_tier_c + self.model_tier_s = model_tier_s + self.model_tier_a = model_tier_a self.model_tier_b = model_tier_b + self.model_tier_c = model_tier_c + self.model_tier_d = model_tier_d self.openrouter_base_url = openrouter_base_url + self._tier_models: dict[ModelTier, str] = { + ModelTier.S: model_tier_s, + ModelTier.A: model_tier_a, + ModelTier.B: model_tier_b, + ModelTier.C: model_tier_c, + ModelTier.D: model_tier_d, + } self._openrouter = OpenAI(api_key=openrouter_api_key, base_url=openrouter_base_url) self._anthropic = Anthropic(api_key=anthropic_api_key) if anthropic_api_key else None @@ -68,32 +87,53 @@ class LLMClient: user: str, max_tokens: int = 2000, ) -> CompletionResult: - if genome.model_tier == ModelTier.C: - resp = self._openrouter.chat.completions.create( - model=self.model_tier_c, - messages=[ - {"role": "system", "content": system}, - {"role": "user", "content": user}, - ], - temperature=genome.temperature, - top_p=genome.top_p, - max_tokens=max_tokens, - ) - usage = resp.usage - assert usage is not None - return CompletionResult( - text=resp.choices[0].message.content or "", - input_tokens=usage.prompt_tokens, - output_tokens=usage.completion_tokens, - tier=ModelTier.C, - model=self.model_tier_c, - ) + model = self._tier_models[genome.model_tier] + if genome.model_tier in self._ANTHROPIC_TIERS: + return self._call_anthropic(genome, system, user, max_tokens, model) + return self._call_openrouter(genome, system, user, max_tokens, model) + def _call_openrouter( + self, + genome: HypothesisAgentGenome, + system: str, + user: str, + max_tokens: int, + model: str, + ) -> CompletionResult: + resp = self._openrouter.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + temperature=genome.temperature, + top_p=genome.top_p, + max_tokens=max_tokens, + ) + usage = resp.usage + assert usage is not None + return CompletionResult( + text=resp.choices[0].message.content or "", + input_tokens=usage.prompt_tokens, + output_tokens=usage.completion_tokens, + tier=genome.model_tier, + model=model, + ) + + def _call_anthropic( + self, + genome: HypothesisAgentGenome, + system: str, + user: str, + max_tokens: int, + model: str, + ) -> CompletionResult: if self._anthropic is None: - raise RuntimeError("ANTHROPIC_API_KEY required for tier B genomes") - + raise RuntimeError( + f"ANTHROPIC_API_KEY required for tier {genome.model_tier.value} genomes" + ) msg = self._anthropic.messages.create( - model=self.model_tier_b, + model=model, system=system, messages=[{"role": "user", "content": user}], temperature=genome.temperature, @@ -105,6 +145,6 @@ class LLMClient: text=text, input_tokens=msg.usage.input_tokens, output_tokens=msg.usage.output_tokens, - tier=ModelTier.B, - model=self.model_tier_b, + tier=genome.model_tier, + model=model, ) diff --git a/src/multi_swarm/llm/cost_tracker.py b/src/multi_swarm/llm/cost_tracker.py index 3b8c567..7541834 100644 --- a/src/multi_swarm/llm/cost_tracker.py +++ b/src/multi_swarm/llm/cost_tracker.py @@ -8,8 +8,11 @@ from typing import Any from ..genome.hypothesis import ModelTier PRICE_PER_M_TOKENS: dict[ModelTier, dict[str, float]] = { - ModelTier.C: {"input": 0.40, "output": 0.40}, + ModelTier.S: {"input": 15.00, "output": 75.00}, + ModelTier.A: {"input": 3.00, "output": 15.00}, ModelTier.B: {"input": 3.00, "output": 15.00}, + ModelTier.C: {"input": 0.40, "output": 0.40}, + ModelTier.D: {"input": 0.10, "output": 0.30}, } diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index e8c0de6..592ba35 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -45,26 +45,38 @@ def test_settings_requires_tokens(monkeypatch: pytest.MonkeyPatch) -> None: def test_settings_loads_llm_model_overrides(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("CERBERO_TESTNET_TOKEN", "tok-test") monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") - monkeypatch.setenv("LLM_MODEL_TIER_C", "deepseek/deepseek-chat") + monkeypatch.setenv("LLM_MODEL_TIER_S", "claude-mega-x") + monkeypatch.setenv("LLM_MODEL_TIER_A", "claude-premium-y") monkeypatch.setenv("LLM_MODEL_TIER_B", "claude-opus-4-7") + monkeypatch.setenv("LLM_MODEL_TIER_C", "deepseek/deepseek-chat") + monkeypatch.setenv("LLM_MODEL_TIER_D", "mistralai/mistral-7b") monkeypatch.setenv("OPENROUTER_BASE_URL", "https://example.com/api/v1") s = Settings(_env_file=None) # type: ignore[call-arg] - assert s.llm_model_tier_c == "deepseek/deepseek-chat" + assert s.llm_model_tier_s == "claude-mega-x" + assert s.llm_model_tier_a == "claude-premium-y" assert s.llm_model_tier_b == "claude-opus-4-7" + assert s.llm_model_tier_c == "deepseek/deepseek-chat" + assert s.llm_model_tier_d == "mistralai/mistral-7b" assert s.openrouter_base_url == "https://example.com/api/v1" def test_settings_llm_model_defaults(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("CERBERO_TESTNET_TOKEN", "tok-test") monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") - monkeypatch.delenv("LLM_MODEL_TIER_C", raising=False) + monkeypatch.delenv("LLM_MODEL_TIER_S", raising=False) + monkeypatch.delenv("LLM_MODEL_TIER_A", raising=False) monkeypatch.delenv("LLM_MODEL_TIER_B", raising=False) + monkeypatch.delenv("LLM_MODEL_TIER_C", raising=False) + monkeypatch.delenv("LLM_MODEL_TIER_D", raising=False) monkeypatch.delenv("OPENROUTER_BASE_URL", raising=False) s = Settings(_env_file=None) # type: ignore[call-arg] - assert s.llm_model_tier_c == "qwen/qwen-2.5-72b-instruct" + assert s.llm_model_tier_s == "claude-opus-4-7" + assert s.llm_model_tier_a == "claude-sonnet-4-6" assert s.llm_model_tier_b == "claude-sonnet-4-6" + assert s.llm_model_tier_c == "qwen/qwen-2.5-72b-instruct" + assert s.llm_model_tier_d == "meta-llama/llama-3.3-70b-instruct" assert s.openrouter_base_url == "https://openrouter.ai/api/v1" diff --git a/tests/unit/test_cost_tracker.py b/tests/unit/test_cost_tracker.py index 08a417c..bb82863 100644 --- a/tests/unit/test_cost_tracker.py +++ b/tests/unit/test_cost_tracker.py @@ -30,3 +30,34 @@ def test_tracker_per_tier_breakdown(): summary = t.summary() assert "C" in summary["by_tier"] assert "B" in summary["by_tier"] + + +def test_estimate_cost_tier_s(): + cost = estimate_cost(input_tokens=1_000_000, output_tokens=1_000_000, tier=ModelTier.S) + assert cost == 15.00 + 75.00 + + +def test_estimate_cost_tier_a(): + cost = estimate_cost(input_tokens=1_000_000, output_tokens=1_000_000, tier=ModelTier.A) + assert cost == 3.00 + 15.00 + + +def test_estimate_cost_tier_d(): + cost = estimate_cost(input_tokens=1_000_000, output_tokens=1_000_000, tier=ModelTier.D) + assert cost == 0.10 + 0.30 + + +def test_tracker_summary_contains_all_five_tiers(): + t = CostTracker() + for tier in (ModelTier.S, ModelTier.A, ModelTier.B, ModelTier.C, ModelTier.D): + t.record( + input_tokens=1_000, + output_tokens=1_000, + tier=tier, + run_id="r", + agent_id=f"a-{tier.value}", + ) + summary = t.summary() + for tier_letter in ("S", "A", "B", "C", "D"): + assert tier_letter in summary["by_tier"] + assert summary["by_tier"][tier_letter]["calls"] == 1 diff --git a/tests/unit/test_genome_hypothesis.py b/tests/unit/test_genome_hypothesis.py index 7cbe8b0..4948543 100644 --- a/tests/unit/test_genome_hypothesis.py +++ b/tests/unit/test_genome_hypothesis.py @@ -48,3 +48,22 @@ def test_genome_id_is_deterministic_on_content(): top_p=0.9, model_tier=ModelTier.C, lookback_window=100, cognitive_style="x", ) assert g1.id == g2.id + + +def test_genome_all_tiers_serde_roundtrip(): + """Tutti i 5 tier (S, A, B, C, D) sopravvivono a to_dict/from_dict.""" + for tier in (ModelTier.S, ModelTier.A, ModelTier.B, ModelTier.C, ModelTier.D): + g = HypothesisAgentGenome( + system_prompt="prompt", + feature_access=["close"], + temperature=0.7, + top_p=0.9, + model_tier=tier, + lookback_window=128, + cognitive_style="generic", + ) + payload = g.to_dict() + assert payload["model_tier"] == tier.value + g2 = HypothesisAgentGenome.from_dict(payload) + assert g2.model_tier == tier + assert g2.id == g.id diff --git a/tests/unit/test_llm_client.py b/tests/unit/test_llm_client.py index 68f686d..bd553b2 100644 --- a/tests/unit/test_llm_client.py +++ b/tests/unit/test_llm_client.py @@ -121,6 +121,94 @@ def test_completion_uses_custom_model_tier_b(mocker): assert out.model == "claude-opus-4-7" +def test_completion_tier_s_uses_anthropic_with_opus(mocker): + fake_anthropic = mocker.MagicMock() + fake_msg = mocker.MagicMock() + fake_msg.content = [mocker.MagicMock(text="(strategy s)")] + fake_msg.usage = mocker.MagicMock(input_tokens=50, output_tokens=100) + fake_anthropic.messages.create.return_value = fake_msg + mocker.patch("multi_swarm.llm.client.Anthropic", return_value=fake_anthropic) + + client = LLMClient(openrouter_api_key="or-x", anthropic_api_key="an-x") + g = make_genome(ModelTier.S) + out = client.complete(g, system="sys", user="usr") + + fake_anthropic.messages.create.assert_called_once() + call_kwargs = fake_anthropic.messages.create.call_args.kwargs + assert call_kwargs["model"] == "claude-opus-4-7" + assert out.tier == ModelTier.S + assert out.model == "claude-opus-4-7" + + +def test_completion_tier_a_uses_anthropic_with_sonnet(mocker): + fake_anthropic = mocker.MagicMock() + fake_msg = mocker.MagicMock() + fake_msg.content = [mocker.MagicMock(text="(strategy a)")] + fake_msg.usage = mocker.MagicMock(input_tokens=40, output_tokens=80) + fake_anthropic.messages.create.return_value = fake_msg + mocker.patch("multi_swarm.llm.client.Anthropic", return_value=fake_anthropic) + + client = LLMClient(openrouter_api_key="or-x", anthropic_api_key="an-x") + g = make_genome(ModelTier.A) + out = client.complete(g, system="sys", user="usr") + + fake_anthropic.messages.create.assert_called_once() + call_kwargs = fake_anthropic.messages.create.call_args.kwargs + assert call_kwargs["model"] == "claude-sonnet-4-6" + assert out.tier == ModelTier.A + assert out.model == "claude-sonnet-4-6" + + +def test_completion_tier_d_uses_openrouter_with_llama(mocker): + fake_openai = mocker.MagicMock() + fake_response = mocker.MagicMock() + fake_response.choices = [ + mocker.MagicMock(message=mocker.MagicMock(content="(strategy d)")) + ] + fake_response.usage = mocker.MagicMock(prompt_tokens=30, completion_tokens=70) + fake_openai.chat.completions.create.return_value = fake_response + mocker.patch("multi_swarm.llm.client.OpenAI", return_value=fake_openai) + + client = LLMClient(openrouter_api_key="or-x", anthropic_api_key=None) + g = make_genome(ModelTier.D) + out = client.complete(g, system="sys", user="usr") + + fake_openai.chat.completions.create.assert_called_once() + call_kwargs = fake_openai.chat.completions.create.call_args.kwargs + assert call_kwargs["model"] == "meta-llama/llama-3.3-70b-instruct" + assert out.tier == ModelTier.D + assert out.model == "meta-llama/llama-3.3-70b-instruct" + + +def test_completion_uses_custom_model_tier_s(mocker): + fake_anthropic = mocker.MagicMock() + fake_msg = mocker.MagicMock() + fake_msg.content = [mocker.MagicMock(text="(strategy custom-s)")] + fake_msg.usage = mocker.MagicMock(input_tokens=10, output_tokens=20) + fake_anthropic.messages.create.return_value = fake_msg + mocker.patch("multi_swarm.llm.client.Anthropic", return_value=fake_anthropic) + + client = LLMClient( + openrouter_api_key="or-x", + anthropic_api_key="an-x", + model_tier_s="claude-future-mega", + ) + g = make_genome(ModelTier.S) + out = client.complete(g, system="sys", user="usr") + + call_kwargs = fake_anthropic.messages.create.call_args.kwargs + assert call_kwargs["model"] == "claude-future-mega" + assert out.model == "claude-future-mega" + + +def test_completion_tier_s_without_anthropic_key_raises(mocker): + mocker.patch("multi_swarm.llm.client.OpenAI", return_value=mocker.MagicMock()) + client = LLMClient(openrouter_api_key="or-x", anthropic_api_key=None) + g = make_genome(ModelTier.S) + with pytest.raises(RuntimeError, match="tier S"): + client.complete(g, system="sys", user="usr") + + @pytest.mark.slow def test_completion_succeeds_after_one_retry(mocker): """Dopo 1 fallimento transient, il retry riesce al 2 tentativo."""