feat(gui+runtime): Phase D — kill-switch arm/disarm from the dashboard

Wires the GUI's first write path through the manual_actions queue:

* runtime/manual_actions_consumer.py — drains the queue and
  dispatches arm_kill / disarm_kill via KillSwitch (preserving the
  audit chain). Unsupported kinds (force_close, approve/reject_proposal)
  are marked result="not_supported" so they don't sit forever.
* runtime/orchestrator.py — adds a `manual_actions` job at */1 cron
  to the canonical scheduler manifest.
* gui/data_layer.py — write helpers enqueue_arm_kill /
  enqueue_disarm_kill (the only write path the GUI uses) plus
  load_pending_manual_actions for the pending strip.
* gui/pages/1_📊_Status.py — kill-switch arm/disarm panel with typed
  confirmation ("yes I am sure") + reason field; pending-actions table
  rendered when the queue is non-empty.

End-to-end smoke against the testnet state.sqlite:
  GUI enqueue → consumer dispatch → KillSwitch transition → audit
  chain hash linkage holds, "source":"manual_gui" recorded.

7 new unit tests for the consumer (arm, disarm, drain, unsupported,
default-reason, KillSwitchError handling, empty queue); 360/360 pass.
ruff clean; mypy strict src clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-30 12:33:58 +02:00
parent 6f6dd4c8dd
commit e8345a29c8
6 changed files with 470 additions and 4 deletions
+84 -1
View File
@@ -14,6 +14,7 @@ poking at the repository directly.
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import UTC, datetime, timedelta
from decimal import Decimal
@@ -27,12 +28,14 @@ from cerbero_bite.safety.audit_log import (
iter_entries,
verify_chain,
)
from cerbero_bite.state import Repository, connect
from cerbero_bite.state import Repository, connect, transaction
from cerbero_bite.state.models import (
DecisionRecord,
ManualAction,
PositionRecord,
SystemStateRecord,
)
from cerbero_bite.state.repository import _row_to_manual
__all__ = [
"DEFAULT_AUDIT_PATH",
@@ -50,12 +53,15 @@ __all__ = [
"compute_kpis",
"compute_monthly_stats",
"compute_payoff_curve",
"enqueue_arm_kill",
"enqueue_disarm_kill",
"load_audit_chain_status",
"load_audit_tail",
"load_closed_positions",
"load_decisions_for_position",
"load_engine_snapshot",
"load_open_positions",
"load_pending_manual_actions",
"load_position_by_id",
]
@@ -559,6 +565,83 @@ def compute_distance_metrics(
)
# ---------------------------------------------------------------------------
# Manual actions queue (the GUI's only write path)
# ---------------------------------------------------------------------------
def _enqueue_action(
    *,
    db_path: Path | str,
    kind: str,
    payload: dict[str, object],
    proposal_id: UUID | None = None,
) -> int:
    """Append one row to ``manual_actions`` for the engine consumer to apply.

    Args:
        db_path: Location of the SQLite state database.
        kind: Action kind stored on the row (e.g. ``"arm_kill"``).
        payload: Mapping serialised with ``json.dumps`` into ``payload_json``.
        proposal_id: Optional proposal the action refers to.

    Returns:
        Whatever ``Repository.enqueue_manual_action`` returns for the new row.
    """
    database = Path(db_path)
    repository = Repository()
    queued_at = datetime.now(UTC)
    connection = connect(database)
    try:
        # The insert runs inside a transaction; the connection is closed on
        # every exit path, including when serialisation or the insert raises.
        with transaction(connection):
            record = ManualAction(
                kind=kind,  # type: ignore[arg-type]
                proposal_id=proposal_id,
                payload_json=json.dumps(payload),
                created_at=queued_at,
            )
            return repository.enqueue_manual_action(connection, record)
    finally:
        connection.close()
def enqueue_arm_kill(
    *, reason: str, db_path: Path | str = DEFAULT_DB_PATH
) -> int:
    """Queue an ``arm_kill`` action for the engine consumer.

    Args:
        reason: Operator-supplied justification; surrounding whitespace is
            stripped before it is stored in the payload.
        db_path: SQLite state database to write to.

    Returns:
        The value returned by ``_enqueue_action`` for the queued row.

    Raises:
        ValueError: If ``reason`` is empty or contains only whitespace.
    """
    cleaned = reason.strip() if reason else ""
    if not cleaned:
        raise ValueError("reason is required")
    return _enqueue_action(
        db_path=db_path,
        kind="arm_kill",
        payload={"reason": cleaned},
    )
def enqueue_disarm_kill(
    *, reason: str, db_path: Path | str = DEFAULT_DB_PATH
) -> int:
    """Queue a ``disarm_kill`` action for the engine consumer.

    Args:
        reason: Operator-supplied justification; surrounding whitespace is
            stripped before it is stored in the payload.
        db_path: SQLite state database to write to.

    Returns:
        The value returned by ``_enqueue_action`` for the queued row.

    Raises:
        ValueError: If ``reason`` is empty or contains only whitespace.
    """
    cleaned = reason.strip() if reason else ""
    if not cleaned:
        raise ValueError("reason is required")
    return _enqueue_action(
        db_path=db_path,
        kind="disarm_kill",
        payload={"reason": cleaned},
    )
def load_pending_manual_actions(
    *, db_path: Path | str = DEFAULT_DB_PATH
) -> list[ManualAction]:
    """Return every unconsumed manual action, oldest ``created_at`` first.

    Used by the dashboard's pending strip. A database file that does not
    exist yet yields an empty list instead of an error.
    """
    database = Path(db_path)
    if not database.exists():
        return []

    query = (
        "SELECT * FROM manual_actions WHERE consumed_at IS NULL "
        "ORDER BY created_at ASC"
    )
    connection = connect(database)
    try:
        pending_rows = connection.execute(query).fetchall()
    finally:
        connection.close()
    # Rows are fully fetched above, so mapping them after close is safe.
    return list(map(_row_to_manual, pending_rows))
def load_audit_tail(
*,
audit_path: Path | str = DEFAULT_AUDIT_PATH,
+98 -2
View File
@@ -10,10 +10,14 @@ import streamlit as st
from cerbero_bite.gui.data_layer import (
DEFAULT_AUDIT_PATH,
DEFAULT_DB_PATH,
EngineSnapshot,
enqueue_arm_kill,
enqueue_disarm_kill,
humanize_age,
humanize_dt,
load_engine_snapshot,
load_open_positions,
load_pending_manual_actions,
)
@@ -31,6 +35,74 @@ _HEALTH_COLORS = {
"unknown": ("", "info"),
}
_TYPED_PHRASE = "yes I am sure"
def _render_kill_switch_panel(db_path: Path, snap: EngineSnapshot) -> None:
    """Render the arm/disarm form matching the current kill-switch state.

    When the snapshot reports the switch as armed a "disarm" form is shown,
    otherwise an "arm" form. Either form needs a non-blank reason and the
    typed confirmation phrase; a valid submission is only *queued* through
    ``enqueue_disarm_kill`` / ``enqueue_arm_kill`` — nothing is applied
    directly from the dashboard.
    """
    st.subheader("Kill switch controls")

    confirm_label = f"Type `{_TYPED_PHRASE}` to confirm"
    pickup_note = "The engine will pick it up within ~1 minute."

    def _rejection(reason_text: str, confirm_text: str) -> str | None:
        # The confirmation phrase is checked before the reason, so a wrong
        # phrase is the first error the operator sees.
        if confirm_text.strip() != _TYPED_PHRASE:
            return f"Type exactly `{_TYPED_PHRASE}` to confirm."
        if not reason_text.strip():
            return "Reason is required."
        return None

    if not snap.kill_switch_armed:
        st.info(
            "Kill switch is **disarmed**. Arming queues an `arm_kill` "
            "action; the engine consumer applies it on the next minute tick."
        )
        with st.form("kill_arm_form", clear_on_submit=True):
            arm_reason = st.text_input(
                "Reason (required)",
                placeholder="e.g. macro shock — pause trading",
            )
            arm_confirm = st.text_input(
                confirm_label,
                placeholder=_TYPED_PHRASE,
            )
            arm_clicked = st.form_submit_button(
                "🔴 Queue arm",
                type="secondary",
                use_container_width=True,
            )
            if arm_clicked:
                problem = _rejection(arm_reason, arm_confirm)
                if problem is not None:
                    st.error(problem)
                else:
                    queued_id = enqueue_arm_kill(
                        reason=arm_reason, db_path=db_path
                    )
                    st.success(f"✅ arm queued (id #{queued_id}). " + pickup_note)
        return

    st.warning(
        "Kill switch is **armed**. Disarming queues a `disarm_kill` "
        "action; the engine consumer applies it on the next minute "
        "tick and the transition is recorded in the audit chain."
    )
    with st.form("kill_disarm_form", clear_on_submit=True):
        disarm_reason = st.text_input(
            "Reason (required)",
            placeholder="e.g. macro window passed",
        )
        disarm_confirm = st.text_input(
            confirm_label,
            placeholder=_TYPED_PHRASE,
        )
        disarm_clicked = st.form_submit_button(
            "🟢 Queue disarm",
            type="primary",
            use_container_width=True,
        )
        if disarm_clicked:
            problem = _rejection(disarm_reason, disarm_confirm)
            if problem is not None:
                st.error(problem)
            else:
                queued_id = enqueue_disarm_kill(
                    reason=disarm_reason, db_path=db_path
                )
                st.success(f"✅ disarm queued (id #{queued_id}). " + pickup_note)
def render() -> None:
st.title("📊 Status")
@@ -54,8 +126,7 @@ def render() -> None:
st.error(
f"**Kill switch armed** — engine will refuse new entries.\n\n"
f"- reason: `{snap.kill_reason or ''}`\n"
f"- since: `{humanize_dt(snap.kill_at)}`\n\n"
"Disarm via CLI: `cerbero-bite kill-switch disarm --reason '<your reason>'`"
f"- since: `{humanize_dt(snap.kill_at)}`"
)
# Top metrics
@@ -69,6 +140,31 @@ def render() -> None:
st.divider()
# Kill switch controls
_render_kill_switch_panel(db_path, snap)
st.divider()
# Pending manual actions
pending = load_pending_manual_actions(db_path=db_path)
if pending:
st.subheader("Pending manual actions")
st.caption(
"Queued from this dashboard, not yet consumed. The engine "
"drains the queue every minute via the `manual_actions` job."
)
rows_pending = [
{
"id": a.id,
"kind": a.kind,
"payload": a.payload_json or "",
"created_at": humanize_dt(a.created_at),
}
for a in pending
]
st.dataframe(rows_pending, use_container_width=True, hide_index=True)
st.divider()
# Audit anchor
st.subheader("Audit anchor")
if snap.last_audit_hash is None:
@@ -0,0 +1,114 @@
"""Consumer of the ``manual_actions`` queue.
The GUI (and other out-of-band tooling) records operator intent in the
SQLite ``manual_actions`` table; this consumer pulls those rows and
dispatches them through the same primitives the engine uses internally
(``KillSwitch.arm`` / ``disarm``) so the audit chain remains the single
source of truth for state transitions.
Currently supported kinds:
* ``arm_kill`` — payload ``{"reason": str}``; arms the kill switch.
* ``disarm_kill`` — payload ``{"reason": str}``; disarms it.
Future kinds (``force_close``, ``approve_proposal``,
``reject_proposal``) are recognised by the ``ManualAction`` schema but
not yet wired up — the consumer marks them as
``result="not_supported"`` so they don't sit in the queue forever.
"""
from __future__ import annotations
import json
import logging
from datetime import UTC, datetime
from typing import TYPE_CHECKING
from cerbero_bite.safety.kill_switch import KillSwitchError
from cerbero_bite.state import connect, transaction
if TYPE_CHECKING:
from cerbero_bite.runtime.dependencies import RuntimeContext
__all__ = ["consume_manual_actions"]
_log = logging.getLogger("cerbero_bite.runtime.manual_actions")
_CONSUMER_ID = "engine"
def _parse_payload(raw: str | None) -> dict[str, object]:
if not raw:
return {}
try:
parsed = json.loads(raw)
except (TypeError, ValueError):
return {}
return parsed if isinstance(parsed, dict) else {}
def _reason_from_payload(
    payload: dict[str, object], default: str = "manual via GUI"
) -> str:
    """Return the operator-supplied reason, or *default* when it is unusable.

    A missing key, a JSON ``null`` and a blank/whitespace-only value all
    fall back to *default*, so the kill switch never records the literal
    string ``"None"`` or an empty reason.
    """
    raw = payload.get("reason")
    if raw is None:
        return default
    text = str(raw).strip()
    return text or default


async def consume_manual_actions(
    ctx: RuntimeContext, *, now: datetime | None = None
) -> int:
    """Drain the ``manual_actions`` queue and return how many rows were handled.

    The function is synchronous at heart (SQLite + KillSwitch), but kept
    ``async def`` so the orchestrator can register it as an APScheduler
    coroutine without an extra wrapper. Each iteration fetches the next
    unconsumed row, dispatches it, and marks it consumed; the loop ends
    when the queue is empty so a single tick can catch up after a long
    pause.

    Args:
        ctx: Runtime context providing ``db_path``, ``repository`` and
            ``kill_switch``.
        now: Timestamp recorded on every row consumed during this call;
            defaults to the current UTC time.

    Returns:
        Number of actions marked consumed. This includes actions whose
        result is ``"not_supported"`` or an ``"error: ..."`` string.
    """
    reference = (now or datetime.now(UTC)).astimezone(UTC)
    processed = 0
    handled_ids: set[int] = set()
    while True:
        conn = connect(ctx.db_path)
        try:
            action = ctx.repository.next_unconsumed_action(conn)
        finally:
            conn.close()
        if action is None:
            break
        if action.id is None:
            # Without an id the row cannot be marked consumed, so draining
            # stops here instead of spinning on the same row.
            _log.warning("manual_action without id, skipping")
            break
        if action.id in handled_ids:
            # The row was already dispatched and marked in this call but is
            # still reported as unconsumed. Stop rather than re-apply the
            # same transition forever inside one scheduler tick.
            _log.error(
                "manual_action id=%s still unconsumed after being marked; "
                "aborting drain",
                action.id,
            )
            break
        handled_ids.add(action.id)

        payload = _parse_payload(action.payload_json)
        result = "ok"
        try:
            if action.kind == "arm_kill":
                ctx.kill_switch.arm(
                    reason=_reason_from_payload(payload), source="manual_gui"
                )
            elif action.kind == "disarm_kill":
                ctx.kill_switch.disarm(
                    reason=_reason_from_payload(payload), source="manual_gui"
                )
            else:
                # Known-but-unwired kinds are still consumed so they don't
                # sit in the queue forever.
                result = "not_supported"
                _log.warning(
                    "manual_action kind=%s not supported yet", action.kind
                )
        except KillSwitchError as exc:
            _log.exception("kill switch transition failed")
            result = f"error: {type(exc).__name__}: {exc}"
        except Exception as exc:  # pragma: no cover — defensive
            # A failing action must not wedge the queue: record the error
            # on the row and move on to the next one.
            _log.exception("manual_action dispatch failed")
            result = f"error: {type(exc).__name__}: {exc}"

        conn = connect(ctx.db_path)
        try:
            with transaction(conn):
                ctx.repository.mark_action_consumed(
                    conn,
                    action.id,
                    consumed_by=_CONSUMER_ID,
                    result=result,
                    now=reference,
                )
        finally:
            conn.close()
        processed += 1

    if processed:
        _log.info("processed %d manual_actions", processed)
    return processed
+14
View File
@@ -28,6 +28,7 @@ from cerbero_bite.runtime.dependencies import RuntimeContext, build_runtime
from cerbero_bite.runtime.entry_cycle import EntryCycleResult, run_entry_cycle
from cerbero_bite.runtime.health_check import HealthCheck, HealthCheckResult
from cerbero_bite.runtime.lockfile import EngineLock
from cerbero_bite.runtime.manual_actions_consumer import consume_manual_actions
from cerbero_bite.runtime.monitor_cycle import MonitorCycleResult, run_monitor_cycle
from cerbero_bite.runtime.recovery import recover_state
from cerbero_bite.runtime.scheduler import JobSpec, build_scheduler
@@ -45,6 +46,7 @@ _CRON_ENTRY = "0 14 * * MON"
_CRON_MONITOR = "0 2,14 * * *"
_CRON_HEALTH = "*/5 * * * *"
_CRON_BACKUP = "0 * * * *"
_CRON_MANUAL_ACTIONS = "*/1 * * * *"
_BACKUP_RETENTION_DAYS = 30
@@ -191,6 +193,7 @@ class Orchestrator:
monitor_cron: str = _CRON_MONITOR,
health_cron: str = _CRON_HEALTH,
backup_cron: str = _CRON_BACKUP,
manual_actions_cron: str = _CRON_MANUAL_ACTIONS,
backup_dir: Path | None = None,
backup_retention_days: int = _BACKUP_RETENTION_DAYS,
) -> AsyncIOScheduler:
@@ -229,12 +232,23 @@ class Orchestrator:
await _safe("backup", _do)
async def _manual_actions() -> None:
async def _do() -> None:
await consume_manual_actions(self._ctx)
await _safe("manual_actions", _do)
self._scheduler = build_scheduler(
[
JobSpec(name="entry", cron=entry_cron, coro_factory=_entry),
JobSpec(name="monitor", cron=monitor_cron, coro_factory=_monitor),
JobSpec(name="health", cron=health_cron, coro_factory=_health),
JobSpec(name="backup", cron=backup_cron, coro_factory=_backup),
JobSpec(
name="manual_actions",
cron=manual_actions_cron,
coro_factory=_manual_actions,
),
]
)
return self._scheduler
+1 -1
View File
@@ -114,4 +114,4 @@ def test_install_scheduler_registers_canonical_jobs(tmp_path: Path) -> None:
orch = _build_orch(tmp_path)
sched = orch.install_scheduler()
job_ids = {j.id for j in sched.get_jobs()}
assert job_ids == {"entry", "monitor", "health", "backup"}
assert job_ids == {"entry", "monitor", "health", "backup", "manual_actions"}
+159
View File
@@ -0,0 +1,159 @@
"""Tests for runtime.manual_actions_consumer."""
from __future__ import annotations
import json
from datetime import UTC, datetime
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from cerbero_bite.runtime.manual_actions_consumer import consume_manual_actions
from cerbero_bite.safety.audit_log import AuditLog
from cerbero_bite.safety.kill_switch import KillSwitch, KillSwitchError
from cerbero_bite.state import Repository, connect, run_migrations, transaction
from cerbero_bite.state.models import ManualAction
def _now() -> datetime:
return datetime(2026, 4, 30, 12, 0, tzinfo=UTC)
def _ctx(tmp_path: Path):
    """Build a mock runtime context backed by a real DB, audit log and KillSwitch.

    The SQLite file is migrated and its system state initialised under
    *tmp_path*; only the context container itself is a ``MagicMock``.
    """
    database = tmp_path / "state.sqlite"
    audit_file = tmp_path / "audit.log"
    repository = Repository()

    # Prepare the schema and the initial system-state row.
    setup_conn = connect(database)
    run_migrations(setup_conn)
    with transaction(setup_conn):
        repository.init_system_state(
            setup_conn, config_version="1.0.0", now=_now()
        )
    setup_conn.close()

    audit_log = AuditLog(audit_file)
    kill_switch = KillSwitch(
        connection_factory=lambda: connect(database),
        repository=repository,
        audit_log=audit_log,
        clock=_now,
    )
    # Keyword arguments to MagicMock are set as attributes on the mock.
    return MagicMock(
        db_path=database,
        repository=repository,
        kill_switch=kill_switch,
        audit_log=audit_log,
    )
def _enqueue(ctx, kind: str, payload: dict[str, object]) -> int:
    """Insert one manual action of *kind* into the test DB and return its id."""
    connection = connect(ctx.db_path)
    try:
        with transaction(connection):
            record = ManualAction(
                kind=kind,  # type: ignore[arg-type]
                payload_json=json.dumps(payload),
                created_at=_now(),
            )
            return ctx.repository.enqueue_manual_action(connection, record)
    finally:
        connection.close()
def _fetch_action(ctx, action_id: int):
    """Return the consumption columns of one ``manual_actions`` row (or ``None``)."""
    query = (
        "SELECT consumed_at, consumed_by, result FROM manual_actions WHERE id = ?"
    )
    connection = connect(ctx.db_path)
    try:
        record = connection.execute(query, (action_id,)).fetchone()
    finally:
        connection.close()
    return record
@pytest.mark.asyncio
async def test_arm_kill_arms_kill_switch(tmp_path: Path) -> None:
    """An ``arm_kill`` action arms the switch and is marked consumed with ``ok``."""
    ctx = _ctx(tmp_path)
    action_id = _enqueue(ctx, "arm_kill", {"reason": "GUI typed yes"})
    assert ctx.kill_switch.is_armed() is False

    handled = await consume_manual_actions(ctx, now=_now())

    assert handled == 1
    assert ctx.kill_switch.is_armed() is True
    record = _fetch_action(ctx, action_id)
    assert record["consumed_by"] == "engine"
    assert record["result"] == "ok"
    assert record["consumed_at"] is not None
@pytest.mark.asyncio
async def test_disarm_kill_disarms_kill_switch(tmp_path: Path) -> None:
    """A ``disarm_kill`` action releases a previously armed switch."""
    ctx = _ctx(tmp_path)
    # Start from an armed switch so the disarm has something to undo.
    ctx.kill_switch.arm(reason="prior", source="manual")
    assert ctx.kill_switch.is_armed() is True
    action_id = _enqueue(ctx, "disarm_kill", {"reason": "operator override"})

    handled = await consume_manual_actions(ctx, now=_now())

    assert handled == 1
    assert ctx.kill_switch.is_armed() is False
    record = _fetch_action(ctx, action_id)
    assert record["result"] == "ok"
@pytest.mark.asyncio
async def test_consumer_drains_queue(tmp_path: Path) -> None:
    """One call handles every queued action; the last queued ``arm_kill`` wins."""
    ctx = _ctx(tmp_path)
    queued = (
        ("arm_kill", "first"),
        ("disarm_kill", "second"),
        ("arm_kill", "third"),
    )
    for kind, reason in queued:
        _enqueue(ctx, kind, {"reason": reason})

    handled = await consume_manual_actions(ctx, now=_now())

    assert handled == 3
    assert ctx.kill_switch.is_armed() is True
@pytest.mark.asyncio
async def test_unsupported_kind_marked_not_supported(tmp_path: Path) -> None:
    """A kind the consumer does not dispatch is consumed as ``not_supported``."""
    ctx = _ctx(tmp_path)
    action_id = _enqueue(ctx, "force_close", {"proposal_id": "abc"})

    handled = await consume_manual_actions(ctx, now=_now())

    assert handled == 1
    record = _fetch_action(ctx, action_id)
    assert record["result"] == "not_supported"
@pytest.mark.asyncio
async def test_missing_payload_uses_default_reason(tmp_path: Path) -> None:
    """An ``arm_kill`` with an empty payload still arms the switch."""
    ctx = _ctx(tmp_path)
    _enqueue(ctx, "arm_kill", {})

    handled = await consume_manual_actions(ctx, now=_now())

    assert handled == 1
    assert ctx.kill_switch.is_armed() is True
@pytest.mark.asyncio
async def test_kill_switch_error_caught_and_recorded(tmp_path: Path) -> None:
    """A ``KillSwitchError`` from ``arm`` is recorded on the row, not raised."""
    ctx = _ctx(tmp_path)
    # Swap in a kill switch whose ``arm`` always fails.
    failing_switch = MagicMock()
    failing_switch.arm.side_effect = KillSwitchError("simulated")
    failing_switch.is_armed.return_value = False
    ctx.kill_switch = failing_switch
    action_id = _enqueue(ctx, "arm_kill", {"reason": "x"})

    handled = await consume_manual_actions(ctx, now=_now())

    assert handled == 1
    record = _fetch_action(ctx, action_id)
    assert "KillSwitchError" in (record["result"] or "")
@pytest.mark.asyncio
async def test_empty_queue_returns_zero(tmp_path: Path) -> None:
    """With nothing queued the consumer reports zero handled actions."""
    ctx = _ctx(tmp_path)

    handled = await consume_manual_actions(ctx, now=_now())

    assert handled == 0