perf: spread bitmap uint8 + pre-NMS prima refine (3.5x globale, 49x worst case)

Due ottimizzazioni chiave: 1. Spread bitmap uint8 invece di response map (N_BINS, H, W) float32 - 32x meno memoria, cache-friendly - Nuovi kernel Numba: _jit_score_bitmap, _jit_popcount_density - Formato: spread[y,x] bit b = bin b attivo nel raggio di spread - _refine_angle usa slicing su bitmap con mask & bit 2. Pre-NMS prima di refine_angle/verify_ncc - Problema: loop 'for raw in candidati' applicava refine+verify A OGNI candidato prima del check NMS → 2000+ refine chiamati per ~25 match - Fix: pre-NMS su (cx, cy) subpixel, limita a max_matches*3 candidati, poi refine + verify solo su quelli - Esempio worst case: lama_full_fast 55.9s → 1.13s (49x) Benchmark suite 16 scenari (4 immagini x full/part x fast/preciso): prima: totale find 94.6s dopo: totale find 27.3s (3.5x globale) casi peggiori <5s (prima erano >50s) ROI parziali (solo metà oggetto) funzionano in tutti i casi. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 02:11:33 +02:00
parent d27676cfe6
commit ba54b42fdc
3 changed files with 262 additions and 38 deletions
@@ -45,13 +45,12 @@ if HAS_NUMBA:
        resp: np.ndarray,        # float32 (N_BINS, H, W)
        dx: np.ndarray,          # int32 (N,)
        dy: np.ndarray,          # int32 (N,)
-        bins: np.ndarray,        # int8 or int32 (N,)
+        bins: np.ndarray,        # int8 (N,)
        bin_active: np.ndarray,  # bool_ (N_BINS,)
    ) -> np.ndarray:
-        n_bins, H, W = resp.shape
+        _, H, W = resp.shape
        N = dx.shape[0]
        acc = np.zeros((H, W), dtype=np.float32)
-        # Parallelizza per riga: niente race (ogni y scrive solo acc[y, :])
        for y in nb.prange(H):
            for i in range(N):
                b = bins[i]
@@ -73,7 +72,59 @@ if HAS_NUMBA:
                    acc[y, x] *= inv
        return acc

-    # Warmup: precompila con dummy data
+    @nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False)
+    def _jit_score_bitmap(
+        spread: np.ndarray,      # uint8 (H, W), bit b = bin b attivo
+        dx: np.ndarray,          # int32 (N,)
+        dy: np.ndarray,          # int32 (N,)
+        bins: np.ndarray,        # int8 (N,) bin per ogni feature
+        bit_active: np.uint8,    # bitmask bin attivi in scena
+    ) -> np.ndarray:
+        """score[y,x] = (Σ_i [bit bins[i] acceso in spread[y+dy_i, x+dx_i]]) / N.
+
+        32× meno memoria di response map float32 → cache-friendly.
+        """
+        H, W = spread.shape
+        N = dx.shape[0]
+        acc = np.zeros((H, W), dtype=np.float32)
+        for y in nb.prange(H):
+            for i in range(N):
+                b = bins[i]
+                mask = np.uint8(1) << b
+                if (bit_active & mask) == 0:
+                    continue
+                ddy = dy[i]
+                yy = y + ddy
+                if yy < 0 or yy >= H:
+                    continue
+                ddx = dx[i]
+                x_lo = 0 if ddx >= 0 else -ddx
+                x_hi = W if ddx <= 0 else W - ddx
+                for x in range(x_lo, x_hi):
+                    if spread[yy, x + ddx] & mask:
+                        acc[y, x] += 1.0
+        if N > 0:
+            inv = 1.0 / N
+            for y in nb.prange(H):
+                for x in range(W):
+                    acc[y, x] *= inv
+        return acc
+
+    @nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False)
+    def _jit_popcount_density(spread: np.ndarray) -> np.ndarray:
+        """Conta bit set per pixel: ritorna (H, W) float32 in [0..8]."""
+        H, W = spread.shape
+        out = np.zeros((H, W), dtype=np.float32)
+        for y in nb.prange(H):
+            for x in range(W):
+                v = spread[y, x]
+                # popcount manuale
+                v = (v & 0x55) + ((v >> 1) & 0x55)
+                v = (v & 0x33) + ((v >> 2) & 0x33)
+                v = (v & 0x0F) + ((v >> 4) & 0x0F)
+                out[y, x] = float(v)
+        return out
+
    def _warmup():
        resp = np.zeros((8, 32, 32), dtype=np.float32)
        dx = np.zeros(1, dtype=np.int32)
@@ -81,16 +132,57 @@ if HAS_NUMBA:
        b = np.zeros(1, dtype=np.int8)
        ba = np.ones(8, dtype=np.bool_)
        _jit_score_by_shift(resp, dx, dy, b, ba)
+        spread = np.zeros((32, 32), dtype=np.uint8)
+        _jit_score_bitmap(spread, dx, dy, b, np.uint8(0xFF))
+        _jit_popcount_density(spread)

 else:  # pragma: no cover

    def _jit_score_by_shift(resp, dx, dy, bins, bin_active):
        raise RuntimeError("numba non disponibile")

+    def _jit_score_bitmap(spread, dx, dy, bins, bit_active):
+        raise RuntimeError("numba non disponibile")
+
+    def _jit_popcount_density(spread):
+        raise RuntimeError("numba non disponibile")
+
    def _warmup():
        pass


+def score_bitmap(
+    spread: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
+    bit_active: int,
+) -> np.ndarray:
+    """Dispatch bitmap: JIT se numba, fallback numpy."""
+    if HAS_NUMBA and len(dx) > 0:
+        return _jit_score_bitmap(
+            np.ascontiguousarray(spread, dtype=np.uint8),
+            np.ascontiguousarray(dx, dtype=np.int32),
+            np.ascontiguousarray(dy, dtype=np.int32),
+            np.ascontiguousarray(bins, dtype=np.int8),
+            np.uint8(bit_active),
+        )
+    # Fallback numpy (lento): converte bitmap a response 3D
+    H, W = spread.shape
+    resp = np.zeros((8, H, W), dtype=np.float32)
+    for b in range(8):
+        resp[b] = ((spread >> b) & 1).astype(np.float32)
+    return _numpy_score_by_shift(resp, dx, dy, bins, None)
+
+
+def popcount_density(spread: np.ndarray) -> np.ndarray:
+    if HAS_NUMBA:
+        return _jit_popcount_density(np.ascontiguousarray(spread, dtype=np.uint8))
+    # Fallback
+    H, W = spread.shape
+    out = np.zeros((H, W), dtype=np.float32)
+    for b in range(8):
+        out += ((spread >> b) & 1).astype(np.float32)
+    return out
+
+
 def score_by_shift(
    resp: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
    bin_has_data: np.ndarray | None = None,