perf: spread bitmap uint8 + pre-NMS prima di refine (3.5x globale, 49x worst case)
Due ottimizzazioni chiave:
1. Spread bitmap uint8 invece di response map (N_BINS, H, W) float32
- 32x meno memoria, cache-friendly
- Nuovi kernel Numba: _jit_score_bitmap, _jit_popcount_density
- Formato: spread[y,x] bit b = bin b attivo nel raggio di spread
- _refine_angle usa slicing su bitmap con mask & bit
2. Pre-NMS prima di refine_angle/verify_ncc
- Problema: loop 'for raw in candidati' applicava refine+verify A OGNI
candidato prima del check NMS → 2000+ refine chiamati per ~25 match
- Fix: pre-NMS su (cx, cy) subpixel, limita a max_matches*3 candidati,
poi refine + verify solo su quelli
- Esempio worst case: lama_full_fast 55.9s → 1.13s (49x)
Benchmark suite 16 scenari (4 immagini x full/part x fast/preciso):
prima: totale find 94.6s
dopo: totale find 27.3s (3.5x globale)
casi peggiori <5s (prima erano >50s)
ROI parziali (solo metà oggetto) funzionano in tutti i casi.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+96
-4
@@ -45,13 +45,12 @@ if HAS_NUMBA:
|
||||
resp: np.ndarray, # float32 (N_BINS, H, W)
|
||||
dx: np.ndarray, # int32 (N,)
|
||||
dy: np.ndarray, # int32 (N,)
|
||||
bins: np.ndarray, # int8 or int32 (N,)
|
||||
bins: np.ndarray, # int8 (N,)
|
||||
bin_active: np.ndarray, # bool_ (N_BINS,)
|
||||
) -> np.ndarray:
|
||||
n_bins, H, W = resp.shape
|
||||
_, H, W = resp.shape
|
||||
N = dx.shape[0]
|
||||
acc = np.zeros((H, W), dtype=np.float32)
|
||||
# Parallelizza per riga: niente race (ogni y scrive solo acc[y, :])
|
||||
for y in nb.prange(H):
|
||||
for i in range(N):
|
||||
b = bins[i]
|
||||
@@ -73,7 +72,59 @@ if HAS_NUMBA:
|
||||
acc[y, x] *= inv
|
||||
return acc
|
||||
|
||||
# Warmup: precompila con dummy data
|
||||
@nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False)
def _jit_score_bitmap(
    spread: np.ndarray,    # uint8 (H, W); bit b set = bin b active
    dx: np.ndarray,        # int32 (N,)
    dy: np.ndarray,        # int32 (N,)
    bins: np.ndarray,      # int8 (N,), bin of each feature
    bit_active: np.uint8,  # bitmask of the bins active in the scene
) -> np.ndarray:
    """Score every pixel against a feature list using the spread bitmap.

    score[y, x] = (number of features i whose bit ``bins[i]`` is set in
    ``spread[y + dy[i], x + dx[i]]``) / N.

    Features whose bin is not flagged in ``bit_active``, or whose shifted
    position falls outside the image, contribute nothing to the sum but are
    still counted in N.

    The bitmap needs 32x less memory than the float32 response map, which
    makes it cache-friendly.

    Returns:
        float32 array of shape (H, W).
    """
    rows, cols = spread.shape
    n_feat = dx.shape[0]
    score = np.zeros((rows, cols), dtype=np.float32)

    # Parallel over output rows: each iteration writes only score[row, :].
    for row in nb.prange(rows):
        for k in range(n_feat):
            bin_bit = np.uint8(1) << bins[k]
            if (bit_active & bin_bit) != 0:
                src_row = row + dy[k]
                if 0 <= src_row < rows:
                    shift = dx[k]
                    # Clip the column range so col + shift stays in [0, cols).
                    first = max(0, -shift)
                    last = min(cols, cols - shift)
                    for col in range(first, last):
                        if spread[src_row, col + shift] & bin_bit:
                            score[row, col] += 1.0

    # Normalise the hit counts by the total number of features.
    if n_feat > 0:
        scale = 1.0 / n_feat
        for row in nb.prange(rows):
            for col in range(cols):
                score[row, col] *= scale
    return score
|
||||
|
||||
@nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False)
def _jit_popcount_density(spread: np.ndarray) -> np.ndarray:
    """Count the set bits of every pixel.

    Returns:
        float32 array of shape (H, W) with values in [0..8].
    """
    rows, cols = spread.shape
    density = np.zeros((rows, cols), dtype=np.float32)
    for row in nb.prange(rows):
        for col in range(cols):
            bits = spread[row, col]
            # Manual popcount: add neighbouring bits, then 2-bit groups,
            # then the two nibbles.
            pairs = (bits & 0x55) + ((bits >> 1) & 0x55)
            nibbles = (pairs & 0x33) + ((pairs >> 2) & 0x33)
            total = (nibbles & 0x0F) + ((nibbles >> 4) & 0x0F)
            density[row, col] = float(total)
    return density
|
||||
|
||||
def _warmup():
|
||||
resp = np.zeros((8, 32, 32), dtype=np.float32)
|
||||
dx = np.zeros(1, dtype=np.int32)
|
||||
@@ -81,16 +132,57 @@ if HAS_NUMBA:
|
||||
b = np.zeros(1, dtype=np.int8)
|
||||
ba = np.ones(8, dtype=np.bool_)
|
||||
_jit_score_by_shift(resp, dx, dy, b, ba)
|
||||
spread = np.zeros((32, 32), dtype=np.uint8)
|
||||
_jit_score_bitmap(spread, dx, dy, b, np.uint8(0xFF))
|
||||
_jit_popcount_density(spread)
|
||||
|
||||
else: # pragma: no cover
|
||||
|
||||
def _jit_score_by_shift(resp, dx, dy, bins, bin_active):
|
||||
raise RuntimeError("numba non disponibile")
|
||||
|
||||
def _jit_score_bitmap(spread, dx, dy, bins, bit_active):
|
||||
raise RuntimeError("numba non disponibile")
|
||||
|
||||
def _jit_popcount_density(spread):
|
||||
raise RuntimeError("numba non disponibile")
|
||||
|
||||
def _warmup():
|
||||
pass
|
||||
|
||||
|
||||
def score_bitmap(
    spread: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
    bit_active: int,
) -> np.ndarray:
    """Score a spread bitmap against a feature list.

    Dispatches to the numba kernel when numba is available and there is at
    least one feature; otherwise falls back to a (slow) NumPy path that
    expands the bitmap into a per-bin response stack.

    Args:
        spread: (H, W) bitmap; bit ``b`` set means bin ``b`` is active at
            that pixel. Coerced to uint8 on both paths.
        dx: per-feature x offsets.
        dy: per-feature y offsets.
        bins: per-feature bin index.
        bit_active: bitmask of the bins to take into account. Only the low
            8 bits are meaningful, one per bin.

    Returns:
        The score map produced by the selected backend.
    """
    # The bitmap holds 8 bins, so only the low byte matters. Masking first
    # also keeps np.uint8() from rejecting out-of-range Python ints.
    active = int(bit_active) & 0xFF

    if HAS_NUMBA and len(dx) > 0:
        return _jit_score_bitmap(
            np.ascontiguousarray(spread, dtype=np.uint8),
            np.ascontiguousarray(dx, dtype=np.int32),
            np.ascontiguousarray(dy, dtype=np.int32),
            np.ascontiguousarray(bins, dtype=np.int8),
            np.uint8(active),
        )

    # NumPy fallback (slow): expand the bitmap into a 3-D response stack.
    bitmap = np.asarray(spread, dtype=np.uint8)
    H, W = bitmap.shape
    # Clear the bits of inactive bins so they contribute nothing, matching
    # the JIT kernel, which skips features whose bin is not in bit_active.
    visible = bitmap & np.uint8(active)
    resp = np.zeros((8, H, W), dtype=np.float32)
    for b in range(8):
        resp[b] = ((visible >> b) & 1).astype(np.float32)
    return _numpy_score_by_shift(resp, dx, dy, bins, None)
|
||||
|
||||
|
||||
def popcount_density(spread: np.ndarray) -> np.ndarray:
    """Count the set bits of every pixel of a spread bitmap.

    Uses the numba kernel when numba is available; otherwise sums the
    eight bit planes with NumPy.

    Returns:
        float32 array of shape (H, W).
    """
    if not HAS_NUMBA:
        # Fallback: accumulate one bit plane at a time.
        rows, cols = spread.shape
        density = np.zeros((rows, cols), dtype=np.float32)
        for bit in range(8):
            plane = np.bitwise_and(np.right_shift(spread, bit), 1)
            density += plane.astype(np.float32)
        return density
    return _jit_popcount_density(np.ascontiguousarray(spread, dtype=np.uint8))
|
||||
|
||||
|
||||
def score_by_shift(
|
||||
resp: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
|
||||
bin_has_data: np.ndarray | None = None,
|
||||
|
||||
Reference in New Issue
Block a user