merge: greediness (kernel greedy alternativo a rescore strided)

This commit is contained in:
2026-05-04 15:45:15 +02:00
2 changed files with 101 additions and 7 deletions
+84
View File
@@ -167,6 +167,61 @@ if HAS_NUMBA:
acc[y, x] = 0.0
return acc
@nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False)
def _jit_score_bitmap_greedy(
    spread: np.ndarray,
    dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
    bit_active: np.uint8,
    min_score: nb.float32,
    greediness: nb.float32,
) -> np.ndarray:
    """Score every pixel by feature hits, with a greedy early exit.

    For each pixel the features are visited in order. Feature ``i`` is a
    hit when its bin bit is enabled in ``bit_active``, the offset position
    ``(y + dy[i], x + dx[i])`` lies inside ``spread``, and that bit is set
    in ``spread`` at that position. Every other outcome is a miss.

    After a miss the scan of the pixel stops as soon as the current hit
    count plus the number of features still to visit is below
    ``greediness * min_score * N``. The pixel then keeps the partial
    count reached so far.

    * ``greediness == 0``: the bound is 0, so the scan never stops early.
    * ``greediness == 1``: stop once ``hits + remaining < min_score * N``.
    * Author's note from the original docstring: typical values are
      0.7-0.9 (reported 2-4x speed-up; not verified here).

    No background rescoring is performed by this kernel.

    Returns:
        float32 array with the shape of ``spread`` holding ``hits / N``
        per pixel, where N is ``dx.shape[0]`` (disabled bins still count
        in N). All zeros when N is 0.
    """
    rows, cols = spread.shape
    n_feat = dx.shape[0]
    scores = np.zeros((rows, cols), dtype=np.float32)
    if n_feat == 0:
        return scores
    # Minimum hit count a pixel must still be able to reach.
    needed = greediness * min_score * n_feat
    scale = nb.float32(1.0 / n_feat)
    for y in nb.prange(rows):
        for x in range(cols):
            hits = 0
            for i in range(n_feat):
                mask = np.uint8(1) << bins[i]
                matched = False
                if (bit_active & mask) != 0:
                    yy = y + dy[i]
                    if yy >= 0 and yy < rows:
                        xx = x + dx[i]
                        if xx >= 0 and xx < cols:
                            matched = (spread[yy, xx] & mask) != 0
                if matched:
                    hits += 1
                elif hits + (n_feat - i - 1) < needed:
                    # Even if every remaining feature hit, the bound is
                    # out of reach: abandon this pixel.
                    break
            scores[y, x] = nb.float32(hits) * scale
    return scores
@nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False)
def _jit_score_bitmap_rescored(
spread: np.ndarray, # uint8 (H, W)
@@ -245,6 +300,10 @@ if HAS_NUMBA:
_jit_score_bitmap_rescored_strided(
spread, dx, dy, b, np.uint8(0xFF), bg, np.int32(2),
)
_jit_score_bitmap_greedy(
spread, dx, dy, b, np.uint8(0xFF),
np.float32(0.5), np.float32(0.8),
)
_jit_popcount_density(spread)
else: # pragma: no cover
@@ -261,6 +320,9 @@ else: # pragma: no cover
def _jit_score_bitmap_rescored_strided(spread, dx, dy, bins, bit_active, bg, stride):
raise RuntimeError("numba non disponibile")
def _jit_score_bitmap_greedy(spread, dx, dy, bins, bit_active, min_score, greediness):
raise RuntimeError("numba non disponibile")
def _jit_popcount_density(spread):
raise RuntimeError("numba non disponibile")
@@ -319,6 +381,28 @@ def score_bitmap_rescored(
return np.maximum(0.0, out).astype(np.float32)
def score_bitmap_greedy(
    spread: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
    bit_active: int, min_score: float, greediness: float,
) -> np.ndarray:
    """Score bitmap with greedy early exit, meant for an aggressive coarse pass.

    No background rescoring is applied. The original author recommends it
    for low-clutter scenes or for quickly mass-pruning variants at the top
    level.

    When numba is available and there is at least one feature, the inputs
    are converted to contiguous arrays (``spread`` uint8, ``dx``/``dy``
    int32, ``bins`` int8) and passed to ``_jit_score_bitmap_greedy``.
    Otherwise the call falls back to ``score_bitmap``, which has no early
    exit; ``min_score`` and ``greediness`` are ignored on that path.

    Args:
        spread: Spread bitmap, converted to uint8.
        dx, dy: Per-feature offsets; ``len(dx)`` is the feature count.
        bins: Per-feature bin index, used as a bit position.
        bit_active: Bitmask of enabled bins.
        min_score: Score threshold the early exit is derived from.
        greediness: Early-exit strength. 0 disables early exit; 1 stops
            a pixel as soon as it can no longer reach ``min_score``.
            Values outside [0, 1] are clamped, and NaN is treated as 0.

    Returns:
        The score map produced by the selected kernel.

    Raises:
        ValueError: On the numba path, if ``dy`` or ``bins`` has fewer
            entries than ``dx``.
    """
    n_features = len(dx)
    if HAS_NUMBA and n_features > 0:
        # The kernel is compiled with boundscheck=False and indexes
        # dy[i] / bins[i] for every i < len(dx); shorter arrays would be
        # read out of bounds silently.
        if len(dy) < n_features or len(bins) < n_features:
            raise ValueError(
                "dy and bins must have at least len(dx) entries "
                f"(dx={n_features}, dy={len(dy)}, bins={len(bins)})"
            )
        # Above 1 the exit bound would exceed min_score * N and discard
        # pixels that could still reach min_score. NaN is undefined under
        # fastmath, so it is mapped to "no early exit".
        greed = float(greediness)
        if greed != greed:
            greed = 0.0
        else:
            greed = min(1.0, max(0.0, greed))
        return _jit_score_bitmap_greedy(
            np.ascontiguousarray(spread, dtype=np.uint8),
            np.ascontiguousarray(dx, dtype=np.int32),
            np.ascontiguousarray(dy, dtype=np.int32),
            np.ascontiguousarray(bins, dtype=np.int8),
            np.uint8(bit_active),
            np.float32(min_score), np.float32(greed),
        )
    # Fallback: base kernel without early exit.
    return score_bitmap(spread, dx, dy, bins, bit_active)
def popcount_density(spread: np.ndarray) -> np.ndarray:
if HAS_NUMBA:
return _jit_popcount_density(np.ascontiguousarray(spread, dtype=np.uint8))