From ba54b42fdccf026b4e824b417ad77b04fa783c11 Mon Sep 17 00:00:00 2001
From: AdrianoDev <adrianodalpastro@gmail.com>
Date: Fri, 24 Apr 2026 02:11:33 +0200
Subject: [PATCH] perf: spread bitmap uint8 + pre-NMS prima refine (3.5x
 globale, 49x worst case)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Due ottimizzazioni chiave:

1. Spread bitmap uint8 invece di response map (N_BINS, H, W) float32
   - 32x meno memoria, cache-friendly
   - Nuovi kernel Numba: _jit_score_bitmap, _jit_popcount_density
   - Formato: spread[y,x] bit b = bin b attivo nel raggio di spread
   - _refine_angle usa slicing su bitmap con mask & bit

2. Pre-NMS prima di refine_angle/verify_ncc
   - Problema: loop 'for raw in candidati' applicava refine+verify A OGNI
     candidato prima del check NMS → 2000+ refine chiamati per ~25 match
   - Fix: pre-NMS su (cx, cy) subpixel, limita a max_matches*3 candidati,
     poi refine + verify solo su quelli
   - Esempio worst case: lama_full_fast 55.9s → 1.13s (49x)

Benchmark suite 16 scenari (4 casi su 3 immagini x full/part x fast/preciso):
  prima: totale find 94.6s
  dopo:  totale find 27.3s (3.5x globale)
  casi peggiori <5s (prima erano >50s)

ROI parziali (solo metà oggetto) funzionano in tutti i casi.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/test_suite.py |  96 ++++++++++++++++++++++++++++++++++++
 pm2d/_jit_kernels.py     | 100 +++++++++++++++++++++++++++++++++++--
 pm2d/line_matcher.py     | 104 ++++++++++++++++++++++++++-------------
 3 files changed, 262 insertions(+), 38 deletions(-)
 create mode 100644 benchmarks/test_suite.py

diff --git a/benchmarks/test_suite.py b/benchmarks/test_suite.py
new file mode 100644
index 0000000..4518892
--- /dev/null
+++ b/benchmarks/test_suite.py
@@ -0,0 +1,96 @@
+"""Test suite esaustivo su Test/*.png con varie configurazioni.
+
+Esegue matrix (immagine, ROI completa/parziale, config) e stampa tempi/match.
+"""
+from __future__ import annotations
+
+import time
+from pathlib import Path
+
+import cv2
+import numpy as np
+
+from pm2d import LineShapeMatcher
+from pm2d.gui import draw_matches
+
+
+TEST_DIR = Path(__file__).parent.parent / "Test"
+OUT_DIR = Path("/tmp/pm2d_suite"); OUT_DIR.mkdir(exist_ok=True)  # created at import time
+
+# Cases: (name, image, (y0,y1,x0,x1) full ROI, (y0,y1,x0,x1) partial ROI)
+CASES = [
+    ("clip",   "clip.png",            ( 60, 200,  90, 290), ( 60, 135,  90, 290)),
+    ("ruota",  "rings_and_nuts.png",  ( 55, 175,  90, 215), ( 55, 115,  90, 215)),
+    ("dado",   "rings_and_nuts.png",  (255, 375,  40, 170), (255, 315,  40, 170)),
+    ("lama",   "razors2.png",         ( 90, 370, 120, 160), ( 90, 230, 120, 160)),
+]
+
+CONFIGS = [  # (name, keyword arguments forwarded to LineShapeMatcher)
+    ("fast",    dict(angle_step_deg=10.0, scale_range=(1.0, 1.0),
+                     pyramid_levels=3, num_features=64)),
+    ("preciso", dict(angle_step_deg=5.0,  scale_range=(0.5, 1.1), scale_step=0.05,
+                     pyramid_levels=3, num_features=96)),
+]
+
+
+def bench(case_name: str, img_path: str, roi_box: tuple, roi_kind: str,
+          cfg_name: str, cfg: dict) -> dict:
+    """Train on one ROI, time one find() on the whole scene, save the overlay, return a result row."""
+    scene = cv2.imread(str(TEST_DIR / img_path))
+    if scene is None:  # cv2.imread signals a missing/unreadable file with None
+        raise FileNotFoundError(f"cannot read image: {TEST_DIR / img_path}")
+    y0, y1, x0, x1 = roi_box
+    roi = scene[y0:y1, x0:x1].copy()
+    m = LineShapeMatcher(
+        angle_range_deg=(0.0, 360.0),
+        weak_grad=30, strong_grad=60,
+        spread_radius=5, n_threads=4, **cfg,
+    )
+    t0 = time.perf_counter()  # monotonic, high-resolution clock for timings
+    n_var = m.train(roi)
+    t_train = time.perf_counter() - t0
+    # warmup (the first call pays for JIT compilation)
+    m.find(scene, min_score=0.55, max_matches=3, refine_angle=False)
+
+    t0 = time.perf_counter()
+    matches = m.find(
+        scene, min_score=0.55, max_matches=25, nms_radius=None,
+        refine_angle=True, subpixel=True, verify_threshold=0.4,
+    )
+    t_find = time.perf_counter() - t0
+
+    tag = f"{case_name}_{roi_kind}_{cfg_name}"
+    overlay = draw_matches(scene, matches,
+                           template_gray=cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY))
+    cv2.imwrite(str(OUT_DIR / f"{tag}.png"), overlay)
+    scores = [x.score for x in matches]  # collected once for the min/max below
+    return {
+        "case": tag,
+        "roi": f"{roi.shape[1]}x{roi.shape[0]}",
+        "variants": n_var,
+        "train_s": t_train,
+        "find_s": t_find,
+        "n_match": len(matches),
+        "score_range": f"{min(scores):.2f}..{max(scores):.2f}" if scores else "-",
+    }
+
+
+def main():  # run every case x ROI kind x config through bench() and print one row each
+    print(f"{'case':30s} {'roi':>9s}  {'var':>4s}  "
+          f"{'train':>6s}  {'find':>6s}  {'n':>3s}  score")
+    print("-" * 85)
+    total_find = 0.0  # running sum of the find() timings returned by bench()
+    for case_name, img, roi_full, roi_part in CASES:
+        for roi_kind, roi_box in [("full", roi_full), ("part", roi_part)]:
+            for cfg_name, cfg in CONFIGS:
+                r = bench(case_name, img, roi_box, roi_kind, cfg_name, cfg)
+                print(f"{r['case']:30s} {r['roi']:>9s}  {r['variants']:>4d}  "
+                      f"{r['train_s']:>5.2f}s  {r['find_s']:>5.2f}s  "
+                      f"{r['n_match']:>3d}  {r['score_range']}")
+                total_find += r["find_s"]
+    print("-" * 85)
+    print(f"totale find: {total_find:.1f}s  overlay salvati in {OUT_DIR}/")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pm2d/_jit_kernels.py b/pm2d/_jit_kernels.py
index 6d0087e..39de405 100644
--- a/pm2d/_jit_kernels.py
+++ b/pm2d/_jit_kernels.py
@@ -45,13 +45,12 @@ if HAS_NUMBA:
         resp: np.ndarray,        # float32 (N_BINS, H, W)
         dx: np.ndarray,          # int32 (N,)
         dy: np.ndarray,          # int32 (N,)
-        bins: np.ndarray,        # int8 or int32 (N,)
+        bins: np.ndarray,        # int8 (N,)
         bin_active: np.ndarray,  # bool_ (N_BINS,)
     ) -> np.ndarray:
-        n_bins, H, W = resp.shape
+        _, H, W = resp.shape
         N = dx.shape[0]
         acc = np.zeros((H, W), dtype=np.float32)
-        # Parallelizza per riga: niente race (ogni y scrive solo acc[y, :])
         for y in nb.prange(H):
             for i in range(N):
                 b = bins[i]
@@ -73,7 +72,59 @@ if HAS_NUMBA:
                     acc[y, x] *= inv
         return acc
 
-    # Warmup: precompila con dummy data
+    @nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False)
+    def _jit_score_bitmap(
+        spread: np.ndarray,      # uint8 (H, W), bit b = bin b active
+        dx: np.ndarray,          # int32 (N,)
+        dy: np.ndarray,          # int32 (N,)
+        bins: np.ndarray,        # int8 (N,) bin of each feature
+        bit_active: np.uint8,    # bitmask of the bins active in the scene
+    ) -> np.ndarray:
+        """score[y,x] = (Σ_i [bit bins[i] set in spread[y+dy_i, x+dx_i]]) / N.
+
+        32x less memory than the float32 response map, hence cache-friendly.
+        """
+        H, W = spread.shape
+        N = dx.shape[0]
+        acc = np.zeros((H, W), dtype=np.float32)
+        for y in nb.prange(H):  # each iteration writes only row acc[y, :]
+            for i in range(N):
+                b = bins[i]
+                mask = np.uint8(1) << b
+                if (bit_active & mask) == 0:  # skip features whose bin is not flagged active
+                    continue
+                ddy = dy[i]
+                yy = y + ddy
+                if yy < 0 or yy >= H:
+                    continue
+                ddx = dx[i]
+                x_lo = 0 if ddx >= 0 else -ddx  # clamp so that x + ddx stays inside [0, W)
+                x_hi = W if ddx <= 0 else W - ddx
+                for x in range(x_lo, x_hi):
+                    if spread[yy, x + ddx] & mask:
+                        acc[y, x] += 1.0
+        if N > 0:
+            inv = 1.0 / N  # normalise the hit count by the number of features
+            for y in nb.prange(H):
+                for x in range(W):
+                    acc[y, x] *= inv
+        return acc
+
+    @nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False)
+    def _jit_popcount_density(spread: np.ndarray) -> np.ndarray:
+        """Count the set bits of every pixel: returns an (H, W) float32 array in [0..8]."""
+        H, W = spread.shape
+        out = np.zeros((H, W), dtype=np.float32)
+        for y in nb.prange(H):
+            for x in range(W):
+                v = spread[y, x]
+                # manual popcount: sum adjacent 1-bit, then 2-bit, then 4-bit fields
+                v = (v & 0x55) + ((v >> 1) & 0x55)
+                v = (v & 0x33) + ((v >> 2) & 0x33)
+                v = (v & 0x0F) + ((v >> 4) & 0x0F)
+                out[y, x] = float(v)
+        return out
+
     def _warmup():
         resp = np.zeros((8, 32, 32), dtype=np.float32)
         dx = np.zeros(1, dtype=np.int32)
@@ -81,16 +132,57 @@ if HAS_NUMBA:
         b = np.zeros(1, dtype=np.int8)
         ba = np.ones(8, dtype=np.bool_)
         _jit_score_by_shift(resp, dx, dy, b, ba)
+        spread = np.zeros((32, 32), dtype=np.uint8)
+        _jit_score_bitmap(spread, dx, dy, b, np.uint8(0xFF))
+        _jit_popcount_density(spread)
 
 else:  # pragma: no cover
 
     def _jit_score_by_shift(resp, dx, dy, bins, bin_active):
         raise RuntimeError("numba non disponibile")
 
+    def _jit_score_bitmap(spread, dx, dy, bins, bit_active):  # stub for the no-numba branch
+        raise RuntimeError("numba non disponibile")
+
+    def _jit_popcount_density(spread):  # stub for the no-numba branch
+        raise RuntimeError("numba non disponibile")
+
     def _warmup():
         pass
 
 
+def score_bitmap(
+    spread: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
+    bit_active: int,
+) -> np.ndarray:
+    """Bitmap scoring dispatch: JIT kernel if numba is present and dx is non-empty, else NumPy."""
+    if HAS_NUMBA and len(dx) > 0:
+        return _jit_score_bitmap(
+            np.ascontiguousarray(spread, dtype=np.uint8),
+            np.ascontiguousarray(dx, dtype=np.int32),
+            np.ascontiguousarray(dy, dtype=np.int32),
+            np.ascontiguousarray(bins, dtype=np.int8),
+            np.uint8(bit_active),
+        )
+    # NumPy fallback (slow): expands the bitmap into a 3D response map
+    H, W = spread.shape
+    resp = np.zeros((8, H, W), dtype=np.float32)
+    for b in range(8):
+        resp[b] = ((spread >> b) & 1).astype(np.float32)  # plane b is 1.0 where bit b is set
+    return _numpy_score_by_shift(resp, dx, dy, bins, None)  # bit_active is not forwarded here
+
+
+def popcount_density(spread: np.ndarray) -> np.ndarray:  # per-pixel count of set bits, float32
+    if HAS_NUMBA:
+        return _jit_popcount_density(np.ascontiguousarray(spread, dtype=np.uint8))
+    # NumPy fallback: add up the 8 bit-planes one at a time
+    H, W = spread.shape
+    out = np.zeros((H, W), dtype=np.float32)
+    for b in range(8):
+        out += ((spread >> b) & 1).astype(np.float32)
+    return out
+
+
 def score_by_shift(
     resp: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
     bin_has_data: np.ndarray | None = None,
diff --git a/pm2d/line_matcher.py b/pm2d/line_matcher.py
index 623fed5..5540777 100644
--- a/pm2d/line_matcher.py
+++ b/pm2d/line_matcher.py
@@ -33,7 +33,12 @@ from dataclasses import dataclass
 import cv2
 import numpy as np
 
-from pm2d._jit_kernels import score_by_shift as _jit_score_by_shift, HAS_NUMBA
+from pm2d._jit_kernels import (
+    score_by_shift as _jit_score_by_shift,
+    score_bitmap as _jit_score_bitmap,
+    popcount_density as _jit_popcount,
+    HAS_NUMBA,
+)
 
 N_BINS = 8  # orientamenti quantizzati modulo π
 
@@ -286,11 +291,7 @@ class LineShapeMatcher:
     # --- Matching ------------------------------------------------------
 
     def _response_map(self, gray: np.ndarray) -> np.ndarray:
-        """Response map shape (N_BINS, H, W) float32 0/1.
-
-        Rinormalizzazione anti-background (match vs texture densa) è
-        applicata a valle nel `find()` via `_bg_map` locale.
-        """
+        """Response map shape (N_BINS, H, W) float32 (legacy path)."""
         mag, bins = self._gradient(gray)
         valid = mag >= self.weak_grad
         k = 2 * self.spread_radius + 1
@@ -303,6 +304,23 @@ class LineShapeMatcher:
             raw[b] = d.astype(np.float32)
         return raw
 
+    def _spread_bitmap(self, gray: np.ndarray) -> np.ndarray:
+        """Spread bitmap uint8: bit b is set where bin b occurs within the spread radius.
+
+        Compact format, 32x denser than the (N_BINS, H, W) float32 response map.
+        """
+        mag, bins = self._gradient(gray)
+        valid = mag >= self.weak_grad  # keep only pixels at or above the weak-gradient threshold
+        k = 2 * self.spread_radius + 1
+        kernel = np.ones((k, k), dtype=np.uint8)
+        H, W = gray.shape
+        spread = np.zeros((H, W), dtype=np.uint8)
+        for b in range(N_BINS):
+            mask_b = ((bins == b) & valid).astype(np.uint8)
+            d = cv2.dilate(mask_b, kernel)  # spread bin b over the k x k neighbourhood
+            spread |= (d << b)  # store the dilated 0/1 mask in bit b
+        return spread
+
     @staticmethod
     def _score_by_shift(
         resp: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray,
@@ -333,7 +351,8 @@ class LineShapeMatcher:
 
     def _refine_angle(
         self,
-        resp0: np.ndarray,
+        spread0: np.ndarray,       # bitmap uint8 (H, W)
+        bit_active: int,
         template_gray: np.ndarray,
         cx: float, cy: float,
         angle_deg: float, scale: float,
@@ -366,7 +385,7 @@ class LineShapeMatcher:
                                      cv2.BORDER_CONSTANT, value=0)
         center = (diag / 2.0, diag / 2.0)
 
-        H, W = resp0.shape[1], resp0.shape[2]
+        H, W = spread0.shape
         # Ricerca locale posizione con margine ±2 px sulla (cx, cy)
         margin = 3
 
@@ -385,13 +404,14 @@ class LineShapeMatcher:
                 continue
             dx = (fx - center[0]).astype(np.int32)
             dy = (fy - center[1]).astype(np.int32)
-            # Finestra locale ±margin attorno a (cx, cy) via slicing vettorizzato
+            # Finestra locale ±margin attorno a (cx, cy) via slicing su bitmap
             y_lo = int(cy) - margin; y_hi = int(cy) + margin + 1
             x_lo = int(cx) - margin; x_hi = int(cx) + margin + 1
             sh = y_hi - y_lo; sw = x_hi - x_lo
             acc = np.zeros((sh, sw), dtype=np.float32)
             for i in range(len(dx)):
                 ddx = int(dx[i]); ddy = int(dy[i]); b = int(fb[i])
+                bit = np.uint8(1 << b)
                 sy0 = y_lo + ddy; sy1 = y_hi + ddy
                 sx0 = x_lo + ddx; sx1 = x_hi + ddx
                 a_y0 = max(0, -sy0); a_y1 = sh - max(0, sy1 - H)
@@ -399,7 +419,10 @@ class LineShapeMatcher:
                 s_y0 = max(0, sy0); s_y1 = min(H, sy1)
                 s_x0 = max(0, sx0); s_x1 = min(W, sx1)
                 if s_y1 > s_y0 and s_x1 > s_x0:
-                    acc[a_y0:a_y1, a_x0:a_x1] += resp0[b, s_y0:s_y1, s_x0:s_x1]
+                    region = spread0[s_y0:s_y1, s_x0:s_x1]
+                    acc[a_y0:a_y1, a_x0:a_x1] += (
+                        (region & bit) != 0
+                    ).astype(np.float32)
             acc /= len(dx)
             _, max_val, _, max_loc = cv2.minMaxLoc(acc)
             scores_by_off[float(off)] = float(max_val)
@@ -487,18 +510,19 @@ class LineShapeMatcher:
             grays.append(cv2.pyrDown(grays[-1]))
         top = len(grays) - 1
 
-        # Response map top-level
-        resp_top = self._response_map(grays[top])
-        bin_has_top = np.array([resp_top[b].any() for b in range(N_BINS)])
+        # Spread bitmap (uint8) al top level: 32× meno memoria della response
+        # map float32 → MOLTO più cache-friendly per _score_by_shift.
+        spread_top = self._spread_bitmap(grays[top])
+        bit_active_top = int(
+            sum(1 << b for b in range(N_BINS)
+                if (spread_top & np.uint8(1 << b)).any())
+        )
         if nms_radius is None:
             nms_radius = max(8, min(self.template_size) // 2)
         top_thresh = min_score * self.top_score_factor
 
-        # Background map PER-SCALA: densità media bin attivi normalizzata
-        # su bbox template scalata. Rinormalizza score per isolare contributo
-        # non-random e riduce FP in zone con attivazione densa.
         tw, th = self.template_size
-        density_top = resp_top.sum(axis=0)
+        density_top = _jit_popcount(spread_top)
         sf_top = 2 ** top
         bg_cache_top: dict[float, np.ndarray] = {}
         bg_cache_full: dict[float, np.ndarray] = {}
@@ -521,8 +545,8 @@ class LineShapeMatcher:
         def _top_score(vi: int) -> tuple[int, float]:
             var = self.variants[vi]
             lvl = var.levels[min(top, len(var.levels) - 1)]
-            score = self._score_by_shift(
-                resp_top, lvl.dx, lvl.dy, lvl.bin, bin_has_data=bin_has_top,
+            score = _jit_score_bitmap(
+                spread_top, lvl.dx, lvl.dy, lvl.bin, bit_active_top,
             )
             score = _rescore(score, bg_cache_top[var.scale])
             return vi, float(score.max()) if score.size else -1.0
@@ -549,18 +573,21 @@ class LineShapeMatcher:
         max_vars_full = max(max_matches * 8, len(self.variants) // 2)
         kept_variants = kept_variants[:max_vars_full]
 
-        # Full-res (parallelizzato per variante)
-        resp0 = self._response_map(gray0)
-        bin_has_full = np.array([resp0[b].any() for b in range(N_BINS)])
-        density_full = resp0.sum(axis=0)
+        # Full-res (parallelizzato) con bitmap
+        spread0 = self._spread_bitmap(gray0)
+        bit_active_full = int(
+            sum(1 << b for b in range(N_BINS)
+                if (spread0 & np.uint8(1 << b)).any())
+        )
+        density_full = _jit_popcount(spread0)
         for sc in unique_scales:
             bg_cache_full[sc] = _bg_for_scale(density_full, sc, 1)
 
         def _full_score(vi: int) -> tuple[int, np.ndarray]:
             var = self.variants[vi]
             lvl0 = var.levels[0]
-            score = self._score_by_shift(
-                resp0, lvl0.dx, lvl0.dy, lvl0.bin, bin_has_data=bin_has_full,
+            score = _jit_score_bitmap(
+                spread0, lvl0.dx, lvl0.dy, lvl0.bin, bit_active_full,
             )
             score = _rescore(score, bg_cache_full[var.scale])
             return vi, score
@@ -595,28 +622,37 @@ class LineShapeMatcher:
         h, w = self.template_gray.shape if self.template_gray is not None else (0, 0)
         mask_full = np.full((h, w), 255, dtype=np.uint8)
 
-        kept: list[Match] = []
+        # Pre-NMS rapido su raw (solo subpixel, no refine/verify): riduce
+        # i candidati a ~max_matches*3 prima di operazioni costose (refine,
+        # verify) che erano chiamate per ogni raw causando lentezze 100x.
         r2 = nms_radius * nms_radius
-        tw, th = self.template_size
+        preliminary: list[tuple[float, float, float, int]] = []
+        pre_cap = max(max_matches * 3, max_matches + 10)
         for score, xi, yi, vi in raw:
-            var = self.variants[vi]
-            cx_f = float(xi); cy_f = float(yi)
             if subpixel and vi in score_maps:
                 cx_f, cy_f = self._subpixel_peak(score_maps[vi], xi, yi)
-
-            if any((k.cx - cx_f) ** 2 + (k.cy - cy_f) ** 2 < r2 for k in kept):
+            else:
+                cx_f, cy_f = float(xi), float(yi)
+            if any((k[1] - cx_f) ** 2 + (k[2] - cy_f) ** 2 < r2
+                   for k in preliminary):
                 continue
+            preliminary.append((score, cx_f, cy_f, vi))
+            if len(preliminary) >= pre_cap:
+                break
 
+        # Ora refine + verify solo sui candidati pre-NMS
+        kept: list[Match] = []
+        tw, th = self.template_size
+        for score, cx_f, cy_f, vi in preliminary:
+            var = self.variants[vi]
             ang_f = var.angle_deg
             score_f = score
             if refine_angle and self.template_gray is not None:
                 ang_f, score_f, cx_f, cy_f = self._refine_angle(
-                    resp0, self.template_gray, cx_f, cy_f,
+                    spread0, bit_active_full, self.template_gray, cx_f, cy_f,
                     var.angle_deg, var.scale, mask_full,
                     search_radius=self.angle_step_deg / 2.0,
                 )
-
-            # Verify NCC: filtra falsi positivi con mismatch pixel-level
             if verify_ncc:
                 ncc = self._verify_ncc(gray0, cx_f, cy_f, ang_f, var.scale)
                 if ncc < verify_threshold: