From ba54b42fdccf026b4e824b417ad77b04fa783c11 Mon Sep 17 00:00:00 2001 From: AdrianoDev Date: Fri, 24 Apr 2026 02:11:33 +0200 Subject: [PATCH] perf: spread bitmap uint8 + pre-NMS prima refine (3.5x globale, 49x worst case) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due ottimizzazioni chiave: 1. Spread bitmap uint8 invece di response map (N_BINS, H, W) float32 - 32x meno memoria, cache-friendly - Nuovi kernel Numba: _jit_score_bitmap, _jit_popcount_density - Formato: spread[y,x] bit b = bin b attivo nel raggio di spread - _refine_angle usa slicing su bitmap con mask & bit 2. Pre-NMS prima di refine_angle/verify_ncc - Problema: loop 'for raw in candidati' applicava refine+verify A OGNI candidato prima del check NMS → 2000+ refine chiamati per ~25 match - Fix: pre-NMS su (cx, cy) subpixel, limita a max_matches*3 candidati, poi refine + verify solo su quelli - Esempio worst case: lama_full_fast 55.9s → 1.13s (49x) Benchmark suite 16 scenari (4 immagini x full/part x fast/preciso): prima: totale find 94.6s dopo: totale find 27.3s (3.5x globale) casi peggiori <5s (prima erano >50s) ROI parziali (solo metà oggetto) funzionano in tutti i casi. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/test_suite.py | 96 ++++++++++++++++++++++++++++++++++++ pm2d/_jit_kernels.py | 100 +++++++++++++++++++++++++++++++++++-- pm2d/line_matcher.py | 104 ++++++++++++++++++++++++++------------- 3 files changed, 262 insertions(+), 38 deletions(-) create mode 100644 benchmarks/test_suite.py diff --git a/benchmarks/test_suite.py b/benchmarks/test_suite.py new file mode 100644 index 0000000..4518892 --- /dev/null +++ b/benchmarks/test_suite.py @@ -0,0 +1,96 @@ +"""Test suite esaustivo su Test/*.png con varie configurazioni. + +Esegue matrix (immagine, ROI completa/parziale, config) e stampa tempi/match. 
+""" +from __future__ import annotations + +import time +from pathlib import Path + +import cv2 +import numpy as np + +from pm2d import LineShapeMatcher +from pm2d.gui import draw_matches + + +TEST_DIR = Path(__file__).parent.parent / "Test" +OUT_DIR = Path("/tmp/pm2d_suite"); OUT_DIR.mkdir(exist_ok=True) + +# Casi: (nome, immagine, (y0,y1,x0,x1) roi completa, (y0,y1,x0,x1) roi parziale) +CASES = [ + ("clip", "clip.png", ( 60, 200, 90, 290), ( 60, 135, 90, 290)), + ("ruota", "rings_and_nuts.png", ( 55, 175, 90, 215), ( 55, 115, 90, 215)), + ("dado", "rings_and_nuts.png", (255, 375, 40, 170), (255, 315, 40, 170)), + ("lama", "razors2.png", ( 90, 370, 120, 160), ( 90, 230, 120, 160)), +] + +CONFIGS = [ + ("fast", dict(angle_step_deg=10.0, scale_range=(1.0, 1.0), + pyramid_levels=3, num_features=64)), + ("preciso", dict(angle_step_deg=5.0, scale_range=(0.5, 1.1), scale_step=0.05, + pyramid_levels=3, num_features=96)), +] + + +def bench(case_name: str, img_path: str, roi_box: tuple, roi_kind: str, + cfg_name: str, cfg: dict) -> dict: + scene = cv2.imread(str(TEST_DIR / img_path)) + y0, y1, x0, x1 = roi_box + roi = scene[y0:y1, x0:x1].copy() + m = LineShapeMatcher( + angle_range_deg=(0.0, 360.0), + weak_grad=30, strong_grad=60, + spread_radius=5, n_threads=4, **cfg, + ) + t0 = time.time() + n_var = m.train(roi) + t_train = time.time() - t0 + # warmup (prima call è JIT compile) + m.find(scene, min_score=0.55, max_matches=3, refine_angle=False) + + t0 = time.time() + matches = m.find( + scene, min_score=0.55, max_matches=25, nms_radius=None, + refine_angle=True, subpixel=True, verify_threshold=0.4, + ) + t_find = time.time() - t0 + + tag = f"{case_name}_{roi_kind}_{cfg_name}" + overlay = draw_matches(scene, matches, + template_gray=cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)) + cv2.imwrite(str(OUT_DIR / f"{tag}.png"), overlay) + + return { + "case": tag, + "roi": f"{roi.shape[1]}x{roi.shape[0]}", + "variants": n_var, + "train_s": t_train, + "find_s": t_find, + "n_match": 
len(matches), + "score_range": ( + f"{min(x.score for x in matches):.2f}..{max(x.score for x in matches):.2f}" + if matches else "-" + ), + } + + +def main(): + print(f"{'case':30s} {'roi':>9s} {'var':>4s} " + f"{'train':>6s} {'find':>6s} {'n':>3s} score") + print("-" * 85) + total_find = 0.0 + for case_name, img, roi_full, roi_part in CASES: + for roi_kind, roi_box in [("full", roi_full), ("part", roi_part)]: + for cfg_name, cfg in CONFIGS: + r = bench(case_name, img, roi_box, roi_kind, cfg_name, cfg) + print(f"{r['case']:30s} {r['roi']:>9s} {r['variants']:>4d} " + f"{r['train_s']:>5.2f}s {r['find_s']:>5.2f}s " + f"{r['n_match']:>3d} {r['score_range']}") + total_find += r["find_s"] + print("-" * 85) + print(f"totale find: {total_find:.1f}s overlay salvati in {OUT_DIR}/") + + +if __name__ == "__main__": + main() diff --git a/pm2d/_jit_kernels.py b/pm2d/_jit_kernels.py index 6d0087e..39de405 100644 --- a/pm2d/_jit_kernels.py +++ b/pm2d/_jit_kernels.py @@ -45,13 +45,12 @@ if HAS_NUMBA: resp: np.ndarray, # float32 (N_BINS, H, W) dx: np.ndarray, # int32 (N,) dy: np.ndarray, # int32 (N,) - bins: np.ndarray, # int8 or int32 (N,) + bins: np.ndarray, # int8 (N,) bin_active: np.ndarray, # bool_ (N_BINS,) ) -> np.ndarray: - n_bins, H, W = resp.shape + _, H, W = resp.shape N = dx.shape[0] acc = np.zeros((H, W), dtype=np.float32) - # Parallelizza per riga: niente race (ogni y scrive solo acc[y, :]) for y in nb.prange(H): for i in range(N): b = bins[i] @@ -73,7 +72,59 @@ if HAS_NUMBA: acc[y, x] *= inv return acc - # Warmup: precompila con dummy data + @nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False) + def _jit_score_bitmap( + spread: np.ndarray, # uint8 (H, W), bit b = bin b attivo + dx: np.ndarray, # int32 (N,) + dy: np.ndarray, # int32 (N,) + bins: np.ndarray, # int8 (N,) bin per ogni feature + bit_active: np.uint8, # bitmask bin attivi in scena + ) -> np.ndarray: + """score[y,x] = (Σ_i [bit bins[i] acceso in spread[y+dy_i, x+dx_i]]) / N. 
+ + 32× meno memoria di response map float32 → cache-friendly. + """ + H, W = spread.shape + N = dx.shape[0] + acc = np.zeros((H, W), dtype=np.float32) + for y in nb.prange(H): + for i in range(N): + b = bins[i] + mask = np.uint8(1) << b + if (bit_active & mask) == 0: + continue + ddy = dy[i] + yy = y + ddy + if yy < 0 or yy >= H: + continue + ddx = dx[i] + x_lo = 0 if ddx >= 0 else -ddx + x_hi = W if ddx <= 0 else W - ddx + for x in range(x_lo, x_hi): + if spread[yy, x + ddx] & mask: + acc[y, x] += 1.0 + if N > 0: + inv = 1.0 / N + for y in nb.prange(H): + for x in range(W): + acc[y, x] *= inv + return acc + + @nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False) + def _jit_popcount_density(spread: np.ndarray) -> np.ndarray: + """Conta bit set per pixel: ritorna (H, W) float32 in [0..8].""" + H, W = spread.shape + out = np.zeros((H, W), dtype=np.float32) + for y in nb.prange(H): + for x in range(W): + v = spread[y, x] + # popcount manuale + v = (v & 0x55) + ((v >> 1) & 0x55) + v = (v & 0x33) + ((v >> 2) & 0x33) + v = (v & 0x0F) + ((v >> 4) & 0x0F) + out[y, x] = float(v) + return out + def _warmup(): resp = np.zeros((8, 32, 32), dtype=np.float32) dx = np.zeros(1, dtype=np.int32) @@ -81,16 +132,57 @@ if HAS_NUMBA: b = np.zeros(1, dtype=np.int8) ba = np.ones(8, dtype=np.bool_) _jit_score_by_shift(resp, dx, dy, b, ba) + spread = np.zeros((32, 32), dtype=np.uint8) + _jit_score_bitmap(spread, dx, dy, b, np.uint8(0xFF)) + _jit_popcount_density(spread) else: # pragma: no cover def _jit_score_by_shift(resp, dx, dy, bins, bin_active): raise RuntimeError("numba non disponibile") + def _jit_score_bitmap(spread, dx, dy, bins, bit_active): + raise RuntimeError("numba non disponibile") + + def _jit_popcount_density(spread): + raise RuntimeError("numba non disponibile") + def _warmup(): pass +def score_bitmap( + spread: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray, + bit_active: int, +) -> np.ndarray: + """Dispatch bitmap: JIT se numba, fallback 
numpy.""" + if HAS_NUMBA and len(dx) > 0: + return _jit_score_bitmap( + np.ascontiguousarray(spread, dtype=np.uint8), + np.ascontiguousarray(dx, dtype=np.int32), + np.ascontiguousarray(dy, dtype=np.int32), + np.ascontiguousarray(bins, dtype=np.int8), + np.uint8(bit_active), + ) + # Fallback numpy (lento): converte bitmap a response 3D + H, W = spread.shape + resp = np.zeros((8, H, W), dtype=np.float32) + for b in range(8): + resp[b] = ((spread >> b) & 1).astype(np.float32) + return _numpy_score_by_shift(resp, dx, dy, bins, None) + + +def popcount_density(spread: np.ndarray) -> np.ndarray: + if HAS_NUMBA: + return _jit_popcount_density(np.ascontiguousarray(spread, dtype=np.uint8)) + # Fallback + H, W = spread.shape + out = np.zeros((H, W), dtype=np.float32) + for b in range(8): + out += ((spread >> b) & 1).astype(np.float32) + return out + + def score_by_shift( resp: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray, bin_has_data: np.ndarray | None = None, diff --git a/pm2d/line_matcher.py b/pm2d/line_matcher.py index 623fed5..5540777 100644 --- a/pm2d/line_matcher.py +++ b/pm2d/line_matcher.py @@ -33,7 +33,12 @@ from dataclasses import dataclass import cv2 import numpy as np -from pm2d._jit_kernels import score_by_shift as _jit_score_by_shift, HAS_NUMBA +from pm2d._jit_kernels import ( + score_by_shift as _jit_score_by_shift, + score_bitmap as _jit_score_bitmap, + popcount_density as _jit_popcount, + HAS_NUMBA, +) N_BINS = 8 # orientamenti quantizzati modulo π @@ -286,11 +291,7 @@ class LineShapeMatcher: # --- Matching ------------------------------------------------------ def _response_map(self, gray: np.ndarray) -> np.ndarray: - """Response map shape (N_BINS, H, W) float32 0/1. - - Rinormalizzazione anti-background (match vs texture densa) è - applicata a valle nel `find()` via `_bg_map` locale. 
- """ + """Response map shape (N_BINS, H, W) float32 (legacy path).""" mag, bins = self._gradient(gray) valid = mag >= self.weak_grad k = 2 * self.spread_radius + 1 @@ -303,6 +304,23 @@ class LineShapeMatcher: raw[b] = d.astype(np.float32) return raw + def _spread_bitmap(self, gray: np.ndarray) -> np.ndarray: + """Spread bitmap uint8: bit b acceso dove bin b è presente nel raggio. + + Formato compatto 32× più denso della response map (N_BINS, H, W) float32. + """ + mag, bins = self._gradient(gray) + valid = mag >= self.weak_grad + k = 2 * self.spread_radius + 1 + kernel = np.ones((k, k), dtype=np.uint8) + H, W = gray.shape + spread = np.zeros((H, W), dtype=np.uint8) + for b in range(N_BINS): + mask_b = ((bins == b) & valid).astype(np.uint8) + d = cv2.dilate(mask_b, kernel) + spread |= (d << b) + return spread + @staticmethod def _score_by_shift( resp: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray, @@ -333,7 +351,8 @@ class LineShapeMatcher: def _refine_angle( self, - resp0: np.ndarray, + spread0: np.ndarray, # bitmap uint8 (H, W) + bit_active: int, template_gray: np.ndarray, cx: float, cy: float, angle_deg: float, scale: float, @@ -366,7 +385,7 @@ class LineShapeMatcher: cv2.BORDER_CONSTANT, value=0) center = (diag / 2.0, diag / 2.0) - H, W = resp0.shape[1], resp0.shape[2] + H, W = spread0.shape # Ricerca locale posizione con margine ±2 px sulla (cx, cy) margin = 3 @@ -385,13 +404,14 @@ class LineShapeMatcher: continue dx = (fx - center[0]).astype(np.int32) dy = (fy - center[1]).astype(np.int32) - # Finestra locale ±margin attorno a (cx, cy) via slicing vettorizzato + # Finestra locale ±margin attorno a (cx, cy) via slicing su bitmap y_lo = int(cy) - margin; y_hi = int(cy) + margin + 1 x_lo = int(cx) - margin; x_hi = int(cx) + margin + 1 sh = y_hi - y_lo; sw = x_hi - x_lo acc = np.zeros((sh, sw), dtype=np.float32) for i in range(len(dx)): ddx = int(dx[i]); ddy = int(dy[i]); b = int(fb[i]) + bit = np.uint8(1 << b) sy0 = y_lo + ddy; sy1 = y_hi + ddy 
sx0 = x_lo + ddx; sx1 = x_hi + ddx a_y0 = max(0, -sy0); a_y1 = sh - max(0, sy1 - H) @@ -399,7 +419,10 @@ class LineShapeMatcher: s_y0 = max(0, sy0); s_y1 = min(H, sy1) s_x0 = max(0, sx0); s_x1 = min(W, sx1) if s_y1 > s_y0 and s_x1 > s_x0: - acc[a_y0:a_y1, a_x0:a_x1] += resp0[b, s_y0:s_y1, s_x0:s_x1] + region = spread0[s_y0:s_y1, s_x0:s_x1] + acc[a_y0:a_y1, a_x0:a_x1] += ( + (region & bit) != 0 + ).astype(np.float32) acc /= len(dx) _, max_val, _, max_loc = cv2.minMaxLoc(acc) scores_by_off[float(off)] = float(max_val) @@ -487,18 +510,19 @@ class LineShapeMatcher: grays.append(cv2.pyrDown(grays[-1])) top = len(grays) - 1 - # Response map top-level - resp_top = self._response_map(grays[top]) - bin_has_top = np.array([resp_top[b].any() for b in range(N_BINS)]) + # Spread bitmap (uint8) al top level: 32× meno memoria della response + # map float32 → MOLTO più cache-friendly per lo scoring (_jit_score_bitmap). + spread_top = self._spread_bitmap(grays[top]) + bit_active_top = int( + sum(1 << b for b in range(N_BINS) + if (spread_top & np.uint8(1 << b)).any()) + ) if nms_radius is None: nms_radius = max(8, min(self.template_size) // 2) top_thresh = min_score * self.top_score_factor - # Background map PER-SCALA: densità media bin attivi normalizzata - # su bbox template scalata. Rinormalizza score per isolare contributo - # non-random e riduce FP in zone con attivazione densa. 
tw, th = self.template_size - density_top = resp_top.sum(axis=0) + density_top = _jit_popcount(spread_top) sf_top = 2 ** top bg_cache_top: dict[float, np.ndarray] = {} bg_cache_full: dict[float, np.ndarray] = {} @@ -521,8 +545,8 @@ class LineShapeMatcher: def _top_score(vi: int) -> tuple[int, float]: var = self.variants[vi] lvl = var.levels[min(top, len(var.levels) - 1)] - score = self._score_by_shift( - resp_top, lvl.dx, lvl.dy, lvl.bin, bin_has_data=bin_has_top, + score = _jit_score_bitmap( + spread_top, lvl.dx, lvl.dy, lvl.bin, bit_active_top, ) score = _rescore(score, bg_cache_top[var.scale]) return vi, float(score.max()) if score.size else -1.0 @@ -549,18 +573,21 @@ class LineShapeMatcher: max_vars_full = max(max_matches * 8, len(self.variants) // 2) kept_variants = kept_variants[:max_vars_full] - # Full-res (parallelizzato per variante) - resp0 = self._response_map(gray0) - bin_has_full = np.array([resp0[b].any() for b in range(N_BINS)]) - density_full = resp0.sum(axis=0) + # Full-res (parallelizzato) con bitmap + spread0 = self._spread_bitmap(gray0) + bit_active_full = int( + sum(1 << b for b in range(N_BINS) + if (spread0 & np.uint8(1 << b)).any()) + ) + density_full = _jit_popcount(spread0) for sc in unique_scales: bg_cache_full[sc] = _bg_for_scale(density_full, sc, 1) def _full_score(vi: int) -> tuple[int, np.ndarray]: var = self.variants[vi] lvl0 = var.levels[0] - score = self._score_by_shift( - resp0, lvl0.dx, lvl0.dy, lvl0.bin, bin_has_data=bin_has_full, + score = _jit_score_bitmap( + spread0, lvl0.dx, lvl0.dy, lvl0.bin, bit_active_full, ) score = _rescore(score, bg_cache_full[var.scale]) return vi, score @@ -595,28 +622,37 @@ class LineShapeMatcher: h, w = self.template_gray.shape if self.template_gray is not None else (0, 0) mask_full = np.full((h, w), 255, dtype=np.uint8) - kept: list[Match] = [] + # Pre-NMS rapido su raw (solo subpixel, no refine/verify): riduce + # i candidati a ~max_matches*3 prima di operazioni costose (refine, + # verify) che 
erano chiamate per ogni raw causando lentezze 100x. r2 = nms_radius * nms_radius - tw, th = self.template_size + preliminary: list[tuple[float, float, float, int]] = [] + pre_cap = max(max_matches * 3, max_matches + 10) for score, xi, yi, vi in raw: - var = self.variants[vi] - cx_f = float(xi); cy_f = float(yi) if subpixel and vi in score_maps: cx_f, cy_f = self._subpixel_peak(score_maps[vi], xi, yi) - - if any((k.cx - cx_f) ** 2 + (k.cy - cy_f) ** 2 < r2 for k in kept): + else: + cx_f, cy_f = float(xi), float(yi) + if any((k[1] - cx_f) ** 2 + (k[2] - cy_f) ** 2 < r2 + for k in preliminary): continue + preliminary.append((score, cx_f, cy_f, vi)) + if len(preliminary) >= pre_cap: + break + # Ora refine + verify solo sui candidati pre-NMS + kept: list[Match] = [] + tw, th = self.template_size + for score, cx_f, cy_f, vi in preliminary: + var = self.variants[vi] ang_f = var.angle_deg score_f = score if refine_angle and self.template_gray is not None: ang_f, score_f, cx_f, cy_f = self._refine_angle( - resp0, self.template_gray, cx_f, cy_f, + spread0, bit_active_full, self.template_gray, cx_f, cy_f, var.angle_deg, var.scale, mask_full, search_radius=self.angle_step_deg / 2.0, ) - - # Verify NCC: filtra falsi positivi con mismatch pixel-level if verify_ncc: ncc = self._verify_ncc(gray0, cx_f, cy_f, ang_f, var.scale) if ncc < verify_threshold: