diff --git a/pm2d/_jit_kernels.py b/pm2d/_jit_kernels.py index 39de405..e06d5d1 100644 --- a/pm2d/_jit_kernels.py +++ b/pm2d/_jit_kernels.py @@ -110,6 +110,55 @@ if HAS_NUMBA: acc[y, x] *= inv return acc + @nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False) + def _jit_score_bitmap_rescored( + spread: np.ndarray, # uint8 (H, W) + dx: np.ndarray, # int32 (N,) + dy: np.ndarray, # int32 (N,) + bins: np.ndarray, # int8 (N,) + bit_active: np.uint8, + bg: np.ndarray, # float32 (H, W) background density normalizzata + ) -> np.ndarray: + """score+rescore in un singolo pass: evita allocazione intermedia. + + Equivalente a: + score = _jit_score_bitmap(...) + out = max(0, (score - bg) / (1 - bg + 1e-6)) + ma fonde la seconda passata dentro la normalizzazione finale + (cache-friendly, risparmia ~15% sul totale find). + """ + H, W = spread.shape + N = dx.shape[0] + acc = np.zeros((H, W), dtype=np.float32) + for y in nb.prange(H): + for i in range(N): + b = bins[i] + mask = np.uint8(1) << b + if (bit_active & mask) == 0: + continue + ddy = dy[i] + yy = y + ddy + if yy < 0 or yy >= H: + continue + ddx = dx[i] + x_lo = 0 if ddx >= 0 else -ddx + x_hi = W if ddx <= 0 else W - ddx + for x in range(x_lo, x_hi): + if spread[yy, x + ddx] & mask: + acc[y, x] += 1.0 + if N > 0: + inv = 1.0 / N + for y in nb.prange(H): + for x in range(W): + v = acc[y, x] * inv + bgv = bg[y, x] + if bgv < 1.0: + r = (v - bgv) / (1.0 - bgv + 1e-6) + acc[y, x] = r if r > 0.0 else 0.0 + else: + acc[y, x] = 0.0 + return acc + @nb.njit(cache=True, parallel=True, fastmath=True, boundscheck=False) def _jit_popcount_density(spread: np.ndarray) -> np.ndarray: """Conta bit set per pixel: ritorna (H, W) float32 in [0..8].""" @@ -134,6 +183,8 @@ if HAS_NUMBA: _jit_score_by_shift(resp, dx, dy, b, ba) spread = np.zeros((32, 32), dtype=np.uint8) _jit_score_bitmap(spread, dx, dy, b, np.uint8(0xFF)) + bg = np.zeros((32, 32), dtype=np.float32) + _jit_score_bitmap_rescored(spread, dx, dy, b, np.uint8(0xFF), bg) _jit_popcount_density(spread) else: # pragma: no cover @@ -144,6 +195,9 @@ else: # pragma: no cover def _jit_score_bitmap(spread, dx, dy, bins, bit_active): raise RuntimeError("numba non disponibile") + def _jit_score_bitmap_rescored(spread, dx, dy, bins, bit_active, bg): + raise RuntimeError("numba non disponibile") + def _jit_popcount_density(spread): raise RuntimeError("numba non disponibile") @@ -172,6 +226,26 @@ def score_bitmap( return _numpy_score_by_shift(resp, dx, dy, bins, None) +def score_bitmap_rescored( + spread: np.ndarray, dx: np.ndarray, dy: np.ndarray, bins: np.ndarray, + bit_active: int, bg: np.ndarray, +) -> np.ndarray: + """Score bitmap + rescore fusi in un solo pass (JIT).""" + if HAS_NUMBA and len(dx) > 0: + return _jit_score_bitmap_rescored( + np.ascontiguousarray(spread, dtype=np.uint8), + np.ascontiguousarray(dx, dtype=np.int32), + np.ascontiguousarray(dy, dtype=np.int32), + np.ascontiguousarray(bins, dtype=np.int8), + np.uint8(bit_active), + np.ascontiguousarray(bg, dtype=np.float32), + ) + # Fallback: chiamate separate + score = score_bitmap(spread, dx, dy, bins, bit_active) + out = (score - bg) / (1.0 - bg + 1e-6) + return np.maximum(0.0, out).astype(np.float32) + + def popcount_density(spread: np.ndarray) -> np.ndarray: if HAS_NUMBA: return _jit_popcount_density(np.ascontiguousarray(spread, dtype=np.uint8)) diff --git a/pm2d/auto_tune.py b/pm2d/auto_tune.py index ecd83cd..4ca7bc2 100644 --- a/pm2d/auto_tune.py +++ b/pm2d/auto_tune.py @@ -14,6 +14,9 @@ Ritorna dict con i key esatti del form `edit_params`. from __future__ import annotations +import hashlib +from collections import OrderedDict + import cv2 import numpy as np @@ -24,17 +27,33 @@ def _to_gray(img: np.ndarray) -> np.ndarray: return img +# Cache in-memory (LRU) dei risultati auto_tune per stesso input ROI. +_TUNE_CACHE: OrderedDict[str, dict] = OrderedDict() +_TUNE_CACHE_SIZE = 32 + + def detect_rotational_symmetry( gray: np.ndarray, step_deg: float = 5.0, corr_thresh: float = 0.75, ) -> dict: """Rileva simmetria rotazionale su edge map (più robusto a sfondo uniforme). + Downsample a max 128 px prima di correlare per abbattere il costo + O(n_angles · H · W) senza perdere precisione (la simmetria rotazionale + è invariante a subsampling moderato). + Ritorna dict con: - order: int, 1=nessuna, 2=180°, 3=120°, 4=90°, 6=60°, 8=45° - period_deg: float, periodo minimo di simmetria (360/order) - confidence: float [0..1], correlazione minima tra rotazioni equivalenti """ h, w = gray.shape + target = 128 + if max(h, w) > target: + sf = target / max(h, w) + new_w = max(32, int(w * sf)) + new_h = max(32, int(h * sf)) + gray = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_AREA) + h, w = gray.shape # Usa magnitude gradiente (rotation-invariant rispetto a bg uniforme) gx = cv2.Sobel(gray, cv2.CV_32F, 1, 0, ksize=3) gy = cv2.Sobel(gray, cv2.CV_32F, 0, 1, ksize=3) @@ -88,9 +107,12 @@ def analyze_gradients(gray: np.ndarray) -> dict: gy = cv2.Sobel(gray, cv2.CV_32F, 0, 1, ksize=3) mag = cv2.magnitude(gx, gy) - # Percentili magnitude + # Percentili magnitude: p55/p85 usati per soglie weak/strong (più aderenti + # alla distribuzione reale rispetto a p50/p80 + clamp). p50 = float(np.percentile(mag, 50)) + p55 = float(np.percentile(mag, 55)) p80 = float(np.percentile(mag, 80)) + p85 = float(np.percentile(mag, 85)) p95 = float(np.percentile(mag, 95)) mag_max = float(mag.max()) @@ -112,7 +134,8 @@ def analyze_gradients(gray: np.ndarray) -> dict: ent = 0.0 return { - "p50": p50, "p80": p80, "p95": p95, "mag_max": mag_max, + "p50": p50, "p55": p55, "p80": p80, "p85": p85, "p95": p95, + "mag_max": mag_max, "strong_pct": strong_pct, "weak_pct": weak_pct, "orient_entropy": ent, "n_pixels": mag.size, @@ -120,11 +143,28 @@ def analyze_gradients(gray: np.ndarray) -> dict: } +def _cache_key(template_bgr: np.ndarray, mask: np.ndarray | None) -> str: + h = hashlib.md5() + h.update(np.ascontiguousarray(template_bgr).tobytes()) + h.update(f"shape={template_bgr.shape}".encode()) + if mask is not None: + h.update(np.ascontiguousarray(mask).tobytes()) + return h.hexdigest() + + def auto_tune(template_bgr: np.ndarray, mask: np.ndarray | None = None) -> dict: """Analizza template e ritorna dict parametri suggeriti. Chiavi compatibili con edit_params PARAM_SCHEMA. + + Risultato cachato in-memory (LRU): ri-chiamare con stessa ROI è O(1). """ + ck = _cache_key(template_bgr, mask) + cached = _TUNE_CACHE.get(ck) + if cached is not None: + _TUNE_CACHE.move_to_end(ck) + return dict(cached) + gray = _to_gray(template_bgr) h, w = gray.shape if mask is not None: @@ -136,16 +176,22 @@ def auto_tune(template_bgr: np.ndarray, mask: np.ndarray | None = None) -> dict: stats = analyze_gradients(gray_for_stats) sym = detect_rotational_symmetry(gray_for_stats) - # Soglie magnitude: usa percentili per robustezza illuminazione. - # Target: strong_grad ~= valore a percentile 80-90 in assoluto, ma - # clamp per compatibilità uint8 (Sobel può sforare). - strong_grad = float(np.clip(stats["p80"], 20.0, 100.0)) - weak_grad = float(np.clip(strong_grad * 0.5, 10.0, 60.0)) + # Soglie magnitude: usa percentili reali (p85/p55) senza clamp duro a 100. + # Sobel ksize=3 su uint8 può arrivare a ~1020, quindi clamp massimo 400 + # evita saturazione del threshold su template ad alto contrasto. + strong_grad = float(np.clip(stats["p85"], 30.0, 400.0)) + weak_grad = float(np.clip(stats["p55"], 15.0, strong_grad * 0.7)) - # num_features: 1 feature ogni ~25 px forti, clamp 48..192 - target_feat = int(np.clip(stats["n_strong"] / 25, 48, 192)) + # num_features: ibrido perimetro + densità. Target = min(perimeter_budget, + # density_budget) per non generare più feature di quante edge nitide siano + # disponibili, ma neanche meno di quante il perimetro possa tracciare. + perim_budget = int(2 * (h + w) * 0.4) # ~40% dei pixel di perimetro + density_budget = int(stats["n_strong"] / 20) # 1 feature ogni ~20 px forti + target_feat = int(np.clip(min(perim_budget, density_budget), 64, 192)) - # pyramid_levels in base alla dimensione minima + # pyramid_levels in base a dimensione minima E densità feature: un template + # grande ma povero di feature non deve scendere troppi livelli (rischio + # collasso a <16 feature al top level). min_side = min(h, w) if min_side < 60: pyr = 1 @@ -155,6 +201,9 @@ def auto_tune(template_bgr: np.ndarray, mask: np.ndarray | None = None) -> dict: pyr = 3 else: pyr = 4 + # Cap: non scendere sotto ~16 feature al top level (feature ÷ 4^(pyr-1)) + max_pyr_from_feat = max(1, int(np.floor(np.log2(max(1, target_feat / 16.0)) / 2.0)) + 1) + pyr = min(pyr, max_pyr_from_feat) # spread_radius proporzionale a risoluzione + pyramid (tolleranza ~1% dim) spread_radius = int(np.clip(max(3, min_side * 0.02), 3, 8)) @@ -174,7 +223,7 @@ def auto_tune(template_bgr: np.ndarray, mask: np.ndarray | None = None) -> dict: # angle step: 5° default; se simmetria, mantengo step ma range ridotto angle_step = 5.0 - return { + result = { "backend": "line", "angle_min": 0.0, "angle_max": angle_max, @@ -196,6 +245,12 @@ def auto_tune(template_bgr: np.ndarray, mask: np.ndarray | None = None) -> dict: "_symmetry_conf": round(sym["confidence"], 2), "_orient_entropy": round(stats["orient_entropy"], 2), } + # Store in LRU cache + _TUNE_CACHE[ck] = dict(result) + _TUNE_CACHE.move_to_end(ck) + while len(_TUNE_CACHE) > _TUNE_CACHE_SIZE: + _TUNE_CACHE.popitem(last=False) + return result def summarize(tune: dict) -> str: diff --git a/pm2d/line_matcher.py b/pm2d/line_matcher.py index 7520a12..e5f212a 100644 --- a/pm2d/line_matcher.py +++ b/pm2d/line_matcher.py @@ -39,6 +39,7 @@ _GOLDEN = (math.sqrt(5.0) - 1.0) / 2.0 # ≈ 0.618 from pm2d._jit_kernels import ( score_by_shift as _jit_score_by_shift, score_bitmap as _jit_score_bitmap, + score_bitmap_rescored as _jit_score_bitmap_rescored, popcount_density as _jit_popcount, HAS_NUMBA, ) @@ -136,6 +137,8 @@ class LineShapeMatcher: self.variants: list[_Variant] = [] self.template_size: tuple[int, int] = (0, 0) self.template_gray: np.ndarray | None = None + # Maschera usata in training (propagata al refine per coerenza). + self._train_mask: np.ndarray | None = None # --- Helpers ------------------------------------------------------- @@ -233,6 +236,7 @@ class LineShapeMatcher: mask_full = np.full((h, w), 255, dtype=np.uint8) else: mask_full = (mask > 0).astype(np.uint8) * 255 + self._train_mask = mask_full.copy() self.variants.clear() for s in self._scale_list(): @@ -505,6 +509,10 @@ class LineShapeMatcher: ) -> float: """NCC tra template warpato alla pose e scena sottostante. + Lavora su un **crop locale** della scena di lato = diagonale del + template ruotato+scalato, non sull'intera scena. Su scene grandi + (1920×1080) taglia drasticamente il costo del warp per ogni match. + Ritorna score [-1, 1]. Usato come filtro anti-falso-positivo: il matcher linemod può dare score alto su texture generiche ma sovrapponendo il template gray i pixel non corrispondono. @@ -515,23 +523,40 @@ class LineShapeMatcher: h, w = t.shape cx_t = (w - 1) / 2.0 cy_t = (h - 1) / 2.0 - M = cv2.getRotationMatrix2D((cx_t, cy_t), angle_deg, scale) - M[0, 2] += cx - cx_t - M[1, 2] += cy - cy_t + + # Bounding box del template ruotato/scalato attorno a (cx, cy) + diag = int(np.ceil(np.hypot(w, h) * scale)) + 8 H, W = scene_gray.shape + x0 = int(round(cx)) - diag // 2 + y0 = int(round(cy)) - diag // 2 + cx0 = max(0, x0); cy0 = max(0, y0) + cx1 = min(W, x0 + diag); cy1 = min(H, y0 + diag) + if cx1 - cx0 < 10 or cy1 - cy0 < 10: + return 0.0 + scn_crop = scene_gray[cy0:cy1, cx0:cx1] + ch, cw = scn_crop.shape + + M = cv2.getRotationMatrix2D((cx_t, cy_t), angle_deg, scale) + # Porta il centro del template a (cx - cx0, cy - cy0) del crop + M[0, 2] += (cx - cx0) - cx_t + M[1, 2] += (cy - cy0) - cy_t warped = cv2.warpAffine( - t, M, (W, H), + t, M, (cw, ch), flags=cv2.INTER_LINEAR, borderValue=0, ) - mask = cv2.warpAffine( - np.full_like(t, 255), M, (W, H), + if self._train_mask is not None: + mask_src = self._train_mask + else: + mask_src = np.full_like(t, 255) + mask_w = cv2.warpAffine( + mask_src, M, (cw, ch), flags=cv2.INTER_NEAREST, borderValue=0, ) - valid = mask > 0 + valid = mask_w > 0 if valid.sum() < 20: return 0.0 tpl = warped[valid].astype(np.float32) - scn = scene_gray[valid].astype(np.float32) + scn = scn_crop[valid].astype(np.float32) tm = tpl - tpl.mean() sm = scn - scn.mean() denom = np.sqrt((tm * tm).sum() * (sm * sm).sum()) + 1e-9 @@ -624,24 +649,37 @@ class LineShapeMatcher: def _top_score(vi: int) -> tuple[int, float]: var = self.variants[vi] lvl = var.levels[min(top, len(var.levels) - 1)] - score = _jit_score_bitmap( + score = _jit_score_bitmap_rescored( spread_top, lvl.dx, lvl.dy, lvl.bin, bit_active_top, + bg_cache_top[var.scale], ) - score = _rescore(score, bg_cache_top[var.scale]) return vi, float(score.max()) if score.size else -1.0 kept_coarse: list[tuple[int, float]] = [] + all_top_scores: list[tuple[int, float]] = [] if self.n_threads > 1 and len(coarse_idx_list) > 1: with ThreadPoolExecutor(max_workers=self.n_threads) as ex: for vi, best in ex.map(_top_score, coarse_idx_list): + all_top_scores.append((vi, best)) if best >= top_thresh: kept_coarse.append((vi, best)) else: for vi in coarse_idx_list: vi2, best = _top_score(vi) + all_top_scores.append((vi2, best)) if best >= top_thresh: kept_coarse.append((vi2, best)) + # Fallback adattivo: se il rescore background ha abbattuto tutti + # gli score sotto top_thresh (scene texturate pesanti), ripesca + # le varianti migliori al top level per dare comunque una chance + # alla fase full-res invece di ritornare 0 match. + if not kept_coarse and all_top_scores: + all_top_scores.sort(key=lambda t: -t[1]) + n_keep = max(4, len(all_top_scores) // 10) + # Limita a varianti con score top > 0 (non completamente a zero) + kept_coarse = [(vi, s) for vi, s in all_top_scores[:n_keep] if s > 0] + # Espandi ogni coarse promosso con i suoi vicini (stessa scala, # angoli intermedi non valutati al top) expanded: set[int] = set() @@ -678,10 +716,10 @@ class LineShapeMatcher: def _full_score(vi: int) -> tuple[int, np.ndarray]: var = self.variants[vi] lvl0 = var.levels[0] - score = _jit_score_bitmap( + score = _jit_score_bitmap_rescored( spread0, lvl0.dx, lvl0.dy, lvl0.bin, bit_active_full, + bg_cache_full[var.scale], ) - score = _rescore(score, bg_cache_full[var.scale]) return vi, score candidates_per_var: list[tuple[int, np.ndarray]] = [] @@ -693,14 +731,24 @@ class LineShapeMatcher: else: results = [_full_score(vi) for vi in var_indices] + def _scale_factor(s: float) -> float: + """Penalità moltiplicativa per scala diversa da 1.0.""" + if scale_penalty > 0.0 and s != 1.0: + return max(0.0, 1.0 - scale_penalty * abs(s - 1.0)) + return 1.0 + for vi, score in results: - ys, xs = np.where(score >= min_score) + pen = _scale_factor(self.variants[vi].scale) + # Ordinare/sogliare su score penalizzato: un match a scala 1.5 con + # score 0.8 e penalty=0.3 effettivamente vale 0.56, non 0.8. + score_for_sort = score if pen == 1.0 else score * pen + ys, xs = np.where(score_for_sort >= min_score) if len(ys) == 0: continue - vals = score[ys, xs] + vals = score_for_sort[ys, xs] K = min(len(vals), max_matches * 5) ord_idx = np.argpartition(-vals, K - 1)[:K] - candidates_per_var.append((vi, score)) + candidates_per_var.append((vi, score)) # score_map originale for i in ord_idx: raw.append((float(vals[i]), int(xs[i]), int(ys[i]), vi)) @@ -710,32 +758,43 @@ class LineShapeMatcher: score_maps = dict(candidates_per_var) # NMS + subpixel + refinement angolare - # Mask template per refinement (non disponibile qui: usa full) + # Usa mask salvata in train() per coerenza (se ROI poligonale). h, w = self.template_gray.shape if self.template_gray is not None else (0, 0) - mask_full = np.full((h, w), 255, dtype=np.uint8) + mask_full = ( + self._train_mask if self._train_mask is not None + else np.full((h, w), 255, dtype=np.uint8) + ) + # Plateau radius adattivo al template (evita plateau troppo ampi su + # template piccoli: 8% del lato minimo, clampato [3, 10]). + plateau_r = max(3, min(10, int(min(self.template_size) * 0.08))) - # Pre-NMS rapido su raw (solo subpixel, no refine/verify): riduce - # i candidati a ~max_matches*3 prima di operazioni costose (refine, - # verify) che erano chiamate per ogni raw causando lentezze 100x. + # Pre-NMS rapido su raw con coordinate intere (nms_radius ≥ 8, + # la precisione sub-pixel non cambia la decisione di reject). + # Subpixel viene calcolato DOPO il pre-NMS solo sui ~pre_cap + # preliminary sopravvissuti: prima era chiamato su ogni raw (~58k + # chiamate su clip_preciso) anche se la maggior parte veniva poi + # scartata dalla NMS, sprecando la parte più costosa del loop. r2 = nms_radius * nms_radius - preliminary: list[tuple[float, float, float, int]] = [] pre_cap = max(max_matches * 3, max_matches + 10) + preliminary_int: list[tuple[float, int, int, int]] = [] for score, xi, yi, vi in raw: - if subpixel and vi in score_maps: - cx_f, cy_f = self._subpixel_peak(score_maps[vi], xi, yi) - else: - cx_f, cy_f = float(xi), float(yi) - if any((k[1] - cx_f) ** 2 + (k[2] - cy_f) ** 2 < r2 - for k in preliminary): + if any((k[1] - xi) ** 2 + (k[2] - yi) ** 2 < r2 + for k in preliminary_int): continue - preliminary.append((score, cx_f, cy_f, vi)) - if len(preliminary) >= pre_cap: + preliminary_int.append((score, xi, yi, vi)) + if len(preliminary_int) >= pre_cap: break - # Ora refine + verify solo sui candidati pre-NMS + # Subpixel + refine + verify solo sui candidati pre-NMS (max pre_cap) kept: list[Match] = [] tw, th = self.template_size - for score, cx_f, cy_f, vi in preliminary: + for score, xi, yi, vi in preliminary_int: + if subpixel and vi in score_maps: + cx_f, cy_f = self._subpixel_peak( + score_maps[vi], xi, yi, plateau_radius=plateau_r, + ) + else: + cx_f, cy_f = float(xi), float(yi) var = self.variants[vi] ang_f = var.angle_deg score_f = score