feat: profile mode + bench suite + skip-bin-vuoti + variant pruning histogram

4 ottimizzazioni performance + visibilita': GGG. find(profile=True) → timing per fase - _checkpoint() registra ms tra: to_gray, spread_top, top_pruning, full_kernel, refine_verify_nms - get_last_profile() ritorna dict ms per identificare bottleneck - Costo runtime trascurabile (~5 us per call) HHH. pm2d.bench - benchmark suite eseguibile - 3 scenarios (rect/L/circle x scene clean/cluttered) - 5 configs (baseline, polarity, propagate, greedy, stride) - Auto-aggiunge gpu_umat se opencl_available() - Tabella ms/find + profile per ogni combo - Entry-point pm2d-bench (--quick per smoke test 2 iter) XX. Skip dilate per bin vuoti in _spread_bitmap - Pre-calcolo bin presenti via np.unique sui pixel valid - Su scene a bassa varianza orientation skip 50-70% delle dilate - Misurato benchmark: spread_top da ~0.3ms a ~0.1ms in molti casi VV. Variant pruning preliminare via istogramma orientation - Per ogni variante calcolo overlap (feature bins ∩ scene bins) / total feature bins - Se overlap < 0.5 * min_score → skip variante (no kernel call) - Counter n_variants_pruned_histogram nel diag - Vantaggio: scene focalizzate (poche direzioni dominanti) skippano varianti template con bin assenti dalla scena Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 12:25:15 +02:00
parent ee1c4a8f92
commit 6d6dcc3b7a
3 changed files with 260 additions and 0 deletions
@@ -736,7 +736,24 @@ class LineShapeMatcher:
        nb = self._n_bins
        dtype = np.uint16 if nb > 8 else np.uint8
        spread = np.zeros((H, W), dtype=dtype)
+        # XX optimization: skip dilate per bin senza pixel attivi.
+        # Su scene a bassa varianza orientation (es. pezzi industriali con
+        # poche direzioni dominanti) tipicamente 50-70% dei bin sono vuoti.
+        # Pre-calcolo bin presenti via mask globale; per bin assenti niente
+        # dilate (resta zero nel bitmap).
+        if isinstance(bins, np.ndarray):
+            valid_bins = bins[valid] if isinstance(valid, np.ndarray) else None
+            if valid_bins is not None and valid_bins.size > 0:
+                bin_present = np.zeros(nb, dtype=bool)
+                unique_bins = np.unique(valid_bins)
+                bin_present[unique_bins[unique_bins < nb]] = True
+            else:
+                bin_present = np.zeros(nb, dtype=bool)
+        else:
+            bin_present = np.ones(nb, dtype=bool)
        for b in range(nb):
+            if not bin_present[b]:
+                continue  # XX: nessun pixel di questo bin sopra weak_grad
            mask_b = ((bins == b) & valid).astype(np.uint8)
            if self.use_gpu:
                d = cv2.dilate(cv2.UMat(mask_b), kernel)
@@ -1358,6 +1375,7 @@ class LineShapeMatcher:
        use_soft_score: bool = False,
        subpixel_lm: bool = False,
        debug: bool = False,
+        profile: bool = False,
    ) -> list[Match]:
        """
        scale_penalty: se > 0, riduce lo score per match a scala diversa da 1.0:
@@ -1390,6 +1408,7 @@ class LineShapeMatcher:
            "drop_recall_low": 0,
            "drop_bbox_out_of_scene": 0,
            "drop_nms_iou": 0,
+            "n_variants_pruned_histogram": 0,
            "n_final": 0,
            "top_thresh_used": 0.0,
            "verify_threshold_used": float(verify_threshold),
@@ -1401,7 +1420,21 @@ class LineShapeMatcher:
        }
        self._last_diag = diag

+        # GGG: profile mode → timing per fase, esposto via get_last_profile()
+        import time as _time
+        prof = {} if profile else None
+        _t_prev = _time.perf_counter() if profile else 0.0
+        def _checkpoint(name: str) -> None:
+            nonlocal _t_prev
+            if prof is None:
+                return
+            now = _time.perf_counter()
+            prof[name] = (now - _t_prev) * 1000.0  # ms since previous checkpoint
+            _t_prev = now
+        self._last_profile = prof
+
        gray_full = self._to_gray(scene_bgr)
+        _checkpoint("to_gray")
        # Applica ROI di ricerca: restringe scena a crop, ricorda offset per
        # ri-traslare le coordinate dei match a fine pipeline.
        if search_roi is not None:
@@ -1440,6 +1473,7 @@ class LineShapeMatcher:
            spread0 = None
            bit_active_full = None
            density_full = None
+        _checkpoint("spread_top")
        if nms_radius is None:
            nms_radius = max(8, min(self.template_size) // 2)
        # Pruning adattivo allo step angolare: con step piccolo (<= 3 deg)
@@ -1501,6 +1535,38 @@ class LineShapeMatcher:
                end = min(n, i + half + 1)
                neighbor_map[vi_c] = vi_sorted[start:end]

+        # VV: preliminary pruning via orientation-histogram overlap.
+        # Scene-active bins vs. variant feature bins: a variant whose
+        # features mostly lie in bins absent from the scene is skipped
+        # without calling the kernel. Cost: one np.isin per variant.
+        scene_bins = np.array(
+            [bool((bit_active_top >> b) & 1) for b in range(self._n_bins)],
+            dtype=bool,
+        )
+        if scene_bins.any():
+            n_scene_active = int(scene_bins.sum())
+            # Threshold: skip a variant when the fraction of its features
+            # in scene-present bins is below 0.5 * min_score (not a flat 50%).
+            pruned_idx_list = []
+            n_pruned = 0
+            for vi in coarse_idx_list:
+                lvl = self.variants[vi].levels[
+                    min(top, len(self.variants[vi].levels) - 1)
+                ]
+                if len(lvl.bin) == 0:
+                    continue
+                feat_in_scene = int(np.isin(lvl.bin, np.where(scene_bins)[0]).sum())
+                ratio = feat_in_scene / len(lvl.bin)
+                if ratio < 0.5 * min_score:
+                    n_pruned += 1
+                    continue
+                pruned_idx_list.append(vi)
+            if n_pruned > 0 and pruned_idx_list:
+                coarse_idx_list = pruned_idx_list
+            diag["n_variants_pruned_histogram"] = n_pruned
+        else:
+            diag["n_variants_pruned_histogram"] = 0
+
        # Pruning varianti via top-level (parallelizzato).
        # coarse_stride > 1: 1 pixel ogni stride (~stride^2 speed-up).
        # pyramid_propagate=True: top-K picchi per restringere full-res.
@@ -1596,6 +1662,7 @@ class LineShapeMatcher:
        kept_variants: list[tuple[int, float]] = [
            (vi, score_by_vi[vi]) for vi in expanded
        ]
+        _checkpoint("top_pruning")

        if not kept_variants:
            return []
@@ -1702,6 +1769,7 @@ class LineShapeMatcher:

        raw.sort(key=lambda c: -c[0])
        diag["n_raw_candidates"] = len(raw)
+        _checkpoint("full_kernel")

        # Mappa vi → score_map per subpixel/refinement
        score_maps = dict(candidates_per_var)
@@ -1869,6 +1937,9 @@ class LineShapeMatcher:
            if len(kept) >= max_matches:
                break
        diag["n_final"] = len(kept)
+        _checkpoint("refine_verify_nms")
+        if profile:
+            self._last_profile = prof
        if debug:
            # Debug mode: stampa diagnostica su stderr per visibilita' immediata.
            import sys as _sys
@@ -1892,6 +1963,15 @@ class LineShapeMatcher:
            f"final={diag['n_final']} (top_thresh={diag['top_thresh_used']:.2f})"
        )

+    def get_last_profile(self) -> dict | None:
+        """Return the per-phase timings of the last find(profile=True) call.
+
+        Keys: to_gray, spread_top, top_pruning, full_kernel,
+        refine_verify_nms (milliseconds); an early return from find() may
+        omit later keys. None if find() never ran or ran with profile=False.
+        """
+        return getattr(self, "_last_profile", None)
+
    def get_last_diag(self) -> dict | None:
        """Ritorna dict diagnostica dell'ultima chiamata find().