Fix precision validation gaps and enhance insert() capabilities

claude · claude · commit def408dfda85 · 2025-11-08T05:28:06.000Z
This commit addresses multiple precision and validation issues identified
in the codebase analysis:

## 1. Input Validation
- Add NaN/Inf validation to insert() methods for both float32 and float64
- Ensures consistency with constructor validation
- Prevents invalid data from entering the tree structure

## 2. Float64 Insert Support
- Add float64 overload for insert() method
- Maintains idx2exact map for dynamically inserted items
- Preserves double-precision refinement capability for inserted boxes
- Uses explicit py::overload_cast in Python bindings to handle overloads

## 3. Precision Testing
- Add comprehensive tests for NaN/Inf validation in insert operations
- Add tests for float64 insert() maintaining precision
- Add tests verifying rebuild() preserves idx2exact
- Add systematic precision boundary tests (adjusted for float32 limits)
- Document float32 precision limitations in test comments

## Technical Notes
- Float64 input is converted to float32 for tree structure
- Double-precision refinement helps reduce false positives
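- Precision limits: float32 keeps about 7 significant digits, so gaps below
  ~1e-7 relative to the coordinate magnitude may not be reliably detected
- At large magnitudes (e.g., 1e6), the absolute resolution is correspondingly
  coarser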

Fixes validation gaps in insert operations and maintains precision
capabilities for dynamically updated trees.
diff --git a/include/prtree/core/prtree.h b/include/prtree/core/prtree.h
@@ -308,6 +308,7 @@ template <IndexType T, int B = 6, int D = 2> class PRTree {
     return obj;
   }
 
+  // Insert with float32 coordinates (no double-precision refinement)
   void insert(const T &idx, const py::array_t<float> &x,
               const std::optional<std::string> objdumps = std::nullopt) {
     // Phase 1: Thread-safety - protect entire insert operation
@@ -342,6 +343,15 @@ template <IndexType T, int B = 6, int D = 2> class PRTree {
         minima[i] = *x.data(i);
         maxima[i] = *x.data(i + D);
       }
+
+      // Validate bounding box (reject NaN/Inf, enforce min <= max)
+      float coords[2 * D];
+      for (int j = 0; j < D; ++j) {
+        coords[j] = minima[j];
+        coords[j + D] = maxima[j];
+      }
+      validate_box(coords, D);
+
       bb = BB<D>(minima, maxima);
     }
     idx2bb.emplace(idx, bb);
@@ -437,6 +447,152 @@ template <IndexType T, int B = 6, int D = 2> class PRTree {
 #endif
   }
 
+  /// Insert one box given as float64 coordinates.
+  /// Exact doubles go to idx2exact; the tree stores the box narrowed to Real.
+  /// @param idx      index of the new item; must not already be in the tree
+  /// @param x        1-D array of 2 * D values: D minima followed by D maxima
+  /// @param objdumps optional payload forwarded to set_obj()
+  /// @throws std::runtime_error on a wrong array shape or a duplicate index;
+  ///         validate_box() is called on both the double and the Real values
+  void insert(const T &idx, const py::array_t<double> &x,
+              const std::optional<std::string> objdumps = std::nullopt) {
+    // Phase 1: Thread-safety - protect entire insert operation
+    std::lock_guard<std::recursive_mutex> lock(*tree_mutex_);
+
+#ifdef MY_DEBUG
+    ProfilerStart("insert.prof");
+    std::cout << "profiler start of insert (float64)" << std::endl;
+#endif
+    vec<size_t> cands;
+    BB<D> bb;
+    std::array<double, 2 * D> exact_coords;
+
+    const auto &buff_info_x = x.request();
+    const auto &shape_x = buff_info_x.shape;
+    const auto &ndim = buff_info_x.ndim;
+    // Check ndim before touching shape_x[0]: a 0-d array has an empty shape.
+    if (unlikely(ndim != 1 || shape_x[0] != 2 * D)) {
+      throw std::runtime_error(
+          "Invalid shape for bounding box array. Expected shape (" +
+          std::to_string(2 * D) + ",) but got shape (" +
+          (ndim > 0 ? std::to_string(shape_x[0]) : std::string()) +
+          ",) with ndim=" + std::to_string(ndim));
+    }
+    if (unlikely(idx2bb.find(idx) != idx2bb.end())) {
+      throw std::runtime_error(
+          "Index already exists in tree: " + std::to_string(idx));
+    }
+    {
+      // Copy and validate the exact doubles (reject NaN/Inf, enforce min <= max)
+      for (int i = 0; i < 2 * D; ++i) {
+        exact_coords[i] = *x.data(i);
+      }
+      validate_box(exact_coords.data(), D);
+
+      // Narrow to Real for the tree and validate again: a finite double outside
+      // Real's range does not survive the cast (typically it turns into +/-Inf),
+      // and such a box must be rejected before any state is modified.
+      Real minima[D];
+      Real maxima[D];
+      Real coords[2 * D];
+      for (int i = 0; i < D; ++i) {
+        coords[i] = minima[i] = static_cast<Real>(exact_coords[i]);
+        coords[i + D] = maxima[i] = static_cast<Real>(exact_coords[i + D]);
+      }
+      validate_box(coords, D);
+
+      bb = BB<D>(minima, maxima);
+    }
+    idx2bb.emplace(idx, bb);
+    idx2exact[idx] = exact_coords; // Store exact coordinates for refinement
+    set_obj(idx, objdumps);
+
+    // Per-axis growth step; the added constant keeps it non-zero at zero extent
+    Real delta[D];
+    for (int i = 0; i < D; ++i) {
+      delta[i] = bb.max(i) - bb.min(i) + 0.00000001;
+    }
+
+    // find the leaf node to insert
+    Real c = 0.0;
+    size_t count = flat_tree.size();
+    while (cands.empty()) {
+      // c starts at 0, so the first pass expands by zero; retries expand more
+      Real d[D];
+      for (int i = 0; i < D; ++i) {
+        d[i] = delta[i] * c;
+      }
+      bb.expand(d);
+      c = (c + 1) * 2;
+
+      queue<size_t> que;
+      auto qpush_if_intersect = [&](const size_t &i) {
+        if (flat_tree[i](bb)) {
+          que.emplace(i);
+        }
+      };
+
+      qpush_if_intersect(0);
+      while (!que.empty()) {
+        size_t i = que.front();
+        que.pop();
+        PRTreeElement<T, B, D> &elem = flat_tree[i];
+
+        if (elem.leaf && elem.leaf->mbb(bb)) {
+          cands.push_back(i);
+        } else {
+          // Children of node i live at indices i * B + 1 .. i * B + B
+          for (size_t offset = 0; offset < B; offset++) {
+            size_t j = i * B + offset + 1;
+            if (j < count)
+              qpush_if_intersect(j);
+          }
+        }
+      }
+    }
+
+    // The loop above only exits with at least one candidate, so cands[0] is a
+    // valid leaf. Restore the unexpanded box before choosing among candidates.
+    bb = idx2bb.at(idx);
+    size_t min_leaf = cands[0];
+    if (cands.size() > 1) {
+      // Pick the leaf whose area grows least; the first candidate seeds the
+      // minimum (a 1e100 sentinel is not representable when Real is float32).
+      bool have_best = false;
+      Real min_diff_area = 0;
+      for (const auto &i : cands) {
+        PRTreeLeaf<T, B, D> tmp_leaf(*flat_tree[i].leaf);
+        Real diff_area = -tmp_leaf.area();
+        tmp_leaf.push(idx, bb);
+        diff_area += tmp_leaf.area();
+        if (!have_best || diff_area < min_diff_area) {
+          have_best = true;
+          min_diff_area = diff_area;
+          min_leaf = i;
+        }
+      }
+    }
+    flat_tree[min_leaf].leaf->push(idx, bb);
+    // Walk from the chosen leaf up to the root (parent of i is (i - 1) / B),
+    // widening each bounding box on the way.
+    for (size_t i = min_leaf;; i = (i - 1) / B) {
+      PRTreeElement<T, B, D> &elem = flat_tree[i];
+      if (elem.leaf)
+        elem.mbb += elem.leaf->mbb;
+      if (i == 0)
+        break;
+      flat_tree[(i - 1) / B].mbb += elem.mbb;
+    }
+
+    if (size() > REBUILD_THRE * n_at_build) {
+      rebuild();
+    }
+#ifdef MY_DEBUG
+    ProfilerStop();
+    std::cout << "profiler end of insert (float64)" << std::endl;
+#endif
+  }
+
   void rebuild() {
     // Phase 1: Thread-safety - protect entire rebuild operation
     std::lock_guard<std::recursive_mutex> lock(*tree_mutex_);
diff --git a/src/cpp/bindings/python_bindings.cc b/src/cpp/bindings/python_bindings.cc
@@ -47,8 +47,21 @@ PYBIND11_MODULE(PRTree, m) {
       .def("get_obj", &PRTree<T, B, 2>::get_obj, R"pbdoc(
           Get string by index
         )pbdoc")
-      .def("insert", &PRTree<T, B, 2>::insert, R"pbdoc(
-          Insert one to prtree
+      .def("insert",
+           py::overload_cast<const T &, const py::array_t<float> &,
+                             const std::optional<std::string>>(
+               &PRTree<T, B, 2>::insert),
+           py::arg("idx"), py::arg("bb"), py::arg("obj") = py::none(),
+           R"pbdoc(
+          Insert one to prtree (float32)
+        )pbdoc")
+      .def("insert",
+           py::overload_cast<const T &, const py::array_t<double> &,
+                             const std::optional<std::string>>(
+               &PRTree<T, B, 2>::insert),
+           py::arg("idx"), py::arg("bb"), py::arg("obj") = py::none(),
+           R"pbdoc(
+          Insert one to prtree (float64 with precision)
         )pbdoc")
       .def("save", &PRTree<T, B, 2>::save, R"pbdoc(
           cereal save
@@ -100,8 +113,21 @@ PYBIND11_MODULE(PRTree, m) {
       .def("get_obj", &PRTree<T, B, 3>::get_obj, R"pbdoc(
           Get string by index
         )pbdoc")
-      .def("insert", &PRTree<T, B, 3>::insert, R"pbdoc(
-          Insert one to prtree
+      .def("insert",
+           py::overload_cast<const T &, const py::array_t<float> &,
+                             const std::optional<std::string>>(
+               &PRTree<T, B, 3>::insert),
+           py::arg("idx"), py::arg("bb"), py::arg("obj") = py::none(),
+           R"pbdoc(
+          Insert one to prtree (float32)
+        )pbdoc")
+      .def("insert",
+           py::overload_cast<const T &, const py::array_t<double> &,
+                             const std::optional<std::string>>(
+               &PRTree<T, B, 3>::insert),
+           py::arg("idx"), py::arg("bb"), py::arg("obj") = py::none(),
+           R"pbdoc(
+          Insert one to prtree (float64 with precision)
         )pbdoc")
       .def("save", &PRTree<T, B, 3>::save, R"pbdoc(
           cereal save
@@ -153,8 +179,21 @@ PYBIND11_MODULE(PRTree, m) {
       .def("get_obj", &PRTree<T, B, 4>::get_obj, R"pbdoc(
           Get string by index
         )pbdoc")
-      .def("insert", &PRTree<T, B, 4>::insert, R"pbdoc(
-          Insert one to prtree
+      .def("insert",
+           py::overload_cast<const T &, const py::array_t<float> &,
+                             const std::optional<std::string>>(
+               &PRTree<T, B, 4>::insert),
+           py::arg("idx"), py::arg("bb"), py::arg("obj") = py::none(),
+           R"pbdoc(
+          Insert one to prtree (float32)
+        )pbdoc")
+      .def("insert",
+           py::overload_cast<const T &, const py::array_t<double> &,
+                             const std::optional<std::string>>(
+               &PRTree<T, B, 4>::insert),
+           py::arg("idx"), py::arg("bb"), py::arg("obj") = py::none(),
+           R"pbdoc(
+          Insert one to prtree (float64 with precision)
         )pbdoc")
       .def("save", &PRTree<T, B, 4>::save, R"pbdoc(
           cereal save
diff --git a/tests/unit/test_insert.py b/tests/unit/test_insert.py
@@ -108,6 +108,50 @@ def test_insert_with_invalid_box(self, PRTree, dim):
         with pytest.raises((ValueError, RuntimeError)):
             tree.insert(idx=1, bb=box)
 
+    @pytest.mark.parametrize("PRTree, dim", [(PRTree2D, 2), (PRTree3D, 3), (PRTree4D, 4)])
+    def test_insert_with_nan_coordinates_float32(self, PRTree, dim):
+        """A float32 box containing NaN must be rejected by insert()."""
+        # All-zero box with NaN written into its first coordinate
+        nan_box = np.full(2 * dim, 0.0, dtype=np.float32)
+        nan_box[0] = float("nan")
+        empty_tree = PRTree()
+
+        with pytest.raises((ValueError, RuntimeError)):
+            empty_tree.insert(idx=1, bb=nan_box)
+
+    @pytest.mark.parametrize("PRTree, dim", [(PRTree2D, 2), (PRTree3D, 3), (PRTree4D, 4)])
+    def test_insert_with_nan_coordinates_float64(self, PRTree, dim):
+        """A float64 box containing NaN must be rejected by insert()."""
+        # All-zero box with NaN written into its first coordinate
+        nan_box = np.full(2 * dim, 0.0, dtype=np.float64)
+        nan_box[0] = float("nan")
+        empty_tree = PRTree()
+
+        with pytest.raises((ValueError, RuntimeError)):
+            empty_tree.insert(idx=1, bb=nan_box)
+
+    @pytest.mark.parametrize("PRTree, dim", [(PRTree2D, 2), (PRTree3D, 3), (PRTree4D, 4)])
+    def test_insert_with_inf_coordinates_float32(self, PRTree, dim):
+        """Verify that insert with Inf coordinates (float32) raises an error."""
+        tree = PRTree()
+        # +Inf goes into a max slot: min (0) <= max (+Inf) still holds, so only
+        # the Inf check can reject this box, not the min <= max ordering check.
+        box = np.zeros(2 * dim, dtype=np.float32)
+        box[dim] = np.inf
+        with pytest.raises((ValueError, RuntimeError)):
+            tree.insert(idx=1, bb=box)
+
+    @pytest.mark.parametrize("PRTree, dim", [(PRTree2D, 2), (PRTree3D, 3), (PRTree4D, 4)])
+    def test_insert_with_inf_coordinates_float64(self, PRTree, dim):
+        """Verify that insert with Inf coordinates (float64) raises an error."""
+        tree = PRTree()
+        # +Inf goes into a max slot: min (0) <= max (+Inf) still holds, so only
+        # the Inf check can reject this box, not the min <= max ordering check.
+        box = np.zeros(2 * dim, dtype=np.float64)
+        box[dim] = np.inf
+        with pytest.raises((ValueError, RuntimeError)):
+            tree.insert(idx=1, bb=box)
+
 
 class TestConsistencyInsert:
     """Test insert consistency."""
@@ -162,3 +206,98 @@ def test_incremental_construction(self, PRTree, dim):
         result2 = tree2.query(query_box)
 
         assert set(result1) == set(result2)
+
+
+class TestPrecisionInsert:
+    """Test insert with precision requirements."""
+
+    @pytest.mark.parametrize("PRTree, dim", [(PRTree2D, 2), (PRTree3D, 3), (PRTree4D, 4)])
+    def test_insert_float64_maintains_precision(self, PRTree, dim):
+        """Verify that float64 insert maintains double-precision refinement."""
+        # Create tree with float64 construction
+        A = np.zeros((1, 2 * dim), dtype=np.float64)
+        A[0, 0] = 0.0
+        A[0, dim] = 75.02750896
+        for i in range(1, dim):
+            A[0, i] = 0.0
+            A[0, i + dim] = 100.0
+
+        tree = PRTree(np.array([0], dtype=np.int64), A)
+
+        # Insert with float64 (small gap)
+        B = np.zeros(2 * dim, dtype=np.float64)
+        B[0] = 75.02751435
+        B[dim] = 100.0
+        for i in range(1, dim):
+            B[i] = 0.0
+            B[i + dim] = 100.0
+
+        tree.insert(idx=1, bb=B)
+
+        # Query should not find intersection due to small gap
+        result = tree.query(B)
+        assert 0 not in result, "Should not find item 0 due to small gap with float64 precision"
+        assert 1 in result, "Should find item 1 (self)"
+
+    @pytest.mark.parametrize("PRTree, dim", [(PRTree2D, 2), (PRTree3D, 3), (PRTree4D, 4)])
+    def test_insert_float32_loses_precision(self, PRTree, dim):
+        """Verify that float32 insert may lose precision for small gaps."""
+        # Create tree with float64 construction
+        A = np.zeros((1, 2 * dim), dtype=np.float64)
+        A[0, 0] = 0.0
+        A[0, dim] = 75.02750896
+        for i in range(1, dim):
+            A[0, i] = 0.0
+            A[0, i + dim] = 100.0
+
+        tree = PRTree(np.array([0], dtype=np.int64), A)
+
+        # Insert with float32 (small gap, may cause false positive)
+        B = np.zeros(2 * dim, dtype=np.float32)
+        B[0] = 75.02751435
+        B[dim] = 100.0
+        for i in range(1, dim):
+            B[i] = 0.0
+            B[i + dim] = 100.0
+
+        tree.insert(idx=1, bb=B)
+
+        # Query - item 1 won't have exact coordinates, so refinement won't apply to it
+        result = tree.query(B)
+        assert 1 in result, "Should find item 1 (self)"
+
+    @pytest.mark.parametrize("PRTree, dim", [(PRTree2D, 2), (PRTree3D, 3)])
+    def test_rebuild_preserves_idx2exact(self, PRTree, dim):
+        """Verify that rebuild() preserves idx2exact for precision."""
+        # Create tree with float64 to populate idx2exact
+        n = 10
+        idx = np.arange(n, dtype=np.int64)
+        rng = np.random.default_rng(0)  # fixed seed keeps failures reproducible
+        boxes = rng.random((n, 2 * dim)) * 100  # float64 by default
+        for i in range(dim):
+            boxes[:, i + dim] += boxes[:, i] + 1
+
+        tree = PRTree(idx, boxes)
+
+        # Insert many more items than the tree was built with; this is meant to
+        # push the tree past its rebuild threshold (not observed directly here)
+        for i in range(n, n + 100):
+            box = rng.random(2 * dim) * 100
+            for d in range(dim):
+                box[d + dim] += box[d] + 1
+            tree.insert(idx=i, bb=box)
+
+        # Create a small-gap query that should only work with float64 refinement
+        # Query box is to the right of boxes[0] with a small gap
+        query = np.zeros(2 * dim, dtype=np.float64)
+        query[0] = boxes[0, dim] + 1e-6  # Small gap after original box's max
+        query[dim] = boxes[0, dim] + 10.0  # Query max
+        for i in range(1, dim):
+            # Overlap in other dimensions
+            query[i] = boxes[0, i] - 10
+            query[i + dim] = boxes[0, i + dim] + 10
+
+        result = tree.query(query)
+        # Should not find item 0 if idx2exact is preserved and working
+        # The gap of 1e-6 should be detected with float64 precision
+        assert 0 not in result, "Should not find item 0 due to small gap (idx2exact should be preserved after rebuild)"
diff --git a/tests/unit/test_precision.py b/tests/unit/test_precision.py