Commit 73f9908
feat(python): support index for nested field (#5027)
Fixes #5026. Also fixes vector-index failures on nested fields at the Rust level; the previous fix did not fully work.
1 parent add6fa6 commit 73f9908
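
As a quick illustration of the new behavior (a minimal sketch drawn from the tests added below; the dataset path and the meta/lang field names are illustrative), a scalar index can now be created on a struct sub-field addressed by dot path:

import pyarrow as pa
import lance

# Struct column with a nested string field (names are illustrative).
schema = pa.schema(
    [
        pa.field("id", pa.int64()),
        pa.field("meta", pa.struct([pa.field("lang", pa.string())])),
    ]
)
data = pa.table(
    {"id": [1, 2], "meta": [{"lang": "en"}, {"lang": "fr"}]}, schema=schema
)
dataset = lance.write_dataset(data, "/tmp/nested_index_demo")

# The dot path is now resolved against the Lance schema, so indexing a
# nested field no longer raises KeyError.
dataset.create_scalar_index("meta.lang", index_type="BTREE")
print(dataset.scanner(filter="meta.lang = 'en'").explain_plan())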

File tree

10 files changed: +739 -41 lines changed
python/python/lance/dataset.py

Lines changed: 9 additions & 6 deletions
@@ -2341,7 +2341,8 @@ def create_scalar_index(
                 )

             column = column[0]
-        if column not in self.schema.names:
+        lance_field = self._ds.lance_schema.field(column)
+        if lance_field is None:
             raise KeyError(f"{column} not found in schema")

         # TODO: Add documentation of IndexConfig approach for creating
@@ -2365,7 +2366,7 @@ def create_scalar_index(
                 )
             )

-        field = self.schema.field(column)
+        field = lance_field.to_arrow()

         field_type = field.type
         if hasattr(field_type, "storage_type"):
@@ -2618,9 +2619,10 @@ def create_index(

         # validate args
         for c in column:
-            if c not in self.schema.names:
+            lance_field = self._ds.lance_schema.field(c)
+            if lance_field is None:
                 raise KeyError(f"{c} not found in schema")
-            field = self.schema.field(c)
+            field = lance_field.to_arrow()
             is_multivec = False
             if pa.types.is_fixed_size_list(field.type):
                 dimension = field.type.list_size
@@ -4347,10 +4349,11 @@ def nearest(
     ) -> ScannerBuilder:
         q, q_dim = _coerce_query_vector(q)

-        if self.ds.schema.get_field_index(column) < 0:
+        lance_field = self.ds._ds.lance_schema.field(column)
+        if lance_field is None:
             raise ValueError(f"Embedding column {column} is not in the dataset")

-        column_field = self.ds.schema.field(column)
+        column_field = lance_field.to_arrow()
         column_type = column_field.type
         if hasattr(column_type, "storage_type"):
             column_type = column_type.storage_type
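
The create_index and nearest changes above mean nested embedding columns also pass validation. A hypothetical sketch of what that enables (the emb.vector column name, the IVF_PQ parameters, and the dataset path are assumptions for illustration, not code from this commit):

import numpy as np
import pyarrow as pa
import lance

dim = 16
n = 1024  # enough rows for PQ training (assumption)
vectors = pa.FixedSizeListArray.from_arrays(
    pa.array(np.random.rand(n * dim).astype(np.float32)), dim
)
data = pa.table(
    {
        "id": range(n),
        "emb": pa.StructArray.from_arrays([vectors], names=["vector"]),
    }
)
ds = lance.write_dataset(data, "/tmp/nested_vector_demo")

# create_index() and nearest() now resolve "emb.vector" via the Lance schema,
# so the nested vector column validates and can be indexed and searched.
ds.create_index(
    "emb.vector", index_type="IVF_PQ", num_partitions=1, num_sub_vectors=2
)
hits = ds.to_table(
    nearest={
        "column": "emb.vector",
        "q": np.random.rand(dim).astype(np.float32),
        "k": 5,
    }
)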

python/python/tests/test_scalar_index.py

Lines changed: 255 additions & 0 deletions
@@ -3690,3 +3690,258 @@ def scan_stats_callback(stats: lance.ScanStatistics):
     for key, value in scan_stats.all_counts.items():
         assert isinstance(key, str)
         assert isinstance(value, int)
+
+
+def test_nested_field_btree_index(tmp_path):
+    """Test BTREE index creation and querying on nested fields"""
+    # Create a dataset with nested structure
+    schema = pa.schema(
+        [
+            pa.field("id", pa.int64()),
+            pa.field(
+                "meta",
+                pa.struct(
+                    [pa.field("lang", pa.string()), pa.field("version", pa.int32())]
+                ),
+            ),
+        ]
+    )
+
+    data = pa.table(
+        {
+            "id": [1, 2, 3, 4, 5],
+            "meta": [
+                {"lang": "en", "version": 1},
+                {"lang": "fr", "version": 2},
+                {"lang": "en", "version": 1},
+                {"lang": "es", "version": 3},
+                {"lang": "fr", "version": 2},
+            ],
+        },
+        schema=schema,
+    )
+
+    # Create dataset
+    uri = tmp_path / "test_nested_btree"
+    dataset = lance.write_dataset(data, uri)
+
+    # Create BTREE index on nested string column
+    dataset.create_scalar_index(column="meta.lang", index_type="BTREE")
+
+    # Verify index was created
+    indices = dataset.list_indices()
+    assert len(indices) == 1
+    assert indices[0]["fields"] == ["meta.lang"]
+    assert indices[0]["type"] == "BTree"
+
+    # Test query using the index - filter for English language
+    result = dataset.scanner(filter="meta.lang = 'en'").to_table()
+    assert len(result) == 2
+    for i in range(len(result)):
+        assert result["meta"][i]["lang"].as_py() == "en"
+
+    # Test query for French language
+    result = dataset.scanner(filter="meta.lang = 'fr'").to_table()
+    assert len(result) == 2
+    for i in range(len(result)):
+        assert result["meta"][i]["lang"].as_py() == "fr"
+
+    # Verify the index is being used
+    plan = dataset.scanner(filter="meta.lang = 'en'").explain_plan()
+    assert "ScalarIndexQuery" in plan
+
+    # Write additional data to the dataset
+    new_data = pa.table(
+        {
+            "id": [6, 7, 8],
+            "meta": [
+                {"lang": "de", "version": 4},
+                {"lang": "en", "version": 2},
+                {"lang": "de", "version": 4},
+            ],
+        },
+        schema=schema,
+    )
+
+    dataset = lance.write_dataset(new_data, uri, mode="append")
+
+    # Verify query still works after appending data
+    result = dataset.scanner(filter="meta.lang = 'en'").to_table()
+    assert len(result) == 3, f"Expected 3 English records, got {len(result)}"
+    for i in range(len(result)):
+        assert result["meta"][i]["lang"].as_py() == "en"
+
+    # Test query for new German language entries
+    result = dataset.scanner(filter="meta.lang = 'de'").to_table()
+    assert len(result) == 2
+    for i in range(len(result)):
+        assert result["meta"][i]["lang"].as_py() == "de"
+
+    # Test optimize_indices with nested field BTREE index
+    dataset.optimize.optimize_indices()
+
+    # Verify query still works after optimization
+    result = dataset.scanner(filter="meta.lang = 'en'").to_table()
+    assert len(result) == 3
+    result = dataset.scanner(filter="meta.lang = 'de'").to_table()
+    assert len(result) == 2
+
+    # Create BTREE index on nested integer column
+    dataset.create_scalar_index(column="meta.version", index_type="BTREE", replace=True)
+
+    # Test query using the version index
+    result = dataset.scanner(filter="meta.version = 1").to_table()
+    assert len(result) == 2
+    for i in range(len(result)):
+        assert result["meta"][i]["version"].as_py() == 1
+
+    # Test query for version 4 (new data)
+    result = dataset.scanner(filter="meta.version = 4").to_table()
+    assert len(result) == 2
+    for i in range(len(result)):
+        assert result["meta"][i]["version"].as_py() == 4
+
+    # Verify total row count
+    total = dataset.count_rows()
+    assert total == 8, f"Expected 8 total rows, got {total}"
+
+
+def test_nested_field_fts_index(tmp_path):
+    """Test FTS index creation and querying on nested fields"""
+    # Create dataset with nested text field
+    data = pa.table(
+        {
+            "id": range(100),
+            "data": pa.StructArray.from_arrays(
+                [
+                    pa.array(
+                        [f"document {i} about lance database" for i in range(100)]
+                    ),
+                    pa.array([f"label_{i}" for i in range(100)]),
+                ],
+                names=["text", "label"],
+            ),
+        }
+    )
+
+    ds = lance.write_dataset(data, tmp_path)
+
+    # Create FTS index on nested field
+    ds.create_scalar_index("data.text", index_type="INVERTED", with_position=False)
+
+    # Verify index was created
+    indices = ds.list_indices()
+    assert len(indices) == 1
+    assert indices[0]["fields"] == ["data.text"]
+    assert indices[0]["type"] == "Inverted"
+
+    # Test full text search on nested field
+    results = ds.to_table(full_text_query="lance")
+    assert results.num_rows == 100
+
+    # Verify the results contain the expected text
+    for i in range(results.num_rows):
+        text = results["data"][i]["text"].as_py()
+        assert "lance" in text
+
+    # Test with prefilter using another nested field
+    results = ds.to_table(
+        full_text_query="database",
+        filter="data.label = 'label_5'",
+        prefilter=True,
+    )
+    assert results.num_rows == 1
+    assert results["id"][0].as_py() == 5
+
+    # Test optimize_indices with nested field FTS index
+    # Append more data
+    new_data = pa.table(
+        {
+            "id": range(100, 150),
+            "data": pa.StructArray.from_arrays(
+                [
+                    pa.array(
+                        [f"document {i} about lance search" for i in range(100, 150)]
+                    ),
+                    pa.array([f"label_{i}" for i in range(100, 150)]),
+                ],
+                names=["text", "label"],
+            ),
+        }
+    )
+    ds = lance.write_dataset(new_data, tmp_path, mode="append")
+
+    # Optimize indices
+    ds.optimize.optimize_indices()
+
+    # Verify search still works after optimization
+    results = ds.to_table(full_text_query="lance")
+    assert results.num_rows == 150
+
+    results = ds.to_table(full_text_query="search")
+    assert results.num_rows == 50
+
+
+def test_nested_field_bitmap_index(tmp_path):
+    """Test BITMAP index creation and querying on nested fields"""
+    # Create dataset with nested categorical field
+    data = pa.table(
+        {
+            "id": range(100),
+            "attributes": pa.StructArray.from_arrays(
+                [
+                    pa.array(["red", "green", "blue"][i % 3] for i in range(100)),
+                    pa.array([f"size_{i % 5}" for i in range(100)]),
+                ],
+                names=["color", "size"],
+            ),
+        }
+    )
+
+    ds = lance.write_dataset(data, tmp_path)
+
+    # Create BITMAP index on nested field
+    ds.create_scalar_index("attributes.color", index_type="BITMAP")
+
+    # Verify index was created
+    indices = ds.list_indices()
+    assert len(indices) == 1
+    assert indices[0]["fields"] == ["attributes.color"]
+    assert indices[0]["type"] == "Bitmap"
+
+    # Test equality query
+    results = ds.to_table(filter="attributes.color = 'red'", prefilter=True)
+    assert results.num_rows == 34  # 0, 3, 6, 9, ... 99 (34 values)
+
+    # Verify the index is being used
+    plan = ds.scanner(filter="attributes.color = 'red'", prefilter=True).explain_plan()
+    assert "ScalarIndexQuery" in plan
+
+    # Test with different color
+    results = ds.to_table(filter="attributes.color = 'green'", prefilter=True)
+    assert results.num_rows == 33  # 1, 4, 7, 10, ... 97 (33 values)
+
+    results = ds.to_table(filter="attributes.color = 'blue'", prefilter=True)
+    assert results.num_rows == 33  # 2, 5, 8, 11, ... 98 (33 values)
+
+    # Test optimize_indices with nested field BITMAP index
+    new_data = pa.table(
+        {
+            "id": range(100, 150),
+            "attributes": pa.StructArray.from_arrays(
+                [
+                    pa.array(["red", "green", "blue"][i % 3] for i in range(50)),
+                    pa.array([f"size_{i % 5}" for i in range(50)]),
+                ],
+                names=["color", "size"],
+            ),
+        }
+    )
+    ds = lance.write_dataset(new_data, tmp_path, mode="append")
+
+    # Optimize indices
+    ds.optimize.optimize_indices()
+
+    # Verify query still works after optimization
+    results = ds.to_table(filter="attributes.color = 'red'", prefilter=True)
+    assert results.num_rows == 51  # 34 + 17 from new data
