Skip to content

Commit 0db80e8

Browse files
authored
fix: coerce nested regexp_match to boolean in filters (#5019)
When combining `regexp_match(col, ...)` with boolean predicates (e.g., `AND natural_caption IS NOT NULL AND natural_caption <> '' ...`), planning failed with: "Cannot infer common argument type for logical boolean operation List(Utf8) AND Boolean" Root cause: `regexp_match` returns a list, not a boolean. Our coercion only handled top-level `regexp_match`, not occurrences nested inside boolean expressions. Signed-off-by: BubbleCal <bubble-cal@outlook.com>
1 parent e3fa30c commit 0db80e8

File tree

3 files changed

+177
-10
lines changed

3 files changed

+177
-10
lines changed

rust/lance-datafusion/src/logical_expr.rs

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -198,17 +198,24 @@ pub fn coerce_expr(expr: &Expr, dtype: &DataType) -> Result<Expr> {
198198
///
199199
/// - *expr*: a datafusion logical expression
200200
pub fn coerce_filter_type_to_boolean(expr: Expr) -> Expr {
201-
match &expr {
202-
// TODO: consider making this dispatch more generic, i.e. fun.output_type -> coerce
203-
// instead of hardcoding coerce method for each function
204-
Expr::ScalarFunction(ScalarFunction { func, .. }) => {
205-
if func.name() == "regexp_match" {
206-
Expr::IsNotNull(Box::new(expr))
207-
} else {
208-
expr
209-
}
201+
match expr {
202+
// Coerce regexp_match to boolean by checking for non-null
203+
Expr::ScalarFunction(sf) if sf.func.name() == "regexp_match" => {
204+
Expr::IsNotNull(Box::new(Expr::ScalarFunction(sf)))
210205
}
211-
_ => expr,
206+
207+
// Recurse into boolean contexts so nested regexp_match terms are also coerced
208+
Expr::BinaryExpr(BinaryExpr { left, op, right }) => Expr::BinaryExpr(BinaryExpr {
209+
left: Box::new(coerce_filter_type_to_boolean(*left)),
210+
op,
211+
right: Box::new(coerce_filter_type_to_boolean(*right)),
212+
}),
213+
Expr::Not(inner) => Expr::Not(Box::new(coerce_filter_type_to_boolean(*inner))),
214+
Expr::IsNull(inner) => Expr::IsNull(Box::new(coerce_filter_type_to_boolean(*inner))),
215+
Expr::IsNotNull(inner) => Expr::IsNotNull(Box::new(coerce_filter_type_to_boolean(*inner))),
216+
217+
// Pass-through for all other nodes
218+
other => other,
212219
}
213220
}
214221

rust/lance-datafusion/src/planner.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1740,4 +1740,88 @@ mod tests {
17401740
let ctx_provider = LanceContextProvider::default();
17411741
assert!(!ctx_provider.get_expr_planners().is_empty());
17421742
}
1743+
1744+
#[test]
1745+
fn test_regexp_match_and_non_empty_captions() {
1746+
// Repro for a bug where regexp_match inside an AND chain wasn't coerced to boolean,
1747+
// causing planning/evaluation failures. This should evaluate successfully.
1748+
let schema = Arc::new(Schema::new(vec![
1749+
Field::new("keywords", DataType::Utf8, true),
1750+
Field::new("natural_caption", DataType::Utf8, true),
1751+
Field::new("poetic_caption", DataType::Utf8, true),
1752+
]));
1753+
1754+
let planner = Planner::new(schema.clone());
1755+
1756+
let expr = planner
1757+
.parse_filter(
1758+
"regexp_match(keywords, 'Liberty|revolution') AND \
1759+
(natural_caption IS NOT NULL AND natural_caption <> '' AND \
1760+
poetic_caption IS NOT NULL AND poetic_caption <> '')",
1761+
)
1762+
.unwrap();
1763+
1764+
let physical_expr = planner.create_physical_expr(&expr).unwrap();
1765+
1766+
let batch = RecordBatch::try_new(
1767+
schema,
1768+
vec![
1769+
Arc::new(StringArray::from(vec![
1770+
Some("Liberty for all"),
1771+
Some("peace"),
1772+
Some("revolution now"),
1773+
Some("Liberty"),
1774+
Some("revolutionary"),
1775+
Some("none"),
1776+
])) as ArrayRef,
1777+
Arc::new(StringArray::from(vec![
1778+
Some("a"),
1779+
Some("b"),
1780+
None,
1781+
Some(""),
1782+
Some("c"),
1783+
Some("d"),
1784+
])) as ArrayRef,
1785+
Arc::new(StringArray::from(vec![
1786+
Some("x"),
1787+
Some(""),
1788+
Some("y"),
1789+
Some("z"),
1790+
None,
1791+
Some("w"),
1792+
])) as ArrayRef,
1793+
],
1794+
)
1795+
.unwrap();
1796+
1797+
let result = physical_expr.evaluate(&batch).unwrap();
1798+
assert_eq!(
1799+
result.into_array(0).unwrap().as_ref(),
1800+
&BooleanArray::from(vec![true, false, false, false, false, false])
1801+
);
1802+
}
1803+
1804+
#[test]
1805+
fn test_regexp_match_infer_error_without_boolean_coercion() {
1806+
// With the fix applied, using parse_filter should coerce regexp_match to boolean
1807+
// even when nested in a larger AND expression, so this should plan successfully.
1808+
let schema = Arc::new(Schema::new(vec![
1809+
Field::new("keywords", DataType::Utf8, true),
1810+
Field::new("natural_caption", DataType::Utf8, true),
1811+
Field::new("poetic_caption", DataType::Utf8, true),
1812+
]));
1813+
1814+
let planner = Planner::new(schema);
1815+
1816+
let expr = planner
1817+
.parse_filter(
1818+
"regexp_match(keywords, 'Liberty|revolution') AND \
1819+
(natural_caption IS NOT NULL AND natural_caption <> '' AND \
1820+
poetic_caption IS NOT NULL AND poetic_caption <> '')",
1821+
)
1822+
.unwrap();
1823+
1824+
// Should not panic
1825+
let _physical = planner.create_physical_expr(&expr).unwrap();
1826+
}
17431827
}

rust/lance/src/dataset/scanner.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4127,6 +4127,82 @@ mod test {
41274127
Ok(())
41284128
}
41294129

4130+
#[tokio::test]
4131+
async fn test_scan_regexp_match_and_non_empty_captions() {
4132+
// Build a small dataset with three Utf8 columns and verify the full
4133+
// scan().filter(...) path handles regexp_match combined with non-null/non-empty checks.
4134+
let schema = Arc::new(ArrowSchema::new(vec![
4135+
ArrowField::new("keywords", DataType::Utf8, true),
4136+
ArrowField::new("natural_caption", DataType::Utf8, true),
4137+
ArrowField::new("poetic_caption", DataType::Utf8, true),
4138+
]));
4139+
4140+
let batch = RecordBatch::try_new(
4141+
schema.clone(),
4142+
vec![
4143+
Arc::new(StringArray::from(vec![
4144+
Some("Liberty for all"),
4145+
Some("peace"),
4146+
Some("revolution now"),
4147+
Some("Liberty"),
4148+
Some("revolutionary"),
4149+
Some("none"),
4150+
])) as ArrayRef,
4151+
Arc::new(StringArray::from(vec![
4152+
Some("a"),
4153+
Some("b"),
4154+
None,
4155+
Some(""),
4156+
Some("c"),
4157+
Some("d"),
4158+
])) as ArrayRef,
4159+
Arc::new(StringArray::from(vec![
4160+
Some("x"),
4161+
Some(""),
4162+
Some("y"),
4163+
Some("z"),
4164+
None,
4165+
Some("w"),
4166+
])) as ArrayRef,
4167+
],
4168+
)
4169+
.unwrap();
4170+
4171+
let reader = RecordBatchIterator::new(vec![Ok(batch.clone())], schema.clone());
4172+
let dataset = Dataset::write(reader, "memory://", None).await.unwrap();
4173+
4174+
let mut scan = dataset.scan();
4175+
scan.filter(
4176+
"regexp_match(keywords, 'Liberty|revolution') AND \
4177+
(natural_caption IS NOT NULL AND natural_caption <> '' AND \
4178+
poetic_caption IS NOT NULL AND poetic_caption <> '')",
4179+
)
4180+
.unwrap();
4181+
4182+
let out = scan.try_into_batch().await.unwrap();
4183+
assert_eq!(out.num_rows(), 1);
4184+
4185+
let out_keywords = out
4186+
.column_by_name("keywords")
4187+
.unwrap()
4188+
.as_string::<i32>()
4189+
.value(0);
4190+
let out_nat = out
4191+
.column_by_name("natural_caption")
4192+
.unwrap()
4193+
.as_string::<i32>()
4194+
.value(0);
4195+
let out_poetic = out
4196+
.column_by_name("poetic_caption")
4197+
.unwrap()
4198+
.as_string::<i32>()
4199+
.value(0);
4200+
4201+
assert_eq!(out_keywords, "Liberty for all");
4202+
assert_eq!(out_nat, "a");
4203+
assert_eq!(out_poetic, "x");
4204+
}
4205+
41304206
#[tokio::test]
41314207
async fn test_nested_projection() {
41324208
let point_fields: Fields = vec![

0 commit comments

Comments
 (0)