|
17 | 17 |
|
18 | 18 | //! Common utilities for implementing string functions |
19 | 19 |
|
20 | | -use std::collections::HashMap; |
21 | 20 | use std::fmt::{Display, Formatter}; |
22 | 21 | use std::sync::Arc; |
23 | 22 |
|
24 | 23 | use arrow::array::{ |
25 | 24 | new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, |
26 | | - BooleanArray, GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, |
| 25 | + GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, |
27 | 26 | StringBuilder, StringViewArray, |
28 | 27 | }; |
29 | 28 | use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; |
30 | 29 | use arrow::datatypes::DataType; |
31 | | -use arrow_buffer::BooleanBufferBuilder; |
32 | 30 | use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; |
| 31 | +use datafusion_common::Result; |
33 | 32 | use datafusion_common::{exec_err, ScalarValue}; |
34 | | -use datafusion_common::{DataFusionError, Result}; |
35 | 33 | use datafusion_expr::ColumnarValue; |
36 | | -use regex::Regex; |
37 | 34 |
|
38 | 35 | pub(crate) enum TrimType { |
39 | 36 | Left, |
@@ -481,96 +478,3 @@ where |
481 | 478 | GenericStringArray::<O>::new_unchecked(offsets, values, nulls) |
482 | 479 | })) |
483 | 480 | } |
484 | | - |
485 | | -#[cfg(doc)] |
486 | | -use arrow::array::LargeStringArray; |
487 | | -/// Perform SQL `array ~ regex_array` operation on |
488 | | -/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`]. |
489 | | -/// |
490 | | -/// If `regex_array` element has an empty value, the corresponding result value is always true. |
491 | | -/// |
492 | | -/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag, |
493 | | -/// which allow special search modes, such as case-insensitive and multi-line mode. |
494 | | -/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) |
495 | | -/// for more information. |
496 | | -/// |
497 | | -/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs]. |
498 | | -/// |
499 | | -/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37 |
500 | | -pub fn regexp_is_match<'a, S1, S2, S3>( |
501 | | - array: &'a S1, |
502 | | - regex_array: &'a S2, |
503 | | - flags_array: Option<&'a S3>, |
504 | | -) -> Result<BooleanArray, DataFusionError> |
505 | | -where |
506 | | - &'a S1: StringArrayType<'a>, |
507 | | - &'a S2: StringArrayType<'a>, |
508 | | - &'a S3: StringArrayType<'a>, |
509 | | -{ |
510 | | - if array.len() != regex_array.len() { |
511 | | - return Err(DataFusionError::Execution( |
512 | | - "Cannot perform comparison operation on arrays of different length" |
513 | | - .to_string(), |
514 | | - )); |
515 | | - } |
516 | | - |
517 | | - let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); |
518 | | - |
519 | | - let mut patterns: HashMap<String, Regex> = HashMap::new(); |
520 | | - let mut result = BooleanBufferBuilder::new(array.len()); |
521 | | - |
522 | | - let complete_pattern = match flags_array { |
523 | | - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( |
524 | | - |(pattern, flags)| { |
525 | | - pattern.map(|pattern| match flags { |
526 | | - Some(flag) => format!("(?{flag}){pattern}"), |
527 | | - None => pattern.to_string(), |
528 | | - }) |
529 | | - }, |
530 | | - )) as Box<dyn Iterator<Item = Option<String>>>, |
531 | | - None => Box::new( |
532 | | - regex_array |
533 | | - .iter() |
534 | | - .map(|pattern| pattern.map(|pattern| pattern.to_string())), |
535 | | - ), |
536 | | - }; |
537 | | - |
538 | | - array |
539 | | - .iter() |
540 | | - .zip(complete_pattern) |
541 | | - .map(|(value, pattern)| { |
542 | | - match (value, pattern) { |
543 | | - (Some(_), Some(pattern)) if pattern == *"" => { |
544 | | - result.append(true); |
545 | | - } |
546 | | - (Some(value), Some(pattern)) => { |
547 | | - let existing_pattern = patterns.get(&pattern); |
548 | | - let re = match existing_pattern { |
549 | | - Some(re) => re, |
550 | | - None => { |
551 | | - let re = Regex::new(pattern.as_str()).map_err(|e| { |
552 | | - DataFusionError::Execution(format!( |
553 | | - "Regular expression did not compile: {e:?}" |
554 | | - )) |
555 | | - })?; |
556 | | - patterns.entry(pattern).or_insert(re) |
557 | | - } |
558 | | - }; |
559 | | - result.append(re.is_match(value)); |
560 | | - } |
561 | | - _ => result.append(false), |
562 | | - } |
563 | | - Ok(()) |
564 | | - }) |
565 | | - .collect::<Result<Vec<()>, DataFusionError>>()?; |
566 | | - |
567 | | - let data = unsafe { |
568 | | - ArrayDataBuilder::new(DataType::Boolean) |
569 | | - .len(array.len()) |
570 | | - .buffers(vec![result.into()]) |
571 | | - .nulls(nulls) |
572 | | - .build_unchecked() |
573 | | - }; |
574 | | - |
575 | | - Ok(BooleanArray::from(data)) |
576 | | -} |
0 commit comments