Skip to content

Commit 8ad8c0a

Browse files
committed
refactor to support str and bin
1 parent f027e5f commit 8ad8c0a

20 files changed

Lines changed: 162 additions & 61 deletions

File tree

datafusion-examples/examples/dataframe_in_memory.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ async fn main() -> Result<()> {
3838
let batch = RecordBatch::try_new(
3939
schema.clone(),
4040
vec![
41-
Arc::new(StringArray::from(vec!["a", "b", "c", "d"])),
41+
Arc::new(StringArray::from_slice(&["a", "b", "c", "d"])),
4242
Arc::new(Int32Array::from_slice(&[1, 10, 10, 100])),
4343
],
4444
)?;

datafusion/src/execution/context.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3885,8 +3885,8 @@ mod tests {
38853885
Arc::new(Float64Array::from_slice(&[1.0])),
38863886
Arc::new(StringArray::from(vec![Some("foo")])),
38873887
Arc::new(LargeStringArray::from(vec![Some("bar")])),
3888-
Arc::new(BinaryArray::from(vec![b"foo" as &[u8]])),
3889-
Arc::new(LargeBinaryArray::from(vec![b"foo" as &[u8]])),
3888+
Arc::new(BinaryArray::from_slice(&[b"foo" as &[u8]])),
3889+
Arc::new(LargeBinaryArray::from_slice(&[b"foo" as &[u8]])),
38903890
Arc::new(TimestampNanosecondArray::from_opt_vec(
38913891
vec![Some(123)],
38923892
None,
@@ -4150,7 +4150,7 @@ mod tests {
41504150

41514151
// create mock record batch
41524152
let ids = Arc::new(Int32Array::from_slice(&[i as i32]));
4153-
let names = Arc::new(StringArray::from(vec!["test"]));
4153+
let names = Arc::new(StringArray::from_slice(&["test"]));
41544154
let rec_batch =
41554155
RecordBatch::try_new(schema.clone(), vec![ids, names]).unwrap();
41564156

datafusion/src/from_slice.rs

Lines changed: 103 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,27 +19,119 @@
1919
//!
2020
//! This file essentially exists to ease the transition onto arrow2
2121
22-
use arrow::array::{ArrayData, PrimitiveArray};
23-
use arrow::buffer::Buffer;
24-
use arrow::datatypes::ArrowPrimitiveType;
22+
use arrow::array::{
23+
ArrayData, BinaryOffsetSizeTrait, BooleanArray, GenericBinaryArray,
24+
GenericStringArray, PrimitiveArray, StringOffsetSizeTrait,
25+
};
26+
use arrow::buffer::{Buffer, MutableBuffer};
27+
use arrow::datatypes::{ArrowPrimitiveType, DataType};
28+
use arrow::util::bit_util;
2529

2630
/// A trait to define `from_slice` functions for arrow array types (primitive, binary, utf8 and boolean)
27-
pub trait FromSlice<T>
31+
pub trait FromSlice<S, E>
2832
where
29-
T: ArrowPrimitiveType,
33+
S: AsRef<[E]>,
3034
{
3135
/// Convert a slice of values into an array of the implementing type (without nulls)
32-
fn from_slice(slice: &[T::Native]) -> PrimitiveArray<T>;
36+
fn from_slice(slice: S) -> Self;
3337
}
3438

35-
/// default implementation for primitive types
36-
// #[cfg(test)]
37-
impl<T: ArrowPrimitiveType> FromSlice<T> for PrimitiveArray<T> {
38-
fn from_slice(slice: &[T::Native]) -> PrimitiveArray<T> {
39+
/// default implementation for primitive array types, adapted from `From<Vec<_>>`
40+
impl<S, T> FromSlice<S, T::Native> for PrimitiveArray<T>
41+
where
42+
T: ArrowPrimitiveType,
43+
S: AsRef<[T::Native]>,
44+
{
45+
fn from_slice(slice: S) -> Self {
46+
let slice = slice.as_ref();
3947
let array_data = ArrayData::builder(T::DATA_TYPE)
4048
.len(slice.len())
4149
.add_buffer(Buffer::from_slice_ref(&slice));
4250
let array_data = unsafe { array_data.build_unchecked() };
43-
PrimitiveArray::<T>::from(array_data)
51+
Self::from(array_data)
52+
}
53+
}
54+
55+
/// default implementation for binary array types, adapted from `From<Vec<_>>`
56+
impl<S, I, OffsetSize> FromSlice<S, I> for GenericBinaryArray<OffsetSize>
57+
where
58+
OffsetSize: BinaryOffsetSizeTrait,
59+
S: AsRef<[I]>,
60+
I: AsRef<[u8]>,
61+
{
62+
fn from_slice(slice: S) -> Self {
63+
let slice = slice.as_ref();
64+
let mut offsets = Vec::with_capacity(slice.len() + 1);
65+
let mut values = Vec::new();
66+
let mut length_so_far: OffsetSize = OffsetSize::zero();
67+
offsets.push(length_so_far);
68+
for s in slice {
69+
let s = s.as_ref();
70+
length_so_far += OffsetSize::from_usize(s.len()).unwrap();
71+
offsets.push(length_so_far);
72+
values.extend_from_slice(s);
73+
}
74+
let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
75+
.len(slice.len())
76+
.add_buffer(Buffer::from_slice_ref(&offsets))
77+
.add_buffer(Buffer::from_slice_ref(&values));
78+
let array_data = unsafe { array_data.build_unchecked() };
79+
Self::from(array_data)
80+
}
81+
}
82+
83+
/// default implementation for utf8 array types, adapted from `From<Vec<_>>`
84+
impl<S, I, OffsetSize> FromSlice<S, I> for GenericStringArray<OffsetSize>
85+
where
86+
OffsetSize: StringOffsetSizeTrait,
87+
S: AsRef<[I]>,
88+
I: AsRef<str>,
89+
{
90+
fn from_slice(slice: S) -> Self {
91+
let slice = slice.as_ref();
92+
let mut offsets =
93+
MutableBuffer::new((slice.len() + 1) * std::mem::size_of::<OffsetSize>());
94+
let mut values = MutableBuffer::new(0);
95+
96+
let mut length_so_far = OffsetSize::zero();
97+
offsets.push(length_so_far);
98+
99+
for s in slice {
100+
let s = s.as_ref();
101+
length_so_far += OffsetSize::from_usize(s.len()).unwrap();
102+
offsets.push(length_so_far);
103+
values.extend_from_slice(s.as_bytes());
104+
}
105+
let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
106+
.len(slice.len())
107+
.add_buffer(offsets.into())
108+
.add_buffer(values.into());
109+
let array_data = unsafe { array_data.build_unchecked() };
110+
Self::from(array_data)
111+
}
112+
}
113+
114+
/// default implementation for boolean array type, adapted from `From<Vec<bool>>`
115+
impl<S> FromSlice<S, bool> for BooleanArray
116+
where
117+
S: AsRef<[bool]>,
118+
{
119+
fn from_slice(slice: S) -> Self {
120+
let slice = slice.as_ref();
121+
let mut mut_buf = MutableBuffer::new_null(slice.len());
122+
{
123+
let mut_slice = mut_buf.as_slice_mut();
124+
for (i, b) in slice.iter().enumerate() {
125+
if *b {
126+
bit_util::set_bit(mut_slice, i);
127+
}
128+
}
129+
}
130+
let array_data = ArrayData::builder(DataType::Boolean)
131+
.len(slice.len())
132+
.add_buffer(mut_buf.into());
133+
134+
let array_data = unsafe { array_data.build_unchecked() };
135+
Self::from(array_data)
44136
}
45137
}

datafusion/src/physical_optimizer/pruning.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -706,15 +706,15 @@ enum StatisticsType {
706706

707707
#[cfg(test)]
708708
mod tests {
709-
use std::collections::HashMap;
710-
711709
use super::*;
710+
use crate::from_slice::FromSlice;
712711
use crate::logical_plan::{col, lit};
713712
use crate::{assert_batches_eq, physical_optimizer::pruning::StatisticsType};
714713
use arrow::{
715714
array::{BinaryArray, Int32Array, Int64Array, StringArray},
716715
datatypes::{DataType, TimeUnit},
717716
};
717+
use std::collections::HashMap;
718718

719719
#[derive(Debug)]
720720
/// Test for container stats
@@ -972,7 +972,7 @@ mod tests {
972972

973973
// Note the statistics return binary (which can't be cast to string)
974974
let statistics = OneContainerStats {
975-
min_values: Some(Arc::new(BinaryArray::from(vec![&[255u8] as &[u8]]))),
975+
min_values: Some(Arc::new(BinaryArray::from_slice(&[&[255u8] as &[u8]]))),
976976
max_values: None,
977977
num_containers: 1,
978978
};

datafusion/src/physical_plan/distinct_expressions.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -513,11 +513,12 @@ mod tests {
513513

514514
let zero_count_values = BooleanArray::from(Vec::<bool>::new());
515515

516-
let one_count_values = BooleanArray::from(vec![false, false]);
516+
let one_count_values = BooleanArray::from_slice(&[false, false]);
517517
let one_count_values_with_null =
518518
BooleanArray::from(vec![Some(true), Some(true), None, None]);
519519

520-
let two_count_values = BooleanArray::from(vec![true, false, true, false, true]);
520+
let two_count_values =
521+
BooleanArray::from_slice(&[true, false, true, false, true]);
521522
let two_count_values_with_null = BooleanArray::from(vec![
522523
Some(true),
523524
Some(false),

datafusion/src/physical_plan/expressions/cast.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ pub fn cast(
159159
mod tests {
160160
use super::*;
161161
use crate::error::Result;
162+
use crate::from_slice::FromSlice;
162163
use crate::physical_plan::expressions::col;
163164
use arrow::{
164165
array::{
@@ -458,7 +459,7 @@ mod tests {
458459
fn invalid_cast_with_options_error() -> Result<()> {
459460
// Ensure a useful error happens at plan time if invalid casts are used
460461
let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
461-
let a = StringArray::from(vec!["9.1"]);
462+
let a = StringArray::from_slice(&["9.1"]);
462463
let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
463464
let expression = cast_with_options(
464465
col("a", &schema)?,

datafusion/src/physical_plan/expressions/count.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ mod tests {
209209
#[test]
210210
fn count_utf8() -> Result<()> {
211211
let a: ArrayRef =
212-
Arc::new(StringArray::from(vec!["a", "bb", "ccc", "dddd", "ad"]));
212+
Arc::new(StringArray::from_slice(&["a", "bb", "ccc", "dddd", "ad"]));
213213
generic_test_op!(
214214
a,
215215
DataType::Utf8,
@@ -221,8 +221,9 @@ mod tests {
221221

222222
#[test]
223223
fn count_large_utf8() -> Result<()> {
224-
let a: ArrayRef =
225-
Arc::new(LargeStringArray::from(vec!["a", "bb", "ccc", "dddd", "ad"]));
224+
let a: ArrayRef = Arc::new(LargeStringArray::from_slice(&[
225+
"a", "bb", "ccc", "dddd", "ad",
226+
]));
226227
generic_test_op!(
227228
a,
228229
DataType::LargeUtf8,

datafusion/src/physical_plan/expressions/is_not_null.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ pub fn is_not_null(arg: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>>
8888
#[cfg(test)]
8989
mod tests {
9090
use super::*;
91+
use crate::from_slice::FromSlice;
9192
use crate::physical_plan::expressions::col;
9293
use arrow::{
9394
array::{BooleanArray, StringArray},
@@ -110,7 +111,7 @@ mod tests {
110111
.downcast_ref::<BooleanArray>()
111112
.expect("failed to downcast to BooleanArray");
112113

113-
let expected = &BooleanArray::from(vec![true, false]);
114+
let expected = &BooleanArray::from_slice(&[true, false]);
114115

115116
assert_eq!(expected, result);
116117

datafusion/src/physical_plan/expressions/is_null.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ pub fn is_null(arg: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
8888
#[cfg(test)]
8989
mod tests {
9090
use super::*;
91+
use crate::from_slice::FromSlice;
9192
use crate::physical_plan::expressions::col;
9293
use arrow::{
9394
array::{BooleanArray, StringArray},
@@ -111,7 +112,7 @@ mod tests {
111112
.downcast_ref::<BooleanArray>()
112113
.expect("failed to downcast to BooleanArray");
113114

114-
let expected = &BooleanArray::from(vec![false, true]);
115+
let expected = &BooleanArray::from_slice(&[false, true]);
115116

116117
assert_eq!(expected, result);
117118

datafusion/src/physical_plan/expressions/min_max.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -775,7 +775,7 @@ mod tests {
775775

776776
#[test]
777777
fn max_utf8() -> Result<()> {
778-
let a: ArrayRef = Arc::new(StringArray::from(vec!["d", "a", "c", "b"]));
778+
let a: ArrayRef = Arc::new(StringArray::from_slice(&["d", "a", "c", "b"]));
779779
generic_test_op!(
780780
a,
781781
DataType::Utf8,
@@ -787,7 +787,7 @@ mod tests {
787787

788788
#[test]
789789
fn max_large_utf8() -> Result<()> {
790-
let a: ArrayRef = Arc::new(LargeStringArray::from(vec!["d", "a", "c", "b"]));
790+
let a: ArrayRef = Arc::new(LargeStringArray::from_slice(&["d", "a", "c", "b"]));
791791
generic_test_op!(
792792
a,
793793
DataType::LargeUtf8,
@@ -799,7 +799,7 @@ mod tests {
799799

800800
#[test]
801801
fn min_utf8() -> Result<()> {
802-
let a: ArrayRef = Arc::new(StringArray::from(vec!["d", "a", "c", "b"]));
802+
let a: ArrayRef = Arc::new(StringArray::from_slice(&["d", "a", "c", "b"]));
803803
generic_test_op!(
804804
a,
805805
DataType::Utf8,
@@ -811,7 +811,7 @@ mod tests {
811811

812812
#[test]
813813
fn min_large_utf8() -> Result<()> {
814-
let a: ArrayRef = Arc::new(LargeStringArray::from(vec!["d", "a", "c", "b"]));
814+
let a: ArrayRef = Arc::new(LargeStringArray::from_slice(&["d", "a", "c", "b"]));
815815
generic_test_op!(
816816
a,
817817
DataType::LargeUtf8,

0 commit comments

Comments
 (0)