@@ -5969,6 +5969,188 @@ true false true false false false true true false false true false true
59695969#----
59705970#true false true false false false true true false false true false true
59715971
5972+ # rewrite various array_has operations to InList where the haystack is a literal list
5973+ # NB: `col in (a, b, c)` is simplified to OR if there are <= 3 elements, so these tests use 4-element haystack lists to keep the InList form
5974+
5975+ query I
5976+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
5977+ select count(*) from test WHERE needle IN ('7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c');
5978+ ----
5979+ 1
5980+
5981+ query TT
5982+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
5983+ select count(*) from test WHERE needle IN ('7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c');
5984+ ----
5985+ logical_plan
5986+ 01)Projection: count(Int64(1)) AS count(*)
5987+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
5988+ 03)----SubqueryAlias: test
5989+ 04)------SubqueryAlias: t
5990+ 05)--------Projection:
5991+ 06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
5992+ 07)------------TableScan: tmp_table projection=[value]
5993+ physical_plan
5994+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
5995+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
5996+ 03)----CoalescePartitionsExec
5997+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
5998+ 05)--------ProjectionExec: expr=[]
5999+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6000+ 07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
6001+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6002+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6003+
6004+ query I
6005+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6006+ select count(*) from test WHERE needle = ANY(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c']);
6007+ ----
6008+ 1
6009+
6010+ query TT
6011+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6012+ select count(*) from test WHERE needle = ANY(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c']);
6013+ ----
6014+ logical_plan
6015+ 01)Projection: count(Int64(1)) AS count(*)
6016+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6017+ 03)----SubqueryAlias: test
6018+ 04)------SubqueryAlias: t
6019+ 05)--------Projection:
6020+ 06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
6021+ 07)------------TableScan: tmp_table projection=[value]
6022+ physical_plan
6023+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6024+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6025+ 03)----CoalescePartitionsExec
6026+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6027+ 05)--------ProjectionExec: expr=[]
6028+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6029+ 07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
6030+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6031+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6032+
6033+ query I
6034+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6035+ select count(*) from test WHERE array_has(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], needle);
6036+ ----
6037+ 1
6038+
6039+ query TT
6040+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6041+ select count(*) from test WHERE array_has(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], needle);
6042+ ----
6043+ logical_plan
6044+ 01)Projection: count(Int64(1)) AS count(*)
6045+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6046+ 03)----SubqueryAlias: test
6047+ 04)------SubqueryAlias: t
6048+ 05)--------Projection:
6049+ 06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
6050+ 07)------------TableScan: tmp_table projection=[value]
6051+ physical_plan
6052+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6053+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6054+ 03)----CoalescePartitionsExec
6055+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6056+ 05)--------ProjectionExec: expr=[]
6057+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6058+ 07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
6059+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6060+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6061+
6062+ # FIXME: this query is disabled because it is _extremely_ slow to evaluate: the rewrite to InList checked by the explain test below does not work for a LargeList haystack
6063+ # query I
6064+ # with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6065+ # select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle);
6066+ # ----
6067+ # 1
6068+
6069+ # FIXME: array_has with a LargeList haystack is not currently rewritten to InList
6070+ query TT
6071+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6072+ select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle);
6073+ ----
6074+ logical_plan
6075+ 01)Projection: count(Int64(1)) AS count(*)
6076+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6077+ 03)----SubqueryAlias: test
6078+ 04)------SubqueryAlias: t
6079+ 05)--------Projection:
6080+ 06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)))
6081+ 07)------------TableScan: tmp_table projection=[value]
6082+ physical_plan
6083+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6084+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6085+ 03)----CoalescePartitionsExec
6086+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6087+ 05)--------ProjectionExec: expr=[]
6088+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6089+ 07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8)), 1, 32))
6090+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6091+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6092+
6093+ query I
6094+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6095+ select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'FixedSizeList(4, Utf8View)'), needle);
6096+ ----
6097+ 1
6098+
6099+ query TT
6100+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6101+ select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'FixedSizeList(4, Utf8View)'), needle);
6102+ ----
6103+ logical_plan
6104+ 01)Projection: count(Int64(1)) AS count(*)
6105+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6106+ 03)----SubqueryAlias: test
6107+ 04)------SubqueryAlias: t
6108+ 05)--------Projection:
6109+ 06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
6110+ 07)------------TableScan: tmp_table projection=[value]
6111+ physical_plan
6112+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6113+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6114+ 03)----CoalescePartitionsExec
6115+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6116+ 05)--------ProjectionExec: expr=[]
6117+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6118+ 07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
6119+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6120+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6121+
6122+ query I
6123+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6124+ select count(*) from test WHERE array_has([needle], needle);
6125+ ----
6126+ 100000
6127+
6128+ # TODO: it should probably be possible to remove this filter completely, as it is always true?
6129+ query TT
6130+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6131+ select count(*) from test WHERE array_has([needle], needle);
6132+ ----
6133+ logical_plan
6134+ 01)Projection: count(Int64(1)) AS count(*)
6135+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6136+ 03)----SubqueryAlias: test
6137+ 04)------SubqueryAlias: t
6138+ 05)--------Projection:
6139+ 06)----------Filter: __common_expr_3 = __common_expr_3
6140+ 07)------------Projection: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) AS __common_expr_3
6141+ 08)--------------TableScan: tmp_table projection=[value]
6142+ physical_plan
6143+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6144+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6145+ 03)----CoalescePartitionsExec
6146+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6147+ 05)--------ProjectionExec: expr=[]
6148+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6149+ 07)------------FilterExec: __common_expr_3@0 = __common_expr_3@0
6150+ 08)--------------ProjectionExec: expr=[substr(md5(CAST(value@0 AS Utf8)), 1, 32) as __common_expr_3]
6151+ 09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6152+ 10)------------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6153+
59726154# any operator
59736155query ?
59746156select column3 from arrays where 'L'=any(column3);
0 commit comments