diff --git a/legate/numpy/array.py b/legate/numpy/array.py index 9a31ec2f1d..9f24e4ddfc 100644 --- a/legate/numpy/array.py +++ b/legate/numpy/array.py @@ -370,13 +370,6 @@ def _convert_key(self, key, stacklevel=2, first=True): def __getitem__(self, key): if key is None: raise KeyError("invalid key passed to legate.numpy.ndarray") - # If we're a scalar, we're our own value - if self.size == 1: - if (self.ndim == 0 and key != () and key != Ellipsis) or ( - key != ((0,) * self.ndim) - ): - raise KeyError("invalid key passed to legate.numpy.ndarray") - return self key = self._convert_key(key) return ndarray( shape=None, thunk=self._thunk.get_item(key, stacklevel=2) @@ -745,11 +738,6 @@ def __setitem__(self, key, value): temp = ndarray(value_array.shape, dtype=self.dtype) temp._thunk.convert(value_array._thunk, stacklevel=2) value_array = temp - if self.size == 1: - if (self.ndim == 0 and key != () and key != Ellipsis) or ( - key != ((0,) * self.ndim) - ): - raise KeyError("invalid key passed to legate.numpy.ndarray") key = self._convert_key(key) self._thunk.set_item(key, value_array._thunk, stacklevel=2) diff --git a/legate/numpy/config.py b/legate/numpy/config.py index 16023bcdb2..f33883da93 100644 --- a/legate/numpy/config.py +++ b/legate/numpy/config.py @@ -189,6 +189,8 @@ class NumPyOpCode(IntEnum): INCLUSIVE_SCAN = legate_numpy.NUMPY_INCLUSIVE_SCAN CONVERT_TO_RECT = legate_numpy.NUMPY_CONVERT_TO_RECT ARANGE = legate_numpy.NUMPY_ARANGE + ZIP = legate_numpy.NUMPY_ZIP + TRANSFORM = legate_numpy.NUMPY_TRANSFORM # Match these to NumPyRedopID in legate_numpy_c.h @@ -317,6 +319,6 @@ class NumPyProjCode(IntEnum): PROJ_RADIX_3D_Z_4_2 = legate_numpy.NUMPY_PROJ_RADIX_3D_Z_4_2 PROJ_RADIX_3D_Z_4_3 = legate_numpy.NUMPY_PROJ_RADIX_3D_Z_4_3 # Flattening - PROJ_ND_1D_C_ORDER = legate_numpy.NUMPY_PROJ_ND_1D_C_ORDER + PROJ_ND_MD_C_ORDER = legate_numpy.NUMPY_PROJ_ND_MD_C_ORDER # Must always be last PROJ_LAST = legate_numpy.NUMPY_PROJ_LAST diff --git a/legate/numpy/deferred.py 
b/legate/numpy/deferred.py index 95a2ea5e2e..efca1c34d6 100644 --- a/legate/numpy/deferred.py +++ b/legate/numpy/deferred.py @@ -23,7 +23,7 @@ from .config import * # noqa F403 from .thunk import NumPyThunk -from .utils import calculate_volume +from .utils import calculate_volume, get_point_dim try: xrange # Python 2 @@ -44,11 +44,10 @@ class DeferredArray(NumPyThunk): :meta private: """ - def __init__(self, runtime, base, shape, dtype, scalar): + def __init__(self, runtime, base, shape, dtype): NumPyThunk.__init__(self, runtime, shape, dtype) assert base is not None self.base = base # Either a RegionField or a Future - self.scalar = scalar @property def storage(self): @@ -62,7 +61,7 @@ def ndim(self): return len(self.shape) def __numpy_array__(self, stacklevel): - if self.scalar: + if self.size == 1: return np.full( self.shape, self.get_scalar_array(stacklevel=(stacklevel + 1)), @@ -310,13 +309,13 @@ def get_scalar_array(self, stacklevel): def _create_indexing_array(self, key, stacklevel): # Convert everything into deferred arrays of int64 if isinstance(key, tuple): - tuple_of_arrays = () + tuple_of_keys = () for k in key: if not isinstance(k, NumPyThunk): raise NotImplementedError( "need support for mixed advanced indexing" ) - tuple_of_arrays += (k,) + tuple_of_keys += (k,) else: assert isinstance(key, NumPyThunk) # Handle the boolean array case @@ -326,29 +325,178 @@ def _create_indexing_array(self, key, stacklevel): "Boolean advanced indexing dimension mismatch" ) # For boolean arrays do the non-zero operation to make - # them into a normal indexing array - tuple_of_arrays = key.nonzero(stacklevel + 1) + # them into a set of integer indexing arrays + tuple_of_keys = key.nonzero(stacklevel + 1) else: - tuple_of_arrays = (key,) - if len(tuple_of_arrays) != self.ndim: + tuple_of_keys = (key,) + if len(tuple_of_keys) != self.ndim: raise TypeError("Advanced indexing dimension mismatch") - if self.ndim > 1: - # Check that all the arrays can be broadcast together - # 
Concatenate all the arrays into a single array - raise NotImplementedError("need support for concatenating arrays") + index_arrays = [] + result_shape = tuple_of_keys[0].shape + for k in tuple_of_keys: + # TODO: Implement broadcasting for advanced indexing arrays + if k.shape != result_shape: + raise NotImplementedError( + "Broadcasting for advanced indexing arrays" + ) + # TODO: Support singleton arrays in advanced indexing + if k.size <= 1: + raise NotImplementedError( + "Singleton arrays in advanced indexing" + ) + arr = self.runtime.to_deferred_array( + k, stacklevel=(stacklevel + 1) + ) + if arr.dtype != np.dtype(np.int64): + converted_array = self.runtime.to_deferred_array( + self.runtime.create_empty_thunk( + shape=arr.shape, + dtype=np.dtype(np.int64), + inputs=(arr,), + ), + stacklevel=(stacklevel + 1), + ) + converted_array.convert( + arr, warn=False, stacklevel=(stacklevel + 1) + ) + arr = converted_array + index_arrays.append(arr) + # Zip index arrays if necessary + if len(index_arrays) == 1: + return index_arrays[0] + # NOTE: We need to instantiate a RegionField of non-primitive dtype, to + # store N-dimensional index points, to be used as the indirection field + # in a copy. Such dtypes are technically not supported, but it should + # be safe to directly create a DeferredArray of that dtype, so long as + # we don't try to convert it to a NumPy array. 
+ result_dtype = np.dtype((np.int64, (len(index_arrays),))) + result_array = DeferredArray( + self.runtime, + self.runtime.allocate_field(result_shape, result_dtype), + shape=result_shape, + dtype=result_dtype, + ) + result = result_array.base + launch_space = result.compute_parallel_launch_space() + # Pack our arguments + argbuf = BufferBuilder() + if launch_space is not None: + ( + result_part, + shardfn, + shardsp, + ) = result.find_or_create_key_partition() + self.pack_shape( + argbuf, result_array.shape, result_part.tile_shape, 0 + ) else: - return self.runtime.to_deferred_array( - tuple_of_arrays[0], stacklevel=(stacklevel + 1) + self.pack_shape(argbuf, result_array.shape) + argbuf.pack_accessor(result.field.field_id, result.transform) + # All arrays are of the same shape, so no need for transform accessors + for arr in index_arrays: + argbuf.pack_accessor(arr.base.field.field_id, arr.base.transform) + task_id = self.runtime.get_zip_task_id(len(index_arrays)) + if launch_space is not None: + # Index task launch case + task = IndexTask( + task_id, + Rect(launch_space), + self.runtime.empty_argmap, + argbuf.get_string(), + argbuf.get_size(), + mapper=self.runtime.mapper_id, + tag=shardfn, ) + if shardsp is not None: + task.set_sharding_space(shardsp) + task.add_write_requirement( + result_part, + result.field.field_id, + 0, + tag=NumPyMappingTag.KEY_REGION_TAG, + ) + + for arr in index_arrays: + task.add_read_requirement( + arr.base.find_or_create_congruent_partition(result_part), + arr.base.field.field_id, + 0, + ) + self.runtime.dispatch(task) + else: + # Single task launch case + shardpt, shardfn, shardsp = result.find_point_sharding() + task = Task( + task_id, + argbuf.get_string(), + argbuf.get_size(), + mapper=self.runtime.mapper_id, + tag=shardfn, + ) + if shardpt is not None: + task.set_point(shardpt) + if shardsp is not None: + task.set_sharding_space(shardsp) + task.add_write_requirement(result.region, result.field.field_id) + for arr in index_arrays: 
+ task.add_read_requirement( + arr.base.region, arr.base.field.field_id + ) + self.runtime.dispatch(task) + return result_array + + # Gets rid of any transforms + def materialize(self, stacklevel): + if isinstance(self.base, Future): + return self + result = DeferredArray( + self.runtime, + self.runtime.allocate_field(self.shape, self.dtype), + shape=self.shape, + dtype=self.dtype, + ) + result.copy(self, stacklevel=(stacklevel + 1), deep=False) + return result + + # Apply a transform to every element in an array of Points + def apply_transform(self, transform, stacklevel): + assert get_point_dim(self.dtype) == transform.N + result_dtype = ( + np.dtype(np.int64) + if transform.M == 1 + else np.dtype((np.int64, (transform.M,))) + ) + argbuf = BufferBuilder() + argbuf.pack_transform(transform) + args = (argbuf.get_string(),) + result = DeferredArray( + self.runtime, + self.runtime.allocate_field(self.shape, result_dtype), + shape=self.shape, + dtype=result_dtype, + ) + result.unary_op( + NumPyOpCode.TRANSFORM, + self.dtype, + self, + True, + args, + stacklevel=(stacklevel + 1), + ) + return result def get_item(self, key, stacklevel, view=None, dim_map=None): - assert self.size > 1 # Check to see if this is advanced indexing or not if self._is_advanced_indexing(key): # Create the indexing array index_array = self._create_indexing_array( key, stacklevel=(stacklevel + 1) ) + # TODO: Support singleton arrays in advanced indexing + if self.size <= 1: + raise NotImplementedError( + "Singleton arrays in advanced indexing" + ) # Create a new array to be the result result_field = self.runtime.allocate_field( index_array.shape, dtype=self.dtype @@ -358,40 +506,43 @@ def get_item(self, key, stacklevel, view=None, dim_map=None): result_field, shape=index_array.shape, dtype=self.dtype, - scalar=False, ) - # Issue the gather copy to the result - index = index_array.base - launch_space = index.compute_parallel_launch_space() - src = self.base + # Prepare regions for the gather 
copy: Get rid of all transforms, + # so that we use the underlying "global" coordinate space directly + # (gather copies can't accept accessors, that are normally used + # for this purpose). + src = self.base.root dst = result_field - if src.transform is not None or index.transform is not None: - raise NotImplementedError("views with advanced indexing") + if self.base.transform is not None: + src_indirect = index_array.apply_transform( + self.base.transform, stacklevel=(stacklevel + 1) + ).base + elif index_array.base.transform is not None: + src_indirect = index_array.materialize( + stacklevel=(stacklevel + 1) + ).base + else: + src_indirect = index_array.base + # Issue the gather copy to the result + ( + src_indirect_part, + shardfn, + shardsp, + ) = src_indirect.find_or_create_key_partition() + launch_space = src_indirect.compute_parallel_launch_space() if launch_space is not None: # Index copy launch - if self.ndim == index_array.ndim: + if len(src.shape) == len(src_indirect.shape): # If we have the same dimensionality then normal # partitioning works - ( - src_part, - shardfn, - shardsp, - ) = src.find_or_create_key_partition() + src_part = src.find_or_create_partition(launch_space) src_proj = 0 # identity - index_part = index.find_or_create_congruent_partition( - src_part - ) else: # Otherwise we need to compute an indirect partition # and functor src_part, src_proj = src.find_or_create_indirect_partition( launch_space ) - ( - index_part, - shardfn, - shardsp, - ) = index.find_or_create_key_partition() copy = IndexCopy( Rect(launch_space), mapper=self.runtime.mapper_id, @@ -401,7 +552,9 @@ def get_item(self, key, stacklevel, view=None, dim_map=None): copy.set_sharding_space(shardsp) # Partition the index array and the destination array # the same way - dst_part = dst.find_or_create_congruent_partition(index_part) + dst_part = dst.find_or_create_congruent_partition( + src_indirect_part + ) # Set this as the key partition dst.set_key_partition(dst_part, 
shardfn, shardsp) copy.add_src_requirement( @@ -414,11 +567,11 @@ def get_item(self, key, stacklevel, view=None, dim_map=None): tag=NumPyMappingTag.KEY_REGION_TAG, ) copy.add_src_indirect_requirement( - index_part, index.field.field_id, 0 + src_indirect_part, src_indirect.field.field_id, 0 ) else: # Single copy launch - shardpt, shardfn, shardsp = index.find_point_sharding() + shardpt, shardfn, shardsp = src_indirect.find_point_sharding() copy = Copy(mapper=self.runtime.mapper_id, tag=shardfn) if shardpt is not None: copy.set_point(shardpt) @@ -427,11 +580,28 @@ def get_item(self, key, stacklevel, view=None, dim_map=None): copy.add_src_requirement(src.region, src.field.field_id) copy.add_dst_requirement(dst.region, dst.field.field_id) copy.add_src_indirect_requirement( - index.region, index.field.field_id + src_indirect.region, src_indirect.field.field_id ) # Issue the copy to the runtime self.runtime.dispatch(copy) + elif self.size == 1: + # Reading from a singleton array, using basic indexing: simulate + # the operation in numpy and see how many values need to be + # returned + sim_result = np.zeros(self.shape)[key] + is_number = type(sim_result) is not np.ndarray + result_size = ( + 1 if is_number else calculate_volume(sim_result.shape) + ) + assert result_size <= 1 + result = DeferredArray( + self.runtime, + base=(self.base if result_size == 1 else Future()), + shape=(() if is_number else sim_result.shape), + dtype=self.dtype, + ) else: + # Reading from a non-singleton array, using basic indexing if view is None or dim_map is None: view, dim_map = self._get_view(key) new_shape = self._get_view_shape(view, dim_map) @@ -476,7 +646,6 @@ def get_item(self, key, stacklevel, view=None, dim_map=None): base=future, shape=(), dtype=self.dtype, - scalar=True, ) if self.runtime.shadow_debug: result.shadow = self.shadow.get_item( @@ -495,48 +664,56 @@ def set_item(self, key, rhs, stacklevel): index_array = self._create_indexing_array( key, stacklevel=(stacklevel + 1) ) + # 
TODO: Support singleton arrays in advanced indexing + if self.size <= 1 or value_array.size <= 1: + raise NotImplementedError( + "Singleton arrays in advanced indexing" + ) if index_array.shape != value_array.shape: raise ValueError( "Advanced indexing array does not match source shape" ) - # Do the scatter copy - index = index_array.base - launch_space = index.compute_parallel_launch_space() - dst = self.base + # Prepare regions for the scatter copy: Get rid of all transforms, + # so that we use the underlying "global" coordinate space directly + # (scatter copies can't accept accessors, that are normally used + # for this purpose). src = value_array.base + if src.transform is not None: + # Waiting for Legion Issue #705 + raise NotImplementedError("views on advanced __set_item__ RHS") + dst = self.base.root + if self.base.transform is not None: + dst_indirect = index_array.apply_transform( + self.base.transform, stacklevel=(stacklevel + 1) + ).base + elif index_array.base.transform is not None: + dst_indirect = index_array.materialize( + stacklevel=(stacklevel + 1) + ).base + else: + dst_indirect = index_array.base if src.field is dst.field: raise NotImplementedError("intra-array advanced coyping") - if ( - src.transform is not None - or index.transform is not None - or dst.transform is not None - ): - raise NotImplementedError("views with advanced indexing") + # Do the scatter copy + ( + dst_indirect_part, + shardfn, + shardsp, + ) = dst_indirect.find_or_create_key_partition() + launch_space = dst_indirect.compute_parallel_launch_space() if launch_space is not None: # Index copy launch - if self.ndim == index_array.ndim: + if len(dst.shape) == len(dst_indirect.shape): # If we have the same dimensionality then normal # partitioning works - ( - dst_part, - shardfn, - shardsp, - ) = dst.find_or_create_key_partition() + dst_part = dst.find_or_create_partition(launch_space) dst_proj = 0 # identity - index_part = index.find_or_create_congruent_partition( - dst_part - ) 
else: # Otherwise we need to compute an indirect partition # and functor dst_part, dst_proj = dst.find_or_create_indirect_partition( launch_space ) - ( - index_part, - shardfn, - shardsp, - ) = index.find_or_create_key_partition() copy = IndexCopy( Rect(launch_space), mapper=self.runtime.mapper_id, @@ -544,36 +721,49 @@ def set_item(self, key, rhs, stacklevel): ) if shardsp is not None: copy.set_sharding_space(shardsp) - src_part = src.find_or_create_congruent_partition(index_part) + src_part = src.find_or_create_congruent_partition( + dst_indirect_part + ) copy.add_src_requirement(src_part, src.field.field_id, 0) copy.add_dst_requirement( dst_part, dst.field.field_id, dst_proj ) copy.add_dst_indirect_requirement( - index_part, index.field.field_id, 0 + dst_indirect_part, dst_indirect.field.field_id, 0 ) else: # Single copy launch - point, shardfn, shardsp = index.find_point_sharding() + shardpt, shardfn, shardsp = dst_indirect.find_point_sharding() copy = Copy(mapper=self.runtime.mapper_id, tag=shardfn) - if point is not None: - copy.set_point(point) + if shardpt is not None: + copy.set_point(shardpt) if shardsp is not None: copy.set_sharding_space(shardsp) copy.add_src_requirement(src.region, src.field.field_id) copy.add_dst_requirement(dst.region, dst.field.field_id) copy.add_dst_indirect_requirement( - index.region, index.field.field_id + dst_indirect.region, dst_indirect.field.field_id ) # Issue the copy to the runtime self.runtime.dispatch(copy) elif self.size == 1: - assert value_array.size == 1 - # Special case of writing a single value - # We can just copy the future because they are immutable - self.base = value_array.base + # Writing to a singleton array, using basic indexing: simulate the + # operation in numpy and see how many values need to be updated. + # If it is a single value we can just copy the future because they + # are immutable. We don't check for exact match of shapes between + # LHS and RHS to capture cases of broadcasting. 
+ updated_part = np.zeros(self.shape)[key] + updated_values = ( + calculate_volume(updated_part.shape) + if type(updated_part) is np.ndarray + else 1 + ) + assert updated_values <= 1 + if updated_values == 1: + assert value_array.size == 1 + self.base = value_array.base else: - # Writing to a view of this array + # Writing to a view of a non-singleton array, using basic indexing view, dim_map = self._get_view(key) # See what the shape of the view is new_shape = self._get_view_shape(view, dim_map) @@ -729,7 +919,7 @@ def squeeze(self, axis, stacklevel): raise TypeError( '"axis" argument for squeeze must be int-like or tuple-like' ) - if not self.scalar: + if self.size != 1: # Make transform for the lost dimensions transform = AffineTransform(self.ndim, len(new_shape), False) child_idx = 0 @@ -745,7 +935,7 @@ def squeeze(self, axis, stacklevel): else: # Easy case of size 1 array, nothing to do result = DeferredArray( - self.runtime, self.base, new_shape, self.dtype, True + self.runtime, self.base, new_shape, self.dtype ) if self.runtime.shadow_debug: result.shadow = self.shadow.squeeze( @@ -2925,7 +3115,7 @@ def nonzero(self, stacklevel, callsite=None): for _ in xrange(self.ndim): result += ( self.runtime.create_empty_thunk( - shape=(0,), dtype=np.dtype(np.uint64), inputs=(self,) + shape=(0,), dtype=np.dtype(np.int64), inputs=(self,) ), ) return result @@ -2934,7 +3124,7 @@ def nonzero(self, stacklevel, callsite=None): # TODO: A better way to create the output array? 
dst_array = self.runtime.to_deferred_array( self.runtime.create_empty_thunk( - shape=out_shape, dtype=np.dtype(np.uint64), inputs=(self,) + shape=out_shape, dtype=np.dtype(np.int64), inputs=(self,) ), stacklevel=(stacklevel + 1), ) @@ -2958,7 +3148,7 @@ def nonzero(self, stacklevel, callsite=None): argbuf, nonzeros_dist_shape, nonzeros_dist_part.tile_shape, - self.runtime.first_proj_id + NumPyProjCode.PROJ_ND_1D_C_ORDER, + self.runtime.first_proj_id + NumPyProjCode.PROJ_ND_MD_C_ORDER, ) argbuf.pack_accessor( nonzeros_dist_field.field.field_id, @@ -2988,7 +3178,7 @@ def nonzero(self, stacklevel, callsite=None): index_task.add_write_requirement( nonzeros_dist_part, nonzeros_dist_field.field.field_id, - self.runtime.first_proj_id + NumPyProjCode.PROJ_ND_1D_C_ORDER, + self.runtime.first_proj_id + NumPyProjCode.PROJ_ND_MD_C_ORDER, tag=NumPyMappingTag.NO_MEMOIZE_TAG, ) self.runtime.dispatch(index_task) @@ -3032,7 +3222,7 @@ def nonzero(self, stacklevel, callsite=None): task = Task( self.runtime.get_unary_task_id( NumPyOpCode.CONVERT_TO_RECT, - result_type=np.dtype(np.uint64), + result_type=nonzeros_ranges_field.field.dtype, argument_type=nonzeros_dist_field.field.dtype, ), argbuf.get_string(), @@ -3076,7 +3266,7 @@ def nonzero(self, stacklevel, callsite=None): index_task = IndexTask( self.runtime.get_unary_task_id( NumPyOpCode.NONZERO, - result_type=np.dtype(np.uint64), + result_type=np.dtype(np.int64), argument_type=self.dtype, ), Rect(launch_space), @@ -3095,7 +3285,7 @@ def nonzero(self, stacklevel, callsite=None): index_task.add_write_requirement( dst_partition, dst.field.field_id, - self.runtime.first_proj_id + NumPyProjCode.PROJ_ND_1D_C_ORDER, + self.runtime.first_proj_id + NumPyProjCode.PROJ_ND_MD_C_ORDER, tag=NumPyMappingTag.NO_MEMOIZE_TAG, ) self.runtime.dispatch(index_task) @@ -3111,8 +3301,8 @@ def nonzero(self, stacklevel, callsite=None): task = Task( self.runtime.get_unary_task_id( NumPyOpCode.NONZERO, + result_type=np.dtype(np.int64), 
argument_type=self.dtype, - result_type=self.dtype, ), argbuf.get_string(), argbuf.get_size(), @@ -5683,11 +5873,8 @@ def add_read_requirement(task, region, tag): # A helper method for attaching arguments def add_arguments(self, task, args): assert args is not None - for numpy_array in args: - assert numpy_array.size == 1 - future = self.runtime.create_future( - numpy_array.data, numpy_array.nbytes - ) + for buf in args: + future = self.runtime.create_future(buf, memoryview(buf).nbytes) task.add_future(future) # A helper method for support for 16 bit arithmetic diff --git a/legate/numpy/runtime.py b/legate/numpy/runtime.py index 8498f78884..9fdca8a138 100644 --- a/legate/numpy/runtime.py +++ b/legate/numpy/runtime.py @@ -58,7 +58,7 @@ from .eager import EagerArray from .lazy import LazyArray from .thunk import NumPyThunk -from .utils import calculate_volume +from .utils import calculate_volume, get_point_dim # Helper method for python 3 support @@ -562,6 +562,13 @@ def set_key_partition(self, part, shardfn=None, shardsp=None): self.shard_function = shardfn self.shard_space = shardsp + @property + def root(self): + res = self + while res.parent is not None: + res = res.parent + return res + def find_or_create_key_partition(self): if self.key_partition is not None: return self.key_partition, self.shard_function, self.shard_space @@ -570,9 +577,7 @@ def find_or_create_key_partition(self): return None, None, None if self.parent is not None: # Figure out how many tiles we overlap with of the root - root = self.parent - while root.parent is not None: - root = root.parent + root = self.root root_key, rootfn, rootsp = root.find_or_create_key_partition() if root_key is None: self.launch_space = () @@ -798,11 +803,21 @@ def find_or_create_partition( ) def find_or_create_indirect_partition(self, launch_space): - assert len(launch_space) != len(self.shape) # If there is a mismatch in the number of dimensions then we need # to compute a partition and projection functor that can 
transform - # the points into a partition that makes sense - raise NotImplementedError("need support for indirect partitioning") + # the points into a partition that makes sense. + assert len(launch_space) != len(self.shape) + # For now cover the common case where we launch over a space with the + # same number of points as the array's key partition. + key_space = self.compute_parallel_launch_space() + if calculate_volume(key_space) != calculate_volume(launch_space): + raise NotImplementedError( + "indirect partitioning over launch space of different size " + "than key partition" + ) + key_part, _, _ = self.find_or_create_key_partition() + proj = self.runtime.first_proj_id + NumPyProjCode.PROJ_ND_MD_C_ORDER + return key_part, proj def attach_numpy_array(self, numpy_array, share=False): assert self.parent is None @@ -1352,9 +1367,7 @@ def create_future(self, data, size, wrap=False, dtype=None, shape=()): # If we have a shape then all the extents should be 1 for now for extent in shape: assert extent == 1 - return DeferredArray( - self, result, shape=shape, dtype=dtype, scalar=True - ) + return DeferredArray(self, result, shape=shape, dtype=dtype) else: return result @@ -1576,7 +1589,6 @@ def find_or_create_view(self, parent, view, dim_map, shape, key): child, shape, parent.field.dtype, - scalar=False, ) # We need to make this subview # If all the slices have strides of one then this is a dense
self, region_field, shape, parent.field.dtype ) else: tile_shape = tuple(map(lambda x, y: ((x - y) + 1), hi, lo)) @@ -1749,7 +1759,7 @@ def find_or_create_view(self, parent, view, dim_map, shape, key): ) parent.subviews.add(region_field) return DeferredArray( - self, region_field, shape, parent.field.dtype, scalar=False + self, region_field, shape, parent.field.dtype ) else: # We need fill in a phased partition operation from Legion @@ -1773,7 +1783,6 @@ def create_transform_view(self, region_field, new_shape, transform): new_region_field, new_shape, region_field.field.dtype, - scalar=False, ) def find_or_create_transform_sharding_functor(self, transform): @@ -1888,9 +1897,7 @@ def get_numpy_thunk(self, obj, stacklevel, share=False, dtype=None): dtype = np.dtype(store.type.to_pandas_dtype()) primitive = store.storage if kind == Future: - return DeferredArray( - self, primitive, shape=(), dtype=dtype, scalar=True - ) + return DeferredArray(self, primitive, shape=(), dtype=dtype) elif kind == FutureMap: raise NotImplementedError("Need support for FutureMap inputs") elif kind == (Region, FieldID): @@ -1903,9 +1910,7 @@ def get_numpy_thunk(self, obj, stacklevel, share=False, dtype=None): ) else: raise TypeError("Unknown LegateStore type") - return DeferredArray( - self, region_field, region_field.shape, dtype, scalar=False - ) + return DeferredArray(self, region_field, region_field.shape, dtype) # See if this is a normal numpy array if not isinstance(obj, np.ndarray): # If it's not, make it into a numpy array @@ -2170,7 +2175,6 @@ def find_or_create_array_thunk( region_field, shape=array.shape, dtype=array.dtype, - scalar=(array.size == 1), ) # If we're doing shadow debug make an EagerArray shadow if self.shadow_debug: @@ -2195,14 +2199,12 @@ def create_empty_thunk(self, shape, dtype, inputs=None): ): if len(shape) == 0: # Empty tuple - result = DeferredArray( - self, Future(), shape=(), dtype=dtype, scalar=True - ) + result = DeferredArray(self, Future(), shape=(), 
dtype=dtype) else: volume = reduce(lambda x, y: x * y, shape) if volume == 1: result = DeferredArray( - self, Future(), shape=shape, dtype=dtype, scalar=True + self, Future(), shape=shape, dtype=dtype ) else: region_field = self.allocate_field(shape, dtype) @@ -2211,7 +2213,6 @@ def create_empty_thunk(self, shape, dtype, inputs=None): region_field, shape=shape, dtype=dtype, - scalar=False, ) # If we're doing shadow debug make an EagerArray shadow if self.shadow_debug: @@ -2324,6 +2325,16 @@ def get_unary_task_id( * NUMPY_MAX_VARIANTS ) + # TRANSFORM's ID is special + if op_code == NumPyOpCode.TRANSFORM: + assert variant_code == NumPyVariantCode.NORMAL + return ( + self.first_task_id + + legate_numpy.NUMPY_TRANSFORM_OFFSET + + get_point_dim(result_type) * (LEGATE_MAX_DIM + 1) + + get_point_dim(argument_type) + ) + # unary tasks distinguish themselves by argument_type return ( self.first_task_id @@ -2388,6 +2399,11 @@ def get_weighted_bincount_task_id(self, dt1, dt2): result += numpy_field_type_offsets[dt2.type] * NUMPY_MAX_VARIANTS return result + def get_zip_task_id(self, num_coord_arrays): + result = self.first_task_id + legate_numpy.NUMPY_ZIP_OFFSET + result += num_coord_arrays + return result + def get_reduction_op_id(self, op, field_dtype): redop_id = numpy_reduction_op_offsets[op] if redop_id < legion.LEGION_REDOP_KIND_TOTAL: diff --git a/legate/numpy/utils.py b/legate/numpy/utils.py index 76792e2085..ffd1b20c68 100644 --- a/legate/numpy/utils.py +++ b/legate/numpy/utils.py @@ -19,6 +19,8 @@ import numpy as np +from legate.core import LEGATE_MAX_DIM + try: reduce # Python 2 except NameError: @@ -100,3 +102,16 @@ def calculate_volume(shape): if shape == (): return 0 return reduce(lambda x, y: x * y, shape) + + +def get_point_dim(dtype): + if dtype == np.int64: + # Special case: allowing coord_t in place of Point<1>; this is fine + # because physical regions are untyped + return 1 + assert ( + dtype.ndim == 1 + and dtype.base == np.int64 + and dtype.shape[0] <= 
LEGATE_MAX_DIM + ) + return dtype.shape[0] diff --git a/src/clip.h b/src/clip.h index dcab04a026..f83aeafab3 100644 --- a/src/clip.h +++ b/src/clip.h @@ -34,24 +34,33 @@ struct ClipOperation { #if defined(LEGATE_USE_CUDA) && defined(__CUDACC__) template -__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) gpu_clip(const Args args) +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + gpu_clip(const Args args, const bool dense) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= args.volume) return; - const Legion::Point point = args.pitches.unflatten(idx, args.rect.lo); ClipOperation func; - args.out[point] = func(args.in[point], args.min, args.max); + if (dense) { + args.outptr[idx] = func(args.inptr[idx], args.min, args.max); + } else { + const Legion::Point point = args.pitches.unflatten(idx, args.rect.lo); + args.out[point] = func(args.in[point], args.min, args.max); + } } template __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - gpu_clip_inplace(const Args args) + gpu_clip_inplace(const Args args, const bool dense) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= args.volume) return; - const Legion::Point point = args.pitches.unflatten(idx, args.rect.lo); ClipOperation func; - args.inout[point] = func(args.inout[point], args.min, args.max); + if (dense) { + args.inoutptr[idx] = func(args.inoutptr[idx], args.min, args.max); + } else { + const Legion::Point point = args.pitches.unflatten(idx, args.rect.lo); + args.inout[point] = func(args.inout[point], args.min, args.max); + } } #endif @@ -145,10 +154,10 @@ class ClipTask : public PointTask> { LegateDeserializer& derez) { DeserializedArgs args; - args.deserialize(derez, task, regions); + const bool dense = args.deserialize(derez, task, regions); if (args.volume == 0) return; const size_t blocks = (args.volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - gpu_clip><<>>(args); + gpu_clip><<>>(args, dense); } 
#elif defined(LEGATE_USE_CUDA) template @@ -243,10 +252,10 @@ class ClipInplace : public PointTask> { LegateDeserializer& derez) { DeserializedArgs args; - args.deserialize(derez, task, regions); + const bool dense = args.deserialize(derez, task, regions); if (args.volume == 0) return; const size_t blocks = (args.volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - gpu_clip_inplace><<>>(args); + gpu_clip_inplace><<>>(args, dense); } #elif defined(LEGATE_USE_CUDA) template diff --git a/src/fill.h b/src/fill.h index c381626a75..1d382b468e 100644 --- a/src/fill.h +++ b/src/fill.h @@ -24,12 +24,17 @@ namespace numpy { #if defined(LEGATE_USE_CUDA) && defined(__CUDACC__) template -__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) gpu_fill(const Args args) +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + gpu_fill(const Args args, const bool dense) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= args.volume) return; - const Legion::Point point = args.pitches.unflatten(idx, args.rect.lo); - args.out[point] = args.value; + if (dense) { + args.outptr[idx] = args.value; + } else { + const Legion::Point point = args.pitches.unflatten(idx, args.rect.lo); + args.out[point] = args.value; + } } #endif @@ -112,10 +117,10 @@ struct FillTask : PointTask> { LegateDeserializer& derez) { DeserializedArgs args; - args.deserialize(derez, task, regions); + const bool dense = args.deserialize(derez, task, regions); if (args.volume == 0) return; const size_t blocks = (args.volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - gpu_fill><<>>(args); + gpu_fill><<>>(args, dense); } #elif defined(LEGATE_USE_CUDA) template diff --git a/src/legate_numpy_c.h b/src/legate_numpy_c.h index 959f8b6e35..3b8cb6ab8c 100644 --- a/src/legate_numpy_c.h +++ b/src/legate_numpy_c.h @@ -107,6 +107,8 @@ enum NumPyOpCode { NUMPY_INCLUSIVE_SCAN = 72, NUMPY_CONVERT_TO_RECT = 73, NUMPY_ARANGE = 74, + NUMPY_ZIP = 75, + NUMPY_TRANSFORM = 76, }; // 
Match these to NumPyRedopCode in legate/numpy/config.py @@ -175,7 +177,7 @@ enum NumPyProjectionCode { NUMPY_PROJ_RADIX_3D_Z_4_2 = 46, NUMPY_PROJ_RADIX_3D_Z_4_3 = 47, // Flattening - NUMPY_PROJ_ND_1D_C_ORDER = 48, + NUMPY_PROJ_ND_MD_C_ORDER = 48, // Must always be last NUMPY_PROJ_LAST = 49, }; @@ -242,8 +244,10 @@ enum NumPyShardingCode { }; enum NumpyTaskOffset { - NUMPY_CONVERT_OFFSET = 100000, - NUMPY_BINCOUNT_OFFSET = 200000, + NUMPY_CONVERT_OFFSET = 100000, + NUMPY_BINCOUNT_OFFSET = 200000, + NUMPY_ZIP_OFFSET = 300000, + NUMPY_TRANSFORM_OFFSET = 400000, }; // Match these to NumPyMappingTag in legate/numpy/config.py diff --git a/src/numpy.mk b/src/numpy.mk index d6c9351706..d062946b05 100644 --- a/src/numpy.mk +++ b/src/numpy.mk @@ -13,7 +13,7 @@ # limitations under the License. # -# List all the application source files that need OpenMP separately +# List all the application source files that need OpenMP separately # since we have to add the -fopenmp flag to CC_FLAGS for them GEN_CPU_SRC += universal_functions/absolute.cc \ universal_functions/add.cc \ @@ -93,6 +93,8 @@ GEN_CPU_SRC += universal_functions/absolute.cc \ tile.cc \ trans.cc \ where.cc \ + zip.cc \ + transform.cc \ numpy.cc # This must always be the last file! 
# It guarantees we do our registration callback # only after all task variants are recorded @@ -169,4 +171,6 @@ GEN_GPU_SRC += universal_functions/absolute.cu \ universal_functions/tanh.cu \ tile.cu \ trans.cu \ - where.cu + where.cu \ + zip.cu \ + transform.cu diff --git a/src/point_task.h b/src/point_task.h index 372e52866f..69c2ddebf3 100644 --- a/src/point_task.h +++ b/src/point_task.h @@ -31,13 +31,13 @@ namespace numpy { template constexpr int task_id = -1; -// nullary tasks themselves by ResultType +// nullary tasks distinguish themselves by ResultType template constexpr int task_id = static_cast(op_code) * NUMPY_TYPE_OFFSET + variant_code + legate_type_code_of* NUMPY_MAX_VARIANTS; -// unary tasks themselves by ArgumentType +// unary tasks distinguish themselves by ArgumentType template constexpr int task_id = static_cast(op_code) * NUMPY_TYPE_OFFSET + variant_code @@ -181,6 +181,14 @@ class CPULoop<1> { for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) inout[x] = func(inout[x], in[x], std::forward(args)...); } + template + static inline void generic_loop(const Legion::Rect<1>& rect, + const Function& func, + const T& out, + const Ts&... in) + { + for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) out[x] = func(in[x]...); + } }; template <> @@ -224,6 +232,15 @@ class CPULoop<2> { for (int y = rect.lo[1]; y <= rect.hi[1]; ++y) inout[x][y] = func(inout[x][y], in[x][y], std::forward(args)...); } + template + static inline void generic_loop(const Legion::Rect<2>& rect, + const Function& func, + const T& out, + const Ts&... in) + { + for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) + for (int y = rect.lo[1]; y <= rect.hi[1]; ++y) out[x][y] = func(in[x][y]...); + } }; template <> @@ -271,6 +288,16 @@ class CPULoop<3> { for (int z = rect.lo[2]; z <= rect.hi[2]; ++z) inout[x][y][z] = func(inout[x][y][z], in[x][y][z], std::forward(args)...); } + template + static inline void generic_loop(const Legion::Rect<3>& rect, + const Function& func, + const T& out, + const Ts&... 
in) + { + for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) + for (int y = rect.lo[1]; y <= rect.hi[1]; ++y) + for (int z = rect.lo[2]; z <= rect.hi[2]; ++z) out[x][y][z] = func(in[x][y][z]...); + } }; template <> @@ -323,6 +350,17 @@ class CPULoop<4> { inout[x][y][z][w] = func(inout[x][y][z][w], in[x][y][z][w], std::forward(args)...); } + template + static inline void generic_loop(const Legion::Rect<4>& rect, + const Function& func, + const T& out, + const Ts&... in) + { + for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) + for (int y = rect.lo[1]; y <= rect.hi[1]; ++y) + for (int z = rect.lo[2]; z <= rect.hi[2]; ++z) + for (int w = rect.lo[3]; w <= rect.hi[3]; ++w) out[x][y][z][w] = func(in[x][y][z][w]...); + } }; #ifdef LEGATE_USE_OPENMP @@ -373,6 +411,15 @@ class OMPLoop<1> { for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) inout[x] = func(inout[x], in[x], std::forward(args)...); } + template + static inline void generic_loop(const Legion::Rect<1>& rect, + const Function& func, + const T& out, + const Ts&... in) + { +#pragma omp parallel for schedule(static) + for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) out[x] = func(in[x]...); + } }; template <> @@ -420,6 +467,16 @@ class OMPLoop<2> { for (int y = rect.lo[1]; y <= rect.hi[1]; ++y) inout[x][y] = func(inout[x][y], in[x][y], std::forward(args)...); } + template + static inline void generic_loop(const Legion::Rect<2>& rect, + const Function& func, + const T& out, + const Ts&... in) + { +#pragma omp parallel for schedule(static), collapse(2) + for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) + for (int y = rect.lo[1]; y <= rect.hi[1]; ++y) out[x][y] = func(in[x][y]...); + } }; template <> @@ -471,6 +528,17 @@ class OMPLoop<3> { for (int z = rect.lo[2]; z <= rect.hi[2]; ++z) inout[x][y][z] = func(inout[x][y][z], in[x][y][z], std::forward(args)...); } + template + static inline void generic_loop(const Legion::Rect<3>& rect, + const Function& func, + const T& out, + const Ts&... 
in) + { +#pragma omp parallel for schedule(static), collapse(3) + for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) + for (int y = rect.lo[1]; y <= rect.hi[1]; ++y) + for (int z = rect.lo[2]; z <= rect.hi[2]; ++z) out[x][y][z] = func(in[x][y][z]...); + } }; template <> @@ -527,6 +595,18 @@ class OMPLoop<4> { inout[x][y][z][w] = func(inout[x][y][z][w], in[x][y][z][w], std::forward(args)...); } + template + static inline void generic_loop(const Legion::Rect<4>& rect, + const Function& func, + const T& out, + const Ts&... in) + { +#pragma omp parallel for schedule(static), collapse(4) + for (int x = rect.lo[0]; x <= rect.hi[0]; ++x) + for (int y = rect.lo[1]; y <= rect.hi[1]; ++y) + for (int z = rect.lo[2]; z <= rect.hi[2]; ++z) + for (int w = rect.lo[3]; w <= rect.hi[3]; ++w) out[x][y][z][w] = func(in[x][y][z][w]...); + } }; #endif diff --git a/src/proj.cc b/src/proj.cc index 7dea6e296d..59607d44c5 100644 --- a/src/proj.cc +++ b/src/proj.cc @@ -29,7 +29,8 @@ LogicalRegion NumPyProjectionFunctor::project(LogicalPartition upper_bound, const DomainPoint& point, const Domain& launch_domain) { - const DomainPoint dp = project_point(point, launch_domain); + Domain color_domain = runtime->get_index_partition_color_space(upper_bound.get_index_partition()); + const DomainPoint dp = project_point(color_domain, point, launch_domain); if (runtime->has_logical_subregion_by_color(upper_bound, dp)) return runtime->get_logical_subregion_by_color(upper_bound, dp); else @@ -62,7 +63,8 @@ NumPyProjectionFunctor_2D_1D::NumPyProjectionFunctor_2D_1D(NumPyProjectionCode c return result; } -DomainPoint NumPyProjectionFunctor_2D_1D::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_2D_1D::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Point<1> point = transform * Point<2>(p); @@ -110,7 +112,8 @@ NumPyProjectionFunctor_2D_2D::NumPyProjectionFunctor_2D_2D(NumPyProjectionCode c return result; } -DomainPoint 
NumPyProjectionFunctor_2D_2D::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_2D_2D::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Point<2> point = transform * Point<2>(p); @@ -145,7 +148,8 @@ NumPyProjectionFunctor_1D_2D::NumPyProjectionFunctor_1D_2D(NumPyProjectionCode c return result; } -DomainPoint NumPyProjectionFunctor_1D_2D::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_1D_2D::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Point<2> point = transform * Point<1>(p); @@ -221,7 +225,8 @@ NumPyProjectionFunctor_3D_2D::NumPyProjectionFunctor_3D_2D(NumPyProjectionCode c return result; } -DomainPoint NumPyProjectionFunctor_3D_2D::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_3D_2D::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Point<2> point = transform * Point<3>(p); @@ -263,7 +268,8 @@ NumPyProjectionFunctor_3D_1D::NumPyProjectionFunctor_3D_1D(NumPyProjectionCode c return result; } -DomainPoint NumPyProjectionFunctor_3D_1D::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_3D_1D::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Point<1> point = transform * Point<3>(p); @@ -374,7 +380,8 @@ NumPyProjectionFunctor_3D_3D::NumPyProjectionFunctor_3D_3D(NumPyProjectionCode c return result; } -DomainPoint NumPyProjectionFunctor_3D_3D::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_3D_3D::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Point<3> point = transform * Point<3>(p); @@ -431,7 +438,8 @@ NumPyProjectionFunctor_2D_3D::NumPyProjectionFunctor_2D_3D(NumPyProjectionCode c return result; } -DomainPoint 
NumPyProjectionFunctor_2D_3D::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_2D_3D::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Point<3> point = transform * Point<2>(p); @@ -473,31 +481,43 @@ NumPyProjectionFunctor_1D_3D::NumPyProjectionFunctor_1D_3D(NumPyProjectionCode c return result; } -DomainPoint NumPyProjectionFunctor_1D_3D::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_1D_3D::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Point<3> point = transform * Point<1>(p); return DomainPoint(point); } -NumPyProjectionFunctor_ND_1D_C_ORDER::NumPyProjectionFunctor_ND_1D_C_ORDER(NumPyProjectionCode c, +NumPyProjectionFunctor_ND_MD_C_ORDER::NumPyProjectionFunctor_ND_MD_C_ORDER(NumPyProjectionCode c, Runtime* rt) : NumPyProjectionFunctor(rt), code(c) { } -DomainPoint NumPyProjectionFunctor_ND_1D_C_ORDER::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_ND_MD_C_ORDER::project_point(const Domain& color_domain, + const DomainPoint& launch_point, const Domain& launch_domain) const { - auto hi = launch_domain.hi(); - auto lo = launch_domain.lo(); - coord_t index = p[0] - lo[0]; - coord_t partial_volume = hi[0] - lo[0] + 1; - for (int dim = 1; dim < p.get_dim(); ++dim) { - index += p[dim] * partial_volume; - partial_volume *= hi[dim] - lo[dim] + 1; + assert(color_domain.get_volume() == launch_domain.get_volume()); + const DomainPoint launch_hi = launch_domain.hi(); + const DomainPoint launch_lo = launch_domain.lo(); + coord_t index = 0; + coord_t partial_volume = 1; + for (int dim = launch_domain.get_dim() - 1; dim >= 0; --dim) { + index += (launch_point[dim] - launch_lo[dim]) * partial_volume; + partial_volume *= launch_hi[dim] - launch_lo[dim] + 1; } - return index; + const DomainPoint color_hi = color_domain.hi(); + const DomainPoint color_lo = color_domain.lo(); + DomainPoint 
color_point = color_lo; + for (int dim = color_domain.get_dim() - 1; dim >= 0; --dim) { + const coord_t diff = color_hi[dim] - color_lo[dim] + 1; + color_point[dim] += index % diff; + index /= diff; + } + assert(index == 0); + return color_point; } template @@ -507,7 +527,8 @@ NumPyProjectionFunctor_GEMV::NumPyProjectionFunctor_GEMV(NumPyProjectionCo } template -DomainPoint NumPyProjectionFunctor_GEMV::project_point(const DomainPoint& p, +DomainPoint NumPyProjectionFunctor_GEMV::project_point(const Domain& color_domain, + const DomainPoint& p, const Domain& launch_domain) const { const Rect<3> launch_rect = launch_domain; @@ -536,7 +557,7 @@ NumPyProjectionFunctorRadix2D::NumPyProjectionFunctorRadix2D template DomainPoint NumPyProjectionFunctorRadix2D::project_point( - const DomainPoint& p, const Domain& launch_domain) const + const Domain& color_domain, const DomainPoint& p, const Domain& launch_domain) const { const Rect<2> launch_rect = launch_domain; const Point<2> point = p; @@ -559,7 +580,7 @@ NumPyProjectionFunctorRadix3D::NumPyProjectionFunctorRadix3D template DomainPoint NumPyProjectionFunctorRadix3D::project_point( - const DomainPoint& p, const Domain& launch_domain) const + const Domain& color_domain, const DomainPoint& p, const Domain& launch_domain) const { const Rect<3> launch_rect = launch_domain; const Point<3> point = p; @@ -662,7 +683,7 @@ static void register_functor(Runtime* runtime, ProjectionID offset, NumPyProject register_functor>( runtime, offset, NUMPY_PROJ_RADIX_3D_Z_4_3); // Flattening - register_functor(runtime, offset, NUMPY_PROJ_ND_1D_C_ORDER); + register_functor(runtime, offset, NUMPY_PROJ_ND_MD_C_ORDER); } } // namespace numpy diff --git a/src/proj.h b/src/proj.h index e5b27eca35..d1d86c5c4f 100644 --- a/src/proj.h +++ b/src/proj.h @@ -35,7 +35,8 @@ class NumPyProjectionFunctor : public Legion::ProjectionFunctor { const Legion::Domain& launch_domain); public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& 
point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const = 0; public: @@ -63,7 +64,7 @@ class NumPyProjectionFunctor : public Legion::ProjectionFunctor { } else { NumPyProjectionFunctor* functor = functors[functor_id]; const Legion::Point local = - functor->project_point(task->index_point, task->index_domain); + functor->project_point(rect, task->index_point, task->index_domain); const Legion::Point lower = local * chunk; const Legion::Point upper = lower + chunk - Legion::Point::ONES(); return rect.intersection(Legion::Rect(lower, upper)); @@ -81,7 +82,8 @@ class NumPyProjectionFunctor_2D_1D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -102,7 +104,8 @@ class NumPyProjectionFunctor_2D_2D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -123,7 +126,8 @@ class NumPyProjectionFunctor_1D_2D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -144,7 +148,8 @@ class NumPyProjectionFunctor_3D_2D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - 
virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -165,7 +170,8 @@ class NumPyProjectionFunctor_3D_1D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -186,7 +192,8 @@ class NumPyProjectionFunctor_3D_3D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -207,7 +214,8 @@ class NumPyProjectionFunctor_2D_3D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -228,7 +236,8 @@ class NumPyProjectionFunctor_1D_3D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -250,7 +259,8 @@ class NumPyProjectionFunctor_GEMV : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual 
Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -268,7 +278,8 @@ class NumPyProjectionFunctorRadix2D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: @@ -286,16 +297,17 @@ class NumPyProjectionFunctorRadix3D : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: const NumPyProjectionCode code; }; -class NumPyProjectionFunctor_ND_1D_C_ORDER : public NumPyProjectionFunctor { +class NumPyProjectionFunctor_ND_MD_C_ORDER : public NumPyProjectionFunctor { public: - NumPyProjectionFunctor_ND_1D_C_ORDER(NumPyProjectionCode code, Legion::Runtime* runtime); + NumPyProjectionFunctor_ND_MD_C_ORDER(NumPyProjectionCode code, Legion::Runtime* runtime); public: virtual bool is_functional(void) const { return true; } @@ -303,7 +315,8 @@ class NumPyProjectionFunctor_ND_1D_C_ORDER : public NumPyProjectionFunctor { virtual unsigned get_depth(void) const { return 0; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + virtual Legion::DomainPoint project_point(const Legion::Domain& color_domain, + const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const; public: diff --git a/src/transform.cc b/src/transform.cc new file mode 100644 index 0000000000..661bc010fb --- /dev/null +++ 
b/src/transform.cc @@ -0,0 +1,31 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "transform.h" + +using namespace Legion; + +namespace legate { +namespace numpy { + +// Instantiate tasks for applying an AffineTransform to arrays of Points, +// for all combinations of dimensionalities. +#define DIMFUNC(M, N) template class TransformTask; +LEGION_FOREACH_NN(DIMFUNC) +#undef DIMFUNC + +} // namespace numpy +} // namespace legate diff --git a/src/transform.cu b/src/transform.cu new file mode 100644 index 0000000000..9ce64acceb --- /dev/null +++ b/src/transform.cu @@ -0,0 +1,32 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "transform.h" + +using namespace Legion; + +namespace legate { +namespace numpy { + +// Instantiate Transform tasks' gpu variants. 
+#define DIMFUNC(M, N) \ + template void PointTask>::gpu_variant( \ + const Task*, const std::vector&, Context, Runtime*); +LEGION_FOREACH_NN(DIMFUNC) +#undef DIMFUNC + +} // namespace numpy +} // namespace legate diff --git a/src/transform.h b/src/transform.h new file mode 100644 index 0000000000..f05b42a944 --- /dev/null +++ b/src/transform.h @@ -0,0 +1,154 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef __NUMPY_TRANSFORM_H__ +#define __NUMPY_TRANSFORM_H__ + +#include "point_task.h" + +namespace legate { +namespace numpy { + +template +struct TransformOperation { + __CUDA_HD__ constexpr Legion::Point operator()( + const Legion::Point& p, const Legion::AffineTransform& transform) const + { + return transform[p]; + } +}; + +#if defined(LEGATE_USE_CUDA) && defined(__CUDACC__) +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + gpu_transform(const Args args, const bool dense) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= args.volume) return; + TransformOperation func; + if (dense) { + args.outptr[idx] = func(args.inptr[idx], args.transform); + } else { + const Legion::Point point = args.pitches.unflatten(idx, args.rect.lo); + args.out[point] = func(args.in[point], args.transform); + } +} +#endif + +template +class TransformTask : public PointTask> { + public: + static const int TASK_ID = NUMPY_TRANSFORM_OFFSET + M * (LEGION_MAX_DIM + 1) + 
N; + + // out_region = op in_region; + static const int REGIONS = 2; + + template + struct DeserializedArgs { + Legion::Rect rect; + AccessorWO, DIM> out; + AccessorRO, DIM> in; + Pitches pitches; + size_t volume; + Legion::AffineTransform transform; + Legion::Point* outptr; + const Legion::Point* inptr; + bool deserialize(LegateDeserializer& derez, + const Legion::Task* task, + const std::vector& regions) + { + rect = NumPyProjectionFunctor::unpack_shape(task, derez); + out = derez.unpack_accessor_WO, DIM>(regions[0], rect); + in = derez.unpack_accessor_RO, DIM>(regions[1], rect); + volume = pitches.flatten(rect); + size_t size; + const void* data = task->futures[0].get_buffer( + Realm::Memory::SYSTEM_MEM, &size, false /*check_extent*/, true /*silence_warnings*/); + transform = LegateDeserializer(data, size).unpack_transform(); +#ifndef LEGION_BOUNDS_CHECKS + // Check to see if this is dense or not + return out.accessor.is_dense_row_major(rect) && in.accessor.is_dense_row_major(rect) && + (outptr = out.ptr(rect)) && (inptr = in.ptr(rect)); +#else + // No dense execution if we're doing bounds checks + return false; +#endif + } + }; + + template + static void dispatch_cpu(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez) + { + DeserializedArgs args; + const bool dense = args.deserialize(derez, task, regions); + if (args.volume == 0) return; + TransformOperation func; + if (dense) { + for (size_t idx = 0; idx < args.volume; ++idx) { + args.outptr[idx] = func(args.inptr[idx], args.transform); + } + } else { + CPULoop::unary_loop(func, args.out, args.in, args.rect, args.transform); + } + } + +#ifdef LEGATE_USE_OPENMP + template + static void dispatch_omp(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez) + { + DeserializedArgs args; + const bool dense = args.deserialize(derez, task, regions); + if (args.volume == 0) return; + TransformOperation func; + if (dense) { +#pragma omp parallel for 
schedule(static) + for (size_t idx = 0; idx < args.volume; ++idx) { + args.outptr[idx] = func(args.inptr[idx], args.transform); + } + } else { + OMPLoop::unary_loop(func, args.out, args.in, args.rect, args.transform); + } + } +#endif // LEGATE_USE_OPENMP + +#if defined(LEGATE_USE_CUDA) && defined(__CUDACC__) + template + static void dispatch_gpu(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez) + { + DeserializedArgs args; + const bool dense = args.deserialize(derez, task, regions); + if (args.volume == 0) return; + const size_t blocks = (args.volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + gpu_transform<<>>(args, dense); + } +#elif defined(LEGATE_USE_CUDA) + template + static void dispatch_gpu(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez); +#endif // LEGATE_USE_CUDA +}; + +} // namespace numpy +} // namespace legate + +#endif // __NUMPY_TRANSFORM_H__ diff --git a/src/zip.cc b/src/zip.cc new file mode 100644 index 0000000000..cbae4302f4 --- /dev/null +++ b/src/zip.cc @@ -0,0 +1,29 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "zip.h" + +namespace legate { +namespace numpy { + +// Instantiate tasks for zipping multiple arrays of coordinates into one array of points, +// up to our maximum array dimensionality. 
+#define DIMFUNC(N) template class ZipTask; +LEGATE_FOREACH_N(DIMFUNC) +#undef DIMFUNC + +} // namespace numpy +} // namespace legate diff --git a/src/zip.cu b/src/zip.cu new file mode 100644 index 0000000000..082d71abed --- /dev/null +++ b/src/zip.cu @@ -0,0 +1,32 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "zip.h" + +using namespace Legion; + +namespace legate { +namespace numpy { + +// Instantiate Zip tasks' gpu variants. +#define DIMFUNC(N) \ + template void PointTask>::gpu_variant( \ + const Task*, const std::vector&, Context, Runtime*); +LEGATE_FOREACH_N(DIMFUNC) +#undef DIMFUNC + +} // namespace numpy +} // namespace legate diff --git a/src/zip.h b/src/zip.h new file mode 100644 index 0000000000..86173a3ac9 --- /dev/null +++ b/src/zip.h @@ -0,0 +1,188 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#ifndef __NUMPY_ZIP_H__ +#define __NUMPY_ZIP_H__ + +#include +#include "point_task.h" + +namespace legate { +namespace numpy { + +template +struct point_ctor { +}; +template <> +struct point_ctor<1> { + inline Legion::Point<1> operator()(coord_t x) const { return Legion::Point<1>(x); } +}; +template <> +struct point_ctor<2> { + inline Legion::Point<2> operator()(coord_t x, coord_t y) const { return Legion::Point<2>(x, y); } +}; +template <> +struct point_ctor<3> { + inline Legion::Point<3> operator()(coord_t x, coord_t y, coord_t z) const + { + return Legion::Point<3>(x, y, z); + } +}; +template <> +struct point_ctor<4> { + inline Legion::Point<4> operator()(coord_t x, coord_t y, coord_t z, coord_t w) const + { + return Legion::Point<4>(x, y, z, w); + } +}; + +#if defined(LEGATE_USE_CUDA) && defined(__CUDACC__) +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + gpu_zip(const Args args, const bool dense, std::index_sequence) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= args.volume) return; + if (dense) { + args.outptr[idx] = Legion::Point(args.inptr[Is][idx]...); + } else { + const Legion::Point point = args.pitches.unflatten(idx, args.rect.lo); + args.out[point] = Legion::Point(args.in[Is][point]...); + } +} +#endif + +template +class ZipTask : public PointTask> { + public: + static const int TASK_ID = NumpyTaskOffset::NUMPY_ZIP_OFFSET + N; + + // out_region = op( in_region1, ..., in_regionN ) + static const int REGIONS = N + 1; + + template + struct DeserializedArgs { + Legion::Rect rect; + AccessorWO, DIM> out; + AccessorRO in[N]; + Pitches pitches; + size_t volume; + Legion::Point* outptr; + const coord_t* inptr[N]; + bool deserialize(LegateDeserializer& derez, + const Legion::Task* task, + const std::vector& regions) + { + rect = NumPyProjectionFunctor::unpack_shape(task, derez); + out = derez.unpack_accessor_WO, DIM>(regions[0], rect); + for (int i = 0; i < N; ++i) { + in[i] = 
derez.unpack_accessor_RO(regions[i + 1], rect); + } + volume = pitches.flatten(rect); +#ifndef LEGION_BOUNDS_CHECKS + // Check to see if this is dense or not + if (!out.accessor.is_dense_row_major(rect)) { return false; } + outptr = out.ptr(rect); + for (int i = 0; i < N; ++i) { + if (!in[i].accessor.is_dense_row_major(rect)) { return false; } + inptr[i] = in[i].ptr(rect); + } + return true; +#else + // No dense execution if we're doing bounds checks + return false; +#endif + } + }; + + template + static void dispatch_cpu_impl(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez, + std::index_sequence) + { + DeserializedArgs args; + const bool dense = args.deserialize(derez, task, regions); + if (args.volume == 0) return; + if (dense) { + for (size_t idx = 0; idx < args.volume; ++idx) { + args.outptr[idx] = Legion::Point(args.inptr[Is][idx]...); + } + } else { + CPULoop::generic_loop(args.rect, point_ctor(), args.out, args.in[Is]...); + } + } + + template + static void dispatch_cpu(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez) + { + dispatch_cpu_impl(task, regions, derez, std::make_index_sequence()); + } + +#ifdef LEGATE_USE_OPENMP + template + static void dispatch_omp_impl(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez, + std::index_sequence) + { + DeserializedArgs args; + const bool dense = args.deserialize(derez, task, regions); + if (args.volume == 0) return; + if (dense) { +#pragma omp parallel for schedule(static) + for (size_t idx = 0; idx < args.volume; ++idx) { + args.outptr[idx] = Legion::Point(args.inptr[Is][idx]...); + } + } else { + OMPLoop::generic_loop(args.rect, point_ctor(), args.out, args.in[Is]...); + } + } + + template + static void dispatch_omp(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez) + { + dispatch_omp_impl(task, regions, derez, std::make_index_sequence()); + } +#endif +#if 
defined(LEGATE_USE_CUDA) && defined(__CUDACC__) + template + static void dispatch_gpu(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez) + { + DeserializedArgs args; + const bool dense = args.deserialize(derez, task, regions); + if (args.volume == 0) return; + const size_t blocks = (args.volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + gpu_zip<<>>(args, dense, std::make_index_sequence()); + } +#elif defined(LEGATE_USE_CUDA) + template + static void dispatch_gpu(const Legion::Task* task, + const std::vector& regions, + LegateDeserializer& derez); +#endif +}; + +} // namespace numpy +} // namespace legate + +#endif // __NUMPY_ZIP_H__ diff --git a/tests/advanced_slicing.py b/tests/advanced_slicing.py new file mode 100644 index 0000000000..f4e5c86b85 --- /dev/null +++ b/tests/advanced_slicing.py @@ -0,0 +1,230 @@ +# Copyright 2021 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np + +import legate.numpy as lg + + +def sequence_2d(): + return lg.array( + [ + [0, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + ] + ) + + +def test(): + + # 1d __getitem__ + + # index arrays can be boolean, of the same shape as the base array + a = lg.arange(10) + assert np.array_equal(a[lg.array([False, True] * 5)], [1, 3, 5, 7, 9]) + assert np.array_equal(a[a > 5], [6, 7, 8, 9]) + + # index arrays can be integer, one per base array dimension + a = lg.arange(10) * 2 + assert np.array_equal(a[lg.array([1, 3, 5, 7, 9])], [2, 6, 10, 14, 18]) + + # output shape follows index array shape + a = lg.arange(10) * 2 + assert np.array_equal( + a[lg.array([[1, 2, 3], [4, 5, 6]])], [[2, 4, 6], [8, 10, 12]] + ) + + # index arrays can be any sequence object + a = lg.arange(10) * 2 + assert np.array_equal(a[[4, 3, 2, 1]], [8, 6, 4, 2]) + + # index arrays are automatically cast to int64 + a = lg.arange(10) + assert np.array_equal(a[lg.arange(5, 10, dtype=np.int16)], [5, 6, 7, 8, 9]) + + # advanced slicing creates copies + a = lg.arange(10) + b = a[range(5)] + b[:] = 0 + assert np.array_equal(a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + + # can read through views + a = np.arange(10) + c = np.arange(9, -1, -1) + assert np.array_equal(a[2:7][c[5:10]], [6, 5, 4, 3, 2]) + + # 1d __setitem__ + + # can write through a single advanced slice + a = lg.arange(10) + b = lg.arange(5) + a[range(5, 10)] = b + assert np.array_equal(a, [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]) + + # can write through views + a = np.arange(10) + b = np.arange(5) # TODO: RHS cannot be a view currently + c = np.arange(9, -1, -1) + a[2:7][c[5:10]] = b + assert np.array_equal(a, [0, 1, 4, 3, 2, 1, 0, 7, 8, 9]) + + a = lg.arange(20) + b = np.arange(10) + a[10:][range(5)] = b[2:7] + assert np.array_equal( + a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19] + ) + + # can copy within the same array + # TODO: Fix #40 + # a = lg.arange(20) 
+ # a[10:][range(5)] = a[2:7] + # assert np.array_equal( + # a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19] + # ) + + # source & destination regions can (partially) overlap + # TODO: Fix #40 + # a = lg.arange(20) + # a[10:][range(5)] = a[12:17] + # assert np.array_equal( + # a, + # [0,1,2,3,4,5,6,7,8,9,12,13,14,15,16,15,16,17,18,19], + # ) + + # 2d __getitem__ + + # index arrays can be boolean, of the same shape as the base array + a = sequence_2d() + # TODO: Have to sort the output array, until #50 is fixed + assert np.array_equal(np.sort(a[a > 17]), [18, 19, 20, 21, 22, 23, 24]) + + # index arrays can be integer, one per base array dimension + a = sequence_2d() + assert np.array_equal( + a[lg.array([0, 1, 2, 3]), lg.array([0, 1, 2, 3])], [0, 6, 12, 18] + ) + + # output shape follows index array shape + a = sequence_2d() + assert np.array_equal( + a[lg.array([[1, 1, 1], [2, 2, 2]]), lg.array([[0, 1, 2], [0, 1, 2]])], + [[5, 6, 7], [10, 11, 12]], + ) + + # index arrays can be any sequence object + a = sequence_2d() + assert np.array_equal(a[range(4), range(4)], [0, 6, 12, 18]) + + # index arrays are automatically cast to int64 + a = sequence_2d() + assert np.array_equal(a[range(4), range(4)], [0, 6, 12, 18]) + assert np.array_equal( + a[ + lg.array([0, 1, 2, 3], dtype=np.int16), + lg.array([0, 1, 2, 3], dtype=np.uint32), + ], + [0, 6, 12, 18], + ) + + # advanced slicing creates copies + a = sequence_2d() + b = a[a > 17] + b[:] = -1 + assert a.min() == 0 and a.max() == 24 + + # can read through views + a = sequence_2d() + cx = lg.array([0, 0, 1, 1, 2, 2]) + cy = lg.array([1, 2, 1, 2, 1, 2]) + assert np.array_equal(a[2:, 2:][cx[2:], cy[2:]], [18, 19, 23, 24]) + + # 2d __setitem__ + + # can write through a single advanced slice + a = sequence_2d() + b = lg.zeros(7, dtype=np.int64) + a[a > 17] = b + assert np.array_equal( + a, + [ + [0, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 0, 0], + [0, 0, 0, 0, 0], + ], + ) + + # 
can write through views + a = sequence_2d() + b = lg.array([100, 101, 102, 103]) # TODO: RHS cannot be a view currently + cx = lg.array([0, 0, 1, 1, 2, 2]) + cy = lg.array([1, 2, 1, 2, 1, 2]) + a[2:, 2:][cx[2:], cy[2:]] = b + assert np.array_equal( + a, + [ + [0, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 100, 101], + [20, 21, 22, 102, 103], + ], + ) + + # can copy within the same array + # TODO: Fix #40 + # a = sequence_2d() + # a[2:, :][[[1, 1], [2, 2]], [[1, 2], [1, 2]]] = a[1:3, 3:5] + # assert np.array_equal( + # a, + # [ + # [0, 1, 2, 3, 4], + # [5, 6, 7, 8, 9], + # [10, 11, 12, 13, 14], + # [15, 8, 9, 18, 19], + # [20, 13, 14, 23, 24], + # ], + # ) + + # source & destination regions can (partially) overlap + # TODO: Fix #40 + # a = sequence_2d() + # a[2:, :][[[1, 1], [2, 2]], [[1, 2], [1, 2]]] = a[3:5, 2:4] + # assert np.array_equal( + # a, + # [ + # [0, 1, 2, 3, 4], + # [5, 6, 7, 8, 9], + # [10, 11, 12, 13, 14], + # [15, 17, 18, 18, 19], + # [20, 22, 23, 23, 24], + # ], + # ) + + # TODO: broadcasting in index arrays + # TODO: mixed advanced indexing + # TODO: singleton arrays + # TODO: views as base, index or value array (incl. 
reshape) + + return + + +if __name__ == "__main__": + test() diff --git a/tests/slicing.py b/tests/slicing.py index 7b5f47d77c..71d7ddaace 100644 --- a/tests/slicing.py +++ b/tests/slicing.py @@ -79,10 +79,57 @@ def test(): # print(thirdslicegold) # assert(lg.array_equal(thirdslice, thirdslicegold)) + # can copy within the same array x = lg.arange(10) x[0:5] = x[5:10] assert np.array_equal(x, [5, 6, 7, 8, 9, 5, 6, 7, 8, 9]) + a = lg.arange(25).reshape((5, 5)) + a[3:5:, 1:3] = a[1:3, 3:5] + assert np.array_equal( + a, + [ + [0, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 8, 9, 18, 19], + [20, 13, 14, 23, 24], + ], + ) + + # source & destination regions can (partially) overlap + # TODO: Fix #40 + # a = lg.arange(25).reshape((5, 5)) + # a[3:5:, 1:3] = a[3:5, 2:4] + # assert np.array_equal( + # a, + # [ + # [0, 1, 2, 3, 4], + # [5, 6, 7, 8, 9], + # [10, 11, 12, 13, 14], + # [15, 17, 18, 18, 19], + # [20, 22, 23, 23, 24], + # ], + # ) + + # corner case of singleton base (backed by Futures instead of Regions) + a = lg.array([7]) + assert a[0] == 7 + assert np.array_equal(a[:], [7]) + assert np.array_equal(a[1:], []) + + a = lg.array([7]) + a[0] = 4 + assert np.array_equal(a, [4]) + + a = lg.array([7]) + a[:] = [4] + assert np.array_equal(a, [4]) + + a = lg.array([7]) + a[1:] = 4 + assert np.array_equal(a, [7]) + return