diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 6603a07ce19c..bb31b12d6922 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -116,15 +116,14 @@ def _copy_array(x, copy_flag=False, dtype=None, order="C"):
 
         exec_q = x_copy.sycl_queue
         _manager = dpu.SequentialOrderManager[exec_q]
-        dep_evs = _manager.submitted_events
 
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=dpnp.get_usm_ndarray(x),
             dst=x_copy.get_array(),
             sycl_queue=exec_q,
-            depends=dep_evs,
+            depends=_manager.submitted_events,
         )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        _manager.add_event_pair(ht_ev, copy_ev)
         return x_copy
     return x
 
@@ -356,14 +355,14 @@ def _gemm_batch_matmul(exec_q, x1, x2, res):
             x2_usm = dpnp.get_usm_ndarray(x2[i : i + chunk, ...])
             res_usm = dpnp.get_usm_ndarray(res[i : i + chunk, ...])
 
-            ht_blas_ev, blas_ev, row_major = bi._gemm_batch(
+            ht_ev, blas_ev, row_major = bi._gemm_batch(
                 exec_q,
                 x1_usm,
                 x2_usm,
                 res_usm,
                 depends=_manager.submitted_events,
             )
-            _manager.add_event_pair(ht_blas_ev, blas_ev)
+            _manager.add_event_pair(ht_ev, blas_ev)
 
     res_shape = res.shape
     _, res_is_c_contig, res_is_f_contig = _define_contig_flag(res)
@@ -388,14 +387,15 @@ def _gemm_batch_matmul(exec_q, x1, x2, res):
 
 def _gemm_matmul(exec_q, x1, x2, res):
     _manager = dpu.SequentialOrderManager[exec_q]
-    ht_gemm_ev, gemm_ev, row_major = bi._gemm(
+
+    ht_ev, gemm_ev, row_major = bi._gemm(
         exec_q,
         dpnp.get_usm_ndarray(x1),
         dpnp.get_usm_ndarray(x2),
         dpnp.get_usm_ndarray(res),
         depends=_manager.submitted_events,
     )
-    _manager.add_event_pair(ht_gemm_ev, gemm_ev)
+    _manager.add_event_pair(ht_ev, gemm_ev)
 
     if row_major:
         if res.flags.f_contiguous is True:
@@ -635,14 +635,14 @@ def dpnp_dot(a, b, /, out=None, *, conjugate=False):
         else:
             dot_func = "_dot"
 
-        ht_dot_ev, dot_ev = getattr(bi, dot_func)(
+        ht_ev, dot_ev = getattr(bi, dot_func)(
             exec_q,
             dpnp.get_usm_ndarray(a),
             dpnp.get_usm_ndarray(b),
             dpnp.get_usm_ndarray(result),
             depends=_manager.submitted_events,
         )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
+        _manager.add_event_pair(ht_ev, dot_ev)
     else:
         # oneapi::mkl::blas::dot is slow for integer data type,
         # so using dpctl.tensor.vecdot instead
@@ -866,7 +866,8 @@ def dpnp_matmul(
             x_usm = dpnp.get_usm_ndarray(x2)
 
         _manager = dpu.SequentialOrderManager[exec_q]
-        ht_gemv_ev, gemv_ev = bi._gemv(
+
+        ht_ev, gemv_ev = bi._gemv(
             exec_q,
             a_usm,
             x_usm,
@@ -874,7 +875,7 @@ def dpnp_matmul(
             transpose,
             depends=_manager.submitted_events,
         )
-        _manager.add_event_pair(ht_gemv_ev, gemv_ev)
+        _manager.add_event_pair(ht_ev, gemv_ev)
     elif call_flag == "gemm":
         result = _gemm_matmul(
             exec_q,
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index d71d73f5b472..10e93db2e1a6 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -38,8 +38,8 @@
 # pylint: disable=protected-access
 # pylint: disable=useless-import-alias
 
-import dpctl
 import dpctl.tensor._tensor_impl as ti
+import dpctl.utils as dpu
 import numpy
 from numpy import prod
 from numpy.core.numeric import normalize_axis_index
@@ -128,11 +128,12 @@ def _batched_eigh(a, UPLO, eigen_mode, w_type, v_type):
     a_sycl_queue = a.sycl_queue
     a_order = "C" if a.flags.c_contiguous else "F"
 
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
     # need to loop over the 1st dimension to get eigenvalues and
     # eigenvectors of 3d matrix A
     batch_size = a.shape[0]
     eig_vecs = [None] * batch_size
-    ht_list_ev = [None] * batch_size * 2
     for i in range(batch_size):
         # oneMKL LAPACK assumes fortran-like array as input, so
         # allocate a memory with 'F' order for dpnp array of eigenvectors
@@ -140,11 +141,13 @@ def _batched_eigh(a, UPLO, eigen_mode, w_type, v_type):
 
         # use DPCTL tensor function to fill the array of eigenvectors with
         # content of input array
-        ht_list_ev[2 * i], copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=a_usm_arr[i],
             dst=eig_vecs[i].get_array(),
             sycl_queue=a_sycl_queue,
+            depends=_manager.submitted_events,
         )
+        _manager.add_event_pair(ht_ev, copy_ev)
 
         # TODO: Remove this w/a when MKLD-17201 is solved.
         # Waiting for a host task executing an OneMKL LAPACK syevd/heevd call
@@ -153,11 +156,11 @@ def _batched_eigh(a, UPLO, eigen_mode, w_type, v_type):
         # We need to wait for each host tasks before calling _seyvd and _heevd
         # to avoid deadlock.
         if is_cpu_device:
-            ht_list_ev[2 * i].wait()
+            dpnp.synchronize_array_data(a)
 
         # call LAPACK extension function to get eigenvalues and
         # eigenvectors of a portion of matrix A
-        ht_list_ev[2 * i + 1], _ = getattr(li, lapack_func)(
+        ht_ev, lapack_ev = getattr(li, lapack_func)(
             a_sycl_queue,
             jobz,
             uplo,
@@ -165,8 +168,7 @@ def _batched_eigh(a, UPLO, eigen_mode, w_type, v_type):
             w[i].get_array(),
             depends=[copy_ev],
         )
-
-    dpctl.SyclEvent.wait_for(ht_list_ev)
+        _manager.add_event_pair(ht_ev, lapack_ev)
 
     w = w.reshape(w_orig_shape)
 
@@ -208,18 +210,24 @@ def _batched_inv(a, res_type):
     )
     dev_info = [0] * batch_size
 
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
     # use DPCTL tensor function to fill the matrix array
     # with content from the input array `a`
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_h.get_array(), sycl_queue=a.sycl_queue
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=a.sycl_queue,
+        depends=_manager.submitted_events,
     )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     ipiv_stride = n
     a_stride = a_h.strides[0]
 
     # Call the LAPACK extension function _getrf_batch
     # to perform LU decomposition of a batch of general matrices
-    ht_getrf_ev, getrf_ev = li._getrf_batch(
+    ht_ev, getrf_ev = li._getrf_batch(
         a_sycl_queue,
         a_h.get_array(),
         ipiv_h.get_array(),
@@ -228,15 +236,16 @@ def _batched_inv(a, res_type):
         a_stride,
         ipiv_stride,
         batch_size,
-        [a_copy_ev],
+        depends=[copy_ev],
     )
+    _manager.add_event_pair(ht_ev, getrf_ev)
 
     _check_lapack_dev_info(dev_info)
 
     # Call the LAPACK extension function _getri_batch
     # to compute the inverse of a batch of matrices using the results
    # from the LU decomposition performed by _getrf_batch
-    ht_getri_ev, _ = li._getri_batch(
+    ht_ev, getri_ev = li._getri_batch(
         a_sycl_queue,
         a_h.get_array(),
         ipiv_h.get_array(),
@@ -245,15 +254,12 @@ def _batched_inv(a, res_type):
         a_stride,
         ipiv_stride,
         batch_size,
-        [getrf_ev],
+        depends=[getrf_ev],
     )
+    _manager.add_event_pair(ht_ev, getri_ev)
 
     _check_lapack_dev_info(dev_info)
 
-    ht_getri_ev.wait()
-    ht_getrf_ev.wait()
-    a_ht_copy_ev.wait()
-
     return a_h.reshape(orig_shape)
 
@@ -290,13 +296,12 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
         b_usm_arr = dpnp.get_usm_ndarray(b)
         reshape = True
 
-    batch_size = a.shape[0]
+    _manager = dpu.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
 
+    batch_size = a.shape[0]
     coeff_vecs = [None] * batch_size
     val_vecs = [None] * batch_size
-    a_ht_copy_ev = [None] * batch_size
-    b_ht_copy_ev = [None] * batch_size
-    ht_lapack_ev = [None] * batch_size
 
     for i in range(batch_size):
         # oneMKL LAPACK assumes fortran-like array as input, so allocate
@@ -307,11 +312,13 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
 
         # use DPCTL tensor function to fill the coefficient matrix array
         # with content from the input array
-        a_ht_copy_ev[i], a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        ht_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=a_usm_arr[i],
             dst=coeff_vecs[i].get_array(),
             sycl_queue=a.sycl_queue,
+            depends=dep_evs,
         )
+        _manager.add_event_pair(ht_ev, a_copy_ev)
 
         # oneMKL LAPACK assumes fortran-like array as input, so
         # allocate a memory with 'F' order for dpnp array of multiple
@@ -322,21 +329,24 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
 
         # use DPCTL tensor function to fill the array of multiple dependent
         # variables with content from the input arrays
-        b_ht_copy_ev[i], b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        ht_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=b_usm_arr[i],
             dst=val_vecs[i].get_array(),
             sycl_queue=b.sycl_queue,
+            depends=dep_evs,
         )
+        _manager.add_event_pair(ht_ev, b_copy_ev)
 
         # Call the LAPACK extension function _gesv to solve the system of
         # linear equations using a portion of the coefficient square matrix
         # and a corresponding portion of the dependent variables array.
-        ht_lapack_ev[i], _ = li._gesv(
+        ht_ev, gesv_ev = li._gesv(
             exec_q,
             coeff_vecs[i].get_array(),
             val_vecs[i].get_array(),
             depends=[a_copy_ev, b_copy_ev],
         )
+        _manager.add_event_pair(ht_ev, gesv_ev)
 
         # TODO: Remove this w/a when MKLD-17201 is solved.
         # Waiting for a host task executing an OneMKL LAPACK gesv call
@@ -345,13 +355,7 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
         # We need to wait for each host tasks before calling _gesv to avoid
         # deadlock.
         if is_cpu_device:
-            ht_lapack_ev[i].wait()
-            b_ht_copy_ev[i].wait()
-
-    for i in range(batch_size):
-        ht_lapack_ev[i].wait()
-        b_ht_copy_ev[i].wait()
-        a_ht_copy_ev[i].wait()
+            dpnp.synchronize_array_data(a)
 
     # combine the list of solutions into a single array
     out_v = dpnp.array(
@@ -393,11 +397,17 @@ def _batched_qr(a, mode="reduced"):
 
     a_t = dpnp.empty_like(a, order="C", dtype=res_type)
 
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
     # use DPCTL tensor function to fill the matrix array
     # with content from the input array `a`
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_t.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
     )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     tau_h = dpnp.empty_like(
         a_t,
@@ -410,7 +420,7 @@ def _batched_qr(a, mode="reduced"):
 
     # Call the LAPACK extension function _geqrf_batch to compute
     # the QR factorization of a general m x n matrix.
-    ht_geqrf_batch_ev, geqrf_batch_ev = li._geqrf_batch(
+    ht_ev, geqrf_ev = li._geqrf_batch(
         a_sycl_queue,
         a_t.get_array(),
         tau_h.get_array(),
@@ -419,20 +429,18 @@ def _batched_qr(a, mode="reduced"):
         a_stride,
         tau_stride,
         batch_size,
-        [a_copy_ev],
+        depends=[copy_ev],
     )
-
-    ht_list_ev = [ht_geqrf_batch_ev, a_ht_copy_ev]
+    _manager.add_event_pair(ht_ev, geqrf_ev)
 
     if mode in ["r", "raw"]:
         if mode == "r":
             r = a_t[..., :k].swapaxes(-2, -1)
-            r = _triu_inplace(r, ht_list_ev, [geqrf_batch_ev])
-            dpctl.SyclEvent.wait_for(ht_list_ev)
+            r = _triu_inplace(r)
+
             return r.reshape(batch_shape + r.shape[-2:])
 
         # mode=="raw"
-        dpctl.SyclEvent.wait_for(ht_list_ev)
         q = a_t.reshape(batch_shape + a_t.shape[-2:])
         r = tau_h.reshape(batch_shape + tau_h.shape[-1:])
         return (q, r)
@@ -454,14 +462,13 @@ def _batched_qr(a, mode="reduced"):
 
     # use DPCTL tensor function to fill the matrix array `q[..., :n, :]`
     # with content from the array `a_t` overwritten by geqrf_batch
-    a_t_ht_copy_ev, a_t_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
         src=a_t.get_array(),
         dst=q[..., :n, :].get_array(),
         sycl_queue=a_sycl_queue,
-        depends=[geqrf_batch_ev],
+        depends=[geqrf_ev],
     )
-
-    ht_list_ev.append(a_t_ht_copy_ev)
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     q_stride = q.strides[0]
     tau_stride = tau_h.strides[0]
@@ -477,7 +484,7 @@ def _batched_qr(a, mode="reduced"):
     # Call the LAPACK extension function _orgqr_batch/ to generate the real
     # orthogonal/complex unitary matrices `Qi` of the QR factorization
     # for a batch of general matrices.
-    ht_lapack_ev, lapack_ev = getattr(li, lapack_func)(
+    ht_ev, lapack_ev = getattr(li, lapack_func)(
         a_sycl_queue,
         q.get_array(),
         tau_h.get_array(),
@@ -487,18 +494,14 @@ def _batched_qr(a, mode="reduced"):
         q_stride,
         tau_stride,
         batch_size,
-        [a_t_copy_ev],
+        depends=[copy_ev],
     )
-
-    ht_list_ev.append(ht_lapack_ev)
+    _manager.add_event_pair(ht_ev, lapack_ev)
 
     q = q[..., :mc, :].swapaxes(-2, -1)
     r = a_t[..., :mc].swapaxes(-2, -1)
 
-    ht_list_ev.append(ht_lapack_ev)
-
-    r = _triu_inplace(r, ht_list_ev, [lapack_ev])
-    dpctl.SyclEvent.wait_for(ht_list_ev)
+    r = _triu_inplace(r)
 
     return (
         q.reshape(batch_shape + q.shape[-2:]),
@@ -569,22 +572,15 @@ def _batched_svd(
     u_matrices = [None] * batch_size
     s_matrices = [None] * batch_size
     vt_matrices = [None] * batch_size
-    ht_list_ev = [None] * batch_size * 2
     for i in range(batch_size):
         if compute_uv:
             (
                 u_matrices[i],
                 s_matrices[i],
                 vt_matrices[i],
-                ht_list_ev[2 * i],
-                ht_list_ev[2 * i + 1],
-            ) = dpnp_svd(a[i], full_matrices, compute_uv=True, batch_call=True)
+            ) = dpnp_svd(a[i], full_matrices, compute_uv=True)
         else:
-            s_matrices[i], ht_list_ev[2 * i], ht_list_ev[2 * i + 1] = dpnp_svd(
-                a[i], full_matrices, compute_uv=False, batch_call=True
-            )
-
-    dpctl.SyclEvent.wait_for(ht_list_ev)
+            s_matrices[i] = dpnp_svd(a[i], full_matrices, compute_uv=False)
 
     # TODO: Need to return C-contiguous array to match the output of
     # numpy.linalg.svd
@@ -827,6 +823,8 @@ def _lu_factor(a, res_type):
     # On GPU call getrf for each two-dimensional array by loop
     use_batch = a.sycl_device.has_aspect_cpu
 
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
     if a.ndim > 2:
         orig_shape = a.shape
         # get 3d input arrays by reshape
@@ -846,16 +844,20 @@ def _lu_factor(a, res_type):
             )
             dev_info_h = [0] * batch_size
 
-            a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=a_usm_arr, dst=a_h.get_array(), sycl_queue=a_sycl_queue
+            ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=a_usm_arr,
+                dst=a_h.get_array(),
+                sycl_queue=a_sycl_queue,
+                depends=_manager.submitted_events,
             )
+            _manager.add_event_pair(ht_ev, copy_ev)
 
             ipiv_stride = n
             a_stride = a_h.strides[0]
 
             # Call the LAPACK extension function _getrf_batch
             # to perform LU decomposition of a batch of general matrices
-            ht_lapack_ev, _ = li._getrf_batch(
+            ht_ev, getrf_ev = li._getrf_batch(
                 a_sycl_queue,
                 a_h.get_array(),
                 ipiv_h.get_array(),
@@ -864,11 +866,9 @@ def _lu_factor(a, res_type):
                 a_stride,
                 ipiv_stride,
                 batch_size,
-                [a_copy_ev],
+                depends=[copy_ev],
             )
-
-            ht_lapack_ev.wait()
-            a_ht_copy_ev.wait()
+            _manager.add_event_pair(ht_ev, getrf_ev)
 
             dev_info_array = dpnp.array(
                 dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
@@ -885,22 +885,23 @@ def _lu_factor(a, res_type):
             a_vecs = [None] * batch_size
             ipiv_vecs = [None] * batch_size
             dev_info_vecs = [None] * batch_size
-            a_ht_copy_ev = [None] * batch_size
-            ht_lapack_ev = [None] * batch_size
+
+            dep_evs = _manager.submitted_events
 
             # Process each batch
             for i in range(batch_size):
                 # Copy each 2D slice to a new array because getrf will destroy
                 # the input matrix
                 a_vecs[i] = dpnp.empty_like(a[i], order="C", dtype=res_type)
-                (
-                    a_ht_copy_ev[i],
-                    a_copy_ev,
-                ) = ti._copy_usm_ndarray_into_usm_ndarray(
+
+                ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
                     src=a_usm_arr[i],
                     dst=a_vecs[i].get_array(),
                     sycl_queue=a_sycl_queue,
+                    depends=dep_evs,
                 )
+                _manager.add_event_pair(ht_ev, copy_ev)
+
                 ipiv_vecs[i] = dpnp.empty(
                     (n,),
                     dtype=dpnp.int64,
@@ -912,17 +913,14 @@ def _lu_factor(a, res_type):
 
                 # Call the LAPACK extension function _getrf
                 # to perform LU decomposition on each batch in 'a_vecs[i]'
-                ht_lapack_ev[i], _ = li._getrf(
+                ht_ev, getrf_ev = li._getrf(
                     a_sycl_queue,
                     a_vecs[i].get_array(),
                     ipiv_vecs[i].get_array(),
                     dev_info_vecs[i],
-                    [a_copy_ev],
+                    depends=[copy_ev],
                 )
-
-            for i in range(batch_size):
-                ht_lapack_ev[i].wait()
-                a_ht_copy_ev[i].wait()
+                _manager.add_event_pair(ht_ev, getrf_ev)
 
             # Reshape the results back to their original shape
             out_a = dpnp.array(a_vecs, order="C").reshape(orig_shape)
@@ -940,9 +938,13 @@ def _lu_factor(a, res_type):
 
     # use DPCTL tensor function to fill the сopy of the input array
    # from the input array
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_h.get_array(), sycl_queue=a_sycl_queue
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
    )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     ipiv_h = dpnp.empty(
         n,
@@ -955,16 +957,14 @@ def _lu_factor(a, res_type):
 
     # Call the LAPACK extension function _getrf
     # to perform LU decomposition on the input matrix
-    ht_lapack_ev, _ = li._getrf(
+    ht_ev, getrf_ev = li._getrf(
         a_sycl_queue,
         a_h.get_array(),
         ipiv_h.get_array(),
         dev_info_h,
-        [a_copy_ev],
+        depends=[copy_ev],
     )
-
-    ht_lapack_ev.wait()
-    a_ht_copy_ev.wait()
+    _manager.add_event_pair(ht_ev, getrf_ev)
 
     dev_info_array = dpnp.array(
         dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
@@ -1308,10 +1308,8 @@ def _stacked_identity_like(x):
     return x
 
 
-def _triu_inplace(a, host_tasks, depends=None):
+def _triu_inplace(a):
     """
-    _triu_inplace(a, host_tasks, depends=None)
-
     Computes the upper triangular part of an array in-place,
     but currently allocates extra memory for the result.
 
@@ -1319,14 +1317,6 @@
     ----------
     a : {dpnp.ndarray, usm_ndarray}
         Input array from which the upper triangular part is to be extracted.
-    host_tasks : list
-        A list to which the function appends the host event corresponding to
-        the computation. This allows for dependency management and
-        synchronization with other tasks.
-    depends : list, optional
-        A list of events that the triangular operation depends on.
-        These tasks are completed before the triangular computation starts.
-        If ``None``, defaults to an empty list.
 
     Returns
     -------
@@ -1337,17 +1327,17 @@
     a : dpnp.ndarray
         A new array containing the upper triangular part of the input array.
     """
 
     # TODO: implement a dedicated kernel for in-place triu instead of
     # extra memory allocation for result
-    if depends is None:
-        depends = []
     out = dpnp.empty_like(a, order="C")
-    ht_triu_ev, _ = ti._triu(
+
+    _manager = dpu.SequentialOrderManager[a.sycl_queue]
+    ht_ev, triu_ev = ti._triu(
         src=a.get_array(),
         dst=out.get_array(),
         k=0,
         sycl_queue=a.sycl_queue,
-        depends=depends,
+        depends=_manager.submitted_events,
     )
-    host_tasks.append(ht_triu_ev)
+    _manager.add_event_pair(ht_ev, triu_ev)
     return out
 
@@ -1750,29 +1740,33 @@ def dpnp_cholesky_batch(a, upper_lower, res_type):
     # `a` must be copied because potrf_batch destroys the input matrix
     a_h = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type)
 
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
     # use DPCTL tensor function to fill the сopy of the input array
     # from the input array
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_h.get_array(), sycl_queue=a_sycl_queue
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
     )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     a_stride = a_h.strides[0]
 
     # Call the LAPACK extension function _potrf_batch
     # to computes the Cholesky decomposition of a batch of
     # symmetric positive-definite matrices
-    ht_lapack_ev, _ = li._potrf_batch(
+    ht_ev, potrf_ev = li._potrf_batch(
         a_sycl_queue,
         a_h.get_array(),
         upper_lower,
         n,
         a_stride,
         batch_size,
-        [a_copy_ev],
+        depends=[copy_ev],
     )
-
-    ht_lapack_ev.wait()
-    a_ht_copy_ev.wait()
+    _manager.add_event_pair(ht_ev, potrf_ev)
 
     # Get upper or lower-triangular matrix part as per `upper_lower` value
     # upper_lower is 0 (lower) or 1 (upper)
@@ -1828,23 +1822,27 @@ def dpnp_cholesky(a, upper):
     # `a` must be copied because potrf destroys the input matrix
     a_h = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type)
 
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
     # use DPCTL tensor function to fill the сopy of the input array
     # from the input array
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_h.get_array(), sycl_queue=a_sycl_queue
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
    )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     # Call the LAPACK extension function _potrf
     # to computes the Cholesky decomposition
-    ht_lapack_ev, _ = li._potrf(
+    ht_ev, potrf_ev = li._potrf(
         a_sycl_queue,
         a_h.get_array(),
         upper_lower,
-        [a_copy_ev],
+        depends=[copy_ev],
     )
-
-    ht_lapack_ev.wait()
-    a_ht_copy_ev.wait()
+    _manager.add_event_pair(ht_ev, potrf_ev)
 
     # Get upper or lower-triangular matrix part as per `upper` value
     if upper:
@@ -1974,8 +1972,8 @@ def dpnp_eigh(a, UPLO, eigen_mode="V"):
     a_order = "C" if a.flags.c_contiguous else "F"
     a_usm_arr = dpnp.get_usm_ndarray(a)
-    ht_list_ev = []
-    copy_ev = dpctl.SyclEvent()
+
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
 
     # When `eigen_mode == "N"` (jobz == 0), OneMKL LAPACK does not
     # overwrite the input array.
@@ -1991,10 +1989,13 @@ def dpnp_eigh(a, UPLO, eigen_mode="V"):
 
         # use DPCTL tensor function to fill the array of eigenvectors with
         # content of input array
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr, dst=v.get_array(), sycl_queue=a_sycl_queue
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_usm_arr,
+            dst=v.get_array(),
+            sycl_queue=a_sycl_queue,
+            depends=_manager.submitted_events,
         )
-        ht_list_ev.append(ht_copy_ev)
+        _manager.add_event_pair(ht_ev, copy_ev)
 
     # allocate a memory for dpnp array of eigenvalues
     w = dpnp.empty_like(
@@ -2005,30 +2006,30 @@ def dpnp_eigh(a, UPLO, eigen_mode="V"):
 
     # call LAPACK extension function to get eigenvalues and eigenvectors of
     # matrix A
-    ht_lapack_ev, lapack_ev = getattr(li, lapack_func)(
+    ht_ev, lapack_ev = getattr(li, lapack_func)(
         a_sycl_queue,
         jobz,
         uplo,
         v.get_array(),
         w.get_array(),
-        depends=[copy_ev],
+        depends=_manager.submitted_events,
     )
-    ht_list_ev.append(ht_lapack_ev)
+    _manager.add_event_pair(ht_ev, lapack_ev)
 
     if eigen_mode == "V" and a_order != "F":
         # need to align order of eigenvectors with one of input matrix A
         out_v = dpnp.empty_like(v, order=a_order)
-        ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=v.get_array(),
             dst=out_v.get_array(),
             sycl_queue=a_sycl_queue,
             depends=[lapack_ev],
         )
-        ht_list_ev.append(ht_copy_out_ev)
+        _manager.add_event_pair(ht_ev, copy_ev)
     else:
         out_v = v
 
-    dpctl.SyclEvent.wait_for(ht_list_ev)
     return (w, out_v) if eigen_mode == "V" else w
 
@@ -2063,11 +2064,17 @@ def dpnp_inv(a):
     # This transposition is effective because the input array `a` is square.
     a_f = dpnp.empty_like(a, order=a_order, dtype=res_type)
 
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
     # use DPCTL tensor function to fill the coefficient matrix array
     # with content from the input array `a`
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_f.get_array(), sycl_queue=a_sycl_queue
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_f.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
    )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     b_f = dpnp.eye(
         a_shape[0],
@@ -2078,16 +2085,14 @@ def dpnp_inv(a):
     )
 
     if a_order == "F":
-        ht_lapack_ev, _ = li._gesv(
-            a_sycl_queue, a_f.get_array(), b_f.get_array(), [a_copy_ev]
-        )
+        usm_a_f = a_f.get_array()
+        usm_b_f = b_f.get_array()
     else:
-        ht_lapack_ev, _ = li._gesv(
-            a_sycl_queue, a_f.T.get_array(), b_f.T.get_array(), [a_copy_ev]
-        )
+        usm_a_f = a_f.T.get_array()
+        usm_b_f = b_f.T.get_array()
 
-    ht_lapack_ev.wait()
-    a_ht_copy_ev.wait()
+    ht_ev, gesv_ev = li._gesv(a_sycl_queue, usm_a_f, usm_b_f, depends=[copy_ev])
+    _manager.add_event_pair(ht_ev, gesv_ev)
 
     return b_f
 
@@ -2359,11 +2364,17 @@ def dpnp_qr(a, mode="reduced"):
     a_usm_arr = dpnp.get_usm_ndarray(a)
     a_t = dpnp.empty_like(a, order="C", dtype=res_type)
 
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
     # use DPCTL tensor function to fill the matrix array
     # with content from the input array `a`
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_t.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
    )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     tau_h = dpnp.empty_like(
         a,
@@ -2373,21 +2384,18 @@ def dpnp_qr(a, mode="reduced"):
 
     # Call the LAPACK extension function _geqrf to compute the QR factorization
     # of a general m x n matrix.
-    ht_geqrf_ev, geqrf_ev = li._geqrf(
-        a_sycl_queue, a_t.get_array(), tau_h.get_array(), [a_copy_ev]
+    ht_ev, geqrf_ev = li._geqrf(
+        a_sycl_queue, a_t.get_array(), tau_h.get_array(), depends=[copy_ev]
     )
-
-    ht_list_ev = [ht_geqrf_ev, a_ht_copy_ev]
+    _manager.add_event_pair(ht_ev, geqrf_ev)
 
     if mode in ["r", "raw"]:
         if mode == "r":
             r = a_t[:, :k].transpose()
-            r = _triu_inplace(r, ht_list_ev, [geqrf_ev])
-            dpctl.SyclEvent.wait_for(ht_list_ev)
+            r = _triu_inplace(r)
             return r
 
         # mode == "raw":
-        dpctl.SyclEvent.wait_for(ht_list_ev)
         return (a_t, tau_h)
 
     # mc is the total number of columns in the q matrix.
@@ -2410,14 +2418,13 @@ def dpnp_qr(a, mode="reduced"):
 
     # use DPCTL tensor function to fill the matrix array `q[:n]`
     # with content from the array `a_t` overwritten by geqrf
-    a_t_ht_copy_ev, a_t_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
         src=a_t.get_array(),
         dst=q[:n].get_array(),
         sycl_queue=a_sycl_queue,
         depends=[geqrf_ev],
     )
-
-    ht_list_ev.append(a_t_ht_copy_ev)
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     # Get LAPACK function (_orgqr for real or _ungqf for complex data types)
     # for QR factorization
@@ -2429,18 +2436,21 @@ def dpnp_qr(a, mode="reduced"):
 
     # Call the LAPACK extension function _orgqr/_ungqf to generate the real
     # orthogonal/complex unitary matrix `Q` of the QR factorization
-    ht_lapack_ev, lapack_ev = getattr(li, lapack_func)(
-        a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [a_t_copy_ev]
+    ht_ev, lapack_ev = getattr(li, lapack_func)(
+        a_sycl_queue,
+        m,
+        mc,
+        k,
+        q.get_array(),
+        tau_h.get_array(),
+        depends=[copy_ev],
     )
+    _manager.add_event_pair(ht_ev, lapack_ev)
 
     q = q[:mc].transpose()
     r = a_t[:, :mc].transpose()
 
-    ht_list_ev.append(ht_lapack_ev)
-
-    r = _triu_inplace(r, ht_list_ev, [lapack_ev])
-    dpctl.SyclEvent.wait_for(ht_list_ev)
-
+    r = _triu_inplace(r)
     return (q, r)
 
@@ -2476,11 +2486,18 @@ def dpnp_solve(a, b):
     # oneMKL LAPACK getrf overwrites `a`.
     a_h = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=res_usm_type)
 
+    _manager = dpu.SequentialOrderManager[exec_q]
+    dev_evs = _manager.submitted_events
+
     # use DPCTL tensor function to fill the сopy of the input array
     # from the input array
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_h.get_array(), sycl_queue=a.sycl_queue
+    ht_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=a.sycl_queue,
+        depends=dev_evs,
     )
+    _manager.add_event_pair(ht_ev, a_copy_ev)
 
     # oneMKL LAPACK getrs overwrites `b` and assumes fortran-like array as
     # input.
@@ -2490,9 +2507,13 @@ def dpnp_solve(a, b):
 
     # use DPCTL tensor function to fill the array of multiple dependent
     # variables with content from the input array `b`
-    b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=b_usm_arr, dst=b_h.get_array(), sycl_queue=b.sycl_queue
+    ht_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=b_usm_arr,
+        dst=b_h.get_array(),
+        sycl_queue=b.sycl_queue,
+        depends=dev_evs,
    )
+    _manager.add_event_pair(ht_ev, b_copy_ev)
 
     n = a.shape[0]
 
@@ -2505,29 +2526,28 @@ def dpnp_solve(a, b):
 
     # Call the LAPACK extension function _getrf
     # to perform LU decomposition of the input matrix
-    ht_getrf_ev, getrf_ev = li._getrf(
+    ht_ev, getrf_ev = li._getrf(
         exec_q,
         a_h.get_array(),
         ipiv_h.get_array(),
         dev_info_h,
-        [a_copy_ev],
+        depends=[a_copy_ev],
     )
+    _manager.add_event_pair(ht_ev, getrf_ev)
 
     _check_lapack_dev_info(dev_info_h)
 
     # Call the LAPACK extension function _getrs
     # to solve the system of linear equations with an LU-factored
     # coefficient square matrix, with multiple right-hand sides.
-    ht_getrs_ev, _ = li._getrs(
+    ht_ev, getrs_ev = li._getrs(
         exec_q,
         a_h.get_array(),
         ipiv_h.get_array(),
         b_h.get_array(),
-        [b_copy_ev, getrf_ev],
+        depends=[b_copy_ev, getrf_ev],
    )
-
-    ht_list_ev = [a_ht_copy_ev, b_ht_copy_ev, ht_getrf_ev, ht_getrs_ev]
-    dpctl.SyclEvent.wait_for(ht_list_ev)
+    _manager.add_event_pair(ht_ev, getrs_ev)
 
     return b_h
 
@@ -2583,7 +2603,6 @@ def dpnp_svd(
     full_matrices=True,
     compute_uv=True,
     hermitian=False,
-    batch_call=False,
     related_arrays=None,
 ):
     """
@@ -2592,7 +2611,6 @@ def dpnp_svd(
         full_matrices=True,
         compute_uv=True,
         hermitian=False,
-        batch_call=False,
        related_arrays=None,
     )
 
@@ -2643,11 +2661,17 @@ def dpnp_svd(
 
     a_usm_arr = dpnp.get_usm_ndarray(a)
 
+    _manager = dpu.SequentialOrderManager[exec_q]
+
     # use DPCTL tensor function to fill the сopy of the input array
     # from the input array
-    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr, dst=a_h.get_array(), sycl_queue=exec_q
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=exec_q,
+        depends=_manager.submitted_events,
    )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     k = min(m, n)
     if compute_uv:
@@ -2681,7 +2705,7 @@ def dpnp_svd(
     )
     s_h = dpnp.empty_like(a_h, shape=(k,), dtype=s_type)
 
-    ht_lapack_ev, _ = li._gesvd(
+    ht_ev, gesvd_ev = li._gesvd(
         exec_q,
         jobu,
         jobvt,
@@ -2689,16 +2713,9 @@ def dpnp_svd(
         s_h.get_array(),
         u_h.get_array(),
         vt_h.get_array(),
-        [a_copy_ev],
+        depends=[copy_ev],
     )
-
-    if batch_call:
-        if compute_uv:
-            return u_h, s_h, vt_h, ht_lapack_ev, a_ht_copy_ev
-        return s_h, ht_lapack_ev, a_ht_copy_ev
-
-    ht_lapack_ev.wait()
-    a_ht_copy_ev.wait()
+    _manager.add_event_pair(ht_ev, gesvd_ev)
 
     # TODO: Need to return C-contiguous array to match the output of
     # numpy.linalg.svd
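
The whole patch applies a single pattern: every copy/BLAS/LAPACK submission takes its `depends` from `_manager.submitted_events` of the queue's `dpctl.utils.SequentialOrderManager` and registers its result with `_manager.add_event_pair(...)`, instead of collecting host-task events in lists and blocking on `wait()` / `dpctl.SyclEvent.wait_for()`. Below is a minimal sketch of that pattern, modelled on `_copy_array` from the first hunk; the helper name `async_copy_f_order` and the usage at the bottom are illustrative only and are not part of the patch.

```python
import dpctl.tensor._tensor_impl as ti
import dpctl.utils as dpu

import dpnp


def async_copy_f_order(x):
    """Return an F-ordered copy of `x` scheduled through the order manager."""
    x_copy = dpnp.empty_like(x, order="F")
    exec_q = x_copy.sycl_queue

    # One manager per SYCL queue tracks all previously submitted tasks.
    _manager = dpu.SequentialOrderManager[exec_q]

    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
        src=dpnp.get_usm_ndarray(x),
        dst=x_copy.get_array(),
        sycl_queue=exec_q,
        # Order this copy after everything already submitted on the queue.
        depends=_manager.submitted_events,
    )
    # Hand the host-task/compute-event pair to the manager instead of
    # calling wait(); later submissions depend on copy_ev implicitly.
    _manager.add_event_pair(ht_ev, copy_ev)
    return x_copy


x = dpnp.arange(12, dtype="f4").reshape(3, 4)
y = async_copy_f_order(x)
print(y.flags.f_contiguous)  # True
```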