From 4c7c8c240280fe5935c7cf8a17d7cc3711db08d7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 6 Jun 2024 22:55:56 +0200 Subject: [PATCH 01/37] Init work --- dpnp/backend/extensions/lapack/gesv.cpp | 119 ++++++++++++ dpnp/backend/extensions/lapack/gesv.hpp | 6 + dpnp/backend/extensions/lapack/lapack_py.cpp | 7 + dpnp/linalg/dpnp_utils_linalg.py | 191 ++++++++++++------- test.py | 74 +++++++ 5 files changed, 331 insertions(+), 66 deletions(-) create mode 100644 test.py diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 3dc4ec6b125d..2fda114f1ef0 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -273,6 +273,125 @@ std::pair return std::make_pair(args_ev, gesv_ev); } + +std::pair + gesv_batch(sycl::queue exec_q, + dpctl::tensor::usm_ndarray coeff_matrix, + dpctl::tensor::usm_ndarray dependent_vals, + const std::vector &depends) +{ + const int coeff_matrix_nd = coeff_matrix.get_ndim(); + const int dependent_vals_nd = dependent_vals.get_ndim(); + + if (coeff_matrix_nd != 3) { + throw py::value_error("The coefficient matrix has ndim=" + + std::to_string(coeff_matrix_nd) + + ", but a 3-dimensional array is expected."); + } + + if (dependent_vals_nd > 2) { + throw py::value_error( + "The dependent values array has ndim=" + + std::to_string(dependent_vals_nd) + + ", but more than 2-dimensional array is expected."); + } + + const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); + const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); + + if (coeff_matrix_shape[1] != coeff_matrix_shape[2]) { + throw py::value_error("The coefficient matrix must be square," + " but got a shape of (" + + std::to_string(coeff_matrix_shape[1]) + ", " + + std::to_string(coeff_matrix_shape[2]) + ")."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, + {coeff_matrix, dependent_vals})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(coeff_matrix, dependent_vals)) { + throw py::value_error( + "The arrays of coefficients and dependent variables " + "are overlapping segments of memory"); + } + + // bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); + // if (!is_coeff_matrix_f_contig) { + // throw py::value_error("The coefficient matrix " + // "must be F-contiguous"); + // } + + // bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); + // if (!is_dependent_vals_f_contig) { + // throw py::value_error("The array of dependent variables " + // "must be F-contiguous"); + // } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int coeff_matrix_type_id = + array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); + int dependent_vals_type_id = + array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); + + if (coeff_matrix_type_id != dependent_vals_type_id) { + throw py::value_error("The types of the coefficient matrix and " + "dependent variables are mismatched"); + } + + gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id]; + if (gesv_fn == nullptr) { + throw py::value_error( + "No gesv implementation defined for the provided type " + "of the coefficient matrix."); + } + + char *coeff_matrix_data = coeff_matrix.get_data(); + char *dependent_vals_data = dependent_vals.get_data(); + + const std::int64_t batch_size = coeff_matrix_shape[0]; + const std::int64_t n = dependent_vals_shape[0]; + const std::int64_t nrhs = + (dependent_vals_nd > 1) ? dependent_vals_shape[1] : 1; + + const std::int64_t lda = std::max(1UL, n); + const std::int64_t ldb = std::max(1UL, n); + + int coeff_matrix_elemsize = coeff_matrix.get_elemsize(); + int dependent_vals_elemsize = dependent_vals.get_elemsize(); + + std::vector host_task_events; + std::vector gesv_task_events; + + host_task_events.reserve(batch_size); + gesv_task_events.reserve(batch_size); + + for (std::int64_t i = 0; i < batch_size; ++i) { + char *coeff_matrix_batch = coeff_matrix_data + i * n * n * coeff_matrix_elemsize; + char *dependent_vals_batch = dependent_vals_data + i * n * dependent_vals_elemsize; + + sycl::event gesv_ev = + gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, dependent_vals_batch, + ldb, host_task_events, depends); + + gesv_task_events.push_back(gesv_ev); + } + + sycl::event combine_ev = exec_q.submit( + [&](sycl::handler &cgh) { cgh.depends_on(gesv_task_events); }); + + + sycl::event args_ev = dpctl::utils::keep_args_alive( + exec_q, {coeff_matrix, dependent_vals}, host_task_events); + + return std::make_pair(args_ev, combine_ev); +} + template struct GesvContigFactory { diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp index 12486fae7870..4de03370da70 100644 --- a/dpnp/backend/extensions/lapack/gesv.hpp +++ b/dpnp/backend/extensions/lapack/gesv.hpp @@ -44,6 +44,12 @@ extern std::pair dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends); +extern std::pair + gesv_batch(sycl::queue exec_q, + dpctl::tensor::usm_ndarray coeff_matrix, + dpctl::tensor::usm_ndarray dependent_vals, + const std::vector &depends); + extern void init_gesv_dispatch_vector(void); } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index e6b4365a906d..863d1ffaea47 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -103,6 +103,13 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("sycl_queue"), py::arg("coeff_matrix"), py::arg("dependent_vals"), py::arg("depends") = py::list()); + m.def("_gesv_batch", &lapack_ext::gesv_batch, + "Call `gesv` from OneMKL LAPACK library to return " + "the batch solution of a system of linear equations with " + "a square coefficient matrix A and multiple dependent variables", + py::arg("sycl_queue"), py::arg("coeff_matrix"), + py::arg("dependent_vals"), py::arg("depends") = py::list()); + m.def("_gesvd", &lapack_ext::gesvd, "Call `gesvd` from OneMKL LAPACK library to return " "the singular value decomposition of a general rectangular matrix", diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 22aa396c7fed..989c81ff118c 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -267,18 +267,109 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): """ - a_usm_arr = dpnp.get_usm_ndarray(a) - b_usm_arr = dpnp.get_usm_ndarray(b) + new = True - b_order = "C" if b.flags.c_contiguous else "F" - a_shape = a.shape - b_shape = b.shape + if not new: + a_usm_arr = dpnp.get_usm_ndarray(a) + b_usm_arr = dpnp.get_usm_ndarray(b) + + b_order = "C" if b.flags.c_contiguous else "F" + a_shape = a.shape + b_shape = b.shape + + is_cpu_device = exec_q.sycl_device.has_aspect_cpu + reshape = False + orig_shape_b = b_shape + if a.ndim > 3: + # get 3d input arrays by reshape + if a.ndim == b.ndim: + b = b.reshape(-1, b_shape[-2], b_shape[-1]) + else: + b = b.reshape(-1, b_shape[-1]) + + a = a.reshape(-1, a_shape[-2], a_shape[-1]) + + a_usm_arr = dpnp.get_usm_ndarray(a) + b_usm_arr = dpnp.get_usm_ndarray(b) + reshape = True + + batch_size = a.shape[0] + + coeff_vecs = [None] * batch_size + val_vecs = [None] * batch_size + a_ht_copy_ev = [None] * batch_size + b_ht_copy_ev = [None] * batch_size + ht_lapack_ev = [None] * batch_size + + for i in range(batch_size): + # oneMKL LAPACK assumes fortran-like array as input, so allocate + # a memory with 'F' order for dpnp array of coefficient matrix + coeff_vecs[i] = dpnp.empty_like( + a[i], order="F", dtype=res_type, usm_type=res_usm_type + ) + + # use DPCTL tensor function to fill the coefficient matrix array + # with content from the input array + a_ht_copy_ev[i], a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr[i], + dst=coeff_vecs[i].get_array(), + sycl_queue=a.sycl_queue, + ) + + # oneMKL LAPACK assumes fortran-like array as input, so + # allocate a memory with 'F' order for dpnp array of multiple + # dependent variables array + val_vecs[i] = dpnp.empty_like( + b[i], order="F", dtype=res_type, usm_type=res_usm_type + ) + + # use DPCTL tensor function to fill the array of multiple dependent + # variables with content from the input arrays + b_ht_copy_ev[i], b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=b_usm_arr[i], + dst=val_vecs[i].get_array(), + sycl_queue=b.sycl_queue, + ) + + # Call the LAPACK extension function _gesv to solve the system of + # linear equations using a portion of the coefficient square matrix + # and a corresponding portion of the dependent variables array. + ht_lapack_ev[i], _ = li._gesv( + exec_q, + coeff_vecs[i].get_array(), + val_vecs[i].get_array(), + depends=[a_copy_ev, b_copy_ev], + ) + + # TODO: Remove this w/a when MKLD-17201 is solved. + # Waiting for a host task executing an OneMKL LAPACK gesv call + # on CPU causes deadlock due to serialization of all host tasks + # in the queue. + # We need to wait for each host tasks before calling _gesv to avoid + # deadlock. + if is_cpu_device: + ht_lapack_ev[i].wait() + b_ht_copy_ev[i].wait() + + for i in range(batch_size): + ht_lapack_ev[i].wait() + b_ht_copy_ev[i].wait() + a_ht_copy_ev[i].wait() + + # combine the list of solutions into a single array + out_v = dpnp.array( + val_vecs, order=b_order, dtype=res_type, usm_type=res_usm_type + ) + if reshape: + # shape of the out_v must be equal to the shape of the array of + # dependent variables + out_v = out_v.reshape(orig_shape_b) + return out_v + + else: + a_shape = a.shape + b_shape = b.shape - is_cpu_device = exec_q.sycl_device.has_aspect_cpu - reshape = False - orig_shape_b = b_shape - if a.ndim > 3: - # get 3d input arrays by reshape if a.ndim == b.ndim: b = b.reshape(-1, b_shape[-2], b_shape[-1]) else: @@ -286,82 +377,50 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): a = a.reshape(-1, a_shape[-2], a_shape[-1]) + a = a.transpose((0, 2, 1)) a_usm_arr = dpnp.get_usm_ndarray(a) + + # b = b.T b_usm_arr = dpnp.get_usm_ndarray(b) - reshape = True - batch_size = a.shape[0] + # a_usm_arr = dpnp.get_usm_ndarray(a) + # b_usm_arr = dpnp.get_usm_ndarray(b) - coeff_vecs = [None] * batch_size - val_vecs = [None] * batch_size - a_ht_copy_ev = [None] * batch_size - b_ht_copy_ev = [None] * batch_size - ht_lapack_ev = [None] * batch_size + ht_list_ev = [] - for i in range(batch_size): - # oneMKL LAPACK assumes fortran-like array as input, so allocate - # a memory with 'F' order for dpnp array of coefficient matrix - coeff_vecs[i] = dpnp.empty_like( - a[i], order="F", dtype=res_type, usm_type=res_usm_type - ) + a_copy = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type) - # use DPCTL tensor function to fill the coefficient matrix array - # with content from the input array - a_ht_copy_ev[i], a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_usm_arr[i], - dst=coeff_vecs[i].get_array(), + a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, + dst=a_copy.get_array(), sycl_queue=a.sycl_queue, ) + ht_list_ev.append(a_ht_copy_ev) - # oneMKL LAPACK assumes fortran-like array as input, so - # allocate a memory with 'F' order for dpnp array of multiple - # dependent variables array - val_vecs[i] = dpnp.empty_like( - b[i], order="F", dtype=res_type, usm_type=res_usm_type + b_copy = dpnp.empty_like( + b, order="C", dtype=res_type, usm_type=res_usm_type ) - # use DPCTL tensor function to fill the array of multiple dependent - # variables with content from the input arrays - b_ht_copy_ev[i], b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=b_usm_arr[i], - dst=val_vecs[i].get_array(), + b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=b_usm_arr, + dst=b_copy.get_array(), sycl_queue=b.sycl_queue, ) + ht_list_ev.append(b_ht_copy_ev) - # Call the LAPACK extension function _gesv to solve the system of - # linear equations using a portion of the coefficient square matrix - # and a corresponding portion of the dependent variables array. - ht_lapack_ev[i], _ = li._gesv( + ht_lapack_ev, _ = li._gesv_batch( exec_q, - coeff_vecs[i].get_array(), - val_vecs[i].get_array(), + a_copy.get_array(), + b_copy.get_array(), depends=[a_copy_ev, b_copy_ev], ) - # TODO: Remove this w/a when MKLD-17201 is solved. - # Waiting for a host task executing an OneMKL LAPACK gesv call - # on CPU causes deadlock due to serialization of all host tasks - # in the queue. - # We need to wait for each host tasks before calling _gesv to avoid - # deadlock. - if is_cpu_device: - ht_lapack_ev[i].wait() - b_ht_copy_ev[i].wait() + ht_list_ev.append(ht_lapack_ev) - for i in range(batch_size): - ht_lapack_ev[i].wait() - b_ht_copy_ev[i].wait() - a_ht_copy_ev[i].wait() + dpctl.SyclEvent.wait_for(ht_list_ev) - # combine the list of solutions into a single array - out_v = dpnp.array( - val_vecs, order=b_order, dtype=res_type, usm_type=res_usm_type - ) - if reshape: - # shape of the out_v must be equal to the shape of the array of - # dependent variables - out_v = out_v.reshape(orig_shape_b) - return out_v + v = b_copy.reshape(b_shape) + return v def _batched_qr(a, mode="reduced"): diff --git a/test.py b/test.py new file mode 100644 index 000000000000..b71d3d67d37f --- /dev/null +++ b/test.py @@ -0,0 +1,74 @@ +import numpy + +import dpnp +import dpnp.linalg +from tests.helper import generate_random_numpy_array + +# a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4') +# a = numpy.array([[[1,2],[2,3]],[[1,2],[2,3]]],dtype='f4') +# a = numpy.array([[1,2],[2,3]],dtype='f4') +# a = numpy.array([[[2,3],[1,2]]],dtype='f4') +# a = numpy.array([[[1,2],[2,3]]],dtype='f4') +a = generate_random_numpy_array( + (2, 3, 3), dtype='f4', hermitian=False, seed_value=81 +) + +b = generate_random_numpy_array( + (2, 3), dtype='f4', hermitian=False, seed_value=76 +) +# a = numpy.array( +# [ +# [[1, 0, 3], [0, 5, 0], [7, 0, 9]], +# [[3, 0, 3], [0, 7, 0], [7, 0, 11]], +# ], +# dtype='f4', +# ) +a_dp = dpnp.array(a) +b_dp = dpnp.array(b) + +# a_f = numpy.array(a,order="F") +# a_dp_f = dpnp.array(a_dp,order="F") + +# res = numpy.linalg.eigh(a) +# res_dp = dpnp.linalg.eigh(a_dp) +# res = numpy.linalg.eigh(a_f) +# res_dp = dpnp.linalg.eigh(a_dp_f) +res = numpy.linalg.solve(a,b) +res_dp = dpnp.linalg.solve(a_dp,b_dp) +# res = numpy.linalg.eigvalsh(a) +# res_dp = dpnp.linalg.eigvalsh(a_dp) + + +print("VALS: ") +print("NUMPY: ") +print(res) +print("DPNP: ") +print(res_dp) + +# print("BEFORE") +# print(a_dp) + +# print("VALS: ") +# print("NUMPY: ") +# print(res[0]) +# print("DPNP: ") +# print(res_dp[0]) + + +# print("VECS: ") +# print("NUMPY: ") +# print(res[1]) +# print("DPNP: ") +# print(res_dp[1]) + +# print("AFTER") +# print(a_dp) + + +# a = generate_random_numpy_array((10,3,3),dtype=numpy.float32, hermitian=True, seed_value=81) +# a_dp = dpnp.array(a, device='gpu') + +# res = numpy.linalg.eigh(a) +# res_dp = dpnp.linalg.eigh(a_dp) + +# print("Done") From 8e2cb2344308411cfa32c50762452ca0f658e783 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 7 Jun 2024 09:07:42 +0200 Subject: [PATCH 02/37] First working version with transpose and C contig --- dpnp/backend/extensions/lapack/gesv.cpp | 18 ++++++++++++++---- dpnp/linalg/dpnp_utils_linalg.py | 13 ++++++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 2fda114f1ef0..ad49db3a2fb2 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -289,7 +289,7 @@ std::pair ", but a 3-dimensional array is expected."); } - if (dependent_vals_nd > 2) { + if (dependent_vals_nd > 3) { throw py::value_error( "The dependent values array has ndim=" + std::to_string(dependent_vals_nd) + @@ -355,13 +355,23 @@ std::pair char *dependent_vals_data = dependent_vals.get_data(); const std::int64_t batch_size = coeff_matrix_shape[0]; - const std::int64_t n = dependent_vals_shape[0]; + // const std::int64_t n = dependent_vals_shape[1]; + // const std::int64_t nrhs = + // (dependent_vals_nd > 2) ? dependent_vals_shape[2] : 1; + + const std::int64_t n = coeff_matrix_shape[1]; const std::int64_t nrhs = - (dependent_vals_nd > 1) ? dependent_vals_shape[1] : 1; + (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1; const std::int64_t lda = std::max(1UL, n); const std::int64_t ldb = std::max(1UL, n); + std::cout << "n: " << n << std::endl; + std::cout << "nrhs: " << nrhs << std::endl; + std::cout << "lda: " << n << std::endl; + std::cout << "ldb: " << n << std::endl; + + int coeff_matrix_elemsize = coeff_matrix.get_elemsize(); int dependent_vals_elemsize = dependent_vals.get_elemsize(); @@ -373,7 +383,7 @@ std::pair for (std::int64_t i = 0; i < batch_size; ++i) { char *coeff_matrix_batch = coeff_matrix_data + i * n * n * coeff_matrix_elemsize; - char *dependent_vals_batch = dependent_vals_data + i * n * dependent_vals_elemsize; + char *dependent_vals_batch = dependent_vals_data + i * n * nrhs * dependent_vals_elemsize; sycl::event gesv_ev = gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, dependent_vals_batch, diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 989c81ff118c..d2e80f758fee 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -380,7 +380,10 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): a = a.transpose((0, 2, 1)) a_usm_arr = dpnp.get_usm_ndarray(a) - # b = b.T + if b.ndim > 2: + b = b.transpose((0,2,1)) + else: + b = b b_usm_arr = dpnp.get_usm_ndarray(b) # a_usm_arr = dpnp.get_usm_ndarray(a) @@ -388,7 +391,7 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): ht_list_ev = [] - a_copy = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type) + a_copy = dpnp.empty_like(a, dtype=res_type, order="C", usm_type=res_usm_type) a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=a_usm_arr, @@ -419,7 +422,11 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): dpctl.SyclEvent.wait_for(ht_list_ev) - v = b_copy.reshape(b_shape) + if b.ndim > 2: + v = b_copy.transpose(0,2,1).reshape(b_shape) + else: + v = b_copy.reshape(b_shape) + # v = b_copy.reshape(b_shape) return v From 67fa43592719ca522b7b0f473023705d9c1e62f0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 7 Jun 2024 10:47:13 +0200 Subject: [PATCH 03/37] Second working version with moveaxis, transpose and F contig --- dpnp/backend/extensions/lapack/gesv.cpp | 9 +-- dpnp/linalg/dpnp_utils_linalg.py | 80 +++++++++++++++++-------- 2 files changed, 59 insertions(+), 30 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index ad49db3a2fb2..e0ddbad1c893 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -299,11 +299,11 @@ std::pair const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); - if (coeff_matrix_shape[1] != coeff_matrix_shape[2]) { + if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { throw py::value_error("The coefficient matrix must be square," " but got a shape of (" + - std::to_string(coeff_matrix_shape[1]) + ", " + - std::to_string(coeff_matrix_shape[2]) + ")."); + std::to_string(coeff_matrix_shape[0]) + ", " + + std::to_string(coeff_matrix_shape[1]) + ")."); } // check compatibility of execution queue and allocation queue @@ -354,7 +354,7 @@ std::pair char *coeff_matrix_data = coeff_matrix.get_data(); char *dependent_vals_data = dependent_vals.get_data(); - const std::int64_t batch_size = coeff_matrix_shape[0]; + const std::int64_t batch_size = coeff_matrix_shape[2]; // const std::int64_t n = dependent_vals_shape[1]; // const std::int64_t nrhs = // (dependent_vals_nd > 2) ? dependent_vals_shape[2] : 1; @@ -370,6 +370,7 @@ std::pair std::cout << "nrhs: " << nrhs << std::endl; std::cout << "lda: " << n << std::endl; std::cout << "ldb: " << n << std::endl; + std::cout << "b_nd: " << dependent_vals_nd << std::endl; int coeff_matrix_elemsize = coeff_matrix.get_elemsize(); diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index d2e80f758fee..dc7e215e557c 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -377,56 +377,84 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): a = a.reshape(-1, a_shape[-2], a_shape[-1]) - a = a.transpose((0, 2, 1)) - a_usm_arr = dpnp.get_usm_ndarray(a) + n_a = a.shape[1] + m,n = b.shape[-2:] + + a = dpnp.moveaxis(a,(-2,-1),(0,1)) + a = dpnp.array(a, dtype=res_type, order="F", copy=True) + a = dpnp.reshape(a,(n_a,n_a,-1)) if b.ndim > 2: - b = b.transpose((0,2,1)) + b = dpnp.moveaxis(b,(-2,-1),(0,1)) + else: + b = b.T + b = dpnp.array(b, dtype=res_type, order="F", copy=True) + if b.ndim > 2: + b = dpnp.reshape(b,(m,n,-1)) else: - b = b + b = b.T + + a_usm_arr = dpnp.get_usm_ndarray(a) b_usm_arr = dpnp.get_usm_ndarray(b) + + # a = a.reshape(-1, a_shape[-2], a_shape[-1]) + + # a = a.transpose((0, 2, 1)) + # a_usm_arr = dpnp.get_usm_ndarray(a) + + # if b.ndim > 2: + # b = b.transpose((0,2,1)) + # else: + # b = b + # b_usm_arr = dpnp.get_usm_ndarray(b) + # a_usm_arr = dpnp.get_usm_ndarray(a) # b_usm_arr = dpnp.get_usm_ndarray(b) ht_list_ev = [] - a_copy = dpnp.empty_like(a, dtype=res_type, order="C", usm_type=res_usm_type) + # a_copy = dpnp.empty_like(a, dtype=res_type, order="C", usm_type=res_usm_type) - a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_usm_arr, - dst=a_copy.get_array(), - sycl_queue=a.sycl_queue, - ) - ht_list_ev.append(a_ht_copy_ev) + # a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + # src=a_usm_arr, + # dst=a_copy.get_array(), + # sycl_queue=a.sycl_queue, + # ) + # ht_list_ev.append(a_ht_copy_ev) - b_copy = dpnp.empty_like( - b, order="C", dtype=res_type, usm_type=res_usm_type - ) + # b_copy = dpnp.empty_like( + # b, order="C", dtype=res_type, usm_type=res_usm_type + # ) - b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=b_usm_arr, - dst=b_copy.get_array(), - sycl_queue=b.sycl_queue, - ) - ht_list_ev.append(b_ht_copy_ev) + # b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + # src=b_usm_arr, + # dst=b_copy.get_array(), + # sycl_queue=b.sycl_queue, + # ) + # ht_list_ev.append(b_ht_copy_ev) ht_lapack_ev, _ = li._gesv_batch( exec_q, - a_copy.get_array(), - b_copy.get_array(), - depends=[a_copy_ev, b_copy_ev], + a.get_array(), + b.get_array(), + # depends=[a_copy_ev, b_copy_ev], + depends=[], ) ht_list_ev.append(ht_lapack_ev) dpctl.SyclEvent.wait_for(ht_list_ev) + # if b.ndim > 2: + # v = b_copy.transpose(0,2,1).reshape(b_shape) + # else: + # v = b_copy.reshape(b_shape) if b.ndim > 2: - v = b_copy.transpose(0,2,1).reshape(b_shape) + v = dpnp.moveaxis(b, -1 , 0) + v = v.reshape(b_shape) else: - v = b_copy.reshape(b_shape) - # v = b_copy.reshape(b_shape) + v = b.reshape(b_shape) return v From 4f5abecc9c117b534499e8997a6088722d15774a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 11 Jun 2024 23:27:31 +0200 Subject: [PATCH 04/37] Add more shape checks --- dpnp/backend/extensions/lapack/gesv.cpp | 79 ++++++++++++++++--------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index e0ddbad1c893..192e0325cf3f 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -273,7 +273,6 @@ std::pair return std::make_pair(args_ev, gesv_ev); } - std::pair gesv_batch(sycl::queue exec_q, dpctl::tensor::usm_ndarray coeff_matrix, @@ -289,16 +288,20 @@ std::pair ", but a 3-dimensional array is expected."); } - if (dependent_vals_nd > 3) { + if (dependent_vals_nd < 2 || dependent_vals_nd > 3) { throw py::value_error( "The dependent values array has ndim=" + std::to_string(dependent_vals_nd) + - ", but more than 2-dimensional array is expected."); + ", but a 2-dimensional or 3-dimensional array is expected."); } const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); + // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays + // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or + // (n,batch_size) respectively. + if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { throw py::value_error("The coefficient matrix must be square," " but got a shape of (" + @@ -306,6 +309,34 @@ std::pair std::to_string(coeff_matrix_shape[1]) + ")."); } + if (coeff_matrix_shape[0] != dependent_vals_shape[0]) { + throw py::value_error("The first dimension (n) of coeff_matrix and" + " dependent_vals must be the same, but got " + + std::to_string(coeff_matrix_shape[0]) + " and " + + std::to_string(dependent_vals_shape[0]) + "."); + } + + if (dependent_vals_nd == 2) { + if (coeff_matrix_shape[2] != dependent_vals_shape[1]) { + throw py::value_error( + "The batch_size of " + " coeff_matrix and dependent_vals must be" + " the same, but got " + + std::to_string(coeff_matrix_shape[2]) + " and " + + std::to_string(dependent_vals_shape[1]) + "."); + } + } + else if (dependent_vals_nd == 3) { + if (coeff_matrix_shape[2] != dependent_vals_shape[2]) { + throw py::value_error( + "The batch_size of " + " coeff_matrix and dependent_vals must be" + " the same, but got " + + std::to_string(coeff_matrix_shape[2]) + " and " + + std::to_string(dependent_vals_shape[2]) + "."); + } + } + // check compatibility of execution queue and allocation queue if (!dpctl::utils::queues_are_compatible(exec_q, {coeff_matrix, dependent_vals})) @@ -321,17 +352,17 @@ std::pair "are overlapping segments of memory"); } - // bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); - // if (!is_coeff_matrix_f_contig) { - // throw py::value_error("The coefficient matrix " - // "must be F-contiguous"); - // } + bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); + if (!is_coeff_matrix_f_contig) { + throw py::value_error("The coefficient matrix " + "must be F-contiguous"); + } - // bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); - // if (!is_dependent_vals_f_contig) { - // throw py::value_error("The array of dependent variables " - // "must be F-contiguous"); - // } + bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); + if (!is_dependent_vals_f_contig) { + throw py::value_error("The array of dependent variables " + "must be F-contiguous"); + } auto array_types = dpctl_td_ns::usm_ndarray_types(); int coeff_matrix_type_id = @@ -355,10 +386,6 @@ std::pair char *dependent_vals_data = dependent_vals.get_data(); const std::int64_t batch_size = coeff_matrix_shape[2]; - // const std::int64_t n = dependent_vals_shape[1]; - // const std::int64_t nrhs = - // (dependent_vals_nd > 2) ? dependent_vals_shape[2] : 1; - const std::int64_t n = coeff_matrix_shape[1]; const std::int64_t nrhs = (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1; @@ -366,13 +393,6 @@ std::pair const std::int64_t lda = std::max(1UL, n); const std::int64_t ldb = std::max(1UL, n); - std::cout << "n: " << n << std::endl; - std::cout << "nrhs: " << nrhs << std::endl; - std::cout << "lda: " << n << std::endl; - std::cout << "ldb: " << n << std::endl; - std::cout << "b_nd: " << dependent_vals_nd << std::endl; - - int coeff_matrix_elemsize = coeff_matrix.get_elemsize(); int dependent_vals_elemsize = dependent_vals.get_elemsize(); @@ -383,12 +403,14 @@ std::pair gesv_task_events.reserve(batch_size); for (std::int64_t i = 0; i < batch_size; ++i) { - char *coeff_matrix_batch = coeff_matrix_data + i * n * n * coeff_matrix_elemsize; - char *dependent_vals_batch = dependent_vals_data + i * n * nrhs * dependent_vals_elemsize; + char *coeff_matrix_batch = + coeff_matrix_data + i * n * n * coeff_matrix_elemsize; + char *dependent_vals_batch = + dependent_vals_data + i * n * nrhs * dependent_vals_elemsize; sycl::event gesv_ev = - gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, dependent_vals_batch, - ldb, host_task_events, depends); + gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, + dependent_vals_batch, ldb, host_task_events, depends); gesv_task_events.push_back(gesv_ev); } @@ -396,7 +418,6 @@ std::pair sycl::event combine_ev = exec_q.submit( [&](sycl::handler &cgh) { cgh.depends_on(gesv_task_events); }); - sycl::event args_ev = dpctl::utils::keep_args_alive( exec_q, {coeff_matrix, dependent_vals}, host_task_events); From 0cb2808181df14d55b05bb083b449956365e306a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 11 Jun 2024 23:30:40 +0200 Subject: [PATCH 05/37] Pass sycl::queue by reference for gesv/gesv_batch --- dpnp/backend/extensions/lapack/gesv.cpp | 8 ++++---- dpnp/backend/extensions/lapack/gesv.hpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 192e0325cf3f..de5d38cdfc00 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -48,7 +48,7 @@ namespace mkl_lapack = oneapi::mkl::lapack; namespace py = pybind11; namespace type_utils = dpctl::tensor::type_utils; -typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue, +typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &, const std::int64_t, const std::int64_t, char *, @@ -61,7 +61,7 @@ typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue, static gesv_impl_fn_ptr_t gesv_dispatch_vector[dpctl_td_ns::num_types]; template -static sycl::event gesv_impl(sycl::queue exec_q, +static sycl::event gesv_impl(sycl::queue &exec_q, const std::int64_t n, const std::int64_t nrhs, char *in_a, @@ -176,7 +176,7 @@ static sycl::event gesv_impl(sycl::queue exec_q, } std::pair - gesv(sycl::queue exec_q, + gesv(sycl::queue &exec_q, dpctl::tensor::usm_ndarray coeff_matrix, dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends) @@ -274,7 +274,7 @@ std::pair } std::pair - gesv_batch(sycl::queue exec_q, + gesv_batch(sycl::queue &exec_q, dpctl::tensor::usm_ndarray coeff_matrix, dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends) diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp index 4de03370da70..c9dfa1a3de50 100644 --- a/dpnp/backend/extensions/lapack/gesv.hpp +++ b/dpnp/backend/extensions/lapack/gesv.hpp @@ -39,13 +39,13 @@ namespace ext namespace lapack { extern std::pair - gesv(sycl::queue exec_q, + gesv(sycl::queue &exec_q, dpctl::tensor::usm_ndarray coeff_matrix, dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends); extern std::pair - gesv_batch(sycl::queue exec_q, + gesv_batch(sycl::queue &exec_q, dpctl::tensor::usm_ndarray coeff_matrix, dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends); From bfa37d4c0641a7e764afa183d895c5727f89ab16 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 11 Jun 2024 23:36:08 +0200 Subject: [PATCH 06/37] qwe --- dpnp/linalg/dpnp_utils_linalg.py | 71 +++++++++++++++++--------------- perf.py | 22 ++++++++++ test.py | 15 +++---- 3 files changed, 68 insertions(+), 40 deletions(-) create mode 100644 perf.py diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index dc7e215e557c..96ec793dded9 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -381,21 +381,23 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): m,n = b.shape[-2:] a = dpnp.moveaxis(a,(-2,-1),(0,1)) - a = dpnp.array(a, dtype=res_type, order="F", copy=True) - a = dpnp.reshape(a,(n_a,n_a,-1)) + # a_f = dpnp.array(a, dtype=res_type, order="F", copy=True) + # a = dpnp.reshape(a,(n_a,n_a,-1)) if b.ndim > 2: b = dpnp.moveaxis(b,(-2,-1),(0,1)) else: b = b.T - b = dpnp.array(b, dtype=res_type, order="F", copy=True) - if b.ndim > 2: - b = dpnp.reshape(b,(m,n,-1)) - else: - b = b.T + # b_f = dpnp.array(b, dtype=res_type, order="F", copy=True) + # if b.ndim > 2: + # # b = dpnp.reshape(b,(m,n,-1)) + # b = b + # else: + # # b = b.T + # b = b - a_usm_arr = dpnp.get_usm_ndarray(a) - b_usm_arr = dpnp.get_usm_ndarray(b) + # a_usm_arr = dpnp.get_usm_ndarray(a) + # b_usm_arr = dpnp.get_usm_ndarray(b) # a = a.reshape(-1, a_shape[-2], a_shape[-1]) @@ -409,37 +411,36 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): # b = b # b_usm_arr = dpnp.get_usm_ndarray(b) - # a_usm_arr = dpnp.get_usm_ndarray(a) - # b_usm_arr = dpnp.get_usm_ndarray(b) + a_usm_arr = dpnp.get_usm_ndarray(a) + b_usm_arr = dpnp.get_usm_ndarray(b) ht_list_ev = [] - # a_copy = dpnp.empty_like(a, dtype=res_type, order="C", usm_type=res_usm_type) + a_f = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type) - # a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - # src=a_usm_arr, - # dst=a_copy.get_array(), - # sycl_queue=a.sycl_queue, - # ) - # ht_list_ev.append(a_ht_copy_ev) + a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, + dst=a_f.get_array(), + sycl_queue=a.sycl_queue, + ) + ht_list_ev.append(a_ht_copy_ev) - # b_copy = dpnp.empty_like( - # b, order="C", dtype=res_type, usm_type=res_usm_type - # ) + b_f = dpnp.empty_like( + b, order="F", dtype=res_type, usm_type=res_usm_type + ) - # b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - # src=b_usm_arr, - # dst=b_copy.get_array(), - # sycl_queue=b.sycl_queue, - # ) - # ht_list_ev.append(b_ht_copy_ev) + b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=b_usm_arr, + dst=b_f.get_array(), + sycl_queue=b.sycl_queue, + ) + ht_list_ev.append(b_ht_copy_ev) ht_lapack_ev, _ = li._gesv_batch( exec_q, - a.get_array(), - b.get_array(), - # depends=[a_copy_ev, b_copy_ev], - depends=[], + a_f.get_array(), + b_f.get_array(), + depends=[a_copy_ev, b_copy_ev], ) ht_list_ev.append(ht_lapack_ev) @@ -451,10 +452,14 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): # else: # v = b_copy.reshape(b_shape) if b.ndim > 2: - v = dpnp.moveaxis(b, -1 , 0) + v = dpnp.moveaxis(b_f, -1 , 0) v = v.reshape(b_shape) else: - v = b.reshape(b_shape) + v = dpnp.moveaxis(b_f,-1,0) + v = v.reshape(b_shape) + + # v = dpnp.copy(dpnp.moveaxis(b_f, -1, 0), order="C").reshape(b_shape) + return v diff --git a/perf.py b/perf.py new file mode 100644 index 000000000000..20de9e824e66 --- /dev/null +++ b/perf.py @@ -0,0 +1,22 @@ +import numpy +import dpnp +from tests.helper import generate_random_numpy_array +import cProfile, pstats, io + +a = generate_random_numpy_array((256,512,512),dtype='f4',hermitian=False, seed_value=81) +b = generate_random_numpy_array((256,512,),dtype='f4',hermitian=False, seed_value=76) + +a_dp = dpnp.array(a,device='cpu') +b_dp = dpnp.array(b,device='cpu') + +cold_run = dpnp.linalg.solve(a_dp, b_dp) +pr = cProfile.Profile() +pr.enable() +# res = numpy.linalg.solve(a, b) +res = dpnp.linalg.solve(a_dp, b_dp) +pr.disable() +s = io.StringIO() +sortBy = pstats.SortKey.CUMULATIVE +ps = pstats.Stats(pr, stream=s).sort_stats(sortBy) +ps.print_stats() +print(s.getvalue()) diff --git a/test.py b/test.py index b71d3d67d37f..fc22fc3309c8 100644 --- a/test.py +++ b/test.py @@ -5,17 +5,18 @@ from tests.helper import generate_random_numpy_array # a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4') -# a = numpy.array([[[1,2],[2,3]],[[1,2],[2,3]]],dtype='f4') +a = numpy.array([[[1,2],[3,5]],[[1,2],[3,5]],[[1,2],[3,5]]], dtype='f4') +b = numpy.array([[1,2],[1,2],[1,2]], dtype='f4') # a = numpy.array([[1,2],[2,3]],dtype='f4') # a = numpy.array([[[2,3],[1,2]]],dtype='f4') # a = numpy.array([[[1,2],[2,3]]],dtype='f4') -a = generate_random_numpy_array( - (2, 3, 3), dtype='f4', hermitian=False, seed_value=81 -) +# a = generate_random_numpy_array( +# (2, 3, 3), dtype='f4', hermitian=False, seed_value=81 +# ) -b = generate_random_numpy_array( - (2, 3), dtype='f4', hermitian=False, seed_value=76 -) +# b = generate_random_numpy_array( +# (2, 3, 2), dtype='f4', hermitian=False, seed_value=76 +# ) # a = numpy.array( # [ # [[1, 0, 3], [0, 5, 0], [7, 0, 9]], From 4a4429227d710ac3390ebd05682b8d2f450f9465 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 12 Jun 2024 14:34:13 +0200 Subject: [PATCH 07/37] Update _batched_solve implementation --- dpnp/linalg/dpnp_iface_linalg.py | 4 +- dpnp/linalg/dpnp_utils_linalg.py | 143 +++++++++++++------------------ 2 files changed, 60 insertions(+), 87 deletions(-) diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index 72d79ad329d7..8f535ba8b47e 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -1093,7 +1093,7 @@ def qr(a, mode="reduced"): return dpnp_qr(a, mode) -def solve(a, b): +def solve(a, b, new=False): """ Solve a linear matrix equation, or system of linear scalar equations. @@ -1143,7 +1143,7 @@ def solve(a, b): "or (..., M, K)" ) - return dpnp_solve(a, b) + return dpnp_solve(a, b, new=new) def svd(a, full_matrices=True, compute_uv=True, hermitian=False): diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 96ec793dded9..3b68ff964874 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -257,7 +257,7 @@ def _batched_inv(a, res_type): return a_h.reshape(orig_shape) -def _batched_solve(a, b, exec_q, res_usm_type, res_type): +def _batched_solve(a, b, exec_q, res_usm_type, res_type, new=False): """ _batched_solve(a, b, exec_q, res_usm_type, res_type) @@ -267,8 +267,6 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): """ - new = True - if not new: a_usm_arr = dpnp.get_usm_ndarray(a) b_usm_arr = dpnp.get_usm_ndarray(b) @@ -366,101 +364,76 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): out_v = out_v.reshape(orig_shape_b) return out_v - else: - a_shape = a.shape - b_shape = b.shape - - if a.ndim == b.ndim: - b = b.reshape(-1, b_shape[-2], b_shape[-1]) - else: - b = b.reshape(-1, b_shape[-1]) - - a = a.reshape(-1, a_shape[-2], a_shape[-1]) - - n_a = a.shape[1] - m,n = b.shape[-2:] - - a = dpnp.moveaxis(a,(-2,-1),(0,1)) - # a_f = dpnp.array(a, dtype=res_type, order="F", copy=True) - # a = dpnp.reshape(a,(n_a,n_a,-1)) - - if b.ndim > 2: - b = dpnp.moveaxis(b,(-2,-1),(0,1)) - else: - b = b.T - # b_f = dpnp.array(b, dtype=res_type, order="F", copy=True) - # if b.ndim > 2: - # # b = dpnp.reshape(b,(m,n,-1)) - # b = b - # else: - # # b = b.T - # b = b - - # a_usm_arr = dpnp.get_usm_ndarray(a) - # b_usm_arr = dpnp.get_usm_ndarray(b) - + a_shape = a.shape + b_shape = b.shape - # a = a.reshape(-1, a_shape[-2], a_shape[-1]) + # gesv_batch expects `a` to be a 3D array and + # `b` to be either a 2D or 3D array. + if a.ndim == b.ndim: + b = b.reshape(-1, b_shape[-2], b_shape[-1]) + else: + b = b.reshape(-1, b_shape[-1]) - # a = a.transpose((0, 2, 1)) - # a_usm_arr = dpnp.get_usm_ndarray(a) + a = a.reshape(-1, a_shape[-2], a_shape[-1]) - # if b.ndim > 2: - # b = b.transpose((0,2,1)) - # else: - # b = b - # b_usm_arr = dpnp.get_usm_ndarray(b) + # Reorder the elements by moving the last two axes of `a`` to the front + # to match fortran-like array order which is assumed by gesv. + a = dpnp.moveaxis(a, (-2, -1), (0, 1)) + # The same for `b` if it is 3D; + # if it is 2D, transpose it. + if b.ndim > 2: + b = dpnp.moveaxis(b, (-2, -1), (0, 1)) + else: + b = b.T - a_usm_arr = dpnp.get_usm_ndarray(a) - b_usm_arr = dpnp.get_usm_ndarray(b) + a_usm_arr = dpnp.get_usm_ndarray(a) + b_usm_arr = dpnp.get_usm_ndarray(b) - ht_list_ev = [] + ht_list_ev = [] - a_f = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type) + # oneMKL LAPACK gesv destroys `a` and assumes fortran-like array + # as input. + a_f = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type) - a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_usm_arr, - dst=a_f.get_array(), - sycl_queue=a.sycl_queue, - ) - ht_list_ev.append(a_ht_copy_ev) + a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, + dst=a_f.get_array(), + sycl_queue=exec_q, + ) + ht_list_ev.append(a_ht_copy_ev) - b_f = dpnp.empty_like( - b, order="F", dtype=res_type, usm_type=res_usm_type - ) + # oneMKL LAPACK gesv overwrites `b` and assumes fortran-like array + # as input. + b_f = dpnp.empty_like(b, order="F", dtype=res_type, usm_type=res_usm_type) - b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=b_usm_arr, - dst=b_f.get_array(), - sycl_queue=b.sycl_queue, - ) - ht_list_ev.append(b_ht_copy_ev) + b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=b_usm_arr, + dst=b_f.get_array(), + sycl_queue=exec_q, + ) + ht_list_ev.append(b_ht_copy_ev) - ht_lapack_ev, _ = li._gesv_batch( - exec_q, - a_f.get_array(), - b_f.get_array(), - depends=[a_copy_ev, b_copy_ev], - ) + ht_lapack_ev, _ = li._gesv_batch( + exec_q, + a_f.get_array(), + b_f.get_array(), + depends=[a_copy_ev, b_copy_ev], + ) - ht_list_ev.append(ht_lapack_ev) + ht_list_ev.append(ht_lapack_ev) - dpctl.SyclEvent.wait_for(ht_list_ev) + dpctl.SyclEvent.wait_for(ht_list_ev) - # if b.ndim > 2: - # v = b_copy.transpose(0,2,1).reshape(b_shape) - # else: - # v = b_copy.reshape(b_shape) - if b.ndim > 2: - v = dpnp.moveaxis(b_f, -1 , 0) - v = v.reshape(b_shape) - else: - v = dpnp.moveaxis(b_f,-1,0) - v = v.reshape(b_shape) + # Gesv call overwtires `b` in Fortran order, reorder the axes + # to match C order by moving the last axis to the front and + # reshape it back to the original shape of `b`. + v = dpnp.moveaxis(b_f, -1, 0).reshape(b_shape) - # v = dpnp.copy(dpnp.moveaxis(b_f, -1, 0), order="C").reshape(b_shape) + # dpnp.moveaxis can make the array non-contiguous if it is not 2D + if b.ndim > 2: + v = dpnp.ascontiguousarray(v) - return v + return v def _batched_qr(a, mode="reduced"): @@ -2543,7 +2516,7 @@ def dpnp_qr(a, mode="reduced"): return (q, r) -def dpnp_solve(a, b): +def dpnp_solve(a, b, new=False): """ dpnp_solve(a, b) @@ -2560,7 +2533,7 @@ def dpnp_solve(a, b): return dpnp.empty_like(b, dtype=res_type, usm_type=res_usm_type) if a.ndim > 2: - return _batched_solve(a, b, exec_q, res_usm_type, res_type) + return _batched_solve(a, b, exec_q, res_usm_type, res_type, new=new) a_usm_arr = dpnp.get_usm_ndarray(a) b_usm_arr = dpnp.get_usm_ndarray(b) From df4774ef67d7a803e2b2e2edde1d37bdac7475f0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 12 Jun 2024 14:37:12 +0200 Subject: [PATCH 08/37] Remove old impl in _batched_solve --- dpnp/linalg/dpnp_iface_linalg.py | 4 +- dpnp/linalg/dpnp_utils_linalg.py | 105 ++----------------------------- 2 files changed, 6 insertions(+), 103 deletions(-) diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index 8f535ba8b47e..72d79ad329d7 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -1093,7 +1093,7 @@ def qr(a, mode="reduced"): return dpnp_qr(a, mode) -def solve(a, b, new=False): +def solve(a, b): """ Solve a linear matrix equation, or system of linear scalar equations. @@ -1143,7 +1143,7 @@ def solve(a, b, new=False): "or (..., M, K)" ) - return dpnp_solve(a, b, new=new) + return dpnp_solve(a, b) def svd(a, full_matrices=True, compute_uv=True, hermitian=False): diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 3b68ff964874..4e176ce884f3 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -257,7 +257,7 @@ def _batched_inv(a, res_type): return a_h.reshape(orig_shape) -def _batched_solve(a, b, exec_q, res_usm_type, res_type, new=False): +def _batched_solve(a, b, exec_q, res_usm_type, res_type): """ _batched_solve(a, b, exec_q, res_usm_type, res_type) @@ -267,103 +267,6 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type, new=False): """ - if not new: - a_usm_arr = dpnp.get_usm_ndarray(a) - b_usm_arr = dpnp.get_usm_ndarray(b) - - b_order = "C" if b.flags.c_contiguous else "F" - a_shape = a.shape - b_shape = b.shape - - is_cpu_device = exec_q.sycl_device.has_aspect_cpu - reshape = False - orig_shape_b = b_shape - if a.ndim > 3: - # get 3d input arrays by reshape - if a.ndim == b.ndim: - b = b.reshape(-1, b_shape[-2], b_shape[-1]) - else: - b = b.reshape(-1, b_shape[-1]) - - a = a.reshape(-1, a_shape[-2], a_shape[-1]) - - a_usm_arr = dpnp.get_usm_ndarray(a) - b_usm_arr = dpnp.get_usm_ndarray(b) - reshape = True - - batch_size = a.shape[0] - - coeff_vecs = [None] * batch_size - val_vecs = [None] * batch_size - a_ht_copy_ev = [None] * batch_size - b_ht_copy_ev = [None] * batch_size - ht_lapack_ev = [None] * batch_size - - for i in range(batch_size): - # oneMKL LAPACK assumes fortran-like array as input, so allocate - # a memory with 'F' order for dpnp array of coefficient matrix - coeff_vecs[i] = dpnp.empty_like( - a[i], order="F", dtype=res_type, usm_type=res_usm_type - ) - - # use DPCTL tensor function to fill the coefficient matrix array - # with content from the input array - a_ht_copy_ev[i], a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_usm_arr[i], - dst=coeff_vecs[i].get_array(), - sycl_queue=a.sycl_queue, - ) - - # oneMKL LAPACK assumes fortran-like array as input, so - # allocate a memory with 'F' order for dpnp array of multiple - # dependent variables array - val_vecs[i] = dpnp.empty_like( - b[i], order="F", dtype=res_type, usm_type=res_usm_type - ) - - # use DPCTL tensor function to fill the array of multiple dependent - # variables with content from the input arrays - b_ht_copy_ev[i], b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=b_usm_arr[i], - dst=val_vecs[i].get_array(), - sycl_queue=b.sycl_queue, - ) - - # Call the LAPACK extension function _gesv to solve the system of - # linear equations using a portion of the coefficient square matrix - # and a corresponding portion of the dependent variables array. - ht_lapack_ev[i], _ = li._gesv( - exec_q, - coeff_vecs[i].get_array(), - val_vecs[i].get_array(), - depends=[a_copy_ev, b_copy_ev], - ) - - # TODO: Remove this w/a when MKLD-17201 is solved. - # Waiting for a host task executing an OneMKL LAPACK gesv call - # on CPU causes deadlock due to serialization of all host tasks - # in the queue. - # We need to wait for each host tasks before calling _gesv to avoid - # deadlock. - if is_cpu_device: - ht_lapack_ev[i].wait() - b_ht_copy_ev[i].wait() - - for i in range(batch_size): - ht_lapack_ev[i].wait() - b_ht_copy_ev[i].wait() - a_ht_copy_ev[i].wait() - - # combine the list of solutions into a single array - out_v = dpnp.array( - val_vecs, order=b_order, dtype=res_type, usm_type=res_usm_type - ) - if reshape: - # shape of the out_v must be equal to the shape of the array of - # dependent variables - out_v = out_v.reshape(orig_shape_b) - return out_v - a_shape = a.shape b_shape = b.shape @@ -376,7 +279,7 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type, new=False): a = a.reshape(-1, a_shape[-2], a_shape[-1]) - # Reorder the elements by moving the last two axes of `a`` to the front + # Reorder the elements by moving the last two axes of `a` to the front # to match fortran-like array order which is assumed by gesv. a = dpnp.moveaxis(a, (-2, -1), (0, 1)) # The same for `b` if it is 3D; @@ -2516,7 +2419,7 @@ def dpnp_qr(a, mode="reduced"): return (q, r) -def dpnp_solve(a, b, new=False): +def dpnp_solve(a, b): """ dpnp_solve(a, b) @@ -2533,7 +2436,7 @@ def dpnp_solve(a, b, new=False): return dpnp.empty_like(b, dtype=res_type, usm_type=res_usm_type) if a.ndim > 2: - return _batched_solve(a, b, exec_q, res_usm_type, res_type, new=new) + return _batched_solve(a, b, exec_q, res_usm_type, res_type) a_usm_arr = dpnp.get_usm_ndarray(a) b_usm_arr = dpnp.get_usm_ndarray(b) From 8fb2af3f6b1868fe23b9dd71fe63f976538d6c8f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 12 Jun 2024 15:19:24 +0200 Subject: [PATCH 09/37] Use py::gil_scoped_release before gesv call --- dpnp/backend/extensions/lapack/gesv.cpp | 47 ++++++++++++++----------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index de5d38cdfc00..b1ed4cfa3e8c 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -212,26 +212,26 @@ std::pair {coeff_matrix, dependent_vals})) { throw py::value_error( - "Execution queue is not compatible with allocation queues"); + "Execution queue is not compatible with allocation queues."); } auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); if (overlap(coeff_matrix, dependent_vals)) { throw py::value_error( "The arrays of coefficients and dependent variables " - "are overlapping segments of memory"); + "are overlapping segments of memory."); } bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); if (!is_coeff_matrix_f_contig) { throw py::value_error("The coefficient matrix " - "must be F-contiguous"); + "must be F-contiguous."); } bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); if (!is_dependent_vals_f_contig) { throw py::value_error("The array of dependent variables " - "must be F-contiguous"); + "must be F-contiguous."); } auto array_types = dpctl_td_ns::usm_ndarray_types(); @@ -242,7 +242,7 @@ std::pair if (coeff_matrix_type_id != dependent_vals_type_id) { throw py::value_error("The types of the coefficient matrix and " - "dependent variables are mismatched"); + "dependent variables are mismatched."); } gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id]; @@ -300,8 +300,7 @@ std::pair // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or - // (n,batch_size) respectively. - + // (n,batch_size) respectively if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { throw py::value_error("The coefficient matrix must be square," " but got a shape of (" + @@ -342,26 +341,26 @@ std::pair {coeff_matrix, dependent_vals})) { throw py::value_error( - "Execution queue is not compatible with allocation queues"); + "Execution queue is not compatible with allocation queues."); } auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); if (overlap(coeff_matrix, dependent_vals)) { throw py::value_error( "The arrays of coefficients and dependent variables " - "are overlapping segments of memory"); + "are overlapping segments of memory."); } bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); if (!is_coeff_matrix_f_contig) { throw py::value_error("The coefficient matrix " - "must be F-contiguous"); + "must be F-contiguous."); } bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); if (!is_dependent_vals_f_contig) { throw py::value_error("The array of dependent variables " - "must be F-contiguous"); + "must be F-contiguous."); } auto array_types = dpctl_td_ns::usm_ndarray_types(); @@ -372,7 +371,7 @@ std::pair if (coeff_matrix_type_id != dependent_vals_type_id) { throw py::value_error("The types of the coefficient matrix and " - "dependent variables are mismatched"); + "dependent variables are mismatched."); } gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id]; @@ -402,17 +401,23 @@ std::pair host_task_events.reserve(batch_size); gesv_task_events.reserve(batch_size); - for (std::int64_t i = 0; i < batch_size; ++i) { - char *coeff_matrix_batch = - coeff_matrix_data + i * n * n * coeff_matrix_elemsize; - char *dependent_vals_batch = - dependent_vals_data + i * n * nrhs * dependent_vals_elemsize; + { + // Release GIL to allow other Python threads to run during the loop + // as the operations in the loop do not require GIL + py::gil_scoped_release release; + + for (std::int64_t i = 0; i < batch_size; ++i) { + char *coeff_matrix_batch = + coeff_matrix_data + i * n * n * coeff_matrix_elemsize; + char *dependent_vals_batch = + dependent_vals_data + i * n * nrhs * dependent_vals_elemsize; - sycl::event gesv_ev = - gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, - dependent_vals_batch, ldb, host_task_events, depends); + sycl::event gesv_ev = + gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, + dependent_vals_batch, ldb, host_task_events, depends); - gesv_task_events.push_back(gesv_ev); + gesv_task_events.push_back(gesv_ev); + } } sycl::event combine_ev = exec_q.submit( From ddcf9fef8e1a1cbdaad8f4e0caa77013d6edcc4a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 12 Jun 2024 15:27:30 +0200 Subject: [PATCH 10/37] Remove junk files --- perf.py | 22 ----------------- test.py | 75 --------------------------------------------------------- 2 files changed, 97 deletions(-) delete mode 100644 perf.py delete mode 100644 test.py diff --git a/perf.py b/perf.py deleted file mode 100644 index 20de9e824e66..000000000000 --- a/perf.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy -import dpnp -from tests.helper import generate_random_numpy_array -import cProfile, pstats, io - -a = generate_random_numpy_array((256,512,512),dtype='f4',hermitian=False, seed_value=81) -b = generate_random_numpy_array((256,512,),dtype='f4',hermitian=False, seed_value=76) - -a_dp = dpnp.array(a,device='cpu') -b_dp = dpnp.array(b,device='cpu') - -cold_run = dpnp.linalg.solve(a_dp, b_dp) -pr = cProfile.Profile() -pr.enable() -# res = numpy.linalg.solve(a, b) -res = dpnp.linalg.solve(a_dp, b_dp) -pr.disable() -s = io.StringIO() -sortBy = pstats.SortKey.CUMULATIVE -ps = pstats.Stats(pr, stream=s).sort_stats(sortBy) -ps.print_stats() -print(s.getvalue()) diff --git a/test.py b/test.py deleted file mode 100644 index fc22fc3309c8..000000000000 --- a/test.py +++ /dev/null @@ -1,75 +0,0 @@ -import numpy - -import dpnp -import dpnp.linalg -from tests.helper import generate_random_numpy_array - -# a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4') -a = numpy.array([[[1,2],[3,5]],[[1,2],[3,5]],[[1,2],[3,5]]], dtype='f4') -b = numpy.array([[1,2],[1,2],[1,2]], dtype='f4') -# a = numpy.array([[1,2],[2,3]],dtype='f4') -# a = numpy.array([[[2,3],[1,2]]],dtype='f4') -# a = numpy.array([[[1,2],[2,3]]],dtype='f4') -# a = generate_random_numpy_array( -# (2, 3, 3), dtype='f4', hermitian=False, seed_value=81 -# ) - -# b = generate_random_numpy_array( -# (2, 3, 2), dtype='f4', hermitian=False, seed_value=76 -# ) -# a = numpy.array( -# [ -# [[1, 0, 3], [0, 5, 0], [7, 0, 9]], -# [[3, 0, 3], [0, 7, 0], [7, 0, 11]], -# ], -# dtype='f4', -# ) -a_dp = dpnp.array(a) -b_dp = dpnp.array(b) - -# a_f = numpy.array(a,order="F") -# a_dp_f = dpnp.array(a_dp,order="F") - -# res = numpy.linalg.eigh(a) -# res_dp = dpnp.linalg.eigh(a_dp) -# res = numpy.linalg.eigh(a_f) -# res_dp = dpnp.linalg.eigh(a_dp_f) -res = numpy.linalg.solve(a,b) -res_dp = dpnp.linalg.solve(a_dp,b_dp) -# res = numpy.linalg.eigvalsh(a) -# res_dp = dpnp.linalg.eigvalsh(a_dp) - - -print("VALS: ") -print("NUMPY: ") -print(res) -print("DPNP: ") -print(res_dp) - -# print("BEFORE") -# print(a_dp) - -# print("VALS: ") -# print("NUMPY: ") -# print(res[0]) -# print("DPNP: ") -# print(res_dp[0]) - - -# print("VECS: ") -# print("NUMPY: ") -# print(res[1]) -# print("DPNP: ") -# print(res_dp[1]) - -# print("AFTER") -# print(a_dp) - - -# a = generate_random_numpy_array((10,3,3),dtype=numpy.float32, hermitian=True, seed_value=81) -# a_dp = dpnp.array(a, device='gpu') - -# res = numpy.linalg.eigh(a) -# res_dp = dpnp.linalg.eigh(a_dp) - -# print("Done") From 262794f55aba80c0a4520bd54c50d033cf896fc0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 13 Jun 2024 16:02:14 +0200 Subject: [PATCH 11/37] Move gesv_batch to gesv_batch.cpp --- dpnp/backend/extensions/lapack/CMakeLists.txt | 1 + dpnp/backend/extensions/lapack/gesv.cpp | 156 -------- dpnp/backend/extensions/lapack/gesv.hpp | 1 + dpnp/backend/extensions/lapack/gesv_batch.cpp | 357 ++++++++++++++++++ dpnp/backend/extensions/lapack/lapack_py.cpp | 1 + 5 files changed, 360 insertions(+), 156 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/gesv_batch.cpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index c25ef1d97bcb..7c71a5a37eb1 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -30,6 +30,7 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/geqrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/geqrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/gesv_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gesvd.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index b1ed4cfa3e8c..ea31b39d5119 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -273,162 +273,6 @@ std::pair return std::make_pair(args_ev, gesv_ev); } -std::pair - gesv_batch(sycl::queue &exec_q, - dpctl::tensor::usm_ndarray coeff_matrix, - dpctl::tensor::usm_ndarray dependent_vals, - const std::vector &depends) -{ - const int coeff_matrix_nd = coeff_matrix.get_ndim(); - const int dependent_vals_nd = dependent_vals.get_ndim(); - - if (coeff_matrix_nd != 3) { - throw py::value_error("The coefficient matrix has ndim=" + - std::to_string(coeff_matrix_nd) + - ", but a 3-dimensional array is expected."); - } - - if (dependent_vals_nd < 2 || dependent_vals_nd > 3) { - throw py::value_error( - "The dependent values array has ndim=" + - std::to_string(dependent_vals_nd) + - ", but a 2-dimensional or 3-dimensional array is expected."); - } - - const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); - const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); - - // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays - // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or - // (n,batch_size) respectively - if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { - throw py::value_error("The coefficient matrix must be square," - " but got a shape of (" + - std::to_string(coeff_matrix_shape[0]) + ", " + - std::to_string(coeff_matrix_shape[1]) + ")."); - } - - if (coeff_matrix_shape[0] != dependent_vals_shape[0]) { - throw py::value_error("The first dimension (n) of coeff_matrix and" - " dependent_vals must be the same, but got " + - std::to_string(coeff_matrix_shape[0]) + " and " + - std::to_string(dependent_vals_shape[0]) + "."); - } - - if (dependent_vals_nd == 2) { - if (coeff_matrix_shape[2] != dependent_vals_shape[1]) { - throw py::value_error( - "The batch_size of " - " coeff_matrix and dependent_vals must be" - " the same, but got " + - std::to_string(coeff_matrix_shape[2]) + " and " + - std::to_string(dependent_vals_shape[1]) + "."); - } - } - else if (dependent_vals_nd == 3) { - if (coeff_matrix_shape[2] != dependent_vals_shape[2]) { - throw py::value_error( - "The batch_size of " - " coeff_matrix and dependent_vals must be" - " the same, but got " + - std::to_string(coeff_matrix_shape[2]) + " and " + - std::to_string(dependent_vals_shape[2]) + "."); - } - } - - // check compatibility of execution queue and allocation queue - if (!dpctl::utils::queues_are_compatible(exec_q, - {coeff_matrix, dependent_vals})) - { - throw py::value_error( - "Execution queue is not compatible with allocation queues."); - } - - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(coeff_matrix, dependent_vals)) { - throw py::value_error( - "The arrays of coefficients and dependent variables " - "are overlapping segments of memory."); - } - - bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); - if (!is_coeff_matrix_f_contig) { - throw py::value_error("The coefficient matrix " - "must be F-contiguous."); - } - - bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); - if (!is_dependent_vals_f_contig) { - throw py::value_error("The array of dependent variables " - "must be F-contiguous."); - } - - auto array_types = dpctl_td_ns::usm_ndarray_types(); - int coeff_matrix_type_id = - array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); - int dependent_vals_type_id = - array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); - - if (coeff_matrix_type_id != dependent_vals_type_id) { - throw py::value_error("The types of the coefficient matrix and " - "dependent variables are mismatched."); - } - - gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id]; - if (gesv_fn == nullptr) { - throw py::value_error( - "No gesv implementation defined for the provided type " - "of the coefficient matrix."); - } - - char *coeff_matrix_data = coeff_matrix.get_data(); - char *dependent_vals_data = dependent_vals.get_data(); - - const std::int64_t batch_size = coeff_matrix_shape[2]; - const std::int64_t n = coeff_matrix_shape[1]; - const std::int64_t nrhs = - (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1; - - const std::int64_t lda = std::max(1UL, n); - const std::int64_t ldb = std::max(1UL, n); - - int coeff_matrix_elemsize = coeff_matrix.get_elemsize(); - int dependent_vals_elemsize = dependent_vals.get_elemsize(); - - std::vector host_task_events; - std::vector gesv_task_events; - - host_task_events.reserve(batch_size); - gesv_task_events.reserve(batch_size); - - { - // Release GIL to allow other Python threads to run during the loop - // as the operations in the loop do not require GIL - py::gil_scoped_release release; - - for (std::int64_t i = 0; i < batch_size; ++i) { - char *coeff_matrix_batch = - coeff_matrix_data + i * n * n * coeff_matrix_elemsize; - char *dependent_vals_batch = - dependent_vals_data + i * n * nrhs * dependent_vals_elemsize; - - sycl::event gesv_ev = - gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, - dependent_vals_batch, ldb, host_task_events, depends); - - gesv_task_events.push_back(gesv_ev); - } - } - - sycl::event combine_ev = exec_q.submit( - [&](sycl::handler &cgh) { cgh.depends_on(gesv_task_events); }); - - sycl::event args_ev = dpctl::utils::keep_args_alive( - exec_q, {coeff_matrix, dependent_vals}, host_task_events); - - return std::make_pair(args_ev, combine_ev); -} - template struct GesvContigFactory { diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp index c9dfa1a3de50..9b7230f1b18a 100644 --- a/dpnp/backend/extensions/lapack/gesv.hpp +++ b/dpnp/backend/extensions/lapack/gesv.hpp @@ -51,6 +51,7 @@ extern std::pair const std::vector &depends); extern void init_gesv_dispatch_vector(void); +extern void init_gesv_batch_dispatch_vector(void); } // namespace lapack } // namespace ext } // namespace backend diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp new file mode 100644 index 000000000000..0a4aa14b2e5e --- /dev/null +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -0,0 +1,357 @@ +//***************************************************************************** +// Copyright (c) 2023-2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "common_helpers.hpp" +#include "gesv.hpp" +#include "linalg_exceptions.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*gesv_batch_impl_fn_ptr_t)(sycl::queue &, + const std::int64_t, + const std::int64_t, + const std::int64_t, + char *, + std::int64_t, + char *, + std::int64_t, + const std::vector &); + +static gesv_batch_impl_fn_ptr_t gesv_batch_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event gesv_batch_impl(sycl::queue &exec_q, + const std::int64_t n, + const std::int64_t nrhs, + const std::int64_t batch_size, + char *in_a, + std::int64_t lda, + char *in_b, + std::int64_t ldb, + const std::vector &depends) +{ + // py::gil_scoped_release release; + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *b = reinterpret_cast(in_b); + + std::int64_t a_size = n * n; + std::int64_t b_size = n * nrhs; + + const std::int64_t scratchpad_size = + mkl_lapack::gesv_scratchpad_size(exec_q, n, nrhs, lda, ldb); + T *scratchpad = nullptr; + + if (scratchpad_size > 0) { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + if (!scratchpad) + throw std::runtime_error("Device allocation for scratchpad failed"); + } + + std::int64_t *ipiv = sycl::malloc_device(n, exec_q); + if (!ipiv) + throw std::runtime_error("Device allocation for ipiv failed"); + + std::vector gesv_task_events; + gesv_task_events.reserve(batch_size); + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event gesv_event; + + for (std::int64_t batch_id = 0; batch_id < batch_size; ++batch_id) { + T *a_batch = a + batch_id * a_size; + T *b_batch = b + batch_id * b_size; + + try { + gesv_event = mkl_lapack::gesv( + exec_q, + n, // The order of the square matrix A + // and the number of rows in matrix B (0 ≤ n). + nrhs, // The number of right-hand sides, + // i.e., the number of columns in matrix B (0 ≤ nrhs). + a_batch, // Pointer to the square coefficient matrix A (n x n). + lda, // The leading dimension of a, must be at least max(1, n). + ipiv, // The pivot indices that define the permutation matrix P; + // row i of the matrix was interchanged with row ipiv(i), + // must be at least max(1, n). + b_batch, // Pointer to the right hand side matrix B (n x nrhs). + ldb, // The leading dimension of matrix B, + // must be at least max(1, n). + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. Required size is at least " + << e.detail(); + } + else if (info > 0) { + T host_U; + exec_q.memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T)) + .wait(); + + using ThresholdType = typename helper::value_type_of::type; + + const auto threshold = + std::numeric_limits::epsilon() * 100; + if (std::abs(host_U) < threshold) { + sycl::free(scratchpad, exec_q); + throw LinAlgError("The input coefficient matrix is singular."); + } + else { + error_msg << "Unexpected MKL exception caught during gesv() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } + else { + error_msg << "Unexpected MKL exception caught during gesv() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg << "Unexpected SYCL exception caught during gesv() call:\n" + << e.what(); + } + + gesv_task_events.push_back(gesv_event); + + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + if (ipiv != nullptr) { + sycl::free(ipiv, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) { + for(const auto &el : gesv_task_events) { + cgh.depends_on(el); + } + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad, ipiv]() { + sycl::free(scratchpad, ctx); + sycl::free(ipiv, ctx); + }); + }); + + return ht_ev; +} + + +std::pair + gesv_batch(sycl::queue &exec_q, + dpctl::tensor::usm_ndarray coeff_matrix, + dpctl::tensor::usm_ndarray dependent_vals, + const std::vector &depends) +{ + const int coeff_matrix_nd = coeff_matrix.get_ndim(); + const int dependent_vals_nd = dependent_vals.get_ndim(); + + if (coeff_matrix_nd != 3) { + throw py::value_error("The coefficient matrix has ndim=" + + std::to_string(coeff_matrix_nd) + + ", but a 3-dimensional array is expected."); + } + + if (dependent_vals_nd < 2 || dependent_vals_nd > 3) { + throw py::value_error( + "The dependent values array has ndim=" + + std::to_string(dependent_vals_nd) + + ", but a 2-dimensional or 3-dimensional array is expected."); + } + + const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); + const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); + + // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays + // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or + // (n,batch_size) respectively + if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { + throw py::value_error("The coefficient matrix must be square," + " but got a shape of (" + + std::to_string(coeff_matrix_shape[0]) + ", " + + std::to_string(coeff_matrix_shape[1]) + ")."); + } + + if (coeff_matrix_shape[0] != dependent_vals_shape[0]) { + throw py::value_error("The first dimension (n) of coeff_matrix and" + " dependent_vals must be the same, but got " + + std::to_string(coeff_matrix_shape[0]) + " and " + + std::to_string(dependent_vals_shape[0]) + "."); + } + + if (dependent_vals_nd == 2) { + if (coeff_matrix_shape[2] != dependent_vals_shape[1]) { + throw py::value_error( + "The batch_size of " + " coeff_matrix and dependent_vals must be" + " the same, but got " + + std::to_string(coeff_matrix_shape[2]) + " and " + + std::to_string(dependent_vals_shape[1]) + "."); + } + } + else if (dependent_vals_nd == 3) { + if (coeff_matrix_shape[2] != dependent_vals_shape[2]) { + throw py::value_error( + "The batch_size of " + " coeff_matrix and dependent_vals must be" + " the same, but got " + + std::to_string(coeff_matrix_shape[2]) + " and " + + std::to_string(dependent_vals_shape[2]) + "."); + } + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, + {coeff_matrix, dependent_vals})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues."); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(coeff_matrix, dependent_vals)) { + throw py::value_error( + "The arrays of coefficients and dependent variables " + "are overlapping segments of memory."); + } + + bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); + if (!is_coeff_matrix_f_contig) { + throw py::value_error("The coefficient matrix " + "must be F-contiguous."); + } + + bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); + if (!is_dependent_vals_f_contig) { + throw py::value_error("The array of dependent variables " + "must be F-contiguous."); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int coeff_matrix_type_id = + array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); + int dependent_vals_type_id = + array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); + + if (coeff_matrix_type_id != dependent_vals_type_id) { + throw py::value_error("The types of the coefficient matrix and " + "dependent variables are mismatched."); + } + + gesv_batch_impl_fn_ptr_t gesv_batch_fn = gesv_batch_dispatch_vector[coeff_matrix_type_id]; + if (gesv_batch_fn == nullptr) { + throw py::value_error( + "No gesv implementation defined for the provided type " + "of the coefficient matrix."); + } + + char *coeff_matrix_data = coeff_matrix.get_data(); + char *dependent_vals_data = dependent_vals.get_data(); + + const std::int64_t batch_size = coeff_matrix_shape[2]; + const std::int64_t n = coeff_matrix_shape[1]; + const std::int64_t nrhs = + (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1; + + const std::int64_t lda = std::max(1UL, n); + const std::int64_t ldb = std::max(1UL, n); + + sycl::event gesv_ev; + + gesv_ev = gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, lda, + dependent_vals_data, ldb, depends); + + + sycl::event ht_ev = + dpctl::utils::keep_args_alive(exec_q, {coeff_matrix, dependent_vals}, {gesv_ev}); + + return std::make_pair(ht_ev, gesv_ev); + +} + +template +struct GesvBatchContigFactory +{ + fnT get() + { + if constexpr (types::GesvTypePairSupportFactory::is_defined) { + return gesv_batch_impl; + } + else { + return nullptr; + } + } +}; + +void init_gesv_batch_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(gesv_batch_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index 863d1ffaea47..aba27224ca82 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -51,6 +51,7 @@ void init_dispatch_vectors(void) { lapack_ext::init_geqrf_batch_dispatch_vector(); lapack_ext::init_geqrf_dispatch_vector(); + lapack_ext::init_gesv_batch_dispatch_vector(); lapack_ext::init_gesv_dispatch_vector(); lapack_ext::init_getrf_batch_dispatch_vector(); lapack_ext::init_getrf_dispatch_vector(); From 3a7b8ca6eef121aaa9ba030d064700a9aaab8327 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 13 Jun 2024 18:01:41 +0200 Subject: [PATCH 12/37] Improve gesv_batch with independent linear streams --- .../extensions/lapack/common_helpers.hpp | 8 + dpnp/backend/extensions/lapack/gesv_batch.cpp | 175 +++++++++++------- 2 files changed, 114 insertions(+), 69 deletions(-) diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp index 3e840ddf7f39..036d0b9c7328 100644 --- a/dpnp/backend/extensions/lapack/common_helpers.hpp +++ b/dpnp/backend/extensions/lapack/common_helpers.hpp @@ -48,6 +48,14 @@ struct value_type_of> { using type = T; }; + +// Rounds up the number `value` to the nearest multiple of `mult`. +template +intT round_up_mult(intT value, intT mult) +{ + intT q = (value + (mult - 1)) / mult; + return q * mult; +} } // namespace helper } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 0a4aa14b2e5e..f8341f353486 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -48,17 +48,19 @@ namespace mkl_lapack = oneapi::mkl::lapack; namespace py = pybind11; namespace type_utils = dpctl::tensor::type_utils; -typedef sycl::event (*gesv_batch_impl_fn_ptr_t)(sycl::queue &, - const std::int64_t, - const std::int64_t, - const std::int64_t, - char *, - std::int64_t, - char *, - std::int64_t, - const std::vector &); - -static gesv_batch_impl_fn_ptr_t gesv_batch_dispatch_vector[dpctl_td_ns::num_types]; +typedef sycl::event (*gesv_batch_impl_fn_ptr_t)( + sycl::queue &, + const std::int64_t, + const std::int64_t, + const std::int64_t, + char *, + std::int64_t, + char *, + std::int64_t, + const std::vector &); + +static gesv_batch_impl_fn_ptr_t + gesv_batch_dispatch_vector[dpctl_td_ns::num_types]; template static sycl::event gesv_batch_impl(sycl::queue &exec_q, @@ -71,7 +73,6 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, std::int64_t ldb, const std::vector &depends) { - // py::gil_scoped_release release; type_utils::validate_type_for_device(exec_q); T *a = reinterpret_cast(in_a); @@ -80,67 +81,101 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, std::int64_t a_size = n * n; std::int64_t b_size = n * nrhs; + // Get the number of independent linear streams + std::int64_t n_linear_streams = + (batch_size > 16) ? 4 : ((batch_size > 4 ? 2 : 1)); + const std::int64_t scratchpad_size = mkl_lapack::gesv_scratchpad_size(exec_q, n, nrhs, lda, ldb); T *scratchpad = nullptr; + // Get padding size to ensure memory allocations are aligned to 256 bytes + // for better performance + std::int64_t padding = 256 / sizeof(T); + if (scratchpad_size > 0) { - scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + // Calculate the total scratchpad memory size needed for all linear + // streams with proper alignment + size_t alloc_scratch_size = + helper::round_up_mult(n_linear_streams * scratchpad_size, padding); + + // Allocate memory for the total scratchpad + scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); if (!scratchpad) throw std::runtime_error("Device allocation for scratchpad failed"); } - std::int64_t *ipiv = sycl::malloc_device(n, exec_q); + // Calculate the total size needed for the pivot indices array for all + // linear streams with proper alignment + size_t alloc_ipiv_size = + helper::round_up_mult(n_linear_streams * n, padding); + + // Allocate memory for the total pivot indices array + std::int64_t *ipiv = + sycl::malloc_device(alloc_ipiv_size, exec_q); if (!ipiv) throw std::runtime_error("Device allocation for ipiv failed"); - std::vector gesv_task_events; - gesv_task_events.reserve(batch_size); + // Computation events to manage dependencies for each linear stream + std::vector> comp_evs(n_linear_streams, depends); std::stringstream error_msg; std::int64_t info = 0; bool is_exception_caught = false; - sycl::event gesv_event; - for (std::int64_t batch_id = 0; batch_id < batch_size; ++batch_id) { T *a_batch = a + batch_id * a_size; T *b_batch = b + batch_id * b_size; + std::int64_t stream_id = (batch_id % n_linear_streams); + + T *current_scratch_gesv = nullptr; + if (scratchpad != nullptr) { + current_scratch_gesv = scratchpad + stream_id * scratchpad_size; + } + std::int64_t *current_ipiv = ipiv + stream_id * n; + + // Get the event dependencies for the current stream + const auto ¤t_dep = comp_evs[stream_id]; + + sycl::event gesv_event; + try { gesv_event = mkl_lapack::gesv( exec_q, n, // The order of the square matrix A - // and the number of rows in matrix B (0 ≤ n). + // and the number of rows in matrix B (0 ≤ n). nrhs, // The number of right-hand sides, - // i.e., the number of columns in matrix B (0 ≤ nrhs). - a_batch, // Pointer to the square coefficient matrix A (n x n). - lda, // The leading dimension of a, must be at least max(1, n). - ipiv, // The pivot indices that define the permutation matrix P; - // row i of the matrix was interchanged with row ipiv(i), - // must be at least max(1, n). - b_batch, // Pointer to the right hand side matrix B (n x nrhs). - ldb, // The leading dimension of matrix B, - // must be at least max(1, n). - scratchpad, // Pointer to scratchpad memory to be used by MKL - // routine for storing intermediate results. - scratchpad_size, depends); + // i.e., the number of columns in matrix B (0 ≤ nrhs). + a_batch, // Pointer to the square coefficient matrix A (n x n). + lda, // The leading dimension of a, must be at least max(1, n). + current_ipiv, // The pivot indices that define the permutation + // matrix P; row i of the matrix was interchanged + // with row ipiv(i), must be at least max(1, n). + b_batch, // Pointer to the right hand side matrix B (n x nrhs). + ldb, // The leading dimension of matrix B, + // must be at least max(1, n). + current_scratch_gesv, // Pointer to scratchpad memory to be used + // by MKL routine for storing intermediate + // results. + scratchpad_size, current_dep); } catch (mkl_lapack::exception const &e) { is_exception_caught = true; info = e.info(); if (info < 0) { error_msg << "Parameter number " << -info - << " had an illegal value."; + << " had an illegal value."; } else if (info == scratchpad_size && e.detail() != 0) { - error_msg - << "Insufficient scratchpad size. Required size is at least " - << e.detail(); + error_msg << "Insufficient scratchpad size. Required size is " + "at least " + << e.detail(); } else if (info > 0) { T host_U; - exec_q.memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T)) + exec_q + .memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T)) .wait(); using ThresholdType = typename helper::value_type_of::type; @@ -149,44 +184,48 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, std::numeric_limits::epsilon() * 100; if (std::abs(host_U) < threshold) { sycl::free(scratchpad, exec_q); - throw LinAlgError("The input coefficient matrix is singular."); + sycl::free(ipiv, exec_q); + throw LinAlgError( + "The input coefficient matrix is singular."); } else { - error_msg << "Unexpected MKL exception caught during gesv() " - "call:\nreason: " - << e.what() << "\ninfo: " << e.info(); + error_msg + << "Unexpected MKL exception caught during gesv() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); } } else { error_msg << "Unexpected MKL exception caught during gesv() " - "call:\nreason: " - << e.what() << "\ninfo: " << e.info(); + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); } } catch (sycl::exception const &e) { is_exception_caught = true; - error_msg << "Unexpected SYCL exception caught during gesv() call:\n" - << e.what(); + error_msg + << "Unexpected SYCL exception caught during gesv() call:\n" + << e.what(); } - gesv_task_events.push_back(gesv_event); - + // Update the event dependencies for the current stream + comp_evs[stream_id] = {gesv_event}; } if (is_exception_caught) // an unexpected error occurs - { - if (scratchpad != nullptr) { - sycl::free(scratchpad, exec_q); - } - if (ipiv != nullptr) { - sycl::free(ipiv, exec_q); - } - throw std::runtime_error(error_msg.str()); + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + if (ipiv != nullptr) { + sycl::free(ipiv, exec_q); } + throw std::runtime_error(error_msg.str()); + } sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) { - for(const auto &el : gesv_task_events) { - cgh.depends_on(el); - } + for (const auto &ev : comp_evs) { + cgh.depends_on(ev); + } auto ctx = exec_q.get_context(); cgh.host_task([ctx, scratchpad, ipiv]() { sycl::free(scratchpad, ctx); @@ -197,7 +236,6 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, return ht_ev; } - std::pair gesv_batch(sycl::queue &exec_q, dpctl::tensor::usm_ndarray coeff_matrix, @@ -261,7 +299,7 @@ std::pair } } - // check compatibility of execution queue and allocation queue + // Check compatibility of execution queue and allocation queue if (!dpctl::utils::queues_are_compatible(exec_q, {coeff_matrix, dependent_vals})) { @@ -299,7 +337,8 @@ std::pair "dependent variables are mismatched."); } - gesv_batch_impl_fn_ptr_t gesv_batch_fn = gesv_batch_dispatch_vector[coeff_matrix_type_id]; + gesv_batch_impl_fn_ptr_t gesv_batch_fn = + gesv_batch_dispatch_vector[coeff_matrix_type_id]; if (gesv_batch_fn == nullptr) { throw py::value_error( "No gesv implementation defined for the provided type " @@ -317,17 +356,14 @@ std::pair const std::int64_t lda = std::max(1UL, n); const std::int64_t ldb = std::max(1UL, n); - sycl::event gesv_ev; + sycl::event gesv_ev = + gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, lda, + dependent_vals_data, ldb, depends); - gesv_ev = gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, lda, - dependent_vals_data, ldb, depends); - - - sycl::event ht_ev = - dpctl::utils::keep_args_alive(exec_q, {coeff_matrix, dependent_vals}, {gesv_ev}); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {coeff_matrix, dependent_vals}, {gesv_ev}); return std::make_pair(ht_ev, gesv_ev); - } template @@ -346,7 +382,8 @@ struct GesvBatchContigFactory void init_gesv_batch_dispatch_vector(void) { - dpctl_td_ns::DispatchVectorBuilder contig; contig.populate_dispatch_vector(gesv_batch_dispatch_vector); From 2016a8cdad9f9991a8cb7ad1977101b0e02648b9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 13 Jun 2024 21:53:57 +0200 Subject: [PATCH 13/37] Extend checks for gesv/gesv_batch --- dpnp/backend/extensions/lapack/gesv.cpp | 15 +++++++++++++++ dpnp/backend/extensions/lapack/gesv_batch.cpp | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index ea31b39d5119..5094da17282e 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -27,6 +27,7 @@ // dpctl tensor headers #include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" #include "utils/type_utils.hpp" #include "common_helpers.hpp" @@ -207,6 +208,17 @@ std::pair std::to_string(coeff_matrix_shape[1]) + ")."); } + size_t src_nelems(1); + + for (int i = 0; i < dependent_vals_nd; ++i) { + src_nelems *= static_cast(dependent_vals_shape[i]); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + // check compatibility of execution queue and allocation queue if (!dpctl::utils::queues_are_compatible(exec_q, {coeff_matrix, dependent_vals})) @@ -222,6 +234,9 @@ std::pair "are overlapping segments of memory."); } + dpctl::tensor::validation::CheckWritable::throw_if_not_writable( + dependent_vals); + bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); if (!is_coeff_matrix_f_contig) { throw py::value_error("The coefficient matrix " diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index f8341f353486..262eefbad712 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -27,6 +27,7 @@ // dpctl tensor headers #include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" #include "utils/type_utils.hpp" #include "common_helpers.hpp" @@ -299,6 +300,17 @@ std::pair } } + size_t src_nelems(1); + + for (int i = 0; i < dependent_vals_nd; ++i) { + src_nelems *= static_cast(dependent_vals_shape[i]); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + // Check compatibility of execution queue and allocation queue if (!dpctl::utils::queues_are_compatible(exec_q, {coeff_matrix, dependent_vals})) @@ -314,6 +326,9 @@ std::pair "are overlapping segments of memory."); } + dpctl::tensor::validation::CheckWritable::throw_if_not_writable( + dependent_vals); + bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); if (!is_coeff_matrix_f_contig) { throw py::value_error("The coefficient matrix " From 2c422905f6607440ae8d98449fb655fdae54ea5a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 13 Jun 2024 22:00:40 +0200 Subject: [PATCH 14/37] Update comment --- dpnp/linalg/dpnp_utils_linalg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 4e176ce884f3..c8b50b3dc421 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -333,6 +333,7 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): v = dpnp.moveaxis(b_f, -1, 0).reshape(b_shape) # dpnp.moveaxis can make the array non-contiguous if it is not 2D + # Convert to contiguous to align with NumPy if b.ndim > 2: v = dpnp.ascontiguousarray(v) From e030da8d303cfb8c8c396a6f59581fa4bf64f7b5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 14 Jun 2024 12:03:55 +0200 Subject: [PATCH 15/37] junk files --- num.py | 51 +++++++++++++++++++++++++++++++++++++++ qwe.py | 11 +++++++++ test.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+) create mode 100644 num.py create mode 100644 qwe.py create mode 100644 test.py diff --git a/num.py b/num.py new file mode 100644 index 000000000000..d5467b1cbf55 --- /dev/null +++ b/num.py @@ -0,0 +1,51 @@ +import numpy as np +import ctypes + +# Размеры и параметры для примера +batch_size = 2 +n = 3 +nrhs = 2 +coeff_matrix_elemsize = ctypes.sizeof(ctypes.c_double) +dependent_vals_elemsize = ctypes.sizeof(ctypes.c_double) + +# Создаем примерные массивы NumPy +coeff_matrix = np.arange(batch_size * n * n, dtype=np.double).reshape(batch_size, n, n) +dependent_vals = np.arange(batch_size * n * nrhs, dtype=np.double).reshape(batch_size, n, nrhs) + +# my logic +coeff_matrix = coeff_matrix.transpose((0, 2, 1)) +dependent_vals = dependent_vals.transpose((0, 2, 1)) + +coeff_matrix = np.array(coeff_matrix, order="C") +dependent_vals = np.array(dependent_vals, order="C") + +# # Sasha`s logic +# coeff_matrix = np.moveaxis(coeff_matrix, (-2, -1), (0, 1)) +# dependent_vals = np.moveaxis(dependent_vals, (-2, -1), (0, 1)) + +# coeff_matrix = np.array(coeff_matrix, order="F") +# dependent_vals = np.array(dependent_vals, order="F") + +# get pointer +coeff_matrix_data = coeff_matrix.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) +dependent_vals_data = dependent_vals.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) + +# Функция для получения значений по указателю с учетом смещения и размера элемента +def get_values_from_pointer(ptr, step, count, elemsize): + result = [] + for i in range(count): + result.append(ptr[i * step // elemsize]) + return result + + +for i in range(batch_size): + coeff_matrix_batch = ctypes.cast(ctypes.addressof(coeff_matrix_data.contents) + i * n * n * coeff_matrix_elemsize, ctypes.POINTER(ctypes.c_double)) + dependent_vals_batch = ctypes.cast(ctypes.addressof(dependent_vals_data.contents) + i * n * nrhs * dependent_vals_elemsize, ctypes.POINTER(ctypes.c_double)) + + # get value by new pointer + coeff_values = get_values_from_pointer(coeff_matrix_batch, coeff_matrix_elemsize, n * n, coeff_matrix_elemsize) + dependent_values = get_values_from_pointer(dependent_vals_batch, dependent_vals_elemsize, n * nrhs, dependent_vals_elemsize) + + print(f"Batch {i + 1}:") + print("Coefficients:", coeff_values) + print("Dependent values:", dependent_values) diff --git a/qwe.py b/qwe.py new file mode 100644 index 000000000000..e373bab0d566 --- /dev/null +++ b/qwe.py @@ -0,0 +1,11 @@ +import dpnp +import numpy +from tests.helper import generate_random_numpy_array + +a = dpnp.array(generate_random_numpy_array((2,3,3,3,3),dtype='f4',hermitian=False, seed_value=81)) +b = dpnp.array(generate_random_numpy_array((2,3,3,3),dtype='f4',hermitian=False, seed_value=76)) + +res = dpnp.linalg.solve(a,b,new=True) + +print(res.shape) +print(res.flags) diff --git a/test.py b/test.py new file mode 100644 index 000000000000..37218883cd43 --- /dev/null +++ b/test.py @@ -0,0 +1,75 @@ +import numpy + +import dpnp +import dpnp.linalg +from tests.helper import generate_random_numpy_array + +# a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4') +# a = numpy.array([[[1,2],[3,5]],[[1,2],[3,5]],[[1,2],[3,5]]], dtype='f4') +# b = numpy.array([[1,2],[1,2],[1,2]], dtype='f4') +# a = numpy.array([[1,2],[2,3]],dtype='f4') +# a = numpy.array([[[2,3],[1,2]]],dtype='f4') +# a = numpy.array([[[1,2],[2,3]]],dtype='f4') +a = generate_random_numpy_array( + (2, 3, 3), dtype='c8', hermitian=False, seed_value=81 +) + +b = generate_random_numpy_array( + (2, 3, 3), dtype='c8', hermitian=False, seed_value=76 +) +# a = numpy.array( +# [ +# [[1, 0, 3], [0, 5, 0], [7, 0, 9]], +# [[3, 0, 3], [0, 7, 0], [7, 0, 11]], +# ], +# dtype='f4', +# ) +a_dp = dpnp.array(a) +b_dp = dpnp.array(b) + +# a_f = numpy.array(a,order="F") +# a_dp_f = dpnp.array(a_dp,order="F") + +# res = numpy.linalg.eigh(a) +# res_dp = dpnp.linalg.eigh(a_dp) +# res = numpy.linalg.eigh(a_f) +# res_dp = dpnp.linalg.eigh(a_dp_f) +res = numpy.linalg.solve(a,b) +res_dp = dpnp.linalg.solve(a_dp,b_dp) +# res = numpy.linalg.eigvalsh(a) +# res_dp = dpnp.linalg.eigvalsh(a_dp) + + +print("VALS: ") +print("NUMPY: ") +print(res) +print("DPNP: ") +print(res_dp) + +# print("BEFORE") +# print(a_dp) + +# print("VALS: ") +# print("NUMPY: ") +# print(res[0]) +# print("DPNP: ") +# print(res_dp[0]) + + +# print("VECS: ") +# print("NUMPY: ") +# print(res[1]) +# print("DPNP: ") +# print(res_dp[1]) + +# print("AFTER") +# print(a_dp) + + +# a = generate_random_numpy_array((10,3,3),dtype=numpy.float32, hermitian=True, seed_value=81) +# a_dp = dpnp.array(a, device='gpu') + +# res = numpy.linalg.eigh(a) +# res_dp = dpnp.linalg.eigh(a_dp) + +# print("Done") From 5a48f333d7eda4e6dfc0922f2d98a187d7405cc1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 12 Jul 2024 14:34:02 +0200 Subject: [PATCH 16/37] Add common_gesv_checks --- dpnp/backend/extensions/lapack/gesv.cpp | 155 +++++++++++------- dpnp/backend/extensions/lapack/gesv.hpp | 9 + dpnp/backend/extensions/lapack/gesv_batch.cpp | 72 +------- 3 files changed, 113 insertions(+), 123 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 5094da17282e..1462ee51659a 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -49,6 +49,97 @@ namespace mkl_lapack = oneapi::mkl::lapack; namespace py = pybind11; namespace type_utils = dpctl::tensor::type_utils; +void common_gesv_checks(sycl::queue &exec_q, + dpctl::tensor::usm_ndarray coeff_matrix, + dpctl::tensor::usm_ndarray dependent_vals, + const py::ssize_t *coeff_matrix_shape, + const py::ssize_t *dependent_vals_shape, + const int expected_coeff_matrix_ndim, + const int min_dependent_vals_ndim, + const int max_dependent_vals_ndim) +{ + const int coeff_matrix_nd = coeff_matrix.get_ndim(); + const int dependent_vals_nd = dependent_vals.get_ndim(); + + if (coeff_matrix_nd != expected_coeff_matrix_ndim) { + throw py::value_error("The coefficient matrix has ndim=" + + std::to_string(coeff_matrix_nd) + ", but a " + + std::to_string(expected_coeff_matrix_ndim) + + "-dimensional array is expected."); + } + + if (dependent_vals_nd < min_dependent_vals_ndim || + dependent_vals_nd > max_dependent_vals_ndim) + { + throw py::value_error("The dependent values array has ndim=" + + std::to_string(dependent_vals_nd) + ", but a " + + std::to_string(min_dependent_vals_ndim) + + "-dimensional or a " + + std::to_string(max_dependent_vals_ndim) + + "-dimensional array is expected."); + } + + // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays + // for gesv + // with the shapes (n,n) and (n,nrhs) or (n,) respectively; + // for gesv_batch + // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or + // (n,batch_size) respectively + if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { + throw py::value_error("The coefficient matrix must be square," + " but got a shape of (" + + std::to_string(coeff_matrix_shape[0]) + ", " + + std::to_string(coeff_matrix_shape[1]) + ")."); + } + if (coeff_matrix_shape[0] != dependent_vals_shape[0]) { + throw py::value_error("The first dimension (n) of coeff_matrix and" + " dependent_vals must be the same, but got " + + std::to_string(coeff_matrix_shape[0]) + " and " + + std::to_string(dependent_vals_shape[0]) + "."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, + {coeff_matrix, dependent_vals})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues."); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(coeff_matrix, dependent_vals)) { + throw py::value_error( + "The arrays of coefficients and dependent variables " + "are overlapping segments of memory."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable( + dependent_vals); + + bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); + if (!is_coeff_matrix_f_contig) { + throw py::value_error("The coefficient matrix " + "must be F-contiguous."); + } + + bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); + if (!is_dependent_vals_f_contig) { + throw py::value_error("The array of dependent variables " + "must be F-contiguous."); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int coeff_matrix_type_id = + array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); + int dependent_vals_type_id = + array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); + + if (coeff_matrix_type_id != dependent_vals_type_id) { + throw py::value_error("The types of the coefficient matrix and " + "dependent variables are mismatched."); + } +} + typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &, const std::int64_t, const std::int64_t, @@ -182,31 +273,18 @@ std::pair dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends) { - const int coeff_matrix_nd = coeff_matrix.get_ndim(); const int dependent_vals_nd = dependent_vals.get_ndim(); - if (coeff_matrix_nd != 2) { - throw py::value_error("The coefficient matrix has ndim=" + - std::to_string(coeff_matrix_nd) + - ", but a 2-dimensional array is expected."); - } - - if (dependent_vals_nd > 2) { - throw py::value_error( - "The dependent values array has ndim=" + - std::to_string(dependent_vals_nd) + - ", but a 1-dimensional or a 2-dimensional array is expected."); - } - const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); - if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { - throw py::value_error("The coefficient matrix must be square," - " but got a shape of (" + - std::to_string(coeff_matrix_shape[0]) + ", " + - std::to_string(coeff_matrix_shape[1]) + ")."); - } + const int expected_coeff_matrix_ndim = 2; + const int min_dependent_vals_ndim = 1; + const int max_dependent_vals_ndim = 2; + + common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape, + dependent_vals_shape, expected_coeff_matrix_ndim, + min_dependent_vals_ndim, max_dependent_vals_ndim); size_t src_nelems(1); @@ -219,46 +297,9 @@ std::pair return std::make_pair(sycl::event(), sycl::event()); } - // check compatibility of execution queue and allocation queue - if (!dpctl::utils::queues_are_compatible(exec_q, - {coeff_matrix, dependent_vals})) - { - throw py::value_error( - "Execution queue is not compatible with allocation queues."); - } - - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(coeff_matrix, dependent_vals)) { - throw py::value_error( - "The arrays of coefficients and dependent variables " - "are overlapping segments of memory."); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable( - dependent_vals); - - bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); - if (!is_coeff_matrix_f_contig) { - throw py::value_error("The coefficient matrix " - "must be F-contiguous."); - } - - bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); - if (!is_dependent_vals_f_contig) { - throw py::value_error("The array of dependent variables " - "must be F-contiguous."); - } - auto array_types = dpctl_td_ns::usm_ndarray_types(); int coeff_matrix_type_id = array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); - int dependent_vals_type_id = - array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); - - if (coeff_matrix_type_id != dependent_vals_type_id) { - throw py::value_error("The types of the coefficient matrix and " - "dependent variables are mismatched."); - } gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id]; if (gesv_fn == nullptr) { diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp index ab90022af1ee..e4ceebfc1db6 100644 --- a/dpnp/backend/extensions/lapack/gesv.hpp +++ b/dpnp/backend/extensions/lapack/gesv.hpp @@ -50,6 +50,15 @@ extern std::pair dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends); +extern void common_gesv_checks(sycl::queue &exec_q, + dpctl::tensor::usm_ndarray coeff_matrix, + dpctl::tensor::usm_ndarray dependent_vals, + const py::ssize_t *coeff_matrix_shape, + const py::ssize_t *dependent_vals_shape, + const int expected_coeff_matrix_ndim, + const int min_dependent_vals_ndim, + const int max_dependent_vals_ndim); + extern void init_gesv_dispatch_vector(void); extern void init_gesv_batch_dispatch_vector(void); } // namespace lapack diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 262eefbad712..11d3442e891d 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -243,41 +243,18 @@ std::pair dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends) { - const int coeff_matrix_nd = coeff_matrix.get_ndim(); const int dependent_vals_nd = dependent_vals.get_ndim(); - if (coeff_matrix_nd != 3) { - throw py::value_error("The coefficient matrix has ndim=" + - std::to_string(coeff_matrix_nd) + - ", but a 3-dimensional array is expected."); - } - - if (dependent_vals_nd < 2 || dependent_vals_nd > 3) { - throw py::value_error( - "The dependent values array has ndim=" + - std::to_string(dependent_vals_nd) + - ", but a 2-dimensional or 3-dimensional array is expected."); - } - const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); - // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays - // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or - // (n,batch_size) respectively - if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { - throw py::value_error("The coefficient matrix must be square," - " but got a shape of (" + - std::to_string(coeff_matrix_shape[0]) + ", " + - std::to_string(coeff_matrix_shape[1]) + ")."); - } + const int expected_coeff_matrix_ndim = 3; + const int min_dependent_vals_ndim = 2; + const int max_dependent_vals_ndim = 3; - if (coeff_matrix_shape[0] != dependent_vals_shape[0]) { - throw py::value_error("The first dimension (n) of coeff_matrix and" - " dependent_vals must be the same, but got " + - std::to_string(coeff_matrix_shape[0]) + " and " + - std::to_string(dependent_vals_shape[0]) + "."); - } + common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape, + dependent_vals_shape, expected_coeff_matrix_ndim, + min_dependent_vals_ndim, max_dependent_vals_ndim); if (dependent_vals_nd == 2) { if (coeff_matrix_shape[2] != dependent_vals_shape[1]) { @@ -311,46 +288,9 @@ std::pair return std::make_pair(sycl::event(), sycl::event()); } - // Check compatibility of execution queue and allocation queue - if (!dpctl::utils::queues_are_compatible(exec_q, - {coeff_matrix, dependent_vals})) - { - throw py::value_error( - "Execution queue is not compatible with allocation queues."); - } - - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(coeff_matrix, dependent_vals)) { - throw py::value_error( - "The arrays of coefficients and dependent variables " - "are overlapping segments of memory."); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable( - dependent_vals); - - bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); - if (!is_coeff_matrix_f_contig) { - throw py::value_error("The coefficient matrix " - "must be F-contiguous."); - } - - bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); - if (!is_dependent_vals_f_contig) { - throw py::value_error("The array of dependent variables " - "must be F-contiguous."); - } - auto array_types = dpctl_td_ns::usm_ndarray_types(); int coeff_matrix_type_id = array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); - int dependent_vals_type_id = - array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); - - if (coeff_matrix_type_id != dependent_vals_type_id) { - throw py::value_error("The types of the coefficient matrix and " - "dependent variables are mismatched."); - } gesv_batch_impl_fn_ptr_t gesv_batch_fn = gesv_batch_dispatch_vector[coeff_matrix_type_id]; From 924fee73d917fb9a0705c1498da2c9ef1f9b9a62 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 12 Jul 2024 14:53:49 +0200 Subject: [PATCH 17/37] Release GIL in gesv_batch_impl --- dpnp/backend/extensions/lapack/gesv_batch.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 11d3442e891d..abdc9ffe1a7f 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -124,6 +124,10 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, std::int64_t info = 0; bool is_exception_caught = false; + // Release GIL to avoid serialization of host task + // submissions to the same queue in OneMKL + py::gil_scoped_release release; + for (std::int64_t batch_id = 0; batch_id < batch_size; ++batch_id) { T *a_batch = a + batch_id * a_size; T *b_batch = b + batch_id * b_size; From 2b15e6c7fb763d93a71e95cce857594921167926 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 12 Jul 2024 14:54:18 +0200 Subject: [PATCH 18/37] Remove junk file --- test.py | 75 --------------------------------------------------------- 1 file changed, 75 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 37218883cd43..000000000000 --- a/test.py +++ /dev/null @@ -1,75 +0,0 @@ -import numpy - -import dpnp -import dpnp.linalg -from tests.helper import generate_random_numpy_array - -# a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4') -# a = numpy.array([[[1,2],[3,5]],[[1,2],[3,5]],[[1,2],[3,5]]], dtype='f4') -# b = numpy.array([[1,2],[1,2],[1,2]], dtype='f4') -# a = numpy.array([[1,2],[2,3]],dtype='f4') -# a = numpy.array([[[2,3],[1,2]]],dtype='f4') -# a = numpy.array([[[1,2],[2,3]]],dtype='f4') -a = generate_random_numpy_array( - (2, 3, 3), dtype='c8', hermitian=False, seed_value=81 -) - -b = generate_random_numpy_array( - (2, 3, 3), dtype='c8', hermitian=False, seed_value=76 -) -# a = numpy.array( -# [ -# [[1, 0, 3], [0, 5, 0], [7, 0, 9]], -# [[3, 0, 3], [0, 7, 0], [7, 0, 11]], -# ], -# dtype='f4', -# ) -a_dp = dpnp.array(a) -b_dp = dpnp.array(b) - -# a_f = numpy.array(a,order="F") -# a_dp_f = dpnp.array(a_dp,order="F") - -# res = numpy.linalg.eigh(a) -# res_dp = dpnp.linalg.eigh(a_dp) -# res = numpy.linalg.eigh(a_f) -# res_dp = dpnp.linalg.eigh(a_dp_f) -res = numpy.linalg.solve(a,b) -res_dp = dpnp.linalg.solve(a_dp,b_dp) -# res = numpy.linalg.eigvalsh(a) -# res_dp = dpnp.linalg.eigvalsh(a_dp) - - -print("VALS: ") -print("NUMPY: ") -print(res) -print("DPNP: ") -print(res_dp) - -# print("BEFORE") -# print(a_dp) - -# print("VALS: ") -# print("NUMPY: ") -# print(res[0]) -# print("DPNP: ") -# print(res_dp[0]) - - -# print("VECS: ") -# print("NUMPY: ") -# print(res[1]) -# print("DPNP: ") -# print(res_dp[1]) - -# print("AFTER") -# print(a_dp) - - -# a = generate_random_numpy_array((10,3,3),dtype=numpy.float32, hermitian=True, seed_value=81) -# a_dp = dpnp.array(a, device='gpu') - -# res = numpy.linalg.eigh(a) -# res_dp = dpnp.linalg.eigh(a_dp) - -# print("Done") From afca80398e4444fd7fb437a9e6d7c4bc1874d8f2 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 16 Jul 2024 12:05:08 +0200 Subject: [PATCH 19/37] Remove junk files --- num.py | 51 --------------------------------------------------- qwe.py | 11 ----------- 2 files changed, 62 deletions(-) delete mode 100644 num.py delete mode 100644 qwe.py diff --git a/num.py b/num.py deleted file mode 100644 index d5467b1cbf55..000000000000 --- a/num.py +++ /dev/null @@ -1,51 +0,0 @@ -import numpy as np -import ctypes - -# Размеры и параметры для примера -batch_size = 2 -n = 3 -nrhs = 2 -coeff_matrix_elemsize = ctypes.sizeof(ctypes.c_double) -dependent_vals_elemsize = ctypes.sizeof(ctypes.c_double) - -# Создаем примерные массивы NumPy -coeff_matrix = np.arange(batch_size * n * n, dtype=np.double).reshape(batch_size, n, n) -dependent_vals = np.arange(batch_size * n * nrhs, dtype=np.double).reshape(batch_size, n, nrhs) - -# my logic -coeff_matrix = coeff_matrix.transpose((0, 2, 1)) -dependent_vals = dependent_vals.transpose((0, 2, 1)) - -coeff_matrix = np.array(coeff_matrix, order="C") -dependent_vals = np.array(dependent_vals, order="C") - -# # Sasha`s logic -# coeff_matrix = np.moveaxis(coeff_matrix, (-2, -1), (0, 1)) -# dependent_vals = np.moveaxis(dependent_vals, (-2, -1), (0, 1)) - -# coeff_matrix = np.array(coeff_matrix, order="F") -# dependent_vals = np.array(dependent_vals, order="F") - -# get pointer -coeff_matrix_data = coeff_matrix.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) -dependent_vals_data = dependent_vals.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) - -# Функция для получения значений по указателю с учетом смещения и размера элемента -def get_values_from_pointer(ptr, step, count, elemsize): - result = [] - for i in range(count): - result.append(ptr[i * step // elemsize]) - return result - - -for i in range(batch_size): - coeff_matrix_batch = ctypes.cast(ctypes.addressof(coeff_matrix_data.contents) + i * n * n * coeff_matrix_elemsize, ctypes.POINTER(ctypes.c_double)) - dependent_vals_batch = ctypes.cast(ctypes.addressof(dependent_vals_data.contents) + i * n * nrhs * dependent_vals_elemsize, ctypes.POINTER(ctypes.c_double)) - - # get value by new pointer - coeff_values = get_values_from_pointer(coeff_matrix_batch, coeff_matrix_elemsize, n * n, coeff_matrix_elemsize) - dependent_values = get_values_from_pointer(dependent_vals_batch, dependent_vals_elemsize, n * nrhs, dependent_vals_elemsize) - - print(f"Batch {i + 1}:") - print("Coefficients:", coeff_values) - print("Dependent values:", dependent_values) diff --git a/qwe.py b/qwe.py deleted file mode 100644 index e373bab0d566..000000000000 --- a/qwe.py +++ /dev/null @@ -1,11 +0,0 @@ -import dpnp -import numpy -from tests.helper import generate_random_numpy_array - -a = dpnp.array(generate_random_numpy_array((2,3,3,3,3),dtype='f4',hermitian=False, seed_value=81)) -b = dpnp.array(generate_random_numpy_array((2,3,3,3),dtype='f4',hermitian=False, seed_value=76)) - -res = dpnp.linalg.solve(a,b,new=True) - -print(res.shape) -print(res.flags) From e5b53a18f8fea74ff5ccb128bf7c6426fbeb3974 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 26 Jul 2024 14:27:05 +0200 Subject: [PATCH 20/37] Remove host_task_events from gesv --- dpnp/backend/extensions/lapack/gesv.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index b1a35cd1f416..2da4de5b625a 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -141,7 +141,6 @@ typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &, std::int64_t, char *, std::int64_t, - std::vector &, const std::vector &); static gesv_impl_fn_ptr_t gesv_dispatch_vector[dpctl_td_ns::num_types]; @@ -154,7 +153,6 @@ static sycl::event gesv_impl(sycl::queue &exec_q, std::int64_t lda, char *in_b, std::int64_t ldb, - std::vector &host_task_events, const std::vector &depends) { type_utils::validate_type_for_device(exec_q); @@ -248,7 +246,7 @@ static sycl::event gesv_impl(sycl::queue &exec_q, throw std::runtime_error(error_msg.str()); } - sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(gesv_event); auto ctx = exec_q.get_context(); cgh.host_task([ctx, scratchpad, ipiv]() { @@ -256,9 +254,8 @@ static sycl::event gesv_impl(sycl::queue &exec_q, sycl::free(ipiv, ctx); }); }); - host_task_events.push_back(clean_up_event); - return gesv_event; + return ht_ev; } std::pair @@ -312,15 +309,13 @@ std::pair const std::int64_t lda = std::max(1UL, n); const std::int64_t ldb = std::max(1UL, n); - std::vector host_task_events; - sycl::event gesv_ev = - gesv_fn(exec_q, n, nrhs, coeff_matrix_data, lda, dependent_vals_data, - ldb, host_task_events, depends); + sycl::event gesv_ev = gesv_fn(exec_q, n, nrhs, coeff_matrix_data, lda, + dependent_vals_data, ldb, depends); - sycl::event args_ev = dpctl::utils::keep_args_alive( - exec_q, {coeff_matrix, dependent_vals}, host_task_events); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {coeff_matrix, dependent_vals}, {gesv_ev}); - return std::make_pair(args_ev, gesv_ev); + return std::make_pair(ht_ev, gesv_ev); } template From d5adbd65f23e7363fb5ae11631210a12ec07cde9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 26 Jul 2024 15:03:09 +0200 Subject: [PATCH 21/37] Use check_zeros_shape in gesv and gesv_batch --- dpnp/backend/extensions/lapack/gesv.cpp | 13 ++++++------ dpnp/backend/extensions/lapack/gesv_batch.cpp | 21 +++++++++---------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 2da4de5b625a..bfaa4d5657a1 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -264,6 +264,7 @@ std::pair dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends) { + const int coeff_matrix_nd = coeff_matrix.get_ndim(); const int dependent_vals_nd = dependent_vals.get_ndim(); const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); @@ -277,13 +278,11 @@ std::pair dependent_vals_shape, expected_coeff_matrix_ndim, min_dependent_vals_ndim, max_dependent_vals_ndim); - size_t src_nelems(1); - - for (int i = 0; i < dependent_vals_nd; ++i) { - src_nelems *= static_cast(dependent_vals_shape[i]); - } - - if (src_nelems == 0) { + // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty + // events + if (helper::check_zeros_shape(coeff_matrix_nd, coeff_matrix_shape) || + helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape)) + { // nothing to do return std::make_pair(sycl::event(), sycl::event()); } diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 22c6c8df6114..5013389dfb31 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -241,6 +241,7 @@ std::pair dpctl::tensor::usm_ndarray dependent_vals, const std::vector &depends) { + const int coeff_matrix_nd = coeff_matrix.get_ndim(); const int dependent_vals_nd = dependent_vals.get_ndim(); const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); @@ -254,6 +255,15 @@ std::pair dependent_vals_shape, expected_coeff_matrix_ndim, min_dependent_vals_ndim, max_dependent_vals_ndim); + // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty + // events + if (helper::check_zeros_shape(coeff_matrix_nd, coeff_matrix_shape) || + helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape)) + { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + if (dependent_vals_nd == 2) { if (coeff_matrix_shape[2] != dependent_vals_shape[1]) { throw py::value_error( @@ -275,17 +285,6 @@ std::pair } } - size_t src_nelems(1); - - for (int i = 0; i < dependent_vals_nd; ++i) { - src_nelems *= static_cast(dependent_vals_shape[i]); - } - - if (src_nelems == 0) { - // nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - auto array_types = dpctl_td_ns::usm_ndarray_types(); int coeff_matrix_type_id = array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); From 5b2780c68555d7dc1ce426216909b536092b571f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 26 Jul 2024 15:16:46 +0200 Subject: [PATCH 22/37] Add additional checks for gesv_impl --- dpnp/backend/extensions/lapack/gesv.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index bfaa4d5657a1..a3d22eed38ec 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -162,9 +162,25 @@ static sycl::event gesv_impl(sycl::queue &exec_q, const std::int64_t scratchpad_size = mkl_lapack::gesv_scratchpad_size(exec_q, n, nrhs, lda, ldb); + + if (scratchpad_size <= 0) { + throw std::runtime_error( + "Invalid scratchpad size: must be greater than zero." + "Calculated scratchpad size: " + + std::to_string(scratchpad_size)); + } + T *scratchpad = nullptr; + // Allocate memory for the scratchpad + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + if (!scratchpad) + throw std::runtime_error("Device allocation for scratchpad failed"); std::int64_t *ipiv = nullptr; + // Allocate memory for the ipiv + ipiv = sycl::malloc_device(n, exec_q); + if (!ipiv) + throw std::runtime_error("Device allocation for ipiv failed"); std::stringstream error_msg; std::int64_t info = 0; @@ -172,9 +188,6 @@ static sycl::event gesv_impl(sycl::queue &exec_q, sycl::event gesv_event; try { - scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - ipiv = sycl::malloc_device(n, exec_q); - gesv_event = mkl_lapack::gesv( exec_q, n, // The order of the square matrix A From d4547d41beef163dca51775ffd229f88e5f9ca18 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 26 Jul 2024 15:30:01 +0200 Subject: [PATCH 23/37] Move alloc_scratchpad to common_helpers.hpp --- .../extensions/lapack/common_helpers.hpp | 36 ++++++++++++++++++- .../extensions/lapack/evd_batch_common.hpp | 30 ---------------- .../backend/extensions/lapack/heevd_batch.cpp | 2 +- .../backend/extensions/lapack/syevd_batch.cpp | 2 +- 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp index df2b87028e48..e1812c9ad415 100644 --- a/dpnp/backend/extensions/lapack/common_helpers.hpp +++ b/dpnp/backend/extensions/lapack/common_helpers.hpp @@ -24,9 +24,11 @@ //***************************************************************************** #pragma once +#include +#include + #include #include -#include #include namespace dpnp::extensions::lapack::helper @@ -63,4 +65,36 @@ inline bool check_zeros_shape(int ndim, const py::ssize_t *shape) } return src_nelems == 0; } + +// Allocate the total scratchpad memory with proper alignment for batch +// implementations +template +inline T *alloc_scratchpad(std::int64_t scratchpad_size, + std::int64_t n_linear_streams, + sycl::queue &exec_q) +{ + // Get padding size to ensure memory allocations are aligned to 256 bytes + // for better performance + const std::int64_t padding = 256 / sizeof(T); + + if (scratchpad_size <= 0) { + throw std::runtime_error( + "Invalid scratchpad size: must be greater than zero." + " Calculated scratchpad size: " + + std::to_string(scratchpad_size)); + } + + // Calculate the total scratchpad memory size needed for all linear + // streams with proper alignment + const size_t alloc_scratch_size = + round_up_mult(n_linear_streams * scratchpad_size, padding); + + // Allocate memory for the total scratchpad + T *scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); + if (!scratchpad) { + throw std::runtime_error("Device allocation for scratchpad failed"); + } + + return scratchpad; +} } // namespace dpnp::extensions::lapack::helper diff --git a/dpnp/backend/extensions/lapack/evd_batch_common.hpp b/dpnp/backend/extensions/lapack/evd_batch_common.hpp index 5a7d985ff821..3fa5f98214a4 100644 --- a/dpnp/backend/extensions/lapack/evd_batch_common.hpp +++ b/dpnp/backend/extensions/lapack/evd_batch_common.hpp @@ -119,34 +119,4 @@ std::pair return std::make_pair(ht_ev, evd_batch_ev); } - -template -inline T *alloc_scratchpad(std::int64_t scratchpad_size, - std::int64_t n_linear_streams, - sycl::queue &exec_q) -{ - // Get padding size to ensure memory allocations are aligned to 256 bytes - // for better performance - const std::int64_t padding = 256 / sizeof(T); - - if (scratchpad_size <= 0) { - throw std::runtime_error( - "Invalid scratchpad size: must be greater than zero." - " Calculated scratchpad size: " + - std::to_string(scratchpad_size)); - } - - // Calculate the total scratchpad memory size needed for all linear - // streams with proper alignment - const size_t alloc_scratch_size = - helper::round_up_mult(n_linear_streams * scratchpad_size, padding); - - // Allocate memory for the total scratchpad - T *scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); - if (!scratchpad) { - throw std::runtime_error("Device allocation for scratchpad failed"); - } - - return scratchpad; -} } // namespace dpnp::extensions::lapack::evd diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp index bc21b3da35d5..2ea37e22d7d9 100644 --- a/dpnp/backend/extensions/lapack/heevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp @@ -66,7 +66,7 @@ static sycl::event heevd_batch_impl(sycl::queue &exec_q, mkl_lapack::heevd_scratchpad_size(exec_q, jobz, upper_lower, n, lda); T *scratchpad = - evd::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); + helper::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); // Computation events to manage dependencies for each linear stream std::vector> comp_evs(n_linear_streams, depends); diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp index d2b87fc260cc..aa5656e0ac3c 100644 --- a/dpnp/backend/extensions/lapack/syevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp @@ -66,7 +66,7 @@ static sycl::event syevd_batch_impl(sycl::queue &exec_q, mkl_lapack::syevd_scratchpad_size(exec_q, jobz, upper_lower, n, lda); T *scratchpad = - evd::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); + helper::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); // Computation events to manage dependencies for each linear stream std::vector> comp_evs(n_linear_streams, depends); From 67591649cc3812b20c92a5fd13a9e319bd766764 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 26 Jul 2024 15:35:24 +0200 Subject: [PATCH 24/37] Use helper::alloc_scratchpad in gesv_batch_impl --- dpnp/backend/extensions/lapack/gesv_batch.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 5013389dfb31..197049cd0b3a 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -82,23 +82,13 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, const std::int64_t scratchpad_size = mkl_lapack::gesv_scratchpad_size(exec_q, n, nrhs, lda, ldb); - T *scratchpad = nullptr; + + T *scratchpad = + helper::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); // Get padding size to ensure memory allocations are aligned to 256 bytes // for better performance - std::int64_t padding = 256 / sizeof(T); - - if (scratchpad_size > 0) { - // Calculate the total scratchpad memory size needed for all linear - // streams with proper alignment - size_t alloc_scratch_size = - helper::round_up_mult(n_linear_streams * scratchpad_size, padding); - - // Allocate memory for the total scratchpad - scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); - if (!scratchpad) - throw std::runtime_error("Device allocation for scratchpad failed"); - } + const std::int64_t padding = 256 / sizeof(T); // Calculate the total size needed for the pivot indices array for all // linear streams with proper alignment From f37ec435583137600b4bfd91ff1d25de1d3822f9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 26 Jul 2024 15:39:22 +0200 Subject: [PATCH 25/37] Remove current_scratch_gesv check --- dpnp/backend/extensions/lapack/gesv_batch.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 197049cd0b3a..42e9d857701b 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -118,10 +118,7 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, std::int64_t stream_id = (batch_id % n_linear_streams); - T *current_scratch_gesv = nullptr; - if (scratchpad != nullptr) { - current_scratch_gesv = scratchpad + stream_id * scratchpad_size; - } + T *current_scratch_gesv = scratchpad + stream_id * scratchpad_size; std::int64_t *current_ipiv = ipiv + stream_id * n; // Get the event dependencies for the current stream From adc17ba78f7260609d618c347b7e8233709f886e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 26 Jul 2024 15:46:44 +0200 Subject: [PATCH 26/37] Remove lda, ldb pass to gesv_batch_impl, gesv_impl --- dpnp/backend/extensions/lapack/gesv.cpp | 14 +++++--------- dpnp/backend/extensions/lapack/gesv_batch.cpp | 14 +++++--------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index a3d22eed38ec..c215769d270f 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -138,9 +138,7 @@ typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &, const std::int64_t, const std::int64_t, char *, - std::int64_t, char *, - std::int64_t, const std::vector &); static gesv_impl_fn_ptr_t gesv_dispatch_vector[dpctl_td_ns::num_types]; @@ -150,9 +148,7 @@ static sycl::event gesv_impl(sycl::queue &exec_q, const std::int64_t n, const std::int64_t nrhs, char *in_a, - std::int64_t lda, char *in_b, - std::int64_t ldb, const std::vector &depends) { type_utils::validate_type_for_device(exec_q); @@ -160,6 +156,9 @@ static sycl::event gesv_impl(sycl::queue &exec_q, T *a = reinterpret_cast(in_a); T *b = reinterpret_cast(in_b); + const std::int64_t lda = std::max(1UL, n); + const std::int64_t ldb = std::max(1UL, n); + const std::int64_t scratchpad_size = mkl_lapack::gesv_scratchpad_size(exec_q, n, nrhs, lda, ldb); @@ -318,11 +317,8 @@ std::pair const std::int64_t nrhs = (dependent_vals_nd > 1) ? dependent_vals_shape[1] : 1; - const std::int64_t lda = std::max(1UL, n); - const std::int64_t ldb = std::max(1UL, n); - - sycl::event gesv_ev = gesv_fn(exec_q, n, nrhs, coeff_matrix_data, lda, - dependent_vals_data, ldb, depends); + sycl::event gesv_ev = gesv_fn(exec_q, n, nrhs, coeff_matrix_data, + dependent_vals_data, depends); sycl::event ht_ev = dpctl::utils::keep_args_alive( exec_q, {coeff_matrix, dependent_vals}, {gesv_ev}); diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 42e9d857701b..f5e47389b4b9 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -49,9 +49,7 @@ typedef sycl::event (*gesv_batch_impl_fn_ptr_t)( const std::int64_t, const std::int64_t, char *, - std::int64_t, char *, - std::int64_t, const std::vector &); static gesv_batch_impl_fn_ptr_t @@ -63,9 +61,7 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, const std::int64_t nrhs, const std::int64_t batch_size, char *in_a, - std::int64_t lda, char *in_b, - std::int64_t ldb, const std::vector &depends) { type_utils::validate_type_for_device(exec_q); @@ -76,6 +72,9 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, std::int64_t a_size = n * n; std::int64_t b_size = n * nrhs; + const std::int64_t lda = std::max(1UL, n); + const std::int64_t ldb = std::max(1UL, n); + // Get the number of independent linear streams std::int64_t n_linear_streams = (batch_size > 16) ? 4 : ((batch_size > 4 ? 2 : 1)); @@ -292,12 +291,9 @@ std::pair const std::int64_t nrhs = (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1; - const std::int64_t lda = std::max(1UL, n); - const std::int64_t ldb = std::max(1UL, n); - sycl::event gesv_ev = - gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, lda, - dependent_vals_data, ldb, depends); + gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, + dependent_vals_data, depends); sycl::event ht_ev = dpctl::utils::keep_args_alive( exec_q, {coeff_matrix, dependent_vals}, {gesv_ev}); From 77ba0e2375eb5a494aff498729aea47945017dbb Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 26 Jul 2024 15:55:32 +0200 Subject: [PATCH 27/37] Use const and constexpr in gesv/gesv_batch --- dpnp/backend/extensions/lapack/gesv.cpp | 16 ++++++++-------- dpnp/backend/extensions/lapack/gesv_batch.cpp | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index c215769d270f..52b2384cc040 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -110,22 +110,22 @@ void common_gesv_checks(sycl::queue &exec_q, dpctl::tensor::validation::CheckWritable::throw_if_not_writable( dependent_vals); - bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); + const bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); if (!is_coeff_matrix_f_contig) { throw py::value_error("The coefficient matrix " "must be F-contiguous."); } - bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); + const bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); if (!is_dependent_vals_f_contig) { throw py::value_error("The array of dependent variables " "must be F-contiguous."); } auto array_types = dpctl_td_ns::usm_ndarray_types(); - int coeff_matrix_type_id = + const int coeff_matrix_type_id = array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); - int dependent_vals_type_id = + const int dependent_vals_type_id = array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); if (coeff_matrix_type_id != dependent_vals_type_id) { @@ -282,9 +282,9 @@ std::pair const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); - const int expected_coeff_matrix_ndim = 2; - const int min_dependent_vals_ndim = 1; - const int max_dependent_vals_ndim = 2; + constexpr int expected_coeff_matrix_ndim = 2; + constexpr int min_dependent_vals_ndim = 1; + constexpr int max_dependent_vals_ndim = 2; common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape, dependent_vals_shape, expected_coeff_matrix_ndim, @@ -300,7 +300,7 @@ std::pair } auto array_types = dpctl_td_ns::usm_ndarray_types(); - int coeff_matrix_type_id = + const int coeff_matrix_type_id = array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id]; diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index f5e47389b4b9..233d323247f0 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -233,9 +233,9 @@ std::pair const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw(); const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw(); - const int expected_coeff_matrix_ndim = 3; - const int min_dependent_vals_ndim = 2; - const int max_dependent_vals_ndim = 3; + constexpr int expected_coeff_matrix_ndim = 3; + constexpr int min_dependent_vals_ndim = 2; + constexpr int max_dependent_vals_ndim = 3; common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape, dependent_vals_shape, expected_coeff_matrix_ndim, @@ -272,7 +272,7 @@ std::pair } auto array_types = dpctl_td_ns::usm_ndarray_types(); - int coeff_matrix_type_id = + const int coeff_matrix_type_id = array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); gesv_batch_impl_fn_ptr_t gesv_batch_fn = From 9bf94b51c7321e1bef6dbd39bc6d61f48829bb9c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 29 Jul 2024 11:45:17 +0200 Subject: [PATCH 28/37] Applied review comments --- dpnp/backend/extensions/lapack/gesv.cpp | 6 +++--- dpnp/backend/extensions/lapack/gesv_batch.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 52b2384cc040..92e03213c42e 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -75,10 +75,10 @@ void common_gesv_checks(sycl::queue &exec_q, // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays // for gesv - // with the shapes (n,n) and (n,nrhs) or (n,) respectively; + // with the shapes (n, n) and (n, nrhs) or (n, ) respectively; // for gesv_batch - // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or - // (n,batch_size) respectively + // with the shapes (n, n, batch_size) and (n, nrhs, batch_size) or + // (n, batch_size) respectively if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { throw py::value_error("The coefficient matrix must be square," " but got a shape of (" + diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 233d323247f0..42cb7a36814f 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -69,14 +69,14 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, T *a = reinterpret_cast(in_a); T *b = reinterpret_cast(in_b); - std::int64_t a_size = n * n; - std::int64_t b_size = n * nrhs; + const std::int64_t a_size = n * n; + const std::int64_t b_size = n * nrhs; const std::int64_t lda = std::max(1UL, n); const std::int64_t ldb = std::max(1UL, n); // Get the number of independent linear streams - std::int64_t n_linear_streams = + const std::int64_t n_linear_streams = (batch_size > 16) ? 4 : ((batch_size > 4 ? 2 : 1)); const std::int64_t scratchpad_size = From b81893c3cb9be2270789b18a7c5df049e2a13151 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 29 Jul 2024 11:48:48 +0200 Subject: [PATCH 29/37] Use dpnp.reshape in _batched_solve --- dpnp/linalg/dpnp_utils_linalg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 6dfa941cd0cd..fe417413cf5a 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -266,11 +266,11 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type): # gesv_batch expects `a` to be a 3D array and # `b` to be either a 2D or 3D array. if a.ndim == b.ndim: - b = b.reshape(-1, b_shape[-2], b_shape[-1]) + b = dpnp.reshape(b, (-1, b_shape[-2], b_shape[-1])) else: - b = b.reshape(-1, b_shape[-1]) + b = dpnp.reshape(b, (-1, b_shape[-1])) - a = a.reshape(-1, a_shape[-2], a_shape[-1]) + a = dpnp.reshape(a, (-1, a_shape[-2], a_shape[-1])) # Reorder the elements by moving the last two axes of `a` to the front # to match fortran-like array order which is assumed by gesv. From f8d68efe476fbb721eb2f22af9db2165c263e84e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 29 Jul 2024 12:12:46 +0200 Subject: [PATCH 30/37] Implement alloc_ipiv in common_helpers.hpp --- .../extensions/lapack/common_helpers.hpp | 24 +++++++++++++++++++ dpnp/backend/extensions/lapack/gesv_batch.cpp | 15 +----------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp index e1812c9ad415..aeeb3fb2793c 100644 --- a/dpnp/backend/extensions/lapack/common_helpers.hpp +++ b/dpnp/backend/extensions/lapack/common_helpers.hpp @@ -66,6 +66,30 @@ inline bool check_zeros_shape(int ndim, const py::ssize_t *shape) return src_nelems == 0; } +// Allocate the total memory for the total pivot indices with proper alignment +// for batch implementations +template +inline std::int64_t *alloc_ipiv(const std::int64_t n, + std::int64_t n_linear_streams, + sycl::queue &exec_q) +{ + // Get padding size to ensure memory allocations are aligned to 256 bytes + // for better performance + const std::int64_t padding = 256 / sizeof(T); + + // Calculate the total size needed for the pivot indices array for all + // linear streams with proper alignment + size_t alloc_ipiv_size = round_up_mult(n_linear_streams * n, padding); + + // Allocate memory for the total pivot indices array + std::int64_t *ipiv = + sycl::malloc_device(alloc_ipiv_size, exec_q); + if (!ipiv) + throw std::runtime_error("Device allocation for ipiv failed"); + + return ipiv; +} + // Allocate the total scratchpad memory with proper alignment for batch // implementations template diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 42cb7a36814f..89f2a4fe9d77 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -85,20 +85,7 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, T *scratchpad = helper::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); - // Get padding size to ensure memory allocations are aligned to 256 bytes - // for better performance - const std::int64_t padding = 256 / sizeof(T); - - // Calculate the total size needed for the pivot indices array for all - // linear streams with proper alignment - size_t alloc_ipiv_size = - helper::round_up_mult(n_linear_streams * n, padding); - - // Allocate memory for the total pivot indices array - std::int64_t *ipiv = - sycl::malloc_device(alloc_ipiv_size, exec_q); - if (!ipiv) - throw std::runtime_error("Device allocation for ipiv failed"); + std::int64_t *ipiv = helper::alloc_ipiv(n, n_linear_streams, exec_q); // Computation events to manage dependencies for each linear stream std::vector> comp_evs(n_linear_streams, depends); From fc6c7fabf4c9a776a9b065f5022baf192b8d46dc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 29 Jul 2024 13:15:24 +0200 Subject: [PATCH 31/37] Add gesv_common_utils.hpp --- dpnp/backend/extensions/lapack/gesv.cpp | 103 +------------- dpnp/backend/extensions/lapack/gesv_batch.cpp | 12 +- .../extensions/lapack/gesv_common_utils.hpp | 130 ++++++++++++++++++ 3 files changed, 140 insertions(+), 105 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/gesv_common_utils.hpp diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 92e03213c42e..be7df6bbf11c 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -26,114 +26,20 @@ #include // dpctl tensor headers -#include "utils/memory_overlap.hpp" -#include "utils/output_validation.hpp" #include "utils/type_utils.hpp" #include "common_helpers.hpp" #include "gesv.hpp" +#include "gesv_common_utils.hpp" #include "linalg_exceptions.hpp" #include "types_matrix.hpp" -#include "dpnp_utils.hpp" - namespace dpnp::extensions::lapack { namespace mkl_lapack = oneapi::mkl::lapack; namespace py = pybind11; namespace type_utils = dpctl::tensor::type_utils; -void common_gesv_checks(sycl::queue &exec_q, - dpctl::tensor::usm_ndarray coeff_matrix, - dpctl::tensor::usm_ndarray dependent_vals, - const py::ssize_t *coeff_matrix_shape, - const py::ssize_t *dependent_vals_shape, - const int expected_coeff_matrix_ndim, - const int min_dependent_vals_ndim, - const int max_dependent_vals_ndim) -{ - const int coeff_matrix_nd = coeff_matrix.get_ndim(); - const int dependent_vals_nd = dependent_vals.get_ndim(); - - if (coeff_matrix_nd != expected_coeff_matrix_ndim) { - throw py::value_error("The coefficient matrix has ndim=" + - std::to_string(coeff_matrix_nd) + ", but a " + - std::to_string(expected_coeff_matrix_ndim) + - "-dimensional array is expected."); - } - - if (dependent_vals_nd < min_dependent_vals_ndim || - dependent_vals_nd > max_dependent_vals_ndim) - { - throw py::value_error("The dependent values array has ndim=" + - std::to_string(dependent_vals_nd) + ", but a " + - std::to_string(min_dependent_vals_ndim) + - "-dimensional or a " + - std::to_string(max_dependent_vals_ndim) + - "-dimensional array is expected."); - } - - // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays - // for gesv - // with the shapes (n, n) and (n, nrhs) or (n, ) respectively; - // for gesv_batch - // with the shapes (n, n, batch_size) and (n, nrhs, batch_size) or - // (n, batch_size) respectively - if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { - throw py::value_error("The coefficient matrix must be square," - " but got a shape of (" + - std::to_string(coeff_matrix_shape[0]) + ", " + - std::to_string(coeff_matrix_shape[1]) + ")."); - } - if (coeff_matrix_shape[0] != dependent_vals_shape[0]) { - throw py::value_error("The first dimension (n) of coeff_matrix and" - " dependent_vals must be the same, but got " + - std::to_string(coeff_matrix_shape[0]) + " and " + - std::to_string(dependent_vals_shape[0]) + "."); - } - - // check compatibility of execution queue and allocation queue - if (!dpctl::utils::queues_are_compatible(exec_q, - {coeff_matrix, dependent_vals})) - { - throw py::value_error( - "Execution queue is not compatible with allocation queues."); - } - - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(coeff_matrix, dependent_vals)) { - throw py::value_error( - "The arrays of coefficients and dependent variables " - "are overlapping segments of memory."); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable( - dependent_vals); - - const bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); - if (!is_coeff_matrix_f_contig) { - throw py::value_error("The coefficient matrix " - "must be F-contiguous."); - } - - const bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); - if (!is_dependent_vals_f_contig) { - throw py::value_error("The array of dependent variables " - "must be F-contiguous."); - } - - auto array_types = dpctl_td_ns::usm_ndarray_types(); - const int coeff_matrix_type_id = - array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); - const int dependent_vals_type_id = - array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); - - if (coeff_matrix_type_id != dependent_vals_type_id) { - throw py::value_error("The types of the coefficient matrix and " - "dependent variables are mismatched."); - } -} - typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &, const std::int64_t, const std::int64_t, @@ -286,9 +192,10 @@ std::pair constexpr int min_dependent_vals_ndim = 1; constexpr int max_dependent_vals_ndim = 2; - common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape, - dependent_vals_shape, expected_coeff_matrix_ndim, - min_dependent_vals_ndim, max_dependent_vals_ndim); + gesv_utils::common_gesv_checks( + exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape, + dependent_vals_shape, expected_coeff_matrix_ndim, + min_dependent_vals_ndim, max_dependent_vals_ndim); // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty // events diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 89f2a4fe9d77..bbecc7985c00 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -26,17 +26,14 @@ #include // dpctl tensor headers -#include "utils/memory_overlap.hpp" -#include "utils/output_validation.hpp" #include "utils/type_utils.hpp" #include "common_helpers.hpp" #include "gesv.hpp" +#include "gesv_common_utils.hpp" #include "linalg_exceptions.hpp" #include "types_matrix.hpp" -#include "dpnp_utils.hpp" - namespace dpnp::extensions::lapack { namespace mkl_lapack = oneapi::mkl::lapack; @@ -224,9 +221,10 @@ std::pair constexpr int min_dependent_vals_ndim = 2; constexpr int max_dependent_vals_ndim = 3; - common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape, - dependent_vals_shape, expected_coeff_matrix_ndim, - min_dependent_vals_ndim, max_dependent_vals_ndim); + gesv_utils::common_gesv_checks( + exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape, + dependent_vals_shape, expected_coeff_matrix_ndim, + min_dependent_vals_ndim, max_dependent_vals_ndim); // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty // events diff --git a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp new file mode 100644 index 000000000000..8cf148255650 --- /dev/null +++ b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp @@ -0,0 +1,130 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpnp::extensions::lapack::gesv_utils +{ +namespace dpctl_td_ns = dpctl::tensor::type_dispatch; +namespace py = pybind11; + +inline void common_gesv_checks(sycl::queue &exec_q, + dpctl::tensor::usm_ndarray coeff_matrix, + dpctl::tensor::usm_ndarray dependent_vals, + const py::ssize_t *coeff_matrix_shape, + const py::ssize_t *dependent_vals_shape, + const int expected_coeff_matrix_ndim, + const int min_dependent_vals_ndim, + const int max_dependent_vals_ndim) +{ + const int coeff_matrix_nd = coeff_matrix.get_ndim(); + const int dependent_vals_nd = dependent_vals.get_ndim(); + + if (coeff_matrix_nd != expected_coeff_matrix_ndim) { + throw py::value_error("The coefficient matrix has ndim=" + + std::to_string(coeff_matrix_nd) + ", but a " + + std::to_string(expected_coeff_matrix_ndim) + + "-dimensional array is expected."); + } + + if (dependent_vals_nd < min_dependent_vals_ndim || + dependent_vals_nd > max_dependent_vals_ndim) + { + throw py::value_error("The dependent values array has ndim=" + + std::to_string(dependent_vals_nd) + ", but a " + + std::to_string(min_dependent_vals_ndim) + + "-dimensional or a " + + std::to_string(max_dependent_vals_ndim) + + "-dimensional array is expected."); + } + + // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays + // for gesv + // with the shapes (n, n) and (n, nrhs) or (n, ) respectively; + // for gesv_batch + // with the shapes (n, n, batch_size) and (n, nrhs, batch_size) or + // (n, batch_size) respectively + if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) { + throw py::value_error("The coefficient matrix must be square," + " but got a shape of (" + + std::to_string(coeff_matrix_shape[0]) + ", " + + std::to_string(coeff_matrix_shape[1]) + ")."); + } + if (coeff_matrix_shape[0] != dependent_vals_shape[0]) { + throw py::value_error("The first dimension (n) of coeff_matrix and" + " dependent_vals must be the same, but got " + + std::to_string(coeff_matrix_shape[0]) + " and " + + std::to_string(dependent_vals_shape[0]) + "."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, + {coeff_matrix, dependent_vals})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues."); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(coeff_matrix, dependent_vals)) { + throw py::value_error( + "The arrays of coefficients and dependent variables " + "are overlapping segments of memory."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable( + dependent_vals); + + const bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous(); + if (!is_coeff_matrix_f_contig) { + throw py::value_error("The coefficient matrix " + "must be F-contiguous."); + } + + const bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous(); + if (!is_dependent_vals_f_contig) { + throw py::value_error("The array of dependent variables " + "must be F-contiguous."); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + const int coeff_matrix_type_id = + array_types.typenum_to_lookup_id(coeff_matrix.get_typenum()); + const int dependent_vals_type_id = + array_types.typenum_to_lookup_id(dependent_vals.get_typenum()); + + if (coeff_matrix_type_id != dependent_vals_type_id) { + throw py::value_error("The types of the coefficient matrix and " + "dependent variables are mismatched."); + } +} +} // namespace dpnp::extensions::lapack::gesv_utils From 75079d27535d4f1adff267d14ae6eaa0b19ab4e4 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 29 Jul 2024 14:07:10 +0200 Subject: [PATCH 32/37] Implement handle_lapack_exc function --- dpnp/backend/extensions/lapack/gesv.cpp | 45 ++--------------- dpnp/backend/extensions/lapack/gesv_batch.cpp | 49 ++---------------- .../extensions/lapack/gesv_common_utils.hpp | 50 +++++++++++++++++++ 3 files changed, 58 insertions(+), 86 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index be7df6bbf11c..a8ef8d5d47c5 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -31,7 +31,6 @@ #include "common_helpers.hpp" #include "gesv.hpp" #include "gesv_common_utils.hpp" -#include "linalg_exceptions.hpp" #include "types_matrix.hpp" namespace dpnp::extensions::lapack @@ -88,7 +87,6 @@ static sycl::event gesv_impl(sycl::queue &exec_q, throw std::runtime_error("Device allocation for ipiv failed"); std::stringstream error_msg; - std::int64_t info = 0; bool is_exception_caught = false; sycl::event gesv_event; @@ -112,41 +110,8 @@ static sycl::event gesv_impl(sycl::queue &exec_q, scratchpad_size, depends); } catch (mkl_lapack::exception const &e) { is_exception_caught = true; - info = e.info(); - - if (info < 0) { - error_msg << "Parameter number " << -info - << " had an illegal value."; - } - else if (info == scratchpad_size && e.detail() != 0) { - error_msg - << "Insufficient scratchpad size. Required size is at least " - << e.detail(); - } - else if (info > 0) { - T host_U; - exec_q.memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T)) - .wait(); - - using ThresholdType = typename helper::value_type_of::type; - - const auto threshold = - std::numeric_limits::epsilon() * 100; - if (std::abs(host_U) < threshold) { - sycl::free(scratchpad, exec_q); - throw LinAlgError("The input coefficient matrix is singular."); - } - else { - error_msg << "Unexpected MKL exception caught during gesv() " - "call:\nreason: " - << e.what() << "\ninfo: " << e.info(); - } - } - else { - error_msg << "Unexpected MKL exception caught during gesv() " - "call:\nreason: " - << e.what() << "\ninfo: " << e.info(); - } + gesv_utils::handle_lapack_exc(exec_q, lda, a, scratchpad_size, + scratchpad, ipiv, e, error_msg); } catch (sycl::exception const &e) { is_exception_caught = true; error_msg << "Unexpected SYCL exception caught during gesv() call:\n" @@ -155,12 +120,10 @@ static sycl::event gesv_impl(sycl::queue &exec_q, if (is_exception_caught) // an unexpected error occurs { - if (scratchpad != nullptr) { + if (scratchpad != nullptr) sycl::free(scratchpad, exec_q); - } - if (ipiv != nullptr) { + if (ipiv != nullptr) sycl::free(ipiv, exec_q); - } throw std::runtime_error(error_msg.str()); } diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index bbecc7985c00..62c115259d18 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -31,7 +31,6 @@ #include "common_helpers.hpp" #include "gesv.hpp" #include "gesv_common_utils.hpp" -#include "linalg_exceptions.hpp" #include "types_matrix.hpp" namespace dpnp::extensions::lapack @@ -88,7 +87,6 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, std::vector> comp_evs(n_linear_streams, depends); std::stringstream error_msg; - std::int64_t info = 0; bool is_exception_caught = false; // Release GIL to avoid serialization of host task @@ -130,45 +128,8 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, scratchpad_size, current_dep); } catch (mkl_lapack::exception const &e) { is_exception_caught = true; - info = e.info(); - - if (info < 0) { - error_msg << "Parameter number " << -info - << " had an illegal value."; - } - else if (info == scratchpad_size && e.detail() != 0) { - error_msg << "Insufficient scratchpad size. Required size is " - "at least " - << e.detail(); - } - else if (info > 0) { - T host_U; - exec_q - .memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T)) - .wait(); - - using ThresholdType = typename helper::value_type_of::type; - - const auto threshold = - std::numeric_limits::epsilon() * 100; - if (std::abs(host_U) < threshold) { - sycl::free(scratchpad, exec_q); - sycl::free(ipiv, exec_q); - throw LinAlgError( - "The input coefficient matrix is singular."); - } - else { - error_msg - << "Unexpected MKL exception caught during gesv() " - "call:\nreason: " - << e.what() << "\ninfo: " << e.info(); - } - } - else { - error_msg << "Unexpected MKL exception caught during gesv() " - "call:\nreason: " - << e.what() << "\ninfo: " << e.info(); - } + gesv_utils::handle_lapack_exc(exec_q, lda, a, scratchpad_size, + scratchpad, ipiv, e, error_msg); } catch (sycl::exception const &e) { is_exception_caught = true; error_msg @@ -182,12 +143,10 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, if (is_exception_caught) // an unexpected error occurs { - if (scratchpad != nullptr) { + if (scratchpad != nullptr) sycl::free(scratchpad, exec_q); - } - if (ipiv != nullptr) { + if (ipiv != nullptr) sycl::free(ipiv, exec_q); - } throw std::runtime_error(error_msg.str()); } diff --git a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp index 8cf148255650..4b4df013aab5 100644 --- a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp +++ b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp @@ -32,6 +32,9 @@ #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" +#include "common_helpers.hpp" +#include "linalg_exceptions.hpp" + namespace dpnp::extensions::lapack::gesv_utils { namespace dpctl_td_ns = dpctl::tensor::type_dispatch; @@ -127,4 +130,51 @@ inline void common_gesv_checks(sycl::queue &exec_q, "dependent variables are mismatched."); } } + +template +inline void handle_lapack_exc(sycl::queue &exec_q, + const std::int64_t lda, + T *a, + std::int64_t scratchpad_size, + T *scratchpad, + std::int64_t *ipiv, + const oneapi::mkl::lapack::exception &e, + std::stringstream &error_msg) +{ + std::int64_t info = e.info(); + if (info < 0) { + error_msg << "Parameter number " << -info << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg << "Insufficient scratchpad size. Required size is at least " + << e.detail(); + } + else if (info > 0) { + T host_U; + exec_q.memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T)) + .wait(); + + using ThresholdType = typename helper::value_type_of::type; + + const auto threshold = + std::numeric_limits::epsilon() * 100; + if (std::abs(host_U) < threshold) { + if (scratchpad != nullptr) + sycl::free(scratchpad, exec_q); + if (ipiv != nullptr) + sycl::free(ipiv, exec_q); + throw LinAlgError("The input coefficient matrix is singular."); + } + else { + error_msg << "Unexpected MKL exception caught during gesv() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } + else { + error_msg + << "Unexpected MKL exception caught during gesv() call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } +} } // namespace dpnp::extensions::lapack::gesv_utils From 7e0f384de1a6507575db2e70451ab2e6e4b18622 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 29 Jul 2024 15:47:17 +0200 Subject: [PATCH 33/37] Use try/catch for scratchpad/ipiv allocation --- .../extensions/lapack/common_helpers.hpp | 26 ++++++++++---- dpnp/backend/extensions/lapack/gesv.cpp | 34 +++++++++++++++---- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp index aeeb3fb2793c..d3616346056e 100644 --- a/dpnp/backend/extensions/lapack/common_helpers.hpp +++ b/dpnp/backend/extensions/lapack/common_helpers.hpp @@ -82,10 +82,17 @@ inline std::int64_t *alloc_ipiv(const std::int64_t n, size_t alloc_ipiv_size = round_up_mult(n_linear_streams * n, padding); // Allocate memory for the total pivot indices array - std::int64_t *ipiv = - sycl::malloc_device(alloc_ipiv_size, exec_q); - if (!ipiv) - throw std::runtime_error("Device allocation for ipiv failed"); + try { + std::int64_t *ipiv = + sycl::malloc_device(alloc_ipiv_size, exec_q); + if (!ipiv) + throw std::runtime_error("Device allocation for ipiv failed"); + } catch (sycl::exception const &e) { + throw std::runtime_error( + std::string( + "Unexpected SYCL exception caught during ipiv allocation: ") + + e.what()); + } return ipiv; } @@ -114,9 +121,14 @@ inline T *alloc_scratchpad(std::int64_t scratchpad_size, round_up_mult(n_linear_streams * scratchpad_size, padding); // Allocate memory for the total scratchpad - T *scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); - if (!scratchpad) { - throw std::runtime_error("Device allocation for scratchpad failed"); + try { + T *scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); + if (!scratchpad) + throw std::runtime_error("Device allocation for scratchpad failed"); + } catch (sycl::exception const &e) { + throw std::runtime_error(std::string("Unexpected SYCL exception caught " + "during scratchpad allocation: ") + + e.what()); } return scratchpad; diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index a8ef8d5d47c5..e912645e0387 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -76,15 +76,37 @@ static sycl::event gesv_impl(sycl::queue &exec_q, T *scratchpad = nullptr; // Allocate memory for the scratchpad - scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - if (!scratchpad) - throw std::runtime_error("Device allocation for scratchpad failed"); + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + if (!scratchpad) + throw std::runtime_error("Device allocation for scratchpad failed"); + } catch (sycl::exception const &e) { + if (scratchpad != nullptr) + sycl::free(scratchpad, exec_q); + throw std::runtime_error(std::string("Unexpected SYCL exception caught " + "during scratchpad allocation: ") + + e.what()); + } std::int64_t *ipiv = nullptr; // Allocate memory for the ipiv - ipiv = sycl::malloc_device(n, exec_q); - if (!ipiv) - throw std::runtime_error("Device allocation for ipiv failed"); + try { + ipiv = sycl::malloc_device(n, exec_q); + if (!ipiv) { + if (scratchpad != nullptr) + sycl::free(scratchpad, exec_q); + throw std::runtime_error("Device allocation for ipiv failed"); + } + } catch (sycl::exception const &e) { + if (scratchpad != nullptr) + sycl::free(scratchpad, exec_q); + if (ipiv != nullptr) + sycl::free(ipiv, exec_q); + throw std::runtime_error( + std::string( + "Unexpected SYCL exception caught during ipiv allocation: ") + + e.what()); + } std::stringstream error_msg; bool is_exception_caught = false; From f5ee368b6da526dfa46a00bd1befa727ec9351d2 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 29 Jul 2024 18:22:07 +0200 Subject: [PATCH 34/37] Update alloc_scratchpad/alloc_ipiv --- dpnp/backend/extensions/lapack/common_helpers.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp index d3616346056e..d102cf0df23f 100644 --- a/dpnp/backend/extensions/lapack/common_helpers.hpp +++ b/dpnp/backend/extensions/lapack/common_helpers.hpp @@ -81,10 +81,11 @@ inline std::int64_t *alloc_ipiv(const std::int64_t n, // linear streams with proper alignment size_t alloc_ipiv_size = round_up_mult(n_linear_streams * n, padding); + std::int64_t *ipiv = nullptr; + // Allocate memory for the total pivot indices array try { - std::int64_t *ipiv = - sycl::malloc_device(alloc_ipiv_size, exec_q); + ipiv = sycl::malloc_device(alloc_ipiv_size, exec_q); if (!ipiv) throw std::runtime_error("Device allocation for ipiv failed"); } catch (sycl::exception const &e) { @@ -120,9 +121,11 @@ inline T *alloc_scratchpad(std::int64_t scratchpad_size, const size_t alloc_scratch_size = round_up_mult(n_linear_streams * scratchpad_size, padding); + T *scratchpad = nullptr; + // Allocate memory for the total scratchpad try { - T *scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); + scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); if (!scratchpad) throw std::runtime_error("Device allocation for scratchpad failed"); } catch (sycl::exception const &e) { From eb8c3a080b58d197d3a961041c03db2d14e46601 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 30 Jul 2024 16:09:04 +0200 Subject: [PATCH 35/37] gesv_scratchpad_size can be 0 --- dpnp/backend/extensions/lapack/gesv.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index e912645e0387..6004540ad328 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -64,22 +64,19 @@ static sycl::event gesv_impl(sycl::queue &exec_q, const std::int64_t lda = std::max(1UL, n); const std::int64_t ldb = std::max(1UL, n); + // gesv scratchpad_size can be 0 on CPU const std::int64_t scratchpad_size = mkl_lapack::gesv_scratchpad_size(exec_q, n, nrhs, lda, ldb); - if (scratchpad_size <= 0) { - throw std::runtime_error( - "Invalid scratchpad size: must be greater than zero." - "Calculated scratchpad size: " + - std::to_string(scratchpad_size)); - } - T *scratchpad = nullptr; // Allocate memory for the scratchpad try { - scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - if (!scratchpad) - throw std::runtime_error("Device allocation for scratchpad failed"); + if (scratchpad_size > 0) { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + if (!scratchpad) + throw std::runtime_error( + "Device allocation for scratchpad failed"); + } } catch (sycl::exception const &e) { if (scratchpad != nullptr) sycl::free(scratchpad, exec_q); From 3c8cda6809e87ca0fd2c1e3f254b9ac621edc64e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 30 Jul 2024 18:12:31 +0200 Subject: [PATCH 36/37] Implement help functions alloc_ipiv/alloc_scratchpad --- .../extensions/lapack/common_helpers.hpp | 76 +++++++++++++++---- dpnp/backend/extensions/lapack/gesv.cpp | 35 +-------- dpnp/backend/extensions/lapack/gesv_batch.cpp | 13 +++- dpnp/backend/extensions/lapack/heevd.cpp | 13 +--- .../backend/extensions/lapack/heevd_batch.cpp | 4 +- dpnp/backend/extensions/lapack/syevd.cpp | 13 +--- .../backend/extensions/lapack/syevd_batch.cpp | 4 +- 7 files changed, 80 insertions(+), 78 deletions(-) diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp index d102cf0df23f..44fa3d86b95a 100644 --- a/dpnp/backend/extensions/lapack/common_helpers.hpp +++ b/dpnp/backend/extensions/lapack/common_helpers.hpp @@ -66,12 +66,34 @@ inline bool check_zeros_shape(int ndim, const py::ssize_t *shape) return src_nelems == 0; } +// Allocate the memory for the pivot indices +inline std::int64_t *alloc_ipiv(const std::int64_t n, sycl::queue &exec_q) +{ + std::int64_t *ipiv = nullptr; + + try { + ipiv = sycl::malloc_device(n, exec_q); + if (!ipiv) { + throw std::runtime_error("Device allocation for ipiv failed"); + } + } catch (sycl::exception const &e) { + if (ipiv != nullptr) + sycl::free(ipiv, exec_q); + throw std::runtime_error( + std::string( + "Unexpected SYCL exception caught during ipiv allocation: ") + + e.what()); + } + + return ipiv; +} + // Allocate the total memory for the total pivot indices with proper alignment // for batch implementations template -inline std::int64_t *alloc_ipiv(const std::int64_t n, - std::int64_t n_linear_streams, - sycl::queue &exec_q) +inline std::int64_t *alloc_ipiv_batch(const std::int64_t n, + std::int64_t n_linear_streams, + sycl::queue &exec_q) { // Get padding size to ensure memory allocations are aligned to 256 bytes // for better performance @@ -98,24 +120,43 @@ inline std::int64_t *alloc_ipiv(const std::int64_t n, return ipiv; } +// Allocate the memory for the scratchpad +template +inline T *alloc_scratchpad(std::int64_t scratchpad_size, sycl::queue &exec_q) +{ + T *scratchpad = nullptr; + + try { + if (scratchpad_size > 0) { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + if (!scratchpad) { + throw std::runtime_error( + "Device allocation for scratchpad failed"); + } + } + } catch (sycl::exception const &e) { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(std::string("Unexpected SYCL exception caught " + "during scratchpad allocation: ") + + e.what()); + } + + return scratchpad; +} + // Allocate the total scratchpad memory with proper alignment for batch // implementations template -inline T *alloc_scratchpad(std::int64_t scratchpad_size, - std::int64_t n_linear_streams, - sycl::queue &exec_q) +inline T *alloc_scratchpad_batch(std::int64_t scratchpad_size, + std::int64_t n_linear_streams, + sycl::queue &exec_q) { // Get padding size to ensure memory allocations are aligned to 256 bytes // for better performance const std::int64_t padding = 256 / sizeof(T); - if (scratchpad_size <= 0) { - throw std::runtime_error( - "Invalid scratchpad size: must be greater than zero." - " Calculated scratchpad size: " + - std::to_string(scratchpad_size)); - } - // Calculate the total scratchpad memory size needed for all linear // streams with proper alignment const size_t alloc_scratch_size = @@ -125,9 +166,12 @@ inline T *alloc_scratchpad(std::int64_t scratchpad_size, // Allocate memory for the total scratchpad try { - scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); - if (!scratchpad) - throw std::runtime_error("Device allocation for scratchpad failed"); + if (alloc_scratch_size > 0) { + scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); + if (!scratchpad) + throw std::runtime_error( + "Device allocation for scratchpad failed"); + } } catch (sycl::exception const &e) { throw std::runtime_error(std::string("Unexpected SYCL exception caught " "during scratchpad allocation: ") + diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 6004540ad328..0cc6eb1f008e 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -64,45 +64,18 @@ static sycl::event gesv_impl(sycl::queue &exec_q, const std::int64_t lda = std::max(1UL, n); const std::int64_t ldb = std::max(1UL, n); - // gesv scratchpad_size can be 0 on CPU const std::int64_t scratchpad_size = mkl_lapack::gesv_scratchpad_size(exec_q, n, nrhs, lda, ldb); - T *scratchpad = nullptr; - // Allocate memory for the scratchpad - try { - if (scratchpad_size > 0) { - scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - if (!scratchpad) - throw std::runtime_error( - "Device allocation for scratchpad failed"); - } - } catch (sycl::exception const &e) { - if (scratchpad != nullptr) - sycl::free(scratchpad, exec_q); - throw std::runtime_error(std::string("Unexpected SYCL exception caught " - "during scratchpad allocation: ") + - e.what()); - } + T *scratchpad = helper::alloc_scratchpad(scratchpad_size, exec_q); std::int64_t *ipiv = nullptr; - // Allocate memory for the ipiv try { - ipiv = sycl::malloc_device(n, exec_q); - if (!ipiv) { - if (scratchpad != nullptr) - sycl::free(scratchpad, exec_q); - throw std::runtime_error("Device allocation for ipiv failed"); - } - } catch (sycl::exception const &e) { + ipiv = helper::alloc_ipiv(n, exec_q); + } catch (const std::exception &e) { if (scratchpad != nullptr) sycl::free(scratchpad, exec_q); - if (ipiv != nullptr) - sycl::free(ipiv, exec_q); - throw std::runtime_error( - std::string( - "Unexpected SYCL exception caught during ipiv allocation: ") + - e.what()); + throw; } std::stringstream error_msg; diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index 62c115259d18..90aaf8ebcfd5 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -78,10 +78,17 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, const std::int64_t scratchpad_size = mkl_lapack::gesv_scratchpad_size(exec_q, n, nrhs, lda, ldb); - T *scratchpad = - helper::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); + T *scratchpad = helper::alloc_scratchpad_batch(scratchpad_size, + n_linear_streams, exec_q); - std::int64_t *ipiv = helper::alloc_ipiv(n, n_linear_streams, exec_q); + std::int64_t *ipiv = nullptr; + try { + ipiv = helper::alloc_ipiv_batch(n, n_linear_streams, exec_q); + } catch (const std::exception &e) { + if (scratchpad != nullptr) + sycl::free(scratchpad, exec_q); + throw; + } // Computation events to manage dependencies for each linear stream std::vector> comp_evs(n_linear_streams, depends); diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp index a623a0dd182c..234dbef18eda 100644 --- a/dpnp/backend/extensions/lapack/heevd.cpp +++ b/dpnp/backend/extensions/lapack/heevd.cpp @@ -55,18 +55,7 @@ static sycl::event heevd_impl(sycl::queue &exec_q, const std::int64_t scratchpad_size = mkl_lapack::heevd_scratchpad_size(exec_q, jobz, upper_lower, n, lda); - if (scratchpad_size <= 0) { - throw std::runtime_error( - "Invalid scratchpad size: must be greater than zero." - "Calculated scratchpad size: " + - std::to_string(scratchpad_size)); - } - - T *scratchpad = nullptr; - // Allocate memory for the scratchpad - scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - if (!scratchpad) - throw std::runtime_error("Device allocation for scratchpad failed"); + T *scratchpad = helper::alloc_scratchpad(scratchpad_size, exec_q); std::stringstream error_msg; std::int64_t info = 0; diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp index 2ea37e22d7d9..05d968701dd8 100644 --- a/dpnp/backend/extensions/lapack/heevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp @@ -65,8 +65,8 @@ static sycl::event heevd_batch_impl(sycl::queue &exec_q, const std::int64_t scratchpad_size = mkl_lapack::heevd_scratchpad_size(exec_q, jobz, upper_lower, n, lda); - T *scratchpad = - helper::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); + T *scratchpad = helper::alloc_scratchpad_batch(scratchpad_size, + n_linear_streams, exec_q); // Computation events to manage dependencies for each linear stream std::vector> comp_evs(n_linear_streams, depends); diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp index bbc975b43d0f..30f7ea1d13c6 100644 --- a/dpnp/backend/extensions/lapack/syevd.cpp +++ b/dpnp/backend/extensions/lapack/syevd.cpp @@ -55,18 +55,7 @@ static sycl::event syevd_impl(sycl::queue &exec_q, const std::int64_t scratchpad_size = mkl_lapack::syevd_scratchpad_size(exec_q, jobz, upper_lower, n, lda); - if (scratchpad_size <= 0) { - throw std::runtime_error( - "Invalid scratchpad size: must be greater than zero." - "Calculated scratchpad size: " + - std::to_string(scratchpad_size)); - } - - T *scratchpad = nullptr; - // Allocate memory for the scratchpad - scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - if (!scratchpad) - throw std::runtime_error("Device allocation for scratchpad failed"); + T *scratchpad = helper::alloc_scratchpad(scratchpad_size, exec_q); std::stringstream error_msg; std::int64_t info = 0; diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp index aa5656e0ac3c..78bf07264f55 100644 --- a/dpnp/backend/extensions/lapack/syevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp @@ -65,8 +65,8 @@ static sycl::event syevd_batch_impl(sycl::queue &exec_q, const std::int64_t scratchpad_size = mkl_lapack::syevd_scratchpad_size(exec_q, jobz, upper_lower, n, lda); - T *scratchpad = - helper::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); + T *scratchpad = helper::alloc_scratchpad_batch(scratchpad_size, + n_linear_streams, exec_q); // Computation events to manage dependencies for each linear stream std::vector> comp_evs(n_linear_streams, depends); From 629b97ab4a92b14ed46fe6e6bde62073e8137f99 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Aug 2024 11:11:26 +0200 Subject: [PATCH 37/37] Reuse alloc_scratchpad/ipiv in batch versions --- .../extensions/lapack/common_helpers.hpp | 36 ++----------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp index 44fa3d86b95a..88e791256dce 100644 --- a/dpnp/backend/extensions/lapack/common_helpers.hpp +++ b/dpnp/backend/extensions/lapack/common_helpers.hpp @@ -24,8 +24,8 @@ //***************************************************************************** #pragma once -#include #include +#include #include #include @@ -103,21 +103,7 @@ inline std::int64_t *alloc_ipiv_batch(const std::int64_t n, // linear streams with proper alignment size_t alloc_ipiv_size = round_up_mult(n_linear_streams * n, padding); - std::int64_t *ipiv = nullptr; - - // Allocate memory for the total pivot indices array - try { - ipiv = sycl::malloc_device(alloc_ipiv_size, exec_q); - if (!ipiv) - throw std::runtime_error("Device allocation for ipiv failed"); - } catch (sycl::exception const &e) { - throw std::runtime_error( - std::string( - "Unexpected SYCL exception caught during ipiv allocation: ") + - e.what()); - } - - return ipiv; + return alloc_ipiv(alloc_ipiv_size, exec_q); } // Allocate the memory for the scratchpad @@ -162,22 +148,6 @@ inline T *alloc_scratchpad_batch(std::int64_t scratchpad_size, const size_t alloc_scratch_size = round_up_mult(n_linear_streams * scratchpad_size, padding); - T *scratchpad = nullptr; - - // Allocate memory for the total scratchpad - try { - if (alloc_scratch_size > 0) { - scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); - if (!scratchpad) - throw std::runtime_error( - "Device allocation for scratchpad failed"); - } - } catch (sycl::exception const &e) { - throw std::runtime_error(std::string("Unexpected SYCL exception caught " - "during scratchpad allocation: ") + - e.what()); - } - - return scratchpad; + return alloc_scratchpad(alloc_scratch_size, exec_q); } } // namespace dpnp::extensions::lapack::helper