From 4c7c8c240280fe5935c7cf8a17d7cc3711db08d7 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 6 Jun 2024 22:55:56 +0200
Subject: [PATCH 01/37] Init work

---
 dpnp/backend/extensions/lapack/gesv.cpp      | 119 ++++++++++++
 dpnp/backend/extensions/lapack/gesv.hpp      |   6 +
 dpnp/backend/extensions/lapack/lapack_py.cpp |   7 +
 dpnp/linalg/dpnp_utils_linalg.py             | 191 ++++++++++++-------
 test.py                                      |  74 +++++++
 5 files changed, 331 insertions(+), 66 deletions(-)
 create mode 100644 test.py

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 3dc4ec6b125d..2fda114f1ef0 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -273,6 +273,125 @@ std::pair<sycl::event, sycl::event>
     return std::make_pair(args_ev, gesv_ev);
 }
 
+
+std::pair<sycl::event, sycl::event>
+    gesv_batch(sycl::queue exec_q,
+               dpctl::tensor::usm_ndarray coeff_matrix,
+               dpctl::tensor::usm_ndarray dependent_vals,
+               const std::vector<sycl::event> &depends)
+{
+    const int coeff_matrix_nd = coeff_matrix.get_ndim();
+    const int dependent_vals_nd = dependent_vals.get_ndim();
+
+    if (coeff_matrix_nd != 3) {
+        throw py::value_error("The coefficient matrix has ndim=" +
+                              std::to_string(coeff_matrix_nd) +
+                              ", but a 3-dimensional array is expected.");
+    }
+
+    if (dependent_vals_nd > 2) {
+        throw py::value_error(
+            "The dependent values array has ndim=" +
+            std::to_string(dependent_vals_nd) +
+            ", but more than 2-dimensional array is expected.");
+    }
+
+    const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
+    const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
+
+    if (coeff_matrix_shape[1] != coeff_matrix_shape[2]) {
+        throw py::value_error("The coefficient matrix must be square,"
+                              " but got a shape of (" +
+                              std::to_string(coeff_matrix_shape[1]) + ", " +
+                              std::to_string(coeff_matrix_shape[2]) + ").");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {coeff_matrix, dependent_vals}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(coeff_matrix, dependent_vals)) {
+        throw py::value_error(
+            "The arrays of coefficients and dependent variables "
+            "are overlapping segments of memory");
+    }
+
+    // bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
+    // if (!is_coeff_matrix_f_contig) {
+    //     throw py::value_error("The coefficient matrix "
+    //                           "must be F-contiguous");
+    // }
+
+    // bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
+    // if (!is_dependent_vals_f_contig) {
+    //     throw py::value_error("The array of dependent variables "
+    //                           "must be F-contiguous");
+    // }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int coeff_matrix_type_id =
+        array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
+    int dependent_vals_type_id =
+        array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
+
+    if (coeff_matrix_type_id != dependent_vals_type_id) {
+        throw py::value_error("The types of the coefficient matrix and "
+                              "dependent variables are mismatched");
+    }
+
+    gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id];
+    if (gesv_fn == nullptr) {
+        throw py::value_error(
+            "No gesv implementation defined for the provided type "
+            "of the coefficient matrix.");
+    }
+
+    char *coeff_matrix_data = coeff_matrix.get_data();
+    char *dependent_vals_data = dependent_vals.get_data();
+
+    const std::int64_t batch_size = coeff_matrix_shape[0];
+    const std::int64_t n = dependent_vals_shape[0];
+    const std::int64_t nrhs =
+        (dependent_vals_nd > 1) ? dependent_vals_shape[1] : 1;
+
+    const std::int64_t lda = std::max<size_t>(1UL, n);
+    const std::int64_t ldb = std::max<size_t>(1UL, n);
+
+    int coeff_matrix_elemsize = coeff_matrix.get_elemsize();
+    int dependent_vals_elemsize = dependent_vals.get_elemsize();
+
+    std::vector<sycl::event> host_task_events;
+    std::vector<sycl::event> gesv_task_events;
+
+    host_task_events.reserve(batch_size);
+    gesv_task_events.reserve(batch_size);
+
+    for (std::int64_t i = 0; i < batch_size; ++i) {
+        char *coeff_matrix_batch = coeff_matrix_data + i * n * n * coeff_matrix_elemsize;
+        char *dependent_vals_batch = dependent_vals_data + i * n * dependent_vals_elemsize;
+
+        sycl::event gesv_ev =
+        gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, dependent_vals_batch,
+                ldb, host_task_events, depends);
+
+        gesv_task_events.push_back(gesv_ev);
+    }
+
+    sycl::event combine_ev = exec_q.submit(
+        [&](sycl::handler &cgh) { cgh.depends_on(gesv_task_events); });
+
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {coeff_matrix, dependent_vals}, host_task_events);
+
+    return std::make_pair(args_ev, combine_ev);
+}
+
 template <typename fnT, typename T>
 struct GesvContigFactory
 {
diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp
index 12486fae7870..4de03370da70 100644
--- a/dpnp/backend/extensions/lapack/gesv.hpp
+++ b/dpnp/backend/extensions/lapack/gesv.hpp
@@ -44,6 +44,12 @@ extern std::pair<sycl::event, sycl::event>
          dpctl::tensor::usm_ndarray dependent_vals,
          const std::vector<sycl::event> &depends);
 
+extern std::pair<sycl::event, sycl::event>
+    gesv_batch(sycl::queue exec_q,
+               dpctl::tensor::usm_ndarray coeff_matrix,
+               dpctl::tensor::usm_ndarray dependent_vals,
+               const std::vector<sycl::event> &depends);
+
 extern void init_gesv_dispatch_vector(void);
 } // namespace lapack
 } // namespace ext
diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp
index e6b4365a906d..863d1ffaea47 100644
--- a/dpnp/backend/extensions/lapack/lapack_py.cpp
+++ b/dpnp/backend/extensions/lapack/lapack_py.cpp
@@ -103,6 +103,13 @@ PYBIND11_MODULE(_lapack_impl, m)
           py::arg("sycl_queue"), py::arg("coeff_matrix"),
           py::arg("dependent_vals"), py::arg("depends") = py::list());
 
+    m.def("_gesv_batch", &lapack_ext::gesv_batch,
+          "Call `gesv` from OneMKL LAPACK library to return "
+          "the batch solution of a system of linear equations with "
+          "a square coefficient matrix A and multiple dependent variables",
+          py::arg("sycl_queue"), py::arg("coeff_matrix"),
+          py::arg("dependent_vals"), py::arg("depends") = py::list());
+
     m.def("_gesvd", &lapack_ext::gesvd,
           "Call `gesvd` from OneMKL LAPACK library to return "
           "the singular value decomposition of a general rectangular matrix",
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 22aa396c7fed..989c81ff118c 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -267,18 +267,109 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
 
     """
 
-    a_usm_arr = dpnp.get_usm_ndarray(a)
-    b_usm_arr = dpnp.get_usm_ndarray(b)
+    new = True
 
-    b_order = "C" if b.flags.c_contiguous else "F"
-    a_shape = a.shape
-    b_shape = b.shape
+    if not new:
+        a_usm_arr = dpnp.get_usm_ndarray(a)
+        b_usm_arr = dpnp.get_usm_ndarray(b)
+
+        b_order = "C" if b.flags.c_contiguous else "F"
+        a_shape = a.shape
+        b_shape = b.shape
+
+        is_cpu_device = exec_q.sycl_device.has_aspect_cpu
+        reshape = False
+        orig_shape_b = b_shape
+        if a.ndim > 3:
+            # get 3d input arrays by reshape
+            if a.ndim == b.ndim:
+                b = b.reshape(-1, b_shape[-2], b_shape[-1])
+            else:
+                b = b.reshape(-1, b_shape[-1])
+
+            a = a.reshape(-1, a_shape[-2], a_shape[-1])
+
+            a_usm_arr = dpnp.get_usm_ndarray(a)
+            b_usm_arr = dpnp.get_usm_ndarray(b)
+            reshape = True
+
+        batch_size = a.shape[0]
+
+        coeff_vecs = [None] * batch_size
+        val_vecs = [None] * batch_size
+        a_ht_copy_ev = [None] * batch_size
+        b_ht_copy_ev = [None] * batch_size
+        ht_lapack_ev = [None] * batch_size
+
+        for i in range(batch_size):
+            # oneMKL LAPACK assumes fortran-like array as input, so allocate
+            # a memory with 'F' order for dpnp array of coefficient matrix
+            coeff_vecs[i] = dpnp.empty_like(
+                a[i], order="F", dtype=res_type, usm_type=res_usm_type
+            )
+
+            # use DPCTL tensor function to fill the coefficient matrix array
+            # with content from the input array
+            a_ht_copy_ev[i], a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=a_usm_arr[i],
+                dst=coeff_vecs[i].get_array(),
+                sycl_queue=a.sycl_queue,
+            )
+
+            # oneMKL LAPACK assumes fortran-like array as input, so
+            # allocate a memory with 'F' order for dpnp array of multiple
+            # dependent variables array
+            val_vecs[i] = dpnp.empty_like(
+                b[i], order="F", dtype=res_type, usm_type=res_usm_type
+            )
+
+            # use DPCTL tensor function to fill the array of multiple dependent
+            # variables with content from the input arrays
+            b_ht_copy_ev[i], b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=b_usm_arr[i],
+                dst=val_vecs[i].get_array(),
+                sycl_queue=b.sycl_queue,
+            )
+
+            # Call the LAPACK extension function _gesv to solve the system of
+            # linear equations using a portion of the coefficient square matrix
+            # and a corresponding portion of the dependent variables array.
+            ht_lapack_ev[i], _ = li._gesv(
+                exec_q,
+                coeff_vecs[i].get_array(),
+                val_vecs[i].get_array(),
+                depends=[a_copy_ev, b_copy_ev],
+            )
+
+            # TODO: Remove this w/a when MKLD-17201 is solved.
+            # Waiting for a host task executing an OneMKL LAPACK gesv call
+            # on CPU causes deadlock due to serialization of all host tasks
+            # in the queue.
+            # We need to wait for each host tasks before calling _gesv to avoid
+            # deadlock.
+            if is_cpu_device:
+                ht_lapack_ev[i].wait()
+                b_ht_copy_ev[i].wait()
+
+        for i in range(batch_size):
+            ht_lapack_ev[i].wait()
+            b_ht_copy_ev[i].wait()
+            a_ht_copy_ev[i].wait()
+
+        # combine the list of solutions into a single array
+        out_v = dpnp.array(
+            val_vecs, order=b_order, dtype=res_type, usm_type=res_usm_type
+        )
+        if reshape:
+            # shape of the out_v must be equal to the shape of the array of
+            # dependent variables
+            out_v = out_v.reshape(orig_shape_b)
+        return out_v
+
+    else:
+        a_shape = a.shape
+        b_shape = b.shape
 
-    is_cpu_device = exec_q.sycl_device.has_aspect_cpu
-    reshape = False
-    orig_shape_b = b_shape
-    if a.ndim > 3:
-        # get 3d input arrays by reshape
         if a.ndim == b.ndim:
             b = b.reshape(-1, b_shape[-2], b_shape[-1])
         else:
@@ -286,82 +377,50 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
 
         a = a.reshape(-1, a_shape[-2], a_shape[-1])
 
+        a = a.transpose((0, 2, 1))
         a_usm_arr = dpnp.get_usm_ndarray(a)
+
+        # b = b.T
         b_usm_arr = dpnp.get_usm_ndarray(b)
-        reshape = True
 
-    batch_size = a.shape[0]
+        # a_usm_arr = dpnp.get_usm_ndarray(a)
+        # b_usm_arr = dpnp.get_usm_ndarray(b)
 
-    coeff_vecs = [None] * batch_size
-    val_vecs = [None] * batch_size
-    a_ht_copy_ev = [None] * batch_size
-    b_ht_copy_ev = [None] * batch_size
-    ht_lapack_ev = [None] * batch_size
+        ht_list_ev = []
 
-    for i in range(batch_size):
-        # oneMKL LAPACK assumes fortran-like array as input, so allocate
-        # a memory with 'F' order for dpnp array of coefficient matrix
-        coeff_vecs[i] = dpnp.empty_like(
-            a[i], order="F", dtype=res_type, usm_type=res_usm_type
-        )
+        a_copy = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type)
 
-        # use DPCTL tensor function to fill the coefficient matrix array
-        # with content from the input array
-        a_ht_copy_ev[i], a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr[i],
-            dst=coeff_vecs[i].get_array(),
+        a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_usm_arr,
+            dst=a_copy.get_array(),
             sycl_queue=a.sycl_queue,
         )
+        ht_list_ev.append(a_ht_copy_ev)
 
-        # oneMKL LAPACK assumes fortran-like array as input, so
-        # allocate a memory with 'F' order for dpnp array of multiple
-        # dependent variables array
-        val_vecs[i] = dpnp.empty_like(
-            b[i], order="F", dtype=res_type, usm_type=res_usm_type
+        b_copy = dpnp.empty_like(
+            b, order="C", dtype=res_type, usm_type=res_usm_type
         )
 
-        # use DPCTL tensor function to fill the array of multiple dependent
-        # variables with content from the input arrays
-        b_ht_copy_ev[i], b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=b_usm_arr[i],
-            dst=val_vecs[i].get_array(),
+        b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=b_usm_arr,
+            dst=b_copy.get_array(),
             sycl_queue=b.sycl_queue,
         )
+        ht_list_ev.append(b_ht_copy_ev)
 
-        # Call the LAPACK extension function _gesv to solve the system of
-        # linear equations using a portion of the coefficient square matrix
-        # and a corresponding portion of the dependent variables array.
-        ht_lapack_ev[i], _ = li._gesv(
+        ht_lapack_ev, _ = li._gesv_batch(
             exec_q,
-            coeff_vecs[i].get_array(),
-            val_vecs[i].get_array(),
+            a_copy.get_array(),
+            b_copy.get_array(),
             depends=[a_copy_ev, b_copy_ev],
         )
 
-        # TODO: Remove this w/a when MKLD-17201 is solved.
-        # Waiting for a host task executing an OneMKL LAPACK gesv call
-        # on CPU causes deadlock due to serialization of all host tasks
-        # in the queue.
-        # We need to wait for each host tasks before calling _gesv to avoid
-        # deadlock.
-        if is_cpu_device:
-            ht_lapack_ev[i].wait()
-            b_ht_copy_ev[i].wait()
+        ht_list_ev.append(ht_lapack_ev)
 
-    for i in range(batch_size):
-        ht_lapack_ev[i].wait()
-        b_ht_copy_ev[i].wait()
-        a_ht_copy_ev[i].wait()
+        dpctl.SyclEvent.wait_for(ht_list_ev)
 
-    # combine the list of solutions into a single array
-    out_v = dpnp.array(
-        val_vecs, order=b_order, dtype=res_type, usm_type=res_usm_type
-    )
-    if reshape:
-        # shape of the out_v must be equal to the shape of the array of
-        # dependent variables
-        out_v = out_v.reshape(orig_shape_b)
-    return out_v
+        v = b_copy.reshape(b_shape)
+        return v
 
 
 def _batched_qr(a, mode="reduced"):
diff --git a/test.py b/test.py
new file mode 100644
index 000000000000..b71d3d67d37f
--- /dev/null
+++ b/test.py
@@ -0,0 +1,74 @@
+import numpy
+
+import dpnp
+import dpnp.linalg
+from tests.helper import generate_random_numpy_array
+
+# a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4')
+# a = numpy.array([[[1,2],[2,3]],[[1,2],[2,3]]],dtype='f4')
+# a = numpy.array([[1,2],[2,3]],dtype='f4')
+# a = numpy.array([[[2,3],[1,2]]],dtype='f4')
+# a = numpy.array([[[1,2],[2,3]]],dtype='f4')
+a = generate_random_numpy_array(
+    (2, 3, 3), dtype='f4', hermitian=False, seed_value=81
+)
+
+b = generate_random_numpy_array(
+    (2, 3), dtype='f4', hermitian=False, seed_value=76
+)
+# a = numpy.array(
+#             [
+#                 [[1, 0, 3], [0, 5, 0], [7, 0, 9]],
+#                 [[3, 0, 3], [0, 7, 0], [7, 0, 11]],
+#             ],
+#             dtype='f4',
+#         )
+a_dp = dpnp.array(a)
+b_dp = dpnp.array(b)
+
+# a_f = numpy.array(a,order="F")
+# a_dp_f = dpnp.array(a_dp,order="F")
+
+# res = numpy.linalg.eigh(a)
+# res_dp = dpnp.linalg.eigh(a_dp)
+# res = numpy.linalg.eigh(a_f)
+# res_dp = dpnp.linalg.eigh(a_dp_f)
+res = numpy.linalg.solve(a,b)
+res_dp = dpnp.linalg.solve(a_dp,b_dp)
+# res = numpy.linalg.eigvalsh(a)
+# res_dp = dpnp.linalg.eigvalsh(a_dp)
+
+
+print("VALS: ")
+print("NUMPY: ")
+print(res)
+print("DPNP: ")
+print(res_dp)
+
+# print("BEFORE")
+# print(a_dp)
+
+# print("VALS: ")
+# print("NUMPY: ")
+# print(res[0])
+# print("DPNP: ")
+# print(res_dp[0])
+
+
+# print("VECS: ")
+# print("NUMPY: ")
+# print(res[1])
+# print("DPNP: ")
+# print(res_dp[1])
+
+# print("AFTER")
+# print(a_dp)
+
+
+# a = generate_random_numpy_array((10,3,3),dtype=numpy.float32, hermitian=True, seed_value=81)
+# a_dp = dpnp.array(a, device='gpu')
+
+# res = numpy.linalg.eigh(a)
+# res_dp = dpnp.linalg.eigh(a_dp)
+
+# print("Done")

From 8e2cb2344308411cfa32c50762452ca0f658e783 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 7 Jun 2024 09:07:42 +0200
Subject: [PATCH 02/37] First working version with transpose and C contig

---
 dpnp/backend/extensions/lapack/gesv.cpp | 18 ++++++++++++++----
 dpnp/linalg/dpnp_utils_linalg.py        | 13 ++++++++++---
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 2fda114f1ef0..ad49db3a2fb2 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -289,7 +289,7 @@ std::pair<sycl::event, sycl::event>
                               ", but a 3-dimensional array is expected.");
     }
 
-    if (dependent_vals_nd > 2) {
+    if (dependent_vals_nd > 3) {
         throw py::value_error(
             "The dependent values array has ndim=" +
             std::to_string(dependent_vals_nd) +
@@ -355,13 +355,23 @@ std::pair<sycl::event, sycl::event>
     char *dependent_vals_data = dependent_vals.get_data();
 
     const std::int64_t batch_size = coeff_matrix_shape[0];
-    const std::int64_t n = dependent_vals_shape[0];
+    // const std::int64_t n = dependent_vals_shape[1];
+    // const std::int64_t nrhs =
+    //     (dependent_vals_nd > 2) ? dependent_vals_shape[2] : 1;
+
+    const std::int64_t n = coeff_matrix_shape[1];
     const std::int64_t nrhs =
-        (dependent_vals_nd > 1) ? dependent_vals_shape[1] : 1;
+        (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1;
 
     const std::int64_t lda = std::max<size_t>(1UL, n);
     const std::int64_t ldb = std::max<size_t>(1UL, n);
 
+    std::cout << "n: " << n << std::endl;
+    std::cout << "nrhs: " << nrhs << std::endl;
+    std::cout << "lda: " << n << std::endl;
+    std::cout << "ldb: " << n << std::endl;
+
+
     int coeff_matrix_elemsize = coeff_matrix.get_elemsize();
     int dependent_vals_elemsize = dependent_vals.get_elemsize();
 
@@ -373,7 +383,7 @@ std::pair<sycl::event, sycl::event>
 
     for (std::int64_t i = 0; i < batch_size; ++i) {
         char *coeff_matrix_batch = coeff_matrix_data + i * n * n * coeff_matrix_elemsize;
-        char *dependent_vals_batch = dependent_vals_data + i * n * dependent_vals_elemsize;
+        char *dependent_vals_batch = dependent_vals_data + i * n * nrhs * dependent_vals_elemsize;
 
         sycl::event gesv_ev =
         gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, dependent_vals_batch,
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 989c81ff118c..d2e80f758fee 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -380,7 +380,10 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
         a = a.transpose((0, 2, 1))
         a_usm_arr = dpnp.get_usm_ndarray(a)
 
-        # b = b.T
+        if b.ndim > 2:
+            b = b.transpose((0,2,1))
+        else:
+            b = b
         b_usm_arr = dpnp.get_usm_ndarray(b)
 
         # a_usm_arr = dpnp.get_usm_ndarray(a)
@@ -388,7 +391,7 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
 
         ht_list_ev = []
 
-        a_copy = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type)
+        a_copy = dpnp.empty_like(a, dtype=res_type, order="C", usm_type=res_usm_type)
 
         a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=a_usm_arr,
@@ -419,7 +422,11 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
 
         dpctl.SyclEvent.wait_for(ht_list_ev)
 
-        v = b_copy.reshape(b_shape)
+        if b.ndim > 2:
+            v = b_copy.transpose(0,2,1).reshape(b_shape)
+        else:
+            v = b_copy.reshape(b_shape)
+        # v = b_copy.reshape(b_shape)
         return v
 
 

From 67fa43592719ca522b7b0f473023705d9c1e62f0 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 7 Jun 2024 10:47:13 +0200
Subject: [PATCH 03/37] Second working version with moveaxis, transpose and F
 contig

---
 dpnp/backend/extensions/lapack/gesv.cpp |  9 +--
 dpnp/linalg/dpnp_utils_linalg.py        | 80 +++++++++++++++++--------
 2 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index ad49db3a2fb2..e0ddbad1c893 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -299,11 +299,11 @@ std::pair<sycl::event, sycl::event>
     const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
     const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
 
-    if (coeff_matrix_shape[1] != coeff_matrix_shape[2]) {
+    if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
         throw py::value_error("The coefficient matrix must be square,"
                               " but got a shape of (" +
-                              std::to_string(coeff_matrix_shape[1]) + ", " +
-                              std::to_string(coeff_matrix_shape[2]) + ").");
+                              std::to_string(coeff_matrix_shape[0]) + ", " +
+                              std::to_string(coeff_matrix_shape[1]) + ").");
     }
 
     // check compatibility of execution queue and allocation queue
@@ -354,7 +354,7 @@ std::pair<sycl::event, sycl::event>
     char *coeff_matrix_data = coeff_matrix.get_data();
     char *dependent_vals_data = dependent_vals.get_data();
 
-    const std::int64_t batch_size = coeff_matrix_shape[0];
+    const std::int64_t batch_size = coeff_matrix_shape[2];
     // const std::int64_t n = dependent_vals_shape[1];
     // const std::int64_t nrhs =
     //     (dependent_vals_nd > 2) ? dependent_vals_shape[2] : 1;
@@ -370,6 +370,7 @@ std::pair<sycl::event, sycl::event>
     std::cout << "nrhs: " << nrhs << std::endl;
     std::cout << "lda: " << n << std::endl;
     std::cout << "ldb: " << n << std::endl;
+    std::cout << "b_nd: " << dependent_vals_nd << std::endl;
 
 
     int coeff_matrix_elemsize = coeff_matrix.get_elemsize();
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index d2e80f758fee..dc7e215e557c 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -377,56 +377,84 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
 
         a = a.reshape(-1, a_shape[-2], a_shape[-1])
 
-        a = a.transpose((0, 2, 1))
-        a_usm_arr = dpnp.get_usm_ndarray(a)
+        n_a = a.shape[1]
+        m,n = b.shape[-2:]
+
+        a = dpnp.moveaxis(a,(-2,-1),(0,1))
+        a = dpnp.array(a, dtype=res_type, order="F", copy=True)
+        a = dpnp.reshape(a,(n_a,n_a,-1))
 
         if b.ndim > 2:
-            b = b.transpose((0,2,1))
+            b = dpnp.moveaxis(b,(-2,-1),(0,1))
+        else:
+            b = b.T
+        b = dpnp.array(b, dtype=res_type, order="F", copy=True)
+        if b.ndim > 2:
+            b = dpnp.reshape(b,(m,n,-1))
         else:
-            b = b
+            b = b.T
+
+        a_usm_arr = dpnp.get_usm_ndarray(a)
         b_usm_arr = dpnp.get_usm_ndarray(b)
 
+
+        # a = a.reshape(-1, a_shape[-2], a_shape[-1])
+
+        # a = a.transpose((0, 2, 1))
+        # a_usm_arr = dpnp.get_usm_ndarray(a)
+
+        # if b.ndim > 2:
+        #     b = b.transpose((0,2,1))
+        # else:
+        #     b = b
+        # b_usm_arr = dpnp.get_usm_ndarray(b)
+
         # a_usm_arr = dpnp.get_usm_ndarray(a)
         # b_usm_arr = dpnp.get_usm_ndarray(b)
 
         ht_list_ev = []
 
-        a_copy = dpnp.empty_like(a, dtype=res_type, order="C", usm_type=res_usm_type)
+        # a_copy = dpnp.empty_like(a, dtype=res_type, order="C", usm_type=res_usm_type)
 
-        a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr,
-            dst=a_copy.get_array(),
-            sycl_queue=a.sycl_queue,
-        )
-        ht_list_ev.append(a_ht_copy_ev)
+        # a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        #     src=a_usm_arr,
+        #     dst=a_copy.get_array(),
+        #     sycl_queue=a.sycl_queue,
+        # )
+        # ht_list_ev.append(a_ht_copy_ev)
 
-        b_copy = dpnp.empty_like(
-            b, order="C", dtype=res_type, usm_type=res_usm_type
-        )
+        # b_copy = dpnp.empty_like(
+        #     b, order="C", dtype=res_type, usm_type=res_usm_type
+        # )
 
-        b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=b_usm_arr,
-            dst=b_copy.get_array(),
-            sycl_queue=b.sycl_queue,
-        )
-        ht_list_ev.append(b_ht_copy_ev)
+        # b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        #     src=b_usm_arr,
+        #     dst=b_copy.get_array(),
+        #     sycl_queue=b.sycl_queue,
+        # )
+        # ht_list_ev.append(b_ht_copy_ev)
 
         ht_lapack_ev, _ = li._gesv_batch(
             exec_q,
-            a_copy.get_array(),
-            b_copy.get_array(),
-            depends=[a_copy_ev, b_copy_ev],
+            a.get_array(),
+            b.get_array(),
+            # depends=[a_copy_ev, b_copy_ev],
+            depends=[],
         )
 
         ht_list_ev.append(ht_lapack_ev)
 
         dpctl.SyclEvent.wait_for(ht_list_ev)
 
+        # if b.ndim > 2:
+        #     v = b_copy.transpose(0,2,1).reshape(b_shape)
+        # else:
+        #     v = b_copy.reshape(b_shape)
         if b.ndim > 2:
-            v = b_copy.transpose(0,2,1).reshape(b_shape)
+            v = dpnp.moveaxis(b, -1 , 0)
+            v = v.reshape(b_shape)
         else:
-            v = b_copy.reshape(b_shape)
-        # v = b_copy.reshape(b_shape)
+            v = b.reshape(b_shape)
         return v
 
 

From 4f5abecc9c117b534499e8997a6088722d15774a Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 11 Jun 2024 23:27:31 +0200
Subject: [PATCH 04/37] Add more shape checks

---
 dpnp/backend/extensions/lapack/gesv.cpp | 79 ++++++++++++++++---------
 1 file changed, 50 insertions(+), 29 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index e0ddbad1c893..192e0325cf3f 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -273,7 +273,6 @@ std::pair<sycl::event, sycl::event>
     return std::make_pair(args_ev, gesv_ev);
 }
 
-
 std::pair<sycl::event, sycl::event>
     gesv_batch(sycl::queue exec_q,
                dpctl::tensor::usm_ndarray coeff_matrix,
@@ -289,16 +288,20 @@ std::pair<sycl::event, sycl::event>
                               ", but a 3-dimensional array is expected.");
     }
 
-    if (dependent_vals_nd > 3) {
+    if (dependent_vals_nd < 2 || dependent_vals_nd > 3) {
         throw py::value_error(
             "The dependent values array has ndim=" +
             std::to_string(dependent_vals_nd) +
-            ", but more than 2-dimensional array is expected.");
+            ", but a 2-dimensional or 3-dimensional array is expected.");
     }
 
     const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
     const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
 
+    // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
+    // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or
+    // (n,batch_size) respectively.
+
     if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
         throw py::value_error("The coefficient matrix must be square,"
                               " but got a shape of (" +
@@ -306,6 +309,34 @@ std::pair<sycl::event, sycl::event>
                               std::to_string(coeff_matrix_shape[1]) + ").");
     }
 
+    if (coeff_matrix_shape[0] != dependent_vals_shape[0]) {
+        throw py::value_error("The first dimension (n) of coeff_matrix and"
+                              " dependent_vals must be the same, but got " +
+                              std::to_string(coeff_matrix_shape[0]) + " and " +
+                              std::to_string(dependent_vals_shape[0]) + ".");
+    }
+
+    if (dependent_vals_nd == 2) {
+        if (coeff_matrix_shape[2] != dependent_vals_shape[1]) {
+            throw py::value_error(
+                "The batch_size of "
+                " coeff_matrix and dependent_vals must be"
+                " the same, but got " +
+                std::to_string(coeff_matrix_shape[2]) + " and " +
+                std::to_string(dependent_vals_shape[1]) + ".");
+        }
+    }
+    else if (dependent_vals_nd == 3) {
+        if (coeff_matrix_shape[2] != dependent_vals_shape[2]) {
+            throw py::value_error(
+                "The batch_size of "
+                " coeff_matrix and dependent_vals must be"
+                " the same, but got " +
+                std::to_string(coeff_matrix_shape[2]) + " and " +
+                std::to_string(dependent_vals_shape[2]) + ".");
+        }
+    }
+
     // check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q,
                                              {coeff_matrix, dependent_vals}))
@@ -321,17 +352,17 @@ std::pair<sycl::event, sycl::event>
             "are overlapping segments of memory");
     }
 
-    // bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
-    // if (!is_coeff_matrix_f_contig) {
-    //     throw py::value_error("The coefficient matrix "
-    //                           "must be F-contiguous");
-    // }
+    bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
+    if (!is_coeff_matrix_f_contig) {
+        throw py::value_error("The coefficient matrix "
+                              "must be F-contiguous");
+    }
 
-    // bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
-    // if (!is_dependent_vals_f_contig) {
-    //     throw py::value_error("The array of dependent variables "
-    //                           "must be F-contiguous");
-    // }
+    bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
+    if (!is_dependent_vals_f_contig) {
+        throw py::value_error("The array of dependent variables "
+                              "must be F-contiguous");
+    }
 
     auto array_types = dpctl_td_ns::usm_ndarray_types();
     int coeff_matrix_type_id =
@@ -355,10 +386,6 @@ std::pair<sycl::event, sycl::event>
     char *dependent_vals_data = dependent_vals.get_data();
 
     const std::int64_t batch_size = coeff_matrix_shape[2];
-    // const std::int64_t n = dependent_vals_shape[1];
-    // const std::int64_t nrhs =
-    //     (dependent_vals_nd > 2) ? dependent_vals_shape[2] : 1;
-
     const std::int64_t n = coeff_matrix_shape[1];
     const std::int64_t nrhs =
         (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1;
@@ -366,13 +393,6 @@ std::pair<sycl::event, sycl::event>
     const std::int64_t lda = std::max<size_t>(1UL, n);
     const std::int64_t ldb = std::max<size_t>(1UL, n);
 
-    std::cout << "n: " << n << std::endl;
-    std::cout << "nrhs: " << nrhs << std::endl;
-    std::cout << "lda: " << n << std::endl;
-    std::cout << "ldb: " << n << std::endl;
-    std::cout << "b_nd: " << dependent_vals_nd << std::endl;
-
-
     int coeff_matrix_elemsize = coeff_matrix.get_elemsize();
     int dependent_vals_elemsize = dependent_vals.get_elemsize();
 
@@ -383,12 +403,14 @@ std::pair<sycl::event, sycl::event>
     gesv_task_events.reserve(batch_size);
 
     for (std::int64_t i = 0; i < batch_size; ++i) {
-        char *coeff_matrix_batch = coeff_matrix_data + i * n * n * coeff_matrix_elemsize;
-        char *dependent_vals_batch = dependent_vals_data + i * n * nrhs * dependent_vals_elemsize;
+        char *coeff_matrix_batch =
+            coeff_matrix_data + i * n * n * coeff_matrix_elemsize;
+        char *dependent_vals_batch =
+            dependent_vals_data + i * n * nrhs * dependent_vals_elemsize;
 
         sycl::event gesv_ev =
-        gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda, dependent_vals_batch,
-                ldb, host_task_events, depends);
+            gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda,
+                    dependent_vals_batch, ldb, host_task_events, depends);
 
         gesv_task_events.push_back(gesv_ev);
     }
@@ -396,7 +418,6 @@ std::pair<sycl::event, sycl::event>
     sycl::event combine_ev = exec_q.submit(
         [&](sycl::handler &cgh) { cgh.depends_on(gesv_task_events); });
 
-
     sycl::event args_ev = dpctl::utils::keep_args_alive(
         exec_q, {coeff_matrix, dependent_vals}, host_task_events);
 

From 0cb2808181df14d55b05bb083b449956365e306a Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 11 Jun 2024 23:30:40 +0200
Subject: [PATCH 05/37] Pass sycl::queue by reference for gesv/gesv_batch

---
 dpnp/backend/extensions/lapack/gesv.cpp | 8 ++++----
 dpnp/backend/extensions/lapack/gesv.hpp | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 192e0325cf3f..de5d38cdfc00 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -48,7 +48,7 @@ namespace mkl_lapack = oneapi::mkl::lapack;
 namespace py = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
 
-typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue,
+typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &,
                                           const std::int64_t,
                                           const std::int64_t,
                                           char *,
@@ -61,7 +61,7 @@ typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue,
 static gesv_impl_fn_ptr_t gesv_dispatch_vector[dpctl_td_ns::num_types];
 
 template <typename T>
-static sycl::event gesv_impl(sycl::queue exec_q,
+static sycl::event gesv_impl(sycl::queue &exec_q,
                              const std::int64_t n,
                              const std::int64_t nrhs,
                              char *in_a,
@@ -176,7 +176,7 @@ static sycl::event gesv_impl(sycl::queue exec_q,
 }
 
 std::pair<sycl::event, sycl::event>
-    gesv(sycl::queue exec_q,
+    gesv(sycl::queue &exec_q,
          dpctl::tensor::usm_ndarray coeff_matrix,
          dpctl::tensor::usm_ndarray dependent_vals,
          const std::vector<sycl::event> &depends)
@@ -274,7 +274,7 @@ std::pair<sycl::event, sycl::event>
 }
 
 std::pair<sycl::event, sycl::event>
-    gesv_batch(sycl::queue exec_q,
+    gesv_batch(sycl::queue &exec_q,
                dpctl::tensor::usm_ndarray coeff_matrix,
                dpctl::tensor::usm_ndarray dependent_vals,
                const std::vector<sycl::event> &depends)
diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp
index 4de03370da70..c9dfa1a3de50 100644
--- a/dpnp/backend/extensions/lapack/gesv.hpp
+++ b/dpnp/backend/extensions/lapack/gesv.hpp
@@ -39,13 +39,13 @@ namespace ext
 namespace lapack
 {
 extern std::pair<sycl::event, sycl::event>
-    gesv(sycl::queue exec_q,
+    gesv(sycl::queue &exec_q,
          dpctl::tensor::usm_ndarray coeff_matrix,
          dpctl::tensor::usm_ndarray dependent_vals,
          const std::vector<sycl::event> &depends);
 
 extern std::pair<sycl::event, sycl::event>
-    gesv_batch(sycl::queue exec_q,
+    gesv_batch(sycl::queue &exec_q,
                dpctl::tensor::usm_ndarray coeff_matrix,
                dpctl::tensor::usm_ndarray dependent_vals,
                const std::vector<sycl::event> &depends);

From bfa37d4c0641a7e764afa183d895c5727f89ab16 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 11 Jun 2024 23:36:08 +0200
Subject: [PATCH 06/37] qwe

---
 dpnp/linalg/dpnp_utils_linalg.py | 71 +++++++++++++++++---------------
 perf.py                          | 22 ++++++++++
 test.py                          | 15 +++----
 3 files changed, 68 insertions(+), 40 deletions(-)
 create mode 100644 perf.py

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index dc7e215e557c..96ec793dded9 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -381,21 +381,23 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
         m,n = b.shape[-2:]
 
         a = dpnp.moveaxis(a,(-2,-1),(0,1))
-        a = dpnp.array(a, dtype=res_type, order="F", copy=True)
-        a = dpnp.reshape(a,(n_a,n_a,-1))
+        # a_f = dpnp.array(a, dtype=res_type, order="F", copy=True)
+        # a = dpnp.reshape(a,(n_a,n_a,-1))
 
         if b.ndim > 2:
             b = dpnp.moveaxis(b,(-2,-1),(0,1))
         else:
             b = b.T
-        b = dpnp.array(b, dtype=res_type, order="F", copy=True)
-        if b.ndim > 2:
-            b = dpnp.reshape(b,(m,n,-1))
-        else:
-            b = b.T
+        # b_f = dpnp.array(b, dtype=res_type, order="F", copy=True)
+        # if b.ndim > 2:
+        #     # b = dpnp.reshape(b,(m,n,-1))
+        #     b = b
+        # else:
+        #     # b = b.T
+        #     b = b
 
-        a_usm_arr = dpnp.get_usm_ndarray(a)
-        b_usm_arr = dpnp.get_usm_ndarray(b)
+        # a_usm_arr = dpnp.get_usm_ndarray(a)
+        # b_usm_arr = dpnp.get_usm_ndarray(b)
 
 
         # a = a.reshape(-1, a_shape[-2], a_shape[-1])
@@ -409,37 +411,36 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
         #     b = b
         # b_usm_arr = dpnp.get_usm_ndarray(b)
 
-        # a_usm_arr = dpnp.get_usm_ndarray(a)
-        # b_usm_arr = dpnp.get_usm_ndarray(b)
+        a_usm_arr = dpnp.get_usm_ndarray(a)
+        b_usm_arr = dpnp.get_usm_ndarray(b)
 
         ht_list_ev = []
 
-        # a_copy = dpnp.empty_like(a, dtype=res_type, order="C", usm_type=res_usm_type)
+        a_f = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type)
 
-        # a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        #     src=a_usm_arr,
-        #     dst=a_copy.get_array(),
-        #     sycl_queue=a.sycl_queue,
-        # )
-        # ht_list_ev.append(a_ht_copy_ev)
+        a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_usm_arr,
+            dst=a_f.get_array(),
+            sycl_queue=a.sycl_queue,
+        )
+        ht_list_ev.append(a_ht_copy_ev)
 
-        # b_copy = dpnp.empty_like(
-        #     b, order="C", dtype=res_type, usm_type=res_usm_type
-        # )
+        b_f = dpnp.empty_like(
+            b, order="F", dtype=res_type, usm_type=res_usm_type
+        )
 
-        # b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        #     src=b_usm_arr,
-        #     dst=b_copy.get_array(),
-        #     sycl_queue=b.sycl_queue,
-        # )
-        # ht_list_ev.append(b_ht_copy_ev)
+        b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=b_usm_arr,
+            dst=b_f.get_array(),
+            sycl_queue=b.sycl_queue,
+        )
+        ht_list_ev.append(b_ht_copy_ev)
 
         ht_lapack_ev, _ = li._gesv_batch(
             exec_q,
-            a.get_array(),
-            b.get_array(),
-            # depends=[a_copy_ev, b_copy_ev],
-            depends=[],
+            a_f.get_array(),
+            b_f.get_array(),
+            depends=[a_copy_ev, b_copy_ev],
         )
 
         ht_list_ev.append(ht_lapack_ev)
@@ -451,10 +452,14 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
         # else:
         #     v = b_copy.reshape(b_shape)
         if b.ndim > 2:
-            v = dpnp.moveaxis(b, -1 , 0)
+            v = dpnp.moveaxis(b_f, -1 , 0)
             v = v.reshape(b_shape)
         else:
-            v = b.reshape(b_shape)
+            v = dpnp.moveaxis(b_f,-1,0)
+            v = v.reshape(b_shape)
+
+        # v = dpnp.copy(dpnp.moveaxis(b_f, -1, 0), order="C").reshape(b_shape)
+
         return v
 
 
diff --git a/perf.py b/perf.py
new file mode 100644
index 000000000000..20de9e824e66
--- /dev/null
+++ b/perf.py
@@ -0,0 +1,22 @@
+import numpy
+import dpnp
+from tests.helper import generate_random_numpy_array
+import cProfile, pstats, io
+
+a = generate_random_numpy_array((256,512,512),dtype='f4',hermitian=False, seed_value=81)
+b = generate_random_numpy_array((256,512,),dtype='f4',hermitian=False, seed_value=76)
+
+a_dp = dpnp.array(a,device='cpu')
+b_dp = dpnp.array(b,device='cpu')
+
+cold_run = dpnp.linalg.solve(a_dp, b_dp)
+pr = cProfile.Profile()
+pr.enable()
+# res = numpy.linalg.solve(a, b)
+res = dpnp.linalg.solve(a_dp, b_dp)
+pr.disable()
+s = io.StringIO()
+sortBy = pstats.SortKey.CUMULATIVE
+ps = pstats.Stats(pr, stream=s).sort_stats(sortBy)
+ps.print_stats()
+print(s.getvalue())
diff --git a/test.py b/test.py
index b71d3d67d37f..fc22fc3309c8 100644
--- a/test.py
+++ b/test.py
@@ -5,17 +5,18 @@
 from tests.helper import generate_random_numpy_array
 
 # a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4')
-# a = numpy.array([[[1,2],[2,3]],[[1,2],[2,3]]],dtype='f4')
+a = numpy.array([[[1,2],[3,5]],[[1,2],[3,5]],[[1,2],[3,5]]], dtype='f4')
+b = numpy.array([[1,2],[1,2],[1,2]], dtype='f4')
 # a = numpy.array([[1,2],[2,3]],dtype='f4')
 # a = numpy.array([[[2,3],[1,2]]],dtype='f4')
 # a = numpy.array([[[1,2],[2,3]]],dtype='f4')
-a = generate_random_numpy_array(
-    (2, 3, 3), dtype='f4', hermitian=False, seed_value=81
-)
+# a = generate_random_numpy_array(
+#     (2, 3, 3), dtype='f4', hermitian=False, seed_value=81
+# )
 
-b = generate_random_numpy_array(
-    (2, 3), dtype='f4', hermitian=False, seed_value=76
-)
+# b = generate_random_numpy_array(
+#     (2, 3, 2), dtype='f4', hermitian=False, seed_value=76
+# )
 # a = numpy.array(
 #             [
 #                 [[1, 0, 3], [0, 5, 0], [7, 0, 9]],

From 4a4429227d710ac3390ebd05682b8d2f450f9465 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 12 Jun 2024 14:34:13 +0200
Subject: [PATCH 07/37] Update _batched_solve implementation

---
 dpnp/linalg/dpnp_iface_linalg.py |   4 +-
 dpnp/linalg/dpnp_utils_linalg.py | 143 +++++++++++++------------------
 2 files changed, 60 insertions(+), 87 deletions(-)

diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index 72d79ad329d7..8f535ba8b47e 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -1093,7 +1093,7 @@ def qr(a, mode="reduced"):
     return dpnp_qr(a, mode)
 
 
-def solve(a, b):
+def solve(a, b, new=False):
     """
     Solve a linear matrix equation, or system of linear scalar equations.
 
@@ -1143,7 +1143,7 @@ def solve(a, b):
             "or (..., M, K)"
         )
 
-    return dpnp_solve(a, b)
+    return dpnp_solve(a, b, new=new)
 
 
 def svd(a, full_matrices=True, compute_uv=True, hermitian=False):
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 96ec793dded9..3b68ff964874 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -257,7 +257,7 @@ def _batched_inv(a, res_type):
     return a_h.reshape(orig_shape)
 
 
-def _batched_solve(a, b, exec_q, res_usm_type, res_type):
+def _batched_solve(a, b, exec_q, res_usm_type, res_type, new=False):
     """
     _batched_solve(a, b, exec_q, res_usm_type, res_type)
 
@@ -267,8 +267,6 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
 
     """
 
-    new = True
-
     if not new:
         a_usm_arr = dpnp.get_usm_ndarray(a)
         b_usm_arr = dpnp.get_usm_ndarray(b)
@@ -366,101 +364,76 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
             out_v = out_v.reshape(orig_shape_b)
         return out_v
 
-    else:
-        a_shape = a.shape
-        b_shape = b.shape
-
-        if a.ndim == b.ndim:
-            b = b.reshape(-1, b_shape[-2], b_shape[-1])
-        else:
-            b = b.reshape(-1, b_shape[-1])
-
-        a = a.reshape(-1, a_shape[-2], a_shape[-1])
-
-        n_a = a.shape[1]
-        m,n = b.shape[-2:]
-
-        a = dpnp.moveaxis(a,(-2,-1),(0,1))
-        # a_f = dpnp.array(a, dtype=res_type, order="F", copy=True)
-        # a = dpnp.reshape(a,(n_a,n_a,-1))
-
-        if b.ndim > 2:
-            b = dpnp.moveaxis(b,(-2,-1),(0,1))
-        else:
-            b = b.T
-        # b_f = dpnp.array(b, dtype=res_type, order="F", copy=True)
-        # if b.ndim > 2:
-        #     # b = dpnp.reshape(b,(m,n,-1))
-        #     b = b
-        # else:
-        #     # b = b.T
-        #     b = b
-
-        # a_usm_arr = dpnp.get_usm_ndarray(a)
-        # b_usm_arr = dpnp.get_usm_ndarray(b)
-
+    a_shape = a.shape
+    b_shape = b.shape
 
-        # a = a.reshape(-1, a_shape[-2], a_shape[-1])
+    # gesv_batch expects `a` to be a 3D array and
+    # `b` to be either a 2D or 3D array.
+    if a.ndim == b.ndim:
+        b = b.reshape(-1, b_shape[-2], b_shape[-1])
+    else:
+        b = b.reshape(-1, b_shape[-1])
 
-        # a = a.transpose((0, 2, 1))
-        # a_usm_arr = dpnp.get_usm_ndarray(a)
+    a = a.reshape(-1, a_shape[-2], a_shape[-1])
 
-        # if b.ndim > 2:
-        #     b = b.transpose((0,2,1))
-        # else:
-        #     b = b
-        # b_usm_arr = dpnp.get_usm_ndarray(b)
+    # Reorder the elements by moving the last two axes of `a`` to the front
+    # to match fortran-like array order which is assumed by gesv.
+    a = dpnp.moveaxis(a, (-2, -1), (0, 1))
+    # The same for `b` if it is 3D;
+    # if it is 2D, transpose it.
+    if b.ndim > 2:
+        b = dpnp.moveaxis(b, (-2, -1), (0, 1))
+    else:
+        b = b.T
 
-        a_usm_arr = dpnp.get_usm_ndarray(a)
-        b_usm_arr = dpnp.get_usm_ndarray(b)
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+    b_usm_arr = dpnp.get_usm_ndarray(b)
 
-        ht_list_ev = []
+    ht_list_ev = []
 
-        a_f = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type)
+    # oneMKL LAPACK gesv destroys `a` and assumes fortran-like array
+    # as input.
+    a_f = dpnp.empty_like(a, dtype=res_type, order="F", usm_type=res_usm_type)
 
-        a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr,
-            dst=a_f.get_array(),
-            sycl_queue=a.sycl_queue,
-        )
-        ht_list_ev.append(a_ht_copy_ev)
+    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_f.get_array(),
+        sycl_queue=exec_q,
+    )
+    ht_list_ev.append(a_ht_copy_ev)
 
-        b_f = dpnp.empty_like(
-            b, order="F", dtype=res_type, usm_type=res_usm_type
-        )
+    # oneMKL LAPACK gesv overwrites `b` and assumes fortran-like array
+    # as input.
+    b_f = dpnp.empty_like(b, order="F", dtype=res_type, usm_type=res_usm_type)
 
-        b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=b_usm_arr,
-            dst=b_f.get_array(),
-            sycl_queue=b.sycl_queue,
-        )
-        ht_list_ev.append(b_ht_copy_ev)
+    b_ht_copy_ev, b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=b_usm_arr,
+        dst=b_f.get_array(),
+        sycl_queue=exec_q,
+    )
+    ht_list_ev.append(b_ht_copy_ev)
 
-        ht_lapack_ev, _ = li._gesv_batch(
-            exec_q,
-            a_f.get_array(),
-            b_f.get_array(),
-            depends=[a_copy_ev, b_copy_ev],
-        )
+    ht_lapack_ev, _ = li._gesv_batch(
+        exec_q,
+        a_f.get_array(),
+        b_f.get_array(),
+        depends=[a_copy_ev, b_copy_ev],
+    )
 
-        ht_list_ev.append(ht_lapack_ev)
+    ht_list_ev.append(ht_lapack_ev)
 
-        dpctl.SyclEvent.wait_for(ht_list_ev)
+    dpctl.SyclEvent.wait_for(ht_list_ev)
 
-        # if b.ndim > 2:
-        #     v = b_copy.transpose(0,2,1).reshape(b_shape)
-        # else:
-        #     v = b_copy.reshape(b_shape)
-        if b.ndim > 2:
-            v = dpnp.moveaxis(b_f, -1 , 0)
-            v = v.reshape(b_shape)
-        else:
-            v = dpnp.moveaxis(b_f,-1,0)
-            v = v.reshape(b_shape)
+    # Gesv call overwtires `b` in Fortran order, reorder the axes
+    # to match C order by moving the last axis to the front and
+    # reshape it back to the original shape of `b`.
+    v = dpnp.moveaxis(b_f, -1, 0).reshape(b_shape)
 
-        # v = dpnp.copy(dpnp.moveaxis(b_f, -1, 0), order="C").reshape(b_shape)
+    # dpnp.moveaxis can make the array non-contiguous if it is not 2D
+    if b.ndim > 2:
+        v = dpnp.ascontiguousarray(v)
 
-        return v
+    return v
 
 
 def _batched_qr(a, mode="reduced"):
@@ -2543,7 +2516,7 @@ def dpnp_qr(a, mode="reduced"):
     return (q, r)
 
 
-def dpnp_solve(a, b):
+def dpnp_solve(a, b, new=False):
     """
     dpnp_solve(a, b)
 
@@ -2560,7 +2533,7 @@ def dpnp_solve(a, b):
         return dpnp.empty_like(b, dtype=res_type, usm_type=res_usm_type)
 
     if a.ndim > 2:
-        return _batched_solve(a, b, exec_q, res_usm_type, res_type)
+        return _batched_solve(a, b, exec_q, res_usm_type, res_type, new=new)
 
     a_usm_arr = dpnp.get_usm_ndarray(a)
     b_usm_arr = dpnp.get_usm_ndarray(b)

From df4774ef67d7a803e2b2e2edde1d37bdac7475f0 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 12 Jun 2024 14:37:12 +0200
Subject: [PATCH 08/37] Remove old impl in _batched_solve

---
 dpnp/linalg/dpnp_iface_linalg.py |   4 +-
 dpnp/linalg/dpnp_utils_linalg.py | 105 ++-----------------------------
 2 files changed, 6 insertions(+), 103 deletions(-)

diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index 8f535ba8b47e..72d79ad329d7 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -1093,7 +1093,7 @@ def qr(a, mode="reduced"):
     return dpnp_qr(a, mode)
 
 
-def solve(a, b, new=False):
+def solve(a, b):
     """
     Solve a linear matrix equation, or system of linear scalar equations.
 
@@ -1143,7 +1143,7 @@ def solve(a, b, new=False):
             "or (..., M, K)"
         )
 
-    return dpnp_solve(a, b, new=new)
+    return dpnp_solve(a, b)
 
 
 def svd(a, full_matrices=True, compute_uv=True, hermitian=False):
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 3b68ff964874..4e176ce884f3 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -257,7 +257,7 @@ def _batched_inv(a, res_type):
     return a_h.reshape(orig_shape)
 
 
-def _batched_solve(a, b, exec_q, res_usm_type, res_type, new=False):
+def _batched_solve(a, b, exec_q, res_usm_type, res_type):
     """
     _batched_solve(a, b, exec_q, res_usm_type, res_type)
 
@@ -267,103 +267,6 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type, new=False):
 
     """
 
-    if not new:
-        a_usm_arr = dpnp.get_usm_ndarray(a)
-        b_usm_arr = dpnp.get_usm_ndarray(b)
-
-        b_order = "C" if b.flags.c_contiguous else "F"
-        a_shape = a.shape
-        b_shape = b.shape
-
-        is_cpu_device = exec_q.sycl_device.has_aspect_cpu
-        reshape = False
-        orig_shape_b = b_shape
-        if a.ndim > 3:
-            # get 3d input arrays by reshape
-            if a.ndim == b.ndim:
-                b = b.reshape(-1, b_shape[-2], b_shape[-1])
-            else:
-                b = b.reshape(-1, b_shape[-1])
-
-            a = a.reshape(-1, a_shape[-2], a_shape[-1])
-
-            a_usm_arr = dpnp.get_usm_ndarray(a)
-            b_usm_arr = dpnp.get_usm_ndarray(b)
-            reshape = True
-
-        batch_size = a.shape[0]
-
-        coeff_vecs = [None] * batch_size
-        val_vecs = [None] * batch_size
-        a_ht_copy_ev = [None] * batch_size
-        b_ht_copy_ev = [None] * batch_size
-        ht_lapack_ev = [None] * batch_size
-
-        for i in range(batch_size):
-            # oneMKL LAPACK assumes fortran-like array as input, so allocate
-            # a memory with 'F' order for dpnp array of coefficient matrix
-            coeff_vecs[i] = dpnp.empty_like(
-                a[i], order="F", dtype=res_type, usm_type=res_usm_type
-            )
-
-            # use DPCTL tensor function to fill the coefficient matrix array
-            # with content from the input array
-            a_ht_copy_ev[i], a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=a_usm_arr[i],
-                dst=coeff_vecs[i].get_array(),
-                sycl_queue=a.sycl_queue,
-            )
-
-            # oneMKL LAPACK assumes fortran-like array as input, so
-            # allocate a memory with 'F' order for dpnp array of multiple
-            # dependent variables array
-            val_vecs[i] = dpnp.empty_like(
-                b[i], order="F", dtype=res_type, usm_type=res_usm_type
-            )
-
-            # use DPCTL tensor function to fill the array of multiple dependent
-            # variables with content from the input arrays
-            b_ht_copy_ev[i], b_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=b_usm_arr[i],
-                dst=val_vecs[i].get_array(),
-                sycl_queue=b.sycl_queue,
-            )
-
-            # Call the LAPACK extension function _gesv to solve the system of
-            # linear equations using a portion of the coefficient square matrix
-            # and a corresponding portion of the dependent variables array.
-            ht_lapack_ev[i], _ = li._gesv(
-                exec_q,
-                coeff_vecs[i].get_array(),
-                val_vecs[i].get_array(),
-                depends=[a_copy_ev, b_copy_ev],
-            )
-
-            # TODO: Remove this w/a when MKLD-17201 is solved.
-            # Waiting for a host task executing an OneMKL LAPACK gesv call
-            # on CPU causes deadlock due to serialization of all host tasks
-            # in the queue.
-            # We need to wait for each host tasks before calling _gesv to avoid
-            # deadlock.
-            if is_cpu_device:
-                ht_lapack_ev[i].wait()
-                b_ht_copy_ev[i].wait()
-
-        for i in range(batch_size):
-            ht_lapack_ev[i].wait()
-            b_ht_copy_ev[i].wait()
-            a_ht_copy_ev[i].wait()
-
-        # combine the list of solutions into a single array
-        out_v = dpnp.array(
-            val_vecs, order=b_order, dtype=res_type, usm_type=res_usm_type
-        )
-        if reshape:
-            # shape of the out_v must be equal to the shape of the array of
-            # dependent variables
-            out_v = out_v.reshape(orig_shape_b)
-        return out_v
-
     a_shape = a.shape
     b_shape = b.shape
 
@@ -376,7 +279,7 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type, new=False):
 
     a = a.reshape(-1, a_shape[-2], a_shape[-1])
 
-    # Reorder the elements by moving the last two axes of `a`` to the front
+    # Reorder the elements by moving the last two axes of `a` to the front
     # to match fortran-like array order which is assumed by gesv.
     a = dpnp.moveaxis(a, (-2, -1), (0, 1))
     # The same for `b` if it is 3D;
@@ -2516,7 +2419,7 @@ def dpnp_qr(a, mode="reduced"):
     return (q, r)
 
 
-def dpnp_solve(a, b, new=False):
+def dpnp_solve(a, b):
     """
     dpnp_solve(a, b)
 
@@ -2533,7 +2436,7 @@ def dpnp_solve(a, b, new=False):
         return dpnp.empty_like(b, dtype=res_type, usm_type=res_usm_type)
 
     if a.ndim > 2:
-        return _batched_solve(a, b, exec_q, res_usm_type, res_type, new=new)
+        return _batched_solve(a, b, exec_q, res_usm_type, res_type)
 
     a_usm_arr = dpnp.get_usm_ndarray(a)
     b_usm_arr = dpnp.get_usm_ndarray(b)

From 8fb2af3f6b1868fe23b9dd71fe63f976538d6c8f Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 12 Jun 2024 15:19:24 +0200
Subject: [PATCH 09/37] Use py::gil_scoped_release before gesv call

---
 dpnp/backend/extensions/lapack/gesv.cpp | 47 ++++++++++++++-----------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index de5d38cdfc00..b1ed4cfa3e8c 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -212,26 +212,26 @@ std::pair<sycl::event, sycl::event>
                                              {coeff_matrix, dependent_vals}))
     {
         throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
+            "Execution queue is not compatible with allocation queues.");
     }
 
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
     if (overlap(coeff_matrix, dependent_vals)) {
         throw py::value_error(
             "The arrays of coefficients and dependent variables "
-            "are overlapping segments of memory");
+            "are overlapping segments of memory.");
     }
 
     bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
     if (!is_coeff_matrix_f_contig) {
         throw py::value_error("The coefficient matrix "
-                              "must be F-contiguous");
+                              "must be F-contiguous.");
     }
 
     bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
     if (!is_dependent_vals_f_contig) {
         throw py::value_error("The array of dependent variables "
-                              "must be F-contiguous");
+                              "must be F-contiguous.");
     }
 
     auto array_types = dpctl_td_ns::usm_ndarray_types();
@@ -242,7 +242,7 @@ std::pair<sycl::event, sycl::event>
 
     if (coeff_matrix_type_id != dependent_vals_type_id) {
         throw py::value_error("The types of the coefficient matrix and "
-                              "dependent variables are mismatched");
+                              "dependent variables are mismatched.");
     }
 
     gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id];
@@ -300,8 +300,7 @@ std::pair<sycl::event, sycl::event>
 
     // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
     // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or
-    // (n,batch_size) respectively.
-
+    // (n,batch_size) respectively
     if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
         throw py::value_error("The coefficient matrix must be square,"
                               " but got a shape of (" +
@@ -342,26 +341,26 @@ std::pair<sycl::event, sycl::event>
                                              {coeff_matrix, dependent_vals}))
     {
         throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
+            "Execution queue is not compatible with allocation queues.");
     }
 
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
     if (overlap(coeff_matrix, dependent_vals)) {
         throw py::value_error(
             "The arrays of coefficients and dependent variables "
-            "are overlapping segments of memory");
+            "are overlapping segments of memory.");
     }
 
     bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
     if (!is_coeff_matrix_f_contig) {
         throw py::value_error("The coefficient matrix "
-                              "must be F-contiguous");
+                              "must be F-contiguous.");
     }
 
     bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
     if (!is_dependent_vals_f_contig) {
         throw py::value_error("The array of dependent variables "
-                              "must be F-contiguous");
+                              "must be F-contiguous.");
     }
 
     auto array_types = dpctl_td_ns::usm_ndarray_types();
@@ -372,7 +371,7 @@ std::pair<sycl::event, sycl::event>
 
     if (coeff_matrix_type_id != dependent_vals_type_id) {
         throw py::value_error("The types of the coefficient matrix and "
-                              "dependent variables are mismatched");
+                              "dependent variables are mismatched.");
     }
 
     gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id];
@@ -402,17 +401,23 @@ std::pair<sycl::event, sycl::event>
     host_task_events.reserve(batch_size);
     gesv_task_events.reserve(batch_size);
 
-    for (std::int64_t i = 0; i < batch_size; ++i) {
-        char *coeff_matrix_batch =
-            coeff_matrix_data + i * n * n * coeff_matrix_elemsize;
-        char *dependent_vals_batch =
-            dependent_vals_data + i * n * nrhs * dependent_vals_elemsize;
+    {
+        // Release GIL to allow other Python threads to run during the loop
+        // as the operations in the loop do not require GIL
+        py::gil_scoped_release release;
+
+        for (std::int64_t i = 0; i < batch_size; ++i) {
+            char *coeff_matrix_batch =
+                coeff_matrix_data + i * n * n * coeff_matrix_elemsize;
+            char *dependent_vals_batch =
+                dependent_vals_data + i * n * nrhs * dependent_vals_elemsize;
 
-        sycl::event gesv_ev =
-            gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda,
-                    dependent_vals_batch, ldb, host_task_events, depends);
+            sycl::event gesv_ev =
+                gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda,
+                        dependent_vals_batch, ldb, host_task_events, depends);
 
-        gesv_task_events.push_back(gesv_ev);
+            gesv_task_events.push_back(gesv_ev);
+        }
     }
 
     sycl::event combine_ev = exec_q.submit(

From ddcf9fef8e1a1cbdaad8f4e0caa77013d6edcc4a Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 12 Jun 2024 15:27:30 +0200
Subject: [PATCH 10/37] Remove junk files

---
 perf.py | 22 -----------------
 test.py | 75 ---------------------------------------------------------
 2 files changed, 97 deletions(-)
 delete mode 100644 perf.py
 delete mode 100644 test.py

diff --git a/perf.py b/perf.py
deleted file mode 100644
index 20de9e824e66..000000000000
--- a/perf.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import numpy
-import dpnp
-from tests.helper import generate_random_numpy_array
-import cProfile, pstats, io
-
-a = generate_random_numpy_array((256,512,512),dtype='f4',hermitian=False, seed_value=81)
-b = generate_random_numpy_array((256,512,),dtype='f4',hermitian=False, seed_value=76)
-
-a_dp = dpnp.array(a,device='cpu')
-b_dp = dpnp.array(b,device='cpu')
-
-cold_run = dpnp.linalg.solve(a_dp, b_dp)
-pr = cProfile.Profile()
-pr.enable()
-# res = numpy.linalg.solve(a, b)
-res = dpnp.linalg.solve(a_dp, b_dp)
-pr.disable()
-s = io.StringIO()
-sortBy = pstats.SortKey.CUMULATIVE
-ps = pstats.Stats(pr, stream=s).sort_stats(sortBy)
-ps.print_stats()
-print(s.getvalue())
diff --git a/test.py b/test.py
deleted file mode 100644
index fc22fc3309c8..000000000000
--- a/test.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import numpy
-
-import dpnp
-import dpnp.linalg
-from tests.helper import generate_random_numpy_array
-
-# a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4')
-a = numpy.array([[[1,2],[3,5]],[[1,2],[3,5]],[[1,2],[3,5]]], dtype='f4')
-b = numpy.array([[1,2],[1,2],[1,2]], dtype='f4')
-# a = numpy.array([[1,2],[2,3]],dtype='f4')
-# a = numpy.array([[[2,3],[1,2]]],dtype='f4')
-# a = numpy.array([[[1,2],[2,3]]],dtype='f4')
-# a = generate_random_numpy_array(
-#     (2, 3, 3), dtype='f4', hermitian=False, seed_value=81
-# )
-
-# b = generate_random_numpy_array(
-#     (2, 3, 2), dtype='f4', hermitian=False, seed_value=76
-# )
-# a = numpy.array(
-#             [
-#                 [[1, 0, 3], [0, 5, 0], [7, 0, 9]],
-#                 [[3, 0, 3], [0, 7, 0], [7, 0, 11]],
-#             ],
-#             dtype='f4',
-#         )
-a_dp = dpnp.array(a)
-b_dp = dpnp.array(b)
-
-# a_f = numpy.array(a,order="F")
-# a_dp_f = dpnp.array(a_dp,order="F")
-
-# res = numpy.linalg.eigh(a)
-# res_dp = dpnp.linalg.eigh(a_dp)
-# res = numpy.linalg.eigh(a_f)
-# res_dp = dpnp.linalg.eigh(a_dp_f)
-res = numpy.linalg.solve(a,b)
-res_dp = dpnp.linalg.solve(a_dp,b_dp)
-# res = numpy.linalg.eigvalsh(a)
-# res_dp = dpnp.linalg.eigvalsh(a_dp)
-
-
-print("VALS: ")
-print("NUMPY: ")
-print(res)
-print("DPNP: ")
-print(res_dp)
-
-# print("BEFORE")
-# print(a_dp)
-
-# print("VALS: ")
-# print("NUMPY: ")
-# print(res[0])
-# print("DPNP: ")
-# print(res_dp[0])
-
-
-# print("VECS: ")
-# print("NUMPY: ")
-# print(res[1])
-# print("DPNP: ")
-# print(res_dp[1])
-
-# print("AFTER")
-# print(a_dp)
-
-
-# a = generate_random_numpy_array((10,3,3),dtype=numpy.float32, hermitian=True, seed_value=81)
-# a_dp = dpnp.array(a, device='gpu')
-
-# res = numpy.linalg.eigh(a)
-# res_dp = dpnp.linalg.eigh(a_dp)
-
-# print("Done")

From 262794f55aba80c0a4520bd54c50d033cf896fc0 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 13 Jun 2024 16:02:14 +0200
Subject: [PATCH 11/37] Move gesv_batch to gesv_batch.cpp

---
 dpnp/backend/extensions/lapack/CMakeLists.txt |   1 +
 dpnp/backend/extensions/lapack/gesv.cpp       | 156 --------
 dpnp/backend/extensions/lapack/gesv.hpp       |   1 +
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 357 ++++++++++++++++++
 dpnp/backend/extensions/lapack/lapack_py.cpp  |   1 +
 5 files changed, 360 insertions(+), 156 deletions(-)
 create mode 100644 dpnp/backend/extensions/lapack/gesv_batch.cpp

diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index c25ef1d97bcb..7c71a5a37eb1 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -30,6 +30,7 @@ set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/geqrf.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/geqrf_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/gesv_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gesvd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp
diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index b1ed4cfa3e8c..ea31b39d5119 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -273,162 +273,6 @@ std::pair<sycl::event, sycl::event>
     return std::make_pair(args_ev, gesv_ev);
 }
 
-std::pair<sycl::event, sycl::event>
-    gesv_batch(sycl::queue &exec_q,
-               dpctl::tensor::usm_ndarray coeff_matrix,
-               dpctl::tensor::usm_ndarray dependent_vals,
-               const std::vector<sycl::event> &depends)
-{
-    const int coeff_matrix_nd = coeff_matrix.get_ndim();
-    const int dependent_vals_nd = dependent_vals.get_ndim();
-
-    if (coeff_matrix_nd != 3) {
-        throw py::value_error("The coefficient matrix has ndim=" +
-                              std::to_string(coeff_matrix_nd) +
-                              ", but a 3-dimensional array is expected.");
-    }
-
-    if (dependent_vals_nd < 2 || dependent_vals_nd > 3) {
-        throw py::value_error(
-            "The dependent values array has ndim=" +
-            std::to_string(dependent_vals_nd) +
-            ", but a 2-dimensional or 3-dimensional array is expected.");
-    }
-
-    const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
-    const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
-
-    // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
-    // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or
-    // (n,batch_size) respectively
-    if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
-        throw py::value_error("The coefficient matrix must be square,"
-                              " but got a shape of (" +
-                              std::to_string(coeff_matrix_shape[0]) + ", " +
-                              std::to_string(coeff_matrix_shape[1]) + ").");
-    }
-
-    if (coeff_matrix_shape[0] != dependent_vals_shape[0]) {
-        throw py::value_error("The first dimension (n) of coeff_matrix and"
-                              " dependent_vals must be the same, but got " +
-                              std::to_string(coeff_matrix_shape[0]) + " and " +
-                              std::to_string(dependent_vals_shape[0]) + ".");
-    }
-
-    if (dependent_vals_nd == 2) {
-        if (coeff_matrix_shape[2] != dependent_vals_shape[1]) {
-            throw py::value_error(
-                "The batch_size of "
-                " coeff_matrix and dependent_vals must be"
-                " the same, but got " +
-                std::to_string(coeff_matrix_shape[2]) + " and " +
-                std::to_string(dependent_vals_shape[1]) + ".");
-        }
-    }
-    else if (dependent_vals_nd == 3) {
-        if (coeff_matrix_shape[2] != dependent_vals_shape[2]) {
-            throw py::value_error(
-                "The batch_size of "
-                " coeff_matrix and dependent_vals must be"
-                " the same, but got " +
-                std::to_string(coeff_matrix_shape[2]) + " and " +
-                std::to_string(dependent_vals_shape[2]) + ".");
-        }
-    }
-
-    // check compatibility of execution queue and allocation queue
-    if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {coeff_matrix, dependent_vals}))
-    {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues.");
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(coeff_matrix, dependent_vals)) {
-        throw py::value_error(
-            "The arrays of coefficients and dependent variables "
-            "are overlapping segments of memory.");
-    }
-
-    bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
-    if (!is_coeff_matrix_f_contig) {
-        throw py::value_error("The coefficient matrix "
-                              "must be F-contiguous.");
-    }
-
-    bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
-    if (!is_dependent_vals_f_contig) {
-        throw py::value_error("The array of dependent variables "
-                              "must be F-contiguous.");
-    }
-
-    auto array_types = dpctl_td_ns::usm_ndarray_types();
-    int coeff_matrix_type_id =
-        array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
-    int dependent_vals_type_id =
-        array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
-
-    if (coeff_matrix_type_id != dependent_vals_type_id) {
-        throw py::value_error("The types of the coefficient matrix and "
-                              "dependent variables are mismatched.");
-    }
-
-    gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id];
-    if (gesv_fn == nullptr) {
-        throw py::value_error(
-            "No gesv implementation defined for the provided type "
-            "of the coefficient matrix.");
-    }
-
-    char *coeff_matrix_data = coeff_matrix.get_data();
-    char *dependent_vals_data = dependent_vals.get_data();
-
-    const std::int64_t batch_size = coeff_matrix_shape[2];
-    const std::int64_t n = coeff_matrix_shape[1];
-    const std::int64_t nrhs =
-        (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1;
-
-    const std::int64_t lda = std::max<size_t>(1UL, n);
-    const std::int64_t ldb = std::max<size_t>(1UL, n);
-
-    int coeff_matrix_elemsize = coeff_matrix.get_elemsize();
-    int dependent_vals_elemsize = dependent_vals.get_elemsize();
-
-    std::vector<sycl::event> host_task_events;
-    std::vector<sycl::event> gesv_task_events;
-
-    host_task_events.reserve(batch_size);
-    gesv_task_events.reserve(batch_size);
-
-    {
-        // Release GIL to allow other Python threads to run during the loop
-        // as the operations in the loop do not require GIL
-        py::gil_scoped_release release;
-
-        for (std::int64_t i = 0; i < batch_size; ++i) {
-            char *coeff_matrix_batch =
-                coeff_matrix_data + i * n * n * coeff_matrix_elemsize;
-            char *dependent_vals_batch =
-                dependent_vals_data + i * n * nrhs * dependent_vals_elemsize;
-
-            sycl::event gesv_ev =
-                gesv_fn(exec_q, n, nrhs, coeff_matrix_batch, lda,
-                        dependent_vals_batch, ldb, host_task_events, depends);
-
-            gesv_task_events.push_back(gesv_ev);
-        }
-    }
-
-    sycl::event combine_ev = exec_q.submit(
-        [&](sycl::handler &cgh) { cgh.depends_on(gesv_task_events); });
-
-    sycl::event args_ev = dpctl::utils::keep_args_alive(
-        exec_q, {coeff_matrix, dependent_vals}, host_task_events);
-
-    return std::make_pair(args_ev, combine_ev);
-}
-
 template <typename fnT, typename T>
 struct GesvContigFactory
 {
diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp
index c9dfa1a3de50..9b7230f1b18a 100644
--- a/dpnp/backend/extensions/lapack/gesv.hpp
+++ b/dpnp/backend/extensions/lapack/gesv.hpp
@@ -51,6 +51,7 @@ extern std::pair<sycl::event, sycl::event>
                const std::vector<sycl::event> &depends);
 
 extern void init_gesv_dispatch_vector(void);
+extern void init_gesv_batch_dispatch_vector(void);
 } // namespace lapack
 } // namespace ext
 } // namespace backend
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
new file mode 100644
index 000000000000..0a4aa14b2e5e
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -0,0 +1,357 @@
+//*****************************************************************************
+// Copyright (c) 2023-2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "common_helpers.hpp"
+#include "gesv.hpp"
+#include "linalg_exceptions.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*gesv_batch_impl_fn_ptr_t)(sycl::queue &,
+                                                const std::int64_t,
+                                                const std::int64_t,
+                                                const std::int64_t,
+                                                char *,
+                                                std::int64_t,
+                                                char *,
+                                                std::int64_t,
+                                                const std::vector<sycl::event> &);
+
+static gesv_batch_impl_fn_ptr_t gesv_batch_dispatch_vector[dpctl_td_ns::num_types];
+
+template <typename T>
+static sycl::event gesv_batch_impl(sycl::queue &exec_q,
+                                   const std::int64_t n,
+                                   const std::int64_t nrhs,
+                                   const std::int64_t batch_size,
+                                   char *in_a,
+                                   std::int64_t lda,
+                                   char *in_b,
+                                   std::int64_t ldb,
+                                   const std::vector<sycl::event> &depends)
+{
+    // py::gil_scoped_release release;
+    type_utils::validate_type_for_device<T>(exec_q);
+
+    T *a = reinterpret_cast<T *>(in_a);
+    T *b = reinterpret_cast<T *>(in_b);
+
+    std::int64_t a_size = n * n;
+    std::int64_t b_size = n * nrhs;
+
+    const std::int64_t scratchpad_size =
+        mkl_lapack::gesv_scratchpad_size<T>(exec_q, n, nrhs, lda, ldb);
+    T *scratchpad = nullptr;
+
+    if (scratchpad_size > 0) {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+        if (!scratchpad)
+            throw std::runtime_error("Device allocation for scratchpad failed");
+    }
+
+    std::int64_t *ipiv = sycl::malloc_device<std::int64_t>(n, exec_q);
+    if (!ipiv)
+        throw std::runtime_error("Device allocation for ipiv failed");
+
+    std::vector<sycl::event> gesv_task_events;
+    gesv_task_events.reserve(batch_size);
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event gesv_event;
+
+    for (std::int64_t batch_id = 0; batch_id < batch_size; ++batch_id) {
+        T *a_batch = a + batch_id * a_size;
+        T *b_batch = b + batch_id * b_size;
+
+        try {
+            gesv_event = mkl_lapack::gesv(
+                exec_q,
+                n,    // The order of the square matrix A
+                    // and the number of rows in matrix B (0 ≤ n).
+                nrhs, // The number of right-hand sides,
+                    // i.e., the number of columns in matrix B (0 ≤ nrhs).
+                a_batch,    // Pointer to the square coefficient matrix A (n x n).
+                lda,  // The leading dimension of a, must be at least max(1, n).
+                ipiv, // The pivot indices that define the permutation matrix P;
+                    // row i of the matrix was interchanged with row ipiv(i),
+                    // must be at least max(1, n).
+                b_batch,    // Pointer to the right hand side matrix B (n x nrhs).
+                ldb,  // The leading dimension of matrix B,
+                    // must be at least max(1, n).
+                scratchpad, // Pointer to scratchpad memory to be used by MKL
+                            // routine for storing intermediate results.
+                scratchpad_size, depends);
+        } catch (mkl_lapack::exception const &e) {
+            is_exception_caught = true;
+            info = e.info();
+
+            if (info < 0) {
+                error_msg << "Parameter number " << -info
+                        << " had an illegal value.";
+            }
+            else if (info == scratchpad_size && e.detail() != 0) {
+                error_msg
+                    << "Insufficient scratchpad size. Required size is at least "
+                    << e.detail();
+            }
+            else if (info > 0) {
+                T host_U;
+                exec_q.memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T))
+                    .wait();
+
+                using ThresholdType = typename helper::value_type_of<T>::type;
+
+                const auto threshold =
+                    std::numeric_limits<ThresholdType>::epsilon() * 100;
+                if (std::abs(host_U) < threshold) {
+                    sycl::free(scratchpad, exec_q);
+                    throw LinAlgError("The input coefficient matrix is singular.");
+                }
+                else {
+                    error_msg << "Unexpected MKL exception caught during gesv() "
+                                "call:\nreason: "
+                            << e.what() << "\ninfo: " << e.info();
+                }
+            }
+            else {
+                error_msg << "Unexpected MKL exception caught during gesv() "
+                            "call:\nreason: "
+                        << e.what() << "\ninfo: " << e.info();
+            }
+        } catch (sycl::exception const &e) {
+            is_exception_caught = true;
+            error_msg << "Unexpected SYCL exception caught during gesv() call:\n"
+                    << e.what();
+        }
+
+        gesv_task_events.push_back(gesv_event);
+
+    }
+
+    if (is_exception_caught) // an unexpected error occurs
+        {
+            if (scratchpad != nullptr) {
+                sycl::free(scratchpad, exec_q);
+            }
+            if (ipiv != nullptr) {
+                sycl::free(ipiv, exec_q);
+            }
+            throw std::runtime_error(error_msg.str());
+        }
+
+    sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) {
+        for(const auto &el : gesv_task_events) {
+                cgh.depends_on(el);
+            }
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad, ipiv]() {
+            sycl::free(scratchpad, ctx);
+            sycl::free(ipiv, ctx);
+        });
+    });
+
+    return ht_ev;
+}
+
+
+std::pair<sycl::event, sycl::event>
+    gesv_batch(sycl::queue &exec_q,
+               dpctl::tensor::usm_ndarray coeff_matrix,
+               dpctl::tensor::usm_ndarray dependent_vals,
+               const std::vector<sycl::event> &depends)
+{
+    const int coeff_matrix_nd = coeff_matrix.get_ndim();
+    const int dependent_vals_nd = dependent_vals.get_ndim();
+
+    if (coeff_matrix_nd != 3) {
+        throw py::value_error("The coefficient matrix has ndim=" +
+                              std::to_string(coeff_matrix_nd) +
+                              ", but a 3-dimensional array is expected.");
+    }
+
+    if (dependent_vals_nd < 2 || dependent_vals_nd > 3) {
+        throw py::value_error(
+            "The dependent values array has ndim=" +
+            std::to_string(dependent_vals_nd) +
+            ", but a 2-dimensional or 3-dimensional array is expected.");
+    }
+
+    const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
+    const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
+
+    // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
+    // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or
+    // (n,batch_size) respectively
+    if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
+        throw py::value_error("The coefficient matrix must be square,"
+                              " but got a shape of (" +
+                              std::to_string(coeff_matrix_shape[0]) + ", " +
+                              std::to_string(coeff_matrix_shape[1]) + ").");
+    }
+
+    if (coeff_matrix_shape[0] != dependent_vals_shape[0]) {
+        throw py::value_error("The first dimension (n) of coeff_matrix and"
+                              " dependent_vals must be the same, but got " +
+                              std::to_string(coeff_matrix_shape[0]) + " and " +
+                              std::to_string(dependent_vals_shape[0]) + ".");
+    }
+
+    if (dependent_vals_nd == 2) {
+        if (coeff_matrix_shape[2] != dependent_vals_shape[1]) {
+            throw py::value_error(
+                "The batch_size of "
+                " coeff_matrix and dependent_vals must be"
+                " the same, but got " +
+                std::to_string(coeff_matrix_shape[2]) + " and " +
+                std::to_string(dependent_vals_shape[1]) + ".");
+        }
+    }
+    else if (dependent_vals_nd == 3) {
+        if (coeff_matrix_shape[2] != dependent_vals_shape[2]) {
+            throw py::value_error(
+                "The batch_size of "
+                " coeff_matrix and dependent_vals must be"
+                " the same, but got " +
+                std::to_string(coeff_matrix_shape[2]) + " and " +
+                std::to_string(dependent_vals_shape[2]) + ".");
+        }
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {coeff_matrix, dependent_vals}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(coeff_matrix, dependent_vals)) {
+        throw py::value_error(
+            "The arrays of coefficients and dependent variables "
+            "are overlapping segments of memory.");
+    }
+
+    bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
+    if (!is_coeff_matrix_f_contig) {
+        throw py::value_error("The coefficient matrix "
+                              "must be F-contiguous.");
+    }
+
+    bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
+    if (!is_dependent_vals_f_contig) {
+        throw py::value_error("The array of dependent variables "
+                              "must be F-contiguous.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int coeff_matrix_type_id =
+        array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
+    int dependent_vals_type_id =
+        array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
+
+    if (coeff_matrix_type_id != dependent_vals_type_id) {
+        throw py::value_error("The types of the coefficient matrix and "
+                              "dependent variables are mismatched.");
+    }
+
+    gesv_batch_impl_fn_ptr_t gesv_batch_fn = gesv_batch_dispatch_vector[coeff_matrix_type_id];
+    if (gesv_batch_fn == nullptr) {
+        throw py::value_error(
+            "No gesv implementation defined for the provided type "
+            "of the coefficient matrix.");
+    }
+
+    char *coeff_matrix_data = coeff_matrix.get_data();
+    char *dependent_vals_data = dependent_vals.get_data();
+
+    const std::int64_t batch_size = coeff_matrix_shape[2];
+    const std::int64_t n = coeff_matrix_shape[1];
+    const std::int64_t nrhs =
+        (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1;
+
+    const std::int64_t lda = std::max<size_t>(1UL, n);
+    const std::int64_t ldb = std::max<size_t>(1UL, n);
+
+    sycl::event gesv_ev;
+
+    gesv_ev = gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, lda,
+                            dependent_vals_data, ldb, depends);
+
+
+    sycl::event ht_ev =
+        dpctl::utils::keep_args_alive(exec_q, {coeff_matrix, dependent_vals}, {gesv_ev});
+
+    return std::make_pair(ht_ev, gesv_ev);
+
+}
+
+template <typename fnT, typename T>
+struct GesvBatchContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::GesvTypePairSupportFactory<T>::is_defined) {
+            return gesv_batch_impl<T>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_gesv_batch_dispatch_vector(void)
+{
+    dpctl_td_ns::DispatchVectorBuilder<gesv_batch_impl_fn_ptr_t, GesvBatchContigFactory,
+                                       dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_vector(gesv_batch_dispatch_vector);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp
index 863d1ffaea47..aba27224ca82 100644
--- a/dpnp/backend/extensions/lapack/lapack_py.cpp
+++ b/dpnp/backend/extensions/lapack/lapack_py.cpp
@@ -51,6 +51,7 @@ void init_dispatch_vectors(void)
 {
     lapack_ext::init_geqrf_batch_dispatch_vector();
     lapack_ext::init_geqrf_dispatch_vector();
+    lapack_ext::init_gesv_batch_dispatch_vector();
     lapack_ext::init_gesv_dispatch_vector();
     lapack_ext::init_getrf_batch_dispatch_vector();
     lapack_ext::init_getrf_dispatch_vector();

From 3a7b8ca6eef121aaa9ba030d064700a9aaab8327 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 13 Jun 2024 18:01:41 +0200
Subject: [PATCH 12/37] Improve gesv_batch with independent linear streams

---
 .../extensions/lapack/common_helpers.hpp      |   8 +
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 175 +++++++++++-------
 2 files changed, 114 insertions(+), 69 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp
index 3e840ddf7f39..036d0b9c7328 100644
--- a/dpnp/backend/extensions/lapack/common_helpers.hpp
+++ b/dpnp/backend/extensions/lapack/common_helpers.hpp
@@ -48,6 +48,14 @@ struct value_type_of<std::complex<T>>
 {
     using type = T;
 };
+
+// Rounds up the number `value` to the nearest multiple of `mult`.
+template <typename intT>
+intT round_up_mult(intT value, intT mult)
+{
+    intT q = (value + (mult - 1)) / mult;
+    return q * mult;
+}
 } // namespace helper
 } // namespace lapack
 } // namespace ext
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 0a4aa14b2e5e..f8341f353486 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -48,17 +48,19 @@ namespace mkl_lapack = oneapi::mkl::lapack;
 namespace py = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
 
-typedef sycl::event (*gesv_batch_impl_fn_ptr_t)(sycl::queue &,
-                                                const std::int64_t,
-                                                const std::int64_t,
-                                                const std::int64_t,
-                                                char *,
-                                                std::int64_t,
-                                                char *,
-                                                std::int64_t,
-                                                const std::vector<sycl::event> &);
-
-static gesv_batch_impl_fn_ptr_t gesv_batch_dispatch_vector[dpctl_td_ns::num_types];
+typedef sycl::event (*gesv_batch_impl_fn_ptr_t)(
+    sycl::queue &,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    char *,
+    std::int64_t,
+    char *,
+    std::int64_t,
+    const std::vector<sycl::event> &);
+
+static gesv_batch_impl_fn_ptr_t
+    gesv_batch_dispatch_vector[dpctl_td_ns::num_types];
 
 template <typename T>
 static sycl::event gesv_batch_impl(sycl::queue &exec_q,
@@ -71,7 +73,6 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
                                    std::int64_t ldb,
                                    const std::vector<sycl::event> &depends)
 {
-    // py::gil_scoped_release release;
     type_utils::validate_type_for_device<T>(exec_q);
 
     T *a = reinterpret_cast<T *>(in_a);
@@ -80,67 +81,101 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
     std::int64_t a_size = n * n;
     std::int64_t b_size = n * nrhs;
 
+    // Get the number of independent linear streams
+    std::int64_t n_linear_streams =
+        (batch_size > 16) ? 4 : ((batch_size > 4 ? 2 : 1));
+
     const std::int64_t scratchpad_size =
         mkl_lapack::gesv_scratchpad_size<T>(exec_q, n, nrhs, lda, ldb);
     T *scratchpad = nullptr;
 
+    // Get padding size to ensure memory allocations are aligned to 256 bytes
+    // for better performance
+    std::int64_t padding = 256 / sizeof(T);
+
     if (scratchpad_size > 0) {
-        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+        // Calculate the total scratchpad memory size needed for all linear
+        // streams with proper alignment
+        size_t alloc_scratch_size =
+            helper::round_up_mult(n_linear_streams * scratchpad_size, padding);
+
+        // Allocate memory for the total scratchpad
+        scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
         if (!scratchpad)
             throw std::runtime_error("Device allocation for scratchpad failed");
     }
 
-    std::int64_t *ipiv = sycl::malloc_device<std::int64_t>(n, exec_q);
+    // Calculate the total size needed for the pivot indices array for all
+    // linear streams with proper alignment
+    size_t alloc_ipiv_size =
+        helper::round_up_mult(n_linear_streams * n, padding);
+
+    // Allocate memory for the total pivot indices array
+    std::int64_t *ipiv =
+        sycl::malloc_device<std::int64_t>(alloc_ipiv_size, exec_q);
     if (!ipiv)
         throw std::runtime_error("Device allocation for ipiv failed");
 
-    std::vector<sycl::event> gesv_task_events;
-    gesv_task_events.reserve(batch_size);
+    // Computation events to manage dependencies for each linear stream
+    std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends);
 
     std::stringstream error_msg;
     std::int64_t info = 0;
     bool is_exception_caught = false;
 
-    sycl::event gesv_event;
-
     for (std::int64_t batch_id = 0; batch_id < batch_size; ++batch_id) {
         T *a_batch = a + batch_id * a_size;
         T *b_batch = b + batch_id * b_size;
 
+        std::int64_t stream_id = (batch_id % n_linear_streams);
+
+        T *current_scratch_gesv = nullptr;
+        if (scratchpad != nullptr) {
+            current_scratch_gesv = scratchpad + stream_id * scratchpad_size;
+        }
+        std::int64_t *current_ipiv = ipiv + stream_id * n;
+
+        // Get the event dependencies for the current stream
+        const auto &current_dep = comp_evs[stream_id];
+
+        sycl::event gesv_event;
+
         try {
             gesv_event = mkl_lapack::gesv(
                 exec_q,
                 n,    // The order of the square matrix A
-                    // and the number of rows in matrix B (0 ≤ n).
+                      // and the number of rows in matrix B (0 ≤ n).
                 nrhs, // The number of right-hand sides,
-                    // i.e., the number of columns in matrix B (0 ≤ nrhs).
-                a_batch,    // Pointer to the square coefficient matrix A (n x n).
-                lda,  // The leading dimension of a, must be at least max(1, n).
-                ipiv, // The pivot indices that define the permutation matrix P;
-                    // row i of the matrix was interchanged with row ipiv(i),
-                    // must be at least max(1, n).
-                b_batch,    // Pointer to the right hand side matrix B (n x nrhs).
-                ldb,  // The leading dimension of matrix B,
-                    // must be at least max(1, n).
-                scratchpad, // Pointer to scratchpad memory to be used by MKL
-                            // routine for storing intermediate results.
-                scratchpad_size, depends);
+                      // i.e., the number of columns in matrix B (0 ≤ nrhs).
+                a_batch, // Pointer to the square coefficient matrix A (n x n).
+                lda, // The leading dimension of a, must be at least max(1, n).
+                current_ipiv, // The pivot indices that define the permutation
+                              // matrix P; row i of the matrix was interchanged
+                              // with row ipiv(i), must be at least max(1, n).
+                b_batch, // Pointer to the right hand side matrix B (n x nrhs).
+                ldb,     // The leading dimension of matrix B,
+                         // must be at least max(1, n).
+                current_scratch_gesv, // Pointer to scratchpad memory to be used
+                                      // by MKL routine for storing intermediate
+                                      // results.
+                scratchpad_size, current_dep);
         } catch (mkl_lapack::exception const &e) {
             is_exception_caught = true;
             info = e.info();
 
             if (info < 0) {
                 error_msg << "Parameter number " << -info
-                        << " had an illegal value.";
+                          << " had an illegal value.";
             }
             else if (info == scratchpad_size && e.detail() != 0) {
-                error_msg
-                    << "Insufficient scratchpad size. Required size is at least "
-                    << e.detail();
+                error_msg << "Insufficient scratchpad size. Required size is "
+                             "at least "
+                          << e.detail();
             }
             else if (info > 0) {
                 T host_U;
-                exec_q.memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T))
+                exec_q
+                    .memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T))
                     .wait();
 
                 using ThresholdType = typename helper::value_type_of<T>::type;
@@ -149,44 +184,48 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
                     std::numeric_limits<ThresholdType>::epsilon() * 100;
                 if (std::abs(host_U) < threshold) {
                     sycl::free(scratchpad, exec_q);
-                    throw LinAlgError("The input coefficient matrix is singular.");
+                    sycl::free(ipiv, exec_q);
+                    throw LinAlgError(
+                        "The input coefficient matrix is singular.");
                 }
                 else {
-                    error_msg << "Unexpected MKL exception caught during gesv() "
-                                "call:\nreason: "
-                            << e.what() << "\ninfo: " << e.info();
+                    error_msg
+                        << "Unexpected MKL exception caught during gesv() "
+                           "call:\nreason: "
+                        << e.what() << "\ninfo: " << e.info();
                 }
             }
             else {
                 error_msg << "Unexpected MKL exception caught during gesv() "
-                            "call:\nreason: "
-                        << e.what() << "\ninfo: " << e.info();
+                             "call:\nreason: "
+                          << e.what() << "\ninfo: " << e.info();
             }
         } catch (sycl::exception const &e) {
             is_exception_caught = true;
-            error_msg << "Unexpected SYCL exception caught during gesv() call:\n"
-                    << e.what();
+            error_msg
+                << "Unexpected SYCL exception caught during gesv() call:\n"
+                << e.what();
         }
 
-        gesv_task_events.push_back(gesv_event);
-
+        // Update the event dependencies for the current stream
+        comp_evs[stream_id] = {gesv_event};
     }
 
     if (is_exception_caught) // an unexpected error occurs
-        {
-            if (scratchpad != nullptr) {
-                sycl::free(scratchpad, exec_q);
-            }
-            if (ipiv != nullptr) {
-                sycl::free(ipiv, exec_q);
-            }
-            throw std::runtime_error(error_msg.str());
+    {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+        if (ipiv != nullptr) {
+            sycl::free(ipiv, exec_q);
         }
+        throw std::runtime_error(error_msg.str());
+    }
 
     sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) {
-        for(const auto &el : gesv_task_events) {
-                cgh.depends_on(el);
-            }
+        for (const auto &ev : comp_evs) {
+            cgh.depends_on(ev);
+        }
         auto ctx = exec_q.get_context();
         cgh.host_task([ctx, scratchpad, ipiv]() {
             sycl::free(scratchpad, ctx);
@@ -197,7 +236,6 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
     return ht_ev;
 }
 
-
 std::pair<sycl::event, sycl::event>
     gesv_batch(sycl::queue &exec_q,
                dpctl::tensor::usm_ndarray coeff_matrix,
@@ -261,7 +299,7 @@ std::pair<sycl::event, sycl::event>
         }
     }
 
-    // check compatibility of execution queue and allocation queue
+    // Check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q,
                                              {coeff_matrix, dependent_vals}))
     {
@@ -299,7 +337,8 @@ std::pair<sycl::event, sycl::event>
                               "dependent variables are mismatched.");
     }
 
-    gesv_batch_impl_fn_ptr_t gesv_batch_fn = gesv_batch_dispatch_vector[coeff_matrix_type_id];
+    gesv_batch_impl_fn_ptr_t gesv_batch_fn =
+        gesv_batch_dispatch_vector[coeff_matrix_type_id];
     if (gesv_batch_fn == nullptr) {
         throw py::value_error(
             "No gesv implementation defined for the provided type "
@@ -317,17 +356,14 @@ std::pair<sycl::event, sycl::event>
     const std::int64_t lda = std::max<size_t>(1UL, n);
     const std::int64_t ldb = std::max<size_t>(1UL, n);
 
-    sycl::event gesv_ev;
+    sycl::event gesv_ev =
+        gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, lda,
+                      dependent_vals_data, ldb, depends);
 
-    gesv_ev = gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, lda,
-                            dependent_vals_data, ldb, depends);
-
-
-    sycl::event ht_ev =
-        dpctl::utils::keep_args_alive(exec_q, {coeff_matrix, dependent_vals}, {gesv_ev});
+    sycl::event ht_ev = dpctl::utils::keep_args_alive(
+        exec_q, {coeff_matrix, dependent_vals}, {gesv_ev});
 
     return std::make_pair(ht_ev, gesv_ev);
-
 }
 
 template <typename fnT, typename T>
@@ -346,7 +382,8 @@ struct GesvBatchContigFactory
 
 void init_gesv_batch_dispatch_vector(void)
 {
-    dpctl_td_ns::DispatchVectorBuilder<gesv_batch_impl_fn_ptr_t, GesvBatchContigFactory,
+    dpctl_td_ns::DispatchVectorBuilder<gesv_batch_impl_fn_ptr_t,
+                                       GesvBatchContigFactory,
                                        dpctl_td_ns::num_types>
         contig;
     contig.populate_dispatch_vector(gesv_batch_dispatch_vector);

From 2016a8cdad9f9991a8cb7ad1977101b0e02648b9 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 13 Jun 2024 21:53:57 +0200
Subject: [PATCH 13/37] Extend checks for gesv/gesv_batch

---
 dpnp/backend/extensions/lapack/gesv.cpp       | 15 +++++++++++++++
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 15 +++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index ea31b39d5119..5094da17282e 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -27,6 +27,7 @@
 
 // dpctl tensor headers
 #include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
 #include "utils/type_utils.hpp"
 
 #include "common_helpers.hpp"
@@ -207,6 +208,17 @@ std::pair<sycl::event, sycl::event>
                               std::to_string(coeff_matrix_shape[1]) + ").");
     }
 
+    size_t src_nelems(1);
+
+    for (int i = 0; i < dependent_vals_nd; ++i) {
+        src_nelems *= static_cast<size_t>(dependent_vals_shape[i]);
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     // check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q,
                                              {coeff_matrix, dependent_vals}))
@@ -222,6 +234,9 @@ std::pair<sycl::event, sycl::event>
             "are overlapping segments of memory.");
     }
 
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(
+        dependent_vals);
+
     bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
     if (!is_coeff_matrix_f_contig) {
         throw py::value_error("The coefficient matrix "
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index f8341f353486..262eefbad712 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -27,6 +27,7 @@
 
 // dpctl tensor headers
 #include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
 #include "utils/type_utils.hpp"
 
 #include "common_helpers.hpp"
@@ -299,6 +300,17 @@ std::pair<sycl::event, sycl::event>
         }
     }
 
+    size_t src_nelems(1);
+
+    for (int i = 0; i < dependent_vals_nd; ++i) {
+        src_nelems *= static_cast<size_t>(dependent_vals_shape[i]);
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     // Check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q,
                                              {coeff_matrix, dependent_vals}))
@@ -314,6 +326,9 @@ std::pair<sycl::event, sycl::event>
             "are overlapping segments of memory.");
     }
 
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(
+        dependent_vals);
+
     bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
     if (!is_coeff_matrix_f_contig) {
         throw py::value_error("The coefficient matrix "

From 2c422905f6607440ae8d98449fb655fdae54ea5a Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 13 Jun 2024 22:00:40 +0200
Subject: [PATCH 14/37] Update comment

---
 dpnp/linalg/dpnp_utils_linalg.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 4e176ce884f3..c8b50b3dc421 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -333,6 +333,7 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
     v = dpnp.moveaxis(b_f, -1, 0).reshape(b_shape)
 
     # dpnp.moveaxis can make the array non-contiguous if it is not 2D
+    # Convert to contiguous to align with NumPy
     if b.ndim > 2:
         v = dpnp.ascontiguousarray(v)
 

From e030da8d303cfb8c8c396a6f59581fa4bf64f7b5 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 14 Jun 2024 12:03:55 +0200
Subject: [PATCH 15/37] junk files

---
 num.py  | 51 +++++++++++++++++++++++++++++++++++++++
 qwe.py  | 11 +++++++++
 test.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+)
 create mode 100644 num.py
 create mode 100644 qwe.py
 create mode 100644 test.py

diff --git a/num.py b/num.py
new file mode 100644
index 000000000000..d5467b1cbf55
--- /dev/null
+++ b/num.py
@@ -0,0 +1,51 @@
+import numpy as np
+import ctypes
+
+# Размеры и параметры для примера
+batch_size = 2
+n = 3
+nrhs = 2
+coeff_matrix_elemsize = ctypes.sizeof(ctypes.c_double)
+dependent_vals_elemsize = ctypes.sizeof(ctypes.c_double)
+
+# Создаем примерные массивы NumPy
+coeff_matrix = np.arange(batch_size * n * n, dtype=np.double).reshape(batch_size, n, n)
+dependent_vals = np.arange(batch_size * n * nrhs, dtype=np.double).reshape(batch_size, n, nrhs)
+
+# my logic
+coeff_matrix = coeff_matrix.transpose((0, 2, 1))
+dependent_vals = dependent_vals.transpose((0, 2, 1))
+
+coeff_matrix = np.array(coeff_matrix, order="C")
+dependent_vals = np.array(dependent_vals, order="C")
+
+# # Sasha`s logic
+# coeff_matrix = np.moveaxis(coeff_matrix, (-2, -1), (0, 1))
+# dependent_vals = np.moveaxis(dependent_vals, (-2, -1), (0, 1))
+
+# coeff_matrix = np.array(coeff_matrix, order="F")
+# dependent_vals = np.array(dependent_vals, order="F")
+
+# get pointer
+coeff_matrix_data = coeff_matrix.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
+dependent_vals_data = dependent_vals.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
+
+# Функция для получения значений по указателю с учетом смещения и размера элемента
+def get_values_from_pointer(ptr, step, count, elemsize):
+    result = []
+    for i in range(count):
+        result.append(ptr[i * step // elemsize])
+    return result
+
+
+for i in range(batch_size):
+    coeff_matrix_batch = ctypes.cast(ctypes.addressof(coeff_matrix_data.contents) + i * n * n * coeff_matrix_elemsize, ctypes.POINTER(ctypes.c_double))
+    dependent_vals_batch = ctypes.cast(ctypes.addressof(dependent_vals_data.contents) + i * n * nrhs * dependent_vals_elemsize, ctypes.POINTER(ctypes.c_double))
+
+    # get value by new pointer
+    coeff_values = get_values_from_pointer(coeff_matrix_batch, coeff_matrix_elemsize, n * n, coeff_matrix_elemsize)
+    dependent_values = get_values_from_pointer(dependent_vals_batch, dependent_vals_elemsize, n * nrhs, dependent_vals_elemsize)
+
+    print(f"Batch {i + 1}:")
+    print("Coefficients:", coeff_values)
+    print("Dependent values:", dependent_values)
diff --git a/qwe.py b/qwe.py
new file mode 100644
index 000000000000..e373bab0d566
--- /dev/null
+++ b/qwe.py
@@ -0,0 +1,11 @@
+import dpnp
+import numpy
+from tests.helper import generate_random_numpy_array
+
+a = dpnp.array(generate_random_numpy_array((2,3,3,3,3),dtype='f4',hermitian=False, seed_value=81))
+b = dpnp.array(generate_random_numpy_array((2,3,3,3),dtype='f4',hermitian=False, seed_value=76))
+
+res = dpnp.linalg.solve(a,b,new=True)
+
+print(res.shape)
+print(res.flags)
diff --git a/test.py b/test.py
new file mode 100644
index 000000000000..37218883cd43
--- /dev/null
+++ b/test.py
@@ -0,0 +1,75 @@
+import numpy
+
+import dpnp
+import dpnp.linalg
+from tests.helper import generate_random_numpy_array
+
+# a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4')
+# a = numpy.array([[[1,2],[3,5]],[[1,2],[3,5]],[[1,2],[3,5]]], dtype='f4')
+# b = numpy.array([[1,2],[1,2],[1,2]], dtype='f4')
+# a = numpy.array([[1,2],[2,3]],dtype='f4')
+# a = numpy.array([[[2,3],[1,2]]],dtype='f4')
+# a = numpy.array([[[1,2],[2,3]]],dtype='f4')
+a = generate_random_numpy_array(
+    (2, 3, 3), dtype='c8', hermitian=False, seed_value=81
+)
+
+b = generate_random_numpy_array(
+    (2, 3, 3), dtype='c8', hermitian=False, seed_value=76
+)
+# a = numpy.array(
+#             [
+#                 [[1, 0, 3], [0, 5, 0], [7, 0, 9]],
+#                 [[3, 0, 3], [0, 7, 0], [7, 0, 11]],
+#             ],
+#             dtype='f4',
+#         )
+a_dp = dpnp.array(a)
+b_dp = dpnp.array(b)
+
+# a_f = numpy.array(a,order="F")
+# a_dp_f = dpnp.array(a_dp,order="F")
+
+# res = numpy.linalg.eigh(a)
+# res_dp = dpnp.linalg.eigh(a_dp)
+# res = numpy.linalg.eigh(a_f)
+# res_dp = dpnp.linalg.eigh(a_dp_f)
+res = numpy.linalg.solve(a,b)
+res_dp = dpnp.linalg.solve(a_dp,b_dp)
+# res = numpy.linalg.eigvalsh(a)
+# res_dp = dpnp.linalg.eigvalsh(a_dp)
+
+
+print("VALS: ")
+print("NUMPY: ")
+print(res)
+print("DPNP: ")
+print(res_dp)
+
+# print("BEFORE")
+# print(a_dp)
+
+# print("VALS: ")
+# print("NUMPY: ")
+# print(res[0])
+# print("DPNP: ")
+# print(res_dp[0])
+
+
+# print("VECS: ")
+# print("NUMPY: ")
+# print(res[1])
+# print("DPNP: ")
+# print(res_dp[1])
+
+# print("AFTER")
+# print(a_dp)
+
+
+# a = generate_random_numpy_array((10,3,3),dtype=numpy.float32, hermitian=True, seed_value=81)
+# a_dp = dpnp.array(a, device='gpu')
+
+# res = numpy.linalg.eigh(a)
+# res_dp = dpnp.linalg.eigh(a_dp)
+
+# print("Done")

From 5a48f333d7eda4e6dfc0922f2d98a187d7405cc1 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 12 Jul 2024 14:34:02 +0200
Subject: [PATCH 16/37] Add common_gesv_checks

---
 dpnp/backend/extensions/lapack/gesv.cpp       | 155 +++++++++++-------
 dpnp/backend/extensions/lapack/gesv.hpp       |   9 +
 dpnp/backend/extensions/lapack/gesv_batch.cpp |  72 +-------
 3 files changed, 113 insertions(+), 123 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 5094da17282e..1462ee51659a 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -49,6 +49,97 @@ namespace mkl_lapack = oneapi::mkl::lapack;
 namespace py = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
 
+void common_gesv_checks(sycl::queue &exec_q,
+                        dpctl::tensor::usm_ndarray coeff_matrix,
+                        dpctl::tensor::usm_ndarray dependent_vals,
+                        const py::ssize_t *coeff_matrix_shape,
+                        const py::ssize_t *dependent_vals_shape,
+                        const int expected_coeff_matrix_ndim,
+                        const int min_dependent_vals_ndim,
+                        const int max_dependent_vals_ndim)
+{
+    const int coeff_matrix_nd = coeff_matrix.get_ndim();
+    const int dependent_vals_nd = dependent_vals.get_ndim();
+
+    if (coeff_matrix_nd != expected_coeff_matrix_ndim) {
+        throw py::value_error("The coefficient matrix has ndim=" +
+                              std::to_string(coeff_matrix_nd) + ", but a " +
+                              std::to_string(expected_coeff_matrix_ndim) +
+                              "-dimensional array is expected.");
+    }
+
+    if (dependent_vals_nd < min_dependent_vals_ndim ||
+        dependent_vals_nd > max_dependent_vals_ndim)
+    {
+        throw py::value_error("The dependent values array has ndim=" +
+                              std::to_string(dependent_vals_nd) + ", but a " +
+                              std::to_string(min_dependent_vals_ndim) +
+                              "-dimensional or a " +
+                              std::to_string(max_dependent_vals_ndim) +
+                              "-dimensional array is expected.");
+    }
+
+    // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
+    // for gesv
+    // with the shapes (n,n) and (n,nrhs) or (n,) respectively;
+    // for gesv_batch
+    // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or
+    // (n,batch_size) respectively
+    if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
+        throw py::value_error("The coefficient matrix must be square,"
+                              " but got a shape of (" +
+                              std::to_string(coeff_matrix_shape[0]) + ", " +
+                              std::to_string(coeff_matrix_shape[1]) + ").");
+    }
+    if (coeff_matrix_shape[0] != dependent_vals_shape[0]) {
+        throw py::value_error("The first dimension (n) of coeff_matrix and"
+                              " dependent_vals must be the same, but got " +
+                              std::to_string(coeff_matrix_shape[0]) + " and " +
+                              std::to_string(dependent_vals_shape[0]) + ".");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {coeff_matrix, dependent_vals}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(coeff_matrix, dependent_vals)) {
+        throw py::value_error(
+            "The arrays of coefficients and dependent variables "
+            "are overlapping segments of memory.");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(
+        dependent_vals);
+
+    bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
+    if (!is_coeff_matrix_f_contig) {
+        throw py::value_error("The coefficient matrix "
+                              "must be F-contiguous.");
+    }
+
+    bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
+    if (!is_dependent_vals_f_contig) {
+        throw py::value_error("The array of dependent variables "
+                              "must be F-contiguous.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int coeff_matrix_type_id =
+        array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
+    int dependent_vals_type_id =
+        array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
+
+    if (coeff_matrix_type_id != dependent_vals_type_id) {
+        throw py::value_error("The types of the coefficient matrix and "
+                              "dependent variables are mismatched.");
+    }
+}
+
 typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &,
                                           const std::int64_t,
                                           const std::int64_t,
@@ -182,31 +273,18 @@ std::pair<sycl::event, sycl::event>
          dpctl::tensor::usm_ndarray dependent_vals,
          const std::vector<sycl::event> &depends)
 {
-    const int coeff_matrix_nd = coeff_matrix.get_ndim();
     const int dependent_vals_nd = dependent_vals.get_ndim();
 
-    if (coeff_matrix_nd != 2) {
-        throw py::value_error("The coefficient matrix has ndim=" +
-                              std::to_string(coeff_matrix_nd) +
-                              ", but a 2-dimensional array is expected.");
-    }
-
-    if (dependent_vals_nd > 2) {
-        throw py::value_error(
-            "The dependent values array has ndim=" +
-            std::to_string(dependent_vals_nd) +
-            ", but a 1-dimensional or a 2-dimensional array is expected.");
-    }
-
     const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
     const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
 
-    if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
-        throw py::value_error("The coefficient matrix must be square,"
-                              " but got a shape of (" +
-                              std::to_string(coeff_matrix_shape[0]) + ", " +
-                              std::to_string(coeff_matrix_shape[1]) + ").");
-    }
+    const int expected_coeff_matrix_ndim = 2;
+    const int min_dependent_vals_ndim = 1;
+    const int max_dependent_vals_ndim = 2;
+
+    common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape,
+                       dependent_vals_shape, expected_coeff_matrix_ndim,
+                       min_dependent_vals_ndim, max_dependent_vals_ndim);
 
     size_t src_nelems(1);
 
@@ -219,46 +297,9 @@ std::pair<sycl::event, sycl::event>
         return std::make_pair(sycl::event(), sycl::event());
     }
 
-    // check compatibility of execution queue and allocation queue
-    if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {coeff_matrix, dependent_vals}))
-    {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues.");
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(coeff_matrix, dependent_vals)) {
-        throw py::value_error(
-            "The arrays of coefficients and dependent variables "
-            "are overlapping segments of memory.");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(
-        dependent_vals);
-
-    bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
-    if (!is_coeff_matrix_f_contig) {
-        throw py::value_error("The coefficient matrix "
-                              "must be F-contiguous.");
-    }
-
-    bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
-    if (!is_dependent_vals_f_contig) {
-        throw py::value_error("The array of dependent variables "
-                              "must be F-contiguous.");
-    }
-
     auto array_types = dpctl_td_ns::usm_ndarray_types();
     int coeff_matrix_type_id =
         array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
-    int dependent_vals_type_id =
-        array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
-
-    if (coeff_matrix_type_id != dependent_vals_type_id) {
-        throw py::value_error("The types of the coefficient matrix and "
-                              "dependent variables are mismatched.");
-    }
 
     gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id];
     if (gesv_fn == nullptr) {
diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp
index ab90022af1ee..e4ceebfc1db6 100644
--- a/dpnp/backend/extensions/lapack/gesv.hpp
+++ b/dpnp/backend/extensions/lapack/gesv.hpp
@@ -50,6 +50,15 @@ extern std::pair<sycl::event, sycl::event>
                dpctl::tensor::usm_ndarray dependent_vals,
                const std::vector<sycl::event> &depends);
 
+extern void common_gesv_checks(sycl::queue &exec_q,
+                               dpctl::tensor::usm_ndarray coeff_matrix,
+                               dpctl::tensor::usm_ndarray dependent_vals,
+                               const py::ssize_t *coeff_matrix_shape,
+                               const py::ssize_t *dependent_vals_shape,
+                               const int expected_coeff_matrix_ndim,
+                               const int min_dependent_vals_ndim,
+                               const int max_dependent_vals_ndim);
+
 extern void init_gesv_dispatch_vector(void);
 extern void init_gesv_batch_dispatch_vector(void);
 } // namespace lapack
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 262eefbad712..11d3442e891d 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -243,41 +243,18 @@ std::pair<sycl::event, sycl::event>
                dpctl::tensor::usm_ndarray dependent_vals,
                const std::vector<sycl::event> &depends)
 {
-    const int coeff_matrix_nd = coeff_matrix.get_ndim();
     const int dependent_vals_nd = dependent_vals.get_ndim();
 
-    if (coeff_matrix_nd != 3) {
-        throw py::value_error("The coefficient matrix has ndim=" +
-                              std::to_string(coeff_matrix_nd) +
-                              ", but a 3-dimensional array is expected.");
-    }
-
-    if (dependent_vals_nd < 2 || dependent_vals_nd > 3) {
-        throw py::value_error(
-            "The dependent values array has ndim=" +
-            std::to_string(dependent_vals_nd) +
-            ", but a 2-dimensional or 3-dimensional array is expected.");
-    }
-
     const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
     const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
 
-    // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
-    // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or
-    // (n,batch_size) respectively
-    if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
-        throw py::value_error("The coefficient matrix must be square,"
-                              " but got a shape of (" +
-                              std::to_string(coeff_matrix_shape[0]) + ", " +
-                              std::to_string(coeff_matrix_shape[1]) + ").");
-    }
+    const int expected_coeff_matrix_ndim = 3;
+    const int min_dependent_vals_ndim = 2;
+    const int max_dependent_vals_ndim = 3;
 
-    if (coeff_matrix_shape[0] != dependent_vals_shape[0]) {
-        throw py::value_error("The first dimension (n) of coeff_matrix and"
-                              " dependent_vals must be the same, but got " +
-                              std::to_string(coeff_matrix_shape[0]) + " and " +
-                              std::to_string(dependent_vals_shape[0]) + ".");
-    }
+    common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape,
+                       dependent_vals_shape, expected_coeff_matrix_ndim,
+                       min_dependent_vals_ndim, max_dependent_vals_ndim);
 
     if (dependent_vals_nd == 2) {
         if (coeff_matrix_shape[2] != dependent_vals_shape[1]) {
@@ -311,46 +288,9 @@ std::pair<sycl::event, sycl::event>
         return std::make_pair(sycl::event(), sycl::event());
     }
 
-    // Check compatibility of execution queue and allocation queue
-    if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {coeff_matrix, dependent_vals}))
-    {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues.");
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(coeff_matrix, dependent_vals)) {
-        throw py::value_error(
-            "The arrays of coefficients and dependent variables "
-            "are overlapping segments of memory.");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(
-        dependent_vals);
-
-    bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
-    if (!is_coeff_matrix_f_contig) {
-        throw py::value_error("The coefficient matrix "
-                              "must be F-contiguous.");
-    }
-
-    bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
-    if (!is_dependent_vals_f_contig) {
-        throw py::value_error("The array of dependent variables "
-                              "must be F-contiguous.");
-    }
-
     auto array_types = dpctl_td_ns::usm_ndarray_types();
     int coeff_matrix_type_id =
         array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
-    int dependent_vals_type_id =
-        array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
-
-    if (coeff_matrix_type_id != dependent_vals_type_id) {
-        throw py::value_error("The types of the coefficient matrix and "
-                              "dependent variables are mismatched.");
-    }
 
     gesv_batch_impl_fn_ptr_t gesv_batch_fn =
         gesv_batch_dispatch_vector[coeff_matrix_type_id];

From 924fee73d917fb9a0705c1498da2c9ef1f9b9a62 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 12 Jul 2024 14:53:49 +0200
Subject: [PATCH 17/37] Release GIL in gesv_batch_impl

---
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 11d3442e891d..abdc9ffe1a7f 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -124,6 +124,10 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
     std::int64_t info = 0;
     bool is_exception_caught = false;
 
+    // Release GIL to avoid serialization of host task
+    // submissions to the same queue in OneMKL
+    py::gil_scoped_release release;
+
     for (std::int64_t batch_id = 0; batch_id < batch_size; ++batch_id) {
         T *a_batch = a + batch_id * a_size;
         T *b_batch = b + batch_id * b_size;

From 2b15e6c7fb763d93a71e95cce857594921167926 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 12 Jul 2024 14:54:18 +0200
Subject: [PATCH 18/37] Remove junk file

---
 test.py | 75 ---------------------------------------------------------
 1 file changed, 75 deletions(-)
 delete mode 100644 test.py

diff --git a/test.py b/test.py
deleted file mode 100644
index 37218883cd43..000000000000
--- a/test.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import numpy
-
-import dpnp
-import dpnp.linalg
-from tests.helper import generate_random_numpy_array
-
-# a = numpy.array([[[2,3],[1,2]],[[1,2],[2,3]]],dtype='f4')
-# a = numpy.array([[[1,2],[3,5]],[[1,2],[3,5]],[[1,2],[3,5]]], dtype='f4')
-# b = numpy.array([[1,2],[1,2],[1,2]], dtype='f4')
-# a = numpy.array([[1,2],[2,3]],dtype='f4')
-# a = numpy.array([[[2,3],[1,2]]],dtype='f4')
-# a = numpy.array([[[1,2],[2,3]]],dtype='f4')
-a = generate_random_numpy_array(
-    (2, 3, 3), dtype='c8', hermitian=False, seed_value=81
-)
-
-b = generate_random_numpy_array(
-    (2, 3, 3), dtype='c8', hermitian=False, seed_value=76
-)
-# a = numpy.array(
-#             [
-#                 [[1, 0, 3], [0, 5, 0], [7, 0, 9]],
-#                 [[3, 0, 3], [0, 7, 0], [7, 0, 11]],
-#             ],
-#             dtype='f4',
-#         )
-a_dp = dpnp.array(a)
-b_dp = dpnp.array(b)
-
-# a_f = numpy.array(a,order="F")
-# a_dp_f = dpnp.array(a_dp,order="F")
-
-# res = numpy.linalg.eigh(a)
-# res_dp = dpnp.linalg.eigh(a_dp)
-# res = numpy.linalg.eigh(a_f)
-# res_dp = dpnp.linalg.eigh(a_dp_f)
-res = numpy.linalg.solve(a,b)
-res_dp = dpnp.linalg.solve(a_dp,b_dp)
-# res = numpy.linalg.eigvalsh(a)
-# res_dp = dpnp.linalg.eigvalsh(a_dp)
-
-
-print("VALS: ")
-print("NUMPY: ")
-print(res)
-print("DPNP: ")
-print(res_dp)
-
-# print("BEFORE")
-# print(a_dp)
-
-# print("VALS: ")
-# print("NUMPY: ")
-# print(res[0])
-# print("DPNP: ")
-# print(res_dp[0])
-
-
-# print("VECS: ")
-# print("NUMPY: ")
-# print(res[1])
-# print("DPNP: ")
-# print(res_dp[1])
-
-# print("AFTER")
-# print(a_dp)
-
-
-# a = generate_random_numpy_array((10,3,3),dtype=numpy.float32, hermitian=True, seed_value=81)
-# a_dp = dpnp.array(a, device='gpu')
-
-# res = numpy.linalg.eigh(a)
-# res_dp = dpnp.linalg.eigh(a_dp)
-
-# print("Done")

From afca80398e4444fd7fb437a9e6d7c4bc1874d8f2 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 16 Jul 2024 12:05:08 +0200
Subject: [PATCH 19/37] Remove junk files

---
 num.py | 51 ---------------------------------------------------
 qwe.py | 11 -----------
 2 files changed, 62 deletions(-)
 delete mode 100644 num.py
 delete mode 100644 qwe.py

diff --git a/num.py b/num.py
deleted file mode 100644
index d5467b1cbf55..000000000000
--- a/num.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import numpy as np
-import ctypes
-
-# Размеры и параметры для примера
-batch_size = 2
-n = 3
-nrhs = 2
-coeff_matrix_elemsize = ctypes.sizeof(ctypes.c_double)
-dependent_vals_elemsize = ctypes.sizeof(ctypes.c_double)
-
-# Создаем примерные массивы NumPy
-coeff_matrix = np.arange(batch_size * n * n, dtype=np.double).reshape(batch_size, n, n)
-dependent_vals = np.arange(batch_size * n * nrhs, dtype=np.double).reshape(batch_size, n, nrhs)
-
-# my logic
-coeff_matrix = coeff_matrix.transpose((0, 2, 1))
-dependent_vals = dependent_vals.transpose((0, 2, 1))
-
-coeff_matrix = np.array(coeff_matrix, order="C")
-dependent_vals = np.array(dependent_vals, order="C")
-
-# # Sasha`s logic
-# coeff_matrix = np.moveaxis(coeff_matrix, (-2, -1), (0, 1))
-# dependent_vals = np.moveaxis(dependent_vals, (-2, -1), (0, 1))
-
-# coeff_matrix = np.array(coeff_matrix, order="F")
-# dependent_vals = np.array(dependent_vals, order="F")
-
-# get pointer
-coeff_matrix_data = coeff_matrix.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
-dependent_vals_data = dependent_vals.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
-
-# Функция для получения значений по указателю с учетом смещения и размера элемента
-def get_values_from_pointer(ptr, step, count, elemsize):
-    result = []
-    for i in range(count):
-        result.append(ptr[i * step // elemsize])
-    return result
-
-
-for i in range(batch_size):
-    coeff_matrix_batch = ctypes.cast(ctypes.addressof(coeff_matrix_data.contents) + i * n * n * coeff_matrix_elemsize, ctypes.POINTER(ctypes.c_double))
-    dependent_vals_batch = ctypes.cast(ctypes.addressof(dependent_vals_data.contents) + i * n * nrhs * dependent_vals_elemsize, ctypes.POINTER(ctypes.c_double))
-
-    # get value by new pointer
-    coeff_values = get_values_from_pointer(coeff_matrix_batch, coeff_matrix_elemsize, n * n, coeff_matrix_elemsize)
-    dependent_values = get_values_from_pointer(dependent_vals_batch, dependent_vals_elemsize, n * nrhs, dependent_vals_elemsize)
-
-    print(f"Batch {i + 1}:")
-    print("Coefficients:", coeff_values)
-    print("Dependent values:", dependent_values)
diff --git a/qwe.py b/qwe.py
deleted file mode 100644
index e373bab0d566..000000000000
--- a/qwe.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import dpnp
-import numpy
-from tests.helper import generate_random_numpy_array
-
-a = dpnp.array(generate_random_numpy_array((2,3,3,3,3),dtype='f4',hermitian=False, seed_value=81))
-b = dpnp.array(generate_random_numpy_array((2,3,3,3),dtype='f4',hermitian=False, seed_value=76))
-
-res = dpnp.linalg.solve(a,b,new=True)
-
-print(res.shape)
-print(res.flags)

From e5b53a18f8fea74ff5ccb128bf7c6426fbeb3974 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 26 Jul 2024 14:27:05 +0200
Subject: [PATCH 20/37] Remove host_task_events from gesv

---
 dpnp/backend/extensions/lapack/gesv.cpp | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index b1a35cd1f416..2da4de5b625a 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -141,7 +141,6 @@ typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &,
                                           std::int64_t,
                                           char *,
                                           std::int64_t,
-                                          std::vector<sycl::event> &,
                                           const std::vector<sycl::event> &);
 
 static gesv_impl_fn_ptr_t gesv_dispatch_vector[dpctl_td_ns::num_types];
@@ -154,7 +153,6 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
                              std::int64_t lda,
                              char *in_b,
                              std::int64_t ldb,
-                             std::vector<sycl::event> &host_task_events,
                              const std::vector<sycl::event> &depends)
 {
     type_utils::validate_type_for_device<T>(exec_q);
@@ -248,7 +246,7 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
         throw std::runtime_error(error_msg.str());
     }
 
-    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+    sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(gesv_event);
         auto ctx = exec_q.get_context();
         cgh.host_task([ctx, scratchpad, ipiv]() {
@@ -256,9 +254,8 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
             sycl::free(ipiv, ctx);
         });
     });
-    host_task_events.push_back(clean_up_event);
 
-    return gesv_event;
+    return ht_ev;
 }
 
 std::pair<sycl::event, sycl::event>
@@ -312,15 +309,13 @@ std::pair<sycl::event, sycl::event>
     const std::int64_t lda = std::max<size_t>(1UL, n);
     const std::int64_t ldb = std::max<size_t>(1UL, n);
 
-    std::vector<sycl::event> host_task_events;
-    sycl::event gesv_ev =
-        gesv_fn(exec_q, n, nrhs, coeff_matrix_data, lda, dependent_vals_data,
-                ldb, host_task_events, depends);
+    sycl::event gesv_ev = gesv_fn(exec_q, n, nrhs, coeff_matrix_data, lda,
+                                  dependent_vals_data, ldb, depends);
 
-    sycl::event args_ev = dpctl::utils::keep_args_alive(
-        exec_q, {coeff_matrix, dependent_vals}, host_task_events);
+    sycl::event ht_ev = dpctl::utils::keep_args_alive(
+        exec_q, {coeff_matrix, dependent_vals}, {gesv_ev});
 
-    return std::make_pair(args_ev, gesv_ev);
+    return std::make_pair(ht_ev, gesv_ev);
 }
 
 template <typename fnT, typename T>

From d5adbd65f23e7363fb5ae11631210a12ec07cde9 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 26 Jul 2024 15:03:09 +0200
Subject: [PATCH 21/37] Use check_zeros_shape in gesv and gesv_batch

---
 dpnp/backend/extensions/lapack/gesv.cpp       | 13 ++++++------
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 21 +++++++++----------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 2da4de5b625a..bfaa4d5657a1 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -264,6 +264,7 @@ std::pair<sycl::event, sycl::event>
          dpctl::tensor::usm_ndarray dependent_vals,
          const std::vector<sycl::event> &depends)
 {
+    const int coeff_matrix_nd = coeff_matrix.get_ndim();
     const int dependent_vals_nd = dependent_vals.get_ndim();
 
     const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
@@ -277,13 +278,11 @@ std::pair<sycl::event, sycl::event>
                        dependent_vals_shape, expected_coeff_matrix_ndim,
                        min_dependent_vals_ndim, max_dependent_vals_ndim);
 
-    size_t src_nelems(1);
-
-    for (int i = 0; i < dependent_vals_nd; ++i) {
-        src_nelems *= static_cast<size_t>(dependent_vals_shape[i]);
-    }
-
-    if (src_nelems == 0) {
+    // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty
+    // events
+    if (helper::check_zeros_shape(coeff_matrix_nd, coeff_matrix_shape) ||
+        helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape))
+    {
         // nothing to do
         return std::make_pair(sycl::event(), sycl::event());
     }
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 22c6c8df6114..5013389dfb31 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -241,6 +241,7 @@ std::pair<sycl::event, sycl::event>
                dpctl::tensor::usm_ndarray dependent_vals,
                const std::vector<sycl::event> &depends)
 {
+    const int coeff_matrix_nd = coeff_matrix.get_ndim();
     const int dependent_vals_nd = dependent_vals.get_ndim();
 
     const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
@@ -254,6 +255,15 @@ std::pair<sycl::event, sycl::event>
                        dependent_vals_shape, expected_coeff_matrix_ndim,
                        min_dependent_vals_ndim, max_dependent_vals_ndim);
 
+    // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty
+    // events
+    if (helper::check_zeros_shape(coeff_matrix_nd, coeff_matrix_shape) ||
+        helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape))
+    {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     if (dependent_vals_nd == 2) {
         if (coeff_matrix_shape[2] != dependent_vals_shape[1]) {
             throw py::value_error(
@@ -275,17 +285,6 @@ std::pair<sycl::event, sycl::event>
         }
     }
 
-    size_t src_nelems(1);
-
-    for (int i = 0; i < dependent_vals_nd; ++i) {
-        src_nelems *= static_cast<size_t>(dependent_vals_shape[i]);
-    }
-
-    if (src_nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
     auto array_types = dpctl_td_ns::usm_ndarray_types();
     int coeff_matrix_type_id =
         array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());

From 5b2780c68555d7dc1ce426216909b536092b571f Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 26 Jul 2024 15:16:46 +0200
Subject: [PATCH 22/37] Add additional checks for gesv_impl

---
 dpnp/backend/extensions/lapack/gesv.cpp | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index bfaa4d5657a1..a3d22eed38ec 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -162,9 +162,25 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
 
     const std::int64_t scratchpad_size =
         mkl_lapack::gesv_scratchpad_size<T>(exec_q, n, nrhs, lda, ldb);
+
+    if (scratchpad_size <= 0) {
+        throw std::runtime_error(
+            "Invalid scratchpad size: must be greater than zero."
+            "Calculated scratchpad size: " +
+            std::to_string(scratchpad_size));
+    }
+
     T *scratchpad = nullptr;
+    // Allocate memory for the scratchpad
+    scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+    if (!scratchpad)
+        throw std::runtime_error("Device allocation for scratchpad failed");
 
     std::int64_t *ipiv = nullptr;
+    // Allocate memory for the ipiv
+    ipiv = sycl::malloc_device<std::int64_t>(n, exec_q);
+    if (!ipiv)
+        throw std::runtime_error("Device allocation for ipiv failed");
 
     std::stringstream error_msg;
     std::int64_t info = 0;
@@ -172,9 +188,6 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
 
     sycl::event gesv_event;
     try {
-        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
-        ipiv = sycl::malloc_device<std::int64_t>(n, exec_q);
-
         gesv_event = mkl_lapack::gesv(
             exec_q,
             n,    // The order of the square matrix A

From d4547d41beef163dca51775ffd229f88e5f9ca18 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 26 Jul 2024 15:30:01 +0200
Subject: [PATCH 23/37] Move alloc_scratchpad to common_helpers.hpp

---
 .../extensions/lapack/common_helpers.hpp      | 36 ++++++++++++++++++-
 .../extensions/lapack/evd_batch_common.hpp    | 30 ----------------
 .../backend/extensions/lapack/heevd_batch.cpp |  2 +-
 .../backend/extensions/lapack/syevd_batch.cpp |  2 +-
 4 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp
index df2b87028e48..e1812c9ad415 100644
--- a/dpnp/backend/extensions/lapack/common_helpers.hpp
+++ b/dpnp/backend/extensions/lapack/common_helpers.hpp
@@ -24,9 +24,11 @@
 //*****************************************************************************
 
 #pragma once
+#include <oneapi/mkl.hpp>
+#include <pybind11/pybind11.h>
+
 #include <complex>
 #include <cstring>
-#include <pybind11/pybind11.h>
 #include <stdexcept>
 
 namespace dpnp::extensions::lapack::helper
@@ -63,4 +65,36 @@ inline bool check_zeros_shape(int ndim, const py::ssize_t *shape)
     }
     return src_nelems == 0;
 }
+
+// Allocate the total scratchpad memory with proper alignment for batch
+// implementations
+template <typename T>
+inline T *alloc_scratchpad(std::int64_t scratchpad_size,
+                           std::int64_t n_linear_streams,
+                           sycl::queue &exec_q)
+{
+    // Get padding size to ensure memory allocations are aligned to 256 bytes
+    // for better performance
+    const std::int64_t padding = 256 / sizeof(T);
+
+    if (scratchpad_size <= 0) {
+        throw std::runtime_error(
+            "Invalid scratchpad size: must be greater than zero."
+            " Calculated scratchpad size: " +
+            std::to_string(scratchpad_size));
+    }
+
+    // Calculate the total scratchpad memory size needed for all linear
+    // streams with proper alignment
+    const size_t alloc_scratch_size =
+        round_up_mult(n_linear_streams * scratchpad_size, padding);
+
+    // Allocate memory for the total scratchpad
+    T *scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
+    if (!scratchpad) {
+        throw std::runtime_error("Device allocation for scratchpad failed");
+    }
+
+    return scratchpad;
+}
 } // namespace dpnp::extensions::lapack::helper
diff --git a/dpnp/backend/extensions/lapack/evd_batch_common.hpp b/dpnp/backend/extensions/lapack/evd_batch_common.hpp
index 5a7d985ff821..3fa5f98214a4 100644
--- a/dpnp/backend/extensions/lapack/evd_batch_common.hpp
+++ b/dpnp/backend/extensions/lapack/evd_batch_common.hpp
@@ -119,34 +119,4 @@ std::pair<sycl::event, sycl::event>
 
     return std::make_pair(ht_ev, evd_batch_ev);
 }
-
-template <typename T>
-inline T *alloc_scratchpad(std::int64_t scratchpad_size,
-                           std::int64_t n_linear_streams,
-                           sycl::queue &exec_q)
-{
-    // Get padding size to ensure memory allocations are aligned to 256 bytes
-    // for better performance
-    const std::int64_t padding = 256 / sizeof(T);
-
-    if (scratchpad_size <= 0) {
-        throw std::runtime_error(
-            "Invalid scratchpad size: must be greater than zero."
-            " Calculated scratchpad size: " +
-            std::to_string(scratchpad_size));
-    }
-
-    // Calculate the total scratchpad memory size needed for all linear
-    // streams with proper alignment
-    const size_t alloc_scratch_size =
-        helper::round_up_mult(n_linear_streams * scratchpad_size, padding);
-
-    // Allocate memory for the total scratchpad
-    T *scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
-    if (!scratchpad) {
-        throw std::runtime_error("Device allocation for scratchpad failed");
-    }
-
-    return scratchpad;
-}
 } // namespace dpnp::extensions::lapack::evd
diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp
index bc21b3da35d5..2ea37e22d7d9 100644
--- a/dpnp/backend/extensions/lapack/heevd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp
@@ -66,7 +66,7 @@ static sycl::event heevd_batch_impl(sycl::queue &exec_q,
         mkl_lapack::heevd_scratchpad_size<T>(exec_q, jobz, upper_lower, n, lda);
 
     T *scratchpad =
-        evd::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
+        helper::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
 
     // Computation events to manage dependencies for each linear stream
     std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends);
diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp
index d2b87fc260cc..aa5656e0ac3c 100644
--- a/dpnp/backend/extensions/lapack/syevd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp
@@ -66,7 +66,7 @@ static sycl::event syevd_batch_impl(sycl::queue &exec_q,
         mkl_lapack::syevd_scratchpad_size<T>(exec_q, jobz, upper_lower, n, lda);
 
     T *scratchpad =
-        evd::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
+        helper::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
 
     // Computation events to manage dependencies for each linear stream
     std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends);

From 67591649cc3812b20c92a5fd13a9e319bd766764 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 26 Jul 2024 15:35:24 +0200
Subject: [PATCH 24/37] Use helper::alloc_scratchpad in gesv_batch_impl

---
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 5013389dfb31..197049cd0b3a 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -82,23 +82,13 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
 
     const std::int64_t scratchpad_size =
         mkl_lapack::gesv_scratchpad_size<T>(exec_q, n, nrhs, lda, ldb);
-    T *scratchpad = nullptr;
+
+    T *scratchpad =
+        helper::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
 
     // Get padding size to ensure memory allocations are aligned to 256 bytes
     // for better performance
-    std::int64_t padding = 256 / sizeof(T);
-
-    if (scratchpad_size > 0) {
-        // Calculate the total scratchpad memory size needed for all linear
-        // streams with proper alignment
-        size_t alloc_scratch_size =
-            helper::round_up_mult(n_linear_streams * scratchpad_size, padding);
-
-        // Allocate memory for the total scratchpad
-        scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
-        if (!scratchpad)
-            throw std::runtime_error("Device allocation for scratchpad failed");
-    }
+    const std::int64_t padding = 256 / sizeof(T);
 
     // Calculate the total size needed for the pivot indices array for all
     // linear streams with proper alignment

From f37ec435583137600b4bfd91ff1d25de1d3822f9 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 26 Jul 2024 15:39:22 +0200
Subject: [PATCH 25/37] Remove current_scratch_gesv check

---
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 197049cd0b3a..42e9d857701b 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -118,10 +118,7 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
 
         std::int64_t stream_id = (batch_id % n_linear_streams);
 
-        T *current_scratch_gesv = nullptr;
-        if (scratchpad != nullptr) {
-            current_scratch_gesv = scratchpad + stream_id * scratchpad_size;
-        }
+        T *current_scratch_gesv = scratchpad + stream_id * scratchpad_size;
         std::int64_t *current_ipiv = ipiv + stream_id * n;
 
         // Get the event dependencies for the current stream

From adc17ba78f7260609d618c347b7e8233709f886e Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 26 Jul 2024 15:46:44 +0200
Subject: [PATCH 26/37] Remove lda, ldb pass to gesv_batch_impl, gesv_impl

---
 dpnp/backend/extensions/lapack/gesv.cpp       | 14 +++++---------
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 14 +++++---------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index a3d22eed38ec..c215769d270f 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -138,9 +138,7 @@ typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &,
                                           const std::int64_t,
                                           const std::int64_t,
                                           char *,
-                                          std::int64_t,
                                           char *,
-                                          std::int64_t,
                                           const std::vector<sycl::event> &);
 
 static gesv_impl_fn_ptr_t gesv_dispatch_vector[dpctl_td_ns::num_types];
@@ -150,9 +148,7 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
                              const std::int64_t n,
                              const std::int64_t nrhs,
                              char *in_a,
-                             std::int64_t lda,
                              char *in_b,
-                             std::int64_t ldb,
                              const std::vector<sycl::event> &depends)
 {
     type_utils::validate_type_for_device<T>(exec_q);
@@ -160,6 +156,9 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
     T *a = reinterpret_cast<T *>(in_a);
     T *b = reinterpret_cast<T *>(in_b);
 
+    const std::int64_t lda = std::max<size_t>(1UL, n);
+    const std::int64_t ldb = std::max<size_t>(1UL, n);
+
     const std::int64_t scratchpad_size =
         mkl_lapack::gesv_scratchpad_size<T>(exec_q, n, nrhs, lda, ldb);
 
@@ -318,11 +317,8 @@ std::pair<sycl::event, sycl::event>
     const std::int64_t nrhs =
         (dependent_vals_nd > 1) ? dependent_vals_shape[1] : 1;
 
-    const std::int64_t lda = std::max<size_t>(1UL, n);
-    const std::int64_t ldb = std::max<size_t>(1UL, n);
-
-    sycl::event gesv_ev = gesv_fn(exec_q, n, nrhs, coeff_matrix_data, lda,
-                                  dependent_vals_data, ldb, depends);
+    sycl::event gesv_ev = gesv_fn(exec_q, n, nrhs, coeff_matrix_data,
+                                  dependent_vals_data, depends);
 
     sycl::event ht_ev = dpctl::utils::keep_args_alive(
         exec_q, {coeff_matrix, dependent_vals}, {gesv_ev});
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 42e9d857701b..f5e47389b4b9 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -49,9 +49,7 @@ typedef sycl::event (*gesv_batch_impl_fn_ptr_t)(
     const std::int64_t,
     const std::int64_t,
     char *,
-    std::int64_t,
     char *,
-    std::int64_t,
     const std::vector<sycl::event> &);
 
 static gesv_batch_impl_fn_ptr_t
@@ -63,9 +61,7 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
                                    const std::int64_t nrhs,
                                    const std::int64_t batch_size,
                                    char *in_a,
-                                   std::int64_t lda,
                                    char *in_b,
-                                   std::int64_t ldb,
                                    const std::vector<sycl::event> &depends)
 {
     type_utils::validate_type_for_device<T>(exec_q);
@@ -76,6 +72,9 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
     std::int64_t a_size = n * n;
     std::int64_t b_size = n * nrhs;
 
+    const std::int64_t lda = std::max<size_t>(1UL, n);
+    const std::int64_t ldb = std::max<size_t>(1UL, n);
+
     // Get the number of independent linear streams
     std::int64_t n_linear_streams =
         (batch_size > 16) ? 4 : ((batch_size > 4 ? 2 : 1));
@@ -292,12 +291,9 @@ std::pair<sycl::event, sycl::event>
     const std::int64_t nrhs =
         (dependent_vals_nd > 2) ? dependent_vals_shape[1] : 1;
 
-    const std::int64_t lda = std::max<size_t>(1UL, n);
-    const std::int64_t ldb = std::max<size_t>(1UL, n);
-
     sycl::event gesv_ev =
-        gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data, lda,
-                      dependent_vals_data, ldb, depends);
+        gesv_batch_fn(exec_q, n, nrhs, batch_size, coeff_matrix_data,
+                      dependent_vals_data, depends);
 
     sycl::event ht_ev = dpctl::utils::keep_args_alive(
         exec_q, {coeff_matrix, dependent_vals}, {gesv_ev});

From 77ba0e2375eb5a494aff498729aea47945017dbb Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 26 Jul 2024 15:55:32 +0200
Subject: [PATCH 27/37] Use const and constexpr in gesv/gesv_batch

---
 dpnp/backend/extensions/lapack/gesv.cpp       | 16 ++++++++--------
 dpnp/backend/extensions/lapack/gesv_batch.cpp |  8 ++++----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index c215769d270f..52b2384cc040 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -110,22 +110,22 @@ void common_gesv_checks(sycl::queue &exec_q,
     dpctl::tensor::validation::CheckWritable::throw_if_not_writable(
         dependent_vals);
 
-    bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
+    const bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
     if (!is_coeff_matrix_f_contig) {
         throw py::value_error("The coefficient matrix "
                               "must be F-contiguous.");
     }
 
-    bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
+    const bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
     if (!is_dependent_vals_f_contig) {
         throw py::value_error("The array of dependent variables "
                               "must be F-contiguous.");
     }
 
     auto array_types = dpctl_td_ns::usm_ndarray_types();
-    int coeff_matrix_type_id =
+    const int coeff_matrix_type_id =
         array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
-    int dependent_vals_type_id =
+    const int dependent_vals_type_id =
         array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
 
     if (coeff_matrix_type_id != dependent_vals_type_id) {
@@ -282,9 +282,9 @@ std::pair<sycl::event, sycl::event>
     const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
     const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
 
-    const int expected_coeff_matrix_ndim = 2;
-    const int min_dependent_vals_ndim = 1;
-    const int max_dependent_vals_ndim = 2;
+    constexpr int expected_coeff_matrix_ndim = 2;
+    constexpr int min_dependent_vals_ndim = 1;
+    constexpr int max_dependent_vals_ndim = 2;
 
     common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape,
                        dependent_vals_shape, expected_coeff_matrix_ndim,
@@ -300,7 +300,7 @@ std::pair<sycl::event, sycl::event>
     }
 
     auto array_types = dpctl_td_ns::usm_ndarray_types();
-    int coeff_matrix_type_id =
+    const int coeff_matrix_type_id =
         array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
 
     gesv_impl_fn_ptr_t gesv_fn = gesv_dispatch_vector[coeff_matrix_type_id];
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index f5e47389b4b9..233d323247f0 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -233,9 +233,9 @@ std::pair<sycl::event, sycl::event>
     const py::ssize_t *coeff_matrix_shape = coeff_matrix.get_shape_raw();
     const py::ssize_t *dependent_vals_shape = dependent_vals.get_shape_raw();
 
-    const int expected_coeff_matrix_ndim = 3;
-    const int min_dependent_vals_ndim = 2;
-    const int max_dependent_vals_ndim = 3;
+    constexpr int expected_coeff_matrix_ndim = 3;
+    constexpr int min_dependent_vals_ndim = 2;
+    constexpr int max_dependent_vals_ndim = 3;
 
     common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape,
                        dependent_vals_shape, expected_coeff_matrix_ndim,
@@ -272,7 +272,7 @@ std::pair<sycl::event, sycl::event>
     }
 
     auto array_types = dpctl_td_ns::usm_ndarray_types();
-    int coeff_matrix_type_id =
+    const int coeff_matrix_type_id =
         array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
 
     gesv_batch_impl_fn_ptr_t gesv_batch_fn =

From 9bf94b51c7321e1bef6dbd39bc6d61f48829bb9c Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 29 Jul 2024 11:45:17 +0200
Subject: [PATCH 28/37] Applied review comments

---
 dpnp/backend/extensions/lapack/gesv.cpp       | 6 +++---
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 52b2384cc040..92e03213c42e 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -75,10 +75,10 @@ void common_gesv_checks(sycl::queue &exec_q,
 
     // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
     // for gesv
-    // with the shapes (n,n) and (n,nrhs) or (n,) respectively;
+    // with the shapes (n, n) and (n, nrhs) or (n, ) respectively;
     // for gesv_batch
-    // with the shapes (n,n,batch_size) and (n,nrhs,batch_size) or
-    // (n,batch_size) respectively
+    // with the shapes (n, n, batch_size) and (n, nrhs, batch_size) or
+    // (n, batch_size) respectively
     if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
         throw py::value_error("The coefficient matrix must be square,"
                               " but got a shape of (" +
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 233d323247f0..42cb7a36814f 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -69,14 +69,14 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
     T *a = reinterpret_cast<T *>(in_a);
     T *b = reinterpret_cast<T *>(in_b);
 
-    std::int64_t a_size = n * n;
-    std::int64_t b_size = n * nrhs;
+    const std::int64_t a_size = n * n;
+    const std::int64_t b_size = n * nrhs;
 
     const std::int64_t lda = std::max<size_t>(1UL, n);
     const std::int64_t ldb = std::max<size_t>(1UL, n);
 
     // Get the number of independent linear streams
-    std::int64_t n_linear_streams =
+    const std::int64_t n_linear_streams =
         (batch_size > 16) ? 4 : ((batch_size > 4 ? 2 : 1));
 
     const std::int64_t scratchpad_size =

From b81893c3cb9be2270789b18a7c5df049e2a13151 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 29 Jul 2024 11:48:48 +0200
Subject: [PATCH 29/37] Use dpnp.reshape in _batched_solve

---
 dpnp/linalg/dpnp_utils_linalg.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 6dfa941cd0cd..fe417413cf5a 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -266,11 +266,11 @@ def _batched_solve(a, b, exec_q, res_usm_type, res_type):
     # gesv_batch expects `a` to be a 3D array and
     # `b` to be either a 2D or 3D array.
     if a.ndim == b.ndim:
-        b = b.reshape(-1, b_shape[-2], b_shape[-1])
+        b = dpnp.reshape(b, (-1, b_shape[-2], b_shape[-1]))
     else:
-        b = b.reshape(-1, b_shape[-1])
+        b = dpnp.reshape(b, (-1, b_shape[-1]))
 
-    a = a.reshape(-1, a_shape[-2], a_shape[-1])
+    a = dpnp.reshape(a, (-1, a_shape[-2], a_shape[-1]))
 
     # Reorder the elements by moving the last two axes of `a` to the front
     # to match fortran-like array order which is assumed by gesv.

From f8d68efe476fbb721eb2f22af9db2165c263e84e Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 29 Jul 2024 12:12:46 +0200
Subject: [PATCH 30/37] Implement alloc_ipiv in common_helpers.hpp

---
 .../extensions/lapack/common_helpers.hpp      | 24 +++++++++++++++++++
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 15 +-----------
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp
index e1812c9ad415..aeeb3fb2793c 100644
--- a/dpnp/backend/extensions/lapack/common_helpers.hpp
+++ b/dpnp/backend/extensions/lapack/common_helpers.hpp
@@ -66,6 +66,30 @@ inline bool check_zeros_shape(int ndim, const py::ssize_t *shape)
     return src_nelems == 0;
 }
 
+// Allocate the total memory for the total pivot indices with proper alignment
+// for batch implementations
+template <typename T>
+inline std::int64_t *alloc_ipiv(const std::int64_t n,
+                                std::int64_t n_linear_streams,
+                                sycl::queue &exec_q)
+{
+    // Get padding size to ensure memory allocations are aligned to 256 bytes
+    // for better performance
+    const std::int64_t padding = 256 / sizeof(T);
+
+    // Calculate the total size needed for the pivot indices array for all
+    // linear streams with proper alignment
+    size_t alloc_ipiv_size = round_up_mult(n_linear_streams * n, padding);
+
+    // Allocate memory for the total pivot indices array
+    std::int64_t *ipiv =
+        sycl::malloc_device<std::int64_t>(alloc_ipiv_size, exec_q);
+    if (!ipiv)
+        throw std::runtime_error("Device allocation for ipiv failed");
+
+    return ipiv;
+}
+
 // Allocate the total scratchpad memory with proper alignment for batch
 // implementations
 template <typename T>
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 42cb7a36814f..89f2a4fe9d77 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -85,20 +85,7 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
     T *scratchpad =
         helper::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
 
-    // Get padding size to ensure memory allocations are aligned to 256 bytes
-    // for better performance
-    const std::int64_t padding = 256 / sizeof(T);
-
-    // Calculate the total size needed for the pivot indices array for all
-    // linear streams with proper alignment
-    size_t alloc_ipiv_size =
-        helper::round_up_mult(n_linear_streams * n, padding);
-
-    // Allocate memory for the total pivot indices array
-    std::int64_t *ipiv =
-        sycl::malloc_device<std::int64_t>(alloc_ipiv_size, exec_q);
-    if (!ipiv)
-        throw std::runtime_error("Device allocation for ipiv failed");
+    std::int64_t *ipiv = helper::alloc_ipiv<T>(n, n_linear_streams, exec_q);
 
     // Computation events to manage dependencies for each linear stream
     std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends);

From fc6c7fabf4c9a776a9b065f5022baf192b8d46dc Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 29 Jul 2024 13:15:24 +0200
Subject: [PATCH 31/37] Add gesv_common_utils.hpp

---
 dpnp/backend/extensions/lapack/gesv.cpp       | 103 +-------------
 dpnp/backend/extensions/lapack/gesv_batch.cpp |  12 +-
 .../extensions/lapack/gesv_common_utils.hpp   | 130 ++++++++++++++++++
 3 files changed, 140 insertions(+), 105 deletions(-)
 create mode 100644 dpnp/backend/extensions/lapack/gesv_common_utils.hpp

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 92e03213c42e..be7df6bbf11c 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -26,114 +26,20 @@
 #include <pybind11/pybind11.h>
 
 // dpctl tensor headers
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
 #include "utils/type_utils.hpp"
 
 #include "common_helpers.hpp"
 #include "gesv.hpp"
+#include "gesv_common_utils.hpp"
 #include "linalg_exceptions.hpp"
 #include "types_matrix.hpp"
 
-#include "dpnp_utils.hpp"
-
 namespace dpnp::extensions::lapack
 {
 namespace mkl_lapack = oneapi::mkl::lapack;
 namespace py = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
 
-void common_gesv_checks(sycl::queue &exec_q,
-                        dpctl::tensor::usm_ndarray coeff_matrix,
-                        dpctl::tensor::usm_ndarray dependent_vals,
-                        const py::ssize_t *coeff_matrix_shape,
-                        const py::ssize_t *dependent_vals_shape,
-                        const int expected_coeff_matrix_ndim,
-                        const int min_dependent_vals_ndim,
-                        const int max_dependent_vals_ndim)
-{
-    const int coeff_matrix_nd = coeff_matrix.get_ndim();
-    const int dependent_vals_nd = dependent_vals.get_ndim();
-
-    if (coeff_matrix_nd != expected_coeff_matrix_ndim) {
-        throw py::value_error("The coefficient matrix has ndim=" +
-                              std::to_string(coeff_matrix_nd) + ", but a " +
-                              std::to_string(expected_coeff_matrix_ndim) +
-                              "-dimensional array is expected.");
-    }
-
-    if (dependent_vals_nd < min_dependent_vals_ndim ||
-        dependent_vals_nd > max_dependent_vals_ndim)
-    {
-        throw py::value_error("The dependent values array has ndim=" +
-                              std::to_string(dependent_vals_nd) + ", but a " +
-                              std::to_string(min_dependent_vals_ndim) +
-                              "-dimensional or a " +
-                              std::to_string(max_dependent_vals_ndim) +
-                              "-dimensional array is expected.");
-    }
-
-    // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
-    // for gesv
-    // with the shapes (n, n) and (n, nrhs) or (n, ) respectively;
-    // for gesv_batch
-    // with the shapes (n, n, batch_size) and (n, nrhs, batch_size) or
-    // (n, batch_size) respectively
-    if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
-        throw py::value_error("The coefficient matrix must be square,"
-                              " but got a shape of (" +
-                              std::to_string(coeff_matrix_shape[0]) + ", " +
-                              std::to_string(coeff_matrix_shape[1]) + ").");
-    }
-    if (coeff_matrix_shape[0] != dependent_vals_shape[0]) {
-        throw py::value_error("The first dimension (n) of coeff_matrix and"
-                              " dependent_vals must be the same, but got " +
-                              std::to_string(coeff_matrix_shape[0]) + " and " +
-                              std::to_string(dependent_vals_shape[0]) + ".");
-    }
-
-    // check compatibility of execution queue and allocation queue
-    if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {coeff_matrix, dependent_vals}))
-    {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues.");
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(coeff_matrix, dependent_vals)) {
-        throw py::value_error(
-            "The arrays of coefficients and dependent variables "
-            "are overlapping segments of memory.");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(
-        dependent_vals);
-
-    const bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
-    if (!is_coeff_matrix_f_contig) {
-        throw py::value_error("The coefficient matrix "
-                              "must be F-contiguous.");
-    }
-
-    const bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
-    if (!is_dependent_vals_f_contig) {
-        throw py::value_error("The array of dependent variables "
-                              "must be F-contiguous.");
-    }
-
-    auto array_types = dpctl_td_ns::usm_ndarray_types();
-    const int coeff_matrix_type_id =
-        array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
-    const int dependent_vals_type_id =
-        array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
-
-    if (coeff_matrix_type_id != dependent_vals_type_id) {
-        throw py::value_error("The types of the coefficient matrix and "
-                              "dependent variables are mismatched.");
-    }
-}
-
 typedef sycl::event (*gesv_impl_fn_ptr_t)(sycl::queue &,
                                           const std::int64_t,
                                           const std::int64_t,
@@ -286,9 +192,10 @@ std::pair<sycl::event, sycl::event>
     constexpr int min_dependent_vals_ndim = 1;
     constexpr int max_dependent_vals_ndim = 2;
 
-    common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape,
-                       dependent_vals_shape, expected_coeff_matrix_ndim,
-                       min_dependent_vals_ndim, max_dependent_vals_ndim);
+    gesv_utils::common_gesv_checks(
+        exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape,
+        dependent_vals_shape, expected_coeff_matrix_ndim,
+        min_dependent_vals_ndim, max_dependent_vals_ndim);
 
     // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty
     // events
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 89f2a4fe9d77..bbecc7985c00 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -26,17 +26,14 @@
 #include <pybind11/pybind11.h>
 
 // dpctl tensor headers
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
 #include "utils/type_utils.hpp"
 
 #include "common_helpers.hpp"
 #include "gesv.hpp"
+#include "gesv_common_utils.hpp"
 #include "linalg_exceptions.hpp"
 #include "types_matrix.hpp"
 
-#include "dpnp_utils.hpp"
-
 namespace dpnp::extensions::lapack
 {
 namespace mkl_lapack = oneapi::mkl::lapack;
@@ -224,9 +221,10 @@ std::pair<sycl::event, sycl::event>
     constexpr int min_dependent_vals_ndim = 2;
     constexpr int max_dependent_vals_ndim = 3;
 
-    common_gesv_checks(exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape,
-                       dependent_vals_shape, expected_coeff_matrix_ndim,
-                       min_dependent_vals_ndim, max_dependent_vals_ndim);
+    gesv_utils::common_gesv_checks(
+        exec_q, coeff_matrix, dependent_vals, coeff_matrix_shape,
+        dependent_vals_shape, expected_coeff_matrix_ndim,
+        min_dependent_vals_ndim, max_dependent_vals_ndim);
 
     // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty
     // events
diff --git a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp
new file mode 100644
index 000000000000..8cf148255650
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp
@@ -0,0 +1,130 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpnp::extensions::lapack::gesv_utils
+{
+namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
+namespace py = pybind11;
+
+inline void common_gesv_checks(sycl::queue &exec_q,
+                               dpctl::tensor::usm_ndarray coeff_matrix,
+                               dpctl::tensor::usm_ndarray dependent_vals,
+                               const py::ssize_t *coeff_matrix_shape,
+                               const py::ssize_t *dependent_vals_shape,
+                               const int expected_coeff_matrix_ndim,
+                               const int min_dependent_vals_ndim,
+                               const int max_dependent_vals_ndim)
+{
+    const int coeff_matrix_nd = coeff_matrix.get_ndim();
+    const int dependent_vals_nd = dependent_vals.get_ndim();
+
+    if (coeff_matrix_nd != expected_coeff_matrix_ndim) {
+        throw py::value_error("The coefficient matrix has ndim=" +
+                              std::to_string(coeff_matrix_nd) + ", but a " +
+                              std::to_string(expected_coeff_matrix_ndim) +
+                              "-dimensional array is expected.");
+    }
+
+    if (dependent_vals_nd < min_dependent_vals_ndim ||
+        dependent_vals_nd > max_dependent_vals_ndim)
+    {
+        throw py::value_error("The dependent values array has ndim=" +
+                              std::to_string(dependent_vals_nd) + ", but a " +
+                              std::to_string(min_dependent_vals_ndim) +
+                              "-dimensional or a " +
+                              std::to_string(max_dependent_vals_ndim) +
+                              "-dimensional array is expected.");
+    }
+
+    // The coeff_matrix and dependent_vals arrays must be F-contiguous arrays
+    // for gesv
+    // with the shapes (n, n) and (n, nrhs) or (n, ) respectively;
+    // for gesv_batch
+    // with the shapes (n, n, batch_size) and (n, nrhs, batch_size) or
+    // (n, batch_size) respectively
+    if (coeff_matrix_shape[0] != coeff_matrix_shape[1]) {
+        throw py::value_error("The coefficient matrix must be square,"
+                              " but got a shape of (" +
+                              std::to_string(coeff_matrix_shape[0]) + ", " +
+                              std::to_string(coeff_matrix_shape[1]) + ").");
+    }
+    if (coeff_matrix_shape[0] != dependent_vals_shape[0]) {
+        throw py::value_error("The first dimension (n) of coeff_matrix and"
+                              " dependent_vals must be the same, but got " +
+                              std::to_string(coeff_matrix_shape[0]) + " and " +
+                              std::to_string(dependent_vals_shape[0]) + ".");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {coeff_matrix, dependent_vals}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(coeff_matrix, dependent_vals)) {
+        throw py::value_error(
+            "The arrays of coefficients and dependent variables "
+            "are overlapping segments of memory.");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(
+        dependent_vals);
+
+    const bool is_coeff_matrix_f_contig = coeff_matrix.is_f_contiguous();
+    if (!is_coeff_matrix_f_contig) {
+        throw py::value_error("The coefficient matrix "
+                              "must be F-contiguous.");
+    }
+
+    const bool is_dependent_vals_f_contig = dependent_vals.is_f_contiguous();
+    if (!is_dependent_vals_f_contig) {
+        throw py::value_error("The array of dependent variables "
+                              "must be F-contiguous.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    const int coeff_matrix_type_id =
+        array_types.typenum_to_lookup_id(coeff_matrix.get_typenum());
+    const int dependent_vals_type_id =
+        array_types.typenum_to_lookup_id(dependent_vals.get_typenum());
+
+    if (coeff_matrix_type_id != dependent_vals_type_id) {
+        throw py::value_error("The types of the coefficient matrix and "
+                              "dependent variables are mismatched.");
+    }
+}
+} // namespace dpnp::extensions::lapack::gesv_utils

From 75079d27535d4f1adff267d14ae6eaa0b19ab4e4 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 29 Jul 2024 14:07:10 +0200
Subject: [PATCH 32/37] Implement handle_lapack_exc function

---
 dpnp/backend/extensions/lapack/gesv.cpp       | 45 ++---------------
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 49 ++----------------
 .../extensions/lapack/gesv_common_utils.hpp   | 50 +++++++++++++++++++
 3 files changed, 58 insertions(+), 86 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index be7df6bbf11c..a8ef8d5d47c5 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -31,7 +31,6 @@
 #include "common_helpers.hpp"
 #include "gesv.hpp"
 #include "gesv_common_utils.hpp"
-#include "linalg_exceptions.hpp"
 #include "types_matrix.hpp"
 
 namespace dpnp::extensions::lapack
@@ -88,7 +87,6 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
         throw std::runtime_error("Device allocation for ipiv failed");
 
     std::stringstream error_msg;
-    std::int64_t info = 0;
     bool is_exception_caught = false;
 
     sycl::event gesv_event;
@@ -112,41 +110,8 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
             scratchpad_size, depends);
     } catch (mkl_lapack::exception const &e) {
         is_exception_caught = true;
-        info = e.info();
-
-        if (info < 0) {
-            error_msg << "Parameter number " << -info
-                      << " had an illegal value.";
-        }
-        else if (info == scratchpad_size && e.detail() != 0) {
-            error_msg
-                << "Insufficient scratchpad size. Required size is at least "
-                << e.detail();
-        }
-        else if (info > 0) {
-            T host_U;
-            exec_q.memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T))
-                .wait();
-
-            using ThresholdType = typename helper::value_type_of<T>::type;
-
-            const auto threshold =
-                std::numeric_limits<ThresholdType>::epsilon() * 100;
-            if (std::abs(host_U) < threshold) {
-                sycl::free(scratchpad, exec_q);
-                throw LinAlgError("The input coefficient matrix is singular.");
-            }
-            else {
-                error_msg << "Unexpected MKL exception caught during gesv() "
-                             "call:\nreason: "
-                          << e.what() << "\ninfo: " << e.info();
-            }
-        }
-        else {
-            error_msg << "Unexpected MKL exception caught during gesv() "
-                         "call:\nreason: "
-                      << e.what() << "\ninfo: " << e.info();
-        }
+        gesv_utils::handle_lapack_exc(exec_q, lda, a, scratchpad_size,
+                                      scratchpad, ipiv, e, error_msg);
     } catch (sycl::exception const &e) {
         is_exception_caught = true;
         error_msg << "Unexpected SYCL exception caught during gesv() call:\n"
@@ -155,12 +120,10 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
 
     if (is_exception_caught) // an unexpected error occurs
     {
-        if (scratchpad != nullptr) {
+        if (scratchpad != nullptr)
             sycl::free(scratchpad, exec_q);
-        }
-        if (ipiv != nullptr) {
+        if (ipiv != nullptr)
             sycl::free(ipiv, exec_q);
-        }
         throw std::runtime_error(error_msg.str());
     }
 
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index bbecc7985c00..62c115259d18 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -31,7 +31,6 @@
 #include "common_helpers.hpp"
 #include "gesv.hpp"
 #include "gesv_common_utils.hpp"
-#include "linalg_exceptions.hpp"
 #include "types_matrix.hpp"
 
 namespace dpnp::extensions::lapack
@@ -88,7 +87,6 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
     std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends);
 
     std::stringstream error_msg;
-    std::int64_t info = 0;
     bool is_exception_caught = false;
 
     // Release GIL to avoid serialization of host task
@@ -130,45 +128,8 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
                 scratchpad_size, current_dep);
         } catch (mkl_lapack::exception const &e) {
             is_exception_caught = true;
-            info = e.info();
-
-            if (info < 0) {
-                error_msg << "Parameter number " << -info
-                          << " had an illegal value.";
-            }
-            else if (info == scratchpad_size && e.detail() != 0) {
-                error_msg << "Insufficient scratchpad size. Required size is "
-                             "at least "
-                          << e.detail();
-            }
-            else if (info > 0) {
-                T host_U;
-                exec_q
-                    .memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T))
-                    .wait();
-
-                using ThresholdType = typename helper::value_type_of<T>::type;
-
-                const auto threshold =
-                    std::numeric_limits<ThresholdType>::epsilon() * 100;
-                if (std::abs(host_U) < threshold) {
-                    sycl::free(scratchpad, exec_q);
-                    sycl::free(ipiv, exec_q);
-                    throw LinAlgError(
-                        "The input coefficient matrix is singular.");
-                }
-                else {
-                    error_msg
-                        << "Unexpected MKL exception caught during gesv() "
-                           "call:\nreason: "
-                        << e.what() << "\ninfo: " << e.info();
-                }
-            }
-            else {
-                error_msg << "Unexpected MKL exception caught during gesv() "
-                             "call:\nreason: "
-                          << e.what() << "\ninfo: " << e.info();
-            }
+            gesv_utils::handle_lapack_exc(exec_q, lda, a, scratchpad_size,
+                                          scratchpad, ipiv, e, error_msg);
         } catch (sycl::exception const &e) {
             is_exception_caught = true;
             error_msg
@@ -182,12 +143,10 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
 
     if (is_exception_caught) // an unexpected error occurs
     {
-        if (scratchpad != nullptr) {
+        if (scratchpad != nullptr)
             sycl::free(scratchpad, exec_q);
-        }
-        if (ipiv != nullptr) {
+        if (ipiv != nullptr)
             sycl::free(ipiv, exec_q);
-        }
         throw std::runtime_error(error_msg.str());
     }
 
diff --git a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp
index 8cf148255650..4b4df013aab5 100644
--- a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp
+++ b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp
@@ -32,6 +32,9 @@
 #include "utils/output_validation.hpp"
 #include "utils/type_dispatch.hpp"
 
+#include "common_helpers.hpp"
+#include "linalg_exceptions.hpp"
+
 namespace dpnp::extensions::lapack::gesv_utils
 {
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
@@ -127,4 +130,51 @@ inline void common_gesv_checks(sycl::queue &exec_q,
                               "dependent variables are mismatched.");
     }
 }
+
+template <typename T>
+inline void handle_lapack_exc(sycl::queue &exec_q,
+                              const std::int64_t lda,
+                              T *a,
+                              std::int64_t scratchpad_size,
+                              T *scratchpad,
+                              std::int64_t *ipiv,
+                              const oneapi::mkl::lapack::exception &e,
+                              std::stringstream &error_msg)
+{
+    std::int64_t info = e.info();
+    if (info < 0) {
+        error_msg << "Parameter number " << -info << " had an illegal value.";
+    }
+    else if (info == scratchpad_size && e.detail() != 0) {
+        error_msg << "Insufficient scratchpad size. Required size is at least "
+                  << e.detail();
+    }
+    else if (info > 0) {
+        T host_U;
+        exec_q.memcpy(&host_U, &a[(info - 1) * lda + info - 1], sizeof(T))
+            .wait();
+
+        using ThresholdType = typename helper::value_type_of<T>::type;
+
+        const auto threshold =
+            std::numeric_limits<ThresholdType>::epsilon() * 100;
+        if (std::abs(host_U) < threshold) {
+            if (scratchpad != nullptr)
+                sycl::free(scratchpad, exec_q);
+            if (ipiv != nullptr)
+                sycl::free(ipiv, exec_q);
+            throw LinAlgError("The input coefficient matrix is singular.");
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during gesv() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << e.info();
+        }
+    }
+    else {
+        error_msg
+            << "Unexpected MKL exception caught during gesv() call:\nreason: "
+            << e.what() << "\ninfo: " << e.info();
+    }
+}
 } // namespace dpnp::extensions::lapack::gesv_utils

From 7e0f384de1a6507575db2e70451ab2e6e4b18622 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 29 Jul 2024 15:47:17 +0200
Subject: [PATCH 33/37] Use try/catch for scratchpad/ipiv allocation

---
 .../extensions/lapack/common_helpers.hpp      | 26 ++++++++++----
 dpnp/backend/extensions/lapack/gesv.cpp       | 34 +++++++++++++++----
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp
index aeeb3fb2793c..d3616346056e 100644
--- a/dpnp/backend/extensions/lapack/common_helpers.hpp
+++ b/dpnp/backend/extensions/lapack/common_helpers.hpp
@@ -82,10 +82,17 @@ inline std::int64_t *alloc_ipiv(const std::int64_t n,
     size_t alloc_ipiv_size = round_up_mult(n_linear_streams * n, padding);
 
     // Allocate memory for the total pivot indices array
-    std::int64_t *ipiv =
-        sycl::malloc_device<std::int64_t>(alloc_ipiv_size, exec_q);
-    if (!ipiv)
-        throw std::runtime_error("Device allocation for ipiv failed");
+    try {
+        std::int64_t *ipiv =
+            sycl::malloc_device<std::int64_t>(alloc_ipiv_size, exec_q);
+        if (!ipiv)
+            throw std::runtime_error("Device allocation for ipiv failed");
+    } catch (sycl::exception const &e) {
+        throw std::runtime_error(
+            std::string(
+                "Unexpected SYCL exception caught during ipiv allocation: ") +
+            e.what());
+    }
 
     return ipiv;
 }
@@ -114,9 +121,14 @@ inline T *alloc_scratchpad(std::int64_t scratchpad_size,
         round_up_mult(n_linear_streams * scratchpad_size, padding);
 
     // Allocate memory for the total scratchpad
-    T *scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
-    if (!scratchpad) {
-        throw std::runtime_error("Device allocation for scratchpad failed");
+    try {
+        T *scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
+        if (!scratchpad)
+            throw std::runtime_error("Device allocation for scratchpad failed");
+    } catch (sycl::exception const &e) {
+        throw std::runtime_error(std::string("Unexpected SYCL exception caught "
+                                             "during scratchpad allocation: ") +
+                                 e.what());
     }
 
     return scratchpad;
diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index a8ef8d5d47c5..e912645e0387 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -76,15 +76,37 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
 
     T *scratchpad = nullptr;
     // Allocate memory for the scratchpad
-    scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
-    if (!scratchpad)
-        throw std::runtime_error("Device allocation for scratchpad failed");
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+        if (!scratchpad)
+            throw std::runtime_error("Device allocation for scratchpad failed");
+    } catch (sycl::exception const &e) {
+        if (scratchpad != nullptr)
+            sycl::free(scratchpad, exec_q);
+        throw std::runtime_error(std::string("Unexpected SYCL exception caught "
+                                             "during scratchpad allocation: ") +
+                                 e.what());
+    }
 
     std::int64_t *ipiv = nullptr;
     // Allocate memory for the ipiv
-    ipiv = sycl::malloc_device<std::int64_t>(n, exec_q);
-    if (!ipiv)
-        throw std::runtime_error("Device allocation for ipiv failed");
+    try {
+        ipiv = sycl::malloc_device<std::int64_t>(n, exec_q);
+        if (!ipiv) {
+            if (scratchpad != nullptr)
+                sycl::free(scratchpad, exec_q);
+            throw std::runtime_error("Device allocation for ipiv failed");
+        }
+    } catch (sycl::exception const &e) {
+        if (scratchpad != nullptr)
+            sycl::free(scratchpad, exec_q);
+        if (ipiv != nullptr)
+            sycl::free(ipiv, exec_q);
+        throw std::runtime_error(
+            std::string(
+                "Unexpected SYCL exception caught during ipiv allocation: ") +
+            e.what());
+    }
 
     std::stringstream error_msg;
     bool is_exception_caught = false;

From f5ee368b6da526dfa46a00bd1befa727ec9351d2 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 29 Jul 2024 18:22:07 +0200
Subject: [PATCH 34/37] Update alloc_scratchpad/alloc_ipiv

---
 dpnp/backend/extensions/lapack/common_helpers.hpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp
index d3616346056e..d102cf0df23f 100644
--- a/dpnp/backend/extensions/lapack/common_helpers.hpp
+++ b/dpnp/backend/extensions/lapack/common_helpers.hpp
@@ -81,10 +81,11 @@ inline std::int64_t *alloc_ipiv(const std::int64_t n,
     // linear streams with proper alignment
     size_t alloc_ipiv_size = round_up_mult(n_linear_streams * n, padding);
 
+    std::int64_t *ipiv = nullptr;
+
     // Allocate memory for the total pivot indices array
     try {
-        std::int64_t *ipiv =
-            sycl::malloc_device<std::int64_t>(alloc_ipiv_size, exec_q);
+        ipiv = sycl::malloc_device<std::int64_t>(alloc_ipiv_size, exec_q);
         if (!ipiv)
             throw std::runtime_error("Device allocation for ipiv failed");
     } catch (sycl::exception const &e) {
@@ -120,9 +121,11 @@ inline T *alloc_scratchpad(std::int64_t scratchpad_size,
     const size_t alloc_scratch_size =
         round_up_mult(n_linear_streams * scratchpad_size, padding);
 
+    T *scratchpad = nullptr;
+
     // Allocate memory for the total scratchpad
     try {
-        T *scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
+        scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
         if (!scratchpad)
             throw std::runtime_error("Device allocation for scratchpad failed");
     } catch (sycl::exception const &e) {

From eb8c3a080b58d197d3a961041c03db2d14e46601 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 30 Jul 2024 16:09:04 +0200
Subject: [PATCH 35/37] gesv_scratchpad_size can be 0

---
 dpnp/backend/extensions/lapack/gesv.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index e912645e0387..6004540ad328 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -64,22 +64,19 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
     const std::int64_t lda = std::max<size_t>(1UL, n);
     const std::int64_t ldb = std::max<size_t>(1UL, n);
 
+    // gesv scratchpad_size can be 0 on CPU
     const std::int64_t scratchpad_size =
         mkl_lapack::gesv_scratchpad_size<T>(exec_q, n, nrhs, lda, ldb);
 
-    if (scratchpad_size <= 0) {
-        throw std::runtime_error(
-            "Invalid scratchpad size: must be greater than zero."
-            "Calculated scratchpad size: " +
-            std::to_string(scratchpad_size));
-    }
-
     T *scratchpad = nullptr;
     // Allocate memory for the scratchpad
     try {
-        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
-        if (!scratchpad)
-            throw std::runtime_error("Device allocation for scratchpad failed");
+        if (scratchpad_size > 0) {
+            scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+            if (!scratchpad)
+                throw std::runtime_error(
+                    "Device allocation for scratchpad failed");
+        }
     } catch (sycl::exception const &e) {
         if (scratchpad != nullptr)
             sycl::free(scratchpad, exec_q);

From 3c8cda6809e87ca0fd2c1e3f254b9ac621edc64e Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 30 Jul 2024 18:12:31 +0200
Subject: [PATCH 36/37] Implement help functions alloc_ipiv/alloc_scratchpad

---
 .../extensions/lapack/common_helpers.hpp      | 76 +++++++++++++++----
 dpnp/backend/extensions/lapack/gesv.cpp       | 35 +--------
 dpnp/backend/extensions/lapack/gesv_batch.cpp | 13 +++-
 dpnp/backend/extensions/lapack/heevd.cpp      | 13 +---
 .../backend/extensions/lapack/heevd_batch.cpp |  4 +-
 dpnp/backend/extensions/lapack/syevd.cpp      | 13 +---
 .../backend/extensions/lapack/syevd_batch.cpp |  4 +-
 7 files changed, 80 insertions(+), 78 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp
index d102cf0df23f..44fa3d86b95a 100644
--- a/dpnp/backend/extensions/lapack/common_helpers.hpp
+++ b/dpnp/backend/extensions/lapack/common_helpers.hpp
@@ -66,12 +66,34 @@ inline bool check_zeros_shape(int ndim, const py::ssize_t *shape)
     return src_nelems == 0;
 }
 
+// Allocate the memory for the pivot indices
+inline std::int64_t *alloc_ipiv(const std::int64_t n, sycl::queue &exec_q)
+{
+    std::int64_t *ipiv = nullptr;
+
+    try {
+        ipiv = sycl::malloc_device<std::int64_t>(n, exec_q);
+        if (!ipiv) {
+            throw std::runtime_error("Device allocation for ipiv failed");
+        }
+    } catch (sycl::exception const &e) {
+        if (ipiv != nullptr)
+            sycl::free(ipiv, exec_q);
+        throw std::runtime_error(
+            std::string(
+                "Unexpected SYCL exception caught during ipiv allocation: ") +
+            e.what());
+    }
+
+    return ipiv;
+}
+
 // Allocate the total memory for the total pivot indices with proper alignment
 // for batch implementations
 template <typename T>
-inline std::int64_t *alloc_ipiv(const std::int64_t n,
-                                std::int64_t n_linear_streams,
-                                sycl::queue &exec_q)
+inline std::int64_t *alloc_ipiv_batch(const std::int64_t n,
+                                      std::int64_t n_linear_streams,
+                                      sycl::queue &exec_q)
 {
     // Get padding size to ensure memory allocations are aligned to 256 bytes
     // for better performance
@@ -98,24 +120,43 @@ inline std::int64_t *alloc_ipiv(const std::int64_t n,
     return ipiv;
 }
 
+// Allocate the memory for the scratchpad
+template <typename T>
+inline T *alloc_scratchpad(std::int64_t scratchpad_size, sycl::queue &exec_q)
+{
+    T *scratchpad = nullptr;
+
+    try {
+        if (scratchpad_size > 0) {
+            scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+            if (!scratchpad) {
+                throw std::runtime_error(
+                    "Device allocation for scratchpad failed");
+            }
+        }
+    } catch (sycl::exception const &e) {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+        throw std::runtime_error(std::string("Unexpected SYCL exception caught "
+                                             "during scratchpad allocation: ") +
+                                 e.what());
+    }
+
+    return scratchpad;
+}
+
 // Allocate the total scratchpad memory with proper alignment for batch
 // implementations
 template <typename T>
-inline T *alloc_scratchpad(std::int64_t scratchpad_size,
-                           std::int64_t n_linear_streams,
-                           sycl::queue &exec_q)
+inline T *alloc_scratchpad_batch(std::int64_t scratchpad_size,
+                                 std::int64_t n_linear_streams,
+                                 sycl::queue &exec_q)
 {
     // Get padding size to ensure memory allocations are aligned to 256 bytes
     // for better performance
     const std::int64_t padding = 256 / sizeof(T);
 
-    if (scratchpad_size <= 0) {
-        throw std::runtime_error(
-            "Invalid scratchpad size: must be greater than zero."
-            " Calculated scratchpad size: " +
-            std::to_string(scratchpad_size));
-    }
-
     // Calculate the total scratchpad memory size needed for all linear
     // streams with proper alignment
     const size_t alloc_scratch_size =
@@ -125,9 +166,12 @@ inline T *alloc_scratchpad(std::int64_t scratchpad_size,
 
     // Allocate memory for the total scratchpad
     try {
-        scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
-        if (!scratchpad)
-            throw std::runtime_error("Device allocation for scratchpad failed");
+        if (alloc_scratch_size > 0) {
+            scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
+            if (!scratchpad)
+                throw std::runtime_error(
+                    "Device allocation for scratchpad failed");
+        }
     } catch (sycl::exception const &e) {
         throw std::runtime_error(std::string("Unexpected SYCL exception caught "
                                              "during scratchpad allocation: ") +
diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 6004540ad328..0cc6eb1f008e 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -64,45 +64,18 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
     const std::int64_t lda = std::max<size_t>(1UL, n);
     const std::int64_t ldb = std::max<size_t>(1UL, n);
 
-    // gesv scratchpad_size can be 0 on CPU
     const std::int64_t scratchpad_size =
         mkl_lapack::gesv_scratchpad_size<T>(exec_q, n, nrhs, lda, ldb);
 
-    T *scratchpad = nullptr;
-    // Allocate memory for the scratchpad
-    try {
-        if (scratchpad_size > 0) {
-            scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
-            if (!scratchpad)
-                throw std::runtime_error(
-                    "Device allocation for scratchpad failed");
-        }
-    } catch (sycl::exception const &e) {
-        if (scratchpad != nullptr)
-            sycl::free(scratchpad, exec_q);
-        throw std::runtime_error(std::string("Unexpected SYCL exception caught "
-                                             "during scratchpad allocation: ") +
-                                 e.what());
-    }
+    T *scratchpad = helper::alloc_scratchpad<T>(scratchpad_size, exec_q);
 
     std::int64_t *ipiv = nullptr;
-    // Allocate memory for the ipiv
     try {
-        ipiv = sycl::malloc_device<std::int64_t>(n, exec_q);
-        if (!ipiv) {
-            if (scratchpad != nullptr)
-                sycl::free(scratchpad, exec_q);
-            throw std::runtime_error("Device allocation for ipiv failed");
-        }
-    } catch (sycl::exception const &e) {
+        ipiv = helper::alloc_ipiv(n, exec_q);
+    } catch (const std::exception &e) {
         if (scratchpad != nullptr)
             sycl::free(scratchpad, exec_q);
-        if (ipiv != nullptr)
-            sycl::free(ipiv, exec_q);
-        throw std::runtime_error(
-            std::string(
-                "Unexpected SYCL exception caught during ipiv allocation: ") +
-            e.what());
+        throw;
     }
 
     std::stringstream error_msg;
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index 62c115259d18..90aaf8ebcfd5 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -78,10 +78,17 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
     const std::int64_t scratchpad_size =
         mkl_lapack::gesv_scratchpad_size<T>(exec_q, n, nrhs, lda, ldb);
 
-    T *scratchpad =
-        helper::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
+    T *scratchpad = helper::alloc_scratchpad_batch<T>(scratchpad_size,
+                                                      n_linear_streams, exec_q);
 
-    std::int64_t *ipiv = helper::alloc_ipiv<T>(n, n_linear_streams, exec_q);
+    std::int64_t *ipiv = nullptr;
+    try {
+        ipiv = helper::alloc_ipiv_batch<T>(n, n_linear_streams, exec_q);
+    } catch (const std::exception &e) {
+        if (scratchpad != nullptr)
+            sycl::free(scratchpad, exec_q);
+        throw;
+    }
 
     // Computation events to manage dependencies for each linear stream
     std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends);
diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp
index a623a0dd182c..234dbef18eda 100644
--- a/dpnp/backend/extensions/lapack/heevd.cpp
+++ b/dpnp/backend/extensions/lapack/heevd.cpp
@@ -55,18 +55,7 @@ static sycl::event heevd_impl(sycl::queue &exec_q,
     const std::int64_t scratchpad_size =
         mkl_lapack::heevd_scratchpad_size<T>(exec_q, jobz, upper_lower, n, lda);
 
-    if (scratchpad_size <= 0) {
-        throw std::runtime_error(
-            "Invalid scratchpad size: must be greater than zero."
-            "Calculated scratchpad size: " +
-            std::to_string(scratchpad_size));
-    }
-
-    T *scratchpad = nullptr;
-    // Allocate memory for the scratchpad
-    scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
-    if (!scratchpad)
-        throw std::runtime_error("Device allocation for scratchpad failed");
+    T *scratchpad = helper::alloc_scratchpad<T>(scratchpad_size, exec_q);
 
     std::stringstream error_msg;
     std::int64_t info = 0;
diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp
index 2ea37e22d7d9..05d968701dd8 100644
--- a/dpnp/backend/extensions/lapack/heevd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp
@@ -65,8 +65,8 @@ static sycl::event heevd_batch_impl(sycl::queue &exec_q,
     const std::int64_t scratchpad_size =
         mkl_lapack::heevd_scratchpad_size<T>(exec_q, jobz, upper_lower, n, lda);
 
-    T *scratchpad =
-        helper::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
+    T *scratchpad = helper::alloc_scratchpad_batch<T>(scratchpad_size,
+                                                      n_linear_streams, exec_q);
 
     // Computation events to manage dependencies for each linear stream
     std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends);
diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp
index bbc975b43d0f..30f7ea1d13c6 100644
--- a/dpnp/backend/extensions/lapack/syevd.cpp
+++ b/dpnp/backend/extensions/lapack/syevd.cpp
@@ -55,18 +55,7 @@ static sycl::event syevd_impl(sycl::queue &exec_q,
     const std::int64_t scratchpad_size =
         mkl_lapack::syevd_scratchpad_size<T>(exec_q, jobz, upper_lower, n, lda);
 
-    if (scratchpad_size <= 0) {
-        throw std::runtime_error(
-            "Invalid scratchpad size: must be greater than zero."
-            "Calculated scratchpad size: " +
-            std::to_string(scratchpad_size));
-    }
-
-    T *scratchpad = nullptr;
-    // Allocate memory for the scratchpad
-    scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
-    if (!scratchpad)
-        throw std::runtime_error("Device allocation for scratchpad failed");
+    T *scratchpad = helper::alloc_scratchpad<T>(scratchpad_size, exec_q);
 
     std::stringstream error_msg;
     std::int64_t info = 0;
diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp
index aa5656e0ac3c..78bf07264f55 100644
--- a/dpnp/backend/extensions/lapack/syevd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp
@@ -65,8 +65,8 @@ static sycl::event syevd_batch_impl(sycl::queue &exec_q,
     const std::int64_t scratchpad_size =
         mkl_lapack::syevd_scratchpad_size<T>(exec_q, jobz, upper_lower, n, lda);
 
-    T *scratchpad =
-        helper::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q);
+    T *scratchpad = helper::alloc_scratchpad_batch<T>(scratchpad_size,
+                                                      n_linear_streams, exec_q);
 
     // Computation events to manage dependencies for each linear stream
     std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends);

From 629b97ab4a92b14ed46fe6e6bde62073e8137f99 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 2 Aug 2024 11:11:26 +0200
Subject: [PATCH 37/37] Reuse alloc_scratchpad/ipiv in batch versions

---
 .../extensions/lapack/common_helpers.hpp      | 36 ++-----------------
 1 file changed, 3 insertions(+), 33 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp
index 44fa3d86b95a..88e791256dce 100644
--- a/dpnp/backend/extensions/lapack/common_helpers.hpp
+++ b/dpnp/backend/extensions/lapack/common_helpers.hpp
@@ -24,8 +24,8 @@
 //*****************************************************************************
 
 #pragma once
-#include <oneapi/mkl.hpp>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 
 #include <complex>
 #include <cstring>
@@ -103,21 +103,7 @@ inline std::int64_t *alloc_ipiv_batch(const std::int64_t n,
     // linear streams with proper alignment
     size_t alloc_ipiv_size = round_up_mult(n_linear_streams * n, padding);
 
-    std::int64_t *ipiv = nullptr;
-
-    // Allocate memory for the total pivot indices array
-    try {
-        ipiv = sycl::malloc_device<std::int64_t>(alloc_ipiv_size, exec_q);
-        if (!ipiv)
-            throw std::runtime_error("Device allocation for ipiv failed");
-    } catch (sycl::exception const &e) {
-        throw std::runtime_error(
-            std::string(
-                "Unexpected SYCL exception caught during ipiv allocation: ") +
-            e.what());
-    }
-
-    return ipiv;
+    return alloc_ipiv(alloc_ipiv_size, exec_q);
 }
 
 // Allocate the memory for the scratchpad
@@ -162,22 +148,6 @@ inline T *alloc_scratchpad_batch(std::int64_t scratchpad_size,
     const size_t alloc_scratch_size =
         round_up_mult(n_linear_streams * scratchpad_size, padding);
 
-    T *scratchpad = nullptr;
-
-    // Allocate memory for the total scratchpad
-    try {
-        if (alloc_scratch_size > 0) {
-            scratchpad = sycl::malloc_device<T>(alloc_scratch_size, exec_q);
-            if (!scratchpad)
-                throw std::runtime_error(
-                    "Device allocation for scratchpad failed");
-        }
-    } catch (sycl::exception const &e) {
-        throw std::runtime_error(std::string("Unexpected SYCL exception caught "
-                                             "during scratchpad allocation: ") +
-                                 e.what());
-    }
-
-    return scratchpad;
+    return alloc_scratchpad<T>(alloc_scratch_size, exec_q);
 }
 } // namespace dpnp::extensions::lapack::helper