From cf274913dd998704cdccd09f1bf37a2cacf27ca2 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Tue, 17 Feb 2026 01:18:51 -0800
Subject: [PATCH 1/2] initial

---
 plugin/sycl/common/optional_weight.cc | 21 ++++++++++++++++-----
 plugin/sycl/common/stats.cc           | 26 ++++++++++++++++++++++++++
 plugin/sycl/context_helper.cc         |  5 ++++-
 plugin/sycl/device_manager.cc         | 11 +++++++++--
 src/common/linalg_op.h                | 10 +++++++---
 src/common/stats.cc                   |  2 ++
 src/common/stats.h                    | 13 ++++++++++++-
 src/objective/multiclass_obj.cu       |  7 +++++--
 8 files changed, 81 insertions(+), 14 deletions(-)
 create mode 100644 plugin/sycl/common/stats.cc
diff --git a/plugin/sycl/common/optional_weight.cc b/plugin/sycl/common/optional_weight.cc
index aa984a152dc3..819f274cec6d 100644
--- a/plugin/sycl/common/optional_weight.cc
+++ b/plugin/sycl/common/optional_weight.cc
@@ -8,14 +8,13 @@
 #include "../device_manager.h"
 
 namespace xgboost::common::sycl_impl {
-double SumOptionalWeights(Context const* ctx, OptionalWeights const& weights) {
-  sycl::DeviceManager device_manager;
-  auto* qu = device_manager.GetQueue(ctx->Device());
 
+template <typename T>
+T ElementWiseSum(::sycl::queue* qu, OptionalWeights const& weights) {
   const auto* data = weights.Data();
-  double result = 0;
+  T result = 0;
   {
-    ::sycl::buffer<double> buff(&result, 1);
+    ::sycl::buffer<T> buff(&result, 1);
     qu->submit([&](::sycl::handler& cgh) {
       auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>());
       cgh.parallel_for<>(::sycl::range<1>(weights.Size()), reduction,
@@ -28,4 +27,16 @@ double SumOptionalWeights(Context const* ctx, OptionalWeights const& weights) {
 
   return result;
 }
+
+double SumOptionalWeights(Context const* ctx, OptionalWeights const& weights) {
+  sycl::DeviceManager device_manager;
+  auto* qu = device_manager.GetQueue(ctx->Device());
+
+  bool has_fp64_support = qu->get_device().has(::sycl::aspect::fp64);
+  if (has_fp64_support) {
+    return ElementWiseSum<double>(qu, weights);
+  } else {
+    return ElementWiseSum<float>(qu, weights);
+  }
+}
 }  // namespace xgboost::common::sycl_impl
diff --git a/plugin/sycl/common/stats.cc b/plugin/sycl/common/stats.cc
new file mode 100644
index 000000000000..fb66f3c81229
--- /dev/null
+++ b/plugin/sycl/common/stats.cc
@@ -0,0 +1,26 @@
+/*!
+ * Copyright by Contributors 2017-2025
+ */
+#include <sycl/sycl.hpp>
+
+#include "../../../src/common/stats.h"
+
+#include "../device_manager.h"
+
+namespace xgboost::common::sycl_impl {
+    void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out) {
+        sycl::DeviceManager device_manager;
+        auto* qu = device_manager.GetQueue(ctx->Device());
+
+        qu->submit([&](::sycl::handler& cgh) {
+            auto reduction =
+                ::sycl::reduction(&(out(0)), 0.0f, ::sycl::plus<float>(),
+                                  ::sycl::property::reduction::initialize_to_identity());
+            cgh.parallel_for<>(::sycl::range<1>(v.Size()), reduction,
+                               [=](::sycl::id<1> pid, auto& sum) {
+                size_t i = pid[0];
+                sum += v(i);
+            });
+        }).wait_and_throw();
+    }
+}
\ No newline at end of file
diff --git a/plugin/sycl/context_helper.cc b/plugin/sycl/context_helper.cc
index d5ced146187c..a064ce56103b 100644
--- a/plugin/sycl/context_helper.cc
+++ b/plugin/sycl/context_helper.cc
@@ -14,7 +14,10 @@ namespace sycl {
 
 DeviceOrd DeviceFP64(const DeviceOrd& device) {
   DeviceManager device_manager;
-  bool support_fp64 = device_manager.GetQueue(device)->get_device().has(::sycl::aspect::fp64);
+  bool support_fp64 = true;
+  if (device.IsSycl()) {
+    support_fp64 = device_manager.GetQueue(device)->get_device().has(::sycl::aspect::fp64);
+  }
   if (support_fp64) {
     return device;
   } else {
diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc
index dc3939934e31..ee652065db23 100644
--- a/plugin/sycl/device_manager.cc
+++ b/plugin/sycl/device_manager.cc
@@ -21,18 +21,25 @@ ::sycl::queue* DeviceManager::GetQueue(const DeviceOrd& device_spec) const {
                                     (collective::IsDistributed());
     DeviceRegister& device_register = GetDevicesRegister();
     if (not_use_default_selector) {
-        const int device_idx =
-            collective::IsDistributed() ? collective::GetRank() : device_spec.ordinal;
         if (device_spec.IsSyclDefault()) {
             auto& devices = device_register.devices;
+            const int device_idx = collective::IsDistributed()
+                                   ? collective::GetRank() % devices.size()
+                                   : device_spec.ordinal;
             CHECK_LT(device_idx, devices.size());
             queue_idx = device_idx;
         } else if (device_spec.IsSyclCPU()) {
             auto& cpu_devices_idxes = device_register.cpu_devices_idxes;
+            const int device_idx = collective::IsDistributed()
+                                   ? collective::GetRank() % cpu_devices_idxes.size()
+                                   : device_spec.ordinal;
             CHECK_LT(device_idx, cpu_devices_idxes.size());
             queue_idx = cpu_devices_idxes[device_idx];
         } else if (device_spec.IsSyclGPU()) {
             auto& gpu_devices_idxes = device_register.gpu_devices_idxes;
+            const int device_idx = collective::IsDistributed()
+                                   ? collective::GetRank() % gpu_devices_idxes.size()
+                                   : device_spec.ordinal;
             CHECK_LT(device_idx, gpu_devices_idxes.size());
             queue_idx = gpu_devices_idxes[device_idx];
         } else {
diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h
index c6d37eef4c82..c236ab7d54d2 100644
--- a/src/common/linalg_op.h
+++ b/src/common/linalg_op.h
@@ -159,9 +159,13 @@ void ElementWiseKernel(Context const* ctx, TensorView<T, D> t, Fn&& fn) {
 #elif defined(SYCL_LANGUAGE_VERSION)
 template <typename T, std::int32_t D, typename Fn, auto _tag = detail::SysTag()>
 void ElementWiseKernel(Context const* ctx, TensorView<T, D> t, Fn&& fn) {
-  ctx->DispatchDevice([&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward<Fn>(fn)); },
-                      [&] { LOG(FATAL) << "Invalid TU"; },
-                      [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward<Fn>(fn)); });
+  if (t.Device().IsCPU()) {
+    cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward<Fn>(fn));
+  } else {
+    ctx->DispatchDevice([&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward<Fn>(fn)); },
+                        [&] { LOG(FATAL) << "Invalid TU"; },
+                        [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward<Fn>(fn)); });
+  }
 }
 #else
 template <typename T, std::int32_t D, typename Fn, auto _tag = detail::SysTag()>
diff --git a/src/common/stats.cc b/src/common/stats.cc
index b1c65aaf1a50..2c2f8522dba9 100644
--- a/src/common/stats.cc
+++ b/src/common/stats.cc
@@ -51,6 +51,8 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::Vector<
 
   if (ctx->IsCUDA()) {
     cuda_impl::Mean(ctx, v, out->View(ctx->Device()));
+  } else if (ctx->IsSycl()) {
+    sycl_impl::Mean(ctx, v, out->View(ctx->Device()));
   } else {
     auto h_v = v;
     float n = v.Size();
diff --git a/src/common/stats.h b/src/common/stats.h
index dffae5b068a2..3add130ffffa 100644
--- a/src/common/stats.h
+++ b/src/common/stats.h
@@ -14,7 +14,7 @@
 #include "xgboost/linalg.h"   // TensorView,VectorView
 #include "xgboost/logging.h"  // CHECK_GE
 
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_SYCL)
 #include "common.h"  // AssertGPUSupport
 #endif
 
@@ -140,6 +140,17 @@ inline void WeightedSampleMean(Context const*, bool, linalg::MatrixView<float co
 #endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace cuda_impl
 
+namespace sycl_impl {
+void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out);
+
+#if !defined(XGBOOST_USE_SYCL)
+inline void Mean(Context const*, linalg::VectorView<float const>, linalg::VectorView<float>) {
+  common::AssertGPUSupport();
+}
+
+#endif  // !defined(XGBOOST_USE_SYCL)
+}  // namespace sycl_impl
+
 /**
  * @brief Calculate medians for each column of the input matrix.
  */
diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu
index 46bfbff686fc..9546567ae04d 100644
--- a/src/objective/multiclass_obj.cu
+++ b/src/objective/multiclass_obj.cu
@@ -106,10 +106,13 @@ class SoftmaxMultiClassObj : public ObjFunction {
           << "Number of weights should be equal to number of data points.";
     }
     info.weights_.SetDevice(device);
-    auto weights = common::MakeOptionalWeights(this->ctx_->Device(), info.weights_);
+    auto weights = common::MakeOptionalWeights(device, info.weights_);
 
     preds.SetDevice(device);
-    auto predt = linalg::MakeTensorView(this->ctx_, &preds, n_samples, n_classes);
+    Context cpu_context = Context();
+    auto predt = linalg::MakeTensorView(
+      device == ctx_->Device() ? this->ctx_ : &cpu_context,
+      &preds, n_samples, n_classes);
     CHECK_EQ(labels.Shape(1), 1);
     auto y1d = labels.Slice(linalg::All(), 0);
     CHECK_EQ(y1d.Shape(0), info.num_row_);

From d87acdf10044982e4f5fde4733336a566504d3a7 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Tue, 17 Feb 2026 01:38:39 -0800
Subject: [PATCH 2/2] linting

---
 plugin/sycl/common/optional_weight.cc |  18 ++--
 plugin/sycl/common/stats.cc           |  32 +++----
 plugin/sycl/context_helper.cc         |   4 +-
 plugin/sycl/device_manager.cc         | 128 +++++++++++++-------------
 src/common/linalg_op.h                |   7 +-
 src/objective/multiclass_obj.cu       |   5 +-
 6 files changed, 95 insertions(+), 99 deletions(-)

diff --git a/plugin/sycl/common/optional_weight.cc b/plugin/sycl/common/optional_weight.cc
index 819f274cec6d..7abeaaa6c765 100644
--- a/plugin/sycl/common/optional_weight.cc
+++ b/plugin/sycl/common/optional_weight.cc
@@ -1,10 +1,10 @@
 /*!
  * Copyright by Contributors 2017-2025
  */
-#include <sycl/sycl.hpp>
-
 #include "../../../src/common/optional_weight.h"
 
+#include <sycl/sycl.hpp>
+
 #include "../device_manager.h"
 
 namespace xgboost::common::sycl_impl {
@@ -16,13 +16,13 @@ T ElementWiseSum(::sycl::queue* qu, OptionalWeights const& weights) {
   {
     ::sycl::buffer<T> buff(&result, 1);
     qu->submit([&](::sycl::handler& cgh) {
-      auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>());
-      cgh.parallel_for<>(::sycl::range<1>(weights.Size()), reduction,
-                        [=](::sycl::id<1> pid, auto& sum) {
-        size_t i = pid[0];
-        sum += data[i];
-      });
-    }).wait_and_throw();
+        auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>());
+        cgh.parallel_for<>(::sycl::range<1>(weights.Size()), reduction,
+                           [=](::sycl::id<1> pid, auto& sum) {
+                             size_t i = pid[0];
+                             sum += data[i];
+                           });
+      }).wait_and_throw();
   }
 
   return result;
diff --git a/plugin/sycl/common/stats.cc b/plugin/sycl/common/stats.cc
index fb66f3c81229..d01b8c33295f 100644
--- a/plugin/sycl/common/stats.cc
+++ b/plugin/sycl/common/stats.cc
@@ -1,26 +1,24 @@
 /*!
  * Copyright by Contributors 2017-2025
  */
-#include <sycl/sycl.hpp>
-
 #include "../../../src/common/stats.h"
 
+#include <sycl/sycl.hpp>
+
 #include "../device_manager.h"
 
 namespace xgboost::common::sycl_impl {
-    void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out) {
-        sycl::DeviceManager device_manager;
-        auto* qu = device_manager.GetQueue(ctx->Device());
+void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorView<float> out) {
+  sycl::DeviceManager device_manager;
+  auto* qu = device_manager.GetQueue(ctx->Device());
 
-        qu->submit([&](::sycl::handler& cgh) {
-            auto reduction =
-                ::sycl::reduction(&(out(0)), 0.0f, ::sycl::plus<float>(),
-                                  ::sycl::property::reduction::initialize_to_identity());
-            cgh.parallel_for<>(::sycl::range<1>(v.Size()), reduction,
-                               [=](::sycl::id<1> pid, auto& sum) {
-                size_t i = pid[0];
-                sum += v(i);
-            });
-        }).wait_and_throw();
-    }
-}
\ No newline at end of file
+  qu->submit([&](::sycl::handler& cgh) {
+      auto reduction = ::sycl::reduction(&(out(0)), 0.0f, ::sycl::plus<float>(),
+                                         ::sycl::property::reduction::initialize_to_identity());
+      cgh.parallel_for<>(::sycl::range<1>(v.Size()), reduction, [=](::sycl::id<1> pid, auto& sum) {
+        size_t i = pid[0];
+        sum += v(i);
+      });
+    }).wait_and_throw();
+}
+}  // namespace xgboost::common::sycl_impl
diff --git a/plugin/sycl/context_helper.cc b/plugin/sycl/context_helper.cc
index a064ce56103b..7006d8f6001e 100644
--- a/plugin/sycl/context_helper.cc
+++ b/plugin/sycl/context_helper.cc
@@ -3,11 +3,11 @@
  * \file context_helper.cc
  */
 
-#include <sycl/sycl.hpp>
+#include "context_helper.h"
 
+#include <sycl/sycl.hpp>
 
 #include "device_manager.h"
-#include "context_helper.h"
 
 namespace xgboost {
 namespace sycl {
diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc
index ee652065db23..ffae9b449358 100644
--- a/plugin/sycl/device_manager.cc
+++ b/plugin/sycl/device_manager.cc
@@ -10,80 +10,78 @@ namespace xgboost {
 namespace sycl {
 
 ::sycl::queue* DeviceManager::GetQueue(const DeviceOrd& device_spec) const {
-    if (!device_spec.IsSycl()) {
-        LOG(WARNING) << "Sycl kernel is executed with non-sycl context: "
-                     << device_spec.Name() << ". "
-                     << "Default sycl device_selector will be used.";
-    }
+  if (!device_spec.IsSycl()) {
+    LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " << device_spec.Name() << ". "
+                 << "Default sycl device_selector will be used.";
+  }
 
-    size_t queue_idx;
-    bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) ||
-                                    (collective::IsDistributed());
-    DeviceRegister& device_register = GetDevicesRegister();
-    if (not_use_default_selector) {
-        if (device_spec.IsSyclDefault()) {
-            auto& devices = device_register.devices;
-            const int device_idx = collective::IsDistributed()
-                                   ? collective::GetRank() % devices.size()
-                                   : device_spec.ordinal;
-            CHECK_LT(device_idx, devices.size());
-            queue_idx = device_idx;
-        } else if (device_spec.IsSyclCPU()) {
-            auto& cpu_devices_idxes = device_register.cpu_devices_idxes;
-            const int device_idx = collective::IsDistributed()
-                                   ? collective::GetRank() % cpu_devices_idxes.size()
-                                   : device_spec.ordinal;
-            CHECK_LT(device_idx, cpu_devices_idxes.size());
-            queue_idx = cpu_devices_idxes[device_idx];
-        } else if (device_spec.IsSyclGPU()) {
-            auto& gpu_devices_idxes = device_register.gpu_devices_idxes;
-            const int device_idx = collective::IsDistributed()
-                                   ? collective::GetRank() % gpu_devices_idxes.size()
-                                   : device_spec.ordinal;
-            CHECK_LT(device_idx, gpu_devices_idxes.size());
-            queue_idx = gpu_devices_idxes[device_idx];
-        } else {
-            LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu";
-            auto device = ::sycl::queue(::sycl::default_selector_v).get_device();
-            queue_idx = device_register.devices.at(device);
-        }
+  size_t queue_idx;
+  bool not_use_default_selector =
+      (device_spec.ordinal != kDefaultOrdinal) || (collective::IsDistributed());
+  DeviceRegister& device_register = GetDevicesRegister();
+  if (not_use_default_selector) {
+    if (device_spec.IsSyclDefault()) {
+      auto& devices = device_register.devices;
+      const int device_idx = collective::IsDistributed() ? collective::GetRank() % devices.size()
+                                                         : device_spec.ordinal;
+      CHECK_LT(device_idx, devices.size());
+      queue_idx = device_idx;
+    } else if (device_spec.IsSyclCPU()) {
+      auto& cpu_devices_idxes = device_register.cpu_devices_idxes;
+      const int device_idx = collective::IsDistributed()
+                                 ? collective::GetRank() % cpu_devices_idxes.size()
+                                 : device_spec.ordinal;
+      CHECK_LT(device_idx, cpu_devices_idxes.size());
+      queue_idx = cpu_devices_idxes[device_idx];
+    } else if (device_spec.IsSyclGPU()) {
+      auto& gpu_devices_idxes = device_register.gpu_devices_idxes;
+      const int device_idx = collective::IsDistributed()
+                                 ? collective::GetRank() % gpu_devices_idxes.size()
+                                 : device_spec.ordinal;
+      CHECK_LT(device_idx, gpu_devices_idxes.size());
+      queue_idx = gpu_devices_idxes[device_idx];
+    } else {
+      LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu";
+      auto device = ::sycl::queue(::sycl::default_selector_v).get_device();
+      queue_idx = device_register.devices.at(device);
+    }
+  } else {
+    if (device_spec.IsSyclCPU()) {
+      auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device();
+      queue_idx = device_register.devices.at(device);
+    } else if (device_spec.IsSyclGPU()) {
+      auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device();
+      queue_idx = device_register.devices.at(device);
     } else {
-        if (device_spec.IsSyclCPU()) {
-            auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device();
-            queue_idx = device_register.devices.at(device);
-        } else if (device_spec.IsSyclGPU()) {
-            auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device();
-            queue_idx = device_register.devices.at(device);
-        } else {
-            auto device = ::sycl::queue(::sycl::default_selector_v).get_device();
-            queue_idx = device_register.devices.at(device);
-        }
+      auto device = ::sycl::queue(::sycl::default_selector_v).get_device();
+      queue_idx = device_register.devices.at(device);
     }
-    return &(device_register.queues[queue_idx]);
+  }
+  return &(device_register.queues[queue_idx]);
 }
 
 DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const {
-    static DeviceRegister device_register;
+  static DeviceRegister device_register;
 
-    if (device_register.devices.size() == 0) {
-        std::lock_guard<std::mutex> guard(device_registering_mutex);
-        std::vector<::sycl::device> devices = ::sycl::device::get_devices();
-        for (size_t i = 0; i < devices.size(); i++) {
-            LOG(INFO) << "device_index = " << i << ", name = "
-                      << devices[i].get_info<::sycl::info::device::name>();
-        }
+  if (device_register.devices.size() == 0) {
+    std::lock_guard<std::mutex> guard(device_registering_mutex);
+    std::vector<::sycl::device> devices = ::sycl::device::get_devices();
+    for (size_t i = 0; i < devices.size(); i++) {
+      LOG(INFO) << "device_index = " << i
+                << ", name = " << devices[i].get_info<::sycl::info::device::name>();
+    }
 
-        for (size_t i = 0; i < devices.size(); i++) {
-            device_register.devices[devices[i]] = i;
-            device_register.queues.push_back(::sycl::queue(devices[i]));
-            if (devices[i].is_cpu()) {
-                device_register.cpu_devices_idxes.push_back(i);
-            } else if (devices[i].is_gpu()) {
-                device_register.gpu_devices_idxes.push_back(i);
-            }
-        }
+    for (size_t i = 0; i < devices.size(); i++) {
+      device_register.devices[devices[i]] = i;
+      device_register.queues.push_back(::sycl::queue(devices[i]));
+      if (devices[i].is_cpu()) {
+        device_register.cpu_devices_idxes.push_back(i);
+      } else if (devices[i].is_gpu()) {
+        device_register.gpu_devices_idxes.push_back(i);
+      }
     }
-    return device_register;
+  }
+  return device_register;
 }
 
 }  // namespace sycl
diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h
index c236ab7d54d2..967faaff7f9e 100644
--- a/src/common/linalg_op.h
+++ b/src/common/linalg_op.h
@@ -162,9 +162,10 @@ void ElementWiseKernel(Context const* ctx, TensorView<T, D> t, Fn&& fn) {
   if (t.Device().IsCPU()) {
     cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward<Fn>(fn));
   } else {
-    ctx->DispatchDevice([&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward<Fn>(fn)); },
-                        [&] { LOG(FATAL) << "Invalid TU"; },
-                        [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward<Fn>(fn)); });
+    ctx->DispatchDevice(
+        [&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward<Fn>(fn)); },
+        [&] { LOG(FATAL) << "Invalid TU"; },
+        [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward<Fn>(fn)); });
   }
 }
 #else
diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu
index 9546567ae04d..de96e4b8c018 100644
--- a/src/objective/multiclass_obj.cu
+++ b/src/objective/multiclass_obj.cu
@@ -110,9 +110,8 @@ class SoftmaxMultiClassObj : public ObjFunction {
 
     preds.SetDevice(device);
     Context cpu_context = Context();
-    auto predt = linalg::MakeTensorView(
-      device == ctx_->Device() ? this->ctx_ : &cpu_context,
-      &preds, n_samples, n_classes);
+    auto predt = linalg::MakeTensorView(device == ctx_->Device() ? this->ctx_ : &cpu_context,
+                                        &preds, n_samples, n_classes);
     CHECK_EQ(labels.Shape(1), 1);
     auto y1d = labels.Slice(linalg::All(), 0);
     CHECK_EQ(y1d.Shape(0), info.num_row_);