From cf274913dd998704cdccd09f1bf37a2cacf27ca2 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin <> Date: Tue, 17 Feb 2026 01:18:51 -0800 Subject: [PATCH 1/2] initial --- plugin/sycl/common/optional_weight.cc | 21 ++++++++++++++++----- plugin/sycl/common/stats.cc | 26 ++++++++++++++++++++++++++ plugin/sycl/context_helper.cc | 5 ++++- plugin/sycl/device_manager.cc | 11 +++++++++-- src/common/linalg_op.h | 10 +++++++--- src/common/stats.cc | 2 ++ src/common/stats.h | 13 ++++++++++++- src/objective/multiclass_obj.cu | 7 +++++-- 8 files changed, 81 insertions(+), 14 deletions(-) create mode 100644 plugin/sycl/common/stats.cc diff --git a/plugin/sycl/common/optional_weight.cc b/plugin/sycl/common/optional_weight.cc index aa984a152dc3..819f274cec6d 100644 --- a/plugin/sycl/common/optional_weight.cc +++ b/plugin/sycl/common/optional_weight.cc @@ -8,14 +8,13 @@ #include "../device_manager.h" namespace xgboost::common::sycl_impl { -double SumOptionalWeights(Context const* ctx, OptionalWeights const& weights) { - sycl::DeviceManager device_manager; - auto* qu = device_manager.GetQueue(ctx->Device()); +template +T ElementWiseSum(::sycl::queue* qu, OptionalWeights const& weights) { const auto* data = weights.Data(); - double result = 0; + T result = 0; { - ::sycl::buffer buff(&result, 1); + ::sycl::buffer buff(&result, 1); qu->submit([&](::sycl::handler& cgh) { auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>()); cgh.parallel_for<>(::sycl::range<1>(weights.Size()), reduction, @@ -28,4 +27,16 @@ double SumOptionalWeights(Context const* ctx, OptionalWeights const& weights) { return result; } + +double SumOptionalWeights(Context const* ctx, OptionalWeights const& weights) { + sycl::DeviceManager device_manager; + auto* qu = device_manager.GetQueue(ctx->Device()); + + bool has_fp64_support = qu->get_device().has(::sycl::aspect::fp64); + if (has_fp64_support) { + return ElementWiseSum(qu, weights); + } else { + return ElementWiseSum(qu, weights); + } +} } // namespace xgboost::common::sycl_impl diff --git a/plugin/sycl/common/stats.cc b/plugin/sycl/common/stats.cc new file mode 100644 index 000000000000..fb66f3c81229 --- /dev/null +++ b/plugin/sycl/common/stats.cc @@ -0,0 +1,26 @@ +/*! + * Copyright by Contributors 2017-2025 + */ +#include + +#include "../../../src/common/stats.h" + +#include "../device_manager.h" + +namespace xgboost::common::sycl_impl { + void Mean(Context const* ctx, linalg::VectorView v, linalg::VectorView out) { + sycl::DeviceManager device_manager; + auto* qu = device_manager.GetQueue(ctx->Device()); + + qu->submit([&](::sycl::handler& cgh) { + auto reduction = + ::sycl::reduction(&(out(0)), 0.0f, ::sycl::plus(), + ::sycl::property::reduction::initialize_to_identity()); + cgh.parallel_for<>(::sycl::range<1>(v.Size()), reduction, + [=](::sycl::id<1> pid, auto& sum) { + size_t i = pid[0]; + sum += v(i); + }); + }).wait_and_throw(); + } +} \ No newline at end of file diff --git a/plugin/sycl/context_helper.cc b/plugin/sycl/context_helper.cc index d5ced146187c..a064ce56103b 100644 --- a/plugin/sycl/context_helper.cc +++ b/plugin/sycl/context_helper.cc @@ -14,7 +14,10 @@ namespace sycl { DeviceOrd DeviceFP64(const DeviceOrd& device) { DeviceManager device_manager; - bool support_fp64 = device_manager.GetQueue(device)->get_device().has(::sycl::aspect::fp64); + bool support_fp64 = true; + if (device.IsSycl()) { + support_fp64 = device_manager.GetQueue(device)->get_device().has(::sycl::aspect::fp64); + } if (support_fp64) { return device; } else { diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc index dc3939934e31..ee652065db23 100644 --- a/plugin/sycl/device_manager.cc +++ b/plugin/sycl/device_manager.cc @@ -21,18 +21,25 @@ ::sycl::queue* DeviceManager::GetQueue(const DeviceOrd& device_spec) const { (collective::IsDistributed()); DeviceRegister& device_register = GetDevicesRegister(); if (not_use_default_selector) { - const int device_idx = - collective::IsDistributed() ? collective::GetRank() : device_spec.ordinal; if (device_spec.IsSyclDefault()) { auto& devices = device_register.devices; + const int device_idx = collective::IsDistributed() + ? collective::GetRank() % devices.size() + : device_spec.ordinal; CHECK_LT(device_idx, devices.size()); queue_idx = device_idx; } else if (device_spec.IsSyclCPU()) { auto& cpu_devices_idxes = device_register.cpu_devices_idxes; + const int device_idx = collective::IsDistributed() + ? collective::GetRank() % cpu_devices_idxes.size() + : device_spec.ordinal; CHECK_LT(device_idx, cpu_devices_idxes.size()); queue_idx = cpu_devices_idxes[device_idx]; } else if (device_spec.IsSyclGPU()) { auto& gpu_devices_idxes = device_register.gpu_devices_idxes; + const int device_idx = collective::IsDistributed() + ? collective::GetRank() % gpu_devices_idxes.size() + : device_spec.ordinal; CHECK_LT(device_idx, gpu_devices_idxes.size()); queue_idx = gpu_devices_idxes[device_idx]; } else { diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h index c6d37eef4c82..c236ab7d54d2 100644 --- a/src/common/linalg_op.h +++ b/src/common/linalg_op.h @@ -159,9 +159,13 @@ void ElementWiseKernel(Context const* ctx, TensorView t, Fn&& fn) { #elif defined(SYCL_LANGUAGE_VERSION) template void ElementWiseKernel(Context const* ctx, TensorView t, Fn&& fn) { - ctx->DispatchDevice([&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); }, - [&] { LOG(FATAL) << "Invalid TU"; }, - [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward(fn)); }); + if (t.Device().IsCPU()) { + cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); + } else { + ctx->DispatchDevice([&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); }, + [&] { LOG(FATAL) << "Invalid TU"; }, + [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward(fn)); }); + } } #else template diff --git a/src/common/stats.cc b/src/common/stats.cc index b1c65aaf1a50..2c2f8522dba9 100644 --- a/src/common/stats.cc +++ b/src/common/stats.cc @@ -51,6 +51,8 @@ void Mean(Context const* ctx, linalg::VectorView v, linalg::Vector< if (ctx->IsCUDA()) { cuda_impl::Mean(ctx, v, out->View(ctx->Device())); + } else if (ctx->IsSycl()) { + sycl_impl::Mean(ctx, v, out->View(ctx->Device())); } else { auto h_v = v; float n = v.Size(); diff --git a/src/common/stats.h b/src/common/stats.h index dffae5b068a2..3add130ffffa 100644 --- a/src/common/stats.h +++ b/src/common/stats.h @@ -14,7 +14,7 @@ #include "xgboost/linalg.h" // TensorView,VectorView #include "xgboost/logging.h" // CHECK_GE -#if !defined(XGBOOST_USE_CUDA) +#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_SYCL) #include "common.h" // AssertGPUSupport #endif @@ -140,6 +140,17 @@ inline void WeightedSampleMean(Context const*, bool, linalg::MatrixView v, linalg::VectorView out); + +#if !defined(XGBOOST_USE_SYCL) +inline void Mean(Context const*, linalg::VectorView, linalg::VectorView) { + common::AssertGPUSupport(); +} + +#endif // !defined(XGBOOST_USE_SYCL) +} // namespace sycl_impl + /** * @brief Calculate medians for each column of the input matrix. */ diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index 46bfbff686fc..9546567ae04d 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -106,10 +106,13 @@ class SoftmaxMultiClassObj : public ObjFunction { << "Number of weights should be equal to number of data points."; } info.weights_.SetDevice(device); - auto weights = common::MakeOptionalWeights(this->ctx_->Device(), info.weights_); + auto weights = common::MakeOptionalWeights(device, info.weights_); preds.SetDevice(device); - auto predt = linalg::MakeTensorView(this->ctx_, &preds, n_samples, n_classes); + Context cpu_context = Context(); + auto predt = linalg::MakeTensorView( + device == ctx_->Device() ? this->ctx_ : &cpu_context, + &preds, n_samples, n_classes); CHECK_EQ(labels.Shape(1), 1); auto y1d = labels.Slice(linalg::All(), 0); CHECK_EQ(y1d.Shape(0), info.num_row_); From d87acdf10044982e4f5fde4733336a566504d3a7 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin <> Date: Tue, 17 Feb 2026 01:38:39 -0800 Subject: [PATCH 2/2] linting --- plugin/sycl/common/optional_weight.cc | 18 ++-- plugin/sycl/common/stats.cc | 32 +++---- plugin/sycl/context_helper.cc | 4 +- plugin/sycl/device_manager.cc | 128 +++++++++++++------------- src/common/linalg_op.h | 7 +- src/objective/multiclass_obj.cu | 5 +- 6 files changed, 95 insertions(+), 99 deletions(-) diff --git a/plugin/sycl/common/optional_weight.cc b/plugin/sycl/common/optional_weight.cc index 819f274cec6d..7abeaaa6c765 100644 --- a/plugin/sycl/common/optional_weight.cc +++ b/plugin/sycl/common/optional_weight.cc @@ -1,10 +1,10 @@ /*! * Copyright by Contributors 2017-2025 */ -#include - #include "../../../src/common/optional_weight.h" +#include + #include "../device_manager.h" namespace xgboost::common::sycl_impl { @@ -16,13 +16,13 @@ T ElementWiseSum(::sycl::queue* qu, OptionalWeights const& weights) { { ::sycl::buffer buff(&result, 1); qu->submit([&](::sycl::handler& cgh) { - auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>()); - cgh.parallel_for<>(::sycl::range<1>(weights.Size()), reduction, - [=](::sycl::id<1> pid, auto& sum) { - size_t i = pid[0]; - sum += data[i]; - }); - }).wait_and_throw(); + auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>()); + cgh.parallel_for<>(::sycl::range<1>(weights.Size()), reduction, + [=](::sycl::id<1> pid, auto& sum) { + size_t i = pid[0]; + sum += data[i]; + }); + }).wait_and_throw(); } return result; diff --git a/plugin/sycl/common/stats.cc b/plugin/sycl/common/stats.cc index fb66f3c81229..d01b8c33295f 100644 --- a/plugin/sycl/common/stats.cc +++ b/plugin/sycl/common/stats.cc @@ -1,26 +1,24 @@ /*! * Copyright by Contributors 2017-2025 */ -#include - #include "../../../src/common/stats.h" +#include + #include "../device_manager.h" namespace xgboost::common::sycl_impl { - void Mean(Context const* ctx, linalg::VectorView v, linalg::VectorView out) { - sycl::DeviceManager device_manager; - auto* qu = device_manager.GetQueue(ctx->Device()); +void Mean(Context const* ctx, linalg::VectorView v, linalg::VectorView out) { + sycl::DeviceManager device_manager; + auto* qu = device_manager.GetQueue(ctx->Device()); - qu->submit([&](::sycl::handler& cgh) { - auto reduction = - ::sycl::reduction(&(out(0)), 0.0f, ::sycl::plus(), - ::sycl::property::reduction::initialize_to_identity()); - cgh.parallel_for<>(::sycl::range<1>(v.Size()), reduction, - [=](::sycl::id<1> pid, auto& sum) { - size_t i = pid[0]; - sum += v(i); - }); - }).wait_and_throw(); - } -} \ No newline at end of file + qu->submit([&](::sycl::handler& cgh) { + auto reduction = ::sycl::reduction(&(out(0)), 0.0f, ::sycl::plus(), + ::sycl::property::reduction::initialize_to_identity()); + cgh.parallel_for<>(::sycl::range<1>(v.Size()), reduction, [=](::sycl::id<1> pid, auto& sum) { + size_t i = pid[0]; + sum += v(i); + }); + }).wait_and_throw(); +} +} // namespace xgboost::common::sycl_impl diff --git a/plugin/sycl/context_helper.cc b/plugin/sycl/context_helper.cc index a064ce56103b..7006d8f6001e 100644 --- a/plugin/sycl/context_helper.cc +++ b/plugin/sycl/context_helper.cc @@ -3,11 +3,11 @@ * \file context_helper.cc */ -#include +#include "context_helper.h" +#include #include "device_manager.h" -#include "context_helper.h" namespace xgboost { namespace sycl { diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc index ee652065db23..ffae9b449358 100644 --- a/plugin/sycl/device_manager.cc +++ b/plugin/sycl/device_manager.cc @@ -10,80 +10,78 @@ namespace xgboost { namespace sycl { ::sycl::queue* DeviceManager::GetQueue(const DeviceOrd& device_spec) const { - if (!device_spec.IsSycl()) { - LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " - << device_spec.Name() << ". " - << "Default sycl device_selector will be used."; - } + if (!device_spec.IsSycl()) { + LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " << device_spec.Name() << ". " + << "Default sycl device_selector will be used."; + } - size_t queue_idx; - bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || - (collective::IsDistributed()); - DeviceRegister& device_register = GetDevicesRegister(); - if (not_use_default_selector) { - if (device_spec.IsSyclDefault()) { - auto& devices = device_register.devices; - const int device_idx = collective::IsDistributed() - ? collective::GetRank() % devices.size() - : device_spec.ordinal; - CHECK_LT(device_idx, devices.size()); - queue_idx = device_idx; - } else if (device_spec.IsSyclCPU()) { - auto& cpu_devices_idxes = device_register.cpu_devices_idxes; - const int device_idx = collective::IsDistributed() - ? collective::GetRank() % cpu_devices_idxes.size() - : device_spec.ordinal; - CHECK_LT(device_idx, cpu_devices_idxes.size()); - queue_idx = cpu_devices_idxes[device_idx]; - } else if (device_spec.IsSyclGPU()) { - auto& gpu_devices_idxes = device_register.gpu_devices_idxes; - const int device_idx = collective::IsDistributed() - ? collective::GetRank() % gpu_devices_idxes.size() - : device_spec.ordinal; - CHECK_LT(device_idx, gpu_devices_idxes.size()); - queue_idx = gpu_devices_idxes[device_idx]; - } else { - LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu"; - auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); - queue_idx = device_register.devices.at(device); - } + size_t queue_idx; + bool not_use_default_selector = + (device_spec.ordinal != kDefaultOrdinal) || (collective::IsDistributed()); + DeviceRegister& device_register = GetDevicesRegister(); + if (not_use_default_selector) { + if (device_spec.IsSyclDefault()) { + auto& devices = device_register.devices; + const int device_idx = collective::IsDistributed() ? collective::GetRank() % devices.size() + : device_spec.ordinal; + CHECK_LT(device_idx, devices.size()); + queue_idx = device_idx; + } else if (device_spec.IsSyclCPU()) { + auto& cpu_devices_idxes = device_register.cpu_devices_idxes; + const int device_idx = collective::IsDistributed() + ? collective::GetRank() % cpu_devices_idxes.size() + : device_spec.ordinal; + CHECK_LT(device_idx, cpu_devices_idxes.size()); + queue_idx = cpu_devices_idxes[device_idx]; + } else if (device_spec.IsSyclGPU()) { + auto& gpu_devices_idxes = device_register.gpu_devices_idxes; + const int device_idx = collective::IsDistributed() + ? collective::GetRank() % gpu_devices_idxes.size() + : device_spec.ordinal; + CHECK_LT(device_idx, gpu_devices_idxes.size()); + queue_idx = gpu_devices_idxes[device_idx]; + } else { + LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu"; + auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); + queue_idx = device_register.devices.at(device); + } + } else { + if (device_spec.IsSyclCPU()) { + auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device(); + queue_idx = device_register.devices.at(device); + } else if (device_spec.IsSyclGPU()) { + auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } else { - if (device_spec.IsSyclCPU()) { - auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device(); - queue_idx = device_register.devices.at(device); - } else if (device_spec.IsSyclGPU()) { - auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device(); - queue_idx = device_register.devices.at(device); - } else { - auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); - queue_idx = device_register.devices.at(device); - } + auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } - return &(device_register.queues[queue_idx]); + } + return &(device_register.queues[queue_idx]); } DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const { - static DeviceRegister device_register; + static DeviceRegister device_register; - if (device_register.devices.size() == 0) { - std::lock_guard guard(device_registering_mutex); - std::vector<::sycl::device> devices = ::sycl::device::get_devices(); - for (size_t i = 0; i < devices.size(); i++) { - LOG(INFO) << "device_index = " << i << ", name = " - << devices[i].get_info<::sycl::info::device::name>(); - } + if (device_register.devices.size() == 0) { + std::lock_guard guard(device_registering_mutex); + std::vector<::sycl::device> devices = ::sycl::device::get_devices(); + for (size_t i = 0; i < devices.size(); i++) { + LOG(INFO) << "device_index = " << i + << ", name = " << devices[i].get_info<::sycl::info::device::name>(); + } - for (size_t i = 0; i < devices.size(); i++) { - device_register.devices[devices[i]] = i; - device_register.queues.push_back(::sycl::queue(devices[i])); - if (devices[i].is_cpu()) { - device_register.cpu_devices_idxes.push_back(i); - } else if (devices[i].is_gpu()) { - device_register.gpu_devices_idxes.push_back(i); - } - } + for (size_t i = 0; i < devices.size(); i++) { + device_register.devices[devices[i]] = i; + device_register.queues.push_back(::sycl::queue(devices[i])); + if (devices[i].is_cpu()) { + device_register.cpu_devices_idxes.push_back(i); + } else if (devices[i].is_gpu()) { + device_register.gpu_devices_idxes.push_back(i); + } } - return device_register; + } + return device_register; } } // namespace sycl diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h index c236ab7d54d2..967faaff7f9e 100644 --- a/src/common/linalg_op.h +++ b/src/common/linalg_op.h @@ -162,9 +162,10 @@ void ElementWiseKernel(Context const* ctx, TensorView t, Fn&& fn) { if (t.Device().IsCPU()) { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); } else { - ctx->DispatchDevice([&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); }, - [&] { LOG(FATAL) << "Invalid TU"; }, - [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward(fn)); }); + ctx->DispatchDevice( + [&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); }, + [&] { LOG(FATAL) << "Invalid TU"; }, + [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward(fn)); }); } } #else diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index 9546567ae04d..de96e4b8c018 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -110,9 +110,8 @@ class SoftmaxMultiClassObj : public ObjFunction { preds.SetDevice(device); Context cpu_context = Context(); - auto predt = linalg::MakeTensorView( - device == ctx_->Device() ? this->ctx_ : &cpu_context, - &preds, n_samples, n_classes); + auto predt = linalg::MakeTensorView(device == ctx_->Device() ? this->ctx_ : &cpu_context, + &preds, n_samples, n_classes); CHECK_EQ(labels.Shape(1), 1); auto y1d = labels.Slice(linalg::All(), 0); CHECK_EQ(y1d.Shape(0), info.num_row_);