diff --git a/plugin/sycl/common/optional_weight.cc b/plugin/sycl/common/optional_weight.cc index aa984a152dc3..7abeaaa6c765 100644 --- a/plugin/sycl/common/optional_weight.cc +++ b/plugin/sycl/common/optional_weight.cc @@ -1,31 +1,42 @@ /*! * Copyright by Contributors 2017-2025 */ -#include - #include "../../../src/common/optional_weight.h" +#include + #include "../device_manager.h" namespace xgboost::common::sycl_impl { -double SumOptionalWeights(Context const* ctx, OptionalWeights const& weights) { - sycl::DeviceManager device_manager; - auto* qu = device_manager.GetQueue(ctx->Device()); +template +T ElementWiseSum(::sycl::queue* qu, OptionalWeights const& weights) { const auto* data = weights.Data(); - double result = 0; + T result = 0; { - ::sycl::buffer buff(&result, 1); + ::sycl::buffer buff(&result, 1); qu->submit([&](::sycl::handler& cgh) { - auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>()); - cgh.parallel_for<>(::sycl::range<1>(weights.Size()), reduction, - [=](::sycl::id<1> pid, auto& sum) { - size_t i = pid[0]; - sum += data[i]; - }); - }).wait_and_throw(); + auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>()); + cgh.parallel_for<>(::sycl::range<1>(weights.Size()), reduction, + [=](::sycl::id<1> pid, auto& sum) { + size_t i = pid[0]; + sum += data[i]; + }); + }).wait_and_throw(); } return result; } + +double SumOptionalWeights(Context const* ctx, OptionalWeights const& weights) { + sycl::DeviceManager device_manager; + auto* qu = device_manager.GetQueue(ctx->Device()); + + bool has_fp64_support = qu->get_device().has(::sycl::aspect::fp64); + if (has_fp64_support) { + return ElementWiseSum(qu, weights); + } else { + return ElementWiseSum(qu, weights); + } +} } // namespace xgboost::common::sycl_impl diff --git a/plugin/sycl/common/stats.cc b/plugin/sycl/common/stats.cc new file mode 100644 index 000000000000..d01b8c33295f --- /dev/null +++ b/plugin/sycl/common/stats.cc @@ -0,0 +1,24 @@ +/*! + * Copyright by Contributors 2017-2025 + */ +#include "../../../src/common/stats.h" + +#include + +#include "../device_manager.h" + +namespace xgboost::common::sycl_impl { +void Mean(Context const* ctx, linalg::VectorView v, linalg::VectorView out) { + sycl::DeviceManager device_manager; + auto* qu = device_manager.GetQueue(ctx->Device()); + + qu->submit([&](::sycl::handler& cgh) { + auto reduction = ::sycl::reduction(&(out(0)), 0.0f, ::sycl::plus(), + ::sycl::property::reduction::initialize_to_identity()); + cgh.parallel_for<>(::sycl::range<1>(v.Size()), reduction, [=](::sycl::id<1> pid, auto& sum) { + size_t i = pid[0]; + sum += v(i); + }); + }).wait_and_throw(); +} +} // namespace xgboost::common::sycl_impl diff --git a/plugin/sycl/context_helper.cc b/plugin/sycl/context_helper.cc index d5ced146187c..7006d8f6001e 100644 --- a/plugin/sycl/context_helper.cc +++ b/plugin/sycl/context_helper.cc @@ -3,18 +3,21 @@ * \file context_helper.cc */ -#include +#include "context_helper.h" +#include #include "device_manager.h" -#include "context_helper.h" namespace xgboost { namespace sycl { DeviceOrd DeviceFP64(const DeviceOrd& device) { DeviceManager device_manager; - bool support_fp64 = device_manager.GetQueue(device)->get_device().has(::sycl::aspect::fp64); + bool support_fp64 = true; + if (device.IsSycl()) { + support_fp64 = device_manager.GetQueue(device)->get_device().has(::sycl::aspect::fp64); + } if (support_fp64) { return device; } else { diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc index dc3939934e31..ffae9b449358 100644 --- a/plugin/sycl/device_manager.cc +++ b/plugin/sycl/device_manager.cc @@ -10,73 +10,78 @@ namespace xgboost { namespace sycl { ::sycl::queue* DeviceManager::GetQueue(const DeviceOrd& device_spec) const { - if (!device_spec.IsSycl()) { - LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " - << device_spec.Name() << ". " - << "Default sycl device_selector will be used."; - } + if (!device_spec.IsSycl()) { + LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " << device_spec.Name() << ". " + << "Default sycl device_selector will be used."; + } - size_t queue_idx; - bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || - (collective::IsDistributed()); - DeviceRegister& device_register = GetDevicesRegister(); - if (not_use_default_selector) { - const int device_idx = - collective::IsDistributed() ? collective::GetRank() : device_spec.ordinal; - if (device_spec.IsSyclDefault()) { - auto& devices = device_register.devices; - CHECK_LT(device_idx, devices.size()); - queue_idx = device_idx; - } else if (device_spec.IsSyclCPU()) { - auto& cpu_devices_idxes = device_register.cpu_devices_idxes; - CHECK_LT(device_idx, cpu_devices_idxes.size()); - queue_idx = cpu_devices_idxes[device_idx]; - } else if (device_spec.IsSyclGPU()) { - auto& gpu_devices_idxes = device_register.gpu_devices_idxes; - CHECK_LT(device_idx, gpu_devices_idxes.size()); - queue_idx = gpu_devices_idxes[device_idx]; - } else { - LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu"; - auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); - queue_idx = device_register.devices.at(device); - } + size_t queue_idx; + bool not_use_default_selector = + (device_spec.ordinal != kDefaultOrdinal) || (collective::IsDistributed()); + DeviceRegister& device_register = GetDevicesRegister(); + if (not_use_default_selector) { + if (device_spec.IsSyclDefault()) { + auto& devices = device_register.devices; + const int device_idx = collective::IsDistributed() ? collective::GetRank() % devices.size() + : device_spec.ordinal; + CHECK_LT(device_idx, devices.size()); + queue_idx = device_idx; + } else if (device_spec.IsSyclCPU()) { + auto& cpu_devices_idxes = device_register.cpu_devices_idxes; + const int device_idx = collective::IsDistributed() + ? collective::GetRank() % cpu_devices_idxes.size() + : device_spec.ordinal; + CHECK_LT(device_idx, cpu_devices_idxes.size()); + queue_idx = cpu_devices_idxes[device_idx]; + } else if (device_spec.IsSyclGPU()) { + auto& gpu_devices_idxes = device_register.gpu_devices_idxes; + const int device_idx = collective::IsDistributed() + ? collective::GetRank() % gpu_devices_idxes.size() + : device_spec.ordinal; + CHECK_LT(device_idx, gpu_devices_idxes.size()); + queue_idx = gpu_devices_idxes[device_idx]; + } else { + LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu"; + auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); + queue_idx = device_register.devices.at(device); + } + } else { + if (device_spec.IsSyclCPU()) { + auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device(); + queue_idx = device_register.devices.at(device); + } else if (device_spec.IsSyclGPU()) { + auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } else { - if (device_spec.IsSyclCPU()) { - auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device(); - queue_idx = device_register.devices.at(device); - } else if (device_spec.IsSyclGPU()) { - auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device(); - queue_idx = device_register.devices.at(device); - } else { - auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); - queue_idx = device_register.devices.at(device); - } + auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } - return &(device_register.queues[queue_idx]); + } + return &(device_register.queues[queue_idx]); } DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const { - static DeviceRegister device_register; + static DeviceRegister device_register; - if (device_register.devices.size() == 0) { - std::lock_guard guard(device_registering_mutex); - std::vector<::sycl::device> devices = ::sycl::device::get_devices(); - for (size_t i = 0; i < devices.size(); i++) { - LOG(INFO) << "device_index = " << i << ", name = " - << devices[i].get_info<::sycl::info::device::name>(); - } + if (device_register.devices.size() == 0) { + std::lock_guard guard(device_registering_mutex); + std::vector<::sycl::device> devices = ::sycl::device::get_devices(); + for (size_t i = 0; i < devices.size(); i++) { + LOG(INFO) << "device_index = " << i + << ", name = " << devices[i].get_info<::sycl::info::device::name>(); + } - for (size_t i = 0; i < devices.size(); i++) { - device_register.devices[devices[i]] = i; - device_register.queues.push_back(::sycl::queue(devices[i])); - if (devices[i].is_cpu()) { - device_register.cpu_devices_idxes.push_back(i); - } else if (devices[i].is_gpu()) { - device_register.gpu_devices_idxes.push_back(i); - } - } + for (size_t i = 0; i < devices.size(); i++) { + device_register.devices[devices[i]] = i; + device_register.queues.push_back(::sycl::queue(devices[i])); + if (devices[i].is_cpu()) { + device_register.cpu_devices_idxes.push_back(i); + } else if (devices[i].is_gpu()) { + device_register.gpu_devices_idxes.push_back(i); + } } - return device_register; + } + return device_register; } } // namespace sycl diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h index c6d37eef4c82..967faaff7f9e 100644 --- a/src/common/linalg_op.h +++ b/src/common/linalg_op.h @@ -159,9 +159,14 @@ void ElementWiseKernel(Context const* ctx, TensorView t, Fn&& fn) { #elif defined(SYCL_LANGUAGE_VERSION) template void ElementWiseKernel(Context const* ctx, TensorView t, Fn&& fn) { - ctx->DispatchDevice([&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); }, - [&] { LOG(FATAL) << "Invalid TU"; }, - [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward(fn)); }); + if (t.Device().IsCPU()) { + cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); + } else { + ctx->DispatchDevice( + [&] { cpu_impl::ElementWiseKernel(t, ctx->Threads(), std::forward(fn)); }, + [&] { LOG(FATAL) << "Invalid TU"; }, + [&] { ::xgboost::sycl::linalg::ElementWiseKernel(t, std::forward(fn)); }); + } } #else template diff --git a/src/common/stats.cc b/src/common/stats.cc index b1c65aaf1a50..2c2f8522dba9 100644 --- a/src/common/stats.cc +++ b/src/common/stats.cc @@ -51,6 +51,8 @@ void Mean(Context const* ctx, linalg::VectorView v, linalg::Vector< if (ctx->IsCUDA()) { cuda_impl::Mean(ctx, v, out->View(ctx->Device())); + } else if (ctx->IsSycl()) { + sycl_impl::Mean(ctx, v, out->View(ctx->Device())); } else { auto h_v = v; float n = v.Size(); diff --git a/src/common/stats.h b/src/common/stats.h index dffae5b068a2..3add130ffffa 100644 --- a/src/common/stats.h +++ b/src/common/stats.h @@ -14,7 +14,7 @@ #include "xgboost/linalg.h" // TensorView,VectorView #include "xgboost/logging.h" // CHECK_GE -#if !defined(XGBOOST_USE_CUDA) +#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_SYCL) #include "common.h" // AssertGPUSupport #endif @@ -140,6 +140,17 @@ inline void WeightedSampleMean(Context const*, bool, linalg::MatrixView v, linalg::VectorView out); + +#if !defined(XGBOOST_USE_SYCL) +inline void Mean(Context const*, linalg::VectorView, linalg::VectorView) { + common::AssertGPUSupport(); +} + +#endif // !defined(XGBOOST_USE_SYCL) +} // namespace sycl_impl + /** * @brief Calculate medians for each column of the input matrix. */ diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index 46bfbff686fc..de96e4b8c018 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -106,10 +106,12 @@ class SoftmaxMultiClassObj : public ObjFunction { << "Number of weights should be equal to number of data points."; } info.weights_.SetDevice(device); - auto weights = common::MakeOptionalWeights(this->ctx_->Device(), info.weights_); + auto weights = common::MakeOptionalWeights(device, info.weights_); preds.SetDevice(device); - auto predt = linalg::MakeTensorView(this->ctx_, &preds, n_samples, n_classes); + Context cpu_context = Context(); + auto predt = linalg::MakeTensorView(device == ctx_->Device() ? this->ctx_ : &cpu_context, + &preds, n_samples, n_classes); CHECK_EQ(labels.Shape(1), 1); auto y1d = labels.Slice(linalg::All(), 0); CHECK_EQ(y1d.Shape(0), info.num_row_);