Ginkgo 1.9.0 (generated from a branch based on main)
A numerical linear algebra library targeting many-core architectures
 
executor.hpp
1// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
2//
3// SPDX-License-Identifier: BSD-3-Clause
4
5#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
7
8
9#include <array>
10#include <atomic>
11#include <iostream>
12#include <memory>
13#include <mutex>
14#include <sstream>
15#include <string>
16#include <tuple>
17#include <type_traits>
18#include <vector>
19
20#include <ginkgo/core/base/device.hpp>
21#include <ginkgo/core/base/fwd_decls.hpp>
22#include <ginkgo/core/base/machine_topology.hpp>
23#include <ginkgo/core/base/memory.hpp>
24#include <ginkgo/core/base/scoped_device_id_guard.hpp>
25#include <ginkgo/core/base/types.hpp>
26#include <ginkgo/core/log/logger.hpp>
27#include <ginkgo/core/synthesizer/containers.hpp>
28
29
30namespace gko {
31
32
34enum class log_propagation_mode {
40 never,
46 automatic,
47};
48
49
62enum class allocation_mode { device, unified_global, unified_host };
63
64
65#ifdef NDEBUG
66
67// When in release, prefer device allocations
68constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
69
70constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
71
72#else
73
74// When in debug, always use UM allocations.
75constexpr allocation_mode default_cuda_alloc_mode =
76 allocation_mode::unified_global;
77
78#if (GINKGO_HIP_PLATFORM_HCC == 1)
79
80// HIP on AMD GPUs does not support UM, so always prefer device allocations.
81constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
82
83#else
84
85// HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
86constexpr allocation_mode default_hip_alloc_mode =
87 allocation_mode::unified_global;
88
89#endif
90
91#endif
92
93
94} // namespace gko
95
96
101enum class dpcpp_queue_property {
105 in_order = 1,
106
110 enable_profiling = 2
111};
112
113GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114 dpcpp_queue_property b)
115{
116 return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
117 static_cast<int>(b));
118}
119
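// A minimal usage sketch: the enumerators are distinct bits, so the operator
// above composes them. Assumes a SYCL-enabled build; `props` is an
// illustrative name, not part of this header:
//
//     auto props = dpcpp_queue_property::in_order |
//                  dpcpp_queue_property::enable_profiling;
//     // props can then be passed to gko::DpcppExecutor::create(), see below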
120
121namespace gko {
122
123
124#define GKO_FORWARD_DECLARE(_type, ...) class _type
125
126GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
127
128#undef GKO_FORWARD_DECLARE
129
130
132
133
134namespace detail {
135
136
137template <typename>
138class ExecutorBase;
139
140
141} // namespace detail
142
143
258class Operation {
259public:
260#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
261 virtual void run(std::shared_ptr<const _type>) const
262
263 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
264
265#undef GKO_DECLARE_RUN_OVERLOAD
266
267 // ReferenceExecutor overload can be defaulted to OmpExecutor's
268 virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;
269
275 virtual const char* get_name() const noexcept;
276};
277
278
279namespace detail {
280
281
291template <typename Closure>
292class RegisteredOperation : public Operation {
293public:
300 RegisteredOperation(const char* name, Closure op)
301 : name_(name), op_(std::move(op))
302 {}
303
304 const char* get_name() const noexcept override { return name_; }
305
306 void run(std::shared_ptr<const ReferenceExecutor> exec) const override
307 {
308 op_(exec);
309 }
310
311 void run(std::shared_ptr<const OmpExecutor> exec) const override
312 {
313 op_(exec);
314 }
315
316 void run(std::shared_ptr<const CudaExecutor> exec) const override
317 {
318 op_(exec);
319 }
320
321 void run(std::shared_ptr<const HipExecutor> exec) const override
322 {
323 op_(exec);
324 }
325
326 void run(std::shared_ptr<const DpcppExecutor> exec) const override
327 {
328 op_(exec);
329 }
330
331private:
332 const char* name_;
333 Closure op_;
334};
335
336
337template <typename Closure>
338RegisteredOperation<Closure> make_register_operation(const char* name,
339 Closure op)
340{
341 return RegisteredOperation<Closure>{name, std::move(op)};
342}
343
344
345} // namespace detail
346
347
419#define GKO_REGISTER_OPERATION(_name, _kernel) \
420 template <typename... Args> \
421 auto make_##_name(Args&&... args) \
422 { \
423 return ::gko::detail::make_register_operation( \
424 #_kernel, [&args...](auto exec) { \
425 using exec_type = decltype(exec); \
426 if (std::is_same< \
427 exec_type, \
428 std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
429 value) { \
430 ::gko::kernels::reference::_kernel( \
431 std::dynamic_pointer_cast< \
432 const ::gko::ReferenceExecutor>(exec), \
433 std::forward<Args>(args)...); \
434 } else if (std::is_same< \
435 exec_type, \
436 std::shared_ptr<const ::gko::OmpExecutor>>:: \
437 value) { \
438 ::gko::kernels::omp::_kernel( \
439 std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
440 exec), \
441 std::forward<Args>(args)...); \
442 } else if (std::is_same< \
443 exec_type, \
444 std::shared_ptr<const ::gko::CudaExecutor>>:: \
445 value) { \
446 ::gko::kernels::cuda::_kernel( \
447 std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
448 exec), \
449 std::forward<Args>(args)...); \
450 } else if (std::is_same< \
451 exec_type, \
452 std::shared_ptr<const ::gko::HipExecutor>>:: \
453 value) { \
454 ::gko::kernels::hip::_kernel( \
455 std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
456 exec), \
457 std::forward<Args>(args)...); \
458 } else if (std::is_same< \
459 exec_type, \
460 std::shared_ptr<const ::gko::DpcppExecutor>>:: \
461 value) { \
462 ::gko::kernels::dpcpp::_kernel( \
463 std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
464 exec), \
465 std::forward<Args>(args)...); \
466 } else { \
467 GKO_NOT_IMPLEMENTED; \
468 } \
469 }); \
470 } \
471 static_assert(true, \
472 "This assert is used to counter the false positive extra " \
473 "semi-colon warnings")
474
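// A minimal sketch of how GKO_REGISTER_OPERATION is meant to be used, with
// hypothetical names (`my_kernel`, `my_op`); it is not part of this header.
// Every backend namespace referenced by the macro (reference, omp, cuda, hip,
// dpcpp) must provide the kernel, or the expansion fails to compile:
//
//     namespace gko {
//     namespace kernels {
//     namespace reference {
//     void my_kernel(std::shared_ptr<const ReferenceExecutor> exec, int x)
//     {
//         // sequential implementation
//     }
//     }  // namespace reference
//     namespace omp {
//     void my_kernel(std::shared_ptr<const OmpExecutor> exec, int x)
//     {
//         // OpenMP implementation
//     }
//     }  // namespace omp
//     // ... analogous definitions in cuda, hip and dpcpp ...
//     }  // namespace kernels
//     }  // namespace gko
//
//     GKO_REGISTER_OPERATION(my_op, my_kernel);
//
//     void call_my_kernel(std::shared_ptr<const gko::Executor> exec)
//     {
//         exec->run(make_my_op(5));  // dispatches to the matching backend
//     }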
475
513#define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
514 template <typename... Args> \
515 auto make_##_name(Args&&... args) \
516 { \
517 return ::gko::detail::make_register_operation( \
518 #_kernel, \
519 [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
520 } \
521 static_assert(true, \
522 "This assert is used to counter the false positive extra " \
523 "semi-colon warnings")
524
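// A corresponding sketch for GKO_REGISTER_HOST_OPERATION, again with
// hypothetical names: the generated closure ignores the executor and calls
// the kernel directly on the host.
//
//     void compute_stats(int n) { /* host-only work */ }
//
//     GKO_REGISTER_HOST_OPERATION(stats, compute_stats);
//
//     // exec->run(make_stats(100)) logs the operation under the name
//     // "compute_stats" and invokes compute_stats(100) on the host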
525
526#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
527
615class Executor : public log::EnableLogging<Executor> {
616 template <typename T>
617 friend class detail::ExecutorBase;
618
619 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
620 friend class ReferenceExecutor;
621
622public:
623 virtual ~Executor() = default;
624
625 Executor() = default;
626 Executor(Executor&) = delete;
627 Executor(Executor&&) = delete;
628 Executor& operator=(Executor&) = delete;
629 Executor& operator=(Executor&&) = delete;
630
636 virtual void run(const Operation& op) const = 0;
637
652 template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
653 typename ClosureDpcpp>
654 GKO_DEPRECATED(
655 "Please use the overload with std::string as first parameter.")
656 void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
657 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
658 {
659 LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
660 ClosureDpcpp>
661 op(op_omp, op_cuda, op_hip, op_dpcpp);
662 this->run(op);
663 }
664
681 template <typename ClosureReference, typename ClosureOmp,
682 typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
683 void run(std::string name, const ClosureReference& op_ref,
684 const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
685 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
686 {
687 LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
688 ClosureDpcpp>
689 op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
690 this->run(op);
691 }
692
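    // A minimal usage sketch for the overload above; the name is reported to
    // loggers and profilers, and only the closure matching the executor's
    // dynamic type is invoked:
    //
    //     exec->run(
    //         "scale_sketch",        // illustrative operation name
    //         [&] { /* reference */ },
    //         [&] { /* OpenMP */ },
    //         [&] { /* CUDA */ },
    //         [&] { /* HIP */ },
    //         [&] { /* DPC++ */ });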
704 template <typename T>
705 T* alloc(size_type num_elems) const
706 {
707 this->template log<log::Logger::allocation_started>(
708 this, num_elems * sizeof(T));
709 T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
710 this->template log<log::Logger::allocation_completed>(
711 this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
712 return allocated;
713 }
714
722 void free(void* ptr) const noexcept
723 {
724 this->template log<log::Logger::free_started>(
725 this, reinterpret_cast<uintptr>(ptr));
726 this->raw_free(ptr);
727 this->template log<log::Logger::free_completed>(
728 this, reinterpret_cast<uintptr>(ptr));
729 }
730
743 template <typename T>
744 void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
745 const T* src_ptr, T* dest_ptr) const
746 {
747 const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
748 const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
749 this->template log<log::Logger::copy_started>(
750 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
751 if (this != src_exec.get()) {
752 src_exec->template log<log::Logger::copy_started>(
753 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
754 }
755 try {
756 this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
757 dest_ptr);
758 } catch (NotSupported&) {
759#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
760 // Unoptimized copy. Try to go through the masters.
761 // output to log when verbose >= 1 and debug build
762 std::clog << "Not direct copy. Try to copy data from the masters."
763 << std::endl;
764#endif
765 auto src_master = src_exec->get_master().get();
766 if (num_elems > 0 && src_master != src_exec.get()) {
767 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
768 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
769 master_ptr);
770 this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
771 src_master->free(master_ptr);
772 }
773 }
774 this->template log<log::Logger::copy_completed>(
775 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
776 if (this != src_exec.get()) {
777 src_exec->template log<log::Logger::copy_completed>(
778 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
779 }
780 }
781
793 template <typename T>
794 void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
795 {
796 this->copy_from(this, num_elems, src_ptr, dest_ptr);
797 }
798
808 template <typename T>
809 T copy_val_to_host(const T* ptr) const
810 {
811 T out{};
812 this->get_master()->copy_from(this, 1, ptr, &out);
813 return out;
814 }
815
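    // A sketch tying alloc/free, copy_from and copy_val_to_host together,
    // assuming a CUDA-enabled build with device 0 present (std::fill_n
    // requires <algorithm>); buffer names are illustrative:
    //
    //     auto host = gko::OmpExecutor::create();
    //     auto dev = gko::CudaExecutor::create(0, host);
    //     const gko::size_type n = 16;
    //     auto* host_buf = host->alloc<double>(n);
    //     std::fill_n(host_buf, n, 3.0);
    //     auto* dev_buf = dev->alloc<double>(n);
    //     dev->copy_from(host, n, host_buf, dev_buf);     // host -> device
    //     double first = dev->copy_val_to_host(dev_buf);  // first == 3.0
    //     dev->free(dev_buf);
    //     host->free(host_buf);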
820 virtual std::shared_ptr<Executor> get_master() noexcept = 0;
821
825 virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
826
830 virtual void synchronize() const = 0;
831
838 void add_logger(std::shared_ptr<const log::Logger> logger) override
839 {
840 this->propagating_logger_refcount_.fetch_add(
841 logger->needs_propagation() ? 1 : 0);
842 this->EnableLogging<Executor>::add_logger(logger);
843 }
844
851 void remove_logger(const log::Logger* logger) override
852 {
853 this->propagating_logger_refcount_.fetch_sub(
854 logger->needs_propagation() ? 1 : 0);
855 this->EnableLogging<Executor>::remove_logger(logger);
856 }
857
858 using EnableLogging<Executor>::remove_logger;
859
867 void set_log_propagation_mode(log_propagation_mode mode)
868 {
869 log_propagation_mode_ = mode;
870 }
871
879 bool should_propagate_log() const
880 {
881 return this->propagating_logger_refcount_.load() > 0 &&
882 log_propagation_mode_ == log_propagation_mode::automatic;
883 }
884
892 bool memory_accessible(const std::shared_ptr<const Executor>& other) const
893 {
894 return this->verify_memory_from(other.get());
895 }
896
897 virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
898
900 virtual std::string get_description() const = 0;
901
902protected:
907 struct exec_info {
911 int device_id = -1;
912
916 std::string device_type;
917
921 int numa_node = -1;
922
931 int num_computing_units = -1;
932
944 int num_pu_per_cu = -1;
945
954 std::vector<int> subgroup_sizes{};
955
964 int max_subgroup_size = -1;
965
976 std::vector<int> max_workitem_sizes{};
977
987 int max_workgroup_size = -1;
988
992 int major = -1;
993
997 int minor = -1;
998
1004 std::string pci_bus_id = std::string(13, 'x');
1005
1016 std::vector<int> closest_pu_ids{};
1017 };
1018
1024 const exec_info& get_exec_info() const { return this->exec_info_; }
1025
1035 virtual void* raw_alloc(size_type size) const = 0;
1036
1044 virtual void raw_free(void* ptr) const noexcept = 0;
1045
1056 virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1057 const void* src_ptr, void* dest_ptr) const = 0;
1058
1068#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1069 virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1070 const void* src_ptr, void* dest_ptr) const = 0
1071
1072 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1073
1074#undef GKO_ENABLE_RAW_COPY_TO
1075
1083 virtual bool verify_memory_from(const Executor* src_exec) const = 0;
1084
1094#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1095 virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1096
1097 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1098
1099 GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1100
1101#undef GKO_ENABLE_VERIFY_MEMORY_TO
1102
1109 virtual void populate_exec_info(const machine_topology* mach_topo) = 0;
1110
1116 exec_info& get_exec_info() { return this->exec_info_; }
1117
1118 exec_info exec_info_;
1119
1120 log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};
1121
1122 std::atomic<int> propagating_logger_refcount_{};
1123
1124private:
1139 template <typename ClosureReference, typename ClosureOmp,
1140 typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
1141 class LambdaOperation : public Operation {
1142 public:
1143 LambdaOperation(std::string name, const ClosureReference& op_ref,
1144 const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1145 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1146 : name_(std::move(name)),
1147 op_ref_(op_ref),
1148 op_omp_(op_omp),
1149 op_cuda_(op_cuda),
1150 op_hip_(op_hip),
1151 op_dpcpp_(op_dpcpp)
1152 {}
1153
1164 LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1165 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1166 : LambdaOperation("unnamed", op_omp, op_omp, op_cuda, op_hip,
1167 op_dpcpp)
1168 {}
1169
1170 void run(std::shared_ptr<const OmpExecutor>) const override
1171 {
1172 op_omp_();
1173 }
1174
1175 void run(std::shared_ptr<const ReferenceExecutor>) const override
1176 {
1177 op_ref_();
1178 }
1179
1180 void run(std::shared_ptr<const CudaExecutor>) const override
1181 {
1182 op_cuda_();
1183 }
1184
1185 void run(std::shared_ptr<const HipExecutor>) const override
1186 {
1187 op_hip_();
1188 }
1189
1190 void run(std::shared_ptr<const DpcppExecutor>) const override
1191 {
1192 op_dpcpp_();
1193 }
1194
1195 const char* get_name() const noexcept override { return name_.c_str(); }
1196
1197 private:
1198 std::string name_;
1199 ClosureReference op_ref_;
1200 ClosureOmp op_omp_;
1201 ClosureCuda op_cuda_;
1202 ClosureHip op_hip_;
1203 ClosureDpcpp op_dpcpp_;
1204 };
1205};
1206
1207
1216template <typename T>
1217class executor_deleter {
1218public:
1219 using pointer = T*;
1220
1226 explicit executor_deleter(std::shared_ptr<const Executor> exec)
1227 : exec_{exec}
1228 {}
1229
1235 void operator()(pointer ptr) const
1236 {
1237 if (exec_) {
1238 exec_->free(ptr);
1239 }
1240 }
1241
1242private:
1243 std::shared_ptr<const Executor> exec_;
1244};
1245
1246// a specialization for arrays
1247template <typename T>
1248class executor_deleter<T[]> {
1249public:
1250 using pointer = T[];
1251
1252 explicit executor_deleter(std::shared_ptr<const Executor> exec)
1253 : exec_{exec}
1254 {}
1255
1256 void operator()(pointer ptr) const
1257 {
1258 if (exec_) {
1259 exec_->free(ptr);
1260 }
1261 }
1262
1263private:
1264 std::shared_ptr<const Executor> exec_;
1265};
1266
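// A small sketch of handing an executor allocation to a smart pointer: the
// deleter calls Executor::free() and keeps the executor alive through its
// stored shared_ptr (`data` is an illustrative name):
//
//     auto exec = gko::ReferenceExecutor::create();
//     std::shared_ptr<double> data{exec->alloc<double>(32),
//                                  gko::executor_deleter<double>{exec}};
//     // exec->free(...) runs automatically when the last reference drops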
1267
1268namespace detail {
1269
1270
1271template <typename ConcreteExecutor>
1272class ExecutorBase : public Executor {
1273 // friend class is not in the nearest enclosing namespace, so we write the
1274 // full name
1275 friend class ::gko::OmpExecutor;
1276 friend class ::gko::HipExecutor;
1277 friend class ::gko::DpcppExecutor;
1278 friend class ::gko::CudaExecutor;
1279 friend class ::gko::ReferenceExecutor;
1280
1281public:
1282 void run(const Operation& op) const override
1283 {
1284 this->template log<log::Logger::operation_launched>(this, &op);
1285 auto scope_guard = get_scoped_device_id_guard();
1286 op.run(self()->shared_from_this());
1287 this->template log<log::Logger::operation_completed>(this, &op);
1288 }
1289
1290protected:
1291 void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1292 const void* src_ptr, void* dest_ptr) const override
1293 {
1294 src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1295 }
1296
1297 virtual bool verify_memory_from(const Executor* src_exec) const override
1298 {
1299 return src_exec->verify_memory_to(self());
1300 }
1301
1302private:
1303 ConcreteExecutor* self() noexcept
1304 {
1305 return static_cast<ConcreteExecutor*>(this);
1306 }
1307
1308 const ConcreteExecutor* self() const noexcept
1309 {
1310 return static_cast<const ConcreteExecutor*>(this);
1311 }
1312};
1313
1314#undef GKO_DECLARE_EXECUTOR_FRIEND
1315
1316
1324class EnableDeviceReset {
1325public:
1331 GKO_DEPRECATED(
1332 "device_reset is no longer supported, call "
1333 "cudaDeviceReset/hipDeviceReset manually")
1334 void set_device_reset(bool device_reset) {}
1335
1341 GKO_DEPRECATED(
1342 "device_reset is no longer supported, call "
1343 "cudaDeviceReset/hipDeviceReset manually")
1344 bool get_device_reset() { return false; }
1345
1346protected:
1352 EnableDeviceReset() {}
1353
1354 GKO_DEPRECATED(
1355 "device_reset is no longer supported, call "
1356 "cudaDeviceReset/hipDeviceReset manually")
1357 EnableDeviceReset(bool device_reset) {}
1358};
1359
1360
1361} // namespace detail
1362
1363
1364#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1365 void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1366 const void* src_ptr, void* dest_ptr) const override
1367
1368
1369#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1370 virtual bool verify_memory_to(const dest_* other) const override \
1371 { \
1372 return bool_; \
1373 } \
1374 static_assert(true, \
1375 "This assert is used to counter the false positive extra " \
1376 "semi-colon warnings")
1377
1378
1386class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
1387 public std::enable_shared_from_this<OmpExecutor> {
1388 friend class detail::ExecutorBase<OmpExecutor>;
1389
1390public:
1391 using Executor::run;
1392
1396 static std::shared_ptr<OmpExecutor> create(
1397 std::shared_ptr<CpuAllocatorBase> alloc =
1398 std::make_shared<CpuAllocator>())
1399 {
1400 return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
1401 }
1402
1403 std::shared_ptr<Executor> get_master() noexcept override;
1404
1405 std::shared_ptr<const Executor> get_master() const noexcept override;
1406
1407 void synchronize() const override;
1408
1409 int get_num_cores() const
1410 {
1411 return this->get_exec_info().num_computing_units;
1412 }
1413
1414 int get_num_threads_per_core() const
1415 {
1416 return this->get_exec_info().num_pu_per_cu;
1417 }
1418
1419 static int get_num_omp_threads();
1420
1421 scoped_device_id_guard get_scoped_device_id_guard() const override;
1422
1423 std::string get_description() const override;
1424
1425protected:
1426 OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1427 : alloc_{std::move(alloc)}
1428 {
1429 this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
1430 }
1431
1432 void populate_exec_info(const machine_topology* mach_topo) override;
1433
1434 void* raw_alloc(size_type size) const override;
1435
1436 void raw_free(void* ptr) const noexcept override;
1437
1438 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1439
1440 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
1441
1442 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1443
1444 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1445
1446 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1447
1448 bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
1449
1450 std::shared_ptr<CpuAllocatorBase> alloc_;
1451};
1452
1453
1454namespace kernels {
1455namespace omp {
1456using DefaultExecutor = OmpExecutor;
1457} // namespace omp
1458} // namespace kernels
1459
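// Creating executors is the usual entry point into Ginkgo; a brief sketch:
//
//     auto ref = gko::ReferenceExecutor::create();  // sequential, for testing
//     auto omp = gko::OmpExecutor::create();        // OpenMP on the host CPU
//     // device executors additionally take a host ("master") executor, e.g.
//     // auto cuda = gko::CudaExecutor::create(0, omp);  // CUDA build only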
1460
1468class ReferenceExecutor : public OmpExecutor {
1469public:
1470 using Executor::run;
1471
1472 static std::shared_ptr<ReferenceExecutor> create(
1473 std::shared_ptr<CpuAllocatorBase> alloc =
1474 std::make_shared<CpuAllocator>())
1475 {
1476 return std::shared_ptr<ReferenceExecutor>(
1477 new ReferenceExecutor(std::move(alloc)));
1478 }
1479
1480 scoped_device_id_guard get_scoped_device_id_guard() const override
1481 {
1482 return {this, 0};
1483 }
1484
1485 std::string get_description() const override { return "ReferenceExecutor"; }
1486
1487 void run(const Operation& op) const override
1488 {
1489 this->template log<log::Logger::operation_launched>(this, &op);
1490 op.run(std::static_pointer_cast<const ReferenceExecutor>(
1491 this->shared_from_this()));
1492 this->template log<log::Logger::operation_completed>(this, &op);
1493 }
1494
1495protected:
1496 ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1497 : OmpExecutor{std::move(alloc)}
1498 {
1499 this->ReferenceExecutor::populate_exec_info(
1500 machine_topology::get_instance());
1501 }
1502
1503 void populate_exec_info(const machine_topology*) override
1504 {
1505 this->get_exec_info().device_id = -1;
1506 this->get_exec_info().num_computing_units = 1;
1507 this->get_exec_info().num_pu_per_cu = 1;
1508 }
1509
1510 bool verify_memory_from(const Executor* src_exec) const override
1511 {
1512 return src_exec->verify_memory_to(this);
1513 }
1514
1515 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
1516
1517 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1518
1519 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1520
1521 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1522
1523 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1524};
1525
1526
1527namespace kernels {
1528namespace reference {
1529using DefaultExecutor = ReferenceExecutor;
1530} // namespace reference
1531} // namespace kernels
1532
1533
1540class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
1541 public std::enable_shared_from_this<CudaExecutor>,
1542 public detail::EnableDeviceReset {
1543 friend class detail::ExecutorBase<CudaExecutor>;
1544
1545public:
1546 using Executor::run;
1547
1559 GKO_DEPRECATED(
1560 "calling this CudaExecutor::create method is deprecated, because "
1561 "device_reset no longer has an effect. "
1562 "Call CudaExecutor::create("
1563 " int device_id, std::shared_ptr<Executor> master,"
1564 " std::shared_ptr<CudaAllocatorBase> alloc,"
1565 " CUstream_st* stream);"
1566 " instead")
1567 static std::shared_ptr<CudaExecutor> create(
1568 int device_id, std::shared_ptr<Executor> master, bool device_reset,
1569 allocation_mode alloc_mode = default_cuda_alloc_mode,
1570 CUstream_st* stream = nullptr);
1571
1581 static std::shared_ptr<CudaExecutor> create(
1582 int device_id, std::shared_ptr<Executor> master,
1583 std::shared_ptr<CudaAllocatorBase> alloc =
1584 std::make_shared<CudaAllocator>(),
1585 CUstream_st* stream = nullptr);
1586
1587 std::shared_ptr<Executor> get_master() noexcept override;
1588
1589 std::shared_ptr<const Executor> get_master() const noexcept override;
1590
1591 void synchronize() const override;
1592
1593 scoped_device_id_guard get_scoped_device_id_guard() const override;
1594
1595 std::string get_description() const override;
1596
1600 int get_device_id() const noexcept
1601 {
1602 return this->get_exec_info().device_id;
1603 }
1604
1608 static int get_num_devices();
1609
1613 int get_num_warps_per_sm() const noexcept
1614 {
1615 return this->get_exec_info().num_pu_per_cu;
1616 }
1617
1621 int get_num_multiprocessor() const noexcept
1622 {
1623 return this->get_exec_info().num_computing_units;
1624 }
1625
1629 int get_num_warps() const noexcept
1630 {
1631 return this->get_exec_info().num_computing_units *
1632 this->get_exec_info().num_pu_per_cu;
1633 }
1634
1638 int get_warp_size() const noexcept
1639 {
1640 return this->get_exec_info().max_subgroup_size;
1641 }
1642
1646 int get_major_version() const noexcept
1647 {
1648 return this->get_exec_info().major;
1649 }
1650
1654 int get_minor_version() const noexcept
1655 {
1656 return this->get_exec_info().minor;
1657 }
1658
1664 GKO_DEPRECATED("use get_blas_handle() instead")
1665 cublasContext* get_cublas_handle() const { return get_blas_handle(); }
1666
1670 cublasContext* get_blas_handle() const { return cublas_handle_.get(); }
1671
1677 GKO_DEPRECATED("use get_sparselib_handle() instead")
1678 cusparseContext* get_cusparse_handle() const
1679 {
1680 return get_sparselib_handle();
1681 }
1682
1686 cusparseContext* get_sparselib_handle() const
1687 {
1688 return cusparse_handle_.get();
1689 }
1690
1696 std::vector<int> get_closest_pus() const
1697 {
1698 return this->get_exec_info().closest_pu_ids;
1699 }
1700
1706 int get_closest_numa() const { return this->get_exec_info().numa_node; }
1707
1714 CUstream_st* get_stream() const { return stream_; }
1715
1716protected:
1717 void set_gpu_property();
1718
1719 void init_handles();
1720
1721 CudaExecutor(int device_id, std::shared_ptr<Executor> master,
1722 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1723 : master_(master), alloc_{std::move(alloc)}, stream_{stream}
1724 {
1725 this->get_exec_info().device_id = device_id;
1726 this->get_exec_info().num_computing_units = 0;
1727 this->get_exec_info().num_pu_per_cu = 0;
1728 this->CudaExecutor::populate_exec_info(
1729 machine_topology::get_instance());
1730 this->set_gpu_property();
1731 this->init_handles();
1732 }
1733
1734 void* raw_alloc(size_type size) const override;
1735
1736 void raw_free(void* ptr) const noexcept override;
1737
1738 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1739
1740 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1741
1742 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1743
1744 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1745
1746 bool verify_memory_to(const HipExecutor* dest_exec) const override;
1747
1748 bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1749
1750 void populate_exec_info(const machine_topology* mach_topo) override;
1751
1752private:
1753 std::shared_ptr<Executor> master_;
1754
1755 template <typename T>
1756 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1757 handle_manager<cublasContext> cublas_handle_;
1758 handle_manager<cusparseContext> cusparse_handle_;
1759 std::shared_ptr<CudaAllocatorBase> alloc_;
1760 CUstream_st* stream_;
1761};
1762
1763
1764namespace kernels {
1765namespace cuda {
1766using DefaultExecutor = CudaExecutor;
1767} // namespace cuda
1768} // namespace kernels
1769
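// A brief sketch querying device properties, assuming a CUDA build with at
// least one device:
//
//     auto host = gko::OmpExecutor::create();
//     auto cuda = gko::CudaExecutor::create(0, host);
//     std::cout << cuda->get_description() << ": "
//               << cuda->get_num_multiprocessor() << " multiprocessors, "
//               << "warp size " << cuda->get_warp_size() << "\n";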
1770
1777class HipExecutor : public detail::ExecutorBase<HipExecutor>,
1778 public std::enable_shared_from_this<HipExecutor>,
1779 public detail::EnableDeviceReset {
1780 friend class detail::ExecutorBase<HipExecutor>;
1781
1782public:
1783 using Executor::run;
1784
1796 GKO_DEPRECATED(
1797 "device_reset is deprecated entirely, call hipDeviceReset directly. "
1798 "alloc_mode was replaced by the Allocator type "
1799 "hierarchy.")
1800 static std::shared_ptr<HipExecutor> create(
1801 int device_id, std::shared_ptr<Executor> master, bool device_reset,
1802 allocation_mode alloc_mode = default_hip_alloc_mode,
1803 GKO_HIP_STREAM_STRUCT* stream = nullptr);
1804
1805 static std::shared_ptr<HipExecutor> create(
1806 int device_id, std::shared_ptr<Executor> master,
1807 std::shared_ptr<HipAllocatorBase> alloc =
1808 std::make_shared<HipAllocator>(),
1809 GKO_HIP_STREAM_STRUCT* stream = nullptr);
1810
1811 std::shared_ptr<Executor> get_master() noexcept override;
1812
1813 std::shared_ptr<const Executor> get_master() const noexcept override;
1814
1815 void synchronize() const override;
1816
1817 scoped_device_id_guard get_scoped_device_id_guard() const override;
1818
1819 std::string get_description() const override;
1820
1824 int get_device_id() const noexcept
1825 {
1826 return this->get_exec_info().device_id;
1827 }
1828
1832 static int get_num_devices();
1833
1837 int get_num_warps_per_sm() const noexcept
1838 {
1839 return this->get_exec_info().num_pu_per_cu;
1840 }
1841
1845 int get_num_multiprocessor() const noexcept
1846 {
1847 return this->get_exec_info().num_computing_units;
1848 }
1849
1853 int get_major_version() const noexcept
1854 {
1855 return this->get_exec_info().major;
1856 }
1857
1861 int get_minor_version() const noexcept
1862 {
1863 return this->get_exec_info().minor;
1864 }
1865
1869 int get_num_warps() const noexcept
1870 {
1871 return this->get_exec_info().num_computing_units *
1872 this->get_exec_info().num_pu_per_cu;
1873 }
1874
1878 int get_warp_size() const noexcept
1879 {
1880 return this->get_exec_info().max_subgroup_size;
1881 }
1882
1888 GKO_DEPRECATED("use get_blas_handle() instead")
1889 hipblasContext* get_hipblas_handle() const { return get_blas_handle(); }
1890
1894 hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); }
1895
1901 GKO_DEPRECATED("use get_sparselib_handle() instead")
1902 hipsparseContext* get_hipsparse_handle() const
1903 {
1904 return get_sparselib_handle();
1905 }
1906
1910 hipsparseContext* get_sparselib_handle() const
1911 {
1912 return hipsparse_handle_.get();
1913 }
1914
1920 int get_closest_numa() const { return this->get_exec_info().numa_node; }
1921
1927 std::vector<int> get_closest_pus() const
1928 {
1929 return this->get_exec_info().closest_pu_ids;
1930 }
1931
1932 GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }
1933
1934protected:
1935 void set_gpu_property();
1936
1937 void init_handles();
1938
1939 HipExecutor(int device_id, std::shared_ptr<Executor> master,
1940 std::shared_ptr<HipAllocatorBase> alloc,
1941 GKO_HIP_STREAM_STRUCT* stream)
1942 : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
1943 {
1944 this->get_exec_info().device_id = device_id;
1945 this->get_exec_info().num_computing_units = 0;
1946 this->get_exec_info().num_pu_per_cu = 0;
1947 this->HipExecutor::populate_exec_info(machine_topology::get_instance());
1948 this->set_gpu_property();
1949 this->init_handles();
1950 }
1951
1952 void* raw_alloc(size_type size) const override;
1953
1954 void raw_free(void* ptr) const noexcept override;
1955
1956 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1957
1958 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1959
1960 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1961
1962 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1963
1964 bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1965
1966 bool verify_memory_to(const HipExecutor* dest_exec) const override;
1967
1968 void populate_exec_info(const machine_topology* mach_topo) override;
1969
1970private:
1971 std::shared_ptr<Executor> master_;
1972
1973 template <typename T>
1974 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1975 handle_manager<hipblasContext> hipblas_handle_;
1976 handle_manager<hipsparseContext> hipsparse_handle_;
1977 std::shared_ptr<HipAllocatorBase> alloc_;
1978 GKO_HIP_STREAM_STRUCT* stream_;
1979};
1980
1981
1982namespace kernels {
1983namespace hip {
1984using DefaultExecutor = HipExecutor;
1985} // namespace hip
1986} // namespace kernels
1987
1988
1995class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
1996 public std::enable_shared_from_this<DpcppExecutor> {
1997 friend class detail::ExecutorBase<DpcppExecutor>;
1998
1999public:
2000 using Executor::run;
2001
2011 static std::shared_ptr<DpcppExecutor> create(
2012 int device_id, std::shared_ptr<Executor> master,
2013 std::string device_type = "all",
2014 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2015
2016 std::shared_ptr<Executor> get_master() noexcept override;
2017
2018 std::shared_ptr<const Executor> get_master() const noexcept override;
2019
2020 void synchronize() const override;
2021
2022 scoped_device_id_guard get_scoped_device_id_guard() const override;
2023
2024 std::string get_description() const override;
2025
2031 int get_device_id() const noexcept
2032 {
2033 return this->get_exec_info().device_id;
2034 }
2035
2036 sycl::queue* get_queue() const { return queue_.get(); }
2037
2045 static int get_num_devices(std::string device_type);
2046
2052 const std::vector<int>& get_subgroup_sizes() const noexcept
2053 {
2054 return this->get_exec_info().subgroup_sizes;
2055 }
2056
2062 int get_num_computing_units() const noexcept
2063 {
2064 return this->get_exec_info().num_computing_units;
2065 }
2066
2070 int get_num_subgroups() const noexcept
2071 {
2072 return this->get_exec_info().num_computing_units *
2073 this->get_exec_info().num_pu_per_cu;
2074 }
2075
2081 const std::vector<int>& get_max_workitem_sizes() const noexcept
2082 {
2083 return this->get_exec_info().max_workitem_sizes;
2084 }
2085
2091 int get_max_workgroup_size() const noexcept
2092 {
2093 return this->get_exec_info().max_workgroup_size;
2094 }
2095
2101 int get_max_subgroup_size() const noexcept
2102 {
2103 return this->get_exec_info().max_subgroup_size;
2104 }
2105
2111 std::string get_device_type() const noexcept
2112 {
2113 return this->get_exec_info().device_type;
2114 }
2115
2116protected:
2117 void set_device_property(
2118 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2119
2120 DpcppExecutor(
2121 int device_id, std::shared_ptr<Executor> master,
2122 std::string device_type = "all",
2123 dpcpp_queue_property property = dpcpp_queue_property::in_order)
2124 : master_(master)
2125 {
2126 std::for_each(device_type.begin(), device_type.end(),
2127 [](char& c) { c = std::tolower(c); });
2128 this->get_exec_info().device_type = std::string(device_type);
2129 this->get_exec_info().device_id = device_id;
2130 this->set_device_property(property);
2131 }
2132
2133 void populate_exec_info(const machine_topology* mach_topo) override;
2134
2135 void* raw_alloc(size_type size) const override;
2136
2137 void raw_free(void* ptr) const noexcept override;
2138
2139 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2140
2141 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
2142
2143 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
2144
2145 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
2146
2147 bool verify_memory_to(const OmpExecutor* dest_exec) const override;
2148
2149 bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
2150
2151private:
2152 std::shared_ptr<Executor> master_;
2153
2154 template <typename T>
2155 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2156 queue_manager<sycl::queue> queue_;
2157};
2158
2159
2160namespace kernels {
2161namespace dpcpp {
2162using DefaultExecutor = DpcppExecutor;
2163} // namespace dpcpp
2164} // namespace kernels
2165
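// A brief sketch enumerating SYCL devices before creating an executor,
// assuming a DPC++ build:
//
//     if (gko::DpcppExecutor::get_num_devices("gpu") > 0) {
//         auto dpcpp = gko::DpcppExecutor::create(
//             0, gko::OmpExecutor::create(), "gpu");
//         std::cout << dpcpp->get_device_type() << ", max workgroup size "
//                   << dpcpp->get_max_workgroup_size() << "\n";
//     }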
2166
2167#undef GKO_OVERRIDE_RAW_COPY_TO
2168
2169
2170} // namespace gko
2171
2172
2173#endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
Implement this interface to provide an allocator for CudaExecutor.
Definition memory.hpp:40
Allocator using cudaMalloc.
Definition memory.hpp:102
This is the Executor subclass which represents the CUDA device.
Definition executor.hpp:1542
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
Creates a new CudaExecutor.
cusparseContext * get_sparselib_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1686
cublasContext * get_blas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1670
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1696
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1706
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1613
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1646
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1621
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1638
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1678
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition executor.hpp:1600
void synchronize() const override
Synchronize the operations launched on the executor with its master.
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition executor.hpp:1714
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1654
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1629
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1665
std::string get_description() const override
static int get_num_devices()
Get the number of devices present on the system.
This is the Executor subclass which represents a DPC++ enhanced device.
Definition executor.hpp:1996
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition executor.hpp:2052
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition executor.hpp:2062
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition executor.hpp:2091
std::string get_description() const override
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition executor.hpp:2070
void synchronize() const override
Synchronize the operations launched on the executor with its master.
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition executor.hpp:2081
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition executor.hpp:2101
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition executor.hpp:2111
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
int get_device_id() const noexcept
Get the DPCPP device id of the device associated to this executor.
Definition executor.hpp:2031
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
The first step in using the Ginkgo library consists of creating an executor.
Definition executor.hpp:615
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition executor.hpp:722
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
virtual void synchronize() const =0
Synchronize the operations launched on the executor with its master.
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagating loggers attached to this executor.
Definition executor.hpp:879
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition executor.hpp:892
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition executor.hpp:794
void add_logger(std::shared_ptr< const log::Logger > logger) override
Definition executor.hpp:838
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition executor.hpp:744
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition executor.hpp:867
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition executor.hpp:705
virtual std::string get_description() const =0
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition executor.hpp:809
void run(std::string name, const ClosureReference &op_ref, const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition executor.hpp:683
void remove_logger(const log::Logger *logger) override
Definition executor.hpp:851
Implement this interface to provide an allocator for HipExecutor.
Definition memory.hpp:65
Allocator using hipMalloc.
Definition memory.hpp:172
This is the Executor subclass which represents the HIP enhanced device.
Definition executor.hpp:1779
void synchronize() const override
Synchronize the operations launched on the executor with its master.
std::string get_description() const override
int get_device_id() const noexcept
Get the HIP device id of the device associated to this executor.
Definition executor.hpp:1824
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1837
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1853
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1927
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1861
static int get_num_devices()
Get the number of devices present on the system.
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, GKO_HIP_STREAM_STRUCT *stream=nullptr)
Creates a new HipExecutor.
hipsparseContext * get_sparselib_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1910
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1845
hipblasContext * get_blas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1894
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1869
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1902
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1920
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1878
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1889
NotSupported is thrown in case it is not possible to perform the requested operation on the given object type.
Definition exception.hpp:127
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition executor.hpp:1387
std::string get_description() const override
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
void synchronize() const override
Synchronize the operations launched on the executor with its master.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1396
Operations can be used to define functionalities whose implementations differ among devices.
Definition executor.hpp:258
virtual const char * get_name() const noexcept
Returns the operation's name.
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels ...
Definition executor.hpp:1468
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition executor.hpp:1487
std::string get_description() const override
Definition executor.hpp:1485
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition executor.hpp:1226
void operator()(pointer ptr) const
Deletes the object.
Definition executor.hpp:1235
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition logger.hpp:760
Definition logger.hpp:76
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from objects created on this executor.
Definition logger.hpp:654
The machine topology class represents the hierarchical topology of a machine, including NUMA nodes,...
Definition machine_topology.hpp:61
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition machine_topology.hpp:182
This class is used for function parameters in the place of raw pointers.
Definition utils_helper.hpp:41
T * get() const
Definition utils_helper.hpp:75
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition scoped_device_id_guard.hpp:76
The logger namespace.
Definition convergence.hpp:22
The Ginkgo namespace.
Definition abstract_factory.hpp:20
std::uintptr_t uintptr
Unsigned integer type capable of holding a pointer to void.
Definition types.hpp:141
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:89
log_propagation_mode
How Logger events are propagated to their Executor.
Definition executor.hpp:34
@ automatic
Events get reported to loggers attached to the triggering object and propagating loggers (Logger::nee...
Definition executor.hpp:46
@ never
Events only get reported at loggers attached to the triggering object.
Definition executor.hpp:40
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition executor.hpp:62
STL namespace.