40#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
43#include "TpetraCore_config.h"
44#include "Teuchos_Array.hpp"
45#include "Teuchos_ArrayView.hpp"
46#include "Teuchos_OrdinalTraits.hpp"
54#include "Kokkos_Core.hpp"
86namespace UnpackAndCombineCrsMatrixImpl {
97template<
class ST,
class LO,
class GO>
102 const char imports[],
106 const size_t bytes_per_value)
112 bool unpack_pids =
pids_out.size() > 0;
122 const size_t pids_len = unpack_pids ?
131 const char*
const pids_in = unpack_pids ? imports +
pids_beg :
nullptr;
142 Kokkos::pair<int, size_t>
p;
181template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
186 typedef typename local_matrix_type::value_type ST;
190 typedef typename DT::execution_space XS;
192 typedef Kokkos::View<const size_t*, BufferDeviceType>
193 num_packets_per_lid_type;
194 typedef Kokkos::View<const size_t*, DT> offsets_type;
195 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
198 typedef Kokkos::View<int, DT> error_type;
199 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
201 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202 "LocalMap::local_ordinal_type and "
203 "LocalMatrix::ordinal_type must be the same.");
207 input_buffer_type imports;
208 num_packets_per_lid_type num_packets_per_lid;
209 import_lids_type import_lids;
210 Kokkos::View<const LO*[2], DT> batch_info;
211 offsets_type offsets;
214 size_t bytes_per_value;
216 error_type error_code;
245 void operator()(member_type team_member)
const
248 using Kokkos::subview;
249 using Kokkos::MemoryUnmanaged;
251 const LO
batch = team_member.league_rank();
263 const size_t buf_size = imports.size();
283#ifndef KOKKOS_ENABLE_SYCL
285 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
290 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
297#ifndef KOKKOS_ENABLE_SYCL
299 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300 "At row %d, the offset (%d) > buffer size (%d)\n",
304 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
337#ifndef KOKKOS_ENABLE_SYCL
339 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
344 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
350 Kokkos::parallel_for(
370 if (combine_mode ==
ADD) {
375 (
void)local_matrix.sumIntoValues(
383 }
else if (combine_mode ==
REPLACE) {
388 (
void)local_matrix.replaceValues(
399#ifndef KOKKOS_ENABLE_SYCL
401 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
402 "At row %d, an unknown error occurred during unpack\n", (
int)
lid_no
405 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
410 team_member.team_barrier();
416 auto error_code_h = Kokkos::create_mirror_view_and_copy(
417 Kokkos::HostSpace(), error_code
// Empty tag type selecting the "maximum" flavor of the entry-count reduction:
// the join() overload that takes this tag keeps the larger of its operands.
424struct MaxNumEntTag {};
// Empty tag type selecting the "total" flavor of the entry-count reduction;
// it is the work tag of the RangePolicy used by the
// "Total num entries in CRS to unpack" parallel_reduce.
425struct TotNumEntTag {};
435template<
class LO,
class DT,
class BDT>
438 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
439 typedef Kokkos::View<const size_t*, DT> offsets_type;
440 typedef Kokkos::View<const char*, BDT> input_buffer_type;
446 num_packets_per_lid_type num_packets_per_lid;
447 offsets_type offsets;
448 input_buffer_type imports;
462 const size_t num_bytes = num_packets_per_lid(
i);
465 const char*
const in_buf = imports.data () + offsets(
i);
474 join (
const MaxNumEntTag,
478 if (dst < src) dst = src;
484 const size_t num_bytes = num_packets_per_lid(
i);
487 const char*
const in_buf = imports.data () + offsets(
i);
501template<
class LO,
class DT,
class BDT>
504 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
505 const Kokkos::View<const size_t*, DT>& offsets,
506 const Kokkos::View<const char*, BDT>& imports)
508 typedef typename DT::execution_space XS;
509 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
515 static_cast<LO
> (num_packets_per_lid.extent (0));
516 size_t max_num_ent = 0;
517 Kokkos::parallel_reduce (
"Max num entries in CRS",
530template<
class LO,
class DT,
class BDT>
533 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
534 const Kokkos::View<const size_t*, DT>& offsets,
535 const Kokkos::View<const char*, BDT>& imports)
537 typedef typename DT::execution_space XS;
538 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag>
range_policy;
543 static_cast<LO
> (num_packets_per_lid.extent (0));
544 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
553unpackRowCount(
const char imports[],
568 return static_cast<size_t>(num_ent_LO);
575template<
class View1,
class View2>
583 using LO =
typename View2::value_type;
589 batch_info(
batch, 0) =
static_cast<LO
>(
i);
594 return batch == batch_info.extent(0);
604template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
609 const Kokkos::View<const char*, BufferDeviceType>& imports,
610 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
614 using ST =
typename LocalMatrix::value_type;
617 using XS =
typename DT::execution_space;
619 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
620 "unpackAndCombineIntoCrsMatrix: ";
622 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
631 std::invalid_argument,
632 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
633 "static graph (i.e., was constructed with the CrsMatrix constructor "
634 "that takes a const CrsGraph pointer).");
637 std::invalid_argument,
638 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
639 "(i.e., was constructed with the CrsMatrix constructor that takes a "
640 "const CrsGraph pointer).");
644 std::invalid_argument,
645 prefix <<
"Invalid combine mode; should never get "
646 "here! Please report this bug to the Tpetra developers.");
652 std::invalid_argument,
654 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
668 Kokkos::View<LO*[2], DT> batch_info(
"",
num_batches);
671 Kokkos::parallel_reduce(
672 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0,
num_import_lids),
676 imports.data(), offsets(
i), num_packets_per_lid(
i)
704 const bool atomic = XS().concurrency() != 1;
720 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
722#if defined(KOKKOS_ENABLE_CUDA)
723 constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
725 constexpr bool is_cuda =
false;
727 if (!is_cuda ||
team_size == Teuchos::OrdinalTraits<size_t>::invalid())
736 auto error_code =
f.error();
740 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
744template<
class LocalMatrix,
class BufferDeviceType>
749 const Kokkos::View<const char*, BufferDeviceType>& imports,
750 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
753 using Kokkos::parallel_reduce;
754 typedef typename LocalMatrix::ordinal_type LO;
755 typedef typename LocalMatrix::device_type device_type;
756 typedef typename device_type::execution_space XS;
757 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
758 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> >
range_policy;
770 update +=
static_cast<size_t>(local_matrix.graph.row_map[
lid+1]
771 -local_matrix.graph.row_map[
lid]);
777 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
780 parallel_reduce(range_policy(0, num_items),
781 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
782 const LO lid = permute_from_lids(i);
783 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
784 - local_matrix.graph.row_map[lid]);
791 const size_type np = num_packets_per_lid.extent(0);
792 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
795 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
803template<
class LO,
class DT,
class BDT>
805setupRowPointersForRemotes(
808 const Kokkos::View<const char*, BDT>& imports,
809 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
812 using Kokkos::parallel_reduce;
813 typedef typename DT::execution_space XS;
815 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> >
range_policy;
818 const size_type
N = num_packets_per_lid.extent(0);
825 const size_t num_bytes = num_packets_per_lid(
i);
826 const size_t offset = offsets(
i);
839makeCrsRowPtrFromLengths(
843 using Kokkos::parallel_scan;
844 typedef typename DT::execution_space XS;
845 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> >
range_policy;
849 KOKKOS_LAMBDA(
const size_t&
i,
size_t& update,
const bool&
final) {
// Copies the rows whose local index is the same in source and target
// (indices 0 .. num_same_ids-1) from local_matrix into the target CRS arrays.
// NOTE(review): presumably the definition of copyDataFromSameIDs; the call
// site in this file passes the same argument list -- confirm.
//
// For every copied entry it writes the value, the global column index and the
// owning process rank, and it advances new_start_row for the target row.
860template<
class LocalMatrix,
class LocalMap>
864 const typename PackTraits<int>::output_array_type& tgt_pids,
866 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
868 const typename PackTraits<int>::input_array_type& src_pids,
869 const LocalMatrix& local_matrix,
870 const LocalMap& local_col_map,
871 const size_t num_same_ids,
874 using Kokkos::parallel_for;
877 typedef typename DT::execution_space XS;
878 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
// One work item per shared row.
880 parallel_for(range_policy(0, num_same_ids),
881 KOKKOS_LAMBDA(
const size_t i) {
// Element type of new_start_row, used so the atomic increment has a
// matching operand type.
882 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
// Source and target local row indices are both i for "same" IDs.
884 const LO src_lid =
static_cast<LO
>(i);
// Offset of the first entry of the source row.
885 size_t src_row = local_matrix.graph.row_map(src_lid);
887 const LO tgt_lid =
static_cast<LO
>(i);
// Offset of the first entry of the target row.
888 const size_t tgt_row = tgt_rowptr(tgt_lid);
// nsr = number of entries in the source row; the target row's running
// start position is advanced by that amount.
890 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
891 - local_matrix.graph.row_map(src_lid);
892 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Walk the source row; (j - src_row) is the position within the row, so
// entries land at the same relative position in the target row.
894 for (
size_t j=local_matrix.graph.row_map(src_lid);
895 j<local_matrix.graph.row_map(src_lid+1); ++j) {
896 LO src_col = local_matrix.graph.entries(j);
897 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
// Column indices are stored as global indices in the target.
898 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
// Entries owned by my_pid are recorded as -1; all others keep their rank.
899 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copies permuted rows from local_matrix into the target CRS arrays: entry i
// of permute_from_lids names the source row and entry i of permute_to_lids
// names the target row it is copied to.
//
// For every copied entry it writes the value, the global column index and the
// owning process rank, and it advances new_start_row for the target row.
905template<
class LocalMatrix,
class LocalMap>
907copyDataFromPermuteIDs(
909 const typename PackTraits<int>::output_array_type& tgt_pids,
911 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
913 const typename PackTraits<int>::input_array_type& src_pids,
916 const LocalMatrix& local_matrix,
917 const LocalMap& local_col_map,
920 using Kokkos::parallel_for;
923 typedef typename DT::execution_space XS;
924 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
925 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
927 const size_type num_permute_to_lids = permute_to_lids.extent(0);
// One work item per (from, to) row pair.
929 parallel_for(range_policy(0, num_permute_to_lids),
930 KOKKOS_LAMBDA(
const size_t i) {
// Element type of new_start_row, used so the atomic increment has a
// matching operand type.
931 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
// Source row and the offset of its first entry.
933 const LO src_lid = permute_from_lids(i);
934 const size_t src_row = local_matrix.graph.row_map(src_lid);
// Target row and the offset of its first entry.
936 const LO tgt_lid = permute_to_lids(i);
937 const size_t tgt_row = tgt_rowptr(tgt_lid);
// nsr = number of entries in the source row; the target row's running
// start position is advanced by that amount.
939 size_t nsr = local_matrix.graph.row_map(src_lid+1)
940 - local_matrix.graph.row_map(src_lid);
941 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Walk the source row; (j - src_row) is the position within the row, so
// entries keep their relative order in the target row.
943 for (
size_t j=local_matrix.graph.row_map(src_lid);
944 j<local_matrix.graph.row_map(src_lid+1); ++j) {
945 LO src_col = local_matrix.graph.entries(j);
946 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
// Column indices are stored as global indices in the target.
947 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
// Entries owned by my_pid are recorded as -1; all others keep their rank.
948 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpacks the packed rows in `imports` into the target CRS arrays
// (tgt_colind, tgt_pids, tgt_vals), one import LID per work item, and
// accumulates unpackRow's results into the reduction variable k_error.
//
// Row i of the import list occupies num_packets_per_lid(i) bytes starting at
// byte offsets(i) of the buffer. Its entries are written into a slot range of
// the target row import_lids(i) that is claimed atomically from new_start_row.
954template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
956unpackAndCombineIntoCrsArrays2(
958 const typename PackTraits<int>::output_array_type& tgt_pids,
960 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
963 const Kokkos::View<const char*, BufferDeviceType>& imports,
964 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
968 const size_t bytes_per_value)
971 using Kokkos::subview;
972 using Kokkos::MemoryUnmanaged;
973 using Kokkos::parallel_reduce;
974 using Kokkos::atomic_fetch_add;
979 typedef typename LocalMatrix::value_type ST;
980 typedef typename DT::execution_space XS;
981 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
982 typedef typename Kokkos::pair<size_type, size_type> slice;
983 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// Unmanaged view types used for the per-row output slices.
985 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
986 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
987 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
// Flag value compared against unpackRowCount's result below.
989 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
992 const size_type num_import_lids = import_lids.size();
995 parallel_reduce (
"Unpack and combine into CRS",
996 range_policy (0, num_import_lids),
997 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
// Element type of new_start_row, used so the atomic increment has a
// matching operand type.
998 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
// Byte length and byte offset of packed row i within `imports`.
999 const size_t num_bytes = num_packets_per_lid(i);
1000 const size_t offset = offsets(i);
// NOTE(review): the bodies of the two checks below are not visible here;
// presumably an empty row is skipped and an invalid count is reported
// through k_error -- confirm.
1001 if (num_bytes == 0) {
1005 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1006 if (num_ent == InvalidNum) {
// Claim num_ent consecutive slots of the target row: the value returned by
// the atomic add is used as the first slot, and the counter is advanced
// past the claimed range.
1010 const LO lcl_row = import_lids(i);
1011 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1012 const size_t end_row = start_row + num_ent;
// Views of just the claimed range [start_row, end_row) of each output array.
1014 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1015 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1016 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
// unpackRow's return value is summed into the reduction result.
1018 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1019 imports.data(), offset, num_bytes,
1020 num_ent, bytes_per_value);
// Post-process the unpacked ranks: entries owned by my_pid become -1.
1023 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1024 const int pid = pids_out(j);
1025 pids_out(j) = (pid != my_pid) ? pid : -1;
// Device-side driver that fills the target CRS arrays (tgt_rowptr, tgt_colind,
// tgt_vals, tgt_pids) from three sources: rows with the same local index in
// source and target, permuted rows, and packed rows received in `imports`.
//
// Visible sequence of steps:
//   1. store per-row entry counts in tgt_rowptr (same IDs, permuted IDs, then
//      remote rows via setupRowPointersForRemotes);
//   2. call makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
//   3. copy same-ID and permuted rows, then unpack the imports.
// Failures are raised through TEUCHOS_TEST_FOR_EXCEPTION with std::logic_error.
1032template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1035 const LocalMatrix & local_matrix,
1036 const LocalMap & local_col_map,
1038 const Kokkos::View<const char*, BufferDeviceType>& imports,
1039 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1045 const typename PackTraits<int>::input_array_type& src_pids,
1046 const typename PackTraits<int>::output_array_type& tgt_pids,
1047 const size_t num_same_ids,
1048 const size_t tgt_num_rows,
1049 const size_t tgt_num_nonzeros,
1050 const int my_tgt_pid,
1051 const size_t bytes_per_value)
1054 using Kokkos::subview;
1055 using Kokkos::parallel_for;
1056 using Kokkos::MemoryUnmanaged;
1060 typedef typename DT::execution_space XS;
1061 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1062 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1063 typedef BufferDeviceType BDT;
// Prefix for the exception messages raised below.
1065 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
// N = number of target rows; the row-pointer arrays below have N+1 slots.
1067 const size_t N = tgt_num_rows;
1071 const int my_pid = my_tgt_pid;
// Pass over all N+1 row-pointer slots.
// NOTE(review): the loop body is not visible here; presumably it
// zero-initializes tgt_rowptr -- confirm.
1074 parallel_for(range_policy(0, N+1),
1075 KOKKOS_LAMBDA(
const size_t i) {
// Same IDs: source and target row index are both i; store the source row's
// entry count in tgt_rowptr.
1081 parallel_for(range_policy(0, num_same_ids),
1082 KOKKOS_LAMBDA(
const size_t i) {
1083 const LO tgt_lid =
static_cast<LO
>(i);
1084 const LO src_lid =
static_cast<LO
>(i);
1085 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1086 - local_matrix.graph.row_map(src_lid);
// Permuted IDs: store the entry count of source row permute_from_lids(i) in
// the slot of target row permute_to_lids(i).
1091 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1092 parallel_for(range_policy(0, num_permute_to_lids),
1093 KOKKOS_LAMBDA(
const size_t i) {
1094 const LO tgt_lid = permute_to_lids(i);
1095 const LO src_lid = permute_from_lids(i);
1096 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1097 - local_matrix.graph.row_map(src_lid);
// Byte offsets into `imports`, one per import LID plus a final entry.
1102 const size_type num_import_lids = import_lids.extent(0);
1103 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
// Debug-only consistency check: the last offset must equal the size of the
// import buffer.
1106#ifdef HAVE_TPETRA_DEBUG
1108 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1109 const bool condition =
1110 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1111 TEUCHOS_TEST_FOR_EXCEPTION
1112 (condition, std::logic_error, prefix
1113 <<
"The final offset in bytes " << nth_offset_h
1114 <<
" != imports.size() = " << imports.extent(0)
1115 <<
". Please report this bug to the Tpetra developers.");
// Row lengths for the remote (imported) rows.
// NOTE(review): k_error, tested right after, is presumably this call's return
// value; its declaration is not visible here -- confirm.
1121 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1122 import_lids, imports, num_packets_per_lid, offsets);
1123 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1124 <<
" Error transferring data to target row pointers. "
1125 "Please report this bug to the Tpetra developers.");
// Per-row counter that the copy and unpack kernels advance atomically as they
// write entries into each target row.
1129 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1132 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// Copy rows that already live on this process: same IDs first, then permutes.
1135 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1136 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1138 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1139 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1140 local_matrix, local_col_map, my_pid);
// Empty import buffer check (branch body not visible here).
// NOTE(review): if extent() returns an unsigned type, `<= 0` is equivalent to
// `== 0` -- confirm and consider spelling it that way.
1142 if (imports.extent(0) <= 0) {
// Unpack the received rows; any nonzero result is treated as an internal bug.
1146 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1147 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1148 local_matrix, local_col_map, my_pid, bytes_per_value);
1149 TEUCHOS_TEST_FOR_EXCEPTION(
1150 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1151 "should never happen. Please report this bug to the Tpetra developers.");
1197template<
typename ST,
typename LO,
typename GO,
typename Node>
1201 const Teuchos::ArrayView<const char>& imports,
1203 const Teuchos::ArrayView<const LO>&
importLIDs,
1208 typedef typename Node::device_type device_type;
1210 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1211 "Node::device_type and LocalMatrix::device_type must be the same.");
1229 imports.size(),
true,
"imports");
1231 auto local_matrix =
sourceMatrix.getLocalMatrixDevice();
1232 auto local_col_map =
sourceMatrix.getColMap()->getLocalMap();
1243 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1249template<
typename ST,
typename LO,
typename GO,
typename NT>
1251unpackCrsMatrixAndCombineNew(
1253 Kokkos::DualView<
char*,
1255 Kokkos::DualView<
size_t*,
1257 const Kokkos::DualView<
const LO*,
1265 using device_type =
typename crs_matrix_type::device_type;
1266 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1267 using buffer_device_type =
typename dist_object_type::buffer_device_type;
1270 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1271 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1272 "must be the same.");
1277 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1279 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1280 auto import_lids_d = importLIDs.view_device ();
1282 if (imports.need_sync_device()) {
1283 imports.sync_device ();
1285 auto imports_d = imports.view_device ();
1287 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1288 auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1289 typedef decltype (local_col_map) local_map_type;
1291 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1292 local_matrix_device_type,
1295 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1296 import_lids_d, combineMode);
1354template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1358 const Teuchos::ArrayView<const LocalOrdinal> &
importLIDs,
1359 const Teuchos::ArrayView<const char> &imports,
1364 const Teuchos::ArrayView<const LocalOrdinal>&
permuteToLIDs,
1367 using Kokkos::MemoryUnmanaged;
1369 typedef typename Node::device_type DT;
1371 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1382 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1388 auto local_matrix =
sourceMatrix.getLocalMatrixDevice ();
1393 "permute_from_lids");
1396 imports.getRawPtr (),
1397 imports.size (),
true,
1403 "num_packets_per_lid");
1405 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1424template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1428 const Teuchos::ArrayView<const LocalOrdinal>&
importLIDs,
1429 const Teuchos::ArrayView<const char>& imports,
1433 const size_t numSameIDs,
1434 const Teuchos::ArrayView<const LocalOrdinal>&
permuteToLIDs,
1439 const Teuchos::ArrayView<size_t>&
CRS_rowptr,
1440 const Teuchos::ArrayView<GlobalOrdinal>&
CRS_colind,
1442 const Teuchos::ArrayView<const int>&
SourcePids,
1445 using execution_space =
typename Node::execution_space;
1449 using Kokkos::deep_copy;
1451 using Teuchos::ArrayView;
1452 using Teuchos::outArg;
1453 using Teuchos::REDUCE_MAX;
1454 using Teuchos::reduceAll;
1458 typedef typename Node::device_type DT;
1461 typedef typename matrix_type::impl_scalar_type ST;
1464 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1468 std::invalid_argument,
prefix <<
"CRS_rowptr.size() = " <<
1489 auto local_matrix =
sourceMatrix.getLocalMatrixDevice();
1490 auto local_col_map =
sourceMatrix.getColMap()->getLocalMap();
1500 imports.size(),
true,
"imports");
1522#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1523 static_assert (! std::is_same<
1524 typename std::remove_const<
1525 typename std::decay<
1529 std::complex<double> >::value,
1530 "CRS_vals::value_type is std::complex<double>; this should never happen"
1531 ", since std::complex does not work in Kokkos::View objects.");
1536 CRS_vals.size(),
true,
"crs_vals");
1538#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1539 static_assert (! std::is_same<
1540 typename decltype (
crs_vals_d)::non_const_value_type,
1541 std::complex<double> >::value,
1542 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1543 "never happen, since std::complex does not work in Kokkos::View objects.");
1554 size_t bytes_per_value = 0;
1569 if (local_matrix.values.extent(0) > 0) {
1570 const ST&
val = local_matrix.values(0);
1576 Teuchos::reduceAll<int, size_t>(*(
sourceMatrix.getComm()),
1577 Teuchos::REDUCE_MAX,
1579 outArg(bytes_per_value));
1582#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1583 static_assert (! std::is_same<
1584 typename decltype (
crs_vals_d)::non_const_value_type,
1585 std::complex<double> >::value,
1586 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1587 "never happen, since std::complex does not work in Kokkos::View objects.");
1590 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
// Instantiation macro for one (ST, LO, GO, NT) type combination. It lists the
// signatures of four Details:: function templates from this file:
// unpackCrsMatrixAndCombine, unpackCrsMatrixAndCombineNew,
// unpackAndCombineIntoCrsArrays and unpackAndCombineWithOwningPIDsCount.
// NOTE(review): the leading `template <return-type>` line of each entry is not
// visible here; presumably these are explicit instantiations -- confirm.
// Do not add comments between the continuation lines below.
1622#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1624 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1625 const CrsMatrix<ST, LO, GO, NT>&, \
1626 const Teuchos::ArrayView<const char>&, \
1627 const Teuchos::ArrayView<const size_t>&, \
1628 const Teuchos::ArrayView<const LO>&, \
1632 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1633 const CrsMatrix<ST, LO, GO, NT>&, \
1634 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1635 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1636 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1638 const CombineMode); \
1640 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1641 const CrsMatrix<ST, LO, GO, NT> &, \
1642 const Teuchos::ArrayView<const LO>&, \
1643 const Teuchos::ArrayView<const char>&, \
1644 const Teuchos::ArrayView<const size_t>&, \
1646 const CombineMode, \
1648 const Teuchos::ArrayView<const LO>&, \
1649 const Teuchos::ArrayView<const LO>&, \
1653 const Teuchos::ArrayView<size_t>&, \
1654 const Teuchos::ArrayView<GO>&, \
1655 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1656 const Teuchos::ArrayView<const int>&, \
1657 Teuchos::Array<int>&); \
1659 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1660 const CrsMatrix<ST, LO, GO, NT> &, \
1661 const Teuchos::ArrayView<const LO> &, \
1662 const Teuchos::ArrayView<const char> &, \
1663 const Teuchos::ArrayView<const size_t>&, \
1667 const Teuchos::ArrayView<const LO>&, \
1668 const Teuchos::ArrayView<const LO>&);
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries, summed over all rows, of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Struct that holds views of the contents of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Unpacks and combines a single row of the CrsMatrix.
int error() const
Host function for getting the error.