Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
42
43#include "TpetraCore_config.h"
44#include "Teuchos_Array.hpp"
45#include "Teuchos_ArrayView.hpp"
46#include "Teuchos_OrdinalTraits.hpp"
54#include "Kokkos_Core.hpp"
55#include <memory>
56#include <string>
57
78
79namespace Tpetra {
80
81//
82// Users must never rely on anything in the Details namespace.
83//
84namespace Details {
85
86namespace UnpackAndCombineCrsMatrixImpl {
87
97template<class ST, class LO, class GO>
98KOKKOS_FUNCTION int
102 const char imports[],
103 const size_t offset,
104 const size_t /* num_bytes */,
105 const size_t num_ent,
106 const size_t bytes_per_value)
107{
108 if (num_ent == 0) {
109 // Empty rows always take zero bytes, to ensure sparsity.
110 return 0;
111 }
112 bool unpack_pids = pids_out.size() > 0;
113
114 const size_t num_ent_beg = offset;
115 const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
116
117 const size_t gids_beg = num_ent_beg + num_ent_len;
118 const size_t gids_len =
120
121 const size_t pids_beg = gids_beg + gids_len;
122 const size_t pids_len = unpack_pids ?
124 size_t (0);
125
126 const size_t vals_beg = gids_beg + gids_len + pids_len;
127 const size_t vals_len = num_ent * bytes_per_value;
128
129 const char* const num_ent_in = imports + num_ent_beg;
130 const char* const gids_in = imports + gids_beg;
131 const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
132 const char* const vals_in = imports + vals_beg;
133
134 size_t num_bytes_out = 0;
135 LO num_ent_out;
137 if (static_cast<size_t> (num_ent_out) != num_ent) {
138 return 20; // error code
139 }
140
141 {
142 Kokkos::pair<int, size_t> p;
144 if (p.first != 0) {
145 return 21; // error code
146 }
147 num_bytes_out += p.second;
148
149 if (unpack_pids) {
151 if (p.first != 0) {
152 return 22; // error code
153 }
154 num_bytes_out += p.second;
155 }
156
158 if (p.first != 0) {
159 return 23; // error code
160 }
161 num_bytes_out += p.second;
162 }
163
166 return 24; // error code
167 }
168 return 0; // no errors
169}
170
181template<class LocalMatrix, class LocalMap, class BufferDeviceType>
184 typedef LocalMap local_map_type;
185
186 typedef typename local_matrix_type::value_type ST;
187 typedef typename local_map_type::local_ordinal_type LO;
189 typedef typename local_map_type::device_type DT;
190 typedef typename DT::execution_space XS;
191
192 typedef Kokkos::View<const size_t*, BufferDeviceType>
193 num_packets_per_lid_type;
194 typedef Kokkos::View<const size_t*, DT> offsets_type;
195 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
197
198 typedef Kokkos::View<int, DT> error_type;
199 using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
200
201 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202 "LocalMap::local_ordinal_type and "
203 "LocalMatrix::ordinal_type must be the same.");
204
205 local_matrix_type local_matrix;
206 local_map_type local_col_map;
207 input_buffer_type imports;
208 num_packets_per_lid_type num_packets_per_lid;
209 import_lids_type import_lids;
210 Kokkos::View<const LO*[2], DT> batch_info;
211 offsets_type offsets;
212 Tpetra::CombineMode combine_mode;
213 size_t batch_size;
214 size_t bytes_per_value;
215 bool atomic;
216 error_type error_code;
217
221 const input_buffer_type& imports_in,
222 const num_packets_per_lid_type& num_packets_per_lid_in,
223 const import_lids_type& import_lids_in,
224 const Kokkos::View<const LO*[2], DT>& batch_info_in,
225 const offsets_type& offsets_in,
227 const size_t batch_size_in,
228 const size_t bytes_per_value_in,
229 const bool atomic_in) :
230 local_matrix (local_matrix_in),
231 local_col_map (local_col_map_in),
232 imports (imports_in),
233 num_packets_per_lid (num_packets_per_lid_in),
234 import_lids (import_lids_in),
235 batch_info (batch_info_in),
236 offsets (offsets_in),
237 combine_mode (combine_mode_in),
238 batch_size (batch_size_in),
239 bytes_per_value (bytes_per_value_in),
240 atomic (atomic_in),
241 error_code("error")
242 {}
243
245 void operator()(member_type team_member) const
246 {
247 using Kokkos::View;
248 using Kokkos::subview;
249 using Kokkos::MemoryUnmanaged;
250
251 const LO batch = team_member.league_rank();
252 const LO lid_no = batch_info(batch, 0);
253 const LO batch_no = batch_info(batch, 1);
254
255 const size_t num_bytes = num_packets_per_lid(lid_no);
256
257 // Only unpack data if there is a nonzero number of bytes.
258 if (num_bytes == 0)
259 return;
260
261 // there is actually something in the row
262 const LO import_lid = import_lids(lid_no);
263 const size_t buf_size = imports.size();
264 const size_t offset = offsets(lid_no);
265
266 // Get the number of entries to expect in the received data for this row.
267 LO num_ent_LO = 0;
268 const char* const in_buf = imports.data() + offset;
270 const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
271
272 // Count the number of bytes expected to unpack
273 size_t expected_num_bytes = 0;
274 {
278 }
279
281 {
282// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
283#ifndef KOKKOS_ENABLE_SYCL
284 printf(
285 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
287 (int) lid_no, (int) expected_num_bytes, (int) num_bytes
288 );
289#endif
290 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
291 return;
292 }
293
295 {
296// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
297#ifndef KOKKOS_ENABLE_SYCL
298 printf(
299 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300 "At row %d, the offset (%d) > buffer size (%d)\n",
301 (int) lid_no, (int) offset, (int) buf_size
302 );
303#endif
304 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
305 return;
306 }
307
308 // Determine the number of entries to unpack in this batch
309 size_t num_entries_in_batch = 0;
310 if (num_entries_in_row <= batch_size)
312 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
313 num_entries_in_batch = batch_size;
314 else
316
317 const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
318 const size_t num_ent_start = offset;
319 const size_t num_ent_end = num_ent_start + bytes_per_lid;
320
322 const size_t gids_start = num_ent_end;
324
325 const size_t vals_start = gids_end;
326
327 const size_t shift = batch_no * batch_size;
328 const char* const num_ent_in = imports.data() + num_ent_start;
329 const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
330 const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
331
332 LO num_ent_out;
334 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
335 {
336// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
337#ifndef KOKKOS_ENABLE_SYCL
338 printf(
339 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
341 (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
342 );
343#endif
344 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
345 }
346
347 constexpr bool matrix_has_sorted_rows = true; // see #6282
348 //Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
349 //By ref triggers compiler bug in CUDA 10.
350 Kokkos::parallel_for(
351 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
352 [=](const LO& j)
353 {
354 size_t distance = 0;
355
356 GO gid_out;
359 auto lid_out = local_col_map.getLocalElement(gid_out);
360
361 // Column indices come in as global indices, in case the
362 // source object's column Map differs from the target object's
363 // (this's) column Map, and must be converted local index values
364
365 // assume that ST is default constructible
366 ST val_out;
367 distance = j * bytes_per_value;
369
370 if (combine_mode == ADD) {
371 // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
372 // different threads don't touch the same row (i.e., no
373 // duplicates in incoming LIDs list).
374 const bool use_atomic_updates = atomic;
375 (void)local_matrix.sumIntoValues(
377 &lid_out,
378 1,
379 &val_out,
382 );
383 } else if (combine_mode == REPLACE) {
384 // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
385 // combine mode with multiple incoming rows that touch the same
386 // target matrix entries, so we never need atomic updates.
387 const bool use_atomic_updates = false;
388 (void)local_matrix.replaceValues(
390 &lid_out,
391 1,
392 &val_out,
395 );
396 } else {
397 // should never get here
398// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
399#ifndef KOKKOS_ENABLE_SYCL
400 printf(
401 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
402 "At row %d, an unknown error occurred during unpack\n", (int) lid_no
403 );
404#endif
405 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
406 }
407 }
408 );
409
410 team_member.team_barrier();
411
412 }
413
415 int error() const {
416 auto error_code_h = Kokkos::create_mirror_view_and_copy(
417 Kokkos::HostSpace(), error_code
418 );
419 return error_code_h();
420 }
421
422};
423
424struct MaxNumEntTag {};
425struct TotNumEntTag {};
426
435template<class LO, class DT, class BDT>
437public:
438 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
439 typedef Kokkos::View<const size_t*, DT> offsets_type;
440 typedef Kokkos::View<const char*, BDT> input_buffer_type;
441 // This needs to be public, since it appears in the argument list of
442 // public methods (see below). Otherwise, build errors may happen.
443 typedef size_t value_type;
444
445private:
446 num_packets_per_lid_type num_packets_per_lid;
447 offsets_type offsets;
448 input_buffer_type imports;
449
450public:
451 NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
452 const offsets_type& offsets_in,
453 const input_buffer_type& imports_in) :
454 num_packets_per_lid (num_packets_per_lid_in),
455 offsets (offsets_in),
456 imports (imports_in)
457 {}
458
460 operator() (const MaxNumEntTag, const LO i, value_type& update) const {
461 // Get how many entries to expect in the received data for this row.
462 const size_t num_bytes = num_packets_per_lid(i);
463 if (num_bytes > 0) {
464 LO num_ent_LO = 0; // output argument of unpackValue
465 const char* const in_buf = imports.data () + offsets(i);
467 const size_t num_ent = static_cast<size_t> (num_ent_LO);
468
469 update = (update < num_ent) ? num_ent : update;
470 }
471 }
472
474 join (const MaxNumEntTag,
475 value_type& dst,
476 const value_type& src) const
477 {
478 if (dst < src) dst = src;
479 }
480
482 operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
483 // Get how many entries to expect in the received data for this row.
484 const size_t num_bytes = num_packets_per_lid(i);
485 if (num_bytes > 0) {
486 LO num_ent_LO = 0; // output argument of unpackValue
487 const char* const in_buf = imports.data () + offsets(i);
489 tot_num_ent += static_cast<size_t> (num_ent_LO);
490 }
491 }
492};
493
501template<class LO, class DT, class BDT>
502size_t
504 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
505 const Kokkos::View<const size_t*, DT>& offsets,
506 const Kokkos::View<const char*, BDT>& imports)
507{
508 typedef typename DT::execution_space XS;
509 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
510 MaxNumEntTag> range_policy;
511
512 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
513 imports);
514 const LO numRowsToUnpack =
515 static_cast<LO> (num_packets_per_lid.extent (0));
516 size_t max_num_ent = 0;
517 Kokkos::parallel_reduce ("Max num entries in CRS",
519 functor, max_num_ent);
520 return max_num_ent;
521}
522
530template<class LO, class DT, class BDT>
531size_t
533 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
534 const Kokkos::View<const size_t*, DT>& offsets,
535 const Kokkos::View<const char*, BDT>& imports)
536{
537 typedef typename DT::execution_space XS;
538 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
539 size_t tot_num_ent = 0;
540 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
541 imports);
542 const LO numRowsToUnpack =
543 static_cast<LO> (num_packets_per_lid.extent (0));
544 Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
547 return tot_num_ent;
548}
549
550template<class LO>
552size_t
553unpackRowCount(const char imports[],
554 const size_t offset,
555 const size_t num_bytes)
556{
557 using PT = PackTraits<LO>;
558
559 LO num_ent_LO = 0;
560 if (num_bytes > 0) {
561 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
562 if (p_num_bytes > num_bytes) {
564 }
565 const char* const in_buf = imports + offset;
566 (void) PT::unpackValue(num_ent_LO, in_buf);
567 }
568 return static_cast<size_t>(num_ent_LO);
569}
570
575template<class View1, class View2>
576inline
577bool
579 const View1& batches_per_lid,
580 View2& batch_info
581)
582{
583 using LO = typename View2::value_type;
584 size_t batch = 0;
585 for (size_t i=0; i<batches_per_lid.extent(0); i++)
586 {
587 for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
588 {
589 batch_info(batch, 0) = static_cast<LO>(i);
590 batch_info(batch, 1) = batch_no;
591 batch++;
592 }
593 }
594 return batch == batch_info.extent(0);
595}
596
604template<class LocalMatrix, class LocalMap, class BufferDeviceType>
605void
607 const LocalMatrix& local_matrix,
608 const LocalMap& local_map,
609 const Kokkos::View<const char*, BufferDeviceType>& imports,
610 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
612 const Tpetra::CombineMode combine_mode)
613{
614 using ST = typename LocalMatrix::value_type;
615 using LO = typename LocalMap::local_ordinal_type;
616 using DT = typename LocalMap::device_type;
617 using XS = typename DT::execution_space;
618 const char prefix[] =
619 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
620 "unpackAndCombineIntoCrsMatrix: ";
621
622 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
623 if (num_import_lids == 0) {
624 // Nothing to unpack
625 return;
626 }
627
628 {
629 // Check for correct input
630 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
631 std::invalid_argument,
632 prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
633 "static graph (i.e., was constructed with the CrsMatrix constructor "
634 "that takes a const CrsGraph pointer).");
635
636 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
637 std::invalid_argument,
638 prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
639 "(i.e., was constructed with the CrsMatrix constructor that takes a "
640 "const CrsGraph pointer).");
641
642 // Unknown combine mode!
643 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
644 std::invalid_argument,
645 prefix << "Invalid combine mode; should never get "
646 "here! Please report this bug to the Tpetra developers.");
647
648 // Check that sizes of input objects are consistent.
650 num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
652 std::invalid_argument,
653 prefix << "importLIDs.size() (" << num_import_lids << ") != "
654 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
655 } // end QA error checking
656
657 // Get the offsets
658 Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
659 computeOffsetsFromCounts(offsets, num_packets_per_lid);
660
661 // Determine the sizes of the unpack batches
662 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
664 const size_t batch_size = std::min(default_batch_size, max_num_ent);
665
666 // To achieve some balance amongst threads, unpack each row in equal size batches
667 size_t num_batches = 0;
668 Kokkos::View<LO*[2], DT> batch_info("", num_batches);
669 Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
670 // Compute meta data that allows batch unpacking
671 Kokkos::parallel_reduce(
672 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
673 KOKKOS_LAMBDA(const size_t i, size_t& batches)
674 {
676 imports.data(), offsets(i), num_packets_per_lid(i)
677 );
679 (num_entries_in_row <= batch_size) ?
680 1 :
681 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
683 },
685 );
686 Kokkos::resize(batch_info, num_batches);
687
688 Kokkos::HostSpace host_space;
689 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
690 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
691 Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
692
693 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
694
696 // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
697 Kokkos::deep_copy(XS(), batch_info, batch_info_h);
698
699 // FIXME (TJF SEP 2017)
700 // The scalar type is not necessarily default constructible
701 size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
702
703 // Now do the actual unpack!
704 const bool atomic = XS().concurrency() != 1;
706 functor f(
707 local_matrix,
708 local_map,
709 imports,
710 num_packets_per_lid,
711 import_lids,
712 batch_info,
713 offsets,
714 combine_mode,
715 batch_size,
716 bytes_per_value,
717 atomic
718 );
719
720 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
722#if defined(KOKKOS_ENABLE_CUDA)
723 constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
724#else
725 constexpr bool is_cuda = false;
726#endif
727 if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
728 {
729 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
730 }
731 else
732 {
733 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
734 }
735
736 auto error_code = f.error();
738 error_code != 0,
739 std::runtime_error,
740 prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
741 );
742}
743
744template<class LocalMatrix, class BufferDeviceType>
745size_t
747 const LocalMatrix& local_matrix,
749 const Kokkos::View<const char*, BufferDeviceType>& imports,
750 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
751 const size_t num_same_ids)
752{
753 using Kokkos::parallel_reduce;
754 typedef typename LocalMatrix::ordinal_type LO;
755 typedef typename LocalMatrix::device_type device_type;
756 typedef typename device_type::execution_space XS;
757 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
758 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
759 typedef BufferDeviceType BDT;
760
761 size_t count = 0;
762 LO num_items;
763
764 // Number of matrix entries to unpack (returned by this function).
765 num_items = static_cast<LO>(num_same_ids);
766 if (num_items) {
767 size_t kcnt = 0;
769 KOKKOS_LAMBDA(const LO lid, size_t& update) {
770 update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
771 -local_matrix.graph.row_map[lid]);
772 }, kcnt);
773 count += kcnt;
774 }
775
776 // Count entries copied directly from the source matrix with permuting.
777 num_items = static_cast<LO>(permute_from_lids.extent(0));
778 if (num_items) {
779 size_t kcnt = 0;
780 parallel_reduce(range_policy(0, num_items),
781 KOKKOS_LAMBDA(const LO i, size_t& update) {
782 const LO lid = permute_from_lids(i);
783 update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
784 - local_matrix.graph.row_map[lid]);
785 }, kcnt);
786 count += kcnt;
787 }
788
789 {
790 // Count entries received from other MPI processes.
791 const size_type np = num_packets_per_lid.extent(0);
792 Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
793 computeOffsetsFromCounts(offsets, num_packets_per_lid);
794 count +=
795 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
796 offsets, imports);
797 }
798
799 return count;
800}
801
803template<class LO, class DT, class BDT>
804int
805setupRowPointersForRemotes(
807 const typename PackTraits<LO>::input_array_type& import_lids,
808 const Kokkos::View<const char*, BDT>& imports,
809 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
810 const typename PackTraits<size_t>::input_array_type& offsets)
811{
812 using Kokkos::parallel_reduce;
813 typedef typename DT::execution_space XS;
814 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
815 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
816
818 const size_type N = num_packets_per_lid.extent(0);
819
820 int errors = 0;
821 parallel_reduce ("Setup row pointers for remotes",
822 range_policy (0, N),
823 KOKKOS_LAMBDA (const size_t i, int& k_error) {
824 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
825 const size_t num_bytes = num_packets_per_lid(i);
826 const size_t offset = offsets(i);
827 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
828 if (num_ent == InvalidNum) {
829 k_error += 1;
830 }
831 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
832 }, errors);
833 return errors;
834}
835
836// Convert array of row lengths to a CRS pointer array
837template<class DT>
838void
839makeCrsRowPtrFromLengths(
841 const Kokkos::View<size_t*,DT>& new_start_row)
842{
843 using Kokkos::parallel_scan;
844 typedef typename DT::execution_space XS;
845 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
847 const size_type N = new_start_row.extent(0);
849 KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
850 auto cur_val = tgt_rowptr(i);
851 if (final) {
852 tgt_rowptr(i) = update;
854 }
855 update += cur_val;
856 }
857 );
858}
859
860template<class LocalMatrix, class LocalMap>
861void
862copyDataFromSameIDs(
864 const typename PackTraits<int>::output_array_type& tgt_pids,
866 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
867 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
868 const typename PackTraits<int>::input_array_type& src_pids,
869 const LocalMatrix& local_matrix,
870 const LocalMap& local_col_map,
871 const size_t num_same_ids,
872 const int my_pid)
873{
874 using Kokkos::parallel_for;
875 typedef typename LocalMap::device_type DT;
876 typedef typename LocalMap::local_ordinal_type LO;
877 typedef typename DT::execution_space XS;
878 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
879
880 parallel_for(range_policy(0, num_same_ids),
881 KOKKOS_LAMBDA(const size_t i) {
882 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
883
884 const LO src_lid = static_cast<LO>(i);
885 size_t src_row = local_matrix.graph.row_map(src_lid);
886
887 const LO tgt_lid = static_cast<LO>(i);
888 const size_t tgt_row = tgt_rowptr(tgt_lid);
889
890 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
891 - local_matrix.graph.row_map(src_lid);
892 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
893
894 for (size_t j=local_matrix.graph.row_map(src_lid);
895 j<local_matrix.graph.row_map(src_lid+1); ++j) {
896 LO src_col = local_matrix.graph.entries(j);
897 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
898 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
899 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
900 }
901 }
902 );
903}
904
905template<class LocalMatrix, class LocalMap>
906void
907copyDataFromPermuteIDs(
909 const typename PackTraits<int>::output_array_type& tgt_pids,
911 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
912 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
913 const typename PackTraits<int>::input_array_type& src_pids,
916 const LocalMatrix& local_matrix,
917 const LocalMap& local_col_map,
918 const int my_pid)
919{
920 using Kokkos::parallel_for;
921 typedef typename LocalMap::device_type DT;
922 typedef typename LocalMap::local_ordinal_type LO;
923 typedef typename DT::execution_space XS;
924 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
925 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
926
927 const size_type num_permute_to_lids = permute_to_lids.extent(0);
928
929 parallel_for(range_policy(0, num_permute_to_lids),
930 KOKKOS_LAMBDA(const size_t i) {
931 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
932
933 const LO src_lid = permute_from_lids(i);
934 const size_t src_row = local_matrix.graph.row_map(src_lid);
935
936 const LO tgt_lid = permute_to_lids(i);
937 const size_t tgt_row = tgt_rowptr(tgt_lid);
938
939 size_t nsr = local_matrix.graph.row_map(src_lid+1)
940 - local_matrix.graph.row_map(src_lid);
941 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
942
943 for (size_t j=local_matrix.graph.row_map(src_lid);
944 j<local_matrix.graph.row_map(src_lid+1); ++j) {
945 LO src_col = local_matrix.graph.entries(j);
946 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
947 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
948 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
949 }
950 }
951 );
952}
953
954template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
955int
956unpackAndCombineIntoCrsArrays2(
958 const typename PackTraits<int>::output_array_type& tgt_pids,
960 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
961 const typename PackTraits<size_t>::input_array_type& offsets,
963 const Kokkos::View<const char*, BufferDeviceType>& imports,
964 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
965 const LocalMatrix& /* local_matrix */,
966 const LocalMap /*& local_col_map*/,
967 const int my_pid,
968 const size_t bytes_per_value)
969{
970 using Kokkos::View;
971 using Kokkos::subview;
972 using Kokkos::MemoryUnmanaged;
973 using Kokkos::parallel_reduce;
974 using Kokkos::atomic_fetch_add;
976 typedef typename LocalMap::device_type DT;
977 typedef typename LocalMap::local_ordinal_type LO;
978 typedef typename LocalMap::global_ordinal_type GO;
979 typedef typename LocalMatrix::value_type ST;
980 typedef typename DT::execution_space XS;
981 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
982 typedef typename Kokkos::pair<size_type, size_type> slice;
983 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
984
985 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
986 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
987 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
988
989 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
990
991 int errors = 0;
992 const size_type num_import_lids = import_lids.size();
993
994 // RemoteIDs: Loop structure following UnpackAndCombine
995 parallel_reduce ("Unpack and combine into CRS",
996 range_policy (0, num_import_lids),
997 KOKKOS_LAMBDA (const size_t i, int& k_error) {
998 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
999 const size_t num_bytes = num_packets_per_lid(i);
1000 const size_t offset = offsets(i);
1001 if (num_bytes == 0) {
1002 // Empty buffer means that the row is empty.
1003 return;
1004 }
1005 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1006 if (num_ent == InvalidNum) {
1007 k_error += 1;
1008 return;
1009 }
1010 const LO lcl_row = import_lids(i);
1011 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1012 const size_t end_row = start_row + num_ent;
1013
1014 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1015 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1016 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1017
1018 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1019 imports.data(), offset, num_bytes,
1020 num_ent, bytes_per_value);
1021
1022 // Correct target PIDs.
1023 for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1024 const int pid = pids_out(j);
1025 pids_out(j) = (pid != my_pid) ? pid : -1;
1026 }
1027 }, errors);
1028
1029 return errors;
1030}
1031
1032template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1033void
1035 const LocalMatrix & local_matrix,
1036 const LocalMap & local_col_map,
1038 const Kokkos::View<const char*, BufferDeviceType>& imports,
1039 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1042 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1045 const typename PackTraits<int>::input_array_type& src_pids,
1046 const typename PackTraits<int>::output_array_type& tgt_pids,
1047 const size_t num_same_ids,
1048 const size_t tgt_num_rows,
1049 const size_t tgt_num_nonzeros,
1050 const int my_tgt_pid,
1051 const size_t bytes_per_value)
1052{
1053 using Kokkos::View;
1054 using Kokkos::subview;
1055 using Kokkos::parallel_for;
1056 using Kokkos::MemoryUnmanaged;
1058 typedef typename LocalMap::device_type DT;
1059 typedef typename LocalMap::local_ordinal_type LO;
1060 typedef typename DT::execution_space XS;
1061 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1062 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1063 typedef BufferDeviceType BDT;
1064
1065 const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1066
1067 const size_t N = tgt_num_rows;
1068
1069 // In the case of reduced communicators, the sourceMatrix won't have
1070 // the right "my_pid", so thus we have to supply it.
1071 const int my_pid = my_tgt_pid;
1072
1073 // Zero the rowptr
1074 parallel_for(range_policy(0, N+1),
1075 KOKKOS_LAMBDA(const size_t i) {
1076 tgt_rowptr(i) = 0;
1077 }
1078 );
1079
1080 // same IDs: Always first, always in the same place
1081 parallel_for(range_policy(0, num_same_ids),
1082 KOKKOS_LAMBDA(const size_t i) {
1083 const LO tgt_lid = static_cast<LO>(i);
1084 const LO src_lid = static_cast<LO>(i);
1085 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1086 - local_matrix.graph.row_map(src_lid);
1087 }
1088 );
1089
1090 // Permute IDs: Still local, but reordered
1091 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1092 parallel_for(range_policy(0, num_permute_to_lids),
1093 KOKKOS_LAMBDA(const size_t i) {
1094 const LO tgt_lid = permute_to_lids(i);
1095 const LO src_lid = permute_from_lids(i);
1096 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1097 - local_matrix.graph.row_map(src_lid);
1098 }
1099 );
1100
1101 // Get the offsets from the number of packets per LID
1102 const size_type num_import_lids = import_lids.extent(0);
1103 View<size_t*, DT> offsets("offsets", num_import_lids+1);
1104 computeOffsetsFromCounts(offsets, num_packets_per_lid);
1105
1106#ifdef HAVE_TPETRA_DEBUG
1107 {
1108 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1109 const bool condition =
1110 nth_offset_h != static_cast<size_t>(imports.extent (0));
1111 TEUCHOS_TEST_FOR_EXCEPTION
1112 (condition, std::logic_error, prefix
1113 << "The final offset in bytes " << nth_offset_h
1114 << " != imports.size() = " << imports.extent(0)
1115 << ". Please report this bug to the Tpetra developers.");
1116 }
1117#endif // HAVE_TPETRA_DEBUG
1118
1119 // Setup row pointers for remotes
1120 int k_error =
1121 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1122 import_lids, imports, num_packets_per_lid, offsets);
1123 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1124 << " Error transferring data to target row pointers. "
1125 "Please report this bug to the Tpetra developers.");
1126
1127 // If multiple processes contribute to the same row, we may need to
1128 // update row offsets. This tracks that.
1129 View<size_t*, DT> new_start_row ("new_start_row", N+1);
1130
1131 // Turn row length into a real CRS row pointer
1132 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1133
1134 // SameIDs: Copy the data over
1135 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1136 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1137
1138 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1139 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1140 local_matrix, local_col_map, my_pid);
1141
1142 if (imports.extent(0) <= 0) {
1143 return;
1144 }
1145
1146 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1147 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1148 local_matrix, local_col_map, my_pid, bytes_per_value);
1149 TEUCHOS_TEST_FOR_EXCEPTION(
1150 unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1151 "should never happen. Please report this bug to the Tpetra developers.");
1152
1153 return;
1154}
1155
1156} // namespace UnpackAndCombineCrsMatrixImpl
1157
1197template<typename ST, typename LO, typename GO, typename Node>
1198void
1201 const Teuchos::ArrayView<const char>& imports,
1202 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1203 const Teuchos::ArrayView<const LO>& importLIDs,
1204 size_t /* constantNumPackets */,
1206{
1207 using Kokkos::View;
1208 typedef typename Node::device_type device_type;
1209 typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1210 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1211 "Node::device_type and LocalMatrix::device_type must be the same.");
1212
1213 // Convert all Teuchos::Array to Kokkos::View.
1214 device_type outputDevice;
1215
1216 // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1217 // them to device. Since unpacking is done directly in to the local matrix
1218 // (lclMatrix), no copying needs to be performed after unpacking.
1221 numPacketsPerLID.size(), true, "num_packets_per_lid");
1222
1223 auto import_lids_d =
1225 importLIDs.size(), true, "import_lids");
1226
1227 auto imports_d =
1229 imports.size(), true, "imports");
1230
1231 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1232 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1233
1234//KDDKDD This loop doesn't appear to do anything; what is it?
1235//KDDKDD for (int i=0; i<importLIDs.size(); i++)
1236//KDDKDD {
1237//KDDKDD auto lclRow = importLIDs[i];
1238//KDDKDD Teuchos::ArrayView<const LO> A_indices;
1239//KDDKDD Teuchos::ArrayView<const ST> A_values;
1240//KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1241//KDDKDD }
1242 // Now do the actual unpack!
1243 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1244 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1246
1247}
1248
1249template<typename ST, typename LO, typename GO, typename NT>
1250void
1251unpackCrsMatrixAndCombineNew(
1253 Kokkos::DualView<char*,
1255 Kokkos::DualView<size_t*,
1257 const Kokkos::DualView<const LO*,
1259 const size_t /* constantNumPackets */,
1261{
1262 using Kokkos::View;
1263 using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1264 using dist_object_type = DistObject<char, LO, GO, NT>;
1265 using device_type = typename crs_matrix_type::device_type;
1266 using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1267 using buffer_device_type = typename dist_object_type::buffer_device_type;
1268
1269 static_assert
1270 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1271 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1272 "must be the same.");
1273
1274 if (numPacketsPerLID.need_sync_device()) {
1275 numPacketsPerLID.sync_device ();
1276 }
1277 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1278
1279 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1280 auto import_lids_d = importLIDs.view_device ();
1281
1282 if (imports.need_sync_device()) {
1283 imports.sync_device ();
1284 }
1285 auto imports_d = imports.view_device ();
1286
1287 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1288 auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1289 typedef decltype (local_col_map) local_map_type;
1290
1291 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1292 local_matrix_device_type,
1293 local_map_type,
1294 buffer_device_type
1295 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1296 import_lids_d, combineMode);
1297}
1298
1345//
1354template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1355size_t
1358 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1359 const Teuchos::ArrayView<const char> &imports,
1360 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1361 size_t /* constantNumPackets */,
1362 CombineMode /* combineMode */,
1363 size_t numSameIDs,
1364 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1365 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1366{
1367 using Kokkos::MemoryUnmanaged;
1368 using Kokkos::View;
1369 typedef typename Node::device_type DT;
1371 const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1372
1374 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1375 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1376 "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1377 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1378 // process, then the matrix is neither locally nor globally indexed.
1379 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1381 (! locallyIndexed, std::invalid_argument, prefix << "The input "
1382 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1384 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1385 prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1386 "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1387
1388 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1389 auto permute_from_lids_d =
1391 permuteFromLIDs.getRawPtr (),
1392 permuteFromLIDs.size (), true,
1393 "permute_from_lids");
1394 auto imports_d =
1396 imports.getRawPtr (),
1397 imports.size (), true,
1398 "imports");
1401 numPacketsPerLID.getRawPtr (),
1402 numPacketsPerLID.size (), true,
1403 "num_packets_per_lid");
1404
1405 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1406 local_matrix, permute_from_lids_d, imports_d,
1407 num_packets_per_lid_d, numSameIDs);
1408}
1409
1424template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1425void
1428 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1429 const Teuchos::ArrayView<const char>& imports,
1430 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1431 const size_t /* constantNumPackets */,
1432 const CombineMode /* combineMode */,
1433 const size_t numSameIDs,
1434 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1435 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1436 size_t TargetNumRows,
1437 size_t TargetNumNonzeros,
1438 const int MyTargetPID,
1439 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1440 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1442 const Teuchos::ArrayView<const int>& SourcePids,
1443 Teuchos::Array<int>& TargetPids)
1444{
1445 using execution_space = typename Node::execution_space;
1447
1448 using Kokkos::View;
1449 using Kokkos::deep_copy;
1450
1451 using Teuchos::ArrayView;
1452 using Teuchos::outArg;
1453 using Teuchos::REDUCE_MAX;
1454 using Teuchos::reduceAll;
1455
1456 typedef LocalOrdinal LO;
1457
1458 typedef typename Node::device_type DT;
1459
1461 typedef typename matrix_type::impl_scalar_type ST;
1462 typedef typename ArrayView<const LO>::size_type size_type;
1463
1464 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1465
1467 TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1468 std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1469 CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1470
1472 permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1473 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
1474 << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
1475 const size_type numImportLIDs = importLIDs.size ();
1476
1478 numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1479 prefix << "importLIDs.size() = " << numImportLIDs << " != "
1480 "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1481
1482 // Preseed TargetPids with -1 for local
1483 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1485 }
1486 TargetPids.assign (TargetNumNonzeros, -1);
1487
1488 // Grab pointers for sourceMatrix
1489 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1490 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1491
1492 // Convert input arrays to Kokkos::View
1493 DT outputDevice;
1494 auto import_lids_d =
1496 importLIDs.size(), true, "import_lids");
1497
1498 auto imports_d =
1500 imports.size(), true, "imports");
1501
1504 numPacketsPerLID.size(), true, "num_packets_per_lid");
1505
1506 auto permute_from_lids_d =
1508 permuteFromLIDs.size(), true, "permute_from_lids");
1509
1510 auto permute_to_lids_d =
1512 permuteToLIDs.size(), true, "permute_to_lids");
1513
1514 auto crs_rowptr_d =
1516 CRS_rowptr.size(), true, "crs_rowptr");
1517
1518 auto crs_colind_d =
1520 CRS_colind.size(), true, "crs_colidx");
1521
1522#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1523 static_assert (! std::is_same<
1524 typename std::remove_const<
1525 typename std::decay<
1526 decltype (CRS_vals)
1527 >::type::value_type
1528 >::type,
1529 std::complex<double> >::value,
1530 "CRS_vals::value_type is std::complex<double>; this should never happen"
1531 ", since std::complex does not work in Kokkos::View objects.");
1532#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1533
1534 auto crs_vals_d =
1536 CRS_vals.size(), true, "crs_vals");
1537
1538#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1539 static_assert (! std::is_same<
1540 typename decltype (crs_vals_d)::non_const_value_type,
1541 std::complex<double> >::value,
1542 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1543 "never happen, since std::complex does not work in Kokkos::View objects.");
1544#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1545
1546 auto src_pids_d =
1548 SourcePids.size(), true, "src_pids");
1549
1550 auto tgt_pids_d =
1552 TargetPids.size(), true, "tgt_pids");
1553
1554 size_t bytes_per_value = 0;
1556 // assume that ST is default constructible
1557 bytes_per_value = PackTraits<ST>::packValueCount(ST());
1558 }
1559 else {
1560 // Since the packed data come from the source matrix, we can use the source
1561 // matrix to get the number of bytes per Scalar value stored in the matrix.
1562 // This assumes that all Scalar values in the source matrix require the same
1563 // number of bytes. If the source matrix has no entries on the calling
1564 // process, then we hope that some process does have some idea how big
1565 // a Scalar value is. Of course, if no processes have any entries, then no
1566 // values should be packed (though this does assume that in our packing
1567 // scheme, rows with zero entries take zero bytes).
1568 size_t bytes_per_value_l = 0;
1569 if (local_matrix.values.extent(0) > 0) {
1570 const ST& val = local_matrix.values(0);
1572 } else {
1573 const ST& val = crs_vals_d(0);
1575 }
1576 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1577 Teuchos::REDUCE_MAX,
1579 outArg(bytes_per_value));
1580 }
1581
1582#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1583 static_assert (! std::is_same<
1584 typename decltype (crs_vals_d)::non_const_value_type,
1585 std::complex<double> >::value,
1586 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1587 "never happen, since std::complex does not work in Kokkos::View objects.");
1588#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1589
1590 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1591 local_matrix, local_col_map, import_lids_d, imports_d,
1595 bytes_per_value);
1596
1597 // Copy outputs back to host
1598 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1599 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1600 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1601 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1602
1603 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1604 CRS_colind.getRawPtr(), CRS_colind.size());
1605 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1606 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1607
1608 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1609 CRS_vals.getRawPtr(), CRS_vals.size());
1610 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1611 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1612
1613 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1614 TargetPids.getRawPtr(), TargetPids.size());
1615 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1616 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1617}
1618
1619} // namespace Details
1620} // namespace Tpetra
1621
1622#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1623 template void \
1624 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1625 const CrsMatrix<ST, LO, GO, NT>&, \
1626 const Teuchos::ArrayView<const char>&, \
1627 const Teuchos::ArrayView<const size_t>&, \
1628 const Teuchos::ArrayView<const LO>&, \
1629 size_t, \
1630 CombineMode); \
1631 template void \
1632 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1633 const CrsMatrix<ST, LO, GO, NT>&, \
1634 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1635 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1636 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1637 const size_t, \
1638 const CombineMode); \
1639 template void \
1640 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1641 const CrsMatrix<ST, LO, GO, NT> &, \
1642 const Teuchos::ArrayView<const LO>&, \
1643 const Teuchos::ArrayView<const char>&, \
1644 const Teuchos::ArrayView<const size_t>&, \
1645 const size_t, \
1646 const CombineMode, \
1647 const size_t, \
1648 const Teuchos::ArrayView<const LO>&, \
1649 const Teuchos::ArrayView<const LO>&, \
1650 size_t, \
1651 size_t, \
1652 const int, \
1653 const Teuchos::ArrayView<size_t>&, \
1654 const Teuchos::ArrayView<GO>&, \
1655 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1656 const Teuchos::ArrayView<const int>&, \
1657 Teuchos::Array<int>&); \
1658 template size_t \
1659 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1660 const CrsMatrix<ST, LO, GO, NT> &, \
1661 const Teuchos::ArrayView<const LO> &, \
1662 const Teuchos::ArrayView<const char> &, \
1663 const Teuchos::ArrayView<const size_t>&, \
1664 size_t, \
1665 CombineMode, \
1666 size_t, \
1667 const Teuchos::ArrayView<const LO>&, \
1668 const Teuchos::ArrayView<const LO>&);
1669
1670#endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Struct that holds views of the contents of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.