Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
41#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
42
43#include "TpetraCore_config.h"
44#include "Teuchos_Array.hpp"
45#include "Teuchos_ArrayView.hpp"
54#include "Kokkos_Core.hpp"
55#include <memory>
56#include <string>
57
76
77namespace Tpetra {
78
79//
80// Users must never rely on anything in the Details namespace.
81//
82namespace Details {
83
84namespace UnpackAndCombineCrsGraphImpl {
85
95template<class Packet, class GO, class Device, class BufferDevice>
96KOKKOS_FUNCTION int
97unpackRow (const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
98 const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
99 const Kokkos::View<const Packet*,BufferDevice>& imports,
100 const size_t offset,
101 const size_t num_ent)
102{
103 using size_type = typename Kokkos::View<GO*,Device>::size_type;
104
105 if (num_ent == 0) {
106 // Empty rows always take zero bytes, to ensure sparsity.
107 return 0;
108 }
109
110 // Unpack GIDs
111 for (size_type k=0; k<num_ent; k++)
112 gids_out(k) = imports(offset+k);
113
114 // Unpack PIDs
115 if (pids_out.size() > 0) {
116 for (size_type k=0; k<num_ent; k++) {
117 pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
118 }
119 }
120
121 return 0;
122}
123
// Reduction functor that unpacks received rows and appends their column
// indices to a graph's index array.
//
// NOTE(review): this listing omits several lines of the class (its embedded
// numbering jumps 138->140, 144->147, 175->180, 205->207, 225->227, 234->237,
// 239->241, 256->258). The class head, some type aliases (indices_type,
// buffer_device_type, gids_out_type, pids_out_type), the start of the
// constructor, and the condition guarding error code 2 are not visible here.
// The class is presumably named UnpackAndCombineFunctor (that name appears in
// the caller's error message) -- confirm against the real header.
//
// The reduction value is a pair (error code, row index). Error codes set in
// operator():
//   0 - no error (initial value)
//   1 - unpack_pids is set but the row's packet count is odd
//   2 - marked "out of bounds" (guarding condition not visible here)
//   3 - unpackRow returned a nonzero value
134template<class LocalOrdinal,
135 class Packet,
136 class RowView,
137 class IndicesView,
138 class BufferDevice>
140
141 using LO = LocalOrdinal;
142 using GO = typename IndicesView::value_type;
143 using packet_type = Packet;
144 using row_ptrs_type = RowView;
147
148 using device_type = typename IndicesView::device_type;
149 using execution_space = typename device_type::execution_space;
150
151 using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
152 using offsets_type = Kokkos::View<const size_t*, device_type>;
153 using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
154 using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
155
156 using gids_scratch_type = Kokkos::View<GO*, device_type>;
157 using pids_scratch_type = Kokkos::View<int*,device_type>;
158
// row_ptrs_end(r) is used by operator() as the insertion position for row r
// and is advanced as entries are appended. row_ptrs_beg is stored but not
// read in the code visible here.
159 row_ptrs_type row_ptrs_beg;
160 row_ptrs_type row_ptrs_end;
161 indices_type indices;
// Packed input buffer, number of packets per incoming row, target local row
// index per incoming row, and each row's starting position in `imports`.
162 input_buffer_type imports;
163 num_packets_per_lid_type num_packets_per_lid;
164 import_lids_type import_lids;
165 offsets_type offsets;
// Length of each per-token slice of the scratch arrays below.
166 size_t max_num_ent;
// If true, a row's packet count is halved to get its entry count.
167 bool unpack_pids;
// Hands out an index in [0, tokens.size()) that selects a private slice of
// gids_scratch / pids_scratch for the duration of one operator() call.
168 Kokkos::Experimental::UniqueToken<execution_space,
169 Kokkos::Experimental::UniqueTokenScope::Global> tokens;
170 gids_scratch_type gids_scratch;
171 pids_scratch_type pids_scratch;
172
173 public:
// Reduction value: (error code, index of the offending row).
174 using value_type = Kokkos::pair<int, LO>;
175
// Constructor (its first lines are missing from this listing). Copies every
// argument into the member of the same name and allocates both scratch
// arrays with tokens.size() * max_num_ent entries.
180 const input_buffer_type& imports_in,
181 const num_packets_per_lid_type& num_packets_per_lid_in,
182 const import_lids_type& import_lids_in,
183 const offsets_type& offsets_in,
184 const size_t max_num_ent_in,
185 const bool unpack_pids_in) :
186 row_ptrs_beg(row_ptrs_beg_in),
187 row_ptrs_end(row_ptrs_end_in),
188 indices(indices_in),
189 imports(imports_in),
190 num_packets_per_lid(num_packets_per_lid_in),
191 import_lids(import_lids_in),
192 offsets(offsets_in),
193 max_num_ent(max_num_ent_in),
194 unpack_pids(unpack_pids_in),
195 tokens(execution_space()),
196 gids_scratch("gids_scratch", tokens.size() * max_num_ent),
197 pids_scratch("pids_scratch", tokens.size() * max_num_ent)
198 {}
199
// Reduction identity: error code 0 paired with the invalid ordinal.
200 KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
201 {
202 using Tpetra::Details::OrdinalTraits;
203 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
204 }
205
// Combine two partial results, keeping the one with the smaller bad row
// index (a result whose index is invalid carries no error).
207 join(value_type& dst, const value_type& src) const
208 {
209 // `dst` should reflect the first (least) bad index and
210 // all other associated error codes and data. Thus, we need only
211 // check if the `src` object shows an error and if its associated
212 // bad index is less than `dst`'s bad index.
213 using Tpetra::Details::OrdinalTraits;
214 if (src.second != OrdinalTraits<LO>::invalid()) {
215 // An error in the src; check if
216 // 1. `dst` shows errors
217 // 2. If `dst` does show errors, if src's bad index is less than
218 // *this' bad index
219 if (dst.second == OrdinalTraits<LO>::invalid() ||
220 src.second < dst.second) {
221 dst = src;
222 }
223 }
224 }
225
// Unpack incoming row i and append its column indices to the target row
// import_lids(i). On error, dst is set to (code, i) and the row is skipped.
227 void operator()(const LO i, value_type& dst) const
228 {
229 using Kokkos::View;
230 using Kokkos::subview;
231 using Kokkos::MemoryUnmanaged;
232 using size_type = typename execution_space::size_type;
233 using slice = typename Kokkos::pair<size_type, size_type>;
234
237
238 const size_t num_packets_this_lid = num_packets_per_lid(i);
// NOTE(review): the second branch of this conditional (listing line 240) is
// not visible here.
239 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
241 if (unpack_pids && num_packets_this_lid%2 != 0) {
242 // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
243 // should never happen.
244 dst = Kokkos::make_pair(1, i);
245 return;
246 }
247
248 // Only unpack data if there is a nonzero number to unpack
249 if (num_ent == 0) {
250 return;
251 }
252
253 // there is actually something in the row
254 const size_t buf_size = imports.size();
255 const size_t offset = offsets(i);
256
// NOTE(review): the `if` that opens this error branch (listing line 257) is
// missing; presumably it compares offset (plus the row's packet count)
// against buf_size -- confirm against the real header.
258 dst = Kokkos::make_pair(2, i); // out of bounds
259 return;
260 }
261
262 // Get subviews into the scratch arrays. The token returned from acquire
263 // is an integer in [0, tokens.size()). It is used to grab a unique (to
264 // this thread) subview of the scratch arrays.
265 const size_type token = tokens.acquire();
266 const size_t a = static_cast<size_t>(token) * max_num_ent;
267 const size_t b = a + num_ent;
268 gids_out_type gids_out = subview(gids_scratch, slice(a, b));
// pids_out is an empty slice [a, a) unless unpack_pids is set, which makes
// unpackRow skip the PID half of the row.
269 pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
270
271 const int err = unpackRow (gids_out, pids_out, imports, offset, num_ent);
272
273 if (err != 0) {
274 dst = Kokkos::make_pair(3, i);
// Release the scratch slice on the error path too.
275 tokens.release(token);
276 return;
277 }
278
// Append the unpacked indices at the row's current end position. pids_out is
// not read again in this function.
279 auto import_lid = import_lids(i);
280 for (size_t k = 0; k < num_ent; ++k) {
281 indices(row_ptrs_end(import_lid)) = gids_out(k);
282 // this is OK; don't need atomic, since LIDs to pack don't have repeats.
283 row_ptrs_end(import_lid) += 1;
284 }
285
286 tokens.release(token);
287 }
288
289};
290
// Unpack the received rows in `imports` and append their column indices to
// the graph arrays (row_ptrs_beg, row_ptrs_end, indices).
//
// Steps, in order:
//   1. Return immediately if import_lids is empty.
//   2. Call padCrsArrays with `padding` to make room for incoming entries.
//   3. Compute per-row offsets into `imports` from num_packets_per_lid.
//   4. Reduce to find the largest per-row entry count (max_num_ent).
//   5. Run the unpack functor over all import LIDs as a parallel_reduce.
//
// Throws std::runtime_error if the functor reports a nonzero error code; the
// message includes the code and the first bad row.
//
// NOTE(review): this listing omits the line carrying the function name
// (numbering jumps 299->301; the name in `prefix` below is
// "unpackAndCombine"), the definition of unpack_functor_type (line 323) and
// part of the max-reduction lambda (lines 352-354).
297template<class LocalOrdinal, class GlobalOrdinal, class Node,
298 class RowView, class IndicesView, class BufferDevice>
299void
301(const RowView& row_ptrs_beg,
302 const RowView& row_ptrs_end,
303 IndicesView& indices,
304 const Kokkos::View<const GlobalOrdinal*, BufferDevice,
305 Kokkos::MemoryUnmanaged>& imports,
306 const Kokkos::View<const size_t*, BufferDevice,
307 Kokkos::MemoryUnmanaged>& num_packets_per_lid,
308 const Kokkos::View<const LocalOrdinal*, BufferDevice,
309 Kokkos::MemoryUnmanaged>& import_lids,
310 const typename CrsGraph<LocalOrdinal, GlobalOrdinal,
311 Node>::padding_type& padding,
312 const bool unpack_pids,
313 const int myRank,
314 const bool verbose)
315{
316 using LO = LocalOrdinal;
317 using GO = GlobalOrdinal;
318 using device_type = typename Node::device_type;
// Note: the kernels below run in BufferDevice's execution space, while the
// offsets array is allocated on Node::device_type.
319 using execution_space = typename BufferDevice::execution_space;
320 using range_policy =
321 Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
322 using unpack_functor_type =
324
325 const char prefix[] =
326 "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
327
328 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
329 if (num_import_lids == 0) {
330 // Nothing to unpack
331 return;
332 }
333
334 // Resize row pointers and indices to accommodate incoming data
335 padCrsArrays(row_ptrs_beg, row_ptrs_end, indices, padding,
336 myRank, verbose);
337
338 // Get the offsets
// One extra slot so the last entry can hold the total packet count.
339 Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
340 computeOffsetsFromCounts(offsets, num_packets_per_lid);
341
342 // Determine the maximum number of entries in any row in the graph. The
343 // maximum number of entries is needed to allocate unpack buffers on the
344 // device.
// max_num_ent is left uninitialized here; the Kokkos::Max reducer below is
// what assigns it.
345 size_t max_num_ent;
346 Kokkos::parallel_reduce
347 ("MaxReduce",
348 range_policy (0, LO (num_packets_per_lid.size ())),
349 KOKKOS_LAMBDA (const LO i, size_t& running_max_num_ent) {
350 const size_t num_packets_this_lid = num_packets_per_lid(i);
// With PIDs packed, each entry takes two packets, hence the division by 2.
// NOTE(review): the rest of this lambda body is missing from the listing.
351 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
355 }
356 }, Kokkos::Max<size_t> (max_num_ent));
357
358 // Now do the actual unpack!
359 unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
360 num_packets_per_lid, import_lids, offsets,
361 max_num_ent, unpack_pids);
362
// x receives the functor's (error code, first bad row) reduction result.
363 typename unpack_functor_type::value_type x;
364 Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
365 auto x_h = x.to_std_pair();
366 TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
367 prefix << "UnpackAndCombineFunctor reported error code "
368 << x_h.first << " for the first bad row " << x_h.second);
369}
370
// Count the graph entries that a subsequent unpack will produce.
//
// The returned count is the sum of three parts:
//   - the lengths of source rows [0, num_same_ids) (see note below),
//   - the lengths of the source rows listed in permute_from_lids,
//   - num_packets_per_lid(i) / 2 summed over all incoming rows.
// Row lengths are taken from local_graph.row_map. The `imports` buffer is
// accepted but not read.
//
// NOTE(review): this listing omits the line carrying the function name
// (numbering jumps 372->374; the public wrapper later in this file calls it
// as unpackAndCombineWithOwningPIDsCount) and the head of the first
// parallel_reduce call (lines 396-397), so the exact range of the "same IDs"
// reduction is presumed to be [0, num_items) -- confirm.
371template<class Packet, class LocalGraph, class BufferDevice>
372size_t
374 const LocalGraph& local_graph,
375 const Kokkos::View<const typename LocalGraph::data_type*,
376 typename LocalGraph::device_type,
377 Kokkos::MemoryUnmanaged> permute_from_lids,
378 const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
379 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
380 const size_t num_same_ids)
381{
382 using Kokkos::parallel_reduce;
383 using local_graph_type = LocalGraph;
384 using LO = typename local_graph_type::data_type;
385 using device_type = typename local_graph_type::device_type;
386 using execution_space = typename device_type::execution_space;
387 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
388
// Running total that is returned at the end.
389 size_t count = 0;
390 LO num_items;
391
392 // Number of graph entries to unpack (returned by this function).
393 num_items = static_cast<LO>(num_same_ids);
394 if (num_items) {
395 size_t kcnt = 0;
// Each iteration adds the length of source row `lid`.
398 KOKKOS_LAMBDA(const LO lid, size_t& update) {
399 update += static_cast<size_t>(local_graph.row_map[lid+1]
400 -local_graph.row_map[lid]);
401 }, kcnt);
402 count += kcnt;
403 }
404
405 // Count entries copied directly from the source graph with permuting.
406 num_items = static_cast<LO>(permute_from_lids.extent(0));
407 if (num_items) {
408 size_t kcnt = 0;
409 parallel_reduce(
410 range_policy(0, num_items),
411 KOKKOS_LAMBDA(const LO i, size_t& update) {
412 const LO lid = permute_from_lids(i);
413 update += static_cast<size_t>(local_graph.row_map[lid+1]
414 - local_graph.row_map[lid]);
415 }, kcnt);
416 count += kcnt;
417 }
418
419 {
420 // Count entries received from other MPI processes.
// Each received row contributes half its packet count.
421 size_t tot_num_ent = 0;
422 parallel_reduce("SumReduce",
423 range_policy(0,num_packets_per_lid.size()),
424 KOKKOS_LAMBDA(const int& i, size_t& lsum) {
425 lsum += num_packets_per_lid(i) / 2;
426 }, Kokkos::Sum<size_t>(tot_num_ent));
427 count += tot_num_ent;
428 }
429
430 return count;
431}
432
// Add the entry count of every received row to the row-length array.
//
// For each incoming row i, num_packets_per_lid(i) / 2 is atomically added to
// tgt_rowptr(import_lids(i)). The add is atomic, so several incoming rows may
// target the same local row. The `imports` buffer is accepted but not read.
//
// NOTE(review): this listing omits the line carrying the function name
// (numbering jumps 435->437); a later call in this file names it
// setupRowPointersForRemotes.
// NOTE(review): the using-declaration below imports Kokkos::parallel_reduce,
// but the body calls parallel_for; presumably the unqualified call resolves
// through argument-dependent lookup -- consider importing parallel_for.
434template<class Packet, class LO, class Device, class BufferDevice>
435void
437 const Kokkos::View<size_t*, Device>& tgt_rowptr,
438 const Kokkos::View<const LO*, BufferDevice>& import_lids,
439 const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
440 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
441{
442 using Kokkos::parallel_reduce;
443 using device_type = Device;
444 using execution_space = typename device_type::execution_space;
445 using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
446 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
447
// One iteration per incoming row.
448 const size_type N = num_packets_per_lid.extent(0);
449 parallel_for("Setup row pointers for remotes",
450 range_policy(0, N),
451 KOKKOS_LAMBDA(const size_t i){
// Element type of tgt_rowptr, used to cast the increment for the atomic add.
452 using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
453 const size_t num_packets_this_lid = num_packets_per_lid(i);
454 const size_t num_ent = num_packets_this_lid / 2;
455 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
456 });
457}
458
459// Convert array of row lengths to a CRS pointer array
//
// On entry tgt_rowptr(i) holds the length of row i. In the final pass of the
// scan, tgt_rowptr(i) is overwritten with the sum of all earlier lengths,
// i.e. the starting offset of row i. The scan covers
// N = new_start_row.extent(0) entries.
//
// NOTE(review): this listing omits the head of the scan call (line 472;
// presumably a parallel_scan, per the using-declaration) and one statement
// inside the `final` branch (line 478), which presumably fills
// new_start_row(i) -- confirm against the real header.
460template<class Device>
461void
462makeCrsRowPtrFromLengths(
463 const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
464 const Kokkos::View<size_t*,Device>& new_start_row)
465{
466 using Kokkos::parallel_scan;
467 using device_type = Device;
468 using execution_space = typename device_type::execution_space;
469 using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
470 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
471 const size_type N = new_start_row.extent(0);
473 range_policy(0, N),
474 KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
// Read the length before it is overwritten below.
475 auto cur_val = tgt_rowptr(i);
476 if (final) {
// `update` holds the sum of the lengths of rows before i.
477 tgt_rowptr(i) = update;
479 }
480 update += cur_val;
481 }
482 );
483}
484
485template<class LocalGraph, class LocalMap>
486void
487copyDataFromSameIDs(
488 const Kokkos::View<typename LocalMap::global_ordinal_type*,
489 typename LocalMap::device_type>& tgt_colind,
490 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
491 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
492 const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
493 const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
494 const LocalGraph& local_graph,
495 const LocalMap& local_col_map,
496 const size_t num_same_ids,
497 const int my_pid)
498{
499 using Kokkos::parallel_for;
500 using device_type = typename LocalMap::device_type;
501 using LO = typename LocalMap::local_ordinal_type;
502 using execution_space = typename device_type::execution_space;
503 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
504
505 parallel_for(
506 range_policy(0, num_same_ids),
507 KOKKOS_LAMBDA(const size_t i) {
508 using atomic_incr_type =typename std::remove_reference<decltype(new_start_row(0))>::type;
509
510 const LO src_lid = static_cast<LO>(i);
511 size_t src_row = local_graph.row_map(src_lid);
512
513 const LO tgt_lid = static_cast<LO>(i);
514 const size_t tgt_row = tgt_rowptr(tgt_lid);
515
516 const size_t nsr = local_graph.row_map(src_lid+1)
517 - local_graph.row_map(src_lid);
518 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
519
520 for (size_t j=local_graph.row_map(src_lid);
521 j<local_graph.row_map(src_lid+1); ++j) {
522 LO src_col = local_graph.entries(j);
523 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
524 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
525 }
526 }
527 );
528}
529
530template<class LocalGraph, class LocalMap, class BufferDevice>
531void
532copyDataFromPermuteIDs(
533 const Kokkos::View<typename LocalMap::global_ordinal_type*,
534 typename LocalMap::device_type>& tgt_colind,
535 const Kokkos::View<int*,
536 typename LocalMap::device_type>& tgt_pids,
537 const Kokkos::View<size_t*,
538 typename LocalMap::device_type>& new_start_row,
539 const Kokkos::View<size_t*,
540 typename LocalMap::device_type>& tgt_rowptr,
541 const Kokkos::View<const int*,
542 typename LocalMap::device_type>& src_pids,
543 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
544 BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
545 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
546 BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
547 const LocalGraph& local_graph,
548 const LocalMap& local_col_map,
549 const int my_pid)
550{
551 using Kokkos::parallel_for;
552 using device_type = typename LocalMap::device_type;
553 using LO = typename LocalMap::local_ordinal_type;
554 using execution_space = typename device_type::execution_space;
555 using size_type = typename Kokkos::View<LO*,device_type>::size_type;
556 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
557
558 const size_type num_permute_to_lids = permute_to_lids.extent(0);
559
560 parallel_for(
561 range_policy(0, num_permute_to_lids),
562 KOKKOS_LAMBDA(const size_t i) {
563 using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
564
565 const LO src_lid = permute_from_lids(i);
566 const size_t src_row = local_graph.row_map(src_lid);
567
568 const LO tgt_lid = permute_to_lids(i);
569 const size_t tgt_row = tgt_rowptr(tgt_lid);
570
571 size_t nsr = local_graph.row_map(src_lid+1)
572 - local_graph.row_map(src_lid);
573 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
574
575 for (size_t j=local_graph.row_map(src_lid);
576 j<local_graph.row_map(src_lid+1); ++j) {
577 LO src_col = local_graph.entries(j);
578 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
579 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
580 }
581 }
582 );
583}
584
585template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
586void
587unpackAndCombineIntoCrsArrays2(
588 const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
589 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
590 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
591 const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
592 const Kokkos::View<
593 const typename LocalMap::local_ordinal_type*,
594 BufferDevice,
595 Kokkos::MemoryUnmanaged>& import_lids,
596 const Kokkos::View<const Packet*, BufferDevice>& imports,
597 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
598 const LocalGraph& /* local_graph */,
599 const LocalMap /*& local_col_map*/,
600 const int my_pid)
601{
602 using Kokkos::View;
603 using Kokkos::subview;
604 using Kokkos::MemoryUnmanaged;
605 using Kokkos::parallel_reduce;
606 using Kokkos::atomic_fetch_add;
607
608 using device_type = typename LocalMap::device_type;
609 using LO = typename LocalMap::local_ordinal_type;
610 using GO = typename LocalMap::global_ordinal_type;
611 using execution_space = typename device_type::execution_space;
612 using size_type = typename Kokkos::View<LO*, device_type>::size_type;
613 using slice = typename Kokkos::pair<size_type, size_type>;
614 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
615
616 using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
617 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
618
619 const size_type num_import_lids = import_lids.size();
620 const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
621
622 // RemoteIDs: Loop structure following UnpackAndCombine
623 int gbl_err_count;
624 parallel_reduce("Unpack and combine into CRS",
625 range_policy(0, num_import_lids),
626 KOKKOS_LAMBDA(const size_t i, int& err) {
627 using atomic_incr_type = typename std::remove_reference< decltype( new_start_row(0) )>::type;
628 const size_t num_packets_this_lid = num_packets_per_lid(i);
629 const size_t num_ent = num_packets_this_lid / 2;
630 const size_t offset = offsets(i);
631 const LO lcl_row = import_lids(i);
632 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
633 const size_t end_row = start_row + num_ent;
634
635 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
636 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
637
638 err += unpackRow (gids_out, pids_out, imports, offset, num_ent);
639
640 // Correct target PIDs.
641 for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
642 const int pid = pids_out(j);
643 pids_out(j) = (pid != my_pid) ? pid : -1;
644 }
645 }, gbl_err_count);
646
647 TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
648 std::invalid_argument, prefix <<
649 "Attempting to unpack PIDs, but num_ent is not even; this should never "
650 "happen! Please report this bug to the Tpetra developers.");
651
652 return;
653}
654
// Build the target CRS arrays (tgt_rowptr, tgt_colind, tgt_pids) from three
// sources: rows shared with the source graph ("same IDs"), rows copied under
// a permutation, and rows received in `imports`.
//
// Steps, in order:
//   1. Zero tgt_rowptr(0 .. tgt_num_rows).
//   2. Store in tgt_rowptr the length of each same-ID and permuted row.
//   3. Compute offsets into `imports` from num_packets_per_lid.
//   4. Add the received rows' lengths (setupRowPointersForRemotes).
//   5. Turn the lengths into row offsets (makeCrsRowPtrFromLengths).
//   6. Copy same-ID rows, then permuted rows, then unpack received rows.
//
// Throws std::invalid_argument if, after step 5, tgt_rowptr(tgt_num_rows)
// differs from tgt_num_nonzeros. In HAVE_TPETRA_DEBUG builds, throws
// std::logic_error if the last offset differs from imports.extent(0).
//
// NOTE(review): this listing omits the line carrying the function name
// (numbering jumps 656->658); the `prefix` string below names it
// unpackAndCombineIntoCrsArrays.
655template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
656void
658 const LocalGraph & local_graph,
659 const LocalMap & local_col_map,
660 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
661 BufferDevice,
662 Kokkos::MemoryUnmanaged>& import_lids,
663 const Kokkos::View<const Packet*, BufferDevice>& imports,
664 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
665 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
666 BufferDevice,
667 Kokkos::MemoryUnmanaged>& permute_to_lids,
668 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
669 BufferDevice,
670 Kokkos::MemoryUnmanaged>& permute_from_lids,
671 const Kokkos::View<size_t*,
672 typename LocalMap::device_type,
673 Kokkos::MemoryUnmanaged>& tgt_rowptr,
674 const Kokkos::View<typename LocalMap::global_ordinal_type*,
675 typename LocalMap::device_type,
676 Kokkos::MemoryUnmanaged>& tgt_colind,
677 const Kokkos::View<const int*,
678 typename LocalMap::device_type,
679 Kokkos::MemoryUnmanaged>& src_pids,
680 const Kokkos::View<int*,
681 typename LocalMap::device_type,
682 Kokkos::MemoryUnmanaged>& tgt_pids,
683 const size_t num_same_ids,
684 const size_t tgt_num_rows,
685 const size_t tgt_num_nonzeros,
686 const int my_tgt_pid)
687{
688 using Kokkos::View;
689 using Kokkos::subview;
690 using Kokkos::parallel_for;
691 using Kokkos::MemoryUnmanaged;
692 using packet_type = Packet;
693 using local_map_type = LocalMap;
694 using local_graph_type = LocalGraph;
695 using buffer_device_type = BufferDevice;
696 using device_type = typename LocalMap::device_type;
697 using LO = typename LocalMap::local_ordinal_type;
698 using execution_space = typename device_type::execution_space;
699 using size_type = typename Kokkos::View<LO*, device_type>::size_type;
700 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
701
702 const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
703
704 const size_t N = tgt_num_rows;
705 const size_t mynnz = tgt_num_nonzeros;
706
707 // In the case of reduced communicators, the sourceGraph won't have
708 // the right "my_pid", so thus we have to supply it.
709 const int my_pid = my_tgt_pid;
710
711 // FIXME (mfh 24 Jun 2019)
712 //
713 // 1. Only zero the entries of tgt_rowptr that actually need it.
714 // 2. Consider merging these three kernels into one.
715
716 // Zero the rowptr
// N+1 entries: one per row plus the trailing total.
717 parallel_for(
718 range_policy(0, N+1),
719 KOKKOS_LAMBDA(const size_t i) {
720 tgt_rowptr(i) = 0;
721 }
722 );
723
724 // same IDs: Always first, always in the same place
// At this stage tgt_rowptr holds row lengths, not offsets.
725 parallel_for(
726 range_policy(0, num_same_ids),
727 KOKKOS_LAMBDA(const size_t i) {
728 const LO tgt_lid = static_cast<LO>(i);
729 const LO src_lid = static_cast<LO>(i);
730 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
731 - local_graph.row_map(src_lid);
732 }
733 );
734
735 // Permute IDs: Still local, but reordered
736 const size_type num_permute_to_lids = permute_to_lids.extent(0);
737 parallel_for(
738 range_policy(0, num_permute_to_lids),
739 KOKKOS_LAMBDA(const size_t i) {
740 const LO tgt_lid = permute_to_lids(i);
741 const LO src_lid = permute_from_lids(i);
742 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
743 - local_graph.row_map(src_lid);
744 }
745 );
746
747 // Get the offsets from the number of packets per LID
748 const size_type num_import_lids = import_lids.extent(0);
749 View<size_t*, device_type> offsets("offsets", num_import_lids+1);
750 computeOffsetsFromCounts(offsets, num_packets_per_lid);
751
752#ifdef HAVE_TPETRA_DEBUG
753 {
// Debug-only consistency check: the last offset must equal the buffer length.
754 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
755 const bool condition =
756 nth_offset_h != static_cast<size_t>(imports.extent(0));
757 TEUCHOS_TEST_FOR_EXCEPTION
758 (condition, std::logic_error, prefix
759 << "The final offset in bytes " << nth_offset_h
760 << " != imports.size() = " << imports.extent(0)
761 << ". Please report this bug to the Tpetra developers.");
762 }
763#endif // HAVE_TPETRA_DEBUG
764
765 // Setup row pointers for remotes
766 setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
767 tgt_rowptr, import_lids, imports, num_packets_per_lid);
768
769 // If multiple processes contribute to the same row, we may need to
770 // update row offsets. This tracks that.
771 View<size_t*, device_type> new_start_row("new_start_row", N+1);
772
773 // Turn row length into a real CRS row pointer
774 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
775 {
// The last row offset must match the entry count supplied by the caller.
// NOTE(review): the message below has no space before "!=", so the value and
// the operator run together in the output.
776 auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
777 bool condition = nth_tgt_rowptr_h != mynnz;
778 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
779 prefix << "CRS_rowptr[last] = " <<
780 nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
781 }
782
783 // SameIDs: Copy the data over
784 copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
785 tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
786
// PermuteIDs: copy the reordered local rows.
787 copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
788 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
789 local_graph, local_col_map, my_pid);
790
// Nothing was received, so there is nothing to unpack.
791 if (imports.extent(0) <= 0) {
792 return;
793 }
794
// RemoteIDs: unpack the received rows.
795 unpackAndCombineIntoCrsArrays2<
796 packet_type,local_graph_type,local_map_type,buffer_device_type>(
797 tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
798 num_packets_per_lid, local_graph, local_col_map, my_pid);
799
800 return;
801}
802
803} // namespace UnpackAndCombineCrsGraphImpl
804
// Host-facing wrapper: count the entries that unpacking into sourceGraph's
// target will produce.
//
// Validates the argument sizes, wraps the host arrays as Kokkos views on the
// buffer device, and forwards to
// UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount, whose
// result is returned. The constantNumPackets and combineMode arguments are
// unnamed and unused.
//
// The visible checks report std::invalid_argument when
//   - permuteToLIDs and permuteFromLIDs differ in size,
//   - sourceGraph is not locally indexed,
//   - importLIDs and numPacketsPerLID differ in size.
//
// NOTE(review): this listing omits several lines (numbering jumps 853->856,
// 867->869, 871->874, 879->881, 882->884, 888->891, 898->900): the function
// name and its first parameter (presumably the `sourceGraph` used below), the
// packet_type alias, the heads of the three checks (presumably
// TEUCHOS_TEST_FOR_EXCEPTION), and the declarations of permute_from_lids_d
// and num_packets_per_lid_d.
852template<class LocalOrdinal, class GlobalOrdinal, class Node>
853size_t
856 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
857 const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
858 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
859 size_t /* constantNumPackets */,
860 CombineMode /* combineMode */,
861 size_t numSameIDs,
862 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
863 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
864{
865 using Kokkos::MemoryUnmanaged;
866 using Kokkos::View;
867 using device_type = typename Node::device_type;
869 using local_graph_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_device_type;
870 using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
871 const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
872
// Check: the two permutation arrays must have the same length.
874 (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
875 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
876 "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
877 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
878 // process, then the graph is neither locally nor globally indexed.
879 const bool locallyIndexed = sourceGraph.isLocallyIndexed();
// Check: the source graph must be locally indexed.
881 (! locallyIndexed, std::invalid_argument, prefix << "The input "
882 "CrsGraph 'sourceGraph' must be locally indexed.");
// Check: there must be one packet count per import LID.
884 (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
885 prefix << "importLIDs.size() = " << importLIDs.size() << " != "
886 "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
887
888 auto local_graph = sourceGraph.getLocalGraphDevice();
// Tail of the (partly missing) statement that builds permute_from_lids_d.
891 permuteFromLIDs.getRawPtr(),
892 permuteFromLIDs.size(), true,
893 "permute_from_lids");
// Wrap the host import buffer as a view on the buffer device.
894 auto imports_d =
895 create_mirror_view_from_raw_host_array(buffer_device_type(),
896 imports.getRawPtr(),
897 imports.size(), true,
898 "imports");
// Presumably the initializer of num_packets_per_lid_d (declaration missing).
900 create_mirror_view_from_raw_host_array(buffer_device_type(),
901 numPacketsPerLID.getRawPtr(),
902 numPacketsPerLID.size(), true,
903 "num_packets_per_lid");
904
905 return UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount<
906 packet_type,local_graph_device_type,buffer_device_type>(
907 local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
908}
909
/// \brief Host-side entry point that checks the sizes of the caller's arrays,
///   wraps them in Kokkos::View objects, and forwards them to
///   UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays.
///
/// Size requirements checked below (each failure is reported with
/// std::invalid_argument and the "Tpetra::Details::unpackAndCombineIntoCrsArrays: "
/// prefix):
///   - CRS_rowptr.size() must equal TargetNumRows + 1,
///   - permuteToLIDs.size() must equal permuteFromLIDs.size(),
///   - importLIDs.size() must equal numPacketsPerLID.size().
///
/// \param importLIDs        Wrapped as the "import_lids" view.
/// \param imports           Wrapped as the "imports" view.
/// \param numPacketsPerLID  Wrapped as the "num_packets_per_lid" view.
/// \param numSameIDs        Not used in the visible lines of this listing.
/// \param permuteToLIDs     Wrapped as the "permute_to_lids" view.
/// \param permuteFromLIDs   Wrapped as the "permute_from_lids" view.
/// \param TargetNumRows     Expected number of rows; only used in the size check
///                          in the visible lines.
/// \param TargetNumNonzeros Number of entries TargetPids is set to.
/// \param MyTargetPID       Not used in the visible lines of this listing.
/// \param CRS_rowptr        Output array; wrapped as "crs_rowptr".
/// \param CRS_colind        Output array; wrapped as "crs_colidx".
/// \param SourcePids        Wrapped as the "src_pids" view.
/// \param TargetPids        Output array; overwritten with TargetNumNonzeros
///                          copies of -1 before being wrapped as "tgt_pids".
///
/// The two parameters whose names are commented out (constantNumPackets and
/// combineMode) are unnamed and therefore unused.
///
/// NOTE(review): this listing omits several original lines. Original lines
/// 925-926 (between "924void" and the importLIDs parameter) are missing; they
/// presumably hold the function name and the leading `sourceGraph` parameter,
/// which the body uses further down — confirm against the real source.
923template<class LocalOrdinal, class GlobalOrdinal, class Node>
924void
927 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
928 const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
929 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
930 const size_t /* constantNumPackets */,
931 const CombineMode /* combineMode */,
932 const size_t numSameIDs,
933 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
934 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
935 size_t TargetNumRows,
936 size_t TargetNumNonzeros,
937 const int MyTargetPID,
938 const Teuchos::ArrayView<size_t>& CRS_rowptr,
939 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
940 const Teuchos::ArrayView<const int>& SourcePids,
941 Teuchos::Array<int>& TargetPids)
942{
943 using Kokkos::View;
944 using Kokkos::deep_copy;
945 using Teuchos::outArg;
946 using Teuchos::REDUCE_MAX;
947 using Teuchos::reduceAll;
948 using LO = LocalOrdinal;
949 using GO = GlobalOrdinal;
950 using crs_graph_type = CrsGraph<LO, GO, Node>;
951 using packet_type = typename crs_graph_type::packet_type;
952 using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
953 using buffer_device_type = typename crs_graph_type::buffer_device_type;
954 using device_type = typename Node::device_type;
955 using size_type = typename Teuchos::ArrayView<const LO>::size_type;
956
957 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
958
  // Three argument-size checks follow. Each lists a failure condition, the
  // exception type std::invalid_argument, and a streamed message.
  // NOTE(review): original lines 959, 964 and 970 are absent from this listing;
  // presumably each opens an exception-check macro call — confirm.
  // NOTE(review): the fragments "!= TargetNumRows+1 = " and
  // "!= permuteFromLIDs.size() = " have no leading space, so they are streamed
  // directly after the preceding size value; the third check uses " != ".
960 TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
961 std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
962 CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
963
965 permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
966 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
967 << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
968 const size_type numImportLIDs = importLIDs.size();
969
971 numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
972 prefix << "importLIDs.size() = " << numImportLIDs << " != "
973 "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
974
  // NOTE(review): original line 977 (the body of the `if` below) is absent from
  // this listing. Whatever it does, the assign() that follows sets TargetPids to
  // exactly TargetNumNonzeros entries, all equal to -1.
975 // Preseed TargetPids with -1 for local
976 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
978 }
979 TargetPids.assign(TargetNumNonzeros, -1);
980
981 // Grab pointers for sourceGraph
982 auto local_graph = sourceGraph.getLocalGraphDevice();
983 auto local_col_map = sourceGraph.getColMap()->getLocalMap();
984
  // Default-constructed device objects, passed as the first argument when
  // building the views below.
985 // Convert input arrays to Kokkos::View
986 device_type outputDevice;
987 buffer_device_type bufferOutputDevice;
988
  // Each view below is initialized from (device, raw host pointer, size, true,
  // label).
  // NOTE(review): the line naming the function being called (original lines
  // 990, 995, 1000, ...) is absent from this listing for every view; presumably
  // it is create_mirror_view_from_raw_host_array, in which case `true` is its
  // `copy` argument — confirm.
  // The first five views use buffer_device_type; the remaining four use
  // device_type.
989 Kokkos::View<const LO*, buffer_device_type> import_lids_d =
991 (bufferOutputDevice, importLIDs.getRawPtr(),
992 importLIDs.size(), true, "import_lids");
993
994 Kokkos::View<const packet_type*, buffer_device_type> imports_d =
996 (bufferOutputDevice, imports.getRawPtr(),
997 imports.size(), true, "imports");
998
999 Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
1001 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1002 true, "num_packets_per_lid");
1003
1004 Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
1006 permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1007 true, "permute_to_lids");
1008
1009 Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
1011 permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1012 true, "permute_from_lids");
1013
1014 Kokkos::View<size_t*, device_type> crs_rowptr_d =
1016 CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1017 true, "crs_rowptr");
1018
  // The label here is "crs_colidx" although the variable is named crs_colind_d.
1019 Kokkos::View<GO*, device_type> crs_colind_d =
1021 CRS_colind.getRawPtr(), CRS_colind.size(),
1022 true, "crs_colidx");
1023
1024 Kokkos::View<const int*, device_type> src_pids_d =
1026 SourcePids.getRawPtr(), SourcePids.size(),
1027 true, "src_pids");
1028
1029 Kokkos::View<int*, device_type> tgt_pids_d =
1031 TargetPids.getRawPtr(), TargetPids.size(),
1032 true, "tgt_pids");
1033
  // Forward to the implementation overload, with the packet, local-graph,
  // local-map and buffer-device types given explicitly.
  // NOTE(review): original lines 1038-1039 (the remaining call arguments and the
  // closing parenthesis) are absent from this listing.
1034 using local_map_type = decltype(local_col_map);
1035 UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays<
1036 packet_type,local_graph_device_type,local_map_type,buffer_device_type>(
1037 local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1040
1041 // FIXME (mfh 25 Jun 2019) HostMirror of CudaUVMSpace is CudaUVMSpace!!!
1042
  // Each HostMirror view below is constructed directly over the caller's array
  // (raw pointer plus size).
  // NOTE(review): original lines 1046, 1050 and 1054 are absent from this
  // listing; presumably they are the deep_copy calls that move each device view
  // into the matching host view — confirm.
1043 // Copy outputs back to host
1044 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1045 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1047
1048 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1049 CRS_colind.getRawPtr(), CRS_colind.size());
1051
1052 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1053 TargetPids.getRawPtr(), TargetPids.size());
1055
1056}
1057
1058} // namespace Details
1059} // namespace Tpetra
1060
// Explicit-instantiation macro. For one (LO, GO, NT) combination it expands to
// two explicit template instantiations:
//   - void   Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>(...)
//   - size_t Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>(...)
// Both take `const CrsGraph<LO, GO, NT> &` as their first parameter.
// The names are qualified only with `Details::` and `CrsGraph` is unqualified,
// so the macro presumably has to be expanded inside namespace Tpetra — confirm
// against the instantiation site.
// No comments are placed inside the macro body because every line ends in a
// line-continuation backslash.
1061#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1062 template void \
1063 Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1064 const CrsGraph<LO, GO, NT> &, \
1065 const Teuchos::ArrayView<const LO>&, \
1066 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1067 const Teuchos::ArrayView<const size_t>&, \
1068 const size_t, \
1069 const CombineMode, \
1070 const size_t, \
1071 const Teuchos::ArrayView<const LO>&, \
1072 const Teuchos::ArrayView<const LO>&, \
1073 size_t, \
1074 size_t, \
1075 const int, \
1076 const Teuchos::ArrayView<size_t>&, \
1077 const Teuchos::ArrayView<GO>&, \
1078 const Teuchos::ArrayView<const int>&, \
1079 Teuchos::Array<int>&); \
1080 template size_t \
1081 Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1082 const CrsGraph<LO, GO, NT> &, \
1083 const Teuchos::ArrayView<const LO> &, \
1084 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1085 const Teuchos::ArrayView<const size_t>&, \
1086 size_t, \
1087 CombineMode, \
1088 size_t, \
1089 const Teuchos::ArrayView<const LO>&, \
1090 const Teuchos::ArrayView<const LO>&);
1091
1092#endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsGraph class.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types, into the Tpetra::Details namespace.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpetra.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsetsFromConstantCount.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary, even if the inputs are const.
Functions for manipulating CRS arrays.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void unpackAndCombine(const RowView &row_ptrs_beg, const RowView &row_ptrs_end, IndicesView &indices, const Kokkos::View< const GlobalOrdinal *, BufferDevice, Kokkos::MemoryUnmanaged > &imports, const Kokkos::View< const size_t *, BufferDevice, Kokkos::MemoryUnmanaged > &num_packets_per_lid, const Kokkos::View< const LocalOrdinal *, BufferDevice, Kokkos::MemoryUnmanaged > &import_lids, const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::padding_type &padding, const bool unpack_pids, const int myRank, const bool verbose)
Perform the unpack operation for the graph.
KOKKOS_FUNCTION int unpackRow(const Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, const Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
void setupRowPointersForRemotes(const Kokkos::View< size_t *, Device > &tgt_rowptr, const Kokkos::View< const LO *, BufferDevice > &import_lids, const Kokkos::View< const Packet *, BufferDevice > &, const Kokkos::View< const size_t *, BufferDevice > &num_packets_per_lid)
Setup row pointers for remotes.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
Struct that holds views of the contents of a CrsMatrix.
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Implementation details of Tpetra.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries....
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.