Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
41#define TPETRA_BLOCKCRSMATRIX_DEF_HPP
42
45
49#include "Tpetra_BlockMultiVector.hpp"
50#include "Tpetra_BlockView.hpp"
51
52#include "Teuchos_TimeMonitor.hpp"
53#ifdef HAVE_TPETRA_DEBUG
54# include <set>
55#endif // HAVE_TPETRA_DEBUG
56
57//
58// mfh 25 May 2016: Temporary fix for #393.
59//
60// Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
61// 4.7.2 compiler bug ("internal compiler error") when compiling them.
62// Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
63// use them in that case, either.
64//
65// mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
66// compiler error") too. Ditto for GCC 5.1. I'll just disable the
67// thing for any GCC version.
68//
69#if defined(__CUDACC__)
70 // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
71# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
73# endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
74
75#elif defined(__GNUC__)
76
77# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
79# endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80
81#else // some other compiler
82
83 // Optimistically assume that other compilers aren't broken.
84# if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85# define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
86# endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
87#endif // defined(__CUDACC__), defined(__GNUC__)
88
89
90namespace Tpetra {
91
92namespace Impl {
93
94 template<typename T>
95 struct BlockCrsRowStruct {
96 T totalNumEntries, totalNumBytes, maxRowLength;
97 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
98 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
99 KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
100 : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
101 };
102
103 template<typename T>
104 static
105 KOKKOS_INLINE_FUNCTION
106 void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
107 a.totalNumEntries += b.totalNumEntries;
108 a.totalNumBytes += b.totalNumBytes;
109 a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
110 }
111
112 template<typename T, typename ExecSpace>
113 struct BlockCrsReducer {
114 typedef BlockCrsReducer reducer;
115 typedef T value_type;
116 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
117 value_type *value;
118
119 KOKKOS_INLINE_FUNCTION
120 BlockCrsReducer(value_type &val) : value(&val) {}
121
122 KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
123 KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const { dst += src; }
124 KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
125 KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
126 KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
127 };
128
129 template<class AlphaCoeffType,
130 class GraphType,
131 class MatrixValuesType,
132 class InVecType,
133 class BetaCoeffType,
134 class OutVecType,
135 bool IsBuiltInType>
136 class BcrsApplyNoTransFunctor {
137 private:
138 static_assert (Kokkos::is_view<MatrixValuesType>::value,
139 "MatrixValuesType must be a Kokkos::View.");
140 static_assert (Kokkos::is_view<OutVecType>::value,
141 "OutVecType must be a Kokkos::View.");
142 static_assert (Kokkos::is_view<InVecType>::value,
143 "InVecType must be a Kokkos::View.");
144 static_assert (std::is_same<MatrixValuesType,
145 typename MatrixValuesType::const_type>::value,
146 "MatrixValuesType must be a const Kokkos::View.");
147 static_assert (std::is_same<typename OutVecType::value_type,
148 typename OutVecType::non_const_value_type>::value,
149 "OutVecType must be a nonconst Kokkos::View.");
150 static_assert (std::is_same<typename InVecType::value_type,
151 typename InVecType::const_value_type>::value,
152 "InVecType must be a const Kokkos::View.");
153 static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
154 "MatrixValuesType must be a rank-1 Kokkos::View.");
155 static_assert (static_cast<int> (InVecType::rank) == 1,
156 "InVecType must be a rank-1 Kokkos::View.");
157 static_assert (static_cast<int> (OutVecType::rank) == 1,
158 "OutVecType must be a rank-1 Kokkos::View.");
159 typedef typename MatrixValuesType::non_const_value_type scalar_type;
160
161 public:
162 typedef typename GraphType::device_type device_type;
163
165 typedef typename std::remove_const<typename GraphType::data_type>::type
166 local_ordinal_type;
167
174 void setX (const InVecType& X) { X_ = X; }
175
182 void setY (const OutVecType& Y) { Y_ = Y; }
183
184 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
185 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
186
188 BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
189 const GraphType& graph,
190 const MatrixValuesType& val,
191 const local_ordinal_type blockSize,
192 const InVecType& X,
193 const beta_coeff_type& beta,
194 const OutVecType& Y) :
195 alpha_ (alpha),
196 ptr_ (graph.row_map),
197 ind_ (graph.entries),
198 val_ (val),
199 blockSize_ (blockSize),
200 X_ (X),
201 beta_ (beta),
202 Y_ (Y)
203 {}
204
205 // Dummy team version
206 KOKKOS_INLINE_FUNCTION void
207 operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
208 {
209 Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
210 }
211
212 // Range Policy for non built-in types
213 KOKKOS_INLINE_FUNCTION void
214 operator () (const local_ordinal_type& lclRow) const
215 {
216 using ::Tpetra::FILL;
217 using ::Tpetra::SCAL;
218 using ::Tpetra::GEMV;
219 using Kokkos::Details::ArithTraits;
220 // I'm not writing 'using Kokkos::make_pair;' here, because that
221 // may break builds for users who make the mistake of putting
222 // 'using namespace std;' in the global namespace. Please don't
223 // ever do that! But just in case you do, I'll take this
224 // precaution.
225 using Kokkos::parallel_for;
226 using Kokkos::subview;
227 typedef typename decltype (ptr_)::non_const_value_type offset_type;
228 typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
230 device_type,
231 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
232 little_block_type;
233
234 const offset_type Y_ptBeg = lclRow * blockSize_;
235 const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
236 auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
237
238 // This version of the code does not use temporary storage.
239 // Each thread writes to its own block of the target vector.
240 if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
241 FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
242 }
243 else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
244 SCAL (beta_, Y_cur);
245 }
246
247 if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
248 const offset_type blkBeg = ptr_[lclRow];
249 const offset_type blkEnd = ptr_[lclRow+1];
250 // Precompute to save integer math in the inner loop.
251 const offset_type bs2 = blockSize_ * blockSize_;
252 for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
253 ++absBlkOff) {
254 little_block_type A_cur (val_.data () + absBlkOff * bs2,
255 blockSize_, blockSize_);
256 const offset_type X_blkCol = ind_[absBlkOff];
257 const offset_type X_ptBeg = X_blkCol * blockSize_;
258 const offset_type X_ptEnd = X_ptBeg + blockSize_;
259 auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
260
261 GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
262 } // for each entry in current local block row of matrix
263 }
264 }
265
266 private:
267 alpha_coeff_type alpha_;
268 typename GraphType::row_map_type::const_type ptr_;
269 typename GraphType::entries_type::const_type ind_;
270 MatrixValuesType val_;
271 local_ordinal_type blockSize_;
272 InVecType X_;
273 beta_coeff_type beta_;
274 OutVecType Y_;
275 };
276
277 template<class AlphaCoeffType,
278 class GraphType,
279 class MatrixValuesType,
280 class InVecType,
281 class BetaCoeffType,
282 class OutVecType>
283 class BcrsApplyNoTransFunctor<AlphaCoeffType,
284 GraphType,
285 MatrixValuesType,
286 InVecType,
287 BetaCoeffType,
288 OutVecType,
289 true> {
290 private:
291 static_assert (Kokkos::is_view<MatrixValuesType>::value,
292 "MatrixValuesType must be a Kokkos::View.");
293 static_assert (Kokkos::is_view<OutVecType>::value,
294 "OutVecType must be a Kokkos::View.");
295 static_assert (Kokkos::is_view<InVecType>::value,
296 "InVecType must be a Kokkos::View.");
297 static_assert (std::is_same<MatrixValuesType,
298 typename MatrixValuesType::const_type>::value,
299 "MatrixValuesType must be a const Kokkos::View.");
300 static_assert (std::is_same<typename OutVecType::value_type,
301 typename OutVecType::non_const_value_type>::value,
302 "OutVecType must be a nonconst Kokkos::View.");
303 static_assert (std::is_same<typename InVecType::value_type,
304 typename InVecType::const_value_type>::value,
305 "InVecType must be a const Kokkos::View.");
306 static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
307 "MatrixValuesType must be a rank-1 Kokkos::View.");
308 static_assert (static_cast<int> (InVecType::rank) == 1,
309 "InVecType must be a rank-1 Kokkos::View.");
310 static_assert (static_cast<int> (OutVecType::rank) == 1,
311 "OutVecType must be a rank-1 Kokkos::View.");
312 typedef typename MatrixValuesType::non_const_value_type scalar_type;
313
314 public:
315 typedef typename GraphType::device_type device_type;
316
318 static constexpr bool runOnHost = !std::is_same_v<typename device_type::execution_space, Kokkos::DefaultExecutionSpace> || std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::DefaultHostExecutionSpace>;
319
320
321
323 typedef typename std::remove_const<typename GraphType::data_type>::type
325
332 void setX (const InVecType& X) { X_ = X; }
333
340 void setY (const OutVecType& Y) { Y_ = Y; }
341
342 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
343 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
344
346 BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
347 const GraphType& graph,
348 const MatrixValuesType& val,
349 const local_ordinal_type blockSize,
350 const InVecType& X,
351 const beta_coeff_type& beta,
352 const OutVecType& Y) :
353 alpha_ (alpha),
354 ptr_ (graph.row_map),
355 ind_ (graph.entries),
356 val_ (val),
357 blockSize_ (blockSize),
358 X_ (X),
359 beta_ (beta),
360 Y_ (Y)
361 {}
362
363 // dummy Range version
364 KOKKOS_INLINE_FUNCTION void
365 operator () (const local_ordinal_type& lclRow) const
366 {
367 Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
368 }
369
370 // Team policy for built-in types
371 KOKKOS_INLINE_FUNCTION void
372 operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
373 {
374
375 const local_ordinal_type lclRow = member.league_rank();
376
377 using Kokkos::Details::ArithTraits;
378 // I'm not writing 'using Kokkos::make_pair;' here, because that
379 // may break builds for users who make the mistake of putting
380 // 'using namespace std;' in the global namespace. Please don't
381 // ever do that! But just in case you do, I'll take this
382 // precaution.
383 using Kokkos::parallel_for;
384 using Kokkos::subview;
385 typedef typename decltype (ptr_)::non_const_value_type offset_type;
386 typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
388 device_type,
389 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
390 little_block_type;
391
392 const offset_type Y_ptBeg = lclRow * blockSize_;
393 const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
394 auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
395
396 // This version of the code does not use temporary storage.
397 // Each thread writes to its own block of the target vector.
398 if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
399 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
400 [&](const local_ordinal_type &i) {
401 Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
402 });
403 }
404 else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
405 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
406 [&](const local_ordinal_type &i) {
407 Y_cur(i) *= beta_;
408 });
409 }
410 member.team_barrier();
411
412 if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
413 const offset_type blkBeg = ptr_[lclRow];
414 const offset_type blkEnd = ptr_[lclRow+1];
415 // Precompute to save integer math in the inner loop.
416 const offset_type bs2 = blockSize_ * blockSize_;
417 little_block_type A_cur (val_.data (), blockSize_, blockSize_);
418 auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
419 Kokkos::parallel_for
420 (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
421 [&](const local_ordinal_type &absBlkOff) {
422 A_cur.assign_data(val_.data () + absBlkOff * bs2);
423 const offset_type X_blkCol = ind_[absBlkOff];
424 const offset_type X_ptBeg = X_blkCol * blockSize_;
425 X_cur.assign_data(&X_(X_ptBeg));
426
427 Kokkos::parallel_for
428 (Kokkos::ThreadVectorRange(member, blockSize_),
429 [&](const local_ordinal_type &k0) {
430 scalar_type val(0);
431 for (local_ordinal_type k1=0;k1<blockSize_;++k1)
432 val += A_cur(k0,k1)*X_cur(k1);
433 if constexpr(runOnHost) {
434 // host space team size is always 1
435 Y_cur(k0) += alpha_*val;
436 }
437 else {
438 // cuda space team size can be larger than 1
439 // atomic is not allowed for sacado type;
440 // thus this needs to be specialized or
441 // sacado atomic should be supported.
442 Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
443 }
444 });
445 }); // for each entry in current local block row of matrix
446 }
447 }
448
449 private:
450 alpha_coeff_type alpha_;
451 typename GraphType::row_map_type::const_type ptr_;
452 typename GraphType::entries_type::const_type ind_;
453 MatrixValuesType val_;
454 local_ordinal_type blockSize_;
455 InVecType X_;
456 beta_coeff_type beta_;
457 OutVecType Y_;
458 };
459
460
461 template<class AlphaCoeffType,
462 class GraphType,
463 class MatrixValuesType,
464 class InMultiVecType,
465 class BetaCoeffType,
466 class OutMultiVecType>
467 void
468 bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
469 const GraphType& graph,
470 const MatrixValuesType& val,
471 const typename std::remove_const<typename GraphType::data_type>::type blockSize,
472 const InMultiVecType& X,
473 const BetaCoeffType& beta,
474 const OutMultiVecType& Y
475 )
476 {
477 static_assert (Kokkos::is_view<MatrixValuesType>::value,
478 "MatrixValuesType must be a Kokkos::View.");
479 static_assert (Kokkos::is_view<OutMultiVecType>::value,
480 "OutMultiVecType must be a Kokkos::View.");
481 static_assert (Kokkos::is_view<InMultiVecType>::value,
482 "InMultiVecType must be a Kokkos::View.");
483 static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
484 "MatrixValuesType must be a rank-1 Kokkos::View.");
485 static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
486 "OutMultiVecType must be a rank-2 Kokkos::View.");
487 static_assert (static_cast<int> (InMultiVecType::rank) == 2,
488 "InMultiVecType must be a rank-2 Kokkos::View.");
489
490 typedef typename MatrixValuesType::device_type::execution_space execution_space;
491 typedef typename MatrixValuesType::device_type::memory_space memory_space;
492 typedef typename MatrixValuesType::const_type matrix_values_type;
493 typedef typename OutMultiVecType::non_const_type out_multivec_type;
494 typedef typename InMultiVecType::const_type in_multivec_type;
495 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
496 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
497 typedef typename std::remove_const<typename GraphType::data_type>::type LO;
498
499 constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
500 constexpr bool is_host_memory_space = std::is_same<memory_space,Kokkos::HostSpace>::value;
501 constexpr bool use_team_policy = (is_builtin_type_enabled && !is_host_memory_space);
502
503 const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
504 static_cast<LO> (0) :
505 static_cast<LO> (graph.row_map.extent (0) - 1);
506 const LO numVecs = Y.extent (1);
507 if (numLocalMeshRows == 0 || numVecs == 0) {
508 return; // code below doesn't handle numVecs==0 correctly
509 }
510
511 // These assignments avoid instantiating the functor extra times
512 // unnecessarily, e.g., for X const vs. nonconst. We only need the
513 // X const case, so only instantiate for that case.
514 in_multivec_type X_in = X;
515 out_multivec_type Y_out = Y;
516
517 // The functor only knows how to handle one vector at a time, and it
518 // expects 1-D Views. Thus, we need to know the type of each column
519 // of X and Y.
520 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
521 typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
522 typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
523 matrix_values_type, in_vec_type, beta_type, out_vec_type,
524 use_team_policy> functor_type;
525
526 auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
527 auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
528
529 // Compute the first column of Y.
530 if (use_team_policy) {
531 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
532 // Built-in version uses atomic add which might not be supported from sacado or any user-defined types.
533 typedef Kokkos::TeamPolicy<execution_space> policy_type;
534 policy_type policy(1,1);
535#if defined(KOKKOS_ENABLE_CUDA)
536 constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
537#else
538 constexpr bool is_cuda = false;
539#endif // defined(KOKKOS_ENABLE_CUDA)
540 if (is_cuda) {
541 LO team_size = 8, vector_size = 1;
542 if (blockSize <= 5) vector_size = 4;
543 else if (blockSize <= 9) vector_size = 8;
544 else if (blockSize <= 12) vector_size = 12;
545 else if (blockSize <= 20) vector_size = 20;
546 else vector_size = 20;
547 policy = policy_type(numLocalMeshRows, team_size, vector_size);
548 } else {
549 policy = policy_type(numLocalMeshRows, 1, 1);
550 }
551 Kokkos::parallel_for (policy, functor);
552
553 // Compute the remaining columns of Y.
554 for (LO j = 1; j < numVecs; ++j) {
555 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
556 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
557 functor.setX (X_j);
558 functor.setY (Y_j);
559 Kokkos::parallel_for (policy, functor);
560 }
561 } else {
562 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
563 Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
564 Kokkos::parallel_for (policy, functor);
565 for (LO j = 1; j < numVecs; ++j) {
566 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
567 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
568 functor.setX (X_j);
569 functor.setY (Y_j);
570 Kokkos::parallel_for (policy, functor);
571 }
572 }
573 }
574} // namespace Impl
575
576namespace { // (anonymous)
577
578// Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
579// version that takes two Kokkos::View arguments).
580template<class Scalar, class LO, class GO, class Node>
581class GetLocalDiagCopy {
582public:
583 typedef typename Node::device_type device_type;
584 typedef size_t diag_offset_type;
585 typedef Kokkos::View<const size_t*, device_type,
586 Kokkos::MemoryUnmanaged> diag_offsets_type;
587 typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
588 typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
589 typedef typename local_graph_device_type::row_map_type row_offsets_type;
590 typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
591 typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
592 typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
593
594 // Constructor
595 GetLocalDiagCopy (const diag_type& diag,
596 const values_type& val,
597 const diag_offsets_type& diagOffsets,
598 const row_offsets_type& ptr,
599 const LO blockSize) :
600 diag_ (diag),
601 diagOffsets_ (diagOffsets),
602 ptr_ (ptr),
603 blockSize_ (blockSize),
604 offsetPerBlock_ (blockSize_*blockSize_),
605 val_(val)
606 {}
607
608 KOKKOS_INLINE_FUNCTION void
609 operator() (const LO& lclRowInd) const
610 {
611 using Kokkos::ALL;
612
613 // Get row offset
614 const size_t absOffset = ptr_[lclRowInd];
615
616 // Get offset relative to start of row
617 const size_t relOffset = diagOffsets_[lclRowInd];
618
619 // Get the total offset
620 const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
621
622 // Get a view of the block. BCRS currently uses LayoutRight
623 // regardless of the device.
624 typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
625 device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
626 const_little_block_type;
627 const_little_block_type D_in (val_.data () + pointOffset,
628 blockSize_, blockSize_);
629 auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
630 COPY (D_in, D_out);
631 }
632
633 private:
634 diag_type diag_;
635 diag_offsets_type diagOffsets_;
636 row_offsets_type ptr_;
637 LO blockSize_;
638 LO offsetPerBlock_;
639 values_type val_;
640 };
641} // namespace (anonymous)
642
643 template<class Scalar, class LO, class GO, class Node>
644 std::ostream&
645 BlockCrsMatrix<Scalar, LO, GO, Node>::
646 markLocalErrorAndGetStream ()
647 {
648 * (this->localError_) = true;
649 if ((*errs_).is_null ()) {
650 *errs_ = Teuchos::rcp (new std::ostringstream ());
651 }
652 return **errs_;
653 }
654
655 template<class Scalar, class LO, class GO, class Node>
658 dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
659 graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
660 blockSize_ (static_cast<LO> (0)),
661 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
662 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
663 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
664 offsetPerBlock_ (0),
665 localError_ (new bool (false)),
666 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
667 {
668 }
669
670 template<class Scalar, class LO, class GO, class Node>
673 const LO blockSize) :
674 dist_object_type (graph.getMap ()),
675 graph_ (graph),
676 rowMeshMap_ (* (graph.getRowMap ())),
677 blockSize_ (blockSize),
678 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
679 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
680 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
681 offsetPerBlock_ (blockSize * blockSize),
682 localError_ (new bool (false)),
683 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
684 {
685
688 ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
689 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
690 "rows (isSorted() is false). This class assumes sorted rows.");
691
692 graphRCP_ = Teuchos::rcpFromRef(graph_);
693
694 // Trick to test whether LO is nonpositive, without a compiler
695 // warning in case LO is unsigned (which is generally a bad idea
696 // anyway). I don't promise that the trick works, but it
697 // generally does with gcc at least, in my experience.
698 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
700 blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
701 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
702 " <= 0. The block size must be positive.");
703
704 domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
705 rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
706
707 {
708 // These are rcp
709 const auto domainPointMap = getDomainMap();
710 const auto colPointMap = Teuchos::rcp
711 (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
712 *pointImporter_ = Teuchos::rcp
714 }
715 {
716 auto local_graph_h = graph.getLocalGraphHost ();
717 auto ptr_h = local_graph_h.row_map;
718 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
719 Kokkos::deep_copy(ptrHost_, ptr_h);
720
721 auto ind_h = local_graph_h.entries;
722 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
723 // DEEP_COPY REVIEW - HOST-TO-HOST
724 Kokkos::deep_copy (indHost_, ind_h);
725
726 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
727 val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
728 }
729 }
730
731 template<class Scalar, class LO, class GO, class Node>
734 const typename local_matrix_device_type::values_type& values,
735 const LO blockSize) :
736 dist_object_type (graph.getMap ()),
737 graph_ (graph),
738 rowMeshMap_ (* (graph.getRowMap ())),
739 blockSize_ (blockSize),
740 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
741 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
742 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
743 offsetPerBlock_ (blockSize * blockSize),
744 localError_ (new bool (false)),
745 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
746 {
749 ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
750 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
751 "rows (isSorted() is false). This class assumes sorted rows.");
752
753 graphRCP_ = Teuchos::rcpFromRef(graph_);
754
755 // Trick to test whether LO is nonpositive, without a compiler
756 // warning in case LO is unsigned (which is generally a bad idea
757 // anyway). I don't promise that the trick works, but it
758 // generally does with gcc at least, in my experience.
759 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
761 blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
762 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
763 " <= 0. The block size must be positive.");
764
765 domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
766 rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
767
768 {
769 // These are rcp
770 const auto domainPointMap = getDomainMap();
771 const auto colPointMap = Teuchos::rcp
772 (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
773 *pointImporter_ = Teuchos::rcp
775 }
776 {
777 auto local_graph_h = graph.getLocalGraphHost ();
778 auto ptr_h = local_graph_h.row_map;
779 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
780 Kokkos::deep_copy(ptrHost_, ptr_h);
781
782 auto ind_h = local_graph_h.entries;
783 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
784 Kokkos::deep_copy (indHost_, ind_h);
785
786 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
788 val_ = decltype (val_) (values);
789 }
790 }
791
792 template<class Scalar, class LO, class GO, class Node>
796 const map_type& rangePointMap,
797 const LO blockSize) :
798 dist_object_type (graph.getMap ()),
799 graph_ (graph),
800 rowMeshMap_ (* (graph.getRowMap ())),
801 domainPointMap_ (domainPointMap),
802 rangePointMap_ (rangePointMap),
803 blockSize_ (blockSize),
804 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
805 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
806 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
807 offsetPerBlock_ (blockSize * blockSize),
808 localError_ (new bool (false)),
809 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
810 {
812 ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
813 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
814 "rows (isSorted() is false). This class assumes sorted rows.");
815
816 graphRCP_ = Teuchos::rcpFromRef(graph_);
817
818 // Trick to test whether LO is nonpositive, without a compiler
819 // warning in case LO is unsigned (which is generally a bad idea
820 // anyway). I don't promise that the trick works, but it
821 // generally does with gcc at least, in my experience.
822 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
824 blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
825 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
826 " <= 0. The block size must be positive.");
827 {
828 // These are rcp
829 const auto rcpDomainPointMap = getDomainMap();
830 const auto colPointMap = Teuchos::rcp
831 (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
832 *pointImporter_ = Teuchos::rcp
834 }
835 {
836 auto local_graph_h = graph.getLocalGraphHost ();
837 auto ptr_h = local_graph_h.row_map;
838 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
839 // DEEP_COPY REVIEW - HOST-TO-HOST
840 Kokkos::deep_copy(ptrHost_, ptr_h);
841
842 auto ind_h = local_graph_h.entries;
843 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
844 // DEEP_COPY REVIEW - HOST-TO-HOST
845 Kokkos::deep_copy (indHost_, ind_h);
846
847 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
848 val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
849
850 }
851 }
852
853 template<class Scalar, class LO, class GO, class Node>
854 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
856 getDomainMap () const
857 { // Copy constructor of map_type does a shallow copy.
858 // We're only returning by RCP for backwards compatibility.
859 return Teuchos::rcp (new map_type (domainPointMap_));
860 }
861
862 template<class Scalar, class LO, class GO, class Node>
863 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
865 getRangeMap () const
866 { // Copy constructor of map_type does a shallow copy.
867 // We're only returning by RCP for backwards compatibility.
868 return Teuchos::rcp (new map_type (rangePointMap_));
869 }
870
871 template<class Scalar, class LO, class GO, class Node>
872 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
874 getRowMap () const
875 {
876 return graph_.getRowMap();
877 }
878
879 template<class Scalar, class LO, class GO, class Node>
880 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
882 getColMap () const
883 {
884 return graph_.getColMap();
885 }
886
887 template<class Scalar, class LO, class GO, class Node>
890 getGlobalNumRows() const
891 {
892 return graph_.getGlobalNumRows();
893 }
894
895 template<class Scalar, class LO, class GO, class Node>
896 size_t
898 getLocalNumRows() const
899 {
900 return graph_.getLocalNumRows();
901 }
902
903 template<class Scalar, class LO, class GO, class Node>
904 size_t
907 {
908 return graph_.getLocalMaxNumRowEntries();
909 }
910
911 template<class Scalar, class LO, class GO, class Node>
912 void
914 apply (const mv_type& X,
915 mv_type& Y,
916 Teuchos::ETransp mode,
918 Scalar beta) const
919 {
922 mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
923 std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
924 "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
925 "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
926
927 BMV X_view;
928 BMV Y_view;
929 const LO blockSize = getBlockSize ();
930 try {
931 X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
932 Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
933 }
934 catch (std::invalid_argument& e) {
936 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
937 "apply: Either the input MultiVector X or the output MultiVector Y "
938 "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
939 "graph. BlockMultiVector's constructor threw the following exception: "
940 << e.what ());
941 }
942
943 try {
944 // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
945 // either that or mark fields of this class as 'mutable'. The
946 // problem is that applyBlock wants to do lazy initialization of
947 // temporary block multivectors.
948 const_cast<this_BCRS_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
949 } catch (std::invalid_argument& e) {
951 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
952 "apply: The implementation method applyBlock complained about having "
953 "an invalid argument. It reported the following: " << e.what ());
954 } catch (std::logic_error& e) {
956 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
957 "apply: The implementation method applyBlock complained about a "
958 "possible bug in its implementation. It reported the following: "
959 << e.what ());
960 } catch (std::exception& e) {
962 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
963 "apply: The implementation method applyBlock threw an exception which "
964 "is neither std::invalid_argument nor std::logic_error, but is a "
965 "subclass of std::exception. It reported the following: "
966 << e.what ());
967 } catch (...) {
969 true, std::logic_error, "Tpetra::BlockCrsMatrix::"
970 "apply: The implementation method applyBlock threw an exception which "
971 "is not an instance of a subclass of std::exception. This probably "
972 "indicates a bug. Please report this to the Tpetra developers.");
973 }
974 }
975
976 template<class Scalar, class LO, class GO, class Node>
977 void
981 Teuchos::ETransp mode,
982 const Scalar alpha,
983 const Scalar beta)
984 {
986 X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
987 "Tpetra::BlockCrsMatrix::applyBlock: "
988 "X and Y have different block sizes. X.getBlockSize() = "
989 << X.getBlockSize () << " != Y.getBlockSize() = "
990 << Y.getBlockSize () << ".");
991
992 if (mode == Teuchos::NO_TRANS) {
993 applyBlockNoTrans (X, Y, alpha, beta);
994 } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
995 applyBlockTrans (X, Y, mode, alpha, beta);
996 } else {
998 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
999 "applyBlock: Invalid 'mode' argument. Valid values are "
1000 "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
1001 }
1002 }
1003
1004 template<class Scalar, class LO, class GO, class Node>
1005 void
1008 const Import<LO, GO, Node>& importer) const
1009 {
1010 using Teuchos::RCP;
1011 using Teuchos::rcp;
1013
1014 // Right now, we make many assumptions...
1015 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
1016 "destMatrix is required to be null.");
1017
1018 // BlockCrsMatrix requires a complete graph at construction.
1019 // So first step is to import and fill complete the destGraph.
1020 RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
1022 srcGraph->getDomainMap(),
1023 srcGraph->getRangeMap());
1024
1025
1026 // Final step, create and import the destMatrix.
1027 destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
1028 destMatrix->doImport(*this, importer, Tpetra::INSERT);
1029 }
1030
1031
1032 template<class Scalar, class LO, class GO, class Node>
1033 void
1036 const Export<LO, GO, Node>& exporter) const
1037 {
1038 using Teuchos::RCP;
1039 using Teuchos::rcp;
1041
1042 // Right now, we make many assumptions...
1043 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
1044 "destMatrix is required to be null.");
1045
1046 // BlockCrsMatrix requires a complete graph at construction.
1047 // So first step is to import and fill complete the destGraph.
1048 RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
1050 srcGraph->getDomainMap(),
1051 srcGraph->getRangeMap());
1052
1053
1054 // Final step, create and import the destMatrix.
1055 destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
1056 destMatrix->doExport(*this, exporter, Tpetra::INSERT);
1057 }
1058
1059
1060 template<class Scalar, class LO, class GO, class Node>
1061 void
1064 {
1065 auto val_d = val_.getDeviceView(Access::OverwriteAll);
1066 Kokkos::deep_copy(val_d, alpha);
1067 }
1068
1069 template<class Scalar, class LO, class GO, class Node>
1070 LO
1073 const LO colInds[],
1074 const Scalar vals[],
1075 const LO numColInds) const
1076 {
1077 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1078 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1079 ptrdiff_t * offsets = offsets_host_view.data();
1080 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1081 const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1082 return validCount;
1083 }
1084
1085 template <class Scalar, class LO, class GO, class Node>
1086 void
1088 getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1089 Kokkos::MemoryUnmanaged>& offsets) const
1090 {
1091 graph_.getLocalDiagOffsets (offsets);
1092 }
1093
1094 template <class Scalar, class LO, class GO, class Node>
1095 void
1097 getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1098 Kokkos::MemoryUnmanaged>& diag,
1099 const Kokkos::View<const size_t*, device_type,
1100 Kokkos::MemoryUnmanaged>& offsets) const
1101 {
1102 using Kokkos::parallel_for;
1103 const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1104
1105 const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getLocalNumElements ());
1106 const LO blockSize = this->getBlockSize ();
1108 (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1109 static_cast<LO> (diag.extent (1)) < blockSize ||
1110 static_cast<LO> (diag.extent (2)) < blockSize,
1111 std::invalid_argument, prefix <<
1112 "The input Kokkos::View is not big enough to hold all the data.");
1114 (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1115 prefix << "offsets.size() = " << offsets.size () << " < local number of "
1116 "diagonal blocks " << lclNumMeshRows << ".");
1117
1118 typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1120
1121 // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1122 // we reserve the right to do lazy allocation of device data. (We
1123 // don't plan to do lazy allocation for host data; the host
1124 // version of the data always exists.)
1125 auto val_d = val_.getDeviceView(Access::ReadOnly);
1127 functor_type (diag, val_d, offsets,
1128 graph_.getLocalGraphDevice ().row_map, blockSize_));
1129 }
1130
1131 template<class Scalar, class LO, class GO, class Node>
1132 LO
1135 const LO colInds[],
1136 const Scalar vals[],
1137 const LO numColInds) const
1138 {
1139 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1140 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1141 ptrdiff_t * offsets = offsets_host_view.data();
1142 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1143 const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1144 return validCount;
1145 }
1146
1147
1148 template<class Scalar, class LO, class GO, class Node>
1149 LO
1152 const LO colInds[],
1153 const Scalar vals[],
1154 const LO numColInds) const
1155 {
1156 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1157 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1158 ptrdiff_t * offsets = offsets_host_view.data();
1159 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1160 const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1161 return validCount;
1162 }
1163 template<class Scalar, class LO, class GO, class Node>
1164 void
1167 nonconst_local_inds_host_view_type &Indices,
1168 nonconst_values_host_view_type &Values,
1169 size_t& NumEntries) const
1170 {
1171 auto vals = getValuesHost(LocalRow);
1172 const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
1173 if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
1174 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1175 "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1176 << numInds << " row entries");
1177 }
1178 const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
1179 for (LO i=0; i<numInds; ++i) {
1180 Indices[i] = colInds[i];
1181 }
1182 for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1183 Values[i] = vals[i];
1184 }
1186 }
1187
1188 template<class Scalar, class LO, class GO, class Node>
1189 LO
1192 ptrdiff_t offsets[],
1193 const LO colInds[],
1194 const LO numColInds) const
1195 {
1196 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1197 // We got no offsets, because the input local row index is
1198 // invalid on the calling process. That may not be an error, if
1199 // numColInds is zero anyway; it doesn't matter. This is the
1200 // advantage of returning the number of valid indices.
1201 return static_cast<LO> (0);
1202 }
1203
1204 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1205 LO hint = 0; // Guess for the relative offset into the current row
1206 LO validCount = 0; // number of valid column indices in colInds
1207
1208 for (LO k = 0; k < numColInds; ++k) {
1209 const LO relBlockOffset =
1210 this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1211 if (relBlockOffset != LINV) {
1212 offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1213 hint = relBlockOffset + 1;
1214 ++validCount;
1215 }
1216 }
1217 return validCount;
1218 }
1219
1220
1221 template<class Scalar, class LO, class GO, class Node>
1222 LO
1225 const ptrdiff_t offsets[],
1226 const Scalar vals[],
1227 const LO numOffsets) const
1228 {
1229 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1230 // We modified no values, because the input local row index is
1231 // invalid on the calling process. That may not be an error, if
1232 // numColInds is zero anyway; it doesn't matter. This is the
1233 // advantage of returning the number of valid indices.
1234 return static_cast<LO> (0);
1235 }
1236 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1238 auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
1239 impl_scalar_type* vOut = val_out.data();
1240
1241 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1242 const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
1243 size_t pointOffset = 0; // Current offset into input values
1244 LO validCount = 0; // number of valid offsets
1245
1246 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1247 const size_t blockOffset = offsets[k]*perBlockSize;
1248 if (offsets[k] != STINV) {
1249 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1250 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1251 COPY (A_new, A_old);
1252 ++validCount;
1253 }
1254 }
1255 return validCount;
1256 }
1257
1258
1259 template<class Scalar, class LO, class GO, class Node>
1260 LO
1263 const ptrdiff_t offsets[],
1264 const Scalar vals[],
1265 const LO numOffsets) const
1266 {
1267 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1268 // We modified no values, because the input local row index is
1269 // invalid on the calling process. That may not be an error, if
1270 // numColInds is zero anyway; it doesn't matter. This is the
1271 // advantage of returning the number of valid indices.
1272 return static_cast<LO> (0);
1273 }
1274 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1275 auto val_out = getValuesHost(localRowInd);
1276 impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
1277
1278 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1279 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1280 size_t pointOffset = 0; // Current offset into input values
1281 LO validCount = 0; // number of valid offsets
1282
1283 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1284 const size_t blockOffset = offsets[k]*perBlockSize;
1285 if (blockOffset != STINV) {
1286 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1287 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1288 ::Tpetra::Impl::absMax (A_old, A_new);
1289 ++validCount;
1290 }
1291 }
1292 return validCount;
1293 }
1294
1295
1296 template<class Scalar, class LO, class GO, class Node>
1297 LO
1300 const ptrdiff_t offsets[],
1301 const Scalar vals[],
1302 const LO numOffsets) const
1303 {
1304 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1305 // We modified no values, because the input local row index is
1306 // invalid on the calling process. That may not be an error, if
1307 // numColInds is zero anyway; it doesn't matter. This is the
1308 // advantage of returning the number of valid indices.
1309 return static_cast<LO> (0);
1310 }
1311 const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1312 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1314 auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
1315 impl_scalar_type* vOut = val_out.data();
1316
1317 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1318 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1319 size_t pointOffset = 0; // Current offset into input values
1320 LO validCount = 0; // number of valid offsets
1321
1322 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1323 const size_t blockOffset = offsets[k]*perBlockSize;
1324 if (blockOffset != STINV) {
1325 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1326 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1327 AXPY (ONE, A_new, A_old);
1328 ++validCount;
1329 }
1330 }
1331 return validCount;
1332 }
1333
1334 template<class Scalar, class LO, class GO, class Node>
1336 impl_scalar_type_dualview::t_host::const_type
1338 getValuesHost () const
1339 {
1340 return val_.getHostView(Access::ReadOnly);
1341 }
1342
1343 template<class Scalar, class LO, class GO, class Node>
1344 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1345 impl_scalar_type_dualview::t_dev::const_type
1346 BlockCrsMatrix<Scalar, LO, GO, Node>::
1347 getValuesDevice () const
1348 {
1349 return val_.getDeviceView(Access::ReadOnly);
1350 }
1351
1352 template<class Scalar, class LO, class GO, class Node>
1353 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1354 impl_scalar_type_dualview::t_host
1356 getValuesHostNonConst () const
1357 {
1358 return val_.getHostView(Access::ReadWrite);
1359 }
1360
1361 template<class Scalar, class LO, class GO, class Node>
1363 impl_scalar_type_dualview::t_dev
1366 {
1367 return val_.getDeviceView(Access::ReadWrite);
1368 }
1369
1370 template<class Scalar, class LO, class GO, class Node>
1371 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1372 impl_scalar_type_dualview::t_host::const_type
1374 getValuesHost (const LO& lclRow) const
1375 {
1376 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1377 auto val = val_.getHostView(Access::ReadOnly);
1378 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1379 return r_val;
1380 }
1381
1382 template<class Scalar, class LO, class GO, class Node>
1384 impl_scalar_type_dualview::t_dev::const_type
1386 getValuesDevice (const LO& lclRow) const
1387 {
1388 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1389 auto val = val_.getDeviceView(Access::ReadOnly);
1390 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1391 return r_val;
1392 }
1393
1394 template<class Scalar, class LO, class GO, class Node>
1397 getValuesHostNonConst (const LO& lclRow)
1398 {
1399 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1400 auto val = val_.getHostView(Access::ReadWrite);
1401 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1402 return r_val;
1403 }
1404
1405 template<class Scalar, class LO, class GO, class Node>
1409 {
1410 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1411 auto val = val_.getDeviceView(Access::ReadWrite);
1412 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1413 return r_val;
1414 }
1415
1416 template<class Scalar, class LO, class GO, class Node>
1417 size_t
1420 {
1421 const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1422 if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1423 return static_cast<LO> (0); // the calling process doesn't have that row
1424 } else {
1425 return static_cast<LO> (numEntInGraph);
1426 }
1427 }
1428
1429 template<class Scalar, class LO, class GO, class Node>
1430 typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
1432 getLocalMatrixDevice () const
1433 {
1434 auto numCols = this->graph_.getColMap()->getLocalNumElements();
1435 auto val = val_.getDeviceView(Access::ReadWrite);
1436 const LO blockSize = this->getBlockSize ();
1437 const auto graph = this->graph_.getLocalGraphDevice ();
1438
1439 return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
1440 numCols,
1441 val,
1442 graph,
1443 blockSize);
1444 }
1445
1446 template<class Scalar, class LO, class GO, class Node>
1447 void
1451 const Teuchos::ETransp mode,
1452 const Scalar alpha,
1453 const Scalar beta)
1454 {
1455 (void) X;
1456 (void) Y;
1457 (void) mode;
1458 (void) alpha;
1459 (void) beta;
1460
1462 true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
1463 "transpose and conjugate transpose modes are not yet implemented.");
1464 }
1465
1466 template<class Scalar, class LO, class GO, class Node>
1467 void
1468 BlockCrsMatrix<Scalar, LO, GO, Node>::
1469 applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1470 BlockMultiVector<Scalar, LO, GO, Node>& Y,
1471 const Scalar alpha,
1472 const Scalar beta)
1473 {
1474 using Teuchos::RCP;
1475 using Teuchos::rcp;
1476 typedef ::Tpetra::Import<LO, GO, Node> import_type;
1477 typedef ::Tpetra::Export<LO, GO, Node> export_type;
1478 const Scalar zero = STS::zero ();
1479 const Scalar one = STS::one ();
1480 RCP<const import_type> import = graph_.getImporter ();
1481 // "export" is a reserved C++ keyword, so we can't use it.
1482 RCP<const export_type> theExport = graph_.getExporter ();
1483 const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1484
1485 if (alpha == zero) {
1486 if (beta == zero) {
1487 Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1488 }
1489 else if (beta != one) {
1490 Y.scale (beta);
1491 }
1492 }
1493 else { // alpha != 0
1494 const BMV* X_colMap = NULL;
1495 if (import.is_null ()) {
1496 try {
1497 X_colMap = &X;
1498 }
1499 catch (std::exception& e) {
1500 TEUCHOS_TEST_FOR_EXCEPTION
1501 (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1502 "operator= threw an exception: " << std::endl << e.what ());
1503 }
1504 }
1505 else {
1506 // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1507 // Y_rowMap_ below. This lets us do lazy initialization
1508 // correctly with view semantics of BlockCrsMatrix. All views
1509 // of this BlockCrsMatrix have the same outer pointer. That
1510 // way, we can set the inner pointer in one view, and all
1511 // other views will see it.
1512 if ((*X_colMap_).is_null () ||
1513 (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1514 (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1515 *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1516 static_cast<LO> (X.getNumVectors ())));
1517 }
1518 (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1519 **pointImporter_,
1521 try {
1522 X_colMap = &(**X_colMap_);
1523 }
1524 catch (std::exception& e) {
1525 TEUCHOS_TEST_FOR_EXCEPTION
1526 (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1527 "operator= threw an exception: " << std::endl << e.what ());
1528 }
1529 }
1530
1531 BMV* Y_rowMap = NULL;
1532 if (theExport.is_null ()) {
1533 Y_rowMap = &Y;
1534 }
1535 else if ((*Y_rowMap_).is_null () ||
1536 (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1537 (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1538 *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1539 static_cast<LO> (X.getNumVectors ())));
1540 try {
1541 Y_rowMap = &(**Y_rowMap_);
1542 }
1543 catch (std::exception& e) {
1544 TEUCHOS_TEST_FOR_EXCEPTION(
1545 true, std::logic_error, prefix << "Tpetra::MultiVector::"
1546 "operator= threw an exception: " << std::endl << e.what ());
1547 }
1548 }
1549
1550 try {
1551 localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1552 }
1553 catch (std::exception& e) {
1554 TEUCHOS_TEST_FOR_EXCEPTION
1555 (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1556 "an exception: " << e.what ());
1557 }
1558 catch (...) {
1559 TEUCHOS_TEST_FOR_EXCEPTION
1560 (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1561 "an exception not a subclass of std::exception.");
1562 }
1563
1564 if (! theExport.is_null ()) {
1565 Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1566 }
1567 }
1568 }
1569
1570 template<class Scalar, class LO, class GO, class Node>
1571 void
1572 BlockCrsMatrix<Scalar, LO, GO, Node>::
1573 localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1574 BlockMultiVector<Scalar, LO, GO, Node>& Y,
1575 const Scalar alpha,
1576 const Scalar beta)
1577 {
1578 using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
1579
1580 const impl_scalar_type alpha_impl = alpha;
1581 const auto graph = this->graph_.getLocalGraphDevice ();
1582 const impl_scalar_type beta_impl = beta;
1583 const LO blockSize = this->getBlockSize ();
1584
1585 auto X_mv = X.getMultiVectorView ();
1586 auto Y_mv = Y.getMultiVectorView ();
1587
1588 //auto X_lcl = X_mv.template getLocalView<device_type> (Access::ReadOnly);
1589 //auto Y_lcl = Y_mv.template getLocalView<device_type> (Access::ReadWrite);
1590 auto X_lcl = X_mv.getLocalViewDevice (Access::ReadOnly);
1591 auto Y_lcl = Y_mv.getLocalViewDevice (Access::ReadWrite);
1592 auto val = val_.getDeviceView(Access::ReadWrite);
1593
1594 bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1595 beta_impl, Y_lcl);
1596 }
1597
1598 template<class Scalar, class LO, class GO, class Node>
1599 LO
1600 BlockCrsMatrix<Scalar, LO, GO, Node>::
1601 findRelOffsetOfColumnIndex (const LO localRowIndex,
1602 const LO colIndexToFind,
1603 const LO hint) const
1604 {
1605 const size_t absStartOffset = ptrHost_[localRowIndex];
1606 const size_t absEndOffset = ptrHost_[localRowIndex+1];
1607 const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1608 // Amortize pointer arithmetic over the search loop.
1609 const LO* const curInd = indHost_.data () + absStartOffset;
1610
1611 // If the hint was correct, then the hint is the offset to return.
1612 if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1613 // Always return the offset relative to the current row.
1614 return hint;
1615 }
1616
1617 // The hint was wrong, so we must search for the given column
1618 // index in the column indices for the given row.
1619 LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1620
1621 // We require that the graph have sorted rows. However, binary
1622 // search only pays if the current row is longer than a certain
1623 // amount. We set this to 32, but you might want to tune this.
1624 const LO maxNumEntriesForLinearSearch = 32;
1625 if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1626 // Use binary search. It would probably be better for us to
1627 // roll this loop by hand. If we wrote it right, a smart
1628 // compiler could perhaps use conditional loads and avoid
1629 // branches (according to Jed Brown on May 2014).
1630 const LO* beg = curInd;
1631 const LO* end = curInd + numEntriesInRow;
1632 std::pair<const LO*, const LO*> p =
1633 std::equal_range (beg, end, colIndexToFind);
1634 if (p.first != p.second) {
1635 // offset is relative to the current row
1636 relOffset = static_cast<LO> (p.first - beg);
1637 }
1638 }
1639 else { // use linear search
1640 for (LO k = 0; k < numEntriesInRow; ++k) {
1641 if (colIndexToFind == curInd[k]) {
1642 relOffset = k; // offset is relative to the current row
1643 break;
1644 }
1645 }
1646 }
1647
1648 return relOffset;
1649 }
1650
1651 template<class Scalar, class LO, class GO, class Node>
1652 LO
1653 BlockCrsMatrix<Scalar, LO, GO, Node>::
1654 offsetPerBlock () const
1655 {
1656 return offsetPerBlock_;
1657 }
1658
1659 template<class Scalar, class LO, class GO, class Node>
1661 BlockCrsMatrix<Scalar, LO, GO, Node>::
1662 getConstLocalBlockFromInput (const impl_scalar_type* val,
1663 const size_t pointOffset) const
1664 {
1665 // Row major blocks
1666 const LO rowStride = blockSize_;
1667 return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1668 }
1669
1670 template<class Scalar, class LO, class GO, class Node>
1672 BlockCrsMatrix<Scalar, LO, GO, Node>::
1673 getNonConstLocalBlockFromInput (impl_scalar_type* val,
1674 const size_t pointOffset) const
1675 {
1676 // Row major blocks
1677 const LO rowStride = blockSize_;
1678 return little_block_type (val + pointOffset, blockSize_, rowStride);
1679 }
1680
1681 template<class Scalar, class LO, class GO, class Node>
1682 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1683 BlockCrsMatrix<Scalar, LO, GO, Node>::
1684 getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1685 const size_t pointOffset) const
1686 {
1687 // Row major blocks
1688 const LO rowStride = blockSize_;
1689 const size_t bs2 = blockSize_ * blockSize_;
1690 return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1691 }
1692
1693 template<class Scalar, class LO, class GO, class Node>
1695 BlockCrsMatrix<Scalar, LO, GO, Node>::
1696 getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1697 {
1698 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1699
1700 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1701 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1702 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1703 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1704 auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1705 auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1706 return r_val;
1707 }
1708 else {
1709 return little_block_type ();
1710 }
1711 }
1712
1713 template<class Scalar, class LO, class GO, class Node>
1714 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1715 BlockCrsMatrix<Scalar, LO, GO, Node>::
1716 getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1717 {
1718 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1719
1720 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1721 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1722 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1723 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1724 auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1725 auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1726 return r_val;
1727 }
1728 else {
1729 return little_block_host_type ();
1730 }
1731 }
1732
1733
1734 template<class Scalar, class LO, class GO, class Node>
1735 bool
1736 BlockCrsMatrix<Scalar, LO, GO, Node>::
1737 checkSizes (const ::Tpetra::SrcDistObject& source)
1738 {
1739 using std::endl;
1740 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1741 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1742
1743 if (src == NULL) {
1744 std::ostream& err = markLocalErrorAndGetStream ();
1745 err << "checkSizes: The source object of the Import or Export "
1746 "must be a BlockCrsMatrix with the same template parameters as the "
1747 "target object." << endl;
1748 }
1749 else {
1750 // Use a string of ifs, not if-elseifs, because we want to know
1751 // all the errors.
1752 if (src->getBlockSize () != this->getBlockSize ()) {
1753 std::ostream& err = markLocalErrorAndGetStream ();
1754 err << "checkSizes: The source and target objects of the Import or "
1755 << "Export must have the same block sizes. The source's block "
1756 << "size = " << src->getBlockSize () << " != the target's block "
1757 << "size = " << this->getBlockSize () << "." << endl;
1758 }
1759 if (! src->graph_.isFillComplete ()) {
1760 std::ostream& err = markLocalErrorAndGetStream ();
1761 err << "checkSizes: The source object of the Import or Export is "
1762 "not fill complete. Both source and target objects must be fill "
1763 "complete." << endl;
1764 }
1765 if (! this->graph_.isFillComplete ()) {
1766 std::ostream& err = markLocalErrorAndGetStream ();
1767 err << "checkSizes: The target object of the Import or Export is "
1768 "not fill complete. Both source and target objects must be fill "
1769 "complete." << endl;
1770 }
1771 if (src->graph_.getColMap ().is_null ()) {
1772 std::ostream& err = markLocalErrorAndGetStream ();
1773 err << "checkSizes: The source object of the Import or Export does "
1774 "not have a column Map. Both source and target objects must have "
1775 "column Maps." << endl;
1776 }
1777 if (this->graph_.getColMap ().is_null ()) {
1778 std::ostream& err = markLocalErrorAndGetStream ();
1779 err << "checkSizes: The target object of the Import or Export does "
1780 "not have a column Map. Both source and target objects must have "
1781 "column Maps." << endl;
1782 }
1783 }
1784
1785 return ! (* (this->localError_));
1786 }
1787
1788 template<class Scalar, class LO, class GO, class Node>
1789 void
1792 (const ::Tpetra::SrcDistObject& source,
1793 const size_t numSameIDs,
1794 const Kokkos::DualView<const local_ordinal_type*,
1796 const Kokkos::DualView<const local_ordinal_type*,
1798 const CombineMode /*CM*/)
1799 {
1800 using ::Tpetra::Details::Behavior;
1801 using ::Tpetra::Details::dualViewStatusToString;
1802 using ::Tpetra::Details::ProfilingRegion;
1803 using std::endl;
1805
1806 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1807 const bool debug = Behavior::debug();
1808 const bool verbose = Behavior::verbose();
1809
1810 // Define this function prefix
1811 std::string prefix;
1812 {
1813 std::ostringstream os;
1814 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1815 os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1816 prefix = os.str();
1817 }
1818
1819 // check if this already includes a local error
1820 if (* (this->localError_)) {
1821 std::ostream& err = this->markLocalErrorAndGetStream ();
1822 err << prefix
1823 << "The target object of the Import or Export is already in an error state."
1824 << endl;
1825 return;
1826 }
1827
1828 //
1829 // Verbose input dual view status
1830 //
1831 if (verbose) {
1832 std::ostringstream os;
1833 os << prefix << endl
1834 << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1835 << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1836 std::cerr << os.str ();
1837 }
1838
1842 if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1843 std::ostream& err = this->markLocalErrorAndGetStream ();
1844 err << prefix
1845 << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1846 << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1847 << "." << endl;
1848 return;
1849 }
1850 if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1851 std::ostream& err = this->markLocalErrorAndGetStream ();
1852 err << prefix
1853 << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1854 << endl;
1855 return;
1856 }
1857
1858 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1859 if (src == NULL) {
1860 std::ostream& err = this->markLocalErrorAndGetStream ();
1861 err << prefix
1862 << "The source (input) object of the Import or "
1863 "Export is either not a BlockCrsMatrix, or does not have the right "
1864 "template parameters. checkSizes() should have caught this. "
1865 "Please report this bug to the Tpetra developers." << endl;
1866 return;
1867 }
1868
1869 bool lclErr = false;
1870#ifdef HAVE_TPETRA_DEBUG
1871 std::set<LO> invalidSrcCopyRows;
1872 std::set<LO> invalidDstCopyRows;
1873 std::set<LO> invalidDstCopyCols;
1874 std::set<LO> invalidDstPermuteCols;
1875 std::set<LO> invalidPermuteFromRows;
1876#endif // HAVE_TPETRA_DEBUG
1877
1878 // Copy the initial sequence of rows that are the same.
1879 //
1880 // The two graphs might have different column Maps, so we need to
1881 // do this using global column indices. This is purely local, so
1882 // we only need to check for local sameness of the two column
1883 // Maps.
1884
1885#ifdef HAVE_TPETRA_DEBUG
1886 const map_type& srcRowMap = * (src->graph_.getRowMap ());
1887#endif // HAVE_TPETRA_DEBUG
1888 const map_type& dstRowMap = * (this->graph_.getRowMap ());
1889 const map_type& srcColMap = * (src->graph_.getColMap ());
1890 const map_type& dstColMap = * (this->graph_.getColMap ());
1891 const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1892
1893 const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1894 if (verbose) {
1895 std::ostringstream os;
1896 os << prefix
1897 << "canUseLocalColumnIndices: "
1898 << (canUseLocalColumnIndices ? "true" : "false")
1899 << ", numPermute: " << numPermute
1900 << endl;
1901 std::cerr << os.str ();
1902 }
1903
1904 const auto permuteToLIDsHost = permuteToLIDs.view_host();
1905 const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1906
1908 // Copy local rows that are the "same" in both source and target.
1909 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1910#ifdef HAVE_TPETRA_DEBUG
1911 if (! srcRowMap.isNodeLocalElement (localRow)) {
1912 lclErr = true;
1913 invalidSrcCopyRows.insert (localRow);
1914 continue; // skip invalid rows
1915 }
1916#endif // HAVE_TPETRA_DEBUG
1917
1918 local_inds_host_view_type lclSrcCols;
1919 values_host_view_type vals;
1920 LO numEntries;
1921 // If this call fails, that means the mesh row local index is
1922 // invalid. That means the Import or Export is invalid somehow.
1923 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1924 if (numEntries > 0) {
1925 LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1926 if (err != numEntries) {
1927 lclErr = true;
1928 if (! dstRowMap.isNodeLocalElement (localRow)) {
1929#ifdef HAVE_TPETRA_DEBUG
1930 invalidDstCopyRows.insert (localRow);
1931#endif // HAVE_TPETRA_DEBUG
1932 }
1933 else {
1934 // Once there's an error, there's no sense in saving
1935 // time, so we check whether the column indices were
1936 // invalid. However, only remember which ones were
1937 // invalid in a debug build, because that might take a
1938 // lot of space.
1939 for (LO k = 0; k < numEntries; ++k) {
1940 if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1941 lclErr = true;
1942#ifdef HAVE_TPETRA_DEBUG
1943 (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1944#endif // HAVE_TPETRA_DEBUG
1945 }
1946 }
1947 }
1948 }
1949 }
1950 } // for each "same" local row
1951
1952 // Copy the "permute" local rows.
1953 for (size_t k = 0; k < numPermute; ++k) {
1954 const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1955 const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1956
1957 local_inds_host_view_type lclSrcCols;
1958 values_host_view_type vals;
1959 LO numEntries;
1960 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1961 if (numEntries > 0) {
1962 LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1963 if (err != numEntries) {
1964 lclErr = true;
1965#ifdef HAVE_TPETRA_DEBUG
1966 for (LO c = 0; c < numEntries; ++c) {
1967 if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1969 }
1970 }
1971#endif // HAVE_TPETRA_DEBUG
1972 }
1973 }
1974 }
1975 }
1976 else { // must convert column indices to global
1977 // Reserve space to store the destination matrix's local column indices.
1978 const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1979 Teuchos::Array<LO> lclDstCols (maxNumEnt);
1980
1981 // Copy local rows that are the "same" in both source and target.
1982 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1983 local_inds_host_view_type lclSrcCols;
1984 values_host_view_type vals;
1985 LO numEntries;
1986
1987 // If this call fails, that means the mesh row local index is
1988 // invalid. That means the Import or Export is invalid somehow.
1989 try {
1990 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1991 } catch (std::exception& e) {
1992 if (debug) {
1993 std::ostringstream os;
1994 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1995 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1996 << localRow << ", src->getLocalRowView() threw an exception: "
1997 << e.what ();
1998 std::cerr << os.str ();
1999 }
2000 throw e;
2001 }
2002
2003 if (numEntries > 0) {
2004 if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2005 lclErr = true;
2006 if (debug) {
2007 std::ostringstream os;
2008 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2009 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2010 << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2011 << maxNumEnt << endl;
2012 std::cerr << os.str ();
2013 }
2014 }
2015 else {
2016 // Convert the source matrix's local column indices to the
2017 // destination matrix's local column indices.
2018 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2019 for (LO j = 0; j < numEntries; ++j) {
2020 lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2021 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2022 lclErr = true;
2023#ifdef HAVE_TPETRA_DEBUG
2025#endif // HAVE_TPETRA_DEBUG
2026 }
2027 }
2028 LO err(0);
2029 try {
2030 err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
2031 } catch (std::exception& e) {
2032 if (debug) {
2033 std::ostringstream os;
2034 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2035 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2036 << localRow << ", this->replaceLocalValues() threw an exception: "
2037 << e.what ();
2038 std::cerr << os.str ();
2039 }
2040 throw e;
2041 }
2042 if (err != numEntries) {
2043 lclErr = true;
2044 if (debug) {
2045 std::ostringstream os;
2046 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2047 os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2048 "localRow " << localRow << ", this->replaceLocalValues "
2049 "returned " << err << " instead of numEntries = "
2050 << numEntries << endl;
2051 std::cerr << os.str ();
2052 }
2053 }
2054 }
2055 }
2056 }
2057
2058 // Copy the "permute" local rows.
2059 for (size_t k = 0; k < numPermute; ++k) {
2060 const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2061 const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2062
2063 local_inds_host_view_type lclSrcCols;
2064 values_host_view_type vals;
2065 LO numEntries;
2066
2067 try {
2068 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
2069 } catch (std::exception& e) {
2070 if (debug) {
2071 std::ostringstream os;
2072 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2073 os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2074 "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2075 << ", src->getLocalRowView() threw an exception: " << e.what ();
2076 std::cerr << os.str ();
2077 }
2078 throw e;
2079 }
2080
2081 if (numEntries > 0) {
2082 if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2083 lclErr = true;
2084 }
2085 else {
2086 // Convert the source matrix's local column indices to the
2087 // destination matrix's local column indices.
2088 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2089 for (LO j = 0; j < numEntries; ++j) {
2090 lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2091 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2092 lclErr = true;
2093#ifdef HAVE_TPETRA_DEBUG
2095#endif // HAVE_TPETRA_DEBUG
2096 }
2097 }
2098 LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
2099 if (err != numEntries) {
2100 lclErr = true;
2101 }
2102 }
2103 }
2104 }
2105 }
2106
2107 if (lclErr) {
2108 std::ostream& err = this->markLocalErrorAndGetStream ();
2109#ifdef HAVE_TPETRA_DEBUG
2110 err << "copyAndPermute: The graph structure of the source of the "
2111 "Import or Export must be a subset of the graph structure of the "
2112 "target. ";
2113 err << "invalidSrcCopyRows = [";
2114 for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2115 it != invalidSrcCopyRows.end (); ++it) {
2116 err << *it;
2117 typename std::set<LO>::const_iterator itp1 = it;
2118 itp1++;
2119 if (itp1 != invalidSrcCopyRows.end ()) {
2120 err << ",";
2121 }
2122 }
2123 err << "], invalidDstCopyRows = [";
2124 for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2125 it != invalidDstCopyRows.end (); ++it) {
2126 err << *it;
2127 typename std::set<LO>::const_iterator itp1 = it;
2128 itp1++;
2129 if (itp1 != invalidDstCopyRows.end ()) {
2130 err << ",";
2131 }
2132 }
2133 err << "], invalidDstCopyCols = [";
2134 for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2135 it != invalidDstCopyCols.end (); ++it) {
2136 err << *it;
2137 typename std::set<LO>::const_iterator itp1 = it;
2138 itp1++;
2139 if (itp1 != invalidDstCopyCols.end ()) {
2140 err << ",";
2141 }
2142 }
2143 err << "], invalidDstPermuteCols = [";
2144 for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2145 it != invalidDstPermuteCols.end (); ++it) {
2146 err << *it;
2147 typename std::set<LO>::const_iterator itp1 = it;
2148 itp1++;
2149 if (itp1 != invalidDstPermuteCols.end ()) {
2150 err << ",";
2151 }
2152 }
2153 err << "], invalidPermuteFromRows = [";
2154 for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2155 it != invalidPermuteFromRows.end (); ++it) {
2156 err << *it;
2157 typename std::set<LO>::const_iterator itp1 = it;
2158 itp1++;
2159 if (itp1 != invalidPermuteFromRows.end ()) {
2160 err << ",";
2161 }
2162 }
2163 err << "]" << endl;
2164#else
2165 err << "copyAndPermute: The graph structure of the source of the "
2166 "Import or Export must be a subset of the graph structure of the "
2167 "target." << endl;
2168#endif // HAVE_TPETRA_DEBUG
2169 }
2170
2171 if (debug) {
2172 std::ostringstream os;
2173 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2174 const bool lclSuccess = ! (* (this->localError_));
2175 os << "*** Proc " << myRank << ": copyAndPermute "
2176 << (lclSuccess ? "succeeded" : "FAILED");
2177 if (lclSuccess) {
2178 os << endl;
2179 } else {
2180 os << ": error messages: " << this->errorMessages (); // comes w/ endl
2181 }
2182 std::cerr << os.str ();
2183 }
2184 }
2185
2186 namespace { // (anonymous)
2187
2206 template<class LO, class GO>
2207 size_t
2208 packRowCount (const size_t numEnt,
2209 const size_t numBytesPerValue,
2210 const size_t blkSize)
2211 {
2212 using ::Tpetra::Details::PackTraits;
2213
2214 if (numEnt == 0) {
2215 // Empty rows always take zero bytes, to ensure sparsity.
2216 return 0;
2217 }
2218 else {
2219 // We store the number of entries as a local index (LO).
2220 LO numEntLO = 0; // packValueCount wants this.
2221 GO gid {};
2222 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2223 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2224 const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2225 return numEntLen + gidsLen + valsLen;
2226 }
2227 }
2228
2239 template<class ST, class LO, class GO>
2240 size_t
2241 unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
2242 const size_t offset,
2243 const size_t numBytes,
2244 const size_t /* numBytesPerValue */)
2245 {
2246 using Kokkos::subview;
2247 using ::Tpetra::Details::PackTraits;
2248
2249 if (numBytes == 0) {
2250 // Empty rows always take zero bytes, to ensure sparsity.
2251 return static_cast<size_t> (0);
2252 }
2253 else {
2254 LO numEntLO = 0;
2255 const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
2256 TEUCHOS_TEST_FOR_EXCEPTION
2257 (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2258 "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2259 << ".");
2260 const char* const inBuf = imports.data () + offset;
2261 const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
2262 TEUCHOS_TEST_FOR_EXCEPTION
2263 (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2264 "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2265 << ".");
2266 return static_cast<size_t> (numEntLO);
2267 }
2268 }
2269
2273 template<class ST, class LO, class GO>
2274 size_t
2275 packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
2276 const size_t offset,
2277 const size_t numEnt,
2278 const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
2279 const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
2280 const size_t numBytesPerValue,
2281 const size_t blockSize)
2282 {
2283 using Kokkos::subview;
2284 using ::Tpetra::Details::PackTraits;
2285
2286 if (numEnt == 0) {
2287 // Empty rows always take zero bytes, to ensure sparsity.
2288 return 0;
2289 }
2290 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2291
2292 const GO gid = 0; // packValueCount wants this
2293 const LO numEntLO = static_cast<size_t> (numEnt);
2294
2295 const size_t numEntBeg = offset;
2296 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2297 const size_t gidsBeg = numEntBeg + numEntLen;
2298 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2299 const size_t valsBeg = gidsBeg + gidsLen;
2300 const size_t valsLen = numScalarEnt * numBytesPerValue;
2301
2302 char* const numEntOut = exports.data () + numEntBeg;
2303 char* const gidsOut = exports.data () + gidsBeg;
2304 char* const valsOut = exports.data () + valsBeg;
2305
2306 size_t numBytesOut = 0;
2307 int errorCode = 0;
2308 numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
2309
2310 {
2311 Kokkos::pair<int, size_t> p;
2312 p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
2313 errorCode += p.first;
2314 numBytesOut += p.second;
2315
2316 p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
2317 errorCode += p.first;
2318 numBytesOut += p.second;
2319 }
2320
2321 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2322 TEUCHOS_TEST_FOR_EXCEPTION
2323 (numBytesOut != expectedNumBytes, std::logic_error,
2324 "packRowForBlockCrs: numBytesOut = " << numBytesOut
2325 << " != expectedNumBytes = " << expectedNumBytes << ".");
2326
2327 TEUCHOS_TEST_FOR_EXCEPTION
2328 (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
2329 "PackTraits::packArray returned a nonzero error code");
2330
2331 return numBytesOut;
2332 }
2333
2334 // Return the number of bytes actually read / used.
2335 template<class ST, class LO, class GO>
2336 size_t
2337 unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
2338 const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
2339 const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
2340 const size_t offset,
2341 const size_t numBytes,
2342 const size_t numEnt,
2343 const size_t numBytesPerValue,
2344 const size_t blockSize)
2345 {
2346 using ::Tpetra::Details::PackTraits;
2347
2348 if (numBytes == 0) {
2349 // Rows with zero bytes always have zero entries.
2350 return 0;
2351 }
2352 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2353 TEUCHOS_TEST_FOR_EXCEPTION
2354 (static_cast<size_t> (imports.extent (0)) <= offset,
2355 std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2356 << imports.extent (0) << " <= offset = " << offset << ".");
2357 TEUCHOS_TEST_FOR_EXCEPTION
2358 (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2359 std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2360 << imports.extent (0) << " < offset + numBytes = "
2361 << (offset + numBytes) << ".");
2362
2363 const GO gid = 0; // packValueCount wants this
2364 const LO lid = 0; // packValueCount wants this
2365
2366 const size_t numEntBeg = offset;
2367 const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
2368 const size_t gidsBeg = numEntBeg + numEntLen;
2369 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2370 const size_t valsBeg = gidsBeg + gidsLen;
2371 const size_t valsLen = numScalarEnt * numBytesPerValue;
2372
2373 const char* const numEntIn = imports.data () + numEntBeg;
2374 const char* const gidsIn = imports.data () + gidsBeg;
2375 const char* const valsIn = imports.data () + valsBeg;
2376
2377 size_t numBytesOut = 0;
2378 int errorCode = 0;
2379 LO numEntOut;
2380 numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
2381 TEUCHOS_TEST_FOR_EXCEPTION
2382 (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2383 "unpackRowForBlockCrs: Expected number of entries " << numEnt
2384 << " != actual number of entries " << numEntOut << ".");
2385
2386 {
2387 Kokkos::pair<int, size_t> p;
2388 p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2389 errorCode += p.first;
2390 numBytesOut += p.second;
2391
2392 p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2393 errorCode += p.first;
2394 numBytesOut += p.second;
2395 }
2396
2397 TEUCHOS_TEST_FOR_EXCEPTION
2398 (numBytesOut != numBytes, std::logic_error,
2399 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2400 << " != numBytes = " << numBytes << ".");
2401
2402 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2403 TEUCHOS_TEST_FOR_EXCEPTION
2404 (numBytesOut != expectedNumBytes, std::logic_error,
2405 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2406 << " != expectedNumBytes = " << expectedNumBytes << ".");
2407
2408 TEUCHOS_TEST_FOR_EXCEPTION
2409 (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
2410 "PackTraits::unpackArray returned a nonzero error code");
2411
2412 return numBytesOut;
2413 }
2414 } // namespace (anonymous)
2415
2416 template<class Scalar, class LO, class GO, class Node>
2417 void
2420 (const ::Tpetra::SrcDistObject& source,
2421 const Kokkos::DualView<const local_ordinal_type*,
2423 Kokkos::DualView<packet_type*,
2424 buffer_device_type>& exports, // output
2425 Kokkos::DualView<size_t*,
2427 size_t& constantNumPackets)
2428 {
2429 using ::Tpetra::Details::Behavior;
2430 using ::Tpetra::Details::dualViewStatusToString;
2431 using ::Tpetra::Details::ProfilingRegion;
2432 using ::Tpetra::Details::PackTraits;
2433
2434 typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2435
2437
2438 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
2439
2440 const bool debug = Behavior::debug();
2441 const bool verbose = Behavior::verbose();
2442
2443 // Define this function prefix
2444 std::string prefix;
2445 {
2446 std::ostringstream os;
2447 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2448 os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
2449 prefix = os.str();
2450 }
2451
2452 // check if this already includes a local error
2453 if (* (this->localError_)) {
2454 std::ostream& err = this->markLocalErrorAndGetStream ();
2455 err << prefix
2456 << "The target object of the Import or Export is already in an error state."
2457 << std::endl;
2458 return;
2459 }
2460
2461 //
2462 // Verbose input dual view status
2463 //
2464 if (verbose) {
2465 std::ostringstream os;
2466 os << prefix << std::endl
2467 << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2468 << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2469 << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2470 std::cerr << os.str ();
2471 }
2472
2476 if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2477 std::ostream& err = this->markLocalErrorAndGetStream ();
2478 err << prefix
2479 << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2480 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2481 << "." << std::endl;
2482 return;
2483 }
2484 if (exportLIDs.need_sync_host ()) {
2485 std::ostream& err = this->markLocalErrorAndGetStream ();
2486 err << prefix << "exportLIDs be sync'd to host." << std::endl;
2487 return;
2488 }
2489
2490 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
2491 if (src == NULL) {
2492 std::ostream& err = this->markLocalErrorAndGetStream ();
2493 err << prefix
2494 << "The source (input) object of the Import or "
2495 "Export is either not a BlockCrsMatrix, or does not have the right "
2496 "template parameters. checkSizes() should have caught this. "
2497 "Please report this bug to the Tpetra developers." << std::endl;
2498 return;
2499 }
2500
2501 // Graphs and matrices are allowed to have a variable number of
2502 // entries per row. We could test whether all rows have the same
2503 // number of entries, but DistObject can only use this
2504 // optimization if all rows on _all_ processes have the same
2505 // number of entries. Rather than do the all-reduce necessary to
2506 // test for this unlikely case, we tell DistObject (by setting
2507 // constantNumPackets to zero) to assume that different rows may
2508 // have different numbers of entries.
2510
2511 // const values
2512 const crs_graph_type& srcGraph = src->graph_;
2513 const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2514 const size_t numExportLIDs = exportLIDs.extent (0);
2515 size_t numBytesPerValue(0);
2516 {
2517 auto val_host = val_.getHostView(Access::ReadOnly);
2519 PackTraits<impl_scalar_type>
2520 ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2521 }
2522
2523 // Compute the number of bytes ("packets") per row to pack. While
2524 // we're at it, compute the total # of block entries to send, and
2525 // the max # of block entries in any of the rows we're sending.
2526
2527 Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2528
2529 // Graph information is on host; let's do this on host parallel reduce
2530 auto exportLIDsHost = exportLIDs.view_host();
2531 auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2532 numPacketsPerLID.modify_host ();
2533 {
2534 using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2535 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2536 Kokkos::parallel_reduce
2537 (policy,
2538 [=](const size_t &i, typename reducer_type::value_type &update) {
2539 const LO lclRow = exportLIDsHost(i);
2540 size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2541 numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2542
2543 const size_t numBytes =
2546 update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2547 }, rowReducerStruct);
2548 }
2549
2550 // Compute the number of bytes ("packets") per row to pack. While
2551 // we're at it, compute the total # of block entries to send, and
2552 // the max # of block entries in any of the rows we're sending.
2553 const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2554 const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2555 const size_t maxRowLength = rowReducerStruct.maxRowLength;
2556
2557 if (verbose) {
2558 std::ostringstream os;
2559 os << prefix
2560 << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2561 << std::endl;
2562 std::cerr << os.str ();
2563 }
2564
2565 // We use a "struct of arrays" approach to packing each row's
2566 // entries. All the column indices (as global indices) go first,
2567 // then all their owning process ranks, and then the values.
2568 if(exports.extent(0) != totalNumBytes)
2569 {
2570 const std::string oldLabel = exports.d_view.label ();
2571 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2572 exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2573 }
2574 if (totalNumEntries > 0) {
2575 // Current position (in bytes) in the 'exports' output array.
2576 Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2577 {
2578 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2579 Kokkos::parallel_scan
2580 (policy,
2581 [=](const size_t &i, size_t &update, const bool &final) {
2582 if (final) offset(i) = update;
2583 update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2584 });
2585 }
2586 if (offset(numExportLIDs) != totalNumBytes) {
2587 std::ostream& err = this->markLocalErrorAndGetStream ();
2588 err << prefix
2589 << "At end of method, the final offset (in bytes) "
2590 << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2591 << totalNumBytes << ". "
2592 << "Please report this bug to the Tpetra developers." << std::endl;
2593 return;
2594 }
2595
2596 // For each block row of the matrix owned by the calling
2597 // process, pack that block row's column indices and values into
2598 // the exports array.
2599
2600 // Source matrix's column Map. We verified in checkSizes() that
2601 // the column Map exists (is not null).
2602 const map_type& srcColMap = * (srcGraph.getColMap ());
2603
2604 // Pack the data for each row to send, into the 'exports' buffer.
2605 // exports will be modified on host.
2606 exports.modify_host();
2607 {
2608 typedef Kokkos::TeamPolicy<host_exec> policy_type;
2609 const auto policy =
2611 .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2612 Kokkos::parallel_for
2613 (policy,
2614 [=](const typename policy_type::member_type &member) {
2615 const size_t i = member.league_rank();
2616 Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2617 gblColInds(member.team_scratch(0), maxRowLength);
2618
2619 const LO lclRowInd = exportLIDsHost(i);
2620 local_inds_host_view_type lclColInds;
2621 values_host_view_type vals;
2622
2623 // It's OK to ignore the return value, since if the calling
2624 // process doesn't own that local row, then the number of
2625 // entries in that row on the calling process is zero.
2626 src->getLocalRowView (lclRowInd, lclColInds, vals);
2627 const size_t numEnt = lclColInds.extent(0);
2628
2629 // Convert column indices from local to global.
2630 for (size_t j = 0; j < numEnt; ++j)
2631 gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2632
2633 // Kyungjoo: additional wrapping scratch view is necessary
2634 // the following function interface need the same execution space
2635 // host scratch space somehow is not considered same as the host_exec
2636 // Copy the row's data into the current spot in the exports array.
2637 const size_t numBytes =
2639 (exports.view_host(),
2640 offset(i),
2641 numEnt,
2642 Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2643 vals,
2645 blockSize);
2646
2647 // numBytes should be same as the difference between offsets
2648 if (debug) {
2649 const size_t offsetDiff = offset(i+1) - offset(i);
2650 if (numBytes != offsetDiff) {
2651 std::ostringstream os;
2652 os << prefix
2653 << "numBytes computed from packRowForBlockCrs is different from "
2654 << "precomputed offset values, LID = " << i << std::endl;
2655 std::cerr << os.str ();
2656 }
2657 }
2658 }); // for each LID (of a row) to send
2659 }
2660 } // if totalNumEntries > 0
2661
2662 if (debug) {
2663 std::ostringstream os;
2664 const bool lclSuccess = ! (* (this->localError_));
2665 os << prefix
2666 << (lclSuccess ? "succeeded" : "FAILED")
2667 << std::endl;
2668 std::cerr << os.str ();
2669 }
2670 }
2671
2672 template<class Scalar, class LO, class GO, class Node>
2673 void
2676 (const Kokkos::DualView<const local_ordinal_type*,
2678 Kokkos::DualView<packet_type*,
2679 buffer_device_type> imports,
2680 Kokkos::DualView<size_t*,
2682 const size_t /* constantNumPackets */,
2684 {
2685 using ::Tpetra::Details::Behavior;
2686 using ::Tpetra::Details::dualViewStatusToString;
2687 using ::Tpetra::Details::ProfilingRegion;
2688 using ::Tpetra::Details::PackTraits;
2689 using std::endl;
2690 using host_exec =
2691 typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2692
2693 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2694 const bool verbose = Behavior::verbose ();
2695
2696 // Define this function prefix
2697 std::string prefix;
2698 {
2699 std::ostringstream os;
2700 auto map = this->graph_.getRowMap ();
2701 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2702 const int myRank = comm.is_null () ? -1 : comm->getRank ();
2703 os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2704 prefix = os.str ();
2705 if (verbose) {
2706 os << "Start" << endl;
2707 std::cerr << os.str ();
2708 }
2709 }
2710
2711 // check if this already includes a local error
2712 if (* (this->localError_)) {
2713 std::ostream& err = this->markLocalErrorAndGetStream ();
2714 std::ostringstream os;
2715 os << prefix << "{Im/Ex}port target is already in error." << endl;
2716 if (verbose) {
2717 std::cerr << os.str ();
2718 }
2719 err << os.str ();
2720 return;
2721 }
2722
2726 if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2727 std::ostream& err = this->markLocalErrorAndGetStream ();
2728 std::ostringstream os;
2729 os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2730 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2731 << "." << endl;
2732 if (verbose) {
2733 std::cerr << os.str ();
2734 }
2735 err << os.str ();
2736 return;
2737 }
2738
2739 if (combineMode != ADD && combineMode != INSERT &&
2741 combineMode != ZERO) {
2742 std::ostream& err = this->markLocalErrorAndGetStream ();
2743 std::ostringstream os;
2744 os << prefix
2745 << "Invalid CombineMode value " << combineMode << ". Valid "
2746 << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2747 << std::endl;
2748 if (verbose) {
2749 std::cerr << os.str ();
2750 }
2751 err << os.str ();
2752 return;
2753 }
2754
2755 if (this->graph_.getColMap ().is_null ()) {
2756 std::ostream& err = this->markLocalErrorAndGetStream ();
2757 std::ostringstream os;
2758 os << prefix << "Target matrix's column Map is null." << endl;
2759 if (verbose) {
2760 std::cerr << os.str ();
2761 }
2762 err << os.str ();
2763 return;
2764 }
2765
2766 // Target matrix's column Map. Use to convert the global column
2767 // indices in the receive buffer to local indices. We verified in
2768 // checkSizes() that the column Map exists (is not null).
2769 const map_type& tgtColMap = * (this->graph_.getColMap ());
2770
2771 // Const values
2772 const size_t blockSize = this->getBlockSize ();
2773 const size_t numImportLIDs = importLIDs.extent(0);
2774 // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2775 // default-constructed instance could have a different size than
2776 // other instances. (We assume all nominally constructed
2777 // instances have the same size; that's not the issue here.) This
2778 // could be bad if the calling process has no entries, but other
2779 // processes have entries that they want to send to us.
2780 size_t numBytesPerValue(0);
2781 {
2782 auto val_host = val_.getHostView(Access::ReadOnly);
2784 PackTraits<impl_scalar_type>::packValueCount
2785 (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2786 }
2787 const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2789
2790 if (verbose) {
2791 std::ostringstream os;
2792 os << prefix << "combineMode: "
2794 << ", blockSize: " << blockSize
2795 << ", numImportLIDs: " << numImportLIDs
2796 << ", numBytesPerValue: " << numBytesPerValue
2797 << ", maxRowNumEnt: " << maxRowNumEnt
2798 << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2799 std::cerr << os.str ();
2800 }
2801
2802 if (combineMode == ZERO || numImportLIDs == 0) {
2803 if (verbose) {
2804 std::ostringstream os;
2805 os << prefix << "Nothing to unpack. Done!" << std::endl;
2806 std::cerr << os.str ();
2807 }
2808 return;
2809 }
2810
2811 // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2812 // we can remove this sync.
2813 if (imports.need_sync_host ()) {
2814 imports.sync_host ();
2815 }
2816
2817 // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2818 // sync'd numPacketsPerLID to host, since it needs to do that in
2819 // order to post MPI_Irecv messages with the correct lengths. A
2820 // hypothetical device-specific boundary exchange implementation
2821 // could possibly receive data without sync'ing lengths to host,
2822 // but we don't need to design for that nonexistent thing yet.
2823
2824 if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2825 importLIDs.need_sync_host ()) {
2826 std::ostream& err = this->markLocalErrorAndGetStream ();
2827 std::ostringstream os;
2828 os << prefix << "All input DualViews must be sync'd to host by now. "
2829 << "imports_nc.need_sync_host()="
2830 << (imports.need_sync_host () ? "true" : "false")
2831 << ", numPacketsPerLID.need_sync_host()="
2832 << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2833 << ", importLIDs.need_sync_host()="
2834 << (importLIDs.need_sync_host () ? "true" : "false")
2835 << "." << endl;
2836 if (verbose) {
2837 std::cerr << os.str ();
2838 }
2839 err << os.str ();
2840 return;
2841 }
2842
2843 const auto importLIDsHost = importLIDs.view_host ();
2844 const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2845
2846 // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2847 // loop, by only unpacking on final==true (when we know the
2848 // current offset's value).
2849
2850 Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2851 {
2852 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2853 Kokkos::parallel_scan
2854 ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2855 [=] (const size_t &i, size_t &update, const bool &final) {
2856 if (final) offset(i) = update;
2857 update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2858 });
2859 }
2860
2861 // this variable does not matter with a race condition (just error flag)
2862 //
2863 // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2864 // or atomics on bool, so we use int instead. (I know we're not
2865 // launching a CUDA loop here, but we could in the future, and I'd
2866 // like to avoid potential trouble.)
2867 Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2868 errorDuringUnpack ("errorDuringUnpack");
2869 errorDuringUnpack () = 0;
2870 {
2871 using policy_type = Kokkos::TeamPolicy<host_exec>;
2872 size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2873 + 2 * sizeof(GO); // Yeah, this is a fudge factor
2874
2875 const auto policy = policy_type (numImportLIDs, 1, 1)
2876 .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2877 using host_scratch_space = typename host_exec::scratch_memory_space;
2878
2879 using pair_type = Kokkos::pair<size_t, size_t>;
2880 Kokkos::parallel_for
2881 ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2882 [=] (const typename policy_type::member_type& member) {
2883 const size_t i = member.league_rank();
2884 Kokkos::View<GO*, host_scratch_space> gblColInds
2885 (member.team_scratch (0), maxRowNumEnt);
2886 Kokkos::View<LO*, host_scratch_space> lclColInds
2887 (member.team_scratch (0), maxRowNumEnt);
2888 Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2889 (member.team_scratch (0), maxRowNumScalarEnt);
2890
2891
2892 const size_t offval = offset(i);
2893 const LO lclRow = importLIDsHost(i);
2894 const size_t numBytes = numPacketsPerLIDHost(i);
2895 const size_t numEnt =
2897 (imports.view_host (), offval, numBytes, numBytesPerValue);
2898
2899 if (numBytes > 0) {
2900 if (numEnt > maxRowNumEnt) {
2901 errorDuringUnpack() = 1;
2902 if (verbose) {
2903 std::ostringstream os;
2904 os << prefix
2905 << "At i = " << i << ", numEnt = " << numEnt
2906 << " > maxRowNumEnt = " << maxRowNumEnt
2907 << std::endl;
2908 std::cerr << os.str ();
2909 }
2910 return;
2911 }
2912 }
2913 const size_t numScalarEnt = numEnt*blockSize*blockSize;
2914 auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2915 auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2916 auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2917
2918 // Kyungjoo: additional wrapping scratch view is necessary
2919 // the following function interface need the same execution space
2920 // host scratch space somehow is not considered same as the host_exec
2921 size_t numBytesOut = 0;
2922 try {
2923 numBytesOut =
2925 (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2926 Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2927 imports.view_host(),
2930 }
2931 catch (std::exception& e) {
2932 errorDuringUnpack() = 1;
2933 if (verbose) {
2934 std::ostringstream os;
2935 os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2936 << e.what () << endl;
2937 std::cerr << os.str ();
2938 }
2939 return;
2940 }
2941
2942 if (numBytes != numBytesOut) {
2943 errorDuringUnpack() = 1;
2944 if (verbose) {
2945 std::ostringstream os;
2946 os << prefix
2947 << "At i = " << i << ", numBytes = " << numBytes
2948 << " != numBytesOut = " << numBytesOut << "."
2949 << std::endl;
2950 std::cerr << os.str();
2951 }
2952 return;
2953 }
2954
2955 // Convert incoming global indices to local indices.
2956 for (size_t k = 0; k < numEnt; ++k) {
2957 lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2958 if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2959 if (verbose) {
2960 std::ostringstream os;
2961 os << prefix
2962 << "At i = " << i << ", GID " << gidsOut(k)
2963 << " is not owned by the calling process."
2964 << std::endl;
2965 std::cerr << os.str();
2966 }
2967 return;
2968 }
2969 }
2970
2971 // Combine the incoming data with the matrix's current data.
2972 LO numCombd = 0;
2973 const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2974 const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2975 (const_cast<const impl_scalar_type*> (valsOut.data ()));
2976 if (combineMode == ADD) {
2977 numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2978 } else if (combineMode == INSERT || combineMode == REPLACE) {
2979 numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2980 } else if (combineMode == ABSMAX) {
2981 numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2982 }
2983
2984 if (static_cast<LO> (numEnt) != numCombd) {
2985 errorDuringUnpack() = 1;
2986 if (verbose) {
2987 std::ostringstream os;
2988 os << prefix << "At i = " << i << ", numEnt = " << numEnt
2989 << " != numCombd = " << numCombd << "."
2990 << endl;
2991 std::cerr << os.str ();
2992 }
2993 return;
2994 }
2995 }); // for each import LID i
2996 }
2997
2998 if (errorDuringUnpack () != 0) {
2999 std::ostream& err = this->markLocalErrorAndGetStream ();
3000 err << prefix << "Unpacking failed.";
3001 if (! verbose) {
3002 err << " Please run again with the environment variable setting "
3003 "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
3004 }
3005 err << endl;
3006 }
3007
3008 if (verbose) {
3009 std::ostringstream os;
3010 const bool lclSuccess = ! (* (this->localError_));
3011 os << prefix
3012 << (lclSuccess ? "succeeded" : "FAILED")
3013 << std::endl;
3014 std::cerr << os.str ();
3015 }
3016 }
3017
3018 template<class Scalar, class LO, class GO, class Node>
3019 std::string
3021 {
3022 using Teuchos::TypeNameTraits;
3023 std::ostringstream os;
3024 os << "\"Tpetra::BlockCrsMatrix\": { "
3025 << "Template parameters: { "
3026 << "Scalar: " << TypeNameTraits<Scalar>::name ()
3027 << "LO: " << TypeNameTraits<LO>::name ()
3028 << "GO: " << TypeNameTraits<GO>::name ()
3029 << "Node: " << TypeNameTraits<Node>::name ()
3030 << " }"
3031 << ", Label: \"" << this->getObjectLabel () << "\""
3032 << ", Global dimensions: ["
3033 << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3034 << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3035 << ", Block size: " << getBlockSize ()
3036 << " }";
3037 return os.str ();
3038 }
3039
3040
3041 template<class Scalar, class LO, class GO, class Node>
3042 void
3044 describe (Teuchos::FancyOStream& out,
3045 const Teuchos::EVerbosityLevel verbLevel) const
3046 {
3047 using Teuchos::ArrayRCP;
3048 using Teuchos::CommRequest;
3049 using Teuchos::FancyOStream;
3050 using Teuchos::getFancyOStream;
3051 using Teuchos::ireceive;
3052 using Teuchos::isend;
3053 using Teuchos::outArg;
3054 using Teuchos::TypeNameTraits;
3055 using Teuchos::VERB_DEFAULT;
3056 using Teuchos::VERB_NONE;
3057 using Teuchos::VERB_LOW;
3058 using Teuchos::VERB_MEDIUM;
3059 // using Teuchos::VERB_HIGH;
3060 using Teuchos::VERB_EXTREME;
3061 using Teuchos::RCP;
3062 using Teuchos::wait;
3063 using std::endl;
3064
3065 // Set default verbosity if applicable.
3066 const Teuchos::EVerbosityLevel vl =
3068
3069 if (vl == VERB_NONE) {
3070 return; // print nothing
3071 }
3072
3073 // describe() always starts with a tab before it prints anything.
3074 Teuchos::OSTab tab0 (out);
3075
3076 out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3077 Teuchos::OSTab tab1 (out);
3078
3079 out << "Template parameters:" << endl;
3080 {
3081 Teuchos::OSTab tab2 (out);
3082 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3083 << "LO: " << TypeNameTraits<LO>::name () << endl
3084 << "GO: " << TypeNameTraits<GO>::name () << endl
3085 << "Node: " << TypeNameTraits<Node>::name () << endl;
3086 }
3087 out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3088 << "Global dimensions: ["
3089 << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3090 << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3091
3092 const LO blockSize = getBlockSize ();
3093 out << "Block size: " << blockSize << endl;
3094
3095 // constituent objects
3096 if (vl >= VERB_MEDIUM) {
3097 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3098 const int myRank = comm.getRank ();
3099 if (myRank == 0) {
3100 out << "Row Map:" << endl;
3101 }
3102 getRowMap()->describe (out, vl);
3103
3104 if (! getColMap ().is_null ()) {
3105 if (getColMap() == getRowMap()) {
3106 if (myRank == 0) {
3107 out << "Column Map: same as row Map" << endl;
3108 }
3109 }
3110 else {
3111 if (myRank == 0) {
3112 out << "Column Map:" << endl;
3113 }
3114 getColMap ()->describe (out, vl);
3115 }
3116 }
3117 if (! getDomainMap ().is_null ()) {
3118 if (getDomainMap () == getRowMap ()) {
3119 if (myRank == 0) {
3120 out << "Domain Map: same as row Map" << endl;
3121 }
3122 }
3123 else if (getDomainMap () == getColMap ()) {
3124 if (myRank == 0) {
3125 out << "Domain Map: same as column Map" << endl;
3126 }
3127 }
3128 else {
3129 if (myRank == 0) {
3130 out << "Domain Map:" << endl;
3131 }
3132 getDomainMap ()->describe (out, vl);
3133 }
3134 }
3135 if (! getRangeMap ().is_null ()) {
3136 if (getRangeMap () == getDomainMap ()) {
3137 if (myRank == 0) {
3138 out << "Range Map: same as domain Map" << endl;
3139 }
3140 }
3141 else if (getRangeMap () == getRowMap ()) {
3142 if (myRank == 0) {
3143 out << "Range Map: same as row Map" << endl;
3144 }
3145 }
3146 else {
3147 if (myRank == 0) {
3148 out << "Range Map: " << endl;
3149 }
3150 getRangeMap ()->describe (out, vl);
3151 }
3152 }
3153 }
3154
3155 if (vl >= VERB_EXTREME) {
3156 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3157 const int myRank = comm.getRank ();
3158 const int numProcs = comm.getSize ();
3159
3160 // Print the calling process' data to the given output stream.
3161 RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3163 FancyOStream& os = *osPtr;
3164 os << "Process " << myRank << ":" << endl;
3165 Teuchos::OSTab tab2 (os);
3166
3167 const map_type& meshRowMap = * (graph_.getRowMap ());
3168 const map_type& meshColMap = * (graph_.getColMap ());
3169 for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3170 meshLclRow <= meshRowMap.getMaxLocalIndex ();
3171 ++meshLclRow) {
3172 const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3173 os << "Row " << meshGblRow << ": {";
3174
3175 local_inds_host_view_type lclColInds;
3176 values_host_view_type vals;
3177 LO numInds = 0;
3178 this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
3179
3180 for (LO k = 0; k < numInds; ++k) {
3181 const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3182
3183 os << "Col " << gblCol << ": [";
3184 for (LO i = 0; i < blockSize; ++i) {
3185 for (LO j = 0; j < blockSize; ++j) {
3187 if (j + 1 < blockSize) {
3188 os << ", ";
3189 }
3190 }
3191 if (i + 1 < blockSize) {
3192 os << "; ";
3193 }
3194 }
3195 os << "]";
3196 if (k + 1 < numInds) {
3197 os << ", ";
3198 }
3199 }
3200 os << "}" << endl;
3201 }
3202
3203 // Print data on Process 0. This will automatically respect the
3204 // current indentation level.
3205 if (myRank == 0) {
3206 out << lclOutStrPtr->str ();
3207 lclOutStrPtr = Teuchos::null; // clear it to save space
3208 }
3209
3210 const int sizeTag = 1337;
3211 const int dataTag = 1338;
3212
3213 ArrayRCP<char> recvDataBuf; // only used on Process 0
3214
3215 // Send string sizes and data from each process in turn to
3216 // Process 0, and print on that process.
3217 for (int p = 1; p < numProcs; ++p) {
3218 if (myRank == 0) {
3219 // Receive the incoming string's length.
3221 recvSize[0] = 0;
3224 wait<int> (comm, outArg (recvSizeReq));
3225 const size_t numCharsToRecv = recvSize[0];
3226
3227 // Allocate space for the string to receive. Reuse receive
3228 // buffer space if possible. We can do this because in the
3229 // current implementation, we only have one receive in
3230 // flight at a time. Leave space for the '\0' at the end,
3231 // in case the sender doesn't send it.
3232 if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3233 recvDataBuf.resize (numCharsToRecv + 1);
3234 }
3235 ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3236 // Post the receive of the actual string data.
3239 wait<int> (comm, outArg (recvDataReq));
3240
3241 // Print the received data. This will respect the current
3242 // indentation level. Make sure that the string is
3243 // null-terminated.
3245 out << recvDataBuf.getRawPtr ();
3246 }
3247 else if (myRank == p) { // if I am not Process 0, and my rank is p
3248 // This deep-copies the string at most twice, depending on
3249 // whether std::string reference counts internally (it
3250 // generally does, so this won't deep-copy at all).
3251 const std::string stringToSend = lclOutStrPtr->str ();
3252 lclOutStrPtr = Teuchos::null; // clear original to save space
3253
3254 // Send the string's length to Process 0.
3255 const size_t numCharsToSend = stringToSend.size ();
3260 wait<int> (comm, outArg (sendSizeReq));
3261
3262 // Send the actual string to Process 0. We know that the
3263 // string has length > 0, so it's save to take the address
3264 // of the first entry. Make a nonowning ArrayRCP to hold
3265 // the string. Process 0 will add a null termination
3266 // character at the end of the string, after it receives the
3267 // message.
3270 isend<int, char> (sendData, 0, dataTag, comm);
3271 wait<int> (comm, outArg (sendDataReq));
3272 }
3273 } // for each process rank p other than 0
3274 } // extreme verbosity level (print the whole matrix)
3275 }
3276
3277 template<class Scalar, class LO, class GO, class Node>
3278 Teuchos::RCP<const Teuchos::Comm<int> >
3280 getComm() const
3281 {
3282 return graph_.getComm();
3283 }
3284
3285
3286 template<class Scalar, class LO, class GO, class Node>
3289 getGlobalNumCols() const
3290 {
3291 return graph_.getGlobalNumCols();
3292 }
3293
3294
3295 template<class Scalar, class LO, class GO, class Node>
3296 size_t
3298 getLocalNumCols() const
3299 {
3300 return graph_.getLocalNumCols();
3301 }
3302
3303 template<class Scalar, class LO, class GO, class Node>
3304 GO
3306 getIndexBase() const
3307 {
3308 return graph_.getIndexBase();
3309 }
3310
3311 template<class Scalar, class LO, class GO, class Node>
3315 {
3316 return graph_.getGlobalNumEntries();
3317 }
3318
3319 template<class Scalar, class LO, class GO, class Node>
3320 size_t
3322 getLocalNumEntries() const
3323 {
3324 return graph_.getLocalNumEntries();
3325 }
3326
3327 template<class Scalar, class LO, class GO, class Node>
3328 size_t
3331 {
3332 return graph_.getNumEntriesInGlobalRow(globalRow);
3333 }
3334
3335
3336 template<class Scalar, class LO, class GO, class Node>
3337 size_t
3340 {
3341 return graph_.getGlobalMaxNumRowEntries();
3342 }
3343
3344 template<class Scalar, class LO, class GO, class Node>
3345 bool
3347 hasColMap() const
3348 {
3349 return graph_.hasColMap();
3350 }
3351
3352
3353 template<class Scalar, class LO, class GO, class Node>
3354 bool
3356 isLocallyIndexed() const
3357 {
3358 return graph_.isLocallyIndexed();
3359 }
3360
3361 template<class Scalar, class LO, class GO, class Node>
3362 bool
3364 isGloballyIndexed() const
3365 {
3366 return graph_.isGloballyIndexed();
3367 }
3368
3369 template<class Scalar, class LO, class GO, class Node>
3370 bool
3372 isFillComplete() const
3373 {
3374 return graph_.isFillComplete ();
3375 }
3376
3377 template<class Scalar, class LO, class GO, class Node>
3378 bool
3380 supportsRowViews() const
3381 {
3382 return false;
3383 }
3384
3385 template<class Scalar, class LO, class GO, class Node>
3386 void
3388 getGlobalRowCopy (GO /*GlobalRow*/,
3389 nonconst_global_inds_host_view_type &/*Indices*/,
3390 nonconst_values_host_view_type &/*Values*/,
3391 size_t& /*NumEntries*/) const
3392 {
3394 true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3395 "This class doesn't support global matrix indexing.");
3396
3397 }
3398
3399 template<class Scalar, class LO, class GO, class Node>
3400 void
3402 getGlobalRowView (GO /* GlobalRow */,
3403 global_inds_host_view_type &/* indices */,
3404 values_host_view_type &/* values */) const
3405 {
3407 true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
3408 "This class doesn't support global matrix indexing.");
3409
3410 }
3411
3412 template<class Scalar, class LO, class GO, class Node>
3413 void
3416 local_inds_host_view_type &colInds,
3417 values_host_view_type &vals) const
3418 {
3419 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3420 colInds = local_inds_host_view_type();
3421 vals = values_host_view_type();
3422 }
3423 else {
3424 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3425 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3426 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3427
3428 vals = getValuesHost (localRowInd);
3429 }
3430 }
3431
3432 template<class Scalar, class LO, class GO, class Node>
3433 void
3436 local_inds_host_view_type &colInds,
3437 nonconst_values_host_view_type &vals) const
3438 {
3439 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3440 colInds = nonconst_local_inds_host_view_type();
3441 vals = nonconst_values_host_view_type();
3442 }
3443 else {
3444 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3445 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3446 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3447
3449 vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
3450 }
3451 }
3452
3453 template<class Scalar, class LO, class GO, class Node>
3454 void
3457 {
3458 const size_t lclNumMeshRows = graph_.getLocalNumRows ();
3459
3460 Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3461 graph_.getLocalDiagOffsets (diagOffsets);
3462
3463 // The code below works on host, so use a host View.
3464 auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3465 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
3466 Kokkos::deep_copy (execution_space(), diagOffsetsHost, diagOffsets);
3467
3468 auto vals_host_out = val_.getHostView(Access::ReadOnly);
3470
3471 // TODO amk: This is a temporary measure to make the code run with Ifpack2
3472 size_t rowOffset = 0;
3473 size_t offset = 0;
3474 LO bs = getBlockSize();
3475 for(size_t r=0; r<getLocalNumRows(); r++)
3476 {
3477 // move pointer to start of diagonal block
3479 for(int b=0; b<bs; b++)
3480 {
3481 diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3482 }
3483 // move pointer to start of next block row
3484 rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3485 }
3486
3487 //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3488 }
3489
3490 template<class Scalar, class LO, class GO, class Node>
3491 void
3493 leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3494 {
3496 true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3497 "not implemented.");
3498
3499 }
3500
3501 template<class Scalar, class LO, class GO, class Node>
3502 void
3504 rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3505 {
3507 true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3508 "not implemented.");
3509
3510 }
3511
3512 template<class Scalar, class LO, class GO, class Node>
3513 Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3515 getGraph() const
3516 {
3517 return graphRCP_;
3518 }
3519
3520 template<class Scalar, class LO, class GO, class Node>
3521 typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3523 getFrobeniusNorm () const
3524 {
3526 true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3527 "not implemented.");
3528 }
3529
3530} // namespace Tpetra
3531
3532//
3533// Explicit instantiation macro
3534//
3535// Must be expanded from within the Tpetra namespace!
3536//
3537#define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3538 template class BlockCrsMatrix< S, LO, GO, NODE >;
3539
3540#endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
Linear algebra kernels for small dense matrices and vectors.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix's communicator.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
LO local_ordinal_type
The type of local indices.
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row's entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
size_t getLocalNumRows() const override
get the local number of block rows
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix's values (val_).
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
local_matrix_device_type getLocalMatrixDevice() const
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
std::string description() const override
One-line description of this object.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Kokkos::View< const impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
char packet_type
Implementation detail; tells.
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
global_size_t getGlobalNumRows() const override
get the global number of block rows
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
MultiVector for multiple degrees of freedom per mesh point.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Struct that holds views of the contents of a CrsMatrix.
One or more distributed dense vectors.
int local_ordinal_type
Default value of Scalar template parameter.
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.