Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41#ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42#define TPETRA_MATRIXMATRIX_DEF_HPP
43#include "KokkosSparse_Utils.hpp"
44#include "Tpetra_ConfigDefs.hpp"
46#include "Teuchos_VerboseObject.hpp"
47#include "Teuchos_Array.hpp"
48#include "Tpetra_Util.hpp"
49#include "Tpetra_CrsMatrix.hpp"
50#include "Tpetra_BlockCrsMatrix.hpp"
52#include "Tpetra_RowMatrixTransposer.hpp"
55#include "Tpetra_Details_makeColMap.hpp"
56#include "Tpetra_ConfigDefs.hpp"
57#include "Tpetra_Map.hpp"
58#include "Tpetra_Export.hpp"
61#include <algorithm>
62#include <type_traits>
63#include "Teuchos_FancyOStream.hpp"
64
65#include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
67
68#include "KokkosSparse_spgemm.hpp"
69#include "KokkosSparse_spadd.hpp"
70#include "Kokkos_Bitset.hpp"
71
73
81/*********************************************************************************************************/
82// Include the architecture-specific kernel partial specializations here
83// NOTE: This needs to be outside all namespaces
84#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
85#include "TpetraExt_MatrixMatrix_Cuda.hpp"
86#include "TpetraExt_MatrixMatrix_HIP.hpp"
87#include "TpetraExt_MatrixMatrix_SYCL.hpp"
88
89namespace Tpetra {
90
91namespace MatrixMatrix{
92
93//
94// This method forms the matrix-matrix product C = op(A) * op(B), where
95// op(A) == A if transposeA is false,
96// op(A) == A^T if transposeA is true,
97// and similarly for op(B).
98//
99template <class Scalar,
100 class LocalOrdinal,
101 class GlobalOrdinal,
102 class Node>
105 bool transposeA,
107 bool transposeB,
110 const std::string& label,
111 const Teuchos::RCP<Teuchos::ParameterList>& params)
112{
113 using Teuchos::null;
114 using Teuchos::RCP;
115 using Teuchos::rcp;
116 typedef Scalar SC;
117 typedef LocalOrdinal LO;
118 typedef GlobalOrdinal GO;
119 typedef Node NO;
120 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
121 typedef Import<LO,GO,NO> import_type;
123 typedef Map<LO,GO,NO> map_type;
125
126#ifdef HAVE_TPETRA_MMM_TIMINGS
127 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
128 using Teuchos::TimeMonitor;
129 //MM is used to time setup, and then multiply.
130
131 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
132#endif
133
134 const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
135
136 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
137
138 // The input matrices A and B must both be fillComplete.
139 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
140 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
141
142 // If transposeA is true, then Aprime will be the transpose of A
143 // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
144 // will just be a pointer to A.
146 // If transposeB is true, then Bprime will be the transpose of B
147 // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
148 // will just be a pointer to B.
150
151 // Is this a "clean" matrix?
152 //
153 // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
154 // locally nor globally indexed, then it was empty. I don't like
155 // this, because the most straightforward implementation presumes
156 // lazy allocation of indices. However, historical precedent
157 // demands that we keep around this predicate as a way to test
158 // whether the matrix is empty.
159 const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
160
161 bool use_optimized_ATB = false;
163 use_optimized_ATB = true;
164
165#ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
166 use_optimized_ATB = false;
167#endif
168
169 using Teuchos::ParameterList;
171 transposeParams->set ("sort", false);
172
175 Aprime = transposer.createTranspose (transposeParams);
176 }
177 else {
179 }
180
181 if (transposeB) {
183 Bprime = transposer.createTranspose (transposeParams);
184 }
185 else {
187 }
188
189 // Check size compatibility
190 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
191 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
192 global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
193 global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
194 global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
195 global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
196 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
197 prefix << "ERROR, inner dimensions of op(A) and op(B) "
198 "must match for matrix-matrix product. op(A) is "
199 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
200
201 // The result matrix C must at least have a row-map that reflects the correct
202 // row-size. Don't check the number of columns because rectangular matrices
203 // which were constructed with only one map can still end up having the
204 // correct capacity and dimensions when filled.
205 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
206 prefix << "ERROR, dimensions of result C must "
207 "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
208 << " rows, should have at least " << Aouter << std::endl);
209
210 // It doesn't matter whether C is already Filled or not. If it is already
211 // Filled, it must have space allocated for the positions that will be
212 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
213 // we'll error out later when trying to store result values.
214
215 // CGB: However, matrix must be in active-fill
216 if (!C.isFillActive()) C.resumeFill();
217
218 // We're going to need to import remotely-owned sections of A and/or B if
219 // more than one processor is performing this run, depending on the scenario.
220 int numProcs = A.getComm()->getSize();
221
222 // Declare a couple of structs that will be used to hold views of the data
223 // of A and B, to be used for fast access during the matrix-multiplication.
226
229
230#ifdef HAVE_TPETRA_MMM_TIMINGS
231 {
232 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
233#endif
234
235 // Now import any needed remote rows and populate the Aview struct
236 // NOTE: We assert that an import isn't needed --- since we do the transpose
237 // above to handle that.
238 if (!use_optimized_ATB) {
240 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
241 }
242
243 // We will also need local access to all rows of B that correspond to the
244 // column-map of op(A).
245 if (numProcs > 1)
246 targetMap_B = Aprime->getColMap();
247
248 // Import any needed remote rows and populate the Bview struct.
250 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
251
252#ifdef HAVE_TPETRA_MMM_TIMINGS
253 } //stop MM_importExtract here
254 //stop the setup timer, and start the multiply timer
255 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
256#endif
257
258 // Call the appropriate method to perform the actual multiplication.
259 if (use_optimized_ATB) {
260 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
261
262 } else if (call_FillComplete_on_result && newFlag) {
263 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
264
265 } else if (call_FillComplete_on_result) {
266 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
267
268 } else {
269 // mfh 27 Sep 2016: Is this the "slow" case? This
270 // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
271 // thread-parallel inserts, but that may take some effort.
273
274 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
275 }
276
277#ifdef HAVE_TPETRA_MMM_TIMINGS
278 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
279#endif
280 if (call_FillComplete_on_result && !C.isFillComplete()) {
281 // We'll call FillComplete on the C matrix before we exit, and give it a
282 // domain-map and a range-map.
283 // The domain-map will be the domain-map of B, unless
284 // op(B)==transpose(B), in which case the range-map of B will be used.
285 // The range-map will be the range-map of A, unless op(A)==transpose(A),
286 // in which case the domain-map of A will be used.
287 C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
288 }
289}
290
291//
292// This method forms the matrix-matrix product C = op(A) * op(B), where
293// op(A) == A and similarly for op(B). op(A) = A^T is not yet implemented.
294//
295template <class Scalar,
296 class LocalOrdinal,
297 class GlobalOrdinal,
298 class Node>
301 bool transposeA,
303 bool transposeB,
305 const std::string& label)
306{
307 using Teuchos::null;
308 using Teuchos::RCP;
309 using Teuchos::rcp;
310 typedef Scalar SC;
311 typedef LocalOrdinal LO;
312 typedef GlobalOrdinal GO;
313 typedef Node NO;
315 typedef Map<LO,GO,NO> map_type;
316 typedef Import<LO,GO,NO> import_type;
317
318 std::string prefix = std::string("TpetraExt ") + label + std::string(": ");
319
320 TEUCHOS_TEST_FOR_EXCEPTION(transposeA==true, std::runtime_error, prefix << "Matrix A cannot be transposed.");
321 TEUCHOS_TEST_FOR_EXCEPTION(transposeB==true, std::runtime_error, prefix << "Matrix B cannot be transposed.");
322
323 // Check size compatibility
324 global_size_t numACols = A->getGlobalNumCols();
325 global_size_t numBCols = B->getGlobalNumCols();
326 global_size_t numARows = A->getGlobalNumRows();
327 global_size_t numBRows = B->getGlobalNumRows();
328
333 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
334 prefix << "ERROR, inner dimensions of op(A) and op(B) "
335 "must match for matrix-matrix product. op(A) is "
336 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
337
338 // We're going to need to import remotely-owned sections of A and/or B if
339 // more than one processor is performing this run, depending on the scenario.
340 int numProcs = A->getComm()->getSize();
341
342 const LO blocksize = A->getBlockSize();
343 TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
344 prefix << "ERROR, Blocksizes do not match. A.blocksize = " <<
345 blocksize << ", B.blocksize = " << B->getBlockSize() );
346
347 // Declare a couple of structs that will be used to hold views of the data
348 // of A and B, to be used for fast access during the matrix-multiplication.
351
352 RCP<const map_type> targetMap_A = A->getRowMap();
353 RCP<const map_type> targetMap_B = B->getRowMap();
354
355 // Populate the Aview struct. No remotes are needed.
357 MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter, true);
358
359 // We will also need local access to all rows of B that correspond to the
360 // column-map of op(A).
361 if (numProcs > 1)
362 targetMap_B = A->getColMap();
363
364 // Import any needed remote rows and populate the Bview struct.
365 MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
366 A->getGraph()->getImporter().is_null());
367
368 // Call the appropriate method to perform the actual multiplication.
369 MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
370}
371
372template <class Scalar,
373 class LocalOrdinal,
374 class GlobalOrdinal,
375 class Node>
382 const std::string& label,
383 const Teuchos::RCP<Teuchos::ParameterList>& params)
384{
385 using Teuchos::RCP;
386 typedef Scalar SC;
387 typedef LocalOrdinal LO;
388 typedef GlobalOrdinal GO;
389 typedef Node NO;
390 typedef Import<LO,GO,NO> import_type;
392 typedef Map<LO,GO,NO> map_type;
393 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
394
395#ifdef HAVE_TPETRA_MMM_TIMINGS
396 std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
397 using Teuchos::TimeMonitor;
398 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
399#endif
400
401 const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
402
403 // A and B should already be Filled.
404 // Should we go ahead and call FillComplete() on them if necessary or error
405 // out? For now, we choose to error out.
406 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
407 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
408
411
412 // Now check size compatibility
413 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
414 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
415 global_size_t Aouter = A.getGlobalNumRows();
418 global_size_t Binner = B.getGlobalNumRows();
419 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
420 prefix << "ERROR, inner dimensions of op(A) and op(B) "
421 "must match for matrix-matrix product. op(A) is "
422 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
423
424 // The result matrix C must at least have a row-map that reflects the correct
425 // row-size. Don't check the number of columns because rectangular matrices
426 // which were constructed with only one map can still end up having the
427 // correct capacity and dimensions when filled.
428 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
429 prefix << "ERROR, dimensions of result C must "
430 "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
431 << " rows, should have at least "<< Aouter << std::endl);
432
433 // It doesn't matter whether C is already Filled or not. If it is already
434 // Filled, it must have space allocated for the positions that will be
435 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
436 // we'll error out later when trying to store result values.
437
438 // CGB: However, matrix must be in active-fill
439 TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
440
441 // We're going to need to import remotely-owned sections of A and/or B if
442 // more than one processor is performing this run, depending on the scenario.
443 int numProcs = A.getComm()->getSize();
444
445 // Declare a couple of structs that will be used to hold views of the data of
446 // A and B, to be used for fast access during the matrix-multiplication.
449
452
453#ifdef HAVE_TPETRA_MMM_TIMINGS
454 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
455 {
456#endif
457
458 // Enable globalConstants by default
459 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
460 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
461 if(!params.is_null()) {
462 importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
464 auto slist = params->sublist("matrixmatrix: kernel params",false);
466 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
468 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
469 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
470 auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
471 ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
472 ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
473 ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
474 }
475
476 //Now import any needed remote rows and populate the Aview struct.
478 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
479
480 // We will also need local access to all rows of B that correspond to the
481 // column-map of op(A).
482 if (numProcs > 1)
483 targetMap_B = Aprime->getColMap();
484
485 // Now import any needed remote rows and populate the Bview struct.
486 // Enable globalConstants by default
487 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
488 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
489 if(!params.is_null()) {
490 importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
491
492 auto slist = params->sublist("matrixmatrix: kernel params",false);
494 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
495 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
496 auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
497 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
499 ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
500 ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
501 ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
502 }
503
504 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
505
506#ifdef HAVE_TPETRA_MMM_TIMINGS
507 }
508 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
509#endif
510
511 // Now call the appropriate method to perform the actual multiplication.
513
514 // Is this a "clean" matrix
515 bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
516
518 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
519
520 } else if (call_FillComplete_on_result) {
521 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
522
523 } else {
524 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
525 }
526
527 if(!params.is_null()) {
528 bool removeZeroEntries = params->get("remove zeros", false);
529 if (removeZeroEntries) {
530 typedef Teuchos::ScalarTraits<Scalar> STS;
531 typename STS::magnitudeType threshold = params->get("remove zeros threshold", STS::magnitude(STS::zero()));
533 }
534 }
535}
536
537
538template <class Scalar,
539 class LocalOrdinal,
540 class GlobalOrdinal,
541 class Node>
542void Add(
544 bool transposeA,
548{
549 using Teuchos::Array;
550 using Teuchos::RCP;
551 using Teuchos::null;
552 typedef Scalar SC;
553 typedef LocalOrdinal LO;
554 typedef GlobalOrdinal GO;
555 typedef Node NO;
556 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
558
559 const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
560
561 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
562 prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
563 "(Result matrix B is not required to be isFillComplete()).");
564 TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
565 prefix << "ERROR, input matrix B must not be fill complete!");
566 TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
567 prefix << "ERROR, input matrix B must not have static graph!");
568 TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
569 prefix << "ERROR, input matrix B must not be locally indexed!");
570
571 using Teuchos::ParameterList;
573 transposeParams->set ("sort", false);
574
576 if (transposeA) {
578 Aprime = transposer.createTranspose (transposeParams);
579 }
580 else {
582 }
583
584 size_t a_numEntries;
585 typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getLocalMaxNumRowEntries());
586 typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getLocalMaxNumRowEntries());
587 GO row;
588
589 if (scalarB != Teuchos::ScalarTraits<SC>::one())
590 B.scale(scalarB);
591
592 bool bFilled = B.isFillComplete();
593 size_t numMyRows = B.getLocalNumRows();
594 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
595 for (LO i = 0; (size_t)i < numMyRows; ++i) {
596 row = B.getRowMap()->getGlobalElement(i);
597 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
598
599 if (scalarA != Teuchos::ScalarTraits<SC>::one())
600 for (size_t j = 0; j < a_numEntries; ++j)
601 a_vals[j] *= scalarA;
602
603 if (bFilled)
604 B.sumIntoGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
605 else
606 B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
607 }
608 }
609}
610
611template <class Scalar,
612 class LocalOrdinal,
613 class GlobalOrdinal,
614 class Node>
615Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
617 const bool transposeA,
619 const Scalar& beta,
620 const bool transposeB,
622 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
623 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
624 const Teuchos::RCP<Teuchos::ParameterList>& params)
625{
626 using Teuchos::RCP;
627 using Teuchos::rcpFromRef;
628 using Teuchos::rcp;
629 using Teuchos::ParameterList;
631 if(!params.is_null())
632 {
634 params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
635 std::invalid_argument,
636 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
637 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
638 params->set("Call fillComplete", true);
639 }
640 //If transposeB, must compute B's explicit transpose to
641 //get the correct row map for C.
643 if(transposeB)
644 {
646 Brcp = transposer.createTranspose();
647 }
648 //Check that A,B are fillComplete before getting B's column map
650 (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
651 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
652 RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
653 //this version of add() always fill completes the result, no matter what is in params on input
654 add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
655 return C;
656}
657
658//This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
659//but since the spadd() output is always packed there is no need for a separate
660//numRowEntries here.
661//
662template<class LO, class GO, class LOView, class GOView, class LocalMap>
663struct ConvertGlobalToLocalFunctor
664{
665 ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
666 : lids(lids_), gids(gids_), localColMap(localColMap_)
667 {}
668
669 KOKKOS_FUNCTION void operator() (const GO i) const
670 {
671 lids(i) = localColMap.getLocalElement(gids(i));
672 }
673
674 LOView lids;
675 const GOView gids;
676 const LocalMap localColMap;
677};
678
679template <class Scalar,
680 class LocalOrdinal,
681 class GlobalOrdinal,
682 class Node>
683void
685 const bool transposeA,
687 const Scalar& beta,
688 const bool transposeB,
691 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
692 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
693 const Teuchos::RCP<Teuchos::ParameterList>& params)
694{
695 using Teuchos::RCP;
696 using Teuchos::rcp;
697 using Teuchos::rcpFromRef;
698 using Teuchos::rcp_implicit_cast;
699 using Teuchos::rcp_dynamic_cast;
700 using Teuchos::TimeMonitor;
701 using SC = Scalar;
702 using LO = LocalOrdinal;
703 using GO = GlobalOrdinal;
704 using NO = Node;
705 using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
706 using crs_graph_type = CrsGraph<LO,GO,NO>;
707 using map_type = Map<LO,GO,NO>;
709 using import_type = Import<LO,GO,NO>;
710 using export_type = Export<LO,GO,NO>;
711 using exec_space = typename crs_graph_type::execution_space;
712 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
713 const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
714 constexpr bool debug = false;
715
716#ifdef HAVE_TPETRA_MMM_TIMINGS
717 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
718#endif
719
720 if (debug) {
721 std::ostringstream os;
722 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
723 << "TpetraExt::MatrixMatrix::add" << std::endl;
724 std::cerr << os.str ();
725 }
726
728 (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
729 prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
731 (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
732 prefix_mmm << "A and B must both be fill complete.");
733#ifdef HAVE_TPETRA_DEBUG
734 // The matrices don't have domain or range Maps unless they are fill complete.
735 if (A.isFillComplete () && B.isFillComplete ()) {
736 const bool domainMapsSame =
737 (! transposeA && ! transposeB &&
738 ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
739 (! transposeA && transposeB &&
740 ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
741 ( transposeA && ! transposeB &&
742 ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
743 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
744 prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
745
746 const bool rangeMapsSame =
747 (! transposeA && ! transposeB &&
748 ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
749 (! transposeA && transposeB &&
750 ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
751 ( transposeA && ! transposeB &&
752 ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
753 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
754 prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
755 }
756#endif // HAVE_TPETRA_DEBUG
757
758 using Teuchos::ParameterList;
759 // Form the explicit transpose of A if necessary.
761 if (transposeA) {
763 Aprime = transposer.createTranspose ();
764 }
765
766#ifdef HAVE_TPETRA_DEBUG
768 (Aprime.is_null (), std::logic_error,
769 prefix_mmm << "Failed to compute Op(A). "
770 "Please report this bug to the Tpetra developers.");
771#endif // HAVE_TPETRA_DEBUG
772
773 // Form the explicit transpose of B if necessary.
775 if (transposeB) {
776 if (debug) {
777 std::ostringstream os;
778 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
779 << "Form explicit xpose of B" << std::endl;
780 std::cerr << os.str ();
781 }
783 Bprime = transposer.createTranspose ();
784 }
785#ifdef HAVE_TPETRA_DEBUG
786 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
787 prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
789 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
790 prefix_mmm << "Aprime and Bprime must both be fill complete. "
791 "Please report this bug to the Tpetra developers.");
792#endif // HAVE_TPETRA_DEBUG
795 if(CDomainMap.is_null())
796 {
797 CDomainMap = Bprime->getDomainMap();
798 }
799 if(CRangeMap.is_null())
800 {
801 CRangeMap = Bprime->getRangeMap();
802 }
803 assert(!(CDomainMap.is_null()));
804 assert(!(CRangeMap.is_null()));
805 typedef typename AddKern::values_array values_array;
806 typedef typename AddKern::row_ptrs_array row_ptrs_array;
807 typedef typename AddKern::col_inds_array col_inds_array;
808 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
809 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
810 values_array vals;
811 row_ptrs_array rowptrs;
812 col_inds_array colinds;
813#ifdef HAVE_TPETRA_MMM_TIMINGS
814 MM = Teuchos::null;
815 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
816#endif
817 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
818 {
819 //import Aprime into Bprime's row map so the local matrices have same # of rows
820 auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
821 // cbl do not set
822 // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
823 // this import _may_ take the form of a transfer. In practice it would be unlikely,
824 // but the general case is not so forgiving.
825 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
826 }
827 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
829 RCP<const import_type> Cimport = Teuchos::null;
830 RCP<export_type> Cexport = Teuchos::null;
831 bool doFillComplete = true;
832 if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
833 {
834 doFillComplete = params->get<bool>("Call fillComplete");
835 }
836 auto Alocal = Aprime->getLocalMatrixDevice();
837 auto Blocal = Bprime->getLocalMatrixDevice();
838 LO numLocalRows = Alocal.numRows();
839 if(numLocalRows == 0)
840 {
841 //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
842 //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
843 //Handle this case now
844 //(without interfering with collective operations, since it's possible for
845 //some ranks to have 0 local rows and others not).
846 rowptrs = row_ptrs_array("C rowptrs", 0);
847 }
848 auto Acolmap = Aprime->getColMap();
849 auto Bcolmap = Bprime->getColMap();
850 if(!matchingColMaps)
851 {
852 using global_col_inds_array = typename AddKern::global_col_inds_array;
853#ifdef HAVE_TPETRA_MMM_TIMINGS
854 MM = Teuchos::null;
855 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
856#endif
857 //use kernel that converts col indices in both A and B to common domain map before adding
858 auto AlocalColmap = Acolmap->getLocalMap();
859 auto BlocalColmap = Bcolmap->getLocalMap();
860 global_col_inds_array globalColinds;
861 if (debug) {
862 std::ostringstream os;
863 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
864 << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
865 std::cerr << os.str ();
866 }
867 AddKern::convertToGlobalAndAdd(
870 if (debug) {
871 std::ostringstream os;
872 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
873 << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
874 std::cerr << os.str ();
875 }
876#ifdef HAVE_TPETRA_MMM_TIMINGS
877 MM = Teuchos::null;
878 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
879#endif
881 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
883 C.replaceColMap(CcolMap);
884 col_inds_array localColinds("C colinds", globalColinds.extent(0));
885 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
886 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
887 col_inds_array, global_col_inds_array,
888 typename map_type::local_map_type>
889 (localColinds, globalColinds, CcolMap->getLocalMap()));
890 KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
891 C.setAllValues(rowptrs, localColinds, vals);
892 C.fillComplete(CDomainMap, CRangeMap, params);
893 if(!doFillComplete)
894 C.resumeFill();
895 }
896 else
897 {
898 //Aprime, Bprime and C all have the same column maps
899 auto Avals = Alocal.values;
900 auto Bvals = Blocal.values;
901 auto Arowptrs = Alocal.graph.row_map;
902 auto Browptrs = Blocal.graph.row_map;
903 auto Acolinds = Alocal.graph.entries;
904 auto Bcolinds = Blocal.graph.entries;
905 if(sorted)
906 {
907 //use sorted kernel
908#ifdef HAVE_TPETRA_MMM_TIMINGS
909 MM = Teuchos::null;
910 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
911#endif
912 if (debug) {
913 std::ostringstream os;
914 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
915 << "Call AddKern::addSorted(...)" << std::endl;
916 std::cerr << os.str ();
917 }
918 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
919 }
920 else
921 {
922 //use unsorted kernel
923#ifdef HAVE_TPETRA_MMM_TIMINGS
924 MM = Teuchos::null;
925 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
926#endif
927 if (debug) {
928 std::ostringstream os;
929 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
930 << "Call AddKern::addUnsorted(...)" << std::endl;
931 std::cerr << os.str ();
932 }
933 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
934 }
935 //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
937 C.replaceColMap(Ccolmap);
938 C.setAllValues(rowptrs, colinds, vals);
940 {
941 #ifdef HAVE_TPETRA_MMM_TIMINGS
942 MM = Teuchos::null;
943 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
944 #endif
945 if(!CDomainMap->isSameAs(*Ccolmap))
946 {
947 if (debug) {
948 std::ostringstream os;
949 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
950 << "Create Cimport" << std::endl;
951 std::cerr << os.str ();
952 }
953 Cimport = rcp(new import_type(CDomainMap, Ccolmap));
954 }
955 if(!C.getRowMap()->isSameAs(*CRangeMap))
956 {
957 if (debug) {
958 std::ostringstream os;
959 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
960 << "Create Cexport" << std::endl;
961 std::cerr << os.str ();
962 }
963 Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
964 }
965
966 if (debug) {
967 std::ostringstream os;
968 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
969 << "Call C->expertStaticFillComplete(...)" << std::endl;
970 std::cerr << os.str ();
971 }
972 C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
973 }
974 }
975}
976
977template <class Scalar,
978 class LocalOrdinal,
979 class GlobalOrdinal,
980 class Node>
981void Add(
983 bool transposeA,
986 bool transposeB,
989{
990 using Teuchos::Array;
991 using Teuchos::ArrayRCP;
992 using Teuchos::ArrayView;
993 using Teuchos::RCP;
994 using Teuchos::rcp;
995 using Teuchos::rcp_dynamic_cast;
996 using Teuchos::rcpFromRef;
997 using Teuchos::tuple;
998 using std::endl;
999 // typedef typename ArrayView<const Scalar>::size_type size_type;
1000 typedef Teuchos::ScalarTraits<Scalar> STS;
1002 // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
1003 // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
1004 // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
1007
1008 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1009
1010 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011 prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
1012
1014 ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
1015 prefix << "Both input matrices must be fill complete before calling this function.");
1016
1017#ifdef HAVE_TPETRA_DEBUG
1018 {
1019 const bool domainMapsSame =
1020 (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
1021 (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
1022 (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
1023 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
1024 prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
1025
1026 const bool rangeMapsSame =
1027 (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
1028 (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
1029 (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
1030 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
1031 prefix << "The range Maps of Op(A) and Op(B) are not the same.");
1032 }
1033#endif // HAVE_TPETRA_DEBUG
1034
1035 using Teuchos::ParameterList;
1037 transposeParams->set ("sort", false);
1038
1039 // Form the explicit transpose of A if necessary.
1041 if (transposeA) {
1043 Aprime = theTransposer.createTranspose (transposeParams);
1044 }
1045 else {
1046 Aprime = rcpFromRef (A);
1047 }
1048
1049#ifdef HAVE_TPETRA_DEBUG
1050 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1051 prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1052#endif // HAVE_TPETRA_DEBUG
1053
1054 // Form the explicit transpose of B if necessary.
1056 if (transposeB) {
1058 Bprime = theTransposer.createTranspose (transposeParams);
1059 }
1060 else {
1061 Bprime = rcpFromRef (B);
1062 }
1063
1064#ifdef HAVE_TPETRA_DEBUG
1065 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1066 prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1067#endif // HAVE_TPETRA_DEBUG
1068
1069 // Allocate or zero the entries of the result matrix.
1070 if (! C.is_null ()) {
1071 C->setAllToScalar (STS::zero ());
1072 } else {
1073 // FIXME (mfh 08 May 2013) When I first looked at this method, I
1074 // noticed that C was being given the row Map of Aprime (the
1075 // possibly transposed version of A). Is this what we want?
1076 C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1077 }
1078
1079#ifdef HAVE_TPETRA_DEBUG
1080 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1081 prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1082 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1083 prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1084 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1085 prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1086#endif // HAVE_TPETRA_DEBUG
1087
1091
1092 // do a loop over each matrix to add: A reordering might be more efficient
1093 for (int k = 0; k < 2; ++k) {
1094 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1095 typename crs_matrix_type::nonconst_values_host_view_type Values;
1096
1097 // Loop over each locally owned row of the current matrix (either
1098 // Aprime or Bprime), and sum its entries into the corresponding
1099 // row of C. This works regardless of whether Aprime or Bprime
1100 // has the same row Map as C, because both sumIntoGlobalValues and
1101 // insertGlobalValues allow summing resp. inserting into nonowned
1102 // rows of C.
1103#ifdef HAVE_TPETRA_DEBUG
1104 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1105 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1106#endif // HAVE_TPETRA_DEBUG
1107 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1108#ifdef HAVE_TPETRA_DEBUG
1109 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1110 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1111#endif // HAVE_TPETRA_DEBUG
1112
1113 const size_t localNumRows = Mat[k]->getLocalNumRows ();
1114 for (size_t i = 0; i < localNumRows; ++i) {
1115 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1116 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1117 if (numEntries > 0) {
1118 Kokkos::resize(Indices,numEntries);
1119 Kokkos::resize(Values,numEntries);
1120 Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1121
1122 if (scalar[k] != STS::one ()) {
1123 for (size_t j = 0; j < numEntries; ++j) {
1124 Values[j] *= scalar[k];
1125 }
1126 }
1127
1128 if (C->isFillComplete ()) {
1129 C->sumIntoGlobalValues (globalRow, numEntries,
1130 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1131 } else {
1132 C->insertGlobalValues (globalRow, numEntries,
1133 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1134 }
1135 }
1136 }
1137 }
1138}
1139
1140} //End namespace MatrixMatrix
1141
1142namespace MMdetails{
1143
1144/*********************************************************************************************************/
1146//template <class TransferType>
1147//void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1148// if (Transfer.is_null())
1149// return;
1150//
1151// const Distributor & Distor = Transfer->getDistributor();
1152// Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1153//
1154// size_t rows_send = Transfer->getNumExportIDs();
1155// size_t rows_recv = Transfer->getNumRemoteIDs();
1156//
1157// size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1158// size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1159// size_t num_send_neighbors = Distor.getNumSends();
1160// size_t num_recv_neighbors = Distor.getNumReceives();
1161// size_t round2_send, round2_recv;
1162// Distor.getLastDoStatistics(round2_send,round2_recv);
1163//
1164// int myPID = Comm->getRank();
1165// int NumProcs = Comm->getSize();
1166//
1167// // Processor by processor statistics
1168// // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1169// // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1170//
1171// // Global statistics
1172// size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1173// size_t gstats_min[8], gstats_max[8];
1174//
1175// double lstats_avg[8], gstats_avg[8];
1176// for(int i=0; i<8; i++)
1177// lstats_avg[i] = ((double)lstats[i])/NumProcs;
1178//
1179// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1180// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1181// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1182//
1183// if(!myPID) {
1184// printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1185// (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1186// (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1187// printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1188// (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1189// (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1190// }
1191//}
1192
1193// Kernel method for computing the local portion of C = A*B
1194template<class Scalar,
1195 class LocalOrdinal,
1196 class GlobalOrdinal,
1197 class Node>
1198void mult_AT_B_newmatrix(
1199 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1200 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1201 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1202 const std::string & label,
1203 const Teuchos::RCP<Teuchos::ParameterList>& params)
1204{
1205 using Teuchos::RCP;
1206 using Teuchos::rcp;
1207 typedef Scalar SC;
1208 typedef LocalOrdinal LO;
1209 typedef GlobalOrdinal GO;
1210 typedef Node NO;
1211 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1212 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1213
1214#ifdef HAVE_TPETRA_MMM_TIMINGS
1215 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1216 using Teuchos::TimeMonitor;
1217 RCP<TimeMonitor> MM = rcp (new TimeMonitor
1218 (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1219#endif
1220
1221 /*************************************************************/
1222 /* 1) Local Transpose of A */
1223 /*************************************************************/
1224 transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1225
1226 using Teuchos::ParameterList;
1227 RCP<ParameterList> transposeParams (new ParameterList);
1228 transposeParams->set ("sort", false);
1229 if(! params.is_null ()) {
1230 transposeParams->set ("compute global constants",
1231 params->get ("compute global constants: temporaries",
1232 false));
1233 }
1234 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1235 transposer.createTransposeLocal (transposeParams);
1236
1237 /*************************************************************/
1238 /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1239 /*************************************************************/
1240#ifdef HAVE_TPETRA_MMM_TIMINGS
1241 MM = Teuchos::null;
1242 MM = rcp (new TimeMonitor
1243 (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1244#endif
1245
1246 // Get views, asserting that no import is required to speed up computation
1247 crs_matrix_struct_type Aview;
1248 crs_matrix_struct_type Bview;
1249 RCP<const Import<LO, GO, NO> > dummyImporter;
1250
1251 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1252 RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1253 if (! params.is_null ()) {
1254 importParams1->set ("compute global constants",
1255 params->get ("compute global constants: temporaries",
1256 false));
1257 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1258 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1259 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1260 int mm_optimization_core_count =
1262 mm_optimization_core_count =
1263 slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1264 int mm_optimization_core_count2 =
1265 params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1266 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1267 mm_optimization_core_count = mm_optimization_core_count2;
1268 }
1269 auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1270 sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1271 sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1272 sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1273 }
1274
1275 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1276 Aview, dummyImporter, true,
1277 label, importParams1);
1278
1279 RCP<ParameterList> importParams2 (new ParameterList);
1280 if (! params.is_null ()) {
1281 importParams2->set ("compute global constants",
1282 params->get ("compute global constants: temporaries",
1283 false));
1284 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1285 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1286 bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1287 int mm_optimization_core_count =
1289 mm_optimization_core_count =
1290 slist.get ("MM_TAFC_OptimizationCoreCount",
1291 mm_optimization_core_count);
1292 int mm_optimization_core_count2 =
1293 params->get ("MM_TAFC_OptimizationCoreCount",
1294 mm_optimization_core_count);
1295 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1296 mm_optimization_core_count = mm_optimization_core_count2;
1297 }
1298 auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1299 sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1300 sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1301 sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1302 }
1303
1304 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1305 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1306 }
1307 else {
1308 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1309 }
1310
1311#ifdef HAVE_TPETRA_MMM_TIMINGS
1312 MM = Teuchos::null;
1313 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1314#endif
1315
1316 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1317
1318 // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1319 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1320 if (needs_final_export) {
1321 Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1322 }
1323 else {
1324 Ctemp = rcp (&C, false);
1325 }
1326
1327 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1328
1329 /*************************************************************/
1330 /* 4) exportAndFillComplete matrix */
1331 /*************************************************************/
1332#ifdef HAVE_TPETRA_MMM_TIMINGS
1333 MM = Teuchos::null;
1334 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1335#endif
1336
1337 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1338
1339 if (needs_final_export) {
1340 ParameterList labelList;
1341 labelList.set("Timer Label", label);
1342 if(!params.is_null()) {
1343 ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1344 ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1345 int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1346 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1347 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1348 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1349 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1350 bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1351 bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1352
1353 labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1354 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1355 labelList.set("compute global constants",params->get("compute global constants",true));
1356 labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1357 }
1358 Ctemp->exportAndFillComplete (Crcp,
1359 *Ctemp->getGraph ()->getExporter (),
1360 B.getDomainMap (),
1361 A.getDomainMap (),
1362 rcp (&labelList, false));
1363 }
1364#ifdef HAVE_TPETRA_MMM_STATISTICS
1365 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1366#endif
1367}
1368
1369/*********************************************************************************************************/
1370// Kernel method for computing the local portion of C = A*B
1371template<class Scalar,
1372 class LocalOrdinal,
1373 class GlobalOrdinal,
1374 class Node>
1375void mult_A_B(
1376 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1377 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1378 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1379 const std::string& /* label */,
1380 const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1381{
1382 using Teuchos::Array;
1383 using Teuchos::ArrayRCP;
1384 using Teuchos::ArrayView;
1385 using Teuchos::OrdinalTraits;
1386 using Teuchos::null;
1387
1388 typedef Teuchos::ScalarTraits<Scalar> STS;
1389 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1390 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1391 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1392
1393 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1394 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1395
1396 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1397 ArrayView<const GlobalOrdinal> bcols_import = null;
1398 if (Bview.importColMap != null) {
1399 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1400 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1401
1402 bcols_import = Bview.importColMap->getLocalElementList();
1403 }
1404
1405 size_t C_numCols = C_lastCol - C_firstCol +
1406 OrdinalTraits<LocalOrdinal>::one();
1407 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1408 OrdinalTraits<LocalOrdinal>::one();
1409
1410 if (C_numCols_import > C_numCols)
1411 C_numCols = C_numCols_import;
1412
1413 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1414 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1415 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1416
1417 Array<Scalar> C_row_i = dwork;
1418 Array<GlobalOrdinal> C_cols = iwork;
1419 Array<size_t> c_index = iwork2;
1420 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1421 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1422
1423 size_t C_row_i_length, j, k, last_index;
1424
1425 // Run through all the hash table lookups once and for all
1426 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1427 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1428 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1429 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1430 // Maps are the same: Use local IDs as the hash
1431 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1432 Aview.colMap->getMaxLocalIndex(); i++)
1433 Acol2Brow[i]=i;
1434 }
1435 else {
1436 // Maps are not the same: Use the map's hash
1437 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1438 Aview.colMap->getMaxLocalIndex(); i++) {
1439 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1440 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1441 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1442 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1443 }
1444 }
1445
1446 // To form C = A*B we're going to execute this expression:
1447 //
1448 // C(i,j) = sum_k( A(i,k)*B(k,j) )
1449 //
1450 // Our goal, of course, is to navigate the data in A and B once, without
1451 // performing searches for column-indices, etc.
1452 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1453 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1454 auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1455 auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1456 auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1457 auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1458 decltype(Browptr) Irowptr;
1459 decltype(Bcolind) Icolind;
1460 decltype(Bvals) Ivals;
1461 if(!Bview.importMatrix.is_null()) {
1462 Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1463 Icolind = Bview.importMatrix->getLocalIndicesHost();
1464 Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1465 }
1466
1467 bool C_filled = C.isFillComplete();
1468
1469 for (size_t i = 0; i < C_numCols; i++)
1470 c_index[i] = OrdinalTraits<size_t>::invalid();
1471
1472 // Loop over the rows of A.
1473 size_t Arows = Aview.rowMap->getLocalNumElements();
1474 for(size_t i=0; i<Arows; ++i) {
1475
1476 // Only navigate the local portion of Aview... which is, thankfully, all of
1477 // A since this routine doesn't do transpose modes
1478 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1479
1480 // Loop across the i-th row of A and for each corresponding row in B, loop
1481 // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1482 // quantities C_row_i. In other words, as we stride across B(k,:) we're
1483 // calculating updates for row i of the result matrix C.
1484 C_row_i_length = OrdinalTraits<size_t>::zero();
1485
1486 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1487 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1488 const Scalar Aval = Avals[k];
1489 if (Aval == STS::zero())
1490 continue;
1491
1492 if (Ak == LO_INVALID)
1493 continue;
1494
1495 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1496 LocalOrdinal col = Bcolind[j];
1497 //assert(col >= 0 && col < C_numCols);
1498
1499 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1500 //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1501 // This has to be a += so insertGlobalValue goes out
1502 C_row_i[C_row_i_length] = Aval*Bvals[j];
1503 C_cols[C_row_i_length] = col;
1504 c_index[col] = C_row_i_length;
1505 C_row_i_length++;
1506
1507 } else {
1508 // static cast from impl_scalar_type to Scalar needed for complex
1509 C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Bvals[j]);
1510 }
1511 }
1512 }
1513
1514 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1515 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1516 C_cols[ii] = bcols[C_cols[ii]];
1517 combined_index[ii] = C_cols[ii];
1518 combined_values[ii] = C_row_i[ii];
1519 }
1520 last_index = C_row_i_length;
1521
1522 //
1523 //Now put the C_row_i values into C.
1524 //
1525 // We might have to revamp this later.
1526 C_row_i_length = OrdinalTraits<size_t>::zero();
1527
1528 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1529 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1530 const Scalar Aval = Avals[k];
1531 if (Aval == STS::zero())
1532 continue;
1533
1534 if (Ak!=LO_INVALID) continue;
1535
1536 Ak = Acol2Irow[Acolind[k]];
1537 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1538 LocalOrdinal col = Icolind[j];
1539 //assert(col >= 0 && col < C_numCols);
1540
1541 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1542 //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1543 // This has to be a += so insertGlobalValue goes out
1544 C_row_i[C_row_i_length] = Aval*Ivals[j];
1545 C_cols[C_row_i_length] = col;
1546 c_index[col] = C_row_i_length;
1547 C_row_i_length++;
1548
1549 } else {
1550 // This has to be a += so insertGlobalValue goes out
1551 // static cast from impl_scalar_type to Scalar needed for complex
1552 C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Ivals[j]);
1553 }
1554 }
1555 }
1556
1557 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1558 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1559 C_cols[ii] = bcols_import[C_cols[ii]];
1560 combined_index[last_index] = C_cols[ii];
1561 combined_values[last_index] = C_row_i[ii];
1562 last_index++;
1563 }
1564
1565 // Now put the C_row_i values into C.
1566 // We might have to revamp this later.
1567 C_filled ?
1568 C.sumIntoGlobalValues(
1569 global_row,
1570 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1571 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1572 :
1573 C.insertGlobalValues(
1574 global_row,
1575 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1576 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1577
1578 }
1579}
1580
1581/*********************************************************************************************************/
1582template<class Scalar,
1583 class LocalOrdinal,
1584 class GlobalOrdinal,
1585 class Node>
1586void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1587 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1588 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1589
1590 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1591 Mview.maxNumRowEntries = Mview.indices[0].size();
1592
1593 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1594 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1595 Mview.maxNumRowEntries = Mview.indices[i].size();
1596 }
1597}
1598
1599/*********************************************************************************************************/
1600template<class CrsMatrixType>
1601size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1602 // Follows the NZ estimate in ML's ml_matmatmult.c
1603 size_t Aest = 100, Best=100;
1604 if (A.getLocalNumEntries() >= A.getLocalNumRows())
1605 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
1606 if (B.getLocalNumEntries() >= B.getLocalNumRows())
1607 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
1608
1609 size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1610 nnzperrow *= nnzperrow;
1611
1612 return (size_t)(A.getLocalNumRows()*nnzperrow*0.75 + 100);
1613}
1614
1615
1616/*********************************************************************************************************/
1617// Kernel method for computing the local portion of C = A*B for CrsMatrix
1618//
1619// mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1620// function, so this is probably the function we want to
1621// thread-parallelize.
1622template<class Scalar,
1623 class LocalOrdinal,
1624 class GlobalOrdinal,
1625 class Node>
1626void mult_A_B_newmatrix(
1627 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1628 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1629 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1630 const std::string& label,
1631 const Teuchos::RCP<Teuchos::ParameterList>& params)
1632{
1633 using Teuchos::Array;
1634 using Teuchos::ArrayRCP;
1635 using Teuchos::ArrayView;
1636 using Teuchos::RCP;
1637 using Teuchos::rcp;
1638
1639 // Tpetra typedefs
1640 typedef LocalOrdinal LO;
1641 typedef GlobalOrdinal GO;
1642 typedef Node NO;
1643 typedef Import<LO,GO,NO> import_type;
1644 typedef Map<LO,GO,NO> map_type;
1645
1646 // Kokkos typedefs
1647 typedef typename map_type::local_map_type local_map_type;
1649 typedef typename KCRS::StaticCrsGraphType graph_t;
1650 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1651 typedef typename NO::execution_space execution_space;
1652 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1653 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1654
1655#ifdef HAVE_TPETRA_MMM_TIMINGS
1656 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1657 using Teuchos::TimeMonitor;
1658 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1659#endif
1660 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1661
1662 // Build the final importer / column map, hash table lookups for C
1663 RCP<const import_type> Cimport;
1664 RCP<const map_type> Ccolmap;
1665 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1666 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1667 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1668 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1669 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1670 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1671 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1672 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1673
1674 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1675 // indices of B, to local column indices of C. (B and C have the
1676 // same number of columns.) The kernel uses this, instead of
1677 // copying the entire input matrix B and converting its column
1678 // indices to those of C.
1679 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1680
1681 if (Bview.importMatrix.is_null()) {
1682 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1683 Cimport = Bimport;
1684 Ccolmap = Bview.colMap;
1685 const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1686 // Bcol2Ccol is trivial
1687 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1688 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1689 KOKKOS_LAMBDA(const LO i) {
1690 Bcol2Ccol(i) = i;
1691 });
1692 }
1693 else {
1694 // mfh 27 Sep 2016: B has "remotes," so we need to build the
1695 // column Map of C, as well as C's Import object (from its domain
1696 // Map to its column Map). C's column Map is the union of the
1697 // column Maps of (the local part of) B, and the "remote" part of
1698 // B. Ditto for the Import. We have optimized this "setUnion"
1699 // operation on Import objects and Maps.
1700
1701 // Choose the right variant of setUnion
1702 if (!Bimport.is_null() && !Iimport.is_null()) {
1703 Cimport = Bimport->setUnion(*Iimport);
1704 }
1705 else if (!Bimport.is_null() && Iimport.is_null()) {
1706 Cimport = Bimport->setUnion();
1707 }
1708 else if (Bimport.is_null() && !Iimport.is_null()) {
1709 Cimport = Iimport->setUnion();
1710 }
1711 else {
1712 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1713 }
1714 Ccolmap = Cimport->getTargetMap();
1715
1716 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1717 // in general. We should get rid of it in order to reduce
1718 // communication costs of sparse matrix-matrix multiply.
1719 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1720 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1721
1722 // NOTE: This is not efficient and should be folded into setUnion
1723 //
1724 // mfh 27 Sep 2016: What the above comment means, is that the
1725 // setUnion operation on Import objects could also compute these
1726 // local index - to - local index look-up tables.
1727 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1728 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1729 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1730 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1731 });
1732 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1733 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1734 });
1735
1736 }
1737
1738 // Replace the column map
1739 //
1740 // mfh 27 Sep 2016: We do this because C was originally created
1741 // without a column Map. Now we have its column Map.
1742 C.replaceColMap(Ccolmap);
1743
1744 // mfh 27 Sep 2016: Construct tables that map from local column
1745 // indices of A, to local row indices of either B_local (the locally
1746 // owned part of B), or B_remote (the "imported" remote part of B).
1747 //
1748 // For column index Aik in row i of A, if the corresponding row of B
1749 // exists in the local part of B ("orig") (which I'll call B_local),
1750 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1751 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1752 //
1753 // For column index Aik in row i of A, if the corresponding row of B
1754 // exists in the remote part of B ("Import") (which I'll call
1755 // B_remote), then targetMapToImportRow[Aik] is the local index of
1756 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1757 // (a flag value).
1758
1759 // Run through all the hash table lookups once and for all
1760 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
1761 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
1762
1763 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1764 GO aidx = Acolmap_local.getGlobalElement(i);
1765 LO B_LID = Browmap_local.getLocalElement(aidx);
1766 if (B_LID != LO_INVALID) {
1767 targetMapToOrigRow(i) = B_LID;
1768 targetMapToImportRow(i) = LO_INVALID;
1769 } else {
1770 LO I_LID = Irowmap_local.getLocalElement(aidx);
1771 targetMapToOrigRow(i) = LO_INVALID;
1772 targetMapToImportRow(i) = I_LID;
1773
1774 }
1775 });
1776
1777 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1778 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1779 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1780
1781}
1782
1783/*********************************************************************************************************/
1784// Kernel method for computing the local portion of C = A*B for BlockCrsMatrix
1785template<class Scalar,
1786 class LocalOrdinal,
1787 class GlobalOrdinal,
1788 class Node>
1789void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1790 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1791 Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > &C)
1792{
1793 using Teuchos::null;
1794 using Teuchos::Array;
1795 using Teuchos::ArrayRCP;
1796 using Teuchos::ArrayView;
1797 using Teuchos::RCP;
1798 using Teuchos::rcp;
1799
1800 // Tpetra typedefs
1801 typedef LocalOrdinal LO;
1802 typedef GlobalOrdinal GO;
1803 typedef Node NO;
1804 typedef Import<LO,GO,NO> import_type;
1805 typedef Map<LO,GO,NO> map_type;
1806 typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1807 typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1808
1809 // Kokkos typedefs
1810 typedef typename map_type::local_map_type local_map_type;
1811 typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1812 typedef typename KBSR::device_type device_t;
1813 typedef typename KBSR::StaticCrsGraphType static_graph_t;
1814 typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1815 typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1816 typedef typename KBSR::values_type::non_const_type scalar_view_t;
1817 typedef typename NO::execution_space execution_space;
1818 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1819 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1820
1821 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1822
1823 // Build the final importer / column map, hash table lookups for C
1824 RCP<const import_type> Cimport;
1825 RCP<const map_type> Ccolmap;
1826 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1827 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1828 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1829 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1830 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1831 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1832 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1833 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1834
1835 // Bcol2Ccol is a table that maps from local column
1836 // indices of B, to local column indices of C. (B and C have the
1837 // same number of columns.) The kernel uses this, instead of
1838 // copying the entire input matrix B and converting its column
1839 // indices to those of C.
1840 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1841
1842 if (Bview.importMatrix.is_null()) {
1843 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1844 Cimport = Bimport;
1845 Ccolmap = Bview.colMap;
1846 const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1847 // Bcol2Ccol is trivial
1848 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1849 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1850 KOKKOS_LAMBDA(const LO i) {
1851 Bcol2Ccol(i) = i;
1852 });
1853 }
1854 else {
1855 // B has "remotes," so we need to build the
1856 // column Map of C, as well as C's Import object (from its domain
1857 // Map to its column Map). C's column Map is the union of the
1858 // column Maps of (the local part of) B, and the "remote" part of
1859 // B. Ditto for the Import. We have optimized this "setUnion"
1860 // operation on Import objects and Maps.
1861
1862 // Choose the right variant of setUnion
1863 if (!Bimport.is_null() && !Iimport.is_null()) {
1864 Cimport = Bimport->setUnion(*Iimport);
1865 }
1866 else if (!Bimport.is_null() && Iimport.is_null()) {
1867 Cimport = Bimport->setUnion();
1868 }
1869 else if (Bimport.is_null() && !Iimport.is_null()) {
1870 Cimport = Iimport->setUnion();
1871 }
1872 else {
1873 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1874 }
1875 Ccolmap = Cimport->getTargetMap();
1876
1877 // NOTE: This is not efficient and should be folded into setUnion
1878 //
1879 // What the above comment means, is that the
1880 // setUnion operation on Import objects could also compute these
1881 // local index - to - local index look-up tables.
1882 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1883 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1884 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1885 range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),
1886 KOKKOS_LAMBDA(const LO i) {
1887 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1888 });
1889 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1890 range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),
1891 KOKKOS_LAMBDA(const LO i) {
1892 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1893 });
1894 }
1895
1896 // Construct tables that map from local column
1897 // indices of A, to local row indices of either B_local (the locally
1898 // owned part of B), or B_remote (the "imported" remote part of B).
1899 //
1900 // For column index Aik in row i of A, if the corresponding row of B
1901 // exists in the local part of B ("orig") (which I'll call B_local),
1902 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1903 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1904 //
1905 // For column index Aik in row i of A, if the corresponding row of B
1906 // exists in the remote part of B ("Import") (which I'll call
1907 // B_remote), then targetMapToImportRow[Aik] is the local index of
1908 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1909 // (a flag value).
1910
1911 // Run through all the hash table lookups once and for all
1912 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),
1913 Aview.colMap->getLocalNumElements());
1914 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),
1915 Aview.colMap->getLocalNumElements());
1916
1917 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",
1918 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),
1919 KOKKOS_LAMBDA(const LO i) {
1920 GO aidx = Acolmap_local.getGlobalElement(i);
1921 LO B_LID = Browmap_local.getLocalElement(aidx);
1922 if (B_LID != LO_INVALID) {
1923 targetMapToOrigRow(i) = B_LID;
1924 targetMapToImportRow(i) = LO_INVALID;
1925 } else {
1926 LO I_LID = Irowmap_local.getLocalElement(aidx);
1927 targetMapToOrigRow(i) = LO_INVALID;
1928 targetMapToImportRow(i) = I_LID;
1929 }
1930 });
1931
1932 // Create the KernelHandle
1933 using KernelHandle =
1934 KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_view_t::const_value_type,
1935 typename lno_nnz_view_t::const_value_type,
1936 typename scalar_view_t::const_value_type,
1937 typename device_t::execution_space,
1938 typename device_t::memory_space,
1939 typename device_t::memory_space>;
1940 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
1941 std::string myalg("SPGEMM_KK_MEMORY");
1942 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
1943
1944 KernelHandle kh;
1945 kh.create_spgemm_handle(alg_enum);
1946 kh.set_team_work_size(team_work_size);
1947
1948 // Get KokkosSparse::BsrMatrix for A and Bmerged (B and BImport)
1949 const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
1950 const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,
1951 targetMapToOrigRow,targetMapToImportRow,
1952 Bcol2Ccol,Icol2Ccol,
1953 Ccolmap.getConst()->getLocalNumElements());
1954
1955 RCP<graph_t> graphC;
1956 typename KBSR::values_type values;
1957 {
1958 // Call KokkosSparse routines to calculate Amat*Bmerged on device.
1959 // NOTE: Need to scope guard this since the BlockCrs constructor will need to copy the host graph
1960 KBSR Cmat;
1961 KokkosSparse::block_spgemm_symbolic(kh, Amat, false, Bmerged, false, Cmat);
1962 KokkosSparse::block_spgemm_numeric (kh, Amat, false, Bmerged, false, Cmat);
1963 kh.destroy_spgemm_handle();
1964
1965 // Build Tpetra::BlockCrsMatrix from KokkosSparse::BsrMatrix
1966 graphC = rcp(new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
1967 values = Cmat.values;
1968 }
1969 C = rcp (new block_crs_matrix_type (*graphC, values, Aview.blocksize));
1970
1971}
1972
1973/*********************************************************************************************************/
1974// AB NewMatrix Kernel wrappers (Default non-threaded version for CrsMatrix)
1975template<class Scalar,
1976 class LocalOrdinal,
1977 class GlobalOrdinal,
1978 class Node,
1979 class LocalOrdinalViewType>
1980void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1981 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1982 const LocalOrdinalViewType & targetMapToOrigRow,
1983 const LocalOrdinalViewType & targetMapToImportRow,
1984 const LocalOrdinalViewType & Bcol2Ccol,
1985 const LocalOrdinalViewType & Icol2Ccol,
1986 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1987 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1988 const std::string& label,
1989 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1990#ifdef HAVE_TPETRA_MMM_TIMINGS
1991 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1992 using Teuchos::TimeMonitor;
1993 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
1994#endif
1995
1996 using Teuchos::Array;
1997 using Teuchos::ArrayRCP;
1998 using Teuchos::ArrayView;
1999 using Teuchos::RCP;
2000 using Teuchos::rcp;
2001
2002 // Lots and lots of typedefs
2003 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2004 typedef typename KCRS::StaticCrsGraphType graph_t;
2005 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2006 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2007 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2008 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2009
2010 typedef Scalar SC;
2011 typedef LocalOrdinal LO;
2012 typedef GlobalOrdinal GO;
2013 typedef Node NO;
2014 typedef Map<LO,GO,NO> map_type;
2015 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2016 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2017 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2018
2019 // Sizes
2020 RCP<const map_type> Ccolmap = C.getColMap();
2021 size_t m = Aview.origMatrix->getLocalNumRows();
2022 size_t n = Ccolmap->getLocalNumElements();
2023 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2024
2025 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2026 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2027 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2028
2029 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2030 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2031 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2032
2033 c_lno_view_t Irowptr;
2034 lno_nnz_view_t Icolind;
2035 scalar_view_t Ivals;
2036 if(!Bview.importMatrix.is_null()) {
2037 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2038 Irowptr = lclB.graph.row_map;
2039 Icolind = lclB.graph.entries;
2040 Ivals = lclB.values;
2041 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2042 }
2043
2044#ifdef HAVE_TPETRA_MMM_TIMINGS
2045 RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2046#endif
2047
2048 // Classic csr assembly (low memory edition)
2049 //
2050 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2051 // The method loops over rows of A, and may resize after processing
2052 // each row. Chris Siefert says that this reflects experience in
2053 // ML; for the non-threaded case, ML found it faster to spend less
2054 // effort on estimation and risk an occasional reallocation.
2055 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2056 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2057 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2058 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2059
2060 // mfh 27 Sep 2016: The c_status array is an implementation detail
2061 // of the local sparse matrix-matrix multiply routine.
2062
2063 // The status array will contain the index into colind where this entry was last deposited.
2064 // c_status[i] < CSR_ip - not in the row yet
2065 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2066 // We start with this filled with INVALID's indicating that there are no entries yet.
2067 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2068 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2069 std::vector<size_t> c_status(n, ST_INVALID);
2070
2071 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2072 // routine. The routine computes C := A * (B_local + B_remote).
2073 //
2074 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2075 // you whether the corresponding row of B belongs to B_local
2076 // ("orig") or B_remote ("Import").
2077
2078 // For each row of A/C
2079 size_t CSR_ip = 0, OLD_ip = 0;
2080 for (size_t i = 0; i < m; i++) {
2081 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2082 // on the calling process.
2083 Crowptr[i] = CSR_ip;
2084
2085 // mfh 27 Sep 2016: For each entry of A in the current row of A
2086 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2087 LO Aik = Acolind[k]; // local column index of current entry of A
2088 const SC Aval = Avals[k]; // value of current entry of A
2089 if (Aval == SC_ZERO)
2090 continue; // skip explicitly stored zero values in A
2091
2092 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2093 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2094 // corresponding to the current entry of A is populated, then
2095 // the corresponding row of B is in B_local (i.e., it lives on
2096 // the calling process).
2097
2098 // Local matrix
2099 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2100
2101 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2102 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2103 LO Bkj = Bcolind[j];
2104 LO Cij = Bcol2Ccol[Bkj];
2105
2106 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2107 // New entry
2108 c_status[Cij] = CSR_ip;
2109 Ccolind[CSR_ip] = Cij;
2110 Cvals[CSR_ip] = Aval*Bvals[j];
2111 CSR_ip++;
2112
2113 } else {
2114 Cvals[c_status[Cij]] += Aval*Bvals[j];
2115 }
2116 }
2117
2118 } else {
2119 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2120 // corresponding to the current entry of A NOT populated (has
2121 // a flag "invalid" value), then the corresponding row of B is
2122 // in B_local (i.e., it lives on the calling process).
2123
2124 // Remote matrix
2125 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2126 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2127 LO Ikj = Icolind[j];
2128 LO Cij = Icol2Ccol[Ikj];
2129
2130 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2131 // New entry
2132 c_status[Cij] = CSR_ip;
2133 Ccolind[CSR_ip] = Cij;
2134 Cvals[CSR_ip] = Aval*Ivals[j];
2135 CSR_ip++;
2136 } else {
2137 Cvals[c_status[Cij]] += Aval*Ivals[j];
2138 }
2139 }
2140 }
2141 }
2142
2143 // Resize for next pass if needed
2144 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2145 CSR_alloc *= 2;
2146 Kokkos::resize(Ccolind,CSR_alloc);
2147 Kokkos::resize(Cvals,CSR_alloc);
2148 }
2149 OLD_ip = CSR_ip;
2150 }
2151
2152 Crowptr[m] = CSR_ip;
2153
2154 // Downward resize
2155 Kokkos::resize(Ccolind,CSR_ip);
2156 Kokkos::resize(Cvals,CSR_ip);
2157
2158#ifdef HAVE_TPETRA_MMM_TIMINGS
2159 {
2160 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
2161#endif
2162
2163 // Final sort & set of CRS arrays
2164 if (params.is_null() || params->get("sort entries",true))
2165 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2166 C.setAllValues(Crowptr,Ccolind, Cvals);
2167
2168
2169#ifdef HAVE_TPETRA_MMM_TIMINGS
2170 }
2171 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
2172 {
2173#endif
2174
2175 // Final FillComplete
2176 //
2177 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2178 // Import (from domain Map to column Map) construction (which costs
2179 // lots of communication) by taking the previously constructed
2180 // Import object. We should be able to do this without interfering
2181 // with the implementation of the local part of sparse matrix-matrix
2182 // multply above.
2183 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2184 labelList->set("Timer Label",label);
2185 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2186 RCP<const Export<LO,GO,NO> > dummyExport;
2187 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2188#ifdef HAVE_TPETRA_MMM_TIMINGS
2189 }
2190 MM2 = Teuchos::null;
2191#endif
2192
2193}
2194/*********************************************************************************************************/
2195// Kernel method for computing the local portion of C = A*B (reuse)
2196template<class Scalar,
2197 class LocalOrdinal,
2198 class GlobalOrdinal,
2199 class Node>
2200void mult_A_B_reuse(
2201 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2202 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2203 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2204 const std::string& label,
2205 const Teuchos::RCP<Teuchos::ParameterList>& params)
2206{
2207 using Teuchos::Array;
2208 using Teuchos::ArrayRCP;
2209 using Teuchos::ArrayView;
2210 using Teuchos::RCP;
2211 using Teuchos::rcp;
2212
2213 // Tpetra typedefs
2214 typedef LocalOrdinal LO;
2215 typedef GlobalOrdinal GO;
2216 typedef Node NO;
2217 typedef Import<LO,GO,NO> import_type;
2218 typedef Map<LO,GO,NO> map_type;
2219
2220 // Kokkos typedefs
2221 typedef typename map_type::local_map_type local_map_type;
2223 typedef typename KCRS::StaticCrsGraphType graph_t;
2224 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2225 typedef typename NO::execution_space execution_space;
2226 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2227 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2228
2229#ifdef HAVE_TPETRA_MMM_TIMINGS
2230 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2231 using Teuchos::TimeMonitor;
2232 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
2233#endif
2234 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2235
2236 // Grab all the maps
2237 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2238 RCP<const map_type> Ccolmap = C.getColMap();
2239 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2240 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2241 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2242 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2243 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2244 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2245
2246 // Build the final importer / column map, hash table lookups for C
2247 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2248 {
2249 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2250 // So, column map of C may be a strict subset of the column map of B
2251 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2252 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2253 });
2254
2255 if (!Bview.importMatrix.is_null()) {
2256 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2257 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2258
2259 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2260 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2261 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2262 });
2263 }
2264 }
2265
2266 // Run through all the hash table lookups once and for all
2267 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2268 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2269 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2270 GO aidx = Acolmap_local.getGlobalElement(i);
2271 LO B_LID = Browmap_local.getLocalElement(aidx);
2272 if (B_LID != LO_INVALID) {
2273 targetMapToOrigRow(i) = B_LID;
2274 targetMapToImportRow(i) = LO_INVALID;
2275 } else {
2276 LO I_LID = Irowmap_local.getLocalElement(aidx);
2277 targetMapToOrigRow(i) = LO_INVALID;
2278 targetMapToImportRow(i) = I_LID;
2279
2280 }
2281 });
2282
2283 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2284 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2285 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2286}
2287
2288/*********************************************************************************************************/
2289template<class Scalar,
2290 class LocalOrdinal,
2291 class GlobalOrdinal,
2292 class Node,
2293 class LocalOrdinalViewType>
2294void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2295 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2296 const LocalOrdinalViewType & targetMapToOrigRow,
2297 const LocalOrdinalViewType & targetMapToImportRow,
2298 const LocalOrdinalViewType & Bcol2Ccol,
2299 const LocalOrdinalViewType & Icol2Ccol,
2300 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2301 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2302 const std::string& label,
2303 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2304#ifdef HAVE_TPETRA_MMM_TIMINGS
2305 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2306 using Teuchos::TimeMonitor;
2307 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2308 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2309#else
2310 (void)label;
2311#endif
2312 using Teuchos::RCP;
2313 using Teuchos::rcp;
2314
2315
2316 // Lots and lots of typedefs
2317 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2318 typedef typename KCRS::StaticCrsGraphType graph_t;
2319 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2320 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2321 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2322
2323 typedef Scalar SC;
2324 typedef LocalOrdinal LO;
2325 typedef GlobalOrdinal GO;
2326 typedef Node NO;
2327 typedef Map<LO,GO,NO> map_type;
2328 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2329 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2330 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2331
2332 // Sizes
2333 RCP<const map_type> Ccolmap = C.getColMap();
2334 size_t m = Aview.origMatrix->getLocalNumRows();
2335 size_t n = Ccolmap->getLocalNumElements();
2336
2337 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2338 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2339 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2340 const KCRS & Cmat = C.getLocalMatrixHost();
2341
2342 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2343 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2344 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2345 scalar_view_t Cvals = Cmat.values;
2346
2347 c_lno_view_t Irowptr;
2348 lno_nnz_view_t Icolind;
2349 scalar_view_t Ivals;
2350 if(!Bview.importMatrix.is_null()) {
2351 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2352 Irowptr = lclB.graph.row_map;
2353 Icolind = lclB.graph.entries;
2354 Ivals = lclB.values;
2355 }
2356
2357#ifdef HAVE_TPETRA_MMM_TIMINGS
2358 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2359#endif
2360
2361 // Classic csr assembly (low memory edition)
2362 // mfh 27 Sep 2016: The c_status array is an implementation detail
2363 // of the local sparse matrix-matrix multiply routine.
2364
2365 // The status array will contain the index into colind where this entry was last deposited.
2366 // c_status[i] < CSR_ip - not in the row yet
2367 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2368 // We start with this filled with INVALID's indicating that there are no entries yet.
2369 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2370 std::vector<size_t> c_status(n, ST_INVALID);
2371
2372 // For each row of A/C
2373 size_t CSR_ip = 0, OLD_ip = 0;
2374 for (size_t i = 0; i < m; i++) {
2375 // First fill the c_status array w/ locations where we're allowed to
2376 // generate nonzeros for this row
2377 OLD_ip = Crowptr[i];
2378 CSR_ip = Crowptr[i+1];
2379 for (size_t k = OLD_ip; k < CSR_ip; k++) {
2380 c_status[Ccolind[k]] = k;
2381
2382 // Reset values in the row of C
2383 Cvals[k] = SC_ZERO;
2384 }
2385
2386 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2387 LO Aik = Acolind[k];
2388 const SC Aval = Avals[k];
2389 if (Aval == SC_ZERO)
2390 continue;
2391
2392 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2393 // Local matrix
2394 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2395
2396 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2397 LO Bkj = Bcolind[j];
2398 LO Cij = Bcol2Ccol[Bkj];
2399
2400 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2401 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2402 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2403
2404 Cvals[c_status[Cij]] += Aval * Bvals[j];
2405 }
2406
2407 } else {
2408 // Remote matrix
2409 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2410 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2411 LO Ikj = Icolind[j];
2412 LO Cij = Icol2Ccol[Ikj];
2413
2414 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2415 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2416 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2417
2418 Cvals[c_status[Cij]] += Aval * Ivals[j];
2419 }
2420 }
2421 }
2422 }
2423
2424#ifdef HAVE_TPETRA_MMM_TIMINGS
2425 auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2426#endif
2427
2428 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2429}
2430
2431
2432/*********************************************************************************************************/
2433// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2434template<class Scalar,
2435 class LocalOrdinal,
2436 class GlobalOrdinal,
2437 class Node>
2438void jacobi_A_B_newmatrix(
2439 Scalar omega,
2440 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2441 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2442 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2443 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2444 const std::string& label,
2445 const Teuchos::RCP<Teuchos::ParameterList>& params)
2446{
2447 using Teuchos::Array;
2448 using Teuchos::ArrayRCP;
2449 using Teuchos::ArrayView;
2450 using Teuchos::RCP;
2451 using Teuchos::rcp;
2452 // typedef Scalar SC;
2453 typedef LocalOrdinal LO;
2454 typedef GlobalOrdinal GO;
2455 typedef Node NO;
2456
2457 typedef Import<LO,GO,NO> import_type;
2458 typedef Map<LO,GO,NO> map_type;
2459 typedef typename map_type::local_map_type local_map_type;
2460
2461 // All of the Kokkos typedefs
2463 typedef typename KCRS::StaticCrsGraphType graph_t;
2464 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2465 typedef typename NO::execution_space execution_space;
2466 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2467 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2468
2469
2470#ifdef HAVE_TPETRA_MMM_TIMINGS
2471 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2472 using Teuchos::TimeMonitor;
2473 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2474#endif
2475 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2476
2477 // Build the final importer / column map, hash table lookups for C
2478 RCP<const import_type> Cimport;
2479 RCP<const map_type> Ccolmap;
2480 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2481 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2482 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2483 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2484 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2485 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2486 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2487 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2488
2489 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2490 // indices of B, to local column indices of C. (B and C have the
2491 // same number of columns.) The kernel uses this, instead of
2492 // copying the entire input matrix B and converting its column
2493 // indices to those of C.
2494 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2495
2496 if (Bview.importMatrix.is_null()) {
2497 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2498 Cimport = Bimport;
2499 Ccolmap = Bview.colMap;
2500 // Bcol2Ccol is trivial
2501 // Bcol2Ccol is trivial
2502
2503 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getLocalNumElements ()));
2504 Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2505 Bcol2Ccol(i) = static_cast<LO> (i);
2506 });
2507 } else {
2508 // mfh 27 Sep 2016: B has "remotes," so we need to build the
2509 // column Map of C, as well as C's Import object (from its domain
2510 // Map to its column Map). C's column Map is the union of the
2511 // column Maps of (the local part of) B, and the "remote" part of
2512 // B. Ditto for the Import. We have optimized this "setUnion"
2513 // operation on Import objects and Maps.
2514
2515 // Choose the right variant of setUnion
2516 if (!Bimport.is_null() && !Iimport.is_null()){
2517 Cimport = Bimport->setUnion(*Iimport);
2518 Ccolmap = Cimport->getTargetMap();
2519
2520 } else if (!Bimport.is_null() && Iimport.is_null()) {
2521 Cimport = Bimport->setUnion();
2522
2523 } else if(Bimport.is_null() && !Iimport.is_null()) {
2524 Cimport = Iimport->setUnion();
2525
2526 } else
2527 throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2528
2529 Ccolmap = Cimport->getTargetMap();
2530
2531 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2532 std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2533
2534 // NOTE: This is not efficient and should be folded into setUnion
2535 //
2536 // mfh 27 Sep 2016: What the above comment means, is that the
2537 // setUnion operation on Import objects could also compute these
2538 // local index - to - local index look-up tables.
2539 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2540 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2541 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2542 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2543 });
2544 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2545 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2546 });
2547
2548 }
2549
2550 // Replace the column map
2551 //
2552 // mfh 27 Sep 2016: We do this because C was originally created
2553 // without a column Map. Now we have its column Map.
2554 C.replaceColMap(Ccolmap);
2555
2556 // mfh 27 Sep 2016: Construct tables that map from local column
2557 // indices of A, to local row indices of either B_local (the locally
2558 // owned part of B), or B_remote (the "imported" remote part of B).
2559 //
2560 // For column index Aik in row i of A, if the corresponding row of B
2561 // exists in the local part of B ("orig") (which I'll call B_local),
2562 // then targetMapToOrigRow[Aik] is the local index of that row of B.
2563 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2564 //
2565 // For column index Aik in row i of A, if the corresponding row of B
2566 // exists in the remote part of B ("Import") (which I'll call
2567 // B_remote), then targetMapToImportRow[Aik] is the local index of
2568 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2569 // (a flag value).
2570
2571 // Run through all the hash table lookups once and for all
2572 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2573 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2574 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2575 GO aidx = Acolmap_local.getGlobalElement(i);
2576 LO B_LID = Browmap_local.getLocalElement(aidx);
2577 if (B_LID != LO_INVALID) {
2578 targetMapToOrigRow(i) = B_LID;
2579 targetMapToImportRow(i) = LO_INVALID;
2580 } else {
2581 LO I_LID = Irowmap_local.getLocalElement(aidx);
2582 targetMapToOrigRow(i) = LO_INVALID;
2583 targetMapToImportRow(i) = I_LID;
2584
2585 }
2586 });
2587
2588 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2589 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2590 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2591
2592}
2593
2594
2595/*********************************************************************************************************/
2596// Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2597// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2598
2599template<class Scalar,
2600 class LocalOrdinal,
2601 class GlobalOrdinal,
2602 class Node,
2603 class LocalOrdinalViewType>
2604void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2605 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2606 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2607 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2608 const LocalOrdinalViewType & targetMapToOrigRow,
2609 const LocalOrdinalViewType & targetMapToImportRow,
2610 const LocalOrdinalViewType & Bcol2Ccol,
2611 const LocalOrdinalViewType & Icol2Ccol,
2612 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2613 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2614 const std::string& label,
2615 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2616
2617#ifdef HAVE_TPETRA_MMM_TIMINGS
2618 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2619 using Teuchos::TimeMonitor;
2620 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2621#endif
2622
2623 using Teuchos::Array;
2624 using Teuchos::ArrayRCP;
2625 using Teuchos::ArrayView;
2626 using Teuchos::RCP;
2627 using Teuchos::rcp;
2628
2629 // Lots and lots of typedefs
2630 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2631 typedef typename KCRS::StaticCrsGraphType graph_t;
2632 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2633 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2634 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2635 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2636
2637 // Jacobi-specific
2638 typedef typename scalar_view_t::memory_space scalar_memory_space;
2639
2640 typedef Scalar SC;
2641 typedef LocalOrdinal LO;
2642 typedef GlobalOrdinal GO;
2643 typedef Node NO;
2644
2645 typedef Map<LO,GO,NO> map_type;
2646 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2647 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2648
2649 // Sizes
2650 RCP<const map_type> Ccolmap = C.getColMap();
2651 size_t m = Aview.origMatrix->getLocalNumRows();
2652 size_t n = Ccolmap->getLocalNumElements();
2653 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2654
2655 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2656 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2657 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2658
2659 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2660 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2661 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2662
2663 c_lno_view_t Irowptr;
2664 lno_nnz_view_t Icolind;
2665 scalar_view_t Ivals;
2666 if(!Bview.importMatrix.is_null()) {
2667 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2668 Irowptr = lclB.graph.row_map;
2669 Icolind = lclB.graph.entries;
2670 Ivals = lclB.values;
2671 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2672 }
2673
2674 // Jacobi-specific inner stuff
2675 auto Dvals =
2676 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2677
2678 // Teuchos::ArrayView::operator[].
2679 // The status array will contain the index into colind where this entry was last deposited.
2680 // c_status[i] < CSR_ip - not in the row yet.
2681 // c_status[i] >= CSR_ip, this is the entry where you can find the data
2682 // We start with this filled with INVALID's indicating that there are no entries yet.
2683 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2684 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2685 Array<size_t> c_status(n, ST_INVALID);
2686
2687 // Classic csr assembly (low memory edition)
2688 //
2689 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2690 // The method loops over rows of A, and may resize after processing
2691 // each row. Chris Siefert says that this reflects experience in
2692 // ML; for the non-threaded case, ML found it faster to spend less
2693 // effort on estimation and risk an occasional reallocation.
2694 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2695 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2696 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2697 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2698 size_t CSR_ip = 0, OLD_ip = 0;
2699
2700 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2701
2702 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2703 // routine. The routine computes
2704 //
2705 // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2706 //
2707 // This corresponds to one sweep of (weighted) Jacobi.
2708 //
2709 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2710 // you whether the corresponding row of B belongs to B_local
2711 // ("orig") or B_remote ("Import").
2712
2713 // For each row of A/C
2714 for (size_t i = 0; i < m; i++) {
2715 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2716 // on the calling process.
2717 Crowptr[i] = CSR_ip;
2718 SC minusOmegaDval = -omega*Dvals(i,0);
2719
2720 // Entries of B
2721 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2722 Scalar Bval = Bvals[j];
2723 if (Bval == SC_ZERO)
2724 continue;
2725 LO Bij = Bcolind[j];
2726 LO Cij = Bcol2Ccol[Bij];
2727
2728 // Assume no repeated entries in B
2729 c_status[Cij] = CSR_ip;
2730 Ccolind[CSR_ip] = Cij;
2731 Cvals[CSR_ip] = Bvals[j];
2732 CSR_ip++;
2733 }
2734
2735 // Entries of -omega * Dinv * A * B
2736 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2737 LO Aik = Acolind[k];
2738 const SC Aval = Avals[k];
2739 if (Aval == SC_ZERO)
2740 continue;
2741
2742 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2743 // Local matrix
2744 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2745
2746 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2747 LO Bkj = Bcolind[j];
2748 LO Cij = Bcol2Ccol[Bkj];
2749
2750 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2751 // New entry
2752 c_status[Cij] = CSR_ip;
2753 Ccolind[CSR_ip] = Cij;
2754 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2755 CSR_ip++;
2756
2757 } else {
2758 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2759 }
2760 }
2761
2762 } else {
2763 // Remote matrix
2764 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2765 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2766 LO Ikj = Icolind[j];
2767 LO Cij = Icol2Ccol[Ikj];
2768
2769 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2770 // New entry
2771 c_status[Cij] = CSR_ip;
2772 Ccolind[CSR_ip] = Cij;
2773 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2774 CSR_ip++;
2775 } else {
2776 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2777 }
2778 }
2779 }
2780 }
2781
2782 // Resize for next pass if needed
2783 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2784 CSR_alloc *= 2;
2785 Kokkos::resize(Ccolind,CSR_alloc);
2786 Kokkos::resize(Cvals,CSR_alloc);
2787 }
2788 OLD_ip = CSR_ip;
2789 }
2790 Crowptr[m] = CSR_ip;
2791
2792 // Downward resize
2793 Kokkos::resize(Ccolind,CSR_ip);
2794 Kokkos::resize(Cvals,CSR_ip);
2795
2796 {
2797#ifdef HAVE_TPETRA_MMM_TIMINGS
2798 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2799#endif
2800
2801 // Replace the column map
2802 //
2803 // mfh 27 Sep 2016: We do this because C was originally created
2804 // without a column Map. Now we have its column Map.
2805 C.replaceColMap(Ccolmap);
2806
2807 // Final sort & set of CRS arrays
2808 //
2809 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2810 // matrix-matrix multiply routine sort the entries for us?
2811 // Final sort & set of CRS arrays
2812 if (params.is_null() || params->get("sort entries",true))
2813 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2814 C.setAllValues(Crowptr,Ccolind, Cvals);
2815 }
2816 {
2817#ifdef HAVE_TPETRA_MMM_TIMINGS
2818 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2819#endif
2820
2821 // Final FillComplete
2822 //
2823 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2824 // Import (from domain Map to column Map) construction (which costs
2825 // lots of communication) by taking the previously constructed
2826 // Import object. We should be able to do this without interfering
2827 // with the implementation of the local part of sparse matrix-matrix
2828 // multply above
2829 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2830 labelList->set("Timer Label",label);
2831 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2832 RCP<const Export<LO,GO,NO> > dummyExport;
2833 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2834
2835 }
2836}
2837
2838
2839/*********************************************************************************************************/
2840// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2841template<class Scalar,
2842 class LocalOrdinal,
2843 class GlobalOrdinal,
2844 class Node>
2845void jacobi_A_B_reuse(
2846 Scalar omega,
2847 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2848 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2849 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2850 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2851 const std::string& label,
2852 const Teuchos::RCP<Teuchos::ParameterList>& params)
2853{
2854 using Teuchos::Array;
2855 using Teuchos::ArrayRCP;
2856 using Teuchos::ArrayView;
2857 using Teuchos::RCP;
2858 using Teuchos::rcp;
2859
2860 typedef LocalOrdinal LO;
2861 typedef GlobalOrdinal GO;
2862 typedef Node NO;
2863
2864 typedef Import<LO,GO,NO> import_type;
2865 typedef Map<LO,GO,NO> map_type;
2866
2867 // Kokkos typedefs
2868 typedef typename map_type::local_map_type local_map_type;
2870 typedef typename KCRS::StaticCrsGraphType graph_t;
2871 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2872 typedef typename NO::execution_space execution_space;
2873 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2874 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2875
2876#ifdef HAVE_TPETRA_MMM_TIMINGS
2877 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2878 using Teuchos::TimeMonitor;
2879 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2880#endif
2881 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2882
2883 // Grab all the maps
2884 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2885 RCP<const map_type> Ccolmap = C.getColMap();
2886 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2887 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2888 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2889 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2890 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2891 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2892
2893 // Build the final importer / column map, hash table lookups for C
2894 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2895 {
2896 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2897 // So, column map of C may be a strict subset of the column map of B
2898 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2899 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2900 });
2901
2902 if (!Bview.importMatrix.is_null()) {
2903 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2904 std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2905
2906 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2907 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2908 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2909 });
2910 }
2911 }
2912
2913 // Run through all the hash table lookups once and for all
2914 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2915 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2916 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2917 GO aidx = Acolmap_local.getGlobalElement(i);
2918 LO B_LID = Browmap_local.getLocalElement(aidx);
2919 if (B_LID != LO_INVALID) {
2920 targetMapToOrigRow(i) = B_LID;
2921 targetMapToImportRow(i) = LO_INVALID;
2922 } else {
2923 LO I_LID = Irowmap_local.getLocalElement(aidx);
2924 targetMapToOrigRow(i) = LO_INVALID;
2925 targetMapToImportRow(i) = I_LID;
2926
2927 }
2928 });
2929
2930#ifdef HAVE_TPETRA_MMM_TIMINGS
2931 MM = Teuchos::null;
2932#endif
2933
2934 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2935 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2936 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2937}
2938
2939
2940
2941/*********************************************************************************************************/
2942template<class Scalar,
2943 class LocalOrdinal,
2944 class GlobalOrdinal,
2945 class Node,
2946 class LocalOrdinalViewType>
2947void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2948 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2949 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2950 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2951 const LocalOrdinalViewType & targetMapToOrigRow,
2952 const LocalOrdinalViewType & targetMapToImportRow,
2953 const LocalOrdinalViewType & Bcol2Ccol,
2954 const LocalOrdinalViewType & Icol2Ccol,
2955 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2956 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2957 const std::string& label,
2958 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2959#ifdef HAVE_TPETRA_MMM_TIMINGS
2960 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2961 using Teuchos::TimeMonitor;
2962 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
2963 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2964#else
2965 (void)label;
2966#endif
2967 using Teuchos::RCP;
2968 using Teuchos::rcp;
2969
2970 // Lots and lots of typedefs
2971 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2972 typedef typename KCRS::StaticCrsGraphType graph_t;
2973 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2974 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2975 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2976 typedef typename scalar_view_t::memory_space scalar_memory_space;
2977
2978 typedef Scalar SC;
2979 typedef LocalOrdinal LO;
2980 typedef GlobalOrdinal GO;
2981 typedef Node NO;
2982 typedef Map<LO,GO,NO> map_type;
2983 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2984 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2985 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2986
2987 // Sizes
2988 RCP<const map_type> Ccolmap = C.getColMap();
2989 size_t m = Aview.origMatrix->getLocalNumRows();
2990 size_t n = Ccolmap->getLocalNumElements();
2991
2992 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2993 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2994 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2995 const KCRS & Cmat = C.getLocalMatrixHost();
2996
2997 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2998 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2999 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3000 scalar_view_t Cvals = Cmat.values;
3001
3002 c_lno_view_t Irowptr;
3003 lno_nnz_view_t Icolind;
3004 scalar_view_t Ivals;
3005 if(!Bview.importMatrix.is_null()) {
3006 auto lclB = Bview.importMatrix->getLocalMatrixHost();
3007 Irowptr = lclB.graph.row_map;
3008 Icolind = lclB.graph.entries;
3009 Ivals = lclB.values;
3010 }
3011
3012 // Jacobi-specific inner stuff
3013 auto Dvals =
3014 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
3015
3016#ifdef HAVE_TPETRA_MMM_TIMINGS
3017 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
3018#endif
3019
3020 // The status array will contain the index into colind where this entry was last deposited.
3021 // c_status[i] < CSR_ip - not in the row yet
3022 // c_status[i] >= CSR_ip - this is the entry where you can find the data
3023 // We start with this filled with INVALID's indicating that there are no entries yet.
3024 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
3025 std::vector<size_t> c_status(n, ST_INVALID);
3026
3027 // For each row of A/C
3028 size_t CSR_ip = 0, OLD_ip = 0;
3029 for (size_t i = 0; i < m; i++) {
3030
3031 // First fill the c_status array w/ locations where we're allowed to
3032 // generate nonzeros for this row
3033 OLD_ip = Crowptr[i];
3034 CSR_ip = Crowptr[i+1];
3035 for (size_t k = OLD_ip; k < CSR_ip; k++) {
3036 c_status[Ccolind[k]] = k;
3037
3038 // Reset values in the row of C
3039 Cvals[k] = SC_ZERO;
3040 }
3041
3042 SC minusOmegaDval = -omega*Dvals(i,0);
3043
3044 // Entries of B
3045 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3046 Scalar Bval = Bvals[j];
3047 if (Bval == SC_ZERO)
3048 continue;
3049 LO Bij = Bcolind[j];
3050 LO Cij = Bcol2Ccol[Bij];
3051
3052 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3053 std::runtime_error, "Trying to insert a new entry into a static graph");
3054
3055 Cvals[c_status[Cij]] = Bvals[j];
3056 }
3057
3058 // Entries of -omega * Dinv * A * B
3059 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3060 LO Aik = Acolind[k];
3061 const SC Aval = Avals[k];
3062 if (Aval == SC_ZERO)
3063 continue;
3064
3065 if (targetMapToOrigRow[Aik] != LO_INVALID) {
3066 // Local matrix
3067 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
3068
3069 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3070 LO Bkj = Bcolind[j];
3071 LO Cij = Bcol2Ccol[Bkj];
3072
3073 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3074 std::runtime_error, "Trying to insert a new entry into a static graph");
3075
3076 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3077 }
3078
3079 } else {
3080 // Remote matrix
3081 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
3082 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3083 LO Ikj = Icolind[j];
3084 LO Cij = Icol2Ccol[Ikj];
3085
3086 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3087 std::runtime_error, "Trying to insert a new entry into a static graph");
3088
3089 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3090 }
3091 }
3092 }
3093 }
3094
3095#ifdef HAVE_TPETRA_MMM_TIMINGS
3096 MM2 = Teuchos::null;
3097 MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
3098#endif
3099
3100 C.fillComplete(C.getDomainMap(), C.getRangeMap());
3101#ifdef HAVE_TPETRA_MMM_TIMINGS
3102 MM2 = Teuchos::null;
3103 MM = Teuchos::null;
3104#endif
3105
3106}
3107
3108
3109
3110/*********************************************************************************************************/
3111template<class Scalar,
3112 class LocalOrdinal,
3113 class GlobalOrdinal,
3114 class Node>
3115void import_and_extract_views(
3116 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3117 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3118 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3119 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3120 bool userAssertsThereAreNoRemotes,
3121 const std::string& label,
3122 const Teuchos::RCP<Teuchos::ParameterList>& params)
3123{
3124 using Teuchos::Array;
3125 using Teuchos::ArrayView;
3126 using Teuchos::RCP;
3127 using Teuchos::rcp;
3128 using Teuchos::null;
3129
3130 typedef Scalar SC;
3131 typedef LocalOrdinal LO;
3132 typedef GlobalOrdinal GO;
3133 typedef Node NO;
3134
3135 typedef Map<LO,GO,NO> map_type;
3136 typedef Import<LO,GO,NO> import_type;
3137 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3138
3139#ifdef HAVE_TPETRA_MMM_TIMINGS
3140 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3141 using Teuchos::TimeMonitor;
3142 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
3143#endif
3144 // The goal of this method is to populate the 'Aview' struct with views of the
3145 // rows of A, including all rows that correspond to elements in 'targetMap'.
3146 //
3147 // If targetMap includes local elements that correspond to remotely-owned rows
3148 // of A, then those remotely-owned rows will be imported into
3149 // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3150 Aview.deleteContents();
3151
3152 Aview.origMatrix = rcp(&A, false);
3153 Aview.origRowMap = A.getRowMap();
3154 Aview.rowMap = targetMap;
3155 Aview.colMap = A.getColMap();
3156 Aview.domainMap = A.getDomainMap();
3157 Aview.importColMap = null;
3158 RCP<const map_type> rowMap = A.getRowMap();
3159 const int numProcs = rowMap->getComm()->getSize();
3160
3161 // Short circuit if the user swears there are no remotes (or if we're in serial)
3162 if (userAssertsThereAreNoRemotes || numProcs < 2)
3163 return;
3164
3165 RCP<const import_type> importer;
3166 if (params != null && params->isParameter("importer")) {
3167 importer = params->get<RCP<const import_type> >("importer");
3168
3169 } else {
3170#ifdef HAVE_TPETRA_MMM_TIMINGS
3171 MM = Teuchos::null;
3172 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
3173#endif
3174
3175 // Mark each row in targetMap as local or remote, and go ahead and get a view
3176 // for the local rows
3177 RCP<const map_type> remoteRowMap;
3178 size_t numRemote = 0;
3179 int mode = 0;
3180 if (!prototypeImporter.is_null() &&
3181 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3182 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3183 // We have a valid prototype importer --- ask it for the remotes
3184#ifdef HAVE_TPETRA_MMM_TIMINGS
3185 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
3186#endif
3187 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3188 numRemote = prototypeImporter->getNumRemoteIDs();
3189
3190 Array<GO> remoteRows(numRemote);
3191 for (size_t i = 0; i < numRemote; i++)
3192 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3193
3194 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3195 rowMap->getIndexBase(), rowMap->getComm()));
3196 mode = 1;
3197
3198 } else if (prototypeImporter.is_null()) {
3199 // No prototype importer --- count the remotes the hard way
3200#ifdef HAVE_TPETRA_MMM_TIMINGS
3201 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
3202#endif
3203 ArrayView<const GO> rows = targetMap->getLocalElementList();
3204 size_t numRows = targetMap->getLocalNumElements();
3205
3206 Array<GO> remoteRows(numRows);
3207 for(size_t i = 0; i < numRows; ++i) {
3208 const LO mlid = rowMap->getLocalElement(rows[i]);
3209
3210 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3211 remoteRows[numRemote++] = rows[i];
3212 }
3213 remoteRows.resize(numRemote);
3214 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3215 rowMap->getIndexBase(), rowMap->getComm()));
3216 mode = 2;
3217
3218 } else {
3219 // PrototypeImporter is bad. But if we're in serial that's OK.
3220 mode = 3;
3221 }
3222
3223 if (numProcs < 2) {
3224 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3225 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3226 // If only one processor we don't need to import any remote rows, so return.
3227 return;
3228 }
3229
3230 //
3231 // Now we will import the needed remote rows of A, if the global maximum
3232 // value of numRemote is greater than 0.
3233 //
3234#ifdef HAVE_TPETRA_MMM_TIMINGS
3235 MM = Teuchos::null;
3236 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
3237#endif
3238
3239 global_size_t globalMaxNumRemote = 0;
3240 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3241
3242 if (globalMaxNumRemote > 0) {
3243#ifdef HAVE_TPETRA_MMM_TIMINGS
3244 MM = Teuchos::null;
3245 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
3246#endif
3247 // Create an importer with target-map remoteRowMap and source-map rowMap.
3248 if (mode == 1)
3249 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3250 else if (mode == 2)
3251 importer = rcp(new import_type(rowMap, remoteRowMap));
3252 else
3253 throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3254 }
3255
3256 if (params != null)
3257 params->set("importer", importer);
3258 }
3259
3260 if (importer != null) {
3261#ifdef HAVE_TPETRA_MMM_TIMINGS
3262 MM = Teuchos::null;
3263 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3264#endif
3265
3266 // Now create a new matrix into which we can import the remote rows of A that we need.
3267 Teuchos::ParameterList labelList;
3268 labelList.set("Timer Label", label);
3269 auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3270
3271 bool isMM = true;
3272 bool overrideAllreduce = false;
3273 int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3274 // Minor speedup tweak - avoid computing the global constants
3275 Teuchos::ParameterList params_sublist;
3276 if(!params.is_null()) {
3277 labelList.set("compute global constants", params->get("compute global constants",false));
3278 params_sublist = params->sublist("matrixmatrix: kernel params",false);
3279 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3280 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3281 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3282 isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3283 overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3284 }
3285 labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3286 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3287 labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3288
3289 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3290 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3291
3292#if 0
3293 // Disabled code for dumping input matrices
3294 static int count=0;
3295 char str[80];
3296 sprintf(str,"import_matrix.%d.dat",count);
3298 count++;
3299#endif
3300
3301#ifdef HAVE_TPETRA_MMM_STATISTICS
3302 printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3303#endif
3304
3305
3306#ifdef HAVE_TPETRA_MMM_TIMINGS
3307 MM = Teuchos::null;
3308 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3309#endif
3310
3311 // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3312 Aview.importColMap = Aview.importMatrix->getColMap();
3313#ifdef HAVE_TPETRA_MMM_TIMINGS
3314 MM = Teuchos::null;
3315#endif
3316 }
3317}
3318
3319/*********************************************************************************************************/
3320template<class Scalar,
3321 class LocalOrdinal,
3322 class GlobalOrdinal,
3323 class Node>
3324void import_and_extract_views(
3325 const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3326 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3327 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3328 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3329 bool userAssertsThereAreNoRemotes)
3330{
3331 using Teuchos::Array;
3332 using Teuchos::ArrayView;
3333 using Teuchos::RCP;
3334 using Teuchos::rcp;
3335 using Teuchos::null;
3336
3337 typedef Scalar SC;
3338 typedef LocalOrdinal LO;
3339 typedef GlobalOrdinal GO;
3340 typedef Node NO;
3341
3342 typedef Map<LO,GO,NO> map_type;
3343 typedef Import<LO,GO,NO> import_type;
3344 typedef BlockCrsMatrix<SC,LO,GO,NO> blockcrs_matrix_type;
3345
3346 // The goal of this method is to populate the 'Mview' struct with views of the
3347 // rows of M, including all rows that correspond to elements in 'targetMap'.
3348 //
3349 // If targetMap includes local elements that correspond to remotely-owned rows
3350 // of M, then those remotely-owned rows will be imported into
3351 // 'Mview.importMatrix', and views of them will be included in 'Mview'.
3352 Mview.deleteContents();
3353
3354 Mview.origMatrix = rcp(&M, false);
3355 Mview.origRowMap = M.getRowMap();
3356 Mview.rowMap = targetMap;
3357 Mview.colMap = M.getColMap();
3358 Mview.importColMap = null;
3359 RCP<const map_type> rowMap = M.getRowMap();
3360 const int numProcs = rowMap->getComm()->getSize();
3361
3362 // Short circuit if the user swears there are no remotes (or if we're in serial)
3363 if (userAssertsThereAreNoRemotes || numProcs < 2) return;
3364
3365 // Mark each row in targetMap as local or remote, and go ahead and get a view
3366 // for the local rows
3367 RCP<const map_type> remoteRowMap;
3368 size_t numRemote = 0;
3369 int mode = 0;
3370 if (!prototypeImporter.is_null() &&
3371 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3372 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3373
3374 // We have a valid prototype importer --- ask it for the remotes
3375 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3376 numRemote = prototypeImporter->getNumRemoteIDs();
3377
3378 Array<GO> remoteRows(numRemote);
3379 for (size_t i = 0; i < numRemote; i++)
3380 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3381
3382 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3383 rowMap->getIndexBase(), rowMap->getComm()));
3384 mode = 1;
3385
3386 } else if (prototypeImporter.is_null()) {
3387
3388 // No prototype importer --- count the remotes the hard way
3389 ArrayView<const GO> rows = targetMap->getLocalElementList();
3390 size_t numRows = targetMap->getLocalNumElements();
3391
3392 Array<GO> remoteRows(numRows);
3393 for(size_t i = 0; i < numRows; ++i) {
3394 const LO mlid = rowMap->getLocalElement(rows[i]);
3395
3396 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3397 remoteRows[numRemote++] = rows[i];
3398 }
3399 remoteRows.resize(numRemote);
3400 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3401 rowMap->getIndexBase(), rowMap->getComm()));
3402 mode = 2;
3403
3404 } else {
3405 // PrototypeImporter is bad. But if we're in serial that's OK.
3406 mode = 3;
3407 }
3408
3409 if (numProcs < 2) {
3410 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3411 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3412 // If only one processor we don't need to import any remote rows, so return.
3413 return;
3414 }
3415
3416 // Now we will import the needed remote rows of M, if the global maximum
3417 // value of numRemote is greater than 0.
3418 global_size_t globalMaxNumRemote = 0;
3419 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3420
3421 RCP<const import_type> importer;
3422
3423 if (globalMaxNumRemote > 0) {
3424 // Create an importer with target-map remoteRowMap and source-map rowMap.
3425 if (mode == 1)
3426 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3427 else if (mode == 2)
3428 importer = rcp(new import_type(rowMap, remoteRowMap));
3429 else
3430 throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
3431 }
3432
3433 if (importer != null) {
3434 // Get import matrix
3435 Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3436
3437 // Save the column map of the imported matrix, so that we can convert indices
3438 // back to global for arithmetic later
3439 Mview.importColMap = Mview.importMatrix->getColMap();
3440 }
3441}
3442
3443/*********************************************************************************************************/
3444 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3445template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3447merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3448 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3449 const LocalOrdinalViewType & Acol2Brow,
3450 const LocalOrdinalViewType & Acol2Irow,
3451 const LocalOrdinalViewType & Bcol2Ccol,
3452 const LocalOrdinalViewType & Icol2Ccol,
3453 const size_t mergedNodeNumCols) {
3454
3455 using Teuchos::RCP;
3457 typedef typename KCRS::StaticCrsGraphType graph_t;
3458 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3459 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3460 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3461 // Grab the Kokkos::SparseCrsMatrices
3462 const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3463 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3464
3465 // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3466 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3467 // We do have a Bimport
3468 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3469 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3470 RCP<const KCRS> Ik_;
3471 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3472 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3473 KCRS Iks;
3474 if(Ik!=0) Iks = *Ik;
3475 size_t merge_numrows = Ak.numCols();
3476 // The last entry of this at least, need to be initialized
3477 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3478
3479 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3480
3481 // Use a Kokkos::parallel_scan to build the rowptr
3482 typedef typename Node::execution_space execution_space;
3483 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3484 Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3485 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3486 if(final) Mrowptr(i) = update;
3487 // Get the row count
3488 size_t ct=0;
3489 if(Acol2Brow(i)!=LO_INVALID)
3490 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3491 else
3492 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3493 update+=ct;
3494
3495 if(final && i+1==merge_numrows)
3496 Mrowptr(i+1)=update;
3497 });
3498
3499 // Allocate nnz
3500 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3501 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3502 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3503
3504 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3505 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3506 Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3507 if(Acol2Brow(i)!=LO_INVALID) {
3508 size_t row = Acol2Brow(i);
3509 size_t start = Bk.graph.row_map(row);
3510 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3511 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3512 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3513 }
3514 }
3515 else {
3516 size_t row = Acol2Irow(i);
3517 size_t start = Iks.graph.row_map(row);
3518 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3519 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3520 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3521 }
3522 }
3523 });
3524
3525 KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3526 return newmat;
3527 }
3528 else {
3529 // We don't have a Bimport (the easy case)
3530 return Bk;
3531 }
3532}//end merge_matrices
3533
3534/*********************************************************************************************************/
3535 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3536template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3537const typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3538merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3539 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3540 const LocalOrdinalViewType & Acol2Brow,
3541 const LocalOrdinalViewType & Acol2Irow,
3542 const LocalOrdinalViewType & Bcol2Ccol,
3543 const LocalOrdinalViewType & Icol2Ccol,
3544 const size_t mergedNodeNumCols)
3545{
3546 using Teuchos::RCP;
3547 typedef typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KBCRS;
3548 typedef typename KBCRS::StaticCrsGraphType graph_t;
3549 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3550 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3551 typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3552
3553 // Grab the KokkosSparse::BsrMatrix
3554 const KBCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3555 const KBCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3556
3557 // We need to do this dance if either (a) We have Bimport or (b) A's colMap is not the same as B's rowMap
3558 if(!Bview.importMatrix.is_null() ||
3559 (Bview.importMatrix.is_null() &&
3560 (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3561
3562 // We do have a Bimport
3563 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3564 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3565 RCP<const KBCRS> Ik_;
3566 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3567 const KBCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3568 KBCRS Iks;
3569 if(Ik!=0) Iks = *Ik;
3570 size_t merge_numrows = Ak.numCols();
3571
3572 // The last entry of this at least, need to be initialized
3573 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3574
3575 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3576
3577 // Use a Kokkos::parallel_scan to build the rowptr
3578 typedef typename Node::execution_space execution_space;
3579 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3580 Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3581 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3582 if(final) Mrowptr(i) = update;
3583 // Get the row count
3584 size_t ct=0;
3585 if(Acol2Brow(i)!=LO_INVALID)
3586 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3587 else
3588 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3589 update+=ct;
3590
3591 if(final && i+1==merge_numrows)
3592 Mrowptr(i+1)=update;
3593 });
3594
3595 // Allocate nnz
3596 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3597 const int blocksize = Ak.blockDim();
3598 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3599 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz*blocksize*blocksize);
3600
3601 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3602 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3603 Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3604 if(Acol2Brow(i)!=LO_INVALID) {
3605 size_t row = Acol2Brow(i);
3606 size_t start = Bk.graph.row_map(row);
3607 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3608 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3609
3610 for (int b=0; b<blocksize*blocksize; ++b) {
3611 const int val_indx = j*blocksize*blocksize + b;
3612 const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3613 Mvalues(val_indx) = Bk.values(b_val_indx);
3614 }
3615 }
3616 }
3617 else {
3618 size_t row = Acol2Irow(i);
3619 size_t start = Iks.graph.row_map(row);
3620 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3621 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3622
3623 for (int b=0; b<blocksize*blocksize; ++b) {
3624 const int val_indx = j*blocksize*blocksize + b;
3625 const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3626 Mvalues(val_indx) = Iks.values(b_val_indx);
3627 }
3628 }
3629 }
3630 });
3631
3632 // Build and return merged KokkosSparse matrix
3633 KBCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind, blocksize);
3634 return newmat;
3635 }
3636 else {
3637 // We don't have a Bimport (the easy case)
3638 return Bk;
3639 }
3640}//end merge_matrices
3641
3642/*********************************************************************************************************/
3643template<typename SC, typename LO, typename GO, typename NO>
3644void AddKernels<SC, LO, GO, NO>::
3645addSorted(
3646 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3647 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3648 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3649 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3650 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3651 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3652 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3653 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3654 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3655 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3656 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3657{
3658 using Teuchos::TimeMonitor;
3659 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3660 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3661 auto nrows = Arowptrs.extent(0) - 1;
3662 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3663 typename AddKern::KKH handle;
3664 handle.create_spadd_handle(true);
3665 auto addHandle = handle.get_spadd_handle();
3666#ifdef HAVE_TPETRA_MMM_TIMINGS
3667 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3668#endif
3669 KokkosSparse::Experimental::spadd_symbolic
3670 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3671 //KokkosKernels requires values to be zeroed
3672 Cvals = values_array("C values", addHandle->get_c_nnz());
3673 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3674#ifdef HAVE_TPETRA_MMM_TIMINGS
3675 MM = Teuchos::null;
3676 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3677#endif
3678 KokkosSparse::Experimental::spadd_numeric(&handle,
3679 Arowptrs, Acolinds, Avals, scalarA,
3680 Browptrs, Bcolinds, Bvals, scalarB,
3681 Crowptrs, Ccolinds, Cvals);
3682}
3683
3684template<typename SC, typename LO, typename GO, typename NO>
3685void AddKernels<SC, LO, GO, NO>::
3686addUnsorted(
3687 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3688 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3689 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3690 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3691 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3692 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3693 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3694 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3695 GO /* numGlobalCols */,
3696 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3697 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3698 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3699{
3700 using Teuchos::TimeMonitor;
3701 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3702 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3703 auto nrows = Arowptrs.extent(0) - 1;
3704 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3705 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3706 typename AddKern::KKH handle;
3707 handle.create_spadd_handle(false);
3708 auto addHandle = handle.get_spadd_handle();
3709#ifdef HAVE_TPETRA_MMM_TIMINGS
3710 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3711#endif
3712 KokkosSparse::Experimental::spadd_symbolic
3713 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3714 //Cvals must be zeroed out
3715 Cvals = values_array("C values", addHandle->get_c_nnz());
3716 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3717#ifdef HAVE_TPETRA_MMM_TIMINGS
3718 MM = Teuchos::null;
3719 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3720#endif
3721 KokkosSparse::Experimental::spadd_numeric(&handle,
3722 Arowptrs, Acolinds, Avals, scalarA,
3723 Browptrs, Bcolinds, Bvals, scalarB,
3724 Crowptrs, Ccolinds, Cvals);
3725}
3726
3727template<typename GO,
3728 typename LocalIndicesType,
3729 typename GlobalIndicesType,
3730 typename ColMapType>
3731struct ConvertLocalToGlobalFunctor
3732{
3733 ConvertLocalToGlobalFunctor(
3734 const LocalIndicesType& colindsOrig_,
3735 const GlobalIndicesType& colindsConverted_,
3736 const ColMapType& colmap_) :
3737 colindsOrig (colindsOrig_),
3738 colindsConverted (colindsConverted_),
3739 colmap (colmap_)
3740 {}
3741 KOKKOS_INLINE_FUNCTION void
3742 operator() (const GO i) const
3743 {
3744 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3745 }
3746 LocalIndicesType colindsOrig;
3747 GlobalIndicesType colindsConverted;
3748 ColMapType colmap;
3749};
3750
3751template<typename SC, typename LO, typename GO, typename NO>
3752void MMdetails::AddKernels<SC, LO, GO, NO>::
3753convertToGlobalAndAdd(
3754 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3755 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3756 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3757 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3758 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3759 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3760 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3761 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3762 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3763{
3764 using Teuchos::TimeMonitor;
3765 //Need to use a different KokkosKernelsHandle type than other versions,
3766 //since the ordinals are now GO
3767 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3768 typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3769
3770 const values_array Avals = A.values;
3771 const values_array Bvals = B.values;
3772 const col_inds_array Acolinds = A.graph.entries;
3773 const col_inds_array Bcolinds = B.graph.entries;
3774 auto Arowptrs = A.graph.row_map;
3775 auto Browptrs = B.graph.row_map;
3776 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3777 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3778#ifdef HAVE_TPETRA_MMM_TIMINGS
3779 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
3780#endif
3781 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3782 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3783 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3784 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3785 KKH_GO handle;
3786 handle.create_spadd_handle(false);
3787 auto addHandle = handle.get_spadd_handle();
3788#ifdef HAVE_TPETRA_MMM_TIMINGS
3789 MM = Teuchos::null;
3790 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3791#endif
3792 auto nrows = Arowptrs.extent(0) - 1;
3793 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3794 KokkosSparse::Experimental::spadd_symbolic
3795 (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3796 Cvals = values_array("C values", addHandle->get_c_nnz());
3797 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3798#ifdef HAVE_TPETRA_MMM_TIMINGS
3799 MM = Teuchos::null;
3800 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3801#endif
3802 KokkosSparse::Experimental::spadd_numeric(&handle,
3803 Arowptrs, AcolindsConverted, Avals, scalarA,
3804 Browptrs, BcolindsConverted, Bvals, scalarB,
3805 Crowptrs, Ccolinds, Cvals);
3806}
3807
3808
3809} //End namepsace MMdetails
3810
3811} //End namespace Tpetra
3812
3813/*********************************************************************************************************/
3814//
3815// Explicit instantiation macro
3816//
3817// Must be expanded from within the Tpetra namespace!
3818//
3819namespace Tpetra {
3820
3821#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3822template \
3823 void MatrixMatrix::Multiply( \
3824 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3825 bool transposeA, \
3826 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3827 bool transposeB, \
3828 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3829 bool call_FillComplete_on_result, \
3830 const std::string & label, \
3831 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3832\
3833template \
3834 void MatrixMatrix::Multiply( \
3835 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& A, \
3836 bool transposeA, \
3837 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& B, \
3838 bool transposeB, \
3839 Teuchos::RCP<BlockCrsMatrix< SCALAR , LO , GO , NODE > >& C, \
3840 const std::string & label); \
3841\
3842template \
3843 void MatrixMatrix::Jacobi( \
3844 SCALAR omega, \
3845 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3846 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3847 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3848 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3849 bool call_FillComplete_on_result, \
3850 const std::string & label, \
3851 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3852\
3853 template \
3854 void MatrixMatrix::Add( \
3855 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3856 bool transposeA, \
3857 SCALAR scalarA, \
3858 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3859 bool transposeB, \
3860 SCALAR scalarB, \
3861 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3862\
3863 template \
3864 void MatrixMatrix::Add( \
3865 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3866 bool transposeA, \
3867 SCALAR scalarA, \
3868 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3869 SCALAR scalarB ); \
3870\
3871 template \
3872 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3873 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3874 (const SCALAR & alpha, \
3875 const bool transposeA, \
3876 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3877 const SCALAR & beta, \
3878 const bool transposeB, \
3879 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3880 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3881 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3882 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3883\
3884 template \
3885 void \
3886 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3887 (const SCALAR & alpha, \
3888 const bool transposeA, \
3889 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3890 const SCALAR& beta, \
3891 const bool transposeB, \
3892 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3893 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3894 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3895 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3896 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3897\
3898 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3899\
3900 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
3901 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3902 CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3903 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3904 bool userAssertsThereAreNoRemotes, \
3905 const std::string& label, \
3906 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3907\
3908 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
3909 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3910 BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3911 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3912 bool userAssertsThereAreNoRemotes);
3913} //End namespace Tpetra
3914
3915#endif // TPETRA_MATRIXMATRIX_DEF_HPP
Matrix Market file readers and writers for Tpetra objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const map_type > colMap
Col map for the original version of the matrix.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.
size_t global_size_t
Global size_t object.