Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_CrsMatrixMultiplyOp.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41
42#ifndef TPETRA_CRSMATRIXMULTIPLYOP_HPP
43#define TPETRA_CRSMATRIXMULTIPLYOP_HPP
44
49
51#include "Tpetra_CrsMatrix.hpp"
52#include "Tpetra_Util.hpp"
55#include "Tpetra_LocalCrsMatrixOperator.hpp"
56
57namespace Tpetra {
58
95 template <class Scalar,
96 class MatScalar,
97 class LocalOrdinal,
98 class GlobalOrdinal,
99 class Node>
101 public Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node>
102 {
103 public:
109
110 private:
111 using local_matrix_device_type =
112 typename crs_matrix_type::local_matrix_device_type;
113
114 public:
116
117
122 CrsMatrixMultiplyOp (const Teuchos::RCP<const crs_matrix_type>& A) :
123 matrix_ (A),
124 localMultiply_ (std::make_shared<local_matrix_device_type> (
125 A->getLocalMatrixDevice ()))
126 {}
127
129 ~CrsMatrixMultiplyOp () override = default;
130
132
134
137 void
140 Teuchos::ETransp mode = Teuchos::NO_TRANS,
141 Scalar alpha = Teuchos::ScalarTraits<Scalar>::one (),
142 Scalar beta = Teuchos::ScalarTraits<Scalar>::zero ()) const override
143 {
145 (! matrix_->isFillComplete (), std::runtime_error,
146 Teuchos::typeName (*this) << "::apply(): underlying matrix is not fill-complete.");
148 (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
149 Teuchos::typeName (*this) << "::apply(X,Y): X and Y must have the same number of vectors.");
151 (Teuchos::ScalarTraits<Scalar>::isComplex && mode == Teuchos::TRANS, std::logic_error,
152 Teuchos::typeName (*this) << "::apply() does not currently support transposed multiplications for complex scalar types.");
153 if (mode == Teuchos::NO_TRANS) {
155 }
156 else {
158 }
159 }
160
166 bool hasTransposeApply() const override {
167 return true;
168 }
169
171 Teuchos::RCP<const map_type> getDomainMap () const override {
172 return matrix_->getDomainMap ();
173 }
174
176 Teuchos::RCP<const map_type> getRangeMap () const override {
177 return matrix_->getRangeMap ();
178 }
179
181
182 protected:
184
186 const Teuchos::RCP<const crs_matrix_type> matrix_;
187
191
204 mutable Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > importMV_;
205
218 mutable Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > exportMV_;
219
222 void
225 Teuchos::ETransp mode,
227 Scalar beta) const
228 {
229 using Teuchos::null;
230 using Teuchos::RCP;
231 using Teuchos::rcp;
234 using STS = Teuchos::ScalarTraits<Scalar>;
235 typedef typename MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>:: dual_view_type::t_dev nonconst_view_type;
236
237 const size_t numVectors = X_in.getNumVectors();
238 // because of Views, it is difficult to determine if X and Y point to the same data.
239 // however, if they reference the exact same object, we will do the user the favor of copying X into new storage (with a warning)
240 // we ony need to do this if we have trivial importers; otherwise, we don't actually apply the operator from X into Y
241 RCP<const import_type> importer = matrix_->getGraph()->getImporter();
242 RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
243
244 // some parameters for below
245 const bool Y_is_replicated = ! Y_in.isDistributed ();
246 const bool Y_is_overwritten = (beta == STS::zero ());
247 const int myRank = matrix_->getComm ()->getRank ();
248 if (Y_is_replicated && myRank != 0) {
249 beta = STS::zero ();
250 }
251
252 // access X indirectly, in case we need to create temporary storage
254 // currently, cannot multiply from multivector of non-constant stride
255 if (! X_in.isConstantStride () && importer == null) {
256 // generate a strided copy of X_in
257 X = Teuchos::rcp (new MV (X_in, Teuchos::Copy));
258 }
259 else {
260 // just temporary, so this non-owning RCP is okay
261 X = Teuchos::rcpFromRef (X_in);
262 }
263
264 // set up import/export temporary multivectors
265 if (importer != null) {
266 if (importMV_ != null && importMV_->getNumVectors () != numVectors) {
267 importMV_ = null;
268 }
269 if (importMV_ == null) {
270 importMV_ = rcp (new MV (matrix_->getColMap (), numVectors));
271 }
272 }
273 if (exporter != null) {
274 if (exportMV_ != null && exportMV_->getNumVectors () != numVectors) {
275 exportMV_ = null;
276 }
277 if (exportMV_ == null) {
278 exportMV_ = rcp (new MV (matrix_->getRowMap (), numVectors));
279 }
280 }
281
282 // If we have a non-trivial exporter, we must import elements that are permuted or are on other processors
283 if (exporter != null) {
284 exportMV_->doImport(X_in,*exporter,INSERT);
285 X = exportMV_; // multiply out of exportMV_
286 }
287
288 auto X_lcl = X->getLocalViewDevice(Access::ReadOnly);
289
290 // If we have a non-trivial importer, we must export elements that are permuted or belong to other processors
291 // We will compute solution into the to-be-exported MV; get a view
292 if (importer != null) {
293 // Beta is zero here, so we clobber Y_lcl
294 auto Y_lcl = importMV_->getLocalViewDevice(Access::OverwriteAll);
295
296 localMultiply_.apply (X_lcl, Y_lcl, mode, alpha, STS::zero ());
297 if (Y_is_overwritten) {
298 Y_in.putScalar (STS::zero ());
299 }
300 else {
301 Y_in.scale (beta);
302 }
303 Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
304 }
305 // otherwise, multiply into Y
306 else {
307 // can't multiply in-situ; can't multiply into non-strided multivector
308 if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
309 // generate a strided copy of Y
310 MV Y (Y_in, Teuchos::Copy);
312 if(Y_is_overwritten) Y_lcl = Y.getLocalViewDevice(Access::OverwriteAll);
313 else Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
314
317 }
318 else {
320 if(Y_is_overwritten) Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
321 else Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);
322
324 }
325 }
326 // Handle case of rangemap being a local replicated map: in this case, sum contributions from each processor
327 if (Y_is_replicated) {
328 Y_in.reduce();
329 }
330 }
331
333 void
337 Scalar beta) const
338 {
340 using Teuchos::NO_TRANS;
341 using Teuchos::RCP;
342 using Teuchos::rcp;
343 using Teuchos::rcp_const_cast;
344 using Teuchos::rcpFromRef;
345 typedef Export<LocalOrdinal,GlobalOrdinal,Node> export_type;
346 typedef Import<LocalOrdinal,GlobalOrdinal,Node> import_type;
347 typedef Teuchos::ScalarTraits<Scalar> STS;
348 typedef typename MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>:: dual_view_type::t_dev nonconst_view_type;
349
350 if (alpha == STS::zero ()) {
351 if (beta == STS::zero ()) {
352 Y_in.putScalar (STS::zero ());
353 } else if (beta != STS::one ()) {
354 Y_in.scale (beta);
355 }
356 return;
357 }
358
359 // It's possible that X is a view of Y or vice versa. We don't
360 // allow this (apply() requires that X and Y not alias one
361 // another), but it's helpful to detect and work around this
362 // case. We don't try to to detect the more subtle cases (e.g.,
363 // one is a subview of the other, but their initial pointers
364 // differ). We only need to do this if this matrix's Import is
365 // trivial; otherwise, we don't actually apply the operator from
366 // X into Y.
367
368 RCP<const import_type> importer = matrix_->getGraph()->getImporter();
369 RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
370
371 // If beta == 0, then the output MV will be overwritten; none of
372 // its entries should be read. (Sparse BLAS semantics say that we
373 // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
374 // This matters if we need to do an Export operation; see below.
375 const bool Y_is_overwritten = (beta == STS::zero());
376
377 // We treat the case of a replicated MV output specially.
378 const bool Y_is_replicated =
379 (! Y_in.isDistributed () && matrix_->getComm ()->getSize () != 1);
380
381 // This is part of the special case for replicated MV output.
382 // We'll let each process do its thing, but do an all-reduce at
383 // the end to sum up the results. Setting beta=0 on all
384 // processes but Proc 0 makes the math work out for the
385 // all-reduce. (This assumes that the replicated data is
386 // correctly replicated, so that the data are the same on all
387 // processes.)
388 if (Y_is_replicated && matrix_->getComm ()->getRank () > 0) {
389 beta = STS::zero();
390 }
391
392 // Temporary MV for Import operation. After the block of code
393 // below, this will be an (Imported if necessary) column Map MV
394 // ready to give to localMultiply_.apply(...).
396 if (importer.is_null ()) {
397 if (! X_in.isConstantStride ()) {
398 // Not all sparse mat-vec kernels can handle an input MV with
399 // nonconstant stride correctly, so we have to copy it in that
400 // case into a constant stride MV. To make a constant stride
401 // copy of X_in, we force creation of the column (== domain)
402 // Map MV (if it hasn't already been created, else fetch the
403 // cached copy). This avoids creating a new MV each time.
404 RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
407 }
408 else {
409 // The domain and column Maps are the same, so do the local
410 // multiply using the domain Map input MV X_in.
412 }
413 }
414 else { // need to Import source (multi)vector
415 ProfilingRegion regionImport ("Tpetra::CrsMatrixMultiplyOp::apply: Import");
416
417 // We're doing an Import anyway, which will copy the relevant
418 // elements of the domain Map MV X_in into a separate column Map
419 // MV. Thus, we don't have to worry whether X_in is constant
420 // stride.
421 RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
422
423 // Import from the domain Map MV to the column Map MV.
424 X_colMapNonConst->doImport (X_in, *importer, INSERT);
426 }
427
428 // Temporary MV for doExport (if needed), or for copying a
429 // nonconstant stride output MV into a constant stride MV. This
430 // is null if we don't need the temporary MV, that is, if the
431 // Export is trivial (null).
432 RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
433
434 auto X_lcl = X_colMap->getLocalViewDevice(Access::ReadOnly);
435
436 // If we have a nontrivial Export object, we must perform an
437 // Export. In that case, the local multiply result will go into
438 // the row Map multivector. We don't have to make a
439 // constant-stride version of Y_in in this case, because we had to
440 // make a constant stride Y_rowMap MV and do an Export anyway.
441 if (! exporter.is_null ()) {
442 auto Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);
443
445 alpha, STS::zero ());
446 {
447 ProfilingRegion regionExport ("Tpetra::CrsMatrixMultiplyOp::apply: Export");
448
449 // If we're overwriting the output MV Y_in completely (beta
450 // == 0), then make sure that it is filled with zeros before
451 // we do the Export. Otherwise, the ADD combine mode will
452 // use data in Y_in, which is supposed to be zero.
453 if (Y_is_overwritten) {
454 Y_in.putScalar (STS::zero());
455 }
456 else {
457 // Scale the output MV by beta, so that the Export sums in
458 // the mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
459 Y_in.scale (beta);
460 }
461 // Do the Export operation.
462 Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
463 }
464 }
465 else { // Don't do an Export: row Map and range Map are the same.
466 //
467 // If Y_in does not have constant stride, or if the column Map
468 // MV aliases Y_in, then we can't let the kernel write directly
469 // to Y_in. Instead, we have to use the cached row (== range)
470 // Map MV as temporary storage.
471 //
472 // FIXME (mfh 05 Jun 2014, mfh 07 Dec 2018) This test for
473 // aliasing only tests if the user passed in the same
474 // MultiVector for both X and Y. It won't detect whether one
475 // MultiVector views the other. We should also check the
476 // MultiVectors' raw data pointers.
477 if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
478 // Force creating the MV if it hasn't been created already.
479 // This will reuse a previously created cached MV.
480 Y_rowMap = getRowMapMultiVector (Y_in, true);
481
482 // If beta == 0, we don't need to copy Y_in into Y_rowMap,
483 // since we're overwriting it anyway.
484 if (beta != STS::zero ()) {
486 }
488 if(Y_is_overwritten) Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);
489 else Y_lcl = Y_rowMap->getLocalViewDevice(Access::ReadWrite);
490
493 }
494 else {
496 if(Y_is_overwritten) Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
497 else Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);
498
500 }
501 }
502
503 // If the range Map is a locally replicated Map, sum up
504 // contributions from each process. We set beta = 0 on all
505 // processes but Proc 0 initially, so this will handle the scaling
506 // factor beta correctly.
507 if (Y_is_replicated) {
508 ProfilingRegion regionReduce ("Tpetra::CrsMatrixMultiplyOp::apply: Reduce Y");
509 Y_in.reduce ();
510 }
511 }
512
513 private:
533 Teuchos::RCP<MV>
535 const bool force = false) const
536 {
537 using Teuchos::null;
538 using Teuchos::RCP;
539 using Teuchos::rcp;
540 typedef Import<LocalOrdinal,GlobalOrdinal,Node> import_type;
541
542 const size_t numVecs = X_domainMap.getNumVectors ();
543 RCP<const import_type> importer = matrix_->getGraph ()->getImporter ();
544 RCP<const map_type> colMap = matrix_->getColMap ();
545
546 RCP<MV> X_colMap; // null by default
547
548 // If the Import object is trivial (null), then we don't need a
549 // separate column Map multivector. Just return null in that
550 // case. The caller is responsible for knowing not to use the
551 // returned null pointer.
552 //
553 // If the Import is nontrivial, then we do need a separate
554 // column Map multivector for the Import operation. Check in
555 // that case if we have to (re)create the column Map
556 // multivector.
557 if (! importer.is_null () || force) {
558 if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
559 X_colMap = rcp (new MV (colMap, numVecs));
560
561 // Cache the newly created multivector for later reuse.
563 }
564 else { // Yay, we can reuse the cached multivector!
565 X_colMap = importMV_;
566 // mfh 09 Jan 2013: We don't have to fill with zeros first,
567 // because the Import uses INSERT combine mode, which overwrites
568 // existing entries.
569 //
570 //X_colMap->putScalar (STS::zero ());
571 }
572 }
573 return X_colMap;
574 }
575
597 Teuchos::RCP<MV>
598 getRowMapMultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Y_rangeMap,
599 const bool force = false) const
600 {
601 using Teuchos::null;
602 using Teuchos::RCP;
603 using Teuchos::rcp;
604 typedef Export<LocalOrdinal,GlobalOrdinal,Node> export_type;
605
606 const size_t numVecs = Y_rangeMap.getNumVectors ();
607 RCP<const export_type> exporter = matrix_->getGraph ()->getExporter ();
608 RCP<const map_type> rowMap = matrix_->getRowMap ();
609
610 RCP<MV> Y_rowMap; // null by default
611
612 // If the Export object is trivial (null), then we don't need a
613 // separate row Map multivector. Just return null in that case.
614 // The caller is responsible for knowing not to use the returned
615 // null pointer.
616 //
617 // If the Export is nontrivial, then we do need a separate row
618 // Map multivector for the Export operation. Check in that case
619 // if we have to (re)create the row Map multivector.
620 if (! exporter.is_null () || force) {
621 if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
622 Y_rowMap = rcp (new MV (rowMap, numVecs));
623 exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse
624 }
625 else { // Yay, we can reuse the cached multivector!
626 Y_rowMap = exportMV_;
627 }
628 }
629 return Y_rowMap;
630 }
631 };
632
640 template <class OpScalar,
641 class MatScalar,
642 class LocalOrdinal,
643 class GlobalOrdinal,
644 class Node>
645 Teuchos::RCP<
646 CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node> >
647 createCrsMatrixMultiplyOp (const Teuchos::RCP<
649 {
651 GlobalOrdinal, Node> op_type;
652 return Teuchos::rcp (new op_type (A));
653 }
654
655} // end of namespace Tpetra
656
657#endif // TPETRA_CRSMATRIXMULTIPLYOP_HPP
Forward declaration of Tpetra::CrsMatrixMultiplyOp.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Stand-alone utility functions and macros.
A class for wrapping a CrsMatrix multiply in a Operator.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > importMV_
Column Map MultiVector used in apply().
bool hasTransposeApply() const override
Whether this Operator's apply() method can apply the transpose or conjugate transpose.
void applyNonTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Scalar alpha, Scalar beta) const
Apply the matrix (not its transpose) to X_in, producing Y_in.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute Y = beta*Y + alpha*Op(A)*X, where Op(A) is either A, , or .
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this Operator.
void applyTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Apply the transpose or conjugate transpose of the matrix to X_in, producing Y_in.
CrsMatrixMultiplyOp(const Teuchos::RCP< const crs_matrix_type > &A)
Constructor.
~CrsMatrixMultiplyOp() override=default
Destructor (virtual for memory safety of derived classes).
Teuchos::RCP< CrsMatrixMultiplyOp< OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node > > createCrsMatrixMultiplyOp(const Teuchos::RCP< const CrsMatrix< MatScalar, LocalOrdinal, GlobalOrdinal, Node > > &A)
Non-member function to create a CrsMatrixMultiplyOp.
const Teuchos::RCP< const crs_matrix_type > matrix_
The underlying CrsMatrix object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > exportMV_
Row Map MultiVector used in apply().
LocalCrsMatrixOperator< Scalar, MatScalar, typename crs_matrix_type::device_type > localMultiply_
Implementation of local sparse matrix-vector multiply.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this Operator.
Struct that holds views of the contents of a CrsMatrix.
typename Node::device_type device_type
The Kokkos device type.
Abstract interface for local operators (e.g., matrices and preconditioners).
Abstract interface for operators (e.g., matrices and preconditioners).
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.