MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
47#define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
48
49#include <Kokkos_Core.hpp>
50#include <KokkosSparse_CrsMatrix.hpp>
51
52#include "Xpetra_Matrix.hpp"
53
55
56#include "MueLu_AmalgamationInfo_kokkos.hpp"
57#include "MueLu_Exceptions.hpp"
58#include "MueLu_Level.hpp"
59#include "MueLu_LWGraph_kokkos.hpp"
60#include "MueLu_MasterList.hpp"
61#include "MueLu_Monitor.hpp"
62#include "MueLu_Utilities_kokkos.hpp"
63
64namespace MueLu {
65
66
67 namespace CoalesceDrop_Kokkos_Details { // anonymous
68
69 template<class LO, class RowType>
71 public:
72 ScanFunctor(RowType rows_) : rows(rows_) { }
73
74 KOKKOS_INLINE_FUNCTION
75 void operator()(const LO i, LO& upd, const bool& final) const {
76 upd += rows(i);
77 if (final)
78 rows(i) = upd;
79 }
80
81 private:
82 RowType rows;
83 };
84
85 template<class LO, class GhostedViewType>
87 private:
88 typedef typename GhostedViewType::value_type SC;
89 typedef Kokkos::ArithTraits<SC> ATS;
90 typedef typename ATS::magnitudeType magnitudeType;
91
92 GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
94
95 public:
96 ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold) :
97 diag(ghostedDiag),
98 eps(threshold)
99 { }
100
101 // Return true if we drop, false if not
102 KOKKOS_FORCEINLINE_FUNCTION
103 bool operator()(LO row, LO col, SC val) const {
104 // We avoid square root by using squared values
105 auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
106 auto aij2 = ATS::magnitude(val) * ATS::magnitude(val); // |a_ij|^2
107
108 return (aij2 <= eps*eps * aiiajj);
109 }
110 };
111
112 template<class LO, class CoordsType>
114 private:
115 typedef typename CoordsType::value_type SC;
116 typedef Kokkos::ArithTraits<SC> ATS;
117 typedef typename ATS::magnitudeType magnitudeType;
118
119 public:
120 typedef SC value_type;
121
122 public:
123 DistanceFunctor(CoordsType coords_) : coords(coords_) { }
124
125 KOKKOS_INLINE_FUNCTION
126 magnitudeType distance2(LO row, LO col) const {
127 SC d = ATS::zero(), s;
128 for (size_t j = 0; j < coords.extent(1); j++) {
129 s = coords(row,j) - coords(col,j);
130 d += s*s;
131 }
132 return ATS::magnitude(d);
133 }
134 private:
135 CoordsType coords;
136 };
137
138 template<class LO, class GhostedViewType, class DistanceFunctor>
140 private:
141 typedef typename GhostedViewType::value_type SC;
142 typedef Kokkos::ArithTraits<SC> ATS;
143 typedef typename ATS::magnitudeType magnitudeType;
144
145 public:
146 DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold) :
147 diag(ghostedLaplDiag),
148 distFunctor(distFunctor_),
149 eps(threshold)
150 { }
151
152 // Return true if we drop, false if not
153 KOKKOS_INLINE_FUNCTION
154 bool operator()(LO row, LO col, SC /* val */) const {
155 // We avoid square root by using squared values
156
157 // We ignore incoming value of val as we operate on an auxiliary
158 // distance Laplacian matrix
159 typedef typename DistanceFunctor::value_type dSC;
160 typedef Kokkos::ArithTraits<dSC> dATS;
161 auto fval = dATS::one() / distFunctor.distance2(row, col);
162
163 auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
164 auto aij2 = ATS::magnitude(fval) * ATS::magnitude(fval); // |a_ij|^2
165
166 return (aij2 <= eps*eps * aiiajj);
167 }
168
169 private:
170 GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
173 };
174
175 template<class SC, class LO, class MatrixType, class BndViewType, class DropFunctorType>
177 private:
178 typedef typename MatrixType::StaticCrsGraphType graph_type;
179 typedef typename graph_type::row_map_type rows_type;
180 typedef typename graph_type::entries_type cols_type;
181 typedef typename MatrixType::values_type vals_type;
182 typedef Kokkos::ArithTraits<SC> ATS;
183 typedef typename ATS::val_type impl_Scalar;
184 typedef Kokkos::ArithTraits<impl_Scalar> impl_ATS;
185 typedef typename ATS::magnitudeType magnitudeType;
186
187 public:
188 ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_,
189 typename rows_type::non_const_type rows_,
190 typename cols_type::non_const_type colsAux_,
191 typename vals_type::non_const_type valsAux_,
192 bool reuseGraph_, bool lumping_, SC /* threshold_ */,
193 bool aggregationMayCreateDirichlet_ ) :
194 A(A_),
195 bndNodes(bndNodes_),
196 dropFunctor(dropFunctor_),
197 rows(rows_),
198 colsAux(colsAux_),
199 valsAux(valsAux_),
200 reuseGraph(reuseGraph_),
201 lumping(lumping_),
202 aggregationMayCreateDirichlet(aggregationMayCreateDirichlet_)
203 {
204 rowsA = A.graph.row_map;
205 zero = impl_ATS::zero();
206 }
207
208 KOKKOS_INLINE_FUNCTION
209 void operator()(const LO row, LO& nnz) const {
210 auto rowView = A.rowConst(row);
211 auto length = rowView.length;
212 auto offset = rowsA(row);
213
214 impl_Scalar diag = zero;
215 LO rownnz = 0;
216 LO diagID = -1;
217 for (decltype(length) colID = 0; colID < length; colID++) {
218 LO col = rowView.colidx(colID);
219 impl_Scalar val = rowView.value (colID);
220
221 if (!dropFunctor(row, col, rowView.value(colID)) || row == col) {
222 colsAux(offset+rownnz) = col;
223
224 LO valID = (reuseGraph ? colID : rownnz);
225 valsAux(offset+valID) = val;
226 if (row == col)
227 diagID = valID;
228
229 rownnz++;
230
231 } else {
232 // Rewrite with zeros (needed for reuseGraph)
233 valsAux(offset+colID) = zero;
234 diag += val;
235 }
236 }
237 // How to assert on the device?
238 // assert(diagIndex != -1);
239 rows(row+1) = rownnz;
240 // if (lumping && diagID != -1) {
241 if (lumping) {
242 // Add diag to the diagonal
243
244 // NOTE_KOKKOS: valsAux was allocated with
245 // ViewAllocateWithoutInitializing. This is not a problem here
246 // because we explicitly set this value above.
247 valsAux(offset+diagID) += diag;
248 }
249
250 // If the only element remaining after filtering is diagonal, mark node as boundary
251 // FIXME: this should really be replaced by the following
252 // if (indices.size() == 1 && indices[0] == row)
253 // boundaryNodes[row] = true;
254 // We do not do it this way now because there is no framework for distinguishing isolated
255 // and boundary nodes in the aggregation algorithms
256 bndNodes(row) = (rownnz == 1 && aggregationMayCreateDirichlet);
257
258 nnz += rownnz;
259 }
260
261 private:
262 MatrixType A;
263 BndViewType bndNodes;
264 DropFunctorType dropFunctor;
265
267
268 typename rows_type::non_const_type rows;
269 typename cols_type::non_const_type colsAux;
270 typename vals_type::non_const_type valsAux;
271
276 };
277
278 // collect number nonzeros of blkSize rows in nnz_(row+1)
279 template<class MatrixType, class NnzType, class blkSizeType>
281 private:
282 typedef typename MatrixType::ordinal_type LO;
283
284 public:
285 Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_) :
286 kokkosMatrix(kokkosMatrix_),
287 nnz(nnz_),
288 blkSize(blkSize_) { }
289
290 KOKKOS_INLINE_FUNCTION
291 void operator()(const LO row, LO& totalnnz) const {
292
293 // the following code is more or less what MergeRows is doing
294 // count nonzero entries in all dof rows associated with node row
295 LO nodeRowMaxNonZeros = 0;
296 for (LO j = 0; j < blkSize; j++) {
297 auto rowView = kokkosMatrix.row(row * blkSize + j);
298 nodeRowMaxNonZeros += rowView.length;
299 }
300 nnz(row + 1) = nodeRowMaxNonZeros;
301 totalnnz += nodeRowMaxNonZeros;
302 }
303
304
305 private:
306 MatrixType kokkosMatrix; //< local matrix part
307 NnzType nnz; //< View containing number of nonzeros for current row
308 blkSizeType blkSize; //< block size (or partial block size in strided maps)
309 };
310
311
312 // build the dof-based column map containing the local dof ids belonging to blkSize rows in matrix
313 // sort column ids
314 // translate them into (unique) node ids
315 // count the node column ids per node row
316 template<class MatrixType, class NnzType, class blkSizeType, class ColDofType, class Dof2NodeTranslationType, class BdryNodeTypeConst, class BdryNodeType, class boolType>
318 private:
319 typedef typename MatrixType::ordinal_type LO;
320
321 private:
322 MatrixType kokkosMatrix; //< local matrix part
323 NnzType coldofnnz; //< view containing start and stop indices for subviews
324 blkSizeType blkSize; //< block size (or partial block size in strided maps)
325 ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
326 Dof2NodeTranslationType dof2node; //< view containing the local node id associated with the local dof id
327 NnzType colnodennz; //< view containing number of column nodes for each node row
328 BdryNodeTypeConst dirichletdof; //< view containing with num dofs booleans. True if dof (not necessarily entire node) is dirichlet boundardy dof.
329 BdryNodeType bdrynode; //< view containing with numNodes booleans. True if node is (full) dirichlet boundardy node.
330 boolType usegreedydirichlet; //< boolean for use of greedy Dirichlet (if any dof is Dirichlet, entire node is dirichlet) default false (need all dofs in node to be Dirichlet for node to be Dirichlet)
331
332 public:
333 Stage1bcVectorFunctor(MatrixType kokkosMatrix_,
334 NnzType coldofnnz_,
335 blkSizeType blkSize_,
336 ColDofType coldofs_,
337 Dof2NodeTranslationType dof2node_,
338 NnzType colnodennz_,
339 BdryNodeTypeConst dirichletdof_,
340 BdryNodeType bdrynode_,
341 boolType usegreedydirichlet_) :
342 kokkosMatrix(kokkosMatrix_),
343 coldofnnz(coldofnnz_),
344 blkSize(blkSize_),
345 coldofs(coldofs_),
346 dof2node(dof2node_),
347 colnodennz(colnodennz_),
348 dirichletdof(dirichletdof_),
349 bdrynode(bdrynode_),
350 usegreedydirichlet(usegreedydirichlet_) {
351 }
352
353 KOKKOS_INLINE_FUNCTION
354 void operator()(const LO rowNode, LO& nnz) const {
355
356 LO pos = coldofnnz(rowNode);
357 if( usegreedydirichlet ){
358 bdrynode(rowNode) = false;
359 for (LO j = 0; j < blkSize; j++) {
360 auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
361 auto numIndices = rowView.length;
362
363 // if any dof in the node is Dirichlet
364 if( dirichletdof(rowNode * blkSize + j) )
365 bdrynode(rowNode) = true;
366
367 for (decltype(numIndices) k = 0; k < numIndices; k++) {
368 auto dofID = rowView.colidx(k);
369 coldofs(pos) = dofID;
370 pos ++;
371 }
372 }
373 }else{
374 bdrynode(rowNode) = true;
375 for (LO j = 0; j < blkSize; j++) {
376 auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
377 auto numIndices = rowView.length;
378
379 // if any dof in the node is not Dirichlet
380 if( dirichletdof(rowNode * blkSize + j) == false )
381 bdrynode(rowNode) = false;
382
383 for (decltype(numIndices) k = 0; k < numIndices; k++) {
384 auto dofID = rowView.colidx(k);
385 coldofs(pos) = dofID;
386 pos ++;
387 }
388 }
389 }
390
391 // sort coldofs
392 LO begin = coldofnnz(rowNode);
393 LO end = coldofnnz(rowNode+1);
394 LO n = end - begin;
395 for (LO i = 0; i < (n-1); i++) {
396 for (LO j = 0; j < (n-i-1); j++) {
397 if (coldofs(j+begin) > coldofs(j+begin+1)) {
398 LO temp = coldofs(j+begin);
399 coldofs(j+begin) = coldofs(j+begin+1);
400 coldofs(j+begin+1) = temp;
401 }
402 }
403 }
404 size_t cnt = 0;
405 LO lastNodeID = -1;
406 for (LO i = 0; i < n; i++) {
407 LO dofID = coldofs(begin + i);
408 LO nodeID = dof2node(dofID);
409 if(nodeID != lastNodeID) {
410 lastNodeID = nodeID;
411 coldofs(begin+cnt) = nodeID;
412 cnt++;
413 }
414 }
415 colnodennz(rowNode+1) = cnt;
416 nnz += cnt;
417 }
418
419 };
420
421 // fill column node id view
422 template<class MatrixType, class ColDofNnzType, class ColDofType, class ColNodeNnzType, class ColNodeType>
424 private:
425 typedef typename MatrixType::ordinal_type LO;
426 typedef typename MatrixType::value_type SC;
427
428 private:
429 ColDofType coldofs; //< view containing mixed node and dof indices (only input)
430 ColDofNnzType coldofnnz; //< view containing the start and stop indices for subviews (dofs)
431 ColNodeType colnodes; //< view containing the local node ids associated with columns
432 ColNodeNnzType colnodennz; //< view containing start and stop indices for subviews
433
434 public:
435 Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_) :
436 coldofs(coldofs_),
437 coldofnnz(coldofnnz_),
438 colnodes(colnodes_),
439 colnodennz(colnodennz_) {
440 }
441
442 KOKKOS_INLINE_FUNCTION
443 void operator()(const LO rowNode) const {
444 auto dofbegin = coldofnnz(rowNode);
445 auto nodebegin = colnodennz(rowNode);
446 auto nodeend = colnodennz(rowNode+1);
447 auto n = nodeend - nodebegin;
448
449 for (decltype(nodebegin) i = 0; i < n; i++) {
450 colnodes(nodebegin + i) = coldofs(dofbegin + i);
451 }
452 }
453 };
454
455
456 } // namespace
457
458 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
460 RCP<ParameterList> validParamList = rcp(new ParameterList());
461
462#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
463 SET_VALID_ENTRY("aggregation: drop tol");
464 SET_VALID_ENTRY("aggregation: Dirichlet threshold");
465 SET_VALID_ENTRY("aggregation: drop scheme");
466 SET_VALID_ENTRY("aggregation: dropping may create Dirichlet");
467 SET_VALID_ENTRY("aggregation: greedy Dirichlet");
468 SET_VALID_ENTRY("filtered matrix: use lumping");
469 SET_VALID_ENTRY("filtered matrix: reuse graph");
470 SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
471 {
472 typedef Teuchos::StringToIntegralParameterEntryValidator<int> validatorType;
473 validParamList->getEntry("aggregation: drop scheme").setValidator(
474 rcp(new validatorType(Teuchos::tuple<std::string>("classical", "distance laplacian"), "aggregation: drop scheme")));
475 }
476#undef SET_VALID_ENTRY
477 validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
478 validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
479 validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory for Coordinates");
480
481 return validParamList;
482 }
483
484 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
486 Input(currentLevel, "A");
487 Input(currentLevel, "UnAmalgamationInfo");
488
489 const ParameterList& pL = GetParameterList();
490 if (pL.get<std::string>("aggregation: drop scheme") == "distance laplacian")
491 Input(currentLevel, "Coordinates");
492 }
493
494 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
496 Build(Level& currentLevel) const {
497 FactoryMonitor m(*this, "Build", currentLevel);
498
499 typedef Teuchos::ScalarTraits<SC> STS;
500 typedef typename STS::magnitudeType MT;
501 const MT zero = Teuchos::ScalarTraits<MT>::zero();
502
503 auto A = Get< RCP<Matrix> >(currentLevel, "A");
504
505
506 /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
507 blkSize is the number of storage blocks that must kept together during the amalgamation process.
508
509 Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
510
511 numPDEs = blkSize * storageblocksize.
512
513 If numPDEs==1
514 Matrix is point storage (classical CRS storage). storageblocksize=1 and blkSize=1
515 No other values makes sense.
516
517 If numPDEs>1
518 If matrix uses point storage, then storageblocksize=1 and blkSize=numPDEs.
519 If matrix uses block storage, with block size of n, then storageblocksize=n, and blkSize=numPDEs/n.
520 Thus far, only storageblocksize=numPDEs and blkSize=1 has been tested.
521 */
522
523 TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0,Exceptions::RuntimeError,"A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
524 LO blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
525
526 auto amalInfo = Get< RCP<AmalgamationInfo_kokkos> >(currentLevel, "UnAmalgamationInfo");
527
528 const ParameterList& pL = GetParameterList();
529
530 std::string algo = pL.get<std::string>("aggregation: drop scheme");
531
532 double threshold = pL.get<double>("aggregation: drop tol");
533 GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold
534 << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
535
536 const typename STS::magnitudeType dirichletThreshold =
537 STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
538
539 GO numDropped = 0, numTotal = 0;
540
541 RCP<LWGraph_kokkos> graph;
542 LO dofsPerNode = -1;
543
544 typedef typename LWGraph_kokkos::boundary_nodes_type boundary_nodes_type;
545 boundary_nodes_type boundaryNodes;
546
547 RCP<Matrix> filteredA;
548 if (blkSize == 1 && threshold == zero) {
549 // Scalar problem without dropping
550
551 // Detect and record rows that correspond to Dirichlet boundary conditions
552 boundaryNodes = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
553
554 // Trivial LWGraph construction
555 graph = rcp(new LWGraph_kokkos(A->getCrsGraph()->getLocalGraphDevice(), A->getRowMap(), A->getColMap(), "graph of A"));
556 graph->getLocalLWGraph().SetBoundaryNodeMap(boundaryNodes);
557
558 numTotal = A->getLocalNumEntries();
559 dofsPerNode = 1;
560
561 filteredA = A;
562
563 } else if (blkSize == 1 && threshold != zero) {
564 // Scalar problem with dropping
565
566 typedef typename Matrix::local_matrix_type local_matrix_type;
567 typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
568 typedef typename kokkos_graph_type::row_map_type::non_const_type rows_type;
569 typedef typename kokkos_graph_type::entries_type::non_const_type cols_type;
570 typedef typename local_matrix_type::values_type::non_const_type vals_type;
571
572 LO numRows = A->getLocalNumRows();
573 local_matrix_type kokkosMatrix = A->getLocalMatrixDevice();
574 auto nnzA = kokkosMatrix.nnz();
575 auto rowsA = kokkosMatrix.graph.row_map;
576
577
578 typedef Kokkos::ArithTraits<SC> ATS;
579 typedef typename ATS::val_type impl_Scalar;
580 typedef Kokkos::ArithTraits<impl_Scalar> impl_ATS;
581
582 bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
583 bool lumping = pL.get<bool>("filtered matrix: use lumping");
584 if (lumping)
585 GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;
586
587 const bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
588
589 // FIXME_KOKKOS: replace by ViewAllocateWithoutInitializing + setting a single value
590 rows_type rows ("FA_rows", numRows+1);
591 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("FA_aux_cols"), nnzA);
592 vals_type valsAux;
593 if (reuseGraph) {
594 SubFactoryMonitor m2(*this, "CopyMatrix", currentLevel);
595
596 // Share graph with the original matrix
597 filteredA = MatrixFactory::Build(A->getCrsGraph());
598
599 // Do a no-op fill-complete
600 RCP<ParameterList> fillCompleteParams(new ParameterList);
601 fillCompleteParams->set("No Nonlocal Changes", true);
602 filteredA->fillComplete(fillCompleteParams);
603
604 // No need to reuseFill, just modify in place
605 valsAux = filteredA->getLocalMatrixDevice().values;
606
607 } else {
608 // Need an extra array to compress
609 valsAux = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_aux_vals"), nnzA);
610 }
611
612 typename boundary_nodes_type::non_const_type bndNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);
613
614 LO nnzFA = 0;
615 {
616 if (algo == "classical") {
617 // Construct overlapped matrix diagonal
618 RCP<Vector> ghostedDiag;
619 {
620 kokkosMatrix = local_matrix_type();
621 SubFactoryMonitor m2(*this, "Ghosted diag construction", currentLevel);
623 kokkosMatrix=A->getLocalMatrixDevice();
624 }
625
626 // Filter out entries
627 {
628 SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
629
630 auto ghostedDiagView = ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
631
632 CoalesceDrop_Kokkos_Details::ClassicalDropFunctor<LO, decltype(ghostedDiagView)> dropFunctor(ghostedDiagView, threshold);
633 CoalesceDrop_Kokkos_Details::ScalarFunctor<typename ATS::val_type, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
634 scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold, aggregationMayCreateDirichlet);
635
636 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
637 scalarFunctor, nnzFA);
638 }
639
640 } else if (algo == "distance laplacian") {
641 typedef Xpetra::MultiVector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO> doubleMultiVector;
642 auto coords = Get<RCP<doubleMultiVector> >(currentLevel, "Coordinates");
643
644 auto uniqueMap = A->getRowMap();
645 auto nonUniqueMap = A->getColMap();
646
647 // Construct ghosted coordinates
648 RCP<const Import> importer;
649 {
650 SubFactoryMonitor m2(*this, "Coords Import construction", currentLevel);
651 importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
652 }
653 RCP<doubleMultiVector> ghostedCoords;
654 {
655 SubFactoryMonitor m2(*this, "Ghosted coords construction", currentLevel);
656 ghostedCoords = Xpetra::MultiVectorFactory<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO>::Build(nonUniqueMap, coords->getNumVectors());
657 ghostedCoords->doImport(*coords, *importer, Xpetra::INSERT);
658 }
659
660 auto ghostedCoordsView = ghostedCoords->getDeviceLocalView(Xpetra::Access::ReadWrite);
661 CoalesceDrop_Kokkos_Details::DistanceFunctor<LO, decltype(ghostedCoordsView)> distFunctor(ghostedCoordsView);
662
663 // Construct Laplacian diagonal
664 RCP<Vector> localLaplDiag;
665 {
666 SubFactoryMonitor m2(*this, "Local Laplacian diag construction", currentLevel);
667
668 localLaplDiag = VectorFactory::Build(uniqueMap);
669
670 auto localLaplDiagView = localLaplDiag->getDeviceLocalView(Xpetra::Access::OverwriteAll);
671 auto kokkosGraph = kokkosMatrix.graph;
672
673 Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:laplacian_diag", range_type(0,numRows),
674 KOKKOS_LAMBDA(const LO row) {
675 auto rowView = kokkosGraph.rowConst(row);
676 auto length = rowView.length;
677
678 impl_Scalar d = impl_ATS::zero();
679 for (decltype(length) colID = 0; colID < length; colID++) {
680 auto col = rowView(colID);
681 if (row != col)
682 d += impl_ATS::one()/distFunctor.distance2(row, col);
683 }
684 localLaplDiagView(row,0) = d;
685 });
686 }
687
688 // Construct ghosted Laplacian diagonal
689 RCP<Vector> ghostedLaplDiag;
690 {
691 SubFactoryMonitor m2(*this, "Ghosted Laplacian diag construction", currentLevel);
692 ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
693 ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
694 }
695
696 // Filter out entries
697 {
698 SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
699
700 auto ghostedLaplDiagView = ghostedLaplDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
701
702 CoalesceDrop_Kokkos_Details::DistanceLaplacianDropFunctor<LO, decltype(ghostedLaplDiagView), decltype(distFunctor)>
703 dropFunctor(ghostedLaplDiagView, distFunctor, threshold);
704 CoalesceDrop_Kokkos_Details::ScalarFunctor<SC, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
705 scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold, true);
706
707 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
708 scalarFunctor, nnzFA);
709 }
710 }
711
712 }
713 numDropped = nnzA - nnzFA;
714
715 boundaryNodes = bndNodes;
716
717 {
718 SubFactoryMonitor m2(*this, "CompressRows", currentLevel);
719
720 // parallel_scan (exclusive)
721 Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:compress_rows", range_type(0,numRows+1),
722 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
723 update += rows(i);
724 if (final_pass)
725 rows(i) = update;
726 });
727 }
728
729 // Compress cols (and optionally vals)
730 // We use a trick here: we moved all remaining elements to the beginning
731 // of the original row in the main loop, so we don't need to check for
732 // INVALID here, and just stop when achieving the new number of elements
733 // per row.
734 cols_type cols(Kokkos::ViewAllocateWithoutInitializing("FA_cols"), nnzFA);
735 vals_type vals;
736 if (reuseGraph) {
737 GetOStream(Runtime1) << "reuse matrix graph for filtering (compress matrix columns only)" << std::endl;
738 // Only compress cols
739 SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
740
741 Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
742 KOKKOS_LAMBDA(const LO i) {
743 // Is there Kokkos memcpy?
744 LO rowStart = rows(i);
745 LO rowAStart = rowsA(i);
746 size_t rownnz = rows(i+1) - rows(i);
747 for (size_t j = 0; j < rownnz; j++)
748 cols(rowStart+j) = colsAux(rowAStart+j);
749 });
750 } else {
751 // Compress cols and vals
752 GetOStream(Runtime1) << "new matrix graph for filtering (compress matrix columns and values)" << std::endl;
753 SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
754
755 vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_vals"), nnzFA);
756
757 Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
758 KOKKOS_LAMBDA(const LO i) {
759 LO rowStart = rows(i);
760 LO rowAStart = rowsA(i);
761 size_t rownnz = rows(i+1) - rows(i);
762 for (size_t j = 0; j < rownnz; j++) {
763 cols(rowStart+j) = colsAux(rowAStart+j);
764 vals(rowStart+j) = valsAux(rowAStart+j);
765 }
766 });
767 }
768
769 kokkos_graph_type kokkosGraph(cols, rows);
770
771 {
772 SubFactoryMonitor m2(*this, "LWGraph construction", currentLevel);
773
774 graph = rcp(new LWGraph_kokkos(kokkosGraph, A->getRowMap(), A->getColMap(), "filtered graph of A"));
775 graph->getLocalLWGraph().SetBoundaryNodeMap(boundaryNodes);
776 }
777
778 numTotal = A->getLocalNumEntries();
779
780 dofsPerNode = 1;
781
782 if (!reuseGraph) {
783 SubFactoryMonitor m2(*this, "LocalMatrix+FillComplete", currentLevel);
784
785 local_matrix_type localFA = local_matrix_type("A", numRows, A->getLocalMatrixDevice().numCols(), nnzFA, vals, rows, cols);
786 auto filteredACrs = CrsMatrixFactory::Build(localFA, A->getRowMap(), A->getColMap(), A->getDomainMap(), A->getRangeMap(),
787 A->getCrsGraph()->getImporter(), A->getCrsGraph()->getExporter());
788 filteredA = rcp(new CrsMatrixWrap(filteredACrs));
789 }
790
791 filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
792
793 if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
794 // Reuse max eigenvalue from A
795 // It is unclear what eigenvalue is the best for the smoothing, but we already may have
796 // the D^{-1}A estimate in A, may as well use it.
797 // NOTE: ML does that too
798 filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
799 } else {
800 filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
801 }
802
803 } else if (blkSize > 1 && threshold == zero) {
804 // Case 3: block problem without filtering
805 //
806 // FIXME_KOKKOS: this code is completely unoptimized. It really should do
807 // a very simple thing: merge rows and produce nodal graph. But the code
808 // seems very complicated. Can we do better?
809
810 TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() % blkSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getLocalNumElements() << " but should be a multiply of " << blkSize);
811
812 const RCP<const Map> rowMap = A->getRowMap();
813 const RCP<const Map> colMap = A->getColMap();
814
815 // build a node row map (uniqueMap = non-overlapping) and a node column map
816 // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
817 // stored in the AmalgamationInfo class container contain the local node id
818 // given a local dof id. The data is calculated in the AmalgamationFactory and
819 // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
820 const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
821 const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
822 Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
823 Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
824
825 Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
826 rowTranslationView(rowTranslationArray.getRawPtr(),rowTranslationArray.size() );
827 Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
828 colTranslationView(colTranslationArray.getRawPtr(),colTranslationArray.size() );
829
830 // get number of local nodes
831 LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
832 typedef typename Kokkos::View<LocalOrdinal*, DeviceType> id_translation_type;
833 id_translation_type rowTranslation("dofId2nodeId",rowTranslationArray.size());
834 id_translation_type colTranslation("ov_dofId2nodeId",colTranslationArray.size());
835 Kokkos::deep_copy(rowTranslation, rowTranslationView);
836 Kokkos::deep_copy(colTranslation, colTranslationView);
837
838 // extract striding information
839 blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
840 LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
841 LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
842 if(A->IsView("stridedMaps") == true) {
843 const RCP<const Map> myMap = A->getRowMap("stridedMaps");
844 const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
845 TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
846 blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
847 blkId = strMap->getStridedBlockId();
848 if (blkId > -1)
849 blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
850 }
851 auto kokkosMatrix = A->getLocalMatrixDevice(); // access underlying kokkos data
852
853 //
854 typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
855 typedef typename kokkos_graph_type::row_map_type row_map_type;
856 //typedef typename row_map_type::HostMirror row_map_type_h;
857 typedef typename kokkos_graph_type::entries_type entries_type;
858
859 // Stage 1c: get number of dof-nonzeros per blkSize node rows
860 typename row_map_type::non_const_type dofNnz("nnz_map", numNodes + 1);
861 LO numDofCols = 0;
862 CoalesceDrop_Kokkos_Details::Stage1aVectorFunctor<decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize)> stage1aFunctor(kokkosMatrix, dofNnz, blkPartSize);
863 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1a", range_type(0,numNodes), stage1aFunctor, numDofCols);
864 // parallel_scan (exclusive)
865 CoalesceDrop_Kokkos_Details::ScanFunctor<LO,decltype(dofNnz)> scanFunctor(dofNnz);
866 Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanFunctor);
867
868 // Detect and record dof rows that correspond to Dirichlet boundary conditions
869 boundary_nodes_type singleEntryRows = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
870
871 typename entries_type::non_const_type dofcols("dofcols", numDofCols/*dofNnz(numNodes)*/); // why does dofNnz(numNodes) work? should be a parallel reduce, i guess
872
873 // we have dofcols and dofids from Stage1dVectorFunctor
874 LO numNodeCols = 0;
875 typename row_map_type::non_const_type rows("nnz_nodemap", numNodes + 1);
876 typename boundary_nodes_type::non_const_type bndNodes("boundaryNodes", numNodes);
877
878 CoalesceDrop_Kokkos_Details::Stage1bcVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize), decltype(dofcols), decltype(colTranslation), decltype(singleEntryRows), decltype(bndNodes), bool> stage1bcFunctor(kokkosMatrix, dofNnz, blkPartSize, dofcols, colTranslation, rows, singleEntryRows, bndNodes, pL.get<bool>("aggregation: greedy Dirichlet"));
879 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1bcFunctor,numNodeCols);
880
881 // parallel_scan (exclusive)
882 CoalesceDrop_Kokkos_Details::ScanFunctor<LO,decltype(rows)> scanNodeFunctor(rows);
883 Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanNodeFunctor);
884
885 // create column node view
886 typename entries_type::non_const_type cols("nodecols", numNodeCols);
887
888
889 CoalesceDrop_Kokkos_Details::Stage1dVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(rows), decltype(cols)> stage1dFunctor(dofcols, dofNnz, cols, rows);
890 Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1dFunctor);
891 kokkos_graph_type kokkosGraph(cols, rows);
892
893 // create LW graph
894 graph = rcp(new LWGraph_kokkos(kokkosGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
895
896 boundaryNodes = bndNodes;
897 graph->getLocalLWGraph().SetBoundaryNodeMap(boundaryNodes);
898 numTotal = A->getLocalNumEntries();
899
900 dofsPerNode = blkSize;
901
902 filteredA = A;
903
904 } else {
905 TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu: CoalesceDropFactory_kokkos: Block filtering is not implemented");
906 }
907
908 if (GetVerbLevel() & Statistics1) {
909 GO numLocalBoundaryNodes = 0;
910 GO numGlobalBoundaryNodes = 0;
911
912 Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
913 KOKKOS_LAMBDA(const LO i, GO& n) {
914 if (boundaryNodes(i))
915 n++;
916 }, numLocalBoundaryNodes);
917
918 auto comm = A->getRowMap()->getComm();
919 MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
920 GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
921 }
922
923 if ((GetVerbLevel() & Statistics1) && threshold != zero) {
924 auto comm = A->getRowMap()->getComm();
925
926 GO numGlobalTotal, numGlobalDropped;
927 MueLu_sumAll(comm, numTotal, numGlobalTotal);
928 MueLu_sumAll(comm, numDropped, numGlobalDropped);
929
930 if (numGlobalTotal != 0) {
931 GetOStream(Statistics1) << "Number of dropped entries: "
932 << numGlobalDropped << "/" << numGlobalTotal
933 << " (" << 100*Teuchos::as<double>(numGlobalDropped)/Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
934 }
935 }
936
937 Set(currentLevel, "DofsPerNode", dofsPerNode);
938 Set(currentLevel, "Graph", graph);
939 Set(currentLevel, "A", filteredA);
940 }
941}
942#endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
#define SET_VALID_ENTRY(name)
#define MueLu_sumAll(rcpComm, in, out)
MueLu::DefaultLocalOrdinal LocalOrdinal
Factory for creating a graph based on a given matrix.
KOKKOS_FORCEINLINE_FUNCTION bool operator()(LO row, LO col, SC val) const
ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold)
KOKKOS_INLINE_FUNCTION magnitudeType distance2(LO row, LO col) const
DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold)
KOKKOS_INLINE_FUNCTION void operator()(const LO row, LO &nnz) const
ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_, typename rows_type::non_const_type rows_, typename cols_type::non_const_type colsAux_, typename vals_type::non_const_type valsAux_, bool reuseGraph_, bool lumping_, SC, bool aggregationMayCreateDirichlet_)
KOKKOS_INLINE_FUNCTION void operator()(const LO i, LO &upd, const bool &final) const
Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_)
KOKKOS_INLINE_FUNCTION void operator()(const LO row, LO &totalnnz) const
KOKKOS_INLINE_FUNCTION void operator()(const LO rowNode, LO &nnz) const
Stage1bcVectorFunctor(MatrixType kokkosMatrix_, NnzType coldofnnz_, blkSizeType blkSize_, ColDofType coldofs_, Dof2NodeTranslationType dof2node_, NnzType colnodennz_, BdryNodeTypeConst dirichletdof_, BdryNodeType bdrynode_, boolType usegreedydirichlet_)
Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_)
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
Lightweight MueLu representation of a compressed row storage graph.
Class that holds all level-specific information.
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
static RCP< Vector > GetMatrixOverlappedDiagonal(const Matrix &A)
Extract Overlapped Matrix Diagonal.
static Kokkos::View< bool *, typename NO::device_type > DetectDirichletRows(const Matrix &A, const Magnitude &tol=Teuchos::ScalarTraits< typename Teuchos::ScalarTraits< SC >::magnitudeType >::zero(), const bool count_twos_as_dirichlet=false)
Detect Dirichlet rows.
Namespace for MueLu classes and methods.
@ Statistics1
Print more statistics.
@ Runtime0
One-liner description of what is happening.
@ Runtime1
Description of what is happening (more verbose)