MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48
49#include "Kokkos_UnorderedMap.hpp"
50#include "Xpetra_CrsGraphFactory.hpp"
51
53
54#include "MueLu_Aggregates_kokkos.hpp"
55#include "MueLu_AmalgamationFactory_kokkos.hpp"
56#include "MueLu_AmalgamationInfo_kokkos.hpp"
57#include "MueLu_CoarseMapFactory_kokkos.hpp"
58
59#include "MueLu_MasterList.hpp"
60#include "MueLu_NullspaceFactory_kokkos.hpp"
61#include "MueLu_PerfUtils.hpp"
62#include "MueLu_Monitor.hpp"
63#include "MueLu_Utilities_kokkos.hpp"
64
65#include "Xpetra_IO.hpp"
66
67namespace MueLu {
68
69 namespace { // anonymous
70
71 template<class LocalOrdinal, class View>
72 class ReduceMaxFunctor{
73 public:
74 ReduceMaxFunctor(View view) : view_(view) { }
75
76 KOKKOS_INLINE_FUNCTION
77 void operator()(const LocalOrdinal &i, LocalOrdinal& vmax) const {
78 if (vmax < view_(i))
79 vmax = view_(i);
80 }
81
82 KOKKOS_INLINE_FUNCTION
83 void join (LocalOrdinal& dst, const LocalOrdinal& src) const {
84 if (dst < src) {
85 dst = src;
86 }
87 }
88
89 KOKKOS_INLINE_FUNCTION
90 void init (LocalOrdinal& dst) const {
91 dst = 0;
92 }
93 private:
95 };
96
97 // local QR decomposition
98 template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
99 class LocalQRDecompFunctor {
100 private:
101 typedef LOType LO;
102 typedef GOType GO;
103 typedef SCType SC;
104
105 typedef typename DeviceType::execution_space execution_space;
106 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
107 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
108 typedef typename impl_ATS::magnitudeType Magnitude;
109
110 typedef Kokkos::View<impl_SC**,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
111 typedef Kokkos::View<impl_SC* ,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
112
113 private:
114
115 NspType fineNS;
116 NspType coarseNS;
117 aggRowsType aggRows;
118 maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
119 agg2RowMapLOType agg2RowMapLO;
120 statusType statusAtomic;
121 rowsType rows;
122 rowsAuxType rowsAux;
123 colsAuxType colsAux;
124 valsAuxType valsAux;
126 public:
127 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) :
128 fineNS(fineNS_),
129 coarseNS(coarseNS_),
130 aggRows(aggRows_),
131 maxAggDofSize(maxAggDofSize_),
132 agg2RowMapLO(agg2RowMapLO_),
133 statusAtomic(statusAtomic_),
134 rows(rows_),
135 rowsAux(rowsAux_),
136 colsAux(colsAux_),
137 valsAux(valsAux_),
138 doQRStep(doQRStep_)
139 { }
140
141 KOKKOS_INLINE_FUNCTION
142 void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& nnz) const {
143 auto agg = thread.league_rank();
144
145 // size of aggregate: number of DOFs in aggregate
146 auto aggSize = aggRows(agg+1) - aggRows(agg);
147
148 const impl_SC one = impl_ATS::one();
149 const impl_SC two = one + one;
150 const impl_SC zero = impl_ATS::zero();
151 const auto zeroM = impl_ATS::magnitude(zero);
152
153 int m = aggSize;
154 int n = fineNS.extent(1);
155
156 // calculate row offset for coarse nullspace
157 Xpetra::global_size_t offset = agg * n;
158
159 if (doQRStep) {
160
161 // Extract the piece of the nullspace corresponding to the aggregate
162 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
163 for (int j = 0; j < n; j++)
164 for (int k = 0; k < m; k++)
165 r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
166#if 0
167 printf("A\n");
168 for (int i = 0; i < m; i++) {
169 for (int j = 0; j < n; j++)
170 printf(" %5.3lf ", r(i,j));
171 printf("\n");
172 }
173#endif
174
175 // Calculate QR decomposition (standard)
176 shared_matrix q(thread.team_shmem(), m, m); // Q
177 if (m >= n) {
178 bool isSingular = false;
179
180 // Initialize Q^T
181 auto qt = q;
182 for (int i = 0; i < m; i++) {
183 for (int j = 0; j < m; j++)
184 qt(i,j) = zero;
185 qt(i,i) = one;
186 }
187
188 for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
189 // FIXME_KOKKOS: use team
190 Magnitude s = zeroM, norm, norm_x;
191 for (int i = k+1; i < m; i++)
192 s += pow(impl_ATS::magnitude(r(i,k)), 2);
193 norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
194
195 if (norm == zero) {
196 isSingular = true;
197 break;
198 }
199
200 r(k,k) -= norm*one;
201
202 norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
203 if (norm_x == zeroM) {
204 // We have a single diagonal element in the column.
205 // No reflections required. Just need to restor r(k,k).
206 r(k,k) = norm*one;
207 continue;
208 }
209
210 // FIXME_KOKKOS: use team
211 for (int i = k; i < m; i++)
212 r(i,k) /= norm_x;
213
214 // Update R(k:m,k+1:n)
215 for (int j = k+1; j < n; j++) {
216 // FIXME_KOKKOS: use team in the loops
217 impl_SC si = zero;
218 for (int i = k; i < m; i++)
219 si += r(i,k) * r(i,j);
220 for (int i = k; i < m; i++)
221 r(i,j) -= two*si * r(i,k);
222 }
223
224 // Update Q^T (k:m,k:m)
225 for (int j = k; j < m; j++) {
226 // FIXME_KOKKOS: use team in the loops
227 impl_SC si = zero;
228 for (int i = k; i < m; i++)
229 si += r(i,k) * qt(i,j);
230 for (int i = k; i < m; i++)
231 qt(i,j) -= two*si * r(i,k);
232 }
233
234 // Fix R(k:m,k)
235 r(k,k) = norm*one;
236 for (int i = k+1; i < m; i++)
237 r(i,k) = zero;
238 }
239
240#if 0
241 // Q = (Q^T)^T
242 for (int i = 0; i < m; i++)
243 for (int j = 0; j < i; j++) {
244 impl_SC tmp = qt(i,j);
245 qt(i,j) = qt(j,i);
246 qt(j,i) = tmp;
247 }
248#endif
249
250 // Build coarse nullspace using the upper triangular part of R
251 for (int j = 0; j < n; j++)
252 for (int k = 0; k <= j; k++)
253 coarseNS(offset+k,j) = r(k,j);
254
255 if (isSingular) {
256 statusAtomic(1) = true;
257 return;
258 }
259
260 } else {
261 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
262
263 // The local QR decomposition is not possible in the "overconstrained"
264 // case (i.e. number of columns in qr > number of rowsAux), which
265 // corresponds to #DOFs in Aggregate < n. For usual problems this
266 // is only possible for single node aggregates in structural mechanics.
267 // (Similar problems may arise in discontinuous Galerkin problems...)
268 // We bypass the QR decomposition and use an identity block in the
269 // tentative prolongator for the single node aggregate and transfer the
270 // corresponding fine level null space information 1-to-1 to the coarse
271 // level null space part.
272
273 // NOTE: The resulting tentative prolongation operator has
274 // (m*DofsPerNode-n) zero columns leading to a singular
275 // coarse level operator A. To deal with that one has the following
276 // options:
277 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
278 // false) to add some identity block to the diagonal of the zero rowsAux
279 // in the coarse level operator A, such that standard level smoothers
280 // can be used again.
281 // - Use special (projection-based) level smoothers, which can deal
282 // with singular matrices (very application specific)
283 // - Adapt the code below to avoid zero columns. However, we do not
284 // support a variable number of DOFs per node in MueLu/Xpetra which
285 // makes the implementation really hard.
286 //
287 // FIXME: do we need to check for singularity here somehow? Zero
288 // columns would be easy but linear dependency would require proper QR.
289
290 // R = extended (by adding identity rowsAux) qr
291 for (int j = 0; j < n; j++)
292 for (int k = 0; k < n; k++)
293 if (k < m)
294 coarseNS(offset+k,j) = r(k,j);
295 else
296 coarseNS(offset+k,j) = (k == j ? one : zero);
297
298 // Q = I (rectangular)
299 for (int i = 0; i < m; i++)
300 for (int j = 0; j < n; j++)
301 q(i,j) = (j == i ? one : zero);
302 }
303
304 // Process each row in the local Q factor and fill helper arrays to assemble P
305 for (int j = 0; j < m; j++) {
306 LO localRow = agg2RowMapLO(aggRows(agg)+j);
307 size_t rowStart = rowsAux(localRow);
308 size_t lnnz = 0;
309 for (int k = 0; k < n; k++) {
310 // skip zeros
311 if (q(j,k) != zero) {
312 colsAux(rowStart+lnnz) = offset + k;
313 valsAux(rowStart+lnnz) = q(j,k);
314 lnnz++;
315 }
316 }
317 rows(localRow+1) = lnnz;
318 nnz += lnnz;
319 }
320
321#if 0
322 printf("R\n");
323 for (int i = 0; i < m; i++) {
324 for (int j = 0; j < n; j++)
325 printf(" %5.3lf ", coarseNS(i,j));
326 printf("\n");
327 }
328
329 printf("Q\n");
330 for (int i = 0; i < aggSize; i++) {
331 for (int j = 0; j < aggSize; j++)
332 printf(" %5.3lf ", q(i,j));
333 printf("\n");
334 }
335#endif
336 } else {
338 // "no-QR" option //
340 // Local Q factor is just the fine nullspace support over the current aggregate.
341 // Local R factor is the identity.
342 // TODO I have not implemented any special handling for aggregates that are too
343 // TODO small to locally support the nullspace, as is done in the standard QR
344 // TODO case above.
345
346 for (int j = 0; j < m; j++) {
347 LO localRow = agg2RowMapLO(aggRows(agg)+j);
348 size_t rowStart = rowsAux(localRow);
349 size_t lnnz = 0;
350 for (int k = 0; k < n; k++) {
351 const impl_SC qr_jk = fineNS(localRow,k);
352 // skip zeros
353 if (qr_jk != zero) {
354 colsAux(rowStart+lnnz) = offset + k;
355 valsAux(rowStart+lnnz) = qr_jk;
356 lnnz++;
357 }
358 }
359 rows(localRow+1) = lnnz;
360 nnz += lnnz;
361 }
362
363 for (int j = 0; j < n; j++)
364 coarseNS(offset+j,j) = one;
365
366 }
367
368 }
369
370 // amount of shared memory
371 size_t team_shmem_size( int /* team_size */ ) const {
372 if (doQRStep) {
373 int m = maxAggDofSize;
374 int n = fineNS.extent(1);
375 return shared_matrix::shmem_size(m, n) + // r
376 shared_matrix::shmem_size(m, m); // q
377 } else
378 return 0;
379 }
380 };
381
382 } // namespace anonymous
383
384 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
386 RCP<ParameterList> validParamList = rcp(new ParameterList());
387
388#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
389 SET_VALID_ENTRY("tentative: calculate qr");
390 SET_VALID_ENTRY("tentative: build coarse coordinates");
391#undef SET_VALID_ENTRY
392
393 validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
394 validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
395 validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
396 validParamList->set< RCP<const FactoryBase> >("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
397 validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
398 validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
399 validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory of the coordinates");
400
401 // Make sure we don't recursively validate options for the matrixmatrix kernels
402 ParameterList norecurse;
403 norecurse.disableRecursiveValidation();
404 validParamList->set<ParameterList> ("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
405
406 return validParamList;
407 }
408
409 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
411
412 const ParameterList& pL = GetParameterList();
413 // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
414 std::string nspName = "Nullspace";
415 if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
416
417 Input(fineLevel, "A");
418 Input(fineLevel, "Aggregates");
419 Input(fineLevel, nspName);
420 Input(fineLevel, "UnAmalgamationInfo");
421 Input(fineLevel, "CoarseMap");
422 if( fineLevel.GetLevelID() == 0 &&
423 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
424 pL.get<bool>("tentative: build coarse coordinates") ) { // and we want coordinates on other levels
425 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
426 Input(fineLevel, "Coordinates");
427 } else if (bTransferCoordinates_) {
428 Input(fineLevel, "Coordinates");
429 }
430 }
431
432 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
434 return BuildP(fineLevel, coarseLevel);
435 }
436
437 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
439 FactoryMonitor m(*this, "Build", coarseLevel);
440
441 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
442 typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
443 const ParameterList& pL = GetParameterList();
444 std::string nspName = "Nullspace";
445 if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
446
447 auto A = Get< RCP<Matrix> > (fineLevel, "A");
448 auto aggregates = Get< RCP<Aggregates_kokkos> > (fineLevel, "Aggregates");
449 auto amalgInfo = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel, "UnAmalgamationInfo");
450 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
451 auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
452 RCP<RealValuedMultiVector> fineCoords;
453 if(bTransferCoordinates_) {
454 fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel, "Coordinates");
455 }
456
457 RCP<Matrix> Ptentative;
458 // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
459 // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
460 if ( aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
461 Ptentative = Teuchos::null;
462 Set(coarseLevel, "P", Ptentative);
463 return;
464 }
465 RCP<MultiVector> coarseNullspace;
466 RCP<RealValuedMultiVector> coarseCoords;
467
468 if(bTransferCoordinates_) {
469 ArrayView<const GO> elementAList = coarseMap->getLocalElementList();
470 GO indexBase = coarseMap->getIndexBase();
471
472 LO blkSize = 1;
473 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
474 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
475
476 Array<GO> elementList;
477 ArrayView<const GO> elementListView;
478 if (blkSize == 1) {
479 // Scalar system
480 // No amalgamation required
481 elementListView = elementAList;
482
483 } else {
484 auto numElements = elementAList.size() / blkSize;
485
486 elementList.resize(numElements);
487
488 // Amalgamate the map
489 for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
490 elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
491
492 elementListView = elementList;
493 }
494
495 auto uniqueMap = fineCoords->getMap();
496 auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
497 elementListView, indexBase, coarseMap->getComm());
498 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
499
500 // Create overlapped fine coordinates to reduce global communication
501 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
502 if (aggregates->AggregatesCrossProcessors()) {
503 auto nonUniqueMap = aggregates->GetMap();
504 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
505
506 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
507 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
508 }
509
510 // The good new is that his graph has already been constructed for the
511 // TentativePFactory and was cached in Aggregates. So this is a no-op.
512 auto aggGraph = aggregates->GetGraph();
513 auto numAggs = aggGraph.numRows();
514
515 auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
516 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
517
518 // Fill in coarse coordinates
519 {
520 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
521
522 const auto dim = fineCoords->getNumVectors();
523
524 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
525 for (size_t j = 0; j < dim; j++) {
526 Kokkos::parallel_for("MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
527 KOKKOS_LAMBDA(const LO i) {
528 // A row in this graph represents all node ids in the aggregate
529 // Therefore, averaging is very easy
530
531 auto aggregate = aggGraph.rowConst(i);
532
533 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
534 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
535 sum += fineCoordsRandomView(aggregate(colID),j);
536
537 coarseCoordsView(i,j) = sum / aggregate.length;
538 });
539 }
540 }
541 }
542
543 if (!aggregates->AggregatesCrossProcessors()) {
544 if(Xpetra::Helpers<SC,LO,GO,NO>::isTpetraBlockCrs(A)) {
545 BuildPuncoupledBlockCrs(coarseLevel,A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
546 coarseLevel.GetLevelID());
547 }
548 else {
549 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
550 }
551 }
552 else
553 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
554
555 // If available, use striding information of fine level matrix A for range
556 // map and coarseMap as domain map; otherwise use plain range map of
557 // Ptent = plain range map of A for range map and coarseMap as domain map.
558 // NOTE:
559 // The latter is not really safe, since there is no striding information
560 // for the range map. This is not really a problem, since striding
561 // information is always available on the intermedium levels and the
562 // coarsest levels.
563 if (A->IsView("stridedMaps") == true)
564 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
565 else
566 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
567
568 if(bTransferCoordinates_) {
569 Set(coarseLevel, "Coordinates", coarseCoords);
570 }
571 Set(coarseLevel, "Nullspace", coarseNullspace);
572 Set(coarseLevel, "P", Ptentative);
573
574 if (IsPrint(Statistics2)) {
575 RCP<ParameterList> params = rcp(new ParameterList());
576 params->set("printLoadBalancingInfo", true);
577 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
578 }
579 }
580
581 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
583 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
584 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
585 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
586 RCP<MultiVector>& coarseNullspace, const int levelID) const {
587 auto rowMap = A->getRowMap();
588 auto colMap = A->getColMap();
589
590 const size_t numRows = rowMap->getLocalNumElements();
591 const size_t NSDim = fineNullspace->getNumVectors();
592
593 typedef Kokkos::ArithTraits<SC> ATS;
594 using impl_SC = typename ATS::val_type;
595 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
596 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
597
598 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
599
600 typename Aggregates_kokkos::local_graph_type aggGraph;
601 {
602 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
603 aggGraph = aggregates->GetGraph();
604 }
605 auto aggRows = aggGraph.row_map;
606 auto aggCols = aggGraph.entries;
607
608 // Aggregates map is based on the amalgamated column map
609 // We can skip global-to-local conversion if LIDs in row map are
610 // same as LIDs in column map
611 bool goodMap;
612 {
613 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
614 goodMap = isGoodMap(*rowMap, *colMap);
615 }
616 // FIXME_KOKKOS: need to proofread later code for bad maps
617 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
618 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
619 "(i.e. \"matching\" row and column maps)");
620
621 // STEP 1: do unamalgamation
622 // The non-kokkos version uses member functions from the AmalgamationInfo
623 // container class to unamalgamate the data. In contrast, the kokkos
624 // version of TentativePFactory does the unamalgamation here and only uses
625 // the data of the AmalgamationInfo container class
626
627 // Extract information for unamalgamation
628 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
629 GO indexBase;
630 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
631 GO globalOffset = amalgInfo->GlobalOffset();
632
633 // Extract aggregation info (already in Kokkos host views)
634 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
635 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
636 const size_t numAggregates = aggregates->GetNumAggregates();
637
638 int myPID = aggregates->GetMap()->getComm()->getRank();
639
640 // Create Kokkos::View (on the device) to store the aggreate dof sizes
641 // Later used to get aggregate dof offsets
642 // NOTE: This zeros itself on construction
643 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
644 AggSizeType aggDofSizes;
645
646 if (stridedBlockSize == 1) {
647 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
648
649 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
650 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
651
652 auto sizesConst = aggregates->ComputeAggregateSizes();
653 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
654
655 } else {
656 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
657
658 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
659 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
660
661 auto nodeMap = aggregates->GetMap()->getLocalMap();
662 auto dofMap = colMap->getLocalMap();
663
664 Kokkos::parallel_for("MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
665 KOKKOS_LAMBDA(const LO agg) {
666 auto aggRowView = aggGraph.rowConst(agg);
667
668 size_t size = 0;
669 for (LO colID = 0; colID < aggRowView.length; colID++) {
670 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
671
672 for (LO k = 0; k < stridedBlockSize; k++) {
673 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
674
675 if (dofMap.getLocalElement(dofGID) != INVALID)
676 size++;
677 }
678 }
679 aggDofSizes(agg+1) = size;
680 });
681 }
682
683 // Find maximum dof size for aggregates
684 // Later used to reserve enough scratch space for local QR decompositions
685 LO maxAggSize = 0;
686 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
687 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
688
689 // parallel_scan (exclusive)
690 // The aggDofSizes View then contains the aggregate dof offsets
691 Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
692 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
693 update += aggDofSizes(i);
694 if (final_pass)
695 aggDofSizes(i) = update;
696 });
697
698 // Create Kokkos::View on the device to store mapping
699 // between (local) aggregate id and row map ids (LIDs)
700 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
701 {
702 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
703
704 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
705 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
706
707 Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
708 KOKKOS_LAMBDA(const LO lnode) {
709 if (procWinner(lnode, 0) == myPID) {
710 // No need for atomics, it's one-to-one
711 auto aggID = vertex2AggId(lnode,0);
712
713 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
714 // FIXME: I think this may be wrong
715 // We unconditionally add the whole block here. When we calculated
716 // aggDofSizes, we did the isLocalElement check. Something's fishy.
717 for (LO k = 0; k < stridedBlockSize; k++)
718 agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
719 }
720 });
721 }
722
723 // STEP 2: prepare local QR decomposition
724 // Reserve memory for tentative prolongation operator
725 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
726
727 // Pull out the nullspace vectors so that we can have random access (on the device)
728 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
729 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
730
731 size_t nnz = 0; // actual number of nnz
732
733 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
734 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
735 typedef typename local_matrix_type::index_type::non_const_type cols_type;
736 typedef typename local_matrix_type::values_type::non_const_type vals_type;
737
738
739 // Device View for status (error messages...)
740 typedef Kokkos::View<int[10], DeviceType> status_type;
741 status_type status("status");
742
743 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
745
746 const ParameterList& pL = GetParameterList();
747 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
748 if (!doQRStep) {
749 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
750 if (NSDim>1)
751 GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
752 }
753
754 size_t nnzEstimate = numRows * NSDim;
755 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows+1);
756 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
757 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
758 rows_type rows("Ptent_rows", numRows+1);
759 {
760 // Stage 0: fill in views.
761 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
762
763 // The main thing to notice is initialization of vals with INVALID. These
764 // values will later be used to compress the arrays
765 Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
766 KOKKOS_LAMBDA(const LO row) {
767 rowsAux(row) = row*NSDim;
768 });
769 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
770 KOKKOS_LAMBDA(const LO j) {
771 colsAux(j) = INVALID;
772 });
773 }
774
775 if (NSDim == 1) {
776 // 1D is special, as it is the easiest. We don't even need to the QR,
777 // just normalize an array. Plus, no worries abot small aggregates. In
778 // addition, we do not worry about compression. It is unlikely that
779 // nullspace will have zeros. If it does, a prolongator row would be
780 // zero and we'll get singularity anyway.
781 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
782
783 // Set up team policy with numAggregates teams and one thread per team.
784 // Each team handles a slice of the data associated with one aggregate
785 // and performs a local QR decomposition (in this case real QR is
786 // unnecessary).
787 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
788
789 if (doQRStep) {
790 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop", policy,
791 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
792 auto agg = thread.league_rank();
793
794 // size of the aggregate (number of DOFs in aggregate)
795 LO aggSize = aggRows(agg+1) - aggRows(agg);
796
797 // Extract the piece of the nullspace corresponding to the aggregate, and
798 // put it in the flat array, "localQR" (in column major format) for the
799 // QR routine. Trivial in 1D.
800 auto norm = impl_ATS::magnitude(zero);
801
802 // Calculate QR by hand
803 // FIXME: shouldn't there be stridedblock here?
804 // FIXME_KOKKOS: shouldn't there be stridedblock here?
805 for (decltype(aggSize) k = 0; k < aggSize; k++) {
806 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
807 norm += dnorm*dnorm;
808 }
809 norm = sqrt(norm);
810
811 if (norm == zero) {
812 // zero column; terminate the execution
813 statusAtomic(1) = true;
814 return;
815 }
816
817 // R = norm
818 coarseNS(agg, 0) = norm;
819
820 // Q = localQR(:,0)/norm
821 for (decltype(aggSize) k = 0; k < aggSize; k++) {
822 LO localRow = agg2RowMapLO(aggRows(agg)+k);
823 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
824
825 rows(localRow+1) = 1;
826 colsAux(localRow) = agg;
827 valsAux(localRow) = localVal;
828
829 }
830 });
831
832 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
833 Kokkos::deep_copy(statusHost, status);
834 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
835 if (statusHost(i)) {
836 std::ostringstream oss;
837 oss << "MueLu::TentativePFactory::MakeTentative: ";
838 switch (i) {
839 case 0: oss << "!goodMap is not implemented"; break;
840 case 1: oss << "fine level NS part has a zero column"; break;
841 }
842 throw Exceptions::RuntimeError(oss.str());
843 }
844
845 } else {
846 Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
847 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
848 auto agg = thread.league_rank();
849
850 // size of the aggregate (number of DOFs in aggregate)
851 LO aggSize = aggRows(agg+1) - aggRows(agg);
852
853 // R = norm
854 coarseNS(agg, 0) = one;
855
856 // Q = localQR(:,0)/norm
857 for (decltype(aggSize) k = 0; k < aggSize; k++) {
858 LO localRow = agg2RowMapLO(aggRows(agg)+k);
859 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
860
861 rows(localRow+1) = 1;
862 colsAux(localRow) = agg;
863 valsAux(localRow) = localVal;
864
865 }
866 });
867 }
868
869 Kokkos::parallel_reduce("MueLu:TentativeP:CountNNZ", range_type(0, numRows+1),
870 KOKKOS_LAMBDA(const LO i, size_t &nnz_count) {
871 nnz_count += rows(i);
872 }, nnz);
873
874 } else { // NSdim > 1
875 // FIXME_KOKKOS: This code branch is completely unoptimized.
876 // Work to do:
877 // - Optimize QR decomposition
878 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
879 // packing new values in the beginning of each row
880 // We do use auxilary view in this case, so keep a second rows view for
881 // counting nonzeros in rows
882
883 {
884 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
885 // Set up team policy with numAggregates teams and one thread per team.
886 // Each team handles a slice of the data associated with one aggregate
887 // and performs a local QR decomposition
888 const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1); // numAggregates teams a 1 thread
889 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
890 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
891 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
892 decltype(valsAux)>
893 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
895 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
896 }
897
898 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
899 Kokkos::deep_copy(statusHost, status);
900 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
901 if (statusHost(i)) {
902 std::ostringstream oss;
903 oss << "MueLu::TentativePFactory::MakeTentative: ";
904 switch(i) {
905 case 0: oss << "!goodMap is not implemented"; break;
906 case 1: oss << "fine level NS part has a zero column"; break;
907 }
908 throw Exceptions::RuntimeError(oss.str());
909 }
910 }
911
912 // Compress the cols and vals by ignoring INVALID column entries that correspond
913 // to 0 in QR.
914
915 // The real cols and vals are constructed using calculated (not estimated) nnz
916 cols_type cols;
917 vals_type vals;
918
919 if (nnz != nnzEstimate) {
920 {
921 // Stage 2: compress the arrays
922 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
923
924 Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
925 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
926 upd += rows(i);
927 if (final)
928 rows(i) = upd;
929 });
930 }
931
932 {
933 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
934
935 cols = cols_type("Ptent_cols", nnz);
936 vals = vals_type("Ptent_vals", nnz);
937
938 // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
939 // to the beginning of rows. See CoalesceDropFactory_kokkos for
940 // example.
941 Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
942 KOKKOS_LAMBDA(const LO i) {
943 LO rowStart = rows(i);
944
945 size_t lnnz = 0;
946 for (auto j = rowsAux(i); j < rowsAux(i+1); j++)
947 if (colsAux(j) != INVALID) {
948 cols(rowStart+lnnz) = colsAux(j);
949 vals(rowStart+lnnz) = valsAux(j);
950 lnnz++;
951 }
952 });
953 }
954
955 } else {
956 rows = rowsAux;
957 cols = colsAux;
958 vals = valsAux;
959 }
960
961 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
962
963 {
964 // Stage 3: construct Xpetra::Matrix
965 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
966
967 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
968
969 // Managing labels & constants for ESFC
970 RCP<ParameterList> FCparams;
971 if (pL.isSublist("matrixmatrix: kernel params"))
972 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
973 else
974 FCparams = rcp(new ParameterList);
975
976 // By default, we don't need global constants for TentativeP
977 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
978 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
979
980 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
981 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
982 }
983 }
984
985
986 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
988 BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
989 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
990 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
991 RCP<MultiVector>& coarseNullspace, const int levelID) const {
992#ifdef HAVE_MUELU_TPETRA
993 /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
994 be generalized later, if we ever need to do so:
995 1) Null space dimension === block size of matrix: So no elasticity right now
996 2) QR is not supported: Under assumption #1, this shouldn't cause problems.
997 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
998
999 These assumptions keep our code way simpler and still support the use cases we actually care about.
1000 */
1001
1002 RCP<const Map> rowMap = A->getRowMap();
1003 RCP<const Map> rangeMap = A->getRangeMap();
1004 RCP<const Map> colMap = A->getColMap();
1005 // const size_t numFinePointRows = rangeMap->getLocalNumElements();
1006 const size_t numFineBlockRows = rowMap->getLocalNumElements();
1007
1008 // typedef Teuchos::ScalarTraits<SC> STS;
1009 // typedef typename STS::magnitudeType Magnitude;
1010 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1011
1012 typedef Kokkos::ArithTraits<SC> ATS;
1013 using impl_SC = typename ATS::val_type;
1014 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
1015 const impl_SC one = impl_ATS::one();
1016
1017 // const GO numAggs = aggregates->GetNumAggregates();
1018 const size_t NSDim = fineNullspace->getNumVectors();
1019 auto aggSizes = aggregates->ComputeAggregateSizes();
1020
1021
1022 typename Aggregates_kokkos::local_graph_type aggGraph;
1023 {
1024 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
1025 aggGraph = aggregates->GetGraph();
1026 }
1027 auto aggRows = aggGraph.row_map;
1028 auto aggCols = aggGraph.entries;
1029
1030
1031 // Need to generate the coarse block map
1032 // NOTE: We assume NSDim == block size here
1033 // NOTE: We also assume that coarseMap has contiguous GIDs
1034 //const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1035 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1036 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1037 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1038 numCoarseBlockRows,
1039 coarsePointMap->getIndexBase(),
1040 coarsePointMap->getComm());
1041 // Sanity checking
1042 const ParameterList& pL = GetParameterList();
1043 // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1044
1045
1046 // The aggregates use the amalgamated column map, which in this case is what we want
1047
1048 // Aggregates map is based on the amalgamated column map
1049 // We can skip global-to-local conversion if LIDs in row map are
1050 // same as LIDs in column map
1051 bool goodMap = MueLu::Utilities<SC,LO,GO,NO>::MapsAreNested(*rowMap, *colMap);
1052 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1053 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1054 "(i.e. \"matching\" row and column maps)");
1055
1056 // STEP 1: do unamalgamation
1057 // The non-kokkos version uses member functions from the AmalgamationInfo
1058 // container class to unamalgamate the data. In contrast, the kokkos
1059 // version of TentativePFactory does the unamalgamation here and only uses
1060 // the data of the AmalgamationInfo container class
1061
1062 // Extract information for unamalgamation
1063 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1064 GO indexBase;
1065 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1066 //GO globalOffset = amalgInfo->GlobalOffset();
1067
1068 // Extract aggregation info (already in Kokkos host views)
1069 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
1070 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1071 const size_t numAggregates = aggregates->GetNumAggregates();
1072
1073 int myPID = aggregates->GetMap()->getComm()->getRank();
1074
1075 // Create Kokkos::View (on the device) to store the aggreate dof sizes
1076 // Later used to get aggregate dof offsets
1077 // NOTE: This zeros itself on construction
1078 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
1079 AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1080
1081 {
1082 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1083
1084 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1085 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
1086
1087 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), aggSizes);
1088 }
1089
1090 // Find maximum dof size for aggregates
1091 // Later used to reserve enough scratch space for local QR decompositions
1092 LO maxAggSize = 0;
1093 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
1094 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1095
1096 // parallel_scan (exclusive)
1097 // The aggDofSizes View then contains the aggregate dof offsets
1098 Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
1099 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1100 update += aggDofSizes(i);
1101 if (final_pass)
1102 aggDofSizes(i) = update;
1103 });
1104
1105 // Create Kokkos::View on the device to store mapping
1106 // between (local) aggregate id and row map ids (LIDs)
1107 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1108 {
1109 SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1110
1111 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1112 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1113
1114 Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1115 KOKKOS_LAMBDA(const LO lnode) {
1116 if (procWinner(lnode, 0) == myPID) {
1117 // No need for atomics, it's one-to-one
1118 auto aggID = vertex2AggId(lnode,0);
1119
1120 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
1121 // FIXME: I think this may be wrong
1122 // We unconditionally add the whole block here. When we calculated
1123 // aggDofSizes, we did the isLocalElement check. Something's fishy.
1124 for (LO k = 0; k < stridedBlockSize; k++)
1125 aggToRowMapLO(offset + k) = lnode*stridedBlockSize + k;
1126 }
1127 });
1128 }
1129
1130 // STEP 2: prepare local QR decomposition
1131 // Reserve memory for tentative prolongation operator
1132 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1133
1134 // Pull out the nullspace vectors so that we can have random access (on the device)
1135 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
1136 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1137
1138 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
1139 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1140 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1141 // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1142
1143
1144 // Device View for status (error messages...)
1145 typedef Kokkos::View<int[10], DeviceType> status_type;
1146 status_type status("status");
1147
1148 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1150
1151 // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1152 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1153
1154 // BlockCrs requires that we build the (block) graph first, so let's do that...
1155
1156 // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1157 // block non-zero per row in the matrix;
1158 rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
1159 cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1160
1161 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1162 KOKKOS_LAMBDA(const LO j) {
1163 ia[j] = j;
1164 ja[j] = INVALID;
1165
1166 if(j==(LO)numFineBlockRows-1)
1167 ia[numFineBlockRows] = numFineBlockRows;
1168 });
1169
1170 // Fill Graph
1171 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1172 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:fillGraph", policy,
1173 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1174 auto agg = thread.league_rank();
1175 Xpetra::global_size_t offset = agg;
1176
1177 // size of the aggregate (number of DOFs in aggregate)
1178 LO aggSize = aggRows(agg+1) - aggRows(agg);
1179
1180 for (LO j = 0; j < aggSize; j++) {
1181 // FIXME: Allow for bad maps
1182 const LO localRow = aggToRowMapLO[aggDofSizes[agg]+j];
1183 const size_t rowStart = ia[localRow];
1184 ja[rowStart] = offset;
1185 }
1186 });
1187
1188 // Compress storage (remove all INVALID, which happen when we skip zeros)
1189 // We do that in-place
1190 {
1191 // Stage 2: compress the arrays
1192 SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1193 // Fill i_temp with the correct row starts
1194 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
1195 LO nnz=0;
1196 Kokkos::parallel_scan("MueLu:TentativePF:BlockCrs:compress_rows", range_type(0,numFineBlockRows),
1197 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1198 if(final)
1199 i_temp[i] = upd;
1200 for (auto j = ia[i]; j < ia[i+1]; j++)
1201 if (ja[j] != INVALID)
1202 upd++;
1203 if(final && i == (LO) numFineBlockRows-1)
1204 i_temp[numFineBlockRows] = upd;
1205 },nnz);
1206
1207 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1208
1209
1210 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:compress_cols", range_type(0,numFineBlockRows),
1211 KOKKOS_LAMBDA(const LO i) {
1212 size_t rowStart = i_temp[i];
1213 size_t lnnz = 0;
1214 for (auto j = ia[i]; j < ia[i+1]; j++)
1215 if (ja[j] != INVALID) {
1216 j_temp[rowStart+lnnz] = ja[j];
1217 lnnz++;
1218 }
1219 });
1220
1221 ia = i_temp;
1222 ja = j_temp;
1223 }
1224
1225 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,ia,ja);
1226
1227
1228 // Managing labels & constants for ESFC
1229 {
1230 RCP<ParameterList> FCparams;
1231 if(pL.isSublist("matrixmatrix: kernel params"))
1232 FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1233 else
1234 FCparams= rcp(new ParameterList);
1235 // By default, we don't need global constants for TentativeP
1236 FCparams->set("compute global constants",FCparams->get("compute global constants",false));
1237 std::string levelIDs = toString(levelID);
1238 FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs);
1239 RCP<const Export> dummy_e;
1240 RCP<const Import> dummy_i;
1241 BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams);
1242 }
1243
1244 // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1245 // we clear them here
1246 ia = rows_type();
1247 ja = cols_type();
1248
1249
1250 // Now let's make a BlockCrs Matrix
1251 // NOTE: Assumes block size== NSDim
1252 RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim);
1253 RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
1254 if(P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1255 RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1256
1257 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1258 const LO stride = NSDim*NSDim;
1259
1260 Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1261 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1262 auto agg = thread.league_rank();
1263
1264 // size of the aggregate (number of DOFs in aggregate)
1265 LO aggSize = aggRows(agg+1) - aggRows(agg);
1266 Xpetra::global_size_t offset = agg*NSDim;
1267
1268 // Q = localQR(:,0)/norm
1269 for (LO j = 0; j < aggSize; j++) {
1270 LO localBlockRow = aggToRowMapLO(aggRows(agg)+j);
1271 LO rowStart = localBlockRow * stride;
1272 for (LO r = 0; r < (LO)NSDim; r++) {
1273 LO localPointRow = localBlockRow*NSDim + r;
1274 for (LO c = 0; c < (LO)NSDim; c++) {
1275 values[rowStart + r*NSDim + c] = fineNSRandom(localPointRow,c);
1276 }
1277 }
1278 }
1279
1280 // R = norm
1281 for(LO j=0; j<(LO)NSDim; j++)
1282 coarseNS(offset+j,j) = one;
1283 });
1284
1285 Ptentative = P_wrap;
1286
1287#else
1288 throw std::runtime_error("TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra");
1289#endif
1290 }
1291
1292 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
1294 BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates_kokkos> /* aggregates */,
1295 RCP<AmalgamationInfo_kokkos> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1296 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1297 RCP<MultiVector>& /* coarseNullspace */) const {
1298 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1299 }
1300
1301 template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
1303 isGoodMap(const Map& rowMap, const Map& colMap) const {
1304 auto rowLocalMap = rowMap.getLocalMap();
1305 auto colLocalMap = colMap.getLocalMap();
1306
1307 const size_t numRows = rowLocalMap.getLocalNumElements();
1308 const size_t numCols = colLocalMap.getLocalNumElements();
1309
1310 if (numCols < numRows)
1311 return false;
1312
1313 size_t numDiff = 0;
1314 Kokkos::parallel_reduce("MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1315 KOKKOS_LAMBDA(const LO i, size_t &diff) {
1316 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1317 }, numDiff);
1318
1319 return (numDiff == 0);
1320 }
1321
1322} //namespace MueLu
1323
1324#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1325#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
View
#define SET_VALID_ENTRY(name)
maxAggDofSizeType maxAggDofSize
agg2RowMapLOType agg2RowMapLO
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
Class that holds all level-specific information.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
int GetLevelID() const
Return level number.
static const NoFactory * get()
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line)
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.