755 *
return 1. / (0.05 + 2. * p.square());
761 *
double Coefficient<dim>::value(
const Point<dim> & p,
762 *
const unsigned int component)
const
764 *
return value<double>(p, component);
771 * <a name=
"Matrixfreeimplementation"></a>
772 * <h3>Matrix-
free implementation</h3>
776 * The following
class, called <code>LaplaceOperator</code>, implements the
777 * differential
operator. For all practical purposes, it is a
matrix, i.e.,
778 * you can ask it
for its size (member functions <code>m(), n()</code>) and
779 * you can apply it to a vector (the <code>vmult()</code> function). The
780 * difference to a real
matrix of course lies in the fact that this class
781 * does not actually store the <i>elements</i> of the
matrix, but only knows
782 * how to compute the action of the operator when applied to a vector.
786 * The infrastructure describing the
matrix size, the initialization from a
788 * through vmult() and Tvmult() methods, is provided by the class
789 * MatrixFreeOperator::Base from which this class derives. The
790 * LaplaceOperator class defined here only has to provide a few interfaces,
791 * namely the actual action of the operator through the apply_add() method
792 * that gets used in the vmult()
functions, and a method to compute the
794 * definition of the multigrid smoother. Since we consider a problem with
795 * variable coefficient, we further implement a method that can fill the
800 * Note that the file <code>include/deal.II/matrix_free/operators.h</code>
801 * already contains an implementation of the Laplacian through the class
803 * operator is re-implemented in this tutorial program, explaining the
804 * ingredients and concepts used there.
808 * This program makes use of the data cache for finite element operator
809 * application that is integrated in deal.II. This data cache class is
810 * called
MatrixFree. It contains mapping information (Jacobians) and index
811 * relations between local and global degrees of freedom. It also contains
812 * constraints like the ones from hanging nodes or Dirichlet boundary
813 * conditions. Moreover, it can issue a
loop over all cells in %
parallel,
814 * making sure that only cells are worked on that do not share any degree of
815 * freedom (this makes the
loop thread-safe when writing into destination
816 * vectors). This is a more advanced strategy compared to the
WorkStream
817 * class described in the @ref threads module. Of course, to not destroy
818 * thread-safety, we have to be careful when writing into class-global
823 * The class implementing the Laplace operator has three template arguments,
824 *
one for the dimension (as many deal.II classes carry),
one for the degree
825 * of the finite element (which we need to enable efficient computations
827 * type. We want to use <code>
double</code>
numbers (i.
e.,
double precision,
828 * 64-bit floating
point) for the final
matrix, but floats (single
830 * matrices (as that is only a preconditioner, and floats can be processed
831 * twice as fast). The class
FEEvaluation also takes a template argument for
832 * the number of quadrature points in
one dimension. In the code below, we
833 * hard-code it to <code>fe_degree+1</code>. If we wanted to change it
834 * independently of the polynomial degree, we would need to add a template
839 * As a sidenote, if we implemented several different operations on the same
840 * grid and degrees of freedom (like a mass
matrix and a Laplace
matrix), we
841 * would define two classes like the current
one for each of the operators
845 * only provide a minimal
set of
functions. This concept allows for writing
846 * complex application codes with many
matrix-
free operations.
851 * requires care: Here, we use the deal.II table class which is prepared to
852 * hold the data with correct alignment. However, storing
e.g. an
854 * vectorization:
A certain alignment of the data with the memory address
855 * boundaries is required (essentially, a
VectorizedArray that is 32 bytes
856 *
long in case of AVX needs to start at a memory address that is divisible
857 * by 32). The table class (as well as the
AlignedVector class it is based
858 * on) makes sure that this alignment is respected, whereas
std::vector does
859 * not in
general, which may lead to segmentation faults at strange places
860 * for some systems or suboptimal performance for other systems.
863 * template <
int dim,
int fe_degree, typename number>
864 * class LaplaceOperator
869 *
using value_type = number;
873 *
void clear()
override;
875 *
void evaluate_coefficient(
const Coefficient<dim> &coefficient_function);
880 *
virtual void apply_add(
888 *
const std::pair<unsigned int, unsigned int> &cell_range)
const;
890 *
void local_compute_diagonal(
893 *
const unsigned int & dummy,
894 *
const std::pair<unsigned int, unsigned int> &cell_range)
const;
903 * This is the constructor of the @p LaplaceOperator
class. All it does is
904 * to
call the
default constructor of the base
class
906 *
class that asserts that this class is not accessed after going out of scope
907 *
e.g. in a preconditioner.
910 *
template <
int dim,
int fe_degree,
typename number>
911 * LaplaceOperator<dim, fe_degree, number>::LaplaceOperator()
918 *
template <
int dim,
int fe_degree,
typename number>
919 *
void LaplaceOperator<dim, fe_degree, number>::clear()
921 * coefficient.
reinit(0, 0);
931 * <a name=
"Computationofcoefficient"></a>
932 * <h4>Computation of coefficient</h4>
936 * To initialize the coefficient, we directly give it the Coefficient
class
937 * defined above and then select the method
938 * <code>coefficient_function.value</code> with vectorized number (which the
939 * compiler can deduce from the
point data type). The use of the
940 *
FEEvaluation class (and its
template arguments) will be explained below.
943 * template <int dim, int fe_degree, typename number>
944 *
void LaplaceOperator<dim, fe_degree, number>::evaluate_coefficient(
945 *
const Coefficient<dim> &coefficient_function)
947 *
const unsigned int n_cells = this->data->n_cell_batches();
950 * coefficient.reinit(
n_cells, phi.n_q_points);
951 *
for (
unsigned int cell = 0; cell <
n_cells; ++cell)
954 *
for (
unsigned int q = 0; q < phi.n_q_points; ++q)
955 * coefficient(cell, q) =
956 * coefficient_function.value(phi.quadrature_point(q));
965 * <a name=
"LocalevaluationofLaplaceoperator"></a>
966 * <h4>Local evaluation of Laplace
operator</h4>
970 * Here comes the main function of
this class, the evaluation of the
971 *
matrix-vector product (or, in
general, a finite element
operator
972 * evaluation). This is done in a function that takes exactly four
973 * arguments, the
MatrixFree object, the destination and source vectors, and
974 * a range of cells that are to be worked on. The method
975 * <code>cell_loop</code> in the
MatrixFree class will internally
call this
976 * function with some range of cells that is obtained by checking which
977 * cells are possible to work on simultaneously so that write operations
do
978 * not cause any race condition. Note that the cell range used in the
loop
979 * is not directly the number of (active) cells in the current mesh, but
980 * rather a collection of batches of cells. In other word,
"cell" may be
982 * cells together. This means that in the
loop over quadrature points we are
983 * actually seeing a group of quadrature points of several cells as
one
984 * block. This is done to enable a higher degree of vectorization. The
985 * number of such
"cells" or
"cell batches" is stored in
MatrixFree and can
987 * cell iterators, in
this class all cells are laid out in a plain array
988 * with no direct knowledge of
level or neighborship relations, which makes
989 * it possible to index the cells by
unsigned integers.
993 * The implementation of the Laplace
operator is quite simple: First, we
994 * need to create an
object FEEvaluation that contains the computational
995 * kernels and has data fields to store temporary results (
e.g.
gradients
996 * evaluated on all quadrature points on a collection of a few cells). Note
997 * that temporary results
do not use a lot of memory, and since we specify
998 *
template arguments with the element order, the data is stored on the
999 * stack (without expensive memory allocation). Usually,
one only needs to
1000 *
set two
template arguments, the dimension as a
first argument and the
1001 * degree of the finite element as the
second argument (
this is
equal to the
1002 * number of degrees of freedom per dimension minus
one for FE_Q
1003 * elements). However, here we also want to be able to use
float numbers for
1004 * the multigrid preconditioner, which is the last (fifth)
template
1005 * argument. Therefore, we cannot rely on the
default template arguments and
1006 * must also fill the third and fourth field, consequently. The third
1007 * argument specifies the number of quadrature points per direction and has
1008 * a
default value equal to the degree of the element plus
one. The fourth
1009 * argument sets the number of components (
one can also evaluate
1010 * vector-valued
functions in systems of PDEs, but the
default is a scalar
1011 * element), and
finally the last argument sets the number type.
1015 * Next, we
loop over the given cell range and then we
continue with the
1016 * actual implementation: <ol> <li>Tell the
FEEvaluation object the (macro)
1017 * cell we want to work on. <li>Read in the
values of the source vectors
1018 * (@p read_dof_values), including the resolution of constraints. This
1019 * stores @f$u_\mathrm{cell}@f$ as described in the introduction. <li>Compute
1020 * the unit-cell
gradient (the evaluation of finite element
1022 *
gradient computations, it uses a unified
interface to all kinds of
1023 * derivatives of order between
zero and two. We only want
gradients, no
1024 *
values and no
second derivatives, so we
set the function arguments to
1026 * (
first slot). There is also a third slot
for the Hessian which is
1027 *
false by
default, so it needs not be given. Note that the
FEEvaluation
1028 *
class internally evaluates shape
functions in an efficient way where
one
1029 * dimension is worked on at a time (
using the tensor product form of shape
1030 *
functions and quadrature points as mentioned in the introduction). This
1031 * gives complexity
equal to @f$\mathcal
O(
d^2 (p+1)^{
d+1})@f$
for polynomial
1032 * degree @f$p@f$ in @f$d@f$ dimensions, compared to the naive approach with loops
1033 * over all local degrees of freedom and quadrature points that is used in
1034 *
FEValues and costs @f$\mathcal
O(
d (p+1)^{2
d})@f$. <li>Next comes the
1035 * application of the Jacobian transformation, the multiplication by the
1036 * variable coefficient and the quadrature weight.
FEEvaluation has an
1037 * access function @p get_gradient that applies the Jacobian and returns the
1038 * gradient in real space. Then, we just need to multiply by the (scalar)
1039 * coefficient, and let the function @p submit_gradient apply the
second
1040 * Jacobian (
for the test function) and the quadrature weight and Jacobian
1042 * data field as where it is read from in @p get_gradient. Therefore, you
1043 * need to make sure to not read from the same quadrature
point again after
1044 * having called @p submit_gradient on that particular quadrature
point. In
1045 *
general, it is a good idea to
copy the result of @p get_gradient when it
1046 * is used more often than once. <li>Next follows the summation over
1047 * quadrature points
for all test
functions that corresponds to the actual
1048 * integration step. For the Laplace
operator, we just multiply by the
1049 *
gradient, so we
call the integrate function with the respective argument
1050 *
set. If you have an equation where you test by both the
values of the
1052 * to
true. Calling
first the integrate function
for values and then
1054 *
call will internally overwrite the results from the
first call. Note that
1055 * there is no function argument
for the
second derivative
for integrate
1056 * step. <li>Eventually, the local contributions in the vector
1057 * @f$v_\mathrm{cell}@f$ as mentioned in the introduction need to be added into
1058 * the result vector (and constraints are applied). This is done with a
call
1059 * to @p distribute_local_to_global, the same name as the corresponding
1061 * in the
FEEvaluation object, as are the indices between local and global
1062 * degrees of freedom). </ol>
1065 *
template <
int dim,
int fe_degree,
typename number>
1066 *
void LaplaceOperator<dim, fe_degree, number>::local_apply(
1070 *
const std::pair<unsigned int, unsigned int> & cell_range)
const
1074 *
for (
unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
1080 * phi.read_dof_values(src);
1082 *
for (
unsigned int q = 0; q < phi.n_q_points; ++q)
1083 * phi.submit_gradient(coefficient(cell, q) * phi.get_gradient(q), q);
1085 * phi.distribute_local_to_global(dst);
1093 * This function implements the
loop over all cells
for the
1094 * Base::apply_add() interface. This is done with the @p cell_loop of the
1095 *
MatrixFree class, which takes the operator() of this class with arguments
1096 *
MatrixFree, OutVector, InVector, cell_range. When working with MPI
1097 * parallelization (but no threading) as is done in this tutorial program,
1098 * the cell
loop corresponds to the following three lines of code:
1102 * <div class=CodeFragmentInTutorialComment>
1104 * src.update_ghost_values();
1105 * local_apply(*this->data, dst, src,
std::make_pair(0
U,
1106 * data.n_cell_batches()));
1113 * Here, the two calls update_ghost_values() and
compress() perform the data
1114 * exchange on processor boundaries for MPI, once for the source vector
1115 * where we need to read from entries owned by remote processors, and once
1116 * for the destination vector where we have accumulated parts of the
1117 * residuals that need to be added to the respective entry of the owner
1118 * processor. However,
MatrixFree::cell_loop does not only abstract away
1119 * those two calls, but also performs some additional optimizations. On the
1120 *
one hand, it will
split the update_ghost_values() and
compress() calls in
1121 * a way to allow for overlapping communication and computation. The
1122 * local_apply function is then called with three cell ranges representing
1123 * partitions of the cell range from 0 to
MatrixFree::n_cell_batches(). On
1124 * the other hand, cell_loop also supports thread parallelism in which case
1125 * the cell ranges are
split into smaller chunks and scheduled in an
1126 * advanced way that avoids access to the same vector entry by several
1127 * threads. That feature is explained in @ref step_48 "step-48".
1131 * Note that after the cell
loop, the constrained degrees of freedom need to
1132 * be touched once more for sensible vmult() operators: Since the assembly
1133 *
loop automatically resolves constraints (just as the
1135 * compute any contribution for constrained degrees of freedom, leaving the
1136 * respective entries
zero. This would represent a
matrix that had empty
1137 * rows and columns for constrained degrees of freedom. However, iterative
1138 * solvers like CG only work for non-singular matrices. The easiest way to
1139 * do that is to
set the sub-block of the
matrix that corresponds to
1140 * constrained DoFs to an
identity matrix, in which case application of the
1141 *
matrix would simply
copy the elements of the right hand side vector into
1142 * the left hand side. Fortunately, the vmult() implementations
1144 * apply_add() function, so we do not need to take further action here.
1149 * with MPI, there is
one aspect to be careful about — the indexing
1150 * used for accessing the vector. For performance reasons,
MatrixFree and
1151 *
FEEvaluation are designed to access vectors in MPI-local index space also
1152 * when working with multiple processors. Working in local index space means
1153 * that no index
translation needs to be performed at the place the vector
1154 * access happens, apart from the unavoidable indirect addressing. However,
1155 * local index spaces are ambiguous: While it is standard convention to
1156 * access the locally owned range of a vector with indices between 0 and the
1157 * local size, the numbering is not so clear for the ghosted entries and
1158 * somewhat arbitrary. For the
matrix-vector product, only the indices
1159 * appearing on locally owned cells (plus those referenced via hanging node
1160 * constraints) are necessary. However, in deal.II we often
set all the
1161 * degrees of freedom on ghosted elements as ghosted vector entries, called
1162 * the @ref GlossLocallyRelevantDof "locally relevant DoFs described in the
1163 * glossary". In that case, the MPI-local index of a ghosted vector entry
1164 * can in
general be different in the two possible ghost sets, despite
1165 * referring to the same global index. To avoid problems,
FEEvaluation
1166 * checks that the partitioning of the vector used for the
matrix-vector
1167 * product does indeed match with the partitioning of the indices in
1171 * mechanism to fit the ghost
set to the correct layout. This happens in the
1172 * ghost region of the vector, so keep in mind that the ghost region might
1173 * be modified in both the destination and source vector after a
call to a
1174 * vmult() method. This is legitimate because the ghost region of a
1175 * distributed deal.II vector is a mutable section and filled on
1176 * demand. Vectors used in
matrix-vector products must not be ghosted upon
1177 * entry of vmult()
functions, so no information gets lost.
1180 * template <
int dim,
int fe_degree, typename number>
1181 *
void LaplaceOperator<dim, fe_degree, number>::apply_add(
1185 * this->data->cell_loop(&LaplaceOperator::local_apply,
this, dst, src);
1192 * The following function implements the computation of the
diagonal of the
1194 * turns out to be more complicated than evaluating the
1195 *
operator. Fundamentally, we could obtain a
matrix representation of the
1196 *
operator by applying the
operator on <i>all</i> unit vectors. Of course,
1197 * that would be very inefficient since we would need to perform <i>n</i>
1198 *
operator evaluations to retrieve the whole
matrix. Furthermore,
this
1199 * approach would completely ignore the
matrix sparsity. On an individual
1200 * cell, however,
this is the way to go and actually not that inefficient as
1201 * there usually is a coupling between all degrees of freedom inside the
1207 * layout. This vector is encapsulated in a member called
1208 * inverse_diagonal_entries of type
DiagonalMatrix in the base
class
1210 * need to initialize and then get the vector representing the
diagonal
1211 * entries in the
matrix. As to the actual
diagonal computation, we again
1212 * use the cell_loop infrastructure of
MatrixFree to invoke a local worker
1213 * routine called local_compute_diagonal(). Since we will only write into a
1214 * vector but not have any source vector, we put a dummy argument of type
1215 * <tt>
unsigned int</tt> in place of the source vector to confirm with the
1216 * cell_loop interface. After the
loop, we need to
set the vector entries
1217 * subject to Dirichlet boundary conditions to
one (either those on the
1219 * the indices at the interface between different grid levels in adaptive
1220 * multigrid). This is done through the function
1222 * with the setting in the
matrix-vector product provided by the Base
1223 * operator. Finally, we need to
invert the
diagonal entries which is the
1224 * form required by the Chebyshev smoother based on the Jacobi iteration. In
1225 * the
loop, we assert that all entries are non-
zero, because they should
1226 * either have obtained a positive contribution from integrals or be
1227 * constrained and treated by @p set_constrained_entries_to_one() following
1231 * template <
int dim,
int fe_degree, typename number>
1238 * this->data->initialize_dof_vector(inverse_diagonal);
1239 *
unsigned int dummy = 0;
1240 * this->data->cell_loop(&LaplaceOperator::local_compute_diagonal,
1250 *
ExcMessage(
"No diagonal entry in a positive definite operator "
1251 *
"should be zero"));
1262 * columns in the local
matrix and putting the entry 1 in the <i>i</i>th
1263 * slot and a
zero entry in all other slots, i.e., we apply the cell-wise
1264 * differential
operator on
one unit vector at a time. The inner part
1266 * FEEvalution::integrate, is exactly the same as in the local_apply
1267 * function. Afterwards, we pick out the <i>i</i>th entry of the local
1268 * result and put it to a temporary storage (as we overwrite all entries in
1270 * iteration). Finally, the temporary storage is written to the destination
1272 *
FEEvaluation::submit_dof_value() to read and write to the data field that
1273 *
FEEvaluation uses for the integration on the
one hand and writes into the
1274 * global vector on the other hand.
1278 * Given that we are only interested in the
matrix diagonal, we simply throw
1279 * away all other entries of the local
matrix that have been computed along
1280 * the way. While it might seem wasteful to compute the complete cell
matrix
1281 * and then throw away everything but the
diagonal, the integration are so
1282 * efficient that the computation does not take too much time. Note that the
1283 * complexity of operator evaluation per element is @f$\mathcal
1284 *
O((p+1)^{
d+1})@f$
for polynomial degree @f$k@f$, so computing the whole
matrix
1285 * costs us @f$\mathcal
O((p+1)^{2
d+1})@f$ operations, not too far away from
1286 * @f$\mathcal
O((p+1)^{2
d})@f$ complexity
for computing the
diagonal with
1288 * vectorization and other optimizations, the
diagonal computation with
this
1289 * function is actually the fastest (simple) variant. (It would be possible
1290 * to compute the
diagonal with
sum factorization techniques in @f$\mathcal
1291 *
O((p+1)^{
d+1})@f$ operations involving specifically adapted
1292 * kernels—but since such kernels are only useful in that particular
1293 * context and the
diagonal computation is typically not on the critical
1294 * path, they have not been implemented in deal.II.)
1298 * Note that the code that calls distribute_local_to_global on the vector to
1299 * accumulate the
diagonal entries into the global
matrix has some
1300 * limitations. For operators with hanging node constraints that distribute
1301 * an integral contribution of a constrained DoF to several other entries
1302 * inside the distribute_local_to_global
call, the vector
interface used
1303 * here does not exactly compute the
diagonal entries, but lumps some
1306 * result is correct up to discretization accuracy as explained in <a
1307 * href=
"http://dx.doi.org/10.4208/cicp.101214.021015a">Kormann (2016),
1308 * section 5.3</a>, but not mathematically
equal. In
this tutorial program,
1309 * no harm can happen because the
diagonal is only used
for the multigrid
1310 *
level matrices where no hanging node constraints appear.
1313 *
template <
int dim,
int fe_degree,
typename number>
1314 *
void LaplaceOperator<dim, fe_degree, number>::local_compute_diagonal(
1317 *
const unsigned int &,
1318 *
const std::pair<unsigned int, unsigned int> &cell_range)
const
1324 *
for (
unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
1330 *
for (
unsigned int i = 0; i < phi.dofs_per_cell; ++i)
1332 *
for (
unsigned int j = 0; j < phi.dofs_per_cell; ++j)
1334 * phi.submit_dof_value(make_vectorized_array<number>(1.), i);
1337 *
for (
unsigned int q = 0; q < phi.n_q_points; ++q)
1338 * phi.submit_gradient(coefficient(cell, q) * phi.get_gradient(q),
1341 *
diagonal[i] = phi.get_dof_value(i);
1343 *
for (
unsigned int i = 0; i < phi.dofs_per_cell; ++i)
1344 * phi.submit_dof_value(
diagonal[i], i);
1345 * phi.distribute_local_to_global(dst);
1354 * <a name=
"LaplaceProblemclass"></a>
1355 * <h3>LaplaceProblem
class</h3>
1359 * This
class is based on the
one in @ref step_16 "step-16". However, we replaced the
1361 * that we can also skip the sparsity patterns. Notice that we define the
1362 * LaplaceOperator
class with the degree of finite element as template
1363 * argument (the value is defined at the top of the file), and that we use
1368 * The
class also has a member variable to keep track of all the detailed
1369 * timings
for setting up the entire chain of data before we actually go
1370 * about solving the problem. In addition, there is an output stream (that
1371 * is disabled by
default) that can be used to output details
for the
1372 * individual setup operations instead of the summary only that is printed
1377 * Since
this program is designed to be used with MPI, we also provide the
1378 * usual @p pcout output stream that only prints the information of the
1379 * processor with MPI rank 0. The grid used
for this programs can either be
1380 * a distributed
triangulation based on p4est (in
case deal.II is configured
1381 * to use p4est), otherwise it is a serial grid that only runs without MPI.
1384 *
template <
int dim>
1385 *
class LaplaceProblem
1392 *
void setup_system();
1393 *
void assemble_rhs();
1395 *
void output_results(
const unsigned int cycle)
const;
1397 * #ifdef DEAL_II_WITH_P4EST
1409 *
using SystemMatrixType =
1410 * LaplaceOperator<dim, degree_finite_element, double>;
1411 * SystemMatrixType system_matrix;
1414 *
using LevelMatrixType = LaplaceOperator<dim, degree_finite_element, float>;
1420 *
double setup_time;
1429 * When we initialize the finite element, we of course have to use the
1430 * degree specified at the top of the file as well (otherwise, an exception
1431 * will be thrown at some
point, since the computational kernel defined in
1432 * the templated LaplaceOperator
class and the information from the finite
1433 * element read out by
MatrixFree will not match). The constructor of the
1435 * conform to the 2:1 cell balance over
vertices, which is needed
for the
1436 * convergence of the geometric multigrid routines. For the distributed
1437 * grid, we also need to specifically enable the multigrid hierarchy.
1440 *
template <
int dim>
1441 * LaplaceProblem<dim>::LaplaceProblem()
1443 * #ifdef DEAL_II_WITH_P4EST
1453 * fe(degree_finite_element)
1460 * The LaplaceProblem
class holds an additional output stream that
1461 * collects detailed timings about the setup phase. This stream, called
1462 * time_details, is disabled by
default through the @p
false argument
1463 * specified here. For detailed timings, removing the @p
false argument
1464 * prints all the details.
1467 * time_details(std::cout,
1476 * <a name=
"LaplaceProblemsetup_system"></a>
1477 * <h4>LaplaceProblem::setup_system</h4>
1481 * The setup stage is in analogy to @ref step_16
"step-16" with relevant changes due to the
1483 * including the degrees of freedom
for the multigrid levels, and to
1484 * initialize constraints from hanging nodes and homogeneous Dirichlet
1485 * conditions. Since we intend to use
this programs in %
parallel with MPI,
1486 * we need to make sure that the constraints get to know the locally
1487 * relevant degrees of freedom, otherwise the storage would explode when
1488 *
using more than a few hundred millions of degrees of freedom, see
1489 * @ref step_40
"step-40".
1493 * Once we have created the multigrid dof_handler and the constraints, we
1495 * each
level of the multigrid scheme. The main action is to
set up the
1496 * <code>
MatrixFree </code> instance
for the problem. The base
class of the
1498 * initialized with a shared pointer to
MatrixFree object. This way, we can
1499 * simply create it here and then pass it on to the system
matrix and
level
1500 * matrices, respectively. For setting up
MatrixFree, we need to activate
1501 * the update flag in the AdditionalData field of
MatrixFree that enables
1502 * the storage of quadrature
point coordinates in real space (by
default, it
1503 * only caches data
for gradients (inverse transposed Jacobians) and JxW
1505 *
level (i.e., giving <code>
level = numbers::invalid_unsigned_int</code>),
1506 *
MatrixFree constructs a
loop over the active cells. In
this tutorial, we
1507 *
do not use threads in addition to MPI, which is why we explicitly disable
1510 * and vectors are initialized as explained above.
1513 *
template <
int dim>
1514 *
void LaplaceProblem<dim>::setup_system()
1519 * system_matrix.clear();
1520 * mg_matrices.clear_elements();
1522 * dof_handler.distribute_dofs(fe);
1523 * dof_handler.distribute_mg_dofs();
1525 * pcout <<
"Number of degrees of freedom: " << dof_handler.n_dofs()
1531 * constraints.clear();
1532 * constraints.reinit(locally_relevant_dofs);
1536 * constraints.close();
1538 * time_details <<
"Distribute DoFs & B.C. (CPU/wall) " << time.
cpu_time()
1539 * <<
"s/" << time.
wall_time() <<
"s" << std::endl;
1548 * std::shared_ptr<MatrixFree<dim, double>> system_mf_storage(
1550 * system_mf_storage->reinit(mapping,
1555 * system_matrix.initialize(system_mf_storage);
1558 * system_matrix.evaluate_coefficient(Coefficient<dim>());
1560 * system_matrix.initialize_dof_vector(solution);
1561 * system_matrix.initialize_dof_vector(system_rhs);
1564 * time_details <<
"Setup matrix-free system (CPU/wall) " << time.
cpu_time()
1565 * <<
"s/" << time.
wall_time() <<
"s" << std::endl;
1570 * Next, initialize the matrices
for the multigrid method on all the
1572 * the indices subject to boundary conditions as well as the indices on
1573 * edges between different refinement levels as described in the @ref step_16
"step-16"
1574 * tutorial program. We then go through the levels of the mesh and
1575 * construct the constraints and matrices on each
level. These follow
1576 * closely the construction of the system
matrix on the original mesh,
1577 * except the slight difference in naming when accessing information on
1578 * the levels rather than the active cells.
1581 *
const unsigned int nlevels =
triangulation.n_global_levels();
1582 * mg_matrices.resize(0, nlevels - 1);
1584 * std::set<types::boundary_id> dirichlet_boundary;
1585 * dirichlet_boundary.insert(0);
1586 * mg_constrained_dofs.initialize(dof_handler);
1587 * mg_constrained_dofs.make_zero_boundary_constraints(dof_handler,
1588 * dirichlet_boundary);
1597 * level_constraints.
reinit(relevant_dofs);
1599 * mg_constrained_dofs.get_boundary_indices(
level));
1600 * level_constraints.
close();
1608 * std::shared_ptr<MatrixFree<dim, float>> mg_mf_storage_level(
1610 * mg_mf_storage_level->reinit(mapping,
1612 * level_constraints,
1616 * mg_matrices[
level].initialize(mg_mf_storage_level,
1617 * mg_constrained_dofs,
1619 * mg_matrices[
level].evaluate_coefficient(Coefficient<dim>());
1622 * time_details <<
"Setup matrix-free levels (CPU/wall) " << time.
cpu_time()
1623 * <<
"s/" << time.
wall_time() <<
"s" << std::endl;
1631 * <a name=
"LaplaceProblemassemble_rhs"></a>
1632 * <h4>LaplaceProblem::assemble_rhs</h4>
1636 * The
assemble function is very simple since all we have to
do is to
1638 * cached in the
MatrixFree class, which we query from
1641 * alternative), we must not forget to
call compress() at the
end of the
1642 * assembly to send all the contributions of the right hand side to the
1643 * owner of the respective degree of freedom.
1646 * template <
int dim>
1647 *
void LaplaceProblem<dim>::assemble_rhs()
1653 * *system_matrix.get_matrix_free());
1654 *
for (
unsigned int cell = 0;
1655 * cell < system_matrix.get_matrix_free()->n_cell_batches();
1659 *
for (
unsigned int q = 0; q < phi.n_q_points; ++q)
1660 * phi.submit_value(make_vectorized_array<double>(1.0), q);
1662 * phi.distribute_local_to_global(system_rhs);
1667 * time_details <<
"Assemble right hand side (CPU/wall) " << time.
cpu_time()
1668 * <<
"s/" << time.
wall_time() <<
"s" << std::endl;
1676 * <a name=
"LaplaceProblemsolve"></a>
1677 * <h4>LaplaceProblem::solve</h4>
1681 * The solution process is similar as in @ref step_16
"step-16". We start with the setup of
1684 * interpolation between the grid levels with the same fast
sum
1685 * factorization kernels that get also used in
FEEvaluation.
1688 *
template <
int dim>
1689 *
void LaplaceProblem<dim>::solve()
1693 * mg_transfer.build(dof_handler);
1695 * time_details <<
"MG build transfer time (CPU/wall) " << time.
cpu_time()
1701 * As a smoother,
this tutorial program uses a Chebyshev iteration instead
1702 * of SOR in @ref step_16
"step-16". (SOR would be very difficult to implement because we
1703 *
do not have the
matrix elements available explicitly, and it is
1704 * difficult to make it work efficiently in %
parallel.) The smoother is
1705 * initialized with our
level matrices and the mandatory additional data
1706 *
for the Chebyshev smoother. We use a relatively high degree here (5),
1707 * since
matrix-vector products are comparably cheap. We choose to smooth
1708 * out a range of @f$[1.2 \hat{\lambda}_{
\max}/15,1.2 \hat{\lambda}_{
\max}]@f$
1709 * in the smoother where @f$\hat{\lambda}_{
\max}@f$ is an estimate of the
1710 * largest eigenvalue (the factor 1.2 is applied inside
1712 * Chebyshev initialization performs a few steps of a CG algorithm
1713 * without preconditioner. Since the highest eigenvalue is usually the
1714 * easiest
one to find and a rough estimate is enough, we choose 10
1715 * iterations. Finally, we also
set the inner preconditioner type in the
1716 * Chebyshev method which is a Jacobi iteration. This is represented by
1718 * by our LaplaceOperator
class.
1722 * On
level zero, we initialize the smoother differently because we want
1724 * allows the user to
switch to solver mode where the number of iterations
1725 * is internally chosen to the correct
value. In the additional data
1726 * object,
this setting is activated by choosing the polynomial degree to
1729 *
matrix. The number of steps in the Chebyshev smoother are chosen such
1730 * that the Chebyshev convergence estimates guarantee to reduce the
1731 * residual by the number specified in the variable @p
1732 * smoothing_range. Note that
for solving, @p smoothing_range is a
1733 * relative tolerance and chosen smaller than
one, in
this case, we select
1734 * three orders of magnitude, whereas it is a number larger than 1 when
1739 * From a computational
point of view, the Chebyshev iteration is a very
1740 * attractive coarse grid solver as
long as the coarse size is
1741 * moderate. This is because the Chebyshev method performs only
1742 *
matrix-vector products and vector updates, which typically parallelize
1743 * better to the largest cluster size with more than a few tens of
1744 * thousands of cores than inner product involved in other iterative
1745 * methods. The former involves only local communication between neighbors
1746 * in the (coarse) mesh, whereas the latter
requires global communication
1747 * over all processors.
1750 *
using SmootherType =
1763 * smoother_data[
level].smoothing_range = 15.;
1764 * smoother_data[
level].degree = 5;
1765 * smoother_data[
level].eig_cg_n_iterations = 10;
1769 * smoother_data[0].smoothing_range = 1
e-3;
1771 * smoother_data[0].eig_cg_n_iterations = mg_matrices[0].m();
1773 * mg_matrices[
level].compute_diagonal();
1774 * smoother_data[
level].preconditioner =
1775 * mg_matrices[
level].get_matrix_diagonal_inverse();
1777 * mg_smoother.initialize(mg_matrices, smoother_data);
1785 * The next step is to
set up the
interface matrices that are needed for the
1786 *
case with hanging nodes. The adaptive multigrid realization in deal.II
1787 * implements an approach called local smoothing. This means that the
1788 * smoothing on the finest
level only covers the local part of the mesh
1789 * defined by the fixed (finest) grid
level and ignores parts of the
1790 * computational domain where the terminal cells are coarser than
this
1791 *
level. As the method progresses to coarser levels, more and more of the
1792 * global mesh will be covered. At some coarser
level, the whole mesh will
1793 * be covered. Since all
level matrices in the multigrid method cover a
1794 * single
level in the mesh, no hanging nodes appear on the
level matrices.
1795 * At the
interface between multigrid levels, homogeneous Dirichlet boundary
1796 * conditions are
set while smoothing. When the residual is transferred to
1797 * the next coarser
level, however, the coupling over the multigrid
1798 *
interface needs to be taken into account. This is done by the so-called
1799 * interface (or edge) matrices that compute the part of the residual that
1801 * homogeneous Dirichlet conditions. We refer to the @ref mg_paper
1802 *
"Multigrid paper by Janssen and Kanschat" for more details.
1806 * For the implementation of those
interface matrices, there is already a
1810 *
vmult() and @p
Tvmult() operations (that were originally written for
1811 * matrices, hence expecting those names). Note that vmult_interface_down
1812 * is used during the restriction phase of the multigrid
V-cycle, whereas
1813 * vmult_interface_up is used during the prolongation phase.
1818 * preconditioner infrastructure in complete analogy to @ref step_16 "step-16" to obtain
1819 * a @p preconditioner
object that can be applied to a
matrix.
1826 * mg_interface_matrices;
1827 * mg_interface_matrices.resize(0,
triangulation.n_global_levels() - 1);
1832 * mg_interface_matrices);
1835 * mg_matrix, mg_coarse, mg_transfer, mg_smoother, mg_smoother);
1836 *
mg.set_edge_matrices(mg_interface, mg_interface);
1841 * preconditioner(dof_handler,
mg, mg_transfer);
1845 * The setup of the multigrid routines is quite easy and
one cannot see
1846 * any difference in the solve process as compared to @ref step_16 "step-16". All the
1847 * magic is hidden behind the implementation of the LaplaceOperator::
vmult
1848 * operation. Note that we print out the solve time and the accumulated
1849 * setup time through standard out, i.
e., in any case, whereas detailed
1850 * times for the setup operations are only printed in case the flag for
1851 * detail_times in the constructor is changed.
1859 * setup_time += time.wall_time();
1860 * time_details << "MG build smoother time (CPU/wall) " << time.cpu_time()
1861 * << "s/" << time.wall_time() << "s\n";
1862 * pcout << "Total setup time (wall) " << setup_time << "s\n";
1866 * constraints.set_zero(solution);
1867 * cg.solve(system_matrix, solution, system_rhs, preconditioner);
1869 * constraints.distribute(solution);
1871 * pcout << "Time solve (" << solver_control.last_step() << " iterations)"
1872 * << (solver_control.last_step() < 10 ? " " : " ") << "(CPU/wall) "
1873 * << time.cpu_time() << "s/" << time.wall_time() << "s\n";
1881 * <a name="LaplaceProblemoutput_results"></a>
1882 * <h4>LaplaceProblem::output_results</h4>
1886 * Here is the data output, which is a simplified version of @ref step_5 "step-5". We use
1887 * the standard VTU (= compressed VTK) output for each grid produced in the
1888 * refinement process. In addition, we use a compression algorithm that is
1889 * optimized for speed rather than disk usage. The default setting (which
1890 * optimizes for disk usage) makes saving the output take about 4 times as
1891 *
long as running the linear solver, while setting
1893 *
DataOutBase::VtkFlags::best_speed lowers this to only
one fourth the time
1894 * of the linear solve.
1898 * We disable the output when the mesh gets too large.
A variant of this
1899 * program has been
run on hundreds of thousands MPI ranks with as many as
1900 * 100 billion grid cells, which is not directly accessible to classical
1901 * visualization tools.
1904 * template <
int dim>
1905 *
void LaplaceProblem<dim>::output_results(const
unsigned int cycle) const
1913 * solution.update_ghost_values();
1922 *
"./",
"solution", cycle, MPI_COMM_WORLD, 3);
1924 * time_details <<
"Time write output (CPU/wall) " << time.
cpu_time()
1933 * <a name=
"LaplaceProblemrun"></a>
1938 * The function that runs the program is very similar to the
one in
1939 * @ref step_16
"step-16". We
do few refinement steps in 3D compared to 2D, but that
's
1944 * Before we run the program, we output some information about the detected
1945 * vectorization level as discussed in the introduction.
1948 * template <int dim>
1949 * void LaplaceProblem<dim>::run()
1952 * const unsigned int n_vect_doubles = VectorizedArray<double>::size();
1953 * const unsigned int n_vect_bits = 8 * sizeof(double) * n_vect_doubles;
1955 * pcout << "Vectorization over " << n_vect_doubles
1956 * << " doubles = " << n_vect_bits << " bits ("
1957 * << Utilities::System::get_current_vectorization_level() << ")"
1961 * for (unsigned int cycle = 0; cycle < 9 - dim; ++cycle)
1963 * pcout << "Cycle " << cycle << std::endl;
1967 * GridGenerator::hyper_cube(triangulation, 0., 1.);
1968 * triangulation.refine_global(3 - dim);
1970 * triangulation.refine_global(1);
1974 * output_results(cycle);
1975 * pcout << std::endl;
1978 * } // namespace Step37
1985 * <a name="Thecodemaincodefunction"></a>
1986 * <h3>The <code>main</code> function</h3>
1990 * Apart from the fact that we set up the MPI framework according to @ref step_40 "step-40",
1991 * there are no surprises in the main function.
1994 * int main(int argc, char *argv[])
1998 * using namespace Step37;
2000 * Utilities::MPI::MPI_InitFinalize mpi_init(argc, argv, 1);
2002 * LaplaceProblem<dimension> laplace_problem;
2003 * laplace_problem.run();
2005 * catch (std::exception &exc)
2007 * std::cerr << std::endl
2009 * << "----------------------------------------------------"
2011 * std::cerr << "Exception on processing: " << std::endl
2012 * << exc.what() << std::endl
2013 * << "Aborting!" << std::endl
2014 * << "----------------------------------------------------"
2020 * std::cerr << std::endl
2022 * << "----------------------------------------------------"
2024 * std::cerr << "Unknown exception!" << std::endl
2025 * << "Aborting!" << std::endl
2026 * << "----------------------------------------------------"
2034<a name="Results"></a><h1>Results</h1>
2037<a name="Programoutput"></a><h3>Program output</h3>
2040Since this example solves the same problem as @ref step_5 "step-5" (except for
2041a different coefficient), there is little to say about the
2042solution. We show a picture anyway, illustrating the size of the
2043solution through both isocontours and volume rendering:
2045<img src="https://www.dealii.org/images/steps/developer/step-37.solution.png" alt="">
2047Of more interest is to evaluate some aspects of the multigrid solver.
2048When we run this program in 2D for quadratic (@f$Q_2@f$) elements, we get the
2049following output (when run on one core in release mode):
2051Vectorization over 2 doubles = 128 bits (SSE2)
2053Number of degrees of freedom: 81
2054Total setup time (wall) 0.00159788s
2055Time solve (6 iterations) (CPU/wall) 0.000951s/0.000951052s
2058Number of degrees of freedom: 289
2059Total setup time (wall) 0.00114608s
2060Time solve (6 iterations) (CPU/wall) 0.000935s/0.000934839s
2063Number of degrees of freedom: 1089
2064Total setup time (wall) 0.00244665s
2065Time solve (6 iterations) (CPU/wall) 0.00207s/0.002069s
2068Number of degrees of freedom: 4225
2069Total setup time (wall) 0.00678205s
2070Time solve (6 iterations) (CPU/wall) 0.005616s/0.00561595s
2073Number of degrees of freedom: 16641
2074Total setup time (wall) 0.0241671s
2075Time solve (6 iterations) (CPU/wall) 0.019543s/0.0195441s
2078Number of degrees of freedom: 66049
2079Total setup time (wall) 0.0967851s
2080Time solve (6 iterations) (CPU/wall) 0.07457s/0.0745709s
2083Number of degrees of freedom: 263169
2084Total setup time (wall) 0.346374s
2085Time solve (6 iterations) (CPU/wall) 0.260042s/0.265033s
2088As in @ref step_16 "step-16", we see that the number of CG iterations remains constant with
2089increasing number of degrees of freedom. A constant number of iterations
2090(together with optimal computational properties) means that the computing time
2091approximately quadruples as the problem size quadruples from one cycle to the
2092next. The code is also very efficient in terms of storage. Around 2-4 million
2093degrees of freedom fit into 1 GB of memory, see also the MPI results below. An
2094interesting fact is that solving one linear system is cheaper than the setup,
2095despite not building a matrix (approximately half of which is spent in the
2096DoFHandler::distribute_dofs() and DoFHandler::distribute_mg_dofs()
2097calls). This shows the high efficiency of this approach, but also that the
2098deal.II data structures are quite expensive to set up and the setup cost must
2099be amortized over several system solves.
2101Not much changes if we run the program in three spatial dimensions. Since we
2102use uniform mesh refinement, we get eight times as many elements and
2103approximately eight times as many degrees of freedom with each cycle:
2106Vectorization over 2 doubles = 128 bits (SSE2)
2108Number of degrees of freedom: 125
2109Total setup time (wall) 0.00231099s
2110Time solve (6 iterations) (CPU/wall) 0.000692s/0.000922918s
2113Number of degrees of freedom: 729
2114Total setup time (wall) 0.00289083s
2115Time solve (6 iterations) (CPU/wall) 0.001534s/0.0024128s
2118Number of degrees of freedom: 4913
2119Total setup time (wall) 0.0143182s
2120Time solve (6 iterations) (CPU/wall) 0.010785s/0.0107841s
2123Number of degrees of freedom: 35937
2124Total setup time (wall) 0.087064s
2125Time solve (6 iterations) (CPU/wall) 0.063522s/0.06545s
2128Number of degrees of freedom: 274625
2129Total setup time (wall) 0.596306s
2130Time solve (6 iterations) (CPU/wall) 0.427757s/0.431765s
2133Number of degrees of freedom: 2146689
2134Total setup time (wall) 4.96491s
2135Time solve (6 iterations) (CPU/wall) 3.53126s/3.56142s
2138Since it is so easy, we look at what happens if we increase the polynomial
2139degree. When selecting the degree as four in 3D, i.e., on @f$\mathcal Q_4@f$
2140elements, by changing the line <code>const unsigned int
2141degree_finite_element=4;</code> at the top of the program, we get the
2142following program output:
2145Vectorization over 2 doubles = 128 bits (SSE2)
2147Number of degrees of freedom: 729
2148Total setup time (wall) 0.00633097s
2149Time solve (6 iterations) (CPU/wall) 0.002829s/0.00379395s
2152Number of degrees of freedom: 4913
2153Total setup time (wall) 0.0174279s
2154Time solve (6 iterations) (CPU/wall) 0.012255s/0.012254s
2157Number of degrees of freedom: 35937
2158Total setup time (wall) 0.082655s
2159Time solve (6 iterations) (CPU/wall) 0.052362s/0.0523629s
2162Number of degrees of freedom: 274625
2163Total setup time (wall) 0.507943s
2164Time solve (6 iterations) (CPU/wall) 0.341811s/0.345788s
2167Number of degrees of freedom: 2146689
2168Total setup time (wall) 3.46251s
2169Time solve (7 iterations) (CPU/wall) 3.29638s/3.3265s
2172Number of degrees of freedom: 16974593
2173Total setup time (wall) 27.8989s
2174Time solve (7 iterations) (CPU/wall) 26.3705s/27.1077s
2177Since @f$\mathcal Q_4@f$ elements on a certain mesh correspond to @f$\mathcal Q_2@f$
2178elements on half the mesh size, we can compare the run time at cycle 4 with
2179fourth degree polynomials with cycle 5 using quadratic polynomials, both at
21802.1 million degrees of freedom. The surprising effect is that the solver for
2181@f$\mathcal Q_4@f$ element is actually slightly faster than for the quadratic
2182case, despite using one more linear iteration. The effect that higher-degree
2183polynomials are similarly fast or even faster than lower degree ones is one of
2184the main strengths of matrix-free operator evaluation through sum
2185factorization, see the <a
2186href="http://dx.doi.org/10.1016/j.compfluid.2012.04.012">matrix-free
2187paper</a>. This is fundamentally different to matrix-based methods that get
2188more expensive per unknown as the polynomial degree increases and the coupling
2191In addition, also the setup gets a bit cheaper for higher order, which is
2192because fewer elements need to be set up.
2194Finally, let us look at the timings with degree 8, which corresponds to
2195another round of mesh refinement in the lower order methods:
2198Vectorization over 2 doubles = 128 bits (SSE2)
2200Number of degrees of freedom: 4913
2201Total setup time (wall) 0.0842004s
2202Time solve (8 iterations) (CPU/wall) 0.019296s/0.0192959s
2205Number of degrees of freedom: 35937
2206Total setup time (wall) 0.327048s
2207Time solve (8 iterations) (CPU/wall) 0.07517s/0.075999s
2210Number of degrees of freedom: 274625
2211Total setup time (wall) 2.12335s
2212Time solve (8 iterations) (CPU/wall) 0.448739s/0.453698s
2215Number of degrees of freedom: 2146689
2216Total setup time (wall) 16.1743s
2217Time solve (8 iterations) (CPU/wall) 3.95003s/3.97717s
2220Number of degrees of freedom: 16974593
2221Total setup time (wall) 130.8s
2222Time solve (8 iterations) (CPU/wall) 31.0316s/31.767s
2225Here, the initialization seems considerably slower than before, which is
2226mainly due to the computation of the diagonal of the matrix, which actually
2227computes a 729 x 729 matrix on each cell and throws away everything but the
2228diagonal. The solver times, however, are again very close to the quartic case,
2229showing that the linear increase with the polynomial degree that is
2230theoretically expected is almost completely offset by better computational
2231characteristics and the fact that higher order methods have a smaller share of
2232degrees of freedom living on several cells that add to the evaluation
2235<a name="Comparisonwithasparsematrix"></a><h3>Comparison with a sparse matrix</h3>
2238In order to understand the capabilities of the matrix-free implementation, we
2239compare the performance of the 3d example above with a sparse matrix
2240implementation based on TrilinosWrappers::SparseMatrix by measuring both the
2241computation times for the initialization of the problem (distribute DoFs,
2242setup and assemble matrices, setup multigrid structures) and the actual
2243solution for the matrix-free variant and the variant based on sparse
2244matrices. We base the preconditioner on float numbers and the actual matrix
2245and vectors on double numbers, as shown above. Tests are run on an Intel Core
2246i7-5500U notebook processor (two cores and <a
2247href="http://en.wikipedia.org/wiki/Advanced_Vector_Extensions">AVX</a>
2248support, i.e., four operations on doubles can be done with one CPU
2249instruction, which is heavily used in FEEvaluation), optimized mode, and two
2252<table align="center" class="doxtable">
2255 <th colspan="2">Sparse matrix</th>
2256 <th colspan="2">Matrix-free implementation</th>
2260 <th>Setup + assemble</th>
2261 <th> Solve </th>
2262 <th>Setup + assemble</th>
2263 <th> Solve </th>
2266 <td align="right">125</td>
2267 <td align="center">0.0042s</td>
2268 <td align="center">0.0012s</td>
2269 <td align="center">0.0022s</td>
2270 <td align="center">0.00095s</td>
2273 <td align="right">729</td>
2274 <td align="center">0.012s</td>
2275 <td align="center">0.0040s</td>
2276 <td align="center">0.0027s</td>
2277 <td align="center">0.0021s</td>
2280 <td align="right">4,913</td>
2281 <td align="center">0.082s</td>
2282 <td align="center">0.012s</td>
2283 <td align="center">0.011s</td>
2284 <td align="center">0.0057s</td>
2287 <td align="right">35,937</td>
2288 <td align="center">0.73s</td>
2289 <td align="center">0.13s</td>
2290 <td align="center">0.048s</td>
2291 <td align="center">0.040s</td>
2294 <td align="right">274,625</td>
2295 <td align="center">5.43s</td>
2296 <td align="center">1.01s</td>
2297 <td align="center">0.33s</td>
2298 <td align="center">0.25s</td>
2301 <td align="right">2,146,689</td>
2302 <td align="center">43.8s</td>
2303 <td align="center">8.24s</td>
2304 <td align="center">2.42s</td>
2305 <td align="center">2.06s</td>
2309The table clearly shows that the matrix-free implementation is more than twice
2310as fast for the solver, and more than six times as fast when it comes to
2311initialization costs. As the problem size is made a factor 8 larger, we note
2312that the times usually go up by a factor eight, too (as the solver iterations
2313are constant at six). The main deviation is in the sparse matrix between 5k
2314and 36k degrees of freedom, where the time increases by a factor 12. This is
2315the threshold where the (L3) cache in the processor can no longer hold all
2316data necessary for the matrix-vector products and all matrix elements must be
2317fetched from main memory.
2319Of course, this picture does not necessarily translate to all cases, as there
2320are problems where knowledge of matrix entries enables much better solvers (as
2321happens when the coefficient is varying more strongly than in the above
2322example). Moreover, it also depends on the computer system. The present system
2323has good memory performance, so sparse matrices perform comparably
2324well. Nonetheless, the matrix-free implementation gives a nice speedup already
2325for the <i>Q</i><sub>2</sub> elements used in this example. This becomes
2326particularly apparent for time-dependent or nonlinear problems where sparse
2327matrices would need to be reassembled over and over again, which becomes much
2328easier with this class. And of course, thanks to the better complexity of the
2329products, the method gains increasingly larger advantages when the order of the
2330elements increases (the matrix-free implementation has costs
23314<i>d</i><sup>2</sup><i>p</i> per degree of freedom, compared to
23322<i>p<sup>d</sup></i> for the sparse matrix, so it will win anyway for order 4
2335<a name="ResultsforlargescaleparallelcomputationsonSuperMUC"></a><h3> Results for large-scale parallel computations on SuperMUC</h3>
2338As explained in the introduction and the in-code comments, this program can be
2339run in parallel with MPI. It turns out that geometric multigrid schemes work
2340really well and can scale to very large machines. To the authors' knowledge,
2341the geometric multigrid results shown here are the largest computations done
2342with deal.II as of late 2016,
run on up to 147,456 cores of the <a
2343href=
"https://www.lrz.de/services/compute/supermuc/systemdescription/">complete
2344SuperMUC Phase 1</a>. The ingredients
for scalability beyond 1000 cores are
2345that no data structure that depends on the global problem size is held in its
2346entirety on a single processor and that the communication is not too frequent
2347in order not to
run into latency issues of the network. For PDEs solved with
2348iterative solvers, the communication latency is often the limiting factor,
2349rather than the throughput of the network. For the example of the SuperMUC
2350system, the
point-to-
point latency between two processors is between 1
e-6 and
23511
e-5 seconds, depending on the proximity in the MPI network. The
matrix-vector
2352products with @p LaplaceOperator from
this class involves several
2353point-to-
point communication steps, interleaved with computations on each
2354core. The resulting latency of a
matrix-vector product is around 1
e-4
2355seconds. Global communication, for example an @p MPI_Allreduce operation that
2356accumulates the
sum of a single number per rank over all ranks in the MPI
2357network, has a latency of 1
e-4 seconds. The multigrid
V-cycle used in this
2358program is also a form of global communication. Think about the coarse grid
2359solve that happens on a single processor: It accumulates the contributions
2360from all processors before it starts. When completed, the coarse grid solution
2361is transferred to finer levels, where more and more processors help in
2362smoothing until the fine grid. Essentially, this is a tree-like pattern over
2363the processors in the network and controlled by the mesh. As opposed to the
2364@p MPI_Allreduce operations where the tree in the
reduction is optimized to the
2365actual links in the MPI network, the multigrid
V-cycle does
this according to
2366the partitioning of the mesh. Thus, we cannot expect the same
2367optimality. Furthermore, the multigrid cycle is not simply a walk up and down
2368the refinement tree, but also communication on each
level when doing the
2369smoothing. In other words, the global communication in multigrid is more
2370challenging and related to the mesh that provides less optimization
2371opportunities. The measured latency of the
V-cycle is between 6
e-3 and 2
e-2
2372seconds, i.e., the same as 60 to 200 MPI_Allreduce operations.
2374The following figure shows a scaling experiments on @f$\mathcal Q_3@f$
2375elements. Along the lines, the problem size is held constant as the number of
2376cores is increasing. When doubling the number of cores,
one expects a halving
2377of the computational time, indicated by the dotted gray lines. The results
2378show that the implementation shows almost ideal behavior until an absolute
2379time of around 0.1 seconds is reached. The solver tolerances have been
set
2380such that the solver performs five iterations. This way of plotting data is
2381the <b>strong scaling</
b> of the algorithm. As we go to very large core
2382counts, the curves flatten out a bit earlier, which is because of the
2383communication network in SuperMUC where communication between processors
2384farther away is slightly slower.
2386<img src=
"https://www.dealii.org/images/steps/developer/step-37.scaling_strong.png" alt=
"">
2388In addition, the plot also contains results for <b>weak scaling</
b> that lists
2389how the algorithm behaves as both the number of processor cores and elements
2390is increased at the same pace. In
this situation, we expect that the compute
2391time remains constant. Algorithmically, the number of CG iterations is
2392constant at 5, so we are good from that
end. The lines in the plot are
2393arranged such that the top left
point in each data series represents the same
2394size per processor, namely 131,072 elements (or approximately 3.5 million
2395degrees of freedom per core). The gray lines indicating ideal strong scaling
2396are by the same factor of 8 apart. The results show again that the scaling is
2397almost ideal. The
parallel efficiency when going from 288 cores to 147,456
2398cores is at around 75%
for a local problem size of 750,000 degrees of freedom
2399per core which takes 1.0s on 288 cores, 1.03s on 2304 cores, 1.19s on 18k
2400cores, and 1.35s on 147k cores. The algorithms also reach a very high
2401utilization of the processor. The largest computation on 147k cores reaches
2402around 1.7 PFLOPs/s on SuperMUC out of an arithmetic peak of 3.2 PFLOPs/s. For
2403an iterative PDE solver,
this is a very high number and significantly more is
2404often only reached
for dense linear algebra. Sparse linear algebra is limited
2405to a tenth of
this value.
2407As mentioned in the introduction, the
matrix-
free method reduces the memory
2408consumption of the data structures. Besides the higher performance due to less
2409memory transfer, the algorithms also allow
for very large problems to fit into
2410memory. The figure below shows the computational time as we increase the
2411problem size until an upper limit where the computation exhausts memory. We
do
2412this for 1k cores, 8k cores, and 65k cores and see that the problem size can
2413be varied over almost two orders of magnitude with ideal scaling. The largest
2414computation shown in
this picture involves 292 billion (@f$2.92 \cdot 10^{11}@f$)
2415degrees of freedom. On a DG computation of 147k cores, the above algorithms
2416were also
run involving up to 549 billion (2^39) DoFs.
2420Finally, we note that while performing the tests on the large-
scale system
2421shown above, improvements of the multigrid algorithms in deal.II have been
2422developed. The original version contained the sub-optimal code based on
2424all vector entries are
zero) were done on each smoothing
2425operation on each
level, which only became apparent on 65k cores and
2426more. However, the following picture shows that the improvement already pay
2427off on a smaller
scale, here shown on computations on up to 14,336 cores for
2428@f$\mathcal Q_5@f$ elements:
2433<a name="Adaptivity"></a><h3> Adaptivity</h3>
2436As explained in the code, the algorithm presented here is prepared to
run on
2437adaptively refined meshes. If only part of the mesh is refined, the multigrid
2438cycle will
run with local smoothing and impose Dirichlet conditions along the
2439interfaces which differ in refinement
level for smoothing through the
2441distributed over levels, relating the owner of the
level cells to the owner of
2442the
first descendant active cell, there can be an imbalance between different
2443processors in MPI, which limits scalability by a factor of around two to five.
2445<a name="Possibilitiesforextensions"></a><h3> Possibilities for extensions</h3>
2448<a name="Kellyerrorestimator"></a><h4> Kelly error estimator </h4>
2451As mentioned above the code is ready for locally adaptive h-refinement.
2452For the Poisson equation
one can employ the Kelly error indicator,
2454with the ghost indices of
parallel vectors.
2455In order to evaluate the jump terms in the error indicator, each MPI process
2456needs to know locally relevant DoFs.
2457However
MatrixFree::initialize_dof_vector() function initializes the vector only with
2458some locally relevant DoFs.
2459The ghost indices made available in the vector are a tight
set of only those indices
2460that are touched in the cell integrals (including constraint resolution).
2461This choice has performance reasons, because sending all locally relevant degrees
2462of freedom would be too expensive compared to the
matrix-vector product.
2463Consequently the solution vector as-is is
2465The trick is to change the ghost part of the
partition, for example using a
2473solution.
reinit(dof_handler.locally_owned_dofs(),
2474 locally_relevant_dofs,
2476solution.copy_locally_owned_data_from(copy_vec);
2477constraints.distribute(solution);
2478solution.update_ghost_values();
2481<a name="Sharedmemoryparallelization"></a><h4> Shared-memory parallelization</h4>
2484This program is parallelized with MPI only. As an alternative, the
MatrixFree
2485loop can also be issued in hybrid mode, for example by using MPI parallelizing
2486over the nodes of a cluster and with threads through Intel TBB within the
2487shared memory region of
one node. To use this,
one would need to both
set the
2488number of threads in the MPI_InitFinalize data structure in the main function,
2489and
set the
MatrixFree::AdditionalData::tasks_parallel_scheme to
2490partition_color to actually do the
loop in
parallel. This use case is
2491discussed in @ref step_48 "step-48".
2493<a name="InhomogeneousDirichletboundaryconditions"></a><h4> Inhomogeneous Dirichlet boundary conditions </h4>
2496The presented program assumes homogeneous Dirichlet boundary conditions. When
2497going to non-homogeneous conditions, the situation is a bit more intricate. To
2498understand how to implement such a setting, let us
first recall how these
2499arise in the mathematical formulation and how they are implemented in a
2500matrix-based variant. In essence, an inhomogeneous Dirichlet condition sets
2501some of the nodal
values in the solution to given
values rather than
2502determining them through the variational principles,
2504u_h(\mathbf{x}) = \sum_{i\in \mathcal
N} \varphi_i(\mathbf{x}) u_i =
2505\sum_{i\in \mathcal N \setminus \mathcal N_D} \varphi_i(\mathbf{x}) u_i +
2506\sum_{i\in \mathcal N_D} \varphi_i(\mathbf{x}) g_i,
2508where @f$u_i@f$ denotes the nodal
values of the solution and @f$\mathcal
N@f$ denotes
2509the
set of all nodes. The
set @f$\mathcal N_D\subset \mathcal
N@f$ is the subset
2510of the nodes that are subject to Dirichlet boundary conditions where the
2511solution is forced to
equal @f$u_i = g_i = g(\mathbf{x}_i)@f$ as the interpolation
2512of boundary
values on the Dirichlet-constrained node points @f$i\in \mathcal
2513N_D@f$. We then
insert this solution
2514representation into the weak form,
e.g. the Laplacian shown above, and move
2515the known quantities to the right hand side:
2517(\nabla \varphi_i, \nabla u_h)_\Omega &=& (\varphi_i, f)_\Omega \quad \Rightarrow \\
2518\sum_{j\in \mathcal N \setminus \mathcal N_D}(\nabla \varphi_i,\nabla \varphi_j)_\Omega \, u_j &=&
2519(\varphi_i, f)_\Omega
2520-\sum_{j\in \mathcal N_D} (\nabla \varphi_i,\nabla\varphi_j)_\Omega\, g_j.
2522In
this formula, the equations are tested
for all basis
functions @f$\varphi_i@f$
2523with @f$i\in N \setminus \mathcal N_D@f$ that are not related to the nodes
2524constrained by Dirichlet conditions.
2526In the implementation in deal.II, the integrals @f$(\nabla \varphi_i,\nabla \varphi_j)_\Omega@f$
2527on the right hand side are already contained in the local
matrix contributions
2528we
assemble on each cell. When
using
2530@ref step_6 "step-6" and @ref step_7 "step-7" tutorial programs, we can account for the contribution of
2531inhomogeneous constraints <i>j</i> by multiplying the columns <i>j</i> and
2532rows <i>i</i> of the local
matrix according to the integrals @f$(\varphi_i,
2533\varphi_j)_\Omega@f$ by the inhomogeneities and subtracting the resulting from
2534the position <i>i</i> in the global right-hand-side vector, see also the @ref
2535constraints module. In essence, we use some of the integrals that get
2536eliminated from the left hand side of the equation to finalize the right hand
2537side contribution. Similar mathematics are also involved when
first writing
2538all entries into a left hand side
matrix and then eliminating
matrix rows and
2541In principle, the components that belong to the constrained degrees of freedom
2542could be eliminated from the linear system because they do not carry any
2543information. In practice, in deal.II we
always keep the size of the linear
2544system the same to avoid handling two different numbering systems and avoid
2545confusion about the two different index sets. In order to ensure that the
2546linear system does not get singular when not adding anything to constrained
2547rows, we then add dummy entries to the
matrix diagonal that are otherwise
2548unrelated to the real entries.
2550In a
matrix-
free method, we need to take a different approach, since the @p
2551LaplaceOperator class represents the
matrix-vector product of a
2552<
b>homogeneous</
b> operator (the left-hand side of the last formula). It does
2555MatrixFree::cell_loop()
call will only resolve the homogeneous part of the
2556constraints as
long as it represents a <
b>linear</
b> operator.
2558In our
matrix-
free code, the right hand side computation where the
2559contribution of inhomogeneous conditions ends up is completely decoupled from
2560the
matrix operator and handled by a different function above. Thus, we need
2561to explicitly generate the data that enters the right hand side rather than
2562using a byproduct of the
matrix assembly. Since we already know how to apply
2563the operator on a vector, we could try to use those facilities for a vector
2571 BoundaryValueFunction<dim>(),
2574 if (solution.locally_owned_elements().is_element(pair.
first))
2577or, equivalently, if we already had filled the inhomogeneous constraints into
2581 constraints.distribute(solution);
2584We could then pass the vector @p solution to the @p
2585LaplaceOperator::vmult_add() function and add the new contribution to the @p
2586system_rhs vector that gets filled in the @p LaplaceProblem::assemble_rhs()
2587function. However, this idea does not work because the
2589homogeneous
values on all constraints (otherwise the operator would not be a
2591inhomogeneities, we could select
one of two following strategies.
2593<a name="UseFEEvaluationread_dof_values_plaintoavoidresolvingconstraints"></a><h5> Use
FEEvaluation::read_dof_values_plain() to avoid resolving constraints </h5>
2596The class
FEEvaluation has a facility that addresses precisely this
2597requirement: For non-homogeneous Dirichlet
values, we do want to skip the
2598implicit imposition of homogeneous (Dirichlet) constraints upon reading the
2599data from the vector @p solution. For example, we could extend the @p
2600LaplaceProblem::assemble_rhs() function to deal with inhomogeneous Dirichlet
2601values as follows, assuming the Dirichlet
values have been interpolated into
2602the
object @p constraints:
2605void LaplaceProblem<dim>::assemble_rhs()
2608 constraints.distribute(solution);
2609 solution.update_ghost_values();
2614 for (
unsigned int cell = 0;
2615 cell < system_matrix.get_matrix_free()->n_cell_batches();
2619 phi.read_dof_values_plain(solution);
2621 for (
unsigned int q = 0; q < phi.n_q_points; ++q)
2623 phi.submit_gradient(-coefficient(cell, q) * phi.get_gradient(q), q);
2624 phi.submit_value(make_vectorized_array<double>(1.0), q);
2627 phi.distribute_local_to_global(system_rhs);
2634tentative solution vector by
FEEvaluation::read_dof_values_plain() that
2635ignores all constraints. Due to this setup, we must make sure that other
2636constraints,
e.g. by hanging nodes, are correctly distributed to the input
2637vector already as they are not resolved as in
2638FEEvaluation::read_dof_values_plain(). Inside the
loop, we then evaluate the
2639Laplacian and repeat the
second derivative
call with
2640FEEvaluation::submit_gradient() from the @p LaplaceOperator class, but with the
2641sign switched since we wanted to subtract the contribution of Dirichlet
2642conditions on the right hand side vector according to the formula above. When
2644regarding the value slot and
first derivative slot to true to account for both
2645terms added in the
loop over quadrature points. Once the right hand side is
2646assembled, we then go on to solving the linear system for the homogeneous
2647problem, say involving a variable @p solution_update. After solving, we can
2648add @p solution_update to the @p solution vector that contains the final
2649(inhomogeneous) solution.
2651Note that the negative
sign for the Laplacian alongside with a positive
sign
2652for the forcing that we needed to build the right hand side is a more
general
2653concept: We have implemented
nothing else than Newton's method for nonlinear
2654equations, but applied to a linear system. We have used an
initial guess for
2655the variable @p solution in terms of the Dirichlet boundary conditions and
2656computed a residual @f$r = f - Au_0@f$. The linear system was then solved as
2657@f$\Delta u =
A^{-1} (f-Au)@f$ and we
finally computed @f$u = u_0 + \Delta u@f$. For a
2658linear system, we obviously reach the exact solution after a single
2659iteration. If we wanted to extend the code to a nonlinear problem, we would
2660rename the @p assemble_rhs() function into a more descriptive name like @p
2661assemble_residual() that computes the (weak) form of the residual, whereas the
2662@p LaplaceOperator::apply_add() function would get the linearization of the
2663residual with respect to the solution variable.
2665<a name="UseLaplaceOperatorwithasecondAffineConstraintsobjectwithoutDirichletconditions"></a><h5> Use LaplaceOperator with a
second AffineConstraints object without Dirichlet conditions </h5>
2668A second alternative to get the right hand side that re-uses the @p
2669LaplaceOperator::apply_add() function is to instead add a
second LaplaceOperator
2670that skips Dirichlet constraints. To do this, we initialize a
second MatrixFree
2671object which does not have any boundary value constraints. This @p matrix_free
2672object is then passed to a @p LaplaceOperator class instance @p
2673inhomogeneous_operator that is only used to create the right hand side:
2676void LaplaceProblem<dim>::assemble_rhs()
2680 no_constraints.
close();
2681 LaplaceOperator<dim, degree_finite_element, double> inhomogeneous_operator;
2686 std::shared_ptr<MatrixFree<dim, double>> matrix_free(
2688 matrix_free->reinit(dof_handler,
2692 inhomogeneous_operator.initialize(matrix_free);
2695 constraints.distribute(solution);
2696 inhomogeneous_operator.evaluate_coefficient(Coefficient<dim>());
2697 inhomogeneous_operator.vmult(system_rhs, solution);
2701 *inhomogeneous_operator.get_matrix_free());
2702 for (
unsigned int cell = 0;
2703 cell < inhomogeneous_operator.get_matrix_free()->n_cell_batches();
2707 for (
unsigned int q = 0; q < phi.n_q_points; ++q)
2708 phi.submit_value(make_vectorized_array<double>(1.0), q);
2710 phi.distribute_local_to_global(system_rhs);
2716A more sophisticated implementation of
this technique could reuse the original
2719object. Doing
this would require making substantial modifications to the
2721comes with the library can do this. See the discussion on blocks in
2725<a name="PlainProg"></a>
2726<h1> The plain program</h1>
2727@include
"step-37.cc"
void reinit(const IndexSet &local_constraints=IndexSet())
void distribute_local_to_global(const InVector &local_vector, const std::vector< size_type > &local_dof_indices, OutVector &global_vector) const
void add_lines(const std::vector< bool > &lines)
void attach_dof_handler(const DoFHandlerType &)
void add_data_vector(const VectorType &data, const std::vector< std::string > &names, const DataVectorType type=type_automatic, const std::vector< DataComponentInterpretation::DataComponentInterpretation > &data_component_interpretation=std::vector< DataComponentInterpretation::DataComponentInterpretation >())
virtual void build_patches(const unsigned int n_subdivisions=0)
value_type get_dof_value(const unsigned int dof) const
void read_dof_values(const VectorType &src, const unsigned int first_index=0)
void evaluate(const EvaluationFlags::EvaluationFlags evaluation_flag)
void initialize(const MGSmootherBase< VectorType > &coarse_smooth)
void resize(const unsigned int new_minlevel, const unsigned int new_maxlevel, Args &&... args)
void set_constrained_entries_to_one(VectorType &dst) const
void vmult_interface_down(VectorType &dst, const VectorType &src) const
std::shared_ptr< DiagonalMatrix< VectorType > > inverse_diagonal_entries
void vmult(VectorType &dst, const VectorType &src) const
void initialize(const OperatorType &operator_in)
void Tvmult(VectorType &dst, const VectorType &src) const
unsigned int n_cell_batches() const
void cell_loop(const std::function< void(const MatrixFree< dim, Number, VectorizedArrayType > &, OutVector &, const InVector &, const std::pair< unsigned int, unsigned int > &)> &cell_operation, OutVector &dst, const InVector &src, const bool zero_dst_vector=false) const
constexpr SymmetricTensor< 2, dim, Number > invert(const SymmetricTensor< 2, dim, Number > &t)
constexpr Number determinant(const SymmetricTensor< 2, dim, Number > &t)
@ update_JxW_values
Transformed quadrature weights.
@ update_gradients
Shape function gradients.
@ update_quadrature_points
Transformed quadrature points.
__global__ void reduction(Number *result, const Number *v, const size_type N)
__global__ void set(Number *val, const Number s, const size_type N)
std::string write_vtu_with_pvtu_record(const std::string &directory, const std::string &filename_without_extension, const unsigned int counter, const MPI_Comm &mpi_communicator, const unsigned int n_digits_for_counter=numbers::invalid_unsigned_int, const unsigned int n_groups=0) const
ZlibCompressionLevel compression_level
#define Assert(cond, exc)
#define AssertDimension(dim1, dim2)
void set_flags(const FlagType &flags)
static ::ExceptionBase & ExcMessage(std::string arg1)
void loop(ITERATOR begin, typename identity< ITERATOR >::type end, DOFINFO &dinfo, INFOBOX &info, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &cell_worker, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &boundary_worker, const std::function< void(DOFINFO &, DOFINFO &, typename INFOBOX::CellInfo &, typename INFOBOX::CellInfo &)> &face_worker, ASSEMBLER &assembler, const LoopControl &lctrl=LoopControl())
Number local_element(const size_type local_index) const
size_type locally_owned_size() const
void reinit(const size_type size, const bool omit_zeroing_entries=false)
void make_hanging_node_constraints(const DoFHandler< dim, spacedim > &dof_handler, AffineConstraints< number > &constraints)
std::vector< value_type > l2_norm(const typename ::Triangulation< dim, spacedim >::cell_iterator &parent, const value_type parent_value)
std::vector< value_type > split(const typename ::Triangulation< dim, spacedim >::cell_iterator &parent, const value_type parent_value)
Expression sign(const Expression &x)
static const types::blas_int zero
@ matrix
Contents is actually a matrix.
@ eigenvalues
Eigenvalue vector is filled.
@ diagonal
Matrix is diagonal.
@ general
No special properties.
static const types::blas_int one
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > b(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
void call(const std::function< RT()> &function, internal::return_value< RT > &ret_val)
VectorType::value_type * end(VectorType &V)
VectorType::value_type * begin(VectorType &V)
unsigned int this_mpi_process(const MPI_Comm &mpi_communicator)
T sum(const T &t, const MPI_Comm &mpi_communicator)
std::string compress(const std::string &input)
void run(const Iterator &begin, const typename identity< Iterator >::type &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
unsigned int n_cells(const internal::TriangulationImplementation::NumberCache< 1 > &c)
void copy(const T *begin, const T *end, U *dest)
int(&) functions(const void *v1, const void *v2)
void assemble(const MeshWorker::DoFInfoBox< dim, DOFINFO > &dinfo, A *assembler)
void reinit(MatrixBlock< MatrixType > &v, const BlockSparsityPattern &p)
static const unsigned int invalid_unsigned_int
unsigned int global_dof_index
const ::parallel::distributed::Triangulation< dim, spacedim > * triangulation
TasksParallelScheme tasks_parallel_scheme
UpdateFlags mapping_update_flags