// Scalar and execution-space types are both taken from the output view type.
42 typedef typename ViewTypeC::value_type scalar_type;
43 typedef typename ViewTypeC::execution_space execution_space;
// is_cuda is true only when execution_space is exactly Kokkos::Cuda.
// NOTE(review): the second definition (false) is presumably the #else branch;
// the #else/#endif lines are not visible in this excerpt.
45#if defined (KOKKOS_ENABLE_CUDA)
46 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
48 const bool is_cuda =
false;
// On Cuda: vector length 32 and 128/32 = 4 threads per team; otherwise 1 and 1.
50 const unsigned vector_size = is_cuda ? 32 : 1;
51 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
// m and n are the first two extents of A, stored as int.
53 const int m = A.extent(0);
54 const int n = A.extent(1);
// Ceiling division of m by team_size, so range*team_size >= m.
// The arithmetic mixes int and unsigned before being stored back into an int.
55 const int range = (m+team_size-1)/team_size;
// Team policy on the output view's execution space.
57 typedef Kokkos::TeamPolicy<execution_space> Policy;
59 Policy( range,team_size,vector_size ),
60 KOKKOS_LAMBDA (
const typename Policy::member_type& team) {
61 const int i = team.league_rank()*team.team_size() + team.team_rank();
66 for (
int j=0; j<n; ++j)
75 const ViewTypeA& A,
const ViewTypeB& b,
const ViewTypeC&
c) {
// Scalar type, execution space, team policy and team-member handle are all
// derived from the output view type ViewTypeC.
76 typedef typename ViewTypeC::value_type scalar_type;
77 typedef typename ViewTypeC::execution_space execution_space;
78 typedef Kokkos::TeamPolicy<execution_space> Policy;
79 typedef typename Policy::member_type team_member;
// Unmanaged, LayoutLeft, rank-1 view of scalar_type placed in the execution
// space's scratch memory space.
80 typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
// is_cuda is true only when execution_space is exactly Kokkos::Cuda.
// NOTE(review): the second definition (false) is presumably the #else branch;
// the #else/#endif lines are not visible in this excerpt.
82#if defined (KOKKOS_ENABLE_CUDA)
83 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
85 const bool is_cuda =
false;
// On Cuda: vector length 32 and 128/32 = 4 threads per team; otherwise 1 and 1.
87 const unsigned VectorSize = is_cuda ? 32 : 1;
88 const unsigned TeamSize = is_cuda ? 128 / VectorSize : 1;
// m and n are the first two extents of A, stored as int.
90 const int m = A.extent(0);
91 const int n = A.extent(1);
// NOTE(review): dimension_scalar is defined outside this excerpt; presumably it
// returns the Fad (derivative) dimension of A -- confirm.
92 const int p = dimension_scalar(A);
// Ceiling division of m by TeamSize: number of teams, so N*TeamSize >= m.
93 const int N = (m+TeamSize-1)/TeamSize;
95 Policy policy(
N, TeamSize, VectorSize);
// Byte count reported by TmpScratchSpace::shmem_size for the arguments
// (TeamSize, p).
96 const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,
p);
98 policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
99 KOKKOS_LAMBDA (
const team_member& team) {
100 const int team_rank = team.team_rank();
101 const int team_size = team.team_size();
102 TmpScratchSpace t(team.team_scratch(0), team_size,
p);
103 const int i = team.league_rank()*team_size + team_rank;
106 for (
int j=0; j<n; ++j)
107 t(team_rank) +=
A(
i,j)*b(j);
139 const size_t nloop,
const bool check)
// View types as requested by the caller: rank-2 A, rank-1 b and c, all of
// FadType with the caller-supplied ViewArgs.
141 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
142 typedef Kokkos::View<
FadType*, ViewArgs...> ViewTypeB;
143 typedef Kokkos::View<
FadType*, ViewArgs...> ViewTypeC;
144 typedef typename ViewTypeA::execution_space execution_space;
// Views actually allocated below use the ConLayoutA/B/C layouts.
// NOTE(review): those layouts are defined on lines not visible in this excerpt.
148 typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
149 typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
150 typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
// A is m x n, b has length n, c has length m; each is constructed with an extra
// trailing size argument of p+1.
// NOTE(review): presumably p+1 is the Fad storage size (value plus p
// derivative components) -- confirm.
152 ConViewTypeA
A(
"A",m,n,
p+1);
153 ConViewTypeB b(
"B",n,
p+1);
154 ConViewTypeC
c(
"c",m,
p+1);
// Fill A and b with 1.0 through their array_type views.
159 Kokkos::deep_copy(
typename ConViewTypeA::array_type(A), 1.0);
160 Kokkos::deep_copy(
typename ConViewTypeB::array_type(b), 1.0);
162 Kokkos::Timer wall_clock;
// is_cuda is true only when execution_space is exactly Kokkos::Cuda.
// NOTE(review): the second definition (false) is presumably the #else branch;
// the #else/#endif lines are not visible in this excerpt.
165#if defined (KOKKOS_ENABLE_CUDA)
166 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
168 const bool is_cuda =
false;
// Sizing arithmetic:
//   block_size      = p doubles, in bytes
//   nkernels        = execution-space concurrency divided by warp_dim
//                     (32 on Cuda, 1 otherwise)
//   mem_pool_size   = 1.2 * nkernels * block_size, truncated to size_t
//   superblock_size = max(nkernels/100, 1) blocks, in bytes
// NOTE(review): the consumer of these sizes is not visible in this excerpt;
// presumably they configure a memory pool -- confirm.
170 const size_t concurrency = execution_space().concurrency();
171 const size_t warp_dim = is_cuda ? 32 : 1;
172 const size_t block_size =
p*
sizeof(double);
173 const size_t nkernels = concurrency / warp_dim;
174 const size_t mem_pool_size =
175 static_cast<size_t>(1.2*nkernels*block_size);
176 const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
177 execution_space space;
186 execution_space().fence();
189 for (
size_t l=0; l<nloop; l++) {
192 execution_space().fence();
194 perf.
time = wall_clock.seconds() / nloop;
210 const size_t m,
const size_t n,
const size_t p,
const size_t nloop,
// View types as requested by the caller: rank-2 A, rank-1 b and c, all of
// FadType with the caller-supplied ViewArgs.
213 typedef Kokkos::View<
FadType**, ViewArgs...> ViewTypeA;
214 typedef Kokkos::View<
FadType*, ViewArgs...> ViewTypeB;
215 typedef Kokkos::View<
FadType*, ViewArgs...> ViewTypeC;
216 typedef typename ViewTypeA::execution_space execution_space;
// Views actually allocated below use the ConLayoutA/B/C layouts.
// NOTE(review): those layouts are defined on lines not visible in this excerpt.
220 typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
221 typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
222 typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
// A is m x n, b has length n, c has length m; each is constructed with an extra
// trailing size argument of p+1.
// NOTE(review): presumably p+1 is the Fad storage size (value plus p
// derivative components) -- confirm.
224 ConViewTypeA
A(
"A",m,n,
p+1);
225 ConViewTypeB b(
"B",n,
p+1);
226 ConViewTypeC
c(
"c",m,
p+1);
// Fill A and b with 1.0 through their array_type views.
231 Kokkos::deep_copy(
typename ConViewTypeA::array_type(A), 1.0);
232 Kokkos::deep_copy(
typename ConViewTypeB::array_type(b), 1.0);
// Timer whose seconds() reading is later divided by nloop.
234 Kokkos::Timer wall_clock;
239 execution_space().fence();
242 for (
size_t l=0; l<nloop; l++) {
245 execution_space().fence();
247 perf.
time = wall_clock.seconds() / nloop;
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)