Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
Stokhos_Cuda_DeviceProp.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41
42#ifndef STOKHOS_CUDA_DEVICE_PROP_HPP
43#define STOKHOS_CUDA_DEVICE_PROP_HPP
44
45#include "Kokkos_Core.hpp"
46
47#include "Teuchos_TestForException.hpp"
48
49#include "cuda_runtime_api.h"
50
51namespace Stokhos {
52
53 // Class encapsulating various device attributes
54 class DeviceProp {
55 public:
56
57 typedef Kokkos::Cuda::size_type size_type;
58
61
74
76 bool has_ldg;
77
// Constructs the property table for one CUDA device.
//
// device_id: CUDA device ordinal to describe. A negative value (the default
//   is -1) means "use the device reported by cudaGetDevice()".
//
// The compute capability is queried with cudaDeviceGetAttribute() and the
// remaining fields are then filled from hard-coded per-architecture values
// (branches for major versions 7, 6, 3 and 2 are visible below).
//
// Throws std::logic_error (via TEUCHOS_TEST_FOR_EXCEPTION) when
// compute_capability_major is below 2, or when it matches none of the
// branches below.
//
// NOTE(review): the return codes of cudaGetDevice() and
// cudaDeviceGetAttribute() are not checked here.
// NOTE(review): this listing skips several numbered source lines (for
// example 79-87, 89-92, 106-107, 148, 153 and 229), so some initializers,
// conditions and assignments belonging to this constructor are not shown.
78 DeviceProp(int device_id = -1) :
88 warp_size(0),
93 has_shuffle(false),
94 has_ldg(false)
95 {
96 // If device_id is negative, use currently selected device
97 if (device_id < 0)
98 cudaGetDevice(&device_id);
99
100 // Get compute capability
101 int major, minor;
102 cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor,
103 device_id);
104 cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor,
105 device_id);
// NOTE(review): listing lines 106-107 are not shown; presumably they copy
// major/minor into compute_capability_major/compute_capability_minor, which
// every check below reads -- confirm against the full file.
108
109 // Require compute capability >= 2
110 TEUCHOS_TEST_FOR_EXCEPTION(
111 compute_capability_major < 2, std::logic_error,
112 "Cuda compute capability >= 2 is required!");
113
114 // These come from the CUDA occupancy calculator
// Compute capability 7.x: minor 0 gets the larger shared-memory capacity and
// thread/warp limits, every other minor gets the smaller ones.
115 if (compute_capability_major == 7) {
116 if (compute_capability_minor == 0) {
117 shared_memory_capacity = 96 * 1024;
118 }
119 else {
120 shared_memory_capacity = 64 * 1024;
121 }
122
123 max_shmem_per_block = 48 * 1024;
124 max_regs_per_block = 64 * 1024;
125 max_regs_per_sm = 64 * 1024;
128
129 if (compute_capability_minor == 0) {
130 max_threads_per_sm = 2048;
131 max_warps_per_sm = 64;
133 }
134 else {
135 max_threads_per_sm = 1024;
136 max_warps_per_sm = 32;
138 }
139
140 warp_size = 32;
141 warp_granularity = 4; // ??
142 reg_bank_size = 256;
143 has_shuffle = true;
144 has_ldg = true;
145 }
146
// Compute capability 6.x.
// NOTE(review): the conditions guarding the first two if/else pairs (listing
// lines 148 and 153) and the statements around the later "else" (listing
// lines 174-177) are not shown here.
147 else if (compute_capability_major == 6) {
149 shared_memory_capacity = 96 * 1024;
150 else
151 shared_memory_capacity = 64 * 1024;
152
154 max_regs_per_block = 64 * 1024;
155 else
156 max_regs_per_block = 32 * 1024;
157
158 max_shmem_per_block = 48 * 1024;
159 max_regs_per_sm = 64 * 1024;
162
163 if (compute_capability_minor == 2) {
164 max_threads_per_sm = 4096;
165 max_warps_per_sm = 128;
166 }
167 else {
168 max_threads_per_sm = 2048;
169 max_warps_per_sm = 64;
170 }
172
173 warp_size = 32;
176 else
178 reg_bank_size = 256;
179 has_shuffle = true;
180 has_ldg = true;
181 }
182
// Compute capability 3.x: minor >= 7 gets the larger shared-memory and
// per-SM register capacities; the remaining limits are common to all minors.
183 else if (compute_capability_major == 3) {
184 if (compute_capability_minor >= 7) {
185 shared_memory_capacity = 112 * 1024;
186 max_shmem_per_block = 48 * 1024;
187 max_regs_per_sm = 128 * 1024;
188 max_regs_per_block = 64 * 1024;
189 }
190 else {
191 shared_memory_capacity = 48 * 1024;
192 max_shmem_per_block = 48 * 1024;
193 max_regs_per_sm = 64 * 1024;
194 max_regs_per_block = 64 * 1024;
195 }
198 max_threads_per_sm = 2048;
200 max_warps_per_sm = 64;
201 warp_size = 32;
203 reg_bank_size = 256;
204 has_shuffle = true;
205 has_ldg = true;
206 }
207
// Compute capability 2.x: has_shuffle and has_ldg are both set to false.
208 else if (compute_capability_major == 2) {
209 shared_memory_capacity = 48 * 1024;
211 max_shmem_per_block = 48 * 1024;
213 max_threads_per_sm = 1536;
215 max_warps_per_sm = 48;
216 warp_size = 32;
218 max_regs_per_sm = 32 * 1024;
219 max_regs_per_block = 32 * 1024;
220 reg_bank_size = 64;
221 has_shuffle = false;
222 has_ldg = false;
223 }
224
// Any major version not matched by the visible branches above (e.g. 4, 5 or
// >= 8) has no table entry and is rejected unconditionally.
// NOTE(review): the tail of the message expression (listing line 229) is not
// shown here.
225 else
226 TEUCHOS_TEST_FOR_EXCEPTION(
227 true, std::logic_error,
228 "DeviceProp not configured for compute capability " <<
230 }
231
232 // Returns number of registers per thread used by the given kernel
233 template <typename Kernel>
235 get_kernel_registers(Kernel kernel) {
236#ifdef __CUDACC__
237 typedef void (*func_ptr_t)();
238 func_ptr_t func_ptr = reinterpret_cast<func_ptr_t>(kernel);
239 cudaFuncAttributes attrib;
240 cudaFuncGetAttributes(&attrib, func_ptr);
241 return attrib.numRegs;
242#else
243 return 0;
244#endif
245 }
246
247 // Returns number of resident warps per sm for the given kernel
248 template <typename Kernel>
251 const size_type regs_per_thread = get_kernel_registers(kernel);
252 const size_type regs_per_warp =
253 (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
254 const size_type warps_per_sm =
255 (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
256 return warps_per_sm;
257 }
258 };
259
260} // namespace Stokhos
261
262#endif /* #ifndef STOKHOS_CUDA_DEVICE_PROP_HPP */
size_type get_kernel_registers(Kernel kernel)
size_type get_resident_warps_per_sm(Kernel kernel)
Kokkos::Cuda::size_type size_type
Top-level namespace for Stokhos classes and functions.