Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Stokhos_CrsProductTensor.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41
42#ifndef STOKHOS_CRSPRODUCTTENSOR_HPP
43#define STOKHOS_CRSPRODUCTTENSOR_HPP
44
45#include "Kokkos_Core.hpp"
46
47#include "Stokhos_Multiply.hpp"
50#include "Teuchos_ParameterList.hpp"
53#include "Stokhos_TinyVec.hpp"
54
55//----------------------------------------------------------------------------
56//----------------------------------------------------------------------------
57
58namespace Stokhos {
59
60/** \brief  Sparse product tensor with replicated entries to
61 *          provide subsets with a given coordinate.
62 *
63 *  This allows the product tensor multiplication to be partitioned
64 *  on a given coordinate value.
65 *
66 *  for ( size_type i = 0 ; i < p.dimension() ; ++i ) {
67 *    y[i] = 0 ;
68 *    for ( size_type e = p.entry_begin(i) ;
69 *                    e < p.entry_end(i) ; ++e ) {
70 *      const size_type j = p.coord(e,0);
71 *      const size_type k = p.coord(e,1);
72 *      y[i] += p.value(e) * ( a[j] * x[k] + a[k] * x[j] );
73 *    }
74 *  }
75 *  (diagonal entries with j == k store half the coefficient)
76 */
77template< typename ValueType, class ExecutionSpace, class Memory = void >
78class CrsProductTensor {
79public:
80
81 typedef ExecutionSpace execution_space;
82 typedef int size_type;
83 typedef ValueType value_type;
84 typedef Memory memory_type;
85
86 typedef typename Kokkos::ViewTraits< size_type*, execution_space,void,void >::host_mirror_space host_mirror_space ;
87 typedef CrsProductTensor<value_type, host_mirror_space> HostMirror;
88
89// Vectorsize used in multiply algorithm
90#if defined(__AVX__)
91 static const size_type host_vectorsize = 32/sizeof(value_type);
92 static const bool use_intrinsics = true;
93 static const size_type num_entry_align = 1;
94#elif defined(__MIC__)
95 static const size_type host_vectorsize = 16;
96 static const bool use_intrinsics = true;
97 static const size_type num_entry_align = 8; // avoid use of mask instructions
98#else
99 static const size_type host_vectorsize = 2;
100 static const bool use_intrinsics = false;
101 static const size_type num_entry_align = 1;
102#endif
103 static const size_type cuda_vectorsize = 32;
104 static const bool is_cuda =
105#if defined( KOKKOS_ENABLE_CUDA )
106 std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
107#else
108 false ;
109#endif
110 static const size_type vectorsize = is_cuda ? cuda_vectorsize : host_vectorsize;
111
112 // Alignment in terms of number of entries of CRS rows
113 static const size_type tensor_align = vectorsize;
114
115private:
116
117 template <class, class, class> friend class CrsProductTensor;
118
119 typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > vec_type;
120 typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > coord_array_type;
121 typedef Kokkos::View< size_type*[2], Kokkos::LayoutLeft, execution_space, memory_type > coord2_array_type;
122 typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > value_array_type;
123 typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > entry_array_type;
124 typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > row_map_array_type;
125
126 coord_array_type m_coord;
127 coord2_array_type m_coord2;
128 value_array_type m_value;
129 entry_array_type m_num_entry;
130 row_map_array_type m_row_map;
131 size_type m_dim;
132 size_type m_entry_max;
133 size_type m_nnz;
134 size_type m_flops;
135 size_type m_avg_entries_per_row;
136
137 struct CijkRowCount {
138 unsigned count;
139 unsigned basis;
140
141 CijkRowCount()
142 : count(0)
143 , basis(0)
144 {}
145 };
146
147 struct CompareCijkRowCount {
148 bool operator() (const CijkRowCount& a, const CijkRowCount& b) const {
149 return a.count < b.count;
150 }
151 };
152
153public:
154
155 KOKKOS_INLINE_FUNCTION
156 ~CrsProductTensor() {}
157
158 KOKKOS_INLINE_FUNCTION
159 CrsProductTensor() :
160 m_coord(),
161 m_coord2(),
162 m_value(),
163 m_num_entry(),
164 m_row_map(),
165 m_dim(0),
166 m_entry_max(0),
167 m_nnz(0),
168 m_flops(0),
169 m_avg_entries_per_row(0) {}
170
171 template <class M>
172 KOKKOS_INLINE_FUNCTION
173 CrsProductTensor( const CrsProductTensor<value_type,execution_space,M> & rhs ) :
174 m_coord( rhs.m_coord ),
175 m_coord2( rhs.m_coord2 ),
176 m_value( rhs.m_value ),
177 m_num_entry( rhs.m_num_entry ),
178 m_row_map( rhs.m_row_map ),
179 m_dim( rhs.m_dim ),
180 m_entry_max( rhs.m_entry_max ),
181 m_nnz( rhs.m_nnz ),
182 m_flops( rhs.m_flops ),
183 m_avg_entries_per_row( rhs.m_avg_entries_per_row ) {}
184
185 template <class M>
186 KOKKOS_INLINE_FUNCTION
187 CrsProductTensor &
188 operator = ( const CrsProductTensor<value_type,execution_space,M> & rhs )
189 {
190 m_coord = rhs.m_coord;
191 m_coord2 = rhs.m_coord2;
192 m_value = rhs.m_value;
193 m_num_entry = rhs.m_num_entry;
194 m_row_map = rhs.m_row_map;
195 m_dim = rhs.m_dim;
196 m_entry_max = rhs.m_entry_max;
197 m_nnz = rhs.m_nnz;
198 m_flops = rhs.m_flops;
199 m_avg_entries_per_row = rhs.m_avg_entries_per_row;
200 return *this;
201 }
202
203 /** \brief  Dimension of the tensor. */
204 KOKKOS_INLINE_FUNCTION
205 size_type dimension() const { return m_dim; }
206
207 /** \brief  Is the tensor empty. */
208 KOKKOS_INLINE_FUNCTION
209 bool is_empty() const { return m_dim == 0; }
210
211 /** \brief  Number of sparse entries. */
212 KOKKOS_INLINE_FUNCTION
213 size_type entry_count() const
214 { return m_coord.extent(0); }
215
216 /** \brief  Maximum sparse entries for any coordinate. */
217 KOKKOS_INLINE_FUNCTION
218 size_type entry_maximum() const
219 { return m_entry_max; }
220
221 /** \brief  Begin entries with a coordinate 'i'. */
222 KOKKOS_INLINE_FUNCTION
223 size_type entry_begin( size_type i ) const
224 { return m_row_map[i]; }
225
226 /** \brief  End entries with a coordinate 'i'. */
227 KOKKOS_INLINE_FUNCTION
228 size_type entry_end( size_type i ) const
229 { return m_row_map[i] + m_num_entry(i); }
230
231 /** \brief  Number of entries with a coordinate 'i'. */
232 KOKKOS_INLINE_FUNCTION
233 size_type num_entry( size_type i ) const
234 { return m_num_entry(i); }
235
236 /** \brief  Coordinates of an entry. */
237 KOKKOS_INLINE_FUNCTION
238 const size_type& coord( const size_type entry, const size_type c ) const
239 { return m_coord2( entry, c ); }
240
241 /** \brief  Coordinates of an entry. */
242 KOKKOS_INLINE_FUNCTION
243 const size_type& coord( const size_type entry ) const
244 { return m_coord( entry ); }
245
246 /** \brief  Value of an entry. */
247 KOKKOS_INLINE_FUNCTION
248 const value_type & value( const size_type entry ) const
249 { return m_value( entry ); }
250
251 /** \brief  Number of non-zeros. */
252 KOKKOS_INLINE_FUNCTION
253 size_type num_non_zeros() const
254 { return m_nnz; }
255
256 /** \brief  Number of flops per multiply-add. */
257 KOKKOS_INLINE_FUNCTION
258 size_type num_flops() const
259 { return m_flops; }
260
261 /** \brief  Average number of entries per row. */
262 KOKKOS_INLINE_FUNCTION
263 size_type avg_entries_per_row() const
264 { return m_avg_entries_per_row; }
265
266 template <typename OrdinalType>
267 static CrsProductTensor
268 create( const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
269 const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
270 const Teuchos::ParameterList& params = Teuchos::ParameterList())
271 {
272 typedef Stokhos::Sparse3Tensor<OrdinalType,ValueType> Cijk_type;
273
274 // Note (etp 01/08/15): Commenting out the sorting as it causes a really
275 // weird compiler error when compiling with NVCC. It seems to think the
276 // < in CompareCijkRowCount() is part of a template parameter. We don't
277 // seem to use this option, so I am just commenting it out.
278
279 // bool sort_nnz = false;
280 // if (params.isParameter("Sort Nonzeros"))
281 // sort_nnz = params.get<bool>("Sort Nonzeros");
282
283 // Compute number of non-zeros for each i
284 const size_type dimension = basis.size();
285 std::vector< size_t > coord_work( dimension, (size_t) 0 );
286 size_type entry_count = 0;
287 for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
288 i_it!=Cijk.i_end(); ++i_it) {
289 OrdinalType i = index(i_it);
290 for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
291 k_it != Cijk.k_end(i_it); ++k_it) {
292 OrdinalType k = index(k_it);
293 for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
294 j_it != Cijk.j_end(k_it); ++j_it) {
295 OrdinalType j = index(j_it);
296 if (j >= k) {
297 ++coord_work[i];
298 ++entry_count;
299 }
300 }
301 }
302 }
303
304 // Compute average nonzeros per row (must be before padding)
305 const size_type avg_entries_per_row = entry_count / dimension;
306
307 // Pad each row to have size divisible by alignment size
308 for ( size_type i = 0; i < dimension; ++i ) {
309 const size_t rem = coord_work[i] % tensor_align;
310 if (rem > 0) {
311 const size_t pad = tensor_align - rem;
312 coord_work[i] += pad;
313 entry_count += pad;
314 }
315 }
316
317 // Sort based on number of non-zeros
318 std::vector< CijkRowCount > row_count( dimension );
319 for ( size_type i = 0; i < dimension; ++i ) {
320 row_count[i].count = coord_work[i];
321 row_count[i].basis = i;
322 }
323
324 // Note (etp 01/08/15): See above.
325
326 // if (sort_nnz)
327 // std::sort( row_count.begin(), row_count.end(), CompareCijkRowCount() );
328 std::vector<size_type> sorted_row_map( dimension );
329 for ( size_type i = 0; i < dimension; ++i ) {
330 coord_work[i] = row_count[i].count;
331 sorted_row_map[ row_count[i].basis ] = i;
332 }
333
334 // Allocate tensor data
335 // coord and coord2 are initialized to zero because otherwise we get
336 // seg faults in the MIC algorithm when processing the tail of each
337 // tensor row. Not quite sure why as the coord loads are padded to
338 // length 16 and are masked for the remainder (unless it does load x[j]
339 // anyway and masks off the result, so j needs to be valid).
340 CrsProductTensor tensor;
341 tensor.m_coord = coord_array_type("tensor_coord", entry_count );
342 tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
343 tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
344 tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
345 tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
346 tensor.m_dim = dimension;
347 tensor.m_entry_max = 0;
348 tensor.m_avg_entries_per_row = avg_entries_per_row;
349
350 // Create mirrors; these are simply views when the tensor is in host memory
351 typename coord_array_type::HostMirror
352 host_coord = Kokkos::create_mirror_view( tensor.m_coord );
353 typename coord2_array_type::HostMirror
354 host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
355 typename value_array_type::HostMirror
356 host_value = Kokkos::create_mirror_view( tensor.m_value );
357 typename entry_array_type::HostMirror
358 host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
359 typename entry_array_type::HostMirror
360 host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
361
362 // Compute row map
363 size_type sum = 0;
364 host_row_map(0) = 0;
365 for ( size_type i = 0; i < dimension; ++i ) {
366 sum += coord_work[i];
367 host_row_map(i+1) = sum;
368 host_num_entry(i) = 0;
369 }
370
371 for ( size_type iCoord = 0; iCoord < dimension; ++iCoord ) {
372 coord_work[iCoord] = host_row_map[iCoord];
373 }
374
375 // Initialize values and coordinates to zero since we will have extra
376 // ones for alignment
377 Kokkos::deep_copy( host_value, 0.0 );
378 Kokkos::deep_copy( host_coord, 0 );
379 Kokkos::deep_copy( host_coord2, 0 );
380
381 for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
382 i_it!=Cijk.i_end(); ++i_it) {
383 OrdinalType i = index(i_it);
384 const size_type row = sorted_row_map[i];
385 for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
386 k_it != Cijk.k_end(i_it); ++k_it) {
387 OrdinalType k = index(k_it);
388 for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
389 j_it != Cijk.j_end(k_it); ++j_it) {
390 OrdinalType j = index(j_it);
391 ValueType c = Stokhos::value(j_it);
392 if (j >= k) {
393 const size_type n = coord_work[row]; ++coord_work[row];
394 host_value(n) = (j != k) ? c : 0.5*c;
395 host_coord2(n,0) = j;
396 host_coord2(n,1) = k;
397 host_coord(n) = ( k << 16 ) | j; // pack (j,k): k in upper 16 bits, j in lower 16 bits
398 ++host_num_entry(row);
399 ++tensor.m_nnz;
400 }
401 }
402 }
403 // Align num_entry
404 host_num_entry(row) =
405 (host_num_entry(row) + num_entry_align-1) & ~(num_entry_align-1);
406 }
407
408 // Copy data to device if necessary
409 Kokkos::deep_copy( tensor.m_coord, host_coord );
410 Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
411 Kokkos::deep_copy( tensor.m_value, host_value );
412 Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
413 Kokkos::deep_copy( tensor.m_row_map, host_row_map );
414
415 for ( size_type i = 0; i < dimension; ++i ) {
416 tensor.m_entry_max = std::max( tensor.m_entry_max, host_num_entry(i) );
417 }
418
419 tensor.m_flops = 5*tensor.m_nnz + dimension;
420
421 return tensor;
422 }
423
424 static CrsProductTensor createMeanBased()
425 {
426 const size_type dimension = 1;
427 const size_type entry_count = 1;
428
429 // Allocate tensor data
430 // coord and coord2 are initialized to zero because otherwise we get
431 // seg faults in the MIC algorithm when processing the tail of each
432 // tensor row. Not quite sure why as the coord loads are padded to
433 // length 16 and are masked for the remainder (unless it does load x[j]
434 // anyway and masks off the result, so j needs to be valid).
435 CrsProductTensor tensor;
436 tensor.m_coord = coord_array_type("tensor_coord", entry_count );
437 tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
438 tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
439 tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
440 tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
441 tensor.m_dim = dimension;
442 tensor.m_entry_max = 1;
443 tensor.m_avg_entries_per_row = 1;
444 tensor.m_nnz = 1;
445 tensor.m_flops = 5*tensor.m_nnz + dimension;
446
447 // Create mirrors; these are simply views when the tensor is in host memory
448 typename coord_array_type::HostMirror
449 host_coord = Kokkos::create_mirror_view( tensor.m_coord );
450 typename coord2_array_type::HostMirror
451 host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
452 typename value_array_type::HostMirror
453 host_value = Kokkos::create_mirror_view( tensor.m_value );
454 typename entry_array_type::HostMirror
455 host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
456 typename entry_array_type::HostMirror
457 host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
458
459 // Compute row map
460 host_row_map(0) = 0;
461 host_row_map(1) = 1;
462 host_num_entry(0) = 1;
463
464 // Compute tensor values
465 host_value(0) = 0.5;
466 host_coord2(0,0) = 0;
467 host_coord2(0,1) = 0;
468 host_coord(0) = 0;
469
470 // Copy data to device if necessary
471 Kokkos::deep_copy( tensor.m_coord, host_coord );
472 Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
473 Kokkos::deep_copy( tensor.m_value, host_value );
474 Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
475 Kokkos::deep_copy( tensor.m_row_map, host_row_map );
476
477 return tensor;
478 }
479
480 static HostMirror
481 create_mirror_view( const CrsProductTensor & tensor ) {
482 HostMirror host_tensor;
483
484 host_tensor.m_coord = Kokkos::create_mirror_view( tensor.m_coord );
485 host_tensor.m_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
486 host_tensor.m_value = Kokkos::create_mirror_view( tensor.m_value );
487 host_tensor.m_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
488 host_tensor.m_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
489
490 host_tensor.m_dim = tensor.m_dim;
491 host_tensor.m_entry_max = tensor.m_entry_max;
492 host_tensor.m_avg_entries_per_row = tensor.m_avg_entries_per_row;
493 host_tensor.m_nnz = tensor.m_nnz;
494 host_tensor.m_flops = tensor.m_flops;
495
496 return host_tensor;
497 }
498
499 template < class DstDevice, class DstMemory >
500 static void
501 deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst ,
502 const CrsProductTensor& src ) {
503 Kokkos::deep_copy( dst.m_coord, src.m_coord );
504 Kokkos::deep_copy( dst.m_coord2, src.m_coord2 );
505 Kokkos::deep_copy( dst.m_value, src.m_value );
506 Kokkos::deep_copy( dst.m_num_entry, src.m_num_entry );
507 Kokkos::deep_copy( dst.m_row_map, src.m_row_map );
508 }
509
510};
511
512template< class Device, typename OrdinalType, typename ValueType>
513CrsProductTensor<ValueType, Device>
514create_product_tensor(
515 const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
516 const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
517 const Teuchos::ParameterList& params = Teuchos::ParameterList())
518{
519 return CrsProductTensor<ValueType, Device>::create(basis, Cijk, params );
520}
521
522template< class Device, typename OrdinalType, typename ValueType,
523 class Memory >
524CrsProductTensor<ValueType, Device, Memory>
525create_product_tensor(
526 const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
527 const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
528 const Teuchos::ParameterList& params = Teuchos::ParameterList())
529{
530 return CrsProductTensor<ValueType, Device, Memory>::create(
531 basis, Cijk, params );
532}
533
534template< class Device, typename OrdinalType, typename ValueType>
535CrsProductTensor<ValueType, Device>
536create_mean_based_product_tensor()
537{
538 return CrsProductTensor<ValueType, Device>::createMeanBased();
539}
540
541template< class Device, typename OrdinalType, typename ValueType,
542 class Memory >
543CrsProductTensor<ValueType, Device, Memory>
544create_mean_based_product_tensor()
545{
546 return CrsProductTensor<ValueType, Device, Memory>::createMeanBased();
547}
548
549template < class ValueType, class Device, class Memory >
550inline
551typename CrsProductTensor<ValueType,Device,Memory>::HostMirror
552create_mirror_view( const CrsProductTensor<ValueType,Device,Memory> & src )
553{
554 return CrsProductTensor<ValueType,Device,Memory>::create_mirror_view( src );
555}
556
557 template < class ValueType,
558 class DstDevice, class DstMemory,
559 class SrcDevice, class SrcMemory >
560void
561deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory> & dst ,
562 const CrsProductTensor<ValueType,SrcDevice,SrcMemory> & src )
563{
564 CrsProductTensor<ValueType,DstDevice,DstMemory>::deep_copy( dst, src );
565}
566
567template < typename ValueType, typename Device >
568class BlockMultiply< CrsProductTensor< ValueType , Device > >
569{
570public:
571
572 typedef Device execution_space;
573 typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
574 typedef typename tensor_type::size_type size_type ;
575
576// Whether to use manual or auto-vectorization
577#ifdef __MIC__
578#define USE_AUTO_VECTORIZATION 1
579#else
580#define USE_AUTO_VECTORIZATION 0
581#endif
582
583#if defined(__INTEL_COMPILER) && USE_AUTO_VECTORIZATION
584
585 // Version leveraging intel vectorization
586 template< typename MatrixValue , typename VectorValue >
587 KOKKOS_INLINE_FUNCTION
588 static void apply( const tensor_type & tensor ,
589 const MatrixValue * const a ,
590 const VectorValue * const x ,
591 VectorValue * const y ,
592 const VectorValue & alpha = VectorValue(1) )
593 {
594 // The intel compiler doesn't seem to be able to vectorize through
595 // the coord() calls, so extract pointers
596 const size_type * cj = &tensor.coord(0,0);
597 const size_type * ck = &tensor.coord(0,1);
598 const size_type nDim = tensor.dimension();
599
600 for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
601 const size_type nEntry = tensor.num_entry(iy);
602 const size_type iEntryBeg = tensor.entry_begin(iy);
603 const size_type iEntryEnd = iEntryBeg + nEntry;
604 VectorValue ytmp = 0;
605
606#pragma simd vectorlength(tensor_type::vectorsize)
607#pragma ivdep
608#pragma vector aligned
609 for (size_type iEntry = iEntryBeg; iEntry<iEntryEnd; ++iEntry) {
610 const size_type j = cj[iEntry]; //tensor.coord(iEntry,0);
611 const size_type k = ck[iEntry]; //tensor.coord(iEntry,1);
612 ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
613 }
614
615 y[iy] += alpha * ytmp ;
616 }
617 }
618
619#elif defined(__MIC__)
620
621 // Version specific to MIC architecture using manual vectorization
622 template< typename MatrixValue , typename VectorValue >
623 KOKKOS_INLINE_FUNCTION
624 static void apply( const tensor_type & tensor ,
625 const MatrixValue * const a ,
626 const VectorValue * const x ,
627 VectorValue * const y ,
628 const VectorValue & alpha = VectorValue(1) )
629 {
630 const size_type nDim = tensor.dimension();
631 for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
632
633 const size_type nEntry = tensor.num_entry(iy);
634 const size_type iEntryBeg = tensor.entry_begin(iy);
635 const size_type iEntryEnd = iEntryBeg + nEntry;
636 size_type iEntry = iEntryBeg;
637
638 VectorValue ytmp = 0 ;
639
640 const size_type nBlock = nEntry / tensor_type::vectorsize;
641 const size_type nEntryB = nBlock * tensor_type::vectorsize;
642 const size_type iEnd = iEntryBeg + nEntryB;
643
644 typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
645 TV vy;
646 vy.zero();
647 for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
648 const size_type *j = &tensor.coord(iEntry,0);
649 const size_type *k = &tensor.coord(iEntry,1);
650 TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
651 c(&(tensor.value(iEntry)));
652
653 // vy += c * ( aj * xk + ak * xj)
654 aj.times_equal(xk);
655 aj.multiply_add(ak, xj);
656 vy.multiply_add(c, aj);
657
658 }
659 ytmp += vy.sum();
660
661 // The number of nonzeros is always constrained to be a multiple of 8
662
663 const size_type rem = iEntryEnd-iEntry;
664 if (rem >= 8) {
665 typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> TV2;
666 const size_type *j = &tensor.coord(iEntry,0);
667 const size_type *k = &tensor.coord(iEntry,1);
668 TV2 aj(a, j), ak(a, k), xj(x, j), xk(x, k),
669 c(&(tensor.value(iEntry)));
670
671 // vy += c * ( aj * xk + ak * xj)
672 aj.times_equal(xk);
673 aj.multiply_add(ak, xj);
674 aj.times_equal(c);
675 ytmp += aj.sum();
676 }
677
678 y[iy] += alpha * ytmp ;
679 }
680 }
681
682#else
683
684 // General version
685 template< typename MatrixValue , typename VectorValue >
686 KOKKOS_INLINE_FUNCTION
687 static void apply( const tensor_type & tensor ,
688 const MatrixValue * const a ,
689 const VectorValue * const x ,
690 VectorValue * const y ,
691 const VectorValue & alpha = VectorValue(1) )
692 {
693 const size_type nDim = tensor.dimension();
694 for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
695
696 const size_type nEntry = tensor.num_entry(iy);
697 const size_type iEntryBeg = tensor.entry_begin(iy);
698 const size_type iEntryEnd = iEntryBeg + nEntry;
699 size_type iEntry = iEntryBeg;
700
701 VectorValue ytmp = 0 ;
702
703 // Do entries with a blocked loop of size vectorsize
704 if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
705 const size_type nBlock = nEntry / tensor_type::vectorsize;
706 const size_type nEntryB = nBlock * tensor_type::vectorsize;
707 const size_type iEnd = iEntryBeg + nEntryB;
708
709 typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
710 TV vy;
711 vy.zero();
712 for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
713 const size_type *j = &tensor.coord(iEntry,0);
714 const size_type *k = &tensor.coord(iEntry,1);
715 TV aj(a, j), ak(a, k), xj(x, j), xk(x, k), c(&(tensor.value(iEntry)));
716
717 // vy += c * ( aj * xk + ak * xj)
718 aj.times_equal(xk);
719 aj.multiply_add(ak, xj);
720 vy.multiply_add(c, aj);
721 }
722 ytmp += vy.sum();
723 }
724
725 // Do remaining entries with a scalar loop
726 for ( ; iEntry<iEntryEnd; ++iEntry) {
727 const size_type j = tensor.coord(iEntry,0);
728 const size_type k = tensor.coord(iEntry,1);
729
730 ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
731 }
732
733 y[iy] += alpha * ytmp ;
734 }
735 }
736#endif
737
738 KOKKOS_INLINE_FUNCTION
739 static size_type matrix_size( const tensor_type & tensor )
740 { return tensor.dimension(); }
741
742 KOKKOS_INLINE_FUNCTION
743 static size_type vector_size( const tensor_type & tensor )
744 { return tensor.dimension(); }
745};
746
747// Specialization of Multiply< BlockCrsMatrix< BlockSpec, ... > > for
748// CrsProductTensor, which provides a version that processes blocks of FEM
749// columns together to reduce the number of global reads of the sparse 3 tensor
750
751// Even though this isn't specific to Threads, templating on Device creates a
752// duplicate specialization error for Cuda. Need to see if we can fix this,
753// or put the implementation in another class easily specialized for Threads,
754// OpenMP, ...
755template< typename ValueType , typename MatrixValue , typename VectorValue ,
756 typename Device >
757class MultiplyImpl {
758public:
759
760 typedef Device execution_space ;
761 typedef CrsProductTensor< ValueType , execution_space > tensor_type;
762 typedef StochasticProductTensor< ValueType, tensor_type, execution_space > BlockSpec;
763 typedef typename BlockSpec::size_type size_type ;
764 typedef Kokkos::View< VectorValue** , Kokkos::LayoutLeft , execution_space > block_vector_type ;
765 typedef BlockCrsMatrix< BlockSpec , MatrixValue , execution_space > matrix_type ;
766
767 const matrix_type m_A;
768 const block_vector_type m_x;
769 const block_vector_type m_y;
770
771 MultiplyImpl( const matrix_type & A ,
772 const block_vector_type & x ,
773 const block_vector_type & y )
774 : m_A( A )
775 , m_x( x )
776 , m_y( y )
777 {}
778
779 //--------------------------------------------------------------------------
780 // A( storage_size( m_A.block.size() ) , m_A.graph.row_map.size() );
781 // x( m_A.block.dimension() , m_A.graph.row_map.first_count() );
782 // y( m_A.block.dimension() , m_A.graph.row_map.first_count() );
783 //
784
785 KOKKOS_INLINE_FUNCTION
786 void operator()( const size_type iBlockRow ) const
787 {
788 // Prefer that y[ m_A.block.dimension() ] be scratch space
789 // on the local thread, but cannot dynamically allocate
790 VectorValue * const y = & m_y(0,iBlockRow);
791
792 const size_type iEntryBegin = m_A.graph.row_map[ iBlockRow ];
793 const size_type iEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
794
795 // Leading dimension guaranteed contiguous for LayoutLeft
796 for ( size_type j = 0 ; j < m_A.block.dimension() ; ++j ) { y[j] = 0 ; }
797
798 for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
799 const VectorValue * const x = & m_x( 0 , m_A.graph.entries(iEntry) );
800 const MatrixValue * const a = & m_A.values( 0 , iEntry );
801
802 BlockMultiply< BlockSpec >::apply( m_A.block , a , x , y );
803 }
804
805 }
806
807 /*
808 * Compute work range = (begin, end) such that adjacent threads write to
809 * separate cache lines
810 */
811 KOKKOS_INLINE_FUNCTION
812 std::pair< size_type , size_type >
813 compute_work_range( const size_type work_count ,
814 const size_type thread_count ,
815 const size_type thread_rank ) const
816 {
817 enum { work_align = 64 / sizeof(VectorValue) };
818 enum { work_shift = Stokhos::power_of_two< work_align >::value };
819 enum { work_mask = work_align - 1 };
820
821 const size_type work_per_thread =
822 ( ( ( ( work_count + work_mask ) >> work_shift ) + thread_count - 1 ) /
823 thread_count ) << work_shift ;
824
825 const size_type work_begin =
826 std::min( thread_rank * work_per_thread , work_count );
827 const size_type work_end =
828 std::min( work_begin + work_per_thread , work_count );
829
830 return std::make_pair( work_begin , work_end );
831 }
832
833#if defined(__MIC__)
834
835 // A MIC-specific version of the block-multiply algorithm, where block here
836 // means processing multiple FEM columns at a time
837 KOKKOS_INLINE_FUNCTION
838 void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
839 {
840 const size_type iBlockRow = device.league_rank();
841
842 // Check for valid row
843 const size_type row_count = m_A.graph.row_map.extent(0)-1;
844 if (iBlockRow >= row_count)
845 return;
846
847 const size_type num_thread = device.team_size();
848 const size_type thread_idx = device.team_rank();
849 std::pair<size_type,size_type> work_range =
850 compute_work_range(m_A.block.dimension(), num_thread, thread_idx);
851
852 // Prefer that y[ m_A.block.dimension() ] be scratch space
853 // on the local thread, but cannot dynamically allocate
854 VectorValue * const y = & m_y(0,iBlockRow);
855
856 // Leading dimension guaranteed contiguous for LayoutLeft
857 for ( size_type j = work_range.first ; j < work_range.second ; ++j )
858 y[j] = 0 ;
859
860 const tensor_type& tensor = m_A.block.tensor();
861
862 const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
863 const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
864 const size_type BlockSize = 9;
865 const size_type numBlock =
866 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
867
868 const MatrixValue* sh_A[BlockSize];
869 const VectorValue* sh_x[BlockSize];
870
871 size_type iBlockEntry = iBlockEntryBeg;
872 for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
873 const size_type block_size =
874 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
875
876 for ( size_type col = 0; col < block_size; ++col ) {
877 const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
878 sh_x[col] = & m_x( 0 , iBlockColumn );
879 sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
880 }
881
882 for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
883
884 const size_type nEntry = tensor.num_entry(iy);
885 const size_type iEntryBeg = tensor.entry_begin(iy);
886 const size_type iEntryEnd = iEntryBeg + nEntry;
887 size_type iEntry = iEntryBeg;
888
889 VectorValue ytmp = 0 ;
890
891 // Do entries with a blocked loop of size blocksize
892 const size_type nBlock = nEntry / tensor_type::vectorsize;
893 const size_type nEntryB = nBlock * tensor_type::vectorsize;
894 const size_type iEnd = iEntryBeg + nEntryB;
895
896 typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
897 typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
898 typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
899 VecTV vy;
900 vy.zero();
901 for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
902 const size_type *j = &tensor.coord(iEntry,0);
903 const size_type *k = &tensor.coord(iEntry,1);
904 ValTV c(&(tensor.value(iEntry)));
905
906 for ( size_type col = 0; col < block_size; ++col ) {
907 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
908 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
909
910 // vy += c * ( aj * xk + ak * xj)
911 aj.times_equal(xk);
912 aj.multiply_add(ak, xj);
913 vy.multiply_add(c, aj);
914 }
915 }
916 ytmp += vy.sum();
917
918 // The number of nonzeros is always constrained to be a multiple of 8
919
920 const size_type rem = iEntryEnd-iEntry;
921 if (rem >= 8) {
922 typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> ValTV2;
923 typedef TinyVec<MatrixValue,8,tensor_type::use_intrinsics> MatTV2;
924 typedef TinyVec<VectorValue,8,tensor_type::use_intrinsics> VecTV2;
925 const size_type *j = &tensor.coord(iEntry,0);
926 const size_type *k = &tensor.coord(iEntry,1);
927 ValTV2 c(&(tensor.value(iEntry)));
928
929 for ( size_type col = 0; col < block_size; ++col ) {
930 MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
931 VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
932
933 // vy += c * ( aj * xk + ak * xj)
934 aj.times_equal(xk);
935 aj.multiply_add(ak, xj);
936 aj.times_equal(c);
937 ytmp += aj.sum();
938 }
939 }
940
941 y[iy] += ytmp ;
942 }
943
944 // Add a team barrier to keep the thread team in-sync before going on
945 // to the next block
946 device.team_barrier();
947 }
948
949 }
950
951#else
952
953 // A general hand-vectorized version of the block multiply algorithm, where
954 // block here means processing multiple FEM columns at a time. Note that
955 // auto-vectorization of a block algorithm doesn't work, because the
956 // stochastic loop is not the inner-most loop.
957 KOKKOS_INLINE_FUNCTION
958 void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
959 {
960 const size_type iBlockRow = device.league_rank();
961
962 // Check for valid row
963 const size_type row_count = m_A.graph.row_map.extent(0)-1;
964 if (iBlockRow >= row_count)
965 return;
966
967 const size_type num_thread = device.team_size();
968 const size_type thread_idx = device.team_rank();
969 std::pair<size_type,size_type> work_range =
970 compute_work_range(m_A.block.dimension(), num_thread, thread_idx);
971
972 // Prefer that y[ m_A.block.dimension() ] be scratch space
973 // on the local thread, but cannot dynamically allocate
974 VectorValue * const y = & m_y(0,iBlockRow);
975
976 // Leading dimension guaranteed contiguous for LayoutLeft
977 for ( size_type j = work_range.first ; j < work_range.second ; ++j )
978 y[j] = 0 ;
979
980 const tensor_type& tensor = m_A.block.tensor();
981
982 const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
983 const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
984 const size_type BlockSize = 14;
985 const size_type numBlock =
986 (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
987
988 const MatrixValue* sh_A[BlockSize];
989 const VectorValue* sh_x[BlockSize];
990
991 size_type iBlockEntry = iBlockEntryBeg;
992 for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
993 const size_type block_size =
994 block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
995
996 for ( size_type col = 0; col < block_size; ++col ) {
997 const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
998 sh_x[col] = & m_x( 0 , iBlockColumn );
999 sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
1000 }
1001
1002 for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
1003
1004 const size_type nEntry = tensor.num_entry(iy);
1005 const size_type iEntryBeg = tensor.entry_begin(iy);
1006 const size_type iEntryEnd = iEntryBeg + nEntry;
1007 size_type iEntry = iEntryBeg;
1008
1009 VectorValue ytmp = 0 ;
1010
1011 // Do entries with a blocked loop of size blocksize
1012 if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
1013 const size_type nBlock = nEntry / tensor_type::vectorsize;
1014 const size_type nEntryB = nBlock * tensor_type::vectorsize;
1015 const size_type iEnd = iEntryBeg + nEntryB;
1016
1017 typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
1018 typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
1019 typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
1020 VecTV vy;
1021 vy.zero();
1022 for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
1023 const size_type *j = &tensor.coord(iEntry,0);
1024 const size_type *k = &tensor.coord(iEntry,1);
1025 ValTV c(&(tensor.value(iEntry)));
1026
1027 for ( size_type col = 0; col < block_size; ++col ) {
1028 MatTV aj(sh_A[col], j), ak(sh_A[col], k);
1029 VecTV xj(sh_x[col], j), xk(sh_x[col], k);
1030
1031 // vy += c * ( aj * xk + ak * xj)
1032 aj.times_equal(xk);
1033 aj.multiply_add(ak, xj);
1034 vy.multiply_add(c, aj);
1035 }
1036 }
1037 ytmp += vy.sum();
1038 }
1039
1040 // Do remaining entries with a scalar loop
1041 for ( ; iEntry<iEntryEnd; ++iEntry) {
1042 const size_type j = tensor.coord(iEntry,0);
1043 const size_type k = tensor.coord(iEntry,1);
1044 ValueType cijk = tensor.value(iEntry);
1045
1046 for ( size_type col = 0; col < block_size; ++col ) {
1047 ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
1048 sh_A[col][k] * sh_x[col][j] );
1049 }
1050
1051 }
1052
1053 y[iy] += ytmp ;
1054 }
1055
1056 // Add a team barrier to keep the thread team in-sync before going on
1057 // to the next block
1058 device.team_barrier();
1059 }
1060
1061 }
1062
1063#endif
1064
1065 static void apply( const matrix_type & A ,
1066 const block_vector_type & x ,
1067 const block_vector_type & y )
1068 {
1069 // Generally the block algorithm seems to perform better on the MIC,
1070 // as long as the stochastic size isn't too big, but doesn't perform
1071 // any better on the CPU (probably because the CPU has a fat L3 cache
1072 // to store the sparse 3 tensor).
1073#ifdef __MIC__
1074 const bool use_block_algorithm = true;
1075#else
1076 const bool use_block_algorithm = false;
1077#endif
1078
1079 const size_t row_count = A.graph.row_map.extent(0) - 1 ;
1080 if (use_block_algorithm) {
1081#ifdef __MIC__
1082 const size_t team_size = 4; // 4 hyperthreads for MIC
1083#else
1084 const size_t team_size = 2; // 2 for everything else
1085#endif
1086 const size_t league_size = row_count;
1087 Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
1088 Kokkos::parallel_for( config , MultiplyImpl(A,x,y) );
1089 }
1090 else {
1091 Kokkos::parallel_for( row_count , MultiplyImpl(A,x,y) );
1092 }
1093 }
1094};
1095
1096//----------------------------------------------------------------------------
1097
1098} /* namespace Stokhos */
1099
1100//----------------------------------------------------------------------------
1101//----------------------------------------------------------------------------
1102
1103// Inject some functions into the Kokkos namespace
1104namespace Kokkos {
1105
1106 using Stokhos::create_mirror_view;
1107 using Stokhos::deep_copy;
1108
1109} // namespace Kokkos
1110
1111#endif /* #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP */
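
The scalar loop in the general BlockMultiply::apply above computes, for each tensor row i, y[i] += alpha * sum over entries e of value(e) * ( a[j]*x[k] + a[k]*x[j] ), with (j,k) = (coord(e,0), coord(e,1)). Below is a minimal, self-contained sketch of that contraction using plain std::vector storage in place of the Kokkos views; the CRS-style arrays mirror the entry_begin/num_entry/coord/value accessors of CrsProductTensor, but the toy data and variable names are illustrative only and are not part of the Stokhos API.

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  // Toy symmetric 3-tensor stored CRS-style, mimicking CrsProductTensor.
  // Diagonal (j == k) entries carry 0.5*c, matching the convention used in
  // CrsProductTensor::create(), so that c*(a[j]*x[k] + a[k]*x[j]) is correct.
  const std::size_t dim = 3;
  const std::vector<std::size_t> row_map   = { 0, 2, 3, 4 }; // entry_begin(i)
  const std::vector<std::size_t> num_entry = { 2, 1, 1 };    // num_entry(i)
  const std::vector<std::size_t> coord_j   = { 0, 1, 1, 2 }; // coord(e,0)
  const std::vector<std::size_t> coord_k   = { 0, 0, 1, 0 }; // coord(e,1)
  const std::vector<double>      value     = { 0.5, 1.0, 0.5, 1.0 };

  const std::vector<double> a = { 1.0, 2.0, 3.0 }; // matrix block coefficients
  const std::vector<double> x = { 4.0, 5.0, 6.0 }; // input vector block
  std::vector<double>       y = { 0.0, 0.0, 0.0 }; // output vector block
  const double alpha = 1.0;

  // Scalar form of the contraction performed by BlockMultiply::apply.
  for (std::size_t i = 0; i < dim; ++i) {
    const std::size_t beg = row_map[i];
    const std::size_t end = beg + num_entry[i];
    double ytmp = 0.0;
    for (std::size_t e = beg; e < end; ++e) {
      const std::size_t j = coord_j[e];
      const std::size_t k = coord_k[e];
      ytmp += value[e] * ( a[j]*x[k] + a[k]*x[j] );
    }
    y[i] += alpha * ytmp;
  }

  for (std::size_t i = 0; i < dim; ++i)
    std::cout << "y[" << i << "] = " << y[i] << "\n";
  return 0;
}

As a worked example of the compute_work_range helper in MultiplyImpl: with VectorValue = double the alignment is 64/sizeof(double) = 8 entries, so for work_count = 100 and thread_count = 4 each thread gets work_per_thread = 32 and the ranges are [0,32), [32,64), [64,96), [96,100). Every range starts on a 64-byte boundary, so adjacent threads never write to the same cache line.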