Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
Stokhos_TinyVec.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41
42#ifndef STOKHOS_TINY_VEC_HPP
43#define STOKHOS_TINY_VEC_HPP
44
45#include "Stokhos_ConfigDefs.h"
46#if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
47
48extern "C" {
49#include <immintrin.h>
50}
51
52#endif
53
54#include "Kokkos_Macros.hpp"
55
56namespace Stokhos {
57
58#if defined(__INTEL_COMPILER) && ! defined( __CUDA_ARCH__)
59
60template <typename ValueType, int N, bool UseIntrinsics, bool Mask = false >
61class TinyVec {
62public:
63
64 static const int Num = N;
65
66 KOKKOS_INLINE_FUNCTION
67 TinyVec() {}
68
69 KOKKOS_INLINE_FUNCTION
70 TinyVec(const ValueType a[]) {
71 load(a);
72 }
73
74 template <typename OrdinalType>
75 KOKKOS_INLINE_FUNCTION
76 TinyVec(const ValueType a[], const OrdinalType idx[]) {
77 gather(a,idx);
78 }
79
80 KOKKOS_INLINE_FUNCTION
81 TinyVec(const ValueType a) {
82 load(a);
83 }
84
85 KOKKOS_INLINE_FUNCTION
86 TinyVec(const TinyVec& tv) {
87#pragma ivdep
88#pragma vector aligned
89 for (int i=0; i<Num; ++i)
90 v[i] = tv.v[i];
91 }
92
93 KOKKOS_INLINE_FUNCTION
94 TinyVec& operator=(const TinyVec& tv) {
95#pragma ivdep
96#pragma vector aligned
97 for (int i=0; i<Num; ++i)
98 v[i] = tv.v[i];
99 return *this;
100 }
101
102 KOKKOS_INLINE_FUNCTION
103 void load(const ValueType a[]) {
104#pragma ivdep
105#pragma vector aligned
106 for (int i=0; i<Num; ++i)
107 v[i] = a[i];
108 }
109
110 KOKKOS_INLINE_FUNCTION
111 void load(const ValueType a) {
112#pragma ivdep
113#pragma vector aligned
114 for (int i=0; i<Num; ++i)
115 v[i] = a;
116 }
117
118 KOKKOS_INLINE_FUNCTION
119 void aligned_load(const ValueType a[]) {
120#pragma ivdep
121#pragma vector aligned
122 for (int i=0; i<Num; ++i)
123 v[i] = a[i];
124 }
125
126 template <typename OrdinalType>
127 KOKKOS_INLINE_FUNCTION
128 void gather(const ValueType a[], const OrdinalType idx[]) {
129#pragma ivdep
130#pragma vector aligned
131 for (int i=0; i<Num; ++i)
132 v[i] = a[idx[i]];
133 }
134
135 KOKKOS_INLINE_FUNCTION
136 void scatter(ValueType a[]) const {
137#pragma ivdep
138#pragma vector aligned
139 for (int i=0; i<Num; ++i)
140 a[i] = v[i];
141 }
142
143 KOKKOS_INLINE_FUNCTION
144 void aligned_scatter(ValueType a[]) const {
145#pragma ivdep
146#pragma vector aligned
147 for (int i=0; i<Num; ++i)
148 a[i] = v[i];
149 }
150
151 KOKKOS_INLINE_FUNCTION
152 void zero() {
153#pragma ivdep
154#pragma vector aligned
155 for (int i=0; i<Num; ++i)
156 v[i] = ValueType(0.0);
157 }
158
159 KOKKOS_INLINE_FUNCTION
160 void plus_equal(const TinyVec& t) {
161#pragma ivdep
162#pragma vector aligned
163 for (int i=0; i<Num; ++i)
164 v[i] += t.v[i];
165 }
166
167 KOKKOS_INLINE_FUNCTION
168 void times_equal(const TinyVec& t) {
169#pragma ivdep
170#pragma vector aligned
171 for (int i=0; i<Num; ++i)
172 v[i] *= t.v[i];
173 }
174
175 // *this = *this + t1 * t2
176 KOKKOS_INLINE_FUNCTION
177 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
178#pragma ivdep
179#pragma vector aligned
180 for (int i=0; i<Num; ++i)
181 v[i] += t1.v[i]*t2.v[i];
182 }
183
184 KOKKOS_INLINE_FUNCTION
185 ValueType sum() const {
186 ValueType s(0.0);
187#pragma ivdep
188#pragma vector aligned
189 for (int i=0; i<Num; ++i)
190 s += v[i];
191 return s;
192 }
193
194private:
195 ValueType v[Num] __attribute__((aligned(64)));
196};
197
198template <typename ValueType, int N, bool UseIntrinsics >
199class TinyVec<ValueType,N,UseIntrinsics,true> {
200public:
201
202 static const int Num = N;
203
204 KOKKOS_INLINE_FUNCTION
205 TinyVec(int size) { sz = size; }
206
207 KOKKOS_INLINE_FUNCTION
208 TinyVec(const ValueType a[], int size) {
209 sz = size;
210 load(a);
211 }
212
213 template <typename OrdinalType>
214 KOKKOS_INLINE_FUNCTION
215 TinyVec(const ValueType a[], const OrdinalType idx[], int size) {
216 sz = size;
217 gather(a,idx);
218 }
219
220 KOKKOS_INLINE_FUNCTION
221 TinyVec(const ValueType a, int size) {
222 sz = size;
223 load(a);
224 }
225
226 KOKKOS_INLINE_FUNCTION
227 TinyVec(const TinyVec& tv) {
228 sz = tv.sz;
229#pragma ivdep
230#pragma vector aligned
231 for (int i=0; i<sz; ++i)
232 v[i] = tv.v[i];
233 }
234
235 KOKKOS_INLINE_FUNCTION
236 TinyVec& operator=(const TinyVec& tv) {
237 sz = tv.sz;
238#pragma ivdep
239#pragma vector aligned
240 for (int i=0; i<sz; ++i)
241 v[i] = tv.v[i];
242 return *this;
243 }
244
245 KOKKOS_INLINE_FUNCTION
246 void load(const ValueType a[]) {
247#pragma ivdep
248#pragma vector aligned
249 for (int i=0; i<sz; ++i)
250 v[i] = a[i];
251 }
252
253 KOKKOS_INLINE_FUNCTION
254 void load(const ValueType a) {
255#pragma ivdep
256#pragma vector aligned
257 for (int i=0; i<sz; ++i)
258 v[i] = a;
259 }
260
261 KOKKOS_INLINE_FUNCTION
262 void aligned_load(const ValueType a[]) {
263#pragma ivdep
264#pragma vector aligned
265 for (int i=0; i<sz; ++i)
266 v[i] = a[i];
267 }
268
269 template <typename OrdinalType>
270 KOKKOS_INLINE_FUNCTION
271 void gather(const ValueType a[], const OrdinalType idx[]) {
272#pragma ivdep
273#pragma vector aligned
274 for (int i=0; i<sz; ++i)
275 v[i] = a[idx[i]];
276 }
277
278 KOKKOS_INLINE_FUNCTION
279 void scatter(ValueType a[]) const {
280#pragma ivdep
281#pragma vector aligned
282 for (int i=0; i<sz; ++i)
283 a[i] = v[i];
284 }
285
286 KOKKOS_INLINE_FUNCTION
287 void aligned_scatter(ValueType a[]) const {
288#pragma ivdep
289#pragma vector aligned
290 for (int i=0; i<sz; ++i)
291 a[i] = v[i];
292 }
293
294 KOKKOS_INLINE_FUNCTION
295 void zero() {
296#pragma ivdep
297#pragma vector aligned
298 for (int i=0; i<sz; ++i)
299 v[i] = ValueType(0.0);
300 }
301
302 KOKKOS_INLINE_FUNCTION
303 void plus_equal(const TinyVec& t) {
304#pragma ivdep
305#pragma vector aligned
306 for (int i=0; i<sz; ++i)
307 v[i] += t.v[i];
308 }
309
310 KOKKOS_INLINE_FUNCTION
311 void times_equal(const TinyVec& t) {
312#pragma ivdep
313#pragma vector aligned
314 for (int i=0; i<sz; ++i)
315 v[i] *= t.v[i];
316 }
317
318 // *this = *this + t1 * t2
319 KOKKOS_INLINE_FUNCTION
320 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
321#pragma ivdep
322#pragma vector aligned
323 for (int i=0; i<sz; ++i)
324 v[i] += t1.v[i]*t2.v[i];
325 }
326
327 KOKKOS_INLINE_FUNCTION
328 ValueType sum() const {
329 ValueType s(0.0);
330#pragma ivdep
331#pragma vector aligned
332 for (int i=0; i<sz; ++i)
333 s += v[i];
334 return s;
335 }
336
337private:
338 ValueType v[Num] __attribute__((aligned(64)));
339 int sz;
340};
341
342#else
343
344template <typename ValueType, int N, bool UseIntrinsics, bool Mask = false >
345class TinyVec {
346public:
347
348 static const int Num = N;
349
350 KOKKOS_INLINE_FUNCTION
352
353 KOKKOS_INLINE_FUNCTION
354 TinyVec(const ValueType a[]) {
355 load(a);
356 }
357
358 template <typename OrdinalType>
359 KOKKOS_INLINE_FUNCTION
360 TinyVec(const ValueType a[], const OrdinalType idx[]) {
361 gather(a,idx);
362 }
363
364 KOKKOS_INLINE_FUNCTION
365 TinyVec(const ValueType a) {
366 load(a);
367 }
368
369 KOKKOS_INLINE_FUNCTION
370 TinyVec(const TinyVec& tv) {
371 for (int i=0; i<Num; ++i)
372 v[i] = tv.v[i];
373 }
374
375 KOKKOS_INLINE_FUNCTION
377 for (int i=0; i<Num; ++i)
378 v[i] = tv.v[i];
379 return *this;
380 }
381
382 KOKKOS_INLINE_FUNCTION
383 void load(const ValueType a[]) {
384 for (int i=0; i<Num; ++i)
385 v[i] = a[i];
386 }
387
388 KOKKOS_INLINE_FUNCTION
389 void load(const ValueType a) {
390 for (int i=0; i<Num; ++i)
391 v[i] = a;
392 }
393
394 KOKKOS_INLINE_FUNCTION
395 void aligned_load(const ValueType a[]) {
396 for (int i=0; i<Num; ++i)
397 v[i] = a[i];
398 }
399
400 template <typename OrdinalType>
401 KOKKOS_INLINE_FUNCTION
402 void gather(const ValueType a[], const OrdinalType idx[]) {
403 for (int i=0; i<Num; ++i)
404 v[i] = a[idx[i]];
405 }
406
407 KOKKOS_INLINE_FUNCTION
408 void scatter(ValueType a[]) const {
409 for (int i=0; i<Num; ++i)
410 a[i] = v[i];
411 }
412
413 KOKKOS_INLINE_FUNCTION
414 void aligned_scatter(ValueType a[]) const {
415 for (int i=0; i<Num; ++i)
416 a[i] = v[i];
417 }
418
419 KOKKOS_INLINE_FUNCTION
420 void zero() {
421 for (int i=0; i<Num; ++i)
422 v[i] = ValueType(0.0);
423 }
424
425 KOKKOS_INLINE_FUNCTION
426 void plus_equal(const TinyVec& t) {
427 for (int i=0; i<Num; ++i)
428 v[i] += t.v[i];
429 }
430
431 KOKKOS_INLINE_FUNCTION
432 void times_equal(const TinyVec& t) {
433 for (int i=0; i<Num; ++i)
434 v[i] *= t.v[i];
435 }
436
437 // *this = *this + t1 * t2
438 KOKKOS_INLINE_FUNCTION
439 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
440 for (int i=0; i<Num; ++i)
441 v[i] += t1.v[i]*t2.v[i];
442 }
443
444 KOKKOS_INLINE_FUNCTION
445 ValueType sum() const {
446 ValueType s(0.0);
447 for (int i=0; i<Num; ++i)
448 s += v[i];
449 return s;
450 }
451
452private:
453 ValueType v[Num];
454};
455
456template <typename ValueType, int N, bool UseIntrinsics >
457class TinyVec<ValueType,N,UseIntrinsics,true> {
458public:
459
460 static const int Num = N;
461
462 KOKKOS_INLINE_FUNCTION
463 TinyVec(int size) { sz = size; }
464
465 KOKKOS_INLINE_FUNCTION
466 TinyVec(const ValueType a[], int size) {
467 sz = size;
468 load(a);
469 }
470
471 template <typename OrdinalType>
472 KOKKOS_INLINE_FUNCTION
473 TinyVec(const ValueType a[], const OrdinalType idx[], int size) {
474 sz = size;
475 gather(a,idx);
476 }
477
478 KOKKOS_INLINE_FUNCTION
479 TinyVec(const ValueType a, int size) {
480 sz = size;
481 load(a);
482 }
483
484 KOKKOS_INLINE_FUNCTION
485 TinyVec(const TinyVec& tv) {
486 sz = tv.sz;
487 for (int i=0; i<sz; ++i)
488 v[i] = tv.v[i];
489 }
490
491 KOKKOS_INLINE_FUNCTION
493 sz = tv.sz;
494 for (int i=0; i<sz; ++i)
495 v[i] = tv.v[i];
496 return *this;
497 }
498
499 KOKKOS_INLINE_FUNCTION
500 void load(const ValueType a[]) {
501 for (int i=0; i<sz; ++i)
502 v[i] = a[i];
503 }
504
505 KOKKOS_INLINE_FUNCTION
506 void load(const ValueType a) {
507 for (int i=0; i<sz; ++i)
508 v[i] = a;
509 }
510
511 KOKKOS_INLINE_FUNCTION
512 void aligned_load(const ValueType a[]) {
513 for (int i=0; i<sz; ++i)
514 v[i] = a[i];
515 }
516
517 template <typename OrdinalType>
518 KOKKOS_INLINE_FUNCTION
519 void gather(const ValueType a[], const OrdinalType idx[]) {
520 for (int i=0; i<sz; ++i)
521 v[i] = a[idx[i]];
522 }
523
524 KOKKOS_INLINE_FUNCTION
525 void scatter(ValueType a[]) const {
526 for (int i=0; i<sz; ++i)
527 a[i] = v[i];
528 }
529
530 KOKKOS_INLINE_FUNCTION
531 void aligned_scatter(ValueType a[]) const {
532 for (int i=0; i<sz; ++i)
533 a[i] = v[i];
534 }
535
536 KOKKOS_INLINE_FUNCTION
537 void zero() {
538 for (int i=0; i<sz; ++i)
539 v[i] = ValueType(0.0);
540 }
541
542 KOKKOS_INLINE_FUNCTION
543 void plus_equal(const TinyVec& t) {
544 for (int i=0; i<sz; ++i)
545 v[i] += t.v[i];
546 }
547
548 KOKKOS_INLINE_FUNCTION
549 void times_equal(const TinyVec& t) {
550 for (int i=0; i<sz; ++i)
551 v[i] *= t.v[i];
552 }
553
554 // *this = *this + t1 * t2
555 KOKKOS_INLINE_FUNCTION
556 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
557 for (int i=0; i<sz; ++i)
558 v[i] += t1.v[i]*t2.v[i];
559 }
560
561 KOKKOS_INLINE_FUNCTION
562 ValueType sum() const {
563 ValueType s(0.0);
564 for (int i=0; i<sz; ++i)
565 s += v[i];
566 return s;
567 }
568
569private:
570 ValueType v[Num];
571 int sz;
572};
573
574#endif
575
576#if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
577
578#ifdef __SSE2__
579template <>
580class TinyVec<double,2,true,false> {
581public:
582
583 typedef double ValueType;
584 static const int Num = 2;
585
586 TinyVec() {}
587
588 TinyVec(const ValueType a[]) {
589 load(a);
590 }
591
592 template <typename OrdinalType>
593 TinyVec(const ValueType a[], const OrdinalType idx[]) {
594 gather(a,idx);
595 }
596
597 TinyVec(const ValueType a) {
598 load(a);
599 }
600
601 TinyVec(const TinyVec& tv) {
602 v = tv.v;
603 }
604
605 TinyVec& operator=(const TinyVec& tv) {
606 v = tv.v;
607 return *this;
608 }
609
610 void load(const ValueType a[]) {
611 v = _mm_set_pd(a[1], a[0]);
612 }
613
614 void load(const ValueType a) {
615 v = _mm_set1_pd(a);
616 }
617
618 void aligned_load(const ValueType a[]) {
619 v = _mm_load_pd(a);
620 }
621
622 template <typename OrdinalType>
623 void gather(const ValueType a[], const OrdinalType idx[]) {
624 v = _mm_set_pd(a[idx[1]], a[idx[0]]);
625 }
626
627 void scatter(ValueType a[]) const {
628 _mm_storel_pd(&a[0], v);
629 _mm_storeh_pd(&a[1], v);
630 }
631
632 void aligned_scatter(ValueType a[]) const {
633 _mm_store_pd(a, v);
634 }
635
636 void zero() {
637 v = _mm_setzero_pd();
638 }
639
640 void plus_equal(const TinyVec& t) {
641 v = _mm_add_pd(v, t.v);
642 }
643
644 void times_equal(const TinyVec& t) {
645 v = _mm_mul_pd(v, t.v);
646 }
647
648 // *this = *this + t1 * t2
649 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
650 __m128d t = _mm_mul_pd(t1.v, t2.v);
651 v = _mm_add_pd(v, t);
652 }
653
654 ValueType sum() const {
655 ValueType a[Num];
656 scatter(a);
657 return a[0]+a[1];
658 }
659
660private:
661 __m128d v;
662};
663#endif
664
665#ifdef __AVX__
666template <>
667class TinyVec<float,8,true,false> {
668public:
669
670 typedef float ValueType;
671 static const int Num = 8;
672
673 TinyVec() {}
674
675 TinyVec(const ValueType a[]) {
676 aligned_load(a);
677 }
678
679 template <typename OrdinalType>
680 TinyVec(const ValueType a[], const OrdinalType idx[]) {
681 gather(a,idx);
682 }
683
684 TinyVec(const ValueType a) {
685 load(a);
686 }
687
688 TinyVec(const TinyVec& tv) {
689 v = tv.v;
690 }
691
692 TinyVec& operator=(const TinyVec& tv) {
693 v = tv.v;
694 return *this;
695 }
696
697 void load(const ValueType a[]) {
698 v = _mm256_loadu_ps(a);
699 }
700
701 void load(const ValueType a) {
702 v = _mm256_set1_ps(a);
703 }
704
705 void aligned_load(const ValueType a[]) {
706 v = _mm256_load_ps(a);
707 }
708
709 template <typename OrdinalType>
710 void gather(const ValueType a[], const OrdinalType idx[]) {
711 __m128 v1 = _mm_set_ps(a[idx[3]], a[idx[2]], a[idx[1]], a[idx[0]]);
712 __m128 v2 = _mm_set_ps(a[idx[7]], a[idx[6]], a[idx[5]], a[idx[4]]);
713 v = _mm256_insertf128_ps(v, v1, 0);
714 v = _mm256_insertf128_ps(v, v2, 1);
715 }
716
717 void scatter(ValueType a[]) const {
718 _mm256_storeu_ps(a, v);
719 }
720
721 void aligned_scatter(ValueType a[]) const {
722 _mm256_store_ps(a, v);
723 }
724
725 void zero() {
726 v = _mm256_setzero_ps();
727 }
728
729 void plus_equal(const TinyVec& t) {
730 v = _mm256_add_ps(v, t.v);
731 }
732
733 void times_equal(const TinyVec& t) {
734 v = _mm256_mul_ps(v, t.v);
735 }
736
737 // *this = *this + t1 * t2
738 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
739 __m256 t = _mm256_mul_ps(t1.v, t2.v);
740 v = _mm256_add_ps(v, t);
741 }
742
743 ValueType sum() {
744 __m256 s = _mm256_hadd_ps(v,v);
745 __m128 sl = _mm256_extractf128_ps(s, 0);
746 __m128 sh = _mm256_extractf128_ps(s, 1);
747 sl = _mm_add_ps(sl,sh);
748 sl = _mm_hadd_ps(sl,sl);
749 ValueType res;
750 _MM_EXTRACT_FLOAT(res, sl, 0);
751
752 return res;
753 }
754
755private:
756 __m256 v;
757};
758
759template <>
760class TinyVec<double,4,true,false> {
761public:
762
763 typedef double ValueType;
764 static const int Num = 4;
765
766 TinyVec() {}
767
768 TinyVec(const ValueType a[]) {
769 aligned_load(a);
770 }
771
772 template <typename OrdinalType>
773 TinyVec(const ValueType a[], const OrdinalType idx[]) {
774 gather(a,idx);
775 }
776
777 TinyVec(const ValueType a) {
778 load(a);
779 }
780
781 TinyVec(const TinyVec& tv) {
782 v = tv.v;
783 }
784
785 TinyVec& operator=(const TinyVec& tv) {
786 v = tv.v;
787 return *this;
788 }
789
790 void load(const ValueType a[]) {
791 v = _mm256_loadu_pd(a);
792 }
793
794 void load(const ValueType a) {
795 v = _mm256_set1_pd(a);
796 }
797
798 void aligned_load(const ValueType a[]) {
799 v = _mm256_load_pd(a);
800 }
801
802 template <typename OrdinalType>
803 void gather(const ValueType a[], const OrdinalType idx[]) {
804 __m128d v1 = _mm_set_pd(a[idx[1]], a[idx[0]]);
805 __m128d v2 = _mm_set_pd(a[idx[3]], a[idx[2]]);
806 v = _mm256_insertf128_pd(v, v1, 0);
807 v = _mm256_insertf128_pd(v, v2, 1);
808 }
809
810 void scatter(ValueType a[]) const {
811 _mm256_storeu_pd(a, v);
812 }
813
814 void aligned_scatter(ValueType a[]) const {
815 _mm256_store_pd(a, v);
816 }
817
818 void zero() {
819 v = _mm256_setzero_pd();
820 }
821
822 void plus_equal(const TinyVec& t) {
823 v = _mm256_add_pd(v, t.v);
824 }
825
826 void times_equal(const TinyVec& t) {
827 v = _mm256_mul_pd(v, t.v);
828 }
829
830 // *this = *this + t1 * t2
831 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
832 __m256d t = _mm256_mul_pd(t1.v, t2.v);
833 v = _mm256_add_pd(v, t);
834 }
835
836 ValueType sum() {
837 // ValueType a[Num];
838 // scatter(a);
839 // return a[0]+a[1]+a[2]+a[3];
840
841 // __m128d vl = _mm256_extractf128_pd(v, 0); // v[0], v[1]
842 // __m128d vh = _mm256_extractf128_pd(v, 1); // v[2], v[3]
843 // vh = _mm_hadd_pd(vl, vh); // v[0]+v[1], v[2]+v[3]
844 // vh = _mm_hadd_pd(vh, vh); // v[0]+v[1]+v[2]+v[3], v[0]+v[1]+v[2]+v[3]
845 // ValueType res;
846 // _mm_storel_pd(&res, vh);
847 // return res;
848
849 __m256d s = _mm256_hadd_pd(v,v); //v[0]+v[1] v[0]+v[1] v[2]+v[3] v[2]+v[3]
850 __m128d sl = _mm256_extractf128_pd(s, 0); //v[0]+v[1] v[0]+v[1]
851 __m128d sh = _mm256_extractf128_pd(s, 1); //v[2]+v[3] v[2]+v[3]
852 sl = _mm_add_pd(sl,sh); // v[0]+v[1]+v[2]+v[3] v[0]+v[1]+v[2]+v[3]
853 ValueType res;
854 _mm_storel_pd(&res, sl);
855 return res;
856 }
857
858private:
859 __m256d v;
860};
861
862template <>
863class TinyVec<double,8,true,false> {
864public:
865
866 typedef double ValueType;
867 static const int Num = 8;
868
869 TinyVec() {}
870
871 TinyVec(const ValueType a[]) {
872 load(a);
873 }
874
875 template <typename OrdinalType>
876 TinyVec(const ValueType a[], const OrdinalType idx[]) {
877 gather(a,idx);
878 }
879
880 TinyVec(const ValueType a) {
881 load(a);
882 }
883
884 TinyVec(const TinyVec& tv) {
885 v1 = tv.v1; v2 = tv.v2;
886 }
887
888 TinyVec& operator=(const TinyVec& tv) {
889 v1 = tv.v1; v2 = tv.v2;
890 return *this;
891 }
892
893 void load(const ValueType a[]) {
894 v1 = _mm256_loadu_pd(a);
895 v2 = _mm256_loadu_pd(a+4);
896 }
897
898 void load(const ValueType a) {
899 v1 = _mm256_set1_pd(a);
900 v2 = _mm256_set1_pd(a);
901 }
902
903 void aligned_load(const ValueType a[]) {
904 v1 = _mm256_load_pd(a);
905 v2 = _mm256_load_pd(a+4);
906 }
907
908 template <typename OrdinalType>
909 void gather(const ValueType a[], const OrdinalType idx[]) {
910 __m128d t1 = _mm_set_pd(a[idx[1]], a[idx[0]]);
911 __m128d t2 = _mm_set_pd(a[idx[3]], a[idx[2]]);
912 __m128d t3 = _mm_set_pd(a[idx[5]], a[idx[4]]);
913 __m128d t4 = _mm_set_pd(a[idx[7]], a[idx[6]]);
914 v1 = _mm256_insertf128_pd(v1, t1, 0);
915 v1 = _mm256_insertf128_pd(v1, t2, 1);
916 v2 = _mm256_insertf128_pd(v2, t3, 0);
917 v2 = _mm256_insertf128_pd(v2, t4, 1);
918 }
919
920 void scatter(ValueType a[]) const {
921 _mm256_storeu_pd(a, v1);
922 _mm256_storeu_pd(a+4, v2);
923 }
924
925 void aligned_scatter(ValueType a[]) const {
926 _mm256_store_pd(a, v1);
927 _mm256_store_pd(a+4, v2);
928 }
929
930 void zero() {
931 v1 = _mm256_setzero_pd();
932 v2 = _mm256_setzero_pd();
933 }
934
935 void plus_equal(const TinyVec& t) {
936 v1 = _mm256_add_pd(v1, t.v1);
937 v2 = _mm256_add_pd(v2, t.v2);
938 }
939
940 void times_equal(const TinyVec& t) {
941 v1 = _mm256_mul_pd(v1, t.v1);
942 v2 = _mm256_mul_pd(v2, t.v2);
943 }
944
945 // *this = *this + t1 * t2
946 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
947 __m256d t = _mm256_mul_pd(t1.v1, t2.v1);
948 __m256d s = _mm256_mul_pd(t1.v2, t2.v2);
949 v1 = _mm256_add_pd(v1, t);
950 v2 = _mm256_add_pd(v2, s);
951 }
952
953 ValueType sum() {
954 __m256d s1 = _mm256_hadd_pd(v1,v1);//v[0]+v[1] v[0]+v[1] v[2]+v[3] v[2]+v[3]
955 __m128d s1l = _mm256_extractf128_pd(s1, 0); //v[0]+v[1] v[0]+v[1]
956 __m128d s1h = _mm256_extractf128_pd(s1, 1); //v[2]+v[3] v[2]+v[3]
957 s1l = _mm_add_pd(s1l,s1h); // v[0]+v[1]+v[2]+v[3] v[0]+v[1]+v[2]+v[3]
958 ValueType res1;
959 _mm_storel_pd(&res1, s1l);
960
961 __m256d s2 = _mm256_hadd_pd(v2,v2);//v[0]+v[1] v[0]+v[1] v[2]+v[3] v[2]+v[3]
962 __m128d s2l = _mm256_extractf128_pd(s2, 0); //v[0]+v[1] v[0]+v[1]
963 __m128d s2h = _mm256_extractf128_pd(s2, 1); //v[2]+v[3] v[2]+v[3]
964 s2l = _mm_add_pd(s2l,s2h); // v[0]+v[1]+v[2]+v[3] v[0]+v[1]+v[2]+v[3]
965 ValueType res2;
966 _mm_storel_pd(&res2, s2l);
967
968 return res1 + res2;
969 }
970
971private:
972 __m256d v1, v2;
973};
974#endif
975
976#if defined( __MIC__ )
977template <>
978class TinyVec<double,8,true,false> {
979public:
980
981 typedef double ValueType;
982 static const int Num = 8;
983
984 TinyVec() {}
985
986 TinyVec(const ValueType a[]) {
987 load(a);
988 }
989
990 template <typename OrdinalType>
991 TinyVec(const ValueType a[], const OrdinalType idx[]) {
992 gather(a,idx);
993 }
994
995 TinyVec(const ValueType a) {
996 load(a);
997 }
998
999 TinyVec(const TinyVec& tv) {
1000 v = tv.v;
1001 }
1002
1003 TinyVec& operator=(const TinyVec& tv) {
1004 v = tv.v;
1005 return *this;
1006 }
1007
1008 void load(const ValueType a[]) {
1009 v = _mm512_load_pd(a);
1010 }
1011
1012 void load(const ValueType a) {
1013 v = _mm512_set1_pd(a);
1014 }
1015
1016 void aligned_load(const ValueType a[]) {
1017 v = _mm512_load_pd(a);
1018 }
1019
1020 template <typename OrdinalType>
1021 void gather(const ValueType a[], const OrdinalType idx[]) {
1022 __mmask16 mask = _mm512_int2mask(255);
1023 __m512i vidx = _mm512_setzero_epi32();
1024 vidx = _mm512_mask_load_epi32(vidx, mask, idx);
1025 v = _mm512_i32logather_pd(vidx, a, 8);
1026 }
1027
1028 void scatter(ValueType a[]) const {
1029 _mm512_store_pd(a, v);
1030 }
1031
1032 void aligned_scatter(ValueType a[]) const {
1033 _mm512_store_pd(a, v);
1034 }
1035
1036 void zero() {
1037 v = _mm512_setzero_pd();
1038 }
1039
1040 void plus_equal(const TinyVec& t) {
1041 v = _mm512_add_pd(v, t.v);
1042 }
1043
1044 void times_equal(const TinyVec& t) {
1045 v = _mm512_mul_pd(v, t.v);
1046 }
1047
1048 // *this = *this + t1 * t2
1049 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
1050 v = _mm512_fmadd_pd(t1.v, t2.v, v);
1051 }
1052
1053 ValueType sum() {
1054 return _mm512_reduce_add_pd(v);
1055 }
1056
1057private:
1058 __m512d v;
1059};
1060
1061template <>
1062class TinyVec<double,8,true,true> {
1063public:
1064
1065 typedef double ValueType;
1066 static const int Num = 8;
1067
1068 TinyVec(const int sz) {
1069 mask = _mm512_int2mask((1 << (sz+1))-1);
1070 }
1071
1072 TinyVec(const ValueType a[], const int sz) {
1073 mask = _mm512_int2mask((1 << (sz+1))-1);
1074 load(a);
1075 }
1076
1077 template <typename OrdinalType>
1078 TinyVec(const ValueType a[], const OrdinalType idx[], const int sz) {
1079 mask = _mm512_int2mask((1 << (sz+1))-1);
1080 gather(a,idx);
1081 }
1082
1083 TinyVec(const ValueType a, int sz) {
1084 mask = _mm512_int2mask((1 << (sz+1))-1);
1085 load(a);
1086 }
1087
1088 TinyVec(const TinyVec& tv) {
1089 mask = tv.mask;
1090 v = tv.v;
1091 }
1092
1093 TinyVec& operator=(const TinyVec& tv) {
1094 mask = tv.mask;
1095 v = tv.v;
1096 return *this;
1097 }
1098
1099 void load(const ValueType a[]) {
1100 v = _mm512_setzero_pd();
1101 v = _mm512_mask_load_pd(v, mask, a);
1102 }
1103
1104 void load(const ValueType a) {
1105 v = _mm512_set1_pd(a);
1106 }
1107
1108 void aligned_load(const ValueType a[]) {
1109 v = _mm512_setzero_pd();
1110 v = _mm512_mask_load_pd(v, mask, a);
1111 }
1112
1113 template <typename OrdinalType>
1114 void gather(const ValueType a[], const OrdinalType idx[]) {
1115 // We're assuming idx is an array of 32-bit integers
1116 // Load 16 integers into v1idx, then permute the high 256 bits
1117 // to the low 256 bits (DCBA -> BADC where 128 bit lanes are read right to
1118 // left). Then load the vectors into v1 and v2.
1119 // logather_pd only uses the low 256 bits in the index vector.
1120 __m512i vidx = _mm512_load_epi32(idx);
1121 v = _mm512_setzero_pd();
1122 v = _mm512_mask_i32logather_pd(v, mask, vidx, a, 8);
1123 }
1124
1125 void scatter(ValueType a[]) const {
1126 _mm512_mask_store_pd(a, mask, v);
1127 }
1128
1129 void aligned_scatter(ValueType a[]) const {
1130 _mm512_mask_store_pd(a, mask, v);
1131 }
1132
1133 void zero() {
1134 v = _mm512_setzero_pd();
1135 }
1136
1137 void plus_equal(const TinyVec& t) {
1138 v = _mm512_mask_add_pd(v, mask, v, t.v);
1139 }
1140
1141 void times_equal(const TinyVec& t) {
1142 v = _mm512_mask_mul_pd(v, mask, v, t.v);
1143 }
1144
1145 // *this = *this + t1 * t2
1146 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
1147 v = _mm512_mask3_fmadd_pd(t1.v, t2.v, v, mask);
1148 }
1149
1150 ValueType sum() {
1151 return _mm512_mask_reduce_add_pd(mask, v);
1152 }
1153
1154private:
1155 __mmask8 mask;
1156 __m512d v;
1157};
1158
1159template <>
1160class TinyVec<double,16,true,false> {
1161public:
1162
1163 typedef double ValueType;
1164 static const int Num = 16;
1165
1166 TinyVec() {}
1167
1168 TinyVec(const ValueType a[]) {
1169 load(a);
1170 }
1171
1172 template <typename OrdinalType>
1173 TinyVec(const ValueType a[], const OrdinalType idx[]) {
1174 gather(a,idx);
1175 }
1176
1177 TinyVec(const ValueType a) {
1178 load(a);
1179 }
1180
1181 TinyVec(const TinyVec& tv) {
1182 v1 = tv.v1; v2 = tv.v2;
1183 }
1184
1185 TinyVec& operator=(const TinyVec& tv) {
1186 v1 = tv.v1; v2 = tv.v2;
1187 return *this;
1188 }
1189
1190 void load(const ValueType a[]) {
1191 v1 = _mm512_load_pd(a);
1192 v2 = _mm512_load_pd(a+8);
1193 }
1194
1195 void load(const ValueType a) {
1196 v1 = _mm512_set1_pd(a);
1197 v2 = _mm512_set1_pd(a);
1198 }
1199
1200 void aligned_load(const ValueType a[]) {
1201 v1 = _mm512_load_pd(a);
1202 v2 = _mm512_load_pd(a+8);
1203 }
1204
1205 template <typename OrdinalType>
1206 void gather(const ValueType a[], const OrdinalType idx[]) {
1207 // We're assuming idx is an array of 32-bit integers
1208 // Load 16 integers into v1idx, then permute the high 256 bits
1209 // to the low 256 bits (DCBA -> BADC where 128 bit lanes are read right to
1210 // left). Then load the vectors into v1 and v2.
1211 // logather_pd only uses the low 256 bits in the index vector.
1212 __m512i v1idx = _mm512_load_epi32(idx);
1213 __m512i v2idx = _mm512_permute4f128_epi32(v1idx, _MM_PERM_BADC);
1214 v1 = _mm512_i32logather_pd(v1idx, a, 8);
1215 v2 = _mm512_i32logather_pd(v2idx, a, 8);
1216 }
1217
1218 void scatter(ValueType a[]) const {
1219 _mm512_store_pd(a, v1);
1220 _mm512_store_pd(a+8, v2);
1221 }
1222
1223 void aligned_scatter(ValueType a[]) const {
1224 _mm512_store_pd(a, v1);
1225 _mm512_store_pd(a+8, v2);
1226 }
1227
1228 void zero() {
1229 v1 = _mm512_setzero_pd();
1230 v2 = _mm512_setzero_pd();
1231 }
1232
1233 void plus_equal(const TinyVec& t) {
1234 v1 = _mm512_add_pd(v1, t.v1);
1235 v2 = _mm512_add_pd(v2, t.v2);
1236 }
1237
1238 void times_equal(const TinyVec& t) {
1239 v1 = _mm512_mul_pd(v1, t.v1);
1240 v2 = _mm512_mul_pd(v2, t.v2);
1241 }
1242
1243 // *this = *this + t1 * t2
1244 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
1245 v1 = _mm512_fmadd_pd(t1.v1, t2.v1, v1);
1246 v2 = _mm512_fmadd_pd(t1.v2, t2.v2, v2);
1247 }
1248
1249 ValueType sum() {
1250 return _mm512_reduce_add_pd(v1) + _mm512_reduce_add_pd(v2);
1251 }
1252
1253private:
1254 __m512d v1, v2;
1255};
1256
1257template <>
1258class TinyVec<double,16,true,true> {
1259public:
1260
1261 typedef double ValueType;
1262 static const int Num = 16;
1263
1264 TinyVec(const int sz) {
1265 mask = _mm512_int2mask((1 << (sz-7))-1);
1266 }
1267
1268 TinyVec(const ValueType a[], int sz) {
1269 mask = _mm512_int2mask((1 << (sz-7))-1);
1270 load(a);
1271 }
1272
1273 template <typename OrdinalType>
1274 TinyVec(const ValueType a[], const OrdinalType idx[], int sz) {
1275 mask = _mm512_int2mask((1 << (sz-7))-1);
1276 gather(a,idx);
1277 }
1278
1279 TinyVec(const ValueType a, int sz) {
1280 mask = _mm512_int2mask((1 << (sz-7))-1);
1281 load(a);
1282 }
1283
1284 TinyVec(const TinyVec& tv) {
1285 mask = tv.mask;
1286 v1 = tv.v1; v2 = tv.v2;
1287 }
1288
1289 TinyVec& operator=(const TinyVec& tv) {
1290 mask = tv.mask;
1291 v1 = tv.v1; v2 = tv.v2;
1292 return *this;
1293 }
1294
1295 void load(const ValueType a[]) {
1296 v1 = _mm512_load_pd(a);
1297 v2 = _mm512_setzero_pd();
1298 v2 = _mm512_mask_load_pd(v2, mask, a+8);
1299 }
1300
1301 void load(const ValueType a) {
1302 v1 = _mm512_set1_pd(a);
1303 v2 = _mm512_set1_pd(a);
1304 }
1305
1306 void aligned_load(const ValueType a[]) {
1307 v1 = _mm512_load_pd(a);
1308 v2 = _mm512_setzero_pd();
1309 v2 = _mm512_mask_load_pd(v2, mask, a+8);
1310 }
1311
1312 template <typename OrdinalType>
1313 void gather(const ValueType a[], const OrdinalType idx[]) {
1314 // We're assuming idx is an array of 32-bit integers
1315 // Load 16 integers into v1idx, then permute the high 256 bits
1316 // to the low 256 bits (DCBA -> BADC where 128 bit lanes are read right to
1317 // left). Then load the vectors into v1 and v2.
1318 // logather_pd only uses the low 256 bits in the index vector.
1319 // Note: permute4f128 overwrites its argument, so we need to load v1 first
1320 __m512i v1idx = _mm512_load_epi32(idx);
1321 v1 = _mm512_i32logather_pd(v1idx, a, 8);
1322
1323 v1idx = _mm512_permute4f128_epi32(v1idx, _MM_PERM_BADC);
1324 v2 = _mm512_setzero_pd();
1325 v2 = _mm512_mask_i32logather_pd(v2, mask, v1idx, a, 8);
1326 }
1327
1328 void scatter(ValueType a[]) const {
1329 _mm512_store_pd(a, v1);
1330 _mm512_mask_store_pd(a+8, mask, v2);
1331 }
1332
1333 void aligned_scatter(ValueType a[]) const {
1334 _mm512_store_pd(a, v1);
1335 _mm512_mask_store_pd(a+8, mask, v2);
1336 }
1337
1338 void zero() {
1339 v1 = _mm512_setzero_pd();
1340 v2 = _mm512_setzero_pd();
1341 }
1342
1343 void plus_equal(const TinyVec& t) {
1344 v1 = _mm512_add_pd(v1, t.v1);
1345 v2 = _mm512_mask_add_pd(v2, mask, v2, t.v2);
1346 }
1347
1348 void times_equal(const TinyVec& t) {
1349 v1 = _mm512_mul_pd(v1, t.v1);
1350 v2 = _mm512_mask_mul_pd(v2, mask, v2, t.v2);
1351 }
1352
1353 // *this = *this + t1 * t2
1354 void multiply_add(const TinyVec& t1, const TinyVec& t2) {
1355 v1 = _mm512_fmadd_pd(t1.v1, t2.v1, v1);
1356 v2 = _mm512_mask3_fmadd_pd(t1.v2, t2.v2, v2, mask);
1357 }
1358
1359 ValueType sum() {
1360 return _mm512_reduce_add_pd(v1) + _mm512_mask_reduce_add_pd(mask, v2);
1361 }
1362
1363private:
1364 __mmask8 mask;
1365 __m512d v1, v2;
1366};
1367#endif
1368
1369#endif // #if defined(HAVE_STOKHOS_INTRINSICS) && !defined( __CUDACC__ )
1370
1371} // namespace Stokhos
1372
1373#endif /* #ifndef STOKHOS_TINY_VEC_HPP */
KOKKOS_INLINE_FUNCTION void aligned_load(const ValueType a[])
KOKKOS_INLINE_FUNCTION TinyVec(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION ValueType sum() const
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a, int size)
KOKKOS_INLINE_FUNCTION void plus_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], int size)
KOKKOS_INLINE_FUNCTION void aligned_scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION void load(const ValueType a[])
KOKKOS_INLINE_FUNCTION void scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION void times_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION TinyVec & operator=(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void multiply_add(const TinyVec &t1, const TinyVec &t2)
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], const OrdinalType idx[], int size)
KOKKOS_INLINE_FUNCTION void gather(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION void load(const ValueType a)
KOKKOS_INLINE_FUNCTION void times_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION TinyVec(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION void plus_equal(const TinyVec &t)
KOKKOS_INLINE_FUNCTION void zero()
KOKKOS_INLINE_FUNCTION void load(const ValueType a)
KOKKOS_INLINE_FUNCTION void multiply_add(const TinyVec &t1, const TinyVec &t2)
KOKKOS_INLINE_FUNCTION void aligned_scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION ValueType sum() const
KOKKOS_INLINE_FUNCTION void scatter(ValueType a[]) const
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a)
KOKKOS_INLINE_FUNCTION void aligned_load(const ValueType a[])
static const int Num
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION TinyVec()
KOKKOS_INLINE_FUNCTION void load(const ValueType a[])
KOKKOS_INLINE_FUNCTION void gather(const ValueType a[], const OrdinalType idx[])
KOKKOS_INLINE_FUNCTION TinyVec & operator=(const TinyVec &tv)
KOKKOS_INLINE_FUNCTION TinyVec(const ValueType a[])
Top-level namespace for Stokhos classes and functions.