MagickCore  6.9.13-10
Convert, Edit, Or Compose Bitmap Images
accelerate-kernels-private.h
1 /*
2  Copyright 1999 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4 
5  You may not use this file except in compliance with the License. You may
6  obtain a copy of the License at
7 
8  https://imagemagick.org/script/license.php
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  MagickCore private methods for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29  Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE() "\n #""else " " \n"
34 #define OPENCL_ENDIF() "\n #""endif " " \n"
35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 const char* accelerateKernels =
39 
40 /*
41  Define declarations.
42 */
43  OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
44  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
45  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
46  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
47  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
48  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
49  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
50  OPENCL_DEFINE(SigmaRandom, (attenuate))
51  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
52  OPENCL_DEFINE(MagickMax(x, y), (((x) > (y)) ? (x) : (y)))
53  OPENCL_DEFINE(MagickMin(x, y), (((x) < (y)) ? (x) : (y)))
54 
55 /*
56  Typedef declarations.
57 */
58  STRINGIFY(
59  typedef enum
60  {
61  UndefinedColorspace,
62  RGBColorspace, /* Linear RGB colorspace */
63  GRAYColorspace, /* greyscale (non-linear) image (faked 1 channel) */
64  TransparentColorspace,
65  OHTAColorspace,
66  LabColorspace,
67  XYZColorspace,
68  YCbCrColorspace,
69  YCCColorspace,
70  YIQColorspace,
71  YPbPrColorspace,
72  YUVColorspace,
73  CMYKColorspace, /* negared linear RGB with black separated */
74  sRGBColorspace, /* Default: non-lienar sRGB colorspace */
75  HSBColorspace,
76  HSLColorspace,
77  HWBColorspace,
78  Rec601LumaColorspace,
79  Rec601YCbCrColorspace,
80  Rec709LumaColorspace,
81  Rec709YCbCrColorspace,
82  LogColorspace,
83  CMYColorspace, /* negated linear RGB colorspace */
84  LuvColorspace,
85  HCLColorspace,
86  LCHColorspace, /* alias for LCHuv */
87  LMSColorspace,
88  LCHabColorspace, /* Cylindrical (Polar) Lab */
89  LCHuvColorspace, /* Cylindrical (Polar) Luv */
90  scRGBColorspace,
91  HSIColorspace,
92  HSVColorspace, /* alias for HSB */
93  HCLpColorspace,
94  YDbDrColorspace,
95  LinearGRAYColorspace /* greyscale (linear) image (faked 1 channel) */
96  } ColorspaceType;
97  )
98 
99  STRINGIFY(
100  typedef enum
101  {
102  UndefinedCompositeOp,
103  NoCompositeOp,
104  ModulusAddCompositeOp,
105  AtopCompositeOp,
106  BlendCompositeOp,
107  BumpmapCompositeOp,
108  ChangeMaskCompositeOp,
109  ClearCompositeOp,
110  ColorBurnCompositeOp,
111  ColorDodgeCompositeOp,
112  ColorizeCompositeOp,
113  CopyBlackCompositeOp,
114  CopyBlueCompositeOp,
115  CopyCompositeOp,
116  CopyCyanCompositeOp,
117  CopyGreenCompositeOp,
118  CopyMagentaCompositeOp,
119  CopyOpacityCompositeOp,
120  CopyRedCompositeOp,
121  CopyYellowCompositeOp,
122  DarkenCompositeOp,
123  DstAtopCompositeOp,
124  DstCompositeOp,
125  DstInCompositeOp,
126  DstOutCompositeOp,
127  DstOverCompositeOp,
128  DifferenceCompositeOp,
129  DisplaceCompositeOp,
130  DissolveCompositeOp,
131  ExclusionCompositeOp,
132  HardLightCompositeOp,
133  HueCompositeOp,
134  InCompositeOp,
135  LightenCompositeOp,
136  LinearLightCompositeOp,
137  LuminizeCompositeOp,
138  MinusDstCompositeOp,
139  ModulateCompositeOp,
140  MultiplyCompositeOp,
141  OutCompositeOp,
142  OverCompositeOp,
143  OverlayCompositeOp,
144  PlusCompositeOp,
145  ReplaceCompositeOp,
146  SaturateCompositeOp,
147  ScreenCompositeOp,
148  SoftLightCompositeOp,
149  SrcAtopCompositeOp,
150  SrcCompositeOp,
151  SrcInCompositeOp,
152  SrcOutCompositeOp,
153  SrcOverCompositeOp,
154  ModulusSubtractCompositeOp,
155  ThresholdCompositeOp,
156  XorCompositeOp,
157  /* These are new operators, added after the above was last sorted.
158  * The list should be re-sorted only when a new library version is
159  * created.
160  */
161  DivideDstCompositeOp,
162  DistortCompositeOp,
163  BlurCompositeOp,
164  PegtopLightCompositeOp,
165  VividLightCompositeOp,
166  PinLightCompositeOp,
167  LinearDodgeCompositeOp,
168  LinearBurnCompositeOp,
169  MathematicsCompositeOp,
170  DivideSrcCompositeOp,
171  MinusSrcCompositeOp,
172  DarkenIntensityCompositeOp,
173  LightenIntensityCompositeOp
174  } CompositeOperator;
175  )
176 
177  STRINGIFY(
178  typedef enum
179  {
180  UndefinedFunction,
181  PolynomialFunction,
182  SinusoidFunction,
183  ArcsinFunction,
184  ArctanFunction
185  } MagickFunction;
186  )
187 
188  STRINGIFY(
189  typedef enum
190  {
191  UndefinedNoise,
192  UniformNoise,
193  GaussianNoise,
194  MultiplicativeGaussianNoise,
195  ImpulseNoise,
196  LaplacianNoise,
197  PoissonNoise,
198  RandomNoise
199  } NoiseType;
200  )
201 
202  STRINGIFY(
203  typedef enum
204  {
205  UndefinedPixelIntensityMethod = 0,
206  AveragePixelIntensityMethod,
207  BrightnessPixelIntensityMethod,
208  LightnessPixelIntensityMethod,
209  Rec601LumaPixelIntensityMethod,
210  Rec601LuminancePixelIntensityMethod,
211  Rec709LumaPixelIntensityMethod,
212  Rec709LuminancePixelIntensityMethod,
213  RMSPixelIntensityMethod,
214  MSPixelIntensityMethod
215  } PixelIntensityMethod;
216  )
217 
218  STRINGIFY(
219  typedef enum {
220  BoxWeightingFunction = 0,
221  TriangleWeightingFunction,
222  CubicBCWeightingFunction,
223  HanningWeightingFunction,
224  HammingWeightingFunction,
225  BlackmanWeightingFunction,
226  GaussianWeightingFunction,
227  QuadraticWeightingFunction,
228  JincWeightingFunction,
229  SincWeightingFunction,
230  SincFastWeightingFunction,
231  KaiserWeightingFunction,
232  WelshWeightingFunction,
233  BohmanWeightingFunction,
234  LagrangeWeightingFunction,
235  CosineWeightingFunction,
236  } ResizeWeightingFunctionType;
237  )
238 
239  STRINGIFY(
240  typedef enum
241  {
242  UndefinedChannel,
243  RedChannel = 0x0001,
244  GrayChannel = 0x0001,
245  CyanChannel = 0x0001,
246  GreenChannel = 0x0002,
247  MagentaChannel = 0x0002,
248  BlueChannel = 0x0004,
249  YellowChannel = 0x0004,
250  AlphaChannel = 0x0008,
251  OpacityChannel = 0x0008,
252  MatteChannel = 0x0008, /* deprecated */
253  BlackChannel = 0x0020,
254  IndexChannel = 0x0020,
255  CompositeChannels = 0x002F,
256  AllChannels = 0x7ffffff,
257  /*
258  Special purpose channel types.
259  */
260  TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
261  RGBChannels = 0x0080, /* set alpha from grayscale mask in RGB */
262  GrayChannels = 0x0080,
263  SyncChannels = 0x0100, /* channels should be modified equally */
264  DefaultChannels = ((AllChannels | SyncChannels) &~ OpacityChannel)
265  } ChannelType;
266  )
267 
268 /*
269  Helper functions.
270 */
271 
272 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
273 
274  STRINGIFY(
275  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
276  {
277  return((CLQuantum) value);
278  }
279  )
280 
281 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
282 
283  STRINGIFY(
284  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
285  {
286  return((CLQuantum) (257.0f*value));
287  }
288  )
289 
290 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
291 
292  STRINGIFY(
293  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
294  {
295  return((CLQuantum) (16843009.0*value));
296  }
297  )
298 
299 OPENCL_ENDIF()
300 
301 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
302 
303  STRINGIFY(
304  static inline CLQuantum ClampToQuantum(const float value)
305  {
306  return (CLQuantum) value;
307  }
308  )
309 
310 OPENCL_ELSE()
311 
312  STRINGIFY(
313  static inline CLQuantum ClampToQuantum(const float value)
314  {
315  return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
316  }
317  )
318 
319 OPENCL_ENDIF()
320 
321  STRINGIFY(
322  static inline int ClampToCanvas(const int offset, const int range)
323  {
324  return clamp(offset, (int)0, range - 1);
325  }
326  )
327 
328  STRINGIFY(
329  static inline int ClampToCanvasWithHalo(const int offset, const int range, const int edge, const int section)
330  {
331  return clamp(offset, section ? (int)(0 - edge) : (int)0, section ? (range - 1) : (range - 1 + edge));
332  }
333  )
334 
335  STRINGIFY(
336  static inline uint ScaleQuantumToMap(CLQuantum value)
337  {
338  if (value >= (CLQuantum)MaxMap)
339  return ((uint)MaxMap);
340  else
341  return ((uint)value);
342  }
343  )
344 
345  STRINGIFY(
346  static inline float PerceptibleReciprocal(const float x)
347  {
348  float sign = x < (float) 0.0 ? (float)-1.0 : (float) 1.0;
349  return((sign*x) >= MagickEpsilon ? (float) 1.0 / x : sign*((float) 1.0 / MagickEpsilon));
350  }
351  )
352 
353  STRINGIFY(
354  static inline float RoundToUnity(const float value)
355  {
356  return clamp(value, 0.0f, 1.0f);
357  }
358  )
359 
360  STRINGIFY(
361 
362  static inline CLQuantum getBlue(CLPixelType p) { return p.x; }
363  static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
364  static inline float getBlueF4(float4 p) { return p.x; }
365  static inline void setBlueF4(float4* p, float value) { (*p).x = value; }
366 
367  static inline CLQuantum getGreen(CLPixelType p) { return p.y; }
368  static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
369  static inline float getGreenF4(float4 p) { return p.y; }
370  static inline void setGreenF4(float4* p, float value) { (*p).y = value; }
371 
372  static inline CLQuantum getRed(CLPixelType p) { return p.z; }
373  static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
374  static inline float getRedF4(float4 p) { return p.z; }
375  static inline void setRedF4(float4* p, float value) { (*p).z = value; }
376 
377  static inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
378  static inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
379  static inline float getOpacityF4(float4 p) { return p.w; }
380  static inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
381 
382  static inline void setGray(CLPixelType* p, CLQuantum value) { (*p).z = value; (*p).y = value; (*p).x = value; }
383 
384  static inline float GetPixelIntensity(const int method, const int colorspace, CLPixelType p)
385  {
386  float red = getRed(p);
387  float green = getGreen(p);
388  float blue = getBlue(p);
389 
390  float intensity;
391 
392  if (colorspace == GRAYColorspace)
393  return red;
394 
395  switch (method)
396  {
397  case AveragePixelIntensityMethod:
398  {
399  intensity = (red + green + blue) / 3.0;
400  break;
401  }
402  case BrightnessPixelIntensityMethod:
403  {
404  intensity = MagickMax(MagickMax(red, green), blue);
405  break;
406  }
407  case LightnessPixelIntensityMethod:
408  {
409  intensity = (MagickMin(MagickMin(red, green), blue) +
410  MagickMax(MagickMax(red, green), blue)) / 2.0;
411  break;
412  }
413  case MSPixelIntensityMethod:
414  {
415  intensity = (float)(((float)red*red + green*green + blue*blue) /
416  (3.0*QuantumRange));
417  break;
418  }
419  case Rec601LumaPixelIntensityMethod:
420  {
421  /*
422  if (image->colorspace == RGBColorspace)
423  {
424  red=EncodePixelGamma(red);
425  green=EncodePixelGamma(green);
426  blue=EncodePixelGamma(blue);
427  }
428  */
429  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
430  break;
431  }
432  case Rec601LuminancePixelIntensityMethod:
433  {
434  /*
435  if (image->colorspace == sRGBColorspace)
436  {
437  red=DecodePixelGamma(red);
438  green=DecodePixelGamma(green);
439  blue=DecodePixelGamma(blue);
440  }
441  */
442  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
443  break;
444  }
445  case Rec709LumaPixelIntensityMethod:
446  default:
447  {
448  /*
449  if (image->colorspace == RGBColorspace)
450  {
451  red=EncodePixelGamma(red);
452  green=EncodePixelGamma(green);
453  blue=EncodePixelGamma(blue);
454  }
455  */
456  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
457  break;
458  }
459  case Rec709LuminancePixelIntensityMethod:
460  {
461  /*
462  if (image->colorspace == sRGBColorspace)
463  {
464  red=DecodePixelGamma(red);
465  green=DecodePixelGamma(green);
466  blue=DecodePixelGamma(blue);
467  }
468  */
469  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
470  break;
471  }
472  case RMSPixelIntensityMethod:
473  {
474  intensity = (float)(sqrt((float)red*red + green*green + blue*blue) /
475  sqrt(3.0));
476  break;
477  }
478  }
479 
480  return intensity;
481 
482  }
483  )
484 
485 /*
486 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
487 % %
488 % %
489 % %
490 % A d d N o i s e %
491 % %
492 % %
493 % %
494 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
495 */
496 
497 STRINGIFY(
498 
499 /*
500 Part of MWC64X by David Thomas, dt10@imperial.ac.uk
501 This is provided under BSD, full license is with the main package.
502 See http://www.doc.ic.ac.uk/~dt10/research
503 */
504 
505 // Pre: a<M, b<M
506 // Post: r=(a+b) mod M
507 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
508 {
509  ulong v=a+b;
510  //if( (v>=M) || (v<a) )
511  if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
512  v=v-M;
513  return v;
514 }
515 
516 // Pre: a<M,b<M
517 // Post: r=(a*b) mod M
518 // This could be done more efficently, but it is portable, and should
519 // be easy to understand. It can be replaced with any of the better
520 // modular multiplication algorithms (for example if you know you have
521 // double precision available or something).
522 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
523 {
524  ulong r=0;
525  while(a!=0){
526  if(a&1)
527  r=MWC_AddMod64(r,b,M);
528  b=MWC_AddMod64(b,b,M);
529  a=a>>1;
530  }
531  return r;
532 }
533 
534 
535 // Pre: a<M, e>=0
536 // Post: r=(a^b) mod M
537 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
538 // most architectures
539 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
540 {
541  ulong sqr=a, acc=1;
542  while(e!=0){
543  if(e&1)
544  acc=MWC_MulMod64(acc,sqr,M);
545  sqr=MWC_MulMod64(sqr,sqr,M);
546  e=e>>1;
547  }
548  return acc;
549 }
550 
551 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
552 {
553  ulong m=MWC_PowMod64(A, distance, M);
554  ulong x=curr.x*(ulong)A+curr.y;
555  x=MWC_MulMod64(x, m, M);
556  return (uint2)((uint)(x/A), (uint)(x%A));
557 }
558 
559 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
560 {
561  // This is an arbitrary constant for starting LCG jumping from. I didn't
562  // want to start from 1, as then you end up with the two or three first values
563  // being a bit poor in ones - once you've decided that, one constant is as
564  // good as any another. There is no deep mathematical reason for it, I just
565  // generated a random number.
566  enum{ MWC_BASEID = 4077358422479273989UL };
567 
568  ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
569  ulong m=MWC_PowMod64(A, dist, M);
570 
571  ulong x=MWC_MulMod64(MWC_BASEID, m, M);
572  return (uint2)((uint)(x/A), (uint)(x%A));
573 }
574 
576 typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
577 
578 void MWC64X_Step(mwc64x_state_t *s)
579 {
580  uint X=s->x, C=s->c;
581 
582  uint Xn=s->seed0*X+C;
583  uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
584  uint Cn=mad_hi(s->seed0,X,carry);
585 
586  s->x=Xn;
587  s->c=Cn;
588 }
589 
590 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
591 {
592  uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
593  s->x=tmp.x;
594  s->c=tmp.y;
595 }
596 
597 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
598 {
599  uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
600  s->x=tmp.x;
601  s->c=tmp.y;
602 }
603 
605 uint MWC64X_NextUint(mwc64x_state_t *s)
606 {
607  uint res=s->x ^ s->c;
608  MWC64X_Step(s);
609  return res;
610 }
611 
612 //
613 // End of MWC64X excerpt
614 //
615 
616  float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
617  return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
618  }
619 
620 
621  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
622 
623  float
624  alpha,
625  beta,
626  noise,
627  sigma;
628 
629  noise = 0.0f;
630  alpha=mwcReadPseudoRandomValue(r);
631  switch(noise_type) {
632  case UniformNoise:
633  default:
634  {
635  noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
636  break;
637  }
638  case GaussianNoise:
639  {
640  float
641  gamma,
642  tau;
643 
644  if (alpha == 0.0f)
645  alpha=1.0f;
646  beta=mwcReadPseudoRandomValue(r);
647  gamma=sqrt(-2.0f*log(alpha));
648  sigma=gamma*cospi((2.0f*beta));
649  tau=gamma*sinpi((2.0f*beta));
650  noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
651  QuantumRange*TauGaussian*tau);
652  break;
653  }
654 
655 
656  case ImpulseNoise:
657  {
658  if (alpha < (SigmaImpulse/2.0f))
659  noise=0.0f;
660  else
661  if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
662  noise=(float)QuantumRange;
663  else
664  noise=(float)pixel;
665  break;
666  }
667  case LaplacianNoise:
668  {
669  if (alpha <= 0.5f)
670  {
671  if (alpha <= MagickEpsilon)
672  noise=(float) (pixel-QuantumRange);
673  else
674  noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
675  0.5f);
676  break;
677  }
678  beta=1.0f-alpha;
679  if (beta <= (0.5f*MagickEpsilon))
680  noise=(float) (pixel+QuantumRange);
681  else
682  noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
683  break;
684  }
685  case MultiplicativeGaussianNoise:
686  {
687  sigma=1.0f;
688  if (alpha > MagickEpsilon)
689  sigma=sqrt(-2.0f*log(alpha));
690  beta=mwcReadPseudoRandomValue(r);
691  noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
692  cospi((float) (2.0f*beta))/2.0f);
693  break;
694  }
695  case PoissonNoise:
696  {
697  float
698  poisson;
699  unsigned int i;
700  poisson=exp(-SigmaPoisson*QuantumScale*pixel);
701  for (i=0; alpha > poisson; i++)
702  {
703  beta=mwcReadPseudoRandomValue(r);
704  alpha*=beta;
705  }
706  noise=(float) (QuantumRange*i*PerceptibleReciprocal(SigmaPoisson));
707  break;
708  }
709  case RandomNoise:
710  {
711  noise=(float) (QuantumRange*SigmaRandom*alpha);
712  break;
713  }
714 
715  };
716  return noise;
717  }
718 
719  __kernel
720  void AddNoise(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
721  ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
722  ,const ChannelType channel
723  ,const NoiseType noise_type, const float attenuate
724  ,const unsigned int seed0, const unsigned int seed1
725  ,const unsigned int numRandomNumbersPerPixel) {
726 
727  mwc64x_state_t rng;
728  rng.seed0 = seed0;
729  rng.seed1 = seed1;
730 
731  uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
732  uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
733 
734  MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
735 
736  uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0); // pixel to process
737 
738  uint count = pixelsPerWorkItem;
739 
740  while (count > 0) {
741  if (pos < inputPixelCount) {
742  CLPixelType p = inputImage[pos];
743 
744  if ((channel&RedChannel)!=0) {
745  setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
746  }
747 
748  if ((channel&GreenChannel)!=0) {
749  setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
750  }
751 
752  if ((channel&BlueChannel)!=0) {
753  setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
754  }
755 
756  if ((channel & OpacityChannel) != 0) {
757  setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate)));
758  }
759 
760  filteredImage[pos] = p;
761  //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
762  }
763  pos += get_local_size(0);
764  --count;
765  }
766  }
767  )
768 
769 /*
770 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
771 % %
772 % %
773 % %
774 % B l u r %
775 % %
776 % %
777 % %
778 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
779 */
780 
781  STRINGIFY(
782  /*
783  Reduce image noise and reduce detail levels by row
784  im: input pixels filtered_in filtered_im: output pixels
785  filter : convolve kernel width: convolve kernel size
786  channel : define which channel is blured
787  is_RGBA_BGRA : define the input is RGBA or BGRA
788  */
789  __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
790  const ChannelType channel, __constant float *filter,
791  const unsigned int width,
792  const unsigned int imageColumns, const unsigned int imageRows,
793  __local CLPixelType *temp)
794  {
795  const int x = get_global_id(0);
796  const int y = get_global_id(1);
797 
798  const int columns = imageColumns;
799 
800  const unsigned int radius = (width-1)/2;
801  const int wsize = get_local_size(0);
802  const unsigned int loadSize = wsize+width;
803 
804  //load chunk only for now
805  //event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
806  //wait_group_events(1,&e);
807 
808  //parallel load and clamp
809  /*
810  int count = 0;
811  for (int i=0; i < loadSize; i=i+wsize)
812  {
813  int currentX = x + wsize*(count++);
814 
815  int localId = get_local_id(0);
816 
817  if ((localId+i) > loadSize)
818  break;
819 
820  temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
821 
822  if (y==0 && get_group_id(0) == 0)
823  {
824  printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
825  }
826  }
827  */
828 
829  //group coordinate
830  const int groupX=get_local_size(0)*get_group_id(0);
831  const int groupY=get_local_size(1)*get_group_id(1);
832 
833  //parallel load and clamp
834  for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
835  {
836  //int cx = ClampToCanvas(groupX+i, columns);
837  temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
838 
839  /*if (0 && y==0 && get_group_id(1) == 0)
840  {
841  printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
842  }*/
843  }
844 
845  // barrier
846  barrier(CLK_LOCAL_MEM_FENCE);
847 
848  // only do the work if this is not a patched item
849  if (get_global_id(0) < columns)
850  {
851  // compute
852  float4 result = (float4) 0;
853 
854  int i = 0;
855 
856  \n #ifndef UFACTOR \n
857  \n #define UFACTOR 8 \n
858  \n #endif \n
859 
860  for ( ; i+UFACTOR < width; )
861  {
862  \n #pragma unroll UFACTOR\n
863  for (int j=0; j < UFACTOR; j++, i++)
864  {
865  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
866  }
867  }
868 
869  for ( ; i < width; i++)
870  {
871  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
872  }
873 
874  result.x = ClampToQuantum(result.x);
875  result.y = ClampToQuantum(result.y);
876  result.z = ClampToQuantum(result.z);
877  result.w = ClampToQuantum(result.w);
878 
879  // write back to global
880  filtered_im[y*columns+x] = result;
881  }
882  }
883  )
884 
885  STRINGIFY(
886  /*
887  Reduce image noise and reduce detail levels by line
888  im: input pixels filtered_in filtered_im: output pixels
889  filter : convolve kernel width: convolve kernel size
890  channel : define which channel is blured\
891  is_RGBA_BGRA : define the input is RGBA or BGRA
892  */
893  __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
894  const ChannelType channel, __constant float *filter,
895  const unsigned int width,
896  const unsigned int imageColumns, const unsigned int imageRows,
897  __local float4 *temp)
898  {
899  const int x = get_global_id(0);
900  const int y = get_global_id(1);
901 
902  //const int columns = get_global_size(0);
903  //const int rows = get_global_size(1);
904  const int columns = imageColumns;
905  const int rows = imageRows;
906 
907  unsigned int radius = (width-1)/2;
908  const int wsize = get_local_size(1);
909  const unsigned int loadSize = wsize+width;
910 
911  //group coordinate
912  const int groupX=get_local_size(0)*get_group_id(0);
913  const int groupY=get_local_size(1)*get_group_id(1);
914  //notice that get_local_size(0) is 1, so
915  //groupX=get_group_id(0);
916 
917  //parallel load and clamp
918  for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
919  {
920  temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
921  }
922 
923  // barrier
924  barrier(CLK_LOCAL_MEM_FENCE);
925 
926  // only do the work if this is not a patched item
927  if (get_global_id(1) < rows)
928  {
929  // compute
930  float4 result = (float4) 0;
931 
932  int i = 0;
933 
934  \n #ifndef UFACTOR \n
935  \n #define UFACTOR 8 \n
936  \n #endif \n
937 
938  for ( ; i+UFACTOR < width; )
939  {
940  \n #pragma unroll UFACTOR \n
941  for (int j=0; j < UFACTOR; j++, i++)
942  {
943  result+=filter[i]*temp[i+get_local_id(1)];
944  }
945  }
946 
947  for ( ; i < width; i++)
948  {
949  result+=filter[i]*temp[i+get_local_id(1)];
950  }
951 
952  result.x = ClampToQuantum(result.x);
953  result.y = ClampToQuantum(result.y);
954  result.z = ClampToQuantum(result.z);
955  result.w = ClampToQuantum(result.w);
956 
957  // write back to global
958  filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
959  }
960 
961  }
962  )
963 
964 /*
965 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
966 % %
967 % %
968 % %
969 % C o m p o s i t e %
970 % %
971 % %
972 % %
973 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
974 */
975 
976  STRINGIFY(
977  static inline float ColorDodge(const float Sca,
978  const float Sa,const float Dca,const float Da)
979  {
980  /*
981  Oct 2004 SVG specification.
982  */
983  if ((Sca*Da+Dca*Sa) >= Sa*Da)
984  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
985  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
986 
987 
988  /*
989  New specification, March 2009 SVG specification. This specification was
990  also wrong of non-overlap cases.
991  */
992  /*
993  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
994  return(Sca*(1.0-Da));
995  if (fabs(Sca-Sa) < MagickEpsilon)
996  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
997  return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
998  */
999 
1000  /*
1001  Working from first principles using the original formula:
1002 
1003  f(Sc,Dc) = Dc/(1-Sc)
1004 
1005  This works correctly! Looks like the 2004 model was right but just
1006  required a extra condition for correct handling.
1007  */
1008 
1009  /*
1010  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1011  return(Sca*(1.0-Da)+Dca*(1.0-Sa));
1012  if (fabs(Sca-Sa) < MagickEpsilon)
1013  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1014  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
1015  */
1016  }
1017 
1018  static inline void CompositeColorDodge(const float4 *p,
1019  const float4 *q,float4 *composite) {
1020 
1021  float
1022  Da,
1023  gamma,
1024  Sa;
1025 
1026  Sa=1.0f-QuantumScale*getOpacityF4(*p); /* simplify and speed up equations */
1027  Da=1.0f-QuantumScale*getOpacityF4(*q);
1028  gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
1029  setOpacityF4(composite, QuantumRange*(1.0-gamma));
1030  gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
1031  setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
1032  getRedF4(*q)*Da,Da));
1033  setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
1034  getGreenF4(*q)*Da,Da));
1035  setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
1036  getBlueF4(*q)*Da,Da));
1037  }
1038  )
1039 
1040  STRINGIFY(
1041  static inline void MagickPixelCompositePlus(const float4 *p,
1042  const float alpha,const float4 *q,
1043  const float beta,float4 *composite)
1044  {
1045  float
1046  gamma;
1047 
1048  float
1049  Da,
1050  Sa;
1051  /*
1052  Add two pixels with the given opacities.
1053  */
1054  Sa=1.0-QuantumScale*alpha;
1055  Da=1.0-QuantumScale*beta;
1056  gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */
1057  setOpacityF4(composite,(float) QuantumRange*(1.0-gamma));
1058  gamma=PerceptibleReciprocal(gamma);
1059  setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
1060  setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
1061  setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
1062  }
1063  )
1064 
1065  STRINGIFY(
1066  static inline void MagickPixelCompositeBlend(const float4 *p,
1067  const float alpha,const float4 *q,
1068  const float beta,float4 *composite)
1069  {
1070  MagickPixelCompositePlus(p,(float) (QuantumRange-alpha*
1071  (QuantumRange-getOpacityF4(*p))),q,(float) (QuantumRange-beta*
1072  (QuantumRange-getOpacityF4(*q))),composite);
1073  }
1074  )
1075 
1076  STRINGIFY(
1077  __kernel
1078  void Composite(__global CLPixelType *image,
1079  const unsigned int imageWidth,
1080  const unsigned int imageHeight,
1081  const unsigned int imageMatte,
1082  const __global CLPixelType *compositeImage,
1083  const unsigned int compositeWidth,
1084  const unsigned int compositeHeight,
1085  const unsigned int compositeMatte,
1086  const unsigned int compose,
1087  const ChannelType channel,
1088  const float destination_dissolve,
1089  const float source_dissolve) {
1090 
1091  uint2 index;
1092  index.x = get_global_id(0);
1093  index.y = get_global_id(1);
1094 
1095 
1096  if (index.x >= imageWidth
1097  || index.y >= imageHeight) {
1098  return;
1099  }
1100  const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
1101  float4 destination;
1102  setRedF4(&destination,getRed(inputPixel));
1103  setGreenF4(&destination,getGreen(inputPixel));
1104  setBlueF4(&destination,getBlue(inputPixel));
1105 
1106 
1107  const CLPixelType compositePixel
1108  = compositeImage[index.y*imageWidth+index.x];
1109  float4 source;
1110  setRedF4(&source,getRed(compositePixel));
1111  setGreenF4(&source,getGreen(compositePixel));
1112  setBlueF4(&source,getBlue(compositePixel));
1113 
1114  if (imageMatte != 0) {
1115  setOpacityF4(&destination,getOpacity(inputPixel));
1116  }
1117  else {
1118  setOpacityF4(&destination,0.0f);
1119  }
1120 
1121  if (compositeMatte != 0) {
1122  setOpacityF4(&source,getOpacity(compositePixel));
1123  }
1124  else {
1125  setOpacityF4(&source,0.0f);
1126  }
1127 
1128  float4 composite=destination;
1129 
1130  CompositeOperator op = (CompositeOperator)compose;
1131  switch (op) {
1132  case ColorDodgeCompositeOp:
1133  CompositeColorDodge(&source,&destination,&composite);
1134  break;
1135  case BlendCompositeOp:
1136  MagickPixelCompositeBlend(&source,source_dissolve,&destination,
1137  destination_dissolve,&composite);
1138  break;
1139  default:
1140  // unsupported operators
1141  break;
1142  };
1143 
1144  CLPixelType outputPixel;
1145  setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
1146  setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
1147  setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
1148  setOpacity(&outputPixel, ClampToQuantum(getOpacityF4(composite)));
1149  image[index.y*imageWidth+index.x] = outputPixel;
1150  }
1151  )
1152 
1153 /*
1154 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1155 % %
1156 % %
1157 % %
1158 % C o n t r a s t %
1159 % %
1160 % %
1161 % %
1162 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1163 */
1164 
1165  STRINGIFY(
1166 
1167  static inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1168  float3 HueSaturationBrightness;
1169  HueSaturationBrightness.x = 0.0f; // Hue
1170  HueSaturationBrightness.y = 0.0f; // Saturation
1171  HueSaturationBrightness.z = 0.0f; // Brightness
1172 
1173  float r=(float) getRed(pixel);
1174  float g=(float) getGreen(pixel);
1175  float b=(float) getBlue(pixel);
1176 
1177  float tmin=MagickMin(MagickMin(r,g),b);
1178  float tmax= MagickMax(MagickMax(r,g),b);
1179 
1180  if (tmax!=0.0f) {
1181  float delta=tmax-tmin;
1182  HueSaturationBrightness.y=delta/tmax;
1183  HueSaturationBrightness.z=QuantumScale*tmax;
1184 
1185  if (delta != 0.0f) {
1186  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1187  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1188  HueSaturationBrightness.x/=6.0f;
1189  HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1190  }
1191  }
1192  return HueSaturationBrightness;
1193  }
1194 
1195  static inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1196 
1197  float hue = HueSaturationBrightness.x;
1198  float brightness = HueSaturationBrightness.z;
1199  float saturation = HueSaturationBrightness.y;
1200 
1201  CLPixelType rgb;
1202 
1203  if (saturation == 0.0f) {
1204  setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1205  setGreen(&rgb,getRed(rgb));
1206  setBlue(&rgb,getRed(rgb));
1207  }
1208  else {
1209 
1210  float h=6.0f*(hue-floor(hue));
1211  float f=h-floor(h);
1212  float p=brightness*(1.0f-saturation);
1213  float q=brightness*(1.0f-saturation*f);
1214  float t=brightness*(1.0f-(saturation*(1.0f-f)));
1215 
1216  float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1217  float clamped_t = ClampToQuantum(QuantumRange*t);
1218  float clamped_p = ClampToQuantum(QuantumRange*p);
1219  float clamped_q = ClampToQuantum(QuantumRange*q);
1220  int ih = (int)h;
1221  setRed(&rgb, (ih == 1)?clamped_q:
1222  (ih == 2 || ih == 3)?clamped_p:
1223  (ih == 4)?clamped_t:
1224  clampedBrightness);
1225 
1226  setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1227  (ih == 3)?clamped_q:
1228  (ih == 4 || ih == 5)?clamped_p:
1229  clamped_t);
1230 
1231  setBlue(&rgb, (ih == 2)?clamped_t:
1232  (ih == 3 || ih == 4)?clampedBrightness:
1233  (ih == 5)?clamped_q:
1234  clamped_p);
1235  }
1236  return rgb;
1237  }
1238 
1239  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1240  {
1241 
1242  const int sign = sharpen!=0?1:-1;
1243  const int x = get_global_id(0);
1244  const int y = get_global_id(1);
1245  const int columns = get_global_size(0);
1246  const int c = x + y * columns;
1247 
1248  CLPixelType pixel = im[c];
1249  float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1250  float brightness = HueSaturationBrightness.z;
1251  brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1252  brightness = clamp(brightness,0.0f,1.0f);
1253  HueSaturationBrightness.z = brightness;
1254 
1255  CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1256  filteredPixel.w = pixel.w;
1257  im[c] = filteredPixel;
1258  }
1259  )
1260 
1261 /*
1262 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1263 % %
1264 % %
1265 % %
1266 % C o n t r a s t S t r e t c h %
1267 % %
1268 % %
1269 % %
1270 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1271 */
1272 
1273  STRINGIFY(
1274  /*
1275  */
1276  __kernel void Histogram(__global CLPixelType * restrict im,
1277  const ChannelType channel,
1278  const int method,
1279  const int colorspace,
1280  __global uint4 * restrict histogram)
1281  {
1282  const int x = get_global_id(0);
1283  const int y = get_global_id(1);
1284  const int columns = get_global_size(0);
1285  const int c = x + y * columns;
1286  if ((channel & SyncChannels) != 0)
1287  {
1288  float intensity = GetPixelIntensity(method, colorspace,im[c]);
1289  uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1290  atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1291  }
1292  else
1293  {
1294  // for equalizing, we always need all channels?
1295  // otherwise something more
1296  }
1297  }
1298  )
1299 
1300  STRINGIFY(
1301  /*
1302  */
1303  __kernel void ContrastStretch(__global CLPixelType * restrict im,
1304  const ChannelType channel,
1305  __global CLPixelType * restrict stretch_map,
1306  const float4 white, const float4 black)
1307  {
1308  const int x = get_global_id(0);
1309  const int y = get_global_id(1);
1310  const int columns = get_global_size(0);
1311  const int c = x + y * columns;
1312 
1313  uint ePos;
1314  CLPixelType oValue, eValue;
1315  CLQuantum red, green, blue, opacity;
1316 
1317  //read from global
1318  oValue=im[c];
1319 
1320  if ((channel & RedChannel) != 0)
1321  {
1322  if (getRedF4(white) != getRedF4(black))
1323  {
1324  ePos = ScaleQuantumToMap(getRed(oValue));
1325  eValue = stretch_map[ePos];
1326  red = getRed(eValue);
1327  }
1328  }
1329 
1330  if ((channel & GreenChannel) != 0)
1331  {
1332  if (getGreenF4(white) != getGreenF4(black))
1333  {
1334  ePos = ScaleQuantumToMap(getGreen(oValue));
1335  eValue = stretch_map[ePos];
1336  green = getGreen(eValue);
1337  }
1338  }
1339 
1340  if ((channel & BlueChannel) != 0)
1341  {
1342  if (getBlueF4(white) != getBlueF4(black))
1343  {
1344  ePos = ScaleQuantumToMap(getBlue(oValue));
1345  eValue = stretch_map[ePos];
1346  blue = getBlue(eValue);
1347  }
1348  }
1349 
1350  if ((channel & OpacityChannel) != 0)
1351  {
1352  if (getOpacityF4(white) != getOpacityF4(black))
1353  {
1354  ePos = ScaleQuantumToMap(getOpacity(oValue));
1355  eValue = stretch_map[ePos];
1356  opacity = getOpacity(eValue);
1357  }
1358  }
1359 
1360  //write back
1361  im[c]=(CLPixelType)(blue, green, red, opacity);
1362 
1363  }
1364  )
1365 
1366 /*
1367 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1368 % %
1369 % %
1370 % %
1371 % C o n v o l v e %
1372 % %
1373 % %
1374 % %
1375 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1376 */
1377 
1378  STRINGIFY(
1379  __kernel
1380  void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1381  const unsigned int imageWidth, const unsigned int imageHeight,
1382  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1383  const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1384 
1385  int2 blockID;
1386  blockID.x = get_group_id(0);
1387  blockID.y = get_group_id(1);
1388 
1389  // image area processed by this workgroup
1390  int2 imageAreaOrg;
1391  imageAreaOrg.x = blockID.x * get_local_size(0);
1392  imageAreaOrg.y = blockID.y * get_local_size(1);
1393 
1394  int2 midFilterDimen;
1395  midFilterDimen.x = (filterWidth-1)/2;
1396  midFilterDimen.y = (filterHeight-1)/2;
1397 
1398  int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1399 
1400  // dimension of the local cache
1401  int2 cachedAreaDimen;
1402  cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1403  cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1404 
1405  // cache the pixels accessed by this workgroup in local memory
1406  int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1407  int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1408  int groupSize = get_local_size(0) * get_local_size(1);
1409  for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1410 
1411  int2 cachedAreaIndex;
1412  cachedAreaIndex.x = i % cachedAreaDimen.x;
1413  cachedAreaIndex.y = i / cachedAreaDimen.x;
1414 
1415  int2 imagePixelIndex;
1416  imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1417 
1418  // only support EdgeVirtualPixelMethod through ClampToCanvas
1419  // TODO: implement other virtual pixel method
1420  imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1421  imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1422 
1423  pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1424  }
1425 
1426  // cache the filter
1427  for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1428  filterCache[i] = filter[i];
1429  }
1430  barrier(CLK_LOCAL_MEM_FENCE);
1431 
1432 
1433  int2 imageIndex;
1434  imageIndex.x = imageAreaOrg.x + get_local_id(0);
1435  imageIndex.y = imageAreaOrg.y + get_local_id(1);
1436 
1437  // if out-of-range, stops here and quit
1438  if (imageIndex.x >= imageWidth
1439  || imageIndex.y >= imageHeight) {
1440  return;
1441  }
1442 
1443  int filterIndex = 0;
1444  float4 sum = (float4)0.0f;
1445  float gamma = 0.0f;
1446  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1447  int cacheIndexY = get_local_id(1);
1448  for (int j = 0; j < filterHeight; j++) {
1449  int cacheIndexX = get_local_id(0);
1450  for (int i = 0; i < filterWidth; i++) {
1451  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1452  float f = filterCache[filterIndex];
1453 
1454  sum.x += f * p.x;
1455  sum.y += f * p.y;
1456  sum.z += f * p.z;
1457  sum.w += f * p.w;
1458 
1459  gamma += f;
1460  filterIndex++;
1461  cacheIndexX++;
1462  }
1463  cacheIndexY++;
1464  }
1465  }
1466  else {
1467  int cacheIndexY = get_local_id(1);
1468  for (int j = 0; j < filterHeight; j++) {
1469  int cacheIndexX = get_local_id(0);
1470  for (int i = 0; i < filterWidth; i++) {
1471 
1472  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1473  float alpha = QuantumScale*(QuantumRange-p.w);
1474  float f = filterCache[filterIndex];
1475  float g = alpha * f;
1476 
1477  sum.x += g*p.x;
1478  sum.y += g*p.y;
1479  sum.z += g*p.z;
1480  sum.w += f*p.w;
1481 
1482  gamma += g;
1483  filterIndex++;
1484  cacheIndexX++;
1485  }
1486  cacheIndexY++;
1487  }
1488  gamma = PerceptibleReciprocal(gamma);
1489  sum.xyz = gamma*sum.xyz;
1490  }
1491  CLPixelType outputPixel;
1492  outputPixel.x = ClampToQuantum(sum.x);
1493  outputPixel.y = ClampToQuantum(sum.y);
1494  outputPixel.z = ClampToQuantum(sum.z);
1495  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1496 
1497  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1498  }
1499  )
1500 
1501  STRINGIFY(
1502  __kernel
1503  void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1504  const uint imageWidth, const uint imageHeight,
1505  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1506  const uint matte, const ChannelType channel) {
1507 
1508  int2 imageIndex;
1509  imageIndex.x = get_global_id(0);
1510  imageIndex.y = get_global_id(1);
1511 
1512  /*
1513  unsigned int imageWidth = get_global_size(0);
1514  unsigned int imageHeight = get_global_size(1);
1515  */
1516  if (imageIndex.x >= imageWidth
1517  || imageIndex.y >= imageHeight)
1518  return;
1519 
1520  int2 midFilterDimen;
1521  midFilterDimen.x = (filterWidth-1)/2;
1522  midFilterDimen.y = (filterHeight-1)/2;
1523 
1524  int filterIndex = 0;
1525  float4 sum = (float4)0.0f;
1526  float gamma = 0.0f;
1527  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1528  for (int j = 0; j < filterHeight; j++) {
1529  int2 inputPixelIndex;
1530  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1531  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1532  for (int i = 0; i < filterWidth; i++) {
1533  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1534  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1535 
1536  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1537  float f = filter[filterIndex];
1538 
1539  sum.x += f * p.x;
1540  sum.y += f * p.y;
1541  sum.z += f * p.z;
1542  sum.w += f * p.w;
1543 
1544  gamma += f;
1545 
1546  filterIndex++;
1547  }
1548  }
1549  }
1550  else {
1551 
1552  for (int j = 0; j < filterHeight; j++) {
1553  int2 inputPixelIndex;
1554  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1555  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1556  for (int i = 0; i < filterWidth; i++) {
1557  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1558  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1559 
1560  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1561  float alpha = QuantumScale*(QuantumRange-p.w);
1562  float f = filter[filterIndex];
1563  float g = alpha * f;
1564 
1565  sum.x += g*p.x;
1566  sum.y += g*p.y;
1567  sum.z += g*p.z;
1568  sum.w += f*p.w;
1569 
1570  gamma += g;
1571 
1572 
1573  filterIndex++;
1574  }
1575  }
1576  gamma = PerceptibleReciprocal(gamma);
1577  sum.xyz = gamma*sum.xyz;
1578  }
1579 
1580  CLPixelType outputPixel;
1581  outputPixel.x = ClampToQuantum(sum.x);
1582  outputPixel.y = ClampToQuantum(sum.y);
1583  outputPixel.z = ClampToQuantum(sum.z);
1584  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1585 
1586  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1587  }
1588  )
1589 
1590 /*
1591 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1592 % %
1593 % %
1594 % %
1595 % D e s p e c k l e %
1596 % %
1597 % %
1598 % %
1599 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1600 */
1601 
1602  STRINGIFY(
1603 
1604  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1605  , const unsigned int imageWidth, const unsigned int imageHeight
1606  , const int2 offset, const int polarity, const int matte) {
1607 
1608  int x = get_global_id(0);
1609  int y = get_global_id(1);
1610 
1611  CLPixelType v = inputImage[y*imageWidth+x];
1612 
1613  int2 neighbor;
1614  neighbor.y = y + offset.y;
1615  neighbor.x = x + offset.x;
1616 
1617  int2 clampedNeighbor;
1618  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1619  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1620 
1621  CLPixelType r = (clampedNeighbor.x == neighbor.x
1622  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1623  :(CLPixelType)0;
1624 
1625  int sv[4];
1626  sv[0] = (int)v.x;
1627  sv[1] = (int)v.y;
1628  sv[2] = (int)v.z;
1629  sv[3] = (int)v.w;
1630 
1631  int sr[4];
1632  sr[0] = (int)r.x;
1633  sr[1] = (int)r.y;
1634  sr[2] = (int)r.z;
1635  sr[3] = (int)r.w;
1636 
1637  if (polarity > 0) {
1638  \n #pragma unroll 4\n
1639  for (unsigned int i = 0; i < 4; i++) {
1640  sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1641  }
1642  }
1643  else {
1644  \n #pragma unroll 4\n
1645  for (unsigned int i = 0; i < 4; i++) {
1646  sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1647  }
1648 
1649  }
1650 
1651  v.x = (CLQuantum)sv[0];
1652  v.y = (CLQuantum)sv[1];
1653  v.z = (CLQuantum)sv[2];
1654 
1655  if (matte!=0)
1656  v.w = (CLQuantum)sv[3];
1657 
1658  outputImage[y*imageWidth+x] = v;
1659 
1660  }
1661 
1662 
1663  )
1664 
1665 
1666 
1667  STRINGIFY(
1668 
1669  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1670  , const unsigned int imageWidth, const unsigned int imageHeight
1671  , const int2 offset, const int polarity, const int matte) {
1672 
1673  int x = get_global_id(0);
1674  int y = get_global_id(1);
1675 
1676  CLPixelType v = inputImage[y*imageWidth+x];
1677 
1678  int2 neighbor, clampedNeighbor;
1679 
1680  neighbor.y = y + offset.y;
1681  neighbor.x = x + offset.x;
1682  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1683  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1684 
1685  CLPixelType r = (clampedNeighbor.x == neighbor.x
1686  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1687  :(CLPixelType)0;
1688 
1689 
1690  neighbor.y = y - offset.y;
1691  neighbor.x = x - offset.x;
1692  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1693  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1694 
1695  CLPixelType s = (clampedNeighbor.x == neighbor.x
1696  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1697  :(CLPixelType)0;
1698 
1699 
1700  int sv[4];
1701  sv[0] = (int)v.x;
1702  sv[1] = (int)v.y;
1703  sv[2] = (int)v.z;
1704  sv[3] = (int)v.w;
1705 
1706  int sr[4];
1707  sr[0] = (int)r.x;
1708  sr[1] = (int)r.y;
1709  sr[2] = (int)r.z;
1710  sr[3] = (int)r.w;
1711 
1712  int ss[4];
1713  ss[0] = (int)s.x;
1714  ss[1] = (int)s.y;
1715  ss[2] = (int)s.z;
1716  ss[3] = (int)s.w;
1717 
1718  if (polarity > 0) {
1719  \n #pragma unroll 4\n
1720  for (unsigned int i = 0; i < 4; i++) {
1721  //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1722  //
1723  //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1724  //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1725  sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1726  }
1727  }
1728  else {
1729  \n #pragma unroll 4\n
1730  for (unsigned int i = 0; i < 4; i++) {
1731  //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1732  //
1733  //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1734  sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1735  }
1736  }
1737 
1738  v.x = (CLQuantum)sv[0];
1739  v.y = (CLQuantum)sv[1];
1740  v.z = (CLQuantum)sv[2];
1741 
1742  if (matte!=0)
1743  v.w = (CLQuantum)sv[3];
1744 
1745  outputImage[y*imageWidth+x] = v;
1746 
1747  }
1748 
1749 
1750  )
1751 
1752 /*
1753 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1754 % %
1755 % %
1756 % %
1757 % E q u a l i z e %
1758 % %
1759 % %
1760 % %
1761 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1762 */
1763 
1764  STRINGIFY(
1765  /*
1766  */
1767  __kernel void Equalize(__global CLPixelType * restrict im,
1768  const ChannelType channel,
1769  __global CLPixelType * restrict equalize_map,
1770  const float4 white, const float4 black)
1771  {
1772  const int x = get_global_id(0);
1773  const int y = get_global_id(1);
1774  const int columns = get_global_size(0);
1775  const int c = x + y * columns;
1776 
1777  uint ePos;
1778  CLPixelType oValue, eValue;
1779  CLQuantum red, green, blue, opacity;
1780 
1781  //read from global
1782  oValue=im[c];
1783 
1784  if ((channel & SyncChannels) != 0)
1785  {
1786  if (getRedF4(white) != getRedF4(black))
1787  {
1788  ePos = ScaleQuantumToMap(getRed(oValue));
1789  eValue = equalize_map[ePos];
1790  red = getRed(eValue);
1791  ePos = ScaleQuantumToMap(getGreen(oValue));
1792  eValue = equalize_map[ePos];
1793  green = getRed(eValue);
1794  ePos = ScaleQuantumToMap(getBlue(oValue));
1795  eValue = equalize_map[ePos];
1796  blue = getRed(eValue);
1797  ePos = ScaleQuantumToMap(getOpacity(oValue));
1798  eValue = equalize_map[ePos];
1799  opacity = getRed(eValue);
1800 
1801  //write back
1802  im[c]=(CLPixelType)(blue, green, red, opacity);
1803  }
1804 
1805  }
1806 
1807  // for equalizing, we always need all channels?
1808  // otherwise something more
1809 
1810  }
1811  )
1812 
1813 /*
1814 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1815 % %
1816 % %
1817 % %
1818 % F u n c t i o n %
1819 % %
1820 % %
1821 % %
1822 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1823 */
1824 
1825  STRINGIFY(
1826 
1827  /*
1828  apply FunctionImageChannel(braightness-contrast)
1829  */
1830  CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
1831  const unsigned int number_parameters,
1832  __constant float *parameters)
1833  {
1834  float4 result = (float4) 0.0f;
1835  switch (function)
1836  {
1837  case PolynomialFunction:
1838  {
1839  for (unsigned int i=0; i < number_parameters; i++)
1840  result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
1841  result *= (float4)QuantumRange;
1842  break;
1843  }
1844  case SinusoidFunction:
1845  {
1846  float freq,phase,ampl,bias;
1847  freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1848  phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1849  ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1850  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1851  result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
1852  (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
1853  result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
1854  (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
1855  result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
1856  (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
1857  result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
1858  (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
1859  break;
1860  }
1861  case ArcsinFunction:
1862  {
1863  float width,range,center,bias;
1864  width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1865  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1866  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1867  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1868 
1869  result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
1870  result.x = range/MagickPI*asin(result.x)+bias;
1871  result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
1872  result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
1873 
1874  result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
1875  result.y = range/MagickPI*asin(result.y)+bias;
1876  result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
1877  result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
1878 
1879  result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
1880  result.z = range/MagickPI*asin(result.z)+bias;
1881  result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
1882  result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
1883 
1884 
1885  result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
1886  result.w = range/MagickPI*asin(result.w)+bias;
1887  result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
1888  result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
1889 
1890  result *= (float4)QuantumRange;
1891  break;
1892  }
1893  case ArctanFunction:
1894  {
1895  float slope,range,center,bias;
1896  slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1897  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1898  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1899  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1900  result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
1901  result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
1902  break;
1903  }
1904  case UndefinedFunction:
1905  break;
1906  }
1907  return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
1908  ClampToQuantum(result.z), ClampToQuantum(result.w));
1909  }
1910  )
1911 
1912  STRINGIFY(
1913  /*
1914  Improve brightness / contrast of the image
1915  channel : define which channel is improved
1916  function : the function called to enchance the brightness contrast
1917  number_parameters : numbers of parameters
1918  parameters : the parameter
1919  */
1920  __kernel void ComputeFunction(__global CLPixelType *im,
1921  const ChannelType channel, const MagickFunction function,
1922  const unsigned int number_parameters, __constant float *parameters)
1923  {
1924  const int x = get_global_id(0);
1925  const int y = get_global_id(1);
1926  const int columns = get_global_size(0);
1927  const int c = x + y * columns;
1928  im[c] = ApplyFunction(im[c], function, number_parameters, parameters);
1929  }
1930  )
1931 
1932 /*
1933 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1934 % %
1935 % %
1936 % %
1937 % G r a y s c a l e %
1938 % %
1939 % %
1940 % %
1941 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1942 */
1943 
1944  STRINGIFY(
1945  __kernel void Grayscale(__global CLPixelType *im,
1946  const int method, const int colorspace)
1947  {
1948 
1949  const int x = get_global_id(0);
1950  const int y = get_global_id(1);
1951  const int columns = get_global_size(0);
1952  const int c = x + y * columns;
1953 
1954  CLPixelType pixel = im[c];
1955 
1956  float
1957  blue,
1958  green,
1959  intensity,
1960  red;
1961 
1962  red=(float)getRed(pixel);
1963  green=(float)getGreen(pixel);
1964  blue=(float)getBlue(pixel);
1965 
1966  intensity=0.0;
1967 
1968  CLPixelType filteredPixel;
1969 
1970  switch (method)
1971  {
1972  case AveragePixelIntensityMethod:
1973  {
1974  intensity=(red+green+blue)/3.0;
1975  break;
1976  }
1977  case BrightnessPixelIntensityMethod:
1978  {
1979  intensity=MagickMax(MagickMax(red,green),blue);
1980  break;
1981  }
1982  case LightnessPixelIntensityMethod:
1983  {
1984  intensity=(MagickMin(MagickMin(red,green),blue)+
1985  MagickMax(MagickMax(red,green),blue))/2.0;
1986  break;
1987  }
1988  case MSPixelIntensityMethod:
1989  {
1990  intensity=(float) (((float) red*red+green*green+
1991  blue*blue)/(3.0*QuantumRange));
1992  break;
1993  }
1994  case Rec601LumaPixelIntensityMethod:
1995  {
1996  /*
1997  if (colorspace == RGBColorspace)
1998  {
1999  red=EncodePixelGamma(red);
2000  green=EncodePixelGamma(green);
2001  blue=EncodePixelGamma(blue);
2002  }
2003  */
2004  intensity=0.298839*red+0.586811*green+0.114350*blue;
2005  break;
2006  }
2007  case Rec601LuminancePixelIntensityMethod:
2008  {
2009  /*
2010  if (image->colorspace == sRGBColorspace)
2011  {
2012  red=DecodePixelGamma(red);
2013  green=DecodePixelGamma(green);
2014  blue=DecodePixelGamma(blue);
2015  }
2016  */
2017  intensity=0.298839*red+0.586811*green+0.114350*blue;
2018  break;
2019  }
2020  case Rec709LumaPixelIntensityMethod:
2021  default:
2022  {
2023  /*
2024  if (image->colorspace == RGBColorspace)
2025  {
2026  red=EncodePixelGamma(red);
2027  green=EncodePixelGamma(green);
2028  blue=EncodePixelGamma(blue);
2029  }
2030  */
2031  intensity=0.212656*red+0.715158*green+0.072186*blue;
2032  break;
2033  }
2034  case Rec709LuminancePixelIntensityMethod:
2035  {
2036  /*
2037  if (image->colorspace == sRGBColorspace)
2038  {
2039  red=DecodePixelGamma(red);
2040  green=DecodePixelGamma(green);
2041  blue=DecodePixelGamma(blue);
2042  }
2043  */
2044  intensity=0.212656*red+0.715158*green+0.072186*blue;
2045  break;
2046  }
2047  case RMSPixelIntensityMethod:
2048  {
2049  intensity=(float) (sqrt((float) red*red+green*green+
2050  blue*blue)/sqrt(3.0));
2051  break;
2052  }
2053 
2054  }
2055 
2056  setGray(&filteredPixel, ClampToQuantum(intensity));
2057 
2058  filteredPixel.w = pixel.w;
2059 
2060  im[c] = filteredPixel;
2061  }
2062  )
2063 
2064 /*
2065 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2066 % %
2067 % %
2068 % %
2069 % L o c a l C o n t r a s t %
2070 % %
2071 % %
2072 % %
2073 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2074 */
2075 
2076  STRINGIFY(
2077  static inline int mirrorBottom(int value)
2078  {
2079  return (value < 0) ? - (value) : value;
2080  }
2081  static inline int mirrorTop(int value, int width)
2082  {
2083  return (value >= width) ? (2 * width - value - 1) : value;
2084  }
2085 
2086  __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
2087  const int radius,
2088  const int imageWidth,
2089  const int imageHeight)
2090  {
2091  const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
2092 
2093  int x = get_local_id(0);
2094  int y = get_global_id(1);
2095 
2096  if ((x >= imageWidth) || (y >= imageHeight))
2097  return;
2098 
2099  global CLPixelType *src = srcImage + y * imageWidth;
2100 
2101  for (int i = x; i < imageWidth; i += get_local_size(0)) {
2102  float sum = 0.0f;
2103  float weight = 1.0f;
2104 
2105  int j = i - radius;
2106  while ((j + 7) < i) {
2107  for (int k = 0; k < 8; ++k) // Unroll 8x
2108  sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
2109  weight += 8.0f;
2110  j+=8;
2111  }
2112  while (j < i) {
2113  sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
2114  weight += 1.0f;
2115  ++j;
2116  }
2117 
2118  while ((j + 7) < radius + i) {
2119  for (int k = 0; k < 8; ++k) // Unroll 8x
2120  sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
2121  weight -= 8.0f;
2122  j+=8;
2123  }
2124  while (j < radius + i) {
2125  sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
2126  weight -= 1.0f;
2127  ++j;
2128  }
2129 
2130  tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
2131  }
2132  }
2133  )
2134 
2135  STRINGIFY(
2136  __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
2137  const int radius,
2138  const float strength,
2139  const int imageWidth,
2140  const int imageHeight)
2141  {
2142  const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
2143 
2144  int x = get_global_id(0);
2145  int y = get_global_id(1);
2146 
2147  if ((x >= imageWidth) || (y >= imageHeight))
2148  return;
2149 
2150  global float *src = blurImage + x;
2151 
2152  float sum = 0.0f;
2153  float weight = 1.0f;
2154 
2155  int j = y - radius;
2156  while ((j + 7) < y) {
2157  for (int k = 0; k < 8; ++k) // Unroll 8x
2158  sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
2159  weight += 8.0f;
2160  j+=8;
2161  }
2162  while (j < y) {
2163  sum += weight * src[mirrorBottom(j) * imageWidth];
2164  weight += 1.0f;
2165  ++j;
2166  }
2167 
2168  while ((j + 7) < radius + y) {
2169  for (int k = 0; k < 8; ++k) // Unroll 8x
2170  sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
2171  weight -= 8.0f;
2172  j+=8;
2173  }
2174  while (j < radius + y) {
2175  sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
2176  weight -= 1.0f;
2177  ++j;
2178  }
2179 
2180  CLPixelType pixel = srcImage[x + y * imageWidth];
2181  float srcVal = dot(RGB, convert_float4(pixel));
2182  float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
2183  mult = (srcVal + mult) / srcVal;
2184 
2185  pixel.x = ClampToQuantum(pixel.x * mult);
2186  pixel.y = ClampToQuantum(pixel.y * mult);
2187  pixel.z = ClampToQuantum(pixel.z * mult);
2188 
2189  dstImage[x + y * imageWidth] = pixel;
2190  }
2191  )
2192 
2193 /*
2194 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2195 % %
2196 % %
2197 % %
2198 % M o d u l a t e %
2199 % %
2200 % %
2201 % %
2202 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2203 */
2204 
2205  STRINGIFY(
2206 
2207  static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
2208  float *hue, float *saturation, float *lightness)
2209  {
2210  float
2211  c,
2212  tmax,
2213  tmin;
2214 
2215  /*
2216  Convert RGB to HSL colorspace.
2217  */
2218  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
2219  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
2220 
2221  c=tmax-tmin;
2222 
2223  *lightness=(tmax+tmin)/2.0;
2224  if (c <= 0.0)
2225  {
2226  *hue=0.0;
2227  *saturation=0.0;
2228  return;
2229  }
2230 
2231  if (tmax == (QuantumScale*red))
2232  {
2233  *hue=(QuantumScale*green-QuantumScale*blue)/c;
2234  if ((QuantumScale*green) < (QuantumScale*blue))
2235  *hue+=6.0;
2236  }
2237  else
2238  if (tmax == (QuantumScale*green))
2239  *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2240  else
2241  *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2242 
2243  *hue*=60.0/360.0;
2244  if (*lightness <= 0.5)
2245  *saturation=c/(2.0*(*lightness));
2246  else
2247  *saturation=c/(2.0-2.0*(*lightness));
2248  }
2249 
2250  static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2251  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2252  {
2253  float
2254  b,
2255  c,
2256  g,
2257  h,
2258  tmin,
2259  r,
2260  x;
2261 
2262  /*
2263  Convert HSL to RGB colorspace.
2264  */
2265  h=hue*360.0;
2266  if (lightness <= 0.5)
2267  c=2.0*lightness*saturation;
2268  else
2269  c=(2.0-2.0*lightness)*saturation;
2270  tmin=lightness-0.5*c;
2271  h-=360.0*floor(h/360.0);
2272  h/=60.0;
2273  x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2274  switch ((int) floor(h) % 6)
2275  {
2276  case 0:
2277  default:
2278  {
2279  r=tmin+c;
2280  g=tmin+x;
2281  b=tmin;
2282  break;
2283  }
2284  case 1:
2285  {
2286  r=tmin+x;
2287  g=tmin+c;
2288  b=tmin;
2289  break;
2290  }
2291  case 2:
2292  {
2293  r=tmin;
2294  g=tmin+c;
2295  b=tmin+x;
2296  break;
2297  }
2298  case 3:
2299  {
2300  r=tmin;
2301  g=tmin+x;
2302  b=tmin+c;
2303  break;
2304  }
2305  case 4:
2306  {
2307  r=tmin+x;
2308  g=tmin;
2309  b=tmin+c;
2310  break;
2311  }
2312  case 5:
2313  {
2314  r=tmin+c;
2315  g=tmin;
2316  b=tmin+x;
2317  break;
2318  }
2319  }
2320  *red=ClampToQuantum(QuantumRange*r);
2321  *green=ClampToQuantum(QuantumRange*g);
2322  *blue=ClampToQuantum(QuantumRange*b);
2323  }
2324 
2325  static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2326  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2327  {
2328  float
2329  hue,
2330  lightness,
2331  saturation;
2332 
2333  /*
2334  Increase or decrease color lightness, saturation, or hue.
2335  */
2336  ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2337  hue+=0.5*(0.01*percent_hue-1.0);
2338  while (hue < 0.0)
2339  hue+=1.0;
2340  while (hue >= 1.0)
2341  hue-=1.0;
2342  saturation*=0.01*percent_saturation;
2343  lightness*=0.01*percent_lightness;
2344  ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2345  }
2346 
2347  __kernel void Modulate(__global CLPixelType *im,
2348  const float percent_brightness,
2349  const float percent_hue,
2350  const float percent_saturation,
2351  const int colorspace)
2352  {
2353 
2354  const int x = get_global_id(0);
2355  const int y = get_global_id(1);
2356  const int columns = get_global_size(0);
2357  const int c = x + y * columns;
2358 
2359  CLPixelType pixel = im[c];
2360 
2361  CLQuantum
2362  blue,
2363  green,
2364  red;
2365 
2366  red=getRed(pixel);
2367  green=getGreen(pixel);
2368  blue=getBlue(pixel);
2369 
2370  switch (colorspace)
2371  {
2372  case HSLColorspace:
2373  default:
2374  {
2375  ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2376  &red, &green, &blue);
2377  }
2378 
2379  }
2380 
2381  CLPixelType filteredPixel;
2382 
2383  setRed(&filteredPixel, red);
2384  setGreen(&filteredPixel, green);
2385  setBlue(&filteredPixel, blue);
2386  filteredPixel.w = pixel.w;
2387 
2388  im[c] = filteredPixel;
2389  }
2390  )
2391 
2392 /*
2393 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2394 % %
2395 % %
2396 % %
2397 % M o t i o n B l u r %
2398 % %
2399 % %
2400 % %
2401 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2402 */
2403 
2404  STRINGIFY(
2405  __kernel
2406  void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2407  const unsigned int imageWidth, const unsigned int imageHeight,
2408  const __global float *filter, const unsigned int width, const __global int2* offset,
2409  const float4 bias,
2410  const ChannelType channel, const unsigned int matte) {
2411 
2412  int2 currentPixel;
2413  currentPixel.x = get_global_id(0);
2414  currentPixel.y = get_global_id(1);
2415 
2416  if (currentPixel.x >= imageWidth
2417  || currentPixel.y >= imageHeight)
2418  return;
2419 
2420  float4 pixel;
2421  pixel.x = (float)bias.x;
2422  pixel.y = (float)bias.y;
2423  pixel.z = (float)bias.z;
2424  pixel.w = (float)bias.w;
2425 
2426  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2427 
2428  for (int i = 0; i < width; i++) {
2429  // only support EdgeVirtualPixelMethod through ClampToCanvas
2430  // TODO: implement other virtual pixel method
2431  int2 samplePixel = currentPixel + offset[i];
2432  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2433  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2434  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2435 
2436  pixel.x += (filter[i] * (float)samplePixelValue.x);
2437  pixel.y += (filter[i] * (float)samplePixelValue.y);
2438  pixel.z += (filter[i] * (float)samplePixelValue.z);
2439  pixel.w += (filter[i] * (float)samplePixelValue.w);
2440  }
2441 
2442  CLPixelType outputPixel;
2443  outputPixel.x = ClampToQuantum(pixel.x);
2444  outputPixel.y = ClampToQuantum(pixel.y);
2445  outputPixel.z = ClampToQuantum(pixel.z);
2446  outputPixel.w = ClampToQuantum(pixel.w);
2447  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2448  }
2449  else {
2450 
2451  float gamma = 0.0f;
2452  for (int i = 0; i < width; i++) {
2453  // only support EdgeVirtualPixelMethod through ClampToCanvas
2454  // TODO: implement other virtual pixel method
2455  int2 samplePixel = currentPixel + offset[i];
2456  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2457  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2458 
2459  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2460 
2461  float alpha = QuantumScale*(QuantumRange-samplePixelValue.w);
2462  float k = filter[i];
2463  pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2464  pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2465  pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2466 
2467  pixel.w += k * alpha * samplePixelValue.w;
2468 
2469  gamma+=k*alpha;
2470  }
2471  gamma = PerceptibleReciprocal(gamma);
2472  pixel.xyz = gamma*pixel.xyz;
2473 
2474  CLPixelType outputPixel;
2475  outputPixel.x = ClampToQuantum(pixel.x);
2476  outputPixel.y = ClampToQuantum(pixel.y);
2477  outputPixel.z = ClampToQuantum(pixel.z);
2478  outputPixel.w = ClampToQuantum(pixel.w);
2479  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2480  }
2481  }
2482  )
2483 
2484 /*
2485 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2486 % %
2487 % %
2488 % %
2489 % R a d i a l B l u r %
2490 % %
2491 % %
2492 % %
2493 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2494 */
2495 
2496  STRINGIFY(
2497  __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
2498  const float4 bias,
2499  const unsigned int channel, const unsigned int matte,
2500  const float2 blurCenter,
2501  __constant float *cos_theta, __constant float *sin_theta,
2502  const unsigned int cossin_theta_size)
2503  {
2504  const int x = get_global_id(0);
2505  const int y = get_global_id(1);
2506  const int columns = get_global_size(0);
2507  const int rows = get_global_size(1);
2508  unsigned int step = 1;
2509  float center_x = (float) x - blurCenter.x;
2510  float center_y = (float) y - blurCenter.y;
2511  float radius = hypot(center_x, center_y);
2512 
2513  //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
2514  float blur_radius = hypot(blurCenter.x, blurCenter.y);
2515 
2516  if (radius > MagickEpsilon)
2517  {
2518  step = (unsigned int) (blur_radius / radius);
2519  if (step == 0)
2520  step = 1;
2521  if (step >= cossin_theta_size)
2522  step = cossin_theta_size-1;
2523  }
2524 
2525  float4 result;
2526  result.x = (float)bias.x;
2527  result.y = (float)bias.y;
2528  result.z = (float)bias.z;
2529  result.w = (float)bias.w;
2530  float normalize = 0.0f;
2531 
2532  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2533  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2534  {
2535  result += convert_float4(im[
2536  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2537  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2538  normalize += 1.0f;
2539  }
2540  normalize = PerceptibleReciprocal(normalize);
2541  result = result * normalize;
2542  }
2543  else {
2544  float gamma = 0.0f;
2545  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2546  {
2547  float4 p = convert_float4(im[
2548  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2549  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2550 
2551  float alpha = (float)(QuantumScale*(QuantumRange-p.w));
2552  result.x += alpha * p.x;
2553  result.y += alpha * p.y;
2554  result.z += alpha * p.z;
2555  result.w += p.w;
2556  gamma+=alpha;
2557  normalize += 1.0f;
2558  }
2559  gamma = PerceptibleReciprocal(gamma);
2560  normalize = PerceptibleReciprocal(normalize);
2561  result.x = gamma*result.x;
2562  result.y = gamma*result.y;
2563  result.z = gamma*result.z;
2564  result.w = normalize*result.w;
2565  }
2566  filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
2567  ClampToQuantum(result.z), ClampToQuantum(result.w));
2568  }
2569  )
2570 
2571 /*
2572 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2573 % %
2574 % %
2575 % %
2576 % R e s i z e %
2577 % %
2578 % %
2579 % %
2580 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2581 */
2582 
2583  STRINGIFY(
2584  // Based on Box from resize.c
2585  float BoxResizeFilter(const float x)
2586  {
2587  return 1.0f;
2588  }
2589  )
2590 
2591  STRINGIFY(
2592  // Based on CubicBC from resize.c
2593  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2594  {
2595  /*
2596  Cubic Filters using B,C determined values:
2597  Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
2598  Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
2599  Spline B = 1 C = 0 B-Spline Gaussian approximation
2600  Hermite B = 0 C = 0 B-Spline interpolator
2601 
2602  See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2603  Graphics Computer Graphics, Volume 22, Number 4, August 1988
2604  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2605  Mitchell.pdf.
2606 
2607  Coefficients are determined from B,C values:
2608  P0 = ( 6 - 2*B )/6 = coeff[0]
2609  P1 = 0
2610  P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2611  P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2612  Q0 = ( 8*B +24*C )/6 = coeff[3]
2613  Q1 = ( -12*B -48*C )/6 = coeff[4]
2614  Q2 = ( 6*B +30*C )/6 = coeff[5]
2615  Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2616 
2617  which are used to define the filter:
2618 
2619  P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2620  Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2621 
2622  which ensures function is continuous in value and derivative (slope).
2623  */
2624  if (x < 1.0)
2625  return(resizeFilterCoefficients[0]+x*(x*
2626  (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2627  if (x < 2.0)
2628  return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2629  (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2630  return(0.0);
2631  }
2632  )
2633 
2634  STRINGIFY(
2635  float Sinc(const float x)
2636  {
2637  if (x != 0.0f)
2638  {
2639  const float alpha=(float) (MagickPI*x);
2640  return sinpi(x)/alpha;
2641  }
2642  return(1.0f);
2643  }
2644  )
2645 
2646  STRINGIFY(
2647  float Triangle(const float x)
2648  {
2649  /*
2650  1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2651  a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2652  for Sinc().
2653  */
2654  return ((x<1.0f)?(1.0f-x):0.0f);
2655  }
2656  )
2657 
2658 
2659  STRINGIFY(
2660  float Hanning(const float x)
2661  {
2662  /*
2663  Cosine window function:
2664  0.5+0.5*cos(pi*x).
2665  */
2666  const float cosine=cos((MagickPI*x));
2667  return(0.5f+0.5f*cosine);
2668  }
2669  )
2670 
2671  STRINGIFY(
2672  float Hamming(const float x)
2673  {
2674  /*
2675  Offset cosine window function:
2676  .54 + .46 cos(pi x).
2677  */
2678  const float cosine=cos((MagickPI*x));
2679  return(0.54f+0.46f*cosine);
2680  }
2681  )
2682 
2683  STRINGIFY(
2684  float Blackman(const float x)
2685  {
2686  /*
2687  Blackman: 2nd order cosine windowing function:
2688  0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2689 
2690  Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2691  five flops.
2692  */
2693  const float cosine=cos((MagickPI*x));
2694  return(0.34f+cosine*(0.5f+cosine*0.16f));
2695  }
2696  )
2697 
2698 
2699 
2700 
2701  STRINGIFY(
2702  static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2703  {
2704  switch (filterType)
2705  {
2706  /* Call Sinc even for SincFast to get better precision on GPU
2707  and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2708  case SincWeightingFunction:
2709  case SincFastWeightingFunction:
2710  return Sinc(x);
2711  case CubicBCWeightingFunction:
2712  return CubicBC(x,filterCoefficients);
2713  case BoxWeightingFunction:
2714  return BoxResizeFilter(x);
2715  case TriangleWeightingFunction:
2716  return Triangle(x);
2717  case HanningWeightingFunction:
2718  return Hanning(x);
2719  case HammingWeightingFunction:
2720  return Hamming(x);
2721  case BlackmanWeightingFunction:
2722  return Blackman(x);
2723 
2724  default:
2725  return 0.0f;
2726  }
2727  }
2728  )
2729 
2730 
2731  STRINGIFY(
2732  static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2733  , const ResizeWeightingFunctionType resizeWindowType
2734  , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2735  {
2736  float scale;
2737  float xBlur = fabs(x/resizeFilterBlur);
2738  if (resizeWindowSupport < MagickEpsilon
2739  || resizeWindowType == BoxWeightingFunction)
2740  {
2741  scale = 1.0f;
2742  }
2743  else
2744  {
2745  scale = resizeFilterScale;
2746  scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2747  }
2748  float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2749  return weight;
2750  }
2751 
2752  )
2753 
2754  ;
2755  const char* accelerateKernels2 =
2756 
2757  STRINGIFY(
2758 
2759  static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2760  return (numWorkItems/pixelPerWorkgroup);
2761  }
2762 
2763  // returns the index of the pixel for the current workitem to compute.
2764  // returns -1 if this workitem doesn't need to participate in any computation
2765  static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2766  const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2767  int pixelIndex = itemID/numWorkItemsPerPixel;
2768  pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2769  return pixelIndex;
2770  }
2771 
2772  )
2773 
2774  STRINGIFY(
2775  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2776  void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2777  , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2778  , const int resizeFilterType, const int resizeWindowType
2779  , const __global float* resizeFilterCubicCoefficients
2780  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2781  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2782  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2783 
2784 
2785  // calculate the range of resized image pixels computed by this workgroup
2786  const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2787  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2788  const unsigned int actualNumPixelToCompute = stopX - startX;
2789 
2790  // calculate the range of input image pixels to cache
2791  float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2792  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2793  scale = PerceptibleReciprocal(scale);
2794 
2795  const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2796  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2797 
2798  // cache the input pixels into local memory
2799  const unsigned int y = get_global_id(1);
2800  event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
2801  wait_group_events(1,&e);
2802 
2803  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2804  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2805  {
2806 
2807  const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2808  const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2809  const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2810 
2811  // determine which resized pixel computed by this workitem
2812  const unsigned int itemID = get_local_id(0);
2813  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2814 
2815  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2816 
2817  float4 filteredPixel = (float4)0.0f;
2818  float density = 0.0f;
2819  float gamma = 0.0f;
2820  // -1 means this workitem doesn't participate in the computation
2821  if (pixelIndex != -1) {
2822 
2823  // x coordinated of the resized pixel computed by this workitem
2824  const int x = chunkStartX + pixelIndex;
2825 
2826  // calculate how many steps required for this pixel
2827  const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2828  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2829  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2830  const unsigned int n = stop - start;
2831 
2832  // calculate how many steps this workitem will contribute
2833  unsigned int numStepsPerWorkItem = n / numItems;
2834  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2835 
2836  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2837  if (startStep < n) {
2838  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2839 
2840  unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2841  if (matte == 0) {
2842 
2843  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2844  float4 cp = convert_float4(inputImageCache[cacheIndex]);
2845 
2846  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2847  , (ResizeWeightingFunctionType)resizeWindowType
2848  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2849 
2850  filteredPixel += ((float4)weight)*cp;
2851  density+=weight;
2852  }
2853 
2854 
2855  }
2856  else {
2857  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2858  CLPixelType p = inputImageCache[cacheIndex];
2859 
2860  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2861  , (ResizeWeightingFunctionType)resizeWindowType
2862  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2863 
2864  float alpha = weight * QuantumScale * GetPixelAlpha(p);
2865  float4 cp = convert_float4(p);
2866 
2867  filteredPixel.x += alpha * cp.x;
2868  filteredPixel.y += alpha * cp.y;
2869  filteredPixel.z += alpha * cp.z;
2870  filteredPixel.w += weight * cp.w;
2871 
2872  density+=weight;
2873  gamma+=alpha;
2874  }
2875  }
2876  }
2877  }
2878 
2879  // initialize the accumulators to zero
2880  if (itemID < actualNumPixelInThisChunk) {
2881  outputPixelCache[itemID] = (float4)0.0f;
2882  densityCache[itemID] = 0.0f;
2883  if (matte != 0)
2884  gammaCache[itemID] = 0.0f;
2885  }
2886  barrier(CLK_LOCAL_MEM_FENCE);
2887 
2888  // accumulatte the filtered pixel value and the density
2889  for (unsigned int i = 0; i < numItems; i++) {
2890  if (pixelIndex != -1) {
2891  if (itemID%numItems == i) {
2892  outputPixelCache[pixelIndex]+=filteredPixel;
2893  densityCache[pixelIndex]+=density;
2894  if (matte!=0) {
2895  gammaCache[pixelIndex]+=gamma;
2896  }
2897  }
2898  }
2899  barrier(CLK_LOCAL_MEM_FENCE);
2900  }
2901 
2902  if (itemID < actualNumPixelInThisChunk) {
2903  if (matte==0) {
2904  float density = densityCache[itemID];
2905  float4 filteredPixel = outputPixelCache[itemID];
2906  if (density!= 0.0f && density != 1.0)
2907  {
2908  density = PerceptibleReciprocal(density);
2909  filteredPixel *= (float4)density;
2910  }
2911  filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
2912  , ClampToQuantum(filteredPixel.y)
2913  , ClampToQuantum(filteredPixel.z)
2914  , ClampToQuantum(filteredPixel.w));
2915  }
2916  else {
2917  float density = densityCache[itemID];
2918  float gamma = gammaCache[itemID];
2919  float4 filteredPixel = outputPixelCache[itemID];
2920 
2921  if (density!= 0.0f && density != 1.0) {
2922  density = PerceptibleReciprocal(density);
2923  filteredPixel *= (float4)density;
2924  gamma *= density;
2925  }
2926  gamma = PerceptibleReciprocal(gamma);
2927 
2928  CLPixelType fp;
2929  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
2930  , ClampToQuantum(gamma*filteredPixel.y)
2931  , ClampToQuantum(gamma*filteredPixel.z)
2932  , ClampToQuantum(filteredPixel.w));
2933 
2934  filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
2935 
2936  }
2937  }
2938 
2939  } // end of chunking loop
2940  }
2941  )
2942 
2943 
2944  STRINGIFY(
2945  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2946  void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2947  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2948  , const int resizeFilterType, const int resizeWindowType
2949  , const __global float* resizeFilterCubicCoefficients
2950  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2951  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2952  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2953 
2954 
2955  // calculate the range of resized image pixels computed by this workgroup
2956  const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2957  const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2958  const unsigned int actualNumPixelToCompute = stopY - startY;
2959 
2960  // calculate the range of input image pixels to cache
2961  float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2962  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2963  scale = PerceptibleReciprocal(scale);
2964 
2965  const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2966  const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2967 
2968  // cache the input pixels into local memory
2969  const unsigned int x = get_global_id(0);
2970  event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
2971  wait_group_events(1,&e);
2972 
2973  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2974  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2975  {
2976 
2977  const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2978  const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2979  const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2980 
2981  // determine which resized pixel computed by this workitem
2982  const unsigned int itemID = get_local_id(1);
2983  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2984 
2985  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2986 
2987  float4 filteredPixel = (float4)0.0f;
2988  float density = 0.0f;
2989  float gamma = 0.0f;
2990  // -1 means this workitem doesn't participate in the computation
2991  if (pixelIndex != -1) {
2992 
2993  // x coordinated of the resized pixel computed by this workitem
2994  const int y = chunkStartY + pixelIndex;
2995 
2996  // calculate how many steps required for this pixel
2997  const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2998  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2999  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
3000  const unsigned int n = stop - start;
3001 
3002  // calculate how many steps this workitem will contribute
3003  unsigned int numStepsPerWorkItem = n / numItems;
3004  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
3005 
3006  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
3007  if (startStep < n) {
3008  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
3009 
3010  unsigned int cacheIndex = start+startStep-cacheRangeStartY;
3011  if (matte == 0) {
3012 
3013  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3014  float4 cp = convert_float4(inputImageCache[cacheIndex]);
3015 
3016  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3017  , (ResizeWeightingFunctionType)resizeWindowType
3018  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3019 
3020  filteredPixel += ((float4)weight)*cp;
3021  density+=weight;
3022  }
3023 
3024 
3025  }
3026  else {
3027  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3028  CLPixelType p = inputImageCache[cacheIndex];
3029 
3030  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3031  , (ResizeWeightingFunctionType)resizeWindowType
3032  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3033 
3034  float alpha = weight * QuantumScale * GetPixelAlpha(p);
3035  float4 cp = convert_float4(p);
3036 
3037  filteredPixel.x += alpha * cp.x;
3038  filteredPixel.y += alpha * cp.y;
3039  filteredPixel.z += alpha * cp.z;
3040  filteredPixel.w += weight * cp.w;
3041 
3042  density+=weight;
3043  gamma+=alpha;
3044  }
3045  }
3046  }
3047  }
3048 
3049  // initialize the accumulators to zero
3050  if (itemID < actualNumPixelInThisChunk) {
3051  outputPixelCache[itemID] = (float4)0.0f;
3052  densityCache[itemID] = 0.0f;
3053  if (matte != 0)
3054  gammaCache[itemID] = 0.0f;
3055  }
3056  barrier(CLK_LOCAL_MEM_FENCE);
3057 
3058  // accumulatte the filtered pixel value and the density
3059  for (unsigned int i = 0; i < numItems; i++) {
3060  if (pixelIndex != -1) {
3061  if (itemID%numItems == i) {
3062  outputPixelCache[pixelIndex]+=filteredPixel;
3063  densityCache[pixelIndex]+=density;
3064  if (matte!=0) {
3065  gammaCache[pixelIndex]+=gamma;
3066  }
3067  }
3068  }
3069  barrier(CLK_LOCAL_MEM_FENCE);
3070  }
3071 
3072  if (itemID < actualNumPixelInThisChunk) {
3073  if (matte==0) {
3074  float density = densityCache[itemID];
3075  float4 filteredPixel = outputPixelCache[itemID];
3076  if (density!= 0.0f && density != 1.0)
3077  {
3078  density = PerceptibleReciprocal(density);
3079  filteredPixel *= (float4)density;
3080  }
3081  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
3082  , ClampToQuantum(filteredPixel.y)
3083  , ClampToQuantum(filteredPixel.z)
3084  , ClampToQuantum(filteredPixel.w));
3085  }
3086  else {
3087  float density = densityCache[itemID];
3088  float gamma = gammaCache[itemID];
3089  float4 filteredPixel = outputPixelCache[itemID];
3090 
3091  if (density!= 0.0f && density != 1.0) {
3092  density = PerceptibleReciprocal(density);
3093  filteredPixel *= (float4)density;
3094  gamma *= density;
3095  }
3096  gamma = PerceptibleReciprocal(gamma);
3097 
3098  CLPixelType fp;
3099  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
3100  , ClampToQuantum(gamma*filteredPixel.y)
3101  , ClampToQuantum(gamma*filteredPixel.z)
3102  , ClampToQuantum(filteredPixel.w));
3103 
3104  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
3105 
3106  }
3107  }
3108 
3109  } // end of chunking loop
3110  }
3111  )
3112 
3113 /*
3114 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3115 % %
3116 % %
3117 % %
3118 % U n s h a r p M a s k %
3119 % %
3120 % %
3121 % %
3122 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3123 */
3124 
3125  STRINGIFY(
3126  __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,
3127  const __global float4 *blurRowData, __global CLPixelType *filtered_im,
3128  const unsigned int imageColumns, const unsigned int imageRows,
3129  __local float4* cachedData, __local float* cachedFilter,
3130  const ChannelType channel, const __global float *filter, const unsigned int width,
3131  const float gain, const float threshold)
3132  {
3133  const unsigned int radius = (width-1)/2;
3134 
3135  // cache the pixel shared by the workgroup
3136  const int groupX = get_group_id(0);
3137  const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
3138  const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
3139 
3140  if (groupStartY >= 0
3141  && groupStopY < imageRows) {
3142  event_t e = async_work_group_strided_copy(cachedData
3143  ,blurRowData+groupStartY*imageColumns+groupX
3144  ,groupStopY-groupStartY,imageColumns,0);
3145  wait_group_events(1,&e);
3146  }
3147  else {
3148  for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
3149  cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
3150  }
3151  barrier(CLK_LOCAL_MEM_FENCE);
3152  }
3153  // cache the filter as well
3154  event_t e = async_work_group_copy(cachedFilter,filter,width,0);
3155  wait_group_events(1,&e);
3156 
3157  // only do the work if this is not a patched item
3158  //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
3159  const int cy = get_global_id(1);
3160 
3161  if (cy < imageRows) {
3162  float4 blurredPixel = (float4) 0.0f;
3163 
3164  int i = 0;
3165 
3166  \n #ifndef UFACTOR \n
3167  \n #define UFACTOR 8 \n
3168  \n #endif \n
3169 
3170  for ( ; i+UFACTOR < width; )
3171  {
3172  \n #pragma unroll UFACTOR \n
3173  for (int j=0; j < UFACTOR; j++, i++)
3174  {
3175  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3176  }
3177  }
3178 
3179  for ( ; i < width; i++)
3180  {
3181  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3182  }
3183 
3184  blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
3185  ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
3186 
3187  float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
3188  float4 outputPixel = inputImagePixel - blurredPixel;
3189 
3190  float quantumThreshold = QuantumRange*threshold;
3191 
3192  int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
3193  outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
3194 
3195  //write back
3196  filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
3197  ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
3198 
3199  }
3200  }
3201  )
3202 
3203 
3204 
3205  STRINGIFY(
3206  __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
3207  __constant float *filter,
3208  const unsigned int width,
3209  const unsigned int imageColumns, const unsigned int imageRows,
3210  __local float4 *pixels,
3211  const float gain, const float threshold, const unsigned int justBlur)
3212  {
3213  const int x = get_global_id(0);
3214  const int y = get_global_id(1);
3215 
3216  const unsigned int radius = (width - 1) / 2;
3217 
3218  int row = y - radius;
3219  int baseRow = get_group_id(1) * get_local_size(1) - radius;
3220  int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
3221 
3222  while (row < endRow) {
3223  int srcy = (row < 0) ? -row : row; // mirror pad
3224  srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
3225 
3226  float4 value = 0.0f;
3227 
3228  int ix = x - radius;
3229  int i = 0;
3230 
3231  while (i + 7 < width) {
3232  for (int j = 0; j < 8; ++j) { // unrolled
3233  int srcx = ix + j;
3234  srcx = (srcx < 0) ? -srcx : srcx;
3235  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3236  value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
3237  }
3238  ix += 8;
3239  i += 8;
3240  }
3241 
3242  while (i < width) {
3243  int srcx = (ix < 0) ? -ix : ix; // mirror pad
3244  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3245  value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
3246  ++i;
3247  ++ix;
3248  }
3249  pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
3250  row += get_local_size(1);
3251  }
3252 
3253 
3254  barrier(CLK_LOCAL_MEM_FENCE);
3255 
3256 
3257  const int px = get_local_id(0);
3258  const int py = get_local_id(1);
3259  const int prp = get_local_size(0);
3260  float4 value = (float4)(0.0f);
3261 
3262  int i = 0;
3263  while (i + 7 < width) { // unrolled
3264  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3265  value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];
3266  value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];
3267  value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];
3268  value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];
3269  value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];
3270  value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];
3271  value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];
3272  i += 8;
3273  }
3274  while (i < width) {
3275  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3276  ++i;
3277  }
3278  if ((x < imageColumns) && (y < imageRows)) {
3279  if (justBlur == 0) { // apply sharpening
3280  float4 srcPixel = convert_float4(im[x + y * imageColumns]);
3281  float4 diff = srcPixel - value;
3282 
3283  float quantumThreshold = QuantumRange*threshold;
3284 
3285  int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3286  value = select(srcPixel + diff * gain, srcPixel, mask);
3287  }
3288  filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
3289  }
3290  }
3291  )
3292 
3293  STRINGIFY(
3294  __kernel __attribute__((reqd_work_group_size(64, 4, 1))) void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage,
3295  const float threshold,
3296  const int passes,
3297  const int imageWidth,
3298  const int imageHeight)
3299  {
3300  const int pad = (1 << (passes - 1));
3301  const int tileSize = 64;
3302  const int tileRowPixels = 64;
3303  const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3304 
3305  CLPixelType stage[16];
3306 
3307  local float buffer[64 * 64];
3308 
3309  int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3310  int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3311 
3312  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3313  stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth];
3314  }
3315 
3316 
3317  for (int channel = 0; channel < 3; ++channel) {
3318  // Load LDS
3319  switch (channel) {
3320  case 0:
3321  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3322  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0);
3323  break;
3324  case 1:
3325  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3326  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1);
3327  break;
3328  case 2:
3329  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3330  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2);
3331  break;
3332  }
3333 
3334 
3335  // Process
3336 
3337  float tmp[16];
3338  float accum[16];
3339  float pixel;
3340 
3341  for (int pass = 0; pass < passes; ++pass) {
3342  const int radius = 1 << pass;
3343  const int x = get_local_id(0);
3344  const float thresh = threshold * noise[pass];
3345 
3346  if (pass == 0)
3347  accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f;
3348 
3349  // Snapshot input
3350 
3351  // Apply horizontal hat
3352  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3353  const int offset = i * tileRowPixels;
3354  if (pass == 0)
3355  tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3356  pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3357  barrier(CLK_LOCAL_MEM_FENCE);
3358  buffer[x + offset] = pixel;
3359  }
3360  barrier(CLK_LOCAL_MEM_FENCE);
3361  // Apply vertical hat
3362  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3363  pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3364  float delta = tmp[i / 4] - pixel;
3365  tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3366  if (delta < -thresh)
3367  delta += thresh;
3368  else if (delta > thresh)
3369  delta -= thresh;
3370  else
3371  delta = 0;
3372  accum[i / 4] += delta;
3373 
3374  }
3375  barrier(CLK_LOCAL_MEM_FENCE);
3376  if (pass < passes - 1)
3377  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3378  buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3379  else // last pass
3380  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3381  accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3382  barrier(CLK_LOCAL_MEM_FENCE);
3383  }
3384 
3385  switch (channel) {
3386  case 0:
3387  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3388  stage[i / 4].s0 = ClampToQuantum(accum[i / 4]);
3389  break;
3390  case 1:
3391  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3392  stage[i / 4].s1 = ClampToQuantum(accum[i / 4]);
3393  break;
3394  case 2:
3395  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3396  stage[i / 4].s2 = ClampToQuantum(accum[i / 4]);
3397  break;
3398  }
3399 
3400  barrier(CLK_LOCAL_MEM_FENCE);
3401  }
3402 
3403  // Write from stage to output
3404 
3405  if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3406  //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) {
3407  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3408  if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3409  dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4];
3410  }
3411  }
3412  }
3413  }
3414  )
3415  ;
3416 
3417 #endif // MAGICKCORE_OPENCL_SUPPORT
3418 
3419 #if defined(__cplusplus) || defined(c_plusplus)
3420 }
3421 #endif
3422 
3423 #endif // MAGICKCORE_ACCELERATE_PRIVATE_H