All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
oclkernels.h
Go to the documentation of this file.
1 
2 #ifndef _OCL_KERNEL_H_
3 #define _OCL_KERNEL_H_
4 #ifndef USE_EXTERNAL_KERNEL
5 #define KERNEL( ... )# __VA_ARGS__ "\n"
6 // Double precision is a default of spreadsheets
7 // cl_khr_fp64: Khronos extension
8 // cl_amd_fp64: AMD extension
9 // use build option outside to define fp_t
11 const char *kernel_src = KERNEL(
12 \n#ifdef KHR_DP_EXTENSION\n
13 \n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
14 \n#elif AMD_DP_EXTENSION\n
15 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
16 \n#else\n
17 \n#endif\n
18 __kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output)
19 {
20  int i = get_global_id(1);
21  int j = get_global_id(0);
22  int tiffword,rval,gval,bval;
23 
24  //Ignore the excess
25  if ((i >= h) || (j >= w))
26  return;
27 
28  tiffword = tiffdata[i * w + j];
29  rval = ((tiffword) & 0xff);
30  gval = (((tiffword) >> 8) & 0xff);
31  bval = (((tiffword) >> 16) & 0xff);
32  output[i*wpl+j] = (rval << (8 * (sizeof(uint) - 1 - 0))) | (gval << (8 * (sizeof(uint) - 1 - 1))) | (bval << (8 * (sizeof(uint) - 1 - 2)));
33 }
34 )
35 
36 KERNEL(
37 \n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
38  const int wpl, const int h)
39 {
40  const unsigned int row = get_global_id(1);
41  const unsigned int col = get_global_id(0);
42  const unsigned int pos = row * wpl + col;
43 
44  //Ignore the execss
45  if (row >= h || col >= wpl)
46  return;
47 
48  *(dword + pos) &= ~(*(sword + pos));
49 }\n
50 )
51 
52 KERNEL(
53 \n__kernel void pixSubtract(__global int *dword, __global int *sword,
54  const int wpl, const int h, __global int *outword)
55 {
56  const unsigned int row = get_global_id(1);
57  const unsigned int col = get_global_id(0);
58  const unsigned int pos = row * wpl + col;
59 
60  //Ignore the execss
61  if (row >= h || col >= wpl)
62  return;
63 
64  *(outword + pos) = *(dword + pos) & ~(*(sword + pos));
65 }\n
66 )
67 
68 KERNEL(
69 \n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword,
70  const int wpl, const int h)
71 {
72  const unsigned int row = get_global_id(1);
73  const unsigned int col = get_global_id(0);
74  const unsigned int pos = row * wpl + col;
75 
76  //Ignore the execss
77  if (row >= h || col >= wpl)
78  return;
79 
80  *(outword + pos) = *(dword + pos) & (*(sword + pos));
81 }\n
82 )
83 
84 KERNEL(
85 \n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword,
86  const int wpl, const int h)
87 {
88  const unsigned int row = get_global_id(1);
89  const unsigned int col = get_global_id(0);
90  const unsigned int pos = row * wpl + col;
91 
92  //Ignore the execss
93  if (row >= h || col >= wpl)
94  return;
95 
96  *(outword + pos) = *(dword + pos) | (*(sword + pos));
97 }\n
98 )
99 
100 KERNEL(
101 \n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
102  const int wpl, const int h)
103 {
104  const unsigned int pos = get_global_id(0);
105  unsigned int prevword, nextword, currword,tempword;
106  unsigned int destword;
107  const int col = pos % wpl;
108 
109  //Ignore the execss
110  if (pos >= (wpl * h))
111  return;
112 
113 
114  currword = *(sword + pos);
115  destword = currword;
116 
117  //Handle boundary conditions
118  if(col==0)
119  prevword=0;
120  else
121  prevword = *(sword + pos - 1);
122 
123  if(col==(wpl - 1))
124  nextword=0;
125  else
126  nextword = *(sword + pos + 1);
127 
128  //Loop unrolled
129 
130  //1 bit to left and 1 bit to right
131  //Get the max value on LHS of every pixel
132  tempword = (prevword << (31)) | ((currword >> 1));
133  destword |= tempword;
134  //Get max value on RHS of every pixel
135  tempword = (currword << 1) | (nextword >> (31));
136  destword |= tempword;
137 
138  //2 bit to left and 2 bit to right
139  //Get the max value on LHS of every pixel
140  tempword = (prevword << (30)) | ((currword >> 2));
141  destword |= tempword;
142  //Get max value on RHS of every pixel
143  tempword = (currword << 2) | (nextword >> (30));
144  destword |= tempword;
145 
146 
147  *(dword + pos) = destword;
148 
149 }\n
150 )
151 
152 KERNEL(
153 \n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
154  const int wpl, const int h)
155 {
156  const int col = get_global_id(0);
157  const int row = get_global_id(1);
158  const unsigned int pos = row * wpl + col;
159  unsigned int tempword;
160  unsigned int destword;
161  int i;
162 
163  //Ignore the execss
164  if (row >= h || col >= wpl)
165  return;
166 
167  destword = *(sword + pos);
168 
169  //2 words above
170  i = (row - 2) < 0 ? row : (row - 2);
171  tempword = *(sword + i*wpl + col);
172  destword |= tempword;
173 
174  //1 word above
175  i = (row - 1) < 0 ? row : (row - 1);
176  tempword = *(sword + i*wpl + col);
177  destword |= tempword;
178 
179  //1 word below
180  i = (row >= (h - 1)) ? row : (row + 1);
181  tempword = *(sword + i*wpl + col);
182  destword |= tempword;
183 
184  //2 words below
185  i = (row >= (h - 2)) ? row : (row + 2);
186  tempword = *(sword + i*wpl + col);
187  destword |= tempword;
188 
189  *(dword + pos) = destword;
190 }\n
191 )
192 
193 KERNEL(
194 \n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
195 {
196  const int col = get_global_id(0);
197  const int row = get_global_id(1);
198  const unsigned int pos = row * wpl + col;
199  unsigned int parbitsxp, parbitsxn, nwords;
200  unsigned int destword, tempword, lastword, currword;
201  unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
202  int i, j, siter, eiter;
203 
204  //Ignore the execss
205  if (pos >= (wpl*h) || (xn < 1 && xp < 1))
206  return;
207 
208  currword = *(sword + pos);
209  destword = currword;
210 
211  parbitsxp = xp & 31;
212  parbitsxn = xn & 31;
213  nwords = xp >> 5;
214 
215  if (parbitsxp > 0)
216  nwords += 1;
217  else
218  parbitsxp = 31;
219 
220  siter = (col - nwords);
221  eiter = (col + nwords);
222 
223  //Get prev word
224  if (col==0)
225  firstword = 0x0;
226  else
227  firstword = *(sword + pos - 1);
228 
229  //Get next word
230  if (col == (wpl - 1))
231  secondword = 0x0;
232  else
233  secondword = *(sword + pos + 1);
234 
235  //Last partial bits on either side
236  for (i = 1; i <= parbitsxp; i++)
237  {
238  //Get the max value on LHS of every pixel
239  tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
240 
241  destword |= tempword;
242 
243  //Get max value on RHS of every pixel
244  tempword = (currword << i) | (secondword >> (32 - i));
245  destword |= tempword;
246  }
247 
248  //Return if halfwidth <= 1 word
249  if (nwords == 1)
250  {
251  if (xn == 32)
252  {
253  destword |= firstword;
254  }
255  if (xp == 32)
256  {
257  destword |= secondword;
258  }
259 
260  *(dword + pos) = destword;
261  return;
262  }
263 
264  if (siter < 0)
265  firstword = 0x0;
266  else
267  firstword = *(sword + row*wpl + siter);
268 
269  if (eiter >= wpl)
270  lastword = 0x0;
271  else
272  lastword = *(sword + row*wpl + eiter);
273 
274  for ( i = 1; i < nwords; i++)
275  {
276  //Gets LHS words
277  if ((siter + i) < 0)
278  secondword = 0x0;
279  else
280  secondword = *(sword + row*wpl + siter + i);
281 
282  lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
283 
284  firstword = secondword;
285 
286  if ((siter + i + 1) < 0)
287  secondword = 0x0;
288  else
289  secondword = *(sword + row*wpl + siter + i + 1);
290 
291  lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
292 
293  //Gets RHS words
294  if ((eiter - i) >= wpl)
295  firstword = 0x0;
296  else
297  firstword = *(sword + row*wpl + eiter - i);
298 
299  rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
300 
301  lastword = firstword;
302  if ((eiter - i - 1) >= wpl)
303  firstword = 0x0;
304  else
305  firstword = *(sword + row*wpl + eiter - i - 1);
306 
307  rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
308 
309  for (j = 1; j < 32; j++)
310  {
311  //OR LHS full words
312  tempword = (lprevword << j) | (lnextword >> (32 - j));
313  destword |= tempword;
314 
315  //OR RHS full words
316  tempword = (rprevword << j) | (rnextword >> (32 - j));
317  destword |= tempword;
318  }
319 
320  destword |= lprevword;
321  destword |= lnextword;
322  destword |= rprevword;
323  destword |= rnextword;
324 
325  lastword = firstword;
326  firstword = secondword;
327  }
328 
329  *(dword + pos) = destword;
330 }\n
331 )
332 
333 KERNEL(
334 \n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
335  const int halfwidth,
336  const int wpl, const int h,
337  const char isEven)
338 {
339  const int col = get_global_id(0);
340  const int row = get_global_id(1);
341  const unsigned int pos = row * wpl + col;
342  unsigned int prevword, nextword, currword,tempword;
343  unsigned int destword;
344  int i;
345 
346  //Ignore the execss
347  if (pos >= (wpl * h))
348  return;
349 
350  currword = *(sword + pos);
351  destword = currword;
352 
353  //Handle boundary conditions
354  if(col==0)
355  prevword=0;
356  else
357  prevword = *(sword + pos - 1);
358 
359  if(col==(wpl - 1))
360  nextword=0;
361  else
362  nextword = *(sword + pos + 1);
363 
364  for (i = 1; i <= halfwidth; i++)
365  {
366  //Get the max value on LHS of every pixel
367  if (i == halfwidth && isEven)
368  {
369  tempword = 0x0;
370  }
371  else
372  {
373  tempword = (prevword << (32-i)) | ((currword >> i));
374  }
375 
376  destword |= tempword;
377 
378  //Get max value on RHS of every pixel
379  tempword = (currword << i) | (nextword >> (32 - i));
380 
381  destword |= tempword;
382  }
383 
384  *(dword + pos) = destword;
385 }\n
386 )
387 
388 KERNEL(
389 \n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
390  const int yp,
391  const int wpl, const int h,
392  const int yn)
393 {
394  const int col = get_global_id(0);
395  const int row = get_global_id(1);
396  const unsigned int pos = row * wpl + col;
397  unsigned int tempword;
398  unsigned int destword;
399  int i, siter, eiter;
400 
401  //Ignore the execss
402  if (row >= h || col >= wpl)
403  return;
404 
405  destword = *(sword + pos);
406 
407  //Set start position and end position considering the boundary conditions
408  siter = (row - yn) < 0 ? 0 : (row - yn);
409  eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
410 
411  for (i = siter; i <= eiter; i++)
412  {
413  tempword = *(sword + i*wpl + col);
414 
415  destword |= tempword;
416  }
417 
418  *(dword + pos) = destword;
419 }\n
420 )
421 
422 KERNEL(
423 \n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
424  const int wpl, const int h)
425 {
426  const unsigned int pos = get_global_id(0);
427  unsigned int prevword, nextword, currword,tempword;
428  unsigned int destword;
429  const int col = pos % wpl;
430 
431  //Ignore the execss
432  if (pos >= (wpl * h))
433  return;
434 
435  currword = *(sword + pos);
436  destword = currword;
437 
438  //Handle boundary conditions
439  if(col==0)
440  prevword=0xffffffff;
441  else
442  prevword = *(sword + pos - 1);
443 
444  if(col==(wpl - 1))
445  nextword=0xffffffff;
446  else
447  nextword = *(sword + pos + 1);
448 
449  //Loop unrolled
450 
451  //1 bit to left and 1 bit to right
452  //Get the min value on LHS of every pixel
453  tempword = (prevword << (31)) | ((currword >> 1));
454  destword &= tempword;
455  //Get min value on RHS of every pixel
456  tempword = (currword << 1) | (nextword >> (31));
457  destword &= tempword;
458 
459  //2 bit to left and 2 bit to right
460  //Get the min value on LHS of every pixel
461  tempword = (prevword << (30)) | ((currword >> 2));
462  destword &= tempword;
463  //Get min value on RHS of every pixel
464  tempword = (currword << 2) | (nextword >> (30));
465  destword &= tempword;
466 
467 
468  *(dword + pos) = destword;
469 
470 }\n
471 )
472 
473 KERNEL(
474 \n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
475  const int wpl, const int h,
476  const int fwmask, const int lwmask)
477 {
478  const int col = get_global_id(0);
479  const int row = get_global_id(1);
480  const unsigned int pos = row * wpl + col;
481  unsigned int tempword;
482  unsigned int destword;
483  int i;
484 
485  //Ignore the execss
486  if (row >= h || col >= wpl)
487  return;
488 
489  destword = *(sword + pos);
490 
491  if (row < 2 || row >= (h - 2))
492  {
493  destword = 0x0;
494  }
495  else
496  {
497  //2 words above
498  //i = (row - 2) < 0 ? row : (row - 2);
499  i = (row - 2);
500  tempword = *(sword + i*wpl + col);
501  destword &= tempword;
502 
503  //1 word above
504  //i = (row - 1) < 0 ? row : (row - 1);
505  i = (row - 1);
506  tempword = *(sword + i*wpl + col);
507  destword &= tempword;
508 
509  //1 word below
510  //i = (row >= (h - 1)) ? row : (row + 1);
511  i = (row + 1);
512  tempword = *(sword + i*wpl + col);
513  destword &= tempword;
514 
515  //2 words below
516  //i = (row >= (h - 2)) ? row : (row + 2);
517  i = (row + 2);
518  tempword = *(sword + i*wpl + col);
519  destword &= tempword;
520 
521  if (col == 0)
522  {
523  destword &= fwmask;
524  }
525  if (col == (wpl - 1))
526  {
527  destword &= lwmask;
528  }
529  }
530 
531 
532  *(dword + pos) = destword;
533 }\n
534 )
535 
536 KERNEL(
537 \n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl,
538  const int h, const char isAsymmetric, const int rwmask, const int lwmask)
539 {
540  const int col = get_global_id(0);
541  const int row = get_global_id(1);
542  const unsigned int pos = row * wpl + col;
543  unsigned int parbitsxp, parbitsxn, nwords;
544  unsigned int destword, tempword, lastword, currword;
545  unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
546  int i, j, siter, eiter;
547 
548  //Ignore the execss
549  if (pos >= (wpl*h) || (xn < 1 && xp < 1))
550  return;
551 
552  currword = *(sword + pos);
553  destword = currword;
554 
555  parbitsxp = xp & 31;
556  parbitsxn = xn & 31;
557  nwords = xp >> 5;
558 
559  if (parbitsxp > 0)
560  nwords += 1;
561  else
562  parbitsxp = 31;
563 
564  siter = (col - nwords);
565  eiter = (col + nwords);
566 
567  //Get prev word
568  if (col==0)
569  firstword = 0xffffffff;
570  else
571  firstword = *(sword + pos - 1);
572 
573  //Get next word
574  if (col == (wpl - 1))
575  secondword = 0xffffffff;
576  else
577  secondword = *(sword + pos + 1);
578 
579  //Last partial bits on either side
580  for (i = 1; i <= parbitsxp; i++)
581  {
582  //Get the max value on LHS of every pixel
583  tempword = (firstword << (32-i)) | ((currword >> i));
584  destword &= tempword;
585 
586  //Get max value on RHS of every pixel
587  tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
588 
589  //tempword = (currword << i) | (secondword >> (32 - i));
590  destword &= tempword;
591  }
592 
593  //Return if halfwidth <= 1 word
594  if (nwords == 1)
595  {
596  if (xp == 32)
597  {
598  destword &= firstword;
599  }
600  if (xn == 32)
601  {
602  destword &= secondword;
603  }
604 
605  //Clear boundary pixels
606  if (isAsymmetric)
607  {
608  if (col == 0)
609  destword &= rwmask;
610  if (col == (wpl - 1))
611  destword &= lwmask;
612  }
613 
614  *(dword + pos) = destword;
615  return;
616  }
617 
618  if (siter < 0)
619  firstword = 0xffffffff;
620  else
621  firstword = *(sword + row*wpl + siter);
622 
623  if (eiter >= wpl)
624  lastword = 0xffffffff;
625  else
626  lastword = *(sword + row*wpl + eiter);
627 
628 
629  for ( i = 1; i < nwords; i++)
630  {
631  //Gets LHS words
632  if ((siter + i) < 0)
633  secondword = 0xffffffff;
634  else
635  secondword = *(sword + row*wpl + siter + i);
636 
637  lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
638 
639  firstword = secondword;
640 
641  if ((siter + i + 1) < 0)
642  secondword = 0xffffffff;
643  else
644  secondword = *(sword + row*wpl + siter + i + 1);
645 
646  lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
647 
648  //Gets RHS words
649  if ((eiter - i) >= wpl)
650  firstword = 0xffffffff;
651  else
652  firstword = *(sword + row*wpl + eiter - i);
653 
654  rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
655 
656  lastword = firstword;
657  if ((eiter - i - 1) >= wpl)
658  firstword = 0xffffffff;
659  else
660  firstword = *(sword + row*wpl + eiter - i - 1);
661 
662  rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
663 
664  for (j = 0; j < 32; j++)
665  {
666  //OR LHS full words
667  tempword = (lprevword << j) | (lnextword >> (32 - j));
668  destword &= tempword;
669 
670  //OR RHS full words
671  tempword = (rprevword << j) | (rnextword >> (32 - j));
672  destword &= tempword;
673  }
674 
675  destword &= lprevword;
676  destword &= lnextword;
677  destword &= rprevword;
678  destword &= rnextword;
679 
680  lastword = firstword;
681  firstword = secondword;
682  }
683 
684  if (isAsymmetric)
685  {
686  //Clear boundary pixels
687  if (col < (nwords - 1))
688  destword = 0x0;
689  else if (col == (nwords - 1))
690  destword &= rwmask;
691  else if (col > (wpl - nwords))
692  destword = 0x0;
693  else if (col == (wpl - nwords))
694  destword &= lwmask;
695  }
696 
697  *(dword + pos) = destword;
698 }\n
699 )
700 
701 KERNEL(
702 \n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
703  const int halfwidth, const int wpl,
704  const int h, const char clearBoundPixH,
705  const int rwmask, const int lwmask,
706  const char isEven)
707 {
708  const int col = get_global_id(0);
709  const int row = get_global_id(1);
710  const unsigned int pos = row * wpl + col;
711  unsigned int prevword, nextword, currword,tempword, destword;
712  int i;
713 
714  //Ignore the execss
715  if (pos >= (wpl * h))
716  return;
717 
718  currword = *(sword + pos);
719  destword = currword;
720 
721  //Handle boundary conditions
722  if(col==0)
723  prevword=0xffffffff;
724  else
725  prevword = *(sword + pos - 1);
726 
727  if(col==(wpl - 1))
728  nextword=0xffffffff;
729  else
730  nextword = *(sword + pos + 1);
731 
732  for (i = 1; i <= halfwidth; i++)
733  {
734  //Get the min value on LHS of every pixel
735  tempword = (prevword << (32-i)) | ((currword >> i));
736 
737  destword &= tempword;
738 
739  //Get min value on RHS of every pixel
740  if (i == halfwidth && isEven)
741  {
742  tempword = 0xffffffff;
743  }
744  else
745  {
746  tempword = (currword << i) | (nextword >> (32 - i));
747  }
748 
749  destword &= tempword;
750  }
751 
752  if (clearBoundPixH)
753  {
754  if (col == 0)
755  {
756  destword &= rwmask;
757  }
758  else if (col == (wpl - 1))
759  {
760  destword &= lwmask;
761  }
762  }
763 
764  *(dword + pos) = destword;
765 }\n
766 )
767 
768 KERNEL(
769 \n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
770  const int yp,
771  const int wpl, const int h,
772  const char clearBoundPixV, const int yn)
773 {
774  const int col = get_global_id(0);
775  const int row = get_global_id(1);
776  const unsigned int pos = row * wpl + col;
777  unsigned int tempword, destword;
778  int i, siter, eiter;
779 
780  //Ignore the execss
781  if (row >= h || col >= wpl)
782  return;
783 
784  destword = *(sword + pos);
785 
786  //Set start position and end position considering the boundary conditions
787  siter = (row - yp) < 0 ? 0 : (row - yp);
788  eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
789 
790  for (i = siter; i <= eiter; i++)
791  {
792  tempword = *(sword + i*wpl + col);
793 
794  destword &= tempword;
795  }
796 
797  //Clear boundary pixels
798  if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
799  {
800  destword = 0x0;
801  }
802 
803  *(dword + pos) = destword;
804 }\n
805 )
806 
807 // HistogramRect Kernel: Accumulate
808 // assumes 4 channels, i.e., bytes_per_pixel = 4
809 // assumes number of pixels is multiple of 8
810 // data is layed out as
811 // ch0 ch1 ...
812 // bin0 bin1 bin2... bin0...
813 // rpt0,1,2...256 rpt0,1,2...
814 KERNEL(
815 \n#define HIST_REDUNDANCY 256\n
816 \n#define GROUP_SIZE 256\n
817 \n#define HIST_SIZE 256\n
818 \n#define NUM_CHANNELS 4\n
819 \n#define HR_UNROLL_SIZE 8 \n
820 \n#define HR_UNROLL_TYPE uchar8 \n
821 
822 __attribute__((reqd_work_group_size(256, 1, 1)))
823 __kernel
824 void kernel_HistogramRectAllChannels(
825  __global const uchar8 *data,
826  uint numPixels,
827  __global uint *histBuffer) {
828 
829  // declare variables
830  uchar8 pixels;
831  int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
832 
833  // for each pixel/channel, accumulate in global memory
834  for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
835  pixels = data[pc];
836  // channel bin thread
837  atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset ]); // ch0
838  atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset ]); // ch0
839  atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset ]); // ch1
840  atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset ]); // ch1
841  atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset ]); // ch2
842  atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset ]); // ch2
843  atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset ]); // ch3
844  atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset ]); // ch3
845  }
846 }
847 )
848 
849 KERNEL(
850 // NUM_CHANNELS = 1
851 __attribute__((reqd_work_group_size(256, 1, 1)))
852 __kernel
853 void kernel_HistogramRectOneChannel(
854  __global const uchar8 *data,
855  uint numPixels,
856  __global uint *histBuffer) {
857 
858  // declare variables
859  uchar8 pixels;
860  int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
861 
862  // for each pixel/channel, accumulate in global memory
863  for ( uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
864  pixels = data[pc];
865  // bin thread
866  atomic_inc( &histBuffer[ pixels.s0*HIST_REDUNDANCY + threadOffset ]);
867  atomic_inc( &histBuffer[ pixels.s1*HIST_REDUNDANCY + threadOffset ]);
868  atomic_inc( &histBuffer[ pixels.s2*HIST_REDUNDANCY + threadOffset ]);
869  atomic_inc( &histBuffer[ pixels.s3*HIST_REDUNDANCY + threadOffset ]);
870  atomic_inc( &histBuffer[ pixels.s4*HIST_REDUNDANCY + threadOffset ]);
871  atomic_inc( &histBuffer[ pixels.s5*HIST_REDUNDANCY + threadOffset ]);
872  atomic_inc( &histBuffer[ pixels.s6*HIST_REDUNDANCY + threadOffset ]);
873  atomic_inc( &histBuffer[ pixels.s7*HIST_REDUNDANCY + threadOffset ]);
874  }
875 }
876 )
877 
878 
879 KERNEL(
880 // unused
881 \n __attribute__((reqd_work_group_size(256, 1, 1)))
882 \n __kernel
883 \n void kernel_HistogramRectAllChannels_Grey(
884 \n __global const uchar* data,
885 \n uint numPixels,
886 \n __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's
887 \n
888 \n /* declare variables */
889 \n
890 \n // work indices
891 \n size_t groupId = get_group_id(0);
892 \n size_t localId = get_local_id(0); // 0 -> 256-1
893 \n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
894 \n uint numThreads = get_global_size(0);
895 \n
896 \n /* accumulate in global memory */
897 \n for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
898 \n uchar value = data[ pc ];
899 \n int idx = value * get_global_size(0) + get_global_id(0);
900 \n histBuffer[ idx ]++;
901 \n
902 \n }
903 \n
904 \n } // kernel_HistogramRectAllChannels_Grey
905 
906 )
907 
908 // HistogramRect Kernel: Reduction
909 // only supports 4 channels
910 // each work group handles a single channel of a single histogram bin
911 KERNEL(
912 __attribute__((reqd_work_group_size(256, 1, 1)))
913 __kernel
914 void kernel_HistogramRectAllChannelsReduction(
915  int n, // unused pixel redundancy
916  __global uint *histBuffer,
917  __global int* histResult) {
918 
919  // declare variables
920  int channel = get_group_id(0)/HIST_SIZE;
921  int bin = get_group_id(0)%HIST_SIZE;
922  int value = 0;
923 
924  // accumulate in register
925  for ( uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
926  value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
927  }
928 
929  // reduction in local memory
930  __local int localHist[GROUP_SIZE];
931  localHist[get_local_id(0)] = value;
932  barrier(CLK_LOCAL_MEM_FENCE);
933  for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
934  if (get_local_id(0) < stride) {
935  value = localHist[ get_local_id(0)+stride];
936  }
937  barrier(CLK_LOCAL_MEM_FENCE);
938  if (get_local_id(0) < stride) {
939  localHist[ get_local_id(0)] += value;
940  }
941  barrier(CLK_LOCAL_MEM_FENCE);
942  }
943 
944  // write reduction to final result
945  if (get_local_id(0) == 0) {
946  histResult[get_group_id(0)] = localHist[0];
947  }
948 } // kernel_HistogramRectAllChannels
949 )
950 
951 
952 KERNEL(
953 // NUM_CHANNELS = 1
954 __attribute__((reqd_work_group_size(256, 1, 1)))
955 __kernel
956 void kernel_HistogramRectOneChannelReduction(
957  int n, // unused pixel redundancy
958  __global uint *histBuffer,
959  __global int* histResult) {
960 
961  // declare variables
962  // int channel = get_group_id(0)/HIST_SIZE;
963  int bin = get_group_id(0)%HIST_SIZE;
964  int value = 0;
965 
966  // accumulate in register
967  for ( int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
968  value += histBuffer[ bin*HIST_REDUNDANCY+i];
969  }
970 
971  // reduction in local memory
972  __local int localHist[GROUP_SIZE];
973  localHist[get_local_id(0)] = value;
974  barrier(CLK_LOCAL_MEM_FENCE);
975  for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
976  if (get_local_id(0) < stride) {
977  value = localHist[ get_local_id(0)+stride];
978  }
979  barrier(CLK_LOCAL_MEM_FENCE);
980  if (get_local_id(0) < stride) {
981  localHist[ get_local_id(0)] += value;
982  }
983  barrier(CLK_LOCAL_MEM_FENCE);
984  }
985 
986  // write reduction to final result
987  if (get_local_id(0) == 0) {
988  histResult[get_group_id(0)] = localHist[0];
989  }
990 } // kernel_HistogramRectOneChannelReduction
991 )
992 
993 
994 KERNEL(
995 // unused
996  // each work group (x256) handles a histogram bin
997 \n __attribute__((reqd_work_group_size(256, 1, 1)))
998 \n __kernel
999 \n void kernel_HistogramRectAllChannelsReduction_Grey(
1000 \n int n, // pixel redundancy that needs to be accumulated
1001 \n __global uint *histBuffer,
1002 \n __global uint* histResult) { // each wg accumulates 1 bin
1003 \n
1004 \n /* declare variables */
1005 \n
1006 \n // work indices
1007 \n size_t groupId = get_group_id(0);
1008 \n size_t localId = get_local_id(0); // 0 -> 256-1
1009 \n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
1010 \n uint numThreads = get_global_size(0);
1011 \n unsigned int hist = 0;
1012 \n
1013 \n /* accumulate in global memory */
1014 \n for ( uint p = 0; p < n; p+=GROUP_SIZE) {
1015 \n hist += histBuffer[ (get_group_id(0)*n + p)];
1016 \n }
1017 \n
1018 \n /* reduction in local memory */
1019 \n // populate local memory
1020 \n __local unsigned int localHist[GROUP_SIZE];
1021 
1022 \n localHist[localId] = hist;
1023 \n barrier(CLK_LOCAL_MEM_FENCE);
1024 \n
1025 \n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
1026 \n if (localId < stride) {
1027 \n hist = localHist[ (localId+stride)];
1028 \n }
1029 \n barrier(CLK_LOCAL_MEM_FENCE);
1030 \n if (localId < stride) {
1031 \n localHist[ localId] += hist;
1032 \n }
1033 \n barrier(CLK_LOCAL_MEM_FENCE);
1034 \n }
1035 \n
1036 \n if (localId == 0)
1037 \n histResult[get_group_id(0)] = localHist[0];
1038 \n
1039 \n } // kernel_HistogramRectAllChannelsReduction_Grey
1040 
1041 )
1042 
1043 // ThresholdRectToPix Kernel
1044 // only supports 4 channels
1045 // imageData is input image (24-bits/pixel)
1046 // pix is output image (1-bit/pixel)
1047 KERNEL(
1048 \n#define CHAR_VEC_WIDTH 8 \n
1049 \n#define PIXELS_PER_WORD 32 \n
1050 \n#define PIXELS_PER_BURST 8 \n
1051 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
1052  typedef union {
1053  uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
1054  uchar8 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
1055  } charVec;
1056 
1057 __attribute__((reqd_work_group_size(256, 1, 1)))
1058 __kernel
1059 void kernel_ThresholdRectToPix(
1060  __global const uchar8 *imageData,
1061  int height,
1062  int width,
1063  int wpl, // words per line
1064  __global int *thresholds,
1065  __global int *hi_values,
1066  __global int *pix) {
1067 
1068  // declare variables
1069  int pThresholds[NUM_CHANNELS];
1070  int pHi_Values[NUM_CHANNELS];
1071  for ( int i = 0; i < NUM_CHANNELS; i++) {
1072  pThresholds[i] = thresholds[i];
1073  pHi_Values[i] = hi_values[i];
1074  }
1075 
1076  // for each word (32 pixels) in output image
1077  for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
1078  unsigned int word = 0; // all bits start at zero
1079 
1080  // for each burst in word
1081  for ( int b = 0; b < BURSTS_PER_WORD; b++) {
1082 
1083  // load burst
1084  charVec pixels;
1085  for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
1086  pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i];
1087  }
1088 
1089  // for each pixel in burst
1090  for ( int p = 0; p < PIXELS_PER_BURST; p++) {
1091  for ( int c = 0; c < NUM_CHANNELS; c++) {
1092  unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
1093  if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
1094  word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
1095  }
1096  }
1097  }
1098  }
1099  pix[w] = word;
1100  }
1101 }
1102 
1103 // only supports 1 channel
1104  typedef union {
1105  uchar s[PIXELS_PER_BURST];
1106  uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
1107  } charVec1;
1108 
1109 __attribute__((reqd_work_group_size(256, 1, 1)))
1110 __kernel
1111 void kernel_ThresholdRectToPix_OneChan(
1112  __global const uchar8 *imageData,
1113  int height,
1114  int width,
1115  int wpl, // words per line
1116  __global int *thresholds,
1117  __global int *hi_values,
1118  __global int *pix) {
1119 
1120  // declare variables
1121  int pThresholds[1];
1122  int pHi_Values[1];
1123  for ( int i = 0; i < 1; i++) {
1124  pThresholds[i] = thresholds[i];
1125  pHi_Values[i] = hi_values[i];
1126  }
1127 
1128  // for each word (32 pixels) in output image
1129  for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
1130  unsigned int word = 0; // all bits start at zero
1131 
1132  // for each burst in word
1133  for ( int b = 0; b < BURSTS_PER_WORD; b++) {
1134 
1135  // load burst
1136  charVec1 pixels;
1137  for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
1138  pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
1139  }
1140 
1141  // for each pixel in burst
1142  for ( int p = 0; p < PIXELS_PER_BURST; p++) {
1143  for ( int c = 0; c < 1; c++) {
1144  unsigned char pixChan = pixels.s[p + c];
1145  if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
1146  word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
1147  }
1148  }
1149  }
1150  }
1151  pix[w] = word;
1152  }
1153 }
1154 )
1155 
1156  ; // close char*
1157 
1158 #endif // USE_EXTERNAL_KERNEL
1159 #endif //_OCL_KERNEL_H_
1160 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
1161 
1162 // Alternative histogram kernel written to use uchar and different global memory scattered write
1163 // was a little better for intel platforms but still not faster then native serial code
1164 #if 0
1165 /* data layed out as
1166  bin0 bin1 bin2...
1167  r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
1168 */
1169 \n__attribute__((reqd_work_group_size(256, 1, 1)))
1170 \n __kernel
1171 \n void kernel_HistogramRectAllChannels_uchar(
1172 \n volatile __global const uchar *data,
1173 \n uint numPixels,
1174 \n volatile __global uint *histBuffer) {
1175 \n
1176 \n // for each pixel/channel, accumulate in global memory
1177 \n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
1178 \n uchar value = data[pc];
1179 \n int idx = value*get_global_size(0) + get_global_id(0);
1180 \n histBuffer[ idx ]++; // coalesced if same value
1181 \n }
1182 \n } // kernel_HistogramRectAllChannels
1183 \n
1184 \n __attribute__((reqd_work_group_size(256, 1, 1)))
1185 \n __kernel
1186 \n void kernel_HistogramRectAllChannelsReduction_uchar(
1187 \n int n, // pixel redundancy that needs to be accumulated = nthreads/4
1188 \n __global uint4 *histBuffer,
1189 \n __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
1190 \n
1191 \n // declare variables
1192 \n int binIdx = get_group_id(0);
1193 \n size_t groupId = get_group_id(0);
1194 \n size_t localId = get_local_id(0); // 0 -> 256-1
1195 \n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
1196 \n uint numThreads = get_global_size(0);
1197 \n uint4 hist = {0, 0, 0, 0};
1198 \n
1199 \n // accumulate in register
1200 \n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
1201 \n hist += histBuffer[binIdx*n+p];
1202 \n }
1203 \n
1204 \n // reduction in local memory
1205 \n __local uint4 localHist[GROUP_SIZE];
1206 \n localHist[localId] = hist;
1207 \n barrier(CLK_LOCAL_MEM_FENCE);
1208 \n
1209 \n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
1210 \n if (localId < stride) {
1211 \n hist = localHist[ localId+stride];
1212 \n }
1213 \n barrier(CLK_LOCAL_MEM_FENCE);
1214 \n if (localId < stride) {
1215 \n localHist[ localId] += hist;
1216 \n }
1217 \n barrier(CLK_LOCAL_MEM_FENCE);
1218 \n }
1219 \n
1220 \n // write reduction to final result
1221 \n if (localId == 0) {
1222 \n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
1223 \n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
1224 \n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
1225 \n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
1226 \n }
1227 \n
1228 \n } // kernel_HistogramRectAllChannels
1229 #endif
const char * kernel_src
Definition: oclkernels.h:11
#define KERNEL(...)
Definition: oclkernels.h:5