4 #ifndef USE_EXTERNAL_KERNEL
5 #define KERNEL( ... )# __VA_ARGS__ "\n"
12 \n#ifdef KHR_DP_EXTENSION\n
13 \n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
14 \n#elif AMD_DP_EXTENSION\n
15 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
18 __kernel
void composeRGBPixel(__global uint *tiffdata,
int w,
int h,
int wpl, __global uint *output)
20 int i = get_global_id(1);
21 int j = get_global_id(0);
22 int tiffword,rval,gval,bval;
25 if ((i >= h) || (j >= w))
28 tiffword = tiffdata[i * w + j];
29 rval = ((tiffword) & 0xff);
30 gval = (((tiffword) >> 8) & 0xff);
31 bval = (((tiffword) >> 16) & 0xff);
32 output[i*wpl+j] = (rval << (8 * (
sizeof(uint) - 1 - 0))) | (gval << (8 * (
sizeof(uint) - 1 - 1))) | (bval << (8 * (
sizeof(uint) - 1 - 2)));
37 \n__kernel
void pixSubtract_inplace(__global
int *dword, __global
int *sword,
38 const int wpl,
const int h)
40 const unsigned int row = get_global_id(1);
41 const unsigned int col = get_global_id(0);
42 const unsigned int pos = row * wpl + col;
45 if (row >= h || col >= wpl)
48 *(dword + pos) &= ~(*(sword + pos));
53 \n__kernel
void pixSubtract(__global
int *dword, __global
int *sword,
54 const int wpl,
const int h, __global
int *outword)
56 const unsigned int row = get_global_id(1);
57 const unsigned int col = get_global_id(0);
58 const unsigned int pos = row * wpl + col;
61 if (row >= h || col >= wpl)
64 *(outword + pos) = *(dword + pos) & ~(*(sword + pos));
69 \n__kernel
void pixAND(__global
int *dword, __global
int *sword, __global
int *outword,
70 const int wpl,
const int h)
72 const unsigned int row = get_global_id(1);
73 const unsigned int col = get_global_id(0);
74 const unsigned int pos = row * wpl + col;
77 if (row >= h || col >= wpl)
80 *(outword + pos) = *(dword + pos) & (*(sword + pos));
85 \n__kernel
void pixOR(__global
int *dword, __global
int *sword, __global
int *outword,
86 const int wpl,
const int h)
88 const unsigned int row = get_global_id(1);
89 const unsigned int col = get_global_id(0);
90 const unsigned int pos = row * wpl + col;
93 if (row >= h || col >= wpl)
96 *(outword + pos) = *(dword + pos) | (*(sword + pos));
101 \n__kernel
void morphoDilateHor_5x5(__global
int *sword,__global
int *dword,
102 const int wpl,
const int h)
104 const unsigned int pos = get_global_id(0);
105 unsigned int prevword, nextword, currword,tempword;
106 unsigned int destword;
107 const int col = pos % wpl;
110 if (pos >= (wpl * h))
114 currword = *(sword + pos);
121 prevword = *(sword + pos - 1);
126 nextword = *(sword + pos + 1);
132 tempword = (prevword << (31)) | ((currword >> 1));
133 destword |= tempword;
135 tempword = (currword << 1) | (nextword >> (31));
136 destword |= tempword;
140 tempword = (prevword << (30)) | ((currword >> 2));
141 destword |= tempword;
143 tempword = (currword << 2) | (nextword >> (30));
144 destword |= tempword;
147 *(dword + pos) = destword;
153 \n__kernel
void morphoDilateVer_5x5(__global
int *sword,__global
int *dword,
154 const int wpl,
const int h)
156 const int col = get_global_id(0);
157 const int row = get_global_id(1);
158 const unsigned int pos = row * wpl + col;
159 unsigned int tempword;
160 unsigned int destword;
164 if (row >= h || col >= wpl)
167 destword = *(sword + pos);
170 i = (row - 2) < 0 ? row : (row - 2);
171 tempword = *(sword + i*wpl + col);
172 destword |= tempword;
175 i = (row - 1) < 0 ? row : (row - 1);
176 tempword = *(sword + i*wpl + col);
177 destword |= tempword;
180 i = (row >= (h - 1)) ? row : (row + 1);
181 tempword = *(sword + i*wpl + col);
182 destword |= tempword;
185 i = (row >= (h - 2)) ? row : (row + 2);
186 tempword = *(sword + i*wpl + col);
187 destword |= tempword;
189 *(dword + pos) = destword;
194 \n__kernel
void morphoDilateHor(__global
int *sword,__global
int *dword,
const int xp,
const int xn,
const int wpl,
const int h)
196 const int col = get_global_id(0);
197 const int row = get_global_id(1);
198 const unsigned int pos = row * wpl + col;
199 unsigned int parbitsxp, parbitsxn, nwords;
200 unsigned int destword, tempword, lastword, currword;
201 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
202 int i, j, siter, eiter;
205 if (pos >= (wpl*h) || (xn < 1 && xp < 1))
208 currword = *(sword + pos);
220 siter = (col - nwords);
221 eiter = (col + nwords);
227 firstword = *(sword + pos - 1);
230 if (col == (wpl - 1))
233 secondword = *(sword + pos + 1);
236 for (i = 1; i <= parbitsxp; i++)
239 tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
241 destword |= tempword;
244 tempword = (currword << i) | (secondword >> (32 - i));
245 destword |= tempword;
253 destword |= firstword;
257 destword |= secondword;
260 *(dword + pos) = destword;
267 firstword = *(sword + row*wpl + siter);
272 lastword = *(sword + row*wpl + eiter);
274 for ( i = 1; i < nwords; i++)
280 secondword = *(sword + row*wpl + siter + i);
282 lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
284 firstword = secondword;
286 if ((siter + i + 1) < 0)
289 secondword = *(sword + row*wpl + siter + i + 1);
291 lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
294 if ((eiter - i) >= wpl)
297 firstword = *(sword + row*wpl + eiter - i);
299 rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
301 lastword = firstword;
302 if ((eiter - i - 1) >= wpl)
305 firstword = *(sword + row*wpl + eiter - i - 1);
307 rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
309 for (j = 1; j < 32; j++)
312 tempword = (lprevword << j) | (lnextword >> (32 - j));
313 destword |= tempword;
316 tempword = (rprevword << j) | (rnextword >> (32 - j));
317 destword |= tempword;
320 destword |= lprevword;
321 destword |= lnextword;
322 destword |= rprevword;
323 destword |= rnextword;
325 lastword = firstword;
326 firstword = secondword;
329 *(dword + pos) = destword;
334 \n__kernel
void morphoDilateHor_32word(__global
int *sword,__global
int *dword,
336 const int wpl,
const int h,
339 const int col = get_global_id(0);
340 const int row = get_global_id(1);
341 const unsigned int pos = row * wpl + col;
342 unsigned int prevword, nextword, currword,tempword;
343 unsigned int destword;
347 if (pos >= (wpl * h))
350 currword = *(sword + pos);
357 prevword = *(sword + pos - 1);
362 nextword = *(sword + pos + 1);
364 for (i = 1; i <= halfwidth; i++)
367 if (i == halfwidth && isEven)
373 tempword = (prevword << (32-i)) | ((currword >> i));
376 destword |= tempword;
379 tempword = (currword << i) | (nextword >> (32 - i));
381 destword |= tempword;
384 *(dword + pos) = destword;
389 \n__kernel
void morphoDilateVer(__global
int *sword,__global
int *dword,
391 const int wpl,
const int h,
394 const int col = get_global_id(0);
395 const int row = get_global_id(1);
396 const unsigned int pos = row * wpl + col;
397 unsigned int tempword;
398 unsigned int destword;
402 if (row >= h || col >= wpl)
405 destword = *(sword + pos);
408 siter = (row - yn) < 0 ? 0 : (row - yn);
409 eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
411 for (i = siter; i <= eiter; i++)
413 tempword = *(sword + i*wpl + col);
415 destword |= tempword;
418 *(dword + pos) = destword;
423 \n__kernel
void morphoErodeHor_5x5(__global
int *sword,__global
int *dword,
424 const int wpl,
const int h)
426 const unsigned int pos = get_global_id(0);
427 unsigned int prevword, nextword, currword,tempword;
428 unsigned int destword;
429 const int col = pos % wpl;
432 if (pos >= (wpl * h))
435 currword = *(sword + pos);
442 prevword = *(sword + pos - 1);
447 nextword = *(sword + pos + 1);
453 tempword = (prevword << (31)) | ((currword >> 1));
454 destword &= tempword;
456 tempword = (currword << 1) | (nextword >> (31));
457 destword &= tempword;
461 tempword = (prevword << (30)) | ((currword >> 2));
462 destword &= tempword;
464 tempword = (currword << 2) | (nextword >> (30));
465 destword &= tempword;
468 *(dword + pos) = destword;
474 \n__kernel
void morphoErodeVer_5x5(__global
int *sword,__global
int *dword,
475 const int wpl,
const int h,
476 const int fwmask,
const int lwmask)
478 const int col = get_global_id(0);
479 const int row = get_global_id(1);
480 const unsigned int pos = row * wpl + col;
481 unsigned int tempword;
482 unsigned int destword;
486 if (row >= h || col >= wpl)
489 destword = *(sword + pos);
491 if (row < 2 || row >= (h - 2))
500 tempword = *(sword + i*wpl + col);
501 destword &= tempword;
506 tempword = *(sword + i*wpl + col);
507 destword &= tempword;
512 tempword = *(sword + i*wpl + col);
513 destword &= tempword;
518 tempword = *(sword + i*wpl + col);
519 destword &= tempword;
525 if (col == (wpl - 1))
532 *(dword + pos) = destword;
537 \n__kernel
void morphoErodeHor(__global
int *sword,__global
int *dword,
const int xp,
const int xn,
const int wpl,
538 const int h,
const char isAsymmetric,
const int rwmask,
const int lwmask)
540 const int col = get_global_id(0);
541 const int row = get_global_id(1);
542 const unsigned int pos = row * wpl + col;
543 unsigned int parbitsxp, parbitsxn, nwords;
544 unsigned int destword, tempword, lastword, currword;
545 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
546 int i, j, siter, eiter;
549 if (pos >= (wpl*h) || (xn < 1 && xp < 1))
552 currword = *(sword + pos);
564 siter = (col - nwords);
565 eiter = (col + nwords);
569 firstword = 0xffffffff;
571 firstword = *(sword + pos - 1);
574 if (col == (wpl - 1))
575 secondword = 0xffffffff;
577 secondword = *(sword + pos + 1);
580 for (i = 1; i <= parbitsxp; i++)
583 tempword = (firstword << (32-i)) | ((currword >> i));
584 destword &= tempword;
587 tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
590 destword &= tempword;
598 destword &= firstword;
602 destword &= secondword;
610 if (col == (wpl - 1))
614 *(dword + pos) = destword;
619 firstword = 0xffffffff;
621 firstword = *(sword + row*wpl + siter);
624 lastword = 0xffffffff;
626 lastword = *(sword + row*wpl + eiter);
629 for ( i = 1; i < nwords; i++)
633 secondword = 0xffffffff;
635 secondword = *(sword + row*wpl + siter + i);
637 lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
639 firstword = secondword;
641 if ((siter + i + 1) < 0)
642 secondword = 0xffffffff;
644 secondword = *(sword + row*wpl + siter + i + 1);
646 lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
649 if ((eiter - i) >= wpl)
650 firstword = 0xffffffff;
652 firstword = *(sword + row*wpl + eiter - i);
654 rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
656 lastword = firstword;
657 if ((eiter - i - 1) >= wpl)
658 firstword = 0xffffffff;
660 firstword = *(sword + row*wpl + eiter - i - 1);
662 rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
664 for (j = 0; j < 32; j++)
667 tempword = (lprevword << j) | (lnextword >> (32 - j));
668 destword &= tempword;
671 tempword = (rprevword << j) | (rnextword >> (32 - j));
672 destword &= tempword;
675 destword &= lprevword;
676 destword &= lnextword;
677 destword &= rprevword;
678 destword &= rnextword;
680 lastword = firstword;
681 firstword = secondword;
687 if (col < (nwords - 1))
689 else if (col == (nwords - 1))
691 else if (col > (wpl - nwords))
693 else if (col == (wpl - nwords))
697 *(dword + pos) = destword;
702 \n__kernel
void morphoErodeHor_32word(__global
int *sword,__global
int *dword,
703 const int halfwidth,
const int wpl,
704 const int h,
const char clearBoundPixH,
705 const int rwmask,
const int lwmask,
708 const int col = get_global_id(0);
709 const int row = get_global_id(1);
710 const unsigned int pos = row * wpl + col;
711 unsigned int prevword, nextword, currword,tempword, destword;
715 if (pos >= (wpl * h))
718 currword = *(sword + pos);
725 prevword = *(sword + pos - 1);
730 nextword = *(sword + pos + 1);
732 for (i = 1; i <= halfwidth; i++)
735 tempword = (prevword << (32-i)) | ((currword >> i));
737 destword &= tempword;
740 if (i == halfwidth && isEven)
742 tempword = 0xffffffff;
746 tempword = (currword << i) | (nextword >> (32 - i));
749 destword &= tempword;
758 else if (col == (wpl - 1))
764 *(dword + pos) = destword;
769 \n__kernel
void morphoErodeVer(__global
int *sword,__global
int *dword,
771 const int wpl,
const int h,
772 const char clearBoundPixV,
const int yn)
774 const int col = get_global_id(0);
775 const int row = get_global_id(1);
776 const unsigned int pos = row * wpl + col;
777 unsigned int tempword, destword;
781 if (row >= h || col >= wpl)
784 destword = *(sword + pos);
787 siter = (row - yp) < 0 ? 0 : (row - yp);
788 eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
790 for (i = siter; i <= eiter; i++)
792 tempword = *(sword + i*wpl + col);
794 destword &= tempword;
798 if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
803 *(dword + pos) = destword;
815 \n#define HIST_REDUNDANCY 256\n
816 \n#define GROUP_SIZE 256\n
817 \n#define HIST_SIZE 256\n
818 \n#define NUM_CHANNELS 4\n
819 \n#define HR_UNROLL_SIZE 8 \n
820 \n#define HR_UNROLL_TYPE uchar8 \n
822 __attribute__((reqd_work_group_size(256, 1, 1)))
824 void kernel_HistogramRectAllChannels(
825 __global
const uchar8 *data,
827 __global uint *histBuffer) {
831 int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
834 for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
837 atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset ]);
838 atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset ]);
839 atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset ]);
840 atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset ]);
841 atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset ]);
842 atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset ]);
843 atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset ]);
844 atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset ]);
851 __attribute__((reqd_work_group_size(256, 1, 1)))
853 void kernel_HistogramRectOneChannel(
854 __global
const uchar8 *data,
856 __global uint *histBuffer) {
860 int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
863 for ( uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
866 atomic_inc( &histBuffer[ pixels.s0*HIST_REDUNDANCY + threadOffset ]);
867 atomic_inc( &histBuffer[ pixels.s1*HIST_REDUNDANCY + threadOffset ]);
868 atomic_inc( &histBuffer[ pixels.s2*HIST_REDUNDANCY + threadOffset ]);
869 atomic_inc( &histBuffer[ pixels.s3*HIST_REDUNDANCY + threadOffset ]);
870 atomic_inc( &histBuffer[ pixels.s4*HIST_REDUNDANCY + threadOffset ]);
871 atomic_inc( &histBuffer[ pixels.s5*HIST_REDUNDANCY + threadOffset ]);
872 atomic_inc( &histBuffer[ pixels.s6*HIST_REDUNDANCY + threadOffset ]);
873 atomic_inc( &histBuffer[ pixels.s7*HIST_REDUNDANCY + threadOffset ]);
881 \n __attribute__((reqd_work_group_size(256, 1, 1)))
883 \n
void kernel_HistogramRectAllChannels_Grey(
884 \n __global
const uchar* data,
886 \n __global uint *histBuffer) {
891 \n
size_t groupId = get_group_id(0);
892 \n
size_t localId = get_local_id(0);
893 \n
size_t globalId = get_global_id(0);
894 \n uint numThreads = get_global_size(0);
897 \n
for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
898 \n uchar value = data[ pc ];
899 \n
int idx = value * get_global_size(0) + get_global_id(0);
900 \n histBuffer[ idx ]++;
912 __attribute__((reqd_work_group_size(256, 1, 1)))
914 void kernel_HistogramRectAllChannelsReduction(
916 __global uint *histBuffer,
917 __global
int* histResult) {
920 int channel = get_group_id(0)/HIST_SIZE;
921 int bin = get_group_id(0)%HIST_SIZE;
925 for ( uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
926 value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
930 __local
int localHist[GROUP_SIZE];
931 localHist[get_local_id(0)] = value;
932 barrier(CLK_LOCAL_MEM_FENCE);
933 for (
int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
934 if (get_local_id(0) < stride) {
935 value = localHist[ get_local_id(0)+stride];
937 barrier(CLK_LOCAL_MEM_FENCE);
938 if (get_local_id(0) < stride) {
939 localHist[ get_local_id(0)] += value;
941 barrier(CLK_LOCAL_MEM_FENCE);
945 if (get_local_id(0) == 0) {
946 histResult[get_group_id(0)] = localHist[0];
954 __attribute__((reqd_work_group_size(256, 1, 1)))
956 void kernel_HistogramRectOneChannelReduction(
958 __global uint *histBuffer,
959 __global
int* histResult) {
963 int bin = get_group_id(0)%HIST_SIZE;
967 for (
int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
968 value += histBuffer[ bin*HIST_REDUNDANCY+i];
972 __local
int localHist[GROUP_SIZE];
973 localHist[get_local_id(0)] = value;
974 barrier(CLK_LOCAL_MEM_FENCE);
975 for (
int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
976 if (get_local_id(0) < stride) {
977 value = localHist[ get_local_id(0)+stride];
979 barrier(CLK_LOCAL_MEM_FENCE);
980 if (get_local_id(0) < stride) {
981 localHist[ get_local_id(0)] += value;
983 barrier(CLK_LOCAL_MEM_FENCE);
987 if (get_local_id(0) == 0) {
988 histResult[get_group_id(0)] = localHist[0];
997 \n __attribute__((reqd_work_group_size(256, 1, 1)))
999 \n
void kernel_HistogramRectAllChannelsReduction_Grey(
1001 \n __global uint *histBuffer,
1002 \n __global uint* histResult) {
1007 \n
size_t groupId = get_group_id(0);
1008 \n
size_t localId = get_local_id(0);
1009 \n
size_t globalId = get_global_id(0);
1010 \n uint numThreads = get_global_size(0);
1011 \n
unsigned int hist = 0;
1014 \n
for ( uint p = 0; p < n; p+=GROUP_SIZE) {
1015 \n hist += histBuffer[ (get_group_id(0)*n + p)];
1020 \n __local
unsigned int localHist[GROUP_SIZE];
1022 \n localHist[localId] = hist;
1023 \n barrier(CLK_LOCAL_MEM_FENCE);
1025 \n
for (
int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
1026 \n
if (localId < stride) {
1027 \n hist = localHist[ (localId+stride)];
1029 \n barrier(CLK_LOCAL_MEM_FENCE);
1030 \n
if (localId < stride) {
1031 \n localHist[ localId] += hist;
1033 \n barrier(CLK_LOCAL_MEM_FENCE);
1036 \n
if (localId == 0)
1037 \n histResult[get_group_id(0)] = localHist[0];
1048 \n#define CHAR_VEC_WIDTH 8 \n
1049 \n#define PIXELS_PER_WORD 32 \n
1050 \n#define PIXELS_PER_BURST 8 \n
1051 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
1053 uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
1054 uchar8 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
1057 __attribute__((reqd_work_group_size(256, 1, 1)))
1059 void kernel_ThresholdRectToPix(
1060 __global const uchar8 *imageData,
1064 __global
int *thresholds,
1065 __global
int *hi_values,
1066 __global
int *pix) {
1069 int pThresholds[NUM_CHANNELS];
1070 int pHi_Values[NUM_CHANNELS];
1071 for (
int i = 0; i < NUM_CHANNELS; i++) {
1072 pThresholds[i] = thresholds[i];
1073 pHi_Values[i] = hi_values[i];
1077 for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
1078 unsigned int word = 0;
1081 for (
int b = 0; b < BURSTS_PER_WORD; b++) {
1085 for (
int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
1086 pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i];
1090 for (
int p = 0; p < PIXELS_PER_BURST; p++) {
1091 for (
int c = 0; c < NUM_CHANNELS; c++) {
1092 unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
1093 if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
1094 word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
1105 uchar s[PIXELS_PER_BURST];
1106 uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
1109 __attribute__((reqd_work_group_size(256, 1, 1)))
1111 void kernel_ThresholdRectToPix_OneChan(
1112 __global const uchar8 *imageData,
1116 __global
int *thresholds,
1117 __global
int *hi_values,
1118 __global
int *pix) {
1123 for (
int i = 0; i < 1; i++) {
1124 pThresholds[i] = thresholds[i];
1125 pHi_Values[i] = hi_values[i];
1129 for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
1130 unsigned int word = 0;
1133 for (
int b = 0; b < BURSTS_PER_WORD; b++) {
1137 for (
int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
1138 pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
1142 for (
int p = 0; p < PIXELS_PER_BURST; p++) {
1143 for (
int c = 0; c < 1; c++) {
1144 unsigned char pixChan = pixels.s[p + c];
1145 if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
1146 word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
1158 #endif // USE_EXTERNAL_KERNEL
1159 #endif //_OCL_KERNEL_H_
1169 \n__attribute__((reqd_work_group_size(256, 1, 1)))
1171 \n
void kernel_HistogramRectAllChannels_uchar(
1172 \n
volatile __global
const uchar *data,
1174 \n
volatile __global uint *histBuffer) {
1177 \n
for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
1178 \n uchar value = data[pc];
1179 \n
int idx = value*get_global_size(0) + get_global_id(0);
1180 \n histBuffer[ idx ]++;
1184 \n __attribute__((reqd_work_group_size(256, 1, 1)))
1186 \n
void kernel_HistogramRectAllChannelsReduction_uchar(
1188 \n __global uint4 *histBuffer,
1189 \n __global uint* histResult) {
1192 \n
int binIdx = get_group_id(0);
1193 \n
size_t groupId = get_group_id(0);
1194 \n
size_t localId = get_local_id(0);
1195 \n
size_t globalId = get_global_id(0);
1196 \n uint numThreads = get_global_size(0);
1197 \n uint4 hist = {0, 0, 0, 0};
1200 \n
for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
1201 \n hist += histBuffer[binIdx*n+p];
1205 \n __local uint4 localHist[GROUP_SIZE];
1206 \n localHist[localId] = hist;
1207 \n barrier(CLK_LOCAL_MEM_FENCE);
1209 \n
for (
int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
1210 \n
if (localId < stride) {
1211 \n hist = localHist[ localId+stride];
1213 \n barrier(CLK_LOCAL_MEM_FENCE);
1214 \n
if (localId < stride) {
1215 \n localHist[ localId] += hist;
1217 \n barrier(CLK_LOCAL_MEM_FENCE);
1221 \n
if (localId == 0) {
1222 \n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
1223 \n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
1224 \n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
1225 \n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;