11 #ifndef TESSERACT_OPENCL_OCLKERNELS_H_
12 #define TESSERACT_OPENCL_OCLKERNELS_H_
14 #ifndef USE_EXTERNAL_KERNEL
15 #define KERNEL(...) #__VA_ARGS__ "\n"
21 static const char* kernel_src =
KERNEL(
22 \n#ifdef KHR_DP_EXTENSION\n
23 \n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
24 \n#elif AMD_DP_EXTENSION\n
25 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
28 __kernel
void composeRGBPixel(__global
uint *tiffdata,
int w,
int h,
int wpl, __global
uint *output)
30 int i = get_global_id(1);
31 int j = get_global_id(0);
32 int tiffword,rval,gval,bval;
35 if ((i >= h) || (j >= w))
38 tiffword = tiffdata[i * w + j];
39 rval = ((tiffword) & 0xff);
40 gval = (((tiffword) >> 8) & 0xff);
41 bval = (((tiffword) >> 16) & 0xff);
42 output[i*wpl+j] = (rval << (8 * (
sizeof(
uint) - 1 - 0))) | (gval << (8 * (
sizeof(
uint) - 1 - 1))) | (bval << (8 * (
sizeof(
uint) - 1 - 2)));
47 \n__kernel
void pixSubtract_inplace(__global
int *dword, __global
int *sword,
48 const int wpl,
const int h)
50 const unsigned int row = get_global_id(1);
51 const unsigned int col = get_global_id(0);
52 const unsigned int pos = row * wpl + col;
55 if (row >= h || col >= wpl)
58 *(dword + pos) &= ~(*(sword + pos));
63 \n__kernel
void morphoDilateHor_5x5(__global
int *sword,__global
int *dword,
64 const int wpl,
const int h)
66 const unsigned int pos = get_global_id(0);
67 unsigned int prevword, nextword, currword,tempword;
68 unsigned int destword;
69 const int col = pos % wpl;
76 currword = *(sword + pos);
83 prevword = *(sword + pos - 1);
88 nextword = *(sword + pos + 1);
94 tempword = (prevword << (31)) | ((currword >> 1));
97 tempword = (currword << 1) | (nextword >> (31));
102 tempword = (prevword << (30)) | ((currword >> 2));
103 destword |= tempword;
105 tempword = (currword << 2) | (nextword >> (30));
106 destword |= tempword;
109 *(dword + pos) = destword;
115 \n__kernel
void morphoDilateVer_5x5(__global
int *sword,__global
int *dword,
116 const int wpl,
const int h)
118 const int col = get_global_id(0);
119 const int row = get_global_id(1);
120 const unsigned int pos = row * wpl + col;
121 unsigned int tempword;
122 unsigned int destword;
126 if (row >= h || col >= wpl)
129 destword = *(sword + pos);
132 i = (row - 2) < 0 ? row : (row - 2);
133 tempword = *(sword + i*wpl + col);
134 destword |= tempword;
137 i = (row - 1) < 0 ? row : (row - 1);
138 tempword = *(sword + i*wpl + col);
139 destword |= tempword;
142 i = (row >= (h - 1)) ? row : (row + 1);
143 tempword = *(sword + i*wpl + col);
144 destword |= tempword;
147 i = (row >= (h - 2)) ? row : (row + 2);
148 tempword = *(sword + i*wpl + col);
149 destword |= tempword;
151 *(dword + pos) = destword;
156 \n__kernel
void morphoDilateHor(__global
int *sword,__global
int *dword,
const int xp,
const int xn,
const int wpl,
const int h)
158 const int col = get_global_id(0);
159 const int row = get_global_id(1);
160 const unsigned int pos = row * wpl + col;
161 unsigned int parbitsxp, parbitsxn, nwords;
162 unsigned int destword, tempword, lastword, currword;
163 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
164 int i, j, siter, eiter;
167 if (pos >= (wpl*h) || (xn < 1 && xp < 1))
170 currword = *(sword + pos);
182 siter = (col - nwords);
183 eiter = (col + nwords);
189 firstword = *(sword + pos - 1);
192 if (col == (wpl - 1))
195 secondword = *(sword + pos + 1);
198 for (i = 1; i <= parbitsxp; i++)
201 tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
203 destword |= tempword;
206 tempword = (currword << i) | (secondword >> (32 - i));
207 destword |= tempword;
215 destword |= firstword;
219 destword |= secondword;
222 *(dword + pos) = destword;
229 firstword = *(sword + row*wpl + siter);
234 lastword = *(sword + row*wpl + eiter);
236 for (i = 1; i < nwords; i++)
242 secondword = *(sword + row*wpl + siter + i);
244 lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
246 firstword = secondword;
248 if ((siter + i + 1) < 0)
251 secondword = *(sword + row*wpl + siter + i + 1);
253 lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
256 if ((eiter - i) >= wpl)
259 firstword = *(sword + row*wpl + eiter - i);
261 rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
263 lastword = firstword;
264 if ((eiter - i - 1) >= wpl)
267 firstword = *(sword + row*wpl + eiter - i - 1);
269 rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
271 for (j = 1; j < 32; j++)
274 tempword = (lprevword << j) | (lnextword >> (32 - j));
275 destword |= tempword;
278 tempword = (rprevword << j) | (rnextword >> (32 - j));
279 destword |= tempword;
282 destword |= lprevword;
283 destword |= lnextword;
284 destword |= rprevword;
285 destword |= rnextword;
287 lastword = firstword;
288 firstword = secondword;
291 *(dword + pos) = destword;
296 \n__kernel
void morphoDilateHor_32word(__global
int *sword,__global
int *dword,
298 const int wpl,
const int h,
301 const int col = get_global_id(0);
302 const int row = get_global_id(1);
303 const unsigned int pos = row * wpl + col;
304 unsigned int prevword, nextword, currword,tempword;
305 unsigned int destword;
309 if (pos >= (wpl * h))
312 currword = *(sword + pos);
319 prevword = *(sword + pos - 1);
324 nextword = *(sword + pos + 1);
326 for (i = 1; i <= halfwidth; i++)
329 if (i == halfwidth && isEven)
335 tempword = (prevword << (32-i)) | ((currword >> i));
338 destword |= tempword;
341 tempword = (currword << i) | (nextword >> (32 - i));
343 destword |= tempword;
346 *(dword + pos) = destword;
351 \n__kernel
void morphoDilateVer(__global
int *sword,__global
int *dword,
353 const int wpl,
const int h,
356 const int col = get_global_id(0);
357 const int row = get_global_id(1);
358 const unsigned int pos = row * wpl + col;
359 unsigned int tempword;
360 unsigned int destword;
364 if (row >= h || col >= wpl)
367 destword = *(sword + pos);
370 siter = (row - yn) < 0 ? 0 : (row - yn);
371 eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
373 for (i = siter; i <= eiter; i++)
375 tempword = *(sword + i*wpl + col);
377 destword |= tempword;
380 *(dword + pos) = destword;
385 \n__kernel
void morphoErodeHor_5x5(__global
int *sword,__global
int *dword,
386 const int wpl,
const int h)
388 const unsigned int pos = get_global_id(0);
389 unsigned int prevword, nextword, currword,tempword;
390 unsigned int destword;
391 const int col = pos % wpl;
394 if (pos >= (wpl * h))
397 currword = *(sword + pos);
404 prevword = *(sword + pos - 1);
409 nextword = *(sword + pos + 1);
415 tempword = (prevword << (31)) | ((currword >> 1));
416 destword &= tempword;
418 tempword = (currword << 1) | (nextword >> (31));
419 destword &= tempword;
423 tempword = (prevword << (30)) | ((currword >> 2));
424 destword &= tempword;
426 tempword = (currword << 2) | (nextword >> (30));
427 destword &= tempword;
430 *(dword + pos) = destword;
436 \n__kernel
void morphoErodeVer_5x5(__global
int *sword,__global
int *dword,
437 const int wpl,
const int h,
438 const int fwmask,
const int lwmask)
440 const int col = get_global_id(0);
441 const int row = get_global_id(1);
442 const unsigned int pos = row * wpl + col;
443 unsigned int tempword;
444 unsigned int destword;
448 if (row >= h || col >= wpl)
451 destword = *(sword + pos);
453 if (row < 2 || row >= (h - 2))
462 tempword = *(sword + i*wpl + col);
463 destword &= tempword;
468 tempword = *(sword + i*wpl + col);
469 destword &= tempword;
474 tempword = *(sword + i*wpl + col);
475 destword &= tempword;
480 tempword = *(sword + i*wpl + col);
481 destword &= tempword;
487 if (col == (wpl - 1))
494 *(dword + pos) = destword;
499 \n__kernel
void morphoErodeHor(__global
int *sword,__global
int *dword,
const int xp,
const int xn,
const int wpl,
500 const int h,
const char isAsymmetric,
const int rwmask,
const int lwmask)
502 const int col = get_global_id(0);
503 const int row = get_global_id(1);
504 const unsigned int pos = row * wpl + col;
505 unsigned int parbitsxp, parbitsxn, nwords;
506 unsigned int destword, tempword, lastword, currword;
507 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
508 int i, j, siter, eiter;
511 if (pos >= (wpl*h) || (xn < 1 && xp < 1))
514 currword = *(sword + pos);
526 siter = (col - nwords);
527 eiter = (col + nwords);
531 firstword = 0xffffffff;
533 firstword = *(sword + pos - 1);
536 if (col == (wpl - 1))
537 secondword = 0xffffffff;
539 secondword = *(sword + pos + 1);
542 for (i = 1; i <= parbitsxp; i++)
545 tempword = (firstword << (32-i)) | ((currword >> i));
546 destword &= tempword;
549 tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
552 destword &= tempword;
560 destword &= firstword;
564 destword &= secondword;
572 if (col == (wpl - 1))
576 *(dword + pos) = destword;
581 firstword = 0xffffffff;
583 firstword = *(sword + row*wpl + siter);
586 lastword = 0xffffffff;
588 lastword = *(sword + row*wpl + eiter);
591 for (i = 1; i < nwords; i++)
595 secondword = 0xffffffff;
597 secondword = *(sword + row*wpl + siter + i);
599 lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
601 firstword = secondword;
603 if ((siter + i + 1) < 0)
604 secondword = 0xffffffff;
606 secondword = *(sword + row*wpl + siter + i + 1);
608 lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
611 if ((eiter - i) >= wpl)
612 firstword = 0xffffffff;
614 firstword = *(sword + row*wpl + eiter - i);
616 rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
618 lastword = firstword;
619 if ((eiter - i - 1) >= wpl)
620 firstword = 0xffffffff;
622 firstword = *(sword + row*wpl + eiter - i - 1);
624 rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
626 for (j = 0; j < 32; j++)
629 tempword = (lprevword << j) | (lnextword >> (32 - j));
630 destword &= tempword;
633 tempword = (rprevword << j) | (rnextword >> (32 - j));
634 destword &= tempword;
637 destword &= lprevword;
638 destword &= lnextword;
639 destword &= rprevword;
640 destword &= rnextword;
642 lastword = firstword;
643 firstword = secondword;
649 if (col < (nwords - 1))
651 else if (col == (nwords - 1))
653 else if (col > (wpl - nwords))
655 else if (col == (wpl - nwords))
659 *(dword + pos) = destword;
664 \n__kernel
void morphoErodeHor_32word(__global
int *sword,__global
int *dword,
665 const int halfwidth,
const int wpl,
666 const int h,
const char clearBoundPixH,
667 const int rwmask,
const int lwmask,
670 const int col = get_global_id(0);
671 const int row = get_global_id(1);
672 const unsigned int pos = row * wpl + col;
673 unsigned int prevword, nextword, currword,tempword, destword;
677 if (pos >= (wpl * h))
680 currword = *(sword + pos);
687 prevword = *(sword + pos - 1);
692 nextword = *(sword + pos + 1);
694 for (i = 1; i <= halfwidth; i++)
697 tempword = (prevword << (32-i)) | ((currword >> i));
699 destword &= tempword;
702 if (i == halfwidth && isEven)
704 tempword = 0xffffffff;
708 tempword = (currword << i) | (nextword >> (32 - i));
711 destword &= tempword;
720 else if (col == (wpl - 1))
726 *(dword + pos) = destword;
731 \n__kernel
void morphoErodeVer(__global
int *sword,__global
int *dword,
733 const int wpl,
const int h,
734 const char clearBoundPixV,
const int yn)
736 const int col = get_global_id(0);
737 const int row = get_global_id(1);
738 const unsigned int pos = row * wpl + col;
739 unsigned int tempword, destword;
743 if (row >= h || col >= wpl)
746 destword = *(sword + pos);
749 siter = (row - yp) < 0 ? 0 : (row - yp);
750 eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
752 for (i = siter; i <= eiter; i++)
754 tempword = *(sword + i*wpl + col);
756 destword &= tempword;
760 if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
765 *(dword + pos) = destword;
777 \n#define HIST_REDUNDANCY 256\n
778 \n#define GROUP_SIZE 256\n
779 \n#define HIST_SIZE 256\n
780 \n#define NUM_CHANNELS 4\n
781 \n#define HR_UNROLL_SIZE 8 \n
782 \n#define HR_UNROLL_TYPE uchar8 \n
784 __attribute__((reqd_work_group_size(256, 1, 1)))
786 void kernel_HistogramRectAllChannels(
787 __global const uchar8 *data,
789 __global
uint *histBuffer) {
793 int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
796 for (
uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0)) {
799 atomic_inc(&histBuffer[0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset]);
800 atomic_inc(&histBuffer[0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset]);
801 atomic_inc(&histBuffer[1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset]);
802 atomic_inc(&histBuffer[1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset]);
803 atomic_inc(&histBuffer[2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset]);
804 atomic_inc(&histBuffer[2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset]);
805 atomic_inc(&histBuffer[3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset]);
806 atomic_inc(&histBuffer[3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset]);
813 __attribute__((reqd_work_group_size(256, 1, 1)))
815 void kernel_HistogramRectOneChannel(
816 __global const uchar8 *data,
818 __global
uint *histBuffer) {
822 int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
825 for (
uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0)) {
828 atomic_inc(&histBuffer[pixels.s0*HIST_REDUNDANCY + threadOffset]);
829 atomic_inc(&histBuffer[pixels.s1*HIST_REDUNDANCY + threadOffset]);
830 atomic_inc(&histBuffer[pixels.s2*HIST_REDUNDANCY + threadOffset]);
831 atomic_inc(&histBuffer[pixels.s3*HIST_REDUNDANCY + threadOffset]);
832 atomic_inc(&histBuffer[pixels.s4*HIST_REDUNDANCY + threadOffset]);
833 atomic_inc(&histBuffer[pixels.s5*HIST_REDUNDANCY + threadOffset]);
834 atomic_inc(&histBuffer[pixels.s6*HIST_REDUNDANCY + threadOffset]);
835 atomic_inc(&histBuffer[pixels.s7*HIST_REDUNDANCY + threadOffset]);
844 __attribute__((reqd_work_group_size(256, 1, 1)))
846 void kernel_HistogramRectAllChannelsReduction(
848 __global
uint *histBuffer,
849 __global
int* histResult) {
852 int channel = get_group_id(0)/HIST_SIZE;
853 int bin = get_group_id(0)%HIST_SIZE;
857 for (
uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
858 value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
862 __local
int localHist[GROUP_SIZE];
863 localHist[get_local_id(0)] = value;
864 barrier(CLK_LOCAL_MEM_FENCE);
865 for (
int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
866 if (get_local_id(0) < stride) {
867 value = localHist[ get_local_id(0)+stride];
869 barrier(CLK_LOCAL_MEM_FENCE);
870 if (get_local_id(0) < stride) {
871 localHist[ get_local_id(0)] += value;
873 barrier(CLK_LOCAL_MEM_FENCE);
877 if (get_local_id(0) == 0) {
878 histResult[get_group_id(0)] = localHist[0];
886 __attribute__((reqd_work_group_size(256, 1, 1)))
888 void kernel_HistogramRectOneChannelReduction(
890 __global
uint *histBuffer,
891 __global
int* histResult) {
895 int bin = get_group_id(0)%HIST_SIZE;
899 for (
int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
900 value += histBuffer[ bin*HIST_REDUNDANCY+i];
904 __local
int localHist[GROUP_SIZE];
905 localHist[get_local_id(0)] = value;
906 barrier(CLK_LOCAL_MEM_FENCE);
907 for (
int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
908 if (get_local_id(0) < stride) {
909 value = localHist[ get_local_id(0)+stride];
911 barrier(CLK_LOCAL_MEM_FENCE);
912 if (get_local_id(0) < stride) {
913 localHist[ get_local_id(0)] += value;
915 barrier(CLK_LOCAL_MEM_FENCE);
919 if (get_local_id(0) == 0) {
920 histResult[get_group_id(0)] = localHist[0];
930 \n#define CHAR_VEC_WIDTH 4 \n
931 \n#define PIXELS_PER_WORD 32 \n
932 \n#define PIXELS_PER_BURST 8 \n
933 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
935 uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
936 uchar4 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
939 __attribute__((reqd_work_group_size(256, 1, 1)))
941 void kernel_ThresholdRectToPix(
942 __global const uchar4 *imageData,
946 __global
int *thresholds,
947 __global
int *hi_values,
951 int pThresholds[NUM_CHANNELS];
952 int pHi_Values[NUM_CHANNELS];
953 for (
int i = 0; i < NUM_CHANNELS; i++) {
954 pThresholds[i] = thresholds[i];
955 pHi_Values[i] = hi_values[i];
959 for (
uint w = get_global_id(0); w < wpl*height; w += get_global_size(0)) {
960 unsigned int word = 0;
962 for (
int b = 0; b < BURSTS_PER_WORD; b++) {
965 int offset = (w / wpl) * width;
966 offset += (w % wpl) * PIXELS_PER_WORD;
967 offset += b * PIXELS_PER_BURST;
969 for (
int i = 0; i < PIXELS_PER_BURST; ++i)
970 pixels.v[i] = imageData[offset + i];
973 for (
int p = 0; p < PIXELS_PER_BURST; p++) {
974 for (
int c = 0; c < NUM_CHANNELS; c++) {
975 unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
976 if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
977 const uint kTopBit = 0x80000000;
978 word |= (kTopBit >> ((b*PIXELS_PER_BURST+p)&31));
987 \n#define CHAR_VEC_WIDTH 8 \n
988 \n#define PIXELS_PER_WORD 32 \n
989 \n#define PIXELS_PER_BURST 8 \n
990 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
992 uchar s[PIXELS_PER_BURST*1];
993 uchar8 v[(PIXELS_PER_BURST*1)/CHAR_VEC_WIDTH];
996 __attribute__((reqd_work_group_size(256, 1, 1)))
998 void kernel_ThresholdRectToPix_OneChan(
999 __global const uchar8 *imageData,
1003 __global
int *thresholds,
1004 __global
int *hi_values,
1005 __global
int *pix) {
1010 for (
int i = 0; i < 1; i++) {
1011 pThresholds[i] = thresholds[i];
1012 pHi_Values[i] = hi_values[i];
1016 for (
uint w = get_global_id(0); w < wpl*height; w += get_global_size(0)) {
1017 unsigned int word = 0;
1020 for (
int b = 0; b < BURSTS_PER_WORD; b++) {
1025 pixels.v[0] = imageData[
1031 for (
int p = 0; p < PIXELS_PER_BURST; p++) {
1036 \n#ifdef __ENDIAN_LITTLE__\n
1041 unsigned char pixChan = pixels.s[idx];
1042 if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
1043 const uint kTopBit = 0x80000000;
1044 word |= (kTopBit >> ((b*PIXELS_PER_BURST+p)&31));
1055 #endif // USE_EXTERNAL_KERNEL
1056 #endif // TESSERACT_OPENCL_OCLKERNELS_H_