13 #include <sys/types.h>
27 #if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || \
28 defined(__CYGWIN__) || defined(__MINGW32__)
31 #elif defined(__linux__)
34 #elif defined(__APPLE__)
43 #include <mach/mach_time.h>
54 GPUEnv OpenclDevice::gpuEnv;
56 bool OpenclDevice::deviceIsSelected =
false;
57 ds_device OpenclDevice::selectedDevice;
59 int OpenclDevice::isInited = 0;
61 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
63 static const l_uint32 lmask32[] = {
64 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
65 0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
66 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
67 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
68 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
69 0xfffffffe, 0xffffffff};
71 static const l_uint32 rmask32[] = {
72 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
73 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
74 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
75 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
76 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
77 0x7fffffff, 0xffffffff};
79 static cl_mem pixsCLBuffer, pixdCLBuffer,
81 static cl_mem pixThBuffer;
82 static cl_int clStatus;
83 static KernelEnv rEnv;
85 #define DS_TAG_VERSION "<version>"
86 #define DS_TAG_VERSION_END "</version>"
87 #define DS_TAG_DEVICE "<device>"
88 #define DS_TAG_DEVICE_END "</device>"
89 #define DS_TAG_SCORE "<score>"
90 #define DS_TAG_SCORE_END "</score>"
91 #define DS_TAG_DEVICE_TYPE "<type>"
92 #define DS_TAG_DEVICE_TYPE_END "</type>"
93 #define DS_TAG_DEVICE_NAME "<name>"
94 #define DS_TAG_DEVICE_NAME_END "</name>"
95 #define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
96 #define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
98 #define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
100 #define DS_DEVICE_NAME_LENGTH 256
102 enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
105 std::vector<ds_device> devices;
106 unsigned int numDevices;
112 DS_INVALID_PROFILE = 1000,
114 DS_INVALID_PERF_EVALUATOR_TYPE,
115 DS_INVALID_PERF_EVALUATOR,
116 DS_PERF_EVALUATOR_ERROR,
118 DS_UNKNOWN_DEVICE_TYPE,
119 DS_PROFILE_FILE_ERROR,
120 DS_SCORE_SERIALIZER_ERROR,
121 DS_SCORE_DESERIALIZER_ERROR
128 typedef ds_status (*ds_perf_evaluator)(ds_device* device,
void* data);
131 typedef ds_status (*ds_score_release)(TessDeviceScore* score);
133 static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
134 ds_status status = DS_SUCCESS;
135 if (profile !=
nullptr) {
138 for (i = 0; i < profile->numDevices; i++) {
139 free(profile->devices[i].oclDeviceName);
140 free(profile->devices[i].oclDriverVersion);
141 status = sr(profile->devices[i].score);
142 if (status != DS_SUCCESS)
break;
150 static ds_status initDSProfile(ds_profile** p,
const char* version) {
152 cl_uint numPlatforms;
153 std::vector<cl_platform_id> platforms;
154 std::vector <cl_device_id> devices;
155 ds_status status = DS_SUCCESS;
159 if (p ==
nullptr)
return DS_INVALID_PROFILE;
161 ds_profile* profile =
new ds_profile;
163 memset(profile, 0,
sizeof(ds_profile));
165 clGetPlatformIDs(0,
nullptr, &numPlatforms);
167 if (numPlatforms > 0) {
168 platforms.reserve(numPlatforms);
169 clGetPlatformIDs(numPlatforms, &platforms[0],
nullptr);
173 for (i = 0; i < numPlatforms; i++) {
175 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0,
nullptr, &num);
179 if (numDevices > 0) {
180 devices.reserve(numDevices);
183 profile->numDevices =
185 profile->devices.reserve(profile->numDevices);
186 memset(&profile->devices[0], 0, profile->numDevices *
sizeof(ds_device));
189 for (i = 0; i < numPlatforms; i++) {
192 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
193 for (j = 0; j < num; j++, next++) {
194 char buffer[DS_DEVICE_NAME_LENGTH];
197 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
198 profile->devices[next].oclDeviceID = devices[j];
200 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
201 DS_DEVICE_NAME_LENGTH, &buffer,
nullptr);
202 length = strlen(buffer);
203 profile->devices[next].oclDeviceName = (
char*)malloc(length + 1);
204 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
206 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
207 DS_DEVICE_NAME_LENGTH, &buffer,
nullptr);
208 length = strlen(buffer);
209 profile->devices[next].oclDriverVersion = (
char*)malloc(length + 1);
210 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
213 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
214 profile->version = version;
220 static ds_status profileDevices(ds_profile* profile,
221 const ds_evaluation_type
type,
222 ds_perf_evaluator evaluator,
223 void* evaluatorData,
unsigned int* numUpdates) {
224 ds_status status = DS_SUCCESS;
226 unsigned int updates = 0;
228 if (profile ==
nullptr) {
229 return DS_INVALID_PROFILE;
231 if (evaluator ==
nullptr) {
232 return DS_INVALID_PERF_EVALUATOR;
235 for (i = 0; i < profile->numDevices; i++) {
236 ds_status evaluatorStatus;
239 case DS_EVALUATE_NEW_ONLY:
240 if (profile->devices[i].score !=
nullptr)
break;
242 case DS_EVALUATE_ALL:
243 evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
244 if (evaluatorStatus != DS_SUCCESS) {
245 status = evaluatorStatus;
251 return DS_INVALID_PERF_EVALUATOR_TYPE;
255 if (numUpdates) *numUpdates = updates;
259 static const char* findString(
const char* contentStart,
const char* contentEnd,
260 const char*
string) {
262 const char* currentPosition;
263 const char* found =
nullptr;
264 stringLength = strlen(
string);
265 currentPosition = contentStart;
266 for (currentPosition = contentStart; currentPosition < contentEnd;
268 if (*currentPosition ==
string[0]) {
269 if (currentPosition + stringLength < contentEnd) {
270 if (strncmp(currentPosition,
string, stringLength) == 0) {
271 found = currentPosition;
280 static ds_status readProFile(
const char* fileName,
char** content,
281 size_t* contentSize) {
284 ds_status status = DS_SUCCESS;
285 FILE* input = fopen(fileName,
"rb");
286 if (input ==
nullptr) {
287 status = DS_FILE_ERROR;
289 fseek(input, 0L, SEEK_END);
290 auto pos = std::ftell(input);
294 char *binary =
new char[size];
295 if (fread(binary,
sizeof(
char), size, input) != size) {
296 status = DS_FILE_ERROR;
308 typedef ds_status (*ds_score_deserializer)(ds_device* device,
309 const uint8_t* serializedScore,
310 unsigned int serializedScoreSize);
312 static ds_status readProfileFromFile(ds_profile* profile,
313 ds_score_deserializer deserializer,
315 ds_status status = DS_SUCCESS;
319 if (profile ==
nullptr)
return DS_INVALID_PROFILE;
321 status = readProFile(
file, &contentStart, &contentSize);
322 if (status == DS_SUCCESS) {
323 const char* currentPosition;
324 const char* dataStart;
327 const char* contentEnd = contentStart + contentSize;
328 currentPosition = contentStart;
331 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
332 if (dataStart ==
nullptr) {
333 status = DS_PROFILE_FILE_ERROR;
336 dataStart += strlen(DS_TAG_VERSION);
338 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
339 if (dataEnd ==
nullptr) {
340 status = DS_PROFILE_FILE_ERROR;
344 size_t versionStringLength = strlen(profile->version);
345 if (versionStringLength + dataStart != dataEnd ||
346 strncmp(profile->version, dataStart, versionStringLength) != 0) {
348 status = DS_PROFILE_FILE_ERROR;
351 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
357 const char* deviceTypeStart;
358 const char* deviceTypeEnd;
359 ds_device_type deviceType;
361 const char* deviceNameStart;
362 const char* deviceNameEnd;
364 const char* deviceScoreStart;
365 const char* deviceScoreEnd;
367 const char* deviceDriverStart;
368 const char* deviceDriverEnd;
370 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
371 if (dataStart ==
nullptr) {
375 dataStart += strlen(DS_TAG_DEVICE);
376 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
377 if (dataEnd ==
nullptr) {
378 status = DS_PROFILE_FILE_ERROR;
383 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
384 if (deviceTypeStart ==
nullptr) {
385 status = DS_PROFILE_FILE_ERROR;
388 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
390 findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
391 if (deviceTypeEnd ==
nullptr) {
392 status = DS_PROFILE_FILE_ERROR;
395 memcpy(&deviceType, deviceTypeStart,
sizeof(ds_device_type));
398 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
399 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
400 if (deviceNameStart ==
nullptr) {
401 status = DS_PROFILE_FILE_ERROR;
404 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
406 findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
407 if (deviceNameEnd ==
nullptr) {
408 status = DS_PROFILE_FILE_ERROR;
413 findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
414 if (deviceDriverStart ==
nullptr) {
415 status = DS_PROFILE_FILE_ERROR;
418 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
419 deviceDriverEnd = findString(deviceDriverStart, contentEnd,
420 DS_TAG_DEVICE_DRIVER_VERSION_END);
421 if (deviceDriverEnd ==
nullptr) {
422 status = DS_PROFILE_FILE_ERROR;
427 for (i = 0; i < profile->numDevices; i++) {
428 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
429 size_t actualDeviceNameLength;
430 size_t driverVersionLength;
432 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
433 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
434 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
435 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
436 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
437 actualDeviceNameLength) == 0 &&
438 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
439 driverVersionLength) == 0) {
441 findString(dataStart, contentEnd, DS_TAG_SCORE);
442 deviceScoreStart += strlen(DS_TAG_SCORE);
444 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
445 status = deserializer(&profile->devices[i],
446 (
const unsigned char*)deviceScoreStart,
447 deviceScoreEnd - deviceScoreStart);
448 if (status != DS_SUCCESS) {
454 }
else if (deviceType == DS_DEVICE_NATIVE_CPU) {
455 for (i = 0; i < profile->numDevices; i++) {
456 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
457 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
458 if (deviceScoreStart ==
nullptr) {
459 status = DS_PROFILE_FILE_ERROR;
462 deviceScoreStart += strlen(DS_TAG_SCORE);
464 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
465 status = deserializer(&profile->devices[i],
466 (
const unsigned char*)deviceScoreStart,
467 deviceScoreEnd - deviceScoreStart);
468 if (status != DS_SUCCESS) {
476 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
480 delete[] contentStart;
484 typedef ds_status (*ds_score_serializer)(ds_device* device,
485 uint8_t** serializedScore,
486 unsigned int* serializedScoreSize);
487 static ds_status writeProfileToFile(ds_profile* profile,
488 ds_score_serializer serializer,
490 ds_status status = DS_SUCCESS;
492 if (profile ==
nullptr)
return DS_INVALID_PROFILE;
494 FILE* profileFile = fopen(
file,
"wb");
495 if (profileFile ==
nullptr) {
496 status = DS_FILE_ERROR;
501 fwrite(DS_TAG_VERSION,
sizeof(
char), strlen(DS_TAG_VERSION), profileFile);
502 fwrite(profile->version,
sizeof(
char), strlen(profile->version),
504 fwrite(DS_TAG_VERSION_END,
sizeof(
char), strlen(DS_TAG_VERSION_END),
506 fwrite(
"\n",
sizeof(
char), 1, profileFile);
508 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
509 uint8_t* serializedScore;
510 unsigned int serializedScoreSize;
512 fwrite(DS_TAG_DEVICE,
sizeof(
char), strlen(DS_TAG_DEVICE), profileFile);
514 fwrite(DS_TAG_DEVICE_TYPE,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE),
516 fwrite(&profile->devices[i].type,
sizeof(ds_device_type), 1, profileFile);
517 fwrite(DS_TAG_DEVICE_TYPE_END,
sizeof(
char),
518 strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
520 switch (profile->devices[i].type) {
521 case DS_DEVICE_NATIVE_CPU: {
532 case DS_DEVICE_OPENCL_DEVICE: {
533 fwrite(DS_TAG_DEVICE_NAME,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME),
535 fwrite(profile->devices[i].oclDeviceName,
sizeof(
char),
536 strlen(profile->devices[i].oclDeviceName), profileFile);
537 fwrite(DS_TAG_DEVICE_NAME_END,
sizeof(
char),
538 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
540 fwrite(DS_TAG_DEVICE_DRIVER_VERSION,
sizeof(
char),
541 strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
542 fwrite(profile->devices[i].oclDriverVersion,
sizeof(
char),
543 strlen(profile->devices[i].oclDriverVersion), profileFile);
544 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END,
sizeof(
char),
545 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
548 status = DS_UNKNOWN_DEVICE_TYPE;
552 fwrite(DS_TAG_SCORE,
sizeof(
char), strlen(DS_TAG_SCORE), profileFile);
553 status = serializer(&profile->devices[i], &serializedScore,
554 &serializedScoreSize);
555 if (status == DS_SUCCESS && serializedScore !=
nullptr &&
556 serializedScoreSize > 0) {
557 fwrite(serializedScore,
sizeof(
char), serializedScoreSize, profileFile);
558 delete[] serializedScore;
560 fwrite(DS_TAG_SCORE_END,
sizeof(
char), strlen(DS_TAG_SCORE_END),
562 fwrite(DS_TAG_DEVICE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_END),
564 fwrite(
"\n",
sizeof(
char), 1, profileFile);
572 static void legalizeFileName(
char* fileName) {
574 const char* invalidChars =
577 for (
unsigned i = 0; i < strlen(invalidChars); i++) {
579 invalidStr[0] = invalidChars[i];
580 invalidStr[1] =
'\0';
586 for (
char* pos = strstr(fileName, invalidStr); pos !=
nullptr;
587 pos = strstr(pos + 1, invalidStr)) {
595 static void populateGPUEnvFromDevice(GPUEnv* gpuInfo, cl_device_id device) {
598 gpuInfo->mnIsUserCreated = 1;
600 gpuInfo->mpDevID = device;
601 gpuInfo->mpArryDevsID =
new cl_device_id[1];
602 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
603 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
604 sizeof(cl_device_type), &gpuInfo->mDevType, &size);
605 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(TYPE)");
608 clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
609 sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
610 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(PLATFORM)");
612 cl_context_properties props[3];
613 props[0] = CL_CONTEXT_PLATFORM;
614 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
617 clCreateContext(props, 1, &gpuInfo->mpDevID,
nullptr,
nullptr, &clStatus);
618 CHECK_OPENCL(clStatus,
"populateGPUEnv::createContext");
620 cl_command_queue_properties queueProperties = 0;
621 gpuInfo->mpCmdQueue = clCreateCommandQueue(
622 gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
623 CHECK_OPENCL(clStatus,
"populateGPUEnv::createCommandQueue");
626 int OpenclDevice::LoadOpencl() {
628 HINSTANCE HOpenclDll =
nullptr;
629 void* OpenclDll =
nullptr;
631 OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
632 OpenclDll = LoadLibrary(
"openCL.dll");
633 if (!static_cast<HINSTANCE>(OpenclDll)) {
634 fprintf(stderr,
"[OD] Load opencl.dll failed!\n");
635 FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
638 fprintf(stderr,
"[OD] Load opencl.dll successful!\n");
642 int OpenclDevice::SetKernelEnv(KernelEnv* envInfo) {
643 envInfo->mpkContext = gpuEnv.mpContext;
644 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
645 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
650 static cl_mem allocateZeroCopyBuffer(
const KernelEnv& rEnv,
651 l_uint32* hostbuffer,
size_t nElements,
652 cl_mem_flags flags, cl_int* pStatus) {
654 clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
655 nElements *
sizeof(l_uint32), hostbuffer, pStatus);
660 static Pix* mapOutputCLBuffer(
const KernelEnv& rEnv, cl_mem clbuffer, Pix* pixd,
661 Pix* pixs,
int elements, cl_mem_flags flags,
662 bool memcopy =
false,
bool sync =
true) {
665 if ((pixd = pixCreateTemplate(pixs)) ==
nullptr)
668 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
669 pixGetDepth(pixs))) ==
nullptr)
673 l_uint32* pValues = (l_uint32*)clEnqueueMapBuffer(
674 rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
675 elements *
sizeof(l_uint32), 0,
nullptr,
nullptr,
nullptr);
678 memcpy(pixGetData(pixd), pValues, elements *
sizeof(l_uint32));
680 pixSetData(pixd, pValues);
683 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0,
nullptr,
687 clFinish(rEnv.mpkCmdQueue);
693 void OpenclDevice::releaseMorphCLBuffers() {
694 if (pixdCLIntermediate !=
nullptr) clReleaseMemObject(pixdCLIntermediate);
695 if (pixsCLBuffer !=
nullptr) clReleaseMemObject(pixsCLBuffer);
696 if (pixdCLBuffer !=
nullptr) clReleaseMemObject(pixdCLBuffer);
697 if (pixThBuffer !=
nullptr) clReleaseMemObject(pixThBuffer);
698 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer =
nullptr;
701 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs) {
704 if (pixThBuffer !=
nullptr) {
705 pixsCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
706 CL_MEM_ALLOC_HOST_PTR, &clStatus);
710 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
711 sizeof(l_uint32) * wpl * h, 0,
nullptr,
nullptr);
715 reinterpret_cast<l_uint32*>(malloc(wpl * h *
sizeof(l_uint32)));
716 memcpy(srcdata, pixGetData(pixs), wpl * h *
sizeof(l_uint32));
718 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h,
719 CL_MEM_USE_HOST_PTR, &clStatus);
722 pixdCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
723 CL_MEM_ALLOC_HOST_PTR, &clStatus);
725 pixdCLIntermediate = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
726 CL_MEM_ALLOC_HOST_PTR, &clStatus);
728 return (
int)clStatus;
731 int OpenclDevice::InitEnv() {
735 if (1 == LoadOpencl())
break;
740 InitOpenclRunEnv_DeviceSelection(0);
744 int OpenclDevice::ReleaseOpenclRunEnv() {
745 ReleaseOpenclEnv(&gpuEnv);
752 inline int OpenclDevice::AddKernelConfig(
int kCount,
const char* kName) {
754 ASSERT_HOST(strlen(kName) <
sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
755 strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
756 gpuEnv.mnKernelCount++;
760 int OpenclDevice::RegistOpenclKernel() {
761 if (!gpuEnv.mnIsUserCreated) memset(&gpuEnv, 0,
sizeof(gpuEnv));
763 gpuEnv.mnFileCount = 0;
764 gpuEnv.mnKernelCount = 0UL;
766 AddKernelConfig(1,
"oclAverageSub1");
770 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(
int argc) {
773 ds_device bestDevice_DS = getDeviceSelection();
774 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
776 if (selectedDeviceIsOpenCL()) {
779 populateGPUEnvFromDevice(&gpuEnv, bestDevice);
780 gpuEnv.mnFileCount = 0;
781 gpuEnv.mnKernelCount = 0UL;
782 CompileKernelFile(&gpuEnv,
"");
792 OpenclDevice::OpenclDevice() {
796 OpenclDevice::~OpenclDevice() {
800 int OpenclDevice::ReleaseOpenclEnv(GPUEnv* gpuInfo) {
808 for (i = 0; i < gpuEnv.mnFileCount; i++) {
809 if (gpuEnv.mpArryPrograms[i]) {
810 clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
811 CHECK_OPENCL(clStatus,
"clReleaseProgram");
812 gpuEnv.mpArryPrograms[i] =
nullptr;
815 if (gpuEnv.mpCmdQueue) {
816 clReleaseCommandQueue(gpuEnv.mpCmdQueue);
817 gpuEnv.mpCmdQueue =
nullptr;
819 if (gpuEnv.mpContext) {
820 clReleaseContext(gpuEnv.mpContext);
821 gpuEnv.mpContext =
nullptr;
824 gpuInfo->mnIsUserCreated = 0;
825 delete[] gpuInfo->mpArryDevsID;
828 int OpenclDevice::BinaryGenerated(
const char* clFileName, FILE** fhandle) {
833 char fileName[256] = {0}, cl_name[128] = {0};
834 char deviceName[1024];
835 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
836 sizeof(deviceName), deviceName,
nullptr);
837 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
838 const char* str = strstr(clFileName,
".cl");
839 memcpy(cl_name, clFileName, str - clFileName);
840 cl_name[str - clFileName] =
'\0';
841 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
842 legalizeFileName(fileName);
843 fd = fopen(fileName,
"rb");
844 status = (fd !=
nullptr) ? 1 : 0;
850 int OpenclDevice::CachedOfKernerPrg(
const GPUEnv* gpuEnvCached,
851 const char* clFileName) {
853 for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
854 if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
855 if (gpuEnvCached->mpArryPrograms[i] !=
nullptr) {
863 int OpenclDevice::WriteBinaryToFile(
const char* fileName,
const char* birary,
865 FILE* output =
nullptr;
866 output = fopen(fileName,
"wb");
867 if (output ==
nullptr) {
871 fwrite(birary,
sizeof(
char), numBytes, output);
877 int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
878 const char* clFileName) {
883 clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
884 sizeof(numDevices), &numDevices,
nullptr);
885 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
887 std::vector<cl_device_id> mpArryDevsID(numDevices);
890 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
891 sizeof(cl_device_id) * numDevices,
894 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
897 std::vector<size_t> binarySizes(numDevices);
900 clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
901 sizeof(
size_t) * numDevices, &binarySizes[0],
nullptr);
902 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
905 std::vector<char*> binaries(numDevices);
907 for (i = 0; i < numDevices; i++) {
908 if (binarySizes[i] != 0) {
909 binaries[i] =
new char[binarySizes[i]];
911 binaries[i] =
nullptr;
916 clGetProgramInfo(program, CL_PROGRAM_BINARIES,
sizeof(
char*) * numDevices,
917 &binaries[0],
nullptr);
918 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
921 for (i = 0; i < numDevices; i++) {
922 char fileName[256] = {0}, cl_name[128] = {0};
924 if (binarySizes[i] != 0) {
925 char deviceName[1024];
926 clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
927 sizeof(deviceName), deviceName,
nullptr);
928 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
930 const char* str = strstr(clFileName,
".cl");
931 memcpy(cl_name, clFileName, str - clFileName);
932 cl_name[str - clFileName] =
'\0';
933 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
934 legalizeFileName(fileName);
935 if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
936 tprintf(
"[OD] write binary[%s] failed\n", fileName);
939 tprintf(
"[OD] write binary[%s] successfully\n", fileName);
944 for (i = 0; i < numDevices; i++) {
945 delete[] binaries[i];
951 int OpenclDevice::CompileKernelFile(GPUEnv* gpuInfo,
const char* buildOption) {
954 size_t source_size[1];
955 int binary_status, binaryExisted, idx;
958 const char* filename =
"kernel.cl";
960 if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
964 idx = gpuInfo->mnFileCount;
968 source_size[0] = strlen(source);
970 binaryExisted = BinaryGenerated(
972 if (binaryExisted == 1) {
973 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
974 sizeof(numDevices), &numDevices,
nullptr);
975 CHECK_OPENCL(clStatus,
"clGetContextInfo");
977 std::vector<cl_device_id> mpArryDevsID(numDevices);
978 bool b_error = fseek(fd, 0, SEEK_END) < 0;
979 auto pos = std::ftell(fd);
980 b_error |= (pos <= 0);
982 b_error |= fseek(fd, 0, SEEK_SET) < 0;
988 std::vector<uint8_t> binary(length + 2);
990 memset(&binary[0], 0, length + 2);
991 b_error |= fread(&binary[0], 1, length, fd) != length;
996 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
997 sizeof(cl_device_id) * numDevices,
998 &mpArryDevsID[0],
nullptr);
999 CHECK_OPENCL(clStatus,
"clGetContextInfo");
1001 const uint8_t* c_binary = &binary[0];
1002 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary(
1003 gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length, &c_binary,
1004 &binary_status, &clStatus);
1005 CHECK_OPENCL(clStatus,
"clCreateProgramWithBinary");
1009 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource(
1010 gpuInfo->mpContext, 1, &source, source_size, &clStatus);
1011 CHECK_OPENCL(clStatus,
"clCreateProgramWithSource");
1014 if (gpuInfo->mpArryPrograms[idx] == (cl_program)
nullptr) {
1021 if (!gpuInfo->mnIsUserCreated) {
1023 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1024 buildOption,
nullptr,
nullptr);
1027 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1028 buildOption,
nullptr,
nullptr);
1030 if (clStatus != CL_SUCCESS) {
1031 tprintf(
"BuildProgram error!\n");
1033 if (!gpuInfo->mnIsUserCreated) {
1034 clStatus = clGetProgramBuildInfo(
1035 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1036 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1039 clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1040 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1042 if (clStatus != CL_SUCCESS) {
1043 tprintf(
"opencl create build log fail\n");
1046 std::vector<char> buildLog(length);
1047 if (!gpuInfo->mnIsUserCreated) {
1048 clStatus = clGetProgramBuildInfo(
1049 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1050 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1052 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
1053 gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG,
1054 length, &buildLog[0], &length);
1056 if (clStatus != CL_SUCCESS) {
1057 tprintf(
"opencl program build info fail\n");
1061 fd1 = fopen(
"kernel-build.log",
"w+");
1062 if (fd1 !=
nullptr) {
1063 fwrite(&buildLog[0],
sizeof(
char), length, fd1);
1070 strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1071 if (binaryExisted == 0) {
1072 GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1075 gpuInfo->mnFileCount += 1;
1079 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w,
1080 l_int32 h, l_int32 wpl,
1084 size_t globalThreads[2];
1085 size_t localThreads[2];
1091 gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1092 globalThreads[0] = gsize;
1093 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1094 globalThreads[1] = gsize;
1095 localThreads[0] = GROUPSIZE_X;
1096 localThreads[1] = GROUPSIZE_Y;
1098 SetKernelEnv(&rEnv);
1100 l_uint32* pResult = (l_uint32*)malloc(w * h *
sizeof(l_uint32));
1102 clCreateKernel(rEnv.mpkProgram,
"composeRGBPixel", &clStatus);
1103 CHECK_OPENCL(clStatus,
"clCreateKernel composeRGBPixel");
1106 valuesCl = allocateZeroCopyBuffer(
1107 rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1108 outputCl = allocateZeroCopyBuffer(
1109 rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1112 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &valuesCl);
1113 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1114 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(w), &w);
1115 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1116 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(h), &h);
1117 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1118 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1119 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1120 clStatus = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &outputCl);
1121 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1125 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1126 globalThreads, localThreads, 0,
nullptr,
nullptr);
1127 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel");
1130 void* ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE,
1131 CL_MAP_READ, 0, w * h *
sizeof(l_uint32), 0,
1132 nullptr,
nullptr, &clStatus);
1133 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer outputCl");
1134 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0,
nullptr,
nullptr);
1137 clFinish(rEnv.mpkCmdQueue);
1143 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1144 size_t globalThreads[2];
1148 size_t localThreads[2];
1151 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1152 globalThreads[0] = gsize;
1153 globalThreads[1] = GROUPSIZE_HMORY;
1154 localThreads[0] = GROUPSIZE_HMORX;
1155 localThreads[1] = GROUPSIZE_HMORY;
1158 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_5x5", &status);
1159 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_5x5");
1161 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1162 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1163 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1164 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1167 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1168 globalThreads, localThreads, 0,
nullptr,
nullptr);
1171 pixtemp = pixsCLBuffer;
1172 pixsCLBuffer = pixdCLBuffer;
1173 pixdCLBuffer = pixtemp;
1176 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1177 globalThreads[0] = gsize;
1178 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1179 globalThreads[1] = gsize;
1180 localThreads[0] = GROUPSIZE_X;
1181 localThreads[1] = GROUPSIZE_Y;
1184 clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer_5x5", &status);
1185 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer_5x5");
1187 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1188 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1189 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1190 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1192 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1193 globalThreads, localThreads, 0,
nullptr,
nullptr);
1200 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1201 size_t globalThreads[2];
1205 l_uint32 fwmask, lwmask;
1206 size_t localThreads[2];
1208 lwmask = lmask32[31 - 2];
1209 fwmask = rmask32[31 - 2];
1212 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1213 globalThreads[0] = gsize;
1214 globalThreads[1] = GROUPSIZE_HMORY;
1215 localThreads[0] = GROUPSIZE_HMORX;
1216 localThreads[1] = GROUPSIZE_HMORY;
1219 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_5x5", &status);
1220 CHECK_OPENCL(status,
"clCreateKernel morphoErodeHor_5x5");
1222 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1223 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1224 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1225 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1228 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1229 globalThreads, localThreads, 0,
nullptr,
nullptr);
1232 pixtemp = pixsCLBuffer;
1233 pixsCLBuffer = pixdCLBuffer;
1234 pixdCLBuffer = pixtemp;
1237 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1238 globalThreads[0] = gsize;
1239 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1240 globalThreads[1] = gsize;
1241 localThreads[0] = GROUPSIZE_X;
1242 localThreads[1] = GROUPSIZE_Y;
1245 clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer_5x5", &status);
1246 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer_5x5");
1248 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1249 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1250 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1251 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1252 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(fwmask), &fwmask);
1253 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(lwmask), &lwmask);
1255 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1256 globalThreads, localThreads, 0,
nullptr,
nullptr);
1262 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl,
1264 l_int32 xp, yp, xn, yn;
1266 size_t globalThreads[2];
1270 size_t localThreads[2];
1273 OpenclDevice::SetKernelEnv(&rEnv);
1275 if (hsize == 5 && vsize == 5) {
1277 status = pixDilateCL_55(wpl, h);
1281 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1283 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1286 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1287 globalThreads[0] = gsize;
1288 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1289 globalThreads[1] = gsize;
1290 localThreads[0] = GROUPSIZE_X;
1291 localThreads[1] = GROUPSIZE_Y;
1293 if (xp > 31 || xn > 31) {
1296 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor", &status);
1297 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor");
1299 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1300 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1301 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1302 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1303 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1304 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1305 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1306 nullptr, globalThreads, localThreads, 0,
1309 if (yp > 0 || yn > 0) {
1310 pixtemp = pixsCLBuffer;
1311 pixsCLBuffer = pixdCLBuffer;
1312 pixdCLBuffer = pixtemp;
1314 }
else if (xp > 0 || xn > 0) {
1317 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_32word", &status);
1318 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_32word");
1319 isEven = (xp != xn);
1321 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1322 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1323 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1324 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1325 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1326 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isEven), &isEven);
1327 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1328 nullptr, globalThreads, localThreads, 0,
1331 if (yp > 0 || yn > 0) {
1332 pixtemp = pixsCLBuffer;
1333 pixsCLBuffer = pixdCLBuffer;
1334 pixdCLBuffer = pixtemp;
1338 if (yp > 0 || yn > 0) {
1340 clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer", &status);
1341 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer");
1343 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1344 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1345 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1346 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1347 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1348 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(yn), &yn);
1349 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1350 nullptr, globalThreads, localThreads, 0,
1358 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl,
1360 l_int32 xp, yp, xn, yn;
1362 size_t globalThreads[2];
1363 size_t localThreads[2];
1367 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1368 l_uint32 rwmask, lwmask;
1371 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1373 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1375 OpenclDevice::SetKernelEnv(&rEnv);
1377 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1379 status = pixErodeCL_55(wpl, h);
1383 lwmask = lmask32[31 - (xn & 31)];
1384 rwmask = rmask32[31 - (xp & 31)];
1387 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1388 globalThreads[0] = gsize;
1389 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1390 globalThreads[1] = gsize;
1391 localThreads[0] = GROUPSIZE_X;
1392 localThreads[1] = GROUPSIZE_Y;
1395 if (xp > 31 || xn > 31) {
1397 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor", &status);
1399 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1400 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1401 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1402 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1403 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1404 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1406 clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(isAsymmetric), &isAsymmetric);
1407 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(rwmask), &rwmask);
1408 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(lwmask), &lwmask);
1409 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1410 nullptr, globalThreads, localThreads, 0,
1413 if (yp > 0 || yn > 0) {
1414 pixtemp = pixsCLBuffer;
1415 pixsCLBuffer = pixdCLBuffer;
1416 pixdCLBuffer = pixtemp;
1418 }
else if (xp > 0 || xn > 0) {
1420 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_32word", &status);
1421 isEven = (xp != xn);
1423 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1424 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1425 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1426 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1427 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1429 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1430 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(rwmask), &rwmask);
1431 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(lwmask), &lwmask);
1432 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(isEven), &isEven);
1433 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1434 nullptr, globalThreads, localThreads, 0,
1437 if (yp > 0 || yn > 0) {
1438 pixtemp = pixsCLBuffer;
1439 pixsCLBuffer = pixdCLBuffer;
1440 pixdCLBuffer = pixtemp;
1445 if (yp > 0 || yn > 0) {
1446 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer", &status);
1447 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer");
1449 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1450 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1451 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1452 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1453 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1455 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1456 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(yn), &yn);
1457 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1458 nullptr, globalThreads, localThreads, 0,
1466 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1471 status = pixErodeCL(hsize, vsize, wpl, h);
1473 pixtemp = pixsCLBuffer;
1474 pixsCLBuffer = pixdCLBuffer;
1475 pixdCLBuffer = pixtemp;
1477 status = pixDilateCL(hsize, vsize, wpl, h);
1483 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1488 status = pixDilateCL(hsize, vsize, wpl, h);
1490 pixtemp = pixsCLBuffer;
1491 pixsCLBuffer = pixdCLBuffer;
1492 pixdCLBuffer = pixtemp;
1494 status = pixErodeCL(hsize, vsize, wpl, h);
1500 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1503 size_t globalThreads[2];
1505 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1507 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1508 globalThreads[0] = gsize;
1509 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1510 globalThreads[1] = gsize;
1513 clCreateKernel(rEnv.mpkProgram,
"pixSubtract_inplace", &status);
1514 CHECK_OPENCL(status,
"clCreateKernel pixSubtract_inplace");
1517 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &buffer1);
1518 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &buffer2);
1519 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1520 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1522 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1523 globalThreads, localThreads, 0,
nullptr,
nullptr);
1531 void OpenclDevice::pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline,
1532 Pix** pix_hline, Pix** pixClosed,
1533 bool getpixClosed, l_int32 close_hsize,
1534 l_int32 close_vsize, l_int32 open_hsize,
1535 l_int32 open_vsize, l_int32 line_hsize,
1536 l_int32 line_vsize) {
1540 wpl = pixGetWpl(pixs);
1541 h = pixGetHeight(pixs);
1544 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1548 *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1549 wpl * h, CL_MAP_READ,
true,
false);
1555 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1556 0,
sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1559 pixtemp = pixsCLBuffer;
1560 pixsCLBuffer = pixdCLBuffer;
1561 pixdCLBuffer = pixtemp;
1563 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1566 pixtemp = pixsCLBuffer;
1567 pixsCLBuffer = pixdCLBuffer;
1568 pixdCLBuffer = pixdCLIntermediate;
1569 pixdCLIntermediate = pixtemp;
1571 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1576 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1577 0,
sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1579 pixtemp = pixsCLBuffer;
1580 pixsCLBuffer = pixdCLBuffer;
1581 pixdCLBuffer = pixtemp;
1585 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1588 *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1589 CL_MAP_READ,
true,
false);
1591 pixtemp = pixsCLBuffer;
1592 pixsCLBuffer = pixdCLIntermediate;
1593 pixdCLIntermediate = pixtemp;
1597 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1600 *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1601 CL_MAP_READ,
true,
true);
1612 int OpenclDevice::HistogramRectOCL(
void* imageData,
1613 int bytes_per_pixel,
int bytes_per_line,
1617 int* histogramAllChannels) {
1621 SetKernelEnv(&histKern);
1622 KernelEnv histRedKern;
1623 SetKernelEnv(&histRedKern);
1629 cl_mem imageBuffer = clCreateBuffer(
1630 histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1631 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1632 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1635 int block_size = 256;
1637 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1638 sizeof(numCUs), &numCUs,
nullptr);
1639 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1641 int requestedOccupancy = 10;
1642 int numWorkGroups = numCUs * requestedOccupancy;
1643 int numThreads = block_size * numWorkGroups;
1644 size_t local_work_size[] = {static_cast<size_t>(block_size)};
1645 size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1646 size_t red_global_work_size[] = {
1647 static_cast<size_t>(block_size *
kHistogramSize * bytes_per_pixel)};
1651 cl_mem histogramBuffer = clCreateBuffer(
1652 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1653 kHistogramSize * bytes_per_pixel *
sizeof(
int), histogramAllChannels,
1655 CHECK_OPENCL(clStatus,
"clCreateBuffer histogramBuffer");
1659 int tmpHistogramBins =
kHistogramSize * bytes_per_pixel * histRed;
1661 cl_mem tmpHistogramBuffer =
1662 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1663 tmpHistogramBins *
sizeof(cl_uint),
nullptr, &clStatus);
1664 CHECK_OPENCL(clStatus,
"clCreateBuffer tmpHistogramBuffer");
1667 int* zeroBuffer =
new int[1];
1669 cl_mem atomicSyncBuffer = clCreateBuffer(
1670 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1671 sizeof(cl_int), zeroBuffer, &clStatus);
1672 CHECK_OPENCL(clStatus,
"clCreateBuffer atomicSyncBuffer");
1673 delete[] zeroBuffer;
1675 if (bytes_per_pixel == 1) {
1676 histKern.mpkKernel = clCreateKernel(
1677 histKern.mpkProgram,
"kernel_HistogramRectOneChannel", &clStatus);
1678 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannel");
1680 histRedKern.mpkKernel =
1681 clCreateKernel(histRedKern.mpkProgram,
1682 "kernel_HistogramRectOneChannelReduction", &clStatus);
1683 CHECK_OPENCL(clStatus,
1684 "clCreateKernel kernel_HistogramRectOneChannelReduction");
1686 histKern.mpkKernel = clCreateKernel(
1687 histKern.mpkProgram,
"kernel_HistogramRectAllChannels", &clStatus);
1688 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectAllChannels");
1690 histRedKern.mpkKernel =
1691 clCreateKernel(histRedKern.mpkProgram,
1692 "kernel_HistogramRectAllChannelsReduction", &clStatus);
1693 CHECK_OPENCL(clStatus,
1694 "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1700 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE,
1701 CL_MAP_WRITE, 0, tmpHistogramBins *
sizeof(cl_uint),
1702 0,
nullptr,
nullptr, &clStatus);
1703 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer tmpHistogramBuffer");
1705 memset(ptr, 0, tmpHistogramBins *
sizeof(cl_uint));
1706 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1711 clSetKernelArg(histKern.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1712 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1713 cl_uint numPixels = width * height;
1714 clStatus = clSetKernelArg(histKern.mpkKernel, 1,
sizeof(cl_uint), &numPixels);
1715 CHECK_OPENCL(clStatus,
"clSetKernelArg numPixels");
1716 clStatus = clSetKernelArg(histKern.mpkKernel, 2,
sizeof(cl_mem),
1717 &tmpHistogramBuffer);
1718 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1721 int n = numThreads / bytes_per_pixel;
1722 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0,
sizeof(cl_int), &n);
1723 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1724 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1,
sizeof(cl_mem),
1725 &tmpHistogramBuffer);
1726 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1727 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2,
sizeof(cl_mem),
1729 CHECK_OPENCL(clStatus,
"clSetKernelArg histogramBuffer");
1732 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1733 nullptr, global_work_size, local_work_size,
1734 0,
nullptr,
nullptr);
1735 CHECK_OPENCL(clStatus,
1736 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1737 clFinish(histKern.mpkCmdQueue);
1738 if (clStatus != 0) {
1742 clStatus = clEnqueueNDRangeKernel(
1743 histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1,
nullptr,
1744 red_global_work_size, local_work_size, 0,
nullptr,
nullptr);
1747 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1748 clFinish(histRedKern.mpkCmdQueue);
1749 if (clStatus != 0) {
1754 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1757 nullptr,
nullptr, &clStatus);
1758 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1759 if (clStatus != 0) {
1762 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1765 clReleaseMemObject(histogramBuffer);
1766 clReleaseMemObject(imageBuffer);
1775 int OpenclDevice::ThresholdRectToPixOCL(
unsigned char* imageData,
1776 int bytes_per_pixel,
int bytes_per_line,
1777 int* thresholds,
int* hi_values,
1778 Pix** pix,
int height,
int width,
1779 int top,
int left) {
1782 *pix = pixCreate(width, height, 1);
1783 uint32_t* pixData = pixGetData(*pix);
1784 int wpl = pixGetWpl(*pix);
1785 int pixSize = wpl * height *
sizeof(uint32_t);
1789 SetKernelEnv(&rEnv);
1792 int block_size = 256;
1794 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1795 sizeof(numCUs), &numCUs,
nullptr);
1796 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1798 int requestedOccupancy = 10;
1799 int numWorkGroups = numCUs * requestedOccupancy;
1800 int numThreads = block_size * numWorkGroups;
1801 size_t local_work_size[] = {(size_t)block_size};
1802 size_t global_work_size[] = {(size_t)numThreads};
1809 cl_mem imageBuffer = clCreateBuffer(
1810 rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1811 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1812 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1816 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1817 pixSize, pixData, &clStatus);
1818 CHECK_OPENCL(clStatus,
"clCreateBuffer pix");
1821 cl_mem thresholdsBuffer =
1822 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1823 bytes_per_pixel *
sizeof(
int), thresholds, &clStatus);
1824 CHECK_OPENCL(clStatus,
"clCreateBuffer thresholdBuffer");
1825 cl_mem hiValuesBuffer =
1826 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1827 bytes_per_pixel *
sizeof(
int), hi_values, &clStatus);
1828 CHECK_OPENCL(clStatus,
"clCreateBuffer hiValuesBuffer");
1831 if (bytes_per_pixel == 4) {
1833 clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix", &clStatus);
1834 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix");
1836 rEnv.mpkKernel = clCreateKernel(
1837 rEnv.mpkProgram,
"kernel_ThresholdRectToPix_OneChan", &clStatus);
1838 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix_OneChan");
1842 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1843 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1844 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(
int), &height);
1845 CHECK_OPENCL(clStatus,
"clSetKernelArg height");
1846 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(
int), &width);
1847 CHECK_OPENCL(clStatus,
"clSetKernelArg width");
1848 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(
int), &wpl);
1849 CHECK_OPENCL(clStatus,
"clSetKernelArg wpl");
1851 clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &thresholdsBuffer);
1852 CHECK_OPENCL(clStatus,
"clSetKernelArg thresholdsBuffer");
1853 clStatus = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(cl_mem), &hiValuesBuffer);
1854 CHECK_OPENCL(clStatus,
"clSetKernelArg hiValuesBuffer");
1855 clStatus = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(cl_mem), &pixThBuffer);
1856 CHECK_OPENCL(clStatus,
"clSetKernelArg pixThBuffer");
1859 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
1860 nullptr, global_work_size, local_work_size,
1861 0,
nullptr,
nullptr);
1862 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1863 clFinish(rEnv.mpkCmdQueue);
1864 if (clStatus != 0) {
1865 tprintf(
"Setting return value to -1\n");
1870 clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
1871 pixSize, 0,
nullptr,
nullptr, &clStatus);
1872 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1873 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0,
nullptr,
1876 clReleaseMemObject(imageBuffer);
1877 clReleaseMemObject(thresholdsBuffer);
1878 clReleaseMemObject(hiValuesBuffer);
1887 struct TessScoreEvaluationInputData {
1891 unsigned char* imageData;
1895 static void populateTessScoreEvaluationInputData(
1896 TessScoreEvaluationInputData* input) {
1901 int numChannels = 4;
1902 input->height = height;
1903 input->width = width;
1904 input->numChannels = numChannels;
1905 unsigned char(*imageData4)[4] = (
unsigned char(*)[4])malloc(
1906 height * width * numChannels *
1907 sizeof(
unsigned char));
1908 input->imageData = (
unsigned char*)&imageData4[0];
1911 unsigned char pixelWhite[4] = {0, 0, 0, 255};
1912 unsigned char pixelBlack[4] = {255, 255, 255, 255};
1913 for (
int p = 0; p < height * width; p++) {
1915 imageData4[p][0] = pixelWhite[0];
1916 imageData4[p][1] = pixelWhite[1];
1917 imageData4[p][2] = pixelWhite[2];
1918 imageData4[p][3] = pixelWhite[3];
1921 int maxLineWidth = 64;
1924 for (
int i = 0; i < numLines; i++) {
1925 int lineWidth = rand() % maxLineWidth;
1926 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1928 for (
int row = vertLinePos - lineWidth / 2;
1929 row < vertLinePos + lineWidth / 2; row++) {
1930 for (
int col = 0; col < height; col++) {
1932 imageData4[row * width + col][0] = pixelBlack[0];
1933 imageData4[row * width + col][1] = pixelBlack[1];
1934 imageData4[row * width + col][2] = pixelBlack[2];
1935 imageData4[row * width + col][3] = pixelBlack[3];
1940 for (
int i = 0; i < numLines; i++) {
1941 int lineWidth = rand() % maxLineWidth;
1942 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1944 for (
int row = 0; row < width; row++) {
1945 for (
int col = horLinePos - lineWidth / 2;
1946 col < horLinePos + lineWidth / 2;
1951 imageData4[row * width + col][0] = pixelBlack[0];
1952 imageData4[row * width + col][1] = pixelBlack[1];
1953 imageData4[row * width + col][2] = pixelBlack[2];
1954 imageData4[row * width + col][3] = pixelBlack[3];
1959 float fractionBlack = 0.1;
1961 (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1962 for (
int i = 0; i < numSpots; i++) {
1963 int lineWidth = rand() % maxLineWidth;
1964 int col = lineWidth + rand() % (width - 2 * lineWidth);
1965 int row = lineWidth + rand() % (height - 2 * lineWidth);
1967 for (
int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1968 for (
int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1971 imageData4[r * width + c][0] = pixelBlack[0];
1972 imageData4[r * width + c][1] = pixelBlack[1];
1973 imageData4[r * width + c][2] = pixelBlack[2];
1974 imageData4[r * width + c][3] = pixelBlack[3];
1979 input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1982 struct TessDeviceScore {
1992 static double composeRGBPixelMicroBench(GPUEnv* env,
1993 TessScoreEvaluationInputData input,
1994 ds_device_type
type) {
1997 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1998 QueryPerformanceFrequency(&freq);
2000 mach_timebase_info_data_t info = {0, 0};
2001 mach_timebase_info(&info);
2002 long long start, stop;
2004 timespec time_funct_start, time_funct_end;
2007 l_uint32* tiffdata =
2008 (l_uint32*)input.imageData;
2012 if (
type == DS_DEVICE_OPENCL_DEVICE) {
2014 QueryPerformanceCounter(&time_funct_start);
2016 start = mach_absolute_time();
2018 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2021 OpenclDevice::gpuEnv = *env;
2022 int wpl = pixGetWpl(input.pix);
2023 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2026 QueryPerformanceCounter(&time_funct_end);
2027 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2028 (
double)(freq.QuadPart);
2030 stop = mach_absolute_time();
2031 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2033 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2034 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2035 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2040 QueryPerformanceCounter(&time_funct_start);
2042 start = mach_absolute_time();
2044 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2046 Pix* pix = pixCreate(input.width, input.height, 32);
2047 l_uint32* pixData = pixGetData(pix);
2050 for (i = 0; i < input.height; i++) {
2051 for (j = 0; j < input.width; j++) {
2052 l_uint32 tiffword = tiffdata[i * input.width + j];
2053 l_int32 rval = ((tiffword)&0xff);
2054 l_int32 gval = (((tiffword) >> 8) & 0xff);
2055 l_int32 bval = (((tiffword) >> 16) & 0xff);
2056 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2057 pixData[idx] = value;
2062 QueryPerformanceCounter(&time_funct_end);
2063 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2064 (
double)(freq.QuadPart);
2066 stop = mach_absolute_time();
2067 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2069 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2070 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2071 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2079 static double histogramRectMicroBench(GPUEnv* env,
2080 TessScoreEvaluationInputData input,
2081 ds_device_type
type) {
2084 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2085 QueryPerformanceFrequency(&freq);
2087 mach_timebase_info_data_t info = {0, 0};
2088 mach_timebase_info(&info);
2089 long long start, stop;
2091 timespec time_funct_start, time_funct_end;
2097 int bytes_per_line = input.width * input.numChannels;
2098 int* histogramAllChannels =
new int[
kHistogramSize * input.numChannels];
2100 if (
type == DS_DEVICE_OPENCL_DEVICE) {
2102 QueryPerformanceCounter(&time_funct_start);
2104 start = mach_absolute_time();
2106 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2109 OpenclDevice::gpuEnv = *env;
2110 int retVal = OpenclDevice::HistogramRectOCL(
2111 input.imageData, input.numChannels, bytes_per_line, left, top,
2112 input.width, input.height,
kHistogramSize, histogramAllChannels);
2115 QueryPerformanceCounter(&time_funct_end);
2116 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2117 (
double)(freq.QuadPart);
2119 stop = mach_absolute_time();
2121 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2126 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2127 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2128 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2133 QueryPerformanceCounter(&time_funct_start);
2135 start = mach_absolute_time();
2137 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2139 for (
int ch = 0; ch < input.numChannels; ++ch) {
2141 input.width, input.height, histogram);
2144 QueryPerformanceCounter(&time_funct_end);
2145 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2146 (
double)(freq.QuadPart);
2148 stop = mach_absolute_time();
2149 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2151 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2152 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2153 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2159 delete[] histogramAllChannels;
2164 static void ThresholdRectToPix_Native(
const unsigned char* imagedata,
2165 int bytes_per_pixel,
int bytes_per_line,
2166 const int* thresholds,
2167 const int* hi_values, Pix** pix) {
2170 int width = pixGetWidth(*pix);
2171 int height = pixGetHeight(*pix);
2173 *pix = pixCreate(width, height, 1);
2174 uint32_t* pixdata = pixGetData(*pix);
2175 int wpl = pixGetWpl(*pix);
2176 const unsigned char* srcdata =
2177 imagedata + top * bytes_per_line + left * bytes_per_pixel;
2178 for (
int y = 0; y < height; ++y) {
2179 const uint8_t* linedata = srcdata;
2180 uint32_t* pixline = pixdata + y * wpl;
2181 for (
int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2182 bool white_result =
true;
2183 for (
int ch = 0; ch < bytes_per_pixel; ++ch) {
2184 if (hi_values[ch] >= 0 &&
2185 (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2186 white_result =
false;
2191 CLEAR_DATA_BIT(pixline, x);
2193 SET_DATA_BIT(pixline, x);
2195 srcdata += bytes_per_line;
2199 static double thresholdRectToPixMicroBench(GPUEnv* env,
2200 TessScoreEvaluationInputData input,
2201 ds_device_type
type) {
2204 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2205 QueryPerformanceFrequency(&freq);
2207 mach_timebase_info_data_t info = {0, 0};
2208 mach_timebase_info(&info);
2209 long long start, stop;
2211 timespec time_funct_start, time_funct_end;
2215 unsigned char pixelHi = (
unsigned char)255;
2216 int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2221 int bytes_per_line = input.width * input.numChannels;
2224 if (
type == DS_DEVICE_OPENCL_DEVICE) {
2226 QueryPerformanceCounter(&time_funct_start);
2228 start = mach_absolute_time();
2230 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2233 OpenclDevice::gpuEnv = *env;
2235 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2236 input.imageData, input.numChannels, bytes_per_line, thresholds,
2237 hi_values, &input.pix, input.height, input.width, top, left);
2240 QueryPerformanceCounter(&time_funct_end);
2241 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2242 (
double)(freq.QuadPart);
2244 stop = mach_absolute_time();
2246 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2252 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2253 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2254 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2260 QueryPerformanceCounter(&time_funct_start);
2262 start = mach_absolute_time();
2264 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2266 int hi_values[4] = {};
2267 ThresholdRectToPix_Native(input.imageData, input.numChannels,
2268 bytes_per_line, thresholds, hi_values,
2272 QueryPerformanceCounter(&time_funct_end);
2273 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2274 (
double)(freq.QuadPart);
2276 stop = mach_absolute_time();
2277 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2279 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2280 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2281 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2288 static double getLineMasksMorphMicroBench(GPUEnv* env,
2289 TessScoreEvaluationInputData input,
2290 ds_device_type
type) {
2293 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2294 QueryPerformanceFrequency(&freq);
2296 mach_timebase_info_data_t info = {0, 0};
2297 mach_timebase_info(&info);
2298 long long start, stop;
2300 timespec time_funct_start, time_funct_end;
2304 int resolution = 300;
2305 int wpl = pixGetWpl(input.pix);
2310 int closing_brick = max_line_width / 3;
2313 if (
type == DS_DEVICE_OPENCL_DEVICE) {
2315 QueryPerformanceCounter(&time_funct_start);
2317 start = mach_absolute_time();
2319 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2321 OpenclDevice::gpuEnv = *env;
2322 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2323 Pix *pix_vline =
nullptr, *pix_hline =
nullptr, *pix_closed =
nullptr;
2324 OpenclDevice::pixGetLinesCL(
nullptr, input.pix, &pix_vline, &pix_hline,
2325 &pix_closed,
true, closing_brick, closing_brick,
2326 max_line_width, max_line_width, min_line_length,
2329 OpenclDevice::releaseMorphCLBuffers();
2332 QueryPerformanceCounter(&time_funct_end);
2333 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2334 (
double)(freq.QuadPart);
2336 stop = mach_absolute_time();
2337 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2339 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2340 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2341 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2345 QueryPerformanceCounter(&time_funct_start);
2347 start = mach_absolute_time();
2349 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2353 Pix* src_pix = input.pix;
2355 pixCloseBrick(
nullptr, src_pix, closing_brick, closing_brick);
2357 pixOpenBrick(
nullptr, pix_closed, max_line_width, max_line_width);
2358 Pix* pix_hollow = pixSubtract(
nullptr, pix_closed, pix_solid);
2359 pixDestroy(&pix_solid);
2360 Pix* pix_vline = pixOpenBrick(
nullptr, pix_hollow, 1, min_line_length);
2361 Pix* pix_hline = pixOpenBrick(
nullptr, pix_hollow, min_line_length, 1);
2362 pixDestroy(&pix_hline);
2363 pixDestroy(&pix_vline);
2364 pixDestroy(&pix_hollow);
2367 QueryPerformanceCounter(&time_funct_end);
2368 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2369 (
double)(freq.QuadPart);
2371 stop = mach_absolute_time();
2372 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2374 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2375 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2376 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2390 static ds_status serializeScore(ds_device* device, uint8_t** serializedScore,
2391 unsigned int* serializedScoreSize) {
2392 *serializedScoreSize =
sizeof(TessDeviceScore);
2393 *serializedScore =
new uint8_t[*serializedScoreSize];
2394 memcpy(*serializedScore, device->score, *serializedScoreSize);
2399 static ds_status deserializeScore(ds_device* device,
2400 const uint8_t* serializedScore,
2401 unsigned int serializedScoreSize) {
2403 device->score =
new TessDeviceScore;
2404 memcpy(device->score, serializedScore, serializedScoreSize);
2408 static ds_status releaseScore(TessDeviceScore* score) {
2414 static ds_status evaluateScoreForDevice(ds_device* device,
void* inputData) {
2417 tprintf(
"\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2418 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2419 GPUEnv* env =
nullptr;
2420 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2421 env = &OpenclDevice::gpuEnv;
2422 memset(env, 0,
sizeof(*env));
2424 populateGPUEnvFromDevice(env, device->oclDeviceID);
2425 env->mnFileCount = 0;
2426 env->mnKernelCount = 0UL;
2428 OpenclDevice::CompileKernelFile(env,
"");
2431 TessScoreEvaluationInputData* input =
2432 static_cast<TessScoreEvaluationInputData*>(inputData);
2435 double composeRGBPixelTime =
2436 composeRGBPixelMicroBench(env, *input, device->type);
2439 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2442 double thresholdRectToPixTime =
2443 thresholdRectToPixMicroBench(env, *input, device->type);
2446 double getLineMasksMorphTime =
2447 getLineMasksMorphMicroBench(env, *input, device->type);
2451 float composeRGBPixelWeight = 1.2f;
2452 float histogramRectWeight = 2.4f;
2453 float thresholdRectToPixWeight = 4.5f;
2454 float getLineMasksMorphWeight = 5.0f;
2456 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2457 histogramRectWeight * histogramRectTime +
2458 thresholdRectToPixWeight * thresholdRectToPixTime +
2459 getLineMasksMorphWeight * getLineMasksMorphTime;
2460 device->score =
new TessDeviceScore;
2461 device->score->time = weightedTime;
2463 tprintf(
"[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2464 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2465 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"composeRGBPixel", composeRGBPixelTime,
2466 composeRGBPixelWeight);
2467 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"HistogramRect", histogramRectTime,
2468 histogramRectWeight);
2469 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"ThresholdRectToPix",
2470 thresholdRectToPixTime, thresholdRectToPixWeight);
2471 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"getLineMasksMorph", getLineMasksMorphTime,
2472 getLineMasksMorphWeight);
2473 tprintf(
"[DS]%25s: %f\n",
"Score", device->score->time);
2478 ds_device OpenclDevice::getDeviceSelection() {
2479 if (!deviceIsSelected) {
2481 if (1 == LoadOpencl()) {
2485 ds_profile* profile;
2486 status = initDSProfile(&profile,
"v0.1");
2488 const char* fileName =
"tesseract_opencl_profile_devices.dat";
2489 status = readProfileFromFile(profile, deserializeScore, fileName);
2490 if (status != DS_SUCCESS) {
2492 tprintf(
"[DS] Profile file not available (%s); performing profiling.\n",
2496 TessScoreEvaluationInputData input;
2497 populateTessScoreEvaluationInputData(&input);
2499 unsigned int numUpdates;
2500 status = profileDevices(profile, DS_EVALUATE_ALL,
2501 evaluateScoreForDevice, &input, &numUpdates);
2503 if (status == DS_SUCCESS) {
2504 status = writeProfileToFile(profile, serializeScore, fileName);
2505 if (status == DS_SUCCESS) {
2506 tprintf(
"[DS] Scores written to file (%s).\n", fileName);
2509 "[DS] Error saving scores to file (%s); scores not written to "
2515 "[DS] Unable to evaluate performance; scores not written to "
2519 tprintf(
"[DS] Profile read from file (%s).\n", fileName);
2524 float bestTime = FLT_MAX;
2525 int bestDeviceIdx = -1;
2526 for (
unsigned d = 0; d < profile->numDevices; d++) {
2527 ds_device device = profile->devices[d];
2528 if (device.score ==
nullptr)
continue;
2529 TessDeviceScore score = *device.score;
2531 float time = score.time;
2532 tprintf(
"[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2533 device.oclDeviceName, time);
2534 if (time < bestTime) {
2539 if (bestDeviceIdx >= 0) {
2540 tprintf(
"[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2541 profile->devices[bestDeviceIdx].oclDeviceName,
2542 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2549 bool overridden =
false;
2550 char* overrideDeviceStr = getenv(
"TESSERACT_OPENCL_DEVICE");
2551 if (overrideDeviceStr !=
nullptr) {
2552 int overrideDeviceIdx = atoi(overrideDeviceStr);
2553 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2555 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2557 overrideDeviceStr, overrideDeviceIdx);
2558 bestDeviceIdx = overrideDeviceIdx - 1;
2562 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2563 "valid devices).\n",
2564 overrideDeviceStr, profile->numDevices);
2569 tprintf(
"[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2570 profile->devices[bestDeviceIdx].oclDeviceName,
2571 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2575 selectedDevice = profile->devices[bestDeviceIdx];
2577 releaseDSProfile(profile, releaseScore);
2580 tprintf(
"[DS] OpenCL runtime not available.\n");
2581 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2582 selectedDevice.oclDeviceName =
"(null)";
2583 selectedDevice.score =
nullptr;
2584 selectedDevice.oclDeviceID =
nullptr;
2585 selectedDevice.oclDriverVersion =
nullptr;
2587 deviceIsSelected =
true;
2589 return selectedDevice;
2592 bool OpenclDevice::selectedDeviceIsOpenCL() {
2593 ds_device device = getDeviceSelection();
2594 return (device.type == DS_DEVICE_OPENCL_DEVICE);