13 #include <sys/types.h> 26 #include <mach/mach_time.h> 37 GPUEnv OpenclDevice::gpuEnv;
39 bool OpenclDevice::deviceIsSelected =
false;
40 ds_device OpenclDevice::selectedDevice;
42 int OpenclDevice::isInited = 0;
44 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
46 static const l_uint32 lmask32[] = {
47 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
48 0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
49 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
50 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
51 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
52 0xfffffffe, 0xffffffff};
54 static const l_uint32 rmask32[] = {
55 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
56 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
57 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
58 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
59 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
60 0x7fffffff, 0xffffffff};
62 static cl_mem pixsCLBuffer, pixdCLBuffer,
64 static cl_mem pixThBuffer;
65 static cl_int clStatus;
66 static KernelEnv rEnv;
68 #define DS_TAG_VERSION "<version>" 69 #define DS_TAG_VERSION_END "</version>" 70 #define DS_TAG_DEVICE "<device>" 71 #define DS_TAG_DEVICE_END "</device>" 72 #define DS_TAG_SCORE "<score>" 73 #define DS_TAG_SCORE_END "</score>" 74 #define DS_TAG_DEVICE_TYPE "<type>" 75 #define DS_TAG_DEVICE_TYPE_END "</type>" 76 #define DS_TAG_DEVICE_NAME "<name>" 77 #define DS_TAG_DEVICE_NAME_END "</name>" 78 #define DS_TAG_DEVICE_DRIVER_VERSION "<driver>" 79 #define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>" 81 #define DS_DEVICE_NATIVE_CPU_STRING "native_cpu" 83 #define DS_DEVICE_NAME_LENGTH 256 85 enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
88 std::vector<ds_device> devices;
89 unsigned int numDevices;
95 DS_INVALID_PROFILE = 1000,
97 DS_INVALID_PERF_EVALUATOR_TYPE,
98 DS_INVALID_PERF_EVALUATOR,
99 DS_PERF_EVALUATOR_ERROR,
101 DS_UNKNOWN_DEVICE_TYPE,
102 DS_PROFILE_FILE_ERROR,
103 DS_SCORE_SERIALIZER_ERROR,
104 DS_SCORE_DESERIALIZER_ERROR
111 typedef ds_status (*ds_perf_evaluator)(ds_device* device,
void* data);
114 typedef ds_status (*ds_score_release)(TessDeviceScore* score);
116 static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
117 ds_status status = DS_SUCCESS;
118 if (profile !=
nullptr) {
121 for (i = 0; i < profile->numDevices; i++) {
122 free(profile->devices[i].oclDeviceName);
123 free(profile->devices[i].oclDriverVersion);
124 status = sr(profile->devices[i].score);
125 if (status != DS_SUCCESS)
break;
133 static ds_status initDSProfile(ds_profile** p,
const char* version) {
135 cl_uint numPlatforms;
136 std::vector<cl_platform_id> platforms;
137 std::vector <cl_device_id> devices;
138 ds_status status = DS_SUCCESS;
142 if (p ==
nullptr)
return DS_INVALID_PROFILE;
144 ds_profile* profile =
new ds_profile;
146 memset(profile, 0,
sizeof(ds_profile));
148 clGetPlatformIDs(0,
nullptr, &numPlatforms);
150 if (numPlatforms > 0) {
151 platforms.reserve(numPlatforms);
152 clGetPlatformIDs(numPlatforms, &platforms[0],
nullptr);
156 for (i = 0; i < numPlatforms; i++) {
158 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0,
nullptr, &num);
162 if (numDevices > 0) {
163 devices.reserve(numDevices);
166 profile->numDevices =
168 profile->devices.reserve(profile->numDevices);
169 memset(&profile->devices[0], 0, profile->numDevices *
sizeof(ds_device));
172 for (i = 0; i < numPlatforms; i++) {
175 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
176 for (j = 0; j < num; j++, next++) {
177 char buffer[DS_DEVICE_NAME_LENGTH];
180 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
181 profile->devices[next].oclDeviceID = devices[j];
183 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
184 DS_DEVICE_NAME_LENGTH, &buffer,
nullptr);
185 length = strlen(buffer);
186 profile->devices[next].oclDeviceName = (
char*)malloc(length + 1);
187 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
189 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
190 DS_DEVICE_NAME_LENGTH, &buffer,
nullptr);
191 length = strlen(buffer);
192 profile->devices[next].oclDriverVersion = (
char*)malloc(length + 1);
193 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
196 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
197 profile->version = version;
203 static ds_status profileDevices(ds_profile* profile,
204 const ds_evaluation_type type,
205 ds_perf_evaluator evaluator,
206 void* evaluatorData,
unsigned int* numUpdates) {
207 ds_status status = DS_SUCCESS;
209 unsigned int updates = 0;
211 if (profile ==
nullptr) {
212 return DS_INVALID_PROFILE;
214 if (evaluator ==
nullptr) {
215 return DS_INVALID_PERF_EVALUATOR;
218 for (i = 0; i < profile->numDevices; i++) {
219 ds_status evaluatorStatus;
222 case DS_EVALUATE_NEW_ONLY:
223 if (profile->devices[i].score !=
nullptr)
break;
225 case DS_EVALUATE_ALL:
226 evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
227 if (evaluatorStatus != DS_SUCCESS) {
228 status = evaluatorStatus;
234 return DS_INVALID_PERF_EVALUATOR_TYPE;
238 if (numUpdates) *numUpdates = updates;
242 static const char* findString(
const char* contentStart,
const char* contentEnd,
243 const char*
string) {
245 const char* currentPosition;
246 const char* found =
nullptr;
247 stringLength = strlen(
string);
248 currentPosition = contentStart;
249 for (currentPosition = contentStart; currentPosition < contentEnd;
251 if (*currentPosition ==
string[0]) {
252 if (currentPosition + stringLength < contentEnd) {
253 if (strncmp(currentPosition,
string, stringLength) == 0) {
254 found = currentPosition;
263 static ds_status readProFile(
const char* fileName,
char** content,
264 size_t* contentSize) {
267 ds_status status = DS_SUCCESS;
268 FILE* input = fopen(fileName,
"rb");
269 if (input ==
nullptr) {
270 status = DS_FILE_ERROR;
272 fseek(input, 0L, SEEK_END);
273 long pos = ftell(input);
277 char *binary =
new char[size];
278 if (fread(binary,
sizeof(
char), size, input) != size) {
279 status = DS_FILE_ERROR;
291 typedef ds_status (*ds_score_deserializer)(ds_device* device,
292 const unsigned char* serializedScore,
293 unsigned int serializedScoreSize);
295 static ds_status readProfileFromFile(ds_profile* profile,
296 ds_score_deserializer deserializer,
298 ds_status status = DS_SUCCESS;
302 if (profile ==
nullptr)
return DS_INVALID_PROFILE;
304 status = readProFile(file, &contentStart, &contentSize);
305 if (status == DS_SUCCESS) {
306 const char* currentPosition;
307 const char* dataStart;
310 const char* contentEnd = contentStart + contentSize;
311 currentPosition = contentStart;
314 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
315 if (dataStart ==
nullptr) {
316 status = DS_PROFILE_FILE_ERROR;
319 dataStart += strlen(DS_TAG_VERSION);
321 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
322 if (dataEnd ==
nullptr) {
323 status = DS_PROFILE_FILE_ERROR;
327 size_t versionStringLength = strlen(profile->version);
328 if (versionStringLength + dataStart != dataEnd ||
329 strncmp(profile->version, dataStart, versionStringLength) != 0) {
331 status = DS_PROFILE_FILE_ERROR;
334 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
340 const char* deviceTypeStart;
341 const char* deviceTypeEnd;
342 ds_device_type deviceType;
344 const char* deviceNameStart;
345 const char* deviceNameEnd;
347 const char* deviceScoreStart;
348 const char* deviceScoreEnd;
350 const char* deviceDriverStart;
351 const char* deviceDriverEnd;
353 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
354 if (dataStart ==
nullptr) {
358 dataStart += strlen(DS_TAG_DEVICE);
359 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
360 if (dataEnd ==
nullptr) {
361 status = DS_PROFILE_FILE_ERROR;
366 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
367 if (deviceTypeStart ==
nullptr) {
368 status = DS_PROFILE_FILE_ERROR;
371 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
373 findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
374 if (deviceTypeEnd ==
nullptr) {
375 status = DS_PROFILE_FILE_ERROR;
378 memcpy(&deviceType, deviceTypeStart,
sizeof(ds_device_type));
381 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
382 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
383 if (deviceNameStart ==
nullptr) {
384 status = DS_PROFILE_FILE_ERROR;
387 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
389 findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
390 if (deviceNameEnd ==
nullptr) {
391 status = DS_PROFILE_FILE_ERROR;
396 findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
397 if (deviceDriverStart ==
nullptr) {
398 status = DS_PROFILE_FILE_ERROR;
401 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
402 deviceDriverEnd = findString(deviceDriverStart, contentEnd,
403 DS_TAG_DEVICE_DRIVER_VERSION_END);
404 if (deviceDriverEnd ==
nullptr) {
405 status = DS_PROFILE_FILE_ERROR;
410 for (i = 0; i < profile->numDevices; i++) {
411 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
412 size_t actualDeviceNameLength;
413 size_t driverVersionLength;
415 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
416 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
417 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
418 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
419 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
420 actualDeviceNameLength) == 0 &&
421 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
422 driverVersionLength) == 0) {
424 findString(dataStart, contentEnd, DS_TAG_SCORE);
425 deviceScoreStart += strlen(DS_TAG_SCORE);
427 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
428 status = deserializer(&profile->devices[i],
429 (
const unsigned char*)deviceScoreStart,
430 deviceScoreEnd - deviceScoreStart);
431 if (status != DS_SUCCESS) {
437 }
else if (deviceType == DS_DEVICE_NATIVE_CPU) {
438 for (i = 0; i < profile->numDevices; i++) {
439 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
440 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
441 if (deviceScoreStart ==
nullptr) {
442 status = DS_PROFILE_FILE_ERROR;
445 deviceScoreStart += strlen(DS_TAG_SCORE);
447 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
448 status = deserializer(&profile->devices[i],
449 (
const unsigned char*)deviceScoreStart,
450 deviceScoreEnd - deviceScoreStart);
451 if (status != DS_SUCCESS) {
459 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
463 delete[] contentStart;
467 typedef ds_status (*ds_score_serializer)(ds_device* device,
468 void** serializedScore,
469 unsigned int* serializedScoreSize);
470 static ds_status writeProfileToFile(ds_profile* profile,
471 ds_score_serializer serializer,
473 ds_status status = DS_SUCCESS;
475 if (profile ==
nullptr)
return DS_INVALID_PROFILE;
477 FILE* profileFile = fopen(file,
"wb");
478 if (profileFile ==
nullptr) {
479 status = DS_FILE_ERROR;
484 fwrite(DS_TAG_VERSION,
sizeof(
char), strlen(DS_TAG_VERSION), profileFile);
485 fwrite(profile->version,
sizeof(
char), strlen(profile->version),
487 fwrite(DS_TAG_VERSION_END,
sizeof(
char), strlen(DS_TAG_VERSION_END),
489 fwrite(
"\n",
sizeof(
char), 1, profileFile);
491 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
492 void* serializedScore;
493 unsigned int serializedScoreSize;
495 fwrite(DS_TAG_DEVICE,
sizeof(
char), strlen(DS_TAG_DEVICE), profileFile);
497 fwrite(DS_TAG_DEVICE_TYPE,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE),
499 fwrite(&profile->devices[i].type,
sizeof(ds_device_type), 1, profileFile);
500 fwrite(DS_TAG_DEVICE_TYPE_END,
sizeof(
char),
501 strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
503 switch (profile->devices[i].type) {
504 case DS_DEVICE_NATIVE_CPU: {
515 case DS_DEVICE_OPENCL_DEVICE: {
516 fwrite(DS_TAG_DEVICE_NAME,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME),
518 fwrite(profile->devices[i].oclDeviceName,
sizeof(
char),
519 strlen(profile->devices[i].oclDeviceName), profileFile);
520 fwrite(DS_TAG_DEVICE_NAME_END,
sizeof(
char),
521 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
523 fwrite(DS_TAG_DEVICE_DRIVER_VERSION,
sizeof(
char),
524 strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
525 fwrite(profile->devices[i].oclDriverVersion,
sizeof(
char),
526 strlen(profile->devices[i].oclDriverVersion), profileFile);
527 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END,
sizeof(
char),
528 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
531 status = DS_UNKNOWN_DEVICE_TYPE;
535 fwrite(DS_TAG_SCORE,
sizeof(
char), strlen(DS_TAG_SCORE), profileFile);
536 status = serializer(&profile->devices[i], &serializedScore,
537 &serializedScoreSize);
538 if (status == DS_SUCCESS && serializedScore !=
nullptr &&
539 serializedScoreSize > 0) {
540 fwrite(serializedScore,
sizeof(
char), serializedScoreSize, profileFile);
541 free(serializedScore);
543 fwrite(DS_TAG_SCORE_END,
sizeof(
char), strlen(DS_TAG_SCORE_END),
545 fwrite(DS_TAG_DEVICE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_END),
547 fwrite(
"\n",
sizeof(
char), 1, profileFile);
555 static void legalizeFileName(
char* fileName) {
557 const char* invalidChars =
560 for (
unsigned i = 0; i < strlen(invalidChars); i++) {
562 invalidStr[0] = invalidChars[i];
563 invalidStr[1] =
'\0';
569 for (
char* pos = strstr(fileName, invalidStr); pos !=
nullptr;
570 pos = strstr(pos + 1, invalidStr)) {
578 static void populateGPUEnvFromDevice(GPUEnv* gpuInfo, cl_device_id device) {
581 gpuInfo->mnIsUserCreated = 1;
583 gpuInfo->mpDevID = device;
584 gpuInfo->mpArryDevsID =
new cl_device_id[1];
585 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
586 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
587 sizeof(cl_device_type), &gpuInfo->mDevType, &size);
588 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(TYPE)");
591 clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
592 sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
593 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(PLATFORM)");
595 cl_context_properties props[3];
596 props[0] = CL_CONTEXT_PLATFORM;
597 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
600 clCreateContext(props, 1, &gpuInfo->mpDevID,
nullptr,
nullptr, &clStatus);
601 CHECK_OPENCL(clStatus,
"populateGPUEnv::createContext");
603 cl_command_queue_properties queueProperties = 0;
604 gpuInfo->mpCmdQueue = clCreateCommandQueue(
605 gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
606 CHECK_OPENCL(clStatus,
"populateGPUEnv::createCommandQueue");
609 int OpenclDevice::LoadOpencl() {
611 HINSTANCE HOpenclDll =
nullptr;
612 void* OpenclDll =
nullptr;
614 OpenclDll =
static_cast<HINSTANCE
>(HOpenclDll);
615 OpenclDll = LoadLibrary(
"openCL.dll");
616 if (!static_cast<HINSTANCE>(OpenclDll)) {
617 fprintf(stderr,
"[OD] Load opencl.dll failed!\n");
618 FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
621 fprintf(stderr,
"[OD] Load opencl.dll successful!\n");
625 int OpenclDevice::SetKernelEnv(KernelEnv* envInfo) {
626 envInfo->mpkContext = gpuEnv.mpContext;
627 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
628 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
633 static cl_mem allocateZeroCopyBuffer(
const KernelEnv& rEnv,
634 l_uint32* hostbuffer,
size_t nElements,
635 cl_mem_flags flags, cl_int* pStatus) {
637 clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
638 nElements *
sizeof(l_uint32), hostbuffer, pStatus);
643 static Pix* mapOutputCLBuffer(
const KernelEnv& rEnv, cl_mem clbuffer, Pix* pixd,
644 Pix* pixs,
int elements, cl_mem_flags flags,
645 bool memcopy =
false,
bool sync =
true) {
648 if ((pixd = pixCreateTemplate(pixs)) ==
nullptr)
651 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
652 pixGetDepth(pixs))) ==
nullptr)
656 l_uint32* pValues = (l_uint32*)clEnqueueMapBuffer(
657 rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
658 elements *
sizeof(l_uint32), 0,
nullptr,
nullptr,
nullptr);
661 memcpy(pixGetData(pixd), pValues, elements *
sizeof(l_uint32));
663 pixSetData(pixd, pValues);
666 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0,
nullptr,
670 clFinish(rEnv.mpkCmdQueue);
676 void OpenclDevice::releaseMorphCLBuffers() {
677 if (pixdCLIntermediate !=
nullptr) clReleaseMemObject(pixdCLIntermediate);
678 if (pixsCLBuffer !=
nullptr) clReleaseMemObject(pixsCLBuffer);
679 if (pixdCLBuffer !=
nullptr) clReleaseMemObject(pixdCLBuffer);
680 if (pixThBuffer !=
nullptr) clReleaseMemObject(pixThBuffer);
681 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer =
nullptr;
684 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs) {
687 if (pixThBuffer !=
nullptr) {
688 pixsCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
689 CL_MEM_ALLOC_HOST_PTR, &clStatus);
693 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
694 sizeof(l_uint32) * wpl * h, 0,
nullptr,
nullptr);
698 reinterpret_cast<l_uint32*
>(malloc(wpl * h *
sizeof(l_uint32)));
699 memcpy(srcdata, pixGetData(pixs), wpl * h *
sizeof(l_uint32));
701 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h,
702 CL_MEM_USE_HOST_PTR, &clStatus);
705 pixdCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
706 CL_MEM_ALLOC_HOST_PTR, &clStatus);
708 pixdCLIntermediate = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
709 CL_MEM_ALLOC_HOST_PTR, &clStatus);
711 return (
int)clStatus;
714 int OpenclDevice::InitEnv() {
719 if (1 == LoadOpencl())
break;
725 InitOpenclRunEnv_DeviceSelection(0);
731 int OpenclDevice::ReleaseOpenclRunEnv() {
732 ReleaseOpenclEnv(&gpuEnv);
739 inline int OpenclDevice::AddKernelConfig(
int kCount,
const char* kName) {
741 ASSERT_HOST(strlen(kName) <
sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
742 strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
743 gpuEnv.mnKernelCount++;
747 int OpenclDevice::RegistOpenclKernel() {
748 if (!gpuEnv.mnIsUserCreated) memset(&gpuEnv, 0,
sizeof(gpuEnv));
750 gpuEnv.mnFileCount = 0;
751 gpuEnv.mnKernelCount = 0UL;
753 AddKernelConfig(1,
"oclAverageSub1");
757 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(
int argc) {
761 ds_device bestDevice_DS = getDeviceSelection();
763 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
765 if (selectedDeviceIsOpenCL()) {
768 populateGPUEnvFromDevice(&gpuEnv, bestDevice);
769 gpuEnv.mnFileCount = 0;
770 gpuEnv.mnKernelCount = 0UL;
772 CompileKernelFile(&gpuEnv,
"");
784 OpenclDevice::OpenclDevice() {
788 OpenclDevice::~OpenclDevice() {
792 int OpenclDevice::ReleaseOpenclEnv(GPUEnv* gpuInfo) {
800 for (i = 0; i < gpuEnv.mnFileCount; i++) {
801 if (gpuEnv.mpArryPrograms[i]) {
802 clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
803 CHECK_OPENCL(clStatus,
"clReleaseProgram");
804 gpuEnv.mpArryPrograms[i] =
nullptr;
807 if (gpuEnv.mpCmdQueue) {
808 clReleaseCommandQueue(gpuEnv.mpCmdQueue);
809 gpuEnv.mpCmdQueue =
nullptr;
811 if (gpuEnv.mpContext) {
812 clReleaseContext(gpuEnv.mpContext);
813 gpuEnv.mpContext =
nullptr;
816 gpuInfo->mnIsUserCreated = 0;
817 delete[] gpuInfo->mpArryDevsID;
820 int OpenclDevice::BinaryGenerated(
const char* clFileName, FILE** fhandle) {
825 char fileName[256] = {0}, cl_name[128] = {0};
826 char deviceName[1024];
827 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
828 sizeof(deviceName), deviceName,
nullptr);
829 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
830 const char* str = strstr(clFileName,
".cl");
831 memcpy(cl_name, clFileName, str - clFileName);
832 cl_name[str - clFileName] =
'\0';
833 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
834 legalizeFileName(fileName);
835 fd = fopen(fileName,
"rb");
836 status = (fd !=
nullptr) ? 1 : 0;
842 int OpenclDevice::CachedOfKernerPrg(
const GPUEnv* gpuEnvCached,
843 const char* clFileName) {
845 for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
846 if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
847 if (gpuEnvCached->mpArryPrograms[i] !=
nullptr) {
855 int OpenclDevice::WriteBinaryToFile(
const char* fileName,
const char* birary,
857 FILE* output =
nullptr;
858 output = fopen(fileName,
"wb");
859 if (output ==
nullptr) {
863 fwrite(birary,
sizeof(
char), numBytes, output);
869 int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
870 const char* clFileName) {
875 clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
876 sizeof(numDevices), &numDevices,
nullptr);
877 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
879 std::vector<cl_device_id> mpArryDevsID(numDevices);
882 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
883 sizeof(cl_device_id) * numDevices,
886 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
889 std::vector<size_t> binarySizes(numDevices);
892 clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
893 sizeof(
size_t) * numDevices, &binarySizes[0],
nullptr);
894 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
897 std::vector<char*> binaries(numDevices);
899 for (i = 0; i < numDevices; i++) {
900 if (binarySizes[i] != 0) {
901 binaries[i] =
new char[binarySizes[i]];
903 binaries[i] =
nullptr;
908 clGetProgramInfo(program, CL_PROGRAM_BINARIES,
sizeof(
char*) * numDevices,
909 &binaries[0],
nullptr);
910 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
913 for (i = 0; i < numDevices; i++) {
914 char fileName[256] = {0}, cl_name[128] = {0};
916 if (binarySizes[i] != 0) {
917 char deviceName[1024];
918 clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
919 sizeof(deviceName), deviceName,
nullptr);
920 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
922 const char* str = strstr(clFileName,
".cl");
923 memcpy(cl_name, clFileName, str - clFileName);
924 cl_name[str - clFileName] =
'\0';
925 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
926 legalizeFileName(fileName);
927 if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
928 tprintf(
"[OD] write binary[%s] failed\n", fileName);
931 tprintf(
"[OD] write binary[%s] successfully\n", fileName);
936 for (i = 0; i < numDevices; i++) {
937 delete[] binaries[i];
943 int OpenclDevice::CompileKernelFile(GPUEnv* gpuInfo,
const char* buildOption) {
947 size_t source_size[1];
948 int binary_status, binaryExisted, idx;
951 const char* filename =
"kernel.cl";
953 if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
957 idx = gpuInfo->mnFileCount;
961 source_size[0] = strlen(source);
963 binaryExisted = BinaryGenerated(
966 if (binaryExisted == 1) {
967 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
968 sizeof(numDevices), &numDevices,
nullptr);
969 CHECK_OPENCL(clStatus,
"clGetContextInfo");
971 std::vector<cl_device_id> mpArryDevsID(numDevices);
973 bool b_error = fseek(fd, 0, SEEK_END) < 0;
974 long pos = ftell(fd);
975 b_error |= (pos <= 0);
977 b_error |= fseek(fd, 0, SEEK_SET) < 0;
983 std::vector<uint8_t> binary(length + 2);
985 memset(&binary[0], 0, length + 2);
986 b_error |= fread(&binary[0], 1, length, fd) != length;
992 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
993 sizeof(cl_device_id) * numDevices,
994 &mpArryDevsID[0],
nullptr);
995 CHECK_OPENCL(clStatus,
"clGetContextInfo");
998 const uint8_t* c_binary = &binary[0];
999 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary(
1000 gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length, &c_binary,
1001 &binary_status, &clStatus);
1002 CHECK_OPENCL(clStatus,
"clCreateProgramWithBinary");
1008 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource(
1009 gpuInfo->mpContext, 1, &source, source_size, &clStatus);
1010 CHECK_OPENCL(clStatus,
"clCreateProgramWithSource");
1014 if (gpuInfo->mpArryPrograms[idx] == (cl_program)
nullptr) {
1022 if (!gpuInfo->mnIsUserCreated) {
1024 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1025 buildOption,
nullptr,
nullptr);
1029 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1030 buildOption,
nullptr,
nullptr);
1034 if (clStatus != CL_SUCCESS) {
1035 tprintf(
"BuildProgram error!\n");
1037 if (!gpuInfo->mnIsUserCreated) {
1038 clStatus = clGetProgramBuildInfo(
1039 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1040 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1043 clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1044 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1046 if (clStatus != CL_SUCCESS) {
1047 tprintf(
"opencl create build log fail\n");
1050 std::vector<char> buildLog(length);
1051 if (!gpuInfo->mnIsUserCreated) {
1052 clStatus = clGetProgramBuildInfo(
1053 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1054 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1056 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
1057 gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG,
1058 length, &buildLog[0], &length);
1060 if (clStatus != CL_SUCCESS) {
1061 tprintf(
"opencl program build info fail\n");
1065 fd1 = fopen(
"kernel-build.log",
"w+");
1066 if (fd1 !=
nullptr) {
1067 fwrite(&buildLog[0],
sizeof(
char), length, fd1);
1075 strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1077 if (binaryExisted == 0) {
1078 GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1082 gpuInfo->mnFileCount += 1;
1087 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w,
1088 l_int32 h, l_int32 wpl,
1093 size_t globalThreads[2];
1094 size_t localThreads[2];
1100 gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1101 globalThreads[0] = gsize;
1102 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1103 globalThreads[1] = gsize;
1104 localThreads[0] = GROUPSIZE_X;
1105 localThreads[1] = GROUPSIZE_Y;
1107 SetKernelEnv(&rEnv);
1109 l_uint32* pResult = (l_uint32*)malloc(w * h * sizeof(l_uint32));
1111 clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus);
1112 CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1115 valuesCl = allocateZeroCopyBuffer(
1116 rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1117 outputCl = allocateZeroCopyBuffer(
1118 rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1121 clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1122 CHECK_OPENCL(clStatus, "clSetKernelArg");
1123 clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1124 CHECK_OPENCL(clStatus, "clSetKernelArg");
1125 clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1126 CHECK_OPENCL(clStatus, "clSetKernelArg");
1127 clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1128 CHECK_OPENCL(clStatus, "clSetKernelArg");
1129 clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1130 CHECK_OPENCL(clStatus, "clSetKernelArg");
1135 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1136 globalThreads, localThreads, 0,
nullptr,
nullptr);
1137 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1140 void* ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE,
1141 CL_MAP_READ, 0, w * h * sizeof(l_uint32), 0,
1142 nullptr,
nullptr, &clStatus);
1143 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1144 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0,
nullptr,
nullptr);
1147 clFinish(rEnv.mpkCmdQueue);
1155 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1156 size_t globalThreads[2];
1160 size_t localThreads[2];
1163 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1164 globalThreads[0] = gsize;
1165 globalThreads[1] = GROUPSIZE_HMORY;
1166 localThreads[0] = GROUPSIZE_HMORX;
1167 localThreads[1] = GROUPSIZE_HMORY;
1170 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_5x5", &status);
1171 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_5x5");
1173 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1174 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1175 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1176 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1179 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1180 globalThreads, localThreads, 0,
nullptr,
nullptr);
1183 pixtemp = pixsCLBuffer;
1184 pixsCLBuffer = pixdCLBuffer;
1185 pixdCLBuffer = pixtemp;
1188 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1189 globalThreads[0] = gsize;
1190 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1191 globalThreads[1] = gsize;
1192 localThreads[0] = GROUPSIZE_X;
1193 localThreads[1] = GROUPSIZE_Y;
1196 clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer_5x5", &status);
1197 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer_5x5");
1199 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1200 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1201 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1202 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1204 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1205 globalThreads, localThreads, 0,
nullptr,
nullptr);
1212 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1213 size_t globalThreads[2];
1217 l_uint32 fwmask, lwmask;
1218 size_t localThreads[2];
1220 lwmask = lmask32[31 - 2];
1221 fwmask = rmask32[31 - 2];
1224 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1225 globalThreads[0] = gsize;
1226 globalThreads[1] = GROUPSIZE_HMORY;
1227 localThreads[0] = GROUPSIZE_HMORX;
1228 localThreads[1] = GROUPSIZE_HMORY;
1231 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_5x5", &status);
1232 CHECK_OPENCL(status,
"clCreateKernel morphoErodeHor_5x5");
1234 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1235 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1236 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1237 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1240 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1241 globalThreads, localThreads, 0,
nullptr,
nullptr);
1244 pixtemp = pixsCLBuffer;
1245 pixsCLBuffer = pixdCLBuffer;
1246 pixdCLBuffer = pixtemp;
1249 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1250 globalThreads[0] = gsize;
1251 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1252 globalThreads[1] = gsize;
1253 localThreads[0] = GROUPSIZE_X;
1254 localThreads[1] = GROUPSIZE_Y;
1257 clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer_5x5", &status);
1258 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer_5x5");
1260 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1261 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1262 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1263 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1264 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(fwmask), &fwmask);
1265 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(lwmask), &lwmask);
1267 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1268 globalThreads, localThreads, 0,
nullptr,
nullptr);
1274 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl,
1276 l_int32 xp, yp, xn, yn;
1278 size_t globalThreads[2];
1282 size_t localThreads[2];
1285 OpenclDevice::SetKernelEnv(&rEnv);
1287 if (hsize == 5 && vsize == 5) {
1289 status = pixDilateCL_55(wpl, h);
1293 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1295 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1298 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1299 globalThreads[0] = gsize;
1300 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1301 globalThreads[1] = gsize;
1302 localThreads[0] = GROUPSIZE_X;
1303 localThreads[1] = GROUPSIZE_Y;
1305 if (xp > 31 || xn > 31) {
1308 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor", &status);
1309 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor");
1311 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1312 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1313 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1314 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1315 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1316 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1317 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1318 nullptr, globalThreads, localThreads, 0,
1321 if (yp > 0 || yn > 0) {
1322 pixtemp = pixsCLBuffer;
1323 pixsCLBuffer = pixdCLBuffer;
1324 pixdCLBuffer = pixtemp;
1326 }
else if (xp > 0 || xn > 0) {
1329 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_32word", &status);
1330 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_32word");
1331 isEven = (xp != xn);
1333 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1334 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1335 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1336 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1337 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1338 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isEven), &isEven);
1339 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1340 nullptr, globalThreads, localThreads, 0,
1343 if (yp > 0 || yn > 0) {
1344 pixtemp = pixsCLBuffer;
1345 pixsCLBuffer = pixdCLBuffer;
1346 pixdCLBuffer = pixtemp;
1350 if (yp > 0 || yn > 0) {
1352 clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer", &status);
1353 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer");
1355 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1356 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1357 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1358 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1359 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1360 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(yn), &yn);
1361 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1362 nullptr, globalThreads, localThreads, 0,
1370 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl,
1372 l_int32 xp, yp, xn, yn;
1374 size_t globalThreads[2];
1375 size_t localThreads[2];
1379 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1380 l_uint32 rwmask, lwmask;
1383 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1385 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1387 OpenclDevice::SetKernelEnv(&rEnv);
1389 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1391 status = pixErodeCL_55(wpl, h);
1395 lwmask = lmask32[31 - (xn & 31)];
1396 rwmask = rmask32[31 - (xp & 31)];
1399 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1400 globalThreads[0] = gsize;
1401 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1402 globalThreads[1] = gsize;
1403 localThreads[0] = GROUPSIZE_X;
1404 localThreads[1] = GROUPSIZE_Y;
1407 if (xp > 31 || xn > 31) {
1409 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor", &status);
1411 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1412 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1413 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1414 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1415 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1416 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1418 clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(isAsymmetric), &isAsymmetric);
1419 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(rwmask), &rwmask);
1420 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(lwmask), &lwmask);
1421 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1422 nullptr, globalThreads, localThreads, 0,
1425 if (yp > 0 || yn > 0) {
1426 pixtemp = pixsCLBuffer;
1427 pixsCLBuffer = pixdCLBuffer;
1428 pixdCLBuffer = pixtemp;
1430 }
else if (xp > 0 || xn > 0) {
1432 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_32word", &status);
1433 isEven = (xp != xn);
1435 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1436 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1437 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1438 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1439 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1441 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1442 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(rwmask), &rwmask);
1443 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(lwmask), &lwmask);
1444 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(isEven), &isEven);
1445 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1446 nullptr, globalThreads, localThreads, 0,
1449 if (yp > 0 || yn > 0) {
1450 pixtemp = pixsCLBuffer;
1451 pixsCLBuffer = pixdCLBuffer;
1452 pixdCLBuffer = pixtemp;
1457 if (yp > 0 || yn > 0) {
1458 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer", &status);
1459 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer");
1461 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1462 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1463 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1464 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1465 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1467 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1468 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(yn), &yn);
1469 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1470 nullptr, globalThreads, localThreads, 0,
1478 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1483 status = pixErodeCL(hsize, vsize, wpl, h);
1485 pixtemp = pixsCLBuffer;
1486 pixsCLBuffer = pixdCLBuffer;
1487 pixdCLBuffer = pixtemp;
1489 status = pixDilateCL(hsize, vsize, wpl, h);
1495 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1500 status = pixDilateCL(hsize, vsize, wpl, h);
1502 pixtemp = pixsCLBuffer;
1503 pixsCLBuffer = pixdCLBuffer;
1504 pixdCLBuffer = pixtemp;
1506 status = pixErodeCL(hsize, vsize, wpl, h);
1512 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1513 cl_mem buffer2, cl_mem outBuffer =
nullptr) {
1515 size_t globalThreads[2];
1517 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1519 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1520 globalThreads[0] = gsize;
1521 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1522 globalThreads[1] = gsize;
1524 if (outBuffer !=
nullptr) {
1525 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"pixSubtract", &status);
1526 CHECK_OPENCL(status,
"clCreateKernel pixSubtract");
1529 clCreateKernel(rEnv.mpkProgram,
"pixSubtract_inplace", &status);
1530 CHECK_OPENCL(status,
"clCreateKernel pixSubtract_inplace");
1534 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &buffer1);
1535 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &buffer2);
1536 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1537 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1538 if (outBuffer !=
nullptr) {
1539 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &outBuffer);
1542 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1543 globalThreads, localThreads, 0,
nullptr,
nullptr);
1551 void OpenclDevice::pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline,
1552 Pix** pix_hline, Pix** pixClosed,
1553 bool getpixClosed, l_int32 close_hsize,
1554 l_int32 close_vsize, l_int32 open_hsize,
1555 l_int32 open_vsize, l_int32 line_hsize,
1556 l_int32 line_vsize) {
1560 wpl = pixGetWpl(pixs);
1561 h = pixGetHeight(pixs);
1564 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1568 *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1569 wpl * h, CL_MAP_READ,
true,
false);
1575 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1576 0,
sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1579 pixtemp = pixsCLBuffer;
1580 pixsCLBuffer = pixdCLBuffer;
1581 pixdCLBuffer = pixtemp;
1583 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1586 pixtemp = pixsCLBuffer;
1587 pixsCLBuffer = pixdCLBuffer;
1588 pixdCLBuffer = pixdCLIntermediate;
1589 pixdCLIntermediate = pixtemp;
1591 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1596 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1597 0,
sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1599 pixtemp = pixsCLBuffer;
1600 pixsCLBuffer = pixdCLBuffer;
1601 pixdCLBuffer = pixtemp;
1605 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1608 *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1609 CL_MAP_READ,
true,
false);
1611 pixtemp = pixsCLBuffer;
1612 pixsCLBuffer = pixdCLIntermediate;
1613 pixdCLIntermediate = pixtemp;
1617 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1620 *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1621 CL_MAP_READ,
true,
true);
1632 int OpenclDevice::HistogramRectOCL(
void* imageData,
1633 int bytes_per_pixel,
int bytes_per_line,
1637 int* histogramAllChannels) {
1642 SetKernelEnv(&histKern);
1643 KernelEnv histRedKern;
1644 SetKernelEnv(&histRedKern);
1650 cl_mem imageBuffer = clCreateBuffer(
1651 histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1652 width * height * bytes_per_pixel * sizeof(
char), imageData, &clStatus);
1653 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1656 int block_size = 256;
1658 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1659 sizeof(numCUs), &numCUs,
nullptr);
1660 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1662 int requestedOccupancy = 10;
1663 int numWorkGroups = numCUs * requestedOccupancy;
1664 int numThreads = block_size * numWorkGroups;
1665 size_t local_work_size[] = {
static_cast<size_t>(block_size)};
1666 size_t global_work_size[] = {
static_cast<size_t>(numThreads)};
1667 size_t red_global_work_size[] = {
1668 static_cast<size_t>(block_size *
kHistogramSize * bytes_per_pixel)};
1672 cl_mem histogramBuffer = clCreateBuffer(
1673 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1674 kHistogramSize * bytes_per_pixel *
sizeof(
int), histogramAllChannels,
1676 CHECK_OPENCL(clStatus,
"clCreateBuffer histogramBuffer");
1680 int tmpHistogramBins =
kHistogramSize * bytes_per_pixel * histRed;
1682 cl_mem tmpHistogramBuffer =
1683 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1684 tmpHistogramBins *
sizeof(cl_uint),
nullptr, &clStatus);
1685 CHECK_OPENCL(clStatus,
"clCreateBuffer tmpHistogramBuffer");
1688 int* zeroBuffer =
new int[1];
1690 cl_mem atomicSyncBuffer = clCreateBuffer(
1691 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1692 sizeof(cl_int), zeroBuffer, &clStatus);
1693 CHECK_OPENCL(clStatus,
"clCreateBuffer atomicSyncBuffer");
1694 delete[] zeroBuffer;
1696 if (bytes_per_pixel == 1) {
1697 histKern.mpkKernel = clCreateKernel(
1698 histKern.mpkProgram,
"kernel_HistogramRectOneChannel", &clStatus);
1699 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannel");
1701 histRedKern.mpkKernel =
1702 clCreateKernel(histRedKern.mpkProgram,
1703 "kernel_HistogramRectOneChannelReduction", &clStatus);
1704 CHECK_OPENCL(clStatus,
1705 "clCreateKernel kernel_HistogramRectOneChannelReduction");
1707 histKern.mpkKernel = clCreateKernel(
1708 histKern.mpkProgram,
"kernel_HistogramRectAllChannels", &clStatus);
1709 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectAllChannels");
1711 histRedKern.mpkKernel =
1712 clCreateKernel(histRedKern.mpkProgram,
1713 "kernel_HistogramRectAllChannelsReduction", &clStatus);
1714 CHECK_OPENCL(clStatus,
1715 "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1721 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE,
1722 CL_MAP_WRITE, 0, tmpHistogramBins *
sizeof(cl_uint),
1723 0,
nullptr,
nullptr, &clStatus);
1724 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer tmpHistogramBuffer");
1726 memset(ptr, 0, tmpHistogramBins *
sizeof(cl_uint));
1727 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1732 clSetKernelArg(histKern.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1733 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1734 cl_uint numPixels = width * height;
1735 clStatus = clSetKernelArg(histKern.mpkKernel, 1,
sizeof(cl_uint), &numPixels);
1736 CHECK_OPENCL(clStatus,
"clSetKernelArg numPixels");
1737 clStatus = clSetKernelArg(histKern.mpkKernel, 2,
sizeof(cl_mem),
1738 &tmpHistogramBuffer);
1739 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1742 int n = numThreads / bytes_per_pixel;
1743 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0,
sizeof(cl_int), &n);
1744 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1745 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1,
sizeof(cl_mem),
1746 &tmpHistogramBuffer);
1747 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1748 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2,
sizeof(cl_mem),
1750 CHECK_OPENCL(clStatus,
"clSetKernelArg histogramBuffer");
1754 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1755 nullptr, global_work_size, local_work_size,
1756 0,
nullptr,
nullptr);
1757 CHECK_OPENCL(clStatus,
1758 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1759 clFinish(histKern.mpkCmdQueue);
1760 if (clStatus != 0) {
1764 clStatus = clEnqueueNDRangeKernel(
1765 histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1,
nullptr,
1766 red_global_work_size, local_work_size, 0,
nullptr,
nullptr);
1769 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1770 clFinish(histRedKern.mpkCmdQueue);
1771 if (clStatus != 0) {
1777 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1780 nullptr,
nullptr, &clStatus);
1781 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1782 if (clStatus != 0) {
1785 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1788 clReleaseMemObject(histogramBuffer);
1789 clReleaseMemObject(imageBuffer);
1800 int OpenclDevice::ThresholdRectToPixOCL(
unsigned char* imageData,
1801 int bytes_per_pixel,
int bytes_per_line,
1802 int* thresholds,
int* hi_values,
1803 Pix** pix,
int height,
int width,
1804 int top,
int left) {
1808 *pix = pixCreate(width, height, 1);
1809 uint32_t* pixData = pixGetData(*pix);
1810 int wpl = pixGetWpl(*pix);
1811 int pixSize = wpl * height * sizeof(uint32_t);
1815 SetKernelEnv(&rEnv);
1818 int block_size = 256;
1820 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1821 sizeof(numCUs), &numCUs,
nullptr);
1822 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1824 int requestedOccupancy = 10;
1825 int numWorkGroups = numCUs * requestedOccupancy;
1826 int numThreads = block_size * numWorkGroups;
1827 size_t local_work_size[] = {(size_t)block_size};
1828 size_t global_work_size[] = {(size_t)numThreads};
1835 cl_mem imageBuffer = clCreateBuffer(
1836 rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1837 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1838 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1842 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1843 pixSize, pixData, &clStatus);
1844 CHECK_OPENCL(clStatus,
"clCreateBuffer pix");
1847 cl_mem thresholdsBuffer =
1848 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1849 bytes_per_pixel *
sizeof(
int), thresholds, &clStatus);
1850 CHECK_OPENCL(clStatus,
"clCreateBuffer thresholdBuffer");
1851 cl_mem hiValuesBuffer =
1852 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1853 bytes_per_pixel *
sizeof(
int), hi_values, &clStatus);
1854 CHECK_OPENCL(clStatus,
"clCreateBuffer hiValuesBuffer");
1857 if (bytes_per_pixel == 4) {
1859 clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix", &clStatus);
1860 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix");
1862 rEnv.mpkKernel = clCreateKernel(
1863 rEnv.mpkProgram,
"kernel_ThresholdRectToPix_OneChan", &clStatus);
1864 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix_OneChan");
1868 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1869 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1870 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(
int), &height);
1871 CHECK_OPENCL(clStatus,
"clSetKernelArg height");
1872 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(
int), &width);
1873 CHECK_OPENCL(clStatus,
"clSetKernelArg width");
1874 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(
int), &wpl);
1875 CHECK_OPENCL(clStatus,
"clSetKernelArg wpl");
1877 clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &thresholdsBuffer);
1878 CHECK_OPENCL(clStatus,
"clSetKernelArg thresholdsBuffer");
1879 clStatus = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(cl_mem), &hiValuesBuffer);
1880 CHECK_OPENCL(clStatus,
"clSetKernelArg hiValuesBuffer");
1881 clStatus = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(cl_mem), &pixThBuffer);
1882 CHECK_OPENCL(clStatus,
"clSetKernelArg pixThBuffer");
1886 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
1887 nullptr, global_work_size, local_work_size,
1888 0,
nullptr,
nullptr);
1889 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1890 clFinish(rEnv.mpkCmdQueue);
1892 if (clStatus != 0) {
1893 tprintf(
"Setting return value to -1\n");
1898 clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
1899 pixSize, 0,
nullptr,
nullptr, &clStatus);
1900 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1901 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0,
nullptr,
1904 clReleaseMemObject(imageBuffer);
1905 clReleaseMemObject(thresholdsBuffer);
1906 clReleaseMemObject(hiValuesBuffer);
1917 struct TessScoreEvaluationInputData {
1921 unsigned char* imageData;
1925 static void populateTessScoreEvaluationInputData(
1926 TessScoreEvaluationInputData* input) {
1931 int numChannels = 4;
1932 input->height = height;
1933 input->width = width;
1934 input->numChannels = numChannels;
1935 unsigned char(*imageData4)[4] = (
unsigned char(*)[4])malloc(
1936 height * width * numChannels *
1937 sizeof(
unsigned char));
1938 input->imageData = (
unsigned char*)&imageData4[0];
1941 unsigned char pixelWhite[4] = {0, 0, 0, 255};
1942 unsigned char pixelBlack[4] = {255, 255, 255, 255};
1943 for (
int p = 0; p < height * width; p++) {
1945 imageData4[p][0] = pixelWhite[0];
1946 imageData4[p][1] = pixelWhite[1];
1947 imageData4[p][2] = pixelWhite[2];
1948 imageData4[p][3] = pixelWhite[3];
1951 int maxLineWidth = 64;
1954 for (
int i = 0; i < numLines; i++) {
1955 int lineWidth = rand() % maxLineWidth;
1956 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1958 for (
int row = vertLinePos - lineWidth / 2;
1959 row < vertLinePos + lineWidth / 2; row++) {
1960 for (
int col = 0; col < height; col++) {
1962 imageData4[row * width + col][0] = pixelBlack[0];
1963 imageData4[row * width + col][1] = pixelBlack[1];
1964 imageData4[row * width + col][2] = pixelBlack[2];
1965 imageData4[row * width + col][3] = pixelBlack[3];
1970 for (
int i = 0; i < numLines; i++) {
1971 int lineWidth = rand() % maxLineWidth;
1972 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1974 for (
int row = 0; row < width; row++) {
1975 for (
int col = horLinePos - lineWidth / 2;
1976 col < horLinePos + lineWidth / 2;
1981 imageData4[row * width + col][0] = pixelBlack[0];
1982 imageData4[row * width + col][1] = pixelBlack[1];
1983 imageData4[row * width + col][2] = pixelBlack[2];
1984 imageData4[row * width + col][3] = pixelBlack[3];
1989 float fractionBlack = 0.1;
1991 (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1992 for (
int i = 0; i < numSpots; i++) {
1993 int lineWidth = rand() % maxLineWidth;
1994 int col = lineWidth + rand() % (width - 2 * lineWidth);
1995 int row = lineWidth + rand() % (height - 2 * lineWidth);
1997 for (
int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1998 for (
int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
2001 imageData4[r * width + c][0] = pixelBlack[0];
2002 imageData4[r * width + c][1] = pixelBlack[1];
2003 imageData4[r * width + c][2] = pixelBlack[2];
2004 imageData4[r * width + c][3] = pixelBlack[3];
2009 input->pix = pixCreate(input->width, input->height, 1);
2012 struct TessDeviceScore {
2022 static double composeRGBPixelMicroBench(GPUEnv* env,
2023 TessScoreEvaluationInputData input,
2024 ds_device_type type) {
2027 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2028 QueryPerformanceFrequency(&freq);
2030 mach_timebase_info_data_t info = {0, 0};
2031 mach_timebase_info(&info);
2032 long long start, stop;
2034 timespec time_funct_start, time_funct_end;
2037 l_uint32* tiffdata =
2038 (l_uint32*)input.imageData;
2042 if (type == DS_DEVICE_OPENCL_DEVICE) {
2044 QueryPerformanceCounter(&time_funct_start);
2046 start = mach_absolute_time();
2048 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2051 OpenclDevice::gpuEnv = *env;
2052 int wpl = pixGetWpl(input.pix);
2053 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2056 QueryPerformanceCounter(&time_funct_end);
2057 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2058 (
double)(freq.QuadPart);
2060 stop = mach_absolute_time();
2061 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2063 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2064 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2065 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2070 QueryPerformanceCounter(&time_funct_start);
2072 start = mach_absolute_time();
2074 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2076 Pix* pix = pixCreate(input.width, input.height, 32);
2077 l_uint32* pixData = pixGetData(pix);
2080 for (i = 0; i < input.height; i++) {
2081 for (j = 0; j < input.width; j++) {
2082 l_uint32 tiffword = tiffdata[i * input.width + j];
2083 l_int32 rval = ((tiffword)&0xff);
2084 l_int32 gval = (((tiffword) >> 8) & 0xff);
2085 l_int32 bval = (((tiffword) >> 16) & 0xff);
2086 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2087 pixData[idx] = value;
2092 QueryPerformanceCounter(&time_funct_end);
2093 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2094 (
double)(freq.QuadPart);
2096 stop = mach_absolute_time();
2097 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2099 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2100 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2101 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2109 static double histogramRectMicroBench(GPUEnv* env,
2110 TessScoreEvaluationInputData input,
2111 ds_device_type type) {
2114 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2115 QueryPerformanceFrequency(&freq);
2117 mach_timebase_info_data_t info = {0, 0};
2118 mach_timebase_info(&info);
2119 long long start, stop;
2121 timespec time_funct_start, time_funct_end;
2127 int bytes_per_line = input.width * input.numChannels;
2128 int* histogramAllChannels =
new int[
kHistogramSize * input.numChannels];
2130 if (type == DS_DEVICE_OPENCL_DEVICE) {
2132 QueryPerformanceCounter(&time_funct_start);
2134 start = mach_absolute_time();
2136 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2139 OpenclDevice::gpuEnv = *env;
2140 int retVal = OpenclDevice::HistogramRectOCL(
2141 input.imageData, input.numChannels, bytes_per_line, left, top,
2142 input.width, input.height,
kHistogramSize, histogramAllChannels);
2145 QueryPerformanceCounter(&time_funct_end);
2146 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2147 (
double)(freq.QuadPart);
2149 stop = mach_absolute_time();
2151 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2156 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2157 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2158 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2163 QueryPerformanceCounter(&time_funct_start);
2165 start = mach_absolute_time();
2167 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2169 for (
int ch = 0; ch < input.numChannels; ++ch) {
2171 input.width, input.height, histogram);
2174 QueryPerformanceCounter(&time_funct_end);
2175 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2176 (
double)(freq.QuadPart);
2178 stop = mach_absolute_time();
2179 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2181 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2182 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2183 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2189 delete[] histogramAllChannels;
2194 static void ThresholdRectToPix_Native(
const unsigned char* imagedata,
2195 int bytes_per_pixel,
int bytes_per_line,
2196 const int* thresholds,
2197 const int* hi_values, Pix** pix) {
2200 int width = pixGetWidth(*pix);
2201 int height = pixGetHeight(*pix);
2203 *pix = pixCreate(width, height, 1);
2204 uint32_t* pixdata = pixGetData(*pix);
2205 int wpl = pixGetWpl(*pix);
2206 const unsigned char* srcdata =
2207 imagedata + top * bytes_per_line + left * bytes_per_pixel;
2208 for (
int y = 0; y < height; ++y) {
2209 const uint8_t* linedata = srcdata;
2210 uint32_t* pixline = pixdata + y * wpl;
2211 for (
int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2212 bool white_result =
true;
2213 for (
int ch = 0; ch < bytes_per_pixel; ++ch) {
2214 if (hi_values[ch] >= 0 &&
2215 (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2216 white_result =
false;
2221 CLEAR_DATA_BIT(pixline, x);
2223 SET_DATA_BIT(pixline, x);
2225 srcdata += bytes_per_line;
2229 static double thresholdRectToPixMicroBench(GPUEnv* env,
2230 TessScoreEvaluationInputData input,
2231 ds_device_type type) {
2234 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2235 QueryPerformanceFrequency(&freq);
2237 mach_timebase_info_data_t info = {0, 0};
2238 mach_timebase_info(&info);
2239 long long start, stop;
2241 timespec time_funct_start, time_funct_end;
2245 unsigned char pixelHi = (
unsigned char)255;
2246 int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2251 int bytes_per_line = input.width * input.numChannels;
2254 if (type == DS_DEVICE_OPENCL_DEVICE) {
2256 QueryPerformanceCounter(&time_funct_start);
2258 start = mach_absolute_time();
2260 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2263 OpenclDevice::gpuEnv = *env;
2265 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2266 input.imageData, input.numChannels, bytes_per_line, thresholds,
2267 hi_values, &input.pix, input.height, input.width, top, left);
2270 QueryPerformanceCounter(&time_funct_end);
2271 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2272 (
double)(freq.QuadPart);
2274 stop = mach_absolute_time();
2276 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2282 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2283 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2284 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2290 QueryPerformanceCounter(&time_funct_start);
2292 start = mach_absolute_time();
2294 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2296 int hi_values[4] = {};
2297 ThresholdRectToPix_Native(input.imageData, input.numChannels,
2298 bytes_per_line, thresholds, hi_values,
2302 QueryPerformanceCounter(&time_funct_end);
2303 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2304 (
double)(freq.QuadPart);
2306 stop = mach_absolute_time();
2307 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2309 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2310 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2311 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2318 static double getLineMasksMorphMicroBench(GPUEnv* env,
2319 TessScoreEvaluationInputData input,
2320 ds_device_type type) {
2323 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2324 QueryPerformanceFrequency(&freq);
2326 mach_timebase_info_data_t info = {0, 0};
2327 mach_timebase_info(&info);
2328 long long start, stop;
2330 timespec time_funct_start, time_funct_end;
2334 int resolution = 300;
2335 int wpl = pixGetWpl(input.pix);
2340 int closing_brick = max_line_width / 3;
2343 if (type == DS_DEVICE_OPENCL_DEVICE) {
2345 QueryPerformanceCounter(&time_funct_start);
2347 start = mach_absolute_time();
2349 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2351 OpenclDevice::gpuEnv = *env;
2352 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2353 Pix *pix_vline =
nullptr, *pix_hline =
nullptr, *pix_closed =
nullptr;
2354 OpenclDevice::pixGetLinesCL(
nullptr, input.pix, &pix_vline, &pix_hline,
2355 &pix_closed,
true, closing_brick, closing_brick,
2356 max_line_width, max_line_width, min_line_length,
2359 OpenclDevice::releaseMorphCLBuffers();
2362 QueryPerformanceCounter(&time_funct_end);
2363 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2364 (
double)(freq.QuadPart);
2366 stop = mach_absolute_time();
2367 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2369 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2370 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2371 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2375 QueryPerformanceCounter(&time_funct_start);
2377 start = mach_absolute_time();
2379 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2383 Pix* src_pix = input.pix;
2385 pixCloseBrick(
nullptr, src_pix, closing_brick, closing_brick);
2387 pixOpenBrick(
nullptr, pix_closed, max_line_width, max_line_width);
2388 Pix* pix_hollow = pixSubtract(
nullptr, pix_closed, pix_solid);
2389 pixDestroy(&pix_solid);
2390 Pix* pix_vline = pixOpenBrick(
nullptr, pix_hollow, 1, min_line_length);
2391 Pix* pix_hline = pixOpenBrick(
nullptr, pix_hollow, min_line_length, 1);
2392 pixDestroy(&pix_hline);
2393 pixDestroy(&pix_vline);
2394 pixDestroy(&pix_hollow);
2397 QueryPerformanceCounter(&time_funct_end);
2398 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2399 (
double)(freq.QuadPart);
2401 stop = mach_absolute_time();
2402 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2404 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2405 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2406 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2420 static ds_status serializeScore(ds_device* device,
void** serializedScore,
2421 unsigned int* serializedScoreSize) {
2422 *serializedScoreSize =
sizeof(TessDeviceScore);
2423 *serializedScore =
new unsigned char[*serializedScoreSize];
2424 memcpy(*serializedScore, device->score, *serializedScoreSize);
2429 static ds_status deserializeScore(ds_device* device,
2430 const unsigned char* serializedScore,
2431 unsigned int serializedScoreSize) {
2433 device->score =
new TessDeviceScore;
2434 memcpy(device->score, serializedScore, serializedScoreSize);
2438 static ds_status releaseScore(TessDeviceScore* score) {
2444 static ds_status evaluateScoreForDevice(ds_device* device,
void* inputData) {
2447 tprintf(
"\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2448 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2449 GPUEnv* env =
nullptr;
2450 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2451 env = &OpenclDevice::gpuEnv;
2452 memset(env, 0,
sizeof(*env));
2454 populateGPUEnvFromDevice(env, device->oclDeviceID);
2455 env->mnFileCount = 0;
2456 env->mnKernelCount = 0UL;
2458 OpenclDevice::CompileKernelFile(env,
"");
2461 TessScoreEvaluationInputData* input =
2462 static_cast<TessScoreEvaluationInputData*
>(inputData);
2465 double composeRGBPixelTime =
2466 composeRGBPixelMicroBench(env, *input, device->type);
2469 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2472 double thresholdRectToPixTime =
2473 thresholdRectToPixMicroBench(env, *input, device->type);
2476 double getLineMasksMorphTime =
2477 getLineMasksMorphMicroBench(env, *input, device->type);
2481 float composeRGBPixelWeight = 1.2f;
2482 float histogramRectWeight = 2.4f;
2483 float thresholdRectToPixWeight = 4.5f;
2484 float getLineMasksMorphWeight = 5.0f;
2486 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2487 histogramRectWeight * histogramRectTime +
2488 thresholdRectToPixWeight * thresholdRectToPixTime +
2489 getLineMasksMorphWeight * getLineMasksMorphTime;
2490 device->score =
new TessDeviceScore;
2491 device->score->time = weightedTime;
2493 tprintf(
"[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2494 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2495 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"composeRGBPixel", composeRGBPixelTime,
2496 composeRGBPixelWeight);
2497 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"HistogramRect", histogramRectTime,
2498 histogramRectWeight);
2499 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"ThresholdRectToPix",
2500 thresholdRectToPixTime, thresholdRectToPixWeight);
2501 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"getLineMasksMorph", getLineMasksMorphTime,
2502 getLineMasksMorphWeight);
2503 tprintf(
"[DS]%25s: %f\n",
"Score", device->score->time);
2508 ds_device OpenclDevice::getDeviceSelection() {
2509 if (!deviceIsSelected) {
2512 if (1 == LoadOpencl()) {
2517 ds_profile* profile;
2518 status = initDSProfile(&profile,
"v0.1");
2521 const
char* fileName = "tesseract_opencl_profile_devices.dat";
2522 status = readProfileFromFile(profile, deserializeScore, fileName);
2523 if (status != DS_SUCCESS) {
2525 tprintf(
"[DS] Profile file not available (%s); performing profiling.\n",
2529 TessScoreEvaluationInputData input;
2530 populateTessScoreEvaluationInputData(&input);
2533 unsigned int numUpdates;
2534 status = profileDevices(profile, DS_EVALUATE_ALL,
2535 evaluateScoreForDevice, &input, &numUpdates);
2538 if (status == DS_SUCCESS) {
2539 status = writeProfileToFile(profile, serializeScore, fileName);
2541 if (status == DS_SUCCESS) {
2542 tprintf(
"[DS] Scores written to file (%s).\n", fileName);
2545 "[DS] Error saving scores to file (%s); scores not written to " 2551 "[DS] Unable to evaluate performance; scores not written to " 2556 tprintf("[DS] Profile read from file (%s).\n", fileName);
2561 float bestTime = FLT_MAX;
2562 int bestDeviceIdx = -1;
2563 for (
unsigned d = 0; d < profile->numDevices; d++) {
2564 ds_device device = profile->devices[d];
2565 if (device.score ==
nullptr)
continue;
2566 TessDeviceScore score = *device.score;
2568 float time = score.time;
2569 tprintf(
"[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2570 device.oclDeviceName, time);
2571 if (time < bestTime) {
2576 if (bestDeviceIdx >= 0) {
2577 tprintf(
"[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2578 profile->devices[bestDeviceIdx].oclDeviceName,
2579 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2586 bool overridden =
false;
2587 char* overrideDeviceStr = getenv(
"TESSERACT_OPENCL_DEVICE");
2588 if (overrideDeviceStr !=
nullptr) {
2589 int overrideDeviceIdx = atoi(overrideDeviceStr);
2590 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2592 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, " 2594 overrideDeviceStr, overrideDeviceIdx);
2595 bestDeviceIdx = overrideDeviceIdx - 1;
2599 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are " 2600 "valid devices).\n",
2601 overrideDeviceStr, profile->numDevices);
2606 tprintf(
"[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2607 profile->devices[bestDeviceIdx].oclDeviceName,
2608 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2612 selectedDevice = profile->devices[bestDeviceIdx];
2614 releaseDSProfile(profile, releaseScore);
2617 tprintf(
"[DS] OpenCL runtime not available.\n");
2618 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2619 selectedDevice.oclDeviceName =
"(null)";
2620 selectedDevice.score =
nullptr;
2621 selectedDevice.oclDeviceID =
nullptr;
2622 selectedDevice.oclDriverVersion =
nullptr;
2624 deviceIsSelected =
true;
2629 return selectedDevice;
2632 bool OpenclDevice::selectedDeviceIsOpenCL() {
2633 ds_device device = getDeviceSelection();
2634 return (device.type == DS_DEVICE_OPENCL_DEVICE);
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
#define PERF_COUNT_START(FUNCT_NAME)
DLLSYM void tprintf(const char *format,...)
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
#define PERF_COUNT_SUB(SUB)