tesseract  4.0.0-1-g2a2b
openclwrapper.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 #ifdef _WIN32
11 #include <io.h>
12 #else
13 #include <sys/types.h>
14 #include <unistd.h>
15 #endif
16 #include <cfloat>
17 
18 #include "oclkernels.h"
19 #include "openclwrapper.h"
20 
21 // for micro-benchmark
22 #include "otsuthr.h"
23 #include "thresholder.h"
24 
25 #if ON_APPLE
26 #include <mach/mach_time.h>
27 #endif
28 
29 #ifdef USE_OPENCL
30 
31 #include <cstdio>
32 #include <vector>
33 
34 #include "errcode.h" // for ASSERT_HOST
36 
37 GPUEnv OpenclDevice::gpuEnv;
38 
39 bool OpenclDevice::deviceIsSelected = false;
40 ds_device OpenclDevice::selectedDevice;
41 
42 int OpenclDevice::isInited = 0;
43 
44 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
45 
46 static const l_uint32 lmask32[] = {
47  0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
48  0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
49  0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
50  0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
51  0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
52  0xfffffffe, 0xffffffff};
53 
54 static const l_uint32 rmask32[] = {
55  0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
56  0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
57  0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
58  0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
59  0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
60  0x7fffffff, 0xffffffff};
61 
62 static cl_mem pixsCLBuffer, pixdCLBuffer,
63  pixdCLIntermediate; // Morph operations buffers
64 static cl_mem pixThBuffer; // output from thresholdtopix calculation
65 static cl_int clStatus;
66 static KernelEnv rEnv;
67 
68 #define DS_TAG_VERSION "<version>"
69 #define DS_TAG_VERSION_END "</version>"
70 #define DS_TAG_DEVICE "<device>"
71 #define DS_TAG_DEVICE_END "</device>"
72 #define DS_TAG_SCORE "<score>"
73 #define DS_TAG_SCORE_END "</score>"
74 #define DS_TAG_DEVICE_TYPE "<type>"
75 #define DS_TAG_DEVICE_TYPE_END "</type>"
76 #define DS_TAG_DEVICE_NAME "<name>"
77 #define DS_TAG_DEVICE_NAME_END "</name>"
78 #define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
79 #define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
80 
81 #define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
82 
83 #define DS_DEVICE_NAME_LENGTH 256
84 
85 enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
86 
87 struct ds_profile {
88  std::vector<ds_device> devices;
89  unsigned int numDevices;
90  const char* version;
91 };
92 
93 enum ds_status {
94  DS_SUCCESS = 0,
95  DS_INVALID_PROFILE = 1000,
96  DS_MEMORY_ERROR,
97  DS_INVALID_PERF_EVALUATOR_TYPE,
98  DS_INVALID_PERF_EVALUATOR,
99  DS_PERF_EVALUATOR_ERROR,
100  DS_FILE_ERROR,
101  DS_UNKNOWN_DEVICE_TYPE,
102  DS_PROFILE_FILE_ERROR,
103  DS_SCORE_SERIALIZER_ERROR,
104  DS_SCORE_DESERIALIZER_ERROR
105 };
106 
107 // Pointer to a function that calculates the score of a device (ex:
108 // device->score) update the data size of score. The encoding and the format
109 // of the score data is implementation defined. The function should return
110 // DS_SUCCESS if there's no error to be reported.
111 typedef ds_status (*ds_perf_evaluator)(ds_device* device, void* data);
112 
113 // deallocate memory used by score
114 typedef ds_status (*ds_score_release)(TessDeviceScore* score);
115 
116 static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
117  ds_status status = DS_SUCCESS;
118  if (profile != nullptr) {
119  if (sr != nullptr) {
120  unsigned int i;
121  for (i = 0; i < profile->numDevices; i++) {
122  free(profile->devices[i].oclDeviceName);
123  free(profile->devices[i].oclDriverVersion);
124  status = sr(profile->devices[i].score);
125  if (status != DS_SUCCESS) break;
126  }
127  }
128  delete profile;
129  }
130  return status;
131 }
132 
133 static ds_status initDSProfile(ds_profile** p, const char* version) {
134  int numDevices;
135  cl_uint numPlatforms;
136  std::vector<cl_platform_id> platforms;
137  std::vector <cl_device_id> devices;
138  ds_status status = DS_SUCCESS;
139  unsigned int next;
140  unsigned int i;
141 
142  if (p == nullptr) return DS_INVALID_PROFILE;
143 
144  ds_profile* profile = new ds_profile;
145 
146  memset(profile, 0, sizeof(ds_profile));
147 
148  clGetPlatformIDs(0, nullptr, &numPlatforms);
149 
150  if (numPlatforms > 0) {
151  platforms.reserve(numPlatforms);
152  clGetPlatformIDs(numPlatforms, &platforms[0], nullptr);
153  }
154 
155  numDevices = 0;
156  for (i = 0; i < numPlatforms; i++) {
157  cl_uint num;
158  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num);
159  numDevices += num;
160  }
161 
162  if (numDevices > 0) {
163  devices.reserve(numDevices);
164  }
165 
166  profile->numDevices =
167  numDevices + 1; // +1 to numDevices to include the native CPU
168  profile->devices.reserve(profile->numDevices);
169  memset(&profile->devices[0], 0, profile->numDevices * sizeof(ds_device));
170 
171  next = 0;
172  for (i = 0; i < numPlatforms; i++) {
173  cl_uint num;
174  unsigned j;
175  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
176  for (j = 0; j < num; j++, next++) {
177  char buffer[DS_DEVICE_NAME_LENGTH];
178  size_t length;
179 
180  profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
181  profile->devices[next].oclDeviceID = devices[j];
182 
183  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
184  DS_DEVICE_NAME_LENGTH, &buffer, nullptr);
185  length = strlen(buffer);
186  profile->devices[next].oclDeviceName = (char*)malloc(length + 1);
187  memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
188 
189  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
190  DS_DEVICE_NAME_LENGTH, &buffer, nullptr);
191  length = strlen(buffer);
192  profile->devices[next].oclDriverVersion = (char*)malloc(length + 1);
193  memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
194  }
195  }
196  profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
197  profile->version = version;
198 
199  *p = profile;
200  return status;
201 }
202 
203 static ds_status profileDevices(ds_profile* profile,
204  const ds_evaluation_type type,
205  ds_perf_evaluator evaluator,
206  void* evaluatorData, unsigned int* numUpdates) {
207  ds_status status = DS_SUCCESS;
208  unsigned int i;
209  unsigned int updates = 0;
210 
211  if (profile == nullptr) {
212  return DS_INVALID_PROFILE;
213  }
214  if (evaluator == nullptr) {
215  return DS_INVALID_PERF_EVALUATOR;
216  }
217 
218  for (i = 0; i < profile->numDevices; i++) {
219  ds_status evaluatorStatus;
220 
221  switch (type) {
222  case DS_EVALUATE_NEW_ONLY:
223  if (profile->devices[i].score != nullptr) break;
224  // else fall through
225  case DS_EVALUATE_ALL:
226  evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
227  if (evaluatorStatus != DS_SUCCESS) {
228  status = evaluatorStatus;
229  return status;
230  }
231  updates++;
232  break;
233  default:
234  return DS_INVALID_PERF_EVALUATOR_TYPE;
235  break;
236  };
237  }
238  if (numUpdates) *numUpdates = updates;
239  return status;
240 }
241 
242 static const char* findString(const char* contentStart, const char* contentEnd,
243  const char* string) {
244  size_t stringLength;
245  const char* currentPosition;
246  const char* found = nullptr;
247  stringLength = strlen(string);
248  currentPosition = contentStart;
249  for (currentPosition = contentStart; currentPosition < contentEnd;
250  currentPosition++) {
251  if (*currentPosition == string[0]) {
252  if (currentPosition + stringLength < contentEnd) {
253  if (strncmp(currentPosition, string, stringLength) == 0) {
254  found = currentPosition;
255  break;
256  }
257  }
258  }
259  }
260  return found;
261 }
262 
263 static ds_status readProFile(const char* fileName, char** content,
264  size_t* contentSize) {
265  *contentSize = 0;
266  *content = nullptr;
267  ds_status status = DS_SUCCESS;
268  FILE* input = fopen(fileName, "rb");
269  if (input == nullptr) {
270  status = DS_FILE_ERROR;
271  } else {
272  fseek(input, 0L, SEEK_END);
273  long pos = ftell(input);
274  rewind(input);
275  if (pos > 0) {
276  size_t size = pos;
277  char *binary = new char[size];
278  if (fread(binary, sizeof(char), size, input) != size) {
279  status = DS_FILE_ERROR;
280  delete[] binary;
281  } else {
282  *contentSize = size;
283  *content = binary;
284  }
285  }
286  fclose(input);
287  }
288  return status;
289 }
290 
291 typedef ds_status (*ds_score_deserializer)(ds_device* device,
292  const unsigned char* serializedScore,
293  unsigned int serializedScoreSize);
294 
295 static ds_status readProfileFromFile(ds_profile* profile,
296  ds_score_deserializer deserializer,
297  const char* file) {
298  ds_status status = DS_SUCCESS;
299  char* contentStart;
300  size_t contentSize;
301 
302  if (profile == nullptr) return DS_INVALID_PROFILE;
303 
304  status = readProFile(file, &contentStart, &contentSize);
305  if (status == DS_SUCCESS) {
306  const char* currentPosition;
307  const char* dataStart;
308  const char* dataEnd;
309 
310  const char* contentEnd = contentStart + contentSize;
311  currentPosition = contentStart;
312 
313  // parse the version string
314  dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
315  if (dataStart == nullptr) {
316  status = DS_PROFILE_FILE_ERROR;
317  goto cleanup;
318  }
319  dataStart += strlen(DS_TAG_VERSION);
320 
321  dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
322  if (dataEnd == nullptr) {
323  status = DS_PROFILE_FILE_ERROR;
324  goto cleanup;
325  }
326 
327  size_t versionStringLength = strlen(profile->version);
328  if (versionStringLength + dataStart != dataEnd ||
329  strncmp(profile->version, dataStart, versionStringLength) != 0) {
330  // version mismatch
331  status = DS_PROFILE_FILE_ERROR;
332  goto cleanup;
333  }
334  currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
335 
336  // parse the device information
337  while (1) {
338  unsigned int i;
339 
340  const char* deviceTypeStart;
341  const char* deviceTypeEnd;
342  ds_device_type deviceType;
343 
344  const char* deviceNameStart;
345  const char* deviceNameEnd;
346 
347  const char* deviceScoreStart;
348  const char* deviceScoreEnd;
349 
350  const char* deviceDriverStart;
351  const char* deviceDriverEnd;
352 
353  dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
354  if (dataStart == nullptr) {
355  // nothing useful remain, quit...
356  break;
357  }
358  dataStart += strlen(DS_TAG_DEVICE);
359  dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
360  if (dataEnd == nullptr) {
361  status = DS_PROFILE_FILE_ERROR;
362  goto cleanup;
363  }
364 
365  // parse the device type
366  deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
367  if (deviceTypeStart == nullptr) {
368  status = DS_PROFILE_FILE_ERROR;
369  goto cleanup;
370  }
371  deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
372  deviceTypeEnd =
373  findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
374  if (deviceTypeEnd == nullptr) {
375  status = DS_PROFILE_FILE_ERROR;
376  goto cleanup;
377  }
378  memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
379 
380  // parse the device name
381  if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
382  deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
383  if (deviceNameStart == nullptr) {
384  status = DS_PROFILE_FILE_ERROR;
385  goto cleanup;
386  }
387  deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
388  deviceNameEnd =
389  findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
390  if (deviceNameEnd == nullptr) {
391  status = DS_PROFILE_FILE_ERROR;
392  goto cleanup;
393  }
394 
395  deviceDriverStart =
396  findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
397  if (deviceDriverStart == nullptr) {
398  status = DS_PROFILE_FILE_ERROR;
399  goto cleanup;
400  }
401  deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
402  deviceDriverEnd = findString(deviceDriverStart, contentEnd,
403  DS_TAG_DEVICE_DRIVER_VERSION_END);
404  if (deviceDriverEnd == nullptr) {
405  status = DS_PROFILE_FILE_ERROR;
406  goto cleanup;
407  }
408 
409  // check if this device is on the system
410  for (i = 0; i < profile->numDevices; i++) {
411  if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
412  size_t actualDeviceNameLength;
413  size_t driverVersionLength;
414 
415  actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
416  driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
417  if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
418  deviceDriverStart + driverVersionLength == deviceDriverEnd &&
419  strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
420  actualDeviceNameLength) == 0 &&
421  strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
422  driverVersionLength) == 0) {
423  deviceScoreStart =
424  findString(dataStart, contentEnd, DS_TAG_SCORE);
425  deviceScoreStart += strlen(DS_TAG_SCORE);
426  deviceScoreEnd =
427  findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
428  status = deserializer(&profile->devices[i],
429  (const unsigned char*)deviceScoreStart,
430  deviceScoreEnd - deviceScoreStart);
431  if (status != DS_SUCCESS) {
432  goto cleanup;
433  }
434  }
435  }
436  }
437  } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
438  for (i = 0; i < profile->numDevices; i++) {
439  if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
440  deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
441  if (deviceScoreStart == nullptr) {
442  status = DS_PROFILE_FILE_ERROR;
443  goto cleanup;
444  }
445  deviceScoreStart += strlen(DS_TAG_SCORE);
446  deviceScoreEnd =
447  findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
448  status = deserializer(&profile->devices[i],
449  (const unsigned char*)deviceScoreStart,
450  deviceScoreEnd - deviceScoreStart);
451  if (status != DS_SUCCESS) {
452  goto cleanup;
453  }
454  }
455  }
456  }
457 
458  // skip over the current one to find the next device
459  currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
460  }
461  }
462 cleanup:
463  delete[] contentStart;
464  return status;
465 }
466 
467 typedef ds_status (*ds_score_serializer)(ds_device* device,
468  void** serializedScore,
469  unsigned int* serializedScoreSize);
470 static ds_status writeProfileToFile(ds_profile* profile,
471  ds_score_serializer serializer,
472  const char* file) {
473  ds_status status = DS_SUCCESS;
474 
475  if (profile == nullptr) return DS_INVALID_PROFILE;
476 
477  FILE* profileFile = fopen(file, "wb");
478  if (profileFile == nullptr) {
479  status = DS_FILE_ERROR;
480  } else {
481  unsigned int i;
482 
483  // write version string
484  fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
485  fwrite(profile->version, sizeof(char), strlen(profile->version),
486  profileFile);
487  fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END),
488  profileFile);
489  fwrite("\n", sizeof(char), 1, profileFile);
490 
491  for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
492  void* serializedScore;
493  unsigned int serializedScoreSize;
494 
495  fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
496 
497  fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE),
498  profileFile);
499  fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
500  fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char),
501  strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
502 
503  switch (profile->devices[i].type) {
504  case DS_DEVICE_NATIVE_CPU: {
505  // There's no need to emit a device name for the native CPU device.
506  /*
507  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
508  profileFile);
509  fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
510  strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
511  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
512  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
513  */
514  } break;
515  case DS_DEVICE_OPENCL_DEVICE: {
516  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
517  profileFile);
518  fwrite(profile->devices[i].oclDeviceName, sizeof(char),
519  strlen(profile->devices[i].oclDeviceName), profileFile);
520  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
521  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
522 
523  fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char),
524  strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
525  fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
526  strlen(profile->devices[i].oclDriverVersion), profileFile);
527  fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
528  strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
529  } break;
530  default:
531  status = DS_UNKNOWN_DEVICE_TYPE;
532  continue;
533  };
534 
535  fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
536  status = serializer(&profile->devices[i], &serializedScore,
537  &serializedScoreSize);
538  if (status == DS_SUCCESS && serializedScore != nullptr &&
539  serializedScoreSize > 0) {
540  fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
541  free(serializedScore);
542  }
543  fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END),
544  profileFile);
545  fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END),
546  profileFile);
547  fwrite("\n", sizeof(char), 1, profileFile);
548  }
549  fclose(profileFile);
550  }
551  return status;
552 }
553 
554 // substitute invalid characters in device name with _
555 static void legalizeFileName(char* fileName) {
556  // tprintf("fileName: %s\n", fileName);
557  const char* invalidChars =
558  "/\?:*\"><| "; // space is valid but can cause headaches
559  // for each invalid char
560  for (unsigned i = 0; i < strlen(invalidChars); i++) {
561  char invalidStr[4];
562  invalidStr[0] = invalidChars[i];
563  invalidStr[1] = '\0';
564  // tprintf("eliminating %s\n", invalidStr);
565  // char *pos = strstr(fileName, invalidStr);
566  // initial ./ is valid for present directory
567  // if (*pos == '.') pos++;
568  // if (*pos == '/') pos++;
569  for (char* pos = strstr(fileName, invalidStr); pos != nullptr;
570  pos = strstr(pos + 1, invalidStr)) {
571  // tprintf("\tfound: %s, ", pos);
572  pos[0] = '_';
573  // tprintf("fileName: %s\n", fileName);
574  }
575  }
576 }
577 
578 static void populateGPUEnvFromDevice(GPUEnv* gpuInfo, cl_device_id device) {
579  // tprintf("[DS] populateGPUEnvFromDevice\n");
580  size_t size;
581  gpuInfo->mnIsUserCreated = 1;
582  // device
583  gpuInfo->mpDevID = device;
584  gpuInfo->mpArryDevsID = new cl_device_id[1];
585  gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
586  clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
587  sizeof(cl_device_type), &gpuInfo->mDevType, &size);
588  CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
589  // platform
590  clStatus =
591  clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
592  sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
593  CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
594  // context
595  cl_context_properties props[3];
596  props[0] = CL_CONTEXT_PLATFORM;
597  props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
598  props[2] = 0;
599  gpuInfo->mpContext =
600  clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, nullptr, &clStatus);
601  CHECK_OPENCL(clStatus, "populateGPUEnv::createContext");
602  // queue
603  cl_command_queue_properties queueProperties = 0;
604  gpuInfo->mpCmdQueue = clCreateCommandQueue(
605  gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
606  CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue");
607 }
608 
609 int OpenclDevice::LoadOpencl() {
610 #ifdef WIN32
611  HINSTANCE HOpenclDll = nullptr;
612  void* OpenclDll = nullptr;
613  // fprintf(stderr, " LoadOpenclDllxx... \n");
614  OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
615  OpenclDll = LoadLibrary("openCL.dll");
616  if (!static_cast<HINSTANCE>(OpenclDll)) {
617  fprintf(stderr, "[OD] Load opencl.dll failed!\n");
618  FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
619  return 0;
620  }
621  fprintf(stderr, "[OD] Load opencl.dll successful!\n");
622 #endif
623  return 1;
624 }
625 int OpenclDevice::SetKernelEnv(KernelEnv* envInfo) {
626  envInfo->mpkContext = gpuEnv.mpContext;
627  envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
628  envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
629 
630  return 1;
631 }
632 
633 static cl_mem allocateZeroCopyBuffer(const KernelEnv& rEnv,
634  l_uint32* hostbuffer, size_t nElements,
635  cl_mem_flags flags, cl_int* pStatus) {
636  cl_mem membuffer =
637  clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
638  nElements * sizeof(l_uint32), hostbuffer, pStatus);
639 
640  return membuffer;
641 }
642 
643 static Pix* mapOutputCLBuffer(const KernelEnv& rEnv, cl_mem clbuffer, Pix* pixd,
644  Pix* pixs, int elements, cl_mem_flags flags,
645  bool memcopy = false, bool sync = true) {
646  if (!pixd) {
647  if (memcopy) {
648  if ((pixd = pixCreateTemplate(pixs)) == nullptr)
649  tprintf("pixd not made\n");
650  } else {
651  if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
652  pixGetDepth(pixs))) == nullptr)
653  tprintf("pixd not made\n");
654  }
655  }
656  l_uint32* pValues = (l_uint32*)clEnqueueMapBuffer(
657  rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
658  elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr);
659 
660  if (memcopy) {
661  memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
662  } else {
663  pixSetData(pixd, pValues);
664  }
665 
666  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr,
667  nullptr);
668 
669  if (sync) {
670  clFinish(rEnv.mpkCmdQueue);
671  }
672 
673  return pixd;
674 }
675 
676 void OpenclDevice::releaseMorphCLBuffers() {
677  if (pixdCLIntermediate != nullptr) clReleaseMemObject(pixdCLIntermediate);
678  if (pixsCLBuffer != nullptr) clReleaseMemObject(pixsCLBuffer);
679  if (pixdCLBuffer != nullptr) clReleaseMemObject(pixdCLBuffer);
680  if (pixThBuffer != nullptr) clReleaseMemObject(pixThBuffer);
681  pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr;
682 }
683 
684 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs) {
685  SetKernelEnv(&rEnv);
686 
687  if (pixThBuffer != nullptr) {
688  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
689  CL_MEM_ALLOC_HOST_PTR, &clStatus);
690 
691  // Get the output from ThresholdToPix operation
692  clStatus =
693  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
694  sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr);
695  } else {
696  // Get data from the source image
697  l_uint32* srcdata =
698  reinterpret_cast<l_uint32*>(malloc(wpl * h * sizeof(l_uint32)));
699  memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32));
700 
701  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h,
702  CL_MEM_USE_HOST_PTR, &clStatus);
703  }
704 
705  pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
706  CL_MEM_ALLOC_HOST_PTR, &clStatus);
707 
708  pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
709  CL_MEM_ALLOC_HOST_PTR, &clStatus);
710 
711  return (int)clStatus;
712 }
713 
714 int OpenclDevice::InitEnv() {
715 // PERF_COUNT_START("OD::InitEnv")
716 // tprintf("[OD] OpenclDevice::InitEnv()\n");
717 #ifdef SAL_WIN32
718  while (1) {
719  if (1 == LoadOpencl()) break;
720  }
721  PERF_COUNT_SUB("LoadOpencl")
722 #endif
723  // sets up environment, compiles programs
724 
725  InitOpenclRunEnv_DeviceSelection(0);
726  // PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
727  // PERF_COUNT_END
728  return 1;
729 }
730 
731 int OpenclDevice::ReleaseOpenclRunEnv() {
732  ReleaseOpenclEnv(&gpuEnv);
733 #ifdef SAL_WIN32
734  FreeOpenclDll();
735 #endif
736  return 1;
737 }
738 
739 inline int OpenclDevice::AddKernelConfig(int kCount, const char* kName) {
740  ASSERT_HOST(kCount > 0);
741  ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
742  strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
743  gpuEnv.mnKernelCount++;
744  return 0;
745 }
746 
747 int OpenclDevice::RegistOpenclKernel() {
748  if (!gpuEnv.mnIsUserCreated) memset(&gpuEnv, 0, sizeof(gpuEnv));
749 
750  gpuEnv.mnFileCount = 0; // argc;
751  gpuEnv.mnKernelCount = 0UL;
752 
753  AddKernelConfig(1, "oclAverageSub1");
754  return 0;
755 }
756 
757 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(int argc) {
758  // PERF_COUNT_START("InitOpenclRunEnv_DS")
759  if (!isInited) {
760  // after programs compiled, selects best device
761  ds_device bestDevice_DS = getDeviceSelection();
762  // PERF_COUNT_SUB("called getDeviceSelection()")
763  cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
764  // overwrite global static GPUEnv with new device
765  if (selectedDeviceIsOpenCL()) {
766  // tprintf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice()
767  // for selected device\n");
768  populateGPUEnvFromDevice(&gpuEnv, bestDevice);
769  gpuEnv.mnFileCount = 0; // argc;
770  gpuEnv.mnKernelCount = 0UL;
771  // PERF_COUNT_SUB("populate gpuEnv")
772  CompileKernelFile(&gpuEnv, "");
773  // PERF_COUNT_SUB("CompileKernelFile")
774  } else {
775  // tprintf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice()
776  // b/c native cpu selected\n");
777  }
778  isInited = 1;
779  }
780  // PERF_COUNT_END
781  return 0;
782 }
783 
784 OpenclDevice::OpenclDevice() {
785  // InitEnv();
786 }
787 
788 OpenclDevice::~OpenclDevice() {
789  // ReleaseOpenclRunEnv();
790 }
791 
792 int OpenclDevice::ReleaseOpenclEnv(GPUEnv* gpuInfo) {
793  int i = 0;
794  int clStatus = 0;
795 
796  if (!isInited) {
797  return 1;
798  }
799 
800  for (i = 0; i < gpuEnv.mnFileCount; i++) {
801  if (gpuEnv.mpArryPrograms[i]) {
802  clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
803  CHECK_OPENCL(clStatus, "clReleaseProgram");
804  gpuEnv.mpArryPrograms[i] = nullptr;
805  }
806  }
807  if (gpuEnv.mpCmdQueue) {
808  clReleaseCommandQueue(gpuEnv.mpCmdQueue);
809  gpuEnv.mpCmdQueue = nullptr;
810  }
811  if (gpuEnv.mpContext) {
812  clReleaseContext(gpuEnv.mpContext);
813  gpuEnv.mpContext = nullptr;
814  }
815  isInited = 0;
816  gpuInfo->mnIsUserCreated = 0;
817  delete[] gpuInfo->mpArryDevsID;
818  return 1;
819 }
820 int OpenclDevice::BinaryGenerated(const char* clFileName, FILE** fhandle) {
821  unsigned int i = 0;
822  cl_int clStatus;
823  int status = 0;
824  FILE* fd = nullptr;
825  char fileName[256] = {0}, cl_name[128] = {0};
826  char deviceName[1024];
827  clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
828  sizeof(deviceName), deviceName, nullptr);
829  CHECK_OPENCL(clStatus, "clGetDeviceInfo");
830  const char* str = strstr(clFileName, ".cl");
831  memcpy(cl_name, clFileName, str - clFileName);
832  cl_name[str - clFileName] = '\0';
833  sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
834  legalizeFileName(fileName);
835  fd = fopen(fileName, "rb");
836  status = (fd != nullptr) ? 1 : 0;
837  if (fd != nullptr) {
838  *fhandle = fd;
839  }
840  return status;
841 }
842 int OpenclDevice::CachedOfKernerPrg(const GPUEnv* gpuEnvCached,
843  const char* clFileName) {
844  int i;
845  for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
846  if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
847  if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
848  return 1;
849  }
850  }
851  }
852 
853  return 0;
854 }
855 int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary,
856  size_t numBytes) {
857  FILE* output = nullptr;
858  output = fopen(fileName, "wb");
859  if (output == nullptr) {
860  return 0;
861  }
862 
863  fwrite(birary, sizeof(char), numBytes, output);
864  fclose(output);
865 
866  return 1;
867 }
868 
869 int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
870  const char* clFileName) {
871  unsigned int i = 0;
872  cl_int clStatus;
873  cl_uint numDevices;
874 
875  clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
876  sizeof(numDevices), &numDevices, nullptr);
877  CHECK_OPENCL(clStatus, "clGetProgramInfo");
878 
879  std::vector<cl_device_id> mpArryDevsID(numDevices);
880 
881  /* grab the handles to all of the devices in the program. */
882  clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
883  sizeof(cl_device_id) * numDevices,
884  &mpArryDevsID[0],
885  nullptr);
886  CHECK_OPENCL(clStatus, "clGetProgramInfo");
887 
888  /* figure out the sizes of each of the binaries. */
889  std::vector<size_t> binarySizes(numDevices);
890 
891  clStatus =
892  clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
893  sizeof(size_t) * numDevices, &binarySizes[0], nullptr);
894  CHECK_OPENCL(clStatus, "clGetProgramInfo");
895 
896  /* copy over all of the generated binaries. */
897  std::vector<char*> binaries(numDevices);
898 
899  for (i = 0; i < numDevices; i++) {
900  if (binarySizes[i] != 0) {
901  binaries[i] = new char[binarySizes[i]];
902  } else {
903  binaries[i] = nullptr;
904  }
905  }
906 
907  clStatus =
908  clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char*) * numDevices,
909  &binaries[0], nullptr);
910  CHECK_OPENCL(clStatus, "clGetProgramInfo");
911 
912  /* dump out each binary into its own separate file. */
913  for (i = 0; i < numDevices; i++) {
914  char fileName[256] = {0}, cl_name[128] = {0};
915 
916  if (binarySizes[i] != 0) {
917  char deviceName[1024];
918  clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
919  sizeof(deviceName), deviceName, nullptr);
920  CHECK_OPENCL(clStatus, "clGetDeviceInfo");
921 
922  const char* str = strstr(clFileName, ".cl");
923  memcpy(cl_name, clFileName, str - clFileName);
924  cl_name[str - clFileName] = '\0';
925  sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
926  legalizeFileName(fileName);
927  if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
928  tprintf("[OD] write binary[%s] failed\n", fileName);
929  return 0;
930  } // else
931  tprintf("[OD] write binary[%s] successfully\n", fileName);
932  }
933  }
934 
935  // Release all resources and memory
936  for (i = 0; i < numDevices; i++) {
937  delete[] binaries[i];
938  }
939 
940  return 1;
941 }
942 
943 int OpenclDevice::CompileKernelFile(GPUEnv* gpuInfo, const char* buildOption) {
944  // PERF_COUNT_START("CompileKernelFile")
945  cl_int clStatus = 0;
946  const char* source;
947  size_t source_size[1];
948  int binary_status, binaryExisted, idx;
949  cl_uint numDevices;
950  FILE *fd, *fd1;
951  const char* filename = "kernel.cl";
952  // fprintf(stderr, "[OD] CompileKernelFile ... \n");
953  if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
954  return 1;
955  }
956 
957  idx = gpuInfo->mnFileCount;
958 
959  source = kernel_src;
960 
961  source_size[0] = strlen(source);
962  binaryExisted = 0;
963  binaryExisted = BinaryGenerated(
964  filename, &fd); // don't check for binary during microbenchmark
965  // PERF_COUNT_SUB("BinaryGenerated")
966  if (binaryExisted == 1) {
967  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
968  sizeof(numDevices), &numDevices, nullptr);
969  CHECK_OPENCL(clStatus, "clGetContextInfo");
970 
971  std::vector<cl_device_id> mpArryDevsID(numDevices);
972  // PERF_COUNT_SUB("get numDevices")
973  bool b_error = fseek(fd, 0, SEEK_END) < 0;
974  long pos = ftell(fd);
975  b_error |= (pos <= 0);
976  size_t length = pos;
977  b_error |= fseek(fd, 0, SEEK_SET) < 0;
978  if (b_error) {
979  fclose(fd);
980  return 0;
981  }
982 
983  std::vector<uint8_t> binary(length + 2);
984 
985  memset(&binary[0], 0, length + 2);
986  b_error |= fread(&binary[0], 1, length, fd) != length;
987 
988  fclose(fd);
989  // PERF_COUNT_SUB("read file")
990  fd = nullptr;
991  // grab the handles to all of the devices in the context.
992  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
993  sizeof(cl_device_id) * numDevices,
994  &mpArryDevsID[0], nullptr);
995  CHECK_OPENCL(clStatus, "clGetContextInfo");
996  // PERF_COUNT_SUB("get devices")
997  // fprintf(stderr, "[OD] Create kernel from binary\n");
998  const uint8_t* c_binary = &binary[0];
999  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary(
1000  gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length, &c_binary,
1001  &binary_status, &clStatus);
1002  CHECK_OPENCL(clStatus, "clCreateProgramWithBinary");
1003  // PERF_COUNT_SUB("clCreateProgramWithBinary")
1004  // PERF_COUNT_SUB("binaryExisted")
1005  } else {
1006  // create a CL program using the kernel source
1007  // fprintf(stderr, "[OD] Create kernel from source\n");
1008  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource(
1009  gpuInfo->mpContext, 1, &source, source_size, &clStatus);
1010  CHECK_OPENCL(clStatus, "clCreateProgramWithSource");
1011  // PERF_COUNT_SUB("!binaryExisted")
1012  }
1013 
1014  if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) {
1015  return 0;
1016  }
1017 
1018  // char options[512];
1019  // create a cl program executable for all the devices specified
1020  // tprintf("[OD] BuildProgram.\n");
1021  PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
1022  if (!gpuInfo->mnIsUserCreated) {
1023  clStatus =
1024  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1025  buildOption, nullptr, nullptr);
1026  // PERF_COUNT_SUB("clBuildProgram notUserCreated")
1027  } else {
1028  clStatus =
1029  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1030  buildOption, nullptr, nullptr);
1031  // PERF_COUNT_SUB("clBuildProgram isUserCreated")
1032  }
1034  if (clStatus != CL_SUCCESS) {
1035  tprintf("BuildProgram error!\n");
1036  size_t length;
1037  if (!gpuInfo->mnIsUserCreated) {
1038  clStatus = clGetProgramBuildInfo(
1039  gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1040  CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1041  } else {
1042  clStatus =
1043  clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1044  CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1045  }
1046  if (clStatus != CL_SUCCESS) {
1047  tprintf("opencl create build log fail\n");
1048  return 0;
1049  }
1050  std::vector<char> buildLog(length);
1051  if (!gpuInfo->mnIsUserCreated) {
1052  clStatus = clGetProgramBuildInfo(
1053  gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1054  CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1055  } else {
1056  clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
1057  gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG,
1058  length, &buildLog[0], &length);
1059  }
1060  if (clStatus != CL_SUCCESS) {
1061  tprintf("opencl program build info fail\n");
1062  return 0;
1063  }
1064 
1065  fd1 = fopen("kernel-build.log", "w+");
1066  if (fd1 != nullptr) {
1067  fwrite(&buildLog[0], sizeof(char), length, fd1);
1068  fclose(fd1);
1069  }
1070 
1071  // PERF_COUNT_SUB("build error log")
1072  return 0;
1073  }
1074 
1075  strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1076  // PERF_COUNT_SUB("strcpy")
1077  if (binaryExisted == 0) {
1078  GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1079  PERF_COUNT_SUB("GenerateBinFromKernelSource")
1080  }
1081 
1082  gpuInfo->mnFileCount += 1;
1083  // PERF_COUNT_END
1084  return 1;
1085 }
1086 
1087 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w,
1088  l_int32 h, l_int32 wpl,
1089  l_uint32* line) {
1090  PERF_COUNT_START("pixReadFromTiffKernel")
1091  cl_int clStatus;
1092  KernelEnv rEnv;
1093  size_t globalThreads[2];
1094  size_t localThreads[2];
1095  int gsize;
1096  cl_mem valuesCl;
1097  cl_mem outputCl;
1098 
1099  // global and local work dimensions for Horizontal pass
1100  gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1101  globalThreads[0] = gsize;
1102  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1103  globalThreads[1] = gsize;
1104  localThreads[0] = GROUPSIZE_X;
1105  localThreads[1] = GROUPSIZE_Y;
1106 
1107  SetKernelEnv(&rEnv);
1108 
1109  l_uint32* pResult = (l_uint32*)malloc(w * h * sizeof(l_uint32));
1110  rEnv.mpkKernel =
1111  clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus);
1112  CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1113 
1114  // Allocate input and output OCL buffers
1115  valuesCl = allocateZeroCopyBuffer(
1116  rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1117  outputCl = allocateZeroCopyBuffer(
1118  rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1119 
1120  // Kernel arguments
1121  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1122  CHECK_OPENCL(clStatus, "clSetKernelArg");
1123  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1124  CHECK_OPENCL(clStatus, "clSetKernelArg");
1125  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1126  CHECK_OPENCL(clStatus, "clSetKernelArg");
1127  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1128  CHECK_OPENCL(clStatus, "clSetKernelArg");
1129  clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1130  CHECK_OPENCL(clStatus, "clSetKernelArg");
1131 
1132  // Kernel enqueue
1133  PERF_COUNT_SUB("before")
1134  clStatus =
1135  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1136  globalThreads, localThreads, 0, nullptr, nullptr);
1137  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1138 
1139  /* map results back from gpu */
1140  void* ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE,
1141  CL_MAP_READ, 0, w * h * sizeof(l_uint32), 0,
1142  nullptr, nullptr, &clStatus);
1143  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1144  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
1145 
1146  // Sync
1147  clFinish(rEnv.mpkCmdQueue);
1148  PERF_COUNT_SUB("kernel & map")
1150  return pResult;
1151 }
1152 
1153 // Morphology Dilate operation for 5x5 structuring element. Invokes the relevant
1154 // OpenCL kernels
1155 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1156  size_t globalThreads[2];
1157  cl_mem pixtemp;
1158  cl_int status;
1159  int gsize;
1160  size_t localThreads[2];
1161 
1162  // Horizontal pass
1163  gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1164  globalThreads[0] = gsize;
1165  globalThreads[1] = GROUPSIZE_HMORY;
1166  localThreads[0] = GROUPSIZE_HMORX;
1167  localThreads[1] = GROUPSIZE_HMORY;
1168 
1169  rEnv.mpkKernel =
1170  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_5x5", &status);
1171  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1172 
1173  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1174  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1175  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1176  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1177 
1178  status =
1179  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1180  globalThreads, localThreads, 0, nullptr, nullptr);
1181 
1182  // Swap source and dest buffers
1183  pixtemp = pixsCLBuffer;
1184  pixsCLBuffer = pixdCLBuffer;
1185  pixdCLBuffer = pixtemp;
1186 
1187  // Vertical
1188  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1189  globalThreads[0] = gsize;
1190  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1191  globalThreads[1] = gsize;
1192  localThreads[0] = GROUPSIZE_X;
1193  localThreads[1] = GROUPSIZE_Y;
1194 
1195  rEnv.mpkKernel =
1196  clCreateKernel(rEnv.mpkProgram, "morphoDilateVer_5x5", &status);
1197  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1198 
1199  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1200  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1201  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1202  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1203  status =
1204  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1205  globalThreads, localThreads, 0, nullptr, nullptr);
1206 
1207  return status;
1208 }
1209 
1210 // Morphology Erode operation for 5x5 structuring element. Invokes the relevant
1211 // OpenCL kernels
1212 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1213  size_t globalThreads[2];
1214  cl_mem pixtemp;
1215  cl_int status;
1216  int gsize;
1217  l_uint32 fwmask, lwmask;
1218  size_t localThreads[2];
1219 
1220  lwmask = lmask32[31 - 2];
1221  fwmask = rmask32[31 - 2];
1222 
1223  // Horizontal pass
1224  gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1225  globalThreads[0] = gsize;
1226  globalThreads[1] = GROUPSIZE_HMORY;
1227  localThreads[0] = GROUPSIZE_HMORX;
1228  localThreads[1] = GROUPSIZE_HMORY;
1229 
1230  rEnv.mpkKernel =
1231  clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_5x5", &status);
1232  CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1233 
1234  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1235  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1236  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1237  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1238 
1239  status =
1240  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1241  globalThreads, localThreads, 0, nullptr, nullptr);
1242 
1243  // Swap source and dest buffers
1244  pixtemp = pixsCLBuffer;
1245  pixsCLBuffer = pixdCLBuffer;
1246  pixdCLBuffer = pixtemp;
1247 
1248  // Vertical
1249  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1250  globalThreads[0] = gsize;
1251  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1252  globalThreads[1] = gsize;
1253  localThreads[0] = GROUPSIZE_X;
1254  localThreads[1] = GROUPSIZE_Y;
1255 
1256  rEnv.mpkKernel =
1257  clCreateKernel(rEnv.mpkProgram, "morphoErodeVer_5x5", &status);
1258  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1259 
1260  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1261  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1262  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1263  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1264  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1265  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1266  status =
1267  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1268  globalThreads, localThreads, 0, nullptr, nullptr);
1269 
1270  return status;
1271 }
1272 
1273 // Morphology Dilate operation. Invokes the relevant OpenCL kernels
1274 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl,
1275  l_int32 h) {
1276  l_int32 xp, yp, xn, yn;
1277  SEL* sel;
1278  size_t globalThreads[2];
1279  cl_mem pixtemp;
1280  cl_int status = 0;
1281  int gsize;
1282  size_t localThreads[2];
1283  char isEven;
1284 
1285  OpenclDevice::SetKernelEnv(&rEnv);
1286 
1287  if (hsize == 5 && vsize == 5) {
1288  // Specific case for 5x5
1289  status = pixDilateCL_55(wpl, h);
1290  return status;
1291  }
1292 
1293  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1294 
1295  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1296  selDestroy(&sel);
1297  // global and local work dimensions for Horizontal pass
1298  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1299  globalThreads[0] = gsize;
1300  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1301  globalThreads[1] = gsize;
1302  localThreads[0] = GROUPSIZE_X;
1303  localThreads[1] = GROUPSIZE_Y;
1304 
1305  if (xp > 31 || xn > 31) {
1306  // Generic case.
1307  rEnv.mpkKernel =
1308  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1309  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1310 
1311  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1312  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1313  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1314  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1315  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1316  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1317  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1318  nullptr, globalThreads, localThreads, 0,
1319  nullptr, nullptr);
1320 
1321  if (yp > 0 || yn > 0) {
1322  pixtemp = pixsCLBuffer;
1323  pixsCLBuffer = pixdCLBuffer;
1324  pixdCLBuffer = pixtemp;
1325  }
1326  } else if (xp > 0 || xn > 0) {
1327  // Specific Horizontal pass kernel for half width < 32
1328  rEnv.mpkKernel =
1329  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1330  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1331  isEven = (xp != xn);
1332 
1333  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1334  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1335  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1336  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1337  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1338  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1339  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1340  nullptr, globalThreads, localThreads, 0,
1341  nullptr, nullptr);
1342 
1343  if (yp > 0 || yn > 0) {
1344  pixtemp = pixsCLBuffer;
1345  pixsCLBuffer = pixdCLBuffer;
1346  pixdCLBuffer = pixtemp;
1347  }
1348  }
1349 
1350  if (yp > 0 || yn > 0) {
1351  rEnv.mpkKernel =
1352  clCreateKernel(rEnv.mpkProgram, "morphoDilateVer", &status);
1353  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1354 
1355  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1356  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1357  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1358  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1359  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1360  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1361  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1362  nullptr, globalThreads, localThreads, 0,
1363  nullptr, nullptr);
1364  }
1365 
1366  return status;
1367 }
1368 
1369 // Morphology Erode operation. Invokes the relevant OpenCL kernels
1370 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl,
1371  l_uint32 h) {
1372  l_int32 xp, yp, xn, yn;
1373  SEL* sel;
1374  size_t globalThreads[2];
1375  size_t localThreads[2];
1376  cl_mem pixtemp;
1377  cl_int status = 0;
1378  int gsize;
1379  char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1380  l_uint32 rwmask, lwmask;
1381  char isEven;
1382 
1383  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1384 
1385  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1386  selDestroy(&sel);
1387  OpenclDevice::SetKernelEnv(&rEnv);
1388 
1389  if (hsize == 5 && vsize == 5 && isAsymmetric) {
1390  // Specific kernel for 5x5
1391  status = pixErodeCL_55(wpl, h);
1392  return status;
1393  }
1394 
1395  lwmask = lmask32[31 - (xn & 31)];
1396  rwmask = rmask32[31 - (xp & 31)];
1397 
1398  // global and local work dimensions for Horizontal pass
1399  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1400  globalThreads[0] = gsize;
1401  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1402  globalThreads[1] = gsize;
1403  localThreads[0] = GROUPSIZE_X;
1404  localThreads[1] = GROUPSIZE_Y;
1405 
1406  // Horizontal Pass
1407  if (xp > 31 || xn > 31) {
1408  // Generic case.
1409  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1410 
1411  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1412  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1413  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1414  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1415  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1416  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1417  status =
1418  clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1419  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1420  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1421  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1422  nullptr, globalThreads, localThreads, 0,
1423  nullptr, nullptr);
1424 
1425  if (yp > 0 || yn > 0) {
1426  pixtemp = pixsCLBuffer;
1427  pixsCLBuffer = pixdCLBuffer;
1428  pixdCLBuffer = pixtemp;
1429  }
1430  } else if (xp > 0 || xn > 0) {
1431  rEnv.mpkKernel =
1432  clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1433  isEven = (xp != xn);
1434 
1435  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1436  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1437  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1438  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1439  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1440  status =
1441  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1442  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1443  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1444  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1445  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1446  nullptr, globalThreads, localThreads, 0,
1447  nullptr, nullptr);
1448 
1449  if (yp > 0 || yn > 0) {
1450  pixtemp = pixsCLBuffer;
1451  pixsCLBuffer = pixdCLBuffer;
1452  pixdCLBuffer = pixtemp;
1453  }
1454  }
1455 
1456  // Vertical Pass
1457  if (yp > 0 || yn > 0) {
1458  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1459  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1460 
1461  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1462  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1463  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1464  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1465  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1466  status =
1467  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1468  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1469  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1470  nullptr, globalThreads, localThreads, 0,
1471  nullptr, nullptr);
1472  }
1473 
1474  return status;
1475 }
1476 
1477 // Morphology Open operation. Invokes the relevant OpenCL kernels
1478 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1479  cl_int status;
1480  cl_mem pixtemp;
1481 
1482  // Erode followed by Dilate
1483  status = pixErodeCL(hsize, vsize, wpl, h);
1484 
1485  pixtemp = pixsCLBuffer;
1486  pixsCLBuffer = pixdCLBuffer;
1487  pixdCLBuffer = pixtemp;
1488 
1489  status = pixDilateCL(hsize, vsize, wpl, h);
1490 
1491  return status;
1492 }
1493 
1494 // Morphology Close operation. Invokes the relevant OpenCL kernels
1495 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1496  cl_int status;
1497  cl_mem pixtemp;
1498 
1499  // Dilate followed by Erode
1500  status = pixDilateCL(hsize, vsize, wpl, h);
1501 
1502  pixtemp = pixsCLBuffer;
1503  pixsCLBuffer = pixdCLBuffer;
1504  pixdCLBuffer = pixtemp;
1505 
1506  status = pixErodeCL(hsize, vsize, wpl, h);
1507 
1508  return status;
1509 }
1510 
1511 // output = buffer1 & ~(buffer2)
1512 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1513  cl_mem buffer2, cl_mem outBuffer = nullptr) {
1514  cl_int status;
1515  size_t globalThreads[2];
1516  int gsize;
1517  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1518 
1519  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1520  globalThreads[0] = gsize;
1521  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1522  globalThreads[1] = gsize;
1523 
1524  if (outBuffer != nullptr) {
1525  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract", &status);
1526  CHECK_OPENCL(status, "clCreateKernel pixSubtract");
1527  } else {
1528  rEnv.mpkKernel =
1529  clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1530  CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1531  }
1532 
1533  // Enqueue a kernel run call.
1534  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1535  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1536  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1537  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1538  if (outBuffer != nullptr) {
1539  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outBuffer);
1540  }
1541  status =
1542  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1543  globalThreads, localThreads, 0, nullptr, nullptr);
1544 
1545  return status;
1546 }
1547 
1548 // OpenCL implementation of Get Lines from pix function
1549 // Note: Assumes the source and dest opencl buffer are initialized. No check
1550 // done
1551 void OpenclDevice::pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline,
1552  Pix** pix_hline, Pix** pixClosed,
1553  bool getpixClosed, l_int32 close_hsize,
1554  l_int32 close_vsize, l_int32 open_hsize,
1555  l_int32 open_vsize, l_int32 line_hsize,
1556  l_int32 line_vsize) {
1557  l_uint32 wpl, h;
1558  cl_mem pixtemp;
1559 
1560  wpl = pixGetWpl(pixs);
1561  h = pixGetHeight(pixs);
1562 
1563  // First step : Close Morph operation: Dilate followed by Erode
1564  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1565 
1566  // Copy the Close output to CPU buffer
1567  if (getpixClosed) {
1568  *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1569  wpl * h, CL_MAP_READ, true, false);
1570  }
1571 
1572  // Store the output of close operation in an intermediate buffer
1573  // this will be later used for pixsubtract
1574  clStatus =
1575  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1576  0, sizeof(int) * wpl * h, 0, nullptr, nullptr);
1577 
1578  // Second step: Open Operation - Erode followed by Dilate
1579  pixtemp = pixsCLBuffer;
1580  pixsCLBuffer = pixdCLBuffer;
1581  pixdCLBuffer = pixtemp;
1582 
1583  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1584 
1585  // Third step: Subtract : (Close - Open)
1586  pixtemp = pixsCLBuffer;
1587  pixsCLBuffer = pixdCLBuffer;
1588  pixdCLBuffer = pixdCLIntermediate;
1589  pixdCLIntermediate = pixtemp;
1590 
1591  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1592 
1593  // Store the output of Hollow operation in an intermediate buffer
1594  // this will be later used
1595  clStatus =
1596  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1597  0, sizeof(int) * wpl * h, 0, nullptr, nullptr);
1598 
1599  pixtemp = pixsCLBuffer;
1600  pixsCLBuffer = pixdCLBuffer;
1601  pixdCLBuffer = pixtemp;
1602 
1603  // Fourth step: Get vertical line
1604  // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
1605  clStatus = pixOpenCL(1, line_vsize, wpl, h);
1606 
1607  // Copy the vertical line output to CPU buffer
1608  *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1609  CL_MAP_READ, true, false);
1610 
1611  pixtemp = pixsCLBuffer;
1612  pixsCLBuffer = pixdCLIntermediate;
1613  pixdCLIntermediate = pixtemp;
1614 
1615  // Fifth step: Get horizontal line
1616  // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
1617  clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1618 
1619  // Copy the horizontal line output to CPU buffer
1620  *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1621  CL_MAP_READ, true, true);
1622 
1623  return;
1624 }
1625 
1626 /*************************************************************************
1627  * HistogramRect
1628  * Otsu Thresholding Operations
1629  * histogramAllChannels is laid out as all channel 0, then all channel 1...
1630  * only supports 1 or 4 channels (bytes_per_pixel)
1631  ************************************************************************/
1632 int OpenclDevice::HistogramRectOCL(void* imageData,
1633  int bytes_per_pixel, int bytes_per_line,
1634  int left, // always 0
1635  int top, // always 0
1636  int width, int height, int kHistogramSize,
1637  int* histogramAllChannels) {
1638  PERF_COUNT_START("HistogramRectOCL")
1639  cl_int clStatus;
1640  int retVal = 0;
1641  KernelEnv histKern;
1642  SetKernelEnv(&histKern);
1643  KernelEnv histRedKern;
1644  SetKernelEnv(&histRedKern);
1645  /* map imagedata to device as read only */
1646  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1647  // coherent which we don't need.
1648  // faster option would be to allocate initial image buffer
1649  // using a garlic bus memory type
1650  cl_mem imageBuffer = clCreateBuffer(
1651  histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1652  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1653  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1654 
1655  /* setup work group size parameters */
1656  int block_size = 256;
1657  cl_uint numCUs;
1658  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1659  sizeof(numCUs), &numCUs, nullptr);
1660  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1661 
1662  int requestedOccupancy = 10;
1663  int numWorkGroups = numCUs * requestedOccupancy;
1664  int numThreads = block_size * numWorkGroups;
1665  size_t local_work_size[] = {static_cast<size_t>(block_size)};
1666  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1667  size_t red_global_work_size[] = {
1668  static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1669 
1670  /* map histogramAllChannels as write only */
1671 
1672  cl_mem histogramBuffer = clCreateBuffer(
1673  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1674  kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels,
1675  &clStatus);
1676  CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1677 
1678  /* intermediate histogram buffer */
1679  int histRed = 256;
1680  int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1681 
1682  cl_mem tmpHistogramBuffer =
1683  clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1684  tmpHistogramBins * sizeof(cl_uint), nullptr, &clStatus);
1685  CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1686 
1687  /* atomic sync buffer */
1688  int* zeroBuffer = new int[1];
1689  zeroBuffer[0] = 0;
1690  cl_mem atomicSyncBuffer = clCreateBuffer(
1691  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1692  sizeof(cl_int), zeroBuffer, &clStatus);
1693  CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1694  delete[] zeroBuffer;
1695  // Create kernel objects based on bytes_per_pixel
1696  if (bytes_per_pixel == 1) {
1697  histKern.mpkKernel = clCreateKernel(
1698  histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1699  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1700 
1701  histRedKern.mpkKernel =
1702  clCreateKernel(histRedKern.mpkProgram,
1703  "kernel_HistogramRectOneChannelReduction", &clStatus);
1704  CHECK_OPENCL(clStatus,
1705  "clCreateKernel kernel_HistogramRectOneChannelReduction");
1706  } else {
1707  histKern.mpkKernel = clCreateKernel(
1708  histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus);
1709  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1710 
1711  histRedKern.mpkKernel =
1712  clCreateKernel(histRedKern.mpkProgram,
1713  "kernel_HistogramRectAllChannelsReduction", &clStatus);
1714  CHECK_OPENCL(clStatus,
1715  "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1716  }
1717 
1718  void* ptr;
1719 
1720  // Initialize tmpHistogramBuffer buffer
1721  ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE,
1722  CL_MAP_WRITE, 0, tmpHistogramBins * sizeof(cl_uint),
1723  0, nullptr, nullptr, &clStatus);
1724  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1725 
1726  memset(ptr, 0, tmpHistogramBins * sizeof(cl_uint));
1727  clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1728  nullptr, nullptr);
1729 
1730  /* set kernel 1 arguments */
1731  clStatus =
1732  clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1733  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1734  cl_uint numPixels = width * height;
1735  clStatus = clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1736  CHECK_OPENCL(clStatus, "clSetKernelArg numPixels");
1737  clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem),
1738  &tmpHistogramBuffer);
1739  CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1740 
1741  /* set kernel 2 arguments */
1742  int n = numThreads / bytes_per_pixel;
1743  clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1744  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1745  clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem),
1746  &tmpHistogramBuffer);
1747  CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1748  clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem),
1749  &histogramBuffer);
1750  CHECK_OPENCL(clStatus, "clSetKernelArg histogramBuffer");
1751 
1752  /* launch histogram */
1753  PERF_COUNT_SUB("before")
1754  clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1755  nullptr, global_work_size, local_work_size,
1756  0, nullptr, nullptr);
1757  CHECK_OPENCL(clStatus,
1758  "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1759  clFinish(histKern.mpkCmdQueue);
1760  if (clStatus != 0) {
1761  retVal = -1;
1762  }
1763  /* launch histogram */
1764  clStatus = clEnqueueNDRangeKernel(
1765  histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
1766  red_global_work_size, local_work_size, 0, nullptr, nullptr);
1767  CHECK_OPENCL(
1768  clStatus,
1769  "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1770  clFinish(histRedKern.mpkCmdQueue);
1771  if (clStatus != 0) {
1772  retVal = -1;
1773  }
1774  PERF_COUNT_SUB("redKernel")
1775 
1776  /* map results back from gpu */
1777  ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1778  CL_MAP_READ, 0,
1779  kHistogramSize * bytes_per_pixel * sizeof(int), 0,
1780  nullptr, nullptr, &clStatus);
1781  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1782  if (clStatus != 0) {
1783  retVal = -1;
1784  }
1785  clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1786  nullptr, nullptr);
1787 
1788  clReleaseMemObject(histogramBuffer);
1789  clReleaseMemObject(imageBuffer);
1790  PERF_COUNT_SUB("after")
1792  return retVal;
1793 }
1794 
1795 /*************************************************************************
1796  * Threshold the rectangle, taking everything except the image buffer pointer
1797  * from the class, using thresholds/hi_values to the output IMAGE.
1798  * only supports 1 or 4 channels
1799  ************************************************************************/
1800 int OpenclDevice::ThresholdRectToPixOCL(unsigned char* imageData,
1801  int bytes_per_pixel, int bytes_per_line,
1802  int* thresholds, int* hi_values,
1803  Pix** pix, int height, int width,
1804  int top, int left) {
1805  PERF_COUNT_START("ThresholdRectToPixOCL")
1806  int retVal = 0;
1807  /* create pix result buffer */
1808  *pix = pixCreate(width, height, 1);
1809  uint32_t* pixData = pixGetData(*pix);
1810  int wpl = pixGetWpl(*pix);
1811  int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1812 
1813  cl_int clStatus;
1814  KernelEnv rEnv;
1815  SetKernelEnv(&rEnv);
1816 
1817  /* setup work group size parameters */
1818  int block_size = 256;
1819  cl_uint numCUs = 6;
1820  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1821  sizeof(numCUs), &numCUs, nullptr);
1822  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1823 
1824  int requestedOccupancy = 10;
1825  int numWorkGroups = numCUs * requestedOccupancy;
1826  int numThreads = block_size * numWorkGroups;
1827  size_t local_work_size[] = {(size_t)block_size};
1828  size_t global_work_size[] = {(size_t)numThreads};
1829 
1830  /* map imagedata to device as read only */
1831  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1832  // coherent which we don't need.
1833  // faster option would be to allocate initial image buffer
1834  // using a garlic bus memory type
1835  cl_mem imageBuffer = clCreateBuffer(
1836  rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1837  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1838  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1839 
1840  /* map pix as write only */
1841  pixThBuffer =
1842  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1843  pixSize, pixData, &clStatus);
1844  CHECK_OPENCL(clStatus, "clCreateBuffer pix");
1845 
1846  /* map thresholds and hi_values */
1847  cl_mem thresholdsBuffer =
1848  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1849  bytes_per_pixel * sizeof(int), thresholds, &clStatus);
1850  CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
1851  cl_mem hiValuesBuffer =
1852  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1853  bytes_per_pixel * sizeof(int), hi_values, &clStatus);
1854  CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
1855 
1856  /* compile kernel */
1857  if (bytes_per_pixel == 4) {
1858  rEnv.mpkKernel =
1859  clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
1860  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
1861  } else {
1862  rEnv.mpkKernel = clCreateKernel(
1863  rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
1864  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
1865  }
1866 
1867  /* set kernel arguments */
1868  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1869  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1870  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
1871  CHECK_OPENCL(clStatus, "clSetKernelArg height");
1872  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
1873  CHECK_OPENCL(clStatus, "clSetKernelArg width");
1874  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
1875  CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
1876  clStatus =
1877  clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
1878  CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
1879  clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
1880  CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
1881  clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
1882  CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
1883 
1884  /* launch kernel & wait */
1885  PERF_COUNT_SUB("before")
1886  clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
1887  nullptr, global_work_size, local_work_size,
1888  0, nullptr, nullptr);
1889  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1890  clFinish(rEnv.mpkCmdQueue);
1891  PERF_COUNT_SUB("kernel")
1892  if (clStatus != 0) {
1893  tprintf("Setting return value to -1\n");
1894  retVal = -1;
1895  }
1896  /* map results back from gpu */
1897  void* ptr =
1898  clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
1899  pixSize, 0, nullptr, nullptr, &clStatus);
1900  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1901  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr,
1902  nullptr);
1903 
1904  clReleaseMemObject(imageBuffer);
1905  clReleaseMemObject(thresholdsBuffer);
1906  clReleaseMemObject(hiValuesBuffer);
1907 
1908  PERF_COUNT_SUB("after")
1910  return retVal;
1911 }
1912 
1913 /******************************************************************************
1914  * Data Types for Device Selection
1915  *****************************************************************************/
1916 
1917 struct TessScoreEvaluationInputData {
1918  int height;
1919  int width;
1920  int numChannels;
1921  unsigned char* imageData;
1922  Pix* pix;
1923 };
1924 
1925 static void populateTessScoreEvaluationInputData(
1926  TessScoreEvaluationInputData* input) {
1927  srand(1);
1928  // 8.5x11 inches @ 300dpi rounded to clean multiples
1929  int height = 3328; // %256
1930  int width = 2560; // %512
1931  int numChannels = 4;
1932  input->height = height;
1933  input->width = width;
1934  input->numChannels = numChannels;
1935  unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc(
1936  height * width * numChannels *
1937  sizeof(unsigned char)); // new unsigned char[4][height*width];
1938  input->imageData = (unsigned char*)&imageData4[0];
1939 
1940  // zero out image
1941  unsigned char pixelWhite[4] = {0, 0, 0, 255};
1942  unsigned char pixelBlack[4] = {255, 255, 255, 255};
1943  for (int p = 0; p < height * width; p++) {
1944  // unsigned char tmp[4] = imageData4[0];
1945  imageData4[p][0] = pixelWhite[0];
1946  imageData4[p][1] = pixelWhite[1];
1947  imageData4[p][2] = pixelWhite[2];
1948  imageData4[p][3] = pixelWhite[3];
1949  }
1950  // random lines to be eliminated
1951  int maxLineWidth = 64; // pixels wide
1952  int numLines = 10;
1953  // vertical lines
1954  for (int i = 0; i < numLines; i++) {
1955  int lineWidth = rand() % maxLineWidth;
1956  int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1957  // tprintf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
1958  for (int row = vertLinePos - lineWidth / 2;
1959  row < vertLinePos + lineWidth / 2; row++) {
1960  for (int col = 0; col < height; col++) {
1961  // imageData4[row*width+col] = pixelBlack;
1962  imageData4[row * width + col][0] = pixelBlack[0];
1963  imageData4[row * width + col][1] = pixelBlack[1];
1964  imageData4[row * width + col][2] = pixelBlack[2];
1965  imageData4[row * width + col][3] = pixelBlack[3];
1966  }
1967  }
1968  }
1969  // horizontal lines
1970  for (int i = 0; i < numLines; i++) {
1971  int lineWidth = rand() % maxLineWidth;
1972  int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1973  // tprintf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
1974  for (int row = 0; row < width; row++) {
1975  for (int col = horLinePos - lineWidth / 2;
1976  col < horLinePos + lineWidth / 2;
1977  col++) { // for (int row = vertLinePos-lineWidth/2; row <
1978  // vertLinePos+lineWidth/2; row++) {
1979  // tprintf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
1980  // imageData4[row*width+col] = pixelBlack;
1981  imageData4[row * width + col][0] = pixelBlack[0];
1982  imageData4[row * width + col][1] = pixelBlack[1];
1983  imageData4[row * width + col][2] = pixelBlack[2];
1984  imageData4[row * width + col][3] = pixelBlack[3];
1985  }
1986  }
1987  }
1988  // spots (noise, squares)
1989  float fractionBlack = 0.1; // how much of the image should be blackened
1990  int numSpots =
1991  (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1992  for (int i = 0; i < numSpots; i++) {
1993  int lineWidth = rand() % maxLineWidth;
1994  int col = lineWidth + rand() % (width - 2 * lineWidth);
1995  int row = lineWidth + rand() % (height - 2 * lineWidth);
1996  // tprintf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
1997  for (int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1998  for (int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1999  // tprintf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
2000  // imageData4[row*width+col] = pixelBlack;
2001  imageData4[r * width + c][0] = pixelBlack[0];
2002  imageData4[r * width + c][1] = pixelBlack[1];
2003  imageData4[r * width + c][2] = pixelBlack[2];
2004  imageData4[r * width + c][3] = pixelBlack[3];
2005  }
2006  }
2007  }
2008 
2009  input->pix = pixCreate(input->width, input->height, 1);
2010 }
2011 
2012 struct TessDeviceScore {
2013  float time; // small time means faster device
2014  bool clError; // were there any opencl errors
2015  bool valid; // was the correct response generated
2016 };
2017 
2018 /******************************************************************************
2019  * Micro Benchmarks for Device Selection
2020  *****************************************************************************/
2021 
2022 static double composeRGBPixelMicroBench(GPUEnv* env,
2023  TessScoreEvaluationInputData input,
2024  ds_device_type type) {
2025  double time = 0;
2026 #if ON_WINDOWS
2027  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2028  QueryPerformanceFrequency(&freq);
2029 #elif ON_APPLE
2030  mach_timebase_info_data_t info = {0, 0};
2031  mach_timebase_info(&info);
2032  long long start, stop;
2033 #else
2034  timespec time_funct_start, time_funct_end;
2035 #endif
2036  // input data
2037  l_uint32* tiffdata =
2038  (l_uint32*)input.imageData; // same size and random data; data doesn't
2039  // change workload
2040 
2041  // function call
2042  if (type == DS_DEVICE_OPENCL_DEVICE) {
2043 #if ON_WINDOWS
2044  QueryPerformanceCounter(&time_funct_start);
2045 #elif ON_APPLE
2046  start = mach_absolute_time();
2047 #else
2048  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2049 #endif
2050 
2051  OpenclDevice::gpuEnv = *env;
2052  int wpl = pixGetWpl(input.pix);
2053  OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2054  wpl, nullptr);
2055 #if ON_WINDOWS
2056  QueryPerformanceCounter(&time_funct_end);
2057  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2058  (double)(freq.QuadPart);
2059 #elif ON_APPLE
2060  stop = mach_absolute_time();
2061  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2062 #else
2063  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2064  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2065  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2066 #endif
2067 
2068  } else {
2069 #if ON_WINDOWS
2070  QueryPerformanceCounter(&time_funct_start);
2071 #elif ON_APPLE
2072  start = mach_absolute_time();
2073 #else
2074  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2075 #endif
2076  Pix* pix = pixCreate(input.width, input.height, 32);
2077  l_uint32* pixData = pixGetData(pix);
2078  int i, j;
2079  int idx = 0;
2080  for (i = 0; i < input.height; i++) {
2081  for (j = 0; j < input.width; j++) {
2082  l_uint32 tiffword = tiffdata[i * input.width + j];
2083  l_int32 rval = ((tiffword)&0xff);
2084  l_int32 gval = (((tiffword) >> 8) & 0xff);
2085  l_int32 bval = (((tiffword) >> 16) & 0xff);
2086  l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2087  pixData[idx] = value;
2088  idx++;
2089  }
2090  }
2091 #if ON_WINDOWS
2092  QueryPerformanceCounter(&time_funct_end);
2093  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2094  (double)(freq.QuadPart);
2095 #elif ON_APPLE
2096  stop = mach_absolute_time();
2097  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2098 #else
2099  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2100  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2101  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2102 #endif
2103  pixDestroy(&pix);
2104  }
2105 
2106  return time;
2107 }
2108 
2109 static double histogramRectMicroBench(GPUEnv* env,
2110  TessScoreEvaluationInputData input,
2111  ds_device_type type) {
2112  double time;
2113 #if ON_WINDOWS
2114  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2115  QueryPerformanceFrequency(&freq);
2116 #elif ON_APPLE
2117  mach_timebase_info_data_t info = {0, 0};
2118  mach_timebase_info(&info);
2119  long long start, stop;
2120 #else
2121  timespec time_funct_start, time_funct_end;
2122 #endif
2123 
2124  const int left = 0;
2125  const int top = 0;
2126  int kHistogramSize = 256;
2127  int bytes_per_line = input.width * input.numChannels;
2128  int* histogramAllChannels = new int[kHistogramSize * input.numChannels];
2129  // function call
2130  if (type == DS_DEVICE_OPENCL_DEVICE) {
2131 #if ON_WINDOWS
2132  QueryPerformanceCounter(&time_funct_start);
2133 #elif ON_APPLE
2134  start = mach_absolute_time();
2135 #else
2136  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2137 #endif
2138 
2139  OpenclDevice::gpuEnv = *env;
2140  int retVal = OpenclDevice::HistogramRectOCL(
2141  input.imageData, input.numChannels, bytes_per_line, left, top,
2142  input.width, input.height, kHistogramSize, histogramAllChannels);
2143 
2144 #if ON_WINDOWS
2145  QueryPerformanceCounter(&time_funct_end);
2146  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2147  (double)(freq.QuadPart);
2148 #elif ON_APPLE
2149  stop = mach_absolute_time();
2150  if (retVal == 0) {
2151  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2152  } else {
2153  time = FLT_MAX;
2154  }
2155 #else
2156  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2157  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2158  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2159 #endif
2160  } else {
2161  int* histogram = new int[kHistogramSize];
2162 #if ON_WINDOWS
2163  QueryPerformanceCounter(&time_funct_start);
2164 #elif ON_APPLE
2165  start = mach_absolute_time();
2166 #else
2167  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2168 #endif
2169  for (int ch = 0; ch < input.numChannels; ++ch) {
2170  tesseract::HistogramRect(input.pix, input.numChannels, left, top,
2171  input.width, input.height, histogram);
2172  }
2173 #if ON_WINDOWS
2174  QueryPerformanceCounter(&time_funct_end);
2175  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2176  (double)(freq.QuadPart);
2177 #elif ON_APPLE
2178  stop = mach_absolute_time();
2179  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2180 #else
2181  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2182  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2183  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2184 #endif
2185  delete[] histogram;
2186  }
2187 
2188  // cleanup
2189  delete[] histogramAllChannels;
2190  return time;
2191 }
2192 
2193 // Reproducing the ThresholdRectToPix native version
2194 static void ThresholdRectToPix_Native(const unsigned char* imagedata,
2195  int bytes_per_pixel, int bytes_per_line,
2196  const int* thresholds,
2197  const int* hi_values, Pix** pix) {
2198  int top = 0;
2199  int left = 0;
2200  int width = pixGetWidth(*pix);
2201  int height = pixGetHeight(*pix);
2202 
2203  *pix = pixCreate(width, height, 1);
2204  uint32_t* pixdata = pixGetData(*pix);
2205  int wpl = pixGetWpl(*pix);
2206  const unsigned char* srcdata =
2207  imagedata + top * bytes_per_line + left * bytes_per_pixel;
2208  for (int y = 0; y < height; ++y) {
2209  const uint8_t* linedata = srcdata;
2210  uint32_t* pixline = pixdata + y * wpl;
2211  for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2212  bool white_result = true;
2213  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2214  if (hi_values[ch] >= 0 &&
2215  (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2216  white_result = false;
2217  break;
2218  }
2219  }
2220  if (white_result)
2221  CLEAR_DATA_BIT(pixline, x);
2222  else
2223  SET_DATA_BIT(pixline, x);
2224  }
2225  srcdata += bytes_per_line;
2226  }
2227 }
2228 
2229 static double thresholdRectToPixMicroBench(GPUEnv* env,
2230  TessScoreEvaluationInputData input,
2231  ds_device_type type) {
2232  double time;
2233 #if ON_WINDOWS
2234  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2235  QueryPerformanceFrequency(&freq);
2236 #elif ON_APPLE
2237  mach_timebase_info_data_t info = {0, 0};
2238  mach_timebase_info(&info);
2239  long long start, stop;
2240 #else
2241  timespec time_funct_start, time_funct_end;
2242 #endif
2243 
2244  // input data
2245  unsigned char pixelHi = (unsigned char)255;
2246  int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2247 
2248  // Pix* pix = pixCreate(width, height, 1);
2249  int top = 0;
2250  int left = 0;
2251  int bytes_per_line = input.width * input.numChannels;
2252 
2253  // function call
2254  if (type == DS_DEVICE_OPENCL_DEVICE) {
2255 #if ON_WINDOWS
2256  QueryPerformanceCounter(&time_funct_start);
2257 #elif ON_APPLE
2258  start = mach_absolute_time();
2259 #else
2260  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2261 #endif
2262 
2263  OpenclDevice::gpuEnv = *env;
2264  int hi_values[4];
2265  int retVal = OpenclDevice::ThresholdRectToPixOCL(
2266  input.imageData, input.numChannels, bytes_per_line, thresholds,
2267  hi_values, &input.pix, input.height, input.width, top, left);
2268 
2269 #if ON_WINDOWS
2270  QueryPerformanceCounter(&time_funct_end);
2271  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2272  (double)(freq.QuadPart);
2273 #elif ON_APPLE
2274  stop = mach_absolute_time();
2275  if (retVal == 0) {
2276  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2277  } else {
2278  time = FLT_MAX;
2279  }
2280 
2281 #else
2282  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2283  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2284  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2285 #endif
2286  } else {
2287  tesseract::ImageThresholder thresholder;
2288  thresholder.SetImage(input.pix);
2289 #if ON_WINDOWS
2290  QueryPerformanceCounter(&time_funct_start);
2291 #elif ON_APPLE
2292  start = mach_absolute_time();
2293 #else
2294  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2295 #endif
2296  int hi_values[4] = {};
2297  ThresholdRectToPix_Native(input.imageData, input.numChannels,
2298  bytes_per_line, thresholds, hi_values,
2299  &input.pix);
2300 
2301 #if ON_WINDOWS
2302  QueryPerformanceCounter(&time_funct_end);
2303  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2304  (double)(freq.QuadPart);
2305 #elif ON_APPLE
2306  stop = mach_absolute_time();
2307  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2308 #else
2309  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2310  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2311  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2312 #endif
2313  }
2314 
2315  return time;
2316 }
2317 
2318 static double getLineMasksMorphMicroBench(GPUEnv* env,
2319  TessScoreEvaluationInputData input,
2320  ds_device_type type) {
2321  double time = 0;
2322 #if ON_WINDOWS
2323  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2324  QueryPerformanceFrequency(&freq);
2325 #elif ON_APPLE
2326  mach_timebase_info_data_t info = {0, 0};
2327  mach_timebase_info(&info);
2328  long long start, stop;
2329 #else
2330  timespec time_funct_start, time_funct_end;
2331 #endif
2332 
2333  // input data
2334  int resolution = 300;
2335  int wpl = pixGetWpl(input.pix);
2336  int kThinLineFraction = 20; // tess constant
2337  int kMinLineLengthFraction = 4; // tess constant
2338  int max_line_width = resolution / kThinLineFraction;
2339  int min_line_length = resolution / kMinLineLengthFraction;
2340  int closing_brick = max_line_width / 3;
2341 
2342  // function call
2343  if (type == DS_DEVICE_OPENCL_DEVICE) {
2344 #if ON_WINDOWS
2345  QueryPerformanceCounter(&time_funct_start);
2346 #elif ON_APPLE
2347  start = mach_absolute_time();
2348 #else
2349  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2350 #endif
2351  OpenclDevice::gpuEnv = *env;
2352  OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2353  Pix *pix_vline = nullptr, *pix_hline = nullptr, *pix_closed = nullptr;
2354  OpenclDevice::pixGetLinesCL(nullptr, input.pix, &pix_vline, &pix_hline,
2355  &pix_closed, true, closing_brick, closing_brick,
2356  max_line_width, max_line_width, min_line_length,
2357  min_line_length);
2358 
2359  OpenclDevice::releaseMorphCLBuffers();
2360 
2361 #if ON_WINDOWS
2362  QueryPerformanceCounter(&time_funct_end);
2363  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2364  (double)(freq.QuadPart);
2365 #elif ON_APPLE
2366  stop = mach_absolute_time();
2367  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2368 #else
2369  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2370  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2371  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2372 #endif
2373  } else {
2374 #if ON_WINDOWS
2375  QueryPerformanceCounter(&time_funct_start);
2376 #elif ON_APPLE
2377  start = mach_absolute_time();
2378 #else
2379  clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2380 #endif
2381 
2382  // native serial code
2383  Pix* src_pix = input.pix;
2384  Pix* pix_closed =
2385  pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);
2386  Pix* pix_solid =
2387  pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);
2388  Pix* pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);
2389  pixDestroy(&pix_solid);
2390  Pix* pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
2391  Pix* pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
2392  pixDestroy(&pix_hline);
2393  pixDestroy(&pix_vline);
2394  pixDestroy(&pix_hollow);
2395 
2396 #if ON_WINDOWS
2397  QueryPerformanceCounter(&time_funct_end);
2398  time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2399  (double)(freq.QuadPart);
2400 #elif ON_APPLE
2401  stop = mach_absolute_time();
2402  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2403 #else
2404  clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2405  time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2406  (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2407 #endif
2408  }
2409 
2410  return time;
2411 }
2412 
2413 /******************************************************************************
2414  * Device Selection
2415  *****************************************************************************/
2416 
2417 #include <cstdlib>
2418 
2419 // encode score object as byte string
2420 static ds_status serializeScore(ds_device* device, void** serializedScore,
2421  unsigned int* serializedScoreSize) {
2422  *serializedScoreSize = sizeof(TessDeviceScore);
2423  *serializedScore = new unsigned char[*serializedScoreSize];
2424  memcpy(*serializedScore, device->score, *serializedScoreSize);
2425  return DS_SUCCESS;
2426 }
2427 
2428 // parses byte string and stores in score object
2429 static ds_status deserializeScore(ds_device* device,
2430  const unsigned char* serializedScore,
2431  unsigned int serializedScoreSize) {
2432  // check that serializedScoreSize == sizeof(TessDeviceScore);
2433  device->score = new TessDeviceScore;
2434  memcpy(device->score, serializedScore, serializedScoreSize);
2435  return DS_SUCCESS;
2436 }
2437 
2438 static ds_status releaseScore(TessDeviceScore* score) {
2439  delete score;
2440  return DS_SUCCESS;
2441 }
2442 
2443 // evaluate devices
2444 static ds_status evaluateScoreForDevice(ds_device* device, void* inputData) {
2445  // overwrite statuc gpuEnv w/ current device
2446  // so native opencl calls can be used; they use static gpuEnv
2447  tprintf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2448  device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2449  GPUEnv* env = nullptr;
2450  if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2451  env = &OpenclDevice::gpuEnv;
2452  memset(env, 0, sizeof(*env));
2453  // tprintf("[DS] populating tmp GPUEnv from device\n");
2454  populateGPUEnvFromDevice(env, device->oclDeviceID);
2455  env->mnFileCount = 0; // argc;
2456  env->mnKernelCount = 0UL;
2457  // tprintf("[DS] compiling kernels for tmp GPUEnv\n");
2458  OpenclDevice::CompileKernelFile(env, "");
2459  }
2460 
2461  TessScoreEvaluationInputData* input =
2462  static_cast<TessScoreEvaluationInputData*>(inputData);
2463 
2464  // pixReadTiff
2465  double composeRGBPixelTime =
2466  composeRGBPixelMicroBench(env, *input, device->type);
2467 
2468  // HistogramRect
2469  double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2470 
2471  // ThresholdRectToPix
2472  double thresholdRectToPixTime =
2473  thresholdRectToPixMicroBench(env, *input, device->type);
2474 
2475  // getLineMasks
2476  double getLineMasksMorphTime =
2477  getLineMasksMorphMicroBench(env, *input, device->type);
2478 
2479  // weigh times (% of cpu time)
2480  // these weights should be the % execution time that the native cpu code took
2481  float composeRGBPixelWeight = 1.2f;
2482  float histogramRectWeight = 2.4f;
2483  float thresholdRectToPixWeight = 4.5f;
2484  float getLineMasksMorphWeight = 5.0f;
2485 
2486  float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2487  histogramRectWeight * histogramRectTime +
2488  thresholdRectToPixWeight * thresholdRectToPixTime +
2489  getLineMasksMorphWeight * getLineMasksMorphTime;
2490  device->score = new TessDeviceScore;
2491  device->score->time = weightedTime;
2492 
2493  tprintf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2494  device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2495  tprintf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime,
2496  composeRGBPixelWeight);
2497  tprintf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime,
2498  histogramRectWeight);
2499  tprintf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix",
2500  thresholdRectToPixTime, thresholdRectToPixWeight);
2501  tprintf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime,
2502  getLineMasksMorphWeight);
2503  tprintf("[DS]%25s: %f\n", "Score", device->score->time);
2504  return DS_SUCCESS;
2505 }
2506 
2507 // initial call to select device
2508 ds_device OpenclDevice::getDeviceSelection() {
2509  if (!deviceIsSelected) {
2510  PERF_COUNT_START("getDeviceSelection")
2511  // check if opencl is available at runtime
2512  if (1 == LoadOpencl()) {
2513  // opencl is available
2514  // PERF_COUNT_SUB("LoadOpencl")
2515  // setup devices
2516  ds_status status;
2517  ds_profile* profile;
2518  status = initDSProfile(&profile, "v0.1");
2519  PERF_COUNT_SUB("initDSProfile")
2520  // try reading scores from file
2521  const char* fileName = "tesseract_opencl_profile_devices.dat";
2522  status = readProfileFromFile(profile, deserializeScore, fileName);
2523  if (status != DS_SUCCESS) {
2524  // need to run evaluation
2525  tprintf("[DS] Profile file not available (%s); performing profiling.\n",
2526  fileName);
2527 
2528  // create input data
2529  TessScoreEvaluationInputData input;
2530  populateTessScoreEvaluationInputData(&input);
2531  // PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
2532  // perform evaluations
2533  unsigned int numUpdates;
2534  status = profileDevices(profile, DS_EVALUATE_ALL,
2535  evaluateScoreForDevice, &input, &numUpdates);
2536  PERF_COUNT_SUB("profileDevices")
2537  // write scores to file
2538  if (status == DS_SUCCESS) {
2539  status = writeProfileToFile(profile, serializeScore, fileName);
2540  PERF_COUNT_SUB("writeProfileToFile")
2541  if (status == DS_SUCCESS) {
2542  tprintf("[DS] Scores written to file (%s).\n", fileName);
2543  } else {
2544  tprintf(
2545  "[DS] Error saving scores to file (%s); scores not written to "
2546  "file.\n",
2547  fileName);
2548  }
2549  } else {
2550  tprintf(
2551  "[DS] Unable to evaluate performance; scores not written to "
2552  "file.\n");
2553  }
2554  } else {
2555  PERF_COUNT_SUB("readProfileFromFile")
2556  tprintf("[DS] Profile read from file (%s).\n", fileName);
2557  }
2558 
2559  // we now have device scores either from file or evaluation
2560  // select fastest using custom Tesseract selection algorithm
2561  float bestTime = FLT_MAX; // begin search with worst possible time
2562  int bestDeviceIdx = -1;
2563  for (unsigned d = 0; d < profile->numDevices; d++) {
2564  ds_device device = profile->devices[d];
2565  if (device.score == nullptr) continue;
2566  TessDeviceScore score = *device.score;
2567 
2568  float time = score.time;
2569  tprintf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2570  device.oclDeviceName, time);
2571  if (time < bestTime) {
2572  bestTime = time;
2573  bestDeviceIdx = d;
2574  }
2575  }
2576  if (bestDeviceIdx >= 0) {
2577  tprintf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2578  profile->devices[bestDeviceIdx].oclDeviceName,
2579  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2580  ? "OpenCL"
2581  : "Native");
2582  }
2583  // cleanup
2584  // TODO: call destructor for profile object?
2585 
2586  bool overridden = false;
2587  char* overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2588  if (overrideDeviceStr != nullptr) {
2589  int overrideDeviceIdx = atoi(overrideDeviceStr);
2590  if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2591  tprintf(
2592  "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2593  "%i)\n",
2594  overrideDeviceStr, overrideDeviceIdx);
2595  bestDeviceIdx = overrideDeviceIdx - 1;
2596  overridden = true;
2597  } else {
2598  tprintf(
2599  "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2600  "valid devices).\n",
2601  overrideDeviceStr, profile->numDevices);
2602  }
2603  }
2604 
2605  if (overridden) {
2606  tprintf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2607  profile->devices[bestDeviceIdx].oclDeviceName,
2608  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2609  ? "OpenCL"
2610  : "Native");
2611  }
2612  selectedDevice = profile->devices[bestDeviceIdx];
2613  // cleanup
2614  releaseDSProfile(profile, releaseScore);
2615  } else {
2616  // opencl isn't available at runtime, select native cpu device
2617  tprintf("[DS] OpenCL runtime not available.\n");
2618  selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2619  selectedDevice.oclDeviceName = "(null)";
2620  selectedDevice.score = nullptr;
2621  selectedDevice.oclDeviceID = nullptr;
2622  selectedDevice.oclDriverVersion = nullptr;
2623  }
2624  deviceIsSelected = true;
2625  PERF_COUNT_SUB("select from Profile")
2627  }
2628  // PERF_COUNT_END
2629  return selectedDevice;
2630 }
2631 
2632 bool OpenclDevice::selectedDeviceIsOpenCL() {
2633  ds_device device = getDeviceSelection();
2634  return (device.type == DS_DEVICE_OPENCL_DEVICE);
2635 }
2636 
2637 #endif
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:63
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:41
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:39
#define PERF_COUNT_START(FUNCT_NAME)
const int kHistogramSize
Definition: otsuthr.h:27
const char * kernel_src
Definition: oclkernels.h:21
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:151
#define PERF_COUNT_END
#define PERF_COUNT_SUB(SUB)
#define ASSERT_HOST(x)
Definition: errcode.h:84