Point Cloud Library (PCL)  1.11.0
cutil_inline_drvapi.h
1 /*
2  * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3  *
4  * Please refer to the NVIDIA end user license agreement (EULA) associated
5  * with this source code for terms and conditions that govern your use of
6  * this software. Any use, reproduction, disclosure, or distribution of
7  * this software and related documentation outside the terms of the EULA
8  * is strictly prohibited.
9  *
10  */
11 
12 #pragma once
13 
14 #include <stdio.h>
15 #include <string.h>
16 #include <stdlib.h>
17 
18 
// Call-site wrappers: they pass __FILE__ and __LINE__ to the inline checkers
// below so callers do not have to spell them out. The checkers themselves are
// inline functions rather than macros, so they can be stepped through in a
// debugger.
#define cutilDrvSafeCallNoSync(err)  __cuSafeCallNoSync (err, __FILE__, __LINE__)
#define cutilDrvSafeCall(err)        __cuSafeCall       (err, __FILE__, __LINE__)
#define cutilDrvCtxSync()            __cuCtxSync        (__FILE__, __LINE__)
#define cutilDrvCheckMsg(msg)        __cuCheckMsg       (msg, __FILE__, __LINE__)

// Rounds `offset` up to the next multiple of `alignment` and stores the result
// back into `offset`. The mask arithmetic only yields a multiple when
// `alignment` is a power of two.
// Every argument use is parenthesized so that expression arguments such as
// `1 << 4` keep their meaning. Both arguments are evaluated more than once, so
// do not pass expressions with side effects.
#define cutilDrvAlignOffset(offset, alignment) \
    ( (offset) = ((offset) + ((alignment) - 1)) & ~((alignment) - 1) )
26 
27 // These are the inline versions for all of the CUTIL functions
28 inline void __cuSafeCallNoSync( CUresult err, const char *file, const int line )
29 {
30  if( CUDA_SUCCESS != err) {
31  fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
32  err, file, line );
33  exit(-1);
34  }
35 }
36 inline void __cuSafeCall( CUresult err, const char *file, const int line )
37 {
38  __cuSafeCallNoSync( err, file, line );
39 }
40 
41 inline void __cuCtxSync(const char *file, const int line )
42 {
43  CUresult err = cuCtxSynchronize();
44  if( CUDA_SUCCESS != err ) {
45  fprintf(stderr, "cuCtxSynchronize() API error = %04d in file <%s>, line %i.\n",
46  err, file, line );
47  exit(-1);
48  }
49 }
50 
// Smaller / larger of two values.
// Each argument use is parenthesized so that arguments containing operators of
// lower precedence than the comparison (e.g. `x & mask`) are compared as a
// whole. The selected argument is still evaluated twice, so do not pass
// expressions with side effects.
// The #ifndef guards keep an already-visible definition instead of redefining
// it with a different replacement list.
#ifndef MIN
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef MAX
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif
53 
// Beginning of GPU Architecture definitions

// Maps a compute capability (SM version) to the number of cores per
// multiprocessor.
//   major - SM major version
//   minor - SM minor version
// Returns the core count for a known version. For a version that is not in the
// table a message is printed to stdout and -1 is returned.
// The table covers SM 1.0 through 9.0; the original list ended at 2.1, which
// made every later GPU fall through to the -1 path.
// Declared static so the helper has internal linkage and links the same way
// whether the including translation unit is built as C or as C++.
static inline int _ConvertSMVer2CoresDrvApi(int major, int minor)
{
    // One entry per supported SM version.
    struct sSMtoCores {
        int SM;     // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores;  // cores per multiprocessor for that version
    };

    static const struct sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x10,   8 },
        { 0x11,   8 },
        { 0x12,   8 },
        { 0x13,   8 },
        { 0x20,  32 },
        { 0x21,  48 },
        { 0x30, 192 },
        { 0x32, 192 },
        { 0x35, 192 },
        { 0x37, 192 },
        { 0x50, 128 },
        { 0x52, 128 },
        { 0x53, 128 },
        { 0x60,  64 },
        { 0x61, 128 },
        { 0x62, 128 },
        { 0x70,  64 },
        { 0x72,  64 },
        { 0x75,  64 },
        { 0x80,  64 },
        { 0x86, 128 },
        { 0x87, 128 },
        { 0x89, 128 },
        { 0x90, 128 }
    };
    const int entry_count =
        (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));

    // Pack the version the same way the table keys are written: 0xMm.
    const int sm_version = (major << 4) + minor;

    for (int index = 0; index < entry_count; ++index) {
        if (nGpuArchCoresPerSM[index].SM == sm_version) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
84 
85 // This function returns the best GPU based on performance
86 inline int cutilDrvGetMaxGflopsDeviceId()
87 {
88  CUdevice current_device = 0;
89  CUdevice max_perf_device = 0;
90  int device_count = 0;
91  int max_compute_perf = 0;
92  int best_SM_arch = 0;
93 
94  cuInit(0);
95  cutilDrvSafeCallNoSync(cuDeviceGetCount(&device_count));
96 
97  // Find the best major SM Architecture GPU device
98  while ( current_device < device_count ) {
99  int major = 0;
100  int minor = 0;
101  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
102  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
103  if (major > 0 && major < 9999) {
104  best_SM_arch = MAX(best_SM_arch, major);
105  }
106  current_device++;
107  }
108 
109  // Find the best CUDA capable GPU device
110  current_device = 0;
111  while( current_device < device_count ) {
112  int multiProcessorCount;
113  int clockRate;
114  int major = 0;
115  int minor = 0;
116  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &multiProcessorCount,
117  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
118  current_device ) );
119  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &clockRate,
120  CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
121  current_device ) );
122  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
123  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
124 
125  int sm_per_multiproc = (major == 9999 && minor == 9999) ? 1 : _ConvertSMVer2CoresDrvApi(major, minor);
126 
127  int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
128  if( compute_perf > max_compute_perf ) {
129  // If we find GPU with SM major > 2, search only these
130  if ( best_SM_arch > 2 ) {
131  // If our device==dest_SM_arch, choose this, or else pass
132  if (major == best_SM_arch) {
133  max_compute_perf = compute_perf;
134  max_perf_device = current_device;
135  }
136  }
137  else {
138  max_compute_perf = compute_perf;
139  max_perf_device = current_device;
140  }
141  }
142  ++current_device;
143  }
144  return max_perf_device;
145 }
146 
147 // This function returns the best Graphics GPU based on performance
148 inline int cutilDrvGetMaxGflopsGraphicsDeviceId()
149 {
150  CUdevice current_device = 0;
151  CUdevice max_perf_device = 0;
152  int device_count = 0;
153  int max_compute_perf = 0;
154  int best_SM_arch = 0;
155 
156  cuInit(0);
157  cutilDrvSafeCallNoSync(cuDeviceGetCount(&device_count));
158 
159  // Find the best major SM Architecture GPU device that are graphics devices
160  while ( current_device < device_count ) {
161  char deviceName[256];
162  int major = 0;
163  int minor = 0;
164  int bTCC = 0;
165  cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, current_device) );
166  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
167  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
168  cutilDrvSafeCallNoSync( cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device) );
169 
170  if (!bTCC) {
171  if (major > 0 && major < 9999) {
172  best_SM_arch = MAX(best_SM_arch, major);
173  }
174  }
175  current_device++;
176  }
177 
178  // Find the best CUDA capable GPU device
179  current_device = 0;
180  while( current_device < device_count ) {
181  int multiProcessorCount;
182  int clockRate;
183  int major = 0;
184  int minor = 0;
185  int bTCC = 0;
186  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &multiProcessorCount,
187  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
188  current_device ) );
189  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &clockRate,
190  CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
191  current_device ) );
192  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
193  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
194 
195  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device ) );
196 
197  int sm_per_multiproc = (major == 9999 && minor == 9999) ? 1 : _ConvertSMVer2CoresDrvApi(major, minor);
198 
199  // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contender
200  if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
201  {
202  int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
203  if( compute_perf > max_compute_perf ) {
204  // If we find GPU with SM major > 2, search only these
205  if ( best_SM_arch > 2 ) {
206  // If our device = dest_SM_arch, then we pick this one
207  if (major == best_SM_arch) {
208  max_compute_perf = compute_perf;
209  max_perf_device = current_device;
210  }
211  }
212  else {
213  max_compute_perf = compute_perf;
214  max_perf_device = current_device;
215  }
216  }
217  }
218  ++current_device;
219  }
220  return max_perf_device;
221 }
222 
223 inline void __cuCheckMsg( const char * msg, const char *file, const int line )
224 {
225  CUresult err = cuCtxSynchronize();
226  if( CUDA_SUCCESS != err) {
227  fprintf(stderr, "cutilDrvCheckMsg -> %s", msg);
228  fprintf(stderr, "cutilDrvCheckMsg -> cuCtxSynchronize API error = %04d in file <%s>, line %i.\n",
229  err, file, line );
230  exit(-1);
231  }
232 }
233 
234 
235 #if __DEVICE_EMULATION__
236  inline int cutilDeviceInitDrv(int ARGC, char **ARGV) { }
237 #else
238  inline int cutilDeviceInitDrv(int ARGC, char ** ARGV)
239  {
240  int cuDevice = 0;
241  int deviceCount = 0;
242  CUresult err = cuInit(0);
243  if (CUDA_SUCCESS == err)
244  cutilDrvSafeCallNoSync(cuDeviceGetCount(&deviceCount));
245  if (deviceCount == 0) {
246  fprintf(stderr, "CUTIL DeviceInitDrv error: no devices supporting CUDA\n");
247  exit(-1);
248  }
249  int dev = 0;
250  cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
251  if (dev < 0) dev = 0;
252  if (dev > deviceCount-1) {
253  fprintf(stderr, "\n");
254  fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
255  fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
256  fprintf(stderr, "\n");
257  return -dev;
258  }
259  cutilDrvSafeCallNoSync(cuDeviceGet(&cuDevice, dev));
260  char name[100];
261  cuDeviceGetName(name, 100, cuDevice);
262  if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) {
263  printf("> Using CUDA Device [%d]: %s\n", dev, name);
264  }
265  return dev;
266  }
267 #endif
268 
269  // General initialization call to pick the best CUDA Device
270 #if __DEVICE_EMULATION__
271  inline CUdevice cutilChooseCudaDeviceDrv(int argc, char **argv, int *p_devID)
272 #else
273  inline CUdevice cutilChooseCudaDeviceDrv(int argc, char **argv, int *p_devID)
274  {
275  CUdevice cuDevice;
276  int devID = 0;
277  // If the command-line has a device number specified, use it
278  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
279  devID = cutilDeviceInitDrv(argc, argv);
280  if (devID < 0) {
281  printf("exiting...\n");
282  exit(0);
283  }
284  } else {
285  // Otherwise pick the device with highest Gflops/s
286  char name[100];
287  devID = cutilDrvGetMaxGflopsDeviceId();
288  cutilDrvSafeCallNoSync(cuDeviceGet(&cuDevice, devID));
289  cuDeviceGetName(name, 100, cuDevice);
290  printf("> Using CUDA Device [%d]: %s\n", devID, name);
291  }
292  cuDeviceGet(&cuDevice, devID);
293  if (p_devID) *p_devID = devID;
294  return cuDevice;
295  }
296 #endif
297 
298 
299 //! Check for CUDA context lost
300 inline void cutilDrvCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
301 {
302  CUresult err = cuCtxSynchronize();
303  if( CUDA_ERROR_INVALID_CONTEXT != err) {
304  fprintf(stderr, "Cuda error: %s in file '%s' in line %i\n",
305  errorMessage, file, line );
306  exit(-1);
307  }
308  err = cuCtxSynchronize();
309  if( CUDA_SUCCESS != err) {
310  fprintf(stderr, "Cuda error: %s in file '%s' in line %i\n",
311  errorMessage, file, line );
312  exit(-1);
313  }
314 }
315 
// Portable names for the case-insensitive string comparisons: the underscore
// variants are used when _WIN32 is defined, strcasecmp/strncasecmp otherwise.
// A definition that is already visible is left untouched.
#if !defined(STRCASECMP)
#  if defined(_WIN32)
#    define STRCASECMP _stricmp
#  else
#    define STRCASECMP strcasecmp
#  endif
#endif

#if !defined(STRNCASECMP)
#  if defined(_WIN32)
#    define STRNCASECMP _strnicmp
#  else
#    define STRNCASECMP strncasecmp
#  endif
#endif
331 
332 inline void __cutilDrvQAFinish(int argc, char **argv, bool bStatus)
333 {
334  const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
335 
336  bool bFlag = false;
337  for (int i=1; i < argc; i++) {
338  if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
339  bFlag |= true;
340  }
341  }
342 
343  if (bFlag) {
344  printf("&&&& %s %s", sStatus[bStatus], argv[0]);
345  for (int i=1; i < argc; i++) printf(" %s", argv[i]);
346  } else {
347  printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
348  }
349 }
350 
351 // General check for CUDA GPU SM Capabilities for a specific device #
352 inline bool cutilDrvCudaDevCapabilities(int major_version, int minor_version, int deviceNum, int argc, char** argv)
353 {
354  int major, minor, dev;
355  char device_name[256];
356 
357 #ifdef __DEVICE_EMULATION__
358  printf("> Compute Device Emulation Mode \n");
359 #endif
360 
361  cutilDrvSafeCallNoSync( cuDeviceGet(&dev, deviceNum) );
362  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
363  cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
364  cutilDrvSafeCallNoSync( cuDeviceGetName(device_name, 256, dev) );
365 
366  if((major > major_version) ||
367  (major == major_version && minor >= minor_version))
368  {
369  printf("> Device %d: < %s >, Compute SM %d.%d detected\n", dev, device_name, major, minor);
370  return true;
371  }
372  else
373  {
374  printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
375  __cutilDrvQAFinish(argc, argv, true);
376  return false;
377  }
378 }
379 
380 // General check for CUDA GPU SM Capabilities
381 inline bool cutilDrvCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
382 {
383  return cutilDrvCudaDevCapabilities(major_version, minor_version, 0, argc, argv);
384 }