forked from rose-compiler/rose
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preinclude-cuda.h
962 lines (780 loc) · 33.9 KB
/
preinclude-cuda.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
// TV (02/27/2018): Added guards for the definition of CUDA's built-in types, variables, functions, and API
#ifdef ROSE_DEFINE_CUDA
#define ROSE_DEFINE_CUDA_BUILTIN 1
#define ROSE_DEFINE_CUDA_API 1
#endif
#ifdef ROSE_DEFINE_CUDA_BUILTIN
#define ROSE_DEFINE_CUDA_BUILTIN_TYPES 1
#define ROSE_DEFINE_CUDA_BUILTIN_VARIABLES 1
#define ROSE_DEFINE_CUDA_BUILTIN_FUNCTIONS 1
#define ROSE_DEFINE_CUDA_BUILTIN_TEXTURES 1
#endif
#include <vector_types.h>
#define ROSE_DEFINE_CUDA_BUILTIN_VARIABLES 1
//#define ROSE_DEFINE_GNU_BUILTIN_FOR_CUDA 1
#define ROSE_DEFINE_CUDA_BUILTIN_SYNC_FUNCTIONS 1
// TV (7/24/2015): including cstdlib causes:
// rose/src/backend/unparser/nameQualificationSupport.C:5293:
// virtual NameQualificationInheritedAttribute NameQualificationTraversal::evaluateInheritedAttribute(SgNode*, NameQualificationInheritedAttribute):
// Assertion `currentStatement != __null' failed.
#if 1
typedef unsigned long size_t;
#else
#include <cstdlib>
#endif
/* CUDA Built-in Macros */
/* DQ (1/18/2016): Adding #define values from Cuda (required for Jeff's example code) */
#define cudaMemAttachGlobal 0x01
/* CUDA Built-in Types */
/*
Extract and adapted from:
CUDA_PATH/include/common_functions.h
CUDA_PATH/include/common_types.h
CUDA_PATH/include/device_functions.h
CUDA_PATH/include/device_types.h
CUDA_PATH/include/sm_11_atomic_functions.h
CUDA_PATH/include/sm_12_atomic_functions.h
CUDA_PATH/include/sm_13_atomic_functions.h
CUDA_PATH/include/sm_20_atomic_functions.h
CUDA_PATH/include/sm_20_intrinsics.h
CUDA_PATH/include/vector_types.h
CUDA_PATH/include/vector_functions.h
*/
/* Vector Types */
#ifdef ROSE_DEFINE_CUDA_VECTOR_TYPES
struct char1
{
signed char x;
};
struct uchar1
{
unsigned char x;
};
struct /*__builtin_align__(2)*/ char2
{
signed char x, y;
};
struct /*__builtin_align__(2)*/ uchar2
{
unsigned char x, y;
};
struct char3
{
signed char x, y, z;
};
struct uchar3
{
unsigned char x, y, z;
};
struct /*__builtin_align__(4)*/ char4
{
signed char x, y, z, w;
};
struct /*__builtin_align__(4)*/ uchar4
{
unsigned char x, y, z, w;
};
struct short1
{
short x;
};
struct ushort1
{
unsigned short x;
};
struct /*__builtin_align__(4)*/ short2
{
short x, y;
};
struct /*__builtin_align__(4)*/ ushort2
{
unsigned short x, y;
};
struct short3
{
short x, y, z;
};
struct ushort3
{
unsigned short x, y, z;
};
struct /*__builtin_align__(8)*/ short4
{
short x, y, z, w;
};
struct /*__builtin_align__(8)*/ ushort4
{
unsigned short x, y, z, w;
};
struct int1
{
int x;
};
struct uint1
{
unsigned int x;
};
struct /*__builtin_align__(8)*/ int2
{
int x, y;
};
struct /*__builtin_align__(8)*/ uint2
{
unsigned int x, y;
};
struct int3
{
int x, y, z;
};
struct uint3
{
unsigned int x, y, z;
};
struct /*__builtin_align__(16)*/ int4
{
int x, y, z, w;
};
struct /*__builtin_align__(16)*/ uint4
{
unsigned int x, y, z, w;
};
struct long1
{
long int x;
};
struct ulong1
{
unsigned long x;
};
struct
/*#if defined (_WIN32)
__builtin_align__(8)
#else
__builtin_align__(2*sizeof(long int))
#endif*/
long2
{
long int x, y;
};
struct
/*#if defined (_WIN32)
__builtin_align__(8)
#else
__builtin_align__(2*sizeof(unsigned long int))
#endif*/
ulong2
{
unsigned long int x, y;
};
#if !defined(__LP64__)
struct long3
{
long int x, y, z;
};
struct ulong3
{
unsigned long int x, y, z;
};
struct /*__builtin_align__(16)*/ long4
{
long int x, y, z, w;
};
struct /*__builtin_align__(16)*/ ulong4
{
unsigned long int x, y, z, w;
};
#endif /* !__LP64__ */
struct float1
{
float x;
};
struct /*__builtin_align__(8)*/ float2
{
float x, y;
};
struct float3
{
float x, y, z;
};
struct /*__builtin_align__(16)*/ float4
{
float x, y, z, w;
};
struct longlong1
{
long long int x;
};
struct ulonglong1
{
unsigned long long int x;
};
struct /*__builtin_align__(16)*/ longlong2
{
long long int x, y;
};
struct /*__builtin_align__(16)*/ ulonglong2
{
unsigned long long int x, y;
};
struct double1
{
double x;
};
struct /*__builtin_align__(16)*/ double2
{
double x, y;
};
typedef struct char1 char1;
typedef struct uchar1 uchar1;
typedef struct char2 char2;
typedef struct uchar2 uchar2;
typedef struct char3 char3;
typedef struct uchar3 uchar3;
typedef struct char4 char4;
typedef struct uchar4 uchar4;
typedef struct short1 short1;
typedef struct ushort1 ushort1;
typedef struct short2 short2;
typedef struct ushort2 ushort2;
typedef struct short3 short3;
typedef struct ushort3 ushort3;
typedef struct short4 short4;
typedef struct ushort4 ushort4;
typedef struct int1 int1;
typedef struct uint1 uint1;
typedef struct int2 int2;
typedef struct uint2 uint2;
typedef struct int3 int3;
typedef struct uint3 uint3;
typedef struct int4 int4;
typedef struct uint4 uint4;
typedef struct long1 long1;
typedef struct ulong1 ulong1;
typedef struct long2 long2;
typedef struct ulong2 ulong2;
typedef struct long3 long3;
typedef struct ulong3 ulong3;
typedef struct long4 long4;
typedef struct ulong4 ulong4;
typedef struct float1 float1;
typedef struct float2 float2;
typedef struct float3 float3;
typedef struct float4 float4;
typedef struct longlong1 longlong1;
typedef struct ulonglong1 ulonglong1;
typedef struct longlong2 longlong2;
typedef struct ulonglong2 ulonglong2;
typedef struct double1 double1;
typedef struct double2 double2;
#endif /* ROSE_DEFINE_CUDA_VECTOR_TYPES */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_TYPES
typedef struct dim3 dim3;
struct dim3
{
unsigned int x, y, z;
#if defined(__cplusplus)
dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {}
dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif
};
#endif /* ROSE_DEFINE_CUDA_BUILTIN_TYPES */
#ifdef ROSE_DEFINE_CUDA_API_TYPES
enum cudaError
{
cudaSuccess = 0,
cudaErrorMissingConfiguration,
cudaErrorMemoryAllocation,
cudaErrorInitializationError,
cudaErrorLaunchFailure,
cudaErrorPriorLaunchFailure,
cudaErrorLaunchTimeout,
cudaErrorLaunchOutOfResources,
cudaErrorInvalidDeviceFunction,
cudaErrorInvalidConfiguration,
cudaErrorInvalidDevice,
cudaErrorInvalidValue,
cudaErrorInvalidPitchValue,
cudaErrorInvalidSymbol,
cudaErrorMapBufferObjectFailed,
cudaErrorUnmapBufferObjectFailed,
cudaErrorInvalidHostPointer,
cudaErrorInvalidDevicePointer,
cudaErrorInvalidTexture,
cudaErrorInvalidTextureBinding,
cudaErrorInvalidChannelDescriptor,
cudaErrorInvalidMemcpyDirection,
cudaErrorAddressOfConstant,
cudaErrorTextureFetchFailed,
cudaErrorTextureNotBound,
cudaErrorSynchronizationError,
cudaErrorInvalidFilterSetting,
cudaErrorInvalidNormSetting,
cudaErrorMixedDeviceExecution,
cudaErrorCudartUnloading,
cudaErrorUnknown,
cudaErrorNotYetImplemented,
cudaErrorMemoryValueTooLarge,
cudaErrorInvalidResourceHandle,
cudaErrorNotReady,
cudaErrorInsufficientDriver,
cudaErrorSetOnActiveProcess,
cudaErrorStartupFailure = 0x7f,
cudaErrorApiFailureBase = 10000
};
enum cudaChannelFormatKind
{
cudaChannelFormatKindSigned,
cudaChannelFormatKindUnsigned,
cudaChannelFormatKindFloat,
cudaChannelFormatKindNone
};
struct cudaChannelFormatDesc
{
int x;
int y;
int z;
int w;
enum cudaChannelFormatKind f;
};
struct cudaArray;
enum cudaMemcpyKind
{
cudaMemcpyHostToHost = 0,
cudaMemcpyHostToDevice,
cudaMemcpyDeviceToHost,
cudaMemcpyDeviceToDevice
};
struct cudaPitchedPtr
{
void *ptr;
size_t pitch;
size_t xsize;
size_t ysize;
};
struct cudaExtent
{
size_t width;
size_t height;
size_t depth;
};
struct cudaPos
{
size_t x;
size_t y;
size_t z;
};
struct cudaMemcpy3DParms
{
struct cudaArray *srcArray;
struct cudaPos srcPos;
struct cudaPitchedPtr srcPtr;
struct cudaArray *dstArray;
struct cudaPos dstPos;
struct cudaPitchedPtr dstPtr;
struct cudaExtent extent;
enum cudaMemcpyKind kind;
};
struct cudaDeviceProp
{
char name[256];
size_t totalGlobalMem;
size_t sharedMemPerBlock;
int regsPerBlock;
int warpSize;
size_t memPitch;
int maxThreadsPerBlock;
int maxThreadsDim[3];
int maxGridSize[3];
int clockRate;
size_t totalConstMem;
int major;
int minor;
size_t textureAlignment;
int deviceOverlap;
int multiProcessorCount;
int kernelExecTimeoutEnabled;
int __cudaReserved[39];
};
#define cudaDevicePropDontCare \
{ \
{'\0'}, /* char name[256]; */ \
0, /* size_t totalGlobalMem; */ \
0, /* size_t sharedMemPerBlock; */ \
0, /* int regsPerBlock; */ \
0, /* int warpSize; */ \
0, /* size_t memPitch; */ \
0, /* int maxThreadsPerBlock; */ \
{0, 0, 0}, /* int maxThreadsDim[3]; */ \
{0, 0, 0}, /* int maxGridSize[3]; */ \
0, /* int clockRate; */ \
0, /* size_t totalConstMem; */ \
-1, /* int major; */ \
-1, /* int minor; */ \
0, /* size_t textureAlignment; */ \
-1, /* int deviceOverlap; */ \
0, /* int multiProcessorCount; */ \
0 /* int kernelExecTimeoutEnabled */ \
}
typedef enum cudaError cudaError_t;
typedef int cudaStream_t;
typedef int cudaEvent_t;
enum cudaTextureAddressMode
{
cudaAddressModeWrap,
cudaAddressModeClamp
};
enum cudaTextureFilterMode
{
cudaFilterModePoint,
cudaFilterModeLinear
};
enum cudaTextureReadMode
{
cudaReadModeElementType,
cudaReadModeNormalizedFloat
};
struct textureReference
{
int normalized;
enum cudaTextureFilterMode filterMode;
enum cudaTextureAddressMode addressMode[3];
struct cudaChannelFormatDesc channelDesc;
int __cudaReserved[16];
};
#endif /* ROSE_DEFINE_CUDA_API_TYPES */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_VARIABLES
/* CUDA Built-in Variables */
dim3 gridDim;
uint3 blockIdx;
dim3 blockDim;
uint3 threadIdx;
int warpSize;
#endif /* ROSE_DEFINE_CUDA_BUILTIN_VARIABLES */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_VECTOR_FUNCTIONS
/* CUDA Built-in Functions */
/* Vector Functions (constructors) */
static __inline__ __host__ __device__ char1 make_char1(signed char x);
static __inline__ __host__ __device__ uchar1 make_uchar1(unsigned char x);
static __inline__ __host__ __device__ char2 make_char2(signed char x, signed char y);
static __inline__ __host__ __device__ uchar2 make_uchar2(unsigned char x, unsigned char y);
static __inline__ __host__ __device__ char3 make_char3(signed char x, signed char y, signed char z);
static __inline__ __host__ __device__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
static __inline__ __host__ __device__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
static __inline__ __host__ __device__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
static __inline__ __host__ __device__ short1 make_short1(short x);
static __inline__ __host__ __device__ ushort1 make_ushort1(unsigned short x);
static __inline__ __host__ __device__ short2 make_short2(short x, short y);
static __inline__ __host__ __device__ ushort2 make_ushort2(unsigned short x, unsigned short y);
static __inline__ __host__ __device__ short3 make_short3(short x,short y, short z);
static __inline__ __host__ __device__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
static __inline__ __host__ __device__ short4 make_short4(short x, short y, short z, short w);
static __inline__ __host__ __device__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
static __inline__ __host__ __device__ int1 make_int1(int x);
static __inline__ __host__ __device__ uint1 make_uint1(unsigned int x);
static __inline__ __host__ __device__ int2 make_int2(int x, int y);
static __inline__ __host__ __device__ uint2 make_uint2(unsigned int x, unsigned int y);
static __inline__ __host__ __device__ int3 make_int3(int x, int y, int z);
static __inline__ __host__ __device__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
static __inline__ __host__ __device__ int4 make_int4(int x, int y, int z, int w);
static __inline__ __host__ __device__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
static __inline__ __host__ __device__ long1 make_long1(long int x);
static __inline__ __host__ __device__ ulong1 make_ulong1(unsigned long int x);
static __inline__ __host__ __device__ long2 make_long2(long int x, long int y);
static __inline__ __host__ __device__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
static __inline__ __host__ __device__ long3 make_long3(long int x, long int y, long int z);
static __inline__ __host__ __device__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
static __inline__ __host__ __device__ long4 make_long4(long int x, long int y, long int z, long int w);
static __inline__ __host__ __device__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
static __inline__ __host__ __device__ float1 make_float1(float x);
static __inline__ __host__ __device__ float2 make_float2(float x, float y);
static __inline__ __host__ __device__ float3 make_float3(float x, float y, float z);
static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w);
static __inline__ __host__ __device__ longlong1 make_longlong1(long long int x);
static __inline__ __host__ __device__ ulonglong1 make_ulonglong1(unsigned long long int x);
static __inline__ __host__ __device__ longlong2 make_longlong2(long long int x, long long int y);
static __inline__ __host__ __device__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
static __inline__ __host__ __device__ double1 make_double1(double x);
static __inline__ __host__ __device__ double2 make_double2(double x, double y);
#endif /* ROSE_DEFINE_CUDA_BUILTIN_VECTOR_FUNCTIONS */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_SYNC_FUNCTIONS
/* Synchronization functions */
__device__ void __threadfence_block();
__device__ void __threadfence();
__device__ void __threadfence_system();
__device__ void __syncthreads();
__device__ int __syncthreads_count(int predicate);
__device__ int __syncthreads_and(int predicate);
__device__ int __syncthreads_or(int predicate);
#endif /* ROSE_DEFINE_CUDA_BUILTIN_SYNC_FUNCTIONS */
/* Time function */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_CLOCK_FUNCTIONS
//__device__ clock_t clock();
#endif /* ROSE_DEFINE_CUDA_BUILTIN_CLOCK_FUNCTIONS */
/* Atomic functions */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_ATOMIC_FUNCTIONS
static __inline__ __device__ int atomicAdd(int *address, int val);
static __inline__ __device__ unsigned int atomicAdd(unsigned int *address, unsigned int val);
static __inline__ __device__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val);
static __inline__ __device__ float atomicAdd(float *address, float val);
static __inline__ __device__ int atomicSub(int *address, int val);
static __inline__ __device__ unsigned int atomicSub(unsigned int *address, unsigned int val);
static __inline__ __device__ int atomicExch(int *address, int val);
static __inline__ __device__ unsigned int atomicExch(unsigned int *address, unsigned int val);
static __inline__ __device__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val);
static __inline__ __device__ float atomicExch(float *address, float val);
static __inline__ __device__ int atomicMin(int *address, int val);
static __inline__ __device__ unsigned int atomicMin(unsigned int *address, unsigned int val);
static __inline__ __device__ int atomicMax(int *address, int val);
static __inline__ __device__ unsigned int atomicMax(unsigned int *address, unsigned int val);
static __inline__ __device__ unsigned int atomicInc(unsigned int *address, unsigned int val);
static __inline__ __device__ unsigned int atomicDec(unsigned int *address, unsigned int val);
static __inline__ __device__ int atomicCAS(int *address, int compare, int val);
static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val);
static __inline__ __device__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val);
static __inline__ __device__ int atomicAnd(int *address, int val);
static __inline__ __device__ unsigned int atomicAnd(unsigned int *address, unsigned int val);
static __inline__ __device__ int atomicOr(int *address, int val);
static __inline__ __device__ unsigned int atomicOr(unsigned int *address, unsigned int val);
static __inline__ __device__ int atomicXor(int *address, int val);
static __inline__ __device__ unsigned int atomicXor(unsigned int *address, unsigned int val);
#endif /* ROSE_DEFINE_CUDA_BUILTIN_ATOMIC_FUNCTIONS */
/* Warp Vote functions */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_VOTE_FUNCTIONS
__device__ int __all(int cond);
__device__ int __any(int cond);
__device__ unsigned int __ballot(int);
#endif /* ROSE_DEFINE_CUDA_BUILTIN_VOTE_FUNCTIONS */
/* Profiler Counter functions */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_PROFILE_FUNCTIONS
__device__ void __prof_trigger(int);
#endif /* ROSE_DEFINE_CUDA_BUILTIN_PROFILE_FUNCTIONS */
/* Mathematical functions (TODO-CUDA) */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_MATH_FUNCTIONS
#endif /* ROSE_DEFINE_CUDA_BUILTIN_MATH_FUNCTIONS */
/* Texture functions (TODO-CUDA) */
#ifdef ROSE_DEFINE_CUDA_BUILTIN_TEXTURE_FUNCTIONS
#endif /* ROSE_DEFINE_CUDA_BUILTIN_TEXTURE_FUNCTIONS */
/* CUDA API (TODO-CUDA) */
#ifdef ROSE_DEFINE_CUDA_API
#if !defined(__dv)
#if defined(__cplusplus)
#define __dv(v) \
= v
#else /* __cplusplus */
#define __dv(v)
#endif /* __cplusplus */
#endif /* !__dv */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaMalloc3D(struct cudaPitchedPtr* pitchDevPtr, struct cudaExtent extent);
extern __host__ cudaError_t cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent);
extern __host__ cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchDevPtr, int value, struct cudaExtent extent);
extern __host__ cudaError_t cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
extern __host__ cudaError_t cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaMalloc(void **devPtr, size_t size);
extern __host__ cudaError_t cudaMallocHost(void **ptr, size_t size);
extern __host__ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
extern __host__ cudaError_t cudaMallocArray(struct cudaArray **array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(1));
extern __host__ cudaError_t cudaFree(void *devPtr);
extern __host__ cudaError_t cudaFreeHost(void *ptr);
extern __host__ cudaError_t cudaFreeArray(struct cudaArray *array);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t cudaMemcpyToArray(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t cudaMemcpyArrayToArray(struct cudaArray *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
extern __host__ cudaError_t cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t cudaMemcpy2DToArray(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t cudaMemcpy2DFromArray(void *dst, size_t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t cudaMemcpy2DArrayToArray(struct cudaArray *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
extern __host__ cudaError_t cudaMemcpyToSymbol(const char *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
extern __host__ cudaError_t cudaMemcpyFromSymbol(void *dst, const char *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t cudaMemcpyToArrayAsync(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t cudaMemcpyFromArrayAsync(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t cudaMemcpy2DToArrayAsync(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t cudaMemcpyToSymbolAsync(const char *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t cudaMemcpyFromSymbolAsync(void *dst, const char *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaMemset(void *mem, int c, size_t count);
extern __host__ cudaError_t cudaMemset2D(void *mem, size_t pitch, int c, size_t width, size_t height);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaGetSymbolAddress(void **devPtr, const char *symbol);
extern __host__ cudaError_t cudaGetSymbolSize(size_t *size, const char *symbol);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaGetDeviceCount(int *count);
extern __host__ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device);
extern __host__ cudaError_t cudaChooseDevice(int *device, const struct cudaDeviceProp *prop);
extern __host__ cudaError_t cudaSetDevice(int device);
extern __host__ cudaError_t cudaGetDevice(int *device);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size /*__dv(UINT_MAX)*/);
extern __host__ cudaError_t cudaBindTextureToArray(const struct textureReference *texref, const struct cudaArray *array, const struct cudaChannelFormatDesc *desc);
extern __host__ cudaError_t cudaUnbindTexture(const struct textureReference *texref);
extern __host__ cudaError_t cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref);
extern __host__ cudaError_t cudaGetTextureReference(const struct textureReference **texref, const char *symbol);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, const struct cudaArray *array);
extern __host__ struct cudaChannelFormatDesc cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaGetLastError(void);
extern __host__ const char* cudaGetErrorString(cudaError_t error);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));
extern __host__ cudaError_t cudaSetupArgument(const void *arg, size_t size, size_t offset);
extern __host__ cudaError_t cudaLaunch(const char *symbol);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaStreamCreate(cudaStream_t *stream);
extern __host__ cudaError_t cudaStreamDestroy(cudaStream_t stream);
extern __host__ cudaError_t cudaStreamSynchronize(cudaStream_t stream);
extern __host__ cudaError_t cudaStreamQuery(cudaStream_t stream);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaEventCreate(cudaEvent_t *event);
extern __host__ cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __host__ cudaError_t cudaEventQuery(cudaEvent_t event);
extern __host__ cudaError_t cudaEventSynchronize(cudaEvent_t event);
extern __host__ cudaError_t cudaEventDestroy(cudaEvent_t event);
extern __host__ cudaError_t cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaSetDoubleForDevice(double *d);
extern __host__ cudaError_t cudaSetDoubleForHost(double *d);
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
extern __host__ cudaError_t cudaThreadExit(void);
extern __host__ cudaError_t cudaThreadSynchronize(void);
/* DQ (1/18/2016): Adding functions that appear to be specific to Cuda 7.0 (required for Jeff's example code) */
cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flags=0);
cudaError_t cudaDeviceSynchronize();
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* ROSE_DEFINE_CUDA_API */
#if ROSE_DEFINE_GNU_BUILTIN_FOR_CUDA
/* DQ (3/1/2017): Moved this to be outside of the extern "C" declarations above (required to
support possible multiple overloaded declarations of __builtin_bswap32).
DQ (2/28/2017): Added builtin function to support CUDA code (this declaration is required
to compile cu test file test_2011_13.cu on gcc-4.8-c++11 ubuntu1404 platform).
*/
// int __builtin_bswap32 (int x);
// int32_t __builtin_bswap32 (int32_t x);
#ifdef __INT32_TYPE__
__INT32_TYPE__ __builtin_bswap32 (__INT32_TYPE__ x);
#else
int __builtin_bswap32 (int x);
#endif
#endif /* ROSE_DEFINE_GNU_BUILTIN_FOR_CUDA */
#if ROSE_DEFINE_CUDA_BUILTIN_TEXTURES
// \pp added preliminry support for textures (incomplete)
#include "texture_types.h"
#if defined(__cplusplus)
// \todo to be completed..
// is there a header for the C++ variants?
template <class DataType, int, cudaTextureReadMode>
struct texture : textureReference
{};
template <class DataType>
DataType
tex1Dfetch(texture <DataType, cudaTextureType1D, cudaReadModeElementType> texRef, int x ) ;
template<class T>
cudaChannelFormatDesc cudaCreateChannelDesc();
template<class T, int dim, cudaTextureReadMode readMode>
cudaError_t
cudaBindTexture( size_t* offset,
const texture< T, dim, readMode>& tex,
const void* devPtr,
const cudaChannelFormatDesc& desc,
size_t size = UINT_MAX
);
template<class T, int dim, cudaTextureReadMode readMode>
cudaError_t
cudaBindTexture( size_t* offset,
const texture<T, dim, readMode>& tex,
const void* devPtr,
size_t size = UINT_MAX
);
#endif /* __cplusplus */
#endif /* ROSE_DEFINE_CUDA_BUILTIN_TEXTURES */
#ifdef __host__
# undef __host__
#endif
#ifdef __device__
# undef __device__
#endif
#ifdef __global__
# undef __global__
#endif
#ifdef __shared__
# undef __shared__
#endif
#ifdef __constant__
# undef __constant__
#endif
#ifdef __managed__
# undef __managed__
#endif