Skip to content
This repository was archived by the owner on May 27, 2021. It is now read-only.

Commit c08d25f

Browse files
authored
Merge pull request #519 from JuliaGPU/tb/ccall_extern
Use ccall extern
2 parents 8e0de55 + a2b2b68 commit c08d25f

File tree

7 files changed

+234
-452
lines changed

7 files changed

+234
-452
lines changed

.gitlab-ci.yml

Lines changed: 11 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,38 +9,17 @@ variables:
99

1010
# Julia versions
1111

12-
julia:1.0:
13-
extends:
14-
- .julia:1.0
15-
- .test
16-
tags:
17-
- nvidia
18-
19-
julia:1.1:
20-
extends:
21-
- .julia:1.1
22-
- .test
23-
tags:
24-
- nvidia
25-
2612
# the "primary" target, where we require a new GPU to make sure all tests are run
27-
julia:1.2:
13+
julia:1.3:
2814
extends:
29-
- .julia:1.2
15+
- .julia:1.3
3016
- .test
3117
tags:
3218
- nvidia
3319
- sm_75
3420
variables:
3521
CI_THOROUGH: 'true'
3622

37-
julia:1.3:
38-
extends:
39-
- .julia:1.3
40-
- .test
41-
tags:
42-
- nvidia
43-
4423
julia:nightly:
4524
extends:
4625
- .julia:nightly
@@ -58,31 +37,31 @@ julia:nightly:
5837
cuda:10.1:
5938
image: nvidia/cuda:10.1-devel
6039
extends:
61-
- .julia:1.2
40+
- .julia:1.3
6241
- .test
6342
tags:
6443
- nvidia
6544

6645
cuda:10.0:
6746
image: nvidia/cuda:10.0-devel
6847
extends:
69-
- .julia:1.2
48+
- .julia:1.3
7049
- .test
7150
tags:
7251
- nvidia
7352

7453
cuda:9.2:
7554
image: nvidia/cuda:9.2-devel
7655
extends:
77-
- .julia:1.2
56+
- .julia:1.3
7857
- .test
7958
tags:
8059
- nvidia
8160

8261
cuda:9.0:
8362
image: nvidia/cuda:9.0-devel
8463
extends:
85-
- .julia:1.2
64+
- .julia:1.3
8665
- .test
8766
tags:
8867
- nvidia
@@ -93,7 +72,7 @@ cuda:9.0:
9372
platform:arm64:
9473
image: nvcr.io/nvidia/l4t-base:r32.2.1
9574
extends:
96-
- .julia:1.2
75+
- .julia:1.3
9776
- .test
9877
tags:
9978
- nvidia-arm64
@@ -106,7 +85,7 @@ platform:arm64:
10685
# that means we have to manually install CuArrays' test dependencies though.
10786

10887
cuarrays:
109-
extends: .julia:1.2
88+
extends: .julia:1.3
11089
tags:
11190
- nvidia
11291
image: nvidia/cuda:10.1-devel
@@ -126,7 +105,7 @@ cuarrays:
126105
# other tasks
127106

128107
precompile:
129-
extends: .julia:1.2
108+
extends: .julia:1.3
130109
image: ubuntu:bionic
131110
script:
132111
- julia --project -e 'using Pkg;
@@ -138,12 +117,12 @@ precompile:
138117

139118
coverage:
140119
extends:
141-
- .julia:1.2
120+
- .julia:1.3
142121
- .coverage
143122

144123
documentation:
145124
extends:
146-
- .julia:1.2
125+
- .julia:1.3
147126
- .documentation
148127
tags:
149128
- nvidia

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ CUDAdrv = "5.0"
2222
DataStructures = "0.15, 0.16, 0.17"
2323
LLVM = "1.2"
2424
TimerOutputs = "0.5"
25-
julia = "1"
25+
julia = "1.3"
2626

2727
[extras]
2828
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"

src/device/cuda/libcudadevrt.jl

Lines changed: 30 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,8 @@ function cudaDeviceGetSharedMemConfig(pConfig)
3333
pConfig)
3434
end
3535

36-
if VERSION >= v"1.2.0-DEV.512"
37-
function cudaDeviceSynchronize()
38-
ccall("extern cudaDeviceSynchronize", llvmcall, cudaError_t, ())
39-
end
40-
else
41-
@eval cudaDeviceSynchronize() = Base.llvmcall(
42-
$("declare i32 @cudaDeviceSynchronize()",
43-
"%rv = call i32 @cudaDeviceSynchronize()
44-
ret i32 %rv"), cudaError_t, Tuple{})
36+
function cudaDeviceSynchronize()
37+
ccall("extern cudaDeviceSynchronize", llvmcall, cudaError_t, ())
4538
end
4639

4740
function cudaGetLastError()
@@ -76,37 +69,16 @@ function cudaGetDevice(device)
7669
device)
7770
end
7871

79-
if VERSION >= v"1.2.0-DEV.512"
80-
function cudaStreamCreateWithFlags(pStream, flags)
81-
ccall("extern cudaStreamCreateWithFlags", llvmcall, cudaError_t,
82-
(Ptr{cudaStream_t}, UInt32),
83-
pStream, flags)
84-
end
85-
else
86-
@eval cudaStreamCreateWithFlags(pStream, flags) = Base.llvmcall(
87-
$("declare i32 @cudaStreamCreateWithFlags(i8**, i32)",
88-
"%pStream = inttoptr i$WORD_SIZE %0 to i8**
89-
%rv = call i32 @cudaStreamCreateWithFlags(i8** %pStream, i32 %1)
90-
ret i32 %rv"), cudaError_t,
91-
Tuple{Ptr{cudaStream_t}, UInt32},
92-
Base.unsafe_convert(Ptr{cudaStream_t}, Base.cconvert(Ptr{cudaStream_t}, pStream)),
93-
Base.unsafe_convert(UInt32, Base.cconvert(UInt32, flags)))
94-
end
95-
96-
if VERSION >= v"1.2.0-DEV.512"
97-
function cudaStreamDestroy(stream)
98-
ccall("extern cudaStreamDestroy", llvmcall, cudaError_t,
99-
(cudaStream_t,),
100-
stream)
101-
end
102-
else
103-
@eval cudaStreamDestroy(stream) = Base.llvmcall(
104-
$("declare i32 @cudaStreamDestroy(i8*)",
105-
"%stream = inttoptr i$WORD_SIZE %0 to i8*
106-
%rv = call i32 @cudaStreamDestroy(i8* %stream)
107-
ret i32 %rv"), cudaError_t,
108-
Tuple{cudaStream_t},
109-
Base.unsafe_convert(cudaStream_t, Base.cconvert(cudaStream_t, stream)))
72+
function cudaStreamCreateWithFlags(pStream, flags)
73+
ccall("extern cudaStreamCreateWithFlags", llvmcall, cudaError_t,
74+
(Ptr{cudaStream_t}, UInt32),
75+
pStream, flags)
76+
end
77+
78+
function cudaStreamDestroy(stream)
79+
ccall("extern cudaStreamDestroy", llvmcall, cudaError_t,
80+
(cudaStream_t,),
81+
stream)
11082
end
11183

11284
function cudaStreamWaitEvent(stream, event, flags)
@@ -249,31 +221,10 @@ function cudaGetParameterBuffer(alignment, size)
249221
alignment, size)
250222
end
251223

252-
if VERSION >= v"1.2.0-DEV.512"
253-
function cudaGetParameterBufferV2(func, gridDimension, blockDimension, sharedMemSize)
254-
ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid},
255-
(Ptr{Cvoid}, dim3, dim3, UInt32),
256-
func, gridDimension, blockDimension, sharedMemSize)
257-
end
258-
else
259-
@eval cudaGetParameterBufferV2(func, gridDimension, blockDimension, sharedMemSize) =
260-
Base.llvmcall(
261-
$("declare i8* @cudaGetParameterBufferV2(i8*, {i32,i32,i32}, {i32,i32,i32}, i32)",
262-
"%func = inttoptr i$WORD_SIZE %0 to i8*
263-
%gridDimension.x = insertvalue { i32, i32, i32 } undef, i32 %1, 0
264-
%gridDimension.y = insertvalue { i32, i32, i32 } %gridDimension.x, i32 %2, 1
265-
%gridDimension.z = insertvalue { i32, i32, i32 } %gridDimension.y, i32 %3, 2
266-
%blockDimension.x = insertvalue { i32, i32, i32 } undef, i32 %4, 0
267-
%blockDimension.y = insertvalue { i32, i32, i32 } %blockDimension.x, i32 %5, 1
268-
%blockDimension.z = insertvalue { i32, i32, i32 } %blockDimension.y, i32 %6, 2
269-
%rv = call i8* @cudaGetParameterBufferV2(i8* %func, {i32,i32,i32} %gridDimension.z, {i32,i32,i32} %blockDimension.z, i32 %7)
270-
%buf = ptrtoint i8* %rv to i$WORD_SIZE
271-
ret i$WORD_SIZE %buf"), Ptr{Cvoid},
272-
Tuple{Ptr{Cvoid}, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint},
273-
Base.unsafe_convert(Ptr{Cvoid}, Base.cconvert(Ptr{Cvoid}, func)),
274-
gridDimension.x, gridDimension.y, gridDimension.z, # known to be Cuint
275-
blockDimension.x, blockDimension.y, blockDimension.z, # known to be Cuint
276-
Base.unsafe_convert(Cuint, Base.cconvert(Cuint, sharedMemSize)))
224+
function cudaGetParameterBufferV2(func, gridDimension, blockDimension, sharedMemSize)
225+
ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid},
226+
(Ptr{Cvoid}, dim3, dim3, UInt32),
227+
func, gridDimension, blockDimension, sharedMemSize)
277228
end
278229

279230
function cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension,
@@ -296,23 +247,10 @@ function cudaLaunchDevice(func, parameterBuffer, gridDimension, blockDimension,
296247
func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream)
297248
end
298249

299-
if VERSION >= v"1.2.0-DEV.512"
300-
function cudaLaunchDeviceV2(parameterBuffer, stream)
301-
ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t,
302-
(Ptr{Cvoid}, cudaStream_t),
303-
parameterBuffer, stream)
304-
end
305-
else
306-
@eval cudaLaunchDeviceV2(parameterBuffer, stream) =
307-
Base.llvmcall(
308-
$("declare i32 @cudaLaunchDeviceV2(i8*, i8*)",
309-
"%buf = inttoptr i$WORD_SIZE %0 to i8*
310-
%stream = inttoptr i$WORD_SIZE %1 to i8*
311-
%rv = call i32 @cudaLaunchDeviceV2(i8* %buf, i8* %stream)
312-
ret i32 %rv"), cudaError_t,
313-
Tuple{Ptr{Cvoid}, cudaStream_t},
314-
Base.unsafe_convert(Ptr{Cvoid}, Base.cconvert(Ptr{Cvoid}, parameterBuffer)),
315-
Base.unsafe_convert(cudaStream_t, Base.cconvert(cudaStream_t, stream)))
250+
function cudaLaunchDeviceV2(parameterBuffer, stream)
251+
ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t,
252+
(Ptr{Cvoid}, cudaStream_t),
253+
parameterBuffer, stream)
316254
end
317255

318256
function cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize,
@@ -329,35 +267,16 @@ function cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
329267
numBlocks, func, blockSize, dynamicSmemSize, flags)
330268
end
331269

332-
if VERSION >= v"1.2.0-DEV.512"
333-
function cudaCGGetIntrinsicHandle(scope)
334-
ccall("extern cudaCGGetIntrinsicHandle", llvmcall, Culonglong,
335-
(cudaCGScope,),
336-
scope)
337-
end
338-
else
339-
@eval cudaCGGetIntrinsicHandle(scope) = Base.llvmcall(
340-
$("declare i64 @cudaCGGetIntrinsicHandle(i32)",
341-
"%rv = call i64 @cudaCGGetIntrinsicHandle(i32 1)
342-
ret i64 %rv"), Culonglong,
343-
Tuple{cudaCGScope},
344-
Base.unsafe_convert(cudaCGScope, Base.cconvert(cudaCGScope, scope)))
345-
end
346-
347-
if VERSION >= v"1.2.0-DEV.512"
348-
function cudaCGSynchronize(handle, flags)
349-
ccall("extern cudaCGSynchronize", llvmcall, cudaError_t,
350-
(Culonglong, UInt32),
351-
handle, flags)
352-
end
353-
else
354-
@eval cudaCGSynchronize(handle, flags) = Base.llvmcall(
355-
$("declare i32 @cudaCGSynchronize(i64, i32)",
356-
"%rv = call i32 @cudaCGSynchronize(i64 %0, i32 %1)
357-
ret i32 %rv"), cudaError_t,
358-
Tuple{Culonglong, UInt32},
359-
Base.unsafe_convert(Culonglong, Base.cconvert(Culonglong, handle)),
360-
Base.unsafe_convert(UInt32, Base.cconvert(UInt32, flags)))
270+
function cudaCGGetIntrinsicHandle(scope)
271+
ccall("extern cudaCGGetIntrinsicHandle", llvmcall, Culonglong,
272+
(cudaCGScope,),
273+
scope)
274+
end
275+
276+
function cudaCGSynchronize(handle, flags)
277+
ccall("extern cudaCGSynchronize", llvmcall, cudaError_t,
278+
(Culonglong, UInt32),
279+
handle, flags)
361280
end
362281

363282
function cudaCGSynchronizeGrid(handle, flags)

0 commit comments

Comments
 (0)