Skip to content

Commit 745aacd

Browse files
GCP support
1 parent 4a51267 commit 745aacd

File tree

6 files changed

+91
-39
lines changed

6 files changed

+91
-39
lines changed

src/cluster_providers/ec2/ec2_deploy.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ function deploy_cluster(_::Type{AmazonEC2},
5454

5555
count = get(cluster_features, :node_count, 1)
5656
imageid = get(cluster_features, :imageid, defaults_dict[AmazonEC2][:imageid])
57-
user = get(cluster_features, :user, defaults_dict[GoogleCloud][:user])
57+
user = get(cluster_features, :user, defaults_dict[AmazonEC2][:user])
5858

5959
subnet_id = get(cluster_features, :subnet_id, get(defaults_dict[AmazonEC2], :subnet_id, nothing))
6060
placement_group = get(cluster_features, :placement_group, get(defaults_dict[AmazonEC2], :placement_group, nothing))
@@ -75,7 +75,7 @@ function deploy_cluster(_::Type{AmazonEC2},
7575
return cluster
7676
end
7777

78-
ec2_build_clusterobj(_::Type{<:PeerWorkers}, cluster_handle, instance_type, count, imageid, subnet_id, user,
78+
ec2_build_clusterobj(_::Type{<:PeerWorkers}, cluster_handle, instance_type, count, imageid, user, subnet_id,
7979
placement_group, auto_pg, security_group_id, auto_sg, cluster_features) =
8080
EC2PeerWorkers(cluster_handle, instance_type, count, imageid, user,
8181
subnet_id, placement_group, auto_pg, security_group_id, auto_sg,

src/cluster_providers/ec2/ec2_persist.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ function ec2_cluster_save(cluster::ManagerWorkers)
1414
contents["count"] = cluster.count
1515
contents["image_id_manager"] = cluster.image_id_manager
1616
contents["image_id_worker"] = cluster.image_id_worker
17+
contents["user_manager"] = cluster.user_manager
18+
contents["user_worker"] = cluster.user_worker
1719
!isnothing(cluster.subnet_id) && (contents["subnet_id"] = cluster.subnet_id)
1820
!isnothing(cluster.placement_group) && (contents["placement_group"] = cluster.placement_group)
1921
contents["auto_pg"] = cluster.auto_pg
@@ -49,6 +51,7 @@ function ec2_cluster_save(cluster::PeerWorkers)
4951
contents["instance_type"] = cluster.instance_type
5052
contents["count"] = cluster.count
5153
contents["image_id"] = cluster.image_id
54+
contents["user"] = cluster.user
5255
!isnothing(cluster.subnet_id) && (contents["subnet_id"] = cluster.subnet_id)
5356
!isnothing(cluster.placement_group) && (contents["placement_group"] = cluster.placement_group)
5457
contents["auto_pg"] = cluster.auto_pg
@@ -79,6 +82,8 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:ManagerWorkers}, cluster_han
7982
count = contents["count"]
8083
image_id_manager = contents["image_id_manager"]
8184
image_id_worker = contents["image_id_worker"]
85+
user_manager = contents["user_manager"]
86+
user_worker = contents["user_worker"]
8287
subnet_id = haskey(contents, "subnet_id") ? contents["subnet_id"] : nothing
8388
placement_group = haskey(contents, "placement_group") ? contents["placement_group"] : nothing
8489
auto_pg = contents["auto_pg"]
@@ -95,7 +100,7 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:ManagerWorkers}, cluster_han
95100
shared_fs = contents["shared_fs"]
96101

97102
cluster = EC2ManagerWorkers(string(cluster_handle), instance_type_manager, instance_type_worker, count,
98-
image_id_manager, image_id_worker,
103+
image_id_manager, image_id_worker, user_manager, user_worker,
99104
subnet_id, placement_group, auto_pg, security_group_id, auto_sg,
100105
environment, cluster_nodes, shared_fs, cluster_features)
101106

@@ -125,6 +130,7 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:PeerWorkers}, cluster_handle
125130
instance_type = contents["instance_type"]
126131
count = contents["count"]
127132
image_id = contents["image_id"]
133+
user = contents["user"]
128134
subnet_id = haskey(contents, "subnet_id") ? contents["subnet_id"] : nothing
129135
placement_group = haskey(contents, "placement_group") ? contents["placement_group"] : nothing
130136
auto_pg = contents["auto_pg"]
@@ -141,7 +147,7 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:PeerWorkers}, cluster_handle
141147
shared_fs = contents["shared_fs"]
142148

143149
cluster = EC2PeerWorkers(string(cluster_handle), instance_type, count,
144-
image_id,
150+
image_id, user,
145151
subnet_id, placement_group, auto_pg, security_group_id, auto_sg,
146152
environment, cluster_nodes, shared_fs, cluster_features)
147153

src/cluster_providers/gcp/gcp_backend.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ mutable struct GCPPeerWorkersMPI <: PeerWorkersMPI # Cluster
5656
image_id::String
5757
count::Int
5858
instance_type::String
59+
user::String
5960
zone::String
61+
project::String
6062
cluster_nodes::Union{Dict{Symbol, String}, Nothing}
6163
features::Dict{Symbol, Any}
6264
end

src/cluster_providers/gcp/gcp_deploy.jl

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ end
6666
# 1. run the script to clusterize the nodes
6767
# 2. call deploy_cluster to link ...
6868
function deploy_cluster(gcptype::Type{GoogleCloud},
69-
_::Type{<:PeerWorkers},
69+
cluster_type::Type{<:PeerWorkers},
7070
_::Type{<:CreateMode},
7171
cluster_handle,
7272
cluster_features,
@@ -78,15 +78,16 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
7878
zone = get(cluster_features, :zone, defaults_dict[GoogleCloud][:zone])
7979
project = defaults_dict[GoogleCloud][:project]
8080

81-
cluster = GCPPeerWorkers(string(cluster_handle),
82-
imageid,
83-
node_count,
84-
instance_type,
85-
user,
86-
zone,
87-
project,
88-
nothing,
89-
cluster_features)
81+
cluster = gcp_build_clusterobj(cluster_type,
82+
string(cluster_handle),
83+
imageid,
84+
node_count,
85+
instance_type,
86+
user,
87+
zone,
88+
project,
89+
nothing,
90+
cluster_features)
9091

9192
gcp_create_cluster(cluster)
9293

@@ -97,11 +98,20 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
9798
return cluster
9899
end
99100

101+
gcp_build_clusterobj(_::Type{<:PeerWorkers}, name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) =
102+
GCPPeerWorkers(name, image_id, count, instance_type, user, zone, project, cluster_nodes, features)
103+
104+
gcp_build_clusterobj(_::Type{<:PeerWorkersMPI}, name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) =
105+
GCPPeerWorkersMPI(name, image_id, count, instance_type, user, zone, project, cluster_nodes, features)
100106

101-
function launch_processes(_::Type{GoogleCloud}, cluster_type, cluster_handle, ips)
107+
function launch_processes(_::Type{GoogleCloud}, cluster_type::Type{<:Cluster}, cluster_handle, ips)
102108
cluster = gcp_cluster_info[cluster_handle]
109+
launch_processes_ssh(cluster.features, cluster_type, ips)
110+
end
103111

104-
return launch_processes_ssh(cluster.features, cluster_type, ips)
112+
function launch_processes(_::Type{GoogleCloud}, cluster_type::Type{<:PeerWorkersMPI}, cluster_handle, ips)
113+
cluster = gcp_cluster_info[cluster_handle]
114+
launch_processes_mpi(cluster.features, cluster_type, ips)
105115
end
106116

107117
#==== INTERRUPT CLUSTER ====#

src/deploy.jl

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@ default_sshflags(provider_type) = defaults_dict[provider_type][:sshflags]
2222

2323
function extract_mwfeature(cluster_features, provider_type, featureid)
2424
if haskey(cluster_features, :manager_features) &&
25-
haskey(cluster_features, :worker_features) &&
26-
haskey(cluster_features[:manager_features], featureid) &&
27-
haskey(cluster_features[:worker_features], featureid) &&
28-
!haskey(cluster_features, featureid)
29-
feature_manager = cluster_features[:manager_features][featureid]
30-
feature_worker = cluster_features[:worker_features][featureid]
25+
haskey(cluster_features, :worker_features) &&
26+
haskey(cluster_features[:manager_features], featureid) &&
27+
haskey(cluster_features[:worker_features], featureid) &&
28+
!haskey(cluster_features, featureid)
29+
feature_manager = cluster_features[:manager_features][featureid]
30+
feature_worker = cluster_features[:worker_features][featureid]
3131
elseif haskey(cluster_features, featureid)
3232
feature_manager = feature_worker = cluster_features[featureid]
3333
else
@@ -65,6 +65,7 @@ function cluster_deploy(contract_handle, config_args...)
6565
cluster_terminate(cluster_handle)
6666
return :unsupported_mwcluster
6767
else
68+
save_exception_details()
6869
@error "Some error deploying cluster $cluster_handle ($e)"
6970
@warn "the cluster will be terminated"
7071
cluster_terminate(cluster_handle)
@@ -385,12 +386,16 @@ function cluster_interrupt(cluster_handle)
385386
try
386387
kill_processes(cluster_handle, cluster_type, cluster_features)
387388
sleep(1)
389+
catch e
390+
save_exception_details()
391+
@warn "error killing processes of cluster $cluster_handle ($e)"
388392
finally
389393
interrupt_cluster(node_provider, cluster_handle)
390394
end
391395
#@info "the cluster $cluster_handle has been interrupted"
392396
catch e
393-
println(e)
397+
save_exception_details()
398+
@error "error interrupting cluster $cluster_handle ($e)"
394399
return :fail
395400
end
396401
return :success
@@ -409,20 +414,21 @@ function cluster_resume(cluster_handle)
409414
try
410415
pids = launch_processes(node_provider, cluster_type, cluster_handle, ips)
411416
catch e
417+
save_exception_details()
412418
@warn "some error creating processes for cluster $cluster_handle ($e)"
419+
@warn "use '@restart $cluster_handle' to launch processes of cluster $cluster_handle."
413420
end
414421

415422
if !isnothing(pids)
416423
cluster_deploy_info[cluster_handle][:pids] = pids
417-
else
418-
@error "resume partially failed due to an unrecoverable error in launching processes"
419424
end
420425

421-
#@info "the cluster $cluster_handle has been resumed"
422426
catch e
423-
println(e)
427+
save_exception_details()
428+
@error "error resuming cluster $cluster_handle ($e)"
424429
return :fail
425430
end
431+
426432
return :success
427433
end
428434

@@ -436,14 +442,17 @@ function cluster_terminate(cluster_handle)
436442
try
437443
cluster_isrunning(node_provider, cluster_handle) && kill_processes(cluster_handle, cluster_features[:cluster_type], cluster_features)
438444
sleep(1)
445+
catch e
446+
save_exception_details()
447+
@warn "error killing processes of cluster $cluster_handle ($e)"
439448
finally
440449
terminate_cluster(node_provider, cluster_handle)
441450
terminated_cluster[cluster_handle] = cluster_deploy_info[cluster_handle]
442451
delete!(cluster_deploy_info, cluster_handle)
443452
end
444-
#@info "the cluster $cluster_handle has been terminated"
445453
catch e
446-
println(e)
454+
save_exception_details()
455+
@error "error terminating cluster $cluster_handle ($e)"
447456
return :fail
448457
end
449458
return :success
@@ -480,13 +489,16 @@ function cluster_restart(cluster_handle::Symbol)
480489
cluster_type = cluster_features[:cluster_type]
481490
try
482491
kill_processes(cluster_handle, cluster_type, cluster_features)
483-
finally
484-
ips = get_ips(cluster_provider, cluster_handle)
485-
pids = launch_processes(cluster_provider, cluster_type, cluster_handle, ips)
486-
cluster_deploy_info[cluster_handle][:pids] = pids
487-
end
492+
catch e
493+
error("error killing processes of cluster $cluster_handle ($e)")
494+
throw(e)
495+
end
496+
ips = get_ips(cluster_provider, cluster_handle)
497+
pids = launch_processes(cluster_provider, cluster_type, cluster_handle, ips)
498+
cluster_deploy_info[cluster_handle][:pids] = pids
488499
catch e
489-
println(e)
500+
save_exception_details()
501+
@error "error restarting processes of cluster $cluster_handle ($e)"
490502
return :fail
491503
end
492504

@@ -519,8 +531,8 @@ function cluster_reconnect(cluster_handle::Symbol)
519531
try
520532
pids = launch_processes(cluster_provider, cluster_type, cluster_handle, ips)
521533
catch e
534+
save_exception_details()
522535
@warn "exception caught when launching processes ($e) - fix the problem and try '@restart :$cluster_handle'"
523-
@error "error launching processes"
524536
end
525537

526538
if !isnothing(pids)
@@ -533,6 +545,7 @@ function cluster_reconnect(cluster_handle::Symbol)
533545
@error "The cluster $cluster_handle is not active"
534546
end
535547
catch e
548+
save_exception_details()
536549
println(e)
537550
return :fail
538551
end
@@ -544,6 +557,7 @@ function cluster_reconnect(cluster_handle::Symbol)
544557

545558

546559
function report_exception(e)
560+
save_exception_details()
547561
if e isa CompositeException
548562
@info "reporting composite exception:"
549563
for ee in e.exceptions
@@ -590,12 +604,12 @@ function load_cluster(cluster_handle::String; from = DateTime(0), cluster_type =
590604
result[:timestamp] = timestamp
591605
result[:features] = cluster_features
592606
else
593-
@warn "$this_cluster_type cluster $cluster_handle is not active"
607+
@warn "$this_cluster_type cluster $cluster_handle is not accessible"
594608
end
595609
end
596610
catch e
597-
@error e
598-
@error "cluster $cluster_handle not found"
611+
save_exception_details()
612+
@warn "cluster $cluster_handle not found"
599613
end
600614
return result
601615
end

src/utils.jl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,24 @@ function try_run(command)
2424
end
2525
end
2626

27+
end
28+
29+
last_exceptions = Ref{Vector{Any}}(Vector{Any}())
30+
31+
function save_exception_details()
32+
33+
empty!(last_exceptions[])
34+
for (exc, bt) in current_exceptions()
35+
push!(last_exceptions[],(exc, bt))
36+
end
37+
38+
end
39+
40+
function show_exceptions()
41+
42+
for (exc, bt) in last_exceptions[]
43+
showerror(stdout, exc, bt)
44+
println(stdout)
45+
end
46+
2747
end

0 commit comments

Comments
 (0)