@@ -22,12 +22,12 @@ default_sshflags(provider_type) = defaults_dict[provider_type][:sshflags]
2222
2323function extract_mwfeature (cluster_features, provider_type, featureid)
2424 if haskey (cluster_features, :manager_features ) &&
25- haskey (cluster_features, :worker_features ) &&
26- haskey (cluster_features[:manager_features ], featureid) &&
27- haskey (cluster_features[:worker_features ], featureid) &&
28- ! haskey (cluster_features, featureid)
29- feature_manager = cluster_features[:manager_features ][featureid]
30- feature_worker = cluster_features[:worker_features ][featureid]
25+ haskey (cluster_features, :worker_features ) &&
26+ haskey (cluster_features[:manager_features ], featureid) &&
27+ haskey (cluster_features[:worker_features ], featureid) &&
28+ ! haskey (cluster_features, featureid)
29+ feature_manager = cluster_features[:manager_features ][featureid]
30+ feature_worker = cluster_features[:worker_features ][featureid]
3131 elseif haskey (cluster_features, featureid)
3232 feature_manager = feature_worker = cluster_features[featureid]
3333 else
@@ -65,6 +65,7 @@ function cluster_deploy(contract_handle, config_args...)
6565 cluster_terminate (cluster_handle)
6666 return :unsupported_mwcluster
6767 else
68+ save_exception_details ()
6869 @error " Some error deploying cluster $cluster_handle ($e )"
6970 @warn " the cluster will be terminated"
7071 cluster_terminate (cluster_handle)
@@ -385,12 +386,16 @@ function cluster_interrupt(cluster_handle)
385386 try
386387 kill_processes (cluster_handle, cluster_type, cluster_features)
387388 sleep (1 )
389+ catch e
390+ save_exception_details ()
391+ @warn " error killing processes of cluster $cluster_handle ($e )"
388392 finally
389393 interrupt_cluster (node_provider, cluster_handle)
390394 end
391395 # @info "the cluster $cluster_handle has been interrupted"
392396 catch e
393- println (e)
397+ save_exception_details ()
398+ @error " error interrupting cluster $cluster_handle ($e )"
394399 return :fail
395400 end
396401 return :success
@@ -409,20 +414,21 @@ function cluster_resume(cluster_handle)
409414 try
410415 pids = launch_processes (node_provider, cluster_type, cluster_handle, ips)
411416 catch e
417+ save_exception_details ()
412418 @warn " some error creating processes for cluster $cluster_handle ($e )"
419+ @warn " use '@restart $cluster_handle ' to launch processes of cluster $cluster_handle ."
413420 end
414421
415422 if ! isnothing (pids)
416423 cluster_deploy_info[cluster_handle][:pids ] = pids
417- else
418- @error " resume partially failed due to an unrecoverable error in launching processes"
419424 end
420425
421- # @info "the cluster $cluster_handle has been resumed"
422426 catch e
423- println (e)
427+ save_exception_details ()
428+ @error " error resuming cluster $cluster_handle ($e )"
424429 return :fail
425430 end
431+
426432 return :success
427433end
428434
@@ -436,14 +442,17 @@ function cluster_terminate(cluster_handle)
436442 try
437443 cluster_isrunning (node_provider, cluster_handle) && kill_processes (cluster_handle, cluster_features[:cluster_type ], cluster_features)
438444 sleep (1 )
445+ catch e
446+ save_exception_details ()
447+ @warn " error killing processes of cluster $cluster_handle ($e )"
439448 finally
440449 terminate_cluster (node_provider, cluster_handle)
441450 terminated_cluster[cluster_handle] = cluster_deploy_info[cluster_handle]
442451 delete! (cluster_deploy_info, cluster_handle)
443452 end
444- # @info "the cluster $cluster_handle has been terminated"
445453 catch e
446- println (e)
454+ save_exception_details ()
455+ @error " error terminating cluster $cluster_handle ($e )"
447456 return :fail
448457 end
449458 return :success
@@ -480,13 +489,16 @@ function cluster_restart(cluster_handle::Symbol)
480489 cluster_type = cluster_features[:cluster_type ]
481490 try
482491 kill_processes (cluster_handle, cluster_type, cluster_features)
483- finally
484- ips = get_ips (cluster_provider, cluster_handle)
485- pids = launch_processes (cluster_provider, cluster_type, cluster_handle, ips)
486- cluster_deploy_info[cluster_handle][:pids ] = pids
487- end
492+ catch e
493+ error (" error killing processes of cluster $cluster_handle ($e )" )
494+ throw (e)
495+ end
496+ ips = get_ips (cluster_provider, cluster_handle)
497+ pids = launch_processes (cluster_provider, cluster_type, cluster_handle, ips)
498+ cluster_deploy_info[cluster_handle][:pids ] = pids
488499 catch e
489- println (e)
500+ save_exception_details ()
501+ @error " error restarting processes of cluster $cluster_handle ($e )"
490502 return :fail
491503 end
492504
@@ -519,8 +531,8 @@ function cluster_reconnect(cluster_handle::Symbol)
519531 try
520532 pids = launch_processes (cluster_provider, cluster_type, cluster_handle, ips)
521533 catch e
534+ save_exception_details ()
522535 @warn " exception caught when launching processes ($e ) - fix the problem and try '@restart :$cluster_handle '"
523- @error " error launching processes"
524536 end
525537
526538 if ! isnothing (pids)
@@ -533,6 +545,7 @@ function cluster_reconnect(cluster_handle::Symbol)
533545 @error " The cluster $cluster_handle is not active"
534546 end
535547 catch e
548+ save_exception_details ()
536549 println (e)
537550 return :fail
538551 end
@@ -544,6 +557,7 @@ function cluster_reconnect(cluster_handle::Symbol)
544557
545558
546559function report_exception (e)
560+ save_exception_details ()
547561 if e isa CompositeException
548562 @info " reporting composite exception:"
549563 for ee in e. exceptions
@@ -590,12 +604,12 @@ function load_cluster(cluster_handle::String; from = DateTime(0), cluster_type =
590604 result[:timestamp ] = timestamp
591605 result[:features ] = cluster_features
592606 else
593- @warn " $this_cluster_type cluster $cluster_handle is not active "
607+ @warn " $this_cluster_type cluster $cluster_handle is not accessible "
594608 end
595609 end
596610 catch e
597- @error e
598- @error " cluster $cluster_handle not found"
611+ save_exception_details ()
612+ @warn " cluster $cluster_handle not found"
599613 end
600614 return result
601615end
0 commit comments