Skip to content

Commit 64adfc9

Browse files
authored
Update condor.jl
1 parent 6458d7f commit 64adfc9

File tree

1 file changed

+23
-4
lines changed

1 file changed

+23
-4
lines changed

src/condor.jl

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,19 @@ function condor_script(portnum::Integer, np::Integer, params::Dict)
5151
"$tdir/$jobname.sub"
5252
end
5353

54+
"""
    _my_wait_without_timeout(f; timeout_seconds, poll_interval_seconds = 5)

Poll the zero-argument predicate `f` until it returns `true`.

Despite its name, this helper *does* enforce a timeout: it gives up after
roughly `timeout_seconds` seconds of sleeping.

`f` is called once immediately, then again after every sleep of
`poll_interval_seconds` seconds.

# Keywords
- `timeout_seconds`: upper bound on the total time spent sleeping between polls.
- `poll_interval_seconds`: pause between two consecutive polls (default `5`).

# Returns
`nothing` as soon as `f()` returns `true`.

# Throws
- `ErrorException` when `f` never returned `true` within the timeout.
- `ArgumentError` when `poll_interval_seconds` is not positive.
- `TypeError` when `f` returns something other than a `Bool` (for example
  `nothing`); the predicate must return `true` or `false` explicitly.
"""
function _my_wait_without_timeout(f::Function; timeout_seconds, poll_interval_seconds = 5)
    if !(poll_interval_seconds > 0)
        throw(ArgumentError(
            "poll_interval_seconds must be positive, got $(poll_interval_seconds)",
        ))
    end

    # Check once up front so an already-satisfied condition does not cost a
    # full sleep interval.
    f() && return nothing

    # Number of sleep/poll rounds that fit in the timeout. For the default
    # 5 second interval and whole-second timeouts this matches the length of
    # the range `1:5:timeout_seconds`.
    max_rounds = max(0, ceil(Int, timeout_seconds / poll_interval_seconds))
    for _ in 1:max_rounds
        sleep(poll_interval_seconds)
        f() && return nothing
    end

    msg = "Timeout ($(timeout_seconds) seconds) exceeded"
    error(msg)
end
66+
5467
function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Condition)
5568
let
5669
mgr_desc = "HTCondor"
@@ -68,13 +81,19 @@ function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Cond
6881

6982
script = condor_script(portnum, np, params)
7083
cmd = `condor_submit $script`
71-
proc = run(ignorestatus(cmd); wait=false) # run and wait (blocks)
72-
while !Base.process_exited(proc)
84+
pipeline = Base.pipeline(ignorestatus(cmd); stdout=Base.stdout, stderr=Base.stderr)
85+
proc = run(pipeline; wait = false)
86+
_my_wait_without_timeout(; timeout_seconds = 5 * 60) do
7387
run(`condor_q`)
88+
return
89+
end
90+
if !Base.process_exited(proc)
91+
@error "batch queue not available (could not run condor_submit)" Base.process_exited(proc)
92+
return nothing
7493
end
7594
if !success(proc)
76-
println("batch queue not available (could not run condor_submit)")
77-
return
95+
@error "batch queue not available (could not run condor_submit)" Base.process_exited(proc) success(proc)
96+
return nothing
7897
end
7998
print("Waiting for $np workers: ")
8099

0 commit comments

Comments
 (0)