@@ -51,6 +51,19 @@ function condor_script(portnum::Integer, np::Integer, params::Dict)
5151 " $tdir /$jobname .sub"
5252end
5353
"""
    _my_wait_without_timeout(f::Function; timeout_seconds, poll_interval_seconds = 5)

Poll `f()` until it returns `true`, giving up once the time budget is used up.

Despite its name, this helper *does* enforce a timeout. `f` is called once right
away and then once after every sleep of `poll_interval_seconds`, for at most
`ceil(timeout_seconds / poll_interval_seconds)` sleeps. Time spent inside `f`
itself is not counted against the budget.

Because `f` is the first positional argument, the helper can be used with
`do`-block syntax.

# Keywords
- `timeout_seconds`: approximate upper bound, in seconds, on the total time slept.
- `poll_interval_seconds`: seconds to sleep between two calls of `f` (default `5`).

# Returns
`nothing`, as soon as a call to `f()` returns `true`.

# Throws
- `ArgumentError` if `poll_interval_seconds` is not positive.
- `ErrorException` if no call to `f()` returned `true` within the budget.
- `TypeError` if `f()` returns something that is not a `Bool` (the result is used
  directly as a boolean condition).
"""
function _my_wait_without_timeout(f::Function; timeout_seconds, poll_interval_seconds = 5)
    if !(poll_interval_seconds > 0)
        throw(ArgumentError(
            "poll_interval_seconds must be positive, got $(poll_interval_seconds)",
        ))
    end

    # Check before the first sleep so an already-satisfied condition costs no delay.
    f() && return nothing

    # Number of sleep/poll rounds that fit into the budget; a partially used last
    # interval still gets a round, so a small positive timeout polls at least twice.
    max_sleeps = ceil(Int, timeout_seconds / poll_interval_seconds)
    for _ in 1:max_sleeps
        sleep(poll_interval_seconds)
        f() && return nothing
    end

    msg = " Timeout ($(timeout_seconds) seconds) exceeded"
    error(msg)
end
66+
5467function launch (manager:: HTCManager , params:: Dict , instances_arr:: Array , c:: Condition )
5568 let
5669 mgr_desc = " HTCondor"
@@ -68,13 +81,19 @@ function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Cond
6881
6982 script = condor_script (portnum, np, params)
7083 cmd = ` condor_submit $script `
71- proc = run (ignorestatus (cmd); wait= false ) # run and wait (blocks)
72- while ! Base. process_exited (proc)
84+ pipeline = Base. pipeline (ignorestatus (cmd); stdout = Base. stdout , stderr = Base. stderr )
85+ proc = run (pipeline; wait = false )
86+ _my_wait_without_timeout (; timeout_seconds = 5 * 60 ) do
7387 run (` condor_q` )
88+ return
89+ end
90+ if ! Base. process_exited (proc)
91+ @error " batch queue not available (could not run condor_submit)" Base. process_exited (proc)
92+ return nothing
7493 end
7594 if ! success (proc)
76- println ( " batch queue not available (could not run condor_submit)" )
77- return
95+ @error " batch queue not available (could not run condor_submit)" Base . process_exited (proc) success (proc )
96+ return nothing
7897 end
7998 print (" Waiting for $np workers: " )
8099
0 commit comments