// Runs a job as a Pod (Kubernetes Job) in a Kubernetes cluster

const k8s = require('@kubernetes/client-node');
const BufferManager = require('./buffer_manager.js').BufferManager;
const RestartCounter = require('./restart_counter.js').RestartCounter;
const submitK8sJob = require('./k8sJobSubmit.js').submitK8sJob;
const fs = require('fs');

// Groups incoming tasks into batches (job agglomeration); flush callback
// is installed below.
const bufferManager = new BufferManager();

// Maximum number of restarts for a failed task.
// BUGFIX: process.env values are always strings -- coerce to a number so
// the limit is numeric (Number(undefined) is NaN, so an unset variable
// still falls back to 0, and "0" stays 0).
const backoffLimit = Number(process.env.HF_VAR_BACKOFF_LIMIT) || 0;
const restartCounter = new RestartCounter(backoffLimit);
// Function k8sCommandGroup
//
// Submits a group of buffered jobs to a Kubernetes cluster and handles
// completion: failed tasks may be re-buffered for restart, successful
// tasks are marked as completed (in Redis, via the task context) and
// their callbacks are invoked.
//
// Inputs:
// - bufferItems - array containing objects with following properties:
//   * ins     - task inputs
//   * outs    - task outputs
//   * context - task context (taskId, name, executor config, appConfig, ...)
//   * cb      - completion callback, invoked as cb(null, outs) on success
async function k8sCommandGroup(bufferItems) {

  // No action needed when buffer is empty
  if (bufferItems.length == 0) {
    return;
  }

  const startTime = Date.now();
  console.log("k8sCommandGroup started, time:", startTime);

  // Function for rebuffering items: re-adds a failed task to the buffer,
  // as long as its restart count has not exceeded the backoff limit.
  const restartFn = (bufferIndex) => {
    const bufferItem = bufferItems[bufferIndex];
    const taskId = bufferItem.context.taskId;
    let partition;
    try {
      partition = bufferItem.context.executor.partition;
    } catch (error) { /* 'executor' may not exist -- leave partition undefined */ }
    if (restartCounter.isRestartPossible(taskId)) {
      const restartVal = restartCounter.increase(taskId);
      console.log("Readding task", taskId, "to buffer (restartCount:", restartVal + ") ...");
      const itemName = bufferItem.context.name;
      bufferManager.addItem(itemName, bufferItem, partition);
    }
    return;
  }

  // Extract particular arrays from buffer items
  const jobArr = [];
  const taskIdArr = [];
  const contextArr = [];
  const cbArr = [];
  for (let i = 0; i < bufferItems.length; i++) {
    const bufferItem = bufferItems[i];
    const context = bufferItem.context;

    // NOTE: this mutates context.executor in place (as the original code
    // did) -- 'job' is the executor object with 'executable', 'args', etc.
    const job = context.executor;
    job.name = context.name;
    job.ins = bufferItem.ins;
    job.outs = bufferItem.outs;

    jobArr.push(job);
    taskIdArr.push(context.taskId);
    contextArr.push(context);
    cbArr.push(bufferItem.cb);
  }

  // All jobs in the group must have a similar context!
  // Here we retrieve the context of the first job in the group.
  // It is used below to read configuration for ALL jobs in the group.
  const context = contextArr[0];

  // let cluster = await getCluster();
  // const token = await getGCPToken();

  // Read custom parameters for job template '${var}' variables. These can be
  // provided in 'workflow.config.jobvars.json' file.
  //
  // In addition, to support two (or more) clusters (for cloud bursting), if
  // 'partition' is defined, check if there is a custom configuration for that
  // partition -- it can be provided in file 'workflow.config.jobvars{$partNum}.json'.
  // This partition-specific config may override parameters of the job, possibly even
  // define a path to a different kubeconfig to be loaded.

  const partition = context.executor.partition; // could be 'undefined'
  //let partitionConfigDir = process.env.HF_VAR_PARTITION_CONFIG_DIR || "/opt/hyperflow/partitions";
  //let partitionConfigFile = partitionConfigDir + "/" + "part." + partition + ".config.json";

  // Custom parameters for the job YAML template (will overwrite default values);
  // partition-specific configuration, if it exists, overrides general configuration.
  // BUGFIX: merge into a FRESH object. The original code copied the partition
  // overrides directly into context.appConfig.jobvars (customParams aliased it),
  // so the overrides leaked into the shared appConfig and thus into all
  // subsequent job groups, even those for other partitions.
  const generalParams = context.appConfig.jobvars || {};
  const partitionParams = (partition ? context.appConfig['jobvars' + partition] : null) || {};
  const customParams = Object.assign({}, generalParams, partitionParams);

  //console.log("CUSTOM params...", customParams);

  // Set kubeconfig path if overridden (could point to a remote cluster)
  delete process.env.KUBECONFIG;
  if (customParams.kubeConfigPath) {
    process.env.KUBECONFIG = customParams.kubeConfigPath;
  }

  const kubeconfig = new k8s.KubeConfig();
  kubeconfig.loadFromDefault(); // loadFromString(JSON.stringify(kconfig))

  let jobExitCodes = [];
  try {
    jobExitCodes = await submitK8sJob(kubeconfig, jobArr, taskIdArr, contextArr, customParams, restartFn);
  } catch (err) {
    console.log("Error when submitting job:", err);
    throw err;
  }

  const endTime = Date.now();
  console.log("Ending k8sCommandGroup function, time:", endTime, "exit codes:", jobExitCodes);

  // Stop the entire workflow if a job fails (controlled by an environment variable)
  for (let i = 0; i < jobExitCodes.length; i++) {
    const jobExitCode = jobExitCodes[i];
    if (jobExitCode != 0 && process.env.HF_VAR_STOP_WORKFLOW_WHEN_JOB_FAILED == "1") {
      const taskId = taskIdArr[i];
      const job = jobArr[i];
      console.log('Error: job', taskId, 'exited with error code', jobExitCode, ', stopping workflow.');
      console.log('Error details: job.name: ' + job.name + ', job.args: ' + job.args.join(' '));
      process.exit(1);
    }
  }

  // if we're here, the job should have successfully completed -- we write this
  // information to Redis (job executor may make use of it).
  const markPromises = [];
  for (let i = 0; i < contextArr.length; i++) {
    // skip failed jobs
    if (jobExitCodes[i] != 0) {
      continue;
    }
    markPromises.push(contextArr[i].markTaskCompleted());
  }
  try {
    await Promise.all(markPromises);
  } catch (err) {
    // BUGFIX: the original discarded the error object entirely; log it so
    // the failure is diagnosable.
    console.error("Marking jobs", taskIdArr, "as completed failed:", err);
  }

  // Invoke callbacks of successfully completed jobs.
  // NOTE(review): callbacks of failed jobs are never invoked here -- the
  // restart path (restartFn) is presumably responsible for them; confirm
  // against submitK8sJob.
  for (let i = 0; i < cbArr.length; i++) {
    // skip failed jobs
    if (jobExitCodes[i] != 0) {
      continue;
    }
    cbArr[i](null, jobArr[i].outs);
  }

  return;
}
| 167 | + |
// Flush handler: each batch of buffered items is submitted as one job group.
bufferManager.setCallback((bufferedItems) => k8sCommandGroup(bufferedItems));
| 169 | + |
// Entry point invoked for each task: configures the buffer manager on
// first use (from 'jobAgglomerations' in the workflow app config) and
// adds the task to the buffer; the actual Kubernetes submission happens
// in k8sCommandGroup when the buffer flushes.
//
// Inputs:
// - ins, outs - task inputs/outputs
// - context   - task context (name, executor config, appConfig, ...)
// - cb        - completion callback, invoked when the job finishes
async function k8sCommand(ins, outs, context, cb) {
  /** Buffer Manager configuration. */
  // BUGFIX: 'buffersConf' was assigned without any declaration, creating
  // an implicit global variable (a ReferenceError in strict mode).
  const buffersConf = context.appConfig.jobAgglomerations;
  const alreadyConfigured = bufferManager.isConfigured();
  if (alreadyConfigured === false && buffersConf != null) {
    bufferManager.configure(buffersConf);
  }

  /** Buffer item. */
  const item = { ins, outs, context, cb };

  let partition;
  try {
    partition = context.executor.partition;
  } catch (error) { /* 'executor' may not exist -- leave partition undefined */ }
  bufferManager.addItem(context.name, item, partition);

  return;
}
| 193 | + |
// Public interface: HyperFlow calls k8sCommand once per workflow task.
module.exports.k8sCommand = k8sCommand;
0 commit comments