@@ -29,11 +29,12 @@ import (
2929const serverScaleupEventSource = "scheduler.server.scaleup"
3030
3131type SimpleScheduler struct {
32- muSortAndUpdate sync.Mutex
33- store store.ModelStore
34- logger log.FieldLogger
35- synchroniser synchroniser.Synchroniser
36- eventHub * coordinator.EventHub
32+ muSortAndUpdate sync.Mutex
33+ store store.ModelStore
34+ logger log.FieldLogger
35+ synchroniser synchroniser.Synchroniser
36+ eventHub * coordinator.EventHub
37+ muScheduleFailed sync.Mutex
3738 SchedulerConfig
3839}
3940
@@ -80,19 +81,32 @@ func (s *SimpleScheduler) Schedule(modelKey string) error {
8081}
8182
8283func (s * SimpleScheduler ) ScheduleFailedModels () ([]string , error ) {
83- s .synchroniser .WaitReady ()
84+ if ! s .synchroniser .IsReady () {
85+ s .logger .Debug ("Waiting for servers to connect" )
86+ s .synchroniser .WaitReady ()
87+ s .logger .Debug ("Waiting for servers complete" )
88+ }
89+
90+ s .muScheduleFailed .Lock ()
91+ defer s .muScheduleFailed .Unlock ()
92+
8493 failedModels , err := s .getFailedModels ()
8594 if err != nil {
8695 return nil , err
8796 }
97+
98+ if len (failedModels ) > 0 {
99+ s .logger .WithField ("failed_models" , failedModels ).Debug ("Got failed models to schedule" )
100+ }
101+
88102 var updatedModels []string
89103 for _ , modelName := range failedModels {
90104 _ , err := s .scheduleToServer (modelName )
91105 if err != nil {
92- s .logger .Debugf ("Failed to schedule failed model %s" , modelName )
93- } else {
94- updatedModels = append (updatedModels , modelName )
106+ s .logger .WithError (err ).Debugf ("Failed to schedule failed model %s" , modelName )
107+ continue
95108 }
109+ updatedModels = append (updatedModels , modelName )
96110 }
97111 return updatedModels , nil
98112}
@@ -107,17 +121,20 @@ func (s *SimpleScheduler) getFailedModels() ([]string, error) {
107121 if err != nil {
108122 return nil , err
109123 }
124+
110125 var failedModels []string
111126 for _ , model := range models {
112127 version := model .GetLatest ()
113128 if version != nil {
114129 versionState := version .ModelState ()
115130 if versionState .State == store .ModelFailed || versionState .State == store .ScheduleFailed ||
116- (versionState .State == store .ModelAvailable && versionState .AvailableReplicas < version .GetDeploymentSpec ().GetReplicas ()) {
131+ ((versionState .State == store .ModelAvailable || versionState .State == store .ModelProgressing ) &&
132+ versionState .AvailableReplicas < version .GetDeploymentSpec ().GetReplicas ()) {
117133 failedModels = append (failedModels , model .Name )
118134 }
119135 }
120136 }
137+
121138 return failedModels , nil
122139}
123140
@@ -205,10 +222,11 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve
205222 Debug ("Identified candidate servers for model" )
206223
207224 // The main logic of trying to find a server for the model is as follows:
208- // 1. If there are enough replicas on a server, schedule the model
209- // 2. If there are not enough replicas on a server, try to schedule with min replicas. In this case we actually should get
225+ // 1. If there are enough replicas of a server, schedule the model
226+ // 2. If there are not enough replicas of a server, try to schedule with min replicas. In this case we actually should get
210227 // the models loaded on all the replicas of the servers (assuming min replicas is less than the number of replicas on the server)
211- // we also mark the model in this case as failed to schedule so that if the infra changes in the future we can try to reschedule
228+ // we mark the model as failed to schedule only if we failed to schedule on both desired replicas and min replicas,
229+ // so that if the infra changes in the future we can try to re-schedule
212230
213231 // For each server filter and sort replicas and attempt schedule if enough replicas
214232 ok := s .findAndUpdateToServers (filteredServers , latestModel , desiredReplicas , desiredReplicas )
@@ -246,10 +264,11 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve
246264 return serverEvent , nil
247265}
248266
249- func (s * SimpleScheduler ) findAndUpdateToServers (filteredServers []* store.ServerSnapshot , latestModel * store.ModelVersion , desiredReplicas , minReplicas int ) bool {
267+ func (s * SimpleScheduler ) findAndUpdateToServers (filteredServers []* store.ServerSnapshot , latestModel * store.ModelVersion , desiredReplicas , desiredMinReplicas int ) bool {
250268 modelName := latestModel .GetMeta ().GetName ()
251269 logger := s .logger .WithField ("func" , "findAndUpdateToServers" ).WithField ("model" , modelName )
252270 ok := false
271+
253272 for _ , candidateServer := range filteredServers {
254273 logger .WithField ("server" , candidateServer .Name ).Debug ("Checking compatibility with candidate server" )
255274 var candidateReplicas * sorters.CandidateServer
@@ -259,21 +278,21 @@ func (s *SimpleScheduler) findAndUpdateToServers(filteredServers []*store.Server
259278 s .muSortAndUpdate .Lock ()
260279 candidateReplicas = s .filterReplicas (latestModel , candidateServer )
261280 numServerReplicas := len (candidateReplicas .ChosenReplicas )
262- if numServerReplicas < minReplicas {
281+ if numServerReplicas < desiredMinReplicas {
263282 logger .
264283 WithField ("server" , candidateServer .Name ).
265284 WithField ("available_replicas" , numServerReplicas ).
266285 WithField ("desired_replicas" , desiredReplicas ).
267- WithField ("min_replicas" , minReplicas ).
286+ WithField ("min_replicas" , desiredMinReplicas ).
268287 Debug ("Skipping server due to insufficient suitable replicas" )
269288
270289 s .muSortAndUpdate .Unlock ()
271290 continue
272291 }
273292
274293 s .sortReplicas (candidateReplicas )
275- numReplicas := minReplicas
276- if minReplicas != desiredReplicas {
294+ numReplicas := desiredMinReplicas
295+ if desiredMinReplicas != desiredReplicas {
277296 numReplicas = min (numServerReplicas , desiredReplicas ) // we have more replicas for the server than min, so we can use all of them
278297 }
279298 err := s .store .UpdateLoadedModels (
0 commit comments