Commit cdc0c97
fix(scheduler): not all models deployed to Servers when minReplicas on Model is set (#6885)
* fix: not all models deployed to Servers when minReplicas on Model is set
* log error
* tests
* copyright
* PR comment
* remove dead code
* PR comments
1 parent 078d8e9 commit cdc0c97

File tree: 13 files changed, +828 / -32 lines


ansible/README.dev.md

Lines changed: 22 additions & 0 deletions

@@ -245,6 +245,28 @@ stringData:
   client_id: "a-secret-client-id"
 ```
 
+## Install Delve - experimental
+
+Install Delve (the Go debugger) onto the Kind node with:
+
+```shell
+./install-delve.sh
+```
+
+You can then `docker exec` into the Kind node and attach Delve to the process you want to debug. For example, to attach to the scheduler:
+
+```shell
+dlv --listen=:40000 --headless=true --api-version=2 --log attach $(pgrep -f "^/bin/scheduler")
+```
+
+This listens on port `40000`; the Kind cluster setup playbook is configured to expose this port and map it to localhost. It attaches to the PID of the scheduler as determined by `$(pgrep -f "^/bin/scheduler")`.
+
+You then need to configure your IDE to connect to the remote debugger on `localhost:40000`. The debug session does crash on occasion and needs restarting; making it more stable still needs investigation. A potential improvement is to build a custom image from the Kind base image with Delve and any other required packages preinstalled.
+
 ## Mounting local (host) path into the rclone container of a Server pod
 
 For this, you first need to enable local mounts into kind (configuring the `kind_local_mount`,

ansible/install-delve.sh

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+#!/bin/bash
+
+CLUSTER_NAME="seldon"
+GO_VERSION="1.24.7"
+
+echo "Installing Go and Delve in kind cluster: $CLUSTER_NAME"
+
+NODES=$(kind get nodes --name "$CLUSTER_NAME")
+
+for node in $NODES; do
+  echo "Setting up $node..."
+
+  docker exec "$node" bash -c "
+    # Install Go
+    curl -sL https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz -o /tmp/go.tar.gz && \
+    rm -rf /usr/local/go && \
+    tar -C /usr/local -xzf /tmp/go.tar.gz && \
+    rm /tmp/go.tar.gz && \
+
+    # Install Delve
+    /usr/local/go/bin/go install github.com/go-delve/delve/cmd/dlv@latest && \
+
+    # Create symlinks
+    ln -sf /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -sf /root/go/bin/dlv /usr/local/bin/dlv && \
+
+    # Verify
+    echo '=== Versions ===' && go version && dlv version
+  "
+
+  echo "✓ Setup complete on $node"
+done

ansible/playbooks/templates/default-kind-cluster-config.yaml.j2

Lines changed: 4 additions & 0 deletions

@@ -3,6 +3,10 @@ apiVersion: kind.x-k8s.io/v1alpha4
 kind: Cluster
 nodes:
 - role: control-plane
+  extraPortMappings:
+  - containerPort: 40000
+    hostPort: 40000
+    protocol: TCP
 {% if kind_local_mount %}
   extraMounts:
   - hostPath: {{ kind_host_path }}

scheduler/go.mod

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ require (
 	github.com/go-playground/validator/v10 v10.27.0
 	github.com/google/go-cmp v0.7.0
 	github.com/gorilla/mux v1.8.1
+	github.com/gotidy/ptr v1.4.0
 	github.com/grpc-ecosystem/go-grpc-middleware v1.4.0
 	github.com/jarcoal/httpmock v1.4.0
 	github.com/knadh/koanf/parsers/yaml v1.1.0

scheduler/go.sum

Lines changed: 2 additions & 0 deletions

@@ -298,6 +298,8 @@ github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
 github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
 github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo=
 github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA=
+github.com/gotidy/ptr v1.4.0 h1:7++suUs+HNHMnyz6/AW3SE+4EnBhupPSQTSI7QNijVc=
+github.com/gotidy/ptr v1.4.0/go.mod h1:MjRBG6/IETiiZGWI8LrRtISXEji+8b/jigmj2q0mEyM=
github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 h1:UH//fgunKIs4JdUbpDl1VZCDaL56wXCB/5+wF6uHfaI=
 github.com/grpc-ecosystem/go-grpc-middleware v1.4.0/go.mod h1:g5qyo/la0ALbONm6Vbp88Yd8NsDy6rZz+RcrMPxvld8=
 github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+uBhcekkmy4IkffJww=
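The new `github.com/gotidy/ptr` dependency is the kind of helper typically pulled in by tests that need pointers to literal values (for example optional replica counts on a deployment spec). The sketch below only illustrates that pattern with a local, hypothetical helper; it does not assert the package's actual API.

```go
// Minimal sketch of why a pointer-helper package is handy in tests: Go does
// not allow taking the address of a literal directly, so optional (pointer)
// fields need a small constructor. The "of" helper here is hypothetical and
// only illustrates the pattern.
package main

import "fmt"

type deploymentSpec struct {
	Replicas    *int32 // desired number of replicas (optional)
	MinReplicas *int32 // minimum acceptable replicas (optional)
}

// of returns a pointer to its argument (hypothetical helper).
func of[T any](v T) *T { return &v }

func main() {
	spec := deploymentSpec{
		Replicas:    of(int32(3)),
		MinReplicas: of(int32(1)),
	}
	fmt.Printf("replicas=%d min=%d\n", *spec.Replicas, *spec.MinReplicas)
}
```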

scheduler/pkg/agent/server.go

Lines changed: 26 additions & 14 deletions

@@ -406,6 +406,7 @@ func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentSer
     defer mu.(*sync.Mutex).Unlock()
 
     logger.Infof("Received subscribe request from %s:%d", request.ServerName, request.ReplicaIdx)
+    defer logger.Infof("Agent subscribe stream closed for %s:%d", request.ServerName, request.ReplicaIdx)
 
     fin := make(chan bool)
 
@@ -419,29 +420,31 @@ func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentSer
     s.logger.Debugf("Add Server Replica %+v with config %+v", request, request.ReplicaConfig)
     err := s.store.AddServerReplica(request)
     if err != nil {
+        s.logger.WithError(err).WithField("req", request).Error("Failed to add server replica")
         return err
     }
+
     err = s.scheduleModelsFromRequest(request)
     if err != nil {
+        s.logger.WithError(err).WithField("req", request).Error("Failed to schedule models")
         return err
     }
 
     ctx := stream.Context()
     // Keep this scope alive because once this scope exits - the stream is closed
-    for {
-        select {
-        case <-fin:
-            logger.Infof("Closing stream for replica: %s:%d", request.ServerName, request.ReplicaIdx)
-            return nil
-        case <-ctx.Done():
-            logger.Infof("Client replica %s:%d has disconnected", request.ServerName, request.ReplicaIdx)
-            s.mutex.Lock()
-            delete(s.agents, key)
-            s.mutex.Unlock()
-            s.removeServerReplicaImpl(request.GetServerName(), int(request.GetReplicaIdx())) // this is non-blocking beyond rescheduling models on removed server
-            return nil
-        }
+    select {
+    case <-fin:
+        logger.Infof("Closing stream for replica: %s:%d", request.ServerName, request.ReplicaIdx)
+    case <-ctx.Done():
+        logger.WithError(ctx.Err()).Warnf("Client replica %s:%d has disconnected", request.ServerName, request.ReplicaIdx)
+        s.mutex.Lock()
+        delete(s.agents, key)
+        s.mutex.Unlock()
+        logger.WithField("request", request).Info("Removing server replica and re-scheduling model(s)")
+        s.removeServerReplicaImpl(request.GetServerName(), int(request.GetReplicaIdx())) // this is non-blocking beyond rescheduling models on removed server
     }
+
+    return nil
 }
 
 func (s *Server) StopAgentStreams() {
@@ -476,18 +479,27 @@ func (s *Server) removeServerReplicaImpl(serverName string, serverReplicaIdx int
     if err != nil {
         s.logger.WithError(err).Errorf("Failed to remove replica and redeploy models for %s:%d", serverName, serverReplicaIdx)
     }
+
     s.logger.Debugf("Removing models %v from server %s:%d", modelsChanged, serverName, serverReplicaIdx)
     for _, modelName := range modelsChanged {
+        s.logger.WithField("model", modelName).Debug("Scheduling model")
         err = s.scheduler.Schedule(modelName)
         if err != nil {
-            s.logger.Debugf("Failed to reschedule model %s when server %s replica %d disconnected", modelName, serverName, serverReplicaIdx)
+            s.logger.WithError(err).Debugf("Failed to reschedule model %s when server %s replica %d disconnected", modelName, serverName, serverReplicaIdx)
+            continue
        }
+        s.logger.WithField("model", modelName).Debug("Scheduling complete")
     }
+
     // retry failed models
     // this is perhaps counterintuitive, but we want to retry failed models on other servers
     // specifically in the case of model state `LoadFailed` and the server replica disconnects, we want to reconcile
     // the model state with the new set of active servers
     // note that this will also retry `ScheduleFailed`, which is a side effect of calling `ScheduleFailedModels`
+    s.logger.WithFields(log.Fields{
+        "serverName": serverName,
+        "replicaID":  serverReplicaIdx,
+    }).Debug("Scheduling failed models")
     if _, err := s.scheduler.ScheduleFailedModels(); err != nil {
         s.logger.WithError(err).Errorf("Failed to reschedule failed models when server %s replica %d disconnected", serverName, serverReplicaIdx)
     }
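For context on the `Subscribe` change above: both branches of the new `select` fall through to a single `return nil`, so the old surrounding `for` loop was unnecessary. A minimal sketch of that stream-lifetime pattern, using hypothetical names rather than the agent server's real types:

```go
// Minimal sketch of the stream-lifetime pattern: the handler goroutine must
// stay blocked until either the server asks the stream to close (fin) or the
// client disconnects (ctx done), because returning from the handler closes
// the stream. Names here are hypothetical, not the real agent API.
package main

import (
	"context"
	"fmt"
)

func handleStream(ctx context.Context, fin <-chan struct{}, onDisconnect func()) error {
	// A single select is enough: both branches end the stream.
	select {
	case <-fin:
		fmt.Println("stream closed by server shutdown")
	case <-ctx.Done():
		fmt.Println("client disconnected:", ctx.Err())
		onDisconnect() // e.g. drop the replica and reschedule its models
	}
	return nil
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // simulate a client disconnect
	_ = handleStream(ctx, make(chan struct{}), func() { fmt.Println("rescheduling models") })
}
```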

scheduler/pkg/scheduler/scheduler.go

Lines changed: 37 additions & 18 deletions

@@ -29,11 +29,12 @@ import (
 const serverScaleupEventSource = "scheduler.server.scaleup"
 
 type SimpleScheduler struct {
-    muSortAndUpdate sync.Mutex
-    store           store.ModelStore
-    logger          log.FieldLogger
-    synchroniser    synchroniser.Synchroniser
-    eventHub        *coordinator.EventHub
+    muSortAndUpdate  sync.Mutex
+    store            store.ModelStore
+    logger           log.FieldLogger
+    synchroniser     synchroniser.Synchroniser
+    eventHub         *coordinator.EventHub
+    muScheduleFailed sync.Mutex
     SchedulerConfig
 }
 
@@ -80,19 +81,32 @@ func (s *SimpleScheduler) Schedule(modelKey string) error {
 }
 
 func (s *SimpleScheduler) ScheduleFailedModels() ([]string, error) {
-    s.synchroniser.WaitReady()
+    if !s.synchroniser.IsReady() {
+        s.logger.Debug("Waiting for servers to connect")
+        s.synchroniser.WaitReady()
+        s.logger.Debug("Waiting for servers complete")
+    }
+
+    s.muScheduleFailed.Lock()
+    defer s.muScheduleFailed.Unlock()
+
     failedModels, err := s.getFailedModels()
     if err != nil {
         return nil, err
     }
+
+    if len(failedModels) > 0 {
+        s.logger.WithField("failed_models", failedModels).Debug("Got failed models to schedule")
+    }
+
     var updatedModels []string
     for _, modelName := range failedModels {
         _, err := s.scheduleToServer(modelName)
         if err != nil {
-            s.logger.Debugf("Failed to schedule failed model %s", modelName)
-        } else {
-            updatedModels = append(updatedModels, modelName)
+            s.logger.WithError(err).Debugf("Failed to schedule failed model %s", modelName)
+            continue
         }
+        updatedModels = append(updatedModels, modelName)
     }
     return updatedModels, nil
 }
@@ -107,17 +121,20 @@ func (s *SimpleScheduler) getFailedModels() ([]string, error) {
     if err != nil {
         return nil, err
     }
+
     var failedModels []string
     for _, model := range models {
         version := model.GetLatest()
         if version != nil {
             versionState := version.ModelState()
             if versionState.State == store.ModelFailed || versionState.State == store.ScheduleFailed ||
-                (versionState.State == store.ModelAvailable && versionState.AvailableReplicas < version.GetDeploymentSpec().GetReplicas()) {
+                ((versionState.State == store.ModelAvailable || versionState.State == store.ModelProgressing) &&
+                    versionState.AvailableReplicas < version.GetDeploymentSpec().GetReplicas()) {
                 failedModels = append(failedModels, model.Name)
             }
         }
     }
+
     return failedModels, nil
 }
 
@@ -205,10 +222,11 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve
         Debug("Identified candidate servers for model")
 
     // The main logic of trying to find a server for the model is as follows:
-    // 1. If there are enough replicas on a server, schedule the model
-    // 2. If there are not enough replicas on a server, try to schedule with min replicas. In this case we actually should get
+    // 1. If there are enough replicas of a server, schedule the model
+    // 2. If there are not enough replicas of a server, try to schedule with min replicas. In this case we actually should get
     //    the models loaded on all the replicas of the servers (assuming min replicas is less than the number of replicas on the server)
-    //    we also mark the model in this case as failed to schedule so that if the infra changes in the future we can try to reschedule
+    //    we mark the model as failed to schedule only if we failed to schedule on both desired replicas and min replicas,
+    //    so that if the infra changes in the future we can try to re-schedule
 
     // For each server filter and sort replicas and attempt schedule if enough replicas
     ok := s.findAndUpdateToServers(filteredServers, latestModel, desiredReplicas, desiredReplicas)
@@ -246,10 +264,11 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve
     return serverEvent, nil
 }
 
-func (s *SimpleScheduler) findAndUpdateToServers(filteredServers []*store.ServerSnapshot, latestModel *store.ModelVersion, desiredReplicas, minReplicas int) bool {
+func (s *SimpleScheduler) findAndUpdateToServers(filteredServers []*store.ServerSnapshot, latestModel *store.ModelVersion, desiredReplicas, desiredMinReplicas int) bool {
     modelName := latestModel.GetMeta().GetName()
     logger := s.logger.WithField("func", "findAndUpdateToServers").WithField("model", modelName)
     ok := false
+
     for _, candidateServer := range filteredServers {
         logger.WithField("server", candidateServer.Name).Debug("Checking compatibility with candidate server")
         var candidateReplicas *sorters.CandidateServer
@@ -259,21 +278,21 @@ func (s *SimpleScheduler) findAndUpdateToServers(filteredServers []*store.Server
         s.muSortAndUpdate.Lock()
         candidateReplicas = s.filterReplicas(latestModel, candidateServer)
         numServerReplicas := len(candidateReplicas.ChosenReplicas)
-        if numServerReplicas < minReplicas {
+        if numServerReplicas < desiredMinReplicas {
            logger.
                WithField("server", candidateServer.Name).
                WithField("available_replicas", numServerReplicas).
                WithField("desired_replicas", desiredReplicas).
-               WithField("min_replicas", minReplicas).
+               WithField("min_replicas", desiredMinReplicas).
                Debug("Skipping server due to insufficient suitable replicas")
 
            s.muSortAndUpdate.Unlock()
            continue
        }
 
        s.sortReplicas(candidateReplicas)
-       numReplicas := minReplicas
-       if minReplicas != desiredReplicas {
+       numReplicas := desiredMinReplicas
+       if desiredMinReplicas != desiredReplicas {
            numReplicas = min(numServerReplicas, desiredReplicas) // we have more replicas for the server than min, so we can use all of them
        }
        err := s.store.UpdateLoadedModels(
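The desired/min replica fallback that this commit adjusts can be hard to follow from the diff alone. Below is a standalone sketch of the per-server decision, using a hypothetical helper (`pickReplicaCount`) rather than the scheduler's real types; the two-pass desired-then-min behaviour follows the comments in `scheduleToServer` above.

```go
// Standalone illustration of the desired/min replica fallback. Names here are
// hypothetical; the real scheduler works on store.ServerSnapshot and
// store.ModelVersion inside findAndUpdateToServers.
package main

import "fmt"

// pickReplicaCount mirrors the per-server decision: a server with fewer
// suitable replicas than desiredMinReplicas is skipped; otherwise the model is
// placed on min(available, desiredReplicas) replicas, which can be fewer than
// desired when scheduling falls back to min replicas.
func pickReplicaCount(available, desiredReplicas, desiredMinReplicas int) (int, bool) {
	if available < desiredMinReplicas {
		return 0, false // skip this server
	}
	numReplicas := desiredMinReplicas
	if desiredMinReplicas != desiredReplicas {
		numReplicas = min(available, desiredReplicas)
	}
	return numReplicas, true
}

func main() {
	// A model wants 3 replicas with minReplicas 1, and the server has 2
	// suitable replicas. The first pass (min == desired == 3) fails, the
	// second pass (min == 1) places the model on 2 replicas, and the widened
	// getFailedModels condition keeps it eligible for ScheduleFailedModels
	// until it reaches its desired count.
	if _, ok := pickReplicaCount(2, 3, 3); !ok {
		fmt.Println("desired-replica pass failed, falling back to min replicas")
	}
	if n, ok := pickReplicaCount(2, 3, 1); ok {
		fmt.Printf("min-replica pass placed the model on %d replicas\n", n)
	}
}
```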
