diff --git a/README.md b/README.md index 97317a30..a7c38303 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,8 @@ uvx kubernetes-mcp-server@latest --help |---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `--port` | Starts the MCP server in Streamable HTTP mode (path /mcp) and Server-Sent Event (SSE) (path /sse) mode and listens on the specified port . | | `--log-level` | Sets the logging level (values [from 0-9](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md)). Similar to [kubectl logging levels](https://kubernetes.io/docs/reference/kubectl/quick-reference/#kubectl-output-verbosity-and-debugging). | +| `--config` | (Optional) Path to the main TOML configuration file. See [Configuration Files](#configuration-files) section below for details. | +| `--config-dir` | (Optional) Path to drop-in configuration directory. Files are loaded in lexical (alphabetical) order. See [Drop-in Configuration](#drop-in-configuration) section below for details. | | `--kubeconfig` | Path to the Kubernetes configuration file. If not provided, it will try to resolve the configuration (in-cluster, default location, etc.). | | `--list-output` | Output format for resource list operations (one of: yaml, table) (default "table") | | `--read-only` | If set, the MCP server will run in read-only mode, meaning it will not allow any write operations (create, update, delete) on the Kubernetes cluster. This is useful for debugging or inspecting the cluster without making changes. | @@ -196,6 +198,88 @@ uvx kubernetes-mcp-server@latest --help | `--toolsets` | Comma-separated list of toolsets to enable. Check the [🛠️ Tools and Functionalities](#tools-and-functionalities) section for more information. | | `--disable-multi-cluster` | If set, the MCP server will disable multi-cluster support and will only use the current context from the kubeconfig file. This is useful if you want to restrict the MCP server to a single cluster. | +### Drop-in Configuration + +The Kubernetes MCP server supports flexible configuration through both a main config file and drop-in files. **Both are optional** - you can use either, both, or neither (server will use built-in defaults). + +#### Configuration Loading Order + +Configuration values are loaded and merged in the following order (later sources override earlier ones): + +1. **Internal Defaults** - Always loaded (hardcoded default values) +2. **Main Configuration File** - Optional, loaded via `--config` flag +3. **Drop-in Files** - Optional, loaded from `--config-dir` in **lexical (alphabetical) order** + +#### How Drop-in Files Work + +- **File Naming**: Use numeric prefixes to control loading order (e.g., `00-base.toml`, `10-cluster.toml`, `99-override.toml`) +- **File Extension**: Only `.toml` files are processed; dotfiles (starting with `.`) are ignored +- **Partial Configuration**: Drop-in files can contain only a subset of configuration options +- **Merge Behavior**: Values present in a drop-in file override previous values; missing values are preserved + +#### Dynamic Configuration Reload + +To reload configuration after modifying config files, send a `SIGHUP` signal to the running server process: + +```shell +# Find the process ID +ps aux | grep kubernetes-mcp-server + +# Send SIGHUP to reload configuration +kill -HUP + +# Or use pkill +pkill -HUP kubernetes-mcp-server +``` + +The server will: +- Reload the main config file and all drop-in files +- Update toolsets and enabled tools +- Reconnect to clusters if needed +- Log the reload status + +**Note**: SIGHUP reload is not available on Windows. On Windows, restart the server to reload configuration. + +#### Example: Using Both Config Methods + +**Command:** +```shell +kubernetes-mcp-server --config /etc/kubernetes-mcp-server/config.toml \ + --config-dir /etc/kubernetes-mcp-server/config.d/ +``` + +**Directory structure:** +``` +/etc/kubernetes-mcp-server/ +├── config.toml # Main configuration +└── config.d/ + ├── 00-base.toml # Base overrides + ├── 10-toolsets.toml # Toolset-specific config + └── 99-local.toml # Local overrides +``` + +**Example drop-in file** (`10-toolsets.toml`): +```toml +# Override only the toolsets - all other config preserved +toolsets = ["core", "config", "helm", "logs"] +``` + +**Example drop-in file** (`99-local.toml`): +```toml +# Local development overrides +log_level = 9 +read_only = true +``` + +**To apply changes:** +```shell +# Edit config files +vim /etc/kubernetes-mcp-server/config.d/99-local.toml + +# Reload without restarting +pkill -HUP kubernetes-mcp-server +``` + ## 🛠️ Tools and Functionalities The Kubernetes MCP server supports enabling or disabling specific groups of tools and functionalities (tools, resources, prompts, and so on) via the `--toolsets` command-line flag or `toolsets` configuration option. diff --git a/pkg/config/config.go b/pkg/config/config.go index 20695768..b0f65dc7 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -6,8 +6,11 @@ import ( "fmt" "os" "path/filepath" + "sort" + "strings" "github.com/BurntSushi/toml" + "k8s.io/klog/v2" ) const ( @@ -95,26 +98,147 @@ func withDirPath(path string) ReadConfigOpt { } } -// Read reads the toml file and returns the StaticConfig, with any opts applied. -func Read(configPath string, opts ...ReadConfigOpt) (*StaticConfig, error) { - configData, err := os.ReadFile(configPath) +// Read reads the toml file, applies drop-in configs from configDir (if provided), +// and returns the StaticConfig with any opts applied. +// Loading order: defaults → main config file → drop-in files (lexically sorted) +func Read(configPath string, configDir string, opts ...ReadConfigOpt) (*StaticConfig, error) { + // Start with defaults + cfg := Default() + + // Get the absolute dir path for the main config file + var dirPath string + if configPath != "" { + absPath, err := filepath.Abs(configPath) + if err != nil { + return nil, fmt.Errorf("failed to resolve absolute path to config file: %w", err) + } + dirPath = filepath.Dir(absPath) + + // Load main config file + klog.V(2).Infof("Loading main config from: %s", configPath) + if err := mergeConfigFile(cfg, configPath, append(opts, withDirPath(dirPath))...); err != nil { + return nil, fmt.Errorf("failed to load main config file %s: %w", configPath, err) + } + } + + // Load drop-in config files if directory is specified + if configDir != "" { + if err := loadDropInConfigs(cfg, configDir, append(opts, withDirPath(dirPath))...); err != nil { + return nil, fmt.Errorf("failed to load drop-in configs from %s: %w", configDir, err) + } + } + + return cfg, nil +} + +// mergeConfigFile reads a config file and merges its values into the target config. +// Values present in the file will overwrite existing values in cfg. +// Values not present in the file will remain unchanged in cfg. +func mergeConfigFile(cfg *StaticConfig, filePath string, opts ...ReadConfigOpt) error { + configData, err := os.ReadFile(filePath) if err != nil { - return nil, err + return err } - // get and save the absolute dir path to the config file, so that other config parsers can use it - absPath, err := filepath.Abs(configPath) + md, err := toml.NewDecoder(bytes.NewReader(configData)).Decode(cfg) if err != nil { - return nil, fmt.Errorf("failed to resolve absolute path to config file: %w", err) + return fmt.Errorf("failed to decode TOML: %w", err) + } + + for _, opt := range opts { + opt(cfg) + } + + if err := cfg.parseClusterProviderConfigs(md); err != nil { + return err + } + + if err := cfg.parseToolsetConfigs(md); err != nil { + return err } - dirPath := filepath.Dir(absPath) - cfg, err := ReadToml(configData, append(opts, withDirPath(dirPath))...) + return nil +} + +// loadDropInConfigs loads and merges config files from a drop-in directory. +// Files are processed in lexical (alphabetical) order. +// Only files with .toml extension are processed; dotfiles are ignored. +func loadDropInConfigs(cfg *StaticConfig, dropInDir string, opts ...ReadConfigOpt) error { + // Check if directory exists + info, err := os.Stat(dropInDir) if err != nil { - return nil, err + if os.IsNotExist(err) { + klog.V(2).Infof("Drop-in config directory does not exist, skipping: %s", dropInDir) + return nil + } + return fmt.Errorf("failed to stat drop-in directory: %w", err) } - return cfg, nil + if !info.IsDir() { + return fmt.Errorf("drop-in config path is not a directory: %s", dropInDir) + } + + // Get all .toml files in the directory + files, err := getSortedConfigFiles(dropInDir) + if err != nil { + return err + } + + if len(files) == 0 { + klog.V(2).Infof("No drop-in config files found in: %s", dropInDir) + return nil + } + + klog.V(2).Infof("Loading %d drop-in config file(s) from: %s", len(files), dropInDir) + + // Merge each file in order + for _, file := range files { + klog.V(3).Infof(" - Merging drop-in config: %s", filepath.Base(file)) + if err := mergeConfigFile(cfg, file, opts...); err != nil { + return fmt.Errorf("failed to merge drop-in config %s: %w", file, err) + } + } + + return nil +} + +// getSortedConfigFiles returns a sorted list of .toml files in the specified directory. +// Dotfiles (starting with '.') and non-.toml files are ignored. +// Files are sorted lexically (alphabetically) by filename. +func getSortedConfigFiles(dir string) ([]string, error) { + entries, err := os.ReadDir(dir) + if err != nil { + return nil, fmt.Errorf("failed to read directory: %w", err) + } + + var files []string + for _, entry := range entries { + // Skip directories + if entry.IsDir() { + continue + } + + name := entry.Name() + + // Skip dotfiles + if strings.HasPrefix(name, ".") { + klog.V(4).Infof("Skipping dotfile: %s", name) + continue + } + + // Only process .toml files + if !strings.HasSuffix(name, ".toml") { + klog.V(4).Infof("Skipping non-.toml file: %s", name) + continue + } + + files = append(files, filepath.Join(dir, name)) + } + + // Sort lexically + sort.Strings(files) + + return files, nil } // ReadToml reads the toml data and returns the StaticConfig, with any opts applied diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index afdde191..4e7bafd2 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -31,7 +31,7 @@ type ConfigSuite struct { } func (s *ConfigSuite) TestReadConfigMissingFile() { - config, err := Read("non-existent-config.toml") + config, err := Read("non-existent-config.toml", "") s.Run("returns error for missing file", func() { s.Require().NotNil(err, "Expected error for missing file, got nil") s.True(errors.Is(err, fs.ErrNotExist), "Expected ErrNotExist, got %v", err) @@ -53,13 +53,13 @@ func (s *ConfigSuite) TestReadConfigInvalid() { kind = "Role `) - config, err := Read(invalidConfigPath) + config, err := Read(invalidConfigPath, "") s.Run("returns error for invalid file", func() { s.Require().NotNil(err, "Expected error for invalid file, got nil") }) s.Run("error message contains toml error with line number", func() { expectedError := "toml: line 9" - s.Truef(strings.HasPrefix(err.Error(), expectedError), "Expected error message to contain line number, got %v", err) + s.Truef(strings.Contains(err.Error(), expectedError), "Expected error message to contain line number, got %v", err) }) s.Run("returns nil config for invalid file", func() { s.Nil(config, "Expected nil config for missing file") @@ -88,7 +88,7 @@ func (s *ConfigSuite) TestReadConfigValid() { `) - config, err := Read(validConfigPath) + config, err := Read(validConfigPath, "") s.Require().NotNil(config) s.Run("reads and unmarshalls file", func() { s.Nil(err, "Expected nil error for valid file") @@ -151,7 +151,7 @@ func (s *ConfigSuite) TestReadConfigValidPreservesDefaultsForMissingFields() { port = "1337" `) - config, err := Read(validConfigPath) + config, err := Read(validConfigPath, "") s.Require().NotNil(config) s.Run("reads and unmarshalls file", func() { s.Nil(err, "Expected nil error for valid file") @@ -174,46 +174,209 @@ func (s *ConfigSuite) TestReadConfigValidPreservesDefaultsForMissingFields() { }) } -func (s *ConfigSuite) TestMergeConfig() { - base := StaticConfig{ - ListOutput: "table", - Toolsets: []string{"core", "config", "helm"}, - Port: "8080", +func (s *ConfigSuite) TestGetSortedConfigFiles() { + tempDir := s.T().TempDir() + + // Create test files + files := []string{ + "10-first.toml", + "20-second.toml", + "05-before.toml", + "99-last.toml", + ".hidden.toml", // should be ignored + "readme.txt", // should be ignored + "invalid", // should be ignored } - s.Run("merges override values on top of base", func() { - override := StaticConfig{ - ListOutput: "json", - Port: "9090", + + for _, file := range files { + path := filepath.Join(tempDir, file) + err := os.WriteFile(path, []byte(""), 0644) + s.Require().NoError(err) + } + + // Create a subdirectory (should be ignored) + subDir := filepath.Join(tempDir, "subdir") + err := os.Mkdir(subDir, 0755) + s.Require().NoError(err) + + sorted, err := getSortedConfigFiles(tempDir) + s.Require().NoError(err) + + s.Run("returns only .toml files", func() { + s.Len(sorted, 4, "Expected 4 .toml files") + }) + + s.Run("sorted in lexical order", func() { + expected := []string{ + filepath.Join(tempDir, "05-before.toml"), + filepath.Join(tempDir, "10-first.toml"), + filepath.Join(tempDir, "20-second.toml"), + filepath.Join(tempDir, "99-last.toml"), } + s.Equal(expected, sorted) + }) + + s.Run("excludes dotfiles", func() { + for _, file := range sorted { + s.NotContains(file, ".hidden") + } + }) + + s.Run("excludes non-.toml files", func() { + for _, file := range sorted { + s.Contains(file, ".toml") + } + }) +} - result := mergeConfig(base, override) +func (s *ConfigSuite) TestDropInConfigPrecedence() { + tempDir := s.T().TempDir() + + // Main config file + mainConfigPath := s.writeConfig(` + log_level = 1 + port = "8080" + list_output = "table" + toolsets = ["core", "config"] + `) + + // Create drop-in directory + dropInDir := filepath.Join(tempDir, "config.d") + err := os.Mkdir(dropInDir, 0755) + s.Require().NoError(err) - s.Equal("json", result.ListOutput, "ListOutput should be overridden") - s.Equal("9090", result.Port, "Port should be overridden") + // First drop-in file + dropIn1 := filepath.Join(dropInDir, "10-override.toml") + err = os.WriteFile(dropIn1, []byte(` + log_level = 5 + port = "9090" + `), 0644) + s.Require().NoError(err) + + // Second drop-in file (should override first) + dropIn2 := filepath.Join(dropInDir, "20-final.toml") + err = os.WriteFile(dropIn2, []byte(` + port = "7777" + list_output = "yaml" + `), 0644) + s.Require().NoError(err) + + config, err := Read(mainConfigPath, dropInDir) + s.Require().NoError(err) + s.Require().NotNil(config) + + s.Run("drop-in overrides main config", func() { + s.Equal(5, config.LogLevel, "log_level from 10-override.toml should override main") }) - s.Run("preserves base values when override is empty", func() { - override := StaticConfig{} + s.Run("later drop-in overrides earlier drop-in", func() { + s.Equal("7777", config.Port, "port from 20-final.toml should override 10-override.toml") + }) - result := mergeConfig(base, override) + s.Run("preserves values not in drop-in files", func() { + s.Equal([]string{"core", "config"}, config.Toolsets, "toolsets from main config should be preserved") + }) - s.Equal("table", result.ListOutput, "ListOutput should be preserved from base") - s.Equal([]string{"core", "config", "helm"}, result.Toolsets, "Toolsets should be preserved from base") - s.Equal("8080", result.Port, "Port should be preserved from base") + s.Run("applies all drop-in changes", func() { + s.Equal("yaml", config.ListOutput, "list_output from 20-final.toml should be applied") }) +} - s.Run("handles partial overrides", func() { - override := StaticConfig{ - Toolsets: []string{"custom"}, - ReadOnly: true, - } +func (s *ConfigSuite) TestDropInConfigMissingDirectory() { + mainConfigPath := s.writeConfig(` + log_level = 3 + port = "8080" + `) + + config, err := Read(mainConfigPath, "/non/existent/directory") + s.Require().NoError(err, "Should not error for missing drop-in directory") + s.Require().NotNil(config) - result := mergeConfig(base, override) + s.Run("loads main config successfully", func() { + s.Equal(3, config.LogLevel) + s.Equal("8080", config.Port) + }) +} + +func (s *ConfigSuite) TestDropInConfigEmptyDirectory() { + mainConfigPath := s.writeConfig(` + log_level = 2 + `) + + dropInDir := s.T().TempDir() + + config, err := Read(mainConfigPath, dropInDir) + s.Require().NoError(err) + s.Require().NotNil(config) + + s.Run("loads main config successfully", func() { + s.Equal(2, config.LogLevel) + }) +} + +func (s *ConfigSuite) TestDropInConfigPartialOverride() { + tempDir := s.T().TempDir() + + mainConfigPath := s.writeConfig(` + log_level = 1 + port = "8080" + list_output = "table" + read_only = false + toolsets = ["core", "config", "helm"] + `) + + dropInDir := filepath.Join(tempDir, "config.d") + err := os.Mkdir(dropInDir, 0755) + s.Require().NoError(err) + + // Drop-in file with partial config + dropIn := filepath.Join(dropInDir, "10-partial.toml") + err = os.WriteFile(dropIn, []byte(` + read_only = true + `), 0644) + s.Require().NoError(err) + + config, err := Read(mainConfigPath, dropInDir) + s.Require().NoError(err) + s.Require().NotNil(config) + + s.Run("overrides specified field", func() { + s.True(config.ReadOnly, "read_only should be overridden to true") + }) + + s.Run("preserves all other fields", func() { + s.Equal(1, config.LogLevel) + s.Equal("8080", config.Port) + s.Equal("table", config.ListOutput) + s.Equal([]string{"core", "config", "helm"}, config.Toolsets) + }) +} + +func (s *ConfigSuite) TestDropInConfigWithArrays() { + tempDir := s.T().TempDir() + + mainConfigPath := s.writeConfig(` + toolsets = ["core", "config"] + enabled_tools = ["tool1", "tool2"] + `) + + dropInDir := filepath.Join(tempDir, "config.d") + err := os.Mkdir(dropInDir, 0755) + s.Require().NoError(err) + + dropIn := filepath.Join(dropInDir, "10-arrays.toml") + err = os.WriteFile(dropIn, []byte(` + toolsets = ["helm", "logs"] + `), 0644) + s.Require().NoError(err) + + config, err := Read(mainConfigPath, dropInDir) + s.Require().NoError(err) + s.Require().NotNil(config) - s.Equal("table", result.ListOutput, "ListOutput should be preserved from base") - s.Equal([]string{"custom"}, result.Toolsets, "Toolsets should be overridden") - s.Equal("8080", result.Port, "Port should be preserved from base since override doesn't specify it") - s.True(result.ReadOnly, "ReadOnly should be overridden to true") + s.Run("replaces arrays completely", func() { + s.Equal([]string{"helm", "logs"}, config.Toolsets, "toolsets should be completely replaced") + s.Equal([]string{"tool1", "tool2"}, config.EnabledTools, "enabled_tools should be preserved") }) } diff --git a/pkg/config/provider_config_test.go b/pkg/config/provider_config_test.go index 2afbd2d7..848041c2 100644 --- a/pkg/config/provider_config_test.go +++ b/pkg/config/provider_config_test.go @@ -66,7 +66,7 @@ func (s *ProviderConfigSuite) TestReadConfigValid() { int_prop = 42 `) - config, err := Read(validConfigPath) + config, err := Read(validConfigPath, "") s.Run("returns no error for valid file with registered provider config", func() { s.Require().NoError(err, "Expected no error for valid file, got %v", err) }) @@ -95,7 +95,7 @@ func (s *ProviderConfigSuite) TestReadConfigInvalidProviderConfig() { int_prop = 42 `) - config, err := Read(invalidConfigPath) + config, err := Read(invalidConfigPath, "") s.Run("returns error for invalid provider config", func() { s.Require().NotNil(err, "Expected error for invalid provider config, got nil") s.ErrorContains(err, "validation error forced by test", "Expected validation error from provider config") @@ -114,7 +114,7 @@ func (s *ProviderConfigSuite) TestReadConfigUnregisteredProviderConfig() { int_prop = 42 `) - config, err := Read(invalidConfigPath) + config, err := Read(invalidConfigPath, "") s.Run("returns no error for unregistered provider config", func() { s.Require().NoError(err, "Expected no error for unregistered provider config, got %v", err) }) @@ -139,7 +139,7 @@ func (s *ProviderConfigSuite) TestReadConfigParserError() { int_prop = 42 `) - config, err := Read(invalidConfigPath) + config, err := Read(invalidConfigPath, "") s.Run("returns error for provider config parser error", func() { s.Require().NotNil(err, "Expected error for provider config parser error, got nil") s.ErrorContains(err, "parser error forced by test", "Expected parser error from provider config") @@ -170,7 +170,7 @@ func (s *ProviderConfigSuite) TestConfigDirPathInContext() { absConfigPath, err := filepath.Abs(configPath) s.Require().NoError(err, "test error: getting the absConfigPath should not fail") - _, err = Read(configPath) + _, err = Read(configPath, "") s.Run("provides config directory path in context to parser", func() { s.Require().NoError(err, "Expected no error reading config") s.NotEmpty(capturedDirPath, "Expected non-empty directory path in context") diff --git a/pkg/kubernetes-mcp-server/cmd/root.go b/pkg/kubernetes-mcp-server/cmd/root.go index 0a1c6029..d1fc5259 100644 --- a/pkg/kubernetes-mcp-server/cmd/root.go +++ b/pkg/kubernetes-mcp-server/cmd/root.go @@ -57,6 +57,7 @@ const ( flagVersion = "version" flagLogLevel = "log-level" flagConfig = "config" + flagConfigDir = "config-dir" flagPort = "port" flagSSEBaseUrl = "sse-base-url" flagKubeconfig = "kubeconfig" @@ -92,6 +93,7 @@ type MCPServerOptions struct { DisableMultiCluster bool ConfigPath string + ConfigDir string StaticConfig *config.StaticConfig genericiooptions.IOStreams @@ -129,6 +131,7 @@ func NewMCPServer(streams genericiooptions.IOStreams) *cobra.Command { cmd.Flags().BoolVar(&o.Version, flagVersion, o.Version, "Print version information and quit") cmd.Flags().IntVar(&o.LogLevel, flagLogLevel, o.LogLevel, "Set the log level (from 0 to 9)") cmd.Flags().StringVar(&o.ConfigPath, flagConfig, o.ConfigPath, "Path of the config file.") + cmd.Flags().StringVar(&o.ConfigDir, flagConfigDir, o.ConfigDir, "Path to drop-in configuration directory (files loaded in lexical order).") cmd.Flags().StringVar(&o.Port, flagPort, o.Port, "Start a streamable HTTP and SSE HTTP server on the specified port (e.g. 8080)") cmd.Flags().StringVar(&o.SSEBaseUrl, flagSSEBaseUrl, o.SSEBaseUrl, "SSE public base URL to use when sending the endpoint message (e.g. https://example.com)") cmd.Flags().StringVar(&o.Kubeconfig, flagKubeconfig, o.Kubeconfig, "Path to the kubeconfig file to use for authentication") @@ -155,7 +158,7 @@ func NewMCPServer(streams genericiooptions.IOStreams) *cobra.Command { func (m *MCPServerOptions) Complete(cmd *cobra.Command) error { if m.ConfigPath != "" { - cnf, err := config.Read(m.ConfigPath) + cnf, err := config.Read(m.ConfigPath, m.ConfigDir) if err != nil { return err } @@ -319,7 +322,11 @@ func (m *MCPServerOptions) Run() error { oidcProvider = provider } - mcpServer, err := mcp.NewServer(mcp.Configuration{StaticConfig: m.StaticConfig}) + mcpServer, err := mcp.NewServer(mcp.Configuration{ + StaticConfig: m.StaticConfig, + ConfigPath: m.ConfigPath, + ConfigDir: m.ConfigDir, + }) if err != nil { return fmt.Errorf("failed to initialize MCP server: %w", err) } diff --git a/pkg/kubernetes-mcp-server/cmd/root_test.go b/pkg/kubernetes-mcp-server/cmd/root_test.go index a464daab..5e21a2f2 100644 --- a/pkg/kubernetes-mcp-server/cmd/root_test.go +++ b/pkg/kubernetes-mcp-server/cmd/root_test.go @@ -76,7 +76,7 @@ func TestConfig(t *testing.T) { if err == nil { t.Fatal("Expected error for invalid config path, got nil") } - expected := "open invalid-path-to-config.toml: " + expected := "failed to load main config file invalid-path-to-config.toml:" if !strings.HasPrefix(err.Error(), expected) { t.Fatalf("Expected error to be %s, got %s", expected, err.Error()) } diff --git a/pkg/kubernetes/manager.go b/pkg/kubernetes/manager.go index 32bd278e..df990bf2 100644 --- a/pkg/kubernetes/manager.go +++ b/pkg/kubernetes/manager.go @@ -4,7 +4,10 @@ import ( "context" "errors" "fmt" + "sort" "strings" + "sync" + "time" "github.com/containers/kubernetes-mcp-server/pkg/config" "github.com/containers/kubernetes-mcp-server/pkg/helm" @@ -25,11 +28,39 @@ type Manager struct { staticConfig *config.StaticConfig CloseWatchKubeConfig CloseWatchKubeConfig + + clusterWatcher *clusterStateWatcher +} + +// clusterState represents the cached state of the cluster +type clusterState struct { + apiGroups []string + isOpenShift bool +} + +// clusterStateWatcher monitors cluster state changes and triggers debounced reloads +type clusterStateWatcher struct { + manager *Manager + pollInterval time.Duration + debounceWindow time.Duration + lastKnownState clusterState + reloadCallback func() error + debounceTimer *time.Timer + mu sync.Mutex + stopCh chan struct{} + stoppedCh chan struct{} } var _ helm.Kubernetes = (*Manager)(nil) var _ Openshift = (*Manager)(nil) +const ( + // DefaultClusterStatePollInterval is the default interval for polling cluster state changes + DefaultClusterStatePollInterval = 30 * time.Second + // DefaultClusterStateDebounceWindow is the default debounce window for cluster state changes + DefaultClusterStateDebounceWindow = 5 * time.Second +) + var ( ErrorKubeconfigInClusterNotAllowed = errors.New("kubeconfig manager cannot be used in in-cluster deployments") ErrorInClusterNotInCluster = errors.New("in-cluster manager cannot be used outside of a cluster") @@ -148,6 +179,9 @@ func (m *Manager) Close() { if m.CloseWatchKubeConfig != nil { _ = m.CloseWatchKubeConfig() } + if m.clusterWatcher != nil { + m.clusterWatcher.stop() + } } func (m *Manager) configuredNamespace() string { @@ -263,3 +297,117 @@ func (m *Manager) Derived(ctx context.Context) (*Kubernetes, error) { } return derived, nil } + +// WatchClusterState starts a background watcher that periodically polls for cluster state changes +// and triggers a debounced reload when changes are detected. +func (m *Manager) WatchClusterState(pollInterval, debounceWindow time.Duration, onClusterStateChange func() error) { + if m.clusterWatcher != nil { + m.clusterWatcher.stop() + } + + watcher := &clusterStateWatcher{ + manager: m, + pollInterval: pollInterval, + debounceWindow: debounceWindow, + reloadCallback: onClusterStateChange, + stopCh: make(chan struct{}), + stoppedCh: make(chan struct{}), + } + + captureState := func() clusterState { + state := clusterState{apiGroups: []string{}} + if groups, err := m.discoveryClient.ServerGroups(); err == nil { + for _, group := range groups.Groups { + state.apiGroups = append(state.apiGroups, group.Name) + } + sort.Strings(state.apiGroups) + } + state.isOpenShift = m.IsOpenShift(context.Background()) + return state + } + watcher.lastKnownState = captureState() + + m.clusterWatcher = watcher + + // Start background monitoring + go func() { + defer close(watcher.stoppedCh) + ticker := time.NewTicker(pollInterval) + defer ticker.Stop() + + klog.V(2).Infof("Started cluster state watcher (poll interval: %v, debounce: %v)", pollInterval, debounceWindow) + + for { + select { + case <-watcher.stopCh: + klog.V(2).Info("Stopping cluster state watcher") + return + case <-ticker.C: + // Invalidate discovery cache to get fresh API groups + m.discoveryClient.Invalidate() + + watcher.mu.Lock() + current := captureState() + klog.V(3).Infof("Polled cluster state: %d API groups, OpenShift=%v", len(current.apiGroups), current.isOpenShift) + + changed := current.isOpenShift != watcher.lastKnownState.isOpenShift || + len(current.apiGroups) != len(watcher.lastKnownState.apiGroups) + + if !changed { + for i := range current.apiGroups { + if current.apiGroups[i] != watcher.lastKnownState.apiGroups[i] { + changed = true + break + } + } + } + + if changed { + klog.V(2).Info("Cluster state changed, scheduling debounced reload") + if watcher.debounceTimer != nil { + watcher.debounceTimer.Stop() + } + watcher.debounceTimer = time.AfterFunc(debounceWindow, func() { + klog.V(2).Info("Debounce window expired, triggering reload") + if err := onClusterStateChange(); err != nil { + klog.Errorf("Failed to reload: %v", err) + } else { + watcher.mu.Lock() + watcher.lastKnownState = captureState() + watcher.mu.Unlock() + klog.V(2).Info("Reload completed") + } + }) + } + watcher.mu.Unlock() + } + } + }() +} + +// stop stops the cluster state watcher +func (w *clusterStateWatcher) stop() { + if w == nil { + return + } + + w.mu.Lock() + defer w.mu.Unlock() + + if w.debounceTimer != nil { + w.debounceTimer.Stop() + } + + if w.stopCh == nil || w.stoppedCh == nil { + return + } + + select { + case <-w.stopCh: + // Already closed or stopped + return + default: + close(w.stopCh) + <-w.stoppedCh + } +} diff --git a/pkg/kubernetes/manager_test.go b/pkg/kubernetes/manager_test.go index c6f9da6a..843995ed 100644 --- a/pkg/kubernetes/manager_test.go +++ b/pkg/kubernetes/manager_test.go @@ -197,6 +197,48 @@ func (s *ManagerTestSuite) TestNewKubeconfigManager() { }) } +func (s *ManagerTestSuite) TestClusterStateWatcherStop() { + s.Run("stop() on nil watcher", func() { + var watcher *clusterStateWatcher + // Should not panic + watcher.stop() + }) + + s.Run("stop() on uninitialized watcher (nil channels)", func() { + watcher := &clusterStateWatcher{} + // Should not panic even with nil channels + watcher.stop() + }) + + s.Run("stop() on initialized watcher", func() { + watcher := &clusterStateWatcher{ + stopCh: make(chan struct{}), + stoppedCh: make(chan struct{}), + } + // Close the stoppedCh to simulate a running goroutine + go func() { + <-watcher.stopCh + close(watcher.stoppedCh) + }() + // Should not panic and should stop cleanly + watcher.stop() + }) + + s.Run("stop() called multiple times", func() { + watcher := &clusterStateWatcher{ + stopCh: make(chan struct{}), + stoppedCh: make(chan struct{}), + } + go func() { + <-watcher.stopCh + close(watcher.stoppedCh) + }() + // First stop + watcher.stop() + // Second stop should not panic + watcher.stop() + }) +} func TestManager(t *testing.T) { suite.Run(t, new(ManagerTestSuite)) } diff --git a/pkg/kubernetes/provider_kubeconfig.go b/pkg/kubernetes/provider_kubeconfig.go index 9ab055c8..2fbf5c09 100644 --- a/pkg/kubernetes/provider_kubeconfig.go +++ b/pkg/kubernetes/provider_kubeconfig.go @@ -120,8 +120,8 @@ func (p *kubeConfigClusterProvider) GetDefaultTarget() string { func (p *kubeConfigClusterProvider) WatchTargets(onKubeConfigChanged func() error) { m := p.managers[p.defaultContext] - m.WatchKubeConfig(onKubeConfigChanged) + m.WatchClusterState(DefaultClusterStatePollInterval, DefaultClusterStateDebounceWindow, onKubeConfigChanged) } func (p *kubeConfigClusterProvider) Close() { diff --git a/pkg/kubernetes/provider_single.go b/pkg/kubernetes/provider_single.go index 3693d639..1e663f67 100644 --- a/pkg/kubernetes/provider_single.go +++ b/pkg/kubernetes/provider_single.go @@ -87,6 +87,7 @@ func (p *singleClusterProvider) GetTargetParameterName() string { func (p *singleClusterProvider) WatchTargets(watch func() error) { p.manager.WatchKubeConfig(watch) + p.manager.WatchClusterState(DefaultClusterStatePollInterval, DefaultClusterStateDebounceWindow, watch) } func (p *singleClusterProvider) Close() { diff --git a/pkg/mcp/mcp.go b/pkg/mcp/mcp.go index 6a4a6d2f..cd201b62 100644 --- a/pkg/mcp/mcp.go +++ b/pkg/mcp/mcp.go @@ -5,10 +5,13 @@ import ( "fmt" "net/http" "os" + "os/signal" "slices" + "syscall" "github.com/modelcontextprotocol/go-sdk/mcp" authenticationapiv1 "k8s.io/api/authentication/v1" + "k8s.io/klog/v2" "k8s.io/utils/ptr" "github.com/containers/kubernetes-mcp-server/pkg/api" @@ -25,6 +28,8 @@ const TokenScopesContextKey = ContextKey("TokenScopesContextKey") type Configuration struct { *config.StaticConfig + ConfigPath string // Path to main config file (for watching) + ConfigDir string // Path to drop-in config directory (for watching) listOutput output.Output toolsets []api.Toolset } @@ -66,6 +71,7 @@ type Server struct { server *mcp.Server enabledTools []string p internalk8s.Provider + sigHupCh chan os.Signal } func NewServer(configuration Configuration) (*Server, error) { @@ -93,36 +99,52 @@ func NewServer(configuration Configuration) (*Server, error) { } s.p.WatchTargets(s.reloadKubernetesClusterProvider) + // Set up SIGHUP handler for configuration reload + s.setupSIGHUPHandler() + return s, nil } func (s *Server) reloadKubernetesClusterProvider() error { ctx := context.Background() - p, err := internalk8s.NewProvider(s.configuration.StaticConfig) + + newProvider, err := internalk8s.NewProvider(s.configuration.StaticConfig) + if err != nil { + return err + } + + targets, err := newProvider.GetTargets(ctx) if err != nil { + newProvider.Close() return err } - // close the old provider if s.p != nil { s.p.Close() } - s.p = p + s.p = newProvider - targets, err := p.GetTargets(ctx) - if err != nil { + if err := s.rebuildTools(targets); err != nil { return err } + s.p.WatchTargets(s.reloadKubernetesClusterProvider) + + return nil +} + +// rebuildTools rebuilds the MCP tool registry based on the current provider and targets. +// This is called after the provider has been successfully validated and set. +func (s *Server) rebuildTools(targets []string) error { filter := CompositeFilter( s.configuration.isToolApplicable, - ShouldIncludeTargetListTool(p.GetTargetParameterName(), targets), + ShouldIncludeTargetListTool(s.p.GetTargetParameterName(), targets), ) mutator := WithTargetParameter( - p.GetDefaultTarget(), - p.GetTargetParameterName(), + s.p.GetDefaultTarget(), + s.p.GetTargetParameterName(), targets, ) @@ -136,7 +158,7 @@ func (s *Server) reloadKubernetesClusterProvider() error { applicableTools := make([]api.ServerTool, 0) s.enabledTools = make([]string, 0) for _, toolset := range s.configuration.Toolsets() { - for _, tool := range toolset.GetTools(p) { + for _, tool := range toolset.GetTools(s.p) { tool := mutator(tool) if !filter(tool) { continue @@ -157,6 +179,7 @@ func (s *Server) reloadKubernetesClusterProvider() error { } s.server.RemoveTools(toolsToRemove...) + // Add new tools for _, tool := range applicableTools { goSdkTool, goSdkToolHandler, err := ServerToolToGoSdkTool(s, tool) if err != nil { @@ -165,8 +188,6 @@ func (s *Server) reloadKubernetesClusterProvider() error { s.server.AddTool(goSdkTool, goSdkToolHandler) } - // start new watch - s.p.WatchTargets(s.reloadKubernetesClusterProvider) return nil } @@ -211,7 +232,61 @@ func (s *Server) GetEnabledTools() []string { return s.enabledTools } +// setupSIGHUPHandler sets up a signal handler to reload configuration on SIGHUP +func (s *Server) setupSIGHUPHandler() { + s.sigHupCh = make(chan os.Signal, 1) + signal.Notify(s.sigHupCh, syscall.SIGHUP) + + go func() { + for range s.sigHupCh { + klog.V(1).Info("Received SIGHUP signal") + if err := s.reloadConfiguration(); err != nil { + klog.Errorf("Failed to reload configuration: %v", err) + } else { + klog.V(1).Info("Configuration reloaded successfully via SIGHUP") + } + } + }() + + klog.V(2).Info("SIGHUP handler registered for configuration reload") +} + +// reloadConfiguration reloads the configuration from disk and reinitializes the server +func (s *Server) reloadConfiguration() error { + klog.V(1).Info("Reloading configuration...") + + // Reload config from files + newConfig, err := config.Read(s.configuration.ConfigPath, s.configuration.ConfigDir) + if err != nil { + return fmt.Errorf("failed to reload configuration: %w", err) + } + + // Update the configuration + s.configuration.StaticConfig = newConfig + // Clear cached values so they get recomputed + s.configuration.listOutput = nil + s.configuration.toolsets = nil + + // Reload the Kubernetes provider (this will also rebuild tools) + if err := s.reloadKubernetesClusterProvider(); err != nil { + return fmt.Errorf("failed to reload Kubernetes provider: %w", err) + } + + klog.V(1).Info("Configuration reloaded successfully") + return nil +} + func (s *Server) Close() { + if s.sigHupCh != nil { + signal.Stop(s.sigHupCh) + // Check if channel is already closed + select { + case <-s.sigHupCh: + // Already closed + default: + close(s.sigHupCh) + } + } if s.p != nil { s.p.Close() } diff --git a/pkg/mcp/mcp_reload_test.go b/pkg/mcp/mcp_reload_test.go new file mode 100644 index 00000000..de3ec160 --- /dev/null +++ b/pkg/mcp/mcp_reload_test.go @@ -0,0 +1,318 @@ +package mcp + +import ( + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/containers/kubernetes-mcp-server/internal/test" + "github.com/containers/kubernetes-mcp-server/pkg/config" + "github.com/mark3labs/mcp-go/mcp" + "github.com/stretchr/testify/suite" +) + +type ConfigReloadSuite struct { + BaseMcpSuite + mockServer *test.MockServer + configFile string + configDir string + server *Server +} + +func (s *ConfigReloadSuite) SetupTest() { + s.BaseMcpSuite.SetupTest() + s.mockServer = test.NewMockServer() + s.Cfg.KubeConfig = s.mockServer.KubeconfigFile(s.T()) + s.mockServer.Handle(&test.DiscoveryClientHandler{}) + + tempDir := s.T().TempDir() + s.configFile = filepath.Join(tempDir, "config.toml") + s.configDir = filepath.Join(tempDir, "config.d") + err := os.Mkdir(s.configDir, 0755) + s.Require().NoError(err) + + // Write initial config (include kubeconfig so reload works) + err = os.WriteFile(s.configFile, []byte(` +log_level = 1 +list_output = "table" +toolsets = ["core", "config"] +kubeconfig = "`+s.Cfg.KubeConfig+`" +`), 0644) + s.Require().NoError(err) +} + +func (s *ConfigReloadSuite) TearDownTest() { + s.BaseMcpSuite.TearDownTest() + if s.server != nil { + s.server.Close() + } + if s.mockServer != nil { + s.mockServer.Close() + } +} + +func (s *ConfigReloadSuite) TestDropInConfigurationReload() { + // Initialize server - it will load from config files + cfg, err := config.Read(s.configFile, s.configDir) + s.Require().NoError(err) + server, err := NewServer(Configuration{ + StaticConfig: cfg, + ConfigPath: s.configFile, + ConfigDir: s.configDir, + }) + s.Require().NoError(err) + s.Require().NotNil(server) + s.server = server + + s.Run("initial configuration loaded correctly", func() { + s.Equal(1, server.configuration.LogLevel) + s.Equal("table", server.configuration.StaticConfig.ListOutput) + s.Equal([]string{"core", "config"}, server.configuration.StaticConfig.Toolsets) + }) + + // Add first drop-in file + dropIn1 := filepath.Join(s.configDir, "10-override.toml") + err = os.WriteFile(dropIn1, []byte(` +log_level = 5 +list_output = "yaml" +`), 0644) + s.Require().NoError(err) + + err = server.reloadConfiguration() + s.Require().NoError(err) + + s.Run("drop-in file overrides main config", func() { + s.Equal(5, server.configuration.LogLevel) + s.Equal("yaml", server.configuration.StaticConfig.ListOutput) + s.Equal([]string{"core", "config"}, server.configuration.StaticConfig.Toolsets) + }) + + // Add second drop-in file with different priority + dropIn2 := filepath.Join(s.configDir, "20-toolsets.toml") + err = os.WriteFile(dropIn2, []byte(` +toolsets = ["core", "config", "helm"] +`), 0644) + s.Require().NoError(err) + + err = server.reloadConfiguration() + s.Require().NoError(err) + + s.Run("multiple drop-ins with correct precedence", func() { + s.Equal(5, server.configuration.LogLevel) + s.Equal("yaml", server.configuration.StaticConfig.ListOutput) + s.Equal([]string{"core", "config", "helm"}, server.configuration.StaticConfig.Toolsets) + }) + + // Add third drop-in that partially overrides + dropIn3 := filepath.Join(s.configDir, "30-partial.toml") + err = os.WriteFile(dropIn3, []byte(` +log_level = 7 +`), 0644) + s.Require().NoError(err) + + err = server.reloadConfiguration() + s.Require().NoError(err) + + s.Run("later drop-in overrides earlier with partial config", func() { + s.Equal(7, server.configuration.LogLevel) + s.Equal("yaml", server.configuration.StaticConfig.ListOutput) + s.Equal([]string{"core", "config", "helm"}, server.configuration.StaticConfig.Toolsets) + }) + + // Remove all drop-in files to test empty directory + err = os.Remove(dropIn1) + s.Require().NoError(err) + err = os.Remove(dropIn2) + s.Require().NoError(err) + err = os.Remove(dropIn3) + s.Require().NoError(err) + + err = server.reloadConfiguration() + s.Require().NoError(err) + + s.Run("empty drop-in directory reverts to main config", func() { + s.Equal(1, server.configuration.LogLevel) + s.Equal("table", server.configuration.StaticConfig.ListOutput) + s.Equal([]string{"core", "config"}, server.configuration.StaticConfig.Toolsets) + }) + + // Add a drop-in and then remove it + tempDropIn := filepath.Join(s.configDir, "10-temp.toml") + err = os.WriteFile(tempDropIn, []byte(` +log_level = 8 +`), 0644) + s.Require().NoError(err) + + err = server.reloadConfiguration() + s.Require().NoError(err) + s.Equal(8, server.configuration.LogLevel) + + err = os.Remove(tempDropIn) + s.Require().NoError(err) + + err = server.reloadConfiguration() + s.Require().NoError(err) + + s.Run("removing drop-in file reverts to main config", func() { + s.Equal(1, server.configuration.LogLevel) + }) +} + +func (s *ConfigReloadSuite) TestConfigurationReloadErrors() { + server, err := NewServer(Configuration{ + StaticConfig: s.Cfg, + ConfigPath: s.configFile, + ConfigDir: s.configDir, + }) + s.Require().NoError(err) + s.server = server + + initialLogLevel := server.configuration.LogLevel + + s.Run("invalid TOML in drop-in file", func() { + dropIn := filepath.Join(s.configDir, "10-invalid.toml") + err = os.WriteFile(dropIn, []byte(` +log_level = "invalid +`), 0644) + s.Require().NoError(err) + + err = server.reloadConfiguration() + s.Error(err, "should return error for invalid TOML") + s.Equal(initialLogLevel, server.configuration.LogLevel, "config unchanged on error") + + // Cleanup + _ = os.Remove(dropIn) + }) + + s.Run("missing main config file", func() { + // Delete main config file + err = os.Remove(s.configFile) + s.Require().NoError(err) + + err = server.reloadConfiguration() + s.Error(err, "should return error for missing config file") + s.Equal(initialLogLevel, server.configuration.LogLevel, "config unchanged on error") + }) +} + +func (s *ConfigReloadSuite) TestSIGHUPReload() { + server, err := NewServer(Configuration{ + StaticConfig: s.Cfg, + ConfigPath: s.configFile, + ConfigDir: s.configDir, + }) + s.Require().NoError(err) + s.server = server + + initialLogLevel := server.configuration.LogLevel + + s.Run("single SIGHUP triggers reload", func() { + dropIn := filepath.Join(s.configDir, "10-sighup.toml") + err = os.WriteFile(dropIn, []byte(` +log_level = 9 +`), 0644) + s.Require().NoError(err) + + // Send SIGHUP signal to the channel + server.sigHupCh <- syscall.SIGHUP + time.Sleep(100 * time.Millisecond) + + s.NotEqual(initialLogLevel, server.configuration.LogLevel) + s.Equal(9, server.configuration.LogLevel) + + // Cleanup for next test + _ = os.Remove(dropIn) + }) + + s.Run("multiple SIGHUP signals in succession", func() { + dropIn := filepath.Join(s.configDir, "10-multi.toml") + + // First SIGHUP + err = os.WriteFile(dropIn, []byte(`log_level = 3`), 0644) + s.Require().NoError(err) + server.sigHupCh <- syscall.SIGHUP + time.Sleep(50 * time.Millisecond) + s.Equal(3, server.configuration.LogLevel) + + // Second SIGHUP + err = os.WriteFile(dropIn, []byte(`log_level = 6`), 0644) + s.Require().NoError(err) + server.sigHupCh <- syscall.SIGHUP + time.Sleep(50 * time.Millisecond) + s.Equal(6, server.configuration.LogLevel) + + // Third SIGHUP + err = os.WriteFile(dropIn, []byte(`log_level = 9`), 0644) + s.Require().NoError(err) + server.sigHupCh <- syscall.SIGHUP + time.Sleep(50 * time.Millisecond) + s.Equal(9, server.configuration.LogLevel) + }) +} + +func (s *ConfigReloadSuite) TestReloadUpdatesToolsets() { + server, err := NewServer(Configuration{ + StaticConfig: s.Cfg, + ConfigPath: s.configFile, + ConfigDir: s.configDir, + }) + s.Require().NoError(err) + s.server = server + + // Get initial tools + s.InitMcpClient() + initialTools, err := s.ListTools(s.T().Context(), mcp.ListToolsRequest{}) + s.Require().NoError(err) + s.Require().Greater(len(initialTools.Tools), 0) + + // Add helm toolset via drop-in + dropIn := filepath.Join(s.configDir, "10-add-helm.toml") + err = os.WriteFile(dropIn, []byte(` +toolsets = ["core", "config", "helm"] +`), 0644) + s.Require().NoError(err) + + // Reload configuration + err = server.reloadConfiguration() + s.Require().NoError(err) + + // Verify helm tools are available + reloadedTools, err := s.ListTools(s.T().Context(), mcp.ListToolsRequest{}) + s.Require().NoError(err) + + helmToolFound := false + for _, tool := range reloadedTools.Tools { + if tool.Name == "helm_list" { + helmToolFound = true + break + } + } + s.True(helmToolFound, "helm tools should be available after reload") +} + +func (s *ConfigReloadSuite) TestServerLifecycle() { + server, err := NewServer(Configuration{ + StaticConfig: s.Cfg, + ConfigPath: s.configFile, + ConfigDir: s.configDir, + }) + s.Require().NoError(err) + + s.Run("server closes without panic", func() { + s.NotPanics(func() { + server.Close() + }) + }) + + s.Run("double close does not panic", func() { + s.NotPanics(func() { + server.Close() + }) + }) +} + +func TestConfigReload(t *testing.T) { + suite.Run(t, new(ConfigReloadSuite)) +}