From 62e29a38b0d8b8cf17a112c75614c6274bb0eabf Mon Sep 17 00:00:00 2001 From: avandras Date: Thu, 15 May 2025 10:41:50 +0200 Subject: [PATCH 01/10] Fix a small isort issue --- tests/test_ha.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_ha.py b/tests/test_ha.py index e02d427f5..53911c7e9 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -1,10 +1,11 @@ import datetime -import etcd import os import sys from unittest.mock import MagicMock, Mock, mock_open, patch, PropertyMock +import etcd + from patroni import global_config from patroni.collections import CaseInsensitiveSet from patroni.config import Config From 033cea6adb0e63d0df1142ea3391d5797e6b9078 Mon Sep 17 00:00:00 2001 From: avandras Date: Thu, 15 May 2025 16:22:02 +0200 Subject: [PATCH 02/10] Fix most Pyright issues in multisite.py --- patroni/multisite.py | 54 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/patroni/multisite.py b/patroni/multisite.py index dd9ea6a05..f7e401f89 100644 --- a/patroni/multisite.py +++ b/patroni/multisite.py @@ -3,15 +3,15 @@ import logging import time -from datetime import datetime +from datetime import datetime, timezone from threading import Event, Thread -from typing import Union +from typing import Any, Dict, Tuple, Union import six import kubernetes -from .dcs import Cluster, Member +from .dcs import AbstractDCS, Cluster, Member from .dcs.kubernetes import catch_kubernetes_errors from .exceptions import DCSError @@ -29,7 +29,7 @@ def start(self): def shutdown(self): pass - def get_active_standby_config(self) -> Union[dict, None]: + def get_active_standby_config(self) -> Union[Dict[str, Any], None]: """Returns currently active configuration for standby leader""" return {} @@ -52,13 +52,13 @@ def heartbeat(self): def release(self): pass - def status(self): + def status(self) -> Dict[str, Any]: return {} def should_failover(self) -> bool: return False - def on_shutdown(self, checkpoint_location): + def on_shutdown(self, checkpoint_location: int): pass @@ -71,7 +71,7 @@ def status(self): class MultisiteController(Thread, AbstractSiteController): is_active = True - def __init__(self, config, on_change=None): + def __init__(self, config: Dict[str, Any], on_change: None = None): super().__init__() self.stop_requested = False self.on_change = on_change @@ -82,10 +82,11 @@ def __init__(self, config, on_change=None): self.name = msconfig['name'] if msconfig.get('update_crd'): - self._state_updater = KubernetesStateManagement(msconfig.get('update_crd'), - msconfig.get('crd_uid'), - reporter=self.name, # Use pod name? - crd_api=msconfig.get('crd_api', 'acid.zalan.do/v1')) + self._state_updater = KubernetesStateManagement( + msconfig.get('update_crd'), # pyright: ignore [reportArgumentType] + msconfig.get('crd_uid'), # pyright: ignore [reportArgumentType] + reporter=self.name, # Use pod name? 
+                crd_api=msconfig.get('crd_api', 'acid.zalan.do/v1'))
         else:
             self._state_updater = None
 
@@ -105,7 +106,7 @@ def __init__(self, config, on_change=None):
         self._dcs_error = None
 
     @staticmethod
-    def get_dcs_config(config):
+    def get_dcs_config(config: Dict[str, Any]) -> Tuple[Dict[str, Any], AbstractDCS]:
         msconfig = config['multisite']
 
         # Multisite configuration inherits values from main configuration
@@ -166,7 +167,7 @@ def release(self):
     def should_failover(self):
         return self._failover_target is not None and self._failover_target != self.name
 
-    def on_shutdown(self, checkpoint_location):
+    def on_shutdown(self, checkpoint_location: int):
         """ Called when shutdown for multisite failover has completed. """
         # TODO: check if we replicated everything to standby site
 
@@ -193,7 +194,7 @@ def _set_standby_config(self, other: Member):
         logger.info(f"Setting standby configuration to: {self._standby_config}")
         return old_conf != self._standby_config
 
-    def _check_transition(self, leader, note=None):
+    def _check_transition(self, leader: bool, note: str = ''):
         if self._has_leader != leader:
             logger.info("State transition")
             self._has_leader = leader
@@ -321,7 +322,7 @@ def _observe_leader(self):
                 # On replicas we need to know the multisite status only for rewinding.
                 logger.warning(f"Error accessing multisite DCS: {e}")
 
-    def _update_history(self, cluster):
+    def _update_history(self, cluster: Cluster):
         if cluster.history and cluster.history.lines and isinstance(cluster.history.lines[0], dict):
             self.site_switches = cluster.history.lines[0].get('switches')
 
@@ -380,7 +381,7 @@ def shutdown(self):
 
 
 class KubernetesStateManagement:
-    def __init__(self, crd_name, crd_uid, reporter, crd_api):
+    def __init__(self, crd_name: str, crd_uid: str, reporter: str, crd_api: str):
         self.crd_namespace, self.crd_name = (['default'] + crd_name.rsplit('.', 1))[-2:]
         self.crd_uid = crd_uid
         self.reporter = reporter
@@ -388,7 +389,7 @@ def __init__(self, crd_name, crd_uid, reporter, crd_api):
 
         # TODO: handle config loading when main DCS is not Kubernetes based
         # apiclient = k8s_client.ApiClient(False)
-        kubernetes.config.load_incluster_config()
+        kubernetes.config.load_incluster_config()  # pyright: ignore [reportUnknownMemberType]
         apiclient = kubernetes.client.ApiClient()
         self._customobj_api = kubernetes.client.CustomObjectsApi(apiclient)
         self._events_api = kubernetes.client.EventsV1Api(apiclient)
@@ -396,12 +397,12 @@ def __init__(self, crd_name, crd_uid, reporter, crd_api):
         self._status_update = None
         self._event_obj = None
 
-    def state_transition(self, new_state, note):
+    def state_transition(self, new_state: str, note: str):
         self._status_update = {"status": {"Multisite": new_state}}
-        failover_time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+        failover_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
 
         reason = 'Promote' if new_state == 'Leader' else 'Demote'
-        if note is None:
+        if note == '':
             note = 'Acquired multisite leader' if new_state == 'Leader' else 'Became a standby cluster'
 
         self._event_obj = kubernetes.client.EventsV1Event(
@@ -433,13 +434,12 @@ def store_updates(self):
             logger.warning("Unable to store Kubernetes status update: %s", e)
 
     @catch_kubernetes_errors
-    def update_crd_state(self, update):
-        self._customobj_api.patch_namespaced_custom_object_status(self.crd_api_group, self.crd_api_version,
-                                                                  self.crd_namespace,
-                                                                  'postgresqls', self.crd_name + '/status', update,
-                                                                  field_manager='patroni')
+    def update_crd_state(self, update: Dict[str, Any]):
+        self._customobj_api.patch_namespaced_custom_object_status(  # pyright: ignore [reportUnknownMemberType]
+            self.crd_api_group, self.crd_api_version, self.crd_namespace, 'postgresqls', self.crd_name + '/status',
+            update, field_manager='patroni')
         return True
 
-    def create_failover_event(self, event):
-        self._events_api.create_namespaced_event(self.crd_namespace, event)
+    def create_failover_event(self, event: kubernetes.client.EventsV1Event):
+        self._events_api.create_namespaced_event(self.crd_namespace, event)  # pyright: ignore [reportUnknownMemberType]
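
Note on patch 02: the `Union[Dict[str, Any], None]` return type on
`get_active_standby_config()` is what lets Pyright check the call sites in
ha.py. A minimal, self-contained sketch of the pattern (the host/port keys
are illustrative, not the exact shape Patroni stores):

```python
from typing import Any, Dict, Union


def get_active_standby_config() -> Union[Dict[str, Any], None]:
    # May be None when the local site currently holds the multisite leader lock.
    return {'host': '10.0.1.1', 'port': 5432}


config = get_active_standby_config()
if config is not None:           # Pyright narrows the Optional away here
    print(config.get('host'))    # indexing the dict is now well-typed
```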
From 8f75f0dc5dd4af2d91ddbdbbe49f55720f4def58 Mon Sep 17 00:00:00 2001
From: avandras
Date: Thu, 15 May 2025 17:09:05 +0200
Subject: [PATCH 03/10] Fix most Pyright issues in ha.py

---
 patroni/ha.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/patroni/ha.py b/patroni/ha.py
index 9d96fcf7f..20421722e 100644
--- a/patroni/ha.py
+++ b/patroni/ha.py
@@ -285,7 +285,7 @@ def is_standby_cluster(self) -> bool:
         return self.patroni.multisite.is_active and not self.patroni.multisite.is_leader_site() \
             or global_config.is_standby_cluster
 
-    def get_standby_cluster_config(self):
+    def get_standby_cluster_config(self) -> Any:
         if self.patroni.multisite.is_active:
             return self.patroni.multisite.get_active_standby_config()
         return global_config.get_standby_cluster_config()
@@ -362,7 +362,8 @@ def acquire_lock(self) -> bool:
             multisite_ret = self.patroni.multisite.resolve_leader()
             if multisite_ret:
                 logger.error("Releasing leader lock because multi site status is: " + multisite_ret)
-                self.dcs.delete_leader()
+                # self.dcs.delete_leader()
+                self._delete_leader()
                 return False
         return ret
 
@@ -1583,7 +1584,7 @@ def on_shutdown(checkpoint_location: int, prev_location: int) -> None:
                     status['released'] = True
 
         if mode == 'multisite':
-            on_shutdown = self.patroni.multisite.on_shutdown  # noqa: F811
+            on_shutdown = self.patroni.multisite.on_shutdown  # pyright: ignore [reportAssignmentType] # noqa: F811
 
         def before_shutdown() -> None:
             if self.state_handler.mpp_handler.is_coordinator():

From 72c74aac36fe6d67f25e1125d4ede7ed3c6a1b7d Mon Sep 17 00:00:00 2001
From: avandras
Date: Fri, 16 May 2025 11:47:23 +0200
Subject: [PATCH 04/10] Fix method call

---
 patroni/api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/patroni/api.py b/patroni/api.py
index d9f50b9b0..208f64606 100644
--- a/patroni/api.py
+++ b/patroni/api.py
@@ -1199,13 +1199,13 @@ def do_POST_switchover(self) -> None:
         self.do_POST_failover(action='switchover')
 
     @check_access
-    def do_POST_site_switchover(self):
+    def do_POST_site_switchover(self) -> None:
         request = self._read_json_content()
         (status_code, data) = (400, '')
         if not request:
             return
         if not self.server.patroni.multisite.is_active:
-            return self._write_response(400, 'Cluster is not in multisite mode')
+            return self.write_response(400, 'Cluster is not in multisite mode')
 
         scheduled_at = request.get('scheduled_at')
         target_site = request.get('target_site')
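
Note on patch 04: `do_POST_site_switchover` sits behind the REST API, so the
mistyped `_write_response` call only surfaced at request time. A hedged
sketch of exercising the endpoint with the standard library (the
`/site_switchover` path is inferred from the handler name, and host, port
and site names are placeholders):

```python
import json
from urllib import request
from urllib.error import HTTPError

body = json.dumps({'target_site': 'dc2'}).encode()
req = request.Request('http://localhost:8008/site_switchover', data=body,
                      headers={'Content-Type': 'application/json'})
try:
    with request.urlopen(req) as resp:
        print(resp.status, resp.read().decode())
except HTTPError as e:
    # e.g. the 400 'Cluster is not in multisite mode' branch fixed above
    print(e.code, e.read().decode())
```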
From 2b2e289066d7a72fce7d0c100b7efcb782d73572 Mon Sep 17 00:00:00 2001
From: avandras
Date: Mon, 19 May 2025 15:12:47 +0200
Subject: [PATCH 05/10] Fix remaining pyright issues that can be fixed without
 implementation changes

---
 patroni/multisite.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/patroni/multisite.py b/patroni/multisite.py
index f7e401f89..ed566b354 100644
--- a/patroni/multisite.py
+++ b/patroni/multisite.py
@@ -5,7 +5,7 @@
 
 from datetime import datetime, timezone
 from threading import Event, Thread
-from typing import Any, Dict, Tuple, Union
+from typing import Any, Callable, Dict, Optional, Tuple, TYPE_CHECKING, Union
 
 import six
 
@@ -15,6 +15,9 @@
 from .dcs.kubernetes import catch_kubernetes_errors
 from .exceptions import DCSError
 
+if TYPE_CHECKING:  # pragma: no cover
+    from .config import Config
+
 logger = logging.getLogger(__name__)
 
 
@@ -23,6 +26,8 @@ class AbstractSiteController(object):
     # Set whether we are relying on this controller for providing standby config
     is_active = False
 
+    dcs: AbstractDCS
+
    def start(self):
         pass
 
@@ -71,7 +76,7 @@ def status(self):
 class MultisiteController(Thread, AbstractSiteController):
     is_active = True
 
-    def __init__(self, config: Dict[str, Any], on_change: None = None):
+    def __init__(self, config: 'Config', on_change: Optional[Callable[..., None]] = None):
         super().__init__()
         self.stop_requested = False
         self.on_change = on_change
@@ -106,7 +111,7 @@ def __init__(self, config: Dict[str, Any], on_change: None = None):
         self._dcs_error = None
 
     @staticmethod
-    def get_dcs_config(config: Dict[str, Any]) -> Tuple[Dict[str, Any], AbstractDCS]:
+    def get_dcs_config(config: 'Config') -> Tuple[Dict[str, Any], AbstractDCS]:
         msconfig = config['multisite']
 
         # Multisite configuration inherits values from main configuration
@@ -327,8 +332,8 @@ def _update_history(self, cluster: Cluster):
             self.site_switches = cluster.history.lines[0].get('switches')
 
         if self._has_leader:
-            if cluster.history and cluster.history.lines and isinstance(cluster.history.lines, dict):
-                history_state = cluster.history.lines
+            if cluster.history and cluster.history.lines and isinstance(cluster.history.lines[0], dict):
+                history_state = cluster.history.lines[0]
                 if history_state.get('last_leader') != self.name:
                     new_state = [{'last_leader': self.name, 'switches': history_state.get('switches', 0) + 1}]
                     self.dcs.set_history_value(json.dumps(new_state))
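
Note on patch 05: the `TYPE_CHECKING` guard gives Pyright the `Config` type
without importing `patroni.config` at runtime, which would risk an import
cycle. The pattern in isolation:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:  # evaluated by the type checker only, never at runtime
    from patroni.config import Config


def get_dcs_config(config: 'Config') -> None:
    # The quoted annotation is resolved lazily, so no runtime import is needed.
    ...
```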
From e6eb8ef861aeea9cc29487f0ffd95868a5485bdc Mon Sep 17 00:00:00 2001
From: avandras
Date: Wed, 21 May 2025 17:23:37 +0200
Subject: [PATCH 06/10] Change MS history to the format of normal history

---
 patroni/api.py       |  2 +-
 patroni/multisite.py | 42 ++++++++++++++++++++++++++++++------------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/patroni/api.py b/patroni/api.py
index 208f64606..84d53cfeb 100644
--- a/patroni/api.py
+++ b/patroni/api.py
@@ -681,7 +681,7 @@ def do_GET_metrics(self) -> None:
             metrics.append("# HELP patroni_multisite_switches Number of times multisite leader has been switched")
             metrics.append("# TYPE patroni_multisite_switches counter")
             metrics.append("patroni_multisite_switches{0} {1}"
-                           .format(labels, patroni.multisite.site_switches))
+                           .format(labels, patroni.multisite.site_switches))  # noqa: E501 # pyright: ignore [reportUnknownMemberType, reportUnknownArgumentType, reportAttributeAccessIssue]
 
         self.write_response(200, '\n'.join(metrics) + '\n', content_type='text/plain')
 
diff --git a/patroni/multisite.py b/patroni/multisite.py
index ed566b354..a02694ed5 100644
--- a/patroni/multisite.py
+++ b/patroni/multisite.py
@@ -311,7 +311,7 @@ def _observe_leader(self):
             cluster = self.dcs.get_cluster()
 
             if cluster.is_unlocked():
-                logger.info("Multisite has no leader")
+                logger.info("Multisite has no leader because cluster is unlocked")
                 self._disconnected_operation()
             else:
                 # There is a leader cluster
@@ -328,17 +328,39 @@ def _observe_leader(self):
                 logger.warning(f"Error accessing multisite DCS: {e}")
 
     def _update_history(self, cluster: Cluster):
-        if cluster.history and cluster.history.lines and isinstance(cluster.history.lines[0], dict):
-            self.site_switches = cluster.history.lines[0].get('switches')
+        # The history lines are of type dcs._HistoryTuple to match normal timeline history. The data stored by tuple
+        # index:
+        #  0: site switch count
+        #  1: 0 (constant) TODO: maybe store the LSN when the switch happened - in that case it will match the LSN of the
+        #     timeline switch
+        #  2: site switch timestamp
+        #  3: new leader site name
+        #
+        # The full history is a list of the tuples described above, the latest one being the last element.
+        # The older implementation was a single-item list of dict; we replace it with the list of tuples.
+        # TODO: once we are sure there are no such instances, the dict references can be removed alongside the ugly
+        # pyright-repellent comments.
+
+        if cluster.history and cluster.history.lines:
+            if isinstance(cluster.history.lines[0], dict):  # older implementation, will get replaced by this update
+                self.site_switches = cluster.history.lines[0].get('switches')  # noqa: E501 # pyright: ignore [reportUnknownMemberType]
+            else:
+                self.site_switches = cluster.history.lines[-1][0]
 
         if self._has_leader:
-            if cluster.history and cluster.history.lines and isinstance(cluster.history.lines[0], dict):
-                history_state = cluster.history.lines[0]
-                if history_state.get('last_leader') != self.name:
-                    new_state = [{'last_leader': self.name, 'switches': history_state.get('switches', 0) + 1}]
-                    self.dcs.set_history_value(json.dumps(new_state))
-            else:
-                self.dcs.set_history_value(json.dumps([{'last_leader': self.name, 'switches': 0}]))
+            if cluster.history and cluster.history.lines:
+                if isinstance(cluster.history.lines[0], dict):
+                    history_state = cluster.history.lines[0]
+                    if history_state.get('last_leader') != self.name:  # pyright: ignore [reportUnknownMemberType]
+                        new_state = (history_state.get('switches', 0) + 1, 0, '', self.name)  # noqa: E501 # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType]
+                        self.dcs.set_history_value(json.dumps([new_state]))  # replace the old dict-format history
+                else:
+                    history_state = cluster.history.lines[-1]
+                    if len(history_state) > 3 and history_state[3] != self.name:
+                        new_state = (history_state[0] + 1, 0, '', self.name)
+                        self.dcs.set_history_value(json.dumps(list(cluster.history.lines) + [new_state]))  # append
+            else:  # no history yet, set initial item
+                self.dcs.set_history_value(json.dumps([(0, 0, '', self.name)]))
 
     def _check_for_failover(self, cluster: Cluster):
         if cluster.failover and cluster.failover.target_site:
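
Note on patch 06: the history entries round-trip through JSON, which has no
tuple type, so the tuples described in the comment block come back as lists.
A small sketch with illustrative values:

```python
import json

old_format = [{'last_leader': 'dc1', 'switches': 2}]   # single-item list of dict
new_format = [(2, 0, '', 'dc1'), (3, 0, '', 'dc2')]    # list of history tuples

payload = json.dumps(new_format)
# JSON turns tuples into arrays, so entries are lists after a round trip.
assert json.loads(payload) == [[2, 0, '', 'dc1'], [3, 0, '', 'dc2']]
```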
""" # TODO: check if we replicated everything to standby site From 5df8f57eae8f82972f7ee937604ed3c9ec522849 Mon Sep 17 00:00:00 2001 From: avandras Date: Thu, 22 May 2025 20:15:21 +0200 Subject: [PATCH 08/10] Fix tests failing with newest click --- tests/test_ctl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_ctl.py b/tests/test_ctl.py index db69dba23..85ba2c92d 100644 --- a/tests/test_ctl.py +++ b/tests/test_ctl.py @@ -243,7 +243,7 @@ def test_switchover(self): @patch('patroni.dcs.AbstractDCS.set_failover_value', Mock()) def test_failover(self): # No candidate specified - result = self.runner.invoke(ctl, ['failover', 'dummy'], input='0\n') + result = self.runner.invoke(ctl, ['failover', 'dummy'], input='0\n\n') self.assertIn('Failover could be performed only to a specific candidate', result.output) # Candidate is the same as the leader @@ -362,7 +362,7 @@ def test_reload(self, mock_post): @patch('patroni.ctl.request_patroni') def test_restart_reinit(self, mock_post): mock_post.return_value.status = 503 - result = self.runner.invoke(ctl, ['restart', 'alpha'], input='now\ny\n') + result = self.runner.invoke(ctl, ['restart', 'alpha'], input='now\ny\n\n') assert 'Failed: restart for' in result.output assert result.exit_code == 0 @@ -370,7 +370,7 @@ def test_restart_reinit(self, mock_post): assert result.exit_code == 1 # successful reinit - result = self.runner.invoke(ctl, ['reinit', 'alpha', 'other'], input='y\ny') + result = self.runner.invoke(ctl, ['reinit', 'alpha', 'other'], input='y\ny\nn') assert result.exit_code == 0 # Aborted restart From 2b102725a64c3c62ef8fabce8efc3e06d585ff7f Mon Sep 17 00:00:00 2001 From: avandras Date: Fri, 23 May 2025 10:40:15 +0200 Subject: [PATCH 09/10] Fix some RST --- docs/multisite.rst | 117 +++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 58 deletions(-) diff --git a/docs/multisite.rst b/docs/multisite.rst index 65b84ee8c..a0ca9de8d 100644 --- a/docs/multisite.rst +++ b/docs/multisite.rst @@ -91,48 +91,49 @@ The configuration is very similar to the usual Patroni config. In fact, the key An example configuration for two Patroni sites: -``` -multisite: - name: dc1 - namespace: /multisite/ - etcd3: # - hosts: - # dc1 - - 10.0.1.1:2379 - - 10.0.1.2:2379 - - 10.0.1.3:2379 - # dc2 - - 10.0.2.1:2379 - - 10.0.2.2:2379 - - 10.0.2.3:2379 - # dc 3 - - 10.0.0.1:2379 - host: 10.0.1.1,10.0.1.2,10.0.1.3 # How the leader of the other site(s) can connect to the primary on this site - port: 5432 - # Multisite failover timeouts - ttl: 90 - retry_timeout: 40 -``` +.. code:: YAML + + multisite: + name: dc1 + namespace: /multisite/ + etcd3: # + hosts: + # dc1 + - 10.0.1.1:2379 + - 10.0.1.2:2379 + - 10.0.1.3:2379 + # dc2 + - 10.0.2.1:2379 + - 10.0.2.2:2379 + - 10.0.2.3:2379 + # dc 3 + - 10.0.0.1:2379 + host: 10.0.1.1,10.0.1.2,10.0.1.3 # How the leader of the other site(s) can connect to the primary on this site + port: 5432 + # Multisite failover timeouts + ttl: 90 + retry_timeout: 40 + Details of the configuration parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -`name` -: The name of the site. All nodes that share the same value are considered to be a part of the same site, thus it must be different for each site. -`namespace` -: Optional path within DCS where Patroni stores the multisite state. If used, it should be different from the namespace used by the base config, but the same on all sites. -`` (in the example `etcd3`) -: The DCS implementation in use. 
From 2b102725a64c3c62ef8fabce8efc3e06d585ff7f Mon Sep 17 00:00:00 2001
From: avandras
Date: Fri, 23 May 2025 10:40:15 +0200
Subject: [PATCH 09/10] Fix some RST

---
 docs/multisite.rst | 117 +++++++++++++++++++++++----------------------
 1 file changed, 59 insertions(+), 58 deletions(-)

diff --git a/docs/multisite.rst b/docs/multisite.rst
index 65b84ee8c..a0ca9de8d 100644
--- a/docs/multisite.rst
+++ b/docs/multisite.rst
@@ -91,48 +91,49 @@ The configuration is very similar to the usual Patroni config. In fact, the key
 
 An example configuration for two Patroni sites:
 
-```
-multisite:
-  name: dc1
-  namespace: /multisite/
-  etcd3: # <DCS module name>
-    hosts:
-      # dc1
-      - 10.0.1.1:2379
-      - 10.0.1.2:2379
-      - 10.0.1.3:2379
-      # dc2
-      - 10.0.2.1:2379
-      - 10.0.2.2:2379
-      - 10.0.2.3:2379
-      # dc 3
-      - 10.0.0.1:2379
-  host: 10.0.1.1,10.0.1.2,10.0.1.3 # How the leader of the other site(s) can connect to the primary on this site
-  port: 5432
-  # Multisite failover timeouts
-  ttl: 90
-  retry_timeout: 40
-```
+.. code:: YAML
+
+    multisite:
+      name: dc1
+      namespace: /multisite/
+      etcd3: # <DCS module name>
+        hosts:
+          # dc1
+          - 10.0.1.1:2379
+          - 10.0.1.2:2379
+          - 10.0.1.3:2379
+          # dc2
+          - 10.0.2.1:2379
+          - 10.0.2.2:2379
+          - 10.0.2.3:2379
+          # dc 3
+          - 10.0.0.1:2379
+      host: 10.0.1.1,10.0.1.2,10.0.1.3 # How the leader of the other site(s) can connect to the primary on this site
+      port: 5432
+      # Multisite failover timeouts
+      ttl: 90
+      retry_timeout: 40
+
 
 Details of the configuration parameters
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-`name`
-: The name of the site. All nodes that share the same value are considered to be a part of the same site, thus it must be different for each site.
-`namespace`
-: Optional path within DCS where Patroni stores the multisite state. If used, it should be different from the namespace used by the base config, but the same on all sites.
-`<DCS module name>` (in the example `etcd3`)
-: The DCS implementation in use. Possible values are `etcd`, `etcd3`, `zookeeper`, `consul`, `exhibitor`, `kubernetes`, or `raft` (the latter is deprecated).
-`<DCS module name>.hosts`
-: a list of IP addresses of nodes forming the global DCS cluster, including the extra (tiebreaking) node(s)
-`host`
-: Comma-separated list of IPs of the Patroni nodes that can become a primary on the present site
-`port`
-: Postgres port, through which other sites' members can connect to this site. It can be specified once if all nodes use the same port, or as a comma-separated list matching the different port numbers, in the order used in the `host` key.
-`ttl`
-: Time to live of site leader lock. If the site is unable to elect a functioning leader within this timeout, a different site can take over the leader role. Must be a few times longer than the usual `ttl` value in order to prevent unnecessary site failovers.
-`retry_timeout`
-: How long the global etcd cluster can be inaccessible before the cluster is demoted. Must be a few times longer than the usual `retry_timeout` value in order to prevent unnecessary site failovers.
+``name``
+  The name of the site. All nodes that share the same value are considered to be a part of the same site, thus it must be different for each site.
+``namespace``
+  Optional path within DCS where Patroni stores the multisite state. If used, it should be different from the namespace used by the base config, but the same on all sites.
+``<DCS module name>`` (in the example ``etcd3``)
+  The DCS implementation in use. Possible values are ``etcd``, ``etcd3``, ``zookeeper``, ``consul``, ``exhibitor``, ``kubernetes``, or ``raft`` (the latter is deprecated).
+``<DCS module name>.hosts``
+  A list of IP addresses of nodes forming the global DCS cluster, including the extra (tiebreaking) node(s)
+``host``
+  Comma-separated list of IPs of the Patroni nodes that can become a primary on the present site
+``port``
+  Postgres port, through which other sites' members can connect to this site. It can be specified once if all nodes use the same port, or as a comma-separated list matching the different port numbers, in the order used in the ``host`` key.
+``ttl``
+  Time to live of site leader lock. If the site is unable to elect a functioning leader within this timeout, a different site can take over the leader role. Must be a few times longer than the usual ``ttl`` value in order to prevent unnecessary site failovers.
+``retry_timeout``
+  How long the global etcd cluster can be inaccessible before the cluster is demoted. Must be a few times longer than the usual ``retry_timeout`` value in order to prevent unnecessary site failovers.
 
 Passwords in the YAML configuration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -184,23 +185,23 @@ Applications should be ready to try to connect to the new primary. See 'Connect
 
 Glossary
 ++++++++
 
-DCS
-: distributed configuration store
-site
-: a Patroni cluster with any number of nodes, and the respective DCS - usually corresponding to a data centre
-primary
-: the writable PostgreSQL node, from which the other nodes replicate their data (either directly or in a cascading fashion)
-leader
-: the node which other nodes inside the same site replicate from - the leader can be a replica itself, in which case it's called a _standby leader_
-site switchover
-: a (manual) leader site switch performed when both sites are functioning fine
-site failover
-: when the main site goes down (meaning there is no Patroni leader and none of the remaining nodes (if any left) can become a leader), the standby leader will be promoted, becoming a leader proper, and the Postgres instance running there becoming the primary
-leader site
-: the site where the PostgreSQL primary instance is
-standby site
-: a site replicating from the leader site, and a potential target for site switchover/failover
-DCS quorum
-: more than half of the DCS nodes are available (and can take part in a leader race)
-multisite leader lock
-: just like under normal Patroni operation, the leader puts/updates an entry in DCS, thus notifying other sites that there is a functioning Postgres primary running. The entry mentioned is the multisite leader lock.
+**DCS**
+  distributed configuration store
+**site**
+  a Patroni cluster with any number of nodes, and the respective DCS - usually corresponding to a data centre
+**primary**
+  the writable PostgreSQL node, from which the other nodes replicate their data (either directly or in a cascading fashion)
+**leader**
+  the node which other nodes inside the same site replicate from - the leader can be a replica itself, in which case it's called a *standby leader*
+**site switchover**
+  a (manual) leader site switch performed when both sites are functioning fine
+**site failover**
+  when the main site goes down (meaning there is no Patroni leader and none of the remaining nodes (if any left) can become a leader), the standby leader will be promoted, becoming a leader proper, and the Postgres instance running there becoming the primary
+**leader site**
+  the site where the PostgreSQL primary instance is
+**standby site**
+  a site replicating from the leader site, and a potential target for site switchover/failover
+**DCS quorum**
+  more than half of the DCS nodes are available (and can take part in a leader race)
+**multisite leader lock**
+  just like under normal Patroni operation, the leader puts/updates an entry in DCS, thus notifying other sites that there is a functioning Postgres primary running. The entry mentioned is the multisite leader lock.
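
Note on patch 09: the docs state that the multisite section inherits values
from the main Patroni configuration (see the comment in `get_dcs_config`,
patch 02). The effective DCS settings can be pictured as a dict merge; a
sketch under that assumption (the exact merge logic in Patroni may differ):

```python
main_config = {'ttl': 30, 'retry_timeout': 10, 'namespace': '/service/'}
multisite = {'name': 'dc1', 'ttl': 90, 'retry_timeout': 40}

effective = {**main_config, **multisite}  # multisite keys win
print(effective)
# {'ttl': 90, 'retry_timeout': 40, 'namespace': '/service/', 'name': 'dc1'}
```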
From 6a97d31fbdf60bab19b45982f7f5785487fdbd4e Mon Sep 17 00:00:00 2001
From: avandras
Date: Fri, 23 May 2025 11:07:24 +0200
Subject: [PATCH 10/10] Add multisite.rst to TOC

---
 docs/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/index.rst b/docs/index.rst
index b7af84ce4..6d347403b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -38,6 +38,7 @@ Currently supported PostgreSQL versions: 9.3 to 17.
    tools_integration
    security
    ha_multi_dc
+   multisite
    faq
    releases
    CONTRIBUTING
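
Note on the series as a whole: the `patroni_multisite_switches` counter from
patch 06 is exposed on the REST API metrics endpoint. A hedged way to watch
it (URL and label set are illustrative; the labels reuse whatever the other
`patroni_*` metrics carry):

```python
from urllib.request import urlopen

metrics = urlopen('http://localhost:8008/metrics').read().decode()
for line in metrics.splitlines():
    if line.startswith('patroni_multisite_switches'):
        print(line)  # e.g. patroni_multisite_switches{scope="batman"} 2
```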