Skip to content

Commit

Permalink
More resilient topology observer (#200)
Browse files: browse the repository at this point in the history
  • Loading branch information
dragomirp authored Aug 5, 2023
1 parent f72c929 commit 0358c37
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 8 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ jobs:
agent-versions:
- "2.9.44" # renovate: latest juju 2
- "3.1.5" # renovate: latest juju 3
include:
- tox-environments: db-admin-relation-integration
agent-versions: "2.9.43" # renovate: latest juju 2
# include:
# - tox-environments: db-admin-relation-integration
# agent-versions: "2.9.43" # renovate: latest juju 2
name: ${{ matrix.tox-environments }} | ${{ matrix.agent-versions }}
needs:
- lib-check
Expand Down
3 changes: 3 additions & 0 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1154,6 +1154,9 @@ def _on_update_status(self, _) -> None:

self._set_primary_status_message()

# Restart topology observer if it is gone
self._observer.start_observer()

def _handle_processes_failures(self) -> bool:
"""Handle Patroni and PostgreSQL OS processes failures.
Expand Down
14 changes: 9 additions & 5 deletions src/cluster_topology_observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,16 @@ def __init__(self, charm: CharmBase):

def start_observer(self):
"""Start the cluster topology observer running in a new process."""
if (
not isinstance(self._charm.unit.status, ActiveStatus)
or self._charm._peers is None
or "observer-pid" in self._charm._peers.data[self._charm.unit]
):
if not isinstance(self._charm.unit.status, ActiveStatus) or self._charm._peers is None:
return
if "observer-pid" in self._charm._peers.data[self._charm.unit]:
# Double check that the PID exists
pid = int(self._charm._peers.data[self._charm.unit]["observer-pid"])
try:
os.kill(pid, 0)
return
except OSError:
pass

logging.info("Starting cluster topology observer process")

Expand Down
5 changes: 5 additions & 0 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,7 @@ def test_on_set_password(
)

@patch_network_get(private_address="1.1.1.1")
@patch("charm.ClusterTopologyObserver.start_observer")
@patch("charm.PostgresqlOperatorCharm._set_primary_status_message")
@patch("charm.Patroni.restart_patroni")
@patch("charm.Patroni.is_member_isolated")
Expand All @@ -583,6 +584,7 @@ def test_on_update_status(
_is_member_isolated,
_restart_patroni,
_set_primary_status_message,
_start_observer,
):
# Test before the cluster is initialised.
self.charm.on.update_status.emit()
Expand Down Expand Up @@ -625,8 +627,10 @@ def test_on_update_status(
)
self.charm.on.update_status.emit()
_restart_patroni.assert_called_once()
_start_observer.assert_called_once()

@patch_network_get(private_address="1.1.1.1")
@patch("charm.ClusterTopologyObserver.start_observer")
@patch("charm.PostgresqlOperatorCharm._set_primary_status_message")
@patch("charm.PostgresqlOperatorCharm._handle_workload_failures")
@patch("charm.PostgresqlOperatorCharm._update_relation_endpoints")
Expand All @@ -652,6 +656,7 @@ def test_on_update_status_after_restore_operation(
_update_relation_endpoints,
_handle_workload_failures,
_set_primary_status_message,
_,
):
# Test when the restore operation fails.
with self.harness.hooks_disabled():
Expand Down

0 comments on commit 0358c37

Please sign in to comment.