# summit/backend/venv/lib/python3.12/site-packages/redis/maint_notifications.py
#
# 811 lines, 29 KiB, Python

import enum
import ipaddress
import logging
import re
import threading
import time
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Literal, Optional, Union
from redis.typing import Number
class MaintenanceState(enum.Enum):
    """Maintenance states referenced by the notification handlers in this module."""

    # No maintenance state; the handlers below reset connections to this value.
    NONE = "none"
    # Applied by the pool handler while a NodeMovingNotification is in effect.
    MOVING = "moving"
    # Applied by the connection handler for "maintenance started" notifications.
    MAINTENANCE = "maintenance"
class EndpointType(enum.Enum):
    """Valid endpoint types used in CLIENT MAINT_NOTIFICATIONS command."""

    INTERNAL_IP = "internal-ip"
    INTERNAL_FQDN = "internal-fqdn"
    EXTERNAL_IP = "external-ip"
    EXTERNAL_FQDN = "external-fqdn"
    NONE = "none"

    def __str__(self) -> str:
        """Render the member as its raw string value (for example ``internal-ip``)."""
        return self.value
if TYPE_CHECKING:
from redis.connection import (
MaintNotificationsAbstractConnection,
MaintNotificationsAbstractConnectionPool,
)
class MaintenanceNotification(ABC):
    """
    Abstract base for maintenance notifications delivered as push messages
    by the Redis server.

    Every notification carries an identifier and a TTL. Timestamps are taken
    from ``time.monotonic()``, so they are only comparable with other
    monotonic readings and not with wall-clock time.

    Attributes:
        id (int): Unique identifier for this notification
        ttl (int): Time-to-live in seconds for this notification
        creation_time (float): Monotonic timestamp taken when the object was built
        expire_at (float): ``creation_time + ttl`` as computed at construction
    """

    def __init__(self, id: int, ttl: int):
        """
        Store the identifier and TTL and stamp the creation time.

        Args:
            id (int): Unique identifier for this notification
            ttl (int): Time-to-live in seconds for this notification
        """
        now = time.monotonic()
        self.id = id
        self.ttl = ttl
        self.creation_time = now
        self.expire_at = now + ttl

    def is_expired(self) -> bool:
        """
        Tell whether the TTL has elapsed since the notification was created.

        The deadline is recomputed from ``creation_time`` and ``ttl`` on every
        call instead of being read from ``expire_at``.

        Returns:
            bool: True if the notification has expired, False otherwise
        """
        deadline = self.creation_time + self.ttl
        return time.monotonic() > deadline

    @abstractmethod
    def __repr__(self) -> str:
        """
        Return a string representation of the maintenance notification.

        Concrete subclasses must provide the implementation.

        Returns:
            str: String representation of the notification
        """

    @abstractmethod
    def __eq__(self, other) -> bool:
        """
        Compare two maintenance notifications for equality.

        Concrete subclasses must provide the implementation. Notifications
        are typically considered equal when they share the same id and type.

        Args:
            other: The other object to compare with

        Returns:
            bool: True if the notifications are equal, False otherwise
        """

    @abstractmethod
    def __hash__(self) -> int:
        """
        Return a hash value for the maintenance notification.

        Concrete subclasses must provide the implementation so that instances
        can be stored in sets and used as dictionary keys.

        Returns:
            int: Hash value for the notification
        """
class NodeMovingNotification(MaintenanceNotification):
    """
    Notification received when a node is being replaced by a new node
    during cluster rebalancing or maintenance operations.
    """

    def __init__(
        self,
        id: int,
        new_node_host: Optional[str],
        new_node_port: Optional[int],
        ttl: int,
    ):
        """
        Build a NodeMovingNotification.

        Args:
            id (int): Unique identifier for this notification
            new_node_host (Optional[str]): Hostname or IP address of the
                replacement node; may be None
            new_node_port (Optional[int]): Port number of the replacement
                node; may be None
            ttl (int): Time-to-live in seconds for this notification
        """
        super().__init__(id, ttl)
        self.new_node_host = new_node_host
        self.new_node_port = new_node_port

    def __repr__(self) -> str:
        """Describe the notification, including its remaining lifetime."""
        expires_at = self.expire_at
        # Clamp at zero so an expired notification never reports negative time.
        seconds_left = max(0, expires_at - time.monotonic())
        fields = ", ".join(
            (
                f"id={self.id}",
                f"new_node_host='{self.new_node_host}'",
                f"new_node_port={self.new_node_port}",
                f"ttl={self.ttl}",
                f"creation_time={self.creation_time}",
                f"expires_at={expires_at}",
                f"remaining={seconds_left:.1f}s",
                f"expired={self.is_expired()}",
            )
        )
        return f"{self.__class__.__name__}({fields})"

    def __eq__(self, other) -> bool:
        """
        Two NodeMovingNotification instances are equal when their id,
        new_node_host and new_node_port all match. Any other type is unequal.
        """
        return (
            isinstance(other, NodeMovingNotification)
            and self.id == other.id
            and self.new_node_host == other.new_node_host
            and self.new_node_port == other.new_node_port
        )

    def __hash__(self) -> int:
        """
        Hash on the class name, id, new_node_host and new_node_port so that
        instances can be stored in sets and used as dictionary keys.

        Returns:
            int: Hash of the normalised identifying fields
        """
        port = self.new_node_port
        try:
            # A falsy port (None, 0, "") is hashed as None.
            hashed_port = int(port) if port else None
        except ValueError:
            # A port that cannot be parsed as an integer is hashed as 0.
            hashed_port = 0
        key = (
            self.__class__.__name__,
            int(self.id),
            str(self.new_node_host),
            hashed_port,
        )
        return hash(key)
class NodeMigratingNotification(MaintenanceNotification):
    """
    Notification for when a Redis cluster node is in the process of migrating slots.

    Received when a node starts migrating its slots to another node during
    cluster rebalancing or maintenance operations.

    Args:
        id (int): Unique identifier for this notification
        ttl (int): Time-to-live in seconds for this notification
    """

    def __init__(self, id: int, ttl: int):
        """Forward the id and TTL to the base class unchanged."""
        super().__init__(id, ttl)

    def __repr__(self) -> str:
        """Describe the notification, including its remaining lifetime."""
        expires_at = self.creation_time + self.ttl
        # Clamp at zero so an expired notification never reports negative time.
        seconds_left = max(0, expires_at - time.monotonic())
        fields = ", ".join(
            (
                f"id={self.id}",
                f"ttl={self.ttl}",
                f"creation_time={self.creation_time}",
                f"expires_at={expires_at}",
                f"remaining={seconds_left:.1f}s",
                f"expired={self.is_expired()}",
            )
        )
        return f"{self.__class__.__name__}({fields})"

    def __eq__(self, other) -> bool:
        """
        Two NodeMigratingNotification instances are equal when they have the
        same id and exactly the same type. Any other object is unequal.
        """
        return (
            isinstance(other, NodeMigratingNotification)
            and self.id == other.id
            and type(self) is type(other)
        )

    def __hash__(self) -> int:
        """
        Hash on the class name and id so that instances can be stored in sets
        and used as dictionary keys.

        Returns:
            int: Hash value based on notification type and id
        """
        key = (self.__class__.__name__, int(self.id))
        return hash(key)
class NodeMigratedNotification(MaintenanceNotification):
    """
    Notification for when a Redis cluster node has completed migrating slots.

    Received when a node has finished migrating all its slots to other nodes
    during cluster rebalancing or maintenance operations.

    Args:
        id (int): Unique identifier for this notification
    """

    # TTL in seconds applied to every instance; the constructor takes no TTL.
    DEFAULT_TTL = 5

    def __init__(self, id: int):
        """Create the notification with the class-level DEFAULT_TTL."""
        super().__init__(id, NodeMigratedNotification.DEFAULT_TTL)

    def __repr__(self) -> str:
        """Describe the notification, including its remaining lifetime."""
        expires_at = self.creation_time + self.ttl
        # Clamp at zero so an expired notification never reports negative time.
        seconds_left = max(0, expires_at - time.monotonic())
        fields = ", ".join(
            (
                f"id={self.id}",
                f"ttl={self.ttl}",
                f"creation_time={self.creation_time}",
                f"expires_at={expires_at}",
                f"remaining={seconds_left:.1f}s",
                f"expired={self.is_expired()}",
            )
        )
        return f"{self.__class__.__name__}({fields})"

    def __eq__(self, other) -> bool:
        """
        Two NodeMigratedNotification instances are equal when they have the
        same id and exactly the same type. Any other object is unequal.
        """
        return (
            isinstance(other, NodeMigratedNotification)
            and self.id == other.id
            and type(self) is type(other)
        )

    def __hash__(self) -> int:
        """
        Hash on the class name and id so that instances can be stored in sets
        and used as dictionary keys.

        Returns:
            int: Hash value based on notification type and id
        """
        key = (self.__class__.__name__, int(self.id))
        return hash(key)
class NodeFailingOverNotification(MaintenanceNotification):
    """
    Notification for when a Redis cluster node is in the process of failing over.

    Received when a node starts a failover process during cluster maintenance
    operations or when handling node failures.

    Args:
        id (int): Unique identifier for this notification
        ttl (int): Time-to-live in seconds for this notification
    """

    def __init__(self, id: int, ttl: int):
        """Forward the id and TTL to the base class unchanged."""
        super().__init__(id, ttl)

    def __repr__(self) -> str:
        """Describe the notification, including its remaining lifetime."""
        expires_at = self.creation_time + self.ttl
        # Clamp at zero so an expired notification never reports negative time.
        seconds_left = max(0, expires_at - time.monotonic())
        fields = ", ".join(
            (
                f"id={self.id}",
                f"ttl={self.ttl}",
                f"creation_time={self.creation_time}",
                f"expires_at={expires_at}",
                f"remaining={seconds_left:.1f}s",
                f"expired={self.is_expired()}",
            )
        )
        return f"{self.__class__.__name__}({fields})"

    def __eq__(self, other) -> bool:
        """
        Two NodeFailingOverNotification instances are equal when they have the
        same id and exactly the same type. Any other object is unequal.
        """
        return (
            isinstance(other, NodeFailingOverNotification)
            and self.id == other.id
            and type(self) is type(other)
        )

    def __hash__(self) -> int:
        """
        Hash on the class name and id so that instances can be stored in sets
        and used as dictionary keys.

        Returns:
            int: Hash value based on notification type and id
        """
        key = (self.__class__.__name__, int(self.id))
        return hash(key)
class NodeFailedOverNotification(MaintenanceNotification):
    """
    Notification for when a Redis cluster node has completed a failover.

    Received when a node has finished the failover process during cluster
    maintenance operations or after handling node failures.

    Args:
        id (int): Unique identifier for this notification
    """

    # TTL in seconds applied to every instance; the constructor takes no TTL.
    DEFAULT_TTL = 5

    def __init__(self, id: int):
        """Create the notification with the class-level DEFAULT_TTL."""
        super().__init__(id, NodeFailedOverNotification.DEFAULT_TTL)

    def __repr__(self) -> str:
        """Describe the notification, including its remaining lifetime."""
        expires_at = self.creation_time + self.ttl
        # Clamp at zero so an expired notification never reports negative time.
        seconds_left = max(0, expires_at - time.monotonic())
        fields = ", ".join(
            (
                f"id={self.id}",
                f"ttl={self.ttl}",
                f"creation_time={self.creation_time}",
                f"expires_at={expires_at}",
                f"remaining={seconds_left:.1f}s",
                f"expired={self.is_expired()}",
            )
        )
        return f"{self.__class__.__name__}({fields})"

    def __eq__(self, other) -> bool:
        """
        Two NodeFailedOverNotification instances are equal when they have the
        same id and exactly the same type. Any other object is unequal.
        """
        return (
            isinstance(other, NodeFailedOverNotification)
            and self.id == other.id
            and type(self) is type(other)
        )

    def __hash__(self) -> int:
        """
        Hash on the class name and id so that instances can be stored in sets
        and used as dictionary keys.

        Returns:
            int: Hash value based on notification type and id
        """
        key = (self.__class__.__name__, int(self.id))
        return hash(key)
def _is_private_fqdn(host: str) -> bool:
"""
Determine if an FQDN is likely to be internal/private.
This uses heuristics based on RFC 952 and RFC 1123 standards:
- .local domains (RFC 6762 - Multicast DNS)
- .internal domains (common internal convention)
- Single-label hostnames (no dots)
- Common internal TLDs
Args:
host (str): The FQDN to check
Returns:
bool: True if the FQDN appears to be internal/private
"""
host_lower = host.lower().rstrip(".")
# Single-label hostnames (no dots) are typically internal
if "." not in host_lower:
return True
# Common internal/private domain patterns
internal_patterns = [
r"\.local$", # mDNS/Bonjour domains
r"\.internal$", # Common internal convention
r"\.corp$", # Corporate domains
r"\.lan$", # Local area network
r"\.intranet$", # Intranet domains
r"\.private$", # Private domains
]
for pattern in internal_patterns:
if re.search(pattern, host_lower):
return True
# If none of the internal patterns match, assume it's external
return False
class MaintNotificationsConfig:
    """
    Settings that control how the client reacts to maintenance push
    notifications (node moving, migrations, failovers, ...).

    The object is a plain holder for the options plus two helpers:
    ``is_relaxed_timeouts_enabled`` and ``get_endpoint_type``.
    """

    def __init__(
        self,
        enabled: Union[bool, Literal["auto"]] = "auto",
        proactive_reconnect: bool = True,
        relaxed_timeout: Optional[Number] = 10,
        endpoint_type: Optional[EndpointType] = None,
    ):
        """
        Store the configuration values as given; no validation is performed.

        Args:
            enabled (bool | "auto"): Controls maintenance notifications handling behavior.
                - True: The CLIENT MAINT_NOTIFICATIONS command must succeed during connection setup,
                otherwise a ResponseError is raised.
                - "auto": The CLIENT MAINT_NOTIFICATIONS command is attempted but failures are
                gracefully handled - a warning is logged and normal operation continues.
                - False: Maintenance notifications are completely disabled.
                Defaults to "auto". (These semantics are applied by the code that
                consumes this config, not by this class.)
            proactive_reconnect (bool): Whether to proactively reconnect when a node is replaced.
                Defaults to True.
            relaxed_timeout (Number): The relaxed timeout to use for the connection during maintenance.
                If -1 is provided - the relaxed timeout is disabled. Defaults to 10.
            endpoint_type (Optional[EndpointType]): Override for the endpoint type to use in
                CLIENT MAINT_NOTIFICATIONS. If None, the endpoint type is determined by
                ``get_endpoint_type`` from the host. Defaults to None.
        """
        self.enabled = enabled
        self.proactive_reconnect = proactive_reconnect
        self.relaxed_timeout = relaxed_timeout
        self.endpoint_type = endpoint_type

    def __repr__(self) -> str:
        """List the four configuration values in a constructor-like form."""
        fields = ", ".join(
            (
                f"enabled={self.enabled}",
                f"proactive_reconnect={self.proactive_reconnect}",
                f"relaxed_timeout={self.relaxed_timeout}",
                f"endpoint_type={self.endpoint_type!r}",
            )
        )
        return f"{self.__class__.__name__}({fields})"

    def is_relaxed_timeouts_enabled(self) -> bool:
        """
        Report whether relaxed timeouts are in use.

        Only the value ``-1`` disables them. ``None`` counts as enabled; per
        the original documentation it makes operations block until a response
        is received.

        Returns:
            True if the relaxed_timeout is enabled, False otherwise.
        """
        return self.relaxed_timeout != -1

    def get_endpoint_type(
        self, host: str, connection: "MaintNotificationsAbstractConnection"
    ) -> EndpointType:
        """
        Determine the appropriate endpoint type for CLIENT MAINT_NOTIFICATIONS command.

        Resolution order:
            1. An explicitly configured ``endpoint_type`` wins.
            2. If ``host`` parses as an IP address, its ``is_private`` flag selects
               internal-ip vs external-ip.
            3. Otherwise ``host`` is treated as an FQDN and the IP reported by
               ``connection.get_resolved_ip()`` selects internal-fqdn vs external-fqdn.
            4. If no usable resolved IP is available, the naming heuristic
               ``_is_private_fqdn(host)`` decides.

        Args:
            host: User provided hostname to analyze
            connection: The connection object queried for the resolved IP

        Returns:
            EndpointType: The configured override, or the type derived from
            ``host`` and the connection as described above.
        """
        if self.endpoint_type is not None:
            return self.endpoint_type

        # ip_address() raises ValueError for anything that is not a literal IP.
        try:
            literal_ip = ipaddress.ip_address(host)
        except ValueError:
            literal_ip = None
        if literal_ip is not None:
            if literal_ip.is_private:
                return EndpointType.INTERNAL_IP
            return EndpointType.EXTERNAL_IP

        # Host is an FQDN: classify it by the address it resolved to, if known.
        private = None
        resolved_ip = connection.get_resolved_ip()
        if resolved_ip:
            try:
                private = ipaddress.ip_address(resolved_ip).is_private
            except ValueError:
                # Unparseable resolved address; fall through to the heuristic.
                private = None

        if private is None:
            # Final fallback: use heuristics on the FQDN itself
            private = _is_private_fqdn(host)

        return EndpointType.INTERNAL_FQDN if private else EndpointType.EXTERNAL_FQDN
class MaintNotificationsPoolHandler:
    """
    Pool-level handler for maintenance notifications.

    Only NodeMovingNotification is acted upon. Handling it updates the
    settings of existing connections and the kwargs used for new connections
    on ``self.pool``, then schedules a timer that reverts those changes once
    the notification's TTL has elapsed.

    The set of already-processed notifications and the re-entrant lock that
    guards it are shared between a handler and the per-connection copies made
    by ``get_handler_for_connection``.
    """

    def __init__(
        self,
        pool: "MaintNotificationsAbstractConnectionPool",
        config: MaintNotificationsConfig,
    ) -> None:
        """
        Args:
            pool: Connection pool whose connections and kwargs are updated
            config: Maintenance notifications configuration
        """
        self.pool = pool
        self.config = config
        self._processed_notifications = set()
        self._lock = threading.RLock()
        self.connection = None

    def set_connection(self, connection: "MaintNotificationsAbstractConnection"):
        """Remember the connection this handler instance is attached to."""
        self.connection = connection

    def get_handler_for_connection(self):
        """
        Create a handler for a single connection.

        The new handler shares the pool, config, processed-notification set
        and lock with this one, but starts without a connection, because each
        connection can be in a different state.
        """
        handler = MaintNotificationsPoolHandler(self.pool, self.config)
        handler._processed_notifications = self._processed_notifications
        handler._lock = self._lock
        handler.connection = None
        return handler

    def remove_expired_notifications(self):
        """Drop every processed notification whose TTL has elapsed."""
        with self._lock:
            # Iterate over a snapshot because the set is mutated in the loop.
            for notification in tuple(self._processed_notifications):
                if notification.is_expired():
                    self._processed_notifications.remove(notification)

    def handle_notification(self, notification: MaintenanceNotification):
        """
        Purge expired notifications, then dispatch ``notification``.

        Any type other than NodeMovingNotification is logged as an error and
        otherwise ignored.
        """
        self.remove_expired_notifications()

        if isinstance(notification, NodeMovingNotification):
            return self.handle_node_moving_notification(notification)
        else:
            logging.error(f"Unhandled notification type: {notification}")

    def handle_node_moving_notification(self, notification: NodeMovingNotification):
        """
        Apply a node-moving notification to the pool.

        Does nothing when both proactive reconnect and relaxed timeouts are
        disabled, or when an equal notification has already been processed.
        Otherwise, while holding the handler lock and the pool lock, it:

        1. switches the pool into maintenance mode if the pool exposes
           ``set_in_maintenance`` (always switched back, even on error),
        2. updates the settings of connections matching the current peer
           address,
        3. runs (or schedules, when no new host was provided) a proactive
           reconnect if enabled,
        4. updates the kwargs used for new connections,
        5. schedules ``handle_node_moved_notification`` after the TTL.

        Args:
            notification: The node-moving notification to process
        """
        if (
            not self.config.proactive_reconnect
            and not self.config.is_relaxed_timeouts_enabled()
        ):
            return

        with self._lock:
            if notification in self._processed_notifications:
                # nothing to do in the connection pool handling
                # the notification has already been handled or is expired
                # just return
                return

            notification_hash = hash(notification)

            with self.pool._lock:
                # Get the current connected address - if any
                # This is the address that is being moved
                # and we need to handle only connections
                # connected to the same address
                moving_address_src = (
                    self.connection.getpeername() if self.connection else None
                )

                # Only some pools expose set_in_maintenance (the original
                # comment names BlockingConnectionPool).
                set_in_maintenance = getattr(self.pool, "set_in_maintenance", None)
                if set_in_maintenance:
                    set_in_maintenance(True)
                try:
                    # Update maintenance state, timeout and optionally host address
                    # connection settings for matching connections
                    self.pool.update_connections_settings(
                        state=MaintenanceState.MOVING,
                        maintenance_notification_hash=notification_hash,
                        relaxed_timeout=self.config.relaxed_timeout,
                        host_address=notification.new_node_host,
                        matching_address=moving_address_src,
                        matching_pattern="connected_address",
                        update_notification_hash=True,
                        include_free_connections=True,
                    )

                    if self.config.proactive_reconnect:
                        if notification.new_node_host is not None:
                            self.run_proactive_reconnect(moving_address_src)
                        else:
                            # No target host was provided: postpone the
                            # reconnect until half of the TTL has passed.
                            threading.Timer(
                                notification.ttl / 2,
                                self.run_proactive_reconnect,
                                args=(moving_address_src,),
                            ).start()

                    # Update config for new connections:
                    # Set state to MOVING
                    # update host
                    # if relax timeouts are enabled - update timeouts
                    kwargs: dict = {
                        "maintenance_state": MaintenanceState.MOVING,
                        "maintenance_notification_hash": notification_hash,
                    }
                    if notification.new_node_host is not None:
                        # the host is not updated if the new node host is None
                        # this happens when the MOVING push notification does not contain
                        # the new node host - in this case we only update the timeouts
                        kwargs["host"] = notification.new_node_host
                    if self.config.is_relaxed_timeouts_enabled():
                        kwargs["socket_timeout"] = self.config.relaxed_timeout
                        kwargs["socket_connect_timeout"] = self.config.relaxed_timeout
                    self.pool.update_connection_kwargs(**kwargs)
                finally:
                    # Never leave the pool stuck in maintenance mode if one of
                    # the updates above raises.
                    if set_in_maintenance:
                        set_in_maintenance(False)

                # Revert the changes once the notification's TTL has elapsed.
                threading.Timer(
                    notification.ttl,
                    self.handle_node_moved_notification,
                    args=(notification,),
                ).start()

            self._processed_notifications.add(notification)

    def run_proactive_reconnect(self, moving_address_src: Optional[str] = None):
        """
        Run proactive reconnect for the pool.

        Active connections are marked for reconnect after they complete the current command.
        Inactive connections are disconnected and will be connected on next use.

        Args:
            moving_address_src: Address being moved; forwarded to the pool so
                that only connections to that address are affected
        """
        with self._lock:
            with self.pool._lock:
                # take care for the active connections in the pool
                # mark them for reconnect after they complete the current command
                self.pool.update_active_connections_for_reconnect(
                    moving_address_src=moving_address_src,
                )
                # take care for the inactive connections in the pool
                # delete them and create new ones
                self.pool.disconnect_free_connections(
                    moving_address_src=moving_address_src,
                )

    def handle_node_moved_notification(self, notification: NodeMovingNotification):
        """
        Handle the cleanup after a node moving notification expires.

        The pool's connection kwargs are restored from their ``orig_*``
        entries only if they still carry this notification's hash. Existing
        connections tagged with this notification's hash are then reset.

        Args:
            notification: The node-moving notification whose TTL has elapsed
        """
        notification_hash = hash(notification)

        with self._lock:
            # if the current maintenance_notification_hash in kwargs is not matching the notification
            # it means there has been a new moving notification after this one
            # and we don't need to revert the kwargs yet
            if (
                self.pool.connection_kwargs.get("maintenance_notification_hash")
                == notification_hash
            ):
                orig_host = self.pool.connection_kwargs.get("orig_host_address")
                orig_socket_timeout = self.pool.connection_kwargs.get(
                    "orig_socket_timeout"
                )
                orig_connect_timeout = self.pool.connection_kwargs.get(
                    "orig_socket_connect_timeout"
                )
                kwargs: dict = {
                    "maintenance_state": MaintenanceState.NONE,
                    "maintenance_notification_hash": None,
                    "host": orig_host,
                    "socket_timeout": orig_socket_timeout,
                    "socket_connect_timeout": orig_connect_timeout,
                }
                self.pool.update_connection_kwargs(**kwargs)

            with self.pool._lock:
                reset_relaxed_timeout = self.config.is_relaxed_timeouts_enabled()
                reset_host_address = self.config.proactive_reconnect

                self.pool.update_connections_settings(
                    relaxed_timeout=-1,
                    state=MaintenanceState.NONE,
                    maintenance_notification_hash=None,
                    matching_notification_hash=notification_hash,
                    matching_pattern="notification_hash",
                    update_notification_hash=True,
                    reset_relaxed_timeout=reset_relaxed_timeout,
                    reset_host_address=reset_host_address,
                    include_free_connections=True,
                )
class MaintNotificationsConnectionHandler:
    """
    Connection-level handler for maintenance notifications.

    Reacts to migrating / failing-over notifications by relaxing the
    connection's timeouts, and to migrated / failed-over notifications by
    resetting them. Nothing is changed while the connection is in the MOVING
    state or when relaxed timeouts are disabled in the config.
    """

    # 1 = "starting maintenance" notifications, 0 = "completed maintenance" notifications
    _NOTIFICATION_TYPES: dict[type["MaintenanceNotification"], int] = {
        NodeMigratingNotification: 1,
        NodeFailingOverNotification: 1,
        NodeMigratedNotification: 0,
        NodeFailedOverNotification: 0,
    }

    def __init__(
        self,
        connection: "MaintNotificationsAbstractConnection",
        config: MaintNotificationsConfig,
    ) -> None:
        """
        Args:
            connection: The connection whose state and timeouts are managed
            config: Maintenance notifications configuration
        """
        self.config = config
        self.connection = connection

    def handle_notification(self, notification: MaintenanceNotification):
        """
        Dispatch a notification by its exact class.

        Classes missing from ``_NOTIFICATION_TYPES`` (subclasses included,
        since the lookup uses ``notification.__class__``) are logged as an
        error and ignored.
        """
        starts_maintenance = self._NOTIFICATION_TYPES.get(notification.__class__)

        if starts_maintenance is None:
            logging.error(f"Unhandled notification type: {notification}")
            return

        if not starts_maintenance:
            self.handle_maintenance_completed_notification()
        else:
            self.handle_maintenance_start_notification(MaintenanceState.MAINTENANCE)

    def handle_maintenance_start_notification(
        self, maintenance_state: MaintenanceState
    ):
        """
        Put the connection into ``maintenance_state`` and relax its timeouts.

        Skipped when the connection is in the MOVING state or relaxed
        timeouts are disabled.
        """
        is_moving = self.connection.maintenance_state == MaintenanceState.MOVING
        if is_moving or not self.config.is_relaxed_timeouts_enabled():
            return

        relaxed_timeout = self.config.relaxed_timeout
        self.connection.maintenance_state = maintenance_state
        self.connection.set_tmp_settings(tmp_relaxed_timeout=relaxed_timeout)
        # extend the timeout for all created connections
        self.connection.update_current_socket_timeout(relaxed_timeout)

    def handle_maintenance_completed_notification(self):
        """
        Reset the relaxed timeouts and return the connection to state NONE.

        Skipped when the connection is in the MOVING state or relaxed
        timeouts are disabled.
        """
        is_moving = self.connection.maintenance_state == MaintenanceState.MOVING
        if is_moving or not self.config.is_relaxed_timeouts_enabled():
            return

        self.connection.reset_tmp_settings(reset_relaxed_timeout=True)
        # Maintenance completed - reset the connection
        # timeouts by providing -1 as the relaxed timeout
        self.connection.update_current_socket_timeout(-1)
        self.connection.maintenance_state = MaintenanceState.NONE