Skip to content

Commit

Permalink
Merge branch 'develop' into allowlist_2
Browse files Browse the repository at this point in the history
  • Loading branch information
mgunnala authored Dec 16, 2024
2 parents 4a0a4ef + 50d37f8 commit daa8017
Show file tree
Hide file tree
Showing 18 changed files with 744 additions and 263 deletions.
59 changes: 12 additions & 47 deletions azurelinuxagent/common/protocol/goal_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,6 @@ class GoalStateProperties(object):
All = RoleConfig | HostingEnv | SharedConfig | ExtensionsGoalState | Certificates | RemoteAccessInfo


class GoalStateInconsistentError(ProtocolError):
    """
    Raised when the goal state is found to be inconsistent; one example is a
    goal state in which the tenant certificate is missing.
    """
    def __init__(self, msg, inner=None):
        # Forward the message and the (optional) inner exception to the ProtocolError base.
        ProtocolError.__init__(self, msg, inner)


class GoalState(object):
def __init__(self, wire_client, goal_state_properties=GoalStateProperties.All, silent=False, save_to_history=False):
"""
Expand Down Expand Up @@ -201,26 +193,12 @@ def update_host_plugin_headers(wire_client):
# Fetching the goal state updates the HostGAPlugin so simply trigger the request
GoalState._fetch_goal_state(wire_client)

def update(self, silent=False):
def update(self, force_update=False, silent=False):
"""
Updates the current GoalState instance fetching values from the WireServer/HostGAPlugin as needed
"""
self.logger.silent = silent

try:
self._update(force_update=False)
except GoalStateInconsistentError as e:
message = "Detected an inconsistency in the goal state: {0}".format(ustr(e))
self.logger.warn(message)
add_event(op=WALAEventOperation.GoalState, is_success=False, log_event=False, message=message)

self._update(force_update=True)

message = "The goal state is consistent"
self.logger.info(message)
add_event(op=WALAEventOperation.GoalState, message=message)

def _update(self, force_update):
#
# Fetch the goal state from both the HGAP and the WireServer
#
Expand Down Expand Up @@ -282,40 +260,27 @@ def _update(self, force_update):
#
# Lastly, decide whether to use the vmSettings or extensionsConfig for the extensions goal state
#
if goal_state_updated and vm_settings_updated:
most_recent = vm_settings if vm_settings.created_on_timestamp > extensions_config.created_on_timestamp else extensions_config
elif goal_state_updated:
most_recent = extensions_config
if goal_state_updated:
# On rotation of the tenant certificate the vmSettings and extensionsConfig are not updated. However, the incarnation of the WS goal state is updated, so 'goal_state_updated' will be True.
# In this case, we should use the most recent of vmSettings and extensionsConfig.
if vm_settings is not None:
most_recent = vm_settings if vm_settings.created_on_timestamp > extensions_config.created_on_timestamp else extensions_config
else:
most_recent = extensions_config
else: # vm_settings_updated
most_recent = vm_settings

if self._extensions_goal_state is None or most_recent.created_on_timestamp >= self._extensions_goal_state.created_on_timestamp:
self._extensions_goal_state = most_recent

#
# For Fast Track goal states, verify that the required certificates are in the goal state.
#
# Some scenarios can produce inconsistent goal states. For example, during hibernation/resume, the Fabric goal state changes (the
# tenant certificate is re-generated when the VM is restarted) *without* the incarnation necessarily changing (e.g. if the incarnation
# is 1 before the hibernation; on resume the incarnation is set to 1 even though the goal state has a new certificate). If a Fast
# Track goal state comes after that, the extensions will need the new certificate. The Agent needs to refresh the goal state in that
# case, to ensure it fetches the new certificate.
# Ensure all certificates are downloaded on Fast Track goal states in order to maintain backwards compatibility with previous
# versions of the Agent, which used to download certificates from the WireServer on every goal state. Some customer applications
# depend on this behavior (see https://github.com/Azure/WALinuxAgent/issues/2750).
#
if self._extensions_goal_state.source == GoalStateSource.FastTrack and self._goal_state_properties & GoalStateProperties.Certificates:
self._check_certificates()
self._check_and_download_missing_certs_on_disk()

def _check_certificates(self):
    """
    Verifies that every extension setting that carries protected settings references a certificate
    whose thumbprint is listed in self.certs.summary.

    Raises GoalStateInconsistentError for the first setting whose certificate is not in the summary.
    """
    for ext in self.extensions_goal_state.extensions:
        for ext_settings in ext.settings:
            # Only settings with protected settings need a certificate
            if ext_settings.protectedSettings is not None:
                # The summary is looked up only when a protected setting is found (not hoisted out of the loop)
                thumbprint_found = any(ext_settings.certificateThumbprint == cert['thumbprint'] for cert in self.certs.summary)
                if not thumbprint_found:
                    raise GoalStateInconsistentError(
                        "Certificate {0} needed by {1} is missing from the goal state".format(ext_settings.certificateThumbprint, ext.name))

def _download_certificates(self, certs_uri):
xml_text = self._wire_client.fetch_config(certs_uri, self._wire_client.get_header_for_cert())
certs = Certificates(xml_text, self.logger)
Expand Down Expand Up @@ -524,7 +489,7 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc):
self.logger.warn("Fetching the goal state failed: {0}", ustr(exception))
raise ProtocolError(msg="Error fetching goal state", inner=exception)
finally:
message = 'Fetch goal state completed'
message = 'Fetch goal state from WireServer completed'
self.logger.info(message)
add_event(op=WALAEventOperation.GoalState, message=message)

Expand Down
21 changes: 6 additions & 15 deletions azurelinuxagent/common/protocol/wire.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@
from azurelinuxagent.common.exception import ProtocolNotFoundError, \
ResourceGoneError, ExtensionDownloadError, InvalidContainerError, ProtocolError, HttpError, ExtensionErrorCodes
from azurelinuxagent.common.future import httpclient, bytebuffer, ustr
from azurelinuxagent.common.protocol.goal_state import GoalState, TRANSPORT_CERT_FILE_NAME, TRANSPORT_PRV_FILE_NAME, \
GoalStateProperties, GoalStateInconsistentError
from azurelinuxagent.common.protocol.goal_state import GoalState, TRANSPORT_CERT_FILE_NAME, TRANSPORT_PRV_FILE_NAME, GoalStateProperties
from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol
from azurelinuxagent.common.protocol.restapi import DataContract, ProvisionStatus, VMInfo, VMStatus
from azurelinuxagent.common.telemetryevent import GuestAgentExtensionEventsSchema
Expand Down Expand Up @@ -92,18 +91,10 @@ def detect(self, init_goal_state=True, save_to_history=False):
# TODO: Currently protocol detection retrieves the entire goal state. This is not needed; in particular, retrieving the Extensions goal state
# is not needed. However, the goal state is cached in self.client._goal_state and other components, including the Extension Handler,
# depend on this cached value. This has been a long-standing issue that causes multiple problems. Before removing the cached goal state,
# though, a careful review of these dependencies is needed.
# though, a careful review of these dependencies is needed. One of the problems of fetching the full goal state is that issues while
# retrieving it can block protocol detection and make the Agent go into a retry loop that can last 1 full hour.
#
# One of the problems of fetching the full goal state is that issues while retrieving it can block protocol detection and make the
# Agent go into a retry loop that can last 1 full hour. One particular error, GoalStateInconsistentError, can arise if the certificates
# needed by extensions are missing from the goal state; for example, if a FastTrack goal state is out of sync with the corresponding
# Fabric goal state that contains the certificates, or if decryption of the certificates fails (and hence, the certificate list is
# empty). The try/except below handles only this one particular problem.
#
try:
self.client.reset_goal_state(save_to_history=save_to_history)
except GoalStateInconsistentError as error:
logger.warn("{0}", ustr(error))
self.client.reset_goal_state(save_to_history=save_to_history)

def update_host_plugin_from_goal_state(self):
self.client.update_host_plugin_from_goal_state()
Expand Down Expand Up @@ -794,15 +785,15 @@ def update_host_plugin(self, container_id, role_config_name):
self._host_plugin.update_container_id(container_id)
self._host_plugin.update_role_config_name(role_config_name)

def update_goal_state(self, silent=False, save_to_history=False):
def update_goal_state(self, force_update=False, silent=False, save_to_history=False):
"""
Updates the goal state if the incarnation or etag changed
"""
try:
if self._goal_state is None:
self._goal_state = GoalState(self, silent=silent, save_to_history=save_to_history)
else:
self._goal_state.update(silent=silent)
self._goal_state.update(force_update=force_update, silent=silent)

except ProtocolError:
raise
Expand Down
84 changes: 68 additions & 16 deletions azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import get_osutil, systemd
from azurelinuxagent.ga.persist_firewall_rules import PersistFirewallRulesHandler
from azurelinuxagent.common.protocol.goal_state import GoalStateSource
from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol, VmSettingsNotSupported
from azurelinuxagent.common.protocol.restapi import VERSION_0
from azurelinuxagent.common.protocol.util import get_protocol_util
Expand Down Expand Up @@ -154,7 +155,7 @@ def __init__(self):
# these members are used to avoid reporting errors too frequently
self._heartbeat_update_goal_state_error_count = 0
self._update_goal_state_error_count = 0
self._update_goal_state_last_error_report = datetime.min
self._update_goal_state_next_error_report = datetime.min
self._report_status_last_failed_goal_state = None

# incarnation of the last goal state that has been fully processed
Expand Down Expand Up @@ -440,8 +441,10 @@ def _initialize_goal_state(self, protocol):
#
# Block until we can fetch the first goal state (self._try_update_goal_state() does its own logging and error handling).
#
event.info(WALAEventOperation.GoalState, "Initializing the goal state...")
while not self._try_update_goal_state(protocol):
time.sleep(conf.get_goal_state_period())
event.info(WALAEventOperation.GoalState, "Goal state initialization completed.")

#
# If FastTrack is disabled we need to check if the current goal state (which will be retrieved using the WireServer and
Expand All @@ -453,7 +456,9 @@ def _initialize_goal_state(self, protocol):
egs = protocol.client.get_goal_state().extensions_goal_state
if egs.created_on_timestamp < last_fast_track_timestamp:
egs.is_outdated = True
logger.info("The current Fabric goal state is older than the most recent FastTrack goal state; will skip it.\nFabric: {0}\nFastTrack: {1}",
event.info(
WALAEventOperation.GoalState,
"The current Fabric goal state is older than the most recent FastTrack goal state; will skip it.\nFabric: {0}\nFastTrack: {1}",
egs.created_on_timestamp, last_fast_track_timestamp)

def _wait_for_cloud_init(self):
Expand Down Expand Up @@ -495,41 +500,88 @@ def _try_update_goal_state(self, protocol):
"""
Attempts to update the goal state and returns True on success or False on failure, sending telemetry events about the failures.
"""
max_errors_to_log = 3

try:
max_errors_to_log = 3
#
# For Fast Track goal states we need to ensure that the tenant certificate is in the goal state.
#
# Some scenarios can produce inconsistent goal states. For example, during hibernation/resume, the Fabric goal state changes (the
# tenant certificate is re-generated when the VM is restarted) *without* the incarnation necessarily changing (e.g. if the incarnation
# is 1 before the hibernation; on resume the incarnation is set to 1 even though the goal state has a new certificate). If a Fast
# Track goal state comes after that, the extensions will need the new certificate.
#
# For new Fast Track goal states, we check the certificates and, if an inconsistency is detected, re-fetch the entire goal state
# (update_goal_state(force_update=True)). We re-fetch 2 times, one without waiting (to address scenarios like hibernation) and one with
# a delay (to address situations in which the HGAP and the WireServer are temporarily out of sync).
#
for attempt in range(3):
protocol.client.update_goal_state(force_update=attempt > 0, silent=self._update_goal_state_error_count >= max_errors_to_log, save_to_history=True)

protocol.client.update_goal_state(silent=self._update_goal_state_error_count >= max_errors_to_log, save_to_history=True)
goal_state = protocol.get_goal_state()
new_goal_state = self._goal_state is None or self._goal_state.extensions_goal_state.id != goal_state.extensions_goal_state.id

if not new_goal_state or goal_state.extensions_goal_state.source != GoalStateSource.FastTrack:
break

if self._check_certificates(goal_state):
if attempt > 0:
event.info(WALAEventOperation.FetchGoalState, "The extensions goal state is now in sync with the tenant cert.")
break

if attempt == 0:
event.info(WALAEventOperation.FetchGoalState, "The extensions are out of sync with the tenant cert. Will refresh the goal state.")
elif attempt == 1:
event.info(WALAEventOperation.FetchGoalState, "The extensions are still out of sync with the tenant cert. Will refresh the goal state one more time after a short delay.")
time.sleep(conf.get_goal_state_period())
else:
event.warn(WALAEventOperation.FetchGoalState, "The extensions are still out of sync with the tenant cert. Will continue execution, but some extensions may fail.")
break

self._goal_state = protocol.get_goal_state()

if self._update_goal_state_error_count > 0:
message = u"Fetching the goal state recovered from previous errors. Fetched {0} (certificates: {1})".format(
event.info(
WALAEventOperation.FetchGoalState,
"Fetching the goal state recovered from previous errors. Fetched {0} (certificates: {1})",
self._goal_state.extensions_goal_state.id, self._goal_state.certs.summary)
add_event(AGENT_NAME, op=WALAEventOperation.FetchGoalState, version=CURRENT_VERSION, is_success=True, message=message, log_event=False)
logger.info(message)
self._update_goal_state_error_count = 0

try:
self._supports_fast_track = conf.get_enable_fast_track() and protocol.client.get_host_plugin().check_vm_settings_support()
except VmSettingsNotSupported:
self._supports_fast_track = False

return True

except Exception as e:
self._update_goal_state_error_count += 1
self._heartbeat_update_goal_state_error_count += 1
if self._update_goal_state_error_count <= max_errors_to_log:
message = u"Error fetching the goal state: {0}".format(textutil.format_exception(e))
logger.error(message)
add_event(op=WALAEventOperation.FetchGoalState, is_success=False, message=message, log_event=False)
self._update_goal_state_last_error_report = datetime.now()
# Report up to 'max_errors_to_log' immediately
self._update_goal_state_next_error_report = datetime.now()
event.error(WALAEventOperation.FetchGoalState, "Error fetching the goal state: {0}", textutil.format_exception(e))
else:
if self._update_goal_state_last_error_report + timedelta(hours=6) > datetime.now():
self._update_goal_state_last_error_report = datetime.now()
message = u"Fetching the goal state is still failing: {0}".format(textutil.format_exception(e))
logger.error(message)
add_event(op=WALAEventOperation.FetchGoalState, is_success=False, message=message, log_event=False)
# Report one single periodic error every 6 hours
if datetime.now() >= self._update_goal_state_next_error_report:
self._update_goal_state_next_error_report = datetime.now() + timedelta(hours=6)
event.error(WALAEventOperation.FetchGoalState, "Fetching the goal state is still failing: {0}", textutil.format_exception(e))
return False

@staticmethod
def _check_certificates(goal_state):
    """
    Returns True if every extension setting that carries protected settings references a certificate
    whose thumbprint is listed in goal_state.certs.summary.

    On the first setting whose certificate is not in the summary, emits a warning event and returns False.
    """
    for ext in goal_state.extensions_goal_state.extensions:
        for ext_settings in ext.settings:
            # Only settings with protected settings need a certificate
            if ext_settings.protectedSettings is not None:
                # The summary is looked up only when a protected setting is found (not hoisted out of the loop)
                thumbprint_found = any(ext_settings.certificateThumbprint == cert['thumbprint'] for cert in goal_state.certs.summary)
                if not thumbprint_found:
                    event.warn(
                        WALAEventOperation.FetchGoalState,
                        "The extensions goal state is out of sync with the tenant cert. Certificate {0}, needed by {1}, is missing.",
                        ext_settings.certificateThumbprint, ext.name)
                    return False
    return True

def _processing_new_incarnation(self):
Expand Down
Loading

0 comments on commit daa8017

Please sign in to comment.