-
Notifications
You must be signed in to change notification settings - Fork 70
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: replace standard logging with structured logging #122
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -4,17 +4,34 @@ | |||||
import time | ||||||
import urllib | ||||||
import socket | ||||||
import structlog | ||||||
import orjson | ||||||
|
||||||
import botocore | ||||||
import boto3 | ||||||
|
||||||
|
||||||
slogger = structlog.get_logger() | ||||||
# use structlog's production-ready, performant example config | ||||||
# ref: https://www.structlog.org/en/stable/performance.html#example | ||||||
structlog.configure( | ||||||
cache_logger_on_first_use=True, | ||||||
wrapper_class=structlog.make_filtering_bound_logger(logging.INFO), | ||||||
processors=[ | ||||||
structlog.contextvars.merge_contextvars, | ||||||
structlog.processors.add_log_level, | ||||||
structlog.processors.format_exc_info, | ||||||
structlog.processors.TimeStamper(fmt="iso", utc=True), | ||||||
structlog.processors.EventRenamer("message"), | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By default, this value is "event" though that term seems to be loaded since the Lambda accepts "events" and I thought that maybe Either way, I don't have a strong opinion on whether it stays as this, we leave it as the default There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
structlog.processors.JSONRenderer(serializer=orjson.dumps) | ||||||
], | ||||||
logger_factory=structlog.BytesLoggerFactory() | ||||||
) | ||||||
|
||||||
# logger is still needed to set the level for dependencies | ||||||
logger = logging.getLogger() | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this line still needed? |
||||||
logger.setLevel(logging.INFO) | ||||||
logging.getLogger('boto3').setLevel(logging.CRITICAL) | ||||||
logging.getLogger('botocore').setLevel(logging.CRITICAL) | ||||||
Comment on lines
31
to
33
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I found this guide with regards to working with the standard library, but I wasn't sure how I could make this applicable here. Otherwise, third party deps will continue using the standard logger unless someone can figure out if wrapping the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My colleague suggests that we could use https://nhairs.github.io/python-json-logger/latest/ as the formatter for these handlers. e.g. something like:
Do you think this would work? |
||||||
|
||||||
|
||||||
ec2_client = boto3.client("ec2") | ||||||
|
||||||
LIFECYCLE_KEY = "LifecycleHookName" | ||||||
|
@@ -51,18 +68,18 @@ def get_az_and_vpc_zone_identifier(auto_scaling_group): | |||||
try: | ||||||
asg_objects = autoscaling.describe_auto_scaling_groups(AutoScalingGroupNames=[auto_scaling_group]) | ||||||
except botocore.exceptions.ClientError as error: | ||||||
logger.error("Unable to describe autoscaling groups") | ||||||
slogger.error("Unable to describe autoscaling groups") | ||||||
raise error | ||||||
|
||||||
if asg_objects["AutoScalingGroups"] and len(asg_objects["AutoScalingGroups"]) > 0: | ||||||
asg = asg_objects["AutoScalingGroups"][0] | ||||||
logger.debug("Auto Scaling Group: %s", asg) | ||||||
slogger.debug("Auto Scaling Group: %s", asg) | ||||||
|
||||||
availability_zone = asg["AvailabilityZones"][0] | ||||||
logger.debug("Availability Zone: %s", availability_zone) | ||||||
slogger.debug("Availability Zone: %s", availability_zone) | ||||||
|
||||||
vpc_zone_identifier = asg["VPCZoneIdentifier"] | ||||||
logger.debug("VPC zone identifier: %s", vpc_zone_identifier) | ||||||
slogger.debug("VPC zone identifier: %s", vpc_zone_identifier) | ||||||
|
||||||
return availability_zone, vpc_zone_identifier | ||||||
|
||||||
|
@@ -73,18 +90,18 @@ def get_vpc_id(route_table): | |||||
try: | ||||||
route_tables = ec2_client.describe_route_tables(RouteTableIds=[route_table]) | ||||||
except botocore.exceptions.ClientError as error: | ||||||
logger.error("Unable to get vpc id") | ||||||
slogger.error("Unable to get vpc id") | ||||||
raise error | ||||||
if "RouteTables" in route_tables and len(route_tables["RouteTables"]) == 1: | ||||||
vpc_id = route_tables["RouteTables"][0]["VpcId"] | ||||||
logger.debug("VPC ID: %s", vpc_id) | ||||||
slogger.debug("VPC ID: %s", vpc_id) | ||||||
return vpc_id | ||||||
|
||||||
|
||||||
def get_nat_gateway_id(vpc_id, subnet_id): | ||||||
nat_gateway_id = os.getenv("NAT_GATEWAY_ID") | ||||||
if nat_gateway_id: | ||||||
logger.info("Using NAT_GATEWAY_ID env. variable (%s)", nat_gateway_id) | ||||||
slogger.info("Using NAT_GATEWAY_ID env. variable (%s)", nat_gateway_id) | ||||||
return nat_gateway_id | ||||||
|
||||||
try: | ||||||
|
@@ -101,15 +118,15 @@ def get_nat_gateway_id(vpc_id, subnet_id): | |||||
] | ||||||
) | ||||||
except botocore.exceptions.ClientError as error: | ||||||
logger.error("Unable to describe nat gateway") | ||||||
slogger.error("Unable to describe nat gateway") | ||||||
raise error | ||||||
|
||||||
logger.debug("NAT Gateways: %s", nat_gateways) | ||||||
slogger.debug("NAT Gateways: %s", nat_gateways) | ||||||
if len(nat_gateways.get("NatGateways")) < 1: | ||||||
raise MissingNatGatewayError(nat_gateways) | ||||||
|
||||||
nat_gateway_id = nat_gateways['NatGateways'][0]["NatGatewayId"] | ||||||
logger.debug("NAT Gateway ID: %s", nat_gateway_id) | ||||||
slogger.debug("NAT Gateway ID: %s", nat_gateway_id) | ||||||
return nat_gateway_id | ||||||
|
||||||
|
||||||
|
@@ -120,10 +137,10 @@ def replace_route(route_table_id, nat_gateway_id): | |||||
"RouteTableId": route_table_id | ||||||
} | ||||||
try: | ||||||
logger.info("Replacing existing route %s for route table %s", route_table_id, new_route_table) | ||||||
slogger.info("Replacing existing route %s for route table %s", route_table_id, new_route_table) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This line prints a dictionary ( Let's update the message so it's more accurate and doesn't include a nested dictionary. It will print more cleanly in the output. It does break backward compatibility for folks monitoring for this specific phrasing, which is too bad, but worthwhile to fix. May as well do it now.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, most folks should be monitoring for |
||||||
ec2_client.replace_route(**new_route_table) | ||||||
except botocore.exceptions.ClientError as error: | ||||||
logger.error("Unable to replace route") | ||||||
slogger.error("Unable to replace route") | ||||||
raise error | ||||||
|
||||||
|
||||||
|
@@ -138,17 +155,17 @@ def check_connection(check_urls): | |||||
req = urllib.request.Request(url) | ||||||
req.add_header('User-Agent', 'alternat/1.0') | ||||||
urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) | ||||||
logger.debug("Successfully connected to %s", url) | ||||||
slogger.debug("Successfully connected to %s", url) | ||||||
return True | ||||||
except urllib.error.HTTPError as error: | ||||||
logger.warning("Response error from %s: %s, treating as success", url, error) | ||||||
slogger.warning("Response error from %s: %s, treating as success", url, error) | ||||||
return True | ||||||
except urllib.error.URLError as error: | ||||||
logger.error("error connecting to %s: %s", url, error) | ||||||
slogger.error("error connecting to %s: %s", url, error) | ||||||
except socket.timeout as error: | ||||||
logger.error("timeout error connecting to %s: %s", url, error) | ||||||
slogger.error("timeout error connecting to %s: %s", url, error) | ||||||
|
||||||
logger.warning("Failed connectivity tests! Replacing route") | ||||||
slogger.warning("Failed connectivity tests! Replacing route") | ||||||
|
||||||
public_subnet_id = os.getenv("PUBLIC_SUBNET_ID") | ||||||
if not public_subnet_id: | ||||||
|
@@ -163,20 +180,20 @@ def check_connection(check_urls): | |||||
|
||||||
for rtb in route_tables: | ||||||
replace_route(rtb, nat_gateway_id) | ||||||
logger.info("Route replacement succeeded") | ||||||
slogger.info("Route replacement succeeded") | ||||||
return False | ||||||
|
||||||
|
||||||
def connectivity_test_handler(event, context): | ||||||
if not isinstance(event, dict): | ||||||
logger.error(f"Unknown event: {event}") | ||||||
slogger.error("Unknown event: %s", {event}) | ||||||
Comment on lines
-172
to
+189
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for standardizing this throughout the file. |
||||||
return | ||||||
|
||||||
if event.get("source") != "aws.events": | ||||||
logger.error(f"Unable to handle unknown event type: {json.dumps(event)}") | ||||||
slogger.error("Unable to handle unknown event type: %s", json.dumps(event)) | ||||||
raise UnknownEventTypeError | ||||||
|
||||||
logger.debug("Starting NAT instance connectivity test") | ||||||
slogger.debug("Starting NAT instance connectivity test") | ||||||
|
||||||
check_interval = int(os.getenv("CONNECTIVITY_CHECK_INTERVAL", DEFAULT_CONNECTIVITY_CHECK_INTERVAL)) | ||||||
check_urls = "CHECK_URLS" in os.environ and os.getenv("CHECK_URLS").split(",") or DEFAULT_CHECK_URLS | ||||||
|
@@ -209,10 +226,10 @@ def handler(event, _): | |||||
if LIFECYCLE_KEY in message and ASG_KEY in message: | ||||||
asg = message[ASG_KEY] | ||||||
else: | ||||||
logger.error("Failed to find lifecycle message to parse") | ||||||
slogger.error("Failed to find lifecycle message to parse") | ||||||
raise LifecycleMessageError | ||||||
except Exception as error: | ||||||
logger.error("Error: %s", error) | ||||||
slogger.error(error) | ||||||
raise error | ||||||
|
||||||
availability_zone, vpc_zone_identifier = get_az_and_vpc_zone_identifier(asg) | ||||||
|
@@ -227,7 +244,7 @@ def handler(event, _): | |||||
|
||||||
for rtb in route_tables: | ||||||
replace_route(rtb, nat_gateway_id) | ||||||
logger.info("Route replacement succeeded") | ||||||
slogger.info("Route replacement succeeded") | ||||||
|
||||||
|
||||||
class UnknownEventTypeError(Exception): pass | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
boto3==1.34.90 | ||
structlog==24.4.0 | ||
orjson==3.10.14 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Talking to this particular config, when I ran into the example in this document, it matched what I had in my head as a starting point. The company I work at have some opinions on things, though I wasn't going to impose them here.