From 1ce68d02d4c9a5e96823d2eafe88bc4cfabdc847 Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:03:50 +0100 Subject: [PATCH 1/4] fix(traffic_shaping): fix experimental_retry and add test Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- .../src/plugins/traffic_shaping/mod.rs | 1 - .../src/plugins/traffic_shaping/retry.rs | 202 ++++++++++++++++-- 2 files changed, 185 insertions(+), 18 deletions(-) diff --git a/apollo-router/src/plugins/traffic_shaping/mod.rs b/apollo-router/src/plugins/traffic_shaping/mod.rs index 4335bfe988..935a670164 100644 --- a/apollo-router/src/plugins/traffic_shaping/mod.rs +++ b/apollo-router/src/plugins/traffic_shaping/mod.rs @@ -396,7 +396,6 @@ impl TrafficShaping { config.min_per_sec, config.retry_percent, config.retry_mutations, - name.to_string(), ); tower::retry::RetryLayer::new(retry_policy) }); diff --git a/apollo-router/src/plugins/traffic_shaping/retry.rs b/apollo-router/src/plugins/traffic_shaping/retry.rs index 40727cc6dd..202fb7eee3 100644 --- a/apollo-router/src/plugins/traffic_shaping/retry.rs +++ b/apollo-router/src/plugins/traffic_shaping/retry.rs @@ -5,6 +5,7 @@ use std::time::Duration; use tower::retry::budget::Budget; use tower::retry::Policy; +use crate::plugins::telemetry::config_new::attributes::SubgraphRequestResendCountKey; use crate::query_planner::OperationKind; use crate::services::subgraph; @@ -12,7 +13,6 @@ use crate::services::subgraph; pub(crate) struct RetryPolicy { budget: Arc, retry_mutations: bool, - subgraph_name: String, } impl RetryPolicy { @@ -21,7 +21,6 @@ impl RetryPolicy { min_per_sec: Option, retry_percent: Option, retry_mutations: Option, - subgraph_name: String, ) -> Self { Self { budget: Arc::new(Budget::new( @@ -30,21 +29,57 @@ impl RetryPolicy { retry_percent.unwrap_or(0.2), )), retry_mutations: retry_mutations.unwrap_or(false), - subgraph_name, } } } -impl Policy for RetryPolicy { +impl Policy for RetryPolicy { type Future = future::Ready; - fn retry(&self, req: &subgraph::Request, result: Result<&Res, &E>) -> Option { + fn retry( + &self, + req: &subgraph::Request, + result: Result<&subgraph::Response, &E>, + ) -> Option { + let subgraph_name = req.subgraph_name.clone().unwrap_or_default(); match result { - Ok(_) => { - // Treat all `Response`s as success, - // so deposit budget and don't retry... - self.budget.deposit(); - None + Ok(resp) => { + if resp.response.status() >= http::StatusCode::BAD_REQUEST { + if req.operation_kind == OperationKind::Mutation && !self.retry_mutations { + return None; + } + + let withdrew = self.budget.withdraw(); + if withdrew.is_err() { + u64_counter!( + "apollo_router_http_request_retry_total", + "Number of retry for an http request to a subgraph", + 1u64, + status = "aborted", + subgraph = subgraph_name + ); + + return None; + } + + let _ = req + .context + .upsert::<_, usize>(SubgraphRequestResendCountKey::new(req), |val| val + 1); + + u64_counter!( + "apollo_router_http_request_retry_total", + "Number of retry for an http request to a subgraph", + 1u64, + subgraph = subgraph_name + ); + + Some(future::ready(self.clone())) + } else { + // Treat all `Response`s as success, + // so deposit budget and don't retry... + self.budget.deposit(); + None + } } Err(_e) => { if req.operation_kind == OperationKind::Mutation && !self.retry_mutations { @@ -53,20 +88,27 @@ impl Policy for RetryPolicy { let withdrew = self.budget.withdraw(); if withdrew.is_err() { - tracing::info!( - monotonic_counter.apollo_router_http_request_retry_total = 1u64, + u64_counter!( + "apollo_router_http_request_retry_total", + "Number of retry for an http request to a subgraph", + 1u64, status = "aborted", - subgraph = %self.subgraph_name, + subgraph = subgraph_name ); return None; } - - tracing::info!( - monotonic_counter.apollo_router_http_request_retry_total = 1u64, - subgraph = %self.subgraph_name, + u64_counter!( + "apollo_router_http_request_retry_total", + "Number of retry for an http request to a subgraph", + 1u64, + subgraph = subgraph_name ); + let _ = req + .context + .upsert::<_, usize>(SubgraphRequestResendCountKey::new(req), |val| val + 1); + Some(future::ready(self.clone())) } } @@ -76,3 +118,129 @@ impl Policy for RetryPolicy { Some(req.clone()) } } + +#[cfg(test)] +mod tests { + use http::StatusCode; + use tower::BoxError; + + use super::*; + use crate::error::FetchError; + use crate::graphql; + use crate::http_ext; + use crate::metrics::FutureMetricsExt; + + #[tokio::test] + async fn test_retry_with_error() { + async { + let retry = RetryPolicy::new( + Some(Duration::from_secs(10)), + Some(10), + Some(0.2), + Some(false), + ); + + let subgraph_req = subgraph::Request::fake_builder() + .subgraph_name("my_subgraph_name_error") + .subgraph_request( + http_ext::Request::fake_builder() + .header("test", "my_value_set") + .body( + graphql::Request::fake_builder() + .query(String::from("query { test }")) + .build(), + ) + .build() + .unwrap(), + ) + .build(); + + assert!(retry + .retry( + &subgraph_req, + Err(&Box::new(FetchError::SubrequestHttpError { + status_code: None, + service: String::from("my_subgraph_name_error"), + reason: String::from("cannot contact the subgraph"), + })) + ) + .is_some()); + + assert!(retry + .retry( + &subgraph_req, + Err(&Box::new(FetchError::SubrequestHttpError { + status_code: None, + service: String::from("my_subgraph_name_error"), + reason: String::from("cannot contact the subgraph"), + })) + ) + .is_some()); + + assert_counter!( + "apollo_router_http_request_retry_total", + 2, + "subgraph" = "my_subgraph_name_error" + ); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_retry_with_http_status_code() { + async { + let retry = RetryPolicy::new( + Some(Duration::from_secs(10)), + Some(10), + Some(0.2), + Some(false), + ); + + let subgraph_req = subgraph::Request::fake_builder() + .subgraph_name("my_subgraph_name_error") + .subgraph_request( + http_ext::Request::fake_builder() + .header("test", "my_value_set") + .body( + graphql::Request::fake_builder() + .query(String::from("query { test }")) + .build(), + ) + .build() + .unwrap(), + ) + .build(); + + assert!(retry + .retry( + &subgraph_req, + Ok::<&subgraph::Response, &BoxError>( + &subgraph::Response::fake_builder() + .status_code(StatusCode::BAD_REQUEST) + .build() + ) + ) + .is_some()); + + assert!(retry + .retry( + &subgraph_req, + Ok::<&subgraph::Response, &BoxError>( + &subgraph::Response::fake_builder() + .status_code(StatusCode::BAD_REQUEST) + .build() + ) + ) + .is_some()); + + assert_counter!( + "apollo_router_http_request_retry_total", + 2, + "subgraph" = "my_subgraph_name_error" + ); + } + .with_metrics() + .await; + } +} From c88d0a07c52e0d436cc50d49d09a35b6d17a74cf Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:06:05 +0100 Subject: [PATCH 2/4] fix(traffic_shaping): fix experimental_retry and add test Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- .../telemetry/config_new/attributes.rs | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/apollo-router/src/plugins/telemetry/config_new/attributes.rs b/apollo-router/src/plugins/telemetry/config_new/attributes.rs index 53640cd87b..e43dba76e7 100644 --- a/apollo-router/src/plugins/telemetry/config_new/attributes.rs +++ b/apollo-router/src/plugins/telemetry/config_new/attributes.rs @@ -1131,6 +1131,26 @@ impl Selectors for SubgraphAttributes { } } +/// Key used in context to save number of retries for a subgraph http request +pub(crate) struct SubgraphRequestResendCountKey<'a> { + subgraph_req: &'a subgraph::Request, +} + +impl<'a> SubgraphRequestResendCountKey<'a> { + pub(crate) fn new(subgraph_req: &'a subgraph::Request) -> Self { + Self { subgraph_req } + } +} + +impl<'a> From> for String { + fn from(value: SubgraphRequestResendCountKey) -> Self { + format!( + "apollo::telemetry::http_request_resend_count_{}", + value.subgraph_req.id + ) + } +} + #[cfg(test)] mod test { use std::net::SocketAddr; From 0ccae424dd35a6a554d1df31bad1f4baa7801081 Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:08:42 +0100 Subject: [PATCH 3/4] changelog Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- .changesets/fix_bnjjj_fix_retry_metric.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changesets/fix_bnjjj_fix_retry_metric.md diff --git a/.changesets/fix_bnjjj_fix_retry_metric.md b/.changesets/fix_bnjjj_fix_retry_metric.md new file mode 100644 index 0000000000..eaaa220e80 --- /dev/null +++ b/.changesets/fix_bnjjj_fix_retry_metric.md @@ -0,0 +1,6 @@ +### Fix and test experimental_retry ([PR #6338](https://github.com/apollographql/router/pull/6338)) + +Fix the behavior of `experimental_retry` and make sure both the feature and metrics are working. +An entry in the context was also added, which would be useful later to implement a new standard attribute and selector for advanced telemetry. + +By [@bnjjj](https://github.com/bnjjj) in https://github.com/apollographql/router/pull/6338 \ No newline at end of file From a243068110a603f3e0852df230b5e797c864e2a8 Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:32:49 +0100 Subject: [PATCH 4/4] fix description for retry metrics Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- apollo-router/src/plugins/traffic_shaping/retry.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apollo-router/src/plugins/traffic_shaping/retry.rs b/apollo-router/src/plugins/traffic_shaping/retry.rs index 202fb7eee3..4124b861ba 100644 --- a/apollo-router/src/plugins/traffic_shaping/retry.rs +++ b/apollo-router/src/plugins/traffic_shaping/retry.rs @@ -53,7 +53,7 @@ impl Policy for RetryPolicy { if withdrew.is_err() { u64_counter!( "apollo_router_http_request_retry_total", - "Number of retry for an http request to a subgraph", + "Number of retries for an http request to a subgraph", 1u64, status = "aborted", subgraph = subgraph_name @@ -68,7 +68,7 @@ impl Policy for RetryPolicy { u64_counter!( "apollo_router_http_request_retry_total", - "Number of retry for an http request to a subgraph", + "Number of retries for an http request to a subgraph", 1u64, subgraph = subgraph_name ); @@ -90,7 +90,7 @@ impl Policy for RetryPolicy { if withdrew.is_err() { u64_counter!( "apollo_router_http_request_retry_total", - "Number of retry for an http request to a subgraph", + "Number of retries for an http request to a subgraph", 1u64, status = "aborted", subgraph = subgraph_name @@ -100,7 +100,7 @@ impl Policy for RetryPolicy { } u64_counter!( "apollo_router_http_request_retry_total", - "Number of retry for an http request to a subgraph", + "Number of retries for an http request to a subgraph", 1u64, subgraph = subgraph_name );