Fix experimental_retry #6338

Open · wants to merge 5 commits into base `dev` · Changes from 3 commits
6 changes: 6 additions & 0 deletions .changesets/fix_bnjjj_fix_retry_metric.md
@@ -0,0 +1,6 @@
### Fix and test experimental_retry ([PR #6338](https://github.com/apollographql/router/pull/6338))

Fixes the behavior of `experimental_retry` and ensures that both the feature and its metrics work.
An entry is also added to the request context; it will be useful later for implementing a new standard attribute and selector for advanced telemetry.

By [@bnjjj](https://github.com/bnjjj) in https://github.com/apollographql/router/pull/6338
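For context, the feature being fixed is configured under traffic shaping. The following is a sketch only: the field names are inferred from the `RetryPolicy::new` signature in this PR (`duration`/TTL, `min_per_sec`, `retry_percent`, `retry_mutations`) and should be checked against the router's documented configuration schema.

```yaml
# Sketch only — option names inferred from RetryPolicy::new in this PR,
# not taken from the router's documented configuration reference.
traffic_shaping:
  all:
    experimental_retry:
      min_per_sec: 10        # minimum number of retries allowed per second
      retry_percent: 0.2     # extra retry budget as a fraction of recent requests
      retry_mutations: false # never replay mutations by default
      ttl: 10s               # how long successes count toward the retry budget
```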
20 changes: 20 additions & 0 deletions apollo-router/src/plugins/telemetry/config_new/attributes.rs
@@ -1131,6 +1131,26 @@ impl Selectors for SubgraphAttributes {
}
}

/// Key used in the context to store the number of retries for a subgraph HTTP request
pub(crate) struct SubgraphRequestResendCountKey<'a> {
subgraph_req: &'a subgraph::Request,
}

impl<'a> SubgraphRequestResendCountKey<'a> {
pub(crate) fn new(subgraph_req: &'a subgraph::Request) -> Self {
Self { subgraph_req }
}
}

impl<'a> From<SubgraphRequestResendCountKey<'a>> for String {
fn from(value: SubgraphRequestResendCountKey) -> Self {
format!(
"apollo::telemetry::http_request_resend_count_{}",
value.subgraph_req.id
)
}
}

#[cfg(test)]
mod test {
use std::net::SocketAddr;
1 change: 0 additions & 1 deletion apollo-router/src/plugins/traffic_shaping/mod.rs
@@ -396,7 +396,6 @@ impl TrafficShaping {
config.min_per_sec,
config.retry_percent,
config.retry_mutations,
name.to_string(),
);
tower::retry::RetryLayer::new(retry_policy)
});
202 changes: 185 additions & 17 deletions apollo-router/src/plugins/traffic_shaping/retry.rs
@@ -5,14 +5,14 @@ use std::time::Duration;
use tower::retry::budget::Budget;
use tower::retry::Policy;

use crate::plugins::telemetry::config_new::attributes::SubgraphRequestResendCountKey;
use crate::query_planner::OperationKind;
use crate::services::subgraph;

#[derive(Clone, Default)]
pub(crate) struct RetryPolicy {
budget: Arc<Budget>,
retry_mutations: bool,
subgraph_name: String,
}

impl RetryPolicy {
@@ -21,7 +21,6 @@ impl RetryPolicy {
min_per_sec: Option<u32>,
retry_percent: Option<f32>,
retry_mutations: Option<bool>,
subgraph_name: String,
) -> Self {
Self {
budget: Arc::new(Budget::new(
@@ -30,21 +29,57 @@
retry_percent.unwrap_or(0.2),
)),
retry_mutations: retry_mutations.unwrap_or(false),
subgraph_name,
}
}
}

impl<Res, E> Policy<subgraph::Request, Res, E> for RetryPolicy {
impl<E> Policy<subgraph::Request, subgraph::Response, E> for RetryPolicy {
type Future = future::Ready<Self>;

fn retry(&self, req: &subgraph::Request, result: Result<&Res, &E>) -> Option<Self::Future> {
fn retry(
&self,
req: &subgraph::Request,
result: Result<&subgraph::Response, &E>,
) -> Option<Self::Future> {
let subgraph_name = req.subgraph_name.clone().unwrap_or_default();
match result {
Ok(_) => {
// Treat all `Response`s as success,
// so deposit budget and don't retry...
self.budget.deposit();
None
Ok(resp) => {
if resp.response.status() >= http::StatusCode::BAD_REQUEST {
**Contributor:** Is this correct? I mean, what if the subgraph does something "weird" like returning a redirect? i.e., should this be `if resp.response.status() != http::StatusCode::OK`?

**Contributor (author):** I recently saw users treating a 302 status code as a correct status code, and they were doing things in Rhai based on it. So I prefer to be conservative and use the 400 status code as a threshold.

**Contributor:** It seems weird to me. If we go with this, then it means that we are encoding "weird" behaviour into the router, and that might not be what some customers want. My preference is to do the expected thing, rather than produce something which fits into a "weird" behaviour of a customer. See what other people think, since I may really be in the minority here.

**Member:** If a subgraph returns a redirect, I would not expect it to be retried.

**Contributor (author):** I agree, but I also agree with @goto-bus-stop — why would we retry on a 3xx?

**Contributor:** To be honest, I find the concept of the retry module a bit weird. Maybe the best thing to do is to define exactly which codes we will retry on and document that? @goto-bus-stop says definitely on a 429. I'd probably also want to retry a 503. Apart from those two, I probably wouldn't want to retry.

**Member:** I like starting with a small selection of codes. I'm second-guessing 429 because we don't support `Retry-After` in this implementation, and maybe that's something that clients should be doing rather than us? If we just retry instantly then there is a good chance that we get a 429 again.

**Contributor:** *(comment not captured)*

**Member (@goto-bus-stop, Nov 28, 2024):** I don't think it says that. I read that the user agent should wait `Retry-After` seconds before following the redirect, not that it should retry the original request: "When sent with any 3xx (Redirection) response, Retry-After indicates the minimum time that the user agent is asked to wait before issuing the redirected request."

**Contributor:** Would we want to do that? Retry the redirected request?

if req.operation_kind == OperationKind::Mutation && !self.retry_mutations {
return None;
}

let withdrew = self.budget.withdraw();
if withdrew.is_err() {
u64_counter!(
"apollo_router_http_request_retry_total",
"Number of retry for an http request to a subgraph",
1u64,
status = "aborted",
subgraph = subgraph_name
);

return None;
}

let _ = req
.context
.upsert::<_, usize>(SubgraphRequestResendCountKey::new(req), |val| val + 1);

u64_counter!(
"apollo_router_http_request_retry_total",
"Number of retry for an http request to a subgraph",
1u64,
subgraph = subgraph_name
);

Some(future::ready(self.clone()))
} else {
// Treat all `Response`s as success,
// so deposit budget and don't retry...
self.budget.deposit();
None
}
}
Err(_e) => {
if req.operation_kind == OperationKind::Mutation && !self.retry_mutations {
@@ -53,20 +88,27 @@ impl<Res, E> Policy<subgraph::Request, Res, E> for RetryPolicy {

let withdrew = self.budget.withdraw();
if withdrew.is_err() {
tracing::info!(
monotonic_counter.apollo_router_http_request_retry_total = 1u64,
u64_counter!(
"apollo_router_http_request_retry_total",
"Number of retry for an http request to a subgraph",
1u64,
status = "aborted",
subgraph = %self.subgraph_name,
subgraph = subgraph_name
);

return None;
}

tracing::info!(
monotonic_counter.apollo_router_http_request_retry_total = 1u64,
subgraph = %self.subgraph_name,
u64_counter!(
"apollo_router_http_request_retry_total",
"Number of retry for an http request to a subgraph",
1u64,
subgraph = subgraph_name
);

let _ = req
.context
.upsert::<_, usize>(SubgraphRequestResendCountKey::new(req), |val| val + 1);

Some(future::ready(self.clone()))
}
}
@@ -76,3 +118,129 @@ impl<Res, E> Policy<subgraph::Request, Res, E> for RetryPolicy {
Some(req.clone())
}
}

#[cfg(test)]
mod tests {
use http::StatusCode;
use tower::BoxError;

use super::*;
use crate::error::FetchError;
use crate::graphql;
use crate::http_ext;
use crate::metrics::FutureMetricsExt;

#[tokio::test]
async fn test_retry_with_error() {
async {
let retry = RetryPolicy::new(
Some(Duration::from_secs(10)),
Some(10),
Some(0.2),
Some(false),
);

let subgraph_req = subgraph::Request::fake_builder()
.subgraph_name("my_subgraph_name_error")
.subgraph_request(
http_ext::Request::fake_builder()
.header("test", "my_value_set")
.body(
graphql::Request::fake_builder()
.query(String::from("query { test }"))
.build(),
)
.build()
.unwrap(),
)
.build();

assert!(retry
.retry(
&subgraph_req,
Err(&Box::new(FetchError::SubrequestHttpError {
status_code: None,
service: String::from("my_subgraph_name_error"),
reason: String::from("cannot contact the subgraph"),
}))
)
.is_some());

assert!(retry
.retry(
&subgraph_req,
Err(&Box::new(FetchError::SubrequestHttpError {
status_code: None,
service: String::from("my_subgraph_name_error"),
reason: String::from("cannot contact the subgraph"),
}))
)
.is_some());

assert_counter!(
"apollo_router_http_request_retry_total",
2,
"subgraph" = "my_subgraph_name_error"
);
}
.with_metrics()
.await;
}

#[tokio::test]
async fn test_retry_with_http_status_code() {
async {
let retry = RetryPolicy::new(
Some(Duration::from_secs(10)),
Some(10),
Some(0.2),
Some(false),
);

let subgraph_req = subgraph::Request::fake_builder()
.subgraph_name("my_subgraph_name_error")
.subgraph_request(
http_ext::Request::fake_builder()
.header("test", "my_value_set")
.body(
graphql::Request::fake_builder()
.query(String::from("query { test }"))
.build(),
)
.build()
.unwrap(),
)
.build();

assert!(retry
.retry(
&subgraph_req,
Ok::<&subgraph::Response, &BoxError>(
&subgraph::Response::fake_builder()
.status_code(StatusCode::BAD_REQUEST)
.build()
)
)
.is_some());

assert!(retry
.retry(
&subgraph_req,
Ok::<&subgraph::Response, &BoxError>(
&subgraph::Response::fake_builder()
.status_code(StatusCode::BAD_REQUEST)
.build()
)
)
.is_some());
**Contributor:** Add a test for a redirect: `MOVED_PERMANENTLY`.


assert_counter!(
"apollo_router_http_request_retry_total",
2,
"subgraph" = "my_subgraph_name_error"
);
}
.with_metrics()
.await;
}
}