Skip to content

Commit

Permalink
check in patches and code with patches applied
Browse files Browse the repository at this point in the history
  • Loading branch information
ruizh22 authored and avinashak committed Dec 18, 2024
1 parent 8645cd9 commit 20e6e1b
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 14 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ postgres-openssl = { version = "0.5.0", default-features = false, features = ["r
pulsar = { version = "6.3.0", default-features = false, features = ["tokio-runtime", "auth-oauth2", "flate2", "lz4", "snap", "zstd"], optional = true }
rand = { version = "0.8.5", default-features = false, features = ["small_rng"] }
rand_distr = { version = "0.4.3", default-features = false }
rdkafka = { version = "0.35.0", default-features = false, features = ["tokio", "libz", "ssl", "zstd"], optional = true }
rdkafka = { version = "0.35.0", default-features = false, features = ["tokio", "libz", "ssl", "zstd", "sasl", "curl"], optional = true }
redis = { version = "0.24.0", default-features = false, features = ["connection-manager", "tokio-comp", "tokio-native-tls-comp"], optional = true }
regex = { version = "1.10.4", default-features = false, features = ["std", "perf"] }
roaring = { version = "0.10.4", default-features = false, features = ["std"], optional = true }
Expand Down
18 changes: 18 additions & 0 deletions patches/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
This directory contains the patches for the Vector binary that we use. We use a non-standard binary to patch GCS retry behavior. In production, we noticed that we were getting a lot of 'Connection Reset by Peer' errors on the GCS sink, and in the GCS sink, these errors are *not* retriable.

We patch the GCS sink to be much more greedy in retrying, so that it pretty much retries anything.

## In This Directory
- `gcs-retry.patch` -> A patch file which can be applied to the Vector main branch to introduce the retrying behavior we want


The Dockerfiles will automatically compile a version of Vector with our patches applied. Check those files for the commit hash that we are based on, in case you'd like to make some updates.

To update the patch, clone the Vector repo, check out the specified commit hash, and make your changes. After making your changes, run `git diff > gcs-retry.patch` to save the diff and copy it into this directory. The build files will build Vector with your patch automatically.


### Currently Patched
The following are patched:
- Fixing GCS Sink error type that allows proper retry handling
- Extremely generous retry logic that functionally retries everything

Check failure

Code scanning / check-spelling

Unrecognized Spelling Error

functionaly is not a recognized word. (unrecognized-spelling)
- Backport updated GCP auth token handling from https://github.com/vectordotdev/vector/pull/20574
136 changes: 136 additions & 0 deletions patches/gcs-retry.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
diff --git a/src/gcp.rs b/src/gcp.rs
index bfc486f92..148fa9dec 100644

Check warning

Code scanning / check-spelling

Candidate Pattern Warning

Line matches candidate pattern "index (?:[0-9a-z]{7,40},|)[0-9a-z]{7,40}..[0-9a-z]{7,40}" (candidate-pattern)

Check failure

Code scanning / check-spelling

Unrecognized Spelling Error

bfc is not a recognized word. (unrecognized-spelling)
--- a/src/gcp.rs
+++ b/src/gcp.rs
@@ -16,7 +16,7 @@ use hyper::header::AUTHORIZATION;
use once_cell::sync::Lazy;
use smpl_jwt::Jwt;
use snafu::{ResultExt, Snafu};
-use tokio::{sync::watch, time::Instant};
+use tokio::sync::watch;
use vector_lib::configurable::configurable_component;
use vector_lib::sensitive_string::SensitiveString;

@@ -25,6 +25,11 @@ use crate::{config::ProxyConfig, http::HttpClient, http::HttpError};
const SERVICE_ACCOUNT_TOKEN_URL: &str =
"http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token";

+// See https://cloud.google.com/compute/docs/access/authenticate-workloads#applications
+const METADATA_TOKEN_EXPIRY_MARGIN_SECS: u64 = 200;
+
+const METADATA_TOKEN_ERROR_RETRY_SECS: u64 = 2;
+
pub const PUBSUB_URL: &str = "https://pubsub.googleapis.com";

pub static PUBSUB_ADDRESS: Lazy<String> = Lazy::new(|| {
@@ -194,19 +199,25 @@ impl GcpAuthenticator {
async fn token_regenerator(self, sender: watch::Sender<()>) {
match self {
Self::Credentials(inner) => {
- let period =
- Duration::from_secs(inner.token.read().unwrap().expires_in() as u64 / 2);
- let mut interval = tokio::time::interval_at(Instant::now() + period, period);
+ let expires_in = inner.token.read().unwrap().expires_in() as u64;
+ let mut deadline =
+ Duration::from_secs(expires_in.saturating_sub(METADATA_TOKEN_EXPIRY_MARGIN_SECS));
loop {
- interval.tick().await;
+ tokio::time::sleep(deadline).await;
debug!("Renewing GCP authentication token.");
match inner.regenerate_token().await {
- Ok(()) => sender.send_replace(()),
+ Ok(()) => {
+ sender.send_replace(());
+ let expires_in = inner.token.read().unwrap().expires_in() as u64;
+ deadline =
+ Duration::from_secs(expires_in.saturating_sub(METADATA_TOKEN_EXPIRY_MARGIN_SECS));
+ }
Err(error) => {
error!(
message = "Failed to update GCP authentication token.",
%error
);
+ deadline = Duration::from_secs(METADATA_TOKEN_ERROR_RETRY_SECS);
}
}
}
diff --git a/src/sinks/gcs_common/config.rs b/src/sinks/gcs_common/config.rs
index 914d780c8..e59a4e8e4 100644
--- a/src/sinks/gcs_common/config.rs
+++ b/src/sinks/gcs_common/config.rs
@@ -6,7 +6,7 @@ use vector_lib::configurable::configurable_component;

use crate::{
gcp::{GcpAuthenticator, GcpError},
- http::HttpClient,
+ http::{HttpClient, HttpError},
sinks::{
gcs_common::service::GcsResponse,
util::retries::{RetryAction, RetryLogic},
@@ -141,7 +141,7 @@ pub struct GcsRetryLogic;

// This is a clone of HttpRetryLogic for the Body type, should get merged
impl RetryLogic for GcsRetryLogic {
- type Error = hyper::Error;
+ type Error = HttpError;
type Response = GcsResponse;

fn is_retriable_error(&self, _error: &Self::Error) -> bool {
@@ -159,7 +159,7 @@ impl RetryLogic for GcsRetryLogic {
}
_ if status.is_server_error() => RetryAction::Retry(status.to_string().into()),
_ if status.is_success() => RetryAction::Successful,
- _ => RetryAction::DontRetry(format!("response status: {}", status).into()),
+ _ => RetryAction::Retry(format!("catchall retry with response status: {}", status).into()),
}
}
}
diff --git a/src/sinks/util/http.rs b/src/sinks/util/http.rs
index 0904a67cb..e3fae07e0 100644
--- a/src/sinks/util/http.rs
+++ b/src/sinks/util/http.rs
@@ -470,6 +470,7 @@ impl RetryLogic for HttpRetryLogic {
let status = response.status();

match status {
+ StatusCode::UNAUTHORIZED => RetryAction::Retry("unauthorized".into()),
StatusCode::TOO_MANY_REQUESTS => RetryAction::Retry("too many requests".into()),
StatusCode::NOT_IMPLEMENTED => {
RetryAction::DontRetry("endpoint not implemented".into())
@@ -478,7 +479,7 @@ impl RetryLogic for HttpRetryLogic {
format!("{}: {}", status, String::from_utf8_lossy(response.body())).into(),
),
_ if status.is_success() => RetryAction::Successful,
- _ => RetryAction::DontRetry(format!("response status: {}", status).into()),
+ _ => RetryAction::Retry(format!("catchall retry with response status: {}", status).into()),
}
}
}
diff --git a/src/sinks/util/retries.rs b/src/sinks/util/retries.rs
index 003f1990b..fea5cf5be 100644

Check failure

Code scanning / check-spelling

Unrecognized Spelling Error

fea is not a recognized word. (unrecognized-spelling)
--- a/src/sinks/util/retries.rs
+++ b/src/sinks/util/retries.rs
@@ -192,13 +192,20 @@ where
internal_log_rate_limit = true
);
Some(self.build_retry())
+ } else if error.downcast_ref::<hyper::Error>().is_some() {
+ warn!(
+ message = "Request failed on a Hyper error. This is likely a transient network issue, retrying.",
+ %error,
+ internal_log_rate_limit = true
+ );
+ Some(self.build_retry())
} else {
- error!(
- message = "Unexpected error type; dropping the request.",
+ warn!(
+ message = "Unexpected Error Type. Retrying anyway",
%error,
internal_log_rate_limit = true
);
- None
+ Some(self.build_retry())
}
}
}
23 changes: 17 additions & 6 deletions src/gcp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use hyper::header::AUTHORIZATION;
use once_cell::sync::Lazy;
use smpl_jwt::Jwt;
use snafu::{ResultExt, Snafu};
use tokio::{sync::watch, time::Instant};
use tokio::sync::watch;
use vector_lib::configurable::configurable_component;
use vector_lib::sensitive_string::SensitiveString;

Expand All @@ -25,6 +25,11 @@ use crate::{config::ProxyConfig, http::HttpClient, http::HttpError};
const SERVICE_ACCOUNT_TOKEN_URL: &str =
"http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token";

// See https://cloud.google.com/compute/docs/access/authenticate-workloads#applications
const METADATA_TOKEN_EXPIRY_MARGIN_SECS: u64 = 200;

const METADATA_TOKEN_ERROR_RETRY_SECS: u64 = 2;

pub const PUBSUB_URL: &str = "https://pubsub.googleapis.com";

pub static PUBSUB_ADDRESS: Lazy<String> = Lazy::new(|| {
Expand Down Expand Up @@ -194,19 +199,25 @@ impl GcpAuthenticator {
async fn token_regenerator(self, sender: watch::Sender<()>) {
match self {
Self::Credentials(inner) => {
let period =
Duration::from_secs(inner.token.read().unwrap().expires_in() as u64 / 2);
let mut interval = tokio::time::interval_at(Instant::now() + period, period);
let expires_in = inner.token.read().unwrap().expires_in() as u64;
let mut deadline =
Duration::from_secs(expires_in.saturating_sub(METADATA_TOKEN_EXPIRY_MARGIN_SECS));
loop {
interval.tick().await;
tokio::time::sleep(deadline).await;
debug!("Renewing GCP authentication token.");
match inner.regenerate_token().await {
Ok(()) => sender.send_replace(()),
Ok(()) => {
sender.send_replace(());
let expires_in = inner.token.read().unwrap().expires_in() as u64;
deadline =
Duration::from_secs(expires_in.saturating_sub(METADATA_TOKEN_EXPIRY_MARGIN_SECS));
}
Err(error) => {
error!(
message = "Failed to update GCP authentication token.",
%error
);
deadline = Duration::from_secs(METADATA_TOKEN_ERROR_RETRY_SECS);
}
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/sinks/gcs_common/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use vector_lib::configurable::configurable_component;

use crate::{
gcp::{GcpAuthenticator, GcpError},
http::HttpClient,
http::{HttpClient, HttpError},
sinks::{
gcs_common::service::GcsResponse,
util::retries::{RetryAction, RetryLogic},
Expand Down Expand Up @@ -141,7 +141,7 @@ pub struct GcsRetryLogic;

// This is a clone of HttpRetryLogic for the Body type, should get merged
impl RetryLogic for GcsRetryLogic {
type Error = hyper::Error;
type Error = HttpError;
type Response = GcsResponse;

fn is_retriable_error(&self, _error: &Self::Error) -> bool {
Expand All @@ -159,7 +159,7 @@ impl RetryLogic for GcsRetryLogic {
}
_ if status.is_server_error() => RetryAction::Retry(status.to_string().into()),
_ if status.is_success() => RetryAction::Successful,
_ => RetryAction::DontRetry(format!("response status: {}", status).into()),
_ => RetryAction::Retry(format!("catchall retry with response status: {}", status).into()),
}
}
}
3 changes: 2 additions & 1 deletion src/sinks/util/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,7 @@ impl RetryLogic for HttpRetryLogic {
let status = response.status();

match status {
StatusCode::UNAUTHORIZED => RetryAction::Retry("unauthorized".into()),
StatusCode::TOO_MANY_REQUESTS => RetryAction::Retry("too many requests".into()),
StatusCode::NOT_IMPLEMENTED => {
RetryAction::DontRetry("endpoint not implemented".into())
Expand All @@ -478,7 +479,7 @@ impl RetryLogic for HttpRetryLogic {
format!("{}: {}", status, String::from_utf8_lossy(response.body())).into(),
),
_ if status.is_success() => RetryAction::Successful,
_ => RetryAction::DontRetry(format!("response status: {}", status).into()),
_ => RetryAction::Retry(format!("catchall retry with response status: {}", status).into()),
}
}
}
Expand Down
13 changes: 10 additions & 3 deletions src/sinks/util/retries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,13 +192,20 @@ where
internal_log_rate_limit = true
);
Some(self.build_retry())
} else if error.downcast_ref::<hyper::Error>().is_some() {
warn!(
message = "Request failed on a Hyper error. This is likely a transient network issue, retrying.",
%error,
internal_log_rate_limit = true
);
Some(self.build_retry())
} else {
error!(
message = "Unexpected error type; dropping the request.",
warn!(
message = "Unexpected Error Type. Retrying anyway",
%error,
internal_log_rate_limit = true
);
None
Some(self.build_retry())
}
}
}
Expand Down

0 comments on commit 20e6e1b

Please sign in to comment.