chore(bottlecap): Re-architect Telemetry API events forwarding (#262)
* add `from_stream` to `HttpRequestParser`

read properly from the stream so we can read every byte; retry 3 times in case there's no data (see the sketch after this list)

* add another unit test for invalid data

* update algorithm

* send events as a batch (see the batching sketch after this list)

* receive events as a batch

* process as a batch

and send events of interest to the main thread

* aggregate logs as a batch

* swap senders so data is sent faster (see the note after the main.rs diff)

* drop the lock as soon as possible (pattern shown after the agent.rs diff)

* add `default-features = false` to `reqwest` so it compiles on Linux (drops the OpenSSL-backed native-tls default in favor of the already-enabled `rustls-tls`)
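
The `from_stream` change itself is not rendered on this page (the HTTP parser file is among the diffs not shown), so here is a minimal sketch of what a read-every-byte-with-retries loop can look like. The function name, buffer size, and timeout are illustrative assumptions, not the actual implementation; only the retry budget of 3 comes from the commit message:

```rust
use std::io::Read;
use std::net::TcpStream;
use std::time::Duration;

// Assumed retry budget, per the commit message ("retry 3 times").
const MAX_RETRIES: usize = 3;

/// Read everything available on the stream, retrying a few times when a
/// read yields no data before giving up and parsing what we have.
fn read_request_bytes(stream: &mut TcpStream) -> std::io::Result<Vec<u8>> {
    stream.set_read_timeout(Some(Duration::from_millis(100)))?;
    let mut buf = Vec::new();
    let mut chunk = [0u8; 4096];
    let mut retries = 0;
    loop {
        match stream.read(&mut chunk) {
            Ok(0) => break, // peer closed the connection: we have every byte
            Ok(n) => {
                buf.extend_from_slice(&chunk[..n]);
                retries = 0; // reset the budget after a successful read
            }
            Err(e) if e.kind() == std::io::ErrorKind::WouldBlock
                || e.kind() == std::io::ErrorKind::TimedOut =>
            {
                retries += 1;
                if retries >= MAX_RETRIES {
                    break; // no data after 3 attempts; parse what we have
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(buf)
}
```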
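The batch bullets boil down to changing the channel payload from a single `TelemetryEvent` to `Vec<TelemetryEvent>` (visible in the agent.rs diff below), so one `send`/`recv` pair moves a whole Telemetry API POST worth of events. A self-contained sketch of that shape, with the event type stubbed out:

```rust
use std::sync::mpsc;
use std::thread;

// Stand-in for crate::telemetry::events::TelemetryEvent.
#[derive(Debug)]
struct TelemetryEvent {
    record: String,
}

fn main() {
    // One channel message now carries an entire batch of events.
    let (tx, rx) = mpsc::channel::<Vec<TelemetryEvent>>();

    let consumer = thread::spawn(move || {
        while let Ok(events) = rx.recv() {
            // The consumer wakes once per batch instead of once per
            // event, cutting channel and scheduling overhead.
            println!("processing batch of {} events", events.len());
        }
    });

    // The listener parses one POST body into many events and forwards
    // them all at once.
    let batch = vec![
        TelemetryEvent { record: "platform.initStart".into() },
        TelemetryEvent { record: "function".into() },
    ];
    tx.send(batch).expect("receiver dropped");

    drop(tx); // closing the sender ends the consumer loop
    consumer.join().unwrap();
}
```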
  • Loading branch information
duncanista authored Jun 3, 2024
1 parent 09c473c commit dd6c742
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 365 deletions.
244 changes: 3 additions & 241 deletions bottlecap/Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion bottlecap/Cargo.toml
@@ -24,7 +24,7 @@ tracing-subscriber = { version = "0.3.18", default-features = false, features =
ureq = { version = "2.9.7", features = ["tls", "json"], default-features = false }
ustr = { version = "1.0.0", default-features = false }
hmac = "0.12.1"
reqwest = { version = "0.12.4", features = ["json", "blocking", "rustls-tls"] }
reqwest = { version = "0.12.4", features = ["json", "blocking", "rustls-tls"], default-features = false }
sha2 = "0.10.8"
hex = "0.4.3"
[target.'cfg(not(target_env = "msvc"))'.dependencies]
102 changes: 52 additions & 50 deletions bottlecap/src/bin/bottlecap/main.rs
@@ -188,8 +188,13 @@ fn main() -> Result<()> {
LAMBDA_RUNTIME_SLUG.to_string(),
&metadata_hash,
));
let logs_agent = LogsAgent::run(Arc::clone(&tags_provider), Arc::clone(&config));

let event_bus = EventBus::run();
let logs_agent = LogsAgent::run(
Arc::clone(&tags_provider),
Arc::clone(&config),
event_bus.get_sender_copy(),
);
let metrics_aggr = Arc::new(Mutex::new(
metrics_aggregator::Aggregator::<{ constants::CONTEXTS }>::new(tags_provider.clone())
.expect("failed to create aggregator"),
@@ -209,7 +214,7 @@ fn main() -> Result<()> {
port: TELEMETRY_PORT,
};
let telemetry_listener =
TelemetryListener::run(&telemetry_listener_config, event_bus.get_sender_copy())
TelemetryListener::run(&telemetry_listener_config, logs_agent.get_sender_copy())
.map_err(|e| Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
let telemetry_client = TelemetryApiClient::new(r.extension_id.to_string(), TELEMETRY_PORT);
telemetry_client
@@ -258,61 +263,58 @@ fn main() -> Result<()> {
Event::Metric(event) => {
debug!("Metric event: {:?}", event);
}
Event::Telemetry(event) => {
logs_agent.send_event(event.clone());
match event.record {
TelemetryRecord::PlatformInitReport {
initialization_type,
phase,
metrics,
} => {
debug!("Platform init report for initialization_type: {:?} with phase: {:?} and metrics: {:?}", initialization_type, phase, metrics);
}
TelemetryRecord::PlatformRuntimeDone {
request_id, status, ..
} => {
if status != Status::Success {
Event::Telemetry(event) => match event.record {
TelemetryRecord::PlatformInitReport {
initialization_type,
phase,
metrics,
} => {
debug!("Platform init report for initialization_type: {:?} with phase: {:?} and metrics: {:?}", initialization_type, phase, metrics);
}
TelemetryRecord::PlatformRuntimeDone {
request_id, status, ..
} => {
if status != Status::Success {
if let Err(e) =
lambda_enhanced_metrics.increment_errors_metric()
{
error!("Failed to increment error metric: {e:?}");
}
if status == Status::Timeout {
if let Err(e) =
lambda_enhanced_metrics.increment_errors_metric()
lambda_enhanced_metrics.increment_timeout_metric()
{
error!("Failed to increment error metric: {e:?}");
}
if status == Status::Timeout {
if let Err(e) =
lambda_enhanced_metrics.increment_timeout_metric()
{
error!("Failed to increment timeout metric: {e:?}");
}
error!("Failed to increment timeout metric: {e:?}");
}
}
debug!(
"Runtime done for request_id: {:?} with status: {:?}",
request_id, status
);
logs_agent.flush();
dogstats_client.flush();
break;
}
TelemetryRecord::PlatformReport {
request_id,
status,
metrics,
..
} => {
debug!(
"Platform report for request_id: {:?} with status: {:?}",
request_id, status
);
lambda_enhanced_metrics.set_report_log_metrics(&metrics);
if shutdown {
break;
}
}
_ => {
debug!("Unforwarded Telemetry event: {:?}", event);
debug!(
"Runtime done for request_id: {:?} with status: {:?}",
request_id, status
);
logs_agent.flush();
dogstats_client.flush();
break;
}
TelemetryRecord::PlatformReport {
request_id,
status,
metrics,
..
} => {
debug!(
"Platform report for request_id: {:?} with status: {:?}",
request_id, status
);
lambda_enhanced_metrics.set_report_log_metrics(&metrics);
if shutdown {
break;
}
}
}
_ => {
debug!("Unforwarded Telemetry event: {:?}", event);
}
},
}
} else {
error!("could not get the event");
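To read the interleaved before/after in the main.rs hunk above: the listener used to publish every Telemetry event onto the event bus, and the main loop mirrored each one into the logs agent via `logs_agent.send_event(event.clone())`. After this change the flow is inverted: the listener hands batches straight to the logs agent (`logs_agent.get_sender_copy()`), and the logs processor (its diff is not rendered on this page) forwards only the events the main loop cares about back through the event-bus sender passed into `LogsAgent::run`. That is the "swap senders" item: telemetry data reaches the logs pipeline in one hop instead of two.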
26 changes: 15 additions & 11 deletions bottlecap/src/logs/agent.rs
@@ -1,9 +1,10 @@
use std::sync::mpsc::{self, Sender};
use std::sync::mpsc::{self, Sender, SyncSender};
use std::sync::{Arc, Mutex};
use std::thread;

use tracing::{debug, error};
use tracing::debug;

use crate::events::Event;
use crate::logs::{aggregator::Aggregator, datadog, processor::LogsProcessor};
use crate::tags;
use crate::telemetry::events::TelemetryEvent;
@@ -13,7 +14,7 @@ use crate::{config, LAMBDA_RUNTIME_SLUG};
pub struct LogsAgent {
dd_api: datadog::Api,
aggregator: Arc<Mutex<Aggregator>>,
tx: Sender<TelemetryEvent>,
tx: Sender<Vec<TelemetryEvent>>,
join_handle: std::thread::JoinHandle<()>,
}

@@ -22,27 +23,29 @@ impl LogsAgent {
pub fn run(
tags_provider: Arc<tags::provider::Provider>,
datadog_config: Arc<config::Config>,
event_bus: SyncSender<Event>,
) -> LogsAgent {
let aggregator: Arc<Mutex<Aggregator>> = Arc::new(Mutex::new(Aggregator::default()));
let mut processor = LogsProcessor::new(
Arc::clone(&datadog_config),
tags_provider,
event_bus,
LAMBDA_RUNTIME_SLUG.to_string(),
);

let cloned_aggregator = aggregator.clone();

let (tx, rx) = mpsc::channel::<TelemetryEvent>();
let (tx, rx) = mpsc::channel::<Vec<TelemetryEvent>>();
let join_handle = thread::spawn(move || loop {
let received = rx.recv();
// TODO(duncanista): we might need to create a Event::Shutdown
// to signal shutdown and make it easier to handle any floating events
let Ok(event) = received else {
let Ok(events) = received else {
debug!("Failed to received event in Logs Agent");
break;
};

processor.process(event, &cloned_aggregator);
processor.process(events, &cloned_aggregator);
});

let dd_api = datadog::Api::new(datadog_config.api_key.clone(), datadog_config.site.clone());
@@ -54,18 +57,19 @@ impl LogsAgent {
}
}

pub fn send_event(&self, event: TelemetryEvent) {
if let Err(e) = self.tx.send(event) {
error!("Error sending Telemetry event to the Logs Agent: {}", e);
}
#[must_use]
pub fn get_sender_copy(&self) -> Sender<Vec<TelemetryEvent>> {
self.tx.clone()
}

pub fn flush(&self) {
LogsAgent::flush_internal(&self.aggregator, &self.dd_api);
}

fn flush_internal(aggregator: &Arc<Mutex<Aggregator>>, dd_api: &datadog::Api) {
let logs = aggregator.lock().expect("lock poisoned").get_batch();
let mut guard = aggregator.lock().expect("lock poisoned");
let logs = guard.get_batch();
drop(guard);
dd_api.send(&logs).expect("Failed to send logs to Datadog");
}

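The `flush_internal` rewrite above is the "drop the lock as soon as possible" item: the mutex guard is released right after the batch is moved out, before the blocking send to Datadog, so the processor thread can keep aggregating while the flush is in flight. The pattern in isolation (aggregator simplified to a `Vec<String>`, network send stubbed):

```rust
use std::sync::{Arc, Mutex};

fn flush(aggregator: &Arc<Mutex<Vec<String>>>) {
    // Hold the lock only long enough to move the batch out...
    let mut guard = aggregator.lock().expect("lock poisoned");
    let batch: Vec<String> = guard.drain(..).collect();
    drop(guard); // ...and release it before any slow I/O.

    // The send runs with the lock already free, so other threads can
    // keep pushing logs in the meantime.
    send_to_backend(&batch);
}

fn send_to_backend(batch: &[String]) {
    println!("sending {} logs", batch.len());
}
```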
35 changes: 20 additions & 15 deletions bottlecap/src/logs/aggregator.rs
@@ -1,6 +1,5 @@
use serde::Serialize;
use std::collections::VecDeque;
use tracing::{debug, warn};
use tracing::warn;

use crate::logs::constants;

@@ -38,10 +37,9 @@ impl Aggregator {
}
}

pub fn add<T: Serialize>(&mut self, log: T) {
match serde_json::to_string(&log) {
Ok(log) => self.messages.push_back(log),
Err(e) => debug!("Failed to serialize log: {}", e),
pub fn add_batch(&mut self, logs: Vec<String>) {
for log in logs {
self.messages.push_back(log);
}
}

@@ -91,7 +89,7 @@ mod tests {
use crate::logs::lambda::{IntakeLog, Lambda, Message};

#[test]
fn test_add() {
fn test_add_batch() {
let mut aggregator = Aggregator::default();
let log = IntakeLog {
message: Message {
@@ -108,9 +106,10 @@
tags: "tags".to_string(),
source: "source".to_string(),
};
aggregator.add(log.clone());
let serialized_log = serde_json::to_string(&log).unwrap();
aggregator.add_batch(vec![serialized_log.clone()]);
assert_eq!(aggregator.messages.len(), 1);
assert_eq!(aggregator.messages[0], serde_json::to_string(&log).unwrap());
assert_eq!(aggregator.messages[0], serialized_log);
}

#[test]
@@ -131,7 +130,8 @@
tags: "tags".to_string(),
source: "source".to_string(),
};
aggregator.add(log.clone());
let serialized_log = serde_json::to_string(&log).unwrap();
aggregator.add_batch(vec![serialized_log.clone()]);
assert_eq!(aggregator.messages.len(), 1);
let batch = aggregator.get_batch();
let serialized_batch = format!("[{}]", serde_json::to_string(&log).unwrap());
Expand All @@ -157,9 +157,12 @@ mod tests {
source: "source".to_string(),
};
// Add 3 logs
aggregator.add(log.clone());
aggregator.add(log.clone());
aggregator.add(log.clone());
let serialized_log = serde_json::to_string(&log).unwrap();
aggregator.add_batch(vec![
serialized_log.clone(),
serialized_log.clone(),
serialized_log.clone(),
]);

// The batch should only contain the first 2 logs
let first_batch = aggregator.get_batch();
@@ -194,12 +197,14 @@
source: "source".to_string(),
};
// Add 2 logs
aggregator.add(log.clone());
let serialized_log = serde_json::to_string(&log).unwrap();
aggregator.add_batch(vec![serialized_log.clone()]);

// This log will exceed the max content size
let mut big_log = log.clone();
big_log.message.message = "a".repeat(256);
aggregator.add(big_log.clone());
let serialized_big_log = serde_json::to_string(&big_log).unwrap();
aggregator.add_batch(vec![serialized_big_log.clone()]);

let first_batch = aggregator.get_batch();
let serialized_log = serde_json::to_string(&log).unwrap();
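With `add` replaced by `add_batch`, the aggregator no longer serializes anything: it just queues strings the processor has already rendered to JSON, which keeps its critical section cheap and moves the serde work to the producer side, once per batch. A sketch of that producer-side split (the `IntakeLog` shape here is simplified, not the crate's real struct):

```rust
use serde::Serialize;

// Simplified stand-in for the crate's IntakeLog.
#[derive(Serialize)]
struct IntakeLog {
    message: String,
    status: String,
}

/// Processor-side: render the whole batch to JSON strings first, then
/// hand the ready-to-ship payloads to the aggregator in one call.
fn serialize_batch(logs: &[IntakeLog]) -> Vec<String> {
    logs.iter()
        .filter_map(|log| serde_json::to_string(log).ok())
        .collect()
}
```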
