Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(beacon-node): network worker event latency metrics #5800

Merged
merged 42 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
3d1a1e6
feat(beacon-node): pass metrics to workerEvents
matthewkeil Jul 25, 2023
bd0c2ff
feat(beacon-node): add types for network worker event metrics
matthewkeil Jul 25, 2023
1950c78
feat(beacon-node): add metrics for network worker events
matthewkeil Jul 25, 2023
c7679bb
feat(beacon-node): add metric data to network worker events
matthewkeil Jul 25, 2023
98bef5e
fix(beacon-node): move async iterator timestamps to correct location
matthewkeil Jul 25, 2023
63708a8
fix(beacon-node): Omit unnecessary emittedAt from parameter type
matthewkeil Jul 25, 2023
baa4497
feat(dashboards): add network worker thread metrics to dashboard
matthewkeil Jul 25, 2023
c6f89f3
fix(dashboard): change metric name back to lodestar_ prefix
matthewkeil Jul 25, 2023
6c9f9fb
fix: run check-types and update missed test types
matthewkeil Jul 25, 2023
37a097b
Revert "feat(beacon-node): add types for network worker event metrics"
matthewkeil Jul 25, 2023
8a205d1
Revert "fix(beacon-node): move async iterator timestamps to correct l…
matthewkeil Jul 25, 2023
78714cc
Revert "fix(beacon-node): Omit unnecessary emittedAt from parameter t…
matthewkeil Jul 25, 2023
4d49479
Revert "feat(beacon-node): add metric data to network worker events"
matthewkeil Jul 25, 2023
1bb9fbd
feat(beacon-node): capture worker message in hrTime
matthewkeil Jul 25, 2023
69f720a
fix(dashboards): remove re-emit panel from network worker row
matthewkeil Jul 25, 2023
f70eeb3
fix(metrics): remove unused re-emit metrics
matthewkeil Jul 25, 2023
6117bd0
Revert "fix: run check-types and update missed test types"
matthewkeil Jul 25, 2023
79c1430
fix(beacon-node): update metric name at call site
matthewkeil Jul 25, 2023
c890148
fix(metrics): update capture to ISU units
matthewkeil Jul 25, 2023
c8172f5
refactor(beacon-node): move initialization out of conditional
matthewkeil Jul 25, 2023
0c84dcb
fix: remove unused import
matthewkeil Jul 25, 2023
daa2640
feat(metrics): add unit to metric name
matthewkeil Jul 27, 2023
18e4561
feat(metrics): add unit to metric name
matthewkeil Jul 27, 2023
d18702c
feat: add trace log statement to network worker
matthewkeil Jul 27, 2023
6099560
fix: change trace to debug log
matthewkeil Jul 27, 2023
5949206
feat(metrics): add eventName to network worker message metrics
matthewkeil Jul 27, 2023
e1ad935
bug(logger): check if trace is broken
matthewkeil Jul 27, 2023
106ae07
feat(metrics): add worker eventDirection label
matthewkeil Jul 27, 2023
570162b
fix(metrics): use string instead of enum for eventDirection
matthewkeil Jul 27, 2023
5f76c36
fix(metrics): remove eventDirection label
matthewkeil Jul 27, 2023
e76e672
feat(dashboards): add average panel for network worker message
matthewkeil Aug 1, 2023
f69bfbb
fix(metrics): update naming per Nico's suggestions
matthewkeil Aug 1, 2023
1b92def
refactor: remove unused logger from workerEvents.ts
matthewkeil Aug 1, 2023
6b04c02
fix(metrics): add network worker unit name back
matthewkeil Aug 1, 2023
f7874c0
fix(beacon-node): use bigint for hrtime in worker message metric
matthewkeil Aug 3, 2023
042b913
fix(dashboards): remove dashboard changes. moved to PR#5827
matthewkeil Aug 3, 2023
8477afb
fix(dashboards): remove dashboard changes. moved to PR#5827
matthewkeil Aug 3, 2023
e106735
fix(dashboards): remove dashboard changes. moved to PR#5827
matthewkeil Aug 3, 2023
cfcb7a6
fix: constant case for nano conversion
matthewkeil Oct 12, 2023
e802437
Revert "fix(beacon-node): use bigint for hrtime in worker message met…
matthewkeil Oct 12, 2023
537d465
refactor: remove Sec suffix in metric variable name
matthewkeil Oct 12, 2023
f9ed6bb
fix(dashboard): make metric name match the updates in PR
matthewkeil Oct 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions dashboards/lodestar_networking.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"__inputs": [
{
"description": "",
"label": "Prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource",
"label": "Prometheus",
"description": "",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
"pluginName": "Prometheus"
}
],
"annotations": {
Expand Down Expand Up @@ -937,7 +937,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "(\n sum(rate(\n lodestar_network_worker_wire_events_on_worker_thread_latency_sum[$rate_interval]\n )) \n +\n sum(rate(\n lodestar_network_worker_wire_events_on_main_thread_latency_sum[$rate_interval]\n ))\n)\n/\n(\n sum(rate(\n lodestar_network_worker_wire_events_on_worker_thread_latency_count[$rate_interval]\n ))\n +\n sum(rate(\n lodestar_network_worker_wire_events_on_main_thread_latency_count[$rate_interval]\n ))\n)",
"expr": "(\n sum(rate(\n lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_sum[$rate_interval]\n )) \n +\n sum(rate(\n lodestar_network_worker_wire_events_on_main_thread_latency_seconds_sum[$rate_interval]\n ))\n)\n/\n(\n sum(rate(\n lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_count[$rate_interval]\n ))\n +\n sum(rate(\n lodestar_network_worker_wire_events_on_main_thread_latency_seconds_count[$rate_interval]\n ))\n)",
"hide": false,
"interval": "",
"legendFormat": "Average",
Expand All @@ -951,7 +951,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "avg(rate(lodestar_network_worker_wire_events_on_worker_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_worker_thread_latency_count[$rate_interval]))",
"expr": "avg(rate(lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_count[$rate_interval]))",
"hide": false,
"interval": "",
"legendFormat": "Worker to Main",
Expand All @@ -964,7 +964,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "avg(rate(lodestar_network_worker_wire_events_on_main_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_count[$rate_interval]))",
"expr": "avg(rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_count[$rate_interval]))",
"hide": false,
"legendFormat": "Main to Worker",
"range": true,
Expand Down Expand Up @@ -1051,7 +1051,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "avg(rate(lodestar_network_worker_wire_events_on_main_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_count[$rate_interval]))",
"expr": "avg(rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_count[$rate_interval]))",
"hide": true,
"interval": "",
"legendFormat": "Average to Main",
Expand All @@ -1065,7 +1065,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "rate(lodestar_network_worker_wire_events_on_main_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_count[$rate_interval])",
"expr": "rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_count[$rate_interval])",
"interval": "",
"legendFormat": "{{eventName}}",
"range": true,
Expand All @@ -1077,7 +1077,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(lodestar_network_worker_wire_events_on_worker_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_worker_thread_latency_count[$rate_interval])",
"expr": "rate(lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_count[$rate_interval])",
"hide": false,
"legendFormat": "{{eventName}}",
"range": true,
Expand Down
6 changes: 6 additions & 0 deletions packages/beacon-node/src/metrics/metrics/lodestar.ts
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ export function createLodestarMetrics(
help: "Current count of pending items in reqRespBridgeReqCaller data structure",
}),
},
networkWorkerWireEventsOnMainThreadLatency: register.histogram<"eventName">({
name: "lodestar_network_worker_wire_events_on_main_thread_latency_seconds",
matthewkeil marked this conversation as resolved.
Show resolved Hide resolved
help: "Latency in seconds to transmit network events to main thread across worker port",
labelNames: ["eventName"],
buckets: [0.001, 0.003, 0.01, 0.03, 0.1],
}),

regenQueue: {
length: register.gauge({
Expand Down
8 changes: 8 additions & 0 deletions packages/beacon-node/src/network/core/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -333,12 +333,20 @@ export function createNetworkCoreMetrics(register: RegistryMetricCreator) {
};
}

export type NetworkCoreWorkerMetrics = ReturnType<typeof getNetworkCoreWorkerMetrics>;

// eslint-disable-next-line @typescript-eslint/explicit-function-return-type
export function getNetworkCoreWorkerMetrics(register: RegistryMetricCreator) {
return {
reqRespBridgeRespCallerPending: register.gauge({
name: "lodestar_network_worker_reqresp_bridge_caller_pending_count",
help: "Current count of pending elements in respBridgeCaller",
}),
networkWorkerWireEventsOnWorkerThreadLatency: register.histogram<"eventName">({
name: "lodestar_network_worker_wire_events_on_worker_thread_latency_seconds",
help: "Latency in seconds to transmit network events to worker thread across parent port",
labelNames: ["eventName"],
buckets: [0.001, 0.003, 0.01, 0.03, 0.1],
}),
};
}
12 changes: 7 additions & 5 deletions packages/beacon-node/src/network/core/networkCoreWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ import {peerIdToString} from "../../util/peerId.js";
import {profileNodeJS} from "../../util/profile.js";
import {NetworkEventBus, NetworkEventData, networkEventDirection} from "../events.js";
import {wireEventsOnWorkerThread} from "../../util/workerEvents.js";
import {getNetworkCoreWorkerMetrics} from "./metrics.js";
import {NetworkWorkerApi, NetworkWorkerData} from "./types.js";
import {NetworkCore} from "./networkCore.js";
import {
NetworkWorkerThreadEventType,
ReqRespBridgeEventBus,
Expand All @@ -21,9 +24,6 @@ import {
getReqRespBridgeRespEvents,
reqRespBridgeEventDirection,
} from "./events.js";
import {getNetworkCoreWorkerMetrics} from "./metrics.js";
import {NetworkCore} from "./networkCore.js";
import {NetworkWorkerApi, NetworkWorkerData} from "./types.js";

// Cloned data from instantiation
const workerData = worker.workerData as NetworkWorkerData;
Expand Down Expand Up @@ -83,9 +83,9 @@ new AsyncIterableBridgeHandler(getReqRespBridgeReqEvents(reqRespBridgeEventBus),
);
const reqRespBridgeRespCaller = new AsyncIterableBridgeCaller(getReqRespBridgeRespEvents(reqRespBridgeEventBus));

const networkCoreWorkerMetrics = metricsRegister ? getNetworkCoreWorkerMetrics(metricsRegister) : null;
// respBridgeCaller metrics
if (metricsRegister) {
const networkCoreWorkerMetrics = getNetworkCoreWorkerMetrics(metricsRegister);
if (networkCoreWorkerMetrics) {
networkCoreWorkerMetrics.reqRespBridgeRespCallerPending.addCollect(() => {
networkCoreWorkerMetrics.reqRespBridgeRespCallerPending.set(reqRespBridgeRespCaller.pendingCount);
});
Expand All @@ -110,12 +110,14 @@ wireEventsOnWorkerThread<NetworkEventData>(
NetworkWorkerThreadEventType.networkEvent,
events,
parentPort,
networkCoreWorkerMetrics,
networkEventDirection
);
wireEventsOnWorkerThread<ReqRespBridgeEventData>(
NetworkWorkerThreadEventType.reqRespBridgeEvents,
reqRespBridgeEventBus,
parentPort,
networkCoreWorkerMetrics,
reqRespBridgeEventDirection
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,14 @@ export class WorkerNetworkCore implements INetworkCore {
NetworkWorkerThreadEventType.networkEvent,
modules.events,
modules.worker as unknown as worker_threads.Worker,
modules.metrics,
networkEventDirection
);
wireEventsOnMainThread<ReqRespBridgeEventData>(
NetworkWorkerThreadEventType.reqRespBridgeEvents,
this.reqRespBridgeEventBus,
modules.worker as unknown as worker_threads.Worker,
modules.metrics,
reqRespBridgeEventDirection
);

Expand Down
21 changes: 21 additions & 0 deletions packages/beacon-node/src/util/workerEvents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@ import {MessagePort, Worker} from "node:worker_threads";
import {Thread} from "@chainsafe/threads";
import {Logger} from "@lodestar/logger";
import {sleep} from "@lodestar/utils";
import {Metrics} from "../metrics/metrics.js";
import {NetworkCoreWorkerMetrics} from "../network/core/metrics.js";
import {StrictEventEmitterSingleArg} from "./strictEvents.js";

const NANO_TO_SECOND_CONVERSION = 1e9;

export type WorkerBridgeEvent<EventData> = {
type: string;
event: keyof EventData;
posted: [number, number];
data: EventData[keyof EventData];
};

Expand All @@ -27,6 +32,7 @@ export function wireEventsOnWorkerThread<EventData>(
mainEventName: string,
events: StrictEventEmitterSingleArg<EventData>,
parentPort: MessagePort,
metrics: NetworkCoreWorkerMetrics | null,
isWorkerToMain: {[K in keyof EventData]: EventDirection}
): void {
// Subscribe to events from main thread
Expand All @@ -37,6 +43,12 @@ export function wireEventsOnWorkerThread<EventData>(
// This check is not necessary but added for safety in case of improper implemented events
isWorkerToMain[data.event] === EventDirection.mainToWorker
) {
const [sec, nanoSec] = process.hrtime(data.posted);
matthewkeil marked this conversation as resolved.
Show resolved Hide resolved
const networkWorkerLatency = sec + nanoSec / NANO_TO_SECOND_CONVERSION;
metrics?.networkWorkerWireEventsOnWorkerThreadLatency.observe(
{eventName: data.event as string},
networkWorkerLatency
);
events.emit(data.event, data.data);
}
});
Expand All @@ -48,6 +60,7 @@ export function wireEventsOnWorkerThread<EventData>(
const workerEvent: WorkerBridgeEvent<EventData> = {
type: mainEventName,
event: eventName,
posted: process.hrtime(),
data,
};
parentPort.postMessage(workerEvent);
Expand All @@ -60,6 +73,7 @@ export function wireEventsOnMainThread<EventData>(
mainEventName: string,
events: StrictEventEmitterSingleArg<EventData>,
worker: Pick<Worker, "on" | "postMessage">,
metrics: Metrics | null,
isWorkerToMain: {[K in keyof EventData]: EventDirection}
): void {
// Subscribe to events from main thread
Expand All @@ -70,6 +84,12 @@ export function wireEventsOnMainThread<EventData>(
// This check is not necessary but added for safety in case of improper implemented events
isWorkerToMain[data.event] === EventDirection.workerToMain
) {
const [sec, nanoSec] = process.hrtime(data.posted);
const networkWorkerLatency = sec + nanoSec / NANO_TO_SECOND_CONVERSION;
metrics?.networkWorkerWireEventsOnMainThreadLatency.observe(
{eventName: data.event as string},
networkWorkerLatency
);
events.emit(data.event, data.data);
}
});
Expand All @@ -81,6 +101,7 @@ export function wireEventsOnMainThread<EventData>(
const workerEvent: WorkerBridgeEvent<EventData> = {
type: mainEventName,
event: eventName,
posted: process.hrtime(),
data,
};
worker.postMessage(workerEvent);
Expand Down