diff --git a/packages/brain/src/index.ts b/packages/brain/src/index.ts index 094ad9ab..a7588155 100644 --- a/packages/brain/src/index.ts +++ b/packages/brain/src/index.ts @@ -8,7 +8,8 @@ import { ValidatorApi, DappnodeSignatureVerifier, DappmanagerApi, - PostgresClient + PostgresClient, + PrometheusApi } from "./modules/apiClients/index.js"; import { startUiServer, startLaunchpadApi } from "./modules/apiServers/index.js"; import * as dotenv from "dotenv"; @@ -59,6 +60,13 @@ logger.debug( ); // Create API instances. Must preceed db initialization +export const prometheusApi = new PrometheusApi({ + baseUrl: "http://prometheus.dms.dappnode:9090", + minGenesisTime, + secondsPerSlot, + slotsPerEpoch, + network +}); export const signerApi = new Web3SignerApi( { baseUrl: signerUrl, @@ -116,6 +124,7 @@ export const trackValidatorsPerformanceCronTask = new CronJob( executionClient, consensusClient, dappmanagerApi, + prometheusApi, sendNotification: true }); } diff --git a/packages/brain/src/modules/apiClients/index.ts b/packages/brain/src/modules/apiClients/index.ts index 752c626c..40ac0182 100644 --- a/packages/brain/src/modules/apiClients/index.ts +++ b/packages/brain/src/modules/apiClients/index.ts @@ -1,5 +1,6 @@ export { BlockExplorerApi } from "./blockExplorer/index.js"; export { DappmanagerApi } from "./dappmanager/index.js"; +export { PrometheusApi } from "./prometheus/index.js"; export { BeaconchainApi } from "./beaconchain/index.js"; export { ValidatorApi } from "./validator/index.js"; export { StandardApi } from "./standard.js"; diff --git a/packages/brain/src/modules/apiClients/prometheus/error.ts b/packages/brain/src/modules/apiClients/prometheus/error.ts new file mode 100644 index 00000000..0e93e6a6 --- /dev/null +++ b/packages/brain/src/modules/apiClients/prometheus/error.ts @@ -0,0 +1,8 @@ +import { ApiError } from "../error.js"; + +export class PrometheusApiError extends ApiError { + constructor(message: string) { + super(message); + this.name = "PrometheusApiError"; + } +} diff --git a/packages/brain/src/modules/apiClients/prometheus/index.ts b/packages/brain/src/modules/apiClients/prometheus/index.ts new file mode 100644 index 00000000..c0fb502b --- /dev/null +++ b/packages/brain/src/modules/apiClients/prometheus/index.ts @@ -0,0 +1,169 @@ +import { Network } from "@stakingbrain/common"; +import logger from "../../logger/index.js"; +import { StandardApi } from "../standard.js"; +import { AvgHostMetrics } from "./types.js"; +import { PrometheusApiError } from "./error.js"; + +export class PrometheusApi extends StandardApi { + private readonly minGenesisTime: number; + private readonly slotsPerEpoch: number; + private readonly secondsPerSlot: number; + + constructor({ + baseUrl, + minGenesisTime, + slotsPerEpoch, + secondsPerSlot, + network + }: { + baseUrl: string; + minGenesisTime: number; + slotsPerEpoch: number; + secondsPerSlot: number; + network: Network; + }) { + super({ baseUrl }, network); + this.minGenesisTime = minGenesisTime; + this.slotsPerEpoch = slotsPerEpoch; + this.secondsPerSlot = secondsPerSlot; + } + + /** + * Get average host metrics for a given epoch: + * - avgCpuTemperature + * - avgCpuUsage + * - avgMemoryUsage + * - ioUtilizationPerDisk + */ + public async getPrometheusMetrics({ epoch }: { epoch: number }): Promise { + try { + const { startTimestamp, endTimestamp } = this.calculateEpochTimestamps(epoch); + + // Looks like query_range does not allow to request multiple queries in a single reques + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const ioDisk: { metric: { device: string }; values: any[] }[] = await this.getPrometheusDataResult({ + query: `irate(node_disk_io_time_seconds_total{instance="node-exporter.dms.dappnode:9100", job="nodeexporter"}[${endTimestamp - startTimestamp}s])`, + startTimestamp, + endTimestamp + }); + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const ioUtilizationPerDisk = ioDisk.reduce((acc: { [key: string]: any }, disk) => { + const device = disk.metric.device; + const utilization = Math.round(parseFloat(disk.values[0][1]) * 100); + acc[device] = utilization; + return acc; + }, {}); + + return { + startTimestamp, + endTimestamp, + avgCpuTemperature: await this.getPrometheusAvgMetric({ + query: `avg_over_time(dappmanager_cpu_temperature_celsius{app="dappmanager-custom-metrics", instance="dappmanager.dappnode:80", job="manager_sd", package="dappmanager.dnp.dappnode.eth", service="dappmanager", type="current"}[${endTimestamp - startTimestamp}s])`, + startTimestamp, + endTimestamp + }), + avgCpuUsage: await this.getPrometheusAvgMetric({ + query: `100 * (1 - avg(rate(node_cpu_seconds_total{instance="node-exporter.dms.dappnode:9100", job="nodeexporter", mode="idle"}[${endTimestamp - startTimestamp}s])) by (instance))`, + startTimestamp, + endTimestamp + }), + avgMemoryUsage: await this.getPrometheusAvgMetric({ + query: `100 * (1 - avg(node_memory_MemAvailable_bytes{instance="node-exporter.dms.dappnode:9100", job="nodeexporter"} / node_memory_MemTotal_bytes{instance="node-exporter.dms.dappnode:9100", job="nodeexporter"}) by (instance))`, + startTimestamp, + endTimestamp + }), + ioUtilizationPerDisk + }; + } catch (error) { + logger.error("Failed to get prometheus metrics", error); + throw new PrometheusApiError(`Failed to get prometheus metrics: ${error.message}`); + } + } + + /** + * Query prometheus metric using the endpoint /api/v1/query_range. + * Used to get the data result for later processing. + * + * @see https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries + */ + private async getPrometheusDataResult({ + query, + startTimestamp, + endTimestamp + }: { + query: string; + startTimestamp: number; + endTimestamp: number; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + }): Promise<{ metric: { device: string }; values: any[] }[]> { + // Construct the request body + const requestBody = new URLSearchParams({ + query, + start: startTimestamp.toString(), + end: endTimestamp.toString(), + step: `10m` // It should be higher than the time range so it returns only one value + }).toString(); + + return ( + await this.request({ + method: "POST", + endpoint: `/api/v1/query_range`, + headers: { + "Content-Type": "application/x-www-form-urlencoded" + }, + body: requestBody + }) + ).data.result; + } + + /** + * Query prometheus metric using the endpoint /api/v1/query_range. + * This method assumes there is only 1 metric in the reponse (in the array) + * + * @see https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries + */ + private async getPrometheusAvgMetric({ + query, + startTimestamp, + endTimestamp + }: { + query: string; + startTimestamp: number; + endTimestamp: number; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + }): Promise { + // Construct the request body + const requestBody = new URLSearchParams({ + query, + start: startTimestamp.toString(), + end: endTimestamp.toString(), + step: `10m` // It should be higher than the time range so it returns only one value + }).toString(); + + return Math.round( + parseFloat( + ( + await this.request({ + method: "POST", + endpoint: `/api/v1/query_range`, + headers: { + "Content-Type": "application/x-www-form-urlencoded" + }, + body: requestBody + }) + ).data.result[0].values[0][1] + ) + ); + } + + private calculateEpochTimestamps(epoch: number): { startTimestamp: number; endTimestamp: number } { + const startTimestamp = this.minGenesisTime + epoch * this.slotsPerEpoch * this.secondsPerSlot; + const endTimestamp = startTimestamp + this.slotsPerEpoch * this.secondsPerSlot - 1; + return { + startTimestamp, + endTimestamp + }; + } +} diff --git a/packages/brain/src/modules/apiClients/prometheus/types.ts b/packages/brain/src/modules/apiClients/prometheus/types.ts new file mode 100644 index 00000000..7320886c --- /dev/null +++ b/packages/brain/src/modules/apiClients/prometheus/types.ts @@ -0,0 +1,10 @@ +export interface AvgHostMetrics { + startTimestamp: number; + endTimestamp: number; + avgCpuTemperature: number; + avgCpuUsage: number; + avgMemoryUsage: number; + ioUtilizationPerDisk: { + [disk: string]: number; + }; +} diff --git a/packages/brain/src/modules/apiClients/standard.ts b/packages/brain/src/modules/apiClients/standard.ts index b2abbc2b..98d28d97 100644 --- a/packages/brain/src/modules/apiClients/standard.ts +++ b/packages/brain/src/modules/apiClients/standard.ts @@ -130,7 +130,8 @@ export class StandardApi { let errorMessage = ""; if (res.headers["content-type"] && res.headers["content-type"].includes("application/json")) { try { - errorMessage = JSON.parse(Buffer.concat(data).toString())?.message; + // if its a error message in JSON we dont know the object format so print it in string format the whole error + errorMessage = Buffer.concat(data).toString(); } catch (e) { logger.error( `Error parsing response from ${this.requestOptions.hostname} ${endpoint} ${e.message}`, diff --git a/packages/brain/src/modules/cron/trackValidatorsPerformance/fetchAndInsertEpochValidatorsData.ts b/packages/brain/src/modules/cron/trackValidatorsPerformance/fetchAndInsertEpochValidatorsData.ts index 1d6568e8..e91ba960 100644 --- a/packages/brain/src/modules/cron/trackValidatorsPerformance/fetchAndInsertEpochValidatorsData.ts +++ b/packages/brain/src/modules/cron/trackValidatorsPerformance/fetchAndInsertEpochValidatorsData.ts @@ -6,7 +6,7 @@ import { setBlockProposalStatus } from "./setBlockProposalStatus.js"; import { setActiveValidatorsLoadedInBrain } from "./setActiveValidatorsLoadedInBrain.js"; import { ExecutionOfflineError, NodeSyncingError } from "./error.js"; // External -import { BeaconchainApi, PostgresClient, DappmanagerApi } from "../../apiClients/index.js"; +import { BeaconchainApi, PostgresClient, DappmanagerApi, PrometheusApi } from "../../apiClients/index.js"; import { BrainDbError } from "../../db/error.js"; import logger from "../../logger/index.js"; import { BrainDataBase } from "../../db/index.js"; @@ -23,6 +23,7 @@ export async function fetchAndInsertEpochValidatorsData({ clients, currentEpoch, dappmanagerApi, + prometheusApi, sendNotification }: { brainDb: BrainDataBase; @@ -31,6 +32,7 @@ export async function fetchAndInsertEpochValidatorsData({ clients: Clients; currentEpoch: number; dappmanagerApi: DappmanagerApi; + prometheusApi: PrometheusApi; sendNotification: boolean; }): Promise { if (currentEpoch === lastProcessedEpoch) return; @@ -88,6 +90,7 @@ export async function fetchAndInsertEpochValidatorsData({ if (sendNotification) await sendValidatorsPerformanceNotifications({ dappmanagerApi, + prometheusApi, currentEpoch: currentEpoch.toString(), validatorsDataPerEpochMap }); diff --git a/packages/brain/src/modules/cron/trackValidatorsPerformance/index.ts b/packages/brain/src/modules/cron/trackValidatorsPerformance/index.ts index 0662c1bf..88cd8981 100644 --- a/packages/brain/src/modules/cron/trackValidatorsPerformance/index.ts +++ b/packages/brain/src/modules/cron/trackValidatorsPerformance/index.ts @@ -1,5 +1,5 @@ import { ExecutionClient, ConsensusClient } from "@stakingbrain/common"; -import { PostgresClient, BeaconchainApi, DappmanagerApi } from "../../apiClients/index.js"; +import { PostgresClient, BeaconchainApi, DappmanagerApi, PrometheusApi } from "../../apiClients/index.js"; import { BrainDataBase } from "../../db/index.js"; import logger from "../../logger/index.js"; import { fetchAndInsertEpochValidatorsData } from "./fetchAndInsertEpochValidatorsData.js"; @@ -13,6 +13,7 @@ export async function trackEpochValidatorsDataCron({ executionClient, consensusClient, dappmanagerApi, + prometheusApi, sendNotification }: { brainDb: BrainDataBase; @@ -21,6 +22,7 @@ export async function trackEpochValidatorsDataCron({ executionClient: ExecutionClient; consensusClient: ConsensusClient; dappmanagerApi: DappmanagerApi; + prometheusApi: PrometheusApi; sendNotification: boolean; }): Promise { try { @@ -49,6 +51,7 @@ export async function trackEpochValidatorsDataCron({ }, currentEpoch, dappmanagerApi, + prometheusApi, sendNotification }); } catch (error) { diff --git a/packages/brain/src/modules/cron/trackValidatorsPerformance/sendValidatorsPerformanceNotifications.ts b/packages/brain/src/modules/cron/trackValidatorsPerformance/sendValidatorsPerformanceNotifications.ts index b249233d..db52a7d9 100644 --- a/packages/brain/src/modules/cron/trackValidatorsPerformance/sendValidatorsPerformanceNotifications.ts +++ b/packages/brain/src/modules/cron/trackValidatorsPerformance/sendValidatorsPerformanceNotifications.ts @@ -1,4 +1,4 @@ -import { DappmanagerApi } from "../../apiClients/index.js"; +import { DappmanagerApi, PrometheusApi } from "../../apiClients/index.js"; import { NotificationType } from "../../apiClients/dappmanager/types.js"; import { BlockProposalStatus, ValidatorsDataPerEpochMap } from "../../apiClients/postgres/types.js"; import logger from "../../logger/index.js"; @@ -12,10 +12,12 @@ import { logPrefix } from "./logPrefix.js"; */ export async function sendValidatorsPerformanceNotifications({ dappmanagerApi, + prometheusApi, currentEpoch, validatorsDataPerEpochMap }: { dappmanagerApi: DappmanagerApi; + prometheusApi: PrometheusApi; currentEpoch: string; validatorsDataPerEpochMap: ValidatorsDataPerEpochMap; }): Promise { @@ -23,10 +25,11 @@ export async function sendValidatorsPerformanceNotifications({ sendSuccessNotificationNotThrow({ dappmanagerApi, validatorsDataPerEpochMap, currentEpoch }), sendWarningNotificationNotThrow({ dappmanagerApi, + prometheusApi, validatorsDataPerEpochMap, currentEpoch }), - sendDangerNotificationNotThrow({ dappmanagerApi, validatorsDataPerEpochMap, currentEpoch }) + sendDangerNotificationNotThrow({ dappmanagerApi, prometheusApi, validatorsDataPerEpochMap, currentEpoch }) ]); } @@ -55,10 +58,12 @@ async function sendSuccessNotificationNotThrow({ async function sendWarningNotificationNotThrow({ dappmanagerApi, + prometheusApi, validatorsDataPerEpochMap, currentEpoch }: { dappmanagerApi: DappmanagerApi; + prometheusApi: PrometheusApi; validatorsDataPerEpochMap: ValidatorsDataPerEpochMap; currentEpoch: string; }): Promise { @@ -67,21 +72,24 @@ async function sendWarningNotificationNotThrow({ ); if (validatorsMissedAttestations.length === 0) return; + const hostMetricsMessage = await getHostMetricsMessage(prometheusApi, currentEpoch); await dappmanagerApi .sendDappmanagerNotification({ title: `Missed attestation in epoch ${currentEpoch}`, notificationType: NotificationType.Warning, - body: `Validator(s) ${validatorsMissedAttestations.join(", ")} missed attestations` + body: `Validator(s) ${validatorsMissedAttestations.join(", ")} missed an attestation\n${hostMetricsMessage}` }) .catch((error) => logger.error(`${logPrefix}Failed to send warning notification to dappmanager`, error)); } async function sendDangerNotificationNotThrow({ dappmanagerApi, + prometheusApi, validatorsDataPerEpochMap, currentEpoch }: { dappmanagerApi: DappmanagerApi; + prometheusApi: PrometheusApi; validatorsDataPerEpochMap: ValidatorsDataPerEpochMap; currentEpoch: string; }): Promise { @@ -90,11 +98,45 @@ async function sendDangerNotificationNotThrow({ ); if (validatorsMissedBlocks.length === 0) return; + const hostMetricsMessage = await getHostMetricsMessage(prometheusApi, currentEpoch); await dappmanagerApi .sendDappmanagerNotification({ title: `Block missed in epoch ${currentEpoch}`, notificationType: NotificationType.Danger, - body: `Validator(s) ${validatorsMissedBlocks.join(", ")} missed a block` + body: `Validator(s) ${validatorsMissedBlocks.join(", ")} missed a block\n${hostMetricsMessage}` }) .catch((error) => logger.error(`${logPrefix}Failed to send danger notification to dappmanager`, error)); } + +async function getHostMetricsMessage(prometheusApi: PrometheusApi, epoch: string): Promise { + const { startTimestamp, endTimestamp, avgCpuTemperature, avgCpuUsage, avgMemoryUsage, ioUtilizationPerDisk } = + await prometheusApi.getPrometheusMetrics({ epoch: parseInt(epoch) }); + + // Create a formatted message for Disk I/O utilization + const ioUtilizationPerDiskMessage = Object.entries(ioUtilizationPerDisk) + .map(([disk, utilization]) => ` - *${disk}*: *${utilization}%*`) + .join("\n"); + + // Create a structured and formatted message + return `⚠️ *Average host metrics within epoch ${epoch}*:\n +- *CPU temperature*: *${avgCpuTemperature}°C* +- *CPU usage*: *${avgCpuUsage}%* +- *Memory usage*: *${avgMemoryUsage}%* +- *Disk I/O utilization*:\n${ioUtilizationPerDiskMessage}\n +${getDmsDashboardsMessage({ startTimestamp, endTimestamp })}`; +} + +function getDmsDashboardsMessage({ + startTimestamp, + endTimestamp +}: { + startTimestamp: number; + endTimestamp: number; +}): string { + const startTimestampInMs = startTimestamp * 1000; + const endTimestampInMs = endTimestamp * 1000; + + return `🔗 *For more details, check the DMS dashboards:*\n +- [Host dashboard](http://dms.dappnode/d/dms-host/host?orgId=1&from=${startTimestampInMs}&to=${endTimestampInMs}) +- [Docker dashboard](http://dms.dappnode/d/dms-docker/docker?orgId=1&from=${startTimestampInMs}&to=${endTimestampInMs})`; +}