From 75fffe3ef63ba4e1737a12dad18c12e7af5a5ca7 Mon Sep 17 00:00:00 2001 From: walker-16 Date: Tue, 25 Jul 2023 15:39:27 -0300 Subject: [PATCH] add warning alert and metrics to fly component (#576) * Add unknown guardian set index alert * Add prometheus metrics to check if exists observations without txhash * Add max sequence cache error prometheus metrics --- fly/guardiansets/guardianset.go | 31 ++++++++++++++++++++-------- fly/guardiansets/guardianset_test.go | 10 +++++---- fly/internal/alert/alert.go | 20 ++++++++++++++---- fly/internal/metrics/dummy.go | 6 ++++++ fly/internal/metrics/metrics.go | 4 ++++ fly/internal/metrics/prometheus.go | 21 +++++++++++++++++++ fly/main.go | 2 +- fly/processor/vaa_gossip_consumer.go | 2 +- fly/processor/vaa_queue_consumer.go | 1 + fly/storage/repository.go | 3 ++- 10 files changed, 80 insertions(+), 20 deletions(-) diff --git a/fly/guardiansets/guardianset.go b/fly/guardiansets/guardianset.go index 896033d6a..da73bdc43 100644 --- a/fly/guardiansets/guardianset.go +++ b/fly/guardiansets/guardianset.go @@ -1,31 +1,42 @@ package guardiansets import ( + "context" "errors" "fmt" "time" "github.com/certusone/wormhole/node/pkg/common" + eth_common "github.com/ethereum/go-ethereum/common" + "github.com/wormhole-foundation/wormhole-explorer/common/client/alert" "github.com/wormhole-foundation/wormhole-explorer/common/domain" + flyAlert "github.com/wormhole-foundation/wormhole-explorer/fly/internal/alert" sdk "github.com/wormhole-foundation/wormhole/sdk/vaa" - - eth_common "github.com/ethereum/go-ethereum/common" ) // GuardianSetHistory contains information about all guardian sets for the current network (past and present). type GuardianSetHistory struct { guardianSetsByIndex []common.GuardianSet expirationTimesByIndex []time.Time + alertClient alert.AlertClient } // Verify takes a VAA as input and validates its guardian signatures. -func (h *GuardianSetHistory) Verify(vaa *sdk.VAA) error { +func (h *GuardianSetHistory) Verify(ctx context.Context, vaa *sdk.VAA) error { idx := vaa.GuardianSetIndex // Make sure the index exists if idx >= uint32(len(h.guardianSetsByIndex)) { - return fmt.Errorf("Guardian Set Index is out of bounds: got %d, max is %d", + alertContext := alert.AlertContext{ + Details: map[string]string{ + "vaaID": vaa.MessageID(), + "vaaGuardianSetIndex": fmt.Sprint(vaa.GuardianSetIndex), + "guardianSetIndex": fmt.Sprint(len(h.guardianSetsByIndex)), + }, + } + _ = h.alertClient.CreateAndSend(ctx, flyAlert.GuardianSetUnknown, alertContext) + return fmt.Errorf("guardian Set Index is out of bounds: got %d, max is %d", vaa.GuardianSetIndex, len(h.guardianSetsByIndex), ) @@ -45,16 +56,16 @@ func (h GuardianSetHistory) GetLatest() common.GuardianSet { } // Get get guardianset config by enviroment. -func GetByEnv(enviroment string) GuardianSetHistory { +func GetByEnv(enviroment string, alertClient alert.AlertClient) GuardianSetHistory { switch enviroment { case domain.P2pTestNet: - return getTestnetGuardianSet() + return getTestnetGuardianSet(alertClient) default: - return getMainnetGuardianSet() + return getMainnetGuardianSet(alertClient) } } -func getTestnetGuardianSet() GuardianSetHistory { +func getTestnetGuardianSet(alertClient alert.AlertClient) GuardianSetHistory { const tenYears = time.Hour * 24 * 365 * 10 gs0TestValidUntil := time.Now().Add(tenYears) gstest0 := common.GuardianSet{ @@ -66,10 +77,11 @@ func getTestnetGuardianSet() GuardianSetHistory { return GuardianSetHistory{ guardianSetsByIndex: []common.GuardianSet{gstest0}, expirationTimesByIndex: []time.Time{gs0TestValidUntil}, + alertClient: alertClient, } } -func getMainnetGuardianSet() GuardianSetHistory { +func getMainnetGuardianSet(alertClient alert.AlertClient) GuardianSetHistory { gs0ValidUntil := time.Unix(1628599904, 0) // Tue Aug 10 2021 12:51:44 GMT+0000 gs0 := common.GuardianSet{ Index: 0, @@ -163,5 +175,6 @@ func getMainnetGuardianSet() GuardianSetHistory { return GuardianSetHistory{ guardianSetsByIndex: []common.GuardianSet{gs0, gs1, gs2, gs3}, expirationTimesByIndex: []time.Time{gs0ValidUntil, gs1ValidUntil, gs2ValidUntil, gs3ValidUntil}, + alertClient: alertClient, } } diff --git a/fly/guardiansets/guardianset_test.go b/fly/guardiansets/guardianset_test.go index 4d14ed582..2672d4d39 100644 --- a/fly/guardiansets/guardianset_test.go +++ b/fly/guardiansets/guardianset_test.go @@ -1,9 +1,11 @@ package guardiansets import ( + "context" _ "embed" "testing" + "github.com/wormhole-foundation/wormhole-explorer/common/client/alert" sdk "github.com/wormhole-foundation/wormhole/sdk/vaa" ) @@ -21,8 +23,8 @@ func TestValidSignatures(t *testing.T) { } // assert that the signatures must be valid - h := getMainnetGuardianSet() - err = h.Verify(&vaa) + h := getMainnetGuardianSet(alert.NewDummyClient()) + err = h.Verify(context.TODO(), &vaa) if err != nil { t.Fatalf("Failed to verify VAA: %v", err) } @@ -45,8 +47,8 @@ func TestInvalidSignatures(t *testing.T) { } // assert that the signatures must be invalid - h := getMainnetGuardianSet() - err = h.Verify(&vaa) + h := getMainnetGuardianSet(alert.NewDummyClient()) + err = h.Verify(context.TODO(), &vaa) if err == nil { t.Fatal("Expected signatures to be invalid") } diff --git a/fly/internal/alert/alert.go b/fly/internal/alert/alert.go index d59f49a6b..63e0aa72f 100644 --- a/fly/internal/alert/alert.go +++ b/fly/internal/alert/alert.go @@ -13,7 +13,11 @@ const ( ErrorSaveObservation = "ERROR_SAVE_OBSERVATION" ErrorSaveHeartbeat = "ERROR_SAVE_HEARTBEAT" ErrorSaveGovernorStatus = "ERROR_SAVE_GOVERNOR_STATUS" - EroorSaveGovernorConfig = "ERROR_SAVE_GOVERNOR_CONFIG" + ErrorSaveGovernorConfig = "ERROR_SAVE_GOVERNOR_CONFIG" + + // warning alerts + GuardianSetUnknown = "GUARDIAN_SET_UNKNOWN" + ObservationWithoutTxHash = "OBSERVATION_WITHOUT_TX_HASH" ) func LoadAlerts(cfg alert.AlertConfig) map[string]alert.Alert { @@ -68,8 +72,8 @@ func LoadAlerts(cfg alert.AlertConfig) map[string]alert.Alert { Entity: "fly", Priority: alert.CRITICAL, } - alerts[EroorSaveGovernorConfig] = alert.Alert{ - Alias: EroorSaveGovernorConfig, + alerts[ErrorSaveGovernorConfig] = alert.Alert{ + Alias: ErrorSaveGovernorConfig, Message: fmt.Sprintf("[%s] %s", cfg.Environment, "Error saving governor config in governorConfig collection"), Description: "An error was found persisting the governor config in mongo in the governorConfig collection.", Actions: []string{}, @@ -77,6 +81,14 @@ func LoadAlerts(cfg alert.AlertConfig) map[string]alert.Alert { Entity: "fly", Priority: alert.CRITICAL, } - + alerts[GuardianSetUnknown] = alert.Alert{ + Alias: GuardianSetUnknown, + Message: fmt.Sprintf("[%s] %s", cfg.Environment, "Guardian set unknown"), + Description: "The guardian set from the vaa is unknown.", + Actions: []string{}, + Tags: []string{cfg.Environment, "fly", "guardianSet", "vaa"}, + Entity: "fly", + Priority: alert.INFORMATIONAL, + } return alerts } diff --git a/fly/internal/metrics/dummy.go b/fly/internal/metrics/dummy.go index 5f7fef8c7..15cf5e758 100644 --- a/fly/internal/metrics/dummy.go +++ b/fly/internal/metrics/dummy.go @@ -35,6 +35,9 @@ func (d *DummyMetrics) IncObservationUnfiltered(chain sdk.ChainID) {} // IncObservationInserted increases the number of observation inserted in database. func (d *DummyMetrics) IncObservationInserted(chain sdk.ChainID) {} +// IncObservationWithoutTxHash increases the number of observation without tx hash. +func (d *DummyMetrics) IncObservationWithoutTxHash(chain sdk.ChainID) {} + // IncObservationTotal increases the number of observation received from Gossip network. func (d *DummyMetrics) IncObservationTotal() {} @@ -55,3 +58,6 @@ func (d *DummyMetrics) IncGovernorStatusFromGossipNetwork(guardianName string) { // IncGovernorStatusInserted increases the number of guardian status inserted in database. func (d *DummyMetrics) IncGovernorStatusInserted(guardianName string) {} + +// IncMaxSequenceCacheError increases the number of errors when updating max sequence cache. +func (d *DummyMetrics) IncMaxSequenceCacheError(chain sdk.ChainID) {} diff --git a/fly/internal/metrics/metrics.go b/fly/internal/metrics/metrics.go index 9b4b2ee3e..cbd62dc82 100644 --- a/fly/internal/metrics/metrics.go +++ b/fly/internal/metrics/metrics.go @@ -16,6 +16,7 @@ type Metrics interface { IncObservationFromGossipNetwork(chain sdk.ChainID) IncObservationUnfiltered(chain sdk.ChainID) IncObservationInserted(chain sdk.ChainID) + IncObservationWithoutTxHash(chain sdk.ChainID) IncObservationTotal() // heartbeat metrics @@ -29,4 +30,7 @@ type Metrics interface { // governor status metrics IncGovernorStatusFromGossipNetwork(guardianName string) IncGovernorStatusInserted(guardianName string) + + // max sequence cache metrics + IncMaxSequenceCacheError(chain sdk.ChainID) } diff --git a/fly/internal/metrics/prometheus.go b/fly/internal/metrics/prometheus.go index 26d265bcd..a23e3b02a 100644 --- a/fly/internal/metrics/prometheus.go +++ b/fly/internal/metrics/prometheus.go @@ -15,6 +15,7 @@ type PrometheusMetrics struct { heartbeatReceivedCount *prometheus.CounterVec governorConfigReceivedCount *prometheus.CounterVec governorStatusReceivedCount *prometheus.CounterVec + maxSequenceCacheCount *prometheus.CounterVec } // NewPrometheusMetrics returns a new instance of PrometheusMetrics. @@ -88,6 +89,15 @@ func NewPrometheusMetrics(environment string) *PrometheusMetrics { "service": serviceName, }, }, []string{"guardian_node", "type"}) + maxSequenceCacheCount := promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "max_sequence_cache_count_by_chain", + Help: "Total number of errors when updating max sequence cache", + ConstLabels: map[string]string{ + "environment": environment, + "service": serviceName, + }, + }, []string{"chain"}) return &PrometheusMetrics{ vaaReceivedCount: vaaReceivedCount, vaaTotal: vaaTotal, @@ -96,6 +106,7 @@ func NewPrometheusMetrics(environment string) *PrometheusMetrics { heartbeatReceivedCount: heartbeatReceivedCount, governorConfigReceivedCount: governorConfigReceivedCount, governorStatusReceivedCount: governorStatusReceivedCount, + maxSequenceCacheCount: maxSequenceCacheCount, } } @@ -139,6 +150,11 @@ func (m *PrometheusMetrics) IncObservationInserted(chain sdk.ChainID) { m.observationReceivedCount.WithLabelValues(chain.String(), "inserted").Inc() } +// IncObservationWithoutTxHash increases the number of observation without tx hash. +func (m *PrometheusMetrics) IncObservationWithoutTxHash(chain sdk.ChainID) { + m.observationReceivedCount.WithLabelValues(chain.String(), "without_txhash").Inc() +} + // IncObservationTotal increases the number of observation received from Gossip network. func (m *PrometheusMetrics) IncObservationTotal() { m.observationTotal.Inc() @@ -173,3 +189,8 @@ func (m *PrometheusMetrics) IncGovernorStatusFromGossipNetwork(guardianName stri func (m *PrometheusMetrics) IncGovernorStatusInserted(guardianName string) { m.governorStatusReceivedCount.WithLabelValues(guardianName, "inserted").Inc() } + +// IncMaxSequenceCacheError increases the number of errors when updating max sequence cache. +func (m *PrometheusMetrics) IncMaxSequenceCacheError(chain sdk.ChainID) { + m.maxSequenceCacheCount.WithLabelValues(chain.String()).Inc() +} diff --git a/fly/main.go b/fly/main.go index 5cdd53ceb..d6a0216f7 100644 --- a/fly/main.go +++ b/fly/main.go @@ -309,7 +309,7 @@ func main() { // Bootstrap guardian set, otherwise heartbeats would be skipped // TODO: fetch this and probably figure out how to update it live - guardianSetHistory := guardiansets.GetByEnv(p2pNetworkConfig.Enviroment) + guardianSetHistory := guardiansets.GetByEnv(p2pNetworkConfig.Enviroment, alertClient) gsLastet := guardianSetHistory.GetLatest() gst.Set(&gsLastet) diff --git a/fly/processor/vaa_gossip_consumer.go b/fly/processor/vaa_gossip_consumer.go index 2d9db7664..9668b7c74 100644 --- a/fly/processor/vaa_gossip_consumer.go +++ b/fly/processor/vaa_gossip_consumer.go @@ -43,7 +43,7 @@ func NewVAAGossipConsumer( // Push handles incoming VAAs depending on whether it is a pyth or non pyth. func (p *vaaGossipConsumer) Push(ctx context.Context, v *vaa.VAA, serializedVaa []byte) error { - if err := p.guardianSetHistory.Verify(v); err != nil { + if err := p.guardianSetHistory.Verify(ctx, v); err != nil { p.logger.Error("Received invalid vaa", zap.String("id", v.MessageID())) return err } diff --git a/fly/processor/vaa_queue_consumer.go b/fly/processor/vaa_queue_consumer.go index 8eef67509..0c2028ccc 100644 --- a/fly/processor/vaa_queue_consumer.go +++ b/fly/processor/vaa_queue_consumer.go @@ -69,6 +69,7 @@ func (c *VAAQueueConsumer) Start(ctx context.Context) { err = c.notifyFunc(ctx, v, msg.Data()) if err != nil { + c.metrics.IncMaxSequenceCacheError(v.EmitterChain) c.logger.Error("Error notifying vaa", zap.String("id", v.MessageID()), zap.Error(err)) diff --git a/fly/storage/repository.go b/fly/storage/repository.go index 9750cdfa6..58a453f61 100644 --- a/fly/storage/repository.go +++ b/fly/storage/repository.go @@ -180,6 +180,7 @@ func (s *Repository) UpsertObservation(o *gossipv1.SignedObservation) error { zap.Uint64("chainId", chainID), zap.ByteString("txHash", o.GetTxHash()), zap.Error(err)) + s.metrics.IncObservationWithoutTxHash(vaa.ChainID(chainID)) } vaaTxHash := VaaIdTxHashUpdate{ @@ -284,7 +285,7 @@ func (s *Repository) UpsertGovernorConfig(govC *gossipv1.SignedChainGovernorConf }, Error: err2, } - s.alertClient.CreateAndSend(context.TODO(), flyAlert.EroorSaveGovernorConfig, alertContext) + s.alertClient.CreateAndSend(context.TODO(), flyAlert.ErrorSaveGovernorConfig, alertContext) } return err2 }