Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add heartbeat interval parameter. #3458

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions orchagent/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ extern bool gIsNatSupported;
#define SWSS_RECORD_ENABLE (0x1 << 1)
#define RESPONSE_PUBLISHER_RECORD_ENABLE (0x1 << 2)

/*
 * Default orchagent heart beat message interval, in milliseconds (10 seconds).
 * The value is parenthesized so the macro expands to a single operand and
 * keeps its meaning inside larger expressions such as `x / MACRO`.
 */
#define HEART_BEAT_INTERVAL_MSECS_DEFAULT (10 * 1000)

string gMySwitchType = "";
int32_t gVoqMySwitchId = -1;
int32_t gVoqMaxCores = 0;
Expand All @@ -73,7 +76,7 @@ uint32_t create_switch_timeout = 0;

void usage()
{
cout << "usage: orchagent [-h] [-r record_type] [-d record_location] [-f swss_rec_filename] [-j sairedis_rec_filename] [-b batch_size] [-m MAC] [-i INST_ID] [-s] [-z mode] [-k bulk_size] [-q zmq_server_address] [-c mode] [-t create_switch_timeout] [-v VRF]" << endl;
cout << "usage: orchagent [-h] [-r record_type] [-d record_location] [-f swss_rec_filename] [-j sairedis_rec_filename] [-b batch_size] [-m MAC] [-i INST_ID] [-s] [-z mode] [-k bulk_size] [-q zmq_server_address] [-c mode] [-t create_switch_timeout] [-v VRF] [-I heart_beat_interval]" << endl;
cout << " -h: display this message" << endl;
cout << " -r record_type: record orchagent logs with type (default 3)" << endl;
cout << " Bit 0: sairedis.rec, Bit 1: swss.rec, Bit 2: responsepublisher.rec. For example:" << endl;
Expand All @@ -95,6 +98,7 @@ void usage()
cout << " -c counter mode (traditional|asic_db), default: asic_db" << endl;
cout << " -t Override create switch timeout, in sec" << endl;
cout << " -v vrf: VRF name (default empty)" << endl;
cout << " -I heart_beat_interval: Heart beat interval in millisecond (default 10)" << endl;
}

void sighup_handler(int signo)
Expand Down Expand Up @@ -349,8 +353,9 @@ int main(int argc, char **argv)
bool enable_zmq = false;
string responsepublisher_rec_filename = Recorder::RESPPUB_FNAME;
int record_type = 3; // Only swss and sairedis recordings enabled by default.
long heartBeatInterval = HEART_BEAT_INTERVAL_MSECS_DEFAULT;

while ((opt = getopt(argc, argv, "b:m:r:f:j:d:i:hsz:k:q:c:t:v:")) != -1)
while ((opt = getopt(argc, argv, "b:m:r:f:j:d:i:hsz:k:q:c:t:v:I:")) != -1)
{
switch (opt)
{
Expand Down Expand Up @@ -450,6 +455,22 @@ int main(int argc, char **argv)
vrf = optarg;
}
break;
case 'I':
if (optarg)
{
auto interval = atoi(optarg);
if (interval >= 0)
{
heartBeatInterval = interval;
SWSS_LOG_NOTICE("Setting heartbeat interval as %ld", heartBeatInterval);
}
else
{
heartBeatInterval = HEART_BEAT_INTERVAL_MSECS_DEFAULT;
SWSS_LOG_ERROR("Invalid input for heartbeat interval: %d. use default interval: %ld", interval, heartBeatInterval);
}
}
break;
default: /* '?' */
exit(EXIT_FAILURE);
}
Expand Down Expand Up @@ -815,7 +836,7 @@ int main(int argc, char **argv)
syncd_apply_view();
}

orchDaemon->start();
orchDaemon->start(heartBeatInterval);

return 0;
}
23 changes: 13 additions & 10 deletions orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@ using namespace swss;
#define APP_FABRIC_MONITOR_PORT_TABLE_NAME "FABRIC_PORT_TABLE"
#define APP_FABRIC_MONITOR_DATA_TABLE_NAME "FABRIC_MONITOR_TABLE"

/* orchagent heart beat message interval */
#define HEART_BEAT_INTERVAL_MSECS 10 * 1000

extern sai_switch_api_t* sai_switch_api;
extern sai_object_id_t gSwitchId;
extern string gMySwitchType;
Expand Down Expand Up @@ -827,7 +824,7 @@ void OrchDaemon::logRotate() {
}


void OrchDaemon::start()
void OrchDaemon::start(long heartBeatInterval)
{
SWSS_LOG_ENTER();

Expand All @@ -848,7 +845,7 @@ void OrchDaemon::start()
ret = m_select->select(&s, SELECT_TIMEOUT);

auto tend = std::chrono::high_resolution_clock::now();
heartBeat(tend);
heartBeat(tend, heartBeatInterval);

auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(tend - tstart);

Expand Down Expand Up @@ -925,7 +922,7 @@ void OrchDaemon::start()
flush();

SWSS_LOG_WARN("Orchagent is frozen for warm restart!");
freezeAndHeartBeat(UINT_MAX);
freezeAndHeartBeat(UINT_MAX, heartBeatInterval);
}
}
}
Expand Down Expand Up @@ -1089,25 +1086,31 @@ void OrchDaemon::addOrchList(Orch *o)
m_orchList.push_back(o);
}

// Writes a heartbeat marker to stdout once at least `interval` milliseconds
// have elapsed between m_lastHeartBeat and `tcurrent`, then records `tcurrent`
// as the new m_lastHeartBeat. An `interval` of 0 turns the feature off.
// NOTE(review): this diff view lists each pre-change line directly above the
// line that replaces it (signature and threshold comparison).
void OrchDaemon::heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent)
void OrchDaemon::heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent, long interval)
{
if (interval == 0)
{
// disable heart beat feature when interval is 0
return;
}

// milliseconds elapsed since the previous heartbeat was emitted
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(tcurrent - m_lastHeartBeat);
if (diff.count() >= HEART_BEAT_INTERVAL_MSECS)
if (diff.count() >= interval)
{
m_lastHeartBeat = tcurrent;
// output heart beat message to supervisord with 'PROCESS_COMMUNICATION_STDOUT' event: http://supervisord.org/events.html
cout << "<!--XSUPERVISOR:BEGIN-->heartbeat<!--XSUPERVISOR:END-->" << endl;
}
}

void OrchDaemon::freezeAndHeartBeat(unsigned int duration)
void OrchDaemon::freezeAndHeartBeat(unsigned int duration, long interval)
{
while (duration > 0)
{
// Send heartbeat message to prevent Orchagent stuck alert.
auto tend = std::chrono::high_resolution_clock::now();
heartBeat(tend);
heartBeat(tend, interval);

duration--;
sleep(1);
Expand Down
6 changes: 3 additions & 3 deletions orchagent/orchdaemon.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class OrchDaemon
~OrchDaemon();

virtual bool init();
void start();
void start(long heartBeatInterval);
bool warmRestoreAndSyncUp();
void getTaskToSync(vector<string> &ts);
bool warmRestoreValidation();
Expand Down Expand Up @@ -102,9 +102,9 @@ class OrchDaemon

void flush();

void heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent);
void heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent, long interval);

void freezeAndHeartBeat(unsigned int duration);
void freezeAndHeartBeat(unsigned int duration, long interval);
};

class FabricOrchDaemon : public OrchDaemon
Expand Down
34 changes: 34 additions & 0 deletions tests/test_zmq.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,37 @@ def test_vrf(self, dvs):
dvs.runcmd("cp /usr/bin/orchagent.sh_vrf_ut_backup /usr/bin/orchagent.sh")
dvs.stop_swss()
dvs.start_swss()

def test_heartbeat(self, dvs):
# Improve test code coverage, change orchagent to disable heartbeat
dvs.runcmd("cp /usr/bin/orchagent.sh /usr/bin/orchagent.sh_hb_ut_backup")
dvs.runcmd("sed -i.bak 's/\/usr\/bin\/orchagent /\/usr\/bin\/orchagent -I 0 /g' /usr/bin/orchagent.sh")
dvs.stop_swss()
dvs.start_swss()

# wait orchagent start
time.sleep(3)
process_statue = dvs.runcmd("ps -ef")
zmq_logger.debug("Process status: {}".format(process_statue))

# revert change
dvs.runcmd("cp /usr/bin/orchagent.sh_hb_ut_backup /usr/bin/orchagent.sh")
dvs.stop_swss()
dvs.start_swss()

def test_usage(self, dvs):
# Improve test code coverage, change orchagent to display usage
dvs.runcmd("cp /usr/bin/orchagent.sh /usr/bin/orchagent.sh_usage_ut_backup")
dvs.runcmd("sed -i.bak 's/\/usr\/bin\/orchagent /\/usr\/bin\/orchagent -h /g' /usr/bin/orchagent.sh")
dvs.stop_swss()
dvs.start_swss()

# wait orchagent start
time.sleep(3)
process_statue = dvs.runcmd("ps -ef")
zmq_logger.debug("Process status: {}".format(process_statue))

# revert change
dvs.runcmd("cp /usr/bin/orchagent.sh_usage_ut_backup /usr/bin/orchagent.sh")
dvs.stop_swss()
dvs.start_swss()
Loading