diff options
Diffstat (limited to 'erts/etc/common/heart.c')
-rw-r--r-- | erts/etc/common/heart.c | 116 |
1 files changed, 52 insertions, 64 deletions
diff --git a/erts/etc/common/heart.c b/erts/etc/common/heart.c index 2830641802..bc353e384e 100644 --- a/erts/etc/common/heart.c +++ b/erts/etc/common/heart.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1996-2013. All Rights Reserved. + * Copyright Ericsson AB 1996-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -47,13 +48,10 @@ * * HEART_BEATING * - * This program expects a heart beat messages. If it does not receive a - * heart beat message from Erlang within heart_beat_timeout seconds, it - * reboots the system. The variable heart_beat_timeout is exported (so - * that it can be set from the shell in VxWorks, as is the variable - * heart_beat_report_delay). When using Solaris, the system is rebooted - * by executing the command stored in the environment variable - * HEART_COMMAND. + * This program expects a heart beat message. If it does not receive a + * heart beat message from Erlang within heart_beat_timeout seconds, it + * reboots the system. The system is rebooted by executing the command + * stored in the environment variable HEART_COMMAND. * * BLOCKING DESCRIPTORS * @@ -109,7 +107,7 @@ # include <sys/time.h> # include <unistd.h> # include <signal.h> -# if defined(CORRECT_USING_TIMES) +# if defined(OS_MONOTONIC_TIME_USING_TIMES) # include <sys/times.h> # include <limits.h> # endif @@ -117,11 +115,14 @@ #define HEART_COMMAND_ENV "HEART_COMMAND" #define ERL_CRASH_DUMP_SECONDS_ENV "ERL_CRASH_DUMP_SECONDS" +#define HEART_KILL_SIGNAL "HEART_KILL_SIGNAL" +#define HEART_NO_KILL "HEART_NO_KILL" + -#define MSG_HDR_SIZE 2 -#define MSG_HDR_PLUS_OP_SIZE 3 -#define MSG_BODY_SIZE 2048 -#define MSG_TOTAL_SIZE 2050 +#define MSG_HDR_SIZE (2) +#define MSG_HDR_PLUS_OP_SIZE (3) +#define MSG_BODY_SIZE (2048) +#define MSG_TOTAL_SIZE (2050) unsigned char cmd[MSG_BODY_SIZE]; @@ -145,27 +146,17 @@ struct msg { /* Maybe interesting to change */ /* Times in seconds */ -#define HEART_BEAT_BOOT_DELAY 60 /* 1 minute */ #define SELECT_TIMEOUT 5 /* Every 5 seconds we reset the watchdog timer */ /* heart_beat_timeout is the maximum gap in seconds between two - consecutive heart beat messages from Erlang, and HEART_BEAT_BOOT_DELAY - is the the extra delay that wd_keeper allows for, to give heart a - chance to reboot in the "normal" way before the hardware watchdog - enters the scene. heart_beat_report_delay is the time allowed for reporting - before rebooting under VxWorks. */ + consecutive heart beat messages from Erlang. */ int heart_beat_timeout = 60; -int heart_beat_report_delay = 30; -int heart_beat_boot_delay = HEART_BEAT_BOOT_DELAY; /* All current platforms have a process identifier that fits in an unsigned long and where 0 is an impossible or invalid value */ unsigned long heart_beat_kill_pid = 0; -#define VW_WD_TIMEOUT (heart_beat_timeout+heart_beat_report_delay+heart_beat_boot_delay) -#define SOL_WD_TIMEOUT (heart_beat_timeout+heart_beat_boot_delay) - /* reasons for reboot */ #define R_TIMEOUT (1) #define R_CLOSED (2) @@ -293,7 +284,6 @@ free_env_val(char *value) static void get_arguments(int argc, char** argv) { int i = 1; int h; - int w; unsigned long p; while (i < argc) { @@ -309,15 +299,6 @@ static void get_arguments(int argc, char** argv) { i++; } break; - case 'w': - if (strcmp(argv[i], "-wt") == 0) - if (sscanf(argv[i+1],"%i",&w) ==1) - if ((w > 10) && (w <= 65535)) { - heart_beat_boot_delay = w; - fprintf(stderr,"heart_beat_boot_delay = %d\n",w); - i++; - } - break; case 'p': if (strcmp(argv[i], "-pid") == 0) if (sscanf(argv[i+1],"%lu",&p) ==1){ @@ -343,7 +324,7 @@ static void get_arguments(int argc, char** argv) { } i++; } - debugf("arguments -ht %d -wt %d -pid %lu\n",h,w,p); + debugf("arguments -ht %d -pid %lu\n",h,p); } int main(int argc, char **argv) { @@ -470,10 +451,6 @@ message_loop(erlin_fd, erlout_fd) switch (mp->op) { case HEART_BEAT: timestamp(&last_received); -#ifdef USE_WATCHDOG - /* reset the hardware watchdog timer */ - wd_reset(); -#endif break; case SHUT_DOWN: return R_SHUT_DOWN; @@ -526,6 +503,12 @@ static void kill_old_erlang(void){ HANDLE erlh; DWORD exit_code; + char* envvar = NULL; + + envvar = get_env(HEART_NO_KILL); + if (envvar && strcmp(envvar, "TRUE") == 0) + return; + if(heart_beat_kill_pid != 0){ if((erlh = OpenProcess(PROCESS_TERMINATE | SYNCHRONIZE | @@ -555,14 +538,26 @@ kill_old_erlang(void){ static void kill_old_erlang(void){ pid_t pid; - int i; - int res; + int i, res; + int sig = SIGKILL; + char *envvar = NULL; + + envvar = get_env(HEART_NO_KILL); + if (envvar && strcmp(envvar, "TRUE") == 0) + return; + + envvar = get_env(HEART_KILL_SIGNAL); + if (envvar && strcmp(envvar, "SIGABRT") == 0) { + print_error("kill signal SIGABRT requested"); + sig = SIGABRT; + } + if(heart_beat_kill_pid != 0){ pid = (pid_t) heart_beat_kill_pid; - res = kill(pid,SIGKILL); + res = kill(pid,sig); for(i=0; i < 5 && res == 0; ++i){ sleep(1); - res = kill(pid,SIGKILL); + res = kill(pid,sig); } if(errno != ESRCH){ print_error("Unable to kill old process, " @@ -656,11 +651,6 @@ void win_system(char *command) */ static void do_terminate(int erlin_fd, int reason) { - /* - When we get here, we have HEART_BEAT_BOOT_DELAY secs to finish - (plus heart_beat_report_delay if under VxWorks), so we don't need - to call wd_reset(). - */ int ret = 0, tmo=0; char *tmo_env; @@ -708,14 +698,12 @@ do_terminate(int erlin_fd, int reason) { print_error("Would reboot. Terminating."); else { kill_old_erlang(); - /* suppress gcc warning with 'if' */ ret = system(command); print_error("Executed \"%s\" -> %d. Terminating.",command, ret); } free_env_val(command); } else { kill_old_erlang(); - /* suppress gcc warning with 'if' */ ret = system((char*)&cmd[0]); print_error("Executed \"%s\" -> %d. Terminating.",cmd, ret); } @@ -1084,9 +1072,9 @@ time_t timestamp(time_t *res) return r; } -#elif defined(HAVE_GETHRTIME) || defined(GETHRTIME_WITH_CLOCK_GETTIME) +#elif defined(OS_MONOTONIC_TIME_USING_GETHRTIME) || defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) -#if defined(GETHRTIME_WITH_CLOCK_GETTIME) +#if defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) typedef long long SysHrTime; SysHrTime sys_gethrtime(void); @@ -1095,7 +1083,7 @@ SysHrTime sys_gethrtime(void) { struct timespec ts; long long result; - if (clock_gettime(CLOCK_MONOTONIC,&ts) != 0) { + if (clock_gettime(MONOTONIC_CLOCK_ID,&ts) != 0) { print_error("Fatal, could not get clock_monotonic value, terminating! " "errno = %d\n", errno); exit(1); @@ -1122,7 +1110,7 @@ time_t timestamp(time_t *res) return r; } -#elif defined(CORRECT_USING_TIMES) +#elif defined(OS_MONOTONIC_TIME_USING_TIMES) # ifdef NO_SYSCONF # include <sys/param.h> |