diff options
Diffstat (limited to 'erts/etc/common/heart.c')
-rw-r--r-- | erts/etc/common/heart.c | 325 |
1 files changed, 170 insertions, 155 deletions
diff --git a/erts/etc/common/heart.c b/erts/etc/common/heart.c index 755e308219..81d797dc7e 100644 --- a/erts/etc/common/heart.c +++ b/erts/etc/common/heart.c @@ -66,8 +66,7 @@ * input and output file descriptors (0 and 1). These descriptors * (and the standard error descriptor 2) must NOT be closed * explicitely by this program at termination (in UNIX it is - * taken care of by the operating system itself; in VxWorks - * it is taken care of by the spawn driver part of the Emulator). + * taken care of by the operating system itself). * * END OF FILE * @@ -75,12 +74,6 @@ * that there is no process at the other end of the connection * having the connection open for writing (end-of-file). * - * HARDWARE WATCHDOG - * - * When used with VxWorks(with CPU40), the hardware - * watchdog is enabled, making sure that the system reboots - * even if the heart port program malfunctions or the system - * is completely overloaded. */ #ifdef HAVE_CONFIG_H @@ -93,18 +86,12 @@ #include <fcntl.h> #include <process.h> #endif -#ifdef VXWORKS -#include "sys.h" -#endif /* * Implement time correction using times() call even on Linuxes * that can simulate gethrtime with clock_gettime, no use implementing * a phony gethrtime in this file as the time questions are so infrequent. */ -#if defined(CORRET_USING_TIMES) || defined(GETHRTIME_WITH_CLOCK_GETTIME) -# define HEART_CORRECT_USING_TIMES 1 -#endif #include <stdio.h> #include <stddef.h> @@ -116,31 +103,20 @@ #include <time.h> #include <errno.h> -#ifdef VXWORKS -# include <vxWorks.h> -# include <ioLib.h> -# include <selectLib.h> -# include <netinet/in.h> -# include <rebootLib.h> -# include <sysLib.h> -# include <taskLib.h> -# include <wdLib.h> -# include <taskHookLib.h> -# include <selectLib.h> -#endif -#if !defined(__WIN32__) && !defined(VXWORKS) +#if !defined(__WIN32__) # include <sys/types.h> # include <netinet/in.h> # include <sys/time.h> # include <unistd.h> # include <signal.h> -# if defined(HEART_CORRECT_USING_TIMES) +# if defined(CORRECT_USING_TIMES) # include <sys/times.h> # include <limits.h> # endif #endif -#define HEART_COMMAND_ENV "HEART_COMMAND" +#define HEART_COMMAND_ENV "HEART_COMMAND" +#define ERL_CRASH_DUMP_SECONDS_ENV "ERL_CRASH_DUMP_SECONDS" #define MSG_HDR_SIZE 2 #define MSG_HDR_PLUS_OP_SIZE 3 @@ -156,13 +132,14 @@ struct msg { }; /* operations */ -#define HEART_ACK 1 -#define HEART_BEAT 2 -#define SHUT_DOWN 3 -#define SET_CMD 4 -#define CLEAR_CMD 5 -#define GET_CMD 6 -#define HEART_CMD 7 +#define HEART_ACK (1) +#define HEART_BEAT (2) +#define SHUT_DOWN (3) +#define SET_CMD (4) +#define CLEAR_CMD (5) +#define GET_CMD (6) +#define HEART_CMD (7) +#define PREPARING_CRASH (8) /* Maybe interesting to change */ @@ -190,10 +167,11 @@ unsigned long heart_beat_kill_pid = 0; #define SOL_WD_TIMEOUT (heart_beat_timeout+heart_beat_boot_delay) /* reasons for reboot */ -#define R_TIMEOUT 1 -#define R_CLOSED 2 -#define R_ERROR 3 -#define R_SHUT_DOWN 4 +#define R_TIMEOUT (1) +#define R_CLOSED (2) +#define R_ERROR (3) +#define R_SHUT_DOWN (4) +#define R_CRASHING (5) /* Doing a crash dump and we will wait for it */ /* macros */ @@ -203,8 +181,8 @@ unsigned long heart_beat_kill_pid = 0; /* prototypes */ -static int message_loop(int,int); -static void do_terminate(int); +static int message_loop(int, int); +static void do_terminate(int, int); static int notify_ack(int); static int heart_cmd_reply(int, char *); static int write_message(int, struct msg *); @@ -215,6 +193,7 @@ static void print_error(const char *,...); static void debugf(const char *,...); static void init_timestamp(void); static time_t timestamp(time_t *); +static int wait_until_close_write_or_env_tmo(int); #ifdef __WIN32__ static BOOL enable_privilege(void); @@ -353,12 +332,14 @@ static void get_arguments(int argc, char** argv) { debugf("arguments -ht %d -wt %d -pid %lu\n",h,w,p); } -int -main(int argc, char **argv) -{ +int main(int argc, char **argv) { + + if (is_env_set("HEART_DEBUG")) { + fprintf(stderr, "heart: debug is ON!\r\n"); + debug_on = 1; + } + get_arguments(argc,argv); - if (is_env_set("HEART_DEBUG")) - debug_on=1; #ifdef __WIN32__ if (debug_on) { if(!is_env_set("ERLSRV_SERVICE_NAME")) { @@ -379,7 +360,7 @@ main(int argc, char **argv) program_name[sizeof(program_name)-1] = '\0'; notify_ack(erlout_fd); cmd[0] = '\0'; - do_terminate(message_loop(erlin_fd,erlout_fd)); + do_terminate(erlin_fd,message_loop(erlin_fd,erlout_fd)); return 0; } @@ -413,6 +394,7 @@ message_loop(erlin_fd, erlout_fd) #endif while (1) { + /* REFACTOR: below to select/tmo function */ #ifdef __WIN32__ wresult = WaitForSingleObject(hevent_dataready,SELECT_TIMEOUT*1000+ 2); if (wresult == WAIT_FAILED) { @@ -447,7 +429,8 @@ message_loop(erlin_fd, erlout_fd) */ timestamp(&now); if (now > last_received + heart_beat_timeout) { - print_error("heart-beat time-out."); + print_error("heart-beat time-out, no activity for %lu seconds", + (unsigned long) (now - last_received)); return R_TIMEOUT; } /* @@ -506,6 +489,10 @@ message_loop(erlin_fd, erlout_fd) free_env_val(env); } break; + case PREPARING_CRASH: + /* Erlang has reached a crushdump point (is crashing for sure) */ + print_error("Erlang is crashing .. (waiting for crash dump file)"); + return R_CRASHING; default: /* ignore all other messages */ break; @@ -550,8 +537,7 @@ kill_old_erlang(void){ CloseHandle(erlh); } } -#elif !defined(VXWORKS) -/* Unix eh? */ +#else static void kill_old_erlang(void){ pid_t pid; @@ -570,7 +556,7 @@ kill_old_erlang(void){ } } } -#endif /* Not on VxWorks */ +#endif #ifdef __WIN32__ void win_system(char *command) @@ -637,72 +623,130 @@ void win_system(char *command) * do_terminate */ static void -do_terminate(reason) - int reason; -{ +do_terminate(int erlin_fd, int reason) { /* When we get here, we have HEART_BEAT_BOOT_DELAY secs to finish (plus heart_beat_report_delay if under VxWorks), so we don't need to call wd_reset(). */ - + int ret = 0, tmo=0; + char *tmo_env; + switch (reason) { case R_SHUT_DOWN: break; + case R_CRASHING: + if (is_env_set(ERL_CRASH_DUMP_SECONDS_ENV)) { + tmo_env = get_env(ERL_CRASH_DUMP_SECONDS_ENV); + tmo = atoi(tmo_env); + print_error("Waiting for dump - timeout set to %d seconds.", tmo); + wait_until_close_write_or_env_tmo(tmo); + free_env_val(tmo_env); + } + /* fall through */ case R_TIMEOUT: - case R_ERROR: case R_CLOSED: + case R_ERROR: default: -#if defined(__WIN32__) /* Not VxWorks */ { - if(!cmd[0]) { - char *command = get_env(HEART_COMMAND_ENV); - if(!command) - print_error("Would reboot. Terminating."); - else { - kill_old_erlang(); - /* High prio combined with system() works badly indeed... */ - SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); - win_system(command); - print_error("Executed \"%s\". Terminating.",command); +#if defined(__WIN32__) /* Not VxWorks */ + if(!cmd[0]) { + char *command = get_env(HEART_COMMAND_ENV); + if(!command) + print_error("Would reboot. Terminating."); + else { + kill_old_erlang(); + /* High prio combined with system() works badly indeed... */ + SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); + win_system(command); + print_error("Executed \"%s\". Terminating.",command); + } + free_env_val(command); + } else { + kill_old_erlang(); + /* High prio combined with system() works badly indeed... */ + SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); + win_system(&cmd[0]); + print_error("Executed \"%s\". Terminating.",cmd); } - free_env_val(command); - } - else { - kill_old_erlang(); - /* High prio combined with system() works badly indeed... */ - SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); - win_system(&cmd[0]); - print_error("Executed \"%s\". Terminating.",cmd); - } - } - #else - { - if(!cmd[0]) { - char *command = get_env(HEART_COMMAND_ENV); - if(!command) - print_error("Would reboot. Terminating."); - else { - kill_old_erlang(); - /* suppress gcc warning with 'if' */ - if(system(command)); - print_error("Executed \"%s\". Terminating.",command); + if(!cmd[0]) { + char *command = get_env(HEART_COMMAND_ENV); + if(!command) + print_error("Would reboot. Terminating."); + else { + kill_old_erlang(); + /* suppress gcc warning with 'if' */ + ret = system(command); + print_error("Executed \"%s\" -> %d. Terminating.",command, ret); + } + free_env_val(command); + } else { + kill_old_erlang(); + /* suppress gcc warning with 'if' */ + ret = system((char*)&cmd[0]); + print_error("Executed \"%s\" -> %d. Terminating.",cmd, ret); } - free_env_val(command); - } - else { - kill_old_erlang(); - /* suppress gcc warning with 'if' */ - if(system((char*)&cmd[0])); - print_error("Executed \"%s\". Terminating.",cmd); - } +#endif } break; -#endif } /* switch(reason) */ } + +/* Waits until something happens on socket or handle + * + * Uses global variables erlin_fd or hevent_dataready + */ +int wait_until_close_write_or_env_tmo(int tmo) { + int i = 0; + +#ifdef __WIN32__ + DWORD wresult; + DWORD wtmo = INFINITE; + + if (tmo >= 0) { + wtmo = tmo*1000 + 2; + } + + wresult = WaitForSingleObject(hevent_dataready, wtmo); + if (wresult == WAIT_FAILED) { + print_last_error(); + return -1; + } + + if (wresult == WAIT_TIMEOUT) { + debugf("wait timed out\n"); + i = 0; + } else { + debugf("wait ok\n"); + i = 1; + } +#else + fd_set read_fds; + int max_fd; + struct timeval timeout; + struct timeval *tptr = NULL; + + max_fd = erlin_fd; /* global */ + + if (tmo >= 0) { + timeout.tv_sec = tmo; /* On Linux timeout is modified by select */ + timeout.tv_usec = 0; + tptr = &timeout; + } + + FD_ZERO(&read_fds); + FD_SET(erlin_fd, &read_fds); + if ((i = select(max_fd + 1, &read_fds, NULLFDS, NULLFDS, tptr)) < 0) { + print_error("error in select."); + return -1; + } +#endif + return i; +} + + /* * notify_ack * @@ -893,12 +937,13 @@ debugf(const char *format,...) { va_list args; - if (!debug_on) return; - va_start(args, format); - fprintf(stderr, "Heart: "); - vfprintf(stderr, format, args); - va_end(args); - fprintf(stderr, "\r\n"); + if (debug_on) { + va_start(args, format); + fprintf(stderr, "Heart: "); + vfprintf(stderr, format, args); + va_end(args); + fprintf(stderr, "\r\n"); + } } #ifdef __WIN32__ @@ -1026,60 +1071,30 @@ time_t timestamp(time_t *res) return r; } -#elif defined(VXWORKS) - -static WDOG_ID watchdog_id; -static volatile unsigned elapsed; -static WIND_TCB *this_task; -/* A simple variable is enough to lock the time update, as the - watchdog is run at interrupt level and never preempted. */ -static volatile int lock_time; - -static void my_delete_hook(WIND_TCB *tcb) -{ - if (tcb == this_task) { - wdDelete(watchdog_id); - watchdog_id = NULL; - taskDeleteHookDelete((FUNCPTR) &my_delete_hook); - } -} +#elif defined(HAVE_GETHRTIME) || defined(GETHRTIME_WITH_CLOCK_GETTIME) -static void my_wd_routine(int count) -{ - if (watchdog_id != NULL) { - ++count; - if (!lock_time) { - elapsed += count; - count = 0; - } - wdStart(watchdog_id, sysClkRateGet(), - (FUNCPTR) &my_wd_routine, count); - } -} +#if defined(GETHRTIME_WITH_CLOCK_GETTIME) +typedef long long SysHrTime; -void init_timestamp(void) -{ - lock_time = 0; - elapsed = 0; - watchdog_id = wdCreate(); - this_task = (WIND_TCB *) taskIdSelf(); - taskDeleteHookAdd((FUNCPTR) &my_delete_hook); - wdStart(watchdog_id, sysClkRateGet(), - (FUNCPTR) &my_wd_routine, 0); -} +SysHrTime sys_gethrtime(void); -time_t timestamp(time_t *res) +SysHrTime sys_gethrtime(void) { - time_t r; - ++lock_time; - r = (time_t) elapsed; - --lock_time; - if (res != NULL) - *res = r; - return r; + struct timespec ts; + long long result; + if (clock_gettime(CLOCK_MONOTONIC,&ts) != 0) { + print_error("Fatal, could not get clock_monotonic value, terminating! " + "errno = %d\n", errno); + exit(1); + } + result = ((long long) ts.tv_sec) * 1000000000LL + + ((long long) ts.tv_nsec); + return (SysHrTime) result; } - -#elif defined(HAVE_GETHRTIME) +#else +typedef hrtime_t SysHrTime; +#define sys_gethrtime() gethrtime() +#endif void init_timestamp(void) { @@ -1087,14 +1102,14 @@ void init_timestamp(void) time_t timestamp(time_t *res) { - hrtime_t ht = gethrtime(); + SysHrTime ht = sys_gethrtime(); time_t r = (time_t) (ht / 1000000000); if (res != NULL) *res = r; return r; } -#elif defined(HEART_CORRECT_USING_TIMES) +#elif defined(CORRECT_USING_TIMES) # ifdef NO_SYSCONF # include <sys/param.h> |