From 6aa87d58b756ef65650ee793ad4ece8add7b70fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Tue, 2 Oct 2012 18:44:03 +0200 Subject: erts, heart: Ensure erl_crash.dump is written When a crash dump is about to be written and we have heartbeat enabled on a system. We need time to write it before heart explicitly kills the beam. --- erts/emulator/beam/atom.names | 1 + erts/emulator/beam/global.h | 3 + erts/emulator/beam/io.c | 21 +++++++ erts/emulator/sys/unix/sys.c | 46 ++++++++++++---- erts/emulator/sys/win32/sys.c | 16 +++++- erts/etc/common/heart.c | 124 ++++++++++++++++++++++-------------------- lib/kernel/src/heart.erl | 3 + 7 files changed, 141 insertions(+), 73 deletions(-) diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index 106fad030b..afcbd732df 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -252,6 +252,7 @@ atom heap_block_size atom heap_size atom heap_sizes atom heap_type +atom heart_port atom heir atom hidden atom hide diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index dbf95f5bd7..2c20e3da3b 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -983,6 +983,9 @@ Uint erts_port_ioq_size(Port *pp); void erts_stale_drv_select(Eterm, ErlDrvEvent, int, int); void erts_port_cleanup(Port *); void erts_fire_port_monitor(Port *prt, Eterm ref); + +Port *erts_get_heart_port(void); + #ifdef ERTS_SMP void erts_smp_xports_unlock(Port *); #endif diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c index 35b194f927..502ded4eca 100644 --- a/erts/emulator/beam/io.c +++ b/erts/emulator/beam/io.c @@ -5235,3 +5235,24 @@ erl_drv_getenv(char *key, char *value, size_t *value_size) { return erts_sys_getenv_raw(key, value, value_size); } + +/* get heart_port + * used by erl_crash_dump + * - uses the fact that heart_port is registered when starting heart + */ + +Port *erts_get_heart_port() { + + Port* port; + Uint ix; + + for(ix = 0; ix < erts_max_ports; ix++) { + port = &erts_port[ix]; + /* immediate compare */ + if (port->reg && port->reg->name == am_heart_port) { + return port; + } + } + + return NULL; +} diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c index c1fa00b4ea..37dfcb1dd4 100644 --- a/erts/emulator/sys/unix/sys.c +++ b/erts/emulator/sys/unix/sys.c @@ -58,7 +58,6 @@ #define __DARWIN__ 1 #endif - #ifdef USE_THREADS #include "erl_threads.h" #endif @@ -71,7 +70,6 @@ static erts_smp_rwmtx_t environ_rwmtx; #define MAX_VSIZE 16 /* Max number of entries allowed in an I/O * vector sock_sendv(). */ - /* * Don't need global.h, but bif_table.h (included by bif.h), * won't compile otherwise @@ -123,6 +121,15 @@ struct ErtsSysReportExit_ { #endif }; +/* This data is shared by these drivers - initialized by spawn_init() */ +static struct driver_data { + int port_num, ofd, packet_bytes; + ErtsSysReportExit *report_exit; + int pid; + int alive; + int status; +} *driver_data; /* indexed by fd */ + static ErtsSysReportExit *report_exit_list; #if CHLDWTHR && !defined(ERTS_SMP) static ErtsSysReportExit *report_exit_transit_list; @@ -685,12 +692,33 @@ prepare_crash_dump(void) int i, max; char env[21]; /* enough to hold any 64-bit integer */ size_t envsz; + Port *heart_port; + Eterm heap[3]; + Eterm *hp = heap; + Eterm list = NIL; + + int heart_fd[2] = {-1,-1}; if (ERTS_PREPARED_CRASH_DUMP) return; /* We have already been called */ + heart_port = erts_get_heart_port(); + if (heart_port) { + /* hearts input fd + * We "know" drv_data is the in_fd since the port is started with read|write + */ + heart_fd[0] = (int)heart_port->drv_data; + heart_fd[1] = (int)driver_data[heart_fd[0]].ofd; + + list = CONS(hp, make_small(8), list); hp += 2; + + /* send to heart port, CMD = 8, i.e. prepare crash dump =o */ + erts_write_to_port(NIL, heart_port, list); + } + /* Make sure we unregister at epmd (unknown fd) and get at least one free filedescriptor (for erl_crash.dump) */ + max = max_files; if (max < 1024) max = 1024; @@ -704,6 +732,10 @@ prepare_crash_dump(void) if (i == async_fd[0] || i == async_fd[1]) continue; #endif + /* We don't want to close our heart yet ... */ + if (i == heart_fd[0] || i == heart_fd[1]) + continue; + close(i); } @@ -725,7 +757,6 @@ prepare_crash_dump(void) sec = (unsigned) i != 0 ? 0 : atoi(env); alarm(sec); } - } void @@ -1021,15 +1052,6 @@ void fini_getenv_state(GETENV_STATE *state) #define ERTS_SYS_READ_BUF_SZ (64*1024) -/* This data is shared by these drivers - initialized by spawn_init() */ -static struct driver_data { - int port_num, ofd, packet_bytes; - ErtsSysReportExit *report_exit; - int pid; - int alive; - int status; -} *driver_data; /* indexed by fd */ - /* Driver interfaces */ static ErlDrvData spawn_start(ErlDrvPort, char*, SysDriverOpts*); static ErlDrvData fd_start(ErlDrvPort, char*, SysDriverOpts*); diff --git a/erts/emulator/sys/win32/sys.c b/erts/emulator/sys/win32/sys.c index 6894d682e7..c5664d8e8a 100755 --- a/erts/emulator/sys/win32/sys.c +++ b/erts/emulator/sys/win32/sys.c @@ -258,8 +258,22 @@ void erl_sys_args(int* argc, char** argv) void erts_sys_prepare_crash_dump(void) { + Port *heart_port; + Eterm heap[3]; + Eterm *hp = heap; + Eterm list = NIL; + + heart_port = erts_get_heart_port(); + + if (heart_port) { + + list = CONS(hp, make_small(8), list); hp += 2; + + /* send to heart port, CMD = 8, i.e. prepare crash dump =o */ + erts_write_to_port(NIL, heart_port, list); + } + /* Windows - free file descriptors are hopefully available */ - return; } static void diff --git a/erts/etc/common/heart.c b/erts/etc/common/heart.c index 70c2b3bb23..c6ce017d8c 100644 --- a/erts/etc/common/heart.c +++ b/erts/etc/common/heart.c @@ -153,13 +153,14 @@ struct msg { }; /* operations */ -#define HEART_ACK 1 -#define HEART_BEAT 2 -#define SHUT_DOWN 3 -#define SET_CMD 4 -#define CLEAR_CMD 5 -#define GET_CMD 6 -#define HEART_CMD 7 +#define HEART_ACK (1) +#define HEART_BEAT (2) +#define SHUT_DOWN (3) +#define SET_CMD (4) +#define CLEAR_CMD (5) +#define GET_CMD (6) +#define HEART_CMD (7) +#define PREPARING_CRASH (8) /* Maybe interesting to change */ @@ -187,10 +188,11 @@ unsigned long heart_beat_kill_pid = 0; #define SOL_WD_TIMEOUT (heart_beat_timeout+heart_beat_boot_delay) /* reasons for reboot */ -#define R_TIMEOUT 1 -#define R_CLOSED 2 -#define R_ERROR 3 -#define R_SHUT_DOWN 4 +#define R_TIMEOUT (1) +#define R_CLOSED (2) +#define R_ERROR (3) +#define R_SHUT_DOWN (4) +#define R_CRASHING (5) /* Doing a crash dump and we will wait for it */ /* macros */ @@ -200,8 +202,8 @@ unsigned long heart_beat_kill_pid = 0; /* prototypes */ -static int message_loop(int,int); -static void do_terminate(int); +static int message_loop(int, int); +static void do_terminate(int, int); static int notify_ack(int); static int heart_cmd_reply(int, char *); static int write_message(int, struct msg *); @@ -376,7 +378,7 @@ main(int argc, char **argv) program_name[sizeof(program_name)-1] = '\0'; notify_ack(erlout_fd); cmd[0] = '\0'; - do_terminate(message_loop(erlin_fd,erlout_fd)); + do_terminate(erlin_fd,message_loop(erlin_fd,erlout_fd)); return 0; } @@ -410,6 +412,7 @@ message_loop(erlin_fd, erlout_fd) #endif while (1) { + /* REFACTOR: below to select/tmo function */ #ifdef __WIN32__ wresult = WaitForSingleObject(hevent_dataready,SELECT_TIMEOUT*1000+ 2); if (wresult == WAIT_FAILED) { @@ -504,6 +507,10 @@ message_loop(erlin_fd, erlout_fd) free_env_val(env); } break; + case PREPARING_CRASH: + /* Erlang has reached a crushdump point (is crashing for sure) */ + print_error("Erlang is crashing .. (waiting for crash dump file)"); + return R_CRASHING; default: /* ignore all other messages */ break; @@ -635,69 +642,66 @@ void win_system(char *command) * do_terminate */ static void -do_terminate(reason) - int reason; -{ +do_terminate(int erlin_fd, int reason) { /* When we get here, we have HEART_BEAT_BOOT_DELAY secs to finish (plus heart_beat_report_delay if under VxWorks), so we don't need to call wd_reset(). */ - + struct msg message; + switch (reason) { case R_SHUT_DOWN: break; + case R_CRASHING: + print_error("Waiting for dump"); + read_message(erlin_fd, &message); /* read until we get something */ case R_TIMEOUT: - case R_ERROR: case R_CLOSED: + case R_ERROR: default: -#if defined(__WIN32__) /* Not VxWorks */ { - if(!cmd[0]) { - char *command = get_env(HEART_COMMAND_ENV); - if(!command) - print_error("Would reboot. Terminating."); - else { - kill_old_erlang(); - /* High prio combined with system() works badly indeed... */ - SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); - win_system(command); - print_error("Executed \"%s\". Terminating.",command); +#if defined(__WIN32__) /* Not VxWorks */ + if(!cmd[0]) { + char *command = get_env(HEART_COMMAND_ENV); + if(!command) + print_error("Would reboot. Terminating."); + else { + kill_old_erlang(); + /* High prio combined with system() works badly indeed... */ + SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); + win_system(command); + print_error("Executed \"%s\". Terminating.",command); + } + free_env_val(command); + } else { + kill_old_erlang(); + /* High prio combined with system() works badly indeed... */ + SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); + win_system(&cmd[0]); + print_error("Executed \"%s\". Terminating.",cmd); } - free_env_val(command); - } - else { - kill_old_erlang(); - /* High prio combined with system() works badly indeed... */ - SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); - win_system(&cmd[0]); - print_error("Executed \"%s\". Terminating.",cmd); - } - } - #else - { - if(!cmd[0]) { - char *command = get_env(HEART_COMMAND_ENV); - if(!command) - print_error("Would reboot. Terminating."); - else { - kill_old_erlang(); - /* suppress gcc warning with 'if' */ - if(system(command)); - print_error("Executed \"%s\". Terminating.",command); + if(!cmd[0]) { + char *command = get_env(HEART_COMMAND_ENV); + if(!command) + print_error("Would reboot. Terminating."); + else { + kill_old_erlang(); + /* suppress gcc warning with 'if' */ + if(system(command)); + print_error("Executed \"%s\". Terminating.",command); + } + free_env_val(command); + } else { + kill_old_erlang(); + /* suppress gcc warning with 'if' */ + if(system((char*)&cmd[0])); + print_error("Executed \"%s\". Terminating.",cmd); } - free_env_val(command); - } - else { - kill_old_erlang(); - /* suppress gcc warning with 'if' */ - if(system((char*)&cmd[0])); - print_error("Executed \"%s\". Terminating.",cmd); - } +#endif } break; -#endif } /* switch(reason) */ } diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl index 28452a377e..de287bfa43 100644 --- a/lib/kernel/src/heart.erl +++ b/lib/kernel/src/heart.erl @@ -42,6 +42,7 @@ -define(CLEAR_CMD, 5). -define(GET_CMD, 6). -define(HEART_CMD, 7). +-define(PREPARING_CRASH, 8). % Used in beam vm -define(TIMEOUT, 5000). -define(CYCLE_TIMEOUT, 10000). @@ -130,6 +131,8 @@ start_portprogram() -> Port when is_port(Port) -> case wait_ack(Port) of ok -> + %% register port so the vm can find it if need be + register(heart_port, Port), {ok, Port}; {error, Reason} -> report_problem({{port_problem, Reason}, -- cgit v1.2.3