diff options
Diffstat (limited to 'erts/emulator/sys/unix')
-rw-r--r-- | erts/emulator/sys/unix/erl_child_setup.c | 427 | ||||
-rw-r--r-- | erts/emulator/sys/unix/erl_unix_sys.h | 49 | ||||
-rw-r--r-- | erts/emulator/sys/unix/sys.c | 33 | ||||
-rw-r--r-- | erts/emulator/sys/unix/sys_drivers.c | 1771 | ||||
-rw-r--r-- | erts/emulator/sys/unix/sys_uds.c | 125 | ||||
-rw-r--r-- | erts/emulator/sys/unix/sys_uds.h | 47 |
6 files changed, 1360 insertions, 1092 deletions
diff --git a/erts/emulator/sys/unix/erl_child_setup.c b/erts/emulator/sys/unix/erl_child_setup.c index a3c5c20641..71c7948bf0 100644 --- a/erts/emulator/sys/unix/erl_child_setup.c +++ b/erts/emulator/sys/unix/erl_child_setup.c @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2002-2009. All Rights Reserved. + * Copyright Ericsson AB 2002-2015. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,109 +19,195 @@ */ /* - * After a vfork() (or fork()) the child exec()s to this program which - * sets up the child and exec()s to the user program (see spawn_start() - * in sys.c and ticket OTP-4389). + * This program is started at erts startup and all fork's that + * have to be done are done in here. This is done for a couple + * of reasons: + * - Allow usage of fork without a memory explosion. + * -- we do not want to use vfork, as it blocks the VM + * until the execv is done, and if the program that + * is to be executed is on an NFS that is unavailable, + * the execv can block for a very long time. + * -- we cannot do fork inside the VM as that would temporarily + * duplicate the memory usage of the VM per parallel exec. + * + * Some implementation notes: + * - A single Unix Domain Socket is setup in between the VM and + * this program. Over that UDS the file descriptors that should + * be used to talk to the child program are sent. + * The actual command to execute, together with options and the + * environment, is sent over the pipe represented by the + * file descriptors mentioned above. We don't send the + * command over the UDS as that would increase the likely hood + * that it's buffer would be full. + * + * - Since it is this program that execv's, it has to take care of + * all the SIGCHLD signals that the child programs generate. The + * signals are received and the pid+exit reason is sent as data + * on the UDS to the VM. The VM is then able to map the pid to the + * port of the child program that just exited and deliver the status + * code if requested. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif -#define NEED_CHILD_SETUP_DEFINES -#include "sys.h" -#include "erl_misc_utils.h" +#include <stdlib.h> +#include <stdio.h> +#include <sys/wait.h> -#ifdef SIG_SIGSET /* Old SysV */ -void sys_sigrelease(int sig) -{ - sigrelse(sig); -} -#else /* !SIG_SIGSET */ -#ifdef SIG_SIGNAL /* Old BSD */ -sys_sigrelease(int sig) +#include "erl_driver.h" +#include "sys_uds.h" + +#define SET_CLOEXEC(fd) fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC) + +#if defined(__ANDROID__) +#define SHELL "/system/bin/sh" +#else +#define SHELL "/bin/sh" +#endif /* __ANDROID__ */ + +//#define HARD_DEBUG +#ifdef HARD_DEBUG +#define DEBUG_PRINT(fmt, ...) fprintf(stderr, fmt "\r\n", ##__VA_ARGS__) +#else +#define DEBUG_PRINT(fmt, ...) +#endif + +#define ABORT(fmt, ...) do { \ + fprintf(stderr, "erl_child_setup: " fmt "\r\n", ##__VA_ARGS__); \ + abort(); \ + } while(0) + +#undef ASSERT +#ifdef DEBUG +#define ASSERT(cnd) do { if (!(cnd)) { ABORT("assertion %s failed", #cnd); } } while(0) +#else +#define ASSERT(cnd) +#endif + +void sys_sigblock(int sig) { - sigsetmask(sigblock(0) & ~sigmask(sig)); + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, (sigset_t *)NULL); } -#else /* !SIG_SIGNAL */ /* The True Way - POSIX!:-) */ + void sys_sigrelease(int sig) { sigset_t mask; - sigemptyset(&mask); sigaddset(&mask, sig); sigprocmask(SIG_UNBLOCK, &mask, (sigset_t *)NULL); } -#endif /* !SIG_SIGNAL */ -#endif /* !SIG_SIGSET */ -#if defined(__ANDROID__) -static int system_properties_fd(void); -#endif /* __ANDROID__ */ +static int max_files = -1; +static int sigchld_pipe[2]; -#if defined(__ANDROID__) -#define SHELL "/system/bin/sh" -#else -#define SHELL "/bin/sh" -#endif /* __ANDROID__ */ +static int +start_new_child(int pipes[]) +{ + int size, res, i; + char *buff, *o_buff; + char *cmd, *wd, **new_environ, **args = NULL, cbuff[1]; -int -main(int argc, char *argv[]) -{ - int i, from, to; - int erts_spawn_executable = 0; - - /* OBSERVE! - * Keep child setup after fork() (implemented in sys.c) up to date - * if changes are made here. - */ - - if (argc != CS_ARGV_NO_OF_ARGS) { - if (argc < CS_ARGV_NO_OF_ARGS) { - return 1; - } else { - erts_spawn_executable = 1; - } + Sint cnt, flags; + + /* only child executes here */ + + if ((res = read(pipes[0], (char*)&size, sizeof(size))) < 0) { + ABORT("Failed to read size from %d (%d)", pipes[0], errno); } - if (strcmp("false", argv[CS_ARGV_UNBIND_IX]) != 0) - if (erts_unbind_from_cpu_str(argv[CS_ARGV_UNBIND_IX]) != 0) - return 1; - - for (i = 0; i < CS_ARGV_NO_OF_DUP2_OPS; i++) { - if (argv[CS_ARGV_DUP2_OP_IX(i)][0] == '-' - && argv[CS_ARGV_DUP2_OP_IX(i)][1] == '\0') - break; - if (sscanf(argv[CS_ARGV_DUP2_OP_IX(i)], "%d:%d", &from, &to) != 2) - return 1; - if (dup2(from, to) < 0) - return 1; + buff = malloc(size); + + DEBUG_PRINT("size = %d", size); + + if ((res = read(pipes[0], buff, size)) != size) { + ABORT("Failed to read %d bytes from %d (%d,%d)", + size, pipes[0], res, errno); } - if (sscanf(argv[CS_ARGV_FD_CR_IX], "%d:%d", &from, &to) != 2) - return 1; + o_buff = buff; -#if defined(HAVE_CLOSEFROM) - closefrom(from); -#elif defined(__ANDROID__) - if (from <= to) { - int spfd = system_properties_fd(); - for (i = from; i <= to; i++) { - if (i != spfd) { - (void) close(i); - } - } + flags = *(int*)buff; + buff += sizeof(int); + + DEBUG_PRINT("flags = %d", flags); + + cmd = buff; + buff += strlen(buff) + 1; + if (*buff == '\0') { + wd = NULL; + } else { + wd = buff; + buff += strlen(buff) + 1; + } + buff++; + + DEBUG_PRINT("wd = %s", wd); + + cnt = *(int*)buff; + buff += sizeof(int); + new_environ = malloc(sizeof(char*)*(cnt + 1)); + + for (i = 0; i < cnt; i++, buff++) { + new_environ[i] = buff; + while(*buff != '\0') buff++; + } + new_environ[cnt] = NULL; + + if (o_buff + size != buff) { + /* This is a spawn executable call */ + cnt = *(int*)buff; + buff += sizeof(int); + args = malloc(sizeof(char*)*(cnt + 1)); + for (i = 0; i < cnt; i++, buff++) { + args[i] = buff; + while(*buff != '\0') buff++; + } + args[cnt] = NULL; + } + + if (o_buff + size != buff) { + ABORT("Buff error: %p, %p:%p", o_buff, o_buff+size, buff); + } + + if (read(pipes[0], cbuff, 1) < 1) + goto child_error; + + DEBUG_PRINT("Do that forking business: '%s'\n",cmd); + + /* When the dup2'ing below is done, only + fd's 0, 1, 2 and maybe 3, 4 should survive the + exec. All other fds (i.e. the unix domain sockets + and stray pipe ends) should have CLOEXEC set on them + so they will be closed when the exec happens */ + if (flags & FORKER_FLAG_USE_STDIO) { + /* stdin for process */ + if (flags & FORKER_FLAG_DO_WRITE && + dup2(pipes[0], 0) < 0) + goto child_error; + /* stdout for process */ + if (flags & FORKER_FLAG_DO_READ && + dup2(pipes[1], 1) < 0) + goto child_error; } -#else /* !__ANDROID__ */ - for (i = from; i <= to; i++) { - (void) close(i); + else { /* XXX will fail if pipes[0] == 4 (unlikely..) */ + if (flags & FORKER_FLAG_DO_READ && dup2(pipes[1], 4) < 0) + goto child_error; + if (flags & FORKER_FLAG_DO_WRITE && dup2(pipes[0], 3) < 0) + goto child_error; } -#endif /* HAVE_CLOSEFROM */ - if (!(argv[CS_ARGV_WD_IX][0] == '.' && argv[CS_ARGV_WD_IX][1] == '\0') - && chdir(argv[CS_ARGV_WD_IX]) < 0) - return 1; + if (dup2(pipes[2], 2) < 0) + goto child_error; + + if (wd && chdir(wd) < 0) + goto child_error; #if defined(USE_SETPGRP_NOARGS) /* SysV */ (void) setpgrp(); @@ -131,34 +217,197 @@ main(int argc, char *argv[]) (void) setsid(); #endif + close(pipes[0]); + close(pipes[1]); + close(pipes[2]); + sys_sigrelease(SIGCHLD); - sys_sigrelease(SIGINT); - sys_sigrelease(SIGUSR1); - - if (erts_spawn_executable) { - if (argv[CS_ARGV_NO_OF_ARGS + 1] == NULL) { - execl(argv[CS_ARGV_NO_OF_ARGS],argv[CS_ARGV_NO_OF_ARGS], - (char *) NULL); - } else { - execv(argv[CS_ARGV_NO_OF_ARGS],&(argv[CS_ARGV_NO_OF_ARGS + 1])); - } + + if (args) { + /* spawn_executable */ + execve(cmd, args, new_environ); } else { - execl(SHELL, "sh", "-c", argv[CS_ARGV_CMD_IX], (char *) NULL); + execle(SHELL, "sh", "-c", cmd, (char *) NULL, new_environ); } - return 1; +child_error: + _exit(1); +} + + +/* + * [OTP-3906] + * Solaris signal management gets confused when threads are used and a + * lot of child processes dies. The confusion results in that SIGCHLD + * signals aren't delivered to the emulator which in turn results in + * a lot of defunct processes in the system. + * + * The problem seems to appear when a signal is frequently + * blocked/unblocked at the same time as the signal is frequently + * propagated. The child waiter thread is a workaround for this problem. + * The SIGCHLD signal is always blocked (in all threads), and the child + * waiter thread fetches the signal by a call to sigwait(). See + * child_waiter(). + * + * This should be a non-issue since the fork:ing was moved outside of + * the emulator into erl_child_setup. I'm leaving the comment here + * for posterity. */ + +static void handle_sigchld(int sig) { + int buff[2], res; + + sys_sigblock(SIGCHLD); + + while ((buff[0] = waitpid((pid_t)(-1), buff+1, WNOHANG)) > 0) { + if ((res = write(sigchld_pipe[1], buff, sizeof(buff))) <= 0) + ABORT("Failed to write to sigchld_pipe (%d): %d (%d)", sigchld_pipe[1], res, errno); + DEBUG_PRINT("Reap child %d (%d)", buff[0], buff[1]); + } + + sys_sigrelease(SIGCHLD); } #if defined(__ANDROID__) static int system_properties_fd(void) { - int fd; + static int fd = -2; char *env; + if (fd != -2) return fd; env = getenv("ANDROID_PROPERTY_WORKSPACE"); if (!env) { + fd = -1; return -1; } fd = atoi(env); return fd; } #endif /* __ANDROID__ */ + +int +main(int argc, char *argv[]) +{ + /* This fd should be open from beam */ + int uds_fd = 3, max_fd = 3; +#ifndef HAVE_CLOSEFROM + int i; +#endif + struct sigaction sa; + + if (argc < 1 || sscanf(argv[1],"%d",&max_files) != 1) { + ABORT("Invalid arguments to child_setup"); + } + +/* We close all fds except the uds from beam. + All other fds from now on will have the + CLOEXEC flags set on them. This means that we + only have to close a very limited number of fds + after we fork before the exec. */ +#if defined(HAVE_CLOSEFROM) + closefrom(4); +#else + for (i = 4; i < max_files; i++) +#if defined(__ANDROID__) + if (i != system_properties_fd()) +#endif + (void) close(i); +#endif + + if (pipe(sigchld_pipe) < 0) { + ABORT("Failed to setup sigchld pipe (%d)", errno); + } + + SET_CLOEXEC(sigchld_pipe[0]); + SET_CLOEXEC(sigchld_pipe[1]); + + max_fd = max_fd < sigchld_pipe[0] ? sigchld_pipe[0] : max_fd; + + sa.sa_handler = &handle_sigchld; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_RESTART | SA_NOCLDSTOP; + if (sigaction(SIGCHLD, &sa, 0) == -1) { + perror(0); + exit(1); + } + + SET_CLOEXEC(uds_fd); + + DEBUG_PRINT("Starting forker %d", max_files); + + while (1) { + fd_set read_fds; + int res; + FD_ZERO(&read_fds); + FD_SET(uds_fd, &read_fds); + FD_SET(sigchld_pipe[0], &read_fds); + DEBUG_PRINT("child_setup selecting on %d, %d (%d)", + uds_fd, sigchld_pipe[0], max_fd); + res = select(max_fd+1, &read_fds, NULL, NULL, NULL); + + if (res < 0) { + if (errno == EINTR) continue; + ABORT("Select failed: %d (%d)",res, errno); + } + + if (FD_ISSET(uds_fd, &read_fds)) { + int pipes[3], res, os_pid; + char buff[256]; + errno = 0; + if ((res = sys_uds_read(uds_fd, buff, 1, pipes, 3, MSG_DONTWAIT)) < 0) { + if (errno == EINTR) + continue; + DEBUG_PRINT("erl_child_setup failed to read from uds: %d, %d", res, errno); + _exit(0); + } + + if (res == 0) { + DEBUG_PRINT("uds was closed!"); + _exit(0); + } + /* Since we use unix domain sockets and send the entire data in + one go we *should* get the entire payload at once. */ + ASSERT(res == 1); + ASSERT(buff[0] == 'S'); + + sys_sigblock(SIGCHLD); + + errno = 0; + + os_pid = fork(); + if (os_pid == 0) + start_new_child(pipes); + + /* We write an ack here, but expect the reply on + the pipes[0] inside the fork */ + res = sprintf(buff,"GO:%010d:%010d", os_pid, errno); + if (write(pipes[1], buff, res + 1)) + ; /* remove gcc warning */ + + sys_sigrelease(SIGCHLD); + close(pipes[0]); + close(pipes[1]); + close(pipes[2]); + } + + if (FD_ISSET(sigchld_pipe[0], &read_fds)) { + int ibuff[2]; + char buff[256]; + res = read(sigchld_pipe[0], ibuff, sizeof(ibuff)); + if (res < 0) { + if (errno == EINTR) + continue; + ABORT("Failed to read from sigchld pipe: %d (%d)", res, errno); + } + res = snprintf(buff, 256, "SIGCHLD:%010d:%010d", ibuff[0], ibuff[1]); + DEBUG_PRINT("send %s to %d", buff, uds_fd); + if (write(uds_fd, buff, res + 1) < 0) { + if (errno == EINTR) + continue; + /* The uds was close, which most likely means that the VM + has exited. This will be detected when we try to read + from the uds_fd. */ + DEBUG_PRINT("Failed to write to uds: %d (%d)", uds_fd, errno); + } + } + } + return 1; +} diff --git a/erts/emulator/sys/unix/erl_unix_sys.h b/erts/emulator/sys/unix/erl_unix_sys.h index a11a44c259..c46f2c1fa2 100644 --- a/erts/emulator/sys/unix/erl_unix_sys.h +++ b/erts/emulator/sys/unix/erl_unix_sys.h @@ -30,9 +30,7 @@ #include <limits.h> #include <stdlib.h> #include <string.h> -#ifndef QNX #include <memory.h> -#endif #if defined(__sun__) && defined(__SVR4) && !defined(__EXTENSIONS__) # define __EXTENSIONS__ @@ -92,11 +90,6 @@ #include <ieeefp.h> #endif -#ifdef QNX -#include <process.h> -#include <sys/qnx_glob.h> -#endif - #include <pwd.h> #ifndef HZ @@ -136,13 +129,6 @@ # define ERTS_POLL_NEED_ASYNC_INTERRUPT_SUPPORT #endif -#ifndef ENABLE_CHILD_WAITER_THREAD -# ifdef ERTS_SMP -# define ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN -void erts_check_children(void); -# endif -#endif - typedef void *GETENV_STATE; /* @@ -310,7 +296,6 @@ typedef void (*SIGFUNC)(int); extern SIGFUNC sys_signal(int, SIGFUNC); extern void sys_sigrelease(int); extern void sys_sigblock(int); -extern void sys_stop_cat(void); /* * Handling of floating point exceptions. @@ -425,18 +410,14 @@ void erts_sys_unblock_fpe(int); #define ERTS_FP_ERROR_THOROUGH(p, f, A) __ERTS_FP_ERROR_THOROUGH(&(p)->fp_exception, f, A) -#ifdef NEED_CHILD_SETUP_DEFINES -/* The child setup argv[] */ -#define CS_ARGV_PROGNAME_IX 0 /* Program name */ -#define CS_ARGV_UNBIND_IX 1 /* Unbind from cpu */ -#define CS_ARGV_WD_IX 2 /* Working directory */ -#define CS_ARGV_CMD_IX 3 /* Command */ -#define CS_ARGV_FD_CR_IX 4 /* Fd close range */ -#define CS_ARGV_DUP2_OP_IX(N) ((N) + 5) /* dup2 operations */ +#define FORKER_ARGV_NO_OF_ARGS 3 +#define FORKER_ARGV_PROGNAME_IX 0 /* Program name */ +#define FORKER_ARGV_MAX_FILES 1 /* max_files */ -#define CS_ARGV_NO_OF_DUP2_OPS 3 /* Number of dup2 ops */ -#define CS_ARGV_NO_OF_ARGS 8 /* Number of arguments */ -#endif /* #ifdef NEED_CHILD_SETUP_DEFINES */ +#define FORKER_FLAG_USE_STDIO (1 << 0) /* dup the pipe to stdin/stderr */ +#define FORKER_FLAG_EXIT_STATUS (1 << 1) /* send the exit status to parent */ +#define FORKER_FLAG_DO_READ (1 << 2) /* dup write fd */ +#define FORKER_FLAG_DO_WRITE (1 << 3) /* dup read fd */ /* Threads */ #ifdef USE_THREADS @@ -470,20 +451,4 @@ extern jmp_buf erts_sys_sigsegv_jmp; } while(0) #endif -#ifdef USE_THREADS -# ifdef ENABLE_CHILD_WAITER_THREAD -# define CHLDWTHR ENABLE_CHILD_WAITER_THREAD -# else -# define CHLDWTHR 0 -# endif -# define FDBLOCK 1 -#else -# define CHLDWTHR 0 -# define FDBLOCK 0 -#endif - -#define ERTS_SYS_SUSPEND_SIGNAL SIGUSR2 - -int check_children(void); - #endif /* #ifndef _ERL_UNIX_SYS_H */ diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c index 503ef5c2f6..2ad5f3b4d5 100644 --- a/erts/emulator/sys/unix/sys.c +++ b/erts/emulator/sys/unix/sys.c @@ -49,7 +49,6 @@ #include <sys/ioctl.h> #endif -#define NEED_CHILD_SETUP_DEFINES #define ERTS_WANT_BREAK_HANDLING #define ERTS_WANT_GOT_SIGUSR1 #define WANT_NONBLOCKING /* must define this to pull in defs from sys.h */ @@ -93,6 +92,7 @@ extern void erts_sys_init_float(void); extern void erl_crash_dump(char* file, int line, char* fmt, ...); + #ifdef DEBUG static int debug_log = 0; #endif @@ -122,6 +122,7 @@ static void smp_sig_notify(char c); static int sig_notify_fds[2] = {-1, -1}; static int sig_suspend_fds[2] = {-1, -1}; +#define ERTS_SYS_SUSPEND_SIGNAL SIGUSR2 #endif @@ -307,9 +308,10 @@ MALLOC_USE_HASH(1); #ifdef USE_THREADS #ifdef ERTS_THR_HAVE_SIG_FUNCS + /* * Child thread inherits parents signal mask at creation. In order to - * guarantee that the main thread will receive all SIGINT, SIGCHLD, and + * guarantee that the main thread will receive all SIGINT, and * SIGUSR1 signals sent to the process, we block these signals in the * parent thread when creating a new thread. */ @@ -405,7 +407,6 @@ erts_sys_pre_init(void) #ifdef ERTS_THR_HAVE_SIG_FUNCS sigemptyset(&thr_create_sigmask); sigaddset(&thr_create_sigmask, SIGINT); /* block interrupt */ - sigaddset(&thr_create_sigmask, SIGCHLD); /* block child signals */ sigaddset(&thr_create_sigmask, SIGUSR1); /* block user defined signal */ #endif @@ -1010,7 +1011,6 @@ erts_sys_unsetenv(char *key) void sys_init_io(void) { - } #if (0) /* unused? */ @@ -1204,7 +1204,6 @@ erl_debug(char* fmt, ...) #endif /* DEBUG */ - /* * Called from schedule() when it runs out of runnable processes, * or when Erlang code has performed INPUT_REDUCTIONS reduction @@ -1213,15 +1212,11 @@ erl_debug(char* fmt, ...) void erl_sys_schedule(int runnable) { -#ifdef ERTS_SMP ERTS_CHK_IO(!runnable); -#else - ERTS_CHK_IO(runnable ? 0 : !check_children()); -#endif ERTS_SMP_LC_ASSERT(!erts_thr_progress_is_blocking()); - (void) check_children(); } + #ifdef ERTS_SMP static erts_smp_tid_t sig_dispatcher_tid; @@ -1246,10 +1241,6 @@ smp_sig_notify(char c) static void * signal_dispatcher_thread_func(void *unused) { -#if !CHLDWTHR - int initialized = 0; - int notify_check_children = 0; -#endif #ifdef ERTS_ENABLE_LOCK_CHECK erts_lc_set_thread_name("signal_dispatcher"); #endif @@ -1287,19 +1278,7 @@ signal_dispatcher_thread_func(void *unused) */ switch (buf[i]) { case 0: /* Emulator initialized */ -#if !CHLDWTHR - initialized = 1; - if (!notify_check_children) -#endif - break; -#if !CHLDWTHR - case 'C': /* SIGCHLD */ - if (initialized) - erts_smp_notify_check_children_needed(); - else - notify_check_children = 1; - break; -#endif + break; case 'I': /* SIGINT */ break_requested(); break; diff --git a/erts/emulator/sys/unix/sys_drivers.c b/erts/emulator/sys/unix/sys_drivers.c index e3f089fc0f..8402197924 100644 --- a/erts/emulator/sys/unix/sys_drivers.c +++ b/erts/emulator/sys/unix/sys_drivers.c @@ -36,6 +36,7 @@ #include <ctype.h> #include <sys/utsname.h> #include <sys/select.h> +#include <arpa/inet.h> #ifdef ISC32 #include <sys/bsdtypes.h> @@ -49,26 +50,20 @@ #include <sys/ioctl.h> #endif -#define NEED_CHILD_SETUP_DEFINES #define WANT_NONBLOCKING /* must define this to pull in defs from sys.h */ #include "sys.h" -#include "erl_thr_progress.h" - -#if defined(__APPLE__) && defined(__MACH__) && !defined(__DARWIN__) -#define __DARWIN__ 1 -#endif #ifdef USE_THREADS #include "erl_threads.h" #endif -#include "erl_mseg.h" - extern char **environ; extern erts_smp_rwmtx_t environ_rwmtx; extern erts_smp_atomic_t sys_misc_mem_sz; +static Eterm forker_port; + #define MAX_VSIZE 16 /* Max number of entries allowed in an I/O * vector sock_sendv(). */ @@ -76,14 +71,10 @@ extern erts_smp_atomic_t sys_misc_mem_sz; * Don't need global.h, but erl_cpu_topology.h won't compile otherwise */ #include "global.h" - -#include "erl_sys_driver.h" -#include "erl_check_io.h" #include "erl_cpu_topology.h" -#ifndef DISABLE_VFORK -#define DISABLE_VFORK 0 -#endif +#include "erl_sys_driver.h" +#include "sys_uds.h" #if defined IOV_MAX #define MAXIOV IOV_MAX @@ -93,32 +84,11 @@ extern erts_smp_atomic_t sys_misc_mem_sz; #define MAXIOV 16 #endif -/* - * [OTP-3906] - * Solaris signal management gets confused when threads are used and a - * lot of child processes dies. The confusion results in that SIGCHLD - * signals aren't delivered to the emulator which in turn results in - * a lot of defunct processes in the system. - * - * The problem seems to appear when a signal is frequently - * blocked/unblocked at the same time as the signal is frequently - * propagated. The child waiter thread is a workaround for this problem. - * The SIGCHLD signal is always blocked (in all threads), and the child - * waiter thread fetches the signal by a call to sigwait(). See - * child_waiter(). - */ - -typedef struct ErtsSysReportExit_ ErtsSysReportExit; -struct ErtsSysReportExit_ { - ErtsSysReportExit *next; - Eterm port; - int pid; - int ifd; - int ofd; -#if CHLDWTHR && !defined(ERTS_SMP) - int status; +#ifdef USE_THREADS +# define FDBLOCK 1 +#else +# define FDBLOCK 0 #endif -}; /* Used by the fd driver iff the fd could not be set to non-blocking */ typedef struct ErtsSysBlocking_ { @@ -128,12 +98,21 @@ typedef struct ErtsSysBlocking_ { unsigned int pkey; } ErtsSysBlocking; +typedef struct fd_data { + int fd; + char pbuf[4]; /* hold partial packet bytes */ + int psz; /* size of pbuf */ + char *buf; + char *cpos; + int sz; + int remain; /* for input on fd */ +} ErtsSysFdData; -/* This data is shared by these drivers - initialized by spawn_init() */ typedef struct driver_data { ErlDrvPort port_num; - int ofd, packet_bytes; - ErtsSysReportExit *report_exit; + ErtsSysFdData *ofd; + ErtsSysFdData *ifd; + int packet_bytes; int pid; int alive; int status; @@ -141,12 +120,11 @@ typedef struct driver_data { ErtsSysBlocking *blocking; } ErtsSysDriverData; -static ErtsSysDriverData *driver_data; /* indexed by fd */ - -static ErtsSysReportExit *report_exit_list; -#if CHLDWTHR && !defined(ERTS_SMP) -static ErtsSysReportExit *report_exit_transit_list; -#endif +typedef struct exit_status { + HashBucket hb; + pid_t os_pid; + Eterm port_id; +} ErtsSysExitStatus; #define DIR_SEPARATOR_CHAR '/' @@ -156,7 +134,6 @@ static ErtsSysReportExit *report_exit_transit_list; #define SHELL "/bin/sh" #endif /* __ANDROID__ */ - #if defined(DEBUG) #define ERL_BUILD_TYPE_MARKER ".debug" #elif defined(PURIFY) @@ -171,103 +148,80 @@ static ErtsSysReportExit *report_exit_transit_list; #define ERL_BUILD_TYPE_MARKER #endif +#ifdef DEBUG +#define close(fd) do { int res = close(fd); ASSERT(res > -1); } while(0) +#endif + #define CHILD_SETUP_PROG_NAME "erl_child_setup" ERL_BUILD_TYPE_MARKER -static char *child_setup_prog; -#if CHLDWTHR || defined(ERTS_SMP) -erts_mtx_t chld_stat_mtx; -#endif -#if CHLDWTHR -static erts_tid_t child_waiter_tid; -/* chld_stat_mtx is used to protect against concurrent accesses - of the driver_data fields pid, alive, and status. */ -erts_cnd_t chld_stat_cnd; -static long children_alive; -#define CHLD_STAT_LOCK erts_mtx_lock(&chld_stat_mtx) -#define CHLD_STAT_UNLOCK erts_mtx_unlock(&chld_stat_mtx) -#define CHLD_STAT_WAIT erts_cnd_wait(&chld_stat_cnd, &chld_stat_mtx) -#define CHLD_STAT_SIGNAL erts_cnd_signal(&chld_stat_cnd) -#elif defined(ERTS_SMP) /* ------------------------------------------------- */ -#define CHLD_STAT_LOCK erts_mtx_lock(&chld_stat_mtx) -#define CHLD_STAT_UNLOCK erts_mtx_unlock(&chld_stat_mtx) - -#else /* ------------------------------------------------------------------- */ -#define CHLD_STAT_LOCK -#define CHLD_STAT_UNLOCK -static volatile int children_died; +// #define HARD_DEBUG +#ifdef HARD_DEBUG +#define driver_select(port_num, fd, flags, onoff) \ + do { \ + if (((flags) & ERL_DRV_READ) && onoff) \ + fprintf(stderr,"%010d %p: read select %d\r\n", __LINE__, port_num, (int)fd); \ + if (((flags) & ERL_DRV_WRITE) && onoff) \ + fprintf(stderr,"%010d %p: writ select %d\r\n", __LINE__, port_num, (int)fd); \ + if (((flags) & ERL_DRV_READ) && !onoff) \ + fprintf(stderr,"%010d %p: read unsele %d\r\n", __LINE__, port_num, (int)fd); \ + if (((flags) & ERL_DRV_WRITE) && !onoff) \ + fprintf(stderr,"%010d %p: writ unsele %d\r\n", __LINE__, port_num, (int)fd); \ + driver_select_nkp(port_num, fd, flags, onoff); \ + } while(0) #endif -typedef struct ErtsSysFdData { - char pbuf[4]; /* hold partial packet bytes */ - int psz; /* size of pbuf */ - char *buf; - char *cpos; - int sz; - int remain; /* for input on fd */ -} ErtsSysFdData; - -static ErtsSysFdData *fd_data; /* indexed by fd */ +/* + * Decreasing the size of it below 16384 is not allowed. + */ -/* static FUNCTION(int, write_fill, (int, char*, int)); unused? */ -static void note_child_death(int, int); +#define ERTS_SYS_READ_BUF_SZ (64*1024) -#if CHLDWTHR -static void* child_waiter(void *); -#endif +/* I. Initialization */ -static void block_signals(void) +void +erl_sys_late_init(void) { -#if !CHLDWTHR - sys_sigblock(SIGCHLD); -#endif -#ifndef ERTS_SMP - sys_sigblock(SIGINT); -#ifndef ETHR_UNUSABLE_SIGUSRX - sys_sigblock(SIGUSR1); -#endif /* #ifndef ETHR_UNUSABLE_SIGUSRX */ -#endif /* #ifndef ERTS_SMP */ - -#if defined(ERTS_SMP) && !defined(ETHR_UNUSABLE_SIGUSRX) - sys_sigblock(ERTS_SYS_SUSPEND_SIGNAL); + SysDriverOpts opts; +#ifdef ERTS_SMP + Port *port; #endif -} + sys_signal(SIGPIPE, SIG_IGN); /* Ignore - we'll handle the write failure */ + + opts.packet_bytes = 0; + opts.use_stdio = 1; + opts.redir_stderr = 0; + opts.read_write = 0; + opts.hide_window = 0; + opts.wd = NULL; + opts.envir = NULL; + opts.exit_status = 0; + opts.overlapped_io = 0; + opts.spawn_type = ERTS_SPAWN_ANY; + opts.argv = NULL; + opts.parallelism = erts_port_parallelism; -static void unblock_signals(void) -{ - /* Update erl_child_setup.c if changed */ -#if !CHLDWTHR - sys_sigrelease(SIGCHLD); +#ifdef ERTS_SMP + port = #endif -#ifndef ERTS_SMP - sys_sigrelease(SIGINT); -#ifndef ETHR_UNUSABLE_SIGUSRX - sys_sigrelease(SIGUSR1); -#endif /* #ifndef ETHR_UNUSABLE_SIGUSRX */ -#endif /* #ifndef ERTS_SMP */ - -#if defined(ERTS_SMP) && !defined(ETHR_UNUSABLE_SIGUSRX) - sys_sigrelease(ERTS_SYS_SUSPEND_SIGNAL); + erts_open_driver(&forker_driver, make_internal_pid(0), "forker", &opts, NULL, NULL); +#ifdef ERTS_SMP + erts_mtx_unlock(port->lock); #endif - } -/************************** Port I/O *******************************/ +/* II. Prototypes */ +/* II.I Spawn prototypes */ +static ErlDrvData spawn_start(ErlDrvPort, char*, SysDriverOpts*); +static ErlDrvSSizeT spawn_control(ErlDrvData, unsigned int, char *, + ErlDrvSizeT, char **, ErlDrvSizeT); +/* II.II Vanilla prototypes */ +static ErlDrvData vanilla_start(ErlDrvPort, char*, SysDriverOpts*); -/* I. Common stuff */ - -/* - * Decreasing the size of it below 16384 is not allowed. - */ - -/* II. The spawn/fd/vanilla drivers */ - -#define ERTS_SYS_READ_BUF_SZ (64*1024) -/* Driver interfaces */ -static ErlDrvData spawn_start(ErlDrvPort, char*, SysDriverOpts*); +/* II.III FD prototypes */ static ErlDrvData fd_start(ErlDrvPort, char*, SysDriverOpts*); #if FDBLOCK static void fd_async(void *); @@ -275,10 +229,10 @@ static void fd_ready_async(ErlDrvData drv_data, ErlDrvThreadData thread_data); #endif static ErlDrvSSizeT fd_control(ErlDrvData, unsigned int, char *, ErlDrvSizeT, char **, ErlDrvSizeT); -static ErlDrvData vanilla_start(ErlDrvPort, char*, SysDriverOpts*); -static int spawn_init(void); static void fd_stop(ErlDrvData); static void fd_flush(ErlDrvData); + +/* II.IV Common prototypes */ static void stop(ErlDrvData); static void ready_input(ErlDrvData, ErlDrvEvent); static void ready_output(ErlDrvData, ErlDrvEvent); @@ -286,8 +240,22 @@ static void output(ErlDrvData, char*, ErlDrvSizeT); static void outputv(ErlDrvData, ErlIOVec*); static void stop_select(ErlDrvEvent, void*); +/* II.V Forker prototypes */ +static int forker_init(void); +static ErlDrvData forker_start(ErlDrvPort, char*, SysDriverOpts*); +static void forker_stop(ErlDrvData); +static void forker_ready_input(ErlDrvData, ErlDrvEvent); +static void forker_ready_output(ErlDrvData, ErlDrvEvent); +static ErlDrvSSizeT forker_control(ErlDrvData, unsigned int, char *, + ErlDrvSizeT, char **, ErlDrvSizeT); +static void forker_add_os_pid_mapping(ErtsSysDriverData *); +static void forker_remove_os_pid_mapping(ErtsSysDriverData *); + +/* III Driver entries */ + +/* III.I The spawn driver */ struct erl_drv_entry spawn_driver_entry = { - spawn_init, + forker_init, spawn_start, stop, output, @@ -296,7 +264,7 @@ struct erl_drv_entry spawn_driver_entry = { "spawn", NULL, NULL, - NULL, + spawn_control, NULL, NULL, NULL, @@ -306,17 +274,19 @@ struct erl_drv_entry spawn_driver_entry = { ERL_DRV_EXTENDED_MARKER, ERL_DRV_EXTENDED_MAJOR_VERSION, ERL_DRV_EXTENDED_MINOR_VERSION, - ERL_DRV_FLAG_USE_PORT_LOCKING, + ERL_DRV_FLAG_USE_PORT_LOCKING | ERL_DRV_FLAG_USE_INIT_ACK, NULL, NULL, stop_select }; + +/* III.II The fd driver */ struct erl_drv_entry fd_driver_entry = { NULL, fd_start, fd_stop, output, ready_input, - ready_output, + ready_output, "fd", NULL, NULL, @@ -339,6 +309,8 @@ struct erl_drv_entry fd_driver_entry = { NULL, /* process_exit */ stop_select }; + +/* III.III The vanilla driver */ struct erl_drv_entry vanilla_driver_entry = { NULL, vanilla_start, @@ -365,22 +337,33 @@ struct erl_drv_entry vanilla_driver_entry = { stop_select }; -/* Handle SIGCHLD signals. */ -#if (defined(SIG_SIGSET) || defined(SIG_SIGNAL)) -static RETSIGTYPE onchld(void) -#else -static RETSIGTYPE onchld(int signum) -#endif -{ -#if CHLDWTHR - ASSERT(0); /* We should *never* catch a SIGCHLD signal */ -#elif defined(ERTS_SMP) - smp_sig_notify('C'); -#else - children_died = 1; - ERTS_CHK_IO_AS_INTR(); /* Make sure we don't sleep in poll */ -#endif -} +/* III.III The forker driver */ +struct erl_drv_entry forker_driver_entry = { + NULL, + forker_start, + forker_stop, + NULL, + forker_ready_input, + forker_ready_output, + "spawn_forker", + NULL, + NULL, + forker_control, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + ERL_DRV_EXTENDED_MARKER, + ERL_DRV_EXTENDED_MAJOR_VERSION, + ERL_DRV_EXTENDED_MINOR_VERSION, + 0, + NULL, NULL, + stop_select +}; + +/* Untility functions */ static int set_blocking_data(ErtsSysDriverData *dd) { @@ -396,187 +379,94 @@ static int set_blocking_data(ErtsSysDriverData *dd) { return 1; } -static int set_driver_data(ErlDrvPort port_num, - int ifd, - int ofd, - int packet_bytes, - int read_write, - int exit_status, - int pid, - int is_blocking) +static void init_fd_data(ErtsSysFdData *fd_data, int fd) { - Port *prt; - ErtsSysReportExit *report_exit; - - if (!exit_status) - report_exit = NULL; - else { - report_exit = erts_alloc(ERTS_ALC_T_PRT_REP_EXIT, - sizeof(ErtsSysReportExit)); - report_exit->next = report_exit_list; - report_exit->port = erts_drvport2id(port_num); - report_exit->pid = pid; - report_exit->ifd = read_write & DO_READ ? ifd : -1; - report_exit->ofd = read_write & DO_WRITE ? ofd : -1; -#if CHLDWTHR && !defined(ERTS_SMP) - report_exit->status = 0; -#endif - report_exit_list = report_exit; - } - - prt = erts_drvport2port(port_num); - if (prt != ERTS_INVALID_ERL_DRV_PORT) - prt->os_pid = pid; - - if (read_write & DO_READ) { - driver_data[ifd].packet_bytes = packet_bytes; - driver_data[ifd].port_num = port_num; - driver_data[ifd].report_exit = report_exit; - driver_data[ifd].pid = pid; - driver_data[ifd].alive = 1; - driver_data[ifd].status = 0; - driver_data[ifd].terminating = 0; - driver_data[ifd].blocking = NULL; - if (read_write & DO_WRITE) { - driver_data[ifd].ofd = ofd; - if (is_blocking && FDBLOCK) - if (!set_blocking_data(driver_data+ifd)) - return -1; - if (ifd != ofd) - driver_data[ofd] = driver_data[ifd]; /* structure copy */ - } else { /* DO_READ only */ - driver_data[ifd].ofd = -1; - } - (void) driver_select(port_num, ifd, (ERL_DRV_READ|ERL_DRV_USE), 1); - return(ifd); - } else { /* DO_WRITE only */ - driver_data[ofd].packet_bytes = packet_bytes; - driver_data[ofd].port_num = port_num; - driver_data[ofd].report_exit = report_exit; - driver_data[ofd].ofd = ofd; - driver_data[ofd].pid = pid; - driver_data[ofd].alive = 1; - driver_data[ofd].status = 0; - driver_data[ofd].terminating = 0; - driver_data[ofd].blocking = NULL; - if (is_blocking && FDBLOCK) - if (!set_blocking_data(driver_data+ofd)) - return -1; - return(ofd); - } + fd_data->fd = fd; + fd_data->buf = NULL; + fd_data->cpos = NULL; + fd_data->remain = 0; + fd_data->sz = 0; + fd_data->psz = 0; } -static int spawn_init() +static ErtsSysDriverData * +create_driver_data(ErlDrvPort port_num, + int ifd, + int ofd, + int packet_bytes, + int read_write, + int exit_status, + int pid, + int is_blocking) { - int i; -#if CHLDWTHR - erts_thr_opts_t thr_opts = ERTS_THR_OPTS_DEFAULT_INITER; -#endif - int res; - char bindir[MAXPATHLEN]; - size_t bindirsz = sizeof(bindir); - Uint csp_path_sz; - -#if CHLDWTHR - thr_opts.detached = 0; - thr_opts.suggested_stack_size = 0; /* Smallest possible */ - thr_opts.name = "child_waiter"; -#endif - -#if !DISABLE_VFORK - res = erts_sys_getenv_raw("BINDIR", bindir, &bindirsz); - if (res != 0) { - if (res < 0) - erl_exit(-1, - "Environment variable BINDIR is not set\n"); - if (res > 0) - erl_exit(-1, - "Value of environment variable BINDIR is too large\n"); - } - if (bindir[0] != DIR_SEPARATOR_CHAR) - erl_exit(-1, - "Environment variable BINDIR does not contain an" - " absolute path\n"); - csp_path_sz = (strlen(bindir) - + 1 /* DIR_SEPARATOR_CHAR */ - + sizeof(CHILD_SETUP_PROG_NAME) - + 1); - child_setup_prog = erts_alloc(ERTS_ALC_T_CS_PROG_PATH, csp_path_sz); - erts_smp_atomic_add_nob(&sys_misc_mem_sz, csp_path_sz); - erts_snprintf(child_setup_prog, csp_path_sz, - "%s%c%s", - bindir, - DIR_SEPARATOR_CHAR, - CHILD_SETUP_PROG_NAME); -#endif - - report_exit_list = NULL; - -#ifdef USE_THREADS - -#if CHLDWTHR || defined(ERTS_SMP) - erts_mtx_init(&chld_stat_mtx, "child_status"); -#endif -#if CHLDWTHR -#ifndef ERTS_SMP - report_exit_transit_list = NULL; -#endif - erts_cnd_init(&chld_stat_cnd); - children_alive = 0; -#endif - -#if !CHLDWTHR && !defined(ERTS_SMP) - children_died = 0; -#endif - -#endif /* USE_THREADS */ - - sys_signal(SIGPIPE, SIG_IGN); /* Ignore - we'll handle the write failure */ - driver_data = (ErtsSysDriverData *) - erts_alloc(ERTS_ALC_T_DRV_TAB, sys_max_files() * sizeof(ErtsSysDriverData)); - erts_smp_atomic_add_nob(&sys_misc_mem_sz, - sys_max_files() * sizeof(ErtsSysDriverData)); + Port *prt; + ErtsSysDriverData *driver_data; + char *data; + int size = sizeof(ErtsSysDriverData); - for (i = 0; i < sys_max_files(); i++) - driver_data[i].pid = -1; + if (read_write & DO_READ) + size += sizeof(ErtsSysFdData); - fd_data = (ErtsSysFdData *) - erts_alloc(ERTS_ALC_T_FD_TAB, sys_max_files() * sizeof(ErtsSysFdData)); - erts_smp_atomic_add_nob(&sys_misc_mem_sz, - sys_max_files() * sizeof(ErtsSysFdData)); + if ((read_write & DO_WRITE) && + ((ifd != ofd || ofd == -1) || !(read_write & DO_READ))) + size += sizeof(ErtsSysFdData); -#if CHLDWTHR - sys_sigblock(SIGCHLD); -#endif + data = erts_alloc(ERTS_ALC_T_DRV_TAB,size); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, size); - sys_signal(SIGCHLD, onchld); /* Reap children */ + driver_data = (ErtsSysDriverData*)data; + data += sizeof(*driver_data); -#if CHLDWTHR - erts_thr_create(&child_waiter_tid, child_waiter, NULL, &thr_opts); -#endif + prt = erts_drvport2port(port_num); + if (prt != ERTS_INVALID_ERL_DRV_PORT) + prt->os_pid = pid; - return 1; -} + driver_data->packet_bytes = packet_bytes; + driver_data->port_num = port_num; + driver_data->pid = pid; + driver_data->alive = exit_status ? 1 : 0; + driver_data->status = 0; + driver_data->terminating = 0; + driver_data->blocking = NULL; -static void close_pipes(int ifd[2], int ofd[2], int read_write) -{ if (read_write & DO_READ) { - (void) close(ifd[0]); - (void) close(ifd[1]); + driver_data->ifd = (ErtsSysFdData*)data; + data += sizeof(*driver_data->ifd); + init_fd_data(driver_data->ifd, ifd); + driver_select(port_num, ifd, (ERL_DRV_READ|ERL_DRV_USE), 1); + } else { + driver_data->ifd = NULL; } + if (read_write & DO_WRITE) { - (void) close(ofd[0]); - (void) close(ofd[1]); + if (ofd != -1 && ifd == ofd && read_write & DO_READ) { + /* This is for when ifd and ofd are the same fd */ + driver_data->ofd = driver_data->ifd; + } else { + driver_data->ofd = (ErtsSysFdData*)data; + data += sizeof(*driver_data->ofd); + init_fd_data(driver_data->ofd, ofd); + } + if (is_blocking && FDBLOCK) + if (!set_blocking_data(driver_data)) { + erts_free(ERTS_ALC_T_DRV_TAB, driver_data); + return NULL; + } + } else { + driver_data->ofd = NULL; } + + return driver_data; } -static void init_fd_data(int fd, ErlDrvPort port_num) +/* Spawn driver */ + +static void close_pipes(int ifd[2], int ofd[2]) { - fd_data[fd].buf = NULL; - fd_data[fd].cpos = NULL; - fd_data[fd].remain = 0; - fd_data[fd].sz = 0; - fd_data[fd].psz = 0; + close(ifd[0]); + close(ifd[1]); + close(ofd[0]); + close(ofd[1]); } static char **build_unix_environment(char *block) @@ -622,6 +512,10 @@ static char **build_unix_environment(char *block) for (j = 0; j < len; j++) { char *s, *t; + /* check if cpp[j] equals old + before the = sign, + i.e. + "TMPDIR=/tmp/" */ s = cpp[j]; t = old; while (*s == *t && *s != '=') { @@ -656,81 +550,43 @@ static char **build_unix_environment(char *block) return cpp; } -/* - [arndt] In most Unix systems, including Solaris 2.5, 'fork' allocates memory - in swap space for the child of a 'fork', whereas 'vfork' does not do this. - The natural call to use here is therefore 'vfork'. Due to a bug in - 'vfork' in Solaris 2.5 (apparently fixed in 2.6), using 'vfork' - can be dangerous in what seems to be these circumstances: - If the child code under a vfork sets the signal action to SIG_DFL - (or SIG_IGN) - for any signal which was previously set to a signal handler, the - state of the parent is clobbered, so that the later arrival of - such a signal yields a sigsegv in the parent. If the signal was - not set to a signal handler, but ignored, all seems to work. - If you change the forking code below, beware of this. - */ - -static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts) +static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, + SysDriverOpts* opts) { #define CMD_LINE_PREFIX_STR "exec " #define CMD_LINE_PREFIX_STR_SZ (sizeof(CMD_LINE_PREFIX_STR) - 1) - int ifd[2], ofd[2], len, pid, i; - char **volatile new_environ; /* volatile since a vfork() then cannot - cause 'new_environ' to be clobbered - in the parent process. */ - int saved_errno; - long res; + int len; + char **new_environ; + ErtsSysDriverData *res; char *cmd_line; -#ifndef QNX - int unbind; -#endif -#if !DISABLE_VFORK - int no_vfork; - size_t no_vfork_sz = sizeof(no_vfork); + char wd_buff[MAXPATHLEN+1]; + char *wd; + int ifd[2], ofd[2], stderrfd; + + if (pipe(ifd) < 0) return ERL_DRV_ERROR_ERRNO; + errno = EMFILE; /* default for next three conditions */ + if (ifd[0] >= sys_max_files() || pipe(ofd) < 0) { + close(ifd[0]); + close(ifd[1]); + return ERL_DRV_ERROR_ERRNO; + } + if (ofd[1] >= sys_max_files()) { + close_pipes(ifd, ofd); + errno = EMFILE; + return ERL_DRV_ERROR_ERRNO; + } - no_vfork = (erts_sys_getenv_raw("ERL_NO_VFORK", - (char *) &no_vfork, - &no_vfork_sz) >= 0); -#endif + SET_NONBLOCKING(ifd[0]); + SET_NONBLOCKING(ofd[1]); - switch (opts->read_write) { - case DO_READ: - if (pipe(ifd) < 0) - return ERL_DRV_ERROR_ERRNO; - if (ifd[0] >= sys_max_files()) { - close_pipes(ifd, ofd, opts->read_write); - errno = EMFILE; - return ERL_DRV_ERROR_ERRNO; - } - ofd[1] = -1; /* keep purify happy */ - break; - case DO_WRITE: - if (pipe(ofd) < 0) return ERL_DRV_ERROR_ERRNO; - if (ofd[1] >= sys_max_files()) { - close_pipes(ifd, ofd, opts->read_write); - errno = EMFILE; - return ERL_DRV_ERROR_ERRNO; - } - ifd[0] = -1; /* keep purify happy */ - break; - case DO_READ|DO_WRITE: - if (pipe(ifd) < 0) return ERL_DRV_ERROR_ERRNO; - errno = EMFILE; /* default for next two conditions */ - if (ifd[0] >= sys_max_files() || pipe(ofd) < 0) { - close_pipes(ifd, ofd, DO_READ); - return ERL_DRV_ERROR_ERRNO; - } - if (ofd[1] >= sys_max_files()) { - close_pipes(ifd, ofd, opts->read_write); - errno = EMFILE; - return ERL_DRV_ERROR_ERRNO; - } - break; - default: - ASSERT(0); - return ERL_DRV_ERROR_GENERAL; + stderrfd = opts->redir_stderr ? ifd[1] : dup(2); + + if (stderrfd >= sys_max_files() || stderrfd < 0) { + close_pipes(ifd, ofd); + if (stderrfd > -1) + close(stderrfd); + return ERL_DRV_ERROR_ERRNO; } if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { @@ -738,15 +594,17 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op len = strlen(name); cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP, len + 1); if (!cmd_line) { - close_pipes(ifd, ofd, opts->read_write); + close_pipes(ifd, ofd); errno = ENOMEM; return ERL_DRV_ERROR_ERRNO; } memcpy((void *) cmd_line,(void *) name, len); cmd_line[len] = '\0'; + len = len + 1; if (access(cmd_line,X_OK) != 0) { int save_errno = errno; erts_free(ERTS_ALC_T_TMP, cmd_line); + close_pipes(ifd, ofd); errno = save_errno; return ERL_DRV_ERROR_ERRNO; } @@ -756,7 +614,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP, CMD_LINE_PREFIX_STR_SZ + len + 1); if (!cmd_line) { - close_pipes(ifd, ofd, opts->read_write); + close_pipes(ifd, ofd); errno = ENOMEM; return ERL_DRV_ERROR_ERRNO; } @@ -765,6 +623,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op CMD_LINE_PREFIX_STR_SZ); memcpy((void *) (cmd_line + CMD_LINE_PREFIX_STR_SZ), (void *) name, len); cmd_line[CMD_LINE_PREFIX_STR_SZ + len] = '\0'; + len = CMD_LINE_PREFIX_STR_SZ + len + 1; } erts_smp_rwmtx_rlock(&environ_rwmtx); @@ -773,283 +632,194 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op new_environ = environ; } else if ((new_environ = build_unix_environment(opts->envir)) == NULL) { erts_smp_rwmtx_runlock(&environ_rwmtx); + close_pipes(ifd, ofd); erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); errno = ENOMEM; return ERL_DRV_ERROR_ERRNO; - } + } -#ifndef QNX - /* Block child from SIGINT and SIGUSR1. Must be before fork() - to be safe. */ - block_signals(); + if (opts->wd == NULL) { + if ((wd = getcwd(wd_buff, MAXPATHLEN+1)) == NULL) { + /* on some OSs this call opens a fd in the + background which means that this can + return EMFILE */ + int err = errno; + close_pipes(ifd, ofd); + erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); + if (new_environ != environ) + erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); + erts_smp_rwmtx_runlock(&environ_rwmtx); + errno = err; + return ERL_DRV_ERROR_ERRNO; + } + } else { + wd = opts->wd; + } - CHLD_STAT_LOCK; + { + struct iovec *io_vector; + int buffsz = 0, iov_len = 5; + char nullbuff[] = "\0"; + int j, i = 0; + Sint32 env_len = 0, argv_len = 0, + flags = (opts->use_stdio ? FORKER_FLAG_USE_STDIO : 0) + | (opts->exit_status ? FORKER_FLAG_EXIT_STATUS : 0) + | (opts->read_write & DO_READ ? FORKER_FLAG_DO_READ : 0) + | (opts->read_write & DO_WRITE ? FORKER_FLAG_DO_WRITE : 0); + + /* count number of elements in environment */ + while(new_environ[env_len] != NULL) + env_len++; + iov_len += 1 + env_len; /* num envs including size int */ + + /* count number of element in argument list */ + if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { + if (opts->argv != NULL) { + while(opts->argv[argv_len] != NULL) + argv_len++; + } else { + argv_len++; + } + iov_len += 1 + argv_len; /* num argvs including size int */ + } - unbind = erts_sched_bind_atfork_prepare(); + io_vector = erts_alloc_fnf(ERTS_ALC_T_TMP, sizeof(struct iovec) * iov_len); -#if !DISABLE_VFORK - /* See fork/vfork discussion before this function. */ - if (no_vfork) { -#endif + if (!io_vector) { + close_pipes(ifd, ofd); + erts_smp_rwmtx_runlock(&environ_rwmtx); + erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); + if (new_environ != environ) + erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); + errno = ENOMEM; + return ERL_DRV_ERROR_ERRNO; + } - DEBUGF(("Using fork\n")); - pid = fork(); - - if (pid == 0) { - /* The child! Setup child... */ - - if (erts_sched_bind_atfork_child(unbind) != 0) - goto child_error; - - /* OBSERVE! - * Keep child setup after vfork() (implemented below and in - * erl_child_setup.c) up to date if changes are made here. - */ - - if (opts->use_stdio) { - if (opts->read_write & DO_READ) { - /* stdout for process */ - if (dup2(ifd[1], 1) < 0) - goto child_error; - if(opts->redir_stderr) - /* stderr for process */ - if (dup2(ifd[1], 2) < 0) - goto child_error; - } - if (opts->read_write & DO_WRITE) - /* stdin for process */ - if (dup2(ofd[0], 0) < 0) - goto child_error; - } - else { /* XXX will fail if ofd[0] == 4 (unlikely..) */ - if (opts->read_write & DO_READ) - if (dup2(ifd[1], 4) < 0) - goto child_error; - if (opts->read_write & DO_WRITE) - if (dup2(ofd[0], 3) < 0) - goto child_error; - } + io_vector[i].iov_base = (void*)&buffsz; + io_vector[i++].iov_len = sizeof(buffsz); -#if defined(HAVE_CLOSEFROM) - closefrom(opts->use_stdio ? 3 : 5); -#else - for (i = opts->use_stdio ? 3 : 5; i < sys_max_files(); i++) - (void) close(i); -#endif + io_vector[i].iov_base = (void*)&flags; + io_vector[i++].iov_len = sizeof(flags); + buffsz += sizeof(flags); - if (opts->wd && chdir(opts->wd) < 0) - goto child_error; + io_vector[i].iov_base = cmd_line; + io_vector[i++].iov_len = len; + buffsz += len; -#if defined(USE_SETPGRP_NOARGS) /* SysV */ - (void) setpgrp(); -#elif defined(USE_SETPGRP) /* BSD */ - (void) setpgrp(0, getpid()); -#else /* POSIX */ - (void) setsid(); -#endif - - unblock_signals(); - - if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { - if (opts->argv == NULL) { - execle(cmd_line,cmd_line,(char *) NULL, new_environ); - } else { - if (opts->argv[0] == erts_default_arg0) { - opts->argv[0] = cmd_line; - } - execve(cmd_line, opts->argv, new_environ); - if (opts->argv[0] == cmd_line) { - opts->argv[0] = erts_default_arg0; - } - } - } else { - execle(SHELL, "sh", "-c", cmd_line, (char *) NULL, new_environ); - } - child_error: - _exit(1); - } -#if !DISABLE_VFORK - } -#define ENOUGH_BYTES (44) - else { /* Use vfork() */ - char **cs_argv= erts_alloc(ERTS_ALC_T_TMP,(CS_ARGV_NO_OF_ARGS + 1)* - sizeof(char *)); - char fd_close_range[ENOUGH_BYTES]; /* 44 bytes are enough to */ - char dup2_op[CS_ARGV_NO_OF_DUP2_OPS][ENOUGH_BYTES]; /* hold any "%d:%d" string */ - /* on a 64-bit machine. */ - - /* Setup argv[] for the child setup program (implemented in - erl_child_setup.c) */ - i = 0; - if (opts->use_stdio) { - if (opts->read_write & DO_READ){ - /* stdout for process */ - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 1); - if(opts->redir_stderr) - /* stderr for process */ - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 2); - } - if (opts->read_write & DO_WRITE) - /* stdin for process */ - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ofd[0], 0); - } else { /* XXX will fail if ofd[0] == 4 (unlikely..) */ - if (opts->read_write & DO_READ) - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 4); - if (opts->read_write & DO_WRITE) - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ofd[0], 3); - } - for (; i < CS_ARGV_NO_OF_DUP2_OPS; i++) - strcpy(&dup2_op[i][0], "-"); - erts_snprintf(fd_close_range, ENOUGH_BYTES, "%d:%d", opts->use_stdio ? 3 : 5, sys_max_files()-1); - - cs_argv[CS_ARGV_PROGNAME_IX] = child_setup_prog; - cs_argv[CS_ARGV_WD_IX] = opts->wd ? opts->wd : "."; - cs_argv[CS_ARGV_UNBIND_IX] = erts_sched_bind_atvfork_child(unbind); - cs_argv[CS_ARGV_FD_CR_IX] = fd_close_range; - for (i = 0; i < CS_ARGV_NO_OF_DUP2_OPS; i++) - cs_argv[CS_ARGV_DUP2_OP_IX(i)] = &dup2_op[i][0]; - - if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { - int num = 0; - int j = 0; - if (opts->argv != NULL) { - for(; opts->argv[num] != NULL; ++num) - ; - } - cs_argv = erts_realloc(ERTS_ALC_T_TMP,cs_argv, (CS_ARGV_NO_OF_ARGS + 1 + num + 1) * sizeof(char *)); - cs_argv[CS_ARGV_CMD_IX] = "-"; - cs_argv[CS_ARGV_NO_OF_ARGS] = cmd_line; - if (opts->argv != NULL) { - for (;opts->argv[j] != NULL; ++j) { - if (opts->argv[j] == erts_default_arg0) { - cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = cmd_line; - } else { - cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = opts->argv[j]; - } - } - } - cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = NULL; - } else { - cs_argv[CS_ARGV_CMD_IX] = cmd_line; /* Command */ - cs_argv[CS_ARGV_NO_OF_ARGS] = NULL; - } - DEBUGF(("Using vfork\n")); - pid = vfork(); - - if (pid == 0) { - /* The child! */ - - /* Observe! - * OTP-4389: The child setup program (implemented in - * erl_child_setup.c) will perform the necessary setup of the - * child before it execs to the user program. This because - * vfork() only allow an *immediate* execve() or _exit() in the - * child. - */ - execve(child_setup_prog, cs_argv, new_environ); - _exit(1); - } - erts_free(ERTS_ALC_T_TMP,cs_argv); - } -#undef ENOUGH_BYTES -#endif + io_vector[i].iov_base = wd; + io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1; + buffsz += io_vector[i++].iov_len; - erts_sched_bind_atfork_parent(unbind); + io_vector[i].iov_base = nullbuff; + io_vector[i++].iov_len = 1; + buffsz += io_vector[i-1].iov_len; - if (pid == -1) { - saved_errno = errno; - CHLD_STAT_UNLOCK; - erts_smp_rwmtx_runlock(&environ_rwmtx); - erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); - unblock_signals(); - close_pipes(ifd, ofd, opts->read_write); - errno = saved_errno; - return ERL_DRV_ERROR_ERRNO; - } -#else /* QNX */ - if (opts->use_stdio) { - if (opts->read_write & DO_READ) - qnx_spawn_options.iov[1] = ifd[1]; /* stdout for process */ - if (opts->read_write & DO_WRITE) - qnx_spawn_options.iov[0] = ofd[0]; /* stdin for process */ - } - else { - if (opts->read_write & DO_READ) - qnx_spawn_options.iov[4] = ifd[1]; - if (opts->read_write & DO_WRITE) - qnx_spawn_options.iov[3] = ofd[0]; - } - /* Close fds on exec */ - for (i = 3; i < sys_max_files(); i++) - fcntl(i, F_SETFD, 1); + io_vector[i].iov_base = (void*)&env_len; + io_vector[i++].iov_len = sizeof(env_len); + buffsz += io_vector[i-1].iov_len; - qnx_spawn_options.flags = _SPAWN_SETSID; - if ((pid = spawnl(P_NOWAIT, SHELL, SHELL, "-c", cmd_line, - (char *) 0)) < 0) { - erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); - reset_qnx_spawn(); - erts_smp_rwmtx_runlock(&environ_rwmtx); - close_pipes(ifd, ofd, opts->read_write); - return ERL_DRV_ERROR_GENERAL; + for (j = 0; new_environ[j] != NULL; j++) { + io_vector[i].iov_base = new_environ[j]; + io_vector[i++].iov_len = strlen(new_environ[j]) + 1; + buffsz += io_vector[i-1].iov_len; + } + + /* only append arguments if this was a spawn_executable */ + if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { + + io_vector[i].iov_base = (void*)&argv_len; + io_vector[i++].iov_len = sizeof(argv_len); + buffsz += io_vector[i-1].iov_len; + + if (opts->argv) { + /* If there are arguments we copy in the references to + them into the iov */ + for (j = 0; opts->argv[j]; j++) { + if (opts->argv[j] == erts_default_arg0) + io_vector[i].iov_base = cmd_line; + else + io_vector[i].iov_base = opts->argv[j]; + io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1; + buffsz += io_vector[i++].iov_len; + } + } else { + io_vector[i].iov_base = cmd_line; + io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1; + buffsz += io_vector[i++].iov_len; + } + } + + /* we send the request to do the fork */ + if (writev(ofd[1], io_vector, iov_len) < 0) { + int err = errno; + close_pipes(ifd, ofd); + erts_free(ERTS_ALC_T_TMP, io_vector); + if (new_environ != environ) + erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); + erts_smp_rwmtx_runlock(&environ_rwmtx); + erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); + errno = err; + return ERL_DRV_ERROR_ERRNO; + } + + erts_free(ERTS_ALC_T_TMP, io_vector); } - reset_qnx_spawn(); -#endif /* QNX */ erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); if (new_environ != environ) erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); - if (opts->read_write & DO_READ) - (void) close(ifd[1]); - if (opts->read_write & DO_WRITE) - (void) close(ofd[0]); - - if (opts->read_write & DO_READ) { - SET_NONBLOCKING(ifd[0]); - init_fd_data(ifd[0], port_num); - } - if (opts->read_write & DO_WRITE) { - SET_NONBLOCKING(ofd[1]); - init_fd_data(ofd[1], port_num); - } + erts_smp_rwmtx_runlock(&environ_rwmtx); - res = set_driver_data(port_num, ifd[0], ofd[1], opts->packet_bytes, - opts->read_write, opts->exit_status, pid, 0); - /* Don't unblock SIGCHLD until now, since the call above must - first complete putting away the info about our new subprocess. */ - unblock_signals(); + res = create_driver_data(port_num, ifd[0], ofd[1], opts->packet_bytes, + DO_WRITE | DO_READ, opts->exit_status, + 0, 0); -#if CHLDWTHR - ASSERT(children_alive >= 0); + { + /* send ofd[0] + ifd[1] + stderrfd to forker port */ + int *fds = erts_alloc(ERTS_ALC_T_DRV_CTRL_DATA, sizeof(int)*3); + memset(fds, 0, sizeof(int)*3); + fds[0] = ofd[0]; + fds[1] = ifd[1]; + fds[2] = stderrfd; + if (erl_drv_port_control(forker_port, 'S', (char*)fds, sizeof(int)*3)) { + /* The forker port has been killed, we close both fd's which will + make open_port throw an epipe error */ + close(ofd[0]); + close(ifd[1]); + } + } - if (!(children_alive++)) - CHLD_STAT_SIGNAL; /* Wake up child waiter thread if no children - was alive before we fork()ed ... */ -#endif - /* Don't unlock chld_stat_mtx until now of the same reason as above */ - CHLD_STAT_UNLOCK; + /* we set these fds to negative to mark if + they should be closed after the handshake */ + if (!(opts->read_write & DO_READ)) + res->ifd->fd *= -1; - erts_smp_rwmtx_runlock(&environ_rwmtx); + if (!(opts->read_write & DO_WRITE)) + res->ofd->fd *= -1; return (ErlDrvData)res; #undef CMD_LINE_PREFIX_STR #undef CMD_LINE_PREFIX_STR_SZ } -#ifdef QNX -static reset_qnx_spawn() +static ErlDrvSSizeT spawn_control(ErlDrvData e, unsigned int cmd, char *buf, + ErlDrvSizeT len, char **rbuf, ErlDrvSizeT rlen) { - int i; + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + + memcpy(&dd->status, buf, sizeof(dd->status)); + dd->alive = -1; + + if (dd->ifd) + driver_select(dd->port_num, abs(dd->ifd->fd), ERL_DRV_READ | ERL_DRV_USE, 1); + + if (dd->ofd) + driver_select(dd->port_num, abs(dd->ofd->fd), ERL_DRV_WRITE | ERL_DRV_USE, 1); - /* Reset qnx_spawn_options */ - qnx_spawn_options.flags = 0; - qnx_spawn_options.iov[0] = 0xff; - qnx_spawn_options.iov[1] = 0xff; - qnx_spawn_options.iov[2] = 0xff; - qnx_spawn_options.iov[3] = 0xff; + return 0; } -#endif #define FD_DEF_HEIGHT 24 #define FD_DEF_WIDTH 80 @@ -1099,7 +869,6 @@ static ErlDrvSSizeT fd_control(ErlDrvData drv_data, static ErlDrvData fd_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts) { - ErlDrvData res; int non_blocking = 0; if (((opts->read_write & DO_READ) && opts->ifd >= sys_max_files()) || @@ -1189,11 +958,8 @@ static ErlDrvData fd_start(ErlDrvPort port_num, char* name, * use erlang:halt with flush=false. */ - if (opts->read_write & DO_READ) { - init_fd_data(opts->ifd, port_num); - } + /* Try to figure out if we can use non-blocking writes */ if (opts->read_write & DO_WRITE) { - init_fd_data(opts->ofd, port_num); /* If we don't have a read end, all bets are off - no non-blocking. */ if (opts->read_write & DO_READ) { @@ -1259,60 +1025,66 @@ static ErlDrvData fd_start(ErlDrvPort port_num, char* name, } } } - CHLD_STAT_LOCK; - res = (ErlDrvData)(long)set_driver_data(port_num, opts->ifd, opts->ofd, - opts->packet_bytes, - opts->read_write, 0, -1, - !non_blocking); - CHLD_STAT_UNLOCK; - return res; + return (ErlDrvData)create_driver_data(port_num, opts->ifd, opts->ofd, + opts->packet_bytes, + opts->read_write, 0, -1, + !non_blocking); } -static void clear_fd_data(int fd) +static void clear_fd_data(ErtsSysFdData *fdd) { - if (fd_data[fd].sz > 0) { - erts_free(ERTS_ALC_T_FD_ENTRY_BUF, (void *) fd_data[fd].buf); - ASSERT(erts_smp_atomic_read_nob(&sys_misc_mem_sz) >= fd_data[fd].sz); - erts_smp_atomic_add_nob(&sys_misc_mem_sz, -1*fd_data[fd].sz); + if (fdd->sz > 0) { + erts_free(ERTS_ALC_T_FD_ENTRY_BUF, (void *) fdd->buf); + ASSERT(erts_smp_atomic_read_nob(&sys_misc_mem_sz) >= fdd->sz); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -1*fdd->sz); } - fd_data[fd].buf = NULL; - fd_data[fd].sz = 0; - fd_data[fd].remain = 0; - fd_data[fd].cpos = NULL; - fd_data[fd].psz = 0; + fdd->buf = NULL; + fdd->sz = 0; + fdd->remain = 0; + fdd->cpos = NULL; + fdd->psz = 0; } -static void nbio_stop_fd(ErlDrvPort prt, int fd) +static void nbio_stop_fd(ErlDrvPort prt, ErtsSysFdData *fdd) { - driver_select(prt,fd,DO_READ|DO_WRITE,0); - clear_fd_data(fd); - SET_BLOCKING(fd); + driver_select(prt, abs(fdd->fd), DO_READ|DO_WRITE, 0); + clear_fd_data(fdd); + SET_BLOCKING(abs(fdd->fd)); + } static void fd_stop(ErlDrvData ev) /* Does not close the fds */ { - int ofd; - int fd = (int)(long)ev; - ErlDrvPort prt = driver_data[fd].port_num; - + ErtsSysDriverData* dd = (ErtsSysDriverData*)ev; + ErlDrvPort prt = dd->port_num; + int sz = sizeof(ErtsSysDriverData); + #if FDBLOCK - if (driver_data[fd].blocking) { - erts_free(ERTS_ALC_T_SYS_BLOCKING,driver_data[fd].blocking); - driver_data[fd].blocking = NULL; - erts_smp_atomic_add_nob(&sys_misc_mem_sz, -1*sizeof(ErtsSysBlocking)); + if (dd->blocking) { + erts_free(ERTS_ALC_T_SYS_BLOCKING, dd->blocking); + dd->blocking = NULL; + sz += sizeof(ErtsSysBlocking); } #endif - nbio_stop_fd(prt, fd); - ofd = driver_data[fd].ofd; - if (ofd != fd && ofd != -1) - nbio_stop_fd(prt, ofd); + if (dd->ifd) { + sz += sizeof(ErtsSysFdData); + nbio_stop_fd(prt, dd->ifd); + } + if (dd->ofd && dd->ofd != dd->ifd) { + sz += sizeof(ErtsSysFdData); + nbio_stop_fd(prt, dd->ofd); + } + + erts_free(ERTS_ALC_T_DRV_TAB, dd); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sz); } -static void fd_flush(ErlDrvData fd) +static void fd_flush(ErlDrvData ev) { - if (!driver_data[(int)(long)fd].terminating) - driver_data[(int)(long)fd].terminating = 1; + ErtsSysDriverData* dd = (ErtsSysDriverData*)ev; + if (!dd->terminating) + dd->terminating = 1; } static ErlDrvData vanilla_start(ErlDrvPort port_num, char* name, @@ -1331,55 +1103,45 @@ static ErlDrvData vanilla_start(ErlDrvPort port_num, char* name, return ERL_DRV_ERROR_GENERAL; } SET_NONBLOCKING(fd); - init_fd_data(fd, port_num); - CHLD_STAT_LOCK; - res = (ErlDrvData)(long)set_driver_data(port_num, fd, fd, - opts->packet_bytes, - opts->read_write, 0, -1, 0); - CHLD_STAT_UNLOCK; + res = (ErlDrvData)(long)create_driver_data(port_num, fd, fd, + opts->packet_bytes, + opts->read_write, 0, -1, 0); return res; } /* Note that driver_data[fd].ifd == fd if the port was opened for reading, */ /* otherwise (i.e. write only) driver_data[fd].ofd = fd. */ -static void stop(ErlDrvData fd) +static void stop(ErlDrvData ev) { - ErlDrvPort prt; - int ofd; - - prt = driver_data[(int)(long)fd].port_num; - nbio_stop_fd(prt, (int)(long)fd); + ErtsSysDriverData* dd = (ErtsSysDriverData*)ev; + ErlDrvPort prt = dd->port_num; - ofd = driver_data[(int)(long)fd].ofd; - if (ofd != (int)(long)fd && (int)(long)ofd != -1) - nbio_stop_fd(prt, ofd); - else - ofd = -1; + if (dd->alive == 1) + /* Stop has been called before the exit status has been reported */ + forker_remove_os_pid_mapping(dd); - CHLD_STAT_LOCK; - - /* Mark as unused. */ - driver_data[(int)(long)fd].pid = -1; - - CHLD_STAT_UNLOCK; + if (dd->ifd) { + nbio_stop_fd(prt, dd->ifd); + driver_select(prt, abs(dd->ifd->fd), ERL_DRV_USE, 0); /* close(ifd); */ + } - /* SMP note: Close has to be last thing done (open file descriptors work - as locks on driver_data[] entries) */ - driver_select(prt, (int)(long)fd, ERL_DRV_USE, 0); /* close(fd); */ - if (ofd >= 0) { - driver_select(prt, (int)(long)ofd, ERL_DRV_USE, 0); /* close(ofd); */ + if (dd->ofd && dd->ofd != dd->ifd) { + nbio_stop_fd(prt, dd->ofd); + driver_select(prt, abs(dd->ofd->fd), ERL_DRV_USE, 0); /* close(ofd); */ } + + erts_free(ERTS_ALC_T_DRV_TAB, dd); } /* used by fd_driver */ static void outputv(ErlDrvData e, ErlIOVec* ev) { - int fd = (int)(long)e; - ErlDrvPort ix = driver_data[fd].port_num; - int pb = driver_data[fd].packet_bytes; - int ofd = driver_data[fd].ofd; + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + ErlDrvPort ix = dd->port_num; + int pb = dd->packet_bytes; + int ofd = dd->ofd ? dd->ofd->fd : -1; ssize_t n; ErlDrvSizeT sz; char lb[4]; @@ -1400,19 +1162,19 @@ static void outputv(ErlDrvData e, ErlIOVec* ev) ev->iov[0].iov_len = pb; ev->size += pb; - if (driver_data[fd].blocking && FDBLOCK) - driver_pdl_lock(driver_data[fd].blocking->pdl); + if (dd->blocking && FDBLOCK) + driver_pdl_lock(dd->blocking->pdl); if ((sz = driver_sizeq(ix)) > 0) { driver_enqv(ix, ev, 0); - if (driver_data[fd].blocking && FDBLOCK) - driver_pdl_unlock(driver_data[fd].blocking->pdl); + if (dd->blocking && FDBLOCK) + driver_pdl_unlock(dd->blocking->pdl); if (sz + ev->size >= (1 << 13)) set_busy_port(ix, 1); } - else if (!driver_data[fd].blocking || !FDBLOCK) { + else if (!dd->blocking || !FDBLOCK) { /* We try to write directly if the fd in non-blocking */ int vsize = ev->vsize > MAX_VSIZE ? MAX_VSIZE : ev->vsize; @@ -1433,11 +1195,11 @@ static void outputv(ErlDrvData e, ErlIOVec* ev) else { if (ev->size != 0) { driver_enqv(ix, ev, 0); - driver_pdl_unlock(driver_data[fd].blocking->pdl); - driver_async(ix, &driver_data[fd].blocking->pkey, - fd_async, driver_data+fd, NULL); + driver_pdl_unlock(dd->blocking->pdl); + driver_async(ix, &dd->blocking->pkey, + fd_async, dd, NULL); } else { - driver_pdl_unlock(driver_data[fd].blocking->pdl); + driver_pdl_unlock(dd->blocking->pdl); } } #endif @@ -1447,10 +1209,10 @@ static void outputv(ErlDrvData e, ErlIOVec* ev) /* Used by spawn_driver and vanilla driver */ static void output(ErlDrvData e, char* buf, ErlDrvSizeT len) { - int fd = (int)(long)e; - ErlDrvPort ix = driver_data[fd].port_num; - int pb = driver_data[fd].packet_bytes; - int ofd = driver_data[fd].ofd; + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + ErlDrvPort ix = dd->port_num; + int pb = dd->packet_bytes; + int ofd = dd->ofd ? dd->ofd->fd : -1; ssize_t n; ErlDrvSizeT sz; char lb[4]; @@ -1458,7 +1220,9 @@ static void output(ErlDrvData e, char* buf, ErlDrvSizeT len) struct iovec iv[2]; /* (len > ((unsigned long)-1 >> (4-pb)*8)) */ - if (((pb == 2) && (len > 0xffff)) || (pb == 1 && len > 0xff)) { + if (((pb == 2) && (len > 0xffff)) + || (pb == 1 && len > 0xff) + || dd->pid == 0 /* Attempt at output before port is ready */) { driver_failure_posix(ix, EINVAL); return; /* -1; */ } @@ -1499,62 +1263,58 @@ static void output(ErlDrvData e, char* buf, ErlDrvSizeT len) return; /* 0; */ } -static int port_inp_failure(ErlDrvPort port_num, int ready_fd, int res) +static int port_inp_failure(ErtsSysDriverData *dd, int res) /* Result: 0 (eof) or -1 (error) */ { int err = errno; ASSERT(res <= 0); - (void) driver_select(port_num, ready_fd, ERL_DRV_READ|ERL_DRV_WRITE, 0); - clear_fd_data(ready_fd); + if (dd->ifd) { + driver_select(dd->port_num, dd->ifd->fd, ERL_DRV_READ|ERL_DRV_WRITE, 0); + clear_fd_data(dd->ifd); + } - if (driver_data[ready_fd].blocking && FDBLOCK) { - driver_pdl_lock(driver_data[ready_fd].blocking->pdl); - if (driver_sizeq(driver_data[ready_fd].port_num) > 0) { - driver_pdl_unlock(driver_data[ready_fd].blocking->pdl); + if (dd->blocking && FDBLOCK) { + driver_pdl_lock(dd->blocking->pdl); + if (driver_sizeq(dd->port_num) > 0) { + driver_pdl_unlock(dd->blocking->pdl); /* We have stuff in the output queue, so we just set the state to terminating and wait for fd_async_ready to terminate the port */ if (res == 0) - driver_data[ready_fd].terminating = 2; + dd->terminating = 2; else - driver_data[ready_fd].terminating = -err; + dd->terminating = -err; return 0; } - driver_pdl_unlock(driver_data[ready_fd].blocking->pdl); + driver_pdl_unlock(dd->blocking->pdl); } if (res == 0) { - if (driver_data[ready_fd].report_exit) { - CHLD_STAT_LOCK; - - if (driver_data[ready_fd].alive) { - /* - * We have eof and want to report exit status, but the process - * hasn't exited yet. When it does report_exit_status() will - * driver_select() this fd which will make sure that we get - * back here with driver_data[ready_fd].alive == 0 and - * driver_data[ready_fd].status set. - */ - CHLD_STAT_UNLOCK; - return 0; - } - else { - int status = driver_data[ready_fd].status; - CHLD_STAT_UNLOCK; - - /* We need not be prepared for stopped/continued processes. */ - if (WIFSIGNALED(status)) - status = 128 + WTERMSIG(status); - else - status = WEXITSTATUS(status); + if (dd->alive == 1) { + /* + * We have eof and want to report exit status, but the process + * hasn't exited yet. When it does ready_input will + * driver_select() this fd which will make sure that we get + * back here with dd->alive == -1 and dd->status set. + */ + return 0; + } + else if (dd->alive == -1) { + int status = dd->status; - driver_report_exit(driver_data[ready_fd].port_num, status); - } - } - driver_failure_eof(port_num); + /* We need not be prepared for stopped/continued processes. */ + if (WIFSIGNALED(status)) + status = 128 + WTERMSIG(status); + else + status = WEXITSTATUS(status); + driver_report_exit(dd->port_num, status); + } + driver_failure_eof(dd->port_num); + } else if (dd->ifd) { + erl_drv_init_ack(dd->port_num, ERL_DRV_ERROR_ERRNO); } else { - driver_failure_posix(port_num, err); + driver_failure_posix(dd->port_num, err); } return 0; } @@ -1565,15 +1325,70 @@ static int port_inp_failure(ErlDrvPort port_num, int ready_fd, int res) static void ready_input(ErlDrvData e, ErlDrvEvent ready_fd) { - int fd = (int)(long)e; + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; ErlDrvPort port_num; int packet_bytes; int res; Uint h; - port_num = driver_data[fd].port_num; - packet_bytes = driver_data[fd].packet_bytes; + port_num = dd->port_num; + packet_bytes = dd->packet_bytes; + + ASSERT(abs(dd->ifd->fd) == ready_fd); + + if (dd->pid == 0) { + /* the pid is sent from erl_child_setup. spawn driver only. */ + char message_buffer[3 + 10 + 1 + 10 + 1]; + int reason, res; + + if((res = read(ready_fd, message_buffer, sizeof(message_buffer))) <= 0) { + /* hmm, child setup seems to have closed the pipe too early... + we close the port as there is not much else we can do */ + if (res < 0 && errno == ERRNO_BLOCK) + return; + driver_select(port_num, ready_fd, ERL_DRV_READ, 0); + if (res == 0) + errno = EPIPE; + port_inp_failure(dd, -1); + return; + } + + if(sscanf(message_buffer,"GO:%010d:%010d", &dd->pid, &reason) == 2) { + if (dd->pid == -1) { + /* Setup failed! The only reason why this should happen is if + the fork fails. */ + errno = reason; + port_inp_failure(dd, -1); + return; + } + + if (write(abs(dd->ofd->fd), "A", 1) < 0) + ; /* do nothing on failure here. If the ofd is broken, then + the ifd will probably also be broken and trigger + a port_inp_failure */ + + if (dd->ifd->fd < 0) { + driver_select(port_num, abs(dd->ifd->fd), ERL_DRV_READ|ERL_DRV_USE, 0); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sizeof(ErtsSysFdData)); + dd->ifd = NULL; + } + + if (dd->ofd->fd < 0 || driver_sizeq(port_num) > 0) + /* we select in order to close fd or write to queue, + child setup will close this fd if fd < 0 */ + driver_select(port_num, abs(dd->ofd->fd), ERL_DRV_WRITE|ERL_DRV_USE, 1); + + if (dd->alive == 1) { + forker_add_os_pid_mapping(dd); + } + erl_drv_set_os_pid(port_num, dd->pid); + erl_drv_init_ack(port_num, e); + return; + } + ASSERT(0); + return; + } if (packet_bytes == 0) { byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF, @@ -1581,76 +1396,76 @@ static void ready_input(ErlDrvData e, ErlDrvEvent ready_fd) res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ); if (res < 0) { if ((errno != EINTR) && (errno != ERRNO_BLOCK)) - port_inp_failure(port_num, ready_fd, res); + port_inp_failure(dd, res); } else if (res == 0) - port_inp_failure(port_num, ready_fd, res); - else + port_inp_failure(dd, res); + else driver_output(port_num, (char*) read_buf, res); erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf); } - else if (fd_data[ready_fd].remain > 0) { /* We try to read the remainder */ + else if (dd->ifd->remain > 0) { /* We try to read the remainder */ /* space is allocated in buf */ - res = read(ready_fd, fd_data[ready_fd].cpos, - fd_data[ready_fd].remain); + res = read(ready_fd, dd->ifd->cpos, + dd->ifd->remain); if (res < 0) { if ((errno != EINTR) && (errno != ERRNO_BLOCK)) - port_inp_failure(port_num, ready_fd, res); + port_inp_failure(dd, res); } else if (res == 0) { - port_inp_failure(port_num, ready_fd, res); + port_inp_failure(dd, res); } - else if (res == fd_data[ready_fd].remain) { /* we're done */ - driver_output(port_num, fd_data[ready_fd].buf, - fd_data[ready_fd].sz); - clear_fd_data(ready_fd); + else if (res == dd->ifd->remain) { /* we're done */ + driver_output(port_num, dd->ifd->buf, + dd->ifd->sz); + clear_fd_data(dd->ifd); } - else { /* if (res < fd_data[ready_fd].remain) */ - fd_data[ready_fd].cpos += res; - fd_data[ready_fd].remain -= res; + else { /* if (res < dd->ifd->remain) */ + dd->ifd->cpos += res; + dd->ifd->remain -= res; } } - else if (fd_data[ready_fd].remain == 0) { /* clean fd */ + else if (dd->ifd->remain == 0) { /* clean fd */ byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF, ERTS_SYS_READ_BUF_SZ); /* We make one read attempt and see what happens */ res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ); - if (res < 0) { + if (res < 0) { if ((errno != EINTR) && (errno != ERRNO_BLOCK)) - port_inp_failure(port_num, ready_fd, res); + port_inp_failure(dd, res); } else if (res == 0) { /* eof */ - port_inp_failure(port_num, ready_fd, res); - } - else if (res < packet_bytes - fd_data[ready_fd].psz) { - memcpy(fd_data[ready_fd].pbuf+fd_data[ready_fd].psz, + port_inp_failure(dd, res); + } + else if (res < packet_bytes - dd->ifd->psz) { + memcpy(dd->ifd->pbuf+dd->ifd->psz, read_buf, res); - fd_data[ready_fd].psz += res; + dd->ifd->psz += res; } else { /* if (res >= packet_bytes) */ unsigned char* cpos = read_buf; int bytes_left = res; while (1) { - int psz = fd_data[ready_fd].psz; - char* pbp = fd_data[ready_fd].pbuf + psz; + int psz = dd->ifd->psz; + char* pbp = dd->ifd->pbuf + psz; while(bytes_left && (psz < packet_bytes)) { *pbp++ = *cpos++; bytes_left--; psz++; } - + if (psz < packet_bytes) { - fd_data[ready_fd].psz = psz; + dd->ifd->psz = psz; break; } - fd_data[ready_fd].psz = 0; + dd->ifd->psz = 0; switch (packet_bytes) { - case 1: h = get_int8(fd_data[ready_fd].pbuf); break; - case 2: h = get_int16(fd_data[ready_fd].pbuf); break; - case 4: h = get_int32(fd_data[ready_fd].pbuf); break; + case 1: h = get_int8(dd->ifd->pbuf); break; + case 2: h = get_int16(dd->ifd->pbuf); break; + case 4: h = get_int32(dd->ifd->pbuf); break; default: ASSERT(0); return; /* -1; */ } @@ -1664,15 +1479,15 @@ static void ready_input(ErlDrvData e, ErlDrvEvent ready_fd) char *buf = erts_alloc_fnf(ERTS_ALC_T_FD_ENTRY_BUF, h); if (!buf) { errno = ENOMEM; - port_inp_failure(port_num, ready_fd, -1); + port_inp_failure(dd, -1); } else { erts_smp_atomic_add_nob(&sys_misc_mem_sz, h); sys_memcpy(buf, cpos, bytes_left); - fd_data[ready_fd].buf = buf; - fd_data[ready_fd].sz = h; - fd_data[ready_fd].remain = h - bytes_left; - fd_data[ready_fd].cpos = buf + bytes_left; + dd->ifd->buf = buf; + dd->ifd->sz = h; + dd->ifd->remain = h - bytes_left; + dd->ifd->cpos = buf + bytes_left; } break; } @@ -1689,17 +1504,24 @@ static void ready_input(ErlDrvData e, ErlDrvEvent ready_fd) static void ready_output(ErlDrvData e, ErlDrvEvent ready_fd) { - int fd = (int)(long)e; - ErlDrvPort ix = driver_data[fd].port_num; + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + ErlDrvPort ix = dd->port_num; int n; struct iovec* iv; int vsize; - if ((iv = (struct iovec*) driver_peekq(ix, &vsize)) == NULL) { driver_select(ix, ready_fd, ERL_DRV_WRITE, 0); - if (driver_data[fd].terminating) - driver_failure_atom(driver_data[fd].port_num,"normal"); + if (dd->pid > 0 && dd->ofd->fd < 0) { + /* The port was opened with 'in' option, which means we + should close the output fd as soon as the command has + been sent. */ + driver_select(ix, ready_fd, ERL_DRV_WRITE|ERL_DRV_USE, 0); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sizeof(ErtsSysFdData)); + dd->ofd = NULL; + } + if (dd->terminating) + driver_failure_atom(dd->port_num,"normal"); return; /* 0; */ } vsize = vsize > MAX_VSIZE ? MAX_VSIZE : vsize; @@ -1731,7 +1553,7 @@ static void fd_async(void *async_data) { int res; - ErtsSysDriverData *dd = (ErtsSysDriverData*)async_data; + ErtsSysDriverData *dd = (ErtsSysDriverData *)async_data; SysIOVec *iov0; SysIOVec *iov; int iovlen; @@ -1751,7 +1573,7 @@ fd_async(void *async_data) driver_pdl_unlock(dd->blocking->pdl); do { - res = writev(dd->ofd, iov, iovlen); + res = writev(dd->ofd->fd, iov, iovlen); } while (res < 0 && errno == EINTR); if (res < 0) err = errno; @@ -1769,7 +1591,6 @@ void fd_ready_async(ErlDrvData drv_data, ErlDrvPort port_num = dd->port_num; ASSERT(dd->blocking); - ASSERT(dd == (driver_data + (int)(long)drv_data)); if (dd->blocking->res > 0) { driver_pdl_lock(dd->blocking->pdl); @@ -1807,177 +1628,259 @@ void fd_ready_async(ErlDrvData drv_data, #endif -static ERTS_INLINE void -report_exit_status(ErtsSysReportExit *rep, int status) +/* Forker driver */ + +static int forker_fd; +static Hash *forker_hash; +static erts_smp_mtx_t forker_hash_mtx; + +static void ffree(void *e); + +static ErlDrvData forker_start(ErlDrvPort port_num, char* name, + SysDriverOpts* opts) { - Port *pp; -#ifdef ERTS_SMP - CHLD_STAT_UNLOCK; - pp = erts_thr_id2port_sflgs(rep->port, - ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP); - CHLD_STAT_LOCK; -#else - pp = erts_id2port_sflgs(rep->port, - NULL, - 0, - ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP); -#endif - if (pp) { - if (rep->ifd >= 0) { - driver_data[rep->ifd].alive = 0; - driver_data[rep->ifd].status = status; - (void) driver_select(ERTS_Port2ErlDrvPort(pp), - rep->ifd, - (ERL_DRV_READ|ERL_DRV_USE), - 1); - } - if (rep->ofd >= 0) { - driver_data[rep->ofd].alive = 0; - driver_data[rep->ofd].status = status; - (void) driver_select(ERTS_Port2ErlDrvPort(pp), - rep->ofd, - (ERL_DRV_WRITE|ERL_DRV_USE), - 1); - } -#ifdef ERTS_SMP - erts_thr_port_release(pp); -#else - erts_port_release(pp); -#endif + + int i; + int fds[2]; + int res, unbind; + char bindir[MAXPATHLEN]; + size_t bindirsz = sizeof(bindir); + Uint csp_path_sz; + char *child_setup_prog; + + forker_port = erts_drvport2id(port_num); + + res = erts_sys_getenv_raw("BINDIR", bindir, &bindirsz); + if (res != 0) { + if (res < 0) + erl_exit(-1, + "Environment variable BINDIR is not set\n"); + if (res > 0) + erl_exit(-1, + "Value of environment variable BINDIR is too large\n"); + } + if (bindir[0] != DIR_SEPARATOR_CHAR) + erl_exit(-1, + "Environment variable BINDIR does not contain an" + " absolute path\n"); + csp_path_sz = (strlen(bindir) + + 1 /* DIR_SEPARATOR_CHAR */ + + sizeof(CHILD_SETUP_PROG_NAME) + + 1); + child_setup_prog = erts_alloc(ERTS_ALC_T_CS_PROG_PATH, csp_path_sz); + erts_snprintf(child_setup_prog, csp_path_sz, + "%s%c%s", + bindir, + DIR_SEPARATOR_CHAR, + CHILD_SETUP_PROG_NAME); + if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0) { + erl_exit(ERTS_ABORT_EXIT, + "Could not open unix domain socket in spawn_init: %d\n", + errno); } - erts_free(ERTS_ALC_T_PRT_REP_EXIT, rep); -} -#if !CHLDWTHR /* ---------------------------------------------------------- */ + forker_fd = fds[0]; -#define ERTS_REPORT_EXIT_STATUS report_exit_status + unbind = erts_sched_bind_atfork_prepare(); -int check_children(void) -{ - int res = 0; - int pid; - int status; + i = fork(); -#ifndef ERTS_SMP - if (children_died) -#endif - { - sys_sigblock(SIGCHLD); - CHLD_STAT_LOCK; - while ((pid = waitpid(-1, &status, WNOHANG)) > 0) - note_child_death(pid, status); -#ifndef ERTS_SMP - children_died = 0; + if (i == 0) { + /* The child */ + char *cs_argv[FORKER_ARGV_NO_OF_ARGS] = + {CHILD_SETUP_PROG_NAME, NULL, NULL}; + char buff[128]; + + erts_sched_bind_atfork_child(unbind); + + snprintf(buff, 128, "%d", sys_max_files()); + cs_argv[FORKER_ARGV_MAX_FILES] = buff; + + /* We preallocate fd 3 for the uds fd */ + if (fds[1] != 3) { + dup2(fds[1], 3); + } + +#if defined(USE_SETPGRP_NOARGS) /* SysV */ + (void) setpgrp(); +#elif defined(USE_SETPGRP) /* BSD */ + (void) setpgrp(0, getpid()); +#else /* POSIX */ + (void) setsid(); #endif - CHLD_STAT_UNLOCK; - sys_sigrelease(SIGCHLD); - res = 1; + + execv(child_setup_prog, cs_argv); + _exit(1); } - return res; -} -#ifdef ERTS_SMP + erts_sched_bind_atfork_parent(unbind); -void -erts_check_children(void) -{ - (void) check_children(); -} + erts_free(ERTS_ALC_T_CS_PROG_PATH, child_setup_prog); -#endif + close(fds[1]); -#elif CHLDWTHR && defined(ERTS_SMP) /* ------------------------------------- */ + SET_NONBLOCKING(forker_fd); -#define ERTS_REPORT_EXIT_STATUS report_exit_status + driver_select(port_num, forker_fd, ERL_DRV_READ|ERL_DRV_USE, 1); -int check_children(void) + return (ErlDrvData)port_num; +} + +static void forker_stop(ErlDrvData e) { - return 0; + /* we probably should do something here */ } -#else /* CHLDWTHR && !defined(ERTS_SMP) ------------------------------------ */ +static void forker_ready_input(ErlDrvData e, ErlDrvEvent fd) +{ + int res, *exit_status; + char buff[8+10+1+10+1] = {0}; + ErtsSysExitStatus est, *es; + + if ((res = read(fd, buff, sizeof(buff))) < 0) { + if (errno == ERRNO_BLOCK) + return; + erl_exit(ERTS_DUMP_EXIT, "Failed to read from erl_child_setup: %d\n", errno); + } + + if (res == 0) + erl_exit(ERTS_DUMP_EXIT, "erl_child_setup closed\n"); + + if (sscanf(buff, "SIGCHLD:%010d:%010d", &est.os_pid, &res) != 2) + erl_exit(ERTS_DUMP_EXIT, "Got corrupt data from erl_child_setup: %.s\n", + buff, sizeof(buff)); + + erts_smp_mtx_lock(&forker_hash_mtx); + es = hash_remove(forker_hash, &est); + erts_smp_mtx_unlock(&forker_hash_mtx); + + if (!es) + return; -#define ERTS_REPORT_EXIT_STATUS initiate_report_exit_status + exit_status = erts_alloc(ERTS_ALC_T_DRV_CTRL_DATA, sizeof(int)); + exit_status[0] = res; + /* ideally this would be a port_command call, but as command is + already used by the spawn_driver, we use control instead. + Note that when using erl_drv_port_control it is an asynchronous + control. */ + erl_drv_port_control(es->port_id, 'S', (char*)exit_status, sizeof(int)); -static ERTS_INLINE void -initiate_report_exit_status(ErtsSysReportExit *rep, int status) + ffree(es); + +} + +static void forker_ready_output(ErlDrvData e, ErlDrvEvent fd) { - rep->next = report_exit_transit_list; - rep->status = status; - report_exit_transit_list = rep; - erts_sys_schedule_interrupt(1); + ErlDrvPort port_num = (ErlDrvPort)e; + + while(driver_sizeq(port_num) > 0) { + int vlen; + SysIOVec *iov = driver_peekq(port_num, &vlen); + int *fds = (int*)iov[0].iov_base; + ASSERT(iov[0].iov_len >= sizeof(int)*3); + if (sys_uds_write(forker_fd, "S", 1, fds, 3, 0) < 0) { + if (errno == ERRNO_BLOCK) + return; + erl_exit(ERTS_DUMP_EXIT, "Failed to write to erl_child_setup: %d\n", errno); + } + close(fds[0]); + close(fds[1]); + if (fds[1] != fds[2]) + close(fds[2]); + driver_deq(port_num, sizeof(int)*3); + } + + driver_select(port_num, forker_fd, ERL_DRV_WRITE, 0); } -int check_children(void) +static ErlDrvSSizeT forker_control(ErlDrvData e, unsigned int cmd, char *buf, + ErlDrvSizeT len, char **rbuf, ErlDrvSizeT rlen) { + int *fds = (int*)buf; + ErlDrvPort port_num = (ErlDrvPort)e; int res; - ErtsSysReportExit *rep; - CHLD_STAT_LOCK; - rep = report_exit_transit_list; - res = rep != NULL; - while (rep) { - ErtsSysReportExit *curr_rep = rep; - rep = rep->next; - report_exit_status(curr_rep, curr_rep->status); + + if (driver_sizeq(port_num) > 0) { + driver_enq(port_num, buf, len); + return 0; } - report_exit_transit_list = NULL; - CHLD_STAT_UNLOCK; - return res; + + if ((res = sys_uds_write(forker_fd, "S", 1, fds, 3, 0)) < 0) { + if (errno == ERRNO_BLOCK) { + driver_enq(port_num, buf, len); + driver_select(port_num, forker_fd, ERL_DRV_WRITE|ERL_DRV_USE, 1); + return 0; + } + erl_exit(ERTS_DUMP_EXIT, "Failed to write to erl_child_setup: %d\n", errno); + } + close(fds[0]); + close(fds[1]); + if (fds[1] != fds[2]) + close(fds[2]); + return 0; } -#endif /* ------------------------------------------------------------------ */ +static void forker_add_os_pid_mapping(ErtsSysDriverData *dd) +{ + Eterm port_id = erts_drvport2id(dd->port_num); + ErtsSysExitStatus es; + es.os_pid = dd->pid; + es.port_id = port_id; + erts_smp_mtx_lock(&forker_hash_mtx); + hash_put(forker_hash, &es); + erts_smp_mtx_unlock(&forker_hash_mtx); +} -static void note_child_death(int pid, int status) +static void forker_remove_os_pid_mapping(ErtsSysDriverData *dd) { - ErtsSysReportExit **repp = &report_exit_list; - ErtsSysReportExit *rep = report_exit_list; - - while (rep) { - if (pid == rep->pid) { - *repp = rep->next; - ERTS_REPORT_EXIT_STATUS(rep, status); - break; - } - repp = &rep->next; - rep = rep->next; - } + ErtsSysExitStatus est, *es; + est.os_pid = dd->pid; + erts_smp_mtx_lock(&forker_hash_mtx); + es = hash_remove(forker_hash, &est); + erts_smp_mtx_unlock(&forker_hash_mtx); + if (es) + ffree(es); } -#if CHLDWTHR +static int fcmp(void *a, void *b) +{ + ErtsSysExitStatus *sa = a; + ErtsSysExitStatus *sb = b; + return !(sa->os_pid == sb->os_pid); +} -static void * -child_waiter(void *unused) +static HashValue fhash(void *e) { - int pid; - int status; + ErtsSysExitStatus *se = e; + return make_hash2(make_small(se->os_pid)); +} -#ifdef ERTS_ENABLE_LOCK_CHECK - erts_lc_set_thread_name("child waiter"); -#endif +static void *falloc(void *e) +{ + ErtsSysExitStatus *se = e; + ErtsSysExitStatus *ne = erts_alloc(ERTS_ALC_T_DRV, sizeof(ErtsSysExitStatus)); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, sizeof(ErtsSysBlocking)); + ne->os_pid = se->os_pid; + ne->port_id = se->port_id; + return ne; +} - while(1) { -#ifdef DEBUG - int waitpid_errno; -#endif - pid = waitpid(-1, &status, 0); -#ifdef DEBUG - waitpid_errno = errno; -#endif - CHLD_STAT_LOCK; - if (pid < 0) { - ASSERT(waitpid_errno == ECHILD); - } - else { - children_alive--; - ASSERT(children_alive >= 0); - note_child_death(pid, status); - } - while (!children_alive) - CHLD_STAT_WAIT; /* Wait for children to wait on... :) */ - CHLD_STAT_UNLOCK; - } - - return NULL; +static void ffree(void *e) +{ + erts_free(ERTS_ALC_T_DRV, e); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sizeof(ErtsSysBlocking)); } -#endif +static int forker_init(void) +{ + HashFunctions forker_hash_functions; + forker_hash_functions.hash = fhash; + forker_hash_functions.cmp = fcmp; + forker_hash_functions.alloc = falloc; + forker_hash_functions.free = ffree; + forker_hash = hash_new(ERTS_ALC_T_DRV, "forker_hash", + 16, forker_hash_functions); + erts_smp_mtx_init(&forker_hash_mtx, "forker_hash_mtx"); + + return 1; +} diff --git a/erts/emulator/sys/unix/sys_uds.c b/erts/emulator/sys/unix/sys_uds.c new file mode 100644 index 0000000000..3b63d05cf6 --- /dev/null +++ b/erts/emulator/sys/unix/sys_uds.c @@ -0,0 +1,125 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2002-2009. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + + +#include "sys_uds.h" + +int +sys_uds_readv(int fd, struct iovec *iov, size_t iov_len, + int *fds, int fd_count, int flags) { + struct msghdr msg; + struct cmsghdr *cmsg = NULL; + char ancillary_buff[256] = {0}; + int res, i = 0; + + /* setup a place to fill in message contents */ + memset(&msg, 0, sizeof(struct msghdr)); + msg.msg_iov = iov; + msg.msg_iovlen = iov_len; + + /* provide space for the ancillary data */ + msg.msg_control = ancillary_buff; + msg.msg_controllen = sizeof(ancillary_buff); + + if((res = recvmsg(fd, &msg, flags)) < 0) { + return res; + } + + if((msg.msg_flags & MSG_CTRUNC) == MSG_CTRUNC) + { + /* We assume that we have given enough space for any header + that are sent to us. So the only remaining reason to get + this flag set is if the caller has run out of file descriptors. + */ + errno = EMFILE; + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg) ) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + int *cmsg_data = (int *)CMSG_DATA(cmsg); + while ((char*)cmsg_data < (char*)cmsg + cmsg->cmsg_len) { + if (i < fd_count) { + fds[i++] = *cmsg_data++; + } else { + /* for some strange reason, we have received more FD's + than we wanted... close them if we are not running + debug. */ + if(i >= fd_count) abort(); + close(*cmsg_data++); + } + } + } + } + + return res; +} + +int +sys_uds_read(int fd, char *buff, size_t len, + int *fds, int fd_count, int flags) { + struct iovec iov; + iov.iov_base = buff; + iov.iov_len = len; + return sys_uds_readv(fd, &iov, 1, fds, fd_count, flags); +} + + +int +sys_uds_writev(int fd, struct iovec *iov, size_t iov_len, + int *fds, int fd_count, int flags) { + + struct msghdr msg; + struct cmsghdr *cmsg = NULL; + int res; + + /* initialize socket message */ + memset(&msg, 0, sizeof(struct msghdr)); + msg.msg_iov = iov; + msg.msg_iovlen = iov_len; + + /* initialize the ancillary data */ + msg.msg_control = calloc(1, CMSG_SPACE(sizeof(int) * fd_count)); + msg.msg_controllen = CMSG_SPACE(sizeof(int) * fd_count); + + /* copy the fd array into the ancillary data */ + cmsg = CMSG_FIRSTHDR(&msg); + if(!cmsg) abort(); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * fd_count); + memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * fd_count); + + res = sendmsg(fd, &msg, flags); + + free(msg.msg_control); + + return res; +} + +int +sys_uds_write(int fd, char *buff, size_t len, + int *fds, int fd_count, int flags) { + struct iovec iov; + iov.iov_base = buff; + iov.iov_len = len; + return sys_uds_writev(fd, &iov, 1, fds, fd_count, flags); +} diff --git a/erts/emulator/sys/unix/sys_uds.h b/erts/emulator/sys/unix/sys_uds.h new file mode 100644 index 0000000000..7ff58b17dd --- /dev/null +++ b/erts/emulator/sys/unix/sys_uds.h @@ -0,0 +1,47 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2002-2009. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +#ifndef _ERL_UNIX_UDS_H +#define _ERL_UNIX_UDS_H + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if defined(__sun__) && !defined(_XOPEN_SOURCE) +#define _XOPEN_SOURCE 500 +#endif + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/uio.h> + +#include "sys.h" + +int sys_uds_readv(int fd, struct iovec *iov, size_t iov_len, + int *fds, int fd_count, int flags); +int sys_uds_read(int fd, char *buff, size_t len, + int *fds, int fd_count, int flags); +int sys_uds_writev(int fd, struct iovec *iov, size_t iov_len, + int *fds, int fd_count, int flags); +int sys_uds_write(int fd, char *buff, size_t len, + int *fds, int fd_count, int flags); + +#endif /* #ifndef _ERL_UNIX_UDS_H */ |