aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/sys/unix
diff options
context:
space:
mode:
Diffstat (limited to 'erts/emulator/sys/unix')
-rw-r--r--erts/emulator/sys/unix/erl_child_setup.c560
-rw-r--r--erts/emulator/sys/unix/erl_child_setup.h77
-rw-r--r--erts/emulator/sys/unix/erl_unix_sys.h55
-rw-r--r--erts/emulator/sys/unix/sys.c1937
-rw-r--r--erts/emulator/sys/unix/sys_drivers.c1862
-rw-r--r--erts/emulator/sys/unix/sys_time.c114
-rw-r--r--erts/emulator/sys/unix/sys_uds.c155
-rw-r--r--erts/emulator/sys/unix/sys_uds.h57
8 files changed, 2773 insertions, 2044 deletions
diff --git a/erts/emulator/sys/unix/erl_child_setup.c b/erts/emulator/sys/unix/erl_child_setup.c
index a3c5c20641..8dd14bbac6 100644
--- a/erts/emulator/sys/unix/erl_child_setup.c
+++ b/erts/emulator/sys/unix/erl_child_setup.c
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 2002-2009. All Rights Reserved.
+ * Copyright Ericsson AB 2002-2015. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,109 +19,234 @@
*/
/*
- * After a vfork() (or fork()) the child exec()s to this program which
- * sets up the child and exec()s to the user program (see spawn_start()
- * in sys.c and ticket OTP-4389).
+ * This program is started at erts startup and all fork's that
+ * have to be done are done in here. This is done for a couple
+ * of reasons:
+ * - Allow usage of fork without a memory explosion.
+ * -- we do not want to use vfork, as it blocks the VM
+ * until the execv is done, and if the program that
+ * is to be executed is on an NFS that is unavailable,
+ * the execv can block for a very long time.
+ * -- we cannot do fork inside the VM as that would temporarily
+ * duplicate the memory usage of the VM per parallel exec.
+ *
+ * Some implementation notes:
+ * - A single Unix Domain Socket is setup in between the VM and
+ * this program. Over that UDS the file descriptors that should
+ * be used to talk to the child program are sent.
+ * The actual command to execute, together with options and the
+ * environment, is sent over the pipe represented by the
+ * file descriptors mentioned above. We don't send the
+ * command over the UDS as that would increase the likely hood
+ * that it's buffer would be full.
+ *
+ * - Since it is this program that execv's, it has to take care of
+ * all the SIGCHLD signals that the child programs generate. The
+ * signals are received and the pid+exit reason is sent as data
+ * on the UDS to the VM. The VM is then able to map the pid to the
+ * port of the child program that just exited and deliver the status
+ * code if requested.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
-#define NEED_CHILD_SETUP_DEFINES
-#include "sys.h"
-#include "erl_misc_utils.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/wait.h>
-#ifdef SIG_SIGSET /* Old SysV */
-void sys_sigrelease(int sig)
+#define WANT_NONBLOCKING
+
+#include "erl_driver.h"
+#include "sys_uds.h"
+#include "hash.h"
+#include "erl_term.h"
+#include "erl_child_setup.h"
+
+#define SET_CLOEXEC(fd) fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC)
+
+#if defined(__ANDROID__)
+#define SHELL "/system/bin/sh"
+#else
+#define SHELL "/bin/sh"
+#endif /* __ANDROID__ */
+
+//#define HARD_DEBUG
+#ifdef HARD_DEBUG
+#define DEBUG_PRINT(fmt, ...) fprintf(stderr, fmt "\r\n", ##__VA_ARGS__)
+#else
+#define DEBUG_PRINT(fmt, ...)
+#endif
+
+#define ABORT(fmt, ...) do { \
+ fprintf(stderr, "erl_child_setup: " fmt "\r\n", ##__VA_ARGS__); \
+ abort(); \
+ } while(0)
+
+#ifdef DEBUG
+void
+erl_assert_error(const char* expr, const char* func, const char* file, int line)
{
- sigrelse(sig);
+ fflush(stdout);
+ fprintf(stderr, "%s:%d:%s() Assertion failed: %s\n",
+ file, line, func, expr);
+ fflush(stderr);
+ abort();
}
-#else /* !SIG_SIGSET */
-#ifdef SIG_SIGNAL /* Old BSD */
-sys_sigrelease(int sig)
+#endif
+
+void sys_sigblock(int sig)
{
- sigsetmask(sigblock(0) & ~sigmask(sig));
+ sigset_t mask;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, sig);
+ sigprocmask(SIG_BLOCK, &mask, (sigset_t *)NULL);
}
-#else /* !SIG_SIGNAL */ /* The True Way - POSIX!:-) */
+
void sys_sigrelease(int sig)
{
sigset_t mask;
-
sigemptyset(&mask);
sigaddset(&mask, sig);
sigprocmask(SIG_UNBLOCK, &mask, (sigset_t *)NULL);
}
-#endif /* !SIG_SIGNAL */
-#endif /* !SIG_SIGSET */
-
-#if defined(__ANDROID__)
-static int system_properties_fd(void);
-#endif /* __ANDROID__ */
-#if defined(__ANDROID__)
-#define SHELL "/system/bin/sh"
-#else
-#define SHELL "/bin/sh"
-#endif /* __ANDROID__ */
+static void add_os_pid_to_port_id_mapping(Eterm, pid_t);
+static Eterm get_port_id(pid_t);
+static int forker_hash_init(void);
+static int max_files = -1;
+static int sigchld_pipe[2];
-int
-main(int argc, char *argv[])
+static int
+start_new_child(int pipes[])
{
- int i, from, to;
- int erts_spawn_executable = 0;
+ int size, res, i, pos = 0;
+ char *buff, *o_buff;
+
+ char *cmd, *wd, **new_environ, **args = NULL;
+
+ Sint cnt, flags;
- /* OBSERVE!
- * Keep child setup after fork() (implemented in sys.c) up to date
- * if changes are made here.
- */
+ /* only child executes here */
- if (argc != CS_ARGV_NO_OF_ARGS) {
- if (argc < CS_ARGV_NO_OF_ARGS) {
- return 1;
- } else {
- erts_spawn_executable = 1;
- }
+ do {
+ res = read(pipes[0], (char*)&size, sizeof(size));
+ } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK));
+
+ if (res <= 0) {
+ goto child_error;
}
- if (strcmp("false", argv[CS_ARGV_UNBIND_IX]) != 0)
- if (erts_unbind_from_cpu_str(argv[CS_ARGV_UNBIND_IX]) != 0)
- return 1;
+ buff = malloc(size);
+
+ DEBUG_PRINT("size = %d", size);
+
+ do {
+ if ((res = read(pipes[0], buff + pos, size - pos)) < 0) {
+ if (errno == ERRNO_BLOCK || errno == EINTR)
+ continue;
+ goto child_error;
+ }
+ if (res == 0) {
+ errno = EPIPE;
+ goto child_error;
+ }
+ pos += res;
+ } while(size - pos != 0);
+
+ o_buff = buff;
+
+ flags = get_int32(buff);
+ buff += sizeof(Sint32);
- for (i = 0; i < CS_ARGV_NO_OF_DUP2_OPS; i++) {
- if (argv[CS_ARGV_DUP2_OP_IX(i)][0] == '-'
- && argv[CS_ARGV_DUP2_OP_IX(i)][1] == '\0')
- break;
- if (sscanf(argv[CS_ARGV_DUP2_OP_IX(i)], "%d:%d", &from, &to) != 2)
- return 1;
- if (dup2(from, to) < 0)
- return 1;
+ DEBUG_PRINT("flags = %d", flags);
+
+ cmd = buff;
+ buff += strlen(buff) + 1;
+ if (*buff == '\0') {
+ wd = NULL;
+ } else {
+ wd = buff;
+ buff += strlen(buff) + 1;
}
+ buff++;
- if (sscanf(argv[CS_ARGV_FD_CR_IX], "%d:%d", &from, &to) != 2)
- return 1;
+ DEBUG_PRINT("wd = %s", wd);
-#if defined(HAVE_CLOSEFROM)
- closefrom(from);
-#elif defined(__ANDROID__)
- if (from <= to) {
- int spfd = system_properties_fd();
- for (i = from; i <= to; i++) {
- if (i != spfd) {
- (void) close(i);
- }
- }
+ cnt = get_int32(buff);
+ buff += sizeof(Sint32);
+ new_environ = malloc(sizeof(char*)*(cnt + 1));
+
+ DEBUG_PRINT("env_len = %ld", cnt);
+ for (i = 0; i < cnt; i++, buff++) {
+ new_environ[i] = buff;
+ while(*buff != '\0') buff++;
}
-#else /* !__ANDROID__ */
- for (i = from; i <= to; i++) {
- (void) close(i);
+ new_environ[cnt] = NULL;
+
+ if (o_buff + size != buff) {
+ /* This is a spawn executable call */
+ cnt = get_int32(buff);
+ buff += sizeof(Sint32);
+ args = malloc(sizeof(char*)*(cnt + 1));
+ for (i = 0; i < cnt; i++, buff++) {
+ args[i] = buff;
+ while(*buff != '\0') buff++;
+ }
+ args[cnt] = NULL;
}
-#endif /* HAVE_CLOSEFROM */
- if (!(argv[CS_ARGV_WD_IX][0] == '.' && argv[CS_ARGV_WD_IX][1] == '\0')
- && chdir(argv[CS_ARGV_WD_IX]) < 0)
- return 1;
+ if (o_buff + size != buff) {
+ errno = EINVAL;
+ goto child_error;
+ }
+
+ DEBUG_PRINT("read ack");
+ do {
+ ErtsSysForkerProto proto;
+ res = read(pipes[0], &proto, sizeof(proto));
+ if (res > 0) {
+ ASSERT(proto.action == ErtsSysForkerProtoAction_Ack);
+ ASSERT(res == sizeof(proto));
+ }
+ } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK));
+ if (res < 1) {
+ errno = EPIPE;
+ goto child_error;
+ }
+
+ DEBUG_PRINT("Do that forking business: '%s'\n",cmd);
+
+ /* When the dup2'ing below is done, only
+ fd's 0, 1, 2 and maybe 3, 4 should survive the
+ exec. All other fds (i.e. the unix domain sockets
+ and stray pipe ends) should have CLOEXEC set on them
+ so they will be closed when the exec happens */
+ if (flags & FORKER_FLAG_USE_STDIO) {
+ /* stdin for process */
+ if (flags & FORKER_FLAG_DO_WRITE &&
+ dup2(pipes[0], 0) < 0)
+ goto child_error;
+ /* stdout for process */
+ if (flags & FORKER_FLAG_DO_READ &&
+ dup2(pipes[1], 1) < 0)
+ goto child_error;
+ }
+ else { /* XXX will fail if pipes[0] == 4 (unlikely..) */
+ if (flags & FORKER_FLAG_DO_READ && dup2(pipes[1], 4) < 0)
+ goto child_error;
+ if (flags & FORKER_FLAG_DO_WRITE && dup2(pipes[0], 3) < 0)
+ goto child_error;
+ }
+
+ if (dup2(pipes[2], 2) < 0)
+ goto child_error;
+
+ if (wd && chdir(wd) < 0)
+ goto child_error;
#if defined(USE_SETPGRP_NOARGS) /* SysV */
(void) setpgrp();
@@ -131,34 +256,301 @@ main(int argc, char *argv[])
(void) setsid();
#endif
+ close(pipes[0]);
+ close(pipes[1]);
+ close(pipes[2]);
+
sys_sigrelease(SIGCHLD);
- sys_sigrelease(SIGINT);
- sys_sigrelease(SIGUSR1);
-
- if (erts_spawn_executable) {
- if (argv[CS_ARGV_NO_OF_ARGS + 1] == NULL) {
- execl(argv[CS_ARGV_NO_OF_ARGS],argv[CS_ARGV_NO_OF_ARGS],
- (char *) NULL);
- } else {
- execv(argv[CS_ARGV_NO_OF_ARGS],&(argv[CS_ARGV_NO_OF_ARGS + 1]));
- }
+
+ if (args) {
+ /* spawn_executable */
+ execve(cmd, args, new_environ);
} else {
- execl(SHELL, "sh", "-c", argv[CS_ARGV_CMD_IX], (char *) NULL);
+ execle(SHELL, "sh", "-c", cmd, (char *) NULL, new_environ);
}
- return 1;
+child_error:
+ DEBUG_PRINT("exec error: %d\r\n",errno);
+ _exit(128 + errno);
+}
+
+
+/*
+ * [OTP-3906]
+ * Solaris signal management gets confused when threads are used and a
+ * lot of child processes dies. The confusion results in that SIGCHLD
+ * signals aren't delivered to the emulator which in turn results in
+ * a lot of defunct processes in the system.
+ *
+ * The problem seems to appear when a signal is frequently
+ * blocked/unblocked at the same time as the signal is frequently
+ * propagated. The child waiter thread is a workaround for this problem.
+ * The SIGCHLD signal is always blocked (in all threads), and the child
+ * waiter thread fetches the signal by a call to sigwait(). See
+ * child_waiter().
+ *
+ * This should be a non-issue since the fork:ing was moved outside of
+ * the emulator into erl_child_setup. I'm leaving the comment here
+ * for posterity. */
+
+static void handle_sigchld(int sig) {
+ int buff[2], res;
+
+ sys_sigblock(SIGCHLD);
+
+ while ((buff[0] = waitpid((pid_t)(-1), buff+1, WNOHANG)) > 0) {
+ do {
+ res = write(sigchld_pipe[1], buff, sizeof(buff));
+ } while (res < 0 && errno == EINTR);
+ if (res <= 0)
+ ABORT("Failed to write to sigchld_pipe (%d): %d (%d)", sigchld_pipe[1], res, errno);
+ DEBUG_PRINT("Reap child %d (%d)", buff[0], buff[1]);
+ }
+
+ sys_sigrelease(SIGCHLD);
}
#if defined(__ANDROID__)
static int system_properties_fd(void)
{
- int fd;
+ static int fd = -2;
char *env;
+ if (fd != -2) return fd;
env = getenv("ANDROID_PROPERTY_WORKSPACE");
if (!env) {
+ fd = -1;
return -1;
}
fd = atoi(env);
return fd;
}
#endif /* __ANDROID__ */
+
+int
+main(int argc, char *argv[])
+{
+ /* This fd should be open from beam */
+ int uds_fd = 3, max_fd = 3;
+#ifndef HAVE_CLOSEFROM
+ int i;
+#endif
+ struct sigaction sa;
+
+ if (argc < 1 || sscanf(argv[1],"%d",&max_files) != 1) {
+ ABORT("Invalid arguments to child_setup");
+ }
+
+/* We close all fds except the uds from beam.
+ All other fds from now on will have the
+ CLOEXEC flags set on them. This means that we
+ only have to close a very limited number of fds
+ after we fork before the exec. */
+#if defined(HAVE_CLOSEFROM)
+ closefrom(4);
+#else
+ for (i = 4; i < max_files; i++)
+#if defined(__ANDROID__)
+ if (i != system_properties_fd())
+#endif
+ (void) close(i);
+#endif
+
+ if (pipe(sigchld_pipe) < 0) {
+ ABORT("Failed to setup sigchld pipe (%d)", errno);
+ }
+
+ SET_CLOEXEC(sigchld_pipe[0]);
+ SET_CLOEXEC(sigchld_pipe[1]);
+
+ max_fd = max_fd < sigchld_pipe[0] ? sigchld_pipe[0] : max_fd;
+
+ sa.sa_handler = &handle_sigchld;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
+ if (sigaction(SIGCHLD, &sa, 0) == -1) {
+ perror(0);
+ exit(1);
+ }
+
+ forker_hash_init();
+
+ SET_CLOEXEC(uds_fd);
+
+ DEBUG_PRINT("Starting forker %d", max_files);
+
+ while (1) {
+ fd_set read_fds;
+ int res;
+ FD_ZERO(&read_fds);
+ FD_SET(uds_fd, &read_fds);
+ FD_SET(sigchld_pipe[0], &read_fds);
+ DEBUG_PRINT("child_setup selecting on %d, %d (%d)",
+ uds_fd, sigchld_pipe[0], max_fd);
+ res = select(max_fd+1, &read_fds, NULL, NULL, NULL);
+
+ if (res < 0) {
+ if (errno == EINTR) continue;
+ ABORT("Select failed: %d (%d)",res, errno);
+ }
+
+ if (FD_ISSET(uds_fd, &read_fds)) {
+ int pipes[3], res, os_pid;
+ ErtsSysForkerProto proto;
+ errno = 0;
+ if ((res = sys_uds_read(uds_fd, (char*)&proto, sizeof(proto),
+ pipes, 3, MSG_DONTWAIT)) < 0) {
+ if (errno == EINTR)
+ continue;
+ DEBUG_PRINT("erl_child_setup failed to read from uds: %d, %d", res, errno);
+ _exit(0);
+ }
+
+ if (res == 0) {
+ DEBUG_PRINT("uds was closed!");
+ _exit(0);
+ }
+ /* Since we use unix domain sockets and send the entire data in
+ one go we *should* get the entire payload at once. */
+ ASSERT(res == sizeof(proto));
+ ASSERT(proto.action == ErtsSysForkerProtoAction_Start);
+
+ sys_sigblock(SIGCHLD);
+
+ errno = 0;
+
+ os_pid = fork();
+ if (os_pid == 0)
+ start_new_child(pipes);
+
+ add_os_pid_to_port_id_mapping(proto.u.start.port_id, os_pid);
+
+ /* We write an ack here, but expect the reply on
+ the pipes[0] inside the fork */
+ proto.action = ErtsSysForkerProtoAction_Go;
+ proto.u.go.os_pid = os_pid;
+ proto.u.go.error_number = errno;
+ while (write(pipes[1], &proto, sizeof(proto)) < 0 && errno == EINTR)
+ ; /* remove gcc warning */
+
+#ifdef FORKER_PROTO_START_ACK
+ proto.action = ErtsSysForkerProtoAction_StartAck;
+ while (write(uds_fd, &proto, sizeof(proto)) < 0 && errno == EINTR)
+ ; /* remove gcc warning */
+#endif
+
+ sys_sigrelease(SIGCHLD);
+ close(pipes[0]);
+ close(pipes[1]);
+ close(pipes[2]);
+ }
+
+ if (FD_ISSET(sigchld_pipe[0], &read_fds)) {
+ int ibuff[2];
+ ErtsSysForkerProto proto;
+ res = read(sigchld_pipe[0], ibuff, sizeof(ibuff));
+ if (res <= 0) {
+ if (errno == EINTR)
+ continue;
+ ABORT("Failed to read from sigchld pipe: %d (%d)", res, errno);
+ }
+
+ proto.u.sigchld.port_id = get_port_id((pid_t)(ibuff[0]));
+
+ if (proto.u.sigchld.port_id == THE_NON_VALUE)
+ continue; /* exit status report not requested */
+
+ proto.action = ErtsSysForkerProtoAction_SigChld;
+ proto.u.sigchld.error_number = ibuff[1];
+ DEBUG_PRINT("send %s to %d", buff, uds_fd);
+ if (write(uds_fd, &proto, sizeof(proto)) < 0) {
+ if (errno == EINTR)
+ continue;
+ /* The uds was close, which most likely means that the VM
+ has exited. This will be detected when we try to read
+ from the uds_fd. */
+ DEBUG_PRINT("Failed to write to uds: %d (%d)", uds_fd, errno);
+ }
+ }
+ }
+ return 1;
+}
+
+typedef struct exit_status {
+ HashBucket hb;
+ pid_t os_pid;
+ Eterm port_id;
+} ErtsSysExitStatus;
+
+static Hash *forker_hash;
+
+static void add_os_pid_to_port_id_mapping(Eterm port_id, pid_t os_pid)
+{
+ if (port_id != THE_NON_VALUE) {
+ /* exit status report requested */
+ ErtsSysExitStatus es;
+ es.os_pid = os_pid;
+ es.port_id = port_id;
+ hash_put(forker_hash, &es);
+ }
+}
+
+static Eterm get_port_id(pid_t os_pid)
+{
+ ErtsSysExitStatus est, *es;
+ Eterm port_id;
+ est.os_pid = os_pid;
+ es = hash_remove(forker_hash, &est);
+ if (!es) return THE_NON_VALUE;
+ port_id = es->port_id;
+ free(es);
+ return port_id;
+}
+
+static int fcmp(void *a, void *b)
+{
+ ErtsSysExitStatus *sa = a;
+ ErtsSysExitStatus *sb = b;
+ return !(sa->os_pid == sb->os_pid);
+}
+
+static HashValue fhash(void *e)
+{
+ ErtsSysExitStatus *se = e;
+ Uint32 val = se->os_pid;
+ val = (val+0x7ed55d16) + (val<<12);
+ val = (val^0xc761c23c) ^ (val>>19);
+ val = (val+0x165667b1) + (val<<5);
+ val = (val+0xd3a2646c) ^ (val<<9);
+ val = (val+0xfd7046c5) + (val<<3);
+ val = (val^0xb55a4f09) ^ (val>>16);
+ return val;
+}
+
+static void *falloc(void *e)
+{
+ ErtsSysExitStatus *se = e;
+ ErtsSysExitStatus *ne = malloc(sizeof(ErtsSysExitStatus));
+ ne->os_pid = se->os_pid;
+ ne->port_id = se->port_id;
+ return ne;
+}
+
+static void *meta_alloc(int type, size_t size) { return malloc(size); }
+static void meta_free(int type, void *p) { free(p); }
+
+static int forker_hash_init(void)
+{
+ HashFunctions forker_hash_functions;
+ forker_hash_functions.hash = fhash;
+ forker_hash_functions.cmp = fcmp;
+ forker_hash_functions.alloc = falloc;
+ forker_hash_functions.free = free;
+ forker_hash_functions.meta_alloc = meta_alloc;
+ forker_hash_functions.meta_free = meta_free;
+ forker_hash_functions.meta_print = NULL;
+
+ forker_hash = hash_new(0, "forker_hash",
+ 16, forker_hash_functions);
+
+ return 1;
+}
diff --git a/erts/emulator/sys/unix/erl_child_setup.h b/erts/emulator/sys/unix/erl_child_setup.h
new file mode 100644
index 0000000000..a28b136bfc
--- /dev/null
+++ b/erts/emulator/sys/unix/erl_child_setup.h
@@ -0,0 +1,77 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2015-2015. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * %CopyrightEnd%
+ *
+ * This file defines the interface inbetween erts and child_setup.
+ */
+
+#ifndef _ERL_UNIX_FORKER_H
+#define _ERL_UNIX_FORKER_H
+
+#include "sys.h"
+
+#ifdef __FreeBSD__
+/* The freebsd sendmsg man page explicitly states that
+ you should not close fds before they are known
+ to have reached the other side, so this Ack protects
+ against that. */
+#define FORKER_PROTO_START_ACK 1
+#endif
+
+#define FORKER_ARGV_NO_OF_ARGS 3
+#define FORKER_ARGV_PROGNAME_IX 0 /* Program name */
+#define FORKER_ARGV_MAX_FILES 1 /* max_files */
+
+#define FORKER_FLAG_USE_STDIO (1 << 0) /* dup the pipe to stdin/stderr */
+#define FORKER_FLAG_EXIT_STATUS (1 << 1) /* send the exit status to parent */
+#define FORKER_FLAG_DO_READ (1 << 2) /* dup write fd */
+#define FORKER_FLAG_DO_WRITE (1 << 3) /* dup read fd */
+
+#if SIZEOF_VOID_P == SIZEOF_LONG
+typedef unsigned long ErtsSysPortId;
+#elif SIZEOF_VOID_P == SIZEOF_INT
+typedef unsigned int ErtsSysPortId;
+#elif SIZEOF_VOID_P == SIZEOF_LONG_LONG
+typedef unsigned long long ErtsSysPortId;
+#endif
+
+typedef struct ErtsSysForkerProto_ {
+ enum {
+ ErtsSysForkerProtoAction_Start,
+ ErtsSysForkerProtoAction_StartAck,
+ ErtsSysForkerProtoAction_Go,
+ ErtsSysForkerProtoAction_SigChld,
+ ErtsSysForkerProtoAction_Ack
+ } action;
+ union {
+ struct {
+ ErtsSysPortId port_id;
+ int fds[3];
+ } start;
+ struct {
+ pid_t os_pid;
+ int error_number;
+ } go;
+ struct {
+ ErtsSysPortId port_id;
+ int error_number;
+ } sigchld;
+ } u;
+} ErtsSysForkerProto;
+
+#endif /* #ifndef _ERL_UNIX_FORKER_H */
diff --git a/erts/emulator/sys/unix/erl_unix_sys.h b/erts/emulator/sys/unix/erl_unix_sys.h
index 8d4e98bf3a..8b1822ca9f 100644
--- a/erts/emulator/sys/unix/erl_unix_sys.h
+++ b/erts/emulator/sys/unix/erl_unix_sys.h
@@ -30,9 +30,7 @@
#include <limits.h>
#include <stdlib.h>
#include <string.h>
-#ifndef QNX
#include <memory.h>
-#endif
#if defined(__sun__) && defined(__SVR4) && !defined(__EXTENSIONS__)
# define __EXTENSIONS__
@@ -92,11 +90,6 @@
#include <ieeefp.h>
#endif
-#ifdef QNX
-#include <process.h>
-#include <sys/qnx_glob.h>
-#endif
-
#include <pwd.h>
#ifndef HZ
@@ -108,6 +101,10 @@
#endif
#include <netdb.h>
+#ifdef HAVE_MACH_ABSOLUTE_TIME
+#include <mach/mach_time.h>
+#endif
+
#ifdef HAVE_POSIX_MEMALIGN
# define ERTS_HAVE_ERTS_SYS_ALIGNED_ALLOC 1
#endif
@@ -136,13 +133,6 @@
# define ERTS_POLL_NEED_ASYNC_INTERRUPT_SUPPORT
#endif
-#ifndef ENABLE_CHILD_WAITER_THREAD
-# ifdef ERTS_SMP
-# define ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN
-void erts_check_children(void);
-# endif
-#endif
-
typedef void *GETENV_STATE;
/*
@@ -171,6 +161,7 @@ typedef long long ErtsSysHrTime;
#endif
typedef ErtsMonotonicTime ErtsSystemTime;
+typedef ErtsSysHrTime ErtsSysPerfCounter;
#define ERTS_MONOTONIC_TIME_MIN (((ErtsMonotonicTime) 1) << 63)
#define ERTS_MONOTONIC_TIME_MAX (~ERTS_MONOTONIC_TIME_MIN)
@@ -219,6 +210,7 @@ ErtsSystemTime erts_os_system_time(void);
* It may or may not be monotonic.
*/
ErtsSysHrTime erts_sys_hrtime(void);
+#define ERTS_HRTIME_UNIT (1000*1000*1000)
struct erts_sys_time_read_only_data__ {
#ifdef ERTS_OS_MONOTONIC_INLINE_FUNC_PTR_CALL__
@@ -227,6 +219,8 @@ struct erts_sys_time_read_only_data__ {
#ifdef ERTS_OS_TIMES_INLINE_FUNC_PTR_CALL__
void (*os_times)(ErtsMonotonicTime *, ErtsSystemTime *);
#endif
+ ErtsSysPerfCounter (*perf_counter)(void);
+ ErtsSysPerfCounter perf_counter_unit;
int ticks_per_sec;
};
@@ -280,7 +274,24 @@ erts_os_times(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep)
#endif /* ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT */
/*
- *
+ * Functions for getting the performance counter
+ */
+
+ERTS_GLB_INLINE ErtsSysPerfCounter erts_sys_perf_counter(void);
+#define erts_sys_perf_counter_unit() erts_sys_time_data__.r.o.perf_counter_unit
+
+#if ERTS_GLB_INLINE_INCL_FUNC_DEF
+
+ERTS_GLB_INLINE ErtsSysPerfCounter
+erts_sys_perf_counter()
+{
+ return (*erts_sys_time_data__.r.o.perf_counter)();
+}
+
+#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */
+
+/*
+ * Functions for measuring CPU time
*/
#if (defined(HAVE_GETHRVTIME) || defined(HAVE_CLOCK_GETTIME_CPU_TIME))
@@ -310,7 +321,6 @@ typedef void (*SIGFUNC)(int);
extern SIGFUNC sys_signal(int, SIGFUNC);
extern void sys_sigrelease(int);
extern void sys_sigblock(int);
-extern void sys_stop_cat(void);
/*
* Handling of floating point exceptions.
@@ -425,19 +435,6 @@ void erts_sys_unblock_fpe(int);
#define ERTS_FP_ERROR_THOROUGH(p, f, A) __ERTS_FP_ERROR_THOROUGH(&(p)->fp_exception, f, A)
-#ifdef NEED_CHILD_SETUP_DEFINES
-/* The child setup argv[] */
-#define CS_ARGV_PROGNAME_IX 0 /* Program name */
-#define CS_ARGV_UNBIND_IX 1 /* Unbind from cpu */
-#define CS_ARGV_WD_IX 2 /* Working directory */
-#define CS_ARGV_CMD_IX 3 /* Command */
-#define CS_ARGV_FD_CR_IX 4 /* Fd close range */
-#define CS_ARGV_DUP2_OP_IX(N) ((N) + 5) /* dup2 operations */
-
-#define CS_ARGV_NO_OF_DUP2_OPS 3 /* Number of dup2 ops */
-#define CS_ARGV_NO_OF_ARGS 8 /* Number of arguments */
-#endif /* #ifdef NEED_CHILD_SETUP_DEFINES */
-
/* Threads */
#ifdef USE_THREADS
extern int init_async(int);
diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c
index d94b37430e..0a18e54389 100644
--- a/erts/emulator/sys/unix/sys.c
+++ b/erts/emulator/sys/unix/sys.c
@@ -49,7 +49,6 @@
#include <sys/ioctl.h>
#endif
-#define NEED_CHILD_SETUP_DEFINES
#define ERTS_WANT_BREAK_HANDLING
#define ERTS_WANT_GOT_SIGUSR1
#define WANT_NONBLOCKING /* must define this to pull in defs from sys.h */
@@ -67,7 +66,7 @@
#include "erl_mseg.h"
extern char **environ;
-static erts_smp_rwmtx_t environ_rwmtx;
+erts_smp_rwmtx_t environ_rwmtx;
#define MAX_VSIZE 16 /* Max number of entries allowed in an I/O
* vector sock_sendv().
@@ -76,89 +75,12 @@ static erts_smp_rwmtx_t environ_rwmtx;
* Don't need global.h, but bif_table.h (included by bif.h),
* won't compile otherwise
*/
-#include "global.h"
+#include "global.h"
#include "bif.h"
-#include "erl_sys_driver.h"
#include "erl_check_io.h"
#include "erl_cpu_topology.h"
-#ifndef DISABLE_VFORK
-#define DISABLE_VFORK 0
-#endif
-
-#if defined IOV_MAX
-#define MAXIOV IOV_MAX
-#elif defined UIO_MAXIOV
-#define MAXIOV UIO_MAXIOV
-#else
-#define MAXIOV 16
-#endif
-
-#ifdef USE_THREADS
-# ifdef ENABLE_CHILD_WAITER_THREAD
-# define CHLDWTHR ENABLE_CHILD_WAITER_THREAD
-# else
-# define CHLDWTHR 0
-# endif
-# define FDBLOCK 1
-#else
-# define CHLDWTHR 0
-# define FDBLOCK 0
-#endif
-/*
- * [OTP-3906]
- * Solaris signal management gets confused when threads are used and a
- * lot of child processes dies. The confusion results in that SIGCHLD
- * signals aren't delivered to the emulator which in turn results in
- * a lot of defunct processes in the system.
- *
- * The problem seems to appear when a signal is frequently
- * blocked/unblocked at the same time as the signal is frequently
- * propagated. The child waiter thread is a workaround for this problem.
- * The SIGCHLD signal is always blocked (in all threads), and the child
- * waiter thread fetches the signal by a call to sigwait(). See
- * child_waiter().
- */
-
-typedef struct ErtsSysReportExit_ ErtsSysReportExit;
-struct ErtsSysReportExit_ {
- ErtsSysReportExit *next;
- Eterm port;
- int pid;
- int ifd;
- int ofd;
-#if CHLDWTHR && !defined(ERTS_SMP)
- int status;
-#endif
-};
-
-/* Used by the fd driver iff the fd could not be set to non-blocking */
-typedef struct ErtsSysBlocking_ {
- ErlDrvPDL pdl;
- int res;
- int err;
- unsigned int pkey;
-} ErtsSysBlocking;
-
-
-/* This data is shared by these drivers - initialized by spawn_init() */
-static struct driver_data {
- ErlDrvPort port_num;
- int ofd, packet_bytes;
- ErtsSysReportExit *report_exit;
- int pid;
- int alive;
- int status;
- int terminating;
- ErtsSysBlocking *blocking;
-} *driver_data; /* indexed by fd */
-
-static ErtsSysReportExit *report_exit_list;
-#if CHLDWTHR && !defined(ERTS_SMP)
-static ErtsSysReportExit *report_exit_transit_list;
-#endif
-
extern int driver_interrupt(int, int);
extern void do_break(void);
@@ -170,33 +92,6 @@ extern void erts_sys_init_float(void);
extern void erl_crash_dump(char* file, int line, char* fmt, ...);
-#define DIR_SEPARATOR_CHAR '/'
-
-#if defined(__ANDROID__)
-#define SHELL "/system/bin/sh"
-#else
-#define SHELL "/bin/sh"
-#endif /* __ANDROID__ */
-
-
-#if defined(DEBUG)
-#define ERL_BUILD_TYPE_MARKER ".debug"
-#elif defined(PURIFY)
-#define ERL_BUILD_TYPE_MARKER ".purify"
-#elif defined(QUANTIFY)
-#define ERL_BUILD_TYPE_MARKER ".quantify"
-#elif defined(PURECOV)
-#define ERL_BUILD_TYPE_MARKER ".purecov"
-#elif defined(VALGRIND)
-#define ERL_BUILD_TYPE_MARKER ".valgrind"
-#else /* opt */
-#define ERL_BUILD_TYPE_MARKER
-#endif
-
-#define CHILD_SETUP_PROG_NAME "child_setup" ERL_BUILD_TYPE_MARKER
-#if !DISABLE_VFORK
-static char *child_setup_prog;
-#endif
#ifdef DEBUG
static int debug_log = 0;
@@ -220,7 +115,7 @@ static volatile int have_prepared_crash_dump;
(have_prepared_crash_dump++)
#endif
-static erts_smp_atomic_t sys_misc_mem_sz;
+erts_smp_atomic_t sys_misc_mem_sz;
#if defined(ERTS_SMP)
static void smp_sig_notify(char c);
@@ -233,46 +128,6 @@ static int sig_suspend_fds[2] = {-1, -1};
jmp_buf erts_sys_sigsegv_jmp;
-#if CHLDWTHR || defined(ERTS_SMP)
-erts_mtx_t chld_stat_mtx;
-#endif
-#if CHLDWTHR
-static erts_tid_t child_waiter_tid;
-/* chld_stat_mtx is used to protect against concurrent accesses
- of the driver_data fields pid, alive, and status. */
-erts_cnd_t chld_stat_cnd;
-static long children_alive;
-#define CHLD_STAT_LOCK erts_mtx_lock(&chld_stat_mtx)
-#define CHLD_STAT_UNLOCK erts_mtx_unlock(&chld_stat_mtx)
-#define CHLD_STAT_WAIT erts_cnd_wait(&chld_stat_cnd, &chld_stat_mtx)
-#define CHLD_STAT_SIGNAL erts_cnd_signal(&chld_stat_cnd)
-#elif defined(ERTS_SMP) /* ------------------------------------------------- */
-#define CHLD_STAT_LOCK erts_mtx_lock(&chld_stat_mtx)
-#define CHLD_STAT_UNLOCK erts_mtx_unlock(&chld_stat_mtx)
-
-#else /* ------------------------------------------------------------------- */
-#define CHLD_STAT_LOCK
-#define CHLD_STAT_UNLOCK
-static volatile int children_died;
-#endif
-
-
-static struct fd_data {
- char pbuf[4]; /* hold partial packet bytes */
- int psz; /* size of pbuf */
- char *buf;
- char *cpos;
- int sz;
- int remain; /* for input on fd */
-} *fd_data; /* indexed by fd */
-
-/* static FUNCTION(int, write_fill, (int, char*, int)); unused? */
-static void note_child_death(int, int);
-
-#if CHLDWTHR
-static void* child_waiter(void *);
-#endif
-
static int crashdump_companion_cube_fd = -1;
/********************* General functions ****************************/
@@ -453,9 +308,10 @@ MALLOC_USE_HASH(1);
#ifdef USE_THREADS
#ifdef ERTS_THR_HAVE_SIG_FUNCS
+
/*
* Child thread inherits parents signal mask at creation. In order to
- * guarantee that the main thread will receive all SIGINT, SIGCHLD, and
+ * guarantee that the main thread will receive all SIGINT, and
* SIGUSR1 signals sent to the process, we block these signals in the
* parent thread when creating a new thread.
*/
@@ -551,14 +407,11 @@ erts_sys_pre_init(void)
#ifdef ERTS_THR_HAVE_SIG_FUNCS
sigemptyset(&thr_create_sigmask);
sigaddset(&thr_create_sigmask, SIGINT); /* block interrupt */
- sigaddset(&thr_create_sigmask, SIGCHLD); /* block child signals */
sigaddset(&thr_create_sigmask, SIGUSR1); /* block user defined signal */
#endif
erts_thr_init(&eid);
- report_exit_list = NULL;
-
#ifdef ERTS_ENABLE_LOCK_COUNT
erts_lcnt_init();
#endif
@@ -569,17 +422,6 @@ erts_sys_pre_init(void)
#ifdef USE_THREADS
-#if CHLDWTHR || defined(ERTS_SMP)
- erts_mtx_init(&chld_stat_mtx, "child_status");
-#endif
-#if CHLDWTHR
-#ifndef ERTS_SMP
- report_exit_transit_list = NULL;
-#endif
- erts_cnd_init(&chld_stat_cnd);
- children_alive = 0;
-#endif
-
#ifdef ERTS_SMP
erts_smp_atomic32_init_nob(&erts_break_requested, 0);
erts_smp_atomic32_init_nob(&erts_got_sigusr1, 0);
@@ -589,9 +431,6 @@ erts_sys_pre_init(void)
erts_got_sigusr1 = 0;
have_prepared_crash_dump = 0;
#endif
-#if !CHLDWTHR && !defined(ERTS_SMP)
- children_died = 0;
-#endif
#endif /* USE_THREADS */
@@ -628,39 +467,6 @@ erts_sys_pre_init(void)
void
erl_sys_init(void)
{
-#if !DISABLE_VFORK
- {
- int res;
- char bindir[MAXPATHLEN];
- size_t bindirsz = sizeof(bindir);
- Uint csp_path_sz;
-
- res = erts_sys_getenv_raw("BINDIR", bindir, &bindirsz);
- if (res != 0) {
- if (res < 0)
- erts_exit(1,
- "Environment variable BINDIR is not set\n");
- if (res > 0)
- erts_exit(1,
- "Value of environment variable BINDIR is too large\n");
- }
- if (bindir[0] != DIR_SEPARATOR_CHAR)
- erts_exit(1,
- "Environment variable BINDIR does not contain an"
- " absolute path\n");
- csp_path_sz = (strlen(bindir)
- + 1 /* DIR_SEPARATOR_CHAR */
- + sizeof(CHILD_SETUP_PROG_NAME)
- + 1);
- child_setup_prog = erts_alloc(ERTS_ALC_T_CS_PROG_PATH, csp_path_sz);
- erts_smp_atomic_add_nob(&sys_misc_mem_sz, csp_path_sz);
- erts_snprintf(child_setup_prog, csp_path_sz,
- "%s%c%s",
- bindir,
- DIR_SEPARATOR_CHAR,
- CHILD_SETUP_PROG_NAME);
- }
-#endif
#ifdef USE_SETLINEBUF
setlinebuf(stdout);
@@ -978,43 +784,6 @@ int sys_max_files(void)
return(max_files);
}
-static void block_signals(void)
-{
-#if !CHLDWTHR
- sys_sigblock(SIGCHLD);
-#endif
-#ifndef ERTS_SMP
- sys_sigblock(SIGINT);
-#ifndef ETHR_UNUSABLE_SIGUSRX
- sys_sigblock(SIGUSR1);
-#endif /* #ifndef ETHR_UNUSABLE_SIGUSRX */
-#endif /* #ifndef ERTS_SMP */
-
-#if defined(ERTS_SMP) && !defined(ETHR_UNUSABLE_SIGUSRX)
- sys_sigblock(ERTS_SYS_SUSPEND_SIGNAL);
-#endif
-
-}
-
-static void unblock_signals(void)
-{
- /* Update erl_child_setup.c if changed */
-#if !CHLDWTHR
- sys_sigrelease(SIGCHLD);
-#endif
-#ifndef ERTS_SMP
- sys_sigrelease(SIGINT);
-#ifndef ETHR_UNUSABLE_SIGUSRX
- sys_sigrelease(SIGUSR1);
-#endif /* #ifndef ETHR_UNUSABLE_SIGUSRX */
-#endif /* #ifndef ERTS_SMP */
-
-#if defined(ERTS_SMP) && !defined(ETHR_UNUSABLE_SIGUSRX)
- sys_sigrelease(ERTS_SYS_SUSPEND_SIGNAL);
-#endif
-
-}
-
/************************** OS info *******************************/
/* Used by erlang:info/1. */
@@ -1102,1502 +871,6 @@ void fini_getenv_state(GETENV_STATE *state)
erts_smp_rwmtx_runlock(&environ_rwmtx);
}
-
-/************************** Port I/O *******************************/
-
-
-
-/* I. Common stuff */
-
-/*
- * Decreasing the size of it below 16384 is not allowed.
- */
-
-/* II. The spawn/fd/vanilla drivers */
-
-#define ERTS_SYS_READ_BUF_SZ (64*1024)
-
-/* Driver interfaces */
-static ErlDrvData spawn_start(ErlDrvPort, char*, SysDriverOpts*);
-static ErlDrvData fd_start(ErlDrvPort, char*, SysDriverOpts*);
-#if FDBLOCK
-static void fd_async(void *);
-static void fd_ready_async(ErlDrvData drv_data, ErlDrvThreadData thread_data);
-#endif
-static ErlDrvSSizeT fd_control(ErlDrvData, unsigned int, char *, ErlDrvSizeT,
- char **, ErlDrvSizeT);
-static ErlDrvData vanilla_start(ErlDrvPort, char*, SysDriverOpts*);
-static int spawn_init(void);
-static void fd_stop(ErlDrvData);
-static void fd_flush(ErlDrvData);
-static void stop(ErlDrvData);
-static void ready_input(ErlDrvData, ErlDrvEvent);
-static void ready_output(ErlDrvData, ErlDrvEvent);
-static void output(ErlDrvData, char*, ErlDrvSizeT);
-static void outputv(ErlDrvData, ErlIOVec*);
-static void stop_select(ErlDrvEvent, void*);
-
-struct erl_drv_entry spawn_driver_entry = {
- spawn_init,
- spawn_start,
- stop,
- output,
- ready_input,
- ready_output,
- "spawn",
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- ERL_DRV_EXTENDED_MARKER,
- ERL_DRV_EXTENDED_MAJOR_VERSION,
- ERL_DRV_EXTENDED_MINOR_VERSION,
- ERL_DRV_FLAG_USE_PORT_LOCKING,
- NULL, NULL,
- stop_select
-};
-struct erl_drv_entry fd_driver_entry = {
- NULL,
- fd_start,
- fd_stop,
- output,
- ready_input,
- ready_output,
- "fd",
- NULL,
- NULL,
- fd_control,
- NULL,
- outputv,
-#if FDBLOCK
- fd_ready_async, /* ready_async */
-#else
- NULL,
-#endif
- fd_flush, /* flush */
- NULL, /* call */
- NULL, /* event */
- ERL_DRV_EXTENDED_MARKER,
- ERL_DRV_EXTENDED_MAJOR_VERSION,
- ERL_DRV_EXTENDED_MINOR_VERSION,
- 0, /* ERL_DRV_FLAGs */
- NULL, /* handle2 */
- NULL, /* process_exit */
- stop_select
-};
-struct erl_drv_entry vanilla_driver_entry = {
- NULL,
- vanilla_start,
- stop,
- output,
- ready_input,
- ready_output,
- "vanilla",
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL, /* flush */
- NULL, /* call */
- NULL, /* event */
- ERL_DRV_EXTENDED_MARKER,
- ERL_DRV_EXTENDED_MAJOR_VERSION,
- ERL_DRV_EXTENDED_MINOR_VERSION,
- 0, /* ERL_DRV_FLAGs */
- NULL, /* handle2 */
- NULL, /* process_exit */
- stop_select
-};
-
-/* Handle SIGCHLD signals. */
-#if (defined(SIG_SIGSET) || defined(SIG_SIGNAL))
-static RETSIGTYPE onchld(void)
-#else
-static RETSIGTYPE onchld(int signum)
-#endif
-{
-#if CHLDWTHR
- ASSERT(0); /* We should *never* catch a SIGCHLD signal */
-#elif defined(ERTS_SMP)
- smp_sig_notify('C');
-#else
- children_died = 1;
- ERTS_CHK_IO_AS_INTR(); /* Make sure we don't sleep in poll */
-#endif
-}
-
-static int set_blocking_data(struct driver_data *dd) {
-
- dd->blocking = erts_alloc(ERTS_ALC_T_SYS_BLOCKING, sizeof(ErtsSysBlocking));
-
- erts_smp_atomic_add_nob(&sys_misc_mem_sz, sizeof(ErtsSysBlocking));
-
- dd->blocking->pdl = driver_pdl_create(dd->port_num);
- dd->blocking->res = 0;
- dd->blocking->err = 0;
- dd->blocking->pkey = driver_async_port_key(dd->port_num);
-
- return 1;
-}
-
-static int set_driver_data(ErlDrvPort port_num,
- int ifd,
- int ofd,
- int packet_bytes,
- int read_write,
- int exit_status,
- int pid,
- int is_blocking)
-{
- Port *prt;
- ErtsSysReportExit *report_exit;
-
- if (!exit_status)
- report_exit = NULL;
- else {
- report_exit = erts_alloc(ERTS_ALC_T_PRT_REP_EXIT,
- sizeof(ErtsSysReportExit));
- report_exit->next = report_exit_list;
- report_exit->port = erts_drvport2id(port_num);
- report_exit->pid = pid;
- report_exit->ifd = read_write & DO_READ ? ifd : -1;
- report_exit->ofd = read_write & DO_WRITE ? ofd : -1;
-#if CHLDWTHR && !defined(ERTS_SMP)
- report_exit->status = 0;
-#endif
- report_exit_list = report_exit;
- }
-
- prt = erts_drvport2port(port_num);
- if (prt != ERTS_INVALID_ERL_DRV_PORT)
- prt->os_pid = pid;
-
- if (read_write & DO_READ) {
- driver_data[ifd].packet_bytes = packet_bytes;
- driver_data[ifd].port_num = port_num;
- driver_data[ifd].report_exit = report_exit;
- driver_data[ifd].pid = pid;
- driver_data[ifd].alive = 1;
- driver_data[ifd].status = 0;
- driver_data[ifd].terminating = 0;
- driver_data[ifd].blocking = NULL;
- if (read_write & DO_WRITE) {
- driver_data[ifd].ofd = ofd;
- if (is_blocking && FDBLOCK)
- if (!set_blocking_data(driver_data+ifd))
- return -1;
- if (ifd != ofd)
- driver_data[ofd] = driver_data[ifd]; /* structure copy */
- } else { /* DO_READ only */
- driver_data[ifd].ofd = -1;
- }
- (void) driver_select(port_num, ifd, (ERL_DRV_READ|ERL_DRV_USE), 1);
- return(ifd);
- } else { /* DO_WRITE only */
- driver_data[ofd].packet_bytes = packet_bytes;
- driver_data[ofd].port_num = port_num;
- driver_data[ofd].report_exit = report_exit;
- driver_data[ofd].ofd = ofd;
- driver_data[ofd].pid = pid;
- driver_data[ofd].alive = 1;
- driver_data[ofd].status = 0;
- driver_data[ofd].terminating = 0;
- driver_data[ofd].blocking = NULL;
- if (is_blocking && FDBLOCK)
- if (!set_blocking_data(driver_data+ofd))
- return -1;
- return(ofd);
- }
-}
-
-static int spawn_init()
-{
- int i;
-#if CHLDWTHR
- erts_thr_opts_t thr_opts = ERTS_THR_OPTS_DEFAULT_INITER;
-
- thr_opts.detached = 0;
- thr_opts.suggested_stack_size = 0; /* Smallest possible */
- thr_opts.name = "child_waiter";
-#endif
-
- sys_signal(SIGPIPE, SIG_IGN); /* Ignore - we'll handle the write failure */
- driver_data = (struct driver_data *)
- erts_alloc(ERTS_ALC_T_DRV_TAB, max_files * sizeof(struct driver_data));
- erts_smp_atomic_add_nob(&sys_misc_mem_sz,
- max_files * sizeof(struct driver_data));
-
- for (i = 0; i < max_files; i++)
- driver_data[i].pid = -1;
-
-#if CHLDWTHR
- sys_sigblock(SIGCHLD);
-#endif
-
- sys_signal(SIGCHLD, onchld); /* Reap children */
-
-#if CHLDWTHR
- erts_thr_create(&child_waiter_tid, child_waiter, NULL, &thr_opts);
-#endif
-
- return 1;
-}
-
-static void close_pipes(int ifd[2], int ofd[2], int read_write)
-{
- if (read_write & DO_READ) {
- (void) close(ifd[0]);
- (void) close(ifd[1]);
- }
- if (read_write & DO_WRITE) {
- (void) close(ofd[0]);
- (void) close(ofd[1]);
- }
-}
-
-static void init_fd_data(int fd, ErlDrvPort port_num)
-{
- fd_data[fd].buf = NULL;
- fd_data[fd].cpos = NULL;
- fd_data[fd].remain = 0;
- fd_data[fd].sz = 0;
- fd_data[fd].psz = 0;
-}
-
-static char **build_unix_environment(char *block)
-{
- int i;
- int j;
- int len;
- char *cp;
- char **cpp;
- char** old_env;
-
- ERTS_SMP_LC_ASSERT(erts_smp_lc_rwmtx_is_rlocked(&environ_rwmtx));
-
- cp = block;
- len = 0;
- while (*cp != '\0') {
- cp += strlen(cp) + 1;
- len++;
- }
- old_env = environ;
- while (*old_env++ != NULL) {
- len++;
- }
-
- cpp = (char **) erts_alloc_fnf(ERTS_ALC_T_ENVIRONMENT,
- sizeof(char *) * (len+1));
- if (cpp == NULL) {
- return NULL;
- }
-
- cp = block;
- len = 0;
- while (*cp != '\0') {
- cpp[len] = cp;
- cp += strlen(cp) + 1;
- len++;
- }
-
- i = len;
- for (old_env = environ; *old_env; old_env++) {
- char* old = *old_env;
-
- for (j = 0; j < len; j++) {
- char *s, *t;
-
- s = cpp[j];
- t = old;
- while (*s == *t && *s != '=') {
- s++, t++;
- }
- if (*s == '=' && *t == '=') {
- break;
- }
- }
-
- if (j == len) { /* New version not found */
- cpp[len++] = old;
- }
- }
-
- for (j = 0; j < i; ) {
- size_t last = strlen(cpp[j])-1;
- if (cpp[j][last] == '=' && strchr(cpp[j], '=') == cpp[j]+last) {
- cpp[j] = cpp[--len];
- if (len < i) {
- i--;
- } else {
- j++;
- }
- }
- else {
- j++;
- }
- }
-
- cpp[len] = NULL;
- return cpp;
-}
-
-/*
- [arndt] In most Unix systems, including Solaris 2.5, 'fork' allocates memory
- in swap space for the child of a 'fork', whereas 'vfork' does not do this.
- The natural call to use here is therefore 'vfork'. Due to a bug in
- 'vfork' in Solaris 2.5 (apparently fixed in 2.6), using 'vfork'
- can be dangerous in what seems to be these circumstances:
- If the child code under a vfork sets the signal action to SIG_DFL
- (or SIG_IGN)
- for any signal which was previously set to a signal handler, the
- state of the parent is clobbered, so that the later arrival of
- such a signal yields a sigsegv in the parent. If the signal was
- not set to a signal handler, but ignored, all seems to work.
- If you change the forking code below, beware of this.
- */
-
-static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts)
-{
-#define CMD_LINE_PREFIX_STR "exec "
-#define CMD_LINE_PREFIX_STR_SZ (sizeof(CMD_LINE_PREFIX_STR) - 1)
-
- int ifd[2], ofd[2], len, pid, i;
- char **volatile new_environ; /* volatile since a vfork() then cannot
- cause 'new_environ' to be clobbered
- in the parent process. */
- int saved_errno;
- long res;
- char *cmd_line;
-#ifndef QNX
- int unbind;
-#endif
-#if !DISABLE_VFORK
- int no_vfork;
- size_t no_vfork_sz = sizeof(no_vfork);
-
- no_vfork = (erts_sys_getenv_raw("ERL_NO_VFORK",
- (char *) &no_vfork,
- &no_vfork_sz) >= 0);
-#endif
-
- switch (opts->read_write) {
- case DO_READ:
- if (pipe(ifd) < 0)
- return ERL_DRV_ERROR_ERRNO;
- if (ifd[0] >= max_files) {
- close_pipes(ifd, ofd, opts->read_write);
- errno = EMFILE;
- return ERL_DRV_ERROR_ERRNO;
- }
- ofd[1] = -1; /* keep purify happy */
- break;
- case DO_WRITE:
- if (pipe(ofd) < 0) return ERL_DRV_ERROR_ERRNO;
- if (ofd[1] >= max_files) {
- close_pipes(ifd, ofd, opts->read_write);
- errno = EMFILE;
- return ERL_DRV_ERROR_ERRNO;
- }
- ifd[0] = -1; /* keep purify happy */
- break;
- case DO_READ|DO_WRITE:
- if (pipe(ifd) < 0) return ERL_DRV_ERROR_ERRNO;
- errno = EMFILE; /* default for next two conditions */
- if (ifd[0] >= max_files || pipe(ofd) < 0) {
- close_pipes(ifd, ofd, DO_READ);
- return ERL_DRV_ERROR_ERRNO;
- }
- if (ofd[1] >= max_files) {
- close_pipes(ifd, ofd, opts->read_write);
- errno = EMFILE;
- return ERL_DRV_ERROR_ERRNO;
- }
- break;
- default:
- ASSERT(0);
- return ERL_DRV_ERROR_GENERAL;
- }
-
- if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) {
- /* started with spawn_executable, not with spawn */
- len = strlen(name);
- cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP, len + 1);
- if (!cmd_line) {
- close_pipes(ifd, ofd, opts->read_write);
- errno = ENOMEM;
- return ERL_DRV_ERROR_ERRNO;
- }
- memcpy((void *) cmd_line,(void *) name, len);
- cmd_line[len] = '\0';
- if (access(cmd_line,X_OK) != 0) {
- int save_errno = errno;
- erts_free(ERTS_ALC_T_TMP, cmd_line);
- errno = save_errno;
- return ERL_DRV_ERROR_ERRNO;
- }
- } else {
- /* make the string suitable for giving to "sh" */
- len = strlen(name);
- cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP,
- CMD_LINE_PREFIX_STR_SZ + len + 1);
- if (!cmd_line) {
- close_pipes(ifd, ofd, opts->read_write);
- errno = ENOMEM;
- return ERL_DRV_ERROR_ERRNO;
- }
- memcpy((void *) cmd_line,
- (void *) CMD_LINE_PREFIX_STR,
- CMD_LINE_PREFIX_STR_SZ);
- memcpy((void *) (cmd_line + CMD_LINE_PREFIX_STR_SZ), (void *) name, len);
- cmd_line[CMD_LINE_PREFIX_STR_SZ + len] = '\0';
- }
-
- erts_smp_rwmtx_rlock(&environ_rwmtx);
-
- if (opts->envir == NULL) {
- new_environ = environ;
- } else if ((new_environ = build_unix_environment(opts->envir)) == NULL) {
- erts_smp_rwmtx_runlock(&environ_rwmtx);
- erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
- errno = ENOMEM;
- return ERL_DRV_ERROR_ERRNO;
- }
-
-#ifndef QNX
- /* Block child from SIGINT and SIGUSR1. Must be before fork()
- to be safe. */
- block_signals();
-
- CHLD_STAT_LOCK;
-
- unbind = erts_sched_bind_atfork_prepare();
-
-#if !DISABLE_VFORK
- /* See fork/vfork discussion before this function. */
- if (no_vfork) {
-#endif
-
- DEBUGF(("Using fork\n"));
- pid = fork();
-
- if (pid == 0) {
- /* The child! Setup child... */
-
- if (erts_sched_bind_atfork_child(unbind) != 0)
- goto child_error;
-
- /* OBSERVE!
- * Keep child setup after vfork() (implemented below and in
- * erl_child_setup.c) up to date if changes are made here.
- */
-
- if (opts->use_stdio) {
- if (opts->read_write & DO_READ) {
- /* stdout for process */
- if (dup2(ifd[1], 1) < 0)
- goto child_error;
- if(opts->redir_stderr)
- /* stderr for process */
- if (dup2(ifd[1], 2) < 0)
- goto child_error;
- }
- if (opts->read_write & DO_WRITE)
- /* stdin for process */
- if (dup2(ofd[0], 0) < 0)
- goto child_error;
- }
- else { /* XXX will fail if ofd[0] == 4 (unlikely..) */
- if (opts->read_write & DO_READ)
- if (dup2(ifd[1], 4) < 0)
- goto child_error;
- if (opts->read_write & DO_WRITE)
- if (dup2(ofd[0], 3) < 0)
- goto child_error;
- }
-
-#if defined(HAVE_CLOSEFROM)
- closefrom(opts->use_stdio ? 3 : 5);
-#else
- for (i = opts->use_stdio ? 3 : 5; i < max_files; i++)
- (void) close(i);
-#endif
-
- if (opts->wd && chdir(opts->wd) < 0)
- goto child_error;
-
-#if defined(USE_SETPGRP_NOARGS) /* SysV */
- (void) setpgrp();
-#elif defined(USE_SETPGRP) /* BSD */
- (void) setpgrp(0, getpid());
-#else /* POSIX */
- (void) setsid();
-#endif
-
- unblock_signals();
-
- if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) {
- if (opts->argv == NULL) {
- execle(cmd_line,cmd_line,(char *) NULL, new_environ);
- } else {
- if (opts->argv[0] == erts_default_arg0) {
- opts->argv[0] = cmd_line;
- }
- execve(cmd_line, opts->argv, new_environ);
- if (opts->argv[0] == cmd_line) {
- opts->argv[0] = erts_default_arg0;
- }
- }
- } else {
- execle(SHELL, "sh", "-c", cmd_line, (char *) NULL, new_environ);
- }
- child_error:
- _exit(1);
- }
-#if !DISABLE_VFORK
- }
-#define ENOUGH_BYTES (44)
- else { /* Use vfork() */
- char **cs_argv= erts_alloc(ERTS_ALC_T_TMP,(CS_ARGV_NO_OF_ARGS + 1)*
- sizeof(char *));
- char fd_close_range[ENOUGH_BYTES]; /* 44 bytes are enough to */
- char dup2_op[CS_ARGV_NO_OF_DUP2_OPS][ENOUGH_BYTES]; /* hold any "%d:%d" string */
- /* on a 64-bit machine. */
-
- /* Setup argv[] for the child setup program (implemented in
- erl_child_setup.c) */
- i = 0;
- if (opts->use_stdio) {
- if (opts->read_write & DO_READ){
- /* stdout for process */
- erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 1);
- if(opts->redir_stderr)
- /* stderr for process */
- erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 2);
- }
- if (opts->read_write & DO_WRITE)
- /* stdin for process */
- erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ofd[0], 0);
- } else { /* XXX will fail if ofd[0] == 4 (unlikely..) */
- if (opts->read_write & DO_READ)
- erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 4);
- if (opts->read_write & DO_WRITE)
- erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ofd[0], 3);
- }
- for (; i < CS_ARGV_NO_OF_DUP2_OPS; i++)
- strcpy(&dup2_op[i][0], "-");
- erts_snprintf(fd_close_range, ENOUGH_BYTES, "%d:%d", opts->use_stdio ? 3 : 5, max_files-1);
-
- cs_argv[CS_ARGV_PROGNAME_IX] = child_setup_prog;
- cs_argv[CS_ARGV_WD_IX] = opts->wd ? opts->wd : ".";
- cs_argv[CS_ARGV_UNBIND_IX] = erts_sched_bind_atvfork_child(unbind);
- cs_argv[CS_ARGV_FD_CR_IX] = fd_close_range;
- for (i = 0; i < CS_ARGV_NO_OF_DUP2_OPS; i++)
- cs_argv[CS_ARGV_DUP2_OP_IX(i)] = &dup2_op[i][0];
-
- if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) {
- int num = 0;
- int j = 0;
- if (opts->argv != NULL) {
- for(; opts->argv[num] != NULL; ++num)
- ;
- }
- cs_argv = erts_realloc(ERTS_ALC_T_TMP,cs_argv, (CS_ARGV_NO_OF_ARGS + 1 + num + 1) * sizeof(char *));
- cs_argv[CS_ARGV_CMD_IX] = "-";
- cs_argv[CS_ARGV_NO_OF_ARGS] = cmd_line;
- if (opts->argv != NULL) {
- for (;opts->argv[j] != NULL; ++j) {
- if (opts->argv[j] == erts_default_arg0) {
- cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = cmd_line;
- } else {
- cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = opts->argv[j];
- }
- }
- }
- cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = NULL;
- } else {
- cs_argv[CS_ARGV_CMD_IX] = cmd_line; /* Command */
- cs_argv[CS_ARGV_NO_OF_ARGS] = NULL;
- }
- DEBUGF(("Using vfork\n"));
- pid = vfork();
-
- if (pid == 0) {
- /* The child! */
-
- /* Observe!
- * OTP-4389: The child setup program (implemented in
- * erl_child_setup.c) will perform the necessary setup of the
- * child before it execs to the user program. This because
- * vfork() only allow an *immediate* execve() or _exit() in the
- * child.
- */
- execve(child_setup_prog, cs_argv, new_environ);
- _exit(1);
- }
- erts_free(ERTS_ALC_T_TMP,cs_argv);
- }
-#undef ENOUGH_BYTES
-#endif
-
- erts_sched_bind_atfork_parent(unbind);
-
- if (pid == -1) {
- saved_errno = errno;
- CHLD_STAT_UNLOCK;
- erts_smp_rwmtx_runlock(&environ_rwmtx);
- erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
- unblock_signals();
- close_pipes(ifd, ofd, opts->read_write);
- errno = saved_errno;
- return ERL_DRV_ERROR_ERRNO;
- }
-#else /* QNX */
- if (opts->use_stdio) {
- if (opts->read_write & DO_READ)
- qnx_spawn_options.iov[1] = ifd[1]; /* stdout for process */
- if (opts->read_write & DO_WRITE)
- qnx_spawn_options.iov[0] = ofd[0]; /* stdin for process */
- }
- else {
- if (opts->read_write & DO_READ)
- qnx_spawn_options.iov[4] = ifd[1];
- if (opts->read_write & DO_WRITE)
- qnx_spawn_options.iov[3] = ofd[0];
- }
- /* Close fds on exec */
- for (i = 3; i < max_files; i++)
- fcntl(i, F_SETFD, 1);
-
- qnx_spawn_options.flags = _SPAWN_SETSID;
- if ((pid = spawnl(P_NOWAIT, SHELL, SHELL, "-c", cmd_line,
- (char *) 0)) < 0) {
- erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
- reset_qnx_spawn();
- erts_smp_rwmtx_runlock(&environ_rwmtx);
- close_pipes(ifd, ofd, opts->read_write);
- return ERL_DRV_ERROR_GENERAL;
- }
- reset_qnx_spawn();
-#endif /* QNX */
-
- erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
-
- if (new_environ != environ)
- erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ);
-
- if (opts->read_write & DO_READ)
- (void) close(ifd[1]);
- if (opts->read_write & DO_WRITE)
- (void) close(ofd[0]);
-
- if (opts->read_write & DO_READ) {
- SET_NONBLOCKING(ifd[0]);
- init_fd_data(ifd[0], port_num);
- }
- if (opts->read_write & DO_WRITE) {
- SET_NONBLOCKING(ofd[1]);
- init_fd_data(ofd[1], port_num);
- }
-
- res = set_driver_data(port_num, ifd[0], ofd[1], opts->packet_bytes,
- opts->read_write, opts->exit_status, pid, 0);
- /* Don't unblock SIGCHLD until now, since the call above must
- first complete putting away the info about our new subprocess. */
- unblock_signals();
-
-#if CHLDWTHR
- ASSERT(children_alive >= 0);
-
- if (!(children_alive++))
- CHLD_STAT_SIGNAL; /* Wake up child waiter thread if no children
- was alive before we fork()ed ... */
-#endif
- /* Don't unlock chld_stat_mtx until now of the same reason as above */
- CHLD_STAT_UNLOCK;
-
- erts_smp_rwmtx_runlock(&environ_rwmtx);
-
- return (ErlDrvData)res;
-#undef CMD_LINE_PREFIX_STR
-#undef CMD_LINE_PREFIX_STR_SZ
-}
-
-#ifdef QNX
-static reset_qnx_spawn()
-{
- int i;
-
- /* Reset qnx_spawn_options */
- qnx_spawn_options.flags = 0;
- qnx_spawn_options.iov[0] = 0xff;
- qnx_spawn_options.iov[1] = 0xff;
- qnx_spawn_options.iov[2] = 0xff;
- qnx_spawn_options.iov[3] = 0xff;
-}
-#endif
-
-#define FD_DEF_HEIGHT 24
-#define FD_DEF_WIDTH 80
-/* Control op */
-#define FD_CTRL_OP_GET_WINSIZE 100
-
-static int fd_get_window_size(int fd, Uint32 *width, Uint32 *height)
-{
-#ifdef TIOCGWINSZ
- struct winsize ws;
- if (ioctl(fd,TIOCGWINSZ,&ws) == 0) {
- *width = (Uint32) ws.ws_col;
- *height = (Uint32) ws.ws_row;
- return 0;
- }
-#endif
- return -1;
-}
-
-static ErlDrvSSizeT fd_control(ErlDrvData drv_data,
- unsigned int command,
- char *buf, ErlDrvSizeT len,
- char **rbuf, ErlDrvSizeT rlen)
-{
- int fd = (int)(long)drv_data;
- char resbuff[2*sizeof(Uint32)];
- switch (command) {
- case FD_CTRL_OP_GET_WINSIZE:
- {
- Uint32 w,h;
- if (fd_get_window_size(fd,&w,&h))
- return 0;
- memcpy(resbuff,&w,sizeof(Uint32));
- memcpy(resbuff+sizeof(Uint32),&h,sizeof(Uint32));
- }
- break;
- default:
- return 0;
- }
- if (rlen < 2*sizeof(Uint32)) {
- *rbuf = driver_alloc(2*sizeof(Uint32));
- }
- memcpy(*rbuf,resbuff,2*sizeof(Uint32));
- return 2*sizeof(Uint32);
-}
-
-static ErlDrvData fd_start(ErlDrvPort port_num, char* name,
- SysDriverOpts* opts)
-{
- ErlDrvData res;
- int non_blocking = 0;
-
- if (((opts->read_write & DO_READ) && opts->ifd >= max_files) ||
- ((opts->read_write & DO_WRITE) && opts->ofd >= max_files))
- return ERL_DRV_ERROR_GENERAL;
-
- /*
- * Historical:
- *
- * "Note about nonblocking I/O.
- *
- * At least on Solaris, setting the write end of a TTY to nonblocking,
- * will set the input end to nonblocking as well (and vice-versa).
- * If erl is run in a pipeline like this: cat | erl
- * the input end of the TTY will be the standard input of cat.
- * And cat is not prepared to handle nonblocking I/O."
- *
- * Actually, the reason for this is not that the tty itself gets set
- * in non-blocking mode, but that the "input end" (cat's stdin) and
- * the "output end" (erlang's stdout) are typically the "same" file
- * descriptor, dup()'ed from a single fd by one of this process'
- * ancestors.
- *
- * The workaround for this problem used to be a rather bad kludge,
- * interposing an extra process ("internal cat") between erlang's
- * stdout and the original stdout, allowing erlang to set its stdout
- * in non-blocking mode without affecting the stdin of the preceding
- * process in the pipeline - and being a kludge, it caused all kinds
- * of weird problems.
- *
- * So, this is the current logic:
- *
- * The only reason to set non-blocking mode on the output fd at all is
- * if it's something that can cause a write() to block, of course,
- * i.e. primarily if it points to a tty, socket, pipe, or fifo.
- *
- * If we don't set non-blocking mode when we "should" have, and output
- * becomes blocked, the entire runtime system will be suspended - this
- * is normally bad of course, and can happen fairly "easily" - e.g. user
- * hits ^S on tty - but doesn't necessarily happen.
- *
- * If we do set non-blocking mode when we "shouldn't" have, the runtime
- * system will end up seeing EOF on the input fd (due to the preceding
- * process dying), which typically will cause the entire runtime system
- * to terminate immediately (due to whatever erlang process is seeing
- * the EOF taking it as a signal to halt the system). This is *very* bad.
- *
- * I.e. we should take a conservative approach, and only set non-
- * blocking mode when we a) need to, and b) are reasonably certain
- * that it won't be a problem. And as in the example above, the problem
- * occurs when input fd and output fd point to different "things".
- *
- * However, determining that they are not just the same "type" of
- * "thing", but actually the same instance of that type of thing, is
- * unreasonably complex in many/most cases.
- *
- * Also, with pipes, sockets, and fifos it's far from obvious that the
- * user *wants* non-blocking output: If you're running erlang inside
- * some complex pipeline, you're probably not running a real-time system
- * that must never stop, but rather *want* it to suspend if the output
- * channel is "full".
- *
- * So, the bottom line: We will only set the output fd non-blocking if
- * it points to a tty, and either a) the input fd also points to a tty,
- * or b) we can make sure that setting the output fd non-blocking
- * doesn't interfere with someone else's input, via a somewhat milder
- * kludge than the above.
- *
- * Also keep in mind that while this code is almost exclusively run as
- * a result of an erlang open_port({fd,0,1}, ...), that isn't the only
- * case - it can be called with any old pre-existing file descriptors,
- * the relations between which (if they're even two) we can only guess
- * at - still, we try our best...
- *
- * Added note OTP 18: Some systems seem to use stdout/stderr to log data
- * using unix pipes, so we cannot allow the system to block on a write.
- * Therefore we use an async thread to write the data to fd's that could
- * not be set to non-blocking. When no async threads are available we
- * fall back on the old behaviour.
- *
- * Also the guarantee about what is delivered to the OS has changed.
- * Pre 18 the fd driver did no flushing of data before terminating.
- * Now it does. This is because we want to be able to guarantee that things
- * such as escripts and friends really have outputted all data before
- * terminating. This could potentially block the termination of the system
- * for a very long time, but if the user wants to terminate fast she should
- * use erlang:halt with flush=false.
- */
-
- if (opts->read_write & DO_READ) {
- init_fd_data(opts->ifd, port_num);
- }
- if (opts->read_write & DO_WRITE) {
- init_fd_data(opts->ofd, port_num);
-
- /* If we don't have a read end, all bets are off - no non-blocking. */
- if (opts->read_write & DO_READ) {
-
- if (isatty(opts->ofd)) { /* output fd is a tty:-) */
-
- if (isatty(opts->ifd)) { /* input fd is also a tty */
-
- /* To really do this "right", we should also check that
- input and output fd point to the *same* tty - but
- this seems like overkill; ttyname() isn't for free,
- and this is a very common case - and it's hard to
- imagine a scenario where setting non-blocking mode
- here would cause problems - go ahead and do it. */
-
- non_blocking = 1;
- SET_NONBLOCKING(opts->ofd);
-
- } else { /* output fd is a tty, input fd isn't */
-
- /* This is a "problem case", but also common (see the
- example above) - i.e. it makes sense to try a bit
- harder before giving up on non-blocking mode: Try to
- re-open the tty that the output fd points to, and if
- successful replace the original one with the "new" fd
- obtained this way, and set *that* one in non-blocking
- mode. (Yes, this is a kludge.)
-
- However, re-opening the tty may fail in a couple of
- (unusual) cases:
-
- 1) The name of the tty (or an equivalent one, i.e.
- same major/minor number) can't be found, because
- it actually lives somewhere other than /dev (or
- wherever ttyname() looks for it), and isn't
- equivalent to any of those that do live in the
- "standard" place - this should be *very* unusual.
-
- 2) Permissions on the tty don't allow us to open it -
- it's perfectly possible to have an fd open to an
- object whose permissions wouldn't allow us to open
- it. This is not as unusual as it sounds, one case
- is if the user has su'ed to someone else (not
- root) - we have a read/write fd open to the tty
- (because it has been inherited all the way down
- here), but we have neither read nor write
- permission for the tty.
-
- In these cases, we finally give up, and don't set the
- output fd in non-blocking mode. */
-
- char *tty;
- int nfd;
-
- if ((tty = ttyname(opts->ofd)) != NULL &&
- (nfd = open(tty, O_WRONLY)) != -1) {
- dup2(nfd, opts->ofd);
- close(nfd);
- non_blocking = 1;
- SET_NONBLOCKING(opts->ofd);
- }
- }
- }
- }
- }
- CHLD_STAT_LOCK;
- res = (ErlDrvData)(long)set_driver_data(port_num, opts->ifd, opts->ofd,
- opts->packet_bytes,
- opts->read_write, 0, -1,
- !non_blocking);
- CHLD_STAT_UNLOCK;
- return res;
-}
-
-static void clear_fd_data(int fd)
-{
- if (fd_data[fd].sz > 0) {
- erts_free(ERTS_ALC_T_FD_ENTRY_BUF, (void *) fd_data[fd].buf);
- ASSERT(erts_smp_atomic_read_nob(&sys_misc_mem_sz) >= fd_data[fd].sz);
- erts_smp_atomic_add_nob(&sys_misc_mem_sz, -1*fd_data[fd].sz);
- }
- fd_data[fd].buf = NULL;
- fd_data[fd].sz = 0;
- fd_data[fd].remain = 0;
- fd_data[fd].cpos = NULL;
- fd_data[fd].psz = 0;
-}
-
-static void nbio_stop_fd(ErlDrvPort prt, int fd)
-{
- driver_select(prt,fd,DO_READ|DO_WRITE,0);
- clear_fd_data(fd);
- SET_BLOCKING(fd);
-}
-
-static void fd_stop(ErlDrvData ev) /* Does not close the fds */
-{
- int ofd;
- int fd = (int)(long)ev;
- ErlDrvPort prt = driver_data[fd].port_num;
-
-#if FDBLOCK
- if (driver_data[fd].blocking) {
- erts_free(ERTS_ALC_T_SYS_BLOCKING,driver_data[fd].blocking);
- driver_data[fd].blocking = NULL;
- erts_smp_atomic_add_nob(&sys_misc_mem_sz, -1*sizeof(ErtsSysBlocking));
- }
-#endif
-
- nbio_stop_fd(prt, fd);
- ofd = driver_data[fd].ofd;
- if (ofd != fd && ofd != -1)
- nbio_stop_fd(prt, ofd);
-}
-
-static void fd_flush(ErlDrvData fd)
-{
- if (!driver_data[(int)(long)fd].terminating)
- driver_data[(int)(long)fd].terminating = 1;
-}
-
-static ErlDrvData vanilla_start(ErlDrvPort port_num, char* name,
- SysDriverOpts* opts)
-{
- int flags, fd;
- ErlDrvData res;
-
- flags = (opts->read_write == DO_READ ? O_RDONLY :
- opts->read_write == DO_WRITE ? O_WRONLY|O_CREAT|O_TRUNC :
- O_RDWR|O_CREAT);
- if ((fd = open(name, flags, 0666)) < 0)
- return ERL_DRV_ERROR_GENERAL;
- if (fd >= max_files) {
- close(fd);
- return ERL_DRV_ERROR_GENERAL;
- }
- SET_NONBLOCKING(fd);
- init_fd_data(fd, port_num);
-
- CHLD_STAT_LOCK;
- res = (ErlDrvData)(long)set_driver_data(port_num, fd, fd,
- opts->packet_bytes,
- opts->read_write, 0, -1, 0);
- CHLD_STAT_UNLOCK;
- return res;
-}
-
-/* Note that driver_data[fd].ifd == fd if the port was opened for reading, */
-/* otherwise (i.e. write only) driver_data[fd].ofd = fd. */
-
-static void stop(ErlDrvData fd)
-{
- ErlDrvPort prt;
- int ofd;
-
- prt = driver_data[(int)(long)fd].port_num;
- nbio_stop_fd(prt, (int)(long)fd);
-
- ofd = driver_data[(int)(long)fd].ofd;
- if (ofd != (int)(long)fd && (int)(long)ofd != -1)
- nbio_stop_fd(prt, ofd);
- else
- ofd = -1;
-
- CHLD_STAT_LOCK;
-
- /* Mark as unused. */
- driver_data[(int)(long)fd].pid = -1;
-
- CHLD_STAT_UNLOCK;
-
- /* SMP note: Close has to be last thing done (open file descriptors work
- as locks on driver_data[] entries) */
- driver_select(prt, (int)(long)fd, ERL_DRV_USE, 0); /* close(fd); */
- if (ofd >= 0) {
- driver_select(prt, (int)(long)ofd, ERL_DRV_USE, 0); /* close(ofd); */
- }
-}
-
-/* used by fd_driver */
-static void outputv(ErlDrvData e, ErlIOVec* ev)
-{
- int fd = (int)(long)e;
- ErlDrvPort ix = driver_data[fd].port_num;
- int pb = driver_data[fd].packet_bytes;
- int ofd = driver_data[fd].ofd;
- ssize_t n;
- ErlDrvSizeT sz;
- char lb[4];
- char* lbp;
- ErlDrvSizeT len = ev->size;
-
- /* (len > ((unsigned long)-1 >> (4-pb)*8)) */
- /* if (pb >= 0 && (len & (((ErlDrvSizeT)1 << (pb*8))) - 1) != len) {*/
- if (((pb == 2) && (len > 0xffff)) || (pb == 1 && len > 0xff)) {
- driver_failure_posix(ix, EINVAL);
- return; /* -1; */
- }
- /* Handles 0 <= pb <= 4 only */
- put_int32((Uint32) len, lb);
- lbp = lb + (4-pb);
-
- ev->iov[0].iov_base = lbp;
- ev->iov[0].iov_len = pb;
- ev->size += pb;
-
- if (driver_data[fd].blocking && FDBLOCK)
- driver_pdl_lock(driver_data[fd].blocking->pdl);
-
- if ((sz = driver_sizeq(ix)) > 0) {
- driver_enqv(ix, ev, 0);
-
- if (driver_data[fd].blocking && FDBLOCK)
- driver_pdl_unlock(driver_data[fd].blocking->pdl);
-
- if (sz + ev->size >= (1 << 13))
- set_busy_port(ix, 1);
- }
- else if (!driver_data[fd].blocking || !FDBLOCK) {
- /* We try to write directly if the fd in non-blocking */
- int vsize = ev->vsize > MAX_VSIZE ? MAX_VSIZE : ev->vsize;
-
- n = writev(ofd, (const void *) (ev->iov), vsize);
- if (n == ev->size)
- return; /* 0;*/
- if (n < 0) {
- if ((errno != EINTR) && (errno != ERRNO_BLOCK)) {
- driver_failure_posix(ix, errno);
- return; /* -1;*/
- }
- n = 0;
- }
- driver_enqv(ix, ev, n); /* n is the skip value */
- driver_select(ix, ofd, ERL_DRV_WRITE|ERL_DRV_USE, 1);
- }
-#if FDBLOCK
- else {
- if (ev->size != 0) {
- driver_enqv(ix, ev, 0);
- driver_pdl_unlock(driver_data[fd].blocking->pdl);
- driver_async(ix, &driver_data[fd].blocking->pkey,
- fd_async, driver_data+fd, NULL);
- } else {
- driver_pdl_unlock(driver_data[fd].blocking->pdl);
- }
- }
-#endif
- /* return 0;*/
-}
-
-/* Used by spawn_driver and vanilla driver */
-static void output(ErlDrvData e, char* buf, ErlDrvSizeT len)
-{
- int fd = (int)(long)e;
- ErlDrvPort ix = driver_data[fd].port_num;
- int pb = driver_data[fd].packet_bytes;
- int ofd = driver_data[fd].ofd;
- ssize_t n;
- ErlDrvSizeT sz;
- char lb[4];
- char* lbp;
- struct iovec iv[2];
-
- /* (len > ((unsigned long)-1 >> (4-pb)*8)) */
- if (((pb == 2) && (len > 0xffff)) || (pb == 1 && len > 0xff)) {
- driver_failure_posix(ix, EINVAL);
- return; /* -1; */
- }
- put_int32(len, lb);
- lbp = lb + (4-pb);
-
- if ((sz = driver_sizeq(ix)) > 0) {
- driver_enq(ix, lbp, pb);
- driver_enq(ix, buf, len);
- if (sz + len + pb >= (1 << 13))
- set_busy_port(ix, 1);
- }
- else {
- iv[0].iov_base = lbp;
- iv[0].iov_len = pb; /* should work for pb=0 */
- iv[1].iov_base = buf;
- iv[1].iov_len = len;
- n = writev(ofd, iv, 2);
- if (n == pb+len)
- return; /* 0; */
- if (n < 0) {
- if ((errno != EINTR) && (errno != ERRNO_BLOCK)) {
- driver_failure_posix(ix, errno);
- return; /* -1; */
- }
- n = 0;
- }
- if (n < pb) {
- driver_enq(ix, lbp+n, pb-n);
- driver_enq(ix, buf, len);
- }
- else {
- n -= pb;
- driver_enq(ix, buf+n, len-n);
- }
- driver_select(ix, ofd, ERL_DRV_WRITE|ERL_DRV_USE, 1);
- }
- return; /* 0; */
-}
-
-static int port_inp_failure(ErlDrvPort port_num, int ready_fd, int res)
- /* Result: 0 (eof) or -1 (error) */
-{
- int err = errno;
-
- ASSERT(res <= 0);
- (void) driver_select(port_num, ready_fd, ERL_DRV_READ|ERL_DRV_WRITE, 0);
- clear_fd_data(ready_fd);
-
- if (driver_data[ready_fd].blocking && FDBLOCK) {
- driver_pdl_lock(driver_data[ready_fd].blocking->pdl);
- if (driver_sizeq(driver_data[ready_fd].port_num) > 0) {
- driver_pdl_unlock(driver_data[ready_fd].blocking->pdl);
- /* We have stuff in the output queue, so we just
- set the state to terminating and wait for fd_async_ready
- to terminate the port */
- if (res == 0)
- driver_data[ready_fd].terminating = 2;
- else
- driver_data[ready_fd].terminating = -err;
- return 0;
- }
- driver_pdl_unlock(driver_data[ready_fd].blocking->pdl);
- }
-
- if (res == 0) {
- if (driver_data[ready_fd].report_exit) {
- CHLD_STAT_LOCK;
-
- if (driver_data[ready_fd].alive) {
- /*
- * We have eof and want to report exit status, but the process
- * hasn't exited yet. When it does report_exit_status() will
- * driver_select() this fd which will make sure that we get
- * back here with driver_data[ready_fd].alive == 0 and
- * driver_data[ready_fd].status set.
- */
- CHLD_STAT_UNLOCK;
- return 0;
- }
- else {
- int status = driver_data[ready_fd].status;
- CHLD_STAT_UNLOCK;
-
- /* We need not be prepared for stopped/continued processes. */
- if (WIFSIGNALED(status))
- status = 128 + WTERMSIG(status);
- else
- status = WEXITSTATUS(status);
-
- driver_report_exit(driver_data[ready_fd].port_num, status);
- }
- }
- driver_failure_eof(port_num);
- } else {
- driver_failure_posix(port_num, err);
- }
- return 0;
-}
-
-/* fd is the drv_data that is returned from the */
-/* initial start routine */
-/* ready_fd is the descriptor that is ready to read */
-
-static void ready_input(ErlDrvData e, ErlDrvEvent ready_fd)
-{
- int fd = (int)(long)e;
- ErlDrvPort port_num;
- int packet_bytes;
- int res;
- Uint h;
-
- port_num = driver_data[fd].port_num;
- packet_bytes = driver_data[fd].packet_bytes;
-
-
- if (packet_bytes == 0) {
- byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF,
- ERTS_SYS_READ_BUF_SZ);
- res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ);
- if (res < 0) {
- if ((errno != EINTR) && (errno != ERRNO_BLOCK))
- port_inp_failure(port_num, ready_fd, res);
- }
- else if (res == 0)
- port_inp_failure(port_num, ready_fd, res);
- else
- driver_output(port_num, (char*) read_buf, res);
- erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf);
- }
- else if (fd_data[ready_fd].remain > 0) { /* We try to read the remainder */
- /* space is allocated in buf */
- res = read(ready_fd, fd_data[ready_fd].cpos,
- fd_data[ready_fd].remain);
- if (res < 0) {
- if ((errno != EINTR) && (errno != ERRNO_BLOCK))
- port_inp_failure(port_num, ready_fd, res);
- }
- else if (res == 0) {
- port_inp_failure(port_num, ready_fd, res);
- }
- else if (res == fd_data[ready_fd].remain) { /* we're done */
- driver_output(port_num, fd_data[ready_fd].buf,
- fd_data[ready_fd].sz);
- clear_fd_data(ready_fd);
- }
- else { /* if (res < fd_data[ready_fd].remain) */
- fd_data[ready_fd].cpos += res;
- fd_data[ready_fd].remain -= res;
- }
- }
- else if (fd_data[ready_fd].remain == 0) { /* clean fd */
- byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF,
- ERTS_SYS_READ_BUF_SZ);
- /* We make one read attempt and see what happens */
- res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ);
- if (res < 0) {
- if ((errno != EINTR) && (errno != ERRNO_BLOCK))
- port_inp_failure(port_num, ready_fd, res);
- }
- else if (res == 0) { /* eof */
- port_inp_failure(port_num, ready_fd, res);
- }
- else if (res < packet_bytes - fd_data[ready_fd].psz) {
- memcpy(fd_data[ready_fd].pbuf+fd_data[ready_fd].psz,
- read_buf, res);
- fd_data[ready_fd].psz += res;
- }
- else { /* if (res >= packet_bytes) */
- unsigned char* cpos = read_buf;
- int bytes_left = res;
-
- while (1) {
- int psz = fd_data[ready_fd].psz;
- char* pbp = fd_data[ready_fd].pbuf + psz;
-
- while(bytes_left && (psz < packet_bytes)) {
- *pbp++ = *cpos++;
- bytes_left--;
- psz++;
- }
-
- if (psz < packet_bytes) {
- fd_data[ready_fd].psz = psz;
- break;
- }
- fd_data[ready_fd].psz = 0;
-
- switch (packet_bytes) {
- case 1: h = get_int8(fd_data[ready_fd].pbuf); break;
- case 2: h = get_int16(fd_data[ready_fd].pbuf); break;
- case 4: h = get_int32(fd_data[ready_fd].pbuf); break;
- default: ASSERT(0); return; /* -1; */
- }
-
- if (h <= (bytes_left)) {
- driver_output(port_num, (char*) cpos, h);
- cpos += h;
- bytes_left -= h;
- continue;
- }
- else { /* The last message we got was split */
- char *buf = erts_alloc_fnf(ERTS_ALC_T_FD_ENTRY_BUF, h);
- if (!buf) {
- errno = ENOMEM;
- port_inp_failure(port_num, ready_fd, -1);
- }
- else {
- erts_smp_atomic_add_nob(&sys_misc_mem_sz, h);
- sys_memcpy(buf, cpos, bytes_left);
- fd_data[ready_fd].buf = buf;
- fd_data[ready_fd].sz = h;
- fd_data[ready_fd].remain = h - bytes_left;
- fd_data[ready_fd].cpos = buf + bytes_left;
- }
- break;
- }
- }
- }
- erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf);
- }
-}
-
-
-/* fd is the drv_data that is returned from the */
-/* initial start routine */
-/* ready_fd is the descriptor that is ready to read */
-
-static void ready_output(ErlDrvData e, ErlDrvEvent ready_fd)
-{
- int fd = (int)(long)e;
- ErlDrvPort ix = driver_data[fd].port_num;
- int n;
- struct iovec* iv;
- int vsize;
-
-
- if ((iv = (struct iovec*) driver_peekq(ix, &vsize)) == NULL) {
- driver_select(ix, ready_fd, ERL_DRV_WRITE, 0);
- if (driver_data[fd].terminating)
- driver_failure_atom(driver_data[fd].port_num,"normal");
- return; /* 0; */
- }
- vsize = vsize > MAX_VSIZE ? MAX_VSIZE : vsize;
- if ((n = writev(ready_fd, iv, vsize)) > 0) {
- if (driver_deq(ix, n) == 0)
- set_busy_port(ix, 0);
- }
- else if (n < 0) {
- if (errno == ERRNO_BLOCK || errno == EINTR)
- return; /* 0; */
- else {
- int res = errno;
- driver_select(ix, ready_fd, ERL_DRV_WRITE, 0);
- driver_failure_posix(ix, res);
- return; /* -1; */
- }
- }
- return; /* 0; */
-}
-
-static void stop_select(ErlDrvEvent fd, void* _)
-{
- close((int)fd);
-}
-
-#if FDBLOCK
-
-static void
-fd_async(void *async_data)
-{
- int res;
- struct driver_data *dd = (struct driver_data*)async_data;
- SysIOVec *iov0;
- SysIOVec *iov;
- int iovlen;
- int err = 0;
- /* much of this code is stolen from efile_drv:invoke_writev */
- driver_pdl_lock(dd->blocking->pdl);
- iov0 = driver_peekq(dd->port_num, &iovlen);
- iovlen = iovlen < MAXIOV ? iovlen : MAXIOV;
- iov = erts_alloc_fnf(ERTS_ALC_T_SYS_WRITE_BUF,
- sizeof(SysIOVec)*iovlen);
- if (!iov) {
- res = -1;
- err = ENOMEM;
- driver_pdl_unlock(dd->blocking->pdl);
- } else {
- memcpy(iov,iov0,iovlen*sizeof(SysIOVec));
- driver_pdl_unlock(dd->blocking->pdl);
-
- do {
- res = writev(dd->ofd, iov, iovlen);
- } while (res < 0 && errno == EINTR);
- if (res < 0)
- err = errno;
-
- erts_free(ERTS_ALC_T_SYS_WRITE_BUF, iov);
- }
- dd->blocking->res = res;
- dd->blocking->err = err;
-}
-
-void fd_ready_async(ErlDrvData drv_data,
- ErlDrvThreadData thread_data) {
- struct driver_data *dd = (struct driver_data *)thread_data;
- ErlDrvPort port_num = dd->port_num;
-
- ASSERT(dd->blocking);
- ASSERT(dd == (driver_data + (int)(long)drv_data));
-
- if (dd->blocking->res > 0) {
- driver_pdl_lock(dd->blocking->pdl);
- if (driver_deq(port_num, dd->blocking->res) == 0) {
- driver_pdl_unlock(dd->blocking->pdl);
- set_busy_port(port_num, 0);
- if (dd->terminating) {
- /* The port is has been ordered to terminate
- from either fd_flush or port_inp_failure */
- if (dd->terminating == 1)
- driver_failure_atom(port_num, "normal");
- else if (dd->terminating == 2)
- driver_failure_eof(port_num);
- else if (dd->terminating < 0)
- driver_failure_posix(port_num, -dd->terminating);
- return; /* -1; */
- }
- } else {
- driver_pdl_unlock(dd->blocking->pdl);
- /* still data left to write in queue */
- driver_async(port_num, &dd->blocking->pkey, fd_async, dd, NULL);
- return /* 0; */;
- }
- } else if (dd->blocking->res < 0) {
- if (dd->blocking->err == ERRNO_BLOCK) {
- set_busy_port(port_num, 1);
- /* still data left to write in queue */
- driver_async(port_num, &dd->blocking->pkey, fd_async, dd, NULL);
- } else
- driver_failure_posix(port_num, dd->blocking->err);
- return; /* -1; */
- }
- return; /* 0; */
-}
-
-#endif
-
void erts_do_break_handling(void)
{
struct termios temp_mode;
@@ -2738,10 +1011,6 @@ erts_sys_unsetenv(char *key)
void
sys_init_io(void)
{
- fd_data = (struct fd_data *)
- erts_alloc(ERTS_ALC_T_FD_TAB, max_files * sizeof(struct fd_data));
- erts_smp_atomic_add_nob(&sys_misc_mem_sz,
- max_files * sizeof(struct fd_data));
}
#if (0) /* unused? */
@@ -2935,179 +1204,6 @@ erl_debug(char* fmt, ...)
#endif /* DEBUG */
-static ERTS_INLINE void
-report_exit_status(ErtsSysReportExit *rep, int status)
-{
- Port *pp;
-#ifdef ERTS_SMP
- CHLD_STAT_UNLOCK;
- pp = erts_thr_id2port_sflgs(rep->port,
- ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP);
- CHLD_STAT_LOCK;
-#else
- pp = erts_id2port_sflgs(rep->port,
- NULL,
- 0,
- ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP);
-#endif
- if (pp) {
- if (rep->ifd >= 0) {
- driver_data[rep->ifd].alive = 0;
- driver_data[rep->ifd].status = status;
- (void) driver_select(ERTS_Port2ErlDrvPort(pp),
- rep->ifd,
- (ERL_DRV_READ|ERL_DRV_USE),
- 1);
- }
- if (rep->ofd >= 0) {
- driver_data[rep->ofd].alive = 0;
- driver_data[rep->ofd].status = status;
- (void) driver_select(ERTS_Port2ErlDrvPort(pp),
- rep->ofd,
- (ERL_DRV_WRITE|ERL_DRV_USE),
- 1);
- }
-#ifdef ERTS_SMP
- erts_thr_port_release(pp);
-#else
- erts_port_release(pp);
-#endif
- }
- erts_free(ERTS_ALC_T_PRT_REP_EXIT, rep);
-}
-
-#if !CHLDWTHR /* ---------------------------------------------------------- */
-
-#define ERTS_REPORT_EXIT_STATUS report_exit_status
-
-static int check_children(void)
-{
- int res = 0;
- int pid;
- int status;
-
-#ifndef ERTS_SMP
- if (children_died)
-#endif
- {
- sys_sigblock(SIGCHLD);
- CHLD_STAT_LOCK;
- while ((pid = waitpid(-1, &status, WNOHANG)) > 0)
- note_child_death(pid, status);
-#ifndef ERTS_SMP
- children_died = 0;
-#endif
- CHLD_STAT_UNLOCK;
- sys_sigrelease(SIGCHLD);
- res = 1;
- }
- return res;
-}
-
-#ifdef ERTS_SMP
-
-void
-erts_check_children(void)
-{
- (void) check_children();
-}
-
-#endif
-
-#elif CHLDWTHR && defined(ERTS_SMP) /* ------------------------------------- */
-
-#define ERTS_REPORT_EXIT_STATUS report_exit_status
-
-#define check_children() (0)
-
-
-#else /* CHLDWTHR && !defined(ERTS_SMP) ------------------------------------ */
-
-#define ERTS_REPORT_EXIT_STATUS initiate_report_exit_status
-
-static ERTS_INLINE void
-initiate_report_exit_status(ErtsSysReportExit *rep, int status)
-{
- rep->next = report_exit_transit_list;
- rep->status = status;
- report_exit_transit_list = rep;
- erts_sys_schedule_interrupt(1);
-}
-
-static int check_children(void)
-{
- int res;
- ErtsSysReportExit *rep;
- CHLD_STAT_LOCK;
- rep = report_exit_transit_list;
- res = rep != NULL;
- while (rep) {
- ErtsSysReportExit *curr_rep = rep;
- rep = rep->next;
- report_exit_status(curr_rep, curr_rep->status);
- }
- report_exit_transit_list = NULL;
- CHLD_STAT_UNLOCK;
- return res;
-}
-
-#endif /* ------------------------------------------------------------------ */
-
-static void note_child_death(int pid, int status)
-{
- ErtsSysReportExit **repp = &report_exit_list;
- ErtsSysReportExit *rep = report_exit_list;
-
- while (rep) {
- if (pid == rep->pid) {
- *repp = rep->next;
- ERTS_REPORT_EXIT_STATUS(rep, status);
- break;
- }
- repp = &rep->next;
- rep = rep->next;
- }
-}
-
-#if CHLDWTHR
-
-static void *
-child_waiter(void *unused)
-{
- int pid;
- int status;
-
-#ifdef ERTS_ENABLE_LOCK_CHECK
- erts_lc_set_thread_name("child waiter");
-#endif
-
- while(1) {
-#ifdef DEBUG
- int waitpid_errno;
-#endif
- pid = waitpid(-1, &status, 0);
-#ifdef DEBUG
- waitpid_errno = errno;
-#endif
- CHLD_STAT_LOCK;
- if (pid < 0) {
- ASSERT(waitpid_errno == ECHILD);
- }
- else {
- children_alive--;
- ASSERT(children_alive >= 0);
- note_child_death(pid, status);
- }
- while (!children_alive)
- CHLD_STAT_WAIT; /* Wait for children to wait on... :) */
- CHLD_STAT_UNLOCK;
- }
-
- return NULL;
-}
-
-#endif
-
/*
* Called from schedule() when it runs out of runnable processes,
* or when Erlang code has performed INPUT_REDUCTIONS reduction
@@ -3116,13 +1212,8 @@ child_waiter(void *unused)
void
erl_sys_schedule(int runnable)
{
-#ifdef ERTS_SMP
ERTS_CHK_IO(!runnable);
-#else
- ERTS_CHK_IO(runnable ? 0 : !check_children());
-#endif
ERTS_SMP_LC_ASSERT(!erts_thr_progress_is_blocking());
- (void) check_children();
}
@@ -3150,10 +1241,6 @@ smp_sig_notify(char c)
static void *
signal_dispatcher_thread_func(void *unused)
{
-#if !CHLDWTHR
- int initialized = 0;
- int notify_check_children = 0;
-#endif
#ifdef ERTS_ENABLE_LOCK_CHECK
erts_lc_set_thread_name("signal_dispatcher");
#endif
@@ -3191,19 +1278,7 @@ signal_dispatcher_thread_func(void *unused)
*/
switch (buf[i]) {
case 0: /* Emulator initialized */
-#if !CHLDWTHR
- initialized = 1;
- if (!notify_check_children)
-#endif
- break;
-#if !CHLDWTHR
- case 'C': /* SIGCHLD */
- if (initialized)
- erts_smp_notify_check_children_needed();
- else
- notify_check_children = 1;
- break;
-#endif
+ break;
case 'I': /* SIGINT */
break_requested();
break;
diff --git a/erts/emulator/sys/unix/sys_drivers.c b/erts/emulator/sys/unix/sys_drivers.c
new file mode 100644
index 0000000000..82fea1b330
--- /dev/null
+++ b/erts/emulator/sys/unix/sys_drivers.c
@@ -0,0 +1,1862 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 1996-2014. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#ifdef ISC32
+#define _POSIX_SOURCE
+#define _XOPEN_SOURCE
+#endif
+
+#include <sys/times.h> /* ! */
+#include <time.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <termios.h>
+#include <ctype.h>
+#include <sys/utsname.h>
+#include <sys/select.h>
+#include <arpa/inet.h>
+
+#ifdef ISC32
+#include <sys/bsdtypes.h>
+#endif
+
+#include <termios.h>
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
+
+#define WANT_NONBLOCKING /* must define this to pull in defs from sys.h */
+#include "sys.h"
+
+#ifdef USE_THREADS
+#include "erl_threads.h"
+#endif
+
+extern char **environ;
+extern erts_smp_rwmtx_t environ_rwmtx;
+
+extern erts_smp_atomic_t sys_misc_mem_sz;
+
+static Eterm forker_port;
+
+#define MAX_VSIZE 16 /* Max number of entries allowed in an I/O
+ * vector sock_sendv().
+ */
+/*
+ * Don't need global.h, but erl_cpu_topology.h won't compile otherwise
+ */
+#include "global.h"
+#include "erl_cpu_topology.h"
+
+#include "erl_sys_driver.h"
+#include "sys_uds.h"
+
+#include "erl_child_setup.h"
+
+#if defined IOV_MAX
+#define MAXIOV IOV_MAX
+#elif defined UIO_MAXIOV
+#define MAXIOV UIO_MAXIOV
+#else
+#define MAXIOV 16
+#endif
+
+#ifdef USE_THREADS
+# define FDBLOCK 1
+#else
+# define FDBLOCK 0
+#endif
+
+/* Used by the fd driver iff the fd could not be set to non-blocking */
+typedef struct ErtsSysBlocking_ {
+ ErlDrvPDL pdl;
+ int res;
+ int err;
+ unsigned int pkey;
+} ErtsSysBlocking;
+
+typedef struct fd_data {
+ int fd;
+ char pbuf[4]; /* hold partial packet bytes */
+ int psz; /* size of pbuf */
+ char *buf;
+ char *cpos;
+ int sz;
+ int remain; /* for input on fd */
+} ErtsSysFdData;
+
+typedef struct driver_data {
+ ErlDrvPort port_num;
+ ErtsSysFdData *ofd;
+ ErtsSysFdData *ifd;
+ int packet_bytes;
+ int pid;
+ int alive;
+ int status;
+ int terminating;
+ ErtsSysBlocking *blocking;
+} ErtsSysDriverData;
+
+#define DIR_SEPARATOR_CHAR '/'
+
+#if defined(__ANDROID__)
+#define SHELL "/system/bin/sh"
+#else
+#define SHELL "/bin/sh"
+#endif /* __ANDROID__ */
+
+#if defined(DEBUG)
+#define ERL_BUILD_TYPE_MARKER ".debug"
+#elif defined(PURIFY)
+#define ERL_BUILD_TYPE_MARKER ".purify"
+#elif defined(QUANTIFY)
+#define ERL_BUILD_TYPE_MARKER ".quantify"
+#elif defined(PURECOV)
+#define ERL_BUILD_TYPE_MARKER ".purecov"
+#elif defined(VALGRIND)
+#define ERL_BUILD_TYPE_MARKER ".valgrind"
+#else /* opt */
+#define ERL_BUILD_TYPE_MARKER
+#endif
+
+#ifdef DEBUG
+#define close(fd) do { int res = close(fd); ASSERT(res > -1); } while(0)
+#endif
+
+#define CHILD_SETUP_PROG_NAME "erl_child_setup" ERL_BUILD_TYPE_MARKER
+
+// #define HARD_DEBUG
+#ifdef HARD_DEBUG
+#define driver_select(port_num, fd, flags, onoff) \
+ do { \
+ if (((flags) & ERL_DRV_READ) && onoff) \
+ fprintf(stderr,"%010d %p: read select %d\r\n", __LINE__, port_num, (int)fd); \
+ if (((flags) & ERL_DRV_WRITE) && onoff) \
+ fprintf(stderr,"%010d %p: writ select %d\r\n", __LINE__, port_num, (int)fd); \
+ if (((flags) & ERL_DRV_READ) && !onoff) \
+ fprintf(stderr,"%010d %p: read unsele %d\r\n", __LINE__, port_num, (int)fd); \
+ if (((flags) & ERL_DRV_WRITE) && !onoff) \
+ fprintf(stderr,"%010d %p: writ unsele %d\r\n", __LINE__, port_num, (int)fd); \
+ driver_select_nkp(port_num, fd, flags, onoff); \
+ } while(0)
+#endif
+
+/*
+ * Decreasing the size of it below 16384 is not allowed.
+ */
+
+#define ERTS_SYS_READ_BUF_SZ (64*1024)
+
+/* I. Initialization */
+
+void
+erl_sys_late_init(void)
+{
+ SysDriverOpts opts;
+#ifdef ERTS_SMP
+ Port *port;
+#endif
+
+ sys_signal(SIGPIPE, SIG_IGN); /* Ignore - we'll handle the write failure */
+
+ opts.packet_bytes = 0;
+ opts.use_stdio = 1;
+ opts.redir_stderr = 0;
+ opts.read_write = 0;
+ opts.hide_window = 0;
+ opts.wd = NULL;
+ opts.envir = NULL;
+ opts.exit_status = 0;
+ opts.overlapped_io = 0;
+ opts.spawn_type = ERTS_SPAWN_ANY;
+ opts.argv = NULL;
+ opts.parallelism = erts_port_parallelism;
+
+#ifdef ERTS_SMP
+ port =
+#endif
+ erts_open_driver(&forker_driver, make_internal_pid(0), "forker", &opts, NULL, NULL);
+#ifdef ERTS_SMP
+ erts_mtx_unlock(port->lock);
+#endif
+}
+
+/* II. Prototypes */
+
+/* II.I Spawn prototypes */
+static ErlDrvData spawn_start(ErlDrvPort, char*, SysDriverOpts*);
+static ErlDrvSSizeT spawn_control(ErlDrvData, unsigned int, char *,
+ ErlDrvSizeT, char **, ErlDrvSizeT);
+
+/* II.II Vanilla prototypes */
+static ErlDrvData vanilla_start(ErlDrvPort, char*, SysDriverOpts*);
+
+
+/* II.III FD prototypes */
+static ErlDrvData fd_start(ErlDrvPort, char*, SysDriverOpts*);
+#if FDBLOCK
+static void fd_async(void *);
+static void fd_ready_async(ErlDrvData drv_data, ErlDrvThreadData thread_data);
+#endif
+static ErlDrvSSizeT fd_control(ErlDrvData, unsigned int, char *, ErlDrvSizeT,
+ char **, ErlDrvSizeT);
+static void fd_stop(ErlDrvData);
+static void fd_flush(ErlDrvData);
+
+/* II.IV Common prototypes */
+static void stop(ErlDrvData);
+static void ready_input(ErlDrvData, ErlDrvEvent);
+static void ready_output(ErlDrvData, ErlDrvEvent);
+static void output(ErlDrvData, char*, ErlDrvSizeT);
+static void outputv(ErlDrvData, ErlIOVec*);
+static void stop_select(ErlDrvEvent, void*);
+
+/* II.V Forker prototypes */
+static ErlDrvData forker_start(ErlDrvPort, char*, SysDriverOpts*);
+static void forker_stop(ErlDrvData);
+static void forker_ready_input(ErlDrvData, ErlDrvEvent);
+static void forker_ready_output(ErlDrvData, ErlDrvEvent);
+static ErlDrvSSizeT forker_control(ErlDrvData, unsigned int, char *,
+ ErlDrvSizeT, char **, ErlDrvSizeT);
+
+/* III Driver entries */
+
+/* III.I The spawn driver */
+struct erl_drv_entry spawn_driver_entry = {
+ NULL,
+ spawn_start,
+ stop,
+ output,
+ ready_input,
+ ready_output,
+ "spawn",
+ NULL,
+ NULL,
+ spawn_control,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ ERL_DRV_EXTENDED_MARKER,
+ ERL_DRV_EXTENDED_MAJOR_VERSION,
+ ERL_DRV_EXTENDED_MINOR_VERSION,
+ ERL_DRV_FLAG_USE_PORT_LOCKING | ERL_DRV_FLAG_USE_INIT_ACK,
+ NULL, NULL,
+ stop_select
+};
+
+/* III.II The fd driver */
+struct erl_drv_entry fd_driver_entry = {
+ NULL,
+ fd_start,
+ fd_stop,
+ output,
+ ready_input,
+ ready_output,
+ "fd",
+ NULL,
+ NULL,
+ fd_control,
+ NULL,
+ outputv,
+#if FDBLOCK
+ fd_ready_async, /* ready_async */
+#else
+ NULL,
+#endif
+ fd_flush, /* flush */
+ NULL, /* call */
+ NULL, /* event */
+ ERL_DRV_EXTENDED_MARKER,
+ ERL_DRV_EXTENDED_MAJOR_VERSION,
+ ERL_DRV_EXTENDED_MINOR_VERSION,
+ 0, /* ERL_DRV_FLAGs */
+ NULL, /* handle2 */
+ NULL, /* process_exit */
+ stop_select
+};
+
+/* III.III The vanilla driver */
+struct erl_drv_entry vanilla_driver_entry = {
+ NULL,
+ vanilla_start,
+ stop,
+ output,
+ ready_input,
+ ready_output,
+ "vanilla",
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL, /* flush */
+ NULL, /* call */
+ NULL, /* event */
+ ERL_DRV_EXTENDED_MARKER,
+ ERL_DRV_EXTENDED_MAJOR_VERSION,
+ ERL_DRV_EXTENDED_MINOR_VERSION,
+ 0, /* ERL_DRV_FLAGs */
+ NULL, /* handle2 */
+ NULL, /* process_exit */
+ stop_select
+};
+
+/* III.III The forker driver */
+struct erl_drv_entry forker_driver_entry = {
+ NULL,
+ forker_start,
+ forker_stop,
+ NULL,
+ forker_ready_input,
+ forker_ready_output,
+ "spawn_forker",
+ NULL,
+ NULL,
+ forker_control,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ ERL_DRV_EXTENDED_MARKER,
+ ERL_DRV_EXTENDED_MAJOR_VERSION,
+ ERL_DRV_EXTENDED_MINOR_VERSION,
+ 0,
+ NULL, NULL,
+ stop_select
+};
+
+/* Untility functions */
+
+static int set_blocking_data(ErtsSysDriverData *dd) {
+
+ dd->blocking = erts_alloc(ERTS_ALC_T_SYS_BLOCKING, sizeof(ErtsSysBlocking));
+
+ erts_smp_atomic_add_nob(&sys_misc_mem_sz, sizeof(ErtsSysBlocking));
+
+ dd->blocking->pdl = driver_pdl_create(dd->port_num);
+ dd->blocking->res = 0;
+ dd->blocking->err = 0;
+ dd->blocking->pkey = driver_async_port_key(dd->port_num);
+
+ return 1;
+}
+
+static void init_fd_data(ErtsSysFdData *fd_data, int fd)
+{
+ fd_data->fd = fd;
+ fd_data->buf = NULL;
+ fd_data->cpos = NULL;
+ fd_data->remain = 0;
+ fd_data->sz = 0;
+ fd_data->psz = 0;
+}
+
+static ErtsSysDriverData *
+create_driver_data(ErlDrvPort port_num,
+ int ifd,
+ int ofd,
+ int packet_bytes,
+ int read_write,
+ int exit_status,
+ int pid,
+ int is_blocking)
+{
+ Port *prt;
+ ErtsSysDriverData *driver_data;
+ char *data;
+ int size = sizeof(ErtsSysDriverData);
+
+ if (read_write & DO_READ)
+ size += sizeof(ErtsSysFdData);
+
+ if ((read_write & DO_WRITE) &&
+ ((ifd != ofd || ofd == -1) || !(read_write & DO_READ)))
+ size += sizeof(ErtsSysFdData);
+
+ data = erts_alloc(ERTS_ALC_T_DRV_TAB,size);
+ erts_smp_atomic_add_nob(&sys_misc_mem_sz, size);
+
+ driver_data = (ErtsSysDriverData*)data;
+ data += sizeof(*driver_data);
+
+ prt = erts_drvport2port(port_num);
+ if (prt != ERTS_INVALID_ERL_DRV_PORT)
+ prt->os_pid = pid;
+
+ driver_data->packet_bytes = packet_bytes;
+ driver_data->port_num = port_num;
+ driver_data->pid = pid;
+ driver_data->alive = exit_status ? 1 : 0;
+ driver_data->status = 0;
+ driver_data->terminating = 0;
+ driver_data->blocking = NULL;
+
+ if (read_write & DO_READ) {
+ driver_data->ifd = (ErtsSysFdData*)data;
+ data += sizeof(*driver_data->ifd);
+ init_fd_data(driver_data->ifd, ifd);
+ driver_select(port_num, ifd, (ERL_DRV_READ|ERL_DRV_USE), 1);
+ } else {
+ driver_data->ifd = NULL;
+ }
+
+ if (read_write & DO_WRITE) {
+ if (ofd != -1 && ifd == ofd && read_write & DO_READ) {
+ /* This is for when ifd and ofd are the same fd */
+ driver_data->ofd = driver_data->ifd;
+ } else {
+ driver_data->ofd = (ErtsSysFdData*)data;
+ data += sizeof(*driver_data->ofd);
+ init_fd_data(driver_data->ofd, ofd);
+ }
+ if (is_blocking && FDBLOCK)
+ if (!set_blocking_data(driver_data)) {
+ erts_free(ERTS_ALC_T_DRV_TAB, driver_data);
+ return NULL;
+ }
+ } else {
+ driver_data->ofd = NULL;
+ }
+
+ return driver_data;
+}
+
+/* Spawn driver */
+
+static void close_pipes(int ifd[2], int ofd[2])
+{
+ close(ifd[0]);
+ close(ifd[1]);
+ close(ofd[0]);
+ close(ofd[1]);
+}
+
+static char **build_unix_environment(char *block)
+{
+ int i;
+ int j;
+ int len;
+ char *cp;
+ char **cpp;
+ char** old_env;
+
+ ERTS_SMP_LC_ASSERT(erts_smp_lc_rwmtx_is_rlocked(&environ_rwmtx));
+
+ cp = block;
+ len = 0;
+ while (*cp != '\0') {
+ cp += strlen(cp) + 1;
+ len++;
+ }
+ old_env = environ;
+ while (*old_env++ != NULL) {
+ len++;
+ }
+
+ cpp = (char **) erts_alloc_fnf(ERTS_ALC_T_ENVIRONMENT,
+ sizeof(char *) * (len+1));
+ if (cpp == NULL) {
+ return NULL;
+ }
+
+ cp = block;
+ len = 0;
+ while (*cp != '\0') {
+ cpp[len] = cp;
+ cp += strlen(cp) + 1;
+ len++;
+ }
+
+ i = len;
+ for (old_env = environ; *old_env; old_env++) {
+ char* old = *old_env;
+
+ for (j = 0; j < len; j++) {
+ char *s, *t;
+
+ /* check if cpp[j] equals old
+ before the = sign,
+ i.e.
+ "TMPDIR=/tmp/" */
+ s = cpp[j];
+ t = old;
+ while (*s == *t && *s != '=') {
+ s++, t++;
+ }
+ if (*s == '=' && *t == '=') {
+ break;
+ }
+ }
+
+ if (j == len) { /* New version not found */
+ cpp[len++] = old;
+ }
+ }
+
+ for (j = 0; j < i; ) {
+ size_t last = strlen(cpp[j])-1;
+ if (cpp[j][last] == '=' && strchr(cpp[j], '=') == cpp[j]+last) {
+ cpp[j] = cpp[--len];
+ if (len < i) {
+ i--;
+ } else {
+ j++;
+ }
+ }
+ else {
+ j++;
+ }
+ }
+
+ cpp[len] = NULL;
+ return cpp;
+}
+
+static ErlDrvData spawn_start(ErlDrvPort port_num, char* name,
+ SysDriverOpts* opts)
+{
+#define CMD_LINE_PREFIX_STR "exec "
+#define CMD_LINE_PREFIX_STR_SZ (sizeof(CMD_LINE_PREFIX_STR) - 1)
+
+ int len;
+ char **new_environ;
+ ErtsSysDriverData *dd;
+ char *cmd_line;
+ char wd_buff[MAXPATHLEN+1];
+ char *wd;
+ int ifd[2], ofd[2], stderrfd;
+
+ if (pipe(ifd) < 0) return ERL_DRV_ERROR_ERRNO;
+ errno = EMFILE; /* default for next three conditions */
+ if (ifd[0] >= sys_max_files() || pipe(ofd) < 0) {
+ close(ifd[0]);
+ close(ifd[1]);
+ return ERL_DRV_ERROR_ERRNO;
+ }
+ if (ofd[1] >= sys_max_files()) {
+ close_pipes(ifd, ofd);
+ errno = EMFILE;
+ return ERL_DRV_ERROR_ERRNO;
+ }
+
+ SET_NONBLOCKING(ifd[0]);
+ SET_NONBLOCKING(ofd[1]);
+
+ stderrfd = opts->redir_stderr ? ifd[1] : dup(2);
+
+ if (stderrfd >= sys_max_files() || stderrfd < 0) {
+ close_pipes(ifd, ofd);
+ if (stderrfd > -1)
+ close(stderrfd);
+ return ERL_DRV_ERROR_ERRNO;
+ }
+
+ if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) {
+ /* started with spawn_executable, not with spawn */
+ len = strlen(name);
+ cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP, len + 1);
+ if (!cmd_line) {
+ close_pipes(ifd, ofd);
+ errno = ENOMEM;
+ return ERL_DRV_ERROR_ERRNO;
+ }
+ memcpy((void *) cmd_line,(void *) name, len);
+ cmd_line[len] = '\0';
+ len = len + 1;
+ if (access(cmd_line,X_OK) != 0) {
+ int save_errno = errno;
+ erts_free(ERTS_ALC_T_TMP, cmd_line);
+ close_pipes(ifd, ofd);
+ errno = save_errno;
+ return ERL_DRV_ERROR_ERRNO;
+ }
+ } else {
+ /* make the string suitable for giving to "sh" */
+ len = strlen(name);
+ cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP,
+ CMD_LINE_PREFIX_STR_SZ + len + 1);
+ if (!cmd_line) {
+ close_pipes(ifd, ofd);
+ errno = ENOMEM;
+ return ERL_DRV_ERROR_ERRNO;
+ }
+ memcpy((void *) cmd_line,
+ (void *) CMD_LINE_PREFIX_STR,
+ CMD_LINE_PREFIX_STR_SZ);
+ memcpy((void *) (cmd_line + CMD_LINE_PREFIX_STR_SZ), (void *) name, len);
+ cmd_line[CMD_LINE_PREFIX_STR_SZ + len] = '\0';
+ len = CMD_LINE_PREFIX_STR_SZ + len + 1;
+ }
+
+ erts_smp_rwmtx_rlock(&environ_rwmtx);
+
+ if (opts->envir == NULL) {
+ new_environ = environ;
+ } else if ((new_environ = build_unix_environment(opts->envir)) == NULL) {
+ erts_smp_rwmtx_runlock(&environ_rwmtx);
+ close_pipes(ifd, ofd);
+ erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
+ errno = ENOMEM;
+ return ERL_DRV_ERROR_ERRNO;
+ }
+
+ if (opts->wd == NULL) {
+ if ((wd = getcwd(wd_buff, MAXPATHLEN+1)) == NULL) {
+ /* on some OSs this call opens a fd in the
+ background which means that this can
+ return EMFILE */
+ int err = errno;
+ close_pipes(ifd, ofd);
+ erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
+ if (new_environ != environ)
+ erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ);
+ erts_smp_rwmtx_runlock(&environ_rwmtx);
+ errno = err;
+ return ERL_DRV_ERROR_ERRNO;
+ }
+ } else {
+ wd = opts->wd;
+ }
+
+ {
+ struct iovec *io_vector;
+ int iov_len = 5;
+ char nullbuff[] = "\0";
+ int j, i = 0, res;
+ Sint32 buffsz = 0, env_len = 0, argv_len = 0,
+ flags = (opts->use_stdio ? FORKER_FLAG_USE_STDIO : 0)
+ | (opts->exit_status ? FORKER_FLAG_EXIT_STATUS : 0)
+ | (opts->read_write & DO_READ ? FORKER_FLAG_DO_READ : 0)
+ | (opts->read_write & DO_WRITE ? FORKER_FLAG_DO_WRITE : 0);
+
+ /* count number of elements in environment */
+ while(new_environ[env_len] != NULL)
+ env_len++;
+ iov_len += 1 + env_len; /* num envs including size int */
+
+ /* count number of element in argument list */
+ if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) {
+ if (opts->argv != NULL) {
+ while(opts->argv[argv_len] != NULL)
+ argv_len++;
+ } else {
+ argv_len++;
+ }
+ iov_len += 1 + argv_len; /* num argvs including size int */
+ }
+
+ io_vector = erts_alloc_fnf(ERTS_ALC_T_TMP, sizeof(struct iovec) * iov_len);
+
+ if (!io_vector) {
+ close_pipes(ifd, ofd);
+ erts_smp_rwmtx_runlock(&environ_rwmtx);
+ erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
+ if (new_environ != environ)
+ erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ);
+ errno = ENOMEM;
+ return ERL_DRV_ERROR_ERRNO;
+ }
+
+ io_vector[i].iov_base = (void*)&buffsz;
+ io_vector[i++].iov_len = sizeof(buffsz);
+
+ io_vector[i].iov_base = (void*)&flags;
+ flags = htonl(flags);
+ io_vector[i++].iov_len = sizeof(flags);
+ buffsz += sizeof(flags);
+
+ io_vector[i].iov_base = cmd_line;
+ io_vector[i++].iov_len = len;
+ buffsz += len;
+
+ io_vector[i].iov_base = wd;
+ io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1;
+ buffsz += io_vector[i++].iov_len;
+
+ io_vector[i].iov_base = nullbuff;
+ io_vector[i++].iov_len = 1;
+ buffsz += io_vector[i-1].iov_len;
+
+ io_vector[i].iov_base = (void*)&env_len;
+ env_len = htonl(env_len);
+ io_vector[i++].iov_len = sizeof(env_len);
+ buffsz += io_vector[i-1].iov_len;
+
+ for (j = 0; new_environ[j] != NULL; j++) {
+ io_vector[i].iov_base = new_environ[j];
+ io_vector[i++].iov_len = strlen(new_environ[j]) + 1;
+ buffsz += io_vector[i-1].iov_len;
+ }
+
+ /* only append arguments if this was a spawn_executable */
+ if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) {
+
+ io_vector[i].iov_base = (void*)&argv_len;
+ argv_len = htonl(argv_len);
+ io_vector[i++].iov_len = sizeof(argv_len);
+ buffsz += io_vector[i-1].iov_len;
+
+ if (opts->argv) {
+ /* If there are arguments we copy in the references to
+ them into the iov */
+ for (j = 0; opts->argv[j]; j++) {
+ if (opts->argv[j] == erts_default_arg0)
+ io_vector[i].iov_base = cmd_line;
+ else
+ io_vector[i].iov_base = opts->argv[j];
+ io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1;
+ buffsz += io_vector[i++].iov_len;
+ }
+ } else {
+ io_vector[i].iov_base = cmd_line;
+ io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1;
+ buffsz += io_vector[i++].iov_len;
+ }
+ }
+
+ /* we send the request to do the fork */
+ if ((res = writev(ofd[1], io_vector, iov_len > MAXIOV ? MAXIOV : iov_len)) < 0) {
+ if (errno == ERRNO_BLOCK) {
+ res = 0;
+ } else {
+ int err = errno;
+ close_pipes(ifd, ofd);
+ erts_free(ERTS_ALC_T_TMP, io_vector);
+ if (new_environ != environ)
+ erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ);
+ erts_smp_rwmtx_runlock(&environ_rwmtx);
+ erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
+ errno = err;
+ return ERL_DRV_ERROR_ERRNO;
+ }
+ }
+
+ if (res < buffsz) {
+ /* we only wrote part of the command payload. Enqueue the rest. */
+ for (i = 0; i < iov_len; i++) {
+ driver_enq(port_num, io_vector[i].iov_base, io_vector[i].iov_len);
+ }
+ driver_deq(port_num, res);
+ driver_select(port_num, ofd[1], ERL_DRV_WRITE|ERL_DRV_USE, 1);
+ }
+
+ erts_free(ERTS_ALC_T_TMP, io_vector);
+ }
+
+ erts_free(ERTS_ALC_T_TMP, (void *) cmd_line);
+
+ if (new_environ != environ)
+ erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ);
+
+ erts_smp_rwmtx_runlock(&environ_rwmtx);
+
+ dd = create_driver_data(port_num, ifd[0], ofd[1], opts->packet_bytes,
+ DO_WRITE | DO_READ, opts->exit_status,
+ 0, 0);
+
+ {
+ /* send ofd[0] + ifd[1] + stderrfd to forker port */
+ ErtsSysForkerProto *proto =
+ erts_alloc(ERTS_ALC_T_DRV_CTRL_DATA,
+ sizeof(ErtsSysForkerProto));
+ memset(proto, 0, sizeof(ErtsSysForkerProto));
+ proto->action = ErtsSysForkerProtoAction_Start;
+ proto->u.start.fds[0] = ofd[0];
+ proto->u.start.fds[1] = ifd[1];
+ proto->u.start.fds[2] = stderrfd;
+ proto->u.start.port_id = opts->exit_status ? erts_drvport2id(port_num) : THE_NON_VALUE;
+ if (erl_drv_port_control(forker_port, 'S', (char*)proto, sizeof(*proto))) {
+ /* The forker port has been killed, we close both fd's which will
+ make open_port throw an epipe error */
+ close(ofd[0]);
+ close(ifd[1]);
+ }
+ }
+
+ /* we set these fds to negative to mark if
+ they should be closed after the handshake */
+ if (!(opts->read_write & DO_READ))
+ dd->ifd->fd *= -1;
+
+ if (!(opts->read_write & DO_WRITE))
+ dd->ofd->fd *= -1;
+
+ return (ErlDrvData)dd;
+#undef CMD_LINE_PREFIX_STR
+#undef CMD_LINE_PREFIX_STR_SZ
+}
+
+static ErlDrvSSizeT spawn_control(ErlDrvData e, unsigned int cmd, char *buf,
+ ErlDrvSizeT len, char **rbuf, ErlDrvSizeT rlen)
+{
+ ErtsSysDriverData *dd = (ErtsSysDriverData*)e;
+ ErtsSysForkerProto *proto = (ErtsSysForkerProto *)buf;
+
+ ASSERT(len == sizeof(*proto));
+ ASSERT(proto->action == ErtsSysForkerProtoAction_SigChld);
+
+ dd->status = proto->u.sigchld.error_number;
+ dd->alive = -1;
+
+ if (dd->ifd)
+ driver_select(dd->port_num, abs(dd->ifd->fd), ERL_DRV_READ | ERL_DRV_USE, 1);
+
+ if (dd->ofd)
+ driver_select(dd->port_num, abs(dd->ofd->fd), ERL_DRV_WRITE | ERL_DRV_USE, 1);
+
+ return 0;
+}
+
+#define FD_DEF_HEIGHT 24
+#define FD_DEF_WIDTH 80
+/* Control op */
+#define FD_CTRL_OP_GET_WINSIZE 100
+
+static int fd_get_window_size(int fd, Uint32 *width, Uint32 *height)
+{
+#ifdef TIOCGWINSZ
+ struct winsize ws;
+ if (ioctl(fd,TIOCGWINSZ,&ws) == 0) {
+ *width = (Uint32) ws.ws_col;
+ *height = (Uint32) ws.ws_row;
+ return 0;
+ }
+#endif
+ return -1;
+}
+
+static ErlDrvSSizeT fd_control(ErlDrvData drv_data,
+ unsigned int command,
+ char *buf, ErlDrvSizeT len,
+ char **rbuf, ErlDrvSizeT rlen)
+{
+ int fd = (int)(long)drv_data;
+ char resbuff[2*sizeof(Uint32)];
+ switch (command) {
+ case FD_CTRL_OP_GET_WINSIZE:
+ {
+ Uint32 w,h;
+ if (fd_get_window_size(fd,&w,&h))
+ return 0;
+ memcpy(resbuff,&w,sizeof(Uint32));
+ memcpy(resbuff+sizeof(Uint32),&h,sizeof(Uint32));
+ }
+ break;
+ default:
+ return 0;
+ }
+ if (rlen < 2*sizeof(Uint32)) {
+ *rbuf = driver_alloc(2*sizeof(Uint32));
+ }
+ memcpy(*rbuf,resbuff,2*sizeof(Uint32));
+ return 2*sizeof(Uint32);
+}
+
+static ErlDrvData fd_start(ErlDrvPort port_num, char* name,
+ SysDriverOpts* opts)
+{
+ int non_blocking = 0;
+
+ if (((opts->read_write & DO_READ) && opts->ifd >= sys_max_files()) ||
+ ((opts->read_write & DO_WRITE) && opts->ofd >= sys_max_files()))
+ return ERL_DRV_ERROR_GENERAL;
+
+ /*
+ * Historical:
+ *
+ * "Note about nonblocking I/O.
+ *
+ * At least on Solaris, setting the write end of a TTY to nonblocking,
+ * will set the input end to nonblocking as well (and vice-versa).
+ * If erl is run in a pipeline like this: cat | erl
+ * the input end of the TTY will be the standard input of cat.
+ * And cat is not prepared to handle nonblocking I/O."
+ *
+ * Actually, the reason for this is not that the tty itself gets set
+ * in non-blocking mode, but that the "input end" (cat's stdin) and
+ * the "output end" (erlang's stdout) are typically the "same" file
+ * descriptor, dup()'ed from a single fd by one of this process'
+ * ancestors.
+ *
+ * The workaround for this problem used to be a rather bad kludge,
+ * interposing an extra process ("internal cat") between erlang's
+ * stdout and the original stdout, allowing erlang to set its stdout
+ * in non-blocking mode without affecting the stdin of the preceding
+ * process in the pipeline - and being a kludge, it caused all kinds
+ * of weird problems.
+ *
+ * So, this is the current logic:
+ *
+ * The only reason to set non-blocking mode on the output fd at all is
+ * if it's something that can cause a write() to block, of course,
+ * i.e. primarily if it points to a tty, socket, pipe, or fifo.
+ *
+ * If we don't set non-blocking mode when we "should" have, and output
+ * becomes blocked, the entire runtime system will be suspended - this
+ * is normally bad of course, and can happen fairly "easily" - e.g. user
+ * hits ^S on tty - but doesn't necessarily happen.
+ *
+ * If we do set non-blocking mode when we "shouldn't" have, the runtime
+ * system will end up seeing EOF on the input fd (due to the preceding
+ * process dying), which typically will cause the entire runtime system
+ * to terminate immediately (due to whatever erlang process is seeing
+ * the EOF taking it as a signal to halt the system). This is *very* bad.
+ *
+ * I.e. we should take a conservative approach, and only set non-
+ * blocking mode when we a) need to, and b) are reasonably certain
+ * that it won't be a problem. And as in the example above, the problem
+ * occurs when input fd and output fd point to different "things".
+ *
+ * However, determining that they are not just the same "type" of
+ * "thing", but actually the same instance of that type of thing, is
+ * unreasonably complex in many/most cases.
+ *
+ * Also, with pipes, sockets, and fifos it's far from obvious that the
+ * user *wants* non-blocking output: If you're running erlang inside
+ * some complex pipeline, you're probably not running a real-time system
+ * that must never stop, but rather *want* it to suspend if the output
+ * channel is "full".
+ *
+ * So, the bottom line: We will only set the output fd non-blocking if
+ * it points to a tty, and either a) the input fd also points to a tty,
+ * or b) we can make sure that setting the output fd non-blocking
+ * doesn't interfere with someone else's input, via a somewhat milder
+ * kludge than the above.
+ *
+ * Also keep in mind that while this code is almost exclusively run as
+ * a result of an erlang open_port({fd,0,1}, ...), that isn't the only
+ * case - it can be called with any old pre-existing file descriptors,
+ * the relations between which (if they're even two) we can only guess
+ * at - still, we try our best...
+ *
+ * Added note OTP 18: Some systems seem to use stdout/stderr to log data
+ * using unix pipes, so we cannot allow the system to block on a write.
+ * Therefore we use an async thread to write the data to fd's that could
+ * not be set to non-blocking. When no async threads are available we
+ * fall back on the old behaviour.
+ *
+ * Also the guarantee about what is delivered to the OS has changed.
+ * Pre 18 the fd driver did no flushing of data before terminating.
+ * Now it does. This is because we want to be able to guarantee that things
+ * such as escripts and friends really have outputted all data before
+ * terminating. This could potentially block the termination of the system
+ * for a very long time, but if the user wants to terminate fast she should
+ * use erlang:halt with flush=false.
+ */
+
+ /* Try to figure out if we can use non-blocking writes */
+ if (opts->read_write & DO_WRITE) {
+
+ /* If we don't have a read end, all bets are off - no non-blocking. */
+ if (opts->read_write & DO_READ) {
+
+ if (isatty(opts->ofd)) { /* output fd is a tty:-) */
+
+ if (isatty(opts->ifd)) { /* input fd is also a tty */
+
+ /* To really do this "right", we should also check that
+ input and output fd point to the *same* tty - but
+ this seems like overkill; ttyname() isn't for free,
+ and this is a very common case - and it's hard to
+ imagine a scenario where setting non-blocking mode
+ here would cause problems - go ahead and do it. */
+
+ non_blocking = 1;
+ SET_NONBLOCKING(opts->ofd);
+
+ } else { /* output fd is a tty, input fd isn't */
+
+ /* This is a "problem case", but also common (see the
+ example above) - i.e. it makes sense to try a bit
+ harder before giving up on non-blocking mode: Try to
+ re-open the tty that the output fd points to, and if
+ successful replace the original one with the "new" fd
+ obtained this way, and set *that* one in non-blocking
+ mode. (Yes, this is a kludge.)
+
+ However, re-opening the tty may fail in a couple of
+ (unusual) cases:
+
+ 1) The name of the tty (or an equivalent one, i.e.
+ same major/minor number) can't be found, because
+ it actually lives somewhere other than /dev (or
+ wherever ttyname() looks for it), and isn't
+ equivalent to any of those that do live in the
+ "standard" place - this should be *very* unusual.
+
+ 2) Permissions on the tty don't allow us to open it -
+ it's perfectly possible to have an fd open to an
+ object whose permissions wouldn't allow us to open
+ it. This is not as unusual as it sounds, one case
+ is if the user has su'ed to someone else (not
+ root) - we have a read/write fd open to the tty
+ (because it has been inherited all the way down
+ here), but we have neither read nor write
+ permission for the tty.
+
+ In these cases, we finally give up, and don't set the
+ output fd in non-blocking mode. */
+
+ char *tty;
+ int nfd;
+
+ if ((tty = ttyname(opts->ofd)) != NULL &&
+ (nfd = open(tty, O_WRONLY)) != -1) {
+ dup2(nfd, opts->ofd);
+ close(nfd);
+ non_blocking = 1;
+ SET_NONBLOCKING(opts->ofd);
+ }
+ }
+ }
+ }
+ }
+ return (ErlDrvData)create_driver_data(port_num, opts->ifd, opts->ofd,
+ opts->packet_bytes,
+ opts->read_write, 0, -1,
+ !non_blocking);
+}
+
+static void clear_fd_data(ErtsSysFdData *fdd)
+{
+ if (fdd->sz > 0) {
+ erts_free(ERTS_ALC_T_FD_ENTRY_BUF, (void *) fdd->buf);
+ ASSERT(erts_smp_atomic_read_nob(&sys_misc_mem_sz) >= fdd->sz);
+ erts_smp_atomic_add_nob(&sys_misc_mem_sz, -1*fdd->sz);
+ }
+ fdd->buf = NULL;
+ fdd->sz = 0;
+ fdd->remain = 0;
+ fdd->cpos = NULL;
+ fdd->psz = 0;
+}
+
+static void nbio_stop_fd(ErlDrvPort prt, ErtsSysFdData *fdd)
+{
+ driver_select(prt, abs(fdd->fd), DO_READ|DO_WRITE, 0);
+ clear_fd_data(fdd);
+ SET_BLOCKING(abs(fdd->fd));
+
+}
+
+static void fd_stop(ErlDrvData ev) /* Does not close the fds */
+{
+ ErtsSysDriverData* dd = (ErtsSysDriverData*)ev;
+ ErlDrvPort prt = dd->port_num;
+ int sz = sizeof(ErtsSysDriverData);
+
+#if FDBLOCK
+ if (dd->blocking) {
+ erts_free(ERTS_ALC_T_SYS_BLOCKING, dd->blocking);
+ dd->blocking = NULL;
+ sz += sizeof(ErtsSysBlocking);
+ }
+#endif
+
+ if (dd->ifd) {
+ sz += sizeof(ErtsSysFdData);
+ nbio_stop_fd(prt, dd->ifd);
+ }
+ if (dd->ofd && dd->ofd != dd->ifd) {
+ sz += sizeof(ErtsSysFdData);
+ nbio_stop_fd(prt, dd->ofd);
+ }
+
+ erts_free(ERTS_ALC_T_DRV_TAB, dd);
+ erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sz);
+}
+
+static void fd_flush(ErlDrvData ev)
+{
+ ErtsSysDriverData* dd = (ErtsSysDriverData*)ev;
+ if (!dd->terminating)
+ dd->terminating = 1;
+}
+
+static ErlDrvData vanilla_start(ErlDrvPort port_num, char* name,
+ SysDriverOpts* opts)
+{
+ int flags, fd;
+ ErlDrvData res;
+
+ flags = (opts->read_write == DO_READ ? O_RDONLY :
+ opts->read_write == DO_WRITE ? O_WRONLY|O_CREAT|O_TRUNC :
+ O_RDWR|O_CREAT);
+ if ((fd = open(name, flags, 0666)) < 0)
+ return ERL_DRV_ERROR_GENERAL;
+ if (fd >= sys_max_files()) {
+ close(fd);
+ return ERL_DRV_ERROR_GENERAL;
+ }
+ SET_NONBLOCKING(fd);
+
+ res = (ErlDrvData)(long)create_driver_data(port_num, fd, fd,
+ opts->packet_bytes,
+ opts->read_write, 0, -1, 0);
+ return res;
+}
+
+/* Note that driver_data[fd].ifd == fd if the port was opened for reading, */
+/* otherwise (i.e. write only) driver_data[fd].ofd = fd. */
+
+static void stop(ErlDrvData ev)
+{
+ ErtsSysDriverData* dd = (ErtsSysDriverData*)ev;
+ ErlDrvPort prt = dd->port_num;
+
+ if (dd->ifd) {
+ nbio_stop_fd(prt, dd->ifd);
+ driver_select(prt, abs(dd->ifd->fd), ERL_DRV_USE, 0); /* close(ifd); */
+ }
+
+ if (dd->ofd && dd->ofd != dd->ifd) {
+ nbio_stop_fd(prt, dd->ofd);
+ driver_select(prt, abs(dd->ofd->fd), ERL_DRV_USE, 0); /* close(ofd); */
+ }
+
+ erts_free(ERTS_ALC_T_DRV_TAB, dd);
+}
+
+/* used by fd_driver */
+static void outputv(ErlDrvData e, ErlIOVec* ev)
+{
+ ErtsSysDriverData *dd = (ErtsSysDriverData*)e;
+ ErlDrvPort ix = dd->port_num;
+ int pb = dd->packet_bytes;
+ int ofd = dd->ofd ? dd->ofd->fd : -1;
+ ssize_t n;
+ ErlDrvSizeT sz;
+ char lb[4];
+ char* lbp;
+ ErlDrvSizeT len = ev->size;
+
+ /* (len > ((unsigned long)-1 >> (4-pb)*8)) */
+ /* if (pb >= 0 && (len & (((ErlDrvSizeT)1 << (pb*8))) - 1) != len) {*/
+ if (((pb == 2) && (len > 0xffff)) || (pb == 1 && len > 0xff)) {
+ driver_failure_posix(ix, EINVAL);
+ return; /* -1; */
+ }
+ /* Handles 0 <= pb <= 4 only */
+ put_int32((Uint32) len, lb);
+ lbp = lb + (4-pb);
+
+ ev->iov[0].iov_base = lbp;
+ ev->iov[0].iov_len = pb;
+ ev->size += pb;
+
+ if (dd->blocking && FDBLOCK)
+ driver_pdl_lock(dd->blocking->pdl);
+
+ if ((sz = driver_sizeq(ix)) > 0) {
+ driver_enqv(ix, ev, 0);
+
+ if (dd->blocking && FDBLOCK)
+ driver_pdl_unlock(dd->blocking->pdl);
+
+ if (sz + ev->size >= (1 << 13))
+ set_busy_port(ix, 1);
+ }
+ else if (!dd->blocking || !FDBLOCK) {
+ /* We try to write directly if the fd in non-blocking */
+ int vsize = ev->vsize > MAX_VSIZE ? MAX_VSIZE : ev->vsize;
+
+ n = writev(ofd, (const void *) (ev->iov), vsize);
+ if (n == ev->size)
+ return; /* 0;*/
+ if (n < 0) {
+ if ((errno != EINTR) && (errno != ERRNO_BLOCK)) {
+ driver_failure_posix(ix, errno);
+ return; /* -1;*/
+ }
+ n = 0;
+ }
+ driver_enqv(ix, ev, n); /* n is the skip value */
+ driver_select(ix, ofd, ERL_DRV_WRITE|ERL_DRV_USE, 1);
+ }
+#if FDBLOCK
+ else {
+ if (ev->size != 0) {
+ driver_enqv(ix, ev, 0);
+ driver_pdl_unlock(dd->blocking->pdl);
+ driver_async(ix, &dd->blocking->pkey,
+ fd_async, dd, NULL);
+ } else {
+ driver_pdl_unlock(dd->blocking->pdl);
+ }
+ }
+#endif
+ /* return 0;*/
+}
+
+/* Used by spawn_driver and vanilla driver */
+static void output(ErlDrvData e, char* buf, ErlDrvSizeT len)
+{
+ ErtsSysDriverData *dd = (ErtsSysDriverData*)e;
+ ErlDrvPort ix = dd->port_num;
+ int pb = dd->packet_bytes;
+ int ofd = dd->ofd ? dd->ofd->fd : -1;
+ ssize_t n;
+ ErlDrvSizeT sz;
+ char lb[4];
+ char* lbp;
+ struct iovec iv[2];
+
+ /* (len > ((unsigned long)-1 >> (4-pb)*8)) */
+ if (((pb == 2) && (len > 0xffff))
+ || (pb == 1 && len > 0xff)
+ || dd->pid == 0 /* Attempt at output before port is ready */) {
+ driver_failure_posix(ix, EINVAL);
+ return; /* -1; */
+ }
+ put_int32(len, lb);
+ lbp = lb + (4-pb);
+
+ if ((sz = driver_sizeq(ix)) > 0) {
+ driver_enq(ix, lbp, pb);
+ driver_enq(ix, buf, len);
+ if (sz + len + pb >= (1 << 13))
+ set_busy_port(ix, 1);
+ }
+ else {
+ iv[0].iov_base = lbp;
+ iv[0].iov_len = pb; /* should work for pb=0 */
+ iv[1].iov_base = buf;
+ iv[1].iov_len = len;
+ n = writev(ofd, iv, 2);
+ if (n == pb+len)
+ return; /* 0; */
+ if (n < 0) {
+ if ((errno != EINTR) && (errno != ERRNO_BLOCK)) {
+ driver_failure_posix(ix, errno);
+ return; /* -1; */
+ }
+ n = 0;
+ }
+ if (n < pb) {
+ driver_enq(ix, lbp+n, pb-n);
+ driver_enq(ix, buf, len);
+ }
+ else {
+ n -= pb;
+ driver_enq(ix, buf+n, len-n);
+ }
+ driver_select(ix, ofd, ERL_DRV_WRITE|ERL_DRV_USE, 1);
+ }
+ return; /* 0; */
+}
+
+static int port_inp_failure(ErtsSysDriverData *dd, int res)
+ /* Result: 0 (eof) or -1 (error) */
+{
+ int err = errno;
+
+ ASSERT(res <= 0);
+ if (dd->ifd) {
+ driver_select(dd->port_num, dd->ifd->fd, ERL_DRV_READ|ERL_DRV_WRITE, 0);
+ clear_fd_data(dd->ifd);
+ }
+
+ if (dd->blocking && FDBLOCK) {
+ driver_pdl_lock(dd->blocking->pdl);
+ if (driver_sizeq(dd->port_num) > 0) {
+ driver_pdl_unlock(dd->blocking->pdl);
+ /* We have stuff in the output queue, so we just
+ set the state to terminating and wait for fd_async_ready
+ to terminate the port */
+ if (res == 0)
+ dd->terminating = 2;
+ else
+ dd->terminating = -err;
+ return 0;
+ }
+ driver_pdl_unlock(dd->blocking->pdl);
+ }
+
+ if (res == 0) {
+ if (dd->alive == 1) {
+ /*
+ * We have eof and want to report exit status, but the process
+ * hasn't exited yet. When it does ready_input will
+ * driver_select() this fd which will make sure that we get
+ * back here with dd->alive == -1 and dd->status set.
+ */
+ return 0;
+ }
+ else if (dd->alive == -1) {
+ int status = dd->status;
+
+ /* We need not be prepared for stopped/continued processes. */
+ if (WIFSIGNALED(status))
+ status = 128 + WTERMSIG(status);
+ else
+ status = WEXITSTATUS(status);
+ driver_report_exit(dd->port_num, status);
+ }
+ driver_failure_eof(dd->port_num);
+ } else if (dd->ifd) {
+ erl_drv_init_ack(dd->port_num, ERL_DRV_ERROR_ERRNO);
+ } else {
+ driver_failure_posix(dd->port_num, err);
+ }
+ return 0;
+}
+
+/* fd is the drv_data that is returned from the */
+/* initial start routine */
+/* ready_fd is the descriptor that is ready to read */
+
+static void ready_input(ErlDrvData e, ErlDrvEvent ready_fd)
+{
+ ErtsSysDriverData *dd = (ErtsSysDriverData*)e;
+ ErlDrvPort port_num;
+ int packet_bytes;
+ int res;
+ Uint h;
+
+ port_num = dd->port_num;
+ packet_bytes = dd->packet_bytes;
+
+ ASSERT(abs(dd->ifd->fd) == ready_fd);
+
+ if (dd->pid == 0) {
+ /* the pid is sent from erl_child_setup. spawn driver only. */
+ ErtsSysForkerProto proto;
+ int res;
+
+ if((res = read(ready_fd, &proto, sizeof(proto))) <= 0) {
+ /* hmm, child setup seems to have closed the pipe too early...
+ we close the port as there is not much else we can do */
+ if (res < 0 && errno == ERRNO_BLOCK)
+ return;
+ driver_select(port_num, ready_fd, ERL_DRV_READ, 0);
+ if (res == 0)
+ errno = EPIPE;
+ port_inp_failure(dd, -1);
+ return;
+ }
+
+ ASSERT(proto.action == ErtsSysForkerProtoAction_Go);
+ dd->pid = proto.u.go.os_pid;
+
+ if (dd->pid == -1) {
+ /* Setup failed! The only reason why this should happen is if
+ the fork fails. */
+ errno = proto.u.go.error_number;
+ port_inp_failure(dd, -1);
+ return;
+ }
+
+ proto.action = ErtsSysForkerProtoAction_Ack;
+
+ if (driver_sizeq(port_num) > 0) {
+ driver_enq(port_num, (char*)&proto, sizeof(proto));
+ } else {
+ if (write(abs(dd->ofd->fd), &proto, sizeof(proto)) < 0)
+ if (errno == ERRNO_BLOCK || errno == EINTR)
+ driver_enq(port_num, (char*)&proto, sizeof(proto));
+ /* do nothing on failure here. If the ofd is broken, then
+ the ifd will probably also be broken and trigger
+ a port_inp_failure */
+ }
+
+ if (dd->ifd->fd < 0) {
+ driver_select(port_num, abs(dd->ifd->fd), ERL_DRV_READ|ERL_DRV_USE, 0);
+ erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sizeof(ErtsSysFdData));
+ dd->ifd = NULL;
+ }
+
+ if (dd->ofd->fd < 0 || driver_sizeq(port_num) > 0)
+ /* we select in order to close fd or write to queue,
+ child setup will close this fd if fd < 0 */
+ driver_select(port_num, abs(dd->ofd->fd), ERL_DRV_WRITE|ERL_DRV_USE, 1);
+
+ erl_drv_set_os_pid(port_num, dd->pid);
+ erl_drv_init_ack(port_num, e);
+ return;
+ }
+
+ if (packet_bytes == 0) {
+ byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF,
+ ERTS_SYS_READ_BUF_SZ);
+ res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ);
+ if (res < 0) {
+ if ((errno != EINTR) && (errno != ERRNO_BLOCK))
+ port_inp_failure(dd, res);
+ }
+ else if (res == 0)
+ port_inp_failure(dd, res);
+ else
+ driver_output(port_num, (char*) read_buf, res);
+ erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf);
+ }
+ else if (dd->ifd->remain > 0) { /* We try to read the remainder */
+ /* space is allocated in buf */
+ res = read(ready_fd, dd->ifd->cpos,
+ dd->ifd->remain);
+ if (res < 0) {
+ if ((errno != EINTR) && (errno != ERRNO_BLOCK))
+ port_inp_failure(dd, res);
+ }
+ else if (res == 0) {
+ port_inp_failure(dd, res);
+ }
+ else if (res == dd->ifd->remain) { /* we're done */
+ driver_output(port_num, dd->ifd->buf,
+ dd->ifd->sz);
+ clear_fd_data(dd->ifd);
+ }
+ else { /* if (res < dd->ifd->remain) */
+ dd->ifd->cpos += res;
+ dd->ifd->remain -= res;
+ }
+ }
+ else if (dd->ifd->remain == 0) { /* clean fd */
+ byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF,
+ ERTS_SYS_READ_BUF_SZ);
+ /* We make one read attempt and see what happens */
+ res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ);
+ if (res < 0) {
+ if ((errno != EINTR) && (errno != ERRNO_BLOCK))
+ port_inp_failure(dd, res);
+ }
+ else if (res == 0) { /* eof */
+ port_inp_failure(dd, res);
+ }
+ else if (res < packet_bytes - dd->ifd->psz) {
+ memcpy(dd->ifd->pbuf+dd->ifd->psz,
+ read_buf, res);
+ dd->ifd->psz += res;
+ }
+ else { /* if (res >= packet_bytes) */
+ unsigned char* cpos = read_buf;
+ int bytes_left = res;
+
+ while (1) {
+ int psz = dd->ifd->psz;
+ char* pbp = dd->ifd->pbuf + psz;
+
+ while(bytes_left && (psz < packet_bytes)) {
+ *pbp++ = *cpos++;
+ bytes_left--;
+ psz++;
+ }
+
+ if (psz < packet_bytes) {
+ dd->ifd->psz = psz;
+ break;
+ }
+ dd->ifd->psz = 0;
+
+ switch (packet_bytes) {
+ case 1: h = get_int8(dd->ifd->pbuf); break;
+ case 2: h = get_int16(dd->ifd->pbuf); break;
+ case 4: h = get_int32(dd->ifd->pbuf); break;
+ default: ASSERT(0); return; /* -1; */
+ }
+
+ if (h <= (bytes_left)) {
+ driver_output(port_num, (char*) cpos, h);
+ cpos += h;
+ bytes_left -= h;
+ continue;
+ }
+ else { /* The last message we got was split */
+ char *buf = erts_alloc_fnf(ERTS_ALC_T_FD_ENTRY_BUF, h);
+ if (!buf) {
+ errno = ENOMEM;
+ port_inp_failure(dd, -1);
+ }
+ else {
+ erts_smp_atomic_add_nob(&sys_misc_mem_sz, h);
+ sys_memcpy(buf, cpos, bytes_left);
+ dd->ifd->buf = buf;
+ dd->ifd->sz = h;
+ dd->ifd->remain = h - bytes_left;
+ dd->ifd->cpos = buf + bytes_left;
+ }
+ break;
+ }
+ }
+ }
+ erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf);
+ }
+}
+
+
+/* fd is the drv_data that is returned from the */
+/* initial start routine */
+/* ready_fd is the descriptor that is ready to read */
+
+static void ready_output(ErlDrvData e, ErlDrvEvent ready_fd)
+{
+ ErtsSysDriverData *dd = (ErtsSysDriverData*)e;
+ ErlDrvPort ix = dd->port_num;
+ int n;
+ struct iovec* iv;
+ int vsize;
+
+ if ((iv = (struct iovec*) driver_peekq(ix, &vsize)) == NULL) {
+ driver_select(ix, ready_fd, ERL_DRV_WRITE, 0);
+ if (dd->pid > 0 && dd->ofd->fd < 0) {
+ /* The port was opened with 'in' option, which means we
+ should close the output fd as soon as the command has
+ been sent. */
+ driver_select(ix, ready_fd, ERL_DRV_WRITE|ERL_DRV_USE, 0);
+ erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sizeof(ErtsSysFdData));
+ dd->ofd = NULL;
+ }
+ if (dd->terminating)
+ driver_failure_atom(dd->port_num,"normal");
+ return; /* 0; */
+ }
+ vsize = vsize > MAX_VSIZE ? MAX_VSIZE : vsize;
+ if ((n = writev(ready_fd, iv, vsize)) > 0) {
+ if (driver_deq(ix, n) == 0)
+ set_busy_port(ix, 0);
+ }
+ else if (n < 0) {
+ if (errno == ERRNO_BLOCK || errno == EINTR)
+ return; /* 0; */
+ else {
+ int res = errno;
+ driver_select(ix, ready_fd, ERL_DRV_WRITE, 0);
+ driver_failure_posix(ix, res);
+ return; /* -1; */
+ }
+ }
+ return; /* 0; */
+}
+
+static void stop_select(ErlDrvEvent fd, void* _)
+{
+ close((int)fd);
+}
+
+#if FDBLOCK
+
+static void
+fd_async(void *async_data)
+{
+ int res;
+ ErtsSysDriverData *dd = (ErtsSysDriverData *)async_data;
+ SysIOVec *iov0;
+ SysIOVec *iov;
+ int iovlen;
+ int err = 0;
+ /* much of this code is stolen from efile_drv:invoke_writev */
+ driver_pdl_lock(dd->blocking->pdl);
+ iov0 = driver_peekq(dd->port_num, &iovlen);
+ iovlen = iovlen < MAXIOV ? iovlen : MAXIOV;
+ iov = erts_alloc_fnf(ERTS_ALC_T_SYS_WRITE_BUF,
+ sizeof(SysIOVec)*iovlen);
+ if (!iov) {
+ res = -1;
+ err = ENOMEM;
+ driver_pdl_unlock(dd->blocking->pdl);
+ } else {
+ memcpy(iov,iov0,iovlen*sizeof(SysIOVec));
+ driver_pdl_unlock(dd->blocking->pdl);
+
+ do {
+ res = writev(dd->ofd->fd, iov, iovlen);
+ } while (res < 0 && errno == EINTR);
+ if (res < 0)
+ err = errno;
+ err = errno;
+
+ erts_free(ERTS_ALC_T_SYS_WRITE_BUF, iov);
+ }
+ dd->blocking->res = res;
+ dd->blocking->err = err;
+}
+
+void fd_ready_async(ErlDrvData drv_data,
+ ErlDrvThreadData thread_data) {
+ ErtsSysDriverData *dd = (ErtsSysDriverData *)thread_data;
+ ErlDrvPort port_num = dd->port_num;
+
+ ASSERT(dd->blocking);
+
+ if (dd->blocking->res > 0) {
+ driver_pdl_lock(dd->blocking->pdl);
+ if (driver_deq(port_num, dd->blocking->res) == 0) {
+ driver_pdl_unlock(dd->blocking->pdl);
+ set_busy_port(port_num, 0);
+ if (dd->terminating) {
+ /* The port is has been ordered to terminate
+ from either fd_flush or port_inp_failure */
+ if (dd->terminating == 1)
+ driver_failure_atom(port_num, "normal");
+ else if (dd->terminating == 2)
+ driver_failure_eof(port_num);
+ else if (dd->terminating < 0)
+ driver_failure_posix(port_num, -dd->terminating);
+ return; /* -1; */
+ }
+ } else {
+ driver_pdl_unlock(dd->blocking->pdl);
+ /* still data left to write in queue */
+ driver_async(port_num, &dd->blocking->pkey, fd_async, dd, NULL);
+ return /* 0; */;
+ }
+ } else if (dd->blocking->res < 0) {
+ if (dd->blocking->err == ERRNO_BLOCK) {
+ set_busy_port(port_num, 1);
+ /* still data left to write in queue */
+ driver_async(port_num, &dd->blocking->pkey, fd_async, dd, NULL);
+ } else
+ driver_failure_posix(port_num, dd->blocking->err);
+ return; /* -1; */
+ }
+ return; /* 0; */
+}
+
+#endif
+
+/* Forker driver */
+
+static int forker_fd;
+
+static ErlDrvData forker_start(ErlDrvPort port_num, char* name,
+ SysDriverOpts* opts)
+{
+
+ int i;
+ int fds[2];
+ int res, unbind;
+ char bindir[MAXPATHLEN];
+ size_t bindirsz = sizeof(bindir);
+ Uint csp_path_sz;
+ char *child_setup_prog;
+
+ forker_port = erts_drvport2id(port_num);
+
+ res = erts_sys_getenv_raw("BINDIR", bindir, &bindirsz);
+ if (res != 0) {
+ if (res < 0)
+ erts_exit(1,
+ "Environment variable BINDIR is not set\n");
+ if (res > 0)
+ erts_exit(1,
+ "Value of environment variable BINDIR is too large\n");
+ }
+ if (bindir[0] != DIR_SEPARATOR_CHAR)
+ erts_exit(1,
+ "Environment variable BINDIR does not contain an"
+ " absolute path\n");
+ csp_path_sz = (strlen(bindir)
+ + 1 /* DIR_SEPARATOR_CHAR */
+ + sizeof(CHILD_SETUP_PROG_NAME)
+ + 1);
+ child_setup_prog = erts_alloc(ERTS_ALC_T_CS_PROG_PATH, csp_path_sz);
+ erts_snprintf(child_setup_prog, csp_path_sz,
+ "%s%c%s",
+ bindir,
+ DIR_SEPARATOR_CHAR,
+ CHILD_SETUP_PROG_NAME);
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0) {
+ erts_exit(ERTS_ABORT_EXIT,
+ "Could not open unix domain socket in spawn_init: %d\n",
+ errno);
+ }
+
+ forker_fd = fds[0];
+
+ unbind = erts_sched_bind_atfork_prepare();
+
+ i = fork();
+
+ if (i == 0) {
+ /* The child */
+ char *cs_argv[FORKER_ARGV_NO_OF_ARGS] =
+ {CHILD_SETUP_PROG_NAME, NULL, NULL};
+ char buff[128];
+
+ erts_sched_bind_atfork_child(unbind);
+
+ snprintf(buff, 128, "%d", sys_max_files());
+ cs_argv[FORKER_ARGV_MAX_FILES] = buff;
+
+ /* We preallocate fd 3 for the uds fd */
+ if (fds[1] != 3) {
+ dup2(fds[1], 3);
+ }
+
+#if defined(USE_SETPGRP_NOARGS) /* SysV */
+ (void) setpgrp();
+#elif defined(USE_SETPGRP) /* BSD */
+ (void) setpgrp(0, getpid());
+#else /* POSIX */
+ (void) setsid();
+#endif
+
+ execv(child_setup_prog, cs_argv);
+ _exit(1);
+ }
+
+ erts_sched_bind_atfork_parent(unbind);
+
+ erts_free(ERTS_ALC_T_CS_PROG_PATH, child_setup_prog);
+
+ close(fds[1]);
+
+ SET_NONBLOCKING(forker_fd);
+
+ driver_select(port_num, forker_fd, ERL_DRV_READ|ERL_DRV_USE, 1);
+
+ return (ErlDrvData)port_num;
+}
+
+static void forker_stop(ErlDrvData e)
+{
+ /* we probably should do something here,
+ the port has been closed by the user. */
+}
+
+static void forker_ready_input(ErlDrvData e, ErlDrvEvent fd)
+{
+ int res;
+ ErtsSysForkerProto *proto;
+
+ proto = erts_alloc(ERTS_ALC_T_DRV_CTRL_DATA, sizeof(*proto));
+
+ if ((res = read(fd, proto, sizeof(*proto))) < 0) {
+ if (errno == ERRNO_BLOCK)
+ return;
+ erts_exit(ERTS_DUMP_EXIT, "Failed to read from erl_child_setup: %d\n", errno);
+ }
+
+ if (res == 0)
+ erts_exit(ERTS_DUMP_EXIT, "erl_child_setup closed\n");
+
+ ASSERT(res == sizeof(*proto));
+
+#ifdef FORKER_PROTO_START_ACK
+ if (proto->action == ErtsSysForkerProtoAction_StartAck) {
+ /* Ideally we would like to not have to ack each Start
+ command being sent over the uds, but it would seem
+ that some operating systems (only observed on FreeBSD)
+ throw away data on the uds when the socket becomes full,
+ so we have to.
+ */
+ ErlDrvPort port_num = (ErlDrvPort)e;
+ int vlen;
+ SysIOVec *iov = driver_peekq(port_num, &vlen);
+ ErtsSysForkerProto *proto = (ErtsSysForkerProto *)iov[0].iov_base;
+
+ close(proto->u.start.fds[0]);
+ close(proto->u.start.fds[1]);
+ if (proto->u.start.fds[1] != proto->u.start.fds[2])
+ close(proto->u.start.fds[2]);
+
+ driver_deq(port_num, sizeof(*proto));
+
+ if (driver_sizeq(port_num) > 0)
+ driver_select(port_num, forker_fd, ERL_DRV_WRITE|ERL_DRV_USE, 1);
+ } else
+#endif
+ {
+ ASSERT(proto->action == ErtsSysForkerProtoAction_SigChld);
+
+ /* ideally this would be a port_command call, but as command is
+ already used by the spawn_driver, we use control instead.
+ Note that when using erl_drv_port_control it is an asynchronous
+ control. */
+ erl_drv_port_control(proto->u.sigchld.port_id, 'S',
+ (char*)proto, sizeof(*proto));
+ }
+
+}
+
+static void forker_ready_output(ErlDrvData e, ErlDrvEvent fd)
+{
+ ErlDrvPort port_num = (ErlDrvPort)e;
+
+#ifndef FORKER_PROTO_START_ACK
+ while (driver_sizeq(port_num) > 0) {
+#endif
+ int vlen;
+ SysIOVec *iov = driver_peekq(port_num, &vlen);
+ ErtsSysForkerProto *proto = (ErtsSysForkerProto *)iov[0].iov_base;
+ ASSERT(iov[0].iov_len >= (sizeof(*proto)));
+ if (sys_uds_write(forker_fd, (char*)proto, sizeof(*proto),
+ proto->u.start.fds, 3, 0) < 0) {
+ if (errno == ERRNO_BLOCK)
+ return;
+ erts_exit(ERTS_DUMP_EXIT, "Failed to write to erl_child_setup: %d\n", errno);
+ }
+#ifndef FORKER_PROTO_START_ACK
+ close(proto->u.start.fds[0]);
+ close(proto->u.start.fds[1]);
+ if (proto->u.start.fds[1] != proto->u.start.fds[2])
+ close(proto->u.start.fds[2]);
+ driver_deq(port_num, sizeof(*proto));
+ }
+#endif
+
+ driver_select(port_num, forker_fd, ERL_DRV_WRITE, 0);
+}
+
+static ErlDrvSSizeT forker_control(ErlDrvData e, unsigned int cmd, char *buf,
+ ErlDrvSizeT len, char **rbuf, ErlDrvSizeT rlen)
+{
+ ErtsSysForkerProto *proto = (ErtsSysForkerProto *)buf;
+ ErlDrvPort port_num = (ErlDrvPort)e;
+ int res;
+
+ driver_enq(port_num, buf, len);
+ if (driver_sizeq(port_num) > sizeof(*proto)) {
+ return 0;
+ }
+
+ if ((res = sys_uds_write(forker_fd, (char*)proto, sizeof(*proto),
+ proto->u.start.fds, 3, 0)) < 0) {
+ if (errno == ERRNO_BLOCK) {
+ driver_select(port_num, forker_fd, ERL_DRV_WRITE|ERL_DRV_USE, 1);
+ return 0;
+ }
+ erts_exit(ERTS_DUMP_EXIT, "Failed to write to erl_child_setup: %d\n", errno);
+ }
+
+#ifndef FORKER_PROTO_START_ACK
+ ASSERT(res == sizeof(*proto));
+ close(proto->u.start.fds[0]);
+ close(proto->u.start.fds[1]);
+ if (proto->u.start.fds[1] != proto->u.start.fds[2])
+ close(proto->u.start.fds[2]);
+ driver_deq(port_num, sizeof(*proto));
+#endif
+
+ return 0;
+}
diff --git a/erts/emulator/sys/unix/sys_time.c b/erts/emulator/sys/unix/sys_time.c
index 95b3daa4ed..8ce02506ab 100644
--- a/erts/emulator/sys/unix/sys_time.c
+++ b/erts/emulator/sys/unix/sys_time.c
@@ -65,6 +65,8 @@
# include <fcntl.h>
#endif
+static void init_perf_counter(void);
+
/******************* Routines for time measurement *********************/
#undef ERTS_SYS_TIME_INTERNAL_STATE_WRITE_FREQ__
@@ -404,6 +406,8 @@ sys_init_time(ErtsSysInitTimeResult *init_resp)
# error Missing erts_os_system_time() implementation
#endif
+ init_perf_counter();
+
}
void
@@ -908,10 +912,120 @@ erts_os_times(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep)
#endif
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
+ * Performance counter functions *
+\* */
+
+
+/* What resolution to spin to in micro seconds */
+#define RESOLUTION 100
+/* How many iterations to spin */
+#define ITERATIONS 1
+/* How many significant figures to round to */
+#define SIGFIGS 3
+
+static ErtsSysPerfCounter calculate_perf_counter_unit(void) {
+ int i;
+ ErtsSysPerfCounter pre, post;
+ double value = 0;
+ double round_factor;
+#if defined(HAVE_GETHRTIME) && defined(GETHRTIME_WITH_CLOCK_GETTIME)
+ struct timespec basetime,comparetime;
+#define __GETTIME(arg) clock_gettime(CLOCK_MONOTONIC,arg)
+#define __GETUSEC(arg) (arg.tv_nsec / 1000)
+#else
+ SysTimeval basetime,comparetime;
+#define __GETTIME(arg) sys_gettimeofday(arg)
+#define __GETUSEC(arg) arg.tv_usec
+#endif
+
+ for (i = 0; i < ITERATIONS; i++) {
+ /* Make sure usec just flipped over at current resolution */
+ __GETTIME(&basetime);
+ do {
+ __GETTIME(&comparetime);
+ } while ((__GETUSEC(basetime) / RESOLUTION) == (__GETUSEC(comparetime) / RESOLUTION));
+
+ pre = erts_sys_perf_counter();
+
+ __GETTIME(&basetime);
+ do {
+ __GETTIME(&comparetime);
+ } while ((__GETUSEC(basetime) / RESOLUTION) == (__GETUSEC(comparetime) / RESOLUTION));
+
+ post = erts_sys_perf_counter();
+
+ value += post - pre;
+ }
+ /* After this value is ticks per us */
+ value /= (RESOLUTION*ITERATIONS);
+
+ /* We round to 3 significant figures */
+ round_factor = pow(10.0, SIGFIGS - ceil(log10(value)));
+ value = ((ErtsSysPerfCounter)(value * round_factor + 0.5)) / round_factor;
+
+ /* convert to ticks per second */
+ return 1000000 * value;
+}
+
+static int have_rdtscp(void)
+{
+#if defined(ETHR_X86_RUNTIME_CONF__)
+ /* On early x86 cpu's the tsc varies with the current speed of the cpu,
+ which means that the time per tick vary depending on the current
+ load of the cpu. We do not want this as it would give very scewed
+ numbers when the cpu is mostly idle.
+ The linux kernel seems to think that checking for constant and
+ reliable is enough to trust the counter so we do the same.
+
+ If this test is not good enough, I don't know what we'll do.
+ Maybe fallback on erts_sys_hrtime always, but that would be a shame as
+ rdtsc is about 3 times faster than hrtime... */
+ return ETHR_X86_RUNTIME_CONF_HAVE_CONSTANT_TSC__ &&
+ ETHR_X86_RUNTIME_CONF_HAVE_TSC_RELIABLE__;
+#else
+ return 0;
+#endif
+}
+
+static ErtsSysPerfCounter rdtsc(void)
+{
+ /* It may have been a good idea to put the cpuid instruction before
+ the rdtsc, but I decided against it because it is not really
+ needed for msacc, and it slows it down by quite a bit (5-7 times slower).
+ As a result though, this timestamp becomes much less
+ accurate as it might be re-ordered to be executed way before or after this
+ function is called.
+ */
+ ErtsSysPerfCounter ts;
+#if defined(__x86_64__)
+ __asm__ __volatile__ ("rdtsc\n\t"
+ "shl $32, %%rdx\n\t"
+ "or %%rdx, %0" : "=a" (ts) : : "rdx");
+#elif defined(__i386__)
+ __asm__ __volatile__ ("rdtsc\n\t"
+ : "=A" (ts) );
+#endif
+ return ts;
+}
+
+static void init_perf_counter(void)
+{
+ if (have_rdtscp()) {
+ erts_sys_time_data__.r.o.perf_counter = rdtsc;
+ erts_sys_time_data__.r.o.perf_counter_unit = calculate_perf_counter_unit();
+ } else {
+ erts_sys_time_data__.r.o.perf_counter = erts_sys_hrtime;
+ erts_sys_time_data__.r.o.perf_counter_unit = ERTS_HRTIME_UNIT;
+ }
+}
+
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#ifdef HAVE_GETHRVTIME_PROCFS_IOCTL
+/* The code below only has effect on solaris < 10,
+ needed in order for gehhrvtime to work properly */
int sys_start_hrvtime(void)
{
long msacct = PR_MSACCT;
diff --git a/erts/emulator/sys/unix/sys_uds.c b/erts/emulator/sys/unix/sys_uds.c
new file mode 100644
index 0000000000..015d0346a1
--- /dev/null
+++ b/erts/emulator/sys/unix/sys_uds.c
@@ -0,0 +1,155 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2002-2009. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+#include "sys_uds.h"
+
+int
+sys_uds_readv(int fd, struct iovec *iov, size_t iov_len,
+ int *fds, int fd_count, int flags) {
+ struct msghdr msg;
+ struct cmsghdr *cmsg = NULL;
+ char ancillary_buff[256] = {0};
+ int res, i = 0;
+
+ /* setup a place to fill in message contents */
+ memset(&msg, 0, sizeof(struct msghdr));
+ msg.msg_iov = iov;
+ msg.msg_iovlen = iov_len;
+
+ /* provide space for the ancillary data */
+ msg.msg_control = ancillary_buff;
+ msg.msg_controllen = sizeof(ancillary_buff);
+
+ if((res = recvmsg(fd, &msg, flags)) < 0) {
+#if defined(__APPLE__) && defined(__MACH__) && !defined(__DARWIN__)
+ /* When some OS X versions run out of fd's
+ they give EMSGSIZE instead of EMFILE.
+ We remap this as we want the correct
+ error to appear for the user */
+ if (errno == EMSGSIZE)
+ errno = EMFILE;
+#endif
+ return res;
+ }
+
+ if((msg.msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
+ {
+ /* We assume that we have given enough space for any header
+ that are sent to us. So the only remaining reason to get
+ this flag set is if the caller has run out of file descriptors.
+ */
+ errno = EMFILE;
+ return -1;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg) ) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ int *cmsg_data = (int *)CMSG_DATA(cmsg);
+ while ((char*)cmsg_data < (char*)cmsg + cmsg->cmsg_len) {
+ if (i < fd_count) {
+ fds[i++] = *cmsg_data++;
+ } else {
+ /* for some strange reason, we have received more FD's
+ than we wanted... close them if we are not running
+ debug. */
+ if(i >= fd_count) abort();
+ close(*cmsg_data++);
+ }
+ }
+ }
+ }
+
+ return res;
+}
+
+int
+sys_uds_read(int fd, char *buff, size_t len,
+ int *fds, int fd_count, int flags) {
+ struct iovec iov;
+ iov.iov_base = buff;
+ iov.iov_len = len;
+ return sys_uds_readv(fd, &iov, 1, fds, fd_count, flags);
+}
+
+
+int
+sys_uds_writev(int fd, struct iovec *iov, size_t iov_len,
+ int *fds, int fd_count, int flags) {
+
+ struct msghdr msg;
+ struct cmsghdr *cmsg = NULL;
+ int res, i;
+
+ /* initialize socket message */
+ memset(&msg, 0, sizeof(struct msghdr));
+
+ /* We flatten the iov if it is too long */
+ if (iov_len > MAXIOV) {
+ int size = 0;
+ char *buff;
+ for (i = 0; i < iov_len; i++)
+ size += iov[i].iov_len;
+ buff = malloc(size);
+
+ for (i = 0; i < iov_len; i++) {
+ memcpy(buff, iov[i].iov_base, iov[i].iov_len);
+ buff += iov[i].iov_len;
+ }
+
+ iov[0].iov_base = buff - size;
+ iov[0].iov_len = size;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ } else {
+ msg.msg_iov = iov;
+ msg.msg_iovlen = iov_len;
+ }
+
+ /* initialize the ancillary data */
+ msg.msg_control = calloc(1, CMSG_SPACE(sizeof(int) * fd_count));
+ msg.msg_controllen = CMSG_SPACE(sizeof(int) * fd_count);
+
+ /* copy the fd array into the ancillary data */
+ cmsg = CMSG_FIRSTHDR(&msg);
+ if(!cmsg) abort();
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int) * fd_count);
+ memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * fd_count);
+
+ res = sendmsg(fd, &msg, flags);
+
+ if (iov_len > MAXIOV)
+ free(iov[0].iov_base);
+
+ free(msg.msg_control);
+
+ return res;
+}
+
+int
+sys_uds_write(int fd, char *buff, size_t len,
+ int *fds, int fd_count, int flags) {
+ struct iovec iov;
+ iov.iov_base = buff;
+ iov.iov_len = len;
+ return sys_uds_writev(fd, &iov, 1, fds, fd_count, flags);
+}
diff --git a/erts/emulator/sys/unix/sys_uds.h b/erts/emulator/sys/unix/sys_uds.h
new file mode 100644
index 0000000000..844a2804d8
--- /dev/null
+++ b/erts/emulator/sys/unix/sys_uds.h
@@ -0,0 +1,57 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2002-2009. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+#ifndef _ERL_UNIX_UDS_H
+#define _ERL_UNIX_UDS_H
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#if defined(__sun__) && !defined(_XOPEN_SOURCE)
+#define _XOPEN_SOURCE 500
+#endif
+
+#include <limits.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#if defined IOV_MAX
+#define MAXIOV IOV_MAX
+#elif defined UIO_MAXIOV
+#define MAXIOV UIO_MAXIOV
+#else
+#define MAXIOV 16
+#endif
+
+#include "sys.h"
+
+int sys_uds_readv(int fd, struct iovec *iov, size_t iov_len,
+ int *fds, int fd_count, int flags);
+int sys_uds_read(int fd, char *buff, size_t len,
+ int *fds, int fd_count, int flags);
+int sys_uds_writev(int fd, struct iovec *iov, size_t iov_len,
+ int *fds, int fd_count, int flags);
+int sys_uds_write(int fd, char *buff, size_t len,
+ int *fds, int fd_count, int flags);
+
+#endif /* #ifndef _ERL_UNIX_UDS_H */