Merge branch 'lukas/erts/cleanup_scheduler_start' into maint

* lukas/erts/cleanup_scheduler_start: erts: Reduce test time for multi_load in valgrind erts: Randomize valgrind taskset CPU erts: Make dump_SUITE:free_dump tc more robust to different systems erts: Fix distr SUITE latency testcases erts: Fix gc disable when terminating process erts: Cleanup start of all erts threads to ABORT when failing
author: Lukas Larsson <[email protected]> 2019-06-28 10:48:43 +0200
committer: Lukas Larsson <[email protected]> 2019-06-28 10:48:43 +0200
commit: c29006892cdddd95f32a7b6fc41eb3d8065c0f39 (patch)
tree: 5795e9d5f0bec9333b6c8d458149420efdae4834 /erts
parent: 42ab20bfe6cfaf7e08b97ba0fbfffa86da6dc821 (diff)
parent: b942df8593b6295e61eb767008d6e93a2cc34665 (diff)
download: otp-c29006892cdddd95f32a7b6fc41eb3d8065c0f39.tar.gz
otp-c29006892cdddd95f32a7b6fc41eb3d8065c0f39.tar.bz2
otp-c29006892cdddd95f32a7b6fc41eb3d8065c0f39.zip
5 files changed, 77 insertions, 67 deletions
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 1f6adb98ef..de0564292d 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -8568,9 +8568,6 @@ erts_start_schedulers(void)
 {
     ethr_tid tid;
     int res = 0;
-    Uint actual;
-    Uint wanted = erts_no_schedulers;
-    Uint wanted_no_schedulers = erts_no_schedulers;
     char name[16];
     ethr_thr_opts opts = ETHR_THR_OPTS_DEFAULT_INITER;
     int ix;
@@ -8584,40 +8581,34 @@ erts_start_schedulers(void)
         erts_snprintf(opts.name, 16, "runq_supervisor");
 	erts_atomic_init_nob(&runq_supervisor_sleeping, 0);
 	if (0 != ethr_event_init(&runq_supervision_event))
-	    erts_exit(ERTS_ERROR_EXIT, "Failed to create run-queue supervision event\n");
+	    erts_exit(ERTS_ABORT_EXIT, "Failed to create run-queue supervision event\n");
         res = ethr_thr_create(&runq_supervisor_tid,
                               runq_supervisor,
                               NULL,
                               &opts);
 	if (0 != res)
-	    erts_exit(ERTS_ERROR_EXIT, "Failed to create run-queue supervision thread, "
+	    erts_exit(ERTS_ABORT_EXIT, "Failed to create run-queue supervision thread, "
                       "error = %d\n", res);
 
     }
 
     opts.suggested_stack_size = erts_sched_thread_suggested_stack_size;
 
-    if (wanted < 1)
-	wanted = 1;
-    if (wanted > ERTS_MAX_NO_OF_SCHEDULERS) {
-	wanted = ERTS_MAX_NO_OF_SCHEDULERS;
-	res = ENOTSUP;
-    }
-
-    for (actual = 0; actual < wanted; actual++) {
-	ErtsSchedulerData *esdp = ERTS_SCHEDULER_IX(actual);
-
-	ASSERT(actual == esdp->no - 1);
-
-	erts_snprintf(opts.name, 16, "%lu_scheduler", actual + 1);
+    ASSERT(erts_no_schedulers > 0 && erts_no_schedulers <= ERTS_MAX_NO_OF_SCHEDULERS);
 
+    for (ix = 0; ix < erts_no_schedulers; ix++) {
+	ErtsSchedulerData *esdp = ERTS_SCHEDULER_IX(ix);
+	ASSERT(ix == esdp->no - 1);
+	erts_snprintf(opts.name, 16, "%lu_scheduler", ix + 1);
 	res = ethr_thr_create(&esdp->tid, sched_thread_func, (void*)esdp, &opts);
-
 	if (res != 0) {
-           break;
+           erts_exit(ERTS_ABORT_EXIT, "Failed to create scheduler thread %d, error = %d\n", ix, res);
 	}
     }
-    erts_no_schedulers = actual;
+
+    /* Probably not needed as thread create will imply a memory barrier,
+       but we do one just to be safe. */
+    ERTS_THR_MEMORY_BARRIER;
 
     {
 	for (ix = 0; ix < erts_no_dirty_cpu_schedulers; ix++) {
@@ -8626,7 +8617,7 @@ erts_start_schedulers(void)
             opts.suggested_stack_size = erts_dcpu_sched_thread_suggested_stack_size;
 	    res = ethr_thr_create(&esdp->tid,sched_dirty_cpu_thread_func,(void*)esdp,&opts);
 	    if (res != 0)
-		erts_exit(ERTS_ERROR_EXIT, "Failed to create dirty cpu scheduler thread %d, error = %d\n", ix, res);
+		erts_exit(ERTS_ABORT_EXIT, "Failed to create dirty cpu scheduler thread %d, error = %d\n", ix, res);
 	}
 	for (ix = 0; ix < erts_no_dirty_io_schedulers; ix++) {
 	    ErtsSchedulerData *esdp = ERTS_DIRTY_IO_SCHEDULER_IX(ix);
@@ -8634,40 +8625,22 @@ erts_start_schedulers(void)
             opts.suggested_stack_size = erts_dio_sched_thread_suggested_stack_size;
 	    res = ethr_thr_create(&esdp->tid,sched_dirty_io_thread_func,(void*)esdp,&opts);
 	    if (res != 0)
-		erts_exit(ERTS_ERROR_EXIT, "Failed to create dirty io scheduler thread %d, error = %d\n", ix, res);
+		erts_exit(ERTS_ABORT_EXIT, "Failed to create dirty io scheduler thread %d, error = %d\n", ix, res);
 	}
     }
 
-    ERTS_THR_MEMORY_BARRIER;
-
     erts_snprintf(opts.name, 16, "aux");
 
     res = ethr_thr_create(&tid, aux_thread, NULL, &opts);
     if (res != 0)
-	erts_exit(ERTS_ERROR_EXIT, "Failed to create aux thread, error = %d\n", res);
+	erts_exit(ERTS_ABORT_EXIT, "Failed to create aux thread, error = %d\n", res);
 
     for (ix = 0; ix < erts_no_poll_threads; ix++) {
         erts_snprintf(opts.name, 16, "%d_poller", ix);
 
         res = ethr_thr_create(&tid, poll_thread, (void*)(UWord)ix, &opts);
         if (res != 0)
-            erts_exit(ERTS_ERROR_EXIT, "Failed to create poll thread\n");
-    }
-
-    if (actual < 1)
-	erts_exit(ERTS_ERROR_EXIT,
-		 "Failed to create any scheduler-threads: %s (%d)\n",
-		 erl_errno_id(res),
-		 res);
-    if (res != 0) {
-	erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf();
-	ASSERT(actual != wanted_no_schedulers);
-	erts_dsprintf(dsbufp,
-		      "Failed to create %beu scheduler-threads (%s:%d); "
-		      "only %beu scheduler-thread%s created.\n",
-		      wanted_no_schedulers, erl_errno_id(res), res,
-		      actual, actual == 1 ? " was" : "s were");
-	erts_send_error_to_logger_nogl(dsbufp);
+            erts_exit(ERTS_ABORT_EXIT, "Failed to create poll thread\n");
     }
 }
 
@@ -12097,6 +12070,7 @@ erts_proc_exit_handle_dist_monitor(ErtsMonitor *mon, void *vctxt, Sint reds)
     ErtsHeapFactory factory;
     Sint reds_consumed = 0;
 
+    ASSERT(c_p->flags & F_DISABLE_GC);
     ASSERT(erts_monitor_is_target(mon) && mon->type == ERTS_MON_TYPE_DIST_PROC);
 
     mdp = erts_monitor_to_data(mon);
@@ -12144,7 +12118,6 @@ erts_proc_exit_handle_dist_monitor(ErtsMonitor *mon, void *vctxt, Sint reds)
         switch (code) {
         case ERTS_DSIG_SEND_CONTINUE:
         case ERTS_DSIG_SEND_YIELD:
-            erts_set_gc_state(c_p, 0);
             ctxt->dist_state = erts_dsend_export_trap_context(c_p, &ctx);
             reds_consumed = reds; /* force yield */
             break;
@@ -12152,7 +12125,6 @@ erts_proc_exit_handle_dist_monitor(ErtsMonitor *mon, void *vctxt, Sint reds)
             break;
         case ERTS_DSIG_SEND_TOO_LRG:
             erts_kill_dist_connection(dep, dist->connection_id);
-            erts_set_gc_state(c_p, 1);
             break;
         default:
             ASSERT(! "Invalid dsig send exit monitor result");
@@ -12356,6 +12328,7 @@ erts_proc_exit_handle_dist_link(ErtsLink *lnk, void *vctxt, Sint reds)
     ErtsHeapFactory factory;
     Sint reds_consumed = 0;
 
+    ASSERT(c_p->flags & F_DISABLE_GC);
     ASSERT(lnk->type == ERTS_LNK_TYPE_DIST_PROC);
     dlnk = erts_link_to_other(lnk, &ldp);
     dist = ((ErtsLinkDataExtended *) ldp)->dist;
@@ -12395,7 +12368,6 @@ erts_proc_exit_handle_dist_link(ErtsLink *lnk, void *vctxt, Sint reds)
         switch (code) {
         case ERTS_DSIG_SEND_YIELD:
         case ERTS_DSIG_SEND_CONTINUE:
-            erts_set_gc_state(c_p, 0);
             ctxt->dist_state = erts_dsend_export_trap_context(c_p, &ctx);
             reds_consumed = reds; /* force yield */
             break;
@@ -12403,7 +12375,6 @@ erts_proc_exit_handle_dist_link(ErtsLink *lnk, void *vctxt, Sint reds)
             break;
         case ERTS_DSIG_SEND_TOO_LRG:
             erts_kill_dist_connection(dep, dist->connection_id);
-            erts_set_gc_state(c_p, 1);
             break;
         default:
             ASSERT(! "Invalid dsig send exit monitor result");
@@ -12951,6 +12922,8 @@ restart:
         yield_allowed = 0;
 #endif
 
+        /* Enable GC again, through strictly not needed it puts
+           the process in a consistent state. */
         erts_set_gc_state(p, 1);
 
         /* Set state to not active as we don't want this process
diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl
index 7885d35d9d..9dcdd60060 100644
--- a/erts/emulator/test/distribution_SUITE.erl
+++ b/erts/emulator/test/distribution_SUITE.erl
@@ -1400,6 +1400,10 @@ get_conflicting_unicode_atoms(CIX, N) ->
 %% The message_latency_large tests that small distribution messages are
 %% not blocked by other large distribution messages. Basically it tests
 %% that fragmentation of distribution messages works.
+%%
+%% Because of large problems to get reliable values from these testcases
+%% they no longer fail when the latency is incorrect. However, they are
+%% kept as they continue to find bugs in the distribution implementation.
 message_latency_large_message(Config) when is_list(Config) ->
     measure_latency_large_message(?FUNCTION_NAME, fun(Dropper, Payload) -> Dropper ! Payload end).
 
@@ -1484,7 +1488,11 @@ measure_latency_large_message(Nodename, DataFun) ->
 
     case {lists:max(Times), lists:min(Times)} of
         {Max, Min} when Max * 0.25 > Min, BuildType =:= opt ->
-            ct:fail({incorrect_latency, IndexTimes});
+            %% We only issue a comment for this failure as the
+            %% testcases proved very difficult to run successfully
+            %% on many platforms.
+            ct:comment({incorrect_latency, IndexTimes}),
+            ok;
         _ ->
             ok
     end.
@@ -1503,10 +1511,7 @@ measure_latency(DataFun, Dropper, Echo, Payload) ->
                          end
                  end) || _ <- lists:seq(1,2)],
 
-    [receive
-         {monitor, _Sender, busy_dist_port, _Info} ->
-             ok
-     end || _ <- lists:seq(1,10)],
+    wait_for_busy_dist(2 * 60 * 1000, 10),
 
     {TS, Times} =
         timer:tc(fun() ->
@@ -1530,6 +1535,18 @@ measure_latency(DataFun, Dropper, Echo, Payload) ->
      end || {Sender, Ref} <- Senders],
     TS.
 
+wait_for_busy_dist(_Tmo, 0) ->
+    ok;
+wait_for_busy_dist(Tmo, N) ->
+    T0 = erlang:monotonic_time(millisecond),
+    receive
+         {monitor, _Sender, busy_dist_port, _Info} ->
+             wait_for_busy_dist(Tmo - (erlang:monotonic_time(millisecond) - T0), N - 1)
+    after Tmo ->
+            ct:log("Timed out waiting for busy_dist, ~p left",[N]),
+            timeout
+    end.
+
 flush() ->
     receive
         _ ->
@@ -2600,7 +2617,7 @@ verify_nc(Node) ->
             demonitor(MonRef,[flush]),
             ok;
         {Ref, Error} ->
-            ct:log("~p",[Error]),
+            ct:log("~s",[Error]),
             ct:fail(failed_nc_refc_check);
         {'DOWN', MonRef, _, _, _} = Down ->
             ct:log("~p",[Down]),
diff --git a/erts/emulator/test/dump_SUITE.erl b/erts/emulator/test/dump_SUITE.erl
index 9f8ac42fa9..b7da69e556 100644
--- a/erts/emulator/test/dump_SUITE.erl
+++ b/erts/emulator/test/dump_SUITE.erl
@@ -140,13 +140,13 @@ free_dump(Config) when is_list(Config) ->
     {ok, NodeA} = start_node(Config),
     {ok, NodeB} = start_node(Config),
 
-
     Self = self(),
 
     PidA = spawn_link(
              NodeA,
              fun() ->
                      Self ! ready,
+                     Reason = lists:duplicate(1000000,100),
                      receive
                          ok ->
                              spawn(fun() ->
@@ -154,24 +154,29 @@ free_dump(Config) when is_list(Config) ->
                                            timer:sleep(5),
                                            receive
                                                M ->
-                                                   io:format("~p",[M]),
-                                                   erlang:halt("dump")
-                                           end
+                                                   io:format("~p",[M])
+%% We may want to add this timeout here in-case no busy condition is triggered
+%%                                           after 60 * 1000 ->
+%%                                                   io:format("Timeout")
+                                           end,
+                                           erlang:halt("dump")
                                    end),
-                             exit(lists:duplicate(1000000,100))
+                             exit(Reason)
                      end
              end),
 
-    spawn_link(NodeB,
-               fun() ->
-                       [erlang:monitor(process, PidA) || _ <- lists:seq(1,10000)],
-                       Self ! done,
-                       receive _ -> ok end
-               end),
+    PidB = spawn_link(NodeB,
+                      fun() ->
+                              [erlang:monitor(process, PidA) || _ <- lists:seq(1,10000)],
+                              Self ! done,
+                              receive _ -> ok end
+                      end),
 
     receive done -> ok end,
     true = rpc:call(NodeA, os, putenv, ["ERL_CRASH_DUMP",Dump]),
-    ct:pal("~p",[rpc:call(NodeA, distribution_SUITE, make_busy, [NodeB, 1000])]),
+    %% Make the node busy towards NodeB for 10 seconds.
+    BusyPid = rpc:call(NodeA, distribution_SUITE, make_busy, [NodeB,10000]),
+    ct:pal("~p",[BusyPid]),
 
     receive ready -> unlink(PidA), PidA ! ok end,
 
@@ -185,6 +190,10 @@ free_dump(Config) when is_list(Config) ->
 
     file:delete(Dump),
 
+    unlink(PidB),
+
+    rpc:call(NodeB, erlang, halt, [0]),
+
     ok.
 
 
diff --git a/erts/emulator/test/multi_load_SUITE.erl b/erts/emulator/test/multi_load_SUITE.erl
index edf3205812..c79e2b6dcd 100644
--- a/erts/emulator/test/multi_load_SUITE.erl
+++ b/erts/emulator/test/multi_load_SUITE.erl
@@ -30,7 +30,15 @@ all() ->
     [many,on_load,errors].
 
 many(_Config) ->
-    Ms = make_modules(100, fun many_module/1),
+
+    N = case erlang:system_info(build_type) of
+            valgrind ->
+                10;
+            _ ->
+                100
+        end,
+
+    Ms = make_modules(N, fun many_module/1),
 
     io:put_chars("Light load\n"
 		 "=========="),
diff --git a/erts/etc/unix/cerl.src b/erts/etc/unix/cerl.src
index 710a7a9ef6..59de9bdec8 100644
--- a/erts/etc/unix/cerl.src
+++ b/erts/etc/unix/cerl.src
@@ -312,8 +312,11 @@ if [ "x$GDB" = "x" ]; then
 	    # on multiple cores (especially with async threads). Valgrind only run one pthread
 	    # at a time anyway so there is no point letting it utilize more than one core.
 	    # Use $sched_arg to force all schedulers online to emulate multicore.
-	    taskset1="taskset 1"
 	    ncpu=`cat /proc/cpuinfo | grep -w processor | wc -l`
+            # Choose a random core in order to not collide with any other valgrind
+            # run on the same machine.
+            taskset1=$((1 << (`shuf -i 1-$ncpu -n 1` - 1) ))
+	    taskset1="taskset $taskset1"
 	    sched_arg="-S$ncpu:$ncpu"
 	else
 	    taskset1=
author	Lukas Larsson <[email protected]>	2019-06-28 10:48:43 +0200
committer	Lukas Larsson <[email protected]>	2019-06-28 10:48:43 +0200
commit	c29006892cdddd95f32a7b6fc41eb3d8065c0f39 (patch)
tree	5795e9d5f0bec9333b6c8d458149420efdae4834 /erts
parent	42ab20bfe6cfaf7e08b97ba0fbfffa86da6dc821 (diff)
parent	b942df8593b6295e61eb767008d6e93a2cc34665 (diff)
download	otp-c29006892cdddd95f32a7b6fc41eb3d8065c0f39.tar.gz otp-c29006892cdddd95f32a7b6fc41eb3d8065c0f39.tar.bz2 otp-c29006892cdddd95f32a7b6fc41eb3d8065c0f39.zip