aboutsummaryrefslogtreecommitdiffstats
path: root/lib/os_mon
diff options
context:
space:
mode:
authorLuca Favatella <[email protected]>2014-08-26 12:09:54 +0100
committerLuca Favatella <[email protected]>2014-08-29 20:45:40 +0100
commit5ed225662a6e48877a2ac0bef7c080297d64561e (patch)
treec8bd9c9b9c42712be6f4d33d7b91b51bf714a946 /lib/os_mon
parent83113d9f1df48dc73fbff82705e8aa3fb3af6723 (diff)
downloadotp-5ed225662a6e48877a2ac0bef7c080297d64561e.tar.gz
otp-5ed225662a6e48877a2ac0bef7c080297d64561e.tar.bz2
otp-5ed225662a6e48877a2ac0bef7c080297d64561e.zip
Clarify error for slow `cpu_sup` port init
I noticed that running an R16B03-1 node on an overloaded host produced log entries like the following ones: ``` 2014-08-22 21:52:31 =ERROR REPORT==== Error in process <0.24112.3> on node '[email protected]' with exit value: {{case_clause,{data,4711}},[{cpu_sup,get_uint32_measurement,2,[{file,"cpu_sup.erl"},{line,227}]},{cpu_sup,measurement_server_loop,1,[{file,"cpu_sup.erl"},{line,585}]}]} ``` ``` ===== ALIVE Fri Aug 22 21:50:14 CEST 2014 [os_mon] cpu supervisor port (cpu_sup): Erlang has closed [os_mon] cpu supervisor port (cpu_sup): Erlang has closed [os_mon] cpu supervisor port (cpu_sup): Erlang has closed [os_mon] cpu supervisor port (cpu_sup): Erlang has closed ===== ALIVE Fri Aug 22 22:07:46 CEST 2014 ``` I performed a code inspection on the `cpu_sup` module and I concluded that the `case_clause` error shows a small issue in the `cpu_sup` module, that happens when the port used in `cpu_sup` is slow to start - as it may happen on an overloaded node. The `cpu_sup` `gen_server` process keeps in its state the pid of an unlinked process (called "measurement server" - see `cpu_sup:init/1`), in order to do dirty stuff (e.g. reading the filesystem, running OS commands) and start_link-ing & managing the connected process to a port (called "port server" - see `measurement_server_init/0`). So the process organization looks like this: ``` cpu_sup - measurement server - port server - port ``` When the measurement server start_links the port server (see `port_server_start/0`) it sends a `{self(), ?ping}` message to it and expects an answer within 6s, otherwise it returns `{error, timeout}` rather than the pid. This has two issues: * The measurement server keeps `{error, ...}` in its state as if it were a pid - that makes no sense; * A late `{Pid, {data,4711}}` response may arrive in the mailbox of the measurement server, that will believe it to be a request to be processed, causing a `case_clause` error. This commit teaches the measurement server to check the success of the initialization of the port server by matching on the return value of `port_server_start/0` (renamed to `port_server_start_link/0` for the sake of clarity) in order to fail earlier and with an error clearer than `{case_clause,{data,4711}}`. In such case I expect the measurement server to be restarted by the `cpu_sup` `gen_server` (see `handle_call/3`) - as before. BTW It is not clear to me when the `handle_info({'EXIT', _Port, Reason}, State)` may be called (the `cpu_sup` `gen_server` does not link to the measurement server) but I am leaving it.
Diffstat (limited to 'lib/os_mon')
-rw-r--r--lib/os_mon/src/cpu_sup.erl12
1 files changed, 7 insertions, 5 deletions
diff --git a/lib/os_mon/src/cpu_sup.erl b/lib/os_mon/src/cpu_sup.erl
index 34251178ee..1f088ecbde 100644
--- a/lib/os_mon/src/cpu_sup.erl
+++ b/lib/os_mon/src/cpu_sup.erl
@@ -543,7 +543,8 @@ measurement_server_init() ->
Server = case OS of
{unix, Flavor} when Flavor==sunos;
Flavor==linux ->
- port_server_start();
+ {ok, Pid} = port_server_start_link(),
+ Pid;
{unix, Flavor} when Flavor==darwin;
Flavor==freebsd;
Flavor==dragonfly;
@@ -588,8 +589,9 @@ measurement_server_loop(State) ->
Error -> Pid ! {error, Error}
end,
measurement_server_loop(State);
- {'EXIT', Pid, _n} when State#internal.port == Pid ->
- measurement_server_loop(State#internal{port = port_server_start()});
+ {'EXIT', OldPid, _n} when State#internal.port == OldPid ->
+ {ok, NewPid} = port_server_start_link(),
+ measurement_server_loop(State#internal{port = NewPid});
_Other ->
measurement_server_loop(State)
end.
@@ -605,12 +607,12 @@ port_server_call(Pid, Command) ->
{Pid, {error, Reason}} -> {error, Reason}
end.
-port_server_start() ->
+port_server_start_link() ->
Timeout = 6000,
Pid = spawn_link(fun() -> port_server_init(Timeout) end),
Pid ! {self(), ?ping},
receive
- {Pid, {data,4711}} -> Pid;
+ {Pid, {data,4711}} -> {ok, Pid};
{error,Reason} -> {error, Reason}
after Timeout ->
{error, timeout}