From 5ed225662a6e48877a2ac0bef7c080297d64561e Mon Sep 17 00:00:00 2001 From: Luca Favatella Date: Tue, 26 Aug 2014 12:09:54 +0100 Subject: Clarify error for slow `cpu_sup` port init I noticed that running an R16B03-1 node on an overloaded host produced log entries like the following ones: ``` 2014-08-22 21:52:31 =ERROR REPORT==== Error in process <0.24112.3> on node '...@127.0.0.1' with exit value: {{case_clause,{data,4711}},[{cpu_sup,get_uint32_measurement,2,[{file,"cpu_sup.erl"},{line,227}]},{cpu_sup,measurement_server_loop,1,[{file,"cpu_sup.erl"},{line,585}]}]} ``` ``` ===== ALIVE Fri Aug 22 21:50:14 CEST 2014 [os_mon] cpu supervisor port (cpu_sup): Erlang has closed [os_mon] cpu supervisor port (cpu_sup): Erlang has closed [os_mon] cpu supervisor port (cpu_sup): Erlang has closed [os_mon] cpu supervisor port (cpu_sup): Erlang has closed ===== ALIVE Fri Aug 22 22:07:46 CEST 2014 ``` I performed a code inspection on the `cpu_sup` module and I concluded that the `case_clause` error shows a small issue in the `cpu_sup` module, that happens when the port used in `cpu_sup` is slow to start - as it may happen on an overloaded node. The `cpu_sup` `gen_server` process keeps in its state the pid of an unlinked process (called "measurement server" - see `cpu_sup:init/1`), in order to do dirty stuff (e.g. reading the filesystem, running OS commands) and start_link-ing & managing the connected process to a port (called "port server" - see `measurement_server_init/0`). So the process organization looks like this: ``` cpu_sup - measurement server - port server - port ``` When the measurement server start_links the port server (see `port_server_start/0`) it sends a `{self(), ?ping}` message to it and expects an answer within 6s, otherwise it returns `{error, timeout}` rather than the pid. This has two issues: * The measurement server keeps `{error, ...}` in its state as if it were a pid - that makes no sense; * A late `{Pid, {data,4711}}` response may arrive in the mailbox of the measurement server, that will believe it to be a request to be processed, causing a `case_clause` error. This commit teaches the measurement server to check the success of the initialization of the port server by matching on the return value of `port_server_start/0` (renamed to `port_server_start_link/0` for the sake of clarity) in order to fail earlier and with an error clearer than `{case_clause,{data,4711}}`. In such case I expect the measurement server to be restarted by the `cpu_sup` `gen_server` (see `handle_call/3`) - as before. BTW It is not clear to me when the `handle_info({'EXIT', _Port, Reason}, State)` may be called (the `cpu_sup` `gen_server` does not link to the measurement server) but I am leaving it. --- lib/os_mon/src/cpu_sup.erl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/os_mon/src/cpu_sup.erl b/lib/os_mon/src/cpu_sup.erl index 34251178ee..1f088ecbde 100644 --- a/lib/os_mon/src/cpu_sup.erl +++ b/lib/os_mon/src/cpu_sup.erl @@ -543,7 +543,8 @@ measurement_server_init() -> Server = case OS of {unix, Flavor} when Flavor==sunos; Flavor==linux -> - port_server_start(); + {ok, Pid} = port_server_start_link(), + Pid; {unix, Flavor} when Flavor==darwin; Flavor==freebsd; Flavor==dragonfly; @@ -588,8 +589,9 @@ measurement_server_loop(State) -> Error -> Pid ! {error, Error} end, measurement_server_loop(State); - {'EXIT', Pid, _n} when State#internal.port == Pid -> - measurement_server_loop(State#internal{port = port_server_start()}); + {'EXIT', OldPid, _n} when State#internal.port == OldPid -> + {ok, NewPid} = port_server_start_link(), + measurement_server_loop(State#internal{port = NewPid}); _Other -> measurement_server_loop(State) end. @@ -605,12 +607,12 @@ port_server_call(Pid, Command) -> {Pid, {error, Reason}} -> {error, Reason} end. -port_server_start() -> +port_server_start_link() -> Timeout = 6000, Pid = spawn_link(fun() -> port_server_init(Timeout) end), Pid ! {self(), ?ping}, receive - {Pid, {data,4711}} -> Pid; + {Pid, {data,4711}} -> {ok, Pid}; {error,Reason} -> {error, Reason} after Timeout -> {error, timeout} -- cgit v1.2.3