%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2010. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
-module(hdlt_slave).
-export([start_link/4, start_link/5, start_link/6, stop/1]).
%% Internal exports
-export([wait_for_slave/9, slave_start/1, wait_for_master_to_die/3]).
-include("hdlt_logger.hrl").
-define(SSH_PORT, 22).
-define(TIMEOUT, 60000).
-define(LOGGER, hdlt_logger).
%% ***********************************************************************
%% start_link/4,5,6 --
%%
%% The start_link/4,5,6 functions are used to start a slave Erlang node.
%% The node on which the start_link/N functions are used is called the
%% master in the description below.
%%
%% If hostname is the same for the master and the slave,
%% the Erlang node will simply be spawned. The only requirement for
%% this to work is that the 'erl' program can be found in PATH.
%%
%% If the master and slave are on different hosts, start_link/N uses
%% the 'rsh' program to spawn an Erlang node on the other host.
%% Alternatively, if the master was started as
%% 'erl -sname xxx -rsh my_rsh...', then 'my_rsh' will be used instead
%% of 'rsh' (this is useful for systems where the rsh program is named
%% 'remsh').
%% NOTE: the implementation in this module opens the connection with
%% the 'ssh' application (see ssh_slave_start/2), not an rsh program.
%%
%% For this to work, the following conditions must be fulfilled:
%%
%% 1. There must be an Rsh program on the computer; if not, an error
%% is returned.
%%
%% 2. The hosts must be configured to allow 'rsh' access without
%% prompting for a password.
%%
%% The slave node will have its filer and user server redirected
%% to the master. When the master node dies, the slave node will
%% terminate. For the start_link functions, the slave node will
%% terminate also if the process which called start_link terminates.
%%
%% Returns: {ok, Name@Host} |
%% {error, timeout} |
%% {error, no_rsh} |
%% {error, {already_running, Name@Host}}
%% start_link/4: start a slave node without extra command line
%% arguments, using the default debug level (silence).
%% Delegates to start_link/6.
start_link(Host, Name, ErlPath, Paths) ->
    NoExtraArgs = [],
    DefaultLevel = silence,
    start_link(Host, Name, ErlPath, Paths, NoExtraArgs, DefaultLevel).
%% start_link/5: the fifth argument is either a list of extra command
%% line arguments (debug level defaults to silence) or an atom debug
%% level (extra arguments default to []). Delegates to start_link/6.
start_link(Host, Name, ErlPath, Paths, ExtraArgs) when is_list(ExtraArgs) ->
    start_link(Host, Name, ErlPath, Paths, ExtraArgs, silence);
start_link(Host, Name, ErlPath, Paths, Level) when is_atom(Level) ->
    start_link(Host, Name, ErlPath, Paths, [], Level).
%% start_link/6: build the node name Name@Host and ping it first.
%% A node that already answers is reported as
%% {error, {already_running, Node}}; otherwise the start is attempted
%% and its result returned.
start_link(Host, Name, ErlPath, Paths, Args, DebugLevel) ->
    NodeString = lists:concat([Name, "@", Host]),
    Node = list_to_atom(NodeString),
    case net_adm:ping(Node) of
        pong ->
            {error, {already_running, Node}};
        pang ->
            start_it(Host, Name, Node, ErlPath, Paths, Args, DebugLevel)
    end.
%% Stops a running node by asking it (via rpc) to call erlang:halt/0.
%% The outcome of the rpc call is deliberately ignored; the function
%% always returns ok.
stop(Node) ->
    _Ignored = rpc:call(Node, erlang, halt, []),
    ok.
%% Starts a new slave node.
%%
%% Spawns a separate starter process running wait_for_slave/9 and
%% blocks until that process reports back with {result, Result}.
%% The calling process is passed both as the reply target and as the
%% process the slave is tied to.
%%
%% The starter is monitored: should it terminate before delivering a
%% result (for instance because of an unexpected crash), the caller
%% gets {error, {slave_starter_died, Reason}} instead of waiting
%% forever.
%%
%% Returns: whatever the starter reports ({ok, Node} | {error, Reason})
%%          or {error, {slave_starter_died, Reason}}
start_it(Host, Name, Node, ErlPath, Paths, Args, DebugLevel) ->
    Prog = filename:join([ErlPath, "erl"]),
    StarterArgs = [self(), Host, Name, Node, Paths, Args,
                   self(), Prog, DebugLevel],
    {Starter, MRef} = spawn_monitor(?MODULE, wait_for_slave, StarterArgs),
    receive
        {result, Result} ->
            %% The starter may live on as the slave controller, so
            %% drop the monitor (and any already queued 'DOWN').
            erlang:demonitor(MRef, [flush]),
            Result;
        {'DOWN', MRef, process, Starter, Reason} ->
            {error, {slave_starter_died, Reason}}
    end.
%% Waits for the slave to start.
%%
%% Runs in the starter process spawned by start_it/7:
%%   1. registers a unique name (the slave reports back to it),
%%   2. builds the 'erl' command line and executes it over ssh,
%%   3. waits up to 32 seconds for {SlavePid, slave_started}.
%%
%% On success the process turns into the slave controller (see
%% slave_started/5), which is what sends {result, {ok, Node}} to Parent.
%% On every failure path a {result, {error, _}} (or the non-ok result
%% of mk_cmd/6) is sent to Parent, so that the caller is never left
%% waiting without an answer:
%%   - timeout: a partially started node is halted, the ssh connection
%%     is closed and {error, timeout} is reported;
%%   - ssh_slave_start/2 threw {error, Reason}: reported as is;
%%   - anything else from the protected call (e.g. an 'EXIT' tuple):
%%     reported as {error, {slave_start_failed, Unexpected}}.
wait_for_slave(Parent, Host, Name, Node, Paths, Args,
               LinkTo, Prog, DebugLevel) ->
    ?SET_NAME("HDLT SLAVE STARTER"),
    ?SET_LEVEL(DebugLevel),
    ?DEBUG("begin", []),
    Waiter = register_unique_name(0),
    case mk_cmd(Host, Name, Paths, Args, Waiter, Prog) of
        {ok, Cmd} ->
            ?DEBUG("command generated: ~n~s", [Cmd]),
            case (catch ssh_slave_start(Host, Cmd)) of
                {ok, Conn, _Chan} ->
                    ?DEBUG("ssh channel created", []),
                    receive
                        {SlavePid, slave_started} ->
                            ?DEBUG("slave started: ~p", [SlavePid]),
                            unregister(Waiter),
                            slave_started(Parent, LinkTo, SlavePid, Conn,
                                          DebugLevel)
                    after 32000 ->
                            ?INFO("slave node failed to report in on time",
                                  []),
                            %% If it seems that the node was partially
                            %% started, try to kill it.
                            case net_adm:ping(Node) of
                                pong ->
                                    spawn(Node, erlang, halt, []),
                                    ok;
                                _ ->
                                    ok
                            end,
                            %% Best effort: do not leave the ssh
                            %% connection open after giving up.
                            _ = (catch ssh:close(Conn)),
                            Parent ! {result, {error, timeout}}
                    end;
                {error, Reason} = Error ->
                    ?INFO("FAILED starting node: "
                          "~n ~p"
                          "~n ~p", [Reason, Cmd]),
                    Parent ! {result, Error};
                Unexpected ->
                    %% E.g. {'EXIT', _} from the protected call; without
                    %% this clause the parent would never get a reply.
                    ?INFO("FAILED starting node (unexpected result): "
                          "~n ~p"
                          "~n ~p", [Unexpected, Cmd]),
                    Parent ! {result,
                              {error, {slave_start_failed, Unexpected}}}
            end;
        Other ->
            ?INFO("FAILED creating node command string: "
                  "~n ~p", [Other]),
            Parent ! {result, Other}
    end.
%% Connects to Host with ssh (port ?SSH_PORT, host keys silently
%% accepted), opens a session channel and executes ErlCmd on it.
%%
%% Returns: {ok, Connection, Channel}
%% Throws:  {error, {ssh_connect_failed, Term}}        |
%%          {error, {ssh_channel_create_failed, Term}} |
%%          {error, {ssh_exec_failed, Term}}
%%
%% If a step after the connect fails, the connection is closed before
%% the error is thrown, so that a failed start does not leave an open
%% ssh connection behind.
ssh_slave_start(Host, ErlCmd) ->
    ?DEBUG("ssh_slave_start -> try connect to ~p", [Host]),
    Connection =
        case (catch ssh:connect(Host, ?SSH_PORT,
                                [{silently_accept_hosts, true}])) of
            {ok, Conn} ->
                ?DEBUG("ssh_exec_erl -> connected: ~p", [Conn]),
                Conn;
            Error1 ->
                ?LOG("failed connecting to ~p: ~p", [Host, Error1]),
                throw({error, {ssh_connect_failed, Error1}})
        end,
    ?DEBUG("ssh_exec_erl -> connected - now create channel", []),
    Channel =
        case (catch ssh_connection:session_channel(Connection, ?TIMEOUT)) of
            {ok, Chan} ->
                ?DEBUG("ssh_exec_erl -> channel ~p created", [Chan]),
                Chan;
            Error2 ->
                ?LOG("failed creating channel: ~p", [Error2]),
                %% Best effort cleanup; must not mask the real error.
                _ = (catch ssh:close(Connection)),
                throw({error, {ssh_channel_create_failed, Error2}})
        end,
    ?DEBUG("ssh_exec_erl -> channel created - now exec command: "
           "~n ~p", [ErlCmd]),
    case (catch ssh_connection:exec(Connection, Channel, ErlCmd, infinity)) of
        success ->
            ?DEBUG("ssh_exec_erl -> command exec'ed - clean ssh msg", []),
            %% Discard the ssh_cm messages produced by the exec.
            clean_ssh_msg(),
            ?DEBUG("ssh_exec_erl -> done", []),
            {ok, Connection, Channel};
        Error3 ->
            ?LOG("failed exec command: ~p", [Error3]),
            %% Best effort cleanup; must not mask the real error.
            _ = (catch ssh:close(Connection)),
            throw({error, {ssh_exec_failed, Error3}})
    end.
%% Drains {ssh_cm, _, _} messages from the mailbox of the calling
%% process. Returns ok once no such message has arrived for one second.
clean_ssh_msg() ->
    receive
        {ssh_cm, _Conn, _Event} -> clean_ssh_msg()
    after 1000 -> ok
    end.
%% Called once the slave has reported in. Turns the calling process
%% into the slave controller: it traps exits, monitors both the master
%% process and the slave process, reports {ok, SlaveNode} to ReplyTo
%% and then enters the slave_running/5 loop.
slave_started(ReplyTo, Master, Slave, Conn, Level)
  when is_pid(Master), is_pid(Slave) ->
    process_flag(trap_exit, true),
    SlaveNode = node(Slave),
    CtrlName = lists:flatten(
                 io_lib:format("HDLT SLAVE CTRL[~p,~p]",
                               [self(), SlaveNode])),
    ?SET_NAME(CtrlName),
    ?SET_LEVEL(Level),
    ?LOG("initiating", []),
    MasterMon = erlang:monitor(process, Master),
    SlaveMon = erlang:monitor(process, Slave),
    ReplyTo ! {result, {ok, SlaveNode}},
    slave_running(Master, MasterMon, Slave, SlaveMon, Conn).
%% Controller loop.
%% If the master process goes down, the slave is sent a
%% {nodedown, node()} message (which makes it terminate) and the ssh
%% connection is closed. If the slave process goes down, only the ssh
%% connection is closed; the master process is left alone.
%% Any other message is logged and ignored.
slave_running(Master, MasterRef, Slave, SlaveRef, Conn) ->
    ?DEBUG("await message", []),
    receive
        {'DOWN', MasterRef, process, _Pid, _Why} ->
            ?LOG("received DOWN from master", []),
            erlang:demonitor(SlaveRef, [flush]),
            Slave ! {nodedown, node()},
            ssh:close(Conn);

        {'DOWN', SlaveRef, process, Pid, _Why} ->
            ?LOG("received DOWN from slave (~p)", [Pid]),
            erlang:demonitor(MasterRef, [flush]),
            ssh:close(Conn);

        Unknown ->
            ?DEBUG("received unknown: ~n~p", [Unknown]),
            slave_running(Master, MasterRef, Slave, SlaveRef, Conn)
    end.
%% Registers the calling process under the first free name of the form
%% <module>_waiter_<N>, trying N = Number, Number+1, ... in turn.
%% Returns the name (an atom) that was registered.
register_unique_name(Number) ->
    Candidate = list_to_atom(lists:concat([?MODULE, "_waiter_", Number])),
    try register(Candidate, self()) of
        true ->
            Candidate
    catch
        error:badarg ->
            register_unique_name(Number + 1)
    end.
%% Makes up the command line used to start the slave node.
%% The command runs Prog detached with the extra Args, gives the node
%% the short name Name@Host, makes it call ?MODULE:slave_start with the
%% master node name and the Waiter name, and adds one -pa option per
%% entry in Paths.
%% Returns: {ok, CommandString}
mk_cmd(Host, Name, Paths, Args, Waiter, Prog) ->
    CodePathOpts = lists:map(fun(Dir) -> [" -pa ", Dir] end, Paths),
    ProgPart = [Prog, " -detached -nopinput ", Args, " "],
    NamePart = [" -sname ", Name, "@", Host],
    StartPart = [" -s ", ?MODULE, " slave_start ", node(), " ", Waiter],
    Pieces = ProgPart ++ NamePart ++ StartPart ++ [" ", CodePathOpts],
    {ok, lists:flatten(lists:concat(Pieces))}.
%% This function will be invoked on the slave, using the -s option of erl.
%% It spawns a process that waits for the master node to terminate.
%% The argument list holds the master node name, the name of the waiter
%% process on the master and, optionally, a debug level (default: silence).
slave_start([Master, Waiter]) ->
    slave_start([Master, Waiter, silence]);
slave_start([Master, Waiter, Level]) ->
    MonitorArgs = [Master, Waiter, Level],
    spawn(?MODULE, wait_for_master_to_die, MonitorArgs).
%% Runs on the slave node. Starts monitoring the master node, tells the
%% waiter process on the master that the slave has started and then
%% enters wloop/1.
wait_for_master_to_die(Master, Waiter, Level) ->
    process_flag(trap_exit, true),
    NameArgs = [self(), node(), Master],
    MonitorName = lists:flatten(
                    io_lib:format("HDLT-SLAVE MASTER MONITOR[~p,~p,~p]",
                                  NameArgs)),
    ?SET_NAME(MonitorName),
    ?SET_LEVEL(Level),
    %% Ask for a nodedown message before announcing ourselves.
    erlang:monitor_node(Master, true),
    {Waiter, Master} ! {self(), slave_started},
    wloop(Master).
%% Loops until a {nodedown, Master} message arrives, then halts the
%% node. All other messages are discarded.
wloop(Master) ->
    ?DEBUG("await message", []),
    receive
        {nodedown, Master} ->
            ?INFO("received master nodedown", []),
            halt();
        _Ignored ->
            wloop(Master)
    end.