%% cgit page residue (not part of the module): about | summary | refs | log | blame | commit | diff | stats
%% path: root/lib/os_mon/test/disksup_SUITE.erl
%% blob: f33bde21c62e09be695cf3761aff8c8e686972b7 (plain) (tree)


















                                                                         
                                                    

                               
                                                                    




























                                                      

                                                


















                                                      








































































































































































































































































































































































                                                                                        
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2010. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%
-module(disksup_SUITE).
-include_lib("test_server/include/test_server.hrl").

%% Test server specific exports
-export([all/0, suite/0,groups/0,init_per_group/2,end_per_group/2]).
-export([init_per_suite/1, end_per_suite/1]).
-export([init_per_testcase/2, end_per_testcase/2]).

%% Test cases
-export([api/1, config/1, alarm/1]).
-export([port/1]).
-export([terminate/1, unavailable/1, restart/1]).
-export([otp_5910/1]).

%% Default timetrap timeout (set in init_per_testcase)
-define(default_timeout, ?t:minutes(1)).

%% Suite-wide setup: starts the os_mon application. The match on `ok'
%% makes the callback crash if application:start/1 returns anything
%% else. Config is returned unchanged.
init_per_suite(Config) when is_list(Config) ->
    ?line ok = application:start(os_mon),
    Config.

%% Suite-wide teardown: stops the os_mon application started by
%% init_per_suite/1 and insists (by matching on `ok') that the stop
%% succeeded. Config is returned unchanged.
end_per_suite(Config) when is_list(Config) ->
    ?line ok = application:stop(os_mon),
    Config.

%% Per-testcase setup.
%%
%% Every test case gets a timetrap of ?default_timeout; its handle is
%% stored under `watchdog' in the returned Config.
%%
%% The `unavailable' test case asserts that the disksup API returns its
%% dummy values, which cannot hold while disksup is running. If a
%% process is registered as disksup it is therefore stopped with
%% terminate/1 first, and the fact is recorded under `disksup_stopped'
%% so that end_per_testcase/2 knows it has to call restart/1.
%% If nothing is registered as disksup, nothing is stopped.
init_per_testcase(unavailable, Config) ->
    Dog = ?t:timetrap(?default_timeout),
    Stopped = case whereis(disksup) of
                  undefined ->
                      false;
                  Pid when is_pid(Pid) ->
                      ok = terminate(Config),
                      true
              end,
    [{watchdog,Dog}, {disksup_stopped,Stopped} | Config];
init_per_testcase(_Case, Config) ->
    Dog = ?t:timetrap(?default_timeout),
    [{watchdog,Dog} | Config].

%% Per-testcase teardown.
%%
%% For `unavailable', disksup is started again with restart/1 when (and
%% only when) init_per_testcase/2 stopped it, so that the test cases
%% that follow see a running disksup. For every test case the timetrap
%% stored under `watchdog' is cancelled.
end_per_testcase(unavailable, Config) ->
    case ?config(disksup_stopped, Config) of
        true ->
            ok = restart(Config);
        _ ->
            ok
    end,
    Dog = ?config(watchdog, Config),
    ?t:timetrap_cancel(Dog),
    ok;
end_per_testcase(_Case, Config) ->
    Dog = ?config(watchdog, Config),
    ?t:timetrap_cancel(Dog),
    ok.

%% Suite info list: requests the ts_install_scb suite callback.
suite() -> [{suite_callbacks,[ts_install_scb]}].

%% Returns the list of test cases to run, chosen from the OS type
%% reported by test_server:os_type/0:
%%   {unix, sunos}        - api, config, alarm, port, unavailable + bug cases
%%   other unix, win32    - api, alarm + bug cases
%%   anything else        - unavailable only
all() ->
    BugCases = [otp_5910],
    case test_server:os_type() of
        {unix, sunos} ->
            [api, config, alarm, port, unavailable | BugCases];
        {Family, _OSname} when Family =:= unix; Family =:= win32 ->
            [api, alarm | BugCases];
        _OS ->
            [unavailable]
    end.

%% This suite defines no test case groups.
groups() -> 
    [].

%% Group setup: nothing to do, Config is passed through unchanged.
init_per_group(_GroupName, Config) ->
	Config.

%% Group teardown: nothing to do, Config is passed through unchanged.
end_per_group(_GroupName, Config) ->
	Config.


%% api/1: exercises the disksup API functions get_disk_data/0,
%% get_check_interval/0, set_check_interval/1,
%% get_almost_full_threshold/0 and set_almost_full_threshold/1.
%% Each setter is tried with one accepted and one rejected value, and
%% the values found at the start (30 minutes, 0.80) are written back
%% before the test returns.
api(suite) ->
    [];
api(doc) ->
    ["Test of API functions"];
api(Config) when is_list(Config) ->

    %% get_disk_data()
    %% Only the first entry of the list is inspected: a printable id
    %% and two strictly positive integers.
    ?line [{Id, KByte, Capacity}|_] = disksup:get_disk_data(),
    ?line true = io_lib:printable_list(Id),
    ?line true = is_integer(KByte),
    ?line true = is_integer(Capacity),
    ?line true = KByte>0,
    ?line true = Capacity>0,

    %% get_check_interval()
    %% 1800000 is 30 minutes expressed in milliseconds
    ?line 1800000 = disksup:get_check_interval(),

    %% set_check_interval(Minutes)
    %% 20 minutes is accepted and read back as 1200000; 0.5 must exit
    %% with badarg and leave the interval untouched.
    ?line ok = disksup:set_check_interval(20),
    ?line 1200000 = disksup:get_check_interval(),
    ?line {'EXIT',{badarg,_}} = (catch disksup:set_check_interval(0.5)),
    ?line 1200000 = disksup:get_check_interval(),
    ?line ok = disksup:set_check_interval(30),

    %% get_almost_full_threshold()
    ?line 80 = disksup:get_almost_full_threshold(),

    %% set_almost_full_threshold(Float)
    %% The threshold is set as a fraction (0.90) but read back as an
    %% integer (90); a negative value must exit with badarg and leave
    %% the threshold untouched.
    ?line ok = disksup:set_almost_full_threshold(0.90),
    ?line 90 = disksup:get_almost_full_threshold(),
    ?line {'EXIT',{badarg,_}} =
	(catch disksup:set_almost_full_threshold(-0.5)),
    ?line 90 = disksup:get_almost_full_threshold(),
    ?line ok = disksup:set_almost_full_threshold(0.80),

    ok.

%% config/1: checks that the os_mon application parameters
%% disk_space_check_interval and disk_almost_full_threshold are read
%% when disksup is (re)started under os_mon_sup, and that invalid
%% values make disksup come up with 1800000 / 80 instead.
%% The two parameters are set back to 30 and 0.80 at the end.
config(suite) ->
    [];
config(doc) ->
    ["Test configuration"];
config(Config) when is_list(Config) ->

    %% Change configuration parameters and make sure change is reflected
    %% when disksup is restarted
    ?line ok =
	application:set_env(os_mon, disk_space_check_interval, 29),
    ?line ok =
	application:set_env(os_mon, disk_almost_full_threshold, 0.81),

    %% Restart the disksup child so that it is initialised again
    ?line ok = supervisor:terminate_child(os_mon_sup, disksup),
    ?line {ok, _Child1} = supervisor:restart_child(os_mon_sup, disksup),

    %% 29 (minutes) is reported as 1740000, 0.81 as 81
    ?line 1740000 = disksup:get_check_interval(),
    ?line 81 = disksup:get_almost_full_threshold(),

    %% Also try this with bad parameter values, should be ignored
    ?line ok =
	application:set_env(os_mon, disk_space_check_interval, 0.5),
    ?line ok =
	application:set_env(os_mon, disk_almost_full_threshold, -0.81),

    ?line ok = supervisor:terminate_child(os_mon_sup, disksup),
    ?line {ok, _Child2} = supervisor:restart_child(os_mon_sup, disksup),

    %% With the bad values in place disksup must report the same values
    %% that api/1 expects from an unconfigured disksup
    ?line 1800000 = disksup:get_check_interval(),
    ?line 80 = disksup:get_almost_full_threshold(),

    %% Reset configuration parameters
    ?line ok =
	application:set_env(os_mon, disk_space_check_interval, 30),
    ?line ok =
	application:set_env(os_mon, disk_almost_full_threshold, 0.80),

    ok.

%%----------------------------------------------------------------------
%% NOTE: The test case is a bit weak as it will fail if the disk usage
%% changes too much during its course, or if there are timing problems
%% with the alarm_handler receiving the alarms too late
%%----------------------------------------------------------------------
alarm(suite) ->
    [];
alarm(doc) ->
    ["Test that alarms are set and cleared"];
alarm(Config) when is_list(Config) ->

    %% Phase 1: with the threshold as currently configured there must
    %% be exactly one disk_almost_full alarm per disk at or above it
    ?line BaseThreshold = disksup:get_almost_full_threshold(), % 80
    ?line BaseData = disksup:get_disk_data(),
    ?line BaseOver = over_threshold(BaseData, BaseThreshold),
    ?line BaseAlarms = get_alarms(),
    case BaseOver == length(BaseAlarms) of
        true ->
            ?line true;
        false ->
            dump_info(),
            ?line ?t:fail({bad_alarms, BaseThreshold, BaseData, BaseAlarms})
    end,

    %% Phase 2: look for a disk whose usage is non-zero but below the
    %% base threshold, put the threshold just below that usage and
    %% check that more alarms than before are now set. Skipped when no
    %% such disk exists.
    IsBelowBase = fun({_Id, _Kbyte, Usage}) ->
                          Usage > 0 andalso Usage < BaseThreshold
                  end,
    ?line case until(IsBelowBase, BaseData) of
              false ->
                  ?line ignore;
              {_, _, LowUsage} ->
                  LowThreshold = LowUsage - 1,
                  ?line ok =
                      disksup:set_almost_full_threshold(LowThreshold/100),
                  ?line disksup ! timeout, % force a disk check
                  ?line LowData = disksup:get_disk_data(),
                  ?line LowOver = over_threshold(LowData, LowThreshold),
                  ?line LowAlarms = get_alarms(),
                  case (LowOver == length(LowAlarms))
                      andalso (LowOver > BaseOver) of
                      true ->
                          ?line true;
                      false ->
                          dump_info(),
                          ?line ?t:fail({bad_alarms, LowThreshold, LowData, LowAlarms})
                  end
          end,

    %% Phase 3: take the highest usage of any disk and, unless it is
    %% already 100, put the threshold just above it; no disk may then
    %% be over the threshold and no alarm may remain set
    HighestUsage = fun({_Id, _Kbyte, Usage}, Highest) ->
                           case Usage > Highest of
                               true -> Usage;
                               false -> Highest
                           end
                   end,
    ?line case lists:foldl(HighestUsage, 0, BaseData) of
              100 ->
                  ?line ignore;
              Peak when Peak<100 ->
                  HighThreshold = Peak + 1,
                  ?line ok =
                      disksup:set_almost_full_threshold(HighThreshold/100),
                  ?line disksup ! timeout, % force a disk check
                  ?line HighData = disksup:get_disk_data(),
                  ?line HighOver = over_threshold(HighData, HighThreshold),
                  ?line HighAlarms = get_alarms(),
                  case {HighOver, HighAlarms} of
                      {0, []} ->
                          ?line ok;
                      _ ->
                          dump_info(),
                          ?line ?t:fail({bad_alarms, HighThreshold, HighData, HighAlarms})
                  end
          end,

    %% Put the threshold back to what it was when the test started
    ?line ok = disksup:set_almost_full_threshold(BaseThreshold/100),

    ok.

%% Counts the disks in Data whose capacity value is greater than or
%% equal to Threshold. Data is sorted on the disk id and run through
%% remove_duplicated_disks/1 first, so a disk that is listed more than
%% once is only counted once.
over_threshold(Data, Threshold) ->
    Sorted = lists:keysort(1, Data),
    Unique = remove_duplicated_disks(Sorted),
    length([Cap || {_Id, _Kbyte, Cap} <- Unique, Cap >= Threshold]).

%% On some platforms (for example MontaVista) data for one disk can be
%% "duplicated":
%%  Linux ppb 2.4.20_mvl31-pcore680 #1 Sun Feb 1 23:12:56 PST 2004 ppc unknown
%%
%%  MontaVista(R) Linux(R) Professional Edition 3.1
%%
%%  [ppb:~]> /bin/df -lk
%%  Filesystem           1k-blocks      Used Available Use% Mounted on
%%  rootfs                 8066141   3023763   4961717  38% /
%%  /dev/root              8066141   3023763   4961717  38% /
%%  tmpfs                   192892         0    192892   0% /dev/shm
%%
%% disksup:
%%  [{"/",8066141,38}, {"/",8066141,38}, {"/dev/shm",192892,0}]
%%
%% disksup will only set ONE alarm for "/".
%% Therefore the list of disk data must be sorted and duplicated disk
%% tuples removed before calculating how many alarms should be set, or
%% the testcase will fail erroneously.
remove_duplicated_disks(Disks) ->
    %% Walk the list from the right: an entry is dropped when the entry
    %% that follows it has the same id, so of each run of adjacent
    %% duplicates only the last tuple is kept.
    lists:foldr(fun({Id, _, _}, [{Id, _, _}|_] = Kept) ->
                        Kept;
                   (Disk, Kept) ->
                        [Disk|Kept]
                end,
                [],
                Disks).

%% Returns those entries of alarm_handler:get_alarms/0 whose alarm id
%% is a {disk_almost_full, Disk} tuple; all other alarms are left out.
get_alarms() ->
    [Alarm || {{disk_almost_full, _Disk}, _} = Alarm
                  <- alarm_handler:get_alarms()].

%% Returns the first element of the list for which Pred returns true,
%% or false when the list is exhausted without such an element.
%% Pred must return true or false.
until(_Pred, []) ->
    false;
until(Pred, [Elem|Rest]) ->
    case Pred(Elem) of
        false -> until(Pred, Rest);
        true -> Elem
    end.

%% port/1: kills the OS process of the disksup port program and checks
%% that disksup terminates with reason {port_died, _} and that
%% disksup:get_disk_data/0 works again three seconds later.
%% The test is skipped when no OS pid can be read from the ps output
%% or when the kill command prints anything.
port(suite) ->
    [];
port(doc) ->
    ["Test that disksup handles a terminating port program"];
port(Config) when is_list(Config) ->
    %% NOTE(review): the '[d]isksup' pattern is presumably there so that
    %% the grep command itself is not among the matches - confirm
    ?line Str = os:cmd("ps -ef | grep '[d]isksup'"),
    %% Read the first two whitespace-separated fields of the output;
    %% the second one is used as the OS pid
    case io_lib:fread("~s ~s", Str) of
	 {ok, [_Uid,Pid], _Rest} ->

	    %% Monitor disksup
	    ?line MonRef = erlang:monitor(process, disksup),
	    ?line [{_Disk1,Kbyte1,_Cap1}|_] = disksup:get_disk_data(),
	    ?line true = Kbyte1>0,

	    %% Kill the port program
	    %% (empty output from the kill command is taken as success)
	    case os:cmd("kill -9 " ++ Pid) of
		[] ->

		    %% disksup should now terminate
		    %% Any exit reason other than {port_died, _}, or no
		    %% 'DOWN' message within 3000 ms, fails the test
		    receive
			{'DOWN', MonRef, _, _, {port_died, _Reason}} ->
			    ok;
			{'DOWN', MonRef, _, _, Reason} ->
			    ?line ?t:fail({unexpected_exit_reason, Reason})
		    after
			3000 ->
			    ?line ?t:fail({still_alive, Str})
		    end,

		    %% Give os_mon_sup time to restart disksup
		    ?t:sleep(?t:seconds(3)),
		    ?line [{_Disk2,Kbyte2,_Cap2}|_] =
			disksup:get_disk_data(),
		    ?line true = Kbyte2>0,

		    ok;

		%% The kill command printed something: drop the monitor
		%% and skip instead of failing
		Line ->
		    erlang:demonitor(MonRef),
		    {skip, {not_killed, Line}}
	    end;
	_ ->
	    {skip, {os_pid_not_found, Str}}
    end.

%% terminate/1: stops disksup. The os_mon parameter start_disksup is
%% set to false and the disksup child of os_mon_sup is terminated.
%% Counterpart of restart/1; not listed in all/0.
terminate(suite) ->
    [];
terminate(Config) when is_list(Config) ->
    ?line ok = application:set_env(os_mon, start_disksup, false),
    ?line ok = supervisor:terminate_child(os_mon_sup, disksup),
    ok.

%% unavailable/1: asserts the values the disksup API returns when the
%% service is unavailable: [{"none",0,0}] as disk data, 1800000 as
%% check interval, 80 as threshold, and ok from both setters.
%% NOTE(review): the first assertion cannot hold while disksup is
%% running and reporting real disk data, so disksup presumably has to
%% be stopped (see terminate/1) before this test case runs - confirm.
unavailable(suite) ->
    [];
unavailable(doc) ->
    ["Test correct behaviour when service is unavailable"];
unavailable(Config) when is_list(Config) ->

    %% Make sure all API functions return their dummy values
    ?line [{"none",0,0}] = disksup:get_disk_data(),
    ?line 1800000 = disksup:get_check_interval(),
    ?line ok = disksup:set_check_interval(5),
    ?line 80 = disksup:get_almost_full_threshold(),
    ?line ok = disksup:set_almost_full_threshold(0.9),

    ok.

%% restart/1: starts disksup again. The os_mon parameter start_disksup
%% is set to true and the disksup child of os_mon_sup is restarted.
%% Counterpart of terminate/1; not listed in all/0.
restart(suite) ->
    [];
restart(Config) when is_list(Config) ->
    ?line ok = application:set_env(os_mon, start_disksup, true),
    ?line {ok, _Pid} = supervisor:restart_child(os_mon_sup, disksup),
    ok.

%% otp_5910/1: alarms set by disksup must survive a disksup crash
%% without being set a second time, and must all be cleared when
%% OS_Mon is stopped.
%%
%% To guarantee at least one alarm, the almost-full threshold is
%% lowered below the usage of the first disk if no disk is over the
%% current threshold. The value is also written to the
%% disk_almost_full_threshold application parameter so that it is still
%% in effect after disksup has been restarted. At the end the same
%% parameter is set back to 0.8 before OS_Mon is started again.
otp_5910(suite) ->
    [];
otp_5910(doc) ->
    ["Test that alarms are cleared if disksup crashes or "
     "if OS_Mon is stopped"];
otp_5910(Config) when is_list(Config) ->

    %% Make sure disksup sets at least one alarm
    ?line Data = disksup:get_disk_data(),
    ?line Threshold0 = disksup:get_almost_full_threshold(),
    ?line Threshold = case over_threshold(Data, Threshold0) of
                          0 ->
                              %% No disk is over the threshold: put the
                              %% threshold just below the first disk
                              [{_Id,_Kbyte,Cap}|_] = Data,
                              ?line ok = disksup:set_almost_full_threshold((Cap-1)/100),
                              Cap-1;
                          _N ->
                              Threshold0
                      end,
    ?line ok = application:set_env(os_mon,
                                   disk_almost_full_threshold,
                                   Threshold/100),
    ?line disksup ! timeout, % force a disk check
    ?line Data2 = disksup:get_disk_data(),
    ?line Over = over_threshold(Data2, Threshold),
    ?line Alarms = get_alarms(),
    if
        Over==0 ->
            ?line ?t:fail({threshold_too_low, Data2, Threshold});
        Over==length(Alarms) ->
            ok;
        true ->
            dump_info(),
            ?line ?t:fail({bad_alarms, Threshold, Data2, Alarms})
    end,

    %% Kill disksup
    exit(whereis(disksup), faked_disksup_crash),

    %% Wait a little to make sure disksup has been restarted,
    %% then make sure the alarms are set once, but not twice
    ?t:sleep(?t:seconds(1)),
    ?line Data3 = disksup:get_disk_data(),
    ?line Alarms2 = get_alarms(),
    if
        length(Alarms2)==length(Alarms) ->
            ok;
        true ->
            dump_info(),
            ?line ?t:fail({bad_alarms, Threshold, Data3, Alarms,Alarms2})
    end,

    %% Stop OS_Mon and make sure all disksup alarms are cleared
    ?line ok = application:stop(os_mon),
    ?t:sleep(?t:seconds(1)),
    ?line Alarms3 = get_alarms(),
    if
        length(Alarms3)==0 ->
            ok;
        true ->
            ?line ?t:fail({alarms_not_cleared, Alarms3})
    end,

    %% Reset threshold and restart OS_Mon.
    %% The parameter reset here must be the one that was changed above
    %% (disk_almost_full_threshold); under any other name the lowered
    %% threshold would stay in the application environment and still
    %% be there when OS_Mon is started again.
    ?line ok = application:set_env(os_mon,
                                   disk_almost_full_threshold, 0.8),
    ?line ok = disksup:set_almost_full_threshold(0.8),
    ?line ok = application:start(os_mon),

    ok.

%% Prints the status of the disksup process, as returned by
%% sys:get_status/1, to aid debugging when an alarm check fails.
dump_info() ->
    Status = sys:get_status(disksup),
    io:format("Status: ~p~n", [Status]).