aboutsummaryrefslogblamecommitdiffstats
path: root/lib/os_mon/test/disksup_SUITE.erl
blob: 9c65d8b692893383844d573284f842ea7bf8e506 (plain) (tree)
1
2
3
4

                   
                                                        













                                                                         
                                                    
                               
                                                                    











                                                      
                                   

                                             
                                  
           

                                         


                                        

                                        



                                    
                                         
 
         






                                                            



                                     
           
                                    
           
 
 
                                      

                                   




                                              
                           
                                           
                                  



                                                                  
                                  
                                             
                                       

                                                 
                                                        
                                                 

       
                                      


                                                                        
                                                                       
 
                                                                  
 
                                             
                                                                 
        
                                                                    
        
                                                                       
                                                                  
 
                                             
                                     
                                                                       





                                                                        
                                                       


                                                              


                                                          
                                 
                 
                        
                                                             








                                                                       
                              
                                      
                      
                                                                        


                                                            
                                                            
                               
                                      
                                                                           
                      
                        









                                                       
                                       
                                     



                                                                         
                                                     
                             
                                      
                                                                           
                      
                        

                      
                                                           


                                                            


                                                               


























                                                                              


                                                    


                    
                              
        
                         
 
                  

                                                             
                                              
                                      
                                  
                              

                                                                








                                                                       
                                                                     
                               
                                                       


                                                              
                                                                        
                                    









                                              
                       
                                         
                                                           
       
                         



                                                            



                                                



                                       
                                                               
       
                      




                                                          












                                                                                
                  
                                                           


                               
                                                           






                                                             
                                      
      
                                              
                        
                                                                

                                                               
                                  
                            


                                                    

                                         

                                                                         


                                                         











                                                            
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2011. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%
-module(disksup_SUITE).
-include_lib("test_server/include/test_server.hrl").

%% Test server specific exports
-export([all/0, suite/0,groups/0,init_per_group/2,end_per_group/2]).
-export([init_per_suite/1, end_per_suite/1]).
-export([init_per_testcase/2, end_per_testcase/2]).

%% Test cases
-export([api/1, config/1, alarm/1]).
-export([port/1]).
-export([terminate/1, unavailable/1, restart/1]).
-export([otp_5910/1]).

%% Default timetrap timeout (set in init_per_testcase)
-define(default_timeout, ?t:minutes(1)).

init_per_suite(Config) when is_list(Config) ->
    ok = application:start(os_mon),
    Config.

end_per_suite(Config) when is_list(Config) ->
    ok = application:stop(os_mon),
    Config.

init_per_testcase(unavailable, Config) ->
    terminate(Config),
    init_per_testcase(dummy, Config);
init_per_testcase(_Case, Config) ->
    Dog = ?t:timetrap(?default_timeout),
    [{watchdog,Dog} | Config].

end_per_testcase(unavailable, Config) ->
    restart(Config),
    end_per_testcase(dummy, Config);
end_per_testcase(_Case, Config) ->
    Dog = ?config(watchdog, Config),
    ?t:timetrap_cancel(Dog),
    ok.

suite() -> [{ct_hooks,[ts_install_cth]}].

all() -> 
    Bugs = [otp_5910],
    case test_server:os_type() of
	{unix, sunos} ->
	    [api, config, alarm, port, unavailable] ++ Bugs;
	{unix, _OSname} -> [api, alarm] ++ Bugs;
	{win32, _OSname} -> [api, alarm] ++ Bugs;
	_OS -> [unavailable]
    end.

groups() -> 
    [].

init_per_group(_GroupName, Config) ->
    Config.

end_per_group(_GroupName, Config) ->
    Config.


api(suite) -> [];
api(doc) -> ["Test of API functions"];
api(Config) when is_list(Config) ->

    %% get_disk_data()
    [{Id,KByte,Capacity}|_] = get_disk_data(),
    true = io_lib:printable_list(Id),
    true = is_integer(KByte),
    true = is_integer(Capacity),
    true = Capacity>0,
    true = KByte>0,

    %% get_check_interval()
    1800000 = disksup:get_check_interval(),

    %% set_check_interval(Minutes)
    ok = disksup:set_check_interval(20),
    1200000 = disksup:get_check_interval(),
    {'EXIT',{badarg,_}} = (catch disksup:set_check_interval(0.5)),
    1200000 = disksup:get_check_interval(),
    ok = disksup:set_check_interval(30),

    %% get_almost_full_threshold()
    80 = disksup:get_almost_full_threshold(),

    %% set_almost_full_threshold(Float)
    ok = disksup:set_almost_full_threshold(0.90),
    90 = disksup:get_almost_full_threshold(),
    {'EXIT',{badarg,_}} =
	(catch disksup:set_almost_full_threshold(-0.5)),
    90 = disksup:get_almost_full_threshold(),
    ok = disksup:set_almost_full_threshold(0.80),

    ok.

config(suite) -> [];
config(doc) -> ["Test configuration"];
config(Config) when is_list(Config) ->

    %% Change configuration parameters and make sure change is reflected
    %% when disksup is restarted
    ok = application:set_env(os_mon, disk_space_check_interval, 29),
    ok = application:set_env(os_mon, disk_almost_full_threshold, 0.81),

    ok = supervisor:terminate_child(os_mon_sup, disksup),
    {ok, _Child1} = supervisor:restart_child(os_mon_sup, disksup),

    1740000 = disksup:get_check_interval(),
    81 = disksup:get_almost_full_threshold(),

    %% Also try this with bad parameter values, should be ignored
    ok =
	application:set_env(os_mon, disk_space_check_interval, 0.5),
    ok =
	application:set_env(os_mon, disk_almost_full_threshold, -0.81),

    ok = supervisor:terminate_child(os_mon_sup, disksup),
    {ok, _Child2} = supervisor:restart_child(os_mon_sup, disksup),

    1800000 = disksup:get_check_interval(),
    80 = disksup:get_almost_full_threshold(),

    %% Reset configuration parameters
    ok = application:set_env(os_mon, disk_space_check_interval, 30),
    ok = application:set_env(os_mon, disk_almost_full_threshold, 0.80),
    ok.

%%----------------------------------------------------------------------
%% NOTE: The test case is a bit weak as it will fail if the disk usage
%% changes too much during its course, or if there are timing problems
%% with the alarm_handler receiving the alarms too late
%%----------------------------------------------------------------------
alarm(suite) -> [];
alarm(doc) -> ["Test that alarms are set and cleared"];
alarm(Config) when is_list(Config) ->

    %% Find out how many disks exceed the threshold
    %% and make sure the corresponding number of alarms is set
    Threshold1 = disksup:get_almost_full_threshold(), % 80
    Data1 = disksup:get_disk_data(),
    Over1 = over_threshold(Data1, Threshold1),
    Alarms1 = get_alarms(),
    if
	Over1==length(Alarms1) ->
	    true;
	true ->
	    dump_info(),
	    ?t:fail({bad_alarms, Threshold1, Data1, Alarms1})
    end,

    %% Try to find a disk with space usage below Threshold1,
    %% lower the threshold accordingly and make sure new alarms are set
    Fun1 = fun({_Id, _Kbyte, Capacity}) ->
		   if
		       Capacity>0, Capacity<Threshold1 -> true;
		       true -> false
		   end
	   end,
    case until(Fun1, Data1) of
	      {_, _, Cap1} ->
		  Threshold2 = Cap1-1,
		  ok =
		      disksup:set_almost_full_threshold(Threshold2/100),
		  disksup ! timeout, % force a disk check
		  Data2 = disksup:get_disk_data(),
		  Over2 = over_threshold(Data2, Threshold2),
		  Alarms2 = get_alarms(),
		  if
		      Over2==length(Alarms2), Over2>Over1 ->
			  true;
		      true ->
			  dump_info(),
			  ?t:fail({bad_alarms, Threshold2, Data2, Alarms2})
		  end;
	      false ->
		  ignore
	  end,

    %% Find out the highest space usage among all disks
    %% and try to raise the threshold above this value,
    %% make sure all alarms are cleared
    Fun2 = fun({_Id, _Kbyte, Capacity}, MaxAcc) ->
		   if
		       Capacity>MaxAcc -> Capacity;
		       true -> MaxAcc
		   end
	   end,
    case lists:foldl(Fun2, 0, Data1) of
	      Max when Max<100 ->
		  Threshold3 = Max+1,
		  ok = disksup:set_almost_full_threshold(Threshold3/100),
		  disksup ! timeout, % force a disk check
		  Data3   = disksup:get_disk_data(),
		  Over3   = over_threshold(Data3, Threshold3),
		  Alarms3 = get_alarms(),
		  if
		      Over3==0, length(Alarms3)==0 ->
			  ok;
		      true ->
			  dump_info(),
			  ?t:fail({bad_alarms, Threshold3, Data3, Alarms3})
		  end;
	      100 ->
		  ignore
	  end,

    %% Reset threshold
    ok = disksup:set_almost_full_threshold(Threshold1/100),
    ok.

over_threshold(Data, Threshold) ->
    Data2 = remove_duplicated_disks(lists:keysort(1, Data)),
    lists:foldl(fun
	    ({_Id, _Kbyte, Cap}, N) when Cap>=Threshold -> N+1;
	    (_DiskData, N) -> N
	end, 0, Data2).

%% On some platforms (for example MontaVista) data for one disk can be
%% "duplicated":
%%  Linux ppb 2.4.20_mvl31-pcore680 #1 Sun Feb 1 23:12:56 PST 2004 ppc unknown
%%
%%  MontaVista(R) Linux(R) Professional Edition 3.1
%%
%%  [ppb:~]> /bin/df -lk
%%  Filesystem           1k-blocks      Used Available Use% Mounted on
%%  rootfs                 8066141   3023763   4961717  38% /
%%  /dev/root              8066141   3023763   4961717  38% /
%%  tmpfs                   192892         0    192892   0% /dev/shm
%%
%% disksup:
%%  [{"/",8066141,38}, {"/",8066141,38}, {"/dev/shm",192892,0}]
%%
%% disksup will only set ONE alarm for "/".
%% Therefore the list of disk data must be sorted and duplicated disk
%% tuples removed before calculating how many alarms should be set, or
%% the testcase will fail erroneously.
remove_duplicated_disks([{Id, _, _}, {Id, Kbyte, Cap}|T]) ->
    remove_duplicated_disks([{Id, Kbyte, Cap}|T]);
remove_duplicated_disks([H|T]) ->
    [H|remove_duplicated_disks(T)];
remove_duplicated_disks([]) ->
    [].

get_alarms() ->
    lists:filter(fun
	    ({{disk_almost_full, _Disk},_}) -> true;
	    (_) -> false
	end, alarm_handler:get_alarms()).

until(Fun, [H|T]) ->
    case Fun(H) of
	true -> H;
	false -> until(Fun, T)
    end;
until(_Fun, []) -> false.

port(suite) -> [];
port(doc) ->
    ["Test that disksup handles a terminating port program"];
port(Config) when is_list(Config) ->
    Str = os:cmd("ps -ef | grep '[d]isksup'"),
    case io_lib:fread("~s ~s", Str) of
	{ok, [_Uid,Pid], _Rest} ->

	    %% Monitor disksup
	    MonRef = erlang:monitor(process, disksup),
	    [{_Disk1,Kbyte1,_Cap1}|_] = disksup:get_disk_data(),
	    true = Kbyte1>0,

	    %% Kill the port program
	    case os:cmd("kill -9 " ++ Pid) of
		[] ->

		    %% disksup should now terminate
		    receive
			{'DOWN', MonRef, _, _, {port_died, _Reason}} ->
			    ok;
			{'DOWN', MonRef, _, _, Reason} ->
			    ?t:fail({unexpected_exit_reason, Reason})
		    after
			3000 ->
			    ?t:fail({still_alive, Str})
		    end,

		    %% Give os_mon_sup time to restart disksup
		    ?t:sleep(?t:seconds(3)),
		    [{_Disk2,Kbyte2,_Cap2}|_] = disksup:get_disk_data(),
		    true = Kbyte2>0,

		    ok;

		Line ->
		    erlang:demonitor(MonRef),
		    {skip, {not_killed, Line}}
	    end;
	_ ->
	    {skip, {os_pid_not_found, Str}}
    end.

terminate(suite) -> [];
terminate(Config) when is_list(Config) ->
    ok = application:set_env(os_mon, start_disksup, false),
    ok = supervisor:terminate_child(os_mon_sup, disksup),
    ok.

unavailable(suite) -> [];
unavailable(doc) ->
    ["Test correct behaviour when service is unavailable"];
unavailable(Config) when is_list(Config) ->

    %% Make sure all API functions return their dummy values
    [{"none",0,0}] = disksup:get_disk_data(),
    1800000 = disksup:get_check_interval(),
    ok = disksup:set_check_interval(5),
    80 = disksup:get_almost_full_threshold(),
    ok = disksup:set_almost_full_threshold(0.9),
    ok.

restart(suite) ->
    [];
restart(Config) when is_list(Config) ->
    ok = application:set_env(os_mon, start_disksup, true),
    {ok, _Pid} = supervisor:restart_child(os_mon_sup, disksup),
    ok.

otp_5910(suite) -> [];
otp_5910(doc) ->
    ["Test that alarms are cleared if disksup crashes or "
     "if OS_Mon is stopped"];
otp_5910(Config) when is_list(Config) ->

    %% Make sure disksup sets at least one alarm
    Data = disksup:get_disk_data(),
    Threshold0 = disksup:get_almost_full_threshold(),
    Threshold  = case over_threshold(Data, Threshold0) of
	0 ->
	    [{_Id,_Kbyte,Cap}|_] = Data,
	    ok = disksup:set_almost_full_threshold((Cap-1)/100),
	    Cap-1;
	_N -> Threshold0
    end,
    ok = application:set_env(os_mon, disk_almost_full_threshold, Threshold/100),
    disksup ! timeout, % force a disk check
    Data2 = disksup:get_disk_data(),
    Over = over_threshold(Data2, Threshold),
    Alarms = get_alarms(),
    if
	Over==0 ->
	    ?t:fail({threshold_too_low, Data2, Threshold});
	Over==length(Alarms) ->
	    ok;
	true ->
	    dump_info(),
	    ?t:fail({bad_alarms, Threshold, Data2, Alarms})
    end,

    %% Kill disksup
    exit(whereis(disksup), faked_disksup_crash),

    %% Wait a little to make sure disksup has been restarted,
    %% then make sure the alarms are set once, but not twice
    ?t:sleep(?t:seconds(1)),
    Data3   = disksup:get_disk_data(),
    Alarms2 = get_alarms(),
    if
	length(Alarms2)==length(Alarms) -> ok;
	true ->
	    dump_info(),
	    ?t:fail({bad_alarms,Threshold,Data3,Alarms,Alarms2})
    end,

    %% Stop OS_Mon and make sure all disksup alarms are cleared
    ok = application:stop(os_mon),
    ?t:sleep(?t:seconds(1)),
    Alarms3 = get_alarms(),
    case get_alarms() of
	[] -> ok;
	_  -> ?t:fail({alarms_not_cleared, Alarms3})
    end,

    %% Reset threshold and restart OS_Mon
    ok = application:set_env(os_mon, disksup_almost_full_threshold, 0.8),
    ok = disksup:set_almost_full_threshold(0.8),
    ok = application:start(os_mon),
    ok.

dump_info() ->
    io:format("Status: ~p~n", [sys:get_status(disksup)]).

% filter get_disk_data and remove entriew with zero capacity
% "non-normal" filesystems report zero capacity
% - Perhaps errorneous 'df -k -l'?
% - Always list filesystems by type '-t ufs,zfs,..' instead?
% It is unclear what the intention was from the beginning.
get_disk_data() ->
    get_disk_data(disksup:get_disk_data()).

get_disk_data([{"none",0,0}=E]) -> [E];
get_disk_data([{_,_,0}|Es]) -> get_disk_data(Es);
get_disk_data([E|Es]) -> [E|get_disk_data(Es)];
get_disk_data([]) -> [].