path: root/lib/hipe/opt/hipe_ultra_mod2.erl



%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%%		       ULTRASPARC MACHINE MODEL
%%
%% This module is used by the scheduler.
%% The following interface is used:
%%   ...
%%
%% NOTES:
%% - the machine model is simple (on the verge of simplistic)
%%   * all FUs are pipelined => model only one cycle at a time
%%   * instruction latencies are mostly 1
%%   * floating point is left for later (I _think_ it works, but ...)
%% - conservative: instructions that require multiple resources are
%%   modelled as 'single'; instead, they could reserve IEU+BR or whatever
%% - possibly inefficient: I think machine state model could be turned into
%%   a bitvector.

-module(hipe_ultra_mod2).
-export([init_resources/1,
	 init_instr_resources/2,
	 resources_available/4,
	 advance_cycle/1
	]).
-export([raw_latency/2,
	 war_latency/2,
	 waw_latency/2,
	 %% m_raw_latency/2,
	 %% m_war_latency/2,
	 %% m_waw_latency/2,
	 m_raw_latency/0,
	 m_war_latency/0,
	 m_waw_latency/0,
	 br_to_unsafe_latency/2,
	 unsafe_to_br_latency/2,
	 br_br_latency/2
	]).

-include("../sparc/hipe_sparc.hrl").

-define(debug(Str,Args),ok).
%-define(debug(Str,Args),io:format(Str,Args)).

-define(debug_ultra(Str,Args),ok).
%-define(debug_ultra(Str,Args),io:format(Str,Args)).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Straightforward and somewhat simplistic model for UltraSparc:
%% - only one cycle at a time is modelled
%% - resources are simplified:
%%   * ieu0, ieu1, ieu, mem, br, single
%%   * per-cycle state = done | { I0, I1, NumI, X, Mem, Br }
%%   * unoptimized representation (could be bit vector)

init_resources(_Size) ->
    ?debug_ultra('init res ~p~n',[_Size]),
    empty_state().

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

init_instr_resources(N,Nodes) ->
    ultra_instr_rsrcs(Nodes,hipe_vectors:new(N, '')).

ultra_instr_rsrcs([],I_res) -> I_res;
ultra_instr_rsrcs([N|Ns],I_res) ->
    ultra_instr_rsrcs(Ns,ultra_instr_type(N,I_res)).

ultra_instr_type({N,I},I_res) ->
    hipe_vectors:set(I_res,N-1,instr_type(I)).

instr_type(I) ->
    case I of
	#move{} ->
	    ieu;
	#multimove{} -> %% TODO: expand multimoves before scheduling
	    ieu;
	#alu{} ->
	    case hipe_sparc:alu_operator(I) of
		'>>' -> ieu0;
		'<<' -> ieu0;
		_ -> ieu
	    end;
	#alu_cc{} ->
	    ieu1;
	#sethi{} ->
	    ieu;
	#load{} ->
	    mem;
	#store{} ->
	    mem;
	#b{} ->
	    br;
	#br{} ->
	    br;
	#goto{} ->
	    br;
	#jmp_link{} ->         % imprecise; should be mem+br?
	    single;
	#jmp{} ->              % imprecise
	    br;
	#call_link{} ->        % imprecise; should be mem+br?
	    single;
	#cmov_cc{} ->          % imprecise
	    single;
	#cmov_r{} ->           % imprecise
	    single;
	#load_atom{} ->        % should be resolved to sethi/or
	    single;
	#load_address{} ->     % should be resolved to sethi/or
	    single;
	#load_word_index{} ->  % should be resolved to sethi/or
	    single;
	%% uncommon types:
	#label{} ->
	    none;
	#nop{} ->
	    none;
	#comment{} ->
	    none;
	_ ->
	    exit({ultrasparc_instr_type,{cant_schedule,I}})
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

resources_available(_Cycle, I, Rsrc, I_res) ->
    res_avail(instruction_resource(I_res, I), Rsrc).

instruction_resource(I_res, I) ->
    hipe_vectors:get(I_res, I-1).

%% The following function checks resource availability.
%% * all function units are assumed to be fully pipelined, so only
%%   one cycle at a time is modelled.
%% * for IEU0 and IEU1, these must precede all generic IEU instructions
%%   (handled by X bit)
%% * at most 2 integer instructions can issue in a cycle
%% * mem is straightforward
%% * br closes the cycle (= returns done).
%% * single requires an entirely empty state and closes the cycle

res_avail(ieu0, { free, I1, NumI, free, Mem, Br })
  when is_integer(NumI), NumI < 2 ->
    { yes, { occ, I1, NumI+1, free, Mem, Br }};
res_avail(ieu1, { _I0, free, NumI, free, Mem, Br })
  when is_integer(NumI), NumI < 2 ->
    { yes, { free, occ, NumI+1, free, Mem, Br }};
res_avail(ieu, { I0, I1, NumI, _X, Mem, Br })
  when is_integer(NumI), NumI < 2 ->
    { yes, { I0, I1, NumI+1, occ, Mem, Br }};
res_avail(mem, { I0, I1, NumI, X, free, Br }) ->
    { yes, { I0, I1, NumI, X, occ, Br }};
res_avail(br, { _I0, _I1, _NumI, _X, _Mem, free }) ->
    { yes, done };
res_avail(single, { free, free, 0, free, free, free }) ->
    { yes, done };
res_avail(_, _) ->
    no.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

advance_cycle(_Rsrc) ->
    empty_state().

empty_state() -> { free, free, 0, free, free, free }.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Latencies are taken from UltraSparc hardware manual
%%
%% *** UNFINISHED ***
%% more precisely, they are taken from my memory of the US-manual
%% at the moment.
%%
%% Note: all ld/st are assumed to hit in the L1 cache (D-cache),
%%   which is sort of imprecise.

raw_latency(alu, store) -> 0;
raw_latency(load, _) -> 2;            % only if load is L1 hit
raw_latency(alu_cc, b) -> 0;
raw_latency(_I0, _I1) ->
    1.

war_latency(_I0, _I1) ->
    0.

waw_latency(_I0, _I1) ->
    1.

%% *** UNFINISHED ***
%% At present, all load/stores are assumed to hit in the L1 cache,
%% which isn't really satisfying.

%% m_raw_latency(_St, _Ld) ->
%%     1.
%% 
%% m_war_latency(_Ld, _St) ->
%%     1.
%% 
%% m_waw_latency(_St1, _St2) ->
%%     1.

%% Use these for 'default latencies' = do not permit reordering.

m_raw_latency() ->
    1.

m_war_latency() ->
    1.

m_waw_latency() ->
    1.

br_to_unsafe_latency(_BrTy, _UTy) ->
    0.

unsafe_to_br_latency(_UTy, _BrTy) ->
    0.

br_br_latency(_BrTy1, _BrTy2) ->
    0.
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%%		       ULTRASPARC MACHINE MODEL
%%
%% This module is used by the scheduler.
%% The following interface is used:
%%   ...
%%
%% NOTES:
%% - the machine model is simple (on the verge of simplistic)
%%   * all FUs are pipelined => model only one cycle at a time
%%   * instruction latencies are mostly 1
%%   * floating point is left for later (I _think_ it works, but ...)
%% - conservative: instructions that require multiple resources are
%%   modelled as 'single'; instead, they could reserve IEU+BR or whatever
%% - possibly inefficient: I think machine state model could be turned into
%%   a bitvector.

-module(hipe_ultra_mod2).
-export([init_resources/1,
	 init_instr_resources/2,
	 resources_available/4,
	 advance_cycle/1
	]).
-export([raw_latency/2,
	 war_latency/2,
	 waw_latency/2,
	 %% m_raw_latency/2,
	 %% m_war_latency/2,
	 %% m_waw_latency/2,
	 m_raw_latency/0,
	 m_war_latency/0,
	 m_waw_latency/0,
	 br_to_unsafe_latency/2,
	 unsafe_to_br_latency/2,
	 br_br_latency/2
	]).

-include("../sparc/hipe_sparc.hrl").

-define(debug(Str,Args),ok).
%-define(debug(Str,Args),io:format(Str,Args)).

-define(debug_ultra(Str,Args),ok).
%-define(debug_ultra(Str,Args),io:format(Str,Args)).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Straightforward and somewhat simplistic model for UltraSparc:
%% - only one cycle at a time is modelled
%% - resources are simplified:
%%   * ieu0, ieu1, ieu, mem, br, single
%%   * per-cycle state = done | { I0, I1, NumI, X, Mem, Br }
%%   * unoptimized representation (could be bit vector)

init_resources(_Size) ->
    ?debug_ultra('init res ~p~n',[_Size]),
    empty_state().

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

init_instr_resources(N,Nodes) ->
    ultra_instr_rsrcs(Nodes,hipe_vectors:new(N, '')).

ultra_instr_rsrcs([],I_res) -> I_res;
ultra_instr_rsrcs([N|Ns],I_res) ->
    ultra_instr_rsrcs(Ns,ultra_instr_type(N,I_res)).

ultra_instr_type({N,I},I_res) ->
    hipe_vectors:set(I_res,N-1,instr_type(I)).

instr_type(I) ->
    case I of
	#move{} ->
	    ieu;
	#multimove{} -> %% TODO: expand multimoves before scheduling
	    ieu;
	#alu{} ->
	    case hipe_sparc:alu_operator(I) of
		'>>' -> ieu0;
		'<<' -> ieu0;
		_ -> ieu
	    end;
	#alu_cc{} ->
	    ieu1;
	#sethi{} ->
	    ieu;
	#load{} ->
	    mem;
	#store{} ->
	    mem;
	#b{} ->
	    br;
	#br{} ->
	    br;
	#goto{} ->
	    br;
	#jmp_link{} ->         % imprecise; should be mem+br?
	    single;
	#jmp{} ->              % imprecise
	    br;
	#call_link{} ->        % imprecise; should be mem+br?
	    single;
	#cmov_cc{} ->          % imprecise
	    single;
	#cmov_r{} ->           % imprecise
	    single;
	#load_atom{} ->        % should be resolved to sethi/or
	    single;
	#load_address{} ->     % should be resolved to sethi/or
	    single;
	#load_word_index{} ->  % should be resolved to sethi/or
	    single;
	%% uncommon types:
	#label{} ->
	    none;
	#nop{} ->
	    none;
	#comment{} ->
	    none;
	_ ->
	    exit({ultrasparc_instr_type,{cant_schedule,I}})
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

resources_available(_Cycle, I, Rsrc, I_res) ->
    res_avail(instruction_resource(I_res, I), Rsrc).

instruction_resource(I_res, I) ->
    hipe_vectors:get(I_res, I-1).

%% The following function checks resource availability.
%% * all function units are assumed to be fully pipelined, so only
%%   one cycle at a time is modelled.
%% * for IEU0 and IEU1, these must precede all generic IEU instructions
%%   (handled by X bit)
%% * at most 2 integer instructions can issue in a cycle
%% * mem is straightforward
%% * br closes the cycle (= returns done).
%% * single requires an entirely empty state and closes the cycle

res_avail(ieu0, { free, I1, NumI, free, Mem, Br })
  when is_integer(NumI), NumI < 2 ->
    { yes, { occ, I1, NumI+1, free, Mem, Br }};
res_avail(ieu1, { _I0, free, NumI, free, Mem, Br })
  when is_integer(NumI), NumI < 2 ->
    { yes, { free, occ, NumI+1, free, Mem, Br }};
res_avail(ieu, { I0, I1, NumI, _X, Mem, Br })
  when is_integer(NumI), NumI < 2 ->
    { yes, { I0, I1, NumI+1, occ, Mem, Br }};
res_avail(mem, { I0, I1, NumI, X, free, Br }) ->
    { yes, { I0, I1, NumI, X, occ, Br }};
res_avail(br, { _I0, _I1, _NumI, _X, _Mem, free }) ->
    { yes, done };
res_avail(single, { free, free, 0, free, free, free }) ->
    { yes, done };
res_avail(_, _) ->
    no.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

advance_cycle(_Rsrc) ->
    empty_state().

empty_state() -> { free, free, 0, free, free, free }.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Latencies are taken from UltraSparc hardware manual
%%
%% *** UNFINISHED ***
%% more precisely, they are taken from my memory of the US-manual
%% at the moment.
%%
%% Note: all ld/st are assumed to hit in the L1 cache (D-cache),
%%   which is sort of imprecise.

raw_latency(alu, store) -> 0;
raw_latency(load, _) -> 2;            % only if load is L1 hit
raw_latency(alu_cc, b) -> 0;
raw_latency(_I0, _I1) ->
    1.

war_latency(_I0, _I1) ->
    0.

waw_latency(_I0, _I1) ->
    1.

%% *** UNFINISHED ***
%% At present, all load/stores are assumed to hit in the L1 cache,
%% which isn't really satisfying.

%% m_raw_latency(_St, _Ld) ->
%%     1.
%% 
%% m_war_latency(_Ld, _St) ->
%%     1.
%% 
%% m_waw_latency(_St1, _St2) ->
%%     1.

%% Use these for 'default latencies' = do not permit reordering.

m_raw_latency() ->
    1.

m_war_latency() ->
    1.

m_waw_latency() ->
    1.

br_to_unsafe_latency(_BrTy, _UTy) ->
    0.

unsafe_to_br_latency(_UTy, _BrTy) ->
    0.

br_br_latency(_BrTy1, _BrTy2) ->
    0.