path: root/erts/emulator/internal_doc/dec.erl



%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2000-2016. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%

%% This program is used to generate a header file with data for
%% normalizing denormalized unicode.

%% The C header is generated from a text file containing tuples in the 
%% following format:
%% {RevList,Translation}
%% Where 'RevList' is a reversed list of the denormalized representation of
%% the character 'Translation'. An example would be the swedish character 
%% 'ö', which would be represented in the file as:
%% {[776,111],246}, as the denormalized representation of codepoint 246
%% is [111,776] (i.e. an 'o' followed by the "double dot" accent character 776),
%% while 'ä' instead is represented as {[776,97],228}, as the denormalized 
%% form would be [97,776] (same accent but an 'a' instead).
%% The datafile is generated from the table on Apple's developer connection
%% http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html
%% The generating is done whenever new data is present (i.e. dec.dat has 
%% to be changed) and not for every build. The product (the C header) is copied
%% to $ERL_TOP/erts/beam after generation and checked in.
%% The program and the data file are included for reference.

-module(dec).

-compile(export_all).

-define(HASH_SIZE_FACTOR,2).
-define(BIG_PREFIX_SIZE,392).

-define(INPUT_FILE_NAME,"dec.dat").
-define(OUTPUT_FILE_NAME,"erl_unicode_normalize.h").

%% Read the decomposition table from the file FName.
%% The file is consulted as a sequence of Erlang terms. Only two-element
%% tuples {RevList,Translation} whose RevList has more than one element are
%% kept (in file order); every other term is dropped.
%% An additional restriction on the first element of RevList (hd < 769) is
%% known from the original code but is disabled.
%% Crashes with badmatch if the file cannot be consulted.
read(FName) ->
    {ok,Terms} = file:consult(FName),
    %% The length test lives in a guard so that a term it cannot be applied
    %% to is skipped rather than raising an exception.
    KeepEntry = fun({RevList,_Translation}) when length(RevList) > 1 ->
                        true;
                   (_Other) ->
                        false
                end,
    lists:filter(KeepEntry,Terms).

%% Generate the C header ?OUTPUT_FILE_NAME from the data in ?INPUT_FILE_NAME.
%% Steps: read the input, group it into a tree with group/1, then write
%% the license/banner comment, the CompEntry typedef, the compose tables
%% with their hash tables (d/4), and finally the candidate bitmap
%% (make_prefix_table/2 + dump_prefixes/2).
%% Returns ok. Crashes (badmatch) if the input cannot be read or the output
%% cannot be opened. The output file is closed even if generation crashes
%% half way through, so the file handle is never leaked.
dec() ->
    L = read(?INPUT_FILE_NAME),
    G = group(L),
    {ok,Out} = file:open(?OUTPUT_FILE_NAME,[write]),
    try
        io:format
          (Out,
           "/*~n"
           "* %CopyrightBegin%~n"
           "*~n"
           "* Copyright Ericsson AB 1999-2010. All Rights Reserved.~n"
           "*~n"
           "* Licensed under the Apache License, Version 2.0 (the \"License\");~n"
           "* you may not use this file except in compliance with the License.~n"
           "* You may obtain a copy of the License at~n"
           "*~n"
           "*     http://www.apache.org/licenses/LICENSE-2.0~n"
           "*~n"
           "* Unless required by applicable law or agreed to in writing, software~n"
           "* distributed under the License is distributed on an \"AS IS\" BASIS,~n"
           "* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.~n"
           "* See the License for the specific language governing permissions and~n"
           "* limitations under the License.~n"
           "*~n"
           "* %CopyrightEnd%~n"
           "*/~n"
           "/*~n"
           "* This file is automatically generated by ~p.erl, "
           "do not edit manually~n"
           "*/~n",
           [?MODULE]),

        io:format(Out,
                  "#define HASH_SIZE_FACTOR ~w~n"
                  "typedef struct _compose_entry {~n"
                  "    Uint16 c;~n"
                  "    Uint16 res;~n"
                  "    Uint16 num_subs;~n"
                  "    struct _compose_entry *subs;~n"
                  "    int *hash;~n"
                  "} CompEntry;~n~n"
                  "static int compose_tab_size = ~p;~n", 
                  [?HASH_SIZE_FACTOR,length(G)]),
        d(Out,G,[],0),
        %% 102 words: make_prefix_table/2 accepts code points up to 4023 and
        %% maps them to word (C div 32) + 1 - 24, so the highest index is
        %% (4023 div 32) + 1 - 24 = 102.
        PreTab = tuple_to_list(make_prefix_table(G,erlang:make_tuple(102,0))),
        dump_prefixes(Out,PreTab)
%% Using this cuts down on the searching in the
%% actual implementation, but wastes memory with little real gain..
%%    LL = lists:flatten([PartList || {PartList,_} <- L]),
%%    BigPreTab = tuple_to_list(
%%		  make_big_prefixes(LL,
%%				    erlang:make_tuple(?BIG_PREFIX_SIZE,0))),
%%    dump_big_prefixes(Out,BigPreTab),
    after
        %% Always release the file handle, also when a step above crashed.
        file:close(Out)
    end,
    ok.
    
   
%% Emit everything needed for one compose table.
%% Out     - io device to write to
%% Entries - list of {CodePoint,Subs,Result} entries of this table
%% Path    - reversed list of entry indices leading to this table
%%           ([] for the top table); it determines the C identifier
%% Index   - index assigned to the first entry of Entries
%% Order matters for the generated C: the tables of all sub-entries are
%% written first, then this table's hash array, then the table itself,
%% so every identifier is defined before it is referenced.
d(Out,Entries,Path,Index) ->
    ok = d_sub(Out,Entries,Path,Index),
    ok = d_top_hash(Out,Entries,Path,Index),
    d_top(Out,Entries,Path,Index).
%% Walk the entries of a table and emit, via d/4, the sub-table of every
%% entry that has a non-empty Subs list. C is the index of the current
%% entry; it is pushed onto the path D to name the sub-table, and the
%% sub-table's own entries start again at index 0.
%% Returns ok.
d_sub(_Out,[],_D,_C) ->
    ok;
d_sub(Out,[{_CP,Subs,_Res}|Rest],D,C) ->
    case Subs of
        [] ->
            %% Leaf entry: nothing to emit at this stage.
            ok;
        _ ->
            d(Out,Subs,[C|D],0)
    end,
    d_sub(Out,Rest,D,C+1).
%% Write the C array "static CompEntry <name>[] = { ... };" for the table
%% identified by the path D. The name comes from format_depth/1 and is
%% repeated in a trailing comment; the rows are written by d_top_1/4.
d_top(Out,L,D,C) ->
    TableName = format_depth(D),
    io:format(Out,"static CompEntry ~s[] = {~n",[TableName]),
    d_top_1(Out,L,D,C),
    io:format(Out,"}; /* ~s */ ~n",[TableName]).
    
%% Write one C initializer row per {CP,Subs,Res} entry.
%%  - An entry without subs becomes  {CP, Res, 0, NULL, NULL}
%%  - An entry with subs becomes     {CP, 0, NumSubs, <sub table>, hash_<sub table>}
%%    where the sub table name is format_depth([C|D]), C being the index
%%    of the entry within this table.
%% Rows are separated by ",\n"; the last row is followed by "\n" only.
%% Returns ok.
d_top_1(_Out,[],_D,_C) ->
    ok;
d_top_1(Out,[{CP,Subs,Res}|Rest],D,C) ->
    case Subs of
        [] ->
            io:format(Out,
                      "{~w, ~w, 0, NULL, NULL}",[CP,Res]);
        _ ->
            SubName = format_depth([C|D]),
            io:format(Out,
                      "{~w, 0, ~w, ~s, ~s}",[CP,length(Subs),
                                              SubName,
                                              "hash_"++SubName])
    end,
    %% No comma after the final row of the initializer.
    Separator = case Rest of
                    [] -> "~n";
                    _ -> ",~n"
                end,
    io:format(Out,Separator,[]),
    d_top_1(Out,Rest,D,C+1).


%% Write the C array "static int hash_<name>[HSize] = {...};" for the table
%% identified by the path D. The array has ?HASH_SIZE_FACTOR slots per
%% entry of List; it is built by d_top_hash_1/4 starting from a tuple
%% filled with -1 and is printed with ~p, so the Erlang tuple braces double
%% as the C initializer braces.
d_top_hash(Out,List,D,_C) ->
    HSize = length(List)*?HASH_SIZE_FACTOR,
    HashName = "hash_"++format_depth(D),
    io:format(Out,"static int ~s[~p] = ~n",[HashName,HSize]),
    EmptyTable = erlang:make_tuple(HSize,-1),
    Filled = d_top_hash_1(List,0,EmptyTable,HSize),
    io:format(Out,"~p; /* ~s */ ~n",[Filled,HashName]).

%% Insert the entries into the hash tuple Hash0 and return the result.
%% Each entry {CP,_,_} is placed in the first free slot found by
%% hash_search/3 starting at bucket CP rem HSize; the value stored is the
%% entry's position in the list, counted from Index0.
d_top_hash_1(Entries,Index0,Hash0,HSize) ->
    Insert = fun({CP,_,_},{Index,Acc}) ->
                     Bucket = hash_search(Acc,HSize,CP rem HSize),
                     %% Buckets are 0-based, tuple positions are 1-based.
                     {Index+1,erlang:setelement(Bucket+1,Acc,Index)}
             end,
    {_NextIndex,Filled} = lists:foldl(Insert,{Index0,Hash0},Entries),
    Filled.

%% Linear probing: return the first 0-based bucket, starting at Bucket and
%% wrapping around at HSize, whose slot in the tuple Hash still holds -1.
%% The test is a guard, so a bucket for which element/2 fails is simply
%% treated as "not free". Does not terminate if no slot holds -1.
hash_search(Hash,HSize,Bucket) ->
    if
        element(Bucket+1,Hash) =:= -1 ->
            Bucket;
        true ->
            hash_search(Hash,HSize,(Bucket + 1) rem HSize)
    end.

%% Build the C identifier of the compose table reached via the path D.
%% D is a reversed list of entry indices (innermost index first), e.g.
%%   format_depth([])     -> "compose_tab"
%%   format_depth([0])    -> "compose_tab_0"
%%   format_depth([12,3]) -> "compose_tab_3_12"
%% Each index is prefixed with its "_" separator, so no trailing separator
%% has to be stripped afterwards.
format_depth(D) ->
    lists:flatten(["compose_tab" |
                   ["_" ++ integer_to_list(X) || X <- lists:reverse(D)]]).


%% Build the candidate bitmap: for every entry {C,_,_} with C =< 4023, set
%% bit (C rem 32) in word (C div 32) + 1 - 24 of the tuple Table0 and
%% return the updated tuple. All other list elements are ignored.
%% The 24 is the same word offset that dump_prefixes/2 emits as
%% COMP_CANDIDATE_MAP_OFFSET.
%% NOTE(review): a code point below 768 (24*32) yields an index =< 0 and
%% element/2 then fails with badarg; presumably the input never contains
%% such a first code point - confirm against the data file.
make_prefix_table(Entries,Table0) ->
    SetBit = fun({C,_,_},Table) when C =< 4023 ->
                     Index = (C div 32) + 1 - 24,
                     Word = element(Index,Table),
                     setelement(Index,Table,Word bor (1 bsl (C rem 32)));
                (_Skipped,Table) ->
                     Table
             end,
    lists:foldl(SetBit,Table0,Entries).

%% Write the candidate bitmap as C: the COMP_CANDIDATE_MAP_OFFSET define,
%% the opening of the comp_candidate_map array, and then the words of L
%% (written, together with the closing brace, by dump_prefixes_1/2).
dump_prefixes(Out,L) ->
    Preamble = ["#define COMP_CANDIDATE_MAP_OFFSET 24~n",
                "static Uint32 comp_candidate_map[] = {~n"],
    lists:foreach(fun(Line) -> io:format(Out,Line,[]) end,Preamble),
    dump_prefixes_1(Out,L).
%% Write each integer of the list as an 8-digit, zero-padded hexadecimal
%% C literal with a U suffix, one per line. All lines but the last end
%% with a comma; after the last one the array is closed with "};".
%% The list must be non-empty (an empty list is a function_clause error).
dump_prefixes_1(Out,[Word|Rest]) ->
    case Rest of
        [] ->
            io:format(Out,"    0x~8.16.0BU~n",[Word]),
            io:format(Out,"};~n",[]);
        _ ->
            io:format(Out,"    0x~8.16.0BU,~n",[Word]),
            dump_prefixes_1(Out,Rest)
    end.

%% make_big_prefixes([],Table) ->
%%     Table;
%% make_big_prefixes([C|T],Table) ->
%%     Index = (C div 32) + 1,
%%     Pos = C rem 32,
%%     X = element(Index,Table),
%%     Y = X bor (1 bsl Pos),
%%     NewTab = setelement(Index,Table,Y),
%%     make_big_prefixes(T,NewTab).

%% dump_big_prefixes(Out,L) ->
%%     io:format(Out,"#define BIG_COMP_CANDIDATE_SIZE ~w~n", [?BIG_PREFIX_SIZE]),
%%     io:format(Out,"static Uint32 big_comp_candidate_map[] = {~n",[]),
%%     dump_prefixes_1(Out,L).
   
%% Split off the leading run of entries whose code point list starts with
%% Head. Returns {Picked,Rest}: Picked is the reversed accumulator Acc
%% followed by the matching entries with their first code point removed,
%% Rest is the untouched remainder of the list, starting at the first
%% entry with a different first code point.
pick(Entries,Head,Acc) ->
    StartsWithHead = fun({[First|_],_}) -> First =:= Head end,
    {Matching,Rest} = lists:splitwith(StartsWithHead,Entries),
    Stripped = [{Tail,N} || {[_|Tail],N} <- Matching],
    {lists:reverse(Acc,Stripped),Rest}.
    

%% Turn the flat list of {CodePoints,Result} entries into a tree of
%% {CodePoint,SubTree,Result} nodes. Consecutive entries sharing the same
%% first code point are collected with pick/3 and grouped recursively on
%% their remaining code points.
%% If the first entry consists of that single code point only, its Result
%% becomes the node's result and the entry itself is not passed on to the
%% subtree; otherwise the node's result is 0.
group([]) ->
    [];
group([{[H|More],N}|T]=L) ->
    {Result,Candidates} = case More of
                              [] -> {N,T};
                              _ -> {0,L}
                          end,
    {Part,Rest} = pick(Candidates,H,[]),
    [{H,group(Part),Result}| group(Rest)].
    
    
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2000-2016. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%

%% This program is used to generate a header file with data for
%% normalizing denormalized unicode.

%% The C header is generated from a text file containing tuples in the 
%% following format:
%% {RevList,Translation}
%% Where 'RevList' is a reversed list of the denormalized representation of
%% the character 'Translation'. An example would be the swedish character 
%% 'ö', which would be represented in the file as:
%% {[776,111],246}, as the denormalized representation of codepoint 246
%% is [111,776] (i.e. an 'o' followed by the "double dot" accent character 776),
%% while 'ä' instead is represented as {[776,97],228}, as the denormalized 
%% form would be [97,776] (same accent but an 'a' instead).
%% The datafile is generated from the table on Apple's developer connection
%% http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html
%% The generating is done whenever new data is present (i.e. dec.dat has 
%% to be changed) and not for every build. The product (the C header) is copied
%% to $ERL_TOP/erts/beam after generation and checked in.
%% The program and the data file are included for reference.

-module(dec).

-compile(export_all).

-define(HASH_SIZE_FACTOR,2).
-define(BIG_PREFIX_SIZE,392).

-define(INPUT_FILE_NAME,"dec.dat").
-define(OUTPUT_FILE_NAME,"erl_unicode_normalize.h").

%% Read the decomposition table from the file FName.
%% The file is consulted as a sequence of Erlang terms. Only two-element
%% tuples {RevList,Translation} whose RevList has more than one element are
%% kept (in file order); every other term is dropped.
%% An additional restriction on the first element of RevList (hd < 769) is
%% known from the original code but is disabled.
%% Crashes with badmatch if the file cannot be consulted.
read(FName) ->
    {ok,Terms} = file:consult(FName),
    %% The length test lives in a guard so that a term it cannot be applied
    %% to is skipped rather than raising an exception.
    KeepEntry = fun({RevList,_Translation}) when length(RevList) > 1 ->
                        true;
                   (_Other) ->
                        false
                end,
    lists:filter(KeepEntry,Terms).

%% Generate the C header ?OUTPUT_FILE_NAME from the data in ?INPUT_FILE_NAME.
%% Steps: read the input, group it into a tree with group/1, then write
%% the license/banner comment, the CompEntry typedef, the compose tables
%% with their hash tables (d/4), and finally the candidate bitmap
%% (make_prefix_table/2 + dump_prefixes/2).
%% Returns ok. Crashes (badmatch) if the input cannot be read or the output
%% cannot be opened. The output file is closed even if generation crashes
%% half way through, so the file handle is never leaked.
dec() ->
    L = read(?INPUT_FILE_NAME),
    G = group(L),
    {ok,Out} = file:open(?OUTPUT_FILE_NAME,[write]),
    try
        io:format
          (Out,
           "/*~n"
           "* %CopyrightBegin%~n"
           "*~n"
           "* Copyright Ericsson AB 1999-2010. All Rights Reserved.~n"
           "*~n"
           "* Licensed under the Apache License, Version 2.0 (the \"License\");~n"
           "* you may not use this file except in compliance with the License.~n"
           "* You may obtain a copy of the License at~n"
           "*~n"
           "*     http://www.apache.org/licenses/LICENSE-2.0~n"
           "*~n"
           "* Unless required by applicable law or agreed to in writing, software~n"
           "* distributed under the License is distributed on an \"AS IS\" BASIS,~n"
           "* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.~n"
           "* See the License for the specific language governing permissions and~n"
           "* limitations under the License.~n"
           "*~n"
           "* %CopyrightEnd%~n"
           "*/~n"
           "/*~n"
           "* This file is automatically generated by ~p.erl, "
           "do not edit manually~n"
           "*/~n",
           [?MODULE]),

        io:format(Out,
                  "#define HASH_SIZE_FACTOR ~w~n"
                  "typedef struct _compose_entry {~n"
                  "    Uint16 c;~n"
                  "    Uint16 res;~n"
                  "    Uint16 num_subs;~n"
                  "    struct _compose_entry *subs;~n"
                  "    int *hash;~n"
                  "} CompEntry;~n~n"
                  "static int compose_tab_size = ~p;~n", 
                  [?HASH_SIZE_FACTOR,length(G)]),
        d(Out,G,[],0),
        %% 102 words: make_prefix_table/2 accepts code points up to 4023 and
        %% maps them to word (C div 32) + 1 - 24, so the highest index is
        %% (4023 div 32) + 1 - 24 = 102.
        PreTab = tuple_to_list(make_prefix_table(G,erlang:make_tuple(102,0))),
        dump_prefixes(Out,PreTab)
%% Using this cuts down on the searching in the
%% actual implementation, but wastes memory with little real gain..
%%    LL = lists:flatten([PartList || {PartList,_} <- L]),
%%    BigPreTab = tuple_to_list(
%%		  make_big_prefixes(LL,
%%				    erlang:make_tuple(?BIG_PREFIX_SIZE,0))),
%%    dump_big_prefixes(Out,BigPreTab),
    after
        %% Always release the file handle, also when a step above crashed.
        file:close(Out)
    end,
    ok.
    
   
%% Emit everything needed for one compose table.
%% Out     - io device to write to
%% Entries - list of {CodePoint,Subs,Result} entries of this table
%% Path    - reversed list of entry indices leading to this table
%%           ([] for the top table); it determines the C identifier
%% Index   - index assigned to the first entry of Entries
%% Order matters for the generated C: the tables of all sub-entries are
%% written first, then this table's hash array, then the table itself,
%% so every identifier is defined before it is referenced.
d(Out,Entries,Path,Index) ->
    ok = d_sub(Out,Entries,Path,Index),
    ok = d_top_hash(Out,Entries,Path,Index),
    d_top(Out,Entries,Path,Index).
%% Walk the entries of a table and emit, via d/4, the sub-table of every
%% entry that has a non-empty Subs list. C is the index of the current
%% entry; it is pushed onto the path D to name the sub-table, and the
%% sub-table's own entries start again at index 0.
%% Returns ok.
d_sub(_Out,[],_D,_C) ->
    ok;
d_sub(Out,[{_CP,Subs,_Res}|Rest],D,C) ->
    case Subs of
        [] ->
            %% Leaf entry: nothing to emit at this stage.
            ok;
        _ ->
            d(Out,Subs,[C|D],0)
    end,
    d_sub(Out,Rest,D,C+1).
%% Write the C array "static CompEntry <name>[] = { ... };" for the table
%% identified by the path D. The name comes from format_depth/1 and is
%% repeated in a trailing comment; the rows are written by d_top_1/4.
d_top(Out,L,D,C) ->
    TableName = format_depth(D),
    io:format(Out,"static CompEntry ~s[] = {~n",[TableName]),
    d_top_1(Out,L,D,C),
    io:format(Out,"}; /* ~s */ ~n",[TableName]).
    
%% Write one C initializer row per {CP,Subs,Res} entry.
%%  - An entry without subs becomes  {CP, Res, 0, NULL, NULL}
%%  - An entry with subs becomes     {CP, 0, NumSubs, <sub table>, hash_<sub table>}
%%    where the sub table name is format_depth([C|D]), C being the index
%%    of the entry within this table.
%% Rows are separated by ",\n"; the last row is followed by "\n" only.
%% Returns ok.
d_top_1(_Out,[],_D,_C) ->
    ok;
d_top_1(Out,[{CP,Subs,Res}|Rest],D,C) ->
    case Subs of
        [] ->
            io:format(Out,
                      "{~w, ~w, 0, NULL, NULL}",[CP,Res]);
        _ ->
            SubName = format_depth([C|D]),
            io:format(Out,
                      "{~w, 0, ~w, ~s, ~s}",[CP,length(Subs),
                                              SubName,
                                              "hash_"++SubName])
    end,
    %% No comma after the final row of the initializer.
    Separator = case Rest of
                    [] -> "~n";
                    _ -> ",~n"
                end,
    io:format(Out,Separator,[]),
    d_top_1(Out,Rest,D,C+1).


%% Write the C array "static int hash_<name>[HSize] = {...};" for the table
%% identified by the path D. The array has ?HASH_SIZE_FACTOR slots per
%% entry of List; it is built by d_top_hash_1/4 starting from a tuple
%% filled with -1 and is printed with ~p, so the Erlang tuple braces double
%% as the C initializer braces.
d_top_hash(Out,List,D,_C) ->
    HSize = length(List)*?HASH_SIZE_FACTOR,
    HashName = "hash_"++format_depth(D),
    io:format(Out,"static int ~s[~p] = ~n",[HashName,HSize]),
    EmptyTable = erlang:make_tuple(HSize,-1),
    Filled = d_top_hash_1(List,0,EmptyTable,HSize),
    io:format(Out,"~p; /* ~s */ ~n",[Filled,HashName]).

%% Insert the entries into the hash tuple Hash0 and return the result.
%% Each entry {CP,_,_} is placed in the first free slot found by
%% hash_search/3 starting at bucket CP rem HSize; the value stored is the
%% entry's position in the list, counted from Index0.
d_top_hash_1(Entries,Index0,Hash0,HSize) ->
    Insert = fun({CP,_,_},{Index,Acc}) ->
                     Bucket = hash_search(Acc,HSize,CP rem HSize),
                     %% Buckets are 0-based, tuple positions are 1-based.
                     {Index+1,erlang:setelement(Bucket+1,Acc,Index)}
             end,
    {_NextIndex,Filled} = lists:foldl(Insert,{Index0,Hash0},Entries),
    Filled.

%% Linear probing: return the first 0-based bucket, starting at Bucket and
%% wrapping around at HSize, whose slot in the tuple Hash still holds -1.
%% The test is a guard, so a bucket for which element/2 fails is simply
%% treated as "not free". Does not terminate if no slot holds -1.
hash_search(Hash,HSize,Bucket) ->
    if
        element(Bucket+1,Hash) =:= -1 ->
            Bucket;
        true ->
            hash_search(Hash,HSize,(Bucket + 1) rem HSize)
    end.

%% Build the C identifier of the compose table reached via the path D.
%% D is a reversed list of entry indices (innermost index first), e.g.
%%   format_depth([])     -> "compose_tab"
%%   format_depth([0])    -> "compose_tab_0"
%%   format_depth([12,3]) -> "compose_tab_3_12"
%% Each index is prefixed with its "_" separator, so no trailing separator
%% has to be stripped afterwards.
format_depth(D) ->
    lists:flatten(["compose_tab" |
                   ["_" ++ integer_to_list(X) || X <- lists:reverse(D)]]).


%% Build the candidate bitmap: for every entry {C,_,_} with C =< 4023, set
%% bit (C rem 32) in word (C div 32) + 1 - 24 of the tuple Table0 and
%% return the updated tuple. All other list elements are ignored.
%% The 24 is the same word offset that dump_prefixes/2 emits as
%% COMP_CANDIDATE_MAP_OFFSET.
%% NOTE(review): a code point below 768 (24*32) yields an index =< 0 and
%% element/2 then fails with badarg; presumably the input never contains
%% such a first code point - confirm against the data file.
make_prefix_table(Entries,Table0) ->
    SetBit = fun({C,_,_},Table) when C =< 4023 ->
                     Index = (C div 32) + 1 - 24,
                     Word = element(Index,Table),
                     setelement(Index,Table,Word bor (1 bsl (C rem 32)));
                (_Skipped,Table) ->
                     Table
             end,
    lists:foldl(SetBit,Table0,Entries).

%% Write the candidate bitmap as C: the COMP_CANDIDATE_MAP_OFFSET define,
%% the opening of the comp_candidate_map array, and then the words of L
%% (written, together with the closing brace, by dump_prefixes_1/2).
dump_prefixes(Out,L) ->
    Preamble = ["#define COMP_CANDIDATE_MAP_OFFSET 24~n",
                "static Uint32 comp_candidate_map[] = {~n"],
    lists:foreach(fun(Line) -> io:format(Out,Line,[]) end,Preamble),
    dump_prefixes_1(Out,L).
%% Write each integer of the list as an 8-digit, zero-padded hexadecimal
%% C literal with a U suffix, one per line. All lines but the last end
%% with a comma; after the last one the array is closed with "};".
%% The list must be non-empty (an empty list is a function_clause error).
dump_prefixes_1(Out,[Word|Rest]) ->
    case Rest of
        [] ->
            io:format(Out,"    0x~8.16.0BU~n",[Word]),
            io:format(Out,"};~n",[]);
        _ ->
            io:format(Out,"    0x~8.16.0BU,~n",[Word]),
            dump_prefixes_1(Out,Rest)
    end.

%% make_big_prefixes([],Table) ->
%%     Table;
%% make_big_prefixes([C|T],Table) ->
%%     Index = (C div 32) + 1,
%%     Pos = C rem 32,
%%     X = element(Index,Table),
%%     Y = X bor (1 bsl Pos),
%%     NewTab = setelement(Index,Table,Y),
%%     make_big_prefixes(T,NewTab).

%% dump_big_prefixes(Out,L) ->
%%     io:format(Out,"#define BIG_COMP_CANDIDATE_SIZE ~w~n", [?BIG_PREFIX_SIZE]),
%%     io:format(Out,"static Uint32 big_comp_candidate_map[] = {~n",[]),
%%     dump_prefixes_1(Out,L).
   
%% Split off the leading run of entries whose code point list starts with
%% Head. Returns {Picked,Rest}: Picked is the reversed accumulator Acc
%% followed by the matching entries with their first code point removed,
%% Rest is the untouched remainder of the list, starting at the first
%% entry with a different first code point.
pick(Entries,Head,Acc) ->
    StartsWithHead = fun({[First|_],_}) -> First =:= Head end,
    {Matching,Rest} = lists:splitwith(StartsWithHead,Entries),
    Stripped = [{Tail,N} || {[_|Tail],N} <- Matching],
    {lists:reverse(Acc,Stripped),Rest}.
    

%% Turn the flat list of {CodePoints,Result} entries into a tree of
%% {CodePoint,SubTree,Result} nodes. Consecutive entries sharing the same
%% first code point are collected with pick/3 and grouped recursively on
%% their remaining code points.
%% If the first entry consists of that single code point only, its Result
%% becomes the node's result and the entry itself is not passed on to the
%% subtree; otherwise the node's result is 0.
group([]) ->
    [];
group([{[H|More],N}|T]=L) ->
    {Result,Candidates} = case More of
                              [] -> {N,T};
                              _ -> {0,L}
                          end,
    {Part,Rest} = pick(Candidates,H,[]),
    [{H,group(Part),Result}| group(Rest)].