From d00c1adf7e12f791781995c9469a1acf94ddfb93 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Mon, 19 Nov 2018 13:44:12 +0100 Subject: unicode_util did not handle binary input data correctly gc_zwj sent binaries recursivly to gc_extend/3 which didn't handle can't handle them. ERL-777 --- lib/stdlib/test/unicode_util_SUITE.erl | 27 ++++++++++++++++++++------- lib/stdlib/uc_spec/gen_unicode_mod.escript | 2 +- 2 files changed, 21 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/test/unicode_util_SUITE.erl b/lib/stdlib/test/unicode_util_SUITE.erl index 962b307b07..044b4e5834 100644 --- a/lib/stdlib/test/unicode_util_SUITE.erl +++ b/lib/stdlib/test/unicode_util_SUITE.erl @@ -126,17 +126,30 @@ verify_gc(Line0, N, Acc) -> %io:format("Line: ~s~n",[Line]), [Data|_Comments] = string:tokens(Line, "#"), - %io:format("Data: ~w~n",[string:tokens(Data, " \t")]), + %% io:format("Data: ~w~n",[string:tokens(Data, " \t")]), {Str,Res} = gc_test_data(string:tokens(Data, " \t"), [], [[]]), - try - Res = fetch(Str, fun unicode_util:gc/1), - Acc - catch _Cl:{badmatch, Other} -> + %% io:format("InputStr: ~w ~w~n",[Str,unicode:characters_to_binary(Str)]), + case verify_gc(Str, Res, N, Line) andalso + verify_gc(unicode:characters_to_binary(Str), Res, N, Line0) of + true -> Acc; + false -> Acc+1 + end. + +verify_gc({error,_,[CP|_]}=Err, _Res, N, Line) -> + IsSurrogate = 16#D800 =< CP andalso CP =< 16#DFFF, + %% Surrogat is not valid in utf8 encoding only utf16 + IsSurrogate orelse + io:format("~w: ~ts~n Error in unicode:characters_to_binary ~w~n", [N, Line, Err]), + IsSurrogate; +verify_gc(Str, Res, N, Line) -> + try fetch(Str, fun unicode_util:gc/1) of + Res -> true; + Other -> io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[N, Line, Str, Str]), io:format("Expected: ~p~n", [Res]), io:format("Got: ~w~n", [Other]), - Acc+1; - Cl:R:Stacktrace -> + false + catch Cl:R:Stacktrace -> io:format("~p: ~ts => |~tp|~n",[N, Line, Str]), io:format("Expected: ~p~n", [Res]), erlang:raise(Cl,R,Stacktrace) diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript index fe5a860d45..535f01a1c5 100755 --- a/lib/stdlib/uc_spec/gen_unicode_mod.escript +++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript @@ -646,7 +646,7 @@ gen_gc(Fd, GBP) -> io:put_chars(Fd, "is_emodifier(_) -> false.\n\n"), io:put_chars(Fd, "gc_zwj(R0, Acc) ->\n case cp(R0) of\n"), - GenZWJGlue = fun(Range) -> io:format(Fd, "~8c~s gc_extend(R1, R0, [CP|Acc]);\n", + GenZWJGlue = fun(Range) -> io:format(Fd, "~8c~s gc_extend(cp(R1), R0, [CP|Acc]);\n", [$\s,gen_case_clause(Range)]) end, [GenZWJGlue(CP) || CP <- merge_ranges(maps:get(glue_after_zwj,GBP))], GenZWJEBG = fun(Range) -> io:format(Fd, "~8c~s gc_e_cont(R1, [CP|Acc]);\n", -- cgit v1.2.3