diff options
Diffstat (limited to 'lib/xmerl/src/xmerl_scan.erl')
-rw-r--r-- | lib/xmerl/src/xmerl_scan.erl | 73 |
1 files changed, 44 insertions, 29 deletions
diff --git a/lib/xmerl/src/xmerl_scan.erl b/lib/xmerl/src/xmerl_scan.erl index 4e5cc59d8f..25c6547497 100644 --- a/lib/xmerl/src/xmerl_scan.erl +++ b/lib/xmerl/src/xmerl_scan.erl @@ -1,19 +1,19 @@ %% %% %CopyrightBegin% -%% -%% Copyright Ericsson AB 2003-2009. All Rights Reserved. -%% +%% +%% Copyright Ericsson AB 2003-2011. All Rights Reserved. +%% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in %% compliance with the License. You should have received a copy of the %% Erlang Public License along with this software. If not, it can be %% retrieved online at http://www.erlang.org/. -%% +%% %% Software distributed under the License is distributed on an "AS IS" %% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See %% the License for the specific language governing rights and limitations %% under the License. -%% +%% %% %CopyrightEnd% %% @@ -34,7 +34,9 @@ %% See also <a href="xmerl_examples.html">tutorial</a> on customization %% functions. %% </p> +%% <p> %% Possible options are: +%% </p> %% <dl> %% <dt><code>{acc_fun, Fun}</code></dt> %% <dd>Call back function to accumulate contents of entity.</dd> @@ -100,6 +102,7 @@ %% declaration. </dd> %% </dl> + -module(xmerl_scan). -vsn('0.20'). -date('03-09-16'). @@ -2071,10 +2074,10 @@ scan_element(T, S, Pos, Name, StartL, StartC, Attrs, Lang, Parents, {AttName, NamespaceInfo, T1, S1} = scan_name(T, S), {T2, S2} = scan_eq(T1, S1), {AttType,_DefaultDecl} = get_att_type(S2,AttName,Name), - {AttValue, T3, S3,IsNorm} = scan_att_value(T2, S2, AttType), + {AttValue, T3a, S3a,IsNorm} = scan_att_value(T2, S2, AttType), %% check_default_value(S3,DefaultDecl,AttValue), NewNS = check_namespace(AttName, NamespaceInfo, AttValue, NS), - wfc_whitespace_betw_attrs(hd(T3),S3), + {T3,S3} = wfc_whitespace_betw_attrs(T3a,S3a), ?strip4, AttrPos = case Attrs of [] -> @@ -2273,7 +2276,7 @@ scan_att_chars([H|T], S0, H, Acc, TmpAcc,AttType,IsNorm) -> % End quote true -> normalize(Acc,S,IsNorm) end, - {lists:reverse(Acc2), T, S2,IsNorm2}; + {lists:flatten(lists:reverse(Acc2)), T, S2,IsNorm2}; scan_att_chars("&" ++ T, S0, Delim, Acc, TmpAcc,AT,IsNorm) -> % Reference ?bump_col(1), {ExpRef, T1, S1} = scan_reference(T, S), @@ -2281,8 +2284,16 @@ scan_att_chars("&" ++ T, S0, Delim, Acc, TmpAcc,AT,IsNorm) -> % Reference true -> scan_att_chars(T1,S1,Delim,[ExpRef|Acc],[ExpRef|TmpAcc],AT,IsNorm); _ -> - scan_att_chars(string_to_char_set(S#xmerl_scanner.encoding,ExpRef) - ++ T1, S1, Delim, Acc,TmpAcc, AT,IsNorm) + Ch = string_to_char_set(S#xmerl_scanner.encoding, ExpRef), + case T of + "#" ++ _ -> + %% normalization rules (sec 3.3.3) require that for + %% character references, the referenced character be + %% added directly to the normalized value + scan_att_chars(T1, S1, Delim, Ch ++ Acc,TmpAcc, AT,IsNorm); + _ -> + scan_att_chars(Ch ++ T1, S1, Delim, Acc,TmpAcc, AT,IsNorm) + end end; scan_att_chars("<" ++ _T, S0, _Delim, _Acc,_, _,_) -> % Tags not allowed here ?fatal(unexpected_char, S0); @@ -2602,8 +2613,7 @@ scan_reference("#x" ++ T, S0) -> %% [66] CharRef ?bump_col(1), if hd(T) /= $; -> - {[Ch], T2, S2} = scan_char_ref_hex(T, S, 0), - {to_char_set(S2#xmerl_scanner.encoding,Ch),T2,S2}; + scan_char_ref_hex(T, S, 0); true -> ?fatal(invalid_char_ref, S) end; @@ -3274,12 +3284,17 @@ wfc_legal_char(Ch,S) -> end. -wfc_whitespace_betw_attrs(WS,_S) when ?whitespace(WS) -> - ok; -wfc_whitespace_betw_attrs($/,_S) -> - ok; -wfc_whitespace_betw_attrs($>,_S) -> - ok; +wfc_whitespace_betw_attrs([WS |_]=L,S) when ?whitespace(WS) -> + {L,S}; +wfc_whitespace_betw_attrs([$/ |_]=L,S) -> + {L,S}; +wfc_whitespace_betw_attrs([$> |_]=L,S) -> + {L,S}; +wfc_whitespace_betw_attrs([],S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> wfc_whitespace_betw_attrs(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); wfc_whitespace_betw_attrs(_,S) -> ?fatal({whitespace_required_between_attributes},S). @@ -3452,14 +3467,14 @@ scan_entity_value("%" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) -> %% {system,URI} or {public,URI} %% Included in literal. {ExpRef,Sx}=fetch_not_parse(Tuple,S1), - {EntV,_,_S2} = - scan_entity_value(ExpRef, Sx, no_delim,[], + {EntV, _, S5} = + scan_entity_value(ExpRef, Sx, no_delim,[], PERefName,parameter,[]), %% should do an update Write(parameter_entity) %% so next expand_pe_reference is faster - {EntV,_S2}; + {string_to_char_set(S5#xmerl_scanner.encoding, EntV), S5}; ExpRef -> - {ExpRef,S1} + {string_to_char_set(S1#xmerl_scanner.encoding, ExpRef) ,S1} end, %% single or duoble qoutes are not treated as delimeters %% in passages "included in literal" @@ -3908,7 +3923,7 @@ schemaLocations(#xmlElement{attributes=Atts,xmlbase=_Base}) -> case lists:dropwhile(Pred,Atts) of [#xmlAttribute{value=Paths}|_] -> - case string:tokens(Paths," ") of + case string:tokens(Paths," \n\t\r") of L when length(L) > 0 -> case length(L) rem 2 of 0 -> @@ -4020,12 +4035,12 @@ utf8_2_ucs([A|Rest]) when A < 16#80 -> utf8_2_ucs([A|Rest]) -> {{error,{bad_character,A}},Rest}. -to_char_set("iso-10646-utf-1",Ch) -> - [Ch]; -to_char_set(UTF8,Ch) when UTF8 =:= "utf-8"; UTF8 =:= undefined -> - ucs_2_utf8(Ch); -to_char_set(_,Ch) -> - [Ch]. +%% to_char_set("iso-10646-utf-1",Ch) -> +%% [Ch]; +%% to_char_set(UTF8,Ch) when UTF8 =:= "utf-8"; UTF8 =:= undefined -> +%% ucs_2_utf8(Ch); +%% to_char_set(_,Ch) -> +%% [Ch]. ucs_2_utf8(Ch) when Ch < 128 -> %% 0vvvvvvv |