Prevent xmerl from over-normalizing character references in attributes

Section 3.3.3 of the XML Recommendation gives the rules for attribute-value normalization. One of those rules requires that character references not be re-normalized after being replaced with the referenced characters: "For a character reference, append the referenced character to the normalized value." And, in particular: "Note that if the unnormalized attribute value contains a character reference to a white space character other than space (#x20), the normalized value contains the referenced character itself (#xD, #xA or #x9)."

Source: http://www.w3.org/TR/xml/#AVNormalize

In xmerl_scan, however, character references in attributes are normalized an extra time after replacement. For example, the character reference "&#xA;" in the following XML document gets normalized (incorrectly) into a space when parsed:

    2> xmerl_scan:string("<root x='&#xA;'/>").
    {... [{xmlAttribute,x,[],[],[],[],1,[]," ",false}] ...}

This short patch restores the correct behavior:

    2> xmerl_scan:string("<root x='&#xA;'/>").
    {... [{xmlAttribute,x,[],[],[],[],1,[],"\n",false}] ...}

NOTE: This change does not include tests because I could not find a test suite for xmerl.
author: Tom Moertel <tom@smashcode.com> 2011-04-28 17:15:16 -0400
committer: Tom Moertel <tom@smashcode.com> 2011-04-28 17:15:16 -0400
commit: a011451e7e40690b533003802ee54f7c6f77e16e (patch)
tree: 7785b731ce89bebba8c873a992f5fe0111c3264f
parent: 3e815447cafbcb704bac1fac3d195e94def7080f (diff)
download: otp-a011451e7e40690b533003802ee54f7c6f77e16e.tar.gz
otp-a011451e7e40690b533003802ee54f7c6f77e16e.tar.bz2
otp-a011451e7e40690b533003802ee54f7c6f77e16e.zip
1 files changed, 10 insertions, 2 deletions
diff --git a/lib/xmerl/src/xmerl_scan.erl b/lib/xmerl/src/xmerl_scan.erl
index e07d495fc7..e47e74db6a 100644
--- a/lib/xmerl/src/xmerl_scan.erl
+++ b/lib/xmerl/src/xmerl_scan.erl
@@ -2283,8 +2283,16 @@ scan_att_chars("&" ++ T, S0, Delim, Acc, TmpAcc,AT,IsNorm) -> % Reference
 	true ->
 	    scan_att_chars(T1,S1,Delim,[ExpRef|Acc],[ExpRef|TmpAcc],AT,IsNorm);
 	_ ->
-	    scan_att_chars(string_to_char_set(S#xmerl_scanner.encoding,ExpRef)
-			   ++ T1, S1, Delim, Acc,TmpAcc, AT,IsNorm)
+            Ch = string_to_char_set(S#xmerl_scanner.encoding, ExpRef),
+            case T of
+                "#" ++ _ ->
+                    %% normalization rules (sec 3.3.3) require that for
+                    %% character references, the referenced character be
+                    %% added directly to the normalized value
+                    scan_att_chars(T1, S1, Delim, Ch ++ Acc,TmpAcc, AT,IsNorm);
+                _ ->
+                    scan_att_chars(Ch ++ T1, S1, Delim, Acc,TmpAcc, AT,IsNorm)
+            end
     end;
 scan_att_chars("<" ++ _T, S0, _Delim, _Acc,_, _,_) -> % Tags not allowed here
     ?fatal(unexpected_char, S0);
author	Tom Moertel <tom@smashcode.com>	2011-04-28 17:15:16 -0400
committer	Tom Moertel <tom@smashcode.com>	2011-04-28 17:15:16 -0400
commit	a011451e7e40690b533003802ee54f7c6f77e16e (patch)
tree	7785b731ce89bebba8c873a992f5fe0111c3264f
parent	3e815447cafbcb704bac1fac3d195e94def7080f (diff)
download	otp-a011451e7e40690b533003802ee54f7c6f77e16e.tar.gz otp-a011451e7e40690b533003802ee54f7c6f77e16e.tar.bz2 otp-a011451e7e40690b533003802ee54f7c6f77e16e.zip