aboutsummaryrefslogtreecommitdiffstats
path: root/lib/xmerl/src/xmerl_sax_parser.erl
diff options
context:
space:
mode:
Diffstat (limited to 'lib/xmerl/src/xmerl_sax_parser.erl')
-rw-r--r--lib/xmerl/src/xmerl_sax_parser.erl171
1 files changed, 133 insertions, 38 deletions
diff --git a/lib/xmerl/src/xmerl_sax_parser.erl b/lib/xmerl/src/xmerl_sax_parser.erl
index 1aef6c58c4..e383c4c349 100644
--- a/lib/xmerl/src/xmerl_sax_parser.erl
+++ b/lib/xmerl/src/xmerl_sax_parser.erl
@@ -64,7 +64,7 @@
%% Description: Parse file containing an XML document.
%%----------------------------------------------------------------------
file(Name,Options) ->
- case file:open(Name, [raw, read,binary]) of
+ case file:open(Name, [raw, read_ahead, read,binary]) of
{error, Reason} ->
{error,{Name, file:format_error(Reason)}};
{ok, FD} ->
@@ -120,21 +120,22 @@ stream(Xml, Options, InputType) when is_binary(Xml), is_list(Options) ->
normal ->
parse
end,
- case detect_charset(Xml, State) of
- {error, Reason} -> {fatal_error,
- {
- State#xmerl_sax_parser_state.current_location,
- State#xmerl_sax_parser_state.entity,
- 1
- },
- Reason,
- [],
- State#xmerl_sax_parser_state.event_state};
- {Xml1, State1} ->
- parse_binary(Xml1,
- State1#xmerl_sax_parser_state{input_type = InputType},
- ParseFunction)
- end
+ try
+ {Xml1, State1} = detect_charset(Xml, State),
+ parse_binary(Xml1,
+ State1#xmerl_sax_parser_state{input_type = InputType},
+ ParseFunction)
+ catch
+ throw:{fatal_error, {State2, Reason}} ->
+ {fatal_error,
+ {
+ State2#xmerl_sax_parser_state.current_location,
+ State2#xmerl_sax_parser_state.entity,
+ 1
+ },
+ Reason, [],
+ State2#xmerl_sax_parser_state.event_state}
+ end
end.
%%----------------------------------------------------------------------
@@ -156,8 +157,8 @@ parse_binary(Xml, #xmerl_sax_parser_state{encoding={utf16,big}}=State, F) ->
xmerl_sax_parser_utf16be:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding=latin1}=State, F) ->
xmerl_sax_parser_latin1:F(Xml, State);
-parse_binary(_, #xmerl_sax_parser_state{encoding=Enc}, _) ->
- {error, lists:flatten(io_lib:format("Charcter set ~p not supported", [Enc]))}.
+parse_binary(_, #xmerl_sax_parser_state{encoding=Enc}, State) ->
+ ?fatal_error(State, lists:flatten(io_lib:format("Charcter set ~p not supported", [Enc]))).
%%----------------------------------------------------------------------
%% Function: initial_state/0
@@ -211,8 +212,7 @@ parse_options([{entity, Entity} |Options], State) ->
parse_options([skip_external_dtd |Options], State) ->
parse_options(Options, State#xmerl_sax_parser_state{skip_external_dtd = true});
parse_options([O |_], _State) ->
- {error,
- lists:flatten(io_lib:format("Option: ~p not supported", [O]))}.
+ {error, lists:flatten(io_lib:format("Option: ~p not supported", [O]))}.
check_encoding_option(E) when E==utf8; E=={utf16,little}; E=={utf16,big};
@@ -230,16 +230,10 @@ check_encoding_option(E) ->
%% Output: {utf8|utf16le|utf16be|iso8859, Xml, State}
%% Description: Detects which character set is used in a binary stream.
%%----------------------------------------------------------------------
-detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = undefined} = _) ->
- {error, "Can't detect character encoding due to no indata"};
-detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = CFun,
- continuation_state = CState} = State) ->
- case CFun(CState) of
- {<<>>, _} ->
- {error, "Can't detect character encoding due to lack of indata"};
- {NewBytes, NewContState} ->
- detect_charset(NewBytes, State#xmerl_sax_parser_state{continuation_state = NewContState})
- end;
+detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
+ ?fatal_error(State, "Can't detect character encoding due to lack of indata");
+detect_charset(<<>>, State) ->
+ cf(<<>>, State, fun detect_charset/2);
detect_charset(Bytes, State) ->
case unicode:bom_to_encoding(Bytes) of
{latin1, 0} ->
@@ -249,25 +243,47 @@ detect_charset(Bytes, State) ->
{RealBytes, State#xmerl_sax_parser_state{encoding=Enc}}
end.
+detect_charset_1(<<16#00>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#00, 16#3C>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#00, 16#3C, 16#00>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00, 16#3F, _/binary>> = Xml, State) ->
{Xml, State#xmerl_sax_parser_state{encoding={utf16, big}}};
+detect_charset_1(<<16#3C>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#00>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#00, 16#3F>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F, 16#00, _/binary>> = Xml, State) ->
{Xml, State#xmerl_sax_parser_state{encoding={utf16, little}}};
-detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>> = Xml, State) ->
- case parse_xml_directive(Xml2) of
+detect_charset_1(<<16#3C>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#3F>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#3F, 16#78>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>>, State) ->
+ {Xml3, State1} = read_until_end_of_xml_directive(Xml2, State),
+ case parse_xml_directive(Xml3) of
{error, Reason} ->
- {error, Reason};
+ ?fatal_error(State, Reason);
AttrList ->
case lists:keysearch("encoding", 1, AttrList) of
{value, {_, E}} ->
case convert_encoding(E) of
{error, Reason} ->
- {error, Reason};
+ ?fatal_error(State, Reason);
Enc ->
- {Xml, State#xmerl_sax_parser_state{encoding=Enc}}
+ {<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml3/binary>>,
+ State1#xmerl_sax_parser_state{encoding=Enc}}
end;
_ ->
- {Xml, State}
+ {<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml3/binary>>, State1}
end
end;
detect_charset_1(Xml, State) ->
@@ -377,7 +393,7 @@ parse_value_1(<<C, Rest/binary>>, Stop, Acc) ->
parse_value_1(Rest, Stop, [C |Acc]).
%%======================================================================
-%%Default functions
+%% Default functions
%%======================================================================
%%----------------------------------------------------------------------
%% Function: default_event_cb(Event, LineNo, State) -> Result
@@ -393,7 +409,7 @@ default_event_cb(_Event, _LineNo, State) ->
%%----------------------------------------------------------------------
%% Function: default_continuation_cb(IoDevice) -> Result
%% IoDevice = iodevice()
-%% Output: Result = {[char()], State}
+%% Output: Result = {binary(), IoDevice}
%% Description: Default continuation callback reading blocks.
%%----------------------------------------------------------------------
default_continuation_cb(IoDevice) ->
@@ -403,3 +419,82 @@ default_continuation_cb(IoDevice) ->
{ok, FileBin} ->
{FileBin, IoDevice}
end.
+
+%%----------------------------------------------------------------------
+%% Function: read_until_end_of_xml_directive(Rest, State) -> Result
+%% Rest = binary()
+%% Output: Result = {binary(), State}
+%% Description: Reads a utf8 or latin1 until it finds '?>'
+%%----------------------------------------------------------------------
+read_until_end_of_xml_directive(Rest, State) ->
+ case binary:match(Rest, <<"?>">>) of
+ nomatch ->
+ case cf(Rest, State) of
+ {<<>>, _} ->
+ ?fatal_error(State, "Can't detect character encoding due to lack of indata");
+ {NewBytes, NewState} ->
+ read_until_end_of_xml_directive(NewBytes, NewState)
+ end;
+ _ ->
+ {Rest, State}
+ end.
+
+
+%%----------------------------------------------------------------------
+%% Function : cf(Rest, State) -> Result
+%% Parameters: Rest = binary()
+%% State = #xmerl_sax_parser_state{}
+%% NextCall = fun()
+%% Result : {Rest, State}
+%% Description: Function that uses provided fun to read another chunk from
+%% input stream and calls the fun in NextCall.
+%%----------------------------------------------------------------------
+cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
+ ?fatal_error(State, "Continuation function undefined");
+cf(Rest, #xmerl_sax_parser_state{continuation_fun = CFun, continuation_state = CState} = State) ->
+ Result =
+ try
+ CFun(CState)
+ catch
+ throw:ErrorTerm ->
+ ?fatal_error(State, ErrorTerm);
+ exit:Reason ->
+ ?fatal_error(State, {'EXIT', Reason})
+ end,
+ case Result of
+ {<<>>, _} ->
+ ?fatal_error(State, "Can't detect character encoding due to lack of indata");
+ {NewBytes, NewContState} ->
+ {<<Rest/binary, NewBytes/binary>>,
+ State#xmerl_sax_parser_state{continuation_state = NewContState}}
+ end.
+
+%%----------------------------------------------------------------------
+%% Function : cf(Rest, State, NextCall) -> Result
+%% Parameters: Rest = binary()
+%% State = #xmerl_sax_parser_state{}
+%% NextCall = fun()
+%% Result : {Rest, State}
+%% Description: Function that uses provided fun to read another chunk from
+%% input stream and calls the fun in NextCall.
+%%----------------------------------------------------------------------
+cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State, _) ->
+ ?fatal_error(State, "Continuation function undefined");
+cf(Rest, #xmerl_sax_parser_state{continuation_fun = CFun, continuation_state = CState} = State,
+ NextCall) ->
+ Result =
+ try
+ CFun(CState)
+ catch
+ throw:ErrorTerm ->
+ ?fatal_error(State, ErrorTerm);
+ exit:Reason ->
+ ?fatal_error(State, {'EXIT', Reason})
+ end,
+ case Result of
+ {<<>>, _} ->
+ ?fatal_error(State, "Can't detect character encoding due to lack of indata");
+ {NewBytes, NewContState} ->
+ NextCall(<<Rest/binary, NewBytes/binary>>,
+ State#xmerl_sax_parser_state{continuation_state = NewContState})
+ end.