aboutsummaryrefslogtreecommitdiffstats
path: root/lib/xmerl/src/xmerl_sax_parser.erl
diff options
context:
space:
mode:
Diffstat (limited to 'lib/xmerl/src/xmerl_sax_parser.erl')
-rw-r--r--lib/xmerl/src/xmerl_sax_parser.erl198
1 files changed, 149 insertions, 49 deletions
diff --git a/lib/xmerl/src/xmerl_sax_parser.erl b/lib/xmerl/src/xmerl_sax_parser.erl
index 318a0cf7f4..e383c4c349 100644
--- a/lib/xmerl/src/xmerl_sax_parser.erl
+++ b/lib/xmerl/src/xmerl_sax_parser.erl
@@ -1,7 +1,7 @@
%%--------------------------------------------------------------------
%% %CopyrightBegin%
%%
-%% Copyright Ericsson AB 2008-2016. All Rights Reserved.
+%% Copyright Ericsson AB 2008-2017. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
@@ -33,6 +33,7 @@
%% External exports
%%----------------------------------------------------------------------
-export([file/2,
+ stream/3,
stream/2]).
%%----------------------------------------------------------------------
@@ -63,7 +64,7 @@
%% Description: Parse file containing an XML document.
%%----------------------------------------------------------------------
file(Name,Options) ->
- case file:open(Name, [raw, read,binary]) of
+ case file:open(Name, [raw, read_ahead, read,binary]) of
{error, Reason} ->
{error,{Name, file:format_error(Reason)}};
{ok, FD} ->
@@ -72,11 +73,12 @@ file(Name,Options) ->
File = filename:basename(Name),
ContinuationFun = fun default_continuation_cb/1,
Res = stream(<<>>,
- [{continuation_fun, ContinuationFun},
- {continuation_state, FD},
- {current_location, CL},
- {entity, File}
- |Options]),
+ [{continuation_fun, ContinuationFun},
+ {continuation_state, FD},
+ {current_location, CL},
+ {entity, File}
+ |Options],
+ file),
ok = file:close(FD),
Res
end.
@@ -92,19 +94,22 @@ file(Name,Options) ->
%% EventState = term()
%% Description: Parse a stream containing an XML document.
%%----------------------------------------------------------------------
-stream(Xml, Options) when is_list(Xml), is_list(Options) ->
+stream(Xml, Options) ->
+ stream(Xml, Options, stream).
+
+stream(Xml, Options, InputType) when is_list(Xml), is_list(Options) ->
State = parse_options(Options, initial_state()),
- case State#xmerl_sax_parser_state.file_type of
+ case State#xmerl_sax_parser_state.file_type of
dtd ->
xmerl_sax_parser_list:parse_dtd(Xml,
State#xmerl_sax_parser_state{encoding = list,
- input_type = stream});
+ input_type = InputType});
normal ->
xmerl_sax_parser_list:parse(Xml,
State#xmerl_sax_parser_state{encoding = list,
- input_type = stream})
+ input_type = InputType})
end;
-stream(Xml, Options) when is_binary(Xml), is_list(Options) ->
+stream(Xml, Options, InputType) when is_binary(Xml), is_list(Options) ->
case parse_options(Options, initial_state()) of
{error, Reason} -> {error, Reason};
State ->
@@ -115,21 +120,22 @@ stream(Xml, Options) when is_binary(Xml), is_list(Options) ->
normal ->
parse
end,
- case detect_charset(Xml, State) of
- {error, Reason} -> {fatal_error,
- {
- State#xmerl_sax_parser_state.current_location,
- State#xmerl_sax_parser_state.entity,
- 1
- },
- Reason,
- [],
- State#xmerl_sax_parser_state.event_state};
- {Xml1, State1} ->
- parse_binary(Xml1,
- State1#xmerl_sax_parser_state{input_type = stream},
- ParseFunction)
- end
+ try
+ {Xml1, State1} = detect_charset(Xml, State),
+ parse_binary(Xml1,
+ State1#xmerl_sax_parser_state{input_type = InputType},
+ ParseFunction)
+ catch
+ throw:{fatal_error, {State2, Reason}} ->
+ {fatal_error,
+ {
+ State2#xmerl_sax_parser_state.current_location,
+ State2#xmerl_sax_parser_state.entity,
+ 1
+ },
+ Reason, [],
+ State2#xmerl_sax_parser_state.event_state}
+ end
end.
%%----------------------------------------------------------------------
@@ -151,8 +157,8 @@ parse_binary(Xml, #xmerl_sax_parser_state{encoding={utf16,big}}=State, F) ->
xmerl_sax_parser_utf16be:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding=latin1}=State, F) ->
xmerl_sax_parser_latin1:F(Xml, State);
-parse_binary(_, #xmerl_sax_parser_state{encoding=Enc}, _) ->
- {error, lists:flatten(io_lib:format("Charcter set ~p not supported", [Enc]))}.
+%% Fallback clause for encodings that have no dedicated parser module.
+%% The third argument is the name of the parse function (an atom), not the
+%% parser state; the state record is the second argument, and that is what
+%% ?fatal_error has to receive so the handler in stream/3 can read
+%% current_location, entity and event_state from it.
+parse_binary(_, #xmerl_sax_parser_state{encoding=Enc} = State, _F) ->
+    ?fatal_error(State, lists:flatten(io_lib:format("Charcter set ~p not supported", [Enc]))).
%%----------------------------------------------------------------------
%% Function: initial_state/0
@@ -206,8 +212,7 @@ parse_options([{entity, Entity} |Options], State) ->
parse_options([skip_external_dtd |Options], State) ->
parse_options(Options, State#xmerl_sax_parser_state{skip_external_dtd = true});
parse_options([O |_], _State) ->
- {error,
- lists:flatten(io_lib:format("Option: ~p not supported", [O]))}.
+ {error, lists:flatten(io_lib:format("Option: ~p not supported", [O]))}.
check_encoding_option(E) when E==utf8; E=={utf16,little}; E=={utf16,big};
@@ -225,16 +230,10 @@ check_encoding_option(E) ->
%% Output: {utf8|utf16le|utf16be|iso8859, Xml, State}
%% Description: Detects which character set is used in a binary stream.
%%----------------------------------------------------------------------
-detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = undefined} = _) ->
- throw({error, "Can't detect character encoding due to no indata"});
-detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = CFun,
- continuation_state = CState} = State) ->
- case CFun(CState) of
- {<<>>, _} ->
- throw({error, "Can't detect character encoding due to lack of indata"});
- {NewBytes, NewContState} ->
- detect_charset(NewBytes, State#xmerl_sax_parser_state{continuation_state = NewContState})
- end;
+detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
+ ?fatal_error(State, "Can't detect character encoding due to lack of indata");
+detect_charset(<<>>, State) ->
+ cf(<<>>, State, fun detect_charset/2);
detect_charset(Bytes, State) ->
case unicode:bom_to_encoding(Bytes) of
{latin1, 0} ->
@@ -244,25 +243,47 @@ detect_charset(Bytes, State) ->
{RealBytes, State#xmerl_sax_parser_state{encoding=Enc}}
end.
+detect_charset_1(<<16#00>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#00, 16#3C>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#00, 16#3C, 16#00>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00, 16#3F, _/binary>> = Xml, State) ->
{Xml, State#xmerl_sax_parser_state{encoding={utf16, big}}};
+detect_charset_1(<<16#3C>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#00>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#00, 16#3F>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F, 16#00, _/binary>> = Xml, State) ->
{Xml, State#xmerl_sax_parser_state{encoding={utf16, little}}};
-detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>> = Xml, State) ->
- case parse_xml_directive(Xml2) of
+detect_charset_1(<<16#3C>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#3F>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#3F, 16#78>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D>> = Xml, State) ->
+ cf(Xml, State, fun detect_charset_1/2);
+detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>>, State) ->
+ {Xml3, State1} = read_until_end_of_xml_directive(Xml2, State),
+ case parse_xml_directive(Xml3) of
{error, Reason} ->
- {error, Reason};
+ ?fatal_error(State, Reason);
AttrList ->
case lists:keysearch("encoding", 1, AttrList) of
{value, {_, E}} ->
case convert_encoding(E) of
{error, Reason} ->
- {error, Reason};
+ ?fatal_error(State, Reason);
Enc ->
- {Xml, State#xmerl_sax_parser_state{encoding=Enc}}
+ {<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml3/binary>>,
+ State1#xmerl_sax_parser_state{encoding=Enc}}
end;
_ ->
- {Xml, State}
+ {<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml3/binary>>, State1}
end
end;
detect_charset_1(Xml, State) ->
@@ -372,7 +393,7 @@ parse_value_1(<<C, Rest/binary>>, Stop, Acc) ->
parse_value_1(Rest, Stop, [C |Acc]).
%%======================================================================
-%%Default functions
+%% Default functions
%%======================================================================
%%----------------------------------------------------------------------
%% Function: default_event_cb(Event, LineNo, State) -> Result
@@ -388,7 +409,7 @@ default_event_cb(_Event, _LineNo, State) ->
%%----------------------------------------------------------------------
%% Function: default_continuation_cb(IoDevice) -> Result
%% IoDevice = iodevice()
-%% Output: Result = {[char()], State}
+%% Output: Result = {binary(), IoDevice}
%% Description: Default continuation callback reading blocks.
%%----------------------------------------------------------------------
default_continuation_cb(IoDevice) ->
@@ -398,3 +419,82 @@ default_continuation_cb(IoDevice) ->
{ok, FileBin} ->
{FileBin, IoDevice}
end.
+
+%%----------------------------------------------------------------------
+%% Function: read_until_end_of_xml_directive(Rest, State) -> Result
+%%           Rest  = binary()
+%%           State = #xmerl_sax_parser_state{}
+%% Output:  Result = {binary(), State}
+%% Description: Keeps appending chunks from the continuation function to
+%%              Rest until the accumulated bytes contain "?>", then returns
+%%              the accumulated bytes and the updated state. Lack of input
+%%              is reported by cf/2 through ?fatal_error (no continuation
+%%              function set, or an empty chunk delivered).
+%%----------------------------------------------------------------------
+read_until_end_of_xml_directive(Rest, State) ->
+    case binary:match(Rest, <<"?>">>) of
+        nomatch ->
+            %% cf/2 returns Rest with a non-empty chunk appended, so its
+            %% result is never <<>>; no separate end-of-input check is
+            %% needed here.
+            {MoreBytes, NewState} = cf(Rest, State),
+            read_until_end_of_xml_directive(MoreBytes, NewState);
+        _ ->
+            {Rest, State}
+    end.
+
+
+%%----------------------------------------------------------------------
+%% Function   : cf(Rest, State) -> Result
+%% Parameters : Rest = binary()
+%%              State = #xmerl_sax_parser_state{}
+%% Result     : {NewRest, NewState}
+%% Description: Calls the continuation fun stored in State to fetch another
+%%              chunk from the input stream. Returns Rest with the chunk
+%%              appended, together with State carrying the continuation
+%%              state handed back by the fun. ?fatal_error is invoked when
+%%              no continuation fun is set, when the fun throws or exits,
+%%              or when it delivers an empty chunk.
+%%----------------------------------------------------------------------
+cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
+    ?fatal_error(State, "Continuation function undefined");
+cf(Rest, #xmerl_sax_parser_state{continuation_fun = ContFun,
+                                 continuation_state = ContState} = State) ->
+    %% Only the call to the user supplied fun is protected; the handling of
+    %% its result below runs outside the try.
+    Chunk =
+        try ContFun(ContState)
+        catch
+            throw:Thrown ->
+                ?fatal_error(State, Thrown);
+            exit:ExitCause ->
+                ?fatal_error(State, {'EXIT', ExitCause})
+        end,
+    case Chunk of
+        {<<>>, _} ->
+            ?fatal_error(State, "Can't detect character encoding due to lack of indata");
+        {Bytes, NextContState} ->
+            NewState = State#xmerl_sax_parser_state{continuation_state = NextContState},
+            {<<Rest/binary, Bytes/binary>>, NewState}
+    end.
+
+%%----------------------------------------------------------------------
+%% Function   : cf(Rest, State, NextCall) -> Result
+%% Parameters : Rest = binary()
+%%              State = #xmerl_sax_parser_state{}
+%%              NextCall = fun((binary(), #xmerl_sax_parser_state{}) -> Result)
+%% Result     : the value returned by NextCall
+%% Description: Reads another chunk from the input stream through cf/2 and
+%%              then calls NextCall with the extended binary and the
+%%              updated state. Error reporting (missing continuation fun,
+%%              throw/exit from the fun, empty chunk) is done by cf/2, so
+%%              the two arities cannot drift apart.
+%%----------------------------------------------------------------------
+cf(Rest, State, NextCall) ->
+    {NewRest, NewState} = cf(Rest, State),
+    NextCall(NewRest, NewState).