diff options
author | John Högberg <[email protected]> | 2017-11-03 11:49:27 +0100 |
---|---|---|
committer | John Högberg <[email protected]> | 2017-11-30 15:44:33 +0100 |
commit | ebbd26eeea4115c946d1254d94acd50f150b4455 (patch) | |
tree | ab2a8ba1023aa0684695dc36ff5567ae9e3f59c4 /lib/kernel/src/raw_file_io_delayed.erl | |
parent | 6cb62e44eba1db8d1917ebb0db84298e91582c4e (diff) | |
download | otp-ebbd26eeea4115c946d1254d94acd50f150b4455.tar.gz otp-ebbd26eeea4115c946d1254d94acd50f150b4455.tar.bz2 otp-ebbd26eeea4115c946d1254d94acd50f150b4455.zip |
Reimplement efile_drv as a dirty NIF
This improves the latency of file operations as dirty schedulers
are a bit more eager to run jobs than async threads, and use a
single global queue rather than per-thread queues, eliminating the
risk of a job stalling behind a long-running job on the same thread
while other async threads sit idle.
There's no such thing as a free lunch though; the lowered latency
comes at the cost of increased busy-waiting which may have an
adverse effect on some applications. This behavior can be tweaked
with the +sbwt flag, but unfortunately it affects all types of
schedulers and not just dirty ones. We plan to add type-specific
flags at a later stage.
sendfile has been moved to inet_drv to lessen the effect of a nasty
race; the cooperation between inet_drv and efile has never been
airtight and the socket dying at the wrong time (Regardless of
reason) could result in fd aliasing. Moving it to the inet driver
makes it impossible to trigger this by closing the socket in the
middle of a sendfile operation, while still allowing it to be
aborted -- something that can't be done if it stays in the file
driver.
The race still occurs if the controlling process dies in the short
window between dispatching the sendfile operation and the dup(2)
call in the driver, but it's much less likely to happen now.
A proper fix is in the works.
--
Notable functional differences:
* The use_threads option for file:sendfile/5 no longer has any
effect.
* The file-specific DTrace probes have been removed. The same
effect can be achieved with normal tracing together with the
nif__entry/nif__return probes to track scheduling.
--
OTP-14256
Diffstat (limited to 'lib/kernel/src/raw_file_io_delayed.erl')
-rw-r--r-- | lib/kernel/src/raw_file_io_delayed.erl | 320 |
1 files changed, 320 insertions, 0 deletions
diff --git a/lib/kernel/src/raw_file_io_delayed.erl b/lib/kernel/src/raw_file_io_delayed.erl new file mode 100644 index 0000000000..d2ad7550a1 --- /dev/null +++ b/lib/kernel/src/raw_file_io_delayed.erl @@ -0,0 +1,320 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2017. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +-module(raw_file_io_delayed). + +-behavior(gen_statem). + +-export([close/1, sync/1, datasync/1, truncate/1, advise/4, allocate/3, + position/2, write/2, pwrite/2, pwrite/3, + read_line/1, read/2, pread/2, pread/3]). + +%% OTP internal. +-export([ipread_s32bu_p32bu/3, sendfile/8]). + +-export([open_layer/3]). + +-export([init/1, callback_mode/0, terminate/3]). +-export([opening/3, opened/3]). + +-include("file_int.hrl"). + +open_layer(Filename, Modes, Options) -> + Secret = make_ref(), + case gen_statem:start(?MODULE, {self(), Secret, Options}, []) of + {ok, Pid} -> + gen_statem:call(Pid, {'$open', Secret, Filename, Modes}, infinity); + Other -> + Other + end. + +callback_mode() -> state_functions. + +init({Owner, Secret, Options}) -> + Monitor = monitor(process, Owner), + Defaults = + #{ owner => Owner, + monitor => Monitor, + secret => Secret, + timer => none, + pid => self(), + buffer => prim_buffer:new(), + delay_size => 64 bsl 10, + delay_time => 2000 }, + Data = fill_delay_values(Defaults, Options), + {ok, opening, Data}. + +fill_delay_values(Data, []) -> + Data; +fill_delay_values(Data, [{delayed_write, Size, Time} | Options]) -> + fill_delay_values(Data#{ delay_size => Size, delay_time => Time }, Options); +fill_delay_values(Data, [_ | Options]) -> + fill_delay_values(Data, Options). + +opening({call, From}, {'$open', Secret, Filename, Modes}, #{ secret := Secret } = Data) -> + case raw_file_io:open(Filename, Modes) of + {ok, PrivateFd} -> + PublicData = maps:with([owner, buffer, delay_size, pid], Data), + PublicFd = #file_descriptor{ module = ?MODULE, data = PublicData }, + + NewData = Data#{ handle => PrivateFd }, + Response = {ok, PublicFd}, + {next_state, opened, NewData, [{reply, From, Response}]}; + Other -> + {stop_and_reply, normal, [{reply, From, Other}]} + end; +opening(_Event, _Contents, _Data) -> + {keep_state_and_data, [postpone]}. + +%% + +opened(info, {'$timed_out', Secret}, #{ secret := Secret } = Data) -> + %% If the user writes something at this exact moment, the flush will fail + %% and the timer won't reset on the next write since the buffer won't be + %% empty (Unless we collided on a flush). We therefore reset the timeout to + %% ensure that data won't sit idle for extended periods of time. + case try_flush_write_buffer(Data) of + busy -> gen_statem:cast(self(), '$reset_timeout'); + ok -> ok + end, + {keep_state, Data#{ timer => none }, []}; + +opened(info, {'DOWN', Monitor, process, _Owner, Reason}, #{ monitor := Monitor } = Data) -> + if + Reason =/= kill -> try_flush_write_buffer(Data); + Reason =:= kill -> ignored + end, + {stop, shutdown}; + +opened(info, _Message, _Data) -> + keep_state_and_data; + +opened({call, {Owner, _Tag} = From}, [close], #{ owner := Owner } = Data) -> + case flush_write_buffer(Data) of + ok -> + #{ handle := PrivateFd } = Data, + Response = ?CALL_FD(PrivateFd, close, []), + {stop_and_reply, normal, [{reply, From, Response}]}; + Other -> + {stop_and_reply, normal, [{reply, From, Other}]} + end; + +opened({call, {Owner, _Tag} = From}, '$wait', #{ owner := Owner }) -> + %% Used in write/2 to synchronize writes on lock conflicts. + {keep_state_and_data, [{reply, From, ok}]}; + +opened({call, {Owner, _Tag} = From}, '$synchronous_flush', #{ owner := Owner } = Data) -> + cancel_flush_timeout(Data), + Response = flush_write_buffer(Data), + {keep_state_and_data, [{reply, From, Response}]}; + +opened({call, {Owner, _Tag} = From}, Command, #{ owner := Owner } = Data) -> + Response = + case flush_write_buffer(Data) of + ok -> dispatch_command(Data, Command); + Other -> Other + end, + {keep_state_and_data, [{reply, From, Response}]}; + +opened({call, _From}, _Command, _Data) -> + %% The client functions filter this out, so we'll crash if the user does + %% anything stupid on purpose. + {shutdown, protocol_violation}; + +opened(cast, '$reset_timeout', #{ delay_time := Timeout, secret := Secret } = Data) -> + cancel_flush_timeout(Data), + Timer = erlang:send_after(Timeout, self(), {'$timed_out', Secret}), + {keep_state, Data#{ timer => Timer }, []}; + +opened(cast, _Message, _Data) -> + {keep_state_and_data, []}. + +dispatch_command(Data, [Function | Args]) -> + #{ handle := Handle } = Data, + Module = Handle#file_descriptor.module, + apply(Module, Function, [Handle | Args]). + +cancel_flush_timeout(#{ timer := none }) -> + ok; +cancel_flush_timeout(#{ timer := Timer }) -> + _ = erlang:cancel_timer(Timer, [{async, true}]), + ok. + +try_flush_write_buffer(#{ buffer := Buffer, handle := PrivateFd }) -> + case prim_buffer:try_lock(Buffer) of + acquired -> + flush_write_buffer_1(Buffer, PrivateFd), + prim_buffer:unlock(Buffer), + ok; + busy -> + busy + end. + +%% This is only safe to use when there is no chance of conflict with the owner +%% process, or in other words, "during synchronous calls outside of the locked +%% section of write/2" +flush_write_buffer(#{ buffer := Buffer, handle := PrivateFd }) -> + acquired = prim_buffer:try_lock(Buffer), + Result = flush_write_buffer_1(Buffer, PrivateFd), + prim_buffer:unlock(Buffer), + Result. + +flush_write_buffer_1(Buffer, PrivateFd) -> + case prim_buffer:size(Buffer) of + Size when Size > 0 -> + ?CALL_FD(PrivateFd, write, [prim_buffer:read_iovec(Buffer, Size)]); + 0 -> + ok + end. + +terminate(_Reason, _State, _Data) -> + ok. + +%% Client functions + +write(Fd, IOData) -> + try + enqueue_write(Fd, erlang:iolist_to_iovec(IOData)) + catch + error:badarg -> {error, badarg} + end. +enqueue_write(_Fd, []) -> + ok; +enqueue_write(Fd, IOVec) -> + %% get_fd_data will reject everyone except the process that opened the Fd, + %% so we can't race with anyone except the wrapper process. + #{ delay_size := DelaySize, + buffer := Buffer, + pid := Pid } = get_fd_data(Fd), + case prim_buffer:try_lock(Buffer) of + acquired -> + %% (The wrapper process will exit without flushing if we're killed + %% while holding the lock). + enqueue_write_locked(Pid, Buffer, DelaySize, IOVec); + busy -> + %% This can only happen while we're processing a timeout in the + %% wrapper process, so we perform a bogus call to get a completion + %% notification before trying again. + gen_statem:call(Pid, '$wait'), + enqueue_write(Fd, IOVec) + end. +enqueue_write_locked(Pid, Buffer, DelaySize, IOVec) -> + %% The synchronous operations (write, forced flush) are safe since we're + %% running on the only process that can fill the buffer; a timeout being + %% processed just before $synchronous_flush will cause the flush to nop, + %% and a timeout sneaking in just before a synchronous write won't do + %% anything since the buffer is guaranteed to be empty at that point. + BufSize = prim_buffer:size(Buffer), + case is_iovec_smaller_than(IOVec, DelaySize - BufSize) of + true when BufSize > 0 -> + prim_buffer:write(Buffer, IOVec), + prim_buffer:unlock(Buffer); + true -> + prim_buffer:write(Buffer, IOVec), + prim_buffer:unlock(Buffer), + gen_statem:cast(Pid, '$reset_timeout'); + false when BufSize > 0 -> + prim_buffer:write(Buffer, IOVec), + prim_buffer:unlock(Buffer), + gen_statem:call(Pid, '$synchronous_flush'); + false -> + prim_buffer:unlock(Buffer), + gen_statem:call(Pid, [write, IOVec]) + end. + +%% iolist_size/1 will always look through the entire list to get a precise +%% amount, which is pretty inefficient since we only need to know whether we've +%% hit the buffer threshold or not. +%% +%% We only handle the binary case since write/2 forcibly translates input to +%% erlang:iovec(). +is_iovec_smaller_than(IOVec, Max) -> + is_iovec_smaller_than_1(IOVec, Max, 0). +is_iovec_smaller_than_1(_IOVec, Max, Acc) when Acc >= Max -> + false; +is_iovec_smaller_than_1([], _Max, _Acc) -> + true; +is_iovec_smaller_than_1([Binary | Rest], Max, Acc) when is_binary(Binary) -> + is_iovec_smaller_than_1(Rest, Max, Acc + byte_size(Binary)). + +close(Fd) -> + wrap_call(Fd, [close]). + +sync(Fd) -> + wrap_call(Fd, [sync]). +datasync(Fd) -> + wrap_call(Fd, [datasync]). + +truncate(Fd) -> + wrap_call(Fd, [truncate]). + +advise(Fd, Offset, Length, Advise) -> + wrap_call(Fd, [advise, Offset, Length, Advise]). +allocate(Fd, Offset, Length) -> + wrap_call(Fd, [allocate, Offset, Length]). + +position(Fd, Mark) -> + wrap_call(Fd, [position, Mark]). + +pwrite(Fd, Offset, IOData) -> + try + CompactedData = erlang:iolist_to_iovec(IOData), + wrap_call(Fd, [pwrite, Offset, CompactedData]) + catch + error:badarg -> {error, badarg} + end. +pwrite(Fd, LocBytes) -> + try + CompactedLocBytes = + [ {Offset, erlang:iolist_to_iovec(IOData)} || + {Offset, IOData} <- LocBytes ], + wrap_call(Fd, [pwrite, CompactedLocBytes]) + catch + error:badarg -> {error, badarg} + end. + +read_line(Fd) -> + wrap_call(Fd, [read_line]). +read(Fd, Size) -> + wrap_call(Fd, [read, Size]). +pread(Fd, Offset, Size) -> + wrap_call(Fd, [pread, Offset, Size]). +pread(Fd, LocNums) -> + wrap_call(Fd, [pread, LocNums]). + +ipread_s32bu_p32bu(Fd, Offset, MaxSize) -> + wrap_call(Fd, [ipread_s32bu_p32bu, Offset, MaxSize]). + +sendfile(_,_,_,_,_,_,_,_) -> + {error, enotsup}. + +wrap_call(Fd, Command) -> + #{ pid := Pid } = get_fd_data(Fd), + try gen_statem:call(Pid, Command, infinity) of + Result -> Result + catch + exit:{noproc, _StackTrace} -> {error, einval} + end. + +get_fd_data(#file_descriptor{ data = Data }) -> + #{ owner := Owner } = Data, + case self() of + Owner -> Data; + _ -> error(not_on_controlling_process) + end. |