From a9812e6307fe335d077f96d3a6342cbd4894ed0b Mon Sep 17 00:00:00 2001 From: Lukas Larsson Date: Thu, 24 Aug 2017 16:20:36 +0200 Subject: Add support for building a pgo beam_emu --- erts/Makefile | 10 ++-- erts/aclocal.m4 | 15 ++++++ erts/configure.in | 88 +++++++++++++++++++++++++++++++ erts/emulator/Makefile.in | 102 ++++++++++++++++++++++++++++-------- erts/emulator/test/estone_SUITE.erl | 31 ++++++++--- 5 files changed, 212 insertions(+), 34 deletions(-) (limited to 'erts') diff --git a/erts/Makefile b/erts/Makefile index 73d8560c1e..0393ccc759 100644 --- a/erts/Makefile +++ b/erts/Makefile @@ -49,11 +49,6 @@ debug opt lcnt clean: done (cd preloaded/src && $(MAKE) ../ebin/erts.app) -# ---------------------------------------------------------------------- -# These are "convenience targets", provided as shortcuts for developers -# - don't use them in scripts or assume they will always stay like this! -# - .PHONY: smp smp: $(V_at)for type in $(TYPES); do \ @@ -112,6 +107,11 @@ local_setup: $(ERL_TOP)/bin/start_clean.script \ $(ERL_TOP)/bin/no_dot_erlang.script +# ---------------------------------------------------------------------- +# These are "convenience targets", provided as shortcuts for developers +# - don't use them in scripts or assume they will always stay like this! 
+# + # Run the configure script .PHONY: configure configure: diff --git a/erts/aclocal.m4 b/erts/aclocal.m4 index 80bf236188..887babc13f 100644 --- a/erts/aclocal.m4 +++ b/erts/aclocal.m4 @@ -2726,6 +2726,21 @@ AC_DEFUN([LM_TRY_ENABLE_CFLAG], [ fi ]) +AC_DEFUN([LM_CHECK_ENABLE_CFLAG], [ dnl LM_CHECK_ENABLE_CFLAG(FLAGS, VAR): sets VAR to true when the compiler accepts FLAGS and to false otherwise; CFLAGS is restored afterwards + AC_MSG_CHECKING([whether $CC accepts $1]) # AC_MSG_CHECKING appends the trailing dots itself + saved_CFLAGS=$CFLAGS; + CFLAGS="$1 $CFLAGS"; + AC_TRY_COMPILE([],[return 0;],can_enable_flag=true,can_enable_flag=false) + CFLAGS=$saved_CFLAGS; + if test "X$can_enable_flag" = "Xtrue"; then + AS_VAR_SET($2, true) + AC_MSG_RESULT([yes]) + else + AS_VAR_SET($2, false) + AC_MSG_RESULT([no]) + fi +]) + dnl ERL_TRY_LINK_JAVA(CLASSES, FUNCTION-BODY dnl [ACTION_IF_FOUND [, ACTION-IF-NOT-FOUND]]) dnl Freely inspired by AC_TRY_LINK. (Maybe better to create a diff --git a/erts/configure.in b/erts/configure.in index b765a8ffe4..9948e71b2d 100644 --- a/erts/configure.in +++ b/erts/configure.in @@ -554,6 +554,94 @@ AC_SUBST(WFLAGS) AC_SUBST(WERRORFLAGS) AC_SUBST(CFLAG_RUNTIME_LIBRARY_PATH) +## Check if we can do profile guided optimization of beam_emu +LM_CHECK_ENABLE_CFLAG([-fprofile-generate -Werror],[PROFILE_GENERATE]) +LM_CHECK_ENABLE_CFLAG([-fprofile-use -Werror],[PROFILE_USE]) + +## Check if this is clang +LM_CHECK_ENABLE_CFLAG([-fprofile-instr-generate -Werror],[PROFILE_INSTR_GENERATE]) +if test "X$PROFILE_INSTR_GENERATE" = "Xtrue"; then + # It was clang, now we also have to check if we have llvm-profdata and that + # we can link programs with -fprofile-instr-use + saved_CFLAGS=$CFLAGS; + CFLAGS="-fprofile-instr-generate -Werror $saved_CFLAGS" + AC_RUN_IFELSE([AC_LANG_PROGRAM([],[])], + [AC_CHECK_PROGS([LLVM_PROFDATA], [llvm-profdata]) + AC_CHECK_PROGS([XCRUN], [xcrun]) + if test "X$XCRUN" != "X" -a "X$LLVM_PROFDATA" = "X"; then + AC_MSG_CHECKING([for $XCRUN llvm-profdata]) + if $XCRUN llvm-profdata --help 2>& AS_MESSAGE_LOG_FD >& AS_MESSAGE_LOG_FD; then + LLVM_PROFDATA="$XCRUN llvm-profdata" + AC_MSG_RESULT([yes]) + else
AC_MSG_RESULT([no]) + fi + fi + AC_SUBST(LLVM_PROFDATA) + if test "X$LLVM_PROFDATA" != "X"; then + CFLAGS="-fprofile-instr-use=default.profdata -Werror $saved_CFLAGS"; + $LLVM_PROFDATA merge -output=default.profdata *.profraw; + AC_MSG_CHECKING([whether $CC accepts -fprofile-instr-use=default.profdata -Werror]) + AC_COMPILE_IFELSE([], + [AC_MSG_RESULT([yes]) + PROFILE_INSTR_USE=true], + [AC_MSG_RESULT([no]) + PROFILE_INSTR_USE=false]) + rm -f default.profdata + fi], + [], [AC_MSG_NOTICE([skipping -fprofile-instr-use check when cross-compiling])]) + rm -f *.profraw + CFLAGS=$saved_CFLAGS; +fi + +AC_ARG_ENABLE(pgo, +AS_HELP_STRING([--enable-pgo], + [build erts using PGO (profile guided optimization)]), +[ case "$enableval" in + no) enable_pgo=no ;; + *) enable_pgo=yes ;; + esac +],enable_pgo=default) + +USE_PGO=false +AC_MSG_CHECKING([whether to do PGO of erts]) +if test $enable_pgo = no; then + AC_MSG_RESULT([no, disabled by user]) +elif test $CROSS_COMPILING = yes; then + if test "X$enable_pgo" = "Xyes"; then # hard error only when --enable-pgo was given explicitly + AC_MSG_ERROR(cannot use PGO when cross-compiling) + else + AC_MSG_RESULT([no, cross compiling]) + fi +elif test "X$host" = "Xwin32"; then + AC_MSG_RESULT([no, not supported in windows]) +elif test "X$PROFILE_GENERATE" = "Xtrue" -a "X$PROFILE_USE" = "Xtrue"; then + USE_PGO=true + AC_MSG_RESULT([yes, using -fprofile-generate]) + PROFILE_COMPILER=gcc +# check if $CC accepts -fprofile-correction, if so we can use PGO on multi-threaded files.
+ LM_CHECK_ENABLE_CFLAG([-fprofile-use -fprofile-correction -Werror],[PROFILE_CORRECTION]) + if test "X$PROFILE_CORRECTION" = "Xtrue"; then + PROFILE_CORRECTION="-fprofile-correction" + else + PROFILE_CORRECTION="" + fi + AC_SUBST(PROFILE_CORRECTION) +elif test "X$PROFILE_INSTR_GENERATE" = "Xtrue" -a "X$PROFILE_INSTR_USE" = "Xtrue"; then + USE_PGO=true + AC_MSG_RESULT([yes, using -fprofile-instr-generate]) + PROFILE_COMPILER=clang +else + if test "X$enable_pgo" = "Xyes"; then # hard error only when --enable-pgo was given explicitly + AC_MSG_ERROR(cannot use PGO with this compiler) + else + AC_MSG_RESULT([no]) + fi +fi + +AC_SUBST(USE_PGO) +AC_SUBST(PROFILE_COMPILER) + AC_CHECK_SIZEOF(void *) # Needed for ARCH and smp checks below if test "x$ac_cv_sizeof_void_p" = x8; then AC_SUBST(EXTERNAL_WORD_SIZE, 64) diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in index bc7eb72221..2b203c9ee6 100644 --- a/erts/emulator/Makefile.in +++ b/erts/emulator/Makefile.in @@ -63,6 +63,28 @@ ARFLAGS=rc OMIT_OMIT_FP=no TYPE_LIBS= +PROFILE_COMPILER=@PROFILE_COMPILER@ +PROFILE_MARKER= +ifeq ($(PROFILE),generate) # objects of the instrumented build get the _pg suffix +PROFILE_MARKER=_pg +else +ifeq ($(PROFILE),use) # objects built from collected profile data get the _pu suffix +PROFILE_MARKER=_pu +endif +endif + +ifeq ($(PROFILE_COMPILER), gcc) +PROFILE_CORRECTION=@PROFILE_CORRECTION@ +PROFILE_GENERATE=-fprofile-generate +PROFILE_USE=-fprofile-use $(PROFILE_CORRECTION) +PROFILE_USE_DEPS=$(OBJDIR)/%_pu.gcda +endif +ifeq ($(PROFILE_COMPILER), clang) +PROFILE_GENERATE=-fprofile-instr-generate +PROFILE_USE=-fprofile-instr-use=$(OBJDIR)/default.profdata +PROFILE_USE_DEPS=$(OBJDIR)/default.profdata +endif + DIRTY_SCHEDULER_SUPPORT=@DIRTY_SCHEDULER_SUPPORT@ DIRTY_SCHEDULER_TEST=@DIRTY_SCHEDULER_TEST@ @@ -418,9 +440,20 @@ ifeq ($(TARGET), win32) EMULATOR_EXECUTABLE = beam$(TF_MARKER).dll else EMULATOR_EXECUTABLE = beam$(TF_MARKER) +PROFILE_EXECUTABLE = beam.prof$(TF_MARKER) endif CS_EXECUTABLE = erl_child_setup$(TYPEMARKER) +ifeq ($(PROFILE), generate) +EMULATOR_EXECUTABLE = $(PROFILE_EXECUTABLE) +ifeq ($(PROFILE_COMPILER), gcc) +PROFILE_LDFLAGS =
-fprofile-generate +endif +ifeq ($(PROFILE_COMPILER), clang) +PROFILE_LDFLAGS = -fprofile-instr-generate +endif +endif + # ---------------------------------------------------------------------- ifeq ($(ERLANG_OSTYPE), unix) @@ -687,16 +720,33 @@ $(OBJDIR)/beams.$(RES_EXT): $(TARGET)/beams.rc endif -ifneq ($(filter tile-%,$(TARGET)),) -$(OBJDIR)/beam_emu.o: beam/beam_emu.c - $(V_CC) $(subst -O2, $(GEN_OPT_FLGS), $(CFLAGS)) \ - $(INCLUDES) -c $< -o $@ -else # Usually the same as the default rule, but certain platforms (e.g. win32) mix # different compilers $(OBJDIR)/beam_emu.o: beam/beam_emu.c $(V_EMU_CC) $(subst -O2, $(GEN_OPT_FLGS), $(CFLAGS)) $(INCLUDES) -c $< -o $@ -endif + +$(OBJDIR)/%_pg.o: beam/%.c # object compiled with $(PROFILE_GENERATE) added to the normal flags + $(V_CC) $(PROFILE_GENERATE) $(subst -O2, $(GEN_OPT_FLGS), $(CFLAGS)) $(INCLUDES) -c $< -o $@ +$(OBJDIR)/%_pu.o: beam/%.c $(PROFILE_USE_DEPS) # object compiled with $(PROFILE_USE); depends on the profile data named by $(PROFILE_USE_DEPS) + $(V_CC) $(PROFILE_USE) $(subst -O2, $(GEN_OPT_FLGS), $(CFLAGS)) $(INCLUDES) -c $< -o $@ + +$(OBJDIR)/PROFILE: $(BINDIR)/$(PROFILE_EXECUTABLE) # stamp file touched after estone_SUITE was compiled with -DPGO and estone_SUITE:pgo was run + $(V_at)echo " PROFILE ${PROFILE_EXECUTABLE}" + $(V_at)rm -f $(OBJDIR)/erl*.profraw + $(V_at)set -e; LLVM_PROFILE_FILE="$(OBJDIR)/erlc-%m.profraw" \ + ERL_FLAGS="-emu_type prof${TYPEMARKER} +S 1" $(ERLC) -DPGO \ + -o $(OBJDIR) test/estone_SUITE.erl > $(OBJDIR)/PROFILE_LOG + $(V_at)set -e; LLVM_PROFILE_FILE="$(OBJDIR)/erl-%m.profraw" \ + ERL_FLAGS="-emu_type prof${TYPEMARKER} +S 1" $(ERL) -pa $(OBJDIR) \ + -noshell -s estone_SUITE pgo -s init stop >> $(OBJDIR)/PROFILE_LOG + $(V_at)touch $@ + +$(OBJDIR)/%_pu.gcda: $(OBJDIR)/PROFILE # renames the _pg profile data file to the _pu name once the profile run is done + $(V_at)mv $(OBJDIR)/$*_pg.gcda $@ + $(V_at)touch $@ + +$(OBJDIR)/default.profdata: $(OBJDIR)/PROFILE # merges every .profraw file in $(OBJDIR) into one profile; NOTE(review): V_LLVM_PROFDATA is not defined in this patch - confirm it comes from elsewhere + $(V_LLVM_PROFDATA) merge -output $@ $(OBJDIR)/*.profraw $(OBJDIR)/%.o: beam/%.c $(V_CC) $(subst -O2, $(GEN_OPT_FLGS), $(CFLAGS)) $(INCLUDES) -c $< -o $@ @@ -758,15 +808,23 @@ $(ERL_TOP)/lib/%.beam: INIT_OBJS = $(OBJDIR)/erl_main.o $(PRELOAD_OBJ) +# -fprofile-correction is needed in order to use PGO on erl_process +# as multiple threads execute in that
file. +ifeq ($(PROFILE_CORRECTION),) # no -fprofile-correction: only beam_emu is profiled and erl_process stays a regular RUN_OBJS member +PROFILE_OBJS = $(OBJDIR)/beam_emu.o +RUN_OBJS = $(OBJDIR)/erl_process.o +else # -fprofile-correction available: erl_process is profiled together with beam_emu +PROFILE_OBJS = $(OBJDIR)/beam_emu.o $(OBJDIR)/erl_process.o +endif + EMU_OBJS = \ - $(OBJDIR)/beam_emu.o $(OBJDIR)/beam_opcodes.o \ + $(OBJDIR)/beam_opcodes.o \ $(OBJDIR)/beam_load.o $(OBJDIR)/beam_bif_load.o \ $(OBJDIR)/beam_debug.o $(OBJDIR)/beam_bp.o \ - $(OBJDIR)/beam_catches.o \ - $(OBJDIR)/code_ix.o \ + $(OBJDIR)/beam_catches.o $(OBJDIR)/code_ix.o \ $(OBJDIR)/beam_ranges.o -RUN_OBJS = \ +RUN_OBJS += \ $(OBJDIR)/erl_alloc.o $(OBJDIR)/erl_mtrace.o \ $(OBJDIR)/erl_alloc_util.o $(OBJDIR)/erl_goodfit_alloc.o \ $(OBJDIR)/erl_bestfit_alloc.o $(OBJDIR)/erl_afit_alloc.o \ @@ -782,7 +840,7 @@ RUN_OBJS = \ $(OBJDIR)/utils.o $(OBJDIR)/bif.o \ $(OBJDIR)/io.o $(OBJDIR)/erl_printf_term.o\ $(OBJDIR)/erl_debug.o $(OBJDIR)/erl_md5.o \ - $(OBJDIR)/erl_message.o $(OBJDIR)/erl_process.o \ + $(OBJDIR)/erl_message.o \ $(OBJDIR)/erl_process_dict.o $(OBJDIR)/erl_process_lock.o \ $(OBJDIR)/erl_port_task.o $(OBJDIR)/erl_arith.o \ $(OBJDIR)/time.o $(OBJDIR)/erl_time_sup.o \ @@ -923,21 +981,23 @@ ifdef HIPE_ENABLED EXTRA_BASE_OBJS += $(HIPE_OBJS) endif -BASE_OBJS = $(EMU_OBJS) $(RUN_OBJS) $(OS_OBJS) $(EXTRA_BASE_OBJS) $(LTTNG_OBJS) +BASE_OBJS = $(EMU_OBJS) $(RUN_OBJS) $(OS_OBJS) $(EXTRA_BASE_OBJS) \ + $(LTTNG_OBJS) $(DRV_OBJS) $(NIF_OBJS) -before_DTrace_OBJS = $(BASE_OBJS) $(DRV_OBJS) $(NIF_OBJS) +PROF_OBJS = $(patsubst %.o,%$(PROFILE_MARKER).o,$(PROFILE_OBJS)) $(BASE_OBJS) + +OBJS = $(PROF_OBJS) -DTRACE_OBJS = ifdef DTRACE_ENABLED_2STEP -DTRACE_OBJS = $(OBJDIR)/erlang_dtrace.o -$(OBJDIR)/erlang_dtrace.o: $(before_DTrace_OBJS) $(TARGET)/erlang_dtrace.h +# The $(PROFILE_MARKER) is placed in the object file name in order to +# make sure we re-compile with the new object files for the profiled emulator +OBJS += $(OBJDIR)/erlang$(PROFILE_MARKER)_dtrace.o +$(OBJDIR)/erlang$(PROFILE_MARKER)_dtrace.o: $(PROF_OBJS) $(TARGET)/erlang_dtrace.h dtrace -G -C -Ibeam \ -s
beam/erlang_dtrace.d \ - -o $@ $(before_DTrace_OBJS) + -o $@ $(PROF_OBJS) endif -OBJS = $(before_DTrace_OBJS) $(DTRACE_OBJS) - $(INIT_OBJS): $(TTF_DIR)/GENERATED $(OBJS): $(TTF_DIR)/GENERATED @@ -1029,8 +1089,8 @@ $(BINDIR)/$(EMULATOR_EXECUTABLE): $(INIT_OBJS) $(OBJS) $(DEPLIBS) else $(BINDIR)/$(EMULATOR_EXECUTABLE): $(INIT_OBJS) $(OBJS) $(DEPLIBS) - $(ld_verbose)$(PURIFY) $(LD) -o $(BINDIR)/$(EMULATOR_EXECUTABLE) \ - $(HIPEBEAMLDFLAGS) $(LDFLAGS) $(DEXPORT) $(INIT_OBJS) $(OBJS) \ + $(ld_verbose)$(PURIFY) $(LD) -o $@ \ + $(HIPEBEAMLDFLAGS) $(PROFILE_LDFLAGS) $(LDFLAGS) $(DEXPORT) $(INIT_OBJS) $(OBJS) \ $(STATIC_NIF_LIBS) $(STATIC_DRIVER_LIBS) $(LIBS) endif diff --git a/erts/emulator/test/estone_SUITE.erl b/erts/emulator/test/estone_SUITE.erl index 8b336b366d..c4899967ca 100644 --- a/erts/emulator/test/estone_SUITE.erl +++ b/erts/emulator/test/estone_SUITE.erl @@ -20,7 +20,7 @@ -module(estone_SUITE). %% Test functions -export([all/0, suite/0, groups/0, - estone/1, estone_bench/1]). + estone/1, estone_bench/1, pgo/0]). %% Internal exports for EStone tests -export([lists/1, @@ -44,9 +44,9 @@ links/1,lproc/1, run_micro/3,p1/1,ppp/3,macro/2,micros/0]). - --include_lib("common_test/include/ct.hrl"). +-ifndef(PGO). -include_lib("common_test/include/ct_event.hrl"). +-endif. %% EStone defines -define(TOTAL, (3000 * 1000 * 100)). %% 300 secs @@ -85,13 +85,28 @@ estone(Config) when is_list(Config) -> estone_bench(Config) -> DataDir = proplists:get_value(data_dir,Config), L = ?MODULE:macro(?MODULE:micros(),DataDir), - [ct_event:notify( - #event{name = benchmark_data, - data = [{name,proplists:get_value(title,Mark)}, - {value,proplists:get_value(estones,Mark)}]}) - || Mark <- L], + {Total, Stones} = sum_micros(L, 0, 0), + notify([[{title,"ESTONES"}, {estones, Stones}] | L]), L. +-ifndef(PGO). 
+notify(Marks) -> %% sends one ct_event benchmark_data event per mark, carrying its title and estones values + [ct_event:notify( + #event{name = benchmark_data, + data = [{name,proplists:get_value(title, Mark)}, + {value,proplists:get_value(estones, Mark)}]}) + || Mark <- Marks]. +-else. +notify(_) -> %% PGO build: ct_event.hrl is not included, so reporting is a no-op + ok. +-endif. + +%% The benchmarks to run in order to guide PGO (profile guided optimisation) +pgo() -> + %% We run all benchmarks except the port_io as we don't want to + %% have to build a custom port. + Micros = ?MODULE:micros() -- [micro(port_io)], + ?MODULE:macro(Micros,[]). %% %% Calculate CPU speed -- cgit v1.2.3