diff --git a/cli/asc.js b/cli/asc.js index a94d42a69c..7c2baf3ae1 100644 --- a/cli/asc.js +++ b/cli/asc.js @@ -635,26 +635,22 @@ exports.main = function main(argv, options, callback) { if (args.trapMode === "clamp") { stats.optimizeCount++; stats.optimizeTime += measure(() => { - module.runPasses([ "trap-mode-clamp" ]); + module.runPass("trap-mode-clamp"); }); } else if (args.trapMode === "js") { stats.optimizeCount++; stats.optimizeTime += measure(() => { - module.runPasses([ "trap-mode-js" ]); + module.runPass("trap-mode-js"); }); } else if (args.trapMode !== "allow") { module.dispose(); return callback(Error("Unsupported trap mode")); } - // Implicitly run costly non-LLVM optimizations on -O3 or -Oz - // see: https://github.com/WebAssembly/binaryen/pull/1596 - if (optimizeLevel >= 3 || shrinkLevel >= 2) optimizeLevel = 4; - - module.setOptimizeLevel(optimizeLevel); - module.setShrinkLevel(shrinkLevel); - module.setDebugInfo(args.debug); - + // Optimize the module + const debugInfo = args.debug; + const usesARC = args.runtime == "half" || args.runtime == "full"; + const converge = args.converge; const runPasses = []; if (args.runPasses) { if (typeof args.runPasses === "string") { @@ -668,213 +664,16 @@ exports.main = function main(argv, options, callback) { } } - function doOptimize() { - const hasARC = args.runtime == "half" || args.runtime == "full"; - const passes = []; - function add(pass) { passes.push(pass); } - - if (optimizeLevel >= 2 && shrinkLevel === 0) { - // tweak inlining options when speed more preferable than size - module.setAlwaysInlineMaxSize(12); - module.setFlexibleInlineMaxSize(70); - module.setOneCallerInlineMaxSize(200); - } else { - // tweak inlining options when size matters - optimizeLevel === 0 && shrinkLevel >= 0 - ? module.setAlwaysInlineMaxSize(2) - : module.setAlwaysInlineMaxSize(4); // default: 2 - module.setFlexibleInlineMaxSize(65); // default: 20 - module.setOneCallerInlineMaxSize(80); // default: 15 - } - - // Optimize the module if requested - if (optimizeLevel > 0 || shrinkLevel > 0) { - // Binaryen's default passes with Post-AssemblyScript passes added. - // see: Binaryen/src/pass.cpp - - // PassRunner::addDefaultGlobalOptimizationPrePasses - add("duplicate-function-elimination"); - add("remove-unused-module-elements"); // differs - - // PassRunner::addDefaultFunctionOptimizationPasses - if (optimizeLevel >= 3 || shrinkLevel >= 1) { - add("ssa-nomerge"); - } - if (optimizeLevel >= 3) { - add("flatten"); // differs - add("simplify-locals-notee-nostructure"); // differs - add("vacuum"); // differs - add("code-folding"); // differs - add("flatten"); - add("local-cse"); - add("reorder-locals"); // differs - } - if (optimizeLevel >= 2 || shrinkLevel >= 1) { // differs - add("rse"); - add("vacuum"); - } - if (hasARC) { // differs - if (optimizeLevel < 3) { - add("flatten"); - } - add("post-assemblyscript"); - } - add("optimize-instructions"); // differs - add("inlining"); // differs - add("dce"); - add("remove-unused-brs"); - add("remove-unused-names"); - add("inlining-optimizing"); // differs - if (optimizeLevel >= 2 || shrinkLevel >= 1) { - add("pick-load-signs"); - add("simplify-globals-optimizing"); // differs - } - if (optimizeLevel >= 3 || shrinkLevel >= 2) { - add("precompute-propagate"); - } else { - add("precompute"); - } - add("vacuum"); // differs - // this will be done later (1) - // if (optimizeLevel >= 2 || shrinkLevel >= 2) { - // add("code-pushing"); - // } - if (optimizeLevel >= 3 && shrinkLevel <= 1) { // differs - add("licm"); - } - add("simplify-locals-nostructure"); - add("vacuum"); - add("reorder-locals"); - add("remove-unused-brs"); - // if (optimizeLevel >= 3 || shrinkLevel >= 2) { // do it later - // add("merge-locals"); - // } - add("coalesce-locals"); - add("simplify-locals"); - add("vacuum"); - add("reorder-locals"); - add("coalesce-locals"); - add("reorder-locals"); - if (optimizeLevel >= 3 || shrinkLevel >= 1) { // differs - add("merge-locals"); - } - add("vacuum"); - if (optimizeLevel >= 3 || shrinkLevel >= 1) { - add("code-folding"); - } - if (optimizeLevel >= 2 || shrinkLevel >= 1) { // differs - add("simplify-globals-optimizing"); - } - add("merge-blocks"); - add("remove-unused-brs"); - add("remove-unused-names"); - add("merge-blocks"); - // make this later & move to (2) - // if (optimizeLevel >= 3 || shrinkLevel >= 2) { - // add("precompute-propagate"); - // } else { - // add("precompute"); - // } - if (optimizeLevel >= 3) { - add("optimize-instructions"); - } - if (optimizeLevel >= 2 || shrinkLevel >= 1) { - add("rse"); - } - add("vacuum"); - // PassRunner::addDefaultGlobalOptimizationPostPasses - if (optimizeLevel >= 2 || shrinkLevel >= 1) { - add("simplify-globals-optimizing"); // differs - add("dae-optimizing"); - } - if (optimizeLevel >= 2 || shrinkLevel >= 2) { - add("inlining-optimizing"); - } - if (module.getLowMemoryUnused()) { - if (optimizeLevel >= 3 || shrinkLevel >= 1) { - add("optimize-added-constants-propagate"); - } else { - add("optimize-added-constants"); - } - } - // "duplicate-function-elimination" will better done later - // add("duplicate-function-elimination"); - add("duplicate-import-elimination"); - if (optimizeLevel >= 2 || shrinkLevel >= 2) { - add("simplify-globals-optimizing"); - } else { - add("simplify-globals"); - add("vacuum"); // differs - } - // moved from (2) - // it works better after globals optimizations like simplify-globals, inlining-optimizing and etc - if (optimizeLevel >= 2 || shrinkLevel >= 1) { // differs - add("precompute-propagate"); - } else { - add("precompute"); - } - // replace indirect calls with direct, reduce arity and - // inline this calls if possible - add("directize"); // differs - add("dae-optimizing"); // differs - add("inlining-optimizing"); // differs - // ARC finalization should be done exactly after inlining for better release/retain reduction - if (hasARC) { // differs - add("post-assemblyscript-finalize"); - } - if (optimizeLevel >= 2 || shrinkLevel >= 1) { // differs - add("rse"); - // move some code after early return which potentially could reduce computations - // do this after CFG cleanup (originally it was done before) - // moved from (1) - add("code-pushing"); - if (optimizeLevel >= 3) { - // this quite expensive so do this only for highest opt level - add("simplify-globals"); - add("vacuum"); - // replace indirect calls with direct and inline if possible again. - add("inlining-optimizing"); - add("directize"); - add("dae-optimizing"); - add("precompute-propagate"); - add("vacuum"); - add("merge-locals"); - add("coalesce-locals"); - add("simplify-locals-nostructure"); - add("vacuum"); - add("inlining-optimizing"); - add("precompute-propagate"); - } - add("remove-unused-brs"); - add("remove-unused-names"); - add("vacuum"); - add("optimize-instructions"); - add("simplify-globals-optimizing"); - } - // remove unused elements of table and pack / reduce memory - add("duplicate-function-elimination"); // differs - add("remove-unused-nonfunction-module-elements"); // differs - add("memory-packing"); - add("remove-unused-module-elements"); // differs - // It seems stack-ir unuseful for our needs. - // if (optimizeLevel >= 3 || shrinkLevel >= 1) { // differs. was optimizeLevel >= 2 - // add("generate-stack-ir"); - // add("optimize-stack-ir"); - // } - } - - // Append additional passes if requested and execute - module.runPasses(passes.concat(runPasses)); - } - stats.optimizeTime += measure(() => { stats.optimizeCount++; - doOptimize(); - if (args.converge) { + module.optimize(optimizeLevel, shrinkLevel, debugInfo, usesARC); + module.runPasses(runPasses); + if (converge) { let last = module.toBinary(); do { stats.optimizeCount++; - doOptimize(); + module.optimize(optimizeLevel, shrinkLevel, debugInfo, usesARC); + module.runPasses(runPasses); let next = module.toBinary(); if (next.output.length >= last.output.length) { if (next.output.length > last.output.length) { diff --git a/src/module.ts b/src/module.ts index 018c941d62..f0337460c1 100644 --- a/src/module.ts +++ b/src/module.ts @@ -1394,28 +1394,211 @@ export class Module { binaryen._BinaryenModuleSetFeatures(this.ref, featureFlags); } - optimize(func: FunctionRef = 0): void { + runPass(pass: string, func: FunctionRef = 0): void { + var cStr = allocString(pass); if (func) { - binaryen._BinaryenFunctionOptimize(func, this.ref); + binaryen._BinaryenFunctionRunPasses(func, this.ref, cStr, 1); } else { - binaryen._BinaryenModuleOptimize(this.ref); + binaryen._BinaryenModuleRunPasses(this.ref, cStr, 1); } + binaryen._free(cStr); } runPasses(passes: string[], func: FunctionRef = 0): void { var numNames = passes.length; - var names = new Array(numNames); + var cStrs = new Array(numNames); for (let i = 0; i < numNames; ++i) { - names[i] = allocString(passes[i]); + cStrs[i] = allocString(passes[i]); } - var cArr = allocPtrArray(names); + var cArr = allocPtrArray(cStrs); if (func) { binaryen._BinaryenFunctionRunPasses(func, this.ref, cArr, numNames); } else { binaryen._BinaryenModuleRunPasses(this.ref, cArr, numNames); } binaryen._free(cArr); - for (let i = numNames; i >= 0; --i) binaryen._free(names[i]); + for (let i = numNames; i >= 0; --i) binaryen._free(cStrs[i]); + } + + optimize(optimizeLevel: i32, shrinkLevel: i32, debugInfo: bool = false, usesARC: bool = true): void { + // Implicitly run costly non-LLVM optimizations on -O3 or -Oz + if (optimizeLevel >= 3 || shrinkLevel >= 2) optimizeLevel = 4; + + binaryen._BinaryenSetOptimizeLevel(optimizeLevel); + binaryen._BinaryenSetShrinkLevel(shrinkLevel); + binaryen._BinaryenSetDebugInfo(debugInfo); + + // Tweak inlining limits based on optimization levels + if (optimizeLevel >= 2 && shrinkLevel === 0) { + binaryen._BinaryenSetAlwaysInlineMaxSize(12); + binaryen._BinaryenSetFlexibleInlineMaxSize(70); + binaryen._BinaryenSetOneCallerInlineMaxSize(200); + } else { + binaryen._BinaryenSetAlwaysInlineMaxSize( + optimizeLevel == 0 && shrinkLevel >= 0 + ? 2 + : 4 + ); + binaryen._BinaryenSetFlexibleInlineMaxSize(65); + binaryen._BinaryenSetOneCallerInlineMaxSize(80); + } + + // Pass order here differs substantially from Binaryen's defaults + // see: Binaryen/src/pass.cpp + if (optimizeLevel > 0 || shrinkLevel > 0) { + let passes = new Array(); + + // --- PassRunner::addDefaultGlobalOptimizationPrePasses --- + + passes.push("duplicate-function-elimination"); + passes.push("remove-unused-module-elements"); // + + + // --- PassRunner::addDefaultFunctionOptimizationPasses --- + + if (optimizeLevel >= 3 || shrinkLevel >= 1) { + passes.push("ssa-nomerge"); + } + if (optimizeLevel >= 3) { + passes.push("flatten"); + passes.push("simplify-locals-notee-nostructure"); + passes.push("vacuum"); + passes.push("code-folding"); + passes.push("flatten"); + passes.push("local-cse"); + passes.push("reorder-locals"); + } + if (optimizeLevel >= 2 || shrinkLevel >= 1) { + passes.push("rse"); + passes.push("vacuum"); + } + if (usesARC) { + if (optimizeLevel < 3) { + passes.push("flatten"); + } + passes.push("post-assemblyscript"); + } + passes.push("optimize-instructions"); + passes.push("inlining"); + passes.push("dce"); + passes.push("remove-unused-brs"); + passes.push("remove-unused-names"); + passes.push("inlining-optimizing"); + if (optimizeLevel >= 2 || shrinkLevel >= 1) { + passes.push("pick-load-signs"); + passes.push("simplify-globals-optimizing"); + } + if (optimizeLevel >= 3 || shrinkLevel >= 2) { + passes.push("precompute-propagate"); + } else { + passes.push("precompute"); + } + passes.push("vacuum"); + if (optimizeLevel >= 3 && shrinkLevel <= 1) { + passes.push("licm"); + } + passes.push("simplify-locals-nostructure"); + passes.push("vacuum"); + passes.push("reorder-locals"); + passes.push("remove-unused-brs"); + passes.push("coalesce-locals"); + passes.push("simplify-locals"); + passes.push("vacuum"); + passes.push("reorder-locals"); + passes.push("coalesce-locals"); + passes.push("reorder-locals"); + if (optimizeLevel >= 3 || shrinkLevel >= 1) { + passes.push("merge-locals"); + } + passes.push("vacuum"); + if (optimizeLevel >= 3 || shrinkLevel >= 1) { + passes.push("code-folding"); + } + if (optimizeLevel >= 2 || shrinkLevel >= 1) { + passes.push("simplify-globals-optimizing"); + } + passes.push("merge-blocks"); + passes.push("remove-unused-brs"); + passes.push("remove-unused-names"); + passes.push("merge-blocks"); + if (optimizeLevel >= 3) { + passes.push("optimize-instructions"); + } + if (optimizeLevel >= 2 || shrinkLevel >= 1) { + passes.push("rse"); + } + passes.push("vacuum"); + + // --- PassRunner::addDefaultGlobalOptimizationPostPasses --- + + if (optimizeLevel >= 2 || shrinkLevel >= 1) { + passes.push("simplify-globals-optimizing"); + passes.push("dae-optimizing"); + } + if (optimizeLevel >= 2 || shrinkLevel >= 2) { + passes.push("inlining-optimizing"); + } + if (binaryen._BinaryenGetLowMemoryUnused()) { + if (optimizeLevel >= 3 || shrinkLevel >= 1) { + passes.push("optimize-added-constants-propagate"); + } else { + passes.push("optimize-added-constants"); + } + } + passes.push("duplicate-import-elimination"); + if (optimizeLevel >= 2 || shrinkLevel >= 2) { + passes.push("simplify-globals-optimizing"); + } else { + passes.push("simplify-globals"); + passes.push("vacuum"); + } + // precompute works best after global optimizations + if (optimizeLevel >= 2 || shrinkLevel >= 1) { + passes.push("precompute-propagate"); + } else { + passes.push("precompute"); + } + passes.push("directize"); // replace indirect with direct calls + passes.push("dae-optimizing"); // reduce arity + passes.push("inlining-optimizing"); // and inline if possible + if (usesARC) { + // works best after inlining to cover most retains/releases + passes.push("post-assemblyscript-finalize"); + } + if (optimizeLevel >= 2 || shrinkLevel >= 1) { + passes.push("rse"); + // move code on early return (after CFG cleanup) + passes.push("code-pushing"); + if (optimizeLevel >= 3) { + // very expensive, so O3 only + passes.push("simplify-globals"); + passes.push("vacuum"); + // replace indirect with direct calls again and inline + passes.push("inlining-optimizing"); + passes.push("directize"); + passes.push("dae-optimizing"); + passes.push("precompute-propagate"); + passes.push("vacuum"); + passes.push("merge-locals"); + passes.push("coalesce-locals"); + passes.push("simplify-locals-nostructure"); + passes.push("vacuum"); + passes.push("inlining-optimizing"); + passes.push("precompute-propagate"); + } + passes.push("remove-unused-brs"); + passes.push("remove-unused-names"); + passes.push("vacuum"); + passes.push("optimize-instructions"); + passes.push("simplify-globals-optimizing"); + } + // clean up + passes.push("duplicate-function-elimination"); + passes.push("remove-unused-nonfunction-module-elements"); + passes.push("memory-packing"); + passes.push("remove-unused-module-elements"); + + this.runPasses(passes); + } } private cachedPrecomputeNames: usize = 0; diff --git a/tests/compiler/features/simd.optimized.wat b/tests/compiler/features/simd.optimized.wat index 952933a6b7..317d754484 100644 --- a/tests/compiler/features/simd.optimized.wat +++ b/tests/compiler/features/simd.optimized.wat @@ -8,37 +8,29 @@ (global $~lib/rt/stub/offset (mut i32) (i32.const 0)) (export "memory" (memory $0)) (start $~start) - (func $features/simd/test_v128 + (func $start:features/simd (local $0 i32) (local $1 i32) - (local $2 i32) - (local $3 i32) - global.get $~lib/rt/stub/offset - i32.const 16 - i32.add - local.tee $1 - i32.const -64 - i32.sub - local.tee $2 + i32.const 1120 + global.set $~lib/rt/stub/offset + i32.const 1200 memory.size - local.tee $3 + local.tee $1 i32.const 16 i32.shl local.tee $0 i32.gt_u if - local.get $3 - local.get $2 + local.get $1 + i32.const 66735 local.get $0 i32.sub - i32.const 65535 - i32.add i32.const -65536 i32.and i32.const 16 i32.shr_u local.tee $0 - local.get $3 + local.get $1 local.get $0 i32.gt_s select @@ -55,48 +47,26 @@ end end end - local.get $2 + i32.const 1200 global.set $~lib/rt/stub/offset - local.get $1 - i32.const 16 - i32.sub - local.tee $0 + i32.const 1120 i32.const 64 i32.store - local.get $0 + i32.const 1124 i32.const 1 - i32.store offset=4 - local.get $0 + i32.store + i32.const 1128 i32.const 0 - i32.store offset=8 - local.get $0 + i32.store + i32.const 1132 i32.const 64 - i32.store offset=12 - local.get $1 - local.get $1 - v128.load offset=16 - v128.store offset=32 - local.get $1 - i32.const 15 - i32.and - i32.eqz - i32.const 0 - local.get $1 - select - i32.eqz - if - i32.const 0 - i32.const 1040 - i32.const 70 - i32.const 3 - call $~lib/builtins/abort - unreachable - end - local.get $1 - i32.const 16 - i32.sub - local.tee $0 - i32.load offset=4 + i32.store + i32.const 1168 + i32.const 1152 + v128.load + v128.store + i32.const 1124 + i32.load i32.const 1 i32.ne if @@ -108,19 +78,17 @@ unreachable end global.get $~lib/rt/stub/offset - local.get $1 - local.get $0 + i32.const 1120 i32.load + i32.const 1136 i32.add i32.eq if - local.get $0 + i32.const 1120 global.set $~lib/rt/stub/offset end ) (func $~start - i32.const 1120 - global.set $~lib/rt/stub/offset - call $features/simd/test_v128 + call $start:features/simd ) ) diff --git a/tests/compiler/std/object-literal-omitted.optimized.wat b/tests/compiler/std/object-literal-omitted.optimized.wat index a70a39a38b..7399ef8711 100644 --- a/tests/compiler/std/object-literal-omitted.optimized.wat +++ b/tests/compiler/std/object-literal-omitted.optimized.wat @@ -632,10 +632,10 @@ i32.const 16 i32.lt_u if - local.get $2 local.get $1 i32.const 4 i32.shl + local.get $2 i32.add i32.const 2 i32.shl @@ -1418,26 +1418,26 @@ i32.eq br_if $__inlined_func$~lib/string/String.__eq drop + block $folding-inner0 + i32.const 0 + i32.const 1 + local.get $2 + select + br_if $folding-inner0 + local.get $2 + call $~lib/string/String#get:length + local.tee $3 + i32.const 1280 + call $~lib/string/String#get:length + i32.ne + br_if $folding-inner0 + local.get $2 + local.get $3 + call $~lib/util/string/compareImpl + i32.eqz + br $__inlined_func$~lib/string/String.__eq + end i32.const 0 - i32.const 0 - i32.const 1 - local.get $2 - select - br_if $__inlined_func$~lib/string/String.__eq - drop - i32.const 0 - local.get $2 - call $~lib/string/String#get:length - local.tee $3 - i32.const 1280 - call $~lib/string/String#get:length - i32.ne - br_if $__inlined_func$~lib/string/String.__eq - drop - local.get $2 - local.get $3 - call $~lib/util/string/compareImpl - i32.eqz end i32.eqz if @@ -1501,16 +1501,13 @@ block $switch$1$case$6 block $switch$1$case$4 local.get $0 - i32.const 16 - i32.add - local.tee $1 i32.const 8 - i32.sub + i32.add i32.load br_table $__inlined_func$~lib/rt/__visit_members $__inlined_func$~lib/rt/__visit_members $switch$1$case$4 $__inlined_func$~lib/rt/__visit_members $switch$1$case$6 $switch$1$default end - local.get $1 - i32.load + local.get $0 + i32.load offset=16 local.tee $1 if local.get $1 @@ -1518,8 +1515,8 @@ end br $__inlined_func$~lib/rt/__visit_members end - local.get $1 - i32.load offset=4 + local.get $0 + i32.load offset=20 local.tee $1 if local.get $1