diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index 2d8add82383be..2fda091e412ae 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -309,6 +309,11 @@ struct BufferizationOptions { /// bufferized or not. bool bufferizeFunctionBoundaries = false; + /// Specifies whether to account for parallel regions in RaW analysis. If + /// true, writes inside a parallel region to a buffer defined outside of + /// that region bufferize out-of-place, i.e., are given a new buffer. + bool checkParallelRegions = true; + /// Certain ops have aliasing OpOperand/OpResult invariants (e.g., scf.for). /// If this flag is set to `false`, those invariants are no longer enforced /// with buffer copies. diff --git a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td index 5ace9c390e146..53b3b0505b399 100644 --- a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td +++ b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td @@ -88,6 +88,7 @@ def OneShotBufferizeOp DefaultValuedAttr:$dump_alias_sets, DefaultValuedAttr:$test_analysis_only, DefaultValuedAttr:$print_conflicts, + DefaultValuedAttr:$check_parallel_regions, DefaultValuedAttr:$memcpy_op); let results = (outs TransformHandleTypeInterface:$transformed); diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index 8f8826b9ad56b..1cece818dbbbc 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -498,6 +498,8 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> { 
Option<"bufferizeFunctionBoundaries", "bufferize-function-boundaries", "bool", /*default=*/"0", "Bufferize function boundaries (experimental).">, + Option<"checkParallelRegions", "check-parallel-regions", "bool", + /*default=*/"true", "Account for parallel regions in RaW analysis.">, Option<"copyBeforeWrite", "copy-before-write", "bool", /*default=*/"false", "Skip the analysis. Make a buffer copy on every write.">, ListOption<"dialectFilter", "dialect-filter", "std::string", diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 0fddd60eb8140..e422086c9fde6 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -226,6 +226,7 @@ struct OneShotBufferizePass opt.printConflicts = printConflicts; opt.testAnalysisOnly = testAnalysisOnly; opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; + opt.checkParallelRegions = checkParallelRegions; opt.noAnalysisFuncFilter = noAnalysisFuncFilter; // Configure type converter. diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp index 2d329a1f3d889..d0b4e0dd4383e 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp @@ -611,7 +611,7 @@ hasReadAfterWriteInterference(const DenseSet &usesRead, // Before going through the main RaW analysis, find cases where a buffer must // be privatized due to parallelism. If the result of a write is never read, // privatization is not necessary (and large parts of the IR are likely dead). - if (!usesRead.empty()) { + if (options.checkParallelRegions && !usesRead.empty()) { for (OpOperand *uConflictingWrite : usesWrite) { // Find the allocation point or last write (definition) of the buffer. 
// Note: In contrast to `findDefinitions`, this also returns results of diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir index 4d82021e86f5b..9bb87ffbb2090 100644 --- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s --check-prefixes=CHECK,PARALLEL-CHECK +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only check-parallel-regions=false" -split-input-file | FileCheck %s --check-prefixes=CHECK,NO-PARALLEL-CHECK // Run fuzzer with different seeds. // RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null @@ -811,8 +812,10 @@ func.func @parallel_region() -> tensor<320xf32> %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) { %val = "test.foo"() : () -> (f32) // linalg.fill must bufferize out-of-place because every thread needs a - // private copy of %alloc1. - // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // private copy of %alloc1. If not accounting for parallel regions, the fill + // can bufferize in place. 
+ // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32> scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]} @@ -841,8 +844,10 @@ func.func @parallel_region_mixed_def(%c: i1) -> tensor<320xf32> } %val = "test.foo"() : () -> (f32) // linalg.fill must bufferize out-of-place because every thread needs a - // private copy of %alloc1. - // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // private copy of %alloc1. If not accounting for parallel regions, the fill + // can bufferize in place. + // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} %fill = linalg.fill ins(%val : f32) outs(%selected : tensor<1xf32>) -> tensor<1xf32> scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]} @@ -866,8 +871,10 @@ func.func @parallel_region_two_writes(%f: f32) -> tensor<320xf32> %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) { %val = "test.foo"() : () -> (f32) // linalg.fill must bufferize out-of-place because every thread needs a - // private copy of %alloc1. - // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // private copy of %alloc1. If not accounting for parallel regions, the fill + // can bufferize in place. + // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32> // CHECK: tensor.insert // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"]