Skip to content

Commit bdfb9d7

Browse files
authored
[webgpu] Migrate sparseToDense to the atomic-based kernel (#6552)
PERF The sparseToDense op takes an optional default value. Unlike scatterNd, the output cannot be initialized with fill(), since the default value is a scalar tensor (which could be the result of a previous op) rather than a scalar number. The (horrible!) workaround here is to broadcast the value with tile(). The other challenge is whether the kernel should discard the original value at an index or accumulate onto it. The magic is performed by splitting the op into two "scatter" steps: 1) replace the default value with 0, and 2) add the input sparse values to 0 or whatever is there. This avoids a bitmap for recording whether the output element at an index has been updated by another invocation. Closes #6525
1 parent 466807f commit bdfb9d7

File tree

3 files changed

+120
-42
lines changed

3 files changed

+120
-42
lines changed

tfjs-backend-webgpu/src/kernels/SparseToDense.ts

Lines changed: 75 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@ import {backend_util, KernelConfig, KernelFunc, Rank, SparseToDense, SparseToDen
1919

2020
import {WebGPUBackend} from '../backend_webgpu';
2121
import {scatterImplCPU} from '../kernel_utils/shared';
22-
import {ScatterProgram} from '../scatter_webgpu';
22+
import {ScatterOptimizedProgram} from '../scatter_optimized_webgpu';
2323

24+
import {identity} from './Identity';
2425
import {reshape} from './Reshape';
26+
import {tile} from './Tile';
2527

2628
export function sparseToDense(args: {
2729
inputs: SparseToDenseInputs,
@@ -46,24 +48,85 @@ export function sparseToDense(args: {
4648
sliceRank, strides, $defaultValue, sumDupeIndices);
4749
return backend.makeTensorInfo(outputShape, outBuf.dtype, outBuf.values);
4850
}
51+
52+
const flattenShape = [outputSize / sliceSize, sliceSize];
53+
54+
const $sparseIndices = reshape({
55+
inputs: {x: sparseIndices},
56+
backend,
57+
attrs: {shape: [numUpdates, sliceRank]}
58+
});
59+
const $sparseValues = sparseValues.shape.length ?
60+
reshape({
61+
inputs: {x: sparseValues},
62+
backend,
63+
attrs: {shape: [numUpdates, sliceSize]}
64+
}) :
65+
identity({inputs: {x: sparseValues}, backend});
66+
67+
const type = $sparseValues.dtype;
68+
const zero =
69+
backend.makeTensorInfo([], type, util.makeZerosTypedArray(1, type));
70+
71+
// Fill output tensor with the default value.
72+
const $defaultValue = reshape({
73+
inputs: {x: defaultValue},
74+
backend,
75+
attrs: {shape: Array(flattenShape.length).fill(1)}
76+
});
77+
const $denseValues =
78+
tile({inputs: {x: $defaultValue}, backend, attrs: {reps: flattenShape}});
79+
80+
const size = util.sizeFromShape([numUpdates, sliceSize]);
4981
const uniformData = [
50-
{type: 'int32', data: [numUpdates]},
5182
{type: 'int32', data: [sliceRank]},
5283
{type: 'int32', data: strides},
84+
{type: 'int32', data: [size]},
5385
];
54-
const program = new ScatterProgram(
55-
numUpdates, sliceRank, sparseIndices.shape.length,
56-
sparseValues.shape.length, strides, [outputSize, 1], sumDupeIndices);
5786

58-
const res = backend.runWebGPUProgram(
59-
program, [sparseValues, sparseIndices, defaultValue], sparseValues.dtype,
60-
uniformData);
87+
switch (numUpdates) {
88+
case 0:
89+
break;
90+
case 1:
91+
if (true) {
92+
const program = new ScatterOptimizedProgram(
93+
[numUpdates, sliceSize], sliceRank, $sparseIndices.shape.length,
94+
$sparseValues.shape.length, strides, flattenShape, type,
95+
sumDupeIndices);
96+
backend.runWebGPUProgram(
97+
program, [$sparseValues, $sparseIndices], type, uniformData,
98+
$denseValues);
99+
}
100+
break;
101+
default:
102+
if (true) {
103+
// First replace the default value with 0 at indices.
104+
const program = new ScatterOptimizedProgram(
105+
[numUpdates, sliceSize], sliceRank, $sparseIndices.shape.length,
106+
zero.shape.length, strides, flattenShape, type, sumDupeIndices);
107+
backend.runWebGPUProgram(
108+
program, [zero, $sparseIndices], type, uniformData, $denseValues);
109+
}
110+
{
111+
// Then replace 0 with the (sum of) sparse value(s) at indices.
112+
const program = new ScatterOptimizedProgram(
113+
[numUpdates, sliceSize], sliceRank, $sparseIndices.shape.length,
114+
$sparseValues.shape.length, strides, flattenShape, type);
115+
backend.runWebGPUProgram(
116+
program, [$sparseValues, $sparseIndices], type, uniformData,
117+
$denseValues);
118+
}
119+
}
61120

62-
const reshaped =
63-
reshape({inputs: {x: res}, backend, attrs: {shape: outputShape}});
121+
const denseValues = reshape(
122+
{inputs: {x: $denseValues}, backend, attrs: {shape: outputShape}});
64123

65-
backend.disposeData(res.dataId);
66-
return reshaped;
124+
backend.disposeData($sparseIndices.dataId);
125+
backend.disposeData($sparseValues.dataId);
126+
backend.disposeData($defaultValue.dataId);
127+
backend.disposeData(zero.dataId);
128+
backend.disposeData($denseValues.dataId);
129+
return denseValues;
67130
}
68131

69132
export const sparseToDenseConfig: KernelConfig = {

tfjs-backend-webgpu/src/scatter_optimized_webgpu.ts

Lines changed: 43 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,14 @@
1616
*/
1717

1818
import {DataType} from '@tensorflow/tfjs-core';
19-
import {getCoordsDataType, getMainHeaderAndGlobalIndexString, WebGPUProgram} from './webgpu_program';
19+
import {getCoordsDataType, getMainHeaderAndGlobalIndexString, mapToWgslTypes, WebGPUProgram} from './webgpu_program';
2020
import {computeDispatch, flatDispatchLayout} from './webgpu_util';
2121

2222
export class ScatterOptimizedProgram implements WebGPUProgram {
2323
variableNames = ['updates', 'indices'];
2424
uniforms: string;
2525
outputShape: number[];
26+
sumDupeIndices: boolean;
2627
shaderKey: string;
2728
dispatchLayout: {x: number[]};
2829
dispatch: [number, number, number];
@@ -36,16 +37,17 @@ export class ScatterOptimizedProgram implements WebGPUProgram {
3637
constructor(
3738
flattenXShape: number[], sliceDim: number, indicesRank: number,
3839
updatesRank: number, strides: number[], shape: number[],
39-
outputDtype: DataType) {
40+
outputDtype: DataType, sumDupeIndices = true) {
4041
this.outputShape = shape;
4142
this.type = outputDtype;
43+
this.sumDupeIndices = sumDupeIndices;
4244
this.dispatchLayout = flatDispatchLayout(flattenXShape);
4345
// Dispatching based on |updates| shape instead of output shape.
4446
this.dispatch =
4547
computeDispatch(this.dispatchLayout, flattenXShape, this.workGroupSize);
4648
this.sliceDimGreaterThanOne = sliceDim > 1;
4749
this.shaderKey = `scatter_${indicesRank}_${updatesRank}_${
48-
this.sliceDimGreaterThanOne}_${outputDtype}`;
50+
this.sliceDimGreaterThanOne}_${outputDtype}_${sumDupeIndices}`;
4951
const stridesType = getCoordsDataType(strides.length);
5052
this.uniforms = `sliceDim : i32, strides: ${stridesType}, size: i32,`;
5153
this.updatesRank = updatesRank;
@@ -64,45 +66,57 @@ export class ScatterOptimizedProgram implements WebGPUProgram {
6466
const strideString = this.sliceDimGreaterThanOne ? 'uniforms.strides[j]' :
6567
'uniforms.strides';
6668

67-
let updatesString = '';
6869
let outCoordsString = '';
6970
let getUpdatesCoordsFromFlatIndex = '';
70-
if (this.updatesRank === 1) {
71-
updatesString = 'coords[0]';
71+
if (this.dispatchLayout.x.length === 1) {
7272
outCoordsString = 'flattenedIndex';
7373
getUpdatesCoordsFromFlatIndex = `
7474
fn getUpdatesCoordsFromFlatIndex(index : i32) -> i32 {
7575
return index;
7676
}
7777
`;
78-
} else if (this.updatesRank === 2) {
79-
updatesString = 'coords[0], coords[1]';
78+
} else if (this.dispatchLayout.x.length === 2) {
8079
outCoordsString = 'vec2<i32>(flattenedIndex, coords[1])';
8180
getUpdatesCoordsFromFlatIndex = `
8281
fn getUpdatesCoordsFromFlatIndex(index : i32) -> vec2<i32> {
83-
let d0 = index / uniforms.updatesShape[1];
84-
let d1 = index - d0 * uniforms.updatesShape[1];
82+
// N.B. |updates| could be a scalar tensor, conceptually representing a
83+
// 2D tensor with all values equal to that. By design, its size must be
84+
// the same as |outShape[1]| in one dimension, and |indicesShape[0]|
85+
// gives the other.
86+
let sliceSize = uniforms.outShape[1];
87+
let d0 = index / sliceSize;
88+
let d1 = index - d0 * sliceSize;
8589
return vec2<i32>(d0, d1);
8690
}
8791
`;
8892
}
89-
const updatesSnippet = `getUpdates(${updatesString})`;
93+
const updatesString =
94+
Array.from({length: this.updatesRank}, (_, idx) => `coords[${idx}]`);
95+
const updatesSnippet = `getUpdates(${updatesString.join(', ')})`;
9096

91-
// atomicAdd only supports uint/int type. For float, we use
92-
// atomicCompareExchangeWeak to simulate.
93-
const atomicAddSnippet = this.type === 'int32' ?
94-
`atomicAdd(&(result[flatIndex]), i32(updateValue));` :
95-
`
96-
var oldValue = atomicLoad(&(result[flatIndex]));
97-
var exchanged = false;
98-
for (; !exchanged;) {
99-
let newValueF32 = bitcast<f32>(oldValue) + updateValue;
100-
let newValue = bitcast<i32>(newValueF32);
101-
let res = atomicCompareExchangeWeak(&(result[flatIndex]), oldValue, newValue);
102-
oldValue = res.old_value;
103-
exchanged = res.exchanged;
104-
}
105-
`;
97+
const atomicRMW = (ptr: string, val: string) => {
98+
let atomicAddSnippet = `atomicAdd(${ptr}, bitcast<i32>(${val}))`;
99+
if (this.type === 'float32') {
100+
atomicAddSnippet = `
101+
{
102+
var oldBits = 0;
103+
var newBits = bitcast<i32>(${val});
104+
loop {
105+
let info = atomicCompareExchangeWeak(${ptr}, oldBits, newBits);
106+
if (info.exchanged) {
107+
break;
108+
}
109+
oldBits = info.old_value;
110+
let oldValue = bitcast<f32>(oldBits);
111+
let newValue = oldValue + (${val});
112+
newBits = bitcast<i32>(newValue);
113+
}
114+
}
115+
`;
116+
}
117+
const atomicStoreSnippet = `atomicStore(${ptr}, bitcast<i32>(${val}));`;
118+
return this.sumDupeIndices ? atomicAddSnippet : atomicStoreSnippet;
119+
};
106120

107121
const userCode = `
108122
${getUpdatesCoordsFromFlatIndex}
@@ -116,10 +130,11 @@ export class ScatterOptimizedProgram implements WebGPUProgram {
116130
let indexInside = i32(round(${indicesSnippet}));
117131
flattenedIndex = flattenedIndex + indexInside * ${strideString};
118132
}
119-
let updateValue = ${updatesSnippet};
133+
let updateValue =
134+
${mapToWgslTypes(this.type, false)}(${updatesSnippet});
120135
let flatIndex = getOutputIndexFromCoords(${outCoordsString});
121136
122-
${atomicAddSnippet}
137+
${atomicRMW('&result[flatIndex]', 'updateValue')};
123138
}
124139
}`;
125140
return userCode;

tfjs-backend-webgpu/src/webgpu_program.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ const commonSnippet = `
367367
type InputInfo = {
368368
dtype: DataType; shape: number[]; name: string;
369369
};
370-
type WGSLDataType = 'f32'|'i32'|'vec4<f32>'|'vec4<i32>'|'vec4<bool>';
370+
export type WGSLDataType = 'f32'|'i32'|'vec4<f32>'|'vec4<i32>'|'vec4<bool>';
371371

372372
/**
373373
* Derives logical coordinates from a flat index. Performs integer division
@@ -754,7 +754,7 @@ function isFlatDispatch(program: WebGPUProgram): boolean {
754754
return program.dispatch[1] === 1 && program.dispatch[2] === 1;
755755
}
756756

757-
function mapToWgslTypes(type: DataType, isVec4: boolean): WGSLDataType|
757+
export function mapToWgslTypes(type: DataType, isVec4: boolean): WGSLDataType|
758758
DataType {
759759
if (type === 'float32') {
760760
return isVec4 ? 'vec4<f32>' : 'f32';

0 commit comments

Comments (0)