[webgpu] Support convTranspose vec4 (#6603)

axinging · web-flow · commit 493945357a7a · 2022-07-07T16:35:58.000+08:00
diff --git a/tfjs-backend-webgpu/src/conv_backprop_mm_webgpu.ts b/tfjs-backend-webgpu/src/conv_backprop_mm_webgpu.ts
@@ -16,107 +16,163 @@
  */
 
 import {backend_util, util} from '@tensorflow/tfjs-core';
-
+import {typeSnippet} from './activation_util';
+import {makeMatMulPackedVec4Source} from './matmul_packed_vec4_webgpu';
 import {makeMatMulPackedSource} from './matmul_packed_webgpu';
 import {WebGPUProgram} from './webgpu_program';
 import {computeDispatch, computeWorkGroupSizeForConv2d, computeWorkPerThreadForConv2d} from './webgpu_util';
 
+function conv2dTransposeCommonSnippet(innerElementSize = 4) {  // Builds the WGSL functions mm_readA, mm_readB and mm_write as one string; innerElementSize must be 1 or 4 (getWSnippet throws otherwise).
+  const getWSnippet = (innerElementSize: number) => {  // Statements placed in mm_readB's in-bounds branch to read W.
+    switch (innerElementSize) {
+      case 1:  // Single read of W at coord.
+        return 'return W[getIndexFromCoords4D(coord, uniforms.wShape)];';
+      case 4:  // Reads W at col, col + 1, col + 2, col + 3 and returns them as one vec4<f32>.
+        return `
+            let coord1 = vec4<i32>(coordX, coordY, col + 1, rowInner);
+            let coord2 = vec4<i32>(coordX, coordY, col + 2, rowInner);
+            let coord3 = vec4<i32>(coordX, coordY, col + 3, rowInner);
+            let v0 = W[getIndexFromCoords4D(coord, uniforms.wShape)];
+            let v1 = W[getIndexFromCoords4D(coord1, uniforms.wShape)];
+            let v2 = W[getIndexFromCoords4D(coord2, uniforms.wShape)];
+            let v3 = W[getIndexFromCoords4D(coord3, uniforms.wShape)];
+            return vec4<f32>(v0, v1, v2, v3);
+            `;
+      default:  // Widths other than 1 and 4 are rejected.
+        throw new Error(
+            `innerElementSize ${innerElementSize} is not supported.`);
+    }
+  };
+  // Body used by mm_readA inside its bounds check: derives an x coordinate from (row, col) and returns zero when the x row/column is negative, past outBackprop[1]/[2], or fractional.
+  const readASnippet = `
+      let outRow = row / uniforms.outShape[2];
+      let outCol = row % uniforms.outShape[2];
+
+      let WRow = col / (uniforms.filterDims[1] * uniforms.outBackprop[3]);
+      let WCol = col / uniforms.outBackprop[3] % uniforms.filterDims[1];
+      let xR = f32(outRow - uniforms.pads[0] + WRow) / f32(uniforms.stride[0]);
+      let xC = f32(outCol - uniforms.pads[1] + WCol) / f32(uniforms.stride[1]);
+      if (xR < 0.0 || xR >= f32(uniforms.outBackprop[1]) || fract(xR) > 0.0) {
+        return ${typeSnippet(innerElementSize)}(0.0);
+      }
+      if (xC < 0.0 || xC >= f32(uniforms.outBackprop[2]) || fract(xC) > 0.0) {
+        return ${typeSnippet(innerElementSize)}(0.0);
+      }
+      let coord = vec4<i32>(
+          batch,
+          i32(xR),
+          i32(xC),
+          col % uniforms.outBackprop[3]);
+      return x[getIndexFromCoords4D(coord, uniforms.xShape)/${
+      innerElementSize}];`;
+  // Wraps readASnippet so that row >= dimAOuter or col >= dimInner returns zero.
+  const sampleA = `if (row < uniforms.dimAOuter && col < uniforms.dimInner) {
+        ${readASnippet}
+      }
+      return ${typeSnippet(innerElementSize)}(0.0);`;
+  // Each function below first scales colIn by innerElementSize to get col; the index into x and result is divided by innerElementSize.
+  const userCode = `
+  fn mm_readA(row : i32, colIn : i32, globalId : vec3<u32>) -> ${
+      typeSnippet(innerElementSize)} {
+    let col = colIn * ${innerElementSize};
+    var batch = i32(globalId.z);
+    ${sampleA}
+  }
+
+  fn mm_readB(row : i32, colIn : i32, globalId : vec3<u32>) -> ${
+      typeSnippet(innerElementSize)} {
+    let col = colIn * ${innerElementSize};
+    let coordX = uniforms.filterDims.x - 1 -
+        row / (uniforms.filterDims[1] * uniforms.outBackprop[3]);
+    let coordY = uniforms.filterDims.y - 1 -
+        (row / uniforms.outBackprop[3]) % uniforms.filterDims[1];
+    if (row < uniforms.dimInner && col < uniforms.dimBOuter &&
+        coordX >= 0 && coordY >= 0) {
+      let rowInner = row % uniforms.outBackprop[3];
+      let coord = vec4<i32>(coordX, coordY, col, rowInner);
+      ${getWSnippet(innerElementSize)}
+    }
+    return ${typeSnippet(innerElementSize)}(0.0);
+  }
+
+  fn mm_write(row : i32, colIn : i32, valueInput : ${
+      typeSnippet(innerElementSize)}, globalId : vec3<u32>) {
+    let col = colIn * ${innerElementSize};
+    if (row < uniforms.dimAOuter && (col + ${
+      innerElementSize - 1}) < uniforms.dimBOuter) {
+      var batch = i32(globalId.z);
+      var value = valueInput;
+      let outCoord = vec4<i32>(
+          batch,
+          row / uniforms.outShape[2],
+          row % uniforms.outShape[2],
+          col);
+      result[getIndexFromCoords4D(outCoord, uniforms.outShape)/${
+      innerElementSize}] = value;
+    }
+  }`;
+  return userCode;
+}
+
 export class Conv2DDerInputMMProgram implements WebGPUProgram {
   outputShape: number[];
   shaderKey: string;
   dispatchLayout: {x: number[], y: number[], z: number[]};
   dispatch: [number, number, number];
   variableNames = ['x', 'W'];
+  variableTypes: string[];  // Set only when isVec4 is true.
   uniforms =
       'filterDims : vec2<i32>, pads : vec2<i32>, stride : vec2<i32>, outBackprop : vec4<i32>, dimAOuter : i32, dimBOuter : i32, dimInner : i32,';
   workGroupSize: [number, number, number];
   elementsPerThread: [number, number, number];
+  tileAOuter: number;  // workGroupSize[1] * elementsPerThread[1]
+  tileBOuter: number;  // workGroupSize[0] * elementsPerThread[0]
+  tileInner: number;  // max(workGroupSize[0] * innerElementSize, workGroupSize[1])
+  innerElementSize: number;  // 4 when isVec4, otherwise elementsPerThread[0]
+  isVec4?: boolean;  // inChannels and outChannels are both multiples of 4
 
   constructor(convInfo: backend_util.Conv2DInfo) {
     this.outputShape = convInfo.inShape;
 
     util.assert(
         convInfo.dataFormat === 'channelsLast',
         () => 'TODO: NCHW is unimplemented');
+    this.isVec4 =  // Selects the vec4 shader variant for this program.
+        convInfo.inChannels % 4 === 0 && convInfo.outChannels % 4 === 0;
     this.dispatchLayout = {x: [3], y: [1, 2], z: [0]};
-    this.workGroupSize =
-        computeWorkGroupSizeForConv2d(this.dispatchLayout, this.outputShape);
-    this.elementsPerThread =
-        computeWorkPerThreadForConv2d(this.dispatchLayout, this.outputShape);
+    this.workGroupSize = computeWorkGroupSizeForConv2d(
+        this.dispatchLayout, this.outputShape, this.isVec4);
+    this.elementsPerThread = computeWorkPerThreadForConv2d(
+        this.dispatchLayout, this.outputShape, this.isVec4);
 
     this.dispatch = computeDispatch(
         this.dispatchLayout, this.outputShape, this.workGroupSize,
         this.elementsPerThread);
 
-    this.shaderKey = `conv2DDerInputMM_${this.elementsPerThread}`;
+    if (this.isVec4) {
+      this.innerElementSize = 4;
+      this.variableTypes = ['vec4<f32>', 'f32'];  // presumably positional with variableNames ['x', 'W'] — confirm
+    } else {
+      this.innerElementSize = this.elementsPerThread[0];
+    }
+    this.tileAOuter = this.workGroupSize[1] * this.elementsPerThread[1];
+    this.tileBOuter = this.workGroupSize[0] * this.elementsPerThread[0];
+    this.tileInner = Math.max(
+        this.workGroupSize[0] * this.innerElementSize, this.workGroupSize[1]);
+    this.shaderKey = `conv2DDerInputMM_${this.isVec4}_${
+        this.elementsPerThread}_${this.innerElementSize}`;
   }
 
   getUserCode(): string {
-    const matMulSource =
+    const matMulSource = this.isVec4 ?  // Pick the matmul source matching the snippet width passed below (4 or 1).
+        makeMatMulPackedVec4Source(
+            this.elementsPerThread, this.tileAOuter, this.tileBOuter,
+            this.tileInner, this.innerElementSize) :
         makeMatMulPackedSource(this.elementsPerThread, this.workGroupSize);
-
-    const readASnippet = `
-    let outRow = row / uniforms.outShape[2];
-    let outCol = row % uniforms.outShape[2];
-
-    let WRow = col / (uniforms.filterDims[1] * uniforms.outBackprop[3]);
-    let WCol = col / uniforms.outBackprop[3] % uniforms.filterDims[1];
-    let xR = f32(outRow - uniforms.pads[0] + WRow) / f32(uniforms.stride[0]);
-    let xC = f32(outCol - uniforms.pads[1] + WCol) / f32(uniforms.stride[1]);
-    if (xR < 0.0 || xR >= f32(uniforms.outBackprop[1]) || fract(xR) > 0.0) {
-      return 0.0;
-    }
-    if (xC < 0.0 || xC >= f32(uniforms.outBackprop[2]) || fract(xC) > 0.0) {
-      return 0.0;
-    }
-    let coord = vec4<i32>(
-        batch,
-        i32(xR),
-        i32(xC),
-        col % uniforms.outBackprop[3]);
-    return x[getIndexFromCoords4D(coord, uniforms.xShape)];`;
-
-    const sampleA = `if (row < uniforms.dimAOuter && col < uniforms.dimInner) {
-      ${readASnippet}
-    }
-    return 0.0;`;
-
     const userCode = `
-    fn mm_readA(row : i32, col : i32, globalId : vec3<u32>) -> f32 {
-      var batch = i32(globalId.z);
-      ${sampleA}
-    }
-
-    fn mm_readB(row : i32, col : i32, globalId : vec3<u32>) -> f32 {
-      let coordX = uniforms.filterDims.x - 1 -
-          row / (uniforms.filterDims[1] * uniforms.outBackprop[3]);
-      let coordY = uniforms.filterDims.y - 1 -
-          (row / uniforms.outBackprop[3]) % uniforms.filterDims[1];
-      if (row < uniforms.dimInner && col < uniforms.dimBOuter &&
-          coordX >= 0 && coordY >= 0) {
-        let coord = vec4<i32>(coordX, coordY, col,
-            row % uniforms.outBackprop[3]);
-        return W[getIndexFromCoords4D(coord, uniforms.wShape)];
-      }
-      return 0.0;
-    }
-
-    fn mm_write(row : i32, col : i32, valueInput : f32, globalId : vec3<u32>) {
-      if (row < uniforms.dimAOuter && col < uniforms.dimBOuter)
-      {
-      var batch = i32(globalId.z);
-      var value = valueInput;
-      let outCoord = vec4<i32>(
-          batch,
-          row / uniforms.outShape[2],
-          row % uniforms.outShape[2],
-          col);
-      result[getIndexFromCoords4D(outCoord, uniforms.outShape)] = value;
-      }
-    }
-
+    ${conv2dTransposeCommonSnippet(this.isVec4 ? 4 : 1)}
     ${matMulSource}
-  `;
+    `;
     return userCode;
   }
 }
diff --git a/tfjs-backend-webgpu/src/setup_test.ts b/tfjs-backend-webgpu/src/setup_test.ts
@@ -60,6 +60,12 @@ const TEST_FILTERS: TestFilter[] = [
       'gradient',  // gradient function not found.
     ]
   },
+  {
+    startsWith: 'conv2dTranspose ',
+    excludes: [
+      'gradient',  // gradient function not found.
+    ]
+  },
   {
     startsWith: 'cumprod ',
     excludes: [
@@ -283,7 +289,6 @@ const TEST_FILTERS: TestFilter[] = [
       'avgPool3dBackprop ',
       'bincount ',
       'broadcastArgs ',
-      'conv2dTranspose ',
       'conv2DBackpropFilter ',
       'gradient with clones, input=2x2x1,d2=1,f=1,s=1,d=1,p=same',  // Conv2DBackpropFilter
       'conv1d gradients',  // Conv2DBackpropFilter
diff --git a/tfjs-core/src/ops/conv2d_transpose_test.ts b/tfjs-core/src/ops/conv2d_transpose_test.ts
@@ -207,14 +207,14 @@ describeWithFlags('conv2dTranspose', ALL_ENVS, () => {
            inputShape);
        const w = tf.tensor4d(
            [
-             0., 1., 2., 3., 4., 5., 6., 7., 8.,
-             9., 10., 11., 12., 13., 14., 15.
+             0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,
+             15.
            ],
            [fSize, fSize, origInputDepth, origOutputDepth]);
 
        expect(
            () => tf.conv2dTranspose(
-              x, w, [1, 3, 3, 1], origStride, origPad, dimRoundingMode))
+               x, w, [1, 3, 3, 1], origStride, origPad, dimRoundingMode))
            .toThrowError();
      });
 
@@ -239,14 +239,14 @@ describeWithFlags('conv2dTranspose', ALL_ENVS, () => {
            inputShape);
        const w = tf.tensor4d(
            [
-             0., 1., 2., 3., 4., 5., 6., 7., 8.,
-             9., 10., 11., 12., 13., 14., 15.
+             0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,
+             15.
            ],
            [fSize, fSize, origInputDepth, origOutputDepth]);
 
        expect(
            () => tf.conv2dTranspose(
-              x, w, [1, 3, 3, 1], origStride, origPad, dimRoundingMode))
+               x, w, [1, 3, 3, 1], origStride, origPad, dimRoundingMode))
            .toThrowError();
      });
 
@@ -666,4 +666,44 @@ describeWithFlags('conv2dTranspose', ALL_ENVS, () => {
     expect(result.shape).toEqual([2, 2, 1]);
     expectArraysClose(await result.data(), expected);
   });
+
+  it('input=8x8x8,output=4x4x8,f=8,s=1,inDepth=8,p=same vec4', async () => {  // Both depths are 8, i.e. multiples of 4.
+    const origInputDepth = 8;
+    const origOutputDepth = 8;
+    const inputShape: [number, number, number, number] =
+        [1, 8, 8, origOutputDepth];
+    const fSize = 8;
+    const origPad = 'same';
+    const origStride: [number, number] = [1, 1];
+    const wShape: [number, number, number, number] =
+        [fSize, fSize, origInputDepth, origOutputDepth];
+    // NOTE(review): the x loop below stops at fSize * fSize * origInputDepth = 512, which equals the size of inputShape (1 * 8 * 8 * 8) only because fSize matches the spatial size; verify before changing either.
+    const inputData = [];
+    for (let i = 0; i < fSize * fSize * origInputDepth; i++) {
+      inputData.push(i % 5);
+    }
+    const wData = [];
+    for (let i = 0; i < fSize * fSize * origInputDepth * origOutputDepth; i++) {
+      wData.push(i % 5);
+    }
+
+    const x = tf.tensor4d(inputData, inputShape);
+    const w = tf.tensor4d(wData, wShape);
+    const result = tf.conv2dTranspose(
+        x, w, [1, 4, 4, origInputDepth], origStride, origPad);
+    expect(result.shape).toEqual([1, 4, 4, 8]);
+    // 128 expected values, matching the asserted [1, 4, 4, 8] result shape.
+    const expected = [
+      512, 533, 469, 550, 506, 512, 533, 469, 550, 506, 512, 533, 469, 550, 506,
+      512, 533, 469, 550, 506, 512, 533, 469, 550, 506, 512, 533, 469, 550, 506,
+      512, 533, 506, 512, 533, 469, 550, 506, 512, 533, 469, 550, 506, 512, 533,
+      469, 550, 506, 512, 533, 469, 550, 506, 512, 533, 469, 550, 506, 512, 533,
+      469, 550, 506, 512, 550, 506, 512, 533, 469, 550, 506, 512, 533, 469, 550,
+      506, 512, 533, 469, 550, 506, 512, 533, 469, 550, 506, 512, 533, 469, 550,
+      506, 512, 533, 469, 550, 506, 469, 550, 506, 512, 533, 469, 550, 506, 512,
+      533, 469, 550, 506, 512, 533, 469, 550, 506, 512, 533, 469, 550, 506, 512,
+      533, 469, 550
+    ];
+    expectArraysClose(await result.data(), expected);
+  });
 });
diff --git a/tfjs-node/src/run_tests.ts b/tfjs-node/src/run_tests.ts
@@ -104,6 +104,8 @@ const IGNORE_LIST: string[] = [
   // Node backend which uses TF 2.4.0 doesn't support explicit padding
   'conv2dTranspose test-tensorflow {} input=3x3x1,d2=1,f=2,s=2,p=explicit',
   // tslint:disable-next-line:max-line-length
+  'conv2dTranspose test-tensorflow {} input=8x8x8,output=4x4x8,f=8,s=1,inDepth=8,p=same vec4',
+  // tslint:disable-next-line:max-line-length
   'conv2dTranspose test-tensorflow {} gradient input=[1,3,3,1] f=[2,2,2,1] s=[1,1] p=explicit',
   'fused conv2d test-tensorflow {} basic in NCHW',
   'fused conv2d test-tensorflow {} im2row in NCHW',