Merge branch 'master' into frompixels2

Yang Gu · web-flow · commit 11f6a3a5f9d6 · 2022-06-20T14:19:01.000+08:00
diff --git a/package.json b/package.json
@@ -62,6 +62,7 @@
     "terser": "^5.7.0",
     "ts-morph": "^11.0.3",
     "ts-node": "~8.8.2",
+    "tslib": "^2.4.0",
     "tslint": "^6.1.3",
     "tslint-no-circular-imports": "~0.7.0",
     "typescript": "3.5.3"
diff --git a/tfjs-backend-webgpu/src/depthwise_conv2d_nchw_shared_webgpu.ts b/tfjs-backend-webgpu/src/depthwise_conv2d_nchw_shared_webgpu.ts
@@ -27,9 +27,7 @@ export class DepthwiseConv2DNCHWSharedProgram implements WebGPUProgram {
   dispatchLayout: {x: number[], y: number[], z: number[]};
   dispatch: [number, number, number];
   variableNames = ['x', 'W'];
-  uniforms = `pad : vec2<i32>, stride : vec2<i32>, dilation : vec2<i32>,
-      inDims : vec2<i32>, filterHeight : i32, filterWidth : i32,
-      channelMul : i32,`;
+  uniforms = `pad : vec2<i32>, inDims : vec2<i32>,`;
   workGroupSize: [number, number, number] = [16, 16, 1];
   addBias: boolean;
   activation: backend_util.Activation;
@@ -119,10 +117,10 @@ export class DepthwiseConv2DNCHWSharedProgram implements WebGPUProgram {
         numWorkgroups = NumWorkgroups;
         let coords = getOutputCoords();
         let batch = coords[0];
-        let xRCCorner = vec2<i32>(coords.zw) * uniforms.stride - uniforms.pad;
-        let d2 = coords[1];
-        let d1 = d2 / uniforms.channelMul;
-        let q = d2 - d1 * uniforms.channelMul;
+        let xRCCorner = vec2<i32>(coords.zw) - uniforms.pad;
+        let channelMul = uniforms.wShape[3];
+        let d1 = coords[1] / channelMul;
+        let q = coords[1] % channelMul;
 
         let inputRowStart = xRCCorner.x;
         let inputColStart = xRCCorner.y;
@@ -157,8 +155,8 @@ export class DepthwiseConv2DNCHWSharedProgram implements WebGPUProgram {
         workgroupBarrier();
 
         var dotProd = 0.0;
-        for (var wR = 0; wR < uniforms.filterHeight; wR = wR + 1) {
-          for (var wC = 0; wC < uniforms.filterWidth; wC = wC + 1) {
+        for (var wR = 0; wR < ${this.filterHeight}; wR = wR + 1) {
+          for (var wC = 0; wC < ${this.filterWidth}; wC = wC + 1) {
             let xVal = mm_Asub[localRow + wR][localCol + wC];
             let wVal = mm_Bsub[wR][wC];
             dotProd = fma(xVal, wVal, dotProd);
diff --git a/tfjs-backend-webgpu/src/depthwise_conv2d_vec4_webgpu.ts b/tfjs-backend-webgpu/src/depthwise_conv2d_vec4_webgpu.ts
@@ -20,14 +20,13 @@ import {mapActivationToShaderProgram} from './activation_util';
 import {getWorkGroupSizeString, WebGPUProgram} from './webgpu_program';
 import {computeDispatch} from './webgpu_util';
 
-export class DepthwiseConv2D3x3Program implements WebGPUProgram {
+export class DepthwiseConv2DVec4Program implements WebGPUProgram {
   outputShape: number[];
   shaderKey: string;
   dispatchLayout: {x: number[], y: number[], z: number[]};
   dispatch: [number, number, number];
   variableNames = ['x', 'W'];
-  uniforms =
-      'pad : vec2<i32>, stride : vec2<i32>, dilation : vec2<i32>, inDims : vec2<i32>,';
+  uniforms = 'pad : vec2<i32>, inDims : vec2<i32>,';
   workGroupSize: [number, number, number] = [4, 4, 4];
   convInfo: backend_util.Conv2DInfo;
   addBias: boolean;
@@ -39,9 +38,9 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
       convInfo: backend_util.Conv2DInfo, addBias = false,
       activation: backend_util.Activation = null, hasPreluActivation = false) {
     this.outputShape = convInfo.outShape;
-    this.dispatchLayout = {x: [0, 1], y: [2], z: [3]};
+    this.dispatchLayout = {x: [3], y: [2], z: [0, 1]};
     this.dispatch = computeDispatch(
-        this.dispatchLayout, this.outputShape, this.workGroupSize, [1, 4, 4]);
+        this.dispatchLayout, this.outputShape, this.workGroupSize, [4, 4, 1]);
 
     util.assert(
         convInfo.dataFormat === 'channelsLast',
@@ -59,7 +58,8 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
     this.activation = activation;
     this.hasPreluActivation = hasPreluActivation;
 
-    this.shaderKey = `depthwise3x3_${activation}`;
+    this.shaderKey = `depthwiseVec4_${activation}_${
+        this.convInfo.filterHeight}_${this.convInfo.filterWidth}`;
   }
 
   getUserCode(): string {
@@ -87,65 +87,53 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
     const addBiasSnippet = this.addBias ?
         'dotProd[i] = dotProd[i] + getBiasByOutputCoords(coords);' :
         '';
-
+    // Here 4 is the number of output columns that each thread computes.
+    const xNumber = 4 + this.convInfo.filterWidth - 1;
     const userCode = `
       ${activationSnippet}
-
+      fn readX(batch : i32, row : i32, col : i32, channel : i32) -> vec4<f32> {
+        var value = vec4<f32>(0.0);
+        if (row >=0 && row < uniforms.inDims[0] && col >=0 && col < uniforms.inDims[1])
+        {
+          value = getX(batch, row, col, channel);
+        }
+        return value;
+      }
       ${getWorkGroupSizeString()}
       fn main(@builtin(global_invocation_id) globalId: vec3<u32>) {
-        let batch = 0;
-        let r = i32(globalId.x);
+        let batch = i32(globalId.z) / uniforms.outShape[1];
+        let r = i32(globalId.z) % uniforms.outShape[1];
         let c = i32(globalId.y) * 4;
-        let d2 = i32(globalId.z) * 4;
-        let xRCCorner = vec2<i32>(r, c) * uniforms.stride - uniforms.pad;
-        let d1 = d2;
-        let q = 0;
+        let d1 = i32(globalId.x) * 4;
+        let xRCCorner = vec2<i32>(r, c) - uniforms.pad;
 
         let xRCorner = xRCCorner.x;
         let xCCorner = xRCCorner.y;
-
-        var wVals : array<vec4<f32>, 9>;
-        wVals[0] = getW(0, 0, d1, q);
-        wVals[1] = getW(0, 1, d1, q);
-        wVals[2] = getW(0, 2, d1, q);
-        wVals[3] = getW(1, 0, d1, q);
-        wVals[4] = getW(1, 1, d1, q);
-        wVals[5] = getW(1, 2, d1, q);
-        wVals[6] = getW(2, 0, d1, q);
-        wVals[7] = getW(2, 1, d1, q);
-        wVals[8] = getW(2, 2, d1, q);
-
-        var xVals : array<array<vec4<f32>, 6>, 3>;
-        for (var wR = 0; wR < 3; wR = wR + 1) {
-          let xR = xRCorner + wR * uniforms.dilation[0];
-          for (var wC = 0; wC < 6; wC = wC + 1) {
-            let xC = xCCorner + wC * uniforms.dilation[1];
-            if (xR < 0 || xR >= uniforms.inDims[0] || xC < 0 || xC >= uniforms.inDims[1]) {
-              xVals[wR][wC] = vec4<f32>(0.0);
-            } else {
-              xVals[wR][wC] = getX(batch, xR, xC, d1);
-            }
-          }
-        }
-
+        var xVals : array<vec4<f32>, ${xNumber}>;
         var dotProd : array<vec4<f32>, 4>;
         dotProd[0] = vec4<f32>(0.0);
         dotProd[1] = vec4<f32>(0.0);
         dotProd[2] = vec4<f32>(0.0);
         dotProd[3] = vec4<f32>(0.0);
 
-        for (var wR = 0; wR < 3; wR = wR + 1) {
-          for (var wC = 0; wC < 3; wC = wC + 1) {
-            let indexW = wR * 3 + wC;
-            dotProd[0] = dotProd[0] + xVals[wR][0 + wC] * wVals[indexW];
-            dotProd[1] = dotProd[1] + xVals[wR][1 + wC] * wVals[indexW];
-            dotProd[2] = dotProd[2] + xVals[wR][2 + wC] * wVals[indexW];
-            dotProd[3] = dotProd[3] + xVals[wR][3 + wC] * wVals[indexW];
+        // Use constant instead of uniform can give better performance.
+        for (var wR = 0; wR < ${this.convInfo.filterHeight}; wR = wR + 1) {
+          let xR = xRCorner + wR;
+          for (var i = 0; i < ${xNumber}; i++)
+          {
+            xVals[i] = readX(batch, xR, xCCorner + i, d1);
+          }
+          for (var wC = 0; wC < ${this.convInfo.filterWidth}; wC = wC + 1) {
+            let wValue = getW(wR, wC, d1, 0);
+            dotProd[0] = dotProd[0] + xVals[0 + wC] * wValue;
+            dotProd[1] = dotProd[1] + xVals[1 + wC] * wValue;
+            dotProd[2] = dotProd[2] + xVals[2 + wC] * wValue;
+            dotProd[3] = dotProd[3] + xVals[3 + wC] * wValue;
           }
         }
 
         for (var i = 0; i < 4; i = i + 1) {
-          let coords = vec4<i32>(batch, r, c + i, d2);
+          let coords = vec4<i32>(batch, r, c + i, d1);
           if (coordsInBounds4D(coords, uniforms.outShape)) {
             ${addBiasSnippet}
             ${applyActivationSnippet}
diff --git a/tfjs-backend-webgpu/src/depthwise_conv2d_webgpu.ts b/tfjs-backend-webgpu/src/depthwise_conv2d_webgpu.ts
@@ -27,9 +27,8 @@ export class DepthwiseConv2DProgram implements WebGPUProgram {
   dispatchLayout: {x: number[], y?: number[], z?: number[]};
   dispatch: [number, number, number];
   variableNames = ['x', 'W'];
-  uniforms = `pad : vec2<i32>, stride : vec2<i32>, dilation : vec2<i32>,
-      inDims : vec2<i32>, filterHeight : i32, filterWidth : i32,
-      channelMul : i32,`;
+  uniforms = `pad : vec2<i32>, inDims : vec2<i32>, filterHeight : i32,
+      filterWidth : i32, stride : vec2<i32>, dilation : vec2<i32>,`;
   // This is an experimental value.
   workGroupSize: [number, number, number] = [256, 1, 1];
   convInfo: backend_util.Conv2DInfo;
@@ -98,8 +97,9 @@ export class DepthwiseConv2DProgram implements WebGPUProgram {
         let xRCCorner = vec2<i32>(coords.${
         this.isChannelsLast ? 'yz' : 'zw'}) * uniforms.stride - uniforms.pad;
         let d2 = coords[${this.isChannelsLast ? 3 : 1}];
-        let d1 = d2 / uniforms.channelMul;
-        let q = d2 - d1 * uniforms.channelMul;
+        let channelMul = uniforms.wShape[3];
+        let d1 = d2 / channelMul;
+        let q = d2 % channelMul;
 
         let inputRowStart = xRCCorner.x;
         let inputColStart = xRCCorner.y;
diff --git a/tfjs-backend-webgpu/src/kernels/BatchMatMul_impl.ts b/tfjs-backend-webgpu/src/kernels/BatchMatMul_impl.ts
@@ -94,7 +94,7 @@ export function batchMatMulImpl({
                    (outerShapeA % 4 === 0 && transposeA)) &&
       outerShapeB % 4 === 0 && !transposeB;
   let program: WebGPUProgram;
-  if (outerShapeA * outerShapeB <= 32) {
+  if (outerShapeA * outerShapeB <= 128) {
     program = new MatMulReduceProgram(
         [batchDim, outerShapeA, outerShapeB], batchAEqualOne, batchBEqualOne,
         transposeA, transposeB, bias, activation, preluActivationWeights);
diff --git a/tfjs-backend-webgpu/src/kernels/DepthwiseConv2dNative.ts b/tfjs-backend-webgpu/src/kernels/DepthwiseConv2dNative.ts
@@ -18,8 +18,8 @@
 import {backend_util, DepthwiseConv2dNative, DepthwiseConv2dNativeAttrs, DepthwiseConv2dNativeInputs, KernelConfig, KernelFunc} from '@tensorflow/tfjs-core';
 
 import {WebGPUBackend} from '../backend_webgpu';
-import {DepthwiseConv2D3x3Program} from '../depthwise_conv2d_3x3_webgpu';
 import {DepthwiseConv2DNCHWSharedProgram} from '../depthwise_conv2d_nchw_shared_webgpu';
+import {DepthwiseConv2DVec4Program} from '../depthwise_conv2d_vec4_webgpu';
 import {DepthwiseConv2DProgram} from '../depthwise_conv2d_webgpu';
 
 export function depthwiseConv2dNative(args: {
@@ -42,43 +42,34 @@ export function depthwiseConv2dNative(args: {
       pad, dimRoundingMode, true /* depthwise */, $dataFormat);
   const dimensions = [
     {type: 'int32', data: [convInfo.padInfo.top, convInfo.padInfo.left]},
-    {type: 'int32', data: [convInfo.strideHeight, convInfo.strideWidth]},
-    {type: 'int32', data: [convInfo.dilationHeight, convInfo.dilationWidth]},
-    {type: 'int32', data: [convInfo.inHeight, convInfo.inWidth]}
+    {type: 'int32', data: [convInfo.inHeight, convInfo.inWidth]},
   ];
 
   const isChannelsLast = convInfo.dataFormat === 'channelsLast';
-  let program: DepthwiseConv2DProgram|DepthwiseConv2D3x3Program|
+  let program: DepthwiseConv2DProgram|DepthwiseConv2DVec4Program|
       DepthwiseConv2DNCHWSharedProgram;
   if (!isChannelsLast && convInfo.inHeight > 16 && convInfo.inWidth > 16 &&
       convInfo.strideHeight === 1 && convInfo.strideWidth === 1 &&
       convInfo.dilationWidth === 1 && convInfo.dilationHeight === 1 &&
       convInfo.inChannels === convInfo.outChannels) {
-    dimensions.push(
-        {type: 'int32', data: [convInfo.filterHeight]},
-        {type: 'int32', data: [convInfo.filterWidth]},
-        {type: 'int32', data: [convInfo.outChannels / convInfo.inChannels]});
     program = new DepthwiseConv2DNCHWSharedProgram(
         convInfo.outShape, convInfo.filterHeight, convInfo.filterWidth);
-  }
-  // TODO: To see if we need to relax the limitation. Currently, it's only
-  // for filter size 3x3.
-  else if (
-      isChannelsLast && convInfo.batchSize === 1 &&
-      convInfo.inHeight === convInfo.outHeight &&
-      convInfo.inWidth === convInfo.outWidth && convInfo.strideHeight === 1 &&
-      convInfo.strideWidth === 1 &&
-      convInfo.filterHeight === convInfo.filterWidth &&
+  } else if (
+      isChannelsLast && convInfo.inHeight > 4 && convInfo.inWidth > 4 &&
+      convInfo.strideHeight === 1 && convInfo.strideWidth === 1 &&
       convInfo.inChannels === convInfo.outChannels &&
       convInfo.dilationHeight === 1 && convInfo.dilationWidth === 1 &&
-      convInfo.filterHeight === 3 && convInfo.inChannels % 4 === 0) {
-    program = new DepthwiseConv2D3x3Program(convInfo);
+      convInfo.inChannels % 4 === 0) {
+    program = new DepthwiseConv2DVec4Program(convInfo);
   } else {
     program = new DepthwiseConv2DProgram(convInfo);
     dimensions.push(
         {type: 'int32', data: [convInfo.filterHeight]},
         {type: 'int32', data: [convInfo.filterWidth]},
-        {type: 'int32', data: [convInfo.outChannels / convInfo.inChannels]});
+        {type: 'int32', data: [convInfo.strideHeight, convInfo.strideWidth]}, {
+          type: 'int32',
+          data: [convInfo.dilationHeight, convInfo.dilationWidth]
+        });
   }
 
   return backend.runWebGPUProgram(program, [x, filter], x.dtype, dimensions);
diff --git a/tfjs-backend-webgpu/src/kernels/FusedDepthwiseConv2D.ts b/tfjs-backend-webgpu/src/kernels/FusedDepthwiseConv2D.ts
@@ -18,7 +18,7 @@
 import {backend_util, FusedDepthwiseConv2D, FusedDepthwiseConv2DAttrs, FusedDepthwiseConv2DInputs, KernelConfig, KernelFunc, TensorInfo, util} from '@tensorflow/tfjs-core';
 
 import {WebGPUBackend} from '../backend_webgpu';
-import {DepthwiseConv2D3x3Program} from '../depthwise_conv2d_3x3_webgpu';
+import {DepthwiseConv2DVec4Program} from '../depthwise_conv2d_vec4_webgpu';
 import {DepthwiseConv2DProgram} from '../depthwise_conv2d_webgpu';
 
 export function fusedDepthwiseConv2D(args: {
@@ -60,30 +60,27 @@ export function fusedDepthwiseConv2D(args: {
 
   const dimensions = [
     {type: 'int32', data: [convInfo.padInfo.top, convInfo.padInfo.left]},
-    {type: 'int32', data: [convInfo.strideHeight, convInfo.strideWidth]},
-    {type: 'int32', data: [convInfo.dilationHeight, convInfo.dilationWidth]},
-    {type: 'int32', data: [convInfo.inHeight, convInfo.inWidth]}
+    {type: 'int32', data: [convInfo.inHeight, convInfo.inWidth]},
   ];
 
-  let program: DepthwiseConv2DProgram|DepthwiseConv2D3x3Program;
-  // TODO: To see if we need to relax the limitation. Currently, it's only for
-  // filter size 3x3.
-  if (convInfo.batchSize === 1 && convInfo.inHeight === convInfo.outHeight &&
-      convInfo.inWidth === convInfo.outWidth && convInfo.strideHeight === 1 &&
-      convInfo.strideWidth === 1 &&
-      convInfo.filterHeight === convInfo.filterWidth &&
+  let program: DepthwiseConv2DProgram|DepthwiseConv2DVec4Program;
+  if (convInfo.inHeight > 4 && convInfo.inWidth > 4 &&
+      convInfo.strideHeight === 1 && convInfo.strideWidth === 1 &&
       convInfo.inChannels === convInfo.outChannels &&
       convInfo.dilationHeight === 1 && convInfo.dilationWidth === 1 &&
-      convInfo.filterHeight === 3 && convInfo.inChannels % 4 === 0) {
-    program = new DepthwiseConv2D3x3Program(
+      convInfo.inChannels % 4 === 0) {
+    program = new DepthwiseConv2DVec4Program(
         convInfo, hasBias, activation, hasPreluActivationWeights);
   } else {
     program = new DepthwiseConv2DProgram(
         convInfo, hasBias, activation, hasPreluActivationWeights);
     dimensions.push(
         {type: 'int32', data: [convInfo.filterHeight]},
         {type: 'int32', data: [convInfo.filterWidth]},
-        {type: 'int32', data: [convInfo.outChannels / convInfo.inChannels]});
+        {type: 'int32', data: [convInfo.strideHeight, convInfo.strideWidth]}, {
+          type: 'int32',
+          data: [convInfo.dilationHeight, convInfo.dilationWidth]
+        });
   }
   if (activation === 'leakyrelu') {
     dimensions.push({type: 'float32', data: [leakyreluAlpha]});
diff --git a/tfjs-backend-webgpu/src/matmul_packed_vec4_webgpu.ts b/tfjs-backend-webgpu/src/matmul_packed_vec4_webgpu.ts
@@ -100,7 +100,7 @@ export function makeMatMulPackedVec4Source(
   let InnerElementSize = ${innerElementSize};
   let TileInner = ${tileInner};
 
-  @stage(compute) @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ)
+  @compute @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ)
   fn main(@builtin(local_invocation_id) LocalId : vec3<u32>,
           @builtin(global_invocation_id) GlobalId : vec3<u32>,
           @builtin(num_workgroups) NumWorkgroups: vec3<u32>,
diff --git a/tfjs-backend-webgpu/src/matmul_packed_webgpu.ts b/tfjs-backend-webgpu/src/matmul_packed_webgpu.ts
@@ -69,7 +69,7 @@ export function makeMatMulPackedSource(
     let ColPerThread = ${workPerThread[0]};
     let TileInner = ${tileInner};
 
-    @stage(compute) @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ)
+    @compute @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ)
     fn main(@builtin(local_invocation_id) LocalId : vec3<u32>,
             @builtin(global_invocation_id) GlobalId : vec3<u32>,
             @builtin(num_workgroups) NumWorkgroups: vec3<u32>,
diff --git a/tfjs-backend-webgpu/src/matmul_small_output_size_webgpu.ts b/tfjs-backend-webgpu/src/matmul_small_output_size_webgpu.ts
diff --git a/tfjs-backend-webgpu/src/webgpu_program.ts b/tfjs-backend-webgpu/src/webgpu_program.ts
diff --git a/tools/tfjs_bundle.bzl b/tools/tfjs_bundle.bzl
diff --git a/yarn.lock b/yarn.lock