Skip to content

Commit 11f6a3a

Browse files
author
Yang Gu
authored
Merge branch 'master' into frompixels2
2 parents 960364d + 2d5755c commit 11f6a3a

13 files changed

+123
-176
lines changed

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
"terser": "^5.7.0",
6363
"ts-morph": "^11.0.3",
6464
"ts-node": "~8.8.2",
65+
"tslib": "^2.4.0",
6566
"tslint": "^6.1.3",
6667
"tslint-no-circular-imports": "~0.7.0",
6768
"typescript": "3.5.3"

tfjs-backend-webgpu/src/depthwise_conv2d_nchw_shared_webgpu.ts

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@ export class DepthwiseConv2DNCHWSharedProgram implements WebGPUProgram {
2727
dispatchLayout: {x: number[], y: number[], z: number[]};
2828
dispatch: [number, number, number];
2929
variableNames = ['x', 'W'];
30-
uniforms = `pad : vec2<i32>, stride : vec2<i32>, dilation : vec2<i32>,
31-
inDims : vec2<i32>, filterHeight : i32, filterWidth : i32,
32-
channelMul : i32,`;
30+
uniforms = `pad : vec2<i32>, inDims : vec2<i32>,`;
3331
workGroupSize: [number, number, number] = [16, 16, 1];
3432
addBias: boolean;
3533
activation: backend_util.Activation;
@@ -119,10 +117,10 @@ export class DepthwiseConv2DNCHWSharedProgram implements WebGPUProgram {
119117
numWorkgroups = NumWorkgroups;
120118
let coords = getOutputCoords();
121119
let batch = coords[0];
122-
let xRCCorner = vec2<i32>(coords.zw) * uniforms.stride - uniforms.pad;
123-
let d2 = coords[1];
124-
let d1 = d2 / uniforms.channelMul;
125-
let q = d2 - d1 * uniforms.channelMul;
120+
let xRCCorner = vec2<i32>(coords.zw) - uniforms.pad;
121+
let channelMul = uniforms.wShape[3];
122+
let d1 = coords[1] / channelMul;
123+
let q = coords[1] % channelMul;
126124
127125
let inputRowStart = xRCCorner.x;
128126
let inputColStart = xRCCorner.y;
@@ -157,8 +155,8 @@ export class DepthwiseConv2DNCHWSharedProgram implements WebGPUProgram {
157155
workgroupBarrier();
158156
159157
var dotProd = 0.0;
160-
for (var wR = 0; wR < uniforms.filterHeight; wR = wR + 1) {
161-
for (var wC = 0; wC < uniforms.filterWidth; wC = wC + 1) {
158+
for (var wR = 0; wR < ${this.filterHeight}; wR = wR + 1) {
159+
for (var wC = 0; wC < ${this.filterWidth}; wC = wC + 1) {
162160
let xVal = mm_Asub[localRow + wR][localCol + wC];
163161
let wVal = mm_Bsub[wR][wC];
164162
dotProd = fma(xVal, wVal, dotProd);

tfjs-backend-webgpu/src/depthwise_conv2d_3x3_webgpu.ts renamed to tfjs-backend-webgpu/src/depthwise_conv2d_vec4_webgpu.ts

Lines changed: 35 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,13 @@ import {mapActivationToShaderProgram} from './activation_util';
2020
import {getWorkGroupSizeString, WebGPUProgram} from './webgpu_program';
2121
import {computeDispatch} from './webgpu_util';
2222

23-
export class DepthwiseConv2D3x3Program implements WebGPUProgram {
23+
export class DepthwiseConv2DVec4Program implements WebGPUProgram {
2424
outputShape: number[];
2525
shaderKey: string;
2626
dispatchLayout: {x: number[], y: number[], z: number[]};
2727
dispatch: [number, number, number];
2828
variableNames = ['x', 'W'];
29-
uniforms =
30-
'pad : vec2<i32>, stride : vec2<i32>, dilation : vec2<i32>, inDims : vec2<i32>,';
29+
uniforms = 'pad : vec2<i32>, inDims : vec2<i32>,';
3130
workGroupSize: [number, number, number] = [4, 4, 4];
3231
convInfo: backend_util.Conv2DInfo;
3332
addBias: boolean;
@@ -39,9 +38,9 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
3938
convInfo: backend_util.Conv2DInfo, addBias = false,
4039
activation: backend_util.Activation = null, hasPreluActivation = false) {
4140
this.outputShape = convInfo.outShape;
42-
this.dispatchLayout = {x: [0, 1], y: [2], z: [3]};
41+
this.dispatchLayout = {x: [3], y: [2], z: [0, 1]};
4342
this.dispatch = computeDispatch(
44-
this.dispatchLayout, this.outputShape, this.workGroupSize, [1, 4, 4]);
43+
this.dispatchLayout, this.outputShape, this.workGroupSize, [4, 4, 1]);
4544

4645
util.assert(
4746
convInfo.dataFormat === 'channelsLast',
@@ -59,7 +58,8 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
5958
this.activation = activation;
6059
this.hasPreluActivation = hasPreluActivation;
6160

62-
this.shaderKey = `depthwise3x3_${activation}`;
61+
this.shaderKey = `depthwiseVec4_${activation}_${
62+
this.convInfo.filterHeight}_${this.convInfo.filterWidth}`;
6363
}
6464

6565
getUserCode(): string {
@@ -87,65 +87,53 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
8787
const addBiasSnippet = this.addBias ?
8888
'dotProd[i] = dotProd[i] + getBiasByOutputCoords(coords);' :
8989
'';
90-
90+
// Here 4 is the work per thread in X dimension.
91+
const xNumber = 4 + this.convInfo.filterWidth - 1;
9192
const userCode = `
9293
${activationSnippet}
93-
94+
fn readX(batch : i32, row : i32, col : i32, channel : i32) -> vec4<f32> {
95+
var value = vec4<f32>(0.0);
96+
if (row >=0 && row < uniforms.inDims[0] && col >=0 && col < uniforms.inDims[1])
97+
{
98+
value = getX(batch, row, col, channel);
99+
}
100+
return value;
101+
}
94102
${getWorkGroupSizeString()}
95103
fn main(@builtin(global_invocation_id) globalId: vec3<u32>) {
96-
let batch = 0;
97-
let r = i32(globalId.x);
104+
let batch = i32(globalId.z) / uniforms.outShape[1];
105+
let r = i32(globalId.z) % uniforms.outShape[1];
98106
let c = i32(globalId.y) * 4;
99-
let d2 = i32(globalId.z) * 4;
100-
let xRCCorner = vec2<i32>(r, c) * uniforms.stride - uniforms.pad;
101-
let d1 = d2;
102-
let q = 0;
107+
let d1 = i32(globalId.x) * 4;
108+
let xRCCorner = vec2<i32>(r, c) - uniforms.pad;
103109
104110
let xRCorner = xRCCorner.x;
105111
let xCCorner = xRCCorner.y;
106-
107-
var wVals : array<vec4<f32>, 9>;
108-
wVals[0] = getW(0, 0, d1, q);
109-
wVals[1] = getW(0, 1, d1, q);
110-
wVals[2] = getW(0, 2, d1, q);
111-
wVals[3] = getW(1, 0, d1, q);
112-
wVals[4] = getW(1, 1, d1, q);
113-
wVals[5] = getW(1, 2, d1, q);
114-
wVals[6] = getW(2, 0, d1, q);
115-
wVals[7] = getW(2, 1, d1, q);
116-
wVals[8] = getW(2, 2, d1, q);
117-
118-
var xVals : array<array<vec4<f32>, 6>, 3>;
119-
for (var wR = 0; wR < 3; wR = wR + 1) {
120-
let xR = xRCorner + wR * uniforms.dilation[0];
121-
for (var wC = 0; wC < 6; wC = wC + 1) {
122-
let xC = xCCorner + wC * uniforms.dilation[1];
123-
if (xR < 0 || xR >= uniforms.inDims[0] || xC < 0 || xC >= uniforms.inDims[1]) {
124-
xVals[wR][wC] = vec4<f32>(0.0);
125-
} else {
126-
xVals[wR][wC] = getX(batch, xR, xC, d1);
127-
}
128-
}
129-
}
130-
112+
var xVals : array<vec4<f32>, ${xNumber}>;
131113
var dotProd : array<vec4<f32>, 4>;
132114
dotProd[0] = vec4<f32>(0.0);
133115
dotProd[1] = vec4<f32>(0.0);
134116
dotProd[2] = vec4<f32>(0.0);
135117
dotProd[3] = vec4<f32>(0.0);
136118
137-
for (var wR = 0; wR < 3; wR = wR + 1) {
138-
for (var wC = 0; wC < 3; wC = wC + 1) {
139-
let indexW = wR * 3 + wC;
140-
dotProd[0] = dotProd[0] + xVals[wR][0 + wC] * wVals[indexW];
141-
dotProd[1] = dotProd[1] + xVals[wR][1 + wC] * wVals[indexW];
142-
dotProd[2] = dotProd[2] + xVals[wR][2 + wC] * wVals[indexW];
143-
dotProd[3] = dotProd[3] + xVals[wR][3 + wC] * wVals[indexW];
119+
// Use constant instead of uniform can give better performance.
120+
for (var wR = 0; wR < ${this.convInfo.filterHeight}; wR = wR + 1) {
121+
let xR = xRCorner + wR;
122+
for (var i = 0; i < ${xNumber}; i++)
123+
{
124+
xVals[i] = readX(batch, xR, xCCorner + i, d1);
125+
}
126+
for (var wC = 0; wC < ${this.convInfo.filterWidth}; wC = wC + 1) {
127+
let wValue = getW(wR, wC, d1, 0);
128+
dotProd[0] = dotProd[0] + xVals[0 + wC] * wValue;
129+
dotProd[1] = dotProd[1] + xVals[1 + wC] * wValue;
130+
dotProd[2] = dotProd[2] + xVals[2 + wC] * wValue;
131+
dotProd[3] = dotProd[3] + xVals[3 + wC] * wValue;
144132
}
145133
}
146134
147135
for (var i = 0; i < 4; i = i + 1) {
148-
let coords = vec4<i32>(batch, r, c + i, d2);
136+
let coords = vec4<i32>(batch, r, c + i, d1);
149137
if (coordsInBounds4D(coords, uniforms.outShape)) {
150138
${addBiasSnippet}
151139
${applyActivationSnippet}

tfjs-backend-webgpu/src/depthwise_conv2d_webgpu.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,8 @@ export class DepthwiseConv2DProgram implements WebGPUProgram {
2727
dispatchLayout: {x: number[], y?: number[], z?: number[]};
2828
dispatch: [number, number, number];
2929
variableNames = ['x', 'W'];
30-
uniforms = `pad : vec2<i32>, stride : vec2<i32>, dilation : vec2<i32>,
31-
inDims : vec2<i32>, filterHeight : i32, filterWidth : i32,
32-
channelMul : i32,`;
30+
uniforms = `pad : vec2<i32>, inDims : vec2<i32>, filterHeight : i32,
31+
filterWidth : i32, stride : vec2<i32>, dilation : vec2<i32>,`;
3332
// This is an experimental value.
3433
workGroupSize: [number, number, number] = [256, 1, 1];
3534
convInfo: backend_util.Conv2DInfo;
@@ -98,8 +97,9 @@ export class DepthwiseConv2DProgram implements WebGPUProgram {
9897
let xRCCorner = vec2<i32>(coords.${
9998
this.isChannelsLast ? 'yz' : 'zw'}) * uniforms.stride - uniforms.pad;
10099
let d2 = coords[${this.isChannelsLast ? 3 : 1}];
101-
let d1 = d2 / uniforms.channelMul;
102-
let q = d2 - d1 * uniforms.channelMul;
100+
let channelMul = uniforms.wShape[3];
101+
let d1 = d2 / channelMul;
102+
let q = d2 % channelMul;
103103
104104
let inputRowStart = xRCCorner.x;
105105
let inputColStart = xRCCorner.y;

tfjs-backend-webgpu/src/kernels/BatchMatMul_impl.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ export function batchMatMulImpl({
9494
(outerShapeA % 4 === 0 && transposeA)) &&
9595
outerShapeB % 4 === 0 && !transposeB;
9696
let program: WebGPUProgram;
97-
if (outerShapeA * outerShapeB <= 32) {
97+
if (outerShapeA * outerShapeB <= 128) {
9898
program = new MatMulReduceProgram(
9999
[batchDim, outerShapeA, outerShapeB], batchAEqualOne, batchBEqualOne,
100100
transposeA, transposeB, bias, activation, preluActivationWeights);

tfjs-backend-webgpu/src/kernels/DepthwiseConv2dNative.ts

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
import {backend_util, DepthwiseConv2dNative, DepthwiseConv2dNativeAttrs, DepthwiseConv2dNativeInputs, KernelConfig, KernelFunc} from '@tensorflow/tfjs-core';
1919

2020
import {WebGPUBackend} from '../backend_webgpu';
21-
import {DepthwiseConv2D3x3Program} from '../depthwise_conv2d_3x3_webgpu';
2221
import {DepthwiseConv2DNCHWSharedProgram} from '../depthwise_conv2d_nchw_shared_webgpu';
22+
import {DepthwiseConv2DVec4Program} from '../depthwise_conv2d_vec4_webgpu';
2323
import {DepthwiseConv2DProgram} from '../depthwise_conv2d_webgpu';
2424

2525
export function depthwiseConv2dNative(args: {
@@ -42,43 +42,34 @@ export function depthwiseConv2dNative(args: {
4242
pad, dimRoundingMode, true /* depthwise */, $dataFormat);
4343
const dimensions = [
4444
{type: 'int32', data: [convInfo.padInfo.top, convInfo.padInfo.left]},
45-
{type: 'int32', data: [convInfo.strideHeight, convInfo.strideWidth]},
46-
{type: 'int32', data: [convInfo.dilationHeight, convInfo.dilationWidth]},
47-
{type: 'int32', data: [convInfo.inHeight, convInfo.inWidth]}
45+
{type: 'int32', data: [convInfo.inHeight, convInfo.inWidth]},
4846
];
4947

5048
const isChannelsLast = convInfo.dataFormat === 'channelsLast';
51-
let program: DepthwiseConv2DProgram|DepthwiseConv2D3x3Program|
49+
let program: DepthwiseConv2DProgram|DepthwiseConv2DVec4Program|
5250
DepthwiseConv2DNCHWSharedProgram;
5351
if (!isChannelsLast && convInfo.inHeight > 16 && convInfo.inWidth > 16 &&
5452
convInfo.strideHeight === 1 && convInfo.strideWidth === 1 &&
5553
convInfo.dilationWidth === 1 && convInfo.dilationHeight === 1 &&
5654
convInfo.inChannels === convInfo.outChannels) {
57-
dimensions.push(
58-
{type: 'int32', data: [convInfo.filterHeight]},
59-
{type: 'int32', data: [convInfo.filterWidth]},
60-
{type: 'int32', data: [convInfo.outChannels / convInfo.inChannels]});
6155
program = new DepthwiseConv2DNCHWSharedProgram(
6256
convInfo.outShape, convInfo.filterHeight, convInfo.filterWidth);
63-
}
64-
// TODO: To see if we need to relax the limitation. Currently, it's only
65-
// for filter size 3x3.
66-
else if (
67-
isChannelsLast && convInfo.batchSize === 1 &&
68-
convInfo.inHeight === convInfo.outHeight &&
69-
convInfo.inWidth === convInfo.outWidth && convInfo.strideHeight === 1 &&
70-
convInfo.strideWidth === 1 &&
71-
convInfo.filterHeight === convInfo.filterWidth &&
57+
} else if (
58+
isChannelsLast && convInfo.inHeight > 4 && convInfo.inWidth > 4 &&
59+
convInfo.strideHeight === 1 && convInfo.strideWidth === 1 &&
7260
convInfo.inChannels === convInfo.outChannels &&
7361
convInfo.dilationHeight === 1 && convInfo.dilationWidth === 1 &&
74-
convInfo.filterHeight === 3 && convInfo.inChannels % 4 === 0) {
75-
program = new DepthwiseConv2D3x3Program(convInfo);
62+
convInfo.inChannels % 4 === 0) {
63+
program = new DepthwiseConv2DVec4Program(convInfo);
7664
} else {
7765
program = new DepthwiseConv2DProgram(convInfo);
7866
dimensions.push(
7967
{type: 'int32', data: [convInfo.filterHeight]},
8068
{type: 'int32', data: [convInfo.filterWidth]},
81-
{type: 'int32', data: [convInfo.outChannels / convInfo.inChannels]});
69+
{type: 'int32', data: [convInfo.strideHeight, convInfo.strideWidth]}, {
70+
type: 'int32',
71+
data: [convInfo.dilationHeight, convInfo.dilationWidth]
72+
});
8273
}
8374

8475
return backend.runWebGPUProgram(program, [x, filter], x.dtype, dimensions);

tfjs-backend-webgpu/src/kernels/FusedDepthwiseConv2D.ts

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import {backend_util, FusedDepthwiseConv2D, FusedDepthwiseConv2DAttrs, FusedDepthwiseConv2DInputs, KernelConfig, KernelFunc, TensorInfo, util} from '@tensorflow/tfjs-core';
1919

2020
import {WebGPUBackend} from '../backend_webgpu';
21-
import {DepthwiseConv2D3x3Program} from '../depthwise_conv2d_3x3_webgpu';
21+
import {DepthwiseConv2DVec4Program} from '../depthwise_conv2d_vec4_webgpu';
2222
import {DepthwiseConv2DProgram} from '../depthwise_conv2d_webgpu';
2323

2424
export function fusedDepthwiseConv2D(args: {
@@ -60,30 +60,27 @@ export function fusedDepthwiseConv2D(args: {
6060

6161
const dimensions = [
6262
{type: 'int32', data: [convInfo.padInfo.top, convInfo.padInfo.left]},
63-
{type: 'int32', data: [convInfo.strideHeight, convInfo.strideWidth]},
64-
{type: 'int32', data: [convInfo.dilationHeight, convInfo.dilationWidth]},
65-
{type: 'int32', data: [convInfo.inHeight, convInfo.inWidth]}
63+
{type: 'int32', data: [convInfo.inHeight, convInfo.inWidth]},
6664
];
6765

68-
let program: DepthwiseConv2DProgram|DepthwiseConv2D3x3Program;
69-
// TODO: To see if we need to relax the limitation. Currently, it's only for
70-
// filter size 3x3.
71-
if (convInfo.batchSize === 1 && convInfo.inHeight === convInfo.outHeight &&
72-
convInfo.inWidth === convInfo.outWidth && convInfo.strideHeight === 1 &&
73-
convInfo.strideWidth === 1 &&
74-
convInfo.filterHeight === convInfo.filterWidth &&
66+
let program: DepthwiseConv2DProgram|DepthwiseConv2DVec4Program;
67+
if (convInfo.inHeight > 4 && convInfo.inWidth > 4 &&
68+
convInfo.strideHeight === 1 && convInfo.strideWidth === 1 &&
7569
convInfo.inChannels === convInfo.outChannels &&
7670
convInfo.dilationHeight === 1 && convInfo.dilationWidth === 1 &&
77-
convInfo.filterHeight === 3 && convInfo.inChannels % 4 === 0) {
78-
program = new DepthwiseConv2D3x3Program(
71+
convInfo.inChannels % 4 === 0) {
72+
program = new DepthwiseConv2DVec4Program(
7973
convInfo, hasBias, activation, hasPreluActivationWeights);
8074
} else {
8175
program = new DepthwiseConv2DProgram(
8276
convInfo, hasBias, activation, hasPreluActivationWeights);
8377
dimensions.push(
8478
{type: 'int32', data: [convInfo.filterHeight]},
8579
{type: 'int32', data: [convInfo.filterWidth]},
86-
{type: 'int32', data: [convInfo.outChannels / convInfo.inChannels]});
80+
{type: 'int32', data: [convInfo.strideHeight, convInfo.strideWidth]}, {
81+
type: 'int32',
82+
data: [convInfo.dilationHeight, convInfo.dilationWidth]
83+
});
8784
}
8885
if (activation === 'leakyrelu') {
8986
dimensions.push({type: 'float32', data: [leakyreluAlpha]});

tfjs-backend-webgpu/src/matmul_packed_vec4_webgpu.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ export function makeMatMulPackedVec4Source(
100100
let InnerElementSize = ${innerElementSize};
101101
let TileInner = ${tileInner};
102102
103-
@stage(compute) @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ)
103+
@compute @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ)
104104
fn main(@builtin(local_invocation_id) LocalId : vec3<u32>,
105105
@builtin(global_invocation_id) GlobalId : vec3<u32>,
106106
@builtin(num_workgroups) NumWorkgroups: vec3<u32>,

tfjs-backend-webgpu/src/matmul_packed_webgpu.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ export function makeMatMulPackedSource(
6969
let ColPerThread = ${workPerThread[0]};
7070
let TileInner = ${tileInner};
7171
72-
@stage(compute) @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ)
72+
@compute @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ)
7373
fn main(@builtin(local_invocation_id) LocalId : vec3<u32>,
7474
@builtin(global_invocation_id) GlobalId : vec3<u32>,
7575
@builtin(num_workgroups) NumWorkgroups: vec3<u32>,

0 commit comments

Comments
 (0)