diff --git a/src/optimizers/lbfgs_function_optimizer.ts b/src/optimizers/lbfgs_function_optimizer.ts
new file mode 100644
index 0000000000..a432287f7c
--- /dev/null
+++ b/src/optimizers/lbfgs_function_optimizer.ts
@@ -0,0 +1,484 @@
+/**
+ * @license
+ * Copyright 2018 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+import {mul, sub} from '../ops/binary_ops';
+import {Scalar, Tensor1D, Tensor} from '../tensor';
+import {scalar} from '../ops/ops';
+import {dot} from '../ops/matmul';
+import {ENV} from '../environment';
+
+/*
+function castScalar( t: Tensor ) {
+  if( t.rank !== 0 ) { throw new Error('Assertion failed.'); }
+  return t as Scalar;
+}
+
+function castTensor1D( t: Tensor ) {
+  if( t.rank !== 1 ) { throw new Error('Assertion failed.'); }
+  return t as Tensor1D;
+}
+*/
+
+function val( t: Tensor ) {
+  if( t.rank !== 0 ) { throw new Error('Assertion failed.'); }
+  return t.dataSync()[0];
+}
+
+function dotProd( x: Tensor1D, y: Tensor1D ) {
+  const Z = dot(x,y),
+        z = val(Z);
+  Z.dispose();
+  return z;
+}
+
+export class LineSearchError extends Error {
+  constructor( msg: string ) {
+    super(msg);
+    Object.setPrototypeOf(this, LineSearchError.prototype);
+  }
+}
+
+export class LineSearchNoProgressError extends LineSearchError {
+  constructor( msg: string ) {
+    super(msg);
+    Object.setPrototypeOf(this, LineSearchNoProgressError.prototype);
+  }
+}
+
+/** The function type of a line search method.
+ *
+ * @param fg A function that returns both the value and gradients of the
+ *           optimized loss function.
+ * @param x The starting point (function input) of the line search.
+ * @param f The loss at the starting point of the line search.
+ * @param g The loss gradients at the starting point of the line search.
+ * @param negDir The negative of the line search direction. The length of
+ *               `negDir` determines the first point the line search is
+ *               going to examine. In other words, the first point that is
+ *               examined is `x-negDir`.
+ * @returns [x,f,g] The new approximation of the minimum, its loss value
+ *                  and gradients.
+ */
+export type LineSearchMethod = (
+  fg: (x: Tensor1D) => [Scalar,Tensor1D],
+  x: Tensor1D,
+  f: Scalar,
+  g: Tensor1D,
+  negDir: Tensor1D
+) => [Tensor1D,Scalar,Tensor1D];
+
+/** Creates a new strong Wolfe line search function for
+ * the given parameters.
+ *
+ * Implementation based on:
+ * "Numerical Optimization" 2nd Edition,
+ * Jorge Nocedal and Stephen J. Wright,
+ * Chapter 3. Line Search Methods, page 60.
+ *
+ * @param c1 [optional] 1st strong Wolfe condition parameter:
+ *           Sufficient decrease of the objective.
+ * @param c2 [optional] 2nd strong Wolfe condition parameter:
+ *           Sufficient decrease of the gradient projection.
+ * @param c3 [optional] Exponential growth constant for the
+ *           first phase of the line search (bracketing phase).
+ * @returns A line search function that searches for a point satisfying
+ *          the strong Wolfe condition.
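+ *
+ * A minimal usage sketch (illustrative only; `fg`, `x0`, `f0`, `g0` and
+ * `negDir` are assumed to be supplied by the caller, e.g. by an optimizer):
+ *
+ *     const lineSearch = strongWolfeLineSearch();
+ *     // the first trial point examined is `x0.sub(negDir)`
+ *     const [x1, f1, g1] = lineSearch(fg, x0, f0, g0, negDir);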
+ */
+export function strongWolfeLineSearch(
+  c1=0.4, c2=0.8, c3=1.6
+): LineSearchMethod
+{
+  // CHECK 0 < c1 < c2 < 1 < c3
+  if( c1 <= 0 ) {
+    throw new Error('strongWolfeLineSearch(): c1 must be positive.');
+  }
+  if( c1 >= c2 ) {
+    throw new Error('strongWolfeLineSearch(): c1 must be less than c2.');
+  }
+  if( 1 <= c2 ) {
+    throw new Error('strongWolfeLineSearch(): c2 must be less than 1.');
+  }
+  if( 1 >= c3 ) {
+    throw new Error('strongWolfeLineSearch(): c3 must be larger than 1.');
+  }
+
+  return (fg, X0,F0,G0, negDir) => {
+
+    const projGrad = ( g: Tensor1D ) => -dotProd(g,negDir);
+
+    const f0 = val(F0),
+          p0 = projGrad(G0); // <- projected gradient
+
+    if( p0 >= 0 ) {
+      throw new Error(
+        'strongWolfeLineSearch(): Projected gradient not negative.'
+      );
+    }
+
+    let αMin = 0, α = 1, αMax = Infinity,
+        fMin = f0;
+
+    // STEP 1: BRACKETING PHASE
+    // Find a range guaranteed to contain an α satisfying strong Wolfe.
+    bracketing: while(true)
+    {
+      const X = ENV.engine.tidy(
+              () => X0.sub( scalar(α).mul(negDir) ) as Tensor1D
+            ),
+            [F,G] = fg(X),
+            f = val(F),
+            p = projGrad(G);
+
+      if( f - f0 > c1*α*p0 || (0 < αMin && f >= fMin) )
+      {
+        αMax = α;
+        break bracketing;
+      }
+
+      if( Math.abs(p) <= -c2*p0 ) {
+        return [X,F,G];
+      }
+      X.dispose();
+      F.dispose();
+      G.dispose();
+
+      if( p >= 0 )
+      {
+        αMax = αMin;
+        αMin = α;
+        fMin = f;
+        break bracketing;
+      }
+
+      if( ! (α < αMax) ) {
+        throw new LineSearchError(
+          'strongWolfeLineSearch(): ' +
+          'Strong Wolfe condition not satisfiable in range.'
+        );
+      }
+
+      αMin = α; α = Math.fround(α*c3);
+      fMin = f;
+    }
+
+    if( αMin === αMax ) {
+      throw new LineSearchError('strongWolfeLineSearch(): bracketing failed.');
+    }
+
+    // STEP 2: BISECTION PHASE
+    // Given a range that is guaranteed to contain a valid
+    // strong Wolfe α value, this phase finds such a value.
+    while(true)
+    {
+      α = Math.fround( (αMin + αMax) / 2 );
+
+      const X = ENV.engine.tidy(
+              () => X0.sub( scalar(α).mul(negDir) ) as Tensor1D
+            ),
+            [F,G] = fg(X),
+            f = val(F),
+            p = projGrad(G);
+
+      if( f - f0 > c1*α*p0 || f >= fMin ) {
+        if( αMax === α ) {
+          throw new LineSearchError(
+            'strongWolfeLineSearch(): bisection failed.'
+          );
+        }
+        αMax = α;
+      }
+      else {
+        if( Math.abs(p) <= -c2*p0 ) {
+          return [X,F,G];
+        }
+        X.dispose();
+        F.dispose();
+        G.dispose();
+
+        if( p * (αMax - αMin) >= 0 ) {
+          αMax = αMin;
+        }
+
+        if( αMin === α ) {
+          throw new LineSearchError(
+            'strongWolfeLineSearch(): bisection failed.'
+          );
+        }
+        αMin = α;
+        fMin = f;
+      }
+
+      if( αMin === αMax ) {
+        const msg = 'strongWolfeLineSearch(): bisection failed.';
+        if( αMin === 0 ) {
+          throw new LineSearchNoProgressError(msg);
+        }
+        throw new LineSearchError(msg);
+      }
+    }
+  };
+}
+
+/** Limited-memory BFGS optimizer.
+ *
+ * At every point in time, the current approximation of the solution,
+ * its loss and gradient are stored as properties `x`, `f` and `g`.
+ * Those three tensors are disposed and replaced by the new approximation
+ * whenever `step()` is called.
+ */
+export class LBFGSFunctionOptimizer {
+  /** A function that returns the loss and its gradients
+   * for a given input.
+   *
+   * @param x The function input.
+   *
+   * @returns [loss,grad] The loss at point `x` and its gradients.
+   */
+  fg: (x: Tensor1D) => [Scalar,Tensor1D];
+
+  /** The current approximation of the minimum solution.
+   */
+  x: Tensor1D;
+  /** The loss value at the current approximation of the minimum solution.
+   */
+  f: Scalar;
+  /** The loss gradients at the current approximation of the minimum solution.
+   */
+  g: Tensor1D;
+
+  /** The number of past approximations/iterations that is memoized in order
+   * to approximate the (inverse) Hessian.
+   */
+  historySize: number;
+
+  /** A function that returns the negative "initial" search direction. In
+   * other words, this function returns the matrix-vector product `H₀•g`,
+   * where `H₀` is the initial inverse Hessian and `g` is the current
+   * (gradient) vector. The length of the returned vector strongly
+   * influences how far the `LineSearchMethod` looks.
+   *
+   * @param g The gradient vector for which the (initial) search direction
+   *          is to be determined.
+   * @returns `H₀•g`, where `H₀` is the initial inverse Hessian and `g` is
+   *          the current (gradient) vector.
+   */
+  initNegDir: (g: Tensor1D) => Tensor1D;
+
+  /** The line search method used. Must satisfy the Wolfe condition.
+   *
+   * @param fg A function that returns both the value and gradients of the
+   *           optimized loss function.
+   * @param x The starting point (function input) of the line search.
+   * @param f The loss at the starting point of the line search.
+   * @param g The loss gradients at the starting point of the line search.
+   * @param negDir The negative of the line search direction. The length of
+   *               `negDir` determines the first point the line search is
+   *               going to examine. In other words, the first point that is
+   *               examined is `x-negDir`.
+   * @returns [x,f,g] The new approximation of the minimum, its loss value
+   *                  and gradients.
+   */
+  lineSearch: LineSearchMethod;
+
+  /** The change in the function input x during the past `historySize`
+   * iterations.
+   */
+  dX: Tensor1D[] = [];
+  /** The dot product of `dX[i]` and `dG[i]`.
+   */
+  dGdX: Scalar[] = [];
+  /** The change in the loss gradient during the past `historySize`
+   * iterations.
+   */
+  dG  : Tensor1D[] = []; // <- change in f'(x)
+
+  /** Creates a new L-BFGS optimizer instance that minimizes the given loss
+   * function.
+   *
+   * @param fg A function that returns the loss and its gradient for a given
+   *           input.
+   * @param x0 The starting point of the L-BFGS iteration.
+   * @param params An optional set of parameters for the optimizer:
+   *   - `historySize`: The number of past iterations that is memoized to
+   *     approximate the (inverse) Hessian.
+   *   - `initNegDir`: A function that returns the negative "initial" search
+   *     direction, i.e. the matrix-vector product `H₀•g`, where `H₀` is the
+   *     initial inverse Hessian and `g` is the current (gradient) vector.
+   *     The length of the returned vector strongly influences how far the
+   *     `LineSearchMethod` looks.
+   *   - `lineSearch`: The line search method to be used. Must satisfy the
+   *     Wolfe condition.
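+   *
+   * A minimal construction sketch (illustrative only; `loss` is an assumed
+   * scalar-valued function of a `Tensor1D`, `x0` an assumed starting point,
+   * and `valueAndGrad` is the gradient helper from '../gradients'):
+   *
+   *     const vg = valueAndGrad(loss);
+   *     const fg = (x: Tensor1D): [Scalar, Tensor1D] => {
+   *       const {value, grad} = vg(x);
+   *       return [value, grad];
+   *     };
+   *     const opt = new LBFGSFunctionOptimizer(fg, x0);
+   *     for( let i=0; i < 100; i++ ) { opt.step(); }
+   *     opt.x.print(); // current best approximation of the minimum
+   *     opt.dispose();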
+   */
+  constructor(
+    fg: (x: Tensor1D) => [Scalar,Tensor1D],
+    x0: Tensor1D,
+    params?: {
+      historySize?: number, // <- max. no. of past gradients memoized
+      initNegDir?: (g: Tensor1D) => Tensor1D,
+      lineSearch?: LineSearchMethod
+    }
+  )
+  {
+    if( null == fg ) {
+      throw new Error('new LBFGSFunctionOptimizer: fg undefined.');
+    }
+    if( null == x0 ) {
+      throw new Error('new LBFGSFunctionOptimizer: x0 undefined.');
+    }
+    if( null == params ) { params = {}; }
+    this.fg = fg;
+    this.x = x0.clone();
+    [this.f,this.g] = fg(x0);
+    this.historySize = 'historySize' in params
+                     ?  params.historySize : 8;
+    this.initNegDir  = 'initNegDir' in params
+                     ?  params.initNegDir : g => g;
+    this.lineSearch  = 'lineSearch' in params
+                     ?  params.lineSearch : strongWolfeLineSearch();
+    if( this.historySize < 1 ) {
+      throw new Error(
+        'new LBFGSFunctionOptimizer: historySize must be positive.'
+      );
+    }
+  }
+
+  /** Computes the product `H•g`, where `H` is the current approximation
+   * of the (estimated) inverse Hessian of the function and `g` is the
+   * given (gradient) vector.
+   *
+   * @param g The gradient vector.
+   * @returns `H•g` where `H` is the current approximation of the (estimated)
+   *          inverse Hessian of the function and `g` is the given (gradient)
+   *          vector.
+   */
+  negDir( g: Tensor1D ): Tensor1D
+  {
+    const dX   = this.  dX,
+          dGdX = this.dGdX,
+          dG   = this.dG;
+    // SEE:
+    // Jorge Nocedal "Updating Quasi-Newton Matrices with Limited Storage"
+    // MATHEMATICS OF COMPUTATION, VOL. 35, NO. 151, JULY 1980, PAGES 773-782
+    // https://courses.engr.illinois.edu/ece544na/fa2014/nocedal80.pdf
+    const α: Scalar[] = [];
+    g = g.clone();
+
+    for( let i=dGdX.length; i-- > 0; )
+    {
+      const [αi,G] = ENV.engine.tidy( () => {
+        const αi = dot(dX[i],g).div(dGdX[i]);
+        return [ αi, g.sub( mul(αi,dG[i]) ) ];
+      });
+      g.dispose();
+      g = G as Tensor1D;
+      α.push( αi as Scalar );
+    }
+
+    const G = this.initNegDir(g);
+    if( ! Object.is(G,g) ) { g.dispose(); }
+    g = G;
+
+    for( let i=0; i < dGdX.length; i++ )
+    {
+      const G = ENV.engine.tidy( () => {
+        const αi = α.pop(),
+              βi = dot(dG[i],g).div(dGdX[i]),
+              G  = g.add( sub(αi,βi).mul(dX[i]) );
+        αi.dispose();
+        return G;
+      });
+      g.dispose();
+      g = G as Tensor1D;
+    }
+
+    return g;
+  }
+
+  /** Performs a single optimization step. In the process, the current
+   * property values `x`, `f` and `g` of this optimizer are disposed
+   * and replaced by the new approximation of the minimum.
+   */
+  step(): void
+  {
+    const dX   = this.
dX, + dGdX = this.dGdX, + dG = this.dG; + + const [x,f,g] = ( () => { + const stepFunc = () => this.lineSearch( + this.fg, + this.x, + this.f, + this.g, + this.negDir(this.g) + ); + try { + return ENV.engine.tidy(stepFunc); + } + catch( err ) { + if( err instanceof LineSearchNoProgressError ) { + // reset line search + while( dX.length > 0 ) { + dX.pop().dispose(); + dGdX.pop().dispose(); + dG .pop().dispose(); + } + // try one last time + return ENV.engine.tidy(stepFunc); + } + else { + throw err; + } + } + })(); + + const dXi = sub(x,this.x) as Tensor1D, + dGi = sub(g,this.g) as Tensor1D; + dG .push( dGi ); + dGdX.push( dot(dGi,dXi) as Scalar ); + dX.push( dXi ); + + this.x.dispose(); + this.f.dispose(); + this.g.dispose(); + + [this.x, + this.f, + this.g] = [x,f,g]; + + if( dX.length !== dG.length ) { throw new Error('Assertion failed!'); } + + // LIMIT THE NUMBER OF MEMOIZED GRADIENTS + // (while loop in case historySize was changed) + while( dX.length > this.historySize ) { + dX.shift().dispose(); + dGdX.shift().dispose(); + dG .shift().dispose(); + } + } + + /** Disposes all resources held by this optimizer. + */ + dispose(): void { + this.x.dispose(); + this.f.dispose(); + this.g.dispose(); + this. dX.forEach( t => t.dispose() ); + this.dGdX.forEach( t => t.dispose() ); + this.dG .forEach( t => t.dispose() ); + } +} diff --git a/src/optimizers/lbfgs_function_optimizer_test.ts b/src/optimizers/lbfgs_function_optimizer_test.ts new file mode 100644 index 0000000000..fd4d094a04 --- /dev/null +++ b/src/optimizers/lbfgs_function_optimizer_test.ts @@ -0,0 +1,284 @@ +/** + * @license + * Copyright 2018 Google Inc. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * =============================================================================
+ */
+
+import {describeWithFlags} from '../jasmine_util';
+import {add, mul, sub, squaredDifference} from '../ops/binary_ops';
+import {cos} from '../ops/unary_ops';
+import {Scalar, Tensor1D, Tensor} from '../tensor';
+import {zeros, ones, scalar, tensor1d} from '../ops/ops';
+import {ALL_ENVS, CPU_ENVS, expectArraysClose, expectArraysEqual} from '../test_util';
+import {TensorLike} from '../types';
+import {convertToTensor} from '../tensor_util_env';
+import {valueAndGrad} from '../gradients';
+import {ENV} from '../environment';
+import {randomUniform} from '../ops/array_ops';
+import {strongWolfeLineSearch,
+        LBFGSFunctionOptimizer,
+        LineSearchNoProgressError} from './lbfgs_function_optimizer';
+import {dot} from '../ops/matmul';
+
+function rosenbrock( x: Tensor|TensorLike ): Tensor
+{
+  return ENV.engine.tidy( () => {
+    const $x = convertToTensor(x,'x','rosenbrock');
+
+    if( $x.rank < 1 ) {
+      throw new Error('rosenbrock(x): x.rank must be at least 1.');
+    }
+    if( $x.shape[$x.rank-1] < 2 ) {
+      throw new Error('rosenbrock(x): x.shape[-1] must be at least 2.');
+    }
+
+    const ONE = scalar(  1),
+          TWO = scalar(  2),
+          B   = scalar(100);
+
+    const size = $x.shape.slice(),
+         start = $x.shape.map( () => 0 );
+    --size[$x.rank-1]; const xi = $x.slice(start.slice(),size);
+    ++start[$x.rank-1]; const xj = $x.slice(start,       size);
+
+    return add(
+      squaredDifference( xj, xi.pow(TWO) ).mul(B),
+      squaredDifference(ONE, xi)
+    ).sum(/*axis=*/-1);
+  });
+}
+
+function rastrigin( x: Tensor|TensorLike ): Tensor
+{
+  return ENV.engine.tidy( () => {
+    const $x = convertToTensor(x,'x','rastrigin');
+
+    if( $x.rank < 1 ) {
+      throw new Error('rastrigin(x): x.rank must be at least 1.');
+    }
+    if( $x.shape[$x.rank-1] < 1 ) {
+      throw new Error('rastrigin(x): x.shape[-1] must be at least 1.');
+    }
+
+    const π2  = scalar(Math.PI*2),
+          ONE = scalar(1),
+          A   = scalar(10);
+
+    return sub(
+      mul($x,$x),
+      cos( mul(π2,$x) ).sub(ONE).mul(A)
+    ).sum(/*axis=*/-1);
+  });
+}
+
+describeWithFlags('rosenbrock', ALL_ENVS, () => {
+
+  for( const l of [2,3] )
+  {
+    const ones = Array.from({ length: l }, () => 1 );
+
+    it(`should have a single minimum at [${ones}]`, () => {
+      const x = Array.from({ length: l }, () => 1 );
+
+      const fg = valueAndGrad(rosenbrock),
+            { value: fMin, grad: gMin } = fg( tensor1d(x) );
+      expectArraysClose( fMin, scalar(0) );
+      expectArraysClose( gMin, zeros([l]) );
+
+      // this tolerance should not really be necessary...
+      const atol = scalar( Math.sqrt(ENV.get('EPSILON')) );
+
+      for( let i=0; i < 128; i++ )
+      {
+        const x = randomUniform([l],-2,+2),
+              f = rosenbrock(x);
+        expectArraysEqual( fMin.sub(atol).lessEqual(f), scalar(true,'bool') );
+      }
+
+      {
+        const x = randomUniform([128,128,l],-2,+2),
+              f = rosenbrock(x);
+        expectArraysEqual(
+          fMin.sub(atol).lessEqual(f).all(),
+          scalar(true,'bool')
+        );
+      }
+    });
+  }
+});
+
+describeWithFlags('rastrigin', ALL_ENVS, () => {
+
+  for( const l of [1,2,3] )
+  {
+    const zeros = Array.from({ length: l }, () => 0 );
+
+    it(`should have a global minimum at [${zeros}]`, () => {
+
+      const fg = valueAndGrad(rastrigin),
+            { value: fMin, grad: gMin } = fg( tensor1d(zeros) );
+      expectArraysClose( fMin, scalar(0) );
+      expectArraysClose( gMin, zeros );
+
+      // this tolerance should not really be necessary...
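+      // (allow sampled losses to undershoot the analytic minimum by at most
+      //  √EPSILON, to absorb floating-point error in the comparisons below)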
+ const atol = scalar( Math.sqrt(ENV.get('EPSILON')) ); + + for( let i=0; i < 128; i++ ) + { + const x = randomUniform([l],-6,+6), + f = rastrigin(x); + expectArraysEqual( fMin.sub(atol).lessEqual(f), scalar(true,'bool') ); + } + + { + const x = randomUniform([128,128,l],-6,+6), + f = rastrigin(x); + try { + expectArraysEqual( + fMin.sub(atol).lessEqual(f).all(), + scalar(true,'bool') + ); + } + catch(err) { + const i = f.flatten().argMin().dataSync()[0]; + console.log('x_min:'); x.reshape([-1,l]).slice([i,0], [1,l]).print(); + console.log('f.min:'); f.min().print(); + console.log('fMin'); fMin.print(); + throw err; + } + } + }); + } + +}); + +function val( t: Tensor ) { + if( t.rank !== 0 ) { throw new Error('Assertion failed.'); } + return t.dataSync()[0]; +} + +describeWithFlags('strongWolfeLineSearch', CPU_ENVS, () => { + + const testWith = ( name: string, func: (x: Tensor) => Tensor ) => { + for( let test=0; test < 128; test++ ) { + for( const l of [2,3,4] ) { + it(`should work on ${l}d ${name} (test ${test})`, () => { + + const c1=0.4, + c2=0.8, + c3=1.6; + + const fg = ( () => { + const fg = valueAndGrad(func); + return (x: Tensor) => { + const { value, grad } = fg(x); + return [value, grad] as [Scalar,Tensor1D]; + }; + })(), + linSearch = strongWolfeLineSearch(c1,c2,c3); + + for( let run=0; run < 2; run++ ) + { + ENV.engine.tidy( () => { + + const X0 = randomUniform([l],-1,+1) as Tensor1D, + [F0,G0] = fg(X0), + dirLen = Math.random()*1.9 + 0.1, + f0 = val(F0); + + let negDir = ( + G0.div( scalar( Math.hypot( ...Array.from(G0.dataSync()) ) ) ) + .mul( scalar(dirLen) ) + ) as Tensor1D, + p0 = - val(dot(G0,negDir)); + + const atol = Math.sqrt(ENV.get('EPSILON')); + if( Math.abs(p0) <= atol ) { return; } + if( p0 > 0 ) { + p0 *= -1; + negDir = negDir.neg(); + } + + const [X,F,G] = linSearch(fg, X0,F0,G0, negDir), + f = val(F), + p = - val(dot(G,negDir)), + α = Math.hypot( + ...Array.from( sub(X,X0).dataSync() ) + ) / dirLen; + + expect( Math.abs(p) ).not.toBeGreaterThan( -c2*p0 ); + expect( f - f0 ).not.toBeGreaterThan( c1*p0*α ); + }); + } + }); + }} + }; + + testWith('rosenbrock', rosenbrock); + testWith('rastrigin' , rastrigin ); +}); + +describeWithFlags('lbfgs', CPU_ENVS, () => { + + for( let test=0; test < 128; test++ ) { + for( const n of [2,3] ) + { + const x0 = Array.from({ length: n }, () => Math.random()*6 - 3 ); + + it(`random_test#${test}`, () => { + const DENOM = scalar(32); + + const fg = (() => { + const fg = valueAndGrad(rosenbrock); + return (x: Tensor) => { + nCalls++; + const { value, grad } = fg(x); + return [value, grad] as [Scalar,Tensor1D]; + }; + })(); + + let nSteps=0, + nCalls=0; + + const opt = new LBFGSFunctionOptimizer( + fg, + tensor1d(x0), + { initNegDir: g => g.div(DENOM) } + ); + + const atol = scalar( Math.sqrt(ENV.get('EPSILON')) ); + opt_loop:while( ! opt.g.abs().lessEqual(atol).all().dataSync()[0] ) + { + ++nSteps; + try { + opt.step(); + } + catch(err) { + if( err instanceof LineSearchNoProgressError ) { + break opt_loop; + } + throw err; + } + } + + expect(nCalls).toBeLessThan(512); + expect(nSteps).toBeLessThan(256); + expectArraysClose(opt.x, ones([n]) ); + expectArraysClose(opt.f, zeros([ ]) ); + expectArraysClose(opt.g, zeros([n]) ); + opt.dispose(); + }); + }} + +});