Métodos quase-Newton possuem iterações da forma $$\def\vect{\boldsymbol}\large \vect x^{( k + 1 )} = \vect x^{( k )} + \lambda_k \vect d^{( k )} $$
onde $$\large \vect d^{( k )} = -D^{( k )}\nabla f( \vect x^{( k )} ). $$
Na iteração acima, as matrizes $D^{( k )}$ servirão como uma aproximação para $\nabla^2 f( \vect x^{( k )} )^{-1}$.
Definamos $$\large \vect q^{( k )} = \nabla f( \vect x^{( k + 1 )} ) - \nabla f( \vect x^{( k )} ) $$ e $$\large \vect p^{( k )} = \vect x^{( k + 1 )} - \vect x^{( k )}. $$
$$\large \vect q^{( k )} \approx \nabla^2 f( \vect x^{( k + 1 )} )\vect p^{( k )} $$
$$\large \bigl[ \vect q^{( 0 )} \mid \cdots \mid \vect q^{( n - 2 )} \mid \vect q^{( n - 1 )} \bigr] \approx \nabla^2 f( \vect x^{( n )} )\bigl[ \vect p^{( 0 )} \mid \cdots \mid \vect p^{( n - 2 )} \mid \vect p^{( n - 1 )} \bigr] $$
$$\large \nabla^2 f( \vect x^{( n )} )^{-1}\bigl[ \vect q^{( 0 )} \mid \cdots \mid \vect q^{( n - 2 )} \mid \vect q^{( n - 1 )} \bigr] \approx \bigl[ \vect p^{( 0 )} \mid \cdots \mid \vect p^{( n - 2 )} \mid \vect p^{( n - 1 )} \bigr] $$
$$\large \nabla^2 f( \vect x^{( n )} )^{-1} \approx \bigl[ \vect p^{( 0 )} \mid \cdots \mid \vect p^{( n - 2 )} \mid \vect p^{( n - 1 )} \bigr]\bigl[ \vect q^{( 0 )} \mid \cdots \mid \vect q^{( n - 2 )} \mid \vect q^{( n - 1 )} \bigr]^{-1} $$
$$\large D^{( k + 1 )} = D^{( k )} + \frac{\vect p^{( k )}(\vect p^{( k )})^T}{(\vect p^{( k )})^T\vect q^{( k )}} - \frac{D^{( k )}\vect q^{( k )}(D^{( k )}\vect q^{( k )})^T}{(\vect q^{( k )})^TD^{( k )}\vect q^{( k )}} + \xi_k \tau_k \vect v^{( k )}(\vect v^{( k )})^T $$
onde $\xi_k \in [ 0, 1 ]$, $$\large \tau_k = (\vect q^{( k )})^TD^{( k )}\vect q^{( k )} $$ $$\large \vect v^{( k )} = \frac{\vect p^{( k )}}{(\vect p^{( k )})^T\vect q^{( k )}} - \frac{D^{( k )}\vect q^{( k )}}{\tau_k}. $$ (Note que os denominadores do termo de posto um usam $(\vect p^{( k )})^T\vect q^{( k )}$ — a condição secante $D^{( k + 1 )}\vect q^{( k )} = \vect p^{( k )}$ exige o termo de curvatura $\vect p^T\vect q$, não $\vect p^T\vect p$.)
No caso de $f$ ser uma quadrática da forma $f( \vect x ) = 1/2\vect x^TQ\vect x - \vect c^T\vect x$ com $Q$ simétrica definida positiva, sabemos que o tamanho de passo que minimiza a busca linear é dado por $$\large \lambda_k = \frac{-\nabla f\bigl( \vect x^{( k )} \bigr)^T\vect d_k}{\vect d_k^TQ\vect d_k}. $$
import numpy as np
def DFP( Q, c, x = None, niter = None, D = None ):
    """Minimize the quadratic f(x) = 1/2 x^T Q x - c^T x with the DFP
    quasi-Newton method, using the exact line-search step for quadratics.

    For SPD Q and exact line searches, DFP terminates in at most n steps
    with D approximating Q^{-1} (in exact arithmetic, exactly).

    Parameters
    ----------
    Q : (n, n) symmetric positive-definite matrix.
    c : (n,) vector.
    x : (n,) starting point (default: origin).  A float copy is taken, so
        the caller's array is never mutated.
    niter : number of iterations (default: n).
    D : (n, n) initial inverse-Hessian approximation (default: identity).

    Returns
    -------
    (x, D) : final iterate and inverse-Hessian approximation.
    """
    n = Q.shape[ 0 ]
    x = np.zeros( n ) if x is None else np.array( x, dtype = float )
    if niter is None:
        niter = n
    D = np.eye( n ) if D is None else np.array( D, dtype = float )
    grad = Q @ x - c
    d = -D @ grad
    for _ in range( niter ):
        # Early exit once (numerically) converged: the exact step below
        # would otherwise be 0/0 at the optimum.
        if np.linalg.norm( grad ) <= 1e-12 * ( 1.0 + np.linalg.norm( c ) ):
            break
        dQd = d @ ( Q @ d )
        if not dQd > 0.0:
            break
        # Exact minimizing step along d for a quadratic objective.
        step = -( grad @ d ) / dQd
        p = step * d
        q = Q @ p            # gradient difference grad(x + p) - grad(x)
        x += p
        Dq = D @ q
        # DFP rank-two update.  FIX: the rank-one denominator is the
        # curvature p.q (secant condition), not p.p, and np.outer is
        # required -- for 1-D arrays `a @ b.T` is a scalar, not a matrix.
        D += np.outer( p, p ) / ( p @ q ) - np.outer( Dq, Dq ) / ( q @ Dq )
        grad = Q @ x - c
        d = -D @ grad
    return ( x, D )
$$\large \vect v^{( k )} = \frac{\vect p^{( k )}}{(\vect p^{( k )})^T\vect q^{( k )}} - \frac{D^{( k )}\vect q^{( k )}}{\tau_k}. $$
# Random SPD test problem: Q = A^T A is symmetric positive definite
# (almost surely, since A is a generic Gaussian matrix).
n = 4
A = np.random.normal( size = ( n, n ) )
c = np.random.normal( size = ( n, ) )
Q = A.T @ A
x_DFP, Q_DFP = DFP( Q, c )
# How close the accumulated D is to Q^{-1}, and the residual of Q x = c.
print( np.linalg.norm( np.linalg.inv( Q ) - Q_DFP ) )
print( Q @ x_DFP - c )
def BFGS( Q, c, x = None, niter = None, D = None ):
    """Minimize the quadratic f(x) = 1/2 x^T Q x - c^T x with the BFGS
    quasi-Newton method, using the exact line-search step for quadratics.

    For SPD Q and exact line searches, BFGS terminates in at most n steps
    with D approximating Q^{-1}.

    Parameters
    ----------
    Q : (n, n) symmetric positive-definite matrix.
    c : (n,) vector.
    x : (n,) starting point (default: origin).  A float copy is taken, so
        the caller's array is never mutated.
    niter : number of iterations (default: n).
    D : (n, n) initial inverse-Hessian approximation (default: identity).

    Returns
    -------
    (x, D) : final iterate and inverse-Hessian approximation.
    """
    n = Q.shape[ 0 ]
    x = np.zeros( n ) if x is None else np.array( x, dtype = float )
    if niter is None:
        niter = n
    D = np.eye( n ) if D is None else np.array( D, dtype = float )
    grad = Q @ x - c
    d = -D @ grad
    for _ in range( niter ):
        # Early exit once (numerically) converged; otherwise the exact
        # step is 0/0 at the optimum (relevant when niter > n).
        if np.linalg.norm( grad ) <= 1e-12 * ( 1.0 + np.linalg.norm( c ) ):
            break
        dQd = d @ ( Q @ d )
        if not dQd > 0.0:
            break
        step = -( grad @ d ) / dQd   # exact minimizer along d
        p = step * d
        q = Q @ p                    # gradient difference
        x += p
        Dq = D @ q
        pq = p @ q                   # FIX: curvature p.q (secant), not p.p
        tau = q @ Dq
        v = p / pq - Dq / tau
        # BFGS = DFP rank-two update + tau * v v^T (Broyden family, xi = 1).
        D += np.outer( p, p ) / pq - np.outer( Dq, Dq ) / tau
        D += tau * np.outer( v, v )
        grad = Q @ x - c
        d = -D @ grad
    return ( x, D )
# Solve the same random quadratic with BFGS, allowing extra iterations.
x_BFGS, Q_BFGS = BFGS( Q, c, niter = 20 )
print( Q @ x_BFGS - c )   # residual of Q x = c
print( Q @ Q_BFGS )       # should be close to the identity matrix
def BFGS_NL(
    f, grad_f,
    x,
    sigma = 1.0e-1,
    beta = 0.5,
    step = 1.0,
    kappa = 1e-2,
    gamma = 1e-2,
    niter = None,
    D = None,
    eps = 1.0e-5
):
    """Minimize a general smooth f with BFGS plus Armijo backtracking.

    Parameters
    ----------
    f, grad_f : objective and its gradient (callables on (n,) arrays).
    x : starting point.  A float copy is taken; never mutated.
    sigma : Armijo sufficient-decrease constant.
    beta : backtracking contraction factor, 0 < beta < 1.
    step : initial trial step for each line search.
    kappa, gamma : safeguards -- when the quasi-Newton direction is nearly
        orthogonal to -grad, or too short relative to it, restart with the
        steepest-descent direction and D = I.
    niter : optional cap on the number of iterations (default: unlimited).
        FIX: the original shadowed this parameter with the local iteration
        counter, so the cap was silently ignored.
    D : initial inverse-Hessian approximation (default: identity).
    eps : stop when ||grad f(x)|| <= eps.

    Returns the final iterate x; prints the iteration and reset counts.
    """
    x = np.array( x, dtype = float )
    if D is None:
        D = np.eye( x.shape[ 0 ] )
    grad = grad_f( x )
    grad_sq = np.sum( grad ** 2.0 )
    d = -D @ grad
    iters = 0
    reset = 0
    while grad_sq ** 0.5 > eps and ( niter is None or iters < niter ):
        nrm_d = np.linalg.norm( d )
        # Safeguard 1: insufficient descent angle -> steepest-descent reset.
        if d @ grad >= -gamma * ( grad_sq ** 0.5 ) * nrm_d:
            d = -grad
            D = np.eye( x.shape[ 0 ] )
            reset += 1
            nrm_d = np.linalg.norm( d )
        # Safeguard 2: direction too short relative to the gradient.
        if nrm_d <= kappa * ( grad_sq ** 0.5 ):
            d = -grad
            D = np.eye( x.shape[ 0 ] )
            reset += 1
            nrm_d = np.linalg.norm( d )
        # Armijo backtracking line search (desc < 0 by safeguard 1).
        step_k = step
        desc = sigma * ( d @ grad )
        f_x = f( x )
        x_next = x + step_k * d
        f_x_next = f( x_next )
        while f_x_next >= f_x + step_k * desc:
            step_k = step_k * beta
            x_next = x + step_k * d
            f_x_next = f( x_next )
        p = x_next - x
        x = x_next
        grad_prev = grad
        grad = grad_f( x )
        grad_sq = np.sum( grad ** 2.0 )
        q = grad - grad_prev
        Dq = D @ q
        pq = p @ q       # FIX: curvature p.q (secant condition), not p.p
        tau = q @ Dq
        # Standard safeguard: skip the update when the curvature condition
        # fails (pq <= 0 would make D indefinite) -- keep the previous D.
        if pq > 0.0 and tau > 0.0:
            v = p / pq - Dq / tau
            # BFGS = DFP update + tau * v v^T (Broyden family, xi = 1).
            D += np.outer( p, p ) / pq - np.outer( Dq, Dq ) / tau
            D += tau * np.outer( v, v )
        d = -D @ grad
        iters += 1
    print( iters )
    print( reset )
    return x
def rosenbrock( x, a = 1, b = 100 ):
    """Rosenbrock "banana" function; global minimum 0 at (a, a**2)."""
    linear = a - x[ 0 ]
    valley = x[ 1 ] - x[ 0 ] ** 2.0
    return linear ** 2.0 + b * valley ** 2.0
def rosenbrock_grad( x, a = 1, b = 100 ):
    """Analytic gradient of the Rosenbrock function at x (same a, b)."""
    valley = x[ 1 ] - x[ 0 ] ** 2.0
    return np.array( [
        2.0 * ( x[ 0 ] - a ) - 4.0 * x[ 0 ] * b * valley,
        2.0 * b * valley,
    ] )
# Minimize the Rosenbrock function from the origin with nonlinear BFGS;
# the minimizer is (1, 1).
opts = dict(
    step = 1,
    sigma = 0.1,
    beta = 1e-1,
    kappa = 1e-3,
    gamma = 1e-1,
    eps = 1e-10,
)
x = BFGS_NL( rosenbrock, rosenbrock_grad, np.array( [ 0.0, 0.0 ] ), **opts )
print( x )