28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474 | def fmin_qnewton(
f: Callable[[FloatVector], float],
x0: FloatVectorLike,
*,
tolx: float = 1e-10,
tolg: float = 1e-5,
sclx: FloatVectorLike | None = None,
sclf: float = 1.0,
maxiter: int = 100,
maxlenfac: float = 1e3,
trustlen: float | None = None,
epsf: float | None = None,
global_method: Literal["line-search", "dogleg"] | None = "dogleg",
bfgs_update: bool | None = None,
bfgs_method: Literal["factored", "unfactored", "inverse"] | None = None,
grad: Callable[[FloatVector], FloatVector] | None = None,
hess: Callable[[FloatVector], FloatMatrix] | None = None,
H0: FloatMatrix | None = None,
callback: Callable[[int, FloatVector, float, FloatVector], tuple[bool, bool]]
| None = None,
verbose: bool = False,
) -> VectorOptimumResult:
r"""Find the minimum of a multivariate function using a quasi-Newton method with
optional global strategies.
Parameters
----------
f : Callable[[FloatVector (N)], float]
Objective function to minimize.
x0 : FloatVectorLike (N)
Initial guess for the optimum. Moreover, if no user-defined scale `sclx` is
provided, the scaling factors will be determined from this value.
tolx : float
Tolerance for the scaled step size. The algorithm terminates when the scaled
distance between two successive iterates `||Δx/max(x, 1/sclx)||∞` is below this
threshold. If the value is too large, the algorithm may terminate prematurely.
A value on the order of $\epsilon^{2/3}$ is typically recommended.
tolg : float
Tolerance for the scaled gradient. This is the main convergence criterion. The
algorithm terminates when the scaled gradient `||∇f(x)*sclf/sclx||∞` is below this
threshold. If the value is too large, the algorithm may terminate prematurely.
A value on the order of $\epsilon^{1/3}$ is typically recommended.
sclx : FloatVectorLike (N) | None
Positive scaling factors for the components of `x`. Ideally, these should be
chosen so that `sclx*x` is of order 1 near the solution for all components. By
default, scaling is determined from `x0`.
sclf : float
Positive scaling factor for the function values. Ideally, this should be chosen so
that `sclf*f(x)` is of order 1 across the domain of interest. The value is used to
scale the gradient; if too low a value is provided, the algorithm may terminate
prematurely.
maxiter : int
Maximum number of outer quasi-Newton iterations.
maxlenfac : float
Factor determining the maximum allowable scaled step length `||Δx*sclx||₂` for
global methods. Used to prevent steps that would cause the algorithm to overflow,
leave the domain of interest, or diverge. It should be chosen small enough to
prevent such issues, but large enough to allow any anticipated reasonable step
length.
trustlen : float | None
Initial trust region radius for the `dogleg` global method. By default, the length
of the initial scaled gradient is used.
epsf : float | None
Machine precision of the function values. If `None`, machine precision of 64-bit
floating-point type is assumed. If the number of reliable base-10 digits in the
results returned by the function is $n$, then `epsf` is approximately $10^{-n}$.
global_method : Literal['line-search','dogleg'] | None
Global strategy to improve convergence from remote starting points. With
`line-search`, the search direction is computed using the quasi-Newton step and
the length of the step is determined by backtracking until the Armijo condition is
fulfilled. With `dogleg`, a trust-region dogleg method is used to compute both the
step direction and length. If `None`, no global strategy is used and the full
quasi-Newton step is taken at each iteration.
bfgs_update : bool | None
Whether to update the Hessian approximation using the BFGS positive definite
secant formula. If `False`, the Hessian is evaluated anew at each iteration
using `hess` or forward finite differences. If `True`, the Hessian is updated
using the BFGS formula, avoiding the cost of a full evaluation. The BFGS
update typically increases the number of iterations required for convergence,
but decreases the total number of function evaluations when the Hessian is
approximated via finite differences. By default, the BFGS update is used if no
Hessian function is provided, and is not used otherwise.
bfgs_method : Literal['factored', 'unfactored', 'inverse'] | None
Method to carry out the BFGS Hessian update. In theory, all methods produce the
same result, but the computational cost and the numerical stability may differ.
By default, the `factored` form is used when the dogleg global method is selected,
because the dogleg method requires the Cholesky factor of the Hessian. Otherwise,
the `inverse` form is used because it has the lowest algorithmic overhead.
grad : Callable[[FloatVector (N)], FloatVector (N)] | None
Function to compute the gradient of `f`. By default, the gradient is approximated
using forward finite differences. In this case, setting `epsf` appropriately is
essential.
hess : Callable[[FloatVector (N)], FloatMatrix (N, N)] | None
Function to compute the Hessian of `f`. By default, the Hessian is approximated
using forward finite differences, using `grad` if available or `f` otherwise.
In this case, setting `epsf` appropriately is essential.
H0 : FloatMatrix (N, N) | None
Initial Hessian approximation at `x0`, expected to be symmetric positive definite.
By default, if `bfgs_update` is `False`, `H0` is computed using `hess` or forward
finite differences. If `bfgs_update` is `True` and `hess` is not provided, `H0` is
initialized to the identity matrix. If `bfgs_update` is `True` and `hess` is
provided, `H0` is initialized from `hess`.
callback : Callable[[int, FloatVector (N), float, FloatVector (N)], tuple[bool, bool]] | None
Optional callback with signature `callback(niter, x, fx, ∇fx)->(stop, success)`
called at each iteration. If `stop` is `True`, the iteration is terminated. If
`success` is `True`, the optimization is considered successful.
verbose : bool
Print iteration information.
Returns
-------
VectorOptimumResult
Dataclass with the results of the optimization.
Examples
--------
Find the minimum of the Rosenbrock function `f(x,y) = (1 - x)^2 + 100*(y - x^2)^2`.
>>> import numpy as np
>>> from polykin.math import fmin_qnewton
>>> f = lambda x: (1 - x[0])**2 + 100 * (x[1] - x[0]**2)**2
>>> fmin_qnewton(f, [-1.0, -1.0], global_method="dogleg")
method: Quasi-Newton (Global: Dogleg, BFGS: Factored)
success: True
message: ||∇f(x)*sclf/sclx||∞ ≤ tolg
nfeval: 98
ngeval: 0
nheval: 0
niter: 28
x: [0.9999964 0.99999277]
f: 1.3039626656861953e-11
g: [ 7.80959496e-06 -3.03099592e-06]
L: [[-28.34545479 0. ]
[ 14.14251074 -0.70808768]]
""" # noqa: E501
# Check/set method options
if bfgs_update is None:
bfgs_update = hess is None
if bfgs_update:
if bfgs_method is None:
if global_method == "dogleg":
bfgs_method = "factored"
else:
bfgs_method = "inverse"
elif bfgs_method != "factored" and global_method == "dogleg":
bfgs_method = "factored"
warn(
"BFGS update with dogleg global method requires factored form. Setting `bfgs_method` to 'factored'." # noqa: E501
)
# Construct method name for result
method = "Quasi-Newton"
method_options = []
_global_method = global_method if global_method else "none"
method_options.append(f"Global: {_global_method.title()}")
_bfgs_method = bfgs_method if bfgs_method else "none"
if bfgs_update:
method_options.append(f"BFGS: {_bfgs_method.title()}")
if method_options:
method += " (" + ", ".join(method_options) + ")"
# Initialize result variables
success = False
message = ""
nfeval = 0
ngeval = 0
nheval = 0
niter = 0
x0 = np.asarray(x0, dtype=float)
n = x0.size
# Set scaling factors
sclx = np.abs(np.asarray(sclx, dtype=float)) if sclx is not None else scalex(x0)
sclf = max(abs(sclf), 1.0)
# Set maximum step length for global methods
maxlen = max(0.0, maxlenfac) * max(norm(sclx * x0).item(), norm(sclx).item())
# Set initial trust region radius for dogleg method
if trustlen is None:
trustlen = -1.0 # Sentinel value
else:
trustlen = min(trustlen, maxlen)
# Helper functions to evaluate gradient and Hessian
def eval_grad(x: FloatVector, fx: float) -> FloatVector:
nonlocal ngeval, nfeval
if grad is not None:
g = grad(x)
ngeval += 1
else:
g = gradient_forward(f, x, fx=fx, sclx=sclx, epsf=epsf)
nfeval += n
return g
def eval_hess(x: FloatVector, fx: float, gx: FloatVector) -> FloatMatrix:
nonlocal nheval, ngeval, nfeval
if hess is not None:
H = hess(x)
nheval += 1
elif grad is not None:
H = jacobian_forward(grad, x, fx=gx, sclx=sclx, epsf=epsf)
ngeval += n
else:
H = hessian_forward(f, x, fx=fx, sclx=sclx, epsf=epsf)
nfeval += n * (n + 3) // 2
return H
# Evaluate function at x0
xc = x0.copy()
fc = f(xc)
nfeval += 1
# Evaluate gradient at x0
grad_analytic = grad is not None
gc = eval_grad(xc, fc)
# Check initial solution with tight tolerance
if gradient_condition(xc, fc, gc, sclx, sclf, 1e-3 * tolg):
message = "||∇f(x0)*sclf/sclx||∞ ≤ 1e-3*tolg"
return VectorOptimumResult(
method, True, message, nfeval, ngeval, nheval, niter, x0, fc, gc
)
# Evaluate Hessian at x0
H = L = Hinv = np.array([])
if H0 is not None:
H = H0.copy()
elif bfgs_update and hess is None:
# D = max(abs(fc), 1 / sclf) * sclx**2
D = np.ones_like(x0)
if bfgs_method == "unfactored":
H = np.zeros((n, n))
np.fill_diagonal(H, D)
elif bfgs_method == "factored":
L = np.zeros((n, n))
np.fill_diagonal(L, np.sqrt(D))
elif bfgs_method == "inverse":
Hinv = np.zeros((n, n))
np.fill_diagonal(Hinv, 1 / D)
else:
raise ValueError(f"Unknown `bfgs_method`: {bfgs_method}.")
else:
H = eval_hess(xc, fc, gc)
# Factorize or invert Hessian if BFGS update requires it
if bfgs_update and H.size > 0:
if bfgs_method == "unfactored" and hess is not None:
try:
H = make_hessian_spd(H, enforce_symmetry=True)[0]
except np.linalg.LinAlgError as e:
message = f"Cholesky factorization of initial Hessian failed: {e}."
return VectorOptimumResult(
method, False, message, nfeval, ngeval, nheval, niter, xc, fc, gc, H
)
elif bfgs_method == "factored":
try:
L = make_hessian_spd(H, enforce_symmetry=True)[1]
except np.linalg.LinAlgError as e:
message = f"Cholesky factorization of initial Hessian failed: {e}."
return VectorOptimumResult(
method, False, message, nfeval, ngeval, nheval, niter, xc, fc, gc, H
)
elif bfgs_method == "inverse":
try:
Hinv = np.linalg.inv(H)
except np.linalg.LinAlgError as e:
message = f"Inverse of initial Hessian failed: {e}."
return VectorOptimumResult(
method, False, message, nfeval, ngeval, nheval, niter, xc, fc, gc, H
)
else:
pass
# Main optimization loop
gm_nmaxsteps = 0
for niter in range(1, maxiter + 1):
if verbose:
print(f"Iteration {niter:3d}:", flush=True)
print(" Current point:", xc)
print(" Function value:", fc)
print(" Gradient value:", gc)
# Ensure Hessian is SPD
if not bfgs_update:
try:
H, L, _ = make_hessian_spd(H, enforce_symmetry=True)
except np.linalg.LinAlgError as e:
message = f"Perturbed Cholesky factorization of Hessian failed: {e}."
break
# Compute Newton step
if bfgs_update and bfgs_method == "inverse":
# p = - H⁻¹.gc
p = Hinv @ gc
p *= -1
elif bfgs_update and bfgs_method == "unfactored":
# H.p = - gc
try:
L, _ = scipy.linalg.cho_factor(
H,
lower=True,
overwrite_a=False,
check_finite=False,
)
p = scipy.linalg.cho_solve(
(L, True),
gc,
overwrite_b=False,
check_finite=False,
)
p *= -1
if global_method == "dogleg":
L = np.tril(L)
except scipy.linalg.LinAlgError as e:
message = f"Cholesky solve for Newton step failed: {e}."
break
elif not bfgs_update or bfgs_method == "factored":
# (L.Lᵀ).p = - gc
try:
# L.y = gc
y = scipy.linalg.solve_triangular(
L,
gc,
lower=True,
overwrite_b=False,
check_finite=False,
)
# Lᵀ.p = y
p = scipy.linalg.solve_triangular(
L,
y,
lower=True,
trans="T",
overwrite_b=True,
check_finite=False,
)
p *= -1
except scipy.linalg.LinAlgError as e:
message = f"Triangular solve for Newton step failed: {e}."
break
else:
raise ValueError(f"Unknown `bfgs_method`: {bfgs_method}.")
if verbose:
print(" Search direction:", p)
# Compute actual x step
if global_method is None:
xp = xc + p
fp = f(xp)
gm_ismaxstep = True
gm_success = True
gm_nfeval = 1
elif global_method == "line-search":
gm_success, gm_ismaxstep, gm_nfeval, xp, fp, _ = line_search(
f, p, xc, fc, gc, tolx, sclx, maxlen, verbose
)
elif global_method == "dogleg":
gm_success, gm_ismaxstep, gm_nfeval, trustlen, xp, fp, _ = dogleg(
f, p, xc, fc, gc, L.T, tolx, sclx, maxlen, trustlen, verbose
)
else:
raise ValueError(f"Unknown `global_method`: {global_method}.")
nfeval += gm_nfeval
gm_nmaxsteps = gm_nmaxsteps + 1 if gm_ismaxstep else 0
# Display iteration progress
if verbose:
print(f" x = {xp}\n f(x) = {fp:.2e}", flush=True)
# Evaluate gradient at new point
gp = eval_grad(xp, fp)
# Call user callback
if callback is not None:
stop, _success = callback(niter, xp, fp, gp)
if stop:
success = _success
message = "Terminated by user callback."
break
# Check termination and convergence conditions
if not gm_success:
message = """Last global step failed to decrease f(x) sufficiently.
Either `x` is an approximate local minimizer and no more accuracy is possible,
or the finite difference gradient approximation is too inaccurate, or `tolx`
is too large."""
stop = True
elif gradient_condition(xp, fp, gp, sclx, sclf, tolg):
message = "||∇f(x)*sclf/sclx||∞ ≤ tolg"
success = True
stop = True
elif norm((xp - xc) / np.maximum(np.abs(xp), 1 / sclx), np.inf) <= tolx:
message = """||Δx/max(x, 1/sclx)||∞ ≤ tolx
`x` may be an approximate local minimizer, but it is also possible that the
algorithm is making slow progress and is not near a minimizer, or that `tolx`
is too large."""
stop = True
elif global_method and gm_nmaxsteps >= 5:
message = """Maximum number (5) of consecutive steps of length `maxlen`
reached. Perhaps f(x) is unbounded below, or f(x) has a finite asymptote in
some direction, or `maxlen` is too small."""
stop = True
else:
stop = False
if stop:
xc, fc, gc = xp, fp, gp
break
# Update Hessian approximation
if bfgs_update:
if bfgs_method == "factored":
L = _update_bfgs_factored(xc, xp, gc, gp, L, grad_analytic)
elif bfgs_method == "unfactored":
H = _update_bfgs_unfactored(xc, xp, gc, gp, H, grad_analytic)
elif bfgs_method == "inverse":
Hinv = _update_bfgs_inverse(xc, xp, gc, gp, Hinv, grad_analytic)
else:
raise ValueError(f"Unknown `bfgs_method`: {bfgs_method}.")
else:
H = eval_hess(xp, fp, gp)
# Next iteration will start at xp
xc, fc, gc = xp, fp, gp
else:
message = f"Maximum number of iterations ({maxiter}) reached."
return VectorOptimumResult(
method=method,
success=success,
message=message,
nfeval=nfeval,
ngeval=ngeval,
nheval=nheval,
niter=niter,
x=xc,
f=fc,
g=gc,
H=H if not bfgs_update or bfgs_method == "factored" else None,
Hinv=Hinv if bfgs_method == "inverse" else None,
L=L if bfgs_method == "factored" else None,
)
|