@@ -133,15 +133,67 @@ proc eye[T](n: int): Tensor[T] =
133133 for i in 0 ..< n:
134134 result [i, i] = 1
135135
136- proc steepestDescent * [U; T: not Tensor ](f: proc (x: Tensor [U]): T, x0: Tensor [U], alpha: U = U (0.1 ), tol: U = U (1 e-6 ), fastMode: bool = false ): Tensor [U] =
type LineSearchCriterion = enum
  Armijo, Wolfe, WolfeStrong, None

proc line_search*[U, T](alpha: var U, p: Tensor[T], x0: Tensor[U],
                        f: proc(x: Tensor[U]): T,
                        criterion: LineSearchCriterion,
                        fastMode: bool = false) =
  ## Inexact line search along direction `p` starting from `x0`.
  ##
  ## Mutates `alpha` in place to a step length satisfying `criterion`:
  ## - `Armijo`: sufficient-decrease condition only (c1 = 1e-4).
  ## - `Wolfe`: Armijo + curvature condition (c2 = 0.9).
  ## - `WolfeStrong`: Armijo + two-sided (strong) curvature condition.
  ## - `None`: returns immediately, leaving `alpha` untouched.
  ##
  ## At most 20 trial steps are taken, and `alpha` is clamped to the
  ## range [1e-3, 1e2]; hitting either bound ends the search.
  # note: set initial alpha for the methods as 1 / sqrt(dot(grad, grad))
  # so first step has length 1.
  if criterion == None:
    return

  var gradient = tensorGradient(f, x0, fastMode = fastMode)
  let dirDerivInit = dot(gradient, p)

  if 0 < dirDerivInit:
    # p is pointing uphill, use whatever alpha we currently have.
    return

  let fInit = f(x0)
  var counter = 0
  alpha = 1
  while counter < 20:
    let x = x0 + alpha * p
    let fx = f(x)
    gradient = tensorGradient(f, x, fastMode = fastMode)
    counter += 1

    if fx > fInit + 1e-4 * alpha * dirDerivInit: # c1 = 1e-4 multiply as well? Doesn't seem to work
      # Sufficient-decrease (Armijo) condition failed: step too long, shrink.
      alpha *= 0.5
    else:
      if criterion == Armijo:
        return
      let dirDeriv = dot(gradient, p)
      if dirDeriv < 0.9 * dirDerivInit:
        # Curvature condition failed (slope still too negative): grow the step.
        # dirDerivInit < 0 here, so this compares against a negative bound.
        alpha *= 2.1
      else:
        if criterion == Wolfe:
          return
        if dirDeriv > -0.9 * dirDerivInit:
          # Strong Wolfe upper bound violated (slope overshot past zero): shrink.
          alpha *= 0.5
        else:
          return
    # Clamp the step length and stop searching once a bound is reached.
    if alpha < 1e-3:
      alpha = 1e-3
      return
    elif alpha > 1e2:
      alpha = 1e2
      return
180+
181+
182+
183+
184+
185+
186+ proc steepestDescent * [U; T: not Tensor ](f: proc (x: Tensor [U]): T, x0: Tensor [U], alpha: U = U (0.1 ), tol: U = U (1 e-6 ), fastMode: bool = false , criterion: LineSearchCriterion = None ): Tensor [U] =
137187 # # Minimize scalar-valued function f.
188+ var alpha = alpha
138189 var x = x0.clone ()
139190 var fNorm = abs (f (x0))
140191 var gradient = tensorGradient (f, x0, fastMode= fastMode)
141192 var gradNorm = vectorNorm (gradient)
142193 var iters: int
143194 while gradNorm > tol* (1 + fNorm) and iters < 10000 :
144195 let p = - gradient
196+ line_search (alpha, p, x, f, criterion, fastMode)
145197 x += alpha * p
146198 let fx = f (x)
147199 fNorm = abs (fx)
@@ -218,8 +270,9 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U =
218270 # echo iters, " iterations done!"
219271 result = x
220272
221- proc bfgs_optimized * [U; T: not Tensor ](f: proc (x: Tensor [U]): T, x0: Tensor [U], alpha: U = U (1 ), tol: U = U (1 e-6 ), fastMode: bool = false ): Tensor [U] =
273+ proc bfgs_optimized * [U; T: not Tensor ](f: proc (x: Tensor [U]): T, x0: Tensor [U], alpha: U = U (1 ), tol: U = U (1 e-6 ), fastMode: bool = false , criterion: LineSearchCriterion = None ): Tensor [U] =
222274 # Use gemm and gemv with preallocated Tensors and setting beta = 0
275+ var alpha = alpha
223276 var x = x0.clone ()
224277 let xLen = x.shape[0 ]
225278 var fNorm = abs (f (x))
@@ -240,6 +293,7 @@ proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U],
240293 # echo "p iter ", iters, ": ", p
241294 # echo "x iter ", iters, ": ", x
242295 # echo "gradient iter ", iters, ": ", gradient
296+ line_search (alpha, p, x, f, criterion, fastMode)
243297 x += alpha * p
244298 let newGradient = tensorGradient (f, x, fastMode= fastMode)
245299 let sk = alpha * p.reshape (xLen, 1 )
@@ -296,14 +350,15 @@ proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U],
296350 # echo iters, " iterations done!"
297351 result = x
298352
299- proc lbfgs * [U; T: not Tensor ](f: proc (x: Tensor [U]): T, x0: Tensor [U], alpha: U = U (1 ), tol: U = U (1 e-6 ), fastMode: bool = false ): Tensor [U] =
353+ proc lbfgs * [U; T: not Tensor ](f: proc (x: Tensor [U]): T, x0: Tensor [U], alpha: U = U (1 ), tol: U = U (1 e-6 ), fastMode: bool = false , m: int = 10 , criterion: LineSearchCriterion = None ): Tensor [U] =
354+ var alpha = alpha
300355 var x = x0.clone ()
301356 let xLen = x.shape[0 ]
302357 var fNorm = abs (f (x))
303358 var gradient = 0.01 * tensorGradient (f, x, fastMode= fastMode)
304359 var gradNorm = vectorNorm (gradient)
305360 var iters: int
306- let m = 10 # number of past iterations to save
361+ # let m = 10 # number of past iterations to save
307362 var sk_queue = initDeque [Tensor [U]](m)
308363 var yk_queue = initDeque [Tensor [T]](m)
309364 # the problem is the first iteration as the gradient is huge and no adjustments are made
@@ -330,6 +385,7 @@ proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U
330385 z = - z
331386 let p = z
332387 # echo "q: ", q
388+ line_search (alpha, p, x, f, criterion, fastMode)
333389 x += alpha * p
334390 sk_queue.addFirst alpha* p
335391 let newGradient = tensorGradient (f, x, fastMode= fastMode)
@@ -402,40 +458,71 @@ when isMainModule:
402458 result += (ix - 1 )^ 2
403459
404460 # let x0 = [-0.5, 2.0].toTensor
405- let x0 = ones [float ](500 ) * - 1
461+ let x0 = ones [float ](100 ) * - 1
406462 #[ let sol1 = steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true)
407463 echo sol1
408464 echo f1(sol1)
409465 echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=false)
410466 echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=true)
411467 echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=false)
412468 echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=true)
413- echo "LBFGS:", lbfgs(f1, x0, tol=1e-8, fastMode=false) ]#
469+ echo "LBFGS:", lbfgs(f1, x0, tol=1e-8, fastMode=false)
414470 echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=false)
415471 echo "BFGS opt: ", bfgs_optimized(f1, x0, tol=1e-8, fastMode=false)
416472 echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=true)
417- echo " BFGS opt: " , bfgs_optimized (f1, x0, tol= 1 e-8 , fastMode= true )
473+ echo "BFGS opt: ", bfgs_optimized(f1, x0, tol=1e-8, fastMode=true) ]#
418474
419- timeIt " steepest slow mode" :
420- keep steepestDescent (f1, x0, tol= 1 e-8 , alpha= 0.001 , fastMode= false )
421- timeIt " steepest fast mode" :
422- keep steepestDescent (f1, x0, tol= 1 e-8 , alpha= 0.001 , fastMode= true )
423- timeIt " newton slow mode" :
475+ #[ echo bfgs_optimized(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=None)
476+ echo bfgs_optimized(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Armijo)
477+ echo bfgs_optimized(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Wolfe)
478+ echo bfgs_optimized(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=WolfeStrong)
479+ echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=None)
480+ echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Armijo)
481+ echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Wolfe)
482+ echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=WolfeStrong) ]#
483+
484+ timeIt " steepest slow mode None" :
485+ keep bfgs_optimized (f1, x0, tol= 1 e-8 , alpha= 1 , fastMode= false , criterion= None )
486+
487+ timeIt " steepest slow mode Armijo" :
488+ keep bfgs_optimized (f1, x0, tol= 1 e-8 , alpha= 1 , fastMode= false , criterion= Armijo )
489+
490+ timeIt " steepest slow mode Wolfe" :
491+ keep bfgs_optimized (f1, x0, tol= 1 e-8 , alpha= 1 , fastMode= false , criterion= Wolfe )
492+
493+ timeIt " steepest slow mode WolfeStrong" :
494+ keep bfgs_optimized (f1, x0, tol= 1 e-8 , alpha= 1 , fastMode= false , criterion= WolfeStrong )
495+
496+ timeIt " steepest slow mode None" :
497+ keep lbfgs (f1, x0, tol= 1 e-8 , alpha= 1 , fastMode= false , criterion= None )
498+
499+ timeIt " steepest slow mode Armijo" :
500+ keep lbfgs (f1, x0, tol= 1 e-8 , alpha= 1 , fastMode= false , criterion= Armijo )
501+
502+ timeIt " steepest slow mode Wolfe" :
503+ keep lbfgs (f1, x0, tol= 1 e-8 , alpha= 1 , fastMode= false , criterion= Wolfe )
504+
505+ timeIt " steepest slow mode WolfeStrong" :
506+ keep lbfgs (f1, x0, tol= 1 e-8 , alpha= 1 , fastMode= false , criterion= WolfeStrong )
507+ #[ timeIt "newton slow mode":
424508 keep newton(f1, x0, tol=1e-8, fastMode=false)
425509 timeIt "newton fast mode":
426510 keep newton(f1, x0, tol=1e-8, fastMode=true)
427511 timeIt "bfgs slow mode":
428512 keep bfgs(f1, x0, tol=1e-8, fastMode=false)
429513 timeIt "bfgs fast mode":
430514 keep bfgs(f1, x0, tol=1e-8, fastMode=true)
431- timeIt " optimized bfgs slow mode" :
432- keep bfgs_optimized (f1, x0, tol= 1 e-8 , fastMode= false )
433- timeIt " optimized bfgs fast mode" :
434- keep bfgs_optimized (f1, x0, tol= 1 e-8 , fastMode= true )
435515 timeIt "lbfgs slow mode":
436516 keep lbfgs(f1, x0, tol=1e-8, fastMode=false)
437517 timeIt "lbfgs fast mode":
438518 keep lbfgs(f1, x0, tol=1e-8, fastMode=true)
519+ timeIt "optimized bfgs slow mode":
520+ keep bfgs_optimized(f1, x0, tol=1e-8, fastMode=false)
521+ timeIt "optimized bfgs fast mode":
522+ keep bfgs_optimized(f1, x0, tol=1e-8, fastMode=true)
523+ timeIt "steepest fast mode":
524+ keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) ]#
525+
439526
440527 # Lev-Marq:
441528#[ proc fFit(params: Tensor[float], x: float): float =
0 commit comments