# A New Couchy (Gradient Descent) Picture a <- 5 r <- 0.9 A <- matrix(c(1,a*r,a*r,a^2),2,2) M <- 500 N <- 2*M+1 f <- function(x) t(x) %*% A %*% x X <- cbind(-M:M/M,-M:M/M) Z <- matrix(0, N, N) for(i in 1:N){ for(j in 1:N){ Z[i,j] <- f(c(X[i,1],X[j,2])) } } pdf("AGD.pdf", height = 7, width = 7) xlab = expression(x[1]) ylab = expression(x[2]) plot(-10:10/10, -10:10/50, xlab = xlab, ylab = ylab, type = "n") contour(X[,1],X[,2],Z, levels = (1:3)/20, add = TRUE, labels = "") # Gradient descent iteration: NB I've accentuated the oscillation with the 1.9 factor x0 <- c(-0.75, 0.15) # initial point grad <- function(x) c(2*x[1] + 2*a*r*x[2], 2*a*r*x[1] + 2 * a^2 * x[2]) h <- function(t,x) f(x - t*grad(x)) for(i in 1:100){ z <- optimize(h,c(0,1), x = x0) tz <- 1.9 * z$min lines(c(x0[1], x0[1] -tz * grad(x0)[1]), c(x0[2], x0[2] -tz * grad(x0)[2]), col = "grey") x0 <- x0 - tz * grad(x0) } print(z$obj) points(0,0, cex = .5) # An alternative version of Nesterov AGD a la B. O'Donoghue's Matlab code on github # https://github.com/bodono/apg/blob/master/apg.m alpha <- 1.01 # Step size growth factor beta <- 0.5 # Step size shrinkage factor x <- y <- c(-0.75, 0.15) # initial point g <- grad(y) theta <- 1 t <- 1/sqrt(sum(g^2)) xhat <- x - t * g ghat <- grad(xhat) t <- abs(crossprod(x - xhat, g - ghat))/crossprod(g - ghat) for(i in 1:42){ x0 <- x y0 <- y x <- y - t * g theta <- 2/(1+sqrt(1+4/theta^2)) y <- x + (1-theta)*(x - x0) lines(c(y0[1], y[1]), c(y0[2], y[2]), lwd = 1.5, col = "red") g0 <- g g <- grad(y) that <- 0.5 * crossprod(y - y0)/abs(crossprod(y-y0,g0 - g)) t <- min(alpha * t, max(beta * t, that)) } dev.off() if(FALSE){ # Now we consider the Nesterov Acceleration Method for speeding up Cauchy Method # Beck & Teboulle: (2009) FISTA, SIAM JIS, 2, 183-202, or # Bubeck: https://blogs.princeton.edu/imabandit/2013/04/01/acceleratedgradientdescent/ # Note here we don't do any line search AT ALL beta <- 26 # Lipschitz constant for grad f -- should be maximal eigenvalue of A?? lambda <- function (z) (1 + sqrt(1 + 4 * z^2))/2 x0 <- y1 <- c(-0.75, 0.15) # initial point lam0 <- 0 for(i in 1:5){ lam1 <- lambda(lam0) lam2 <- lambda(lam1) gamma <- (1 - lam1)/lam2 x1 <- y1 - g(y1)/beta y2 <- (1 - gamma) * x1 + gamma * x0 lines(c(y1[1], y2[1]), c(y1[2], y2[2]), col = "red") lam0 <- lam1 x0 <- x1 y1 <- y2 } }