# A New Couchy (Gradient Descent) Picture

a <- 5
r <- 0.9
A <- matrix(c(1,a*r,a*r,a^2),2,2)
M <- 500
N <- 2*M+1
f <- function(x) t(x) %*% A %*% x
X <- cbind(-M:M/M,-M:M/M)
Z <- matrix(0, N, N)
for(i in 1:N){
    for(j in 1:N){
	Z[i,j] <- f(c(X[i,1],X[j,2]))
    }
}
pdf("AGD.pdf", height = 7, width = 7)
xlab = expression(x[1])
ylab = expression(x[2])
plot(-10:10/10, -10:10/50, xlab = xlab, ylab = ylab, type = "n")
contour(X[,1],X[,2],Z, levels = (1:3)/20, add = TRUE, labels = "")

# Gradient descent iteration:  NB I've accentuated the oscillation with the 1.9 factor
x0 <- c(-0.75, 0.15) # initial point
grad <- function(x) c(2*x[1] + 2*a*r*x[2], 2*a*r*x[1] + 2 * a^2 * x[2])
h <- function(t,x) f(x - t*grad(x))
for(i in 1:100){
    z <- optimize(h,c(0,1), x = x0)
    tz <- 1.9 * z$min
    lines(c(x0[1], x0[1] -tz * grad(x0)[1]), 
	  c(x0[2], x0[2] -tz * grad(x0)[2]), col = "grey")
    x0 <- x0 - tz * grad(x0)
}
print(z$obj)
points(0,0, cex = .5)

# An alternative version of Nesterov AGD a la B. O'Donoghue's Matlab code on github
# https://github.com/bodono/apg/blob/master/apg.m
alpha <- 1.01 # Step size growth factor
beta <- 0.5 # Step size shrinkage factor
x <- y <- c(-0.75, 0.15) # initial point
g <- grad(y)
theta <- 1
t <- 1/sqrt(sum(g^2))
xhat <- x - t * g
ghat <- grad(xhat)
t <- abs(crossprod(x - xhat, g - ghat))/crossprod(g - ghat)
for(i in 1:42){
    x0 <- x
    y0 <- y
    x <- y - t * g
    theta <- 2/(1+sqrt(1+4/theta^2))
    y <- x + (1-theta)*(x - x0)
    lines(c(y0[1], y[1]), c(y0[2], y[2]), lwd = 1.5, col = "red")
    g0 <- g
    g <- grad(y)
    that <- 0.5 * crossprod(y - y0)/abs(crossprod(y-y0,g0 - g))
    t <- min(alpha * t, max(beta * t, that))
}
dev.off()

if(FALSE){
# Now we consider the Nesterov Acceleration Method  for speeding up Cauchy Method
# Beck & Teboulle:  (2009) FISTA, SIAM JIS, 2, 183-202, or
# Bubeck:  https://blogs.princeton.edu/imabandit/2013/04/01/acceleratedgradientdescent/
# Note here we don't do any line search AT ALL
beta <- 26 # Lipschitz constant for grad f -- should be maximal eigenvalue of A??
lambda <- function (z) (1 + sqrt(1 + 4 * z^2))/2
x0 <- y1 <- c(-0.75, 0.15) # initial point
lam0 <- 0
for(i in 1:5){
    lam1 <- lambda(lam0)
    lam2 <- lambda(lam1)
    gamma <- (1 - lam1)/lam2
    x1 <- y1 - g(y1)/beta
    y2 <- (1 - gamma) * x1 + gamma * x0
    lines(c(y1[1], y2[1]), c(y1[2], y2[2]), col = "red")
    lam0 <- lam1
    x0 <- x1
    y1 <- y2
}
}