Notebook4
Gradient descent with projection and augmented lagrangian algorithm
To find the infimum of an arbitrary cost function, we use here the gradient descent. Let us consider the function $f(x,y)=0.5(x^2+(y-1)^2)$ that we aim at minimizing on $K\subset \mathbb{R}^2$ with $K={(x,y) | x= y}$. We start by manually compute its gradient and we will display its evolution during gradient descent.
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install matplotlib
1. Complete the following code
import numpy as np
import matplotlib.pyplot as plt
def f(x1,x2):
    # function
    return 0.5*(x1**2+(x2-1)**2)
def df(x1,x2):
    # gradient
    return np.array([x1,x2-1])
def g(x1,x2):
    # constraint
    return -x1+x2
def dg(x1,x2):
    # constraint gradient
    return np.array([-1,1])
def L(x1,x2,mu): 
    # lagrangian
    return f(x1,x2)+mu* (g(x1,x2))
def Lc(x1,x2,mu,c): 
    # augmented lagrangian
    return f(x1,x2)+c/2.* np.dot(g(x1,x2),g(x1,x2))+mu* (g(x1,x2))
def dxLc(x1,x2,mu,c): 
    # augmented lagrangian gradient
    return df(x1,x2)+ c* np.dot(g(x1,x2),dg(x1,x2)) +mu* (dg(x1,x2)) #(x1-x2) (1,-1)
We need to define a norm to now how far from the global solution we are.
def norm(a):
    return np.sqrt(a[0]**2+a[1]**2)
With $\alpha$ too big, we found out that the solution blows up in the previous exercice, a correct choice for $\alpha$ is very important. That is why, we will proceed with Armijo-Wolfe rule:
# Wolfe rule:
c1=.6
c2=.8
beta=1. #test with beta=1
eta=2.
gamma=0.01
def phik(xyk,alphak,dk,fc,mu,c):
    return fc(xyk[0]+alphak*dk[0],xyk[1]+alphak*dk[1],mu,c)
def dphik(xyk,alphak,dk,dfc,mu,c):
    grad_f=dfc(xyk[0]+alphak*dk[0],xyk[1]+alphak*dk[1],mu,c)
    return np.dot(grad_f,dk)
    
def wolfe_rule(alpha,xy_old,d,fc,dfc,mu=0,c=0): #d_x est la direction de descente d_x . grad_x <= 0
    # test f(x_new) \leq f(x_0) + c alpha ps{d_x}{grad_x}
    test = 1
    iteration = 0
    min_ = 0
    max_ = 1000
    while (phik(xy_old,alpha,d,fc,mu,c)<=(phik(xy_old,0,d,fc,mu,c)+c1*dphik(xy_old,0,d,dfc,mu,c)*alpha)) &(iteration<=500): #armijo ok
        alpha=eta*alpha
    iteration = 0
    while (test!=0)&(iteration<=500): 
        xnew0,xnew1=xy_old[0]+alpha*d[0],xy_old[1]+alpha*d[1]
        if (phik(xy_old,alpha,d,fc,mu,c)<= (phik(xy_old,0,d,fc,mu,c)+c1*dphik(xy_old,0,d,dfc,mu,c)*alpha)) & (np.dot(dfc(xnew0,xnew1,mu,c),d) >= c2*np.dot(dfc(xy_old[0],xy_old[1],mu,c),d) ):
            test = 0
        elif phik(xy_old,alpha,d,fc,mu,c)> (phik(xy_old,0,d,fc,mu,c)+c1*dphik(xy_old,0,d,dfc,mu,c)*alpha): #no armijo
            max_ = alpha
            alpha = (max_ + min_)/2
            iteration = iteration +1
        else: # armijo ok
            minorant = alpha
            alpha = (max_ + min_)/2
            iteration = iteration +1
    return alpha
2. Now we proceed with the Gradient descent algorithm with Armijo-Wolfe rule: Complete the following code
def GradientDescent(fc,dfc,xy0,beta,mu=0,c=0):
    #initialization for Armijo
    alpha=beta
    # start GD algorithm
    eps=1e-6
    x0,y0=xy0[0],xy0[1] #starting points
    evolution = [[x0, y0]]
    grad_f = dfc(x0, y0,mu=mu,c=c) #gradient function
    cpt_grad=0
    while norm(grad_f)>=eps and cpt_grad<500:
    
        d = -grad_f #direction
        alpha=beta  #armijo
        xy_old=np.array([x0,y0])        
        alpha=wolfe_rule(alpha,xy_old,d,fc,dfc, mu=mu,c=c) # find best alpha
        x0, y0 = x0 + alpha*d[0], y0 + alpha*d[1] 
        evolution = np.vstack((evolution, [x0, y0]))
        grad_f = dfc(x0, y0,mu=mu,c=c)
        cpt_grad+=1
    return evolution
3. Complete the code of the augmented lagrangian:
#initialization for Armijo
alpha=beta
## start GD algorithm
eps=1e-6
# initial points
x1,x2=1.,1.
c=1. # penalty parameter
mu=1.7 # dual variable
gold=g(x1,x2) # not a golden function, but the constraints are very precious
gradx_f = dxLc(x1, x2,mu=mu,c=c) # gradient of the augmented lagrangian
cpt=0 # to avoid infinite while loop
xy0=[x1,x2]
while norm(gradx_f)>eps or np.abs(g(x1,x2))>eps :
    evolution=GradientDescent(Lc,dxLc,xy0,beta,mu=mu,c=c)
    evolution_X = evolution[:, 0]
    evolution_Y = evolution[:, 1]
    x1,x2=evolution_X[-1],evolution_Y[-1] # last points
    print("x1:", x1, " x2:", x2)
    if (np.abs(g(x1,x2)))<np.abs(gold): # if ||g|| is small 
        #print("||g|| smaller",np.abs(g(x1,x2)))
        mu=mu+c*g(x1,x2) #increase mu
        print("new mu: ", mu)
        
    else:
        c=c*2 # increase c
        print("new c: ",c)
    cpt+=1
    gold=g(x1,x2) #updates
    gradx_f = dxLc(x1, x2,mu,c)
    print("norm of gradx_f: ",norm(gradx_f))
    if cpt==20:
        break
        
We can display the solution onto the contour of $f$ or onto the contour of $L_c$
evolution_X = evolution[:, 0]
evolution_Y = evolution[:, 1]
x1 = np.linspace(-2, 2, 25)
y1 = np.linspace(-2, 2, 25)
Xx, Yy = np.meshgrid(x1, y1)
print("mu",mu)
#mu=1.#0.5
#Z = Lc(Xx, Yy,mu=mu,c=0)
Z = f(Xx, Yy)
fig= plt.figure(figsize = (10,7))
contours = plt.contour(Xx, Yy, Z, 45)
plt.clabel(contours, inline = True, fontsize = 10)
plt.title("Evolution of the cost function during gradient descent with level circles", fontsize=15)
plt.plot(evolution_X, evolution_Y)
plt.plot(evolution_X, evolution_Y, '*', label = "Cost function")
plt.xlabel('x', fontsize=11)
plt.ylabel('y', fontsize=11)
plt.colorbar()
plt.legend(loc = "upper right")
plt.show()
mu 0.5000001501871618

4. We add new constraints: $(x,y)$ must stay in $K=[-2.,0]\times [-2,0]$. Complete the code that finds the projection of $(x,y)$ on a cuboid $K$. Then, change the gradient descent algorithm to take into account this projection step
def proj(K,x):
    #K=[[l0,u0],[l1,u1]]
    l0=...
    u0=...
    l1=...
    u1=...
    return ...
    
# NEW GRADIENT DESCENT ALGORITHM:
def GradientDescent(fc,dfc,xy0,beta,mu=0,c=0):
    #initialization for Armijo
    alpha=beta
    ## start GD algorithm
    eps=1e-6
    x0,y0=xy0[0],xy0[1]#1,2
    evolution = [[x0, y0]]
    grad_f = dfc(x0, y0,mu=mu,c=c)
    cpt_grad=0
    while norm(grad_f)>=eps and cpt_grad<500:
    
        d = -grad_f
        #armijo
        alpha=beta
        xy_old=np.array([x0,y0])        
        alpha=wolfe_rule(alpha,xy_old,d,fc,dfc, mu=mu,c=c)
        x0, y0 = ...
        evolution = np.vstack((evolution, [x0, y0]))
        grad_f = dfc(x0, y0,mu=mu,c=c)
        cpt_grad+=1
    return evolution
## AUGMENTED LAGRANGIAN WITH PROJECTION
alpha=beta
## start GD algorithm
eps=1e-6
x1,x2=1.,1.#0.5
c=1.
mu=1.7
gold=g(x1,x2)
gradx_f = dxLc(x1, x2,mu=mu,c=c)
cpt=0
xy0=[x1,x2]
K=[[-2,0],[-2,0]]
while norm(gradx_f)>eps or np.abs(g(x1,x2))>eps :
    evolution=GradientDescent(Lc,dxLc,xy0,beta,mu=mu,c=c)
    evolution_X = evolution[:, 0]
    evolution_Y = evolution[:, 1]
    x1=evolution_X[-1]
    x2=evolution_Y[-1]
    print("x1:", x1, " x2:", x2)
    if (np.abs(g(x1,x2)))<np.abs(gold):
        #print("||g|| smaller",np.abs(g(x1,x2)))
        mu+=c*g(x1,x2)
        #print("mu", mu)
        
    else:
        c*=2
        print("c",c)
    cpt+=1
    gold=g(x1,x2)
    gradx_f = dxLc(x1, x2,mu,c)
    print("norm: ",norm(gradx_f))
    print("mu: ", mu)
    if cpt==20:
        break
    
5. You should find $\mu=1$, and $(x_1,x_2)=0,0$.
The constraints $-x_1 \leq 2$ and $-x_2 \leq 2$ are inactive, and thus the associated dual variables are equal to $0$. With $\mu=1$, we find $\lambda_1=1$ and $\lambda_2=0$ the dual variables associated to $x_1 \leq 0$ and $x_2 \leq 0$.
Defined the new augmented lagrangian with these constraints added:
evolution_X = evolution[:, 0]
evolution_Y = evolution[:, 1]
x1 = np.linspace(-2, 2, 25)
y1 = np.linspace(-2, 2, 25)
Xx, Yy = np.meshgrid(x1, y1)
print("mu",mu)
mu=1.
def Lc2(x1,x2,mu,c): 
    # augmented lagrangian with new constraints
    return ...
    
Z = Lc2(Xx, Yy,mu=mu,c=0)
#Z = f(Xx, Yy)
fig= plt.figure(figsize = (10,7))
contours = plt.contour(Xx, Yy, Z, 45)
plt.clabel(contours, inline = True, fontsize = 10)
plt.title("Evolution of the cost function during gradient descent with level circles", fontsize=15)
plt.plot(evolution_X, evolution_Y)
plt.plot(evolution_X, evolution_Y, '*', label = "Cost function")
plt.xlabel('x', fontsize=11)
plt.ylabel('y', fontsize=11)
plt.colorbar()
plt.legend(loc = "upper right")
plt.show()
mu 0.5

