import numpy as np
import matplotlib.pyplot as plt

# Comparison for quadratics
np.random.seed(228)
# Parameters
n = 100 # Dimension of x
mu = 10
L = 100
def format_num(x):
"""
Formats a number into scientific notation with two decimal places
for the significand and removes the extra 0 in the exponent.
Example: 1.14e-01 becomes 1.14e-1.
"""
s = f"{x:.2e}"
s = s.replace("e+0", "e+").replace("e-0", "e-")
    return s
def generate_problem(n=100, mu=mu, L=L, problem_type="clustered"):
np.random.seed(228)
if problem_type == "clustered":
A = np.diagflat([mu * np.ones(n // 4), (mu + (L - mu) / 3) * np.ones(n // 4),
(mu + 2 * (L - mu) / 3) * np.ones(n // 4), L * np.ones(n // 4)])
U = np.random.rand(n, n)
Q, _ = np.linalg.qr(U)
A = Q.dot(A).dot(Q.T)
A = (A + A.T) * 0.5
x_opt = np.random.rand(n)
b = A @ x_opt
x_0 = 5 * np.random.randn(n)
elif problem_type == "random":
A = np.random.randn(n, n)
factual_L = max(np.linalg.eigvalsh(A.T @ A))
A = A.T.dot(A) / factual_L * L + mu * np.eye(n)
x_opt = np.random.rand(n)
b = A @ x_opt
x_0 = 3 * np.random.randn(n)
elif problem_type == "uniform spectrum":
A = np.diag(np.linspace(mu, L, n, endpoint=True))
x_opt = np.random.rand(n)
b = A @ x_opt
x_0 = 3 * np.random.randn(n)
elif problem_type == "Hilbert":
A = np.array([[1.0 / (i + j - 1) for i in range(1, n+1)] for j in range(1, n+1)])
b = np.ones(n)
x_0 = 3 * np.random.randn(n)
x_opt = np.linalg.lstsq(A, b, rcond=None)[0]
elif problem_type == "worst_cg":
t = 0.6 # Small t leads to worse conditioning
main_diag = np.ones(n)
main_diag[0] = t
main_diag[1:] = 1 + t
off_diag = np.sqrt(t) * np.ones(n-1)
A = np.diag(main_diag) + np.diag(off_diag, k=1) + np.diag(off_diag, k=-1)
b = np.zeros(n)
b[0] = 1
x_opt = np.linalg.solve(A, b)
x_0 = np.zeros(n)
return A, b, x_0, x_opt
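# Hypothetical quick check (illustrative only): inspect the spectrum of a generated matrix
# A, _, _, _ = generate_problem(n=8, mu=1, L=10, problem_type="uniform spectrum")
# print(np.linalg.eigvalsh(A))  # eigenvalues span [mu, L] = [1, 10]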
# Optimization methods now return (function_gap, domain_gap, grad_norm, alpha, beta)
def gradient_descent(f, grad_f, x_0, step_size, iterations, x_opt):
x = x_0.copy()
f_opt = f(x_opt)
function_gap = [abs(f(x) - f_opt)]
domain_gap = [np.linalg.norm(x - x_opt)]
grad_norms = [np.linalg.norm(grad_f(x))]
for _ in range(iterations):
x = x - step_size * grad_f(x)
function_gap.append(abs(f(x) - f_opt))
domain_gap.append(np.linalg.norm(x - x_opt))
grad_norms.append(np.linalg.norm(grad_f(x)))
return function_gap, domain_gap, grad_norms, step_size, 0
def heavy_ball(f, grad_f, x_0, iterations, x_opt, mu, L):
### YOUR CODE HERE
return
return function_gap, domain_gap, grad_norms, alpha, beta
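# One possible reference sketch of the heavy-ball method (not necessarily the intended
# solution): it assumes the classical Polyak parameters for mu-strongly convex, L-smooth
# quadratics, alpha = 4/(sqrt(L)+sqrt(mu))^2 and beta = ((sqrt(L)-sqrt(mu))/(sqrt(L)+sqrt(mu)))^2,
# and mirrors the bookkeeping of gradient_descent above.
def heavy_ball_sketch(f, grad_f, x_0, iterations, x_opt, mu, L):
    alpha = 4 / (np.sqrt(L) + np.sqrt(mu)) ** 2
    beta = ((np.sqrt(L) - np.sqrt(mu)) / (np.sqrt(L) + np.sqrt(mu))) ** 2
    x_prev = x_0.copy()
    x = x_0.copy()
    f_opt = f(x_opt)
    function_gap = [abs(f(x) - f_opt)]
    domain_gap = [np.linalg.norm(x - x_opt)]
    grad_norms = [np.linalg.norm(grad_f(x))]
    for _ in range(iterations):
        # x_{k+1} = x_k - alpha * grad f(x_k) + beta * (x_k - x_{k-1})
        x_next = x - alpha * grad_f(x) + beta * (x - x_prev)
        x_prev, x = x, x_next
        function_gap.append(abs(f(x) - f_opt))
        domain_gap.append(np.linalg.norm(x - x_opt))
        grad_norms.append(np.linalg.norm(grad_f(x)))
    return function_gap, domain_gap, grad_norms, alpha, beta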
def nesterov_accelerated_gradient(f, grad_f, x_0, iterations, x_opt, mu, L):
### YOUR CODE HERE
return
return function_gap, domain_gap, grad_norms, alpha, beta
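# One possible reference sketch of Nesterov acceleration (not necessarily the intended
# solution): it assumes the strongly convex parameters alpha = 1/L and
# beta = (sqrt(L)-sqrt(mu))/(sqrt(L)+sqrt(mu)), with the gradient taken at the extrapolated point.
def nesterov_sketch(f, grad_f, x_0, iterations, x_opt, mu, L):
    alpha = 1 / L
    beta = (np.sqrt(L) - np.sqrt(mu)) / (np.sqrt(L) + np.sqrt(mu))
    x_prev = x_0.copy()
    x = x_0.copy()
    f_opt = f(x_opt)
    function_gap = [abs(f(x) - f_opt)]
    domain_gap = [np.linalg.norm(x - x_opt)]
    grad_norms = [np.linalg.norm(grad_f(x))]
    for _ in range(iterations):
        # y_k = x_k + beta * (x_k - x_{k-1});  x_{k+1} = y_k - alpha * grad f(y_k)
        y = x + beta * (x - x_prev)
        x_prev, x = x, y - alpha * grad_f(y)
        function_gap.append(abs(f(x) - f_opt))
        domain_gap.append(np.linalg.norm(x - x_opt))
        grad_norms.append(np.linalg.norm(grad_f(x)))
    return function_gap, domain_gap, grad_norms, alpha, beta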
def run_experiment(params):
A, b, x_0, x_opt = generate_problem(n=params["n"], mu=params["mu"], L=params["L"],
problem_type=params["problem_type"])
eigs = np.linalg.eigvalsh(A)
# Update mu and L based on the spectrum
mu_val, L_val = min(eigs), max(eigs)
f = lambda x: 0.5 * x.T @ A @ x - b.T @ x
grad_f = lambda x: A @ x - b
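    # Step-size choice: 2/(mu+L) is the classical optimal fixed step for mu-strongly convex,
    # L-smooth quadratics; fall back to the safe 1/L step when the smallest eigenvalue is
    # (numerically) zero. beta below is the Nesterov momentum coefficient, kept for reference.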
if mu_val <= 1e-2:
alpha = 1 / L_val
else:
alpha = 2 / (mu_val + L_val)
beta = (np.sqrt(L_val) - np.sqrt(mu_val)) / (np.sqrt(L_val) + np.sqrt(mu_val))
results = {
"methods": {
"Gradient Descent": gradient_descent(f, grad_f, x_0, alpha, params["iterations"], x_opt),
"Heavy Ball": heavy_ball(f, grad_f, x_0, params["iterations"], x_opt, mu_val, L_val),
"NAG": nesterov_accelerated_gradient(f, grad_f, x_0, params["iterations"], x_opt, mu_val, L_val),
},
"problem": {
"eigs": eigs,
"params": params
}
}
return results
def plot_results(results):
# Define linestyles for methods
linestyles = {
"Gradient Descent": "r-",
"Heavy Ball": "g--",
"NAG": "m:",
}
params = results["problem"]["params"]
n = params["n"]
problem_type = params["problem_type"]
mu_val = params["mu"]
L_val = params["L"]
# Create a figure with 3 subplots for function gap, domain gap, and gradient norm
plt.figure(figsize=(10, 3))
plt.suptitle(f"{'Strongly convex' if mu_val > 1e-2 else 'Convex'} quadratics: n={n}, {problem_type} matrix, μ={mu_val}, L={L_val}")
# Plot 1: Function gap
plt.subplot(1, 3, 1)
for method, result in results["methods"].items():
# result = (function_gap, domain_gap, grad_norms, alpha, beta)
plt.semilogy(result[0], linestyles[method])
plt.xlabel('Iteration')
plt.ylabel(r'$|f(x) - f^*|$')
plt.grid(linestyle=":")
plt.title("Function Gap")
# Plot 2: Domain gap (||x - x*||)
plt.subplot(1, 3, 2)
for method, result in results["methods"].items():
plt.semilogy(result[1], linestyles[method])
plt.xlabel('Iteration')
plt.ylabel(r'$\|x - x^*\|_2$')
plt.grid(linestyle=":")
plt.title("Domain Gap")
# Plot 3: Norm of Gradient
plt.subplot(1, 3, 3)
    for method, result in results["methods"].items():
        # Show the step size (and momentum, when defined) in the legend
        if method == "Gradient Descent":
            label = f"{method}. α={format_num(result[3])}"
        elif mu_val > 1e-2:
            label = f"{method}. α={format_num(result[3])}, β={format_num(result[4])}"
        else:
            label = method
        plt.semilogy(result[2], linestyles[method], label=label)
plt.xlabel('Iteration')
plt.ylabel(r'$\|\nabla f(x)\|_2$')
plt.grid(linestyle=":")
plt.title("Gradient Norm")
# Place the legend below the plots
plt.figlegend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.05))
plt.tight_layout()
plt.savefig(f"agd_{problem_type}_{mu_val}_{L_val}_{n}.pdf", bbox_inches='tight')
    plt.show()

# Experiment parameters
params = {
"n": 60,
"mu": 0,
"L": 10,
"iterations": 800,
"problem_type": "random", # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)

# Experiment parameters
params = {
"n": 60,
"mu": 1,
"L": 10,
"iterations": 100,
"problem_type": "random", # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)

# Experiment parameters
params = {
"n": 60,
"mu": 1,
"L": 1000,
"iterations": 1000,
"problem_type": "random", # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)

# Experiment parameters
params = {
"n": 1000,
"mu": 1,
"L": 10,
"iterations": 100,
"problem_type": "random", # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)

# Experiment parameters
params = {
"n": 1000,
"mu": 1,
"L": 1000,
"iterations": 1000,
"problem_type": "random", # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)

# Logreg
import numpy as np
import matplotlib.pyplot as plt
import cvxpy as cp
import jax
import jax.numpy as jnp
from jax import grad, jit, hessian
from scipy.optimize import minimize_scalar
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.datasets as skldata
import time
from ucimlrepo import fetch_ucirepo
from optax.losses import safe_softmax_cross_entropy as cros_entr
# Set a random seed for reproducibility
np.random.seed(228)
key = jax.random.PRNGKey(228)  # JAX has no global seed; keep an explicit PRNG key instead
@jit
def logistic_loss(w, X, y, mu=1):
m, n = X.shape
return jnp.sum(jnp.logaddexp(0, -y * (X @ w))) / m + mu / 2 * jnp.sum(w**2)
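# Note: jnp.logaddexp(0, -y * (X @ w)) equals log(1 + exp(-y_i * x_i^T w)) elementwise,
# i.e. the standard logistic loss for labels y_i in {-1, +1}; generate_problem below
# therefore maps the labels to {-1, +1}.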
def generate_problem(m=1000, n=300, mu=1):
X, y = skldata.make_classification(n_classes=2, n_features=n, n_samples=m, n_informative=n//2, random_state=0)
X = jnp.array(X)
    # make_classification returns labels in {0, 1}; map them to {-1, +1} to match the loss above
    y = 2 * jnp.array(y) - 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
return X_train, y_train, X_test, y_test
def compute_optimal(X, y, mu):
w = cp.Variable(X.shape[1])
objective = cp.Minimize(cp.sum(cp.logistic(cp.multiply(-y, X @ w))) / len(y) + mu / 2 * cp.norm(w, 2)**2)
problem = cp.Problem(objective)
problem.solve()
return w.value, problem.value
@jit
def compute_accuracy(w, X, y):
# Compute predicted probabilities using the logistic (sigmoid) function
preds_probs = jax.nn.sigmoid(X @ w)
    # Convert probabilities to class predictions: -1 if p < 0.5, else +1 (labels are in {-1, +1})
    preds = jnp.where(preds_probs < 0.5, -1, 1)
# Calculate accuracy as the average of correct predictions
accuracy = jnp.mean(preds == y)
return accuracy
# @jit
def compute_metrics(trajectory, x_star, f_star, times, X_train, y_train, X_test, y_test, mu):
f = lambda w: logistic_loss(w, X_train, y_train, mu)
metrics = {
"f_gap": [jnp.abs(f(x) - f_star) for x in trajectory],
"x_gap": [jnp.linalg.norm(x - x_star) for x in trajectory],
"time": times,
"train_acc": [compute_accuracy(x, X_train, y_train) for x in trajectory],
"test_acc": [compute_accuracy(x, X_test, y_test) for x in trajectory],
}
return metrics
def gradient_descent(w_0, X, y, learning_rate=0.01, num_iters=100, mu=0):
trajectory = [w_0]
times = [0]
w = w_0
f = lambda w: logistic_loss(w, X, y, mu)
iter_start = time.time()
for i in range(num_iters):
grad_val = grad(f)(w)
if learning_rate == "linesearch":
# Simple line search implementation
phi = lambda alpha: f(w - alpha*grad_val)
result = minimize_scalar(fun=phi,
bounds=(1e-3, 2e2)
)
step_size = result.x
else:
step_size = learning_rate
w -= step_size * grad_val
iter_time = time.time()
trajectory.append(w)
times.append(iter_time - iter_start)
return trajectory, times
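# Hypothetical usage (illustrative only): 50 gradient steps with scalar line search
# traj, times = gradient_descent(w_0, X_train, y_train, learning_rate="linesearch", num_iters=50, mu=1)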
def heavy_ball_method(w_0, X, y, learning_rate=0.01, momentum=0.9, num_iters=100, mu=0):
"""
Polyak Heavy-Ball Method implementation:
Given:
x^+ = x - α ∇f(x) (Gradient step)
d_k = β_k (x_k - x_{k-1}) (Momentum term)
The update is:
x_{k+1} = x_k^+ + d_k = x_k - α ∇f(x_k) + d_k
Parameters:
w_0 : initial point
X, y : data used in the objective function (e.g. logistic loss)
learning_rate: α, step size for the gradient step
momentum : β, momentum coefficient (can be constant or adjusted per iteration)
num_iters : number of iterations to run
mu : strong convexity parameter (not used in this explicit update)
Returns:
trajectory : list of iterates x_k (after the gradient update)
times : list of times (elapsed) at each iteration
"""
### YOUR CODE HERE
return
return trajectory, times
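# One possible reference sketch (not necessarily the intended solution) following the
# update in the docstring above: x_{k+1} = x_k - alpha * grad f(x_k) + beta * (x_k - x_{k-1}).
def heavy_ball_logreg_sketch(w_0, X, y, learning_rate=0.01, momentum=0.9, num_iters=100, mu=0):
    f = lambda w: logistic_loss(w, X, y, mu)
    w_prev = w_0
    w = w_0
    trajectory = [w_0]
    times = [0]
    iter_start = time.time()
    for _ in range(num_iters):
        grad_val = grad(f)(w)
        # Gradient step plus momentum term d_k = beta * (x_k - x_{k-1})
        w_next = w - learning_rate * grad_val + momentum * (w - w_prev)
        w_prev, w = w, w_next
        trajectory.append(w)
        times.append(time.time() - iter_start)
    return trajectory, times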
def nesterov_accelerated_gradient(w_0, X, y, learning_rate=0.01, momentum=0.9, num_iters=100, mu=0):
"""
Nesterov Accelerated Gradient (NAG) implementation using explicit momentum:
Given:
x^+ = x - α ∇f(x) (Gradient step)
d_k = β_k (x_k - x_{k-1}) (Momentum term)
The update is:
x_{k+1} = (x_k + d_k)^+ = (x_k + d_k) - α ∇f(x_k + d_k)
Parameters:
w_0 : initial point
X, y : data used in the objective function (e.g. logistic loss)
learning_rate: α, step size for the gradient step
momentum : β, momentum coefficient (can be constant or adjusted per iteration)
num_iters : number of iterations to run
mu : strong convexity parameter (not used in this explicit update)
Returns:
trajectory : list of iterates x_k (after the gradient update)
times : list of times (elapsed) at each iteration
"""
### YOUR CODE HERE
return
return trajectory, times
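# One possible reference sketch (not necessarily the intended solution) following the
# lookahead update in the docstring above: x_{k+1} = (x_k + d_k) - alpha * grad f(x_k + d_k).
def nag_logreg_sketch(w_0, X, y, learning_rate=0.01, momentum=0.9, num_iters=100, mu=0):
    f = lambda w: logistic_loss(w, X, y, mu)
    w_prev = w_0
    w = w_0
    trajectory = [w_0]
    times = [0]
    iter_start = time.time()
    for _ in range(num_iters):
        lookahead = w + momentum * (w - w_prev)  # x_k + d_k
        w_prev, w = w, lookahead - learning_rate * grad(f)(lookahead)
        trajectory.append(w)
        times.append(time.time() - iter_start)
    return trajectory, times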
def run_experiments(params):
mu = params["mu"]
m, n = params["m"], params["n"]
methods = params["methods"]
results = {}
X_train, y_train, X_test, y_test = generate_problem(m, n, mu)
n_features = X_train.shape[1] # Number of features
params["n_features"] = n_features
x_0 = jax.random.normal(jax.random.PRNGKey(0), (n_features, ))
x_star, f_star = compute_optimal(X_train, y_train, mu)
for method in methods:
if method["method"] == "GD":
learning_rate = method["learning_rate"]
iterations = method["iterations"]
trajectory, times = gradient_descent(x_0, X_train, y_train, learning_rate, iterations, mu)
label = method["method"] + " " + str(learning_rate)
results[label] = compute_metrics(trajectory, x_star, f_star, times, X_train, y_train, X_test, y_test, mu)
elif method["method"] == "Heavy Ball":
learning_rate = method.get("learning_rate", 0.01)
momentum = method.get("momentum", 0.9)
iterations = method["iterations"]
trajectory, times = heavy_ball_method(x_0, X_train, y_train, learning_rate, momentum, iterations, mu)
# label = method["method"] + " " + str(learning_rate)
label = f"{method['method']}. α={learning_rate:.2e}. β={momentum:.2e}"
results[label] = compute_metrics(trajectory, x_star, f_star, times, X_train, y_train, X_test, y_test, mu)
elif method["method"] == "NAG":
learning_rate = method.get("learning_rate", 0.01)
momentum = method.get("momentum", 0.9)
iterations = method["iterations"]
trajectory, times = nesterov_accelerated_gradient(x_0, X_train, y_train, learning_rate, momentum, iterations, mu)
label = f"{method['method']}. α={learning_rate:.2e}. β={momentum:.2e}"
results[label] = compute_metrics(trajectory, x_star, f_star, times, X_train, y_train, X_test, y_test, mu)
return results, params
def plot_results(results, params):
plt.figure(figsize=(11, 2.6))
mu = params["mu"]
if mu > 1e-2:
plt.suptitle(f"Strongly convex binary logistic regression. mu={mu}.")
else:
plt.suptitle(f"Convex binary logistic regression. mu={mu}.")
plt.subplot(1, 4, 1)
for method, metrics in results.items():
plt.plot(metrics['f_gap'])
plt.xlabel('Iteration')
plt.ylabel(r'$|f(x) -f^*|$')
plt.yscale('log')
plt.grid(linestyle=":")
plt.subplot(1, 4, 2)
for method, metrics in results.items():
plt.plot(metrics['x_gap'], label=method)
plt.xlabel('Iteration')
    plt.ylabel(r'$\|x_k - x^*\|$')
plt.yscale('log')
plt.grid(linestyle=":")
plt.subplot(1, 4, 3)
for method, metrics in results.items():
plt.plot(metrics["train_acc"])
plt.xlabel('Iteration')
plt.ylabel('Train accuracy')
# plt.yscale('log')
plt.grid(linestyle=":")
plt.subplot(1, 4, 4)
for method, metrics in results.items():
plt.plot(metrics["test_acc"])
plt.xlabel('Iteration')
plt.ylabel('Test accuracy')
# plt.yscale('log')
plt.grid(linestyle=":")
# Place the legend below the plots
plt.figlegend(loc='lower center', ncol=5, bbox_to_anchor=(0.5, -0.00))
# Adjust layout to make space for the legend below
    # Build the output filename from the method labels and the regularization parameter
    filename = "_".join(results.keys()) + f"_{mu}.pdf"
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.savefig(filename)
    plt.show()
params = {
"mu": 0,
"m": 1000,
"n": 100,
"methods": [
{
"method": "GD",
"learning_rate":9e4,
"iterations": 400,
},
{
"method": "Heavy Ball",
"learning_rate": 9e-1,
"iterations": 400,
"momentum": 0.99,
},
{
"method": "NAG",
"learning_rate": 9e-1,
"iterations": 400,
"momentum": 0.99,
},
]
}
results, params = run_experiments(params)
plot_results(results, params)

params = {
"mu": 1,
"m": 1000,
"n": 100,
"methods": [
{
"method": "GD",
"learning_rate": 5e-2,
"iterations": 300,
},
{
"method": "Heavy Ball",
"learning_rate": 5e-2,
"iterations": 300,
"momentum": 0.9,
},
{
"method": "NAG",
"learning_rate": 5e-2,
"iterations": 300,
"momentum": 0.9,
},
]
}
results, params = run_experiments(params)
plot_results(results, params)