Source code for ttml.tt_radam

"""Implements the Riemannian Adam stochastic gradient descent algorithm for
tensor trains.

This is an implementation of the algorithm in the paper 'Riemannian Adaptive
Optimization Methods' by Becigneul and Ganea (ICLR 2019).
"""

import autoray as ar
import numpy as np

from ttml.tt_opt import TensorTrainOptimizer
from ttml.utils import merge_sum, convert_backend


class TensorTrainSGD(TensorTrainOptimizer):
    """
    Riemannian Adam optimizer for tensor trains.

    Parameters
    ----------
    tt : TensorTrain
        TensorTrain to be optimized. During optimization it will be copied, not
        modified.
    y : array<float64>
        Target values. Should be flat array with same backend as tt.
    idx : array<int64> shape `(len(y),tt.order)`
        Indices of dense tensor corresponding to values `y`. Potential
        duplicate values in `idx` are automatically merged.
    batch_size : int
        Number of samples drawn for each call to `step`. Samples are drawn
        uniformly from all `len(y)` data points, with replacement.
    lr : float (default: 1.0)
        Learning rate, needs to be tuned for each problem.
    beta1 : float (default: 0.9)
        parameter between 0 and 1 determining the contribution of transport of
        previous search direction to current search direction
    beta2 : float (default: 0.9)
        parameter between 0 and 1 determining contribution of previous gradient
        norms to the stepsize.
    task : str (default: `"regression"`)
        Whether to perform regression or binary classification.

        - If `task="regression"` then MSE is minimized.
        - If `task="classification"`. The labels are assumed to be 0 or 1, and
          cross entropy is minimized. Note that predictions of the classifier
          will be on the logit scale, only the objective changes.
    sample_weight : array<float64> or None (default: None)
        Weights associated to all sample points. If None, use unit weight.
    red_idx : optional (default: None)
        Passed unchanged to `TensorTrainOptimizer`; see that class for its
        meaning.
    eps : float (default: 1e-30)
        Non-negative constant added to the square root of the accumulated
        squared gradient norm before dividing, so that the step size stays
        finite when every gradient seen so far is exactly zero. The default is
        far below double-precision resolution for ordinary gradient norms, so
        it does not alter regular steps. Use `eps=0` to recover the
        unregularized formula.
    **kwargs
        Remaining keyword arguments are passed to `TensorTrainOptimizer`.
    """

    def __init__(
        self,
        tt,
        y,
        idx,
        batch_size,
        lr=1.0,
        beta1=0.9,
        beta2=0.9,
        task="regression",
        sample_weight=None,
        red_idx=None,
        eps=1e-30,
        **kwargs
    ):
        if eps < 0:
            raise ValueError("eps must be non-negative")
        super().__init__(
            tt,
            y,
            idx,
            task=task,
            sample_weight=sample_weight,
            red_idx=red_idx,
            **kwargs
        )
        self.batch_size = batch_size
        self.N = len(y)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        # Both running averages are initialised lazily on the first `step`.
        self.momentum = None
        self.adaptive_term = None

    def step(self):
        """Do one step inplace.

        Draws a random mini-batch, computes the Riemannian gradient on it,
        updates the running averages and moves `self.tt` along the momentum
        direction.

        Returns
        -------
        loss
            Loss on the sampled batch, as returned by `self.egrad`.
        derivative
            Negative inner product of the batch Riemannian gradient with the
            momentum direction used for the update.
        step_size
            `lr / (sqrt(adaptive_term) + eps)`, the factor by which the
            momentum was scaled.
        """
        # Uniform sampling with replacement from the N training points.
        batch_indices = np.random.randint(0, self.N, self.batch_size)
        batch_indices = convert_backend(batch_indices, self.backend)
        if self.backend == "tensorflow":
            # The tensorflow backend selects rows through `take` instead of
            # indexing with an index array.
            batch_y = ar.do("take", self.y, batch_indices)
            batch_idx = ar.do("take", self.idx, batch_indices)
        else:
            batch_y = self.y[batch_indices]
            batch_idx = self.idx[batch_indices]

        loss, egrad = self.egrad(
            y=batch_y, idx=batch_idx, merge=False, normalize=True
        )
        # Sampling with replacement can repeat indices; merge their gradient
        # contributions before forming the Riemannian gradient.
        batch_red_idx, egrad = merge_sum(batch_idx, egrad)
        rgrad = self.tt.rgrad_sparse(egrad, batch_red_idx)

        if self.momentum is None:
            self.momentum = rgrad
        else:
            # Map the previous momentum with `grad_proj` at the current point
            # before mixing it with the new gradient.
            self.momentum = self.tt.grad_proj(self.momentum)
            self.momentum = (
                self.beta1 * self.momentum + (1 - self.beta1) * rgrad
            )

        rgrad_sq_norm = rgrad.norm() ** 2
        if self.adaptive_term is None:
            self.adaptive_term = rgrad_sq_norm
        else:
            self.adaptive_term = (
                self.beta2 * self.adaptive_term
                + (1 - self.beta2) * rgrad_sq_norm
            )

        # `eps` keeps the denominator positive: without it an accumulated
        # squared norm of exactly zero would cause a division by zero.
        step_size = self.lr / (ar.do("sqrt", self.adaptive_term) + self.eps)
        self.tt.apply_grad(self.momentum, alpha=-step_size, inplace=True)

        derivative = -rgrad @ self.momentum
        return loss, derivative, step_size