# Source code for ttml.tt_radam

"""Implements Riemannian ADAM stochastic gradient descent algorithm for tensor
trains. 

This is an implementation of the algorithm in the paper 'Riemannian Adaptive
Optimization Methods' by Becigneul and Ganea"""

import autoray as ar
import numpy as np

from ttml.tt_opt import TensorTrainOptimizer
from ttml.utils import merge_sum, convert_backend


class TensorTrainSGD(TensorTrainOptimizer):
    """
    Riemannian Adam optimizer for tensor trains.

    Despite the class name, the update performed by `step` is an Adam-style
    update: an exponential moving average of the Riemannian gradient is used
    as search direction, and the step size is scaled by the square root of
    an exponential moving average of the squared gradient norm.

    Parameters
    ----------
    tt : TensorTrain
        TensorTrain to be optimized. During optimization it will be copied, not
        modified.
    y : array<float64>
        Target values. Should be flat array with same backend as tt.
    idx : array<int64> shape `(len(y),tt.order)`
        Indices of dense tensor corresponding to values `y`. Potential
        duplicate values in `idx` are automatically merged.
    batch_size : int
        Number of samples drawn (uniformly at random, with replacement) for
        each call to `step`.
    lr : float (default: 1.0)
        Learning rate, needs to be tuned for each problem.
    beta1 : float (default: 0.9)
        parameter between 0 and 1 determining the contribution of transport of
        previous search direction to current search direction
    beta2 : float (default: 0.9)
        parameter between 0 and 1 determining contribution of previous gradient
        norms to the stepsize.
    task : str (default: `"regression"`)
        Whether to perform regression or binary classification.

        - If `task="regression"` then MSE is minimized.
        - If `task="classification"`. The labels are assumed to be 0 or 1, and
          cross entropy is minimized. Note that predictions of the classifier
          will be on the logit scale, only the objective changes.
    sample_weight : array<float64> or None (default: None)
        Weights associated to all sample points. If None, use unit weight.
    red_idx : object or None (default: None)
        Forwarded unchanged to `TensorTrainOptimizer.__init__`; see that class
        for its meaning.
    **kwargs
        Any further keyword arguments are forwarded unchanged to
        `TensorTrainOptimizer.__init__`.

    Attributes
    ----------
    momentum : tangent vector or None
        Current search direction. `None` until the first call to `step`.
    adaptive_term : scalar or None
        Running average of the squared Riemannian gradient norm. `None` until
        the first call to `step`.
    """

    def __init__(
        self,
        tt,
        y,
        idx,
        batch_size,
        lr=1.0,
        beta1=0.9,
        beta2=0.9,
        task="regression",
        sample_weight=None,
        red_idx=None,
        **kwargs
    ):
        super().__init__(
            tt,
            y,
            idx,
            task=task,
            sample_weight=sample_weight,
            red_idx=red_idx,
            **kwargs
        )
        self.batch_size = batch_size
        # Total number of samples; upper bound for the random batch indices.
        self.N = len(y)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        # Optimizer state, lazily initialised on the first call to `step`.
        self.momentum = None
        self.adaptive_term = None

    def step(self):
        """Do one optimization step in place on a random mini-batch.

        Returns
        -------
        loss
            Loss on the sampled batch, as returned by `self.egrad`.
        derivative
            Negative of the product `rgrad @ momentum` between the batch
            Riemannian gradient and the updated search direction.
        step_size
            Step length used for the update, i.e.
            `lr / sqrt(adaptive_term)`.
        """
        # `randint` draws independently, so the batch is sampled with
        # replacement and may contain repeated samples.
        batch_indices = np.random.randint(0, self.N, self.batch_size)
        batch_indices = convert_backend(batch_indices, self.backend)
        if self.backend == "tensorflow":
            # NOTE(review): presumably needed because TensorFlow tensors do
            # not support indexing with an integer array -- confirm.
            batch_y = ar.do("take", self.y, batch_indices)
            batch_idx = ar.do("take", self.idx, batch_indices)
        else:
            batch_y = self.y[batch_indices]
            batch_idx = self.idx[batch_indices]

        loss, egrad = self.egrad(
            y=batch_y, idx=batch_idx, merge=False, normalize=True
        )
        # The gradient is requested unmerged (`merge=False`) and merged here
        # together with the batch indices before forming the Riemannian
        # gradient.
        batch_red_idx, egrad = merge_sum(batch_idx, egrad)
        rgrad = self.tt.rgrad_sparse(egrad, batch_red_idx)

        if self.momentum is None:
            # NOTE(review): the first step uses the raw gradient, without the
            # (1 - beta1) factor or any bias correction.
            self.momentum = rgrad
        else:
            # `self.tt` has changed since the previous step, so the old search
            # direction is passed through `grad_proj` before it is mixed with
            # the new gradient.
            self.momentum = self.tt.grad_proj(self.momentum)
            self.momentum = (
                self.beta1 * self.momentum + (1 - self.beta1) * rgrad
            )

        squared_norm = rgrad.norm() ** 2
        if self.adaptive_term is None:
            self.adaptive_term = squared_norm
        else:
            self.adaptive_term = (
                self.beta2 * self.adaptive_term
                + (1 - self.beta2) * squared_norm
            )

        # NOTE(review): there is no epsilon in the denominator, so an exactly
        # zero running norm produces an infinite step size.
        step_size = self.lr / ar.do("sqrt", self.adaptive_term)
        self.tt.apply_grad(self.momentum, alpha=-step_size, inplace=True)
        derivative = -rgrad @ self.momentum
        return loss, derivative, step_size