From bc573d3a32465189afdbee3f5550cf2b64a1bd43 Mon Sep 17 00:00:00 2001
From: nhamanasu
Date: Thu, 28 Nov 2024 10:31:36 +0900
Subject: [PATCH] Add RAdamScheduleFree and RAdamScheduleFreeClosure

---
 README.md                                  |   8 +-
 schedulefree/__init__.py                   |   2 +
 schedulefree/radam_schedulefree.py         | 235 +++++++++++++++++++++
 schedulefree/radam_schedulefree_closure.py | 219 +++++++++++++++++++
 schedulefree/test_schedulefree.py          |  54 ++++-
 5 files changed, 516 insertions(+), 2 deletions(-)
 create mode 100644 schedulefree/radam_schedulefree.py
 create mode 100644 schedulefree/radam_schedulefree_closure.py

diff --git a/README.md b/README.md
index 44a8e26..4b9c421 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,13 @@ Authors: Aaron Defazio, Xingyu (Alice) Yang, Harsh Mehta, Konstantin Mishchenko,
 ```
 pip install schedulefree
 ```
-Primary implementations are `SGDScheduleFree` and `AdamWScheduleFree`. We also have `AdamWScheduleFreeReference` and `SGDScheduleFreeReference` versions which have a simplified implementation, but which use more memory. To combine with other optimizers, use the experimental ScheduleFreeWrapper version.
+We provide several Schedule-Free optimizer implementations:
+- `SGDScheduleFree` and `SGDScheduleFreeReference`: Schedule-free variants of SGD
+- `AdamWScheduleFree` and `AdamWScheduleFreeReference`: Schedule-free variants of AdamW
+- `RAdamScheduleFree`: Schedule-free variant of RAdam, which eliminates the need for both a learning-rate schedule and a warmup specification
+- The experimental `ScheduleFreeWrapper`, for combining Schedule-Free with other optimizers
+
+The `ScheduleFreeReference` versions have a simplified implementation but use more memory. There are also `ScheduleFreeClosure` versions, which can be used with PyTorch's optimizer step closures.
 
 A [Jax implementation](https://optax.readthedocs.io/en/latest/api/contrib.html#schedule-free) is available as part of Optax.

diff --git a/schedulefree/__init__.py b/schedulefree/__init__.py
index a77b4c5..b4f9de6 100644
--- a/schedulefree/__init__.py
+++ b/schedulefree/__init__.py
@@ -3,6 +3,8 @@
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+from .radam_schedulefree_closure import RAdamScheduleFreeClosure
+from .radam_schedulefree import RAdamScheduleFree
 from .adamw_schedulefree_closure import AdamWScheduleFreeClosure
 from .adamw_schedulefree import AdamWScheduleFree
 from .adamw_schedulefree_reference import AdamWScheduleFreeReference

diff --git a/schedulefree/radam_schedulefree.py b/schedulefree/radam_schedulefree.py
new file mode 100644
index 0000000..396766d
--- /dev/null
+++ b/schedulefree/radam_schedulefree.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from typing import Tuple, Union, Optional, Iterable, Dict, Callable, Any
from typing_extensions import TypeAlias
import torch
import torch.optim
try:
    from torch.optim.optimizer import ParamsT
except ImportError:
    ParamsT: TypeAlias = Union[Iterable[torch.Tensor], Iterable[Dict[str, Any]]]
import math

class RAdamScheduleFree(torch.optim.Optimizer):
    r"""
    Schedule-Free RAdam
    Neither a warmup hyperparameter nor a learning-rate scheduler is needed
    with this optimizer.

    This optimizer requires that .train() and .eval() be called before the
    beginning of training and evaluation respectively.
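    In eval mode, the model parameters hold the averaged iterate x, which is
    the point that should be used for evaluation; in train mode, they hold
    the gradient-evaluation point y.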
    The optimizer should also be placed in eval mode when saving checkpoints.

    Arguments:
        params (iterable):
            Iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float):
            Learning rate parameter (default 0.0025).
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999)).
        eps (float):
            Term added to the denominator outside of the root operation to
            improve numerical stability (default: 1e-8).
        weight_decay (float):
            Weight decay, i.e., an L2 penalty (default: 0).
        r (float): Use polynomial weighting in the average
            with power r (default: 0).
        weight_lr_power (float): During warmup, the weights in the average will
            be equal to lr raised to this power. Set to 0 for no weighting
            (default: 2.0).
        foreach (bool): Use a foreach-backed implementation of the optimizer.
            Should be significantly faster, but will have higher peak memory
            usage (default: True if supported in your PyTorch version).
        silent_sgd_phase (bool): If True, the optimizer will not use the first SGD phase of RAdam.
            This means that the optimizer will not update model parameters during the early
            training steps (e.g., < 5 when β_2 = 0.999), but will only update the optimizer's
            momentum values. This helps stabilize training by ensuring smoother warmup behavior
            and a more reliable calculation of the moving-average coefficient (`ckp1`).
            Recommended to set to True (default: True).
    """

    def __init__(self,
                 params: ParamsT,
                 lr: Union[float, torch.Tensor] = 0.0025,
                 betas: Tuple[float, float] = (0.9, 0.999),
                 eps: float = 1e-8,
                 weight_decay: float = 0,
                 r: float = 0.0,
                 weight_lr_power: float = 2.0,
                 foreach: Optional[bool] = hasattr(torch, "_foreach_mul_"),
                 silent_sgd_phase: bool = True
                 ):

        defaults = dict(lr=lr,
                        betas=betas,
                        eps=eps,
                        r=r,
                        k=0,
                        train_mode=False,
                        weight_sum=0.0,
                        lr_max=-1.0,
                        scheduled_lr=0.0,
                        weight_lr_power=weight_lr_power,
                        weight_decay=weight_decay,
                        foreach=foreach,
                        silent_sgd_phase=silent_sgd_phase)
        super().__init__(params, defaults)

    @torch.no_grad()
    def eval(self):
        for group in self.param_groups:
            train_mode = group["train_mode"]
            beta1, _ = group["betas"]
            if train_mode:
                for p in group["params"]:
                    state = self.state[p]
                    if "z" in state:
                        # Set p to x (the averaged iterate)
                        p.lerp_(end=state["z"].to(p.device), weight=1 - 1 / beta1)
                group["train_mode"] = False

    @torch.no_grad()
    def train(self):
        for group in self.param_groups:
            train_mode = group["train_mode"]
            beta1, _ = group["betas"]
            if not train_mode:
                for p in group["params"]:
                    state = self.state[p]
                    if "z" in state:
                        # Set p to y (the gradient-evaluation point)
                        p.lerp_(end=state["z"].to(p.device), weight=1 - beta1)
                group["train_mode"] = True

    @torch.no_grad()
    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        if not self.param_groups[0]["train_mode"]:
            raise Exception(
                "Optimizer was not in train mode when step() was called. "
                "Please insert .train() and .eval() calls on the "
                "optimizer. See documentation for details."
            )

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            eps = group["eps"]
            beta1, beta2 = group["betas"]
            decay = group["weight_decay"]
            silent_sgd_phase = group["silent_sgd_phase"]
            k = group["k"]  # number of completed steps
            step = k + 1
            r = group["r"]
            weight_lr_power = group["weight_lr_power"]

            beta2_t = beta2**step
            bias_correction2 = 1 - beta2_t

            # maximum length of the approximated SMA
            rho_inf = 2 / (1 - beta2) - 1
            # compute the length of the approximated SMA
            rho_t = rho_inf - 2 * step * beta2_t / bias_correction2
            rect = (
                ((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t)) ** 0.5
                if rho_t > 4.0
                else float(not silent_sgd_phase)
            )

            lr = group["lr"] * rect
            group["scheduled_lr"] = lr  # For logging purposes

            lr_max = group["lr_max"] = max(lr, group["lr_max"])

            weight = (step**r) * (lr_max**weight_lr_power)
            weight_sum = group["weight_sum"] = group["weight_sum"] + weight

            try:
                ckp1 = weight / weight_sum
            except ZeroDivisionError:
                ckp1 = 0

            adaptive_y_lr = lr * (beta1 * (1 - ckp1) - 1)
            active_p = [p for p in group["params"] if p.grad is not None]

            for p in active_p:
                if "z" not in self.state[p]:
                    self.state[p]["z"] = torch.clone(p, memory_format=torch.preserve_format)
                    self.state[p]["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)

            if group["foreach"] and len(active_p) > 0:
                y, grad, exp_avg_sq, z = zip(
                    *[(p, p.grad, self.state[p]["exp_avg_sq"], self.state[p]["z"]) for p in active_p]
                )

                # Decay the second moment running average coefficient
                torch._foreach_mul_(exp_avg_sq, beta2)
                torch._foreach_addcmul_(exp_avg_sq, grad, grad, value=1 - beta2)

                if rho_t > 4.0:
                    # Adam step
                    denom = torch._foreach_div(exp_avg_sq, bias_correction2)
                    torch._foreach_sqrt_(denom)
                    torch._foreach_add_(denom, eps)

                    # Normalize grad in-place for memory efficiency
                    torch._foreach_div_(grad, denom)

                # Weight decay calculated at y
                if decay != 0:
                    torch._foreach_add_(grad, y, alpha=decay)

                # These operations update y in-place,
                # without computing x explicitly.
                torch._foreach_lerp_(y, z, weight=ckp1)
                torch._foreach_add_(y, grad, alpha=adaptive_y_lr)

                # z step
                torch._foreach_sub_(z, grad, alpha=lr)
            else:
                for p in active_p:
                    y = p  # Notation to match theory
                    grad = p.grad

                    state = self.state[p]

                    z = state["z"]
                    exp_avg_sq = state["exp_avg_sq"]

                    # Decay the second moment running average coefficient
                    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                    if rho_t > 4.0:
                        # Adam step
                        denom = exp_avg_sq.div(bias_correction2).sqrt_().add_(eps)

                        # Reuse grad buffer for memory efficiency
                        grad_normalized = grad.div_(denom)
                    else:
                        # Fall back to SGD (or no update during the silent phase)
                        grad_normalized = grad

                    # Weight decay calculated at y
                    if decay != 0:
                        grad_normalized.add_(y, alpha=decay)

                    # These operations update y in-place,
                    # without computing x explicitly.
                    y.lerp_(end=z, weight=ckp1)
                    y.add_(grad_normalized, alpha=adaptive_y_lr)

                    # z step
                    z.sub_(grad_normalized, alpha=lr)

            group["k"] = k + 1
        return loss

diff --git a/schedulefree/radam_schedulefree_closure.py b/schedulefree/radam_schedulefree_closure.py
new file mode 100644
index 0000000..c0d6a99
--- /dev/null
+++ b/schedulefree/radam_schedulefree_closure.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from typing import Tuple, Union, Optional, Iterable, Dict, Callable, Any
from typing_extensions import TypeAlias
import torch
import torch.optim
try:
    from torch.optim.optimizer import ParamsT
except ImportError:
    ParamsT: TypeAlias = Union[Iterable[torch.Tensor], Iterable[Dict[str, Any]]]
import math

class RAdamScheduleFreeClosure(torch.optim.Optimizer):
    r"""
    Schedule-Free RAdam
    Neither a warmup hyperparameter nor a learning-rate scheduler is needed
    with this optimizer.

    This "closure" version requires that step be called with a closure; see
    https://pytorch.org/docs/stable/optim.html#optimizer-step-closure.

    Arguments:
        params (iterable):
            Iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float):
            Learning rate parameter (default 0.0025).
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999)).
        eps (float):
            Term added to the denominator outside of the root operation to
            improve numerical stability (default: 1e-8).
        weight_decay (float):
            Weight decay, i.e., an L2 penalty (default: 0).
        r (float): Use polynomial weighting in the average
            with power r (default: 0).
        weight_lr_power (float): During warmup, the weights in the average will
            be equal to lr raised to this power. Set to 0 for no weighting
            (default: 2.0).
        foreach (bool): Use a foreach-backed implementation of the optimizer.
            Should be significantly faster, but will have higher peak memory
            usage (default: True if supported in your PyTorch version).
        silent_sgd_phase (bool): If True, the optimizer will not use the first SGD phase of RAdam.
            This means that the optimizer will not update model parameters during the early
            training steps (e.g., < 5 when β_2 = 0.999), but will only update the optimizer's
            momentum values. This helps stabilize training by ensuring smoother warmup behavior
            and a more reliable calculation of the moving-average coefficient (`ckp1`).
            Recommended to set to True (default: True).
    """
    def __init__(self,
                 params: ParamsT,
                 lr: Union[float, torch.Tensor] = 0.0025,
                 betas: Tuple[float, float] = (0.9, 0.999),
                 eps: float = 1e-8,
                 weight_decay: float = 0,
                 r: float = 0,
                 weight_lr_power: float = 2.0,
                 foreach: Optional[bool] = hasattr(torch, "_foreach_mul_"),
                 silent_sgd_phase: bool = True
                 ):
        defaults = dict(lr=lr,
                        betas=betas,
                        eps=eps,
                        r=r,
                        k=0,
                        weight_sum=0.0,
                        lr_max=0.0,
                        scheduled_lr=0.0,
                        weight_lr_power=weight_lr_power,
                        weight_decay=weight_decay,
                        foreach=foreach,
                        silent_sgd_phase=silent_sgd_phase)

        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure: Callable[[], float]) -> Optional[float]:
        """Performs a single optimization step.

        Arguments:
            closure (callable): A closure that reevaluates the model
                and returns the loss. Unlike in the non-closure variant,
                the closure is required here, since the gradient must be
                evaluated at an extrapolated point.
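
        Example (an illustrative sketch; `model`, `criterion`, `inputs`, and
        `targets` are assumed to be defined by the caller):

            optimizer = RAdamScheduleFreeClosure(model.parameters())

            def closure():
                optimizer.zero_grad()
                loss = criterion(model(inputs), targets)
                loss.backward()
                return loss

            loss = optimizer.step(closure)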
+ """ + # Swap to extrapolated point: + for group in self.param_groups: + beta1, beta2 = group['betas'] + r = group['r'] + k = group['k'] + + for p in group['params']: + # State initialization + state = self.state[p] + if 'z' not in state: + state['z'] = torch.clone(p, memory_format=torch.preserve_format) + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if group['foreach']: + if len(group['params']) > 0: + y, z = zip(*[(p, self.state[p]['z']) for p in group['params']]) + + torch._foreach_lerp_(y, z, weight=1-beta1) + else: + for p in group['params']: + z = self.state[p]['z'] + + # Extrapolate p to equal y + p.lerp_(end=z, weight=1-beta1) + + # Evaluate gradient at extrapolated point + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + eps = group['eps'] + k = group['k'] + step = k + 1 + r = group['r'] + silent_sgd_phase = group["silent_sgd_phase"] + decay = group['weight_decay'] + beta1, beta2 = group['betas'] + weight_lr_power = group['weight_lr_power'] + + beta2_t = beta2**step + bias_correction2 = 1 - beta2_t + + # maximum length of the approximated SMA + rho_inf = 2 / (1 - beta2) - 1 + # compute the length of the approximated SMA + rho_t = rho_inf - 2 * step * beta2_t / bias_correction2 + rect = ( + ((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t)) ** 0.5 + if rho_t > 4.0 + else float(not silent_sgd_phase) + ) + + lr = group["lr"] * rect + group['scheduled_lr'] = lr # For logging purposes + + lr_max = group['lr_max'] = max(lr, group['lr_max']) + + weight = (step**r) * (lr_max**weight_lr_power) + weight_sum = group['weight_sum'] = group['weight_sum'] + weight + + try: + ckp1 = weight/weight_sum + except ZeroDivisionError: + ckp1 = 0 + + adaptive_y_lr = lr * (beta1 * (1 - ckp1) - 1) + active_p = [p for p in group['params'] if p.grad is not None] + + if group['foreach'] and len(active_p) > 0: + y, grad, exp_avg_sq, z = zip(*[(p, + p.grad, + self.state[p]['exp_avg_sq'], + self.state[p]['z']) + for p in active_p]) + + # Decay the first and second moment running average coefficient + torch._foreach_mul_(exp_avg_sq, beta2) + torch._foreach_addcmul_(exp_avg_sq, grad, grad, value=1-beta2) + + if rho_t > 4.0: + # Adam step + denom = torch._foreach_div(exp_avg_sq, bias_correction2) + torch._foreach_sqrt_(denom) + torch._foreach_add_(denom, eps) + + # normalize grad in-place + torch._foreach_div_(grad, denom) + + # Weight decay calculated at y + if decay != 0: + torch._foreach_add_(grad, y, alpha=decay) + + # Unextrapolate + torch._foreach_lerp_(y, z, weight=1-1/beta1) + + torch._foreach_sub_(z, grad, alpha=lr) + + ### Take step + torch._foreach_lerp_(y, z, weight=ckp1) + else: + for p in active_p: + grad = p.grad + y = p # Notation to match theory + + state = self.state[p] + + exp_avg_sq = state['exp_avg_sq'] + z = state['z'] + + # Decay the first and second moment running average coefficient + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1-beta2) + + if rho_t > 4.0: + # Adam step + denom = exp_avg_sq.div(bias_correction2).sqrt_().add_(eps) + + grad_normalized = grad.div_(denom) + else: + # Fall back to SGD (or nothing) + grad_normalized = grad + + # Weight decay calculated at y + if decay != 0: + grad_normalized.add_(y, alpha=decay) + + # Unextrapolate + x = y.lerp_(end=z, weight=1-1/beta1) + + z.sub_(grad_normalized, alpha=lr) + + ### Take step + x.lerp_(end=z, weight=ckp1) + + group['k'] = k+1 + return loss diff --git a/schedulefree/test_schedulefree.py b/schedulefree/test_schedulefree.py 
index 257694b..52f33bb 100644
--- a/schedulefree/test_schedulefree.py
+++ b/schedulefree/test_schedulefree.py
@@ -6,6 +6,7 @@ import torch
 from schedulefree import (SGDScheduleFree, SGDScheduleFreeClosure,
                           AdamWScheduleFree, AdamWScheduleFreeClosure,
                           AdamWScheduleFreeReference,
+                          RAdamScheduleFree, RAdamScheduleFreeClosure,
                           ScheduleFreeWrapper, ScheduleFreeWrapperReference,
                           SGDScheduleFreeReference)
 def allclose(x, y):
@@ -186,6 +187,56 @@ def closure():
     allclose(y, y_closure)
     allclose(y, p_reference.data)
 
+def test_schedulefree_radam():
+    decay = 0.5
+    weight_closure = torch.randn(3, 2).cuda().requires_grad_()
+    weight = torch.clone(weight_closure.data).requires_grad_()
+    # Note: the RAdam variants take no warmup_steps argument; warmup is
+    # handled implicitly by the rectification term.
+    optimizer_closure = RAdamScheduleFreeClosure([weight_closure], lr=0.3, weight_decay=decay)
+    optimizer = RAdamScheduleFree([weight], lr=0.3, weight_decay=decay)
+
+    for step_idx in range(10):
+        optimizer.train()
+        grad = torch.rand_like(weight)
+
+        weight.grad = torch.clone(grad)
+
+        def closure():
+            weight_closure.grad = torch.clone(grad)
+
+        optimizer.step()
+        optimizer_closure.step(closure=closure)
+
+        optimizer.eval()
+
+        for group_closure, group in zip(optimizer_closure.param_groups, optimizer.param_groups):
+            for p_closure, p in zip(group_closure['params'], group['params']):
+                state_closure = optimizer_closure.state[p_closure]
+                state = optimizer.state[p]
+
+                z_closure = state_closure['z']
+                z = state['z']
+
+                allclose(p, p_closure)
+                allclose(z, z_closure)
+
+        optimizer.train()
+
+        for group_closure, group in zip(optimizer_closure.param_groups, optimizer.param_groups):
+            for p_closure, p in zip(group_closure['params'], group['params']):
+                state_closure = optimizer_closure.state[p_closure]
+                state = optimizer.state[p]
+
+                z_closure = state_closure['z']
+                z = state['z']
+
+                # After .train(), p already equals y; extrapolate the closure
+                # version's p (which stores x) to obtain its y.
+                y = p.data
+                y_closure = p_closure.lerp(end=z_closure, weight=1 - 0.9)
+
+                allclose(y, y_closure)
+
 def test_foreach():
     decay = 0.5
     warmup = 5
@@ -323,4 +374,5 @@ def opt_step():
 
     test_foreach()
     test_schedulefree_adam()
-    test_schedulefree_sgd()
\ No newline at end of file
+    test_schedulefree_sgd()
+    test_schedulefree_radam()
\ No newline at end of file
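---

For reference, a minimal training-loop sketch for the new non-closure optimizer (illustrative only; `model`, `loss_fn`, `train_loader`, `num_epochs`, and `evaluate` are assumed to be defined by the caller):

```python
from schedulefree import RAdamScheduleFree

# No learning-rate scheduler or warmup configuration is needed.
optimizer = RAdamScheduleFree(model.parameters(), lr=0.0025)

for epoch in range(num_epochs):
    optimizer.train()  # parameters set to the gradient-evaluation point y
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        optimizer.step()

    optimizer.eval()   # parameters set to the averaged iterate x
    evaluate(model)    # hypothetical evaluation helper
```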