Constrained sparsemax and softmax (jankrepl#37)
New allocators implemented via convex optimization
jankrepl authored Jun 3, 2020
1 parent a815fb8 commit 54e7dcf
Showing 5 changed files with 328 additions and 18 deletions.
7 changes: 5 additions & 2 deletions deepdow/layers/__init__.py
@@ -1,7 +1,9 @@
"""Collection of layers."""

from .collapse import AttentionCollapse, AverageCollapse, ElementCollapse, ExponentialCollapse, MaxCollapse, SumCollapse
from .allocate import AnalyticalMarkowitz, NCO, NumericalMarkowitz, Resample, SoftmaxAllocator
from .collapse import (AttentionCollapse, AverageCollapse, ElementCollapse, ExponentialCollapse,
MaxCollapse, SumCollapse)
from .allocate import (AnalyticalMarkowitz, NCO, NumericalMarkowitz, Resample, SoftmaxAllocator,
SparsemaxAllocator)
from .misc import Cov2Corr, CovarianceMatrix, KMeans, MultiplyByConstant
from .transform import Conv, RNN

@@ -21,4 +23,5 @@
'Resample',
'RNN',
'SoftmaxAllocator',
'SparsemaxAllocator',
'SumCollapse']
123 changes: 119 additions & 4 deletions deepdow/layers/allocate.py
@@ -357,16 +357,52 @@ class SoftmaxAllocator(torch.nn.Module):
Parameters
----------
temperature : None or float
If None, then needs to be provided per sample during forward pass. If ``float`` then assumed to be always
the same.
If None, then needs to be provided per sample during forward pass. If ``float`` then assumed
to be always the same.
formulation : str, {'analytical', 'variational'}
Controls how the problem is solved. If 'analytical' an explicit formula is used; however,
one cannot set a `max_weight` different from 1. If 'variational' the problem is solved via
convex optimization and any `max_weight` can be set.
n_assets : None or int
Only required and used if `formulation='variational'`.
max_weight : float
A float between (0, 1] representing the maximum weight per asset.
"""

def __init__(self, temperature=1):
def __init__(self, temperature=1, formulation='analytical', n_assets=None, max_weight=1):
super().__init__()

self.temperature = temperature

if formulation not in {'analytical', 'variational'}:
raise ValueError('Unrecognized formulation {}'.format(formulation))

if formulation == 'variational' and n_assets is None:
raise ValueError('One needs to provide n_assets for the variational formulation.')

if formulation == 'analytical' and max_weight != 1:
raise ValueError('Cannot constrain weights via max_weight for the analytical formulation')

if formulation == 'variational' and n_assets * max_weight < 1:
raise ValueError('One cannot create a fully invested portfolio with the given max_weight')

self.formulation = formulation

if formulation == 'analytical':
self.layer = torch.nn.Softmax(dim=1)
else:
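# Entropy-regularized objective: minimizing -x^T w - H(w) over the simplex
# recovers softmax(x); the upper bound additionally caps each weight at max_weight.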
x = cp.Parameter(n_assets)
w = cp.Variable(n_assets)
obj = -x * w - cp.sum(cp.entr(w))
cons = [cp.sum(w) == 1.,
w <= max_weight]
prob = cp.Problem(cp.Minimize(obj), cons)
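# CvxpyLayer wraps the problem as a differentiable module; gradients flow back to the parameter x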
self.layer = CvxpyLayer(prob, [x], [w])

def forward(self, x, temperature=None):
"""Perform forward pass.
@@ -398,4 +434,83 @@ def forward(self, x, temperature=None):

inp = x / temperature_[..., None]

return nn.functional.softmax(inp, dim=1)
return self.layer(inp) if self.formulation == 'analytical' else self.layer(inp)[0]


class SparsemaxAllocator(torch.nn.Module):
"""Portfolio creation by computing a sparsemax over the asset dimension with temperature.
Parameters
----------
n_assets : int
Number of assets. Note that we require this quantity at construction to make sure
the underlying cvxpylayer does not need to be reinitialized every forward pass.
temperature : None or float
If None, then needs to be provided per sample during forward pass. If ``float`` then
assumed to be always the same.
max_weight : float
A float between (0, 1] representing the maximum weight per asset.

References
----------
[1] Martins, Andre, and Ramon Astudillo. "From softmax to sparsemax: A sparse model of attention
and multi-label classification." International Conference on Machine Learning. 2016.
[2] Malaviya, Chaitanya, Pedro Ferreira, and André FT Martins. "Sparse and constrained attention
for neural machine translation." arXiv preprint arXiv:1805.08241 (2018).
"""

def __init__(self, n_assets, temperature=1, max_weight=1):
super().__init__()

if n_assets * max_weight < 1:
raise ValueError('One cannot create a fully invested portfolio with the given max_weight')

self.n_assets = n_assets
self.temperature = temperature

# Construct convex optimization problem
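# Sparsemax is the Euclidean projection of the (scaled) logits onto the capped probability simplex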
x = cp.Parameter(n_assets)
w = cp.Variable(n_assets)
obj = cp.sum_squares(x - w)
cons = [cp.sum(w) == 1,
0. <= w,
w <= max_weight]
prob = cp.Problem(cp.Minimize(obj), cons)

self.layer = CvxpyLayer(prob, parameters=[x], variables=[w])

def forward(self, x, temperature=None):
"""Perform forward pass.
Parameters
----------
x : torch.Tensor
Tensor of shape `(n_samples, n_assets)`.
temperature : None or torch.Tensor
If None, then using the `temperature` provided at construction time. Otherwise a
`torch.Tensor` of shape `(n_samples,)` representing a per sample temperature.

Returns
-------
weights : torch.Tensor
Tensor of shape `(n_samples, n_assets)`.
"""
n_samples, _ = x.shape
device, dtype = x.device, x.dtype

if not ((temperature is None) ^ (self.temperature is None)):
raise ValueError('Not clear which temperature to use')

if temperature is not None:
temperature_ = temperature # (n_samples,)
else:
temperature_ = self.temperature * torch.ones(n_samples, dtype=dtype, device=device)

inp = x / temperature_[..., None]

return self.layer(inp)[0]
59 changes: 59 additions & 0 deletions docs/source/layers.rst
@@ -224,6 +224,24 @@ performs a softmax over the input. Additionally, one can also provide custom :code:`temperature`.
Note that one can provide a single :code:`temperature` at construction that is shared across all samples. Alternatively,
one can provide per sample temperature when performing the forward pass.
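
A minimal sketch of the per sample variant (the inputs below are illustrative):

.. testcode::

    import torch

    from deepdow.layers import SoftmaxAllocator

    layer = SoftmaxAllocator(temperature=None)  # temperature must then be passed in the forward pass
    x = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
    per_sample_temperature = torch.tensor([0.5, 2.])

    w = layer(x, per_sample_temperature)

    assert w.shape == (2, 3)
    assert torch.allclose(w.sum(1), torch.ones(2))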

The above formulation (:code:`formulation`) is **analytical**. One can also obtain the same weights
by solving a convex optimization problem (**variational** formulation). See [Agrawal2019]_ and
[Martins2017]_ for more details.

.. math::

    \begin{aligned}
    \min_{\textbf{w}} \quad & - \textbf{x}^T \textbf{w} - H(\textbf{w}) \\
    \textrm{s.t.} \quad & \sum_{i=1}^{N}w_i = 1 \\
    \quad & w_i \geq 0, i \in \{1,...,N\}\\
    \quad & w_i \leq w_{\text{max}}, i \in \{1,...,N\}\\
    \end{aligned}

where :math:`H(\textbf{w})=-\sum_{i=1}^{N} w_i \log(w_i)` is the entropy. Note that if
:code:`max_weight` is set to 1 then one recovers the unconstrained (analytical) softmax. The benefit
of the variational formulation is that the user can choose any :code:`max_weight` in :code:`(0, 1]`.

.. testcode::

from deepdow.layers import SoftmaxAllocator
@@ -237,6 +255,41 @@
assert w.shape == (2, 2)
assert torch.allclose(w.sum(1), torch.ones(2))
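
The variational formulation with a weight cap is used in the same way; a minimal sketch with
illustrative inputs (the :code:`cvxpylayers` dependency is required):

.. testcode::

    import torch

    from deepdow.layers import SoftmaxAllocator

    n_assets = 3
    layer = SoftmaxAllocator(temperature=1,
                             formulation='variational',
                             n_assets=n_assets,
                             max_weight=0.6)
    x = torch.tensor([[1., 2., 3.], [2., -0.5, 1.]])

    w = layer(x)

    assert w.shape == (2, 3)
    assert torch.allclose(w.sum(1), torch.ones(2), atol=1e-4)
    assert (w <= 0.6 + 1e-4).all()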

SparsemaxAllocator
******************
Suggested in [Martins2016]_. It is similar to the softmax but enforces sparsity. It currently uses
:code:`cvxpylayers` as a backend. See below for the mathematical formulation; note that **x**
represents the logits.

.. math::

    \begin{aligned}
    \min_{\textbf{w}} \quad & {\vert \vert \textbf{w} - \textbf{x} \vert \vert}^2_{2} \\
    \textrm{s.t.} \quad & \sum_{i=1}^{N}w_i = 1 \\
    \quad & w_i \geq 0, i \in \{1,...,N\}\\
    \quad & w_i \leq w_{\text{max}}, i \in \{1,...,N\}\\
    \end{aligned}

Similarly to :code:`SoftmaxAllocator`, one can provide the temperature either per sample or as a
single value at construction. Additionally, one can control the maximum weight via the
:code:`max_weight` parameter.

.. testcode::

    from deepdow.layers import SparsemaxAllocator

    n_assets = 3
    layer = SparsemaxAllocator(n_assets, temperature=1)
    x = torch.tensor([[1, 2.3, 2.1], [2, 4.2, -1.1]])

    w = layer(x)
    w_true = torch.tensor([[-1.2650e-10, 6.0000e-01, 4.0000e-01],
                           [-2.9905e-10, 1.0000e+00, 4.2659e-10]])

    assert w.shape == (2, 3)
    assert torch.allclose(w.sum(1), torch.ones(2))
    assert torch.allclose(w, w_true, atol=1e-5)
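
Per sample temperatures and the :code:`max_weight` cap can be combined; a minimal sketch with
illustrative inputs:

.. testcode::

    import torch

    from deepdow.layers import SparsemaxAllocator

    n_assets = 3
    layer = SparsemaxAllocator(n_assets, temperature=None, max_weight=0.6)
    x = torch.tensor([[1., 2.3, 2.1], [2., 4.2, -1.1]])
    temperature = torch.tensor([0.2, 1.])

    w = layer(x, temperature=temperature)

    assert w.shape == (2, 3)
    assert torch.allclose(w.sum(1), torch.ones(2), atol=1e-4)
    assert (w <= 0.6 + 1e-4).all()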


Misc layers
-----------
@@ -342,11 +395,17 @@ References
.. [Michaud2007]
Michaud, Richard O., and Robert Michaud. "Estimation error and portfolio optimization: a resampling solution." Available at SSRN 2658657 (2007).
.. [Martins2016]
Martins, Andre, and Ramon Astudillo. "From softmax to sparsemax: A sparse model of attention and multi-label classification." International Conference on Machine Learning. 2016.
.. [Ledoit2004]
Ledoit, Olivier, and Michael Wolf. "Honey, I shrunk the sample covariance matrix." The Journal of Portfolio Management 30.4 (2004): 110-119.
.. [sklearnkmeans]
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
.. [Martins2017]
Martins, André FT, and Julia Kreutzer. "Learning what’s easy: Fully differentiable neural easy-first taggers." Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing. 2017.
.. [Bodnar2013]
Bodnar, Taras, Nestor Parolya, and Wolfgang Schmid. "On the equivalence of quadratic optimization problems commonly used in portfolio theory." European Journal of Operational Research 229.3 (2013): 637-644.
8 changes: 6 additions & 2 deletions docs/source/networks.rst
@@ -149,7 +149,9 @@ The activations have the following shape (omitting the sample dimension).
(norm_layer_2): GroupNorm(4, 32, eps=1e-05, affine=True)
(time_collapse_layer): AverageCollapse()
(channel_collapse_layer): AverageCollapse()
(portfolio_opt_layer): SoftmaxAllocator()
(portfolio_opt_layer): SoftmaxAllocator(
(layer): Softmax(dim=1)
)
)


@@ -184,7 +186,9 @@ The activations have the following shape (omitting the sample dimension).
(norm_layer): BatchNorm1d(600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(dropout_layer): Dropout(p=0.5, inplace=False)
(linear): Linear(in_features=600, out_features=10, bias=True)
(allocate_layer): SoftmaxAllocator()
(allocate_layer): SoftmaxAllocator(
(layer): Softmax(dim=1)
)
)

