offer an option to concat the values across heads (with head dimension) and then project out, like multi-head attention
lucidrains committed Jun 30, 2023
1 parent f0be2db commit ddcb10f
Showing 2 changed files with 33 additions and 6 deletions.
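In use, the change amounts to one new constructor flag. A minimal usage sketch, assuming the `PKM` class and constructor arguments documented in this repository's README; the sizes below are illustrative, not taken from this commit:

```python
import torch
from product_key_memory import PKM

pkm = PKM(
    dim = 512,
    heads = 4,
    dim_head = 128,
    num_keys = 256,                      # num_keys ** 2 = 65536 addressable memory values
    topk = 32,
    concat_values_and_combine = True     # new: sum values per head, concat heads, project back to dim
)

x = torch.randn(1, 1024, 512)
out = pkm(x)                             # (1, 1024, 512)
```

With the flag left at its default of False, the original EmbeddingBag value path is used.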
README.md: 2 changes (1 addition, 1 deletion)
@@ -67,9 +67,9 @@ Special thanks go to <a href="https://github.com/AranKomat">Aran</a> for encoura
 ## Todo

 - [x] offer stochasticity with annealed gumbel noise. seen dramatic effects in vector-quantization setting
+- [x] offer a way for smaller value dimensions + concat and linear combination of heads (like multi-head attention)

 - [ ] get caught up on latest literature on product key memories, if any
-- [ ] offer a way for smaller value dimensions + concat and linear combination of heads (like multi-head attention)
 - [ ] instead of additive scores, try multiplicative using coordinate descent routing

 ## Citations

product_key_memory/product_key_memory.py: 37 changes (32 additions, 5 deletions)
@@ -3,6 +3,7 @@
 from torch import nn, einsum

 from einops import rearrange
+from einops.layers.torch import Rearrange, Reduce

 from colt5_attention import topk as coor_descent_topk

@@ -83,7 +84,8 @@ def __init__(
         attn_dropout = 0.,
         use_layernorm = True,
         pre_layernorm = False,
-        differentiable_topk = False
+        differentiable_topk = False,
+        concat_values_and_combine = False
     ):
         super().__init__()
         self.topk = topk
@@ -106,10 +108,32 @@
         else:
             self.norm = MaskedBatchNorm1D(nn.BatchNorm1d(dim_head))

+        # keys

         self.keys = nn.Parameter(torch.zeros(heads, num_keys, 2, dim_head))
-        self.values = nn.EmbeddingBag(num_keys ** 2, dim, mode = 'sum')
         init_(self.keys)
-        init_(self.values.weight)

+        # values

+        self.concat_values_and_combine = concat_values_and_combine

+        if concat_values_and_combine:
+            values = nn.Embedding(num_keys ** 2, dim_head)

+            self.values = nn.Sequential(
+                values,
+                Reduce('b (h k) d -> b h d', 'sum', h = heads),
+                Rearrange('b n d -> b (n d)'),
+                nn.Linear(dim_head * heads, dim, bias = False)
+            )
+        else:
+            values = nn.EmbeddingBag(num_keys ** 2, dim, mode = 'sum')
+            self.values = values

+        init_(values.weight)

+        # dropouts

         self.input_dropout = nn.Dropout(input_dropout)
         self.query_dropout = nn.Dropout(query_dropout)
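To spell out the shapes in the new branch above, here is a self-contained sketch of the concatenate-then-project value path; the batch, head, and top-k sizes are made up for illustration, and the weight init is omitted:

```python
import torch
from torch import nn
from einops.layers.torch import Rearrange, Reduce

heads, topk, dim_head, dim, num_keys = 4, 32, 64, 512, 256

values = nn.Embedding(num_keys ** 2, dim_head)           # per-memory-slot values of size dim_head

to_out = nn.Sequential(
    values,                                               # (b, heads * topk) -> (b, heads * topk, dim_head)
    Reduce('b (h k) d -> b h d', 'sum', h = heads),       # sum the top-k retrieved values within each head
    Rearrange('b n d -> b (n d)'),                        # concat the per-head value dimensions
    nn.Linear(dim_head * heads, dim, bias = False)        # project back out to the model dimension
)

value_indices = torch.randint(0, num_keys ** 2, (2, heads * topk))   # (batch * seq, heads * topk)
print(to_out(value_indices).shape)                        # torch.Size([2, 512])
```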
@@ -192,7 +216,10 @@ def forward(

         # aggregate

-        out = self.values(value_indices, per_sample_weights=attn)
-        out = self.value_dropout(out)
+        if self.concat_values_and_combine:
+            out = self.values(value_indices)
+        else:
+            out = self.values(value_indices, per_sample_weights = attn)

+        out = self.value_dropout(out)
         return rearrange(out, '(b t) d -> b t d', b = b)
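For comparison, the default branch above keeps `nn.EmbeddingBag` in 'sum' mode, where `per_sample_weights` applies the routing scores as a weighted sum directly in the model dimension (note that the concat branch above indexes its values without `per_sample_weights`). A small sketch with stand-in scores:

```python
import torch
from torch import nn

heads, topk, dim, num_keys = 4, 32, 512, 256

values = nn.EmbeddingBag(num_keys ** 2, dim, mode = 'sum')

value_indices = torch.randint(0, num_keys ** 2, (2, heads * topk))   # (batch * seq, heads * topk)
attn = torch.randn(2, heads * topk).softmax(dim = -1)                # stand-in for the routing scores

# attention-weighted sum over all heads * topk retrieved values, straight to (batch * seq, dim)
out = values(value_indices, per_sample_weights = attn)
print(out.shape)                                                     # torch.Size([2, 512])
```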
