Commit a94a8ec (2 parents: 4762e85 + e9863cb)

v0.8.5

- Implement the BST model.
- Update the Transformer layer: support the new attention_type and output_type options.

62 files changed: +315 -101 lines

.travis.yml (+1 -1)

@@ -59,7 +59,7 @@ script:
 
 notifications:
   recipients:
-  - wcshen1994@163.com
+  - weichenswc@163.com
 
   on_success: change
   on_failure: change

README.md (+37 -1)

@@ -12,7 +12,7 @@
 [![Documentation Status](https://readthedocs.org/projects/deepctr-doc/badge/?version=latest)](https://deepctr-doc.readthedocs.io/)
 ![CI status](https://github.com/shenweichen/deepctr/workflows/CI/badge.svg)
 [![Coverage Status](https://coveralls.io/repos/github/shenweichen/DeepCTR/badge.svg?branch=master)](https://coveralls.io/github/shenweichen/DeepCTR?branch=master)
-[![Codacy Badge](https://api.codacy.com/project/badge/Grade/d4099734dc0e4bab91d332ead8c0bdd0)](https://www.codacy.com/app/wcshen1994/DeepCTR?utm_source=github.com&utm_medium=referral&utm_content=shenweichen/DeepCTR&utm_campaign=Badge_Grade)
+[![Codacy Badge](https://api.codacy.com/project/badge/Grade/d4099734dc0e4bab91d332ead8c0bdd0)](https://www.codacy.com/gh/shenweichen/DeepCTR?utm_source=github.com&utm_medium=referral&utm_content=shenweichen/DeepCTR&utm_campaign=Badge_Grade)
 [![Disscussion](https://img.shields.io/badge/chat-wechat-brightgreen?style=flat)](./README.md#DisscussionGroup)
 [![License](https://img.shields.io/github/license/shenweichen/deepctr.svg)](https://github.com/shenweichen/deepctr/blob/master/LICENSE)
 <!-- [![Gitter](https://badges.gitter.im/DeepCTR/community.svg)](https://gitter.im/DeepCTR/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->

@@ -54,6 +54,7 @@ Let's [**Get Started!**](https://deepctr-doc.readthedocs.io/en/latest/Quick-Star
 | Deep Session Interest Network | [IJCAI 2019][Deep Session Interest Network for Click-Through Rate Prediction](https://arxiv.org/abs/1905.06482) |
 | FiBiNET | [RecSys 2019][FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433.pdf) |
 | FLEN | [arxiv 2019][FLEN: Leveraging Field for Scalable CTR Prediction](https://arxiv.org/pdf/1911.04690.pdf) |
+| BST | [DLP-KDD 2019][Behavior sequence transformer for e-commerce recommendation in Alibaba](https://arxiv.org/pdf/1905.06874.pdf) |
 | DCN V2 | [arxiv 2020][DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/abs/2008.13535) |
 
 ## Citation

@@ -82,3 +83,38 @@ If you find this code useful in your research, please cite it using the followin
 
 ![wechat](./docs/pics/code.png)
 
+
+## Main contributors ([welcome to join us!](./CONTRIBUTING.md))
+
+<table border="0">
+  <tbody>
+    <tr align="center">
+      <td>
+        <a href="https://github.com/shenweichen"><img width="70" height="70" src="https://github.com/shenweichen.png?s=40" alt="pic"></a><br>
+        <a href="https://github.com/shenweichen">Shen Weichen</a>
+        <p>Alibaba Group</p>
+      </td>
+      <td>
+        <a href="https://github.com/zanshuxun"><img width="70" height="70" src="https://github.com/zanshuxun.png?s=40" alt="pic"></a><br>
+        <a href="https://github.com/zanshuxun">Zan Shuxun</a>
+        <p>Beijing University <br> of Posts and <br> Telecommunications</p>
+      </td>
+      <td>
+        <a href="https://github.com/pandeconscious"><img width="70" height="70" src="https://github.com/pandeconscious.png?s=40" alt="pic"></a><br>
+        <a href="https://github.com/pandeconscious">Harshit Pande</a>
+        <p>Amazon</p>
+      </td>
+      <td>
+        <a href="https://github.com/codewithzichao"><img width="70" height="70" src="https://github.com/codewithzichao.png?s=40" alt="pic"></a><br>
+        <a href="https://github.com/codewithzichao">Li Zichao</a>
+        <p>Peking University</p>
+      </td>
+      <td>
+        <a href="https://github.com/TanTingyi"><img width="70" height="70" src="https://github.com/TanTingyi.png?s=40" alt="pic"></a><br>
+        <a href="https://github.com/TanTingyi">LeoCai</a>
+        <p>Chongqing University <br> of Posts and <br> Telecommunications</p>
+      </td>
+    </tr>
+  </tbody>
+</table>

deepctr/__init__.py (+4 -4)

@@ -1,4 +1,4 @@
-from .utils import check_version
-
-__version__ = '0.8.3'
-check_version(__version__)
+from .utils import check_version
+
+__version__ = '0.8.5'
+check_version(__version__)

deepctr/estimator/models/afm.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of feature interactions via attention networks[J]. arXiv preprint arXiv:1708.04617, 2017.

deepctr/estimator/models/autoint.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Song W, Shi C, Xiao Z, et al. AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks[J]. arXiv preprint arXiv:1810.11921, 2018.(https://arxiv.org/abs/1810.11921)

deepctr/estimator/models/ccpm.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Liu Q, Yu F, Wu S, et al. A convolutional click prediction model[C]//Proceedings of the 24th ACM International on Conference on Information and Knowledge Management. ACM, 2015: 1743-1746.

deepctr/estimator/models/dcn.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C]//Proceedings of the ADKDD'17. ACM, 2017: 12. (https://arxiv.org/abs/1708.05123)

deepctr/estimator/models/deepfm.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Guo H, Tang R, Ye Y, et al. Deepfm: a factorization-machine based neural network for ctr prediction[J]. arXiv preprint arXiv:1703.04247, 2017.(https://arxiv.org/abs/1703.04247)

deepctr/estimator/models/fibinet.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Huang T, Zhang Z, Zhang J. FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction[J]. arXiv preprint arXiv:1905.09433, 2019.

deepctr/estimator/models/fnn.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Zhang W, Du T, Wang J. Deep learning over multi-field categorical data[C]//European conference on information retrieval. Springer, Cham, 2016: 45-57.(https://arxiv.org/pdf/1601.02376.pdf)

deepctr/estimator/models/fwfm.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
     Harshit Pande
 
 Reference:

deepctr/estimator/models/nfm.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] He X, Chua T S. Neural factorization machines for sparse predictive analytics[C]//Proceedings of the 40th International ACM SIGIR conference on Research and Development in Information Retrieval. ACM, 2017: 355-364. (https://arxiv.org/abs/1708.05027)

deepctr/estimator/models/pnn.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. IEEE, 2016: 1149-1154.(https://arxiv.org/pdf/1611.00144.pdf)

deepctr/estimator/models/wdl.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Cheng H T, Koc L, Harmsen J, et al. Wide & deep learning for recommender systems[C]//Proceedings of the 1st Workshop on Deep Learning for Recommender Systems. ACM, 2016: 7-10.(https://arxiv.org/pdf/1606.07792.pdf)

deepctr/estimator/models/xdeepfm.py (+1 -1)

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 """
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Lian J, Zhou X, Zhang F, et al. xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems[J]. arXiv preprint arXiv:1803.05170, 2018.(https://arxiv.org/pdf/1803.05170.pdf)

deepctr/inputs.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 """
 

deepctr/layers/activation.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 """
 

deepctr/layers/core.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 """
 

deepctr/layers/interaction.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Authors:
-    Weichen Shen,wcshen1994@163.com,
+    Weichen Shen,weichenswc@163.com,
     Harshit Pande
 
 """

deepctr/layers/normalization.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 """
 

deepctr/layers/sequence.py (+34 -12)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 """
 

@@ -79,7 +79,7 @@ def call(self, seq_value_len_list, mask=None, **kwargs):
             mask = tf.tile(mask, [1, 1, embedding_size])
 
         if self.mode == "max":
-            hist = uiseq_embed_list - (1-mask) * 1e9
+            hist = uiseq_embed_list - (1 - mask) * 1e9
             return reduce_max(hist, 1, keep_dims=True)
 
         hist = reduce_sum(uiseq_embed_list * mask, 1, keep_dims=False)
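
Aside: the masked max-pooling touched above relies on a standard trick, subtracting a large constant from padded positions so that padding can never win the max. A minimal standalone sketch of the idea in plain TensorFlow (toy tensors, illustrative only, not DeepCTR's API):

```python
import tensorflow as tf

# One sequence, 3 timesteps, 2-dim embeddings; the last timestep is padding.
seq = tf.constant([[[1.0, 2.0], [3.0, 0.5], [9.0, 9.0]]])
mask = tf.constant([[[1.0, 1.0], [1.0, 1.0], [0.0, 0.0]]])  # 0 marks padding

# Push padded positions down to about -1e9 so reduce_max ignores them.
hist = seq - (1 - mask) * 1e9
pooled = tf.reduce_max(hist, axis=1, keepdims=True)
print(pooled.numpy())  # [[[3. 2.]]] -- the padded [9., 9.] step is excluded
```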
@@ -417,12 +417,12 @@ class Transformer(Layer):
     """ Simplified version of Transformer proposed in 《Attention is all you need》
 
       Input shape
-        - a list of two 3D tensor with shape ``(batch_size, timesteps, input_dim)`` if supports_masking=True.
-        - a list of two 4 tensors, first two tensors with shape ``(batch_size, timesteps, input_dim)``,last two tensors with shape ``(batch_size, 1)`` if supports_masking=False.
+        - a list of two 3D tensor with shape ``(batch_size, timesteps, input_dim)`` if ``supports_masking=True`` .
+        - a list of two 4 tensors, first two tensors with shape ``(batch_size, timesteps, input_dim)``,last two tensors with shape ``(batch_size, 1)`` if ``supports_masking=False`` .
 
 
       Output shape
-        - 3D tensor with shape: ``(batch_size, 1, input_dim)``.
+        - 3D tensor with shape: ``(batch_size, 1, input_dim)`` if ``output_type='mean'`` or ``output_type='sum'`` , else ``(batch_size, timesteps, input_dim)`` .
 
 
@@ -436,14 +436,16 @@ class Transformer(Layer):
             - **blinding**: bool. Whether or not use blinding.
             - **seed**: A Python integer to use as random seed.
             - **supports_masking**:bool. Whether or not support masking.
+            - **attention_type**: str, Type of attention, the value must be one of { ``'scaled_dot_product'`` , ``'additive'`` }.
+            - **output_type**: ``'mean'`` , ``'sum'`` or `None`. Whether or not use average/sum pooling for output.
 
       References
             - [Vaswani, Ashish, et al. "Attention is all you need." Advances in Neural Information Processing Systems. 2017.](https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)
     """
 
     def __init__(self, att_embedding_size=1, head_num=8, dropout_rate=0.0, use_positional_encoding=True, use_res=True,
                  use_feed_forward=True, use_layer_norm=False, blinding=True, seed=1024, supports_masking=False,
-                 **kwargs):
+                 attention_type="scaled_dot_product", output_type="mean", **kwargs):
         if head_num <= 0:
             raise ValueError('head_num must be a int > 0')
         self.att_embedding_size = att_embedding_size

@@ -456,6 +458,8 @@ def __init__(self, att_embedding_size=1, head_num=8, dropout_rate=0.0, use_posit
         self.dropout_rate = dropout_rate
         self.use_layer_norm = use_layer_norm
         self.blinding = blinding
+        self.attention_type = attention_type
+        self.output_type = output_type
         super(Transformer, self).__init__(**kwargs)
         self.supports_masking = supports_masking
 

@@ -464,7 +468,7 @@ def build(self, input_shape):
         if self.num_units != embedding_size:
             raise ValueError(
                 "att_embedding_size * head_num must equal the last dimension size of inputs,got %d * %d != %d" % (
-                self.att_embedding_size, self.head_num, embedding_size))
+                    self.att_embedding_size, self.head_num, embedding_size))
         self.seq_len_max = int(input_shape[0][-2])
         self.W_Query = self.add_weight(name='query', shape=[embedding_size, self.att_embedding_size * self.head_num],
                                        dtype=tf.float32,

@@ -475,6 +479,11 @@ def build(self, input_shape):
         self.W_Value = self.add_weight(name='value', shape=[embedding_size, self.att_embedding_size * self.head_num],
                                        dtype=tf.float32,
                                        initializer=tf.keras.initializers.TruncatedNormal(seed=self.seed + 2))
+        if self.attention_type == "additive":
+            self.b = self.add_weight('b', shape=[self.att_embedding_size], dtype=tf.float32,
+                                     initializer=tf.keras.initializers.glorot_uniform(seed=self.seed))
+            self.v = self.add_weight('v', shape=[self.att_embedding_size], dtype=tf.float32,
+                                     initializer=tf.keras.initializers.glorot_uniform(seed=self.seed))
         # if self.use_res:
         #     self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], dtype=tf.float32,
         #                                  initializer=tf.keras.initializers.TruncatedNormal(seed=self.seed))
@@ -525,10 +534,18 @@ def call(self, inputs, mask=None, training=None, **kwargs):
         keys = tf.concat(tf.split(keys, self.head_num, axis=2), axis=0)
         values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0)
 
-        # head_num*None T_q T_k
-        outputs = tf.matmul(querys, keys, transpose_b=True)
+        if self.attention_type == "scaled_dot_product":
+            # head_num*None T_q T_k
+            outputs = tf.matmul(querys, keys, transpose_b=True)
 
-        outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
+            outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
+        elif self.attention_type == "additive":
+            querys_reshaped = tf.expand_dims(querys, axis=-2)
+            keys_reshaped = tf.expand_dims(keys, axis=-3)
+            outputs = tf.tanh(tf.nn.bias_add(querys_reshaped + keys_reshaped, self.b))
+            outputs = tf.squeeze(tf.tensordot(outputs, tf.expand_dims(self.v, axis=-1), axes=[-1, 0]), axis=-1)
+        else:
+            raise NotImplementedError
 
         key_masks = tf.tile(key_masks, [self.head_num, 1])
 
@@ -579,7 +596,12 @@ def call(self, inputs, mask=None, training=None, **kwargs):
         if self.use_layer_norm:
             result = self.ln(result)
 
-        return reduce_mean(result, axis=1, keep_dims=True)
+        if self.output_type == "mean":
+            return reduce_mean(result, axis=1, keep_dims=True)
+        elif self.output_type == "sum":
+            return reduce_sum(result, axis=1, keep_dims=True)
+        else:
+            return result
 
     def compute_output_shape(self, input_shape):
 
@@ -593,7 +615,7 @@ def get_config(self, ):
                   'dropout_rate': self.dropout_rate, 'use_res': self.use_res,
                   'use_positional_encoding': self.use_positional_encoding, 'use_feed_forward': self.use_feed_forward,
                   'use_layer_norm': self.use_layer_norm, 'seed': self.seed, 'supports_masking': self.supports_masking,
-                  'blinding': self.blinding}
+                  'blinding': self.blinding, 'attention_type': self.attention_type, 'output_type': self.output_type}
         base_config = super(Transformer, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
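
Putting the two new arguments together, here is a hedged usage sketch of the updated layer. The constructor arguments follow the signature in this diff; the tensors and sizes are toy values, and `use_positional_encoding=False` is chosen only to keep the example small:

```python
import tensorflow as tf
from deepctr.layers.sequence import Transformer

# Toy behavior sequence: batch 2, 10 timesteps, input_dim = 4 * 2 heads = 8.
seq = tf.random.normal([2, 10, 8])
seq_len = tf.constant([[10], [7]])  # valid lengths, shape (batch_size, 1)

layer = Transformer(att_embedding_size=4, head_num=2,
                    attention_type="additive", output_type=None,
                    use_positional_encoding=False, supports_masking=False)

# Per the docstring, supports_masking=False expects [queries, keys, query_len, key_len].
out = layer([seq, seq, seq_len, seq_len])
print(out.shape)  # (2, 10, 8): timesteps are kept because output_type=None
```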

deepctr/layers/utils.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen,wcshen1994@163.com
+    Weichen Shen,weichenswc@163.com
 
 """
 import tensorflow as tf

deepctr/models/__init__.py (+2 -1)

@@ -18,6 +18,7 @@
 from .fibinet import FiBiNET
 from .flen import FLEN
 from .fwfm import FwFM
+from .bst import BST
 
 __all__ = ["AFM", "CCPM", "DCN", "DCNMix", "MLR", "DeepFM", "MLR", "NFM", "DIN", "DIEN", "FNN", "PNN",
-           "WDL", "xDeepFM", "AutoInt", "ONN", "FGCNN", "DSIN", "FiBiNET", 'FLEN', "FwFM"]
+           "WDL", "xDeepFM", "AutoInt", "ONN", "FGCNN", "DSIN", "FiBiNET", 'FLEN', "FwFM", "BST"]

deepctr/models/afm.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of feature interactions via attention networks[J]. arXiv preprint arXiv:1708.04617, 2017.

deepctr/models/autoint.py (+1 -1)

@@ -2,7 +2,7 @@
 """
 
 Author:
-    Weichen Shen, wcshen1994@163.com
+    Weichen Shen, weichenswc@163.com
 
 Reference:
     [1] Song W, Shi C, Xiao Z, et al. AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks[J]. arXiv preprint arXiv:1810.11921, 2018.(https://arxiv.org/abs/1810.11921)
