Commit 7e9f0f4

add new contents

xudong.yang committed Dec 12, 2018
1 parent e109255 commit 7e9f0f4
Showing 2 changed files with 265 additions and 0 deletions.
150 changes: 150 additions & 0 deletions esmm_ext/dupn_input_fn.py
@@ -0,0 +1,150 @@
#-*- coding: UTF-8 -*-
import tensorflow as tf
from tensorflow import feature_column as fc


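# User-profile columns: low-cardinality categorical fields become one-hot
# indicator columns; the high-cardinality city id is hashed and embedded.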
def create_user_feature_columns():
    gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    age_class = fc.indicator_column(fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
    has_baby = fc.indicator_column(fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
    baby_gender = fc.indicator_column(fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
    baby_age = fc.indicator_column(fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
    grade = fc.indicator_column(fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
    rfm_type = fc.indicator_column(fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
    cate1_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0))
    cate2_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0))
    cate3_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0))
    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.shared_embedding_columns([city_id], 16)
    cols = [gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type, cate1_price_prefer, cate2_price_prefer, cate3_price_prefer]
    return cols + city


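# Item and context columns: rolling CTR/CVR and pay-quantity statistics over
# 1d/1w/2w/1m windows, bucketized rank features, vocabulary/identity categorical
# columns, and hashed+embedded ids for category, phone brand, and resolution.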
def create_feature_columns():
    c2id = fc.categorical_column_with_hash_bucket("cate2Id", 5000, dtype=tf.int64)
    modified_time = fc.numeric_column("modified_time", default_value=0.0)
    modified_time_sqrt = fc.numeric_column("modified_time_sqrt", default_value=0.0)
    modified_time_square = fc.numeric_column("modified_time_square", default_value=0.0)
    # vocabulary values: male / female / unisex / couple
    props_sex = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("props_sex", ["男", "女", "通用", "情侣"], default_value=0))
    # vocabulary values: brand grade A / B / C / D
    brand_grade = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("brand_grade", ["A类品牌", "B类品牌", "C类品牌", "D类品牌"], default_value=0))
    shipment_rate = fc.numeric_column("shipment_rate", default_value=0.0)
    shipping_rate = fc.numeric_column("shipping_rate", default_value=0.0)
    ipv_ntile = fc.bucketized_column(fc.numeric_column("ipv_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    pay_ntile = fc.bucketized_column(fc.numeric_column("pay_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    price = fc.numeric_column("price_norm", default_value=0.0)
    ctr_1d = fc.numeric_column("ctr_1d", default_value=0.0)
    cvr_1d = fc.numeric_column("cvr_1d", default_value=0.0)
    uv_cvr_1d = fc.numeric_column("uv_cvr_1d", default_value=0.0)
    ctr_1w = fc.numeric_column("ctr_1w", default_value=0.0)
    cvr_1w = fc.numeric_column("cvr_1w", default_value=0.0)
    uv_cvr_1w = fc.numeric_column("uv_cvr_1w", default_value=0.0)
    ctr_2w = fc.numeric_column("ctr_2w", default_value=0.0)
    cvr_2w = fc.numeric_column("cvr_2w", default_value=0.0)
    uv_cvr_2w = fc.numeric_column("uv_cvr_2w", default_value=0.0)
    ctr_1m = fc.numeric_column("ctr_1m", default_value=0.0)
    cvr_1m = fc.numeric_column("cvr_1m", default_value=0.0)
    uv_cvr_1m = fc.numeric_column("uv_cvr_1m", default_value=0.0)
    pay_qty_1d = fc.numeric_column("pay_qty_1d", default_value=0.0)
    pay_qty_1w = fc.numeric_column("pay_qty_1w", default_value=0.0)
    pay_qty_2w = fc.numeric_column("pay_qty_2w", default_value=0.0)
    pay_qty_1m = fc.numeric_column("pay_qty_1m", default_value=0.0)
    cat2_pay_qty = fc.numeric_column("cat2_pay_qty_1d", default_value=0.0)
    cat1_pay_qty = fc.numeric_column("cat1_pay_qty_1d", default_value=0.0)
    brd_pay_qty = fc.numeric_column("brd_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1d = fc.numeric_column("slr_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1w = fc.numeric_column("slr_pay_qty_1w", default_value=0.0)
    slr_pay_qty_2w = fc.numeric_column("slr_pay_qty_2w", default_value=0.0)
    slr_pay_qty_1m = fc.numeric_column("slr_pay_qty_1m", default_value=0.0)
    slr_brd_pay_qty_1d = fc.numeric_column("slr_brd_pay_qty_1d", default_value=0.0)
    slr_brd_pay_qty_1w = fc.numeric_column("slr_brd_pay_qty_1w", default_value=0.0)
    slr_brd_pay_qty_2w = fc.numeric_column("slr_brd_pay_qty_2w", default_value=0.0)
    slr_brd_pay_qty_1m = fc.numeric_column("slr_brd_pay_qty_1m", default_value=0.0)
    weighted_ipv = fc.numeric_column("weighted_ipv", default_value=0.0)
    cat1_weighted_ipv = fc.numeric_column("cat1_weighted_ipv", default_value=0.0)
    cate_weighted_ipv = fc.numeric_column("cate_weighted_ipv", default_value=0.0)
    slr_weighted_ipv = fc.numeric_column("slr_weighted_ipv", default_value=0.0)
    brd_weighted_ipv = fc.numeric_column("brd_weighted_ipv", default_value=0.0)
    cms_scale = fc.numeric_column("cms_scale", default_value=0.0)
    cms_scale_sqrt = fc.numeric_column("cms_scale_sqrt", default_value=0.0)

    # context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.bucketized_column(fc.numeric_column("position", dtype=tf.int64, default_value=301),
                                    boundaries=[1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30, 40, 50, 80, 100, 150, 200, 300])
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 41, default_value=40))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 41, default_value=40))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneBrand = fc.shared_embedding_columns([phoneBrandId], 20)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneResolution = fc.shared_embedding_columns([phoneResolutionId], 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    # tabs: All, kids' wear, shoes & bags, mother & baby, women's wear, beauty, home, food
    tab = fc.indicator_column(fc.categorical_column_with_vocabulary_list(
        "tab", ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"], default_value=0))
    userType = fc.numeric_column("user_type", default_value=0, dtype=tf.int64)
    c2id_embed = fc.shared_embedding_columns([c2id], 16, shared_embedding_collection_name="c2id")
    feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour,
                       phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer,
                       price, props_sex, brand_grade, modified_time, modified_time_sqrt, modified_time_square,
                       shipment_rate, shipping_rate, ipv_ntile, pay_ntile, uv_cvr_1d, uv_cvr_1w, uv_cvr_2w, uv_cvr_1m,
                       ctr_1d, ctr_1w, ctr_2w, ctr_1m, cvr_1d, cvr_1w, cvr_2w, cvr_1m,
                       pay_qty_1d, pay_qty_1w, pay_qty_2w, pay_qty_1m, cat2_pay_qty, cat1_pay_qty, brd_pay_qty,
                       slr_pay_qty_1d, slr_pay_qty_1w, slr_pay_qty_2w, slr_pay_qty_1m,
                       slr_brd_pay_qty_1d, slr_brd_pay_qty_1w, slr_brd_pay_qty_2w, slr_brd_pay_qty_1m,
                       weighted_ipv, cat1_weighted_ipv, cate_weighted_ipv, slr_weighted_ipv, brd_weighted_ipv,
                       cms_scale, cms_scale_sqrt, userType]
    feature_columns += c2id_embed
    feature_columns += phoneResolution
    feature_columns += phoneBrand
    return feature_columns


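# Parses one serialized tf.Example: appends the click/pay labels to the feature
# spec, derives the sqrt/square transforms consumed by the numeric columns
# above, and returns (features, {'ctr': click, 'cvr': pay}) for the two ESMM heads.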
def parse_exmp(serial_exmp, feature_spec):
    #share = fc.numeric_column("share", default_value=0, dtype=tf.int64)
    click = fc.numeric_column("click", default_value=0, dtype=tf.int64)
    pay = fc.numeric_column("pay", default_value=0, dtype=tf.int64)
    fea_columns = [click, pay]
    feature_spec.update(tf.feature_column.make_parse_example_spec(fea_columns))
    feats = tf.parse_single_example(serial_exmp, features=feature_spec)
    feats["modified_time_sqrt"] = tf.sqrt(feats["modified_time"])
    feats["modified_time_square"] = tf.square(feats["modified_time"])
    feats["cms_scale_sqrt"] = tf.sqrt(feats["cms_scale"])
    click = feats.pop('click')
    pay = feats.pop('pay')
    return feats, {'ctr': tf.to_float(click), 'cvr': tf.to_float(pay)}


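# Training input pipeline: list_files + parallel_interleave read several
# TFRecord files concurrently; parsed examples are shuffled (when
# shuffle_buffer_size > 0), repeated indefinitely, batched, and prefetched.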
def train_input_fn(filenames, feature_spec, batch_size, shuffle_buffer_size, num_parallel_readers):
    #dataset = tf.data.TFRecordDataset(filenames)
    files = tf.data.Dataset.list_files(filenames)
    dataset = files.apply(tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=num_parallel_readers))
    dataset = dataset.map(lambda x: parse_exmp(x, feature_spec), num_parallel_calls=8)
    # Shuffle, repeat, and batch the examples.
    if shuffle_buffer_size > 0:
        dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.repeat().batch(batch_size).prefetch(1)
    #print(dataset.output_types)
    #print(dataset.output_shapes)
    # Return the read end of the pipeline.
    return dataset


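# Evaluation input pipeline: a single sequential read with no shuffling or
# repetition; the last partial batch is dropped so the batch dimension stays
# static, which the fixed-size LSTM initial state in dupn_logit_fn relies on.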
def eval_input_fn(filename, feature_spec, batch_size):
    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(lambda x: parse_exmp(x, feature_spec), num_parallel_calls=8)
    # Batch the examples, dropping the remainder so every batch has a static size.
    #dataset = dataset.batch(batch_size)
    dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
    # Return the read end of the pipeline.
    return dataset

115 changes: 115 additions & 0 deletions esmm_ext/dupn_logit_fn.py
@@ -0,0 +1,115 @@
import tensorflow as tf

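# Builds the DUPN behavior-sequence embedding: the ids in the user's behavior
# sequence (product, brand, seller, level-2 and level-1 category) are hashed,
# embedded, and concatenated with time/type/scenario signals; the candidate
# item's ids are looked up in the same embedding tables.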
def get_behavior_embedding(params, features):
    # this function could also be implemented with tf.contrib.feature_column.sequence_input_layer
    with tf.variable_scope("behavior_embedding"):
        pid_vocab_size = params["pid_vocab_size"]
        behaviorPids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorPids"]), pid_vocab_size)
        productId = tf.string_to_hash_bucket_fast(tf.as_string(features["productId"]), pid_vocab_size)
        pid_embeddings = tf.get_variable(name="pid_embeddings", dtype=tf.float32,
                                         shape=[pid_vocab_size, params["pid_embedding_size"]])
        pids = tf.nn.embedding_lookup(pid_embeddings, behaviorPids)  # shape(batch_size, max_seq_len, embedding_size)
        pid = tf.nn.embedding_lookup(pid_embeddings, productId)

        bid_vocab_size = params["bid_vocab_size"]
        behaviorBids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorBids"]), bid_vocab_size)
        brandId = tf.string_to_hash_bucket_fast(tf.as_string(features["brandId"]), bid_vocab_size)
        bid_embeddings = tf.get_variable(name="bid_embeddings", dtype=tf.float32,
                                         shape=[bid_vocab_size, params["bid_embedding_size"]])
        bids = tf.nn.embedding_lookup(bid_embeddings, behaviorBids)  # shape(batch_size, max_seq_len, embedding_size)
        bid = tf.nn.embedding_lookup(bid_embeddings, brandId)

        sid_vocab_size = params["sid_vocab_size"]
        behaviorSids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorSids"]), sid_vocab_size)
        sellerId = tf.string_to_hash_bucket_fast(tf.as_string(features["sellerId"]), sid_vocab_size)
        sid_embeddings = tf.get_variable(name="sid_embeddings", dtype=tf.float32,
                                         shape=[sid_vocab_size, params["sid_embedding_size"]])
        sids = tf.nn.embedding_lookup(sid_embeddings, behaviorSids)  # shape(batch_size, max_seq_len, embedding_size)
        sid = tf.nn.embedding_lookup(sid_embeddings, sellerId)

        cid_vocab_size = params["cid_vocab_size"]
        behaviorCids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorCids"]), cid_vocab_size)
        cateId = tf.string_to_hash_bucket_fast(tf.as_string(features["cateId"]), cid_vocab_size)
        cid_embeddings = tf.get_variable(name="cid_embeddings", dtype=tf.float32,
                                         shape=[cid_vocab_size, params["cid_embedding_size"]])
        cids = tf.nn.embedding_lookup(cid_embeddings, behaviorCids)  # shape(batch_size, max_seq_len, embedding_size)
        cid = tf.nn.embedding_lookup(cid_embeddings, cateId)

        c1id_vocab_size = params["c1id_vocab_size"]
        behaviorC1ids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorC1ids"]), c1id_vocab_size)
        cate1Id = tf.string_to_hash_bucket_fast(tf.as_string(features["cate1Id"]), c1id_vocab_size)
        c1id_embeddings = tf.get_variable(name="c1id_embeddings", dtype=tf.float32,
                                          shape=[c1id_vocab_size, params["c1id_embedding_size"]])
        c1ids = tf.nn.embedding_lookup(c1id_embeddings, behaviorC1ids)  # shape(batch_size, max_seq_len, embedding_size)
        c1id = tf.nn.embedding_lookup(c1id_embeddings, cate1Id)

        item_emb = tf.concat([pid, bid, sid, cid, c1id], -1)  # shape(batch_size, sum of embedding sizes)

        behavior_hour = tf.one_hot(features["behaviorTimeBucket"], 8, dtype=tf.float32)  # shape(batch_size, max_seq_len, 8)
        behavior_weekend = tf.expand_dims(tf.to_float(features["behaviorTimeIsWeekend"]), -1)
        behavior_weight = tf.expand_dims(features["behaviorTimeWeight"], -1)
        behavior_type = tf.expand_dims(tf.to_float(features["behaviorTypes"]), -1)
        behavior_scenario = tf.expand_dims(tf.to_float(features["behaviorScenarios"]), -1)
        behvr_columns = [pids, bids, sids, cids, c1ids,
                         behavior_hour, behavior_weekend, behavior_weight, behavior_type, behavior_scenario]
        behvr_emb = tf.concat(behvr_columns, -1)
        return behvr_emb, item_emb


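# Attention over the LSTM outputs: each time step is scored by a small MLP
# conditioned on the query context (user profile + candidate item), and the
# mask zeroes out padded steps before the weighted sum over the sequence.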
def attention(inputs, context, params, masks):
    with tf.variable_scope("attention_layer"):
        shape = inputs.shape.as_list()
        max_seq_len = shape[1]  # padded_dim
        embedding_size = shape[2]
        seq_emb = tf.reshape(inputs, shape=[-1, embedding_size])  # shape(batch_size * max_seq_len, embedding_size)
        ctx_len = context.shape.as_list()[1]
        # tile the context so every time step is scored against the same context vector
        ctx_emb = tf.reshape(tf.tile(context, [1, max_seq_len]), shape=[-1, ctx_len])
        print("seq_emb shape:", seq_emb.shape)
        print("ctx_emb shape:", ctx_emb.shape)
        net = tf.concat([seq_emb, ctx_emb], axis=1)
        print("attention input shape:", net.shape)
        print("attention_hidden_units:", params['attention_hidden_units'])
        for units in params['attention_hidden_units']:
            net = tf.layers.dense(net, units=int(units), activation=tf.nn.relu)
        att_wgt = tf.layers.dense(net, units=1, activation=tf.sigmoid)
        att_wgt = tf.reshape(att_wgt, shape=[-1, max_seq_len, 1], name="weight")  # shape(batch_size, max_seq_len, 1)
        wgt_emb = tf.multiply(inputs, att_wgt)  # shape(batch_size, max_seq_len, embedding_size)
        # masks = tf.sequence_mask(seq_len, max_seq_len, dtype=tf.float32)
        masks = tf.expand_dims(masks, axis=-1)
        att_emb = tf.reduce_sum(tf.multiply(wgt_emb, masks), 1, name="weighted_embedding")
        return att_emb


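# Shared logit function for the CTR and CVR towers: the behavior embedding is
# built once and cached in the "behavior_embedding" collection, so a second
# call (the other tower) reuses the same tensors instead of rebuilding them.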
def dupn_logit_fn(features, mode, params):
    behavior_embedding_collection = tf.get_collection("behavior_embedding")
    if behavior_embedding_collection:
        # a previous tower already built the behavior embedding; reuse it
        behvr_emb, item_emb = behavior_embedding_collection
    else:
        behvr_emb, item_emb = get_behavior_embedding(params, features)
        tf.add_to_collection("behavior_embedding", behvr_emb)
        tf.add_to_collection("behavior_embedding", item_emb)
    print("behvr_emb shape:", behvr_emb.shape)
    print("item_emb shape:", item_emb.shape)

    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=params["num_units"])
    initial_state = lstm_cell.zero_state(params["batch_size"], tf.float32)
    outputs, state = tf.nn.dynamic_rnn(lstm_cell, behvr_emb, initial_state=initial_state)
    print("lstm output shape:", outputs.shape)

    masks = tf.cast(features["behaviorPids"] >= 0, tf.float32)  # padded positions carry negative ids and are masked out
    user = tf.feature_column.input_layer(features, params["user_feature_columns"])
    context = tf.concat([user, item_emb], -1)
    print("attention context shape:", context.shape)
    sequence = attention(outputs, context, params, masks)
    print("sequence embedding shape:", sequence.shape)

    other = tf.feature_column.input_layer(features, params["other_feature_columns"])
    net = tf.concat([sequence, item_emb, other], -1)
    # Build the hidden layers, sized according to the 'hidden_units' param.
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=int(units), activation=tf.nn.relu)
        if 'dropout_rate' in params and params['dropout_rate'] > 0.0:
            net = tf.layers.dropout(net, params['dropout_rate'], training=(mode == tf.estimator.ModeKeys.TRAIN))
    # Compute logits
    logits = tf.layers.dense(net, 1, activation=None)
    return logits

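As a usage note: below is a minimal sketch of how dupn_logit_fn and the input functions above could be wired into an ESMM-style tf.estimator model. The model_fn name, the prediction keys, the summed log-loss, and the learning_rate parameter are illustrative assumptions, not part of this commit:

# Hypothetical ESMM-style model_fn built on dupn_logit_fn (TF 1.x); everything
# below is an illustrative sketch, not code from this commit.
def esmm_model_fn(features, labels, mode, params):
    with tf.variable_scope("ctr_model"):
        ctr_logits = dupn_logit_fn(features, mode, params)
    with tf.variable_scope("cvr_model"):
        # the second call reuses the cached behavior embedding
        # (see the "behavior_embedding" collection in dupn_logit_fn)
        cvr_logits = dupn_logit_fn(features, mode, params)
    ctr_prob = tf.sigmoid(ctr_logits)
    ctcvr_prob = ctr_prob * tf.sigmoid(cvr_logits)  # p(pay) = p(click) * p(pay | click)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={"ctr": ctr_prob, "ctcvr": ctcvr_prob})
    # parse_exmp returns labels as {'ctr': click, 'cvr': pay}
    ctr_loss = tf.losses.log_loss(labels["ctr"], tf.reshape(ctr_prob, [-1]))
    ctcvr_loss = tf.losses.log_loss(labels["cvr"], tf.reshape(ctcvr_prob, [-1]))
    loss = ctr_loss + ctcvr_loss
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss)
    optimizer = tf.train.AdamOptimizer(params.get("learning_rate", 0.001))
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)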