Commit 7e9f0f4

add new contents

xudong.yang committed Dec 12, 2018
1 parent e109255 commit 7e9f0f4
Showing 2 changed files with 265 additions and 0 deletions.
150 changes: 150 additions & 0 deletions esmm_ext/dupn_input_fn.py
@@ -0,0 +1,150 @@
#-*- coding: UTF-8 -*-
import tensorflow as tf
from tensorflow import feature_column as fc


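# User-profile columns: low-cardinality categorical fields become one-hot
# indicator columns; the high-cardinality city id is hashed and embedded.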
def create_user_feature_columns():
    gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    age_class = fc.indicator_column(fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
    has_baby = fc.indicator_column(fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
    baby_gender = fc.indicator_column(fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
    baby_age = fc.indicator_column(fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
    grade = fc.indicator_column(fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
    rfm_type = fc.indicator_column(fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
    cate1_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0))
    cate2_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0))
    cate3_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0))
    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.shared_embedding_columns([city_id], 16)
    cols = [gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type, cate1_price_prefer, cate2_price_prefer, cate3_price_prefer]
    return cols + city


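# Item and context columns: rolling CTR/CVR and pay-quantity statistics over
# 1d/1w/2w/1m windows, bucketized rank features, vocabulary/identity categorical
# columns, and hashed+embedded ids for category, phone brand, and resolution.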
def create_feature_columns():
    c2id = fc.categorical_column_with_hash_bucket("cate2Id", 5000, dtype=tf.int64)
    modified_time = fc.numeric_column("modified_time", default_value=0.0)
    modified_time_sqrt = fc.numeric_column("modified_time_sqrt", default_value=0.0)
    modified_time_square = fc.numeric_column("modified_time_square", default_value=0.0)
    # vocabulary values: male / female / unisex / couple
    props_sex = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("props_sex", ["男", "女", "通用", "情侣"], default_value=0))
    # vocabulary values: brand grade A / B / C / D
    brand_grade = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("brand_grade", ["A类品牌", "B类品牌", "C类品牌", "D类品牌"], default_value=0))
    shipment_rate = fc.numeric_column("shipment_rate", default_value=0.0)
    shipping_rate = fc.numeric_column("shipping_rate", default_value=0.0)
    ipv_ntile = fc.bucketized_column(fc.numeric_column("ipv_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    pay_ntile = fc.bucketized_column(fc.numeric_column("pay_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    price = fc.numeric_column("price_norm", default_value=0.0)
    ctr_1d = fc.numeric_column("ctr_1d", default_value=0.0)
    cvr_1d = fc.numeric_column("cvr_1d", default_value=0.0)
    uv_cvr_1d = fc.numeric_column("uv_cvr_1d", default_value=0.0)
    ctr_1w = fc.numeric_column("ctr_1w", default_value=0.0)
    cvr_1w = fc.numeric_column("cvr_1w", default_value=0.0)
    uv_cvr_1w = fc.numeric_column("uv_cvr_1w", default_value=0.0)
    ctr_2w = fc.numeric_column("ctr_2w", default_value=0.0)
    cvr_2w = fc.numeric_column("cvr_2w", default_value=0.0)
    uv_cvr_2w = fc.numeric_column("uv_cvr_2w", default_value=0.0)
    ctr_1m = fc.numeric_column("ctr_1m", default_value=0.0)
    cvr_1m = fc.numeric_column("cvr_1m", default_value=0.0)
    uv_cvr_1m = fc.numeric_column("uv_cvr_1m", default_value=0.0)
    pay_qty_1d = fc.numeric_column("pay_qty_1d", default_value=0.0)
    pay_qty_1w = fc.numeric_column("pay_qty_1w", default_value=0.0)
    pay_qty_2w = fc.numeric_column("pay_qty_2w", default_value=0.0)
    pay_qty_1m = fc.numeric_column("pay_qty_1m", default_value=0.0)
    cat2_pay_qty = fc.numeric_column("cat2_pay_qty_1d", default_value=0.0)
    cat1_pay_qty = fc.numeric_column("cat1_pay_qty_1d", default_value=0.0)
    brd_pay_qty = fc.numeric_column("brd_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1d = fc.numeric_column("slr_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1w = fc.numeric_column("slr_pay_qty_1w", default_value=0.0)
    slr_pay_qty_2w = fc.numeric_column("slr_pay_qty_2w", default_value=0.0)
    slr_pay_qty_1m = fc.numeric_column("slr_pay_qty_1m", default_value=0.0)
    slr_brd_pay_qty_1d = fc.numeric_column("slr_brd_pay_qty_1d", default_value=0.0)
    slr_brd_pay_qty_1w = fc.numeric_column("slr_brd_pay_qty_1w", default_value=0.0)
    slr_brd_pay_qty_2w = fc.numeric_column("slr_brd_pay_qty_2w", default_value=0.0)
    slr_brd_pay_qty_1m = fc.numeric_column("slr_brd_pay_qty_1m", default_value=0.0)
    weighted_ipv = fc.numeric_column("weighted_ipv", default_value=0.0)
    cat1_weighted_ipv = fc.numeric_column("cat1_weighted_ipv", default_value=0.0)
    cate_weighted_ipv = fc.numeric_column("cate_weighted_ipv", default_value=0.0)
    slr_weighted_ipv = fc.numeric_column("slr_weighted_ipv", default_value=0.0)
    brd_weighted_ipv = fc.numeric_column("brd_weighted_ipv", default_value=0.0)
    cms_scale = fc.numeric_column("cms_scale", default_value=0.0)
    cms_scale_sqrt = fc.numeric_column("cms_scale_sqrt", default_value=0.0)

    # context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.bucketized_column(fc.numeric_column("position", dtype=tf.int64, default_value=301),
                                    boundaries=[1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30, 40, 50, 80, 100, 150, 200, 300])
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 41, default_value=40))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 41, default_value=40))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneBrand = fc.shared_embedding_columns([phoneBrandId], 20)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneResolution = fc.shared_embedding_columns([phoneResolutionId], 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    # tabs: All, kids' wear, shoes & bags, mother & baby, women's wear, beauty, home, food
    tab = fc.indicator_column(fc.categorical_column_with_vocabulary_list(
        "tab", ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"], default_value=0))
    userType = fc.numeric_column("user_type", default_value=0, dtype=tf.int64)
    c2id_embed = fc.shared_embedding_columns([c2id], 16, shared_embedding_collection_name="c2id")
    feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour,
                       phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer,
                       price, props_sex, brand_grade, modified_time, modified_time_sqrt, modified_time_square,
                       shipment_rate, shipping_rate, ipv_ntile, pay_ntile, uv_cvr_1d, uv_cvr_1w, uv_cvr_2w, uv_cvr_1m,
                       ctr_1d, ctr_1w, ctr_2w, ctr_1m, cvr_1d, cvr_1w, cvr_2w, cvr_1m,
                       pay_qty_1d, pay_qty_1w, pay_qty_2w, pay_qty_1m, cat2_pay_qty, cat1_pay_qty, brd_pay_qty,
                       slr_pay_qty_1d, slr_pay_qty_1w, slr_pay_qty_2w, slr_pay_qty_1m,
                       slr_brd_pay_qty_1d, slr_brd_pay_qty_1w, slr_brd_pay_qty_2w, slr_brd_pay_qty_1m,
                       weighted_ipv, cat1_weighted_ipv, cate_weighted_ipv, slr_weighted_ipv, brd_weighted_ipv,
                       cms_scale, cms_scale_sqrt, userType]
    feature_columns += c2id_embed
    feature_columns += phoneResolution
    feature_columns += phoneBrand
    return feature_columns


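# Parses one serialized tf.Example: appends the click/pay labels to the feature
# spec, derives the sqrt/square transforms consumed by the numeric columns
# above, and returns (features, {'ctr': click, 'cvr': pay}) for the two ESMM heads.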
def parse_exmp(serial_exmp, feature_spec):
    #share = fc.numeric_column("share", default_value=0, dtype=tf.int64)
    click = fc.numeric_column("click", default_value=0, dtype=tf.int64)
    pay = fc.numeric_column("pay", default_value=0, dtype=tf.int64)
    fea_columns = [click, pay]
    feature_spec.update(tf.feature_column.make_parse_example_spec(fea_columns))
    feats = tf.parse_single_example(serial_exmp, features=feature_spec)
    feats["modified_time_sqrt"] = tf.sqrt(feats["modified_time"])
    feats["modified_time_square"] = tf.square(feats["modified_time"])
    feats["cms_scale_sqrt"] = tf.sqrt(feats["cms_scale"])
    click = feats.pop('click')
    pay = feats.pop('pay')
    return feats, {'ctr': tf.to_float(click), 'cvr': tf.to_float(pay)}


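# Training input pipeline: list_files + parallel_interleave read several
# TFRecord files concurrently; parsed examples are shuffled (when
# shuffle_buffer_size > 0), repeated indefinitely, batched, and prefetched.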
def train_input_fn(filenames, feature_spec, batch_size, shuffle_buffer_size, num_parallel_readers):
    #dataset = tf.data.TFRecordDataset(filenames)
    files = tf.data.Dataset.list_files(filenames)
    dataset = files.apply(tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=num_parallel_readers))
    dataset = dataset.map(lambda x: parse_exmp(x, feature_spec), num_parallel_calls=8)
    # Shuffle, repeat, and batch the examples.
    if shuffle_buffer_size > 0:
        dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.repeat().batch(batch_size).prefetch(1)
    #print(dataset.output_types)
    #print(dataset.output_shapes)
    # Return the read end of the pipeline.
    return dataset


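# Evaluation input pipeline: a single sequential read with no shuffling or
# repetition; the last partial batch is dropped so the batch dimension stays
# static, which the fixed-size LSTM initial state in dupn_logit_fn relies on.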
def eval_input_fn(filename, feature_spec, batch_size):
    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(lambda x: parse_exmp(x, feature_spec), num_parallel_calls=8)
    # Batch the examples, dropping the remainder so every batch has a static size.
    #dataset = dataset.batch(batch_size)
    dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
    # Return the read end of the pipeline.
    return dataset

115 changes: 115 additions & 0 deletions esmm_ext/dupn_logit_fn.py
@@ -0,0 +1,115 @@
import tensorflow as tf

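# Builds the DUPN behavior-sequence embedding: the ids in the user's behavior
# sequence (product, brand, seller, level-2 and level-1 category) are hashed,
# embedded, and concatenated with time/type/scenario signals; the candidate
# item's ids are looked up in the same embedding tables.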
def get_behavior_embedding(params, features):
    # this function could also be implemented with tf.contrib.feature_column.sequence_input_layer
    with tf.variable_scope("behavior_embedding"):
        pid_vocab_size = params["pid_vocab_size"]
        behaviorPids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorPids"]), pid_vocab_size)
        productId = tf.string_to_hash_bucket_fast(tf.as_string(features["productId"]), pid_vocab_size)
        pid_embeddings = tf.get_variable(name="pid_embeddings", dtype=tf.float32,
                                         shape=[pid_vocab_size, params["pid_embedding_size"]])
        pids = tf.nn.embedding_lookup(pid_embeddings, behaviorPids)  # shape(batch_size, max_seq_len, embedding_size)
        pid = tf.nn.embedding_lookup(pid_embeddings, productId)

        bid_vocab_size = params["bid_vocab_size"]
        behaviorBids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorBids"]), bid_vocab_size)
        brandId = tf.string_to_hash_bucket_fast(tf.as_string(features["brandId"]), bid_vocab_size)
        bid_embeddings = tf.get_variable(name="bid_embeddings", dtype=tf.float32,
                                         shape=[bid_vocab_size, params["bid_embedding_size"]])
        bids = tf.nn.embedding_lookup(bid_embeddings, behaviorBids)  # shape(batch_size, max_seq_len, embedding_size)
        bid = tf.nn.embedding_lookup(bid_embeddings, brandId)

        sid_vocab_size = params["sid_vocab_size"]
        behaviorSids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorSids"]), sid_vocab_size)
        sellerId = tf.string_to_hash_bucket_fast(tf.as_string(features["sellerId"]), sid_vocab_size)
        sid_embeddings = tf.get_variable(name="sid_embeddings", dtype=tf.float32,
                                         shape=[sid_vocab_size, params["sid_embedding_size"]])
        sids = tf.nn.embedding_lookup(sid_embeddings, behaviorSids)  # shape(batch_size, max_seq_len, embedding_size)
        sid = tf.nn.embedding_lookup(sid_embeddings, sellerId)

        cid_vocab_size = params["cid_vocab_size"]
        behaviorCids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorCids"]), cid_vocab_size)
        cateId = tf.string_to_hash_bucket_fast(tf.as_string(features["cateId"]), cid_vocab_size)
        cid_embeddings = tf.get_variable(name="cid_embeddings", dtype=tf.float32,
                                         shape=[cid_vocab_size, params["cid_embedding_size"]])
        cids = tf.nn.embedding_lookup(cid_embeddings, behaviorCids)  # shape(batch_size, max_seq_len, embedding_size)
        cid = tf.nn.embedding_lookup(cid_embeddings, cateId)

        c1id_vocab_size = params["c1id_vocab_size"]
        behaviorC1ids = tf.string_to_hash_bucket_fast(tf.as_string(features["behaviorC1ids"]), c1id_vocab_size)
        cate1Id = tf.string_to_hash_bucket_fast(tf.as_string(features["cate1Id"]), c1id_vocab_size)
        c1id_embeddings = tf.get_variable(name="c1id_embeddings", dtype=tf.float32,
                                          shape=[c1id_vocab_size, params["c1id_embedding_size"]])
        c1ids = tf.nn.embedding_lookup(c1id_embeddings, behaviorC1ids)  # shape(batch_size, max_seq_len, embedding_size)
        c1id = tf.nn.embedding_lookup(c1id_embeddings, cate1Id)

        item_emb = tf.concat([pid, bid, sid, cid, c1id], -1)  # shape(batch_size, sum of embedding sizes)

        behavior_hour = tf.one_hot(features["behaviorTimeBucket"], 8, dtype=tf.float32)  # shape(batch_size, max_seq_len, 8)
        behavior_weekend = tf.expand_dims(tf.to_float(features["behaviorTimeIsWeekend"]), -1)
        behavior_weight = tf.expand_dims(features["behaviorTimeWeight"], -1)
        behavior_type = tf.expand_dims(tf.to_float(features["behaviorTypes"]), -1)
        behavior_scenario = tf.expand_dims(tf.to_float(features["behaviorScenarios"]), -1)
        behvr_columns = [pids, bids, sids, cids, c1ids,
                         behavior_hour, behavior_weekend, behavior_weight, behavior_type, behavior_scenario]
        behvr_emb = tf.concat(behvr_columns, -1)
        return behvr_emb, item_emb


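# Attention over the LSTM outputs: each time step is scored by a small MLP
# conditioned on the query context (user profile + candidate item), and the
# mask zeroes out padded steps before the weighted sum over the sequence.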
def attention(inputs, context, params, masks):
    with tf.variable_scope("attention_layer"):
        shape = inputs.shape.as_list()
        max_seq_len = shape[1]  # padded_dim
        embedding_size = shape[2]
        seq_emb = tf.reshape(inputs, shape=[-1, embedding_size])  # shape(batch_size * max_seq_len, embedding_size)
        ctx_len = context.shape.as_list()[1]
        # tile the context so every time step is scored against the same context vector
        ctx_emb = tf.reshape(tf.tile(context, [1, max_seq_len]), shape=[-1, ctx_len])
        print("seq_emb shape:", seq_emb.shape)
        print("ctx_emb shape:", ctx_emb.shape)
        net = tf.concat([seq_emb, ctx_emb], axis=1)
        print("attention input shape:", net.shape)
        print("attention_hidden_units:", params['attention_hidden_units'])
        for units in params['attention_hidden_units']:
            net = tf.layers.dense(net, units=int(units), activation=tf.nn.relu)
        att_wgt = tf.layers.dense(net, units=1, activation=tf.sigmoid)
        att_wgt = tf.reshape(att_wgt, shape=[-1, max_seq_len, 1], name="weight")  # shape(batch_size, max_seq_len, 1)
        wgt_emb = tf.multiply(inputs, att_wgt)  # shape(batch_size, max_seq_len, embedding_size)
        # masks = tf.sequence_mask(seq_len, max_seq_len, dtype=tf.float32)
        masks = tf.expand_dims(masks, axis=-1)
        att_emb = tf.reduce_sum(tf.multiply(wgt_emb, masks), 1, name="weighted_embedding")
        return att_emb


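# Shared logit function for the CTR and CVR towers: the behavior embedding is
# built once and cached in the "behavior_embedding" collection, so a second
# call (the other tower) reuses the same tensors instead of rebuilding them.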
def dupn_logit_fn(features, mode, params):
    behavior_embedding_collection = tf.get_collection("behavior_embedding")
    if behavior_embedding_collection:
        # a previous tower already built the behavior embedding; reuse it
        behvr_emb, item_emb = behavior_embedding_collection
    else:
        behvr_emb, item_emb = get_behavior_embedding(params, features)
        tf.add_to_collection("behavior_embedding", behvr_emb)
        tf.add_to_collection("behavior_embedding", item_emb)
    print("behvr_emb shape:", behvr_emb.shape)
    print("item_emb shape:", item_emb.shape)

    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=params["num_units"])
    initial_state = lstm_cell.zero_state(params["batch_size"], tf.float32)
    outputs, state = tf.nn.dynamic_rnn(lstm_cell, behvr_emb, initial_state=initial_state)
    print("lstm output shape:", outputs.shape)

    masks = tf.cast(features["behaviorPids"] >= 0, tf.float32)  # padded positions carry negative ids and are masked out
    user = tf.feature_column.input_layer(features, params["user_feature_columns"])
    context = tf.concat([user, item_emb], -1)
    print("attention context shape:", context.shape)
    sequence = attention(outputs, context, params, masks)
    print("sequence embedding shape:", sequence.shape)

    other = tf.feature_column.input_layer(features, params["other_feature_columns"])
    net = tf.concat([sequence, item_emb, other], -1)
    # Build the hidden layers, sized according to the 'hidden_units' param.
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=int(units), activation=tf.nn.relu)
        if 'dropout_rate' in params and params['dropout_rate'] > 0.0:
            net = tf.layers.dropout(net, params['dropout_rate'], training=(mode == tf.estimator.ModeKeys.TRAIN))
    # Compute logits
    logits = tf.layers.dense(net, 1, activation=None)
    return logits

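As a usage note: below is a minimal sketch of how dupn_logit_fn and the input functions above could be wired into an ESMM-style tf.estimator model. The model_fn name, the prediction keys, the summed log-loss, and the learning_rate parameter are illustrative assumptions, not part of this commit:

# Hypothetical ESMM-style model_fn built on dupn_logit_fn (TF 1.x); everything
# below is an illustrative sketch, not code from this commit.
def esmm_model_fn(features, labels, mode, params):
    with tf.variable_scope("ctr_model"):
        ctr_logits = dupn_logit_fn(features, mode, params)
    with tf.variable_scope("cvr_model"):
        # the second call reuses the cached behavior embedding
        # (see the "behavior_embedding" collection in dupn_logit_fn)
        cvr_logits = dupn_logit_fn(features, mode, params)
    ctr_prob = tf.sigmoid(ctr_logits)
    ctcvr_prob = ctr_prob * tf.sigmoid(cvr_logits)  # p(pay) = p(click) * p(pay | click)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={"ctr": ctr_prob, "ctcvr": ctcvr_prob})
    # parse_exmp returns labels as {'ctr': click, 'cvr': pay}
    ctr_loss = tf.losses.log_loss(labels["ctr"], tf.reshape(ctr_prob, [-1]))
    ctcvr_loss = tf.losses.log_loss(labels["cvr"], tf.reshape(ctcvr_prob, [-1]))
    loss = ctr_loss + ctcvr_loss
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss)
    optimizer = tf.train.AdamOptimizer(params.get("learning_rate", 0.001))
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)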