Skip to content

Commit 19807fa

Browse files
committed
Merge commit for internal changes
2 parents 4fb28d6 + 9724401 commit 19807fa

File tree

102 files changed

+2791
-434
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

102 files changed

+2791
-434
lines changed

WORKSPACE

+1-1
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ new_git_repository(
364364
name = "paper_styles",
365365
build_file = "bower.BUILD",
366366
remote = "https://github.com/polymerelements/paper-styles.git",
367-
tag = "v1.1.1",
367+
tag = "v1.1.4",
368368
)
369369

370370
new_git_repository(

tensorflow/contrib/layers/python/layers/feature_column.py

+81
Original file line numberDiff line numberDiff line change
@@ -1257,6 +1257,87 @@ def crossed_column(columns, hash_bucket_size, combiner="sum"):
12571257
return _CrossedColumn(columns, hash_bucket_size, combiner=combiner)
12581258

12591259

1260+
class DataFrameColumn(_FeatureColumn,
                      collections.namedtuple("DataFrameColumn",
                                             ["name", "series"])):
  """Represents a feature column produced from a `DataFrame`.

  Instances of this class are immutable. A `DataFrame` column may be dense or
  sparse, and may have any shape, with the constraint that dimension 0 is
  batch_size.

  Two columns compare equal when they are of the same class and their
  `(name, series)` fields compare equal; the hash is the hash of that same
  field tuple, so instances can be used as dictionary keys.

  Args:
    name: a name for this column
    series: a `Series` to be wrapped, which has already had its base features
      substituted with `PredefinedSeries`.
  """

  def __new__(cls, name, series):
    return super(DataFrameColumn, cls).__new__(cls, name, series)

  @property
  def config(self):
    """Returns whatever `series.required_base_features()` reports."""
    return self.series.required_base_features()

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self.name

  def insert_transformed_feature(self, columns_to_tensors):
    """Builds the wrapped series and stores the result under `self`.

    Args:
      columns_to_tensors: a mutable mapping used both as the build cache handed
        to `series.build()` and as the destination for the result.
    """
    # The cache must already contain mappings from the expected base feature
    # names to Tensors.

    # Passing columns_to_tensors as the cache here means that multiple outputs
    # of the transform will be cached, keyed by the repr of their associated
    # TransformedSeries.
    # The specific requested output ends up in columns_to_tensors twice: once
    # keyed by the TransformedSeries repr, and once keyed by this
    # DataFrameColumn instance.
    columns_to_tensors[self] = self.series.build(columns_to_tensors)

  # pylint: disable=unused-argument
  def to_dnn_input_layer(self,
                         input_tensor,
                         weight_collections=None,
                         trainable=True):
    """Returns `input_tensor` unchanged; the other arguments are ignored."""
    return input_tensor

  # TODO(soergel): This mirrors RealValuedColumn for now, but should become
  # better abstracted with less code duplication when we add other kinds.
  def to_weighted_sum(self,
                      input_tensor,
                      num_outputs=1,
                      weight_collections=None,
                      trainable=True):
    """Multiplies `input_tensor` by a zero-initialized weight variable.

    Args:
      input_tensor: the tensor to be multiplied by the weight matrix.
      num_outputs: second dimension of the weight variable.
      weight_collections: forwarded to `_add_variable_collection` to pick the
        collections the weight variable is added to.
      trainable: accepted for interface compatibility; not used in this body.

    Returns:
      A pair `(log_odds_by_dim, [weight])`, where `log_odds_by_dim` is
      `matmul(input_tensor, weight)`.
    """
    def _weight(name):
      # NOTE(review): `dimension` is not one of this namedtuple's fields;
      # presumably it is expected to come from `_FeatureColumn` -- confirm.
      return variable_scope.get_variable(
          name,
          shape=[self.dimension, num_outputs],
          initializer=array_ops.zeros_initializer,
          collections=_add_variable_collection(weight_collections))

    if self.name:
      with variable_scope.variable_op_scope([input_tensor], None, self.name):
        weight = _weight("weight")
    else:
      # Old behavior to support a subset of old checkpoints.
      weight = _weight("_weight")

    # The _RealValuedColumn has the shape of [batch_size, column.dimension].
    log_odds_by_dim = math_ops.matmul(input_tensor, weight)
    return log_odds_by_dim, [weight]

  def __eq__(self, other):
    """Returns True iff `other` is of this class with equal fields."""
    if isinstance(other, self.__class__):
      # Compare the namedtuple fields directly.  A namedtuple subclass declared
      # without `__slots__` gets its own, empty instance `__dict__`, so
      # comparing `__dict__`s would report every pair of columns as equal.
      # NOTE(review): relies on `series` having a boolean-valued `==` --
      # confirm against the `Series` implementation.
      return tuple(self) == tuple(other)
    return False

  def __ne__(self, other):
    return not self.__eq__(other)

  def __hash__(self):
    # Overriding `__eq__` removes the inherited `__hash__` on Python 3, which
    # would make `columns_to_tensors[self]` in `insert_transformed_feature`
    # fail.  Hash the same field tuple that `__eq__` compares.
    return hash(tuple(self))
1339+
1340+
12601341
def _get_feature_config(feature_column):
12611342
"""Returns configuration for the base feature defined in feature_column."""
12621343
if not isinstance(feature_column, _FeatureColumn):

tensorflow/contrib/learn/BUILD

+12
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,18 @@ py_test(
8888
],
8989
)
9090

91+
# Medium-sized Python test target that runs
# python/learn/tests/dataframe/estimator_utils_test.py against the ":learn" library.
py_test(
92+
name = "estimator_utils_test",
93+
size = "medium",
94+
srcs = ["python/learn/tests/dataframe/estimator_utils_test.py"],
95+
srcs_version = "PY2AND3",
96+
deps = [
97+
":learn",
98+
"//tensorflow:tensorflow_py",
99+
"//tensorflow/python:framework_test_lib",
100+
],
101+
)
102+
91103
py_test(
92104
name = "series_test",
93105
size = "small",

tensorflow/contrib/learn/python/learn/dataframe/__init__.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from __future__ import print_function
2121

2222
from tensorflow.contrib.learn.python.learn.dataframe.dataframe import DataFrame
23+
from tensorflow.contrib.learn.python.learn.dataframe.series import PredefinedSeries
2324
from tensorflow.contrib.learn.python.learn.dataframe.series import Series
2425
from tensorflow.contrib.learn.python.learn.dataframe.series import TransformedSeries
2526
from tensorflow.contrib.learn.python.learn.dataframe.tensorflow_dataframe import TensorFlowDataFrame
@@ -45,5 +46,5 @@
4546
for ct_def in _cmp.COMPARISON_TRANSFORMS:
4647
_cmp.register_comparison_ops(*ct_def)
4748

48-
__all__ = ['DataFrame', 'Series', 'TransformedSeries', 'TensorFlowDataFrame',
49-
'parameter', 'Transform']
49+
__all__ = ['DataFrame', 'Series', 'PredefinedSeries', 'TransformedSeries',
50+
'TensorFlowDataFrame', 'parameter', 'Transform']

tensorflow/contrib/learn/python/learn/dataframe/dataframe.py

-43
Original file line numberDiff line numberDiff line change
@@ -124,46 +124,3 @@ def build(self):
124124
cache = {}
125125
tensors = {name: c.build(cache) for name, c in self._columns.items()}
126126
return tensors
127-
128-
def to_input_fn(self, feature_keys=None, target_keys=None):
  """Create an Estimator-style `input_fn` backed by this object's columns.

  Args:
    feature_keys: names of the columns to expose as features.  When None,
      every name in `self.columns()` that is not a target is used.
    target_keys: names of the columns to expose as targets.  None means no
      targets (e.g. unsupervised learning).

  Returns:
    A zero-argument function.  Each call invokes `self.build()` once and
    returns a pair of dicts `(features, targets)` keyed by column name.

  Raises:
    ValueError: if `feature_keys` is given and shares any name with
      `target_keys`.
  """
  target_keys = [] if target_keys is None else target_keys

  if feature_keys is not None:
    overlap = set(feature_keys) & set(target_keys)
    if overlap:
      raise ValueError(
          "Columns cannot be used for both features and targets: %s" %
          ", ".join(overlap))
  else:
    feature_keys = self.columns() - set(target_keys)

  def input_fn():
    # Everything is built in a single pass over one DataFrame; building the
    # feature and target selections separately would shuffle the two
    # resulting DataFrames independently.
    built = self.build()

    # Columns are handed to Estimator keyed by plain strings for now, so
    # Estimator sees all of them as base features.
    # TODO(soergel): reconcile with FeatureColumn keys, Transformer etc.
    features = dict((name, built[name]) for name in feature_keys)
    targets = dict((name, built[name]) for name in target_keys)
    return features, targets

  return input_fn
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
# pylint: disable=g-bad-file-header
2+
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# ==============================================================================
16+
"""Utility functions relating DataFrames to Estimators."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow.contrib.layers import feature_column
23+
from tensorflow.contrib.learn.python.learn.dataframe import series as ss
24+
from tensorflow.python.framework import ops
25+
from tensorflow.python.ops import parsing_ops
26+
27+
28+
def _to_feature_spec(tensor, default_value=None):
  """Derive a parsing feature spec describing `tensor`.

  Args:
    tensor: the tensor to describe.
    default_value: default forwarded to the `FixedLenFeature`; not used when
      `tensor` is an `ops.SparseTensor`.

  Returns:
    A `parsing_ops.VarLenFeature` carrying the tensor's dtype when `tensor` is
    an `ops.SparseTensor`; otherwise a `parsing_ops.FixedLenFeature` carrying
    the tensor's shape and dtype plus `default_value`.
  """
  if not isinstance(tensor, ops.SparseTensor):
    return parsing_ops.FixedLenFeature(shape=tensor.get_shape(),
                                       dtype=tensor.dtype,
                                       default_value=default_value)
  return parsing_ops.VarLenFeature(dtype=tensor.dtype)
35+
36+
37+
def _infer_feature_specs(dataframe, keys_with_defaults):
  """Compute a feature spec for every column that `dataframe.build()` yields.

  The DataFrame is built while a freshly created `Graph` is installed as the
  default graph.

  Args:
    dataframe: object whose `build()` returns a dict from column name to
      tensor.
    keys_with_defaults: mapping from column name to default value; columns
      absent from it get a default of None.

  Returns:
    A dict from column name to the spec returned by `_to_feature_spec`.
  """
  scratch_graph = ops.Graph()
  with scratch_graph.as_default():
    specs = {}
    for column_name, tensor in dataframe.build().items():
      specs[column_name] = _to_feature_spec(
          tensor, keys_with_defaults.get(column_name))
  return specs
44+
45+
46+
def _build_alternate_universe(
    dataframe, base_input_keys_with_defaults, feature_keys):
  """Rebuild the feature series on top of predefined base series.

  Every base column is replaced by a `PredefinedSeries` carrying its inferred
  feature spec, and every series derived from those columns is re-created as a
  `TransformedSeries` over the replaced inputs.  The resulting graph will be
  used with an `input_fn` that provides exactly those base features.

  Args:
    dataframe: the underlying `DataFrame`
    base_input_keys_with_defaults: a `dict` from the names of columns to be
      considered base features to their default values.
    feature_keys: the names of columns to be used as features (including base
      features and derived features).

  Returns:
    A tuple `(new_feature_series_dict, feature_specs)`:
      * a `dict` mapping each name in `feature_keys` to its rebuilt `Series`
      * the `dict` of feature specs from `_infer_feature_specs`, keyed by
        column name.
  """
  feature_specs = _infer_feature_specs(dataframe, base_input_keys_with_defaults)

  # Memo from original Series to rebuilt Series, seeded with the base columns.
  rebuilt_by_original = {}
  for base_name in base_input_keys_with_defaults:
    original = dataframe[base_name]
    rebuilt_by_original[original] = ss.PredefinedSeries(
        base_name, feature_specs[base_name])

  def _rebuild(series):
    # pylint: disable=protected-access
    # Already translated (a base column, or a derived series seen earlier).
    if series in rebuilt_by_original:
      return rebuilt_by_original[series]

    # Otherwise translate the inputs first, then re-apply the same transform
    # and output name on top of them.
    translated_inputs = [_rebuild(parent) for parent in series._input_series]
    translated = ss.TransformedSeries(translated_inputs,
                                      series._transform,
                                      series._output_name)
    rebuilt_by_original[series] = translated
    return translated

  # Look up every requested series before translating any of them.
  requested = dict((fk, dataframe[fk]) for fk in feature_keys)
  new_feature_series_dict = dict(
      (name, _rebuild(series)) for name, series in requested.items())
  return new_feature_series_dict, feature_specs
89+
90+
91+
def to_feature_columns_and_input_fn(dataframe,
                                    base_input_keys_with_defaults,
                                    feature_keys,
                                    target_keys=None):
  """Build a list of FeatureColumns and an input_fn for use with Estimator.

  Args:
    dataframe: the underlying dataframe
    base_input_keys_with_defaults: a dict from the names of columns to be
      considered base features to their default values.  These columns will be
      fed via input_fn.
    feature_keys: the names of columns from which to generate FeatureColumns.
      These may include base features and/or derived features.
    target_keys: the names of columns to be used as targets.  None is
      acceptable for unsupervised learning.

  Returns:
    A tuple of two elements:
    * A list of `FeatureColumn`s to be used when constructing an Estimator
    * An input_fn, i.e. a function that returns a pair
      (features, targets).  `features` is a dict mapping the base column names
      to Tensors.  `targets` is a dict mapping target names to Tensors, except
      that when there is exactly one target the bare Tensor is returned.

  Raises:
    ValueError: when `feature_keys` is None or empty; when the feature and
      target key sets are non-disjoint, or the base_input and target sets are
      non-disjoint; or when a requested feature's inferred spec is not a
      `FixedLenFeature` with an integer or floating dtype.
  """
  if not feature_keys:
    raise ValueError("feature_keys must be specified.")

  target_keys = [] if target_keys is None else target_keys
  base_input_keys = base_input_keys_with_defaults.keys()

  # A target may be neither a feature nor a base input.
  in_two = set(feature_keys) & set(target_keys)
  if not in_two:
    in_two = set(base_input_keys) & set(target_keys)
  if in_two:
    raise ValueError("Columns cannot be used for both features and targets: %s"
                     % ", ".join(in_two))

  # Obtain the feature series in the alternate universe
  new_feature_series_dict, feature_specs = _build_alternate_universe(
      dataframe, base_input_keys_with_defaults, feature_keys)

  # TODO(soergel): Allow non-real, non-dense DataFrameColumns
  for key in new_feature_series_dict.keys():
    spec = feature_specs[key]
    is_dense = isinstance(spec, parsing_ops.FixedLenFeature)
    if not is_dense or not (spec.dtype.is_integer or spec.dtype.is_floating):
      raise ValueError("For now, only real dense columns can be passed from "
                       "DataFrame to Estimator.  %s is %s of %s" % (
                           (key, type(spec).__name__, spec.dtype)))

  # Make FeatureColumns from these
  feature_columns = [
      feature_column.DataFrameColumn(column_name, rebuilt_series)
      for column_name, rebuilt_series in new_feature_series_dict.items()]

  # Make a new DataFrame with only the Series needed for input_fn.
  # This is important to avoid starting queue feeders that won't be used.
  needed_columns = list(base_input_keys) + list(target_keys)
  limited_dataframe = dataframe.select_columns(needed_columns)

  # Build an input_fn suitable for use with Estimator.
  def input_fn():
    # It's important to build all the tensors together in one DataFrame.
    # If we did df.select() for both key sets and then build those, the two
    # resulting DataFrames would be shuffled independently.
    built = limited_dataframe.build()

    base_input_features = dict((k, built[k]) for k in base_input_keys)
    targets = dict((k, built[k]) for k in target_keys)

    # TODO(soergel): Remove this special case when b/30367437 is fixed.
    if len(targets) == 1:
      targets = next(iter(targets.values()))

    return base_input_features, targets

  return feature_columns, input_fn

0 commit comments

Comments
 (0)