diff --git a/examples/census_example_common.py b/examples/census_example_common.py
index 9f578bcc..160274fe 100644
--- a/examples/census_example_common.py
+++ b/examples/census_example_common.py
@@ -56,14 +56,14 @@
 ]
 
 
-RAW_DATA_FEATURE_SPEC = dict([(name, tf.io.FixedLenFeature([], tf.string))
+RAW_DATA_FEATURE_SPEC = dict([(name, tf.io.FixedLenFeature(shape=(1,), dtype=tf.string))
                               for name in CATEGORICAL_FEATURE_KEYS] +
-                             [(name, tf.io.FixedLenFeature([], tf.float32))
+                             [(name, tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32))
                               for name in NUMERIC_FEATURE_KEYS] +
-                             [(name, tf.io.VarLenFeature(tf.float32))
+                             [(name, tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32))
                               for name in OPTIONAL_NUMERIC_FEATURE_KEYS] +
                              [(LABEL_KEY,
-                               tf.io.FixedLenFeature([], tf.string))])
+                               tf.io.FixedLenFeature(shape=(1,), dtype=tf.string))])
 
 _SCHEMA = dataset_metadata.DatasetMetadata(
     schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)).schema
@@ -125,14 +125,9 @@ def preprocessing_fn(inputs):
     outputs[key] = tft.scale_to_0_1(inputs[key])
 
   for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
-    # This is a SparseTensor because it is optional. Here we fill in a default
-    # value when it is missing.
-    sparse = tf.sparse.SparseTensor(inputs[key].indices, inputs[key].values,
-                                    [inputs[key].dense_shape[0], 1])
-    dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
-    # Reshaping from a batch of vectors of size 1 to a batch to scalars.
-    dense = tf.squeeze(dense, axis=1)
-    outputs[key] = tft.scale_to_0_1(dense)
+    # This is treated as a dense tensor that may contain missing values;
+    # a real pipeline might impute them before scaling.
+    outputs[key] = tft.scale_to_0_1(inputs[key])
 
   # For all categorical columns except the label column, we generate a
   # vocabulary, and convert the string feature to a one-hot encoding.
diff --git a/examples/census_example_v2.py b/examples/census_example_v2.py
index 66d37680..458a708e 100644
--- a/examples/census_example_v2.py
+++ b/examples/census_example_v2.py
@@ -93,7 +93,12 @@ def transform_dataset(data):
      raw_features[key] = tf.RaggedTensor.from_tensor(
          tf.expand_dims(val, -1)).to_sparse()
      continue
-    raw_features[key] = val
+    # We receive each raw feature as a tensor of shape [batch], i.e. a
+    # batch of scalars, but we need a batch of tensors of shape (1,).
+    # Adding an inner dimension here is not adding a batch dimension
+    # (that already exists); it says we want to treat each observation
+    # as a tensor of shape (1,), which is a length-1 vector.
+    raw_features[key] = tf.expand_dims(val, -1)
  transformed_features = tft_layer(raw_features)
  data_labels = transformed_features.pop(common.LABEL_KEY)
  return (transformed_features, data_labels)
@@ -128,7 +133,7 @@ def serve_tf_examples_fn(serialized_tf_examples):
    return {'classes': classes, 'scores': outputs}
 
  concrete_serving_fn = serve_tf_examples_fn.get_concrete_function(
-      tf.TensorSpec(shape=[None], dtype=tf.string, name='inputs'))
+      tf.TensorSpec(shape=(1,), dtype=tf.string, name='inputs'))
  signatures = {'serving_default': concrete_serving_fn}
 
  # This is required in order to make this model servable with model_server.
@@ -191,12 +196,12 @@ def train_and_evaluate(raw_train_eval_data_path_pattern,
  for key, spec in feature_spec.items():
    if isinstance(spec, tf.io.VarLenFeature):
      inputs[key] = tf.keras.layers.Input(
-          shape=[None], name=key, dtype=spec.dtype, sparse=True)
+          shape=(1,), name=key, dtype=spec.dtype, sparse=True)
    elif isinstance(spec, tf.io.FixedLenFeature):
      # TODO(b/208879020): Move into schema such that spec.shape is [1] and not
      # [] for scalars.
      inputs[key] = tf.keras.layers.Input(
-          shape=spec.shape or [1], name=key, dtype=spec.dtype)
+          shape=spec.shape, name=key, dtype=spec.dtype)
    else:
      raise ValueError('Spec type is not supported: ', key, spec)
 
@@ -237,11 +242,11 @@ def main(input_data_dir,
  train_data_file = os.path.join(input_data_dir, 'adult.data')
  test_data_file = os.path.join(input_data_dir, 'adult.test')
 
-  common.transform_data(train_data_file, test_data_file, working_dir)
 
  if read_raw_data_for_training:
    raw_train_and_eval_patterns = (train_data_file, test_data_file)
    transformed_train_and_eval_patterns = None
+    common.transform_data(train_data_file, test_data_file, working_dir)
  else:
    train_pattern = os.path.join(working_dir,
                                 common.TRANSFORMED_TRAIN_DATA_FILEBASE + '*')
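Note (not part of the patch): a minimal sketch of what the shape=(1,) feature
spec means at parse time, assuming TF 2.x. The feature keys and values below
are hypothetical stand-ins for the real census columns.

import tensorflow as tf

# Trimmed, hypothetical analogue of RAW_DATA_FEATURE_SPEC after this change:
# every feature is a FixedLenFeature of shape (1,) rather than a scalar
# FixedLenFeature([]) or a VarLenFeature.
feature_spec = {
    'age': tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32),
    'education': tf.io.FixedLenFeature(shape=(1,), dtype=tf.string),
}

example = tf.train.Example(features=tf.train.Features(feature={
    'age': tf.train.Feature(float_list=tf.train.FloatList(value=[39.0])),
    'education': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[b'Bachelors'])),
}))

# Parse a "batch" of one serialized example.
parsed = tf.io.parse_example(
    tf.constant([example.SerializeToString()]), feature_spec)

# Each feature now parses to shape [batch_size, 1] instead of [batch_size].
# This matches both the layout transform_dataset() produces for raw CSV
# columns via tf.expand_dims(val, -1) and the shape the Keras Input layers
# declare with shape=spec.shape.
print(parsed['age'].shape)        # (1, 1)
print(parsed['education'].shape)  # (1, 1)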