Skip to content

Commit

Permalink
Directly convert a batch of tf.Examples to Arrow tables.
Browse files Browse the repository at this point in the history
- Avoids converting tf.Example to intermediate Dict representation.
- Adds dependency on tfx_bsl
- Deletes fast example decoder.

PiperOrigin-RevId: 273417600
  • Loading branch information
paulgc authored and tf-data-validation-team committed Oct 8, 2019
1 parent 17f869f commit b9f060a
Show file tree
Hide file tree
Showing 12 changed files with 164 additions and 237 deletions.
2 changes: 2 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
## Major Features and Improvements

* Generate statistics for sparse features.
* Directly convert a batch of tf.Examples to Arrow tables. This avoids converting
each tf.Example to an intermediate Dict representation.

## Bug Fixes and Other Changes

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def has_ext_modules(self):
# 'tensorflow>=1.14,<2',
'tensorflow-metadata>=0.14,<0.15',
'tensorflow-transform>=0.14,<0.15',
'tfx-bsl>=0.15.0.dev0,<0.16',

# Dependencies needed for visualization.
# Note that we don't add a max version for IPython as it introduces a
Expand Down
19 changes: 0 additions & 19 deletions tensorflow_data_validation/coders/cc/BUILD

This file was deleted.

125 changes: 0 additions & 125 deletions tensorflow_data_validation/coders/cc/fast_example_decoder.cc

This file was deleted.

35 changes: 0 additions & 35 deletions tensorflow_data_validation/coders/cc/fast_example_decoder.h

This file was deleted.

16 changes: 5 additions & 11 deletions tensorflow_data_validation/coders/tf_example_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,18 @@
from tensorflow_data_validation import constants
from tensorflow_data_validation import types
from tensorflow_data_validation.pyarrow_tf import pyarrow as pa
from tensorflow_data_validation.pywrap import pywrap_tensorflow_data_validation
from tensorflow_data_validation.utils import batch_util
from tfx_bsl.coders import example_coder


DecodeExample = pywrap_tensorflow_data_validation.TFDV_DecodeExample # pylint: disable=invalid-name


# TODO(pachristopher): This fast coder can also benefit TFT. Consider moving
# this coder to tf.Beam once it is available.
# TODO(pachristopher): Deprecate this in 0.16.
class TFExampleDecoder(object):
"""A decoder for decoding TF examples into tf data validation datasets.
"""

def decode(self, serialized_example_proto: bytes) -> types.Example:
"""Decodes serialized tf.Example to tf data validation input dict."""
return DecodeExample(serialized_example_proto)
return example_coder.ExampleToNumpyDict(serialized_example_proto)


@beam.ptransform_fn
Expand All @@ -58,9 +54,7 @@ def DecodeTFExample(
Returns:
A PCollection of Arrow tables.
"""
decoder = TFExampleDecoder()
return (examples
| 'ParseTFExamples' >> beam.Map(decoder.decode)
| 'BatchExamplesToArrowTables' >>
batch_util.BatchExamplesToArrowTables(
| 'BatchSerializedExamplesToArrowTables' >>
batch_util.BatchSerializedExamplesToArrowTables(
desired_batch_size=desired_batch_size))
2 changes: 0 additions & 2 deletions tensorflow_data_validation/pywrap/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ tf_py_wrap_cc(
srcs = ["tensorflow_data_validation.i"],
swig_includes = [
"arrow.i",
"fast_example_decoder.i",
"validation_api.i",
],
# Since we are building a python extension, we tell the linker to only
Expand All @@ -44,7 +43,6 @@ tf_py_wrap_cc(
"//tensorflow_data_validation/arrow/cc:arrow_util",
"//tensorflow_data_validation/arrow/cc:decoded_examples_to_arrow",
"//tensorflow_data_validation/arrow/cc:merge",
"//tensorflow_data_validation/coders/cc:fast_example_decoder",
"@local_config_python//:python_headers",
"@org_tensorflow//tensorflow/core:lib",
],
Expand Down
24 changes: 0 additions & 24 deletions tensorflow_data_validation/pywrap/fast_example_decoder.i

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,4 @@ limitations under the License.
==============================================================================*/

%include "tensorflow_data_validation/pywrap/validation_api.i"
%include "tensorflow_data_validation/pywrap/arrow.i"
%include "tensorflow_data_validation/pywrap/fast_example_decoder.i"
%include "tensorflow_data_validation/pywrap/arrow.i"
24 changes: 12 additions & 12 deletions tensorflow_data_validation/tools/windows/pip/build_tfdv_windows.sh
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,17 @@ pip install setuptools --upgrade
pip install wheel --upgrade
pip freeze --all

echo "Installing TFX-BSL at head"
pushd tfx_bsl_at_head
source "tfx_bsl/tools/windows/pip/build_tfx_bsl_windows.sh" \
|| { echo "Failed to source build_tfx_bsl_windows.sh" >&2; exit 1; }

(tfx_bsl::build_from_head_windows) && wheel=$(ls dist/*.whl) \
|| { echo "Failed to build tfx_bsl."; exit 1; }

pip install ${wheel}
popd

PYARROW_REQUIREMENT=$(python -c "fp = open('third_party/pyarrow_version.bzl', 'r'); d = {}; exec(fp.read(), d); fp.close(); print(d['PY_DEP'])")
pip install "${PYARROW_REQUIREMENT}"
./configure.sh
Expand All @@ -143,19 +154,8 @@ pip uninstall -y Cython
pip install dist/*.whl
pip install ${TENSORFLOW}

# If running with tf-nightly, install TFT at head. If installing TFT at head,
# also install TFX-BSL at head.
# If running with tf-nightly, install TFT at head.
if [[ ${TENSORFLOW}==tf-nightly ]]; then
echo "Installing TFX-BSL at head"
pushd tfx_bsl_at_head
PYARROW_REQUIREMENT=$(python -c "fp = open('third_party/pyarrow_version.bzl', 'r'); d = {}; exec(fp.read(), d); fp.close(); print(d['PY_DEP'])")
pip install "${PYARROW_REQUIREMENT}"
./configure.sh
bazel run -c opt --copt=-DWIN32_LEAN_AND_MEAN tfx_bsl:build_pip_package -- --python_bin_path ${PYTHON_BIN_PATH}
BSL_WHEEL_PATH=$(find dist -name "*.whl")
pip install ${BSL_WHEEL_PATH}
popd # pop tfx_bsl_at_head

pip uninstall -y tensorflow-transform
echo "Installing TFT at head"
pushd tft_at_head
Expand Down
Loading

0 comments on commit b9f060a

Please sign in to comment.