Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IR] Improve external data handling #2020

Merged
merged 32 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
dbc3890
[IR] Improve external data handling
justinchuby Jan 17, 2025
3c9d315
Also load into memory
justinchuby Jan 17, 2025
fe696f5
external
justinchuby Jan 17, 2025
63553c5
f
justinchuby Jan 17, 2025
03b90ca
Simplify logic
justinchuby Jan 17, 2025
ad169e1
format
justinchuby Jan 17, 2025
903b206
comments
justinchuby Jan 17, 2025
ffe270f
docs
justinchuby Jan 17, 2025
e8726fc
sim
justinchuby Jan 17, 2025
68c8e41
Apply suggestions from code review
justinchuby Jan 17, 2025
d78f407
fix samefile call
justinchuby Jan 18, 2025
81458ec
wip
justinchuby Jan 18, 2025
273919d
address comments
justinchuby Jan 21, 2025
67d8ab9
Merge branch 'main' into justinchu/ir-save
justinchuby Jan 21, 2025
357c41b
test
justinchuby Jan 21, 2025
a0a5b58
test
justinchuby Jan 21, 2025
5e5a59c
Merge branch 'main' into justinchu/ir-save
justinchuby Jan 21, 2025
f601264
test
justinchuby Jan 21, 2025
e6fc2fe
update
justinchuby Jan 22, 2025
7dfdfdf
wip
justinchuby Jan 22, 2025
f5e0724
Handle right
justinchuby Jan 22, 2025
06f2eeb
polyfill
justinchuby Jan 22, 2025
458e0ab
Rename
justinchuby Jan 22, 2025
074afa9
Rename functions and expose external data module
justinchuby Jan 22, 2025
e37f3ba
Update onnxscript/ir/_polyfill.py
justinchuby Jan 22, 2025
b9f8e80
name
justinchuby Jan 22, 2025
58049f4
rename
justinchuby Jan 22, 2025
319e5cc
typing
justinchuby Jan 22, 2025
5dee1cd
mypy
justinchuby Jan 22, 2025
cb25b6e
Hashable
justinchuby Jan 22, 2025
b4f8c8c
naming
justinchuby Jan 22, 2025
33a8345
sort
justinchuby Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 2 additions & 18 deletions onnxscript/_framework_apis/torch_2_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

from onnxscript import ir, optimizer, version_converter
from onnxscript.function_libs.torch_lib import registration
from onnxscript.ir import _external_data


@dataclasses.dataclass(frozen=True)
Expand Down Expand Up @@ -68,32 +67,17 @@ def save_model_with_external_data(model: ir.Model, model_path: str | os.PathLike
"""Save the model with external data. The model is unchanged after saving."""

# TODO(#1835): Decide if we want to externalize large attributes as well
initializer_values = tuple(model.graph.initializers.values())
tensors = [v.const_value for v in initializer_values]
tensors = [v.const_value for v in model.graph.initializers.values()]
for tensor in tensors:
justinchuby marked this conversation as resolved.
Show resolved Hide resolved
if tensor is None:
raise ValueError(
"The model contains uninitialized initializer values. "
"Please make sure all initializer values are initialized."
)
destination_path = pathlib.Path(model_path)
base_dir = destination_path.parent
data_path = f"{destination_path.name}.data"

external_tensors = _external_data.convert_tensors_to_external(
tensors, # type: ignore[arg-type]
base_dir,
data_path,
)

# Replace the initializer values with external tensors and save the model
for initializer, external_tensor in zip(initializer_values, external_tensors):
initializer.const_value = external_tensor
ir.save(model, model_path)

# Restore the original initializer values so the model is unchanged
for initializer, tensor in zip(initializer_values, tensors):
initializer.const_value = tensor
ir.save(model, model_path, external_data=data_path, modify_model=False)


def get_torchlib_ops() -> list[_OnnxFunctionMeta]:
Expand Down
132 changes: 59 additions & 73 deletions onnxscript/ir/_external_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

from __future__ import annotations

__all__ = ["set_base_dir"]
import typing
Fixed Show fixed Hide fixed
Fixed Show fixed Hide fixed

__all__ = ["set_base_dir", "to_external_data", "convert_tensors_to_external"]

import dataclasses
import os
Expand Down Expand Up @@ -78,34 +80,25 @@ def set_base_dir(graph: _core.Graph | _core.GraphView, base_dir: str | os.PathLi
tensor.base_dir = base_dir


def _load_external_data_file(
tensors: Sequence[_protocols.TensorProtocol],
base_path: str | os.PathLike,
relative_path: str | os.PathLike,
) -> list[_protocols.TensorProtocol]:
"""Load all external data that is at relative_path into memory for the provided model.
def _external_tensor_to_memory_tensor(
tensor: _protocols.TensorProtocol,
) -> _protocols.TensorProtocol:
"""Convert an external tensor to an in memory tensor.

Args:
tensors: Tensors to be converted to external tensors. They can be external tensors themselves.
base_path: Path of base directory.
tensor: An external tensor to load.
base_dir: Path of base directory.
relative_path: Path to which external data is to be stored, relative to the ONNX file.

Returns:
A list of ir.Tensor values.
An ir.Tensor object with the data loaded into memory.
"""
updated_tensors: list[_protocols.TensorProtocol] = []
for tensor in tensors:
if isinstance(tensor, _core.ExternalTensor):
external_tensor = tensor
if os.path.samefile(tensor.path, os.path.join(base_path, relative_path)):
# Copy the data as the .numpy() call references data from a file whose data is eventually modified
tensor_data = external_tensor.numpy().copy()
external_tensor.release()
tensor = _core.Tensor(
tensor_data, name=external_tensor.name, dtype=external_tensor.dtype
)
updated_tensors.append(tensor)
return updated_tensors
if isinstance(tensor, _core.ExternalTensor):
# Copy the data as the .numpy() call references data from a file whose data is eventually modified
tensor_data = tensor.numpy().copy()
tensor.release()
return _core.Tensor(tensor_data, name=tensor.name, dtype=tensor.dtype)
return tensor


def _compute_new_offset(
Expand Down Expand Up @@ -177,14 +170,14 @@ def _save_external_data(

def _convert_as_external_tensors(
external_data_info: list[tuple[_protocols.TensorProtocol, _ExternalDataInfo]],
base_path: str | os.PathLike,
base_dir: str | os.PathLike,
relative_path: str | os.PathLike,
) -> list[_core.ExternalTensor]:
"""Convert the tensors (stored within the values) written as external data to _core.ExternalTensor types.

Args:
external_data_info: A collection of external data information stored for each tensor to be written as external data.
base_path: Path of base directory.
base_dir: Path of base directory.
relative_path: Path to which external data is to be stored, relative to the ONNX file.

Returns:
Expand All @@ -200,58 +193,50 @@ def _convert_as_external_tensors(
tensor.dtype, # type: ignore[arg-type]
shape=tensor.shape, # type: ignore[arg-type]
name=tensor.name, # type: ignore[arg-type]
base_dir=os.path.normpath(base_path),
base_dir=os.path.normpath(base_dir),
)
external_tensors.append(external_tensor)
return external_tensors


def convert_tensors_to_external(
tensors: Sequence[_protocols.TensorProtocol],
base_path: str | os.PathLike,
base_dir: str | os.PathLike,
relative_path: str | os.PathLike,
load_external_to_memory: bool = False,
) -> list[_core.ExternalTensor]:
"""Convert a sequence of any TensorProtocol tensors to external tensors.

Existing external tensors are loaded to memory if they are referring to the
same file path as the destination path.

Args:
tensors: Tensors to be converted to external tensors. They can be external tensors themselves.
base_path: Path of base directory.
base_dir: Path of base directory.
relative_path: Path to which external data is to be stored, relative to the ONNX file.
load_external_to_memory: If set to true, loads external tensors present in the same file path as destination path to memory.

Returns:
A list of external tensors derived from a list of input tensors.
A list of external tensors derived from a list of input tensors. The order
should match the input tensor order.
"""
path = os.path.join(base_path, relative_path)
path = os.path.join(base_dir, relative_path)
# Check if file path is valid, and create subsequent subdirectories within the path if they don't exist
os.makedirs(os.path.dirname(path), exist_ok=True)
tmp_file_created = False
# Check if file exists. Load pre-existing external data if it does.

# Check if output path exists. Load pre-existing external data if it does.
if os.path.exists(path):
# Check if any tensor in the model is using the destination file
file_used = False
# Check if any tensor provided is using the destination file
new_tensors = []
for tensor in tensors:
if isinstance(tensor, _core.ExternalTensor) and os.path.samefile(
path, tensor.path
):
# FIXME(shubhambhokare1): If there is a non-initializer tensor that is referring to this file, that tensor is now invalid. This is a special case we are ok not handling right now.
file_used = True
if file_used:
if load_external_to_memory:
tensors = _load_external_data_file(tensors, base_path, relative_path)
# FIXME(shubhambhokare1): If there is a non-initializer tensor that
# is referring to this file, that tensor is now invalid.
# This is a special case we are ok not handling right now.
new_tensors.append(_external_tensor_to_memory_tensor(tensor))
else:
tmp_path = os.path.join(base_path, "tmp")
os.makedirs(tmp_path, exist_ok=True)
# If existing external tensors are not loaded to memory, copy the external data to a temporary location
os.rename(path, os.path.join(tmp_path, relative_path))
tmp_file_created = True
for tensor in tensors:
if (
isinstance(tensor, _core.ExternalTensor)
and tensor.location == relative_path
):
tensor.base_dir = tmp_path
new_tensors.append(tensor)
tensors = new_tensors

external_data_info: list[tuple[_protocols.TensorProtocol, _ExternalDataInfo]] = []
# Sort all tensors based on tensor sizes, in order to avoid unnecessary alignment.
Expand All @@ -268,56 +253,57 @@ def convert_tensors_to_external(

# Convert initializers to ExternalTensors
external_tensors = _convert_as_external_tensors(
external_data_info, base_path, relative_path
external_data_info, base_dir, relative_path
)
# Sort external_tensors based on original key order
external_tensors = [
external_tensors[i]
for i in sorted(range(len(external_tensors)), key=lambda i: sorted_indices[i])
]

# Clean-up temporary file if it is created
tmp_path = os.path.join(base_path, "tmp", relative_path)
if os.path.exists(tmp_path) and tmp_file_created:
os.remove(tmp_path)

return external_tensors


def to_external_data(
model: _core.Model,
base_path: str | os.PathLike,
base_dir: str | os.PathLike,
relative_path: str | os.PathLike,
load_external_to_memory: bool = False,
) -> _core.Model:
"""Set all tensors with raw data as external data.
"""Set all tensors with raw data as external data, into a single data file.

Existing external tensors are loaded to memory if they are referring to the
same file path as the destination path.

It should only replace the initializers in the model with external tensors
and not do any other modifications to the model.

Args:
model: Model to process.
base_path: Path of base directory.
base_dir: Path to the directory where the ONNX model file is.
relative_path: Path to which external data is to be stored, relative to the ONNX file.
load_external_to_memory: If set to true, loads external tensors present in the same file path as destination path to memory. Otherwise, the external tensors are appended to file.
E.g. "model.data"

Returns:
An ir.Model with all tensors with raw data converted to external tensors.
An ir.Model with all initializer data converted to external tensors.
"""

# Get all the tensors in the graph which are to be stored as external data.
# Iterate through all the tensors, and extract the external data information such as
# name, offset and length.
# TODO: Currently attributes not handled, eventually try to use _all_tensors to include attrs
tensors: list[_protocols.TensorProtocol] = []
for value in model.graph.initializers.values():
if value.const_value is not None:
tensors.append(value.const_value)
# Filter out the uninitialized initializer values
initializer_values = [
v for v in model.graph.initializers.values() if v.const_value is not None
]
tensors = typing.cast(
list[_protocols.TensorProtocol], [v.const_value for v in initializer_values]
)

external_tensors = convert_tensors_to_external(
tensors,
base_path,
relative_path,
load_external_to_memory=load_external_to_memory,
tensors, base_dir=base_dir, relative_path=relative_path
)

for value, external_tensor in zip(model.graph.initializers.values(), external_tensors):
# Replace the initializer values with external tensors and save the model
for value, external_tensor in zip(initializer_values, external_tensors):
value.const_value = external_tensor
return model
70 changes: 3 additions & 67 deletions onnxscript/ir/_external_data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,28 +347,7 @@ def test_external_data_simple(self):
self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes())
self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes())

def test_same_path_external_data_written_to_memory(self):
model_with_external_data = _external_data.to_external_data(
self.model_with_external_data_same_path,
self.base_path,
self.external_data_name,
load_external_to_memory=True,
)
external_tensor = model_with_external_data.graph.initializers["tensor1"].const_value
external_tensor2 = model_with_external_data.graph.initializers["tensor2"].const_value
external_tensor3 = model_with_external_data.graph.initializers[
"tensor_same_file"
].const_value

self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes())
self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes())
self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes())
# Ensure repeated reads are consistent
self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes())
self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes())
self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes())

def test_same_path_external_data_written_to_disk(self):
def test_same_path_external_data(self):
model_with_external_data = _external_data.to_external_data(
self.model_with_external_data_same_path,
self.base_path,
Expand Down Expand Up @@ -438,52 +417,9 @@ def test_custom_tensor_in_initializers(self):
self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes())
self.assertEqual(external_tensor3.numpy().tobytes(), self.custom_data.tobytes())

def test_mixed_external_data_to_disk(self):
def test_mixed_external_data(self):
model_with_external_data = _external_data.to_external_data(
self.model_with_mixed_external_data,
self.base_path,
self.external_data_name,
)
external_tensor = model_with_external_data.graph.initializers["tensor1"].const_value
external_tensor2 = model_with_external_data.graph.initializers["tensor2"].const_value
external_tensor3 = model_with_external_data.graph.initializers[
"tensor_same_file"
].const_value
external_tensor4 = model_with_external_data.graph.initializers[
"custom_tensor"
].const_value
external_tensor5 = model_with_external_data.graph.initializers[
"tensor_ext1_1"
].const_value
external_tensor6 = model_with_external_data.graph.initializers[
"tensor_ext1_2"
].const_value
external_tensor7 = model_with_external_data.graph.initializers[
"tensor_ext2_1"
].const_value

self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes())
self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes())
self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes())
self.assertEqual(external_tensor4.numpy().tobytes(), self.custom_data.tobytes())
self.assertEqual(external_tensor5.numpy().tobytes(), self.data_ext1_1.tobytes())
self.assertEqual(external_tensor6.numpy().tobytes(), self.data_ext1_2.tobytes())
self.assertEqual(external_tensor7.numpy().tobytes(), self.data_ext2_1.tobytes())
# Ensure repeated reads are consistent
self.assertEqual(external_tensor.numpy().tobytes(), self.data.tobytes())
self.assertEqual(external_tensor2.numpy().tobytes(), self.data_float16.tobytes())
self.assertEqual(external_tensor3.numpy().tobytes(), self.data_other.tobytes())
self.assertEqual(external_tensor4.numpy().tobytes(), self.custom_data.tobytes())
self.assertEqual(external_tensor5.numpy().tobytes(), self.data_ext1_1.tobytes())
self.assertEqual(external_tensor6.numpy().tobytes(), self.data_ext1_2.tobytes())
self.assertEqual(external_tensor7.numpy().tobytes(), self.data_ext2_1.tobytes())

def test_mixed_external_data_to_memory(self):
model_with_external_data = _external_data.to_external_data(
self.model_with_mixed_external_data,
self.base_path,
self.external_data_name,
load_external_to_memory=True,
self.model_with_mixed_external_data, self.base_path, self.external_data_name
)
external_tensor = model_with_external_data.graph.initializers["tensor1"].const_value
external_tensor2 = model_with_external_data.graph.initializers["tensor2"].const_value
Expand Down
Loading
Loading