[IR] Improve external data handling #2020

Merged · 32 commits · Jan 22, 2025
Changes shown are from 20 of the 32 commits.

Commits:
dbc3890  [IR] Improve external data handling (justinchuby, Jan 17, 2025)
3c9d315  Also load into memory (justinchuby, Jan 17, 2025)
fe696f5  external (justinchuby, Jan 17, 2025)
63553c5  f (justinchuby, Jan 17, 2025)
03b90ca  Simplify logic (justinchuby, Jan 17, 2025)
ad169e1  format (justinchuby, Jan 17, 2025)
903b206  comments (justinchuby, Jan 17, 2025)
ffe270f  docs (justinchuby, Jan 17, 2025)
e8726fc  sim (justinchuby, Jan 17, 2025)
68c8e41  Apply suggestions from code review (justinchuby, Jan 17, 2025)
d78f407  fix samefile call (justinchuby, Jan 18, 2025)
81458ec  wip (justinchuby, Jan 18, 2025)
273919d  address comments (justinchuby, Jan 21, 2025)
67d8ab9  Merge branch 'main' into justinchu/ir-save (justinchuby, Jan 21, 2025)
357c41b  test (justinchuby, Jan 21, 2025)
a0a5b58  test (justinchuby, Jan 21, 2025)
5e5a59c  Merge branch 'main' into justinchu/ir-save (justinchuby, Jan 21, 2025)
f601264  test (justinchuby, Jan 21, 2025)
e6fc2fe  update (justinchuby, Jan 22, 2025)
7dfdfdf  wip (justinchuby, Jan 22, 2025)
f5e0724  Handle right (justinchuby, Jan 22, 2025)
06f2eeb  polyfill (justinchuby, Jan 22, 2025)
458e0ab  Rename (justinchuby, Jan 22, 2025)
074afa9  Rename functions and expose external data module (justinchuby, Jan 22, 2025)
e37f3ba  Update onnxscript/ir/_polyfill.py (justinchuby, Jan 22, 2025)
b9f8e80  name (justinchuby, Jan 22, 2025)
58049f4  rename (justinchuby, Jan 22, 2025)
319e5cc  typing (justinchuby, Jan 22, 2025)
5dee1cd  mypy (justinchuby, Jan 22, 2025)
cb25b6e  Hashable (justinchuby, Jan 22, 2025)
b4f8c8c  naming (justinchuby, Jan 22, 2025)
33a8345  sort (justinchuby, Jan 22, 2025)
23 changes: 3 additions & 20 deletions onnxscript/_framework_apis/torch_2_5.py
@@ -19,7 +19,6 @@

from onnxscript import ir, optimizer, version_converter
from onnxscript.function_libs.torch_lib import registration
from onnxscript.ir import _external_data


@dataclasses.dataclass(frozen=True)
@@ -68,32 +67,16 @@ def save_model_with_external_data(model: ir.Model, model_path: str | os.PathLike
"""Save the model with external data. The model is unchanged after saving."""

# TODO(#1835): Decide if we want to externalize large attributes as well
initializer_values = tuple(model.graph.initializers.values())
tensors = [v.const_value for v in initializer_values]
for tensor in tensors:
if tensor is None:
for value in model.graph.initializers.values():
if value.const_value is None:
raise ValueError(
"The model contains uninitialized initializer values. "
"Please make sure all initializer values are initialized."
)
destination_path = pathlib.Path(model_path)
base_dir = destination_path.parent
data_path = f"{destination_path.name}.data"

external_tensors = _external_data.convert_tensors_to_external(
tensors, # type: ignore[arg-type]
base_dir,
data_path,
)

# Replace the initializer values with external tensors and save the model
for initializer, external_tensor in zip(initializer_values, external_tensors):
initializer.const_value = external_tensor
ir.save(model, model_path)

# Restore the original initializer values so the model is unchanged
for initializer, tensor in zip(initializer_values, tensors):
initializer.const_value = tensor
ir.save(model, model_path, external_data=data_path)


def get_torchlib_ops() -> list[_OnnxFunctionMeta]:
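
The upshot of this hunk is that the framework API no longer swaps initializer values in and out by hand; it builds a sibling "<model>.data" path and delegates to ir.save, which now takes an external_data path relative to the ONNX file. A minimal usage sketch of that call path (file names are illustrative, and ir.load/ir.save are assumed to accept path-like values):

import pathlib

from onnxscript import ir

model = ir.load("model.onnx")  # illustrative input model
destination = pathlib.Path("model_external.onnx")
# Initializer data is written to "model_external.onnx.data", stored relative
# to the ONNX file, mirroring the f"{destination_path.name}.data" logic above.
ir.save(model, destination, external_data=f"{destination.name}.data")
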
245 changes: 132 additions & 113 deletions onnxscript/ir/_external_data.py
@@ -4,7 +4,12 @@

from __future__ import annotations

__all__ = ["set_base_dir"]
__all__ = [
"set_base_dir",
"to_external_data",
"convert_tensors_to_external",
"convert_tensors_from_external",
]

import dataclasses
import os
@@ -78,34 +83,25 @@
tensor.base_dir = base_dir


def _load_external_data_file(
tensors: Sequence[_protocols.TensorProtocol],
base_path: str | os.PathLike,
relative_path: str | os.PathLike,
) -> list[_protocols.TensorProtocol]:
"""Load all external data that is at relative_path into memory for the provided model.
def _external_tensor_to_memory_tensor(
tensor: _protocols.TensorProtocol,
) -> _protocols.TensorProtocol:
"""Convert an external tensor to an in memory tensor.

Args:
tensors: Tensors to be converted to external tensors. They can be external tensors themselves.
base_path: Path of base directory.
tensor: An external tensor to load.
base_dir: Path of base directory.
relative_path: Path to which external data is to be stored, relative to the ONNX file.

Returns:
A list of ir.Tensor values.
An ir.Tensor object with the data loaded into memory.
"""
updated_tensors: list[_protocols.TensorProtocol] = []
for tensor in tensors:
if isinstance(tensor, _core.ExternalTensor):
external_tensor = tensor
if os.path.samefile(tensor.path, os.path.join(base_path, relative_path)):
# Copy the data as the .numpy() call references data from a file whose data is eventually modified
tensor_data = external_tensor.numpy().copy()
external_tensor.release()
tensor = _core.Tensor(
tensor_data, name=external_tensor.name, dtype=external_tensor.dtype
)
updated_tensors.append(tensor)
return updated_tensors
if not isinstance(tensor, _core.ExternalTensor):
raise TypeError(f"Expected ExternalTensor, got {type(tensor)}")
# Copy the data as the .numpy() call references data from a file whose data is eventually modified
tensor_data = tensor.numpy().copy()
tensor.release()
return _core.Tensor(tensor_data, name=tensor.name, dtype=tensor.dtype)
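
For context on the copy-before-release comment: ExternalTensor.numpy() returns a view backed by the memory-mapped data file, so the bytes have to be copied into an ordinary array before the mapping is released or the file is rewritten. A standalone sketch of the same pattern using the public ir aliases (assuming ir.ExternalTensor and ir.Tensor mirror the _core types used here):

from onnxscript import ir

def external_to_memory(tensor: ir.ExternalTensor) -> ir.Tensor:
    # numpy() is backed by the memory-mapped file; copy so the data survives
    # after the mapping is released or the file is overwritten.
    data = tensor.numpy().copy()
    tensor.release()  # close the memory map
    return ir.Tensor(data, name=tensor.name, dtype=tensor.dtype)
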


def _compute_new_offset(
@@ -151,18 +147,23 @@
return external_data_info


def _save_external_data(
external_data_info: list[tuple[_protocols.TensorProtocol, _ExternalDataInfo]],
def _write_external_data(
tensors: Sequence[_protocols.TensorProtocol],
external_data_infos: Sequence[_ExternalDataInfo],
file_path: str | os.PathLike,
) -> None:
"""Write tensor data to an external file according to information stored in ExternalDataInfo objects.

Args:
external_data_info: A collection of external data information stored for each tensor to be written as external data.
tensors: Tensors to be written as external data.
external_data_infos: External data information stored for each tensor to be written as external data.
file_path: Location to which external data is to be stored.
"""
assert len(tensors) == len(external_data_infos), (
"Number of tensors and external data infos should match"
)
with open(file_path, "wb") as data_file:
for tensor, tensor_info in external_data_info:
for tensor, tensor_info in zip(tensors, external_data_infos):
current_offset = tensor_info.offset
assert tensor is not None
raw_data = tensor.tobytes()
@@ -175,149 +176,167 @@
data_file.write(raw_data)


def _convert_as_external_tensors(
external_data_info: list[tuple[_protocols.TensorProtocol, _ExternalDataInfo]],
base_path: str | os.PathLike,
def _create_external_tensor(
tensor: _protocols.TensorProtocol,
external_data_info: _ExternalDataInfo,
base_dir: str | os.PathLike,
relative_path: str | os.PathLike,
) -> list[_core.ExternalTensor]:
"""Convert the tensors (stored within the values) written as external data to _core.ExternalTensor types.
) -> _core.ExternalTensor:
"""Create external tensors from external data information.

Args:
external_data_info: A collection of external data information stored for each tensor to be written as external data.
base_path: Path of base directory.
tensor: Tensor to be converted to external tensor.
external_data_info: External data information stored for the tensor to be written as external data.
base_dir: Path of base directory.
relative_path: Path to which external data is to be stored, relative to the ONNX file.

Returns:
A list of external tensors.
External tensor created from the information.
"""
external_tensors: list[_core.ExternalTensor] = []
for tensor, tensor_info in external_data_info:
assert tensor is not None
external_tensor = _core.ExternalTensor(
os.path.normpath(relative_path),
tensor_info.offset,
tensor_info.length,
tensor.dtype, # type: ignore[arg-type]
shape=tensor.shape, # type: ignore[arg-type]
name=tensor.name, # type: ignore[arg-type]
base_dir=os.path.normpath(base_path),
)
external_tensors.append(external_tensor)
return external_tensors
return _core.ExternalTensor(
os.path.normpath(relative_path),
external_data_info.offset,
external_data_info.length,
tensor.dtype, # type: ignore[arg-type]
shape=tensor.shape, # type: ignore[arg-type]
name=tensor.name, # type: ignore[arg-type]
base_dir=os.path.normpath(base_dir),
)


def convert_tensors_from_external(
tensors: Sequence[_core.ExternalTensor],
) -> list[_protocols.TensorProtocol]:
"""Convert a sequence of external tensors to in-memory tensors.

Args:
tensors: External tensors to be converted to in-memory tensors.

Returns:
A list of in-memory tensors derived from a list of external tensors.
"""
return [_external_tensor_to_memory_tensor(tensor) for tensor in tensors]


def convert_tensors_to_external(
tensors: Sequence[_protocols.TensorProtocol],
base_path: str | os.PathLike,
base_dir: str | os.PathLike,
relative_path: str | os.PathLike,
load_external_to_memory: bool = False,
) -> list[_core.ExternalTensor]:
"""Convert a sequence of any TensorProtocol tensors to external tensors.

Existing external tensors are loaded to memory if they refer to the
same file path as the destination path.

Args:
tensors: Tensors to be converted to external tensors. They can be external tensors themselves.
base_path: Path of base directory.
base_dir: Path of base directory.
relative_path: Path to which external data is to be stored, relative to the ONNX file.
load_external_to_memory: If set to true, loads external tensors present in the same file path as destination path to memory.

Returns:
A list of external tensors derived from a list of input tensors.
A list of external tensors derived from a list of input tensors. The order
should match the input tensor order.
"""
path = os.path.join(base_path, relative_path)
path = os.path.join(base_dir, relative_path)
# Check if file path is valid, and create subsequent subdirectories within the path if they don't exist
os.makedirs(os.path.dirname(path), exist_ok=True)
tmp_file_created = False
# Check if file exists. Load pre-existing external data if it does.

# Check if output path exists. Load pre-existing external data if it does.
if os.path.exists(path):
# Check if any tensor in the model is using the destination file
file_used = False
# Check if any tensor provided is using the destination file
new_tensors = []
for tensor in tensors:
if isinstance(tensor, _core.ExternalTensor) and os.path.samefile(
path, tensor.path
if (
isinstance(tensor, _core.ExternalTensor)
and os.path.exists(tensor.path)
and os.path.samefile(path, tensor.path)
):
# FIXME(shubhambhokare1): If there is a non-initializer tensor that is referring to this file, that tensor is now invalid. This is a special case we are ok not handling right now.
file_used = True
if file_used:
if load_external_to_memory:
tensors = _load_external_data_file(tensors, base_path, relative_path)
# FIXME(shubhambhokare1): If there is a non-initializer tensor that
# is referring to this file, that tensor is now invalid.
# This is a special case we are ok not handling right now.
new_tensors.append(_external_tensor_to_memory_tensor(tensor))
else:
tmp_path = os.path.join(base_path, "tmp")
os.makedirs(tmp_path, exist_ok=True)
# If existing external tensors are not loaded to memory, copy the external data to a temporary location
os.rename(path, os.path.join(tmp_path, relative_path))
tmp_file_created = True
for tensor in tensors:
if (
isinstance(tensor, _core.ExternalTensor)
and tensor.location == relative_path
):
tensor.base_dir = tmp_path

external_data_info: list[tuple[_protocols.TensorProtocol, _ExternalDataInfo]] = []
new_tensors.append(tensor)
tensors = new_tensors

external_data_infos: list[_ExternalDataInfo] = []
# Sort all tensors based on tensor sizes, in order to avoid unnecessary alignment.
# All the smaller tensors are written earlier and alignment is performed for the larger tensors.
sorted_indices = sorted(range(len(tensors)), key=lambda i: tensors[i].nbytes)
sorted_tensors = [tensors[i] for i in sorted_indices]

# Compute external data information for each tensor and write to disk
current_offset = 0
for tensor in sorted_tensors:
tensor_info = _compute_external_data_info(tensor, current_offset)
external_data_info.append((tensor, tensor_info))
current_offset = tensor_info.offset + tensor_info.length
_save_external_data(external_data_info, path)

# Convert initializers to ExternalTensors
external_tensors = _convert_as_external_tensors(
external_data_info, base_path, relative_path
)
# Sort external_tensors based on original key order
external_info = _compute_external_data_info(tensor, current_offset)
external_data_infos.append(external_info)
current_offset = external_info.offset + external_info.length
_write_external_data(sorted_tensors, external_data_infos, path)

# Create external tensor objects
assert len(sorted_tensors) == len(external_data_infos)
external_tensors: list[_core.ExternalTensor] = [
_create_external_tensor(tensor, external_info, base_dir, relative_path)
for tensor, external_info in zip(sorted_tensors, external_data_infos)
]

# Sort external_tensors back into the original key order so that the output matches the input tensor order
external_tensors = [
external_tensors[i]
for i in sorted(range(len(external_tensors)), key=lambda i: sorted_indices[i])
]

# Clean-up temporary file if it is created
tmp_path = os.path.join(base_path, "tmp", relative_path)
if os.path.exists(tmp_path) and tmp_file_created:
os.remove(tmp_path)

return external_tensors
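
As a usage sketch of the new signature: the returned external tensors preserve the input order, so callers can zip them back onto the originating values. The values list, directory, and file name below are illustrative:

from onnxscript.ir import _external_data

# values: initializer ir.Value objects whose const_value tensors should be offloaded
tensors = [v.const_value for v in values]
external_tensors = _external_data.convert_tensors_to_external(
    tensors,
    base_dir="/path/to/model_dir",    # directory containing the ONNX file
    relative_path="model.onnx.data",  # data file path, relative to the ONNX file
)
for value, external_tensor in zip(values, external_tensors):
    value.const_value = external_tensor
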


def to_external_data(
model: _core.Model,
base_path: str | os.PathLike,
base_dir: str | os.PathLike,
relative_path: str | os.PathLike,
load_external_to_memory: bool = False,
*,
size_threshold_bytes: int,
) -> _core.Model:
"""Set all tensors with raw data as external data.
"""Convert all initializers equal or above size_threshold_bytes to external tensors and save data to a single data file.

It should only replace the initializers in the model with external tensors
and not make any other modifications to the model.

Args:
model: Model to process.
base_path: Path of base directory.
base_dir: Path the directory where the ONNX model file is.
relative_path: Path to which external data is to be stored, relative to the ONNX file.
load_external_to_memory: If set to true, loads external tensors present in the same file path as destination path to memory. Otherwise, the external tensors are appended to file.
E.g. "model.data"
size_threshold_bytes: Save to external data if the tensor size in bytes is larger than this threshold.

Returns:
An ir.Model with all tensors with raw data converted to external tensors.
An ir.Model with all initializer data equal to or above :param:`size_threshold_bytes`
converted to external tensors.
"""

# Get all the tensors in the graph which are to be stored as external data.
# Iterate through all the tensors, and extract the external data information such as
# name, offset and length.
# TODO: Currently attributes not handled, eventually try to use _all_tensors to include attrs
tensors: list[_protocols.TensorProtocol] = []
# In-memory or external tensors, if above the threshold, should be converted to or re-saved as external tensors
initializers_to_become_external = []
# Existing external tensors, if below the threshold, should be loaded to memory
initializers_to_load_to_memory = []
for value in model.graph.initializers.values():
if value.const_value is not None:
tensors.append(value.const_value)
if value.const_value is None:
# Filter out the uninitialized initializer values
continue
if value.const_value.nbytes > size_threshold_bytes:
initializers_to_become_external.append(value.const_value)
elif isinstance(value.const_value, _core.ExternalTensor):
initializers_to_load_to_memory.append(value.const_value)

external_tensors = convert_tensors_to_external(
tensors,
base_path,
relative_path,
load_external_to_memory=load_external_to_memory,
[v.const_value for v in initializers_to_become_external],
base_dir=base_dir,
relative_path=relative_path,
)
memory_tensors = convert_tensors_from_external(initializers_to_load_to_memory)

for value, external_tensor in zip(model.graph.initializers.values(), external_tensors):
# Replace the initializer values with external tensors and save the model
assert len(initializers_to_become_external) == len(external_tensors)
assert len(initializers_to_load_to_memory) == len(memory_tensors)
for value, external_tensor in zip(initializers_to_become_external, external_tensors):
value.const_value = external_tensor
for value, memory_tensor in zip(initializers_to_load_to_memory, memory_tensors):
value.const_value = memory_tensor
return model
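
Putting it together, a hedged sketch of how this entry point might be called (the public import path may differ from _external_data, and the paths and threshold value are illustrative):

from onnxscript import ir
from onnxscript.ir import _external_data

model = ir.load("model.onnx")  # illustrative input
# Large initializers (per size_threshold_bytes) are rewritten to point into
# model.onnx.data; small tensors that were already external are loaded back
# into memory.
model = _external_data.to_external_data(
    model,
    base_dir="/path/to/model_dir",
    relative_path="model.onnx.data",
    size_threshold_bytes=1024,
)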