Skip to content

revamp prototype features #5283

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 0 additions & 185 deletions test/test_prototype_features.py

This file was deleted.

2 changes: 1 addition & 1 deletion torchvision/prototype/datasets/_builtin/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
image_buffer_from_array,
Decompressor,
INFINITE_BUFFER_SIZE,
fromfile,
hint_sharding,
hint_shuffling,
)
from torchvision.prototype.features import Image, Label
from torchvision.prototype.utils._internal import fromfile

__all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"]

Expand Down
67 changes: 1 addition & 66 deletions torchvision/prototype/datasets/utils/_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
import gzip
import io
import lzma
import mmap
import os
import os.path
import pathlib
import pickle
import platform
from typing import BinaryIO
from typing import (
Sequence,
Expand All @@ -32,6 +30,7 @@
import torch.utils.data
from torchdata.datapipes.iter import IoPathFileLister, IoPathFileOpener, IterDataPipe, ShardingFilter, Shuffler
from torchdata.datapipes.utils import StreamWrapper
from torchvision.prototype.utils._internal import fromfile


__all__ = [
Expand All @@ -46,7 +45,6 @@
"path_accessor",
"path_comparator",
"Decompressor",
"fromfile",
"read_flo",
"hint_sharding",
]
Expand Down Expand Up @@ -267,69 +265,6 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe[Dict[st
return dp


def _read_mutable_buffer_fallback(file: BinaryIO, count: int, item_size: int) -> bytearray:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These functions are not removed, but moved into torchvision.prototype.utils._internal, since they are used both by the datasets to read binary files and by the new features to read the raw bytes of encoded images.

# A plain file.read() will give a read-only bytes, so we convert it to bytearray to make it mutable
return bytearray(file.read(-1 if count == -1 else count * item_size))


def fromfile(
    file: BinaryIO,
    *,
    dtype: torch.dtype,
    byte_order: str,
    count: int = -1,
) -> torch.Tensor:
    """Construct a tensor from a binary file.

    .. note::

        This function is similar to :func:`numpy.fromfile` with two notable differences:

        1. This function only accepts an open binary file, but not a path to it.
        2. This function has an additional ``byte_order`` parameter, since PyTorch's ``dtype``'s do not support that
           concept.

    .. note::

        If the ``file`` was opened in update mode, i.e. "r+b" or "w+b", reading data is much faster. Be aware that as
        long as the file is still open, inplace operations on the returned tensor will reflect back to the file.

    Args:
        file (IO): Open binary file.
        dtype (torch.dtype): Data type of the underlying data as well as of the returned tensor.
        byte_order (str): Byte order of the data. ``"little"`` selects little endian; any other value is treated as
            big endian.
        count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file.

    Returns:
        torch.Tensor: One-dimensional tensor of ``dtype`` in the native byte order of the system.
    """
    # Build the numpy dtype string, e.g. "<i4": byte order + kind + item size in bytes.
    byte_order = "<" if byte_order == "little" else ">"
    char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u")
    item_size = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
    np_dtype = byte_order + char + str(item_size)

    buffer: Union[memoryview, bytearray]
    if platform.system() != "Windows":
        # PyTorch does not support tensors with underlying read-only memory. In case
        # - the file has a .fileno(),
        # - the file was opened for updating, i.e. 'r+b' or 'w+b',
        # - the file is seekable
        # we can avoid copying the data for performance. Otherwise we fall back to simply .read() the data and copy it
        # to a mutable location afterwards.
        try:
            buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :]
            # Reading from the memoryview does not advance the file cursor, so we have to do it manually.
            file.seek(*(0, io.SEEK_END) if count == -1 else (count * item_size, io.SEEK_CUR))
        except (AttributeError, PermissionError, io.UnsupportedOperation):
            # AttributeError covers file-like objects that do not define .fileno() at all; objects that define it
            # but cannot provide a descriptor raise io.UnsupportedOperation instead.
            buffer = _read_mutable_buffer_fallback(file, count, item_size)
    else:
        # On Windows just trying to call mmap.mmap() on a file that does not support it, may corrupt the internal state
        # so no data can be read afterwards. Thus, we simply ignore the possible speed-up.
        buffer = _read_mutable_buffer_fallback(file, count, item_size)

    # We cannot use torch.frombuffer() directly, since it only supports the native byte order of the system. Thus, we
    # read the data with np.frombuffer() with the correct byte order and convert it to the native one with the
    # successive .astype() call.
    return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype, count=count).astype(np_dtype[1:], copy=False))


def read_flo(file: BinaryIO) -> torch.Tensor:
if file.read(4) != b"PIEH":
raise ValueError("Magic number incorrect. Invalid .flo file")
Expand Down
10 changes: 6 additions & 4 deletions torchvision/prototype/features/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from ._bounding_box import BoundingBoxFormat, BoundingBox
from ._feature import Feature, DEFAULT
from ._image import Image, ColorSpace
from ._label import Label
from ._bounding_box import BoundingBox, BoundingBoxFormat
from ._encoded import EncodedData, EncodedImage, EncodedVideo
from ._feature import Feature
from ._image import ColorSpace, Image
from ._label import Label, OneHotLabel
from ._segmentation_mask import SegmentationMask
Loading