Merged
32 commits
a2c0492
feat: Refactored Dataset by removing intermediate layers
Dec 16, 2020
b179439
Added image_dataset and tabular_dataset subclass
Dec 21, 2020
4a605b7
Moved metadata_schema_uri responsibility to subclass to enable foreca…
Dec 22, 2020
17cedc3
Moved validation logic for tabular into Dataset._create_tabular
Dec 22, 2020
a5497dc
Added validation in image_dataset and fixed bounding_box schema error
Dec 22, 2020
2c0abdb
Removed import_config
Dec 22, 2020
7a5b765
Fixed metadata_schema_uri
Dec 23, 2020
eb0fbf2
Fixed import and subclasses
Dec 23, 2020
36b24ca
Added EmptyNontabularDatasource
Dec 23, 2020
ab5b94d
change import_metadata to ioformat
morgandu Dec 28, 2020
18845ec
added datasources.py
morgandu Dec 28, 2020
25c8a7e
added support of multiple gcs_sources
morgandu Dec 29, 2020
0ec3f2f
fix: default (empty) dataset_metadata need to be set to {}, not None
morgandu Dec 29, 2020
905a2cf
1) imported datasources 2) added _support_metadata_schema_uris and _s…
morgandu Dec 29, 2020
28be2b8
added image_dataset.py and tabular_dataset.py
morgandu Dec 29, 2020
b4fda59
fix: refactor - create datasets modeule
morgandu Dec 29, 2020
b23e47e
fix: cleanup __init__.py
morgandu Dec 29, 2020
4000624
fix: data_item_labels
morgandu Dec 29, 2020
9213ac5
fix: docstring
morgandu Dec 29, 2020
ea5ad2e
fix:
morgandu Dec 30, 2020
9eb9945
fix: import the module instead of the classes for datasources
morgandu Jan 5, 2021
34cffdc
fix: removed all validation for import_schema_uri
morgandu Jan 5, 2021
1d2e89f
fix: set parameter default to immutable
morgandu Jan 5, 2021
760f3e5
fix: replaced Datasource / DatasourceImportable abstract class instea…
morgandu Jan 5, 2021
cb180b5
fix: added examples for gcs_source
morgandu Jan 5, 2021
929c81d
fix:
morgandu Jan 6, 2021
88cc8bd
fix: remove all labels
morgandu Jan 6, 2021
fdc329b
fix: remove Optional in docstring, add example for bq_source
morgandu Jan 7, 2021
edab913
test: add import_data raise for tabular dataset test
morgandu Jan 7, 2021
4c83478
fix: refactor datasource creation with create_datasource
morgandu Jan 7, 2021
4d47fe3
fix: lint
morgandu Jan 7, 2021
6f51faa
Merge branch 'dev' into mor--dataset-refactor-datasource
morgandu Jan 7, 2021
8 changes: 7 additions & 1 deletion google/cloud/aiplatform/__init__.py
@@ -18,7 +18,11 @@
 from google.cloud.aiplatform import gapic

 from google.cloud.aiplatform import initializer
-from google.cloud.aiplatform.datasets import Dataset
+from google.cloud.aiplatform.datasets import (
+    Dataset,
+    TabularDataset,
+    ImageDataset,
+)
 from google.cloud.aiplatform.models import Endpoint
 from google.cloud.aiplatform.models import Model
 from google.cloud.aiplatform.jobs import BatchPredictionJob
Expand All @@ -42,5 +46,7 @@
"AutoMLTabularTrainingJob",
"Model",
"Dataset",
"TabularDataset",
"ImageDataset",
"Endpoint",
)
26 changes: 26 additions & 0 deletions google/cloud/aiplatform/datasets/__init__.py
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from google.cloud.aiplatform.datasets.dataset import Dataset
from google.cloud.aiplatform.datasets.tabular_dataset import TabularDataset
from google.cloud.aiplatform.datasets.image_dataset import ImageDataset

__all__ = (
    "Dataset",
    "TabularDataset",
    "ImageDataset",
)
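The same classes can also be imported from the datasets subpackage, and, per the commit history ("Added image_dataset and tabular_dataset subclass"), TabularDataset and ImageDataset subclass Dataset. A quick illustrative check:

```python
from google.cloud.aiplatform import datasets

# Per the commit history, the specialized classes subclass Dataset.
assert issubclass(datasets.TabularDataset, datasets.Dataset)
assert issubclass(datasets.ImageDataset, datasets.Dataset)
```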
217 changes: 217 additions & 0 deletions google/cloud/aiplatform/datasets/_datasources.py
@@ -0,0 +1,217 @@
import abc
from typing import Optional, Dict, Sequence, Union
from google.cloud.aiplatform_v1beta1.types import io as gca_io
from google.cloud.aiplatform_v1beta1.types import dataset as gca_dataset

from google.cloud.aiplatform import schema


class Datasource(abc.ABC):
    """An abstract class that sets dataset_metadata"""

    @property
    @abc.abstractmethod
    def dataset_metadata(self):
        """Dataset Metadata."""
        pass


class DatasourceImportable(abc.ABC):
    """An abstract class that sets import_data_config"""

    @property
    @abc.abstractmethod
    def import_data_config(self):
        """Import Data Config."""
        pass


class TabularDatasource(Datasource):
    """Datasource for creating a tabular dataset for AI Platform"""

    def __init__(
        self,
        gcs_source: Optional[Union[str, Sequence[str]]] = None,
        bq_source: Optional[str] = None,
    ):
        """Creates a tabular datasource.

        Args:
            gcs_source (Union[str, Sequence[str]]):
                Cloud Storage URI of one or more files. Only CSV files are supported.
                The first line of the CSV file is used as the header.
                If there are multiple files, the header is the first line of
                the lexicographically first file; the other files must either
                contain the exact same header or omit the header.
                examples:
                    str: "gs://bucket/file.csv"
                    Sequence[str]: ["gs://bucket/file1.csv", "gs://bucket/file2.csv"]
            bq_source (str):
                The URI of a BigQuery table.
                example:
                    "bq://project.dataset.table_name"

        Raises:
            ValueError: If the source configuration is not valid.
        """

        dataset_metadata = None

        if gcs_source and isinstance(gcs_source, str):
            gcs_source = [gcs_source]

        if gcs_source and bq_source:
            raise ValueError("Only one of gcs_source or bq_source can be set.")

        if not any([gcs_source, bq_source]):
            raise ValueError("One of gcs_source or bq_source must be set.")

        if gcs_source:
            dataset_metadata = {"input_config": {"gcs_source": {"uri": gcs_source}}}
        elif bq_source:
            dataset_metadata = {"input_config": {"bigquery_source": {"uri": bq_source}}}

        self._dataset_metadata = dataset_metadata

    @property
    def dataset_metadata(self) -> Optional[Dict]:
        """Dataset Metadata."""
        return self._dataset_metadata


class NonTabularDatasource(Datasource):
    """Datasource for creating an empty non-tabular dataset for AI Platform"""

    @property
    def dataset_metadata(self) -> Optional[Dict]:
        return None


class NonTabularDatasourceImportable(NonTabularDatasource, DatasourceImportable):
    """Datasource for creating a non-tabular dataset for AI Platform and importing data to the dataset"""

    def __init__(
        self,
        gcs_source: Union[str, Sequence[str]],
        import_schema_uri: str,
        data_item_labels: Optional[Dict] = None,
    ):
        """Creates a non-tabular datasource.

        Args:
            gcs_source (Union[str, Sequence[str]]):
                Required. The Google Cloud Storage location for the input content.
                Google Cloud Storage URI(-s) to the input file(s). May contain
                wildcards. For more information on wildcards, see
                https://cloud.google.com/storage/docs/gsutil/addlhelp/WildcardNames.
                examples:
                    str: "gs://bucket/file.csv"
                    Sequence[str]: ["gs://bucket/file1.csv", "gs://bucket/file2.csv"]
            import_schema_uri (str):
                Required. Points to a YAML file stored on Google Cloud
                Storage describing the import format. Validation will be
                done against the schema. The schema is defined as an
                OpenAPI 3.0.2 Schema Object.
            data_item_labels (Dict):
                Labels that will be applied to newly imported DataItems. If
                an identical DataItem already exists in the Dataset, these
                labels will be appended to those of the existing one; if a
                label with an identical key was imported before, the old
                label value will be overwritten. If two DataItems are
                identical in the same import data operation, the labels will
                be combined, and if a key collision happens in this case, one
                of the values will be picked randomly. Two DataItems are
                considered identical if their content bytes are identical
                (e.g. image bytes or pdf bytes). These labels will be
                overridden by Annotation labels specified inside the index
                file referenced by ``import_schema_uri``, e.g. a JSONL file.
        """
        super().__init__()
        self._gcs_source = [gcs_source] if isinstance(gcs_source, str) else gcs_source
        self._import_schema_uri = import_schema_uri
        self._data_item_labels = data_item_labels

    @property
    def import_data_config(self) -> gca_dataset.ImportDataConfig:
        """Import Data Config."""
        return gca_dataset.ImportDataConfig(
            gcs_source=gca_io.GcsSource(uris=self._gcs_source),
            import_schema_uri=self._import_schema_uri,
            data_item_labels=self._data_item_labels,
        )


def create_datasource(
    metadata_schema_uri: str,
    import_schema_uri: Optional[str] = None,
    gcs_source: Optional[Union[str, Sequence[str]]] = None,
    bq_source: Optional[str] = None,
    data_item_labels: Optional[Dict] = None,
) -> Datasource:
    """Creates a datasource.

    Args:
        metadata_schema_uri (str):
            Required. Points to a YAML file stored on Google Cloud Storage
            describing additional information about the Dataset. The schema
            is defined as an OpenAPI 3.0.2 Schema Object. The schema files
            that can be used here are found in
            gs://google-cloud-aiplatform/schema/dataset/metadata/.
        import_schema_uri (str):
            Points to a YAML file stored on Google Cloud
            Storage describing the import format. Validation will be
            done against the schema. The schema is defined as an
            OpenAPI 3.0.2 Schema Object.
        gcs_source (Union[str, Sequence[str]]):
            The Google Cloud Storage location for the input content.
            Google Cloud Storage URI(-s) to the input file(s). May contain
            wildcards. For more information on wildcards, see
            https://cloud.google.com/storage/docs/gsutil/addlhelp/WildcardNames.
            examples:
                str: "gs://bucket/file.csv"
                Sequence[str]: ["gs://bucket/file1.csv", "gs://bucket/file2.csv"]
        bq_source (str):
            BigQuery URI to the input table.
            example:
                "bq://project.dataset.table_name"
        data_item_labels (Dict):
            Labels that will be applied to newly imported DataItems. If
            an identical DataItem already exists in the Dataset, these
            labels will be appended to those of the existing one; if a
            label with an identical key was imported before, the old
            label value will be overwritten. If two DataItems are
            identical in the same import data operation, the labels will
            be combined, and if a key collision happens in this case, one
            of the values will be picked randomly. Two DataItems are
            considered identical if their content bytes are identical
            (e.g. image bytes or pdf bytes). These labels will be
            overridden by Annotation labels specified inside the index
            file referenced by ``import_schema_uri``, e.g. a JSONL file.

    Returns:
        datasource (Datasource)

    Raises:
        ValueError: If one of the following occurs:
            - import_schema_uri is specified when creating a TabularDatasource
            - only one of import_schema_uri and gcs_source is provided when
              creating a NonTabularDatasourceImportable
    """

    if metadata_schema_uri == schema.dataset.metadata.tabular:
        if import_schema_uri:
            raise ValueError("tabular dataset does not support data import.")
        return TabularDatasource(gcs_source, bq_source)

    if not import_schema_uri and not gcs_source:
        return NonTabularDatasource()
    elif import_schema_uri and gcs_source:
        return NonTabularDatasourceImportable(
            gcs_source, import_schema_uri, data_item_labels
        )
    else:
        raise ValueError(
            "nontabular dataset requires both import_schema_uri and gcs_source for data import."
        )
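For reference, a hedged usage sketch of the selection logic in create_datasource. This module is private and is normally driven by Dataset.create and its subclasses; the BigQuery table and GCS path below are placeholders, and the schema.dataset.metadata.image and schema.dataset.ioformat constants are assumed to exist in the schema helper rather than shown in this diff:

```python
from google.cloud.aiplatform import schema
from google.cloud.aiplatform.datasets import _datasources

# Tabular: the source goes into dataset_metadata; no import step is involved.
tabular_source = _datasources.create_datasource(
    metadata_schema_uri=schema.dataset.metadata.tabular,
    bq_source="bq://my-project.my_dataset.my_table",  # placeholder table
)
print(tabular_source.dataset_metadata)

# Non-tabular with data import: both gcs_source and import_schema_uri are required.
image_source = _datasources.create_datasource(
    metadata_schema_uri=schema.dataset.metadata.image,  # assumed metadata schema constant
    import_schema_uri=schema.dataset.ioformat.image.single_label_classification,  # assumed ioformat constant
    gcs_source="gs://my-bucket/annotations.jsonl",  # placeholder path
)
print(image_source.import_data_config)
```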