openclimatefix · peterdudfield · Oct 11, 2021 · Oct 1, 2021 · Oct 1, 2021 · Oct 1, 2021
diff --git a/conftest.py b/conftest.py
@@ -9,6 +9,7 @@
 from nowcasting_dataset.config.load import load_yaml_configuration
 from nowcasting_dataset.data_sources import SatelliteDataSource
 from nowcasting_dataset.data_sources.gsp.gsp_data_source import GSPDataSource
+from nowcasting_dataset.data_sources.metadata.metadata_data_source import MetadataDataSource
 
 pytest.IMAGE_SIZE_PIXELS = 128
 
@@ -50,6 +51,14 @@ def sat_data_source(sat_filename: Path):
     )
 
 
+@pytest.fixture
+def general_data_source():
+    """MetadataDataSource fixture: 0 history minutes, 5 forecast minutes, GSP as centre object."""
+    return MetadataDataSource(
+        history_minutes=0, forecast_minutes=5, object_at_center="GSP", convert_to_numpy=True
+    )
+
+
 @pytest.fixture
 def gsp_data_source():
     return GSPDataSource(
@@ -65,9 +74,9 @@ def gsp_data_source():
 @pytest.fixture
 def configuration():
     filename = os.path.join(os.path.dirname(nowcasting_dataset.__file__), "config", "gcp.yaml")
-    config = load_yaml_configuration(filename)
+    configuration = load_yaml_configuration(filename)
 
-    return config
+    return configuration
 
 
 @pytest.fixture

diff --git a/notebooks/2021-09/2021-09-07/sat_data.py b/notebooks/2021-09/2021-09-07/sat_data.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 
-from nowcasting_dataset.data_sources.satellite_data_source import SatelliteDataSource
+from nowcasting_dataset.data_sources.satellite.satellite_data_source import SatelliteDataSource
 
 s = SatelliteDataSource(
     filename="gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/"

diff --git a/notebooks/2021-10/2021-10-01/pydantic.py b/notebooks/2021-10/2021-10-01/pydantic.py
@@ -0,0 +1,129 @@
+from pydantic import BaseModel, Field, validator
+from typing import Union
+import numpy as np
+import xarray as xr
+import torch
+from nowcasting_dataset.config.model import Configuration
+
+
+Array = Union[xr.DataArray, np.ndarray, torch.Tensor]
+
+
+class Satellite(BaseModel):
+    """Satellite data. Image shape: [batch_size,] seq_length, width, height, channel."""
+
+    # width: int = Field(..., g=0, description="The width of the satellite image")
+    # height: int = Field(..., g=0, description="The height of the satellite image")
+    # num_channels: int = Field(..., g=0, description="The number of satellite image channels")
+
+    image_data: Array = Field(
+        ...,
+        description="Satellites images. Shape: [batch_size,] seq_length, width, height, channel",
+    )
+    x_coords: Array = Field(
+        ...,
+        description="The x (OSGB geo-spatial) coordinates of the satellite images. Shape: [batch_size,] width",
+    )
+    y_coords: Array = Field(
+        ...,
+        description="The y (OSGB geo-spatial) coordinates of the satellite images. Shape: [batch_size,] height",
+    )
+
+    # @validator("sat_data")
+    # def image_shape(cls, v):
+    #     assert v.shape[-1] == cls.num_channels
+    #     assert v.shape[-2] == cls.height
+    #     assert v.shape[-3] == cls.width
+    #
+    # @validator("x_coords")
+    # def x_coords_shape(cls, v):
+    #     assert v.shape[-1] == cls.width
+    #
+    # @validator("y_coords")
+    # def y_coords_shape(cls, v):
+    #     assert v.shape[-1] == cls.height
+    #
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class Batch(BaseModel):
+    """A batch: its size (constrained to be >= 0) and the satellite data it holds."""
+
+    batch_size: int = Field(
+        ...,
+        ge=0,
+        description="The size of this batch. If the batch size is 0, "
+        "then this item stores one data item",
+    )
+    satellite: Satellite
+
+
+class FakeDataset(torch.utils.data.Dataset):
+    """Fake dataset."""
+
+    def __init__(self, configuration: Union[Configuration, None] = None, length: int = 10):
+        """
+        Init
+
+        Args:
+            configuration: configuration object; a fresh Configuration() is built when None
+            length: length of dataset
+        """
+        if configuration is None:
+            configuration = Configuration()
+        self.batch_size = configuration.process.batch_size
+        # the sequence data in 5 minute steps
+        self.seq_length_5 = configuration.process.seq_len_5_minutes
+        # the sequence data in 30 minute steps
+        self.seq_length_30 = configuration.process.seq_len_30_minutes
+        self.satellite_image_size_pixels = configuration.process.satellite_image_size_pixels
+        self.nwp_image_size_pixels = configuration.process.nwp_image_size_pixels
+        self.number_sat_channels = len(configuration.process.sat_channels)
+        self.number_nwp_channels = len(configuration.process.nwp_channels)
+        self.length = length
+
+    def __len__(self):
+        """Number of pieces of data"""
+        return self.length
+
+    def per_worker_init(self, worker_id: int):
+        """Not needed"""
+        pass
+
+    def __getitem__(self, idx):
+        """
+        Get item, use for iter and next method
+
+        Args:
+            idx: batch index
+
+        Returns: Dictionary of random data
+
+        """
+
+        sat = Satellite(
+            image_data=np.random.randn(
+                self.batch_size,
+                self.seq_length_5,
+                self.satellite_image_size_pixels,
+                self.satellite_image_size_pixels,
+                self.number_sat_channels,
+            ),
+            x_coords=torch.sort(torch.randn(self.batch_size, self.satellite_image_size_pixels))[0],
+            y_coords=torch.sort(
+                torch.randn(self.batch_size, self.satellite_image_size_pixels), descending=True
+            )[0],
+        )
+
+        # Note need to return as nested dict
+        return Batch(satellite=sat, batch_size=self.batch_size).dict()
+
+
+train = torch.utils.data.DataLoader(FakeDataset())
+i = iter(train)
+x = next(i)
+# rebuild the pydantic model from the nested dict yielded by the DataLoader
+x = Batch(**x)
+# IT WORKS
+assert isinstance(x.satellite.image_data, torch.Tensor)
diff --git a/notebooks/2021-10/2021-10-08/no_validation.py b/notebooks/2021-10/2021-10-08/no_validation.py
diff --git a/notebooks/2021-10/2021-10-08/xr_compression.py b/notebooks/2021-10/2021-10-08/xr_compression.py
@@ -0,0 +1,97 @@
+import os
+
+import numpy as np
+import xarray as xr
+from nowcasting_dataset.utils import coord_to_range
+
+
+def get_satellite_xrarray_data_array(
+    batch_size, seq_length_5, satellite_image_size_pixels, number_sat_channels=10
+):
+    """Build a random DataArray with dims (time, x, y, channels); batch_size is not used."""
+    r = np.random.randn(
+        # self.batch_size,
+        seq_length_5,
+        satellite_image_size_pixels,
+        satellite_image_size_pixels,
+        number_sat_channels,
+    )
+
+    time = np.sort(np.random.randn(seq_length_5))
+
+    x_coords = np.sort(np.random.randint(0, 1000, (satellite_image_size_pixels)))
+    y_coords = np.sort(np.random.randint(0, 1000, (satellite_image_size_pixels)))[::-1].copy()
+
+    sat_xr = xr.DataArray(
+        data=r,
+        dims=["time", "x", "y", "channels"],
+        coords=dict(
+            # batch=range(0,self.batch_size),
+            x=list(x_coords),
+            y=list(y_coords),
+            time=list(time),
+            channels=range(0, number_sat_channels),
+        ),
+        attrs=dict(  # NOTE(review): description/units look like placeholders - confirm
+            description="Ambient temperature.",
+            units="degC",
+        ),
+        name="sata_data",
+    )
+
+    return sat_xr
+
+
+def sat_data_array_to_dataset(sat_xr):
+    """
+    Wrap a satellite DataArray in a Dataset and give its dimensions a "sat_" prefix.
+
+    Args:
+        sat_xr: data array; expected to carry "time", "x", "y" and "channels" dimensions
+
+    Returns: dataset with the data stored under "sat_data" and channels/x/y renamed
+    """
+    ds = sat_xr.to_dataset(name="sat_data")
+    # ds["sat_data"] = ds["sat_data"].astype(np.int16)
+
+    for dim in ["time", "x", "y"]:
+        # This does seem like the right way to do it
+        # https://ecco-v4-python-tutorial.readthedocs.io/ECCO_v4_Saving_Datasets_and_DataArrays_to_NetCDF.html
+        ds = coord_to_range(ds, dim, prefix="sat")
+    ds = ds.rename({"channels": "sat_channels", "x": "sat_x", "y": "sat_y"})
+    # ds["sat_x_coords"] = ds["sat_x_coords"].astype(np.int32)
+    # ds["sat_y_coords"] = ds["sat_y_coords"].astype(np.int32)
+    return ds
+
+
+def to_netcdf(batch_xr, local_filename):  # write batch_xr to a netCDF file, lzf on each data var
+    encoding = {name: {"compression": "lzf"} for name in batch_xr.data_vars}
+    batch_xr.to_netcdf(local_filename, engine="h5netcdf", mode="w", encoding=encoding)
+
+
+# 1. try to save netcdf files without using the coord_to_range function
+sat_xrs = [get_satellite_xrarray_data_array(4, 19, 32) for _ in range(0, 10)]
+
+### error ###
+# can't do this step as the x/y index has duplicate values
+sat_dataset = xr.merge(sat_xrs)
+to_netcdf(sat_dataset, "test_no_alignment.nc")
+###
+
+# but can save it as separate files
+os.mkdir("test_no_alignment")
+[sat_xrs[i].to_netcdf(f"test_no_alignment/{i}.nc", engine="h5netcdf") for i in range(0, 10)]
+# 10 files about 1.5MB
+
+# 2. convert each array with sat_data_array_to_dataset, then concatenate along "example"
+sat_xrs = [get_satellite_xrarray_data_array(4, 19, 32) for _ in range(0, 10)]
+sat_xrs = [sat_data_array_to_dataset(sat_xr) for sat_xr in sat_xrs]
+
+sat_dataset = xr.concat(sat_xrs, dim="example")
+to_netcdf(sat_dataset, "test_alignment.nc")
+# this is 15 MB
+
+
+# conclusion
+# no major improvement in compression by joining datasets together, but joining arrays together
+# does make it easier to get the array ready for ML
diff --git a/notebooks/2021-10/2021-10-08/xr_pydantic.py b/notebooks/2021-10/2021-10-08/xr_pydantic.py
@@ -0,0 +1,99 @@
+from pydantic import BaseModel, Field, validator
+from typing import Union, List
+import numpy as np
+import xarray as xr
+import torch
+from nowcasting_dataset.config.model import Configuration
+
+
+Array = Union[xr.DataArray, np.ndarray, torch.Tensor]
+
+
+class Satellite(BaseModel):  # pydantic model holding one xr.DataArray of satellite images
+    # Shape: [batch_size,] seq_length, width, height, channel
+    image_data: xr.DataArray = Field(
+        ...,
+        description="Satellites images. Shape: [batch_size,] seq_length, width, height, channel",
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    @validator("image_data")
+    def v_image_data(cls, v):  # no real check: prints a marker and returns v unchanged
+        print("validating image data")
+        return v
+
+
+class Batch(BaseModel):  # batch size plus a nested Satellite model
+
+    batch_size: int = 0
+    satellite: Satellite
+
+    @validator("batch_size")
+    def v_image_data(cls, v):  # validates batch_size (despite the name); prints and returns v
+        print("validating batch size")
+        return v
+
+
+s = Satellite(image_data=xr.DataArray())
+s_dict = s.dict()
+# rebuild from the dict, first via the normal constructor and then via construct()
+x = Satellite(**s_dict)
+x = Satellite.construct(Satellite.__fields_set__, **s_dict)
+
+
+batch = Batch(batch_size=5, satellite=s)
+
+b_dict = batch.dict()
+# same round trip for the nested Batch model
+x = Batch(**b_dict)
+x = Batch.construct(Batch.__fields_set__, **b_dict)
+
+
+# class Satellite(BaseModel):
+#
+#     image_data: xr.DataArray
+#
+#     # validate
+#
+#     def to_dataset(self):
+#         pass
+#
+#     def from_dataset(self):
+#         pass
+#
+#     def to_numpy(self) -> SatelliteNumpy:
+#         pass
+#
+#
+# class SatelliteNumpy(BaseModel):
+#
+#     image_data: np.ndarray
+#     x: np.ndarray
+#     # more
+#
+#
+# class Example(BaseModel):
+#
+#     satellite: Satellite
+#     # more
+#
+#
+# class Batch(BaseModel):
+#
+#     batch_size: int = 0
+#     examples: List[Example]
+#
+#     def to/from_netcdf():
+#         pass
+#
+#
+# class BatchNumpy(BaseModel):
+#
+#     batch_size: int = 0
+#     satellite: SatelliteNumpy
+#     # more
+#
+#     def from_batch(self) -> BatchNumpy:
+#         """ change to Batch numpy structure """
diff --git a/nowcasting_dataset/config/gcp.yaml b/nowcasting_dataset/config/gcp.yaml
@@ -6,10 +6,10 @@ input_data:
   satellite_zarr_path: gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr
   solar_pv_data_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc
   solar_pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv
-  gsp_zarr_path: gs://solar-pv-nowcasting-data/PV/PVOutput.org/PV/GSP/v1/pv_gsp.zarr
+  gsp_zarr_path: gs://solar-pv-nowcasting-data/PV/GSP/v1/pv_gsp.zarr
   topographic_filename: gs://solar-pv-nowcasting-data/Topographic/europe_dem_1km_osgb.tif
 output_data:
-  filepath: gs://solar-pv-nowcasting-data/prepared_ML_training_data/v6/
+  filepath: gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/
 process:
   local_temp_path: ~/temp/
   seed: 1234