Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 04d4fbb

Browse files
committed
Finish first rough draft of Manager.create_batches()
1 parent b2387f3 commit 04d4fbb

File tree

5 files changed

+81
-46
lines changed

5 files changed

+81
-46
lines changed

nowcasting_dataset/data_sources/datasource_output.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,12 @@
44
import logging
55
import os
66
from pathlib import Path
7-
from typing import List
87

98
import numpy as np
10-
from pydantic import BaseModel, Field
119

10+
# nowcasting_dataset imports
1211
from nowcasting_dataset.dataset.xr_utils import PydanticXArrayDataSet
13-
from nowcasting_dataset.filesystem.utils import make_folder
12+
from nowcasting_dataset.filesystem.utils import makedirs
1413
from nowcasting_dataset.utils import get_netcdf_filename
1514

1615
logger = logging.getLogger(__name__)
@@ -43,8 +42,8 @@ def save_netcdf(self, batch_i: int, path: Path):
4342
# make folder
4443
folder = os.path.join(path, name)
4544
if batch_i == 0:
46-
# only need to make the folder once, or check that there folder is there once
47-
make_folder(path=folder)
45+
# only need to make the folder once, or check that the folder is there once
46+
makedirs(path=folder)
4847

4948
# make file
5049
local_filename = os.path.join(folder, filename)

nowcasting_dataset/filesystem/utils.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,16 @@ def upload_one_file(
146146
filesystem.put(local_filename, remote_filename)
147147

148148

149-
def make_folder(path: Union[str, Path]):
150-
""" Make folder """
149+
def makedirs(path: Union[str, Path], exist_ok: bool = True) -> None:
150+
"""Recursively make directories
151+
152+
Creates directory at path and any intervening required directories.
153+
154+
Raises exception if, for instance, the path already exists but is a file.
155+
156+
Args:
157+
path: The path to create.
158+
exist_ok: If False then raise an exception if `path` already exists.
159+
"""
151160
filesystem = fsspec.open(path).fs
152-
if not filesystem.exists(path):
153-
filesystem.mkdir(path)
161+
filesystem.mkdir(path, exist_ok=exist_ok)

nowcasting_dataset/manager.py

Lines changed: 53 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
from pathlib import Path
55
from typing import Optional, Union
66

7-
import fsspec
7+
import futures
88
import numpy as np
99
import pandas as pd
1010

11+
# nowcasting_dataset imports
1112
import nowcasting_dataset.time as nd_time
1213
import nowcasting_dataset.utils as nd_utils
1314
from nowcasting_dataset import config
@@ -102,26 +103,6 @@ def initialise_data_sources(
102103
" data_source_which_defines_geospatial_locations."
103104
)
104105

105-
def make_directories_if_necessary(self) -> None:
106-
"""Make dirs: `<output_data.filepath> / <split_name> / <data_source_name>`.
107-
108-
Also make `local_temp_path` if necessary.
109-
110-
Works on any compute environment.
111-
"""
112-
filesystem = fsspec.open(self.config.output_data.filepath).fs
113-
for split_name in split.SplitName:
114-
for data_source_name in self.data_sources.keys():
115-
path = self.config.output_data.filepath / split_name.value / data_source_name
116-
logger.info(f"Making {path} if necessary...")
117-
filesystem.mkdirs(path, exist_ok=True)
118-
119-
if self.save_batches_locally_and_upload:
120-
logger.info(f"Making {self.local_temp_path} if necessary...")
121-
filesystem.mkdirs(self.local_temp_path, exist_ok=True)
122-
logger.info(f"Deleting all files in {self.local_temp_path}...")
123-
nd_fs_utils.delete_all_files_in_temp_path(path=self.local_temp_path)
124-
125106
def create_files_specifying_spatial_and_temporal_locations_of_each_example_if_necessary(
126107
self,
127108
) -> None:
@@ -316,6 +297,10 @@ def _find_splits_which_need_more_batches(
316297
def create_batches(self, overwrite_batches: bool) -> None:
317298
"""Create batches (if necessary).
318299
300+
Make dirs: `<output_data.filepath> / <split_name> / <data_source_name>`.
301+
302+
Also make `local_temp_path` if necessary.
303+
319304
Args:
320305
overwrite_batches: If True then start from batch 0, regardless of which batches have
321306
previously been written to disk. If False then check which batches have previously been
@@ -335,7 +320,7 @@ def create_batches(self, overwrite_batches: bool) -> None:
335320
return
336321

337322
# Load locations for each example off disk.
338-
locations_for_each_example_for_each_split: dict[split.SplitName, pd.DataFrame] = {}
323+
locations_for_each_example_of_each_split: dict[split.SplitName, pd.DataFrame] = {}
339324
for split_name in splits_which_need_more_batches:
340325
filename = self._filename_of_locations_csv_file(split_name.value)
341326
logger.info(f"Loading {filename}.")
@@ -345,7 +330,51 @@ def create_batches(self, overwrite_batches: bool) -> None:
345330
locations_for_each_example["t0_datetime_UTC"] = pd.to_datetime(
346331
locations_for_each_example["t0_datetime_UTC"]
347332
)
348-
locations_for_each_example_for_each_split[split_name] = locations_for_each_example
333+
locations_for_each_example_of_each_split[split_name] = locations_for_each_example
349334

350-
# TODO: Fire up a separate process for each DataSource, and pass it a list of batches to
335+
# Fire up a separate process for each DataSource, and pass it a list of batches to
351336
# create, and whether to utils.upload_and_delete_local_files().
337+
# TODO: Split this up into separate functions!!!
338+
n_data_sources = len(self.data_sources)
339+
for split_name in splits_which_need_more_batches:
340+
locations_for_split = locations_for_each_example_of_each_split[split_name]
341+
with futures.ProcessPoolExecutor(max_workers=n_data_sources) as executor:
342+
future_create_batches_jobs = []
343+
for worker_id, (data_source_name, data_source) in enumerate(
344+
self.data_sources.items()
345+
):
346+
# Get indexes of first batch and example; and subset locations_for_split.
347+
idx_of_first_batch = first_batches_to_create[split_name][data_source_name]
348+
idx_of_first_example = idx_of_first_batch * self.config.process.batch_size
349+
locations = locations_for_split.loc[idx_of_first_example:]
350+
351+
# Get paths.
352+
dst_path = (
353+
self.config.output_data.filepath / split_name.value / data_source_name
354+
)
355+
temp_path = (
356+
self.temp_path / split_name.value / data_source_name / f"worker_{worker_id}"
357+
)
358+
359+
# Make folders.
360+
nd_fs_utils.makedirs(dst_path, exist_ok=True)
361+
if self.save_batches_locally_and_upload:
362+
nd_fs_utils.makedirs(temp_path, exist_ok=True)
363+
364+
# Submit data_source.create_batches task to the worker process.
365+
future = executor.submit(
366+
data_source.create_batches,
367+
spatial_and_temporal_locations_of_each_example=locations,
368+
idx_of_first_batch=idx_of_first_batch,
369+
batch_size=self.config.process.batch_size,
370+
dst_path=dst_path,
371+
temp_path=temp_path,
372+
upload_every_n_batches=self.config.process.upload_every_n_batches,
373+
)
374+
future_create_batches_jobs.append(future)
375+
376+
# Wait for all futures to finish:
377+
for future in future_create_batches_jobs:
378+
# Call exception() to propagate any exceptions raised by the worker process into
379+
# the main process, and to wait for the worker to finish.
380+
future.exception()

scripts/prepare_ml_data.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,8 @@ def main(config_filename: str, data_source: list[str], overwrite_batches: bool):
5959
manager = Manager()
6060
manager.load_yaml_configuration(config_filename)
6161
manager.initialise_data_sources(names_of_selected_data_sources=data_source)
62-
manager.make_directories_if_necessary()
6362
manager.create_files_specifying_spatial_and_temporal_locations_of_each_example_if_necessary()
6463
manager.create_batches(overwrite_batches)
65-
# TODO: Wait for all processes to complete.
6664
# TODO: save_yaml_configuration(config)
6765
# TODO: Validate ML data.
6866

tests/filesystem/test_local.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# noqa: D100
12
import os
23
import tempfile
34
from pathlib import Path
@@ -7,12 +8,12 @@
78
delete_all_files_in_temp_path,
89
download_to_local,
910
get_all_filenames_in_path,
10-
make_folder,
11+
makedirs,
1112
upload_one_file,
1213
)
1314

1415

15-
def test_check_file_exists():
16+
def test_check_file_exists(): # noqa: D103
1617

1718
file1 = "test_file1.txt"
1819
file2 = "test_dir/test_file2.txt"
@@ -27,15 +28,15 @@ def test_check_file_exists():
2728

2829
# add fake file to dir
2930
os.mkdir(f"{tmpdirname}/test_dir")
30-
path_and_filename_2 = os.path.join(local_path, file2)
31+
_ = os.path.join(local_path, file2)
3132
with open(os.path.join(local_path, file2), "w"):
3233
pass
3334

3435
# run function
3536
check_path_exists(path=f"{tmpdirname}/test_dir")
3637

3738

38-
def test_make_folder():
39+
def test_makedirs(): # noqa: D103
3940

4041
folder_1 = "test_dir_1"
4142
folder_2 = "test_dir_2"
@@ -48,7 +49,7 @@ def test_make_folder():
4849
folder_2 = os.path.join(local_path, folder_2)
4950

5051
# use the make folder function
51-
make_folder(folder_1)
52+
makedirs(folder_1)
5253
check_path_exists(path=folder_1)
5354

5455
# make a folder
@@ -58,7 +59,7 @@ def test_make_folder():
5859
check_path_exists(path=folder_2)
5960

6061

61-
def test_delete_local_files():
62+
def test_delete_local_files(): # noqa: D103
6263

6364
file1 = "test_file1.txt"
6465
folder1 = "test_dir"
@@ -88,7 +89,7 @@ def test_delete_local_files():
8889
assert os.path.exists(path_and_folder_1)
8990

9091

91-
def test_delete_local_files_and_folder():
92+
def test_delete_local_files_and_folder(): # noqa: D103
9293

9394
file1 = "test_file1.txt"
9495
folder1 = "test_dir"
@@ -118,7 +119,7 @@ def test_delete_local_files_and_folder():
118119
assert not os.path.exists(path_and_folder_1)
119120

120121

121-
def test_download():
122+
def test_download(): # noqa: D103
122123

123124
file1 = "test_file1.txt"
124125
file2 = "test_dir/test_file2.txt"
@@ -134,7 +135,7 @@ def test_download():
134135

135136
# add fake file to dir
136137
os.mkdir(f"{tmpdirname}/test_dir")
137-
path_and_filename_2 = os.path.join(local_path, file2)
138+
_ = os.path.join(local_path, file2)
138139
with open(os.path.join(local_path, file2), "w"):
139140
pass
140141

@@ -147,7 +148,7 @@ def test_download():
147148
assert len(filenames) == 3
148149

149150

150-
def test_upload():
151+
def test_upload(): # noqa: D103
151152

152153
file1 = "test_file1.txt"
153154
file2 = "test_dir/test_file2.txt"
@@ -163,7 +164,7 @@ def test_upload():
163164

164165
# add fake file to dir
165166
os.mkdir(f"{tmpdirname}/test_dir")
166-
path_and_filename_2 = os.path.join(local_path, file2)
167+
_ = os.path.join(local_path, file2)
167168
with open(os.path.join(local_path, file2), "w"):
168169
pass
169170

0 commit comments

Comments
 (0)