Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 04d4fbb

Browse files
committed
Finish first rough draft of Manager.create_batches()
1 parent b2387f3 commit 04d4fbb

File tree

5 files changed

+81
-46
lines changed

5 files changed

+81
-46
lines changed

nowcasting_dataset/data_sources/datasource_output.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,12 @@
44
import logging
55
import os
66
from pathlib import Path
7-
from typing import List
87

98
import numpy as np
10-
from pydantic import BaseModel, Field
119

10+
# nowcasting_dataset imports
1211
from nowcasting_dataset.dataset.xr_utils import PydanticXArrayDataSet
13-
from nowcasting_dataset.filesystem.utils import make_folder
12+
from nowcasting_dataset.filesystem.utils import makedirs
1413
from nowcasting_dataset.utils import get_netcdf_filename
1514

1615
logger = logging.getLogger(__name__)
@@ -43,8 +42,8 @@ def save_netcdf(self, batch_i: int, path: Path):
4342
# make folder
4443
folder = os.path.join(path, name)
4544
if batch_i == 0:
46-
# only need to make the folder once, or check that there folder is there once
47-
make_folder(path=folder)
45+
# only need to make the folder once, or check that the folder is there once
46+
makedirs(path=folder)
4847

4948
# make file
5049
local_filename = os.path.join(folder, filename)

nowcasting_dataset/filesystem/utils.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,16 @@ def upload_one_file(
146146
filesystem.put(local_filename, remote_filename)
147147

148148

149-
def make_folder(path: Union[str, Path]):
150-
""" Make folder """
149+
def makedirs(path: Union[str, Path], exist_ok: bool = True) -> None:
150+
"""Recursively make directories
151+
152+
Creates directory at path and any intervening required directories.
153+
154+
Raises exception if, for instance, the path already exists but is a file.
155+
156+
Args:
157+
path: The path to create.
158+
exist_ok: If False then raise an exception if `path` already exists.
159+
"""
151160
filesystem = fsspec.open(path).fs
152-
if not filesystem.exists(path):
153-
filesystem.mkdir(path)
161+
filesystem.mkdir(path, exist_ok=exist_ok)

nowcasting_dataset/manager.py

Lines changed: 53 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
from pathlib import Path
55
from typing import Optional, Union
66

7-
import fsspec
7+
import futures
88
import numpy as np
99
import pandas as pd
1010

11+
# nowcasting_dataset imports
1112
import nowcasting_dataset.time as nd_time
1213
import nowcasting_dataset.utils as nd_utils
1314
from nowcasting_dataset import config
@@ -102,26 +103,6 @@ def initialise_data_sources(
102103
" data_source_which_defines_geospatial_locations."
103104
)
104105

105-
def make_directories_if_necessary(self) -> None:
106-
"""Make dirs: `<output_data.filepath> / <split_name> / <data_source_name>`.
107-
108-
Also make `local_temp_path` if necessary.
109-
110-
Works on any compute environment.
111-
"""
112-
filesystem = fsspec.open(self.config.output_data.filepath).fs
113-
for split_name in split.SplitName:
114-
for data_source_name in self.data_sources.keys():
115-
path = self.config.output_data.filepath / split_name.value / data_source_name
116-
logger.info(f"Making {path} if necessary...")
117-
filesystem.mkdirs(path, exist_ok=True)
118-
119-
if self.save_batches_locally_and_upload:
120-
logger.info(f"Making {self.local_temp_path} if necessary...")
121-
filesystem.mkdirs(self.local_temp_path, exist_ok=True)
122-
logger.info(f"Deleting all files in {self.local_temp_path}...")
123-
nd_fs_utils.delete_all_files_in_temp_path(path=self.local_temp_path)
124-
125106
def create_files_specifying_spatial_and_temporal_locations_of_each_example_if_necessary(
126107
self,
127108
) -> None:
@@ -316,6 +297,10 @@ def _find_splits_which_need_more_batches(
316297
def create_batches(self, overwrite_batches: bool) -> None:
317298
"""Create batches (if necessary).
318299
300+
Make dirs: `<output_data.filepath> / <split_name> / <data_source_name>`.
301+
302+
Also make `local_temp_path` if necessary.
303+
319304
Args:
320305
overwrite_batches: If True then start from batch 0, regardless of which batches have
321306
previously been written to disk. If False then check which batches have previously been
@@ -335,7 +320,7 @@ def create_batches(self, overwrite_batches: bool) -> None:
335320
return
336321

337322
# Load locations for each example off disk.
338-
locations_for_each_example_for_each_split: dict[split.SplitName, pd.DataFrame] = {}
323+
locations_for_each_example_of_each_split: dict[split.SplitName, pd.DataFrame] = {}
339324
for split_name in splits_which_need_more_batches:
340325
filename = self._filename_of_locations_csv_file(split_name.value)
341326
logger.info(f"Loading {filename}.")
@@ -345,7 +330,51 @@ def create_batches(self, overwrite_batches: bool) -> None:
345330
locations_for_each_example["t0_datetime_UTC"] = pd.to_datetime(
346331
locations_for_each_example["t0_datetime_UTC"]
347332
)
348-
locations_for_each_example_for_each_split[split_name] = locations_for_each_example
333+
locations_for_each_example_of_each_split[split_name] = locations_for_each_example
349334

350-
# TODO: Fire up a separate process for each DataSource, and pass it a list of batches to
335+
# Fire up a separate process for each DataSource, and pass it a list of batches to
351336
# create, and whether to utils.upload_and_delete_local_files().
337+
# TODO: Split this up into separate functions!!!
338+
n_data_sources = len(self.data_sources)
339+
for split_name in splits_which_need_more_batches:
340+
locations_for_split = locations_for_each_example_of_each_split[split_name]
341+
with futures.ProcessPoolExecutor(max_workers=n_data_sources) as executor:
342+
future_create_batches_jobs = []
343+
for worker_id, (data_source_name, data_source) in enumerate(
344+
self.data_sources.items()
345+
):
346+
# Get indexes of first batch and example; and subset locations_for_split.
347+
idx_of_first_batch = first_batches_to_create[split_name][data_source_name]
348+
idx_of_first_example = idx_of_first_batch * self.config.process.batch_size
349+
locations = locations_for_split.loc[idx_of_first_example:]
350+
351+
# Get paths.
352+
dst_path = (
353+
self.config.output_data.filepath / split_name.value / data_source_name
354+
)
355+
temp_path = (
356+
self.temp_path / split_name.value / data_source_name / f"worker_{worker_id}"
357+
)
358+
359+
# Make folders.
360+
nd_fs_utils.makedirs(dst_path, exist_ok=True)
361+
if self.save_batches_locally_and_upload:
362+
nd_fs_utils.makedirs(temp_path, exist_ok=True)
363+
364+
# Submit data_source.create_batches task to the worker process.
365+
future = executor.submit(
366+
data_source.create_batches,
367+
spatial_and_temporal_locations_of_each_example=locations,
368+
idx_of_first_batch=idx_of_first_batch,
369+
batch_size=self.config.process.batch_size,
370+
dst_path=dst_path,
371+
temp_path=temp_path,
372+
upload_every_n_batches=self.config.process.upload_every_n_batches,
373+
)
374+
future_create_batches_jobs.append(future)
375+
376+
# Wait for all futures to finish:
377+
for future in future_create_batches_jobs:
378+
# Call exception() to propagate any exceptions raised by the worker process into
379+
# the main process, and to wait for the worker to finish.
380+
future.exception()

scripts/prepare_ml_data.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,8 @@ def main(config_filename: str, data_source: list[str], overwrite_batches: bool):
5959
manager = Manager()
6060
manager.load_yaml_configuration(config_filename)
6161
manager.initialise_data_sources(names_of_selected_data_sources=data_source)
62-
manager.make_directories_if_necessary()
6362
manager.create_files_specifying_spatial_and_temporal_locations_of_each_example_if_necessary()
6463
manager.create_batches(overwrite_batches)
65-
# TODO: Wait for all processes to complete.
6664
# TODO: save_yaml_configuration(config)
6765
# TODO: Validate ML data.
6866

tests/filesystem/test_local.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# noqa: D100
12
import os
23
import tempfile
34
from pathlib import Path
@@ -7,12 +8,12 @@
78
delete_all_files_in_temp_path,
89
download_to_local,
910
get_all_filenames_in_path,
10-
make_folder,
11+
makedirs,
1112
upload_one_file,
1213
)
1314

1415

15-
def test_check_file_exists():
16+
def test_check_file_exists(): # noqa: D103
1617

1718
file1 = "test_file1.txt"
1819
file2 = "test_dir/test_file2.txt"
@@ -27,15 +28,15 @@ def test_check_file_exists():
2728

2829
# add fake file to dir
2930
os.mkdir(f"{tmpdirname}/test_dir")
30-
path_and_filename_2 = os.path.join(local_path, file2)
31+
_ = os.path.join(local_path, file2)
3132
with open(os.path.join(local_path, file2), "w"):
3233
pass
3334

3435
# run function
3536
check_path_exists(path=f"{tmpdirname}/test_dir")
3637

3738

38-
def test_make_folder():
39+
def test_makedirs(): # noqa: D103
3940

4041
folder_1 = "test_dir_1"
4142
folder_2 = "test_dir_2"
@@ -48,7 +49,7 @@ def test_make_folder():
4849
folder_2 = os.path.join(local_path, folder_2)
4950

5051
# use the make folder function
51-
make_folder(folder_1)
52+
makedirs(folder_1)
5253
check_path_exists(path=folder_1)
5354

5455
# make a folder
@@ -58,7 +59,7 @@ def test_make_folder():
5859
check_path_exists(path=folder_2)
5960

6061

61-
def test_delete_local_files():
62+
def test_delete_local_files(): # noqa: D103
6263

6364
file1 = "test_file1.txt"
6465
folder1 = "test_dir"
@@ -88,7 +89,7 @@ def test_delete_local_files():
8889
assert os.path.exists(path_and_folder_1)
8990

9091

91-
def test_delete_local_files_and_folder():
92+
def test_delete_local_files_and_folder(): # noqa: D103
9293

9394
file1 = "test_file1.txt"
9495
folder1 = "test_dir"
@@ -118,7 +119,7 @@ def test_delete_local_files_and_folder():
118119
assert not os.path.exists(path_and_folder_1)
119120

120121

121-
def test_download():
122+
def test_download(): # noqa: D103
122123

123124
file1 = "test_file1.txt"
124125
file2 = "test_dir/test_file2.txt"
@@ -134,7 +135,7 @@ def test_download():
134135

135136
# add fake file to dir
136137
os.mkdir(f"{tmpdirname}/test_dir")
137-
path_and_filename_2 = os.path.join(local_path, file2)
138+
_ = os.path.join(local_path, file2)
138139
with open(os.path.join(local_path, file2), "w"):
139140
pass
140141

@@ -147,7 +148,7 @@ def test_download():
147148
assert len(filenames) == 3
148149

149150

150-
def test_upload():
151+
def test_upload(): # noqa: D103
151152

152153
file1 = "test_file1.txt"
153154
file2 = "test_dir/test_file2.txt"
@@ -163,7 +164,7 @@ def test_upload():
163164

164165
# add fake file to dir
165166
os.mkdir(f"{tmpdirname}/test_dir")
166-
path_and_filename_2 = os.path.join(local_path, file2)
167+
_ = os.path.join(local_path, file2)
167168
with open(os.path.join(local_path, file2), "w"):
168169
pass
169170

0 commit comments

Comments
 (0)