
Commit 4e7a49e

updating training script in basic_interactive notebook
1 parent 2a43d9e commit 4e7a49e

5 files changed, +186 -162 lines


demo-notebooks/guided-demos/2_basic_interactive.ipynb (+62 -53)
@@ -68,6 +68,7 @@
 "    min_memory=8,\n",
 "    max_memory=8,\n",
 "    num_gpus=1,\n",
+"    head_gpus=1,\n",
 "    image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
 "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
 "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -147,19 +148,17 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"#before proceeding make sure the cluster exists and the uri is not empty\n",
+"# before proceeding make sure the cluster exists and the uri is not empty\n",
 "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
 "\n",
 "import ray\n",
-"from ray.air.config import ScalingConfig\n",
 "\n",
 "# reset the ray context in case there's already one. \n",
 "ray.shutdown()\n",
 "# establish connection to ray cluster\n",
 "\n",
-"#install additional libraries that will be required for model training\n",
-"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n",
-"\n",
+"# install additional libraries that will be required for model training\n",
+"runtime_env = {\"pip\": [\"pytorch_lightning==1.5.10\", \"ray_lightning\", \"torchmetrics==0.9.1\", \"torchvision==0.12.0\"]}\n",
 "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n",
 "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n",
 "ray.init(address=ray_cluster_uri, runtime_env=runtime_env)\n",
@@ -172,7 +171,7 @@
 "id": "9711030b",
 "metadata": {},
 "source": [
-"Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):"
+"Now that we are connected (and have passed in some package requirements), let's try writing some training code:"
 ]
 },
 {
@@ -184,66 +183,76 @@
 "source": [
 "@ray.remote\n",
 "def train_fn():\n",
-"    from datasets import load_dataset\n",
-"    import transformers\n",
-"    from transformers import AutoTokenizer, TrainingArguments\n",
-"    from transformers import AutoModelForSequenceClassification\n",
-"    import numpy as np\n",
-"    from datasets import load_metric\n",
+"    import torch\n",
+"    import torch.nn as nn\n",
 "    import ray\n",
-"    from ray import tune\n",
-"    from ray.train.huggingface import HuggingFaceTrainer\n",
+"    from torch.utils.data import DataLoader\n",
+"    from torchvision import datasets\n",
+"    from torchvision.transforms import ToTensor\n",
+"    from ray.train.torch import TorchTrainer\n",
+"    from ray.train import ScalingConfig\n",
 "\n",
-"    dataset = load_dataset(\"imdb\")\n",
-"    tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
+"    def get_dataset():\n",
+"        return datasets.FashionMNIST(\n",
+"            root=\"/tmp/data\",\n",
+"            train=True,\n",
+"            download=True,\n",
+"            transform=ToTensor(),\n",
+"        )\n",
 "\n",
-"    def tokenize_function(examples):\n",
-"        return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
+"    class NeuralNetwork(nn.Module):\n",
+"        def __init__(self):\n",
+"            super().__init__()\n",
+"            self.flatten = nn.Flatten()\n",
+"            self.linear_relu_stack = nn.Sequential(\n",
+"                nn.Linear(28 * 28, 512),\n",
+"                nn.ReLU(),\n",
+"                nn.Linear(512, 512),\n",
+"                nn.ReLU(),\n",
+"                nn.Linear(512, 10),\n",
+"            )\n",
 "\n",
-"    tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
+"        def forward(self, inputs):\n",
+"            inputs = self.flatten(inputs)\n",
+"            logits = self.linear_relu_stack(inputs)\n",
+"            return logits\n",
 "\n",
-"    #using a fraction of dataset but you can run with the full dataset\n",
-"    small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n",
-"    small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n",
+"    def train_func_distributed():\n",
+"        num_epochs = 3\n",
+"        batch_size = 64\n",
 "\n",
-"    print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n",
+"        dataset = get_dataset()\n",
+"        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\n",
+"        dataloader = ray.train.torch.prepare_data_loader(dataloader)\n",
 "\n",
-"    ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n",
-"    ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n",
+"        model = NeuralNetwork()\n",
+"        model = ray.train.torch.prepare_model(model)\n",
 "\n",
-"    def compute_metrics(eval_pred):\n",
-"        metric = load_metric(\"accuracy\")\n",
-"        logits, labels = eval_pred\n",
-"        predictions = np.argmax(logits, axis=-1)\n",
-"        return metric.compute(predictions=predictions, references=labels)\n",
+"        criterion = nn.CrossEntropyLoss()\n",
+"        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
 "\n",
-"    def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n",
-"        model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n",
+"        for epoch in range(num_epochs):\n",
+"            if ray.train.get_context().get_world_size() > 1:\n",
+"                dataloader.sampler.set_epoch(epoch)\n",
 "\n",
-"        training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n",
-"            num_train_epochs=1, skip_memory_metrics=True,\n",
-"            learning_rate=2e-5,\n",
-"            per_device_train_batch_size=16,\n",
-"            per_device_eval_batch_size=16, \n",
-"            weight_decay=0.01,)\n",
-"        return transformers.Trainer(\n",
-"            model=model,\n",
-"            args=training_args,\n",
-"            train_dataset=train_dataset,\n",
-"            eval_dataset=eval_dataset,\n",
-"            compute_metrics=compute_metrics\n",
-"        )\n",
+"            for inputs, labels in dataloader:\n",
+"                optimizer.zero_grad()\n",
+"                pred = model(inputs)\n",
+"                loss = criterion(pred, labels)\n",
+"                loss.backward()\n",
+"                optimizer.step()\n",
+"            print(f\"epoch: {epoch}, loss: {loss.item()}\")\n",
 "\n",
-"    scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n",
+"    # For GPU Training, set `use_gpu` to True.\n",
+"    use_gpu = True\n",
 "\n",
-"    # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n",
-"    # the ray native HFTrainer has built in support for scaling to multiple GPUs\n",
-"    trainer = HuggingFaceTrainer(\n",
-"        trainer_init_per_worker=trainer_init_per_worker,\n",
-"        scaling_config=scaling_config,\n",
-"        datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n",
+"    trainer = TorchTrainer(\n",
+"        train_func_distributed,\n",
+"        scaling_config=ScalingConfig(\n",
+"            num_workers=3, use_gpu=use_gpu\n",
+"        ), # num_workers = number of worker nodes with the ray head node included\n",
 "    )\n",
-"    result = trainer.fit()"
+"    trainer.fit()"
 ]
 },
 {
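Taken together, the new cell swaps the HuggingFace/IMDB example for a FashionMNIST PyTorch model trained through Ray Train's TorchTrainer, with the imports, dataset, model, and training loop all defined inside the @ray.remote function so they resolve against the cluster's runtime_env. The trailing comment notes that num_workers=3 counts the head node among the training workers, which is presumably why head_gpus=1 was added to the cluster configuration above. Launching it from the notebook follows the usual Ray pattern; the follow-up cell is not part of this hunk, so treat the snippet below as an assumed sketch:

# Assumed follow-up cell: submit the remote training task and block until it finishes.
ref = train_fn.remote()
ray.get(ref)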

demo-notebooks/guided-demos/mnist_fashion.py (-9)
@@ -35,15 +35,6 @@ def forward(self, inputs):
         return logits
 
 
-def get_dataset():
-    return datasets.FashionMNIST(
-        root="/tmp/data",
-        train=True,
-        download=True,
-        transform=ToTensor(),
-    )
-
-
 def train_func_distributed():
     num_epochs = 3
     batch_size = 64

demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb (+60 -50)
@@ -76,6 +76,7 @@
 "    min_memory=8,\n",
 "    max_memory=8,\n",
 "    num_gpus=1,\n",
+"    head_gpus=1,\n",
 "    image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
 "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
 "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -243,14 +244,13 @@
 "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
 "\n",
 "import ray\n",
-"from ray.air.config import ScalingConfig\n",
 "\n",
 "# reset the ray context in case there's already one. \n",
 "ray.shutdown()\n",
 "# establish connection to ray cluster\n",
 "\n",
 "#install additional libraries that will be required for model training\n",
-"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n",
+"runtime_env = {\"pip\": [\"pytorch_lightning==1.5.10\", \"ray_lightning\", \"torchmetrics==0.9.1\", \"torchvision==0.12.0\"]}\n",
 "\n",
 "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n",
 "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n",
@@ -264,7 +264,7 @@
 "id": "9711030b",
 "metadata": {},
 "source": [
-"Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):"
+"Now that we are connected (and have passed in some package requirements), let's try writing some training code:"
 ]
 },
 {
@@ -276,66 +276,76 @@
 "source": [
 "@ray.remote\n",
 "def train_fn():\n",
-"    from datasets import load_dataset\n",
-"    import transformers\n",
-"    from transformers import AutoTokenizer, TrainingArguments\n",
-"    from transformers import AutoModelForSequenceClassification\n",
-"    import numpy as np\n",
-"    from datasets import load_metric\n",
+"    import torch\n",
+"    import torch.nn as nn\n",
 "    import ray\n",
-"    from ray import tune\n",
-"    from ray.train.huggingface import HuggingFaceTrainer\n",
+"    from torch.utils.data import DataLoader\n",
+"    from torchvision import datasets\n",
+"    from torchvision.transforms import ToTensor\n",
+"    from ray.train.torch import TorchTrainer\n",
+"    from ray.train import ScalingConfig\n",
 "\n",
-"    dataset = load_dataset(\"imdb\")\n",
-"    tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
+"    def get_dataset():\n",
+"        return datasets.FashionMNIST(\n",
+"            root=\"/tmp/data\",\n",
+"            train=True,\n",
+"            download=True,\n",
+"            transform=ToTensor(),\n",
+"        )\n",
 "\n",
-"    def tokenize_function(examples):\n",
-"        return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
+"    class NeuralNetwork(nn.Module):\n",
+"        def __init__(self):\n",
+"            super().__init__()\n",
+"            self.flatten = nn.Flatten()\n",
+"            self.linear_relu_stack = nn.Sequential(\n",
+"                nn.Linear(28 * 28, 512),\n",
+"                nn.ReLU(),\n",
+"                nn.Linear(512, 512),\n",
+"                nn.ReLU(),\n",
+"                nn.Linear(512, 10),\n",
+"            )\n",
 "\n",
-"    tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
+"        def forward(self, inputs):\n",
+"            inputs = self.flatten(inputs)\n",
+"            logits = self.linear_relu_stack(inputs)\n",
+"            return logits\n",
 "\n",
-"    #using a fraction of dataset but you can run with the full dataset\n",
-"    small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n",
-"    small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n",
+"    def train_func_distributed():\n",
+"        num_epochs = 3\n",
+"        batch_size = 64\n",
 "\n",
-"    print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n",
+"        dataset = get_dataset()\n",
+"        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\n",
+"        dataloader = ray.train.torch.prepare_data_loader(dataloader)\n",
 "\n",
-"    ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n",
-"    ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n",
+"        model = NeuralNetwork()\n",
+"        model = ray.train.torch.prepare_model(model)\n",
 "\n",
-"    def compute_metrics(eval_pred):\n",
-"        metric = load_metric(\"accuracy\")\n",
-"        logits, labels = eval_pred\n",
-"        predictions = np.argmax(logits, axis=-1)\n",
-"        return metric.compute(predictions=predictions, references=labels)\n",
+"        criterion = nn.CrossEntropyLoss()\n",
+"        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
 "\n",
-"    def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n",
-"        model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n",
+"        for epoch in range(num_epochs):\n",
+"            if ray.train.get_context().get_world_size() > 1:\n",
+"                dataloader.sampler.set_epoch(epoch)\n",
 "\n",
-"        training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n",
-"            num_train_epochs=1, skip_memory_metrics=True,\n",
-"            learning_rate=2e-5,\n",
-"            per_device_train_batch_size=16,\n",
-"            per_device_eval_batch_size=16, \n",
-"            weight_decay=0.01,)\n",
-"        return transformers.Trainer(\n",
-"            model=model,\n",
-"            args=training_args,\n",
-"            train_dataset=train_dataset,\n",
-"            eval_dataset=eval_dataset,\n",
-"            compute_metrics=compute_metrics\n",
-"        )\n",
+"            for inputs, labels in dataloader:\n",
+"                optimizer.zero_grad()\n",
+"                pred = model(inputs)\n",
+"                loss = criterion(pred, labels)\n",
+"                loss.backward()\n",
+"                optimizer.step()\n",
+"            print(f\"epoch: {epoch}, loss: {loss.item()}\")\n",
 "\n",
-"    scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n",
+"    # For GPU Training, set `use_gpu` to True.\n",
+"    use_gpu = True\n",
 "\n",
-"    # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n",
-"    # the ray native HFTrainer has built in support for scaling to multiple GPUs\n",
-"    trainer = HuggingFaceTrainer(\n",
-"        trainer_init_per_worker=trainer_init_per_worker,\n",
-"        scaling_config=scaling_config,\n",
-"        datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n",
+"    trainer = TorchTrainer(\n",
+"        train_func_distributed,\n",
+"        scaling_config=ScalingConfig(\n",
+"            num_workers=3, use_gpu=use_gpu\n",
+"        ), # num_workers = number of worker nodes with the ray head node included\n",
 "    )\n",
-"    result = trainer.fit()"
+"    trainer.fit()"
 ]
 },
 {
