|
68 | 68 | " min_memory=8,\n",
|
69 | 69 | " max_memory=8,\n",
|
70 | 70 | " num_gpus=1,\n",
|
| 71 | + " head_gpus=1,\n", |
71 | 72 | " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
|
72 | 73 | " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
|
73 | 74 | " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
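
For context (not part of this diff): the new `head_gpus` row above slots into the notebook's CodeFlare SDK `ClusterConfiguration` cell. A minimal sketch of that cell with the change applied is below; the cluster name, namespace, worker count, and CPU values are illustrative assumptions, and only the fields visible in the hunk above come from the diff itself.

```python
# Sketch only -- fields not shown in the hunk (name, namespace, num_workers, CPUs)
# are assumed placeholders, not values from this PR.
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration

cluster = Cluster(ClusterConfiguration(
    name="raytest",          # assumed placeholder
    namespace="default",     # assumed placeholder
    num_workers=2,           # assumed placeholder
    min_cpus=2,              # assumed placeholder
    max_cpus=2,              # assumed placeholder
    min_memory=8,
    max_memory=8,
    num_gpus=1,              # GPUs per worker node
    head_gpus=1,             # the parameter this diff adds: GPUs for the Ray head node
    image="quay.io/project-codeflare/ray:latest-py39-cu118",
    write_to_file=False,
))
cluster.up()
cluster.wait_ready()
```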
|
|
147 | 148 | "metadata": {},
|
148 | 149 | "outputs": [],
|
149 | 150 | "source": [
|
150 | | - "#before proceeding make sure the cluster exists and the uri is not empty\n", |
| 151 | + "# before proceeding make sure the cluster exists and the uri is not empty\n", |
151 | 152 | "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
|
152 | 153 | "\n",
|
153 | 154 | "import ray\n",
|
154 | | - "from ray.air.config import ScalingConfig\n", |
155 | 155 | "\n",
|
156 | 156 | "# reset the ray context in case there's already one. \n",
|
157 | 157 | "ray.shutdown()\n",
|
158 | 158 | "# establish connection to ray cluster\n",
|
159 | 159 | "\n",
|
160 | | - "#install additional libraries that will be required for model training\n", |
161 | | - "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n", |
162 | | - "\n", |
| 160 | + "# install additional libraries that will be required for model training\n", |
| 161 | + "runtime_env = {\"pip\": [\"pytorch_lightning==1.5.10\", \"ray_lightning\", \"torchmetrics==0.9.1\", \"torchvision==0.12.0\"]}\n", |
163 | 162 | "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n",
|
164 | 163 | "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n",
|
165 | 164 | "ray.init(address=ray_cluster_uri, runtime_env=runtime_env)\n",
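
Once the `ray.init(...)` call above has connected, a quick sanity check (not part of this diff) is to ask Ray what resources it can see; with `head_gpus=1` plus a GPU worker, the reported GPU count should reflect both.

```python
# Sketch only, not part of this diff: assumes ray.init(...) above has already connected.
print(ray.cluster_resources())    # total CPU/GPU/memory Ray can schedule on
print(ray.available_resources())  # resources currently free
```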
|
|
172 | 171 | "id": "9711030b",
|
173 | 172 | "metadata": {},
|
174 | 173 | "source": [
|
175 | | - "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):" |
| 174 | + "Now that we are connected (and have passed in some package requirements), let's try writing some training code:" |
176 | 175 | ]
|
177 | 176 | },
|
178 | 177 | {
|
|
184 | 183 | "source": [
|
185 | 184 | "@ray.remote\n",
|
186 | 185 | "def train_fn():\n",
|
187 | | - "    from datasets import load_dataset\n", |
188 | | - "    import transformers\n", |
189 | | - "    from transformers import AutoTokenizer, TrainingArguments\n", |
190 | | - "    from transformers import AutoModelForSequenceClassification\n", |
191 | | - "    import numpy as np\n", |
192 | | - "    from datasets import load_metric\n", |
| 186 | + " import torch\n", |
| 187 | + " import torch.nn as nn\n", |
193 | 188 | " import ray\n",
|
194 | | - "    from ray import tune\n", |
195 | | - "    from ray.train.huggingface import HuggingFaceTrainer\n", |
| 189 | + " from torch.utils.data import DataLoader\n", |
| 190 | + " from torchvision import datasets\n", |
| 191 | + " from torchvision.transforms import ToTensor\n", |
| 192 | + " from ray.train.torch import TorchTrainer\n", |
| 193 | + " from ray.train import ScalingConfig\n", |
196 | 194 | "\n",
|
197 | | - "    dataset = load_dataset(\"imdb\")\n", |
198 | | - "    tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", |
| 195 | + " def get_dataset():\n", |
| 196 | + " return datasets.FashionMNIST(\n", |
| 197 | + " root=\"/tmp/data\",\n", |
| 198 | + " train=True,\n", |
| 199 | + " download=True,\n", |
| 200 | + " transform=ToTensor(),\n", |
| 201 | + " )\n", |
199 | 202 | "\n",
|
200 | | - "    def tokenize_function(examples):\n", |
201 | | - "        return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", |
| 203 | + " class NeuralNetwork(nn.Module):\n", |
| 204 | + " def __init__(self):\n", |
| 205 | + " super().__init__()\n", |
| 206 | + " self.flatten = nn.Flatten()\n", |
| 207 | + " self.linear_relu_stack = nn.Sequential(\n", |
| 208 | + " nn.Linear(28 * 28, 512),\n", |
| 209 | + " nn.ReLU(),\n", |
| 210 | + " nn.Linear(512, 512),\n", |
| 211 | + " nn.ReLU(),\n", |
| 212 | + " nn.Linear(512, 10),\n", |
| 213 | + " )\n", |
202 | 214 | "\n",
|
203 | | - "    tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", |
| 215 | + " def forward(self, inputs):\n", |
| 216 | + " inputs = self.flatten(inputs)\n", |
| 217 | + " logits = self.linear_relu_stack(inputs)\n", |
| 218 | + " return logits\n", |
204 | 219 | "\n",
|
205 | | - "    #using a fraction of dataset but you can run with the full dataset\n", |
206 | | - "    small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n", |
207 | | - "    small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n", |
| 220 | + " def train_func_distributed():\n", |
| 221 | + " num_epochs = 3\n", |
| 222 | + " batch_size = 64\n", |
208 | 223 | "\n",
|
209 | | - "    print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n", |
| 224 | + " dataset = get_dataset()\n", |
| 225 | + " dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\n", |
| 226 | + " dataloader = ray.train.torch.prepare_data_loader(dataloader)\n", |
210 | 227 | "\n",
|
211 | | - "    ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n", |
212 | | - "    ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n", |
| 228 | + " model = NeuralNetwork()\n", |
| 229 | + " model = ray.train.torch.prepare_model(model)\n", |
213 | 230 | "\n",
|
214 | | - "    def compute_metrics(eval_pred):\n", |
215 | | - "        metric = load_metric(\"accuracy\")\n", |
216 | | - "        logits, labels = eval_pred\n", |
217 | | - "        predictions = np.argmax(logits, axis=-1)\n", |
218 | | - "        return metric.compute(predictions=predictions, references=labels)\n", |
| 231 | + " criterion = nn.CrossEntropyLoss()\n", |
| 232 | + " optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n", |
219 | 233 | "\n",
|
220 | | - "    def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", |
221 | | - "        model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n", |
| 234 | + " for epoch in range(num_epochs):\n", |
| 235 | + " if ray.train.get_context().get_world_size() > 1:\n", |
| 236 | + " dataloader.sampler.set_epoch(epoch)\n", |
222 | 237 | "\n",
|
223 | | - "        training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n", |
224 | | - "                                        num_train_epochs=1, skip_memory_metrics=True,\n", |
225 | | - "                                        learning_rate=2e-5,\n", |
226 | | - "                                        per_device_train_batch_size=16,\n", |
227 | | - "                                        per_device_eval_batch_size=16, \n", |
228 | | - "                                        weight_decay=0.01,)\n", |
229 | | - "        return transformers.Trainer(\n", |
230 | | - "            model=model,\n", |
231 | | - "            args=training_args,\n", |
232 | | - "            train_dataset=train_dataset,\n", |
233 | | - "            eval_dataset=eval_dataset,\n", |
234 | | - "            compute_metrics=compute_metrics\n", |
235 | | - "        )\n", |
| 238 | + " for inputs, labels in dataloader:\n", |
| 239 | + " optimizer.zero_grad()\n", |
| 240 | + " pred = model(inputs)\n", |
| 241 | + " loss = criterion(pred, labels)\n", |
| 242 | + " loss.backward()\n", |
| 243 | + " optimizer.step()\n", |
| 244 | + " print(f\"epoch: {epoch}, loss: {loss.item()}\")\n", |
236 | 245 | "\n",
|
237 | | - "    scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n", |
| 246 | + " # For GPU Training, set `use_gpu` to True.\n", |
| 247 | + " use_gpu = True\n", |
238 | 248 | "\n",
|
239 | | - "    # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n", |
240 | | - "    # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", |
241 | | - "    trainer = HuggingFaceTrainer(\n", |
242 | | - "        trainer_init_per_worker=trainer_init_per_worker,\n", |
243 | | - "        scaling_config=scaling_config,\n", |
244 | | - "        datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", |
| 249 | + " trainer = TorchTrainer(\n", |
| 250 | + " train_func_distributed,\n", |
| 251 | + " scaling_config=ScalingConfig(\n", |
| 252 | + " num_workers=3, use_gpu=use_gpu\n", |
| 253 | + " ), # num_workers = number of worker nodes with the ray head node included\n", |
245 | 254 | " )\n",
|
246 | | - "    result = trainer.fit()" |
| 255 | + " trainer.fit()" |
247 | 256 | ]
|
248 | 257 | },
|
249 | 258 | {
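
For completeness (not shown in this hunk): in these demo notebooks the remote function defined above is typically launched in a follow-up cell and the Ray cluster is torn down once training finishes. A hedged sketch of those cells, assuming the `cluster` object from the earlier configuration cell is still in scope:

```python
# Sketch only -- typical follow-up cells, not part of this diff.
ray.get(train_fn.remote())  # run train_fn on the Ray cluster and block until TorchTrainer finishes

ray.shutdown()              # disconnect this notebook's Ray client session
cluster.down()              # assumes `cluster` is the codeflare-sdk Cluster created earlier
```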
|
|