@@ -50,6 +50,8 @@ def run_wrenformer(
     checkpoint: Literal["local", "wandb"] | None = None,
     swa_start=0.7,
     run_params: dict[str, Any] = None,
+    optimizer: str | tuple[str, dict] = "AdamW",
+    scheduler: str | tuple[str, dict] = "LambdaLR",
     learning_rate: float = 3e-4,
     batch_size: int = 128,
     warmup_steps: int = 10,
@@ -82,9 +84,16 @@ def run_wrenformer(
             ```
         swa_start (float | None): When to start using stochastic weight averaging during training.
             Should be a float between 0 and 1. 0.7 means start SWA after 70% of epochs. Set to
-            None to disable SWA. Defaults to 0.7.
+            None to disable SWA. Defaults to 0.7. Proposed in https://arxiv.org/abs/1803.05407.
         run_params (dict[str, Any]): Additional parameters to merge into the run's dict of
             hyperparams. Will be logged to wandb. Can be anything really. Defaults to {}.
+        optimizer (str | tuple[str, dict]): Name of a torch.optim.Optimizer class like 'Adam',
+            'AdamW', 'SGD', etc. Can be a string or a string and dict with params to pass to the
+            class. Defaults to 'AdamW'.
+        scheduler (str | tuple[str, dict]): Name of a torch.optim.lr_scheduler class like
+            'LambdaLR', 'StepLR', 'CosineAnnealingLR', etc. Defaults to 'LambdaLR'. Can be a string
+            or a string and dict with params to pass to the class. E.g.
+            ('CosineAnnealingLR', {'T_max': n_epochs}).
         learning_rate (float): The optimizer's learning rate. Defaults to 3e-4.
         batch_size (int): The mini-batch size during training. Defaults to 128.
         warmup_steps (int): How many warmup steps the scheduler should do. Defaults to 10.
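
As a hedged illustration of the new arguments documented above (the specific classes and
keyword values here are examples, not part of the commit), accepted values look like:

# Names must match classes in torch.optim / torch.optim.lr_scheduler.
optimizer = "AdamW"                                # bare name, default class params
optimizer = ("SGD", {"momentum": 0.9})             # (name, extra kwargs for the class)
scheduler = ("CosineAnnealingLR", {"T_max": 100})  # T_max would typically be n_epochs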
@@ -181,31 +190,53 @@ def run_wrenformer(
         embedding_aggregations=embedding_aggregations,
     )
     model.to(device)
-    optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
+    if isinstance(optimizer, str):
+        optimizer_name, optimizer_params = optimizer, None
+    elif isinstance(optimizer, (tuple, list)):
+        optimizer_name, optimizer_params = optimizer
+    else:
+        raise ValueError(f"Unknown {optimizer=}")
+    optimizer_cls = getattr(torch.optim, optimizer_name)
+    optimizer_instance = optimizer_cls(
+        params=model.parameters(), lr=learning_rate, **(optimizer_params or {})
+    )

     # This lambda goes up linearly until warmup_steps, then follows a power law decay.
     # Acts as a prefactor to the learning rate, i.e. actual_lr = lr_lambda(epoch) *
     # learning_rate.
-    scheduler = torch.optim.lr_scheduler.LambdaLR(
-        optimizer,
-        lambda epoch: min((epoch + 1) ** (-0.5), (epoch + 1) * warmup_steps ** (-1.5)),
-    )
+    if scheduler == "LambdaLR":
+        scheduler_name, scheduler_params = "LambdaLR", {
+            "lr_lambda": lambda epoch: min(
+                (epoch + 1) ** (-0.5), (epoch + 1) * warmup_steps ** (-1.5)
+            )
+        }
+    elif isinstance(scheduler, str):
+        scheduler_name, scheduler_params = scheduler, None
+    elif isinstance(scheduler, (tuple, list)):
+        scheduler_name, scheduler_params = scheduler
+    else:
+        raise ValueError(f"Unknown {scheduler=}")
+    scheduler_cls = getattr(torch.optim.lr_scheduler, scheduler_name)
+    scheduler_instance = scheduler_cls(optimizer_instance, **(scheduler_params or {}))

     if swa_start is not None:
         swa_model = AveragedModel(model)
-        # scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
-        swa_scheduler = SWALR(optimizer, swa_lr=0.01)
+        swa_scheduler_instance = SWALR(optimizer_instance, swa_lr=0.01)

     run_params = {
         "epochs": epochs,
+        "optimizer": optimizer_name,
+        "optimizer_params": optimizer_params,
         "learning_rate": learning_rate,
+        "lr_scheduler": scheduler_name,
+        "scheduler_params": scheduler_params,
         "batch_size": batch_size,
         "n_attn_layers": n_attn_layers,
         "target": target_col,
         "warmup_steps": warmup_steps,
         "robust": robust,
         "embedding_len": embedding_len,
-        "losses": str(loss_dict),
+        "losses": loss_dict,
         "training_samples": len(train_df),
         "test_samples": len(test_df),
         "trainable_params": model.num_params,
@@ -232,7 +263,7 @@ def run_wrenformer(
         train_metrics = model.evaluate(
             train_loader,
             loss_dict,
-            optimizer,
+            optimizer_instance,
             normalizer_dict,
             action="train",
             verbose=verbose,
@@ -250,10 +281,10 @@ def run_wrenformer(

         if swa_start is not None and epoch > swa_start * epochs:
             swa_model.update_parameters(model)
-            swa_scheduler.step()
+            swa_scheduler_instance.step()
         else:
-            scheduler.step()
-            scheduler.step()
+            scheduler_instance.step()
+
         model.epoch += 1

         if wandb_project:
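
The SWA branch in this hunk can be illustrated standalone as a rough sketch; the dummy
model, the single fake training step, and the epoch count are assumptions, while the
swa_start threshold and swa_lr mirror the values in the diff.

import torch
from torch.optim.swa_utils import SWALR, AveragedModel

model = torch.nn.Linear(8, 1)  # stand-in for the Wrenformer model
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda e: 1.0)

epochs, swa_start = 10, 0.7  # SWA takes over after 70% of epochs
swa_model = AveragedModel(model)
swa_scheduler = SWALR(optimizer, swa_lr=0.01)

for epoch in range(epochs):
    # one (dummy) optimization step standing in for a full training epoch
    optimizer.zero_grad()
    model(torch.randn(4, 8)).mean().backward()
    optimizer.step()

    if swa_start is not None and epoch > swa_start * epochs:
        # accumulate averaged weights and let the SWA scheduler drive the learning rate
        swa_model.update_parameters(model)
        swa_scheduler.step()
    else:
        scheduler.step()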
@@ -293,9 +324,9 @@ def run_wrenformer(
     if checkpoint is not None:
         state_dict = {
             "model_state": inference_model.state_dict(),
-            "optimizer_state": optimizer.state_dict(),
+            "optimizer_state": optimizer_instance.state_dict(),
             "scheduler_state": (
-                scheduler if swa_start is None else swa_scheduler
+                scheduler_instance if swa_start is None else swa_scheduler_instance
             ).state_dict(),
             "loss_dict": loss_dict,
             "epoch": epochs,