11from __future__ import annotations
22
3+ from typing import Callable , Sequence
4+
35import torch
46import torch .nn as nn
57import torch .nn .functional as F
@@ -35,6 +37,7 @@ def __init__(
3537 trunk_hidden : list [int ] = [1024 , 512 ],
3638 out_hidden : list [int ] = [256 , 128 , 64 ],
3739 robust : bool = False ,
40+ embedding_aggregations : Sequence [str ] = ("mean" ,),
3841 ** kwargs ,
3942 ) -> None :
4043 """Initialize the Wrenformer model.
@@ -57,6 +60,9 @@ def __init__(
5760 target will be an estimate for the aleatoric uncertainty (uncertainty inherent to
5861 the sample) which can be used with a robust loss function to attenuate the weighting
5962 of uncertain samples.
63+ embedding_aggregations (list[str]): Aggregations to apply to the learned embedding
64+ returned by the transformer encoder before passing into the ResidualNetwork. One or
65+ more of ['mean', 'std', 'sum', 'min', 'max']. Defaults to ['mean'].
6066 """
6167 super ().__init__ (robust = robust , ** kwargs )
6268
@@ -73,9 +79,10 @@ def __init__(
7379 if self .robust :
7480 n_targets = [2 * n for n in n_targets ]
7581
76- n_aggregators = 2 # number of embedding aggregation functions
82+ self . embedding_aggregations = embedding_aggregations
7783 self .trunk_nn = ResidualNetwork (
78- input_dim = n_aggregators * d_model ,
84+ # len(embedding_aggregations) = number of catted tensors in aggregated_embeddings below
85+ input_dim = len (embedding_aggregations ) * d_model ,
7986 output_dim = out_hidden [0 ],
8087 hidden_layer_dims = trunk_hidden ,
8188 )
@@ -123,18 +130,25 @@ def forward( # type: ignore
123130 # into a single vector Wyckoff embedding
124131 # careful to ignore padded values when taking the mean
125132 inv_mask : torch .BoolTensor = ~ mask [..., None ]
126- # sum_agg = (embeddings * inv_mask).sum(dim=1)
127-
128- # # replace padded values with +/-inf to exclude them from min/max
129- # min_agg, _ = torch.where(inv_mask, embeddings, float("inf")).min(dim=1)
130- # max_agg, _ = torch.where(inv_mask, embeddings, float("-inf")).max(dim=1)
131- mean_agg = masked_mean (embeddings , inv_mask , dim = 1 )
132- std_agg = masked_std (embeddings , inv_mask , dim = 1 )
133133
134- # Sum+Std+Min+Max+Mean: we call this S2M3 aggregation
135- aggregated_embeddings = torch .cat ([mean_agg , std_agg ], dim = 1 )
134+ aggregation_funcs = [aggregators [key ] for key in self .embedding_aggregations ]
135+ aggregated_embeddings = torch .cat (
136+ [func (embeddings , inv_mask , 1 ) for func in aggregation_funcs ], dim = 1
137+ )
136138
137139 # main body of the feed-forward NN jointly used by all multitask objectives
138140 predictions = F .relu (self .trunk_nn (aggregated_embeddings ))
139141
140142 return tuple (output_nn (predictions ) for output_nn in self .output_nns )
143+
144+
# Using all five together is what we call S2M3 aggregation
# (Sum + Std + Min + Max + Mean).
def _masked_sum(x: Tensor, mask: BoolTensor, dim: int) -> Tensor:
    """Sum x over dim, zeroing out positions where mask is False (padding)."""
    return (x * mask).sum(dim=dim)


def _masked_min(x: Tensor, mask: BoolTensor, dim: int) -> Tensor:
    """Min of x over dim; padded positions become +inf so they never win."""
    filled = torch.where(mask, x, float("inf"))
    # min(dim=...) returns (values, indices); keep only the values
    return filled.min(dim=dim)[0]


def _masked_max(x: Tensor, mask: BoolTensor, dim: int) -> Tensor:
    """Max of x over dim; padded positions become -inf so they never win."""
    filled = torch.where(mask, x, float("-inf"))
    return filled.max(dim=dim)[0]


# Maps aggregation name -> callable(embeddings, inverse_padding_mask, dim).
aggregators: dict[str, Callable[[Tensor, BoolTensor, int], Tensor]] = {
    "mean": masked_mean,
    "sum": _masked_sum,
    "std": masked_std,
    "min": _masked_min,
    "max": _masked_max,
}
0 commit comments