selene_sdk/samplers/file_samplers/mat_file_sampler.py (2 changes: 1 addition & 1 deletion)
@@ -225,5 +225,5 @@ def get_data_and_targets(self, batch_size, n_samples=None):
sequences_and_targets.append((seqs, tgts))
targets_mat.append(tgts)
# TODO: should not assume targets are always integers
targets_mat = np.vstack(targets_mat).astype(int)
targets_mat = np.vstack(targets_mat).astype(float)
return sequences_and_targets, targets_mat
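For context, a standalone sketch (with hypothetical target values, not from the repository) of why this cast matters once targets are real-valued regression labels:

```python
import numpy as np

# Hypothetical real-valued targets, e.g. mean ribosomal load per batch.
targets_mat = [np.array([[4.73], [6.21]]), np.array([[5.02]])]

print(np.vstack(targets_mat).astype(int).ravel())    # [4 6 5] -- truncated
print(np.vstack(targets_mat).astype(float).ravel())  # [4.73 6.21 5.02]
```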
selene_sdk/train_model.py (39 changes: 24 additions & 15 deletions)
@@ -12,6 +12,8 @@
import torch.nn as nn
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

from .utils import initialize_logger
from .utils import load_model_from_state_dict
@@ -117,6 +119,11 @@ class TrainModel(object):
The directory to save model checkpoints and logs.
training_loss : list(float)
The current training loss.
metrics : dict
A dictionary that maps metric names (`str`) to metric functions.
By default, this contains `"roc_auc"`, which maps to
`sklearn.metrics.roc_auc_score`, and `"average_precision"`,
which maps to `sklearn.metrics.average_precision_score`.

"""

@@ -138,7 +145,9 @@ def __init__(self,
use_cuda=False,
data_parallel=False,
logging_verbosity=2,
checkpoint_resume=None):
checkpoint_resume=None,
metrics=dict(roc_auc=roc_auc_score,
average_precision=average_precision_score)):
"""
Constructs a new `TrainModel` object.
"""
@@ -181,13 +190,15 @@ def __init__(self,
self._create_validation_set(n_samples=n_validation_samples)
self._validation_metrics = PerformanceMetrics(
self.sampler.get_feature_from_index,
report_gt_feature_n_positives=report_gt_feature_n_positives)
report_gt_feature_n_positives=report_gt_feature_n_positives,
metrics=metrics)

if "test" in self.sampler.modes:
self._create_test_set(n_samples=n_test_samples)
self._test_metrics = PerformanceMetrics(
self.sampler.get_feature_from_index,
report_gt_feature_n_positives=report_gt_feature_n_positives)
report_gt_feature_n_positives=report_gt_feature_n_positives,
metrics=metrics)

self._start_step = 0
self._min_loss = float("inf") # TODO: Should this be set when it is used later? Would need to if we want to train model 2x in one run.
@@ -224,7 +235,8 @@ def __init__(self,
self._train_logger.info("loss")
# TODO: this makes the assumption that all models will report ROC AUC,
# which is not the case.
self._validation_logger.info("loss\troc_auc")
self._validation_logger.info("\t".join(["loss"] +
sorted([x for x in self._validation_metrics.metrics.keys()])))

def _create_validation_set(self, n_samples=None):
"""
@@ -315,17 +327,14 @@ def train_and_validate(self):
valid_scores = self.validate()
validation_loss = valid_scores["loss"]
self._train_logger.info(train_loss)
# TODO: check if "roc_auc" is a key in `valid_scores`?
if valid_scores["roc_auc"]:
validation_roc_auc = valid_scores["roc_auc"]
self._validation_logger.info(
"{0}\t{1}".format(validation_loss,
validation_roc_auc))
scheduler.step(
math.ceil(validation_roc_auc * 1000.0) / 1000.0)
else:
self._validation_logger.info("{0}\tNA".format(
validation_loss))
to_log = [str(validation_loss)]
for k in sorted(self._validation_metrics.metrics.keys()):
if k in valid_scores and valid_scores[k]:
to_log.append(str(valid_scores[k]))
else:
to_log.append("NA")
self._validation_logger.info("\t".join(to_log))
scheduler.step(math.ceil(validation_loss * 1000.0) / 1000.0)

is_best = validation_loss < min_loss
min_loss = min(validation_loss, min_loss)
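Two behaviors change here: missing or falsy metric scores are logged as `NA` so the columns always match the header, and the scheduler now steps on validation loss rather than ROC AUC, which works for any metrics dict. A self-contained sketch of the row construction, with hypothetical scores:

```python
valid_scores = {"loss": 0.412, "r2": 0.88, "roc_auc": None}  # hypothetical
metric_names = ["average_precision", "r2", "roc_auc"]        # sorted keys

to_log = [str(valid_scores["loss"])]
for k in sorted(metric_names):
    if k in valid_scores and valid_scores[k]:
        to_log.append(str(valid_scores[k]))
    else:
        to_log.append("NA")  # absent (average_precision) or None (roc_auc)
print("\t".join(to_log))  # 0.412	NA	0.88	NA
```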
selene_sdk/utils/performance_metrics.py (28 changes: 18 additions & 10 deletions)
@@ -11,6 +11,7 @@
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import r2_score


logger = logging.getLogger("selene")
@@ -201,11 +202,11 @@ def compute_score(prediction, target, metric_fn,
feature_scores = np.ones(target.shape[1]) * -1
for index, feature_preds in enumerate(prediction.T):
feature_targets = target[:, index]
if len(np.unique(feature_targets)) > 1 and \
np.sum(feature_targets) > report_gt_feature_n_positives:
if len(np.unique(feature_targets)) > 0 and \
np.count_nonzero(feature_targets) > report_gt_feature_n_positives:
feature_scores[index] = metric_fn(
feature_targets, feature_preds)
valid_feature_scores = [s for s in feature_scores if s >= 0]
valid_feature_scores = [s for s in feature_scores if not np.isnan(s)] # Allow 0 or negative values.
if not valid_feature_scores:
return None, feature_scores
average_score = np.average(valid_feature_scores)
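Both relaxations matter for regression targets: `np.sum` can misstate how many examples carry a label once values are real-valued (negative values cancel), and an `s >= 0` filter would discard legitimately negative scores. A quick illustration:

```python
import numpy as np
from sklearn.metrics import r2_score

feature_targets = np.array([-1.2, 0.0, 1.2])
print(np.sum(feature_targets))            # 0.0 -- values cancel out
print(np.count_nonzero(feature_targets))  # 2  -- actual labeled examples

# r2_score can be zero or negative (worse than predicting the mean), so a
# valid score must not be filtered out by sign:
print(r2_score([1.0, 2.0, 3.0], [3.0, 2.0, 1.0]))  # -3.0
```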
@@ -237,7 +238,7 @@ def get_feature_specific_scores(data, get_feature_from_index_fn):
feature_score_dict = {}
for index, score in enumerate(data):
feature = get_feature_from_index_fn(index)
if score >= 0:
if not np.isnan(score):
feature_score_dict[feature] = score
else:
feature_score_dict[feature] = None
@@ -257,6 +258,13 @@ class PerformanceMetrics(object):
report_gt_feature_n_positives : int, optional
Default is 10. The minimum number of positive examples for a
feature in order to compute the score for it.
metrics : dict
A dictionary that maps metric names (`str`) to metric functions.
By default, this contains `"roc_auc"`, which maps to
`sklearn.metrics.roc_auc_score`, and `"average_precision"`,
which maps to `sklearn.metrics.average_precision_score`.



Attributes
----------
@@ -276,16 +284,16 @@ class PerformanceMetrics(object):

def __init__(self,
get_feature_from_index_fn,
report_gt_feature_n_positives=10):
report_gt_feature_n_positives=10,
metrics=dict(roc_auc=roc_auc_score, average_precision=average_precision_score)):
"""
Creates a new object of the `PerformanceMetrics` class.
"""
self.skip_threshold = report_gt_feature_n_positives
self.get_feature_from_index = get_feature_from_index_fn
self.metrics = {
"roc_auc": Metric(fn=roc_auc_score, data=[]),
"average_precision": Metric(fn=average_precision_score, data=[])
}
self.metrics = dict()
for k, v in metrics.items():
self.metrics[k] = Metric(fn=v, data=[])

def add_metric(self, name, metric_fn):
"""
@@ -430,7 +438,7 @@ def write_feature_scores_to_file(self, output_path):
file_handle.write("{0}\n".format(cols))
for feature, metric_scores in sorted(feature_scores.items()):
if not metric_scores:
file_handle.write("{0}\tNA\tNA\n".format(feature))
file_handle.write("{0}\t{1}\n".format(feature, "\t".join(["NA"] * len(metric_cols))))
else:
metric_score_cols = '\t'.join(
["{0:.4f}".format(s) for s in metric_scores.values()])
tutorials/README.md (3 changes: 2 additions & 1 deletion)
@@ -8,7 +8,8 @@ To get started on training a model very quickly, please see [`quickstart_trainin
Additionally, we have two tutorials that show how to apply trained models. Selene provides methods to run variant effect prediction and _in silico_ mutagenesis, along with some visualization methods that we recommend running based on our Jupyter notebook tutorials.

- Comprehensive _in silico_ mutagenesis tutorial: [`analyzing_mutations_with_trained_models`](https://github.com/FunctionLab/selene/tree/master/tutorials/analyzing_mutations_with_trained_models)
- Tutorial with both the config file method and the non-config file method of running Selene. Also shows how to run variant effect prediction and visualize the difference scores. Contains an _in silico_ mutagenesis example with known regulatory mutations: [`variants_and_visualizations`](https://github.com/FunctionLab/selene/tree/master/tutorials/variants_and_visualizations)
- Tutorial demonstrating Selene's use to predict mean ribosomal load based on 5' UTR sequences: [`regression_mpra_example`](https://github.com/FunctionLab/selene/tree/master/tutorials/regression_mpra_example)

## Contributing tutorials

tutorials/regression_mpra_example/download_data.py (63 changes: 63 additions & 0 deletions)
@@ -0,0 +1,63 @@
import io
import gzip
import os
import urllib.request
import tarfile

import numpy
import pandas
import scipy.io
import selene_sdk.sequences


def run():
target_column = "rl"
local_file = "sample_et_al.tar"

# Download the data.
urllib.request.urlretrieve("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE114002&format=file", local_file)
with tarfile.open(local_file, "r") as archive:
contents = archive.extractfile("GSM3130435_egfp_unmod_1.csv.gz").read()
contents = gzip.decompress(contents).decode("utf-8")
os.remove(local_file)

# Format data.
df = pandas.read_csv(io.StringIO(contents), sep=",", index_col=0)
df = df[["utr", "total_reads", target_column]]
df.sort_values("total_reads", inplace=True, ascending=False)
df.reset_index(inplace=True, drop=True)

# Split into train/validation/test.
df = df.iloc[:280000]
datasets = dict(test=df.iloc[:20000])
df = df.iloc[20000:]
df = df.sample(frac=1.)
datasets["validate"] = df.iloc[:20000]
datasets["train"] = df.iloc[20000:]
x = dict.fromkeys(datasets.keys())
y = dict.fromkeys(datasets.keys())

# Construct features.
for k in datasets.keys():
x[k] = list()
y[k] = list()
for i in range(datasets[k].shape[0]):
x[k].append(selene_sdk.sequences.Genome.sequence_to_encoding(datasets[k]["utr"].iloc[i]).T)
y[k].append(datasets[k][target_column].iloc[i])
x[k] = numpy.stack(x[k])
y[k] = numpy.asarray(y[k]).reshape(-1, 1)

# Scale w/ parameters from training data to prevent leakage.
sdev = numpy.std(y["train"])
mean = numpy.mean(y["train"])
for k in datasets.keys():
y[k] = (y[k] - mean) / sdev

# Write data to file.
for k in datasets.keys():
scipy.io.savemat("{0}.mat".format(k), dict(x=x[k], y=y[k]))


if __name__ == "__main__":
run()
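As a quick sanity check of the script's output (the expected shapes follow from the splits above: 240,000 train, 20,000 validate, and 20,000 test sequences of 50 bp, one-hot encoded over 4 bases):

```python
import scipy.io

data = scipy.io.loadmat("train.mat")
print(data["x"].shape)  # expected: (240000, 4, 50) -- transposed one-hot UTRs
print(data["y"].shape)  # expected: (240000, 1)     -- standardized MRL values
```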

tutorials/regression_mpra_example/regression_mpra_example.ipynb (115 changes: 115 additions & 0 deletions)
@@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Regression Models in Selene\n",
"\n",
"Selene is a flexible framework, and can be used for tasks beyond simple classification.\n",
"This tutorial demonstrates the simple process of training regression models with Selene.\n",
"For this example, we will predict mean ribosomal load (MRL) from 50 base pair 5' UTR sequences using models and data from [*Human 5′ UTR design and variant effect prediction from a massively parallel translation assay*](https://doi.org/10.1101/310375) by Sample et al.\n",
"This data was generated from a massively parallel reporter assay (MPRA), which you can read more about it in the preprint on [*bioRxiv*](https://doi.org/10.1101/310375).\n",
"\n",
"## Setup\n",
"\n",
"**Architecture:** The model is defined in [utr_model.py](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/utr_model.py), and only superficially differs from the model in [the paper](https://doi.org/10.1101/310375).\n",
"Since this is a real-valued regression problem, it is appropriate that the `criterion` method in `utr_model.py` uses the mean squared error.\n",
"\n",
"\n",
"**Data:** The data from Sample et al is available on the [Gene Expression Omnibus (GEO)](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE114002).\n",
"However, we have included [the `download_data.py` script](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/download_data.py), to download the data and preprocess it.\n",
"It should produce three files, `train.mat`, `validate.mat`, and `test.mat`.\n",
"They include the data for training, validation, and testing respectively.\n",
"At present, regression models can only be trained with `*.mat` files and the [`MatFileSampler`](http://selene.flatironinstitute.org/samplers.file_samplers.html#selene_sdk.samplers.file_samplers.MatFileSampler).\n",
"Further, a `MatFileSampler` must be specified for each of the `train.mat`, `validate.mat`, and `test.mat` files.\n",
"These `MatFileSampler`s are then used for the `train`, `validate`, and `test` arguments of a [`MultiFileSampler`](http://selene.flatironinstitute.org/samplers.html#selene_sdk.samplers.MultiFileSampler).\n",
"The specific syntax is demonstrated in the configuration file, [`regression_train.yml`](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/regression_train.yml).\n",
"\n",
"**Configuration file:** The configuration file [`regression_train.yml`](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/regression_train.yml) is slightly different than the configuration files in the classification tutorials.\n",
"Specifically, `metrics` in `train_model` includes the coefficient of determination (`r2`), since the default metrics (`roc_auc` and `average_precision`) are not appropriate for regression.\n",
"Further, `report_gt_feature_n_positives` in `train_model` has been set to zero to prevent spurious filtering based on target values.\n",
"\n",
"## Download the data\n",
"\n",
"To download the data, just run the [`download_data.py`](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/download_data.py) script from the command line:\n",
"```sh\n",
"python download_data.py\n",
"```\n",
"\n",
"## Train the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selene_sdk.utils import load_path\n",
"from selene_sdk.utils import parse_configs_and_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before running `load_path` on `regression_train.yml`, please edit the YAML file to include the absolute path of the model file.\n",
"\n",
"Currently, the model is set to train on GPU.\n",
"If you do not have CUDA on your machine, please set `use_cuda` to `False` in the configuration file. Note that using the CPU instead of GPU will slow down training considerably."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"configs = load_path(\"./regression_train.yml\", instantiate=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Outputs and logs saved to ./2018-12-08-22-08-14\n",
"[VALIDATE] average r2: 0.8641705948994154\n",
"[VALIDATE] average r2: 0.8767916124114791\n",
"[VALIDATE] average r2: 0.8817297326343803\n",
"[TEST] average r2: 0.9232683662644537\n"
]
}
],
"source": [
"parse_configs_and_run(configs, lr=0.001)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
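The notebook's Setup section states that the `criterion` in `utr_model.py` uses mean squared error. A hedged sketch of what that could look like; the exact function body is an assumption, since the tutorial only names the loss:

```python
import torch.nn as nn

def criterion():
    # Mean squared error for real-valued targets such as mean ribosomal load.
    return nn.MSELoss()
```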