selene_sdk/samplers/file_samplers/mat_file_sampler.py (2 changes: 1 addition & 1 deletion)
@@ -225,5 +225,5 @@ def get_data_and_targets(self, batch_size, n_samples=None):
sequences_and_targets.append((seqs, tgts))
targets_mat.append(tgts)
# TODO: should not assume targets are always integers
targets_mat = np.vstack(targets_mat).astype(int)
targets_mat = np.vstack(targets_mat).astype(float)
return sequences_and_targets, targets_mat
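For context, a standalone sketch (with hypothetical target values, not from the repository) of why this cast matters once targets are real-valued regression labels:

```python
import numpy as np

# Hypothetical real-valued targets, e.g. mean ribosomal load per batch.
targets_mat = [np.array([[4.73], [6.21]]), np.array([[5.02]])]

print(np.vstack(targets_mat).astype(int).ravel())    # [4 6 5] -- truncated
print(np.vstack(targets_mat).astype(float).ravel())  # [4.73 6.21 5.02]
```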
selene_sdk/train_model.py (39 changes: 24 additions & 15 deletions)
@@ -12,6 +12,8 @@
import torch.nn as nn
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

from .utils import initialize_logger
from .utils import load_model_from_state_dict
@@ -117,6 +119,11 @@ class TrainModel(object):
The directory to save model checkpoints and logs.
training_loss : list(float)
The current training loss.
metrics : dict
A dictionary that maps metric names (`str`) to metric functions.
By default, this contains `"roc_auc"`, which maps to
`sklearn.metrics.roc_auc_score`, and `"average_precision"`,
which maps to `sklearn.metrics.average_precision_score`.

"""

@@ -138,7 +145,9 @@ def __init__(self,
use_cuda=False,
data_parallel=False,
logging_verbosity=2,
checkpoint_resume=None):
checkpoint_resume=None,
metrics=dict(roc_auc=roc_auc_score,
average_precision=average_precision_score)):
"""
Constructs a new `TrainModel` object.
"""
@@ -181,13 +190,15 @@ def __init__(self,
self._create_validation_set(n_samples=n_validation_samples)
self._validation_metrics = PerformanceMetrics(
self.sampler.get_feature_from_index,
report_gt_feature_n_positives=report_gt_feature_n_positives)
report_gt_feature_n_positives=report_gt_feature_n_positives,
metrics=metrics)

if "test" in self.sampler.modes:
self._create_test_set(n_samples=n_test_samples)
self._test_metrics = PerformanceMetrics(
self.sampler.get_feature_from_index,
report_gt_feature_n_positives=report_gt_feature_n_positives)
report_gt_feature_n_positives=report_gt_feature_n_positives,
metrics=metrics)

self._start_step = 0
self._min_loss = float("inf") # TODO: Should this be set when it is used later? Would need to if we want to train model 2x in one run.
@@ -224,7 +235,8 @@ def __init__(self,
self._train_logger.info("loss")
# TODO: this makes the assumption that all models will report ROC AUC,
# which is not the case.
self._validation_logger.info("loss\troc_auc")
self._validation_logger.info("\t".join(["loss"] +
sorted([x for x in self._validation_metrics.metrics.keys()])))

def _create_validation_set(self, n_samples=None):
"""
@@ -315,17 +327,14 @@ def train_and_validate(self):
valid_scores = self.validate()
validation_loss = valid_scores["loss"]
self._train_logger.info(train_loss)
# TODO: check if "roc_auc" is a key in `valid_scores`?
if valid_scores["roc_auc"]:
validation_roc_auc = valid_scores["roc_auc"]
self._validation_logger.info(
"{0}\t{1}".format(validation_loss,
validation_roc_auc))
scheduler.step(
math.ceil(validation_roc_auc * 1000.0) / 1000.0)
else:
self._validation_logger.info("{0}\tNA".format(
validation_loss))
to_log = [str(validation_loss)]
for k in sorted(self._validation_metrics.metrics.keys()):
if k in valid_scores and valid_scores[k]:
to_log.append(str(valid_scores[k]))
else:
to_log.append("NA")
self._validation_logger.info("\t".join(to_log))
scheduler.step(math.ceil(validation_loss * 1000.0) / 1000.0)

is_best = validation_loss < min_loss
min_loss = min(validation_loss, min_loss)
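Two behaviors change here: missing or falsy metric scores are logged as `NA` so the columns always match the header, and the scheduler now steps on validation loss rather than ROC AUC, which works for any metrics dict. A self-contained sketch of the row construction, with hypothetical scores:

```python
valid_scores = {"loss": 0.412, "r2": 0.88, "roc_auc": None}  # hypothetical
metric_names = ["average_precision", "r2", "roc_auc"]        # sorted keys

to_log = [str(valid_scores["loss"])]
for k in sorted(metric_names):
    if k in valid_scores and valid_scores[k]:
        to_log.append(str(valid_scores[k]))
    else:
        to_log.append("NA")  # absent (average_precision) or None (roc_auc)
print("\t".join(to_log))  # 0.412	NA	0.88	NA
```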
selene_sdk/utils/performance_metrics.py (28 changes: 18 additions & 10 deletions)
@@ -11,6 +11,7 @@
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import r2_score


logger = logging.getLogger("selene")
@@ -201,11 +202,11 @@ def compute_score(prediction, target, metric_fn,
feature_scores = np.ones(target.shape[1]) * -1
for index, feature_preds in enumerate(prediction.T):
feature_targets = target[:, index]
if len(np.unique(feature_targets)) > 1 and \
np.sum(feature_targets) > report_gt_feature_n_positives:
if len(np.unique(feature_targets)) > 0 and \
np.count_nonzero(feature_targets) > report_gt_feature_n_positives:
feature_scores[index] = metric_fn(
feature_targets, feature_preds)
valid_feature_scores = [s for s in feature_scores if s >= 0]
valid_feature_scores = [s for s in feature_scores if not np.isnan(s)] # Allow 0 or negative values.
if not valid_feature_scores:
return None, feature_scores
average_score = np.average(valid_feature_scores)
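Both relaxations matter for regression targets: `np.sum` can misstate how many examples carry a label once values are real-valued (negative values cancel), and an `s >= 0` filter would discard legitimately negative scores. A quick illustration:

```python
import numpy as np
from sklearn.metrics import r2_score

feature_targets = np.array([-1.2, 0.0, 1.2])
print(np.sum(feature_targets))            # 0.0 -- values cancel out
print(np.count_nonzero(feature_targets))  # 2  -- actual labeled examples

# r2_score can be zero or negative (worse than predicting the mean), so a
# valid score must not be filtered out by sign:
print(r2_score([1.0, 2.0, 3.0], [3.0, 2.0, 1.0]))  # -3.0
```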
@@ -237,7 +238,7 @@ def get_feature_specific_scores(data, get_feature_from_index_fn):
feature_score_dict = {}
for index, score in enumerate(data):
feature = get_feature_from_index_fn(index)
if score >= 0:
if not np.isnan(score):
feature_score_dict[feature] = score
else:
feature_score_dict[feature] = None
@@ -257,6 +258,13 @@ class PerformanceMetrics(object):
report_gt_feature_n_positives : int, optional
Default is 10. The minimum number of positive examples for a
feature in order to compute the score for it.
metrics : dict
A dictionary that maps metric names (`str`) to metric functions.
By default, this contains `"roc_auc"`, which maps to
`sklearn.metrics.roc_auc_score`, and `"average_precision"`,
which maps to `sklearn.metrics.average_precision_score`.



Attributes
----------
@@ -276,16 +284,16 @@ class PerformanceMetrics(object):

def __init__(self,
get_feature_from_index_fn,
report_gt_feature_n_positives=10):
report_gt_feature_n_positives=10,
metrics=dict(roc_auc=roc_auc_score, average_precision=average_precision_score)):
"""
Creates a new object of the `PerformanceMetrics` class.
"""
self.skip_threshold = report_gt_feature_n_positives
self.get_feature_from_index = get_feature_from_index_fn
self.metrics = {
"roc_auc": Metric(fn=roc_auc_score, data=[]),
"average_precision": Metric(fn=average_precision_score, data=[])
}
self.metrics = dict()
for k, v in metrics.items():
self.metrics[k] = Metric(fn=v, data=[])

def add_metric(self, name, metric_fn):
"""
@@ -430,7 +438,7 @@ def write_feature_scores_to_file(self, output_path):
file_handle.write("{0}\n".format(cols))
for feature, metric_scores in sorted(feature_scores.items()):
if not metric_scores:
file_handle.write("{0}\tNA\tNA\n".format(feature))
file_handle.write("{0}\t{1}\n".format(feature, "\t".join(["NA"] * len(metric_cols))))
else:
metric_score_cols = '\t'.join(
["{0:.4f}".format(s) for s in metric_scores.values()])
tutorials/README.md (3 changes: 2 additions & 1 deletion)
@@ -8,7 +8,8 @@ To get started on training a model very quickly, please see [`quickstart_trainin
Additionally, we have two tutorials that show how to apply trained models. Selene provides methods to run variant effect prediction and _in silico_ mutagenesis, along with some visualization methods that we recommend running based on our Jupyter notebook tutorials.

- Comprehensive _in silico_ mutagenesis tutorial: [`analyzing_mutations_with_trained_models`](https://github.com/FunctionLab/selene/tree/master/tutorials/analyzing_mutations_with_trained_models)
- Tutorial with both the config file method and the non-config file method of running Selene. Also shows how to run variant effect prediction and visualize the difference scores. Contains an _in silico_ mutagenesis example with known regulatory mutations: [`variants_and_visualizations`](https://github.com/FunctionLab/selene/tree/master/tutorials/variants_and_visualizations)
- Tutorial demonstrating Selene's use to predict mean ribosomal load based on 5' UTR sequences: [`regression_mpra_example`](https://github.com/FunctionLab/selene/tree/master/tutorials/regression_mpra_example)

## Contributing tutorials

tutorials/regression_mpra_example/download_data.py (63 changes: 63 additions & 0 deletions)
@@ -0,0 +1,63 @@
import io
import gzip
import os
import urllib.request
import tarfile

import numpy
import pandas
import scipy.io
import selene_sdk.sequences


def run():
target_column = "rl"
local_file = "sample_et_al.tar"

# Download the data.
urllib.request.urlretrieve("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE114002&format=file", local_file)
with tarfile.open(local_file, "r") as archive:
contents = archive.extractfile("GSM3130435_egfp_unmod_1.csv.gz").read()
contents = gzip.decompress(contents).decode("utf-8")
os.remove(local_file)

# Format data.
df = pandas.read_csv(io.StringIO(contents), sep=",", index_col=0)
df = df[["utr", "total_reads", target_column]]
df.sort_values("total_reads", inplace=True, ascending=False)
df.reset_index(inplace=True, drop=True)

# Split into train/validation/test.
df = df.iloc[:280000]
datasets = dict(test=df.iloc[:20000])
df = df.iloc[20000:]
df = df.sample(frac=1.)
datasets["validate"] = df.iloc[:20000]
datasets["train"] = df.iloc[20000:]
x = dict.fromkeys(datasets.keys())
y = dict.fromkeys(datasets.keys())

# Construct features.
for k in datasets.keys():
x[k] = list()
y[k] = list()
for i in range(datasets[k].shape[0]):
x[k].append(selene_sdk.sequences.Genome.sequence_to_encoding(datasets[k]["utr"].iloc[i]).T)
y[k].append(datasets[k][target_column].iloc[i])
x[k] = numpy.stack(x[k])
y[k] = numpy.asarray(y[k]).reshape(-1, 1)

# Scale w/ parameters from training data to prevent leakage.
sdev = numpy.std(y["train"])
mean = numpy.mean(y["train"])
for k in datasets.keys():
y[k] = (y[k] - mean) / sdev

# Write data to file.
for k in datasets.keys():
scipy.io.savemat("{0}.mat".format(k), dict(x=x[k], y=y[k]))


if __name__ == "__main__":
run()
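As a quick sanity check of the script's output (the expected shapes follow from the splits above: 240,000 train, 20,000 validate, and 20,000 test sequences of 50 bp, one-hot encoded over 4 bases):

```python
import scipy.io

data = scipy.io.loadmat("train.mat")
print(data["x"].shape)  # expected: (240000, 4, 50) -- transposed one-hot UTRs
print(data["y"].shape)  # expected: (240000, 1)     -- standardized MRL values
```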

tutorials/regression_mpra_example/regression_mpra_example.ipynb (115 changes: 115 additions & 0 deletions)
@@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Regression Models in Selene\n",
"\n",
"Selene is a flexible framework, and can be used for tasks beyond simple classification.\n",
"This tutorial demonstrates the simple process of training regression models with Selene.\n",
"For this example, we will predict mean ribosomal load (MRL) from 50 base pair 5' UTR sequences using models and data from [*Human 5′ UTR design and variant effect prediction from a massively parallel translation assay*](https://doi.org/10.1101/310375) by Sample et al.\n",
"This data was generated from a massively parallel reporter assay (MPRA), which you can read more about it in the preprint on [*bioRxiv*](https://doi.org/10.1101/310375).\n",
"\n",
"## Setup\n",
"\n",
"**Architecture:** The model is defined in [utr_model.py](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/utr_model.py), and only superficially differs from the model in [the paper](https://doi.org/10.1101/310375).\n",
"Since this is a real-valued regression problem, it is appropriate that the `criterion` method in `utr_model.py` uses the mean squared error.\n",
"\n",
"\n",
"**Data:** The data from Sample et al is available on the [Gene Expression Omnibus (GEO)](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE114002).\n",
"However, we have included [the `download_data.py` script](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/download_data.py), to download the data and preprocess it.\n",
"It should produce three files, `train.mat`, `validate.mat`, and `test.mat`.\n",
"They include the data for training, validation, and testing respectively.\n",
"At present, regression models can only be trained with `*.mat` files and the [`MatFileSampler`](http://selene.flatironinstitute.org/samplers.file_samplers.html#selene_sdk.samplers.file_samplers.MatFileSampler).\n",
"Further, a `MatFileSampler` must be specified for each of the `train.mat`, `validate.mat`, and `test.mat` files.\n",
"These `MatFileSampler`s are then used for the `train`, `validate`, and `test` arguments of a [`MultiFileSampler`](http://selene.flatironinstitute.org/samplers.html#selene_sdk.samplers.MultiFileSampler).\n",
"The specific syntax is demonstrated in the configuration file, [`regression_train.yml`](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/regression_train.yml).\n",
"\n",
"**Configuration file:** The configuration file [`regression_train.yml`](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/regression_train.yml) is slightly different than the configuration files in the classification tutorials.\n",
"Specifically, `metrics` in `train_model` includes the coefficient of determination (`r2`), since the default metrics (`roc_auc` and `average_precision`) are not appropriate for regression.\n",
"Further, `report_gt_feature_n_positives` in `train_model` has been set to zero to prevent spurious filtering based on target values.\n",
"\n",
"## Download the data\n",
"\n",
"To download the data, just run the [`download_data.py`](https://github.com/FunctionLab/selene/blob/master/tutorials/regression_mpra_example/download_data.py) script from the command line:\n",
"```sh\n",
"python download_data.py\n",
"```\n",
"\n",
"## Train the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selene_sdk.utils import load_path\n",
"from selene_sdk.utils import parse_configs_and_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before running `load_path` on `regression_train.yml`, please edit the YAML file to include the absolute path of the model file.\n",
"\n",
"Currently, the model is set to train on GPU.\n",
"If you do not have CUDA on your machine, please set `use_cuda` to `False` in the configuration file. Note that using the CPU instead of GPU will slow down training considerably."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"configs = load_path(\"./regression_train.yml\", instantiate=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Outputs and logs saved to ./2018-12-08-22-08-14\n",
"[VALIDATE] average r2: 0.8641705948994154\n",
"[VALIDATE] average r2: 0.8767916124114791\n",
"[VALIDATE] average r2: 0.8817297326343803\n",
"[TEST] average r2: 0.9232683662644537\n"
]
}
],
"source": [
"parse_configs_and_run(configs, lr=0.001)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
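The notebook's Setup section states that the `criterion` in `utr_model.py` uses mean squared error. A hedged sketch of what that could look like; the exact function body is an assumption, since the tutorial only names the loss:

```python
import torch.nn as nn

def criterion():
    # Mean squared error for real-valued targets such as mean ribosomal load.
    return nn.MSELoss()
```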