Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 95 additions & 72 deletions tests/influence/_utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,20 @@ def _wrap_model_in_dataparallel(net):
return torch.nn.DataParallel(net, device_ids=alt_device_ids)


def _move_sample_list_to_cuda(samples: List[Tensor]) -> List[Tensor]:
    """
    Returns a new list with every tensor in `samples` moved to the default
    CUDA device via `Tensor.cuda()`; the input list is not modified.
    """
    return [s.cuda() for s in samples]


class ExplicitDataset(Dataset):
def __init__(
    self,
    samples: Tensor,
    labels: Tensor,
    use_gpu=False,
) -> None:
    """
    Stores `samples` and `labels` on the dataset; when `use_gpu` is True,
    both tensors are moved to the default CUDA device.
    """
    self.samples, self.labels = samples, labels
    if use_gpu:
        # samples is a single tensor here (contrast with UnpackDataset,
        # whose samples is a list of tensors)
        self.samples = self.samples.cuda()
        self.labels = self.labels.cuda()

def __len__(self) -> int:
Expand All @@ -59,14 +60,15 @@ def __getitem__(self, idx):


class UnpackDataset(Dataset):
def __init__(
    self,
    samples: List[Tensor],
    labels: Tensor,
    use_gpu=False,
) -> None:
    """
    Stores the list of per-input `samples` tensors and the `labels` tensor;
    when `use_gpu` is True, every tensor is moved to the default CUDA device.
    """
    self.samples, self.labels = samples, labels
    if use_gpu:
        # samples is a list of tensors (one per model input), so each
        # element is moved to cuda individually
        self.samples = _move_sample_list_to_cuda(self.samples)
        self.labels = self.labels.cuda()

def __len__(self) -> int:
Expand Down Expand Up @@ -225,6 +227,72 @@ def forward(self, *inputs: Tensor) -> Tensor:
return self.linear(torch.cat(inputs, dim=1))


def get_random_data(
    in_features: int,
    out_features: int,
    num_examples: int,
    use_gpu: bool,
    unpack_inputs: bool,
    num_train: int = 32,
    num_hessian: int = 22,
) -> Tuple[Dataset, Dataset, Dataset]:
    """
    Returns `(train_dataset, hessian_dataset, test_dataset)` constructed from
    random features and random labels, with features having shape
    [num_examples x in_features] (one such tensor per model input when
    `unpack_inputs` is True) and labels having shape
    [num_examples x out_features].

    The train dataset is the first `num_train` examples, the hessian dataset
    is the first `num_hessian` examples, and the test dataset is the
    remaining `num_examples - num_train` examples, so `num_examples` should
    exceed `num_train` for the test dataset to be non-empty.  `num_hessian`
    needs to be high to prevent numerical issues.

    Note: the random labels and features for the different datasets need to
    be generated together.  Otherwise, some tests will fail
    (https://fburl.com/testinfra/737jnpip)
    """
    # 2 feature tensors when inputs are passed unpacked, otherwise a single one
    num_inputs = 2 if unpack_inputs else 1

    labels = torch.normal(1, 2, (num_examples, out_features)).double()
    labels = labels.cuda() if use_gpu else labels

    all_samples = [
        torch.normal(0, 1, (num_examples, in_features)).double()
        for _ in range(num_inputs)
    ]
    if use_gpu:
        all_samples = _move_sample_list_to_cuda(all_samples)

    # TODO: no need to pass use_gpu below, since the data has already been
    # moved to cuda above (and `.cuda()` on a cuda tensor is a no-op)
    def _subset(sl: slice) -> Dataset:
        # build one dataset over the examples selected by `sl`, using the
        # dataset type matching whether inputs are unpacked
        if unpack_inputs:
            return UnpackDataset(
                [samples[sl] for samples in all_samples],
                labels[sl],
                use_gpu,
            )
        return ExplicitDataset(all_samples[0][sl], labels[sl], use_gpu)

    train_dataset = _subset(slice(None, num_train))
    hessian_dataset = _subset(slice(None, num_hessian))
    test_dataset = _subset(slice(num_train, None))
    return (train_dataset, hessian_dataset, test_dataset)


def get_random_model_and_data(
tmpdir,
unpack_inputs,
Expand Down Expand Up @@ -283,38 +351,9 @@ def get_random_model_and_data(
out_features = 3

num_samples = 50
num_train = 32
num_hessian = 22 # this needs to be high to prevent numerical issues
all_labels = torch.normal(1, 2, (num_samples, out_features)).double()
all_labels = all_labels.cuda() if use_gpu else all_labels
train_labels = all_labels[:num_train]
test_labels = all_labels[num_train:]
hessian_labels = all_labels[:num_hessian]

if unpack_inputs:
all_samples = [
torch.normal(0, 1, (num_samples, in_features)).double()
for _ in range(num_inputs)
]
if use_gpu:
all_samples = _move_sample_to_cuda(all_samples)

train_samples = [ts[:num_train] for ts in all_samples]
test_samples = [ts[num_train:] for ts in all_samples]
hessian_samples = [ts[:num_hessian] for ts in all_samples]
else:
all_samples = torch.normal(0, 1, (num_samples, in_features)).double()

if use_gpu:
all_samples = all_samples.cuda()
train_samples = all_samples[:num_train]
test_samples = all_samples[num_train:]
hessian_samples = all_samples[:num_hessian]

dataset = (
ExplicitDataset(train_samples, train_labels, use_gpu)
if not unpack_inputs
else UnpackDataset(train_samples, train_labels, use_gpu)
train_dataset, hessian_dataset, test_dataset = get_random_data(
in_features, out_features, num_samples, use_gpu, unpack_inputs
)

if model_type == "random":
Expand Down Expand Up @@ -358,15 +397,19 @@ def get_random_model_and_data(

# turn input into a single tensor for use by least squares
tensor_hessian_samples = (
hessian_samples if not unpack_inputs else torch.cat(hessian_samples, dim=1)
hessian_dataset.samples
if not unpack_inputs
else torch.cat(hessian_dataset.samples, dim=1)
)
version = _parse_version(torch.__version__)
if version < (1, 9):
theta = torch.lstsq(tensor_hessian_samples, hessian_labels).solution[0:1]
theta = torch.lstsq(
tensor_hessian_samples, hessian_dataset.labels
).solution[0:1]
else:
# run least squares to get optimal trained parameters
theta = torch.linalg.lstsq(
hessian_labels,
hessian_dataset.labels,
tensor_hessian_samples,
).solution
# the first `n` rows of `theta` contains the least squares solution, where
Expand Down Expand Up @@ -397,14 +440,7 @@ def get_random_model_and_data(
)

# train model using several optimization steps on Hessian data

# create entire Hessian data as a batch
hessian_dataset = (
ExplicitDataset(hessian_samples, hessian_labels, use_gpu)
if not unpack_inputs
else UnpackDataset(hessian_samples, hessian_labels, use_gpu)
)
batch = next(iter(DataLoader(hessian_dataset, batch_size=num_hessian)))
batch = next(iter(DataLoader(hessian_dataset, batch_size=len(hessian_dataset))))

optimizer = torch.optim.Adam(net.parameters())
num_steps = 200
Expand All @@ -425,26 +461,13 @@ def get_random_model_and_data(

training_data = (
net_adjusted,
dataset,
train_dataset,
)

hessian_data = (
(
_move_sample_to_cuda(hessian_samples)
if isinstance(hessian_samples, list) and use_gpu
else (hessian_samples.cuda() if use_gpu else hessian_samples)
),
hessian_labels.cuda() if use_gpu else hessian_labels,
)
hessian_data = (hessian_dataset.samples, hessian_dataset.labels)

test_data = (test_dataset.samples, test_dataset.labels)

test_data = (
(
_move_sample_to_cuda(test_samples)
if isinstance(test_samples, list) and use_gpu
else (test_samples.cuda() if use_gpu else test_samples)
),
test_labels.cuda() if use_gpu else test_labels,
)
if return_test_data:
if not return_hessian_data:
return (*training_data, *test_data)
Expand Down