From c96f3e0c273c761783dfbf60e0f4bf62bcc7a472 Mon Sep 17 00:00:00 2001
From: Narine Kokhlikyan
Date: Wed, 8 Jul 2020 22:42:33 -0700
Subject: [PATCH 1/4] Fix require grad issue in noise tunnel

---
 captum/_utils/gradient.py          |  22 +++--
 captum/attr/_core/gradient_shap.py |   4 +-
 captum/attr/_core/noise_tunnel.py  | 134 +++++++++++++++--------------
 3 files changed, 86 insertions(+), 74 deletions(-)

diff --git a/captum/_utils/gradient.py b/captum/_utils/gradient.py
index 4bb89243b2..b6439a0a01 100644
--- a/captum/_utils/gradient.py
+++ b/captum/_utils/gradient.py
@@ -135,6 +135,7 @@ def _forward_layer_eval(
     additional_forward_args: Any = None,
     device_ids: Union[None, List[int]] = None,
     attribute_to_layer_input: bool = False,
+    grad_enabled: bool = False,
 ) -> Tuple[Tuple[Tensor, ...], Literal[True, False]]:
     return _forward_layer_eval_with_neuron_grads(
         forward_fn,
@@ -142,6 +143,7 @@
         layer,
         additional_forward_args=additional_forward_args,
         gradient_neuron_index=None,
+        grad_enabled=grad_enabled,
         device_ids=device_ids,
         attribute_to_layer_input=attribute_to_layer_input,
     )
@@ -311,6 +313,7 @@ def _forward_layer_eval_with_neuron_grads(
     additional_forward_args: Any = None,
     *,
     gradient_neuron_index: Union[int, Tuple[int, ...]],
+    grad_enabled: bool = False,
     device_ids: Union[None, List[int]] = None,
     attribute_to_layer_input: bool = False,
 ) -> Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...], Literal[True, False]]:
@@ -324,6 +327,7 @@
     layer: Module,
     additional_forward_args: Any = None,
     gradient_neuron_index: None = None,
+    grad_enabled: bool = False,
     device_ids: Union[None, List[int]] = None,
     attribute_to_layer_input: bool = False,
 ) -> Tuple[Tuple[Tensor, ...], Literal[True, False]]:
@@ -336,6 +340,7 @@
     layer: Module,
     additional_forward_args: Any = None,
     gradient_neuron_index: Union[None, int, Tuple[int, ...]] = None,
+    grad_enabled: bool = False,
     device_ids: Union[None, List[int]] = None,
     attribute_to_layer_input: bool = False,
 ) -> Union[
@@ -357,13 +362,16 @@
     evals in a dictionary protected by a lock, analogous to the gather
     implementation for the core PyTorch DataParallel implementation.
     """
-    saved_layer, is_layer_tuple = _forward_layer_distributed_eval(
-        forward_fn,
-        inputs,
-        layer,
-        additional_forward_args=additional_forward_args,
-        attribute_to_layer_input=attribute_to_layer_input,
-    )
+    grad_enabled = True if gradient_neuron_index is not None or grad_enabled else False
+
+    with torch.autograd.set_grad_enabled(grad_enabled):
+        saved_layer, is_layer_tuple = _forward_layer_distributed_eval(
+            forward_fn,
+            inputs,
+            layer,
+            additional_forward_args=additional_forward_args,
+            attribute_to_layer_input=attribute_to_layer_input,
+        )
     device_ids = _extract_device_ids(forward_fn, saved_layer, device_ids)
     # Identifies correct device ordering based on device ids.
     # key_list is a list of devices in appropriate ordering for concatenation.
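The grad_enabled flag added above is what the rest of the series builds on: _forward_layer_eval now runs the forward pass under torch.autograd.set_grad_enabled(...), with gradients off unless the caller opts in. The snippet below is a rough sketch of that behaviour rather than code from this patch; the toy model and layer choice are made up for illustration.

    import torch
    import torch.nn as nn
    from captum._utils.gradient import _forward_layer_eval

    model = nn.Sequential(nn.Linear(3, 4), nn.ReLU(), nn.Linear(4, 1))
    inp = torch.randn(2, 3, requires_grad=True)

    # Default (grad_enabled=False): the forward pass runs with autograd
    # disabled, so the captured layer activations are detached from inp.
    acts = _forward_layer_eval(model, inp, model[0])[0][0]
    assert not acts.requires_grad

    # Opting in keeps the graph alive, so the activations can be
    # differentiated with respect to the inputs (the updated test relies on this).
    acts = _forward_layer_eval(model, inp, model[0], grad_enabled=True)[0][0]
    grads = torch.autograd.grad(acts.sum(), inp)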
diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py
index 0057e2960f..bbd1dded11 100644
--- a/captum/attr/_core/gradient_shap.py
+++ b/captum/attr/_core/gradient_shap.py
@@ -369,9 +369,9 @@ def _scale_input(
     inp_shape = (bsz,) + tuple([1] * len(inp_shape_wo_bsz))
 
     # expand and reshape the indices
-    rand_coefficient = rand_coefficient.view(inp_shape).requires_grad_()
+    rand_coefficient = rand_coefficient.view(inp_shape)
 
     input_baseline_scaled = (
         rand_coefficient * input + (torch.tensor(1) - rand_coefficient) * baseline
-    )
+    ).requires_grad_()
     return input_baseline_scaled
diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py
index ecbcf5be84..94512829d5 100644
--- a/captum/attr/_core/noise_tunnel.py
+++ b/captum/attr/_core/noise_tunnel.py
@@ -199,81 +199,85 @@ def compute_expected_attribution_and_sq(attribution):
             expected_attribution_sq = torch.mean(attribution ** 2, dim=1, keepdim=False)
             return expected_attribution, expected_attribution_sq
 
-        # Keeps track whether original input is a tuple or not before
-        # converting it into a tuple.
-        is_inputs_tuple = isinstance(inputs, tuple)
-
-        inputs = _format_input(inputs)
-
-        _validate_noise_tunnel_type(nt_type, SUPPORTED_NOISE_TUNNEL_TYPES)
-
-        delta = None
-        inputs_with_noise = add_noise_to_inputs()
-        # if the algorithm supports targets, baselines and/or additional_forward_args
-        # they will be expanded based on the n_steps and corresponding kwargs
-        # variables will be updated accordingly
-        _expand_and_update_additional_forward_args(n_samples, kwargs)
-        _expand_and_update_target(n_samples, kwargs)
-        _expand_and_update_baselines(
-            inputs,
-            n_samples,
-            kwargs,
-            draw_baseline_from_distrib=draw_baseline_from_distrib,
-        )
+        with torch.no_grad():
+            # Keeps track whether original input is a tuple or not before
+            # converting it into a tuple.
+            is_inputs_tuple = isinstance(inputs, tuple)
+
+            inputs = _format_input(inputs)
+
+            _validate_noise_tunnel_type(nt_type, SUPPORTED_NOISE_TUNNEL_TYPES)
+
+            delta = None
+            inputs_with_noise = add_noise_to_inputs()
+            # if the algorithm supports targets, baselines and/or
+            # additional_forward_args they will be expanded based
+            # on the n_steps and corresponding kwargs
+            # variables will be updated accordingly
+            _expand_and_update_additional_forward_args(n_samples, kwargs)
+            _expand_and_update_target(n_samples, kwargs)
+            _expand_and_update_baselines(
+                inputs,
+                n_samples,
+                kwargs,
+                draw_baseline_from_distrib=draw_baseline_from_distrib,
+            )
 
-        # smoothgrad_Attr(x) = 1 / n * sum(Attr(x + N(0, sigma^2))
-        # NOTE: using __wrapped__ such that it does not log the inner logs
-        attributions = self.attribution_method.attribute.__wrapped__(  # type: ignore
-            self.attribution_method,  # self
-            inputs_with_noise if is_inputs_tuple else inputs_with_noise[0],
-            **kwargs,
-        )
+            # smoothgrad_Attr(x) = 1 / n * sum(Attr(x + N(0, sigma^2))
+            # NOTE: using __wrapped__ such that it does not log the inner logs
+            attr_func = self.attribution_method.attribute
+            attributions = attr_func.__wrapped__(  # type: ignore
+                self.attribution_method,  # self
+                inputs_with_noise if is_inputs_tuple else inputs_with_noise[0],
+                **kwargs,
+            )
 
-        return_convergence_delta = (
-            "return_convergence_delta" in kwargs and kwargs["return_convergence_delta"]
-        )
+            return_convergence_delta = (
+                "return_convergence_delta" in kwargs
+                and kwargs["return_convergence_delta"]
+            )
 
-        if self.is_delta_supported and return_convergence_delta:
-            attributions, delta = attributions
+            if self.is_delta_supported and return_convergence_delta:
+                attributions, delta = attributions
 
-        is_attrib_tuple = _is_tuple(attributions)
-        attributions = _format_tensor_into_tuples(attributions)
+            is_attrib_tuple = _is_tuple(attributions)
+            attributions = _format_tensor_into_tuples(attributions)
 
-        expected_attributions = []
-        expected_attributions_sq = []
-        for attribution in attributions:
-            expected_attr, expected_attr_sq = compute_expected_attribution_and_sq(
-                attribution
-            )
-            expected_attributions.append(expected_attr)
-            expected_attributions_sq.append(expected_attr_sq)
+            expected_attributions = []
+            expected_attributions_sq = []
+            for attribution in attributions:
+                expected_attr, expected_attr_sq = compute_expected_attribution_and_sq(
+                    attribution
+                )
+                expected_attributions.append(expected_attr)
+                expected_attributions_sq.append(expected_attr_sq)
+
+            if NoiseTunnelType[nt_type] == NoiseTunnelType.smoothgrad:
+                return self._apply_checks_and_return_attributions(
+                    tuple(expected_attributions),
+                    is_attrib_tuple,
+                    return_convergence_delta,
+                    delta,
+                )
 
-        if NoiseTunnelType[nt_type] == NoiseTunnelType.smoothgrad:
-            return self._apply_checks_and_return_attributions(
-                tuple(expected_attributions),
-                is_attrib_tuple,
-                return_convergence_delta,
-                delta,
-            )
+            if NoiseTunnelType[nt_type] == NoiseTunnelType.smoothgrad_sq:
+                return self._apply_checks_and_return_attributions(
+                    tuple(expected_attributions_sq),
+                    is_attrib_tuple,
+                    return_convergence_delta,
+                    delta,
+                )
 
-        if NoiseTunnelType[nt_type] == NoiseTunnelType.smoothgrad_sq:
-            return self._apply_checks_and_return_attributions(
-                tuple(expected_attributions_sq),
-                is_attrib_tuple,
-                return_convergence_delta,
-                delta,
+            vargrad = tuple(
+                expected_attribution_sq - expected_attribution * expected_attribution
+                for expected_attribution, expected_attribution_sq in zip(
+                    expected_attributions, expected_attributions_sq
+                )
             )
 
-        vargrad = tuple(
-            expected_attribution_sq - expected_attribution * expected_attribution
-            for expected_attribution, expected_attribution_sq in zip(
-                expected_attributions, expected_attributions_sq
+            return self._apply_checks_and_return_attributions(
+                vargrad, is_attrib_tuple, return_convergence_delta, delta
             )
-        )
-
-        return self._apply_checks_and_return_attributions(
-            vargrad, is_attrib_tuple, return_convergence_delta, delta
-        )
 
     def _apply_checks_and_return_attributions(
         self,
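The no_grad block above (and the relocated requires_grad_() call in _scale_input) leans on a plain PyTorch property: a tensor created while autograd is disabled can be re-attached to the graph afterwards with requires_grad_(), and gradients then flow through whatever is computed from it. A small stand-alone illustration, independent of captum:

    import torch

    w = torch.tensor([2.0, -1.0])

    with torch.no_grad():
        # setup work (noise sampling, scaling, reshaping) builds no graph
        x = torch.rand(2) * 3.0

    x.requires_grad_()             # re-attach the leaf tensor to autograd
    y = (w * x).sum()              # differentiable forward pass
    y.backward()
    assert torch.equal(x.grad, w)  # dy/dx == w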
From 3cde889db23df0bf338ce89ec0bae9f984ac6942 Mon Sep 17 00:00:00 2001
From: Narine Kokhlikyan
Date: Wed, 8 Jul 2020 22:50:45 -0700
Subject: [PATCH 2/4] Fix test case

---
 tests/attr/neuron/test_neuron_gradient.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/attr/neuron/test_neuron_gradient.py b/tests/attr/neuron/test_neuron_gradient.py
index 38442da51b..b26fea11fa 100644
--- a/tests/attr/neuron/test_neuron_gradient.py
+++ b/tests/attr/neuron/test_neuron_gradient.py
@@ -126,7 +126,7 @@ def _gradient_matching_test_assert(
         while len(neuron) < len(out.shape) - 1:
             neuron = neuron + (0,)
         input_attrib = Saliency(
-            lambda x: _forward_layer_eval(model, x, output_layer)[0][0][
+            lambda x: _forward_layer_eval(model, x, output_layer, grad_enabled=True)[0][0][
                 (slice(None), *neuron)
             ]
         )

From c0a05e2f35cfc244e6f38db3bcfebcd95c30a7dc Mon Sep 17 00:00:00 2001
From: Narine Kokhlikyan
Date: Wed, 8 Jul 2020 22:59:54 -0700
Subject: [PATCH 3/4] Fix tests

---
 tests/attr/neuron/test_neuron_gradient.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/attr/neuron/test_neuron_gradient.py b/tests/attr/neuron/test_neuron_gradient.py
index b26fea11fa..daa8bb2328 100644
--- a/tests/attr/neuron/test_neuron_gradient.py
+++ b/tests/attr/neuron/test_neuron_gradient.py
@@ -126,9 +126,9 @@ def _gradient_matching_test_assert(
         while len(neuron) < len(out.shape) - 1:
             neuron = neuron + (0,)
         input_attrib = Saliency(
-            lambda x: _forward_layer_eval(model, x, output_layer, grad_enabled=True)[0][0][
-                (slice(None), *neuron)
-            ]
+            lambda x: _forward_layer_eval(
+                model, x, output_layer, grad_enabled=True
+            )[0][0][(slice(None), *neuron)]
         )
         sal_vals = input_attrib.attribute(test_input, abs=False)
         grad_vals = gradient_attrib.attribute(test_input, neuron)
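Patches 2 and 3 only adapt the test helper: Saliency differentiates whatever its forward function returns with respect to the input, so once _forward_layer_eval defaults to running without autograd, the lambda in the test must pass grad_enabled=True explicitly. The pattern is roughly the sketch below; the model, layer, and neuron index are placeholders rather than the actual test fixtures.

    import torch
    import torch.nn as nn
    from captum.attr import Saliency
    from captum._utils.gradient import _forward_layer_eval

    model = nn.Sequential(nn.Linear(3, 4), nn.ReLU(), nn.Linear(4, 2))
    layer, neuron = model[0], (1,)  # placeholder layer and neuron index

    # Forward function returns one scalar per example: the selected neuron's
    # activation in the chosen layer, kept differentiable via grad_enabled=True.
    input_attrib = Saliency(
        lambda x: _forward_layer_eval(
            model, x, layer, grad_enabled=True
        )[0][0][(slice(None), *neuron)]
    )

    test_input = torch.randn(4, 3, requires_grad=True)
    sal_vals = input_attrib.attribute(test_input, abs=False)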
" "Given: {}".format(type(stdevs)) stdevs_ = (stdevs,) * len(inputs) return tuple( - add_noise_to_input(input, stdev) + add_noise_to_input(input, stdev).requires_grad_() + if self.is_gradient_method + else add_noise_to_input(input, stdev) for (input, stdev) in zip(inputs, stdevs_) )