diff --git a/captum/attr/_core/dataloader_attr.py b/captum/attr/_core/dataloader_attr.py index 60b1e4377d..f810b9645b 100644 --- a/captum/attr/_core/dataloader_attr.py +++ b/captum/attr/_core/dataloader_attr.py @@ -3,7 +3,7 @@ # pyre-strict from collections import defaultdict from copy import copy -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, cast, Dict, Iterable, List, Optional, Tuple, Union import torch from captum._utils.common import ( @@ -193,8 +193,7 @@ def _forward_with_dataloader( feature_mask: Tuple[Tensor, ...], # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters. reduce: Callable, - # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters. - to_metric: Optional[Callable], + to_metric: Optional[Callable[[Tensor], Tensor]], show_progress: bool, feature_idx_to_mask_idx: Dict[int, List[int]], ) -> Tensor: @@ -243,7 +242,8 @@ def _forward_with_dataloader( accum_states[i] = reduce(accum_states[i], output, perturbed_inputs) - accum_results = [ + accum_states = cast(List[Tensor], accum_states) + accum_results: List[Tensor] = [ to_metric(accum) if to_metric else accum for accum in accum_states ] @@ -276,7 +276,7 @@ def attribute( Args: dataloader (torch.Dataloader): the dataloader to attribute, which should - return a tuple of consistant size for every iteration + return a tuple of consistent size for every iteration input_roles (tuple[int, ...], optional): a tuple of integers to define the role of each element returned from the dataloader. It should have the same size as the return of the dataloader. @@ -326,7 +326,7 @@ def attribute( traverses needed is ceil(n_perturbations / perturbations_per_pass). - This arguement offers control of the trade-off between memory + This argument offers control of the trade-off between memory and efficiency. If the dataloader involves slow operations like remote request or file I/O, multiple traversals can be inefficient. On the other hand, each perturbation needs to diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index f579a531dc..dc8447d1b3 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -522,7 +522,10 @@ def attribute( if show_progress: attr_progress.close() - combined_interp_inps = torch.cat(interpretable_inps).float() + # Argument 1 to "cat" has incompatible type + # "list[Tensor | tuple[Tensor, ...]]"; + # expected "tuple[Tensor, ...] 
| list[Tensor]" [arg-type] + combined_interp_inps = torch.cat(interpretable_inps).float() # type: ignore combined_outputs = ( torch.cat(outputs) if len(outputs[0].shape) > 0 diff --git a/captum/concept/_utils/classifier.py b/captum/concept/_utils/classifier.py index c9e7fc4022..477fa0c255 100644 --- a/captum/concept/_utils/classifier.py +++ b/captum/concept/_utils/classifier.py @@ -186,7 +186,9 @@ def train_and_eval( x_train, x_test, y_train, y_test = _train_test_split( torch.cat(inputs), torch.cat(labels), test_split=test_split_ratio ) - self.lm.device = device + # error: Incompatible types in assignment (expression has type "str | Any", + # variable has type "Tensor | Module") [assignment] + self.lm.device = device # type: ignore self.lm.fit(DataLoader(TensorDataset(x_train, y_train))) predict = self.lm(x_test) diff --git a/captum/log/__init__.py b/captum/log/__init__.py index 82e851c14e..d70dea94fe 100644 --- a/captum/log/__init__.py +++ b/captum/log/__init__.py @@ -24,7 +24,7 @@ except ImportError: from functools import wraps - def log(*args: Any, **kwargs: Any) -> None: + def log(*args: Any, **kwargs: Any) -> None: # type: ignore pass # bug with mypy: https://github.com/python/mypy/issues/1153 @@ -56,12 +56,12 @@ def wrapper(*args: Any, **kwargs: Any): return _log_usage # pyre-fixme[2]: Parameter must be annotated. - def set_environment(env) -> None: + def set_environment(env) -> None: # type: ignore pass def disable_detailed_logging() -> None: pass # pyre-fixme[2]: Parameter must be annotated. - def patch_methods(tester, patch_log: bool = True) -> None: + def patch_methods(tester, patch_log: bool = True) -> None: # type: ignore pass diff --git a/captum/module/gaussian_stochastic_gates.py b/captum/module/gaussian_stochastic_gates.py index 18bffe732d..58650fd5a6 100644 --- a/captum/module/gaussian_stochastic_gates.py +++ b/captum/module/gaussian_stochastic_gates.py @@ -81,7 +81,7 @@ def __init__( mask=mask, # pyre-fixme[6]: For 3rd argument expected `float` but got # `Optional[float]`. - reg_weight=reg_weight, + reg_weight=reg_weight, # type: ignore reg_reduction=reg_reduction, ) @@ -91,7 +91,7 @@ def __init__( # pyre-fixme[58]: `<` is not supported for operand types `int` and # `Optional[float]`. - assert 0 < std, f"the standard deviation should be positive, received {std}" + assert 0 < std, f"the standard deviation should be positive, received {std}" # type: ignore # noqa: E501 line too long self.std = std def _sample_gate_values(self, batch_size: int) -> Tensor: @@ -109,7 +109,7 @@ def _sample_gate_values(self, batch_size: int) -> Tensor: n = torch.empty(batch_size, self.n_gates, device=self.mu.device) # pyre-fixme[6]: For 2nd argument expected `float` but got # `Optional[float]`. 
- n.normal_(mean=0, std=self.std) + n.normal_(mean=0, std=self.std) # type: ignore return self.mu + n return self.mu.expand(batch_size, self.n_gates) diff --git a/tests/attr/helpers/gen_test_utils.py b/tests/attr/helpers/gen_test_utils.py index 5dc0f7f22b..4ac1dd5909 100644 --- a/tests/attr/helpers/gen_test_utils.py +++ b/tests/attr/helpers/gen_test_utils.py @@ -41,7 +41,7 @@ def parse_test_config( baseline_distr = ( test_config["baseline_distr"] if "baseline_distr" in test_config else False ) - return algorithms, model, args, layer, noise_tunnel, baseline_distr + return algorithms, model, args, layer, noise_tunnel, baseline_distr # type: ignore def should_create_generated_test(algorithm: Type[Attribution]) -> bool: diff --git a/tests/attr/layer/test_layer_gradient_shap.py b/tests/attr/layer/test_layer_gradient_shap.py index 045e3da77b..b50bac751f 100644 --- a/tests/attr/layer/test_layer_gradient_shap.py +++ b/tests/attr/layer/test_layer_gradient_shap.py @@ -201,7 +201,7 @@ def _assert_attributions( if expected_delta is None: assert_attribution_delta( # pyre-fixme[6]: For 1st argument expected `FbBaseTest` but got `Test`. - self, + self, # type: ignore inputs, attrs, n_samples, diff --git a/tests/attr/test_data_parallel.py b/tests/attr/test_data_parallel.py index bf89b9068a..2135e9e368 100644 --- a/tests/attr/test_data_parallel.py +++ b/tests/attr/test_data_parallel.py @@ -4,7 +4,7 @@ import copy import os from enum import Enum -from typing import Any, Callable, cast, Dict, Optional, Tuple, Type +from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type import torch import torch.distributed as dist @@ -136,91 +136,22 @@ def data_parallel_test_assert(self) -> None: else: cuda_args[key] = args[key] - alt_device_ids = None cuda_model = copy.deepcopy(model).cuda() - # Initialize models based on DataParallelCompareMode - if mode is DataParallelCompareMode.cpu_cuda: - model_1, model_2 = model, cuda_model - args_1, args_2 = args, cuda_args - elif mode is DataParallelCompareMode.data_parallel_default: - model_1, model_2 = ( - cuda_model, - torch.nn.parallel.DataParallel(cuda_model), - ) - args_1, args_2 = cuda_args, cuda_args - elif mode is DataParallelCompareMode.data_parallel_alt_dev_ids: - alt_device_ids = [0] + [ - x for x in range(torch.cuda.device_count() - 1, 0, -1) - ] - model_1, model_2 = ( - cuda_model, - torch.nn.parallel.DataParallel( - cuda_model, device_ids=alt_device_ids - ), - ) - args_1, args_2 = cuda_args, cuda_args - elif mode is DataParallelCompareMode.dist_data_parallel: - - model_1, model_2 = ( - cuda_model, - torch.nn.parallel.DistributedDataParallel( - cuda_model, device_ids=[0], output_device=0 - ), - ) - args_1, args_2 = cuda_args, cuda_args - else: - raise AssertionError("DataParallel compare mode type is not valid.") - - attr_method_1: Attribution - attr_method_2: Attribution - if target_layer: - internal_algorithm = cast(Type[InternalAttribution], algorithm) - attr_method_1 = internal_algorithm( - model_1, get_target_layer(model_1, target_layer) - ) - # cuda_model is used to obtain target_layer since DataParallel - # adds additional wrapper. - # model_2 is always either the CUDA model itself or DataParallel - if alt_device_ids is None: - attr_method_2 = internal_algorithm( - model_2, get_target_layer(cuda_model, target_layer) - ) - else: - # LayerDeepLift and LayerDeepLiftShap do not take device ids - # as a parameter, since they must always have the DataParallel - # model object directly. 
- # Some neuron methods and GuidedGradCAM also require the - # model and cannot take a forward function. - if issubclass( - internal_algorithm, - ( - LayerDeepLift, - LayerDeepLiftShap, - LayerLRP, - NeuronDeepLift, - NeuronDeepLiftShap, - NeuronDeconvolution, - NeuronGuidedBackprop, - GuidedGradCam, - ), - ): - attr_method_2 = internal_algorithm( - model_2, - get_target_layer(cuda_model, target_layer), # type: ignore - ) - else: - attr_method_2 = internal_algorithm( - model_2.forward, - get_target_layer(cuda_model, target_layer), - device_ids=alt_device_ids, - ) - else: - attr_method_1 = algorithm(model_1) - attr_method_2 = algorithm(model_2) + # Set up test arguments based on DataParallelCompareMode + model_1, model_2, args_1, args_2, alt_device_ids = _get_dp_test_args( + cuda_model, model, cuda_args, args, mode + ) - if noise_tunnel: - attr_method_1 = NoiseTunnel(attr_method_1) - attr_method_2 = NoiseTunnel(attr_method_2) + # Construct attribution methods + attr_method_1, attr_method_2 = _get_dp_attr_methods( + algorithm, + target_layer, + model_1, + model_2, + cuda_model, + alt_device_ids, + noise_tunnel, + ) if attr_method_1.has_convergence_delta(): attributions_1, delta_1 = attr_method_1.attribute( return_convergence_delta=True, **args_1 @@ -266,6 +197,107 @@ def data_parallel_test_assert(self) -> None: return data_parallel_test_assert +def _get_dp_test_args( + cuda_model: Module, + model: Module, + cuda_args: Dict[str, Any], + args: Dict[str, Any], + mode: DataParallelCompareMode, +) -> Tuple[Module, Module, Dict[str, Any], Dict[str, Any], Optional[List[int]]]: + # Initialize models based on DataParallelCompareMode + alt_device_ids = None + if mode is DataParallelCompareMode.cpu_cuda: + model_1, model_2 = model, cuda_model + args_1, args_2 = args, cuda_args + elif mode is DataParallelCompareMode.data_parallel_default: + model_1, model_2 = ( + cuda_model, + torch.nn.parallel.DataParallel(cuda_model), + ) + args_1, args_2 = cuda_args, cuda_args + elif mode is DataParallelCompareMode.data_parallel_alt_dev_ids: + alt_device_ids = [0] + list(range(torch.cuda.device_count() - 1, 0, -1)) + model_1, model_2 = ( + cuda_model, + torch.nn.parallel.DataParallel(cuda_model, device_ids=alt_device_ids), + ) + args_1, args_2 = cuda_args, cuda_args + elif mode is DataParallelCompareMode.dist_data_parallel: + + model_1, model_2 = ( + cuda_model, + torch.nn.parallel.DistributedDataParallel( + cuda_model, device_ids=[0], output_device=0 + ), + ) + args_1, args_2 = cuda_args, cuda_args + else: + raise AssertionError("DataParallel compare mode type is not valid.") + + return model_1, model_2, args_1, args_2, alt_device_ids + + +def _get_dp_attr_methods( + algorithm: Type[Attribution], + target_layer: Optional[str], + model_1: Module, + model_2: Module, + cuda_model: Module, + alt_device_ids: Optional[List[int]], + noise_tunnel: bool, +) -> Tuple[Attribution, Attribution]: + attr_method_1: Attribution + attr_method_2: Attribution + if target_layer: + internal_algorithm = cast(Type[InternalAttribution], algorithm) + attr_method_1 = internal_algorithm( + model_1, get_target_layer(model_1, target_layer) + ) + # cuda_model is used to obtain target_layer since DataParallel + # adds additional wrapper. 
+ # model_2 is always either the CUDA model itself or DataParallel + if alt_device_ids is None: + attr_method_2 = internal_algorithm( + model_2, get_target_layer(cuda_model, target_layer) + ) + else: + # LayerDeepLift and LayerDeepLiftShap do not take device ids + # as a parameter, since they must always have the DataParallel + # model object directly. + # Some neuron methods and GuidedGradCAM also require the + # model and cannot take a forward function. + if issubclass( + internal_algorithm, + ( + LayerDeepLift, + LayerDeepLiftShap, + LayerLRP, + NeuronDeepLift, + NeuronDeepLiftShap, + NeuronDeconvolution, + NeuronGuidedBackprop, + GuidedGradCam, + ), + ): + attr_method_2 = internal_algorithm( + model_2, + get_target_layer(cuda_model, target_layer), # type: ignore + ) + else: + attr_method_2 = internal_algorithm( + model_2.forward, + get_target_layer(cuda_model, target_layer), + device_ids=alt_device_ids, + ) + else: + attr_method_1 = algorithm(model_1) + attr_method_2 = algorithm(model_2) + if noise_tunnel: + attr_method_1 = NoiseTunnel(attr_method_1) + attr_method_2 = NoiseTunnel(attr_method_2) + return attr_method_1, attr_method_2 + + if torch.cuda.is_available() and torch.cuda.device_count() != 0: class DataParallelTest(BaseTest, metaclass=DataParallelMeta): diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index 5e49b13c35..745946af71 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -10,4 +10,7 @@ ] except ImportError: - from tests.helpers.basic import BaseTest + # tests/helpers/__init__.py:13: error: Incompatible import of "BaseTest" + # (imported name has type "type[BaseTest]", local name has type + # "type[FbBaseTest]") [assignment] + from tests.helpers.basic import BaseTest # type: ignore diff --git a/tests/helpers/influence/common.py b/tests/helpers/influence/common.py index 9e7f5b5792..bedba76930 100644 --- a/tests/helpers/influence/common.py +++ b/tests/helpers/influence/common.py @@ -409,6 +409,7 @@ def get_random_model_and_data( in_features, out_features, num_samples, use_gpu, unpack_inputs ) + net: Union[BasicLinearNet, MultLinearNet, Linear, UnpackLinear] if model_type == "random": net = ( BasicLinearNet(in_features, hidden_nodes, out_features) diff --git a/tests/influence/_core/test_tracin_regression.py b/tests/influence/_core/test_tracin_regression.py index c70ba8449b..9609091698 100644 --- a/tests/influence/_core/test_tracin_regression.py +++ b/tests/influence/_core/test_tracin_regression.py @@ -31,7 +31,7 @@ class TestTracInRegression(BaseTest): def _test_tracin_regression_setup( self, tmpdir: str, features: int, use_gpu: bool = False - ) -> Tuple[RangeDataset, Dict[str, Any]]: + ) -> Tuple[RangeDataset, Dict[str, Any]]: # fixme (return type) low = 1 high = 17 dataset = RangeDataset(low, high, features, use_gpu) @@ -49,7 +49,7 @@ def _test_tracin_regression_setup( torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name)) # pyre-fixme[61]: `net_adjusted` is undefined, or not always defined. 
- return dataset, net_adjusted + return dataset, net_adjusted # type: ignore use_gpu_list = ( [True, False] diff --git a/tests/influence/_core/test_tracin_xor.py b/tests/influence/_core/test_tracin_xor.py index 83968bb909..a9ed3a389d 100644 --- a/tests/influence/_core/test_tracin_xor.py +++ b/tests/influence/_core/test_tracin_xor.py @@ -167,7 +167,7 @@ def _test_tracin_xor_setup( dataset = BinaryDataset(use_gpu) - return net_adjusted, dataset + return net_adjusted, dataset # type: ignore parametrized_list: List[ Tuple[Optional[str], DataInfluenceConstructor, str, bool] diff --git a/tests/module/test_binary_concrete_stochastic_gates.py b/tests/module/test_binary_concrete_stochastic_gates.py index 57cbba8edc..f4ada7b9ef 100644 --- a/tests/module/test_binary_concrete_stochastic_gates.py +++ b/tests/module/test_binary_concrete_stochastic_gates.py @@ -18,6 +18,9 @@ ] ) class TestBinaryConcreteStochasticGates(BaseTest): + # pyre-fixme[13]: Attribute `testing_device` is never initialized. + testing_device: str + def setUp(self) -> None: super().setUp() # pyre-fixme[16]: `TestBinaryConcreteStochasticGates` has no attribute diff --git a/tests/module/test_gaussian_stochastic_gates.py b/tests/module/test_gaussian_stochastic_gates.py index e6cb9b9140..58b90d6673 100644 --- a/tests/module/test_gaussian_stochastic_gates.py +++ b/tests/module/test_gaussian_stochastic_gates.py @@ -19,6 +19,9 @@ ] ) class TestGaussianStochasticGates(BaseTest): + # pyre-fixme[13]: Attribute `testing_device` is never initialized. + testing_device: str + def setUp(self) -> None: super().setUp() # pyre-fixme[16]: `TestGaussianStochasticGates` has no attribute