diff --git a/.circleci/config.yml b/.circleci/config.yml index 903e33d4b7..257ccfaf51 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,96 +2,681 @@ # See: https://circleci.com/docs/2.0/configuration-reference version: 2.1 -# Define a job to be invoked later in a workflow. -# See: https://circleci.com/docs/2.0/configuration-reference/#jobs -jobs: - build: - machine: - # Primary container image where all steps run. - # image: nvcr.io/nvidia/tensorrt:22.01-py3 # does not work with customized image - # https://circleci.com/docs/2.0/configuration-reference#available-linux-gpu-images - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.large +commands: + install-bazel: + description: "Install bazel" + parameters: + platform: + type: string + default: "x86_64" + version: + type: string + default: "5.1.1" steps: - - checkout - run: - name: install cudnn + tensorrt + bazel + name: Install bazel + command: | + sudo wget -q https://github.com/bazelbuild/bazel/releases/download/<< parameters.version >>/bazel-<< parameters.version >>-linux-<< parameters.platform >> -O /usr/bin/bazel + sudo chmod a+x /usr/bin/bazel + + install-cuda: + description: "Install CUDA" + parameters: + os: + type: string + default: "ubuntu2004" + platform: + type: string + default: "x86_64" + cuda-pkg-name: + type: string + default: "cuda-toolkit-11-4" + steps: + - run: + name: Install CUDA command: | cd ~ - OS=ubuntu2004 - CUDNN_VERSION=8.2.1.*-1+cuda11.3 - TRT_VERSION=8.2.4-1+cuda11.4 - BAZEL_VERSION=5.1.1 - - wget https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin - sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/7fa2af80.pub + + wget https://developer.download.nvidia.com/compute/cuda/repos/<< parameters.os >>/<< parameters.platform >>/cuda-<< parameters.os >>.pin + sudo mv cuda-<< parameters.os >>.pin /etc/apt/preferences.d/cuda-repository-pin-600 + + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/<< parameters.os >>/<< parameters.platform >>/3bf863cc.pub + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/<< parameters.os >>/<< parameters.platform >>/ /" + sudo apt-get update + + sudo apt-get install -y << parameters.cuda-pkg-name >> + - run: + when: on_fail + name: Dump apt sources + command: cat /etc/apt/sources.list + + create-env: + description: "Install dependencies for Torch-TensorRT" + parameters: + os: + type: string + default: "ubuntu2004" + platform: + type: string + default: "x86_64" + cudnn-version: + type: string + default: "8.2.1" + trt-version-short: + type: string + default: "8.2.4" + bazel-version: + type: string + default: "5.1.1" + bazel-platform: + type: string + default: "x86_64" + steps: + - run: + name: Install cudnn + tensorrt + command: | + cd ~ + + wget https://developer.download.nvidia.com/compute/cuda/repos/<< parameters.os >>/<< parameters.platform >>/cuda-<< parameters.os >>.pin + sudo mv cuda-<< parameters.os >>.pin /etc/apt/preferences.d/cuda-repository-pin-600 + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/<< parameters.os >>/<< parameters.platform >>/7fa2af80.pub sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 536F8F1DE80F6A35 sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC - sudo add-apt-repository "deb 
https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/ /" + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/<< parameters.os >>/<< parameters.platform >>/ /" sudo apt-get update - sudo apt-get install libcudnn8=${CUDNN_VERSION} - sudo apt-get install libcudnn8-dev=${CUDNN_VERSION} + sudo apt-get install libcudnn8=<< parameters.cudnn-version >>* + sudo apt-get install libcudnn8-dev=<< parameters.cudnn-version >>* - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/{OS}/x86_64/3bf863cc.pub - sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/ /" + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/<< parameters.os >>/<< parameters.platform >>/3bf863cc.pub + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/<< parameters.os >>/<< parameters.platform >>/ /" sudo apt-get update - - sudo apt-get install libnvinfer8=${TRT_VERSION} libnvonnxparsers8=${TRT_VERSION} libnvparsers8=${TRT_VERSION} libnvinfer-plugin8=${TRT_VERSION} libnvinfer-dev=${TRT_VERSION} libnvonnxparsers-dev=${TRT_VERSION} libnvparsers-dev=${TRT_VERSION} libnvinfer-plugin-dev=${TRT_VERSION} python3-libnvinfer=${TRT_VERSION} - # check available version, apt list libnvinfer8 -a - sudo wget -q https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-linux-x86_64 -O /usr/bin/bazel - sudo chmod a+x /usr/bin/bazel + sudo apt-get install libnvinfer8=<< parameters.trt-version-short >>* libnvinfer-plugin8=<< parameters.trt-version-short>>* libnvinfer-dev=<< parameters.trt-version-short>>* libnvinfer-plugin-dev=<< parameters.trt-version-short>>* + - install-bazel: + platform: << parameters.bazel-platform >> + version: << parameters.bazel-version >> + + create-py-env: + description: "Install python dependencies" + parameters: + trt-version-long: + type: string + default: "8.2.4.2" + steps: + - run: + name: Set up python environment + command: | + pip3 install --upgrade pip + pip3 install wheel setuptools + pip3 install nvidia-pyindex + pip3 install tabulate + pip3 install nvidia-tensorrt==<< parameters.trt-version-long >> + pip3 install pytest parameterized expecttest nox + # install torch_tensorrt + + install-torch-from-index: + description: "Install python dependencies" + parameters: + torch-build: + type: string + default: "1.11.0+cu113" + torch-build-index: + type: string + default: "https://download.pytorch.org/whl/cu113" + steps: - run: - name: set up python environment + name: Install Torch command: | - pip3 install nvidia-pyindex - pip3 install nvidia-tensorrt==8.2.4.2 - pip3 install --pre torch==1.13.0.dev20220621 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113 - pip3 install pytest parameterized expecttest - pip3 install tabulate - # install torch_tensorrt - mv WORKSPACE.ci WORKSPACE + pip3 install --upgrade pip + pip3 install --pre torch==<< parameters.torch-build >> torchvision torchaudio --extra-index-url << parameters.torch-build-index >> + + build-py: + description: "Build the torch-tensorrt python release (pre-cxx11-abi)" + parameters: + platform: + type: string + default: "x86_64" + steps: + - run: + name: Build torch-tensorrt python release (pre-cxx11-abi) + command: | + mv toolchains/ci_workspaces/WORKSPACE.<< parameters.platform >> WORKSPACE cd py + python3 -m pip install wheel setuptools + python3 -m pip install pybind11==2.6.2 + 
python3 setup.py bdist_wheel python3 setup.py install + mkdir -p /tmp/dist/builds + cp dist/* /tmp/dist/builds + + build-py-cxx11-abi: + description: "Build the torch-tensorrt python release (cxx11-abi)" + parameters: + platform: + type: string + default: "x86_64" + steps: + - run: + name: Build torch-tensorrt python release + command: | + mv toolchains/ci_workspaces/WORKSPACE.<< parameters.platform >> WORKSPACE + cd py + python3 -m pip install wheel setuptools + python3 -m pip install pybind11==2.6.2 + python3 setup.py bdist_wheel --use-cxx11-abi + python3 setup.py install --use-cxx11-abi + mkdir -p /tmp/dist/builds + cp dist/* /tmp/dist/builds + + build-py-fx-only: + description: "Build the torch-tensorrt python release with only the fx backend" + parameters: + platform: + type: string + default: "x86_64" + steps: + - run: + name: Build torch-tensorrt python release with only the fx backend + command: | + mv toolchains/ci_workspaces/WORKSPACE.<< parameters.platform >> WORKSPACE + cd py + python3 -m pip install wheel setuptools + python3 -m pip install pybind11==2.6.2 + python3 setup.py bdist_wheel --fx-only + python3 setup.py install --fx-only + mkdir -p /tmp/dist/builds + cp dist/* /tmp/dist/builds + + dump-test-env: + description: "Dump the test env to console" + steps: + - run: + name: GPU Config + command: | + nvidia-smi + + - run: + name: Test torch + command: | + python3 -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())" - # install fx2trt - # cd py/torch_tensorrt/fx/setup - # python3 setup.py install - run: - name: run fx2trt tests + name: Get torch-tensorrt version information command: | + python3 -c "import torch_tensorrt; torch_tensorrt.dump_build_info()" + + pull-test-models: + description: "Pull the test model set" + steps: + - run: + name: Pull test models + environment: + USE_HOST_DEPS: "1" + command: | + cd tests/modules + pip3 install -r requirements.txt + python3 hub.py + cd ~/project + + test-ts-core: + description: "Test torchscript backend c++ api" + parameters: + platform: + type: string + default: "x86_64" + steps: + - pull-test-models + - run: mkdir -p /tmp/artifacts + - run: + name: Run core / C++ tests + environment: + LD_LIBRARY_PATH: "/home/circleci/project/bazel-project/external/libtorch_pre_cxx11_abi/lib/:/home/circleci/project/bazel-project/external/tensorrt/lib/:/usr/local/cuda/lib64/:$LD_LIBRARY_PATH" + command: | + set -e + mv toolchains/ci_workspaces/WORKSPACE.<< parameters.platform >> WORKSPACE + bazel query 'kind(cc_*, tests(//tests))' --noshow_progress >> /tmp/test_manifest.txt + circleci tests split < /tmp/test_manifest.txt > /tmp/node_test_manifest.txt + bazel test $(cat /tmp/node_test_manifest.txt) --test_arg=--gtest_output=xml:/tmp/artifacts/test_results/ --jobs 4 --config ci_testing --config pre_cxx11_abi --noshow_progress + - run: + name: Collect logs + when: on_fail + command: | + mkdir -p /tmp/testlogs + cp -r bazel-testlogs /tmp/testlogs + sudo apt install tree + tree . 
> /tmp/testlogs/dir_structure.txt + + - store_test_results: + path: /tmp/artifacts + - store_artifacts: + path: /tmp/testlogs + + test-ts-py-api: + description: "Run L0 torch-tensorrt python tests" + steps: + - pull-test-models + - run: + name: Run L0 torch-tensorrt python tests + environment: + USE_HOST_DEPS: "1" + PYT_PATH: "/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages/" + LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu/:/usr/local/cuda/lib64/:$LD_LIBRARY_PATH" + command: | + set -e + mkdir -p /tmp/artifacts/test_results + cd tests/py + pip3 install -r requirements.txt + pytest --junitxml=/tmp/artifacts/test_results/api/api_test_results.xml api/ + pytest --junitxml=/tmp/artifacts/test_results/integrations/integrations_test_results.xml integrations/ + cd ~/project + + - store_test_results: + path: /tmp/artifacts + - store_artifacts: + path: /tmp/testlogs + + + test-fx: + description: "Test the fx backend" + steps: + - run: + name: Run fx tests + command: | + mkdir -p /tmp/artifacts/test_results # one fix pending to enable below # cd py/torch_tensorrt/fx/test # pytest $(find . -name '*.py' | grep -v test_dispatch* | grep -v test_setitem*) - cd py/torch_tensorrt/fx/test pushd converters/acc_op - pytest + pytest --junitxml=/tmp/artifacts/test_results/fx/converters/acc_op/test_results.xml popd pushd passes - list_passes=$(ls | grep -v test_setitem*) - pytest $list_passes + list_passes=$(ls | grep -v test_setitem*) + pytest $list_passes --junitxml=/tmp/artifacts/test_results/fx/passes/test_results.xml popd pushd core - pytest + pytest --junitxml=/tmp/artifacts/test_results/fx/core/test_results.xml popd # pushd quant - # pytest + # pytest --junitxml=/tmp/artifacts/test_results/fx/quant/test_results.xml # popd pushd tools - pytest + pytest --junitxml=/tmp/artifacts/test_results/fx/tools/test_results.xml popd pushd trt_lower - pytest + pytest --junitxml=/tmp/artifacts/test_results/fx/trt_lower/test_results.xml popd pushd tracer - list_tracer=$(ls | grep -v test_dispatch_*) - pytest $list_tracer + list_tracer=$(ls | grep -v test_dispatch_*) + pytest $list_tracer --junitxml=/tmp/artifacts/test_results/fx/tracer/test_results.xml popd + cd ~/project + - store_test_results: + path: /tmp/artifacts + - store_artifacts: + path: /tmp/testlogs + +# Define a job to be invoked later in a workflow. 
+# See: https://circleci.com/docs/2.0/configuration-reference/#jobs +jobs: + build-aarch64-pyt-jetson: + parameters: + torch-build: + type: string + jetpack-version: + type: string + cxx11-abi: + type: boolean + default: true + python-version: + type: string + default: 3.8.10 + machine: + image: ubuntu-2004:202201-02 + resource_class: arm.xlarge + steps: + - checkout + #- run: + # name: Upgrade base + # command: | + # sudo apt clean + # sudo apt update + # sudo apt upgrade + # sudo apt install software-properties-common + - install-cuda: + os: "ubuntu2004" + platform: "sbsa" + cuda-pkg-name: "cuda-toolkit-11-4" + - run: + name: Install openblas + command: sudo apt install libopenblas-dev + - create-env: + os: "ubuntu2004" + platform: "sbsa" + cudnn-version: << pipeline.parameters.cudnn-jetson-version >> + trt-version-short: << pipeline.parameters.trt-jetson-version-short >> + bazel-version: "5.1.1" + bazel-platform: "arm64" + - run: + name: Set python version + command: | + pyenv install << parameters.python-version >> + pyenv global << parameters.python-version >> + - run: + name: Install NGC Torch + environment: + TORCH_INSTALL: https://developer.download.nvidia.com/compute/redist/jp/v<< parameters.jetpack-version >>/pytorch/<< parameters.torch-build >> + command: | + set -e + python3 -m pip install --upgrade pip; python3 -m pip install setuptools wheel; python3 -m pip install expecttest xmlrunner hypothesis aiohttp numpy=='1.19.4' pyyaml scipy=='1.5.3' ninja cython typing_extensions protobuf; export "LD_LIBRARY_PATH=/usr/lib/llvm-8/lib:$LD_LIBRARY_PATH"; python3 -m pip install --upgrade protobuf; python3 -m pip install --no-cache $TORCH_INSTALL + - when: + condition: << parameters.cxx11-abi >> + steps: + - build-py-cxx11-abi: + platform: "sbsa" + - unless: + condition: << parameters.cxx11-abi >> + steps: + - build-py: + platform: "sbsa" + - run: + name: Move to release dir + command: | + mkdir -p /tmp/dist/jetson + cp -r /tmp/dist/builds/* /tmp/dist/jetson + - persist_to_workspace: + root: /tmp/dist + paths: + - jetson + - store_artifacts: + path: /tmp/dist/jetson + destination: aarch64-pyt-jetson + + build-x86_64-pyt-release: + parameters: + torch-build: + type: string + torch-build-index: + type: string + cxx11-abi: + type: boolean + default: false + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + resource_class: xlarge + steps: + - checkout + - create-env: + os: "ubuntu2004" + platform: "x86_64" + cudnn-version: << pipeline.parameters.cudnn-release-version >> + trt-version-short: << pipeline.parameters.trt-release-version-short >> + bazel-version: "5.1.1" + bazel-platform: "x86_64" + - install-torch-from-index: + torch-build: << parameters.torch-build >> + torch-build-index: << parameters.torch-build-index >> + - when: + condition: << parameters.cxx11-abi >> + steps: + - build-py-cxx11-abi + - unless: + condition: << parameters.cxx11-abi >> + steps: + - build-py + - run: + name: Move to release dir + command: | + mkdir -p /tmp/dist/release + cp -r /tmp/dist/builds/* /tmp/dist/release + - persist_to_workspace: + root: /tmp/dist + paths: + - release + - store_artifacts: + path: /tmp/dist/release + destination: x86_64-pyt-release + + build-x86_64-pyt-nightly: + parameters: + torch-build: + type: string + torch-build-index: + type: string + cxx11-abi: + type: boolean + default: false + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + resource_class: xlarge + steps: + - checkout + - create-env: + os: "ubuntu2004" + platform: "x86_64" + cudnn-version: << 
pipeline.parameters.cudnn-nightly-version >> + trt-version-short: << pipeline.parameters.trt-nightly-version-short >> + bazel-version: "5.1.1" + bazel-platform: "x86_64" + - install-torch-from-index: + torch-build: << parameters.torch-build >> + torch-build-index: << parameters.torch-build-index >> + - when: + condition: << parameters.cxx11-abi >> + steps: + - build-py-cxx11-abi + - unless: + condition: << parameters.cxx11-abi >> + steps: + - build-py + - run: + name: Move to nightly dir + command: | + mkdir -p /tmp/dist/nightly + cp -r /tmp/dist/builds/* /tmp/dist/nightly + - persist_to_workspace: + root: /tmp/dist + paths: + - nightly + - store_artifacts: + path: /tmp/dist/nightly + destination: x86_64-pyt-nightly + + test-core-cpp-x86_64-pyt-release: + parameters: + torch-build: + type: string + torch-build-index: + type: string + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + resource_class: gpu.nvidia.large + parallelism: 4 + steps: + - checkout + - create-env: + os: "ubuntu2004" + platform: "x86_64" + cudnn-version: << pipeline.parameters.cudnn-release-version >> + trt-version-short: << pipeline.parameters.trt-release-version-short >> + bazel-version: "5.1.1" + bazel-platform: "x86_64" + - create-py-env: + trt-version-long: << pipeline.parameters.trt-release-version-long >> + - install-torch-from-index: + torch-build: << parameters.torch-build >> + torch-build-index: << parameters.torch-build-index >> + - attach_workspace: + at: /tmp/dist + - run: + name: "Install torch-tensorrt" + command: pip3 install /tmp/dist/release/* + - dump-test-env + - test-ts-core + + test-ts-py-x86_64-pyt-release: + parameters: + torch-build: + type: string + torch-build-index: + type: string + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + resource_class: gpu.nvidia.large + steps: + - checkout + - create-py-env: + trt-version-long: << pipeline.parameters.trt-release-version-long >> + - install-torch-from-index: + torch-build: << parameters.torch-build >> + torch-build-index: << parameters.torch-build-index >> + - attach_workspace: + at: /tmp/dist + - run: + name: "Install torch-tensorrt" + command: pip3 install /tmp/dist/release/* + - dump-test-env + - test-ts-py-api + + test-x86_64-pyt-nightly: + parameters: + torch-build: + type: string + torch-build-index: + type: string + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + resource_class: gpu.nvidia.large + steps: + - checkout + - create-py-env: + trt-version-long: << pipeline.parameters.trt-nightly-version-long >> + - attach_workspace: + at: /tmp/dist/ + - run: + name: "Install torch-tensorrt" + command: pip3 install /tmp/dist/nightly/* + # We install torch after torch-trt because pip automatically enforces the version constraint otherwise, swap back after versions are synced + - install-torch-from-index: + torch-build: << parameters.torch-build >> + torch-build-index: << parameters.torch-build-index >> + - dump-test-env + - test-fx + +parameters: + # Nightly platform config + torch-nightly-build: + type: string + default: "1.13.0.dev20220715+cu113" + torch-nightly-build-index: + type: string + default: "https://download.pytorch.org/whl/nightly/cu113" + cudnn-nightly-version: + type: string + default: "8.2.1" + trt-nightly-version-short: + type: string + default: "8.2.4" + trt-nightly-version-long: + type: string + default: "8.2.4.2" + + # Release platform config + torch-release-build: + type: string + default: "1.11.0+cu113" + torch-release-build-index: + type: string + default: "https://download.pytorch.org/whl/cu113" + 
cudnn-release-version: + type: string + default: "8.2.1" + trt-release-version-short: + type: string + default: "8.2.4" + trt-release-version-long: + type: string + default: "8.2.4.2" + + # Jetson platform config + torch-jetson-build: + type: string + default: "torch-1.12.0a0+84d1cb9.nv22.4-cp38-cp38-linux_aarch64.whl" + jetpack-version: + type: string + default: "50" + cudnn-jetson-version: + type: string + default: "8.3.2" + trt-jetson-version-short: + type: string + default: "8.4.1" + trt-jetson-version-long: + type: string + default: "8.4.1.5" + # Invoke jobs via workflows # See: https://circleci.com/docs/2.0/configuration-reference/#workflows workflows: - build_run: + nightly: + triggers: + - schedule: + cron: "0 0 * * *" + filters: + branches: + only: + - master + jobs: + - build-aarch64-pyt-jetson: + torch-build: << pipeline.parameters.torch-jetson-build >> + jetpack-version: << pipeline.parameters.jetpack-version >> + python-version: 3.8.10 + + - build-x86_64-pyt-release: + torch-build: << pipeline.parameters.torch-release-build >> + torch-build-index: << pipeline.parameters.torch-release-build-index >> + - test-core-cpp-x86_64-pyt-release: + torch-build: << pipeline.parameters.torch-release-build >> + torch-build-index: << pipeline.parameters.torch-release-build-index >> + requires: + - build-x86_64-pyt-release + + - build-x86_64-pyt-nightly: + torch-build: << pipeline.parameters.torch-nightly-build >> + torch-build-index: << pipeline.parameters.torch-nightly-build-index >> + - test-x86_64-pyt-nightly: + torch-build: << pipeline.parameters.torch-nightly-build >> + torch-build-index: << pipeline.parameters.torch-nightly-build-index >> + requires: + - build-x86_64-pyt-nightly + + on-push: jobs: - - build + - build-aarch64-pyt-jetson: + torch-build: << pipeline.parameters.torch-jetson-build >> + jetpack-version: << pipeline.parameters.jetpack-version >> + python-version: 3.8.10 + + - build-x86_64-pyt-release: + torch-build: << pipeline.parameters.torch-release-build >> + torch-build-index: << pipeline.parameters.torch-release-build-index >> + - test-core-cpp-x86_64-pyt-release: + torch-build: << pipeline.parameters.torch-release-build >> + torch-build-index: << pipeline.parameters.torch-release-build-index >> + requires: + - build-x86_64-pyt-release + - test-ts-py-x86_64-pyt-release: + torch-build: << pipeline.parameters.torch-release-build >> + torch-build-index: << pipeline.parameters.torch-release-build-index >> + requires: + - build-x86_64-pyt-release + + - build-x86_64-pyt-nightly: + torch-build: << pipeline.parameters.torch-nightly-build >> + torch-build-index: << pipeline.parameters.torch-nightly-build-index >> + - test-x86_64-pyt-nightly: + torch-build: << pipeline.parameters.torch-nightly-build >> + torch-build-index: << pipeline.parameters.torch-nightly-build-index >> + requires: + - build-x86_64-pyt-nightly + diff --git a/.gitignore b/.gitignore index 9571f39288..dd940571ad 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,6 @@ bazel-Torch-TensorRT-Preview docsrc/src/ bazel-TensorRT bazel-tensorrt +.pytest_cache +*.cache +*cifar-10-batches-py* \ No newline at end of file diff --git a/examples/int8/training/vgg16/requirements.txt b/examples/int8/training/vgg16/requirements.txt index ed1268164a..dcb184324b 100644 --- a/examples/int8/training/vgg16/requirements.txt +++ b/examples/int8/training/vgg16/requirements.txt @@ -1,3 +1,6 @@ torch>=1.10.0 tensorboard>=1.14.0 -pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com +nvidia-pyindex +--extra-index-url 
https://pypi.ngc.nvidia.com +pytorch-quantization>=2.1.2 +tqdm \ No newline at end of file diff --git a/noxfile.py b/noxfile.py index 99084b8194..01a3f9bc08 100644 --- a/noxfile.py +++ b/noxfile.py @@ -5,16 +5,22 @@ # Use system installed Python packages PYT_PATH='/opt/conda/lib/python3.8/site-packages' if not 'PYT_PATH' in os.environ else os.environ["PYT_PATH"] +print(f"Using python path {PYT_PATH}") # Set the root directory to the directory of the noxfile unless the user wants to # TOP_DIR TOP_DIR=os.path.dirname(os.path.realpath(__file__)) if not 'TOP_DIR' in os.environ else os.environ["TOP_DIR"] +print(f"Test root directory {TOP_DIR}") # Set the USE_CXX11=1 to use cxx11_abi USE_CXX11=0 if not 'USE_CXX11' in os.environ else os.environ["USE_CXX11"] +if USE_CXX11: + print("Using cxx11 abi") # Set the USE_HOST_DEPS=1 to use host dependencies for tests USE_HOST_DEPS=0 if not 'USE_HOST_DEPS' in os.environ else os.environ["USE_HOST_DEPS"] +if USE_HOST_DEPS: + print("Using dependencies from host python") SUPPORTED_PYTHON_VERSIONS=["3.7", "3.8", "3.9", "3.10"] @@ -58,6 +64,12 @@ def download_datasets(session): def train_model(session): session.chdir(os.path.join(TOP_DIR, 'examples/int8/training/vgg16')) + session.install("-r", "requirements.txt") + if os.path.exists('vgg16_ckpts/ckpt_epoch25.pth'): + session.run_always('python', + 'export_ckpt.py', + 'vgg16_ckpts/ckpt_epoch25.pth') + return if USE_HOST_DEPS: session.run_always('python', 'main.py', @@ -140,14 +152,14 @@ def run_base_tests(session): print("Running basic tests") session.chdir(os.path.join(TOP_DIR, 'tests/py')) tests = [ - "test_api.py", - "test_to_backend_api.py", + "api", + "integrations/test_to_backend_api.py", ] for test in tests: if USE_HOST_DEPS: - session.run_always('python', test, env={'PYTHONPATH': PYT_PATH}) + session.run_always('pytest', test, env={'PYTHONPATH': PYT_PATH}) else: - session.run_always("python", test) + session.run_always("pytest", test) def run_accuracy_tests(session): print("Running accuracy tests") @@ -169,7 +181,7 @@ def copy_model(session): session.run_always('cp', '-rpf', os.path.join(TOP_DIR, src_file), - os.path.join(TOP_DIR, str('tests/py/') + file_name), + os.path.join(TOP_DIR, str('tests/modules/') + file_name), external=True) def run_int8_accuracy_tests(session): @@ -177,15 +189,15 @@ def run_int8_accuracy_tests(session): copy_model(session) session.chdir(os.path.join(TOP_DIR, 'tests/py')) tests = [ - "test_ptq_dataloader_calibrator.py", - "test_ptq_to_backend.py", - "test_qat_trt_accuracy.py", + "ptq/test_ptq_to_backend.py", + "ptq/test_ptq_dataloader_calibrator.py", + "qat/", ] for test in tests: if USE_HOST_DEPS: - session.run_always('python', test, env={'PYTHONPATH': PYT_PATH}) + session.run_always('pytest', test, env={'PYTHONPATH': PYT_PATH}) else: - session.run_always("python", test) + session.run_always("pytest", test) def run_trt_compatibility_tests(session): print("Running TensorRT compatibility tests") @@ -197,9 +209,9 @@ def run_trt_compatibility_tests(session): ] for test in tests: if USE_HOST_DEPS: - session.run_always('python', test, env={'PYTHONPATH': PYT_PATH}) + session.run_always('pytest', test, env={'PYTHONPATH': PYT_PATH}) else: - session.run_always("python", test) + session.run_always("pytest", test) def run_dla_tests(session): print("Running DLA tests") @@ -209,9 +221,9 @@ def run_dla_tests(session): ] for test in tests: if USE_HOST_DEPS: - session.run_always('python', test, env={'PYTHONPATH': PYT_PATH}) + session.run_always('pytest', test, env={'PYTHONPATH': PYT_PATH}) 
else: - session.run_always("python", test) + session.run_always("pytest", test) def run_multi_gpu_tests(session): print("Running multi GPU tests") @@ -221,9 +233,9 @@ def run_multi_gpu_tests(session): ] for test in tests: if USE_HOST_DEPS: - session.run_always('python', test, env={'PYTHONPATH': PYT_PATH}) + session.run_always('pytest', test, env={'PYTHONPATH': PYT_PATH}) else: - session.run_always("python", test) + session.run_always("pytest", test) def run_l0_api_tests(session): if not USE_HOST_DEPS: @@ -245,7 +257,6 @@ def run_l1_accuracy_tests(session): if not USE_HOST_DEPS: install_deps(session) install_torch_trt(session) - download_models(session) download_datasets(session) train_model(session) run_accuracy_tests(session) @@ -255,7 +266,6 @@ def run_l1_int8_accuracy_tests(session): if not USE_HOST_DEPS: install_deps(session) install_torch_trt(session) - download_models(session) download_datasets(session) train_model(session) finetune_model(session) @@ -313,4 +323,8 @@ def l2_multi_gpu_tests(session): @nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True) def download_test_models(session): """Grab all the models needed for testing""" + try: + import torch + except ModuleNotFoundError: + install_deps(session) download_models(session) diff --git a/py/requirements.txt b/py/requirements.txt index 8d12c108aa..fce4b91dca 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -1,5 +1,3 @@ --f https://download.pytorch.org/whl/torch_stable.html --f https://download.pytorch.org/whl/torch/ --extra-index-url https://download.pytorch.org/whl/cu113 -torch==1.11.0+cu113 +torch==1.11.0 pybind11==2.6.2 diff --git a/py/torch_tensorrt/_util.py b/py/torch_tensorrt/_util.py index ba260f8958..94a58cfcc5 100644 --- a/py/torch_tensorrt/_util.py +++ b/py/torch_tensorrt/_util.py @@ -1,6 +1,8 @@ from torch_tensorrt import __version__ from torch_tensorrt import _C +import torch + def dump_build_info(): """Prints build information about the torch_tensorrt distribution to stdout @@ -15,7 +17,9 @@ def get_build_info() -> str: str: String containing the build information for torch_tensorrt distribution """ build_info = _C.get_build_info() - build_info = "Torch-TensorRT Version: " + str(__version__) + '\n' + build_info + build_info = "Torch-TensorRT Version: " + str(__version__) + '\n' \ + + "Using PyTorch Version: " + str(torch.__version__) + '\n' \ + + build_info return build_info diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 57764494e9..48e6b519cb 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -80,7 +80,7 @@ "model": timm.create_model('vit_base_patch16_224', pretrained=True), "path": "script" }, - "pool": { + "pooling": { "model": cm.Pool(), "path": "trace" }, @@ -104,7 +104,7 @@ "model": cm.FallbackInplaceOPIf(), "path": "script" }, - "bert-base-uncased": { + "bert_base_uncased": { "model": cm.BertModule(), "path": "trace" } diff --git a/tests/modules/requirements.txt b/tests/modules/requirements.txt index b1a922e034..d4b5105850 100644 --- a/tests/modules/requirements.txt +++ b/tests/modules/requirements.txt @@ -1,4 +1,2 @@ --f https://download.pytorch.org/whl/torch_stable.html -#torch==1.11.0+cu113 timm==v0.4.12 transformers==4.17.0 diff --git a/tests/py/api/test_classes.py b/tests/py/api/test_classes.py new file mode 100644 index 0000000000..d21c12a750 --- /dev/null +++ b/tests/py/api/test_classes.py @@ -0,0 +1,190 @@ +import unittest +import torch_tensorrt as torchtrt +import torch +import torchvision.models as models +import copy +from typing import Dict + 
+class TestDevice(unittest.TestCase): + + def test_from_string_constructor(self): + device = torchtrt.Device("cuda:0") + self.assertEqual(device.device_type, torchtrt.DeviceType.GPU) + self.assertEqual(device.gpu_id, 0) + + device = torchtrt.Device("gpu:1") + self.assertEqual(device.device_type, torchtrt.DeviceType.GPU) + self.assertEqual(device.gpu_id, 1) + + def test_from_string_constructor_dla(self): + device = torchtrt.Device("dla:0") + self.assertEqual(device.device_type, torchtrt.DeviceType.DLA) + self.assertEqual(device.gpu_id, 0) + self.assertEqual(device.dla_core, 0) + + device = torchtrt.Device("dla:1", allow_gpu_fallback=True) + self.assertEqual(device.device_type, torchtrt.DeviceType.DLA) + self.assertEqual(device.gpu_id, 0) + self.assertEqual(device.dla_core, 1) + self.assertEqual(device.allow_gpu_fallback, True) + + def test_kwargs_gpu(self): + device = torchtrt.Device(gpu_id=0) + self.assertEqual(device.device_type, torchtrt.DeviceType.GPU) + self.assertEqual(device.gpu_id, 0) + + def test_kwargs_dla_and_settings(self): + device = torchtrt.Device(dla_core=1, allow_gpu_fallback=False) + self.assertEqual(device.device_type, torchtrt.DeviceType.DLA) + self.assertEqual(device.gpu_id, 0) + self.assertEqual(device.dla_core, 1) + self.assertEqual(device.allow_gpu_fallback, False) + + device = torchtrt.Device(gpu_id=1, dla_core=0, allow_gpu_fallback=True) + self.assertEqual(device.device_type, torchtrt.DeviceType.DLA) + self.assertEqual(device.gpu_id, 1) + self.assertEqual(device.dla_core, 0) + self.assertEqual(device.allow_gpu_fallback, True) + + def test_from_torch(self): + device = torchtrt.Device._from_torch_device(torch.device("cuda:0")) + self.assertEqual(device.device_type, torchtrt.DeviceType.GPU) + self.assertEqual(device.gpu_id, 0) + + +class TestInput(unittest.TestCase): + + def _verify_correctness(self, struct: torchtrt.Input, target: Dict) -> bool: + internal = struct._to_internal() + + list_eq = lambda al, bl: all([a == b for (a, b) in zip(al, bl)]) + + eq = lambda a, b: a == b + + def field_is_correct(field, equal_fn, a1, a2): + equal = equal_fn(a1, a2) + if not equal: + print("\nField {} is incorrect: {} != {}".format(field, a1, a2)) + return equal + + min_ = field_is_correct("min", list_eq, internal.min, target["min"]) + opt_ = field_is_correct("opt", list_eq, internal.opt, target["opt"]) + max_ = field_is_correct("max", list_eq, internal.max, target["max"]) + is_dynamic_ = field_is_correct("is_dynamic", eq, internal.input_is_dynamic, target["input_is_dynamic"]) + explicit_set_dtype_ = field_is_correct("explicit_dtype", eq, internal._explicit_set_dtype, + target["explicit_set_dtype"]) + dtype_ = field_is_correct("dtype", eq, int(internal.dtype), int(target["dtype"])) + format_ = field_is_correct("format", eq, int(internal.format), int(target["format"])) + + return all([min_, opt_, max_, is_dynamic_, explicit_set_dtype_, dtype_, format_]) + + def test_infer_from_example_tensor(self): + shape = [1, 3, 255, 255] + target = { + "min": shape, + "opt": shape, + "max": shape, + "input_is_dynamic": False, + "dtype": torchtrt.dtype.half, + "format": torchtrt.TensorFormat.contiguous, + "explicit_set_dtype": True + } + + example_tensor = torch.randn(shape).half() + i = torchtrt.Input._from_tensor(example_tensor) + self.assertTrue(self._verify_correctness(i, target)) + + def test_static_shape(self): + shape = [1, 3, 255, 255] + target = { + "min": shape, + "opt": shape, + "max": shape, + "input_is_dynamic": False, + "dtype": torchtrt.dtype.unknown, + "format": 
torchtrt.TensorFormat.contiguous, + "explicit_set_dtype": False + } + + i = torchtrt.Input(shape) + self.assertTrue(self._verify_correctness(i, target)) + + i = torchtrt.Input(tuple(shape)) + self.assertTrue(self._verify_correctness(i, target)) + + i = torchtrt.Input(torch.randn(shape).shape) + self.assertTrue(self._verify_correctness(i, target)) + + i = torchtrt.Input(shape=shape) + self.assertTrue(self._verify_correctness(i, target)) + + i = torchtrt.Input(shape=tuple(shape)) + self.assertTrue(self._verify_correctness(i, target)) + + i = torchtrt.Input(shape=torch.randn(shape).shape) + self.assertTrue(self._verify_correctness(i, target)) + + def test_data_type(self): + shape = [1, 3, 255, 255] + target = { + "min": shape, + "opt": shape, + "max": shape, + "input_is_dynamic": False, + "dtype": torchtrt.dtype.half, + "format": torchtrt.TensorFormat.contiguous, + "explicit_set_dtype": True + } + + i = torchtrt.Input(shape, dtype=torchtrt.dtype.half) + self.assertTrue(self._verify_correctness(i, target)) + + i = torchtrt.Input(shape, dtype=torch.half) + self.assertTrue(self._verify_correctness(i, target)) + + def test_tensor_format(self): + shape = [1, 3, 255, 255] + target = { + "min": shape, + "opt": shape, + "max": shape, + "input_is_dynamic": False, + "dtype": torchtrt.dtype.unknown, + "format": torchtrt.TensorFormat.channels_last, + "explicit_set_dtype": False + } + + i = torchtrt.Input(shape, format=torchtrt.TensorFormat.channels_last) + self.assertTrue(self._verify_correctness(i, target)) + + i = torchtrt.Input(shape, format=torch.channels_last) + self.assertTrue(self._verify_correctness(i, target)) + + def test_dynamic_shape(self): + min_shape = [1, 3, 128, 128] + opt_shape = [1, 3, 256, 256] + max_shape = [1, 3, 512, 512] + target = { + "min": min_shape, + "opt": opt_shape, + "max": max_shape, + "input_is_dynamic": True, + "dtype": torchtrt.dtype.unknown, + "format": torchtrt.TensorFormat.contiguous, + "explicit_set_dtype": False + } + + i = torchtrt.Input(min_shape=min_shape, opt_shape=opt_shape, max_shape=max_shape) + self.assertTrue(self._verify_correctness(i, target)) + + i = torchtrt.Input(min_shape=tuple(min_shape), opt_shape=tuple(opt_shape), max_shape=tuple(max_shape)) + self.assertTrue(self._verify_correctness(i, target)) + + tensor_shape = lambda shape: torch.randn(shape).shape + i = torchtrt.Input(min_shape=tensor_shape(min_shape), + opt_shape=tensor_shape(opt_shape), + max_shape=tensor_shape(max_shape)) + self.assertTrue(self._verify_correctness(i, target)) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/py/api/test_e2e_behavior.py b/tests/py/api/test_e2e_behavior.py new file mode 100644 index 0000000000..0229d28cce --- /dev/null +++ b/tests/py/api/test_e2e_behavior.py @@ -0,0 +1,190 @@ +import unittest +import torch_tensorrt as torchtrt +import torch +import torchvision.models as models +import copy +from typing import Dict + +class TestCompileHalf(unittest.TestCase): + + def test_compile_script_half(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.scripted_model = torch.jit.script(self.model) + self.scripted_model.half() + + compile_spec = { + "inputs": [torchtrt.Input(shape=self.input.shape, dtype=torch.half)], + "device": { + "device_type": torchtrt.DeviceType.GPU, + "gpu_id": 0, + }, + "enabled_precisions": {torch.half} + } + + trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec) + same = (trt_mod(self.input.half()) - 
self.scripted_model(self.input.half())).abs().max() + torchtrt.logging.log(torchtrt.logging.Level.Debug, "Max diff: " + str(same)) + self.assertTrue(same < 3e-2) + + def test_compile_script_half_by_default(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.scripted_model = torch.jit.script(self.model) + self.scripted_model.half() + + compile_spec = { + "inputs": [torchtrt.Input(shape=self.input.shape)], + "device": { + "device_type": torchtrt.DeviceType.GPU, + "gpu_id": 0, + }, + "enabled_precisions": {torch.float, torch.half} + } + + trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec) + same = (trt_mod(self.input.half()) - self.scripted_model(self.input.half())).abs().max() + torchtrt.logging.log(torchtrt.logging.Level.Debug, "Max diff: " + str(same)) + self.assertTrue(same < 3e-2) + + +class TestFallbackToTorch(unittest.TestCase): + + def test_fallback(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.scripted_model = torch.jit.script(self.model) + + compile_spec = { + "inputs": [torchtrt.Input(self.input.shape)], + "device": { + "device_type": torchtrt.DeviceType.GPU, + "gpu_id": 0, + "allow_gpu_fallback": False, + "disable_tf32": False + }, + "require_full_compilation": False, + "torch_executed_ops": ["aten::max_pool2d"], + "min_block_size": 1 + } + + trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec) + same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max() + self.assertTrue(same < 2e-3) + + def test_module_fallback(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.scripted_model = torch.jit.script(self.model) + + compile_spec = { + "inputs": [torchtrt.Input(self.input.shape)], + "device": { + "device_type": torchtrt.DeviceType.GPU, + "gpu_id": 0, + "allow_gpu_fallback": False, + "disable_tf32": False + }, + "require_full_compilation": False, + "torch_executed_modules": ["torchvision.models.resnet.BasicBlock"], + "min_block_size": 1 + } + + trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec) + same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max() + self.assertTrue(same < 2e-3) + +class TestInputTypeDefaultsFP32Model(unittest.TestCase): + + def test_input_use_default_fp32(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + + ts_model = torch.jit.script(self.model) + trt_mod = torchtrt.ts.compile(ts_model, + inputs=[torchtrt.Input(self.input.shape)], + enabled_precisions={torch.float, torch.half}) + trt_mod(self.input) + + def test_input_respect_user_setting_fp32_weights_fp16_in(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + + ts_model = torch.jit.script(self.model) + trt_mod = torchtrt.ts.compile(ts_model, + inputs=[self.input.half()], + require_full_compilation=True, + enabled_precisions={torch.float, torch.half}) + trt_mod(self.input.half()) + + def test_input_respect_user_setting_fp32_weights_fp16_in_non_constructor(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + + ts_model = torch.jit.script(self.model) + input_spec = torchtrt.Input(self.input.shape) + input_spec.dtype = torch.half + + trt_mod = 
torchtrt.ts.compile(ts_model, + inputs=[input_spec], + require_full_compilation=True, + enabled_precisions={torch.float, torch.half}) + trt_mod(self.input.half()) + + +class TestInputTypeDefaultsFP16Model(unittest.TestCase): + + def test_input_use_default_fp16(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + + half_mod = torch.jit.script(self.model) + half_mod.half() + + trt_mod = torchtrt.ts.compile(half_mod, + inputs=[torchtrt.Input(self.input.shape)], + enabled_precisions={torch.float, torch.half}) + trt_mod(self.input.half()) + + def test_input_use_default_fp16_without_fp16_enabled(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + + half_mod = torch.jit.script(self.model) + half_mod.half() + + trt_mod = torchtrt.ts.compile(half_mod, inputs=[torchtrt.Input(self.input.shape)]) + trt_mod(self.input.half()) + + def test_input_respect_user_setting_fp16_weights_fp32_in(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + + half_mod = torch.jit.script(self.model) + half_mod.half() + + trt_mod = torchtrt.ts.compile(half_mod, + inputs=[self.input], + require_full_compilation=True, + enabled_precisions={torch.float, torch.half}) + trt_mod(self.input) + + def test_input_respect_user_setting_fp16_weights_fp32_in_non_constuctor(self): + self.model = models.resnet18(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + + half_mod = torch.jit.script(self.model) + half_mod.half() + + input_spec = torchtrt.Input(self.input.shape) + input_spec.dtype = torch.float + + trt_mod = torchtrt.ts.compile(half_mod, + inputs=[input_spec], + require_full_compilation=True, + enabled_precisions={torch.float, torch.half}) + trt_mod(self.input) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/py/api/test_logging.py b/tests/py/api/test_logging.py new file mode 100644 index 0000000000..81d8478c8d --- /dev/null +++ b/tests/py/api/test_logging.py @@ -0,0 +1,72 @@ +import unittest +import torch_tensorrt as torchtrt +import torch +import torchvision.models as models +import copy +from typing import Dict + +class TestLoggingAPIs(unittest.TestCase): + + def test_logging_prefix(self): + new_prefix = "Python API Test: " + torchtrt.logging.set_logging_prefix(new_prefix) + logging_prefix = torchtrt.logging.get_logging_prefix() + self.assertEqual(new_prefix, logging_prefix) + + def test_reportable_log_level(self): + new_level = torchtrt.logging.Level.Error + torchtrt.logging.set_reportable_log_level(new_level) + level = torchtrt.logging.get_reportable_log_level() + self.assertEqual(new_level, level) + + def test_is_colored_output_on(self): + torchtrt.logging.set_is_colored_output_on(True) + color = torchtrt.logging.get_is_colored_output_on() + self.assertTrue(color) + + def test_context_managers(self): + base_lvl = torchtrt.logging.get_reportable_log_level() + with torchtrt.logging.internal_errors(): + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(torchtrt.logging.Level.InternalError, lvl) + + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(base_lvl, lvl) + + with torchtrt.logging.errors(): + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(torchtrt.logging.Level.Error, lvl) + + lvl = torchtrt.logging.get_reportable_log_level() + 
self.assertEqual(base_lvl, lvl) + + with torchtrt.logging.warnings(): + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(torchtrt.logging.Level.Warning, lvl) + + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(base_lvl, lvl) + + with torchtrt.logging.info(): + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(torchtrt.logging.Level.Info, lvl) + + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(base_lvl, lvl) + + with torchtrt.logging.debug(): + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(torchtrt.logging.Level.Debug, lvl) + + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(base_lvl, lvl) + + with torchtrt.logging.graphs(): + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(torchtrt.logging.Level.Graph, lvl) + + lvl = torchtrt.logging.get_reportable_log_level() + self.assertEqual(base_lvl, lvl) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/py/api/test_ts_backend.py b/tests/py/api/test_ts_backend.py new file mode 100644 index 0000000000..3ab1604f90 --- /dev/null +++ b/tests/py/api/test_ts_backend.py @@ -0,0 +1,159 @@ +import unittest +import torch_tensorrt as torchtrt +import torch +import torchvision.models as models +import copy +from typing import Dict + +class TestCompile(unittest.TestCase): + + def test_compile_traced(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.traced_model = torch.jit.trace(self.model, [self.input]) + + compile_spec = { + "inputs": [torchtrt.Input(self.input.shape, dtype=torch.float, format=torch.contiguous_format)], + "device": { + "device_type": torchtrt.DeviceType.GPU, + "gpu_id": 0, + }, + "enabled_precisions": {torch.float} + } + + trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec) + same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max() + self.assertTrue(same < 2e-2) + + def test_compile_script(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.scripted_model = torch.jit.script(self.model) + with torch.no_grad(): + trt_mod = torchtrt.ts.compile(self.scripted_model, + inputs=[self.input], + device=torchtrt.Device(gpu_id=0), + enabled_precisions={torch.float}) + same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max() + self.assertTrue(same < 2e-2) + + def test_compile_global(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.scripted_model = torch.jit.script(self.model) + trt_mod = torchtrt.compile(self.scripted_model, + inputs=[self.input], + device=torchtrt.Device(gpu_id=0), + enabled_precisions={torch.float}) + same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max() + self.assertTrue(same < 2e-2) + + def test_compile_global_nn_mod(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + with torch.no_grad(): + trt_mod = torchtrt.compile(self.model, + inputs=[self.input], + device=torchtrt.Device(gpu_id=0), + enabled_precisions={torch.float}) + same = (trt_mod(self.input) - self.model(self.input)).abs().max() + self.assertTrue(same < 2e-2) + + def test_from_torch_tensor(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 
224)).to("cuda") + self.traced_model = torch.jit.trace(self.model, [self.input]) + compile_spec = { + "inputs": [self.input], + "device": { + "device_type": torchtrt.DeviceType.GPU, + "gpu_id": 0, + }, + "enabled_precisions": {torch.float} + } + + trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec) + same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max() + self.assertTrue(same < 2e-2) + + def test_device(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.traced_model = torch.jit.trace(self.model, [self.input]) + compile_spec = {"inputs": [self.input], "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}} + + trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec) + same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max() + self.assertTrue(same < 2e-2) + + def test_default_device(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.traced_model = torch.jit.trace(self.model, [self.input]) + compile_spec = {"inputs": [self.input], "enabled_precisions": {torch.float}} + + trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec) + same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max() + self.assertTrue(same < 2e-2) + + def test_compile_script_from_dict(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.traced_model = torch.jit.trace(self.model, [self.input]) + compile_spec = { + "inputs": [torchtrt.Input(shape=self.input.shape)], + "device": { + "device_type": torchtrt.DeviceType.GPU, + "gpu_id": 0, + }, + "enabled_precisions": {torch.float} + } + + trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec) + same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max() + self.assertTrue(same < 2e-2) + +class TestPTtoTRTtoPT(unittest.TestCase): + + def test_pt_to_trt_to_pt(self): + self.model = models.vgg16(pretrained=True).eval().to("cuda") + self.input = torch.randn((1, 3, 224, 224)).to("cuda") + self.ts_model = torch.jit.trace(self.model, [self.input]) + + compile_spec = { + "inputs": [torchtrt.Input(self.input.shape)], + "device": { + "device_type": torchtrt.DeviceType.GPU, + "gpu_id": 0, + "allow_gpu_fallback": False, + "disable_tf32": False + } + } + + trt_engine = torchtrt.ts.convert_method_to_trt_engine(self.ts_model, "forward", **compile_spec) + trt_mod = torchtrt.ts.embed_engine_in_new_module(trt_engine, torchtrt.Device("cuda:0")) + same = (trt_mod(self.input) - self.ts_model(self.input)).abs().max() + self.assertTrue(same < 2e-3) + +class TestCheckMethodOpSupport(unittest.TestCase): + + def test_check_support(self): + module = models.alexnet(pretrained=True).eval().to("cuda") + self.module = torch.jit.trace(module, torch.ones((1, 3, 224, 224)).to("cuda")) + + self.assertTrue(torchtrt.ts.check_method_op_support(self.module, "forward")) + + +class TestModuleIdentification(unittest.TestCase): + + def test_module_type(self): + nn_module = models.alexnet(pretrained=True).eval().to("cuda") + ts_module = torch.jit.trace(nn_module, torch.ones([1, 3, 224, 224]).to("cuda")) + fx_module = torch.fx.symbolic_trace(nn_module) + + self.assertEqual(torchtrt._compile._parse_module_type(nn_module), torchtrt._compile._ModuleType.nn) + self.assertEqual(torchtrt._compile._parse_module_type(ts_module), torchtrt._compile._ModuleType.ts) + 
self.assertEqual(torchtrt._compile._parse_module_type(fx_module), torchtrt._compile._ModuleType.fx) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/py/test_api_dla.py b/tests/py/hw/test_api_dla.py similarity index 100% rename from tests/py/test_api_dla.py rename to tests/py/hw/test_api_dla.py diff --git a/tests/py/test_multi_gpu.py b/tests/py/hw/test_multi_gpu.py similarity index 100% rename from tests/py/test_multi_gpu.py rename to tests/py/hw/test_multi_gpu.py diff --git a/tests/py/test_to_backend_api.py b/tests/py/integrations/test_to_backend_api.py similarity index 77% rename from tests/py/test_to_backend_api.py rename to tests/py/integrations/test_to_backend_api.py index 11c411ff56..1607e029f2 100644 --- a/tests/py/test_to_backend_api.py +++ b/tests/py/integrations/test_to_backend_api.py @@ -3,13 +3,12 @@ import torch import torchvision.models as models -from model_test_case import ModelTestCase - -class TestToBackendLowering(ModelTestCase): +class TestToBackendLowering(unittest.TestCase): def setUp(self): self.input = torch.randn((1, 3, 300, 300)).to("cuda") + self.model = models.resnet18(pretrained=True).eval().to("cuda") self.scripted_model = torch.jit.script(self.model) self.spec = { "forward": @@ -37,17 +36,5 @@ def test_to_backend_lowering(self): same = (trt_mod.forward(self.input) - self.scripted_model(self.input)).abs().max() self.assertTrue(same < 2e-3) - -def test_suite(): - suite = unittest.TestSuite() - suite.addTest(TestToBackendLowering.parametrize(TestToBackendLowering, model=models.resnet18(pretrained=True))) - - return suite - - -suite = test_suite() - -runner = unittest.TextTestRunner() -result = runner.run(suite) - -exit(int(not result.wasSuccessful())) +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/py/test_trt_intercompatibility.py b/tests/py/integrations/test_trt_intercompatibility.py similarity index 77% rename from tests/py/test_trt_intercompatibility.py rename to tests/py/integrations/test_trt_intercompatibility.py index e1d614a200..742040022a 100644 --- a/tests/py/test_trt_intercompatibility.py +++ b/tests/py/integrations/test_trt_intercompatibility.py @@ -4,16 +4,13 @@ import torchvision.models as models import tensorrt as trt -from model_test_case import ModelTestCase +class TestPyTorchToTRTEngine(unittest.TestCase): -class TestPyTorchToTRTEngine(ModelTestCase): - - def setUp(self): + def test_pt_to_trt(self): + self.model=models.resnet18(pretrained=True).eval().to("cuda:0") self.input = torch.randn((1, 3, 224, 224)).to("cuda:0") self.ts_model = torch.jit.script(self.model) - - def test_pt_to_trt(self): compile_spec = { "inputs": [torchtrt.Input(self.input.shape)], "truncate_long_and_double": True, @@ -40,17 +37,5 @@ def test_pt_to_trt(self): same = (out - self.ts_model(self.input)).abs().max() self.assertTrue(same < 2e-3) - -def test_suite(): - suite = unittest.TestSuite() - suite.addTest(TestPyTorchToTRTEngine.parametrize(TestPyTorchToTRTEngine, model=models.resnet18(pretrained=True))) - - return suite - - -suite = test_suite() - -runner = unittest.TextTestRunner() -result = runner.run(suite) - -exit(int(not result.wasSuccessful())) +if __name__ == "__main__": + unittest.main() diff --git a/tests/py/test_ptq_dataloader_calibrator.py b/tests/py/ptq/test_ptq_dataloader_calibrator.py similarity index 58% rename from tests/py/test_ptq_dataloader_calibrator.py rename to tests/py/ptq/test_ptq_dataloader_calibrator.py index 158a5425e8..66c4b7ff15 100644 --- 
a/tests/py/test_ptq_dataloader_calibrator.py +++ b/tests/py/ptq/test_ptq_dataloader_calibrator.py @@ -6,12 +6,51 @@ from torch.nn import functional as F import torchvision import torchvision.transforms as transforms -from model_test_case import ModelTestCase +import os + +def find_repo_root(max_depth=10): + dir_path = os.path.dirname(os.path.realpath(__file__)) + for i in range(max_depth): + files = os.listdir(dir_path) + if "WORKSPACE" in files: + return dir_path + else: + dir_path = os.path.dirname(dir_path) + + raise RuntimeError("Could not find repo root") + +MODULE_DIR = find_repo_root() + "/tests/modules" + +def compute_accuracy(testing_dataloader, model): + total = 0 + correct = 0 + loss = 0.0 + class_probs = [] + class_preds = [] + device = torch.device('cuda:0') + with torch.no_grad(): + idx = 0 + for data, labels in testing_dataloader: + data, labels = data.to(device), labels.to(device) + out = model(data) + preds = torch.max(out, 1)[1] + class_probs.append([F.softmax(i, dim=0) for i in out]) + class_preds.append(preds) + total += labels.size(0) + correct += (preds == labels).sum().item() + idx += 1 + + test_probs = torch.cat([torch.stack(batch) for batch in class_probs]) + test_preds = torch.cat(class_preds) + return correct / total + + +class TestAccuracy(unittest.TestCase): -class TestAccuracy(ModelTestCase): + def test_compile_script(self): - def setUp(self): + self.model = torch.jit.load(MODULE_DIR + "/trained_vgg16.jit.pt").eval().to("cuda") self.input = torch.randn((1, 3, 32, 32)).to("cuda") self.testing_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, @@ -33,32 +72,7 @@ def setUp(self): algo_type=torchtrt.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2, device=torch.device('cuda:0')) - def compute_accuracy(self, testing_dataloader, model): - total = 0 - correct = 0 - loss = 0.0 - class_probs = [] - class_preds = [] - device = torch.device('cuda:0') - with torch.no_grad(): - idx = 0 - for data, labels in testing_dataloader: - data, labels = data.to(device), labels.to(device) - out = model(data) - preds = torch.max(out, 1)[1] - class_probs.append([F.softmax(i, dim=0) for i in out]) - class_preds.append(preds) - total += labels.size(0) - correct += (preds == labels).sum().item() - idx += 1 - - test_probs = torch.cat([torch.stack(batch) for batch in class_probs]) - test_preds = torch.cat(class_preds) - return correct / total - - def test_compile_script(self): - - fp32_test_acc = self.compute_accuracy(self.testing_dataloader, self.model) + fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model) log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc)) compile_spec = { @@ -75,24 +89,11 @@ def test_compile_script(self): } trt_mod = torchtrt.ts.compile(self.model, **compile_spec) - int8_test_acc = self.compute_accuracy(self.testing_dataloader, trt_mod) + int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod) log(Level.Info, "[TRT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc)) acc_diff = fp32_test_acc - int8_test_acc self.assertTrue(abs(acc_diff) < 3) -def test_suite(): - suite = unittest.TestSuite() - # You need a pre-trained VGG cifar10 model to run this test. Please follow instructions at - # https://github.com/NVIDIA/torchtrt/tree/master/cpp/ptq/training/vgg16 to export this model. 
diff --git a/tests/py/test_ptq_to_backend.py b/tests/py/ptq/test_ptq_to_backend.py
similarity index 62%
rename from tests/py/test_ptq_to_backend.py
rename to tests/py/ptq/test_ptq_to_backend.py
index 297e7d672a..627208960d 100644
--- a/tests/py/test_ptq_to_backend.py
+++ b/tests/py/ptq/test_ptq_to_backend.py
@@ -6,12 +6,49 @@
 from torch.nn import functional as F
 import torchvision
 import torchvision.transforms as transforms
-from model_test_case import ModelTestCase
+import os
+
+def find_repo_root(max_depth=10):
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    for i in range(max_depth):
+        files = os.listdir(dir_path)
+        if "WORKSPACE" in files:
+            return dir_path
+        else:
+            dir_path = os.path.dirname(dir_path)
+
+    raise RuntimeError("Could not find repo root")
+
+MODULE_DIR = find_repo_root() + "/tests/modules"
+
+def compute_accuracy(testing_dataloader, model):
+    total = 0
+    correct = 0
+    loss = 0.0
+    class_probs = []
+    class_preds = []
+    device = torch.device('cuda:0')
+    with torch.no_grad():
+        idx = 0
+        for data, labels in testing_dataloader:
+            data, labels = data.to(device), labels.to(device)
+            out = model(data)
+            preds = torch.max(out, 1)[1]
+            class_probs.append([F.softmax(i, dim=0) for i in out])
+            class_preds.append(preds)
+            total += labels.size(0)
+            correct += (preds == labels).sum().item()
+            idx += 1
+
+    test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
+    test_preds = torch.cat(class_preds)
+    return correct / total
+
+
+class TestAccuracy(unittest.TestCase):
 
-
-class TestAccuracy(ModelTestCase):
-
-    def setUp(self):
+    def test_compile_script(self):
+        self.model = torch.jit.load(MODULE_DIR + "/trained_vgg16.jit.pt").eval().to("cuda")
         self.input = torch.randn((1, 3, 32, 32)).to("cuda")
         self.testing_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                             train=False,
@@ -50,53 +87,16 @@ def setUp(self):
             })
         }
 
-    def compute_accuracy(self, testing_dataloader, model):
-        total = 0
-        correct = 0
-        loss = 0.0
-        class_probs = []
-        class_preds = []
-
-        with torch.no_grad():
-            idx = 0
-            for data, labels in testing_dataloader:
-                data, labels = data.cuda(), labels.cuda(non_blocking=True)
-                out = model(data)
-                preds = torch.max(out, 1)[1]
-                class_probs.append([F.softmax(i, dim=0) for i in out])
-                class_preds.append(preds)
-                total += labels.size(0)
-                correct += (preds == labels).sum().item()
-                idx += 1
-
-        test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
-        test_preds = torch.cat(class_preds)
-        return correct / total
-
-    def test_compile_script(self):
-        fp32_test_acc = self.compute_accuracy(self.testing_dataloader, self.model)
+        fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model)
         log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc))
 
         trt_mod = torch._C._jit_to_backend("tensorrt", self.model, self.spec)
-        int8_test_acc = self.compute_accuracy(self.testing_dataloader, trt_mod)
+        int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod)
         log(Level.Info, "[TRT INT8 Backend] Test Acc: {:.2f}%".format(100 * int8_test_acc))
         acc_diff = fp32_test_acc - int8_test_acc
         self.assertTrue(abs(acc_diff) < 3)
 
-def test_suite():
-    suite = unittest.TestSuite()
-    # You need a pre-trained VGG cifar10 model to run this test. Please follow instructions at
-    # https://github.com/NVIDIA/torchtrt/tree/master/cpp/ptq/training/vgg16 to export this model.
-    suite.addTest(TestAccuracy.parametrize(TestAccuracy, model=torch.jit.load('./trained_vgg16.jit.pt')))
-
-    return suite
-
-
-suite = test_suite()
-
-runner = unittest.TextTestRunner()
-result = runner.run(suite)
-
-exit(int(not result.wasSuccessful()))
+if __name__ == "__main__":
+    unittest.main()
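The to_backend path above hands a `{method_name: TensorRTCompileSpec}` dict to `torch._C._jit_to_backend`. A hedged sketch of that call shape (the spec fields are illustrative; the test's full spec is elided by the hunk context):

```python
import torch
import torch_tensorrt as torchtrt
import torchvision.models as models

scripted = torch.jit.script(models.resnet18(pretrained=True).eval().to("cuda"))

# The backend consumes one TensorRTCompileSpec per TorchScript method.
spec = {
    "forward":
        torchtrt.ts.TensorRTCompileSpec({
            "inputs": [torchtrt.Input([1, 3, 224, 224])],
            "enabled_precisions": {torch.float},
            "device": {
                "device_type": torchtrt.DeviceType.GPU,
                "gpu_id": 0,
                "dla_core": 0,
                "allow_gpu_fallback": True,
            },
        })
}

trt_backend_mod = torch._C._jit_to_backend("tensorrt", scripted, spec)
out = trt_backend_mod.forward(torch.randn(1, 3, 224, 224).to("cuda"))
```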
diff --git a/tests/py/test_ptq_trt_calibrator.py b/tests/py/ptq/test_ptq_trt_calibrator.py
similarity index 69%
rename from tests/py/test_ptq_trt_calibrator.py
rename to tests/py/ptq/test_ptq_trt_calibrator.py
index 7d9d3fa000..33431e4055 100644
--- a/tests/py/test_ptq_trt_calibrator.py
+++ b/tests/py/ptq/test_ptq_trt_calibrator.py
@@ -8,7 +8,42 @@
 from torch.nn import functional as F
 import torchvision
 import torchvision.transforms as transforms
-from model_test_case import ModelTestCase
+
+def find_repo_root(max_depth=10):
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    for i in range(max_depth):
+        files = os.listdir(dir_path)
+        if "WORKSPACE" in files:
+            return dir_path
+        else:
+            dir_path = os.path.dirname(dir_path)
+
+    raise RuntimeError("Could not find repo root")
+
+MODULE_DIR = find_repo_root() + "/tests/modules"
+
+def compute_accuracy(testing_dataloader, model):
+    total = 0
+    correct = 0
+    loss = 0.0
+    class_probs = []
+    class_preds = []
+    device = torch.device('cuda:0')
+    with torch.no_grad():
+        idx = 0
+        for data, labels in testing_dataloader:
+            data, labels = data.to(device), labels.to(device)
+            out = model(data)
+            preds = torch.max(out, 1)[1]
+            class_probs.append([F.softmax(i, dim=0) for i in out])
+            class_preds.append(preds)
+            total += labels.size(0)
+            correct += (preds == labels).sum().item()
+            idx += 1
+
+    test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
+    test_preds = torch.cat(class_preds)
+    return correct / total
 
 
 class TRTEntropyCalibrator(trt.IInt8EntropyCalibrator2):
@@ -54,9 +89,10 @@ def write_calibration_cache(self, cache):
             f.write(cache)
 
 
-class TestAccuracy(ModelTestCase):
+class TestAccuracy(unittest.TestCase):
 
-    def setUp(self):
+    def test_compile_script(self):
+        self.model = torch.jit.load(MODULE_DIR + "/trained_vgg16.jit.pt").eval().to("cuda")
         self.input = torch.randn((1, 3, 32, 32)).to("cuda")
         self.testing_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                             train=False,
@@ -74,32 +110,7 @@ def setUp(self):
         # Test cases can assume using GPU id: 0
         self.calibrator = TRTEntropyCalibrator(self.testing_dataloader)
 
-    def compute_accuracy(self, testing_dataloader, model):
-        total = 0
-        correct = 0
-        loss = 0.0
-        class_probs = []
-        class_preds = []
-        device = torch.device('cuda:0')
-        with torch.no_grad():
-            idx = 0
-            for data, labels in testing_dataloader:
-                data, labels = data.to(device), labels.to(device)
-                out = model(data)
-                preds = torch.max(out, 1)[1]
-                class_probs.append([F.softmax(i, dim=0) for i in out])
-                class_preds.append(preds)
-                total += labels.size(0)
-                correct += (preds == labels).sum().item()
-                idx += 1
-
-        test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
-        test_preds = torch.cat(class_preds)
-        return correct / total
-
-    def test_compile_script(self):
-
-        fp32_test_acc = self.compute_accuracy(self.testing_dataloader, self.model)
+        fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model)
         log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc))
 
         compile_spec = {
@@ -116,24 +127,12 @@ def test_compile_script(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
-        int8_test_acc = self.compute_accuracy(self.testing_dataloader, trt_mod)
+        int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod)
         log(Level.Info, "[TRT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc))
         acc_diff = fp32_test_acc - int8_test_acc
         self.assertTrue(abs(acc_diff) < 3)
 
-def test_suite():
-    suite = unittest.TestSuite()
-    # You need a pre-trained VGG cifar10 model to run this test. Please follow instructions at
-    # https://github.com/NVIDIA/torchtrt/tree/master/cpp/ptq/training/vgg16 to export this model.
-    suite.addTest(TestAccuracy.parametrize(TestAccuracy, model=torch.jit.load('./trained_vgg16.jit.pt')))
-
-    return suite
-
-
-suite = test_suite()
-
-runner = unittest.TextTestRunner()
-result = runner.run(suite)
-exit(int(not result.wasSuccessful()))
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
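TRTEntropyCalibrator above implements TensorRT's `IInt8EntropyCalibrator2` interface by hand rather than using torch_tensorrt's helper. A trimmed sketch of that interface (the buffer handling is illustrative, not the test's exact code):

```python
import tensorrt as trt
import torch

class SketchEntropyCalibrator(trt.IInt8EntropyCalibrator2):
    """Illustrative IInt8EntropyCalibrator2 implementation."""

    def __init__(self, dataloader, cache_file="./calibration.cache"):
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.cache_file = cache_file
        self.batches = iter(dataloader)

    def get_batch_size(self):
        return 1

    def get_batch(self, names):
        try:
            data, _ = next(self.batches)
            # Keep a reference so the device buffer outlives this call.
            self.current = data.to("cuda:0")
            return [self.current.data_ptr()]
        except StopIteration:
            return None  # tells TensorRT that calibration data is exhausted

    def read_calibration_cache(self):
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        with open(self.cache_file, "wb") as f:
            f.write(cache)
```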
diff --git a/tests/py/test_qat_trt_accuracy.py b/tests/py/qat/test_qat_trt_accuracy.py
similarity index 52%
rename from tests/py/test_qat_trt_accuracy.py
rename to tests/py/qat/test_qat_trt_accuracy.py
index 74fb70b3df..3086896f8c 100644
--- a/tests/py/test_qat_trt_accuracy.py
+++ b/tests/py/qat/test_qat_trt_accuracy.py
@@ -6,14 +6,52 @@
 from torch.nn import functional as F
 import torchvision
 import torchvision.transforms as transforms
-from model_test_case import ModelTestCase
+import os
+import sys
+
+def find_repo_root(max_depth=10):
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    for i in range(max_depth):
+        files = os.listdir(dir_path)
+        if "WORKSPACE" in files:
+            return dir_path
+        else:
+            dir_path = os.path.dirname(dir_path)
+
+    raise RuntimeError("Could not find repo root")
+
+MODULE_DIR = find_repo_root() + "/tests/modules"
 
 set_reportable_log_level(Level.Graph)
 
-class TestAccuracy(ModelTestCase):
+def compute_accuracy(testing_dataloader, model):
+    total = 0
+    correct = 0
+    loss = 0.0
+    class_probs = []
+    class_preds = []
+    device = torch.device('cuda:0')
+    with torch.no_grad():
+        idx = 0
+        for data, labels in testing_dataloader:
+            data, labels = data.to(device), labels.to(device)
+            out = model(data)
+            preds = torch.max(out, 1)[1]
+            class_probs.append([F.softmax(i, dim=0) for i in out])
+            class_preds.append(preds)
+            total += labels.size(0)
+            correct += (preds == labels).sum().item()
+            idx += 1
+
+    test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
+    test_preds = torch.cat(class_preds)
+    return correct / total
+
+class TestAccuracy(unittest.TestCase):
 
-    def setUp(self):
+    def test_compile_script(self):
+        self.model = torch.jit.load(MODULE_DIR + "/trained_vgg16_qat.jit.pt").eval().to("cuda")
         self.testing_dataset = torchvision.datasets.CIFAR10(root='./data',
                                                             train=False,
                                                             download=True,
@@ -28,31 +66,7 @@ def setUp(self):
                                                           shuffle=False,
                                                           num_workers=1)
 
-    def compute_accuracy(self, testing_dataloader, model):
-        total = 0
-        correct = 0
-        loss = 0.0
-        class_probs = []
-        class_preds = []
-        device = torch.device('cuda:0')
-        with torch.no_grad():
-            idx = 0
-            for data, labels in testing_dataloader:
-                data, labels = data.to(device), labels.to(device)
-                out = model(data)
-                preds = torch.max(out, 1)[1]
-                class_probs.append([F.softmax(i, dim=0) for i in out])
-                class_preds.append(preds)
-                total += labels.size(0)
-                correct += (preds == labels).sum().item()
-                idx += 1
-
-        test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
-        test_preds = torch.cat(class_preds)
-        return correct / total
-
-    def test_compile_script(self):
-        fp32_test_acc = self.compute_accuracy(self.testing_dataloader, self.model)
+        fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model)
         log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc))
 
         compile_spec = {
@@ -62,24 +76,10 @@ def test_compile_script(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
-        int8_test_acc = self.compute_accuracy(self.testing_dataloader, trt_mod)
+        int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod)
         log(Level.Info, "[TRT QAT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc))
         acc_diff = fp32_test_acc - int8_test_acc
         self.assertTrue(abs(acc_diff) < 3)
 
-
-def test_suite():
-    suite = unittest.TestSuite()
-    # You need a VGG QAT model trained on CIFAR10 to run this test. Please follow instructions at
-    # https://github.com/NVIDIA/torchtrt/tree/master/examples/int8/training/vgg16 to export this model.
-    suite.addTest(TestAccuracy.parametrize(TestAccuracy, model=torch.jit.load('./trained_vgg16_qat.jit.pt')))
-
-    return suite
-
-
-suite = test_suite()
-
-runner = unittest.TextTestRunner()
-result = runner.run(suite)
-
-exit(int(not result.wasSuccessful()))
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
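Unlike the PTQ tests, the QAT test needs no calibrator: a QAT-trained module already carries its quantization scales in Q/DQ nodes. A sketch of that compile path, assuming the trained_vgg16_qat.jit.pt module from tests/modules (spec fields are illustrative, since the test's full spec is elided by the hunk):

```python
import torch
import torch_tensorrt as torchtrt

# QAT module exported per examples/int8/training/vgg16; no calibrator required.
qat_model = torch.jit.load("tests/modules/trained_vgg16_qat.jit.pt").eval().to("cuda")

compile_spec = {
    "inputs": [torchtrt.Input([16, 3, 32, 32])],
    "enabled_precisions": {torch.int8},
}
trt_mod = torchtrt.ts.compile(qat_model, **compile_spec)
```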
diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt
index 91e97eed3e..784cca7d8d 100644
--- a/tests/py/requirements.txt
+++ b/tests/py/requirements.txt
@@ -1,2 +1,3 @@
-torchvision==0.12.0+cu113
--f https://download.pytorch.org/whl/torch_stable.html
+torchvision==0.12.0
+--extra-index-url https://download.pytorch.org/whl/cu113
+pytest
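With pytest added to the test requirements, the relocated suites can be driven either by unittest through the new `__main__` blocks or programmatically. A sketch (paths assume the new tests/py layout; a GPU and, for the ptq/qat suites, the trained VGG16 modules under tests/modules are still required):

```python
import pytest

# Run the relocated suites; equivalent to invoking pytest on these directories.
exit_code = pytest.main(["tests/py/integrations", "tests/py/ptq", "-q"])
```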
diff --git a/tests/py/test_api.py b/tests/py/test_api.py
deleted file mode 100644
index 987e49e1f6..0000000000
--- a/tests/py/test_api.py
+++ /dev/null
@@ -1,596 +0,0 @@
-import unittest
-import torch_tensorrt as torchtrt
-import torch
-import torchvision.models as models
-import copy
-from typing import Dict
-
-from model_test_case import ModelTestCase
-
-
-class TestCompile(ModelTestCase):
-
-    def setUp(self):
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.traced_model = torch.jit.trace(self.model, [self.input])
-        self.scripted_model = torch.jit.script(self.model)
-
-    def test_compile_traced(self):
-        compile_spec = {
-            "inputs": [torchtrt.Input(self.input.shape, dtype=torch.float, format=torch.contiguous_format)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-            },
-            "enabled_precisions": {torch.float}
-        }
-
-        trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_compile_script(self):
-        with torch.no_grad():
-            trt_mod = torchtrt.ts.compile(self.scripted_model,
-                                          inputs=[self.input],
-                                          device=torchtrt.Device(gpu_id=0),
-                                          enabled_precisions={torch.float})
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_compile_global(self):
-        trt_mod = torchtrt.compile(self.scripted_model,
-                                   inputs=[self.input],
-                                   device=torchtrt.Device(gpu_id=0),
-                                   enabled_precisions={torch.float})
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_compile_global_nn_mod(self):
-        with torch.no_grad():
-            trt_mod = torchtrt.compile(self.model,
-                                       inputs=[self.input],
-                                       device=torchtrt.Device(gpu_id=0),
-                                       enabled_precisions={torch.float})
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_from_torch_tensor(self):
-        compile_spec = {
-            "inputs": [self.input],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-            },
-            "enabled_precisions": {torch.float}
-        }
-
-        trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_device(self):
-        compile_spec = {"inputs": [self.input], "device": torchtrt.Device("gpu:0"), "enabled_precisions": {torch.float}}
-
-        trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_default_device(self):
-        compile_spec = {"inputs": [self.input], "enabled_precisions": {torch.float}}
-
-        trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_compile_script_from_dict(self):
-        compile_spec = {
-            "inputs": [torchtrt.Input(shape=self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-            },
-            "enabled_precisions": {torch.float}
-        }
-
-        trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-
-class TestCompileHalf(ModelTestCase):
-
-    def setUp(self):
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.scripted_model = torch.jit.script(self.model)
-        self.scripted_model.half()
-
-    def test_compile_script_half(self):
-        compile_spec = {
-            "inputs": [torchtrt.Input(shape=self.input.shape, dtype=torch.half)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-            },
-            "enabled_precisions": {torch.half}
-        }
-
-        trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input.half()) - self.scripted_model(self.input.half())).abs().max()
-        torchtrt.logging.log(torchtrt.logging.Level.Debug, "Max diff: " + str(same))
-        self.assertTrue(same < 3e-2)
-
-
-class TestCompileHalfDefault(ModelTestCase):
-
-    def setUp(self):
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.scripted_model = torch.jit.script(self.model)
-        self.scripted_model.half()
-
-    def test_compile_script_half_by_default(self):
-        compile_spec = {
-            "inputs": [torchtrt.Input(shape=self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-            },
-            "enabled_precisions": {torch.float, torch.half}
-        }
-
-        trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input.half()) - self.scripted_model(self.input.half())).abs().max()
-        torchtrt.logging.log(torchtrt.logging.Level.Debug, "Max diff: " + str(same))
-        self.assertTrue(same < 3e-2)
-
-
-class TestFallbackToTorch(ModelTestCase):
-
-    def setUp(self):
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.scripted_model = torch.jit.script(self.model)
-
-    def test_compile_script(self):
-        compile_spec = {
-            "inputs": [torchtrt.Input(self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-                "allow_gpu_fallback": False,
-                "disable_tf32": False
-            },
-            "require_full_compilation": False,
-            "torch_executed_ops": ["aten::max_pool2d"],
-            "min_block_size": 1
-        }
-
-        trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-3)
-
-
-class TestModuleFallbackToTorch(ModelTestCase):
-
-    def setUp(self):
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.scripted_model = torch.jit.script(self.model)
-
-    def test_compile_script(self):
-        compile_spec = {
-            "inputs": [torchtrt.Input(self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-                "allow_gpu_fallback": False,
-                "disable_tf32": False
-            },
-            "require_full_compilation": False,
-            "torch_executed_modules": ["torchvision.models.resnet.BasicBlock"],
-            "min_block_size": 1
-        }
-
-        trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-3)
-
-
-class TestPTtoTRTtoPT(ModelTestCase):
-
-    def setUp(self):
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.ts_model = torch.jit.trace(self.model, [self.input])
-
-    def test_pt_to_trt_to_pt(self):
-        compile_spec = {
-            "inputs": [torchtrt.Input(self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-                "allow_gpu_fallback": False,
-                "disable_tf32": False
-            }
-        }
-
-        trt_engine = torchtrt.ts.convert_method_to_trt_engine(self.ts_model, "forward", **compile_spec)
-        trt_mod = torchtrt.ts.embed_engine_in_new_module(trt_engine, torchtrt.Device("cuda:0"))
-        same = (trt_mod(self.input) - self.ts_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-3)
-
-
-class TestInputTypeDefaultsFP32Model(ModelTestCase):
-
-    def setUp(self):
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-
-    def test_input_use_default_fp32(self):
-        ts_model = torch.jit.script(self.model)
-        trt_mod = torchtrt.ts.compile(ts_model,
-                                      inputs=[torchtrt.Input(self.input.shape)],
-                                      enabled_precisions={torch.float, torch.half})
-        trt_mod(self.input)
-
-    def test_input_respect_user_setting_fp32_weights_fp16_in(self):
-        ts_model = torch.jit.script(self.model)
-        trt_mod = torchtrt.ts.compile(ts_model,
-                                      inputs=[self.input.half()],
-                                      require_full_compilation=True,
-                                      enabled_precisions={torch.float, torch.half})
-        trt_mod(self.input.half())
-
-    def test_input_respect_user_setting_fp32_weights_fp16_in_non_constructor(self):
-        ts_model = torch.jit.script(self.model)
-        input_spec = torchtrt.Input(self.input.shape)
-        input_spec.dtype = torch.half
-
-        trt_mod = torchtrt.ts.compile(ts_model,
-                                      inputs=[input_spec],
-                                      require_full_compilation=True,
-                                      enabled_precisions={torch.float, torch.half})
-        trt_mod(self.input.half())
-
-
-class TestInputTypeDefaultsFP16Model(ModelTestCase):
-
-    def setUp(self):
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-
-    def test_input_use_default_fp16(self):
-        half_mod = torch.jit.script(self.model)
-        half_mod.half()
-
-        trt_mod = torchtrt.ts.compile(half_mod,
-                                      inputs=[torchtrt.Input(self.input.shape)],
-                                      enabled_precisions={torch.float, torch.half})
-        trt_mod(self.input.half())
-
-    def test_input_use_default_fp16_without_fp16_enabled(self):
-        half_mod = torch.jit.script(self.model)
-        half_mod.half()
-
-        trt_mod = torchtrt.ts.compile(half_mod, inputs=[torchtrt.Input(self.input.shape)])
-        trt_mod(self.input.half())
-
-    def test_input_respect_user_setting_fp16_weights_fp32_in(self):
-        half_mod = torch.jit.script(self.model)
-        half_mod.half()
-
-        trt_mod = torchtrt.ts.compile(half_mod,
-                                      inputs=[self.input],
-                                      require_full_compilation=True,
-                                      enabled_precisions={torch.float, torch.half})
-        trt_mod(self.input)
-
-    def test_input_respect_user_setting_fp16_weights_fp32_in_non_constuctor(self):
-        half_mod = torch.jit.script(self.model)
-        half_mod.half()
-
-        input_spec = torchtrt.Input(self.input.shape)
-        input_spec.dtype = torch.float
-
-        trt_mod = torchtrt.ts.compile(half_mod,
-                                      inputs=[input_spec],
-                                      require_full_compilation=True,
-                                      enabled_precisions={torch.float, torch.half})
-        trt_mod(self.input)
-
-
-class TestCheckMethodOpSupport(unittest.TestCase):
-
-    def setUp(self):
-        module = models.alexnet(pretrained=True).eval().to("cuda")
-        self.module = torch.jit.trace(module, torch.ones((1, 3, 224, 224)).to("cuda"))
-
-    def test_check_support(self):
-        self.assertTrue(torchtrt.ts.check_method_op_support(self.module, "forward"))
-
-
-class TestLoggingAPIs(unittest.TestCase):
-
-    def test_logging_prefix(self):
-        new_prefix = "Python API Test: "
-        torchtrt.logging.set_logging_prefix(new_prefix)
-        logging_prefix = torchtrt.logging.get_logging_prefix()
-        self.assertEqual(new_prefix, logging_prefix)
-
-    def test_reportable_log_level(self):
-        new_level = torchtrt.logging.Level.Error
-        torchtrt.logging.set_reportable_log_level(new_level)
-        level = torchtrt.logging.get_reportable_log_level()
-        self.assertEqual(new_level, level)
-
-    def test_is_colored_output_on(self):
-        torchtrt.logging.set_is_colored_output_on(True)
-        color = torchtrt.logging.get_is_colored_output_on()
-        self.assertTrue(color)
-
-    def test_context_managers(self):
-        base_lvl = torchtrt.logging.get_reportable_log_level()
-        with torchtrt.logging.internal_errors():
-            lvl = torchtrt.logging.get_reportable_log_level()
-            self.assertEqual(torchtrt.logging.Level.InternalError, lvl)
-
-        lvl = torchtrt.logging.get_reportable_log_level()
-        self.assertEqual(base_lvl, lvl)
-
-        with torchtrt.logging.errors():
-            lvl = torchtrt.logging.get_reportable_log_level()
-            self.assertEqual(torchtrt.logging.Level.Error, lvl)
-
-        lvl = torchtrt.logging.get_reportable_log_level()
-        self.assertEqual(base_lvl, lvl)
-
-        with torchtrt.logging.warnings():
-            lvl = torchtrt.logging.get_reportable_log_level()
-            self.assertEqual(torchtrt.logging.Level.Warning, lvl)
-
-        lvl = torchtrt.logging.get_reportable_log_level()
-        self.assertEqual(base_lvl, lvl)
-
-        with torchtrt.logging.info():
-            lvl = torchtrt.logging.get_reportable_log_level()
-            self.assertEqual(torchtrt.logging.Level.Info, lvl)
-
-        lvl = torchtrt.logging.get_reportable_log_level()
-        self.assertEqual(base_lvl, lvl)
-
-        with torchtrt.logging.debug():
-            lvl = torchtrt.logging.get_reportable_log_level()
-            self.assertEqual(torchtrt.logging.Level.Debug, lvl)
-
-        lvl = torchtrt.logging.get_reportable_log_level()
-        self.assertEqual(base_lvl, lvl)
-
-        with torchtrt.logging.graphs():
-            lvl = torchtrt.logging.get_reportable_log_level()
-            self.assertEqual(torchtrt.logging.Level.Graph, lvl)
-
-        lvl = torchtrt.logging.get_reportable_log_level()
-        self.assertEqual(base_lvl, lvl)
-
-
-class TestDevice(unittest.TestCase):
-
-    def test_from_string_constructor(self):
-        device = torchtrt.Device("cuda:0")
-        self.assertEqual(device.device_type, torchtrt.DeviceType.GPU)
-        self.assertEqual(device.gpu_id, 0)
-
-        device = torchtrt.Device("gpu:1")
-        self.assertEqual(device.device_type, torchtrt.DeviceType.GPU)
-        self.assertEqual(device.gpu_id, 1)
-
-    def test_from_string_constructor_dla(self):
-        device = torchtrt.Device("dla:0")
-        self.assertEqual(device.device_type, torchtrt.DeviceType.DLA)
-        self.assertEqual(device.gpu_id, 0)
-        self.assertEqual(device.dla_core, 0)
-
-        device = torchtrt.Device("dla:1", allow_gpu_fallback=True)
-        self.assertEqual(device.device_type, torchtrt.DeviceType.DLA)
-        self.assertEqual(device.gpu_id, 0)
-        self.assertEqual(device.dla_core, 1)
-        self.assertEqual(device.allow_gpu_fallback, True)
-
-    def test_kwargs_gpu(self):
-        device = torchtrt.Device(gpu_id=0)
-        self.assertEqual(device.device_type, torchtrt.DeviceType.GPU)
-        self.assertEqual(device.gpu_id, 0)
-
-    def test_kwargs_dla_and_settings(self):
-        device = torchtrt.Device(dla_core=1, allow_gpu_fallback=False)
-        self.assertEqual(device.device_type, torchtrt.DeviceType.DLA)
-        self.assertEqual(device.gpu_id, 0)
-        self.assertEqual(device.dla_core, 1)
-        self.assertEqual(device.allow_gpu_fallback, False)
-
-        device = torchtrt.Device(gpu_id=1, dla_core=0, allow_gpu_fallback=True)
-        self.assertEqual(device.device_type, torchtrt.DeviceType.DLA)
-        self.assertEqual(device.gpu_id, 1)
-        self.assertEqual(device.dla_core, 0)
-        self.assertEqual(device.allow_gpu_fallback, True)
-
-    def test_from_torch(self):
-        device = torchtrt.Device._from_torch_device(torch.device("cuda:0"))
-        self.assertEqual(device.device_type, torchtrt.DeviceType.GPU)
-        self.assertEqual(device.gpu_id, 0)
-
-
-class TestInput(unittest.TestCase):
-
-    def _verify_correctness(self, struct: torchtrt.Input, target: Dict) -> bool:
-        internal = struct._to_internal()
-
-        list_eq = lambda al, bl: all([a == b for (a, b) in zip(al, bl)])
-
-        eq = lambda a, b: a == b
-
-        def field_is_correct(field, equal_fn, a1, a2):
-            equal = equal_fn(a1, a2)
-            if not equal:
-                print("\nField {} is incorrect: {} != {}".format(field, a1, a2))
-            return equal
-
-        min_ = field_is_correct("min", list_eq, internal.min, target["min"])
-        opt_ = field_is_correct("opt", list_eq, internal.opt, target["opt"])
-        max_ = field_is_correct("max", list_eq, internal.max, target["max"])
-        is_dynamic_ = field_is_correct("is_dynamic", eq, internal.input_is_dynamic, target["input_is_dynamic"])
-        explicit_set_dtype_ = field_is_correct("explicit_dtype", eq, internal._explicit_set_dtype,
-                                               target["explicit_set_dtype"])
-        dtype_ = field_is_correct("dtype", eq, int(internal.dtype), int(target["dtype"]))
-        format_ = field_is_correct("format", eq, int(internal.format), int(target["format"]))
-
-        return all([min_, opt_, max_, is_dynamic_, explicit_set_dtype_, dtype_, format_])
-
-    def test_infer_from_example_tensor(self):
-        shape = [1, 3, 255, 255]
-        target = {
-            "min": shape,
-            "opt": shape,
-            "max": shape,
-            "input_is_dynamic": False,
-            "dtype": torchtrt.dtype.half,
-            "format": torchtrt.TensorFormat.contiguous,
-            "explicit_set_dtype": True
-        }
-
-        example_tensor = torch.randn(shape).half()
-        i = torchtrt.Input._from_tensor(example_tensor)
-        self.assertTrue(self._verify_correctness(i, target))
-
-    def test_static_shape(self):
-        shape = [1, 3, 255, 255]
-        target = {
-            "min": shape,
-            "opt": shape,
-            "max": shape,
-            "input_is_dynamic": False,
-            "dtype": torchtrt.dtype.unknown,
-            "format": torchtrt.TensorFormat.contiguous,
-            "explicit_set_dtype": False
-        }
-
-        i = torchtrt.Input(shape)
-        self.assertTrue(self._verify_correctness(i, target))
-
-        i = torchtrt.Input(tuple(shape))
-        self.assertTrue(self._verify_correctness(i, target))
-
-        i = torchtrt.Input(torch.randn(shape).shape)
-        self.assertTrue(self._verify_correctness(i, target))
-
-        i = torchtrt.Input(shape=shape)
-        self.assertTrue(self._verify_correctness(i, target))
-
-        i = torchtrt.Input(shape=tuple(shape))
-        self.assertTrue(self._verify_correctness(i, target))
-
-        i = torchtrt.Input(shape=torch.randn(shape).shape)
-        self.assertTrue(self._verify_correctness(i, target))
-
-    def test_data_type(self):
-        shape = [1, 3, 255, 255]
-        target = {
-            "min": shape,
-            "opt": shape,
-            "max": shape,
-            "input_is_dynamic": False,
-            "dtype": torchtrt.dtype.half,
-            "format": torchtrt.TensorFormat.contiguous,
-            "explicit_set_dtype": True
-        }
-
-        i = torchtrt.Input(shape, dtype=torchtrt.dtype.half)
-        self.assertTrue(self._verify_correctness(i, target))
-
-        i = torchtrt.Input(shape, dtype=torch.half)
-        self.assertTrue(self._verify_correctness(i, target))
-
-    def test_tensor_format(self):
-        shape = [1, 3, 255, 255]
-        target = {
-            "min": shape,
-            "opt": shape,
-            "max": shape,
-            "input_is_dynamic": False,
-            "dtype": torchtrt.dtype.unknown,
-            "format": torchtrt.TensorFormat.channels_last,
-            "explicit_set_dtype": False
-        }
-
-        i = torchtrt.Input(shape, format=torchtrt.TensorFormat.channels_last)
-        self.assertTrue(self._verify_correctness(i, target))
-
-        i = torchtrt.Input(shape, format=torch.channels_last)
-        self.assertTrue(self._verify_correctness(i, target))
-
-    def test_dynamic_shape(self):
-        min_shape = [1, 3, 128, 128]
-        opt_shape = [1, 3, 256, 256]
-        max_shape = [1, 3, 512, 512]
-        target = {
-            "min": min_shape,
-            "opt": opt_shape,
-            "max": max_shape,
-            "input_is_dynamic": True,
-            "dtype": torchtrt.dtype.unknown,
-            "format": torchtrt.TensorFormat.contiguous,
-            "explicit_set_dtype": False
-        }
-
-        i = torchtrt.Input(min_shape=min_shape, opt_shape=opt_shape, max_shape=max_shape)
-        self.assertTrue(self._verify_correctness(i, target))
-
-        i = torchtrt.Input(min_shape=tuple(min_shape), opt_shape=tuple(opt_shape), max_shape=tuple(max_shape))
-        self.assertTrue(self._verify_correctness(i, target))
-
-        tensor_shape = lambda shape: torch.randn(shape).shape
-        i = torchtrt.Input(min_shape=tensor_shape(min_shape),
-                           opt_shape=tensor_shape(opt_shape),
-                           max_shape=tensor_shape(max_shape))
-        self.assertTrue(self._verify_correctness(i, target))
-
-
-class TestModule(unittest.TestCase):
-
-    def test_module_type(self):
-        nn_module = models.alexnet(pretrained=True).eval().to("cuda")
-        ts_module = torch.jit.trace(nn_module, torch.ones([1, 3, 224, 224]).to("cuda"))
-        fx_module = torch.fx.symbolic_trace(nn_module)
-
-        self.assertEqual(torchtrt._compile._parse_module_type(nn_module), torchtrt._compile._ModuleType.nn)
-        self.assertEqual(torchtrt._compile._parse_module_type(ts_module), torchtrt._compile._ModuleType.ts)
-        self.assertEqual(torchtrt._compile._parse_module_type(fx_module), torchtrt._compile._ModuleType.fx)
-
-
-def test_suite():
-    suite = unittest.TestSuite()
-    suite.addTest(unittest.makeSuite(TestLoggingAPIs))
-    suite.addTest(TestCompile.parametrize(TestCompile, model=models.resnet18(pretrained=True)))
-    # Disabling mobilenet_v2 test due to https://nvbugs/3433655
-    # suite.addTest(TestCompile.parametrize(TestCompile, model=models.mobilenet_v2(pretrained=True)))
-    suite.addTest(TestCompileHalf.parametrize(TestCompileHalf, model=models.resnet18(pretrained=True)))
-    suite.addTest(TestCompileHalfDefault.parametrize(TestCompileHalfDefault, model=models.resnet18(pretrained=True)))
-    suite.addTest(TestPTtoTRTtoPT.parametrize(TestPTtoTRTtoPT, model=models.resnet18(pretrained=True)))
-    suite.addTest(
-        TestInputTypeDefaultsFP32Model.parametrize(TestInputTypeDefaultsFP32Model,
-                                                   model=models.resnet18(pretrained=True)))
-    suite.addTest(
-        TestInputTypeDefaultsFP16Model.parametrize(TestInputTypeDefaultsFP16Model,
-                                                   model=models.resnet18(pretrained=True)))
-    suite.addTest(TestFallbackToTorch.parametrize(TestFallbackToTorch, model=models.resnet18(pretrained=True)))
-    suite.addTest(
-        TestModuleFallbackToTorch.parametrize(TestModuleFallbackToTorch, model=models.resnet18(pretrained=True)))
-    suite.addTest(unittest.makeSuite(TestCheckMethodOpSupport))
-    suite.addTest(unittest.makeSuite(TestDevice))
-    suite.addTest(unittest.makeSuite(TestInput))
-    suite.addTest(unittest.makeSuite(TestModule))
-
-    return suite
-
-
-suite = test_suite()
-
-runner = unittest.TextTestRunner()
-result = runner.run(suite)
-
-exit(int(not result.wasSuccessful()))
diff --git a/toolchains/ci_workspaces/WORKSPACE.sbsa b/toolchains/ci_workspaces/WORKSPACE.sbsa
new file mode 100644
index 0000000000..93cf5876a7
--- /dev/null
+++ b/toolchains/ci_workspaces/WORKSPACE.sbsa
@@ -0,0 +1,147 @@
+workspace(name = "Torch-TensorRT")
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
+
+http_archive(
+    name = "rules_python",
+    sha256 = "778197e26c5fbeb07ac2a2c5ae405b30f6cb7ad1f5510ea6fdac03bded96cc6f",
+    url = "https://github.com/bazelbuild/rules_python/releases/download/0.2.0/rules_python-0.2.0.tar.gz",
+)
+
+load("@rules_python//python:pip.bzl", "pip_install")
+
+http_archive(
+    name = "rules_pkg",
+    sha256 = "038f1caa773a7e35b3663865ffb003169c6a71dc995e39bf4815792f385d837d",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_pkg/releases/download/0.4.0/rules_pkg-0.4.0.tar.gz",
+        "https://github.com/bazelbuild/rules_pkg/releases/download/0.4.0/rules_pkg-0.4.0.tar.gz",
+    ],
+)
+
+load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
+
+rules_pkg_dependencies()
+
+git_repository(
+    name = "googletest",
+    commit = "703bd9caab50b139428cea1aaff9974ebee5742e",
+    remote = "https://github.com/google/googletest",
+    shallow_since = "1570114335 -0400",
+)
+
+# External dependency for torch_tensorrt if you already have precompiled binaries.
+local_repository(
+    name = "torch_tensorrt",
+    path = "/opt/circleci/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch_tensorrt"
+)
+
+# CUDA should be installed on the system locally
+new_local_repository(
+    name = "cuda",
+    build_file = "@//third_party/cuda:BUILD",
+    path = "/usr/local/cuda/",
+)
+
+new_local_repository(
+    name = "cublas",
+    build_file = "@//third_party/cublas:BUILD",
+    path = "/usr",
+)
+#############################################################################################################
+# Tarballs and fetched dependencies (default - use in cases when building from precompiled bin and tarballs)
+#############################################################################################################
+
+#http_archive(
+#    name = "libtorch",
+#    build_file = "@//third_party/libtorch:BUILD",
+#    sha256 = "8d9e829ce9478db4f35bdb7943308cf02e8a2f58cf9bb10f742462c1d57bf287",
+#    strip_prefix = "libtorch",
+#    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcu113.zip"],
+#)
+#
+#http_archive(
+#    name = "libtorch_pre_cxx11_abi",
+#    build_file = "@//third_party/libtorch:BUILD",
+#    sha256 = "90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad",
+#    strip_prefix = "libtorch",
+#    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip"],
+#)
+
+# Download these tarballs manually from the NVIDIA website
+# Either place them in the distdir directory in third_party and use the --distdir flag
+# or modify the urls to "file:///<PATH_TO_TARBALL>.tar.gz"
+
+#http_archive(
+#    name = "cudnn",
+#    build_file = "@//third_party/cudnn/archive:BUILD",
+#    sha256 = "0e5d2df890b9967efa6619da421310d97323565a79f05a1a8cb9b7165baad0d7",
+#    strip_prefix = "cuda",
+#    urls = [
+#        "https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.2.4/11.4_20210831/cudnn-11.4-linux-x64-v8.2.4.15.tgz",
+#    ],
+#)
+#
+#http_archive(
+#    name = "tensorrt",
+#    build_file = "@//third_party/tensorrt/archive:BUILD",
+#    sha256 = "826180eaaecdf9a7e76116855b9f1f3400ea9b06e66b06a3f6a0747ba6f863ad",
+#    strip_prefix = "TensorRT-8.2.4.2",
+#    urls = [
+#        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.4/tars/tensorrt-8.2.4.2.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
+#    ],
+#)
+
+####################################################################################
+# Locally installed dependencies (use in cases of custom dependencies or aarch64)
+####################################################################################
+
+# NOTE: In the case you are using just the pre-cxx11-abi path or just the cxx11 abi path
+# with your local libtorch, just point deps at the same path to satisfy bazel.
+
+# NOTE: NVIDIA's aarch64 PyTorch (python) wheel file uses the CXX11 ABI unlike PyTorch's standard
+# x86_64 python distribution. If using NVIDIA's version just point to the root of the package
+# for both versions here and do not use --config=pre-cxx11-abi
+
+new_local_repository(
+    name = "libtorch",
+    path = "/opt/circleci/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch",
+    build_file = "third_party/libtorch/BUILD"
+)
+
+new_local_repository(
+    name = "libtorch_pre_cxx11_abi",
+    path = "/opt/circleci/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch",
+    build_file = "third_party/libtorch/BUILD"
+)
+
+new_local_repository(
+    name = "cudnn",
+    path = "/usr/",
+    build_file = "@//third_party/cudnn/local:BUILD"
+)
+
+new_local_repository(
+    name = "tensorrt",
+    path = "/usr/",
+    build_file = "@//third_party/tensorrt/local:BUILD"
+)
+
+# #########################################################################
+# # Testing Dependencies (optional - comment out on aarch64)
+# #########################################################################
+# pip_install(
+#     name = "torch_tensorrt_py_deps",
+#     requirements = "//py:requirements.txt",
+# )
+
+# pip_install(
+#     name = "py_test_deps",
+#     requirements = "//tests/py:requirements.txt",
+# )
+
+pip_install(
+    name = "pylinter_deps",
+    requirements = "//tools/linter:requirements.txt",
+)
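Both CI workspaces hard-code the CircleCI pyenv site-packages path for the libtorch and torch_tensorrt local repositories. A quick sketch for finding the equivalent path in another environment (not part of the diff itself):

```python
import os
import torch

# Directory to point the libtorch new_local_repository entries at,
# e.g. .../lib/python3.x/site-packages/torch
print(os.path.dirname(torch.__file__))
```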
"https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.2.4/11.4_20210831/cudnn-11.4-linux-x64-v8.2.4.15.tgz", +# ], +#) +# +#http_archive( +# name = "tensorrt", +# build_file = "@//third_party/tensorrt/archive:BUILD", +# sha256 = "826180eaaecdf9a7e76116855b9f1f3400ea9b06e66b06a3f6a0747ba6f863ad", +# strip_prefix = "TensorRT-8.2.4.2", +# urls = [ +# "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.4/tars/tensorrt-8.2.4.2.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz", +# ], +#) + +#################################################################################### +# Locally installed dependencies (use in cases of custom dependencies or aarch64) +#################################################################################### + +# NOTE: In the case you are using just the pre-cxx11-abi path or just the cxx11 abi path +# with your local libtorch, just point deps at the same path to satisfy bazel. + +# NOTE: NVIDIA's aarch64 PyTorch (python) wheel file uses the CXX11 ABI unlike PyTorch's standard +# x86_64 python distribution. If using NVIDIA's version just point to the root of the package +# for both versions here and do not use --config=pre-cxx11-abi + +new_local_repository( + name = "libtorch", + path = "/opt/circleci/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch", + build_file = "third_party/libtorch/BUILD" +) + +new_local_repository( + name = "libtorch_pre_cxx11_abi", + path = "/opt/circleci/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch", + build_file = "third_party/libtorch/BUILD" +) + +new_local_repository( + name = "cudnn", + path = "/usr/", + build_file = "@//third_party/cudnn/local:BUILD" +) + +new_local_repository( + name = "tensorrt", + path = "/usr/", + build_file = "@//third_party/tensorrt/local:BUILD" +) + +# ######################################################################### +# # Testing Dependencies (optional - comment out on aarch64) +# ######################################################################### +# pip_install( +# name = "torch_tensorrt_py_deps", +# requirements = "//py:requirements.txt", +# ) + +# pip_install( +# name = "py_test_deps", +# requirements = "//tests/py:requirements.txt", +# ) + +pip_install( + name = "pylinter_deps", + requirements = "//tools/linter:requirements.txt", +) diff --git a/WORKSPACE.ci b/toolchains/ci_workspaces/WORKSPACE.x86_64 similarity index 98% rename from WORKSPACE.ci rename to toolchains/ci_workspaces/WORKSPACE.x86_64 index 1eeb75ea7d..00f8efc5df 100644 --- a/WORKSPACE.ci +++ b/toolchains/ci_workspaces/WORKSPACE.x86_64 @@ -34,7 +34,7 @@ git_repository( # External dependency for torch_tensorrt if you already have precompiled binaries. local_repository( name = "torch_tensorrt", - path = "/opt/conda/lib/python3.8/site-packages/torch_tensorrt" + path = "/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages/torch_tensorrt" ) # CUDA should be installed on the system locally