|
1 |
| -#include <torch/csrc/jit/codegen/cuda/executor.h> |
| 1 | +#pragma once |
| 2 | + |
2 | 3 | #include <torch/csrc/jit/codegen/cuda/executor_utils.h>
|
3 | 4 | #include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
|
4 | 5 | #include <torch/csrc/jit/codegen/cuda/fusion.h>
|
5 | 6 | #include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
|
6 | 7 | #include <torch/csrc/jit/codegen/cuda/lower_utils.h>
|
7 | 8 |
|
8 | 9 | #include <ATen/cuda/CUDAContext.h>
|
9 |
| -#include <c10/cuda/CUDACachingAllocator.h> |
10 |
| -#include <torch/torch.h> |
11 | 10 |
|
12 | 11 | #include <unordered_map>
|
13 | 12 |
|
| 13 | +// Tests go in torch::jit |
14 | 14 | namespace torch {
|
15 | 15 | namespace jit {
|
16 |
| -namespace fuser { |
17 |
| -namespace cuda { |
18 |
| - |
19 |
| -inline bool deviceMajorMinorCheck(int major, int minor = 0) { |
20 |
| - auto dev_prop = at::cuda::getCurrentDeviceProperties(); |
21 |
| - if (dev_prop->major < major || |
22 |
| - (dev_prop->major == major && dev_prop->minor < minor)) { |
23 |
| - return false; |
24 |
| - } |
25 |
| - return true; |
26 |
| -} |
27 | 16 |
|
28 |
| -inline int deviceSMCount() { |
29 |
| - int sm_count = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; |
30 |
| - return sm_count; |
31 |
| -} |
| 17 | +using namespace torch::jit::fuser::cuda; |
32 | 18 |
|
33 |
| -class NVFuserTest : public ::testing::Test { |
34 |
| - protected: |
35 |
| - void SetUp() override { |
36 |
| - // requires PASCAL or newer |
37 |
| - if (!deviceMajorMinorCheck(6)) { |
38 |
| - GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs"; |
39 |
| - } |
40 |
| - setFillAllocationWithNan(true); |
41 |
| - } |
42 |
| - |
43 |
| - void TearDown() override { |
44 |
| - c10::cuda::CUDACachingAllocator::emptyCache(); |
45 |
| - } |
46 |
| -}; |
| 19 | +namespace { |
47 | 20 |
|
48 | 21 | struct ValidationConstants {
|
49 | 22 | // Tolerances generated from randn + add + sum fusion
|
@@ -74,8 +47,6 @@ struct ValidationConstants {
|
74 | 47 | double base_float_rel_tol = -1;
|
75 | 48 | };
|
76 | 49 |
|
77 |
| -namespace { |
78 |
| - |
79 | 50 | // Returns abs and relative values to use for validation
|
80 | 51 | std::pair<double, double> getTolerance(
|
81 | 52 | DataType dtype,
|
@@ -338,15 +309,13 @@ ExpressionEvaluator bindInputsAndLaunchParams(
|
338 | 309 | return expr_eval;
|
339 | 310 | }
|
340 | 311 |
|
341 |
| -} // namespace |
342 |
| - |
343 | 312 | // Validation will look through the fusion and figure out how many elements were
|
344 | 313 | // reduced to create each output. It will then compute a tolerance to use for
|
345 | 314 | // allclose based on experimental results. The experimental results were based
|
346 | 315 | // on adding two tensors then summing them. This of course has an assumption
|
347 | 316 | // that we're always summing values between -2 and 2. If we start summing values
|
348 | 317 | // larger than that this approach might not hold.
|
349 |
| -inline void testValidate( |
| 318 | +void testValidate( |
350 | 319 | Fusion* fusion,
|
351 | 320 | const std::vector<at::Tensor>& fusion_outputs,
|
352 | 321 | const at::ArrayRef<IValue>& aten_inputs,
|
@@ -466,18 +435,6 @@ inline void testValidate(
|
466 | 435 | }
|
467 | 436 | }
|
468 | 437 |
|
469 |
| -inline void clearL2Cache() { |
470 |
| - torch::NoGradGuard no_grad; |
471 |
| - auto l2_cache_size = at::cuda::getCurrentDeviceProperties()->l2CacheSize; |
472 |
| - auto options = |
473 |
| - torch::TensorOptions().dtype(torch::kFloat32).device(at::kCUDA, 0); |
474 |
| - |
475 |
| - auto l2_elems = l2_cache_size / 4; |
476 |
| - torch::Tensor t0 = torch::empty(l2_elems, options); |
477 |
| - torch::Tensor t1 = torch::clone(t0); |
478 |
| -}; |
479 |
| - |
480 |
| -} // namespace cuda |
481 |
| -} // namespace fuser |
| 438 | +} // namespace |
482 | 439 | } // namespace jit
|
483 | 440 | } // namespace torch
|
0 commit comments