머신러닝 모델이 GPU에서 더 느리게 학습되는데 GPU의 종류 때문일까요?

# 해결하고자 하는 문제
Colab 환경에서 머신러닝을 공부 중인데, 처음 만들어 본 MNIST 분류 모델이 GPU를 이용해 back propagation을 계산하면 CPU를 이용했을 때보다 약 2배 정도 느리게 계산됩니다. 직접 계산하여 모델을 만들어 보기 위해 텐서 연산을 직접 작성했습니다. numpy를 이용해서도 제작할 수 있었지만 GPU를 사용하기 위해 pytorch를 이용했습니다. 모델은 784개 노드의 입력 레이어에서 16개 노드로 이루어진 레이어 2개를 통과해 10개의 노드로 출력되도록 만들었습니다.

# 코드 혹은 오류
```python
import torch
from torchvision import datasets as dsets
import torchvision.transforms as transforms

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fixed seed so that parameter initialisation is repeatable between runs.
torch.manual_seed(42)
if device != 'cpu':
    # Seed every visible GPU as well, then report which one is in use.
    torch.cuda.manual_seed_all(42)
    print(torch.cuda.current_device(), torch.cuda.device_count(), torch.cuda.get_device_name(0))

def sigmoid(X):
    """Apply the logistic function ``1 / (1 + e^(-X))`` element-wise.

    Args:
        X: input tensor of any shape.

    Returns:
        A tensor with the same shape as ``X``.
    """
    decay = torch.exp(torch.neg(X))
    return 1 / (1 + decay)

def softmax(X, dim = 0):
    """Return the softmax of ``X`` along ``dim``.

    The maximum along ``dim`` is subtracted before exponentiating.  This does
    not change the mathematical result, but it keeps ``exp`` from overflowing
    to ``inf`` (and the quotient from becoming ``nan``) for large inputs.

    Args:
        X: input tensor.
        dim: dimension to normalise over.  The default of 0 is the class axis
            for a single 1-D sample; pass the class axis explicitly for
            batched input.

    Returns:
        A tensor shaped like ``X`` whose entries along ``dim`` sum to 1.
    """
    if X.numel() == 0:
        # Nothing to normalise; keep returning an empty tensor of that shape.
        return torch.exp(X)
    shifted = X - torch.amax(X, dim=dim, keepdim=True)
    weights = torch.exp(shifted)  # evaluated once and reused below
    return weights / torch.sum(weights, dim=dim, keepdim=True)

def dsigmoid(X):
    """Return the element-wise derivative of ``sigmoid`` evaluated at ``X``.

    Uses the identity ``sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))``.

    Args:
        X: tensor of pre-activation values.

    Returns:
        A tensor with the same shape as ``X``.
    """
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(X)
    return torch.mul(s, 1 - s)

def dsoftmax(X):
    """Return ``s - s**2`` where ``s = softmax(X)``.

    This is ``s_i * (1 - s_i)``, i.e. only the diagonal of the softmax
    Jacobian (the derivative of output ``i`` with respect to input ``i``).
    The cross terms ``-s_i * s_j`` between different outputs are not part of
    the returned value.

    Args:
        X: tensor of pre-activation values (logits).

    Returns:
        A tensor with the same shape as ``X``.
    """
    # Evaluate the softmax once and reuse it for both terms.
    s = softmax(X)
    return s - torch.pow(s, 2)

# Learning rate: step size applied to every gradient in model.backward.
lr = 1e-3
class model:
    """Fully connected 784-16-16-10 classifier trained with hand-written backprop.

    Two sigmoid hidden layers feed a softmax output layer.  Parameters are
    kept in two parallel lists indexed by layer, ``W`` (weight matrices) and
    ``B`` (bias vectors), and are stored on the module-level ``device``.
    """

    # Width of every layer, input first.
    _LAYER_SIZES = (784, 16, 16, 10)
    # Standard deviation of the zero-mean normal the parameters are drawn from.
    _INIT_STD = 2.0

    def __init__(self):
        """Draw every weight and bias from a zero-mean normal and move it to ``device``."""
        std = torch.tensor(self._INIT_STD)

        def draw(*shape):
            return torch.normal(torch.zeros(shape), std).to(device)

        fan = list(zip(self._LAYER_SIZES[:-1], self._LAYER_SIZES[1:]))
        # All weights are drawn first, then all biases, so the random stream is
        # consumed in a fixed order for a given seed.
        self.W = [draw(n_in, n_out) for n_in, n_out in fan]
        self.B = [draw(n_out) for _, n_out in fan]

    def _activations(self, X):
        """Run the forward pass and return the output of all three layers."""
        L1 = sigmoid(torch.matmul(X, self.W[0]) + self.B[0])
        L2 = sigmoid(torch.matmul(L1, self.W[1]) + self.B[1])
        Y = softmax(torch.matmul(L2, self.W[2]) + self.B[2])
        return L1, L2, Y

    def forward(self, X):
        """Return the network output for ``X``.

        Args:
            X: input whose last dimension has 784 entries.  ``softmax`` is
                called with its default axis (dim 0), which is the class axis
                only when ``X`` is a single 1-D sample.

        Returns:
            The softmax output of the last layer.
        """
        return self._activations(X)[2]

    def loss(self, X, Y):
        """Return the summed squared error between ``forward(X)`` and ``Y``.

        The value is a sum, not a mean; divide it by ``Y.numel()`` to get the
        per-element average.
        """
        return torch.sum(torch.pow(self.forward(X) - Y, 2))

    def backward(self, X, Y):
        """Take one gradient-descent step on a single sample.

        Computes the gradient of ``loss(X, Y)`` with respect to every weight
        and bias, then replaces each parameter by
        ``parameter - lr * gradient`` using the module-level ``lr``.

        Args:
            X: one flattened sample as a 1-D tensor of 784 values (the
                weight gradients are built as outer products of 1-D vectors).
            Y: target vector with 10 entries.

        Returns:
            None.  ``self.W`` and ``self.B`` are updated.
        """
        L1, L2, out = self._activations(X)

        # d(loss)/d(out) for the summed squared error.
        grad_out = 2 * (out - Y)

        # Every softmax output depends on every logit, so the gradient with
        # respect to the logits is the full Jacobian-vector product
        #   s_j * (g_j - sum_i g_i * s_i),
        # not just the diagonal term s_j * (1 - s_j) * g_j.
        delta3 = out * (grad_out - torch.sum(out * grad_out))

        # sigmoid'(z) = a * (1 - a); reuse the stored activations instead of
        # running each layer a second time.
        delta2 = torch.matmul(delta3, self.W[2].t()) * (L2 * (1 - L2))
        delta1 = torch.matmul(delta2, self.W[1].t()) * (L1 * (1 - L1))

        # Every delta above was computed from the pre-update weights; only now
        # are the parameters changed.
        layer_inputs = (X, L1, L2)
        deltas = (delta1, delta2, delta3)
        for k, (act, delta) in enumerate(zip(layer_inputs, deltas)):
            # Outer product (n_in, 1) @ (1, n_out): gradient of the weights.
            grad_W = torch.matmul(act.unsqueeze(1), delta.unsqueeze(0))
            self.W[k] = self.W[k] - lr * grad_W
            # The bias gradient is the delta itself.
            self.B[k] = self.B[k] - lr * delta

# Build the training split first, then the test split.  Both are stored
# under MNIST_data/ (downloaded when missing) and convert each image to a
# tensor on access.
mnist_train, mnist_test = (
    dsets.MNIST(
        root='MNIST_data/',               # download / cache directory
        train=is_train,                   # True -> training data, False -> test data
        transform=transforms.ToTensor(),  # convert images to tensors
        download=True,
    )
    for is_train in (True, False)
)

def one_hot_encoding(x, num_classes=10):
    """Return a one-hot list for class index ``x``.

    Args:
        x: integer class label, ``0 <= x < num_classes``.
        num_classes: length of the returned list (10 digit classes by default).

    Returns:
        A list of ``num_classes`` integers, all 0 except a 1 at position ``x``.

    Raises:
        IndexError: if ``x`` is outside ``[0, num_classes)``.  Negative labels
            are rejected explicitly; plain list indexing would otherwise wrap
            them around to the end of the list.
    """
    if not 0 <= x < num_classes:
        raise IndexError(f"label {x} is out of range for {num_classes} classes")
    encoded = [0] * num_classes
    encoded[x] = 1
    return encoded

def _dataset_to_tensors(dataset):
    """Convert a whole dataset into an (inputs, targets) pair on ``device``.

    Args:
        dataset: indexable dataset with a length, whose items are
            ``(image, label)`` pairs with 784 values per image.

    Returns:
        A tuple ``(X, Y)``: ``X`` has one flattened 784-value row per sample
        and ``Y`` holds the matching one-hot encoded labels.
    """
    images = []
    labels = []
    for idx in range(len(dataset)):
        # Look each sample up once and take both parts from it, instead of
        # indexing the dataset separately for the image and for the label.
        image, label = dataset[idx]
        images.append(image.reshape(1, 784))
        labels.append(one_hot_encoding(label))
    return torch.cat(images, dim = 0).to(device), torch.tensor(labels).to(device)


mnist_test_X, mnist_test_Y = _dataset_to_tensors(mnist_test)
mnist_train_X, mnist_train_Y = _dataset_to_tensors(mnist_train)

M = model()

# 100 passes over the training set, one parameter update per sample.
for epoch in range(100):
    # Running total of the per-sample losses for this pass, kept on `device`.
    loss_sum = torch.zeros(1, dtype=torch.float).to(device)
    for sample, target in zip(mnist_train_X, mnist_train_Y):
        M.backward(sample, target)
        # The loss is measured after the update for this sample was applied.
        loss_sum += M.loss(sample, target)
    print(epoch, loss_sum)
```

# 환경
사용 중인 운영체제, 언어, 라이브러리의 버전을 적어주세요.

- Python 3.7.11
- pytorch 1.9.0+cu102
- GPU: nvidia T4
- CPU: Intel(R) Xeon(R) CPU @ 2.30GHz (46080 KB cache) * 2

# 시도해본 방법
인터넷을 검색해 보니 nvidia T4는 다른 GPU보다 float 연산 성능은 떨어지지만 integer 연산 성능은 더 좋은 것으로 보아, float 연산보다는 4bit integer 연산에 특화된 기종인 것 같았습니다. 그래서 float 텐서로 만들어진 제 모델에서 계산이 느린 것이 아닌가 생각됩니다.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

머신러닝 모델이 GPU에서 더 느리게 학습되는데 GPU의 종류 때문일까요? #1131

해결하고자 하는 문제

코드 혹은 오류

환경

시도해본 방법

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

머신러닝 모델이 GPU에서 더 느리게 학습되는데 GPU의 종류 때문일까요? #1131

Description

해결하고자 하는 문제

코드 혹은 오류

환경

시도해본 방법

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions