Skip to content

머신러닝 모델이 GPU에서 더 느리게 학습되는데 GPU의 종류 때문일까요? #1131

@Jaeyoung-Choi

Description

@Jaeyoung-Choi

해결하고자 하는 문제

Colab 환경에서 머신러닝을 공부 중인데, 처음 만들어 본 MNIST 분류 모델이 GPU를 이용해 back propagation을 계산하면 CPU를 이용했을 때보다 약 2배 정도 느리게 계산됩니다. 모델을 직접 구현해 보기 위해 텐서 연산을 직접 작성하여 만들었고, numpy로도 제작할 수 있었지만 GPU를 사용하기 위해 PyTorch를 이용했습니다. 모델은 784개 노드의 입력 레이어에서 16개 노드로 이루어진 레이어 2개를 통과해 10개 노드로 출력되도록 만들었습니다.

코드 혹은 오류

import torch
from torchvision import datasets as dsets
import torchvision.transforms as transforms

# Seed the default generator first so parameter initialisation is repeatable.
torch.manual_seed(42)

# Run on CUDA when a device is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
    torch.cuda.manual_seed_all(42)
    # Report the current device index, the device count and device 0's name.
    print(
        torch.cuda.current_device(),
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0),
    )
else:
    device = 'cpu'

def sigmoid(X):
    """Apply the logistic function ``1 / (1 + e^-x)`` to every element of ``X``.

    Args:
        X: Tensor of pre-activation values.

    Returns:
        Tensor with the same shape as ``X``.
    """
    denominator = torch.exp(-X) + 1
    return denominator.reciprocal()

def softmax(X):
    """Normalise ``X`` along dimension 0 so the entries there sum to 1.

    The maximum along dimension 0 is subtracted before exponentiating.
    This leaves the result mathematically unchanged but keeps ``exp`` from
    overflowing to ``inf`` (and the quotient from becoming ``nan``) when the
    inputs are large.

    Args:
        X: Tensor of logits; normalisation runs over dimension 0.

    Returns:
        Tensor with the same shape as ``X``.
    """
    if X.numel() == 0:
        # Nothing to normalise; a max over an empty dimension would raise.
        return torch.exp(X)
    shifted = X - torch.amax(X, dim=0, keepdim=True)
    # Exponentiate once and reuse it for both numerator and denominator.
    exps = torch.exp(shifted)
    return exps / torch.sum(exps, dim=0)

def dsigmoid(X):
    """Return the element-wise derivative of ``sigmoid`` evaluated at ``X``.

    Uses the identity ``sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))``.

    Args:
        X: Tensor of pre-activation values.

    Returns:
        Tensor with the same shape as ``X``.
    """
    # Evaluate the activation once instead of once per factor.
    activation = sigmoid(X)
    return torch.mul(activation, 1 - activation)

def dsoftmax(X):
    """Return ``s - s**2`` where ``s = softmax(X)``.

    This is ``s_i * (1 - s_i)``, i.e. only the diagonal of the softmax
    Jacobian; the off-diagonal terms ``-s_i * s_j`` are not included.

    Args:
        X: Tensor of logits, passed straight to ``softmax``.

    Returns:
        Tensor with the same shape as ``softmax(X)``.
    """
    # Evaluate softmax once instead of once per term.
    probs = softmax(X)
    return probs - torch.pow(probs, 2)

# Learning rate: model.backward scales every gradient by -lr before adding it
# to the corresponding parameter.
lr = 1e-3
class model:
    """Fully connected 784 -> 16 -> 16 -> 10 classifier with manual backprop.

    Both hidden layers use ``sigmoid`` and the output layer uses ``softmax``.
    The loss is the summed squared error between the output and the target.
    ``backward`` handles one flattened sample (a 1-D tensor) per call.
    """

    def __init__(self):
        # Every parameter is sampled from a normal distribution with mean 0
        # and standard deviation 2, then moved to `device`. Weights are drawn
        # before biases.
        # W[k] has shape (inputs, outputs) for layer k.
        self.W = [torch.normal(torch.zeros(shape), torch.tensor([2])).to(device)
                  for shape in ((784, 16), (16, 16), (16, 10))]
        # B[k] holds one bias per output unit of layer k.
        self.B = [torch.normal(torch.zeros(size), torch.tensor(2)).to(device)
                  for size in (16, 16, 10)]

    def _activations(self, X):
        """Run the forward pass and return the activations of all three layers.

        Args:
            X: Input tensor whose last dimension has 784 entries.

        Returns:
            Tuple ``(L1, L2, Y)``: both hidden activations and the output.
        """
        L1 = sigmoid(torch.matmul(X, self.W[0]) + self.B[0])
        L2 = sigmoid(torch.matmul(L1, self.W[1]) + self.B[1])
        Y = softmax(torch.matmul(L2, self.W[2]) + self.B[2])
        return L1, L2, Y

    def forward(self, X):
        """Return the network output for input ``X``."""
        return self._activations(X)[2]

    def loss(self, X, Y):
        """Return the summed squared error between ``forward(X)`` and ``Y``.

        The sum is not divided by the element count; ``Y.numel()`` would give
        that count if a mean is wanted.
        """
        return torch.sum(torch.pow(self.forward(X) - Y, 2))

    def backward(self, X, Y):
        """Take one gradient-descent step on a single sample.

        All gradients are computed from the current parameters before any
        parameter is changed. The step size is the module-level ``lr``.

        Args:
            X: Flattened input sample, a 1-D tensor with 784 entries.
            Y: Target for that sample, a 1-D tensor with 10 entries.

        Returns:
            None. ``self.W`` and ``self.B`` are rebound to updated tensors.
        """
        # One forward pass provides every activation the gradients need.
        L1, L2, output = self._activations(X)

        # d(loss)/d(output) for loss = sum((output - Y) ** 2).
        grad_output = 2 * (output - Y)

        # Back through softmax with the full Jacobian
        # d(s_j)/d(z_i) = s_j * (delta_ij - s_i), which gives
        # d(loss)/d(z_i) = s_i * (g_i - sum_j g_j * s_j).
        # Using only the diagonal s_i * (1 - s_i) would drop the cross terms.
        delta_out = output * (grad_output - torch.sum(grad_output * output, dim=0))

        # Back through the sigmoid layers. sigmoid'(z) = a * (1 - a), so the
        # stored activations are reused instead of recomputing sigmoid(z).
        delta_h2 = torch.matmul(delta_out, self.W[2].t()) * (L2 * (1 - L2))
        delta_h1 = torch.matmul(delta_h2, self.W[1].t()) * (L1 * (1 - L1))

        # The bias gradient of a layer is its delta; the weight gradient is
        # the outer product of the layer's input with that delta.
        dB = [delta_h1, delta_h2, delta_out]
        layer_inputs = [X, L1, L2]
        dW = [inp.unsqueeze(1) * delta.unsqueeze(0)
              for inp, delta in zip(layer_inputs, dB)]

        for k in range(3):
            self.W[k] = self.W[k] - lr * dW[k]
            self.B[k] = self.B[k] - lr * dB[k]

# Build the training split (train=True) and the test split (train=False), in
# that order. Both are stored under MNIST_data/, downloaded when missing, and
# converted to tensors by the ToTensor transform.
mnist_train, mnist_test = (
    dsets.MNIST(root='MNIST_data/',
                train=is_train,
                transform=transforms.ToTensor(),
                download=True)
    for is_train in (True, False)
)

def one_hot_encoding(x, num_classes=10):
    """Return a one-hot list for class index ``x``.

    Args:
        x: Class index, ``0 <= x < num_classes``.
        num_classes: Length of the returned list. Defaults to 10.

    Returns:
        A list of ``num_classes`` integers: 1 at position ``x``, 0 elsewhere.

    Raises:
        IndexError: If ``x`` is outside ``[0, num_classes)``. Negative
            indices are rejected rather than wrapping to the end of the list.
    """
    if not 0 <= x < num_classes:
        raise IndexError(
            f"class index {x} is out of range for {num_classes} classes")
    arg = [0] * num_classes
    arg[x] = 1
    return arg

def _dataset_to_tensors(dataset):
    """Convert ``dataset`` into an image tensor and a one-hot label tensor.

    Every sample is fetched exactly once and the number of samples comes
    from ``len(dataset)`` rather than a hard-coded count.

    Args:
        dataset: Indexable dataset whose items are ``(image, label)`` pairs,
            where ``image`` is a tensor with 784 elements and ``label`` is a
            class index accepted by ``one_hot_encoding``.

    Returns:
        Tuple ``(X, Y)`` on ``device``: ``X`` has one flattened 784-element
        row per sample and ``Y`` has one one-hot row per sample.
    """
    images = []
    labels = []
    for index in range(len(dataset)):
        # NOTE(review): torchvision presumably applies the transform on each
        # access, so fetch the image and label together in a single pass.
        image, label = dataset[index]
        images.append(image.reshape(1, 784))
        labels.append(one_hot_encoding(label))
    return torch.cat(images, dim=0).to(device), torch.tensor(labels).to(device)


mnist_test_X, mnist_test_Y = _dataset_to_tensors(mnist_test)
mnist_train_X, mnist_train_Y = _dataset_to_tensors(mnist_train)

M = model()

# Per-sample training: 100 passes over the training tensors, one parameter
# update per sample, printing the accumulated loss after every pass.
# NOTE(review): each iteration works on a single sample, so every tensor
# operation is tiny. On a GPU the fixed per-operation launch overhead likely
# dominates the arithmetic, which would explain the GPU being slower than the
# CPU here; processing samples in batches would amortise it. Confirm by
# profiling.
for tr in range(100):
    # Allocate the accumulator on the target device directly. It stays a
    # tensor so that adding to it does not need a host round-trip per sample.
    loss_sum = torch.zeros(1, device=device)
    for i in range(len(mnist_train_X)):
        M.backward(mnist_train_X[i], mnist_train_Y[i])
        # The loss is measured after the update for this sample.
        loss_sum += M.loss(mnist_train_X[i], mnist_train_Y[i])
    print(tr, loss_sum)

환경

사용중인 운영체제, 언어, 라이브러리의 버전을 적어주세요.
Python 3.7.11
pytorch 1.9.0+cu102
GPU: nvidia T4
CPU: Intel(R) Xeon(R) CPU @ 2.30GHz(46080 KB cache) * 2

시도해본 방법

인터넷에서 검색해 보니 NVIDIA T4는 다른 GPU보다 float 연산 성능은 떨어지지만 integer 연산 성능은 더 좋은 것으로 보아, float 연산보다는 4bit integer 연산에 특화된 기종 같았습니다. 그래서 float 텐서로 만들어진 제 모델에서 계산이 느린 것이 아닌가 생각됩니다.

Metadata

Metadata

Assignees

No one assigned

    Labels

    ML / DL: machine learning, deep learning

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions