-
Notifications
You must be signed in to change notification settings - Fork 51
Open
Labels
ML / DL (machine learning, deep learning)
Milestone
Description
해결하고자 하는 문제
Colab 환경에서 머신러닝을 공부 중인데, 처음 만들어 본 MNIST 분류 모델이 GPU를 이용해 back propagation을 계산하면 CPU를 이용했을 때보다 약 2배 정도 느리게 계산됩니다. 직접 계산하여 모델을 만들기 위해 텐서 연산을 직접 작성해 모델을 만들었고, numpy를 이용해 제작할 수도 있었지만 GPU를 사용하기 위해 pytorch를 이용했습니다. 모델은 784개 노드의 입력 레이어에서 16개 노드로 이루어진 레이어 2개를 통과해 10개의 노드로 출력되도록 만들었습니다.
코드 혹은 오류
import torch
from torchvision import datasets as dsets
import torchvision.transforms as transforms
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fixed seed so the random weight initialisation is repeatable between runs.
torch.manual_seed(42)
if device == 'cuda':
    torch.cuda.manual_seed_all(42)
    # Report the active device index, the number of devices, and the name of device 0.
    print(torch.cuda.current_device(), torch.cuda.device_count(), torch.cuda.get_device_name(0))
def sigmoid(X):
    """Apply the logistic function ``1 / (1 + exp(-X))`` element-wise.

    Args:
        X: Tensor of any shape.

    Returns:
        Tensor with the same shape as ``X`` and values between 0 and 1.
    """
    # The built-in op evaluates the whole expression in one call instead of
    # chaining separate negate / exp / add / divide tensor operations.
    return torch.sigmoid(X)
def softmax(X):
    """Normalise ``X`` along dimension 0 with the softmax function.

    Args:
        X: Tensor of logits. For a 1-D tensor the whole vector is normalised;
            for higher ranks each slice along dimension 0 is normalised.

    Returns:
        Tensor with the same shape as ``X`` whose entries along dimension 0
        are non-negative and sum to 1.
    """
    # torch.softmax only accepts floating-point input, so promote integer
    # tensors to the default float dtype first.
    if not torch.is_floating_point(X):
        X = X.to(torch.get_default_dtype())
    # A naive exp(X) / sum(exp(X)) overflows to inf / inf = nan for large
    # logits; the built-in implementation does not.
    return torch.softmax(X, dim=0)
def dsigmoid(X):
    """Return the element-wise derivative of the logistic function at ``X``.

    Uses the identity ``s'(x) = s(x) * (1 - s(x))``, which equals
    ``exp(-x) / (1 + exp(-x)) ** 2``.

    Args:
        X: Tensor of pre-activation values, any shape.

    Returns:
        Tensor with the same shape as ``X``.
    """
    # Evaluate the sigmoid once and reuse it for both factors.
    activation = torch.sigmoid(X)
    return activation * (1 - activation)
def dsoftmax(X):
    """Return the diagonal of the softmax Jacobian, ``s * (1 - s)``.

    Only the terms ``d s_i / d x_i`` are returned. The cross terms
    ``d s_i / d x_j = -s_i * s_j`` (for ``i != j``) are not included, so this
    is not the complete derivative of softmax.

    Args:
        X: Tensor of logits; softmax is taken along dimension 0.

    Returns:
        Tensor with the same shape as ``X``.
    """
    # torch.softmax only accepts floating-point input.
    if not torch.is_floating_point(X):
        X = X.to(torch.get_default_dtype())
    # Evaluate the softmax once and reuse it for both terms.
    probs = torch.softmax(X, dim=0)
    return probs - torch.pow(probs, 2)
# Step size applied to every weight and bias update in model.backward.
lr = 1e-3
class model:
    """Fully connected 784-16-16-10 network trained with hand-written backprop.

    The two hidden layers use the sigmoid activation, the output layer uses
    softmax, and the loss is the summed squared error against the target.
    ``backward`` works on a single sample: ``X`` is a 1-D tensor of 784
    values and ``Y`` a 1-D tensor of 10 target values.

    Attributes:
        W: List of weight matrices with shapes (784, 16), (16, 16), (16, 10).
        B: List of bias vectors with 16, 16 and 10 entries.
    """

    def __init__(self):
        """Draw all weights and biases from a normal distribution (mean 0, std 2)."""
        layer_sizes = (784, 16, 16, 10)
        self.W = [
            torch.normal(torch.zeros((n_in, n_out)), 2.0).to(device)
            for n_in, n_out in zip(layer_sizes, layer_sizes[1:])
        ]
        self.B = [
            torch.normal(torch.zeros(n_out), 2.0).to(device)
            for n_out in layer_sizes[1:]
        ]

    def _activations(self, X):
        """Run the forward pass and return the activations of all three layers."""
        hidden1 = sigmoid(torch.matmul(X, self.W[0]) + self.B[0])
        hidden2 = sigmoid(torch.matmul(hidden1, self.W[1]) + self.B[1])
        output = softmax(torch.matmul(hidden2, self.W[2]) + self.B[2])
        return hidden1, hidden2, output

    def forward(self, X):
        """Return the softmax output of the network for input ``X``."""
        return self._activations(X)[2]

    def loss(self, X, Y):
        """Return the summed squared error between ``forward(X)`` and ``Y``.

        This is a sum, not a mean; dividing by the tensor's ``numel()`` would
        give the per-element average.
        """
        return torch.sum(torch.pow(self.forward(X) - Y, 2))

    def backward(self, X, Y):
        """Take one gradient-descent step on the sample ``(X, Y)``.

        Computes the gradient of ``loss(X, Y)`` with respect to every weight
        and bias, then replaces ``self.W`` / ``self.B`` entries with the
        values moved by ``-lr * gradient``. Returns ``None``.

        Args:
            X: 1-D input tensor with 784 entries.
            Y: 1-D target tensor with 10 entries.
        """
        # One forward pass; its activations are reused for every gradient.
        hidden1, hidden2, output = self._activations(X)

        # d loss / d output for loss = sum((output - Y) ** 2).
        grad_output = 2 * (output - Y)

        # Softmax couples all outputs, so the full Jacobian
        # diag(s) - s s^T is required, not just its diagonal:
        #   delta_j = s_j * (g_j - sum_i g_i * s_i)
        delta3 = output * (grad_output - torch.sum(grad_output * output))

        # For a sigmoid layer the derivative is a * (1 - a), taken directly
        # from the stored activation.
        delta2 = torch.matmul(delta3, self.W[2].t()) * hidden2 * (1 - hidden2)
        delta1 = torch.matmul(delta2, self.W[1].t()) * hidden1 * (1 - hidden1)

        # All deltas are computed before any parameter changes, so each layer's
        # gradient is based on the same (pre-update) weights.
        layer_inputs = (X, hidden1, hidden2)
        deltas = (delta1, delta2, delta3)
        for k, (layer_input, delta) in enumerate(zip(layer_inputs, deltas)):
            # The weight gradient is the outer product of the layer input and
            # its delta; the bias gradient is the delta itself.
            self.W[k] = self.W[k] - lr * torch.outer(layer_input, delta)
            self.B[k] = self.B[k] - lr * delta
# Options shared by both splits: the download directory, conversion of each
# image to a tensor, and downloading enabled.
_mnist_common = dict(root='MNIST_data/', transform=transforms.ToTensor(), download=True)

# train=True selects the training data, train=False the test data.
mnist_train = dsets.MNIST(train=True, **_mnist_common)
mnist_test = dsets.MNIST(train=False, **_mnist_common)
def one_hot_encoding(x, num_classes=10):
    """Return a one-hot list for class index ``x``.

    Args:
        x: Class index, ``0 <= x < num_classes``.
        num_classes: Length of the returned list. Defaults to 10.

    Returns:
        A list of ``num_classes`` ints, all 0 except a 1 at position ``x``.

    Raises:
        IndexError: If ``x`` is outside ``[0, num_classes)``.
    """
    # Reject negative labels explicitly: plain list indexing would wrap them
    # around and silently mark the wrong class.
    if not 0 <= x < num_classes:
        raise IndexError(f"class index {x} out of range for {num_classes} classes")
    encoded = [0] * num_classes
    encoded[x] = 1
    return encoded
def _stack_split(dataset):
    """Flatten one dataset split into an image matrix and a one-hot label matrix.

    Args:
        dataset: Indexable dataset whose items are ``(image, label)`` pairs,
            where ``image`` holds 784 values and ``label`` is a class index.

    Returns:
        A pair ``(X, Y)`` on ``device``: ``X`` has one 784-wide row per
        sample, ``Y`` has the matching one-hot rows.
    """
    images = []
    labels = []
    # Fetch each sample once and keep both parts: every dataset[i] access
    # applies the dataset's transform, so indexing separately for the image
    # and for the label would do that work twice.
    for index in range(len(dataset)):
        image, label = dataset[index]
        images.append(image.reshape(1, 784))
        labels.append(one_hot_encoding(label))
    return torch.cat(images, dim=0).to(device), torch.tensor(labels).to(device)


mnist_test_X, mnist_test_Y = _stack_split(mnist_test)
mnist_train_X, mnist_train_Y = _stack_split(mnist_train)
M = model()
for tr in range(100):
    # Running total of the per-sample losses for this epoch, kept on `device`.
    loss_sum = torch.zeros(1, dtype=torch.float, device=device)
    for sample, target in zip(mnist_train_X, mnist_train_Y):
        # Update the parameters on this sample first, then record the loss
        # the updated model gives for the same sample.
        M.backward(sample, target)
        loss_sum += M.loss(sample, target)
    # One line per epoch: the epoch index and the accumulated loss.
    print(tr, loss_sum)
환경
사용중인 운영체제, 언어, 라이브러리의 버전을 적어주세요.
Python 3.7.11
pytorch 1.9.0+cu102
GPU: nvidia T4
CPU: Intel(R) Xeon(R) CPU @ 2.30GHz(46080 KB cache) * 2
시도해본 방법
인터넷을 검색해 보니 nvidia의 T4는 다른 GPU보다 float 연산 성능은 떨어지지만 integer 연산 성능은 더 좋은 것으로 보아, float 연산보다는 4bit integer 연산에 특화된 기종인 것 같았습니다. 그래서 float 텐서로 만들어진 제 모델에서 계산이 느려지는 것이 아닌가 생각됩니다.
Metadata
Metadata
Assignees
Labels
ML / DL (machine learning, deep learning)