I tried to build a machine learning model using the CIFAR-10 dataset, but I have run into a bug: in every epoch the training loop stops once i reaches 78 (the last value printed by print(i) is 78; see the code below).
Python
x
61
61
1
import torch
2
import torchvision.transforms as transforms
3
from torchvision.datasets import CIFAR10
4
from torchvision.transforms import ToTensor
5
from torch.utils.data.dataloader import DataLoader
6
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5,
7
0.5, 0.5))])
8
classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')
9
10
train_dataset = CIFAR10(root = './data', train = True, download = True, transform = transform)
11
train_loader = DataLoader(train_dataset, batch_size = 4, shuffle = True, num_workers = 2)
12
13
test_dataset = CIFAR10(root = './data', train = False, download = True, transform = transform)
14
test_loader = DataLoader(test_dataset, batch_size = 128, shuffle = False, num_workers = 2)
15
import torch.nn as nn
16
import torch.nn.functional as F
17
18
19
class Net(nn.Module):
20
def __init__(self):
21
super(Net, self).__init__()
22
self.conv1 = nn.Conv2d(3, 6, 5)
23
self.pool = nn.MaxPool2d(2, 2)
24
self.conv2 = nn.Conv2d(6, 16, 5)
25
self.fc1 = nn.Linear(16 * 5 * 5, 120)
26
self.fc2 = nn.Linear(120, 84)
27
self.fc3 = nn.Linear(84, 10)
28
29
def forward(self, x):
30
x = self.pool(F.relu(self.conv1(x)))
31
x = self.pool(F.relu(self.conv2(x)))
32
x = x.view(-1, 16 * 5 * 5)
33
x = F.relu(self.fc1(x))
34
x = F.relu(self.fc2(x))
35
x = self.fc3(x)
36
return x
37
38
39
net = Net()
40
41
optimiser = torch.optim.SGD(model.parameters(), lr = 0.001, momentum=0.9)
42
loss_fn = nn.CrossEntropyLoss()
43
for epoch in range(2):
44
running_loss = 0
45
for i, data in enumerate(test_loader, 0):
46
images, labels = data
47
outputs = model(images)
48
49
loss = loss_fn(outputs, labels)
50
51
optimiser.zero_grad()
52
loss.backward()
53
optimiser.step()
54
55
running_loss += loss.item()
56
print(i)
57
if i % 2000 == 1999: # print every 2000 mini-batches
58
print('[%d, %5d] loss: %.3f' %
59
(epoch + 1, i + 1, running_loss / 2000))
60
running_loss = 0
61
Sorry, I had to post the entire code because I cannot spot the mistake I made. Moreover, since I could not make it work, I tried copying the tutorial’s exact code, and it works as intended! I am posting that code below as well:
Python
1
77
77
1
import torch
2
import torchvision
3
import torchvision.transforms as transforms
4
5
transform = transforms.Compose(
6
[transforms.ToTensor(),
7
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
8
9
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
10
download=True, transform=transform)
11
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
12
shuffle=True, num_workers=2)
13
14
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
15
download=True, transform=transform)
16
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
17
shuffle=False, num_workers=2)
18
19
classes = ('plane', 'car', 'bird', 'cat',
20
'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
21
22
import torch.nn as nn
23
import torch.nn.functional as F
24
25
26
class Net(nn.Module):
27
def __init__(self):
28
super(Net, self).__init__()
29
self.conv1 = nn.Conv2d(3, 6, 5)
30
self.pool = nn.MaxPool2d(2, 2)
31
self.conv2 = nn.Conv2d(6, 16, 5)
32
self.fc1 = nn.Linear(16 * 5 * 5, 120)
33
self.fc2 = nn.Linear(120, 84)
34
self.fc3 = nn.Linear(84, 10)
35
36
def forward(self, x):
37
x = self.pool(F.relu(self.conv1(x)))
38
x = self.pool(F.relu(self.conv2(x)))
39
x = x.view(-1, 16 * 5 * 5)
40
x = F.relu(self.fc1(x))
41
x = F.relu(self.fc2(x))
42
x = self.fc3(x)
43
return x
44
45
46
net = Net()
47
48
import torch.optim as optim
49
50
criterion = nn.CrossEntropyLoss()
51
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
52
53
for epoch in range(2): # loop over the dataset multiple times
54
55
running_loss = 0.0
56
for i, data in enumerate(trainloader, 0):
57
# get the inputs; data is a list of [inputs, labels]
58
inputs, labels = data
59
60
# zero the parameter gradients
61
optimizer.zero_grad()
62
63
# forward + backward + optimize
64
outputs = net(inputs)
65
loss = criterion(outputs, labels)
66
loss.backward()
67
optimizer.step()
68
69
# print statistics
70
running_loss += loss.item()
71
if i % 2000 == 1999: # print every 2000 mini-batches
72
print('[%d, %5d] loss: %.3f' %
73
(epoch + 1, i + 1, running_loss / 2000))
74
running_loss = 0.0
75
76
print('Finished Training')
77
Please help me find the bug!
Advertisement
Answer
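Look at your main loop. You’ll notice you are iterating over test_loader instead of train_loader. The test set has only 10,000 images and test_loader uses batch_size = 128, so it yields ceil(10000 / 128) = 79 batches per epoch, which is why i never goes past 78. With train_loader (50,000 images, batch_size = 4) the loop runs 12,500 iterations per epoch, as in the tutorial.
This part of your code: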
Python
1
6
1
for epoch in range(2):
2
running_loss = 0
3
for i, data in enumerate(test_loader, 0):
4
images, labels = data
5
outputs = model(images)
6
should look like this:
Python
1
6
1
for epoch in range(2):
2
running_loss = 0
3
for i, data in enumerate(train_loader, 0):
4
images, labels = data
5
outputs = model(images)
6