The main purpose of this article is to collect excerpts made while reading. In the field of machine learning, c7w has tried to get started from several directions and has trained a few models, but still lacks systematic, structured study. Hopefully reading this book will bring more gains.

Compared with some of the earlier notes, this article leans more toward practice, i.e. concretely improving my coding ability.

This part contains:

  1. {Finished} [5-5] Implement and train a CNN-style network for facial expression recognition; the focus is on building a small machine-learning training framework and then evaluating its accuracy.
  2. {Finished} [5-11, 5-12] Implement ResNet and DenseNet, paying attention to how the tensor dimensions are made to match.

[5-5] CNN

The main work here is writing out the usual model-training "wheel" end to end (training loop, dataset, model, evaluation), part copied and part built from scratch.

  • train.py
from toolkit.dataset import *
import torch
from toolkit.utils import get_device
from toolkit.net import *
from toolkit.procedure import *

# By c7w, created on 2022/1/27.

'''
Usage:
+ Define your model in toolkit/net.py
+ Define your dataset in toolkit/dataset.py
+ Define configuration in main.py
'''

device = get_device()
device = 'cpu'  # override the detected device: this run was done on CPU
print(f"Now on {device}")

# Configuration Here
config = {
    'epochs': 10000,
    'batch_size': 16,
    # 'optimizer' is filled in below, in the training stage
    'early_stop': 20,
    'save_path': 'save/model2-rms.pth'
}

if __name__ == "__main__":
    # Data Preparation Stage
    tr_data = Data('train')
    vd_data = Data('valid')
    tt_data = Data('test')

    tr_set = DataLoader(tr_data, config['batch_size'], shuffle=True, drop_last=False)
    vd_set = DataLoader(vd_data, config['batch_size'], shuffle=False, drop_last=False)
    tt_set = DataLoader(tt_data, config['batch_size'], shuffle=False, drop_last=False)
    
    # Training Stage
    model = LeNet().to(device)
    # config['optimizer'] = torch.optim.Adam(model.parameters())
    config['optimizer'] = torch.optim.RMSprop(model.parameters())
    model_loss, model_loss_record = train(tr_set, vd_set, model, config, device)

    # Test Stage
    del model
    model = LeNet().to(device)
    model.load_state_dict(torch.load(config['save_path']))

    acc = test(tt_set, model, device)
    print(acc)
  • toolkit/dataset.py
import torch
import random
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from icecream import ic
from PIL import Image

class Data(Dataset):
    def __init__(self, mode):
        self.mode = mode
        data = pd.read_csv("./project/data.csv")
        
        usage = "Test" if mode == "test" else "Training"
        data = data.loc[data.Usage == usage]
        
        features = []
        labels = []
        
        for _, row in data.iterrows():
            labels.append(  int(row['emotion']) )
            feature = [int(number) / 255 for number in row['pixels'].split(" ")]
            features.append(torch.Tensor(feature).view(1, 48, 48))
        
        self.data = list(zip(features, labels))
        random.shuffle(self.data)
        
        # im = Image.fromarray((self.data[0][0].view(48, 48) * 255).numpy())
        # im = im.convert('L')
        # ic(self.data[0][0], self.data[0][1])
        # im.show()
        
        # Hold out the last 10% of the shuffled data as the validation split;
        # the other modes keep the remaining 90%.
        l = len(self.data) // 10
        if mode == 'valid':
            self.data = self.data[-l : ]
        else:
            self.data = self.data[ : len(self.data) - l]
        
        print(f"Reading {mode} set finished with {len(self.data)} samples in total.")
        print("Example:")
        print(self.data[0])
        print("\n")
        
    def __getitem__(self, index):
        return self.data[index]
        
    def __len__(self):  
        return len(self.data)
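The loader assumes ./project/data.csv has three columns: emotion (an integer label in 0-6), pixels (a space-separated string of 48*48 gray values in 0-255), and Usage ("Training" or "Test"). A minimal sketch for generating a tiny dummy file in that layout, useful for smoke-testing the pipeline (the values are random, and the ./project directory is assumed to exist):

import random
import pandas as pd

# Hypothetical smoke-test helper: writes a tiny CSV in the layout Data() expects.
rows = []
for usage in ["Training", "Test"]:
    for _ in range(64):
        rows.append({
            "emotion": random.randint(0, 6),  # 7 expression classes
            "pixels": " ".join(str(random.randint(0, 255)) for _ in range(48 * 48)),
            "Usage": usage,
        })
pd.DataFrame(rows).to_csv("./project/data.csv", index=False)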
  • toolkit/net.py
import torch.nn as nn

class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        
        self.net = nn.Sequential(
            # Conv layers: 48x48 -> 24x24 -> 12x12 -> 6x6 (each MaxPool2d halves the spatial size)
            nn.Conv2d(1, 16, kernel_size=9, padding=4),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 32, kernel_size=5, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=5, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Flatten(),
            
            # FC layers: input is 64 channels * 6 * 6 spatial positions = 2304 features
            nn.Linear(64*6*6, 84),
            nn.BatchNorm1d(84),
            nn.ReLU(),
            nn.Linear(84, 7)
        )
        
        self.Loss = nn.CrossEntropyLoss()
        
    def forward(self, x):
        return self.net(x)
    
    def loss(self, pred, y):
        return self.Loss(pred, y).mean()
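A quick extra sanity check for the in_features of the first Linear layer: each of the three MaxPool2d layers halves the spatial size, 48 -> 24 -> 12 -> 6, with 64 channels at the end, hence 64*6*6 = 2304. A small sketch that traces the shapes:

import torch
from toolkit.net import LeNet

model = LeNet().eval()
x = torch.randn(2, 1, 48, 48)   # a dummy batch of two 48x48 grayscale images
conv_part = model.net[:12]      # the conv/pool stack, everything before nn.Flatten
print(conv_part(x).shape)       # torch.Size([2, 64, 6, 6])
print(model(x).shape)           # torch.Size([2, 7]), one logit per expression class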
  • toolkit/procedure.py
import torch
import numpy as np

def train(tr_set, vd_set, model, config, device):
    max_epoch_count = config['epochs']
    optimizer = config['optimizer']

    loss_record = {'train': [], 'valid': []}
    curr_min_loss = float('inf')
    early_stop_cnt = 0
    epoch = 0
    
    while epoch < max_epoch_count:
        model.train()
        for x, y in tr_set:
            optimizer.zero_grad()
            x, y = x.to(device), y.to(device)
            pred = model(x)
            
            l = model.loss(pred, y)
            l.backward()
            optimizer.step()
            
            loss_record['train'].append(l.detach().cpu().item())

        valid_loss = validate(vd_set, model, device)
        print('Epoch {:4d} completed, valid_loss = {:.4f}'.format(epoch + 1, valid_loss))

        if valid_loss < curr_min_loss:
            # Save the model whenever the validation loss improves
            curr_min_loss = valid_loss

            print('Saving model (epoch = {:4d}, loss = {:.4f})'.format(epoch + 1, curr_min_loss))
            torch.save(model.state_dict(), config['save_path'])  # Save model to the specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['valid'].append(valid_loss)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break
    
    print(f'Finished training after {epoch} epochs.')
    return curr_min_loss, loss_record

def validate(vd_set, model, device):
    model.eval()
    total_loss = 0
    for x, y in vd_set:
        x, y = x.to(device), y.to(device)
        with torch.no_grad():
            pred = model(x)
            vd_loss = model.loss(pred, y)
        total_loss += vd_loss.detach().cpu().item() * len(x)
    
    total_loss = total_loss / len(vd_set.dataset)              # compute averaged loss
    return total_loss

def test(tt_set, model, device):
    model.eval()
    total_right = 0
    for x, y in tt_set:
        x, y = x.to(device), y.to(device)
        
        with torch.no_grad():
            pred = model(x).detach().cpu().numpy()     
            for i, logit in enumerate(pred):
                if np.argmax(logit) == y[i]: total_right += 1
            

    acc = total_right / len(tt_set.dataset)              # compute accuracy
    return acc

def predict(tt_set, model, device):
    model.eval()
    preds = []
    for x in tt_set:  # expects an unlabeled loader that yields inputs only
        x = x.to(device)
        with torch.no_grad():
            pred = model(x)
            preds.append(pred.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()
    return preds
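train() returns the best validation loss together with loss_record, where 'train' holds one loss per optimization step and 'valid' one per epoch. A minimal plotting sketch for these curves, assuming matplotlib is available (not part of the toolkit):

import matplotlib.pyplot as plt

def plot_loss(loss_record):
    # One training-loss entry per step, one validation-loss entry per epoch
    steps_per_epoch = len(loss_record['train']) // max(len(loss_record['valid']), 1)
    plt.plot(loss_record['train'], label='train (per step)')
    plt.plot([i * steps_per_epoch for i in range(len(loss_record['valid']))],
             loss_record['valid'], label='valid (per epoch)')
    plt.xlabel('training step')
    plt.ylabel('loss')
    plt.legend()
    plt.show()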
  • toolkit/utils.py
import torch


def get_device():
    return 'cuda' if torch.cuda.is_available() else 'cpu'

def get_one_hot(k, dim):
    t = torch.zeros(size=(dim, ))
    t[k] = 1
    return t
  • Training results
Now on cpu
Reading train set finished with 25839 samples in total.
Example:
(tensor([[[0.2000, 0.1922, 0.2118,  ..., 0.1804, 0.2392, 0.2863],
         [0.1922, 0.1843, 0.1647,  ..., 0.2392, 0.2078, 0.2588],
         [0.1961, 0.1804, 0.1608,  ..., 0.2039, 0.1569, 0.2000],
         ...,
         [0.3098, 0.1961, 0.2353,  ..., 0.2980, 0.3255, 0.3412],
         [0.2706, 0.1647, 0.2471,  ..., 0.3098, 0.3294, 0.3373],
         [0.2392, 0.1686, 0.2157,  ..., 0.3255, 0.3333, 0.3490]]]), 3)


Reading valid set finished with 2870 samples in total.
Example:
(tensor([[[0.1725, 0.1333, 0.1451,  ..., 0.3176, 0.3216, 0.4510],
         [0.1333, 0.1333, 0.1255,  ..., 0.3137, 0.3216, 0.4471],
         [0.1333, 0.1216, 0.1137,  ..., 0.3137, 0.3216, 0.4471],
         ...,
         [0.3922, 0.3255, 0.4039,  ..., 0.4000, 0.4510, 0.5137],
         [0.3882, 0.3098, 0.3961,  ..., 0.3843, 0.4549, 0.5333],
         [0.4000, 0.2745, 0.3725,  ..., 0.3961, 0.4392, 0.5137]]]), 3)


Reading test set finished with 6461 samples in total.
Example:
(tensor([[[0.7373, 0.7608, 0.7255,  ..., 0.8549, 0.8157, 0.8275],
         [0.7529, 0.7608, 0.7137,  ..., 0.8588, 0.8314, 0.8157],
         [0.7804, 0.7451, 0.7137,  ..., 0.8510, 0.8353, 0.8157],
         ...,
         [0.4784, 0.5765, 0.5804,  ..., 0.6863, 0.5451, 0.4235],
         [0.2549, 0.3373, 0.4588,  ..., 0.5804, 0.5373, 0.5608],
         [0.3608, 0.3961, 0.6275,  ..., 0.5569, 0.3176, 0.6314]]]), 3)


Epoch    1 completed, tr_loss = 1.4529
Saving model (epoch =    1, loss = 1.4529)
Epoch    2 completed, tr_loss = 1.2995
Saving model (epoch =    2, loss = 1.2995)
Epoch    3 completed, tr_loss = 1.1131
Saving model (epoch =    3, loss = 1.1131)
Epoch    4 completed, tr_loss = 1.0275
Saving model (epoch =    4, loss = 1.0275)
Epoch    5 completed, tr_loss = 1.1062
Epoch    6 completed, tr_loss = 0.8805
Saving model (epoch =    6, loss = 0.8805)
Epoch    7 completed, tr_loss = 0.7520
Saving model (epoch =    7, loss = 0.7520)
Epoch    8 completed, tr_loss = 0.8390
Epoch    9 completed, tr_loss = 0.8715
Epoch   10 completed, tr_loss = 0.6758
Saving model (epoch =   10, loss = 0.6758)
Epoch   11 completed, tr_loss = 0.6634
Saving model (epoch =   11, loss = 0.6634)
Epoch   12 completed, tr_loss = 0.5063
Saving model (epoch =   12, loss = 0.5063)
Epoch   13 completed, tr_loss = 0.5055
Saving model (epoch =   13, loss = 0.5055)
Epoch   14 completed, tr_loss = 0.6266
Epoch   15 completed, tr_loss = 0.4653
Saving model (epoch =   15, loss = 0.4653)
Epoch   16 completed, tr_loss = 0.4373
Saving model (epoch =   16, loss = 0.4373)
Epoch   17 completed, tr_loss = 0.3892
Saving model (epoch =   17, loss = 0.3892)
Epoch   18 completed, tr_loss = 0.4048
Epoch   19 completed, tr_loss = 0.4376
Epoch   20 completed, tr_loss = 0.3657
Saving model (epoch =   20, loss = 0.3657)
Epoch   21 completed, tr_loss = 0.3765
Epoch   22 completed, tr_loss = 0.3329
Saving model (epoch =   22, loss = 0.3329)
Epoch   23 completed, tr_loss = 0.3969
Epoch   24 completed, tr_loss = 0.3382
Epoch   25 completed, tr_loss = 0.3283
Saving model (epoch =   25, loss = 0.3283)
Epoch   26 completed, tr_loss = 0.3192
Saving model (epoch =   26, loss = 0.3192)
Epoch   27 completed, tr_loss = 0.3671
Epoch   28 completed, tr_loss = 0.3457
Epoch   29 completed, tr_loss = 0.3352
Epoch   30 completed, tr_loss = 0.3461
Epoch   31 completed, tr_loss = 0.3258
Epoch   32 completed, tr_loss = 0.3097
Saving model (epoch =   32, loss = 0.3097)
Epoch   33 completed, tr_loss = 0.3976
Epoch   34 completed, tr_loss = 0.3364
Epoch   35 completed, tr_loss = 0.3275
Epoch   36 completed, tr_loss = 0.3179
Epoch   37 completed, tr_loss = 0.3415
Epoch   38 completed, tr_loss = 0.3471
Epoch   39 completed, tr_loss = 0.3302
Epoch   40 completed, tr_loss = 0.3407
Epoch   41 completed, tr_loss = 0.4045
Epoch   42 completed, tr_loss = 0.3310
Epoch   43 completed, tr_loss = 0.3626
Epoch   44 completed, tr_loss = 0.3288
Epoch   45 completed, tr_loss = 0.3600
Epoch   46 completed, tr_loss = 0.3866
Epoch   47 completed, tr_loss = 0.3613
Epoch   48 completed, tr_loss = 0.3402
Epoch   49 completed, tr_loss = 0.3562
Epoch   50 completed, tr_loss = 0.3674
Epoch   51 completed, tr_loss = 0.3733
Epoch   52 completed, tr_loss = 0.3461
Epoch   53 completed, tr_loss = 0.3542
Finished training after 53 epochs.
0.5304132487231079

[5-11, 5-12] ResNet & DenseNet

  • main.py

Randomly generate data simulating batch_size = 4, input_channels = 3, pic_size = 96x96, then feed it through the implemented network and inspect the result; if no error occurs, the dimensions line up correctly.

import torch
from ResNet import ResNet
from DenseNet import DenseNet
from icecream import ic as print  # use icecream's ic() in place of print for richer debug output

data = torch.randn(size=(4, 3, 96, 96))
# print(data)

# net = ResNet(3)
net = DenseNet(3)

print(net)
# print(net(data))
print(net(data).shape)
  • ResNet.py

This file implements ResNet-18.

import torch.nn as nn

# Implementation of ResNet-18

class Residual(nn.Module):
    
    # stride controls the output height/width; stride=2 halves the spatial size
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        
        # If in_channels != out_channels
        # Then use 1x1 conv layer to change channel size
        self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride) if in_channels != out_channels else None
        
        self.bn = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
        self.relu = nn.ReLU()
    
    def forward(self, x):
        y = self.conv1(x)
        y = self.bn(y)
        y = self.relu(y)
        y = self.conv2(y)
        y = self.bn2(y)
        
        if self.conv3:
            x = self.conv3(x)
        
        return self.relu(x + y)

def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    # The first residual of each stage (except the very first stage, which follows
    # the stride-2 max pool) halves H/W with stride 2 and switches the channel count.
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, 2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)

class ResNet(nn.Module):
    
    def __init__(self, in_channels):
        super().__init__()
        self.start = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        self.residual = nn.ModuleList([
            resnet_block(64, 64, 2, True),
            resnet_block(64, 128, 2),
            resnet_block(128, 256, 2),
            resnet_block(256, 512, 2),
        ])
        self.output = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512, 10),
        )

    
    def forward(self, x):
        x = self.start(x)
        for rb in self.residual:
            x = rb(x)
        x = x.mean(dim=(2, 3))   # global average pooling over the spatial dims -> (N, 512)
        x = self.output(x)
        return x
ic| net: ResNet(
           (start): Sequential(
             (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
             (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
             (2): ReLU()
             (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
           )
           (residual): ModuleList(
             (0): Sequential(
               (0): Residual(
                 (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (relu): ReLU()
               )
               (1): Residual(
                 (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (relu): ReLU()
               )
             )
             (1): Sequential(
               (0): Residual(
                 (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
                 (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (conv3): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2))
                 (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (relu): ReLU()
               )
               (1): Residual(
                 (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (relu): ReLU()
               )
             )
             (2): Sequential(
               (0): Residual(
                 (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
                 (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2))
                 (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (relu): ReLU()
               )
               (1): Residual(
                 (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (relu): ReLU()
               )
             )
             (3): Sequential(
               (0): Residual(
                 (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
                 (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2))
                 (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (relu): ReLU()
               )
               (1): Residual(
                 (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                 (relu): ReLU()
               )
             )
           )
           (output): Sequential(
             (0): Flatten(start_dim=1, end_dim=-1)
             (1): Linear(in_features=512, out_features=10, bias=True)
           )
         )
ic| net(data).shape: torch.Size([4, 10])
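To see how the dimensions work out for the 96x96 test input, the spatial size can be traced stage by stage. A small extra check (a sketch using the attributes defined above) that prints the intermediate shapes:

import torch
from ResNet import ResNet

net = ResNet(3).eval()
x = torch.randn(1, 3, 96, 96)
with torch.no_grad():
    x = net.start(x)
    print(x.shape)               # (1, 64, 24, 24): 96 -> 48 (stride-2 conv) -> 24 (stride-2 max pool)
    for rb in net.residual:
        x = rb(x)
        print(x.shape)           # (1, 64, 24, 24), (1, 128, 12, 12), (1, 256, 6, 6), (1, 512, 3, 3)
    x = x.mean(dim=(2, 3))       # global average pooling removes the spatial dims
    print(net.output(x).shape)   # (1, 10)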
  • DenseNet.py

This file implements DenseNet.

import torch
import torch.nn as nn

# Implementation of DenseNet

def conv_block(in_channels, out_channels):
    return nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
    )

class DenseBlock(nn.Module):    
    
    # out_channels here is the growth rate: each conv_block adds this many channels
    def __init__(self, num_convs, in_channels, out_channels):
        super().__init__()

        net = []
        for i in range(num_convs):
            in_c = in_channels + i * out_channels
            net.append(conv_block(in_c, out_channels))
        self.net = nn.ModuleList(net)
        self.out_channels = in_channels + num_convs * out_channels
    
    def forward(self, x):
        for blk in self.net:
            y = blk(x)
            x = torch.cat((x, y), dim = 1)
        return x

def transition_block(in_channels, out_channels):
    # Reduces the channel count with a 1x1 conv and halves H/W with average pooling
    return nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2)
    )

class DenseNet(nn.Module):
    
    def __init__(self, in_channels):
        super().__init__()
        self.start = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=7, padding=3, stride=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, padding=1, stride=2)
        )
        
        dense_list = []
        
        num_channels, growth_rate = 64, 32
        num_convs_in_dense_blocks = [4, 4, 4, 4]
        
        for i, num_convs in enumerate(num_convs_in_dense_blocks):
            DB = DenseBlock(num_convs, num_channels, growth_rate)
            num_channels = DB.out_channels
            
            dense_list.append(DB)
            
            if i != len(num_convs_in_dense_blocks) - 1:
                dense_list.append( transition_block(num_channels, num_channels // 2))
                num_channels = num_channels // 2
        
        self.dense = nn.ModuleList(dense_list)
        self.output = nn.Sequential(
            nn.BatchNorm2d(num_channels),
            nn.ReLU()
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(num_channels, 10)
        )
    
    def forward(self, x):
        x = self.start(x)
        for layer in self.dense:
            x = layer(x)
        x = self.output(x)
        x = x.mean(dim=(2, 3))   # global average pooling -> (N, num_channels)
        x = self.fc(x)
        return x
ic| net: DenseNet(
           (start): Sequential(
             (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
             (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
             (2): ReLU()
             (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
           )
           (dense): ModuleList(
             (0): DenseBlock(
               (net): ModuleList(
                 (0): Sequential(
                   (0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (1): Sequential(
                   (0): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(96, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (2): Sequential(
                   (0): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (3): Sequential(
                   (0): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(160, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
               )
             )
             (1): Sequential(
               (0): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
               (1): ReLU()
               (2): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1))
               (3): AvgPool2d(kernel_size=2, stride=2, padding=0)
             )
             (2): DenseBlock(
               (net): ModuleList(
                 (0): Sequential(
                   (0): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(96, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (1): Sequential(
                   (0): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (2): Sequential(
                   (0): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(160, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (3): Sequential(
                   (0): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(192, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
               )
             )
             (3): Sequential(
               (0): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
               (1): ReLU()
               (2): Conv2d(224, 112, kernel_size=(1, 1), stride=(1, 1))
               (3): AvgPool2d(kernel_size=2, stride=2, padding=0)
             )
             (4): DenseBlock(
               (net): ModuleList(
                 (0): Sequential(
                   (0): BatchNorm2d(112, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(112, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (1): Sequential(
                   (0): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(144, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (2): Sequential(
                   (0): BatchNorm2d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(176, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (3): Sequential(
                   (0): BatchNorm2d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(208, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
               )
             )
             (5): Sequential(
               (0): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
               (1): ReLU()
               (2): Conv2d(240, 120, kernel_size=(1, 1), stride=(1, 1))
               (3): AvgPool2d(kernel_size=2, stride=2, padding=0)
             )
             (6): DenseBlock(
               (net): ModuleList(
                 (0): Sequential(
                   (0): BatchNorm2d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(120, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (1): Sequential(
                   (0): BatchNorm2d(152, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(152, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (2): Sequential(
                   (0): BatchNorm2d(184, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(184, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
                 (3): Sequential(
                   (0): BatchNorm2d(216, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                   (1): ReLU()
                   (2): Conv2d(216, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                 )
               )
             )
           )
           (output): Sequential(
             (0): BatchNorm2d(248, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
             (1): ReLU()
           )
           (fc): Sequential(
             (0): Flatten(start_dim=1, end_dim=-1)
             (1): Linear(in_features=248, out_features=10, bias=True)
           )
         )
ic| net(data).shape: torch.Size([4, 10])
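The 248 channels entering the final BatchNorm2d/Linear follow directly from the channel bookkeeping: each dense block adds 4 * 32 = 128 channels and each transition block halves the count. A small arithmetic sketch of that bookkeeping:

num_channels, growth_rate = 64, 32
for i, num_convs in enumerate([4, 4, 4, 4]):
    num_channels += num_convs * growth_rate              # each DenseBlock adds num_convs * growth_rate channels
    print(f"after dense block {i}: {num_channels}")      # 192, 224, 240, 248
    if i != 3:
        num_channels //= 2                               # each transition_block halves the channel count
        print(f"after transition {i}:  {num_channels}")  # 96, 112, 120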