9. A Deep Dive into the Source Code of PyTorch's nn.Sequential and ModuleList
Contents
- 1. train & eval
- 2. Computing Derivatives
- 3. Parameter Updates
- 4. ModuleList and Sequential
- 5. Parameter, ParameterList & ParameterDict
1. train & eval
- train mode: the network's training mode; the model learns from training samples and its weights are updated.
- eval mode: the network's evaluation (inference) mode; Dropout layers are disabled and BatchNorm layers use their accumulated running statistics instead of per-batch statistics (see the BatchNorm sketch at the end of this section).
Reference: adapted from material by other authors.
- Testing the nn.Dropout module
import torch
import torch.nn as nn
torch.manual_seed(23242)
# Create a Dropout layer
dropout = nn.Dropout(p=0.3)
# Input tensor
x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
# In training mode
dropout.train()
print("Training mode:\n", dropout(x))
# In evaluation mode
dropout.eval()
print("Evaluation mode:\n", dropout(x))
- Output (surviving entries are scaled by 1/(1 - p) = 1/0.7 ≈ 1.4286, dropped entries become 0):
Training mode:
tensor([[1.4286, 0.0000, 4.2857],
[5.7143, 7.1429, 8.5714]])
Evaluation mode:
tensor([[1., 2., 3.],
[4., 5., 6.]])
- NumPy version
import numpy as np
np.random.seed(2323)
def dropout(x, p=0.5, training=True):
"""
实现 Dropout 的功能。
参数:
- x: 输入数据,numpy 数组。
- p: Dropout 概率,即随机置零的比例。
- training: 是否处于训练模式。
返回:
- 应用 Dropout 后的数组。
"""
if not training:
return x # 如果是评估模式,直接返回原始输入
# 生成与输入同形状的随机二值 mask
mask = np.random.binomial(1, 1 - p, size=x.shape)
# 按照 Dropout 的规则应用 mask,并进行缩放
return (x * mask) / (1 - p)
# Example input
x = np.array([[1.0, 2.0, 3.0],
[4.0, 5.0, 6.0]])
# In training mode
x_dropout_train = dropout(x, p=0.3, training=True)
print("Training mode:")
print(x_dropout_train)
# In evaluation mode
x_dropout_eval = dropout(x, p=0.3, training=False)
print("\nEvaluation mode:")
print(x_dropout_eval)
- Output:
Training mode:
[[0. 0. 4.28571429]
[5.71428571 0. 8.57142857]]
Evaluation mode:
[[1. 2. 3.]
[4. 5. 6.]]
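The Dropout examples above cover one half of the train/eval difference; BatchNorm is the other common case. The following is a minimal sketch (not part of the original experiments): in train mode nn.BatchNorm1d normalizes with the current batch statistics and updates running_mean/running_var, while in eval mode it normalizes with the accumulated running statistics.
- python
import torch
import torch.nn as nn

torch.manual_seed(2323)

# Create a BatchNorm layer over 3 features
bn = nn.BatchNorm1d(num_features=3)

# Input tensor: 2 samples, 3 features
x = torch.tensor([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0]])

# Training mode: normalize with the batch mean/var and update running stats
bn.train()
print("Training mode:\n", bn(x))
print("running_mean:", bn.running_mean)

# Evaluation mode: normalize with the accumulated running statistics
bn.eval()
print("Evaluation mode:\n", bn(x))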
2. Computing Derivatives
\begin{equation} y=x^2+3x\to \frac{\partial y}{\partial x}=2x+3 \end{equation}
\begin{equation} x=[1,2,3]\to \frac{\partial y}{\partial x}=2x+3=[5,7,9] \end{equation}
- Python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName :requre_grad_test.py
# @Time :2024/11/24 15:51
# @Author :Jason Zhang
import torch
from torch import nn
if __name__ == "__main__":
run_code = 0
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x ** 2 + 3*x
z = y.sum()
x_grad = x.requires_grad
y_grad = y.requires_grad
z_grad = z.requires_grad
print(f"x_grad={x_grad}")
print(f"y_grad={y_grad}")
print(f"z_grad={z_grad}")
z.backward()
x_grad_value = x.grad
print(f"x={x}")
print(f"x_grad_value={x_grad_value}")
- Output:
x_grad=True
y_grad=True
z_grad=True
x=tensor([1., 2., 3.], requires_grad=True)
x_grad_value=tensor([5., 7., 9.])
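The same derivative can be checked without mutating x.grad by calling torch.autograd.grad, which returns the gradients directly. A minimal sketch, reusing y = x² + 3x from above:
- python
import torch

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x ** 2 + 3 * x

# torch.autograd.grad returns a tuple of gradients w.r.t. the inputs
(grad_x,) = torch.autograd.grad(y.sum(), x)
print(grad_x)     # tensor([5., 7., 9.])
print(2 * x + 3)  # analytic result 2x + 3, matches the autograd output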
3. Parameter Updates
- python
import torch
import torch.nn as nn

# Create a linear regression model
model = nn.Linear(2, 1)  # 2 input features, 1 output feature

# Define the inputs and labels
x1 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
y1 = torch.tensor([[3.0], [7.0]])

# Define the loss function
loss_func = nn.MSELoss()

# Set the learning rate
learning_rate = 0.01

# Training loop
epoch = 0  # count the training iterations
while True:
    epoch += 1
    # Forward pass
    output = model(x1)
    # Compute the loss
    loss = loss_func(output, y1)
    # Print the current loss, weight and bias
    print(f"Epoch {epoch}: Loss = {loss.item()}")
    print(f" Weight: {model.weight.data.numpy()}")
    print(f" Bias: {model.bias.data.numpy()}")
    # Stop once the loss is small enough
    if loss.item() < 0.01:
        print("Training finished!")
        break
    # Backward pass
    loss.backward()
    # Update the model parameters by hand (plain gradient descent)
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
    # Zero the gradients
    model.zero_grad()
- Output (abridged; intermediate epochs omitted):
Epoch 1: Loss = 27.009794235229492
Weight: [[-0.5681111 0.4556529]]
Bias: [0.02086246]
Epoch 2: Loss = 12.700897216796875
Weight: [[-0.33592594 0.78280616]]
Bias: [0.11583048]
Epoch 3: Loss = 6.009862899780273
Weight: [[-0.17655943 1.0063248 ]]
Bias: [0.17998254]
Epoch 4: Loss = 2.8807597160339355
Weight: [[-0.06698827 1.1589792 ]]
Bias: [0.22306578]
Epoch 5: Loss = 1.4171319007873535
Weight: [[0.00853085 1.2631778 ]]
Bias: [0.25174525]
Epoch 300: Loss = 0.014076205901801586
Weight: [[0.5034938 1.3849493]]
Bias: [-0.12144651]
Epoch 301: Loss = 0.013969571329653263
Weight: [[0.5041094 1.3847572]]
Bias: [-0.12225428]
Epoch 302: Loss = 0.013863733038306236
Weight: [[0.5047226 1.3845657]]
Bias: [-0.123059]
Epoch 303: Loss = 0.01375875249505043
Weight: [[0.5053335 1.384375 ]]
Bias: [-0.12386066]
Epoch 304: Loss = 0.013654493726789951
Weight: [[0.50594205 1.384185 ]]
Bias: [-0.12465928]
Epoch 305: Loss = 0.013551048003137112
Weight: [[0.5065483 1.3839957]]
Bias: [-0.12545489]
Epoch 306: Loss = 0.01344841904938221
Weight: [[0.50715226 1.3838071 ]]
Bias: [-0.12624745]
Epoch 307: Loss = 0.013346527703106403
Weight: [[0.50775397 1.3836192 ]]
Bias: [-0.12703702]
Epoch 308: Loss = 0.013245439156889915
Weight: [[0.50835335 1.383432 ]]
Bias: [-0.12782359]
Epoch 309: Loss = 0.013145080767571926
Weight: [[0.5089505 1.3832456]]
Bias: [-0.12860717]
Epoch 310: Loss = 0.013045486062765121
Weight: [[0.5095453 1.3830599]]
Bias: [-0.12938778]
Epoch 311: Loss = 0.012946678325533867
Weight: [[0.5101379 1.3828748]]
Bias: [-0.13016543]
Epoch 312: Loss = 0.012848559767007828
Weight: [[0.5107283 1.3826905]]
Bias: [-0.13094012]
Epoch 313: Loss = 0.01275124866515398
Weight: [[0.51131636 1.3825068 ]]
Bias: [-0.13171189]
Epoch 314: Loss = 0.012654653750360012
Weight: [[0.5119023 1.3823239]]
Bias: [-0.13248071]
Epoch 315: Loss = 0.01255879271775484
Weight: [[0.5124859 1.3821416]]
Bias: [-0.13324663]
Epoch 316: Loss = 0.012463669292628765
Weight: [[0.51306736 1.38196 ]]
Bias: [-0.13400963]
Epoch 317: Loss = 0.012369243428111076
Weight: [[0.5136466 1.3817792]]
Bias: [-0.13476974]
Epoch 318: Loss = 0.012275544926524162
Weight: [[0.51422364 1.3815991 ]]
Bias: [-0.13552696]
Epoch 319: Loss = 0.012182533740997314
Weight: [[0.51479846 1.3814195 ]]
Bias: [-0.1362813]
Epoch 320: Loss = 0.01209024153649807
Weight: [[0.51537114 1.3812407 ]]
Bias: [-0.13703278]
Epoch 321: Loss = 0.011998665519058704
Weight: [[0.5159416 1.3810626]]
Bias: [-0.13778141]
Epoch 322: Loss = 0.01190776564180851
Weight: [[0.51650995 1.3808851 ]]
Bias: [-0.1385272]
Epoch 323: Loss = 0.011817571707069874
Weight: [[0.51707613 1.3807083 ]]
Bias: [-0.13927016]
Epoch 324: Loss = 0.011728014796972275
Weight: [[0.5176402 1.3805323]]
Bias: [-0.1400103]
Epoch 325: Loss = 0.011639159172773361
Weight: [[0.51820207 1.3803568 ]]
Bias: [-0.14074764]
Epoch 326: Loss = 0.01155102625489235
Weight: [[0.5187618 1.380182 ]]
Bias: [-0.14148217]
Epoch 327: Loss = 0.011463488452136517
Weight: [[0.5193194 1.3800079]]
Bias: [-0.14221393]
Epoch 328: Loss = 0.011376669630408287
Weight: [[0.51987493 1.3798344 ]]
Bias: [-0.14294289]
Epoch 329: Loss = 0.011290469206869602
Weight: [[0.52042836 1.3796616 ]]
Bias: [-0.1436691]
Epoch 330: Loss = 0.011204947717487812
Weight: [[0.52097964 1.3794894 ]]
Bias: [-0.14439255]
Epoch 331: Loss = 0.011120039038360119
Weight: [[0.52152884 1.379318 ]]
Bias: [-0.14511324]
Epoch 332: Loss = 0.011035802774131298
Weight: [[0.52207595 1.3791472 ]]
Bias: [-0.14583121]
Epoch 333: Loss = 0.010952199809253216
Weight: [[0.522621 1.378977]]
Bias: [-0.14654645]
Epoch 334: Loss = 0.010869231075048447
Weight: [[0.523164 1.3788074]]
Bias: [-0.14725898]
Epoch 335: Loss = 0.010786919854581356
Weight: [[0.5237049 1.3786385]]
Bias: [-0.14796881]
Epoch 336: Loss = 0.010705174878239632
Weight: [[0.5242438 1.3784703]]
Bias: [-0.14867595]
Epoch 337: Loss = 0.010624050162732601
Weight: [[0.5247806 1.3783027]]
Bias: [-0.1493804]
Epoch 338: Loss = 0.010543602518737316
Weight: [[0.52531534 1.3781357 ]]
Bias: [-0.15008219]
Epoch 339: Loss = 0.010463712736964226
Weight: [[0.5258481 1.3779694]]
Bias: [-0.15078129]
Epoch 340: Loss = 0.010384460911154747
Weight: [[0.5263788 1.3778037]]
Bias: [-0.15147775]
Epoch 341: Loss = 0.010305758565664291
Weight: [[0.5269075 1.3776386]]
Bias: [-0.15217157]
Epoch 342: Loss = 0.010227718390524387
Weight: [[0.52743423 1.3774741 ]]
Bias: [-0.15286276]
Epoch 343: Loss = 0.01015022024512291
Weight: [[0.5279589 1.3773103]]
Bias: [-0.15355131]
Epoch 344: Loss = 0.010073349811136723
Weight: [[0.52848166 1.3771471 ]]
Bias: [-0.15423726]
Epoch 345: Loss = 0.009997041895985603
Weight: [[0.52900237 1.3769845 ]]
Bias: [-0.15492061]
Training finished!
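The manual update loop above is exactly what torch.optim.SGD does internally: optimizer.step() applies param -= lr * param.grad, and optimizer.zero_grad() replaces model.zero_grad(). A minimal sketch of the same training loop rewritten with an optimizer (not the original run, so the printed epoch count will differ):
- python
import torch
import torch.nn as nn

model = nn.Linear(2, 1)
x1 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
y1 = torch.tensor([[3.0], [7.0]])
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

epoch = 0
while True:
    epoch += 1
    loss = loss_func(model(x1), y1)  # forward pass + loss
    if loss.item() < 0.01:
        print(f"Training finished at epoch {epoch}!")
        break
    optimizer.zero_grad()  # clear old gradients
    loss.backward()        # backward pass
    optimizer.step()       # SGD update: param -= lr * param.grad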
4. ModuleList and Sequential
- python
import torch
from torch import nn
from pytorch_model_summary import summary
torch.manual_seed(2323)
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.flatten = nn.Flatten()
self.block = nn.ModuleList([
nn.Linear(28 * 28, 512),
nn.ReLU(),
nn.Linear(512, 10)
])
def forward(self, x):
x = self.flatten(x)
for layer in self.block:
x = layer(x)
return x
class MyNewNet(MyModel):
def __init__(self):
super(MyNewNet, self).__init__()
        self.block.insert(2, nn.Linear(512, 256))  # insert a new linear layer
        self.block.insert(3, nn.ReLU())            # insert a new activation
        self.block.insert(4, nn.Linear(256, 512))  # insert another linear layer
        self.block.insert(5, nn.ReLU())            # insert another activation
class MyModelSequential(nn.Module):
def __init__(self):
super(MyModelSequential, self).__init__()
self.block = nn.Sequential(
nn.Flatten(),
nn.Linear(28 * 28, 512),
nn.ReLU(),
nn.Linear(512, 10)
)
def forward(self, x):
y = self.block(x)
return y
if __name__ == "__main__":
    # Test the original model
my_model = MyModel()
my_model_get_name = my_model._get_name()
print(f"my_model_get_name={my_model_get_name}")
print(f"str(my_model)=\n{str(my_model)}")
print(f"dir(my_model)=\n{dir(my_model)}")
my_model_modules = list(my_model.named_modules())
print(f"my_model_modules=\n{my_model_modules}")
print(f"*"*50)
print("Original Model:")
print(summary(my_model, torch.ones((1, 28, 28))))
    # Test the new model
my_new_model = MyNewNet()
print("\nNew Model:")
print(summary(my_new_model, torch.ones((1, 28, 28))))
my_sequential_model = MyModelSequential()
print("\nSequential Model:")
print(summary(my_sequential_model, torch.ones((1, 28, 28))))
- Output:
my_model_get_name=MyModel
str(my_model)=
MyModel(
(flatten): Flatten(start_dim=1, end_dim=-1)
(block): ModuleList(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=10, bias=True)
)
)
dir(my_model)=
['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_save_to_state_dict', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_version', 'add_module', 'apply', 'bfloat16', 'block', 'buffers', 'call_super_init', 'children', 'cpu', 'cuda', 'double', 'dump_patches', 'eval', 'extra_repr', 'flatten', 'float', 'forward', 'get_buffer', 'get_extra_state', 'get_parameter', 'get_submodule', 'half', 'ipu', 'load_state_dict', 'modules', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'parameters', 'register_backward_hook', 'register_buffer', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_module', 'register_parameter', 'register_state_dict_pre_hook', 'requires_grad_', 'set_extra_state', 'share_memory', 'state_dict', 'to', 'to_empty', 'train', 'training', 'type', 'xpu', 'zero_grad']
my_model_modules=
[('', MyModel(
(flatten): Flatten(start_dim=1, end_dim=-1)
(block): ModuleList(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=10, bias=True)
)
)), ('flatten', Flatten(start_dim=1, end_dim=-1)), ('block', ModuleList(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=10, bias=True)
)), ('block.0', Linear(in_features=784, out_features=512, bias=True)), ('block.1', ReLU()), ('block.2', Linear(in_features=512, out_features=10, bias=True))]
**************************************************
Original Model:
-----------------------------------------------------------------------
Layer (type) Output Shape Param # Tr. Param #
=======================================================================
Flatten-1 [1, 784] 0 0
Linear-2 [1, 512] 401,920 401,920
ReLU-3 [1, 512] 0 0
Linear-4 [1, 10] 5,130 5,130
=======================================================================
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
-----------------------------------------------------------------------
New Model:
-----------------------------------------------------------------------
Layer (type) Output Shape Param # Tr. Param #
=======================================================================
Flatten-1 [1, 784] 0 0
Linear-2 [1, 512] 401,920 401,920
ReLU-3 [1, 512] 0 0
Linear-4 [1, 256] 131,328 131,328
ReLU-5 [1, 256] 0 0
Linear-6 [1, 512] 131,584 131,584
ReLU-7 [1, 512] 0 0
Linear-8 [1, 10] 5,130 5,130
=======================================================================
Total params: 669,962
Trainable params: 669,962
Non-trainable params: 0
-----------------------------------------------------------------------
Sequential Model:
-----------------------------------------------------------------------
Layer (type) Output Shape Param # Tr. Param #
=======================================================================
Flatten-1 [1, 784] 0 0
Linear-2 [1, 512] 401,920 401,920
ReLU-3 [1, 512] 0 0
Linear-4 [1, 10] 5,130 5,130
=======================================================================
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
-----------------------------------------------------------------------
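A point worth emphasizing about the example above: nn.ModuleList (like nn.Sequential) registers its entries as submodules, so their parameters appear in model.parameters() and are trained; a plain Python list does not. A minimal sketch (the class names here are illustrative only):
- python
import torch
from torch import nn

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        # registered as submodules -> parameters are visible and trainable
        self.layers = nn.ModuleList([nn.Linear(4, 4), nn.Linear(4, 2)])

class WithPlainList(nn.Module):
    def __init__(self):
        super().__init__()
        # NOT registered -> parameters are invisible to PyTorch
        self.layers = [nn.Linear(4, 4), nn.Linear(4, 2)]

print(len(list(WithModuleList().parameters())))  # 4 (two weights + two biases)
print(len(list(WithPlainList().parameters())))   # 0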
5. Parameter, ParameterList & ParameterDict
To add a new tensor to a module and have it registered as a trainable parameter (tracked by the module and updated during training), wrap it with the nn.Parameter class.
- python:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName :ParametersTest.py
# @Time :2024/11/24 18:55
# @Author :Jason Zhang
import torch
from torch import nn
torch.manual_seed(4223)
class MyOriginalNet(nn.Module):
def __init__(self):
super(MyOriginalNet, self).__init__()
self.linear1 = nn.Linear(3, 4)
self.mytensor = torch.rand(4, 5)
def forward(self, x):
x = self.linear1(x)
return x
class MyParameterNet(nn.Module):
def __init__(self):
super(MyParameterNet, self).__init__()
self.linear1 = nn.Linear(3, 4)
self.my_tensor = nn.Parameter(torch.rand(4, 5))
def forward(self, x):
x = self.linear1(x)
return x
class MyParameterListNet(nn.Module):
def __init__(self):
super(MyParameterListNet, self).__init__()
self.linear1 = nn.Linear(3, 4)
self.my_tensor = nn.ParameterList(torch.rand(4, 5) for i in range(5))
def forward(self, x):
x = self.linear1(x)
return x
class MyParameterDictNet(nn.Module):
def __init__(self):
super(MyParameterDictNet, self).__init__()
self.linear1 = nn.Linear(3, 4)
self.para_dict = nn.ParameterDict({
'left': nn.Parameter(torch.randn(5, 10)),
'right': nn.Parameter(torch.randn(5, 10)),
})
def forward(self, x, choice):
x = self.para_dict[choice].mm(x)
return x
if __name__ == "__main__":
run_code = 0
my_net = MyOriginalNet()
my_original_list = list(my_net.named_parameters())
print(f"my_original_list={my_original_list}")
my_para = MyParameterNet()
my_para_list = list(my_para.named_parameters())
print(f"my_para_list={my_para_list}")
my_parameter_list = MyParameterListNet()
my_parameter_list_1 = list(my_parameter_list.named_parameters())
print(f"my_parameter_list_1={my_parameter_list_1}")
my_para_dict = MyParameterDictNet()
my_para_dict_list = list(my_para_dict.named_parameters())
print(f"my_para_dict_list={my_para_dict_list}")
- Output:
my_original_list=[('linear1.weight', Parameter containing:
tensor([[-0.5089, 0.1086, -0.3703],
[-0.0486, 0.5186, -0.0712],
[ 0.0793, 0.2897, -0.0913],
[-0.3404, 0.4358, 0.4464]], requires_grad=True)), ('linear1.bias', Parameter containing:
tensor([-0.2931, 0.2479, -0.2552, -0.5016], requires_grad=True))]
my_para_list=[('my_tensor', Parameter containing:
tensor([[0.3348, 0.5709, 0.6592, 0.7997, 0.2638],
[0.9706, 0.9607, 0.1166, 0.5584, 0.6739],
[0.4050, 0.1469, 0.6307, 0.0260, 0.3300],
[0.4974, 0.1928, 0.4940, 0.1852, 0.1085]], requires_grad=True)), ('linear1.weight', Parameter containing:
tensor([[-0.3481, -0.0894, -0.3917],
[ 0.1817, 0.2657, 0.5464],
[ 0.5769, 0.2373, -0.3456],
[-0.2842, -0.1695, 0.4080]], requires_grad=True)), ('linear1.bias', Parameter containing:
tensor([-0.3280, 0.3507, -0.4470, 0.0984], requires_grad=True))]
my_parameter_list_1=[('linear1.weight', Parameter containing:
tensor([[-0.1617, -0.1333, 0.2694],
[-0.3950, -0.5114, 0.0524],
[ 0.1599, 0.1093, -0.1124],
[ 0.4692, 0.1502, 0.4282]], requires_grad=True)), ('linear1.bias', Parameter containing:
tensor([ 0.5326, -0.0331, 0.4625, 0.4384], requires_grad=True)), ('my_tensor.0', Parameter containing:
tensor([[0.8858, 0.7725, 0.5914, 0.3476, 0.6177],
[0.6986, 0.8195, 0.8608, 0.9989, 0.1673],
[0.5300, 0.5413, 0.5605, 0.9120, 0.7765],
[0.8869, 0.3050, 0.5276, 0.8894, 0.4718]], requires_grad=True)), ('my_tensor.1', Parameter containing:
tensor([[0.3136, 0.8541, 0.6999, 0.2314, 0.2391],
[0.8063, 0.6426, 0.4157, 0.5995, 0.3899],
[0.2969, 0.5717, 0.6532, 0.3171, 0.0089],
[0.8649, 0.7355, 0.9127, 0.9174, 0.8873]], requires_grad=True)), ('my_tensor.2', Parameter containing:
tensor([[0.7101, 0.3629, 0.4214, 0.1725, 0.3527],
[0.5847, 0.5381, 0.2783, 0.5601, 0.4330],
[0.9955, 0.7466, 0.2785, 0.6853, 0.7695],
[0.5083, 0.2306, 0.2742, 0.7975, 0.6680]], requires_grad=True)), ('my_tensor.3', Parameter containing:
tensor([[0.2137, 0.9676, 0.3329, 0.4692, 0.3137],
[0.2052, 0.9377, 0.1016, 0.5990, 0.2175],
[0.8921, 0.0642, 0.9662, 0.7828, 0.7527],
[0.9349, 0.9536, 0.9047, 0.7468, 0.9529]], requires_grad=True)), ('my_tensor.4', Parameter containing:
tensor([[0.3828, 0.8569, 0.5692, 0.6235, 0.8658],
[0.8093, 0.0742, 0.8216, 0.4765, 0.9254],
[0.8145, 0.6869, 0.8607, 0.5033, 0.7655],
[0.8213, 0.7167, 0.3585, 0.6060, 0.9576]], requires_grad=True))]
my_para_dict_list=[('linear1.weight', Parameter containing:
tensor([[-0.4675, 0.2992, 0.3019],
[ 0.4830, -0.5492, -0.1147],
[-0.1806, 0.3557, -0.5125],
[ 0.0112, 0.0357, 0.5018]], requires_grad=True)), ('linear1.bias', Parameter containing:
tensor([ 0.2655, -0.0366, 0.3213, 0.5213], requires_grad=True)), ('para_dict.left', Parameter containing:
tensor([[-0.6760, 1.5255, -0.2469, -0.8515, 1.1868, 0.2585, 1.1479, -1.0624,
0.9016, -0.0555],
[ 1.6491, -0.1097, -0.6965, -0.7961, -0.1093, -0.3175, -0.7918, 0.1021,
1.4860, -1.2344],
[ 1.0438, 0.3725, 1.4540, 1.2188, 0.5695, 1.9102, 0.9539, -1.7334,
1.1806, 0.0480],
[-0.0024, -1.6298, 2.0577, 1.6975, 0.0825, 0.3063, -0.8376, -1.5045,
1.4113, 0.4588],
[-1.0440, -0.8490, -0.8787, -0.0418, -1.3182, 0.6033, -0.8973, 1.0968,
-0.6005, -0.3022]], requires_grad=True)), ('para_dict.right', Parameter containing:
tensor([[-2.5233, 0.7036, 0.8324, 1.6768, -1.2052, -0.0711, 0.5338, -0.4779,
0.0469, -0.0403],
[ 1.3852, -0.3172, -1.3008, 0.1408, -1.9336, -0.9055, -1.7066, -0.2712,
0.5037, 1.5622],
[ 0.1787, -1.3736, -0.9583, -1.8272, -0.6235, 0.7015, 0.3433, -0.4415,
1.0714, -0.7610],
[-0.4396, -0.1573, -0.0357, 0.1194, 1.0966, 1.4011, -1.3459, 1.3431,
0.0034, 2.3680],
[ 1.4473, -0.2111, -1.3596, 0.7426, -1.0766, 1.2246, -0.7292, 1.1326,
0.6913, -1.4395]], requires_grad=True))]
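As the output shows, the plain tensor attribute mytensor in MyOriginalNet appears nowhere in named_parameters(), and it would not be saved in state_dict() either. If a tensor should travel with the module (be saved and moved with .to()) but not be trained, register it as a buffer instead of a Parameter. A minimal sketch (the class name is illustrative only):
- python
import torch
from torch import nn

class MyBufferNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(3, 4)
        # saved in state_dict and moved with the module, but not a parameter
        self.register_buffer("my_tensor", torch.rand(4, 5))

    def forward(self, x):
        return self.linear1(x)

net = MyBufferNet()
print([name for name, _ in net.named_parameters()])  # only linear1.weight / linear1.bias
print(list(net.state_dict().keys()))                 # includes 'my_tensor'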