Implementing BatchNorm2d in NumPy
Assume the input x has shape [B, C, H, W].
1. Computing the per-channel mean
How should we understand np.mean(x, axis=(0, 2, 3), keepdims=True)? The explicit loops below compute the same thing:
import numpy as np

def cal_mean(x: np.ndarray):
    B, C, H, W = x.shape
    batch_mean = np.zeros((1, C, 1, 1))
    for c in range(C):
        total_sum = 0.0
        count = 0
        for b in range(B):
            for h in range(H):
                for w in range(W):
                    total_sum += x[b, c, h, w]
                    count += 1
        # in fact, count = B * H * W
        batch_mean[0, c, 0, 0] = total_sum / count
    return batch_mean
B, C, H, W = 4, 3, 8, 8
x = np.random.randint(-10, 10, (B, C, H, W))
mean0 = np.mean(x, axis=(0, 2, 3), keepdims=True)
mean1 = cal_mean(x)
print(np.allclose(mean0, mean1))  # True
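Another way to read axis=(0, 2, 3): move the channel axis to the front, flatten everything else, and average each row. A minimal sketch of that equivalent view, reusing x and mean0 from above (the names per_channel and mean2 are just for illustration):
# (C, B*H*W): each row collects every value belonging to one channel
per_channel = x.transpose(1, 0, 2, 3).reshape(C, -1)
mean2 = per_channel.mean(axis=1).reshape(1, C, 1, 1)
print(np.allclose(mean0, mean2))  # True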
2. Computing the per-channel variance
How should we understand np.var(x, axis=(0, 2, 3), keepdims=True)? Again, an explicit loop version:
def cal_variance(x: np.ndarray):
    batch_mean = cal_mean(x)
    B, C, H, W = x.shape
    batch_var = np.zeros((1, C, 1, 1))
    for c in range(C):
        sum_squared_diff = 0.0
        count = 0
        for b in range(B):
            for h in range(H):
                for w in range(W):
                    diff = x[b, c, h, w] - batch_mean[0, c, 0, 0]
                    sum_squared_diff += diff ** 2
                    count += 1
        # in fact, count = B * H * W
        batch_var[0, c, 0, 0] = sum_squared_diff / count
    return batch_var
B, C, H, W = 4, 3, 8, 8
x = np.random.randint(-10, 10, (B, C, H, W))
var0 = np.var(x, axis=(0, 2, 3), keepdims=True)
var1 = cal_variance(x)
print(np.allclose(var0, var1))  # True
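The variance is simply the mean of the squared deviations, so it can be written directly through the same axis=(0, 2, 3) reduction. Note that np.var divides by N = B*H*W by default (the biased estimator); pass ddof=1 for the unbiased one. A small check reusing x, var0 and cal_mean from above (var2 is just an illustrative name):
# Var[x] = E[(x - E[x])^2], with the same per-channel reduction
var2 = np.mean((x - cal_mean(x)) ** 2, axis=(0, 2, 3), keepdims=True)
print(np.allclose(var0, var2))  # True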
3. Computing BatchNorm2d in NumPy
class BatchNorm2d:
    def __init__(self, num_channels, epsilon=1e-5, momentum=0.1):
        self.epsilon = epsilon
        self.momentum = momentum
        # gamma (scale) and beta (shift) are the learnable parameters
        self.gamma = np.ones((1, num_channels, 1, 1))
        self.beta = np.zeros((1, num_channels, 1, 1))
        # running_mean and running_var are used at inference time
        self.running_mean = np.zeros((1, num_channels, 1, 1))
        self.running_var = np.ones((1, num_channels, 1, 1))

    def forward(self, x, training=True):
        if training:
            # per-channel mean and (biased) variance of the current batch
            batch_mean = np.mean(x, axis=(0, 2, 3), keepdims=True)
            batch_var = np.var(x, axis=(0, 2, 3), keepdims=True)
            # update running_mean / running_var for inference.
            # PyTorch's convention: new = (1 - momentum) * old + momentum * batch_stat,
            # where running_var is updated with the unbiased batch variance.
            n = x.shape[0] * x.shape[2] * x.shape[3]
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var * n / (n - 1)
            # x: [4,3,8,8], batch_mean: [1,3,1,1], batch_var: [1,3,1,1]
            x_normalized = (x - batch_mean) / np.sqrt(batch_var + self.epsilon)
        else:
            # at inference time, use the running mean and variance
            x_normalized = (x - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
        # scale and shift: after normalization, rescale by gamma and add the bias beta.
        # This is why a Conv2d placed right before BatchNorm is usually created with
        # bias=False: the BN layer already provides a bias.
        out = self.gamma * x_normalized + self.beta
        return out
import torch
import torch.nn as nn

batch, channels, height, width = 4, 3, 8, 8
x = np.random.randint(-10, 10, (batch, channels, height, width))
bn_customize = BatchNorm2d(num_channels=channels)
output0 = bn_customize.forward(x, training=True)

x_tensor = torch.tensor(x).float()
batchnorm = nn.BatchNorm2d(num_features=channels, eps=1e-5, momentum=0.1)
batchnorm.train()
output1 = batchnorm(x_tensor)
output2 = output1.detach().numpy()

# check whether the two outputs agree within 1e-4
comparison = np.allclose(output0, output2, atol=1e-4)
print("Are the outputs close enough?", comparison)
Partial output comparison:
output0[0,0,0:3,0:4]
array([[ 1.19182343, -0.03069952, -0.20534565, 0.84253116],
[ 1.19182343, 1.5411157 , -0.72928406, -1.4278686 ],
[-1.4278686 , -0.55463792, -1.07857633, 0.14394662]])
output1[0,0,0:3,0:4]
tensor([[ 1.1918, -0.0307, -0.2053, 0.8425],
[ 1.1918, 1.5411, -0.7293, -1.4279],
[-1.4279, -0.5546, -1.0786, 0.1439]], grad_fn=<SliceBackward0>)
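The running statistics can be checked the same way by switching both layers to inference mode after the single training step above. A minimal sketch, assuming the running-stat update in the custom class follows PyTorch's convention as written here (both layers have seen exactly one batch at this point):
# inference mode: both layers now normalize with their running statistics
batchnorm.eval()
output_eval_np = bn_customize.forward(x, training=False)
output_eval_pt = batchnorm(x_tensor).detach().numpy()
print(np.allclose(output_eval_np, output_eval_pt, atol=1e-4))  # expected True under these assumptions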