基本数据类型


类型检验

import torch

data = torch.randn(2, 3)   # 正态分布随机初始化2*3矩阵

print(data.type())

print(type(data)) # python自带type()函数

# 合法化检验
print(isinstance(data, torch.FloatTensor))

print(isinstance(data, torch.cuda.FloatTensor))
data = data.cuda() # 将数据搬运到GPU上面
print(isinstance(data, torch.cuda.FloatTensor))

张量和维度

# 标量 Dimension 0 (0阶张量); 如Loss
data = torch.tensor(1.)
print("data.shape:", data.shape)
print("data.size():", data.size())
print("data.ndimension():", data.ndimension())

print('----------------------------------------------')

# 向量 Dimension 1 (1阶张量); 如Bias,Linear Input
data = torch.tensor([1.1])
data = torch.tensor([1.1, 2.2])
# 随机初始化生成一维的长度为1、2的向量
data = torch.FloatTensor(1)
data = torch.FloatTensor(2)
print(data)
# 用numpy初始化
data = torch.from_numpy(np.ones(2))
print("data.shape:", data.shape)
print("data.size():", data.size())
print("data.ndimension():", data.ndimension())

print('----------------------------------------------')

# 矩阵 Dimension 2 (2阶张量); 如 Linear Input Batch,第一维度表示每个batch的样本数
data = torch.randn(2, 3)
print("data.shape:", data.shape)
print("data.shape[0]:", data.shape[0])
# 取shape的第0个元素,即张量第一维的元素个数
print("data.size(0):", data.size(0))
print("data.size(1):", data.size(1))
print("data.ndimension():", data.ndimension())

print('----------------------------------------------')

# Dimension 3 (3阶张量); 如RNN Input Batch
data = torch.randn(2, 2, 3)
print("data.shape:", data.shape)
print("data.size():", data.size())
print("data.ndimension():", data.ndimension())

# Dimension 4 (4阶张量); 如CNN
data = torch.randn(2, 3, 28, 28) # 2张照片,3个通道,28 * 28个像素
print(data)
print("data.shape:", data.shape)
print("data.size():", data.size())
print("data.ndimension():", data.ndimension())

# number of element
print(data.numel()) # 2*3*28*28 元素个数
print(data.dim()) # 张量维度

创建Tensor

通过list创建

# 通过list创建tensor
data = torch.tensor([2., 3.2])
data = torch.FloatTensor([2., 3.2]) # 尽量少用,容易混淆
data = torch.tensor([[2., 3.2], [1., 0.5]])

通过numpy创建

# 通过numpy创建tensor
a = np.array([2, 3.3])
data = torch.from_numpy(a)
a = np.ones([2, 3])
data = torch.from_numpy(a)

empty()

# 创建未初始化的tensor
data = torch.empty(2, 3)
print(data)

data = torch.FloatTensor(1, 2, 3)
print(data)

data = torch.IntTensor(1, 2, 3)
print(data)

set_default_tensor_type()

# 设置默认tensor类型
print(torch.tensor([1.2, 3]).type()) # 默认是FloatTensor
torch.set_default_tensor_type( torch.DoubleTensor)
print(torch.tensor([1.2, 3]).type())

rand/randn()

  • rand/rand_like; randint/randint_like

    # rand/rand_like; randint/randint_like
    a = torch.rand(3, 3) # 在[0,1]之间均匀采样
    data = torch.rand_like(a) # 接受的参数不再是shape,而是一个tensor
    a = torch.randint(1, 10, [3, 3]) # 在[1,10)之间均匀采样
    data = torch.randint_like(a, 1, 10)
  • randn

    # randn
    data = torch.randn(3, 3) # N(0,1)标准正态分布取样
    data = torch.normal(mean=3, std=1, size=(3, 3)) # N(3,1)正态分布取样
    print(data)
    # 让每一个值服从不同的正态分布
    data = torch.normal(mean=torch.arange(4.), std=torch.arange(1., 0.6, -0.1)).reshape(2, 2)
    print(data)

full()

# 创建初始化为同一个值的tensor
data = torch.full([2, 3], 6)
print(data)

arange()

# 创建等差数列的tensor
data = torch.arange(0, 10, 2) # [0, 10)步幅为2
data = torch.linspace(0, 10, steps=11) # [0, 10]等分为11份
data = torch.logspace(0, -1, steps=11) # 将[0,-1]11等分的值作为10的指数返回

zeros/ones/eye()

# 生成全0/1,对角的tensor
data = torch.zeros(3, 3)
data = torch.ones(3, 3)
data = torch.eye(3)

randperm()

# 随机打散,生成0~9的随机排列(一维tensor)
data = torch.randperm(10) # 不包含10
print(data)
# 协同shuffle洗牌
a = torch.rand(4, 3)
b = torch.rand(4, 2)
print(a, '\n', b)
idx = torch.randperm(4)
print(idx)
print(a[idx], '\n', b[idx])

索引与切片

四个维度

batch,channel,height,width

a = torch.rand(4, 3, 28, 28)
print(a[0].shape)
print(a[0, 0].shape)
print(a[0, 0, 0].shape)
print(a[0, 0, 0, 0].shape)
print(a[0, 1, 2, 4])

基本切片

# 切片 select first/last N
print(a[:2].shape)
print(a[:2, :1, :, :].shape)
print(a[:2, -3:, :, :].shape)
print(a[[0, 1, 3], -3:, :, :].shape)

隔点采样

# 隔点采样 select by steps
print(a[:, :, 0:28:2, 0:28:2].shape) # 步长为2采样
print(a[:, :, ::2, ::2].shape)

a.index_select()

# 在某一维度上根据具体索引采样 index_select
print(a.index_select(0, torch.tensor([0, 1, 2])).shape) # 在第0维度(batch)上采样[0,1,2]
print(a.index_select(1, torch.tensor([0, 1])).shape) # 在第1维度(channel)上采样[0,1]
print(a.index_select(2, torch.arange(0, 28, 2)).shape)

# ...表示任意多维度
print(a[0, ...]) # 等价于a[0, :, :, :]和a[0]
print(a[..., :2]) # 等价于a[:, :, :, :2]

masked_select()

# 根据掩码取满足条件的值(会改变shape )masked_select
x = torch.randn(3, 4)
mask = x.ge(0.5)
print(torch.masked_select(x, mask))

take()

# 打平后根据索引采样  take
x = torch.tensor([[4, 3, 5], [6, 7, 8]])
print(torch.take(x, torch.tensor([0, 2, 5])))

维度变换

view/reshape

a = torch.rand(4, 1, 28, 28)
print(a.shape)

a = a.view(4, 28 * 28)
print(a.shape)

a = a.reshape(4 * 28, 28)
print(a.shape)

unsqueeze/squeeze 拉伸/挤压

# unsqueeze
a = torch.rand(4, 1, 28, 28)
print(a.unsqueeze(0).shape) # 在第0维度前插入维度;索引范围[-5, 4]
print(a.unsqueeze(-1).shape) # 在最后一个维度后插入维度
# squeeze
b = torch.rand(32)
b = b.unsqueeze(1).unsqueeze(2).unsqueeze(0)
print(b.shape)
print(b.squeeze().shape) # 挤压所有size为1的维度
print(b.squeeze(0).shape)
print(b.squeeze(1).shape) # size不为1的维度不能被挤压

expand/repeat 扩张/重复

# expand(推荐使用)
f = torch.rand(4, 32, 14, 14) # [4, 32, 14, 14]
b = torch.rand(32)
b = b.unsqueeze(1).unsqueeze(2).unsqueeze(0) # [1, 32, 1, 1]
# 每个channel所有像素增加一个bias
b = b.expand(4, 32, 14, 14) # 维度一致;只有size为1的维度才能扩张成N
b = b.expand(-1, 32, -1, -1) # -1表示维度size不变
print(b.shape)
# repeat
b = torch.rand(32)
b = b.unsqueeze(1).unsqueeze(2).unsqueeze(0) # [1, 32, 1, 1]
b = b.repeat(4, 1, 14, 14) # 参数指重复次数
print(b.shape)

转置 t

# 4.转置 t
# 只适用于矩阵,dimension为2
a = torch.tensor([[1, 2], [3, 4]])
print(a.t())

transpose 维度交换

# transpose 维度交换
a = torch.rand(4, 3, 14, 14)
# [b,c,h,w] => [b,w,h,c] => [b, whc] => [b,c,w,h] 导致数据污染
a1 = a.transpose(1, 3).contiguous().view(4, 3 * 14 * 14).view(4, 3, 14, 14)
# [b,c,h,w] => [b,w,h,c] => [b, whc] => [b,w,h,c] => [b,c,w,h]
a2 = a.transpose(1, 3).contiguous().view(4, 3 * 14 * 14).view(4, 14, 14, 3).transpose(1, 3)
print(torch.all(torch.eq(a, a1)))
print(torch.all(torch.eq(a, a2)))

permute 维度排列

# permute 维度排列
a = torch.rand(4, 3, 14, 7)
a1 = a.permute(0, 2, 3, 1)
print(a1.shape)

Broadcast机制

  • broadcast广播/自动扩张机制
    • expand
    • 不需要复制数据
  • 主要思想
    • 在前面插入维度,使维度一致
    • 扩张size为1的维度使得size一致
  • 为什么需要broadcast
    • 需求场景
    • 减少内存消耗
  • 可以broadcast的情况
    • 从最后一维往前开始匹配
    • 如果当前维度为1,扩展size到一致
    • 如果往前没有维度,则插入一个维度,并扩展size到一致
    • 其他情况都是不能broadcast的

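一个broadcast的小示例(张量形状仅为演示假设):

import torch

# bias原始shape为[32],先手动插入维度得到[1, 32, 1, 1],
# 与[4, 32, 14, 14]的特征图相加时,size为1的维度会被自动扩张
x = torch.rand(4, 32, 14, 14)
bias = torch.rand(32).unsqueeze(0).unsqueeze(2).unsqueeze(3)  # [1, 32, 1, 1]
print((x + bias).shape)  # torch.Size([4, 32, 14, 14])

# 从最后一维往前匹配:[14, 14]可以直接broadcast成[4, 32, 14, 14]
print((x + torch.rand(14, 14)).shape)  # torch.Size([4, 32, 14, 14])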

拼接与拆分

cat

要求除dim=d外,其他维度size一致

a1 = torch.rand(4, 3, 16, 16)
a2 = torch.rand(5, 3, 16, 16)
b = torch.cat([a1, a2], dim=0)
print(b.size())

a1 = torch.rand(4, 2, 16, 16)
a2 = torch.rand(4, 1, 16, 16)
b = torch.cat([a1, a2], dim=1)
print(b.size())

a1 = torch.rand(4, 3, 16, 16)
a2 = torch.rand(4, 3, 16, 16)
b = torch.cat([a1, a2], dim=2)
print(b.size())

stack

要求所有维度一致,会在dim=d维度前新增一个维度

a1 = torch.rand(4, 3, 16, 16)
a2 = torch.rand(4, 3, 16, 16)
b = torch.stack([a1, a2], dim=2)
print(b.size())

split

按照长度拆分

a = torch.rand(3, 3, 16, 16)
aa, bb = a.split([2, 1], dim=0)
print(aa.size(), bb.size())

aa, bb, cc = a.split(1, dim=0)
print(aa.size(), bb.size(), cc.size())

chunk

按照数量拆分

a = torch.rand(3, 3, 16, 16)
aa, bb = a.chunk(2, dim=0)

aa, bb, cc = a.chunk(3, dim=0)
print(aa.size(), bb.size(), cc.size())

基本运算

加减乘除

+ - * / 或者 add, sub, mul, div

a = torch.rand(3, 4)
b = torch.rand(4)

# 根据broadcast机制,b会自动扩展成[3, 4]再参与运算
c1 = a + b
c2 = a.add(b)
print(torch.all(torch.eq(c1, c2)))

c1 = a - b
c2 = a.sub(b)
print(torch.all(torch.eq(c1, c2)))

c1 = a * b
c2 = a.mul(b)
print(torch.all(torch.eq(c1, c2)))

c1 = a / b
c2 = a.div(b)
print(torch.all(torch.eq(c1, c2)))

matmul矩阵相乘

  • mm 只支持二维

  • matmul

  • @

    # mm 只支持二维
    a = torch.tensor([[1., 2.], [3., 4.]])
    b = torch.ones(2, 2)
    c = torch.mm(a, b)
    print(c)
    # matmul
    c = torch.matmul(a, b)
    print(c)
    # @
    c = a @ b
    print(c)

    # 一个例子
    x = torch.rand(4, 784)
    w = torch.rand(512, 784)
    print((x @ w.t()).shape)

    # 大于2维张量相乘
    a = torch.rand(4, 3, 28, 64)
    b = torch.rand(4, 3, 64, 32)
    c = torch.rand(4, 1, 64, 32)
    print(torch.matmul(a, b).size())
    print(torch.matmul(a, c).size())

pow/次方

a = torch.full([2, 2], 3.)  # 用浮点数填充,便于后面的开方运算
# 平方
print(a.pow(2))
print(a ** 2)
# 开方
print(a.pow(0.5))
print(a ** 0.5)
print(a.sqrt())
print(a.rsqrt()) # 开方倒数

exp/log

print(torch.exp(torch.ones(2, 2)))
print(torch.log(torch.ones(2, 2)))

近似(取整)

a = torch.tensor(3.14)
print(a.floor()) # 向下取整
print(a.ceil()) # 向上取整
print(a.round()) # 四舍五入
print(a.trunc()) # 取整数部分
print(a.frac()) # 取小数部分

clamp限幅

grad = torch.rand(2, 3) * 15    # 在[0,15)之间均匀采样
print(grad.max(), grad.median(), grad.min())
print(grad.clamp(10)) # 最小值限制为10
print(grad.clamp(0, 10)) #最小值限制为0,最大值限制为10

比较运算

a = torch.randn(2, 3)
b = torch.randn(2, 3)
print(a, '\n', b)

print(a > b, '\n', a.gt(b))
print(a >= b, '\n', a.ge(b))
print(a != b)
print(a == b)
print(torch.eq(a, b), torch.equal(a, b))
# 结果
tensor([[-1.9149, 0.2055, -0.9794],
[ 0.0520, 1.4960, -0.3995]])
tensor([[-0.2851, -0.4754, -0.9211],
[ 2.2437, -2.5068, 0.0198]])
tensor([[False, True, False],
[False, True, False]])
tensor([[False, True, False],
[False, True, False]])
tensor([[False, True, False],
[False, True, False]])
tensor([[False, True, False],
[False, True, False]])
tensor([[True, True, True],
[True, True, True]])
tensor([[False, False, False],
[False, False, False]])
tensor([[False, False, False],
[False, False, False]]) False

统计属性

norm范数

# 总体元素的范数(a、b、c为示例张量,元素全为1便于验证结果)
a = torch.full([8], 1.)
b = a.view(2, 4)
c = a.view(2, 2, 2)
print(a.norm(1), '\n', b.norm(1), '\n', c.norm(1))
print(a.norm(2), '\n', b.norm(2), '\n', c.norm(2))

# 在某个维度上面求范数
print(b.norm(1, dim=0))
print(b.norm(1, dim=1))

c = torch.tensor([[[1., 1.], [1., 1.]],
[[2., 2.], [2., 2.]]])
print(c.norm(1, dim=0))
print(c.norm(1, dim=1))

min/max/mean/prod

a = torch.arange(1, 11).view(2, 5).float()
print(a)
# 不带参数的话,求的是所有元素的(打平后再求)
print(a.min())
print(a.max())
print(a.mean())
print(a.prod()) # 元素累乘
# 带参数 dim, keepdim
# dim:在该维度上进行操作
# keepdim:操作后保持维度数不变
a = torch.rand(3, 4)
print(a)
print(a.min(dim=0, keepdim=True))
print(a.min(dim=0))
print(a.max(dim=0))
print(a.mean(dim=0))
print(a.prod(dim=0))
"""
tensor([[0.9035, 0.6971, 0.4886, 0.7893],
[0.1721, 0.7944, 0.0457, 0.7884],
[0.9295, 0.3432, 0.1493, 0.7428]])
torch.return_types.min(values=tensor([[0.1721, 0.3432, 0.0457, 0.7428]]),indices=tensor([[1, 2, 1, 2]]))
torch.return_types.min(values=tensor([0.1721, 0.3432, 0.0457, 0.7428]),indices=tensor([1, 2, 1, 2]))
torch.return_types.max(values=tensor([0.9295, 0.7944, 0.4886, 0.7893]),indices=tensor([2, 1, 0, 0]))
tensor([0.6684, 0.6116, 0.2279, 0.7735])
tensor([0.1445, 0.1901, 0.0033, 0.4623])
"""

print(a.min(dim=1, keepdim=True)) # keepdim=True保持原来的维度
print(a.min(dim=1))
print(a.max(dim=1))
print(a.mean(dim=1))
print(a.prod(dim=1))
"""
tensor([[0.7369, 0.7026, 0.7261, 0.6016],
[0.1645, 0.3403, 0.6544, 0.4276],
[0.5673, 0.4806, 0.7130, 0.7522]])
torch.return_types.min(
values=tensor([[0.6016],
[0.1645],
[0.4806]]),
indices=tensor([[3],
[0],
[1]]))
torch.return_types.min(values=tensor([0.6016, 0.1645, 0.4806]),indices=tensor([3, 0, 1]))
torch.return_types.max(values=tensor([0.7369, 0.6544, 0.7522]),indices=tensor([0, 2, 3]))
tensor([0.6918, 0.3967, 0.6283])
tensor([0.2262, 0.0157, 0.1462])
"""

argmax/argmin

# 最大/小值索引
print(a.argmax())
print(a.argmin())

topk/kthvalue

# 取前k大/小的值 topK
# 取第k小的值 kthvalue
a = torch.arange(1, 11).view(2, 5).float()
print(a.topk(3, dim=1))
print(a.topk(3, dim=1, largest=False)) # largest=False取前k小
print(a.kthvalue(2, dim=1)) # 取第k小的值(这里是每行第2小)

高级操作

where

根据条件tensor赋值

condition = torch.rand(2, 2)
print(condition, '\n', condition > 0.5)

a = torch.zeros_like(condition)
b = torch.ones_like(condition)
c = torch.where(condition > 0.5, a, b)
print(c)

"""
tensor([[0.8901, 0.9493],
[0.7294, 0.4096]])
tensor([[ True, True],
[ True, False]])
tensor([[0., 0.],
[0., 1.]])
"""

gather

根据索引tensor找值

# out[i][j][k] = input[index[i][j][k]][j][k] dim = 0
# out[i][j][k] = input[i][index[i][j][k]][k] dim = 1
# out[i][j][k] = input[i][j][index[i][j][k]] dim = 2
prob = torch.randn(4, 10)
idx = prob.topk(3, dim=1)
print(idx)

label = torch.arange(10) + 100
print(label)

res = torch.gather(label.expand(4, 10), dim=1, index=idx[1]) # 用topk返回的indices在label中取值
print(res)

"""
torch.return_types.topk(
values=tensor([[1.2671, 1.1393, 0.4483],
[1.3529, 0.8119, 0.7457],
[0.7395, 0.7150, 0.6264],
[0.9675, 0.7654, 0.6994]]),
indices=tensor([[2, 6, 7],
[5, 0, 1],
[7, 4, 0],
[0, 6, 2]]))
tensor([100, 101, 102, 103, 104, 105, 106, 107, 108, 109])
tensor([[102, 106, 107],
[105, 100, 101],
[107, 104, 100],
[100, 106, 102]])
"""

常见激活函数

sigmoid

import torch
from matplotlib import pyplot as plt

a = torch.arange(-10., 10., 0.1)  # 使用浮点类型,sigmoid等函数不支持整型输入
b = torch.sigmoid(a)
plt.plot(a, b)
plt.title("sigmoid")
plt.show()


tanh

b = torch.tanh(a)
plt.plot(a, b)
plt.title("tanh")
plt.show()


relu

b = torch.relu(a)
plt.plot(a, b)
plt.title("relu")
plt.savefig("relu.jpg")
plt.show()


损失和梯度

torch.autograd.grad

import torch
import torch.nn.functional as F

# 1.torch.autograd.grad
x = torch.ones(1)
w = torch.tensor([2.])
w.requires_grad_() # 设置参数需要求导
mse = F.mse_loss(torch.ones(1), x * w) # 求均方差损失
print(torch.autograd.grad(mse, [w])) # 求梯度

x = torch.ones(1)
w = torch.tensor([2.], requires_grad=True) # 设置参数需要求导
mse = F.mse_loss(torch.ones(1), x * w)
print(torch.autograd.grad(mse, [w]))

loss.backward

x = torch.ones(1)
w = torch.tensor([2.], requires_grad=True)
mse = F.mse_loss(torch.ones(1), x * w)
mse.backward() # 反向传播
print(w.grad) # 求梯度

F.softmax


  • 求梯度推导过程

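softmax的定义及其对输入的梯度可以写成如下标准形式(此处按通用推导补充):

$$
p_i=\frac{e^{a_i}}{\sum_{k}e^{a_k}},\qquad
\frac{\partial p_i}{\partial a_j}=
\begin{cases}
p_i(1-p_j), & i=j\\
-p_ip_j, & i\neq j
\end{cases}
$$

即梯度矩阵的对角线元素为正,其余元素为负,与下面代码的输出一致。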

a = torch.rand(3)
a.requires_grad_()
p = F.softmax(a, dim=0)
print(p)
dp1 = torch.autograd.grad(p[0], [a], retain_graph=True)
print(dp1)
dp2 = torch.autograd.grad(p[1], [a], retain_graph=True)
print(dp2)
dp3 = torch.autograd.grad(p[2], [a])
print(dp3)
"""
tensor([0.3132, 0.3989, 0.2879], grad_fn=<SoftmaxBackward0>)
(tensor([ 0.2151, -0.1249, -0.0902]),)
(tensor([-0.1249, 0.2398, -0.1148]),)
(tensor([-0.0902, -0.1148, 0.2050]),)
"""

感知机模型

单输出感知机


import torch
import torch.nn.functional as F

# 三个样本,每个样本对应10个输入,1个输出
x = torch.rand(3, 10)
w = torch.rand(1, 10, requires_grad=True)
o = torch.sigmoid(x @ w.t())
print(o)

loss = F.mse_loss(torch.FloatTensor([[1], [0], [0]]), o)
loss.backward()
print(loss, w.grad)
"""
tensor([[0.8605],
[0.8590],
[0.7049]], grad_fn=<SigmoidBackward0>)
tensor(0.4181, grad_fn=<MseLossBackward0>) tensor([[0.0851, 0.0223, 0.0701, 0.0971, 0.0574, 0.0612, 0.0456, 0.0683, 0.0352, 0.0944]])
"""

多输出感知机


import torch
import torch.nn.functional as F

x = torch.rand(3, 10)
w = torch.rand(4, 10, requires_grad=True)
o = torch.sigmoid(x @ w.t())
print(o)

label = torch.FloatTensor([[1, 0.5, 0, 0], [0, 1, 0.2, 0], [0, 0, 0, 0.8]])
loss = F.mse_loss(label, o)
loss.backward()
print(loss, w.grad)
"""
tensor([[0.9386, 0.8571, 0.8541, 0.8488],
[0.8648, 0.9295, 0.9111, 0.7584],
[0.9539, 0.9195, 0.8927, 0.9121]], grad_fn=<SigmoidBackward0>)
tensor(0.4983, grad_fn=<MseLossBackward0>)
tensor([[0.0148, 0.0137, 0.0151, 0.0075, 0.0186, 0.0043, 0.0097, 0.0156, 0.0149, 0.0040],
[0.0121, 0.0009, 0.0118, 0.0055, 0.0073, 0.0137, 0.0102, 0.0102, 0.0065, 0.0134],
[0.0260, 0.0099, 0.0312, 0.0133, 0.0204, 0.0245, 0.0174, 0.0205, 0.0177, 0.0251],
[0.0245, 0.0193, 0.0359, 0.0143, 0.0252, 0.0165, 0.0110, 0.0180, 0.0216, 0.0185]])
"""

himmelblau函数优化

  • himmelblau函数是数学家们构造出来的一个特殊的函数,可以用来测试深度学习算法是否能够收敛到局部最小值。这个函数的表达式是:
    $$
    f(x,y)=(x^{2}+y-11)^{2}+(x+y^{2}-7)^2
    $$
    这个函数最小值为0,有四个最小值点,分别是 :
    $$
    f(3.0,2.0)=0\\
    f(−2.805118,3.131312)=0\\
    f(−3.779310,−3.283186)=0\\
    f(3.584428,−1.848126)=0
    $$

三维图像展示

import numpy as np
# from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt
import torch

# himmelblau函数
def himmelblau(x):
    return (x[0] ** 2 + x[1] - 11) ** 2 + (x[0] + x[1] ** 2 - 7) ** 2

# 绘制himmelblau函数的三维图像
# 通过x,y两个张量调用meshgrid方法生成一个网格
x = np.arange(-6, 6, 0.1)
y = np.arange(-6, 6, 0.1)
X, Y = np.meshgrid(x, y)
# 网格上每一个点x,y带入himmelblau求出z
Z = himmelblau([X, Y])
# 作图
fig = plt.figure('himmelblau')
ax = fig.add_subplot(projection='3d')
ax.plot_surface(X, Y, Z)
ax.view_init(60, -30)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.savefig("himmelblau.jpg")
plt.show()


梯度下降求最小值

# 梯度下降法求局部最小值
# 初始坐标 [0, 0] [1, 0] [-4, 0] [4, 0]
x = torch.tensor([0., 4.], requires_grad=True)
# 构造优化器
optimizer = torch.optim.Adam([x], lr=1e-3)
# 进行20000轮下降
for step in range(20000):
    # 要找到himmelblau的最小值,损失函数就是himmelblau的函数值
    loss = himmelblau(x)
    # 清空各参数的梯度
    optimizer.zero_grad()
    # 反向传播重新计算各参数的梯度
    loss.backward()
    # 优化器更新参数x'=x-lr*梯度
    optimizer.step()

    if step % 2000 == 0:
        print('step {}:x={},f(x)={}'.format(step, x.tolist(), loss.item()))

"""
[0, 4]
step 18000:x=[-2.8051180839538574, 3.131312370300293],f(x)=2.2737367544323206e-13

[0,0]
step 18000:x=[3.0, 2.0],f(x)=0.0

[-4, 0]
step 18000:x=[-3.7793102264404297, -3.2831859588623047],f(x)=0.0

[4, 0]
step 18000:x=[3.584428310394287, -1.8481265306472778],f(x)=0.0
"""

初始化位置的不同将会影响最终收敛到哪个局部最小值

交叉熵

$$
H(p,q)=-\sum_{x}p(x)\log q(x)
$$

衡量两个分布p和q的不一致程度,在分类问题中可用于衡量损失。

F.cross_entropy

import torch
import torch.nn.functional as F

x = torch.randn(1, 784)
w = torch.randn(10, 784)
label = torch.tensor([3])
print(label)

logits = x @ w.t()
print(logits)

# cross_entropy = softmax + log + nll_loss(负对数似然损失)
pred = F.softmax(logits, dim=1)
pred_log = torch.log(pred)
loss = F.nll_loss(pred_log, label)
print(loss)

cross_en = F.cross_entropy(logits, label)
print(cross_en)

多分类问题

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

batch_size = 200
learning_rate = 0.01
epochs = 10

# 加载MNIST数据
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
# 初始化参数
w1, b1 = torch.randn(200, 784, requires_grad=True), \
torch.zeros(200, requires_grad=True)
w2, b2 = torch.randn(200, 200, requires_grad=True), \
torch.zeros(200, requires_grad=True)
w3, b3 = torch.randn(10, 200, requires_grad=True), \
torch.zeros(10, requires_grad=True)

torch.nn.init.kaiming_normal_(w1)
torch.nn.init.kaiming_normal_(w2)
torch.nn.init.kaiming_normal_(w3)

# 前向传播
def forward(x):
    x = x @ w1.t() + b1
    x = F.relu(x)
    x = x @ w2.t() + b2
    x = F.relu(x)
    x = x @ w3.t() + b3
    x = F.relu(x)
    return x

# 定义优化器和损失
optimizer = optim.SGD([w1, b1, w2, b2, w3, b3], lr=learning_rate)
criteon = nn.CrossEntropyLoss()

# 训练模型
for epoch in range(epochs):
    # 训练集损失
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28 * 28)  # -1表示自适应维度的size

        # 前向传播
        logits = forward(data)
        loss = criteon(logits, target)
        # 梯度清零
        optimizer.zero_grad()
        # 反向传播
        loss.backward()
        # 梯度下降
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    # 测试集损失
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data = data.view(-1, 28 * 28)
        logits = forward(data)
        test_loss += criteon(logits, target).item()

        pred = logits.data.max(1)[1]
        correct += pred.eq(target.data).sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

全连接神经网络

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

batch_size = 200
learning_rate = 0.01
epochs = 10

# 加载数据
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
# 1. 定义模型
class MLP(nn.Module):
    def __init__(self):
        """初始化模型网络层"""
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(784, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200, 10),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """前向传播"""
        x = self.model(x)
        return x


# 2. 定义优化器和损失函数
net = MLP()
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss()

# 3.训练模型
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28 * 28)

        # 前向传播,会自动调用模型的forward函数
        logits = net(data)
        loss = criteon(logits, target)
        # 梯度清零
        optimizer.zero_grad()
        # 反向传播
        loss.backward()
        # 梯度下降
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data = data.view(-1, 28 * 28)
        logits = net(data)
        test_loss += criteon(logits, target).item()

        pred = logits.data.max(1)[1]
        correct += pred.eq(target.data).sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

激活函数与GPU加速

激活函数

  • tanh,sigmoid


  • relu


  • leaky relu


  • selu


  • softplus


GPU加速

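GPU加速的核心是用.to(device)把网络、损失函数和数据搬到GPU上,下面是一个最小示意(假设本机有可用的CUDA设备,否则自动回退到CPU):

import torch
import torch.nn as nn

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net = nn.Linear(784, 10).to(device)          # 网络搬到GPU
criteon = nn.CrossEntropyLoss().to(device)   # 损失函数搬到GPU

data = torch.rand(4, 784).to(device)         # 数据搬到GPU
target = torch.randint(0, 10, (4,)).to(device)
print(criteon(net(data), target).item())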

手写体数字识别

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from plot.utils import plot_image, plot_curve

batch_size = 200
learning_rate = 0.01
epochs = 10

train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)

# 打印几张图片的例子
x, y = next(iter(train_loader))
print(x.shape, y.shape, x.min(), x.max())
plot_image(x, y, 'image sample')
# torch.Size([512, 1, 28, 28]) torch.Size([512]) tensor(-0.4242) tensor(2.8215)


class MLP(nn.Module):

def __init__(self):
super(MLP, self).__init__()

self.model = nn.Sequential(
nn.Linear(784, 200),
nn.LeakyReLU(inplace=True),
nn.Linear(200, 200),
nn.LeakyReLU(inplace=True),
nn.Linear(200, 10),
nn.LeakyReLU(inplace=True),
)

def forward(self, x):
x = self.model(x)
return x


device = torch.device('cuda:0')
net = MLP().to(device)
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss().to(device)

train_losses = []
test_losses = []
accuracys = []
for epoch in range(epochs):
# 训练误差
for batch_idx, (data, target) in enumerate(train_loader):
data = data.view(-1, 28 * 28) # 将每张图片的数据打平
data, target = data.to(device), target.cuda()

logits = net(data)
loss = criteon(logits, target)

optimizer.zero_grad()
loss.backward()
optimizer.step()

train_losses.append(loss.item())

if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))

# 测试误差与精度
test_loss = 0
correct = 0
for data, target in test_loader:
data = data.view(-1, 28 * 28)
data, target = data.to(device), target.cuda()
logits = net(data)

test_loss += criteon(logits, target).item()

# 在第一维度上求最大值所在索引,即模型预测值
pred = logits.argmax(dim=1)
# 测试集中预测正确的图片数量
# pred.eq(y)做一个掩码,相同为1不同为0
correct += pred.eq(target).float().sum().item()

test_loss /= len(test_loader.dataset) # 取平均误差

test_losses.append(test_loss)
accuracys.append(correct / len(test_loader.dataset))

print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))

plot_curve(train_losses)
plot_curve(test_losses)
plot_curve(accuracys)

plot.utils

import torch
from matplotlib import pyplot as plt


def plot_curve(data):
    fig = plt.figure()
    plt.plot(range(len(data)), data, color='blue')
    plt.legend(['value'], loc='upper right')
    plt.xlabel('step')
    plt.ylabel('value')
    plt.show()


def plot_image(img, label, name):
    fig = plt.figure()
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.tight_layout()
        plt.imshow(img[i][0] * 0.3081 + 0.1307, cmap='gray', interpolation='none')
        plt.title("{}: {}".format(name, label[i].item()))
        plt.xticks([])
        plt.yticks([])
    plt.show()


def one_hot(label, depth=10):
    out = torch.zeros(label.size(0), depth)
    idx = torch.LongTensor(label).view(-1, 1)
    out.scatter_(dim=1, index=idx, value=1)
    return out

visdom可视化

安装与启动
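先用pip安装visdom并以服务方式启动,然后在浏览器打开默认地址 http://localhost:8097 即可看到可视化界面:

pip install visdom
python -m visdom.server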

作图举例

  • 实时绘制损失和精度

    from visdom import Visdom

    # 在神经网络中绘制loss曲线和精度曲线时就可用下面的方法。
    # 新建一个客户端,环境命名为'mnist',如果不设置的话默认为'main'
    viz = Visdom(env='mnist')

    # 创建线图初始点。loss是损失曲线,acc是精度曲线
    viz.line([0.], [0.], win='train_loss', opts=dict(title='train loss'))
    viz.line([0.], [0.], win='acc', opts=dict(title='test acc'))
    # 使用'append'可以实时的添加随程序运行逐渐产生的loss值。
    viz.line([loss.item()], [global_step], win='train_loss', update='append')
  • 绘制折线图

    # a为x轴数值,b为y轴数值,win为绘制曲线的环境窗体名称,opts为图的名称
    import numpy as np
    from visdom import Visdom

    viz = Visdom(env="line chart")
    # 创建线图初始点。
    viz.line([0.], [0.], win='line', opts=dict(title='line chart'))
    a, b = 0, 0
    for i in range(50):
        a = np.array([i])
        # 随机生成y值
        b = np.random.randn(1)
        # 使用'append'实时添加产生的a, b值
        viz.line(b, a, win='line', update="append")


  • 绘制Image图像

    # 使用visdom绘制image图像
    viz = Visdom(env="images")
    # nrow表示每行显示的图片数量,(16,3,32,32)表示16张3通道的32*32大小的图片
    viz.images(torch.randn(16, 3, 32, 32).numpy(), nrow=4, win='imgs', opts={'title': 'imgs'})


与pytorch联用

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from visdom import Visdom

batch_size = 200
learning_rate = 0.01
epochs = 10

train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
# transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
# transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)


class MLP(nn.Module):

def __init__(self):
super(MLP, self).__init__()

self.model = nn.Sequential(
nn.Linear(784, 200),
nn.LeakyReLU(inplace=True),
nn.Linear(200, 200),
nn.LeakyReLU(inplace=True),
nn.Linear(200, 10),
nn.LeakyReLU(inplace=True),
)

def forward(self, x):
x = self.model(x)

return x


device = torch.device('cuda:0')
net = MLP().to(device)
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss().to(device)

# 初始化viz
viz = Visdom()
viz.line([0.], [0.], win='train_loss', opts=dict(title='train loss'))
viz.line([[0.0, 0.0]], [0.], win='test', opts=dict(title='test loss&acc.',
legend=['loss', 'acc.']))
global_step = 0

for epoch in range(epochs):

for batch_idx, (data, target) in enumerate(train_loader):
data = data.view(-1, 28 * 28)
data, target = data.to(device), target.cuda()

logits = net(data)
loss = criteon(logits, target)

optimizer.zero_grad()
loss.backward()
optimizer.step()

# 绘制训练损失
global_step += 1
viz.line([loss.item()], [global_step], win='train_loss', update='append')

if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))

test_loss = 0
correct = 0
for data, target in test_loader:
data = data.view(-1, 28 * 28)
data, target = data.to(device), target.cuda()
logits = net(data)
test_loss += criteon(logits, target).item()

pred = logits.argmax(dim=1)
correct += pred.eq(target).float().sum().item()

# 绘制测试损失
viz.line([[test_loss, correct / len(test_loader.dataset)]],
[global_step], win='test', update='append')
# 绘制测试图像和对应的标签值
viz.images(data.view(-1, 1, 28, 28), win='x')
viz.text(str(pred.detach().cpu().numpy()), win='pred',
opts=dict(title='pred'))

test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))


交叉验证

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from visdom import Visdom

batch_size = 200
learning_rate = 0.01
epochs = 10

train_db = datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
train_loader = torch.utils.data.DataLoader(
train_db,
batch_size=batch_size, shuffle=True)

test_db = datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
test_loader = torch.utils.data.DataLoader(test_db,
batch_size=batch_size, shuffle=True)

# 将训练集数据划分为训练集和验证集
print('train:', len(train_db), 'test:', len(test_db))
train_db, val_db = torch.utils.data.random_split(train_db, [50000, 10000])
print('db1:', len(train_db), 'db2:', len(val_db))
train_loader = torch.utils.data.DataLoader(
train_db,
batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(
val_db,
batch_size=batch_size, shuffle=True)


class MLP(nn.Module):

def __init__(self):
super(MLP, self).__init__()

self.model = nn.Sequential(
nn.Linear(784, 200),
nn.LeakyReLU(inplace=True),
nn.Linear(200, 200),
nn.LeakyReLU(inplace=True),
nn.Linear(200, 10),
nn.LeakyReLU(inplace=True),
)

def forward(self, x):
x = self.model(x)
return x


device = torch.device('cuda:0')
net = MLP().to(device)
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss().to(device)

# 初始化viz
viz = Visdom(env='cross validation')
viz.line([0.], [0.], win='train_loss', opts=dict(title='train loss'))
viz.line([[0.0, 0.0]], [0.], win='val', opts=dict(title='validation loss&acc.',
legend=['loss', 'acc.']))
global_step = 0

for epoch in range(epochs):

for batch_idx, (data, target) in enumerate(train_loader):
data = data.view(-1, 28 * 28)
data, target = data.to(device), target.cuda()

logits = net(data)
loss = criteon(logits, target)

optimizer.zero_grad()
loss.backward()
optimizer.step()

# 绘制训练损失
global_step += 1
viz.line([loss.item()], [global_step], win='train_loss', update='append')

if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))

val_loss = 0
correct = 0
# 验证集损失与精度,用来验证模型训练过程是否过拟合
for data, target in val_loader:
data = data.view(-1, 28 * 28)
data, target = data.to(device), target.cuda()
logits = net(data)
val_loss += criteon(logits, target).item()

pred = logits.argmax(dim=1)
correct += pred.eq(target.data).sum().item()
# print(type(pred.eq(target.data).sum().item()))
# print(type(pred.eq(target.data).sum()))

# 绘制验证损失
viz.line([[val_loss, correct / len(test_loader.dataset)]],
[global_step], win='val', update='append')
# 绘制测试图像和对应的标签值
# viz.images(data.view(-1, 1, 28, 28), win='x')
# viz.text(str(pred.detach().cpu().numpy()), win='pred',
# opts=dict(title='pred'))

val_loss /= len(val_loader.dataset)
print('\nVAL set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
val_loss, correct, len(val_loader.dataset),
100. * correct / len(val_loader.dataset)))

# 测试集损失与精度,验证模型泛化能力
test_loss = 0
correct = 0
for data, target in test_loader:
data = data.view(-1, 28 * 28)
data, target = data.to(device), target.cuda()
logits = net(data)
test_loss += criteon(logits, target).item()

pred = logits.data.max(1)[1]
correct += pred.eq(target.data).sum()

test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))

划分数据集

train_db = datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
train_loader = torch.utils.data.DataLoader(
train_db,
batch_size=batch_size, shuffle=True)

test_db = datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
test_loader = torch.utils.data.DataLoader(test_db,
batch_size=batch_size, shuffle=True)

# 将训练集数据划分为训练集和验证集
print('train:', len(train_db), 'test:', len(test_db))
train_db, val_db = torch.utils.data.random_split(train_db, [50000, 10000])
print('db1:', len(train_db), 'db2:', len(val_db))
train_loader = torch.utils.data.DataLoader(
train_db,
batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(
val_db,
batch_size=batch_size, shuffle=True)

k-fold cross-validation

pytorch - K折交叉验证过程说明及实现

import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
import torch.nn.functional as F
from torch.autograd import Variable

#####构造的训练集####
x = torch.rand(100,28,28)
y = torch.randn(100,28,28)
x = torch.cat((x,y),dim=0)
label =[1] *100 + [0]*100
label = torch.tensor(label,dtype=torch.long)

######网络结构##########
class Net(nn.Module):
#定义Net
def __init__(self):
super(Net, self).__init__()

self.fc1 = nn.Linear(28*28, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 2)

def forward(self, x):

x = x.view(-1, self.num_flat_features(x))

x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def num_flat_features(self, x):
size = x.size()[1:]
num_features = 1
for s in size:
num_features *= s
return num_features

##########定义dataset##########
class TraindataSet(Dataset):
def __init__(self,train_features,train_labels):
self.x_data = train_features
self.y_data = train_labels
self.len = len(train_labels)

def __getitem__(self,index):
return self.x_data[index],self.y_data[index]
def __len__(self):
return self.len


########k折划分############
def get_k_fold_data(k, i, X, y): ###此过程主要是步骤(1)
# 返回第i折交叉验证时所需要的训练和验证数据,分开放,X_train为训练数据,X_valid为验证数据
assert k > 1
fold_size = X.shape[0] // k # 每份的个数:数据总条数/折数(组数)

X_train, y_train = None, None
for j in range(k):
idx = slice(j * fold_size, (j + 1) * fold_size) #slice(start,end,step)切片函数
##idx 为每组 valid
X_part, y_part = X[idx, :], y[idx]
if j == i: ###第i折作valid
X_valid, y_valid = X_part, y_part
elif X_train is None:
X_train, y_train = X_part, y_part
else:
X_train = torch.cat((X_train, X_part), dim=0) #dim=0增加行数,竖着连接
y_train = torch.cat((y_train, y_part), dim=0)
#print(X_train.size(),X_valid.size())
return X_train, y_train, X_valid,y_valid


def k_fold(k, X_train, y_train, num_epochs=3,learning_rate=0.001, weight_decay=0.1, batch_size=5):
train_loss_sum, valid_loss_sum = 0, 0
train_acc_sum ,valid_acc_sum = 0,0

for i in range(k):
data = get_k_fold_data(k, i, X_train, y_train) # 获取k折交叉验证的训练和验证数据
net = Net() ### 实例化模型
### 每份数据进行训练,体现步骤三####
train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,\
weight_decay, batch_size)

print('*'*25,'第',i+1,'折','*'*25)
print('train_loss:%.6f'%train_ls[-1][0],'train_acc:%.4f\n'%valid_ls[-1][1],\
'valid loss:%.6f'%valid_ls[-1][0],'valid_acc:%.4f'%valid_ls[-1][1])
train_loss_sum += train_ls[-1][0]
valid_loss_sum += valid_ls[-1][0]
train_acc_sum += train_ls[-1][1]
valid_acc_sum += valid_ls[-1][1]
print('#'*10,'最终k折交叉验证结果','#'*10)
####体现步骤四#####
print('train_loss_sum:%.4f'%(train_loss_sum/k),'train_acc_sum:%.4f\n'%(train_acc_sum/k),\
'valid_loss_sum:%.4f'%(valid_loss_sum/k),'valid_acc_sum:%.4f'%(valid_acc_sum/k))


#########训练函数##########
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate,weight_decay, batch_size):
train_ls, test_ls = [], [] ##存储train_loss,test_loss
dataset = TraindataSet(train_features, train_labels)
train_iter = DataLoader(dataset, batch_size, shuffle=True)
### 将数据封装成 Dataloder 对应步骤(2)

#这里使用了Adam优化算法
optimizer = torch.optim.Adam(params=net.parameters(), lr= learning_rate, weight_decay=weight_decay)

for epoch in range(num_epochs):
for X, y in train_iter: ###分批训练
output = net(X)
loss = loss_func(output,y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
### 得到每个epoch的 loss 和 accuracy
train_ls.append(log_rmse(0,net, train_features, train_labels))
if test_labels is not None:
test_ls.append(log_rmse(1,net, test_features, test_labels))
#print(train_ls,test_ls)
return train_ls, test_ls

def log_rmse(flag,net,x,y):
if flag == 1: ### valid 数据集
net.eval()
output = net(x)
result = torch.max(output,1)[1].view(y.size())
corrects = (result.data == y.data).sum().item()
accuracy = corrects*100.0/len(y) #### 5 是 batch_size
loss = loss_func(output,y)
net.train()

return (loss.data.item(),accuracy)

loss_func = nn.CrossEntropyLoss() ### 声明loss函数
k_fold(10,x,label) ### k=10,十折交叉验证

正则化

奥卡姆剃刀原则

奥卡姆剃刀原则

减少过拟合

  • More data
  • Constraint model complexity
    • shallow
    • regularization 正则化
  • Dropout
  • Data augmentation 数据增强
  • Early stopping

正则化项

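以L2正则化为例,在原损失函数上加一个权重范数惩罚项(λ为正则化系数),以此约束模型复杂度:

$$
J(\theta)=Loss(\hat{y},y)+\lambda\sum_{i}\|\theta_i\|^{2}
$$

L1正则化则把平方项换成绝对值之和 $\sum_{i}|\theta_i|$。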

实现

  • L2-regularization


  • L1-regularization

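一个最小实现示意(λ取值仅为示例):L2正则化可以直接用优化器的weight_decay参数;L1正则化没有现成参数,需要手动把参数绝对值之和加到loss上。

import torch
import torch.nn as nn
import torch.optim as optim

net = nn.Linear(10, 2)

# L2正则化:weight_decay即为L2惩罚系数λ
optimizer = optim.SGD(net.parameters(), lr=0.01, weight_decay=0.01)

# L1正则化:手动把所有参数的绝对值之和加到loss上
x, target = torch.rand(4, 10), torch.randint(0, 2, (4,))
loss = nn.CrossEntropyLoss()(net(x), target)
l1_lambda = 0.01
loss = loss + l1_lambda * sum(p.abs().sum() for p in net.parameters())

optimizer.zero_grad()
loss.backward()
optimizer.step()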

动量和学习率衰减

动量 momentum


  • 也就是说,权值w现在不只向着 $\bigtriangledown{f}$ 的方向更新,也向着 $z^{k}$ 的方向更新;而 $z^{k}$ 由上一步的 $z^{k-1}$ 和 $\bigtriangledown{f(w^{k-1})}$ 递推得到,也就是说,w的更新是当前梯度方向和之前更新方向(惯性)的结合,更新公式见下文。


  • 设置动量


    对于优化器SGD是需要进行设置的,但是对于Adam优化器不需要设置,因为它本身就有。
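    带动量的梯度下降更新规则的标准写法如下(β为动量系数,α为学习率):

    $$
    z^{k+1}=\beta z^{k}+\bigtriangledown f(w^{k})\\
    w^{k+1}=w^{k}-\alpha z^{k+1}
    $$

    在SGD优化器中通过momentum参数开启(取值仅为示例):

    import torch.nn as nn
    import torch.optim as optim

    net = nn.Linear(10, 2)
    optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)  # momentum即上式中的β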

学习率衰减 learning rate decay

(1) 第一种:观测loss,如果经过一段时间loss一直不再下降,说明在当前学习率下已经找不到更细分的极小点,此时把learning rate减小一些。

scheduler.step(loss_val)用来监测并记录loss,检测周期由patience参数指定:如果loss在patience个周期内都没有改善,就按设定的规则减小learning rate;否则只起记录作用。

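第一种方案对应torch.optim.lr_scheduler.ReduceLROnPlateau,一个用法示意(参数取值仅为示例):

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

net = nn.Linear(10, 2)
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
# loss在patience个检测周期内没有下降时,把lr乘以factor
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)

for epoch in range(100):
    loss = nn.functional.mse_loss(net(torch.rand(4, 10)), torch.rand(4, 2))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step(loss.item())  # 把监测到的loss交给scheduler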

(2) 第二种是简单粗暴的,直接每多少次epoch减小多少

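第二种方案对应torch.optim.lr_scheduler.StepLR,每隔step_size个epoch把lr乘以gamma(参数取值仅为示例):

import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

optimizer = optim.SGD(nn.Linear(10, 2).parameters(), lr=0.1)
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)  # 每30个epoch把lr乘以0.1

for epoch in range(100):
    # ...此处执行正常的训练与optimizer.step()...
    optimizer.step()
    scheduler.step()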

Early stop and Dropout

Early stop


Dropout

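在网络中插入Dropout层的最小示意(p为随机丢弃的概率;训练时用net.train()使其生效,测试时用net.eval()关闭):

import torch
import torch.nn as nn

net = nn.Sequential(
    nn.Linear(784, 200),
    nn.Dropout(0.5),      # 以0.5的概率随机丢弃神经元
    nn.ReLU(inplace=True),
    nn.Linear(200, 10),
)

net.train()               # 训练模式:Dropout生效
out = net(torch.rand(4, 784))
net.eval()                # 测试模式:Dropout关闭
out = net(torch.rand(4, 784))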

卷积神经网络(CNN)

几个符号含义

  • Input_channels: 输入数据的通道数,一般是RGB三个通道
  • Kernel_channels: Kernel的通道数,与Input_channels一致,也是每个kernel里面的卷积数
  • Kernel_number: Kernel的数目,根据需要自定义
  • Kernel_size: 卷积的尺寸,如3*3
  • Stride: 步长
  • Padding: 边缘填充(补零)

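根据上面的参数可以直接算出输出特征图的尺寸(标准卷积输出尺寸公式):

$$
H_{out}=\left\lfloor\frac{H_{in}+2\times Padding-Kernel\_size}{Stride}\right\rfloor+1
$$

例如输入28×28、kernel_size=3、stride=1、padding=0时,输出为26×26,与下面nn.Conv2d的例子一致。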

卷积层

nn.Conv2d

import torch
import torch.nn as nn
import torch.nn.functional as F

# 1. nn.Conv2d
x = torch.rand(1, 3, 28, 28) # [b, c, h, w]

# in_channels:每个kernel的通道数=输入通道数3, out_channels:kernel的数目=输出通道数
layer = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3, stride=1, padding=0)
out = layer.forward(x)
print(out.shape) # torch.Size([1, 4, 26, 26])

layer = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3, stride=1, padding=1)
out = layer.forward(x)
print(out.shape) # torch.Size([1, 4, 28, 28])

layer = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3, stride=2, padding=1)
out = layer.forward(x)
print(out.shape) # torch.Size([1, 4, 14, 14])

print(layer.weight.shape) # torch.Size([4, 3, 3, 3])
print(layer.bias.shape) # torch.Size([4])

F.Conv2d

# 2. F.conv2d
x = torch.rand(1, 3, 28, 28) # [b, c, h, w]
w = torch.rand(4, 3, 3, 3) # 4个kernel,每个kernel有3个通道,尺寸为3*3
b = torch.rand(4)

out = F.conv2d(x, w, b, stride=1, padding=0)
print(out.shape) # torch.Size([1, 4, 26, 26])

池化层

nn.MaxPool2d

import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.rand(1, 3, 28, 28) # [b, c, h, w]
c_layer = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3, stride=1, padding=0)
out1 = c_layer.forward(x)
print(out1.shape) # torch.Size([1, 4, 26, 26])

# 1. 下采样
# nn.MaxPool2d
p_layer = nn.MaxPool2d(2, stride=2)
out2 = p_layer(out1)
print(out2.shape) # torch.Size([1, 4, 13, 13])

p_layer = nn.AvgPool2d(2, stride=2)
out2 = p_layer(out1)
print(out2.shape) # torch.Size([1, 4, 13, 13])

F.max_pool2d

out2 = F.max_pool2d(out1, 2, stride=2)
print(out2.shape) # torch.Size([1, 4, 13, 13])

out2 = F.avg_pool2d(out1, 2, stride=2)
print(out2.shape) # torch.Size([1, 4, 13, 13])

F.interpolate

# 2. 上采样 F.interpolate
x = torch.rand(1, 3, 28, 28)
out = F.interpolate(x, scale_factor=2, mode='nearest')
print(out.shape) # torch.Size([1, 3, 56, 56])

Batch Normalization

  • 几种Normalization


  • batch normalization(标准化公式见本节末尾)


  • 优点
    • Converge faster 收敛更快
    • Better performance 更好的表现
    • Robust 鲁棒
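batch normalization对每个通道在batch维度上统计均值和方差并做标准化,再用可学习的γ、β做缩放和平移(标准公式):

$$
\hat{x}=\frac{x-\mu_{batch}}{\sqrt{\sigma_{batch}^{2}+\epsilon}},\qquad y=\gamma\hat{x}+\beta
$$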

nn.Module模块

转载自 pytorch nn.Module模块以及nn部分函数的介绍使用

nn.Module

​ ‎nn.Module是所有神经网络模块的基类,我们使用pytorch构建神经网络需要继承nn.Module类来实现,在__init__构造函数中申明各个层的定义,在forward中实现层之间的连接关系,实际上就是前向传播的过程。

举个栗子,下面是简单构建有一个神经网络:

import torch.nn as nn
import torch.nn.functional as F


class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        return x

nn.Conv2d

参数:

torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)

‎对由多个输入平面组成的输入信号应用 2D 卷积。‎这个就是用在卷积神经网络中的卷积层。

各参数含义如下:

  • in_channels (int): Number of channels in the input image 输入图像通道数
  • out_channels (int): Number of channels produced by the convolution 卷积产生的通道数
  • kernel_size (int or tuple): Size of the convolving kernel 卷积核尺寸,可以设为1个int型数或者一个(int, int)型的元组,例如(2,3)是高2宽3卷积核
  • stride (int or tuple, optional): Stride of the convolution. Default: 1 卷积步长,默认为1,可以设为1个int型数或者一个(int, int)型的元组
  • padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 填充操作,控制填充的数目
  • padding_mode (string, optional): 'zeros', 'reflect', 'replicate' or 'circular'. Default: 'zeros' padding模式,默认为零填充
  • dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 扩张操作,控制kernel点(卷积核点)的间距,默认值为1
  • groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 控制分组卷积,默认不分组,为1组
  • bias (bool, optional): If True, adds a learnable bias to the output. Default: True 为真则在输出中添加一个可学习的偏差,默认True

举个栗子:

import torch

x = torch.randn(1, 3, 28, 28)
print(x.shape)


conv2d = torch.nn.Conv2d(in_channels=3, out_channels=12, kernel_size=3)
res = conv2d(x)
print(res.shape)


nn.MaxPool2d

torch.nn.MaxPool2d(kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False)

‎对由多个输入平面组成的输入信号应用 2D 最大池化,用在池化层。Pool层用于提取重要信息的操作,可以去掉部分相邻的信息,减少计算开销。MaxPool在提取数据时,保留相邻信息中的最大值,去掉其他值。

我们先来看一下基本参数,一共六个:

  • kernel_size :表示做最大池化的窗口大小,可以是单个值,也可以是tuple元组
  • stride :步长,可以是单个值,也可以是tuple元
  • padding :填充,可以是单个值,也可以是tuple元组
  • dilation :控制窗口中元素步幅
  • return_indices:布尔类型,返回最大值位置索引
  • ceil_mode:布尔类型,为True,用向上取整的方法,计算输出形状;默认是向下取整。

举个栗子:

import torch

x = torch.randn(1, 3, 28, 28)
print(x.shape)
conv2d = torch.nn.Conv2d(in_channels=3, out_channels=12, kernel_size=3)
res = conv2d(x)
print(res.shape)
pool = torch.nn.MaxPool2d(kernel_size=2)
res = pool(res)
print(res.shape)


nn.BatchNorm2d

torch.nn.BatchNorm2d(num_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True, device=None, dtype=None)

功能:对输入的四维数组进行批量标准化处理

  • num_features:输入图像的通道数量。
  • eps:稳定系数,防止分母出现0。
  • momentum: 动态均值和动态方差所使用的动量,即一个用于运行过程中均值和方差的一个估计参数,默认值为0.1。
  • affine:代表gamma,beta是否可学。如果设为True,代表两个参数是通过学习得到的;如果设为False,代表两个参数是固定值,默认情况下,gamma是1,beta是0。
  • track_running_stats:BatchNorm2d中存储的的均值和方差是否需要更新,若为True,表示需要更新;反之不需要更新。

举个栗子:

import torch


m = torch.nn.BatchNorm2d(3)
input = torch.randn(1, 3, 3, 3)
print(input)
output = m(input)
print(output)


nn.Linear

nn.Linear()是用于设置网络中的全连接层

torch.nn.Linear(in_features, out_features, bias=True, device=None, dtype=None)
  • in_features - 每个输入样本的大小
  • out_features - 每个输出样本的大小
  • bias - 如果设置为False,则图层不会学习附加偏差。默认值:True
from torch import nn
import torch

m = nn.Linear(3, 12)
input = torch.randn(128, 3)
output = m(input)
print(output.shape)


nn.Dropout

‎在训练期间,使用伯努利分布中的样本,以概率p随机将输入张量的某些元素归零

torch.nn.Dropout(p=0.5, inplace=False)

nn.Dropout()是为了防止或减轻过拟合而使用的模块,一般用在全连接层。Dropout就是在不同的训练过程中随机丢掉一部分神经元,即让某个神经元的激活值以一定的概率p停止工作,这次训练过程中不更新其权值,也不参加神经网络的计算。但它的权重会保留下来(只是暂时不更新),因为下次样本输入时它可能又要工作了。

nn.ReLU

为了训练深层神经网络,需要一个这样的激活函数:它看起来和行为都像一个线性函数,但实际上是一个非线性函数,从而允许网络学习数据中的复杂关系;该函数还必须对输入提供更敏感的激活,避免饱和。

torch.nn.ReLU(inplace=False)


nn.Sigmoid

Sigmoid 激活函数,也被称为 Logistic 函数,传统上是一个非常受欢迎的神经网络激活函数。它把输入转换成介于0.0和1.0之间的值:远大于0的输入会被压缩到接近1.0,远小于0的输入会被压缩到接近0.0,整体曲线呈从0经过0.5再到1.0的S形。


nn.Tanh

Tanh的诞生比Sigmoid晚一些。前面提到过Sigmoid函数有一个缺点:输出不以0为中心,会使收敛变慢,而Tanh解决了这个问题。


nn.LSTM

LSTM一般指长短期记忆人工神经网络。长短期记忆网络(LSTM,Long Short-Term Memory)是一种时间循环神经网络,是为了解决一般的RNN(循环神经网络)存在的长期依赖问题而专门设计出来的。

torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional)

详细介绍一下参数:

  • input_size:表示的是输入的矩阵特征数,或者说是输入的维度;
  • hidden_size:隐藏层的大小(即隐藏层节点数量),输出向量的维度等于隐藏节点数;
  • num_layers:lstm 隐层的层数,默认为1;
  • bias:隐层状态是否带 bias,默认为 true;
  • batch_first:True 或者 False,如果是 True,则 input 为(batch, seq, input_size),默认值为:False(seq_len, batch, input_size)
  • dropout:默认值0,除最后一层,每一层的输出都进行dropout;
  • bidirectional:如果设置为 True, 则表示双向 LSTM,默认为 False。

nn.LSTM中输入与输出关系为output, (hn, cn) = lstm(input, (h0, c0)),输入输出格式如下:

##输入数据格式:
input(seq_len, batch, input_size)
h0(num_layers * num_directions, batch, hidden_size)
c0(num_layers * num_directions, batch, hidden_size)

##输出数据格式:
output(seq_len, batch, hidden_size * num_directions)
hn(num_layers * num_directions, batch, hidden_size)
cn(num_layers * num_directions, batch, hidden_size)

**input (seq_len, batch, input_size)**:seq_len表示序列长度,batch表示把数据分成了多少批(批大小),input_size为每个时间步的样本输入维度。

​ **output(seq_len, batch, hidden_size * num_directions)**,output是一个三维张量,第一维表示序列长度,第二维表示数据批次的多少batch,即数据分为几批送进来,第三维hidden_size隐藏层大小,双向则二倍,单向则等价于隐藏层大小。

hn是一个三维张量,第一维是num_layers*num_directions,num_layers是我们定义的神经网络的层数,num_directions表示是否为双向LSTM;第二维表示一批的样本数量;第三维表示隐藏层的大小。

​ c_n与h_n一致。
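按上述输入输出格式,nn.LSTM的一个最小用法示意(各尺寸均为示例值):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
input = torch.randn(5, 3, 10)   # (seq_len=5, batch=3, input_size=10)
h0 = torch.randn(2, 3, 20)      # (num_layers*num_directions, batch, hidden_size)
c0 = torch.randn(2, 3, 20)
output, (hn, cn) = lstm(input, (h0, c0))
print(output.shape)             # torch.Size([5, 3, 20])
print(hn.shape, cn.shape)       # torch.Size([2, 3, 20]) torch.Size([2, 3, 20])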

nn.RNN

RNN一般指循环神经网络。 循环神经网络(Recurrent Neural Network,RNN)是一类以序列(sequence)数据为输入,在序列的演进方向进行递归(recursion)且所有节点(循环单元)按链式连接的递归神经网络(recursive neural network)。

用法同nn.LSTM

nn.GRU

门控循环神经网络(gated recurrent neural network)是为了更好地捕捉时序数据中间隔较大的依赖关系,门控循环单元(gated recurrent unit, GRU)是一种常用的门控循环神经网络。

用法同nn.LSTM

nn.Sequential

一个序列容器,用于搭建神经网络的模块被按照被传入构造器的顺序添加到nn.Sequential()容器中。

from torch import nn
import torch

model = nn.Sequential(
nn.Conv2d(1, 20, 5),
nn.ReLU(),
nn.Conv2d(20, 64, 5),
nn.ReLU()
)

x = torch.randn(1, 1, 30, 30)
y = model(x)
print(y.shape)

转载自 named_parameters(), named_children, named_modules()方法

named_parameters()

可以对一个nn.Module中所有注册的参数进行迭代:

import torch
from torch import nn

class MyLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_features, out_features))
        self.bias = nn.Parameter(torch.randn(out_features))

    def forward(self, input):
        return (input @ self.weight) + self.bias

可以通过调用named_parameters()方法得到我们定义的nn.Module, 即MyLinear中所有的可学习的参数:

my_linear = MyLinear(3, 4)
for param in my_linear.named_parameters():
    print(param)

可以得到以下输出:

('weight', Parameter containing:
tensor([[ 0.9009, 0.6984, 3.0670, 0.9113],
[-0.2515, -0.1617, 0.2517, 0.0977],
[-1.0986, -0.3517, -0.4920, -0.4112]], requires_grad=True))
('bias', Parameter containing:
tensor([-0.2491, -0.1508, 0.8393, -1.3849], requires_grad=True))

我们可以输出更多信息, 来更仔细地查看一下param的内容:

my_linear = MyLinear(3, 4)
for param in my_linear.named_parameters():
    print(len(param))
    print(type(param[0]), param[0])
    print(type(param[1]), param[1])
    break

得到了以下输出:

2
<class 'str'> weight
<class 'torch.nn.parameter.Parameter'> Parameter containing:
tensor([[-0.3835, 1.9484, 0.6865, -1.0978],
[-1.7985, 1.0263, -0.7779, -0.4302],
[-0.2007, 0.0540, 0.2876, 0.0776]], requires_grad=True)

可以发现param是由2个元素组成的tuple, 其中第一个元素是str类型的,表示参数的名字, 第二个元素是torch.nn.parameter.Parameter类型的,表示包含的参数信息

named_children()

可以通过调用nn.Module的named_children()方法来查看这个nn.Module直接子级的模块:

import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_0 = MyLinear(4, 3)
        self.linear_1 = MyLinear(3, 1)

    def forward(self, x):
        x = self.linear_0(x)
        x = F.relu(x)
        x = self.linear_1(x)
        return x

这里的Net是由我们定义的MyLinear层组成的,其中包括了两个children,分别为linear_0和linear_1,我们使用named_children()可以查看Net所包含的直接子级模块:

net = Net()
for child in net.named_children():
    print(child)

此时可以得到以下输出:

('linear_0', MyLinear())
('linear_1', MyLinear())

如果我们调用了children()方法,与named_children()的区别就是,不会输出名称:

for child in net.children():
    print(child)

>>> MyLinear()
>>> MyLinear()

named_modules()

​ 在上一个section中我们介绍了named_children(),但是这个方法只能遍历第一级的nn.Module,如果nn.Module()中的直接子级也是一个nn.Module,你需要连着子级一起遍历(going deeper), 则可以调用named_modules()方法,这个方法会循环遍历nn.Module以及其child nn.Modules ,其实与named_children()的主要区别就是遍历的程度是否更deeper:

import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_0 = MyLinear(4, 3)
        self.linear_1 = MyLinear(3, 1)

    def forward(self, x):
        x = self.linear_0(x)
        x = F.relu(x)
        x = self.linear_1(x)
        return x


net = Net()
for child in net.named_modules():
    print(child)

输出的结果如下:

('', Net(
(linear_0): MyLinear()
(linear_1): MyLinear()
))
('linear_0', MyLinear())
('linear_1', MyLinear())

这里可以看到输出了一个Net,是因为named_modules()方法会迭代地找到每个nn.Module,包括自身。我们可以循环打印一下modules()方法输出的__class__;modules()方法与named_modules()的区别就在于是否输出名称:

net = Net()
for child in net.modules():
    print(child.__class__)

得到以下输出:

<class '__main__.Net'>
<class '__main__.MyLinear'>
<class '__main__.MyLinear'>

可以看出modules()方法和named_modules()方法会迭代自身

train() and eval()

Pytorch 中net.train() 和 net.eval()的作用和如何使用?

​ net.train()和net.eval()到底在什么时候使用?如果一个模型有DropoutBatchNormalization,那么它在训练时要以一定概率进行Dropout或者更新BatchNormalization参数,而在测试时不在需要Dropout或更新BatchNormalization参数。此时,要用net.train()和net.eval()进行区分。在没有涉及到BN与Dropout的模型,这两个函数没什么用。

save and load

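通常只保存和加载模型的state_dict,一个最小示意(文件名为示例):

import torch
import torch.nn as nn

net = nn.Linear(784, 10)

# 保存:只保存参数字典
torch.save(net.state_dict(), 'ckpt.mdl')

# 加载:先构建同结构的网络,再载入参数
net2 = nn.Linear(784, 10)
net2.load_state_dict(torch.load('ckpt.mdl'))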

定义自己的Module

import torch
from torch import nn
from torch import optim


class MyLinear(nn.Module):

def __init__(self, inp, outp):
super(MyLinear, self).__init__()

# requires_grad = True
self.w = nn.Parameter(torch.randn(outp, inp))
self.b = nn.Parameter(torch.randn(outp))

def forward(self, x):
x = x @ self.w.t() + self.b
return x


class Flatten(nn.Module):

def __init__(self):
super(Flatten, self).__init__()

def forward(self, input):
return input.view(input.size(0), -1)


class TestNet(nn.Module):

def __init__(self):
super(TestNet, self).__init__()

self.net = nn.Sequential(nn.Conv2d(1, 16, stride=1, padding=1),
nn.MaxPool2d(2, 2),
Flatten(),
nn.Linear(1 * 14 * 14, 10))

def forward(self, x):
return self.net(x)


class BasicNet(nn.Module):

def __init__(self):
super(BasicNet, self).__init__()

self.net = nn.Linear(4, 3)

def forward(self, x):
return self.net(x)


class Net(nn.Module):

def __init__(self):
super(Net, self).__init__()

self.net = nn.Sequential(BasicNet(),
nn.ReLU(),
nn.Linear(3, 2))

def forward(self, x):
return self.net(x)


def main():
device = torch.device('cuda')
net = Net()
net.to(device)

net.train()

net.eval()

# net.load_state_dict(torch.load('ckpt.mdl'))
#
#
# torch.save(net.state_dict(), 'ckpt.mdl')

for name, t in net.named_parameters():
print('parameters:', name, t.shape)

for name, m in net.named_children():
print('children:', name, m)

for name, m in net.named_modules():
print('modules:', name, m)


if __name__ == '__main__':
main()

数据增强

Data argumentation

  • Flip 翻转
  • Rotate 旋转
  • Random Move and Crop 移动,裁剪
  • Noise 加噪声
  • GAN
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomVerticalFlip(),
transforms.RandomRotation(15),
transforms.RandomRotation([90, 180, 270]),
transforms.Resize([32, 32]),
transforms.RandomCrop([28, 28]),
transforms.ToTensor(),
# transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)