关于深度学习系列笔记九（多分类问题）

路透社数据集新闻分类预测，是个多分类问题，对于多分类问题，主要注意几点：

1、如果要对 N 个类别的数据点进行分类，网络的最后一层应该是大小为 N 的 Dense 层。

2、对于单标签、多分类问题，网络的最后一层应该使用 softmax 激活，这样可以输出在 N 个输出类别上的概率分布。

3、多分类问题的损失函数几乎总是应该使用分类交叉熵。它将网络输出的概率分布与目标的真实分布之间的距离最小化。

处理多分类问题的标签有两种方法。

3.1通过分类编码（也叫one-hot 编码）对标签进行编码，然后使用categorical_crossentropy 作为损失函数。

3.2将标签编码为整数，然后使用 sparse_categorical_crossentropy 损失函数。

4、如果你需要将数据划分到许多类别中，应该避免使用太小的中间层，以免在网络中造成信息瓶颈。

代码示例

代码语言：python

# The Reuters dataset contains many short newswires and their topics; it was published by Reuters in 1986.
# It is a simple, widely used text-classification dataset.
# It covers 46 different topics: some topics have more samples than others, but every topic has at least 10 samples in the training set.

# Load the Reuters dataset into (data, labels) pairs for the training and test splits.
from keras.datasets import reuters
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)
def printshape(x):
    """Print a short summary of an array-like object to stdout.

    After a separator line, the ``shape``, ``ndim`` and ``dtype`` attributes
    of ``x`` are printed in that order, one per line. ``x`` must expose all
    three attributes (e.g. a NumPy array). Returns ``None``.
    """
    print('#----------------')
    # Attributes are looked up lazily, one per printed line, so output order
    # is fixed: shape, then rank, then element type.
    for caption, attribute in (
        ('#数据形状=', 'shape'),
        ('#数据张量=', 'ndim'),
        ('#数据类型=', 'dtype'),
    ):
        print(caption, getattr(x, attribute))

#---------查看数据特征------------------
#printshape(train_data)
#----------------
#数据形状= (8982,)
#数据张量= 1
#数据类型= object
#printshape(train_labels)
#----------------
#数据形状= (8982,)
#数据张量= 1
#数据类型= int64
#printshape(test_data)
#----------------
#数据形状= (2246,)
#数据张量= 1
#数据类型= object
#printshape(test_labels)
#----------------
#数据形状= (2246,)
#数据张量= 1
#数据类型= int64
#train_data = [list([1, 2, 2, 8, 43, 10,...])
#              list([1, 3267, 699, 3434, 2295, ..])
#              list([1, 227, 2406, 91, 2,..])]
#train_data[10]= [1, 245, 273, 207, 156, 53, 74, 160, 26, 14, 46, 296,..]
# Vocabulary lookup: word -> integer index.
word_index = reuters.get_word_index()
#word_index= {'mdbl': 10996, 'fawc': 16260, 'degussa': 12089, 'woods': 8803, 'hanging': 13796,...}
# Inverted lookup: integer index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}
#reverse_word_index={10996: 'mdbl', 16260: 'fawc', 12089: 'degussa', 8803: 'woods', 13796: 'hanging',...}
# Note: indices are shifted down by 3 because 0, 1 and 2 are reserved for
# "padding", "start of sequence" and "unknown"; anything not found becomes '?'.
decoded_newswire = ' '.join(reverse_word_index.get(i - 3, '?') for i in train_data[0])
#decoded_newswire = ? ? ? said as a result of its december acquisition of space co it expects earnings per share
# The two bare expressions below only display a value in an interactive session.
train_labels
#[ 3  4  3 ... 25  3 25]
train_labels[10]
#3
#---------查看数据特征------------------

#----------数据预处理-------------------
#数据向量化
import numpy as np
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a 2-D float array.

    Args:
        sequences: Sized iterable of integer-index sequences, one per sample.
        dimension: Width of each output row; every index must be below it.

    Returns:
        A ``(len(sequences), dimension)`` array of zeros in which row ``k``
        holds ``1.0`` at every index that occurs in ``sequences[k]``.
    """
    multi_hot = np.zeros((len(sequences), dimension))
    # Each ``row`` is a view into ``multi_hot``, so assigning through it
    # updates the result array in place.
    for row, indices in zip(multi_hot, sequences):
        row[indices] = 1.
    return multi_hot
# Raw input layout: each sample is a list of word indices, e.g.
#train_data = [list([1, 2, 2, 8, 43, 10,...])
#              list([1, 3267, 699, 3434, 2295, ..])
#              list([1, 227, 2406, 91, 2,..])]
# Turn both splits into fixed-width 0/1 arrays via vectorize_sequences.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
#x_train.shape=(8982, 10000)
#x_train= [[0. 1. 1. ... 0. 0. 0.]
#           ...
#          [0. 1. 1. ... 0. 0. 0.]]

#将训练标签向量化，46种分类，标签向量化的方法同上
def to_one_hot(labels, dimension=46):
    """One-hot encode integer class labels into a 2-D float array.

    Args:
        labels: Sized iterable of integer class ids, one per sample.
        dimension: Number of classes, i.e. the width of each output row.

    Returns:
        A ``(len(labels), dimension)`` array of zeros with a single ``1.0``
        per row, placed in the column given by that sample's label.
    """
    sample_count = len(labels)
    encoded = np.zeros((sample_count, dimension))
    for row_number, class_id in zip(range(sample_count), labels):
        encoded[row_number, class_id] = 1.
    return encoded
# Hand-written one-hot encoding of the labels (46 classes).
one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

# The same encoding using the Keras built-in; these results replace the ones above.
# Imported from keras.utils because the keras.utils.np_utils module is not
# available in recent Keras releases, while keras.utils exposes the function
# in old and new versions alike.
from keras.utils import to_categorical
# num_classes is pinned to 46 so both arrays always have the same width, even
# if one split happens not to contain the highest class id.
one_hot_train_labels = to_categorical(train_labels, num_classes=46)
one_hot_test_labels = to_categorical(test_labels, num_classes=46)

# Hold out the first 1000 training samples as a validation set;
# the remaining samples are what the model is actually fitted on.
x_val = x_train[:1000]
partial_x_train = x_train[1000:]
y_val = one_hot_train_labels[:1000]
partial_y_train = one_hot_train_labels[1000:]

# Build the model
from keras import models
from keras import layers
# Two 64-unit ReLU hidden layers on top of the 10000-wide input, followed by
# a 46-unit softmax output layer (one unit per class).
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(10000,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(46, activation='softmax'),
])

# Compile the model: RMSprop optimizer, categorical cross-entropy loss
# (the labels are one-hot encoded), accuracy as the tracked metric.
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
# NOTE(review): this prints only the model object's repr; if the intent was to
# show the architecture, model.summary() is presumably what was meant - confirm.
print(model)
# Train the model for 20 epochs in batches of 512, validating on the held-out samples.
history = model.fit(partial_x_train,partial_y_train,epochs=20,batch_size=512,validation_data=(x_val, y_val))
print(history.history)
#{'val_loss': [1.7178231077194215, 1.349607783317566, 1.1751809968948363, ...],
# 'val_acc': [0.6120000143051147, 0.7090000042915344, 0.7430000009536744, ...],
# 'loss': [2.524090452290991, 1.444193690261038, 1.0991966037993979, ...],
# 'acc': [0.4977449262031664, 0.6889250825533321, 0.7643447748194718,...]}
# Evaluate the model on the test split.
results = model.evaluate(x_test, one_hot_test_labels)
#[1.21202463946593, 0.7791629564199507]
# Generate predictions for new data.
predictions = model.predict(x_test)
#[[1.1499790e-06 9.1250433e-07 7.7823410e-09 ... 5.9703202e-09   2.0752662e-09 1.6480236e-09]
# ...
# [5.4418546e-05 6.6710752e-01 1.6387193e-02 ... 2.2803215e-07  1.4364658e-09 1.5013682e-06]]
# Each element of predictions is a vector of length 46.
# (Bare expression: it only displays a value in an interactive session.)
predictions[0].shape
#(46,)
# The elements of this vector sum to 1.
print(np.sum(predictions[0]) )
#1.0
# The largest element marks the predicted class, i.e. the class with the highest probability.
print(np.argmax(predictions[0]))
#3

import matplotlib.pyplot as plt
# Compare hidden-layer widths: train one fresh model per width and draw the
# training / validation accuracy curves of all of them on a single axes.
colors = ['red','blue','green','black','yellow','orange']
units = [16,32,64,128,256,512]
fig=plt.figure()
ax=fig.add_subplot(1,1,1)
# Each width is paired with its own colour so both curves of a model match.
for unit, color in zip(units, colors):
    model = models.Sequential()
    model.add(layers.Dense(unit, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(unit, activation='relu'))
    model.add(layers.Dense(46, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512, validation_data=(x_val, y_val))
    history_dict = history.history
    print("history_dict = %s" % history_dict)
    # The accuracy keys differ between Keras versions: 'acc'/'val_acc' in some,
    # 'accuracy'/'val_accuracy' in others. Accept either spelling.
    acc = history_dict['acc'] if 'acc' in history_dict else history_dict['accuracy']
    val_acc = history_dict['val_acc'] if 'val_acc' in history_dict else history_dict['val_accuracy']
    # Epochs are numbered from 1 for the x axis.
    epochs = range(1, len(acc) + 1)
    # Markers only for training accuracy, a solid line for validation accuracy.
    # The colour comes solely from the keyword so it does not clash with the
    # format string.
    ax.plot(epochs, acc, 'o', label='Training acc,hidden unit=%s' % unit, color=color)
    ax.plot(epochs, val_acc, '-', label='Validation acc,hidden unit=%s' % unit, color=color)
ax.legend(loc='best')
ax.set_title('Training and validation accuracy by different hidden unit')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
plt.show()

#如果要对 N 个类别的数据点进行分类，网络的最后一层应该是大小为 N 的 Dense 层。 
#对于单标签、多分类问题，网络的最后一层应该使用 softmax 激活，这样可以输出在 N 个输出类别上的概率分布。 
#这种问题的损失函数几乎总是应该使用分类交叉熵。它将网络输出的概率分布与目标的真实分布之间的距离最小化。
#处理多分类问题的标签有两种方法。 
#  通过分类编码（也叫one-hot 编码）对标签进行编码，然后使用categorical_crossentropy 作为损失函数。
#  将标签编码为整数，然后使用 sparse_categorical_crossentropy 损失函数。 
#如果你需要将数据划分到许多类别中，应该避免使用太小的中间层，以免在网络中造成信息瓶颈。

基本上隐藏单元越多，最开始的准确率越高；隐藏单元越少，后续提升空间越大。

编码

0 人点赞