Keras 中的数据预处理
所有的函数都在 keras.preprocessing 中,分为 text、sequence、image 三个模块
代码语言:javascript复制# 文字预处理
txt = "My name is maoli.maoli don't like coding."
文字预处理
- 文字拆分
- 建立索引
- padding(序列补齐)
- 标注
from keras.preprocessing.text import text_to_word_sequence # 文本转化序列
out = text_to_word_sequence(txt) # 默认 lower=True,会先转成小写,并过滤标点后按空格切分
print(out) # 作用类似 jieba 分词,但只适用于以空格分词的语言(如英文)
代码语言:javascript复制['my', 'name', 'is', 'maoli', 'maoli', "don't", 'like', 'coding']
代码语言:javascript复制out1 = text_to_word_sequence(txt,filters='maoli')# filters 中的每个字符(m、a、o、l、i)都会被过滤掉并当作分隔符
print(out1)
代码语言:javascript复制['y', 'n', 'e', 's', '.', 'd', "n't", 'ke', 'c', 'd', 'ng.']
代码语言:javascript复制# 文字拆分
chn = '我的名字叫毛利。我不喜欢写码'
out2 = text_to_word_sequence(chn)
out3 = text_to_word_sequence(chn,filters='。')
print(out2)
print(out3)
代码语言:javascript复制['我的名字叫毛利。我不喜欢写码']
['我的名字叫毛利', '我不喜欢写码']
代码语言:javascript复制# 对于中文就没用了,必须使用jieba
import jieba
#cut返回一个生成器,lcut返回列表;cut_all=False为精确模式,cut_all=True为全模式(列出所有可能的词),并不等同于filters='。'
out4 = jieba.lcut(chn,cut_all=False)
out5 = jieba.lcut(chn,cut_all=True)
print(out4)
print(out5)
['我', '的', '名字', '叫', '毛利', '。', '我', '不', '喜欢', '写码']
['我', '的', '名字', '叫', '毛利', '', '', '我', '不', '喜欢', '写', '码']
代码语言:javascript复制print(out)
代码语言:javascript复制['my', 'name', 'is', 'maoli', 'maoli', "don't", 'like', 'coding']
代码语言:javascript复制# 按降序排序(原地排序,会直接修改 out)
out.sort(reverse=True)
print(out)
代码语言:javascript复制['name', 'my', 'maoli', 'maoli', 'like', 'is', "don't", 'coding']
代码语言:javascript复制import numpy as np
# 建立索引(注意:out 中 'maoli' 出现两次,后一次的索引 3 会覆盖前一次的 2,所以结果里没有索引 2;想要连续索引需先去重)
word_index = dict(list(zip(out,np.arange(len(out)))))
代码语言:javascript复制word_index
代码语言:javascript复制{'name': 0, 'my': 1, 'maoli': 3, 'like': 4, 'is': 5, "don't": 6, 'coding': 7}
代码语言:javascript复制 # pad_sequences 序列补齐(默认在前面补 0,补到最长序列的长度或 maxlen)
from keras.preprocessing.sequence import pad_sequences
x = [[1,2,3],[4,5],[6,7,8,9]]
y0 = pad_sequences(x)
y1 = pad_sequences(x,maxlen=5)
代码语言:javascript复制print(y0)
print('-------------'*10)
print(y1)
代码语言:javascript复制[[0 1 2 3]
[0 0 4 5]
[6 7 8 9]]
----------------------------------------------------------------------------------------------------------------------------------
[[0 0 1 2 3]
[0 0 0 4 5]
[0 6 7 8 9]]
代码语言:javascript复制# 使用Tokenizer分词
from keras.preprocessing.text import Tokenizer
somestr = ['i am dalao,my name is maoli','maoli is very cool']
tok = Tokenizer()
tok.fit_on_texts(somestr)
tok.word_index
代码语言:javascript复制{'is': 1,
'maoli': 2,
'i': 3,
'am': 4,
'dalao': 5,
'my': 6,
'name': 7,
'very': 8,
'cool': 9}
代码语言:javascript复制# 图像预处理
# ImageDataGenerator 类
from keras.preprocessing.image import ImageDataGenerator
# 下面是官网的 cifar10 例子(仅为片段:cifar10、np_utils、num_classes、model、epochs 需要事先导入或定义)
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)
datagen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
rotation_range=20,
width_shift_range=0.2,
height_shift_range=0.2,
horizontal_flip=True)
# 计算特征归一化所需的数量
# (如果应用 ZCA 白化,将计算标准差,均值,主成分)
datagen.fit(x_train)
# 使用实时数据增强生成的批数据对模型进行拟合:
model.fit_generator(datagen.flow(x_train, y_train, batch_size=32),
steps_per_epoch=len(x_train) / 32, epochs=epochs)
# 这里有一个更 「手动」的例子
# Manual training loop: for each epoch, pull augmented batches from the
# generator and fit the model on them one batch at a time.
for e in range(epochs):
    print('Epoch', e)
    batches = 0
    for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32):
        model.fit(x_batch, y_batch)
        # Count every processed batch; plain assignment (`batches = 1`)
        # would pin the counter at 1 and the stop condition below would
        # never be reached.
        batches += 1
        if batches >= len(x_train) / 32:
            # The generator loops indefinitely, so the loop has to be
            # broken by hand once enough batches have been consumed.
            break
Keras 模型
在 Keras 中有两类主要的模型:Sequential 顺序模型 和 使用函数式 API 的 Model 类模型。
代码语言:javascript复制# 这里主要讲下使用函数式 API
from keras.models import Model
from keras.layers import Input, Dense
a = Input(shape=(32,))
b = Dense(32)(a)
model = Model(inputs=a, outputs=b)
model.summary()
代码语言:javascript复制_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) (None, 32) 0
_________________________________________________________________
dense_2 (Dense) (None, 32) 1056
=================================================================
Total params: 1,056
Trainable params: 1,056
Non-trainable params: 0
_________________________________________________________________
模型可视化
代码语言:javascript复制from keras.utils import plot_model
plot_model(model, to_file='model.png')
代码语言:javascript复制# 训练可视化
import matplotlib.pyplot as plt
history = model.fit(x, y, validation_split=0.25, epochs=50, batch_size=16, verbose=1)
# 绘制训练 & 验证的准确率值
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()
# 绘制训练 & 验证的损失值
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()
实战手写字体
代码语言:javascript复制import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from keras.datasets import mnist
(train_image,train_label),(test_image,test_label) = mnist.load_data()
train_image= np.expand_dims(train_image,axis=-1)
test_image= np.expand_dims(test_image,axis=-1)
model = keras.Sequential()
model.add(layers.Conv2D(64,(3,3),activation='relu',input_shape=(28,28,1)))
model.add(layers.Conv2D(64,(3,3),activation='relu'))
model.add(layers.MaxPool2D())
model.add(layers.Flatten())
model.add(layers.Dense(256,activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(10,activation = 'softmax'))
model.compile(optimizer='adam',loss ='sparse_categorical_crossentropy',metrics=['acc'])
model.fit(train_image,train_label,epochs=5,batch_size=512)
代码语言:javascript复制Epoch 1/5
60000/60000 [==============================] - 237s 4ms/step - loss: 1.6534 - acc: 0.8529
Epoch 2/5
60000/60000 [==============================] - 237s 4ms/step - loss: 0.0784 - acc: 0.9762
Epoch 3/5
60000/60000 [==============================] - 241s 4ms/step - loss: 0.0521 - acc: 0.9840
Epoch 4/5
60000/60000 [==============================] - 240s 4ms/step - loss: 0.0399 - acc: 0.9879
Epoch 5/5
60000/60000 [==============================] - 236s 4ms/step - loss: 0.0314 - acc: 0.9896
<keras.callbacks.History at 0x165e8383438>