网络结构图
Created with Raphaël 2.1.0inputlstm1_1lstm2_1softmaxoutput
在这个只是一个time的结构,input:[batch_size, num_units] softmax:[num_units, vocabulary]
源码来自git
代码语言:javascript复制# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Example / benchmark for building a PTB LSTM model.
Trains the model described in:
(Zaremba, et. al.) Recurrent Neural Network Regularization
http://arxiv.org/abs/1409.2329
There are 3 supported model configurations:
===========================================
| config | epochs | train | valid | test
===========================================
| small | 13 | 37.99 | 121.39 | 115.91
| medium | 39 | 48.45 | 86.16 | 82.07
| large | 55 | 37.87 | 82.62 | 78.29
The exact results may vary depending on the random initialization.
The hyperparameters used in the model:
- init_scale - the initial scale of the weights
- learning_rate - the initial value of the learning rate
- max_grad_norm - the maximum permissible norm of the gradient
- num_layers - the number of LSTM layers
- num_steps - the number of unrolled steps of LSTM
- hidden_size - the number of LSTM units
- max_epoch - the number of epochs trained with the initial learning rate
- max_max_epoch - the total number of epochs for training
- keep_prob - the probability of keeping weights in the dropout layer
- lr_decay - the decay of the learning rate for each epoch after "max_epoch"
- batch_size - the batch size
The data required for this example is in the data/ dir of the
PTB dataset from Tomas Mikolov's webpage:
$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
$ tar xvf simple-examples.tgz
To run:
$ python ptb_word_lm.py --data_path=simple-examples/data/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import numpy as np
import tensorflow as tf
import tensorflow.models.rnn.ptb.reader as reader
flags = tf.flags #可以在命令行 python *.py --model=.. --data_path=.. --use_fp16=..
logging = tf.logging #不知这个是干嘛用的
flags.DEFINE_string( #第一个是参数名, 第二个是默认值, 第三个是注释
"model", "small",
"A type of model. Possible options are: small, medium, large.")
flags.DEFINE_string("data_path", None, "data_path")
flags.DEFINE_bool("use_fp16", False,
"Train using 16-bit floats instead of 32bit floats")
FLAGS = flags.FLAGS #使用FLAGS调用DEFINE的参数
def data_type():
return tf.float16 if FLAGS.use_fp16 else tf.float32
class PTBModel(object):
"""The PTB model."""
def __init__(self, is_training, config):#设置model的超参数
self.batch_size = batch_size = config.batch_size
self.num_steps = num_steps = config.num_steps
size = config.hidden_size
vocab_size = config.vocab_size
self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
# Slightly better results can be obtained with forget gate biases
# initialized to 1 but the hyperparameters of the model would need to be
# different than reported in the paper.
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
if is_training and config.keep_prob < 1:
lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
lstm_cell, output_keep_prob=config.keep_prob)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)
self._initial_state = cell.zero_state(batch_size, data_type())
with tf.device("/cpu:0"):
embedding = tf.get_variable(
"embedding", [vocab_size, size], dtype=data_type())
inputs = tf.nn.embedding_lookup(embedding, self._input_data)
if is_training and config.keep_prob < 1:
inputs = tf.nn.dropout(inputs, config.keep_prob)
# Simplified version of tensorflow.models.rnn.rnn.py's rnn().
# This builds an unrolled LSTM for tutorial purposes only.
# In general, use the rnn() or state_saving_rnn() from rnn.py.
#
# The alternative version of the code below is:
#
# inputs = [tf.squeeze(input_, [1])
# for input_ in tf.split(1, num_steps, inputs)]
# outputs, state = tf.nn.rnn(cell, inputs, initial_state=self._initial_state)
outputs = []
state = self._initial_state
with tf.variable_scope("RNN"):
for time_step in range(num_steps):
if time_step > 0: tf.get_variable_scope().reuse_variables()#变量空间下,参数复用
(cell_output, state) = cell(inputs[:, time_step, :], state)
outputs.append(cell_output)
output = tf.reshape(tf.concat(1, outputs), [-1, size])
softmax_w = tf.get_variable(
"softmax_w", [size, vocab_size], dtype=data_type())
softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
logits = tf.matmul(output, softmax_w) softmax_b
loss = tf.nn.seq2seq.sequence_loss_by_example(
[logits],
[tf.reshape(self._targets, [-1])],
[tf.ones([batch_size * num_steps], dtype=data_type())])
self._cost = cost = tf.reduce_sum(loss) / batch_size
self._final_state = state
if not is_training:
return
self._lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
config.max_grad_norm)
optimizer = tf.train.GradientDescentOptimizer(self._lr)
self._train_op = optimizer.apply_gradients(zip(grads, tvars))
self._new_lr = tf.placeholder(
tf.float32, shape=[], name="new_learning_rate")
self._lr_update = tf.assign(self._lr, self._new_lr)
def assign_lr(self, session, lr_value):
session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
def run_epoch(session, m, data, eval_op, verbose=False):
"""Runs the model on the given data."""
epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
start_time = time.time()
costs = 0.0
iters = 0
state = m.initial_state.eval()
for step, (x, y) in enumerate(reader.ptb_iterator(data, m.batch_size,
m.num_steps)):
cost, state, _ = session.run([m.cost, m.final_state, eval_op],
{m.input_data: x,
m.targets: y,
m.initial_state: state})
costs = cost
iters = m.num_steps
if verbose and step % (epoch_size // 10) == 10:
print("%.3f perplexity: %.3f speed: %.0f wps" %
(step * 1.0 / epoch_size, np.exp(costs / iters),
iters * m.batch_size / (time.time() - start_time)))
return np.exp(costs / iters)
def get_config():
if FLAGS.model == "small":
return SmallConfig()
elif FLAGS.model == "medium":
return MediumConfig()
elif FLAGS.model == "large":
return LargeConfig()
elif FLAGS.model == "test":
return TestConfig()
else:
raise ValueError("Invalid model: %s", FLAGS.model)
def main(_):
if not FLAGS.data_path:
raise ValueError("Must set --data_path to PTB data directory")
raw_data = reader.ptb_raw_data(FLAGS.data_path)
train_data, valid_data, test_data, _ = raw_data
config = get_config()
eval_config = get_config()
eval_config.batch_size = 1
eval_config.num_steps = 1
with tf.Graph().as_default(), tf.Session() as session:
initializer = tf.random_uniform_initializer(-config.init_scale,
config.init_scale)
with tf.variable_scope("model", reuse=None, initializer=initializer):
m = PTBModel(is_training=True, config=config)
with tf.variable_scope("model", reuse=True, initializer=initializer):
mvalid = PTBModel(is_training=False, config=config)
mtest = PTBModel(is_training=False, config=eval_config)
tf.initialize_all_variables().run()
for i in range(config.max_max_epoch):
lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
m.assign_lr(session, config.learning_rate * lr_decay)
print("Epoch: %d Learning rate: %.3f" % (i 1, session.run(m.lr)))
train_perplexity = run_epoch(session, m, train_data, m.train_op,
verbose=True)
print("Epoch: %d Train Perplexity: %.3f" % (i 1, train_perplexity))
valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
print("Epoch: %d Valid Perplexity: %.3f" % (i 1, valid_perplexity))
test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
print("Test Perplexity: %.3f" % test_perplexity)
if __name__ == "__main__":
tf.app.run() #解析命令行参数,调用上面写的main函数
下面详解sequence_loss_by_example
代码语言:javascript复制def sequence_loss_by_example(logits, targets, weights,
average_across_timesteps=True,
softmax_loss_function=None, name=None):
"""Weighted cross-entropy loss for a sequence of logits (per example).
Args:
logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
targets: List of 1D batch-sized int32 Tensors of the same length as logits.
weights: List of 1D batch-sized float-Tensors of the same length as logits.
average_across_timesteps: If set, divide the returned cost by the total
label weight.
softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
to be used instead of the standard softmax (the default if this is None).
name: Optional name for this operation, default: "sequence_loss_by_example".
Returns:
1D batch-sized float Tensor: The log-perplexity for each sequence.
Raises:
ValueError: If len(logits) is different from len(targets) or len(weights).
"""
if len(targets) != len(logits) or len(weights) != len(logits):
raise ValueError("Lengths of logits, weights, and targets must be the same "
"%d, %d, %d." % (len(logits), len(weights), len(targets)))
with ops.name_scope(name, "sequence_loss_by_example",
logits targets weights):
log_perp_list = []
for logit, target, weight in zip(logits, targets, weights):
if softmax_loss_function is None:
# TODO(irving,ebrevdo): This reshape is needed because
# sequence_loss_by_example is called with scalars sometimes, which
# violates our general scalar strictness policy.
target = array_ops.reshape(target, [-1])
crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
logit, target)
else:
crossent = softmax_loss_function(logit, target)
log_perp_list.append(crossent * weight)
log_perps = math_ops.add_n(log_perp_list)
#######################################
#math_ops.add_n(x)
#x:[tensor1,tensor2,tensor3]
#return:tensor1 tensor2 tensor3
#######################################
if average_across_timesteps:
total_size = math_ops.add_n(weights)
total_size = 1e-12 # Just to avoid division by 0 for all-0 weights.
log_perps /= total_size
return log_perps #[batch_size]