版权声明:本文为博主原创文章,转载请注明出处。 https://cloud.tencent.com/developer/article/1436897
源码来自 Git。正在学习 TensorFlow,所以在此记一下笔记。
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import numpy as np
import tensorflow as tf
def _read_words(filename):
    """Read a text file and return its contents as a flat list of tokens.

    Every newline character is replaced with the end-of-sentence marker
    "<eos>", and the resulting text is split on whitespace. The marker only
    becomes a token of its own where the newline is adjacent to whitespace,
    because the replacement itself inserts no padding.

    Args:
      filename: path of the text file; it is opened with tf.gfile.GFile.

    Returns:
      A 1-D list of string tokens.
    """
    with tf.gfile.GFile(filename, "r") as f:
        # The search string must be the newline escape "\n". A bare "n" would
        # rewrite every letter n inside ordinary words and leave the line
        # breaks untouched.
        return f.read().replace("\n", "<eos>").split()
def _build_vocab(filename):
    """Build the word -> integer id vocabulary from a text file.

    Tokens are counted and then ranked by descending frequency, with ties
    broken by ascending word order. A word's id is its position in that
    ranking, so the most frequent word receives id 0.

    Args:
      filename: path of the text file, tokenized with _read_words.

    Returns:
      A dict mapping each distinct word to its integer id.
    """
    tokens = _read_words(filename)
    # Counter maps every distinct word to the number of times it occurs.
    frequencies = collections.Counter(tokens)

    def _rank(pair):
        # Negating the count sorts frequent words first; the word itself is
        # the secondary key.
        word, count = pair
        return (-count, word)

    ranked = sorted(frequencies.items(), key=_rank)
    # "Unzip" the (word, count) pairs into a tuple of words and a tuple of
    # counts; only the words are needed from here on.
    vocab_words, _counts = zip(*ranked)
    return {word: index for index, word in enumerate(vocab_words)}
def _file_to_word_ids(filename, word_to_id):
    """Convert a text file into a list of integer word ids.

    Args:
      filename: path of the text file, tokenized with _read_words.
      word_to_id: dict mapping words to integer ids.

    Returns:
      A list with the id of every token of the file that is present in
      word_to_id, in file order.
    """
    data = _read_words(filename)
    # Tokens missing from the vocabulary are skipped rather than looked up
    # directly, so a word that is absent from word_to_id does not abort the
    # whole conversion with a KeyError.
    return [word_to_id[word] for word in data if word in word_to_id]
def ptb_raw_data(data_path=None):
    """Load the raw PTB data found in the directory "data_path".

    Reads the files ptb.train.txt, ptb.valid.txt and ptb.test.txt and converts
    their words to integer ids. The vocabulary is built from the training file
    only and is then used to encode all three files.

    The PTB dataset comes from Tomas Mikolov's webpage:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
      data_path: string path to the directory where simple-examples.tgz has
        been extracted. It is passed straight to os.path.join, so an actual
        path has to be supplied despite the None default.

    Returns:
      tuple (train_data, valid_data, test_data, vocabulary)
      where each *_data item is a list of word ids that can be passed to
      ptb_iterator, and vocabulary is the number of distinct words.
    """
    train_file, valid_file, test_file = (
        os.path.join(data_path, name)
        for name in ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt")
    )

    # Word ids are determined by the training set alone.
    vocab = _build_vocab(train_file)

    train_ids, valid_ids, test_ids = [
        _file_to_word_ids(path, vocab)
        for path in (train_file, valid_file, test_file)
    ]
    return train_ids, valid_ids, test_ids, len(vocab)
def ptb_iterator(raw_data, batch_size, num_steps):
    """Iterate on the raw PTB data.

    The data is cut into batch_size contiguous rows of equal length (any
    leftover words at the end are dropped), and minibatches are produced by
    walking along all rows in parallel, num_steps columns at a time.

    Args:
      raw_data: one of the raw data outputs from ptb_raw_data.
      batch_size: int, the batch size.
      num_steps: int, the number of unrolls.

    Yields:
      Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
      The second element of the tuple is the same data time-shifted to the
      right by one.

    Raises:
      ValueError: if batch_size or num_steps are too high for the amount of
        data. As this is a generator, the error surfaces when the first item
        is requested.
    """
    raw_data = np.array(raw_data, dtype=np.int32)
    data_len = len(raw_data)  # total number of words in the data set
    batch_len = data_len // batch_size  # number of words in each row

    # Row i holds raw_data[batch_len * i : batch_len * (i + 1)].
    data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

    # One column is held back so that the shifted targets always exist.
    epoch_size = (batch_len - 1) // num_steps
    # "<= 0" rather than "== 0": when batch_len is 0 the floor division gives
    # -1, which would otherwise slip past the check and yield nothing at all.
    if epoch_size <= 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for i in range(epoch_size):
        start = i * num_steps
        stop = start + num_steps
        x = data[:, start:stop]
        y = data[:, start + 1:stop + 1]
        yield (x, y)
一直很困惑 embedding 是怎么做的,这里并没有进行 embedding。