# Define an initializable iterator whose source dataset is parameterized
# by the `max_value` placeholder.
max_value = tf.placeholder(tf.int64, shape=[])
dataset = tf.data.Dataset.range(max_value)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

# Initialize the iterator over a dataset with 10 elements. You must explicitly
# run `iterator.initializer` before fetching the first element.
sess.run(iterator.initializer, feed_dict={max_value: 10})
for i in range(10):
  value = sess.run(next_element)
  assert i == value
# Initialize the same iterator over a dataset with 100 elements.
sess.run(iterator.initializer, feed_dict={max_value: 100})
for i in range(100):
  value = sess.run(next_element)
  assert i == value
# Define training and validation datasets with the same structure.
training_dataset = tf.data.Dataset.range(100).map(
    lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
validation_dataset = tf.data.Dataset.range(50)
# A reinitializable iterator is defined by its structure. We could use the
# `output_types` and `output_shapes` properties of either `training_dataset`
# or `validation_dataset` here, because they are compatible.
iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                           training_dataset.output_shapes)
next_element = iterator.get_next()

training_init_op = iterator.make_initializer(training_dataset)
validation_init_op = iterator.make_initializer(validation_dataset)
# Run 20 epochs in which the training dataset is traversed, followed by the
# validation dataset.
for _ in range(20):
  # Initialize an iterator over the training dataset.
  sess.run(training_init_op)
  for _ in range(100):
    sess.run(next_element)
  # Initialize an iterator over the validation dataset.
  sess.run(validation_init_op)
  for _ in range(50):
    sess.run(next_element)
# Define training and validation datasets with the same structure.
training_dataset = tf.data.Dataset.range(100).map(
    lambda x: x + tf.random_uniform([], -10, 10, tf.int64)).repeat()
validation_dataset = tf.data.Dataset.range(50)
# A feedable iterator is defined by a handle placeholder and its structure. We
# could use the `output_types` and `output_shapes` properties of either
# `training_dataset` or `validation_dataset` here, because they have
# identical structure.
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, training_dataset.output_types, training_dataset.output_shapes)
next_element = iterator.get_next()
# You can use feedable iterators with a variety of different kinds of iterator
# (such as one-shot and initializable iterators).
training_iterator = training_dataset.make_one_shot_iterator()
validation_iterator = validation_dataset.make_initializable_iterator()
# The `Iterator.string_handle()` method returns a tensor that can be evaluated
# and used to feed the `handle` placeholder.
training_handle = sess.run(training_iterator.string_handle())
validation_handle = sess.run(validation_iterator.string_handle())
# Loop forever, alternating between training and validation.
while True:
  # Run 200 steps using the training dataset. Note that the training dataset is
  # infinite, and we resume from where we left off in the previous `while` loop
  # iteration.
  for _ in range(200):
    sess.run(next_element, feed_dict={handle: training_handle})
  # Run one pass over the validation dataset.
  sess.run(validation_iterator.initializer)
  for _ in range(50):
    sess.run(next_element, feed_dict={handle: validation_handle})
The tf.contrib.data.make_saveable_from_iterator function creates a SaveableObject from an iterator, which can be used to save and restore the current state of the iterator (effectively, the whole input pipeline). Once created, a saveable object can be added to the tf.train.Saver variables list or to the tf.GraphKeys.SAVEABLE_OBJECTS collection for saving and restoring, in the same manner as a tf.Variable. See Saving and Restoring for details on how to save and restore variables.
# Create saveable object from iterator.
saveable = tf.contrib.data.make_saveable_from_iterator(iterator)
# Save the iterator state by adding it to the saveable objects collection.
tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable)
saver = tf.train.Saver()
with tf.Session() as sess:
  if should_checkpoint:
    saver.save(sess, path_to_checkpoint)
# Restore the iterator state.
with tf.Session() as sess:
  saver.restore(sess, path_to_checkpoint)
# Load the training data into two NumPy arrays, for example using `np.load()`.
with np.load("/var/data/training_data.npy") as data:
  features = data["features"]
  labels = data["labels"]
# Assume that each row of `features` corresponds to the same row as `labels`.
assert features.shape[0] == labels.shape[0]
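With the arrays in hand, the simplest pipeline embeds them in the graph as constants via Dataset.from_tensor_slices(); a minimal continuation of the snippet above (note that this copies the arrays into the graph, which is only practical for small datasets):

# Embed the arrays directly in the graph as `tf.constant()` operations.
dataset = tf.data.Dataset.from_tensor_slices((features, labels))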
As an alternative, you can define the dataset in terms of tf.placeholder() tensors and feed the NumPy arrays in when you initialize an iterator, which avoids embedding large arrays in the graph. The data is loaded the same way:

# Load the training data into two NumPy arrays, for example using `np.load()`.
with np.load("/var/data/training_data.npy") as data:
  features = data["features"]
  labels = data["labels"]
# Assume that each row of `features` corresponds to the same row as `labels`.
assert features.shape[0] == labels.shape[0]
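A minimal sketch of the placeholder-based pipeline (any intermediate transformations on the dataset are elided):

# Define the dataset over placeholders and feed the arrays at init time.
features_placeholder = tf.placeholder(features.dtype, features.shape)
labels_placeholder = tf.placeholder(labels.dtype, labels.shape)

dataset = tf.data.Dataset.from_tensor_slices((features_placeholder,
                                              labels_placeholder))
iterator = dataset.make_initializable_iterator()

sess.run(iterator.initializer, feed_dict={features_placeholder: features,
                                          labels_placeholder: labels})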
# Creates a dataset that reads all of the examples from two files.
filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
dataset = tf.data.TFRecordDataset(filenames)
filenames = tf.placeholder(tf.string, shape=[None])
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(...)  # Parse the record into tensors.
dataset = dataset.repeat()  # Repeat the input indefinitely.
dataset = dataset.batch(32)
iterator = dataset.make_initializable_iterator()
# You can feed the initializer with the appropriate filenames for the current
# phase of execution, e.g. training vs. validation.
# Initialize `iterator` with training data.
training_filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
sess.run(iterator.initializer, feed_dict={filenames: training_filenames})
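The same initializer can later be rerun with a different file list, for example when switching to validation (the validation paths below are illustrative):

# Initialize `iterator` with validation data.
validation_filenames = ["/var/data/validation1.tfrecord",
                        "/var/data/validation2.tfrecord"]
sess.run(iterator.initializer, feed_dict={filenames: validation_filenames})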
# Use `Dataset.flat_map()` to transform each file as a separate nested dataset,
# and then concatenate their contents sequentially into a single "flat" dataset.
# * Skip the first line (header row).
# * Filter out lines beginning with "#" (comments).
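A minimal sketch of this recipe using tf.data.TextLineDataset (the text-file paths are illustrative):

filenames = ["/var/data/file1.txt", "/var/data/file2.txt"]
dataset = tf.data.Dataset.from_tensor_slices(filenames)
dataset = dataset.flat_map(
    lambda filename: (
        tf.data.TextLineDataset(filename)
        .skip(1)  # Skip the header row.
        .filter(lambda line: tf.not_equal(tf.substr(line, 0, 1), "#"))))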
# Creates a dataset that reads all of the records from two CSV files, each with
# eight float columns.
filenames = ["/var/data/file1.csv", "/var/data/file2.csv"]
record_defaults = [tf.float32] * 8  # Eight required float columns
dataset = tf.contrib.data.CsvDataset(filenames, record_defaults)
If some columns may be empty, you can provide default values instead of types.
# Creates a dataset that reads all of the records from two CSV files, each with
# eight float columns, which may have missing values.
record_defaults = [[0.0]] * 8
dataset = tf.contrib.data.CsvDataset(filenames, record_defaults)
# Creates a dataset that reads all of the records from two CSV files with
# headers, extracting float data from columns 2 and 4.
record_defaults = [[0.0]] * 2  # Only provide defaults for the selected columns
dataset = tf.contrib.data.CsvDataset(filenames, record_defaults, header=True,
                                     select_cols=[2, 4])
# Transforms a scalar string `example_proto` into a pair of a scalar string and
# a scalar integer, representing an image and its label, respectively. Note
# that `tf.parse_single_example()` only supports tf.float32, tf.int64, and
# tf.string, so the label feature must be declared as tf.int64.
def _parse_function(example_proto):
  features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
              "label": tf.FixedLenFeature((), tf.int64, default_value=0)}
  parsed_features = tf.parse_single_example(example_proto, features)
  return parsed_features["image"], parsed_features["label"]
# Creates a dataset that reads all of the examples from two files, and extracts
# the image and label features.
filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(_parse_function)
# Reads an image from a file, decodes it into a dense tensor, and resizes it
# to a fixed shape.
def _parse_function(filename, label):
  image_string = tf.read_file(filename)
  image_decoded = tf.image.decode_jpeg(image_string)
  image_resized = tf.image.resize_images(image_decoded, [28, 28])
  return image_resized, label
# A vector of filenames.
filenames = tf.constant(["/var/data/image1.jpg", "/var/data/image2.jpg", ...])
# `labels[i]` is the label for the image in `filenames[i]`.
labels = tf.constant([0, 37, ...])
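These pieces are typically wired together by slicing the filename and label tensors into a dataset and mapping the parse function over each element; a minimal sketch, following the _parse_function defined above:

dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(_parse_function)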
import cv2

# Use a custom OpenCV function to read the image, instead of the standard
# TensorFlow `tf.read_file()` operation.
def _read_py_function(filename, label):
  image_decoded = cv2.imread(filename.decode(), cv2.IMREAD_GRAYSCALE)
  return image_decoded, label
# Use standard TensorFlow operations to resize the image to a fixed shape.
def _resize_function(image_decoded, label):
  image_decoded.set_shape([None, None, None])
  image_resized = tf.image.resize_images(image_decoded, [28, 28])
  return image_resized, label
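Because _read_py_function runs arbitrary Python, it must be invoked through tf.py_func inside Dataset.map(); a minimal sketch, assuming filenames and labels are plain Python lists (the paths and labels below are illustrative):

filenames = ["/var/data/image1.jpg", "/var/data/image2.jpg"]
labels = [0, 37]

dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
# Wrap the Python function with `tf.py_func`; the output types match the
# (image, label) pair that `_read_py_function` returns.
dataset = dataset.map(
    lambda filename, label: tuple(tf.py_func(
        _read_py_function, [filename, label], [tf.uint8, label.dtype])))
dataset = dataset.map(_resize_function)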
The tf.train.MonitoredTrainingSession API simplifies many aspects of running TensorFlow in a distributed setting. MonitoredTrainingSession uses tf.errors.OutOfRangeError to signal that training has completed, so when using it with the tf.data API we recommend Dataset.make_one_shot_iterator(). For example:
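A minimal sketch of this pattern, where parse_fn, num_epochs, and model_function stand in for your own record parser, epoch count, and model code:

filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(parse_fn)      # `parse_fn`: your record parser.
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(32)
dataset = dataset.repeat(num_epochs)
iterator = dataset.make_one_shot_iterator()

next_example, next_label = iterator.get_next()
loss = model_function(next_example, next_label)  # Hypothetical model fn.
training_op = tf.train.AdagradOptimizer(0.01).minimize(loss)

# When the one-shot iterator is exhausted it raises `tf.errors.OutOfRangeError`,
# which `MonitoredTrainingSession` turns into `should_stop() == True`.
with tf.train.MonitoredTrainingSession() as sess:
  while not sess.should_stop():
    sess.run(training_op)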
# To use a `Dataset` in the `input_fn` of a `tf.estimator.Estimator`, the same
# recommendation applies: wrap the pipeline in a function that builds a
# one-shot iterator and returns the next batch.
def dataset_input_fn():
  filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
  dataset = tf.data.TFRecordDataset(filenames)

  # Use `Dataset.map()` to build a pair of a feature dictionary and a label
  # tensor for each example.
  dataset = dataset.map(parser)
  dataset = dataset.shuffle(buffer_size=10000)
  dataset = dataset.batch(32)
  dataset = dataset.repeat(num_epochs)
  iterator = dataset.make_one_shot_iterator()
  # `features` is a dictionary in which each value is a batch of values for
  # that feature; `labels` is a batch of labels.
  features, labels = iterator.get_next()
  return features, labels
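The function can then be passed straight to an estimator, for instance (assuming a hypothetical estimator built with tf.estimator.Estimator):

estimator.train(input_fn=dataset_input_fn)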