1 from __future__ import print_function
3 from tensorflow.keras.datasets import mnist
4 from tensorflow.keras.models import Sequential
5 from tensorflow.keras.layers import Dense, Dropout, Flatten
6 from tensorflow.keras.layers import Conv2D, MaxPooling2D
7 from tensorflow.keras.preprocessing.image import ImageDataGenerator
8 from tensorflow.keras import backend as K
9 import tensorflow as tf
10 import horovod.tensorflow.keras as hvd
13 # Horovod: initialize Horovod.
16 # Horovod: pin GPU to be used to process local rank (one GPU per process)
17 config = tf.ConfigProto()
18 #config.gpu_options.allow_growth = True
19 #config.gpu_options.visible_device_list = str(hvd.local_rank())
20 K.set_session(tf.Session(config=config))
25 # Enough epochs to demonstrate learning rate warmup and the reduction of
26 # learning rate when training plateaues.
29 # Input image dimensions
30 img_rows, img_cols = 28, 28
32 # The data, shuffled and split between train and test sets
33 (x_train, y_train), (x_test, y_test) = mnist.load_data()
35 # Determine how many batches are there in train and test sets
36 train_batches = len(x_train) // batch_size
37 test_batches = len(x_test) // batch_size
39 if K.image_data_format() == 'channels_first':
40 x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
41 x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
42 input_shape = (1, img_rows, img_cols)
44 x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
45 x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
46 input_shape = (img_rows, img_cols, 1)
48 x_train = x_train.astype('float32')
49 x_test = x_test.astype('float32')
52 print('x_train shape:', x_train.shape)
53 print(x_train.shape[0], 'train samples')
54 print(x_test.shape[0], 'test samples')
56 # Convert class vectors to binary class matrices
57 y_train = tf.keras.utils.to_categorical(y_train, num_classes)
58 y_test = tf.keras.utils.to_categorical(y_test, num_classes)
61 model.add(Conv2D(32, kernel_size=(3, 3),
63 input_shape=input_shape))
64 model.add(Conv2D(64, (3, 3), activation='relu'))
65 model.add(MaxPooling2D(pool_size=(2, 2)))
66 model.add(Dropout(0.25))
68 model.add(Dense(128, activation='relu'))
69 model.add(Dropout(0.5))
70 model.add(Dense(num_classes, activation='softmax'))
72 # Horovod: adjust learning rate based on number of GPUs.
73 opt = tf.keras.optimizers.Adadelta(lr=1.0 * hvd.size())
75 # Horovod: add Horovod Distributed Optimizer.
76 opt = hvd.DistributedOptimizer(opt)
78 model.compile(loss=tf.keras.losses.categorical_crossentropy,
83 # Horovod: broadcast initial variable states from rank 0 to all other processes.
84 # This is necessary to ensure consistent initialization of all workers when
85 # training is started with random weights or restored from a checkpoint.
86 hvd.callbacks.BroadcastGlobalVariablesCallback(0),
88 # Horovod: average metrics among workers at the end of every epoch.
90 # Note: This callback must be in the list before the ReduceLROnPlateau,
91 # TensorBoard or other metrics-based callbacks.
92 hvd.callbacks.MetricAverageCallback(),
94 # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
95 # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
96 # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
97 hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
99 # Reduce the learning rate if training plateaues.
100 tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
103 # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
105 callbacks.append(tf.keras.callbacks.ModelCheckpoint(
106 './checkpoint-{epoch}.h5'))
108 # Set up ImageDataGenerators to do data augmentation for the training images.
109 train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3,
110 height_shift_range=0.08, zoom_range=0.08)
111 test_gen = ImageDataGenerator()
114 # Horovod: the training will randomly sample 1 / N batches of training data and
115 # 3 / N batches of validation data on every worker, where N is the number of workers.
116 # Over-sampling of validation data helps to increase probability that every validation
117 # example will be evaluated.
118 model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size),
119 steps_per_epoch=train_batches // hvd.size(),
123 validation_data=test_gen.flow(
124 x_test, y_test, batch_size=batch_size),
125 validation_steps=3 * test_batches // hvd.size())
127 # Evaluate the model on the full data set.
128 score = model.evaluate(x_test, y_test, verbose=0)
129 print('Test loss:', score[0])
130 print('Test accuracy:', score[1])
132 # Save Model to Minio
134 print('Model Summary')
136 print('Exporting trained model to Minio Model Repo')
137 base_path = os.environ['MODEL_BASE_PATH']
139 # Option 1(Preferred) - Using Keras api and Tensorflow v1.13 version
140 saved_model_path = tf.contrib.saved_model.save_keras_model(model, base_path)
141 print('Model Saved to {} Using new Keras API!!!'.format(saved_model_path))
142 # Option 2 - Tensorflow v1.13+ Builder saved_model api.
143 # builder = saved_model_builder.SavedModelBuilder(base_path)
146 # print(model.outputs)
148 # signature = predict_signature_def(inputs={"inputs": model.input},
149 # outputs={t.name:t for t in model.outputs})
151 # K.set_learning_phase(0)
152 # with K.get_session() as sess:
153 # builder.add_meta_graph_and_variables(sess=sess,
154 # tags=[tag_constants.SERVING],
155 # signature_def_map={'predict': signature})
157 # print('Model Saved to S3 Using Builder!!!')
159 # Option 3 - Tensorflow v1.13 Will be deprecated in Tensorflow v2
160 # tf.saved_model.simple_save(
161 # keras.backend.get_session(),
163 # inputs={'input_image': model.input},
164 # outputs={t.name: t for t in model.outputs})