vnfs/DAaaS/sample-apps/training/sample-horovod-app/keras_mnist_advanced_modified.py

   1 from __future__ import print_function
   2 import keras
   3 import os
   4 from tensorflow.keras.datasets import mnist
   5 from tensorflow.keras.models import Sequential
   6 from tensorflow.keras.layers import Dense, Dropout, Flatten
   7 from tensorflow.keras.layers import Conv2D, MaxPooling2D
   8 from tensorflow.keras.preprocessing.image import ImageDataGenerator
   9 from tensorflow.keras import backend as K
  10 from tensorflow_estimator.python.estimator.export import export as export_helpers
  11 from tensorflow.python.saved_model import builder as saved_model_builder
  12 from tensorflow.python.saved_model import tag_constants, signature_constants
  13 from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
  14 import tensorflow as tf
  15 import horovod.tensorflow.keras as hvd
  16
  17
  18 # Horovod: initialize Horovod.
  19 hvd.init()
  20
  21 # Horovod: pin GPU to be used to process local rank (one GPU per process)
  22 config = tf.ConfigProto()
  23 #config.gpu_options.allow_growth = True
  24 #config.gpu_options.visible_device_list = str(hvd.local_rank())
  25 K.set_session(tf.Session(config=config))
  26
  27 batch_size = 128
  28 num_classes = 10
  29
  30 # Enough epochs to demonstrate learning rate warmup and the reduction of
  31 # learning rate when training plateaues.
  32 epochs = 24
  33
  34 # Input image dimensions
  35 img_rows, img_cols = 28, 28
  36
  37 # The data, shuffled and split between train and test sets
  38 (x_train, y_train), (x_test, y_test) = mnist.load_data()
  39
  40 # Determine how many batches are there in train and test sets
  41 train_batches = len(x_train) // batch_size
  42 test_batches = len(x_test) // batch_size
  43
  44 if K.image_data_format() == 'channels_first':
  45     x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
  46     x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
  47     input_shape = (1, img_rows, img_cols)
  48 else:
  49     x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
  50     x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
  51     input_shape = (img_rows, img_cols, 1)
  52
  53 x_train = x_train.astype('float32')
  54 x_test = x_test.astype('float32')
  55 x_train /= 255
  56 x_test /= 255
  57 print('x_train shape:', x_train.shape)
  58 print(x_train.shape[0], 'train samples')
  59 print(x_test.shape[0], 'test samples')
  60
  61 # Convert class vectors to binary class matrices
  62 y_train = tf.keras.utils.to_categorical(y_train, num_classes)
  63 y_test = tf.keras.utils.to_categorical(y_test, num_classes)
  64
  65 model = Sequential()
  66 model.add(Conv2D(32, kernel_size=(3, 3),
  67                  activation='relu',
  68                  input_shape=input_shape))
  69 model.add(Conv2D(64, (3, 3), activation='relu'))
  70 model.add(MaxPooling2D(pool_size=(2, 2)))
  71 model.add(Dropout(0.25))
  72 model.add(Flatten())
  73 model.add(Dense(128, activation='relu'))
  74 model.add(Dropout(0.5))
  75 model.add(Dense(num_classes, activation='softmax'))
  76
  77 # Horovod: adjust learning rate based on number of GPUs.
  78 opt = tf.keras.optimizers.Adadelta(lr=1.0 * hvd.size())
  79
  80 # Horovod: add Horovod Distributed Optimizer.
  81 opt = hvd.DistributedOptimizer(opt)
  82
  83 model.compile(loss=tf.keras.losses.categorical_crossentropy,
  84               optimizer=opt,
  85               metrics=['accuracy'])
  86
  87 callbacks = [
  88     # Horovod: broadcast initial variable states from rank 0 to all other processes.
  89     # This is necessary to ensure consistent initialization of all workers when
  90     # training is started with random weights or restored from a checkpoint.
  91     hvd.callbacks.BroadcastGlobalVariablesCallback(0),
  92
  93     # Horovod: average metrics among workers at the end of every epoch.
  94     #
  95     # Note: This callback must be in the list before the ReduceLROnPlateau,
  96     # TensorBoard or other metrics-based callbacks.
  97     hvd.callbacks.MetricAverageCallback(),
  98
  99     # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
 100     # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
 101     # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
 102     hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
 103
 104     # Reduce the learning rate if training plateaues.
 105     tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
 106 ]
 107
 108 # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
 109 if hvd.rank() == 0:
 110     callbacks.append(tf.keras.callbacks.ModelCheckpoint(
 111         './checkpoint-{epoch}.h5'))
 112
 113 # Set up ImageDataGenerators to do data augmentation for the training images.
 114 train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3,
 115                                height_shift_range=0.08, zoom_range=0.08)
 116 test_gen = ImageDataGenerator()
 117
 118 # Train the model.
 119 # Horovod: the training will randomly sample 1 / N batches of training data and
 120 # 3 / N batches of validation data on every worker, where N is the number of workers.
 121 # Over-sampling of validation data helps to increase probability that every validation
 122 # example will be evaluated.
 123 model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size),
 124                     steps_per_epoch=train_batches // hvd.size(),
 125                     callbacks=callbacks,
 126                     epochs=epochs,
 127                     verbose=1,
 128                     validation_data=test_gen.flow(
 129                         x_test, y_test, batch_size=batch_size),
 130                     validation_steps=3 * test_batches // hvd.size())
 131
 132 # Evaluate the model on the full data set.
 133 score = model.evaluate(x_test, y_test, verbose=0)
 134 print('Test loss:', score[0])
 135 print('Test accuracy:', score[1])
 136
 137 # Save Model to Minio
 138 if hvd.rank() == 0:
 139     print('Model Summary')
 140     model.summary()
 141     print('Exporting trained model to Minio Model Repo')
 142     base_path = os.environ['MODEL_BASE_PATH']
 143
 144     # Option 1(Preferred) - Using Keras api and Tensorflow v1.13 version
 145     saved_model_path = tf.contrib.saved_model.save_keras_model(model, base_path)
 146     print('Model Saved to {} Using new Keras API!!!'.format(saved_model_path))
 147     # Option 2 - Tensorflow v1.13+ Builder saved_model api.
 148     # builder = saved_model_builder.SavedModelBuilder(base_path)
 149
 150     # print(model.input)
 151     # print(model.outputs)
 152
 153     # signature = predict_signature_def(inputs={"inputs": model.input},
 154     #                                   outputs={t.name:t for t in model.outputs})
 155     # print(signature)
 156     # K.set_learning_phase(0)
 157     # with K.get_session() as sess:
 158     #     builder.add_meta_graph_and_variables(sess=sess,
 159     #                                          tags=[tag_constants.SERVING],
 160     #                                          signature_def_map={'predict': signature})
 161     #     builder.save()
 162     # print('Model Saved to S3 Using Builder!!!')
 163
 164     # Option 3 - Tensorflow v1.13 Will be deprecated in Tensorflow v2
 165     # tf.saved_model.simple_save(
 166     #     keras.backend.get_session(),
 167     #     export_path,
 168     #     inputs={'input_image': model.input},
 169     #     outputs={t.name: t for t in model.outputs})