|
| 1 | +#!/usr/bin/env python |
| 2 | +# coding: utf-8 |
| 3 | + |
| 4 | +import tensorflow as tf |
| 5 | +import coremltools as ct |
| 6 | +import numpy as np |
| 7 | + |
# Input image size expected by the MobileDet SSD graph (320x320 RGB).
input_height = 320
input_width = 320

# TF graph tensor names used as the conversion boundary: the preprocessed
# image feeds the feature extractor, and the two outputs are the raw box
# encodings ('Squeeze') and the per-class scores. This skips TF's own
# pre/post-processing ops, which are re-created in Core ML further below.
input_node = 'Preprocessor/map/TensorArrayStack/TensorArrayGatherV3'
bbox_output_node = 'Squeeze'
class_output_node = 'Postprocessor/convert_scores'
| 14 | + |
# Load the (TF1-style) SavedModel and prune it down to the subgraph between
# the preprocessed-image tensor and the two raw detection outputs.
original_model = tf.saved_model.load('mobiledet')
pruned_model = original_model.prune(input_node+':0', [bbox_output_node+':0', class_output_node+':0'])

inputs=[ct.TensorType(name=input_node, shape=(1, input_height, input_width, 3))]
ssd_model = ct.convert([pruned_model], source='tensorflow', inputs=inputs)

spec = ssd_model.get_spec()

# Rename the converted features to short, stable names for the decode layers
# added later. NOTE(review): this relies on the converter mangling '/' to '_'
# in TF tensor names — confirm with the coremltools version in use.
ct.utils.rename_feature(spec, input_node.replace('/', '_'), 'image')
ct.utils.rename_feature(spec, class_output_node.replace('/', '_'), 'scores')
ct.utils.rename_feature(spec, bbox_output_node.replace('/', '_'), 'boxes')

# NOTE(review): assumes output[0] is the scores feature and output[1] the
# boxes feature — verify the converted spec's output ordering.
spec.description.output[0].shortDescription = "Predicted class scores for each bounding box"
spec.description.output[1].shortDescription = "Predicted coordinates for each bounding box"

# Detection-head dimensions; num_classes is presumably the COCO label count
# (metadata below says the model was trained on COCO) and is unused here.
num_classes = 90
num_anchors = 2034

# Save an intermediate model: raw outputs, no box decoding / NMS yet.
ssd_model = ct.models.MLModel(spec)
ssd_model.save('/tmp/mobiledet.mlmodel')
| 35 | + |
def get_anchors(start_tensor, end_tensor):
    """
    Computes the list of anchor boxes by sending a fake image through the graph.
    Outputs an array of size (4, num_anchors) where each element is an anchor box
    given as [ycenter, xcenter, height, width] in normalized coordinates.
    """
    # Prune the full graph down to the path producing the anchor tensor and
    # run it on an all-zero uint8 image (the anchors are input-independent).
    anchor_fn = original_model.prune(start_tensor, [end_tensor])
    dummy_image = tf.zeros((1, input_height, input_width, 3), tf.uint8)
    corners = tf.squeeze(anchor_fn(dummy_image))

    # The TF graph emits each anchor as [ymin, xmin, ymax, xmax]; convert the
    # min/max corners into center coordinates plus height and width.
    ymin, xmin, ymax, xmax = np.transpose(corners)
    box_h = ymax - ymin
    box_w = xmax - xmin
    return np.stack([ymin + box_h / 2., xmin + box_w / 2., box_h, box_w])
| 53 | + |
# Read the anchors into a (4, 2034) tensor.
start_tensor = "image_tensor:0"
anchors_tensor = "Concatenate/concat:0"
anchors = get_anchors(start_tensor, anchors_tensor)

# Validate with a real exception: a bare `assert` is stripped under
# `python -O`, which would let a shape mismatch slip through to the
# Core ML builder below where the constant shapes would silently disagree.
if anchors.shape[1] != num_anchors:
    raise ValueError("anchor count mismatch: expected %d, got %d"
                     % (num_anchors, anchors.shape[1]))
| 59 | + |
| 60 | + |
| 61 | +from coremltools.models import datatypes |
| 62 | +from coremltools.models import neural_network |
| 63 | +from coremltools.proto.FeatureTypes_pb2 import ArrayFeatureType |
| 64 | + |
# Re-open the converted spec with a builder so the box-decoding and NMS
# layers can be appended after the network's raw outputs.
spec = ssd_model.get_spec()
builder = neural_network.NeuralNetworkBuilder(spec=spec, use_float_arraytype=True)

# Move the 4 box coordinates into the channel axis so the slice layers
# below can split them along "channel".
builder.add_permute(name="permute_boxed",
                    dim=(0, 3, 2, 1),
                    input_name='boxes',
                    output_name="permute_boxes_output")
| 73 | + |
# Grab the y, x coordinates (channels 0-1).
builder.add_slice(name="slice_yx",
                  input_name="permute_boxes_output",
                  output_name="slice_yx_output",
                  axis="channel",
                  start_index=0,
                  end_index=2)

# boxes_yx / 10  (the SSD y/x variance scale, applied as multiply-by-0.1)
builder.add_elementwise(name="scale_yx",
                        input_names="slice_yx_output",
                        output_name="scale_yx_output",
                        mode="MULTIPLY",
                        alpha=0.1)

# Bake the anchors into the model as two (2, 2034, 1) constant tensors:
# centers (y, x) and sizes (height, width).
anchors_yx = np.expand_dims(anchors[:2, :], axis=-1)
anchors_hw = np.expand_dims(anchors[2:, :], axis=-1)

builder.add_load_constant(name="anchors_yx",
                          output_name="anchors_yx",
                          constant_value=anchors_yx,
                          shape=[2, num_anchors, 1])

builder.add_load_constant(name="anchors_hw",
                          output_name="anchors_hw",
                          constant_value=anchors_hw,
                          shape=[2, num_anchors, 1])

# (boxes_yx / 10) * anchors_hw
builder.add_elementwise(name="yw_times_hw",
                        input_names=["scale_yx_output", "anchors_hw"],
                        output_name="yw_times_hw_output",
                        mode="MULTIPLY")

# (boxes_yx / 10) * anchors_hw + anchors_yx  -> decoded box centers
builder.add_elementwise(name="decoded_yx",
                        input_names=["yw_times_hw_output", "anchors_yx"],
                        output_name="decoded_yx_output",
                        mode="ADD")
| 114 | + |
# Grab the height and width (channels 2-3).
builder.add_slice(name="slice_hw",
                  input_name="permute_boxes_output",
                  output_name="slice_hw_output",
                  axis="channel",
                  start_index=2,
                  end_index=4)

# boxes_hw / 5  (the SSD h/w variance scale, applied as multiply-by-0.2)
builder.add_elementwise(name="scale_hw",
                        input_names="slice_hw_output",
                        output_name="scale_hw_output",
                        mode="MULTIPLY",
                        alpha=0.2)

# exp(boxes_hw / 5)
builder.add_unary(name="exp_hw",
                  input_name="scale_hw_output",
                  output_name="exp_hw_output",
                  mode="exp")

# exp(boxes_hw / 5) * anchors_hw  -> decoded box sizes
builder.add_elementwise(name="decoded_hw",
                        input_names=["exp_hw_output", "anchors_hw"],
                        output_name="decoded_hw_output",
                        mode="MULTIPLY")
| 141 | + |
# The coordinates are now (y, x) and (height, width) but NonMaximumSuppression
# wants them as (x, y, width, height). So create four slices and then concat
# them into the right order.
builder.add_slice(name="slice_y",
                  input_name="decoded_yx_output",
                  output_name="slice_y_output",
                  axis="channel",
                  start_index=0,
                  end_index=1)

builder.add_slice(name="slice_x",
                  input_name="decoded_yx_output",
                  output_name="slice_x_output",
                  axis="channel",
                  start_index=1,
                  end_index=2)

builder.add_slice(name="slice_h",
                  input_name="decoded_hw_output",
                  output_name="slice_h_output",
                  axis="channel",
                  start_index=0,
                  end_index=1)

builder.add_slice(name="slice_w",
                  input_name="decoded_hw_output",
                  output_name="slice_w_output",
                  axis="channel",
                  start_index=1,
                  end_index=2)

# Concatenate along the channel axis, producing (x, y, w, h).
builder.add_elementwise(name="concat2",
                        input_names=["slice_x_output", "slice_y_output",
                                     "slice_w_output", "slice_h_output"],
                        output_name="concat_output",
                        mode="CONCAT")

# Identity permute (0, 1, 2, 3): no reordering — used purely to expose the
# tensor under the name "raw_coordinates" for the NMS layer below.
builder.add_permute(name="permute_output",
                    dim=(0, 1, 2, 3),
                    input_name="concat_output",
                    output_name="raw_coordinates")
| 183 | + |
# Append NonMaximumSuppression over the decoded boxes and class scores.
# NOTE(review): the positional args presumably map to IoU threshold 0.3,
# score threshold 0.3, max 10 boxes, per-class suppression enabled —
# confirm against the coremltools add_nms signature in use.
input_names = ['raw_coordinates', 'scores']
output_names = ['coordinates', 'confidence','box_index', 'number_of_boxes']
builder.add_nms('nms', input_names, output_names, 0.3, 0.3, 10, True)
| 187 | + |
# TF/TFLite and Core ML use different box encodings:
#   TF/TFLite: (y_1, x_1, y_2, x_2)           -- corner form
#   Core ML:   (x_center, y_center, width, height)
# Use a matrix multiplication to convert; e.g. the first output column:
#   (x_center, y_center, width, height) . (0, 1, 0, -0.5)^T
#     = y_center - height / 2 = y_1
# and similarly for x_1, y_2, x_2 (columns of `t` below).

t = np.array([
    [0, 1, 0, 1],
    [1, 0, 1, 0],
    [0, -0.5, 0, 0.5],
    [-0.5, 0, 0.5, 0]])

builder.add_batched_mat_mul('boxes_transform',
                            ['coordinates'],
                            'detection_boxes',
                            weight_matrix_rows=4,
                            weight_matrix_columns=4,
                            W=t)

# Class index = argmax over the class axis, then subtract 1.
# NOTE(review): the -1 presumably drops a background class at index 0 so the
# indices match the TFLite model's labels — confirm against 'scores' layout.
builder.add_argmax('confidence_argmax', 'confidence', 'detection_classes_1', 2)
builder.add_load_constant('the_one', 'the_one',
                          constant_value=np.ones(1),
                          shape=[1,1,1])
builder.add_subtract_broadcastable('sub_1', ['detection_classes_1', 'the_one'], 'detection_classes')
# Best score per kept box = max over the class axis.
builder.add_reduce_max('confidence_max', 'confidence', 'detection_scores', [2])
| 214 | + |
# The spec currently describes two outputs; add two more empty slots so all
# four TFLite-style outputs can be described, then let set_output fill them.
spec.description.output.add()
spec.description.output.add()

output_names = ['detection_boxes', 'detection_classes', 'detection_scores', 'number_of_boxes']
output_dims = [(1, 10, 4), (1, 10), (1, 10), (1,)]
builder.set_output(output_names, output_dims)
for i in range(4):
    spec.description.output[i].type.multiArrayType.dataType = ArrayFeatureType.FLOAT32

# Model card metadata for the exported .mlmodel.
spec.description.metadata.versionString = "ssdlite_mobiledet_edgetpu_320x320_coco_2020_05_19"
spec.description.metadata.shortDescription = "MobileDet, trained on COCO"
spec.description.metadata.author = "Converted to Core ML by Koan-Sin Tan. Original Authors: Yunyang Xiong, Hanxiao Liu, Suyog Gupta, Berkin Akin, Gabriel Bender, Yongzhe Wang, Pieter-Jan Kindermans, Mingxing Tan, Vikas Singh, Bo Chen"
spec.description.metadata.license = "https://github.com/tensorflow/models/blob/master/research/object_detection"
| 228 | + |
# Build and save the final model: raw network + box decoding + NMS, with
# outputs shaped to match the TFLite reference model.
final_model = ct.models.MLModel(builder.spec)
final_model.save('MobileDet_4_outputs_in_one_model_matching_tflite_outputs.mlmodel')

print(final_model)
print("Done!")
| 234 | + |
0 commit comments