vgslspecs.py

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
  15. """String network description language mapping to TF-Slim calls where possible.
  16. See vglspecs.md for detailed description.
  17. """
import re
from string import maketrans

import nn_ops
import shapes
import tensorflow as tf
import tensorflow.contrib.slim as slim


# Class that builds a set of ops to manipulate variable-sized images.
class VGSLSpecs(object):
  """Layers that can be built from a string definition."""

  def __init__(self, widths, heights, is_training):
    """Constructs a VGSLSpecs.

    Args:
      widths: Tensor of size batch_size of the widths of the inputs.
      heights: Tensor of size batch_size of the heights of the inputs.
      is_training: True if the graph should be built for training.
    """
    # The string that was used to build this model.
    self.model_str = None
    # True if we are training.
    self.is_training = is_training
    # Tensors for the sizes of the images, each of size batch_size.
    self.widths = widths
    self.heights = heights
    # Overall reduction factors of this model so far for each dimension.
    # TODO(rays) consider building a graph from widths and heights instead of
    # computing a scale factor.
    self.reduction_factors = [1.0, 1.0, 1.0, 1.0]
    # List of Op parsers.
    # TODO(rays) add more Op types as needed.
    self.valid_ops = [self.AddSeries, self.AddParallel, self.AddConvLayer,
                      self.AddMaxPool, self.AddDropout, self.AddReShape,
                      self.AddFCLayer, self.AddLSTMLayer]
    # Translation table to convert characters that may occur in op strings but
    # cannot be used in op names.
    self.transtab = maketrans('(,)', '___')

  def Build(self, prev_layer, model_str):
    """Builds a network with input prev_layer from a VGSLSpecs description.

    Args:
      prev_layer: The input tensor.
      model_str: Model definition similar to Tesseract as follows:
        ============ FUNCTIONAL OPS ============
        C(s|t|r|l|m)[{name}]<y>,<x>,<d> Convolves using a y,x window, with no
          shrinkage, SAME infill, d outputs, with s|t|r|l|m non-linear layer.
        (s|t|r|l|m) specifies the type of non-linearity:
          s = sigmoid
          t = tanh
          r = relu
          l = linear (i.e., None)
          m = softmax
        F(s|t|r|l|m)[{name}]<d> Fully-connected with s|t|r|l|m non-linearity and
          d outputs. Reduces height, width to 1. Input height and width must be
          constant.
        L(f|r|b)(x|y)[s][{name}]<n> LSTM cell with n outputs.
          f runs the LSTM forward only.
          r runs the LSTM reversed only.
          b runs the LSTM bidirectionally.
          x runs the LSTM in the x-dimension (on data with or without the
            y-dimension).
          y runs the LSTM in the y-dimension (data must have a y dimension).
          s (optional) summarizes the output in the requested dimension,
            outputting only the final step, collapsing the dimension to a
            single element.
          Examples:
            Lfx128 runs a forward-only LSTM in the x-dimension with 128
              outputs, treating any y dimension independently.
            Lfys64 runs a forward-only LSTM in the y-dimension with 64 outputs
              and collapses the y-dimension to 1 element.
          NOTE that Lbxsn is implemented as (LfxsnLrxsn) since the summaries
          need to be taken from opposite ends of the output.
        Do[{name}] Insert a dropout layer.
        ============ PLUMBING OPS ============
        [...] Execute ... networks in series (layers).
        (...) Execute ... networks in parallel, with their output concatenated
          in depth.
        S[{name}]<d>(<a>x<b>)<e>,<f> Splits one dimension, moves one part to
          another dimension.
          Splits input dimension d into a x b, sending the high part (a) to the
          high side of dimension e, and the low part (b) to the high side of
          dimension f. Exception: if d=e=f, then dimension d is internally
          transposed to bxa.
          Either a or b can be zero, meaning whatever is left after taking out
          the other, allowing dimensions to be of variable size.
          Eg. S3(3x50)2,3 will split the 150-element depth into 3x50, with the 3
          going to the most significant part of the width, and the 50 part
          staying in depth.
          This will rearrange a 3x50 output parallel operation to spread the 3
          output sets over width.
        Mp[{name}]<y>,<x> Maxpool the input, reducing the (y,x) rectangle to a
          single vector value.

    Returns:
      Output tensor.
    """
    self.model_str = model_str
    final_layer, _ = self.BuildFromString(prev_layer, 0)
    return final_layer
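
  # Hedged usage sketch (not part of the original file). Assuming `images` is a
  # [batch, height, width, depth] float tensor and `widths`/`heights` hold the
  # per-image sizes:
  #
  #   specs = VGSLSpecs(widths, heights, is_training=True)
  #   output = specs.Build(images, '[Cr5,5,16 Mp2,2 Lfys64 Lbx128]')
  #
  # builds conv(5x5, 16, relu) -> maxpool(2x2) -> summarizing forward
  # y-LSTM(64) -> bidirectional x-LSTM(128).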

  def GetLengths(self, dim=2, factor=1):
    """Returns the lengths of the batch of elements in the given dimension.

    WARNING: The returned sizes may not exactly match TF's calculation.

    Args:
      dim: Dimension to get the sizes of, in [1, 2]. batch, depth not allowed.
      factor: A scalar value to multiply by.

    Returns:
      The original heights/widths scaled by the current scaling of the model
      and the given factor.

    Raises:
      ValueError: If the args are invalid.
    """
    if dim == 1:
      lengths = self.heights
    elif dim == 2:
      lengths = self.widths
    else:
      raise ValueError('Invalid dimension given to GetLengths')
    lengths = tf.cast(lengths, tf.float32)
    if self.reduction_factors[dim] is not None:
      lengths = tf.div(lengths, self.reduction_factors[dim])
    else:
      lengths = tf.ones_like(lengths)
    if factor != 1:
      lengths = tf.multiply(lengths, tf.cast(factor, tf.float32))
    return tf.cast(lengths, tf.int32)
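
  # For example (hedged illustration, not part of the original file): after a
  # 'Mp2,2' layer has doubled reduction_factors[2], GetLengths(dim=2) maps an
  # original width of 100 to 50.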

  def BuildFromString(self, prev_layer, index):
    """Adds the layers defined by model_str[index:] to the model.

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor, next model_str index.

    Raises:
      ValueError: If the model string is unrecognized.
    """
    index = self._SkipWhitespace(index)
    for op in self.valid_ops:
      output_layer, next_index = op(prev_layer, index)
      if output_layer is not None:
        return output_layer, next_index
    raise ValueError('Unrecognized model string:' + self.model_str[index:])

  def AddSeries(self, prev_layer, index):
    """Builds a sequence of layers for a VGSLSpecs model.

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor of the series, end index in model_str.

    Raises:
      ValueError: If [] are unbalanced.
    """
    if self.model_str[index] != '[':
      return None, None
    index += 1
    while index < len(self.model_str) and self.model_str[index] != ']':
      prev_layer, index = self.BuildFromString(prev_layer, index)
    if index == len(self.model_str):
      raise ValueError('Missing ] at end of series!' + self.model_str)
    return prev_layer, index + 1

  def AddParallel(self, prev_layer, index):
    """tf.concats outputs of layers that run on the same inputs.

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor of the parallel, end index in model_str.

    Raises:
      ValueError: If () are unbalanced or the elements don't match.
    """
    if self.model_str[index] != '(':
      return None, None
    index += 1
    layers = []
    num_dims = 0
    # Each element of the parallel must output the same, including any
    # reduction factor, in all dimensions except depth.
    # Save the starting factors so that each element starts from them and the
    # reduction is applied only once, not compounded across elements.
    original_factors = self.reduction_factors
    final_factors = None
    while index < len(self.model_str) and self.model_str[index] != ')':
      self.reduction_factors = original_factors
      layer, index = self.BuildFromString(prev_layer, index)
      if num_dims == 0:
        num_dims = len(layer.get_shape())
      elif num_dims != len(layer.get_shape()):
        raise ValueError('All elements of parallel must return same num dims')
      layers.append(layer)
      if final_factors:
        if final_factors != self.reduction_factors:
          raise ValueError('All elements of parallel must scale the same')
      else:
        final_factors = self.reduction_factors
    if index == len(self.model_str):
      raise ValueError('Missing ) at end of parallel!' + self.model_str)
    return tf.concat(axis=num_dims - 1, values=layers), index + 1
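
  # For example (hedged illustration, not part of the original file): the spec
  # '(Cr3,3,16 Cr5,5,16)' runs two convolutions on the same input and
  # concatenates their outputs in depth, producing 32 output channels.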

  def AddConvLayer(self, prev_layer, index):
    """Add a single standard convolutional layer.

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor, end index in model_str.
    """
    pattern = re.compile(R'(C)(s|t|r|l|m)({\w+})?(\d+),(\d+),(\d+)')
    m = pattern.match(self.model_str, index)
    if m is None:
      return None, None
    name = self._GetLayerName(m.group(0), index, m.group(3))
    width = int(m.group(4))
    height = int(m.group(5))
    depth = int(m.group(6))
    fn = self._NonLinearity(m.group(2))
    return slim.conv2d(
        prev_layer, depth, [height, width], activation_fn=fn,
        scope=name), m.end()
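
  # For example (hedged illustration, not part of the original file):
  # 'Cr5,5,32' adds a 5x5 convolution with 32 outputs, relu non-linearity and
  # SAME padding, leaving the spatial dimensions unchanged.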

  def AddMaxPool(self, prev_layer, index):
    """Add a maxpool layer.

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor, end index in model_str.
    """
    pattern = re.compile(R'(Mp)({\w+})?(\d+),(\d+)(?:,(\d+),(\d+))?')
    m = pattern.match(self.model_str, index)
    if m is None:
      return None, None
    name = self._GetLayerName(m.group(0), index, m.group(2))
    height = int(m.group(3))
    width = int(m.group(4))
    # Strides default to the pooling window size; explicit strides arrive as
    # strings from the regex and must be converted to ints.
    y_stride = height if m.group(5) is None else int(m.group(5))
    x_stride = width if m.group(6) is None else int(m.group(6))
    self.reduction_factors[1] *= y_stride
    self.reduction_factors[2] *= x_stride
    return slim.max_pool2d(
        prev_layer, [height, width], [y_stride, x_stride],
        padding='SAME',
        scope=name), m.end()
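
  # For example (hedged illustration, not part of the original file): 'Mp2,2'
  # max-pools over 2x2 windows with stride 2, halving height and width and
  # multiplying reduction_factors[1] and [2] by 2.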

  def AddDropout(self, prev_layer, index):
    """Adds a dropout layer.

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor, end index in model_str.
    """
    pattern = re.compile(R'(Do)({\w+})?')
    m = pattern.match(self.model_str, index)
    if m is None:
      return None, None
    name = self._GetLayerName(m.group(0), index, m.group(2))
    layer = slim.dropout(
        prev_layer, 0.5, is_training=self.is_training, scope=name)
    return layer, m.end()

  def AddReShape(self, prev_layer, index):
    """Reshapes the input tensor by moving each (x_scale, y_scale) rectangle to
    the depth dimension.

    NOTE that the TF convention is that inputs are [batch, y, x, depth].

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor, end index in model_str.
    """
    # The optional {name} is captured with its braces, as in the other ops.
    pattern = re.compile(R'(S)({\w+})?(\d+)\((\d+)x(\d+)\)(\d+),(\d+)')
    m = pattern.match(self.model_str, index)
    if m is None:
      return None, None
    name = self._GetLayerName(m.group(0), index, m.group(2))
    src_dim = int(m.group(3))
    part_a = int(m.group(4))
    part_b = int(m.group(5))
    dest_dim_a = int(m.group(6))
    dest_dim_b = int(m.group(7))
    if part_a == 0:
      part_a = -1
    if part_b == 0:
      part_b = -1
    prev_shape = tf.shape(prev_layer)
    layer = shapes.transposing_reshape(
        prev_layer, src_dim, part_a, part_b, dest_dim_a, dest_dim_b, name=name)
    # Compute scale factors.
    result_shape = tf.shape(layer)
    for i in xrange(len(self.reduction_factors)):
      if self.reduction_factors[i] is not None:
        factor1 = tf.cast(self.reduction_factors[i], tf.float32)
        factor2 = tf.cast(prev_shape[i], tf.float32)
        divisor = tf.cast(result_shape[i], tf.float32)
        self.reduction_factors[i] = tf.div(
            tf.multiply(factor1, factor2), divisor)
    return layer, m.end()
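
  # For example (hedged illustration, not part of the original file): per the
  # Build docstring, 'S3(3x50)2,3' splits a 150-deep input into 3x50, sending
  # the 3 to the width dimension, roughly [b, h, w, 150] -> [b, h, 3*w, 50].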

  def AddFCLayer(self, prev_layer, index):
    """Parse expression and add Fully Connected Layer.

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor, end index in model_str.
    """
    pattern = re.compile(R'(F)(s|t|r|l|m)({\w+})?(\d+)')
    m = pattern.match(self.model_str, index)
    if m is None:
      return None, None
    fn = self._NonLinearity(m.group(2))
    name = self._GetLayerName(m.group(0), index, m.group(3))
    depth = int(m.group(4))
    input_depth = shapes.tensor_dim(prev_layer, 1) * shapes.tensor_dim(
        prev_layer, 2) * shapes.tensor_dim(prev_layer, 3)
    # The slim fully connected is actually a 1x1 conv, so we have to crush the
    # dimensions on input.
    # Everything except batch goes to depth, and therefore has to be known.
    shaped = tf.reshape(
        prev_layer, [-1, input_depth], name=name + '_reshape_in')
    output = slim.fully_connected(shaped, depth, activation_fn=fn, scope=name)
    # Width and height are collapsed to 1.
    self.reduction_factors[1] = None
    self.reduction_factors[2] = None
    return tf.reshape(
        output, [shapes.tensor_dim(prev_layer, 0), 1, 1, depth],
        name=name + '_reshape_out'), m.end()
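
  # For example (hedged illustration, not part of the original file): 'Fr64'
  # flattens each [h, w, d] input (height and width must be constant) to a
  # vector and applies a fully-connected relu layer, giving output shape
  # [batch, 1, 1, 64].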

  def AddLSTMLayer(self, prev_layer, index):
    """Parse expression and add LSTM Layer.

    Args:
      prev_layer: Input tensor.
      index: Position in model_str to start parsing.

    Returns:
      Output tensor, end index in model_str.
    """
    pattern = re.compile(R'(L)(f|r|b)(x|y)(s)?({\w+})?(\d+)')
    m = pattern.match(self.model_str, index)
    if m is None:
      return None, None
    direction = m.group(2)
    dim = m.group(3)
    summarize = m.group(4) == 's'
    name = self._GetLayerName(m.group(0), index, m.group(5))
    depth = int(m.group(6))
    if direction == 'b' and summarize:
      fwd = self._LSTMLayer(prev_layer, 'forward', dim, True, depth,
                            name + '_forward')
      back = self._LSTMLayer(prev_layer, 'backward', dim, True, depth,
                             name + '_reverse')
      return tf.concat(
          axis=3, values=[fwd, back], name=name + '_concat'), m.end()
    if direction == 'f':
      direction = 'forward'
    elif direction == 'r':
      direction = 'backward'
    else:
      direction = 'bidirectional'
    outputs = self._LSTMLayer(prev_layer, direction, dim, summarize, depth,
                              name)
    if summarize:
      # The x or y dimension is getting collapsed.
      if dim == 'x':
        self.reduction_factors[2] = None
      else:
        self.reduction_factors[1] = None
    return outputs, m.end()
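
  # For example (hedged illustration, not part of the original file): 'Lbx128'
  # runs a bidirectional LSTM along x with 128 outputs per direction (256
  # output channels), while 'Lfys64' runs a forward y-LSTM and keeps only the
  # final step, collapsing the y dimension to 1.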

  def _LSTMLayer(self, prev_layer, direction, dim, summarize, depth, name):
    """Adds an LSTM layer with the given pre-parsed attributes.

    Always maps 4-D to 4-D regardless of summarize.

    Args:
      prev_layer: Input tensor.
      direction: 'forward', 'backward' or 'bidirectional'.
      dim: 'x' or 'y', dimension to consider as time.
      summarize: True if we are to return only the last timestep.
      depth: Output depth.
      name: Some string naming the op.

    Returns:
      Output tensor.
    """
    # If the target dimension is y, we need to transpose.
    if dim == 'x':
      lengths = self.GetLengths(2, 1)
      inputs = prev_layer
    else:
      lengths = self.GetLengths(1, 1)
      inputs = tf.transpose(prev_layer, [0, 2, 1, 3], name=name + '_ytrans_in')
    input_batch = shapes.tensor_dim(inputs, 0)
    num_slices = shapes.tensor_dim(inputs, 1)
    num_steps = shapes.tensor_dim(inputs, 2)
    input_depth = shapes.tensor_dim(inputs, 3)
    # Reshape away the other dimension.
    inputs = tf.reshape(
        inputs, [-1, num_steps, input_depth], name=name + '_reshape_in')
    # We need to replicate the lengths by the size of the other dimension, and
    # any changes that have been made to the batch dimension.
    tile_factor = tf.to_float(input_batch *
                              num_slices) / tf.to_float(tf.shape(lengths)[0])
    lengths = tf.tile(lengths, [tf.cast(tile_factor, tf.int32)])
    lengths = tf.cast(lengths, tf.int64)
    outputs = nn_ops.rnn_helper(
        inputs,
        lengths,
        cell_type='lstm',
        num_nodes=depth,
        direction=direction,
        name=name,
        stddev=0.1)
    # Output depth is doubled if bi-directional.
    if direction == 'bidirectional':
      output_depth = depth * 2
    else:
      output_depth = depth
    # Restore the other dimension.
    if summarize:
      outputs = tf.slice(
          outputs, [0, num_steps - 1, 0], [-1, 1, -1], name=name + '_sum_slice')
      outputs = tf.reshape(
          outputs, [input_batch, num_slices, 1, output_depth],
          name=name + '_reshape_out')
    else:
      outputs = tf.reshape(
          outputs, [input_batch, num_slices, num_steps, output_depth],
          name=name + '_reshape_out')
    if dim == 'y':
      outputs = tf.transpose(outputs, [0, 2, 1, 3], name=name + '_ytrans_out')
    return outputs

  def _NonLinearity(self, code):
    """Returns the non-linearity function pointer for the given string code.

    For forwards compatibility, allows the full names for stand-alone
    non-linearities, as well as the single-letter names used in ops like C, F.

    Args:
      code: String code representing a non-linearity function.

    Returns:
      Non-linearity function represented by the code, or None for linear.
    """
    if code in ['s', 'Sig']:
      return tf.sigmoid
    elif code in ['t', 'Tanh']:
      return tf.tanh
    elif code in ['r', 'Relu']:
      return tf.nn.relu
    elif code in ['m', 'Smax']:
      return tf.nn.softmax
    return None

  def _GetLayerName(self, op_str, index, name_str):
    """Generates a name for the op, using a user-supplied name if possible.

    Args:
      op_str: String representing the parsed op.
      index: Position in model_str of the start of the op.
      name_str: User-supplied {name} with {} that need removing, or None.

    Returns:
      Selected name.
    """
    if name_str:
      return name_str[1:-1]
    else:
      return op_str.translate(self.transtab) + '_' + str(index)
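
  # For example (hedged illustration, not part of the original file): with no
  # user-supplied {name}, the op string 'Mp2,2' starting at index 10 becomes
  # the scope name 'Mp2_2_10' via the '(,)' -> '_' translation table.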

  def _SkipWhitespace(self, index):
    """Skips any leading whitespace in the model description.

    Args:
      index: Position in model_str to start parsing.

    Returns:
      End index in model_str of the whitespace.
    """
    pattern = re.compile(R'([ \t\n]+)')
    m = pattern.match(self.model_str, index)
    if m is None:
      return index
    return m.end()