text2bin.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. #!/usr/bin/env python
  2. #
  3. # Copyright 2016 Google Inc. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """Converts vectors from text to a binary format for quicker manipulation.
  17. Usage:
  18. text2bin.py -o <out> -v <vocab> vec1.txt [vec2.txt ...]
  19. Options:
  20. -o <filename>, --output <filename>
  21. The name of the file into which the binary vectors are written.
  22. -v <filename>, --vocab <filename>
  23. The name of the file into which the vocabulary is written.
  24. Description
  25. This program merges one or more whitespace separated vector files into a single
  26. binary vector file that can be used by downstream evaluation tools in this
  27. directory ("wordsim.py" and "analogy").
  28. If more than one vector file is specified, then the files must be aligned
  29. row-wise (i.e., each line must correspond to the same embedding), and they must
  30. have the same number of columns (i.e., be the same dimension).
  31. """
  32. from itertools import izip
  33. from getopt import GetoptError, getopt
  34. import os
  35. import struct
  36. import sys
  37. try:
  38. opts, args = getopt(
  39. sys.argv[1:], 'o:v:', ['output=', 'vocab='])
  40. except GetoptError, e:
  41. print >> sys.stderr, e
  42. sys.exit(2)
  43. opt_output = 'vecs.bin'
  44. opt_vocab = 'vocab.txt'
  45. for o, a in opts:
  46. if o in ('-o', '--output'):
  47. opt_output = a
  48. if o in ('-v', '--vocab'):
  49. opt_vocab = a
  50. def go(fhs):
  51. fmt = None
  52. with open(opt_vocab, 'w') as vocab_out:
  53. with open(opt_output, 'w') as vecs_out:
  54. for lines in izip(*fhs):
  55. parts = [line.split() for line in lines]
  56. token = parts[0][0]
  57. if any(part[0] != token for part in parts[1:]):
  58. raise IOError('vector files must be aligned')
  59. print >> vocab_out, token
  60. vec = [sum(float(x) for x in xs) for xs in zip(*parts)[1:]]
  61. if not fmt:
  62. fmt = struct.Struct('%df' % len(vec))
  63. vecs_out.write(fmt.pack(*vec))
  64. if args:
  65. fhs = [open(filename) for filename in args]
  66. go(fhs)
  67. for fh in fhs:
  68. fh.close()
  69. else:
  70. go([sys.stdin])