1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889 |
- #!/usr/bin/env python
- #
- # Copyright 2016 Google Inc. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Converts vectors from text to a binary format for quicker manipulation.
- Usage:
- text2bin.py -o <out> -v <vocab> vec1.txt [vec2.txt ...]
- Options:
- -o <filename>, --output <filename>
- The name of the file into which the binary vectors are written.
- -v <filename>, --vocab <filename>
- The name of the file into which the vocabulary is written.
- Description:
- This program merges one or more whitespace separated vector files into a single
- binary vector file that can be used by downstream evaluation tools in this
- directory ("wordsim.py" and "analogy").
- If more than one vector file is specified, then the files must be aligned
- row-wise (i.e., each line must correspond to the same embedding), and they must
- have the same number of columns (i.e., be the same dimension). The vectors
- found on corresponding rows are summed element-wise to produce the output.
- """
from getopt import GetoptError, getopt
import os
import struct
import sys

try:
    from itertools import izip  # Python 2: lazy, streaming zip
except ImportError:
    izip = zip  # Python 3: the builtin zip is already lazy
# Parse the command line.  Everything that is not a recognised option is left
# in `args` and is later treated as the list of input vector files.
try:
    opts, args = getopt(sys.argv[1:], 'o:v:', ['output=', 'vocab='])
except GetoptError as e:
    # `except ... as` and an explicit write are valid on Python 2.6+ and on
    # Python 3; the older `except X, e` / `print >>` forms are Python-2-only.
    sys.stderr.write('%s\n' % e)
    sys.exit(2)

# Output locations used when the corresponding flag is not given.
opt_output = 'vecs.bin'
opt_vocab = 'vocab.txt'

for o, a in opts:
    if o in ('-o', '--output'):
        opt_output = a
    elif o in ('-v', '--vocab'):
        opt_vocab = a
def go(fhs, vocab_path=None, output_path=None):
    """Merge row-aligned text vector files into a vocab file and a binary file.

    The inputs are read in lockstep, one line from each per step.  Every line
    is expected to be a token followed by whitespace-separated numbers.  For
    each row the token is written to the vocabulary file (one per line) and
    the element-wise sum of that row's vectors across all inputs is appended
    to the output file, packed with the `struct` format 'f'.

    Args:
      fhs: list of iterables yielding text lines (open files, sys.stdin, ...).
      vocab_path: where to write the vocabulary; defaults to the module-level
        `opt_vocab` when None.
      output_path: where to write the packed vectors; defaults to the
        module-level `opt_output` when None.

    Raises:
      IOError: if the tokens on a row differ between inputs, if a row is blank
        in only some of the inputs, or if the inputs disagree on the number of
        columns in a row.
      ValueError: if a vector component cannot be parsed as a float.
      struct.error: if a later row has a different dimension than the first.
    """
    if vocab_path is None:
        vocab_path = opt_vocab
    if output_path is None:
        output_path = opt_output

    fmt = None
    with open(vocab_path, 'w') as vocab_out:
        # The packed floats are raw bytes, so the file must be opened in
        # binary mode: text mode corrupts the data on platforms that translate
        # newlines and rejects bytes outright on Python 3.
        with open(output_path, 'wb') as vecs_out:
            for lines in izip(*fhs):
                parts = [line.split() for line in lines]

                if not any(parts):
                    # Blank in every input (e.g. a trailing empty line).
                    continue
                if not all(parts):
                    raise IOError('vector files must be aligned')

                token = parts[0][0]
                if any(part[0] != token for part in parts[1:]):
                    raise IOError('vector files must be aligned')

                # zip() would silently truncate to the shortest row, hiding a
                # dimension mismatch between the inputs.
                width = len(parts[0])
                if any(len(part) != width for part in parts[1:]):
                    raise IOError(
                        'vector files must have the same number of columns')

                vocab_out.write(token + '\n')

                # Drop the token column, then sum each component across files.
                columns = zip(*(part[1:] for part in parts))
                vec = [sum(float(x) for x in xs) for xs in columns]

                if fmt is None:
                    # The first row fixes the dimension for the whole file.
                    fmt = struct.Struct('%df' % len(vec))

                vecs_out.write(fmt.pack(*vec))
# Script entry: convert the files named on the command line, or fall back to
# reading a single vector file from standard input when none are given.
if args:
    fhs = []
    try:
        # Open inside the try so that files already opened are still closed
        # if a later open() fails or the conversion raises.
        for filename in args:
            fhs.append(open(filename))
        go(fhs)
    finally:
        for fh in fhs:
            fh.close()
else:
    go([sys.stdin])
|