12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- #!/usr/bin/env python
- #
- # Copyright 2016 Google Inc. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Computes Spearman's rho with respect to human judgements.
- Given a set of row (and potentially column) embeddings, this computes Spearman's
- rho between the rank ordering of predicted word similarity and human judgements.
- Usage:
- wordim.py --embeddings=<binvecs> --vocab=<vocab> eval1.tab eval2.tab ...
- Options:
- --embeddings=<filename>: the vectors to test
- --vocab=<filename>: the vocabulary file
- Evaluation files are assumed to be tab-separated files with exactly three
- columns. The first two columns contain the words, and the third column contains
- the scored human judgement.
- """
- import scipy.stats
- import sys
- from getopt import GetoptError, getopt
- from vecs import Vecs
- try:
- opts, args = getopt(sys.argv[1:], '', ['embeddings=', 'vocab='])
- except GetoptError, e:
- print >> sys.stderr, e
- sys.exit(2)
- opt_embeddings = None
- opt_vocab = None
- for o, a in opts:
- if o == '--embeddings':
- opt_embeddings = a
- if o == '--vocab':
- opt_vocab = a
- if not opt_vocab:
- print >> sys.stderr, 'please specify a vocabulary file with "--vocab"'
- sys.exit(2)
- if not opt_embeddings:
- print >> sys.stderr, 'please specify the embeddings with "--embeddings"'
- sys.exit(2)
- try:
- vecs = Vecs(opt_vocab, opt_embeddings)
- except IOError, e:
- print >> sys.stderr, e
- sys.exit(1)
- def evaluate(lines):
- acts, preds = [], []
- with open(filename, 'r') as lines:
- for line in lines:
- w1, w2, act = line.strip().split('\t')
- pred = vecs.similarity(w1, w2)
- if pred is None:
- continue
- acts.append(float(act))
- preds.append(pred)
- rho, _ = scipy.stats.spearmanr(acts, preds)
- return rho
- for filename in args:
- with open(filename, 'r') as lines:
- print '%0.3f %s' % (evaluate(lines), filename)
|