/* -*- Mode: C++ -*- */ /* * Copyright 2016 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * Computes embedding performance on analogy tasks. Accepts as input one or * more files containing four words per line (A B C D), and determines if: * * vec(C) - vec(A) + vec(B) ~= vec(D) * * Cosine distance in the embedding space is used to retrieve neighbors. Any * missing vocabulary items are scored as losses. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include static const char usage[] = R"( Performs analogy testing of embedding vectors. Usage: analogy --embeddings --vocab eval1.tab ... Options: --embeddings The file containing the binary embedding vectors to evaluate. --vocab The vocabulary file corresponding to the embedding vectors. --nthreads The number of evaluation threads to run (default: 8) )"; // Reads the vocabulary file into a map from token to vector index. static std::unordered_map ReadVocab( const std::string& vocab_filename) { std::unordered_map vocab; std::ifstream fin(vocab_filename); int index = 0; for (std::string token; std::getline(fin, token); ++index) { auto n = token.find('\t'); if (n != std::string::npos) token = token.substr(n); vocab[token] = index; } return vocab; } // An analogy query: "A is to B as C is to D". typedef std::tuple AnalogyQuery; std::vector ReadQueries( const std::string &filename, const std::unordered_map &vocab, int *total) { std::ifstream fin(filename); std::vector queries; int lineno = 0; while (1) { // Read the four words. std::string words[4]; int nread = 0; for (int i = 0; i < 4; ++i) { fin >> words[i]; if (!words[i].empty()) ++nread; } ++lineno; if (nread == 0) break; if (nread < 4) { std::cerr << "expected four words at line " << lineno << std::endl; break; } // Look up each word's index. int ixs[4], nvalid; for (nvalid = 0; nvalid < 4; ++nvalid) { std::unordered_map::const_iterator it = vocab.find(words[nvalid]); if (it == vocab.end()) break; ixs[nvalid] = it->second; } // If we don't have all the words, count it as a loss. if (nvalid >= 4) queries.push_back(std::make_tuple(ixs[0], ixs[1], ixs[2], ixs[3])); } *total = lineno; return queries; } // A thread that evaluates some fraction of the analogies. class AnalogyEvaluator { public: // Creates a new Analogy evaluator for a range of analogy queries. AnalogyEvaluator(std::vector::const_iterator begin, std::vector::const_iterator end, const float *embeddings, const int num_embeddings, const int dim) : begin_(begin), end_(end), embeddings_(embeddings), num_embeddings_(num_embeddings), dim_(dim) {} // A thunk for pthreads. static void* Run(void *param) { AnalogyEvaluator *self = static_cast(param); self->Evaluate(); return nullptr; } // Evaluates the analogies. void Evaluate(); // Returns the number of correct analogies after evaluation is complete. int GetNumCorrect() const { return correct_; } protected: // The beginning of the range of queries to consider. std::vector::const_iterator begin_; // The end of the range of queries to consider. std::vector::const_iterator end_; // The raw embedding vectors. const float *embeddings_; // The number of embedding vectors. const int num_embeddings_; // The embedding vector dimensionality. const int dim_; // The number of correct analogies. int correct_; }; void AnalogyEvaluator::Evaluate() { float* sum = new float[dim_]; correct_ = 0; for (auto query = begin_; query < end_; ++query) { const float* vec; int a, b, c, d; std::tie(a, b, c, d) = *query; // Compute C - A + B. vec = embeddings_ + dim_ * c; for (int i = 0; i < dim_; ++i) sum[i] = vec[i]; vec = embeddings_ + dim_ * a; for (int i = 0; i < dim_; ++i) sum[i] -= vec[i]; vec = embeddings_ + dim_ * b; for (int i = 0; i < dim_; ++i) sum[i] += vec[i]; // Find the nearest neighbor that isn't one of the query words. int best_ix = -1; float best_dot = -1.0; for (int i = 0; i < num_embeddings_; ++i) { if (i == a || i == b || i == c) continue; vec = embeddings_ + dim_ * i; float dot = 0; for (int j = 0; j < dim_; ++j) dot += vec[j] * sum[j]; if (dot > best_dot) { best_ix = i; best_dot = dot; } } // The fourth word is the answer; did we get it right? if (best_ix == d) ++correct_; } delete[] sum; } int main(int argc, char *argv[]) { if (argc <= 1) { printf(usage); return 2; } std::string embeddings_filename, vocab_filename; int nthreads = 8; std::vector input_filenames; std::vector> queries; for (int i = 1; i < argc; ++i) { std::string arg = argv[i]; if (arg == "--embeddings") { if (++i >= argc) goto argmissing; embeddings_filename = argv[i]; } else if (arg == "--vocab") { if (++i >= argc) goto argmissing; vocab_filename = argv[i]; } else if (arg == "--nthreads") { if (++i >= argc) goto argmissing; if ((nthreads = atoi(argv[i])) <= 0) goto badarg; } else if (arg == "--help") { std::cout << usage << std::endl; return 0; } else if (arg[0] == '-') { std::cerr << "unknown option: '" << arg << "'" << std::endl; return 2; } else { input_filenames.push_back(arg); } continue; argmissing: std::cerr << "missing value for '" << argv[i - 1] << "' (--help for help)" << std::endl; return 2; badarg: std::cerr << "invalid value '" << argv[i] << "' for '" << argv[i - 1] << "' (--help for help)" << std::endl; return 2; } // Read the vocabulary. std::unordered_map vocab = ReadVocab(vocab_filename); if (!vocab.size()) { std::cerr << "unable to read vocabulary file '" << vocab_filename << "'" << std::endl; return 1; } const int n = vocab.size(); // Read the vectors. int fd; if ((fd = open(embeddings_filename.c_str(), O_RDONLY)) < 0) { std::cerr << "unable to open embeddings file '" << embeddings_filename << "'" << std::endl; return 1; } off_t nbytes = lseek(fd, 0, SEEK_END); if (nbytes == -1) { std::cerr << "unable to determine file size for '" << embeddings_filename << "'" << std::endl; return 1; } if (nbytes % (sizeof(float) * n) != 0) { std::cerr << "'" << embeddings_filename << "' has a strange file size; expected it to be " "a multiple of the vocabulary size" << std::endl; return 1; } const int dim = nbytes / (sizeof(float) * n); float *embeddings = static_cast(malloc(nbytes)); lseek(fd, 0, SEEK_SET); if (read(fd, embeddings, nbytes) < nbytes) { std::cerr << "unable to read embeddings from " << embeddings_filename << std::endl; return 1; } close(fd); /* Normalize the vectors. */ for (int i = 0; i < n; ++i) { float *vec = embeddings + dim * i; float norm = 0; for (int j = 0; j < dim; ++j) norm += vec[j] * vec[j]; norm = sqrt(norm); for (int j = 0; j < dim; ++j) vec[j] /= norm; } pthread_attr_t attr; if (pthread_attr_init(&attr) != 0) { std::cerr << "unable to initalize pthreads" << std::endl; return 1; } /* Read each input file. */ for (const auto filename : input_filenames) { int total = 0; std::vector queries = ReadQueries(filename.c_str(), vocab, &total); const int queries_per_thread = queries.size() / nthreads; std::vector evaluators; std::vector threads; for (int i = 0; i < nthreads; ++i) { auto begin = queries.begin() + i * queries_per_thread; auto end = (i + 1 < nthreads) ? queries.begin() + (i + 1) * queries_per_thread : queries.end(); AnalogyEvaluator *evaluator = new AnalogyEvaluator(begin, end, embeddings, n, dim); pthread_t thread; pthread_create(&thread, &attr, AnalogyEvaluator::Run, evaluator); evaluators.push_back(evaluator); threads.push_back(thread); } for (auto &thread : threads) pthread_join(thread, 0); int correct = 0; for (const AnalogyEvaluator* evaluator : evaluators) { correct += evaluator->GetNumCorrect(); delete evaluator; } printf("%0.3f %s\n", static_cast(correct) / total, filename.c_str()); } return 0; }