# -*- Mode: Makefile -*-
#
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This makefile pulls down the evaluation datasets and formats them uniformly.
# Word similarity evaluations are formatted to contain exactly three columns:
# the two words being compared and the human judgement.
#
# Use wordsim.py and the analogy program (built from analogy.cc by this
# makefile) to run the actual evaluations.

# Delete a target whose recipe exits with an error.  The data recipes below
# redirect their output into $@, so without this a failed extraction would
# leave a truncated file behind that later runs treat as up to date.
.DELETE_ON_ERROR:

# Compiler flags and link libraries used when building the `analogy` program
# from analogy.cc: C++11, 64-bit, AVX enabled, debug info, aggressive
# optimisation, all warnings; linked against pthread and libm.
CXXFLAGS=-std=c++11 -m64 -mavx -g -Ofast -Wall
LDLIBS=-lpthread -lm

# Formatted word-similarity evaluation files, one per dataset.  Each name
# below gets the common `.ws.tab` suffix and has its own rule further down.
WORDSIM_EVALS := $(addsuffix .ws.tab, \
    ws353sim ws353rel men mturk rarewords simlex999)

# Formatted analogy evaluation files; each name gets the `.an.tab` suffix.
ANALOGY_EVALS := $(addsuffix .an.tab, mikolov msr)

# Default goal: build every formatted evaluation file and the analogy program.
# Declared phony so that a stray file named `all` cannot mask the goal.
.PHONY: all
all: $(WORDSIM_EVALS) $(ANALOGY_EVALS) analogy

# WordSim-353 similarity gold standard: the archive member is written to
# stdout (-O) and stored as-is.
ws353sim.ws.tab: ws353simrel.tar.gz
	tar -x -z -O -f $< wordsim353_sim_rel/wordsim_similarity_goldstandard.txt > $@

# WordSim-353 relatedness gold standard: extracted from the same archive as
# the similarity set and stored as-is.
ws353rel.ws.tab: ws353simrel.tar.gz
	tar -x -z -O -f $< wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt > $@

# MEN dataset: stream the natural-form file out of the archive and turn its
# space separators into tabs.
men.ws.tab: MEN.tar.gz
	tar -x -z -O -f $< MEN/MEN_dataset_natural_form_full | tr ' ' '\t' > $@

# MTurk dataset: strip carriage returns from the CSV, then convert the comma
# separators to tabs.
mturk.ws.tab: Mtruk.csv
	tr -d '\r' < $< | tr ',' '\t' > $@

# Rare Words dataset: keep only the first three tab-separated columns of
# rw/rw.txt.  TAB is cut's default delimiter, so no -d option is needed; the
# former `-d $$'\t'` relied on bash-only quoting and broke when make ran the
# recipe with a plain POSIX /bin/sh.
rarewords.ws.tab: rw.zip
	unzip -p $< rw/rw.txt | cut -f1-3 > $@

# SimLex-999: skip the first line of the file (presumably a header row --
# verify against the dataset) and keep tab-separated columns 1, 2 and 4.
# TAB is cut's default delimiter, so no -d option is needed; the former
# `-d $$'\t'` relied on bash-only quoting and broke under a POSIX /bin/sh.
simlex999.ws.tab: SimLex-999.zip
	unzip -p $< SimLex-999/SimLex-999.txt \
	  | tail -n +2 | cut -f1,2,4 > $@

# Mikolov analogy questions: drop every line that starts with ':', then map
# A-Z to lowercase and spaces to tabs.  Plain `grep` replaces the obsolescent
# `egrep` (the pattern needs no extended syntax).
mikolov.an.tab: questions-words.txt
	grep -v '^:' $< | tr '[A-Z] ' '[a-z]\t' > $@

# MSR analogy set: the questions (spaces turned into tabs) and the second
# space-separated field of the answers file are pasted side by side.
# The two intermediate files live in a private mktemp directory instead of the
# fixed names /tmp/q and /tmp/a, so concurrent runs or other users cannot
# collide with them; the trap removes the directory whether or not the recipe
# succeeds.  Everything runs in one shell so the trap and $$tmp stay in scope.
msr.an.tab: myz_naacl13_test_set.tgz
	tmp=$$(mktemp -d) && trap 'rm -rf "$$tmp"' EXIT && \
	tar Oxfz $< test_set/word_relationship.questions | tr ' ' '\t' > "$$tmp/q" && \
	tar Oxfz $< test_set/word_relationship.answers | cut -f2 -d ' ' > "$$tmp/a" && \
	paste "$$tmp/q" "$$tmp/a" > $@

# wget commands to fetch the datasets. Please see the original datasets for
# appropriate references if you use these.
# NOTE(review): these URLs are reproduced unchanged; confirm that each host
# still serves its file.
#
# fetch: download URL $(1) into the current target.  The data is written to
# $@.part and only renamed to $@ once wget succeeds, so an interrupted or
# failed transfer never leaves a partial file under the final name (these
# targets have no prerequisites, so such a file would never be re-fetched).
define fetch
wget -O $@.part '$(1)' && mv -f $@.part $@ || { rm -f $@.part; exit 1; }
endef

ws353simrel.tar.gz:
	$(call fetch,http://alfonseca.org/pubs/ws353simrel.tar.gz)

MEN.tar.gz:
	$(call fetch,http://clic.cimec.unitn.it/~elia.bruni/resources/MEN.tar.gz)

Mtruk.csv:
	$(call fetch,http://tx.technion.ac.il/~kirar/files/Mtruk.csv)

rw.zip:
	$(call fetch,http://www-nlp.stanford.edu/~lmthang/morphoNLM/rw.zip)

SimLex-999.zip:
	$(call fetch,http://www.cl.cam.ac.uk/~fh295/SimLex-999.zip)

questions-words.txt:
	$(call fetch,http://word2vec.googlecode.com/svn/trunk/questions-words.txt)

myz_naacl13_test_set.tgz:
	$(call fetch,http://research.microsoft.com/en-us/um/people/gzweig/Pubs/myz_naacl13_test_set.tgz)

# Build the analogy program from its single C++ source.  The recipe spells out
# the command that make's built-in "%: %.cc" rule runs, so CXXFLAGS and LDLIBS
# are applied exactly as before.
analogy: analogy.cc
	$(LINK.cc) $^ $(LOADLIBES) $(LDLIBS) -o $@

# Remove the generated evaluation files, the analogy binary and Python
# bytecode.  Declared phony so a file named `clean` cannot disable the target.
.PHONY: clean
clean:
	rm -f *.ws.tab *.an.tab analogy *.pyc

# Everything `clean` removes, plus the downloaded archives and data files.
# Declared phony so a file named `distclean` cannot disable the target.
.PHONY: distclean
distclean: clean
	rm -f *.tgz *.tar.gz *.zip Mtruk.csv questions-words.txt