eval.mk 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. # -*- Mode: Makefile -*-
  2. #
  3. # Copyright 2016 Google Inc. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. # This makefile pulls down the evaluation datasets and formats them uniformly.
  17. # Word similarity evaluations are formatted to contain exactly three columns:
  18. # the two words being compared and the human judgement.
  19. #
  20. # Use wordsim.py and analogy to run the actual evaluations.
  # Compiler and linker settings used when building the `analogy` binary.
  # LDLIBS:   libraries appended to the link line (pthreads and libm).
  # CXXFLAGS: C++11, 64-bit code with AVX, debug info, -Ofast, all warnings.
  LDLIBS   = -lpthread -lm
  CXXFLAGS = -std=c++11 -m64 -mavx -g -Ofast -Wall
  # Word-similarity evaluation files produced by this makefile (*.ws.tab).
  # The order is the order in which `all` lists them as prerequisites.
  WORDSIM_EVALS  = ws353sim.ws.tab ws353rel.ws.tab men.ws.tab
  WORDSIM_EVALS += mturk.ws.tab rarewords.ws.tab simlex999.ws.tab
  # Analogy evaluation files produced by this makefile (*.an.tab).
  ANALOGY_EVALS = mikolov.an.tab msr.an.tab
  # Build every formatted evaluation file plus the analogy tool.
  all: $(WORDSIM_EVALS) $(ANALOGY_EVALS) analogy
  # `all` names a command, not a file: without this declaration a stray file
  # called `all` in this directory would make the goal look up to date.
  .PHONY: all
  # WordSim-353 similarity and relatedness files. Both are members of the same
  # archive; each is streamed to stdout (-O) and stored without conversion.
  # The target-specific variable selects which archive member a target gets.
  ws353sim.ws.tab: ws353_member = wordsim_similarity_goldstandard.txt
  ws353rel.ws.tab: ws353_member = wordsim_relatedness_goldstandard.txt
  ws353sim.ws.tab ws353rel.ws.tab: ws353simrel.tar.gz
	tar -xzOf $< wordsim353_sim_rel/$(ws353_member) > $@
  # MEN: stream the natural-form dataset out of the archive and turn every
  # space into a tab.
  men.ws.tab: MEN.tar.gz
	tar -xzOf $< MEN/MEN_dataset_natural_form_full \
	  | tr ' ' '\t' > $@
  # MTurk: strip carriage returns from the CSV, then turn commas into tabs.
  # "Mtruk.csv" is the name of the downloaded file, so the spelling is kept.
  mturk.ws.tab: Mtruk.csv
	tr -d '\r' < $< | tr ',' '\t' > $@
  # Rare Words: print rw/rw.txt from the zip and keep its first three
  # tab-separated columns.
  # No -d option is passed to cut: TAB is already its default delimiter, and
  # the previous bash-only $'\t' quoting is not understood by every /bin/sh
  # (make's default shell), where it yields an invalid multi-character
  # delimiter and the recipe fails.
  rarewords.ws.tab: rw.zip
	unzip -p $< rw/rw.txt | cut -f1-3 > $@
  # SimLex-999: print the dataset from the zip, skip its first line
  # (presumably the column header -- confirm against the dataset) and keep
  # tab-separated columns 1, 2 and 4.
  # No -d option is passed to cut: TAB is already its default delimiter, and
  # the previous bash-only $'\t' quoting is not understood by every /bin/sh
  # (make's default shell).
  simlex999.ws.tab: SimLex-999.zip
	unzip -p $< SimLex-999/SimLex-999.txt \
	  | tail -n +2 \
	  | cut -f1,2,4 > $@
  # Mikolov analogies: drop the lines that start with ':', lowercase A-Z and
  # turn spaces into tabs.
  # Plain `grep` replaces `egrep -E`: egrep is obsolescent (recent GNU grep
  # warns on every call) and '^:' needs no extended-regex syntax.
  # The brackets in the tr sets map onto themselves; they are kept so the
  # ranges also work with tr implementations that require bracketed ranges.
  mikolov.an.tab: questions-words.txt
	grep -v '^:' $< | tr '[A-Z] ' '[a-z]\t' > $@
  # MSR analogies: the questions (spaces turned into tabs) and the second
  # space-separated field of the answers file are pasted side by side.
  # Scratch files are named after the target and live next to it instead of
  # the fixed /tmp/q and /tmp/a, so concurrent builds (or other users on the
  # same machine) cannot overwrite each other's intermediate data.
  msr.an.tab: myz_naacl13_test_set.tgz
	tar -xzOf $< test_set/word_relationship.questions | tr ' ' '\t' > $@.q.tmp
	tar -xzOf $< test_set/word_relationship.answers | cut -f2 -d ' ' > $@.a.tmp
	paste $@.q.tmp $@.a.tmp > $@
	rm -f $@.q.tmp $@.a.tmp
  # Rules that fetch the raw datasets with wget. Please see the original
  # datasets for appropriate references if you use these.
  #
  # Each download goes to "<target>.part" and is renamed only after wget
  # succeeds. These rules have no prerequisites, so a truncated file left
  # under the final name by a failed transfer would otherwise be treated as
  # complete on the next run.
  # NOTE(review): all URLs are plain http and some hosts may have moved;
  # confirm they still resolve before relying on these rules.
  ws353simrel.tar.gz:
	wget -O $@.part http://alfonseca.org/pubs/ws353simrel.tar.gz
	mv -f $@.part $@
  MEN.tar.gz:
	wget -O $@.part http://clic.cimec.unitn.it/~elia.bruni/resources/MEN.tar.gz
	mv -f $@.part $@
  Mtruk.csv:
	wget -O $@.part http://tx.technion.ac.il/~kirar/files/Mtruk.csv
	mv -f $@.part $@
  rw.zip:
	wget -O $@.part http://www-nlp.stanford.edu/~lmthang/morphoNLM/rw.zip
	mv -f $@.part $@
  SimLex-999.zip:
	wget -O $@.part http://www.cl.cam.ac.uk/~fh295/SimLex-999.zip
	mv -f $@.part $@
  questions-words.txt:
	wget -O $@.part http://word2vec.googlecode.com/svn/trunk/questions-words.txt
	mv -f $@.part $@
  myz_naacl13_test_set.tgz:
	wget -O $@.part http://research.microsoft.com/en-us/um/people/gzweig/Pubs/myz_naacl13_test_set.tgz
	mv -f $@.part $@
  # Build the analogy tool from its single C++ source. The recipe spells out
  # the command of make's built-in "%: %.cc" rule, so CXXFLAGS and LDLIBS
  # are applied exactly as before.
  analogy: analogy.cc
	$(LINK.cc) $^ $(LOADLIBES) $(LDLIBS) -o $@
  # clean: remove the generated evaluation tables, the analogy binary and
  # Python bytecode.
  clean:
	rm -f *.ws.tab *.an.tab analogy *.pyc
  # distclean: clean, then also remove the downloaded datasets. The globs
  # match every archive in this directory, not only the ones fetched here.
  distclean: clean
	rm -f *.tgz *.tar.gz *.zip Mtruk.csv questions-words.txt
  # Neither target names a file; without this declaration a file called
  # `clean` or `distclean` would silently turn the command into a no-op.
  .PHONY: clean distclean