# evaluate_lm.py
  1. import fire
  2. from lm_eval.base import LM
  3. from lm_eval import tasks, evaluator
  4. from transformers import AutoModelForCausalLM, AutoTokenizer
  5. import torch
  6. class HuggingFaceModel(LM):
  7. def __init__(self, model_name, tokenizer_name):
  8. self.model = AutoModelForCausalLM.from_pretrained(model_name)
  9. self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
  10. def loglikelihood(self, ctx, cont):
  11. # Encode context and continuation
  12. input_ids = self.tokenizer.encode(ctx, add_special_tokens=False)
  13. cont_ids = self.tokenizer.encode(cont, add_special_tokens=False)
  14. # Concatenate context and continuation
  15. input_ids += cont_ids
  16. # Calculate log likelihood
  17. with torch.no_grad():
  18. outputs = self.model(input_ids=torch.tensor([input_ids]), labels=torch.tensor([input_ids]))
  19. log_likelihood = -outputs.loss.item() * len(cont_ids)
  20. return log_likelihood, len(cont_ids)
  21. @property
  22. def eot_token_id(self):
  23. return self._tokenizer.eos_id()
  24. @property
  25. def max_length(self):
  26. return self._max_seq_length
  27. @property
  28. def max_gen_toks(self):
  29. return 50
  30. @property
  31. def batch_size(self):
  32. return 1
  33. @property
  34. def device(self):
  35. return self._device
  36. def tok_encode(self, string: str):
  37. encoded = encode_tokens(self._tokenizer,
  38. string, bos=True, eos=False, device=self._device)
  39. # encoded is a pytorch tensor, but some internal logic in the
  40. # eval harness expects it to be a list instead
  41. # TODO: verify this for multi-batch as well
  42. encoded = encoded.tolist()
  43. return encoded
  44. def tok_decode(self, tokens):
  45. decoded = self._tokenizer.decode(tokens)
  46. return decoded
  47. def _model_call(self, inputss):
  48. max_new_tokens = 1
  49. logits = model(inputs)
  50. return logits
  51. def _model_generate(self, context, max_length, eos_token_id):
  52. raise Exception('unimplemented')
  53. # Implement other required methods if needed
  54. def evaluate_model(model_name, tokenizer_name, task_list):
  55. # Instantiate the model
  56. model = HuggingFaceModel(model_name, tokenizer_name)
  57. # Convert task_list string to list
  58. task_list = task_list.split(',')
  59. # Evaluate
  60. results = evaluator.evaluate(lm=model, tasks=tasks.get_task_dict(task_list), provide_description=True)
  61. print(results)
if __name__ == "__main__":
    # python-fire turns evaluate_model's parameters into CLI flags, e.g.:
    #   python evaluate_lm.py --model_name=gpt2 --tokenizer_name=gpt2 --task_list=hellaswag
    fire.Fire(evaluate_model)