# test_train_utils.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import os
import shutil
from unittest.mock import patch

import pytest
import torch

from llama_recipes.utils.train_utils import train

TEMP_OUTPUT_DIR = os.getcwd() + "/tmp"


@pytest.fixture(scope="session")
def temp_output_dir():
    # Create the directory during the session-level setup
    temp_output_dir = "tmp"
    os.mkdir(os.path.join(os.getcwd(), temp_output_dir))
    yield temp_output_dir
    # Delete the directory during the session-level teardown
    shutil.rmtree(temp_output_dir)


@pytest.fixture(scope="session")
def path_to_generate():
    # A nested path that does not exist yet; train() is expected to create it
    path_to_generate = "some/random/path"
    yield path_to_generate
    # Remove the whole generated tree, not just the leaf directory,
    # so no empty parent directories are left behind
    shutil.rmtree(path_to_generate.split("/")[0])
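

# A sketch (not part of the original file) of an alternative fixture that
# leans on pytest's built-in tmp_path_factory, which creates and cleans up
# temporary directories on its own; the fixture name "temp_output_dir_builtin"
# is hypothetical and introduced here only for illustration.
@pytest.fixture(scope="session")
def temp_output_dir_builtin(tmp_path_factory):
    # mktemp returns a fresh pathlib.Path under pytest's base temp directory
    return str(tmp_path_factory.mktemp("train_utils_output"))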


@patch("llama_recipes.utils.train_utils.MemoryTrace")
@patch("llama_recipes.utils.train_utils.nullcontext")
@patch("llama_recipes.utils.train_utils.torch.cuda.amp.GradScaler")
@patch("llama_recipes.utils.train_utils.torch.cuda.amp.autocast")
def test_gradient_accumulation(autocast, scaler, nullcontext, mem_trace, mocker):
    model = mocker.MagicMock(name="model")
    model().loss.__truediv__().detach.return_value = torch.tensor(1)

    mock_tensor = mocker.MagicMock(name="tensor")
    batch = {"input": mock_tensor}
    train_dataloader = [batch, batch, batch, batch, batch]
    eval_dataloader = None

    tokenizer = mocker.MagicMock()
    optimizer = mocker.MagicMock()
    lr_scheduler = mocker.MagicMock()
    gradient_accumulation_steps = 1

    train_config = mocker.MagicMock()
    train_config.enable_fsdp = False
    train_config.use_fp16 = False
    train_config.run_validation = False
    train_config.gradient_clipping = False
    train_config.save_metrics = False

    train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
    )

    # Without accumulation every batch triggers an optimizer update, and with
    # fp16 disabled the nullcontext stand-in is entered once per batch
    assert optimizer.zero_grad.call_count == 5
    optimizer.zero_grad.reset_mock()

    assert nullcontext.call_count == 5
    nullcontext.reset_mock()

    assert autocast.call_count == 0

    gradient_accumulation_steps = 2
    train_config.use_fp16 = True

    train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
    )

    # 5 batches with accumulation of 2 -> updates after batches 2, 4 and the
    # final partial window, i.e. 3 optimizer updates; fp16 now uses autocast
    assert optimizer.zero_grad.call_count == 3
    assert nullcontext.call_count == 0
    assert autocast.call_count == 5
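

# Sanity sketch (an addition, not part of the original tests): the update
# counts asserted above follow ceil(num_batches / accumulation_steps), since
# train() also steps on the trailing partial accumulation window. The helper
# and test below are hypothetical, added only to document that arithmetic:
# 5 batches with steps=1 -> 5 updates, with steps=2 -> 3 updates.
def _expected_optimizer_steps(num_batches: int, accumulation_steps: int) -> int:
    return -(-num_batches // accumulation_steps)  # ceiling division


def test_expected_optimizer_steps_helper():
    assert _expected_optimizer_steps(5, 1) == 5
    assert _expected_optimizer_steps(5, 2) == 3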


def test_save_to_json_file_exists(temp_output_dir, mocker):
    model = mocker.MagicMock(name="model")
    model().loss.__truediv__().detach.return_value = torch.tensor(1)

    mock_tensor = mocker.MagicMock(name="tensor")
    batch = {"input": mock_tensor}
    train_dataloader = [batch, batch, batch, batch, batch]
    eval_dataloader = None

    tokenizer = mocker.MagicMock()
    optimizer = mocker.MagicMock()
    lr_scheduler = mocker.MagicMock()
    gradient_accumulation_steps = 1

    train_config = mocker.MagicMock()
    train_config.enable_fsdp = False
    train_config.use_fp16 = False
    train_config.run_validation = False
    train_config.gradient_clipping = False
    train_config.save_metrics = True
    train_config.output_dir = temp_output_dir

    results = train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
        local_rank=0,
    )

    assert results["metrics_filename"] not in ["", None]
    assert os.path.isfile(results["metrics_filename"])


def test_save_to_json_file_name_local_rank_0(temp_output_dir, mocker):
    model = mocker.MagicMock(name="model")
    model().loss.__truediv__().detach.return_value = torch.tensor(1)

    mock_tensor = mocker.MagicMock(name="tensor")
    batch = {"input": mock_tensor}
    train_dataloader = [batch, batch, batch, batch, batch]
    eval_dataloader = None

    tokenizer = mocker.MagicMock()
    optimizer = mocker.MagicMock()
    lr_scheduler = mocker.MagicMock()
    gradient_accumulation_steps = 1

    train_config = mocker.MagicMock()
    train_config.enable_fsdp = False
    train_config.use_fp16 = False
    train_config.run_validation = False
    train_config.gradient_clipping = False
    train_config.save_metrics = True
    train_config.output_dir = temp_output_dir

    # local_rank is deliberately omitted; the metrics file name should still
    # default to rank 0 rather than contain "None"
    results = train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
    )

    assert "None" not in results["metrics_filename"]
    assert "metrics_data_0" in results["metrics_filename"]


def test_save_to_json_file_name_local_rank_1(temp_output_dir, mocker):
    model = mocker.MagicMock(name="model")
    model().loss.__truediv__().detach.return_value = torch.tensor(1)

    mock_tensor = mocker.MagicMock(name="tensor")
    batch = {"input": mock_tensor}
    train_dataloader = [batch, batch, batch, batch, batch]
    eval_dataloader = None

    tokenizer = mocker.MagicMock()
    optimizer = mocker.MagicMock()
    lr_scheduler = mocker.MagicMock()
    gradient_accumulation_steps = 1

    train_config = mocker.MagicMock()
    train_config.enable_fsdp = False
    train_config.use_fp16 = False
    train_config.run_validation = False
    train_config.gradient_clipping = False
    train_config.save_metrics = True
    train_config.output_dir = temp_output_dir

    results = train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
        local_rank=1,
    )

    assert "None" not in results["metrics_filename"]
    assert "metrics_data_1" in results["metrics_filename"]


def test_save_to_json_folder_exists(path_to_generate, mocker):
    model = mocker.MagicMock(name="model")
    model().loss.__truediv__().detach.return_value = torch.tensor(1)

    mock_tensor = mocker.MagicMock(name="tensor")
    batch = {"input": mock_tensor}
    train_dataloader = [batch, batch, batch, batch, batch]
    eval_dataloader = None

    tokenizer = mocker.MagicMock()
    optimizer = mocker.MagicMock()
    lr_scheduler = mocker.MagicMock()
    gradient_accumulation_steps = 1

    train_config = mocker.MagicMock()
    train_config.enable_fsdp = False
    train_config.use_fp16 = False
    train_config.run_validation = False
    train_config.gradient_clipping = False
    train_config.save_metrics = True
    # output_dir does not exist up front; train() should create it before
    # writing the metrics file
    train_config.output_dir = path_to_generate

    results = train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
    )

    assert os.path.isfile(results["metrics_filename"])
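

# The four tests above repeat the same mock setup. A possible refactor (a
# sketch only, assuming pytest-mock is available, as it already is for the
# `mocker` fixture; the fixture name "training_mocks" is hypothetical) would
# centralize that boilerplate:
@pytest.fixture
def training_mocks(mocker):
    model = mocker.MagicMock(name="model")
    model().loss.__truediv__().detach.return_value = torch.tensor(1)
    batch = {"input": mocker.MagicMock(name="tensor")}
    train_dataloader = [batch] * 5
    return model, train_dataloader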