start_job.py 808 B

12345678910111213141516171819202122232425262728
  1. import datetime
  2. from sagemaker.pytorch import PyTorch
  3. import sagemaker
  4. import os
  5. sagemaker_session = sagemaker.Session()
  6. role = sagemaker.get_execution_role()
  7. try:
  8. role = sagemaker.get_execution_role()
  9. except ValueError:
  10. iam = boto3.client('iam')
  11. role = iam.get_role(RoleName='...')['Role']['Arn']
  12. print(role)
  13. volume_size = 500
  14. pytorch_estimator = PyTorch(
  15. entry_point="llama_finetuning.py", # the name of the script
  16. instance_type="ml.g5.12xlarge",
  17. instance_count=2, # this determines the number of p4d instances
  18. source_dir=os.getcwd(),
  19. framework_version="1.11.0",
  20. py_version="py38",
  21. volume_size=volume_size,
  22. # dependencies=[''],
  23. region='us-west-2',
  24. )
  25. pytorch_estimator.fit(
  26. job_name='FSDP' + '-' + datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ"))