test_data.py

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import operator
import sys

# The `mpu` package lives two directories up; put it on the path before the
# imports below so the test can be run directly from its own directory.
sys.path.append("../..")

import torch

import mpu
from mpu import data as data_utils
from commons import initialize_distributed
from commons import print_separator
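

# test_broadcast_data drives mpu.broadcast_data, which sends a dictionary of
# tensors from tensor-model-parallel rank 0 to every other rank in its group;
# every rank keeps a private copy (data_t) so the result can be verified.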
def test_broadcast_data(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing broadcast_data with model parallel size {} ...'.
              format(tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
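
    # Reference shapes for each key; identical on every rank so the broadcast
    # result can be checked locally.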
    key_size_t = {'key1': [7, 11],
                  'key2': [8, 2, 1],
                  'key3': [13],
                  'key4': [5, 1, 2],
                  'key5': [5, 12]}
    keys = list(key_size_t.keys())
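
    # Build the tensors on every rank, but keep `data` only on
    # tensor-model-parallel rank 0; the other ranks pass None and rely on the
    # broadcast. `data_t` is the local reference copy, and 'keyX' is a float
    # tensor that is deliberately left out of `keys`.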
    data = {}
    data_t = {}
    for key in key_size_t:
        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
        data_t[key] = data[key].clone()
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    if mpu.get_tensor_model_parallel_rank() != 0:
        data = None
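
    # Exercise the private helpers: the per-key sizes and element counts they
    # report must match the reference shapes above.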
    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, \
        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
    for key in keys:
        assert key_size[key] == key_size_t[key]
    total_numel_t = 0
    for key in keys:
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t
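
    # Broadcast from tensor-model-parallel rank 0 and compare element-wise
    # against the local reference copy on every rank.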
    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0

    # Reset groups
    mpu.destroy_tensor_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
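

# When run as a script, the test sweeps tensor-model-parallel sizes 1, 2, 4,
# ... up to the world size. It is meant to be launched with one process per
# GPU; assuming the usual torch.distributed environment variables are set up
# by initialize_distributed(), something like
#   torchrun --nproc_per_node=<num_gpus> test_data.py
# should work.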
if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test broadcast data')
        test_broadcast_data(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2