fused_layer_norm.py

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  15. """This code is copied fron NVIDIA apex:
  16. https://github.com/NVIDIA/apex
  17. with some changes. """
import importlib
import numbers

import torch
from torch.nn import init
from torch.nn.parameter import Parameter

# The pre-built CUDA extension is loaded lazily, the first time a
# MixedFusedLayerNorm module is constructed (see __init__ below).
global fused_mix_prec_layer_norm_cuda
fused_mix_prec_layer_norm_cuda = None


class FusedLayerNormAffineFunction(torch.autograd.Function):
    """Autograd function wrapping the fused mixed-precision layer norm
    CUDA kernels (forward_affine / backward_affine)."""

    @staticmethod
    def forward(ctx, input, weight, bias, normalized_shape, eps):
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        # The CUDA kernels require contiguous tensors.
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        bias_ = bias.contiguous()
        output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
            input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
        # Save the inputs and the per-row statistics for the backward pass.
        ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input_, weight_, bias_, mean, invvar = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None
        grad_input, grad_weight, grad_bias \
            = fused_mix_prec_layer_norm_cuda.backward_affine(
                grad_output.contiguous(), mean, invvar,
                input_, ctx.normalized_shape,
                weight_, bias_, ctx.eps)
        return grad_input, grad_weight, grad_bias, None, None
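
# Illustrative sketch, not part of the original file: the function can also be
# invoked directly through .apply (assuming the CUDA extension is built and
# the tensors live on a CUDA device), e.g.
#
#     out = FusedLayerNormAffineFunction.apply(
#         x, weight, bias, (hidden_size,), 1e-5)
#
# The trailing (None, None) returned by backward corresponds to
# normalized_shape and eps, which are non-tensor arguments and therefore
# receive no gradient.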


class MixedFusedLayerNorm(torch.nn.Module):
    """Layer norm with an affine transform, backed by the fused
    mixed-precision CUDA kernels from apex."""

    def __init__(self, normalized_shape, eps=1e-5):
        super(MixedFusedLayerNorm, self).__init__()

        # Import the pre-built CUDA extension on first use.
        global fused_mix_prec_layer_norm_cuda
        fused_mix_prec_layer_norm_cuda = importlib.import_module(
            "fused_mix_prec_layer_norm_cuda")

        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = torch.Size(normalized_shape)
        self.eps = eps
        self.weight = Parameter(torch.Tensor(*normalized_shape))
        self.bias = Parameter(torch.Tensor(*normalized_shape))
        self.reset_parameters()

    def reset_parameters(self):
        # Standard layer-norm initialization: unit scale, zero shift.
        init.ones_(self.weight)
        init.zeros_(self.bias)

    def forward(self, input):
        return FusedLayerNormAffineFunction.apply(
            input, self.weight, self.bias, self.normalized_shape, self.eps)
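

# Minimal usage sketch, not part of the original file: this assumes the
# fused_mix_prec_layer_norm_cuda extension has been built and is importable,
# and that a CUDA device is available. Shapes and sizes are illustrative.
if __name__ == "__main__":
    hidden_size = 1024
    layer_norm = MixedFusedLayerNorm(hidden_size).cuda()
    x = torch.randn(8, 128, hidden_size, device="cuda", requires_grad=True)
    y = layer_norm(x)    # forward runs the fused CUDA kernel
    y.sum().backward()   # backward runs the fused CUDA kernel as well
    print(y.shape, x.grad.shape)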