data_utils.py

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tasks data utility."""

import re

import numpy as np


def clean_text(text):
    """Remove new lines and multiple spaces and adjust end of sentence dot."""
    text = text.replace("\n", " ")
    text = re.sub(r'\s+', ' ', text)
    for _ in range(3):
        text = text.replace(' . ', '. ')
    return text
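
# Illustrative example (not part of the original file): clean_text collapses
# runs of whitespace and re-attaches a space-separated sentence dot to the
# preceding word, e.g.
#
#     clean_text("Hello .\nWorld")  ->  'Hello. World'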


def build_sample(ids, types, paddings, label, unique_id):
    """Convert to numpy and return a sample consumed by the batch producer."""
    ids_np = np.array(ids, dtype=np.int64)
    types_np = np.array(types, dtype=np.int64)
    paddings_np = np.array(paddings, dtype=np.int64)
    sample = {'text': ids_np,
              'types': types_np,
              'padding_mask': paddings_np,
              'label': int(label),
              'uid': int(unique_id)}
    return sample
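
# Illustrative example (not part of the original file): with
# ids=[101, 7, 102], types=[0, 0, 0], paddings=[1, 1, 1], label=1 and
# unique_id=42, build_sample returns three int64 numpy arrays plus two ints:
#
#     {'text': array([101,   7, 102]), 'types': array([0, 0, 0]),
#      'padding_mask': array([1, 1, 1]), 'label': 1, 'uid': 42}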


def build_tokens_types_paddings_from_text(text_a, text_b,
                                          tokenizer, max_seq_length):
    """Build token types and paddings, trim if needed, and pad if needed."""
    text_a_ids = tokenizer.tokenize(text_a)
    text_b_ids = None
    if text_b is not None:
        text_b_ids = tokenizer.tokenize(text_b)
    return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
                                                max_seq_length, tokenizer.cls,
                                                tokenizer.sep, tokenizer.pad)
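
# Note (not part of the original file): the tokenizer is assumed to expose
# the Megatron-style interface used above -- a tokenize(text) method that
# returns a list of token ids, plus cls, sep and pad id attributes. Any
# object with those four members works, e.g. a hypothetical test stub:
#
#     class StubTokenizer:
#         cls, sep, pad = 101, 102, 0
#         def tokenize(self, text):
#             return [hash(token) % 1000 for token in text.split()]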


def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
                                         cls_id, sep_id, pad_id):
    """Build token types and paddings, trim if needed, and pad if needed."""
    ids = []
    types = []
    paddings = []

    # [CLS].
    ids.append(cls_id)
    types.append(0)
    paddings.append(1)

    # A.
    len_text_a = len(text_a_ids)
    ids.extend(text_a_ids)
    types.extend([0] * len_text_a)
    paddings.extend([1] * len_text_a)

    # [SEP].
    ids.append(sep_id)
    types.append(0)
    paddings.append(1)

    # B.
    if text_b_ids is not None:
        len_text_b = len(text_b_ids)
        ids.extend(text_b_ids)
        types.extend([1] * len_text_b)
        paddings.extend([1] * len_text_b)

    # Cap the size, trimming to max_seq_length - 1 to leave room for the
    # closing [SEP] appended below.
    trimmed = False
    if len(ids) >= max_seq_length:
        max_seq_length_m1 = max_seq_length - 1
        ids = ids[0:max_seq_length_m1]
        types = types[0:max_seq_length_m1]
        paddings = paddings[0:max_seq_length_m1]
        trimmed = True

    # [SEP].
    if (text_b_ids is not None) or trimmed:
        ids.append(sep_id)
        if text_b_ids is None:
            types.append(0)
        else:
            types.append(1)
        paddings.append(1)

    # Padding.
    padding_length = max_seq_length - len(ids)
    if padding_length > 0:
        ids.extend([pad_id] * padding_length)
        types.extend([pad_id] * padding_length)
        paddings.extend([0] * padding_length)

    return ids, types, paddings
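

# ---------------------------------------------------------------------------
# Minimal smoke test (not part of the original file). It exercises the
# trimming and padding logic above with hand-picked ids; the cls/sep/pad
# values are arbitrary stand-ins, not any real tokenizer's vocabulary.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    cls_id, sep_id, pad_id = 101, 102, 0

    # A sequence pair that fits within max_seq_length: expect
    # [CLS] A [SEP] B [SEP] followed by pad ids, with types 0 for the A
    # segment and 1 for B, and padding_mask 1 on real tokens, 0 on padding.
    ids, types, paddings = build_tokens_types_paddings_from_ids(
        [11, 12], [21], max_seq_length=8,
        cls_id=cls_id, sep_id=sep_id, pad_id=pad_id)
    assert ids == [101, 11, 12, 102, 21, 102, 0, 0]
    assert types == [0, 0, 0, 0, 1, 1, 0, 0]
    assert paddings == [1, 1, 1, 1, 1, 1, 0, 0]

    # An over-long single sequence: trimmed to max_seq_length - 1 and closed
    # with a final [SEP], so the result is exactly max_seq_length long.
    ids, types, paddings = build_tokens_types_paddings_from_ids(
        list(range(1, 10)), None, max_seq_length=6,
        cls_id=cls_id, sep_id=sep_id, pad_id=pad_id)
    assert len(ids) == 6 and ids[-1] == sep_id

    print('data_utils smoke test passed.')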