# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- """ Tasks data utility."""
- import re
- import numpy as np


def clean_text(text):
    """Remove newlines, collapse repeated whitespace, and turn spaced-out
    sentence dots (' . ') into '. '."""
    text = text.replace("\n", " ")
    text = re.sub(r'\s+', ' ', text)
    # str.replace is non-overlapping, so a few passes are needed to clean up
    # consecutive spaced dots left behind by the collapse above.
    for _ in range(3):
        text = text.replace(' . ', '. ')
    return text
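
# Illustrative example (not in the original module):
#   clean_text('Hello\n  world . Bye')  ->  'Hello world. Bye'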


def build_sample(ids, types, paddings, label, unique_id):
    """Convert to numpy and return a sample consumed by the batch producer."""

    ids_np = np.array(ids, dtype=np.int64)
    types_np = np.array(types, dtype=np.int64)
    paddings_np = np.array(paddings, dtype=np.int64)
    sample = {'text': ids_np,
              'types': types_np,
              'padding_mask': paddings_np,
              'label': int(label),
              'uid': int(unique_id)}
    return sample
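
# Illustrative result (hypothetical ids and label, not from the source):
#   build_sample([101, 1000, 102], [0, 0, 0], [1, 1, 1], 1, 42)
#   -> {'text': array([101, 1000, 102]), 'types': array([0, 0, 0]),
#       'padding_mask': array([1, 1, 1]), 'label': 1, 'uid': 42}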


def build_tokens_types_paddings_from_text(text_a, text_b,
                                          tokenizer, max_seq_length):
    """Build token types and paddings, trim if needed, and pad if needed."""

    text_a_ids = tokenizer.tokenize(text_a)
    text_b_ids = None
    if text_b is not None:
        text_b_ids = tokenizer.tokenize(text_b)

    return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
                                                max_seq_length, tokenizer.cls,
                                                tokenizer.sep, tokenizer.pad)
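
# Illustrative shape of the output (hypothetical token ids): with
# max_seq_length=8, text_a tokenized to [10, 11], and text_b to [20],
# the three returned lists line up as
#   ids      = [cls, 10, 11, sep, 20, sep, pad, pad]
#   types    = [0,   0,  0,  0,   1,  1,   pad, pad]
#   paddings = [1,   1,  1,  1,   1,  1,   0,   0]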


def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
                                         cls_id, sep_id, pad_id):
    """Build token types and paddings, trim if needed, and pad if needed."""

    ids = []
    types = []
    paddings = []

    # [CLS].
    ids.append(cls_id)
    types.append(0)
    paddings.append(1)

    # A.
    len_text_a = len(text_a_ids)
    ids.extend(text_a_ids)
    types.extend([0] * len_text_a)
    paddings.extend([1] * len_text_a)

    # [SEP].
    ids.append(sep_id)
    types.append(0)
    paddings.append(1)

    # B.
    if text_b_ids is not None:
        len_text_b = len(text_b_ids)
        ids.extend(text_b_ids)
        types.extend([1] * len_text_b)
        paddings.extend([1] * len_text_b)

    # Cap the size: keep at most max_seq_length - 1 tokens so the final
    # [SEP] appended below still fits.
    trimmed = False
    if len(ids) >= max_seq_length:
        max_seq_length_m1 = max_seq_length - 1
        ids = ids[0:max_seq_length_m1]
        types = types[0:max_seq_length_m1]
        paddings = paddings[0:max_seq_length_m1]
        trimmed = True

    # [SEP]. Needed whenever a B segment is present, or when trimming
    # removed the trailing separator.
    if (text_b_ids is not None) or trimmed:
        ids.append(sep_id)
        if text_b_ids is None:
            types.append(0)
        else:
            types.append(1)
        paddings.append(1)

    # Padding. Note that the segment types are padded with pad_id (as in the
    # original code); the zeros in paddings mask these positions out anyway.
    padding_length = max_seq_length - len(ids)
    if padding_length > 0:
        ids.extend([pad_id] * padding_length)
        types.extend([pad_id] * padding_length)
        paddings.extend([0] * padding_length)

    return ids, types, paddings
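

# The block below is an illustrative self-test, not part of the original
# module. It exercises the pipeline end to end with a hypothetical stub
# tokenizer; the real code expects a Megatron-style tokenizer exposing
# tokenize(), cls, sep, and pad, and all ids here are made up.
if __name__ == '__main__':

    class _StubTokenizer:
        """Hypothetical stand-in for a Megatron tokenizer."""
        cls = 101
        sep = 102
        pad = 0

        def tokenize(self, text):
            # Fake ids: one token per whitespace-separated word.
            return [1000 + i for i, _ in enumerate(text.split())]

    tok = _StubTokenizer()
    text = clean_text('A first  sentence .\nAnd a second one')
    ids, types, paddings = build_tokens_types_paddings_from_text(
        text, 'a short pair text', tok, max_seq_length=16)
    sample = build_sample(ids, types, paddings, label=1, unique_id=0)
    # Expect arrays of length 16: [CLS], A tokens, [SEP], B tokens, [SEP], pads.
    print(sample['text'])
    print(sample['types'])
    print(sample['padding_mask'])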