  1. # coding=utf-8
  2. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """GLUE dataset."""
  16. from abc import ABC
  17. from abc import abstractmethod
  18. from torch.utils.data import Dataset
  19. from megatron import print_rank_0
  20. from tasks.data_utils import build_sample
  21. from tasks.data_utils import build_tokens_types_paddings_from_text
  22. class GLUEAbstractDataset(ABC, Dataset):
  23. """GLUE base dataset class."""
  24. def __init__(self, task_name, dataset_name, datapaths,
  25. tokenizer, max_seq_length):
  26. # Store inputs.
  27. self.task_name = task_name
  28. self.dataset_name = dataset_name
  29. self.tokenizer = tokenizer
  30. self.max_seq_length = max_seq_length
  31. print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
  32. self.dataset_name))
  33. # Process the files.
  34. string = ' > paths:'
  35. for path in datapaths:
  36. string += ' ' + path
  37. print_rank_0(string)
  38. self.samples = []
  39. for datapath in datapaths:
  40. self.samples.extend(self.process_samples_from_single_path(datapath))
  41. print_rank_0(' >> total number of samples: {}'.format(
  42. len(self.samples)))
  43. def __len__(self):
  44. return len(self.samples)
  45. def __getitem__(self, idx):
  46. raw_sample = self.samples[idx]
  47. ids, types, paddings = build_tokens_types_paddings_from_text(
  48. raw_sample['text_a'], raw_sample['text_b'],
  49. self.tokenizer, self.max_seq_length)
  50. sample = build_sample(ids, types, paddings,
  51. raw_sample['label'], raw_sample['uid'])
  52. return sample
  53. @abstractmethod
  54. def process_samples_from_single_path(self, datapath):
  55. """Abstract method that takes a single path / filename and
  56. returns a list of dataset samples, each sample being a dict of
  57. {'text_a': string, 'text_b': string, 'label': int, 'uid': int}
  58. """
  59. pass