| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 | # coding=utf-8# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at##     http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License."""GLUE dataset."""from abc import ABCfrom abc import abstractmethodfrom torch.utils.data import Datasetfrom megatron import print_rank_0from tasks.data_utils import build_samplefrom tasks.data_utils import build_tokens_types_paddings_from_textclass GLUEAbstractDataset(ABC, Dataset):    """GLUE base dataset class."""    def __init__(self, task_name, dataset_name, datapaths,                 tokenizer, max_seq_length):        # Store inputs.        self.task_name = task_name        self.dataset_name = dataset_name        self.tokenizer = tokenizer        self.max_seq_length = max_seq_length        print_rank_0(' > building {} dataset for {}:'.format(self.task_name,                                                             self.dataset_name))        # Process the files.        string = '  > paths:'        for path in datapaths:            string += ' ' + path        print_rank_0(string)        self.samples = []        for datapath in datapaths:            self.samples.extend(self.process_samples_from_single_path(datapath))        print_rank_0('  >> total number of samples: {}'.format(            len(self.samples)))    def __len__(self):        return len(self.samples)    def __getitem__(self, idx):        raw_sample = self.samples[idx]        ids, types, paddings = build_tokens_types_paddings_from_text(            raw_sample['text_a'], raw_sample['text_b'],            self.tokenizer, self.max_seq_length)        sample = build_sample(ids, types, paddings,                              raw_sample['label'], raw_sample['uid'])        return sample    @abstractmethod    def process_samples_from_single_path(self, datapath):        """Abstract method that takes a single path / filename and        returns a list of dataset samples, each sample being a dict of            {'text_a': string, 'text_b': string, 'label': int, 'uid': int}        """        pass
 |