import glob
import json
import os
import time

from torch.utils.data import Dataset

from megatron import print_rank_0
from tasks.data_utils import build_sample
from tasks.data_utils import build_tokens_types_paddings_from_ids
from tasks.data_utils import clean_text


NUM_CHOICES = 4
MAX_QA_LENGTH = 128


class RaceDataset(Dataset):

    def __init__(self, dataset_name, datapaths, tokenizer, max_seq_length,
                 max_qa_length=MAX_QA_LENGTH):

        self.dataset_name = dataset_name
        print_rank_0(' > building RACE dataset for {}:'.format(
            self.dataset_name))

        string = ' > paths:'
        for path in datapaths:
            string += ' ' + path
        print_rank_0(string)

        self.samples = []
        for datapath in datapaths:
            self.samples.extend(process_single_datapath(datapath, tokenizer,
                                                        max_qa_length,
                                                        max_seq_length))

        print_rank_0(' >> total number of samples: {}'.format(
            len(self.samples)))

        # Each "sample" bundles NUM_CHOICES sub-samples that downstream
        # code collapses into the batch dimension.
        self.sample_multiplier = NUM_CHOICES

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
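

# Illustrative usage only (a sketch, not part of the original module): the
# tokenizer is assumed to follow Megatron-LM's tokenizer interface, i.e. it
# provides tokenize() plus cls/sep/pad token ids, and the dataset path below
# is hypothetical.
#
#     tokenizer = get_tokenizer()  # megatron.get_tokenizer(), after init
#     train = RaceDataset('training', ['/path/to/RACE/train'],
#                         tokenizer, max_seq_length=512)
#     sample = train[0]  # one question: NUM_CHOICES encodings plus a label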


def process_single_datapath(datapath, tokenizer, max_qa_length, max_seq_length):
    """Read in RACE files, combine, clean up, tokenize, and convert to
    samples."""

    print_rank_0('   > working on {}'.format(datapath))
    start_time = time.time()

    # Get the list of files.
    filenames = glob.glob(os.path.join(datapath, '*.txt'))

    samples = []
    num_docs = 0
    num_questions = 0
    num_samples = 0
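
    # Each line of every *.txt file is assumed to be one JSON record with
    # the fields read below; this example is illustrative, not taken from
    # the dataset:
    #   {"article": "Passage text ...",
    #    "questions": ["The man went to the _ .", ...],
    #    "options": [["store", "park", "bank", "school"], ...],
    #    "answers": ["A", ...]}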
    # Load all the files.
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                data = json.loads(line)
                num_docs += 1

                context = data["article"]
                questions = data["questions"]
                choices = data["options"]
                answers = data["answers"]
                # Check the lengths.
                assert len(questions) == len(answers)
                assert len(questions) == len(choices)

                # Context: clean up and convert to ids.
                context = clean_text(context)
                context_ids = tokenizer.tokenize(context)

                # Loop over questions.
                for qi, question in enumerate(questions):
                    num_questions += 1
                    # Label: answers are letters, so 'A' -> 0, ..., 'D' -> 3.
                    label = ord(answers[qi]) - ord("A")
                    assert label >= 0
                    assert label < NUM_CHOICES
                    assert len(choices[qi]) == NUM_CHOICES

                    # For each question, build num-choices samples.
                    ids_list = []
                    types_list = []
                    paddings_list = []
                    for ci in range(NUM_CHOICES):
                        choice = choices[qi][ci]
                        # Merge with choice.
                        if "_" in question:
                            qa = question.replace("_", choice)
                        else:
                            qa = " ".join([question, choice])
                        # Clean QA.
                        qa = clean_text(qa)
                        # Tokenize.
                        qa_ids = tokenizer.tokenize(qa)
                        # Trim if needed.
                        if len(qa_ids) > max_qa_length:
                            qa_ids = qa_ids[0:max_qa_length]

                        # Build the sample.
                        ids, types, paddings \
                            = build_tokens_types_paddings_from_ids(
                                qa_ids, context_ids, max_seq_length,
                                tokenizer.cls, tokenizer.sep, tokenizer.pad)
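                        # Note (assumption about the tasks.data_utils
                        # helper): ids lay out [CLS] qa [SEP] context [SEP]
                        # with the context trimmed to fit max_seq_length,
                        # types are 0/1 segment ids, and paddings is a mask
                        # with 1 for real tokens and 0 for padding.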
                        ids_list.append(ids)
                        types_list.append(types)
                        paddings_list.append(paddings)

                    # Convert to numpy and add to the samples; the
                    # NUM_CHOICES encodings for one question form a single
                    # multiple-choice sample with one label and a unique id.
                    samples.append(build_sample(ids_list, types_list,
                                                paddings_list, label,
                                                num_samples))
                    num_samples += 1

    elapsed_time = time.time() - start_time
    print_rank_0('    > processed {} documents, {} questions, and {} samples'
                 ' in {:.2f} seconds'.format(num_docs, num_questions,
                                             num_samples, elapsed_time))
    return samples