@@ -151,11 +151,11 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                         batch[key] = batch[key].to('cuda:0')
                 with autocast():
                     loss = model(**batch).loss
+                total_loss += loss.detach().float()
                 loss = loss / gradient_accumulation_steps
                 if train_config.save_metrics:
                     train_step_loss.append(loss.detach().float().item())
                     train_step_perplexity.append(float(torch.exp(loss.detach().float())))
-                total_loss += loss.detach().float()
                 if train_config.use_fp16:
                     # if fp16 is enabled, use gradient scaler to handle gradient update
                     scaler.scale(loss).backward()
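
The hunk moves the `total_loss` accumulation above the `loss = loss / gradient_accumulation_steps` scaling, so `total_loss` now sums the raw per-batch loss rather than the loss already shrunk for gradient accumulation; the scaled value is still what gets passed to `backward()`. Below is a minimal sketch, not taken from the repo, illustrating the effect of the two accumulation orders. The loop, the `fake_losses` values, and the variable names `total_before` / `total_after` are invented for the example; the downstream averaging into an epoch loss is assumed to divide by the number of batches, as is typical.

    import torch

    gradient_accumulation_steps = 4
    fake_losses = [torch.tensor(2.0), torch.tensor(2.4), torch.tensor(1.6), torch.tensor(2.0)]

    total_before = torch.tensor(0.0)  # old order: accumulate after scaling
    total_after = torch.tensor(0.0)   # new order: accumulate before scaling

    for loss in fake_losses:
        total_after += loss.detach().float()         # unscaled loss, as in the patched line
        scaled = loss / gradient_accumulation_steps  # scaling is only needed for backward()
        total_before += scaled.detach().float()      # old code summed the already-scaled loss

    print(total_after / len(fake_losses))   # ~2.0: the true mean training loss
    print(total_before / len(fake_losses))  # ~0.5: under-reported by the accumulation factor

Under this reading, any epoch-level loss or perplexity derived from `total_loss` later in `train()` was previously low by a factor of `gradient_accumulation_steps`; the per-step metrics in `train_step_loss` and `train_step_perplexity` are unchanged by the patch since they still use the scaled loss.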