Source code for biapy.engine.train_engine

import torch
import math
import sys
import numpy as np
from typing import Iterable
from timm.utils import accuracy

from biapy.utils.misc import MetricLogger, SmoothedValue, all_reduce_mean, to_pytorch_format

def train_one_epoch(cfg, model, model_call_func, loss_function, activations, metric_function, prepare_targets,
                    data_loader, optimizer, device, loss_scaler, epoch, log_writer=None, lr_scheduler=None,
                    start_steps=0, verbose=False):
    model.train(True)

    # Ensure correct order of each epoch info by adding loss first
    metric_logger = MetricLogger(delimiter=" ", verbose=verbose)
    metric_logger.add_meter('loss', SmoothedValue())
    header = 'Epoch: [{}]'.format(epoch+1)
    print_freq = 10

    optimizer.zero_grad()

    for step, (batch, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        # Apply warmup cosine decay scheduler if selected
        # (notice we use a per iteration (instead of per epoch) lr scheduler)
        if step % cfg.TRAIN.ACCUM_ITER == 0 and cfg.TRAIN.LR_SCHEDULER.NAME == 'warmupcosine':
            lr_scheduler.adjust_learning_rate(optimizer, step / len(data_loader) + epoch)
        it = start_steps + step  # global training iteration

        # Gather inputs
        targets = prepare_targets(targets, batch)

        if batch.shape[1:-1] != cfg.DATA.PATCH_SIZE[:-1]:
            raise ValueError("Trying to input data with different shape than 'DATA.PATCH_SIZE'. Check your configuration."
                             f" Input: {batch.shape[1:-1]} vs PATCH_SIZE: {cfg.DATA.PATCH_SIZE[:-1]}")

        # Pass the images through the model
        # TODO: control autocast and mixed precision
        with torch.cuda.amp.autocast(enabled=False):
            outputs = activations(model_call_func(batch, is_train=True), training=True)
            loss = loss_function(outputs, targets)

        loss_value = loss.item()
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            sys.exit(1)

        # Calculate the metrics
        metric_function(outputs, targets, metric_logger)

        # Scale the loss for gradient accumulation and backpropagate
        loss /= cfg.TRAIN.ACCUM_ITER
        loss.backward()

        # Update the weights only every ACCUM_ITER iterations
        if (step + 1) % cfg.TRAIN.ACCUM_ITER == 0:
            optimizer.step()
            optimizer.zero_grad()
            if lr_scheduler is not None and cfg.TRAIN.LR_SCHEDULER.NAME == 'onecycle':
                lr_scheduler.step()

        if device.type != 'cpu':
            torch.cuda.synchronize()

        # Update loss in loggers
        metric_logger.update(loss=loss_value)
        loss_value_reduce = all_reduce_mean(loss_value)
        if log_writer is not None:
            log_writer.update(loss=loss_value_reduce, head="loss")

        # Update lr in loggers
        max_lr = 0.
        for group in optimizer.param_groups:
            max_lr = max(max_lr, group["lr"])
        if step == 0:
            metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
        metric_logger.update(lr=max_lr)
        if log_writer is not None:
            log_writer.update(lr=max_lr, head="opt")

    # Gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("[Train] averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
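
# --- Illustrative sketch (not part of the original module) ------------------------------------
# `train_one_epoch` drives the 'warmupcosine' scheduler per iteration by passing it a fractional
# epoch (step / len(data_loader) + epoch). A common formulation of such a linear-warmup plus
# half-cosine decay schedule is sketched below; the function name and its parameters are
# assumptions for illustration only and may differ from BiaPy's actual scheduler.
def _warmup_cosine_lr_sketch(base_lr, min_lr, fractional_epoch, warmup_epochs, total_epochs):
    """Return the learning rate for a given (possibly fractional) epoch."""
    if fractional_epoch < warmup_epochs:
        # Linear ramp from 0 to base_lr during warmup
        return base_lr * fractional_epoch / warmup_epochs
    # Half-cycle cosine decay from base_lr down to min_lr afterwards
    progress = (fractional_epoch - warmup_epochs) / (total_epochs - warmup_epochs)
    return min_lr + (base_lr - min_lr) * 0.5 * (1.0 + math.cos(math.pi * progress))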

@torch.no_grad()
def evaluate(cfg, model, model_call_func, loss_function, activations, metric_function, prepare_targets, epoch,
             data_loader, lr_scheduler):
    # Ensure correct order of each epoch info by adding loss first
    metric_logger = MetricLogger(delimiter=" ")
    metric_logger.add_meter('loss', SmoothedValue())
    header = 'Epoch: [{}]'.format(epoch+1)

    # Switch to evaluation mode
    model.eval()

    for batch in metric_logger.log_every(data_loader, 10, header):
        # Gather inputs
        images = batch[0]
        targets = batch[1]
        targets = prepare_targets(targets, images)

        # Pass the images through the model
        # TODO: control autocast and mixed precision
        with torch.cuda.amp.autocast(enabled=False):
            outputs = activations(model_call_func(images, is_train=True), training=True)
            loss = loss_function(outputs, targets)

        # Calculate the metrics
        metric_function(outputs, targets, metric_logger)
        metric_logger.update(loss=loss.item())

    # Gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("[Val] averaged stats:", metric_logger)

    # Step the reduceonplateau scheduler using the epoch's global validation loss
    if lr_scheduler is not None and cfg.TRAIN.LR_SCHEDULER.NAME == 'reduceonplateau':
        lr_scheduler.step(metric_logger.meters['loss'].global_avg)

    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
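
# --- Illustrative usage (not part of the original module) -------------------------------------
# A minimal sketch of how `train_one_epoch` and `evaluate` might be wired together into a full
# training loop. All objects passed in (cfg, model, callbacks, loaders, optimizer, ...) are
# assumed to be built elsewhere by BiaPy's engine setup; the helper name
# `run_training_loop_sketch` and its signature are hypothetical, and checkpointing and early
# stopping are deliberately omitted.
def run_training_loop_sketch(cfg, model, model_call_func, loss_function, activations, metric_function,
                             prepare_targets, train_loader, val_loader, optimizer, device, loss_scaler,
                             lr_scheduler, epochs, log_writer=None):
    best_val_loss = float('inf')
    for epoch in range(epochs):
        # One pass over the training data (the function itself prints the averaged stats)
        train_one_epoch(cfg, model, model_call_func, loss_function, activations, metric_function,
                        prepare_targets, train_loader, optimizer, device, loss_scaler, epoch,
                        log_writer=log_writer, lr_scheduler=lr_scheduler,
                        start_steps=epoch * len(train_loader))

        # One pass over the validation data (gradients are disabled inside `evaluate`)
        val_stats = evaluate(cfg, model, model_call_func, loss_function, activations, metric_function,
                             prepare_targets, epoch, val_loader, lr_scheduler)

        # Track the best validation loss (model saving intentionally left out of this sketch)
        if val_stats['loss'] < best_val_loss:
            best_val_loss = val_stats['loss']
    return best_val_loss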