diff --git a/.github/workflows/dali_tests.yml b/.github/workflows/dali_tests.yml index b0e6e8de8..b1b8103ef 100644 --- a/.github/workflows/dali_tests.yml +++ b/.github/workflows/dali_tests.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python: [3.7, 3.8, 3.9] + python: [3.8, 3.9] os: [ubuntu-latest] steps: diff --git a/.github/workflows/sphinx-linkcheck.yml b/.github/workflows/sphinx-linkcheck.yml index c703aba4c..995e1c13d 100644 --- a/.github/workflows/sphinx-linkcheck.yml +++ b/.github/workflows/sphinx-linkcheck.yml @@ -13,10 +13,10 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.7 + - name: Set up Python 3.9 uses: actions/setup-python@v1 with: - python-version: 3.7 + python-version: 3.9 - name: python dependencies run: | diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9bc20365e..e0e2950b1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python: [3.7, 3.8, 3.9] + python: [3.8, 3.9] os: [ubuntu-latest, windows-latest] steps: diff --git a/docs/source/conf.py b/docs/source/conf.py index 733a40c7c..2679646c9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -52,7 +52,7 @@ def package_list_from_file(file): """List up package name (not containing version and extras) from a package list file""" mocked_packages = [] - with open(file, "r") as fp: + with open(file) as fp: for ln in fp.readlines(): # Example: `tqdm>=4.41.0` => `tqdm` # `[` is for package with extras diff --git a/docs/source/solo/methods/base.rst b/docs/source/solo/methods/base.rst index 94929df99..3b3464a1a 100644 --- a/docs/source/solo/methods/base.rst +++ b/docs/source/solo/methods/base.rst @@ -45,9 +45,9 @@ validation_step .. automethod:: solo.methods.base.BaseMethod.validation_step :noindex: -validation_epoch_end +on_validation_epoch_end ~~~~~~~~~~~~~~~~~~~~ -.. automethod:: solo.methods.base.BaseMethod.validation_epoch_end +.. automethod:: solo.methods.base.BaseMethod.on_validation_epoch_end :noindex: @@ -104,7 +104,7 @@ validation_step .. automethod:: solo.methods.base.BaseMethod.validation_step :noindex: -validation_epoch_end +on_validation_epoch_end ~~~~~~~~~~~~~~~~~~~~ -.. automethod:: solo.methods.base.BaseMethod.validation_epoch_end +.. automethod:: solo.methods.base.BaseMethod.on_validation_epoch_end :noindex: diff --git a/docs/source/solo/methods/linear.rst b/docs/source/solo/methods/linear.rst index 10f785b9d..cc17505cc 100644 --- a/docs/source/solo/methods/linear.rst +++ b/docs/source/solo/methods/linear.rst @@ -35,7 +35,7 @@ validation_step .. automethod:: solo.methods.linear.LinearModel.validation_step :noindex: -validation_epoch_end +on_validation_epoch_end ~~~~~~~~~~~~~~~~~~~~ -.. automethod:: solo.methods.linear.LinearModel.validation_epoch_end +.. 
automethod:: solo.methods.linear.LinearModel.on_validation_epoch_end :noindex: diff --git a/main_umap.py b/main_umap.py index a6607b665..aa61bd199 100644 --- a/main_umap.py +++ b/main_umap.py @@ -21,6 +21,8 @@ import os from pathlib import Path +from omegaconf import OmegaConf + from solo.args.umap import parse_args_umap from solo.data.classification_dataloader import prepare_data from solo.methods import METHODS @@ -38,15 +40,14 @@ def main(): # load arguments with open(args_path) as f: method_args = json.load(f) + cfg = OmegaConf.create(method_args) # build the model model = ( METHODS[method_args["method"]] - .load_from_checkpoint(ckpt_path, strict=False, **method_args) + .load_from_checkpoint(ckpt_path, strict=False, cfg=cfg) .backbone ) - model.cuda() - # prepare data train_loader, val_loader = prepare_data( args.dataset, @@ -55,7 +56,7 @@ def main(): data_format=args.data_format, batch_size=args.batch_size, num_workers=args.num_workers, - auto_augment=args.auto_augment, + auto_augment=False, ) umap = OfflineUMAP() diff --git a/requirements.txt b/requirements.txt index 55fe0ddce..bafca8262 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,8 @@ torch>=1.10.0 torchvision>=0.11.1 einops -pytorch-lightning>=1.7.0, <1.9.0 +pytorch-lightning==2.0.2 torchmetrics>=0.6.0, <0.12.0 -lightning-bolts>=0.6.0 tqdm wandb scipy diff --git a/scripts/finetune/imagenet-100/mae.yaml b/scripts/finetune/imagenet-100/mae.yaml index a51b72549..a8e2dfb50 100644 --- a/scripts/finetune/imagenet-100/mae.yaml +++ b/scripts/finetune/imagenet-100/mae.yaml @@ -49,4 +49,4 @@ devices: [0, 1, 2, 3, 4, 5, 6, 7] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/finetune/imagenet/mae.yaml b/scripts/finetune/imagenet/mae.yaml index f3c0453a5..fc821348a 100644 --- a/scripts/finetune/imagenet/mae.yaml +++ b/scripts/finetune/imagenet/mae.yaml @@ -49,4 +49,4 @@ devices: [0, 1, 2, 3, 4, 5, 6, 7] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/barlow.yaml b/scripts/linear/imagenet-100/barlow.yaml index 534859bd4..f984833a0 100644 --- a/scripts/linear/imagenet-100/barlow.yaml +++ b/scripts/linear/imagenet-100/barlow.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/byol.yaml b/scripts/linear/imagenet-100/byol.yaml index e167722ba..5fcfe9e3f 100644 --- a/scripts/linear/imagenet-100/byol.yaml +++ b/scripts/linear/imagenet-100/byol.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/deepclusterv2.yaml b/scripts/linear/imagenet-100/deepclusterv2.yaml index 4d4061930..0c68b6d4e 100644 --- a/scripts/linear/imagenet-100/deepclusterv2.yaml +++ b/scripts/linear/imagenet-100/deepclusterv2.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/dino.yaml b/scripts/linear/imagenet-100/dino.yaml index edacd281b..261da18f3 100644 --- a/scripts/linear/imagenet-100/dino.yaml +++ b/scripts/linear/imagenet-100/dino.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/mocov2plus.yaml b/scripts/linear/imagenet-100/mocov2plus.yaml index 
55d15a03c..6f08d9fc2 100644 --- a/scripts/linear/imagenet-100/mocov2plus.yaml +++ b/scripts/linear/imagenet-100/mocov2plus.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/mocov3.yaml b/scripts/linear/imagenet-100/mocov3.yaml index 30beaf130..d13dbf2b3 100644 --- a/scripts/linear/imagenet-100/mocov3.yaml +++ b/scripts/linear/imagenet-100/mocov3.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/mocov3_vit.yaml b/scripts/linear/imagenet-100/mocov3_vit.yaml index 92a298e81..58f79c2bb 100644 --- a/scripts/linear/imagenet-100/mocov3_vit.yaml +++ b/scripts/linear/imagenet-100/mocov3_vit.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/nnclr.yaml b/scripts/linear/imagenet-100/nnclr.yaml index ac197f94f..c4e27a443 100644 --- a/scripts/linear/imagenet-100/nnclr.yaml +++ b/scripts/linear/imagenet-100/nnclr.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/ressl.yaml b/scripts/linear/imagenet-100/ressl.yaml index e8e87d8bd..800811224 100644 --- a/scripts/linear/imagenet-100/ressl.yaml +++ b/scripts/linear/imagenet-100/ressl.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/simclr.yaml b/scripts/linear/imagenet-100/simclr.yaml index 04e312fe0..a40694c8c 100644 --- a/scripts/linear/imagenet-100/simclr.yaml +++ b/scripts/linear/imagenet-100/simclr.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/simsiam.yaml b/scripts/linear/imagenet-100/simsiam.yaml index b7d9ddae5..7ecd4b4ab 100644 --- a/scripts/linear/imagenet-100/simsiam.yaml +++ b/scripts/linear/imagenet-100/simsiam.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/swav.yaml b/scripts/linear/imagenet-100/swav.yaml index f0155b52b..08e606ff6 100644 --- a/scripts/linear/imagenet-100/swav.yaml +++ b/scripts/linear/imagenet-100/swav.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/vibcreg.yaml b/scripts/linear/imagenet-100/vibcreg.yaml index d4ad39f70..463d70fde 100644 --- a/scripts/linear/imagenet-100/vibcreg.yaml +++ b/scripts/linear/imagenet-100/vibcreg.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet-100/vicreg.yaml b/scripts/linear/imagenet-100/vicreg.yaml index 0d0150b22..253b5c74e 100644 --- a/scripts/linear/imagenet-100/vicreg.yaml +++ b/scripts/linear/imagenet-100/vicreg.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet/barlow.yaml b/scripts/linear/imagenet/barlow.yaml index 61d32abcd..0d0947d50 100644 --- a/scripts/linear/imagenet/barlow.yaml +++ 
b/scripts/linear/imagenet/barlow.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet/byol.yaml b/scripts/linear/imagenet/byol.yaml index 12aef3266..ba44afdee 100644 --- a/scripts/linear/imagenet/byol.yaml +++ b/scripts/linear/imagenet/byol.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/linear/imagenet/mocov2plus.yaml b/scripts/linear/imagenet/mocov2plus.yaml index 8b1a0ea8d..3cf52182d 100644 --- a/scripts/linear/imagenet/mocov2plus.yaml +++ b/scripts/linear/imagenet/mocov2plus.yaml @@ -42,4 +42,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar-multicrop/swav.yaml b/scripts/pretrain/cifar-multicrop/swav.yaml index 5a9c436c7..c36b7669f 100644 --- a/scripts/pretrain/cifar-multicrop/swav.yaml +++ b/scripts/pretrain/cifar-multicrop/swav.yaml @@ -55,4 +55,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/barlow.yaml b/scripts/pretrain/cifar/barlow.yaml index 86c1aa684..728f14ba7 100644 --- a/scripts/pretrain/cifar/barlow.yaml +++ b/scripts/pretrain/cifar/barlow.yaml @@ -50,4 +50,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/byol.yaml b/scripts/pretrain/cifar/byol.yaml index d3c163be9..eec69496f 100644 --- a/scripts/pretrain/cifar/byol.yaml +++ b/scripts/pretrain/cifar/byol.yaml @@ -53,4 +53,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/deepclusterv2.yaml b/scripts/pretrain/cifar/deepclusterv2.yaml index c3159f234..f8847859c 100644 --- a/scripts/pretrain/cifar/deepclusterv2.yaml +++ b/scripts/pretrain/cifar/deepclusterv2.yaml @@ -53,4 +53,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/dino.yaml b/scripts/pretrain/cifar/dino.yaml index 843cbb6c7..008e3abfe 100644 --- a/scripts/pretrain/cifar/dino.yaml +++ b/scripts/pretrain/cifar/dino.yaml @@ -53,4 +53,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/mae.yaml b/scripts/pretrain/cifar/mae.yaml index 939c40fcd..0d8f8bad7 100644 --- a/scripts/pretrain/cifar/mae.yaml +++ b/scripts/pretrain/cifar/mae.yaml @@ -53,4 +53,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/mocov2plus.yaml b/scripts/pretrain/cifar/mocov2plus.yaml index 6e3b137bc..8c990b196 100644 --- a/scripts/pretrain/cifar/mocov2plus.yaml +++ b/scripts/pretrain/cifar/mocov2plus.yaml @@ -50,4 +50,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/mocov3.yaml b/scripts/pretrain/cifar/mocov3.yaml index a23b8c1e5..9eccbd2d5 100644 --- a/scripts/pretrain/cifar/mocov3.yaml +++ b/scripts/pretrain/cifar/mocov3.yaml @@ -54,4 +54,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/nnbyol.yaml b/scripts/pretrain/cifar/nnbyol.yaml 
index 331101099..5cec47bac 100644 --- a/scripts/pretrain/cifar/nnbyol.yaml +++ b/scripts/pretrain/cifar/nnbyol.yaml @@ -54,4 +54,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/nnclr.yaml b/scripts/pretrain/cifar/nnclr.yaml index d10f35278..2786f365e 100644 --- a/scripts/pretrain/cifar/nnclr.yaml +++ b/scripts/pretrain/cifar/nnclr.yaml @@ -52,4 +52,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/nnsiam.yaml b/scripts/pretrain/cifar/nnsiam.yaml index 9d1102c69..3d611e7ac 100644 --- a/scripts/pretrain/cifar/nnsiam.yaml +++ b/scripts/pretrain/cifar/nnsiam.yaml @@ -50,4 +50,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/ressl.yaml b/scripts/pretrain/cifar/ressl.yaml index 1bc44a70c..7272f622f 100644 --- a/scripts/pretrain/cifar/ressl.yaml +++ b/scripts/pretrain/cifar/ressl.yaml @@ -53,4 +53,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/simclr.yaml b/scripts/pretrain/cifar/simclr.yaml index 6902362d4..0531365a7 100644 --- a/scripts/pretrain/cifar/simclr.yaml +++ b/scripts/pretrain/cifar/simclr.yaml @@ -50,4 +50,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/simsiam.yaml b/scripts/pretrain/cifar/simsiam.yaml index bc9874a4f..dec94d430 100644 --- a/scripts/pretrain/cifar/simsiam.yaml +++ b/scripts/pretrain/cifar/simsiam.yaml @@ -47,4 +47,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/supcon.yaml b/scripts/pretrain/cifar/supcon.yaml index 392069de7..365317b85 100644 --- a/scripts/pretrain/cifar/supcon.yaml +++ b/scripts/pretrain/cifar/supcon.yaml @@ -46,4 +46,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/swav.yaml b/scripts/pretrain/cifar/swav.yaml index 14f71dd14..01d6c431b 100644 --- a/scripts/pretrain/cifar/swav.yaml +++ b/scripts/pretrain/cifar/swav.yaml @@ -54,4 +54,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/vibcreg.yaml b/scripts/pretrain/cifar/vibcreg.yaml index a8deb2cc5..ebc2404f9 100644 --- a/scripts/pretrain/cifar/vibcreg.yaml +++ b/scripts/pretrain/cifar/vibcreg.yaml @@ -74,4 +74,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/vicreg.yaml b/scripts/pretrain/cifar/vicreg.yaml index 4f04c7097..0a8db3111 100644 --- a/scripts/pretrain/cifar/vicreg.yaml +++ b/scripts/pretrain/cifar/vicreg.yaml @@ -80,4 +80,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/cifar/wmse.yaml b/scripts/pretrain/cifar/wmse.yaml index 76d6f6529..7b77e45ec 100644 --- a/scripts/pretrain/cifar/wmse.yaml +++ b/scripts/pretrain/cifar/wmse.yaml @@ -70,4 +70,4 @@ devices: [0] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/custom/byol.yaml b/scripts/pretrain/custom/byol.yaml index 
943ee52b9..517dcb4a7 100644 --- a/scripts/pretrain/custom/byol.yaml +++ b/scripts/pretrain/custom/byol.yaml @@ -60,4 +60,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100-multicrop/byol.yaml b/scripts/pretrain/imagenet-100-multicrop/byol.yaml index 345647d39..f17ede003 100644 --- a/scripts/pretrain/imagenet-100-multicrop/byol.yaml +++ b/scripts/pretrain/imagenet-100-multicrop/byol.yaml @@ -53,4 +53,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100-multicrop/simclr.yaml b/scripts/pretrain/imagenet-100-multicrop/simclr.yaml index 082bb6666..cbd804bdf 100644 --- a/scripts/pretrain/imagenet-100-multicrop/simclr.yaml +++ b/scripts/pretrain/imagenet-100-multicrop/simclr.yaml @@ -50,4 +50,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100-multicrop/supcon.yaml b/scripts/pretrain/imagenet-100-multicrop/supcon.yaml index e6b44e8cb..5de0a77c4 100644 --- a/scripts/pretrain/imagenet-100-multicrop/supcon.yaml +++ b/scripts/pretrain/imagenet-100-multicrop/supcon.yaml @@ -46,4 +46,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/barlow.yaml b/scripts/pretrain/imagenet-100/barlow.yaml index 6b56eb81c..ddd2da670 100644 --- a/scripts/pretrain/imagenet-100/barlow.yaml +++ b/scripts/pretrain/imagenet-100/barlow.yaml @@ -50,4 +50,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/byol.yaml b/scripts/pretrain/imagenet-100/byol.yaml index c4a0170bc..35cd7d560 100644 --- a/scripts/pretrain/imagenet-100/byol.yaml +++ b/scripts/pretrain/imagenet-100/byol.yaml @@ -53,4 +53,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/deepclusterv2.yaml b/scripts/pretrain/imagenet-100/deepclusterv2.yaml index 673bd8a96..f6c023f0d 100644 --- a/scripts/pretrain/imagenet-100/deepclusterv2.yaml +++ b/scripts/pretrain/imagenet-100/deepclusterv2.yaml @@ -55,4 +55,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/dino.yaml b/scripts/pretrain/imagenet-100/dino.yaml index b38fbd75a..1129e125c 100644 --- a/scripts/pretrain/imagenet-100/dino.yaml +++ b/scripts/pretrain/imagenet-100/dino.yaml @@ -54,4 +54,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/dino_vit.yaml b/scripts/pretrain/imagenet-100/dino_vit.yaml index 2d70d023b..89ff43b6e 100644 --- a/scripts/pretrain/imagenet-100/dino_vit.yaml +++ b/scripts/pretrain/imagenet-100/dino_vit.yaml @@ -51,4 +51,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/mae.yaml b/scripts/pretrain/imagenet-100/mae.yaml index bab22bcdb..7366cd647 100644 --- a/scripts/pretrain/imagenet-100/mae.yaml +++ b/scripts/pretrain/imagenet-100/mae.yaml @@ -51,4 +51,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff 
--git a/scripts/pretrain/imagenet-100/mocov2plus.yaml b/scripts/pretrain/imagenet-100/mocov2plus.yaml index a097a9ddb..afbe0b4cf 100644 --- a/scripts/pretrain/imagenet-100/mocov2plus.yaml +++ b/scripts/pretrain/imagenet-100/mocov2plus.yaml @@ -50,4 +50,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/mocov3.yaml b/scripts/pretrain/imagenet-100/mocov3.yaml index fc44e8808..df5d4def3 100644 --- a/scripts/pretrain/imagenet-100/mocov3.yaml +++ b/scripts/pretrain/imagenet-100/mocov3.yaml @@ -54,4 +54,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/mocov3_vit.yaml b/scripts/pretrain/imagenet-100/mocov3_vit.yaml index 1eda53764..af942c546 100644 --- a/scripts/pretrain/imagenet-100/mocov3_vit.yaml +++ b/scripts/pretrain/imagenet-100/mocov3_vit.yaml @@ -50,4 +50,4 @@ devices: [0, 1, 2, 3, 4, 5, 6, 7] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/nnclr.yaml b/scripts/pretrain/imagenet-100/nnclr.yaml index 08ec68063..422b7beca 100644 --- a/scripts/pretrain/imagenet-100/nnclr.yaml +++ b/scripts/pretrain/imagenet-100/nnclr.yaml @@ -52,4 +52,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/ressl.yaml b/scripts/pretrain/imagenet-100/ressl.yaml index 1e7402df0..70416d60e 100644 --- a/scripts/pretrain/imagenet-100/ressl.yaml +++ b/scripts/pretrain/imagenet-100/ressl.yaml @@ -53,4 +53,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/simclr.yaml b/scripts/pretrain/imagenet-100/simclr.yaml index 478851c14..8a07198f8 100644 --- a/scripts/pretrain/imagenet-100/simclr.yaml +++ b/scripts/pretrain/imagenet-100/simclr.yaml @@ -50,4 +50,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/simsiam.yaml b/scripts/pretrain/imagenet-100/simsiam.yaml index 06c27a740..dab8055b7 100644 --- a/scripts/pretrain/imagenet-100/simsiam.yaml +++ b/scripts/pretrain/imagenet-100/simsiam.yaml @@ -48,4 +48,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/supcon.yaml b/scripts/pretrain/imagenet-100/supcon.yaml index a101c56db..0b91b8815 100644 --- a/scripts/pretrain/imagenet-100/supcon.yaml +++ b/scripts/pretrain/imagenet-100/supcon.yaml @@ -46,4 +46,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/swav.yaml b/scripts/pretrain/imagenet-100/swav.yaml index 17aae7878..1833f54a2 100644 --- a/scripts/pretrain/imagenet-100/swav.yaml +++ b/scripts/pretrain/imagenet-100/swav.yaml @@ -54,4 +54,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/vibcreg.yaml b/scripts/pretrain/imagenet-100/vibcreg.yaml index 7f343983f..ba9c89100 100644 --- a/scripts/pretrain/imagenet-100/vibcreg.yaml +++ b/scripts/pretrain/imagenet-100/vibcreg.yaml @@ -53,4 +53,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" 
-precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/vicreg.yaml b/scripts/pretrain/imagenet-100/vicreg.yaml index 7263b5b4c..68e817fb2 100644 --- a/scripts/pretrain/imagenet-100/vicreg.yaml +++ b/scripts/pretrain/imagenet-100/vicreg.yaml @@ -81,4 +81,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet-100/wmse.yaml b/scripts/pretrain/imagenet-100/wmse.yaml index 80713b5eb..3e1747842 100644 --- a/scripts/pretrain/imagenet-100/wmse.yaml +++ b/scripts/pretrain/imagenet-100/wmse.yaml @@ -47,4 +47,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet/barlow.yaml b/scripts/pretrain/imagenet/barlow.yaml index 5ef3d958a..e799282bf 100644 --- a/scripts/pretrain/imagenet/barlow.yaml +++ b/scripts/pretrain/imagenet/barlow.yaml @@ -51,4 +51,4 @@ devices: [0, 1, 2, 3] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet/byol.yaml b/scripts/pretrain/imagenet/byol.yaml index 2636108a8..23a1069c4 100644 --- a/scripts/pretrain/imagenet/byol.yaml +++ b/scripts/pretrain/imagenet/byol.yaml @@ -53,5 +53,5 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed accumulate_grad_batches: 16 diff --git a/scripts/pretrain/imagenet/mae.yaml b/scripts/pretrain/imagenet/mae.yaml index b9e327c8d..96886368c 100644 --- a/scripts/pretrain/imagenet/mae.yaml +++ b/scripts/pretrain/imagenet/mae.yaml @@ -54,4 +54,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/scripts/pretrain/imagenet/mocov2plus.yaml b/scripts/pretrain/imagenet/mocov2plus.yaml index d86043de3..0fabde502 100644 --- a/scripts/pretrain/imagenet/mocov2plus.yaml +++ b/scripts/pretrain/imagenet/mocov2plus.yaml @@ -50,4 +50,4 @@ devices: [0, 1] sync_batchnorm: True accelerator: "gpu" strategy: "ddp" -precision: 16 +precision: 16-mixed diff --git a/solo/args/linear.py b/solo/args/linear.py index 78e8f8650..f89c60091 100644 --- a/solo/args/linear.py +++ b/solo/args/linear.py @@ -1,5 +1,4 @@ import os -from multiprocessing.managers import BaseManager import omegaconf from omegaconf import OmegaConf @@ -158,7 +157,7 @@ def parse_cfg(cfg: omegaconf.DictConfig): # even if the custom dataset doesn't have any labels cfg.data.num_classes = max( 1, - len([entry.name for entry in os.scandir(cfg.data.train_path) if entry.is_dir]), + sum(entry.is_dir() for entry in os.scandir(cfg.data.train_path)), ) if cfg.data.format == "dali": diff --git a/solo/args/pretrain.py b/solo/args/pretrain.py index 6c16d8028..36b5ff6db 100644 --- a/solo/args/pretrain.py +++ b/solo/args/pretrain.py @@ -124,7 +124,7 @@ def parse_cfg(cfg: omegaconf.DictConfig): # even if the custom dataset doesn't have any labels cfg.data.num_classes = max( 1, - len([entry.name for entry in os.scandir(cfg.data.train_path) if entry.is_dir]), + sum(entry.is_dir() for entry in os.scandir(cfg.data.train_path)), ) # find number of big/small crops diff --git a/solo/backbones/poolformer/poolformer.py b/solo/backbones/poolformer/poolformer.py index e558b75e6..b93f772c4 100644 --- a/solo/backbones/poolformer/poolformer.py +++ b/solo/backbones/poolformer/poolformer.py @@ -25,8 +25,7 @@ import torch import torch.nn as nn from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from 
timm.models.layers import DropPath, trunc_normal_ -from timm.models.layers.helpers import to_2tuple +from timm.models.layers import DropPath, trunc_normal_, to_2tuple from timm.models.registry import register_model @@ -197,10 +196,10 @@ def __init__( self.use_layer_scale = use_layer_scale if use_layer_scale: self.layer_scale_1 = nn.Parameter( - layer_scale_init_value * torch.ones((dim)), requires_grad=True + layer_scale_init_value * torch.ones(dim), requires_grad=True ) self.layer_scale_2 = nn.Parameter( - layer_scale_init_value * torch.ones((dim)), requires_grad=True + layer_scale_init_value * torch.ones(dim), requires_grad=True ) def forward(self, x): diff --git a/solo/backbones/wide_resnet/wide_resnet.py b/solo/backbones/wide_resnet/wide_resnet.py index 5c4214f4d..86839ad45 100644 --- a/solo/backbones/wide_resnet/wide_resnet.py +++ b/solo/backbones/wide_resnet/wide_resnet.py @@ -30,7 +30,7 @@ class WideResnetBasicBlock(nn.Module): def __init__( self, in_planes, out_planes, stride, drop_rate=0.0, activate_before_residual=False ): - super(WideResnetBasicBlock, self).__init__() + super().__init__() self.bn1 = nn.BatchNorm2d(in_planes, momentum=0.001, eps=0.001) self.relu1 = nn.LeakyReLU(negative_slope=0.1, inplace=False) self.conv1 = nn.Conv2d( @@ -73,7 +73,7 @@ def __init__( drop_rate=0.0, activate_before_residual=False, ): - super(WideResnetNetworkBlock, self).__init__() + super().__init__() self.layer = self._make_layer( block, in_planes, out_planes, nb_layers, stride, drop_rate, activate_before_residual ) @@ -100,7 +100,7 @@ def forward(self, x): class WideResNet(nn.Module): def __init__(self, first_stride=1, depth=28, widen_factor=2, drop_rate=0.0, **kwargs): - super(WideResNet, self).__init__() + super().__init__() channels = [16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor] self.num_features = channels[-1] assert (depth - 4) % 6 == 0 diff --git a/solo/data/h5_dataset.py b/solo/data/h5_dataset.py index 46eb0a371..8aaf3236d 100644 --- a/solo/data/h5_dataset.py +++ b/solo/data/h5_dataset.py @@ -66,7 +66,7 @@ def __init__( if dataset == "imagenet100": script_folder = Path(os.path.dirname(__file__)) classes_file = script_folder / "dataset_subset" / "imagenet100_classes.txt" - with open(classes_file, "r") as f: + with open(classes_file) as f: self.classes = f.readline().strip().split() self.classes = sorted(self.classes) self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)} @@ -107,7 +107,7 @@ def _load_h5_data_info(self): f.write(f"{class_name}/{img_name} {y}\n") else: # load data info file that was already generated by previous runs - with open(h5_data_info_file, "r") as f: + with open(h5_data_info_file) as f: for line in f: class_name_img, y = line.strip().split(" ") class_name, img_name = class_name_img.split("/") diff --git a/solo/data/pretrain_dataloader.py b/solo/data/pretrain_dataloader.py index faf1fc76d..96cb7d030 100644 --- a/solo/data/pretrain_dataloader.py +++ b/solo/data/pretrain_dataloader.py @@ -172,7 +172,7 @@ def __call__(self, x: Image) -> List[torch.Tensor]: return out def __repr__(self) -> str: - return "\n".join([str(transform) for transform in self.transforms]) + return "\n".join(str(transform) for transform in self.transforms) def build_transform_pipeline(dataset, cfg): diff --git a/solo/methods/base.py b/solo/methods/base.py index 362424fe3..93b66509f 100644 --- a/solo/methods/base.py +++ b/solo/methods/base.py @@ -26,7 +26,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from 
pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR from solo.backbones import ( convnext_base, convnext_large, @@ -52,6 +51,7 @@ ) from solo.utils.knn import WeightedKNNClassifier from solo.utils.lars import LARS +from solo.utils.lr_scheduler import LinearWarmupCosineAnnealingLR from solo.utils.metrics import accuracy_at_k, weighted_mean from solo.utils.misc import omegaconf_select, remove_bias_and_norm_from_weight_decay from solo.utils.momentum import MomentumUpdater, initialize_momentum_params @@ -142,8 +142,8 @@ def __init__(self, cfg: omegaconf.DictConfig): warmup_start_lr (float): initial learning rate for warmup scheduler. Defaults to 0.00003. warmup_epochs (float): number of warmup epochs. Defaults to 10. - lr_decay_steps (Sequence, optional): steps to decay the learning rate if scheduler is - step. Defaults to None. + lr_decay_steps (Sequence, optional): steps to decay the learning rate if + scheduler is step. Defaults to None. interval (str): interval to update the lr scheduler. Defaults to 'step'. knn_eval: enabled (bool): enables online knn evaluation while training. @@ -179,7 +179,8 @@ def __init__(self, cfg: omegaconf.DictConfig): self.cfg: omegaconf.DictConfig = cfg - ########## Backbone ########## + ############################## + # Backbone self.backbone_args: Dict[str, Any] = cfg.backbone.kwargs assert cfg.backbone.name in BaseMethod._BACKBONES self.base_model: Callable = self._BACKBONES[cfg.backbone.name] @@ -257,6 +258,9 @@ def __init__(self, cfg: omegaconf.DictConfig): # for performance self.no_channel_last = cfg.performance.disable_channel_last + # keep track of validation metrics + self.validation_step_outputs = [] + @staticmethod def add_and_assert_specific_cfg(cfg: omegaconf.DictConfig) -> omegaconf.DictConfig: """Adds method specific default values/checks for config. @@ -279,7 +283,7 @@ def add_and_assert_specific_cfg(cfg: omegaconf.DictConfig) -> omegaconf.DictConf cfg.optimizer.kwargs = omegaconf_select(cfg, "optimizer.kwargs", {}) # default for acc grad batches - cfg.accumulate_grad_batches = omegaconf_select(cfg, "accumulate_grad_batches", None) + cfg.accumulate_grad_batches = omegaconf_select(cfg, "accumulate_grad_batches", 1) # default parameters for the scheduler cfg.scheduler.lr_decay_steps = omegaconf_select(cfg, "scheduler.lr_decay_steps", None) @@ -397,14 +401,14 @@ def configure_optimizers(self) -> Tuple[List, List]: return [optimizer], [scheduler] - def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx): + def optimizer_zero_grad(self, epoch, batch_idx, optimizer): """ This improves performance marginally. It should be fine since we are not affected by any of the downsides descrited in https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html#torch.optim.Optimizer.zero_grad Implemented as in here - https://pytorch-lightning.readthedocs.io/en/1.5.10/guides/speed.html#set-grads-to-none + https://lightning.ai/docs/pytorch/latest/advanced/speed.html?highlight=set%20grads%20none """ try: optimizer.zero_grad(set_to_none=True) @@ -551,7 +555,11 @@ def base_validation_step(self, X: torch.Tensor, targets: torch.Tensor) -> Dict: return self._base_shared_step(X, targets) def validation_step( - self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: int = None + self, + batch: List[torch.Tensor], + batch_idx: int, + dataloader_idx: int = None, + update_validation_step_outputs: bool = True, ) -> Dict[str, Any]: """Validation step for pytorch lightning. 
It does all the shared operations, such as forwarding a batch of images, computing logits and computing metrics. @@ -559,6 +567,8 @@ def validation_step( Args: batch (List[torch.Tensor]):a batch of data in the format of [img_indexes, X, Y]. batch_idx (int): index of the batch. + update_validation_step_outputs (bool): whether or not to append the + metrics to validation_step_outputs Returns: Dict[str, Any]: dict with the batch_size (used for averaging), the classification loss @@ -579,20 +589,19 @@ def validation_step( "val_acc1": out["acc1"], "val_acc5": out["acc5"], } + if update_validation_step_outputs: + self.validation_step_outputs.append(metrics) return metrics - def validation_epoch_end(self, outs: List[Dict[str, Any]]): + def on_validation_epoch_end(self): """Averages the losses and accuracies of all the validation batches. This is needed because the last batch can be smaller than the others, slightly skewing the metrics. - - Args: - outs (List[Dict[str, Any]]): list of outputs of the validation step. """ - val_loss = weighted_mean(outs, "val_loss", "batch_size") - val_acc1 = weighted_mean(outs, "val_acc1", "batch_size") - val_acc5 = weighted_mean(outs, "val_acc5", "batch_size") + val_loss = weighted_mean(self.validation_step_outputs, "val_loss", "batch_size") + val_acc1 = weighted_mean(self.validation_step_outputs, "val_acc1", "batch_size") + val_acc5 = weighted_mean(self.validation_step_outputs, "val_acc5", "batch_size") log = {"val_loss": val_loss, "val_acc1": val_acc1, "val_acc5": val_acc5} @@ -618,7 +627,8 @@ def __init__( momentum: base_tau (float): base value of the weighting decrease coefficient in [0,1]. final_tau (float): final value of the weighting decrease coefficient in [0,1]. - classifier (bool): whether or not to train a classifier on top of the momentum backbone. + classifier (bool): whether or not to train a classifier on top of the + momentum backbone. """ super().__init__(cfg) @@ -824,56 +834,78 @@ def on_train_batch_end(self, outputs: Dict[str, Any], batch: Sequence[Any], batc self.last_step = self.trainer.global_step def validation_step( - self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: int = None + self, + batch: List[torch.Tensor], + batch_idx: int, + dataloader_idx: int = None, + update_validation_step_outputs: bool = True, ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Validation step for pytorch lightning. It performs all the shared operations for the momentum backbone and classifier, such as forwarding a batch of images in the momentum backbone and classifier and computing statistics. + Args: batch (List[torch.Tensor]): a batch of data in the format of [X, Y]. batch_idx (int): index of the batch. + update_validation_step_outputs (bool): whether or not to append the + metrics to validation_step_outputs + Returns: Tuple(Dict[str, Any], Dict[str, Any]): tuple of dicts containing the batch_size (used for averaging), the classification loss and accuracies for both the online and the momentum classifiers. 
""" - parent_metrics = super().validation_step(batch, batch_idx) + metrics = super().validation_step(batch, batch_idx, update_validation_step_outputs=False) X, targets = batch - batch_size = targets.size(0) out = self._shared_step_momentum(X, targets) - metrics = None if self.momentum_classifier is not None: - metrics = { - "batch_size": batch_size, - "momentum_val_loss": out["loss"], - "momentum_val_acc1": out["acc1"], - "momentum_val_acc5": out["acc5"], - } + metrics.update( + { + "momentum_val_loss": out["loss"], + "momentum_val_acc1": out["acc1"], + "momentum_val_acc5": out["acc5"], + } + ) - return parent_metrics, metrics + if update_validation_step_outputs: + self.validation_step_outputs.append(metrics) - def validation_epoch_end(self, outs: Tuple[List[Dict[str, Any]]]): + return metrics + + def on_validation_epoch_end(self): """Averages the losses and accuracies of the momentum backbone / classifier for all the validation batches. This is needed because the last batch can be smaller than the others, slightly skewing the metrics. - Args: - outs (Tuple[List[Dict[str, Any]]]):): list of outputs of the validation step for self - and the parent. """ - parent_outs = [out[0] for out in outs] - super().validation_epoch_end(parent_outs) + # base method metrics + val_loss = weighted_mean(self.validation_step_outputs, "val_loss", "batch_size") + val_acc1 = weighted_mean(self.validation_step_outputs, "val_acc1", "batch_size") + val_acc5 = weighted_mean(self.validation_step_outputs, "val_acc5", "batch_size") - if self.momentum_classifier is not None: - momentum_outs = [out[1] for out in outs] + log = {"val_loss": val_loss, "val_acc1": val_acc1, "val_acc5": val_acc5} + + if self.knn_eval and not self.trainer.sanity_checking: + val_knn_acc1, val_knn_acc5 = self.knn.compute() + log.update({"val_knn_acc1": val_knn_acc1, "val_knn_acc5": val_knn_acc5}) + + self.log_dict(log, sync_dist=True) - val_loss = weighted_mean(momentum_outs, "momentum_val_loss", "batch_size") - val_acc1 = weighted_mean(momentum_outs, "momentum_val_acc1", "batch_size") - val_acc5 = weighted_mean(momentum_outs, "momentum_val_acc5", "batch_size") + # momentum method metrics + if self.momentum_classifier is not None: + val_loss = weighted_mean( + self.validation_step_outputs, "momentum_val_loss", "batch_size" + ) + val_acc1 = weighted_mean( + self.validation_step_outputs, "momentum_val_acc1", "batch_size" + ) + val_acc5 = weighted_mean( + self.validation_step_outputs, "momentum_val_acc5", "batch_size" + ) log = { "momentum_val_loss": val_loss, diff --git a/solo/methods/linear.py b/solo/methods/linear.py index f67cf31bf..d84cc6e4b 100644 --- a/solo/methods/linear.py +++ b/solo/methods/linear.py @@ -19,13 +19,14 @@ import logging from typing import Any, Callable, Dict, List, Tuple, Union + import omegaconf import pytorch_lightning as pl import torch import torch.nn as nn import torch.nn.functional as F -from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR from solo.utils.lars import LARS +from solo.utils.lr_scheduler import LinearWarmupCosineAnnealingLR from solo.utils.metrics import accuracy_at_k, weighted_mean from solo.utils.misc import ( omegaconf_select, @@ -79,8 +80,8 @@ def __init__( warmup_start_lr (float): initial learning rate for warmup scheduler. Defaults to 0.00003. warmup_epochs (float): number of warmup epochs. Defaults to 10. - lr_decay_steps (Sequence, optional): steps to decay the learning rate if scheduler is - step. Defaults to None. 
+ lr_decay_steps (Sequence, optional): steps to decay the learning rate + if scheduler is step. Defaults to None. interval (str): interval to update the lr scheduler. Defaults to 'step'. finetune (bool): whether or not to finetune the backbone. Defaults to False. @@ -90,9 +91,9 @@ def __init__( speeds up training considerably. Defaults to False. https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html#converting-existing-models - loss_func (Callable): loss function to use (for mixup, label smoothing or default). Defaults to None - mixup_func (Callable, optional). function to convert data and targets with mixup/cutmix. - Defaults to None. + loss_func (Callable): loss function to use (for mixup, label smoothing or default). + Defaults to None mixup_func (Callable, optional). function to convert data and targets + with mixup/cutmix. Defaults to None. """ super().__init__() @@ -154,6 +155,9 @@ def __init__( for param in self.backbone.parameters(): param.requires_grad = False + # keep track of validation metrics + self.validation_step_outputs = [] + @staticmethod def add_and_assert_specific_cfg(cfg: omegaconf.DictConfig) -> omegaconf.DictConfig: """Adds method specific default values/checks for config. @@ -177,7 +181,7 @@ def add_and_assert_specific_cfg(cfg: omegaconf.DictConfig) -> omegaconf.DictConf cfg.finetune = omegaconf_select(cfg, "finetune", False) # default for acc grad batches - cfg.accumulate_grad_batches = omegaconf_select(cfg, "accumulate_grad_batches", None) + cfg.accumulate_grad_batches = omegaconf_select(cfg, "accumulate_grad_batches", 1) # default parameters for the scheduler cfg.scheduler.lr_decay_steps = omegaconf_select(cfg, "scheduler.lr_decay_steps", None) @@ -203,7 +207,10 @@ def configure_optimizers(self) -> Tuple[List, List]: if self.layer_decay > 0: assert self.finetune, "Only with use layer weight decay with finetune on." - msg = "Method should implement no_weight_decay() that returns a set of parameter names to ignore from weight decay" + msg = ( + "Method should implement no_weight_decay() that returns " + "a set of parameter names to ignore from weight decay" + ) assert hasattr(self.backbone, "no_weight_decay"), msg learnable_params = param_groups_layer_decay( @@ -364,26 +371,25 @@ def validation_step(self, batch: torch.Tensor, batch_idx: int) -> Dict[str, Any] out = self.shared_step(batch, batch_idx) - results = { + metrics = { "batch_size": out["batch_size"], "val_loss": out["loss"], "val_acc1": out["acc1"], "val_acc5": out["acc5"], } - return results + self.validation_step_outputs.append(metrics) + return metrics - def validation_epoch_end(self, outs: List[Dict[str, Any]]): + def on_validation_epoch_end(self): """Averages the losses and accuracies of all the validation batches. This is needed because the last batch can be smaller than the others, slightly skewing the metrics. - - Args: - outs (List[Dict[str, Any]]): list of outputs of the validation step. 
""" - val_loss = weighted_mean(outs, "val_loss", "batch_size") - val_acc1 = weighted_mean(outs, "val_acc1", "batch_size") - val_acc5 = weighted_mean(outs, "val_acc5", "batch_size") + val_loss = weighted_mean(self.validation_step_outputs, "val_loss", "batch_size") + val_acc1 = weighted_mean(self.validation_step_outputs, "val_acc1", "batch_size") + val_acc5 = weighted_mean(self.validation_step_outputs, "val_acc5", "batch_size") + self.validation_step_outputs.clear() log = {"val_loss": val_loss, "val_acc1": val_acc1, "val_acc5": val_acc5} self.log_dict(log, sync_dist=True) diff --git a/solo/utils/auto_umap.py b/solo/utils/auto_umap.py index c926632ec..7c3971870 100644 --- a/solo/utils/auto_umap.py +++ b/solo/utils/auto_umap.py @@ -90,8 +90,8 @@ def add_and_assert_specific_cfg(cfg: DictConfig) -> DictConfig: @staticmethod def random_string(letter_count=4, digit_count=4): tmp_random = random.Random(time.time()) - rand_str = "".join((tmp_random.choice(string.ascii_lowercase) for x in range(letter_count))) - rand_str += "".join((tmp_random.choice(string.digits) for x in range(digit_count))) + rand_str = "".join(tmp_random.choice(string.ascii_lowercase) for _ in range(letter_count)) + rand_str += "".join(tmp_random.choice(string.digits) for _ in range(digit_count)) rand_str = list(rand_str) tmp_random.shuffle(rand_str) return "".join(rand_str) @@ -150,7 +150,10 @@ def plot(self, trainer: pl.Trainer, module: pl.LightningModule): # set module to eval model and collect all feature representations module.eval() with torch.no_grad(): - for x, y in trainer.val_dataloaders[0]: + val_dataloader = trainer.val_dataloaders + if isinstance(val_dataloader, list): + val_dataloader = val_dataloader[0] + for x, y in val_dataloader: x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) diff --git a/solo/utils/checkpointer.py b/solo/utils/checkpointer.py index 311f32393..14ded7bd0 100644 --- a/solo/utils/checkpointer.py +++ b/solo/utils/checkpointer.py @@ -79,8 +79,8 @@ def add_and_assert_specific_cfg(cfg: DictConfig) -> DictConfig: @staticmethod def random_string(letter_count=4, digit_count=4): tmp_random = random.Random(time.time()) - rand_str = "".join((tmp_random.choice(string.ascii_lowercase) for _ in range(letter_count))) - rand_str += "".join((tmp_random.choice(string.digits) for _ in range(digit_count))) + rand_str = "".join(tmp_random.choice(string.ascii_lowercase) for _ in range(letter_count)) + rand_str += "".join(tmp_random.choice(string.digits) for _ in range(digit_count)) rand_str = list(rand_str) tmp_random.shuffle(rand_str) return "".join(rand_str) diff --git a/solo/utils/lr_scheduler.py b/solo/utils/lr_scheduler.py new file mode 100644 index 000000000..38c2c4489 --- /dev/null +++ b/solo/utils/lr_scheduler.py @@ -0,0 +1,149 @@ +# Copied from Pytorch Lightning Bolts +# https://github.com/Lightning-Universe/lightning-bolts/blob/master/src/pl_bolts/optimizers/lr_scheduler.py +# To avoid a dependency + + +import math +import warnings +from typing import List + +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler + + +class LinearWarmupCosineAnnealingLR(_LRScheduler): + """Sets the learning rate of each parameter group to follow a linear warmup schedule + between warmup_start_lr and base_lr followed by a cosine annealing schedule + between base_lr and eta_min. + + .. 
warning:: + It is recommended to call :func:`.step()` for :class:`LinearWarmupCosineAnnealingLR` + after each iteration as calling it after each epoch will keep the starting lr at + warmup_start_lr for the first epoch which is 0 in most cases. + + .. warning:: + passing epoch to :func:`.step()` is being deprecated and comes with an + EPOCH_DEPRECATION_WARNING. It calls the :func:`_get_closed_form_lr()` + method for this scheduler instead of :func:`get_lr()`. Though this does not + change the behavior of the scheduler, when passing epoch param to :func:`.step()`, + the user should call the :func:`.step()` function before calling + train and validation methods. + + Example: + >>> import torch.nn as nn + >>> from torch.optim import Adam + >>> # + >>> layer = nn.Linear(10, 1) + >>> optimizer = Adam(layer.parameters(), lr=0.02) + >>> scheduler = LinearWarmupCosineAnnealingLR(optimizer, warmup_epochs=10, max_epochs=40) + >>> # the default case + >>> for epoch in range(40): + ... # train(...) + ... # validate(...) + ... scheduler.step() + >>> # passing epoch param case + >>> for epoch in range(40): + ... scheduler.step(epoch) + ... # train(...) + ... # validate(...) + """ + + def __init__( + self, + optimizer: Optimizer, + warmup_epochs: int, + max_epochs: int, + warmup_start_lr: float = 0.0, + eta_min: float = 0.0, + last_epoch: int = -1, + ) -> None: + """ + Args: + optimizer (Optimizer): Wrapped optimizer. + warmup_epochs (int): Maximum number of iterations for linear warmup + max_epochs (int): Maximum number of iterations + warmup_start_lr (float): Learning rate to start the linear warmup. Default: 0. + eta_min (float): Minimum learning rate. Default: 0. + last_epoch (int): The index of last epoch. Default: -1. + """ + self.warmup_epochs = warmup_epochs + self.max_epochs = max_epochs + self.warmup_start_lr = warmup_start_lr + self.eta_min = eta_min + + super().__init__(optimizer, last_epoch) + + def get_lr(self) -> List[float]: + """Compute learning rate using chainable form of the scheduler.""" + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", + UserWarning, + ) + + if self.last_epoch == 0: + return [self.warmup_start_lr] * len(self.base_lrs) + if self.last_epoch < self.warmup_epochs: + return [ + group["lr"] + (base_lr - self.warmup_start_lr) / (self.warmup_epochs - 1) + for base_lr, group in zip(self.base_lrs, self.optimizer.param_groups) + ] + if self.last_epoch == self.warmup_epochs: + return self.base_lrs + if (self.last_epoch - 1 - self.max_epochs) % ( + 2 * (self.max_epochs - self.warmup_epochs) + ) == 0: + return [ + group["lr"] + + (base_lr - self.eta_min) + * (1 - math.cos(math.pi / (self.max_epochs - self.warmup_epochs))) + / 2 + for base_lr, group in zip(self.base_lrs, self.optimizer.param_groups) + ] + + return [ + ( + 1 + + math.cos( + math.pi + * (self.last_epoch - self.warmup_epochs) + / (self.max_epochs - self.warmup_epochs) + ) + ) + / ( + 1 + + math.cos( + math.pi + * (self.last_epoch - self.warmup_epochs - 1) + / (self.max_epochs - self.warmup_epochs) + ) + ) + * (group["lr"] - self.eta_min) + + self.eta_min + for group in self.optimizer.param_groups + ] + + def _get_closed_form_lr(self) -> List[float]: + """Called when epoch is passed as a param to the `step` function of the scheduler.""" + if self.last_epoch < self.warmup_epochs: + return [ + self.warmup_start_lr + + self.last_epoch * (base_lr - self.warmup_start_lr) / (self.warmup_epochs - 1) + for base_lr in 
self.base_lrs + ] + + return [ + self.eta_min + + 0.5 + * (base_lr - self.eta_min) + * ( + 1 + + math.cos( + math.pi + * (self.last_epoch - self.warmup_epochs) + / (self.max_epochs - self.warmup_epochs) + ) + ) + for base_lr in self.base_lrs + ] diff --git a/solo/utils/misc.py b/solo/utils/misc.py index 6bb3fb8f1..317a1409f 100644 --- a/solo/utils/misc.py +++ b/solo/utils/misc.py @@ -295,7 +295,8 @@ def generate_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): """Adapted from https://github.com/facebookresearch/mae. grid_size: int of the grid height and width return: - pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + pos_embed: [grid_size*grid_size, embed_dim] or + [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) """ grid_h = np.arange(grid_size, dtype=np.float32) @@ -331,7 +332,7 @@ def generate_1d_sincos_pos_embed_from_grid(embed_dim, pos): """ assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float) + omega = np.arange(embed_dim // 2, dtype=float) omega /= embed_dim / 2.0 omega = 1.0 / 10000**omega # (D/2,) diff --git a/solo/utils/whitening.py b/solo/utils/whitening.py index 5daea1157..06524b4f8 100644 --- a/solo/utils/whitening.py +++ b/solo/utils/whitening.py @@ -36,7 +36,7 @@ def __init__(self, output_dim: int, eps: float = 0.0): to 0.0. """ - super(Whitening2d, self).__init__() + super().__init__() self.output_dim = output_dim self.eps = eps @@ -175,7 +175,7 @@ def __init__( momentum: float = 0.1, affine: bool = True, ): - super(IterNorm, self).__init__() + super().__init__() # assert dim == 4, 'IterNorm does not support 2D' self.T = T self.eps = eps
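
Note on the migration pattern applied throughout this patch: pytorch-lightning 2.0 removed the validation_epoch_end(outputs) hook, so BaseMethod, its momentum variant, and LinearModel now buffer per-batch metrics in a validation_step_outputs list and aggregate them in on_validation_epoch_end(). The sketch below shows that pattern in isolation, assuming Lightning 2.x is installed. TinyClassifier, its toy linear layer, and the metric names are illustrative only and are not part of solo-learn; the inline weighted average just mirrors what solo.utils.metrics.weighted_mean computes.

# Minimal sketch of the Lightning 2.x validation pattern adopted in this patch.
# The module buffers its own step outputs and aggregates them in
# on_validation_epoch_end(); validation_epoch_end(outputs) no longer exists.
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F


class TinyClassifier(pl.LightningModule):  # illustrative, not from the repo
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(32, 10)
        # buffer for per-batch validation metrics, cleared every epoch
        self.validation_step_outputs = []

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.linear(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(dim=-1) == y).float().mean()
        metrics = {"batch_size": y.size(0), "val_loss": loss, "val_acc": acc}
        self.validation_step_outputs.append(metrics)
        return metrics

    def on_validation_epoch_end(self):
        outs = self.validation_step_outputs
        # weight by batch size so a smaller last batch does not skew the average
        total = sum(o["batch_size"] for o in outs)
        val_loss = sum(o["val_loss"] * o["batch_size"] for o in outs) / total
        val_acc = sum(o["val_acc"] * o["batch_size"] for o in outs) / total
        self.log_dict({"val_loss": val_loss, "val_acc": val_acc}, sync_dist=True)
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


# The Trainer flags migrate the same way: precision=16 becomes "16-mixed",
# matching the YAML config changes above.
# trainer = pl.Trainer(accelerator="gpu", devices=1, precision="16-mixed")

Clearing the buffer at the end of the hook is what keeps one epoch's metrics from leaking into the next; the linear.py hunk above does this explicitly via self.validation_step_outputs.clear().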