Commit a993c739 authored by Kurs Kurs
Browse files

Prepare the retraining for running on Aurora

parent e057a019
......@@ -141,10 +141,10 @@ def main() :
[img_folder_train, img_folder_eval, _] = torch.utils.data.dataset.random_split(img_folder, [n_img_train, n_img_eval, n_img_rest])
# load the training set in random order
data_loader_train = torch.utils.data.DataLoader(img_folder_train, batch_size=8,
data_loader_train = torch.utils.data.DataLoader(img_folder_train, batch_size=16, num_workers=20,
shuffle=True)
data_loader_eval = torch.utils.data.DataLoader(img_folder_eval, batch_size=8,
data_loader_eval = torch.utils.data.DataLoader(img_folder_eval, batch_size=16, num_workers=20,
shuffle=True)
dataset_sizes = {'train' : n_img_train, 'val' : n_img_eval}
......
#!/bin/bash
# SLURM batch script that runs src/retrain.py inside the project's virtualenv.
#
# Usage: sbatch <this script> [retrain.py arguments...]
#   Any arguments given after the script name are forwarded to retrain.py.
#   With no arguments the job runs `retrain.py 25 1`, as before.
#
# NOTE(review): `-n 20` requests 20 tasks; a single Python process that spawns
# worker processes would more commonly use `-n 1 -c 20`. Left unchanged --
# confirm against the cluster's accounting policy before switching.
#SBATCH -n 20
#SBATCH -N 1 # force all cores on one node
#SBATCH -t 04:00:00
#SBATCH -J retrain
#SBATCH -o retrain_%j.out
#SBATCH -e retrain_%j.err
#SBATCH -A computehpc

# Put the script in the output file so every job log records what was run.
cat "$0"

# Start from a clean module environment, then load the toolchain.
module purge
module load GCC/7.3.0-2.30 CUDA/9.2.88 OpenMPI/3.1.1
module load Python/3.6.6
module list

# Fall back to the historical arguments when none are supplied on submission.
if [ "$#" -eq 0 ]; then
    set -- 25 1
fi

# bind threads to cores
export OMP_SCHEDULE=static
export OMP_PLACES=cores
export OMP_PROC_BIND=CLOSE

# Abort early rather than run with the wrong interpreter or from the wrong
# directory if the environment is not set up as expected.
source venv/bin/activate || { echo "cannot activate venv/bin/activate" >&2; exit 1; }
cd src || { echo "cannot enter directory src" >&2; exit 1; }

python retrain.py "$@"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment