forked from MeteoSwiss/neural-lam
-
Notifications
You must be signed in to change notification settings - Fork 0
/
slurm_train.sh
37 lines (32 loc) · 1.13 KB
/
slurm_train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash -l
#
# Slurm batch script: optionally runs the preprocessing steps for the "cosmo"
# dataset and then launches model training.
#
# The -l (login shell) flag is kept on the shebang on purpose; presumably the
# login profile is what makes `conda activate` available - confirm on the
# target cluster.
#
# NOTE: every #SBATCH directive must stay above the first executable command,
# because Slurm stops reading directives at the first non-comment line.
# NOTE(review): the --output/--error paths point into lightning_logs/; that
# directory presumably has to exist before submission - confirm.
#SBATCH --job-name=NeurWP
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --partition=a100-80gb
#SBATCH --account=s83
#SBATCH --output=lightning_logs/neurwp_out.log
#SBATCH --error=lightning_logs/neurwp_err.log
#SBATCH --mem=490G
#SBATCH --no-requeue

# Print an error message to stderr and abort the job with a non-zero status.
# Arguments: $* - message text
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Switches controlling the optional stages below (compared against the literal
# string "true").
export PREPROCESS=true
export NORMALIZE=false

# Activate the conda environment that provides python and its dependencies.
# Abort if this fails so later steps do not run with the wrong interpreter.
conda activate neural-lam || die "could not activate conda environment 'neural-lam'"

if [[ "$PREPROCESS" == true ]]; then
  # Each preprocessing step runs as a single task on a single node (-N1 -n1)
  # with unbuffered (-u), task-labelled (-l) output. A failing step aborts the
  # job instead of letting training start on incomplete inputs.
  echo "Create static features"
  srun -ul -N1 -n1 python create_static_features.py --boundaries 60 \
    || die "create_static_features.py failed"

  echo "Creating mesh"
  srun -ul -N1 -n1 python create_mesh.py --dataset "cosmo" --plot 1 \
    || die "create_mesh.py failed"

  echo "Creating grid features"
  srun -ul -N1 -n1 python create_grid_features.py --dataset "cosmo" \
    || die "create_grid_features.py failed"

  if [[ "$NORMALIZE" == true ]]; then
    # This takes multiple hours!
    echo "Creating normalization weights"
    srun -ul -N1 -n1 python create_parameter_weights.py --dataset "cosmo" \
      --batch_size 32 --n_workers 8 --step_length 1 \
      || die "create_parameter_weights.py failed"
  fi
fi

# Disable core dumps.
ulimit -c 0
export OMP_NUM_THREADS=16

# Launch training through srun. No -N/-n is given here, so the task layout
# comes from the job allocation requested in the #SBATCH header. As the last
# command, its exit status becomes the exit status of the batch script.
srun -ul python train_model.py --dataset "cosmo" --val_interval 5 \
  --epochs 10 --n_workers 6 --batch_size 8 --subset_ds 1