# Training on Jean Zay
See the wiki for more details.
## Run a training job
**Warning:** there is no HTTP connection during a job.

You can debug using an interactive job. The following command will open a new terminal with 1 GPU for 1 hour:

```shell
srun --ntasks=1 --cpus-per-task=40 --gres=gpu:1 --time=01:00:00 --qos=qos_gpu-dev --pty bash -i
```
You should run the actual training using a passive/batch job:

- Run `sbatch train_dan.sh`.
- The `train_dan.sh` file should look like the example below.
```shell
#!/bin/bash
#SBATCH --constraint=v100-32g
#SBATCH --qos=qos_gpu-t4                # partition
#SBATCH --job-name=dan_training         # name of the job
#SBATCH --gres=gpu:1                    # number of GPUs per node
#SBATCH --cpus-per-task=10              # number of cores per task
#SBATCH --hint=nomultithread            # we get physical cores, not logical ones
#SBATCH --distribution=block:block      # we pin the tasks on contiguous cores
#SBATCH --nodes=1                       # number of nodes
#SBATCH --ntasks-per-node=1             # number of MPI tasks per node
#SBATCH --time=99:00:00                 # max execution time
#SBATCH --output=dan_train_hugin_munin_page_%j.out  # output log file
#SBATCH --error=dan_train_hugin_munin_page_%j.err   # error log file

module purge                            # purge modules inherited by default
module load anaconda-py3
conda activate /gpfswork/rech/rxm/ubz97wr/.conda/envs/dan/

# print executed commands
set -x

# execution
teklia-dan train document
```
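Once submitted, `sbatch` prints the job ID, and Slurm substitutes it for the `%j` placeholder in the `--output` and `--error` directives. A minimal sketch of recovering the log file name from the submission message (the job ID `1762916` below is a made-up example, not a real submission):

```shell
# sbatch prints a line such as "Submitted batch job <id>"; capture the ID
# so the log files named with %j in the script can be located.
# The message below is a hypothetical example, not live sbatch output.
SUBMIT_MSG="Submitted batch job 1762916"
JOBID="${SUBMIT_MSG##* }"               # keep the last whitespace-separated word
LOGFILE="dan_train_hugin_munin_page_${JOBID}.out"
echo "$LOGFILE"
```

You can then follow training progress with `tail -f "$LOGFILE"`.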
## Supervise a job
- Use `squeue -u $USER`. This command should give an output similar to the one presented below.
```shell
(base) [ubz97wr@jean-zay1: ubz97wr]$ squeue -u $USER
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
1762916 gpu_p13 pylaia_t ubz97wr R 23:07:54 1 r7i6n1
1762954 gpu_p13 pylaia_t ubz97wr R 22:35:57 1 r7i3n1
```
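The fifth column (`ST`) holds the job state (`R` for running). As a sketch, the output can be filtered with standard tools to list only running job IDs; the text captured below reuses the example output above rather than a live query (in practice you would pipe `squeue -u $USER -h` into `awk`, where `-h` suppresses the header):

```shell
# Example squeue output copied from the listing above (no live query).
SQUEUE_OUT='1762916 gpu_p13 pylaia_t ubz97wr R 23:07:54 1 r7i6n1
1762954 gpu_p13 pylaia_t ubz97wr R 22:35:57 1 r7i3n1'

# Print the JOBID (column 1) of every job whose state (column 5) is "R".
RUNNING_IDS="$(echo "$SQUEUE_OUT" | awk '$5 == "R" {print $1}')"
echo "$RUNNING_IDS"
```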
## Delete a job
- Use `scancel $JOBID` to cancel a specific job.
- Use `scancel -u $USER` to cancel all your jobs.