GROMACS version: 2018.1-intel-2018.02
GROMACS modification: No
Dear Gromacs community,
I keep running into an issue when submitting follow-up SLURM jobs to continue a simulation from consecutive checkpoint files. I’m using the noappend flag because I want to have separate trajectory files from every restart. The problem is that the md_run.cpt files doesn’t seem to be updated/ or considered correctly when restarting so I restart the simulation from the exact same file every time instead of the consecutively updated ones.
I use this command:
gmx_mpi mdrun -ntomp 2 -v -deffnm md_run -cpi md_run.cpt -noappend -cpo md_run.cpt
so every jobs is run by the following generated script (e.g for run I=3):
#!/bin/bash
#SBATCH -J "keep_running_test_single_parall_run21" # Job name
#SBATCH -o "keep_running_test_single_parall_run21".%j.out # Specify stdout output file (%j expands to jobId)
#SBATCH -p parallel # Partition/Queue name #??
#SBATCH -C skylake # select either 'broadwell' or 'skylake'
#SBATCH -N 8 # 8 Total number of nodes requested (32 cores/node) #??
#SBATCH -t 00:30:00 # Run time (hh:mm:ss) - 0.5 hours
#SBATCH -A m2_komet331hpc # Specify allocation to charge against
filename="AF-A0A0K3AS98-F1-model_v4" # input file (WITHOUT .pdb)
run_name="keep_running_test_single_parall_run21" # todo - name of the experiment == Job name #specific to setup not geometry
group='test_runs'
output_folder="/lustre/miifs01/project/m2_komet331hpc/lubaltz/gmx_output/$group/$filename/$run_name" # Output folder for all output files
n_cores=2 #number of CPU cores per task
n_tasks=256 #jobdivided into XX tasks, each potentially running on a separate CPU core
n_threads=2 #number of OpenMP threads within each MPI task
cd "$output_folder"
module load bio/GROMACS/2018.1-intel-2018.02
srun -n $n_tasks -c $n_cores gmx_mpi mdrun -ntomp $n_threads -v -deffnm md_run -cpi md_run.cpt -noappend -cpo md_run.cpt
cp md_run.cpt md_run_part3.cpt
gmx dump -cp md_run_part3.cpt >> cpt_part_3.txt
and this is my code for the follow-up jobs:
# Batch job script:
JOB_SCRIPT=./gromacs_keep_running_tst.sh
# Setup script:
SETUP_SCRIPT=./gromacs_run_tst.sh
I=0
sed -i 's/-deffnm md_run/-deffnm md_run -cpo md_run.cpt /g' "$SETUP_SCRIPT"
echo 'cp md_run.cpt md_run_part'$I'.cpt' >> "$SETUP_SCRIPT"
echo 'gmx dump -cp md_run_part'$I'.cpt >> cpt_part_'$I'.txt' >> "$SETUP_SCRIPT"
sed -n '/^#!/ p; /^#SBATCH/ p; /^module load/ p; /^n_cores/ p; /^n_threads/ p; /^n_tasks/ p; /^group/ p; /^filename/ p; /^run_name/ p; /^output_folder/ p; /^cd/ p; /-v -deffnm md_run/ p' "$SETUP_SCRIPT" > "$JOB_SCRIPT"
sed -i 's/-deffnm md_run/-deffnm md_run -cpi md_run.cpt -noappend /g' "$JOB_SCRIPT"
#sed -i 's/-deffnm md_run/-deffnm md_run -cpi md_run.cpt -noappend /g' "$JOB_SCRIPT"
echo "Submitting job chain of ${NR_OF_JOBS} jobs for batch script ${JOB_SCRIPT}:"
# Submit the setup script initially
SETUP_JOBID=$(sbatch ${SETUP_SCRIPT} 2>&1 | awk '{print $(NF)}')
echo " Setup Job ID: ${SETUP_JOBID}"
cp $SETUP_SCRIPT $JOB_SCRIPT'_'$I'.sh'
#I=1
I=$((${I}+1))
echo 'cp md_run.cpt md_run_part'$I'.cpt' >> "$JOB_SCRIPT"
echo 'gmx dump -cp md_run_part'$I'.cpt >> cpt_part_'$I'.txt' >> "$JOB_SCRIPT"
# Now submit the main batch job script with dependency on the setup script
JOBID=$(sbatch --dependency=afterany:${SETUP_JOBID} ${JOB_SCRIPT} 2>&1 | awk '{print $(NF)}')
echo " Main Job ID: ${JOBID}"
cp $JOB_SCRIPT $JOB_SCRIPT'_'$I'.sh'
#I=2
K=$((${I}+1))
sed -i 's/cp md_run.cpt md_run_part'$I'.cpt'/'cp md_run.cpt md_run_part'$K'.cpt /g' "$JOB_SCRIPT"
sed -i 's/gmx dump -cp md_run_part'$I'.cpt >> cpt_part_'$I'.txt'/'gmx dump -cp md_run_part'$K'.cpt >> cpt_part_'$K'.txt /g' "$JOB_SCRIPT"
# Now submit the main batch job script with dependency on the setup script
JOBID=$(sbatch --dependency=afterany:${JOBID} ${JOB_SCRIPT} 2>&1 | awk '{print $(NF)}')
echo " Main Job ID: ${JOBID}"
cp $JOB_SCRIPT $JOB_SCRIPT'_'$K'.sh'
I=$((${I}+1))
K=$((${I}+1))
sed -i 's/cp md_run.cpt md_run_part'$I'.cpt'/'cp md_run.cpt md_run_part'$K'.cpt /g' "$JOB_SCRIPT"
sed -i 's/gmx dump -cp md_run_part'$I'.cpt >> cpt_part_'$I'.txt'/'gmx dump -cp md_run_part'$K'.cpt >> cpt_part_'$K'.txt /g' "$JOB_SCRIPT"
# Now submit the main batch job script with dependency on the setup script
JOBID=$(sbatch --dependency=afterany:${JOBID} ${JOB_SCRIPT} 2>&1 | awk '{print $(NF)}')
echo " Main Job ID: ${JOBID}"
cp $JOB_SCRIPT $JOB_SCRIPT'_'$K'.sh'
I attached the corresponding .log files.
keep_running_test_single_parall_run21.14124102.log (148.2 KB)
keep_running_test_single_parall_run21.14124103.log (4.9 KB)
I also tried to introduce consecutive names depending on the part I myself using the -cpi and -cpo flag but this results in a similar issue - after the second restart the -cpo file is not taken into account by cpi.
What aim I overseeing?
Thank you very much in advance