User Tools

Site Tools


tech:slurm

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision | Previous revision
Next revision
Previous revision
Next revision | Both sides next revision
tech:slurm [2020/02/07 12:55] – [Controller] kohofer | tech:slurm [2020/02/10 17:07] kohofer
Line 17: Line 17:
 ==== Controller ==== ==== Controller ====
  
-Controller name: slurm-ctrl+===== Controller name: slurm-ctrl =====
  
 Install slurm-wlm and tools Install slurm-wlm and tools
Line 113: Line 113:
 NodeName=linux1 NodeAddr=10.7.20.98 CPUs=1 State=UNKNOWN NodeName=linux1 NodeAddr=10.7.20.98 CPUs=1 State=UNKNOWN
 </code> </code>
 +
 +Copy slurm.conf to compute nodes!
  
   root@slurm-ctrl# scp /etc/slurm-llnl/slurm.conf csadmin@10.7.20.109:/tmp/.; scp /etc/slurm-llnl/slurm.conf csadmin@10.7.20.110:/tmp/.   root@slurm-ctrl# scp /etc/slurm-llnl/slurm.conf csadmin@10.7.20.109:/tmp/.; scp /etc/slurm-llnl/slurm.conf csadmin@10.7.20.110:/tmp/.
 +
 +  vi /lib/systemd/system/slurmctld.service
 +  
 +<code>
 +[Unit]
 +Description=Slurm controller daemon
 +After=network.target munge.service
 +ConditionPathExists=/etc/slurm-llnl/slurm.conf
 +Documentation=man:slurmctld(8)
 +
 +[Service]
 +Type=forking
 +EnvironmentFile=-/etc/default/slurmctld
 +ExecStart=/usr/sbin/slurmctld $SLURMCTLD_OPTIONS
 +ExecReload=/bin/kill -HUP $MAINPID
 +PIDFile=/var/run/slurm-llnl/slurmctld.pid
 +
 +[Install]
 +WantedBy=multi-user.target
 +
 +</code>
 +
 +  vi /lib/systemd/system/slurmd.service
 +
 +<code>
 +[Unit]
 +Description=Slurm node daemon
 +After=network.target munge.service
 +ConditionPathExists=/etc/slurm-llnl/slurm.conf
 +Documentation=man:slurmd(8)
 +
 +[Service]
 +Type=forking
 +EnvironmentFile=-/etc/default/slurmd
 +ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS
 +ExecReload=/bin/kill -HUP $MAINPID
 +PIDFile=/var/run/slurm-llnl/slurmd.pid
 +KillMode=process
 +LimitNOFILE=51200
 +LimitMEMLOCK=infinity
 +LimitSTACK=infinity
 +
 +[Install]
 +WantedBy=multi-user.target
 +</code>
 +
 +  
   root@slurm-ctrl# systemctl daemon-reload   root@slurm-ctrl# systemctl daemon-reload
   root@slurm-ctrl# systemctl enable slurmdbd   root@slurm-ctrl# systemctl enable slurmdbd
Line 191: Line 240:
   PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST   PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
   debug*       up   infinite      1   idle linux1   debug*       up   infinite      1   idle linux1
 +
 +If a compute node is down
 +
 +<code>
 +sinfo -a
 +PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
 +debug*       up   infinite      2   down gpu[02-03]
 +</code>
 +
 +  scontrol update nodename=gpu02 state=idle
 +  scontrol update nodename=gpu03 state=idle
 +
 +<code>
 +sinfo -a
 +PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
 +debug*       up   infinite      2   idle gpu[02-03]
 +</code>
 +
 +
  
 ==== Compute Nodes ==== ==== Compute Nodes ====
/data/www/wiki.inf.unibz.it/data/pages/tech/slurm.txt · Last modified: 2022/11/24 16:17 by kohofer