{{:tech:slurm-hpc-cluster.png?400|}}

===== Installation =====

==== Controller ====

Controller name: slurm-ctrl

Install slurm-wlm and its tools:

  ssh slurm-ctrl
  apt install slurm-wlm slurm-wlm-doc mailutils mariadb-client mariadb-server libmariadb-dev python-dev python-mysqldb

=== Install MariaDB Server ===
  apt-get install mariadb-server
  systemctl start mysql
  mysql -u root
  create database slurm_acct_db;
  create user 'slurm'@'localhost';
  set password for 'slurm'@'localhost' = password('slurmdbpass');
  grant usage on *.* to 'slurm'@'localhost';
  grant all privileges on slurm_acct_db.* to 'slurm'@'localhost';
  flush privileges;
  exit
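
To confirm that the account and its grants work before slurmdbd uses them, a quick check (assuming the slurmdbpass password set above) could be:

<code>
# log in as the slurm database user and list the databases it can see;
# slurm_acct_db should appear in the output
mysql -u slurm -p'slurmdbpass' -e "show databases;"
</code>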

In the file /etc/mysql/mariadb.conf.d/50-server.cnf we should have the following setting:

  vi /etc/mysql/mariadb.conf.d/50-server.cnf
  bind-address = localhost
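
If the bind-address had to be changed, restart the database service so it re-reads the file (on this Debian/Ubuntu setup it is addressed as mysql, as in the start command above):

<code>
# restart MariaDB so it picks up 50-server.cnf
systemctl restart mysql
</code>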

=== Node Authentication ===

First, let us configure the default options for the munge service:

  vi /etc/default/munge
  OPTIONS="--syslog --key-file /etc/munge/munge.key"
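
After changing the defaults, make sure munge is enabled and restarted so the new options are actually used; a minimal check:

<code>
# pick up the new OPTIONS, start munge at boot, and verify it is running
systemctl enable munge
systemctl restart munge
systemctl status munge
</code>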

=== Central Controller ===

The main configuration file is /etc/slurm-llnl/slurm.conf. This file has to be present on the controller and on *ALL* of the compute nodes, and it must be identical on all of them.

  vi /etc/slurm-llnl/slurm.conf

<code>
###############################
# /etc/slurm-llnl/slurm.conf
###############################
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=slurm-ctrl
#ControlAddr=10.7.20.97
#
#MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/pgid
ReturnToService=1
SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid
##SlurmctldPidFile=/var/run/slurmctld.pid
#SlurmctldPort=6817
SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid
##SlurmdPidFile=/var/run/slurmd.pid
#SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool
SwitchType=switch/none
TaskPlugin=task/none
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
FastSchedule=1
SchedulerType=sched/backfill
SelectType=select/linear
#SelectTypeParameters=
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
ClusterName=cluster
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
#SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm-llnl/SlurmctldLogFile
#SlurmdDebug=3
SlurmdLogFile=/var/log/slurm-llnl/SlurmLogFile
#
#
# COMPUTE NODES
NodeName=linux1 NodeAddr=10.7.20.98 CPUs=1 State=UNKNOWN
</code>
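
Each compute node gets its own NodeName line, and the debug partition that sinfo reports further down is normally declared with a PartitionName line. A sketch of how the COMPUTE NODES section could look with a second node and an explicit partition (the linux2 name and its address are only placeholders for this example):

<code>
# COMPUTE NODES
NodeName=linux1 NodeAddr=10.7.20.98 CPUs=1 State=UNKNOWN
NodeName=linux2 NodeAddr=10.7.20.102 CPUs=1 State=UNKNOWN
# optional explicit partition containing both nodes
PartitionName=debug Nodes=linux1,linux2 Default=YES MaxTime=INFINITE State=UP
</code>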

  root@slurm-ctrl# scp /etc/slurm-llnl/slurm.conf csadmin@10.7.20.98:/tmp/.; scp /etc/slurm-llnl/slurm.conf csadmin@10.7.20.102:/tmp/
  root@slurm-ctrl# systemctl start slurmctld
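
To have the controller daemon come back after a reboot and to verify it started cleanly, the following can be run as well:

<code>
# start slurmctld at boot and check that it is running
systemctl enable slurmctld
systemctl status slurmctld
</code>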

=== Accounting Storage ===

Once the slurm-llnl-slurmdbd package is installed, configure it by editing the /etc/slurm-llnl/slurmdbd.conf file:

  vi /etc/slurm-llnl/slurmdbd.conf

<code>
########################################################################
#
# /etc/slurm-llnl/slurmdbd.conf is an ASCII file which describes Slurm
# Database Daemon (SlurmDBD) configuration information.
# The contents of the file are case insensitive except for the names of
# nodes and files. Any text following a "#" in the configuration file is
# treated as a comment through the end of that line. The size of each
# line in the file is limited to 1024 characters. Changes to the
# configuration file take effect upon restart of SlurmDbd or daemon
# receipt of the SIGHUP signal unless otherwise noted.
#
# This file should be only on the computer where SlurmDBD executes and
# should only be readable by the user which executes SlurmDBD (e.g.
# "slurm"). This file should be protected from unauthorized access since
# it contains a database password.
#########################################################################
AuthType=auth/munge
AuthInfo=/var/run/munge/munge.socket.2
StorageHost=localhost
StoragePort=3306
StorageUser=slurm
StoragePass=slurmdbpass
StorageType=accounting_storage/mysql
StorageLoc=slurm_acct_db
LogFile=/var/log/slurm-llnl/slurmdbd.log
PidFile=/var/run/slurm-llnl/slurmdbd.pid
SlurmUser=slurm
</code>

  root@slurm-ctrl# systemctl start slurmdbd
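
Note that the slurm.conf above still has AccountingStorageType=accounting_storage/none; once it is switched to accounting_storage/slurmdbd, the cluster has to be registered in the accounting database. A minimal sketch:

<code>
# register this cluster (ClusterName=cluster in slurm.conf) with slurmdbd
sacctmgr add cluster cluster
# verify the registration
sacctmgr show cluster
</code>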

=== Authentication ===

Copy /etc/munge/munge.key to all compute nodes:

  scp /etc/munge/munge.key csadmin@10.7.20.98:/tmp/

Allow password-less access from slurm-ctrl to the compute nodes:

  csadmin@slurm-ctrl:~$ ssh-copy-id -i .ssh/id_rsa.pub 10.7.20.102

Run a job from slurm-ctrl:

  ssh csadmin@slurm-ctrl
  srun -N 1 hostname
  linux1

=== Test munge ===

  munge -n | unmunge | grep STATUS
  STATUS:           Success (0)
  munge -n | ssh slurm-ctrl unmunge | grep STATUS
  STATUS:           Success (0)

=== Test Slurm ===

  sinfo
  PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
  debug*       up   infinite      1   idle linux1
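
Beyond interactive srun calls, work is normally submitted as a batch script. A small illustrative example (the job name, output file and the test.sbatch filename are arbitrary):

<code>
#!/bin/bash
#SBATCH --job-name=test          # arbitrary job name
#SBATCH --nodes=1                # run on a single node
#SBATCH --output=test_%j.log     # %j expands to the job ID
hostname
sleep 10
</code>

Submit it from slurm-ctrl and check the queue:

  sbatch test.sbatch
  squeue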

==== Compute Nodes ====

A compute node is a machine that receives jobs to execute from the controller; it runs the slurmd service.

{{:tech:slurm-hpc-cluster_compute-node.png?400|}}

=== Install slurm and munge ===

  ssh -l csadmin <compute-nodes> 10.7.20.109 10.7.20.110
  sudo apt install slurm-wlm libmunge-dev libmunge2 munge
  sudo systemctl enable slurmd
  sudo systemctl enable munge
  sudo systemctl start slurmd
  sudo systemctl start munge
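
It helps to check that the NodeName line in slurm.conf matches the node's real hardware; slurmd can print the values it detects on the node itself:

<code>
# print this node's hostname, CPU and memory layout in slurm.conf syntax
sudo slurmd -C
</code>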

Generate ssh keys:

  ssh-keygen

Copy the ssh key to slurm-ctrl (using the IP address, because no DNS is in place):

  ssh-copy-id -i ~/.ssh/id_rsa.pub csadmin@10.7.20.97

Become root to do important things:

  sudo -i
  vi /etc/hosts

Add the lines below to the /etc/hosts file:

<code>
10.7.20.97      slurm-ctrl.inf.unibz.it slurm-ctrl
10.7.20.98      linux1.inf.unibz.it     linux1
</code>
 + 
 +First copy the munge keys from the slurm-ctrl to all compute nodes, now fix location, 
 +owner and permission. 
 + 
 +  mv /tmp/munge.key /etc/munge/
 +  chown munge:munge /etc/munge/munge.key 
 +  chmod 400 /etc/munge/munge.key 
 + 
 +Place /etc/slurm-llnl/slurm.conf in right place, 
 + 
 +  mv /tmp/slurm.conf /etc/slurm-llnl/ 
 +  chown root: /etc/slurm-llnl/slurm.conf 
 +  
 +   
 + 
 + 
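
With the key and slurm.conf in place, restart the services on the compute node and check from the controller that the node reports in; a short sketch:

<code>
# on the compute node: restart munge and slurmd with the new key/config
systemctl restart munge
systemctl restart slurmd

# on slurm-ctrl: the node should appear and eventually go idle
sinfo
scontrol show node linux1
</code>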

===== Links =====

[[https://slurm.schedmd.com/overview.html|Slurm Workload Manager Overview]]

[[https://github.com/mknoxnv/ubuntu-slurm|Steps to create a small slurm cluster with GPU enabled nodes]]

[[https://implement.pt/2018/09/slurm-in-ubuntu-clusters-pt1/|Slurm in Ubuntu Clusters Part1]]

[[https://wiki.fysik.dtu.dk/niflheim/SLURM|Slurm batch queueing system]]

[[https://doku.lrz.de/display/PUBLIC/SLURM+Workload+Manager|SLURM Workload Manager]]

[[https://support.ceci-hpc.be/doc/_contents/QuickStart/SubmittingJobs/SlurmTutorial.html|Slurm Quick Start Tutorial]]

{{ :tech:9-slurm.pdf |Linux Clusters Institute: Scheduling and Resource Management 2017}}