User Tools

Site Tools


tech:slurm

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
Next revisionBoth sides next revision
tech:slurm [2019/09/06 14:59] – [Controller] kohofertech:slurm [2020/04/27 15:50] – [GCC] kohofer
Line 15: Line 15:
 ===== Installation ===== ===== Installation =====
  
-==== Controller ==== +===== Controller name: slurm-ctrl =====
- +
-Controller name: slurm-ctrl+
  
 Install slurm-wlm and tools Install slurm-wlm and tools
  
   ssh slurm-ctrl   ssh slurm-ctrl
-  apt install slurm-wlm slurm-wlm-doc mailutils sview mariadb-client mariadb-server libmariadb-dev python-dev python-mysqldb+  apt install slurm-wlm slurm-wlm-doc mailutils mariadb-client mariadb-server libmariadb-dev python-dev python-mysqldb
  
 === Install Maria DB Server === === Install Maria DB Server ===
Line 51: Line 49:
 === Central Controller === === Central Controller ===
  
-The main configuration file is /etc/slurm-llnl/slurm.conf this file has to be present in the controller and all of the compute nodes and it also has to be consistent between all of them.+The main configuration file is /etc/slurm-llnl/slurm.conf. This file has to be present on the controller and *ALL* of the compute nodes, and it has to be consistent between all of them.
  
-  vi /etc/slurm-llnl/slurm.conf +  vi /etc/slurm-llnl/slurm.conf
  
 <code> <code>
Line 114: Line 112:
 </code> </code>
  
-  root@controller# systemctl start slurmctld+Copy slurm.conf to compute nodes! 
 + 
 +  root@slurm-ctrl# scp /etc/slurm-llnl/slurm.conf csadmin@10.7.20.109:/tmp/.; scp /etc/slurm-llnl/slurm.conf csadmin@10.7.20.110:/tmp/
 + 
 +  vi /lib/systemd/system/slurmctld.service 
 +   
 +<code> 
 +[Unit] 
 +Description=Slurm controller daemon 
 +After=network.target munge.service 
 +ConditionPathExists=/etc/slurm-llnl/slurm.conf 
 +Documentation=man:slurmctld(8) 
 + 
 +[Service] 
 +Type=forking 
 +EnvironmentFile=-/etc/default/slurmctld 
 +ExecStart=/usr/sbin/slurmctld $SLURMCTLD_OPTIONS 
 +ExecStartPost=/bin/sleep 2 
 +ExecReload=/bin/kill -HUP $MAINPID 
 +PIDFile=/var/run/slurm-llnl/slurmctld.pid 
 + 
 +[Install] 
 +WantedBy=multi-user.target 
 + 
 +</code> 
 + 
 +  vi /lib/systemd/system/slurmd.service 
 + 
 +<code> 
 +[Unit] 
 +Description=Slurm node daemon 
 +After=network.target munge.service 
 +ConditionPathExists=/etc/slurm-llnl/slurm.conf 
 +Documentation=man:slurmd(8) 
 + 
 +[Service] 
 +Type=forking 
 +EnvironmentFile=-/etc/default/slurmd 
 +ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS 
 +ExecStartPost=/bin/sleep 2 
 +ExecReload=/bin/kill -HUP $MAINPID 
 +PIDFile=/var/run/slurm-llnl/slurmd.pid 
 +KillMode=process 
 +LimitNOFILE=51200 
 +LimitMEMLOCK=infinity 
 +LimitSTACK=infinity 
 + 
 +[Install] 
 +WantedBy=multi-user.target 
 +</code> 
 + 
 +   
 +  root@slurm-ctrl# systemctl daemon-reload 
 +  root@slurm-ctrl# systemctl enable slurmdbd 
 +  root@slurm-ctrl# systemctl start slurmdbd 
 +  root@slurm-ctrl# systemctl enable slurmctld 
 +  root@slurm-ctrl# systemctl start slurmctld 
  
 === Accounting Storage === === Accounting Storage ===
Line 153: Line 208:
 </code> </code>
  
-  root@controller# systemctl start slurmdbd+  root@slurm-ctrl# systemctl start slurmdbd 
 + 
 +=== Authentication === 
 + 
 +Copy /etc/munge/munge.key to all compute nodes 
 + 
 +  scp /etc/munge/munge.key csadmin@10.7.20.98:/tmp/
 +  
 +Allow password-less access to slurm-ctrl 
 +  
 +  csadmin@slurm-ctrl:~$ ssh-copy-id -i .ssh/id_rsa.pub 10.7.20.102: 
 +   
 +Run a job from slurm-ctrl 
 + 
 +  ssh csadmin@slurm-ctrl 
 +  srun -N 1 hostname 
 +  linux1 
 + 
  
 === Test munge === === Test munge ===
Line 168: Line 241:
   debug*       up   infinite      1   idle linux1   debug*       up   infinite      1   idle linux1
  
-==== Compute Nodes ====+If a compute node is down 
 + 
 +<code> 
 +sinfo -a 
 +PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST 
 +debug*       up   infinite      2   down gpu[02-03] 
 +</code> 
 + 
 +  scontrol update nodename=gpu02 state=idle 
 +  scontrol update nodename=gpu03 state=idle 
 +  scontrol update nodename=gpu02 state=resume 
 + 
 +<code> 
 +sinfo -a 
 +PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST 
 +debug*       up   infinite      2   idle gpu[02-03] 
 +</code> 
 + 
 + 
 +===== Compute Nodes =====
  
 A compute node is a machine which will receive jobs to execute, sent from the Controller, it runs the slurmd service. A compute node is a machine which will receive jobs to execute, sent from the Controller, it runs the slurmd service.
Line 174: Line 267:
 {{:tech:slurm-hpc-cluster_compute-node.png?400|}} {{:tech:slurm-hpc-cluster_compute-node.png?400|}}
  
-=== Authentication ===+=== Installation of slurm and munge ===
  
-  ssh root@slurm-ctrl +  ssh -l csadmin <compute-nodes> 10.7.20.109 10.7.20.110 
-  root@controller# for i in `seq 1 2`; do scp /etc/munge/munge.key linux-${i}:/etc/munge/munge.key; done+  sudo apt install slurm-wlm libmunge-dev libmunge2 munge
  
-  root@compute-1# systemctl start munge+  sudo vi /lib/systemd/system/slurmd.service
  
-Run a job from slurm-ctrl+<code> 
 +[Unit] 
 +Description=Slurm node daemon 
 +After=network.target munge.service 
 +ConditionPathExists=/etc/slurm-llnl/slurm.conf 
 +Documentation=man:slurmd(8)
  
-  ssh csadmin +[Service] 
-  srun -N 1 hostname +Type=forking 
-  linux1+EnvironmentFile=-/etc/default/slurmd 
 +ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS 
 +ExecStartPost=/bin/sleep 2 
 +ExecReload=/bin/kill -HUP $MAINPID 
 +PIDFile=/var/run/slurm-llnl/slurmd.pid 
 +KillMode=process 
 +LimitNOFILE=51200 
 +LimitMEMLOCK=infinity 
 +LimitSTACK=infinity
  
 +[Install]
 +WantedBy=multi-user.target
 +</code>
 +
 +  sudo systemctl enable slurmd
 +  sudo systemctl enable munge
 +  sudo systemctl start slurmd
 +  sudo systemctl start munge
 +
 +
 +Generate ssh keys
 +
 +  ssh-keygen
 +
 +Copy ssh-keys to slurm-ctrl 
 +
 +  ssh-copy-id -i ~/.ssh/id_rsa.pub csadmin@slurm-ctrl.inf.unibz.it:
 +
 +Become root to do important things:
 +
 +  sudo -i
 +  vi /etc/hosts
 +
 +Add the lines below to the /etc/hosts file
 +
 +<code>
 +10.7.20.97      slurm-ctrl.inf.unibz.it slurm-ctrl
 +10.7.20.98      linux1.inf.unibz.it     linux1
 +</code>
 +
 +After the munge key has been copied from slurm-ctrl to all compute nodes, fix its location,
 +owner and permissions.
 +
 +  mv /tmp/munge.key /etc/munge/.
 +  chown munge:munge /etc/munge/munge.key
 +  chmod 400 /etc/munge/munge.key
 +
 +Move slurm.conf into the right place (/etc/slurm-llnl/) and fix its owner:
 +
 +  mv /tmp/slurm.conf /etc/slurm-llnl/
 +  chown root: /etc/slurm-llnl/slurm.conf
 + 
 +  
 +
 +
 +===== Links =====
 +
 +[[https://slurm.schedmd.com/overview.html|Slurm Workload Manager Overview]]
 +
 +[[https://github.com/mknoxnv/ubuntu-slurm|Steps to create a small slurm cluster with GPU enabled nodes]]
 +
 +[[https://implement.pt/2018/09/slurm-in-ubuntu-clusters-pt1/|Slurm in Ubuntu Clusters Part1]]
 +
 +[[https://wiki.fysik.dtu.dk/niflheim/SLURM|Slurm batch queueing system]]
 +
 +[[https://doku.lrz.de/display/PUBLIC/SLURM+Workload+Manager|SLURM Workload Manager]]
 +
 +[[https://support.ceci-hpc.be/doc/_contents/QuickStart/SubmittingJobs/SlurmTutorial.html|Slurm Quick Start Tutorial]]
 +
 +{{ :tech:9-slurm.pdf |Linux Clusters Institute: Scheduling and Resource Management 2017}}
 +
 +
 +====== Modules ======
 +
 +===== GCC =====
 +
 +This takes a long time!
 +
 +Commands to run to compile gcc-6.1.0
 +
 +  wget https://ftp.gnu.org/gnu/gcc/gcc-6.1.0/gcc-6.1.0.tar.bz2
 +  tar xfj gcc-6.1.0.tar.bz2
 +  cd gcc-6.1.0
 +  ./contrib/download_prerequisites
 +  ./configure --prefix=/opt/package/gcc/6.1.0 --disable-multilib
 +  make
 +
 +After some time an error occurs, and the make process stops!
 +<code>
 +...
 +In file included from ../.././libgcc/unwind-dw2.c:401:0:
 +./md-unwind-support.h: In function ‘x86_64_fallback_frame_state’:
 +./md-unwind-support.h:65:47: error: dereferencing pointer to incomplete type ‘struct ucontext’
 +       sc = (struct sigcontext *) (void *) &uc_->uc_mcontext;
 +                                               ^~
 +../.././libgcc/shared-object.mk:14: recipe for target 'unwind-dw2.o' failed
 +</code>
 +
 +To fix do: [[https://stackoverflow.com/questions/46999900/how-to-compile-gcc-6-4-0-with-gcc-7-2-in-archlinux|solution]]
 +
 +  vi /opt/packages/gcc-6.1.0/x86_64-pc-linux-gnu/libgcc/md-unwind-support.h
 +
 +and replace/comment out line 61 with this:
 +
 +<code>
 +struct ucontext_t *uc_ = context->cfa;
 +</code>
 +
 +old line: /* struct ucontext *uc_ = context->cfa; */
 +
 +  make
 +
 +Next error:
 +
 +<code>
 +../../.././libsanitizer/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc:270:22: error: aggregate ‘sigaltstack handler_stack’ has incomplete type and cannot be defined
 +   struct sigaltstack handler_stack;
 +
 +</code>
 +
 +To fix see: [[https://github.com/llvm-mirror/compiler-rt/commit/8a5e425a68de4d2c80ff00a97bbcb3722a4716da?diff=unified|solution]]
 +or [[https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81066]]
 +
 +Amend the files according to the solution above!
 +
 +Next error:
 +
 +<code>
 +...
 +checking for unzip... unzip
 +configure: error: cannot find neither zip nor jar, cannot continue
 +Makefile:23048: recipe for target 'configure-target-libjava' failed
 +...
 +...
 +</code>
 +
 +  apt install unzip zip
 +
 +and run make again!
 +
 +  make
 +
 +Next error:
 +
 +<code>
 +...
 +In file included from ../.././libjava/prims.cc:26:0:
 +../.././libjava/prims.cc: In function ‘void _Jv_catch_fpe(int, siginfo_t*, void*)’:
 +./include/java-signal.h:32:26: error: invalid use of incomplete type ‘struct _Jv_catch_fpe(int, siginfo_t*, void*)::ucontext’
 +   gregset_t &_gregs = _uc->uc_mcontext.gregs;    \
 +...
 +</code>
 +
 +Edit the file: /opt/packages/gcc-6.1.0/x86_64-pc-linux-gnu/libjava/include/java-signal.h
 +
 +  vi /opt/packages/gcc-6.1.0/x86_64-pc-linux-gnu/libjava/include/java-signal.h
 +
 +<note warning>That is not enough; more errors follow!</note>
 +
 +<code>
 +// kh
 +  ucontext_t *_uc = (ucontext_t *);                             \
 +  //struct ucontext *_uc = (struct ucontext *)_p;                               \
 +  // kh
 +
 +</code>
 +
 +Next error:
 +
 +<code php>
 +...
 +In file included from ../.././libjava/prims.cc:26:0:          
 +./include/java-signal.h:32:3: warning: multi-line comment [-Wcomment]
 +   //struct ucontext *_uc = (struct ucontext *)_p;                                                  
 +                                                        
 +../.././libjava/prims.cc: In function ‘void _Jv_catch_fpe(int, siginfo_t*, void*)’:
 +./include/java-signal.h:31:15: warning: unused variable ‘_uc’ [-Wunused-variable]               
 +   ucontext_t *_uc = (ucontext_t *)_p;       
 +                       
 +../.././libjava/prims.cc:192:3: note: in expansion of macro ‘HANDLE_DIVIDE_OVERFLOW’            
 +   HANDLE_DIVIDE_OVERFLOW;       
 +   ^~~~~~~~~~~~~~~~~~~~~~
 +../.././libjava/prims.cc:203:1: error: expected ‘while’ before ‘jboolean’                    
 + jboolean                                       
 + ^~~~~~~~                                      
 +../.././libjava/prims.cc:203:1: error: expected ‘(’ before ‘jboolean’
 +../.././libjava/prims.cc:204:1: error: expected primary-expression before ‘_Jv_equalUtf8Consts’
 + _Jv_equalUtf8Consts (const Utf8Const* a, const Utf8Const *b)                   
 + ^~~~~~~~~~~~~~~~~~~                                    
 +../.././libjava/prims.cc:204:1: error: expected ‘)’ before ‘_Jv_equalUtf8Consts’
 +../.././libjava/prims.cc:204:1: error: expected ‘;’ before ‘_Jv_equalUtf8Consts’
 +../.././libjava/prims.cc:204:22: error: expected primary-expression before ‘const’
 + _Jv_equalUtf8Consts (const Utf8Const* a, const Utf8Const *b)
 +...
 +</code>
  
  
  
 +===== Links =====
  
 +http://www.walkingrandomly.com/?p=5680
  
-https://slurm.schedmd.com/overview.html+https://modules.readthedocs.io/en/latest/index.html
/data/www/wiki.inf.unibz.it/data/pages/tech/slurm.txt · Last modified: 2022/11/24 16:17 by kohofer