tech:slurm
Differences
This shows you the differences between two versions of the page.
Next revision | Previous revisionNext revisionBoth sides next revision | ||
tech:slurm [2019/09/06 11:17] – created kohofer | tech:slurm [2020/05/27 11:11] – [Example] kohofer | ||
---|---|---|---|
Line 13: | Line 13: | ||
{{: | {{: | ||
+ | ===== Installation ===== | ||
+ | ===== Controller name: slurm-ctrl ===== | ||
+ | Install slurm-wlm and tools | ||
+ | ssh slurm-ctrl | ||
+ | apt install slurm-wlm slurm-wlm-doc mailutils mariadb-client mariadb-server libmariadb-dev python-dev python-mysqldb | ||
+ | === Install Maria DB Server === | ||
+ | apt-get install mariadb-server | ||
+ | systemctl start mysql | ||
+ | mysql -u root | ||
+ | create database slurm_acct_db; | ||
+ | create user ' | ||
+ | set password for ' | ||
+ | grant usage on *.* to ' | ||
+ | grant all privileges on slurm_acct_db.* to ' | ||
+ | flush privileges; | ||
+ | exit | ||
+ | |||
+ | In the file / | ||
+ | |||
+ | vi / | ||
+ | bind-address = localhost | ||
+ | |||
+ | === Node Authentication === | ||
+ | |||
+ | First, let us configure the default options for the munge service: | ||
+ | |||
+ | vi / | ||
+ | OPTIONS=" | ||
+ | |||
+ | === Central Controller === | ||
+ | |||
+ | The main configuration file is / | ||
+ | |||
+ | vi / | ||
+ | |||
+ | < | ||
+ | ############################### | ||
+ | # / | ||
+ | ############################### | ||
+ | # slurm.conf file generated by configurator easy.html. | ||
+ | # Put this file on all nodes of your cluster. | ||
+ | # See the slurm.conf man page for more information. | ||
+ | # | ||
+ | ControlMachine=slurm-ctrl | ||
+ | # | ||
+ | # | ||
+ | # | ||
+ | MpiDefault=none | ||
+ | # | ||
+ | ProctrackType=proctrack/ | ||
+ | ReturnToService=1 | ||
+ | SlurmctldPidFile=/ | ||
+ | ## | ||
+ | # | ||
+ | SlurmdPidFile=/ | ||
+ | ## | ||
+ | # | ||
+ | SlurmdSpoolDir=/ | ||
+ | SlurmUser=slurm | ||
+ | # | ||
+ | StateSaveLocation=/ | ||
+ | SwitchType=switch/ | ||
+ | TaskPlugin=task/ | ||
+ | # | ||
+ | # | ||
+ | # TIMERS | ||
+ | # | ||
+ | # | ||
+ | # | ||
+ | # | ||
+ | # | ||
+ | # | ||
+ | # SCHEDULING | ||
+ | FastSchedule=1 | ||
+ | SchedulerType=sched/ | ||
+ | SelectType=select/ | ||
+ | # | ||
+ | # | ||
+ | # | ||
+ | # LOGGING AND ACCOUNTING | ||
+ | AccountingStorageType=accounting_storage/ | ||
+ | ClusterName=cluster | ||
+ | # | ||
+ | JobAcctGatherType=jobacct_gather/ | ||
+ | # | ||
+ | SlurmctldLogFile=/ | ||
+ | # | ||
+ | SlurmdLogFile=/ | ||
+ | # | ||
+ | # | ||
+ | # COMPUTE NODES | ||
+ | NodeName=linux1 NodeAddr=10.7.20.98 CPUs=1 State=UNKNOWN | ||
+ | </ | ||
+ | |||
+ | Copy slurm.conf to compute nodes! | ||
+ | |||
+ | root@slurm-ctrl# | ||
+ | |||
+ | vi / | ||
+ | | ||
+ | < | ||
+ | [Unit] | ||
+ | Description=Slurm controller daemon | ||
+ | After=network.target munge.service | ||
+ | ConditionPathExists=/ | ||
+ | Documentation=man: | ||
+ | |||
+ | [Service] | ||
+ | Type=forking | ||
+ | EnvironmentFile=-/ | ||
+ | ExecStart=/ | ||
+ | ExecStartPost=/ | ||
+ | ExecReload=/ | ||
+ | PIDFile=/ | ||
+ | |||
+ | [Install] | ||
+ | WantedBy=multi-user.target | ||
+ | |||
+ | </ | ||
+ | |||
+ | vi / | ||
+ | |||
+ | < | ||
+ | [Unit] | ||
+ | Description=Slurm node daemon | ||
+ | After=network.target munge.service | ||
+ | ConditionPathExists=/ | ||
+ | Documentation=man: | ||
+ | |||
+ | [Service] | ||
+ | Type=forking | ||
+ | EnvironmentFile=-/ | ||
+ | ExecStart=/ | ||
+ | ExecStartPost=/ | ||
+ | ExecReload=/ | ||
+ | PIDFile=/ | ||
+ | KillMode=process | ||
+ | LimitNOFILE=51200 | ||
+ | LimitMEMLOCK=infinity | ||
+ | LimitSTACK=infinity | ||
+ | |||
+ | [Install] | ||
+ | WantedBy=multi-user.target | ||
+ | </ | ||
+ | |||
+ | | ||
+ | root@slurm-ctrl# | ||
+ | root@slurm-ctrl# | ||
+ | root@slurm-ctrl# | ||
+ | root@slurm-ctrl# | ||
+ | root@slurm-ctrl# | ||
+ | |||
+ | |||
+ | === Accounting Storage === | ||
+ | |||
+ | After installing the slurm-llnl-slurmdbd package, we configure it by editing the / | ||
+ | |||
+ | vi / | ||
+ | |||
+ | < | ||
+ | ######################################################################## | ||
+ | # | ||
+ | # / | ||
+ | # Database Daemon (SlurmDBD) configuration information. | ||
+ | # The contents of the file are case insensitive except for the names of | ||
+ | # nodes and files. Any text following a "#" | ||
+ | # treated as a comment through the end of that line. The size of each | ||
+ | # line in the file is limited to 1024 characters. Changes to the | ||
+ | # configuration file take effect upon restart of SlurmDbd or daemon | ||
+ | # receipt of the SIGHUP signal unless otherwise noted. | ||
+ | # | ||
+ | # This file should be only on the computer where SlurmDBD executes and | ||
+ | # should only be readable by the user which executes SlurmDBD (e.g. | ||
+ | # " | ||
+ | # it contains a database password. | ||
+ | ######################################################################### | ||
+ | AuthType=auth/ | ||
+ | AuthInfo=/ | ||
+ | StorageHost=localhost | ||
+ | StoragePort=3306 | ||
+ | StorageUser=slurm | ||
+ | StoragePass=slurmdbpass | ||
+ | StorageType=accounting_storage/ | ||
+ | StorageLoc=slurm_acct_db | ||
+ | LogFile=/ | ||
+ | PidFile=/ | ||
+ | SlurmUser=slurm | ||
+ | |||
+ | </ | ||
+ | |||
+ | root@slurm-ctrl# | ||
+ | |||
+ | === Authentication === | ||
+ | |||
+ | Copy / | ||
+ | |||
+ | scp / | ||
+ | |||
+ | Allow password-less access to slurm-ctrl | ||
+ | |||
+ | csadmin@slurm-ctrl: | ||
+ | | ||
+ | Run a job from slurm-ctrl | ||
+ | |||
+ | ssh csadmin@slurm-ctrl | ||
+ | srun -N 1 hostname | ||
+ | linux1 | ||
+ | |||
+ | |||
+ | |||
+ | === Test munge === | ||
+ | |||
+ | munge -n | unmunge | grep STATUS | ||
+ | STATUS: | ||
+ | munge -n | ssh slurm-ctrl unmunge | grep STATUS | ||
+ | STATUS: | ||
+ | |||
+ | === Test Slurm === | ||
+ | |||
+ | sinfo | ||
+ | PARTITION AVAIL TIMELIMIT | ||
+ | debug* | ||
+ | |||
+ | If computer node is **<color # | ||
+ | |||
+ | < | ||
+ | sinfo -a | ||
+ | PARTITION AVAIL TIMELIMIT | ||
+ | debug* | ||
+ | |||
+ | sinfo | ||
+ | PARTITION AVAIL TIMELIMIT | ||
+ | gpu* | ||
+ | gpu* | ||
+ | |||
+ | </ | ||
+ | |||
+ | scontrol update nodename=gpu02 state=idle | ||
+ | scontrol update nodename=gpu03 state=idle | ||
+ | scontrol update nodename=gpu02 state=resume | ||
+ | |||
+ | < | ||
+ | sinfo -a | ||
+ | PARTITION AVAIL TIMELIMIT | ||
+ | debug* | ||
+ | </ | ||
+ | |||
+ | |||
+ | ===== Compute Nodes ===== | ||
+ | |||
+ | |||
+ | A compute node is a machine that receives jobs to execute from the controller; it runs the slurmd service. | ||
+ | |||
+ | {{: | ||
+ | |||
+ | === Installation slurm and munge === | ||
+ | |||
+ | ssh -l csadmin < | ||
+ | sudo apt install slurm-wlm libmunge-dev libmunge2 munge | ||
+ | |||
+ | sudo vi / | ||
+ | |||
+ | < | ||
+ | [Unit] | ||
+ | Description=Slurm node daemon | ||
+ | After=network.target munge.service | ||
+ | ConditionPathExists=/ | ||
+ | Documentation=man: | ||
+ | |||
+ | [Service] | ||
+ | Type=forking | ||
+ | EnvironmentFile=-/ | ||
+ | ExecStart=/ | ||
+ | ExecStartPost=/ | ||
+ | ExecReload=/ | ||
+ | PIDFile=/ | ||
+ | KillMode=process | ||
+ | LimitNOFILE=51200 | ||
+ | LimitMEMLOCK=infinity | ||
+ | LimitSTACK=infinity | ||
+ | |||
+ | [Install] | ||
+ | WantedBy=multi-user.target | ||
+ | </ | ||
+ | |||
+ | sudo systemctl enable slurmd | ||
+ | sudo systemctl enable munge | ||
+ | sudo systemctl start slurmd | ||
+ | sudo systemctl start munge | ||
+ | |||
+ | |||
+ | Generate ssh keys | ||
+ | |||
+ | ssh-keygen | ||
+ | |||
+ | Copy ssh-keys to slurm-ctrl | ||
+ | |||
+ | ssh-copy-id -i ~/ | ||
+ | |||
+ | Become root to perform the following administrative steps: | ||
+ | |||
+ | sudo -i | ||
+ | vi /etc/hosts | ||
+ | |||
+ | Add those lines below to the /etc/hosts file | ||
+ | |||
+ | < | ||
+ | 10.7.20.97 | ||
+ | 10.7.20.98 | ||
+ | </ | ||
+ | |||
+ | First copy the munge key from slurm-ctrl to all compute nodes, then fix its location, | ||
+ | owner, and permissions. | ||
+ | |||
+ | mv / | ||
+ | chown munge:munge / | ||
+ | chmod 400 / | ||
+ | |||
+ | Place / | ||
+ | |||
+ | mv / | ||
+ | chown root: / | ||
+ | |||
+ | | ||
+ | |||
+ | |||
+ | ===== Links ===== | ||
+ | |||
+ | [[https:// | ||
+ | |||
+ | [[https:// | ||
+ | |||
+ | [[https:// | ||
+ | |||
+ | [[https:// | ||
+ | |||
+ | [[https:// | ||
+ | |||
+ | [[https:// | ||
+ | |||
+ | {{ : | ||
+ | |||
+ | |||
+ | ====== Modules ====== | ||
+ | |||
+ | ===== Python ===== | ||
+ | |||
+ | ==== Python 3.7.7 ==== | ||
+ | |||
+ | |||
+ | cd / | ||
+ | mkdir / | ||
+ | wget https:// | ||
+ | tar xfJ Python-3.7.7.tar.xz | ||
+ | cd Python-3.7.7/ | ||
+ | ./configure --prefix=/ | ||
+ | make | ||
+ | make install | ||
+ | | ||
+ | |||
+ | ==== Python 2.7.18 ==== | ||
+ | |||
+ | |||
+ | cd / | ||
+ | mkdir / | ||
+ | wget https:// | ||
+ | cd Python-2.7.18 | ||
+ | ./configure --prefix=/ | ||
+ | make | ||
+ | make install | ||
+ | |||
+ | ==== Create modules file ==== | ||
+ | |||
+ | |||
+ | cd / | ||
+ | vi python-2.7.18 | ||
+ | |||
+ | < | ||
+ | #%Module1.0 | ||
+ | proc ModulesHelp { } { | ||
+ | global dotversion | ||
+ | |||
+ | puts stderr " | ||
+ | } | ||
+ | |||
+ | module-whatis " | ||
+ | prepend-path PATH / | ||
+ | |||
+ | </ | ||
+ | | ||
+ | |||
+ | |||
+ | |||
+ | ===== GCC ===== | ||
+ | |||
+ | This takes a long time! | ||
+ | |||
+ | Commands to run to compile gcc-6.1.0 | ||
+ | |||
+ | wget https:// | ||
+ | tar xfj gcc-6.1.0.tar.bz2 | ||
+ | cd gcc-6.1.0 | ||
+ | ./ | ||
+ | ./configure --prefix=/ | ||
+ | make | ||
+ | |||
+ | After some time an error occurs, and the make process stops! | ||
+ | < | ||
+ | ... | ||
+ | In file included from ../ | ||
+ | ./ | ||
+ | ./ | ||
+ | sc = (struct sigcontext *) (void *) & | ||
+ | ^~ | ||
+ | ../ | ||
+ | </ | ||
+ | |||
+ | To fix do: [[https:// | ||
+ | |||
+ | vi / | ||
+ | |||
+ | and replace/ | ||
+ | |||
+ | < | ||
+ | struct ucontext_t *uc_ = context-> | ||
+ | </ | ||
+ | |||
+ | old line: /* struct ucontext *uc_ = context-> | ||
+ | |||
+ | make | ||
+ | |||
+ | Next error: | ||
+ | |||
+ | < | ||
+ | ../ | ||
+ | | ||
+ | |||
+ | </ | ||
+ | |||
+ | To fix see: [[https:// | ||
+ | or [[https:// | ||
+ | |||
+ | Amend the files according to the solution above. | ||
+ | |||
+ | Next error: | ||
+ | |||
+ | < | ||
+ | ... | ||
+ | checking for unzip... unzip | ||
+ | configure: error: cannot find neither zip nor jar, cannot continue | ||
+ | Makefile: | ||
+ | ... | ||
+ | ... | ||
+ | </ | ||
+ | |||
+ | apt install unzip zip | ||
+ | |||
+ | and run make again! | ||
+ | |||
+ | make | ||
+ | |||
+ | Next error: | ||
+ | |||
+ | < | ||
+ | ... | ||
+ | In file included from ../ | ||
+ | ../ | ||
+ | ./ | ||
+ | | ||
+ | ... | ||
+ | </ | ||
+ | |||
+ | Edit the file: / | ||
+ | |||
+ | vi / | ||
+ | |||
+ | <note warning> | ||
+ | |||
+ | < | ||
+ | // kh | ||
+ | ucontext_t *_uc = (ucontext_t *); \ | ||
+ | //struct ucontext *_uc = (struct ucontext *)_p; \ | ||
+ | // kh | ||
+ | |||
+ | </ | ||
+ | |||
+ | Next error: | ||
+ | |||
+ | <code php> | ||
+ | ... | ||
+ | In file included from ../ | ||
+ | ./ | ||
+ | // | ||
+ | | ||
+ | ../ | ||
+ | ./ | ||
+ | | ||
+ | | ||
+ | ../ | ||
+ | | ||
+ | | ||
+ | ../ | ||
+ | | ||
+ | | ||
+ | ../ | ||
+ | ../ | ||
+ | | ||
+ | | ||
+ | ../ | ||
+ | ../ | ||
+ | ../ | ||
+ | | ||
+ | ... | ||
+ | </ | ||
+ | |||
+ | ===== Examples ===== | ||
+ | |||
+ | ==== Example mnist ==== | ||
+ | |||
+ | A simple example that uses an NVIDIA GPU. | ||
+ | |||
+ | The example consists of the following files: | ||
+ | |||
+ | * README.md | ||
+ | * requirements.txt | ||
+ | * main.job | ||
+ | * main.py | ||
+ | |||
+ | Create a folder named mnist and place the four files in it. | ||
+ | |||
+ | mkdir mnist | ||
+ | |||
+ | cat README.md | ||
+ | |||
+ | < | ||
+ | # Basic MNIST Example | ||
+ | |||
+ | ```bash | ||
+ | pip install -r requirements.txt | ||
+ | python main.py | ||
+ | # CUDA_VISIBLE_DEVICES=2 python main.py | ||
+ | ``` | ||
+ | </ | ||
+ | |||
+ | |||
+ | cat requirements.txt | ||
+ | < | ||
+ | torch | ||
+ | torchvision | ||
+ | </ | ||
+ | |||
+ | |||
+ | cat main.job | ||
+ | < | ||
+ | #!/bin/bash | ||
+ | |||
+ | #SBATCH --job-name=mnist | ||
+ | #SBATCH --output=mnist.out | ||
+ | #SBATCH --error=mnist.err | ||
+ | |||
+ | #SBATCH --partition gpu | ||
+ | #SBATCH --gres=gpu | ||
+ | #SBATCH --mem-per-cpu=4gb | ||
+ | #SBATCH --nodes 2 | ||
+ | #SBATCH --time=00: | ||
+ | |||
+ | #SBATCH --ntasks=10 | ||
+ | |||
+ | #SBATCH --mail-type=ALL | ||
+ | #SBATCH --mail-user=< | ||
+ | |||
+ | ml load miniconda3 | ||
+ | python3 main.py | ||
+ | </ | ||
+ | |||
+ | {(xssnipper>, | ||
+ | |||
+ | from __future__ import print_function | ||
+ | import argparse | ||
+ | import torch | ||
+ | import torch.nn as nn | ||
+ | import torch.nn.functional as F | ||
+ | import torch.optim as optim | ||
+ | from torchvision import datasets, transforms | ||
+ | from torch.optim.lr_scheduler import StepLR | ||
+ | |||
+ | |||
+ | class Net(nn.Module): | ||
+ | def __init__(self): | ||
+ | super(Net, self).__init__() | ||
+ | self.conv1 = nn.Conv2d(1, | ||
+ | self.conv2 = nn.Conv2d(32, | ||
+ | self.dropout1 = nn.Dropout2d(0.25) | ||
+ | self.dropout2 = nn.Dropout2d(0.5) | ||
+ | self.fc1 = nn.Linear(9216, | ||
+ | self.fc2 = nn.Linear(128, | ||
+ | |||
+ | def forward(self, | ||
+ | x = self.conv1(x) | ||
+ | x = F.relu(x) | ||
+ | x = self.conv2(x) | ||
+ | x = F.max_pool2d(x, | ||
+ | x = self.dropout1(x) | ||
+ | x = torch.flatten(x, | ||
+ | x = self.fc1(x) | ||
+ | x = F.relu(x) | ||
+ | x = self.dropout2(x) | ||
+ | x = self.fc2(x) | ||
+ | output = F.log_softmax(x, | ||
+ | return output | ||
+ | |||
+ | |||
+ | def train(args, model, device, train_loader, | ||
+ | model.train() | ||
+ | for batch_idx, (data, target) in enumerate(train_loader): | ||
+ | data, target = data.to(device), | ||
+ | optimizer.zero_grad() | ||
+ | output = model(data) | ||
+ | loss = F.nll_loss(output, | ||
+ | loss.backward() | ||
+ | optimizer.step() | ||
+ | if batch_idx % args.log_interval == 0: | ||
+ | print(' | ||
+ | epoch, batch_idx * len(data), len(train_loader.dataset), | ||
+ | 100. * batch_idx / len(train_loader), | ||
+ | |||
+ | |||
+ | def test(args, model, device, test_loader): | ||
+ | model.eval() | ||
+ | test_loss = 0 | ||
+ | correct = 0 | ||
+ | with torch.no_grad(): | ||
+ | for data, target in test_loader: | ||
+ | data, target = data.to(device), | ||
+ | output = model(data) | ||
+ | test_loss += F.nll_loss(output, | ||
+ | pred = output.argmax(dim=1, | ||
+ | correct += pred.eq(target.view_as(pred)).sum().item() | ||
+ | |||
+ | test_loss /= len(test_loader.dataset) | ||
+ | |||
+ | print(' | ||
+ | test_loss, correct, len(test_loader.dataset), | ||
+ | 100. * correct / len(test_loader.dataset))) | ||
+ | |||
+ | |||
+ | def main(): | ||
+ | # Training settings | ||
+ | parser = argparse.ArgumentParser(description=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | |||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | args = parser.parse_args() | ||
+ | use_cuda = not args.no_cuda and torch.cuda.is_available() | ||
+ | |||
+ | torch.manual_seed(args.seed) | ||
+ | |||
+ | device = torch.device(" | ||
+ | |||
+ | kwargs = {' | ||
+ | train_loader = torch.utils.data.DataLoader( | ||
+ | datasets.MNIST(' | ||
+ | | ||
+ | | ||
+ | | ||
+ | ])), | ||
+ | batch_size=args.batch_size, | ||
+ | test_loader = torch.utils.data.DataLoader( | ||
+ | datasets.MNIST(' | ||
+ | | ||
+ | | ||
+ | ])), | ||
+ | batch_size=args.test_batch_size, | ||
+ | |||
+ | model = Net().to(device) | ||
+ | optimizer = optim.Adadelta(model.parameters(), | ||
+ | |||
+ | scheduler = StepLR(optimizer, | ||
+ | for epoch in range(1, args.epochs + 1): | ||
+ | train(args, model, device, train_loader, | ||
+ | test(args, model, device, test_loader) | ||
+ | scheduler.step() | ||
+ | |||
+ | if args.save_model: | ||
+ | torch.save(model.state_dict(), | ||
+ | |||
+ | |||
+ | if __name__ == ' | ||
+ | main() | ||
+ | |||
+ | )} | ||
+ | |||
+ | |||
+ | |||
+ | ===== Links ===== | ||
+ | |||
+ | https:// | ||
+ | |||
+ | https:// | ||
+ | |||
+ | https:// | ||
+ | |||
+ | http:// | ||
+ | |||
+ | https:// | ||
- | https:// |
/data/www/wiki.inf.unibz.it/data/pages/tech/slurm.txt · Last modified: 2022/11/24 16:17 by kohofer