tech:slurm
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revisionNext revisionBoth sides next revision | ||
tech:slurm [2020/04/27 15:50] – [GCC] kohofer | tech:slurm [2020/05/27 11:11] – [Example] kohofer | ||
---|---|---|---|
Line 241: | Line 241: | ||
debug* | debug* | ||
- | If computer node is down | + | If computer node is **<color #ed1c24>down</color>** |
< | < | ||
Line 247: | Line 247: | ||
PARTITION AVAIL TIMELIMIT | PARTITION AVAIL TIMELIMIT | ||
debug* | debug* | ||
+ | |||
+ | sinfo | ||
+ | PARTITION AVAIL TIMELIMIT | ||
+ | gpu* | ||
+ | gpu* | ||
+ | |||
</ | </ | ||
Line 356: | Line 362: | ||
====== Modules ====== | ====== Modules ====== | ||
+ | |||
+ | ===== Python ===== | ||
+ | |||
+ | ==== Python 3.7.7 ==== | ||
+ | |||
+ | |||
+ | cd / | ||
+ | mkdir / | ||
+ | wget https:// | ||
+ | tar xfJ Python-3.7.7.tar.xz | ||
+ | cd Python-3.7.7/ | ||
+ | ./configure --prefix=/ | ||
+ | make | ||
+ | make install | ||
+ | | ||
+ | |||
+ | ==== Python 2.7.18 ==== | ||
+ | |||
+ | |||
+ | cd / | ||
+ | mkdir / | ||
+ | wget https:// | ||
+ | cd Python-2.7.18 | ||
+ | ./configure --prefix=/ | ||
+ | make | ||
+ | make install | ||
+ | |||
+ | ==== Create modules file ==== | ||
+ | |||
+ | |||
+ | cd / | ||
+ | vi python-2.7.18 | ||
+ | |||
+ | < | ||
+ | #%Module1.0 | ||
+ | proc ModulesHelp { } { | ||
+ | global dotversion | ||
+ | |||
+ | puts stderr " | ||
+ | } | ||
+ | |||
+ | module-whatis " | ||
+ | prepend-path PATH / | ||
+ | |||
+ | </ | ||
+ | | ||
+ | |||
+ | |||
===== GCC ===== | ===== GCC ===== | ||
Line 478: | Line 532: | ||
... | ... | ||
</ | </ | ||
+ | |||
+ | ===== Examples ===== | ||
+ | |||
+ | ==== Example mnist ==== | ||
+ | |||
+ | A simple example that uses an NVIDIA GPU! | |
+ | |||
+ | The example consists of the following files: | ||
+ | |||
+ | * README.md | ||
+ | * requirements.txt | ||
+ | * main.job | ||
+ | * main.py | ||
+ | |||
+ | Create a folder named mnist and place the four files in it. | |
+ | |||
+ | mkdir mnist | ||
+ | |||
+ | cat README.md | ||
+ | |||
+ | < | ||
+ | # Basic MNIST Example | ||
+ | |||
+ | ```bash | ||
+ | pip install -r requirements.txt | ||
+ | python main.py | ||
+ | # CUDA_VISIBLE_DEVICES=2 python main.py | ||
+ | ``` | ||
+ | </ | ||
+ | |||
+ | |||
+ | cat requirements.txt | ||
+ | < | ||
+ | torch | ||
+ | torchvision | ||
+ | </ | ||
+ | |||
+ | |||
+ | cat main.job | ||
+ | < | ||
+ | #!/bin/bash | ||
+ | |||
+ | #SBATCH --job-name=mnist | ||
+ | #SBATCH --output=mnist.out | ||
+ | #SBATCH --error=mnist.err | ||
+ | |||
+ | #SBATCH --partition gpu | ||
+ | #SBATCH --gres=gpu | ||
+ | #SBATCH --mem-per-cpu=4gb | ||
+ | #SBATCH --nodes 2 | ||
+ | #SBATCH --time=00: | ||
+ | |||
+ | #SBATCH --ntasks=10 | ||
+ | |||
+ | #SBATCH --mail-type=ALL | ||
+ | #SBATCH --mail-user=< | ||
+ | |||
+ | ml load miniconda3 | ||
+ | python3 main.py | ||
+ | </ | ||
+ | |||
+ | {(xssnipper>, | ||
+ | |||
+ | from __future__ import print_function | ||
+ | import argparse | ||
+ | import torch | ||
+ | import torch.nn as nn | ||
+ | import torch.nn.functional as F | ||
+ | import torch.optim as optim | ||
+ | from torchvision import datasets, transforms | ||
+ | from torch.optim.lr_scheduler import StepLR | ||
+ | |||
+ | |||
+ | class Net(nn.Module): | ||
+ | def __init__(self): | ||
+ | super(Net, self).__init__() | ||
+ | self.conv1 = nn.Conv2d(1, | ||
+ | self.conv2 = nn.Conv2d(32, | ||
+ | self.dropout1 = nn.Dropout2d(0.25) | ||
+ | self.dropout2 = nn.Dropout2d(0.5) | ||
+ | self.fc1 = nn.Linear(9216, | ||
+ | self.fc2 = nn.Linear(128, | ||
+ | |||
+ | def forward(self, | ||
+ | x = self.conv1(x) | ||
+ | x = F.relu(x) | ||
+ | x = self.conv2(x) | ||
+ | x = F.max_pool2d(x, | ||
+ | x = self.dropout1(x) | ||
+ | x = torch.flatten(x, | ||
+ | x = self.fc1(x) | ||
+ | x = F.relu(x) | ||
+ | x = self.dropout2(x) | ||
+ | x = self.fc2(x) | ||
+ | output = F.log_softmax(x, | ||
+ | return output | ||
+ | |||
+ | |||
+ | def train(args, model, device, train_loader, | ||
+ | model.train() | ||
+ | for batch_idx, (data, target) in enumerate(train_loader): | ||
+ | data, target = data.to(device), | ||
+ | optimizer.zero_grad() | ||
+ | output = model(data) | ||
+ | loss = F.nll_loss(output, | ||
+ | loss.backward() | ||
+ | optimizer.step() | ||
+ | if batch_idx % args.log_interval == 0: | ||
+ | print(' | ||
+ | epoch, batch_idx * len(data), len(train_loader.dataset), | ||
+ | 100. * batch_idx / len(train_loader), | ||
+ | |||
+ | |||
+ | def test(args, model, device, test_loader): | ||
+ | model.eval() | ||
+ | test_loss = 0 | ||
+ | correct = 0 | ||
+ | with torch.no_grad(): | ||
+ | for data, target in test_loader: | ||
+ | data, target = data.to(device), | ||
+ | output = model(data) | ||
+ | test_loss += F.nll_loss(output, | ||
+ | pred = output.argmax(dim=1, | ||
+ | correct += pred.eq(target.view_as(pred)).sum().item() | ||
+ | |||
+ | test_loss /= len(test_loader.dataset) | ||
+ | |||
+ | print(' | ||
+ | test_loss, correct, len(test_loader.dataset), | ||
+ | 100. * correct / len(test_loader.dataset))) | ||
+ | |||
+ | |||
+ | def main(): | ||
+ | # Training settings | ||
+ | parser = argparse.ArgumentParser(description=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | |||
+ | parser.add_argument(' | ||
+ | help=' | ||
+ | args = parser.parse_args() | ||
+ | use_cuda = not args.no_cuda and torch.cuda.is_available() | ||
+ | |||
+ | torch.manual_seed(args.seed) | ||
+ | |||
+ | device = torch.device(" | ||
+ | |||
+ | kwargs = {' | ||
+ | train_loader = torch.utils.data.DataLoader( | ||
+ | datasets.MNIST(' | ||
+ | | ||
+ | | ||
+ | | ||
+ | ])), | ||
+ | batch_size=args.batch_size, | ||
+ | test_loader = torch.utils.data.DataLoader( | ||
+ | datasets.MNIST(' | ||
+ | | ||
+ | | ||
+ | ])), | ||
+ | batch_size=args.test_batch_size, | ||
+ | |||
+ | model = Net().to(device) | ||
+ | optimizer = optim.Adadelta(model.parameters(), | ||
+ | |||
+ | scheduler = StepLR(optimizer, | ||
+ | for epoch in range(1, args.epochs + 1): | ||
+ | train(args, model, device, train_loader, | ||
+ | test(args, model, device, test_loader) | ||
+ | scheduler.step() | ||
+ | |||
+ | if args.save_model: | ||
+ | torch.save(model.state_dict(), | ||
+ | |||
+ | |||
+ | if __name__ == ' | ||
+ | main() | ||
+ | |||
+ | )} | ||
===== Links ===== | ===== Links ===== | ||
+ | |||
+ | https:// | ||
+ | |||
+ | https:// | ||
+ | |||
+ | https:// | ||
http:// | http:// | ||
https:// | https:// | ||
+ |
/data/www/wiki.inf.unibz.it/data/pages/tech/slurm.txt · Last modified: 2022/11/24 16:17 by kohofer