User Tools

Site Tools


tech:slurm

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
Next revisionBoth sides next revision
tech:slurm [2020/04/29 09:17] – [Python] kohofer
tech:slurm [2020/05/27 11:11] – [Example] kohofer
Line 241: Line 241:
   debug*       up   infinite      1   idle linux1   debug*       up   infinite      1   idle linux1
  
-If a compute node is down+If a compute node is **<color #ed1c24>down</color>** or **<color #ed1c24>drain</color>**
  
 <code> <code>
Line 247: Line 247:
 PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
 debug*       up   infinite      2   down gpu[02-03] debug*       up   infinite      2   down gpu[02-03]
 +
 +sinfo 
 +PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
 +gpu*         up   infinite      1  drain gpu02
 +gpu*         up   infinite      1   down gpu03
 +
 </code> </code>
  
Line 526: Line 532:
 ... ...
 </code> </code>
 +
 +===== Examples =====
 +
 +==== Example mnist ====
 +
 +A simple example of using an NVIDIA GPU!
 +
 +The example consists of the following files:
 +
 +  * README.md
 +  * requirements.txt
 +  * main.job
 +  * main.py
 +
 +Create a folder named mnist and place the 4 files in it.
 +
 +  mkdir mnist
 +
 +cat README.md
 +
 +<code>
 +# Basic MNIST Example
 +
 +```bash
 +pip install -r requirements.txt
 +python main.py
 +# CUDA_VISIBLE_DEVICES=2 python main.py  # to specify GPU id to ex. 2
 +```
 +</code>
 +
 +
 +  cat requirements.txt
 +<code>
 +torch
 +torchvision
 +</code>
 +
 +
 +  cat main.job
 +<code>
 +#!/bin/bash
 +
 +#SBATCH --job-name=mnist
 +#SBATCH --output=mnist.out
 +#SBATCH --error=mnist.err
 +
 +#SBATCH --partition gpu
 +#SBATCH --gres=gpu
 +#SBATCH --mem-per-cpu=4gb
 +#SBATCH --nodes 2
 +#SBATCH --time=00:08:00
 +
 +#SBATCH --ntasks=10
 +
 +#SBATCH --mail-type=ALL
 +#SBATCH --mail-user=<your-email@address.com>
 +
 +ml load miniconda3
 +python3 main.py
 +</code>
 +
 +{(xssnipper>,1, main.py slide,
 +
 +from __future__ import print_function
 +import argparse
 +import torch
 +import torch.nn as nn
 +import torch.nn.functional as F
 +import torch.optim as optim
 +from torchvision import datasets, transforms
 +from torch.optim.lr_scheduler import StepLR
 +
 +
 +class Net(nn.Module):
 +    def __init__(self):
 +        super(Net, self).__init__()
 +        self.conv1 = nn.Conv2d(1, 32, 3, 1)
 +        self.conv2 = nn.Conv2d(32, 64, 3, 1)
 +        self.dropout1 = nn.Dropout2d(0.25)
 +        self.dropout2 = nn.Dropout2d(0.5)
 +        self.fc1 = nn.Linear(9216, 128)
 +        self.fc2 = nn.Linear(128, 10)
 +
 +    def forward(self, x):
 +        x = self.conv1(x)
 +        x = F.relu(x)
 +        x = self.conv2(x)
 +        x = F.max_pool2d(x, 2)
 +        x = self.dropout1(x)
 +        x = torch.flatten(x, 1)
 +        x = self.fc1(x)
 +        x = F.relu(x)
 +        x = self.dropout2(x)
 +        x = self.fc2(x)
 +        output = F.log_softmax(x, dim=1)
 +        return output
 +
 +
 +def train(args, model, device, train_loader, optimizer, epoch):
 +    model.train()
 +    for batch_idx, (data, target) in enumerate(train_loader):
 +        data, target = data.to(device), target.to(device)
 +        optimizer.zero_grad()
 +        output = model(data)
 +        loss = F.nll_loss(output, target)
 +        loss.backward()
 +        optimizer.step()
 +        if batch_idx % args.log_interval == 0:
 +            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
 +                epoch, batch_idx * len(data), len(train_loader.dataset),
 +                100. * batch_idx / len(train_loader), loss.item()))
 +
 +
 +def test(args, model, device, test_loader):
 +    model.eval()
 +    test_loss = 0
 +    correct = 0
 +    with torch.no_grad():
 +        for data, target in test_loader:
 +            data, target = data.to(device), target.to(device)
 +            output = model(data)
 +            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
 +            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
 +            correct += pred.eq(target.view_as(pred)).sum().item()
 +
 +    test_loss /= len(test_loader.dataset)
 +
 +    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
 +        test_loss, correct, len(test_loader.dataset),
 +        100. * correct / len(test_loader.dataset)))
 +
 +
 +def main():
 +    # Training settings
 +    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
 +    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
 +                        help='input batch size for training (default: 64)')
 +    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
 +                        help='input batch size for testing (default: 1000)')
 +    parser.add_argument('--epochs', type=int, default=14, metavar='N',
 +                        help='number of epochs to train (default: 14)')
 +    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
 +                        help='learning rate (default: 1.0)')
 +    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
 +                        help='Learning rate step gamma (default: 0.7)')
 +    parser.add_argument('--no-cuda', action='store_true', default=False,
 +                        help='disables CUDA training')
 +    parser.add_argument('--seed', type=int, default=1, metavar='S',
 +                        help='random seed (default: 1)')
 +    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
 +                        help='how many batches to wait before logging training status')
 +
 +    parser.add_argument('--save-model', action='store_true', default=False,
 +                        help='For Saving the current Model')
 +    args = parser.parse_args()
 +    use_cuda = not args.no_cuda and torch.cuda.is_available()
 +
 +    torch.manual_seed(args.seed)
 +
 +    device = torch.device("cuda" if use_cuda else "cpu")
 +
 +    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
 +    train_loader = torch.utils.data.DataLoader(
 +        datasets.MNIST('../data', train=True, download=True,
 +                       transform=transforms.Compose([
 +                           transforms.ToTensor(),
 +                           transforms.Normalize((0.1307,), (0.3081,))
 +                       ])),
 +        batch_size=args.batch_size, shuffle=True, **kwargs)
 +    test_loader = torch.utils.data.DataLoader(
 +        datasets.MNIST('../data', train=False, transform=transforms.Compose([
 +                           transforms.ToTensor(),
 +                           transforms.Normalize((0.1307,), (0.3081,))
 +                       ])),
 +        batch_size=args.test_batch_size, shuffle=True, **kwargs)
 +
 +    model = Net().to(device)
 +    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
 +
 +    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
 +    for epoch in range(1, args.epochs + 1):
 +        train(args, model, device, train_loader, optimizer, epoch)
 +        test(args, model, device, test_loader)
 +        scheduler.step()
 +
 +    if args.save_model:
 +        torch.save(model.state_dict(), "mnist_cnn.pt")
 +
 +
 +if __name__ == '__main__':
 +    main()
 +
 +)}  
  
  
  
 ===== Links ===== ===== Links =====
 +
 +https://www.admin-magazine.com/HPC/Articles/Warewulf-Cluster-Manager-Development-and-Run-Time/Warewulf-3-Code/MPICH2
 +
 +https://proteusmaster.urcf.drexel.edu/urcfwiki/index.php/Environment_Modules_Quick_Start_Guide
 +
 +https://en.wikipedia.org/wiki/Environment_Modules_(software)
  
 http://www.walkingrandomly.com/?p=5680 http://www.walkingrandomly.com/?p=5680
  
 https://modules.readthedocs.io/en/latest/index.html https://modules.readthedocs.io/en/latest/index.html
 +
/data/www/wiki.inf.unibz.it/data/pages/tech/slurm.txt · Last modified: 2022/11/24 16:17 by kohofer