</code>
  
===== Examples =====

==== Example mnist ====
  
A simple example that uses an NVIDIA GPU!
  
The example consists of the following files:

  * README.md
  * requirements.txt
  * main.job
  * main.py

Create a folder mnist and place the four files in it:

  mkdir mnist

  cat README.md

<code>
# Basic MNIST Example

```bash
pip install -r requirements.txt
python main.py
# CUDA_VISIBLE_DEVICES=2 python main.py  # to run on a specific GPU, e.g. id 2
```
</code>


  cat requirements.txt
<code>
torch
torchvision
</code>
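
Both packages must be installed where Python on the compute nodes can find them before the job runs. A minimal sketch of one way to do this, assuming the same miniconda3 module that main.job loads and a per-user pip install (whether --user is the right choice depends on the site setup):

<code>
ml load miniconda3
pip install --user -r requirements.txt
</code>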


  cat main.job
<code>
#!/bin/bash
...
#SBATCH --mail-type=ALL
#SBATCH --mail-user=<your-email@address.com>

ml load miniconda3
python3 main.py
</code>
  
Replace <your-email@address.com> with your own e-mail address.
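
The main.job listing above is abbreviated; the ... stands for lines not included in this revision. As a rough sketch, a complete job file could look like the following, where the job name, partition, GPU request and time limit are assumptions to adapt to the cluster (only the lines already shown above come from the original file):

<code>
#!/bin/bash
#SBATCH --job-name=mnist        # job name shown by squeue (assumption)
#SBATCH --partition=gpu         # partition name is an assumption; check sinfo
#SBATCH --gres=gpu:1            # request one GPU (assumption)
#SBATCH --time=01:00:00         # wall-time limit (assumption)
#SBATCH --mail-type=ALL
#SBATCH --mail-user=<your-email@address.com>

ml load miniconda3
python3 main.py
</code>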
  
The training script main.py:

{(xssnipper>,1, main.py slide,
  
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
  

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(args, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()

)}
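
Since main.py defines its options with argparse, it can be smoke-tested outside Slurm first. For example, a quick CPU-only run with the script's own flags (the MNIST data is downloaded to ../data on the first run):

<code>
python3 main.py --epochs 1 --no-cuda
</code>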

Once you have all the files in place, submit the job on slurm-ctrl:

  sbatch main.job

Check your job with:

  squeue
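
Plain squeue lists every job in the queue. A few standard Slurm commands that are often more convenient (the job id 1234 is a placeholder):

<code>
squeue -u $USER          # show only your own jobs
scontrol show job 1234   # full details of a single job
cat slurm-1234.out       # job output; Slurm writes slurm-<jobid>.out by default
</code>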


----


===== CUDA NVIDIA TESLA =====

The deviceQuery sample reports the following for the two Tesla V100 cards in gpu03:

<code>
root@gpu03:/usr/local/cuda/samples/bin/x86_64/linux# deviceQuery
deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 2 CUDA Capable device(s)

Device 0: "Tesla V100-PCIE-32GB"
  CUDA Driver Version / Runtime Version          10.2 / 10.2
  CUDA Capability Major/Minor version number:    7.0
  Total amount of global memory:                 32510 MBytes (34089730048 bytes)
  (80) Multiprocessors, ( 64) CUDA Cores/MP:     5120 CUDA Cores
  GPU Max Clock rate:                            1380 MHz (1.38 GHz)
  Memory Clock rate:                             877 Mhz
  Memory Bus Width:                              4096-bit
  L2 Cache Size:                                 6291456 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 7 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Enabled
  Device supports Unified Addressing (UVA):      Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 59 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

Device 1: "Tesla V100-PCIE-32GB"
  CUDA Driver Version / Runtime Version          10.2 / 10.2
  CUDA Capability Major/Minor version number:    7.0
  Total amount of global memory:                 32510 MBytes (34089730048 bytes)
  (80) Multiprocessors, ( 64) CUDA Cores/MP:     5120 CUDA Cores
  GPU Max Clock rate:                            1380 MHz (1.38 GHz)
  Memory Clock rate:                             877 Mhz
  Memory Bus Width:                              4096-bit
  L2 Cache Size:                                 6291456 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 7 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Enabled
  Device supports Unified Addressing (UVA):      Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 175 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
> Peer access from Tesla V100-PCIE-32GB (GPU0) -> Tesla V100-PCIE-32GB (GPU1) : Yes
> Peer access from Tesla V100-PCIE-32GB (GPU1) -> Tesla V100-PCIE-32GB (GPU0) : Yes

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.2, CUDA Runtime Version = 10.2, NumDevs = 2
Result = PASS
</code>
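
deviceQuery is one of the stock CUDA samples and is already built on gpu03 under /usr/local/cuda/samples/bin/x86_64/linux. Should it need rebuilding, the usual way is to compile it from its source directory (path as shipped with CUDA 10.2; adjust for other versions):

<code>
cd /usr/local/cuda/samples/1_Utilities/deviceQuery
make
./deviceQuery
</code>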
  
  