</code>
  
===== Examples =====

==== Example mnist ====
  
A simple example that uses an NVIDIA GPU.
  
The example consists of the following files:

  * README.md
  * requirements.txt
  * main.job
  * main.py

Create a folder mnist and place the four files in it:

  mkdir mnist
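
Then change into it, so that main.job is submitted from there and the job's output files land in the same place:

  cd mnist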

  cat README.md

<code>
# Basic MNIST Example

```bash
pip install -r requirements.txt
python main.py
# CUDA_VISIBLE_DEVICES=2 python main.py  # to pin the run to GPU 2, for example
```
</code>

  cat requirements.txt

<code>
torch
torchvision
</code>
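
The job's Python has to be able to import these packages, so install them once on slurm-ctrl before submitting. A minimal sketch, assuming the miniconda3 module (the same one main.job loads below) puts pip on your PATH; if the central installation is not writable for you, pip install --user is the usual fallback:

<code>
ml load miniconda3                 # same module main.job loads
pip install -r requirements.txt    # or: pip install --user -r requirements.txt
</code>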

  cat main.job
<code>
#!/bin/bash
#SBATCH --mail-type=ALL
#SBATCH --mail-user=<your-email@address.com>

ml load miniconda3
python3 main.py
</code>
  
Replace <your-email@address.com> with your own e-mail address.
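
If your copy of main.job does not already request one, the scheduler also has to be told that the job needs a GPU. A hedged sketch of additional #SBATCH lines using standard Slurm options (the partition name gpu is an assumption; check the output of sinfo for the real names on this cluster):

<code>
#SBATCH --job-name=mnist        # name shown in squeue
#SBATCH --output=mnist-%j.out   # %j expands to the job id
#SBATCH --gres=gpu:1            # request one GPU on the node
#SBATCH --partition=gpu         # assumed partition name; check sinfo
</code>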
  
  cat main.py

<code>
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(args, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()
</code>
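
Before submitting, the script can be sanity-checked with a short CPU-only run, using the flags defined in the argparse section above; keep it brief, since slurm-ctrl is a shared login node:

  python3 main.py --epochs 1 --no-cuda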

Once you have all the files in place, launch this command on slurm-ctrl:

  sbatch main.job

Check your job with:

  squeue
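
squeue lists pending and running jobs; once yours has finished, it disappears from the list. By default, sbatch writes the job's standard output to slurm-<jobid>.out in the submission directory (unless an --output directive says otherwise), so the training log can be read from there:

<code>
squeue -u $USER       # restrict the list to your own jobs
cat slurm-12345.out   # 12345 stands for the job id printed by sbatch
</code>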
  
  