Sep 20, 2024 · Sep 20, 2024 · Sep 20, 2024 · Sep 20, 2024 · Sep 20, 2024 · Sep 20, 2024
@@ -24,7 +24,7 @@ usage: main.py [-h] --dataset DATASET --dataroot DATAROOT [--workers WORKERS]
 [--batchSize BATCHSIZE] [--imageSize IMAGESIZE] [--nz NZ]
 [--ngf NGF] [--ndf NDF] [--niter NITER] [--lr LR]
 [--beta1 BETA1] [--cuda] [--ngpu NGPU] [--netG NETG]
-[--netD NETD] [--mps]
+[--netD NETD] [--mps] [--device DEVICE]
 
 optional arguments:
 -h, --help show this help message and exit
@@ -41,6 +41,7 @@ optional arguments:
 --beta1 BETA1 beta1 for adam. default=0.5
 --cuda enables cuda
 --mps enables macOS GPU
+--device DEVICE backend device
 --ngpu NGPU number of GPUs to use
 --netG NETG path to netG (to continue training)
 --netD NETD path to netD (to continue training)

@@ -28,6 +28,7 @@ python neural_style/neural_style.py eval --content-image </path/to/content/image
 - `--content-scale`: factor for scaling down the content image if memory is an issue (eg: value of 2 will halve the height and width of content-image)
 - `--cuda`: set it to 1 for running on GPU, 0 for CPU.
 - `--mps`: set it to 1 for running on macOS GPU
+- `--device DEVICE`: backend device to run on, 'cpu' by default.
 
 Train model
 
@@ -42,6 +43,7 @@ There are several command line arguments, the important ones are listed below
 - `--save-model-dir`: path to folder where trained model will be saved.
 - `--cuda`: set it to 1 for running on GPU, 0 for CPU.
 - `--mps`: set it to 1 for running on macOS GPU
+- `--device DEVICE`: backend device to run on, 'cpu' by default.
 
 Refer to `neural_style/neural_style.py` for other command line arguments. For training new models you might have to tune the values of `--content-weight` and `--style-weight`. The mosaic style model shown above was trained with `--content-weight 1e5` and `--style-weight 1e10`. The remaining 3 models were also trained with similar order of weight parameters with slight variation in the `--style-weight` (`5e10` or `1e11`).
 

@@ -69,7 +69,7 @@ python main.py --epochs 300 --lr 0.005 --l2 5e-4 --dropout-p 0.6 --num-heads 8 -
 In more detail, the `main.py` script receives the following arguments:
 ```
 usage: main.py [-h] [--epochs EPOCHS] [--lr LR] [--l2 L2] [--dropout-p DROPOUT_P] [--hidden-dim HIDDEN_DIM] [--num-heads NUM_HEADS] [--concat-heads] [--val-every VAL_EVERY]
-[--no-cuda] [--no-mps] [--dry-run] [--seed S]
+[--no-cuda] [--no-mps] [--dry-run] [--seed S] [--device DEVICE]
 
 PyTorch Graph Attention Network
 
@@ -89,6 +89,7 @@ options:
 epochs to wait for print training and validation evaluation (default: 20)
 --no-cuda disables CUDA training
 --no-mps disables macOS GPU training
+--device DEVICE backend device
 --dry-run quickly check a single pass
 --seed S random seed (default: 13)
 ```

@@ -311,6 +311,8 @@ def test(model, criterion, input, target, mask):
 help='disables CUDA training')
 parser.add_argument('--no-mps', action='store_true', default=False,
 help='disables macOS GPU training')
+parser.add_argument('--device', type=str, default='cpu',
+help='backend device')
 parser.add_argument('--dry-run', action='store_true', default=False,
 help='quickly check a single pass')
 parser.add_argument('--seed', type=int, default=13, metavar='S',
@@ -327,7 +329,7 @@ def test(model, criterion, input, target, mask):
 elif use_mps:
 device = torch.device('mps')
 else:
-device = torch.device('cpu')
+device = torch.device(args.device)
 print(f'Using {device} device')
 
 # Load the dataset

@@ -220,6 +220,8 @@ def test(model, criterion, input, target, mask):
 help='disables CUDA training')
 parser.add_argument('--no-mps', action='store_true', default=False,
 help='disables macOS GPU training')
+parser.add_argument('--device', type=str, default='cpu',
+help='backend device')
 parser.add_argument('--dry-run', action='store_true', default=False,
 help='quickly check a single pass')
 parser.add_argument('--seed', type=int, default=42, metavar='S',
@@ -236,7 +238,7 @@ def test(model, criterion, input, target, mask):
 elif use_mps:
 device = torch.device('mps')
 else:
-device = torch.device('cpu')
+device = torch.device(args.device)
 print(f'Using {device} device')
 
 cora_url = 'https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz'

@@ -272,9 +272,9 @@ def main(opts):
 help="Default learning rate")
 parser.add_argument("--batch", type=int, default=128,
 help="Batch size")
-parser.add_argument("--backend", type=str, default="cpu",
-help="Batch size")
-
+parser.add_argument("--device", type=str, default="cpu",
+help="backend device")
+
 # Transformer settings
 parser.add_argument("--attn_heads", type=int, default=8,
 help="Number of attention heads")
@@ -298,7 +298,7 @@ def main(opts):
 
 args = parser.parse_args()
 
-DEVICE = torch.device("cuda" if args.backend == "gpu" and torch.cuda.is_available() else "cpu")
+DEVICE = torch.device("cuda" if args.device == "gpu" and torch.cuda.is_available() else args.device)
 
 if args.inference:
 inference(args)

@@ -25,7 +25,7 @@ spacy
 Start the training process with:
 
 ```bash
-python train.py --lower --word-vectors [PATH_TO_WORD_VECTORS] --vector-cache [PATH_TO_VECTOR_CACHE] --epochs [NUMBER_OF_EPOCHS] --batch-size [BATCH_SIZE] --save-path [PATH_TO_SAVE_MODEL] --gpu [GPU_NUMBER]
+python train.py --lower --word-vectors [PATH_TO_WORD_VECTORS] --vector-cache [PATH_TO_VECTOR_CACHE] --epochs [NUMBER_OF_EPOCHS] --batch-size [BATCH_SIZE] --save-path [PATH_TO_SAVE_MODEL] --gpu [GPU_NUMBER] --device [BACKEND_DEVICE]
 ```
 
 ## 🏋️‍♀️ Training

@@ -20,6 +20,8 @@ def makedirs(name):
 
 def get_args():
 parser = ArgumentParser(description='PyTorch/torchtext SNLI example')
+parser.add_argument('--device', type=str, default='cpu',
+help='backend device')
 parser.add_argument('--epochs', type=int, default=50,
 help='the number of total epochs to run.')
 parser.add_argument('--batch_size', type=int, default=128,

@@ -86,6 +86,8 @@ def main():
 help='disables CUDA training')
 parser.add_argument('--no-mps', action='store_true', default=False,
 help='disables macOS GPU training')
+parser.add_argument('--device', type=str, default='cpu',
+help='backend device')
 parser.add_argument('--dry-run', action='store_true', default=False,
 help='quickly check a single pass')
 parser.add_argument('--seed', type=int, default=1, metavar='S',
@@ -105,7 +107,7 @@ def main():
 elif use_mps:
 device = torch.device("mps")
 else:
-device = torch.device("cpu")
+device = torch.device(args.device)
 
 train_kwargs = {'batch_size': args.batch_size}
 test_kwargs = {'batch_size': args.test_batch_size}

@@ -18,6 +18,7 @@ optional arguments:
 --lr LR learning rate (default: 0.03)
 --no_cuda disables CUDA training
 --no_mps disables MPS training
+--device DEVICE backend device
 --seed SEED random seed (default: 1)
 --save_model For saving the current Model
 --train_size TRAIN_SIZE

@@ -108,6 +108,9 @@ def train(self, x_pos, x_neg):
 parser.add_argument(
 "--no_mps", action="store_true", default=False, help="disables MPS training"
 )
+parser.add_argument(
+'--device', type=str, default='cpu', help='backend device'
+)
 parser.add_argument(
 "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
 )
@@ -145,7 +148,7 @@ def train(self, x_pos, x_neg):
 elif use_mps:
 device = torch.device("mps")
 else:
-device = torch.device("cpu")
+device = torch.device(args.device)
 
 train_kwargs = {"batch_size": args.train_size}
 test_kwargs = {"batch_size": args.test_size}

@@ -21,6 +21,7 @@ optional arguments:
 --log_interval how many batches to wait before logging training status
 --num_process how many training processes to use (default: 2)
 --cuda enables CUDA training
+--device DEVICE backend device
 --dry-run quickly check a single pass
 --save-model For Saving the current Model
 ```
@@ -31,6 +31,8 @@
 help='enables CUDA training')
 parser.add_argument('--mps', action='store_true', default=False,
 help='enables macOS GPU training')
+parser.add_argument('--device', type=str, default='cpu',
+help='backend device')
 parser.add_argument('--save_model', action='store_true', default=False,
 help='save the trained model to state_dict')
 parser.add_argument('--dry-run', action='store_true', default=False,
@@ -65,7 +67,7 @@ def forward(self, x):
 elif use_mps:
 device = torch.device("mps")
 else:
-device = torch.device("cpu")
+device = torch.device(args.device)
 
 transform=transforms.Compose([
 transforms.ToTensor(),

@@ -95,6 +95,8 @@ def main():
 help='enables CUDA training')
 parser.add_argument('--mps', action="store_true", default=False,
 help="enables MPS training")
+parser.add_argument('--device', type=str, default='cpu',
+help='backend device')
 parser.add_argument('--dry-run', action='store_true', default=False,
 help='quickly check a single pass')
 parser.add_argument('--seed', type=int, default=1, metavar='S',
@@ -110,7 +112,7 @@ def main():
 elif args.mps and not args.cuda:
 device = "mps"
 else:
-device = "cpu"
+device = args.device
 
 device = torch.device(device)
 

@@ -13,6 +13,15 @@
 BASE_DIR="$(pwd)/$(dirname $0)"
 source $BASE_DIR/utils.sh
 
+# Select a backend device with e.g. 'export BACKEND_DEVICE=cpu'. If unset, no
+# '--device' flag is passed (examples default to 'cpu'); '--cuda'/'--mps' take priority.
+# See https://github.com/pytorch/examples/pull/1288 for more information.
+if [ -n "${BACKEND_DEVICE}" ]; then
+DEVICE_FLAG="--device ${BACKEND_DEVICE}"
+else
+DEVICE_FLAG=""
+fi
+
 USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())")
 case $USE_CUDA in
 "True")
@@ -32,7 +41,7 @@ esac
 
 function dcgan() {
 start
-python main.py --dataset fake $CUDA_FLAG --mps --dry-run || error "dcgan failed"
+python main.py --dataset fake $CUDA_FLAG --mps $DEVICE_FLAG --dry-run || error "dcgan failed"
 }
 
 function fast_neural_style() {
@@ -44,7 +53,7 @@ function fast_neural_style() {
 test -d "saved_models" || { error "saved models not found"; return; }
 
 echo "running fast neural style model"
-python neural_style/neural_style.py eval --content-image images/content-images/amber.jpg --model saved_models/candy.pth --output-image images/output-images/amber-candy.jpg --cuda $CUDA --mps || error "neural_style.py failed"
+python neural_style/neural_style.py eval --content-image images/content-images/amber.jpg --model saved_models/candy.pth --output-image images/output-images/amber-candy.jpg --cuda $CUDA --mps $DEVICE_FLAG || error "neural_style.py failed"
 }
 
 function imagenet() {
@@ -63,36 +72,36 @@ function language_translation() {
 start
 python -m spacy download en || error "couldn't download en package from spacy"
 python -m spacy download de || error "couldn't download de package from spacy"
-python main.py -e 1 --enc_layers 1 --dec_layers 1 --backend cpu --logging_dir output/ --dry_run || error "language translation example failed"
+python main.py -e 1 --enc_layers 1 --dec_layers 1 $DEVICE_FLAG --logging_dir output/ --dry_run || error "language translation example failed"
 }
 
 function mnist() {
 start
-python main.py --epochs 1 --dry-run || error "mnist example failed"
+python main.py --epochs 1 --dry-run $DEVICE_FLAG || error "mnist example failed"
 }
 function mnist_forward_forward() {
 start
-python main.py --epochs 1 --no_mps --no_cuda || error "mnist forward forward failed"
+python main.py --epochs 1 --no_mps --no_cuda $DEVICE_FLAG || error "mnist forward forward failed"
 
 }
 function mnist_hogwild() {
 start
-python main.py --epochs 1 --dry-run $CUDA_FLAG || error "mnist hogwild failed"
+python main.py --epochs 1 --dry-run $CUDA_FLAG $DEVICE_FLAG || error "mnist hogwild failed"
 }
 
 function mnist_rnn() {
 start
-python main.py --epochs 1 --dry-run || error "mnist rnn example failed"
+python main.py --epochs 1 --dry-run $DEVICE_FLAG || error "mnist rnn example failed"
 }
 
 function regression() {
 start
-python main.py --epochs 1 $CUDA_FLAG || error "regression failed"
+python main.py --epochs 1 $CUDA_FLAG $DEVICE_FLAG || error "regression failed"
 }
 
 function siamese_network() {
 start
-python main.py --epochs 1 --dry-run || error "siamese network example failed"
+python main.py --epochs 1 --dry-run $DEVICE_FLAG || error "siamese network example failed"
 }
 
 function reinforcement_learning() {
@@ -123,7 +132,7 @@ function fx() {
 
 function super_resolution() {
 start
-python main.py --upscale_factor 3 --batchSize 4 --testBatchSize 100 --nEpochs 1 --lr 0.001 --mps || error "super resolution failed"
+python main.py --upscale_factor 3 --batchSize 4 --testBatchSize 100 --nEpochs 1 --lr 0.001 --mps $DEVICE_FLAG || error "super resolution failed"
 }
 
 function time_sequence_prediction() {
@@ -134,7 +143,7 @@ function time_sequence_prediction() {
 
 function vae() {
 start
-python main.py --epochs 1 || error "vae failed"
+python main.py --epochs 1 $DEVICE_FLAG || error "vae failed"
 }
 
 function vision_transformer() {
@@ -144,17 +153,17 @@ function vision_transformer() {
 
 function word_language_model() {
 start
-python main.py --epochs 1 --dry-run $CUDA_FLAG --mps || error "word_language_model failed"
+python main.py --epochs 1 --dry-run $CUDA_FLAG --mps $DEVICE_FLAG || error "word_language_model failed"
 }
 
 function gcn() {
 start
-python main.py --epochs 1 --dry-run || error "graph convolutional network failed"
+python main.py --epochs 1 --dry-run $DEVICE_FLAG || error "graph convolutional network failed"
 }
 
 function gat() {
 start
-python main.py --epochs 1 --dry-run || error "graph attention network failed"
+python main.py --epochs 1 --dry-run $DEVICE_FLAG || error "graph attention network failed"
 }
 
 function clean() {