File tree

6 files changed

+322
-35
lines changed

6 files changed

+322
-35
lines changed
Original file line numberDiff line numberDiff line change
@@ -1070,13 +1070,15 @@ def from_local_script(
10701070
display_name: str,
10711071
script_path: str,
10721072
container_uri: str,
1073-
args: Optional[List[Union[str, float, int]]] = None,
1073+
args: Optional[Sequence[str]] = None,
10741074
requirements: Optional[Sequence[str]] = None,
10751075
environment_variables: Optional[Dict[str, str]] = None,
10761076
replica_count: int = 1,
10771077
machine_type: str = "n1-standard-4",
10781078
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
10791079
accelerator_count: int = 0,
1080+
boot_disk_type: str = "pd-ssd",
1081+
boot_disk_size_gb: int = 100,
10801082
base_output_dir: Optional[str] = None,
10811083
project: Optional[str] = None,
10821084
location: Optional[str] = None,
@@ -1110,7 +1112,7 @@ def from_local_script(
11101112
Required. Local path to training script.
11111113
container_uri (str):
11121114
Required. URI of the training container image to use for the custom job.
1113-
args (Optional[List[Union[str, float, int]]]):
1115+
args (Optional[Sequence[str]]):
11141116
Optional. Command line arguments to be passed to the Python task.
11151117
requirements (Sequence[str]):
11161118
Optional. List of Python package dependencies of the script.
@@ -1136,6 +1138,13 @@ def from_local_script(
11361138
NVIDIA_TESLA_T4
11371139
accelerator_count (int):
11381140
Optional. The number of accelerators to attach to a worker replica.
1141+
boot_disk_type (str):
1142+
Optional. Type of the boot disk, default is `pd-ssd`.
1143+
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
1144+
`pd-standard` (Persistent Disk Hard Disk Drive).
1145+
boot_disk_size_gb (int):
1146+
Optional. Size in GB of the boot disk, default is 100GB.
1147+
Boot disk size must be within the range of [100, 64000].
11391148
base_output_dir (str):
11401149
Optional. GCS output directory of job. If not provided a
11411150
timestamped directory in the staging directory will be used.
@@ -1188,6 +1197,8 @@ def from_local_script(
11881197
machine_type=machine_type,
11891198
accelerator_count=accelerator_count,
11901199
accelerator_type=accelerator_type,
1200+
boot_disk_type=boot_disk_type,
1201+
boot_disk_size_gb=boot_disk_size_gb,
11911202
).pool_specs
11921203

11931204
python_packager = source_utils._TrainingScriptPythonPackager(
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,8 @@ def _prepare_and_validate_run(
11391139
machine_type: str = "n1-standard-4",
11401140
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
11411141
accelerator_count: int = 0,
1142+
boot_disk_type: str = "pd-ssd",
1143+
boot_disk_size_gb: int = 100,
11421144
) -> Tuple[worker_spec_utils._DistributedTrainingSpec, Optional[gca_model.Model]]:
11431145
"""Create worker pool specs and managed model as well as validating the
11441146
run.
@@ -1172,6 +1174,13 @@ def _prepare_and_validate_run(
11721174
NVIDIA_TESLA_T4
11731175
accelerator_count (int):
11741176
The number of accelerators to attach to a worker replica.
1177+
boot_disk_type (str):
1178+
Type of the boot disk, default is `pd-ssd`.
1179+
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
1180+
`pd-standard` (Persistent Disk Hard Disk Drive).
1181+
boot_disk_size_gb (int):
1182+
Size in GB of the boot disk, default is 100GB.
1183+
Boot disk size must be within the range of [100, 64000].
11751184
Returns:
11761185
Worker pools specs and managed model for run.
11771186
@@ -1204,6 +1213,8 @@ def _prepare_and_validate_run(
12041213
machine_type=machine_type,
12051214
accelerator_count=accelerator_count,
12061215
accelerator_type=accelerator_type,
1216+
boot_disk_type=boot_disk_type,
1217+
boot_disk_size_gb=boot_disk_size_gb,
12071218
).pool_specs
12081219

12091220
managed_model = self._managed_model
@@ -1588,6 +1599,8 @@ def run(
15881599
machine_type: str = "n1-standard-4",
15891600
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
15901601
accelerator_count: int = 0,
1602+
boot_disk_type: str = "pd-ssd",
1603+
boot_disk_size_gb: int = 100,
15911604
training_fraction_split: float = 0.8,
15921605
validation_fraction_split: float = 0.1,
15931606
test_fraction_split: float = 0.1,
@@ -1724,6 +1737,13 @@ def run(
17241737
NVIDIA_TESLA_T4
17251738
accelerator_count (int):
17261739
The number of accelerators to attach to a worker replica.
1740+
boot_disk_type (str):
1741+
Type of the boot disk, default is `pd-ssd`.
1742+
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
1743+
`pd-standard` (Persistent Disk Hard Disk Drive).
1744+
boot_disk_size_gb (int):
1745+
Size in GB of the boot disk, default is 100GB.
1746+
Boot disk size must be within the range of [100, 64000].
17271747
training_fraction_split (float):
17281748
The fraction of the input data that is to be
17291749
used to train the Model. This is ignored if Dataset is not provided.
@@ -1774,6 +1794,8 @@ def run(
17741794
machine_type=machine_type,
17751795
accelerator_count=accelerator_count,
17761796
accelerator_type=accelerator_type,
1797+
boot_disk_type=boot_disk_type,
1798+
boot_disk_size_gb=boot_disk_size_gb,
17771799
)
17781800

17791801
# make and copy package
@@ -2241,6 +2263,8 @@ def run(
22412263
machine_type: str = "n1-standard-4",
22422264
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
22432265
accelerator_count: int = 0,
2266+
boot_disk_type: str = "pd-ssd",
2267+
boot_disk_size_gb: int = 100,
22442268
training_fraction_split: float = 0.8,
22452269
validation_fraction_split: float = 0.1,
22462270
test_fraction_split: float = 0.1,
@@ -2370,6 +2394,13 @@ def run(
23702394
NVIDIA_TESLA_T4
23712395
accelerator_count (int):
23722396
The number of accelerators to attach to a worker replica.
2397+
boot_disk_type (str):
2398+
Type of the boot disk, default is `pd-ssd`.
2399+
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
2400+
`pd-standard` (Persistent Disk Hard Disk Drive).
2401+
boot_disk_size_gb (int):
2402+
Size in GB of the boot disk, default is 100GB.
2403+
Boot disk size must be within the range of [100, 64000].
23732404
training_fraction_split (float):
23742405
The fraction of the input data that is to be
23752406
used to train the Model. This is ignored if Dataset is not provided.
@@ -2425,6 +2456,8 @@ def run(
24252456
machine_type=machine_type,
24262457
accelerator_count=accelerator_count,
24272458
accelerator_type=accelerator_type,
2459+
boot_disk_type=boot_disk_type,
2460+
boot_disk_size_gb=boot_disk_size_gb,
24282461
)
24292462

24302463
return self._run(
@@ -4402,6 +4435,8 @@ def run(
44024435
machine_type: str = "n1-standard-4",
44034436
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
44044437
accelerator_count: int = 0,
4438+
boot_disk_type: str = "pd-ssd",
4439+
boot_disk_size_gb: int = 100,
44054440
training_fraction_split: float = 0.8,
44064441
validation_fraction_split: float = 0.1,
44074442
test_fraction_split: float = 0.1,
@@ -4531,6 +4566,13 @@ def run(
45314566
NVIDIA_TESLA_T4
45324567
accelerator_count (int):
45334568
The number of accelerators to attach to a worker replica.
4569+
boot_disk_type (str):
4570+
Type of the boot disk, default is `pd-ssd`.
4571+
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
4572+
`pd-standard` (Persistent Disk Hard Disk Drive).
4573+
boot_disk_size_gb (int):
4574+
Size in GB of the boot disk, default is 100GB.
4575+
Boot disk size must be within the range of [100, 64000].
45344576
training_fraction_split (float):
45354577
The fraction of the input data that is to be
45364578
used to train the Model. This is ignored if Dataset is not provided.
@@ -4581,6 +4623,8 @@ def run(
45814623
machine_type=machine_type,
45824624
accelerator_count=accelerator_count,
45834625
accelerator_type=accelerator_type,
4626+
boot_disk_type=boot_disk_type,
4627+
boot_disk_size_gb=boot_disk_size_gb,
45844628
)
45854629

45864630
return self._run(
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,19 @@
2222
)
2323

2424

25-
class _MachineSpec(NamedTuple):
26-
"""Specification container for Machine specs used for distributed training.
25+
class _WorkerPoolSpec(NamedTuple):
26+
"""Specification container for Worker Pool specs used for distributed training.
2727
2828
Usage:
2929
30-
spec = _MachineSpec(
30+
spec = _WorkerPoolSpec(
3131
replica_count=10,
3232
machine_type='n1-standard-4',
3333
accelerator_count=2,
34-
accelerator_type='NVIDIA_TESLA_K80')
34+
accelerator_type='NVIDIA_TESLA_K80',
35+
boot_disk_type='pd-ssd',
36+
boot_disk_size_gb=100,
37+
)
3538
3639
Note that container and python package specs are not stored with this spec.
3740
"""
@@ -40,6 +43,8 @@ class _MachineSpec(NamedTuple):
4043
machine_type: str = "n1-standard-4"
4144
accelerator_count: int = 0
4245
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED"
46+
boot_disk_type: str = "pd-ssd"
47+
boot_disk_size_gb: int = 100
4348

4449
def _get_accelerator_type(self) -> Optional[str]:
4550
"""Validates accelerator_type and returns the name of the accelerator.
@@ -70,7 +75,12 @@ def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]:
7075
spec = {
7176
"machine_spec": {"machine_type": self.machine_type},
7277
"replica_count": self.replica_count,
78+
"disk_spec": {
79+
"boot_disk_type": self.boot_disk_type,
80+
"boot_disk_size_gb": self.boot_disk_size_gb,
81+
},
7382
}
83+
7484
accelerator_type = self._get_accelerator_type()
7585
if accelerator_type and self.accelerator_count:
7686
spec["machine_spec"]["accelerator_type"] = accelerator_type
@@ -98,25 +108,29 @@ class _DistributedTrainingSpec(NamedTuple):
98108
Usage:
99109
100110
dist_training_spec = _DistributedTrainingSpec(
101-
chief_spec = _MachineSpec(
111+
chief_spec = _WorkerPoolSpec(
102112
replica_count=1,
103113
machine_type='n1-standard-4',
104114
accelerator_count=2,
105-
accelerator_type='NVIDIA_TESLA_K80'
106-
),
107-
worker_spec = _MachineSpec(
115+
accelerator_type='NVIDIA_TESLA_K80',
116+
boot_disk_type='pd-ssd',
117+
boot_disk_size_gb=100,
118+
),
119+
worker_spec = _WorkerPoolSpec(
108120
replica_count=10,
109121
machine_type='n1-standard-4',
110122
accelerator_count=2,
111-
accelerator_type='NVIDIA_TESLA_K80'
112-
)
123+
accelerator_type='NVIDIA_TESLA_K80',
124+
boot_disk_type='pd-ssd',
125+
boot_disk_size_gb=100,
126+
),
113127
)
114128
"""
115129

116-
chief_spec: _MachineSpec = _MachineSpec()
117-
worker_spec: _MachineSpec = _MachineSpec()
118-
parameter_server_spec: _MachineSpec = _MachineSpec()
119-
evaluator_spec: _MachineSpec = _MachineSpec()
130+
chief_spec: _WorkerPoolSpec = _WorkerPoolSpec()
131+
worker_spec: _WorkerPoolSpec = _WorkerPoolSpec()
132+
parameter_server_spec: _WorkerPoolSpec = _WorkerPoolSpec()
133+
evaluator_spec: _WorkerPoolSpec = _WorkerPoolSpec()
120134

121135
@property
122136
def pool_specs(
@@ -156,6 +170,8 @@ def chief_worker_pool(
156170
machine_type: str = "n1-standard-4",
157171
accelerator_count: int = 0,
158172
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
173+
boot_disk_type: str = "pd-ssd",
174+
boot_disk_size_gb: int = 100,
159175
) -> "_DistributedTrainingSpec":
160176
"""Parameterizes Config to support only chief with worker replicas.
161177
@@ -174,6 +190,13 @@ def chief_worker_pool(
174190
NVIDIA_TESLA_T4
175191
accelerator_count (int):
176192
The number of accelerators to attach to a worker replica.
193+
boot_disk_type (str):
194+
Type of the boot disk (default is `pd-ssd`).
195+
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
196+
`pd-standard` (Persistent Disk Hard Disk Drive).
197+
boot_disk_size_gb (int):
198+
Size in GB of the boot disk (default is 100GB).
199+
Boot disk size must be within the range of [100, 64000].
177200
178201
Returns:
179202
_DistributedTrainingSpec representing one chief and n workers all of same
@@ -182,18 +205,22 @@ def chief_worker_pool(
182205
if replica_count <= 0:
183206
return cls()
184207

185-
chief_spec = _MachineSpec(
208+
chief_spec = _WorkerPoolSpec(
186209
replica_count=1,
187210
machine_type=machine_type,
188211
accelerator_count=accelerator_count,
189212
accelerator_type=accelerator_type,
213+
boot_disk_type=boot_disk_type,
214+
boot_disk_size_gb=boot_disk_size_gb,
190215
)
191216

192-
worker_spec = _MachineSpec(
217+
worker_spec = _WorkerPoolSpec(
193218
replica_count=replica_count - 1,
194219
machine_type=machine_type,
195220
accelerator_count=accelerator_count,
196221
accelerator_type=accelerator_type,
222+
boot_disk_type=boot_disk_type,
223+
boot_disk_size_gb=boot_disk_size_gb,
197224
)
198225

199226
return cls(chief_spec=chief_spec, worker_spec=worker_spec)
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@
5454

5555
_TEST_TRAINING_CONTAINER_IMAGE = "gcr.io/test-training/container:image"
5656

57+
_TEST_RUN_ARGS = ["-v", "0.1", "--test=arg"]
58+
5759
_TEST_WORKER_POOL_SPEC = [
5860
{
5961
"machine_spec": {
@@ -62,10 +64,11 @@
6264
"accelerator_count": 1,
6365
},
6466
"replica_count": 1,
67+
"disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 100},
6568
"container_spec": {
6669
"image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
6770
"command": [],
68-
"args": [],
71+
"args": _TEST_RUN_ARGS,
6972
},
7073
}
7174
]
@@ -490,3 +493,41 @@ def test_create_custom_job_without_base_output_dir(self,):
490493
assert job.job_spec.base_output_directory.output_uri_prefix.startswith(
491494
f"{_TEST_STAGING_BUCKET}/aiplatform-custom-job"
492495
)
496+
497+
@pytest.mark.usefixtures("mock_python_package_to_gcs")
498+
@pytest.mark.parametrize("sync", [True, False])
499+
def test_create_from_local_script_with_all_args(
500+
self, get_custom_job_mock, create_custom_job_mock, sync
501+
):
502+
aiplatform.init(
503+
project=_TEST_PROJECT,
504+
location=_TEST_LOCATION,
505+
staging_bucket=_TEST_STAGING_BUCKET,
506+
encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
507+
)
508+
509+
# configuration on this is tested in test_training_jobs.py
510+
job = aiplatform.CustomJob.from_local_script(
511+
display_name=_TEST_DISPLAY_NAME,
512+
script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME,
513+
container_uri=_TEST_TRAINING_CONTAINER_IMAGE,
514+
args=_TEST_RUN_ARGS,
515+
requirements=test_training_jobs._TEST_REQUIREMENTS,
516+
environment_variables=test_training_jobs._TEST_ENVIRONMENT_VARIABLES,
517+
replica_count=test_training_jobs._TEST_REPLICA_COUNT,
518+
machine_type=test_training_jobs._TEST_MACHINE_TYPE,
519+
accelerator_type=test_training_jobs._TEST_ACCELERATOR_TYPE,
520+
accelerator_count=test_training_jobs._TEST_ACCELERATOR_COUNT,
521+
boot_disk_type=test_training_jobs._TEST_BOOT_DISK_TYPE,
522+
boot_disk_size_gb=test_training_jobs._TEST_BOOT_DISK_SIZE_GB,
523+
base_output_dir=_TEST_BASE_OUTPUT_DIR,
524+
labels=_TEST_LABELS,
525+
)
526+
527+
job.run(sync=sync)
528+
529+
job.wait()
530+
531+
assert (
532+
job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED
533+
)
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,10 @@ def test_dataset_create_to_model_predict(
211211
"accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE,
212212
"accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT,
213213
},
214+
"disk_spec": {
215+
"boot_disk_type": test_training_jobs._TEST_BOOT_DISK_TYPE_DEFAULT,
216+
"boot_disk_size_gb": test_training_jobs._TEST_BOOT_DISK_SIZE_GB_DEFAULT,
217+
},
214218
"python_package_spec": {
215219
"executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
216220
"python_module": source_utils._TrainingScriptPythonPackager.module_name,
@@ -394,6 +398,10 @@ def test_dataset_create_to_model_predict_with_pipeline_fail(
394398
"accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE,
395399
"accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT,
396400
},
401+
"disk_spec": {
402+
"boot_disk_type": test_training_jobs._TEST_BOOT_DISK_TYPE_DEFAULT,
403+
"boot_disk_size_gb": test_training_jobs._TEST_BOOT_DISK_SIZE_GB_DEFAULT,
404+
},
397405
"python_package_spec": {
398406
"executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
399407
"python_module": source_utils._TrainingScriptPythonPackager.module_name,

0 commit comments

Comments
 (0)