37
37
38
38
_MAX_IDENTITY_FETCH_RETRY = 10
39
39
40
# Fixed image sizes, in GB, that get_image_size returns without querying
# Azure when the image was given as a `skypilot:` tag: the first is used
# when the resolved offer is 'ubuntu-hpc', the second for any other offer.
+ _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB = 30
41
+ _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB = 150
42
+
40
43
41
44
def _run_output (cmd ):
42
45
proc = subprocess .run (cmd ,
@@ -75,9 +78,6 @@ def _unsupported_features_for_resources(
75
78
features = {
76
79
clouds .CloudImplementationFeatures .CLONE_DISK_FROM_CLUSTER :
77
80
(f'Migrating disk is currently not supported on { cls ._REPR } .' ),
78
- clouds .CloudImplementationFeatures .IMAGE_ID :
79
- ('Specifying image ID is currently not supported on '
80
- f'{ cls ._REPR } .' ),
81
81
}
82
82
if resources .use_spot :
83
83
features [clouds .CloudImplementationFeatures .STOP ] = (
@@ -137,6 +137,50 @@ def get_egress_cost(self, num_gigabytes: float):
137
137
def is_same_cloud(self, other):
    """Tell whether ``other`` is an ``Azure`` instance."""
    other_is_azure = isinstance(other, Azure)
    return other_is_azure
139
139
140
@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
    """Return the size, in GB, of an Azure image.

    Args:
        image_id: Either a ``skypilot:``-prefixed tag, which is resolved
            through the service catalog, or an image id of the form
            ``<publisher>:<offer>:<sku>:<version>``.
        region: Region the image query is sent to. ``'eastus'`` is used
            when this is None.

    Returns:
        For ``skypilot:`` tags, one of the module-level default sizes
        (chosen by the resolved offer), without querying Azure. Otherwise
        the ``sizeInGb`` value reported for the image's OS disk, or
        ``sizeInBytes`` converted to GB when ``sizeInGb`` is absent.

    Raises:
        ValueError: If the image id does not have four ``:``-separated
            parts, the image cannot be found, or no size information is
            available for it.
    """
    if region is None:
        # The region used here is only for where to send the query,
        # not the image location. Azure's image is globally available.
        region = 'eastus'

    is_skypilot_image_tag = image_id.startswith('skypilot:')
    if is_skypilot_image_tag:
        image_id = service_catalog.get_image_id_from_tag(image_id,
                                                         clouds='azure')

    image_id_splitted = image_id.split(':')
    if len(image_id_splitted) != 4:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Invalid image id: {image_id}. Expected '
                             'format: <publisher>:<offer>:<sku>:<version>')
    publisher, offer, sku, version = image_id_splitted

    if is_skypilot_image_tag:
        # Tagged images use fixed, known sizes; no API call is made.
        if offer == 'ubuntu-hpc':
            return _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB
        return _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB

    compute_client = azure.get_client('compute', cls.get_project_id())
    try:
        image = compute_client.virtual_machine_images.get(
            region, publisher, offer, sku, version)
    # The except clause must name the exception *class*. Instantiating it
    # (``ResourceNotFoundError()``) makes Python raise TypeError while
    # matching the exception, hiding the 'Image not found' error below.
    except azure.exceptions().ResourceNotFoundError as e:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Image not found: {image_id}') from e

    if image.os_disk_image is None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Retrieve image size for {image_id} failed.')

    ap = image.os_disk_image.additional_properties
    size_in_gb = ap.get('sizeInGb')
    if size_in_gb is not None:
        return float(size_in_gb)

    # Fall back to the byte count when the GB figure is not reported.
    size_in_bytes = ap.get('sizeInBytes')
    if size_in_bytes is None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Retrieve image size for {image_id} failed. '
                             f'Got additional_properties: {ap}')
    size_in_gb = size_in_bytes / (1024**3)
    return size_in_gb
+
140
184
@classmethod
141
185
def get_default_instance_type (
142
186
cls ,
@@ -149,33 +193,13 @@ def get_default_instance_type(
149
193
disk_tier = disk_tier ,
150
194
clouds = 'azure' )
151
195
152
- def _get_image_config (self , gen_version , instance_type ):
153
- # TODO(tian): images for Azure is not well organized. We should refactor
154
- # it to images.csv like AWS.
155
- # az vm image list \
156
- # --publisher microsoft-dsvm --all --output table
157
- # nvidia-driver: 535.54.03, cuda: 12.2
158
- # see: https://github.com/Azure/azhpc-images/releases/tag/ubuntu-hpc-20230803
159
- # All A100 instances is of gen2, so it will always use
160
- # the latest ubuntu-hpc:2204 image.
161
- image_config = {
162
- 'image_publisher' : 'microsoft-dsvm' ,
163
- 'image_offer' : 'ubuntu-hpc' ,
164
- 'image_sku' : '2204' ,
165
- 'image_version' : '22.04.2023080201'
166
- }
167
-
196
+ def _get_default_image_tag (self , gen_version , instance_type ) -> str :
168
197
# ubuntu-2004 v21.08.30, K80 requires image with old NVIDIA driver version
169
198
acc = self .get_accelerators_from_instance_type (instance_type )
170
199
if acc is not None :
171
200
acc_name = list (acc .keys ())[0 ]
172
201
if acc_name == 'K80' :
173
- image_config = {
174
- 'image_publisher' : 'microsoft-dsvm' ,
175
- 'image_offer' : 'ubuntu-2004' ,
176
- 'image_sku' : '2004-gen2' ,
177
- 'image_version' : '21.08.30'
178
- }
202
+ return 'skypilot:k80-ubuntu-2004'
179
203
180
204
# ubuntu-2004 v21.11.04, the previous image we used in the past for
181
205
# V1 HyperV instance before we change default image to ubuntu-hpc.
@@ -184,14 +208,13 @@ def _get_image_config(self, gen_version, instance_type):
184
208
# (Basic_A, Standard_D, ...) are V1 instance. For these instances,
185
209
# we use the previous image.
186
210
if gen_version == 'V1' :
187
- image_config = {
188
- 'image_publisher' : 'microsoft-dsvm' ,
189
- 'image_offer' : 'ubuntu-2004' ,
190
- 'image_sku' : '2004' ,
191
- 'image_version' : '21.11.04'
192
- }
211
+ return 'skypilot:v1-ubuntu-2004'
193
212
194
- return image_config
213
+ # nvidia-driver: 535.54.03, cuda: 12.2
214
+ # see: https://github.com/Azure/azhpc-images/releases/tag/ubuntu-hpc-20230803
215
+ # All A100 instances are gen2, so it will always use
216
+ # the latest ubuntu-hpc:2204 image.
217
+ return 'skypilot:gpu-ubuntu-2204'
195
218
196
219
@classmethod
197
220
def regions_with_offering (cls , instance_type : str ,
@@ -270,11 +293,31 @@ def make_deploy_resources_variables(
270
293
acc_count = str (sum (acc_dict .values ()))
271
294
else :
272
295
custom_resources = None
273
- # pylint: disable=import-outside-toplevel
274
- from sky .clouds .service_catalog import azure_catalog
275
- gen_version = azure_catalog .get_gen_version_from_instance_type (
276
- r .instance_type )
277
- image_config = self ._get_image_config (gen_version , r .instance_type )
296
+
297
+ if resources .image_id is None :
298
+ # pylint: disable=import-outside-toplevel
299
+ from sky .clouds .service_catalog import azure_catalog
300
+ gen_version = azure_catalog .get_gen_version_from_instance_type (
301
+ r .instance_type )
302
+ image_id = self ._get_default_image_tag (gen_version , r .instance_type )
303
+ else :
304
+ if None in resources .image_id :
305
+ image_id = resources .image_id [None ]
306
+ else :
307
+ assert region_name in resources .image_id , resources .image_id
308
+ image_id = resources .image_id [region_name ]
309
+ if image_id .startswith ('skypilot:' ):
310
+ image_id = service_catalog .get_image_id_from_tag (image_id ,
311
+ clouds = 'azure' )
312
+ # Already checked in resources.py
313
+ publisher , offer , sku , version = image_id .split (':' )
314
+ image_config = {
315
+ 'image_publisher' : publisher ,
316
+ 'image_offer' : offer ,
317
+ 'image_sku' : sku ,
318
+ 'image_version' : version ,
319
+ }
320
+
278
321
# Setup commands to eliminate the banner and restart sshd.
279
322
# This script will modify /etc/ssh/sshd_config and add a bash script
280
323
# into .bashrc. The bash script will restart sshd if it has not been
0 commit comments