Skip to content

Commit cf64e87

Browse files
tchatonthomas
authored andcommitted
Resolve Lightning App with remote storage (#17426)
* update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update --------- Co-authored-by: thomas <[email protected]> (cherry picked from commit 3688b64)
1 parent 24ddce0 commit cf64e87

File tree

18 files changed

+95
-106
lines changed

18 files changed

+95
-106
lines changed

.azure/app-cloud-e2e.yml

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -67,57 +67,42 @@ jobs:
6767
'App: v0_app':
6868
name: "v0_app"
6969
dir: "public"
70-
queue_type: "redis"
7170
'App: boring_app':
7271
name: "boring_app"
7372
dir: "public"
74-
queue_type: "redis"
75-
'App: boring_app / HTTP':
76-
name: "boring_app"
77-
dir: "public"
78-
queue_type: "http"
79-
'App: template_streamlit_ui':
80-
name: "template_streamlit_ui"
81-
dir: "public"
82-
queue_type: "redis"
73+
# TODO: RESOLVE ME ASAP
74+
# 'App: template_streamlit_ui':
75+
# name: "template_streamlit_ui"
76+
# dir: "public"
8377
'App: template_react_ui':
8478
name: "template_react_ui"
8579
dir: "public"
86-
queue_type: "redis"
8780
# 'App: template_jupyterlab': # TODO: clarify where these files lives
8881
# name: "template_jupyterlab"
8982
'App: installation_commands_app':
9083
name: "installation_commands_app"
9184
dir: "public"
92-
queue_type: "redis"
9385
'App: drive':
9486
name: "drive"
9587
dir: "public"
96-
queue_type: "redis"
9788
'App: payload':
9889
name: "payload"
9990
dir: "public"
100-
queue_type: "redis"
10191
'App: commands_and_api':
10292
name: "commands_and_api"
10393
dir: "public"
104-
queue_type: "redis"
10594
#'App: quick_start': # todo: consider adding back when fixed
10695
# name: "quick_start"
10796
# dir: "public"
108-
# queue_type: "redis"
10997
'App: idle_timeout':
11098
name: "idle_timeout"
11199
dir: "local"
112-
queue_type: "redis"
113100
'App: collect_failures':
114101
name: "collect_failures"
115102
dir: "local"
116-
queue_type: "redis"
117103
'App: custom_work_dependencies':
118104
name: "custom_work_dependencies"
119105
dir: "local"
120-
queue_type: "redis"
121106
timeoutInMinutes: "15"
122107
cancelTimeoutInMinutes: "1"
123108
# values: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace
@@ -135,7 +120,6 @@ jobs:
135120
HAR_LOCATION: './artifacts/hars'
136121
SLOW_MO: '50'
137122
LIGHTNING_DEBUG: '1'
138-
LIGHTNING_CLOUD_QUEUE_TYPE: $(queue_type)
139123
steps:
140124

141125
- script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)'
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
venv/

src/lightning/app/core/constants.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020

2121

2222
def get_lightning_cloud_url() -> str:
23+
# detect local development
24+
if os.getenv("VSCODE_PROXY_URI", "").startswith("http://localhost:9800"):
25+
return "http://localhost:9800"
2326
# DO NOT CHANGE!
2427
return os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai")
2528

@@ -115,17 +118,5 @@ def enable_interruptible_works() -> bool:
115118
return bool(int(os.getenv("LIGHTNING_INTERRUPTIBLE_WORKS", "0")))
116119

117120

118-
# Get Cluster Driver
119-
_CLUSTER_DRIVERS = [None, "k8s", "direct"]
120-
121-
122121
def get_cluster_driver() -> Optional[str]:
123-
value = os.getenv("LIGHTNING_CLUSTER_DRIVER", None)
124-
if value is None:
125-
if enable_interruptible_works():
126-
value = "direct"
127-
else:
128-
value = None
129-
if value not in _CLUSTER_DRIVERS:
130-
raise ValueError(f"Found {value} cluster driver. The value needs to be in {_CLUSTER_DRIVERS}.")
131-
return value
122+
return "direct"

src/lightning/app/core/queues.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -388,19 +388,30 @@ def get(self, timeout: Optional[float] = None) -> Any:
388388
if timeout is None:
389389
while True:
390390
try:
391-
return self._get()
391+
try:
392+
return self._get()
393+
except requests.exceptions.HTTPError:
394+
pass
392395
except queue.Empty:
393396
time.sleep(HTTP_QUEUE_REFRESH_INTERVAL)
394397

395398
# make one request and return the result
396399
if timeout == 0:
397-
return self._get()
400+
try:
401+
return self._get()
402+
except requests.exceptions.HTTPError:
403+
return None
398404

399405
# timeout is some value - loop until the timeout is reached
400406
start_time = time.time()
401407
while (time.time() - start_time) < timeout:
402408
try:
403-
return self._get()
409+
try:
410+
return self._get()
411+
except requests.exceptions.HTTPError:
412+
if timeout > self.default_timeout:
413+
return None
414+
raise queue.Empty
404415
except queue.Empty:
405416
# Note: In theory, there isn't a need for a sleep as the queue shouldn't
406417
# block the flow if the queue is empty.
@@ -441,8 +452,11 @@ def length(self) -> int:
441452
if not self.app_id:
442453
raise ValueError(f"App ID couldn't be extracted from the queue name: {self.name}")
443454

444-
val = self.client.get(f"/v1/{self.app_id}/{self._name_suffix}/length")
445-
return int(val.text)
455+
try:
456+
val = self.client.get(f"/v1/{self.app_id}/{self._name_suffix}/length")
457+
return int(val.text)
458+
except requests.exceptions.HTTPError:
459+
return 0
446460

447461
@staticmethod
448462
def _split_app_id_and_queue_name(queue_name: str) -> Tuple[str, str]:

src/lightning/app/runners/cloud.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@
7878
ENABLE_PULLING_STATE_ENDPOINT,
7979
ENABLE_PUSHING_STATE_ENDPOINT,
8080
get_cloud_queue_type,
81-
get_cluster_driver,
8281
get_lightning_cloud_url,
8382
LIGHTNING_CLOUD_PRINT_SPECS,
8483
SYS_CUSTOMIZATIONS_SYNC_ROOT,
@@ -874,12 +873,6 @@ def _get_env_vars(
874873
if not ENABLE_PUSHING_STATE_ENDPOINT:
875874
v1_env_vars.append(V1EnvVar(name="ENABLE_PUSHING_STATE_ENDPOINT", value="0"))
876875

877-
if get_cloud_queue_type():
878-
v1_env_vars.append(V1EnvVar(name="LIGHTNING_CLOUD_QUEUE_TYPE", value=get_cloud_queue_type()))
879-
880-
if get_cluster_driver():
881-
v1_env_vars.append(V1EnvVar(name="LIGHTNING_CLUSTER_DRIVER", value=get_cluster_driver()))
882-
883876
if enable_interruptible_works():
884877
v1_env_vars.append(
885878
V1EnvVar(

src/lightning/app/storage/path.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -427,16 +427,12 @@ def _filesystem() -> AbstractFileSystem:
427427
endpoint_url = os.getenv("LIGHTNING_BUCKET_ENDPOINT_URL", "")
428428
bucket_name = os.getenv("LIGHTNING_BUCKET_NAME", "")
429429
if endpoint_url != "" and bucket_name != "":
430-
key = os.getenv("LIGHTNING_AWS_ACCESS_KEY_ID", "")
431-
secret = os.getenv("LIGHTNING_AWS_SECRET_ACCESS_KEY", "")
432-
# TODO: Remove when updated on the platform side.
433-
if key == "" or secret == "":
434-
key = os.getenv("AWS_ACCESS_KEY_ID", "")
435-
secret = os.getenv("AWS_SECRET_ACCESS_KEY", "")
436-
if key == "" or secret == "":
437-
raise RuntimeError("missing S3 bucket credentials")
438-
439-
fs = S3FileSystem(key=key, secret=secret, use_ssl=False, client_kwargs={"endpoint_url": endpoint_url})
430+
# FIXME: Temporary fix until we remove the injection from the platform
431+
if "AWS_ACCESS_KEY_ID" in os.environ:
432+
del os.environ["AWS_ACCESS_KEY_ID"]
433+
del os.environ["AWS_SECRET_ACCESS_KEY"]
434+
435+
fs = S3FileSystem()
440436

441437
app_id = os.getenv("LIGHTNING_CLOUD_APP_ID", "")
442438
if app_id == "":

src/lightning/app/testing/testing.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,8 @@ def run_app_in_cloud(
377377
[constants.LIGHTNING_CLOUD_PROJECT_ID],
378378
)
379379

380+
admin_page.reload()
381+
380382
view_page = context.new_page()
381383
i = 1
382384
while True:
@@ -385,10 +387,10 @@ def run_app_in_cloud(
385387

386388
# wait until the app is running and openapi.json is ready
387389
if app.status.phase == V1LightningappInstanceState.RUNNING:
388-
view_page.goto(f"{app.status.url}/view")
389390
status_code = requests.get(f"{app.status.url}/openapi.json").status_code
390391
if status_code == 200:
391392
print("App is running, continuing with testing...")
393+
view_page.goto(f"{app.status.url}/view")
392394
break
393395
msg = f"Received status code {status_code} at {app.status.url!r}"
394396
elif app.status.phase not in (V1LightningappInstanceState.PENDING, V1LightningappInstanceState.NOT_STARTED):
@@ -478,6 +480,19 @@ def _delete_lightning_app(client, project_id, app_id, app_name):
478480
print(f"Failed to delete {app_name}. Exception {ex}")
479481

480482

483+
def _delete_cloud_space(client, project_id, cloud_space_id, app_name):
484+
"""Used to delete the parent cloudspace."""
485+
print(f"Deleting {app_name} id: {cloud_space_id}")
486+
try:
487+
res = client.cloud_space_service_delete_cloud_space(
488+
project_id=project_id,
489+
id=cloud_space_id,
490+
)
491+
assert res == {}
492+
except ApiException as ex:
493+
print(f"Failed to delete {app_name}. Exception {ex}")
494+
495+
481496
def delete_cloud_lightning_apps():
482497
"""Cleanup cloud apps that start with the name test-{PR_NUMBER}-{TEST_APP_NAME}.
483498
@@ -502,10 +517,16 @@ def delete_cloud_lightning_apps():
502517
if pr_number and app_name and not lit_app.name.startswith(f"test-{pr_number}-{app_name}-"):
503518
continue
504519
_delete_lightning_app(client, project_id=project_id, app_id=lit_app.id, app_name=lit_app.name)
520+
_delete_cloud_space(
521+
client, project_id=project_id, cloud_space_id=lit_app.spec.cloud_space_id, app_name=lit_app.name
522+
)
505523

506524
print("deleting apps that were created more than 1 hour ago.")
507525

508526
for lit_app in list_apps.lightningapps:
509527

510528
if lit_app.created_at < datetime.datetime.now(lit_app.created_at.tzinfo) - datetime.timedelta(hours=1):
511529
_delete_lightning_app(client, project_id=project_id, app_id=lit_app.id, app_name=lit_app.name)
530+
_delete_cloud_space(
531+
client, project_id=project_id, cloud_space_id=lit_app.spec.cloud_space_id, app_name=lit_app.name
532+
)

src/lightning/app/utilities/app_logs.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,16 @@
2727

2828
@dataclass
2929
class _LogEventLabels:
30-
app: str
31-
container: str
32-
filename: str
33-
job: str
34-
namespace: str
35-
node_name: str
36-
pod: str
30+
app: Optional[str] = None
31+
container: Optional[str] = None
32+
filename: Optional[str] = None
33+
job: Optional[str] = None
34+
namespace: Optional[str] = None
35+
node_name: Optional[str] = None
36+
pod: Optional[str] = None
37+
clusterID: Optional[str] = None
38+
component: Optional[str] = None
39+
projectID: Optional[str] = None
3740
stream: Optional[str] = None
3841

3942

src/lightning/app/utilities/packaging/cloud_compute.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ def __post_init__(self) -> None:
111111
if "gpu" not in self.name:
112112
raise ValueError("CloudCompute `interruptible=True` is supported only with GPU.")
113113

114+
# FIXME: Clean the mess on the platform side
115+
if self.name == "default" or self.name == "cpu":
116+
self.name = "cpu-small"
117+
self._internal_id = "default"
118+
114119
# TODO: Remove from the platform first.
115120
self.preemptible = self.interruptible
116121

@@ -147,7 +152,7 @@ def id(self) -> Optional[str]:
147152
return self._internal_id
148153

149154
def is_default(self) -> bool:
150-
return self.name == "default"
155+
return self.name in ("default", "cpu-small")
151156

152157
def _generate_id(self):
153158
return "default" if self.name == "default" else uuid4().hex[:7]

tests/integrations_app/public/test_v0_app.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def check_content(button_name, text_content):
5353
has_logs = False
5454
while not has_logs:
5555
for log in fetch_logs(["flow"]):
56+
print(log)
5657
if "'a': 'a', 'b': 'b'" in log:
5758
has_logs = True
5859
sleep(1)

0 commit comments

Comments
 (0)