Skip to content

Commit de6e60f

Browse files
tchatonBorda
authored andcommitted
Resolve Lightning App with remote storage (#17426)
--------- Co-authored-by: thomas <[email protected]> (cherry picked from commit 3688b64) # Conflicts: # examples/app_commands_and_api/.lightningignore # tests/tests_app/core/test_constants.py # tests/tests_app/runners/test_cloud.py
1 parent 60a98de commit de6e60f

File tree

18 files changed

+94
-105
lines changed

18 files changed

+94
-105
lines changed

.azure/app-cloud-e2e.yml

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -60,57 +60,42 @@ jobs:
6060
'App: v0_app':
6161
name: "v0_app"
6262
dir: "public"
63-
queue_type: "redis"
6463
'App: boring_app':
6564
name: "boring_app"
6665
dir: "public"
67-
queue_type: "redis"
68-
'App: boring_app / HTTP':
69-
name: "boring_app"
70-
dir: "public"
71-
queue_type: "http"
72-
'App: template_streamlit_ui':
73-
name: "template_streamlit_ui"
74-
dir: "public"
75-
queue_type: "redis"
66+
# TODO: RESOLVE ME ASAP
67+
# 'App: template_streamlit_ui':
68+
# name: "template_streamlit_ui"
69+
# dir: "public"
7670
'App: template_react_ui':
7771
name: "template_react_ui"
7872
dir: "public"
79-
queue_type: "redis"
8073
# 'App: template_jupyterlab': # TODO: clarify where these files lives
8174
# name: "template_jupyterlab"
8275
'App: installation_commands_app':
8376
name: "installation_commands_app"
8477
dir: "public"
85-
queue_type: "redis"
8678
'App: drive':
8779
name: "drive"
8880
dir: "public"
89-
queue_type: "redis"
9081
'App: payload':
9182
name: "payload"
9283
dir: "public"
93-
queue_type: "redis"
9484
'App: commands_and_api':
9585
name: "commands_and_api"
9686
dir: "public"
97-
queue_type: "redis"
9887
#'App: quick_start': # todo: consider adding back when fixed
9988
# name: "quick_start"
10089
# dir: "public"
101-
# queue_type: "redis"
10290
'App: idle_timeout':
10391
name: "idle_timeout"
10492
dir: "local"
105-
queue_type: "redis"
10693
'App: collect_failures':
10794
name: "collect_failures"
10895
dir: "local"
109-
queue_type: "redis"
11096
'App: custom_work_dependencies':
11197
name: "custom_work_dependencies"
11298
dir: "local"
113-
queue_type: "redis"
11499
timeoutInMinutes: "15"
115100
cancelTimeoutInMinutes: "1"
116101
# values: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace
@@ -127,7 +112,6 @@ jobs:
127112
HAR_LOCATION: './artifacts/hars'
128113
SLOW_MO: '50'
129114
LIGHTNING_DEBUG: '1'
130-
LIGHTNING_CLOUD_QUEUE_TYPE: $(queue_type)
131115
steps:
132116

133117
- script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)'
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
venv/

src/lightning_app/core/constants.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020

2121

2222
def get_lightning_cloud_url() -> str:
23+
# detect local development
24+
if os.getenv("VSCODE_PROXY_URI", "").startswith("http://localhost:9800"):
25+
return "http://localhost:9800"
2326
# DO NOT CHANGE!
2427
return os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai")
2528

@@ -114,17 +117,5 @@ def enable_interruptible_works() -> bool:
114117
return bool(int(os.getenv("LIGHTNING_INTERRUPTIBLE_WORKS", "0")))
115118

116119

117-
# Get Cluster Driver
118-
_CLUSTER_DRIVERS = [None, "k8s", "direct"]
119-
120-
121120
def get_cluster_driver() -> Optional[str]:
122-
value = os.getenv("LIGHTNING_CLUSTER_DRIVER", None)
123-
if value is None:
124-
if enable_interruptible_works():
125-
value = "direct"
126-
else:
127-
value = None
128-
if value not in _CLUSTER_DRIVERS:
129-
raise ValueError(f"Found {value} cluster driver. The value needs to be in {_CLUSTER_DRIVERS}.")
130-
return value
121+
return "direct"

src/lightning_app/core/queues.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -388,19 +388,30 @@ def get(self, timeout: Optional[float] = None) -> Any:
388388
if timeout is None:
389389
while True:
390390
try:
391-
return self._get()
391+
try:
392+
return self._get()
393+
except requests.exceptions.HTTPError:
394+
pass
392395
except queue.Empty:
393396
time.sleep(HTTP_QUEUE_REFRESH_INTERVAL)
394397

395398
# make one request and return the result
396399
if timeout == 0:
397-
return self._get()
400+
try:
401+
return self._get()
402+
except requests.exceptions.HTTPError:
403+
return None
398404

399405
# timeout is some value - loop until the timeout is reached
400406
start_time = time.time()
401407
while (time.time() - start_time) < timeout:
402408
try:
403-
return self._get()
409+
try:
410+
return self._get()
411+
except requests.exceptions.HTTPError:
412+
if timeout > self.default_timeout:
413+
return None
414+
raise queue.Empty
404415
except queue.Empty:
405416
# Note: In theory, there isn't a need for a sleep as the queue shouldn't
406417
# block the flow if the queue is empty.
@@ -441,8 +452,11 @@ def length(self) -> int:
441452
if not self.app_id:
442453
raise ValueError(f"App ID couldn't be extracted from the queue name: {self.name}")
443454

444-
val = self.client.get(f"/v1/{self.app_id}/{self._name_suffix}/length")
445-
return int(val.text)
455+
try:
456+
val = self.client.get(f"/v1/{self.app_id}/{self._name_suffix}/length")
457+
return int(val.text)
458+
except requests.exceptions.HTTPError:
459+
return 0
446460

447461
@staticmethod
448462
def _split_app_id_and_queue_name(queue_name: str) -> Tuple[str, str]:

src/lightning_app/runners/cloud.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@
7777
ENABLE_PULLING_STATE_ENDPOINT,
7878
ENABLE_PUSHING_STATE_ENDPOINT,
7979
get_cloud_queue_type,
80-
get_cluster_driver,
8180
get_lightning_cloud_url,
8281
LIGHTNING_CLOUD_PRINT_SPECS,
8382
SYS_CUSTOMIZATIONS_SYNC_ROOT,
@@ -858,12 +857,6 @@ def _get_env_vars(
858857
if not ENABLE_PUSHING_STATE_ENDPOINT:
859858
v1_env_vars.append(V1EnvVar(name="ENABLE_PUSHING_STATE_ENDPOINT", value="0"))
860859

861-
if get_cloud_queue_type():
862-
v1_env_vars.append(V1EnvVar(name="LIGHTNING_CLOUD_QUEUE_TYPE", value=get_cloud_queue_type()))
863-
864-
if get_cluster_driver():
865-
v1_env_vars.append(V1EnvVar(name="LIGHTNING_CLUSTER_DRIVER", value=get_cluster_driver()))
866-
867860
if enable_interruptible_works():
868861
v1_env_vars.append(
869862
V1EnvVar(

src/lightning_app/storage/path.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -427,16 +427,12 @@ def _filesystem() -> AbstractFileSystem:
427427
endpoint_url = os.getenv("LIGHTNING_BUCKET_ENDPOINT_URL", "")
428428
bucket_name = os.getenv("LIGHTNING_BUCKET_NAME", "")
429429
if endpoint_url != "" and bucket_name != "":
430-
key = os.getenv("LIGHTNING_AWS_ACCESS_KEY_ID", "")
431-
secret = os.getenv("LIGHTNING_AWS_SECRET_ACCESS_KEY", "")
432-
# TODO: Remove when updated on the platform side.
433-
if key == "" or secret == "":
434-
key = os.getenv("AWS_ACCESS_KEY_ID", "")
435-
secret = os.getenv("AWS_SECRET_ACCESS_KEY", "")
436-
if key == "" or secret == "":
437-
raise RuntimeError("missing S3 bucket credentials")
438-
439-
fs = S3FileSystem(key=key, secret=secret, use_ssl=False, client_kwargs={"endpoint_url": endpoint_url})
430+
# FIXME: Temporary fix until we remove the injection from the platform
431+
if "AWS_ACCESS_KEY_ID" in os.environ:
432+
del os.environ["AWS_ACCESS_KEY_ID"]
433+
del os.environ["AWS_SECRET_ACCESS_KEY"]
434+
435+
fs = S3FileSystem()
440436

441437
app_id = os.getenv("LIGHTNING_CLOUD_APP_ID", "")
442438
if app_id == "":

src/lightning_app/testing/testing.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,8 @@ def run_app_in_cloud(
380380
[constants.LIGHTNING_CLOUD_PROJECT_ID],
381381
)
382382

383+
admin_page.reload()
384+
383385
view_page = context.new_page()
384386
i = 1
385387
while True:
@@ -388,10 +390,10 @@ def run_app_in_cloud(
388390

389391
# wait until the app is running and openapi.json is ready
390392
if app.status.phase == V1LightningappInstanceState.RUNNING:
391-
view_page.goto(f"{app.status.url}/view")
392393
status_code = requests.get(f"{app.status.url}/openapi.json").status_code
393394
if status_code == 200:
394395
print("App is running, continuing with testing...")
396+
view_page.goto(f"{app.status.url}/view")
395397
break
396398
msg = f"Received status code {status_code} at {app.status.url!r}"
397399
elif app.status.phase not in (V1LightningappInstanceState.PENDING, V1LightningappInstanceState.NOT_STARTED):
@@ -481,6 +483,19 @@ def _delete_lightning_app(client, project_id, app_id, app_name):
481483
print(f"Failed to delete {app_name}. Exception {ex}")
482484

483485

486+
def _delete_cloud_space(client, project_id, cloud_space_id, app_name):
487+
"""Used to delete the parent cloudspace."""
488+
print(f"Deleting {app_name} id: {cloud_space_id}")
489+
try:
490+
res = client.cloud_space_service_delete_cloud_space(
491+
project_id=project_id,
492+
id=cloud_space_id,
493+
)
494+
assert res == {}
495+
except ApiException as ex:
496+
print(f"Failed to delete {app_name}. Exception {ex}")
497+
498+
484499
def delete_cloud_lightning_apps():
485500
"""Cleanup cloud apps that start with the name test-{PR_NUMBER}-{TEST_APP_NAME}.
486501
@@ -505,10 +520,16 @@ def delete_cloud_lightning_apps():
505520
if pr_number and app_name and not lit_app.name.startswith(f"test-{pr_number}-{app_name}-"):
506521
continue
507522
_delete_lightning_app(client, project_id=project_id, app_id=lit_app.id, app_name=lit_app.name)
523+
_delete_cloud_space(
524+
client, project_id=project_id, cloud_space_id=lit_app.spec.cloud_space_id, app_name=lit_app.name
525+
)
508526

509527
print("deleting apps that were created more than 1 hour ago.")
510528

511529
for lit_app in list_apps.lightningapps:
512530

513531
if lit_app.created_at < datetime.datetime.now(lit_app.created_at.tzinfo) - datetime.timedelta(hours=1):
514532
_delete_lightning_app(client, project_id=project_id, app_id=lit_app.id, app_name=lit_app.name)
533+
_delete_cloud_space(
534+
client, project_id=project_id, cloud_space_id=lit_app.spec.cloud_space_id, app_name=lit_app.name
535+
)

src/lightning_app/utilities/app_logs.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,16 @@
2727

2828
@dataclass
2929
class _LogEventLabels:
30-
app: str
31-
container: str
32-
filename: str
33-
job: str
34-
namespace: str
35-
node_name: str
36-
pod: str
30+
app: Optional[str] = None
31+
container: Optional[str] = None
32+
filename: Optional[str] = None
33+
job: Optional[str] = None
34+
namespace: Optional[str] = None
35+
node_name: Optional[str] = None
36+
pod: Optional[str] = None
37+
clusterID: Optional[str] = None
38+
component: Optional[str] = None
39+
projectID: Optional[str] = None
3740
stream: Optional[str] = None
3841

3942

src/lightning_app/utilities/packaging/cloud_compute.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ def __post_init__(self) -> None:
111111
if "gpu" not in self.name:
112112
raise ValueError("CloudCompute `interruptible=True` is supported only with GPU.")
113113

114+
# FIXME: Clean the mess on the platform side
115+
if self.name == "default" or self.name == "cpu":
116+
self.name = "cpu-small"
117+
self._internal_id = "default"
118+
114119
# TODO: Remove from the platform first.
115120
self.preemptible = self.interruptible
116121

@@ -147,7 +152,7 @@ def id(self) -> Optional[str]:
147152
return self._internal_id
148153

149154
def is_default(self) -> bool:
150-
return self.name == "default"
155+
return self.name in ("default", "cpu-small")
151156

152157
def _generate_id(self):
153158
return "default" if self.name == "default" else uuid4().hex[:7]

tests/integrations_app/public/test_v0_app.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def check_content(button_name, text_content):
5353
has_logs = False
5454
while not has_logs:
5555
for log in fetch_logs(["flow"]):
56+
print(log)
5657
if "'a': 'a', 'b': 'b'" in log:
5758
has_logs = True
5859
sleep(1)

0 commit comments

Comments
 (0)