Skip to content

feat(RHOAIENG-25241): disable ray usage metrics by default #828

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/sphinx/user-docs/cluster-configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,21 @@ requirements for creating the Ray Cluster.
documentation on building a custom image
`here <https://github.com/opendatahub-io/distributed-workloads/tree/main/images/runtime/examples>`__.

Ray Usage Statistics
--------------------

By default, Ray usage statistics collection is disabled in clusters created with the CodeFlare SDK, so no usage data is sent to Anyscale. To enable usage statistics collection, set the ``RAY_USAGE_STATS_ENABLED`` environment variable to ``1`` through the ``envs`` parameter of your cluster configuration:

.. code:: python

from codeflare_sdk import Cluster, ClusterConfiguration

cluster = Cluster(ClusterConfiguration(
name='ray-example',
namespace='default',
envs={'RAY_USAGE_STATS_ENABLED': '1'} # Enable usage statistics
))

The ``labels={"exampleLabel": "example"}`` parameter can be used to
apply additional labels to the RayCluster resource.

Expand Down
4 changes: 4 additions & 0 deletions src/codeflare_sdk/ray/cluster/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,10 @@ def __post_init__(self):
"Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
)

# Set default environment variable to disable Ray usage stats if not already set
if "RAY_USAGE_STATS_ENABLED" not in self.envs:
self.envs["RAY_USAGE_STATS_ENABLED"] = "0"

if self.enable_gcs_ft:
if not self.redis_address:
raise ValueError(
Expand Down
18 changes: 8 additions & 10 deletions src/codeflare_sdk/ray/cluster/test_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,11 +465,10 @@ def test_get_cluster_no_appwrapper(mocker):
return_value=expected_rc,
)
get_cluster("test-all-params", "ns", write_to_file=True)
assert filecmp.cmp(
f"{aw_dir}test-all-params.yaml",
f"{expected_clusters_dir}/ray/unit-test-all-params.yaml",
shallow=True,
)

with open(f"{aw_dir}test-all-params.yaml") as f:
generated_rc = yaml.load(f, Loader=yaml.FullLoader)
assert generated_rc == expected_rc


def test_get_cluster_with_appwrapper(mocker):
Expand All @@ -487,11 +486,10 @@ def test_get_cluster_with_appwrapper(mocker):
return_value=expected_aw,
)
get_cluster("aw-all-params", "ns", write_to_file=True)
assert filecmp.cmp(
f"{aw_dir}aw-all-params.yaml",
f"{expected_clusters_dir}/appwrapper/unit-test-all-params.yaml",
shallow=True,
)

with open(f"{aw_dir}aw-all-params.yaml") as f:
generated_aw = yaml.load(f, Loader=yaml.FullLoader)
assert generated_aw == expected_aw


def test_wait_ready(mocker, capsys):
Expand Down
47 changes: 46 additions & 1 deletion src/codeflare_sdk/ray/cluster/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import filecmp
import pytest
import os
import yaml

parent = Path(__file__).resolve().parents[4] # project directory
expected_clusters_dir = f"{parent}/tests/test_cluster_yamls"
Expand Down Expand Up @@ -85,7 +86,11 @@ def test_config_creation_all_parameters(mocker):
assert cluster.config.worker_memory_requests == "12G"
assert cluster.config.worker_memory_limits == "16G"
assert cluster.config.appwrapper == False
assert cluster.config.envs == {"key1": "value1", "key2": "value2"}
assert cluster.config.envs == {
"key1": "value1",
"key2": "value2",
"RAY_USAGE_STATS_ENABLED": "0",
}
assert cluster.config.image == "example/ray:tag"
assert cluster.config.image_pull_secrets == ["secret1", "secret2"]
assert cluster.config.write_to_file == True
Expand Down Expand Up @@ -206,6 +211,46 @@ def test_gcs_fault_tolerance_config_validation():
)


def test_ray_usage_stats_default(mocker):
    """Usage stats default to "0" when the caller passes no ``envs``.

    Checks both the resulting configuration and the head container of the
    generated resource.
    """
    # Replace the Kubernetes client calls with mocks.
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")

    config = ClusterConfiguration(name="default-usage-stats-cluster", namespace="ns")
    cluster = Cluster(config)

    # The configuration carries the opt-out value.
    assert cluster.config.envs["RAY_USAGE_STATS_ENABLED"] == "0"

    # The same value must appear in the head container's env list.
    head_pod_spec = cluster.resource_yaml["spec"]["headGroupSpec"]["template"]["spec"]
    head_env = {}
    for entry in head_pod_spec["containers"][0]["env"]:
        head_env[entry["name"]] = entry["value"]
    assert head_env["RAY_USAGE_STATS_ENABLED"] == "0"


def test_ray_usage_stats_enabled(mocker):
    """An explicit ``RAY_USAGE_STATS_ENABLED`` value in ``envs`` is kept as given.

    Checks both the resulting configuration and the head container of the
    generated resource.
    """
    # Replace the Kubernetes client calls with mocks.
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")

    config = ClusterConfiguration(
        name="usage-stats-enabled-cluster",
        namespace="ns",
        envs={"RAY_USAGE_STATS_ENABLED": "1"},
    )
    cluster = Cluster(config)

    # The user-supplied value is still present on the configuration.
    assert cluster.config.envs["RAY_USAGE_STATS_ENABLED"] == "1"

    # The same value must appear in the head container's env list.
    head_pod_spec = cluster.resource_yaml["spec"]["headGroupSpec"]["template"]["spec"]
    head_env = {}
    for entry in head_pod_spec["containers"][0]["env"]:
        head_env[entry["name"]] = entry["value"]
    assert head_env["RAY_USAGE_STATS_ENABLED"] == "1"


# Make sure to always keep this function last
def test_cleanup():
os.remove(f"{aw_dir}test-all-params.yaml")
Expand Down
4 changes: 4 additions & 0 deletions tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ spec:
value: value1
- name: key2
value: value2
- name: RAY_USAGE_STATS_ENABLED
value: '0'
image: example/ray:tag
imagePullPolicy: Always
lifecycle:
Expand Down Expand Up @@ -159,6 +161,8 @@ spec:
value: value1
- name: key2
value: value2
- name: RAY_USAGE_STATS_ENABLED
value: '0'
image: example/ray:tag
imagePullPolicy: Always
lifecycle:
Expand Down
6 changes: 6 additions & 0 deletions tests/test_cluster_yamls/kueue/aw_kueue.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ spec:
- mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
subPath: odh-ca-bundle.crt
env:
- name: RAY_USAGE_STATS_ENABLED
value: '0'
volumes:
- configMap:
items:
Expand Down Expand Up @@ -133,6 +136,9 @@ spec:
- mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
subPath: odh-ca-bundle.crt
env:
- name: RAY_USAGE_STATS_ENABLED
value: '0'
volumes:
- configMap:
items:
Expand Down
6 changes: 6 additions & 0 deletions tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ spec:
- mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
subPath: odh-ca-bundle.crt
env:
- name: RAY_USAGE_STATS_ENABLED
value: '0'
volumes:
- configMap:
items:
Expand Down Expand Up @@ -133,6 +136,9 @@ spec:
- mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
subPath: odh-ca-bundle.crt
env:
- name: RAY_USAGE_STATS_ENABLED
value: '0'
volumes:
- configMap:
items:
Expand Down
6 changes: 6 additions & 0 deletions tests/test_cluster_yamls/ray/default-appwrapper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ spec:
name: dashboard
- containerPort: 10001
name: client
env:
- name: RAY_USAGE_STATS_ENABLED
value: '0'
resources:
limits:
cpu: 2
Expand Down Expand Up @@ -111,6 +114,9 @@ spec:
- -c
- ray stop
name: machine-learning
env:
- name: RAY_USAGE_STATS_ENABLED
value: '0'
resources:
limits:
cpu: 1
Expand Down
6 changes: 6 additions & 0 deletions tests/test_cluster_yamls/ray/default-ray-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ spec:
name: dashboard
- containerPort: 10001
name: client
env:
- name: RAY_USAGE_STATS_ENABLED
value: '0'
resources:
limits:
cpu: 2
Expand Down Expand Up @@ -110,6 +113,9 @@ spec:
requests:
cpu: 1
memory: 2G
env:
- name: RAY_USAGE_STATS_ENABLED
value: '0'
volumeMounts:
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
Expand Down
4 changes: 4 additions & 0 deletions tests/test_cluster_yamls/ray/unit-test-all-params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ spec:
value: value1
- name: key2
value: value2
- name: RAY_USAGE_STATS_ENABLED
value: '0'
image: example/ray:tag
imagePullPolicy: Always
lifecycle:
Expand Down Expand Up @@ -150,6 +152,8 @@ spec:
value: value1
- name: key2
value: value2
- name: RAY_USAGE_STATS_ENABLED
value: '0'
image: example/ray:tag
imagePullPolicy: Always
lifecycle:
Expand Down
Loading