feat: Support custom service account for Ray cluster creation and Ray Client connection

yinghsienwu · copybara-github · commit e0c6227d0dd9 · 2024-05-08T19:23:46.000-07:00
PiperOrigin-RevId: 631998839
diff --git a/google/cloud/aiplatform/preview/vertex_ray/client_builder.py b/google/cloud/aiplatform/preview/vertex_ray/client_builder.py
@@ -98,8 +98,21 @@ def __init__(self, address: Optional[str]) -> None:
         public_address = self.response.resource_runtime.access_uris.get(
             "RAY_CLIENT_ENDPOINT"
         )
+        service_account = (
+            self.response.resource_runtime_spec.service_account_spec.service_account
+        )
+
         if public_address is None:
             address = private_address
+            if service_account:
+                raise ValueError(
+                    "[Ray on Vertex AI]: Ray Cluster ",
+                    address,
+                    " failed to start Head node properly because custom service"
+                    " account isn't supported in peered VPC network. Use public"
+                    " endpoint instead (create a cluster without specifying"
+                    " VPC network).",
+                )
         else:
             address = public_address
 
@@ -110,17 +123,7 @@ def __init__(self, address: Optional[str]) -> None:
                 persistent_resource_id,
                 " Head node is not reachable. Please ensure that a valid VPC network has been specified.",
             )
-        # Handling service_account
-        service_account = (
-            self.response.resource_runtime_spec.service_account_spec.service_account
-        )
 
-        if service_account:
-            raise ValueError(
-                "[Ray on Vertex AI]: Ray Cluster ",
-                address,
-                " failed to start Head node properly because custom service account isn't supported.",
-            )
         logging.debug("[Ray on Vertex AI]: Resolved head node ip: %s", address)
         cluster = _gapic_utils.persistent_resource_to_cluster(
             persistent_resource=self.response
diff --git a/google/cloud/aiplatform/preview/vertex_ray/cluster_init.py b/google/cloud/aiplatform/preview/vertex_ray/cluster_init.py
@@ -32,6 +32,7 @@
     RayMetricSpec,
     ResourcePool,
     ResourceRuntimeSpec,
+    ServiceAccountSpec,
 )
 
 from google.cloud.aiplatform.preview.vertex_ray.util import (
@@ -48,6 +49,7 @@ def create_ray_cluster(
     python_version: Optional[str] = "3.10",
     ray_version: Optional[str] = "2.9",
     network: Optional[str] = None,
+    service_account: Optional[str] = None,
     cluster_name: Optional[str] = None,
     worker_node_types: Optional[List[resources.Resources]] = None,
     custom_images: Optional[resources.NodeImages] = None,
@@ -78,7 +80,9 @@ def create_ray_cluster(
 
     cluster_resource_name = vertex_ray.create_ray_cluster(
         head_node_type=head_node_type,
-        network="projects/my-project-number/global/networks/my-vpc-name",
+        network="projects/my-project-number/global/networks/my-vpc-name",  # Optional
+        service_account="my-service-account@my-project-number.iam.gserviceaccount.com",  # Optional
+        cluster_name="my-cluster-name",  # Optional
         worker_node_types=worker_node_types,
         ray_version="2.9",
     )
@@ -100,6 +104,8 @@ def create_ray_cluster(
             Vertex API service. For Ray Job API, VPC network is not required
             because Ray Cluster connection can be accessed through dashboard
             address.
+        service_account: Service account to be used for running Ray programs on
+            the cluster.
         cluster_name: This value may be up to 63 characters, and valid
             characters are `[a-z0-9_-]`. The first character cannot be a number
             or hyphen.
@@ -254,7 +260,17 @@ def create_ray_cluster(
     ray_spec = RaySpec(
         resource_pool_images=resource_pool_images, ray_metric_spec=ray_metric_spec
     )
-    resource_runtime_spec = ResourceRuntimeSpec(ray_spec=ray_spec)
+    if service_account:
+        service_account_spec = ServiceAccountSpec(
+            enable_custom_service_account=True,
+            service_account=service_account,
+        )
+        resource_runtime_spec = ResourceRuntimeSpec(
+            ray_spec=ray_spec,
+            service_account_spec=service_account_spec,
+        )
+    else:
+        resource_runtime_spec = ResourceRuntimeSpec(ray_spec=ray_spec)
     persistent_resource = PersistentResource(
         resource_pools=resource_pools,
         network=network,
diff --git a/google/cloud/aiplatform/preview/vertex_ray/util/_gapic_utils.py b/google/cloud/aiplatform/preview/vertex_ray/util/_gapic_utils.py
@@ -166,7 +166,10 @@ def persistent_resource_to_cluster(
     head_image_uri = (
         persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[head_id]
     )
-
+    if persistent_resource.resource_runtime_spec.service_account_spec.service_account:
+        cluster.service_account = (
+            persistent_resource.resource_runtime_spec.service_account_spec.service_account
+        )
     if not head_image_uri:
         head_image_uri = persistent_resource.resource_runtime_spec.ray_spec.image_uri
 
diff --git a/google/cloud/aiplatform/preview/vertex_ray/util/resources.py b/google/cloud/aiplatform/preview/vertex_ray/util/resources.py
@@ -41,7 +41,7 @@ class Resources:
             us-docker.pkg.dev/my-project/ray-gpu.2-9.py310-tf:latest).
     """
 
-    machine_type: Optional[str] = "n1-standard-8"
+    machine_type: Optional[str] = "n1-standard-16"
     node_count: Optional[int] = 1
     accelerator_type: Optional[str] = None
     accelerator_count: Optional[int] = 0
@@ -81,6 +81,8 @@ class Cluster:
             managed in the Vertex API service. For Ray Job API, VPC network is
             not required because cluster connection can be accessed through
             dashboard address.
+        service_account: Service account to be used for running Ray programs on
+            the cluster.
         state: Describes the cluster state (defined in PersistentResource.State).
         python_version: Python version for the ray cluster (e.g. "3.10").
         ray_version: Ray version for the ray cluster (e.g. "2.4").
@@ -102,6 +104,7 @@ class Cluster:
 
     cluster_resource_name: str = None
     network: str = None
+    service_account: str = None
     state: PersistentResource.State = None
     python_version: str = None
     ray_version: str = None
diff --git a/tests/unit/vertex_ray/test_cluster_init.py b/tests/unit/vertex_ray/test_cluster_init.py
@@ -92,6 +92,34 @@ def get_persistent_resource_1_pool_custom_image_mock():
         yield get_persistent_resource_1_pool_custom_image_mock
 
 
+@pytest.fixture
+def create_persistent_resource_1_pool_byosa_mock():
+    with mock.patch.object(
+        PersistentResourceServiceClient,
+        "create_persistent_resource",
+    ) as create_persistent_resource_1_pool_byosa_mock:
+        create_persistent_resource_lro_mock = mock.Mock(ga_operation.Operation)
+        create_persistent_resource_lro_mock.result.return_value = (
+            tc.ClusterConstants.TEST_RESPONSE_RUNNING_1_POOL_BYOSA
+        )
+        create_persistent_resource_1_pool_byosa_mock.return_value = (
+            create_persistent_resource_lro_mock
+        )
+        yield create_persistent_resource_1_pool_byosa_mock
+
+
+@pytest.fixture
+def get_persistent_resource_1_pool_byosa_mock():
+    with mock.patch.object(
+        PersistentResourceServiceClient,
+        "get_persistent_resource",
+    ) as get_persistent_resource_1_pool_byosa_mock:
+        get_persistent_resource_1_pool_byosa_mock.return_value = (
+            tc.ClusterConstants.TEST_RESPONSE_RUNNING_1_POOL_BYOSA
+        )
+        yield get_persistent_resource_1_pool_byosa_mock
+
+
 @pytest.fixture
 def create_persistent_resource_2_pools_mock():
     with mock.patch.object(
@@ -426,6 +454,30 @@ def test_create_ray_cluster_initialized_success(
             ]
         )
 
+    @pytest.mark.usefixtures("get_persistent_resource_1_pool_byosa_mock")
+    def test_create_ray_cluster_byosa_success(
+        self, create_persistent_resource_1_pool_byosa_mock
+    ):
+        """A custom service account is passed through in the create request."""
+        cluster_name = vertex_ray.create_ray_cluster(
+            head_node_type=tc.ClusterConstants.TEST_HEAD_NODE_TYPE_1_POOL,
+            worker_node_types=tc.ClusterConstants.TEST_WORKER_NODE_TYPES_1_POOL,
+            service_account=tc.ProjectConstants.TEST_SERVICE_ACCOUNT,
+            cluster_name=tc.ClusterConstants.TEST_VERTEX_RAY_PR_ID,
+        )
+
+        assert tc.ClusterConstants.TEST_VERTEX_RAY_PR_ADDRESS == cluster_name
+
+        request = persistent_resource_service.CreatePersistentResourceRequest(
+            parent=tc.ProjectConstants.TEST_PARENT,
+            persistent_resource=tc.ClusterConstants.TEST_REQUEST_RUNNING_1_POOL_BYOSA,
+            persistent_resource_id=tc.ClusterConstants.TEST_VERTEX_RAY_PR_ID,
+        )
+
+        create_persistent_resource_1_pool_byosa_mock.assert_called_with(
+            request,
+        )
+
     def test_create_ray_cluster_head_multinode_error(self):
         with pytest.raises(ValueError) as e:
             vertex_ray.create_ray_cluster(
@@ -508,6 +560,16 @@ def test_get_ray_cluster_with_custom_image_success(
         get_persistent_resource_2_pools_custom_image_mock.assert_called_once()
         cluster_eq(cluster, tc.ClusterConstants.TEST_CLUSTER_CUSTOM_IMAGE)
 
+    def test_get_ray_cluster_byosa_success(
+        self, get_persistent_resource_1_pool_byosa_mock
+    ):
+        cluster = vertex_ray.get_ray_cluster(
+            cluster_resource_name=tc.ClusterConstants.TEST_VERTEX_RAY_PR_ADDRESS
+        )
+
+        get_persistent_resource_1_pool_byosa_mock.assert_called_once()
+        cluster_eq(cluster, tc.ClusterConstants.TEST_CLUSTER_BYOSA)
+
     @pytest.mark.usefixtures("get_persistent_resource_exception_mock")
     def test_get_ray_cluster_error(self):
         with pytest.raises(ValueError) as e:
diff --git a/tests/unit/vertex_ray/test_constants.py b/tests/unit/vertex_ray/test_constants.py
@@ -16,6 +16,7 @@
 #
 
 import dataclasses
+import sys
 
 from google.cloud.aiplatform.preview.vertex_ray.util.resources import Cluster
 from google.cloud.aiplatform.preview.vertex_ray.util.resources import (
@@ -28,10 +29,10 @@
 from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
     PersistentResource,
 )
-from google.cloud.aiplatform_v1beta1.types.persistent_resource import RaySpec
 from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
     RayMetricSpec,
 )
+from google.cloud.aiplatform_v1beta1.types.persistent_resource import RaySpec
 from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
     ResourcePool,
 )
@@ -41,9 +42,11 @@
 from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
     ResourceRuntimeSpec,
 )
-
+from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
+    ServiceAccountSpec,
+)
 import pytest
-import sys
+
 
 rovminversion = pytest.mark.skipif(
     sys.version_info > (3, 10), reason="Requires python3.10 or lower"
@@ -67,6 +70,7 @@ class ProjectConstants:
     TEST_MODEL_ID = (
         f"projects/{TEST_GCP_PROJECT_NUMBER}/locations/{TEST_GCP_REGION}/models/456"
     )
+    TEST_SERVICE_ACCOUNT = "service-account@project.iam.gserviceaccount.com"
 
 
 @dataclasses.dataclass(frozen=True)
@@ -79,6 +83,9 @@ class ClusterConstants:
     TEST_VERTEX_RAY_DASHBOARD_ADDRESS = (
         "48b400ad90b8dd3c-dot-us-central1.aiplatform-training.googleusercontent.com"
     )
+    TEST_VERTEX_RAY_CLIENT_ENDPOINT = (
+        "88888.us-central1-1234567.staging-ray.vertexai.goog:443"
+    )
     TEST_VERTEX_RAY_PR_ID = "user-persistent-resource-1234567890"
     TEST_VERTEX_RAY_PR_ADDRESS = (
         f"{ProjectConstants.TEST_PARENT}/persistentResources/" + TEST_VERTEX_RAY_PR_ID
@@ -106,7 +113,7 @@ class ClusterConstants:
     TEST_RESOURCE_POOL_0 = ResourcePool(
         id="head-node",
         machine_spec=MachineSpec(
-            machine_type="n1-standard-8",
+            machine_type="n1-standard-16",
             accelerator_type="NVIDIA_TESLA_P100",
             accelerator_count=1,
         ),
@@ -147,6 +154,20 @@ class ClusterConstants:
         ),
         network=ProjectConstants.TEST_VPC_NETWORK,
     )
+    TEST_REQUEST_RUNNING_1_POOL_BYOSA = PersistentResource(
+        resource_pools=[TEST_RESOURCE_POOL_0],
+        resource_runtime_spec=ResourceRuntimeSpec(
+            ray_spec=RaySpec(
+                resource_pool_images={"head-node": TEST_GPU_IMAGE},
+                ray_metric_spec=RayMetricSpec(disabled=False),
+            ),
+            service_account_spec=ServiceAccountSpec(
+                enable_custom_service_account=True,
+                service_account=ProjectConstants.TEST_SERVICE_ACCOUNT,
+            ),
+        ),
+        network=None,
+    )
     # Get response has generated name, and URIs
     TEST_RESPONSE_RUNNING_1_POOL = PersistentResource(
         name=TEST_VERTEX_RAY_PR_ADDRESS,
@@ -185,6 +206,50 @@ class ClusterConstants:
         ),
         state="RUNNING",
     )
+    TEST_RESPONSE_RUNNING_1_POOL_BYOSA = PersistentResource(
+        name=TEST_VERTEX_RAY_PR_ADDRESS,
+        resource_pools=[TEST_RESOURCE_POOL_0],
+        resource_runtime_spec=ResourceRuntimeSpec(
+            ray_spec=RaySpec(
+                resource_pool_images={"head-node": TEST_GPU_IMAGE},
+                ray_metric_spec=RayMetricSpec(disabled=False),
+            ),
+            service_account_spec=ServiceAccountSpec(
+                enable_custom_service_account=True,
+                service_account=ProjectConstants.TEST_SERVICE_ACCOUNT,
+            ),
+        ),
+        network=None,
+        resource_runtime=ResourceRuntime(
+            access_uris={
+                "RAY_DASHBOARD_URI": TEST_VERTEX_RAY_DASHBOARD_ADDRESS,
+                "RAY_CLIENT_ENDPOINT": TEST_VERTEX_RAY_CLIENT_ENDPOINT,
+            }
+        ),
+        state="RUNNING",
+    )
+    TEST_RESPONSE_1_POOL_BYOSA_PRIVATE = PersistentResource(
+        name=TEST_VERTEX_RAY_PR_ADDRESS,
+        resource_pools=[TEST_RESOURCE_POOL_0],
+        resource_runtime_spec=ResourceRuntimeSpec(
+            ray_spec=RaySpec(
+                resource_pool_images={"head-node": TEST_GPU_IMAGE},
+                ray_metric_spec=RayMetricSpec(disabled=False),
+            ),
+            service_account_spec=ServiceAccountSpec(
+                enable_custom_service_account=True,
+                service_account=ProjectConstants.TEST_SERVICE_ACCOUNT,
+            ),
+        ),
+        network=ProjectConstants.TEST_VPC_NETWORK,
+        resource_runtime=ResourceRuntime(
+            access_uris={
+                "RAY_DASHBOARD_URI": TEST_VERTEX_RAY_DASHBOARD_ADDRESS,
+                "RAY_CLIENT_ENDPOINT": TEST_VERTEX_RAY_CLIENT_ENDPOINT,
+            }
+        ),
+        state="RUNNING",
+    )
     # 2_POOL: worker_node_types and head_node_type have different MachineSpecs
     TEST_HEAD_NODE_TYPE_2_POOLS = Resources()
     TEST_WORKER_NODE_TYPES_2_POOLS = [
@@ -208,7 +273,7 @@ class ClusterConstants:
     TEST_RESOURCE_POOL_1 = ResourcePool(
         id="head-node",
         machine_spec=MachineSpec(
-            machine_type="n1-standard-8",
+            machine_type="n1-standard-16",
         ),
         disk_spec=DiskSpec(
             boot_disk_type="pd-ssd",
@@ -302,6 +367,7 @@ class ClusterConstants:
         python_version="3.10",
         ray_version="2.9",
         network=ProjectConstants.TEST_VPC_NETWORK,
+        service_account=None,
         state="RUNNING",
         head_node_type=TEST_HEAD_NODE_TYPE_1_POOL,
         worker_node_types=TEST_WORKER_NODE_TYPES_1_POOL,
@@ -312,6 +378,7 @@ class ClusterConstants:
         python_version="3.10",
         ray_version="2.9",
         network=ProjectConstants.TEST_VPC_NETWORK,
+        service_account=None,
         state="RUNNING",
         head_node_type=TEST_HEAD_NODE_TYPE_2_POOLS,
         worker_node_types=TEST_WORKER_NODE_TYPES_2_POOLS,
@@ -320,11 +387,23 @@ class ClusterConstants:
     TEST_CLUSTER_CUSTOM_IMAGE = Cluster(
         cluster_resource_name=TEST_VERTEX_RAY_PR_ADDRESS,
         network=ProjectConstants.TEST_VPC_NETWORK,
+        service_account=None,
         state="RUNNING",
         head_node_type=TEST_HEAD_NODE_TYPE_2_POOLS_CUSTOM_IMAGE,
         worker_node_types=TEST_WORKER_NODE_TYPES_2_POOLS_CUSTOM_IMAGE,
         dashboard_address=TEST_VERTEX_RAY_DASHBOARD_ADDRESS,
     )
+    TEST_CLUSTER_BYOSA = Cluster(
+        cluster_resource_name=TEST_VERTEX_RAY_PR_ADDRESS,
+        python_version="3.10",
+        ray_version="2.9",
+        network="",
+        service_account=ProjectConstants.TEST_SERVICE_ACCOUNT,
+        state="RUNNING",
+        head_node_type=TEST_HEAD_NODE_TYPE_1_POOL,
+        worker_node_types=TEST_WORKER_NODE_TYPES_1_POOL,
+        dashboard_address=TEST_VERTEX_RAY_DASHBOARD_ADDRESS,
+    )
     TEST_BEARER_TOKEN = "test-bearer-token"
     TEST_HEADERS = {
         "Content-Type": "application/json",
diff --git a/tests/unit/vertex_ray/test_vertex_ray_client.py b/tests/unit/vertex_ray/test_vertex_ray_client.py