 #!/usr/bin/env python
+
+# Copyright 2022 Google LLC
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-# http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-r"""Sample command-line program to run a pyspark job on a new or existing
-cluster.
-
-Global region clusters are supported with --global_region flag.
-
-Example Usage to run the pyspark job on a new cluster:
-python submit_job_to_cluster.py --project_id=$PROJECT --gcs_bucket=$BUCKET \
-    --create_new_cluster --cluster_name=$CLUSTER --zone=$ZONE
-
-Example Usage to run the pyspark job on an existing global region cluster:
-python submit_job_to_cluster.py --project_id=$PROJECT --gcs_bucket=$BUCKET \
-    --global_region --cluster_name=$CLUSTER --zone=$ZONE

+# [START dataproc_quickstart]
 """
+Command-line program to create a Dataproc cluster,
+run a PySpark job located in Cloud Storage on the cluster,
+then delete the cluster after the job completes.

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+Usage:
+    python submit_job_to_cluster.py --project_id <PROJECT_ID> --region <REGION> \
+        --cluster_name <CLUSTER_NAME> --job_file_path <GCS_JOB_FILE_PATH>
+"""

 import argparse
-import os
+import re

 from google.cloud import dataproc_v1
 from google.cloud import storage


-DEFAULT_FILENAME = "pyspark_sort.py"
-waiting_callback = False
-
-
-def get_pyspark_file(pyspark_file=None):
-    if pyspark_file:
-        f = open(pyspark_file, "rb")
-        return f, os.path.basename(pyspark_file)
-    else:
-        """Gets the PySpark file from current directory."""
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        f = open(os.path.join(current_dir, DEFAULT_FILENAME), "rb")
-        return f, DEFAULT_FILENAME
-
-
-def get_region_from_zone(zone):
-    try:
-        region_as_list = zone.split("-")[:-1]
-        return "-".join(region_as_list)
-    except (AttributeError, IndexError, ValueError):
-        raise ValueError("Invalid zone provided, please check your input.")
-
-
-def upload_pyspark_file(project, bucket_name, filename, spark_file):
-    """Uploads the PySpark file in this directory to the configured input
-    bucket."""
-    print("Uploading pyspark file to Cloud Storage.")
-    client = storage.Client(project=project)
-    bucket = client.get_bucket(bucket_name)
-    blob = bucket.blob(filename)
-    blob.upload_from_file(spark_file)
-
-
-def download_output(project, cluster_id, output_bucket, job_id):
-    """Downloads the output file from Cloud Storage and returns it as a
-    string."""
-    print("Downloading output file.")
-    client = storage.Client(project=project)
-    bucket = client.get_bucket(output_bucket)
-    output_blob = "google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000".format(
-        cluster_id, job_id
+# [START dataproc_create_cluster]
+def quickstart(project_id, region, cluster_name, job_file_path):
+    # Create the cluster client.
+    cluster_client = dataproc_v1.ClusterControllerClient(
+        client_options={"api_endpoint": "{}-dataproc.googleapis.com:443".format(region)}
     )
-    return bucket.blob(output_blob).download_as_string()
-

-# [START dataproc_submit_job_create_cluster]
-def create_cluster(dataproc, project, zone, region, cluster_name):
-    """Create the cluster."""
-    print("Creating cluster...")
-    zone_uri = "https://www.googleapis.com/compute/v1/projects/{}/zones/{}".format(
-        project, zone
-    )
-    cluster_data = {
-        "project_id": project,
+    # Create the cluster config.
+    cluster = {
+        "project_id": project_id,
         "cluster_name": cluster_name,
         "config": {
-            "gce_cluster_config": {"zone_uri": zone_uri},
-            "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-1"},
-            "worker_config": {"num_instances": 2, "machine_type_uri": "n1-standard-1"},
+            "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-2"},
+            "worker_config": {"num_instances": 2, "machine_type_uri": "n1-standard-2"},
         },
     }

-    cluster = dataproc.create_cluster(
-        request={"project_id": project, "region": region, "cluster": cluster_data}
+    # Create the cluster.
+    operation = cluster_client.create_cluster(
+        request={"project_id": project_id, "region": region, "cluster": cluster}
     )
-    cluster.add_done_callback(callback)
-    global waiting_callback
-    waiting_callback = True
-
-
-# [END dataproc_submit_job_create_cluster]
-
-
-def callback(operation_future):
-    # Reset global when callback returns.
-    global waiting_callback
-    waiting_callback = False
-
-
-def wait_for_cluster_creation():
-    """Wait for cluster creation."""
-    print("Waiting for cluster creation...")
-
-    while True:
-        if not waiting_callback:
-            print("Cluster created.")
-            break
-
-
-# [START dataproc_list_clusters_with_detail]
-def list_clusters_with_details(dataproc, project, region):
-    """List the details of clusters in the region."""
-    for cluster in dataproc.list_clusters(
-        request={"project_id": project, "region": region}
-    ):
-        print(("{} - {}".format(cluster.cluster_name, cluster.status.state.name,)))
-
+    result = operation.result()

-# [END dataproc_list_clusters_with_detail]
+    print("Cluster created successfully: {}".format(result.cluster_name))

+# [END dataproc_create_cluster]

-def get_cluster_id_by_name(dataproc, project_id, region, cluster_name):
-    """Helper function to retrieve the ID and output bucket of a cluster by
-    name."""
-    for cluster in dataproc.list_clusters(
-        request={"project_id": project_id, "region": region}
-    ):
-        if cluster.cluster_name == cluster_name:
-            return cluster.cluster_uuid, cluster.config.config_bucket
-
+# [START dataproc_submit_job]
+    # Create the job client.
+    job_client = dataproc_v1.JobControllerClient(
+        client_options={"api_endpoint": "{}-dataproc.googleapis.com:443".format(region)}
+    )

-# [START dataproc_submit_pyspark_job]
-def submit_pyspark_job(dataproc, project, region, cluster_name, bucket_name, filename):
-    """Submit the Pyspark job to the cluster (assumes `filename` was uploaded
-    to `bucket_name."""
-    job_details = {
+    # Create the job config.
+    job = {
         "placement": {"cluster_name": cluster_name},
-        "pyspark_job": {
-            "main_python_file_uri": "gs://{}/{}".format(bucket_name, filename)
-        },
+        "pyspark_job": {"main_python_file_uri": job_file_path},
     }

-    result = dataproc.submit_job(
-        request={"project_id": project, "region": region, "job": job_details}
+    operation = job_client.submit_job_as_operation(
+        request={"project_id": project_id, "region": region, "job": job}
     )
-    job_id = result.reference.job_id
-    print("Submitted job ID {}.".format(job_id))
-    return job_id
-
+    response = operation.result()

-# [END dataproc_submit_pyspark_job]
+    # Dataproc job output is saved to the Cloud Storage bucket
+    # allocated to the job. Use regex to obtain the bucket and blob info.
+    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)

-
-# [START dataproc_delete]
-def delete_cluster(dataproc, project, region, cluster):
-    """Delete the cluster."""
-    print("Tearing down cluster.")
-    result = dataproc.delete_cluster(
-        request={"project_id": project, "region": region, "cluster_name": cluster}
+    output = (
+        storage.Client()
+        .get_bucket(matches.group(1))
+        .blob(f"{matches.group(2)}.000000000")
+        .download_as_string()
     )
-    return result
-
-
-# [END dataproc_delete]
-
-
-# [START dataproc_wait]
-def wait_for_job(dataproc, project, region, job_id):
-    """Wait for job to complete or error out."""
-    print("Waiting for job to finish...")
-    while True:
-        job = dataproc.get_job(
-            request={"project_id": project, "region": region, "job_id": job_id}
-        )
-        # Handle exceptions
-        if job.status.State(job.status.state).name == "ERROR":
-            raise Exception(job.status.details)
-        if job.status.State(job.status.state).name == "DONE":
-            print("Job finished.")
-            return job
-
-
-# [END dataproc_wait]
-
-
-def main(
-    project_id,
-    zone,
-    cluster_name,
-    bucket_name,
-    pyspark_file=None,
-    create_new_cluster=True,
-    global_region=True,
-):

-    # [START dataproc_get_client]
-    if global_region:
-        region = "global"
-        # Use the default gRPC global endpoints.
-        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
-        dataproc_job_client = dataproc_v1.JobControllerClient()
-    else:
-        region = get_region_from_zone(zone)
-        # Use a regional gRPC endpoint. See:
-        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
-        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
-            client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
-        )
-        dataproc_job_client = dataproc_v1.ClusterControllerClient(
-            client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
-        )
-    # [END dataproc_get_client]
-
-    try:
-        spark_file, spark_filename = get_pyspark_file(pyspark_file)
-        if create_new_cluster:
-            create_cluster(
-                dataproc_cluster_client, project_id, zone, region, cluster_name
-            )
-            wait_for_cluster_creation()
-        upload_pyspark_file(project_id, bucket_name, spark_filename, spark_file)
-
-        list_clusters_with_details(dataproc_cluster_client, project_id, region)
-
-        (cluster_id, output_bucket) = get_cluster_id_by_name(
-            dataproc_cluster_client, project_id, region, cluster_name
-        )
-
-        # [START dataproc_call_submit_pyspark_job]
-        job_id = submit_pyspark_job(
-            dataproc_job_client,
-            project_id,
-            region,
-            cluster_name,
-            bucket_name,
-            spark_filename,
-        )
-        # [END dataproc_call_submit_pyspark_job]
| 90 | + print(f"Job finished successfully: {output}\r\n") |
| 91 | + # [END dataproc_submit_job] |
| 92 | + |
| 93 | + # [START dataproc_delete_cluster] |
| 94 | + # Delete the cluster once the job has terminated. |
| 95 | + operation = cluster_client.delete_cluster( |
| 96 | + request={ |
| 97 | + "project_id": project_id, |
| 98 | + "region": region, |
| 99 | + "cluster_name": cluster_name, |
| 100 | + } |
| 101 | + ) |
| 102 | + operation.result() |
257 | 103 |
|
258 |
| - wait_for_job(dataproc_job_client, project_id, region, job_id) |
259 |
| - output = download_output(project_id, cluster_id, output_bucket, job_id) |
260 |
| - print("Received job output {}".format(output)) |
261 |
| - return output |
262 |
| - finally: |
263 |
| - if create_new_cluster: |
264 |
| - delete_cluster(dataproc_cluster_client, project_id, region, cluster_name) |
265 |
| - spark_file.close() |
| 104 | + print("Cluster {} successfully deleted.".format(cluster_name)) |
| 105 | +# [END dataproc_delete_cluster] |
266 | 106 |

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    parser.add_argument(
-        "--project_id", help="Project ID you want to access.", required=True
-    )
-    parser.add_argument(
-        "--zone", help="Zone to create clusters in/connect to", required=True
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     parser.add_argument(
-        "--cluster_name", help="Name of the cluster to create/connect to", required=True
+        "--project_id",
+        type=str,
+        required=True,
+        help="Project to use for creating resources.",
     )
     parser.add_argument(
-        "--gcs_bucket", help="Bucket to upload Pyspark file to", required=True
+        "--region",
+        type=str,
+        required=True,
+        help="Region where the resources should live.",
     )
     parser.add_argument(
-        "--pyspark_file", help="Pyspark filename. Defaults to pyspark_sort.py"
+        "--cluster_name",
+        type=str,
+        required=True,
+        help="Name to use for creating a cluster.",
     )
     parser.add_argument(
-        "--create_new_cluster",
-        action="store_true",
-        help="States if the cluster should be created",
-    )
-    parser.add_argument(
-        "--global_region",
-        action="store_true",
-        help="If cluster is in the global region",
+        "--job_file_path",
+        type=str,
+        required=True,
+        help="Job in Cloud Storage to run on the cluster.",
     )

     args = parser.parse_args()
-    main(
-        args.project_id,
-        args.zone,
-        args.cluster_name,
-        args.gcs_bucket,
-        args.pyspark_file,
-        args.create_new_cluster,
-        args.global_region,
-    )
+    quickstart(args.project_id, args.region, args.cluster_name, args.job_file_path)
+# [END dataproc_quickstart]
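
For reference, the rewritten sample can also be driven from Python rather than the command line. The sketch below only calls the quickstart() function added in this diff; the project ID, region, cluster name, and Cloud Storage path are placeholder assumptions, not values taken from the commit.

    # Minimal sketch: invoke the sample's quickstart() directly from Python.
    # All argument values below are placeholders to replace with real resources.
    from submit_job_to_cluster import quickstart

    quickstart(
        project_id="my-project",                         # assumed GCP project ID
        region="us-central1",                            # any Dataproc-supported region
        cluster_name="quickstart-cluster",               # cluster is created, used, then deleted
        job_file_path="gs://my-bucket/pyspark_sort.py",  # PySpark file already in Cloud Storage
    )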