custom_job.proto 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. // Copyright 2022 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  syntax = "proto3";

  package google.cloud.aiplatform.v1;

  // API annotations, sibling aiplatform v1 definitions, and common types
  // referenced by the messages in this file.
  import "google/api/field_behavior.proto";
  import "google/api/resource.proto";
  import "google/cloud/aiplatform/v1/encryption_spec.proto";
  import "google/cloud/aiplatform/v1/env_var.proto";
  import "google/cloud/aiplatform/v1/io.proto";
  import "google/cloud/aiplatform/v1/job_state.proto";
  import "google/cloud/aiplatform/v1/machine_resources.proto";
  import "google/protobuf/duration.proto";
  import "google/protobuf/timestamp.proto";
  import "google/rpc/status.proto";

  // Per-language settings for generated code.
  option csharp_namespace = "Google.Cloud.AIPlatform.V1";
  option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1;aiplatform";
  option java_multiple_files = true;
  option java_outer_classname = "CustomJobProto";
  option java_package = "com.google.cloud.aiplatform.v1";
  option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
  option ruby_package = "Google::Cloud::AIPlatform::V1";

  // A job that executes a custom workload, for example a Docker container or a
  // Python package. A CustomJob may define several worker pools, and every
  // worker pool may carry its own machine and input spec. The CustomJob is
  // cleaned up once it reaches a terminal state (failed or succeeded).
  message CustomJob {
    option (google.api.resource) = {
      type: "aiplatform.googleapis.com/CustomJob"
      pattern: "projects/{project}/locations/{location}/customJobs/{custom_job}"
    };

    // Output only. The resource name of the CustomJob.
    string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Required. Display name of the CustomJob.
    // It may be at most 128 characters long and may consist of any UTF-8
    // characters.
    string display_name = 2 [(google.api.field_behavior) = REQUIRED];

    // Required. The job spec.
    CustomJobSpec job_spec = 4 [(google.api.field_behavior) = REQUIRED];

    // Output only. Detailed state of the job.
    JobState state = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. When the CustomJob was created.
    google.protobuf.Timestamp create_time = 6
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. When the CustomJob entered the `JOB_STATE_RUNNING` state
    // for the first time.
    google.protobuf.Timestamp start_time = 7
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. When the CustomJob entered one of these states:
    // `JOB_STATE_SUCCEEDED`, `JOB_STATE_FAILED`, `JOB_STATE_CANCELLED`.
    google.protobuf.Timestamp end_time = 8
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. When the CustomJob was last updated.
    google.protobuf.Timestamp update_time = 9
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Populated only while the job's state is `JOB_STATE_FAILED`
    // or `JOB_STATE_CANCELLED`.
    google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Labels holding user-defined metadata used to organize CustomJobs.
    //
    // Keys and values may be at most 64 characters (Unicode codepoints) long
    // and may contain only lowercase letters, numeric characters, underscores
    // and dashes. International characters are allowed.
    //
    // See https://goo.gl/xmQnxf for more information and examples of labels.
    map<string, string> labels = 11;

    // Customer-managed encryption key options for the CustomJob. When set,
    // every resource created by the CustomJob is encrypted with the provided
    // encryption key.
    EncryptionSpec encryption_spec = 12;

    // Output only. URIs for accessing [interactive
    // shells](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
    // (one URI per training node). Only available when
    // [job_spec.enable_web_access][google.cloud.aiplatform.v1.CustomJobSpec.enable_web_access]
    // is `true`.
    //
    // Each key is the name of a node in the training job; for example,
    // `workerpool0-0` is the primary node, `workerpool1-0` is the first node
    // in the second worker pool, and `workerpool1-1` is the second node in the
    // second worker pool.
    //
    // Each value is the URI of that node's interactive shell.
    map<string, string> web_access_uris = 16
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // The spec of a CustomJob.
  message CustomJobSpec {
    // Required. Specs of the worker pools, including machine type and Docker
    // image. Every worker pool other than the first is optional and can be
    // skipped by supplying an empty value.
    repeated WorkerPoolSpec worker_pool_specs = 1
        [(google.api.field_behavior) = REQUIRED];

    // Scheduling options for the CustomJob.
    Scheduling scheduling = 3;

    // The service account that the workload runs as.
    // Users submitting jobs must have act-as permission on this run-as
    // account. When unspecified, the [Vertex AI Custom Code Service
    // Agent](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
    // for the CustomJob's project is used.
    string service_account = 4;

    // Optional. Full name of the Compute Engine
    // [network](/compute/docs/networks-and-firewalls#networks) to which the Job
    // should be peered. For example, `projects/12345/global/networks/myVPC`.
    // The [format](/compute/docs/reference/rest/v1/networks/insert)
    // is `projects/{project}/global/networks/{network}`, where {project} is a
    // project number, as in `12345`, and {network} is a network name.
    //
    // Before specifying this field, you must have [configured VPC Network
    // Peering for Vertex
    // AI](https://cloud.google.com/vertex-ai/docs/general/vpc-peering).
    //
    // When this field is left unspecified, the job is not peered with any
    // network.
    string network = 5 [
      (google.api.field_behavior) = OPTIONAL,
      (google.api.resource_reference) = { type: "compute.googleapis.com/Network" }
    ];

    // Optional. Names of the reserved ip ranges under the VPC network that may
    // be used for this job.
    //
    // When set, the job is deployed within the provided ip ranges. Otherwise,
    // the job is deployed to any ip ranges under the provided VPC network.
    //
    // Example: ['vertex-ai-ip-range'].
    repeated string reserved_ip_ranges = 13
        [(google.api.field_behavior) = OPTIONAL];

    // Cloud Storage location that stores the output of this CustomJob or
    // HyperparameterTuningJob. For a HyperparameterTuningJob, the
    // baseOutputDirectory of each child CustomJob backing a Trial is set to a
    // subdirectory named after the Trial's
    // [id][google.cloud.aiplatform.v1.Trial.id] under the parent
    // HyperparameterTuningJob's baseOutputDirectory.
    //
    // When this field is set, the following Vertex AI environment variables
    // are passed to containers or python modules:
    //
    // For CustomJob:
    //
    // * AIP_MODEL_DIR = `<base_output_directory>/model/`
    // * AIP_CHECKPOINT_DIR = `<base_output_directory>/checkpoints/`
    // * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/logs/`
    //
    // For CustomJob backing a Trial of HyperparameterTuningJob:
    //
    // * AIP_MODEL_DIR = `<base_output_directory>/<trial_id>/model/`
    // * AIP_CHECKPOINT_DIR = `<base_output_directory>/<trial_id>/checkpoints/`
    // * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/<trial_id>/logs/`
    GcsDestination base_output_directory = 6;

    // Optional. Name of the Vertex AI
    // [Tensorboard][google.cloud.aiplatform.v1.Tensorboard] resource to which
    // this CustomJob will upload Tensorboard logs.
    // Format:
    // `projects/{project}/locations/{location}/tensorboards/{tensorboard}`
    string tensorboard = 7 [
      (google.api.field_behavior) = OPTIONAL,
      (google.api.resource_reference) = { type: "aiplatform.googleapis.com/Tensorboard" }
    ];

    // Optional. Whether Vertex AI should enable [interactive shell
    // access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
    // to training containers.
    //
    // When `true`, interactive shells are reachable at the URIs given by
    // [CustomJob.web_access_uris][google.cloud.aiplatform.v1.CustomJob.web_access_uris]
    // or
    // [Trial.web_access_uris][google.cloud.aiplatform.v1.Trial.web_access_uris]
    // (within
    // [HyperparameterTuningJob.trials][google.cloud.aiplatform.v1.HyperparameterTuningJob.trials]).
    bool enable_web_access = 10 [(google.api.field_behavior) = OPTIONAL];
  }

  // The spec of one worker pool within a job.
  message WorkerPoolSpec {
    // The custom task that this worker pool executes.
    oneof task {
      // A custom container task.
      ContainerSpec container_spec = 6;

      // A Python packaged task.
      PythonPackageSpec python_package_spec = 7;
    }

    // Optional. Immutable. Specification of a single machine.
    MachineSpec machine_spec = 1 [
      (google.api.field_behavior) = OPTIONAL,
      (google.api.field_behavior) = IMMUTABLE
    ];

    // Optional. Number of worker replicas to use for this worker pool.
    int64 replica_count = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. List of NFS mount specs.
    repeated NfsMount nfs_mounts = 4
        [(google.api.field_behavior) = OPTIONAL];

    // Disk spec.
    DiskSpec disk_spec = 5;
  }

  // The spec of a Container.
  message ContainerSpec {
    // Required. URI of a container image in the Container Registry that is to
    // be run on each worker replica.
    string image_uri = 1 [(google.api.field_behavior) = REQUIRED];

    // Command invoked when the container is started.
    // When provided, it overrides the entrypoint instruction in Dockerfile.
    repeated string command = 2;

    // Arguments passed when starting the container.
    repeated string args = 3;

    // Environment variables passed to the container.
    // Maximum limit is 100.
    repeated EnvVar env = 4;
  }

  // The spec of Python packaged code.
  message PythonPackageSpec {
    // Required. URI of a container image in Artifact Registry that will run
    // the provided Python package. Vertex AI provides a wide range of executor
    // images with pre-installed packages to meet users' various use cases. See
    // the list of [pre-built containers for
    // training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers).
    // You must use an image from this list.
    string executor_image_uri = 1 [(google.api.field_behavior) = REQUIRED];

    // Required. Google Cloud Storage location of the Python package files,
    // which are the training program and its dependent packages.
    // The maximum number of package URIs is 100.
    repeated string package_uris = 2
        [(google.api.field_behavior) = REQUIRED];

    // Required. Name of the Python module to run after the packages are
    // installed.
    string python_module = 3 [(google.api.field_behavior) = REQUIRED];

    // Command line arguments passed to the Python task.
    repeated string args = 4;

    // Environment variables passed to the python module.
    // Maximum limit is 100.
    repeated EnvVar env = 5;
  }

  // Every parameter related to the queuing and scheduling of custom jobs.
  message Scheduling {
    // Maximum running time of the job. The default is 7 days.
    google.protobuf.Duration timeout = 1;

    // When set, the entire CustomJob is restarted if a worker gets restarted.
    // Distributed training jobs that are not resilient to workers leaving and
    // joining a job can use this feature.
    bool restart_job_on_worker_restart = 3;
  }