job.proto 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  // Copyright 2022 Google LLC
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //     http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  syntax = "proto3";

  package google.cloud.batch.v1;

  import "google/api/field_behavior.proto";
  import "google/api/resource.proto";
  import "google/cloud/batch/v1/task.proto";
  import "google/protobuf/duration.proto";
  import "google/protobuf/timestamp.proto";

  // Per-language code generation options.
  option csharp_namespace = "Google.Cloud.Batch.V1";
  option go_package = "google.golang.org/genproto/googleapis/cloud/batch/v1;batch";
  option java_multiple_files = true;
  option java_outer_classname = "JobProto";
  option java_package = "com.google.cloud.batch.v1";
  option objc_class_prefix = "GCB";
  option php_namespace = "Google\\Cloud\\Batch\\V1";
  option ruby_package = "Google::Cloud::Batch::V1";

  // The Cloud Batch Job description.
  message Job {
    option (google.api.resource) = {
      type: "batch.googleapis.com/Job"
      pattern: "projects/{project}/locations/{location}/jobs/{job}"
    };

    // Output only. Job name.
    // For example: "projects/123456/locations/us-central1/jobs/job01".
    string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. A system generated unique ID (in UUID4 format) for the
    // Job.
    string uid = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Priority of the Job.
    // The valid value range is [0, 100).
    // A job with higher priority value is more likely to run earlier if all
    // other requirements are satisfied.
    int64 priority = 3;

    // Required. TaskGroups in the Job. Only one TaskGroup is supported now.
    repeated TaskGroup task_groups = 4
        [(google.api.field_behavior) = REQUIRED];

    // Compute resource allocation for all TaskGroups in the Job.
    AllocationPolicy allocation_policy = 7;

    // Labels for the Job. Labels could be user provided or system generated.
    // For example,
    //   "labels": {
    //     "department": "finance",
    //     "environment": "test"
    //   }
    // You can assign up to 64 labels. [Google Compute Engine label
    // restrictions](https://cloud.google.com/compute/docs/labeling-resources#restrictions)
    // apply.
    // Label names that start with "goog-" or "google-" are reserved.
    map<string, string> labels = 8;

    // Output only. Job status. It is read only for users.
    JobStatus status = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. When the Job was created.
    google.protobuf.Timestamp create_time = 11
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The last time the Job was updated.
    google.protobuf.Timestamp update_time = 12
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Log preservation policy for the Job.
    LogsPolicy logs_policy = 13;

    // Notification configurations.
    repeated JobNotification notifications = 14;
  }

  // LogsPolicy describes how outputs from a Job's Tasks (stdout/stderr) will
  // be preserved.
  message LogsPolicy {
    // The destination (if any) for logs.
    enum Destination {
      // Logs are not preserved.
      DESTINATION_UNSPECIFIED = 0;

      // Logs are streamed to Cloud Logging.
      CLOUD_LOGGING = 1;

      // Logs are saved to a file path.
      PATH = 2;
    }

    // Where logs should be saved.
    Destination destination = 1;

    // The path to which logs are saved when the destination = PATH. This can
    // be a local file path on the VM, or under the mount point of a
    // Persistent Disk or Filestore, or a Cloud Storage path.
    string logs_path = 2;
  }

  // Job status.
  message JobStatus {
    // VM instance status.
    message InstanceStatus {
      // The Compute Engine machine type.
      string machine_type = 1;

      // The VM instance provisioning model.
      AllocationPolicy.ProvisioningModel provisioning_model = 2;

      // The max number of tasks that can be assigned to this instance type.
      int64 task_pack = 3;
    }

    // Aggregated task status for a TaskGroup.
    message TaskGroupStatus {
      // Count of tasks in each state in the TaskGroup.
      // The map key is task state name.
      map<string, int64> counts = 1;

      // Status of instances allocated for the TaskGroup.
      repeated InstanceStatus instances = 2;
    }

    // Valid Job states.
    enum State {
      // Job state unspecified.
      STATE_UNSPECIFIED = 0;

      // Job is admitted (validated and persisted) and waiting for resources.
      QUEUED = 1;

      // Job is scheduled to run as soon as resource allocation is ready.
      // The resource allocation may happen at a later time but with a high
      // chance to succeed.
      SCHEDULED = 2;

      // Resource allocation has been successful. At least one Task in the Job
      // is RUNNING.
      RUNNING = 3;

      // All Tasks in the Job have finished successfully.
      SUCCEEDED = 4;

      // At least one Task in the Job has failed.
      FAILED = 5;

      // The Job will be deleted, but has not been deleted yet. Typically this
      // is because resources used by the Job are still being cleaned up.
      DELETION_IN_PROGRESS = 6;
    }

    // Job state.
    State state = 1;

    // Job status events.
    repeated StatusEvent status_events = 2;

    // Aggregated task status for each TaskGroup in the Job.
    // The map key is TaskGroup ID.
    map<string, TaskGroupStatus> task_groups = 4;

    // The duration of time that the Job spent in status RUNNING.
    google.protobuf.Duration run_duration = 5;
  }

  // Notification configurations.
  message JobNotification {
    // Message details.
    // Describes the attributes that a message should have.
    // Without specified message attributes, no message will be sent by
    // default.
    message Message {
      // The message type.
      Type type = 1;

      // The new job state.
      JobStatus.State new_job_state = 2;

      // The new task state.
      TaskStatus.State new_task_state = 3;
    }

    // The message type.
    enum Type {
      // Unspecified.
      TYPE_UNSPECIFIED = 0;

      // Notify users that the job state has changed.
      JOB_STATE_CHANGED = 1;

      // Notify users that the task state has changed.
      TASK_STATE_CHANGED = 2;
    }

    // The Pub/Sub topic where notifications like the job state changes
    // will be published. This topic exists in the same project as the job
    // and billings will be charged to this project.
    // If not specified, no Pub/Sub messages will be sent.
    // Topic format: `projects/{project}/topics/{topic}`.
    string pubsub_topic = 1;

    // The attribute requirements of messages to be sent to this Pub/Sub
    // topic. Without this field, no message will be sent.
    Message message = 2;
  }

  // A Job's resource allocation policy describes when, where, and how compute
  // resources should be allocated for the Job.
  message AllocationPolicy {
    // The locations in which VMs are allowed to be created.
    message LocationPolicy {
      // A list of allowed location names represented by internal URLs.
      // Each location can be a region or a zone.
      // Only one region or multiple zones in one region is supported now.
      // For example,
      // ["regions/us-central1"] allow VMs in any zones in region us-central1.
      // ["zones/us-central1-a", "zones/us-central1-c"] only allow VMs
      // in zones us-central1-a and us-central1-c.
      // Locations that end up in different regions cause an error.
      // For example,
      // ["regions/us-central1", "zones/us-central1-a", "zones/us-central1-b",
      // "zones/us-west1-a"] contains 2 regions "us-central1" and
      // "us-west1". An error is expected in this case.
      repeated string allowed_locations = 1;
    }

    // A new persistent disk or a local ssd.
    // A VM can only have one local SSD setting but multiple local SSD
    // partitions.
    // https://cloud.google.com/compute/docs/disks#pdspecs.
    // https://cloud.google.com/compute/docs/disks#localssds.
    message Disk {
      // A data source from which a PD will be created.
      oneof data_source {
        // Name of a public or custom image used as the data source.
        string image = 4;

        // Name of a snapshot used as the data source.
        string snapshot = 5;
      }

      // Disk type as shown in `gcloud compute disk-types list`.
      // For example, "pd-ssd", "pd-standard", "pd-balanced", "local-ssd".
      string type = 1;

      // Disk size in GB.
      // This field is ignored if `data_source` is `disk` or `image`.
      // NOTE(review): the `data_source` oneof in this message only has the
      // members `image` and `snapshot`; there is no `disk` member. Confirm
      // which data sources cause this field to be ignored.
      // If `type` is `local-ssd`, size_gb should be a multiple of 375 GB,
      // otherwise, the final size will be the next greater multiple of
      // 375 GB.
      int64 size_gb = 2;

      // Local SSDs are available through both "SCSI" and "NVMe" interfaces.
      // If not indicated, "NVMe" will be the default one for local ssds.
      // We only support "SCSI" for persistent disks now.
      string disk_interface = 6;
    }

    // A new or an existing persistent disk (PD) or a local ssd attached to a
    // VM instance.
    message AttachedDisk {
      // The disk to attach: either a new disk or an existing PD.
      oneof attached {
        // Specification of a new disk.
        Disk new_disk = 1;

        // Name of an existing PD.
        string existing_disk = 2;
      }

      // Device name that the guest operating system will see.
      // It is used by Runnable.volumes field to mount disks. So please
      // specify the device_name if you want Batch to help mount the disk, and
      // it should match the device_name field in volumes.
      string device_name = 3;
    }

    // Accelerator describes Compute Engine accelerators to be attached to the
    // VM.
    message Accelerator {
      // The accelerator type. For example, "nvidia-tesla-t4".
      // See `gcloud compute accelerator-types list`.
      string type = 1;

      // The number of accelerators of this type.
      int64 count = 2;

      // Deprecated: please use instances[0].install_gpu_drivers instead.
      bool install_gpu_drivers = 3 [deprecated = true];
    }

    // InstancePolicy describes an instance type and resources attached to
    // each VM created by this InstancePolicy.
    message InstancePolicy {
      // The Compute Engine machine type.
      string machine_type = 2;

      // The minimum CPU platform.
      // See
      // `https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform`.
      // Not yet implemented.
      string min_cpu_platform = 3;

      // The provisioning model.
      ProvisioningModel provisioning_model = 4;

      // The accelerators attached to each VM instance.
      repeated Accelerator accelerators = 5;

      // Non-boot disks to be attached for each VM created by this
      // InstancePolicy. New disks will be deleted when the VM is deleted.
      repeated AttachedDisk disks = 6;
    }

    // Either an InstancePolicy or an instance template.
    message InstancePolicyOrTemplate {
      // Exactly one way of describing the VMs: inline policy or template.
      oneof policy_template {
        // InstancePolicy.
        InstancePolicy policy = 1;

        // Name of an instance template used to create VMs.
        // Named the field as 'instance_template' instead of 'template' to
        // avoid C++ keyword conflict.
        string instance_template = 2;
      }

      // Set this field true if users want Batch to help fetch drivers from a
      // third party location and install them for GPUs specified in
      // policy.accelerators or instance_template on their behalf. Default is
      // false.
      bool install_gpu_drivers = 3;
    }

    // A network interface.
    message NetworkInterface {
      // The URL of the network resource.
      string network = 1;

      // The URL of the Subnetwork resource.
      string subnetwork = 2;

      // Default is false (with an external IP address). Required if
      // no external public IP address is attached to the VM. If no external
      // public IP address, additional configuration is required to allow the
      // VM to access Google Services. See
      // https://cloud.google.com/vpc/docs/configure-private-google-access and
      // https://cloud.google.com/nat/docs/gce-example#create-nat for more
      // information.
      bool no_external_ip_address = 3;
    }

    // NetworkPolicy describes VM instance network configurations.
    message NetworkPolicy {
      // Network configurations.
      repeated NetworkInterface network_interfaces = 1;
    }

    // Compute Engine VM instance provisioning model.
    enum ProvisioningModel {
      // Unspecified.
      PROVISIONING_MODEL_UNSPECIFIED = 0;

      // Standard VM.
      STANDARD = 1;

      // SPOT VM.
      SPOT = 2;

      // Preemptible VM (PVM).
      //
      // The SPOT model above is the preferable model for preemptible VM
      // instances: the preemptible VM model indicated by this value is the
      // older model, and has been migrated to use the SPOT model as the
      // underlying technology. This old model will still be supported.
      PREEMPTIBLE = 3;
    }

    // Location where compute resources should be allocated for the Job.
    LocationPolicy location = 1;

    // Describe instances that can be created by this AllocationPolicy.
    // Only instances[0] is supported now.
    repeated InstancePolicyOrTemplate instances = 8;

    // Service account that VMs will run as.
    ServiceAccount service_account = 9;

    // Labels applied to all VM instances and other resources
    // created by AllocationPolicy.
    // Labels could be user provided or system generated.
    // You can assign up to 64 labels. [Google Compute Engine label
    // restrictions](https://cloud.google.com/compute/docs/labeling-resources#restrictions)
    // apply.
    // Label names that start with "goog-" or "google-" are reserved.
    map<string, string> labels = 6;

    // The network policy.
    NetworkPolicy network = 7;
  }

  // A TaskGroup contains one or multiple Tasks that share the same
  // Runnable but with different runtime parameters.
  message TaskGroup {
    option (google.api.resource) = {
      type: "batch.googleapis.com/TaskGroup"
      pattern: "projects/{project}/locations/{location}/jobs/{job}/taskGroups/{task_group}"
    };

    // Output only. TaskGroup name.
    // The system generates this field based on parent Job name.
    // For example:
    // "projects/123456/locations/us-west1/jobs/job01/taskGroups/group01".
    string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Required. Tasks in the group share the same task spec.
    TaskSpec task_spec = 3 [(google.api.field_behavior) = REQUIRED];

    // Number of Tasks in the TaskGroup.
    // Default is 1.
    int64 task_count = 4;

    // Max number of tasks that can run in parallel.
    // Default to min(task_count, 1000).
    int64 parallelism = 5;

    // An array of environment variable mappings, which are passed to Tasks
    // with matching indices. If task_environments is used then task_count
    // should not be specified in the request (and will be ignored). Task
    // count will be the length of task_environments.
    //
    // In addition to any environment variables set in task_environments,
    // Tasks get the BATCH_TASK_COUNT and BATCH_TASK_INDEX environment
    // variables, specifying respectively the number of Tasks in the Task's
    // parent TaskGroup and the specific Task's index in the TaskGroup
    // (0 through BATCH_TASK_COUNT - 1).
    //
    // task_environments supports up to 200 entries.
    repeated Environment task_environments = 9;

    // Max number of tasks that can be run on a VM at the same time.
    // If not specified, the system will decide a value based on available
    // compute resources on a VM and task requirements.
    int64 task_count_per_node = 10;

    // When true, Batch will populate a file with a list of all VMs assigned
    // to the TaskGroup and set the BATCH_HOSTS_FILE environment variable to
    // the path of that file. Defaults to false.
    bool require_hosts_file = 11;

    // When true, Batch will configure SSH to allow passwordless login between
    // VMs running the Batch tasks in the same TaskGroup.
    bool permissive_ssh = 12;
  }

  // Carries information about a Google Cloud service account.
  message ServiceAccount {
    // Email address of the service account. If not specified, the default
    // Compute Engine service account for the project will be used. If
    // instance template is being used, the service account has to be
    // specified in the instance template and it has to match the email field
    // here.
    string email = 1;
  }
  377. }