// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";

package google.cloud.tpu.v1;

// Annotation and well-known-type definitions used by the declarations below.
import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/timestamp.proto";

// Code-generation options for Go and Java.
option go_package = "google.golang.org/genproto/googleapis/cloud/tpu/v1;tpu";
option java_multiple_files = true;
option java_outer_classname = "CloudTpuProto";
option java_package = "com.google.cloud.tpu.v1";
// TPU API v1.
//
// Manages TPU nodes and other resources.
service Tpu {
  option (google.api.default_host) = "tpu.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Lists nodes.
  rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/nodes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets the details of a node.
  rpc GetNode(GetNodeRequest) returns (Node) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Creates a node.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc CreateNode(CreateNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{parent=projects/*/locations/*}/nodes"
      body: "node"
    };
    option (google.api.method_signature) = "parent,node,node_id";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Deletes a node.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc DeleteNode(DeleteNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      delete: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Reimages a node's OS.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc ReimageNode(ReimageNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:reimage"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Stops a node. This operation is only available with single TPU nodes.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc StopNode(StopNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:stop"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Starts a node.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc StartNode(StartNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:start"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Lists TensorFlow versions supported by this API.
  rpc ListTensorFlowVersions(ListTensorFlowVersionsRequest)
      returns (ListTensorFlowVersionsResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/tensorflowVersions"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets a TensorFlow version.
  rpc GetTensorFlowVersion(GetTensorFlowVersionRequest)
      returns (TensorFlowVersion) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/tensorflowVersions/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Lists accelerator types supported by this API.
  rpc ListAcceleratorTypes(ListAcceleratorTypesRequest)
      returns (ListAcceleratorTypesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/acceleratorTypes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets an accelerator type.
  rpc GetAcceleratorType(GetAcceleratorTypeRequest)
      returns (AcceleratorType) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/acceleratorTypes/*}"
    };
    option (google.api.method_signature) = "name";
  }
}
// Scheduling options for a node.
message SchedulingConfig {
  // Whether the node is preemptible.
  bool preemptible = 1;

  // Whether the node is created under a reservation.
  bool reserved = 2;
}
// A network endpoint over which a TPU worker can be reached.
message NetworkEndpoint {
  // IP address of this network endpoint.
  string ip_address = 1;

  // Port of this network endpoint.
  int32 port = 2;
}
// A TPU instance.
message Node {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/Node"
    pattern: "projects/{project}/locations/{location}/nodes/{node}"
  };

  // Represents the different states of a TPU node during its lifecycle.
  enum State {
    // TPU node state is not known/set.
    STATE_UNSPECIFIED = 0;

    // TPU node is being created.
    CREATING = 1;

    // TPU node has been created.
    READY = 2;

    // TPU node is restarting.
    RESTARTING = 3;

    // TPU node is undergoing reimaging.
    REIMAGING = 4;

    // TPU node is being deleted.
    DELETING = 5;

    // TPU node is being repaired and may be unusable. Details can be
    // found in the `health_description` field.
    REPAIRING = 6;

    // NOTE(review): number 7 is unassigned in this enum. If it belonged to a
    // removed value it should be declared `reserved`; confirm against history.

    // TPU node is stopped.
    STOPPED = 8;

    // TPU node is currently stopping.
    STOPPING = 9;

    // TPU node is currently starting.
    STARTING = 10;

    // TPU node has been preempted. Only applies to Preemptible TPU Nodes.
    PREEMPTED = 11;

    // TPU node has been terminated due to maintenance or has reached the end
    // of its life cycle (for preemptible nodes).
    TERMINATED = 12;

    // TPU node is currently hiding.
    HIDING = 13;

    // TPU node has been hidden.
    HIDDEN = 14;

    // TPU node is currently unhiding.
    UNHIDING = 15;
  }

  // Health defines the status of a TPU node as reported by
  // Health Monitor.
  enum Health {
    // Health status is unknown: not initialized or failed to retrieve.
    HEALTH_UNSPECIFIED = 0;

    // The resource is healthy.
    HEALTHY = 1;

    // The resource is unhealthy.
    DEPRECATED_UNHEALTHY = 2;

    // The resource is unresponsive.
    TIMEOUT = 3;

    // The in-guest ML stack is unhealthy.
    UNHEALTHY_TENSORFLOW = 4;

    // The node is under maintenance/priority boost caused rescheduling and
    // will resume running once rescheduled.
    UNHEALTHY_MAINTENANCE = 5;
  }

  // TPU API Version.
  enum ApiVersion {
    // API version is unknown.
    API_VERSION_UNSPECIFIED = 0;

    // TPU API V1Alpha1 version.
    V1_ALPHA1 = 1;

    // TPU API V1 version.
    V1 = 2;

    // TPU API V2Alpha1 version.
    V2_ALPHA1 = 3;
  }

  // NOTE(review): the field numbers below are not contiguous (for example
  // 2, 4, 6 and 7 are unused). If any of the gaps correspond to removed
  // fields they should be declared `reserved`; confirm against history.

  // Output only. Immutable. The name of the TPU.
  string name = 1 [
    (google.api.field_behavior) = IMMUTABLE,
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // The user-supplied description of the TPU. Maximum of 512 characters.
  string description = 3;

  // Required. The type of hardware accelerators associated with this node.
  string accelerator_type = 5 [(google.api.field_behavior) = REQUIRED];

  // Output only. DEPRECATED! Use network_endpoints instead.
  // The network address for the TPU Node as visible to Compute Engine
  // instances.
  string ip_address = 8 [
    deprecated = true,
    // Documented as output only; annotated to match the other output-only
    // fields of this message.
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. DEPRECATED! Use network_endpoints instead.
  // The network port for the TPU Node as visible to Compute Engine instances.
  string port = 14 [
    deprecated = true,
    // Documented as output only; annotated to match the other output-only
    // fields of this message.
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The current state for the TPU Node.
  State state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. If this field is populated, it contains a description of why
  // the TPU Node is unhealthy.
  string health_description = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The version of Tensorflow running in the Node.
  string tensorflow_version = 11 [(google.api.field_behavior) = REQUIRED];

  // The name of the network to peer the TPU node to. It must be a
  // preexisting Compute Engine network inside of the project on which this API
  // has been activated. If none is provided, "default" will be used.
  string network = 12;

  // The CIDR block that the TPU node will use when selecting an IP address.
  // This CIDR block must be a /29 block; the Compute Engine networks API
  // forbids a smaller block, and using a larger block would be wasteful (a
  // node can only consume one IP address). Errors will occur if the CIDR block
  // has already been used for a currently existing TPU node, the CIDR block
  // conflicts with any subnetworks in the user's provided network, or the
  // provided network is peered with another network that is using that CIDR
  // block.
  string cidr_block = 13;

  // Output only. The service account used to run the TensorFlow services
  // within the node. To share resources, including Google Cloud Storage data,
  // with the Tensorflow job running in the Node, this account must have
  // permissions to that data.
  string service_account = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the node was created.
  google.protobuf.Timestamp create_time = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The scheduling options for this node.
  SchedulingConfig scheduling_config = 17;

  // Output only. The network endpoints where TPU workers can be accessed and
  // sent work. It is recommended that Tensorflow clients of the node reach out
  // to the first (0th) entry in this list first.
  repeated NetworkEndpoint network_endpoints = 21
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The health status of the TPU node.
  Health health = 22;

  // Resource labels to represent user-provided metadata.
  map<string, string> labels = 24;

  // Whether the VPC peering for the node is set up through Service Networking
  // API. The VPC Peering should be set up before provisioning the node.
  // If this field is set, cidr_block field should not be specified. If the
  // network that you want to peer the TPU Node to is a Shared VPC network,
  // the node must be created with this field enabled.
  bool use_service_networking = 27;

  // Output only. The API version that created this Node.
  ApiVersion api_version = 38 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Symptoms that have occurred to the TPU Node.
  repeated Symptom symptoms = 39 [(google.api.field_behavior) = OUTPUT_ONLY];
}
// Request for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/Node"
    }
  ];

  // Maximum number of items to return.
  int32 page_size = 2;

  // The `next_page_token` value returned from a previous List request, if
  // any.
  string page_token = 3;
}
// Response for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesResponse {
  // The listed nodes.
  repeated Node nodes = 1;

  // Token for the next page, or empty if there is none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}
// Request for [GetNode][google.cloud.tpu.v1.Tpu.GetNode].
message GetNodeRequest {
  // Required. The resource name of the node.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/Node"
    }
  ];
}
// Request for [CreateNode][google.cloud.tpu.v1.Tpu.CreateNode].
message CreateNodeRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/Node"
    }
  ];

  // The unqualified resource name.
  string node_id = 2;

  // Required. The node.
  Node node = 3 [(google.api.field_behavior) = REQUIRED];
}
// Request for [DeleteNode][google.cloud.tpu.v1.Tpu.DeleteNode].
message DeleteNodeRequest {
  // Required. The resource name of the node.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/Node"
    }
  ];
}
// Request for [ReimageNode][google.cloud.tpu.v1.Tpu.ReimageNode].
message ReimageNodeRequest {
  // The resource name.
  string name = 1 [
    // The ReimageNode HTTP binding takes a node path
    // (`projects/*/locations/*/nodes/*`), so the field is declared as a
    // reference to the Node resource, matching GetNodeRequest and
    // DeleteNodeRequest.
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/Node"
    }
  ];

  // The version for reimage to create.
  string tensorflow_version = 2;
}
// Request for [StopNode][google.cloud.tpu.v1.Tpu.StopNode].
message StopNodeRequest {
  // The resource name.
  string name = 1 [
    // The StopNode HTTP binding takes a node path
    // (`projects/*/locations/*/nodes/*`), so the field is declared as a
    // reference to the Node resource, matching GetNodeRequest and
    // DeleteNodeRequest.
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/Node"
    }
  ];
}
// Request for [StartNode][google.cloud.tpu.v1.Tpu.StartNode].
message StartNodeRequest {
  // The resource name.
  string name = 1 [
    // The StartNode HTTP binding takes a node path
    // (`projects/*/locations/*/nodes/*`), so the field is declared as a
    // reference to the Node resource, matching GetNodeRequest and
    // DeleteNodeRequest.
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/Node"
    }
  ];
}
// A TensorFlow version that a Node can be configured with.
message TensorFlowVersion {
  // NOTE(review): the collection segment in the pattern below is spelled
  // `tensorFlowVersions`, while the HTTP bindings of ListTensorFlowVersions
  // and GetTensorFlowVersion spell it `tensorflowVersions`. Confirm which
  // spelling the service uses before relying on this pattern.
  option (google.api.resource) = {
    type: "tpu.googleapis.com/TensorFlowVersion"
    pattern: "projects/{project}/locations/{location}/tensorFlowVersions/{tensor_flow_version}"
  };

  // The resource name.
  string name = 1;

  // The TensorFlow version.
  string version = 2;
}
// Request for
// [GetTensorFlowVersion][google.cloud.tpu.v1.Tpu.GetTensorFlowVersion].
message GetTensorFlowVersionRequest {
  // Required. The resource name of the TensorFlow version.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];
}
// Request for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      // `parent` is bound to `projects/*/locations/*` in the HTTP rule, i.e.
      // it names the container of the versions rather than a
      // TensorFlowVersion itself, so the reference is declared through
      // `child_type` (as ListNodesRequest does for nodes).
      child_type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The `next_page_token` value returned from a previous List request, if
  // any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}
// Response for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsResponse {
  // The listed TensorFlow versions.
  repeated TensorFlowVersion tensorflow_versions = 1;

  // Token for the next page, or empty if there is none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}
// An accelerator type that a Node can be configured with.
message AcceleratorType {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/AcceleratorType"
    pattern: "projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}"
  };

  // The resource name.
  string name = 1;

  // The accelerator type.
  string type = 2;
}
// Request for
// [GetAcceleratorType][google.cloud.tpu.v1.Tpu.GetAcceleratorType].
message GetAcceleratorTypeRequest {
  // Required. The resource name of the accelerator type.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/AcceleratorType"
    }
  ];
}
// Request for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      // `parent` is bound to `projects/*/locations/*` in the HTTP rule, i.e.
      // it names the container of the accelerator types rather than an
      // AcceleratorType itself, so the reference is declared through
      // `child_type` (as ListNodesRequest does for nodes).
      child_type: "tpu.googleapis.com/AcceleratorType"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The `next_page_token` value returned from a previous List request, if
  // any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}
// Response for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesResponse {
  // The listed accelerator types.
  repeated AcceleratorType accelerator_types = 1;

  // Token for the next page, or empty if there is none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}
// Metadata describing an [Operation][google.longrunning.Operation].
message OperationMetadata {
  // Time at which the operation was created.
  google.protobuf.Timestamp create_time = 1;

  // Time at which the operation finished running.
  google.protobuf.Timestamp end_time = 2;

  // NOTE(review): the example path in the comment on `target` does not follow
  // any resource pattern declared in this file (nodes, tensorFlowVersions,
  // acceleratorTypes); confirm whether a node path was intended.

  // Target of the operation - for example
  // projects/project-1/connectivityTests/test-1
  string target = 3;

  // Name of the verb executed by the operation.
  string verb = 4;

  // Human-readable status of the operation, if any.
  string status_detail = 5;

  // Whether cancellation was requested for the operation.
  bool cancel_requested = 6;

  // API version.
  string api_version = 7;
}
// A Symptom instance.
message Symptom {
  // SymptomType enumerates the kinds of Symptoms a TPU can exhibit.
  enum SymptomType {
    // Unspecified symptom.
    SYMPTOM_TYPE_UNSPECIFIED = 0;

    // TPU VM memory is low.
    LOW_MEMORY = 1;

    // TPU runtime is out of memory.
    OUT_OF_MEMORY = 2;

    // TPU runtime execution has timed out.
    EXECUTE_TIMED_OUT = 3;

    // TPU runtime fails to construct a mesh that recognizes each TPU device's
    // neighbors.
    MESH_BUILD_FAIL = 4;

    // TPU HBM is out of memory.
    HBM_OUT_OF_MEMORY = 5;

    // Abusive behaviors have been identified on the current project.
    PROJECT_ABUSE = 6;
  }

  // Timestamp when the Symptom is created.
  google.protobuf.Timestamp create_time = 1;

  // Type of the Symptom.
  SymptomType symptom_type = 2;

  // Detailed information of the current Symptom.
  string details = 3;

  // A string used to uniquely distinguish a worker within a TPU node.
  string worker_id = 4;
}