  // Copyright 2022 Google LLC
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //     http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  syntax = "proto3";

  package google.cloud.tpu.v1;

  // Google API annotations (HTTP rules, client options, field behavior,
  // resource descriptors), long-running operations and Timestamp.
  import "google/api/annotations.proto";
  import "google/api/client.proto";
  import "google/api/field_behavior.proto";
  import "google/api/resource.proto";
  import "google/longrunning/operations.proto";
  import "google/protobuf/timestamp.proto";

  // Code-generation options for Go and Java.
  option go_package = "google.golang.org/genproto/googleapis/cloud/tpu/v1;tpu";
  option java_multiple_files = true;
  option java_outer_classname = "CloudTpuProto";
  option java_package = "com.google.cloud.tpu.v1";
  // Manages TPU nodes and other resources.
  //
  // TPU API v1
  service Tpu {
    option (google.api.default_host) = "tpu.googleapis.com";
    option (google.api.oauth_scopes) =
        "https://www.googleapis.com/auth/cloud-platform";

    // Lists nodes.
    rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {
      option (google.api.http) = {
        get: "/v1/{parent=projects/*/locations/*}/nodes"
      };
      option (google.api.method_signature) = "parent";
    }

    // Gets the details of a node.
    rpc GetNode(GetNodeRequest) returns (Node) {
      option (google.api.http) = {
        get: "/v1/{name=projects/*/locations/*/nodes/*}"
      };
      option (google.api.method_signature) = "name";
    }

    // Creates a node.
    //
    // Returns a long-running operation whose response is a `Node`.
    rpc CreateNode(CreateNodeRequest) returns (google.longrunning.Operation) {
      option (google.api.http) = {
        post: "/v1/{parent=projects/*/locations/*}/nodes"
        body: "node"
      };
      option (google.api.method_signature) = "parent,node,node_id";
      option (google.longrunning.operation_info) = {
        response_type: "Node"
        metadata_type: "OperationMetadata"
      };
    }

    // Deletes a node.
    //
    // Returns a long-running operation whose response is a `Node`.
    rpc DeleteNode(DeleteNodeRequest) returns (google.longrunning.Operation) {
      option (google.api.http) = {
        delete: "/v1/{name=projects/*/locations/*/nodes/*}"
      };
      option (google.api.method_signature) = "name";
      option (google.longrunning.operation_info) = {
        response_type: "Node"
        metadata_type: "OperationMetadata"
      };
    }

    // Reimages a node's OS.
    //
    // Returns a long-running operation whose response is a `Node`.
    rpc ReimageNode(ReimageNodeRequest) returns (google.longrunning.Operation) {
      option (google.api.http) = {
        post: "/v1/{name=projects/*/locations/*/nodes/*}:reimage"
        body: "*"
      };
      option (google.longrunning.operation_info) = {
        response_type: "Node"
        metadata_type: "OperationMetadata"
      };
    }

    // Stops a node. This operation is only available with single TPU nodes.
    //
    // Returns a long-running operation whose response is a `Node`.
    rpc StopNode(StopNodeRequest) returns (google.longrunning.Operation) {
      option (google.api.http) = {
        post: "/v1/{name=projects/*/locations/*/nodes/*}:stop"
        body: "*"
      };
      option (google.longrunning.operation_info) = {
        response_type: "Node"
        metadata_type: "OperationMetadata"
      };
    }

    // Starts a node.
    //
    // Returns a long-running operation whose response is a `Node`.
    rpc StartNode(StartNodeRequest) returns (google.longrunning.Operation) {
      option (google.api.http) = {
        post: "/v1/{name=projects/*/locations/*/nodes/*}:start"
        body: "*"
      };
      option (google.longrunning.operation_info) = {
        response_type: "Node"
        metadata_type: "OperationMetadata"
      };
    }

    // Lists TensorFlow versions supported by this API.
    rpc ListTensorFlowVersions(ListTensorFlowVersionsRequest)
        returns (ListTensorFlowVersionsResponse) {
      option (google.api.http) = {
        get: "/v1/{parent=projects/*/locations/*}/tensorflowVersions"
      };
      option (google.api.method_signature) = "parent";
    }

    // Gets TensorFlow Version.
    rpc GetTensorFlowVersion(GetTensorFlowVersionRequest)
        returns (TensorFlowVersion) {
      option (google.api.http) = {
        get: "/v1/{name=projects/*/locations/*/tensorflowVersions/*}"
      };
      option (google.api.method_signature) = "name";
    }

    // Lists accelerator types supported by this API.
    rpc ListAcceleratorTypes(ListAcceleratorTypesRequest)
        returns (ListAcceleratorTypesResponse) {
      option (google.api.http) = {
        get: "/v1/{parent=projects/*/locations/*}/acceleratorTypes"
      };
      option (google.api.method_signature) = "parent";
    }

    // Gets AcceleratorType.
    rpc GetAcceleratorType(GetAcceleratorTypeRequest)
        returns (AcceleratorType) {
      option (google.api.http) = {
        get: "/v1/{name=projects/*/locations/*/acceleratorTypes/*}"
      };
      option (google.api.method_signature) = "name";
    }
  }
  // Sets the scheduling options for this node.
  message SchedulingConfig {
    // Defines whether the node is preemptible.
    bool preemptible = 1;

    // Whether the node is created under a reservation.
    bool reserved = 2;
  }
  // A network endpoint over which a TPU worker can be reached.
  message NetworkEndpoint {
    // The IP address of this network endpoint.
    string ip_address = 1;

    // The port of this network endpoint.
    int32 port = 2;
  }
  // A TPU instance.
  message Node {
    option (google.api.resource) = {
      type: "tpu.googleapis.com/Node"
      pattern: "projects/{project}/locations/{location}/nodes/{node}"
    };

    // Represents the different states of a TPU node during its lifecycle.
    enum State {
      // TPU node state is not known/set.
      STATE_UNSPECIFIED = 0;

      // TPU node is being created.
      CREATING = 1;

      // TPU node has been created.
      READY = 2;

      // TPU node is restarting.
      RESTARTING = 3;

      // TPU node is undergoing reimaging.
      REIMAGING = 4;

      // TPU node is being deleted.
      DELETING = 5;

      // TPU node is being repaired and may be unusable. Details can be
      // found in the `health_description` field.
      REPAIRING = 6;

      // NOTE(review): value 7 is not assigned in this enum. If an earlier
      // revision used it, it should be declared `reserved` - confirm.

      // TPU node is stopped.
      STOPPED = 8;

      // TPU node is currently stopping.
      STOPPING = 9;

      // TPU node is currently starting.
      STARTING = 10;

      // TPU node has been preempted. Only applies to Preemptible TPU Nodes.
      PREEMPTED = 11;

      // TPU node has been terminated due to maintenance or has reached the
      // end of its life cycle (for preemptible nodes).
      TERMINATED = 12;

      // TPU node is currently hiding.
      HIDING = 13;

      // TPU node has been hidden.
      HIDDEN = 14;

      // TPU node is currently unhiding.
      UNHIDING = 15;
    }

    // Health defines the status of a TPU node as reported by
    // Health Monitor.
    enum Health {
      // Health status is unknown: not initialized or failed to retrieve.
      HEALTH_UNSPECIFIED = 0;

      // The resource is healthy.
      HEALTHY = 1;

      // The resource is unhealthy.
      DEPRECATED_UNHEALTHY = 2;

      // The resource is unresponsive.
      TIMEOUT = 3;

      // The in-guest ML stack is unhealthy.
      UNHEALTHY_TENSORFLOW = 4;

      // The node is under maintenance/priority boost caused rescheduling and
      // will resume running once rescheduled.
      UNHEALTHY_MAINTENANCE = 5;
    }

    // TPU API Version.
    enum ApiVersion {
      // API version is unknown.
      API_VERSION_UNSPECIFIED = 0;

      // TPU API V1Alpha1 version.
      V1_ALPHA1 = 1;

      // TPU API V1 version.
      V1 = 2;

      // TPU API V2Alpha1 version.
      V2_ALPHA1 = 3;
    }

    // Output only. Immutable. The name of the TPU.
    string name = 1 [
      (google.api.field_behavior) = IMMUTABLE,
      (google.api.field_behavior) = OUTPUT_ONLY
    ];

    // The user-supplied description of the TPU. Maximum of 512 characters.
    string description = 3;

    // Required. The type of hardware accelerators associated with this node.
    string accelerator_type = 5 [(google.api.field_behavior) = REQUIRED];

    // Output only. DEPRECATED! Use network_endpoints instead.
    // The network address for the TPU Node as visible to Compute Engine
    // instances.
    string ip_address = 8 [deprecated = true];

    // Output only. DEPRECATED! Use network_endpoints instead.
    // The network port for the TPU Node as visible to Compute Engine
    // instances.
    string port = 14 [deprecated = true];

    // Output only. The current state for the TPU Node.
    State state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If this field is populated, it contains a description of
    // why the TPU Node is unhealthy.
    string health_description = 10
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Required. The version of Tensorflow running in the Node.
    string tensorflow_version = 11 [(google.api.field_behavior) = REQUIRED];

    // The name of a network they wish to peer the TPU node to. It must be a
    // preexisting Compute Engine network inside of the project on which this
    // API has been activated. If none is provided, "default" will be used.
    string network = 12;

    // The CIDR block that the TPU node will use when selecting an IP address.
    // This CIDR block must be a /29 block; the Compute Engine networks API
    // forbids a smaller block, and using a larger block would be wasteful (a
    // node can only consume one IP address). Errors will occur if the CIDR
    // block has already been used for a currently existing TPU node, the CIDR
    // block conflicts with any subnetworks in the user's provided network, or
    // the provided network is peered with another network that is using that
    // CIDR block.
    string cidr_block = 13;

    // Output only. The service account used to run the tensor flow services
    // within the node. To share resources, including Google Cloud Storage
    // data, with the Tensorflow job running in the Node, this account must
    // have permissions to that data.
    string service_account = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The time when the node was created.
    google.protobuf.Timestamp create_time = 16
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // The scheduling options for this node.
    SchedulingConfig scheduling_config = 17;

    // Output only. The network endpoints where TPU workers can be accessed
    // and sent work. It is recommended that Tensorflow clients of the node
    // reach out to the 0th entry in this list first.
    repeated NetworkEndpoint network_endpoints = 21
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // The health status of the TPU node.
    Health health = 22;

    // Resource labels to represent user-provided metadata.
    map<string, string> labels = 24;

    // Whether the VPC peering for the node is set up through Service
    // Networking API. The VPC Peering should be set up before provisioning
    // the node. If this field is set, cidr_block field should not be
    // specified. If the network that you want to peer the TPU Node to is a
    // Shared VPC network, the node must be created with this field enabled.
    bool use_service_networking = 27;

    // Output only. The API version that created this Node.
    ApiVersion api_version = 38 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The Symptoms that have occurred to the TPU Node.
    repeated Symptom symptoms = 39
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }
  // Request for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
  message ListNodesRequest {
    // Required. The parent resource name.
    //
    // The ListNodes HTTP rule binds this to `projects/*/locations/*`.
    string parent = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        child_type: "tpu.googleapis.com/Node"
      }
    ];

    // The maximum number of items to return.
    int32 page_size = 2;

    // The next_page_token value returned from a previous List request, if any.
    string page_token = 3;
  }
  // Response for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
  message ListNodesResponse {
    // The listed nodes.
    repeated Node nodes = 1;

    // The next page token or empty if none.
    string next_page_token = 2;

    // Locations that could not be reached.
    repeated string unreachable = 3;
  }
  // Request for [GetNode][google.cloud.tpu.v1.Tpu.GetNode].
  message GetNodeRequest {
    // Required. The resource name.
    //
    // The GetNode HTTP rule binds this to `projects/*/locations/*/nodes/*`.
    string name = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        type: "tpu.googleapis.com/Node"
      }
    ];
  }
  // Request for [CreateNode][google.cloud.tpu.v1.Tpu.CreateNode].
  message CreateNodeRequest {
    // Required. The parent resource name.
    //
    // The CreateNode HTTP rule binds this to `projects/*/locations/*`.
    string parent = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        child_type: "tpu.googleapis.com/Node"
      }
    ];

    // The unqualified resource name.
    string node_id = 2;

    // Required. The node. Sent as the HTTP request body of CreateNode.
    Node node = 3 [(google.api.field_behavior) = REQUIRED];
  }
  // Request for [DeleteNode][google.cloud.tpu.v1.Tpu.DeleteNode].
  message DeleteNodeRequest {
    // Required. The resource name.
    //
    // The DeleteNode HTTP rule binds this to `projects/*/locations/*/nodes/*`.
    string name = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        type: "tpu.googleapis.com/Node"
      }
    ];
  }
  // Request for [ReimageNode][google.cloud.tpu.v1.Tpu.ReimageNode].
  message ReimageNodeRequest {
    // The resource name.
    //
    // The ReimageNode HTTP rule binds this to
    // `projects/*/locations/*/nodes/*`.
    string name = 1;

    // The version for reimage to create.
    string tensorflow_version = 2;
  }
  // Request for [StopNode][google.cloud.tpu.v1.Tpu.StopNode].
  message StopNodeRequest {
    // The resource name.
    //
    // The StopNode HTTP rule binds this to `projects/*/locations/*/nodes/*`.
    string name = 1;
  }
  // Request for [StartNode][google.cloud.tpu.v1.Tpu.StartNode].
  message StartNodeRequest {
    // The resource name.
    //
    // The StartNode HTTP rule binds this to `projects/*/locations/*/nodes/*`.
    string name = 1;
  }
  // A TensorFlow version that a Node can be configured with.
  message TensorFlowVersion {
    // NOTE(review): the pattern below spells the collection
    // `tensorFlowVersions`, while the service's HTTP rules use
    // `tensorflowVersions`. Left unchanged here because generated resource
    // name helpers depend on it - confirm which spelling the backend returns.
    option (google.api.resource) = {
      type: "tpu.googleapis.com/TensorFlowVersion"
      pattern: "projects/{project}/locations/{location}/tensorFlowVersions/{tensor_flow_version}"
    };

    // The resource name.
    string name = 1;

    // The TensorFlow version.
    string version = 2;
  }
  // Request for
  // [GetTensorFlowVersion][google.cloud.tpu.v1.Tpu.GetTensorFlowVersion].
  message GetTensorFlowVersionRequest {
    // Required. The resource name.
    string name = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        type: "tpu.googleapis.com/TensorFlowVersion"
      }
    ];
  }
  // Request for
  // [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
  message ListTensorFlowVersionsRequest {
    // Required. The parent resource name.
    //
    // The ListTensorFlowVersions HTTP rule binds this to
    // `projects/*/locations/*`, i.e. the parent of the listed resources, so
    // the reference is declared with `child_type` rather than `type`.
    string parent = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        child_type: "tpu.googleapis.com/TensorFlowVersion"
      }
    ];

    // The maximum number of items to return.
    int32 page_size = 2;

    // The next_page_token value returned from a previous List request, if any.
    string page_token = 3;

    // List filter.
    string filter = 5;

    // Sort results.
    string order_by = 6;
  }
  // Response for
  // [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
  message ListTensorFlowVersionsResponse {
    // The listed TensorFlow versions.
    repeated TensorFlowVersion tensorflow_versions = 1;

    // The next page token or empty if none.
    string next_page_token = 2;

    // Locations that could not be reached.
    repeated string unreachable = 3;
  }
  // An accelerator type that a Node can be configured with.
  message AcceleratorType {
    option (google.api.resource) = {
      type: "tpu.googleapis.com/AcceleratorType"
      pattern: "projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}"
    };

    // The resource name.
    string name = 1;

    // The accelerator type.
    string type = 2;
  }
  // Request for
  // [GetAcceleratorType][google.cloud.tpu.v1.Tpu.GetAcceleratorType].
  message GetAcceleratorTypeRequest {
    // Required. The resource name.
    string name = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        type: "tpu.googleapis.com/AcceleratorType"
      }
    ];
  }
  // Request for
  // [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
  message ListAcceleratorTypesRequest {
    // Required. The parent resource name.
    //
    // The ListAcceleratorTypes HTTP rule binds this to
    // `projects/*/locations/*`, i.e. the parent of the listed resources, so
    // the reference is declared with `child_type` rather than `type`.
    string parent = 1 [
      (google.api.field_behavior) = REQUIRED,
      (google.api.resource_reference) = {
        child_type: "tpu.googleapis.com/AcceleratorType"
      }
    ];

    // The maximum number of items to return.
    int32 page_size = 2;

    // The next_page_token value returned from a previous List request, if any.
    string page_token = 3;

    // List filter.
    string filter = 5;

    // Sort results.
    string order_by = 6;
  }
  // Response for
  // [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
  message ListAcceleratorTypesResponse {
    // The listed accelerator types.
    repeated AcceleratorType accelerator_types = 1;

    // The next page token or empty if none.
    string next_page_token = 2;

    // Locations that could not be reached.
    repeated string unreachable = 3;
  }
  // Metadata describing an [Operation][google.longrunning.Operation].
  message OperationMetadata {
    // The time the operation was created.
    google.protobuf.Timestamp create_time = 1;

    // The time the operation finished running.
    google.protobuf.Timestamp end_time = 2;

    // Target of the operation - for example
    // projects/project-1/connectivityTests/test-1
    string target = 3;

    // Name of the verb executed by the operation.
    string verb = 4;

    // Human-readable status of the operation, if any.
    string status_detail = 5;

    // Specifies if cancellation was requested for the operation.
    bool cancel_requested = 6;

    // API version.
    string api_version = 7;
  }
  // A Symptom instance.
  message Symptom {
    // SymptomType represents the different types of Symptoms that a TPU can
    // exhibit.
    enum SymptomType {
      // Unspecified symptom.
      SYMPTOM_TYPE_UNSPECIFIED = 0;

      // TPU VM memory is low.
      LOW_MEMORY = 1;

      // TPU runtime is out of memory.
      OUT_OF_MEMORY = 2;

      // TPU runtime execution has timed out.
      EXECUTE_TIMED_OUT = 3;

      // TPU runtime fails to construct a mesh that recognizes each TPU
      // device's neighbors.
      MESH_BUILD_FAIL = 4;

      // TPU HBM is out of memory.
      HBM_OUT_OF_MEMORY = 5;

      // Abusive behaviors have been identified on the current project.
      PROJECT_ABUSE = 6;
    }

    // Timestamp when the Symptom is created.
    google.protobuf.Timestamp create_time = 1;

    // Type of the Symptom.
    SymptomType symptom_type = 2;

    // Detailed information of the current Symptom.
    string details = 3;

    // A string used to uniquely distinguish a worker within a TPU node.
    string worker_id = 4;
  }