storage.proto

// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1beta1/arrow.proto";
import "google/cloud/bigquery/storage/v1beta1/avro.proto";
import "google/cloud/bigquery/storage/v1beta1/read_options.proto";
import "google/cloud/bigquery/storage/v1beta1/table_reference.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/storage/v1beta1;storage";
option java_package = "com.google.cloud.bigquery.storage.v1beta1";

// BigQuery storage API.
//
// The BigQuery storage API can be used to read data stored in BigQuery.
service BigQueryStorage {
  option (google.api.default_host) = "bigquerystorage.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/bigquery,"
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a new read session. A read session divides the contents of a
  // BigQuery table into one or more streams, which can then be used to read
  // data from the table. The read session also specifies properties of the
  // data to be read, such as a list of columns or a push-down filter describing
  // the rows to be returned.
  //
  // A particular row can be read by at most one stream. When the caller has
  // reached the end of each stream in the session, then all the data in the
  // table has been read.
  //
  // Read sessions automatically expire 24 hours after they are created and do
  // not require manual clean-up by the caller.
  rpc CreateReadSession(CreateReadSessionRequest) returns (ReadSession) {
    option (google.api.http) = {
      post: "/v1beta1/{table_reference.project_id=projects/*}"
      body: "*"
      additional_bindings {
        post: "/v1beta1/{table_reference.dataset_id=projects/*/datasets/*}"
        body: "*"
      }
    };
    option (google.api.method_signature) =
        "table_reference,parent,requested_streams";
  }
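
  // Example: a minimal sketch of calling CreateReadSession from the pre-2.0
  // `google-cloud-bigquery-storage` Python client, which targets this v1beta1
  // surface. The table and billing project below are placeholders.
  //
  //   from google.cloud import bigquery_storage_v1beta1
  //
  //   client = bigquery_storage_v1beta1.BigQueryStorageClient()
  //   table_ref = bigquery_storage_v1beta1.types.TableReference(
  //       project_id="bigquery-public-data",
  //       dataset_id="usa_names",
  //       table_id="usa_1910_current",
  //   )
  //   # `requested_streams` is an upper bound; the server may return fewer.
  //   session = client.create_read_session(
  //       table_ref, "projects/your-billing-project", requested_streams=1
  //   )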

  // Reads rows from the table in the format prescribed by the read session.
  // Each response contains one or more table rows, up to a maximum of 10 MiB
  // per response; read requests which attempt to read individual rows larger
  // than this will fail.
  //
  // Each request also returns a set of stream statistics reflecting the
  // estimated total number of rows in the read stream. This number is computed
  // based on the total table size and the number of active streams in the read
  // session, and may change as other streams continue to read data.
  rpc ReadRows(ReadRowsRequest) returns (stream ReadRowsResponse) {
    option (google.api.http) = {
      get: "/v1beta1/{read_position.stream.name=projects/*/streams/*}"
    };
    option (google.api.method_signature) = "read_position";
  }
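
  // Example: continuing the sketch above, reading every row assigned to the
  // first stream. `reader.rows(session)` decodes the serialized blocks using
  // the session's schema.
  //
  //   position = bigquery_storage_v1beta1.types.StreamPosition(
  //       stream=session.streams[0]
  //   )
  //   reader = client.read_rows(position)
  //   for row in reader.rows(session):
  //       process(row)  # `process` is a placeholder for caller logic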

  // Creates additional streams for a ReadSession. This API can be used to
  // dynamically adjust the parallelism of a batch processing task upwards by
  // adding additional workers.
  rpc BatchCreateReadSessionStreams(BatchCreateReadSessionStreamsRequest)
      returns (BatchCreateReadSessionStreamsResponse) {
    option (google.api.http) = {
      post: "/v1beta1/{session.name=projects/*/sessions/*}"
      body: "*"
    };
    option (google.api.method_signature) = "session,requested_streams";
  }
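
  // Example: an illustrative sketch of scaling up by requesting two more
  // streams for an existing session; the server may add fewer than requested.
  //
  //   response = client.batch_create_read_session_streams(
  //       session, requested_streams=2
  //   )
  //   for stream in response.streams:
  //       pass  # hand each new stream to an additional worker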

  // Triggers the graceful termination of a single stream in a ReadSession. This
  // API can be used to dynamically adjust the parallelism of a batch processing
  // task downwards without losing data.
  //
  // This API does not delete the stream -- it remains visible in the
  // ReadSession, and any data processed by the stream is not released to other
  // streams. However, no additional data will be assigned to the stream once
  // this call completes. Callers must continue reading data on the stream until
  // the end of the stream is reached so that data which has already been
  // assigned to the stream will be processed.
  //
  // This method will return an error if there are no other live streams
  // in the Session, or if SplitReadStream() has been called on the given
  // Stream.
  rpc FinalizeStream(FinalizeStreamRequest) returns (google.protobuf.Empty) {
    option (google.api.http) = {
      post: "/v1beta1/{stream.name=projects/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "stream";
  }
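
  // Example: an illustrative sketch of scaling down. Note that the finalized
  // stream must still be read to its end so already-assigned rows are
  // processed.
  //
  //   client.finalize_stream(session.streams[-1])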

  // Splits a given read stream into two Streams. These streams are referred to
  // as the primary and the residual of the split. The original stream can still
  // be read from in the same manner as before. Both of the returned streams can
  // also be read from, and the total rows returned by both child streams will
  // be the same as the rows read from the original stream.
  //
  // Moreover, the two child streams will be allocated back to back in the
  // original Stream. Concretely, it is guaranteed that for streams Original,
  // Primary, and Residual, that Original[0-j] = Primary[0-j] and
  // Original[j-n] = Residual[0-m] once the streams have been read to
  // completion.
  //
  // This method is guaranteed to be idempotent.
  rpc SplitReadStream(SplitReadStreamRequest) returns (SplitReadStreamResponse) {
    option (google.api.http) = {
      get: "/v1beta1/{original_stream.name=projects/*/streams/*}"
    };
    option (google.api.method_signature) = "original_stream";
  }
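
  // Example: an illustrative sketch of splitting a stream roughly in half.
  // Because `fraction` is evaluated on pre-filtered rows, the actual division
  // between the children may differ.
  //
  //   split = client.split_read_stream(session.streams[0], fraction=0.5)
  //   primary, residual = split.primary_stream, split.remainder_stream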
}

// Information about a single data stream within a read session.
message Stream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/Stream"
    pattern: "projects/{project}/locations/{location}/streams/{stream}"
  };

  // Name of the stream, in the form
  // `projects/{project_id}/locations/{location}/streams/{stream_id}`.
  string name = 1;
}

// Expresses a point within a given stream using an offset position.
message StreamPosition {
  // Identifier for a given Stream.
  Stream stream = 1;

  // Position in the stream.
  int64 offset = 2;
}

// Information returned from a `CreateReadSession` request.
message ReadSession {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadSession"
    pattern: "projects/{project}/locations/{location}/sessions/{session}"
  };

  // Unique identifier for the session, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}`.
  string name = 1;

  // Time at which the session becomes invalid. After this time, subsequent
  // requests to read this Session will return errors.
  google.protobuf.Timestamp expire_time = 2;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields.
  oneof schema {
    // Avro schema.
    AvroSchema avro_schema = 5;

    // Arrow schema.
    ArrowSchema arrow_schema = 6;
  }

  // Streams associated with this session.
  repeated Stream streams = 4;

  // Table that this ReadSession is reading from.
  TableReference table_reference = 7;

  // Any modifiers which are applied when reading from the specified table.
  TableModifiers table_modifiers = 8;

  // The strategy to use for distributing data among the streams.
  ShardingStrategy sharding_strategy = 9;
}

// Creates a new read session, which may include additional options such as
// requested parallelism, projection filters and constraints.
message CreateReadSessionRequest {
  // Required. Reference to the table to read.
  TableReference table_reference = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. String of the form `projects/{project_id}` indicating the
  // project this ReadSession is associated with. This is the project that will
  // be billed for usage.
  string parent = 6 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "cloudresourcemanager.googleapis.com/Project"
    }
  ];

  // Any modifiers to the Table (e.g. snapshot timestamp).
  TableModifiers table_modifiers = 2;

  // Initial number of streams. If unset or 0, the server will provide a
  // number of streams chosen to produce reasonable throughput. Must be
  // non-negative. The number of streams may be lower than the requested number,
  // depending on the amount of parallelism that is reasonable for the table and
  // the maximum amount of parallelism allowed by the system.
  //
  // Streams must be read starting from offset 0.
  int32 requested_streams = 3;

  // Read options for this session (e.g. column selection, filters).
  TableReadOptions read_options = 4;

  // Data output format. Currently defaults to Avro.
  DataFormat format = 5;

  // The strategy to use for distributing data among multiple streams. Currently
  // defaults to liquid sharding.
  ShardingStrategy sharding_strategy = 7;
}
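
// Example: a sketch of the optional request fields above using the pre-2.0
// Python client; the selected columns and row filter are placeholders.
//
//   read_options = bigquery_storage_v1beta1.types.TableReadOptions(
//       selected_fields=["name", "number"],
//       row_restriction='state = "WA"',
//   )
//   session = client.create_read_session(
//       table_ref,
//       "projects/your-billing-project",
//       read_options=read_options,
//       format_=bigquery_storage_v1beta1.enums.DataFormat.AVRO,
//       sharding_strategy=bigquery_storage_v1beta1.enums.ShardingStrategy.LIQUID,
//   )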

// Data format for input or output data.
enum DataFormat {
  // Data format is unspecified.
  DATA_FORMAT_UNSPECIFIED = 0;

  // Avro is a standard open source row based file format.
  // See https://avro.apache.org/ for more details.
  AVRO = 1;

  // Arrow is a standard open source column-based message format.
  // See https://arrow.apache.org/ for more details.
  ARROW = 3;
}

// Strategy for distributing data among multiple streams in a read session.
enum ShardingStrategy {
  // Same as LIQUID.
  SHARDING_STRATEGY_UNSPECIFIED = 0;

  // Assigns data to each stream based on the client's read rate. The faster the
  // client reads from a stream, the more data is assigned to the stream. In
  // this strategy, it's possible to read all data from a single stream even if
  // there are other streams present.
  LIQUID = 1;

  // Assigns data to each stream such that roughly the same number of rows can
  // be read from each stream. Because the server-side unit for assigning data
  // is collections of rows, the API does not guarantee that each stream will
  // return the same number of rows. Additionally, the limits are enforced based
  // on the number of pre-filtering rows, so some filters can lead to lopsided
  // assignments.
  BALANCED = 2;
}

// A request to read row data via `ReadRows` must provide Stream position
// information.
message ReadRowsRequest {
  // Required. Identifier of the position in the stream to start reading from.
  // The offset requested must be less than the last row read from ReadRows.
  // Requesting a larger offset is undefined.
  StreamPosition read_position = 1 [(google.api.field_behavior) = REQUIRED];
}

// Progress information for a given Stream.
message StreamStatus {
  // Number of estimated rows in the current stream. May change over time as
  // different readers in the stream progress at rates which are relatively fast
  // or slow.
  int64 estimated_row_count = 1;

  // A value in the range [0.0, 1.0] that represents the fraction of rows
  // assigned to this stream that have been processed by the server. In the
  // presence of read filters, the server may process more rows than it returns,
  // so this value reflects progress through the pre-filtering rows.
  //
  // This value is only populated for sessions created through the BALANCED
  // sharding strategy.
  float fraction_consumed = 2;

  // Represents the progress of the current stream.
  Progress progress = 4;

  // Whether this stream can be split. For sessions that use the LIQUID sharding
  // strategy, this value is always false. For BALANCED sessions, this value is
  // false when enough data has been read such that no more splits are possible
  // at that point or beyond. For small tables or streams that are the result of
  // a chain of splits, this value may never be true.
  bool is_splittable = 3;
}

// Progress information with respect to the rows of a stream.
message Progress {
  // The fraction of rows assigned to the stream that have been processed by the
  // server so far, not including the rows in the current response message.
  //
  // This value, along with `at_response_end`, can be used to interpolate the
  // progress made as the rows in the message are being processed using the
  // following formula: `at_response_start + (at_response_end -
  // at_response_start) * rows_processed_from_response / rows_in_response`.
  //
  // Note that if a filter is provided, the `at_response_end` value of the
  // previous response may not necessarily be equal to the `at_response_start`
  // value of the current response.
  float at_response_start = 1;

  // Similar to `at_response_start`, except that this value includes the rows in
  // the current response.
  float at_response_end = 2;
}
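
// Worked example of the interpolation formula above, with invented values:
// if at_response_start = 0.25, at_response_end = 0.35, and the response holds
// 200 rows, then after processing 50 of them the reader is at approximately
// 0.25 + (0.35 - 0.25) * 50 / 200 = 0.275 of the stream.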

// Information on whether the current connection is being throttled.
message ThrottleStatus {
  // How much this connection is being throttled.
  // 0 is no throttling, 100 is completely throttled.
  int32 throttle_percent = 1;
}

// Response from calling `ReadRows` may include row data, progress and
// throttling information.
message ReadRowsResponse {
  // Row data is returned in the format specified during session creation.
  oneof rows {
    // Serialized row data in AVRO format.
    AvroRows avro_rows = 3;

    // Serialized row data in Arrow RecordBatch format.
    ArrowRecordBatch arrow_record_batch = 4;
  }

  // Number of serialized rows in the rows block. This value is recorded here,
  // in addition to the row_count values in the output-specific messages in
  // `rows`, so that code which needs to record progress through the stream can
  // do so in an output format-independent way.
  int64 row_count = 6;

  // Estimated stream statistics.
  StreamStatus status = 2;

  // Throttling status. If unset, the latest response still describes
  // the current throttling status.
  ThrottleStatus throttle_status = 5;
}
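
// Example: format-independent progress tracking via `row_count`, as the
// comment above suggests. This sketch iterates the raw responses of the
// pre-2.0 Python client's read_rows stream (an assumption about that
// client's iteration behavior).
//
//   position = bigquery_storage_v1beta1.types.StreamPosition(
//       stream=session.streams[0]
//   )
//   rows_seen = 0
//   for response in client.read_rows(position):
//       rows_seen += response.row_count  # works for both Avro and Arrow
//       print("progress:", response.status.progress.at_response_end)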

// Information needed to request additional streams for an established read
// session.
message BatchCreateReadSessionStreamsRequest {
  // Required. Must be a non-expired session obtained from a call to
  // CreateReadSession. Only the name field needs to be set.
  ReadSession session = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Number of new streams requested. Must be positive.
  // Number of added streams may be less than this, see CreateReadSessionRequest
  // for more information.
  int32 requested_streams = 2 [(google.api.field_behavior) = REQUIRED];
}

// The response from `BatchCreateReadSessionStreams` returns the stream
// identifiers for the newly created streams.
message BatchCreateReadSessionStreamsResponse {
  // Newly added streams.
  repeated Stream streams = 1;
}

// Request information for invoking `FinalizeStream`.
message FinalizeStreamRequest {
  // Required. Stream to finalize.
  Stream stream = 2 [(google.api.field_behavior) = REQUIRED];
}

// Request information for `SplitReadStream`.
message SplitReadStreamRequest {
  // Required. Stream to split.
  Stream original_stream = 1 [(google.api.field_behavior) = REQUIRED];

  // A value in the range (0.0, 1.0) that specifies the fractional point at
  // which the original stream should be split. The actual split point is
  // evaluated on pre-filtered rows, so if a filter is provided, then there is
  // no guarantee that the division of the rows between the new child streams
  // will be proportional to this fractional value. Additionally, because the
  // server-side unit for assigning data is collections of rows, this fraction
  // will always map to a data storage boundary on the server side.
  float fraction = 2;
}

// Response from `SplitReadStream`.
message SplitReadStreamResponse {
  // Primary stream, which contains the beginning portion of
  // |original_stream|. An empty value indicates that the original stream can no
  // longer be split.
  Stream primary_stream = 1;

  // Remainder stream, which contains the tail of |original_stream|. An empty
  // value indicates that the original stream can no longer be split.
  Stream remainder_stream = 2;
}