streaming.proto

// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.dataflow.v1beta3;

option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
option go_package = "google.golang.org/genproto/googleapis/dataflow/v1beta3;dataflow";
option java_multiple_files = true;
option java_outer_classname = "StreamingProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";

// Global topology of the streaming Dataflow job, including all
// computations and their sharded locations.
message TopologyConfig {
  // The computations associated with a streaming Dataflow job.
  repeated ComputationTopology computations = 1;

  // The disks assigned to a streaming Dataflow job.
  repeated DataDiskAssignment data_disk_assignments = 2;

  // Maps user stage names to stable computation names.
  map<string, string> user_stage_to_computation_name_map = 3;

  // The size (in bits) of keys that will be assigned to source messages.
  int32 forwarding_key_bits = 4;

  // Version number for persistent state.
  int32 persistent_state_version = 5;
}
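
// Illustrative only: a minimal TopologyConfig, with hypothetical computation
// and stage names (not taken from any real job), might look like this in
// textproto form:
//
//   computations { computation_id: "s01" system_stage_name: "F15" }
//   user_stage_to_computation_name_map { key: "MyParDo" value: "s01" }
//   forwarding_key_bits: 16
//   persistent_state_version: 1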

// Identifies a pubsub location to use for transferring data into or
// out of a streaming Dataflow job.
message PubsubLocation {
  // A pubsub topic, in the form of
  // "pubsub.googleapis.com/topics/<project-id>/<topic-name>"
  string topic = 1;

  // A pubsub subscription, in the form of
  // "pubsub.googleapis.com/subscriptions/<project-id>/<subscription-name>"
  string subscription = 2;

  // If set, contains a pubsub label from which to extract record timestamps.
  // If left empty, record timestamps will be generated upon arrival.
  string timestamp_label = 3;

  // If set, contains a pubsub label from which to extract record ids.
  // If left empty, record deduplication will be strictly best effort.
  string id_label = 4;

  // Indicates whether the pipeline allows late-arriving data.
  bool drop_late_data = 5;

  // If set, specifies the pubsub subscription that will be used for tracking
  // custom time timestamps for watermark estimation.
  string tracking_subscription = 6;

  // If true, then the client has requested to get pubsub attributes.
  bool with_attributes = 7;
}
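
// Illustrative only: a PubsubLocation reading from a subscription, with
// hypothetical project and label names, could be written in textproto as:
//
//   subscription: "pubsub.googleapis.com/subscriptions/my-project/my-sub"
//   timestamp_label: "event_time"
//   with_attributes: true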

// Identifies the location of a streaming computation stage, for
// stage-to-stage communication.
message StreamingStageLocation {
  // Identifies the particular stream within the streaming Dataflow
  // job.
  string stream_id = 1;
}

// Identifies the location of a streaming side input.
message StreamingSideInputLocation {
  // Identifies the particular side input within the streaming Dataflow job.
  string tag = 1;

  // Identifies the state family where this side input is stored.
  string state_family = 2;
}

// Identifies the location of a custom source.
message CustomSourceLocation {
  // Whether this source is stateful.
  bool stateful = 1;
}

// Describes a stream of data, either as input to be processed or as
// output of a streaming Dataflow job.
message StreamLocation {
  // A specification of a stream's location.
  oneof location {
    // The stream is part of another computation within the current
    // streaming Dataflow job.
    StreamingStageLocation streaming_stage_location = 1;

    // The stream is a pubsub stream.
    PubsubLocation pubsub_location = 2;

    // The stream is a streaming side input.
    StreamingSideInputLocation side_input_location = 3;

    // The stream is a custom source.
    CustomSourceLocation custom_source_location = 4;
  }
}
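
// Illustrative only: because `location` is a oneof, exactly one of the four
// variants is set per StreamLocation. A hypothetical pubsub input in
// textproto:
//
//   pubsub_location {
//     topic: "pubsub.googleapis.com/topics/my-project/my-topic"
//   }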

// State family configuration.
message StateFamilyConfig {
  // The state family value.
  string state_family = 1;

  // If true, this family corresponds to a read operation.
  bool is_read = 2;
}

// All configuration data for a particular Computation.
message ComputationTopology {
  // The system stage name.
  string system_stage_name = 1;

  // The ID of the computation.
  string computation_id = 5;

  // The key ranges processed by the computation.
  repeated KeyRangeLocation key_ranges = 2;

  // The inputs to the computation.
  repeated StreamLocation inputs = 3;

  // The outputs from the computation.
  repeated StreamLocation outputs = 4;

  // The state family values.
  repeated StateFamilyConfig state_families = 7;
}
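
// Illustrative only: a hypothetical computation that consumes the output of
// another stage and publishes to pubsub, sketched in textproto:
//
//   system_stage_name: "F22"
//   computation_id: "s02"
//   inputs { streaming_stage_location { stream_id: "s01-output" } }
//   outputs {
//     pubsub_location { topic: "pubsub.googleapis.com/topics/my-project/results" }
//   }
//   state_families { state_family: "sf1" is_read: true }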

// Location information for a specific key-range of a sharded computation.
// Currently we only support UTF-8 character splits to simplify encoding into
// JSON.
message KeyRangeLocation {
  // The start (inclusive) of the key range.
  string start = 1;

  // The end (exclusive) of the key range.
  string end = 2;

  // The physical location of this range assignment to be used for
  // streaming computation cross-worker message delivery.
  string delivery_endpoint = 3;

  // The name of the data disk where data for this range is stored.
  // This name is local to the Google Cloud Platform project and uniquely
  // identifies the disk within that project, for example
  // "myproject-1014-104817-4c2-harness-0-disk-1".
  string data_disk = 5;

  // DEPRECATED. The location of the persistent state for this range, as a
  // persistent directory in the worker local filesystem.
  string deprecated_persistent_directory = 4 [deprecated = true];
}
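
// Illustrative only: since splits are UTF-8 character boundaries, a
// hypothetical range covering keys in ["a", "m"), stored on the example disk
// named above, might be:
//
//   start: "a"
//   end: "m"
//   data_disk: "myproject-1014-104817-4c2-harness-0-disk-1"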

// Describes a mounted data disk.
message MountedDataDisk {
  // The name of the data disk.
  // This name is local to the Google Cloud Platform project and uniquely
  // identifies the disk within that project, for example
  // "myproject-1014-104817-4c2-harness-0-disk-1".
  string data_disk = 1;
}

// Data disk assignment for a given VM instance.
message DataDiskAssignment {
  // VM instance name the data disks are mounted to, for example
  // "myproject-1014-104817-4c2-harness-0".
  string vm_instance = 1;

  // Mounted data disks. The order is important: a data disk's 0-based index
  // in this list defines which persistent directory the disk is mounted to,
  // for example the list of { "myproject-1014-104817-4c2-harness-0-disk-0" },
  // { "myproject-1014-104817-4c2-harness-0-disk-1" }.
  repeated string data_disks = 2;
}
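
// Illustrative only: in the hypothetical assignment below, the disk at index
// 0 of `data_disks` is mounted to persistent directory 0 on the VM, the disk
// at index 1 to directory 1, and so on:
//
//   vm_instance: "myproject-1014-104817-4c2-harness-0"
//   data_disks: "myproject-1014-104817-4c2-harness-0-disk-0"
//   data_disks: "myproject-1014-104817-4c2-harness-0-disk-1"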

// Data disk assignment information for a specific key-range of a sharded
// computation.
// Currently we only support UTF-8 character splits to simplify encoding into
// JSON.
message KeyRangeDataDiskAssignment {
  // The start (inclusive) of the key range.
  string start = 1;

  // The end (exclusive) of the key range.
  string end = 2;

  // The name of the data disk where data for this range is stored.
  // This name is local to the Google Cloud Platform project and uniquely
  // identifies the disk within that project, for example
  // "myproject-1014-104817-4c2-harness-0-disk-1".
  string data_disk = 3;
}

// Describes full or partial data disk assignment information of the
// computation ranges.
message StreamingComputationRanges {
  // The ID of the computation.
  string computation_id = 1;

  // Data disk assignments for ranges from this computation.
  repeated KeyRangeDataDiskAssignment range_assignments = 2;
}
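
// Illustrative only: a hypothetical partial assignment for one computation,
// nesting one KeyRangeDataDiskAssignment per range:
//
//   computation_id: "s01"
//   range_assignments {
//     start: "a"
//     end: "m"
//     data_disk: "myproject-1014-104817-4c2-harness-0-disk-1"
//   }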

// Streaming appliance snapshot configuration.
message StreamingApplianceSnapshotConfig {
  // If set, indicates the snapshot id for the snapshot being performed.
  string snapshot_id = 1;

  // Indicates which endpoint is used to import appliance state.
  string import_state_endpoint = 2;
}