environment.proto

// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.dataflow.v1beta3;

import "google/api/field_behavior.proto";
import "google/protobuf/any.proto";
import "google/protobuf/struct.proto";

option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
option go_package = "google.golang.org/genproto/googleapis/dataflow/v1beta3;dataflow";
option java_multiple_files = true;
option java_outer_classname = "EnvironmentProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";
// Describes the environment in which a Dataflow Job runs.
message Environment {
  // The prefix of the resources the system should use for temporary
  // storage. The system will append the suffix "/temp-{JOBNAME}" to
  // this resource prefix, where {JOBNAME} is the value of the
  // job_name field. The resulting bucket and object prefix is used
  // as the prefix of the resources used to store temporary data
  // needed during the job execution. NOTE: This will override the
  // value in taskrunner_settings.
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  // storage.googleapis.com/{bucket}/{object}
  // bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 1;

  // The type of cluster manager API to use. If unknown or
  // unspecified, the service will attempt to choose a reasonable
  // default. This should be in the form of the API service name,
  // e.g. "compute.googleapis.com".
  string cluster_manager_api_service = 2;

  // The list of experiments to enable. This field should be used for SDK
  // related experiments and not for service related experiments. The proper
  // field for service related experiments is service_options.
  repeated string experiments = 3;

  // The list of service options to enable. This field should be used for
  // service related experiments only. These experiments, when graduating to GA,
  // should be replaced by dedicated fields or become default (i.e. always on).
  repeated string service_options = 16;

  // If set, contains the Cloud KMS key identifier used to encrypt data
  // at rest, AKA a Customer Managed Encryption Key (CMEK).
  //
  // Format:
  // projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY
  string service_kms_key_name = 12;

  // The worker pools. At least one "harness" worker pool must be
  // specified in order for the job to have workers.
  repeated WorkerPool worker_pools = 4;

  // A description of the process that generated the request.
  google.protobuf.Struct user_agent = 5;

  // A structure describing which components and their versions of the service
  // are required in order to run the job.
  google.protobuf.Struct version = 6;

  // The dataset for the current project where various workflow
  // related tables are stored.
  //
  // The supported resource type is:
  //
  // Google BigQuery:
  // bigquery.googleapis.com/{dataset}
  string dataset = 7;

  // The Cloud Dataflow SDK pipeline options specified by the user. These
  // options are passed through the service and are used to recreate the
  // SDK pipeline options on the worker in a language agnostic and platform
  // independent way.
  google.protobuf.Struct sdk_pipeline_options = 8;

  // Experimental settings.
  google.protobuf.Any internal_experiments = 9;

  // Identity to run virtual machines as. Defaults to the default account.
  string service_account_email = 10;

  // Which Flexible Resource Scheduling mode to run in.
  FlexResourceSchedulingGoal flex_resource_scheduling_goal = 11;

  // The Compute Engine region
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1". Mutually exclusive
  // with worker_zone. If neither worker_region nor worker_zone is specified,
  // the service defaults to the control plane's region.
  string worker_region = 13;

  // The Compute Engine zone
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1-a". Mutually exclusive
  // with worker_region. If neither worker_region nor worker_zone is specified,
  // a zone in the control plane's region is chosen based on available capacity.
  string worker_zone = 14;

  // Output only. The shuffle mode used for the job.
  ShuffleMode shuffle_mode = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Any debugging options to be supplied to the job.
  DebugOptions debug_options = 17;
}
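
// A minimal Environment, sketched in protobuf text format. The project,
// bucket, key ring, and service account names are illustrative placeholders,
// not defaults provided by the service; the region echoes the "us-west1"
// example above, and worker_zone is left unset because it is mutually
// exclusive with worker_region.
//
//   temp_storage_prefix: "storage.googleapis.com/my-bucket/temp"
//   cluster_manager_api_service: "compute.googleapis.com"
//   service_kms_key_name: "projects/my-project/locations/us-west1/keyRings/my-ring/cryptoKeys/my-key"
//   service_account_email: "my-worker-sa@my-project.iam.gserviceaccount.com"
//   flex_resource_scheduling_goal: FLEXRS_COST_OPTIMIZED
//   worker_region: "us-west1"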

// The packages that must be installed in order for a worker to run the
// steps of the Cloud Dataflow job that will be assigned to its worker
// pool.
//
// This is the mechanism by which the Cloud Dataflow SDK causes code to
// be loaded onto the workers. For example, the Cloud Dataflow Java SDK
// might use this to install jars containing the user's code and all of the
// various dependencies (libraries, data files, etc.) required in order
// for that code to run.
message Package {
  // The name of the package.
  string name = 1;

  // The resource to read the package from. The supported resource type is:
  //
  // Google Cloud Storage:
  //
  // storage.googleapis.com/{bucket}
  // bucket.storage.googleapis.com/
  string location = 2;
}
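
// A Package entry, sketched in protobuf text format. The jar name and the
// staging bucket are assumed placeholders; the location uses the
// storage.googleapis.com form documented above, with an assumed object path
// appended.
//
//   name: "my-pipeline-bundled.jar"
//   location: "storage.googleapis.com/my-staging-bucket/my-pipeline-bundled.jar"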

// Specifies the processing model used by a
// [google.dataflow.v1beta3.Job], which determines the way the Job is
// managed by the Cloud Dataflow service (how workers are scheduled, how
// inputs are sharded, etc).
enum JobType {
  // The type of the job is unspecified, or unknown.
  JOB_TYPE_UNKNOWN = 0;

  // A batch job with a well-defined end point: data is read, data is
  // processed, data is written, and the job is done.
  JOB_TYPE_BATCH = 1;

  // A continuously streaming job with no end: data is read,
  // processed, and written continuously.
  JOB_TYPE_STREAMING = 2;
}

// Specifies the resource to optimize for in Flexible Resource Scheduling.
enum FlexResourceSchedulingGoal {
  // Run in the default mode.
  FLEXRS_UNSPECIFIED = 0;

  // Optimize for lower execution time.
  FLEXRS_SPEED_OPTIMIZED = 1;

  // Optimize for lower cost.
  FLEXRS_COST_OPTIMIZED = 2;
}

// Describes the data disk used by a workflow job.
message Disk {
  // Size of disk in GB. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 size_gb = 1;

  // Disk storage type, as defined by Google Compute Engine. This
  // must be a disk type appropriate to the project and zone in which
  // the workers will run. If unknown or unspecified, the service
  // will attempt to choose a reasonable default.
  //
  // For example, the standard persistent disk type is a resource name
  // typically ending in "pd-standard". If SSD persistent disks are
  // available, the resource name typically ends with "pd-ssd". The
  // actual valid values are defined by the Google Compute Engine API,
  // not by the Cloud Dataflow API; consult the Google Compute Engine
  // documentation for more information about determining the set of
  // available disk types for a particular project and zone.
  //
  // Google Compute Engine Disk types are local to a particular
  // project in a particular zone, and so the resource name will
  // typically look something like this:
  //
  // compute.googleapis.com/projects/project-id/zones/zone/diskTypes/pd-standard
  string disk_type = 2;

  // Directory in a VM where disk is mounted.
  string mount_point = 3;
}
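
// A data Disk, sketched in protobuf text format. The project, zone, and
// mount directory are placeholders; the disk type follows the resource-name
// pattern shown above.
//
//   size_gb: 50
//   disk_type: "compute.googleapis.com/projects/my-project/zones/us-west1-a/diskTypes/pd-ssd"
//   mount_point: "/mnt/dataflow-disk"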

// Provides data to pass through to the worker harness.
message WorkerSettings {
  // The base URL for accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs. The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/"
  string base_url = 1;

  // Whether to send work progress updates to the service.
  bool reporting_enabled = 2;

  // The Cloud Dataflow service path relative to the root URL, for example,
  // "dataflow/v1b3/projects".
  string service_path = 3;

  // The Shuffle service path relative to the root URL, for example,
  // "shuffle/v1beta1".
  string shuffle_service_path = 4;

  // The ID of the worker running this pipeline.
  string worker_id = 5;

  // The prefix of the resources the system should use for temporary
  // storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  // storage.googleapis.com/{bucket}/{object}
  // bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 6;
}
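
// WorkerSettings sketched in protobuf text format. The base URL and service
// paths echo the defaults and examples documented above; the worker ID and
// bucket are placeholders.
//
//   base_url: "http://www.googleapis.com/"
//   reporting_enabled: true
//   service_path: "dataflow/v1b3/projects"
//   shuffle_service_path: "shuffle/v1beta1"
//   worker_id: "wordcount-worker-0"
//   temp_storage_prefix: "storage.googleapis.com/my-bucket/temp"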

// Taskrunner configuration settings.
message TaskRunnerSettings {
  // The UNIX user ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "root".
  string task_user = 1;

  // The UNIX group ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "wheel".
  string task_group = 2;

  // The OAuth2 scopes to be requested by the taskrunner in order to
  // access the Cloud Dataflow API.
  repeated string oauth_scopes = 3;

  // The base URL for the taskrunner to use when accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs. The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/"
  string base_url = 4;

  // The API version of the endpoint, e.g. "v1b3".
  string dataflow_api_version = 5;

  // The settings to pass to the parallel worker harness.
  WorkerSettings parallel_worker_settings = 6;

  // The location on the worker for task-specific subdirectories.
  string base_task_dir = 7;

  // Whether to continue taskrunner if an exception is hit.
  bool continue_on_exception = 8;

  // Whether to send taskrunner log info to Google Compute Engine VM serial
  // console.
  bool log_to_serialconsole = 9;

  // Whether to also send taskrunner log info to stderr.
  bool alsologtostderr = 10;

  // Indicates where to put logs. If this is not specified, the logs
  // will not be uploaded.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  // storage.googleapis.com/{bucket}/{object}
  // bucket.storage.googleapis.com/{object}
  string log_upload_location = 11;

  // The directory on the VM to store logs.
  string log_dir = 12;

  // The prefix of the resources the taskrunner should use for
  // temporary storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  // storage.googleapis.com/{bucket}/{object}
  // bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 13;

  // The command to launch the worker harness.
  string harness_command = 14;

  // The file to store the workflow in.
  string workflow_file_name = 15;

  // The file to store preprocessing commands in.
  string commandlines_file_name = 16;

  // The ID string of the VM.
  string vm_id = 17;

  // The suggested backend language.
  string language_hint = 18;

  // The streaming worker main class name.
  string streaming_worker_main_class = 19;
}

// Specifies what happens to a resource when a Cloud Dataflow
// [google.dataflow.v1beta3.Job][google.dataflow.v1beta3.Job] has completed.
enum TeardownPolicy {
  // The teardown policy isn't specified, or is unknown.
  TEARDOWN_POLICY_UNKNOWN = 0;

  // Always teardown the resource.
  TEARDOWN_ALWAYS = 1;

  // Teardown the resource on success. This is useful for debugging
  // failures.
  TEARDOWN_ON_SUCCESS = 2;

  // Never teardown the resource. This is useful for debugging and
  // development.
  TEARDOWN_NEVER = 3;
}

// The default set of packages to be staged on a pool of workers.
enum DefaultPackageSet {
  // The default set of packages to stage is unknown, or unspecified.
  DEFAULT_PACKAGE_SET_UNKNOWN = 0;

  // Indicates that no packages should be staged at the worker unless
  // explicitly specified by the job.
  DEFAULT_PACKAGE_SET_NONE = 1;

  // Stage packages typically useful to workers written in Java.
  DEFAULT_PACKAGE_SET_JAVA = 2;

  // Stage packages typically useful to workers written in Python.
  DEFAULT_PACKAGE_SET_PYTHON = 3;
}

// Specifies the algorithm used to determine the number of worker
// processes to run at any given point in time, based on the amount of
// data left to process, the number of workers, and how quickly
// existing workers are processing data.
enum AutoscalingAlgorithm {
  // The algorithm is unknown, or unspecified.
  AUTOSCALING_ALGORITHM_UNKNOWN = 0;

  // Disable autoscaling.
  AUTOSCALING_ALGORITHM_NONE = 1;

  // Increase worker count over time to reduce job execution time.
  AUTOSCALING_ALGORITHM_BASIC = 2;
}

// Settings for WorkerPool autoscaling.
message AutoscalingSettings {
  // The algorithm to use for autoscaling.
  AutoscalingAlgorithm algorithm = 1;

  // The maximum number of workers to cap scaling at.
  int32 max_num_workers = 2;
}
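
// AutoscalingSettings sketched in protobuf text format; the worker cap is an
// arbitrary placeholder.
//
//   algorithm: AUTOSCALING_ALGORITHM_BASIC
//   max_num_workers: 50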

// Specifies how IP addresses should be allocated to the worker machines.
enum WorkerIPAddressConfiguration {
  // The configuration is unknown, or unspecified.
  WORKER_IP_UNSPECIFIED = 0;

  // Workers should have public IP addresses.
  WORKER_IP_PUBLIC = 1;

  // Workers should have private IP addresses.
  WORKER_IP_PRIVATE = 2;
}

// Defines an SDK harness container for executing Dataflow pipelines.
message SdkHarnessContainerImage {
  // A docker container image that resides in Google Container Registry.
  string container_image = 1;

  // If true, recommends that the Dataflow service use only one core per SDK
  // container instance with this image. If false (or unset), recommends using
  // more than one core per SDK container instance with this image for
  // efficiency. Note that the Dataflow service may choose to override this
  // property if needed.
  bool use_single_core_per_container = 2;

  // Environment ID for the Beam runner API proto Environment that corresponds
  // to the current SDK Harness.
  string environment_id = 3;

  // The set of capabilities enumerated in the above Environment proto. See also
  // https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/beam_runner_api.proto
  repeated string capabilities = 4;
}
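
// An SdkHarnessContainerImage sketched in protobuf text format. The image
// path and environment ID are placeholders; the capability shown is one
// illustrative URN of the kind listed in the Beam runner API proto linked
// above.
//
//   container_image: "gcr.io/my-project/my-beam-sdk-harness:latest"
//   use_single_core_per_container: false
//   environment_id: "my_environment_1"
//   capabilities: "beam:coder:bytes:v1"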

// Describes one particular pool of Cloud Dataflow workers to be
// instantiated by the Cloud Dataflow service in order to perform the
// computations required by a job. Note that a workflow job may use
// multiple pools, in order to match the various computational
// requirements of the various stages of the job.
message WorkerPool {
  // The kind of the worker pool; currently only `harness` and `shuffle`
  // are supported.
  string kind = 1;

  // Number of Google Compute Engine workers in this pool needed to
  // execute the job. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 num_workers = 2;

  // Packages to be installed on workers.
  repeated Package packages = 3;

  // The default package set to install. This allows the service to
  // select a default set of packages which are useful to worker
  // harnesses written in a particular language.
  DefaultPackageSet default_package_set = 4;

  // Machine type (e.g. "n1-standard-1"). If empty or unspecified, the
  // service will attempt to choose a reasonable default.
  string machine_type = 5;

  // Sets the policy for determining when to turn down the worker pool.
  // Allowed values are: `TEARDOWN_ALWAYS`, `TEARDOWN_ON_SUCCESS`, and
  // `TEARDOWN_NEVER`.
  // `TEARDOWN_ALWAYS` means workers are always torn down regardless of whether
  // the job succeeds. `TEARDOWN_ON_SUCCESS` means workers are torn down
  // if the job succeeds. `TEARDOWN_NEVER` means the workers are never torn
  // down.
  //
  // If the workers are not torn down by the service, they will
  // continue to run and use Google Compute Engine VM resources in the
  // user's project until they are explicitly terminated by the user.
  // Because of this, Google recommends using the `TEARDOWN_ALWAYS`
  // policy except for small, manually supervised test jobs.
  //
  // If unknown or unspecified, the service will attempt to choose a reasonable
  // default.
  TeardownPolicy teardown_policy = 6;

  // Size of root disk for VMs, in GB. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 disk_size_gb = 7;

  // Type of root disk for VMs. If empty or unspecified, the service will
  // attempt to choose a reasonable default.
  string disk_type = 16;

  // Fully qualified source image for disks.
  string disk_source_image = 8;

  // Zone to run the worker pools in. If empty or unspecified, the service
  // will attempt to choose a reasonable default.
  string zone = 9;

  // Settings passed through to Google Compute Engine workers when
  // using the standard Dataflow task runner. Users should ignore
  // this field.
  TaskRunnerSettings taskrunner_settings = 10;

  // The action to take on host maintenance, as defined by the Google
  // Compute Engine API.
  string on_host_maintenance = 11;

  // Data disks that are used by a VM in this workflow.
  repeated Disk data_disks = 12;

  // Metadata to set on the Google Compute Engine VMs.
  map<string, string> metadata = 13;

  // Settings for autoscaling of this WorkerPool.
  AutoscalingSettings autoscaling_settings = 14;

  // Extra arguments for this worker pool.
  google.protobuf.Any pool_args = 15;

  // Network to which VMs will be assigned. If empty or unspecified,
  // the service will use the network "default".
  string network = 17;

  // Subnetwork to which VMs will be assigned, if desired. Expected to be of
  // the form "regions/REGION/subnetworks/SUBNETWORK".
  string subnetwork = 19;

  // Required. Docker container image that executes the Cloud Dataflow worker
  // harness, residing in Google Container Registry.
  //
  // Deprecated for the Fn API path. Use sdk_harness_container_images instead.
  string worker_harness_container_image = 18;

  // The number of threads per worker harness. If empty or unspecified, the
  // service will choose a number of threads (according to the number of cores
  // on the selected machine type for batch, or 1 by convention for streaming).
  int32 num_threads_per_worker = 20;

  // Configuration for VM IPs.
  WorkerIPAddressConfiguration ip_configuration = 21;

  // Set of SDK harness containers needed to execute this pipeline. This will
  // only be set in the Fn API path. For non-cross-language pipelines this
  // should have only one entry. Cross-language pipelines will have two or more
  // entries.
  repeated SdkHarnessContainerImage sdk_harness_container_images = 22;
}
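
// A "harness" WorkerPool sketched in protobuf text format. Machine type,
// zone, subnetwork, and worker counts are placeholders; in practice the
// service chooses reasonable defaults for anything left unset.
//
//   kind: "harness"
//   num_workers: 3
//   machine_type: "n1-standard-1"
//   teardown_policy: TEARDOWN_ALWAYS
//   disk_size_gb: 50
//   zone: "us-west1-a"
//   network: "default"
//   subnetwork: "regions/us-west1/subnetworks/my-subnetwork"
//   ip_configuration: WORKER_IP_PRIVATE
//   autoscaling_settings {
//     algorithm: AUTOSCALING_ALGORITHM_BASIC
//     max_num_workers: 10
//   }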

// Specifies the shuffle mode used by a
// [google.dataflow.v1beta3.Job], which determines how data is shuffled
// during processing. More details in:
// https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-shuffle
enum ShuffleMode {
  // Shuffle mode information is not available.
  SHUFFLE_MODE_UNSPECIFIED = 0;

  // Shuffle is done on the worker VMs.
  VM_BASED = 1;

  // Shuffle is done on the service side.
  SERVICE_BASED = 2;
}

// Describes any options that have an effect on the debugging of pipelines.
message DebugOptions {
  // When true, enables the logging of the literal hot key to the user's Cloud
  // Logging.
  bool enable_hot_key_logging = 1;
}