- // Copyright 2022 Google LLC
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- syntax = "proto3";
- package google.dataflow.v1beta3;
- import "google/api/field_behavior.proto";
- import "google/protobuf/any.proto";
- import "google/protobuf/struct.proto";
- option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
- option go_package = "google.golang.org/genproto/googleapis/dataflow/v1beta3;dataflow";
- option java_multiple_files = true;
- option java_outer_classname = "EnvironmentProto";
- option java_package = "com.google.dataflow.v1beta3";
- option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
- option ruby_package = "Google::Cloud::Dataflow::V1beta3";
- // Describes the environment in which a Dataflow Job runs.
- message Environment {
- // The prefix of the resources the system should use for temporary
- // storage. The system will append the suffix "/temp-{JOBNAME}" to
- // this resource prefix, where {JOBNAME} is the value of the
- // job_name field. The resulting bucket and object prefix is used
- // as the prefix of the resources used to store temporary data
- // needed during the job execution. NOTE: This will override the
- // value in taskrunner_settings.
- // The supported resource type is:
- //
- // Google Cloud Storage:
- //
- // storage.googleapis.com/{bucket}/{object}
- // bucket.storage.googleapis.com/{object}
- string temp_storage_prefix = 1;
- // The type of cluster manager API to use. If unknown or
- // unspecified, the service will attempt to choose a reasonable
- // default. This should be in the form of the API service name,
- // e.g. "compute.googleapis.com".
- string cluster_manager_api_service = 2;
- // The list of experiments to enable. This field should be used for SDK
- // related experiments and not for service related experiments. The proper
- // field for service related experiments is service_options.
- repeated string experiments = 3;
- // The list of service options to enable. This field should be used for
- // service related experiments only. These experiments, when graduating to GA,
- // should be replaced by dedicated fields or become default (i.e. always on).
- repeated string service_options = 16;
- // If set, contains the Cloud KMS key identifier used to encrypt data
- // at rest, AKA a Customer Managed Encryption Key (CMEK).
- //
- // Format:
- // projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY
- string service_kms_key_name = 12;
- // The worker pools. At least one "harness" worker pool must be
- // specified in order for the job to have workers.
- repeated WorkerPool worker_pools = 4;
- // A description of the process that generated the request.
- google.protobuf.Struct user_agent = 5;
- // A structure describing which components of the service, and which versions
- // of those components, are required in order to run the job.
- google.protobuf.Struct version = 6;
- // The dataset for the current project where various workflow
- // related tables are stored.
- //
- // The supported resource type is:
- //
- // Google BigQuery:
- // bigquery.googleapis.com/{dataset}
- string dataset = 7;
- // The Cloud Dataflow SDK pipeline options specified by the user. These
- // options are passed through the service and are used to recreate the
- // SDK pipeline options on the worker in a language agnostic and platform
- // independent way.
- google.protobuf.Struct sdk_pipeline_options = 8;
- // Experimental settings.
- google.protobuf.Any internal_experiments = 9;
- // Identity to run virtual machines as. Defaults to the default account.
- string service_account_email = 10;
- // Which Flexible Resource Scheduling mode to run in.
- FlexResourceSchedulingGoal flex_resource_scheduling_goal = 11;
- // The Compute Engine region
- // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
- // which worker processing should occur, e.g. "us-west1". Mutually exclusive
- // with worker_zone. If neither worker_region nor worker_zone is specified,
- // the service defaults to the control plane's region.
- string worker_region = 13;
- // The Compute Engine zone
- // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
- // which worker processing should occur, e.g. "us-west1-a". Mutually exclusive
- // with worker_region. If neither worker_region nor worker_zone is specified,
- // a zone in the control plane's region is chosen based on available capacity.
- string worker_zone = 14;
- // Output only. The shuffle mode used for the job.
- ShuffleMode shuffle_mode = 15 [(google.api.field_behavior) = OUTPUT_ONLY];
- // Any debugging options to be supplied to the job.
- DebugOptions debug_options = 17;
- }
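As a point of reference, here is a minimal sketch of how a client might populate an Environment, assuming the generated Go bindings named in the go_package option above and the standard protoc-gen-go field mapping; every resource name below is a placeholder, and only one of worker_region/worker_zone is set because the two are mutually exclusive.

```go
package main

import (
	"fmt"

	dataflow "google.golang.org/genproto/googleapis/dataflow/v1beta3"
)

func main() {
	// All names are placeholders; the KMS key follows the format documented
	// for service_kms_key_name above.
	env := &dataflow.Environment{
		TempStoragePrefix:        "storage.googleapis.com/my-bucket/temp",
		ClusterManagerApiService: "compute.googleapis.com",
		ServiceKmsKeyName: "projects/my-project/locations/us-central1/" +
			"keyRings/my-ring/cryptoKeys/my-key",
		ServiceAccountEmail:        "my-worker-sa@my-project.iam.gserviceaccount.com",
		FlexResourceSchedulingGoal: dataflow.FlexResourceSchedulingGoal_FLEXRS_COST_OPTIMIZED,
		WorkerRegion:               "us-west1", // mutually exclusive with WorkerZone
	}
	fmt.Println(env)
}
```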
- // The packages that must be installed in order for a worker to run the
- // steps of the Cloud Dataflow job that will be assigned to its worker
- // pool.
- //
- // This is the mechanism by which the Cloud Dataflow SDK causes code to
- // be loaded onto the workers. For example, the Cloud Dataflow Java SDK
- // might use this to install jars containing the user's code and all of the
- // various dependencies (libraries, data files, etc.) required in order
- // for that code to run.
- message Package {
- // The name of the package.
- string name = 1;
- // The resource to read the package from. The supported resource type is:
- //
- // Google Cloud Storage:
- //
- // storage.googleapis.com/{bucket}
- // bucket.storage.googleapis.com/
- string location = 2;
- }
- // Specifies the processing model used by a
- // [google.dataflow.v1beta3.Job], which determines the way the Job is
- // managed by the Cloud Dataflow service (how workers are scheduled, how
- // inputs are sharded, etc).
- enum JobType {
- // The type of the job is unspecified, or unknown.
- JOB_TYPE_UNKNOWN = 0;
- // A batch job with a well-defined end point: data is read, data is
- // processed, data is written, and the job is done.
- JOB_TYPE_BATCH = 1;
- // A continuously streaming job with no end: data is read,
- // processed, and written continuously.
- JOB_TYPE_STREAMING = 2;
- }
- // Specifies the resource to optimize for in Flexible Resource Scheduling.
- enum FlexResourceSchedulingGoal {
- // Run in the default mode.
- FLEXRS_UNSPECIFIED = 0;
- // Optimize for lower execution time.
- FLEXRS_SPEED_OPTIMIZED = 1;
- // Optimize for lower cost.
- FLEXRS_COST_OPTIMIZED = 2;
- }
- // Describes the data disk used by a workflow job.
- message Disk {
- // Size of disk in GB. If zero or unspecified, the service will
- // attempt to choose a reasonable default.
- int32 size_gb = 1;
- // Disk storage type, as defined by Google Compute Engine. This
- // must be a disk type appropriate to the project and zone in which
- // the workers will run. If unknown or unspecified, the service
- // will attempt to choose a reasonable default.
- //
- // For example, the standard persistent disk type is a resource name
- // typically ending in "pd-standard". If SSD persistent disks are
- // available, the resource name typically ends with "pd-ssd". The
- // actual valid values are defined by the Google Compute Engine API,
- // not by the Cloud Dataflow API; consult the Google Compute Engine
- // documentation for more information about determining the set of
- // available disk types for a particular project and zone.
- //
- // Google Compute Engine Disk types are local to a particular
- // project in a particular zone, and so the resource name will
- // typically look something like this:
- //
- // compute.googleapis.com/projects/project-id/zones/zone/diskTypes/pd-standard
- string disk_type = 2;
- // Directory in a VM where disk is mounted.
- string mount_point = 3;
- }
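A hedged sketch of a data disk specification, again assuming the generated Go bindings; the project and zone are placeholders, and the disk_type value follows the resource-name form shown in the comment above.

```go
package main

import (
	"fmt"

	dataflow "google.golang.org/genproto/googleapis/dataflow/v1beta3"
)

func main() {
	// Placeholder project and zone; size and mount point are illustrative.
	disk := &dataflow.Disk{
		SizeGb:     250,
		DiskType:   "compute.googleapis.com/projects/my-project/zones/us-central1-a/diskTypes/pd-ssd",
		MountPoint: "/mnt/dataflow",
	}
	fmt.Println(disk)
}
```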
- // Provides data to pass through to the worker harness.
- message WorkerSettings {
- // The base URL for accessing Google Cloud APIs.
- //
- // When workers access Google Cloud APIs, they logically do so via
- // relative URLs. If this field is specified, it supplies the base
- // URL to use for resolving these relative URLs. The normative
- // algorithm used is defined by RFC 1808, "Relative Uniform Resource
- // Locators".
- //
- // If not specified, the default value is "http://www.googleapis.com/"
- string base_url = 1;
- // Whether to send work progress updates to the service.
- bool reporting_enabled = 2;
- // The Cloud Dataflow service path relative to the root URL, for example,
- // "dataflow/v1b3/projects".
- string service_path = 3;
- // The Shuffle service path relative to the root URL, for example,
- // "shuffle/v1beta1".
- string shuffle_service_path = 4;
- // The ID of the worker running this pipeline.
- string worker_id = 5;
- // The prefix of the resources the system should use for temporary
- // storage.
- //
- // The supported resource type is:
- //
- // Google Cloud Storage:
- //
- // storage.googleapis.com/{bucket}/{object}
- // bucket.storage.googleapis.com/{object}
- string temp_storage_prefix = 6;
- }
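The base_url/service_path relationship above is plain relative-URL resolution, so a small standard-library sketch can illustrate it; the values come from the comments above, and Go's net/url follows RFC 3986, which for a simple case like this matches the RFC 1808 behavior the comment describes.

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// Default base_url and example service_path taken from the field comments.
	base, err := url.Parse("http://www.googleapis.com/")
	if err != nil {
		panic(err)
	}
	ref, err := url.Parse("dataflow/v1b3/projects")
	if err != nil {
		panic(err)
	}
	// Prints http://www.googleapis.com/dataflow/v1b3/projects
	fmt.Println(base.ResolveReference(ref))
}
```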
- // Taskrunner configuration settings.
- message TaskRunnerSettings {
- // The UNIX user ID on the worker VM to use for tasks launched by
- // taskrunner; e.g. "root".
- string task_user = 1;
- // The UNIX group ID on the worker VM to use for tasks launched by
- // taskrunner; e.g. "wheel".
- string task_group = 2;
- // The OAuth2 scopes to be requested by the taskrunner in order to
- // access the Cloud Dataflow API.
- repeated string oauth_scopes = 3;
- // The base URL for the taskrunner to use when accessing Google Cloud APIs.
- //
- // When workers access Google Cloud APIs, they logically do so via
- // relative URLs. If this field is specified, it supplies the base
- // URL to use for resolving these relative URLs. The normative
- // algorithm used is defined by RFC 1808, "Relative Uniform Resource
- // Locators".
- //
- // If not specified, the default value is "http://www.googleapis.com/"
- string base_url = 4;
- // The API version of the endpoint, e.g. "v1b3".
- string dataflow_api_version = 5;
- // The settings to pass to the parallel worker harness.
- WorkerSettings parallel_worker_settings = 6;
- // The location on the worker for task-specific subdirectories.
- string base_task_dir = 7;
- // Whether to continue taskrunner if an exception is hit.
- bool continue_on_exception = 8;
- // Whether to send taskrunner log info to Google Compute Engine VM serial
- // console.
- bool log_to_serialconsole = 9;
- // Whether to also send taskrunner log info to stderr.
- bool alsologtostderr = 10;
- // Indicates where to put logs. If this is not specified, the logs
- // will not be uploaded.
- //
- // The supported resource type is:
- //
- // Google Cloud Storage:
- // storage.googleapis.com/{bucket}/{object}
- // bucket.storage.googleapis.com/{object}
- string log_upload_location = 11;
- // The directory on the VM to store logs.
- string log_dir = 12;
- // The prefix of the resources the taskrunner should use for
- // temporary storage.
- //
- // The supported resource type is:
- //
- // Google Cloud Storage:
- // storage.googleapis.com/{bucket}/{object}
- // bucket.storage.googleapis.com/{object}
- string temp_storage_prefix = 13;
- // The command to launch the worker harness.
- string harness_command = 14;
- // The file to store the workflow in.
- string workflow_file_name = 15;
- // The file to store preprocessing commands in.
- string commandlines_file_name = 16;
- // The ID string of the VM.
- string vm_id = 17;
- // The suggested backend language.
- string language_hint = 18;
- // The streaming worker main class name.
- string streaming_worker_main_class = 19;
- }
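A sketch of how TaskRunnerSettings and its nested WorkerSettings fit together, assuming the same generated Go bindings; the example values either echo the quoted examples in the field comments above ("root", "wheel", "v1b3", the default base URL) or are placeholders, and the OAuth scope is just an illustrative cloud-platform scope.

```go
package main

import (
	"fmt"

	dataflow "google.golang.org/genproto/googleapis/dataflow/v1beta3"
)

func main() {
	// Bucket paths are placeholders.
	settings := &dataflow.TaskRunnerSettings{
		TaskUser:           "root",
		TaskGroup:          "wheel",
		OauthScopes:        []string{"https://www.googleapis.com/auth/cloud-platform"},
		BaseUrl:            "http://www.googleapis.com/",
		DataflowApiVersion: "v1b3",
		LogUploadLocation:  "storage.googleapis.com/my-bucket/logs",
		ParallelWorkerSettings: &dataflow.WorkerSettings{
			ReportingEnabled:  true,
			ServicePath:       "dataflow/v1b3/projects",
			TempStoragePrefix: "storage.googleapis.com/my-bucket/temp",
		},
	}
	fmt.Println(settings)
}
```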
- // Specifies what happens to a resource when a Cloud Dataflow
- // [google.dataflow.v1beta3.Job][google.dataflow.v1beta3.Job] has completed.
- enum TeardownPolicy {
- // The teardown policy isn't specified, or is unknown.
- TEARDOWN_POLICY_UNKNOWN = 0;
- // Always teardown the resource.
- TEARDOWN_ALWAYS = 1;
- // Teardown the resource on success. This is useful for debugging
- // failures.
- TEARDOWN_ON_SUCCESS = 2;
- // Never teardown the resource. This is useful for debugging and
- // development.
- TEARDOWN_NEVER = 3;
- }
- // The default set of packages to be staged on a pool of workers.
- enum DefaultPackageSet {
- // The default set of packages to stage is unknown, or unspecified.
- DEFAULT_PACKAGE_SET_UNKNOWN = 0;
- // Indicates that no packages should be staged at the worker unless
- // explicitly specified by the job.
- DEFAULT_PACKAGE_SET_NONE = 1;
- // Stage packages typically useful to workers written in Java.
- DEFAULT_PACKAGE_SET_JAVA = 2;
- // Stage packages typically useful to workers written in Python.
- DEFAULT_PACKAGE_SET_PYTHON = 3;
- }
- // Specifies the algorithm used to determine the number of worker
- // processes to run at any given point in time, based on the amount of
- // data left to process, the number of workers, and how quickly
- // existing workers are processing data.
- enum AutoscalingAlgorithm {
- // The algorithm is unknown, or unspecified.
- AUTOSCALING_ALGORITHM_UNKNOWN = 0;
- // Disable autoscaling.
- AUTOSCALING_ALGORITHM_NONE = 1;
- // Increase worker count over time to reduce job execution time.
- AUTOSCALING_ALGORITHM_BASIC = 2;
- }
- // Settings for WorkerPool autoscaling.
- message AutoscalingSettings {
- // The algorithm to use for autoscaling.
- AutoscalingAlgorithm algorithm = 1;
- // The maximum number of workers to cap scaling at.
- int32 max_num_workers = 2;
- }
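For completeness, a minimal autoscaling configuration under the same assumptions about the Go bindings; the worker cap is an illustrative number.

```go
package main

import (
	"fmt"

	dataflow "google.golang.org/genproto/googleapis/dataflow/v1beta3"
)

func main() {
	// Basic autoscaling capped at 50 workers.
	autoscaling := &dataflow.AutoscalingSettings{
		Algorithm:     dataflow.AutoscalingAlgorithm_AUTOSCALING_ALGORITHM_BASIC,
		MaxNumWorkers: 50,
	}
	fmt.Println(autoscaling)
}
```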
- // Specifies how IP addresses should be allocated to the worker machines.
- enum WorkerIPAddressConfiguration {
- // The configuration is unknown, or unspecified.
- WORKER_IP_UNSPECIFIED = 0;
- // Workers should have public IP addresses.
- WORKER_IP_PUBLIC = 1;
- // Workers should have private IP addresses.
- WORKER_IP_PRIVATE = 2;
- }
- // Defines an SDK harness container for executing Dataflow pipelines.
- message SdkHarnessContainerImage {
- // A docker container image that resides in Google Container Registry.
- string container_image = 1;
- // If true, recommends that the Dataflow service use only one core per SDK
- // container instance with this image. If false (or unset), recommends using
- // more than one core per SDK container instance with this image for
- // efficiency. Note that the Dataflow service may choose to override this
- // property if needed.
- bool use_single_core_per_container = 2;
- // Environment ID for the Beam runner API proto Environment that corresponds
- // to the current SDK Harness.
- string environment_id = 3;
- // The set of capabilities enumerated in the above Environment proto. See also
- // https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/beam_runner_api.proto
- repeated string capabilities = 4;
- }
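A sketch of the two-or-more entries a cross-language pipeline would carry, assuming the generated Go bindings; the image names and environment IDs are placeholders.

```go
package main

import (
	"fmt"

	dataflow "google.golang.org/genproto/googleapis/dataflow/v1beta3"
)

func main() {
	// One entry per SDK harness; the second opts into one core per container.
	images := []*dataflow.SdkHarnessContainerImage{
		{
			ContainerImage: "gcr.io/my-project/beam-python-harness:latest",
			EnvironmentId:  "python_environment",
		},
		{
			ContainerImage:            "gcr.io/my-project/beam-java-harness:latest",
			EnvironmentId:             "java_environment",
			UseSingleCorePerContainer: true,
		},
	}
	fmt.Println(images)
}
```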
- // Describes one particular pool of Cloud Dataflow workers to be
- // instantiated by the Cloud Dataflow service in order to perform the
- // computations required by a job. Note that a workflow job may use
- // multiple pools, in order to match the computational requirements
- // of the job's various stages.
- message WorkerPool {
- // The kind of the worker pool; currently only `harness` and `shuffle`
- // are supported.
- string kind = 1;
- // Number of Google Compute Engine workers in this pool needed to
- // execute the job. If zero or unspecified, the service will
- // attempt to choose a reasonable default.
- int32 num_workers = 2;
- // Packages to be installed on workers.
- repeated Package packages = 3;
- // The default package set to install. This allows the service to
- // select a default set of packages which are useful to worker
- // harnesses written in a particular language.
- DefaultPackageSet default_package_set = 4;
- // Machine type (e.g. "n1-standard-1"). If empty or unspecified, the
- // service will attempt to choose a reasonable default.
- string machine_type = 5;
- // Sets the policy for determining when to turn down the worker pool.
- // Allowed values are: `TEARDOWN_ALWAYS`, `TEARDOWN_ON_SUCCESS`, and
- // `TEARDOWN_NEVER`.
- // `TEARDOWN_ALWAYS` means workers are always torn down regardless of whether
- // the job succeeds. `TEARDOWN_ON_SUCCESS` means workers are torn down
- // if the job succeeds. `TEARDOWN_NEVER` means the workers are never torn
- // down.
- //
- // If the workers are not torn down by the service, they will
- // continue to run and use Google Compute Engine VM resources in the
- // user's project until they are explicitly terminated by the user.
- // Because of this, Google recommends using the `TEARDOWN_ALWAYS`
- // policy except for small, manually supervised test jobs.
- //
- // If unknown or unspecified, the service will attempt to choose a reasonable
- // default.
- TeardownPolicy teardown_policy = 6;
- // Size of root disk for VMs, in GB. If zero or unspecified, the service will
- // attempt to choose a reasonable default.
- int32 disk_size_gb = 7;
- // Type of root disk for VMs. If empty or unspecified, the service will
- // attempt to choose a reasonable default.
- string disk_type = 16;
- // Fully qualified source image for disks.
- string disk_source_image = 8;
- // Zone to run the worker pools in. If empty or unspecified, the service
- // will attempt to choose a reasonable default.
- string zone = 9;
- // Settings passed through to Google Compute Engine workers when
- // using the standard Dataflow task runner. Users should ignore
- // this field.
- TaskRunnerSettings taskrunner_settings = 10;
- // The action to take on host maintenance, as defined by the Google
- // Compute Engine API.
- string on_host_maintenance = 11;
- // Data disks that are used by a VM in this workflow.
- repeated Disk data_disks = 12;
- // Metadata to set on the Google Compute Engine VMs.
- map<string, string> metadata = 13;
- // Settings for autoscaling of this WorkerPool.
- AutoscalingSettings autoscaling_settings = 14;
- // Extra arguments for this worker pool.
- google.protobuf.Any pool_args = 15;
- // Network to which VMs will be assigned. If empty or unspecified,
- // the service will use the network "default".
- string network = 17;
- // Subnetwork to which VMs will be assigned, if desired. Expected to be of
- // the form "regions/REGION/subnetworks/SUBNETWORK".
- string subnetwork = 19;
- // Required. Docker container image that executes the Cloud Dataflow worker
- // harness, residing in Google Container Registry.
- //
- // Deprecated for the Fn API path. Use sdk_harness_container_images instead.
- string worker_harness_container_image = 18;
- // The number of threads per worker harness. If empty or unspecified, the
- // service will choose a number of threads (according to the number of cores
- // on the selected machine type for batch, or 1 by convention for streaming).
- int32 num_threads_per_worker = 20;
- // Configuration for VM IPs.
- WorkerIPAddressConfiguration ip_configuration = 21;
- // Set of SDK harness containers needed to execute this pipeline. This will
- // only be set in the Fn API path. For non-cross-language pipelines this
- // should have only one entry. Cross-language pipelines will have two or more
- // entries.
- repeated SdkHarnessContainerImage sdk_harness_container_images = 22;
- }
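Tying several of the pieces above together, here is a hedged sketch of a single "harness" pool under the same Go-bindings assumption; the machine type, counts, paths, subnetwork, and package location are placeholders.

```go
package main

import (
	"fmt"

	dataflow "google.golang.org/genproto/googleapis/dataflow/v1beta3"
)

func main() {
	// One harness pool with private IPs, basic autoscaling, and an extra
	// data disk; all concrete values are illustrative.
	pool := &dataflow.WorkerPool{
		Kind:              "harness",
		NumWorkers:        5,
		MachineType:       "n1-standard-1",
		DefaultPackageSet: dataflow.DefaultPackageSet_DEFAULT_PACKAGE_SET_JAVA,
		Packages: []*dataflow.Package{
			{Name: "my-pipeline.jar", Location: "storage.googleapis.com/my-bucket"},
		},
		TeardownPolicy:  dataflow.TeardownPolicy_TEARDOWN_ALWAYS,
		DiskSizeGb:      100,
		Zone:            "us-central1-a",
		Network:         "default",
		Subnetwork:      "regions/us-central1/subnetworks/my-subnet",
		IpConfiguration: dataflow.WorkerIPAddressConfiguration_WORKER_IP_PRIVATE,
		AutoscalingSettings: &dataflow.AutoscalingSettings{
			Algorithm:     dataflow.AutoscalingAlgorithm_AUTOSCALING_ALGORITHM_BASIC,
			MaxNumWorkers: 20,
		},
		DataDisks: []*dataflow.Disk{
			{SizeGb: 250, MountPoint: "/mnt/dataflow"},
		},
	}
	fmt.Println(pool)
}
```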
- // Specifies the shuffle mode used by a
- // [google.dataflow.v1beta3.Job], which determines how data is shuffled
- // during processing. More details at:
- // https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-shuffle
- enum ShuffleMode {
- // Shuffle mode information is not available.
- SHUFFLE_MODE_UNSPECIFIED = 0;
- // Shuffle is done on the worker VMs.
- VM_BASED = 1;
- // Shuffle is done on the service side.
- SERVICE_BASED = 2;
- }
- // Describes any options that have an effect on the debugging of pipelines.
- message DebugOptions {
- // When true, enables the logging of the literal hot key to the user's Cloud
- // Logging.
- bool enable_hot_key_logging = 1;
- }
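Finally, a small sketch of the debug options and of reading the output-only shuffle mode, again assuming the generated Go bindings and their standard getter methods.

```go
package main

import (
	"fmt"

	dataflow "google.golang.org/genproto/googleapis/dataflow/v1beta3"
)

func main() {
	env := &dataflow.Environment{
		DebugOptions: &dataflow.DebugOptions{EnableHotKeyLogging: true},
	}
	// shuffle_mode is output only: it is read back from an Environment
	// returned by the service rather than set by the caller.
	fmt.Println(env.GetDebugOptions().GetEnableHotKeyLogging())
	fmt.Println(env.GetShuffleMode()) // SHUFFLE_MODE_UNSPECIFIED here
}
```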