// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  14. syntax = "proto3";
  15. package google.cloud.aiplatform.v1beta1;
  16. import "google/api/field_behavior.proto";
  17. import "google/api/resource.proto";
  18. import "google/cloud/aiplatform/v1beta1/encryption_spec.proto";
  19. import "google/cloud/aiplatform/v1beta1/io.proto";
  20. import "google/cloud/aiplatform/v1beta1/model.proto";
  21. import "google/cloud/aiplatform/v1beta1/pipeline_state.proto";
  22. import "google/protobuf/struct.proto";
  23. import "google/protobuf/timestamp.proto";
  24. import "google/rpc/status.proto";
  25. option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
  26. option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1beta1;aiplatform";
  27. option java_multiple_files = true;
  28. option java_outer_classname = "TrainingPipelineProto";
  29. option java_package = "com.google.cloud.aiplatform.v1beta1";
  30. option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
  31. option ruby_package = "Google::Cloud::AIPlatform::V1beta1";
  32. // The TrainingPipeline orchestrates tasks associated with training a Model. It
  33. // always executes the training task, and optionally may also
  34. // export data from Vertex AI's Dataset which becomes the training input,
  35. // [upload][google.cloud.aiplatform.v1beta1.ModelService.UploadModel] the Model to Vertex AI, and evaluate the
  36. // Model.
  37. message TrainingPipeline {
  38. option (google.api.resource) = {
  39. type: "aiplatform.googleapis.com/TrainingPipeline"
  40. pattern: "projects/{project}/locations/{location}/trainingPipelines/{training_pipeline}"
  41. };
  42. // Output only. Resource name of the TrainingPipeline.
  43. string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
  44. // Required. The user-defined name of this TrainingPipeline.
  45. string display_name = 2 [(google.api.field_behavior) = REQUIRED];
  46. // Specifies Vertex AI owned input data that may be used for training the
  47. // Model. The TrainingPipeline's [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition] should make
  48. // clear whether this config is used and if there are any special requirements
  49. // on how it should be filled. If nothing about this config is mentioned in
  50. // the [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition], then it should be assumed that the
  51. // TrainingPipeline does not depend on this configuration.
  52. InputDataConfig input_data_config = 3;
  53. // Required. A Google Cloud Storage path to the YAML file that defines the training task
  54. // which is responsible for producing the model artifact, and may also include
  55. // additional auxiliary work.
  56. // The definition files that can be used here are found in
  57. // gs://google-cloud-aiplatform/schema/trainingjob/definition/.
  58. // Note: The URI given on output will be immutable and probably different,
  59. // including the URI scheme, than the one given on input. The output URI will
  60. // point to a location where the user only has a read access.
  61. string training_task_definition = 4 [(google.api.field_behavior) = REQUIRED];
  62. // Required. The training task's parameter(s), as specified in the
  63. // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]'s `inputs`.
  64. google.protobuf.Value training_task_inputs = 5 [(google.api.field_behavior) = REQUIRED];
  65. // Output only. The metadata information as specified in the [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]'s
  66. // `metadata`. This metadata is an auxiliary runtime and final information
  67. // about the training task. While the pipeline is running this information is
  68. // populated only at a best effort basis. Only present if the
  69. // pipeline's [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition] contains `metadata` object.
  70. google.protobuf.Value training_task_metadata = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
  71. // Describes the Model that may be uploaded (via [ModelService.UploadModel][google.cloud.aiplatform.v1beta1.ModelService.UploadModel])
  72. // by this TrainingPipeline. The TrainingPipeline's
  73. // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition] should make clear whether this Model
  74. // description should be populated, and if there are any special requirements
  75. // regarding how it should be filled. If nothing is mentioned in the
  76. // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition], then it should be assumed that this field
  77. // should not be filled and the training task either uploads the Model without
  78. // a need of this information, or that training task does not support
  79. // uploading a Model as part of the pipeline.
  80. // When the Pipeline's state becomes `PIPELINE_STATE_SUCCEEDED` and
  81. // the trained Model had been uploaded into Vertex AI, then the
  82. // model_to_upload's resource [name][google.cloud.aiplatform.v1beta1.Model.name] is populated. The Model
  83. // is always uploaded into the Project and Location in which this pipeline
  84. // is.
  85. Model model_to_upload = 7;
  86. // Optional. The ID to use for the uploaded Model, which will become the final
  87. // component of the model resource name.
  88. //
  89. // This value may be up to 63 characters, and valid characters are
  90. // `[a-z0-9_-]`. The first character cannot be a number or hyphen.
  91. string model_id = 22 [(google.api.field_behavior) = OPTIONAL];
  92. // Optional. When specify this field, the `model_to_upload` will not be uploaded as a
  93. // new model, instead, it will become a new version of this `parent_model`.
  94. string parent_model = 21 [(google.api.field_behavior) = OPTIONAL];
  95. // Output only. The detailed state of the pipeline.
  96. PipelineState state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];
  97. // Output only. Only populated when the pipeline's state is `PIPELINE_STATE_FAILED` or
  98. // `PIPELINE_STATE_CANCELLED`.
  99. google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];
  100. // Output only. Time when the TrainingPipeline was created.
  101. google.protobuf.Timestamp create_time = 11 [(google.api.field_behavior) = OUTPUT_ONLY];
  102. // Output only. Time when the TrainingPipeline for the first time entered the
  103. // `PIPELINE_STATE_RUNNING` state.
  104. google.protobuf.Timestamp start_time = 12 [(google.api.field_behavior) = OUTPUT_ONLY];
  105. // Output only. Time when the TrainingPipeline entered any of the following states:
  106. // `PIPELINE_STATE_SUCCEEDED`, `PIPELINE_STATE_FAILED`,
  107. // `PIPELINE_STATE_CANCELLED`.
  108. google.protobuf.Timestamp end_time = 13 [(google.api.field_behavior) = OUTPUT_ONLY];
  109. // Output only. Time when the TrainingPipeline was most recently updated.
  110. google.protobuf.Timestamp update_time = 14 [(google.api.field_behavior) = OUTPUT_ONLY];
  111. // The labels with user-defined metadata to organize TrainingPipelines.
  112. //
  113. // Label keys and values can be no longer than 64 characters
  114. // (Unicode codepoints), can only contain lowercase letters, numeric
  115. // characters, underscores and dashes. International characters are allowed.
  116. //
  117. // See https://goo.gl/xmQnxf for more information and examples of labels.
  118. map<string, string> labels = 15;
  119. // Customer-managed encryption key spec for a TrainingPipeline. If set, this
  120. // TrainingPipeline will be secured by this key.
  121. //
  122. // Note: Model trained by this TrainingPipeline is also secured by this key if
  123. // [model_to_upload][google.cloud.aiplatform.v1beta1.TrainingPipeline.encryption_spec] is not set separately.
  124. EncryptionSpec encryption_spec = 18;
  125. }
  126. // Specifies Vertex AI owned input data to be used for training, and
  127. // possibly evaluating, the Model.
  128. message InputDataConfig {
  129. // The instructions how the input data should be split between the
  130. // training, validation and test sets.
  131. // If no split type is provided, the [fraction_split][google.cloud.aiplatform.v1beta1.InputDataConfig.fraction_split] is used by default.
  132. oneof split {
  133. // Split based on fractions defining the size of each set.
  134. FractionSplit fraction_split = 2;
  135. // Split based on the provided filters for each set.
  136. FilterSplit filter_split = 3;
  137. // Supported only for tabular Datasets.
  138. //
  139. // Split based on a predefined key.
  140. PredefinedSplit predefined_split = 4;
  141. // Supported only for tabular Datasets.
  142. //
  143. // Split based on the timestamp of the input data pieces.
  144. TimestampSplit timestamp_split = 5;
  145. // Supported only for tabular Datasets.
  146. //
  147. // Split based on the distribution of the specified column.
  148. StratifiedSplit stratified_split = 12;
  149. }
  150. // Only applicable to Custom and Hyperparameter Tuning TrainingPipelines.
  151. //
  152. // The destination of the training data to be written to.
  153. //
  154. // Supported destination file formats:
  155. // * For non-tabular data: "jsonl".
  156. // * For tabular data: "csv" and "bigquery".
  157. //
  158. // The following Vertex AI environment variables are passed to containers
  159. // or python modules of the training task when this field is set:
  160. //
  161. // * AIP_DATA_FORMAT : Exported data format.
  162. // * AIP_TRAINING_DATA_URI : Sharded exported training data uris.
  163. // * AIP_VALIDATION_DATA_URI : Sharded exported validation data uris.
  164. // * AIP_TEST_DATA_URI : Sharded exported test data uris.
  165. oneof destination {
  166. // The Cloud Storage location where the training data is to be
  167. // written to. In the given directory a new directory is created with
  168. // name:
  169. // `dataset-<dataset-id>-<annotation-type>-<timestamp-of-training-call>`
  170. // where timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
  171. // All training input data is written into that directory.
  172. //
  173. // The Vertex AI environment variables representing Cloud Storage
  174. // data URIs are represented in the Cloud Storage wildcard
  175. // format to support sharded data. e.g.: "gs://.../training-*.jsonl"
  176. //
  177. // * AIP_DATA_FORMAT = "jsonl" for non-tabular data, "csv" for tabular data
  178. // * AIP_TRAINING_DATA_URI =
  179. // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/training-*.${AIP_DATA_FORMAT}"
  180. //
  181. // * AIP_VALIDATION_DATA_URI =
  182. // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/validation-*.${AIP_DATA_FORMAT}"
  183. //
  184. // * AIP_TEST_DATA_URI =
  185. // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/test-*.${AIP_DATA_FORMAT}"
  186. GcsDestination gcs_destination = 8;
  187. // Only applicable to custom training with tabular Dataset with BigQuery
  188. // source.
  189. //
  190. // The BigQuery project location where the training data is to be written
  191. // to. In the given project a new dataset is created with name
  192. // `dataset_<dataset-id>_<annotation-type>_<timestamp-of-training-call>`
  193. // where timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All training
  194. // input data is written into that dataset. In the dataset three
  195. // tables are created, `training`, `validation` and `test`.
  196. //
  197. // * AIP_DATA_FORMAT = "bigquery".
  198. // * AIP_TRAINING_DATA_URI =
  199. // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.training"
  200. //
  201. // * AIP_VALIDATION_DATA_URI =
  202. // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.validation"
  203. //
  204. // * AIP_TEST_DATA_URI =
  205. // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.test"
  206. BigQueryDestination bigquery_destination = 10;
  207. }
  208. // Required. The ID of the Dataset in the same Project and Location which data will be
  209. // used to train the Model. The Dataset must use schema compatible with
  210. // Model being trained, and what is compatible should be described in the
  211. // used TrainingPipeline's [training_task_definition]
  212. // [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition].
  213. // For tabular Datasets, all their data is exported to training, to pick
  214. // and choose from.
  215. string dataset_id = 1 [(google.api.field_behavior) = REQUIRED];
  216. // Applicable only to Datasets that have DataItems and Annotations.
  217. //
  218. // A filter on Annotations of the Dataset. Only Annotations that both
  219. // match this filter and belong to DataItems not ignored by the split method
  220. // are used in respectively training, validation or test role, depending on
  221. // the role of the DataItem they are on (for the auto-assigned that role is
  222. // decided by Vertex AI). A filter with same syntax as the one used in
  223. // [ListAnnotations][google.cloud.aiplatform.v1beta1.DatasetService.ListAnnotations] may be used, but note
  224. // here it filters across all Annotations of the Dataset, and not just within
  225. // a single DataItem.
  226. string annotations_filter = 6;
  227. // Applicable only to custom training with Datasets that have DataItems and
  228. // Annotations.
  229. //
  230. // Cloud Storage URI that points to a YAML file describing the annotation
  231. // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  232. // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  233. // The schema files that can be used here are found in
  234. // gs://google-cloud-aiplatform/schema/dataset/annotation/ , note that the
  235. // chosen schema must be consistent with
  236. // [metadata][google.cloud.aiplatform.v1beta1.Dataset.metadata_schema_uri] of the Dataset specified by
  237. // [dataset_id][google.cloud.aiplatform.v1beta1.InputDataConfig.dataset_id].
  238. //
  239. // Only Annotations that both match this schema and belong to DataItems not
  240. // ignored by the split method are used in respectively training, validation
  241. // or test role, depending on the role of the DataItem they are on.
  242. //
  243. // When used in conjunction with [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter], the Annotations used
  244. // for training are filtered by both [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter] and
  245. // [annotation_schema_uri][google.cloud.aiplatform.v1beta1.InputDataConfig.annotation_schema_uri].
  246. string annotation_schema_uri = 9;
  247. // Only applicable to Datasets that have SavedQueries.
  248. //
  249. // The ID of a SavedQuery (annotation set) under the Dataset specified by
  250. // [dataset_id][google.cloud.aiplatform.v1beta1.InputDataConfig.dataset_id] used for filtering Annotations for training.
  251. //
  252. // Only Annotations that are associated with this SavedQuery are used in
  253. // respectively training. When used in conjunction with
  254. // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter], the Annotations used for training are filtered by
  255. // both [saved_query_id][google.cloud.aiplatform.v1beta1.InputDataConfig.saved_query_id] and [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter].
  256. //
  257. // Only one of [saved_query_id][google.cloud.aiplatform.v1beta1.InputDataConfig.saved_query_id] and [annotation_schema_uri][google.cloud.aiplatform.v1beta1.InputDataConfig.annotation_schema_uri] should be
  258. // specified as both of them represent the same thing: problem type.
  259. string saved_query_id = 7;
  260. // Whether to persist the ML use assignment to data item system labels.
  261. bool persist_ml_use_assignment = 11;
  262. }
  263. // Assigns the input data to training, validation, and test sets as per the
  264. // given fractions. Any of `training_fraction`, `validation_fraction` and
  265. // `test_fraction` may optionally be provided, they must sum to up to 1. If the
  266. // provided ones sum to less than 1, the remainder is assigned to sets as
  267. // decided by Vertex AI. If none of the fractions are set, by default roughly
  268. // 80% of data is used for training, 10% for validation, and 10% for test.
  269. message FractionSplit {
  270. // The fraction of the input data that is to be used to train the Model.
  271. double training_fraction = 1;
  272. // The fraction of the input data that is to be used to validate the Model.
  273. double validation_fraction = 2;
  274. // The fraction of the input data that is to be used to evaluate the Model.
  275. double test_fraction = 3;
  276. }
  277. // Assigns input data to training, validation, and test sets based on the given
  278. // filters, data pieces not matched by any filter are ignored. Currently only
  279. // supported for Datasets containing DataItems.
  280. // If any of the filters in this message are to match nothing, then they can be
  281. // set as '-' (the minus sign).
  282. //
  283. // Supported only for unstructured Datasets.
  284. //
  285. message FilterSplit {
  286. // Required. A filter on DataItems of the Dataset. DataItems that match
  287. // this filter are used to train the Model. A filter with same syntax
  288. // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems] may be used. If a
  289. // single DataItem is matched by more than one of the FilterSplit filters,
  290. // then it is assigned to the first set that applies to it in the
  291. // training, validation, test order.
  292. string training_filter = 1 [(google.api.field_behavior) = REQUIRED];
  293. // Required. A filter on DataItems of the Dataset. DataItems that match
  294. // this filter are used to validate the Model. A filter with same syntax
  295. // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems] may be used. If a
  296. // single DataItem is matched by more than one of the FilterSplit filters,
  297. // then it is assigned to the first set that applies to it in the
  298. // training, validation, test order.
  299. string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];
  300. // Required. A filter on DataItems of the Dataset. DataItems that match
  301. // this filter are used to test the Model. A filter with same syntax
  302. // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems] may be used. If a
  303. // single DataItem is matched by more than one of the FilterSplit filters,
  304. // then it is assigned to the first set that applies to it in the
  305. // training, validation, test order.
  306. string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
  307. }
  308. // Assigns input data to training, validation, and test sets based on the
  309. // value of a provided key.
  310. //
  311. // Supported only for tabular Datasets.
  312. message PredefinedSplit {
  313. // Required. The key is a name of one of the Dataset's data columns.
  314. // The value of the key (either the label's value or value in the column)
  315. // must be one of {`training`, `validation`, `test`}, and it defines to which
  316. // set the given piece of data is assigned. If for a piece of data the key
  317. // is not present or has an invalid value, that piece is ignored by the
  318. // pipeline.
  319. string key = 1 [(google.api.field_behavior) = REQUIRED];
  320. }
  321. // Assigns input data to training, validation, and test sets based on a
  322. // provided timestamps. The youngest data pieces are assigned to training set,
  323. // next to validation set, and the oldest to the test set.
  324. //
  325. // Supported only for tabular Datasets.
  326. message TimestampSplit {
  327. // The fraction of the input data that is to be used to train the Model.
  328. double training_fraction = 1;
  329. // The fraction of the input data that is to be used to validate the Model.
  330. double validation_fraction = 2;
  331. // The fraction of the input data that is to be used to evaluate the Model.
  332. double test_fraction = 3;
  333. // Required. The key is a name of one of the Dataset's data columns.
  334. // The values of the key (the values in the column) must be in RFC 3339
  335. // `date-time` format, where `time-offset` = `"Z"`
  336. // (e.g. 1985-04-12T23:20:50.52Z). If for a piece of data the key is not
  337. // present or has an invalid value, that piece is ignored by the pipeline.
  338. string key = 4 [(google.api.field_behavior) = REQUIRED];
  339. }
  340. // Assigns input data to the training, validation, and test sets so that the
  341. // distribution of values found in the categorical column (as specified by the
  342. // `key` field) is mirrored within each split. The fraction values determine
  343. // the relative sizes of the splits.
  344. //
  345. // For example, if the specified column has three values, with 50% of the rows
  346. // having value "A", 25% value "B", and 25% value "C", and the split fractions
  347. // are specified as 80/10/10, then the training set will constitute 80% of the
  348. // training data, with about 50% of the training set rows having the value "A"
  349. // for the specified column, about 25% having the value "B", and about 25%
  350. // having the value "C".
  351. //
  352. // Only the top 500 occurring values are used; any values not in the top
  353. // 500 values are randomly assigned to a split. If less than three rows contain
  354. // a specific value, those rows are randomly assigned.
  355. //
  356. // Supported only for tabular Datasets.
  357. message StratifiedSplit {
  358. // The fraction of the input data that is to be used to train the Model.
  359. double training_fraction = 1;
  360. // The fraction of the input data that is to be used to validate the Model.
  361. double validation_fraction = 2;
  362. // The fraction of the input data that is to be used to evaluate the Model.
  363. double test_fraction = 3;
  364. // Required. The key is a name of one of the Dataset's data columns.
  365. // The key provided must be for a categorical column.
  366. string key = 4 [(google.api.field_behavior) = REQUIRED];
  367. }