// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1.schema.trainingjob.definition;

// Expected to declare ExportEvaluatedDataItemsConfig, which
// AutoMlTablesInputs references -- TODO confirm against the imported file.
import "google/cloud/aiplatform/v1/schema/trainingjob/definition/export_evaluated_data_items_config.proto";

// Per-language code generation settings.
option csharp_namespace = "Google.Cloud.AIPlatform.V1.Schema.TrainingJob.Definition";
option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1/schema/trainingjob/definition;definition";
option java_multiple_files = true;
option java_outer_classname = "AutoMLTablesProto";
option java_package = "com.google.cloud.aiplatform.v1.schema.trainingjob.definition";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1\\Schema\\TrainingJob\\Definition";
option ruby_package = "Google::Cloud::AIPlatform::V1::Schema::TrainingJob::Definition";

// Definition of a TrainingJob that trains an AutoML Tables Model and uploads
// it.
message AutoMlTables {
  // Input parameters for this TrainingJob.
  AutoMlTablesInputs inputs = 1;

  // Metadata information for this TrainingJob.
  AutoMlTablesMetadata metadata = 2;
}

// The input parameters of an AutoML Tables TrainingJob.
message AutoMlTablesInputs {
  // Describes the transformation applied to one input column. The concrete
  // kind of transformation is chosen through `transformation_detail`.
  message Transformation {
    // The training pipeline infers the appropriate transformation from the
    // statistics of the dataset.
    message AutoTransformation {
      // Name of the input column the transformation is applied to.
      string column_name = 1;
    }

    // The training pipeline performs the following transformation functions:
    // * The value converted to float32.
    // * The z_score of the value.
    // * log(value+1) when the value is greater than or equal to 0.
    //   Otherwise, this transformation is not applied and the value is
    //   considered a missing value.
    // * z_score of log(value+1) when the value is greater than or equal to
    //   0. Otherwise, this transformation is not applied and the value is
    //   considered a missing value.
    // * A boolean value that indicates whether the value is valid.
    message NumericTransformation {
      // Name of the input column the transformation is applied to.
      string column_name = 1;

      // When invalid values are allowed, the training pipeline creates a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline discards the input row from the
      // training data.
      bool invalid_values_allowed = 2;
    }

    // The training pipeline performs the following transformation functions:
    // * The categorical string as is -- no change to case, punctuation,
    //   spelling, tense, and so on.
    // * Convert the category name to a dictionary lookup index and generate
    //   an embedding for each index.
    // * Categories that appear less than 5 times in the training dataset
    //   are treated as the "unknown" category. The "unknown" category gets
    //   its own special lookup index and resulting embedding.
    message CategoricalTransformation {
      // Name of the input column the transformation is applied to.
      string column_name = 1;
    }

    // The training pipeline performs the following transformation functions:
    // * Apply the transformation functions for Numerical columns.
    // * Determine the year, month, day, and weekday. Treat each value from
    //   the timestamp as a Categorical column.
    // * Invalid numerical values (for example, values that fall outside of
    //   a typical timestamp range, or are extreme values) receive no
    //   special treatment and are not removed.
    message TimestampTransformation {
      // Name of the input column the transformation is applied to.
      string column_name = 1;

      // The format in which the time field is expressed. The time_format
      // must either be one of:
      // * `unix-seconds`
      // * `unix-milliseconds`
      // * `unix-microseconds`
      // * `unix-nanoseconds`
      // (for respectively number of seconds, milliseconds, microseconds and
      // nanoseconds since start of the Unix epoch);
      // or be written in `strftime` syntax. If time_format is not set, then
      // the default format is RFC 3339 `date-time` format, where
      // `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z)
      string time_format = 2;

      // When invalid values are allowed, the training pipeline creates a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline discards the input row from the
      // training data.
      bool invalid_values_allowed = 3;
    }

    // The training pipeline performs the following transformation functions:
    // * The text as is -- no change to case, punctuation, spelling, tense,
    //   and so on.
    // * Tokenize text to words. Convert each word to a dictionary lookup
    //   index and generate an embedding for each index. Combine the
    //   embedding of all elements into a single embedding using the mean.
    // * Tokenization is based on unicode script boundaries.
    // * Missing values get their own lookup index and resulting embedding.
    // * Stop-words receive no special treatment and are not removed.
    message TextTransformation {
      // Name of the input column the transformation is applied to.
      string column_name = 1;
    }

    // Treats the column as a numerical array and performs the following
    // transformation functions:
    // * All transformations for Numerical types applied to the average of
    //   all the elements.
    // * The average of empty arrays is treated as zero.
    message NumericArrayTransformation {
      // Name of the input column the transformation is applied to.
      string column_name = 1;

      // When invalid values are allowed, the training pipeline creates a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline discards the input row from the
      // training data.
      bool invalid_values_allowed = 2;
    }

    // Treats the column as a categorical array and performs the following
    // transformation functions:
    // * For each element in the array, convert the category name to a
    //   dictionary lookup index and generate an embedding for each index.
    //   Combine the embedding of all elements into a single embedding using
    //   the mean.
    // * Empty arrays are treated as an embedding of zeroes.
    message CategoricalArrayTransformation {
      // Name of the input column the transformation is applied to.
      string column_name = 1;
    }

    // Treats the column as a text array and performs the following
    // transformation functions:
    // * Concatenate all text values in the array into a single text value
    //   using a space (" ") as a delimiter, and then treat the result as a
    //   single text value. Apply the transformations for Text columns.
    // * Empty arrays are treated as an empty text.
    message TextArrayTransformation {
      // Name of the input column the transformation is applied to.
      string column_name = 1;
    }

    // The transformation that the training pipeline will apply to the input
    // columns.
    oneof transformation_detail {
      AutoTransformation auto = 1;

      NumericTransformation numeric = 2;

      CategoricalTransformation categorical = 3;

      TimestampTransformation timestamp = 4;

      TextTransformation text = 5;

      NumericArrayTransformation repeated_numeric = 6;

      CategoricalArrayTransformation repeated_categorical = 7;

      TextArrayTransformation repeated_text = 8;
    }
  }

  // Additional optimization objective configuration. Required for
  // `maximize-precision-at-recall` and `maximize-recall-at-precision`,
  // otherwise unused.
  oneof additional_optimization_objective_config {
    // Required when optimization_objective is
    // "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    float optimization_objective_recall_value = 5;

    // Required when optimization_objective is
    // "maximize-recall-at-precision". Must be between 0 and 1, inclusive.
    float optimization_objective_precision_value = 6;
  }

  // The type of prediction the Model is to produce.
  //   "classification" - One out of multiple target values is picked for
  //                      each row.
  //   "regression" - Predict a value based on its relation to other values.
  //                  This type is available only to columns that contain
  //                  semantically numeric values, i.e. integers or floating
  //                  point numbers, even if stored as e.g. strings.
  string prediction_type = 1;

  // The column name of the target column that the model is to predict.
  string target_column = 2;

  // Each transformation applies a transform function to the given input
  // column, and the result is used for training.
  // When creating a transformation for a BigQuery Struct column, the column
  // should be flattened using "." as the delimiter.
  repeated Transformation transformations = 3;

  // Objective function the model is optimizing towards. The training process
  // creates a model that maximizes/minimizes the value of the objective
  // function over the validation set.
  //
  // The supported optimization objectives depend on the prediction type.
  // If the field is not set, a default objective function is used.
  //
  // classification (binary):
  //   "maximize-au-roc" (default) - Maximize the area under the receiver
  //                                 operating characteristic (ROC) curve.
  //   "minimize-log-loss" - Minimize log loss.
  //   "maximize-au-prc" - Maximize the area under the precision-recall
  //                       curve.
  //   "maximize-precision-at-recall" - Maximize precision for a specified
  //                                    recall value.
  //   "maximize-recall-at-precision" - Maximize recall for a specified
  //                                    precision value.
  //
  // classification (multi-class):
  //   "minimize-log-loss" (default) - Minimize log loss.
  //
  // regression:
  //   "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
  //   "minimize-mae" - Minimize mean-absolute error (MAE).
  //   "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
  string optimization_objective = 4;

  // Required. The train budget of creating this model, expressed in milli
  // node hours, i.e. a value of 1,000 in this field means 1 node hour.
  //
  // The training cost of the model will not exceed this budget. The final
  // cost is attempted to be close to the budget, though it may end up being
  // (even) noticeably smaller - at the backend's discretion. This especially
  // may happen when further model training ceases to provide any
  // improvements.
  //
  // If the budget is set to a value known to be insufficient to train a
  // model for the given dataset, the training won't be attempted and
  // will error.
  //
  // The train budget must be between 1,000 and 72,000 milli node hours,
  // inclusive.
  int64 train_budget_milli_node_hours = 7;

  // Use the entire training budget. This disables the early stopping
  // feature. By default, the early stopping feature is enabled, which means
  // that AutoML Tables might stop training before the entire training budget
  // has been used.
  bool disable_early_stopping = 8;

  // Column name that should be used as the weight column.
  // Higher values in this column give more importance to the row
  // during model training. The column must have numeric values between 0 and
  // 10000 inclusively; 0 means the row is ignored for training. If the
  // weight column field is not set, then all rows are assumed to have equal
  // weight of 1.
  string weight_column_name = 9;

  // Configuration for exporting test set predictions to a BigQuery table. If
  // this configuration is absent, then the export is not performed.
  ExportEvaluatedDataItemsConfig export_evaluated_data_items_config = 10;

  // Additional experiment flags for the Tables training pipeline.
  repeated string additional_experiments = 11;
}

// Metadata of a Model that is specific to AutoML Tables.
message AutoMlTablesMetadata {
  // Output only. The actual training cost of the model, expressed in milli
  // node hours, i.e. a value of 1,000 in this field means 1 node hour.
  // Guaranteed to not exceed the train budget.
  int64 train_cost_milli_node_hours = 1;
}
|