automl_tables.proto

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1.schema.trainingjob.definition;

import "google/cloud/aiplatform/v1/schema/trainingjob/definition/export_evaluated_data_items_config.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1.Schema.TrainingJob.Definition";
option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1/schema/trainingjob/definition;definition";
option java_multiple_files = true;
option java_outer_classname = "AutoMLTablesProto";
option java_package = "com.google.cloud.aiplatform.v1.schema.trainingjob.definition";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1\\Schema\\TrainingJob\\Definition";
option ruby_package = "Google::Cloud::AIPlatform::V1::Schema::TrainingJob::Definition";
// A TrainingJob that trains and uploads an AutoML Tables Model.
message AutoMlTables {
  // The input parameters of this TrainingJob.
  AutoMlTablesInputs inputs = 1;

  // The metadata information.
  AutoMlTablesMetadata metadata = 2;
}
message AutoMlTablesInputs {
  message Transformation {
    // The training pipeline will infer the proper transformation based on
    // the statistics of the dataset.
    message AutoTransformation {
      string column_name = 1;
    }

    // The training pipeline will perform the following transformation
    // functions:
    // * The value converted to float32.
    // * The z_score of the value (see the note after this list).
    // * log(value+1) when the value is greater than or equal to 0.
    //   Otherwise, this transformation is not applied and the value is
    //   considered a missing value.
    // * z_score of log(value+1) when the value is greater than or equal to
    //   0. Otherwise, this transformation is not applied and the value is
    //   considered a missing value.
    // * A boolean value that indicates whether the value is valid.
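    // (Note: the following states the standard z-score definition rather
    // than anything this schema spells out: z_score(x) = (x - mean) / stddev,
    // with mean and stddev presumably estimated from the column's training
    // values.)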
    message NumericTransformation {
      string column_name = 1;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 2;
    }
    // The training pipeline will perform the following transformation
    // functions:
    // * The categorical string as is--no change to case, punctuation,
    //   spelling, tense, and so on.
    // * Convert the category name to a dictionary lookup index and generate
    //   an embedding for each index.
    // * Categories that appear less than 5 times in the training dataset are
    //   treated as the "unknown" category. The "unknown" category gets its
    //   own special lookup index and resulting embedding.
    message CategoricalTransformation {
      string column_name = 1;
    }
    // The training pipeline will perform the following transformation
    // functions:
    // * Apply the transformation functions for Numerical columns.
    // * Determine the year, month, day, and weekday. Treat each value from
    //   the timestamp as a Categorical column.
    // * Invalid numerical values (for example, values that fall outside of a
    //   typical timestamp range, or are extreme values) receive no special
    //   treatment and are not removed.
    message TimestampTransformation {
      string column_name = 1;
      // The format in which that time field is expressed. The time_format
      // must either be one of:
      // * `unix-seconds`
      // * `unix-milliseconds`
      // * `unix-microseconds`
      // * `unix-nanoseconds`
      // (for respectively the number of seconds, milliseconds, microseconds
      // and nanoseconds since the start of the Unix epoch);
      // or be written in `strftime` syntax. If time_format is not set, then
      // the default format is the RFC 3339 `date-time` format, where
      // `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z).
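      // For example (illustrative; the sample values and pattern are not
      // part of this schema): a column with values like
      // "2021-04-12 23:20:50" could be declared with
      // `time_format = "%Y-%m-%d %H:%M:%S"`.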
      string time_format = 2;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 3;
    }
    // The training pipeline will perform the following transformation
    // functions:
    // * The text as is--no change to case, punctuation, spelling, tense,
    //   and so on.
    // * Tokenize text to words. Convert each word to a dictionary lookup
    //   index and generate an embedding for each index. Combine the
    //   embedding of all elements into a single embedding using the mean.
    // * Tokenization is based on unicode script boundaries.
    // * Missing values get their own lookup index and resulting embedding.
    // * Stop-words receive no special treatment and are not removed.
    message TextTransformation {
      string column_name = 1;
    }
    // Treats the column as a numerical array and performs the following
    // transformation functions:
    // * All transformations for Numerical types applied to the average of
    //   all elements.
    // * The average of empty arrays is treated as zero.
    message NumericArrayTransformation {
      string column_name = 1;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 2;
    }
    // Treats the column as a categorical array and performs the following
    // transformation functions:
    // * For each element in the array, convert the category name to a
    //   dictionary lookup index and generate an embedding for each index.
    //   Combine the embedding of all elements into a single embedding using
    //   the mean.
    // * Empty arrays are treated as an embedding of zeroes.
    message CategoricalArrayTransformation {
      string column_name = 1;
    }
    // Treats the column as a text array and performs the following
    // transformation functions:
    // * Concatenate all text values in the array into a single text value
    //   using a space (" ") as a delimiter, and then treat the result as a
    //   single text value. Apply the transformations for Text columns.
    // * Empty arrays are treated as empty text.
    message TextArrayTransformation {
      string column_name = 1;
    }
    // The transformation that the training pipeline will apply to the input
    // columns.
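    // For example (illustrative; "monthly_spend" is a placeholder column
    // name), a single entry of the `transformations` list in proto3 JSON
    // form might read:
    //   { "numeric": { "columnName": "monthly_spend",
    //                  "invalidValuesAllowed": true } }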
    oneof transformation_detail {
      AutoTransformation auto = 1;
      NumericTransformation numeric = 2;
      CategoricalTransformation categorical = 3;
      TimestampTransformation timestamp = 4;
      TextTransformation text = 5;
      NumericArrayTransformation repeated_numeric = 6;
      CategoricalArrayTransformation repeated_categorical = 7;
      TextArrayTransformation repeated_text = 8;
    }
  }
  // Additional optimization objective configuration. Required for
  // `maximize-precision-at-recall` and `maximize-recall-at-precision`,
  // otherwise unused.
  oneof additional_optimization_objective_config {
    // Required when optimization_objective is "maximize-precision-at-recall".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_recall_value = 5;

    // Required when optimization_objective is "maximize-recall-at-precision".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_precision_value = 6;
  }
  // The type of prediction the Model is to produce.
  //   "classification" - Predict one out of multiple target values for
  //     each row.
  //   "regression" - Predict a value based on its relation to other values.
  //     This type is available only to columns that contain semantically
  //     numeric values, i.e. integers or floating point numbers, even if
  //     stored as e.g. strings.
  string prediction_type = 1;
  // The column name of the target column that the model is to predict.
  string target_column = 2;

  // Each transformation applies a transform function to its given input
  // column, and the result is used for training. When creating a
  // transformation for a BigQuery Struct column, the column should be
  // flattened using "." as the delimiter.
  repeated Transformation transformations = 3;
  // Objective function the model is optimizing towards. The training process
  // creates a model that maximizes/minimizes the value of the objective
  // function over the validation set.
  //
  // The supported optimization objectives depend on the prediction type.
  // If the field is not set, a default objective function is used.
  //
  // classification (binary):
  //   "maximize-au-roc" (default) - Maximize the area under the receiver
  //     operating characteristic (ROC) curve.
  //   "minimize-log-loss" - Minimize log loss.
  //   "maximize-au-prc" - Maximize the area under the precision-recall
  //     curve.
  //   "maximize-precision-at-recall" - Maximize precision for a specified
  //     recall value.
  //   "maximize-recall-at-precision" - Maximize recall for a specified
  //     precision value.
  //
  // classification (multi-class):
  //   "minimize-log-loss" (default) - Minimize log loss.
  //
  // regression:
  //   "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
  //   "minimize-mae" - Minimize mean-absolute error (MAE).
  //   "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
  string optimization_objective = 4;
  // Required. The train budget for creating this model, expressed in milli
  // node hours, i.e. a value of 1,000 in this field means 1 node hour.
  //
  // The training cost of the model will not exceed this budget. The final
  // cost will be attempted to be close to the budget, though it may end up
  // being (even) noticeably smaller - at the backend's discretion. This
  // especially may happen when further model training ceases to provide any
  // improvements.
  //
  // If the budget is set to a value known to be insufficient to train a
  // model for the given dataset, the training won't be attempted and will
  // error.
  //
  // The train budget must be between 1,000 and 72,000 milli node hours,
  // inclusive.
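  // For example: a budget of 8,000 milli node hours corresponds to 8 node
  // hours, and the allowed range of 1,000-72,000 corresponds to 1-72 node
  // hours.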
  int64 train_budget_milli_node_hours = 7;

  // Use the entire training budget. This disables the early stopping
  // feature. By default, the early stopping feature is enabled, which means
  // that AutoML Tables might stop training before the entire training
  // budget has been used.
  bool disable_early_stopping = 8;
  // Column name that should be used as the weight column. Higher values in
  // this column give more importance to the row during model training. The
  // column must have numeric values between 0 and 10000 inclusively; 0 means
  // the row is ignored for training. If the weight column field is not set,
  // then all rows are assumed to have an equal weight of 1.
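  // For example: a row with weight 0 is excluded from training entirely,
  // while (assuming weights act as simple multipliers, which this schema
  // does not spell out) a row with weight 2 would count roughly twice as
  // much as a row with weight 1.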
  string weight_column_name = 9;

  // Configuration for exporting test set predictions to a BigQuery table.
  // If this configuration is absent, then the export is not performed.
  ExportEvaluatedDataItemsConfig export_evaluated_data_items_config = 10;

  // Additional experiment flags for the Tables training pipeline.
  repeated string additional_experiments = 11;
}
// Model metadata specific to AutoML Tables.
message AutoMlTablesMetadata {
  // Output only. The actual training cost of the model, expressed in milli
  // node hours, i.e. a value of 1,000 in this field means 1 node hour.
  // Guaranteed to not exceed the train budget.
  int64 train_cost_milli_node_hours = 1;
}
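
Usage note: the messages above define the schema of the `trainingTaskInputs`
value passed when creating a Vertex AI training pipeline. The sketch below is
a minimal, unofficial illustration (not an official sample) of wiring an
AutoMlTablesInputs value into such a request with the google-cloud-aiplatform
Python client; the project, region, dataset ID, display names, and column
names are all placeholders, and the exact client surface may vary between SDK
versions.

from google.cloud import aiplatform_v1
from google.protobuf import json_format, struct_pb2

# proto3 JSON rendering of the AutoMlTablesInputs message defined above.
# All column names here are hypothetical.
inputs = {
    "predictionType": "classification",
    "targetColumn": "churned",
    "optimizationObjective": "maximize-precision-at-recall",
    "optimizationObjectiveRecallValue": 0.9,  # required for this objective
    "trainBudgetMilliNodeHours": 8000,        # 8 node hours
    "disableEarlyStopping": False,
    "transformations": [
        {"auto": {"columnName": "signup_channel"}},
        {"numeric": {"columnName": "monthly_spend",
                     "invalidValuesAllowed": True}},
        {"timestamp": {"columnName": "signup_time",
                       "timeFormat": "unix-seconds"}},
    ],
}

# training_task_inputs is a google.protobuf.Value, so parse the dict into one.
training_task_inputs = json_format.ParseDict(inputs, struct_pb2.Value())

pipeline = aiplatform_v1.TrainingPipeline(
    display_name="automl-tables-demo",
    # Schema URI identifying this trainingjob definition.
    training_task_definition=(
        "gs://google-cloud-aiplatform/schema/trainingjob/definition/"
        "automl_tables_1.0.0.yaml"
    ),
    training_task_inputs=training_task_inputs,
    input_data_config=aiplatform_v1.InputDataConfig(dataset_id="1234567890"),
    model_to_upload=aiplatform_v1.Model(display_name="automl-tables-demo"),
)

client = aiplatform_v1.PipelineServiceClient(
    client_options={"api_endpoint": "us-central1-aiplatform.googleapis.com"}
)
client.create_training_pipeline(
    parent="projects/MY_PROJECT/locations/us-central1",
    training_pipeline=pipeline,
)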