diff --git a/CHANGELOG.md b/CHANGELOG.md index a99e0ecd91..a0539af01e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,26 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.20.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.19.0...v1.20.0) (2024-09-25) + + +### Features + +* Add bigframes.bigquery.approx_top_count ([#1010](https://github.com/googleapis/python-bigquery-dataframes/issues/1010)) ([3263bd7](https://github.com/googleapis/python-bigquery-dataframes/commit/3263bd70cff01bc18f1ae4ac3d5aa7f9d70fd4b7)) +* Add bigframes.ml.compose.SQLScalarColumnTransformer to create custom SQL-based transformations ([#955](https://github.com/googleapis/python-bigquery-dataframes/issues/955)) ([1930b4e](https://github.com/googleapis/python-bigquery-dataframes/commit/1930b4efe60295751ceef89c2a824923a35b19af)) +* Allow multiple columns input for llm models ([#998](https://github.com/googleapis/python-bigquery-dataframes/issues/998)) ([2fe5e48](https://github.com/googleapis/python-bigquery-dataframes/commit/2fe5e48c56bbc359d3769824c83745d65a001dd7)) + + +### Bug Fixes + +* Fix __repr__ caching with partial ordering ([#1016](https://github.com/googleapis/python-bigquery-dataframes/issues/1016)) ([208a984](https://github.com/googleapis/python-bigquery-dataframes/commit/208a98475389f59d4e32e0cfbcc46824cac278a6)) + + +### Documentation + +* Limit pypi notebook to 7 days and add more info about differences with partial ordering mode ([#1013](https://github.com/googleapis/python-bigquery-dataframes/issues/1013)) ([3c54399](https://github.com/googleapis/python-bigquery-dataframes/commit/3c543990297ec3be0e30425ee841546217e26d2a)) +* Move and edit existing linear-regression tutorial snippet ([#991](https://github.com/googleapis/python-bigquery-dataframes/issues/991)) ([4cb62fd](https://github.com/googleapis/python-bigquery-dataframes/commit/4cb62fd74fc1ac3bb21da23b8639464a9ae3525d)) + ## 
[1.19.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.18.0...v1.19.0) (2024-09-24) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 303120b88a..28a818e709 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -272,6 +272,46 @@ def json_extract_array( return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) +# Approximate aggregate functions defined from +# https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions + + +def approx_top_count( + series: series.Series, + number: int, +) -> series.Series: + """Returns the approximate top elements of `expression` as an array of STRUCTs. + The number parameter specifies the number of elements returned. + + Each `STRUCT` contains two fields. The first field (named `value`) contains an input + value. The second field (named `count`) contains an `INT64` specifying the number + of times the value was returned. + + Returns `NULL` if there are zero input rows. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) + >>> bbq.approx_top_count(s, number=2) + [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] + + Args: + series (bigframes.series.Series): + The Series with any data type that the `GROUP BY` clause supports. + number (int): + An integer specifying the number of top elements to return. Must be at least 1. + + Returns: + bigframes.series.Series: A new Series with the result data. 
+ """ + if number < 1: + raise ValueError("The number of approx_top_count must be at least 1") + return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) + + def struct(value: dataframe.DataFrame) -> series.Series: """Takes a DataFrame and converts it into a Series of structs with each struct entry corresponding to a DataFrame row and each struct field diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 91a3045efb..b65953934d 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import annotations + import functools import typing from typing import cast, List, Optional @@ -19,6 +22,7 @@ import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.operations as ibis_ops import ibis.expr.types as ibis_types import pandas as pd @@ -196,6 +200,34 @@ def _( return cast(ibis_types.NumericValue, value) +@compile_unary_agg.register +def _( + op: agg_ops.ApproxTopCountOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.ArrayColumn: + # APPROX_TOP_COUNT has very few allowed windows. + if window is not None: + raise NotImplementedError( + f"Approx top count with windowing is not supported. {constants.FEEDBACK_LINK}" + ) + + # Define a user-defined function (UDF) that approximates the top counts of an expression. + # The type of value is dynamically matching the input column. + def approx_top_count(expression, number: ibis_dtypes.int64): # type: ignore + ... 
+ + return_type = ibis_dtypes.Array( + ibis_dtypes.Struct.from_tuples( + [("value", column.type()), ("count", ibis_dtypes.int64)] + ) + ) + approx_top_count.__annotations__["return"] = return_type + udf_op = ibis_ops.udf.agg.builtin(approx_top_count) + + return udf_op(expression=column, number=op.number) + + @compile_unary_agg.register @numeric_op def _( diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index 4978e75e38..1b0fe0d072 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -44,8 +44,8 @@ def can_fast_head(node: nodes.BigFrameNode) -> bool: """Can get head fast if can push head operator down to leafs and operators preserve rows.""" if isinstance(node, nodes.LeafNode): return node.supports_fast_head - if isinstance(node, nodes.UnaryNode): - return node.row_preserving and can_fast_head(node.child) + if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)): + return can_fast_head(node.child) return False diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 49a668f008..47c93bfa30 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -643,7 +643,6 @@ def __repr__(self) -> str: if opts.repr_mode == "deferred": return formatter.repr_query_job(self._compute_dry_run()) - self._cached() # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? 
@@ -2303,52 +2302,19 @@ def melt( self._block.melt(id_col_ids, val_col_ids, var_name, value_name) ) - _NUMERIC_DESCRIBE_AGGS = ( - "count", - "mean", - "std", - "min", - "25%", - "50%", - "75%", - "max", - ) - _NON_NUMERIC_DESCRIBE_AGGS = ("count", "nunique") - def describe(self, include: None | Literal["all"] = None) -> DataFrame: - - allowed_non_numeric_types = { - bigframes.dtypes.STRING_DTYPE, - bigframes.dtypes.BOOL_DTYPE, - bigframes.dtypes.BYTES_DTYPE, - } - if include is None: numeric_df = self._drop_non_numeric(permissive=False) if len(numeric_df.columns) == 0: # Describe eligible non-numeric columns - result = self.select_dtypes(include=allowed_non_numeric_types).agg( - self._NON_NUMERIC_DESCRIBE_AGGS - ) - else: - # Otherwise, only describe numeric columns - result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS) - return typing.cast(DataFrame, result) + return self._describe_non_numeric() - elif include == "all": - numeric_result = typing.cast( - DataFrame, - self._drop_non_numeric(permissive=False).agg( - self._NUMERIC_DESCRIBE_AGGS - ), - ) + # Otherwise, only describe numeric columns + return self._describe_numeric() - non_numeric_result = typing.cast( - DataFrame, - self.select_dtypes(include=allowed_non_numeric_types).agg( - self._NON_NUMERIC_DESCRIBE_AGGS - ), - ) + elif include == "all": + numeric_result = self._describe_numeric() + non_numeric_result = self._describe_non_numeric() if len(numeric_result.columns) == 0: return non_numeric_result @@ -2365,6 +2331,35 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame: else: raise ValueError(f"Unsupported include type: {include}") + def _describe_numeric(self) -> DataFrame: + return typing.cast( + DataFrame, + self._drop_non_numeric(permissive=False).agg( + [ + "count", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ] + ), + ) + + def _describe_non_numeric(self) -> DataFrame: + return typing.cast( + DataFrame, + self.select_dtypes( + include={ + 
bigframes.dtypes.STRING_DTYPE, + bigframes.dtypes.BOOL_DTYPE, + bigframes.dtypes.BYTES_DTYPE, + } + ).agg(["count", "nunique"]), + ) + def skew(self, *, numeric_only: bool = False): if not numeric_only: frame = self._raise_on_non_numeric("skew") diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 6ae06c9d9f..81181b58cf 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -198,10 +198,6 @@ def _extract_output_names(self): # pass the columns that are not transformed if "transformSql" not in transform_col_dict: continue - transform_sql: str = transform_col_dict["transformSql"] - if not transform_sql.startswith("ML."): - continue - output_names.append(transform_col_dict["name"]) self._output_names = output_names diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 3cfa1851f5..08c9761cc3 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -46,6 +46,101 @@ ) +class SQLScalarColumnTransformer: + r""" + Wrapper for plain SQL code contained in a ColumnTransformer. + + Create a single column transformer in plain sql. + This transformer can only be used inside ColumnTransformer. + + When creating an instance '{0}' can be used as placeholder + for the column to transform: + + SQLScalarColumnTransformer("{0}+1") + + The default target column gets the prefix 'transformed\_' + but can also be changed when creating an instance: + + SQLScalarColumnTransformer("{0}+1", "inc_{0}") + + **Examples:** + + >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer + >>> import bigframes.pandas as bpd + + >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]}) + >>> col_trans = ColumnTransformer([ + ... ("strlen", + ... SQLScalarColumnTransformer("CASE WHEN {0} IS NULL THEN 15 ELSE LENGTH({0}) END"), + ... ['name', 'city']), + ... 
]) + >>> col_trans = col_trans.fit(df) + >>> df_transformed = col_trans.transform(df) + >>> df_transformed + transformed_name transformed_city + 0 5 8 + 1 15 6 + 2 4 15 + + [3 rows x 2 columns] + + SQLScalarColumnTransformer can be combined with other transformers, like StandardScaler: + + >>> col_trans = ColumnTransformer([ + ... ("identity", SQLScalarColumnTransformer("{0}", target_column="{0}"), ["col1", "col5"]), + ... ("increment", SQLScalarColumnTransformer("{0}+1", target_column="inc_{0}"), "col2"), + ... ("stdscale", preprocessing.StandardScaler(), "col3"), + ... # ... + ... ]) + + """ + + def __init__(self, sql: str, target_column: str = "transformed_{0}"): + super().__init__() + self._sql = sql + self._target_column = target_column.replace("`", "") + + PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE) + + def escape(self, colname: str): + colname = colname.replace("`", "") + if self.PLAIN_COLNAME_RX.match(colname): + return colname + return f"`{colname}`" + + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: + if columns is None: + columns = X.columns + result = [] + for column in columns: + current_sql = self._sql.format(self.escape(column)) + current_target_column = self.escape(self._target_column.format(column)) + result.append(f"{current_sql} AS {current_target_column}") + return result + + def __repr__(self): + return f"SQLScalarColumnTransformer(sql='{self._sql}', target_column='{self._target_column}')" + + def __eq__(self, other) -> bool: + return type(self) is type(other) and self._keys() == other._keys() + + def __hash__(self) -> int: + return hash(self._keys()) + + def _keys(self): + return (self._sql, self._target_column) + + +# Type hints for transformers contained in ColumnTransformer +SingleColTransformer = Union[ + preprocessing.PreprocessingType, + impute.SimpleImputer, + SQLScalarColumnTransformer, +] + + @log_adapter.class_logger class ColumnTransformer( 
base.Transformer, @@ -60,7 +155,7 @@ def __init__( transformers: Iterable[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer], + SingleColTransformer, Union[str, Iterable[str]], ] ], @@ -78,14 +173,12 @@ def _keys(self): @property def transformers_( self, - ) -> List[ - Tuple[str, Union[preprocessing.PreprocessingType, impute.SimpleImputer], str] - ]: + ) -> List[Tuple[str, SingleColTransformer, str,]]: """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer], + SingleColTransformer, str, ] ] = [] @@ -103,6 +196,8 @@ def transformers_( return result + AS_FLEXNAME_SUFFIX_RX = re.compile("^(.*)\\bAS\\s*`[^`]+`\\s*$", re.IGNORECASE) + @classmethod def _extract_from_bq_model( cls, @@ -114,7 +209,7 @@ def _extract_from_bq_model( transformers_set: Set[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer], + SingleColTransformer, Union[str, List[str]], ] ] = set() @@ -130,8 +225,11 @@ def camel_to_snake(name): if "transformSql" not in transform_col_dict: continue transform_sql: str = transform_col_dict["transformSql"] - if not transform_sql.startswith("ML."): - continue + + # workaround for bug in bq_model returning " AS `...`" suffix for flexible names + flex_name_match = cls.AS_FLEXNAME_SUFFIX_RX.match(transform_sql) + if flex_name_match: + transform_sql = flex_name_match.group(1) output_names.append(transform_col_dict["name"]) found_transformer = False @@ -148,8 +246,22 @@ def camel_to_snake(name): found_transformer = True break if not found_transformer: - raise NotImplementedError( - f"Unsupported transformer type. {constants.FEEDBACK_LINK}" + if transform_sql.startswith("ML."): + raise NotImplementedError( + f"Unsupported transformer type. 
{constants.FEEDBACK_LINK}" + ) + + target_column = transform_col_dict["name"] + sql_transformer = SQLScalarColumnTransformer( + transform_sql, target_column=target_column + ) + input_column_name = f"?{target_column}" + transformers_set.add( + ( + camel_to_snake(sql_transformer.__class__.__name__), + sql_transformer, + input_column_name, + ) ) transformer = cls(transformers=list(transformers_set)) @@ -167,6 +279,8 @@ def _merge( assert len(transformers) > 0 _, transformer_0, column_0 = transformers[0] + if isinstance(transformer_0, SQLScalarColumnTransformer): + return self # SQLScalarColumnTransformer only work inside ColumnTransformer feature_columns_sorted = sorted( [ cast(str, feature_column.name) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 53a9d40c6e..3d11cd123e 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -244,7 +244,7 @@ def predict( Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame or Series, which contains only one column of prompts. + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. temperature (float, default 0.0): @@ -307,14 +307,10 @@ def predict( (X,) = utils.convert_to_dataframe(X) - if len(X.columns) != 1: - raise ValueError( - f"Only support one column as input. 
{constants.FEEDBACK_LINK}" - ) - - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "prompt"}) + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "prompt"}) options = { "temperature": temperature, @@ -522,7 +518,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame, which needs to contain a column with name "content". Only the column will be used as input. Content can include preamble, questions, suggestions, instructions, or examples. + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. @@ -531,14 +527,10 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models (X,) = utils.convert_to_dataframe(X) - if len(X.columns) != 1: - raise ValueError( - f"Only support one column as input. {constants.FEEDBACK_LINK}" - ) - - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "content"}) + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "content"}) options = { "flatten_json_output": True, @@ -679,7 +671,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame, which needs to contain a column with name "content". Only the column will be used as input. 
Content can include preamble, questions, suggestions, instructions, or examples. + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. @@ -688,14 +680,10 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models (X,) = utils.convert_to_dataframe(X) - if len(X.columns) != 1: - raise ValueError( - f"Only support one column as input. {constants.FEEDBACK_LINK}" - ) - - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "content"}) + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "content"}) options = { "flatten_json_output": True, @@ -893,7 +881,7 @@ def predict( Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame or Series, which contains only one column of prompts. + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. temperature (float, default 0.9): @@ -938,14 +926,10 @@ def predict( (X,) = utils.convert_to_dataframe(X) - if len(X.columns) != 1: - raise ValueError( - f"Only support one column as input. 
{constants.FEEDBACK_LINK}" - ) - - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "prompt"}) + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "prompt"}) options = { "temperature": temperature, @@ -1181,7 +1165,7 @@ def predict( Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame or Series, which contains only one column of prompts. + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. max_output_tokens (int, default 128): @@ -1222,14 +1206,10 @@ def predict( (X,) = utils.convert_to_dataframe(X) - if len(X.columns) != 1: - raise ValueError( - f"Only support one column as input. {constants.FEEDBACK_LINK}" - ) - - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "prompt"}) + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "prompt"}) options = { "max_output_tokens": max_output_tokens, diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index d071889ac4..faba7465d9 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -184,6 +184,23 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return input_types[0] +@dataclasses.dataclass(frozen=True) +class ApproxTopCountOp(UnaryAggregateOp): + name: typing.ClassVar[str] = "approx_top_count" + number: int + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if not dtypes.is_orderable(input_types[0]): + raise TypeError(f"Type {input_types[0]} 
is not orderable") + + input_type = input_types[0] + fields = [ + pa.field("value", dtypes.bigframes_dtype_to_arrow_dtype(input_type)), + pa.field("count", pa.int64()), + ] + return pd.ArrowDtype(pa.list_(pa.struct(fields))) + + @dataclasses.dataclass(frozen=True) class MeanOp(UnaryAggregateOp): name: ClassVar[str] = "mean" diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index d2a2e0f1b2..f89b5aefec 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -360,11 +360,6 @@ def _cache_with_cluster_cols( def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): """Executes the query and uses the resulting table to rewrite future executions.""" - - if not self.strictly_ordered: - raise ValueError( - "Caching with offsets only supported in strictly ordered mode." - ) offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") w_offsets, offset_column = array_value.promote_offsets() sql = self.compiler.compile_unordered(self._get_optimized_plan(w_offsets.node)) diff --git a/bigframes/version.py b/bigframes/version.py index 5dda345fcb..60f4942175 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.19.0" +__version__ = "1.20.0" diff --git a/notebooks/dataframes/pypi.ipynb b/notebooks/dataframes/pypi.ipynb index 7b16412ff5..a62bd45768 100644 --- a/notebooks/dataframes/pypi.ipynb +++ b/notebooks/dataframes/pypi.ipynb @@ -40,6 +40,16 @@ "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "# Choose a package which you want to visualize.\n", + "package_name = \"pandas\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "import bigframes.pandas as bpd\n", "\n", @@ -58,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -83,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -111,24 +121,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Limit to the most recent 30 days of data\n", + "## Limit to the most recent 7 days of data\n", "\n", - "The PyPI and deps.dev tables are partitioned by date. Query only the most recent 30 days of data to reduce the number of bytes scanned.\n", + "The PyPI and deps.dev tables are partitioned by date. Query only the most recent 7 days of data to reduce the number of bytes scanned.\n", "\n", "Just as with the default ordering mode, filters can be describe in a pandas-compatible way by passing a Boolean Series to the DataFrame's `__getitem__` accessor." 
] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import datetime\n", "\n", - "last_30_days = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=30)\n", - "pypi = pypi[pypi[\"timestamp\"] > last_30_days]\n", - "deps = deps[(deps[\"SnapshotAt\"] > last_30_days) & (deps[\"System\"] == \"PYPI\")]" + "now = datetime.datetime.now(datetime.timezone.utc)\n", + "last_7_days = now - datetime.timedelta(days=7)\n", + "last_30_days = now - datetime.timedelta(days=30)\n", + "pypi = pypi[pypi[\"timestamp\"] > last_7_days]\n", + "deps = deps[deps[\"SnapshotAt\"] > last_30_days] # deps are refreshed less frequently\n", + "deps = deps[deps[\"System\"] == \"PYPI\"]" ] }, { @@ -143,15 +156,295 @@ "```\n", "import datetime\n", "\n", - "last_30_days = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=30)\n", + "last_7_days = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=7)\n", "\n", "# Without ordering_mode = \"partial\", one must limit the data at \"read\" time to reduce bytes scanned.\n", "pypi = bpd.read_gbq_table(\n", " \"bigquery-public-data.pypi.file_downloads\",\n", " columns=[\"timestamp\", \"project\"],\n", - " filters=[(\"timestamp\", \">\", last_30_days)],\n", + " filters=[(\"timestamp\", \">\", last_7_days)],\n", ")\n", - "```" + "```\n", + "\n", + "`head()` is not available when no ordering has been established. It fails with `OrderRequiredError`. Use `peek()` instead to download a sample of the data. This will be much more efficient, as the query doesn't need to order all rows to determine which are first." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 423d9d93-1495-4c76-b8c2-e830a6e19ff4 is DONE. 110.3 MB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampcountry_codeurlprojectfiledetailstls_protocoltls_cipher
02024-09-18 18:15:04+00:00US/packages/ff/c8/4cd4b2834012ffc71ae3fd69187f08...aiobreaker{'filename': 'aiobreaker-1.2.0-py3-none-any.wh...{'installer': {'name': 'pip', 'version': '21.1...TLSv1.3TLS_AES_128_GCM_SHA256
12024-09-18 18:29:50+00:00US/packages/21/8e/4562029e179226051cd4aa3135444d...aiobotocore{'filename': 'aiobotocore-1.3.0.tar.gz', 'proj...{'installer': {'name': 'pip', 'version': '24.1...TLSv1.2ECDHE-RSA-AES128-GCM-SHA256
22024-09-18 18:22:14+00:00US/packages/11/16/4226e59bb72e096d9809ccedf349a1...aiobotocore{'filename': 'aiobotocore-2.0.1.tar.gz', 'proj...{'installer': {'name': 'pip', 'version': '24.2...TLSv1.2ECDHE-RSA-AES128-GCM-SHA256
32024-09-18 18:22:08+00:00US/packages/11/16/4226e59bb72e096d9809ccedf349a1...aiobotocore{'filename': 'aiobotocore-2.0.1.tar.gz', 'proj...{'installer': {'name': 'pip', 'version': '24.2...TLSv1.2ECDHE-RSA-AES128-GCM-SHA256
42024-09-18 18:29:22+00:00US/packages/54/b7/453119271cc4c36b07fdeab9b0ff25...aiobotocore{'filename': 'aiobotocore-2.3.3.tar.gz', 'proj...{'installer': {'name': 'pip', 'version': '24.1...TLSv1.2ECDHE-RSA-AES128-GCM-SHA256
\n", + "
" + ], + "text/plain": [ + " timestamp country_code \\\n", + "0 2024-09-18 18:15:04+00:00 US \n", + "1 2024-09-18 18:29:50+00:00 US \n", + "2 2024-09-18 18:22:14+00:00 US \n", + "3 2024-09-18 18:22:08+00:00 US \n", + "4 2024-09-18 18:29:22+00:00 US \n", + "\n", + " url project \\\n", + "0 /packages/ff/c8/4cd4b2834012ffc71ae3fd69187f08... aiobreaker \n", + "1 /packages/21/8e/4562029e179226051cd4aa3135444d... aiobotocore \n", + "2 /packages/11/16/4226e59bb72e096d9809ccedf349a1... aiobotocore \n", + "3 /packages/11/16/4226e59bb72e096d9809ccedf349a1... aiobotocore \n", + "4 /packages/54/b7/453119271cc4c36b07fdeab9b0ff25... aiobotocore \n", + "\n", + " file \\\n", + "0 {'filename': 'aiobreaker-1.2.0-py3-none-any.wh... \n", + "1 {'filename': 'aiobotocore-1.3.0.tar.gz', 'proj... \n", + "2 {'filename': 'aiobotocore-2.0.1.tar.gz', 'proj... \n", + "3 {'filename': 'aiobotocore-2.0.1.tar.gz', 'proj... \n", + "4 {'filename': 'aiobotocore-2.3.3.tar.gz', 'proj... \n", + "\n", + " details tls_protocol \\\n", + "0 {'installer': {'name': 'pip', 'version': '21.1... TLSv1.3 \n", + "1 {'installer': {'name': 'pip', 'version': '24.1... TLSv1.2 \n", + "2 {'installer': {'name': 'pip', 'version': '24.2... TLSv1.2 \n", + "3 {'installer': {'name': 'pip', 'version': '24.2... TLSv1.2 \n", + "4 {'installer': {'name': 'pip', 'version': '24.1... 
TLSv1.2 \n", + "\n", + " tls_cipher \n", + "0 TLS_AES_128_GCM_SHA256 \n", + "1 ECDHE-RSA-AES128-GCM-SHA256 \n", + "2 ECDHE-RSA-AES128-GCM-SHA256 \n", + "3 ECDHE-RSA-AES128-GCM-SHA256 \n", + "4 ECDHE-RSA-AES128-GCM-SHA256 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Warning: Ensure bpd.options.bigquery.ordering_mode = \"partial\" or else\n", + "# this query() will cause a full table scan because of the sequential index.\n", + "assert bpd.options.bigquery.ordering_mode == \"partial\"\n", + "pypi.peek()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 3a421217-59e2-4722-8382-0930f0a3b9ee is DONE. 1.5 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SnapshotAtSystemNameVersionDependencyMinimumDepth
02024-08-29 04:39:16.121656+00:00PYPIzxkane-cdk-construct-simple-nat0.2.89{'System': 'PYPI', 'Name': 'attrs', 'Version':...2
12024-08-29 04:39:16.121656+00:00PYPIzxkane-cdk-construct-simple-nat0.2.82{'System': 'PYPI', 'Name': 'attrs', 'Version':...2
22024-08-29 04:39:16.121656+00:00PYPIzxkane-cdk-construct-simple-nat0.2.88{'System': 'PYPI', 'Name': 'attrs', 'Version':...2
32024-08-29 04:39:16.121656+00:00PYPIzxkane-cdk-construct-simple-nat0.2.91{'System': 'PYPI', 'Name': 'attrs', 'Version':...2
42024-08-29 04:39:16.121656+00:00PYPIzxkane-cdk-construct-simple-nat0.2.77{'System': 'PYPI', 'Name': 'attrs', 'Version':...2
\n", + "
" + ], + "text/plain": [ + " SnapshotAt System Name \\\n", + "0 2024-08-29 04:39:16.121656+00:00 PYPI zxkane-cdk-construct-simple-nat \n", + "1 2024-08-29 04:39:16.121656+00:00 PYPI zxkane-cdk-construct-simple-nat \n", + "2 2024-08-29 04:39:16.121656+00:00 PYPI zxkane-cdk-construct-simple-nat \n", + "3 2024-08-29 04:39:16.121656+00:00 PYPI zxkane-cdk-construct-simple-nat \n", + "4 2024-08-29 04:39:16.121656+00:00 PYPI zxkane-cdk-construct-simple-nat \n", + "\n", + " Version Dependency MinimumDepth \n", + "0 0.2.89 {'System': 'PYPI', 'Name': 'attrs', 'Version':... 2 \n", + "1 0.2.82 {'System': 'PYPI', 'Name': 'attrs', 'Version':... 2 \n", + "2 0.2.88 {'System': 'PYPI', 'Name': 'attrs', 'Version':... 2 \n", + "3 0.2.91 {'System': 'PYPI', 'Name': 'attrs', 'Version':... 2 \n", + "4 0.2.77 {'System': 'PYPI', 'Name': 'attrs', 'Version':... 2 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "deps.peek()" ] }, { @@ -167,7 +460,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -185,11 +478,111 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 5b69917f-9ed7-483a-9241-0083acea9990 is DONE. 1.1 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ac94c55d-ce8e-4694-ad97-55c933cf3053 is DONE. 123 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameDependencyNamesize
0pandaspytz168
1pandasnumpy168
2pandaspython-dateutil168
3pandassix168
4pandastzdata56
\n", + "
" + ], + "text/plain": [ + " Name DependencyName size\n", + "0 pandas pytz 168\n", + "1 pandas numpy 168\n", + "2 pandas python-dateutil 168\n", + "3 pandas six 168\n", + "4 pandas tzdata 56" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "pandas_deps = deps[deps[\"Name\"] == \"pandas\"].groupby([\"Name\", \"DependencyName\"], as_index=False).size()" + "package_deps = deps[deps[\"Name\"] == package_name].groupby([\"Name\", \"DependencyName\"], as_index=False).size()\n", + "package_deps.peek()" ] }, { @@ -205,14 +598,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "pandas_and_deps = bpd.concat(\n", " [\n", - " pandas_deps.drop(columns=[\"Name\", \"size\"]).rename(columns={\"DependencyName\": \"Name\"}),\n", - " bpd.DataFrame({\"Name\": [\"pandas\"]}),\n", + " package_deps.drop(columns=[\"Name\", \"size\"]).rename(columns={\"DependencyName\": \"Name\"}),\n", + " bpd.DataFrame({\"Name\": [package_name]}),\n", " ],\n", "\n", " # To join DataFrames that have a NULL index, set ignore_index = True.\n", @@ -229,7 +622,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -240,18 +633,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a time series to visualize by grouping by the date, extracted from the `timestamp` column." + "Create a time series to visualize by grouping by the date, extracted from the `timestamp` column.\n", + "\n", + "**Note:** If you don't `peek()` at your data and only do grouped aggregations, BigQuery DataFrames can eliminate unnecessary ordering from the compilation even without `ordering_mode = \"partial\"`.\n", + "\n", + "When BigQuery DataFrames aggregates over columns, those columns provide a\n", + "unique key post-aggregation that is used for ordering. Any ordering applied before is overridden. 
By aggregating over\n", + "a time series, the line plots will render in the expected order." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 88c93524-d74f-4bbd-b6b1-0fb752ebedde is DONE. 28.6 GB processed. Open Job" + "Query job 57037a4c-5b8b-4f30-a5c6-bfeb9731a38f is DONE. 270.4 GB processed. Open Job" ], "text/plain": [ "" @@ -259,14 +658,86 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a8ea0b8e-2260-4175-b80d-668a2411c6ad is DONE. 2.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "date project \n", + "2024-09-17 numpy 2572982\n", + " pandas 2195692\n", + " python-dateutil 3608119\n", + " pytz 1863133\n", + " six 3064640\n", + " tzdata 991989\n", + "2024-09-18 numpy 13282573\n", + " pandas 10856758\n", + " python-dateutil 17877058\n", + " pytz 9450103\n", + " six 15225000\n", + " tzdata 5230039\n", + "2024-09-19 numpy 13637868\n", + " pandas 11077817\n", + " python-dateutil 18449777\n", + " pytz 9690329\n", + " six 15706263\n", + " tzdata 5473910\n", + "2024-09-20 numpy 12609524\n", + " pandas 10758593\n", + " python-dateutil 17257536\n", + " pytz 9082050\n", + " six 14489456\n", + " tzdata 5206738\n", + "2024-09-21 numpy 8316481\n", + " pandas 7483241\n", + " python-dateutil 11604691\n", + " pytz 5494178\n", + " six 8814983\n", + " tzdata 3141578\n", + "2024-09-22 numpy 7768078\n", + " pandas 6566272\n", + " python-dateutil 10835755\n", + " pytz 5130018\n", + " six 8297507\n", + " tzdata 2811247\n", + "2024-09-23 numpy 12389164\n", + " pandas 10758931\n", + " python-dateutil 17153013\n", + " pytz 9045824\n", + " six 14512209\n", + " tzdata 5214048\n", + "2024-09-24 numpy 10385658\n", + " pandas 8830996\n", + " python-dateutil 14066307\n", + " pytz 7425446\n", + " six 11917222\n", + " tzdata 4550626\n", + 
"dtype: Int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# When BigQuery DataFrames aggregates over columns, those columns provide a\n", - "# unique key post-aggregation that is used for ordering. By aggregating over\n", - "# a time series, the line plots will render in the expexted order.\n", "pandas_pypi = pandas_pypi.assign(date=pandas_pypi[\"timestamp\"].dt.date)\n", - "downloads_per_day = pandas_pypi.groupby([\"date\", \"project\"]).size().unstack()" + "downloads_per_day = pandas_pypi.groupby([\"date\", \"project\"]).size()\n", + "\n", + "# Cache after the aggregation so that the aggregation only runs once.\n", + "downloads_per_day.cache()\n", + "downloads_per_day.to_pandas()" ] }, { @@ -278,13 +749,25 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 09bcc1a0-def3-474f-acd5-1d55b9653e07 is DONE. 28.6 GB processed. Open Job" + "Query job 319558aa-e092-4fd0-a8aa-447fca216a57 is DONE. 1.6 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job e64eb861-bd12-4748-9c29-d97990aa1241 is DONE. 1.2 kB processed. Open Job" ], "text/plain": [ "" @@ -299,13 +782,13 @@ "" ] }, - "execution_count": 19, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -315,8 +798,15 @@ } ], "source": [ - "downloads_per_day.plot.line(rot=45, ylabel=\"daily downloads\", ylim=(0, 2e7))" + "downloads_per_day.unstack().plot.line(rot=45, ylabel=\"daily downloads\", ylim=(0, 2e7))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -335,7 +825,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/samples/snippets/linear_regression_tutorial_test.py b/samples/snippets/linear_regression_tutorial_test.py new file mode 100644 index 0000000000..0c861d1120 --- /dev/null +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -0,0 +1,42 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_linear_regression(random_model_id: str) -> None: + your_model_id = random_model_id + # [START bigquery_dataframes_bqml_linear_regression] + from bigframes.ml.linear_model import LinearRegression + import bigframes.pandas as bpd + + # Load data from BigQuery + bq_df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + + # Drop rows with nulls to get training data + training_data = bq_df.dropna(subset=["body_mass_g"]) + + # Specify your feature (or input) columns and the label (or output) column: + feature_columns = training_data.drop(columns=["body_mass_g"]) + label_columns = training_data[["body_mass_g"]] + + # Create the linear model + model = LinearRegression() + model.fit(feature_columns, label_columns) + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.penguins_model" + replace=True, + ) + # [END bigquery_dataframes_bqml_linear_regression] + assert feature_columns is not None + assert label_columns is not None + assert model is not None diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 5ee2dc6397..d9246eecfb 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -154,9 +154,9 @@ def session_load() -> Generator[bigframes.Session, None, None]: session.close() # close generated session at cleanup time -@pytest.fixture(scope="session", params=["ordered", "unordered"]) +@pytest.fixture(scope="session", params=["strict", "partial"]) def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions(location="US", ordering_mode="partial") + context = bigframes.BigQueryOptions(location="US", ordering_mode=request.param) session = bigframes.Session(context=context) yield session session.close() # close generated session at cleanup type diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 59c5a1538f..ba963837e5 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py 
@@ -36,6 +36,32 @@ def test_columntransformer_standalone_fit_and_transform( preprocessing.MinMaxScaler(), ["culmen_length_mm"], ), + ( + "increment", + compose.SQLScalarColumnTransformer("{0}+1"), + ["culmen_length_mm", "flipper_length_mm"], + ), + ( + "length", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", + target_column="len_{0}", + ), + "species", + ), + ( + "ohe", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0}='Adelie Penguin (Pygoscelis adeliae)' THEN 1 ELSE 0 END", + target_column="ohe_adelie", + ), + "species", + ), + ( + "identity", + compose.SQLScalarColumnTransformer("{0}", target_column="{0}"), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) @@ -51,6 +77,12 @@ def test_columntransformer_standalone_fit_and_transform( "standard_scaled_culmen_length_mm", "min_max_scaled_culmen_length_mm", "standard_scaled_flipper_length_mm", + "transformed_culmen_length_mm", + "transformed_flipper_length_mm", + "len_species", + "ohe_adelie", + "culmen_length_mm", + "flipper_length_mm", ], index=[1633, 1672, 1690], col_exact=False, @@ -70,6 +102,19 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "length", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", + target_column="len_{0}", + ), + "species", + ), + ( + "identity", + compose.SQLScalarColumnTransformer("{0}", target_column="{0}"), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) @@ -83,6 +128,9 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): "onehotencoded_species", "standard_scaled_culmen_length_mm", "standard_scaled_flipper_length_mm", + "len_species", + "culmen_length_mm", + "flipper_length_mm", ], index=[1633, 1672, 1690], col_exact=False, @@ -102,6 +150,27 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): preprocessing.StandardScaler(), 
["culmen_length_mm", "flipper_length_mm"], ), + ( + "length", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", + target_column="len_{0}", + ), + "species", + ), + ( + "identity", + compose.SQLScalarColumnTransformer("{0}", target_column="{0}"), + ["culmen_length_mm", "flipper_length_mm"], + ), + ( + "flexname", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", + target_column="Flex {0} Name", + ), + "species", + ), ] ) transformer.fit( @@ -122,6 +191,36 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ), ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), + ( + "sql_scalar_column_transformer", + compose.SQLScalarColumnTransformer( + "CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END", + target_column="len_species", + ), + "?len_species", + ), + ( + "sql_scalar_column_transformer", + compose.SQLScalarColumnTransformer( + "flipper_length_mm", target_column="flipper_length_mm" + ), + "?flipper_length_mm", + ), + ( + "sql_scalar_column_transformer", + compose.SQLScalarColumnTransformer( + "culmen_length_mm", target_column="culmen_length_mm" + ), + "?culmen_length_mm", + ), + ( + "sql_scalar_column_transformer", + compose.SQLScalarColumnTransformer( + "CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END ", + target_column="Flex species Name", + ), + "?Flex species Name", + ), ] assert set(reloaded_transformer.transformers) == set(expected) assert reloaded_transformer._bqml_model is not None @@ -136,6 +235,10 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): "onehotencoded_species", "standard_scaled_culmen_length_mm", "standard_scaled_flipper_length_mm", + "len_species", + "culmen_length_mm", + "flipper_length_mm", + "Flex species Name", ], index=[1633, 1672, 1690], col_exact=False, diff --git a/tests/system/load/test_llm.py 
b/tests/system/load/test_llm.py index 1d13300115..51b45485ad 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -156,3 +156,27 @@ def test_claude3_text_generator_predict_with_params_success( utils.check_pandas_df_schema_and_index( df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False ) + + +@pytest.mark.parametrize( + "model_name", + ("claude-3-sonnet", "claude-3-haiku", "claude-3-5-sonnet", "claude-3-opus"), +) +@pytest.mark.flaky(retries=3, delay=120) +def test_claude3_text_generator_predict_multi_col_success( + llm_text_df, model_name, session, session_us_east5, bq_connection +): + if model_name in ("claude-3-5-sonnet", "claude-3-opus"): + session = session_us_east5 + + llm_text_df["additional_col"] = 1 + claude3_text_generator_model = llm.Claude3TextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + df = claude3_text_generator_model.predict(llm_text_df).to_pandas() + utils.check_pandas_df_schema_and_index( + df, + columns=utils.ML_GENERATE_TEXT_OUTPUT + ["additional_col"], + index=3, + col_exact=False, + ) diff --git a/tests/system/small/bigquery/test_approx_agg.py b/tests/system/small/bigquery/test_approx_agg.py new file mode 100644 index 0000000000..c88f5850f8 --- /dev/null +++ b/tests/system/small/bigquery/test_approx_agg.py @@ -0,0 +1,76 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + pytest.param( + [1, 2, 3, 3, 2], [{"value": 3, "count": 2}, {"value": 2, "count": 2}] + ), + pytest.param( + ["apple", "apple", "pear", "pear", "pear", "banana"], + [{"value": "pear", "count": 3}, {"value": "apple", "count": 2}], + ), + pytest.param( + [True, False, True, False, True], + [{"value": True, "count": 3}, {"value": False, "count": 2}], + ), + pytest.param( + [], + [], + ), + pytest.param( + [[1, 2], [1], [1, 2]], + [], + marks=pytest.mark.xfail(raises=TypeError), + ), + ], + ids=["int64", "string", "bool", "null", "array"], +) +def test_approx_top_count_w_dtypes(data, expected): + s = bpd.Series(data) + result = bbq.approx_top_count(s, number=2) + assert result == expected + + +@pytest.mark.parametrize( + ("number", "expected"), + [ + pytest.param( + 0, + [], + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param(1, [{"value": 3, "count": 2}]), + pytest.param( + 4, + [ + {"value": 3, "count": 2}, + {"value": 2, "count": 2}, + {"value": 1, "count": 1}, + ], + ), + ], + ids=["zero", "one", "full"], +) +def test_approx_top_count_w_numbers(number, expected): + s = bpd.Series([1, 2, 3, 3, 2]) + result = bbq.approx_top_count(s, number=number) + assert result == expected diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 3093a36534..a4a09731a1 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -15,6 +15,7 @@ import pytest from bigframes.ml import llm +import bigframes.pandas as bpd from tests.system import utils @@ -166,6 +167,20 @@ def test_text_generator_predict_arbitrary_col_label_success( ) +@pytest.mark.flaky(retries=2) +def test_text_generator_predict_multiple_cols_success( + palm2_text_generator_model, llm_text_df: bpd.DataFrame +): + df = llm_text_df.assign(additional_col=1) + pd_df = 
palm2_text_generator_model.predict(df).to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=utils.ML_GENERATE_TEXT_OUTPUT + ["additional_col"], + index=3, + col_exact=False, + ) + + @pytest.mark.flaky(retries=2) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df @@ -212,11 +227,33 @@ def test_text_embedding_generator_predict_default_params_success( model_name=model_name, connection_name=bq_connection, session=session ) df = text_embedding_model.predict(llm_text_df).to_pandas() - assert df.shape == (3, 4) - assert "ml_generate_embedding_result" in df.columns - series = df["ml_generate_embedding_result"] - value = series[0] - assert len(value) == 768 + utils.check_pandas_df_schema_and_index( + df, columns=utils.ML_GENERATE_EMBEDDING_OUTPUT, index=3, col_exact=False + ) + assert len(df["ml_generate_embedding_result"][0]) == 768 + + +@pytest.mark.parametrize( + "model_name", + ("text-embedding-004", "text-multilingual-embedding-002"), +) +@pytest.mark.flaky(retries=2) +def test_text_embedding_generator_multi_cols_predict_success( + llm_text_df: bpd.DataFrame, model_name, session, bq_connection +): + df = llm_text_df.assign(additional_col=1) + df = df.rename(columns={"prompt": "content"}) + text_embedding_model = llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + pd_df = text_embedding_model.predict(df).to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=utils.ML_GENERATE_EMBEDDING_OUTPUT + ["additional_col"], + index=3, + col_exact=False, + ) + assert len(pd_df["ml_generate_embedding_result"][0]) == 768 @pytest.mark.parametrize( @@ -295,6 +332,33 @@ def test_gemini_text_generator_predict_with_params_success( ) +@pytest.mark.parametrize( + "model_name", + ( + "gemini-pro", + "gemini-1.5-pro-preview-0514", + "gemini-1.5-flash-preview-0514", + "gemini-1.5-pro-001", + "gemini-1.5-flash-001", + ), +) +@pytest.mark.flaky(retries=2) +def 
test_gemini_text_generator_multi_cols_predict_success( + llm_text_df: bpd.DataFrame, model_name, session, bq_connection +): + df = llm_text_df.assign(additional_col=1) + gemini_text_generator_model = llm.GeminiTextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + pd_df = gemini_text_generator_model.predict(df).to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=utils.ML_GENERATE_TEXT_OUTPUT + ["additional_col"], + index=3, + col_exact=False, + ) + + @pytest.mark.flaky(retries=2) def test_llm_palm_score(llm_fine_tune_df_default_index): model = llm.PaLM2TextGenerator(model_name="text-bison") diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 0a637e983f..340df93791 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -567,6 +567,30 @@ def test_repr_w_all_rows(scalars_dfs): assert actual == expected +def test_join_repr(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + scalars_df = ( + scalars_df[["int64_col"]] + .join(scalars_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + scalars_pandas_df = ( + scalars_pandas_df[["int64_col"]] + .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + # Pandas join result index name seems to depend on the index values in a way that bigframes can't match exactly + scalars_pandas_df.index.name = None + + actual = repr(scalars_df) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df) + + assert actual == expected + + def test_repr_html_w_all_rows(scalars_dfs): scalars_df, _ = scalars_dfs # get a pandas df of the expected format diff --git a/tests/system/utils.py b/tests/system/utils.py index 26e3e97e24..83d0e683bc 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -50,6 +50,12 @@ "ml_generate_text_status", "prompt", ] 
+ML_GENERATE_EMBEDDING_OUTPUT = [ + "ml_generate_embedding_result", + "ml_generate_embedding_statistics", + "ml_generate_embedding_status", + "content", +] def skip_legacy_pandas(test): diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 60dcc75b63..7643f76e56 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -11,11 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock +from google.cloud import bigquery +import pytest import sklearn.compose as sklearn_compose # type: ignore import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing +from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer +from bigframes.ml.core import BqmlModel +import bigframes.pandas as bpd def test_columntransformer_init_expectedtransforms(): @@ -173,3 +179,403 @@ def test_columntransformer_repr_matches_sklearn(): ) assert bf_column_transformer.__repr__() == sk_column_transformer.__repr__() + + +@pytest.fixture(scope="session") +def mock_X(): + mock_df = mock.create_autospec(spec=bpd.DataFrame) + return mock_df + + +def test_columntransformer_init_with_sqltransformers(): + ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") + len1_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + len2_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) + label_transformer = preprocessing.LabelEncoder() + column_transformer = compose.ColumnTransformer( + [ + ( + "ident_trafo", + ident_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), + ("len1_trafo", len1_transformer, ["species"]), + ("len2_trafo", len2_transformer, 
["species"]), + ("label", label_transformer, "species"), + ] + ) + + assert column_transformer.transformers_ == [ + ("ident_trafo", ident_transformer, "culmen_length_mm"), + ("ident_trafo", ident_transformer, "flipper_length_mm"), + ("len1_trafo", len1_transformer, "species"), + ("len2_trafo", len2_transformer, "species"), + ("label", label_transformer, "species"), + ] + + +def test_columntransformer_repr_sqltransformers(): + ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") + len1_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + len2_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) + label_transformer = preprocessing.LabelEncoder() + column_transformer = compose.ColumnTransformer( + [ + ( + "ident_trafo", + ident_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), + ("len1_trafo", len1_transformer, ["species"]), + ("len2_trafo", len2_transformer, ["species"]), + ("label", label_transformer, "species"), + ] + ) + + expected = """ColumnTransformer(transformers=[('ident_trafo', + SQLScalarColumnTransformer(sql='{0}', target_column='ident_{0}'), + ['culmen_length_mm', 'flipper_length_mm']), + ('len1_trafo', + SQLScalarColumnTransformer(sql='CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END', target_column='len1_{0}'), + ['species']), + ('len2_trafo', + SQLScalarColumnTransformer(sql='CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END', target_column='len2_{0}'), + ['species']), + ('label', LabelEncoder(), 'species')])""" + actual = column_transformer.__repr__() + assert expected == actual + + +def test_customtransformer_compile_sql(mock_X): + ident_trafo = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") + sqls = ident_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) + assert sqls == [ + "col1 AS ident_col1", + "col2 AS ident_col2", + ] + + 
len1_trafo = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -5 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + sqls = len1_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) + assert sqls == [ + "CASE WHEN col1 IS NULL THEN -5 ELSE LENGTH(col1) END AS len1_col1", + "CASE WHEN col2 IS NULL THEN -5 ELSE LENGTH(col2) END AS len1_col2", + ] + + len2_trafo = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) + sqls = len2_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) + assert sqls == [ + "CASE WHEN col1 IS NULL THEN 99 ELSE LENGTH(col1) END AS len2_col1", + "CASE WHEN col2 IS NULL THEN 99 ELSE LENGTH(col2) END AS len2_col2", + ] + + +def create_bq_model_mock(mocker, transform_columns, feature_columns=None): + properties = {"transformColumns": transform_columns} + mock_bq_model = bigquery.Model("model_project.model_dataset.model_id") + type(mock_bq_model)._properties = mock.PropertyMock(return_value=properties) + if feature_columns: + result = [ + bigquery.standard_sql.StandardSqlField(col, None) for col in feature_columns + ] + mocker.patch( + "google.cloud.bigquery.model.Model.feature_columns", + new_callable=mock.PropertyMock(return_value=result), + ) + + return mock_bq_model + + +@pytest.fixture +def bq_model_good(mocker): + return create_bq_model_mock( + mocker, + [ + { + "name": "ident_culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "culmen_length_mm /*CT.IDENT()*/", + }, + { + "name": "ident_flipper_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "flipper_length_mm /*CT.IDENT()*/", + }, + { + "name": "len1_species", + "type": {"typeKind": "INT64"}, + "transformSql": "CASE WHEN species IS NULL THEN -5 ELSE LENGTH(species) END /*CT.LEN1()*/", + }, + { + "name": "len2_species", + "type": {"typeKind": "INT64"}, + "transformSql": "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/", + }, + { + "name": 
"labelencoded_county", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", + }, + { + "name": "labelencoded_species", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(species, 1000000, 0) OVER()", + }, + ], + ) + + +@pytest.fixture +def bq_model_merge(mocker): + return create_bq_model_mock( + mocker, + [ + { + "name": "labelencoded_county", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", + }, + { + "name": "labelencoded_species", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(species, 1000000, 0) OVER()", + }, + ], + ["county", "species"], + ) + + +@pytest.fixture +def bq_model_no_merge(mocker): + return create_bq_model_mock( + mocker, + [ + { + "name": "ident_culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "culmen_length_mm /*CT.IDENT()*/", + } + ], + ["culmen_length_mm"], + ) + + +@pytest.fixture +def bq_model_unknown_ML(mocker): + return create_bq_model_mock( + mocker, + [ + { + "name": "unknownml_culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.UNKNOWN(culmen_length_mm)", + }, + { + "name": "labelencoded_county", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", + }, + ], + ) + + +@pytest.fixture +def bq_model_flexnames(mocker): + return create_bq_model_mock( + mocker, + [ + { + "name": "Flex Name culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "culmen_length_mm", + }, + { + "name": "transformed_Culmen Length MM", + "type": {"typeKind": "INT64"}, + "transformSql": "`Culmen Length MM`*/", + }, + # test workaround for bug in get_model + { + "name": "Flex Name flipper_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "flipper_length_mm AS `Flex Name flipper_length_mm`", + }, + { + "name": "transformed_Flipper Length MM", + "type": {"typeKind": "INT64"}, + "transformSql": "`Flipper Length MM` AS 
`transformed_Flipper Length MM`*/", + }, + ], + ) + + +def test_columntransformer_extract_from_bq_model_good(bq_model_good): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) + assert len(col_trans.transformers) == 6 + # normalize the representation for string comparing + col_trans.transformers.sort(key=lambda trafo: str(trafo)) + actual = col_trans.__repr__() + expected = """ColumnTransformer(transformers=[('label_encoder', + LabelEncoder(max_categories=1000001, + min_frequency=0), + 'county'), + ('label_encoder', + LabelEncoder(max_categories=1000001, + min_frequency=0), + 'species'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='CASE WHEN species IS NULL THEN -5 ELSE LENGTH(species) END /*CT.LEN1()*/', target_column='len1_species'), + '?len1_species'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/', target_column='len2_species'), + '?len2_species'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='culmen_length_mm /*CT.IDENT()*/', target_column='ident_culmen_length_mm'), + '?ident_culmen_length_mm'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='flipper_length_mm /*CT.IDENT()*/', target_column='ident_flipper_length_mm'), + '?ident_flipper_length_mm')])""" + assert expected == actual + + +def test_columntransformer_extract_from_bq_model_merge(bq_model_merge): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_merge) + assert isinstance(col_trans, ColumnTransformer) + merged_col_trans = col_trans._merge(bq_model_merge) + assert isinstance(merged_col_trans, preprocessing.LabelEncoder) + assert ( + merged_col_trans.__repr__() + == """LabelEncoder(max_categories=1000001, min_frequency=0)""" + ) + assert merged_col_trans._output_names == [ + "labelencoded_county", + "labelencoded_species", + ] + + +def 
test_columntransformer_extract_from_bq_model_no_merge(bq_model_no_merge): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_no_merge) + merged_col_trans = col_trans._merge(bq_model_no_merge) + assert isinstance(merged_col_trans, ColumnTransformer) + expected = """ColumnTransformer(transformers=[('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='culmen_length_mm /*CT.IDENT()*/', target_column='ident_culmen_length_mm'), + '?ident_culmen_length_mm')])""" + actual = merged_col_trans.__repr__() + assert expected == actual + + +def test_columntransformer_extract_from_bq_model_unknown_ML(bq_model_unknown_ML): + try: + _ = ColumnTransformer._extract_from_bq_model(bq_model_unknown_ML) + assert False + except NotImplementedError as e: + assert "Unsupported transformer type" in e.args[0] + + +def test_columntransformer_extract_output_names(bq_model_good): + class BQMLModel(BqmlModel): + def __init__(self, bq_model): + self._model = bq_model + + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) + col_trans._bqml_model = BQMLModel(bq_model_good) + col_trans._extract_output_names() + assert col_trans._output_names == [ + "ident_culmen_length_mm", + "ident_flipper_length_mm", + "len1_species", + "len2_species", + "labelencoded_county", + "labelencoded_species", + ] + + +def test_columntransformer_compile_to_sql(mock_X): + ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") + len1_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + len2_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) + label_transformer = preprocessing.LabelEncoder() + column_transformer = compose.ColumnTransformer( + [ + ( + "ident_trafo", + ident_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), + ("len1_trafo", len1_transformer, ["species"]), + ("len2_trafo", 
len2_transformer, ["species"]), + ("label", label_transformer, "species"), + ] + ) + sqls = column_transformer._compile_to_sql(mock_X) + assert sqls == [ + "culmen_length_mm AS ident_culmen_length_mm", + "flipper_length_mm AS ident_flipper_length_mm", + "CASE WHEN species IS NULL THEN -2 ELSE LENGTH(species) END AS len1_species", + "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", + "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", + ] + + +def test_columntransformer_flexible_column_names(mock_X): + ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident {0}") + len1_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + len2_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) + column_transformer = compose.ColumnTransformer( + [ + ( + "ident_trafo", + ident_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), + ("len1_trafo", len1_transformer, ["species shortname"]), + ("len2_trafo", len2_transformer, ["`species longname`"]), + ] + ) + sqls = column_transformer._compile_to_sql(mock_X) + assert sqls == [ + "culmen_length_mm AS `ident culmen_length_mm`", + "flipper_length_mm AS `ident flipper_length_mm`", + "CASE WHEN `species shortname` IS NULL THEN -2 ELSE LENGTH(`species shortname`) END AS `len1_species shortname`", + "CASE WHEN `species longname` IS NULL THEN 99 ELSE LENGTH(`species longname`) END AS `len2_species longname`", + ] + + +def test_columntransformer_extract_from_bq_model_flexnames(bq_model_flexnames): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_flexnames) + assert len(col_trans.transformers) == 4 + # normalize the representation for string comparing + col_trans.transformers.sort(key=lambda trafo: str(trafo)) + actual = col_trans.__repr__() + expected = 
"""ColumnTransformer(transformers=[('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='`Culmen Length MM`*/', target_column='transformed_Culmen Length MM'), + '?transformed_Culmen Length MM'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='`Flipper Length MM` AS `transformed_Flipper Length MM`*/', target_column='transformed_Flipper Length MM'), + '?transformed_Flipper Length MM'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='culmen_length_mm', target_column='Flex Name culmen_length_mm'), + '?Flex Name culmen_length_mm'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='flipper_length_mm ', target_column='Flex Name flipper_length_mm'), + '?Flex Name flipper_length_mm')])""" + assert expected == actual diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 5dda345fcb..60f4942175 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.19.0" +__version__ = "1.20.0" pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy