diff --git a/.kokoro/continuous/doctest.cfg b/.kokoro/continuous/doctest.cfg index dfdc78782f..dca21d43fd 100644 --- a/.kokoro/continuous/doctest.cfg +++ b/.kokoro/continuous/doctest.cfg @@ -8,10 +8,5 @@ env_vars: { env_vars: { key: "GOOGLE_CLOUD_PROJECT" - value: "bigframes-load-testing" -} - -env_vars: { - key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" - value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" + value: "bigframes-testing" } diff --git a/.kokoro/continuous/notebook.cfg b/.kokoro/continuous/notebook.cfg index ca3d98b58b..c14297019a 100644 --- a/.kokoro/continuous/notebook.cfg +++ b/.kokoro/continuous/notebook.cfg @@ -13,10 +13,5 @@ env_vars: { env_vars: { key: "GOOGLE_CLOUD_PROJECT" - value: "bigframes-load-testing" -} - -env_vars: { - key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" - value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" + value: "bigframes-testing" } diff --git a/.kokoro/presubmit/doctest.cfg b/.kokoro/presubmit/doctest.cfg index dfdc78782f..dca21d43fd 100644 --- a/.kokoro/presubmit/doctest.cfg +++ b/.kokoro/presubmit/doctest.cfg @@ -8,10 +8,5 @@ env_vars: { env_vars: { key: "GOOGLE_CLOUD_PROJECT" - value: "bigframes-load-testing" -} - -env_vars: { - key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" - value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" + value: "bigframes-testing" } diff --git a/.kokoro/presubmit/notebook.cfg b/.kokoro/presubmit/notebook.cfg index 94e2a3c686..cc73c3bea4 100644 --- a/.kokoro/presubmit/notebook.cfg +++ b/.kokoro/presubmit/notebook.cfg @@ -8,10 +8,5 @@ env_vars: { env_vars: { key: "GOOGLE_CLOUD_PROJECT" - value: "bigframes-load-testing" -} - -env_vars: { - key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" - value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" + value: "bigframes-testing" } diff --git a/CHANGELOG.md b/CHANGELOG.md index a0539af01e..c398f17d43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,27 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.21.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.20.0...v1.21.0) (2024-10-02) + + +### Features + +* Add deprecation warning to PaLM2TextGenerator model ([#1035](https://github.com/googleapis/python-bigquery-dataframes/issues/1035)) ([1183b0f](https://github.com/googleapis/python-bigquery-dataframes/commit/1183b0fb2be7af7386e4bd0d0d1312433db60454)) +* Add DeprecationWarning for PaLM2TextEmbeddingGenerator ([#1018](https://github.com/googleapis/python-bigquery-dataframes/issues/1018)) ([4af5bbb](https://github.com/googleapis/python-bigquery-dataframes/commit/4af5bbb9e42fdb0add17308475c7881d7035fbfd)) +* Add ml.model_selection.cross_validate support ([#1020](https://github.com/googleapis/python-bigquery-dataframes/issues/1020)) ([1a38063](https://github.com/googleapis/python-bigquery-dataframes/commit/1a380631f793f82637cd384601956ee4457dc58a)) +* Allow access of struct fields with dot operators on `Series` ([#1019](https://github.com/googleapis/python-bigquery-dataframes/issues/1019)) ([ef76f13](https://github.com/googleapis/python-bigquery-dataframes/commit/ef76f137fbbf9e8f8c5a63023554d22059ab4fbd)) + + +### Bug Fixes + +* Ensure no double execution for to_pandas ([#1032](https://github.com/googleapis/python-bigquery-dataframes/issues/1032)) 
([4992cc2](https://github.com/googleapis/python-bigquery-dataframes/commit/4992cc27e46bc2b0a908c7d521785989735186f4)) +* Remove pre-caching of remote function results ([#1028](https://github.com/googleapis/python-bigquery-dataframes/issues/1028)) ([0359bc8](https://github.com/googleapis/python-bigquery-dataframes/commit/0359bc85839c37b5cd10c0c418b275ac0dc29c4a)) + + +### Documentation + +* Add ml cross-validation notebook ([#1037](https://github.com/googleapis/python-bigquery-dataframes/issues/1037)) ([057f3f0](https://github.com/googleapis/python-bigquery-dataframes/commit/057f3f0d694ddffe8745443a85b4fb43081893bb)) + ## [1.20.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.19.0...v1.20.0) (2024-09-25) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 8ad00a46c6..acab99f249 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -29,6 +29,7 @@ import bigframes.core.compile import bigframes.core.expression as ex import bigframes.core.guid +import bigframes.core.identifiers as ids import bigframes.core.join_def as join_def import bigframes.core.local_data as local_data import bigframes.core.nodes as nodes @@ -66,14 +67,33 @@ def from_pyarrow(cls, arrow_table: pa.Table, session: Session): iobytes = io.BytesIO() pa_feather.write_feather(adapted_table, iobytes) + # Scan all columns by default, we define this list as it can be pruned while preserving source_def + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) + for item in schema.items + ) + ) + node = nodes.ReadLocalNode( iobytes.getvalue(), data_schema=schema, session=session, n_rows=arrow_table.num_rows, + scan_list=scan_list, ) return cls(node) + @classmethod + def from_range(cls, start, end, step): + return cls( + nodes.FromRangeNode( + start=start.node, + end=end.node, + step=step, + ) + ) + @classmethod def from_table( cls, @@ -93,14 +113,30 @@ def from_table( "Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.", bigframes.exceptions.PreviewWarning, ) + # define data source only for needed columns, this makes row-hashing cheaper + table_def = nodes.GbqTable.from_table(table, columns=schema.names) + + # create ordering from info + ordering = None + if offsets_col: + ordering = orderings.TotalOrdering.from_offset_col(offsets_col) + elif primary_key: + ordering = orderings.TotalOrdering.from_primary_key(primary_key) + + # Scan all columns by default, we define this list as it can be pruned while preserving source_def + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) + for item in schema.items + ) + ) + source_def = nodes.BigqueryDataSource( + table=table_def, at_time=at_time, sql_predicate=predicate, ordering=ordering + ) node = nodes.ReadTableNode( - table=nodes.GbqTable.from_table(table), - total_order_cols=(offsets_col,) if offsets_col else tuple(primary_key), - order_col_is_sequential=(offsets_col is not None), - columns=schema, - at_time=at_time, + source=source_def, + scan_list=scan_list, table_session=session, - sql_predicate=predicate, ) return cls(node) @@ -146,12 +182,22 @@ def as_cached( ordering: Optional[orderings.RowOrdering], ) -> ArrayValue: """ - Replace the node with an equivalent one that references a tabel where the value has been materialized to. + Replace the node with an equivalent one that references a table where the value has been materialized to. 
""" + table = nodes.GbqTable.from_table(cache_table) + source = nodes.BigqueryDataSource(table, ordering=ordering) + # Assumption: GBQ cached table uses field name as bq column name + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(field.id, field.dtype, field.id.name) + for field in self.node.fields + ) + ) node = nodes.CachedTableNode( original_node=self.node, - table=nodes.GbqTable.from_table(cache_table), - ordering=ordering, + source=source, + table_session=self.session, + scan_list=scan_list, ) return ArrayValue(node) @@ -169,7 +215,7 @@ def row_count(self) -> ArrayValue: # Operations def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - predicate: ex.Expression = ex.free_var(predicate_id) + predicate: ex.Expression = ex.deref(predicate_id) if keep_null: predicate = ops.fillna_op.as_expr(predicate, ex.const(True)) return self.filter(predicate) @@ -200,7 +246,9 @@ def promote_offsets(self) -> Tuple[ArrayValue, str]: ) return ( - ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)), + ArrayValue( + nodes.PromoteOffsetsNode(child=self.node, col_id=ids.ColumnId(col_id)) + ), col_id, ) @@ -212,7 +260,9 @@ def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: def compute_values(self, assignments: Sequence[ex.Expression]): col_ids = self._gen_namespaced_uids(len(assignments)) - ex_id_pairs = tuple((ex, id) for ex, id in zip(assignments, col_ids)) + ex_id_pairs = tuple( + (ex, ids.ColumnId(id)) for ex, id in zip(assignments, col_ids) + ) return ( ArrayValue(nodes.ProjectionNode(child=self.node, assignments=ex_id_pairs)), col_ids, @@ -228,14 +278,19 @@ def assign(self, source_id: str, destination_id: str) -> ArrayValue: if destination_id in self.column_ids: # Mutate case exprs = [ ( - (source_id if (col_id == destination_id) else col_id), - col_id, + ex.deref(source_id if (col_id == destination_id) else col_id), + ids.ColumnId(col_id), ) for col_id in self.column_ids ] else: # append case - self_projection = ((col_id, col_id) for col_id in self.column_ids) - exprs = [*self_projection, (source_id, destination_id)] + self_projection = ( + (ex.deref(col_id), ids.ColumnId(col_id)) for col_id in self.column_ids + ) + exprs = [ + *self_projection, + (ex.deref(source_id), ids.ColumnId(destination_id)), + ] return ArrayValue( nodes.SelectionNode( child=self.node, @@ -248,24 +303,15 @@ def create_constant( value: typing.Any, dtype: typing.Optional[bigframes.dtypes.Dtype], ) -> Tuple[ArrayValue, str]: - destination_id = self._gen_namespaced_uid() if pandas.isna(value): # Need to assign a data type when value is NaN. 
dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE - return ( - ArrayValue( - nodes.ProjectionNode( - child=self.node, - assignments=((ex.const(value, dtype), destination_id),), - ) - ), - destination_id, - ) + return self.project_to_id(ex.const(value, dtype)) def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ((col_id, col_id) for col_id in column_ids) + selections = ((ex.deref(col_id), ids.ColumnId(col_id)) for col_id in column_ids) return ArrayValue( nodes.SelectionNode( child=self.node, @@ -274,14 +320,8 @@ def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: ) def drop_columns(self, columns: Iterable[str]) -> ArrayValue: - new_projection = ( - (col_id, col_id) for col_id in self.column_ids if col_id not in columns - ) - return ArrayValue( - nodes.SelectionNode( - child=self.node, - input_output_pairs=tuple(new_projection), - ) + return self.select_columns( + [col_id for col_id in self.column_ids if col_id not in columns] ) def aggregate( @@ -297,11 +337,12 @@ def aggregate( by_column_id: column id of the aggregation key, this is preserved through the transform dropna: whether null keys should be dropped """ + agg_defs = tuple((agg, ids.ColumnId(name)) for agg, name in aggregations) return ArrayValue( nodes.AggregateNode( child=self.node, - aggregations=tuple(aggregations), - by_column_ids=tuple(by_column_ids), + aggregations=agg_defs, + by_column_ids=tuple(map(ex.deref, by_column_ids)), dropna=dropna, ) ) @@ -342,10 +383,10 @@ def project_window_op( ArrayValue( nodes.WindowOpNode( child=self.node, - column_name=column_name, + column_name=ex.deref(column_name), op=op, window_spec=window_spec, - output_name=output_name, + output_name=ids.ColumnId(output_name), never_skip_nulls=never_skip_nulls, skip_reproject_unsafe=skip_reproject_unsafe, ) @@ -373,26 +414,34 @@ def relational_join( conditions: typing.Tuple[typing.Tuple[str, str], ...] 
= (), type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner", ) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]: + l_mapping = { # Identity mapping, only rename right side + lcol.name: lcol.name for lcol in self.node.ids + } + r_mapping = { # Rename conflicting names + rcol.name: rcol.name + if (rcol.name not in l_mapping) + else bigframes.core.guid.generate_guid() + for rcol in other.node.ids + } + other_node = other.node + if set(other_node.ids) & set(self.node.ids): + other_node = nodes.SelectionNode( + other_node, + tuple( + (ex.deref(old_id), ids.ColumnId(new_id)) + for old_id, new_id in r_mapping.items() + ), + ) + join_node = nodes.JoinNode( left_child=self.node, - right_child=other.node, - conditions=conditions, + right_child=other_node, + conditions=tuple( + (ex.deref(l_mapping[l_col]), ex.deref(r_mapping[r_col])) + for l_col, r_col in conditions + ), type=type, ) - # Maps input ids to output ids for caller convenience - l_size = len(self.node.schema) - l_mapping = { - lcol: ocol - for lcol, ocol in zip( - self.node.schema.names, join_node.schema.names[:l_size] - ) - } - r_mapping = { - rcol: ocol - for rcol, ocol in zip( - other.node.schema.names, join_node.schema.names[l_size:] - ) - } return ArrayValue(join_node), (l_mapping, r_mapping) def try_align_as_projection( @@ -414,7 +463,7 @@ def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: for column_id in column_ids: assert bigframes.dtypes.is_array_like(self.get_column_type(column_id)) - offsets = tuple(self.get_offset_for_name(id) for id in column_ids) + offsets = tuple(ex.deref(id) for id in column_ids) return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets)) def _uniform_sampling(self, fraction: float) -> ArrayValue: @@ -425,9 +474,6 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue: """ return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) - def get_offset_for_name(self, name: str): - return self.schema.names.index(name) - # Deterministically generate namespaced ids for new variables # These new ids are only unique within the current namespace. # Many operations, such as joins, create new namespaces. 
See: BigFrameNode.defines_namespace diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 398c6ab26a..2c4991b629 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -210,7 +210,7 @@ def _interpolate_column( ) -> typing.Tuple[blocks.Block, str]: if interpolate_method not in ["linear", "nearest", "ffill"]: raise ValueError("interpolate method not supported") - window_ordering = (ordering.OrderingExpression(ex.free_var(x_values)),) + window_ordering = (ordering.OrderingExpression(ex.deref(x_values)),) backwards_window = windows.rows(following=0, ordering=window_ordering) forwards_window = windows.rows(preceding=0, ordering=window_ordering) @@ -373,7 +373,7 @@ def value_counts( block = block.order_by( [ ordering.OrderingExpression( - ex.free_var(count_id), + ex.deref(count_id), direction=ordering.OrderingDirection.ASC if ascending else ordering.OrderingDirection.DESC, @@ -430,7 +430,7 @@ def rank( nullity_col_ids.append(nullity_col_id) window_ordering = ( ordering.OrderingExpression( - ex.free_var(col), + ex.deref(col), ordering.OrderingDirection.ASC if ascending else ordering.OrderingDirection.DESC, @@ -522,7 +522,7 @@ def nsmallest( block = block.reversed() order_refs = [ ordering.OrderingExpression( - ex.free_var(col_id), direction=ordering.OrderingDirection.ASC + ex.deref(col_id), direction=ordering.OrderingDirection.ASC ) for col_id in column_ids ] @@ -552,7 +552,7 @@ def nlargest( block = block.reversed() order_refs = [ ordering.OrderingExpression( - ex.free_var(col_id), direction=ordering.OrderingDirection.DESC + ex.deref(col_id), direction=ordering.OrderingDirection.DESC ) for col_id in column_ids ] @@ -849,9 +849,9 @@ def _idx_extrema( ) # Have to find the min for each order_refs = [ - ordering.OrderingExpression(ex.free_var(value_col), direction), + ordering.OrderingExpression(ex.deref(value_col), direction), *[ - ordering.OrderingExpression(ex.free_var(idx_col)) + ordering.OrderingExpression(ex.deref(idx_col)) for idx_col in original_block.index_columns ], ] diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 7f6f5f1cc9..9e245399cd 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -23,17 +23,28 @@ import ast import dataclasses +import datetime import functools import itertools -import os import random import textwrap import typing -from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import ( + Iterable, + List, + Literal, + Mapping, + Optional, + Sequence, + Tuple, + TYPE_CHECKING, + Union, +) import warnings import bigframes_vendored.constants as constants import google.cloud.bigquery as bigquery +import numpy import pandas as pd import pyarrow as pa @@ -56,7 +67,10 @@ import bigframes.features import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops -import bigframes.session._io.pandas +import bigframes.session._io.pandas as io_pandas + +if TYPE_CHECKING: + import bigframes.session.executor # Type constraint for wherever column labels are used Label = typing.Hashable @@ -450,46 +464,14 @@ def reorder_levels(self, ids: typing.Sequence[str]): level_names = [self.col_id_to_index_name[index_id] for index_id in ids] return Block(self.expr, ids, self.column_labels, level_names) - def _to_dataframe(self, result) -> pd.DataFrame: - """Convert BigQuery data to pandas DataFrame with specific dtypes.""" - result_dataframe = self.session._rows_to_dataframe(result) - # Runs strict validations to ensure 
internal type predictions and ibis are completely in sync - # Do not execute these validations outside of testing suite. - if "PYTEST_CURRENT_TEST" in os.environ: - self._validate_result_schema(result.schema) - return result_dataframe - - def _validate_result_schema( - self, bq_result_schema: list[bigquery.schema.SchemaField] - ): - actual_schema = tuple(bq_result_schema) - ibis_schema = self.expr._compiled_schema - internal_schema = self.expr.schema - if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: - return - if internal_schema.to_bigquery() != actual_schema: - raise ValueError( - f"This error should only occur while testing. BigFrames internal schema: {internal_schema.to_bigquery()} does not match actual schema: {actual_schema}" - ) - if ibis_schema.to_bigquery() != actual_schema: - raise ValueError( - f"This error should only occur while testing. Ibis schema: {ibis_schema.to_bigquery()} does not match actual schema: {actual_schema}" - ) - def to_arrow( self, *, ordered: bool = True, ) -> Tuple[pa.Table, bigquery.QueryJob]: """Run query and download results as a pyarrow Table.""" - # pa.Table.from_pandas puts index columns last, so update the expression to match. - expr = self.expr.select_columns( - list(self.value_columns) + list(self.index_columns) - ) - - _, query_job = self.session._execute(expr, ordered=ordered) - results_iterator = query_job.result() - pa_table = results_iterator.to_arrow() + execute_result = self.session._executor.execute(self.expr, ordered=ordered) + pa_table = execute_result.to_arrow_table() pa_index_labels = [] for index_level, index_label in enumerate(self._index_labels): @@ -498,8 +480,10 @@ def to_arrow( else: pa_index_labels.append(f"__index_level_{index_level}__") + # pa.Table.from_pandas puts index columns last, so update to match. + pa_table = pa_table.select([*self.value_columns, *self.index_columns]) pa_table = pa_table.rename_columns(list(self.column_labels) + pa_index_labels) - return pa_table, query_job + return pa_table, execute_result.query_job def to_pandas( self, @@ -508,7 +492,7 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, - ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: + ) -> Tuple[pd.DataFrame, Optional[bigquery.QueryJob]]: """Run query and download results as a pandas DataFrame. 
Args: @@ -560,8 +544,8 @@ def try_peek( self, n: int = 20, force: bool = False ) -> typing.Optional[pd.DataFrame]: if force or self.expr.supports_fast_peek: - iterator, _ = self.session._peek(self.expr, n) - df = self._to_dataframe(iterator) + result = self.session._executor.peek(self.expr, n) + df = io_pandas.arrow_to_pandas(result.to_arrow_table(), self.expr.schema) self._copy_index_to_pandas(df) return df else: @@ -574,18 +558,15 @@ def to_pandas_batches( page_size and max_results determine the size and number of batches, see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result""" - dtypes = dict(zip(self.index_columns, self.index.dtypes)) - dtypes.update(zip(self.value_columns, self.dtypes)) - _, query_job = self.session._executor.execute( - self.expr, ordered=True, use_explicit_destination=True - ) - results_iterator = query_job.result( - page_size=page_size, max_results=max_results - ) - for arrow_table in results_iterator.to_arrow_iterable( - bqstorage_client=self.session.bqstoragereadclient - ): - df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + execute_result = self.session._executor.execute( + self.expr, + ordered=True, + use_explicit_destination=True, + page_size=page_size, + max_results=max_results, + ) + for record_batch in execute_result.arrow_batches(): + df = io_pandas.arrow_to_pandas(record_batch, self.expr.schema) self._copy_index_to_pandas(df) yield df @@ -605,22 +586,19 @@ def _copy_index_to_pandas(self, df: pd.DataFrame): def _materialize_local( self, materialize_options: MaterializationOptions = MaterializationOptions() - ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: + ) -> Tuple[pd.DataFrame, Optional[bigquery.QueryJob]]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. - _, query_job = self.session._execute( - self.expr, ordered=materialize_options.ordered - ) - results_iterator = query_job.result() - - table_size = ( - self.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + execute_result = self.session._executor.execute( + self.expr, ordered=materialize_options.ordered, get_size_bytes=True ) + assert execute_result.total_bytes is not None + table_mb = execute_result.total_bytes / _BYTES_TO_MEGABYTES sample_config = materialize_options.downsampling max_download_size = sample_config.max_download_size fraction = ( - max_download_size / table_size - if (max_download_size is not None) and (table_size != 0) + max_download_size / table_mb + if (max_download_size is not None) and (table_mb != 0) else 2 ) @@ -629,7 +607,7 @@ def _materialize_local( if fraction < 1: if not sample_config.enable_downsampling: raise RuntimeError( - f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of " + f"The data size ({table_mb:.2f} MB) exceeds the maximum download limit of " f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n" "\t\t`bigframes.options.sampling.enable_downsampling = True`\n" "\t* Update the global `max_download_size` option. Please make sure " @@ -640,12 +618,12 @@ def _materialize_local( ) warnings.warn( - f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of" + f"The data size ({table_mb:.2f} MB) exceeds the maximum download limit of" f"({max_download_size} MB). It will be downsampled to {max_download_size} MB for download." 
"\nPlease refer to the documentation for configuring the downloading limit.", UserWarning, ) - total_rows = results_iterator.total_rows + total_rows = execute_result.total_rows # Remove downsampling config from subsequent invocations, as otherwise could result in many # iterations if downsampling undershoots return self._downsample( @@ -657,11 +635,12 @@ def _materialize_local( MaterializationOptions(ordered=materialize_options.ordered) ) else: - total_rows = results_iterator.total_rows - df = self._to_dataframe(results_iterator) + total_rows = execute_result.total_rows + arrow = execute_result.to_arrow_table() + df = io_pandas.arrow_to_pandas(arrow, schema=self.expr.schema) self._copy_index_to_pandas(df) - return df, query_job + return df, execute_result.query_job def _downsample( self, total_rows: int, sampling_method: str, fraction: float, random_state @@ -680,7 +659,7 @@ def _downsample( ) return block elif sampling_method == _UNIFORM: - block = self._split( + block = self.split( fracs=(fraction,), random_state=random_state, sort=False, @@ -693,7 +672,7 @@ def _downsample( f"please choose from {','.join(_SAMPLING_METHODS)}." ) - def _split( + def split( self, ns: Iterable[int] = (), fracs: Iterable[float] = (), @@ -739,7 +718,7 @@ def _split( ) block, hash_string_sum_col = block.apply_unary_op(string_sum_col, ops.hash_op) block = block.order_by( - [ordering.OrderingExpression(ex.free_var(hash_string_sum_col))] + [ordering.OrderingExpression(ex.deref(hash_string_sum_col))] ) intervals = [] @@ -758,7 +737,7 @@ def _split( sliced_blocks = [ sliced_block.order_by( [ - ordering.OrderingExpression(ex.free_var(idx_col)) + ordering.OrderingExpression(ex.deref(idx_col)) for idx_col in sliced_block.index_columns ] ) @@ -767,7 +746,7 @@ def _split( elif sort is False: sliced_blocks = [ sliced_block.order_by( - [ordering.OrderingExpression(ex.free_var(ordering_col))] + [ordering.OrderingExpression(ex.deref(ordering_col))] ) for sliced_block in sliced_blocks ] @@ -785,7 +764,7 @@ def _compute_dry_run( self, value_keys: Optional[Iterable[str]] = None ) -> bigquery.QueryJob: expr = self._apply_value_keys_to_expr(value_keys=value_keys) - _, query_job = self.session._dry_run(expr) + query_job = self.session._executor.dry_run(expr) return query_job def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None): @@ -925,9 +904,9 @@ def multi_apply_unary_op( ) -> Block: if isinstance(op, ops.UnaryOp): input_varname = guid.generate_guid() - expr = op.as_expr(input_varname) + expr = op.as_expr(ex.free_var(input_varname)) else: - input_varnames = op.unbound_variables + input_varnames = op.free_variables assert len(input_varnames) == 1 expr = op input_varname = input_varnames[0] @@ -936,7 +915,7 @@ def multi_apply_unary_op( for col_id in columns: label = self.col_id_to_label[col_id] block, result_id = block.project_expr( - expr.bind_variables({input_varname: ex.free_var(col_id)}), + expr.bind_variables({input_varname: ex.deref(col_id)}), label=label, ) block = block.copy_values(result_id, col_id) @@ -966,7 +945,7 @@ def apply_window_op( block = self if skip_null_groups: for key in window_spec.grouping_keys: - block, not_null_id = block.apply_unary_op(key, ops.notnull_op) + block, not_null_id = block.apply_unary_op(key.id.name, ops.notnull_op) block = block.filter_by_id(not_null_id).drop_columns([not_null_id]) expr, result_id = block._expr.project_window_op( column, @@ -1050,7 +1029,7 @@ def aggregate_all_and_stack( if axis_n == 0: aggregations = [ ( - ex.UnaryAggregation(operation, 
ex.free_var(col_id)) + ex.UnaryAggregation(operation, ex.deref(col_id)) if isinstance(operation, agg_ops.UnaryAggregateOp) else ex.NullaryAggregation(operation), col_id, @@ -1081,7 +1060,10 @@ def aggregate_all_and_stack( index_cols = passthrough_cols[:-1] og_offset_col = passthrough_cols[-1] index_aggregations = [ - (ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.free_var(col_id)), col_id) + ( + ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col_id)), + col_id, + ) for col_id in index_cols ] # TODO: may need add NullaryAggregation in main_aggregation @@ -1090,7 +1072,7 @@ def aggregate_all_and_stack( operation, agg_ops.UnaryAggregateOp ), f"Expected a unary operation, but got {operation}. Please report this error and how you got here to the BigQuery DataFrames team (bit.ly/bigframes-feedback)." main_aggregation = ( - ex.UnaryAggregation(operation, ex.free_var(value_col_ids[0])), + ex.UnaryAggregation(operation, ex.deref(value_col_ids[0])), value_col_ids[0], ) # Drop row identity after aggregating over it @@ -1200,7 +1182,7 @@ def aggregate( """ agg_specs = [ ( - ex.UnaryAggregation(operation, ex.free_var(input_id)) + ex.UnaryAggregation(operation, ex.deref(input_id)) if isinstance(operation, agg_ops.UnaryAggregateOp) else ex.NullaryAggregation(operation), guid.generate_guid(), @@ -1258,7 +1240,7 @@ def get_stat( aggregations = [ ( - ex.UnaryAggregation(stat, ex.free_var(column_id)) + ex.UnaryAggregation(stat, ex.deref(column_id)) if isinstance(stat, agg_ops.UnaryAggregateOp) else ex.NullaryAggregation(stat), stat.name, @@ -1287,7 +1269,7 @@ def get_binary_stat( aggregations = [ ( ex.BinaryAggregation( - stat, ex.free_var(column_id_left), ex.free_var(column_id_right) + stat, ex.deref(column_id_left), ex.deref(column_id_right) ), f"{stat.name}_{column_id_left}{column_id_right}", ) @@ -1313,7 +1295,7 @@ def summarize( labels = pd.Index([stat.name for stat in stats]) aggregations = [ ( - ex.UnaryAggregation(stat, ex.free_var(col_id)) + ex.UnaryAggregation(stat, ex.deref(col_id)) if isinstance(stat, agg_ops.UnaryAggregateOp) else ex.NullaryAggregation(stat), f"{col_id}-{stat.name}", @@ -1350,7 +1332,7 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): aggregations = [ ( - ex.BinaryAggregation(op, ex.free_var(left_col), ex.free_var(right_col)), + ex.BinaryAggregation(op, ex.deref(left_col), ex.deref(right_col)), f"{left_col}-{right_col}", ) for left_col in self.value_columns @@ -1567,7 +1549,7 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1): @functools.cache def retrieve_repr_request_results( self, max_results: int - ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: + ) -> Tuple[pd.DataFrame, int, Optional[bigquery.QueryJob]]: """ Retrieves a pandas dataframe containing only max_results many rows for use with printing methods. @@ -1575,12 +1557,13 @@ def retrieve_repr_request_results( Returns a tuple of the dataframe and the overall number of rows of the query. 
""" - results, query_job = self.session._executor.head(self.expr, max_results) + head_result = self.session._executor.head(self.expr, max_results) count = self.session._executor.get_row_count(self.expr) - computed_df = self._to_dataframe(results) - self._copy_index_to_pandas(computed_df) - return computed_df, count, query_job + arrow = self.session._executor.execute(self.expr).to_arrow_table() + df = io_pandas.arrow_to_pandas(arrow, schema=self.expr.schema) + self._copy_index_to_pandas(df) + return df, count, head_result.query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: expr, result_id = self._expr.promote_offsets() @@ -1853,6 +1836,179 @@ def transpose( .with_transpose_cache(self) ) + def _generate_sequence( + self, + start, + stop, + step: int = 1, + ): + range_expr = self.expr.from_range( + start, + stop, + step, + ) + + return Block( + range_expr, + column_labels=["min"], + index_columns=[], + ) + + def _generate_resample_label( + self, + rule: str, + closed: Optional[Literal["right", "left"]] = None, + label: Optional[Literal["right", "left"]] = None, + on: Optional[Label] = None, + level: typing.Union[LevelType, typing.Sequence[LevelType]] = None, + origin: Union[ + Union[pd.Timestamp, datetime.datetime, numpy.datetime64, int, float, str], + Literal["epoch", "start", "start_day", "end", "end_day"], + ] = "start_day", + ) -> Block: + # Validate and resolve the index or column to use for grouping + if on is None: + if len(self.index_columns) == 0: + raise ValueError( + f"No index for resampling. Expected {bigframes.dtypes.DATETIME_DTYPE} or " + f"{bigframes.dtypes.TIMESTAMP_DTYPE} index or 'on' parameter specifying a column." + ) + if len(self.index_columns) > 1 and (level is None): + raise ValueError( + "Multiple indices are not supported for this operation" + " when 'level' is not set." + ) + level = level or 0 + col_id = self.index.resolve_level(level)[0] + # Reset index to make the resampling level a column, then drop all other index columns. + # This simplifies processing by focusing solely on the column required for resampling. + block = self.reset_index(drop=False) + block = block.drop_columns( + [col for col in self.index.column_ids if col != col_id] + ) + elif level is not None: + raise ValueError("The Grouper cannot specify both a key and a level!") + else: + matches = self.label_to_col_id.get(on, []) + if len(matches) > 1: + raise ValueError( + f"Multiple columns matching id {on} were found. {constants.FEEDBACK_LINK}" + ) + if len(matches) == 0: + raise KeyError(f"The grouper name {on} is not found") + + col_id = matches[0] + block = self + if level is None: + dtype = self._column_type(col_id) + elif isinstance(level, int): + dtype = self.index.dtypes[level] + else: + dtype = self.index.dtypes[self.index.names.index(level)] + + if dtype not in ( + bigframes.dtypes.DATETIME_DTYPE, + bigframes.dtypes.TIMESTAMP_DTYPE, + ): + raise TypeError( + f"Invalid column type: {dtype}. Expected types are " + f"{bigframes.dtypes.DATETIME_DTYPE}, or " + f"{bigframes.dtypes.TIMESTAMP_DTYPE}." + ) + + freq = pd.tseries.frequencies.to_offset(rule) + assert freq is not None + + if origin not in ("epoch", "start", "start_day"): + raise ValueError( + "'origin' should be equal to 'epoch', 'start' or 'start_day'" + f". Got '{origin}' instead." 
+ ) + + agg_specs = [ + ( + ex.UnaryAggregation(agg_ops.min_op, ex.deref(col_id)), + guid.generate_guid(), + ), + ] + origin_block = Block( + block.expr.aggregate(agg_specs, dropna=True), + column_labels=["origin"], + index_columns=[], + ) + + col_level = block.value_columns.index(col_id) + + block = block.merge( + origin_block, how="cross", left_join_ids=[], right_join_ids=[], sort=True + ) + + # After merging, the original column ids are altered. 'col_level' is the index of + # the datetime column used for resampling. 'block.value_columns[-1]' is the + # 'origin' column, which is the minimum datetime value. + block, label_col_id = block.apply_binary_op( + block.value_columns[col_level], + block.value_columns[-1], + op=ops.DatetimeToIntegerLabelOp(freq=freq, closed=closed, origin=origin), + ) + block = block.drop_columns([block.value_columns[-2]]) + + # Generate integer label sequence. + min_agg_specs = [ + ( + ex.UnaryAggregation(agg_ops.min_op, ex.deref(label_col_id)), + guid.generate_guid(), + ), + ] + max_agg_specs = [ + ( + ex.UnaryAggregation(agg_ops.max_op, ex.deref(label_col_id)), + guid.generate_guid(), + ), + ] + label_start = block.expr.aggregate(min_agg_specs, dropna=True) + label_stop = block.expr.aggregate(max_agg_specs, dropna=True) + + label_block = block._generate_sequence( + start=label_start, + stop=label_stop, + ) + + label_block = label_block.merge( + origin_block, how="cross", left_join_ids=[], right_join_ids=[], sort=True + ) + + block = label_block.merge( + block, + how="left", + left_join_ids=[label_block.value_columns[0]], + right_join_ids=[label_col_id], + sort=True, + ) + + block, resample_label_id = block.apply_binary_op( + block.value_columns[0], + block.value_columns[1], + op=ops.IntegerLabelToDatetimeOp(freq=freq, label=label, origin=origin), + ) + + # After multiple merges, the columns: + # - block.value_columns[0] is the integer label sequence, + # - block.value_columns[1] is the origin column (minimum datetime value), + # - col_level+2 represents the datetime column used for resampling, + # - block.value_columns[-2] is the integer label column derived from the datetime column. + # These columns are no longer needed. 
+ block = block.drop_columns( + [ + block.value_columns[0], + block.value_columns[1], + block.value_columns[col_level + 2], + block.value_columns[-2], + ] + ) + + return block.set_index([resample_label_id]) + def _create_stack_column(self, col_label: typing.Tuple, stack_labels: pd.Index): dtype = None input_columns: list[Optional[str]] = [] @@ -2031,7 +2187,7 @@ def merge( # sort uses coalesced join keys always joined_expr = joined_expr.order_by( [ - ordering.OrderingExpression(ex.free_var(col_id)) + ordering.OrderingExpression(ex.deref(col_id)) for col_id in coalesced_ids ], ) @@ -2084,12 +2240,12 @@ def _align_both_axes( ) left_input_lookup = ( - lambda index: ex.free_var(get_column_left[self.value_columns[index]]) + lambda index: ex.deref(get_column_left[self.value_columns[index]]) if index != -1 else ex.const(None) ) righ_input_lookup = ( - lambda index: ex.free_var(get_column_right[other.value_columns[index]]) + lambda index: ex.deref(get_column_right[other.value_columns[index]]) if index != -1 else ex.const(None) ) @@ -2107,8 +2263,8 @@ def _align_axis_0( series_column_id = other.value_columns[0] inputs = tuple( ( - ex.free_var(get_column_left[col]), - ex.free_var(get_column_right[series_column_id]), + ex.deref(get_column_left[col]), + ex.deref(get_column_right[series_column_id]), ) for col in self.value_columns ) @@ -2143,12 +2299,12 @@ def _align_series_block_axis_1( ) left_input_lookup = ( - lambda index: ex.free_var(get_column_left[self.value_columns[index]]) + lambda index: ex.deref(get_column_left[self.value_columns[index]]) if index != -1 else ex.const(None) ) righ_input_lookup = ( - lambda index: ex.free_var( + lambda index: ex.deref( get_column_right[other.transpose().value_columns[index]] ) if index != -1 @@ -2178,7 +2334,7 @@ def _align_pd_series_axis_1( ) left_input_lookup = ( - lambda index: ex.free_var(self.value_columns[index]) + lambda index: ex.deref(self.value_columns[index]) if index != -1 else ex.const(None) ) @@ -2330,7 +2486,10 @@ def to_sql_query( # the BigQuery unicode column name feature? substitutions[old_id] = new_id - sql = self.session._to_sql( + # Note: this uses the sql from the executor, so is coupled tightly to execution + # implementaton. It will reference cached tables instead of original data sources. + # Maybe should just compile raw BFET? Depends on user intent. + sql = self.session._executor.to_sql( array_value, col_id_overrides=substitutions, enable_cache=enable_cache ) return ( @@ -2424,7 +2583,7 @@ def _get_rows_as_json_values(self) -> Block: # TODO(shobs): Replace direct SQL manipulation by structured expression # manipulation expr, ordering_column_name = self.expr.promote_offsets() - expr_sql = self.session._to_sql(expr) + expr_sql = self.session._executor.to_sql(expr) # Names of the columns to serialize for the row. # We will use the repr-eval pattern to serialize a value here and @@ -2578,17 +2737,8 @@ def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index: raise bigframes.exceptions.NullIndexError( "Cannot materialize index, as this object does not have an index. Set index column(s) using set_index." ) - # Project down to only the index column. So the query can be cached to visualize other data. 
- index_columns = list(self._block.index_columns) - expr = self._expr.select_columns(index_columns) - results, _ = self.session._execute( - expr, ordered=ordered if ordered is not None else True - ) - df = expr.session._rows_to_dataframe(results) - df = df.set_index(index_columns) - index = df.index - index.names = list(self._block._index_labels) # type:ignore - return index + ordered = ordered if ordered is not None else True + return self._block.select_columns([]).to_pandas(ordered=ordered)[0].index def resolve_level(self, level: LevelsType) -> typing.Sequence[str]: if utils.is_list_like(level): @@ -2734,7 +2884,7 @@ def join_mono_indexed( if sort: combined_expr = combined_expr.order_by( [ - ordering.OrderingExpression(ex.free_var(col_id)) + ordering.OrderingExpression(ex.deref(col_id)) for col_id in coalesced_join_cols ] ) @@ -2797,7 +2947,7 @@ def join_multi_indexed( if sort: combined_expr = combined_expr.order_by( [ - ordering.OrderingExpression(ex.free_var(col_id)) + ordering.OrderingExpression(ex.deref(col_id)) for col_id in coalesced_join_cols ] ) @@ -3036,7 +3186,7 @@ def unpivot( *( ( ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), - ex.free_var(column_mapping[id_or_null]) + ex.deref(column_mapping[id_or_null]) if (id_or_null is not None) else ex.const(None), ) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 6973091296..f4afdaa97c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -36,6 +36,7 @@ import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.expression as ex import bigframes.core.guid +import bigframes.core.identifiers as ids from bigframes.core.ordering import ( ascending_over, encode_order_string, @@ -44,7 +45,6 @@ RowOrdering, TotalOrdering, ) -import bigframes.core.schema as schemata import bigframes.core.sql from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec import bigframes.dtypes @@ -142,12 +142,12 @@ def projection( def selection( self: T, - input_output_pairs: typing.Tuple[typing.Tuple[str, str], ...], + input_output_pairs: typing.Tuple[typing.Tuple[ex.DerefOp, str], ...], ) -> T: """Apply an expression to the ArrayValue and assign the output to a column.""" bindings = {col: self._get_ibis_column(col) for col in self.column_ids} values = [ - op_compiler.compile_expression(ex.free_var(input), bindings).name(id) + op_compiler.compile_expression(input, bindings).name(id) for input, id in input_output_pairs ] result = self._select(tuple(values)) # type: ignore @@ -184,7 +184,7 @@ def _aggregate_base( table: ibis_types.Table, order_by: typing.Sequence[ibis_types.Value] = [], aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]] = [], - by_column_ids: typing.Sequence[str] = (), + by_column_ids: typing.Sequence[ex.DerefOp] = (), dropna: bool = True, ) -> OrderedIR: assert not self.is_ordered_ir or len(order_by) > 0 @@ -197,17 +197,21 @@ def _aggregate_base( for aggregate, col_out in aggregations } if by_column_ids: - result = table.group_by(by_column_ids).aggregate(**stats) + result = table.group_by((ref.id.sql for ref in by_column_ids)).aggregate( + **stats + ) # Must have deterministic ordering, so order by the unique "by" column ordering = TotalOrdering( - tuple([ascending_over(column_id) for column_id in by_column_ids]), - total_ordering_columns=frozenset(by_column_ids), + tuple([OrderingExpression(column_id) for column_id in by_column_ids]), + total_ordering_columns=frozenset( + 
[ex.DerefOp(ref.id.local_normalized) for ref in by_column_ids] + ), ) columns = tuple(result[key] for key in result.columns) expr = OrderedIR(result, columns=columns, ordering=ordering) if dropna: - for column_id in by_column_ids: - expr = expr._filter(expr._get_ibis_column(column_id).notnull()) + for ref in by_column_ids: + expr = expr._filter(expr._compile_expression(ref).notnull()) return expr else: aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} @@ -286,7 +290,7 @@ def row_count(self) -> OrderedIR: (ibis_table["count"],), ordering=TotalOrdering( ordering_value_columns=(ascending_over("count"),), - total_ordering_columns=frozenset(["count"]), + total_ordering_columns=frozenset([ex.deref("count")]), ), ) @@ -351,10 +355,13 @@ def _to_ibis_expr( return table def filter(self, predicate: ex.Expression) -> UnorderedIR: - if any(map(is_window, map(self._get_ibis_column, predicate.unbound_variables))): - # ibis doesn't support qualify syntax, so create CTE if filtering over window expression - # https://github.com/ibis-project/ibis/issues/9775 - return self._reproject_to_table().filter(predicate) + for ref in predicate.column_references: + ibis_value = self._get_ibis_column(ref.sql) + if is_window(ibis_value): + # ibis doesn't support qualify syntax, so create CTE if filtering over window expression + # https://github.com/ibis-project/ibis/issues/9775 + return self._reproject_to_table().filter(predicate) + bindings = {col: self._get_ibis_column(col) for col in self.column_ids} condition = op_compiler.compile_expression(predicate, bindings) return self._filter(condition) @@ -368,7 +375,7 @@ def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR: def aggregate( self, aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], - by_column_ids: typing.Sequence[str] = (), + by_column_ids: typing.Sequence[ex.DerefOp] = (), dropna: bool = True, ) -> OrderedIR: """ @@ -400,9 +407,9 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR: columns=columns, ) - def explode(self, offsets: typing.Sequence[int]) -> UnorderedIR: + def explode(self, columns: typing.Sequence[ex.DerefOp]) -> UnorderedIR: table = self._to_ibis_expr() - column_ids = tuple(table.columns[offset] for offset in offsets) + column_ids = tuple(ref.id.sql for ref in columns) # The offset array ensures null represents empty arrays after unnesting. offset_array_id = bigframes.core.guid.generate_guid("offset_array_") @@ -555,7 +562,9 @@ def __init__( all_columns = value_col_ids | hidden_col_ids ordering_valid = all( - set(col.scalar_expression.unbound_variables).issubset(all_columns) + set(ref.sql for ref in col.scalar_expression.column_references).issubset( + all_columns + ) for col in ordering.all_ordering_columns ) if value_col_ids & hidden_col_ids: @@ -575,9 +584,7 @@ def has_total_order(self) -> bool: @classmethod def from_pandas( - cls, - pd_df: pandas.DataFrame, - schema: schemata.ArraySchema, + cls, pd_df: pandas.DataFrame, scan_cols: bigframes.core.nodes.ScanList ) -> OrderedIR: """ Builds an in-memory only (SQL only) expr from a pandas dataframe. 
@@ -593,10 +600,10 @@ def from_pandas( # derive the ibis schema from the original pandas schema ibis_schema = [ ( - name, + local_label, bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(dtype), ) - for name, dtype in zip(schema.names, schema.dtypes) + for id, dtype, local_label in scan_cols.items ] ibis_schema.append((ORDER_ID_COLUMN, ibis_dtypes.int64)) @@ -604,7 +611,10 @@ def from_pandas( return cls( keys_memtable, - columns=[keys_memtable[column].name(column) for column in pd_df.columns], + columns=[ + keys_memtable[local_label].name(col_id.sql) + for col_id, _, local_label in scan_cols.items + ], ordering=TotalOrdering.from_offset_col(ORDER_ID_COLUMN), hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), ) @@ -656,7 +666,7 @@ def reversed(self) -> OrderedIR: def aggregate( self, aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], - by_column_ids: typing.Sequence[str] = (), + by_column_ids: typing.Sequence[ex.DerefOp] = (), dropna: bool = True, ) -> OrderedIR: """ @@ -711,9 +721,9 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR: ordering=self._ordering, ) - def explode(self, offsets: typing.Sequence[int]) -> OrderedIR: + def explode(self, columns: typing.Sequence[ex.DerefOp]) -> OrderedIR: table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) - column_ids = tuple(table.columns[offset] for offset in offsets) + column_ids = tuple(ref.id.sql for ref in columns) offset_array_id = bigframes.core.guid.generate_guid("offset_array_") offset_array = ibis.range( @@ -761,7 +771,7 @@ def explode(self, offsets: typing.Sequence[int]) -> OrderedIR: table_w_unnest[unnest_offset_id], ] l_mappings = {id: id for id in self._ordering.referenced_columns} - r_mappings = {unnest_offset_id: unnest_offset_id} + r_mappings = {ids.ColumnId(unnest_offset_id): ids.ColumnId(unnest_offset_id)} ordering = join_orderings( self._ordering, TotalOrdering.from_offset_col(unnest_offset_id), @@ -796,7 +806,10 @@ def promote_offsets(self, col_id: str) -> OrderedIR: # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql) # Seee: https://github.com/ibis-project/ibis/issues/9773 can_directly_window = not any( - map(lambda x: is_literal(x) or is_window(x), self._ibis_order) + map( + lambda x: is_literal(x) or is_window(x), + itertools.chain(self._ibis_order, self._predicates), + ) ) if not can_directly_window: return self._reproject_to_table().promote_offsets(col_id) @@ -810,12 +823,14 @@ def promote_offsets(self, col_id: str) -> OrderedIR: *self.columns, offsets.name(col_id), ] - return expr_builder.build() + # Reproject, so that offsets are just a scalar value that can be used elsewhere + expr_builder.ordering = TotalOrdering.from_offset_col(col_id) + return expr_builder.build()._reproject_to_table() ## Methods that only work with ordering def project_window_op( self, - column_name: str, + column_name: ex.DerefOp, op: agg_ops.UnaryWindowOp, window_spec: WindowSpec, output_name: str, @@ -834,7 +849,10 @@ def project_window_op( # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql) # See: https://github.com/ibis-project/ibis/issues/9773 used_exprs = map( - self._get_any_column, [column_name, *window_spec.all_referenced_columns] + self._compile_expression, + itertools.chain( + (column_name,), map(ex.DerefOp, window_spec.all_referenced_columns) + ), ) can_directly_window = not any( map(lambda x: is_literal(x) or is_window(x), used_exprs) @@ -848,14 +866,16 @@ def 
project_window_op( never_skip_nulls=never_skip_nulls, ) - column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) + column = typing.cast(ibis_types.Column, self._compile_expression(column_name)) window = self._ibis_window_from_spec( window_spec, require_total_order=op.uses_total_row_ordering ) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} window_op = agg_compiler.compile_analytic( - ex.UnaryAggregation(op, ex.free_var(column_name)), window, bindings=bindings + ex.UnaryAggregation(op, column_name), + window, + bindings=bindings, ) clauses = [] @@ -865,7 +885,7 @@ def project_window_op( if op.skips_nulls: # Most operations do not count NULL values towards min_periods observation_count = agg_compiler.compile_analytic( - ex.UnaryAggregation(agg_ops.count_op, ex.free_var(column_name)), + ex.UnaryAggregation(agg_ops.count_op, column_name), window, bindings=bindings, ) @@ -874,7 +894,7 @@ def project_window_op( # notnull is just used to convert null values to non-null (FALSE) values to be counted denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) observation_count = agg_compiler.compile_analytic( - ex.UnaryAggregation(agg_ops.count_op, ex.free_var("_denulled")), + ex.UnaryAggregation(agg_ops.count_op, ex.deref("_denulled")), window, bindings={**bindings, "_denulled": denulled_value}, ) @@ -891,7 +911,7 @@ def project_window_op( case_statement = case_statement.else_(window_op).end() # type: ignore window_op = case_statement - result = self._set_or_replace_by_id(output_name or column_name, window_op) + result = self._set_or_replace_by_id(output_name, window_op) return result def _reproject_to_table(self) -> OrderedIR: @@ -901,8 +921,9 @@ def _reproject_to_table(self) -> OrderedIR: ) columns = [table[column_name] for column_name in self._column_names] ordering_col_ids = list( - itertools.chain.from_iterable( - ref.scalar_expression.unbound_variables + id.sql + for id in itertools.chain.from_iterable( + ref.scalar_expression.column_references for ref in self._ordering.all_ordering_columns ) ) @@ -1069,10 +1090,13 @@ def _to_ibis_expr( return table def filter(self, predicate: ex.Expression) -> OrderedIR: - if any(map(is_window, map(self._get_ibis_column, predicate.unbound_variables))): - # ibis doesn't support qualify syntax, so create CTE if filtering over window expression - # https://github.com/ibis-project/ibis/issues/9775 - return self._reproject_to_table().filter(predicate) + for ref in predicate.column_references: + ibis_value = self._get_ibis_column(ref.sql) + if is_window(ibis_value): + # ibis doesn't support qualify syntax, so create CTE if filtering over window expression + # https://github.com/ibis-project/ibis/issues/9775 + return self._reproject_to_table().filter(predicate) + bindings = {col: self._get_ibis_column(col) for col in self.column_ids} condition = op_compiler.compile_expression(predicate, bindings) return self._filter(condition) @@ -1088,8 +1112,9 @@ def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> Ordered """Safely assign by id while maintaining ordering integrity.""" # TODO: Split into explicit set and replace methods ordering_col_ids = set( - itertools.chain.from_iterable( - col_ref.scalar_expression.unbound_variables + id.sql + for id in itertools.chain.from_iterable( + col_ref.scalar_expression.column_references for col_ref in self._ordering.ordering_value_columns ) ) @@ -1110,15 +1135,16 @@ def _select(self, values: typing.Tuple[ibis_types.Value]) -> OrderedIR: """Safely assign by 
id while maintaining ordering integrity.""" # TODO: Split into explicit set and replace methods ordering_col_ids = set( - itertools.chain.from_iterable( + id.sql + for id in itertools.chain.from_iterable( [ - col_ref.scalar_expression.unbound_variables + col_ref.scalar_expression.column_references for col_ref in self._ordering.ordering_value_columns ] ) ) ir = self - mappings = {value.name: value for value in values} + mappings = {typing.cast(str, value.get_name()): value for value in values} for ordering_id in ordering_col_ids: # Drop case if (ordering_id not in mappings) and (ordering_id in ir.column_ids): @@ -1156,17 +1182,25 @@ def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: ) return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) - def _hide_column(self, column_id) -> OrderedIR: + def _hide_column(self, column_id: str) -> OrderedIR: """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" expr_builder = self.builder() # Need to rename column as caller might be creating a new row with the same name but different values. # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. - new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") + new_name = ids.ColumnId( + bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") + ) expr_builder.hidden_ordering_columns = [ *self._hidden_ordering_columns, - self._get_ibis_column(column_id).name(new_name), + self._get_ibis_column(column_id).name(new_name.sql), ] - expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) + matching_ref = next( + ref for ref in self._ordering.referenced_columns if ref.sql == column_id + ) + # allow_partial_bindings since only remapping hidden column, not all columns + expr_builder.ordering = self._ordering.remap_column_refs( + {matching_ref: new_name}, allow_partial_bindings=True + ) return expr_builder.build() def _bake_ordering(self) -> OrderedIR: @@ -1181,15 +1215,15 @@ def _bake_ordering(self) -> OrderedIR: ) new_baked_cols.append(baked_column) new_expr = OrderingExpression( - ex.free_var(baked_column.get_name()), expr.direction, expr.na_last + ex.deref(baked_column.get_name()), expr.direction, expr.na_last ) new_exprs.append(new_expr) - elif isinstance(expr.scalar_expression, ex.UnboundVariableExpression): + elif isinstance(expr.scalar_expression, ex.DerefOp): order_col = expr.scalar_expression.id new_exprs.append(expr) - if order_col not in self.column_ids: + if order_col.sql not in self.column_ids: new_baked_cols.append( - self._ibis_bindings[expr.scalar_expression.id] + self._ibis_bindings[expr.scalar_expression.id.sql] ) if isinstance(self._ordering, TotalOrdering): @@ -1275,7 +1309,7 @@ def _ibis_window_from_spec( group_by: typing.List[ibis_types.Value] = ( [ typing.cast( - ibis_types.Column, _as_identity(self._get_ibis_column(column)) + ibis_types.Column, _as_identity(self._compile_expression(column)) ) for column in window_spec.grouping_keys ] diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 74fcaf5f2a..0917097c70 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -28,8 +28,12 @@ import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.default_ordering as default_ordering import bigframes.core.compile.ibis_types +import bigframes.core.compile.scalar_op_compiler +import 
bigframes.core.compile.scalar_op_compiler as compile_scalar import bigframes.core.compile.schema_translator import bigframes.core.compile.single_column +import bigframes.core.guid as guids +import bigframes.core.identifiers as ids import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering @@ -43,6 +47,7 @@ class Compiler: # In strict mode, ordering will always be deterministic # In unstrict mode, ordering from ReadTable or after joins may be ambiguous to improve query performance. strict: bool = True + scalar_op_compiler = compile_scalar.ScalarOpCompiler() def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR: ir = typing.cast(compiled.OrderedIR, self.compile_node(node, True)) @@ -75,6 +80,9 @@ def _compile_node( @_compile_node.register def compile_join(self, node: nodes.JoinNode, ordered: bool = True): + condition_pairs = tuple( + (left.id.sql, right.id.sql) for left, right in node.conditions + ) if ordered: # In general, joins are an ordering destroying operation. # With ordering_mode = "partial", make this explicit. In @@ -86,7 +94,7 @@ def compile_join(self, node: nodes.JoinNode, ordered: bool = True): left=left_ordered, right=right_ordered, type=node.type, - conditions=node.conditions, + conditions=condition_pairs, ) else: left_unordered = self.compile_unordered_ir(node.left_child) @@ -95,7 +103,7 @@ def compile_join(self, node: nodes.JoinNode, ordered: bool = True): left=left_unordered, right=right_unordered, type=node.type, - conditions=node.conditions, + conditions=condition_pairs, ).as_ordered_ir() else: left_unordered = self.compile_unordered_ir(node.left_child) @@ -104,96 +112,79 @@ def compile_join(self, node: nodes.JoinNode, ordered: bool = True): left=left_unordered, right=right_unordered, type=node.type, - conditions=node.conditions, + conditions=condition_pairs, ) @_compile_node.register - def compile_readlocal(self, node: nodes.ReadLocalNode, ordered: bool = True): - array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) - ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd, node.schema) + def compile_fromrange(self, node: nodes.FromRangeNode, ordered: bool = True): + # Both start and end are single elements and do not inherently have an order + start = self.compile_unordered_ir(node.start) + end = self.compile_unordered_ir(node.end) + start_table = start._to_ibis_expr() + end_table = end._to_ibis_expr() + + start_column = start_table.schema().names[0] + end_column = end_table.schema().names[0] + + # Perform a cross join to avoid errors + joined_table = start_table.cross_join(end_table) + + labels_array_table = ibis.range( + joined_table[start_column], joined_table[end_column] + node.step, node.step + ).name("labels") + labels = ( + typing.cast(ibis.expr.types.ArrayValue, labels_array_table) + .unnest() + .as_table() + ) if ordered: - return ordered_ir + return compiled.OrderedIR( + labels, + columns=[labels[labels.columns[0]]], + ordering=bf_ordering.TotalOrdering().from_offset_col(labels.columns[0]), + ) else: - return ordered_ir.to_unordered() + return compiled.UnorderedIR( + labels, + columns=[labels[labels.columns[0]]], + ) @_compile_node.register - def compile_cached_table(self, node: nodes.CachedTableNode, ordered: bool = True): - full_table_name = ( - f"{node.table.project_id}.{node.table.dataset_id}.{node.table.table_id}" - ) - used_columns = ( - *node.schema.names, - *node._hidden_columns, - ) - # Physical schema might include unused columns, unsupported datatypes like JSON - physical_schema = 
ibis.backends.bigquery.BigQuerySchema.to_ibis( - list(i for i in node.table.physical_schema if i.name in used_columns) + def compile_readlocal(self, node: nodes.ReadLocalNode, ordered: bool = True): + array_as_pd = pd.read_feather( + io.BytesIO(node.feather_bytes), + columns=[item.source_id for item in node.scan_list.items], ) - ibis_table = ibis.table(physical_schema, full_table_name) + ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd, node.scan_list) if ordered: - if node.ordering is None: - # If this happens, session malfunctioned while applying cached results. - raise ValueError( - "Cannot use unordered cached value. Result requires ordering information." - ) - if self.strict and not isinstance(node.ordering, bf_ordering.TotalOrdering): - raise ValueError( - "Cannot use partially ordered cached value. Result requires total ordering information." - ) - ir = compiled.OrderedIR( - ibis_table, - columns=tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[col] - ) - for col in [*node.schema.names, *node._hidden_columns] - ), - ordering=node.ordering, - ) - ir = ir._select( - tuple(ir._get_ibis_column(name) for name in node.schema.names) - ) - return ir + return ordered_ir else: - return compiled.UnorderedIR( - ibis_table, - columns=tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[col] - ) - for col in node.schema.names - ), - ) + return ordered_ir.to_unordered() @_compile_node.register def compile_readtable(self, node: nodes.ReadTableNode, ordered: bool = True): if ordered: - return self.compile_read_table_ordered(node) + return self.compile_read_table_ordered(node.source, node.scan_list) else: - return self.compile_read_table_unordered(node) + return self.compile_read_table_unordered(node.source, node.scan_list) def read_table_as_unordered_ibis( - self, node: nodes.ReadTableNode + self, source: nodes.BigqueryDataSource ) -> ibis.expr.types.Table: - full_table_name = ( - f"{node.table.project_id}.{node.table.dataset_id}.{node.table.table_id}" - ) - used_columns = ( - *node.schema.names, - *[i for i in node.total_order_cols if i not in node.schema.names], - ) + full_table_name = f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}" + used_columns = tuple(col.name for col in source.table.physical_schema) # Physical schema might include unused columns, unsupported datatypes like JSON physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( - list(i for i in node.table.physical_schema if i.name in used_columns) + list(i for i in source.table.physical_schema if i.name in used_columns) ) - if node.at_time is not None or node.sql_predicate is not None: + if source.at_time is not None or source.sql_predicate is not None: import bigframes.session._io.bigquery sql = bigframes.session._io.bigquery.to_query( full_table_name, columns=used_columns, - sql_predicate=node.sql_predicate, - time_travel_timestamp=node.at_time, + sql_predicate=source.sql_predicate, + time_travel_timestamp=source.at_time, ) return ibis.backends.bigquery.Backend().sql( schema=physical_schema, query=sql @@ -201,56 +192,64 @@ def read_table_as_unordered_ibis( else: return ibis.table(physical_schema, full_table_name) - def compile_read_table_unordered(self, node: nodes.ReadTableNode): - ibis_table = self.read_table_as_unordered_ibis(node) + def compile_read_table_unordered( + self, source: nodes.BigqueryDataSource, scan: nodes.ScanList + ): + ibis_table = self.read_table_as_unordered_ibis(source) return 
compiled.UnorderedIR( ibis_table, tuple( bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[col] + ibis_table[scan_item.source_id].name(scan_item.id.sql) ) - for col in node.schema.names + for scan_item in scan.items ), ) - def compile_read_table_ordered(self, node: nodes.ReadTableNode): - ibis_table = self.read_table_as_unordered_ibis(node) - if node.total_order_cols: - ordering_value_columns = tuple( - bf_ordering.ascending_over(col) for col in node.total_order_cols + def compile_read_table_ordered( + self, source: nodes.BigqueryDataSource, scan_list: nodes.ScanList + ): + ibis_table = self.read_table_as_unordered_ibis(source) + if source.ordering is not None: + visible_column_mapping = { + ids.ColumnId(scan_item.source_id): scan_item.id + for scan_item in scan_list.items + } + full_mapping = { + ids.ColumnId(col.name): ids.ColumnId(guids.generate_guid()) + for col in source.ordering.referenced_columns + } + full_mapping.update(visible_column_mapping) + + ordering = source.ordering.remap_column_refs(full_mapping) + hidden_columns = tuple( + ibis_table[source_id.sql].name(out_id.sql) + for source_id, out_id in full_mapping.items() + if source_id not in visible_column_mapping ) - if node.order_col_is_sequential: - integer_encoding = bf_ordering.IntegerEncoding( - is_encoded=True, is_sequential=True + elif self.strict: # In strict mode, we fallback to ordering by row hash + order_values = [ + col.name(guids.generate_guid()) + for col in default_ordering.gen_default_ordering( + ibis_table, use_double_hash=True ) - else: - integer_encoding = bf_ordering.IntegerEncoding() - ordering: bf_ordering.RowOrdering = bf_ordering.TotalOrdering( - ordering_value_columns, - integer_encoding=integer_encoding, - total_ordering_columns=frozenset(node.total_order_cols), - ) - hidden_columns = () - elif self.strict: - ibis_table, ordering = default_ordering.gen_default_ordering( - ibis_table, use_double_hash=True - ) - hidden_columns = tuple( - ibis_table[col] - for col in ibis_table.columns - if col not in node.schema.names + ] + ordering = bf_ordering.TotalOrdering.from_primary_key( + [value.get_name() for value in order_values] ) + hidden_columns = tuple(order_values) else: # In unstrict mode, don't generate total ordering from hashing as this is # expensive (prevent removing any columns from table scan) ordering, hidden_columns = bf_ordering.RowOrdering(), () + return compiled.OrderedIR( ibis_table, columns=tuple( bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[col] + ibis_table[scan_item.source_id].name(scan_item.id.sql) ) - for col in node.schema.names + for scan_item in scan_list.items ), ordering=ordering, hidden_ordering_columns=hidden_columns, @@ -260,7 +259,7 @@ def compile_read_table_ordered(self, node: nodes.ReadTableNode): def compile_promote_offsets( self, node: nodes.PromoteOffsetsNode, ordered: bool = True ): - result = self.compile_ordered_ir(node.child).promote_offsets(node.col_id) + result = self.compile_ordered_ir(node.child).promote_offsets(node.col_id.sql) return result if ordered else result.to_unordered() @_compile_node.register @@ -284,12 +283,14 @@ def compile_reversed(self, node: nodes.ReversedNode, ordered: bool = True): @_compile_node.register def compile_selection(self, node: nodes.SelectionNode, ordered: bool = True): result = self.compile_node(node.child, ordered) - return result.selection(node.input_output_pairs) + selection = tuple((ref, id.sql) for ref, id in node.input_output_pairs) + return result.selection(selection) 
@_compile_node.register def compile_projection(self, node: nodes.ProjectionNode, ordered: bool = True): result = self.compile_node(node.child, ordered) - return result.projection(node.assignments) + projections = ((expr, id.sql) for expr, id in node.assignments) + return result.projection(tuple(projections)) @_compile_node.register def compile_concat(self, node: nodes.ConcatNode, ordered: bool = True): @@ -312,13 +313,14 @@ def compile_aggregate(self, node: nodes.AggregateNode, ordered: bool = True): has_ordered_aggregation_ops = any( aggregate.op.can_order_by for aggregate, _ in node.aggregations ) + aggs = tuple((agg, id.sql) for agg, id in node.aggregations) if ordered and has_ordered_aggregation_ops: return self.compile_ordered_ir(node.child).aggregate( - node.aggregations, node.by_column_ids, node.dropna + aggs, node.by_column_ids, node.dropna ) else: result = self.compile_unordered_ir(node.child).aggregate( - node.aggregations, node.by_column_ids, node.dropna + aggs, node.by_column_ids, node.dropna ) return result if ordered else result.to_unordered() @@ -328,7 +330,7 @@ def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): node.column_name, node.op, node.window_spec, - node.output_name, + node.output_name.sql, never_skip_nulls=node.never_skip_nulls, ) return result if ordered else result.to_unordered() diff --git a/bigframes/core/compile/concat.py b/bigframes/core/compile/concat.py index 35cf9ec5b4..81d6805d22 100644 --- a/bigframes/core/compile/concat.py +++ b/bigframes/core/compile/concat.py @@ -19,6 +19,7 @@ import ibis import bigframes.core.compile.compiled as compiled +import bigframes.core.expression as ex from bigframes.core.ordering import ( ascending_over, reencode_order_string, @@ -85,7 +86,7 @@ def concat_ordered( combined_table = ibis.union(*tables) ordering = TotalOrdering( ordering_value_columns=tuple([ascending_over(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + total_ordering_columns=frozenset([ex.deref(ORDER_ID_COLUMN)]), string_encoding=StringEncoding(True, prefix_size + max_encoding_size), ) return compiled.OrderedIR( diff --git a/bigframes/core/compile/default_ordering.py b/bigframes/core/compile/default_ordering.py index a6b625caca..bafeebddc9 100644 --- a/bigframes/core/compile/default_ordering.py +++ b/bigframes/core/compile/default_ordering.py @@ -18,7 +18,6 @@ from __future__ import annotations -import itertools from typing import cast import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops @@ -28,7 +27,6 @@ import ibis.expr.types as ibis_types import bigframes.core.guid as guid -import bigframes.core.ordering as order def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringValue: @@ -58,7 +56,9 @@ def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringVa return cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped) -def gen_default_ordering(table: ibis.table, use_double_hash: bool = True): +def gen_default_ordering( + table: ibis.table, use_double_hash: bool = True +) -> list[ibis.Value]: ordering_hash_part = guid.generate_guid("bigframes_ordering_") ordering_hash_part2 = guid.generate_guid("bigframes_ordering_") ordering_rand_part = guid.generate_guid("bigframes_ordering_") @@ -81,16 +81,4 @@ def gen_default_ordering(table: ibis.table, use_double_hash: bool = True): if use_double_hash else [full_row_hash, random_value] ) - - original_column_ids = table.columns - table_with_ordering = table.select( - itertools.chain(original_column_ids, 
order_values) - ) - - ordering = order.TotalOrdering( - ordering_value_columns=tuple( - order.ascending_over(col.get_name()) for col in order_values - ), - total_ordering_columns=frozenset(col.get_name() for col in order_values), - ) - return table_with_ordering, ordering + return order_values diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 77bfb84425..a4c37b7c5d 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -330,7 +330,11 @@ def _ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: if isinstance(ibis_dtype, ibis_dtypes.Struct): return pa.struct( [ - (name, _ibis_dtype_to_arrow_dtype(dtype)) + pa.field( + name, + _ibis_dtype_to_arrow_dtype(dtype), + nullable=not pa.types.is_list(_ibis_dtype_to_arrow_dtype(dtype)), + ) for name, dtype in ibis_dtype.fields.items() ] ) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 799a408d5b..729b341e85 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -86,13 +86,13 @@ def _( @compile_expression.register def _( self, - expression: ex.UnboundVariableExpression, + expression: ex.DerefOp, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - if expression.id not in bindings: + if expression.id.sql not in bindings: raise ValueError(f"Could not resolve unbound variable {expression.id}") else: - return bindings[expression.id] + return bindings[expression.id.sql] @compile_expression.register def _( @@ -729,6 +729,194 @@ def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): return result.cast(result_type) +@scalar_op_compiler.register_binary_op(ops.DatetimeToIntegerLabelOp, pass_op=True) +def datetime_to_integer_label_op_impl( + x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp +): + # Determine if the frequency is fixed by checking if 'op.freq.nanos' is defined. + try: + return datetime_to_integer_label_fixed_frequency(x, y, op) + except ValueError: + return datetime_to_integer_label_non_fixed_frequency(x, y, op) + + +def datetime_to_integer_label_fixed_frequency( + x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp +): + """ + This function handles fixed frequency conversions where the unit can range + from microseconds (us) to days. + """ + us = op.freq.nanos / 1000 + x_int = x.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64) + first = calculate_resample_first(y, op.origin) + x_int_label = (x_int - first) // us + return x_int_label + + +def datetime_to_integer_label_non_fixed_frequency( + x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp +): + """ + This function handles non-fixed frequency conversions for units ranging + from weeks to years. 
+ """ + rule_code = op.freq.rule_code + n = op.freq.n + if rule_code == "W-SUN": # Weekly + us = n * 7 * 24 * 60 * 60 * 1000000 + x = x.truncate("week") + ibis.interval(days=6) + y = y.truncate("week") + ibis.interval(days=6) + x_int = x.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64) + first = y.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64) + x_int_label = ( + ibis.case() + .when(x_int == first, 0) + .else_((x_int - first - 1) // us + 1) + .end() + ) + elif rule_code == "ME": # Monthly + x_int = x.year() * 12 + x.month() - 1 + first = y.year() * 12 + y.month() - 1 + x_int_label = ( + ibis.case() + .when(x_int == first, 0) + .else_((x_int - first - 1) // n + 1) + .end() + ) + elif rule_code == "QE-DEC": # Quarterly + x_int = x.year() * 4 + x.quarter() - 1 + first = y.year() * 4 + y.quarter() - 1 + x_int_label = ( + ibis.case() + .when(x_int == first, 0) + .else_((x_int - first - 1) // n + 1) + .end() + ) + elif rule_code == "YE-DEC": # Yearly + x_int = x.year() + first = y.year() + x_int_label = ( + ibis.case() + .when(x_int == first, 0) + .else_((x_int - first - 1) // n + 1) + .end() + ) + else: + raise ValueError(rule_code) + return x_int_label + + +@scalar_op_compiler.register_binary_op(ops.IntegerLabelToDatetimeOp, pass_op=True) +def integer_label_to_datetime_op_impl( + x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp +): + # Determine if the frequency is fixed by checking if 'op.freq.nanos' is defined. + try: + return integer_label_to_datetime_op_fixed_frequency(x, y, op) + except ValueError: + return integer_label_to_datetime_op_non_fixed_frequency(x, y, op) + + +def integer_label_to_datetime_op_fixed_frequency( + x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp +): + """ + This function handles fixed frequency conversions where the unit can range + from microseconds (us) to days. + """ + us = op.freq.nanos / 1000 + + first = calculate_resample_first(y, op.origin) + + x_label = ( + (x * us + first) + .cast(ibis_dtypes.int64) + .to_timestamp(unit="us") + .cast(ibis_dtypes.Timestamp(timezone="UTC")) + .cast(y.type()) + ) + return x_label + + +def integer_label_to_datetime_op_non_fixed_frequency( + x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp +): + """ + This function handles non-fixed frequency conversions for units ranging + from weeks to years. 
+ """ + rule_code = op.freq.rule_code + n = op.freq.n + if rule_code == "W-SUN": # Weekly + us = n * 7 * 24 * 60 * 60 * 1000000 + first = ( + y.cast(ibis_dtypes.Timestamp(timezone="UTC")).truncate("week") + + ibis.interval(days=6) + ).cast(ibis_dtypes.int64) + x_label = ( + (x * us + first) + .cast(ibis_dtypes.int64) + .to_timestamp(unit="us") + .cast(ibis_dtypes.Timestamp(timezone="UTC")) + .cast(y.type()) + ) + elif rule_code == "ME": # Monthly + one = ibis.literal(1) + twelve = ibis.literal(12) + first = y.year() * twelve + y.month() - one + + x = x * n + first + year = x // twelve + month = (x % twelve) + one + + next_year = (month == twelve).ifelse(year + one, year) + next_month = (month == twelve).ifelse(one, month + one) + next_month_date = ibis.timestamp(next_year, next_month, one, 0, 0, 0) + + x_label = next_month_date - ibis.interval(days=1) + elif rule_code == "QE-DEC": # Quarterly + one = ibis.literal(1) + three = ibis.literal(3) + four = ibis.literal(4) + twelve = ibis.literal(12) + first = y.year() * four + y.quarter() - one + + x = x * n + first + year = x // four + month = ((x % four) + one) * three + + next_year = (month == twelve).ifelse(year + one, year) + next_month = (month == twelve).ifelse(one, month + one) + next_month_date = ibis.timestamp(next_year, next_month, one, 0, 0, 0) + + x_label = next_month_date - ibis.interval(days=1) + elif rule_code == "YE-DEC": # Yearly + one = ibis.literal(1) + first = y.year() + x = x * n + first + next_year = x + one + next_month_date = ibis.timestamp(next_year, 1, 1, 0, 0, 0) + x_label = next_month_date - ibis.interval(days=1) + + return x_label.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(y.type()) + + +def calculate_resample_first(y: ibis_types.Value, origin): + if origin == "epoch": + return ibis.literal(0) + elif origin == "start_day": + return ( + y.cast(ibis_dtypes.date) + .cast(ibis_dtypes.Timestamp(timezone="UTC")) + .cast(ibis_dtypes.int64) + ) + elif origin == "start": + return y.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64) + else: + raise ValueError(f"Origin {origin} not supported") + + @scalar_op_compiler.register_unary_op(ops.time_op) def time_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.TimestampValue, x).time() diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index 325df8e180..6f2f3f5b6e 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -52,9 +52,8 @@ def join_by_column_ordered( """ # Do not reset the generator - id_generator = ids.standard_identifiers() - l_value_mapping = dict(zip(left.column_ids, id_generator)) - r_value_mapping = dict(zip(right.column_ids, id_generator)) + l_value_mapping = dict(zip(left.column_ids, left.column_ids)) + r_value_mapping = dict(zip(right.column_ids, right.column_ids)) l_hidden_mapping = { id: guids.generate_guid("hidden_") for id in left._hidden_column_ids @@ -93,8 +92,8 @@ def join_by_column_ordered( ordering = orderings.join_orderings( left._ordering, right._ordering, - l_mapping, - r_mapping, + {ids.ColumnId(lin): ids.ColumnId(lout) for lin, lout in l_mapping.items()}, + {ids.ColumnId(rin): ids.ColumnId(rout) for rin, rout in r_mapping.items()}, left_order_dominates=(type != "right"), ) @@ -143,18 +142,13 @@ def join_by_column_unordered( first the coalesced join keys, then, all the left columns, and finally, all the right columns. 
""" - id_generator = ids.standard_identifiers() - l_mapping = dict(zip(left.column_ids, id_generator)) - r_mapping = dict(zip(right.column_ids, id_generator)) - left_table = left._to_ibis_expr( - col_id_overrides=l_mapping, - ) - right_table = right._to_ibis_expr( - col_id_overrides=r_mapping, - ) + # Shouldn't need to select the column ids explicitly, but it seems that ibis has some + # bug resolving column ids otherwise, potentially because of the "JoinChain" op + left_table = left._to_ibis_expr().select(left.column_ids) + right_table = right._to_ibis_expr().select(right.column_ids) join_conditions = [ - value_to_join_key(left_table[l_mapping[left_index]]) - == value_to_join_key(right_table[r_mapping[right_index]]) + value_to_join_key(left_table[left_index]) + == value_to_join_key(right_table[right_index]) for left_index, right_index in conditions ] @@ -166,8 +160,8 @@ def join_by_column_unordered( ) # We could filter out the original join columns, but predicates/ordering # might still reference them in implicit joins. - columns = [combined_table[l_mapping[col.get_name()]] for col in left.columns] + [ - combined_table[r_mapping[col.get_name()]] for col in right.columns + columns = [combined_table[col.get_name()] for col in left.columns] + [ + combined_table[col.get_name()] for col in right.columns ] return compiled.UnorderedIR( combined_table, diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 4779e92cde..9dee599a7c 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -20,6 +20,7 @@ import typing from typing import Mapping, Union +import bigframes.core.identifiers as ids import bigframes.dtypes as dtypes import bigframes.operations import bigframes.operations.aggregations as agg_ops @@ -31,6 +32,10 @@ def const( return ScalarConstantExpression(value, dtype or dtypes.infer_literal_type(value)) +def deref(name: str) -> DerefOp: + return DerefOp(ids.ColumnId(name)) + + def free_var(id: str) -> UnboundVariableExpression: return UnboundVariableExpression(id) @@ -43,17 +48,21 @@ class Aggregation(abc.ABC): @abc.abstractmethod def output_type( - self, input_types: dict[str, dtypes.ExpressionType] + self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] ) -> dtypes.ExpressionType: ... 
+ @property + def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: + return () + @dataclasses.dataclass(frozen=True) class NullaryAggregation(Aggregation): op: agg_ops.NullaryWindowOp = dataclasses.field() def output_type( - self, input_types: dict[str, bigframes.dtypes.Dtype] + self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] ) -> dtypes.ExpressionType: return self.op.output_type() @@ -61,44 +70,54 @@ def output_type( @dataclasses.dataclass(frozen=True) class UnaryAggregation(Aggregation): op: agg_ops.UnaryWindowOp = dataclasses.field() - arg: Union[ - UnboundVariableExpression, ScalarConstantExpression - ] = dataclasses.field() + arg: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() def output_type( - self, input_types: dict[str, bigframes.dtypes.Dtype] + self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] ) -> dtypes.ExpressionType: return self.op.output_type(self.arg.output_type(input_types)) + @property + def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: + return self.arg.column_references + @dataclasses.dataclass(frozen=True) class BinaryAggregation(Aggregation): op: agg_ops.BinaryAggregateOp = dataclasses.field() - left: Union[ - UnboundVariableExpression, ScalarConstantExpression - ] = dataclasses.field() - right: Union[ - UnboundVariableExpression, ScalarConstantExpression - ] = dataclasses.field() + left: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() + right: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() def output_type( - self, input_types: dict[str, bigframes.dtypes.Dtype] + self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] ) -> dtypes.ExpressionType: return self.op.output_type( self.left.output_type(input_types), self.right.output_type(input_types) ) + @property + def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: + return (*self.left.column_references, *self.right.column_references) + @dataclasses.dataclass(frozen=True) class Expression(abc.ABC): """An expression represents a computation taking N scalar inputs and producing a single output scalar.""" @property - def unbound_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[str, ...]: return () - def rename(self, name_mapping: Mapping[str, str]) -> Expression: - return self + @property + def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: + return () + + def remap_column_refs( + self, name_mapping: Mapping[ids.ColumnId, ids.ColumnId] + ) -> Expression: + return self.bind_refs( + {old_id: DerefOp(new_id) for old_id, new_id in name_mapping.items()} + ) @property @abc.abstractmethod @@ -107,17 +126,29 @@ def is_const(self) -> bool: @abc.abstractmethod def output_type( - self, input_types: dict[str, dtypes.ExpressionType] + self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] ) -> dtypes.ExpressionType: ... + @abc.abstractmethod + def bind_refs( + self, + bindings: Mapping[ids.ColumnId, Expression], + allow_partial_bindings: bool = False, + ) -> Expression: + """Replace variables with expression given in `bindings`. + + If allow_partial_bindings is False, validate that all free variables are bound to a new value. + """ + ... + @abc.abstractmethod def bind_variables( - self, bindings: Mapping[str, Expression], check_bind_all: bool = True + self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False ) -> Expression: """Replace variables with expression given in `bindings`. 
- If check_bind_all is True, validate that all free variables are bound to a new value. + If allow_partial_bindings is False, validate that all free variables are bound to a new value. """ ... @@ -143,19 +174,23 @@ class ScalarConstantExpression(Expression): def is_const(self) -> bool: return True - def rename(self, name_mapping: Mapping[str, str]) -> ScalarConstantExpression: - return self - def output_type( - self, input_types: dict[str, bigframes.dtypes.Dtype] + self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] ) -> dtypes.ExpressionType: return self.dtype def bind_variables( - self, bindings: Mapping[str, Expression], check_bind_all: bool = True + self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False ) -> Expression: return self + def bind_refs( + self, + bindings: Mapping[ids.ColumnId, Expression], + allow_partial_bindings: bool = False, + ) -> ScalarConstantExpression: + return self + @property def is_bijective(self) -> bool: # () <-> value @@ -169,21 +204,59 @@ class UnboundVariableExpression(Expression): id: str @property - def unbound_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[str, ...]: return (self.id,) - def rename(self, name_mapping: Mapping[str, str]) -> UnboundVariableExpression: - if self.id in name_mapping: - return UnboundVariableExpression(name_mapping[self.id]) - else: - return self + @property + def is_const(self) -> bool: + return False + + def output_type( + self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] + ) -> dtypes.ExpressionType: + raise ValueError(f"Type of variable {self.id} has not been fixed.") + + def bind_refs( + self, + bindings: Mapping[ids.ColumnId, Expression], + allow_partial_bindings: bool = False, + ) -> UnboundVariableExpression: + return self + + def bind_variables( + self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + ) -> Expression: + if self.id in bindings.keys(): + return bindings[self.id] + elif not allow_partial_bindings: + raise ValueError(f"Variable {self.id} remains unbound") + return self + + @property + def is_bijective(self) -> bool: + return True + + @property + def is_identity(self) -> bool: + return True + + +@dataclasses.dataclass(frozen=True) +class DerefOp(Expression): + """A variable expression representing an unbound variable.""" + + id: ids.ColumnId + + @property + def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: + return (self.id,) @property def is_const(self) -> bool: return False def output_type( - self, input_types: dict[str, bigframes.dtypes.Dtype] + self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] ) -> dtypes.ExpressionType: if self.id in input_types: return input_types[self.id] @@ -191,11 +264,18 @@ def output_type( raise ValueError(f"Type of variable {self.id} has not been fixed.") def bind_variables( - self, bindings: Mapping[str, Expression], check_bind_all: bool = True + self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + ) -> Expression: + return self + + def bind_refs( + self, + bindings: Mapping[ids.ColumnId, Expression], + allow_partial_bindings: bool = False, ) -> Expression: if self.id in bindings.keys(): return bindings[self.id] - elif check_bind_all: + elif not allow_partial_bindings: raise ValueError(f"Variable {self.id} remains unbound") return self @@ -216,16 +296,19 @@ class OpExpression(Expression): inputs: typing.Tuple[Expression, ...] 
@property - def unbound_variables(self) -> typing.Tuple[str, ...]: + def column_references( + self, + ) -> typing.Tuple[bigframes.core.identifiers.ColumnId, ...]: return tuple( itertools.chain.from_iterable( - map(lambda x: x.unbound_variables, self.inputs) + map(lambda x: x.column_references, self.inputs) ) ) - def rename(self, name_mapping: Mapping[str, str]) -> Expression: - return OpExpression( - self.op, tuple(input.rename(name_mapping) for input in self.inputs) + @property + def free_variables(self) -> typing.Tuple[str, ...]: + return tuple( + itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs)) ) @property @@ -233,7 +316,7 @@ def is_const(self) -> bool: return all(child.is_const for child in self.inputs) def output_type( - self, input_types: dict[str, dtypes.ExpressionType] + self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] ) -> dtypes.ExpressionType: operand_types = tuple( map(lambda x: x.output_type(input_types=input_types), self.inputs) @@ -241,12 +324,27 @@ def output_type( return self.op.output_type(*operand_types) def bind_variables( - self, bindings: Mapping[str, Expression], check_bind_all: bool = True - ) -> Expression: + self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + ) -> OpExpression: + return OpExpression( + self.op, + tuple( + input.bind_variables( + bindings, allow_partial_bindings=allow_partial_bindings + ) + for input in self.inputs + ), + ) + + def bind_refs( + self, + bindings: Mapping[ids.ColumnId, Expression], + allow_partial_bindings: bool = False, + ) -> OpExpression: return OpExpression( self.op, tuple( - input.bind_variables(bindings, check_bind_all=check_bind_all) + input.bind_refs(bindings, allow_partial_bindings=allow_partial_bindings) for input in self.inputs ), ) diff --git a/bigframes/core/identifiers.py b/bigframes/core/identifiers.py index 9239c41248..0d2aaeb07c 100644 --- a/bigframes/core/identifiers.py +++ b/bigframes/core/identifiers.py @@ -11,16 +11,37 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations -# Later, plan on migrating ids to use integers to reduce memory usage allow use of bitmaps to represent column sets - +import dataclasses +import functools from typing import Generator -ID_TYPE = str - -def standard_identifiers() -> Generator[ID_TYPE, None, None]: +def standard_identifiers() -> Generator[str, None, None]: i = 0 while True: yield f"col_{i}" i = i + 1 + + +# Used for expression trees +@functools.total_ordering +@dataclasses.dataclass(frozen=True) +class ColumnId: + """Local id without plan-wide id.""" + + name: str + + @property + def sql(self) -> str: + """Returns the unescaped SQL name.""" + return self.name + + @property + def local_normalized(self) -> ColumnId: + """For use in compiler only. 
Normalizes to ColumnId referring to sql name.""" + return self # == ColumnId(name=self.sql) + + def __lt__(self, other: ColumnId) -> bool: + return self.name < other.name diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 017702b85a..0ba79bebee 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -327,7 +327,9 @@ def astype( ) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") - return self._apply_unary_expr(ops.AsTypeOp(to_type=dtype).as_expr("arg")) + return self._apply_unary_expr( + ops.AsTypeOp(to_type=dtype).as_expr(ex.free_var("arg")) + ) def all(self) -> bool: if self.nlevels > 1: @@ -396,7 +398,9 @@ def value_counts( def fillna(self, value=None) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'fillna'") - return self._apply_unary_expr(ops.fillna_op.as_expr("arg", ex.const(value))) + return self._apply_unary_expr( + ops.fillna_op.as_expr(ex.free_var("arg"), ex.const(value)) + ) def rename(self, name: Union[str, Sequence[str]]) -> Index: names = [name] if isinstance(name, str) else list(name) @@ -446,7 +450,9 @@ def isin(self, values) -> Index: ) return self._apply_unary_expr( - ops.IsInOp(values=tuple(values), match_nulls=True).as_expr("arg") + ops.IsInOp(values=tuple(values), match_nulls=True).as_expr( + ex.free_var("arg") + ) ).fillna(value=False) def _apply_unary_expr( @@ -454,14 +460,16 @@ def _apply_unary_expr( op: ex.Expression, ) -> Index: """Applies a unary operator to the index.""" - if len(op.unbound_variables) != 1: + if len(op.free_variables) != 1: raise ValueError("Expression must have exactly 1 unbound variable.") - unbound_variable = op.unbound_variables[0] + unbound_variable = op.free_variables[0] block = self._block result_ids = [] for col in self._block.index_columns: - block, result_id = block.project_expr(op.rename({unbound_variable: col})) + block, result_id = block.project_expr( + op.bind_variables({unbound_variable: ex.deref(col)}) + ) result_ids.append(result_id) block = block.set_index(result_ids, index_labels=self._block.index.names) diff --git a/bigframes/core/join_def.py b/bigframes/core/join_def.py index 5b7b7e45dd..cd9c2acd17 100644 --- a/bigframes/core/join_def.py +++ b/bigframes/core/join_def.py @@ -17,8 +17,6 @@ import enum from typing import Literal, NamedTuple -import bigframes.core.identifiers as ids - class JoinSide(enum.Enum): LEFT = 0 @@ -34,21 +32,21 @@ def inverse(self) -> JoinSide: class JoinCondition(NamedTuple): - left_id: ids.ID_TYPE - right_id: ids.ID_TYPE + left_id: str + right_id: str @dataclasses.dataclass(frozen=True) class JoinColumnMapping: source_table: JoinSide - source_id: ids.ID_TYPE - destination_id: ids.ID_TYPE + source_id: str + destination_id: str @dataclasses.dataclass(frozen=True) class CoalescedColumnMapping: """Special column mapping used only by implicit joiner only""" - left_source_id: ids.ID_TYPE - right_source_id: ids.ID_TYPE - destination_id: ids.ID_TYPE + left_source_id: str + right_source_id: str + destination_id: str diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 93b59f75ee..e65040686e 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -20,12 +20,13 @@ import functools import itertools import typing -from typing import Callable, Tuple +from typing import Callable, Iterable, Sequence, Tuple import google.cloud.bigquery as bq import bigframes.core.expression as ex import bigframes.core.guid +import bigframes.core.identifiers import 
bigframes.core.identifiers as bfet_ids from bigframes.core.ordering import OrderingExpression import bigframes.core.schema as schemata @@ -41,8 +42,13 @@ # A fixed number of variable to assume for overhead on some operations OVERHEAD_VARIABLES = 5 +COLUMN_SET = frozenset[bfet_ids.ColumnId] -COL_OFFSET = int + +@dataclass(frozen=True) +class Field: + id: bfet_ids.ColumnId + dtype: bigframes.dtypes.Dtype @dataclass(frozen=True) @@ -103,11 +109,16 @@ def roots(self) -> typing.Set[BigFrameNode]: ) return set(roots) + # TODO: For deep trees, this can create a lot of overhead, maybe use zero-copy persistent datastructure? @property @abc.abstractmethod - def schema(self) -> schemata.ArraySchema: + def fields(self) -> Tuple[Field, ...]: ... + @property + def ids(self) -> Iterable[bfet_ids.ColumnId]: + return (field.id for field in self.fields) + @property @abc.abstractmethod def variables_introduced(self) -> int: @@ -162,6 +173,13 @@ def total_relational_ops(self) -> int: def total_joins(self) -> int: return int(self.joins) + sum(map(lambda x: x.total_joins, self.child_nodes)) + @functools.cached_property + def schema(self) -> schemata.ArraySchema: + # TODO: Make schema just a view on fields + return schemata.ArraySchema( + tuple(schemata.SchemaItem(i.id.name, i.dtype) for i in self.fields) + ) + @property def planning_complexity(self) -> int: """ @@ -197,6 +215,16 @@ def defined_variables(self) -> set[str]: *(child.defined_variables for child in self.child_nodes) ) + def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: + return self._dtype_lookup[id] + + @functools.cached_property + def _dtype_lookup(self): + return {field.id: field.dtype for field in self.fields} + + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + return self.transform_children(lambda x: x.prune(used_cols)) + @dataclass(frozen=True) class UnaryNode(BigFrameNode): @@ -207,8 +235,8 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: return (self.child,) @functools.cached_property - def schema(self) -> schemata.ArraySchema: - return self.child.schema + def fields(self) -> Tuple[Field, ...]: + return self.child.fields @property def explicitly_ordered(self) -> bool: @@ -228,9 +256,14 @@ def order_ambiguous(self) -> bool: class JoinNode(BigFrameNode): left_child: BigFrameNode right_child: BigFrameNode - conditions: typing.Tuple[typing.Tuple[str, str], ...] + conditions: typing.Tuple[typing.Tuple[ex.DerefOp, ex.DerefOp], ...] 
type: typing.Literal["inner", "outer", "left", "right", "cross"]
 
+    def __post_init__(self):
+        assert not (
+            set(self.left_child.ids) & set(self.right_child.ids)
+        ), "Join ids collide"
+
     @property
     def row_preserving(self) -> bool:
         return False
@@ -256,15 +289,8 @@ def __hash__(self):
         return self._node_hash
 
     @functools.cached_property
-    def schema(self) -> schemata.ArraySchema:
-        items = []
-        schema_items = itertools.chain(
-            self.left_child.schema.items, self.right_child.schema.items
-        )
-        identifiers = bfet_ids.standard_identifiers()
-        for id, item in zip(identifiers, schema_items):
-            items.append(schemata.SchemaItem(id, item.dtype))
-        return schemata.ArraySchema(tuple(items))
+    def fields(self) -> Tuple[Field, ...]:
+        return tuple(itertools.chain(self.left_child.fields, self.right_child.fields))
 
     @functools.cached_property
     def variables_introduced(self) -> int:
@@ -286,9 +312,17 @@ def transform_children(
     def defines_namespace(self) -> bool:
         return True
 
+    def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
+        # If this is a cross join, make sure to select at least one column from each side
+        new_used = used_cols.union(
+            map(lambda x: x.id, itertools.chain.from_iterable(self.conditions))
+        )
+        return self.transform_children(lambda x: x.prune(new_used))
+
 
 @dataclass(frozen=True)
 class ConcatNode(BigFrameNode):
+    # TODO: Explicitly map column ids from each child
     children: Tuple[BigFrameNode, ...]
 
     def __post_init__(self):
@@ -315,13 +349,12 @@ def __hash__(self):
         return self._node_hash
 
     @functools.cached_property
-    def schema(self) -> schemata.ArraySchema:
+    def fields(self) -> Tuple[Field, ...]:
         # TODO: Output names should probably be aligned beforehand or be part of concat definition
-        items = tuple(
-            schemata.SchemaItem(f"column_{i}", dtype)
-            for i, dtype in enumerate(self.children[0].schema.dtypes)
+        return tuple(
+            Field(bfet_ids.ColumnId(f"column_{i}"), field.dtype)
+            for i, field in enumerate(self.children[0].fields)
         )
-        return schemata.ArraySchema(items)
 
     @functools.cached_property
     def variables_introduced(self) -> int:
@@ -333,8 +366,59 @@ def transform_children(
     ) -> BigFrameNode:
         return replace(self, children=tuple(t(child) for child in self.children))
 
+    def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
+        # TODO: Make concat prunable, probably by redefining
+        return self
+
+
+@dataclass(frozen=True)
+class FromRangeNode(BigFrameNode):
+    # TODO: Enforce single-row, single column constraint
+    start: BigFrameNode
+    end: BigFrameNode
+    step: int
+
+    def __hash__(self):
+        return self._node_hash
+
+    @property
+    def roots(self) -> typing.Set[BigFrameNode]:
+        return {self}
+
+    @property
+    def child_nodes(self) -> typing.Sequence[BigFrameNode]:
+        return (self.start, self.end)
+
+    @property
+    def order_ambiguous(self) -> bool:
+        return False
+
+    @property
+    def explicitly_ordered(self) -> bool:
+        return True
+
+    @functools.cached_property
+    def fields(self) -> Tuple[Field, ...]:
+        return (Field(bfet_ids.ColumnId("labels"), self.start.fields[0].dtype),)
+
+    @functools.cached_property
+    def variables_introduced(self) -> int:
+        """Defines the number of variables generated by the current node.
Used to estimate query planning complexity.""" + return len(self.schema.items) + OVERHEAD_VARIABLES + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace(self, start=t(self.start), end=t(self.end)) + + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + # TODO: Make FromRangeNode prunable (or convert to other node types) + return self + # Input Nodex +# TODO: Most leaf nodes produce fixed column names based on the datasource +# They should support renaming @dataclass(frozen=True) class LeafNode(BigFrameNode): @property @@ -356,24 +440,37 @@ def row_count(self) -> typing.Optional[int]: return None +class ScanItem(typing.NamedTuple): + id: bfet_ids.ColumnId + dtype: bigframes.dtypes.Dtype # Might be multiple logical types for a given physical source type + source_id: str # Flexible enough for both local data and bq data + + +@dataclass(frozen=True) +class ScanList: + items: typing.Tuple[ScanItem, ...] + + @dataclass(frozen=True) class ReadLocalNode(LeafNode): feather_bytes: bytes data_schema: schemata.ArraySchema n_rows: int + # Mapping of local ids to bfet id. + scan_list: ScanList session: typing.Optional[bigframes.session.Session] = None def __hash__(self): return self._node_hash @functools.cached_property - def schema(self) -> schemata.ArraySchema: - return self.data_schema + def fields(self) -> Tuple[Field, ...]: + return tuple(Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) @functools.cached_property def variables_introduced(self) -> int: """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" - return len(self.schema.items) + 1 + return len(self.scan_list.items) + 1 @property def supports_fast_head(self) -> bool: @@ -391,6 +488,18 @@ def explicitly_ordered(self) -> bool: def row_count(self) -> typing.Optional[int]: return self.n_rows + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + new_scan_list = ScanList( + tuple(item for item in self.scan_list.items if item.id in used_cols) + ) + return ReadLocalNode( + self.feather_bytes, + self.data_schema, + self.n_rows, + new_scan_list, + self.session, + ) + @dataclass(frozen=True) class GbqTable: @@ -402,12 +511,17 @@ class GbqTable: cluster_cols: typing.Optional[Tuple[str, ...]] @staticmethod - def from_table(table: bq.Table) -> GbqTable: + def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable: + # Subsetting fields with columns can reduce cost of row-hash default ordering + if columns: + schema = tuple(item for item in table.schema if item.name in columns) + else: + schema = tuple(table.schema) return GbqTable( project_id=table.project, dataset_id=table.dataset_id, table_id=table.table_id, - physical_schema=tuple(table.schema), + physical_schema=schema, n_rows=table.num_rows, cluster_cols=None if table.clustering_fields is None @@ -415,32 +529,40 @@ def from_table(table: bq.Table) -> GbqTable: ) -## Put ordering in here or just add order_by node above? @dataclass(frozen=True) -class ReadTableNode(LeafNode): - table: GbqTable - # Subset of physical schema columns, with chosen BQ types - columns: schemata.ArraySchema = field() +class BigqueryDataSource: + """ + Google BigQuery Data source. - table_session: bigframes.session.Session = field() - # Empty tuple if no primary key (primary key can be any set of columns that together form a unique key) - # Empty if no known unique key - total_order_cols: Tuple[str, ...] 
= field()
-    # indicates a primary key that is exactly offsets 0, 1, 2, ..., N-2, N-1
-    order_col_is_sequential: bool = False
+
+    This should not be modified once defined, as all attributes contribute to the default ordering.
+    """
+
+    table: GbqTable
     at_time: typing.Optional[datetime.datetime] = None
     # Added for backwards compatibility, not validated
     sql_predicate: typing.Optional[str] = None
+    ordering: typing.Optional[orderings.RowOrdering] = None
+
+
+## Put ordering in here or just add order_by node above?
+@dataclass(frozen=True)
+class ReadTableNode(LeafNode):
+    source: BigqueryDataSource
+    # Subset of physical schema columns
+    # Mapping of table schema ids to bfet id.
+    scan_list: ScanList
+
+    table_session: bigframes.session.Session = field()
 
     def __post_init__(self):
         # enforce invariants
-        physical_names = set(map(lambda i: i.name, self.table.physical_schema))
-        if not set(self.columns.names).issubset(physical_names):
+        physical_names = set(map(lambda i: i.name, self.source.table.physical_schema))
+        if not set(scan.source_id for scan in self.scan_list.items).issubset(
+            physical_names
+        ):
             raise ValueError(
-                f"Requested schema {self.columns} cannot be derived from table schemal {self.table.physical_schema}"
+                f"Requested schema {self.scan_list} cannot be derived from table schema {self.source.table.physical_schema}"
             )
-        if self.order_col_is_sequential and len(self.total_order_cols) != 1:
-            raise ValueError("Sequential primary key must have only one component")
 
     @property
     def session(self):
@@ -449,9 +571,9 @@ def session(self):
     def __hash__(self):
         return self._node_hash
 
-    @property
-    def schema(self) -> schemata.ArraySchema:
-        return self.columns
+    @functools.cached_property
+    def fields(self) -> Tuple[Field, ...]:
+        return tuple(Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items)
 
     @property
     def relation_ops_created(self) -> int:
@@ -463,102 +585,57 @@ def supports_fast_head(self) -> bool:
         # Fast head is only supported when row offsets are available.
         # In the future, ORDER BY+LIMIT optimizations may allow fast head when
        # clustered and/or partitioned on ordering key
-        return self.order_col_is_sequential
+        return (self.source.ordering is not None) and self.source.ordering.is_sequential
 
     @property
     def order_ambiguous(self) -> bool:
-        return len(self.total_order_cols) == 0
+        return (
+            self.source.ordering is None
+        ) or not self.source.ordering.is_total_ordering
 
     @property
     def explicitly_ordered(self) -> bool:
-        return len(self.total_order_cols) > 0
+        return self.source.ordering is not None
 
     @functools.cached_property
     def variables_introduced(self) -> int:
-        return len(self.schema.items) + 1
+        return len(self.scan_list.items) + 1
 
     @property
     def row_count(self) -> typing.Optional[int]:
-        if self.sql_predicate is None:
-            return self.table.n_rows
+        if self.source.sql_predicate is None:
+            return self.source.table.n_rows
         return None
 
+    def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
+        new_scan_list = ScanList(
+            tuple(item for item in self.scan_list.items if item.id in used_cols)
+        )
+        return ReadTableNode(self.source, new_scan_list, self.table_session)
+
 
-# This node shouldn't be used in the "original" expression tree, only used as replacement for original during planning
 @dataclass(frozen=True)
-class CachedTableNode(LeafNode):
+class CachedTableNode(ReadTableNode):
     # The original BFET subtree that was cached
     # note: this isn't a "child" node.
original_node: BigFrameNode = field() - # reference to cached materialization of original_node - table: GbqTable - ordering: typing.Optional[orderings.RowOrdering] = field() - - def __post_init__(self): - # enforce invariants - physical_names = set(map(lambda i: i.name, self.table.physical_schema)) - logical_names = self.original_node.schema.names - if not set(logical_names).issubset(physical_names): - raise ValueError( - f"Requested schema {logical_names} cannot be derived from table schema {self.table.physical_schema}" - ) - if not set(self._hidden_columns).issubset(physical_names): - raise ValueError( - f"Requested hidden columns {self._hidden_columns} cannot be derived from table schema {self.table.physical_schema}" - ) - - @property - def session(self): - return self.original_node.session def __hash__(self): return self._node_hash - @property - def schema(self) -> schemata.ArraySchema: - return self.original_node.schema - - @functools.cached_property - def variables_introduced(self) -> int: - return len(self.schema.items) + OVERHEAD_VARIABLES - - @property - def _hidden_columns(self) -> typing.Tuple[str, ...]: - """Physical columns used to define ordering but not directly exposed as value columns.""" - if self.ordering is None: - return () - return tuple( - col - for col in sorted(self.ordering.referenced_columns) - if col not in self.schema.names + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + new_scan_list = ScanList( + tuple(item for item in self.scan_list.items if item.id in used_cols) + ) + return CachedTableNode( + self.source, new_scan_list, self.table_session, self.original_node ) - - @property - def supports_fast_head(self) -> bool: - # Fast head is only supported when row offsets are available. - # In the future, ORDER BY+LIMIT optimizations may allow fast head when - # clustered and/or partitioned on ordering key - return (self.ordering is None) or self.ordering.is_sequential - - @property - def order_ambiguous(self) -> bool: - return not isinstance(self.ordering, orderings.TotalOrdering) - - @property - def explicitly_ordered(self) -> bool: - return (self.ordering is not None) and len( - self.ordering.all_ordering_columns - ) > 0 - - @property - def row_count(self) -> typing.Optional[int]: - return self.table.n_rows # Unary nodes @dataclass(frozen=True) class PromoteOffsetsNode(UnaryNode): - col_id: str + col_id: bigframes.core.identifiers.ColumnId def __hash__(self): return self._node_hash @@ -568,10 +645,8 @@ def non_local(self) -> bool: return True @property - def schema(self) -> schemata.ArraySchema: - return self.child.schema.append( - schemata.SchemaItem(self.col_id, bigframes.dtypes.INT_DTYPE) - ) + def fields(self) -> Tuple[Field, ...]: + return (*self.child.fields, Field(self.col_id, bigframes.dtypes.INT_DTYPE)) @property def relation_ops_created(self) -> int: @@ -581,6 +656,13 @@ def relation_ops_created(self) -> int: def variables_introduced(self) -> int: return 1 + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + if self.col_id not in used_cols: + return self.child.prune(used_cols) + else: + new_used = used_cols.difference([self.col_id]) + return self.transform_children(lambda x: x.prune(new_used)) + @dataclass(frozen=True) class FilterNode(UnaryNode): @@ -597,20 +679,16 @@ def __hash__(self): def variables_introduced(self) -> int: return 1 + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + consumed_ids = used_cols.union(self.predicate.column_references) + pruned_child = self.child.prune(consumed_ids) + return 
FilterNode(pruned_child, self.predicate) + @dataclass(frozen=True) class OrderByNode(UnaryNode): by: Tuple[OrderingExpression, ...] - def __post_init__(self): - available_variables = self.child.schema.names - for order_expr in self.by: - for variable in order_expr.scalar_expression.unbound_variables: - if variable not in available_variables: - raise ValueError( - f"Cannot over unknown id:{variable}, columns are {available_variables}" - ) - def __hash__(self): return self._node_hash @@ -627,6 +705,14 @@ def relation_ops_created(self) -> int: def explicitly_ordered(self) -> bool: return True + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + ordering_cols = itertools.chain.from_iterable( + map(lambda x: x.referenced_columns, self.by) + ) + consumed_ids = used_cols.union(ordering_cols) + pruned_child = self.child.prune(consumed_ids) + return OrderByNode(pruned_child, self.by) + @dataclass(frozen=True) class ReversedNode(UnaryNode): @@ -648,23 +734,19 @@ def relation_ops_created(self) -> int: @dataclass(frozen=True) class SelectionNode(UnaryNode): - input_output_pairs: typing.Tuple[typing.Tuple[str, str], ...] - - def __post_init__(self): - for input, _ in self.input_output_pairs: - assert input in self.child.schema.names + input_output_pairs: typing.Tuple[ + typing.Tuple[ex.DerefOp, bigframes.core.identifiers.ColumnId], ... + ] def __hash__(self): return self._node_hash @functools.cached_property - def schema(self) -> schemata.ArraySchema: - input_types = self.child.schema._mapping - items = tuple( - schemata.SchemaItem(output, input_types[input]) + def fields(self) -> Tuple[Field, ...]: + return tuple( + Field(output, self.child.get_type(input.id)) for input, output in self.input_output_pairs ) - return schemata.ArraySchema(items) @property def variables_introduced(self) -> int: @@ -678,15 +760,26 @@ def variables_introduced(self) -> int: def defines_namespace(self) -> bool: return True + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + pruned_selections = tuple( + select for select in self.input_output_pairs if select[1] in used_cols + ) + consumed_ids = frozenset(i[0].id for i in pruned_selections) + + pruned_child = self.child.prune(consumed_ids) + return SelectionNode(pruned_child, pruned_selections) + @dataclass(frozen=True) class ProjectionNode(UnaryNode): """Assigns new variables (without modifying existing ones)""" - assignments: typing.Tuple[typing.Tuple[ex.Expression, str], ...] + assignments: typing.Tuple[ + typing.Tuple[ex.Expression, bigframes.core.identifiers.ColumnId], ... 
+ ] def __post_init__(self): - input_types = self.child.schema._mapping + input_types = self.child._dtype_lookup for expression, id in self.assignments: # throws TypeError if invalid _ = expression.output_type(input_types) @@ -697,18 +790,13 @@ def __hash__(self): return self._node_hash @functools.cached_property - def schema(self) -> schemata.ArraySchema: - input_types = self.child.schema._mapping - items = tuple( - schemata.SchemaItem( - id, bigframes.dtypes.dtype_for_etype(ex.output_type(input_types)) - ) + def fields(self) -> Tuple[Field, ...]: + input_types = self.child._dtype_lookup + new_fields = ( + Field(id, bigframes.dtypes.dtype_for_etype(ex.output_type(input_types))) for ex, id in self.assignments ) - schema = self.child.schema - for item in items: - schema = schema.append(item) - return schema + return (*self.child.fields, *new_fields) @property def variables_introduced(self) -> int: @@ -716,6 +804,16 @@ def variables_introduced(self) -> int: new_vars = sum(1 for i in self.assignments if not i[0].is_identity) return new_vars + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + pruned_assignments = tuple(i for i in self.assignments if i[1] in used_cols) + if len(pruned_assignments) == 0: + return self.child.prune(used_cols) + consumed_ids = itertools.chain.from_iterable( + i[0].column_references for i in pruned_assignments + ) + pruned_child = self.child.prune(used_cols.union(consumed_ids)) + return ProjectionNode(pruned_child, pruned_assignments) + # TODO: Merge RowCount into Aggregate Node? # Row count can be compute from table metadata sometimes, so it is a bit special. @@ -730,10 +828,8 @@ def non_local(self) -> bool: return True @functools.cached_property - def schema(self) -> schemata.ArraySchema: - return schemata.ArraySchema( - (schemata.SchemaItem("count", bigframes.dtypes.INT_DTYPE),) - ) + def fields(self) -> Tuple[Field, ...]: + return (Field(bfet_ids.ColumnId("count"), bigframes.dtypes.INT_DTYPE),) @property def variables_introduced(self) -> int: @@ -746,8 +842,10 @@ def defines_namespace(self) -> bool: @dataclass(frozen=True) class AggregateNode(UnaryNode): - aggregations: typing.Tuple[typing.Tuple[ex.Aggregation, str], ...] - by_column_ids: typing.Tuple[str, ...] = tuple([]) + aggregations: typing.Tuple[ + typing.Tuple[ex.Aggregation, bigframes.core.identifiers.ColumnId], ... + ] + by_column_ids: typing.Tuple[ex.DerefOp, ...] 
= tuple([]) dropna: bool = True @property @@ -762,19 +860,20 @@ def non_local(self) -> bool: return True @functools.cached_property - def schema(self) -> schemata.ArraySchema: - by_items = tuple( - schemata.SchemaItem(id, self.child.schema.get_type(id)) - for id in self.by_column_ids + def fields(self) -> Tuple[Field, ...]: + by_items = ( + Field(ref.id, self.child.get_type(ref.id)) for ref in self.by_column_ids ) - input_types = self.child.schema._mapping - agg_items = tuple( - schemata.SchemaItem( - id, bigframes.dtypes.dtype_for_etype(agg.output_type(input_types)) + agg_items = ( + Field( + id, + bigframes.dtypes.dtype_for_etype( + agg.output_type(self.child._dtype_lookup) + ), ) for agg, id in self.aggregations ) - return schemata.ArraySchema(tuple([*by_items, *agg_items])) + return (*by_items, *agg_items) @property def variables_introduced(self) -> int: @@ -792,13 +891,23 @@ def explicitly_ordered(self) -> bool: def defines_namespace(self) -> bool: return True + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + by_ids = (ref.id for ref in self.by_column_ids) + pruned_aggs = tuple(agg for agg in self.aggregations if agg[1] in used_cols) + agg_inputs = itertools.chain.from_iterable( + agg.column_references for agg, _ in pruned_aggs + ) + consumed_ids = frozenset(itertools.chain(by_ids, agg_inputs)) + pruned_child = self.child.prune(consumed_ids) + return AggregateNode(pruned_child, pruned_aggs, self.by_column_ids, self.dropna) + @dataclass(frozen=True) class WindowOpNode(UnaryNode): - column_name: str + column_name: ex.DerefOp op: agg_ops.UnaryWindowOp window_spec: window.WindowSpec - output_name: str + output_name: bigframes.core.identifiers.ColumnId never_skip_nulls: bool = False skip_reproject_unsafe: bool = False @@ -810,12 +919,10 @@ def non_local(self) -> bool: return True @functools.cached_property - def schema(self) -> schemata.ArraySchema: - input_type = self.child.schema.get_type(self.column_name) + def fields(self) -> Tuple[Field, ...]: + input_type = self.child.get_type(self.column_name.id) new_item_dtype = self.op.output_type(input_type) - return self.child.schema.append( - schemata.SchemaItem(self.output_name, new_item_dtype) - ) + return (*self.child.fields, Field(self.output_name, new_item_dtype)) @property def variables_introduced(self) -> int: @@ -826,6 +933,14 @@ def relation_ops_created(self) -> int: # Assume that if not reprojecting, that there is a sequence of window operations sharing the same window return 0 if self.skip_reproject_unsafe else 4 + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + if self.output_name not in used_cols: + return self.child + consumed_ids = used_cols.difference([self.output_name]).union( + [self.column_name.id] + ) + return self.transform_children(lambda x: x.prune(consumed_ids)) + # TODO: Remove this op @dataclass(frozen=True) @@ -863,9 +978,10 @@ def variables_introduced(self) -> int: return 1 +# TODO: Explode should create a new column instead of overriding the existing one @dataclass(frozen=True) class ExplodeNode(UnaryNode): - column_ids: typing.Tuple[COL_OFFSET, ...] + column_ids: typing.Tuple[ex.DerefOp, ...] 
@property def row_preserving(self) -> bool: @@ -875,19 +991,18 @@ def __hash__(self): return self._node_hash @functools.cached_property - def schema(self) -> schemata.ArraySchema: - items = tuple( - schemata.SchemaItem( - name, + def fields(self) -> Tuple[Field, ...]: + return tuple( + Field( + field.id, bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - self.child.schema.get_type(name).pyarrow_dtype.value_type + self.child.get_type(field.id).pyarrow_dtype.value_type # type: ignore ), ) - if offset in self.column_ids - else schemata.SchemaItem(name, self.child.schema.get_type(name)) - for offset, name in enumerate(self.child.schema.names) + if field.id in set(map(lambda x: x.id, self.column_ids)) + else field + for field in self.child.fields ) - return schemata.ArraySchema(items) @property def relation_ops_created(self) -> int: @@ -900,3 +1015,9 @@ def variables_introduced(self) -> int: @property def defines_namespace(self) -> bool: return True + + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + # Cannot prune explode op + return self.transform_children( + lambda x: x.prune(used_cols.union(ref.id for ref in self.column_ids)) + ) diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index a57d7a18d6..8bba7d72b6 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -24,6 +24,7 @@ import ibis.expr.types as ibis_types import bigframes.core.expression as expression +import bigframes.core.identifiers as ids # TODO(tbergeron): Encode more efficiently ORDERING_ID_STRING_BASE: int = 10 @@ -54,16 +55,29 @@ class OrderingExpression: direction: OrderingDirection = OrderingDirection.ASC na_last: bool = True - def remap_names(self, mapping: Mapping[str, str]) -> OrderingExpression: - return OrderingExpression( - self.scalar_expression.rename(mapping), self.direction, self.na_last + @property + def referenced_columns(self) -> Set[ids.ColumnId]: + return set(self.scalar_expression.column_references) + + def remap_column_refs( + self, + mapping: Mapping[ids.ColumnId, ids.ColumnId], + allow_partial_bindings: bool = False, + ) -> OrderingExpression: + return self.bind_refs( + {old_id: expression.DerefOp(new_id) for old_id, new_id in mapping.items()}, + allow_partial_bindings=allow_partial_bindings, ) - def bind_variables( - self, mapping: Mapping[str, expression.Expression] + def bind_refs( + self, + mapping: Mapping[ids.ColumnId, expression.Expression], + allow_partial_bindings: bool = False, ) -> OrderingExpression: return OrderingExpression( - self.scalar_expression.bind_variables(mapping), + self.scalar_expression.bind_refs( + mapping, allow_partial_bindings=allow_partial_bindings + ), self.direction, self.na_last, ) @@ -106,11 +120,11 @@ def all_ordering_columns(self) -> Sequence[OrderingExpression]: return list(self.ordering_value_columns) @property - def referenced_columns(self) -> Set[str]: + def referenced_columns(self) -> Set[ids.ColumnId]: return set( col for part in self.ordering_value_columns - for col in part.scalar_expression.unbound_variables + for col in part.referenced_columns ) @property @@ -122,6 +136,10 @@ def is_string_encoded(self) -> bool: def is_sequential(self) -> bool: return self.integer_encoding.is_encoded and self.integer_encoding.is_sequential + @property + def is_total_ordering(self) -> bool: + return False + @property def total_order_col(self) -> Optional[OrderingExpression]: """Returns column id of columns that defines total ordering, if such as column exists""" @@ -133,9 +151,16 @@ def with_reverse(self) -> RowOrdering: 
tuple([col.with_reverse() for col in self.ordering_value_columns]), ) - def with_column_remap(self, mapping: typing.Mapping[str, str]) -> RowOrdering: + def remap_column_refs( + self, + mapping: typing.Mapping[ids.ColumnId, ids.ColumnId], + allow_partial_bindings: bool = False, + ) -> RowOrdering: new_value_columns = [ - col.remap_names(mapping) for col in self.all_ordering_columns + col.remap_column_refs( + mapping, allow_partial_bindings=allow_partial_bindings + ) + for col in self.all_ordering_columns ] return RowOrdering( tuple(new_value_columns), @@ -182,13 +207,13 @@ def _truncate_ordering( self, order_refs: tuple[OrderingExpression, ...] ) -> tuple[OrderingExpression, ...]: # Truncate once we refer to a full key in bijective operations - columns_seen: Set[str] = set() + columns_seen: Set[ids.ColumnId] = set() truncated_refs = [] for order_part in order_refs: expr = order_part.scalar_expression - if not set(expr.unbound_variables).issubset(columns_seen): + if not set(expr.column_references).issubset(columns_seen): if expr.is_bijective: - columns_seen.update(expr.unbound_variables) + columns_seen.update(expr.column_references) truncated_refs.append(order_part) return tuple(truncated_refs) @@ -200,16 +225,31 @@ class TotalOrdering(RowOrdering): # A table has a total ordering defined by the identities of a set of 1 or more columns. # These columns must always be part of the ordering, in order to guarantee that the ordering is total. # Therefore, any modifications(or drops) done to these columns must result in hidden copies being made. - total_ordering_columns: frozenset[str] = field(default_factory=frozenset) + total_ordering_columns: frozenset[expression.DerefOp] = field( + default_factory=frozenset + ) @classmethod def from_offset_col(cls, col: str) -> TotalOrdering: return TotalOrdering( (ascending_over(col),), integer_encoding=IntegerEncoding(True, is_sequential=True), - total_ordering_columns=frozenset({col}), + total_ordering_columns=frozenset({expression.deref(col)}), + ) + + @classmethod + def from_primary_key(cls, primary_key: Sequence[str]) -> TotalOrdering: + return TotalOrdering( + tuple(ascending_over(col) for col in primary_key), + total_ordering_columns=frozenset( + {expression.deref(col) for col in primary_key} + ), ) + @property + def is_total_ordering(self) -> bool: + return True + def with_non_sequential(self): """Create a copy that is marked as non-sequential. @@ -253,14 +293,14 @@ def _truncate_ordering( self, order_refs: tuple[OrderingExpression, ...] 
) -> tuple[OrderingExpression, ...]: # Truncate once we refer to a full key in bijective operations - must_see = set(self.total_ordering_columns) - columns_seen: Set[str] = set() + must_see = set(ref.id for ref in self.total_ordering_columns) + columns_seen: Set[ids.ColumnId] = set() truncated_refs = [] for order_part in order_refs: expr = order_part.scalar_expression - if not set(expr.unbound_variables).issubset(columns_seen): + if not set(expr.column_references).issubset(columns_seen): if expr.is_bijective: - columns_seen.update(expr.unbound_variables) + columns_seen.update(expr.column_references) truncated_refs.append(order_part) if columns_seen.issuperset(must_see): return tuple(truncated_refs) @@ -275,12 +315,20 @@ def with_reverse(self): total_ordering_columns=self.total_ordering_columns, ) - def with_column_remap(self, mapping: typing.Mapping[str, str]): + def remap_column_refs( + self, + mapping: typing.Mapping[ids.ColumnId, ids.ColumnId], + allow_partial_bindings: bool = False, + ): new_value_columns = [ - col.remap_names(mapping) for col in self.all_ordering_columns + col.remap_column_refs( + mapping, allow_partial_bindings=allow_partial_bindings + ) + for col in self.all_ordering_columns ] new_total_order = frozenset( - mapping.get(col_id, col_id) for col_id in self.total_ordering_columns + expression.DerefOp(mapping.get(col_id.id, col_id.id)) + for col_id in self.total_ordering_columns ) return TotalOrdering( tuple(new_value_columns), @@ -326,12 +374,12 @@ def reencode_order_string( # Convenience functions def ascending_over(id: str, nulls_last: bool = True) -> OrderingExpression: - return OrderingExpression(expression.free_var(id), na_last=nulls_last) + return OrderingExpression(expression.deref(id), na_last=nulls_last) def descending_over(id: str, nulls_last: bool = True) -> OrderingExpression: return OrderingExpression( - expression.free_var(id), direction=OrderingDirection.DESC, na_last=nulls_last + expression.deref(id), direction=OrderingDirection.DESC, na_last=nulls_last ) @@ -339,8 +387,8 @@ def descending_over(id: str, nulls_last: bool = True) -> OrderingExpression: def join_orderings( left: TotalOrdering, right: TotalOrdering, - left_id_mapping: Mapping[str, str], - right_id_mapping: Mapping[str, str], + left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], + right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], left_order_dominates: bool = True, ) -> TotalOrdering: ... @@ -350,8 +398,8 @@ def join_orderings( def join_orderings( left: RowOrdering, right: RowOrdering, - left_id_mapping: Mapping[str, str], - right_id_mapping: Mapping[str, str], + left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], + right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], left_order_dominates: bool = True, ) -> RowOrdering: ... 
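Aside on the ordering refactor in this file: the new `TotalOrdering.from_primary_key` constructor and the `ColumnId`-keyed `remap_column_refs` replace the old string-based `with_column_remap`. The following is a minimal, purely illustrative sketch of how these internal helpers fit together, not part of the change itself; it assumes the APIs exactly as introduced in this diff, and the column names are made up.

    import bigframes.core.identifiers as ids
    import bigframes.core.ordering as orderings

    # Build a total ordering over two hypothetical primary-key columns.
    ordering = orderings.TotalOrdering.from_primary_key(["user_id", "event_ts"])
    assert ordering.is_total_ordering

    # Renames are expressed as ColumnId -> ColumnId mappings, so the ordering
    # expressions are re-bound to new identifiers rather than string-substituted.
    remapped = ordering.remap_column_refs(
        {
            ids.ColumnId("user_id"): ids.ColumnId("user_id_1"),
            ids.ColumnId("event_ts"): ids.ColumnId("event_ts_1"),
        }
    )

    # The total-ordering key set now points at the renamed columns.
    print({ref.id.name for ref in remapped.total_ordering_columns})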
@@ -360,15 +408,15 @@ def join_orderings( def join_orderings( left: RowOrdering, right: RowOrdering, - left_id_mapping: Mapping[str, str], - right_id_mapping: Mapping[str, str], + left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], + right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], left_order_dominates: bool = True, ) -> RowOrdering: left_ordering_refs = [ - ref.remap_names(left_id_mapping) for ref in left.all_ordering_columns + ref.remap_column_refs(left_id_mapping) for ref in left.all_ordering_columns ] right_ordering_refs = [ - ref.remap_names(right_id_mapping) for ref in right.all_ordering_columns + ref.remap_column_refs(right_id_mapping) for ref in right.all_ordering_columns ] if left_order_dominates: joined_refs = [*left_ordering_refs, *right_ordering_refs] @@ -377,14 +425,16 @@ def join_orderings( if isinstance(left, TotalOrdering) and isinstance(right, TotalOrdering): left_total_order_cols = frozenset( - [left_id_mapping[id] for id in left.total_ordering_columns] + [left_id_mapping[ref.id] for ref in left.total_ordering_columns] ) right_total_order_cols = frozenset( - [right_id_mapping[id] for id in right.total_ordering_columns] + [right_id_mapping[ref.id] for ref in right.total_ordering_columns] ) return TotalOrdering( ordering_value_columns=tuple(joined_refs), - total_ordering_columns=left_total_order_cols | right_total_order_cols, + total_ordering_columns=frozenset( + map(expression.DerefOp, left_total_order_cols | right_total_order_cols) + ), ) else: return RowOrdering(tuple(joined_refs)) diff --git a/bigframes/core/pruning.py b/bigframes/core/pruning.py index 55165a616c..2542c8b6f0 100644 --- a/bigframes/core/pruning.py +++ b/bigframes/core/pruning.py @@ -12,11 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Set, TYPE_CHECKING + import bigframes.core.expression as ex -import bigframes.core.schema as schemata +import bigframes.core.identifiers as ids +import bigframes.core.nodes import bigframes.dtypes import bigframes.operations as ops +if TYPE_CHECKING: + import bigframes.core.nodes + + LOW_CARDINALITY_TYPES = [bigframes.dtypes.BOOL_DTYPE] COMPARISON_OP_TYPES = tuple( @@ -34,11 +41,11 @@ def cluster_cols_for_predicate( - predicate: ex.Expression, schema: schemata.ArraySchema -) -> list[str]: + predicate: ex.Expression, clusterable_cols: Set[ids.ColumnId] +) -> list[ids.ColumnId]: """Try to determine cluster col candidates that work with given predicates.""" # TODO: Prioritize based on predicted selectivity (eg. 
equality conditions are probably very selective) - if isinstance(predicate, ex.UnboundVariableExpression): + if isinstance(predicate, ex.DerefOp): cols = [predicate.id] elif isinstance(predicate, ex.OpExpression): op = predicate.op @@ -47,31 +54,33 @@ def cluster_cols_for_predicate( if isinstance(op, COMPARISON_OP_TYPES): cols = cluster_cols_for_comparison(predicate.inputs[0], predicate.inputs[1]) elif isinstance(op, (type(ops.invert_op))): - cols = cluster_cols_for_predicate(predicate.inputs[0], schema) + cols = cluster_cols_for_predicate(predicate.inputs[0], clusterable_cols) elif isinstance(op, (type(ops.and_op), type(ops.or_op))): - left_cols = cluster_cols_for_predicate(predicate.inputs[0], schema) - right_cols = cluster_cols_for_predicate(predicate.inputs[1], schema) + left_cols = cluster_cols_for_predicate( + predicate.inputs[0], clusterable_cols + ) + right_cols = cluster_cols_for_predicate( + predicate.inputs[1], clusterable_cols + ) cols = [*left_cols, *[col for col in right_cols if col not in left_cols]] else: cols = [] else: # Constant cols = [] - return [ - col for col in cols if bigframes.dtypes.is_clusterable(schema.get_type(col)) - ] + return [col for col in cols if col in clusterable_cols] def cluster_cols_for_comparison( left_ex: ex.Expression, right_ex: ex.Expression -) -> list[str]: +) -> list[ids.ColumnId]: # TODO: Try to normalize expressions such that one side is a single variable. # eg. Convert -cola>=3 to cola<-3 and colb+3 < 4 to colb < 1 if left_ex.is_const: # There are some invertible ops that would also be ok - if isinstance(right_ex, ex.UnboundVariableExpression): + if isinstance(right_ex, ex.DerefOp): return [right_ex.id] elif right_ex.is_const: - if isinstance(left_ex, ex.UnboundVariableExpression): + if isinstance(left_ex, ex.DerefOp): return [left_ex.id] return [] diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 0e73166ea5..095f537c21 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -19,12 +19,13 @@ from typing import Mapping, Optional, Sequence, Tuple import bigframes.core.expression as scalar_exprs +import bigframes.core.identifiers as ids import bigframes.core.join_def as join_defs import bigframes.core.nodes as nodes import bigframes.core.ordering as order import bigframes.operations as ops -Selection = Tuple[Tuple[scalar_exprs.Expression, str], ...] +Selection = Tuple[Tuple[scalar_exprs.Expression, ids.ColumnId], ...] REWRITABLE_NODE_TYPES = ( nodes.SelectionNode, @@ -40,7 +41,7 @@ class SquashedSelect: """Squash nodes together until target node, separating out the projection, filter and reordering expressions.""" root: nodes.BigFrameNode - columns: Tuple[Tuple[scalar_exprs.Expression, str], ...] + columns: Tuple[Tuple[scalar_exprs.Expression, ids.ColumnId], ...] predicate: Optional[scalar_exprs.Expression] ordering: Tuple[order.OrderingExpression, ...] 
reverse_root: bool = False @@ -51,8 +52,7 @@ def from_node_span( ) -> SquashedSelect: if node == target: selection = tuple( - (scalar_exprs.UnboundVariableExpression(id), id) - for id in get_node_column_ids(node) + (scalar_exprs.DerefOp(id), id) for id in get_node_column_ids(node) ) return cls(node, selection, None, ()) @@ -72,13 +72,15 @@ def from_node_span( raise ValueError(f"Cannot rewrite node {node}") @property - def column_lookup(self) -> Mapping[str, scalar_exprs.Expression]: + def column_lookup(self) -> Mapping[ids.ColumnId, scalar_exprs.Expression]: return {col_id: expr for expr, col_id in self.columns} - def select(self, input_output_pairs: Tuple[Tuple[str, str], ...]) -> SquashedSelect: + def select( + self, input_output_pairs: Tuple[Tuple[scalar_exprs.DerefOp, ids.ColumnId], ...] + ) -> SquashedSelect: new_columns = tuple( ( - scalar_exprs.free_var(input).bind_variables(self.column_lookup), + input.bind_refs(self.column_lookup), output, ) for input, output in input_output_pairs @@ -88,11 +90,11 @@ def select(self, input_output_pairs: Tuple[Tuple[str, str], ...]) -> SquashedSel ) def project( - self, projection: Tuple[Tuple[scalar_exprs.Expression, str], ...] + self, projection: Tuple[Tuple[scalar_exprs.Expression, ids.ColumnId], ...] ) -> SquashedSelect: existing_columns = self.columns new_columns = tuple( - (expr.bind_variables(self.column_lookup), id) for expr, id in projection + (expr.bind_refs(self.column_lookup), id) for expr, id in projection ) return SquashedSelect( self.root, @@ -104,10 +106,10 @@ def project( def filter(self, predicate: scalar_exprs.Expression) -> SquashedSelect: if self.predicate is None: - new_predicate = predicate.bind_variables(self.column_lookup) + new_predicate = predicate.bind_refs(self.column_lookup) else: new_predicate = ops.and_op.as_expr( - self.predicate, predicate.bind_variables(self.column_lookup) + self.predicate, predicate.bind_refs(self.column_lookup) ) return SquashedSelect( self.root, self.columns, new_predicate, self.ordering, self.reverse_root @@ -121,7 +123,7 @@ def reverse(self) -> SquashedSelect: def order_with(self, by: Tuple[order.OrderingExpression, ...]): adjusted_orderings = [ - order_part.bind_variables(self.column_lookup) for order_part in by + order_part.bind_refs(self.column_lookup) for order_part in by ] new_ordering = (*adjusted_orderings, *self.ordering) return SquashedSelect( @@ -134,8 +136,8 @@ def can_merge( join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], ) -> bool: """Determines whether the two selections can be merged into a single selection.""" - r_exprs_by_id = {id: expr for expr, id in right.columns} - l_exprs_by_id = {id: expr for expr, id in self.columns} + r_exprs_by_id = {id.name: expr for expr, id in right.columns} + l_exprs_by_id = {id.name: expr for expr, id in self.columns} l_join_exprs = [ l_exprs_by_id[join_key.left_source_id] for join_key in join_keys ] @@ -227,7 +229,7 @@ def expand(self) -> nodes.BigFrameNode: root = nodes.FilterNode(child=root, predicate=self.predicate) if self.ordering: root = nodes.OrderByNode(child=root, by=self.ordering) - selection = tuple((id, id) for _, id in self.columns) + selection = tuple((scalar_exprs.DerefOp(id), id) for _, id in self.columns) return nodes.SelectionNode( child=nodes.ProjectionNode(child=root, assignments=self.columns), input_output_pairs=selection, @@ -266,14 +268,15 @@ def merge_expressions( rmask: Optional[scalar_exprs.Expression], ) -> Selection: new_selection: Selection = tuple() - l_exprs_by_id = {id: expr for expr, id in lselection} - 
r_exprs_by_id = {id: expr for expr, id in rselection} + # Assumption is simple ids + l_exprs_by_id = {id.name: expr for expr, id in lselection} + r_exprs_by_id = {id.name: expr for expr, id in rselection} for key in join_keys: # Join keys expressions are equivalent on both sides, so can choose either left or right key assert l_exprs_by_id[key.left_source_id] == r_exprs_by_id[key.right_source_id] expr = l_exprs_by_id[key.left_source_id] id = key.destination_id - new_selection = (*new_selection, (expr, id)) + new_selection = (*new_selection, (expr, ids.ColumnId(id))) for mapping in mappings: if mapping.source_table == join_defs.JoinSide.LEFT: expr = l_exprs_by_id[mapping.source_id] @@ -283,7 +286,7 @@ def merge_expressions( expr = r_exprs_by_id[mapping.source_id] if rmask is not None: expr = apply_mask(expr, rmask) - new_selection = (*new_selection, (expr, mapping.destination_id)) + new_selection = (*new_selection, (expr, ids.ColumnId(mapping.destination_id))) return new_selection @@ -354,12 +357,8 @@ def decompose_conjunction( return (expr,) -def get_node_column_ids(node: nodes.BigFrameNode) -> Tuple[str, ...]: - # TODO: Convert to use node.schema once that has been merged - # Note: this actually compiles the node to get the schema - import bigframes.core - - return tuple(bigframes.core.ArrayValue(node).column_ids) +def get_node_column_ids(node: nodes.BigFrameNode) -> Tuple[ids.ColumnId, ...]: + return tuple(field.id for field in node.fields) def common_selection_root( diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index 03e4de8993..2b49f81d85 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -19,6 +19,7 @@ import typing import google.cloud.bigquery +import pyarrow import bigframes.core.guid import bigframes.dtypes @@ -64,6 +65,19 @@ def to_bigquery(self) -> typing.Tuple[google.cloud.bigquery.SchemaField, ...]: for item in self.items ) + def to_pyarrow(self) -> pyarrow.Schema: + fields = [] + for item in self.items: + pa_type = bigframes.dtypes.bigframes_dtype_to_arrow_dtype(item.dtype) + fields.append( + pyarrow.field( + item.column, + pa_type, + nullable=not pyarrow.types.is_list(pa_type), + ) + ) + return pyarrow.schema(fields) + def drop(self, columns: typing.Iterable[str]) -> ArraySchema: return ArraySchema( tuple(item for item in self.items if item.column not in columns) @@ -74,6 +88,14 @@ def select(self, columns: typing.Iterable[str]) -> ArraySchema: tuple(SchemaItem(name, self.get_type(name)) for name in columns) ) + def rename(self, mapping: typing.Mapping[str, str]) -> ArraySchema: + return ArraySchema( + tuple( + SchemaItem(mapping.get(item.column, item.column), item.dtype) + for item in self.items + ) + ) + def append(self, item: SchemaItem): return ArraySchema(tuple([*self.items, item])) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 528c9bcc74..e44091e7b1 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -110,10 +110,8 @@ def ordering_clause( if ordering_expr.is_const: # Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here. 
continue - assert isinstance( - ordering_expr, bigframes.core.expression.UnboundVariableExpression - ) - part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" + assert isinstance(ordering_expr, bigframes.core.expression.DerefOp) + part = f"`{ordering_expr.id.sql}` {asc_desc} {null_clause}" parts.append(part) return f"ORDER BY {' ,'.join(parts)}" diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index fb682c950e..2b45560b15 100644 --- a/bigframes/core/window/__init__.py +++ b/bigframes/core/window/__init__.py @@ -81,7 +81,7 @@ def _apply_aggregate( original_index_ids = block.index_columns block = block.reset_index(drop=False) index_ids = ( - *[col for col in self._window_spec.grouping_keys], + *[col.id.name for col in self._window_spec.grouping_keys], *original_index_ids, ) block = block.set_index(col_ids=index_ids) diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index 3d80afea5a..2b9ff65084 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -17,6 +17,8 @@ import itertools from typing import Optional, Set, Tuple, Union +import bigframes.core.expression as ex +import bigframes.core.identifiers as ids import bigframes.core.ordering as orderings @@ -41,7 +43,9 @@ def unbound( WindowSpec """ return WindowSpec( - grouping_keys=grouping_keys, min_periods=min_periods, ordering=ordering + grouping_keys=tuple(map(ex.deref, grouping_keys)), + min_periods=min_periods, + ordering=ordering, ) @@ -72,7 +76,7 @@ def rows( """ bounds = RowsWindowBounds(preceding=preceding, following=following) return WindowSpec( - grouping_keys=grouping_keys, + grouping_keys=tuple(map(ex.deref, grouping_keys)), bounds=bounds, min_periods=min_periods, ordering=ordering, @@ -95,7 +99,9 @@ def cumulative_rows( """ bounds = RowsWindowBounds(following=0) return WindowSpec( - grouping_keys=grouping_keys, bounds=bounds, min_periods=min_periods + grouping_keys=tuple(map(ex.deref, grouping_keys)), + bounds=bounds, + min_periods=min_periods, ) @@ -115,7 +121,9 @@ def inverse_cumulative_rows( """ bounds = RowsWindowBounds(preceding=0) return WindowSpec( - grouping_keys=grouping_keys, bounds=bounds, min_periods=min_periods + grouping_keys=tuple(map(ex.deref, grouping_keys)), + bounds=bounds, + min_periods=min_periods, ) @@ -148,7 +156,7 @@ class WindowSpec: ordering: List of columns ids and ordering direction to override base ordering """ - grouping_keys: Tuple[str, ...] = tuple() + grouping_keys: Tuple[ex.DerefOp, ...] = tuple() ordering: Tuple[orderings.OrderingExpression, ...] = tuple() bounds: Union[RowsWindowBounds, RangeWindowBounds, None] = None min_periods: int = 0 @@ -164,11 +172,11 @@ def row_bounded(self): return isinstance(self.bounds, RowsWindowBounds) @property - def all_referenced_columns(self) -> Set[str]: + def all_referenced_columns(self) -> Set[ids.ColumnId]: """ Return list of all variables reference ind the window. 
""" ordering_vars = itertools.chain.from_iterable( - item.scalar_expression.unbound_variables for item in self.ordering + item.scalar_expression.column_references for item in self.ordering ) - return set(itertools.chain(self.grouping_keys, ordering_vars)) + return set(itertools.chain((i.id for i in self.grouping_keys), ordering_vars)) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 47c93bfa30..d9f7cb9f42 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -499,7 +499,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ) return DataFrame(self._block.select_columns(selected_columns)) - def _set_internal_query_job(self, query_job: bigquery.QueryJob): + def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]): self._query_job = query_job def __getitem__( @@ -749,11 +749,11 @@ def _apply_scalar_binop( if reverse: expr = op.as_expr( left_input=ex.const(other), - right_input=bigframes.core.guid.generate_guid(), + right_input=ex.free_var("var1"), ) else: expr = op.as_expr( - left_input=bigframes.core.guid.generate_guid(), + left_input=ex.free_var("var1"), right_input=ex.const(other), ) return DataFrame( @@ -2937,9 +2937,9 @@ def sample( ns = (n,) if n is not None else () fracs = (frac,) if frac is not None else () return DataFrame( - self._block._split( - ns=ns, fracs=fracs, random_state=random_state, sort=sort - )[0] + self._block.split(ns=ns, fracs=fracs, random_state=random_state, sort=sort)[ + 0 + ] ) def explode( @@ -2976,9 +2976,91 @@ def _split( At most one of ns and fracs can be passed in. If neither, default to ns = (1,). Return a list of sampled DataFrames. """ - blocks = self._block._split(ns=ns, fracs=fracs, random_state=random_state) + blocks = self._block.split(ns=ns, fracs=fracs, random_state=random_state) return [DataFrame(block) for block in blocks] + @validations.requires_ordering() + def _resample( + self, + rule: str, + *, + on: blocks.Label = None, + level: Optional[LevelsType] = None, + origin: Union[ + Union[ + pandas.Timestamp, datetime.datetime, numpy.datetime64, int, float, str + ], + Literal["epoch", "start", "start_day", "end", "end_day"], + ] = "start_day", + ) -> bigframes.core.groupby.DataFrameGroupBy: + """Internal function to support resample. Resample time-series data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + + >>> data = { + ... "timestamp_col": pd.date_range( + ... start="2021-01-01 13:00:00", periods=30, freq="1s" + ... ), + ... "int64_col": range(30), + ... "int64_too": range(10, 40), + ... } + + Resample on a DataFrame with index: + + >>> df = bpd.DataFrame(data).set_index("timestamp_col") + >>> df._resample(rule="7s").min() + int64_col int64_too + 2021-01-01 12:59:55 0 10 + 2021-01-01 13:00:02 2 12 + 2021-01-01 13:00:09 9 19 + 2021-01-01 13:00:16 16 26 + 2021-01-01 13:00:23 23 33 + + [5 rows x 2 columns] + + Resample with column and origin set to 'start': + + >>> df = bpd.DataFrame(data) + >>> df._resample(rule="7s", on = "timestamp_col", origin="start").min() + int64_col int64_too + 2021-01-01 13:00:00 0 10 + 2021-01-01 13:00:07 7 17 + 2021-01-01 13:00:14 14 24 + 2021-01-01 13:00:21 21 31 + 2021-01-01 13:00:28 28 38 + + [5 rows x 2 columns] + + Args: + rule (str): + The offset string representing target conversion. + on (str, default None): + For a DataFrame, column to use instead of index for resampling. Column + must be datetime-like. 
+ level (str or int, default None): + For a MultiIndex, level (name or number) to use for resampling. + level must be datetime-like. + origin(str, default 'start_day'): + The timestamp on which to adjust the grouping. Must be one of the following: + 'epoch': origin is 1970-01-01 + 'start': origin is the first value of the timeseries + 'start_day': origin is the first day at midnight of the timeseries + Returns: + DataFrameGroupBy: DataFrameGroupBy object. + """ + block = self._block._generate_resample_label( + rule=rule, + on=on, + level=level, + origin=origin, + ) + df = DataFrame(block) + return df.groupby(level=0) + @classmethod def from_dict( cls, @@ -3156,7 +3238,7 @@ def to_gbq( default_project=default_project, ) ) - _, query_job = self._session._export( + query_job = self._session._executor.export_gbq( export_array, destination=destination, col_id_overrides=id_overrides, @@ -3567,11 +3649,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ops.NaryRemoteFunctionOp(func=func), series_list[1:] ) result_series.name = None - - # Return Series with materialized result so that any error in the remote - # function is caught early - materialized_series = result_series.cache() - return materialized_series + return result_series # Per-column apply results = {name: func(col, *args, **kwargs) for name, col in self.items()} diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 3cd2507231..bc5b89b779 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -356,6 +356,8 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: for mapping in SIMPLE_TYPES if mapping.arrow_dtype is not None } +# unidirectional mapping +_BIGFRAMES_TO_ARROW[GEO_DTYPE] = pa.string() def bigframes_dtype_to_arrow_dtype( @@ -382,10 +384,14 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]: as_arrow = bigframes_dtype_to_arrow_dtype(common_type) return pd.ArrowDtype(as_arrow) if pd.api.types.is_dict_like(literal): - fields = [ - (key, bigframes_dtype_to_arrow_dtype(infer_literal_type(literal[key]))) - for key in literal.keys() - ] + fields = [] + for key in literal.keys(): + field_type = bigframes_dtype_to_arrow_dtype( + infer_literal_type(literal[key]) + ) + fields.append( + pa.field(key, field_type, nullable=(not pa.types.is_list(field_type))) + ) return pd.ArrowDtype(pa.struct(fields)) if pd.isna(literal): return None # Null value without a definite type @@ -437,10 +443,13 @@ def convert_schema_field( is_repeated = field.mode == "REPEATED" if field.field_type == "RECORD": mapped_fields = map(convert_schema_field, field.fields) - pa_struct = pa.struct( - (name, bigframes_dtype_to_arrow_dtype(dtype)) - for name, dtype in mapped_fields - ) + fields = [] + for name, dtype in mapped_fields: + arrow_type = bigframes_dtype_to_arrow_dtype(dtype) + fields.append( + pa.field(name, arrow_type, nullable=not pa.types.is_list(arrow_type)) + ) + pa_struct = pa.struct(fields) pa_type = pa.list_(pa_struct) if is_repeated else pa_struct return field.name, pd.ArrowDtype(pa_type) elif ( diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 00abb887b0..462bdf2bdd 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -69,3 +69,7 @@ class AmbiguousWindowWarning(Warning): class UnknownDataTypeWarning(Warning): """Data type is unknown.""" + + +class ApiDeprecationWarning(FutureWarning): + """The API has been deprecated.""" diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index de0ae8cc68..63249b1a8a 100644 --- 
a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -112,6 +112,7 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): def wait_for_query_job( query_job: bigquery.QueryJob, max_results: Optional[int] = None, + page_size: Optional[int] = None, progress_bar: Optional[str] = None, ) -> bigquery.table.RowIterator: """Return query results. Displays a progress bar while the query is running @@ -120,6 +121,8 @@ def wait_for_query_job( The job representing the execution of the query on the server. max_results (int, Optional): The maximum number of rows the row iterator should return. + page_size (int, Optional): + The number of results to return on each results page. progress_bar (str, Optional): Which progress bar to show. Returns: @@ -133,7 +136,9 @@ def wait_for_query_job( display_id = str(random.random()) loading_bar = display.HTML(get_query_job_loading_html(query_job)) display.display(loading_bar, display_id=display_id) - query_result = query_job.result(max_results=max_results) + query_result = query_job.result( + max_results=max_results, page_size=page_size + ) query_job.reload() display.update_display( display.HTML(get_query_job_loading_html(query_job)), @@ -142,13 +147,17 @@ def wait_for_query_job( elif progress_bar == "terminal": initial_loading_bar = get_query_job_loading_string(query_job) print(initial_loading_bar) - query_result = query_job.result(max_results=max_results) + query_result = query_job.result( + max_results=max_results, page_size=page_size + ) query_job.reload() if initial_loading_bar != get_query_job_loading_string(query_job): print(get_query_job_loading_string(query_job)) else: # No progress bar. - query_result = query_job.result(max_results=max_results) + query_result = query_job.result( + max_results=max_results, page_size=page_size + ) query_job.reload() return query_result except api_core_exceptions.RetryError as exc: diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index b41426f7d7..02ccc9d6a5 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -24,6 +24,7 @@ import bigframes import bigframes.constants as constants +import bigframes.formatting_helpers as formatting_helpers from bigframes.ml import sql as ml_sql import bigframes.pandas as bpd @@ -233,7 +234,7 @@ def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel: copy_job = self._session.bqclient.copy_table( self.model_name, new_model_name, job_config=job_config ) - self._session._start_generic_job(copy_job) + _start_generic_job(copy_job) new_model = self._session.bqclient.get_model(new_model_name) return BqmlModel(self._session, new_model) @@ -479,3 +480,12 @@ def create_xgboost_imported_model( ) return self._create_model_with_sql(session=session, sql=sql) + + +def _start_generic_job(job: formatting_helpers.GenericJob): + if bigframes.options.display.progress_bar is not None: + formatting_helpers.wait_for_job( + job, bigframes.options.display.progress_bar + ) # Wait for the job to complete + else: + job.result() diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 3d11cd123e..c12da01b54 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -21,9 +21,10 @@ import bigframes_vendored.constants as constants from google.cloud import bigquery +import typing_extensions import bigframes -from bigframes import clients +from bigframes import clients, exceptions from bigframes.core import blocks, log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd @@ -83,10 +84,17 @@ _ML_GENERATE_EMBEDDING_STATUS 
= "ml_generate_embedding_status" +@typing_extensions.deprecated( + "PaLM2TextGenerator is going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. ", + category=exceptions.ApiDeprecationWarning, +) @log_adapter.class_logger class PaLM2TextGenerator(base.BaseEstimator): """PaLM2 text generator LLM model. + .. note:: + PaLM2TextGenerator is going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. + Args: model_name (str, Default to "text-bison"): The model for natural language tasks. “text-bison” returns model fine-tuned to follow natural language instructions @@ -403,12 +411,16 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: return new_model.session.read_gbq_model(model_name) +@typing_extensions.deprecated( + "PaLM2TextEmbeddingGenerator has been deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. ", + category=exceptions.ApiDeprecationWarning, +) @log_adapter.class_logger class PaLM2TextEmbeddingGenerator(base.BaseEstimator): """PaLM2 text embedding generator LLM model. .. note:: - Models in this class are outdated and going to be deprecated. To use the most updated text embedding models, go to the TextEmbeddingGenerator class. + PaLM2TextEmbeddingGenerator has been deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. Args: diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index e4c41b2a39..f9d7e6cf73 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -18,9 +18,11 @@ import inspect -from typing import cast, Generator, List, Union +import time +from typing import cast, Generator, List, Optional, Union import bigframes_vendored.sklearn.model_selection._split as vendored_model_selection_split +import bigframes_vendored.sklearn.model_selection._validation as vendored_model_selection_validation from bigframes.core import log_adapter from bigframes.ml import utils @@ -147,3 +149,37 @@ def split( yield utils.convert_to_types( [X_train, X_test, y_train, y_test], [X, X, y, y] ) + + +def cross_validate( + estimator, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series, None] = None, + *, + cv: Optional[Union[int, KFold]] = None, +) -> dict[str, list]: + if cv is None: + cv = KFold(n_splits=5) + elif isinstance(cv, int): + cv = KFold(n_splits=cv) + + result: dict[str, list] = {"test_score": [], "fit_time": [], "score_time": []} + for X_train, X_test, y_train, y_test in cv.split(X, y): # type: ignore + fit_start_time = time.perf_counter() + estimator.fit(X_train, y_train) + fit_time = time.perf_counter() - fit_start_time + + score_start_time = time.perf_counter() + score = estimator.score(X_test, y_test) + score_time = time.perf_counter() - score_start_time + + result["test_score"].append(score) + result["fit_time"].append(fit_time) + result["score_time"].append(score_time) + + return result + + +cross_validate.__doc__ = inspect.getdoc( + vendored_model_selection_validation.cross_validate +) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 55b8fa1802..63127a70de 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py 
@@ -15,12 +15,14 @@ from __future__ import annotations import dataclasses +import datetime import functools import typing from typing import Union import numpy as np import pandas as pd +from pandas.tseries.offsets import DateOffset import pyarrow as pa import bigframes.dtypes @@ -148,11 +150,11 @@ def as_expr( def _convert_expr_input( input: typing.Union[str, bigframes.core.expression.Expression] ) -> bigframes.core.expression.Expression: - """Allows creating free variables with just a string""" + """Allows creating column references with just a string""" import bigframes.core.expression if isinstance(input, str): - return bigframes.core.expression.UnboundVariableExpression(input) + return bigframes.core.expression.deref(input) else: return input @@ -589,6 +591,34 @@ def output_type(self, *input_types): return input_types[0] +@dataclasses.dataclass(frozen=True) +class DatetimeToIntegerLabelOp(BinaryOp): + name: typing.ClassVar[str] = "datetime_to_integer_label" + freq: DateOffset + closed: typing.Optional[typing.Literal["right", "left"]] + origin: Union[ + Union[pd.Timestamp, datetime.datetime, np.datetime64, int, float, str], + typing.Literal["epoch", "start", "start_day", "end", "end_day"], + ] + + def output_type(self, *input_types): + return dtypes.INT_DTYPE + + +@dataclasses.dataclass(frozen=True) +class IntegerLabelToDatetimeOp(BinaryOp): + name: typing.ClassVar[str] = "integer_label_to_datetime" + freq: DateOffset + label: typing.Optional[typing.Literal["right", "left"]] + origin: Union[ + Union[pd.Timestamp, datetime.datetime, np.datetime64, int, float, str], + typing.Literal["epoch", "start", "start_day", "end", "end_day"], + ] + + def output_type(self, *input_types): + return input_types[1] + + ## Array Ops @dataclasses.dataclass(frozen=True) class ArrayToStringOp(UnaryOp): @@ -879,10 +909,12 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT fields = [] for i in range(num_input_types): + arrow_type = dtypes.bigframes_dtype_to_arrow_dtype(input_types[i]) fields.append( - ( + pa.field( self.column_names[i], - dtypes.bigframes_dtype_to_arrow_dtype(input_types[i]), + arrow_type, + nullable=(not pa.types.is_list(arrow_type)), ) ) return pd.ArrowDtype( diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index faba7465d9..3e4e9d1df1 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -323,8 +323,8 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) pa_type = pa.struct( [ - ("left_exclusive", interval_dtype), - ("right_inclusive", interval_dtype), + pa.field("left_exclusive", interval_dtype, nullable=True), + pa.field("right_inclusive", interval_dtype, nullable=True), ] ) return pd.ArrowDtype(pa_type) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index f9a6a87b7a..8304f0070c 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -24,6 +24,7 @@ import bigframes.core.blocks as blocks import bigframes.core.convert import bigframes.core.expression as ex +import bigframes.core.identifiers as ids import bigframes.core.indexes as indexes import bigframes.core.scalar as scalars import bigframes.dtypes @@ -219,31 +220,27 @@ def _apply_binary_aggregation( self, other: series.Series, stat: agg_ops.BinaryAggregateOp ) -> float: (left, right, block) = self._align(other, how="outer") - assert isinstance(left, ex.UnboundVariableExpression) - assert isinstance(right, ex.UnboundVariableExpression) - return 
block.get_binary_stat(left.id, right.id, stat) + assert isinstance(left, ex.DerefOp) + assert isinstance(right, ex.DerefOp) + return block.get_binary_stat(left.id.name, right.id.name, stat) - AlignedExprT = Union[ex.ScalarConstantExpression, ex.UnboundVariableExpression] + AlignedExprT = Union[ex.ScalarConstantExpression, ex.DerefOp] @typing.overload def _align( self, other: series.Series, how="outer" - ) -> tuple[ - ex.UnboundVariableExpression, - ex.UnboundVariableExpression, - blocks.Block, - ]: + ) -> tuple[ex.DerefOp, ex.DerefOp, blocks.Block,]: ... @typing.overload def _align( self, other: typing.Union[series.Series, scalars.Scalar], how="outer" - ) -> tuple[ex.UnboundVariableExpression, AlignedExprT, blocks.Block,]: + ) -> tuple[ex.DerefOp, AlignedExprT, blocks.Block,]: ... def _align( self, other: typing.Union[series.Series, scalars.Scalar], how="outer" - ) -> tuple[ex.UnboundVariableExpression, AlignedExprT, blocks.Block,]: + ) -> tuple[ex.DerefOp, AlignedExprT, blocks.Block,]: """Aligns the series value with another scalar or series object. Returns new left column id, right column id and joined tabled expression.""" values, block = self._align_n( [ @@ -251,13 +248,13 @@ def _align( ], how, ) - return (typing.cast(ex.UnboundVariableExpression, values[0]), values[1], block) + return (typing.cast(ex.DerefOp, values[0]), values[1], block) - def _align3(self, other1: series.Series | scalars.Scalar, other2: series.Series | scalars.Scalar, how="left") -> tuple[ex.UnboundVariableExpression, AlignedExprT, AlignedExprT, blocks.Block]: # type: ignore + def _align3(self, other1: series.Series | scalars.Scalar, other2: series.Series | scalars.Scalar, how="left") -> tuple[ex.DerefOp, AlignedExprT, AlignedExprT, blocks.Block]: # type: ignore """Aligns the series value with 2 other scalars or series objects. Returns new values and joined tabled expression.""" values, index = self._align_n([other1, other2], how) return ( - typing.cast(ex.UnboundVariableExpression, values[0]), + typing.cast(ex.DerefOp, values[0]), values[1], values[2], index, @@ -270,17 +267,13 @@ def _align_n( ignore_self=False, cast_scalars: bool = True, ) -> tuple[ - typing.Sequence[ - Union[ex.ScalarConstantExpression, ex.UnboundVariableExpression] - ], + typing.Sequence[Union[ex.ScalarConstantExpression, ex.DerefOp]], blocks.Block, ]: if ignore_self: - value_ids: List[ - Union[ex.ScalarConstantExpression, ex.UnboundVariableExpression] - ] = [] + value_ids: List[Union[ex.ScalarConstantExpression, ex.DerefOp]] = [] else: - value_ids = [ex.free_var(self._value_column)] + value_ids = [ex.deref(self._value_column)] block = self._block for other in others: @@ -289,9 +282,16 @@ def _align_n( get_column_left, get_column_right, ) = block.join(other._block, how=how) + rebindings = { + ids.ColumnId(old): ids.ColumnId(new) + for old, new in get_column_left.items() + } + remapped_value_ids = ( + value.remap_column_refs(rebindings) for value in value_ids + ) value_ids = [ - *[value.rename(get_column_left) for value in value_ids], - ex.free_var(get_column_right[other._value_column]), + *remapped_value_ids, # type: ignore + ex.deref(get_column_right[other._value_column]), ] else: # Will throw if can't interpret as scalar. 
diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 1bdf49eaf5..98da6d826c 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -21,20 +21,7 @@ import inspect import sys import typing -from typing import ( - Any, - Callable, - Dict, - IO, - Iterable, - List, - Literal, - MutableSequence, - Optional, - Sequence, - Tuple, - Union, -) +from typing import Any, Iterable, List, Literal, Optional, Sequence, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat @@ -42,16 +29,7 @@ import bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes -import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq -from google.cloud import bigquery -import numpy import pandas -from pandas._typing import ( - CompressionOptions, - FilePath, - ReadPickleBuffer, - StorageOptions, -) import bigframes._config as config import bigframes.core.blocks @@ -65,6 +43,18 @@ import bigframes.enums import bigframes.functions._utils as functions_utils import bigframes.operations as ops +from bigframes.pandas.io.api import ( + read_csv, + read_gbq, + read_gbq_function, + read_gbq_model, + read_gbq_query, + read_gbq_table, + read_json, + read_pandas, + read_parquet, + read_pickle, +) import bigframes.series import bigframes.session import bigframes.session._io.bigquery @@ -373,286 +363,6 @@ def merge( merge.__doc__ = vendored_pandas_merge.merge.__doc__ -def _set_default_session_location_if_possible(query): - # Set the location as per the query if this is the first query the user is - # running and: - # (1) Default session has not started yet, and - # (2) Location is not set yet, and - # (3) Use of regional endpoints is not set. - # If query is a table name, then it would be the location of the table. - # If query is a SQL with a table, then it would be table's location. - # If query is a SQL with no table, then it would be the BQ default location. - if ( - options.bigquery._session_started - or options.bigquery.location - or options.bigquery.use_regional_endpoints - ): - return - - clients_provider = bigframes.session.clients.ClientsProvider( - project=options.bigquery.project, - location=options.bigquery.location, - use_regional_endpoints=options.bigquery.use_regional_endpoints, - credentials=options.bigquery.credentials, - application_name=options.bigquery.application_name, - bq_kms_key_name=options.bigquery.kms_key_name, - ) - - bqclient = clients_provider.bqclient - - if bigframes.session._io.bigquery.is_query(query): - # Intentionally run outside of the session so that we can detect the - # location before creating the session. Since it's a dry_run, labels - # aren't necessary. - job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) - options.bigquery.location = job.location - else: - table = bqclient.get_table(query) - options.bigquery.location = table.location - - -# Note: the following methods are duplicated from Session. This duplication -# enables the following: -# -# 1. Static type checking knows the argument and return types, which is -# difficult to do with decorators. Aside: When we require Python 3.10, we -# can use Concatenate for generic typing in decorators. See: -# https://stackoverflow.com/a/68290080/101923 -# 2. 
docstrings get processed by static processing tools, such as VS Code's -# autocomplete. -# 3. Positional arguments function as expected. If we were to pull in the -# methods directly from Session, a Session object would need to be the first -# argument, even if we allow a default value. -# 4. Allows to set BigQuery options for the BigFrames session based on the -# method and its arguments. - - -def read_csv( - filepath_or_buffer: str | IO["bytes"], - *, - sep: Optional[str] = ",", - header: Optional[int] = 0, - names: Optional[ - Union[MutableSequence[Any], numpy.ndarray[Any, Any], Tuple[Any, ...], range] - ] = None, - index_col: Optional[ - Union[ - int, - str, - Sequence[Union[str, int]], - bigframes.enums.DefaultIndexKind, - Literal[False], - ] - ] = None, - usecols: Optional[ - Union[ - MutableSequence[str], - Tuple[str, ...], - Sequence[int], - pandas.Series, - pandas.Index, - numpy.ndarray[Any, Any], - Callable[[Any], bool], - ] - ] = None, - dtype: Optional[Dict] = None, - engine: Optional[ - Literal["c", "python", "pyarrow", "python-fwf", "bigquery"] - ] = None, - encoding: Optional[str] = None, - **kwargs, -) -> bigframes.dataframe.DataFrame: - return global_session.with_default_session( - bigframes.session.Session.read_csv, - filepath_or_buffer=filepath_or_buffer, - sep=sep, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - dtype=dtype, - engine=engine, - encoding=encoding, - **kwargs, - ) - - -read_csv.__doc__ = inspect.getdoc(bigframes.session.Session.read_csv) - - -def read_json( - path_or_buf: str | IO["bytes"], - *, - orient: Literal[ - "split", "records", "index", "columns", "values", "table" - ] = "columns", - dtype: Optional[Dict] = None, - encoding: Optional[str] = None, - lines: bool = False, - engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", - **kwargs, -) -> bigframes.dataframe.DataFrame: - return global_session.with_default_session( - bigframes.session.Session.read_json, - path_or_buf=path_or_buf, - orient=orient, - dtype=dtype, - encoding=encoding, - lines=lines, - engine=engine, - **kwargs, - ) - - -read_json.__doc__ = inspect.getdoc(bigframes.session.Session.read_json) - - -def read_gbq( - query_or_table: str, - *, - index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), - columns: Iterable[str] = (), - configuration: Optional[Dict] = None, - max_results: Optional[int] = None, - filters: vendored_pandas_gbq.FiltersType = (), - use_cache: Optional[bool] = None, - col_order: Iterable[str] = (), -) -> bigframes.dataframe.DataFrame: - _set_default_session_location_if_possible(query_or_table) - return global_session.with_default_session( - bigframes.session.Session.read_gbq, - query_or_table, - index_col=index_col, - columns=columns, - configuration=configuration, - max_results=max_results, - filters=filters, - use_cache=use_cache, - col_order=col_order, - ) - - -read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq) - - -def read_gbq_model(model_name: str): - return global_session.with_default_session( - bigframes.session.Session.read_gbq_model, - model_name, - ) - - -read_gbq_model.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_model) - - -def read_gbq_query( - query: str, - *, - index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), - columns: Iterable[str] = (), - configuration: Optional[Dict] = None, - max_results: Optional[int] = None, - use_cache: Optional[bool] = None, - col_order: Iterable[str] = (), - filters: vendored_pandas_gbq.FiltersType = (), -) -> 
bigframes.dataframe.DataFrame: - _set_default_session_location_if_possible(query) - return global_session.with_default_session( - bigframes.session.Session.read_gbq_query, - query, - index_col=index_col, - columns=columns, - configuration=configuration, - max_results=max_results, - use_cache=use_cache, - col_order=col_order, - filters=filters, - ) - - -read_gbq_query.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_query) - - -def read_gbq_table( - query: str, - *, - index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), - columns: Iterable[str] = (), - max_results: Optional[int] = None, - filters: vendored_pandas_gbq.FiltersType = (), - use_cache: bool = True, - col_order: Iterable[str] = (), -) -> bigframes.dataframe.DataFrame: - _set_default_session_location_if_possible(query) - return global_session.with_default_session( - bigframes.session.Session.read_gbq_table, - query, - index_col=index_col, - columns=columns, - max_results=max_results, - filters=filters, - use_cache=use_cache, - col_order=col_order, - ) - - -read_gbq_table.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_table) - - -@typing.overload -def read_pandas(pandas_dataframe: pandas.DataFrame) -> bigframes.dataframe.DataFrame: - ... - - -@typing.overload -def read_pandas(pandas_dataframe: pandas.Series) -> bigframes.series.Series: - ... - - -@typing.overload -def read_pandas(pandas_dataframe: pandas.Index) -> bigframes.core.indexes.Index: - ... - - -def read_pandas(pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index]): - return global_session.with_default_session( - bigframes.session.Session.read_pandas, - pandas_dataframe, - ) - - -read_pandas.__doc__ = inspect.getdoc(bigframes.session.Session.read_pandas) - - -def read_pickle( - filepath_or_buffer: FilePath | ReadPickleBuffer, - compression: CompressionOptions = "infer", - storage_options: StorageOptions = None, -): - return global_session.with_default_session( - bigframes.session.Session.read_pickle, - filepath_or_buffer=filepath_or_buffer, - compression=compression, - storage_options=storage_options, - ) - - -read_pickle.__doc__ = inspect.getdoc(bigframes.session.Session.read_pickle) - - -def read_parquet( - path: str | IO["bytes"], *, engine: str = "auto" -) -> bigframes.dataframe.DataFrame: - return global_session.with_default_session( - bigframes.session.Session.read_parquet, - path, - engine=engine, - ) - - -read_parquet.__doc__ = inspect.getdoc(bigframes.session.Session.read_parquet) - - def remote_function( input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, @@ -697,17 +407,6 @@ def remote_function( remote_function.__doc__ = inspect.getdoc(bigframes.session.Session.remote_function) -def read_gbq_function(function_name: str, is_row_processor: bool = False): - return global_session.with_default_session( - bigframes.session.Session.read_gbq_function, - function_name=function_name, - is_row_processor=is_row_processor, - ) - - -read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function) - - @typing.overload def to_datetime( arg: Union[ @@ -893,7 +592,7 @@ def reset_session(): pass # Use __all__ to let type checkers know what is part of the public API. 
-__all___ = [ +__all__ = [ # Functions "concat", "merge", @@ -901,7 +600,11 @@ def reset_session(): "read_gbq", "read_gbq_function", "read_gbq_model", + "read_gbq_query", + "read_gbq_table", + "read_json", "read_pandas", + "read_parquet", "read_pickle", "remote_function", "to_datetime", @@ -911,7 +614,7 @@ def reset_session(): "Float64Dtype", "Int64Dtype", "StringDtype", - "ArrowDtype" + "ArrowDtype", # Class aliases "DataFrame", "Index", diff --git a/bigframes/pandas/io/__init__.py b/bigframes/pandas/io/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/bigframes/pandas/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py new file mode 100644 index 0000000000..4e08b3ef5e --- /dev/null +++ b/bigframes/pandas/io/api.py @@ -0,0 +1,347 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import inspect +import typing +from typing import ( + Any, + Callable, + Dict, + IO, + Iterable, + Literal, + MutableSequence, + Optional, + Sequence, + Tuple, + Union, +) + +import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq +from google.cloud import bigquery +import numpy +import pandas +from pandas._typing import ( + CompressionOptions, + FilePath, + ReadPickleBuffer, + StorageOptions, +) + +import bigframes._config as config +import bigframes.core.blocks +import bigframes.core.global_session as global_session +import bigframes.core.indexes +import bigframes.core.joins +import bigframes.core.reshape +import bigframes.core.tools +import bigframes.dataframe +import bigframes.enums +import bigframes.series +import bigframes.session +import bigframes.session._io.bigquery +import bigframes.session.clients +import bigframes.version + +# Note: the following methods are duplicated from Session. This duplication +# enables the following: +# +# 1. Static type checking knows the argument and return types, which is +# difficult to do with decorators. Aside: When we require Python 3.10, we +# can use Concatenate for generic typing in decorators. See: +# https://stackoverflow.com/a/68290080/101923 +# 2. docstrings get processed by static processing tools, such as VS Code's +# autocomplete. +# 3. Positional arguments function as expected. If we were to pull in the +# methods directly from Session, a Session object would need to be the first +# argument, even if we allow a default value. +# 4. 
Allows to set BigQuery options for the BigFrames session based on the +# method and its arguments. + + +def read_csv( + filepath_or_buffer: str | IO["bytes"], + *, + sep: Optional[str] = ",", + header: Optional[int] = 0, + names: Optional[ + Union[MutableSequence[Any], numpy.ndarray[Any, Any], Tuple[Any, ...], range] + ] = None, + index_col: Optional[ + Union[ + int, + str, + Sequence[Union[str, int]], + bigframes.enums.DefaultIndexKind, + Literal[False], + ] + ] = None, + usecols: Optional[ + Union[ + MutableSequence[str], + Tuple[str, ...], + Sequence[int], + pandas.Series, + pandas.Index, + numpy.ndarray[Any, Any], + Callable[[Any], bool], + ] + ] = None, + dtype: Optional[Dict] = None, + engine: Optional[ + Literal["c", "python", "pyarrow", "python-fwf", "bigquery"] + ] = None, + encoding: Optional[str] = None, + **kwargs, +) -> bigframes.dataframe.DataFrame: + return global_session.with_default_session( + bigframes.session.Session.read_csv, + filepath_or_buffer=filepath_or_buffer, + sep=sep, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + dtype=dtype, + engine=engine, + encoding=encoding, + **kwargs, + ) + + +read_csv.__doc__ = inspect.getdoc(bigframes.session.Session.read_csv) + + +def read_json( + path_or_buf: str | IO["bytes"], + *, + orient: Literal[ + "split", "records", "index", "columns", "values", "table" + ] = "columns", + dtype: Optional[Dict] = None, + encoding: Optional[str] = None, + lines: bool = False, + engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", + **kwargs, +) -> bigframes.dataframe.DataFrame: + return global_session.with_default_session( + bigframes.session.Session.read_json, + path_or_buf=path_or_buf, + orient=orient, + dtype=dtype, + encoding=encoding, + lines=lines, + engine=engine, + **kwargs, + ) + + +read_json.__doc__ = inspect.getdoc(bigframes.session.Session.read_json) + + +def read_gbq( + query_or_table: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), + columns: Iterable[str] = (), + configuration: Optional[Dict] = None, + max_results: Optional[int] = None, + filters: vendored_pandas_gbq.FiltersType = (), + use_cache: Optional[bool] = None, + col_order: Iterable[str] = (), +) -> bigframes.dataframe.DataFrame: + _set_default_session_location_if_possible(query_or_table) + return global_session.with_default_session( + bigframes.session.Session.read_gbq, + query_or_table, + index_col=index_col, + columns=columns, + configuration=configuration, + max_results=max_results, + filters=filters, + use_cache=use_cache, + col_order=col_order, + ) + + +read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq) + + +def read_gbq_model(model_name: str): + return global_session.with_default_session( + bigframes.session.Session.read_gbq_model, + model_name, + ) + + +read_gbq_model.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_model) + + +def read_gbq_query( + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), + columns: Iterable[str] = (), + configuration: Optional[Dict] = None, + max_results: Optional[int] = None, + use_cache: Optional[bool] = None, + col_order: Iterable[str] = (), + filters: vendored_pandas_gbq.FiltersType = (), +) -> bigframes.dataframe.DataFrame: + _set_default_session_location_if_possible(query) + return global_session.with_default_session( + bigframes.session.Session.read_gbq_query, + query, + index_col=index_col, + columns=columns, + configuration=configuration, + max_results=max_results, + use_cache=use_cache, 
+ col_order=col_order, + filters=filters, + ) + + +read_gbq_query.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_query) + + +def read_gbq_table( + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), + columns: Iterable[str] = (), + max_results: Optional[int] = None, + filters: vendored_pandas_gbq.FiltersType = (), + use_cache: bool = True, + col_order: Iterable[str] = (), +) -> bigframes.dataframe.DataFrame: + _set_default_session_location_if_possible(query) + return global_session.with_default_session( + bigframes.session.Session.read_gbq_table, + query, + index_col=index_col, + columns=columns, + max_results=max_results, + filters=filters, + use_cache=use_cache, + col_order=col_order, + ) + + +read_gbq_table.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_table) + + +@typing.overload +def read_pandas(pandas_dataframe: pandas.DataFrame) -> bigframes.dataframe.DataFrame: + ... + + +@typing.overload +def read_pandas(pandas_dataframe: pandas.Series) -> bigframes.series.Series: + ... + + +@typing.overload +def read_pandas(pandas_dataframe: pandas.Index) -> bigframes.core.indexes.Index: + ... + + +def read_pandas(pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index]): + return global_session.with_default_session( + bigframes.session.Session.read_pandas, + pandas_dataframe, + ) + + +read_pandas.__doc__ = inspect.getdoc(bigframes.session.Session.read_pandas) + + +def read_pickle( + filepath_or_buffer: FilePath | ReadPickleBuffer, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, +): + return global_session.with_default_session( + bigframes.session.Session.read_pickle, + filepath_or_buffer=filepath_or_buffer, + compression=compression, + storage_options=storage_options, + ) + + +read_pickle.__doc__ = inspect.getdoc(bigframes.session.Session.read_pickle) + + +def read_parquet( + path: str | IO["bytes"], *, engine: str = "auto" +) -> bigframes.dataframe.DataFrame: + return global_session.with_default_session( + bigframes.session.Session.read_parquet, + path, + engine=engine, + ) + + +read_parquet.__doc__ = inspect.getdoc(bigframes.session.Session.read_parquet) + + +def read_gbq_function(function_name: str, is_row_processor: bool = False): + return global_session.with_default_session( + bigframes.session.Session.read_gbq_function, + function_name=function_name, + is_row_processor=is_row_processor, + ) + + +read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function) + + +def _set_default_session_location_if_possible(query): + # Set the location as per the query if this is the first query the user is + # running and: + # (1) Default session has not started yet, and + # (2) Location is not set yet, and + # (3) Use of regional endpoints is not set. + # If query is a table name, then it would be the location of the table. + # If query is a SQL with a table, then it would be table's location. + # If query is a SQL with no table, then it would be the BQ default location. 
+ if ( + config.options.bigquery._session_started + or config.options.bigquery.location + or config.options.bigquery.use_regional_endpoints + ): + return + + clients_provider = bigframes.session.clients.ClientsProvider( + project=config.options.bigquery.project, + location=config.options.bigquery.location, + use_regional_endpoints=config.options.bigquery.use_regional_endpoints, + credentials=config.options.bigquery.credentials, + application_name=config.options.bigquery.application_name, + bq_kms_key_name=config.options.bigquery.kms_key_name, + ) + + bqclient = clients_provider.bqclient + + if bigframes.session._io.bigquery.is_query(query): + # Intentionally run outside of the session so that we can detect the + # location before creating the session. Since it's a dry_run, labels + # aren't necessary. + job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) + config.options.bigquery.location = job.location + else: + table = bqclient.get_table(query) + config.options.bigquery.location = table.location diff --git a/bigframes/series.py b/bigframes/series.py index 193eea7ee3..16e2eef6f1 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -16,13 +16,14 @@ from __future__ import annotations +import datetime import functools import inspect import itertools import numbers import textwrap import typing -from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, cast, List, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.series as vendored_pandas_series @@ -30,6 +31,7 @@ import numpy import pandas import pandas.core.dtypes.common +import pyarrow as pa import typing_extensions import bigframes.core @@ -181,11 +183,19 @@ def _info_axis(self) -> indexes.Index: def _session(self) -> bigframes.Session: return self._get_block().expr.session + @property + def _struct_fields(self) -> List[str]: + if not bigframes.dtypes.is_struct_like(self._dtype): + return [] + + struct_type = typing.cast(pa.StructType, self._dtype.pyarrow_dtype) + return [struct_type.field(i).name for i in range(struct_type.num_fields)] + @validations.requires_ordering() def transpose(self) -> Series: return self - def _set_internal_query_job(self, query_job: bigquery.QueryJob): + def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]): self._query_job = query_job def __len__(self): @@ -1096,6 +1106,9 @@ def __pos__(self) -> Series: def __neg__(self) -> Series: return self._apply_unary_op(ops.neg_op) + def __dir__(self) -> List[str]: + return dir(type(self)) + self._struct_fields + def eq(self, other: object) -> Series: # TODO: enforce stricter alignment return self._apply_binary_op(other, ops.eq_op) @@ -1233,14 +1246,22 @@ def __getitem__(self, indexer): if isinstance(indexer, Series): (left, right, block) = self._align(indexer, "left") block = block.filter(right) - block = block.select_column(left.id) + block = block.select_column(left.id.name) return Series(block) return self.loc[indexer] __getitem__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__getitem__) def __getattr__(self, key: str): - if hasattr(pandas.Series, key): + # Protect against recursion errors with uninitialized Series objects. + # We use "_block" attribute to check whether the instance is initialized. 
+ # See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + # and + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html + if key == "_block": + raise AttributeError(key) + elif hasattr(pandas.Series, key): raise AttributeError( textwrap.dedent( f""" @@ -1249,6 +1270,8 @@ def __getattr__(self, key: str): """ ) ) + elif key in self._struct_fields: + return self.struct.field(key) else: raise AttributeError(key) @@ -1453,10 +1476,7 @@ def apply( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) - # return Series with materialized result so that any error in the remote - # function is caught early - materialized_series = result_series._cached(session_aware=False) - return materialized_series + return result_series def combine( self, @@ -1484,10 +1504,7 @@ def combine( other, ops.BinaryRemoteFunctionOp(func=func) ) - # return Series with materialized result so that any error in the remote - # function is caught early - materialized_series = result_series._cached() - return materialized_series + return result_series @validations.requires_index def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: @@ -1790,9 +1807,9 @@ def sample( ns = (n,) if n is not None else () fracs = (frac,) if frac is not None else () return Series( - self._block._split( - ns=ns, fracs=fracs, random_state=random_state, sort=sort - )[0] + self._block.split(ns=ns, fracs=fracs, random_state=random_state, sort=sort)[ + 0 + ] ) def explode(self, *, ignore_index: Optional[bool] = False) -> Series: @@ -1802,6 +1819,72 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: ) ) + @validations.requires_ordering() + def _resample( + self, + rule: str, + *, + closed: Optional[Literal["right", "left"]] = None, + label: Optional[Literal["right", "left"]] = None, + level: Optional[LevelsType] = None, + origin: Union[ + Union[ + pandas.Timestamp, datetime.datetime, numpy.datetime64, int, float, str + ], + Literal["epoch", "start", "start_day", "end", "end_day"], + ] = "start_day", + ) -> bigframes.core.groupby.SeriesGroupBy: + """Internal function to support resample. Resample time-series data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + + >>> data = { + ... "timestamp_col": pd.date_range( + ... start="2021-01-01 13:00:00", periods=30, freq="1s" + ... ), + ... "int64_col": range(30), + ... } + >>> s = bpd.DataFrame(data).set_index("timestamp_col") + >>> s._resample(rule="7s", origin="epoch").min() + int64_col + 2021-01-01 12:59:56 0 + 2021-01-01 13:00:03 3 + 2021-01-01 13:00:10 10 + 2021-01-01 13:00:17 17 + 2021-01-01 13:00:24 24 + + [5 rows x 1 columns] + + + Args: + rule (str): + The offset string representing target conversion. + level (str or int, default None): + For a MultiIndex, level (name or number) to use for resampling. + level must be datetime-like. + origin(str, default 'start_day'): + The timestamp on which to adjust the grouping. Must be one of the following: + 'epoch': origin is 1970-01-01 + 'start': origin is the first value of the timeseries + 'start_day': origin is the first day at midnight of the timeseries + Returns: + SeriesGroupBy: SeriesGroupBy object. 
+ """ + block = self._block._generate_resample_label( + rule=rule, + closed=closed, + label=label, + on=None, + level=level, + origin=origin, + ) + series = Series(block) + return series.groupby(level=0) + def __array_ufunc__( self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs ) -> Series: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 3a9cba442c..0d7a90c250 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -70,7 +70,6 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.exceptions -import bigframes.formatting_helpers as formatting_helpers import bigframes.functions._remote_function_session as bigframes_rf_session import bigframes.functions.remote_function as bigframes_rf import bigframes.session._io.bigquery as bf_io_bigquery @@ -261,11 +260,11 @@ def __init__( ) self._executor = bigframes.session.executor.BigQueryCachingExecutor( bqclient=self._clients_provider.bqclient, + bqstoragereadclient=self._clients_provider.bqstoragereadclient, storage_manager=self._temp_storage_manager, strictly_ordered=self._strictly_ordered, metrics=self._metrics, ) - self._loader = bigframes.session.loader.GbqDataLoader( session=self, bqclient=self._clients_provider.bqclient, @@ -1372,20 +1371,6 @@ def _start_query_ml_ddl( return bf_io_bigquery.start_query_with_client(self.bqclient, sql, job_config) - def _execute( - self, - array_value: core.ArrayValue, - *, - ordered: bool = True, - col_id_overrides: Mapping[str, str] = {}, - use_explicit_destination: bool = False, - ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - return self._executor.execute( - array_value, - ordered=ordered, - col_id_overrides=col_id_overrides, - ) - def _export( self, array_value: core.ArrayValue, @@ -1404,49 +1389,6 @@ def _export( cluster_cols=cluster_cols, ) - def _dry_run( - self, array_value: core.ArrayValue, ordered: bool = True - ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - return self._executor.dry_run(array_value, ordered=ordered) - - def _peek( - self, array_value: core.ArrayValue, n_rows: int - ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - """A 'peek' efficiently accesses a small number of rows in the dataframe.""" - return self._executor.peek(array_value, n_rows) - - def _to_sql( - self, - array_value: core.ArrayValue, - offset_column: typing.Optional[str] = None, - col_id_overrides: typing.Mapping[str, str] = {}, - ordered: bool = False, - enable_cache: bool = True, - ) -> str: - return self._executor.to_sql( - array_value, offset_column, col_id_overrides, ordered, enable_cache - ) - - def _get_table_size(self, destination_table): - table = self.bqclient.get_table(destination_table) - return table.num_bytes - - def _rows_to_dataframe( - self, row_iterator: bigquery.table.RowIterator - ) -> pandas.DataFrame: - # Can ignore inferred datatype until dtype emulation breaks 1:1 mapping between BQ types and bigframes types - dtypes_from_bq = bigframes.dtypes.bf_type_from_type_kind(row_iterator.schema) - arrow_table = row_iterator.to_arrow() - return bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes_from_bq) - - def _start_generic_job(self, job: formatting_helpers.GenericJob): - if bigframes.options.display.progress_bar is not None: - formatting_helpers.wait_for_job( - job, bigframes.options.display.progress_bar - ) # Wait for the job to complete - else: - job.result() - def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff 
--git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 011c1f1bee..b7706d34ca 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -237,7 +237,7 @@ def start_query_with_client( opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: results_iterator = formatting_helpers.wait_for_query_job( - query_job, max_results, opts.progress_bar + query_job, max_results=max_results, progress_bar=opts.progress_bar ) else: results_iterator = query_job.result(max_results=max_results) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 83e30fd900..6ceaab6915 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations -from typing import Dict, Union +from typing import Union import bigframes_vendored.constants as constants import geopandas # type: ignore @@ -22,6 +23,7 @@ import pyarrow.compute # type: ignore import pyarrow.types # type: ignore +import bigframes.core.schema import bigframes.features @@ -49,17 +51,18 @@ def _arrow_to_pandas_arrowdtype( def arrow_to_pandas( - arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], + schema: bigframes.core.schema.ArraySchema, ): - if len(dtypes) != arrow_table.num_columns: + if len(schema) != arrow_table.num_columns: raise ValueError( - f"Number of types {len(dtypes)} doesn't match number of columns " + f"Number of types {len(schema)} doesn't match number of columns " f"{arrow_table.num_columns}. {constants.FEEDBACK_LINK}" ) serieses = {} for field, column in zip(arrow_table.schema, arrow_table): - dtype = dtypes[field.name] + dtype = schema.get_type(field.name) if dtype == geopandas.array.GeometryDtype(): series = geopandas.GeoSeries.from_wkt( diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index f89b5aefec..8508c714fd 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -14,22 +14,40 @@ from __future__ import annotations +import dataclasses import math -from typing import cast, Literal, Mapping, Optional, Sequence, Tuple, Union +import os +from typing import ( + Callable, + cast, + Iterator, + Literal, + Mapping, + Optional, + Sequence, + Tuple, + Union, +) import warnings import weakref import google.api_core.exceptions import google.cloud.bigquery as bigquery import google.cloud.bigquery.job as bq_job +import google.cloud.bigquery.table as bq_table +import google.cloud.bigquery_storage_v1 +import pyarrow import bigframes.core import bigframes.core.compile import bigframes.core.expression as ex import bigframes.core.guid +import bigframes.core.identifiers import bigframes.core.nodes as nodes import bigframes.core.ordering as order +import bigframes.core.schema import bigframes.core.tree_properties as tree_properties +import bigframes.features import bigframes.formatting_helpers as formatting_helpers import bigframes.operations as ops import bigframes.session._io.bigquery as bq_io @@ -41,8 +59,27 @@ QUERY_COMPLEXITY_LIMIT = 1e7 # Number of times to factor out subqueries before giving up. MAX_SUBTREE_FACTORINGS = 5 - _MAX_CLUSTER_COLUMNS = 4 +# TODO: b/338258028 Enable pruning to reduce text size. 
+ENABLE_PRUNING = False + + +@dataclasses.dataclass(frozen=True) +class ExecuteResult: + arrow_batches: Callable[[], Iterator[pyarrow.RecordBatch]] + schema: bigframes.core.schema.ArraySchema + query_job: Optional[bigquery.QueryJob] = None + total_bytes: Optional[int] = None + total_rows: Optional[int] = None + + def to_arrow_table(self) -> pyarrow.Table: + # Need to provide schema if no result rows, as arrow can't infer + # If ther are rows, it is safest to infer schema from batches. + # Any discrepencies between predicted schema and actual schema will produce errors. + return pyarrow.Table.from_batches( + self.arrow_batches(), + self.schema.to_pyarrow() if not self.total_rows else None, + ) class BigQueryCachingExecutor: @@ -58,6 +95,7 @@ def __init__( self, bqclient: bigquery.Client, storage_manager: bigframes.session.temp_storage.TemporaryGbqStorageManager, + bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, ): @@ -71,6 +109,7 @@ def __init__( nodes.BigFrameNode, nodes.BigFrameNode ] = weakref.WeakKeyDictionary() self.metrics = metrics + self.bqstoragereadclient = bqstoragereadclient def to_sql( self, @@ -105,6 +144,9 @@ def execute( ordered: bool = True, col_id_overrides: Mapping[str, str] = {}, use_explicit_destination: bool = False, + get_size_bytes: bool = False, + page_size: Optional[int] = None, + max_results: Optional[int] = None, ): """ Execute the ArrayValue, storing the result to a temporary session-owned table. @@ -115,19 +157,43 @@ def execute( sql = self.to_sql( array_value, ordered=ordered, col_id_overrides=col_id_overrides ) + adjusted_schema = array_value.schema.rename(col_id_overrides) job_config = bigquery.QueryJobConfig() # Use explicit destination to avoid 10GB limit of temporary table if use_explicit_destination: - schema = array_value.schema.to_bigquery() destination_table = self.storage_manager.create_temp_table( - schema, cluster_cols=[] + adjusted_schema.to_bigquery(), cluster_cols=[] ) job_config.destination = destination_table # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. - return self._run_execute_query( + iterator, query_job = self._run_execute_query( sql=sql, job_config=job_config, + page_size=page_size, + max_results=max_results, + ) + + # Though we provide the read client, iterator may or may not use it based on what is efficient for the result + def iterator_supplier(): + return iterator.to_arrow_iterable(bqstorage_client=self.bqstoragereadclient) + + if get_size_bytes is True: + size_bytes = self.bqclient.get_table(query_job.destination).num_bytes + else: + size_bytes = None + + # Runs strict validations to ensure internal type predictions and ibis are completely in sync + # Do not execute these validations outside of testing suite. + if "PYTEST_CURRENT_TEST" in os.environ and len(col_id_overrides) == 0: + validate_result_schema(array_value, iterator.schema) + + return ExecuteResult( + arrow_batches=iterator_supplier, + schema=adjusted_schema, + query_job=query_job, + total_bytes=size_bytes, + total_rows=iterator.total_rows, ) def export_gbq( @@ -154,10 +220,11 @@ def export_gbq( ) # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. 
- return self._run_execute_query( + _, query_job = self._run_execute_query( sql=sql, job_config=job_config, ) + return query_job def export_gcs( self, @@ -170,11 +237,11 @@ def export_gcs( """ Export the ArrayValue to gcs. """ - _, query_job = self.execute( + query_job = self.execute( array_value, ordered=False, col_id_overrides=col_id_overrides, - ) + ).query_job result_table = query_job.destination export_data_statement = bq_io.create_export_data_statement( f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", @@ -188,7 +255,9 @@ def export_gcs( self._wait_on_job(export_job) return query_job - def dry_run(self, array_value: bigframes.core.ArrayValue, ordered: bool = True): + def dry_run( + self, array_value: bigframes.core.ArrayValue, ordered: bool = True + ) -> bigquery.QueryJob: """ Dry run executing the ArrayValue. @@ -198,14 +267,14 @@ def dry_run(self, array_value: bigframes.core.ArrayValue, ordered: bool = True): job_config = bigquery.QueryJobConfig(dry_run=True) bq_io.add_labels(job_config) query_job = self.bqclient.query(sql, job_config=job_config) - results_iterator = query_job.result() - return results_iterator, query_job + _ = query_job.result() + return query_job def peek( self, array_value: bigframes.core.ArrayValue, n_rows: int, - ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + ) -> ExecuteResult: """ A 'peek' efficiently accesses a small number of rows in the dataframe. """ @@ -217,11 +286,22 @@ def peek( # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. - return self._run_execute_query(sql=sql) + iterator, query_job = self._run_execute_query(sql=sql) + return ExecuteResult( + # Probably don't need read client for small peek results, but let client decide + arrow_batches=lambda: iterator.to_arrow_iterable( + bqstorage_client=self.bqstoragereadclient + ), + schema=array_value.schema, + query_job=query_job, + total_rows=iterator.total_rows, + ) + # This is used exclusively to optimize __repr__ + # TODO: We need to model this def head( self, array_value: bigframes.core.ArrayValue, n_rows: int - ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + ) -> ExecuteResult: """ Preview the first n rows of the dataframe. This is less efficient than the unordered peek preview op. """ @@ -249,8 +329,18 @@ def head( # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. - return self._run_execute_query(sql=sql) + iterator, query_job = self._run_execute_query(sql=sql) + return ExecuteResult( + # Probably don't need read client for small head results, but let client decide + arrow_batches=lambda: iterator.to_arrow_iterable( + bqstorage_client=self.bqstoragereadclient + ), + schema=array_value.schema, + query_job=query_job, + total_rows=iterator.total_rows, + ) + # TODO: Remove. We shouldn't need this method, row count node can automatically be detected def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: count = self._local_get_row_count(array_value) if count is not None: @@ -277,7 +367,9 @@ def _run_execute_query( sql: str, job_config: Optional[bq_job.QueryJobConfig] = None, api_name: Optional[str] = None, - ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + page_size: Optional[int] = None, + max_results: Optional[int] = None, + ) -> Tuple[bq_table.RowIterator, bigquery.QueryJob]: """ Starts BigQuery query job and waits for results. 
""" @@ -293,7 +385,12 @@ def _run_execute_query( job_config.labels["bigframes-mode"] = "unordered" try: query_job = self.bqclient.query(sql, job_config=job_config) - return self._wait_on_job(query_job), query_job + return ( + self._wait_on_job( + query_job, max_results=max_results, page_size=page_size + ), + query_job, + ) except google.api_core.exceptions.BadRequest as e: # Unfortunately, this error type does not have a separate error code or exception type @@ -303,14 +400,24 @@ def _run_execute_query( else: raise - def _wait_on_job(self, query_job: bigquery.QueryJob) -> bigquery.table.RowIterator: + def _wait_on_job( + self, + query_job: bigquery.QueryJob, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + ) -> bq_table.RowIterator: opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: results_iterator = formatting_helpers.wait_for_query_job( - query_job, progress_bar=opts.progress_bar + query_job, + progress_bar=opts.progress_bar, + max_results=max_results, + page_size=page_size, ) else: - results_iterator = query_job.result() + results_iterator = query_job.result( + max_results=max_results, page_size=page_size + ) if self.metrics is not None: self.metrics.count_job_stats(query_job) @@ -326,6 +433,9 @@ def _get_optimized_plan(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: optimized_plan = tree_properties.replace_nodes( node, (dict(self._cached_executions)) ) + if ENABLE_PRUNING: + used_fields = frozenset(field.id for field in optimized_plan.fields) + optimized_plan = optimized_plan.prune(used_fields) return optimized_plan def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): @@ -384,9 +494,10 @@ def _cache_with_session_awareness( target, cluster_cols = bigframes.session.planner.session_aware_cache_plan( array_value.node, list(session_forest) ) + cluster_cols_sql_names = [id.sql for id in cluster_cols] if len(cluster_cols) > 0: self._cache_with_cluster_cols( - bigframes.core.ArrayValue(target), cluster_cols + bigframes.core.ArrayValue(target), cluster_cols_sql_names ) elif self.strictly_ordered: self._cache_with_offsets(bigframes.core.ArrayValue(target)) @@ -450,12 +561,38 @@ def _sql_as_cached_temp_table( def generate_head_plan(node: nodes.BigFrameNode, n: int): offsets_id = bigframes.core.guid.generate_guid("offsets_") - plan_w_offsets = nodes.PromoteOffsetsNode(node, offsets_id) - predicate = ops.lt_op.as_expr(ex.free_var(offsets_id), ex.const(n)) + plan_w_offsets = nodes.PromoteOffsetsNode( + node, bigframes.core.identifiers.ColumnId(offsets_id) + ) + predicate = ops.lt_op.as_expr(ex.deref(offsets_id), ex.const(n)) plan_w_head = nodes.FilterNode(plan_w_offsets, predicate) # Finally, drop the offsets column - return nodes.SelectionNode(plan_w_head, tuple((i, i) for i in node.schema.names)) + return nodes.SelectionNode( + plan_w_head, + tuple( + (ex.deref(i), bigframes.core.identifiers.ColumnId(i)) + for i in node.schema.names + ), + ) def generate_row_count_plan(node: nodes.BigFrameNode): return nodes.RowCountNode(node) + + +def validate_result_schema( + array_value: bigframes.core.ArrayValue, bq_schema: list[bigquery.schema.SchemaField] +): + actual_schema = tuple(bq_schema) + ibis_schema = array_value._compiled_schema + internal_schema = array_value.schema + if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + return + if internal_schema.to_bigquery() != actual_schema: + raise ValueError( + f"This error should only occur while testing. 
BigFrames internal schema: {internal_schema.to_bigquery()} does not match actual schema: {actual_schema}" + ) + if ibis_schema.to_bigquery() != actual_schema: + raise ValueError( + f"This error should only occur while testing. Ibis schema: {ibis_schema.to_bigquery()} does not match actual schema: {actual_schema}" + ) diff --git a/bigframes/session/planner.py b/bigframes/session/planner.py index bc640ec9fa..2a562abadf 100644 --- a/bigframes/session/planner.py +++ b/bigframes/session/planner.py @@ -18,14 +18,16 @@ from typing import Sequence, Tuple import bigframes.core.expression as ex +import bigframes.core.identifiers as ids import bigframes.core.nodes as nodes import bigframes.core.pruning as predicate_pruning import bigframes.core.tree_properties as traversals +import bigframes.dtypes def session_aware_cache_plan( root: nodes.BigFrameNode, session_forest: Sequence[nodes.BigFrameNode] -) -> Tuple[nodes.BigFrameNode, list[str]]: +) -> Tuple[nodes.BigFrameNode, list[ids.ColumnId]]: """ Determines the best node to cache given a target and a list of object roots for objects in a session. @@ -40,7 +42,7 @@ def session_aware_cache_plan( filters: list[ ex.Expression ] = [] # accumulate filters into this as traverse downwards - clusterable_cols: set[str] = set() + clusterable_cols: set[ids.ColumnId] = set() while isinstance(cur_node, de_cachable_types): if isinstance(cur_node, nodes.FilterNode): # Filter node doesn't define any variables, so no need to chain expressions @@ -50,14 +52,11 @@ def session_aware_cache_plan( # that instead reference variables in the child node. bindings = {name: expr for expr, name in cur_node.assignments} filters = [ - i.bind_variables(bindings, check_bind_all=False) for i in filters + i.bind_refs(bindings, allow_partial_bindings=True) for i in filters ] elif isinstance(cur_node, nodes.SelectionNode): - bindings = { - output: ex.free_var(input) - for input, output in cur_node.input_output_pairs - } - filters = [i.bind_variables(bindings) for i in filters] + bindings = {output: input for input, output in cur_node.input_output_pairs} + filters = [i.bind_refs(bindings) for i in filters] else: raise ValueError(f"Unexpected de-cached node: {cur_node}") @@ -65,13 +64,17 @@ def session_aware_cache_plan( cur_node_refs = node_counts.get(cur_node, 0) if cur_node_refs > caching_target_refs: caching_target, caching_target_refs = cur_node, cur_node_refs - schema = cur_node.schema + cluster_compatible_cols = { + field.id + for field in cur_node.fields + if bigframes.dtypes.is_clusterable(field.dtype) + } # Cluster cols only consider the target object and not other sesssion objects clusterable_cols = set( itertools.chain.from_iterable( map( lambda f: predicate_pruning.cluster_cols_for_predicate( - f, schema + f, cluster_compatible_cols ), filters, ) diff --git a/bigframes/streaming/__init__.py b/bigframes/streaming/__init__.py index 66f345f0ab..d439d622a2 100644 --- a/bigframes/streaming/__init__.py +++ b/bigframes/streaming/__init__.py @@ -15,13 +15,13 @@ import inspect import bigframes.core.global_session as global_session -import bigframes.pandas as bpd +from bigframes.pandas.io.api import _set_default_session_location_if_possible import bigframes.session import bigframes.streaming.dataframe as streaming_dataframe def read_gbq_table(table: str) -> streaming_dataframe.StreamingDataFrame: - bpd._set_default_session_location_if_possible(table) + _set_default_session_location_if_possible(table) return global_session.with_default_session( 
bigframes.session.Session.read_gbq_table_streaming, table ) diff --git a/bigframes/version.py b/bigframes/version.py index 60f4942175..c07f26bc6f 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.20.0" +__version__ = "1.21.0" diff --git a/docs/reference/bigframes.pandas/general_functions.rst b/docs/reference/bigframes.pandas/general_functions.rst index 4fff9aabf8..fff1a9ef59 100644 --- a/docs/reference/bigframes.pandas/general_functions.rst +++ b/docs/reference/bigframes.pandas/general_functions.rst @@ -6,3 +6,4 @@ General functions .. automodule:: bigframes.pandas :members: :undoc-members: + :noindex: diff --git a/notebooks/location/regionalized.ipynb b/notebooks/location/regionalized.ipynb index c05d27c24e..c383a22609 100644 --- a/notebooks/location/regionalized.ipynb +++ b/notebooks/location/regionalized.ipynb @@ -47,32 +47,36 @@ ], "source": [ "# Take multi-region US as the default BQ location, where most of the BQ data lies including the BQ public datasets\n", - "BQ_LOCATION = \"us\"\n", - "PROJECT = \"bigframes-dev\"\n", + "import os\n", + "\n", + "PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\")\n", + "BQ_LOCATION = os.environ.get(\"BIGQUERY_LOCATION\")\n", + "\n", + "if not PROJECT_ID:\n", + " raise ValueError(\"Project must be set via environment variable GOOGLE_CLOUD_PROJECT\")\n", + "if not BQ_LOCATION:\n", + " raise ValueError(\"BQ location must be set via environment variable BIGQUERY_LOCATION\")\n", + "\n", "DATASET = \"bigframes_testing\"\n", "PENGUINS_TABLE = \"bigquery-public-data.ml_datasets.penguins\"\n", "\n", "\n", "# Check for a location set in the environment and do location-specific setup if needed\n", "\n", - "import os\n", "import google.api_core.exceptions\n", "from google.cloud import bigquery\n", "import bigframes\n", - " \n", - "env_bq_location = os.getenv(\"BIGQUERY_LOCATION\")\n", - "if env_bq_location and env_bq_location != BQ_LOCATION:\n", - " BQ_LOCATION = env_bq_location.lower()\n", "\n", "client = bigquery.Client()\n", "\n", + "BQ_LOCATION = BQ_LOCATION.lower()\n", "if BQ_LOCATION != \"us\":\n", " bq_location_normalized = BQ_LOCATION.replace('-', '_')\n", "\n", " # Nominate a local penguins table\n", " penguins_table_ref = bigquery.TableReference.from_string(PENGUINS_TABLE)\n", " penguins_local_dataset_name = f\"{DATASET}_{bq_location_normalized}\"\n", - " penguins_local_dataset_ref = bigquery.DatasetReference(project=PROJECT, dataset_id=penguins_local_dataset_name)\n", + " penguins_local_dataset_ref = bigquery.DatasetReference(project=PROJECT_ID, dataset_id=penguins_local_dataset_name)\n", " penguins_local_dataset = bigquery.Dataset(penguins_local_dataset_ref)\n", " penguins_local_dataset.location = BQ_LOCATION\n", " penguins_local_table_ref= bigquery.TableReference(penguins_local_dataset, penguins_table_ref.table_id)\n", @@ -94,13 +98,13 @@ " DATASET = f\"{DATASET}_{bq_location_normalized}\"\n", "\n", "# Create the dataset to store the model if it doesn't exist \n", - "model_local_dataset = bigquery.Dataset(bigquery.DatasetReference(project=PROJECT, dataset_id=DATASET))\n", + "model_local_dataset = bigquery.Dataset(bigquery.DatasetReference(project=PROJECT_ID, dataset_id=DATASET))\n", "model_local_dataset.location = BQ_LOCATION\n", "model_dataset = client.create_dataset(model_local_dataset, exists_ok=True)\n", "\n", "# Finally log the variables driving the core notebook execution\n", "log = 
('\\n'.join(f\"{name}: {str(value)}\" for name, value in {\n", - " \"BigQuery project\" : PROJECT,\n", + " \"BigQuery project\" : PROJECT_ID,\n", " \"BigQuery location\" : BQ_LOCATION,\n", " \"Penguins Table\" : PENGUINS_TABLE,\n", " \"ML Model Dataset\" : model_dataset.reference\n", @@ -134,7 +138,7 @@ "\n", "# Note: The project option is not required in all environments.\n", "# On BigQuery Studio, the project ID is automatically detected.\n", - "bigframes.pandas.options.bigquery.project = PROJECT\n", + "bigframes.pandas.options.bigquery.project = PROJECT_ID\n", "\n", "# Note: The location option is not required.\n", "# It defaults to the location of the first table or query\n", diff --git a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb new file mode 100644 index 0000000000..824d911aff --- /dev/null +++ b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb @@ -0,0 +1,1105 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigFrames ML Cross-Vaidation\n", + "\n", + "This demo shows how to do cross validation in bigframes.ml" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Prepare Data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2024-10-01 22:44:50.650768+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 4c2f2252-687a-47c3-87ad-22db8ad96e2b is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a05c7268-8db2-468b-9fb4-0fb5c9534f51 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
speciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Gentoo penguin (Pygoscelis papua)Biscoe50.515.9225.05400.0MALE
1Gentoo penguin (Pygoscelis papua)Biscoe45.114.5215.05000.0FEMALE
2Adelie Penguin (Pygoscelis adeliae)Torgersen41.418.5202.03875.0MALE
3Adelie Penguin (Pygoscelis adeliae)Torgersen38.617.0188.02900.0FEMALE
4Gentoo penguin (Pygoscelis papua)Biscoe46.514.8217.05200.0FEMALE
........................
339Adelie Penguin (Pygoscelis adeliae)Dream38.117.6187.03425.0FEMALE
340Adelie Penguin (Pygoscelis adeliae)Biscoe36.417.1184.02850.0FEMALE
341Chinstrap penguin (Pygoscelis antarctica)Dream40.916.6187.03200.0FEMALE
342Adelie Penguin (Pygoscelis adeliae)Biscoe41.321.1195.04400.0MALE
343Chinstrap penguin (Pygoscelis antarctica)Dream45.216.6191.03250.0FEMALE
\n", + "

334 rows × 7 columns

\n", + "
[334 rows x 7 columns in total]" + ], + "text/plain": [ + " species island culmen_length_mm \\\n", + "0 Gentoo penguin (Pygoscelis papua) Biscoe 50.5 \n", + "1 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", + "2 Adelie Penguin (Pygoscelis adeliae) Torgersen 41.4 \n", + "3 Adelie Penguin (Pygoscelis adeliae) Torgersen 38.6 \n", + "4 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", + ".. ... ... ... \n", + "339 Adelie Penguin (Pygoscelis adeliae) Dream 38.1 \n", + "340 Adelie Penguin (Pygoscelis adeliae) Biscoe 36.4 \n", + "341 Chinstrap penguin (Pygoscelis antarctica) Dream 40.9 \n", + "342 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.3 \n", + "343 Chinstrap penguin (Pygoscelis antarctica) Dream 45.2 \n", + "\n", + " culmen_depth_mm flipper_length_mm body_mass_g sex \n", + "0 15.9 225.0 5400.0 MALE \n", + "1 14.5 215.0 5000.0 FEMALE \n", + "2 18.5 202.0 3875.0 MALE \n", + "3 17.0 188.0 2900.0 FEMALE \n", + "4 14.8 217.0 5200.0 FEMALE \n", + ".. ... ... ... ... \n", + "339 17.6 187.0 3425.0 FEMALE \n", + "340 17.1 184.0 2850.0 FEMALE \n", + "341 16.6 187.0 3200.0 FEMALE \n", + "342 21.1 195.0 4400.0 MALE \n", + "343 16.6 191.0 3250.0 FEMALE \n", + "...\n", + "\n", + "[334 rows x 7 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# read and filter out unavailable data\n", + "df = bpd.read_gbq(\"bigframes-dev.bqml_tutorial.penguins\")\n", + "df = df.dropna()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Select X and y from the dataset\n", + "X = df[\n", + " [\n", + " \"species\",\n", + " \"island\",\n", + " \"culmen_length_mm\",\n", + " ]\n", + " ]\n", + "y = df[\"body_mass_g\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Define KFold class and Train/Test for Each Fold (Mauanl Approach)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from bigframes.ml import model_selection, linear_model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Create KFold instance, n_splits defines how many folds the data will split. For example, n_split=5 will split the entire dataset into 5 pieces. \n", + "# In each fold, 4 pieces will be used for training, and the other piece will be used for evaluation. \n", + "kf = model_selection.KFold(n_splits=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 582e7c02-bcc6-412a-a513-46ee5dba7ad8 is DONE. 2.7 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 917ff09b-072b-4c55-b26f-1780e2e97519 is DONE. 25.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 2f4e102d-48bc-401f-a781-39830e2c6c9b is DONE. 16.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job aabe8a28-8dce-4e00-8a8c-18e9e090e6e7 is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ec9d8798-e28e-44bc-aa8e-44ab28f0214f is DONE. 48 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 8aa0fa94-e43e-41c6-9de3-f0a67392c47f is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + "0 318.358226 151689.571141 0.009814 \n", + "\n", + " median_absolute_error r2_score explained_variance \n", + "0 255.095561 0.780659 0.783304 \n", + "\n", + "[1 rows x 6 columns]\n" + ] + }, + { + "data": { + "text/html": [ + "Query job bf6ef937-9583-4aa8-8313-563638465d5f is DONE. 25.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 4c8b564c-5bbd-4447-babf-e307524962e5 is DONE. 16.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job cd5e337f-6d44-473d-a90b-be8a79bba6bf is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ad80012d-7c6c-4dbf-9271-2ff7f899f174 is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 8fc20587-d8ba-4c0f-bed9-3e1cf3c6ae52 is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + "0 306.435423 151573.84019 0.008539 \n", + "\n", + " median_absolute_error r2_score explained_variance \n", + "0 244.2899 0.737623 0.742859 \n", + "\n", + "[1 rows x 6 columns]\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 90286d2b-e805-4b19-8876-c9973579e9ff is DONE. 25.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ceb6c8f2-16cc-4758-bde8-3e4975ba1452 is DONE. 16.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job f49434fa-a7e0-406a-bbe2-5651595e3418 is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5dd7a277-10fe-4117-a354-ef8668a8b913 is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 4b58b016-9a50-4a66-b86c-8431faad43bf is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + "0 253.349578 112039.741164 0.007153 \n", + "\n", + " median_absolute_error r2_score explained_variance \n", + "0 185.916761 0.823381 0.823456 \n", + "\n", + "[1 rows x 6 columns]\n" + ] + }, + { + "data": { + "text/html": [ + "Query job ca700ecf-0c08-4286-b979-2bc7a0bee89c is DONE. 25.9 kB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job f0731e71-7754-47a2-a553-93a61e712533 is DONE. 16.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ae66d34d-5f0a-4297-9d41-57067ae54a9b is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 7655a649-ceca-4792-b764-fb371f5872ec is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 8b0634c8-73a9-422c-9644-842142dbb059 is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + "0 320.381386 155234.800349 0.008638 \n", + "\n", + " median_absolute_error r2_score explained_variance \n", + "0 306.281263 0.793405 0.794504 \n", + "\n", + "[1 rows x 6 columns]\n" + ] + }, + { + "data": { + "text/html": [ + "Query job bb26cde9-1991-4e0a-8492-b19d15b1b7aa is DONE. 25.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 7ddd0883-492d-46bc-a588-f3cbab2474bb is DONE. 16.5 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5de571e4-d2f9-43c7-b014-3d65a3731b64 is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d20ac7d8-cd21-4a1f-a200-2dfa6373bcdb is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 235e8a80-33ea-4a95-a7d0-34e40a8ca396 is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + "0 303.855563 141869.030392 0.008989 \n", + "\n", + " median_absolute_error r2_score explained_variance \n", + "0 245.102301 0.731737 0.732793 \n", + "\n", + "[1 rows x 6 columns]\n" + ] + } + ], + "source": [ + "for X_train, X_test, y_train, y_test in kf.split(X, y):\n", + " model = linear_model.LinearRegression()\n", + " model.fit(X_train, y_train)\n", + " score = model.score(X_test, y_test)\n", + "\n", + " print(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Use cross_validate Function to Do Cross Validation (Automatic Approach)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 9274ae2e-e9a7-4701-ac64-56632323d02a is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 22f9477b-de02-4c07-b480-c3270a69d7e0 is DONE. 25.9 kB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ebb192b7-4a9e-4238-b4e6-b630e2f94988 is DONE. 16.5 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 44441e8c-8753-41b0-b1b7-9a6c4eab8c74 is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 239fed9a-b488-47da-a0df-a3b7c6ec40f4 is DONE. 25.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job f4248b2d-3430-426c-872d-8590f2878366 is DONE. 16.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d9f6b034-c300-4dd7-91dd-48fa912f2456 is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job e2f39f5b-2f4c-402a-a8d5-a7cff918508d is DONE. 25.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 54cf3710-b5f4-4aec-b11f-0281126a151a is DONE. 16.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 833d13cd-ec59-499b-98f6-95ec18766698 is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 0120e332-0691-44a4-9198-f5c131b8f59c is DONE. 25.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job f4ba7a4c-5fd9-4f97-ab34-a8f139e7472a is DONE. 16.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 857aadfc-2ade-429c-bef8-428e44d48c55 is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 906d6d34-a506-4957-b07f-7e5ed2e0634b is DONE. 25.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 498563db-3e68-4df7-a2d5-83da6adb49ed is DONE. 16.5 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 01af95ca-6288-4253-b379-7327e1c9de88 is DONE. 26.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5ce36d32-6db1-42e5-a8cf-84bb8244a57e is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job e05ec77d-6025-4edd-b5e3-9c4e7a124e71 is DONE. 48 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 418a4a5d-2bb3-41e5-9e7c-9852389a491b is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job b33e30da-cfed-4d6f-b227-f433d97879cb is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 7ad7f0c8-ecae-4ef2-bc91-0ebeb5f88e7b is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a6e8bd12-1122-4c26-b0e1-58342238016c is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job c553439c-9586-479c-92c5-01a0d333125b is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job c598d64c-26b9-49fc-afad-a6544b38cfa2 is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ebcb73e8-1294-4f10-b826-c495046fd714 is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d73f57ba-a25d-4b90-b474-13d81a3e22ab is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'test_score': [ mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + " 0 237.154735 97636.17064 0.005571 \n", + " \n", + " median_absolute_error r2_score explained_variance \n", + " 0 187.883888 0.842018 0.846816 \n", + " \n", + " [1 rows x 6 columns],\n", + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + " 0 304.281635 141966.045867 0.008064 \n", + " \n", + " median_absolute_error r2_score explained_variance \n", + " 0 236.096453 0.762979 0.764008 \n", + " \n", + " [1 rows x 6 columns],\n", + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + " 0 316.380322 157332.146085 0.009699 \n", + " \n", + " median_absolute_error r2_score explained_variance \n", + " 0 222.824496 0.764607 0.765369 \n", + " \n", + " [1 rows x 6 columns],\n", + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + " 0 309.609657 152421.826588 0.009772 \n", + " \n", + " median_absolute_error r2_score explained_variance \n", + " 0 254.163976 0.772954 0.773119 \n", + " \n", + " [1 rows x 6 columns],\n", + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + " 0 339.339345 169760.629993 0.010597 \n", + " \n", + " median_absolute_error r2_score explained_variance \n", + " 0 312.335706 0.741167 0.74118 \n", + " \n", + " [1 rows x 6 columns]],\n", + " 'fit_time': [18.200648623984307,\n", + " 17.565149880945683,\n", + " 18.202434757025912,\n", + " 18.04062689607963,\n", + " 19.370970834977925],\n", + " 'score_time': [4.76077218609862,\n", + " 4.577479084953666,\n", + " 4.581933492794633,\n", + " 4.741644307971001,\n", + " 5.1031754210125655]}" + ] + }, + "execution_count": 10, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "# By using model_selection.cross_validate, the above 2.1 process is automated. The returned scores contains the evaluation results for each fold.\n", + "model = linear_model.LinearRegression()\n", + "scores = model_selection.cross_validate(model, X, y, cv=5)\n", + "scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/regression/bq_dataframes_ml_linear_regression.ipynb b/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb similarity index 100% rename from notebooks/regression/bq_dataframes_ml_linear_regression.ipynb rename to notebooks/ml/bq_dataframes_ml_linear_regression.ipynb diff --git a/notebooks/regression/easy_linear_regression.ipynb b/notebooks/ml/easy_linear_regression.ipynb similarity index 100% rename from notebooks/regression/easy_linear_regression.ipynb rename to notebooks/ml/easy_linear_regression.ipynb diff --git a/notebooks/regression/sklearn_linear_regression.ipynb b/notebooks/ml/sklearn_linear_regression.ipynb similarity index 100% rename from notebooks/regression/sklearn_linear_regression.ipynb rename to notebooks/ml/sklearn_linear_regression.ipynb diff --git a/noxfile.py b/noxfile.py index c704da00a5..714c8333bd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -731,7 +731,7 @@ def notebook(session: nox.Session): # appropriate values and omitting cleanup logic that may break # our test infrastructure. "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb", # Needs DATASET. - "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb", # Needs DATASET_ID. + "notebooks/ml/bq_dataframes_ml_linear_regression.ipynb", # Needs DATASET_ID. "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", # Needs CONNECTION. # TODO(b/332737009): investigate why we get 404 errors, even though # bq_dataframes_llm_code_generation creates a bucket in the sample. 
diff --git a/scripts/setup-project-for-testing.sh b/scripts/setup-project-for-testing.sh index a160784c12..df9cea46a4 100755 --- a/scripts/setup-project-for-testing.sh +++ b/scripts/setup-project-for-testing.sh @@ -57,12 +57,14 @@ function log_and_execute() { ################################################################################ function enable_apis() { for service in aiplatform.googleapis.com \ + artifactregistry.googleapis.com \ bigquery.googleapis.com \ bigqueryconnection.googleapis.com \ bigquerystorage.googleapis.com \ cloudbuild.googleapis.com \ cloudfunctions.googleapis.com \ cloudresourcemanager.googleapis.com \ + compute.googleapis.com \ run.googleapis.com \ ; do log_and_execute gcloud --project=$PROJECT_ID services enable $service @@ -148,6 +150,7 @@ function ensure_bq_connections_with_iam() { southamerica-west1 \ us \ us-central1 \ + us-east5 \ ; do ensure_bq_connection_with_iam "$location" "$BIGFRAMES_RF_CONNECTION_NAME" done diff --git a/tests/data/nested_structs.jsonl b/tests/data/nested_structs.jsonl new file mode 100644 index 0000000000..f57214b0b3 --- /dev/null +++ b/tests/data/nested_structs.jsonl @@ -0,0 +1,2 @@ +{"id": 1, "person": {"name": "Alice", "age":30, "address": {"city": "New York", "country": "USA"}}} +{"id": 2, "person": {"name": "Bob", "age":25, "address": {"city": "London", "country": "UK"}}} \ No newline at end of file diff --git a/tests/data/nested_structs_schema.json b/tests/data/nested_structs_schema.json new file mode 100644 index 0000000000..6692615cef --- /dev/null +++ b/tests/data/nested_structs_schema.json @@ -0,0 +1,39 @@ +[ + { + "name": "id", + "type": "INTEGER", + "mode": "REQUIRED" + }, + { + "name": "person", + "type": "RECORD", + "fields": [ + { + "name": "name", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "age", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "RECORD", + "fields": [ + { + "name": "city", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "country", + "type": "STRING", + "mode": "NULLABLE" + } + ] + } + ] + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index d9246eecfb..217cf71e0c 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -32,6 +32,7 @@ import ibis.backends import numpy as np import pandas as pd +import pyarrow as pa import pytest import pytz import test_utils.prefixer @@ -290,6 +291,7 @@ def load_test_data_tables( ("scalars", "scalars_schema.json", "scalars.jsonl"), ("scalars_too", "scalars_schema.json", "scalars.jsonl"), ("nested", "nested_schema.json", "nested.jsonl"), + ("nested_structs", "nested_structs_schema.json", "nested_structs.jsonl"), ("repeated", "repeated_schema.json", "repeated.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), ("time_series", "time_series_schema.json", "time_series.jsonl"), @@ -367,6 +369,11 @@ def nested_table_id(test_data_tables) -> str: return test_data_tables["nested"] +@pytest.fixture(scope="session") +def nested_structs_table_id(test_data_tables) -> str: + return test_data_tables["nested_structs"] + + @pytest.fixture(scope="session") def repeated_table_id(test_data_tables) -> str: return test_data_tables["repeated"] @@ -412,6 +419,43 @@ def nested_pandas_df() -> pd.DataFrame: return df +@pytest.fixture(scope="session") +def nested_structs_df( + nested_structs_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return session.read_gbq(nested_structs_table_id, index_col="id") + + 
+@pytest.fixture(scope="session") +def nested_structs_pandas_df() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + + df = pd.read_json( + DATA_DIR / "nested_structs.jsonl", + lines=True, + ) + df = df.set_index("id") + return df + + +@pytest.fixture(scope="session") +def nested_structs_pandas_type() -> pd.ArrowDtype: + address_struct_schema = pa.struct( + [pa.field("city", pa.string()), pa.field("country", pa.string())] + ) + + person_struct_schema = pa.struct( + [ + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + pa.field("address", address_struct_schema), + ] + ) + + return pd.ArrowDtype(person_struct_schema) + + @pytest.fixture(scope="session") def repeated_df( repeated_table_id: str, session: bigframes.Session diff --git a/tests/system/large/ml/test_model_selection.py b/tests/system/large/ml/test_model_selection.py new file mode 100644 index 0000000000..c1856a1537 --- /dev/null +++ b/tests/system/large/ml/test_model_selection.py @@ -0,0 +1,64 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.ml import linear_model, model_selection +from tests.system import utils + + +@pytest.mark.parametrize( + ("cv", "n_fold"), + ( + pytest.param( + None, + 5, + ), + pytest.param( + 4, + 4, + ), + pytest.param( + model_selection.KFold(3), + 3, + ), + ), +) +def test_cross_validate(penguins_df_default_index, cv, n_fold): + model = linear_model.LinearRegression() + df = penguins_df_default_index.dropna() + X = df[ + [ + "species", + "island", + "culmen_length_mm", + ] + ] + y = df["body_mass_g"] + + cv_results = model_selection.cross_validate(model, X, y, cv=cv) + + assert "test_score" in cv_results + assert "fit_time" in cv_results + assert "score_time" in cv_results + + assert len(cv_results["test_score"]) == n_fold + assert len(cv_results["fit_time"]) == n_fold + assert len(cv_results["score_time"]) == n_fold + + utils.check_pandas_df_schema_and_index( + cv_results["test_score"][0].to_pandas(), + columns=utils.ML_REGRESSION_METRICS, + index=1, + ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index a4a09731a1..78fed6b82f 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -14,6 +14,7 @@ import pytest +from bigframes import exceptions from bigframes.ml import llm import bigframes.pandas as bpd from tests.system import utils @@ -447,3 +448,16 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index): ], index=6, ) + + +def test_palm2_text_generator_deprecated(): + with pytest.warns(exceptions.ApiDeprecationWarning): + llm.PaLM2TextGenerator() + + +def test_palm2_text_embedding_deprecated(): + with pytest.warns(exceptions.ApiDeprecationWarning): + try: + llm.PaLM2TextEmbeddingGenerator() + except (Exception): + pass diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 340df93791..8c2912edd4 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ 
-1477,7 +1477,7 @@ def test_get_dtypes_array_struct_table(nested_df): pa.list_( pa.struct( [ - ( + pa.field( "data", pa.list_( pa.struct( @@ -1487,6 +1487,7 @@ def test_get_dtypes_array_struct_table(nested_df): ], ), ), + nullable=False, ), ("timestamp", pa.timestamp("us", "UTC")), ("category", pa.string()), @@ -2195,17 +2196,15 @@ def test_series_binop_axis_index( [ ((1000, 2000, 3000)), (pd.Index([1000, 2000, 3000])), - (bf_indexes.Index([1000, 2000, 3000])), (pd.Series((1000, 2000), index=["int64_too", "float64_col"])), ], ids=[ "tuple", "pd_index", - "bf_index", "pd_series", ], ) -def test_listlike_binop_axis_1(scalars_dfs, input): +def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): scalars_df, scalars_pandas_df = scalars_dfs df_columns = ["int64_col", "float64_col", "int64_too"] @@ -2218,6 +2217,21 @@ def test_listlike_binop_axis_1(scalars_dfs, input): assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) +def test_listlike_binop_axis_1_bf_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = ( + scalars_df[df_columns] + .add(bf_indexes.Index([1000, 2000, 3000]), axis=1) + .to_pandas() + ) + pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + def test_binop_with_self_aggregate(session, scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs @@ -4807,20 +4821,33 @@ def test_to_gbq_table_labels(scalars_df_index): pytest.param(["A", "C"], True, id="two_arrays_true"), ], ) -def test_dataframe_explode(col_names, ignore_index): +def test_dataframe_explode(col_names, ignore_index, session): data = { "A": [[0, 1, 2], [], [3, 4]], "B": 3, "C": [["a", "b", "c"], np.nan, ["d", "e"]], } - df = bpd.DataFrame(data) + + metrics = session._metrics + df = bpd.DataFrame(data, session=session) pd_df = df.to_pandas() + pd_result = pd_df.explode(col_names, ignore_index=ignore_index) + bf_result = df.explode(col_names, ignore_index=ignore_index) + + # Check that to_pandas() results in at most a single query execution + execs_pre = metrics.execution_count + bf_materialized = bf_result.to_pandas() + execs_post = metrics.execution_count + pd.testing.assert_frame_equal( - df.explode(col_names, ignore_index=ignore_index).to_pandas(), - pd_df.explode(col_names, ignore_index=ignore_index), + bf_materialized, + pd_result, check_index_type=False, check_dtype=False, ) + # we test this property on this method in particular as compilation + # is non-deterministic and won't use the query cache as implemented + assert execs_post - execs_pre <= 1 @pytest.mark.parametrize( @@ -4863,3 +4890,120 @@ def test_dataframe_explode_reserve_order(ignore_index, ordered): def test_dataframe_explode_xfail(col_names): df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) df.explode(col_names) + + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("on", "rule", "origin"), + [ + pytest.param("datetime_col", "100D", "start"), + pytest.param("datetime_col", "30W", "start"), + pytest.param("datetime_col", "5M", "epoch"), + pytest.param("datetime_col", "3Q", "start_day"), + pytest.param("datetime_col", "3YE", "start"), + pytest.param( + "int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError) + ), + pytest.param( + "datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError) + ), + ], +) +def test__resample_with_column( + scalars_df_index, scalars_pandas_df_index, on, rule, origin +): + bf_result = ( + 
scalars_df_index._resample(rule=rule, on=on, origin=origin)[ + ["int64_col", "int64_too"] + ] + .max() + .to_pandas() + ) + pd_result = scalars_pandas_df_index.resample(rule=rule, on=on, origin=origin)[ + ["int64_col", "int64_too"] + ].max() + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("append", "level", "col", "rule"), + [ + pytest.param(False, None, "timestamp_col", "100d"), + pytest.param(True, 1, "timestamp_col", "1200h"), + pytest.param(False, None, "datetime_col", "100d"), + ], +) +def test__resample_with_index( + scalars_df_index, scalars_pandas_df_index, append, level, col, rule +): + scalars_df_index = scalars_df_index.set_index(col, append=append) + scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) + bf_result = ( + scalars_df_index[["int64_col", "int64_too"]] + ._resample(rule=rule, level=level) + .min() + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[["int64_col", "int64_too"]] + .resample(rule=rule, level=level) + .min() + ) + assert_pandas_df_equal(bf_result, pd_result) + + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("rule", "origin", "data"), + [ + ( + "5h", + "epoch", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="1h" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ( + "75min", + "start_day", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="10min" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ( + "7s", + "epoch", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="1s" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ], +) +def test__resample_start_time(rule, origin, data): + col = "timestamp_col" + scalars_df_index = bpd.DataFrame(data).set_index(col) + scalars_pandas_df_index = pd.DataFrame(data).set_index(col) + scalars_pandas_df_index.index.name = None + + bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() + + pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index f68589f431..c1ca0d04c0 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -1225,7 +1225,6 @@ def add_pandas(s: pd.Series) -> float: pytest.param("mask"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_remote_function_unary_partial_ordering_mode_assign( unordered_session, dataset_id_permanent, method ): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 793a4062c5..624e287f8d 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -19,6 +19,7 @@ import geopandas as gpd # type: ignore import numpy +from packaging.version import Version import pandas as pd import pyarrow as pa # type: ignore import pytest @@ -3912,3 +3913,61 @@ def test_series_explode_null(data): s.to_pandas().explode(), check_dtype=False, ) + + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("append", "level", "col", "rule"), + [ + pytest.param(False, None, "timestamp_col", "75D"), + pytest.param(True, 1, "timestamp_col", "25W"), + pytest.param(False, None, "datetime_col", "3ME"), + pytest.param(True, 
"timestamp_col", "timestamp_col", "1YE"), + ], +) +def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): + scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] + scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ + "int64_col" + ] + bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() + pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_struct_get_field_by_attribute( + nested_structs_df, nested_structs_pandas_df, nested_structs_pandas_type +): + if Version(pd.__version__) < Version("2.2.0"): + pytest.skip("struct accessor is not supported before pandas 2.2") + + bf_series = nested_structs_df["person"] + df_series = nested_structs_pandas_df["person"].astype(nested_structs_pandas_type) + + pd.testing.assert_series_equal( + bf_series.address.city.to_pandas(), + df_series.struct.field("address").struct.field("city"), + check_dtype=False, + check_index=False, + ) + pd.testing.assert_series_equal( + bf_series.address.country.to_pandas(), + df_series.struct.field("address").struct.field("country"), + check_dtype=False, + check_index=False, + ) + + +def test_series_struct_fields_in_dir(nested_structs_df): + series = nested_structs_df["person"] + + assert "age" in dir(series) + assert "address" in dir(series) + assert "city" in dir(series.address) + assert "country" in dir(series.address) + + +def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df): + series = nested_structs_df["person"] + + assert series.name == "person" diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index ed3e38e6f8..17e8b99704 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -47,10 +47,10 @@ def test_read_gbq_tokyo( result = df.sort_index().to_pandas() expected = scalars_pandas_df_index - _, query_job = session_tokyo._execute(df._block.expr) - assert query_job.location == tokyo_location + result = session_tokyo._executor.execute(df._block.expr) + assert result.query_job.location == tokyo_location - pd.testing.assert_frame_equal(result, expected) + assert len(expected) == result.total_rows @pytest.mark.parametrize( @@ -671,10 +671,10 @@ def test_read_pandas_tokyo( result = df.to_pandas() expected = scalars_pandas_df_index - _, query_job = session_tokyo._execute(df._block.expr) - assert query_job.location == tokyo_location + result = session_tokyo._executor.execute(df._block.expr) + assert result.query_job.location == tokyo_location - pd.testing.assert_frame_equal(result, expected) + assert len(expected) == result.total_rows @utils.skip_legacy_pandas diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 5e124d73cd..fe3411e266 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -198,3 +198,51 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session): with warnings.catch_warnings(): warnings.simplefilter("error") df.groupby("a").head(3) + + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("rule", "origin", "data"), + [ + ( + "5h", + "epoch", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="1h" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ( + "5h", + "epoch", + { + "timestamp_col": pd.DatetimeIndex( + pd.date_range( + start="2021-01-01 13:00:00", 
periods=15, freq="1h" + ).tolist() + + pd.date_range( + start="2021-01-01 13:00:00", periods=15, freq="1h" + ).tolist() + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ], +) +def test__resample_with_index(unordered_session, rule, origin, data): + col = "timestamp_col" + scalars_df_index = bpd.DataFrame(data, session=unordered_session).set_index(col) + scalars_pandas_df_index = pd.DataFrame(data).set_index(col) + scalars_pandas_df_index.index.name = None + + bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() + + pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) diff --git a/tests/unit/core/test_expression.py b/tests/unit/core/test_expression.py index f46c47a582..72e200f007 100644 --- a/tests/unit/core/test_expression.py +++ b/tests/unit/core/test_expression.py @@ -13,13 +13,16 @@ # limitations under the License. import bigframes.core.expression as ex +import bigframes.core.identifiers as ids import bigframes.dtypes as dtypes import bigframes.operations as ops def test_expression_dtype_simple(): expression = ops.add_op.as_expr("a", "b") - result = expression.output_type({"a": dtypes.INT_DTYPE, "b": dtypes.INT_DTYPE}) + result = expression.output_type( + {ids.ColumnId("a"): dtypes.INT_DTYPE, ids.ColumnId("b"): dtypes.INT_DTYPE} + ) assert result == dtypes.INT_DTYPE @@ -28,7 +31,9 @@ def test_expression_dtype_nested(): "a", ops.abs_op.as_expr(ops.sub_op.as_expr("b", ex.const(3.14))) ) - result = expression.output_type({"a": dtypes.INT_DTYPE, "b": dtypes.INT_DTYPE}) + result = expression.output_type( + {ids.ColumnId("a"): dtypes.INT_DTYPE, ids.ColumnId("b"): dtypes.INT_DTYPE} + ) assert result == dtypes.FLOAT_DTYPE diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index 81d02466ef..2fa07aed35 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -25,6 +25,7 @@ import pyarrow # type: ignore import pytest +import bigframes.core.schema import bigframes.features import bigframes.pandas import bigframes.session._io.pandas @@ -445,7 +446,13 @@ def test_arrow_to_pandas( dtypes: Dict, expected: pandas.DataFrame, ): - actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + schema = bigframes.core.schema.ArraySchema( + tuple( + bigframes.core.schema.SchemaItem(name, dtype) + for name, dtype in dtypes.items() + ) + ) + actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, schema) pandas.testing.assert_series_equal(actual.dtypes, expected.dtypes) # assert_frame_equal is converting to numpy internally, which causes some @@ -478,8 +485,14 @@ def test_arrow_to_pandas( def test_arrow_to_pandas_wrong_size_dtypes( arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict ): - with pytest.raises(ValueError, match=f"Number of types {len(dtypes)}"): - bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + schema = bigframes.core.schema.ArraySchema( + tuple( + bigframes.core.schema.SchemaItem(name, dtype) + for name, dtype in dtypes.items() + ) + ) + with pytest.raises(ValueError, match=f"Number of types {len(schema)}"): + bigframes.session._io.pandas.arrow_to_pandas(arrow_table, schema) def test_read_pandas_with_bigframes_dataframe(): diff --git a/tests/unit/test_planner.py b/tests/unit/test_planner.py index 27ec7d5921..491f3de6fa 100644 --- a/tests/unit/test_planner.py +++ 
b/tests/unit/test_planner.py @@ -20,6 +20,7 @@ import bigframes.core as core import bigframes.core.expression as ex +import bigframes.core.identifiers as ids import bigframes.core.schema import bigframes.operations as ops import bigframes.session.planner as planner @@ -53,7 +54,7 @@ def test_session_aware_caching_project_filter(): target.node, [obj.node for obj in session_objects] ) assert result == LEAF.node - assert cluster_cols == ["col_a"] + assert cluster_cols == [ids.ColumnId("col_a")] def test_session_aware_caching_project_multi_filter(): @@ -76,7 +77,7 @@ target.node, [obj.node for obj in session_objects] ) assert result == LEAF.node - assert cluster_cols == ["col_a", "col_b"] + assert cluster_cols == [ids.ColumnId("col_a"), ids.ColumnId("col_b")] def test_session_aware_caching_unusable_filter(): @@ -117,4 +118,4 @@ ], ) assert result == LEAF.promote_offsets()[0].node - assert cluster_cols == ["col_a"] + assert cluster_cols == [ids.ColumnId("col_a")]
diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py new file mode 100644 index 0000000000..43e155da7d --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py @@ -0,0 +1,46 @@ +""" +The :mod:`sklearn.model_selection._validation` module includes classes and +functions to validate the model. +""" + +# Author: Alexandre Gramfort +# Gael Varoquaux +# Olivier Grisel +# Raghav RV +# Michal Karbownik +# License: BSD 3 clause + + +def cross_validate(estimator, X, y=None, *, cv=None): + """Evaluate metric(s) by cross-validation and also record fit/score times. + + Args: + estimator: + bigframes.ml model that implements fit(). + The object to use to fit the data. + + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The data to fit. + + y (bigframes.dataframe.DataFrame, bigframes.series.Series or None): + The target variable to try to predict in the case of supervised learning. Defaults to None. + + cv (int, bigframes.ml.model_selection.KFold or None): + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `KFold`, + - bigframes.ml.model_selection.KFold instance. + + Returns: + Dict[str, List]: A dict of arrays containing the score/time arrays for each scorer is returned. The keys for this ``dict`` are: + + ``test_score`` + The score array for test scores on each cv split. + ``fit_time`` + The time for fitting the estimator on the train + set for each cv split. + ``score_time`` + The time for scoring the estimator on the test set for each + cv split."""
diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 60f4942175..c07f26bc6f 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.20.0" +__version__ = "1.21.0"
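
For reviewers who want to exercise the new API end to end, below is a minimal usage sketch of model_selection.cross_validate; it is not part of the patch. It assumes the public bigquery-public-data.ml_datasets.penguins table is reachable from the session, and the feature and label columns mirror those used in tests/system/large/ml/test_model_selection.py above.

import bigframes.pandas as bpd
from bigframes.ml import linear_model, model_selection

# Load and clean sample data (the public penguins table is an assumption here;
# any bigframes DataFrame/Series pair works the same way).
df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins").dropna()
X = df[["species", "island", "culmen_length_mm"]]
y = df["body_mass_g"]

# cv may be None (default 5 folds), an int, or a model_selection.KFold instance.
model = linear_model.LinearRegression()
scores = model_selection.cross_validate(model, X, y, cv=model_selection.KFold(3))

# Each "test_score" entry is a one-row DataFrame of regression metrics for a fold;
# "fit_time" and "score_time" are plain lists of seconds, one value per fold.
print(scores["fit_time"])
print(scores["test_score"][0].to_pandas())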
