From 52b7786c3a28da6c29e3ddf12629802215194ad9 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 5 Aug 2024 16:21:51 -0700 Subject: [PATCH 01/15] fix: Fix caching from generating row numbers in partial ordering mode (#872) --- bigframes/series.py | 2 +- bigframes/session/__init__.py | 4 +++- tests/system/small/test_unordered.py | 15 ++++++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index d41553d0d7..069c469a85 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -641,7 +641,7 @@ def head(self, n: int = 5) -> Series: def tail(self, n: int = 5) -> Series: return typing.cast(Series, self.iloc[-n:]) - def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: + def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series: """ Preview n arbitrary elements from the series without guarantees about row selection or ordering. diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index dc1da488a1..8ff5862bfc 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1997,8 +1997,10 @@ def _cache_with_session_awareness(self, array_value: core.ArrayValue) -> None: ) if len(cluster_cols) > 0: self._cache_with_cluster_cols(core.ArrayValue(target), cluster_cols) - else: + elif self._strictly_ordered: self._cache_with_offsets(core.ArrayValue(target)) + else: + self._cache_with_cluster_cols(core.ArrayValue(target), []) def _simplify_with_caching(self, array_value: core.ArrayValue): """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 9f85ec99f9..5e124d73cd 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -19,7 +19,11 @@ import bigframes.exceptions import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_unordered_mode_sql_no_hash(unordered_session): @@ -51,6 +55,15 @@ def test_unordered_mode_cache_aggregate(unordered_session): assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) +def test_unordered_mode_series_peek(unordered_session): + pd_series = pd.Series([1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) + bf_series = bpd.Series(pd_series, session=unordered_session) + pd_result = pd_series.groupby(pd_series % 4).sum() + bf_peek = bf_series.groupby(bf_series % 4).sum().peek(2) + + assert_series_equal(bf_peek, pd_result.reindex(bf_peek.index)) + + def test_unordered_mode_single_aggregate(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) bf_df = bpd.DataFrame(pd_df, session=unordered_session) From 6e6f9df55d435afe0b3ade728ca06826e92a6ee6 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:22:53 -0700 Subject: [PATCH 02/15] docs: update streaming notebook (#887) --- notebooks/streaming/streaming_dataframe.ipynb | 203 +++++++++++------- 1 file changed, 127 insertions(+), 76 deletions(-) diff --git a/notebooks/streaming/streaming_dataframe.ipynb b/notebooks/streaming/streaming_dataframe.ipynb index d4cc255fa5..9b52c2d71e 100644 --- a/notebooks/streaming/streaming_dataframe.ipynb +++ b/notebooks/streaming/streaming_dataframe.ipynb @@ -17,10 +17,22 @@ "cell_type": "code", "execution_count": 1, 
"metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'1.13.0'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import bigframes\n", - "import bigframes.streaming as bst" + "# make sure bigframes version >= 1.12.0\n", + "bigframes.__version__" ] }, { @@ -29,10 +41,46 @@ "metadata": {}, "outputs": [], "source": [ - "bigframes.options._bigquery_options.project = \"bigframes-load-testing\"\n", + "import bigframes.pandas as bpd\n", + "import bigframes.streaming as bst\n", + "bigframes.options._bigquery_options.project = \"bigframes-load-testing\" # Change to your own project ID\n", "job_id_prefix = \"test_streaming_\"" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 65df3a2f-cda8-405d-8b38-20a755f9b9a0 is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'birds.penguins_bigtable_streaming'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Copy a table from the public dataset for streaming jobs. Any changes to the table can be reflected in the streaming destination.\n", + "df = bpd.read_gbq(\"bigquery-public-data.ml_datasets.penguins\")\n", + "df.to_gbq(\"birds.penguins_bigtable_streaming\", if_exists=\"replace\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -43,13 +91,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/session/__init__.py:773: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + " warnings.warn(\n", "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", " warnings.warn(\n" ] @@ -61,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -82,7 +132,7 @@ { "data": { "text/html": [ - "Query job d57200dd-e6f1-42c7-876b-7f4a54994ae6 is DONE. 0 Bytes processed. Open Job" + "Query job dd20bd9d-4844-43e4-86ab-95759d7e673a is DONE. 2.7 kB processed. Open Job" ], "text/plain": [ "" @@ -104,7 +154,7 @@ { "data": { "text/html": [ - "Query job 1decce4a-eb32-49f4-8e47-7bda0220037a is DONE. 28.9 kB processed. Open Job" + "Query job 873e44ee-76e9-4254-83d3-04cf36fbd140 is DONE. 28.9 kB processed. 
Open Job" ], "text/plain": [ "" @@ -144,151 +194,151 @@ " 0\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3875\n", + " 3875.0\n", " \n", " \n", " 1\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 2900\n", + " 2900.0\n", " \n", " \n", " 2\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3725\n", + " 3725.0\n", " \n", " \n", " 3\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 2975\n", + " 2975.0\n", " \n", " \n", " 4\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3050\n", + " 3050.0\n", " \n", " \n", " 5\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 2700\n", + " 2700.0\n", " \n", " \n", " 6\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 3900\n", + " 3900.0\n", " \n", " \n", " 7\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3825\n", + " 3825.0\n", " \n", " \n", " 8\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3775\n", + " 3775.0\n", " \n", " \n", " 9\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 3350\n", + " 3350.0\n", " \n", " \n", " 10\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3900\n", + " 3900.0\n", " \n", " \n", " 11\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3650\n", + " 3650.0\n", " \n", " \n", " 12\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3200\n", + " 3200.0\n", " \n", " \n", " 13\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3650\n", + " 3650.0\n", " \n", " \n", " 14\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 3700\n", + " 3700.0\n", " \n", " \n", " 15\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3800\n", + " 3800.0\n", " \n", " \n", " 16\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3950\n", + " 3950.0\n", " \n", " \n", " 17\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3350\n", + " 3350.0\n", " \n", " \n", " 18\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 3100\n", + " 3100.0\n", " \n", " \n", " 19\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3750\n", + " 3750.0\n", " \n", " \n", " 20\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 3550\n", + " 3550.0\n", " \n", " \n", " 21\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3400\n", + " 3400.0\n", " \n", " \n", " 22\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3450\n", + " 3450.0\n", " \n", " \n", " 23\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Torgersen\n", - " 3600\n", + " 3600.0\n", " \n", " \n", " 24\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 3650\n", + " 3650.0\n", " \n", " \n", "\n", @@ -297,37 +347,37 @@ ], "text/plain": [ " species rowkey body_mass_g\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3875\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 2900\n", - " Adelie Penguin (Pygoscelis adeliae) Biscoe 3725\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 2975\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3050\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 2700\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 3900\n", - " Adelie Penguin (Pygoscelis adeliae) Biscoe 3825\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3775\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 3350\n", - " Adelie Penguin (Pygoscelis adeliae) Biscoe 3900\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3650\n", - " Adelie Penguin (Pygoscelis 
adeliae) Biscoe 3200\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3650\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 3700\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3800\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3950\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3350\n", - " Adelie Penguin (Pygoscelis adeliae) Dream 3100\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3750\n", - " Adelie Penguin (Pygoscelis adeliae) Biscoe 3550\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3400\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3450\n", - " Adelie Penguin (Pygoscelis adeliae) Torgersen 3600\n", - "Chinstrap penguin (Pygoscelis antarctica) Dream 3650\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3875.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 2900.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3725.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 2975.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3050.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 2700.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3900.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3825.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3775.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3350.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3900.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3650.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3200.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3650.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3700.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3950.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3350.0\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3100.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3750.0\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3550.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3400.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3450.0\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3600.0\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3650.0\n", "...\n", "\n", "[165 rows x 3 columns]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -351,22 +401,22 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:338: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:341: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", " warnings.warn(\n" ] } ], "source": [ - "job = sdf.to_bigtable(instance=\"streaming-testing-instance\",\n", - " table=\"garrettwu-no-col-family\",\n", - " service_account_email=\"streaming-testing-admin@bigframes-load-testing.iam.gserviceaccount.com\",\n", + "job = sdf.to_bigtable(instance=\"streaming-testing-instance\", # Change to your own Bigtable instance name\n", + " table=\"garrettwu-no-col-family\", # Change to your own Bigtable table name\n", + " service_account_email=\"streaming-testing-admin@bigframes-load-testing.iam.gserviceaccount.com\", # Change to your own service account\n", " app_profile=None,\n", " 
truncate=True,\n", " overwrite=True,\n", @@ -378,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -397,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -406,7 +456,7 @@ "True" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -419,13 +469,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### PubSub\n", - "Create Pubsub streaming job" + "### Pub/Sub\n", + "Create Pub/Sub streaming job" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -438,27 +488,28 @@ } ], "source": [ + "# Pub/Sub requires a single column\n", "sdf = sdf[[\"rowkey\"]]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:453: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:456: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", " warnings.warn(\n" ] } ], "source": [ "job = sdf.to_pubsub(\n", - " topic=\"penguins\",\n", - " service_account_email=\"streaming-testing@bigframes-load-testing.iam.gserviceaccount.com\",\n", + " topic=\"penguins\", # Change to your own Pub/Sub topic ID\n", + " service_account_email=\"streaming-testing@bigframes-load-testing.iam.gserviceaccount.com\", # Change to your own service account\n", " job_id=None,\n", " job_id_prefix=job_id_prefix,\n", " )" @@ -466,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -485,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -494,7 +545,7 @@ "True" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } From eb6080460344aff2fabb7864536ea4fe24c5fbef Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 6 Aug 2024 15:02:19 -0700 Subject: [PATCH 03/15] perf: Generate SQL with fewer CTEs (#877) --- bigframes/core/compile/compiled.py | 135 +++++++++++++++++------------ bigframes/core/compile/compiler.py | 1 - bigframes/core/window_spec.py | 13 ++- tests/unit/session/test_session.py | 3 +- 4 files changed, 93 insertions(+), 59 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 538789f9d7..cae527931c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -24,6 +24,7 @@ import ibis.backends.bigquery as ibis_bigquery import ibis.common.deferred # type: ignore import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.operations as ibis_ops import ibis.expr.types as ibis_types import pandas @@ -36,7 +37,6 @@ from bigframes.core.ordering import ( ascending_over, encode_order_string, - IntegerEncoding, join_orderings, OrderingExpression, RowOrdering, @@ -71,19 +71,16 @@ def __init__( # Allow creating a DataFrame directly from an Ibis table expression. # TODO(swast): Validate that each column references the same table (or # no table for literal values). 
- self._columns = tuple(columns) + self._columns = tuple( + column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) else column + for column in columns + ) # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = { - ( - column.resolve(table) - # TODO(https://github.com/ibis-project/ibis/issues/7613): use - # public API to refer to Deferred type. - if isinstance(column, ibis.common.deferred.Deferred) - else column - ).get_name(): column - for column in self._columns - } + self._column_names = {column.get_name(): column for column in self._columns} @property def columns(self) -> typing.Tuple[ibis_types.Value, ...]: @@ -139,10 +136,6 @@ def projection( for expression, id in expression_id_pairs ] result = self._select(tuple(values)) # type: ignore - - # Need to reproject to convert ibis Scalar to ibis Column object - if any(exp_id[0].is_const for exp_id in expression_id_pairs): - result = result._reproject_to_table() return result @abc.abstractmethod @@ -300,8 +293,6 @@ def _to_ibis_expr( ArrayValue objects are sorted, so the following options are available to reflect this in the ibis expression. - * "offset_col": Zero-based offsets are generated as a column, this will - not sort the rows however. * "string_encoded": An ordered string column is provided in output table. * "unordered": No ordering information will be provided in output. Only value columns are projected. @@ -355,6 +346,10 @@ def _to_ibis_expr( return table def filter(self, predicate: ex.Expression) -> UnorderedIR: + if any(map(is_window, map(self._get_ibis_column, predicate.unbound_variables))): + # ibis doesn't support qualify syntax, so create CTE if filtering over window expression + # https://github.com/ibis-project/ibis/issues/9775 + return self._reproject_to_table().filter(predicate) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} condition = op_compiler.compile_expression(predicate, bindings) return self._filter(condition) @@ -785,15 +780,33 @@ def promote_offsets(self, col_id: str) -> OrderedIR: """ # Special case: offsets already exist ordering = self._ordering + # Case 1, already have offsets, just create column from them + if ordering.is_sequential and (ordering.total_order_col is not None): + expr_builder = self.builder() + expr_builder.columns = [ + self._compile_expression( + ordering.total_order_col.scalar_expression + ).name(col_id), + *self.columns, + ] + return expr_builder.build() + # Cannot nest analytic expressions, so reproject to cte first if needed. 
+ # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql) + # Seee: https://github.com/ibis-project/ibis/issues/9773 + can_directly_window = not any( + map(lambda x: is_literal(x) or is_window(x), self._ibis_order) + ) + if not can_directly_window: + return self._reproject_to_table().promote_offsets(col_id) - if (not ordering.is_sequential) or (not ordering.total_order_col): - return self._project_offsets().promote_offsets(col_id) + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self._reduced_predicate) + offsets = ibis.row_number().over(window) expr_builder = self.builder() expr_builder.columns = [ - self._compile_expression(ordering.total_order_col.scalar_expression).name( - col_id - ), *self.columns, + offsets.name(col_id), ] return expr_builder.build() @@ -806,7 +819,6 @@ def project_window_op( output_name=None, *, never_skip_nulls=False, - skip_reproject_unsafe: bool = False, ) -> OrderedIR: """ Creates a new expression based on this expression with unary operation applied to one column. @@ -815,8 +827,25 @@ def project_window_op( window_spec: a specification of the window over which to apply the operator output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided never_skip_nulls: will disable null skipping for operators that would otherwise do so - skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ + # Cannot nest analytic expressions, so reproject to cte first if needed. + # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql) + # See: https://github.com/ibis-project/ibis/issues/9773 + used_exprs = map( + self._get_any_column, [column_name, *window_spec.all_referenced_columns] + ) + can_directly_window = not any( + map(lambda x: is_literal(x) or is_window(x), used_exprs) + ) + if not can_directly_window: + return self._reproject_to_table().project_window_op( + column_name, + op, + window_spec, + output_name, + never_skip_nulls=never_skip_nulls, + ) + column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) window = self._ibis_window_from_spec( window_spec, require_total_order=op.uses_total_row_ordering @@ -861,8 +890,7 @@ def project_window_op( window_op = case_statement result = self._set_or_replace_by_id(output_name or column_name, window_op) - # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. - return result._reproject_to_table() if not skip_reproject_unsafe else result + return result def _reproject_to_table(self) -> OrderedIR: table = self._to_ibis_expr( @@ -944,7 +972,7 @@ def _to_ibis_expr( expose_hidden_cols: bool = False, fraction: Optional[float] = None, col_id_overrides: typing.Mapping[str, str] = {}, - ordering_mode: Literal["string_encoded", "offset_col", "unordered"], + ordering_mode: Literal["string_encoded", "unordered"], order_col_name: Optional[str] = ORDER_ID_COLUMN, ): """ @@ -953,8 +981,7 @@ def _to_ibis_expr( ArrayValue objects are sorted, so the following options are available to reflect this in the ibis expression. - * "offset_col": Zero-based offsets are generated as a column, this will - not sort the rows however. 
+ * "string_encoded": An ordered string column is provided in output table. * "unordered": No ordering information will be provided in output. Only value columns are projected. @@ -981,10 +1008,9 @@ def _to_ibis_expr( """ assert ordering_mode in ( "string_encoded", - "offset_col", "unordered", ) - if expose_hidden_cols and ordering_mode in ("ordered_col", "offset_col"): + if expose_hidden_cols and ordering_mode in ("ordered_col"): raise ValueError( f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" ) @@ -1034,6 +1060,10 @@ def _to_ibis_expr( return table def filter(self, predicate: ex.Expression) -> OrderedIR: + if any(map(is_window, map(self._get_ibis_column, predicate.unbound_variables))): + # ibis doesn't support qualify syntax, so create CTE if filtering over window expression + # https://github.com/ibis-project/ibis/issues/9775 + return self._reproject_to_table().filter(predicate) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} condition = op_compiler.compile_expression(predicate, bindings) return self._filter(condition) @@ -1174,27 +1204,6 @@ def _bake_ordering(self) -> OrderedIR: predicates=self._predicates, ) - def _project_offsets(self) -> OrderedIR: - """Create a new expression that contains offsets. Should only be executed when - offsets are needed for an operations. Has no effect on expression semantics.""" - if self._ordering.is_sequential: - return self - table = self._to_ibis_expr( - ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN - ) - columns = [table[column_name] for column_name in self._column_names] - ordering = TotalOrdering( - ordering_value_columns=tuple([ascending_over(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - return OrderedIR( - table, - columns=columns, - hidden_ordering_columns=[table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - def _create_order_columns( self, ordering_mode: str, @@ -1202,9 +1211,7 @@ def _create_order_columns( expose_hidden_cols: bool, ) -> typing.Sequence[ibis_types.Value]: # Generate offsets if current ordering id semantics are not sufficiently strict - if ordering_mode == "offset_col": - return (self._create_offset_column().name(order_col_name),) - elif ordering_mode == "string_encoded": + if ordering_mode == "string_encoded": return (self._create_string_ordering_column().name(order_col_name),) elif expose_hidden_cols: return self._hidden_ordering_columns @@ -1328,6 +1335,22 @@ def build(self) -> OrderedIR: ) +def is_literal(column: ibis_types.Value) -> bool: + # Unfortunately, Literals in ibis are not "Columns"s and therefore can't be aggregated. 
+ return not isinstance(column, ibis_types.Column) + + +def is_window(column: ibis_types.Value) -> bool: + matches = ( + (column) + .op() + .find_topmost( + lambda x: isinstance(x, (ibis_ops.WindowFunction, ibis_ops.Relation)) + ) + ) + return any(isinstance(op, ibis_ops.WindowFunction) for op in matches) + + def _reduce_predicate_list( predicate_list: typing.Collection[ibis_types.BooleanValue], ) -> ibis_types.BooleanValue: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index c7f8c5ab59..8fb1f7ab3a 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -304,7 +304,6 @@ def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): node.window_spec, node.output_name, never_skip_nulls=node.never_skip_nulls, - skip_reproject_unsafe=node.skip_reproject_unsafe, ) return result if ordered else result.to_unordered() diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index 57c57b451a..f011e2848d 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -14,7 +14,8 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Optional, Tuple, Union +import itertools +from typing import Optional, Set, Tuple, Union import bigframes.core.ordering as orderings @@ -162,3 +163,13 @@ def row_bounded(self): to calculate deterministically. """ return isinstance(self.bounds, RowsWindowBounds) + + @property + def all_referenced_columns(self) -> Set[str]: + """ + Return list of all variables reference ind the window. + """ + ordering_vars = itertools.chain.from_iterable( + item.scalar_expression.unbound_variables for item in self.ordering + ) + return set(itertools.chain(self.grouping_keys, ordering_vars)) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 31029abd67..2f7eaa567a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -246,7 +246,8 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64 index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, ) - # We expect a window operation because we specificaly requested a sequential index. + # We expect a window operation because we specificaly requested a sequential index and named it. 
+ df.index.name = "named_index" generated_sql = df.sql.casefold() assert "OVER".casefold() in generated_sql assert "ROW_NUMBER()".casefold() in generated_sql From 171da6cb33165b49d46ea6528038342abd89e9fa Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:35:05 -0700 Subject: [PATCH 04/15] docs: add streaming html docs (#884) * docs: add streaming html docs * add beta icon * add beta icon in toc.yml --- bigframes/session/__init__.py | 4 +++- bigframes/streaming/dataframe.py | 11 ++++++++++- docs/reference/bigframes.streaming/dataframe.rst | 6 ++++++ docs/reference/bigframes.streaming/index.rst | 13 +++++++++++++ docs/reference/index.rst | 1 + docs/templates/toc.yml | 7 +++++++ 6 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 docs/reference/bigframes.streaming/dataframe.rst create mode 100644 docs/reference/bigframes.streaming/index.rst diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 8ff5862bfc..2da788292b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -760,7 +760,9 @@ def read_gbq_table_streaming( ) -> streaming_dataframe.StreamingDataFrame: """Turn a BigQuery table into a StreamingDataFrame. - Note: The bigframes.streaming module is a preview feature, and subject to change. + .. note:: + + The bigframes.streaming module is a preview feature, and subject to change. **Examples:** diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py index 64a4898c57..b83ae5d822 100644 --- a/bigframes/streaming/dataframe.py +++ b/bigframes/streaming/dataframe.py @@ -191,7 +191,16 @@ def to_pubsub( @log_adapter.class_logger class StreamingDataFrame(StreamingBase): - __doc__ = _curate_df_doc(dataframe.DataFrame.__doc__) + __doc__ = ( + _curate_df_doc(dataframe.DataFrame.__doc__) + + """ + .. note:: + + The bigframes.streaming module is a preview feature, and subject to change. + + Currently only supports basic projection, filtering and preview operations. + """ + ) # Private constructor _create_key = object() diff --git a/docs/reference/bigframes.streaming/dataframe.rst b/docs/reference/bigframes.streaming/dataframe.rst new file mode 100644 index 0000000000..79ec64961c --- /dev/null +++ b/docs/reference/bigframes.streaming/dataframe.rst @@ -0,0 +1,6 @@ +bigframes.streaming.dataframe +============================= + +.. autoclass:: bigframes.streaming.dataframe.StreamingDataFrame + :members: + :inherited-members: diff --git a/docs/reference/bigframes.streaming/index.rst b/docs/reference/bigframes.streaming/index.rst new file mode 100644 index 0000000000..20a22072e5 --- /dev/null +++ b/docs/reference/bigframes.streaming/index.rst @@ -0,0 +1,13 @@ + +============================ +BigQuery DataFrame Streaming +============================ + +.. automodule:: bigframes.streaming + :members: + :undoc-members: + +.. toctree:: + :maxdepth: 2 + + dataframe diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 387e9b5ced..eb5a774b29 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -11,3 +11,4 @@ packages. 
bigframes.pandas/index bigframes.ml/index bigframes.bigquery/index + bigframes.streaming/index diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 95bded9a60..736ffba286 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -203,4 +203,11 @@ - name: BigQuery built-in functions uid: bigframes.bigquery name: bigframes.bigquery + - items: + - name: Overview + uid: bigframes.streaming + - name: StreamingDataFrame + uid: bigframes.streaming.dataframe.StreamingDataFrame + name: bigframes.streaming + status: beta name: BigQuery DataFrames From 8c352ce6991b3635222a285d1eee9a56cd57d0c6 Mon Sep 17 00:00:00 2001 From: mattyopl <90574735+mattyopl@users.noreply.github.com> Date: Tue, 6 Aug 2024 20:31:21 -0400 Subject: [PATCH 05/15] chore: clean up OWNERS (#886) - remove inactive users - add myself Co-authored-by: Matthew Laurence Chen --- OWNERS | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/OWNERS b/OWNERS index f86ad551ef..562ee0f19b 100644 --- a/OWNERS +++ b/OWNERS @@ -1,12 +1,8 @@ -ashleyxu@google.com -bmil@google.com chelsealin@google.com garrettwu@google.com -henryjsolberg@google.com -hormati@google.com huanc@google.com jiaxun@google.com -kemppeterson@google.com +mlaurencechen@google.com shobs@google.com swast@google.com -tbergeron@google.com +tbergeron@google.com \ No newline at end of file From 3dbf84bd1531c1f8d41ba57c2c38b3ba6abfb812 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:03:50 -0700 Subject: [PATCH 06/15] feat: bigframes.bigquery.json_extract (#868) * feat: bigframes.bigquery.json_extract * fixing tests --- bigframes/bigquery/__init__.py | 35 ++++++++++++++++++++ bigframes/core/compile/scalar_op_compiler.py | 12 +++++++ bigframes/operations/__init__.py | 16 +++++++++ tests/system/small/bigquery/test_json.py | 27 +++++++++++++++ 4 files changed, 90 insertions(+) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index ec26d14f33..7c409839b1 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -208,6 +208,41 @@ def json_set( return series +def json_extract( + series: series.Series, + json_path: str, +) -> series.Series: + """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` + value. This function uses single quotes and brackets to escape invalid JSONPath + characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) + >>> bbq.json_extract(s, json_path="$.class") + 0 "{\\\"students\\\":[{\\\"id\\\":5},{\\\"id\\\":12}]}" + dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. 
+ """ + return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) + + +# Search functions defined from +# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions + + def vector_search( base_table: str, column_to_search: str, diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 67d0dac436..32749b32a6 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -922,6 +922,11 @@ def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): ).to_expr() +@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True) +def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract): + return json_extract(json_obj=x, json_path=op.json_path) + + ### Binary Ops def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" @@ -1549,6 +1554,13 @@ def json_set( """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" +@ibis.udf.scalar.builtin(name="json_extract") +def json_extract( + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str +) -> ibis_dtypes.JSON: + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + + @ibis.udf.scalar.builtin(name="ML.DISTANCE") def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 523882c14e..4d4e40643d 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -602,6 +602,22 @@ def output_type(self, *input_types): return dtypes.STRING_DTYPE +## JSON Ops +@dataclasses.dataclass(frozen=True) +class JSONExtract(UnaryOp): + name: typing.ClassVar[str] = "json_extract" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be an valid JSON object or JSON-formatted string type." + + f" Received type: {input_type}" + ) + return input_type + + # Binary Ops fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 9e0c06e0bd..059b8eea87 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -110,3 +110,30 @@ def test_json_set_w_invalid_value_type(): def test_json_set_w_invalid_series_type(): with pytest.raises(TypeError): bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)]) + + +def test_json_extract_from_json(): + s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) + actual = bbq.json_extract(s, "$.a.b") + # After the introduction of the JSON type, the output should be a JSON-formatted series. 
+ expected = _get_series_from_json(["[1,2]", None, "0"]) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_extract_from_string(): + s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']) + actual = bbq.json_extract(s, "$.a.b") + expected = _get_series_from_json(["[1,2]", None, "0"]) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + check_names=False, + ) + + +def test_json_extract_w_invalid_series_type(): + with pytest.raises(TypeError): + bbq.json_extract(bpd.Series([1, 2]), "$.a") From 0603e62fcdf513ca4207909af4dbd8d036af0b0c Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:38:58 -0700 Subject: [PATCH 07/15] chore: fix docs build on owlbot.py (#888) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: fix docs build on owlbot.py * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- .kokoro/publish-docs.sh | 3 +++ owlbot.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.kokoro/publish-docs.sh b/.kokoro/publish-docs.sh index 233205d580..297b14ac90 100755 --- a/.kokoro/publish-docs.sh +++ b/.kokoro/publish-docs.sh @@ -58,5 +58,8 @@ python3.10 -m docuploader create-metadata \ cat docs.metadata +# Replace toc.yml template file +mv docs/templates/toc.yml docs/_build/html/docfx_yaml/toc.yml + # upload docs python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}" diff --git a/owlbot.py b/owlbot.py index ddc578c3a2..f9d9410d6d 100644 --- a/owlbot.py +++ b/owlbot.py @@ -112,14 +112,14 @@ re.escape("# upload docs") + "\n" + re.escape( - 'python3 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' + 'python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' ) ), ( "# Replace toc.yml template file\n" + "mv docs/templates/toc.yml docs/_build/html/docfx_yaml/toc.yml\n\n" + "# upload docs\n" - + 'python3 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' + + 'python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' ), ) From 3eb6a17a5823faf5ecba92cb9a554df74477871d Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 8 Aug 2024 12:03:30 -0700 Subject: [PATCH 08/15] docs: fix the `DisplayOptions` doc rendering (#893) --- third_party/bigframes_vendored/pandas/core/config_init.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 84ab90a322..4bca3f3c75 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -18,6 +18,7 @@ **Examples:** Define Repr mode to "deferred" will prevent job execution in repr. + >>> import bigframes.pandas as bpd >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") @@ -26,25 +27,32 @@ Computation deferred. 
Computation will process 28.9 kB Users can also get a dry run of the job by accessing the query_job property before they've run the job. This will return a dry run instance of the job they can inspect. + >>> df.query_job.total_bytes_processed 28947 User can execute the job by calling .to_pandas() + >>> # df.to_pandas() Reset repr_mode option + >>> bpd.options.display.repr_mode = "head" Can also set the progress_bar option to see the progress bar in terminal, + >>> bpd.options.display.progress_bar = "terminal" notebook, + >>> bpd.options.display.progress_bar = "notebook" or just remove it. + >>> bpd.options.display.progress_bar = None Setting to default value "auto" will detect and show progress bar automatically. + >>> bpd.options.display.progress_bar = "auto" Attributes: From 0c011a8212a8c0824a12c560b6d63048362275e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 8 Aug 2024 14:28:26 -0500 Subject: [PATCH 09/15] test: enable tests on Windows (#857) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: enable tests on Windows * use windows subdirectory for scripts * maybe fix for windows * fix working directory * use nox so test dependencies are installed * add 3.10 to system tests * disable system tests * add more goto errors * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- .kokoro/continuous/windows.cfg | 3 +++ .kokoro/presubmit/windows.cfg | 3 +++ noxfile.py | 6 +++-- scripts/windows/build.bat | 38 ++++++++++++++++++++++++++++++++ scripts/windows/test.bat | 40 ++++++++++++++++++++++++++++++++++ 5 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 .kokoro/continuous/windows.cfg create mode 100644 .kokoro/presubmit/windows.cfg create mode 100644 scripts/windows/build.bat create mode 100644 scripts/windows/test.bat diff --git a/.kokoro/continuous/windows.cfg b/.kokoro/continuous/windows.cfg new file mode 100644 index 0000000000..806986138d --- /dev/null +++ b/.kokoro/continuous/windows.cfg @@ -0,0 +1,3 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +build_file: "python-bigquery-dataframes/scripts/windows/build.bat" diff --git a/.kokoro/presubmit/windows.cfg b/.kokoro/presubmit/windows.cfg new file mode 100644 index 0000000000..806986138d --- /dev/null +++ b/.kokoro/presubmit/windows.cfg @@ -0,0 +1,3 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +build_file: "python-bigquery-dataframes/scripts/windows/build.bat" diff --git a/noxfile.py b/noxfile.py index d69c16e69c..c464b47270 100644 --- a/noxfile.py +++ b/noxfile.py @@ -62,7 +62,8 @@ UNIT_TEST_EXTRAS: List[str] = [] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} -SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.12"] +# 3.10 is needed for Windows tests. +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.12"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", @@ -100,7 +101,8 @@ "docfx", "unit", "unit_noextras", - "system", + "system-3.9", + "system-3.12", "cover", ] diff --git a/scripts/windows/build.bat b/scripts/windows/build.bat new file mode 100644 index 0000000000..d599702c98 --- /dev/null +++ b/scripts/windows/build.bat @@ -0,0 +1,38 @@ +@rem Copyright 2024 Google LLC +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. 
+@rem You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. + +:; Change directory to repo root. +SET script_dir="%~dp0" +cd "%~dp0"\..\.. + +echo "Listing available Python versions' +py -0 || goto :error + +py -3.10 -m pip install --upgrade pip || goto :error +py -3.10 -m pip install --upgrade pip setuptools wheel || goto :error + +echo "Building Wheel" +py -3.10 -m pip wheel . --wheel-dir wheels || goto :error/ + +echo "Built wheel, now running tests." +call "%script_dir%"/test.bat 3.10 || goto :error + +echo "Windows build has completed successfully" + +:; https://stackoverflow.com/a/46813196/101923 +:; exit 0 +exit /b 0 + +:error +exit /b %errorlevel% diff --git a/scripts/windows/test.bat b/scripts/windows/test.bat new file mode 100644 index 0000000000..bcd605bd12 --- /dev/null +++ b/scripts/windows/test.bat @@ -0,0 +1,40 @@ +@rem Copyright 2024 Google LLC +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. + +@rem This test file runs for one Python version at a time, and is intended to +@rem be called from within the build loop. + +:; Change directory to repo root. +SET script_dir="%~dp0" +cd "%~dp0"\..\.. 
+ +set PYTHON_VERSION=%1 +if "%PYTHON_VERSION%"=="" ( + echo "Python version was not provided, using Python 3.10" + set PYTHON_VERSION=3.10 +) + +py -%PYTHON_VERSION%-64 -m pip install nox || goto :error + +py -%PYTHON_VERSION%-64 -m nox -s unit-"%PYTHON_VERSION%" || goto :error + +:; TODO(b/358148440): enable system tests on windows +:; py -%PYTHON_VERSION%-64 -m nox -s system-"%PYTHON_VERSION%" || goto :error + +:; https://stackoverflow.com/a/46813196/101923 +:; exit 0 +exit /b 0 + +:error +exit /b %errorlevel% From 7117e33f2b6bc89ae8ce9b168d98bbcb21c08e52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 8 Aug 2024 16:31:23 -0500 Subject: [PATCH 10/15] chore: require Windows unit tests and re-enable owlbot check (#895) --- .github/sync-repo-settings.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml index e098468da6..c2f3673fcc 100644 --- a/.github/sync-repo-settings.yaml +++ b/.github/sync-repo-settings.yaml @@ -7,8 +7,7 @@ branchProtectionRules: requiresCodeOwnerReviews: true requiresStrictStatusChecks: false requiredStatusCheckContexts: -# TODO(b/347075426): Restore owlbot as required check -# - 'OwlBot Post Processor' + - 'OwlBot Post Processor' - 'conventionalcommits.org' - 'cla/google' - 'docs' @@ -19,6 +18,7 @@ branchProtectionRules: - 'unit (3.12)' - 'cover' - 'Kokoro presubmit' + - 'Kokoro windows' permissionRules: - team: actools-python permission: admin From e0b11bc8c038db7b950b1653ed4cd44a6246c713 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 9 Aug 2024 16:00:36 -0700 Subject: [PATCH 11/15] perf: Speed up compilation by reducing redundant type normalization (#896) --- bigframes/bigquery/__init__.py | 2 +- bigframes/core/compile/compiled.py | 23 +++-------------------- bigframes/core/compile/ibis_types.py | 1 + tests/system/small/bigquery/test_json.py | 20 +++++++++++--------- 4 files changed, 16 insertions(+), 30 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 7c409839b1..fb9503dc72 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -224,7 +224,7 @@ def json_extract( >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_extract(s, json_path="$.class") - 0 "{\\\"students\\\":[{\\\"id\\\":5},{\\\"id\\\":12}]}" + 0 {"students":[{"id":5},{"id":12}]} dtype: string Args: diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index cae527931c..5492502f21 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -152,12 +152,7 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: raise ValueError( "Column name {} not in set of values: {}".format(key, self.column_ids) ) - return typing.cast( - ibis_types.Value, - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - self._column_names[key] - ), - ) + return typing.cast(ibis_types.Value, self._column_names[key]) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( @@ -327,12 +322,7 @@ def _to_ibis_expr( if not columns: return ibis.memtable([]) - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. 
- table = self._table.select( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type(column) - for column in columns - ) + table = self._table.select(columns) base_table = table if self._reduced_predicate is not None: table = table.filter(base_table[PREDICATE_COLUMN]) @@ -1039,14 +1029,7 @@ def _to_ibis_expr( # Make sure we don't have any unbound (deferred) columns. table = self._table.select(columns) - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. - table = table.select( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - table[column] - ) - for column in table.columns - ) + table = table.select(table[column] for column in table.columns) base_table = table if self._reduced_predicate is not None: table = table.filter(base_table[PREDICATE_COLUMN]) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index f3221f605f..0b3038c9c7 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -208,6 +208,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: name = value.get_name() if ibis_type.is_json(): value = vendored_ibis_ops.ToJsonString(value).to_expr() + value = value.case().when("null", ibis.null()).else_(value).end() return value.name(name) # Allow REQUIRED fields to be joined with NULLABLE fields. nullable_type = ibis_type.copy(nullable=True) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 059b8eea87..18ccadd9f5 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -23,11 +23,13 @@ def _get_series_from_json(json_data): + # Note: converts None to sql "null" and not to json none. + values = [ + f"JSON '{json.dumps(data)}'" if data is not None else "NULL" + for data in json_data + ] sql = " UNION ALL ".join( - [ - f"SELECT {id} AS id, JSON '{json.dumps(data)}' AS data" - for id, data in enumerate(json_data) - ] + [f"SELECT {id} AS id, {value} AS data" for id, value in enumerate(values)] ) df = bpd.read_gbq(sql).set_index("id").sort_index() return df["data"] @@ -114,19 +116,19 @@ def test_json_set_w_invalid_series_type(): def test_json_extract_from_json(): s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) - actual = bbq.json_extract(s, "$.a.b") + actual = bbq.json_extract(s, "$.a.b").to_pandas() # After the introduction of the JSON type, the output should be a JSON-formatted series. 
- expected = _get_series_from_json(["[1,2]", None, "0"]) + expected = _get_series_from_json([[1, 2], None, 0]).to_pandas() pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual, + expected, ) def test_json_extract_from_string(): s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']) actual = bbq.json_extract(s, "$.a.b") - expected = _get_series_from_json(["[1,2]", None, "0"]) + expected = _get_series_from_json([[1, 2], None, 0]) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), From 991bb0a25e1e424de38abd065f9d79ab20c24ed2 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 12 Aug 2024 14:32:53 -0700 Subject: [PATCH 12/15] refactor: reorganize `remote_function` code for readability (#885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: reorganize `remote_function` code for readability * refactor out RemoteFunctionSession * rename newly introduced modules to make them private symbolically --------- Co-authored-by: Tim Sweña (Swast) --- .../functions/_remote_function_client.py | 476 +++++++ .../functions/_remote_function_session.py | 546 ++++++++ bigframes/functions/_utils.py | 214 +++ bigframes/functions/remote_function.py | 1174 +---------------- bigframes/pandas/__init__.py | 4 +- bigframes/session/__init__.py | 3 +- tests/system/large/test_remote_function.py | 10 +- tests/system/small/test_remote_function.py | 7 +- tests/system/utils.py | 4 +- 9 files changed, 1261 insertions(+), 1177 deletions(-) create mode 100644 bigframes/functions/_remote_function_client.py create mode 100644 bigframes/functions/_remote_function_session.py create mode 100644 bigframes/functions/_utils.py diff --git a/bigframes/functions/_remote_function_client.py b/bigframes/functions/_remote_function_client.py new file mode 100644 index 0000000000..6ef482ecda --- /dev/null +++ b/bigframes/functions/_remote_function_client.py @@ -0,0 +1,476 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import inspect +import logging +import os +import random +import shutil +import string +import sys +import tempfile +from typing import cast, Tuple, TYPE_CHECKING + +import requests + +from bigframes import constants +import bigframes.functions.remote_function_template + +if TYPE_CHECKING: + from bigframes.session import Session + +import google.api_core.exceptions +import google.api_core.retry +from google.cloud import bigquery, functions_v2 + +from . 
import _utils + +logger = logging.getLogger(__name__) + + +class RemoteFunctionClient: + # Wait time (in seconds) for an IAM binding to take effect after creation + _iam_wait_seconds = 120 + + def __init__( + self, + gcp_project_id, + cloud_function_region, + cloud_functions_client, + bq_location, + bq_dataset, + bq_client, + bq_connection_id, + bq_connection_manager, + cloud_function_service_account, + cloud_function_kms_key_name, + cloud_function_docker_repository, + *, + session: Session, + ): + self._gcp_project_id = gcp_project_id + self._cloud_function_region = cloud_function_region + self._cloud_functions_client = cloud_functions_client + self._bq_location = bq_location + self._bq_dataset = bq_dataset + self._bq_client = bq_client + self._bq_connection_id = bq_connection_id + self._bq_connection_manager = bq_connection_manager + self._cloud_function_service_account = cloud_function_service_account + self._cloud_function_kms_key_name = cloud_function_kms_key_name + self._cloud_function_docker_repository = cloud_function_docker_repository + self._session = session + + def create_bq_remote_function( + self, + input_args, + input_types, + output_type, + endpoint, + bq_function_name, + max_batching_rows, + ): + """Create a BigQuery remote function given the artifacts of a user defined + function and the http endpoint of a corresponding cloud function.""" + if self._bq_connection_manager: + self._bq_connection_manager.create_bq_connection( + self._gcp_project_id, + self._bq_location, + self._bq_connection_id, + "run.invoker", + ) + + # Create BQ function + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 + bq_function_args = [] + bq_function_return_type = output_type + + # We are expecting the input type annotations to be 1:1 with the input args + for name, type_ in zip(input_args, input_types): + bq_function_args.append(f"{name} {type_}") + + remote_function_options = { + "endpoint": endpoint, + "max_batching_rows": max_batching_rows, + } + + remote_function_options_str = ", ".join( + [ + f'{key}="{val}"' if isinstance(val, str) else f"{key}={val}" + for key, val in remote_function_options.items() + if val is not None + ] + ) + + create_function_ddl = f""" + CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) + RETURNS {bq_function_return_type} + REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` + OPTIONS ({remote_function_options_str})""" + + logger.info(f"Creating BQ remote function: {create_function_ddl}") + + # Make sure the dataset exists. I.e. if it doesn't exist, go ahead and + # create it + dataset = bigquery.Dataset( + bigquery.DatasetReference.from_string( + self._bq_dataset, default_project=self._gcp_project_id + ) + ) + dataset.location = self._bq_location + try: + # This check does not require bigquery.datasets.create IAM + # permission. So, if the data set already exists, then user can work + # without having that permission. + self._bq_client.get_dataset(dataset) + except google.api_core.exceptions.NotFound: + # This requires bigquery.datasets.create IAM permission + self._bq_client.create_dataset(dataset, exists_ok=True) + + # TODO(swast): plumb through the original, user-facing api_name. 
+ _, query_job = self._session._start_query(create_function_ddl) + logger.info(f"Created remote function {query_job.ddl_target_routine}") + + def get_cloud_function_fully_qualified_parent(self): + "Get the fully qualilfied parent for a cloud function." + return self._cloud_functions_client.common_location_path( + self._gcp_project_id, self._cloud_function_region + ) + + def get_cloud_function_fully_qualified_name(self, name): + "Get the fully qualilfied name for a cloud function." + return self._cloud_functions_client.function_path( + self._gcp_project_id, self._cloud_function_region, name + ) + + def get_remote_function_fully_qualilfied_name(self, name): + "Get the fully qualilfied name for a BQ remote function." + return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" + + def get_cloud_function_endpoint(self, name): + """Get the http endpoint of a cloud function if it exists.""" + fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) + try: + response = self._cloud_functions_client.get_function( + name=fully_qualified_name + ) + return response.service_config.uri + except google.api_core.exceptions.NotFound: + pass + return None + + def generate_cloud_function_code( + self, + def_, + directory, + *, + input_types: Tuple[str], + output_type: str, + package_requirements=None, + is_row_processor=False, + ): + """Generate the cloud function code for a given user defined function. + + Args: + input_types (tuple[str]): + Types of the input arguments in BigQuery SQL data type names. + output_type (str): + Types of the output scalar as a BigQuery SQL data type name. + """ + + # requirements.txt + if package_requirements: + requirements_txt = os.path.join(directory, "requirements.txt") + with open(requirements_txt, "w") as f: + f.write("\n".join(package_requirements)) + + # main.py + entry_point = bigframes.functions.remote_function_template.generate_cloud_function_main_code( + def_, + directory, + input_types=input_types, + output_type=output_type, + is_row_processor=is_row_processor, + ) + return entry_point + + def create_cloud_function( + self, + def_, + cf_name, + *, + input_types: Tuple[str], + output_type: str, + package_requirements=None, + timeout_seconds=600, + max_instance_count=None, + is_row_processor=False, + vpc_connector=None, + memory_mib=1024, + ): + """Create a cloud function from the given user defined function. + + Args: + input_types (tuple[str]): + Types of the input arguments in BigQuery SQL data type names. + output_type (str): + Types of the output scalar as a BigQuery SQL data type name. + """ + + # Build and deploy folder structure containing cloud function + with tempfile.TemporaryDirectory() as directory: + entry_point = self.generate_cloud_function_code( + def_, + directory, + package_requirements=package_requirements, + input_types=input_types, + output_type=output_type, + is_row_processor=is_row_processor, + ) + archive_path = shutil.make_archive(directory, "zip", directory) + + # We are creating cloud function source code from the currently running + # python version. Use the same version to deploy. This is necessary + # because cloudpickle serialization done in one python version and + # deserialization done in another python version doesn't work. + # TODO(shobs): Figure out how to achieve version compatibility, specially + # when pickle (internally used by cloudpickle) guarantees that: + # https://docs.python.org/3/library/pickle.html#:~:text=The%20pickle%20serialization%20format%20is,unique%20breaking%20change%20language%20boundary. 
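As an aside, the source archive and runtime identifier used by the deployment steps below can be reproduced in isolation; the temporary directory here is just a throwaway stand-in for the generated function source, not the real layout.

import os
import shutil
import sys
import tempfile

directory = tempfile.mkdtemp()  # stand-in for the generated source directory
with open(os.path.join(directory, "main.py"), "w") as f:
    f.write("def process(request):\n    return 'ok'\n")

# Zip the directory, as done with shutil.make_archive above.
archive_path = shutil.make_archive(directory, "zip", directory)

# Deploy on the same interpreter version that produced the cloudpickle payload.
python_version = "python{}{}".format(sys.version_info.major, sys.version_info.minor)
print(archive_path, python_version)  # e.g. /tmp/tmpabcd1234.zip python311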
+ python_version = "python{}{}".format( + sys.version_info.major, sys.version_info.minor + ) + + # Determine an upload URL for user code + upload_url_request = functions_v2.GenerateUploadUrlRequest( + kms_key_name=self._cloud_function_kms_key_name + ) + upload_url_request.parent = self.get_cloud_function_fully_qualified_parent() + upload_url_response = self._cloud_functions_client.generate_upload_url( + request=upload_url_request + ) + + # Upload the code to GCS + with open(archive_path, "rb") as f: + response = requests.put( + upload_url_response.upload_url, + data=f, + headers={"content-type": "application/zip"}, + ) + if response.status_code != 200: + raise RuntimeError( + "Failed to upload user code. code={}, reason={}, text={}".format( + response.status_code, response.reason, response.text + ) + ) + + # Deploy Cloud Function + create_function_request = functions_v2.CreateFunctionRequest() + create_function_request.parent = ( + self.get_cloud_function_fully_qualified_parent() + ) + create_function_request.function_id = cf_name + function = functions_v2.Function() + function.name = self.get_cloud_function_fully_qualified_name(cf_name) + function.build_config = functions_v2.BuildConfig() + function.build_config.runtime = python_version + function.build_config.entry_point = entry_point + function.build_config.source = functions_v2.Source() + function.build_config.source.storage_source = functions_v2.StorageSource() + function.build_config.source.storage_source.bucket = ( + upload_url_response.storage_source.bucket + ) + function.build_config.source.storage_source.object_ = ( + upload_url_response.storage_source.object_ + ) + function.build_config.docker_repository = ( + self._cloud_function_docker_repository + ) + function.service_config = functions_v2.ServiceConfig() + if memory_mib is not None: + function.service_config.available_memory = f"{memory_mib}Mi" + if timeout_seconds is not None: + if timeout_seconds > 1200: + raise ValueError( + "BigQuery remote function can wait only up to 20 minutes" + ", see for more details " + "https://cloud.google.com/bigquery/quotas#remote_function_limits." + ) + function.service_config.timeout_seconds = timeout_seconds + if max_instance_count is not None: + function.service_config.max_instance_count = max_instance_count + if vpc_connector is not None: + function.service_config.vpc_connector = vpc_connector + function.service_config.service_account_email = ( + self._cloud_function_service_account + ) + function.kms_key_name = self._cloud_function_kms_key_name + create_function_request.function = function + + # Create the cloud function and wait for it to be ready to use + try: + operation = self._cloud_functions_client.create_function( + request=create_function_request + ) + operation.result() + + # Cleanup + os.remove(archive_path) + except google.api_core.exceptions.AlreadyExists: + # If a cloud function with the same name already exists, let's + # update it + update_function_request = functions_v2.UpdateFunctionRequest() + update_function_request.function = function + operation = self._cloud_functions_client.update_function( + request=update_function_request + ) + operation.result() + + # Fetch the endpoint of the just created function + endpoint = self.get_cloud_function_endpoint(cf_name) + if not endpoint: + raise ValueError( + f"Couldn't fetch the http endpoint. 
{constants.FEEDBACK_LINK}" + ) + + logger.info( + f"Successfully created cloud function {cf_name} with uri ({endpoint})" + ) + return endpoint + + def provision_bq_remote_function( + self, + def_, + input_types, + output_type, + reuse, + name, + package_requirements, + max_batching_rows, + cloud_function_timeout, + cloud_function_max_instance_count, + is_row_processor, + cloud_function_vpc_connector, + cloud_function_memory_mib, + ): + """Provision a BigQuery remote function.""" + # Augment user package requirements with any internal package + # requirements + package_requirements = _utils._get_updated_package_requirements( + package_requirements, is_row_processor + ) + + # Compute a unique hash representing the user code + function_hash = _utils._get_hash(def_, package_requirements) + + # If reuse of any existing function with the same name (indicated by the + # same hash of its source code) is not intended, then attach a unique + # suffix to the intended function name to make it unique. + uniq_suffix = None + if not reuse: + # use 4 digits as a unique suffix which should suffice for + # uniqueness per session + uniq_suffix = "".join( + random.choices(string.ascii_lowercase + string.digits, k=4) + ) + + # Derive the name of the cloud function underlying the intended BQ + # remote function. Use the session id to identify the GCF for unnamed + # functions. The named remote functions are treated as a persistant + # artifacts, so let's keep them independent of session id, which also + # makes their naming more stable for the same udf code + session_id = None if name else self._session.session_id + cloud_function_name = _utils.get_cloud_function_name( + function_hash, session_id, uniq_suffix + ) + cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) + + # Create the cloud function if it does not exist + if not cf_endpoint: + cf_endpoint = self.create_cloud_function( + def_, + cloud_function_name, + input_types=input_types, + output_type=output_type, + package_requirements=package_requirements, + timeout_seconds=cloud_function_timeout, + max_instance_count=cloud_function_max_instance_count, + is_row_processor=is_row_processor, + vpc_connector=cloud_function_vpc_connector, + memory_mib=cloud_function_memory_mib, + ) + else: + logger.info(f"Cloud function {cloud_function_name} already exists.") + + # Derive the name of the remote function + remote_function_name = name + if not remote_function_name: + remote_function_name = _utils.get_remote_function_name( + function_hash, self._session.session_id, uniq_suffix + ) + rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) + + # Create the BQ remote function in following circumstances: + # 1. It does not exist + # 2. It exists but the existing remote function has different + # configuration than intended + created_new = False + if not rf_endpoint or ( + rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id + ): + input_args = inspect.getargs(def_.__code__).args + if len(input_args) != len(input_types): + raise ValueError( + "Exactly one type should be provided for every input arg." 
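To make the naming scheme in provision_bq_remote_function concrete, here is a hedged sketch of how the cloud function and BQ remote function names are composed for an unnamed function on the reuse=False path; the session id and function hash are placeholders, and the "-" / "_" separators correspond to the _GCF_FUNCTION_NAME_SEPERATOR and _BQ_FUNCTION_NAME_SEPERATOR constants introduced in bigframes/functions/_utils.py later in this patch.

import random
import string

function_hash = "0123456789abcdef0123456789abcdef"  # placeholder for _utils._get_hash(func, packages)
session_id = "session1234"                           # placeholder BigQuery DataFrames session id

# reuse=False attaches a 4-character suffix so redeployments do not collide.
uniq_suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=4))

cloud_function_name = "-".join(["bigframes", session_id, function_hash, uniq_suffix])
remote_function_name = "_".join(["bigframes", session_id, function_hash, uniq_suffix])
print(cloud_function_name)   # e.g. bigframes-session1234-0123...-ab1z
print(remote_function_name)  # e.g. bigframes_session1234_0123..._ab1z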
+ ) + self.create_bq_remote_function( + input_args, + input_types, + output_type, + cf_endpoint, + remote_function_name, + max_batching_rows, + ) + + created_new = True + else: + logger.info(f"Remote function {remote_function_name} already exists.") + + return remote_function_name, cloud_function_name, created_new + + def get_remote_function_specs(self, remote_function_name): + """Check whether a remote function already exists for the udf.""" + http_endpoint = None + bq_connection = None + routines = self._bq_client.list_routines( + f"{self._gcp_project_id}.{self._bq_dataset}" + ) + try: + for routine in routines: + routine = cast(bigquery.Routine, routine) + if routine.reference.routine_id == remote_function_name: + rf_options = routine.remote_function_options + if rf_options: + http_endpoint = rf_options.endpoint + bq_connection = rf_options.connection + if bq_connection: + bq_connection = os.path.basename(bq_connection) + break + except google.api_core.exceptions.NotFound: + # The dataset might not exist, in which case the http_endpoint doesn't, either. + # Note: list_routines doesn't make an API request until we iterate on the response object. + pass + return (http_endpoint, bq_connection) diff --git a/bigframes/functions/_remote_function_session.py b/bigframes/functions/_remote_function_session.py new file mode 100644 index 0000000000..0ab19ca353 --- /dev/null +++ b/bigframes/functions/_remote_function_session.py @@ -0,0 +1,546 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import collections.abc +import inspect +import sys +import threading +from typing import Any, cast, Dict, Mapping, Optional, Sequence, TYPE_CHECKING, Union +import warnings + +import google.api_core.exceptions +from google.cloud import ( + bigquery, + bigquery_connection_v1, + functions_v2, + resourcemanager_v3, +) + +from bigframes import clients, constants + +if TYPE_CHECKING: + from bigframes.session import Session + +import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes +import ibis +import pandas + +from . import _remote_function_client as rf_client +from . 
import _utils + + +class RemoteFunctionSession: + """Session to manage remote functions.""" + + def __init__(self): + # Session level mapping of remote function artifacts + self._temp_artifacts: Dict[str, str] = dict() + + # Lock to synchronize the update of the session artifacts + self._artifacts_lock = threading.Lock() + + def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): + """Update remote function artifacts in the current session.""" + with self._artifacts_lock: + self._temp_artifacts[bqrf_routine] = gcf_path + + def clean_up( + self, + bqclient: bigquery.Client, + gcfclient: functions_v2.FunctionServiceClient, + session_id: str, + ): + """Delete remote function artifacts in the current session.""" + with self._artifacts_lock: + for bqrf_routine, gcf_path in self._temp_artifacts.items(): + # Let's accept the possibility that the remote function may have + # been deleted directly by the user + bqclient.delete_routine(bqrf_routine, not_found_ok=True) + + # Let's accept the possibility that the cloud function may have + # been deleted directly by the user + try: + gcfclient.delete_function(name=gcf_path) + except google.api_core.exceptions.NotFound: + pass + + self._temp_artifacts.clear() + + # Inspired by @udf decorator implemented in ibis-bigquery package + # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py + # which has moved as @js to the ibis package + # https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py + def remote_function( + self, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, + session: Optional[Session] = None, + bigquery_client: Optional[bigquery.Client] = None, + bigquery_connection_client: Optional[ + bigquery_connection_v1.ConnectionServiceClient + ] = None, + cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, + resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, + dataset: Optional[str] = None, + bigquery_connection: Optional[str] = None, + reuse: bool = True, + name: Optional[str] = None, + packages: Optional[Sequence[str]] = None, + cloud_function_service_account: Optional[str] = None, + cloud_function_kms_key_name: Optional[str] = None, + cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, + cloud_function_timeout: Optional[int] = 600, + cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, + cloud_function_memory_mib: Optional[int] = 1024, + ): + """Decorator to turn a user defined function into a BigQuery remote function. + + .. deprecated:: 0.0.1 + This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. + + .. note:: + Please make sure following is setup before using this API: + + 1. 
Have the below APIs enabled for your project: + + * BigQuery Connection API + * Cloud Functions API + * Cloud Run API + * Cloud Build API + * Artifact Registry API + * Cloud Resource Manager API + + This can be done from the cloud console (change `PROJECT_ID` to yours): + https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID + + Or from the gcloud CLI: + + `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` + + 2. Have following IAM roles enabled for you: + + * BigQuery Data Editor (roles/bigquery.dataEditor) + * BigQuery Connection Admin (roles/bigquery.connectionAdmin) + * Cloud Functions Developer (roles/cloudfunctions.developer) + * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` + * Storage Object Viewer (roles/storage.objectViewer) + * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) + + 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: + + 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection + 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function + + Alternatively, the IAM could also be setup via the gcloud CLI: + + `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. + + Args: + input_types (None, type, or sequence(type)): + For scalar user defined function it should be the input type or + sequence of input types. For row processing user defined function, + type `Series` should be specified. + output_type (Optional[type]): + Data type of the output in the user defined function. + session (bigframes.Session, Optional): + BigQuery DataFrames session to use for getting default project, + dataset and BigQuery connection. + bigquery_client (google.cloud.bigquery.Client, Optional): + Client to use for BigQuery operations. If this param is not provided + then bigquery client from the session would be used. + bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): + Client to use for BigQuery connection operations. If this param is + not provided then bigquery connection client from the session would + be used. + cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): + Client to use for cloud functions operations. If this param is not + provided then the functions client from the session would be used. + resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): + Client to use for cloud resource management operations, e.g. for + getting and setting IAM roles on cloud resources. If this param is + not provided then resource manager client from the session would be + used. + dataset (str, Optional.): + Dataset in which to create a BigQuery remote function. It should be in + `.` or `` format. 
If this + parameter is not provided then session dataset id is used. + bigquery_connection (str, Optional): + Name of the BigQuery connection in the form of `CONNECTION_ID` or + `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. + If this param is not provided then the bigquery connection from the session + would be used. If it is pre created in the same location as the + `bigquery_client.location` then it would be used, otherwise it is created + dynamically using the `bigquery_connection_client` assuming the user has necessary + priviliges. The PROJECT_ID should be the same as the BigQuery connection project. + reuse (bool, Optional): + Reuse the remote function if already exists. + `True` by default, which will result in reusing an existing remote + function and corresponding cloud function that was previously + created (if any) for the same udf. + Please note that for an unnamed (i.e. created without an explicit + `name` argument) remote function, the BigQuery DataFrames + session id is attached in the cloud artifacts names. So for the + effective reuse across the sessions it is recommended to create + the remote function with an explicit `name`. + Setting it to `False` would force creating a unique remote function. + If the required remote function does not exist then it would be + created irrespective of this param. + name (str, Optional): + Explicit name of the persisted BigQuery remote function. Use it with + caution, because two users working in the same project and dataset + could overwrite each other's remote functions if they use the same + persistent name. When an explicit name is provided, any session + specific clean up (``bigframes.session.Session.close``/ + ``bigframes.pandas.close_session``/ + ``bigframes.pandas.reset_session``/ + ``bigframes.pandas.clean_up_by_session_id``) does not clean up + the function, and leaves it for the user to manage the function + and the associated cloud function directly. + packages (str[], Optional): + Explicit name of the external package dependencies. Each dependency + is added to the `requirements.txt` as is, and can be of the form + supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. + cloud_function_service_account (str, Optional): + Service account to use for the cloud functions. If not provided then + the default service account would be used. See + https://cloud.google.com/functions/docs/securing/function-identity + for more details. Please make sure the service account has the + necessary IAM permissions configured as described in + https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. + cloud_function_kms_key_name (str, Optional): + Customer managed encryption key to protect cloud functions and + related data at rest. This is of the format + projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. + Read https://cloud.google.com/functions/docs/securing/cmek for + more details including granting necessary service accounts + access to the key. + cloud_function_docker_repository (str, Optional): + Docker repository created with the same encryption key as + `cloud_function_kms_key_name` to store encrypted artifacts + created to support the cloud function. This is of the format + projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. + For more details see + https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. 
+ max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. + cloud_function_timeout (int, Optional): + The maximum amount of time (in seconds) BigQuery should wait for + the cloud function to return a response. See for more details + https://cloud.google.com/functions/docs/configuring/timeout. + Please note that even though the cloud function (2nd gen) itself + allows seeting up to 60 minutes of timeout, BigQuery remote + function can wait only up to 20 minutes, see for more details + https://cloud.google.com/bigquery/quotas#remote_function_limits. + By default BigQuery DataFrames uses a 10 minute timeout. `None` + can be passed to let the cloud functions default timeout take effect. + cloud_function_max_instances (int, Optional): + The maximumm instance count for the cloud function created. This + can be used to control how many cloud function instances can be + active at max at any given point of time. Lower setting can help + control the spike in the billing. Higher setting can help + support processing larger scale data. When not specified, cloud + function's default setting applies. For more details see + https://cloud.google.com/functions/docs/configuring/max-instances. + cloud_function_vpc_connector (str, Optional): + The VPC connector you would like to configure for your cloud + function. This is useful if your code needs access to data or + service(s) that are on a VPC network. See for more details + https://cloud.google.com/functions/docs/networking/connecting-vpc. + cloud_function_memory_mib (int, Optional): + The amounts of memory (in mebibytes) to allocate for the cloud + function (2nd gen) created. This also dictates a corresponding + amount of allocated CPU for the function. By default a memory of + 1024 MiB is set for the cloud functions created to support + BigQuery DataFrames remote function. If you want to let the + default memory of cloud functions be allocated, pass `None`. See + for more details + https://cloud.google.com/functions/docs/configuring/memory. + """ + # Some defaults may be used from the session if not provided otherwise + import bigframes.exceptions as bf_exceptions + import bigframes.pandas as bpd + import bigframes.series as bf_series + import bigframes.session + + session = cast(bigframes.session.Session, session or bpd.get_global_session()) + + # A BigQuery client is required to perform BQ operations + if not bigquery_client: + bigquery_client = session.bqclient + if not bigquery_client: + raise ValueError( + "A bigquery client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A BigQuery connection client is required to perform BQ connection operations + if not bigquery_connection_client: + bigquery_connection_client = session.bqconnectionclient + if not bigquery_connection_client: + raise ValueError( + "A bigquery connection client must be provided, either directly or via session. 
" + f"{constants.FEEDBACK_LINK}" + ) + + # A cloud functions client is required to perform cloud functions operations + if not cloud_functions_client: + cloud_functions_client = session.cloudfunctionsclient + if not cloud_functions_client: + raise ValueError( + "A cloud functions client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A resource manager client is required to get/set IAM operations + if not resource_manager_client: + resource_manager_client = session.resourcemanagerclient + if not resource_manager_client: + raise ValueError( + "A resource manager client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # BQ remote function must be persisted, for which we need a dataset + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. + if dataset: + dataset_ref = bigquery.DatasetReference.from_string( + dataset, default_project=bigquery_client.project + ) + else: + dataset_ref = session._anonymous_dataset + + bq_location, cloud_function_region = _utils.get_remote_function_locations( + bigquery_client.location + ) + + # A connection is required for BQ remote function + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function + if not bigquery_connection: + bigquery_connection = session._bq_connection # type: ignore + + bigquery_connection = clients.resolve_full_bq_connection_name( + bigquery_connection, + default_project=dataset_ref.project, + default_location=bq_location, + ) + # Guaranteed to be the form of .. + ( + gcp_project_id, + bq_connection_location, + bq_connection_id, + ) = bigquery_connection.split(".") + if gcp_project_id.casefold() != dataset_ref.project.casefold(): + raise ValueError( + "The project_id does not match BigQuery connection gcp_project_id: " + f"{dataset_ref.project}." + ) + if bq_connection_location.casefold() != bq_location.casefold(): + raise ValueError( + "The location does not match BigQuery connection location: " + f"{bq_location}." + ) + + # If any CMEK is intended then check that a docker repository is also specified + if ( + cloud_function_kms_key_name is not None + and cloud_function_docker_repository is None + ): + raise ValueError( + "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." + " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" + ) + + bq_connection_manager = session.bqconnectionmanager + + def wrapper(func): + nonlocal input_types, output_type + + if not callable(func): + raise TypeError("f must be callable, got {}".format(func)) + + if sys.version_info >= (3, 10): + # Add `eval_str = True` so that deferred annotations are turned into their + # corresponding type objects. Need Python 3.10 for eval_str parameter. + # https://docs.python.org/3/library/inspect.html#inspect.signature + signature_kwargs: Mapping[str, Any] = {"eval_str": True} + else: + signature_kwargs = {} + + signature = inspect.signature( + func, + **signature_kwargs, + ) + + # Try to get input types via type annotations. + if input_types is None: + input_types = [] + for parameter in signature.parameters.values(): + if (param_type := parameter.annotation) is inspect.Signature.empty: + raise ValueError( + "'input_types' was not set and parameter " + f"'{parameter.name}' is missing a type annotation. " + "Types are required to use @remote_function." 
+ ) + input_types.append(param_type) + elif not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + + if output_type is None: + if ( + output_type := signature.return_annotation + ) is inspect.Signature.empty: + raise ValueError( + "'output_type' was not set and function is missing a " + "return type annotation. Types are required to use " + "@remote_function." + ) + + # The function will actually be receiving a pandas Series, but allow both + # BigQuery DataFrames and pandas object types for compatibility. + is_row_processor = False + if len(input_types) == 1 and ( + (input_type := input_types[0]) == bf_series.Series + or input_type == pandas.Series + ): + warnings.warn( + "input_types=Series is in preview.", + stacklevel=1, + category=bf_exceptions.PreviewWarning, + ) + + # we will model the row as a json serialized string containing the data + # and the metadata representing the row + input_types = [str] + is_row_processor = True + elif isinstance(input_types, type): + input_types = [input_types] + + # TODO(b/340898611): fix type error + ibis_signature = _utils.ibis_signature_from_python_signature( + signature, input_types, output_type # type: ignore + ) + + remote_function_client = rf_client.RemoteFunctionClient( + dataset_ref.project, + cloud_function_region, + cloud_functions_client, + bq_location, + dataset_ref.dataset_id, + bigquery_client, + bq_connection_id, + bq_connection_manager, + cloud_function_service_account, + cloud_function_kms_key_name, + cloud_function_docker_repository, + session=session, # type: ignore + ) + + # In the unlikely case where the user is trying to re-deploy the same + # function, cleanup the attributes we add below, first. This prevents + # the pickle from having dependencies that might not otherwise be + # present such as ibis or pandas. 
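Below is a small, self-contained illustration of the annotation-based type inference performed in the wrapper above; my_udf is a hypothetical scalar function used only for demonstration.

import inspect
import sys

def my_udf(x: int, y: float) -> float:
    return x * y

# eval_str=True resolves deferred (string) annotations into type objects on Python 3.10+.
signature_kwargs = {"eval_str": True} if sys.version_info >= (3, 10) else {}
signature = inspect.signature(my_udf, **signature_kwargs)

input_types = [p.annotation for p in signature.parameters.values()]
output_type = signature.return_annotation
print(input_types, output_type)  # [<class 'int'>, <class 'float'>] <class 'float'>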
+ def try_delattr(attr): + try: + delattr(func, attr) + except AttributeError: + pass + + try_delattr("bigframes_cloud_function") + try_delattr("bigframes_remote_function") + try_delattr("input_dtypes") + try_delattr("output_dtype") + try_delattr("is_row_processor") + try_delattr("ibis_node") + + ( + rf_name, + cf_name, + created_new, + ) = remote_function_client.provision_bq_remote_function( + func, + input_types=tuple( + third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) + for type_ in ibis_signature.input_types + ), + output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( + ibis_signature.output_type + ), + reuse=reuse, + name=name, + package_requirements=packages, + max_batching_rows=max_batching_rows, + cloud_function_timeout=cloud_function_timeout, + cloud_function_max_instance_count=cloud_function_max_instances, + is_row_processor=is_row_processor, + cloud_function_vpc_connector=cloud_function_vpc_connector, + cloud_function_memory_mib=cloud_function_memory_mib, + ) + + # TODO: Move ibis logic to compiler step + node = ibis.udf.scalar.builtin( + func, + name=rf_name, + schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) + func.bigframes_cloud_function = ( + remote_function_client.get_cloud_function_fully_qualified_name(cf_name) + ) + func.bigframes_remote_function = ( + remote_function_client.get_remote_function_fully_qualilfied_name( + rf_name + ) + ) + func.input_dtypes = tuple( + [ + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + input_type + ) + for input_type in ibis_signature.input_types + ] + ) + func.output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_signature.output_type + ) + ) + func.is_row_processor = is_row_processor + func.ibis_node = node + + # If a new remote function was created, update the cloud artifacts + # created in the session. This would be used to clean up any + # resources in the session. Note that we need to do this only for + # the case where an explicit name was not provided by the user and + # we used an internal name. For the cases where the user provided an + # explicit name, we are assuming that the user wants to persist them + # with that name and would directly manage their lifecycle. + if created_new and (not name): + self._update_temp_artifacts( + func.bigframes_remote_function, func.bigframes_cloud_function + ) + return func + + return wrapper diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py new file mode 100644 index 0000000000..537473bed8 --- /dev/null +++ b/bigframes/functions/_utils.py @@ -0,0 +1,214 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
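Before moving on to the helpers split out into _utils.py, a hedged usage sketch of the decorator documented above, as it is typically reached through bigframes.pandas; the dataset, connection, and package names are placeholders, and a configured GCP project and session are assumed.

import bigframes.pandas as bpd

@bpd.remote_function(
    dataset="my_dataset",                     # hypothetical dataset id
    bigquery_connection="bigframes-rf-conn",  # hypothetical BQ connection
    reuse=True,
    packages=["cryptography"],                # example external dependency
    max_batching_rows=1000,
)
def get_length(text: str) -> int:  # input/output types inferred from annotations
    return len(text)

# The deployed function can then be applied to a Series, e.g.:
# lengths = df["name"].apply(get_length)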
+ + +import hashlib +import inspect +from typing import cast, List, NamedTuple, Optional, Sequence, Set + +import cloudpickle +import google.api_core.exceptions +from google.cloud import bigquery, functions_v2 +import ibis.expr.datatypes.core +import numpy +import pandas +import pyarrow + +import bigframes.core.compile.ibis_types + +# Naming convention for the remote function artifacts +_BIGFRAMES_REMOTE_FUNCTION_PREFIX = "bigframes" +_BQ_FUNCTION_NAME_SEPERATOR = "_" +_GCF_FUNCTION_NAME_SEPERATOR = "-" + +# Protocol version 4 is available in python version 3.4 and above +# https://docs.python.org/3/library/pickle.html#data-stream-format +_pickle_protocol_version = 4 + + +def get_remote_function_locations(bq_location): + """Get BQ location and cloud functions region given a BQ client.""" + # TODO(shobs, b/274647164): Find the best way to determine default location. + # For now let's assume that if no BQ location is set in the client then it + # defaults to US multi region + bq_location = bq_location.lower() if bq_location else "us" + + # Cloud function should be in the same region as the bigquery remote function + cloud_function_region = bq_location + + # BigQuery has multi region but cloud functions does not. + # Any region in the multi region that supports cloud functions should work + # https://cloud.google.com/functions/docs/locations + if bq_location == "us": + cloud_function_region = "us-central1" + elif bq_location == "eu": + cloud_function_region = "europe-west1" + + return bq_location, cloud_function_region + + +def _get_updated_package_requirements( + package_requirements=None, is_row_processor=False +): + requirements = [f"cloudpickle=={cloudpickle.__version__}"] + if is_row_processor: + # bigframes remote function will send an entire row of data as json, + # which would be converted to a pandas series and processed + # Ensure numpy versions match to avoid unpickling problems. See + # internal issue b/347934471. + requirements.append(f"numpy=={numpy.__version__}") + requirements.append(f"pandas=={pandas.__version__}") + requirements.append(f"pyarrow=={pyarrow.__version__}") + + if package_requirements: + requirements.extend(package_requirements) + + requirements = sorted(requirements) + return requirements + + +def _clean_up_by_session_id( + bqclient: bigquery.Client, + gcfclient: functions_v2.FunctionServiceClient, + dataset: bigquery.DatasetReference, + session_id: str, +): + """Delete remote function artifacts for a session id, where the session id + was not necessarily created in the current runtime. This is useful if the + user worked with a BigQuery DataFrames session previously and remembered the + session id, and now wants to clean up its temporary resources at a later + point in time. 
+ """ + + # First clean up the BQ remote functions and then the underlying + # cloud functions, so that at no point we are left with a remote function + # that is pointing to a cloud function that does not exist + + endpoints_to_be_deleted: Set[str] = set() + match_prefix = "".join( + [ + _BIGFRAMES_REMOTE_FUNCTION_PREFIX, + _BQ_FUNCTION_NAME_SEPERATOR, + session_id, + _BQ_FUNCTION_NAME_SEPERATOR, + ] + ) + for routine in bqclient.list_routines(dataset): + routine = cast(bigquery.Routine, routine) + + # skip past the routines not belonging to the given session id, or + # non-remote-function routines + if ( + routine.type_ != bigquery.RoutineType.SCALAR_FUNCTION + or not cast(str, routine.routine_id).startswith(match_prefix) + or not routine.remote_function_options + or not routine.remote_function_options.endpoint + ): + continue + + # Let's forgive the edge case possibility that the BQ remote function + # may have been deleted at the same time directly by the user + bqclient.delete_routine(routine, not_found_ok=True) + endpoints_to_be_deleted.add(routine.remote_function_options.endpoint) + + # Now clean up the cloud functions + bq_location = bqclient.get_dataset(dataset).location + bq_location, gcf_location = get_remote_function_locations(bq_location) + parent_path = gcfclient.common_location_path( + project=dataset.project, location=gcf_location + ) + for gcf in gcfclient.list_functions(parent=parent_path): + # skip past the cloud functions not attached to any BQ remote function + # belonging to the given session id + if gcf.service_config.uri not in endpoints_to_be_deleted: + continue + + # Let's forgive the edge case possibility that the cloud function + # may have been deleted at the same time directly by the user + try: + gcfclient.delete_function(name=gcf.name) + except google.api_core.exceptions.NotFound: + pass + + +def _get_hash(def_, package_requirements=None): + "Get hash (32 digits alphanumeric) of a function." + # There is a known cell-id sensitivity of the cloudpickle serialization in + # notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of + # this, if a cell contains a udf decorated with @remote_function, a unique + # cloudpickle code is generated every time the cell is run, creating new + # cloud artifacts every time. This is slow and wasteful. + # A workaround of the same can be achieved by replacing the filename in the + # code object to a static value + # https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661. + # + # To respect the user code/environment let's make this modification on a + # copy of the udf, not on the original udf itself. + def_copy = cloudpickle.loads(cloudpickle.dumps(def_)) + def_copy.__code__ = def_copy.__code__.replace( + co_filename="bigframes_place_holder_filename" + ) + + def_repr = cloudpickle.dumps(def_copy, protocol=_pickle_protocol_version) + if package_requirements: + for p in sorted(package_requirements): + def_repr += p.encode() + return hashlib.md5(def_repr).hexdigest() + + +def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: + return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" + + +def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): + "Get a name for the cloud function for the given user defined function." 
+ parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_hash) + if uniq_suffix: + parts.append(uniq_suffix) + return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) + + +def get_remote_function_name(function_hash, session_id, uniq_suffix=None): + "Get a name for the BQ remote function for the given user defined function." + parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash] + if uniq_suffix: + parts.append(uniq_suffix) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) + + +class IbisSignature(NamedTuple): + parameter_names: List[str] + input_types: List[Optional[ibis.expr.datatypes.core.DataType]] + output_type: ibis.expr.datatypes.core.DataType + + +def ibis_signature_from_python_signature( + signature: inspect.Signature, + input_types: Sequence[type], + output_type: type, +) -> IbisSignature: + + return IbisSignature( + parameter_names=list(signature.parameters.keys()), + input_types=[ + bigframes.core.compile.ibis_types.ibis_type_from_python_type(t) + for t in input_types + ], + output_type=bigframes.core.compile.ibis_types.ibis_type_from_python_type( + output_type + ), + ) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index b3c6aee1b3..7e9df74e76 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -14,664 +14,29 @@ from __future__ import annotations -import collections.abc -import hashlib -import inspect import logging -import os -import random -import shutil -import string -import sys -import tempfile -import threading -from typing import ( - Any, - cast, - Dict, - List, - Mapping, - NamedTuple, - Optional, - Sequence, - Set, - Tuple, - TYPE_CHECKING, - Union, -) +from typing import cast, Optional, TYPE_CHECKING import warnings import ibis -import numpy -import pandas -import pyarrow -import requests if TYPE_CHECKING: from bigframes.session import Session -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -import cloudpickle import google.api_core.exceptions import google.api_core.retry -from google.cloud import ( - bigquery, - bigquery_connection_v1, - functions_v2, - resourcemanager_v3, -) +from google.cloud import bigquery import google.iam.v1 -from ibis.expr.datatypes.core import DataType as IbisDataType -from bigframes import clients import bigframes.constants as constants import bigframes.core.compile.ibis_types import bigframes.dtypes import bigframes.functions.remote_function_template -logger = logging.getLogger(__name__) - -# Naming convention for the remote function artifacts -_BIGFRAMES_REMOTE_FUNCTION_PREFIX = "bigframes" -_BQ_FUNCTION_NAME_SEPERATOR = "_" -_GCF_FUNCTION_NAME_SEPERATOR = "-" - -# Protocol version 4 is available in python version 3.4 and above -# https://docs.python.org/3/library/pickle.html#data-stream-format -_pickle_protocol_version = 4 - - -def _clean_up_by_session_id( - bqclient: bigquery.Client, - gcfclient: functions_v2.FunctionServiceClient, - dataset: bigquery.DatasetReference, - session_id: str, -): - """Delete remote function artifacts for a session id, where the session id - was not necessarily created in the current runtime. This is useful if the - user worked with a BigQuery DataFrames session previously and remembered the - session id, and now wants to clean up its temporary resources at a later - point in time. 
- """ - - # First clean up the BQ remote functions and then the underlying - # cloud functions, so that at no point we are left with a remote function - # that is pointing to a cloud function that does not exist - - endpoints_to_be_deleted: Set[str] = set() - match_prefix = "".join( - [ - _BIGFRAMES_REMOTE_FUNCTION_PREFIX, - _BQ_FUNCTION_NAME_SEPERATOR, - session_id, - _BQ_FUNCTION_NAME_SEPERATOR, - ] - ) - for routine in bqclient.list_routines(dataset): - routine = cast(bigquery.Routine, routine) - - # skip past the routines not belonging to the given session id, or - # non-remote-function routines - if ( - routine.type_ != bigquery.RoutineType.SCALAR_FUNCTION - or not cast(str, routine.routine_id).startswith(match_prefix) - or not routine.remote_function_options - or not routine.remote_function_options.endpoint - ): - continue - - # Let's forgive the edge case possibility that the BQ remote function - # may have been deleted at the same time directly by the user - bqclient.delete_routine(routine, not_found_ok=True) - endpoints_to_be_deleted.add(routine.remote_function_options.endpoint) - - # Now clean up the cloud functions - bq_location = bqclient.get_dataset(dataset).location - bq_location, gcf_location = get_remote_function_locations(bq_location) - parent_path = gcfclient.common_location_path( - project=dataset.project, location=gcf_location - ) - for gcf in gcfclient.list_functions(parent=parent_path): - # skip past the cloud functions not attached to any BQ remote function - # belonging to the given session id - if gcf.service_config.uri not in endpoints_to_be_deleted: - continue - - # Let's forgive the edge case possibility that the cloud function - # may have been deleted at the same time directly by the user - try: - gcfclient.delete_function(name=gcf.name) - except google.api_core.exceptions.NotFound: - pass - - -def get_remote_function_locations(bq_location): - """Get BQ location and cloud functions region given a BQ client.""" - # TODO(shobs, b/274647164): Find the best way to determine default location. - # For now let's assume that if no BQ location is set in the client then it - # defaults to US multi region - bq_location = bq_location.lower() if bq_location else "us" - - # Cloud function should be in the same region as the bigquery remote function - cloud_function_region = bq_location - - # BigQuery has multi region but cloud functions does not. - # Any region in the multi region that supports cloud functions should work - # https://cloud.google.com/functions/docs/locations - if bq_location == "us": - cloud_function_region = "us-central1" - elif bq_location == "eu": - cloud_function_region = "europe-west1" - - return bq_location, cloud_function_region - - -def _get_hash(def_, package_requirements=None): - "Get hash (32 digits alphanumeric) of a function." - # There is a known cell-id sensitivity of the cloudpickle serialization in - # notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of - # this, if a cell contains a udf decorated with @remote_function, a unique - # cloudpickle code is generated every time the cell is run, creating new - # cloud artifacts every time. This is slow and wasteful. - # A workaround of the same can be achieved by replacing the filename in the - # code object to a static value - # https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661. - # - # To respect the user code/environment let's make this modification on a - # copy of the udf, not on the original udf itself. 
- def_copy = cloudpickle.loads(cloudpickle.dumps(def_)) - def_copy.__code__ = def_copy.__code__.replace( - co_filename="bigframes_place_holder_filename" - ) - - def_repr = cloudpickle.dumps(def_copy, protocol=_pickle_protocol_version) - if package_requirements: - for p in sorted(package_requirements): - def_repr += p.encode() - return hashlib.md5(def_repr).hexdigest() - - -def _get_updated_package_requirements( - package_requirements=None, is_row_processor=False -): - requirements = [f"cloudpickle=={cloudpickle.__version__}"] - if is_row_processor: - # bigframes remote function will send an entire row of data as json, - # which would be converted to a pandas series and processed - # Ensure numpy versions match to avoid unpickling problems. See - # internal issue b/347934471. - requirements.append(f"numpy=={numpy.__version__}") - requirements.append(f"pandas=={pandas.__version__}") - requirements.append(f"pyarrow=={pyarrow.__version__}") - - if package_requirements: - requirements.extend(package_requirements) - - requirements = sorted(requirements) - return requirements - - -def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: - return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" - - -class IbisSignature(NamedTuple): - parameter_names: List[str] - input_types: List[Optional[IbisDataType]] - output_type: IbisDataType - - -def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): - "Get a name for the cloud function for the given user defined function." - parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX] - if session_id: - parts.append(session_id) - parts.append(function_hash) - if uniq_suffix: - parts.append(uniq_suffix) - return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) - - -def get_remote_function_name(function_hash, session_id, uniq_suffix=None): - "Get a name for the BQ remote function for the given user defined function." 
- parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash] - if uniq_suffix: - parts.append(uniq_suffix) - return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) - - -class RemoteFunctionClient: - # Wait time (in seconds) for an IAM binding to take effect after creation - _iam_wait_seconds = 120 - - def __init__( - self, - gcp_project_id, - cloud_function_region, - cloud_functions_client, - bq_location, - bq_dataset, - bq_client, - bq_connection_id, - bq_connection_manager, - cloud_function_service_account, - cloud_function_kms_key_name, - cloud_function_docker_repository, - *, - session: Session, - ): - self._gcp_project_id = gcp_project_id - self._cloud_function_region = cloud_function_region - self._cloud_functions_client = cloud_functions_client - self._bq_location = bq_location - self._bq_dataset = bq_dataset - self._bq_client = bq_client - self._bq_connection_id = bq_connection_id - self._bq_connection_manager = bq_connection_manager - self._cloud_function_service_account = cloud_function_service_account - self._cloud_function_kms_key_name = cloud_function_kms_key_name - self._cloud_function_docker_repository = cloud_function_docker_repository - self._session = session - - def create_bq_remote_function( - self, - input_args, - input_types, - output_type, - endpoint, - bq_function_name, - max_batching_rows, - ): - """Create a BigQuery remote function given the artifacts of a user defined - function and the http endpoint of a corresponding cloud function.""" - if self._bq_connection_manager: - self._bq_connection_manager.create_bq_connection( - self._gcp_project_id, - self._bq_location, - self._bq_connection_id, - "run.invoker", - ) - - # Create BQ function - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 - bq_function_args = [] - bq_function_return_type = output_type - - # We are expecting the input type annotations to be 1:1 with the input args - for name, type_ in zip(input_args, input_types): - bq_function_args.append(f"{name} {type_}") - - remote_function_options = { - "endpoint": endpoint, - "max_batching_rows": max_batching_rows, - } - - remote_function_options_str = ", ".join( - [ - f'{key}="{val}"' if isinstance(val, str) else f"{key}={val}" - for key, val in remote_function_options.items() - if val is not None - ] - ) - - create_function_ddl = f""" - CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) - RETURNS {bq_function_return_type} - REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` - OPTIONS ({remote_function_options_str})""" - - logger.info(f"Creating BQ remote function: {create_function_ddl}") - - # Make sure the dataset exists. I.e. if it doesn't exist, go ahead and - # create it - dataset = bigquery.Dataset( - bigquery.DatasetReference.from_string( - self._bq_dataset, default_project=self._gcp_project_id - ) - ) - dataset.location = self._bq_location - try: - # This check does not require bigquery.datasets.create IAM - # permission. So, if the data set already exists, then user can work - # without having that permission. - self._bq_client.get_dataset(dataset) - except google.api_core.exceptions.NotFound: - # This requires bigquery.datasets.create IAM permission - self._bq_client.create_dataset(dataset, exists_ok=True) - - # TODO(swast): plumb through the original, user-facing api_name. 
- _, query_job = self._session._start_query(create_function_ddl) - logger.info(f"Created remote function {query_job.ddl_target_routine}") - - def get_cloud_function_fully_qualified_parent(self): - "Get the fully qualilfied parent for a cloud function." - return self._cloud_functions_client.common_location_path( - self._gcp_project_id, self._cloud_function_region - ) - - def get_cloud_function_fully_qualified_name(self, name): - "Get the fully qualilfied name for a cloud function." - return self._cloud_functions_client.function_path( - self._gcp_project_id, self._cloud_function_region, name - ) - - def get_remote_function_fully_qualilfied_name(self, name): - "Get the fully qualilfied name for a BQ remote function." - return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" - - def get_cloud_function_endpoint(self, name): - """Get the http endpoint of a cloud function if it exists.""" - fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) - try: - response = self._cloud_functions_client.get_function( - name=fully_qualified_name - ) - return response.service_config.uri - except google.api_core.exceptions.NotFound: - pass - return None - - def generate_cloud_function_code( - self, - def_, - directory, - *, - input_types: Tuple[str], - output_type: str, - package_requirements=None, - is_row_processor=False, - ): - """Generate the cloud function code for a given user defined function. - - Args: - input_types (tuple[str]): - Types of the input arguments in BigQuery SQL data type names. - output_type (str): - Types of the output scalar as a BigQuery SQL data type name. - """ - - # requirements.txt - if package_requirements: - requirements_txt = os.path.join(directory, "requirements.txt") - with open(requirements_txt, "w") as f: - f.write("\n".join(package_requirements)) - - # main.py - entry_point = bigframes.functions.remote_function_template.generate_cloud_function_main_code( - def_, - directory, - input_types=input_types, - output_type=output_type, - is_row_processor=is_row_processor, - ) - return entry_point - - def create_cloud_function( - self, - def_, - cf_name, - *, - input_types: Tuple[str], - output_type: str, - package_requirements=None, - timeout_seconds=600, - max_instance_count=None, - is_row_processor=False, - vpc_connector=None, - memory_mib=1024, - ): - """Create a cloud function from the given user defined function. - - Args: - input_types (tuple[str]): - Types of the input arguments in BigQuery SQL data type names. - output_type (str): - Types of the output scalar as a BigQuery SQL data type name. - """ - - # Build and deploy folder structure containing cloud function - with tempfile.TemporaryDirectory() as directory: - entry_point = self.generate_cloud_function_code( - def_, - directory, - package_requirements=package_requirements, - input_types=input_types, - output_type=output_type, - is_row_processor=is_row_processor, - ) - archive_path = shutil.make_archive(directory, "zip", directory) - - # We are creating cloud function source code from the currently running - # python version. Use the same version to deploy. This is necessary - # because cloudpickle serialization done in one python version and - # deserialization done in another python version doesn't work. - # TODO(shobs): Figure out how to achieve version compatibility, specially - # when pickle (internally used by cloudpickle) guarantees that: - # https://docs.python.org/3/library/pickle.html#:~:text=The%20pickle%20serialization%20format%20is,unique%20breaking%20change%20language%20boundary. 
- python_version = "python{}{}".format( - sys.version_info.major, sys.version_info.minor - ) - - # Determine an upload URL for user code - upload_url_request = functions_v2.GenerateUploadUrlRequest( - kms_key_name=self._cloud_function_kms_key_name - ) - upload_url_request.parent = self.get_cloud_function_fully_qualified_parent() - upload_url_response = self._cloud_functions_client.generate_upload_url( - request=upload_url_request - ) - - # Upload the code to GCS - with open(archive_path, "rb") as f: - response = requests.put( - upload_url_response.upload_url, - data=f, - headers={"content-type": "application/zip"}, - ) - if response.status_code != 200: - raise RuntimeError( - "Failed to upload user code. code={}, reason={}, text={}".format( - response.status_code, response.reason, response.text - ) - ) - - # Deploy Cloud Function - create_function_request = functions_v2.CreateFunctionRequest() - create_function_request.parent = ( - self.get_cloud_function_fully_qualified_parent() - ) - create_function_request.function_id = cf_name - function = functions_v2.Function() - function.name = self.get_cloud_function_fully_qualified_name(cf_name) - function.build_config = functions_v2.BuildConfig() - function.build_config.runtime = python_version - function.build_config.entry_point = entry_point - function.build_config.source = functions_v2.Source() - function.build_config.source.storage_source = functions_v2.StorageSource() - function.build_config.source.storage_source.bucket = ( - upload_url_response.storage_source.bucket - ) - function.build_config.source.storage_source.object_ = ( - upload_url_response.storage_source.object_ - ) - function.build_config.docker_repository = ( - self._cloud_function_docker_repository - ) - function.service_config = functions_v2.ServiceConfig() - if memory_mib is not None: - function.service_config.available_memory = f"{memory_mib}Mi" - if timeout_seconds is not None: - if timeout_seconds > 1200: - raise ValueError( - "BigQuery remote function can wait only up to 20 minutes" - ", see for more details " - "https://cloud.google.com/bigquery/quotas#remote_function_limits." - ) - function.service_config.timeout_seconds = timeout_seconds - if max_instance_count is not None: - function.service_config.max_instance_count = max_instance_count - if vpc_connector is not None: - function.service_config.vpc_connector = vpc_connector - function.service_config.service_account_email = ( - self._cloud_function_service_account - ) - function.kms_key_name = self._cloud_function_kms_key_name - create_function_request.function = function - - # Create the cloud function and wait for it to be ready to use - try: - operation = self._cloud_functions_client.create_function( - request=create_function_request - ) - operation.result() - - # Cleanup - os.remove(archive_path) - except google.api_core.exceptions.AlreadyExists: - # If a cloud function with the same name already exists, let's - # update it - update_function_request = functions_v2.UpdateFunctionRequest() - update_function_request.function = function - operation = self._cloud_functions_client.update_function( - request=update_function_request - ) - operation.result() - - # Fetch the endpoint of the just created function - endpoint = self.get_cloud_function_endpoint(cf_name) - if not endpoint: - raise ValueError( - f"Couldn't fetch the http endpoint. 
{constants.FEEDBACK_LINK}" - ) - - logger.info( - f"Successfully created cloud function {cf_name} with uri ({endpoint})" - ) - return endpoint - - def provision_bq_remote_function( - self, - def_, - input_types, - output_type, - reuse, - name, - package_requirements, - max_batching_rows, - cloud_function_timeout, - cloud_function_max_instance_count, - is_row_processor, - cloud_function_vpc_connector, - cloud_function_memory_mib, - ): - """Provision a BigQuery remote function.""" - # Augment user package requirements with any internal package - # requirements - package_requirements = _get_updated_package_requirements( - package_requirements, is_row_processor - ) - - # Compute a unique hash representing the user code - function_hash = _get_hash(def_, package_requirements) - - # If reuse of any existing function with the same name (indicated by the - # same hash of its source code) is not intended, then attach a unique - # suffix to the intended function name to make it unique. - uniq_suffix = None - if not reuse: - # use 4 digits as a unique suffix which should suffice for - # uniqueness per session - uniq_suffix = "".join( - random.choices(string.ascii_lowercase + string.digits, k=4) - ) - - # Derive the name of the cloud function underlying the intended BQ - # remote function. Use the session id to identify the GCF for unnamed - # functions. The named remote functions are treated as a persistant - # artifacts, so let's keep them independent of session id, which also - # makes their naming more stable for the same udf code - session_id = None if name else self._session.session_id - cloud_function_name = get_cloud_function_name( - function_hash, session_id, uniq_suffix - ) - cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) - - # Create the cloud function if it does not exist - if not cf_endpoint: - cf_endpoint = self.create_cloud_function( - def_, - cloud_function_name, - input_types=input_types, - output_type=output_type, - package_requirements=package_requirements, - timeout_seconds=cloud_function_timeout, - max_instance_count=cloud_function_max_instance_count, - is_row_processor=is_row_processor, - vpc_connector=cloud_function_vpc_connector, - memory_mib=cloud_function_memory_mib, - ) - else: - logger.info(f"Cloud function {cloud_function_name} already exists.") - - # Derive the name of the remote function - remote_function_name = name - if not remote_function_name: - remote_function_name = get_remote_function_name( - function_hash, self._session.session_id, uniq_suffix - ) - rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) - - # Create the BQ remote function in following circumstances: - # 1. It does not exist - # 2. It exists but the existing remote function has different - # configuration than intended - created_new = False - if not rf_endpoint or ( - rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id - ): - input_args = inspect.getargs(def_.__code__).args - if len(input_args) != len(input_types): - raise ValueError( - "Exactly one type should be provided for every input arg." - ) - self.create_bq_remote_function( - input_args, - input_types, - output_type, - cf_endpoint, - remote_function_name, - max_batching_rows, - ) - - created_new = True - else: - logger.info(f"Remote function {remote_function_name} already exists.") - - return remote_function_name, cloud_function_name, created_new +from . import _remote_function_session as rf_session +from . 
import _utils - def get_remote_function_specs(self, remote_function_name): - """Check whether a remote function already exists for the udf.""" - http_endpoint = None - bq_connection = None - routines = self._bq_client.list_routines( - f"{self._gcp_project_id}.{self._bq_dataset}" - ) - try: - for routine in routines: - routine = cast(bigquery.Routine, routine) - if routine.reference.routine_id == remote_function_name: - rf_options = routine.remote_function_options - if rf_options: - http_endpoint = rf_options.endpoint - bq_connection = rf_options.connection - if bq_connection: - bq_connection = os.path.basename(bq_connection) - break - except google.api_core.exceptions.NotFound: - # The dataset might not exist, in which case the http_endpoint doesn't, either. - # Note: list_routines doesn't make an API request until we iterate on the response object. - pass - return (http_endpoint, bq_connection) +logger = logging.getLogger(__name__) class UnsupportedTypeError(ValueError): @@ -680,34 +45,16 @@ def __init__(self, type_, supported_types): self.supported_types = supported_types -def ibis_signature_from_python_signature( - signature: inspect.Signature, - input_types: Sequence[type], - output_type: type, -) -> IbisSignature: - - return IbisSignature( - parameter_names=list(signature.parameters.keys()), - input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_python_type(t) - for t in input_types - ], - output_type=bigframes.core.compile.ibis_types.ibis_type_from_python_type( - output_type - ), - ) - - class ReturnTypeMissingError(ValueError): pass # TODO: Move this to compile folder -def ibis_signature_from_routine(routine: bigquery.Routine) -> IbisSignature: +def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: if not routine.return_type: raise ReturnTypeMissingError - return IbisSignature( + return _utils.IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ bigframes.core.compile.ibis_types.ibis_type_from_type_kind( @@ -748,515 +95,12 @@ def get_routine_reference( return dataset_ref.routine(routine_ref_str) -class _RemoteFunctionSession: - """Session to manage remote functions.""" - - def __init__(self): - # Session level mapping of remote function artifacts - self._temp_artifacts: Dict[str, str] = dict() - - # Lock to synchronize the update of the session artifacts - self._artifacts_lock = threading.Lock() - - def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): - """Update remote function artifacts in the current session.""" - with self._artifacts_lock: - self._temp_artifacts[bqrf_routine] = gcf_path - - def clean_up( - self, - bqclient: bigquery.Client, - gcfclient: functions_v2.FunctionServiceClient, - session_id: str, - ): - """Delete remote function artifacts in the current session.""" - with self._artifacts_lock: - for bqrf_routine, gcf_path in self._temp_artifacts.items(): - # Let's accept the possibility that the remote function may have - # been deleted directly by the user - bqclient.delete_routine(bqrf_routine, not_found_ok=True) - - # Let's accept the possibility that the cloud function may have - # been deleted directly by the user - try: - gcfclient.delete_function(name=gcf_path) - except google.api_core.exceptions.NotFound: - pass - - self._temp_artifacts.clear() - - # Inspired by @udf decorator implemented in ibis-bigquery package - # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py - # which has moved as @js to the ibis package - # 
https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py - def remote_function( - self, - input_types: Union[None, type, Sequence[type]] = None, - output_type: Optional[type] = None, - session: Optional[Session] = None, - bigquery_client: Optional[bigquery.Client] = None, - bigquery_connection_client: Optional[ - bigquery_connection_v1.ConnectionServiceClient - ] = None, - cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, - resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, - dataset: Optional[str] = None, - bigquery_connection: Optional[str] = None, - reuse: bool = True, - name: Optional[str] = None, - packages: Optional[Sequence[str]] = None, - cloud_function_service_account: Optional[str] = None, - cloud_function_kms_key_name: Optional[str] = None, - cloud_function_docker_repository: Optional[str] = None, - max_batching_rows: Optional[int] = 1000, - cloud_function_timeout: Optional[int] = 600, - cloud_function_max_instances: Optional[int] = None, - cloud_function_vpc_connector: Optional[str] = None, - cloud_function_memory_mib: Optional[int] = 1024, - ): - """Decorator to turn a user defined function into a BigQuery remote function. - - .. deprecated:: 0.0.1 - This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. - - .. note:: - Please make sure following is setup before using this API: - - 1. Have the below APIs enabled for your project: - - * BigQuery Connection API - * Cloud Functions API - * Cloud Run API - * Cloud Build API - * Artifact Registry API - * Cloud Resource Manager API - - This can be done from the cloud console (change `PROJECT_ID` to yours): - https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID - - Or from the gcloud CLI: - - `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` - - 2. Have following IAM roles enabled for you: - - * BigQuery Data Editor (roles/bigquery.dataEditor) - * BigQuery Connection Admin (roles/bigquery.connectionAdmin) - * Cloud Functions Developer (roles/cloudfunctions.developer) - * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` - * Storage Object Viewer (roles/storage.objectViewer) - * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) - - 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: - - 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection - 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - - Alternatively, the IAM could also be setup via the gcloud CLI: - - `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. 
- - Args: - input_types (None, type, or sequence(type)): - For scalar user defined function it should be the input type or - sequence of input types. For row processing user defined function, - type `Series` should be specified. - output_type (Optional[type]): - Data type of the output in the user defined function. - session (bigframes.Session, Optional): - BigQuery DataFrames session to use for getting default project, - dataset and BigQuery connection. - bigquery_client (google.cloud.bigquery.Client, Optional): - Client to use for BigQuery operations. If this param is not provided - then bigquery client from the session would be used. - bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): - Client to use for BigQuery connection operations. If this param is - not provided then bigquery connection client from the session would - be used. - cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): - Client to use for cloud functions operations. If this param is not - provided then the functions client from the session would be used. - resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): - Client to use for cloud resource management operations, e.g. for - getting and setting IAM roles on cloud resources. If this param is - not provided then resource manager client from the session would be - used. - dataset (str, Optional.): - Dataset in which to create a BigQuery remote function. It should be in - `.` or `` format. If this - parameter is not provided then session dataset id is used. - bigquery_connection (str, Optional): - Name of the BigQuery connection in the form of `CONNECTION_ID` or - `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. - If this param is not provided then the bigquery connection from the session - would be used. If it is pre created in the same location as the - `bigquery_client.location` then it would be used, otherwise it is created - dynamically using the `bigquery_connection_client` assuming the user has necessary - priviliges. The PROJECT_ID should be the same as the BigQuery connection project. - reuse (bool, Optional): - Reuse the remote function if already exists. - `True` by default, which will result in reusing an existing remote - function and corresponding cloud function that was previously - created (if any) for the same udf. - Please note that for an unnamed (i.e. created without an explicit - `name` argument) remote function, the BigQuery DataFrames - session id is attached in the cloud artifacts names. So for the - effective reuse across the sessions it is recommended to create - the remote function with an explicit `name`. - Setting it to `False` would force creating a unique remote function. - If the required remote function does not exist then it would be - created irrespective of this param. - name (str, Optional): - Explicit name of the persisted BigQuery remote function. Use it with - caution, because two users working in the same project and dataset - could overwrite each other's remote functions if they use the same - persistent name. When an explicit name is provided, any session - specific clean up (``bigframes.session.Session.close``/ - ``bigframes.pandas.close_session``/ - ``bigframes.pandas.reset_session``/ - ``bigframes.pandas.clean_up_by_session_id``) does not clean up - the function, and leaves it for the user to manage the function - and the associated cloud function directly. 
- packages (str[], Optional): - Explicit name of the external package dependencies. Each dependency - is added to the `requirements.txt` as is, and can be of the form - supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. - cloud_function_service_account (str, Optional): - Service account to use for the cloud functions. If not provided then - the default service account would be used. See - https://cloud.google.com/functions/docs/securing/function-identity - for more details. Please make sure the service account has the - necessary IAM permissions configured as described in - https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. - cloud_function_kms_key_name (str, Optional): - Customer managed encryption key to protect cloud functions and - related data at rest. This is of the format - projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. - Read https://cloud.google.com/functions/docs/securing/cmek for - more details including granting necessary service accounts - access to the key. - cloud_function_docker_repository (str, Optional): - Docker repository created with the same encryption key as - `cloud_function_kms_key_name` to store encrypted artifacts - created to support the cloud function. This is of the format - projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. - For more details see - https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. - max_batching_rows (int, Optional): - The maximum number of rows to be batched for processing in the - BQ remote function. Default value is 1000. A lower number can be - passed to avoid timeouts in case the user code is too complex to - process large number of rows fast enough. A higher number can be - used to increase throughput in case the user code is fast enough. - `None` can be passed to let BQ remote functions service apply - default batching. See for more details - https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. - cloud_function_timeout (int, Optional): - The maximum amount of time (in seconds) BigQuery should wait for - the cloud function to return a response. See for more details - https://cloud.google.com/functions/docs/configuring/timeout. - Please note that even though the cloud function (2nd gen) itself - allows seeting up to 60 minutes of timeout, BigQuery remote - function can wait only up to 20 minutes, see for more details - https://cloud.google.com/bigquery/quotas#remote_function_limits. - By default BigQuery DataFrames uses a 10 minute timeout. `None` - can be passed to let the cloud functions default timeout take effect. - cloud_function_max_instances (int, Optional): - The maximumm instance count for the cloud function created. This - can be used to control how many cloud function instances can be - active at max at any given point of time. Lower setting can help - control the spike in the billing. Higher setting can help - support processing larger scale data. When not specified, cloud - function's default setting applies. For more details see - https://cloud.google.com/functions/docs/configuring/max-instances. - cloud_function_vpc_connector (str, Optional): - The VPC connector you would like to configure for your cloud - function. This is useful if your code needs access to data or - service(s) that are on a VPC network. See for more details - https://cloud.google.com/functions/docs/networking/connecting-vpc. 
- cloud_function_memory_mib (int, Optional): - The amounts of memory (in mebibytes) to allocate for the cloud - function (2nd gen) created. This also dictates a corresponding - amount of allocated CPU for the function. By default a memory of - 1024 MiB is set for the cloud functions created to support - BigQuery DataFrames remote function. If you want to let the - default memory of cloud functions be allocated, pass `None`. See - for more details - https://cloud.google.com/functions/docs/configuring/memory. - """ - # Some defaults may be used from the session if not provided otherwise - import bigframes.exceptions as bf_exceptions - import bigframes.pandas as bpd - import bigframes.series as bf_series - import bigframes.session - - session = cast(bigframes.session.Session, session or bpd.get_global_session()) - - # A BigQuery client is required to perform BQ operations - if not bigquery_client: - bigquery_client = session.bqclient - if not bigquery_client: - raise ValueError( - "A bigquery client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # A BigQuery connection client is required to perform BQ connection operations - if not bigquery_connection_client: - bigquery_connection_client = session.bqconnectionclient - if not bigquery_connection_client: - raise ValueError( - "A bigquery connection client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # A cloud functions client is required to perform cloud functions operations - if not cloud_functions_client: - cloud_functions_client = session.cloudfunctionsclient - if not cloud_functions_client: - raise ValueError( - "A cloud functions client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # A resource manager client is required to get/set IAM operations - if not resource_manager_client: - resource_manager_client = session.resourcemanagerclient - if not resource_manager_client: - raise ValueError( - "A resource manager client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # BQ remote function must be persisted, for which we need a dataset - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. - if dataset: - dataset_ref = bigquery.DatasetReference.from_string( - dataset, default_project=bigquery_client.project - ) - else: - dataset_ref = session._anonymous_dataset - - bq_location, cloud_function_region = get_remote_function_locations( - bigquery_client.location - ) - - # A connection is required for BQ remote function - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function - if not bigquery_connection: - bigquery_connection = session._bq_connection # type: ignore - - bigquery_connection = clients.resolve_full_bq_connection_name( - bigquery_connection, - default_project=dataset_ref.project, - default_location=bq_location, - ) - # Guaranteed to be the form of .. - ( - gcp_project_id, - bq_connection_location, - bq_connection_id, - ) = bigquery_connection.split(".") - if gcp_project_id.casefold() != dataset_ref.project.casefold(): - raise ValueError( - "The project_id does not match BigQuery connection gcp_project_id: " - f"{dataset_ref.project}." - ) - if bq_connection_location.casefold() != bq_location.casefold(): - raise ValueError( - "The location does not match BigQuery connection location: " - f"{bq_location}." 
- ) - - # If any CMEK is intended then check that a docker repository is also specified - if ( - cloud_function_kms_key_name is not None - and cloud_function_docker_repository is None - ): - raise ValueError( - "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." - " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" - ) - - bq_connection_manager = session.bqconnectionmanager - - def wrapper(func): - nonlocal input_types, output_type - - if not callable(func): - raise TypeError("f must be callable, got {}".format(func)) - - if sys.version_info >= (3, 10): - # Add `eval_str = True` so that deferred annotations are turned into their - # corresponding type objects. Need Python 3.10 for eval_str parameter. - # https://docs.python.org/3/library/inspect.html#inspect.signature - signature_kwargs: Mapping[str, Any] = {"eval_str": True} - else: - signature_kwargs = {} - - signature = inspect.signature( - func, - **signature_kwargs, - ) - - # Try to get input types via type annotations. - if input_types is None: - input_types = [] - for parameter in signature.parameters.values(): - if (param_type := parameter.annotation) is inspect.Signature.empty: - raise ValueError( - "'input_types' was not set and parameter " - f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function." - ) - input_types.append(param_type) - elif not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - - if output_type is None: - if ( - output_type := signature.return_annotation - ) is inspect.Signature.empty: - raise ValueError( - "'output_type' was not set and function is missing a " - "return type annotation. Types are required to use " - "@remote_function." - ) - - # The function will actually be receiving a pandas Series, but allow both - # BigQuery DataFrames and pandas object types for compatibility. - is_row_processor = False - if len(input_types) == 1 and ( - (input_type := input_types[0]) == bf_series.Series - or input_type == pandas.Series - ): - warnings.warn( - "input_types=Series is in preview.", - stacklevel=1, - category=bf_exceptions.PreviewWarning, - ) - - # we will model the row as a json serialized string containing the data - # and the metadata representing the row - input_types = [str] - is_row_processor = True - elif isinstance(input_types, type): - input_types = [input_types] - - # TODO(b/340898611): fix type error - ibis_signature = ibis_signature_from_python_signature( - signature, input_types, output_type # type: ignore - ) - - remote_function_client = RemoteFunctionClient( - dataset_ref.project, - cloud_function_region, - cloud_functions_client, - bq_location, - dataset_ref.dataset_id, - bigquery_client, - bq_connection_id, - bq_connection_manager, - cloud_function_service_account, - cloud_function_kms_key_name, - cloud_function_docker_repository, - session=session, # type: ignore - ) - - # In the unlikely case where the user is trying to re-deploy the same - # function, cleanup the attributes we add below, first. This prevents - # the pickle from having dependencies that might not otherwise be - # present such as ibis or pandas. 
- def try_delattr(attr): - try: - delattr(func, attr) - except AttributeError: - pass - - try_delattr("bigframes_cloud_function") - try_delattr("bigframes_remote_function") - try_delattr("input_dtypes") - try_delattr("output_dtype") - try_delattr("is_row_processor") - try_delattr("ibis_node") - - ( - rf_name, - cf_name, - created_new, - ) = remote_function_client.provision_bq_remote_function( - func, - input_types=tuple( - third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) - for type_ in ibis_signature.input_types - ), - output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( - ibis_signature.output_type - ), - reuse=reuse, - name=name, - package_requirements=packages, - max_batching_rows=max_batching_rows, - cloud_function_timeout=cloud_function_timeout, - cloud_function_max_instance_count=cloud_function_max_instances, - is_row_processor=is_row_processor, - cloud_function_vpc_connector=cloud_function_vpc_connector, - cloud_function_memory_mib=cloud_function_memory_mib, - ) - - # TODO: Move ibis logic to compiler step - node = ibis.udf.scalar.builtin( - func, - name=rf_name, - schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", - signature=(ibis_signature.input_types, ibis_signature.output_type), - ) - func.bigframes_cloud_function = ( - remote_function_client.get_cloud_function_fully_qualified_name(cf_name) - ) - func.bigframes_remote_function = ( - remote_function_client.get_remote_function_fully_qualilfied_name( - rf_name - ) - ) - func.input_dtypes = tuple( - [ - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - input_type - ) - for input_type in ibis_signature.input_types - ] - ) - func.output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type - ) - ) - func.is_row_processor = is_row_processor - func.ibis_node = node - - # If a new remote function was created, update the cloud artifacts - # created in the session. This would be used to clean up any - # resources in the session. Note that we need to do this only for - # the case where an explicit name was not provided by the user and - # we used an internal name. For the cases where the user provided an - # explicit name, we are assuming that the user wants to persist them - # with that name and would directly manage their lifecycle. 
- if created_new and (not name): - self._update_temp_artifacts( - func.bigframes_remote_function, func.bigframes_cloud_function - ) - return func - - return wrapper - - def remote_function(*args, **kwargs): - remote_function_session = _RemoteFunctionSession() + remote_function_session = rf_session.RemoteFunctionSession() return remote_function_session.remote_function(*args, **kwargs) -remote_function.__doc__ = _RemoteFunctionSession.remote_function.__doc__ +remote_function.__doc__ = rf_session.RemoteFunctionSession.remote_function.__doc__ def read_gbq_function( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 21f75eb82c..08d808572d 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -63,7 +63,7 @@ import bigframes.core.tools import bigframes.dataframe import bigframes.enums -import bigframes.functions.remote_function as bigframes_rf +import bigframes.functions._utils as functions_utils import bigframes.operations as ops import bigframes.series import bigframes.session @@ -817,7 +817,7 @@ def clean_up_by_session_id( session.bqclient, dataset, session_id ) - bigframes_rf._clean_up_by_session_id( + functions_utils._clean_up_by_session_id( session.bqclient, session.cloudfunctionsclient, dataset, session_id ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2da788292b..8cef869a32 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -95,6 +95,7 @@ import bigframes.dtypes import bigframes.exceptions import bigframes.formatting_helpers as formatting_helpers +import bigframes.functions._remote_function_session as bigframes_rf_session import bigframes.functions.remote_function as bigframes_rf import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table @@ -316,7 +317,7 @@ def __init__( ) self._allow_ambiguity = not self._strictly_ordered - self._remote_function_session = bigframes_rf._RemoteFunctionSession() + self._remote_function_session = bigframes_rf_session.RemoteFunctionSession() @property def bqclient(self): diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 095f7059cd..d6eefc1e31 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -31,7 +31,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.exceptions -import bigframes.functions.remote_function as bigframes_rf +import bigframes.functions._utils as functions_utils import bigframes.pandas as bpd import bigframes.series from tests.system.utils import ( @@ -595,9 +595,11 @@ def add_one(x): add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one) # Expected cloud function name for the unique udf - package_requirements = bigframes_rf._get_updated_package_requirements() - add_one_uniq_hash = bigframes_rf._get_hash(add_one_uniq, package_requirements) - add_one_uniq_cf_name = bigframes_rf.get_cloud_function_name( + package_requirements = functions_utils._get_updated_package_requirements() + add_one_uniq_hash = functions_utils._get_hash( + add_one_uniq, package_requirements + ) + add_one_uniq_cf_name = functions_utils.get_cloud_function_name( add_one_uniq_hash, session.session_id ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 8ecf9eb368..db573efa40 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -23,6 +23,7 @@ import 
bigframes import bigframes.dtypes import bigframes.exceptions +from bigframes.functions import _utils as rf_utils from bigframes.functions import remote_function as rf from tests.system.utils import assert_pandas_df_equal @@ -89,12 +90,12 @@ def get_rf_name(func, package_requirements=None, is_row_processor=False): """Get a remote function name for testing given a udf.""" # Augment user package requirements with any internal package # requirements - package_requirements = rf._get_updated_package_requirements( + package_requirements = rf_utils._get_updated_package_requirements( package_requirements, is_row_processor ) # Compute a unique hash representing the user code - function_hash = rf._get_hash(func, package_requirements) + function_hash = rf_utils._get_hash(func, package_requirements) return f"bigframes_{function_hash}" @@ -714,7 +715,7 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} - routine_ref_str = rf.routine_ref_to_string_for_query(routine.reference) + routine_ref_str = rf_utils.routine_ref_to_string_for_query(routine.reference) direct_sql = " UNION ALL ".join( [f"SELECT {x} AS x, {routine_ref_str}({x}) AS y" for x in src["x"]] ) diff --git a/tests/system/utils.py b/tests/system/utils.py index 9fbf191a3a..e9054d04c9 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -26,7 +26,7 @@ import pyarrow as pa # type: ignore import pytest -from bigframes.functions import remote_function +import bigframes.functions._utils as functions_utils import bigframes.pandas ML_REGRESSION_METRICS = [ @@ -340,7 +340,7 @@ def get_cloud_functions( not name or not name_prefix ), "Either 'name' or 'name_prefix' can be passed but not both." - _, location = remote_function.get_remote_function_locations(location) + _, location = functions_utils.get_remote_function_locations(location) parent = f"projects/{project}/locations/{location}" request = functions_v2.ListFunctionsRequest(parent=parent) page_result = functions_client.list_functions(request=request) From 6dff860758bd5de08f0692703f27906e1efbe7e6 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:15:52 -0700 Subject: [PATCH 13/15] chore: update owlbot script to prevent silent failures at s.replace (#889) * chore: update owlbot script to prevent silent failures at s.replace * fix errors * removing s.replace for CONTRIBUTING.rst because it was excluded from templated files --- owlbot.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/owlbot.py b/owlbot.py index f9d9410d6d..b29384d462 100644 --- a/owlbot.py +++ b/owlbot.py @@ -61,7 +61,7 @@ # ---------------------------------------------------------------------------- # Encourage sharring all relevant versions in bug reports. -s.replace( +assert 1 == s.replace( [".github/ISSUE_TEMPLATE/bug_report.md"], re.escape("#### Steps to reproduce\n"), textwrap.dedent( @@ -90,7 +90,7 @@ ) # Make sure build includes all necessary files. -s.replace( +assert 1 == s.replace( ["MANIFEST.in"], re.escape("recursive-include google"), "recursive-include third_party/bigframes_vendored *\nrecursive-include bigframes", @@ -98,7 +98,7 @@ # Even though BigQuery DataFrames isn't technically a client library, we are # opting into Cloud RAD for docs hosting. 
-s.replace( +assert 1 == s.replace( [".kokoro/docs/common.cfg"], re.escape('value: "docs-staging-v2-staging"'), 'value: "docs-staging-v2"', @@ -106,7 +106,7 @@ # Use a custom table of contents since the default one isn't organized well # enough for the number of classes we have. -s.replace( +assert 1 == s.replace( [".kokoro/publish-docs.sh"], ( re.escape("# upload docs") @@ -124,19 +124,12 @@ ) # Fixup the documentation. -s.replace( +assert 1 == s.replace( ["docs/conf.py"], re.escape("Google Cloud Client Libraries for bigframes"), "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine", ) -# Update the contributing guide to reflect some differences in this repo. -s.replace( - ["CONTRIBUTING.rst"], - re.escape("blacken"), - "format", -) - # ---------------------------------------------------------------------------- # Samples templates # ---------------------------------------------------------------------------- From e027b7e9d29f628d058611106014a1790459958c Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 12 Aug 2024 22:24:21 -0700 Subject: [PATCH 14/15] feat: Series.str.__getitem__ (#897) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [X] Make sure to open an issue as internal issue: 358459166 - [X] Ensure the tests and linter pass - [X] Code coverage does not decrease (if any source code was changed) - [X] Appropriate docs were updated (if necessary) Fixes internal issue: 358459166 🦕 --- bigframes/core/compile/scalar_op_compiler.py | 20 ++- bigframes/operations/__init__.py | 34 ++++ bigframes/operations/strings.py | 27 +++ tests/system/small/operations/test_strings.py | 157 +++++++++++++++--- .../pandas/core/strings/accessor.py | 31 ++++ 5 files changed, 241 insertions(+), 28 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 32749b32a6..e70c49e337 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -902,6 +902,24 @@ def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp): return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter) +@scalar_op_compiler.register_unary_op(ops.ArrayIndexOp, pass_op=True) +def array_index_op_impl(x: ibis_types.Value, op: ops.ArrayIndexOp): + res = typing.cast(ibis_types.ArrayValue, x)[op.index] + if x.type().is_string(): + return _null_or_value(res, res != ibis.literal("")) + else: + return res + + +@scalar_op_compiler.register_unary_op(ops.ArraySliceOp, pass_op=True) +def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp): + res = typing.cast(ibis_types.ArrayValue, x)[op.start : op.stop : op.step] + if x.type().is_string(): + return _null_or_value(res, res != ibis.literal("")) + else: + return res + + # JSON Ops @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): @@ -984,7 +1002,7 @@ def ne_op( def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): - return ibis.where( + return ibis.ifelse( where_value, value, ibis.null(), diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 4d4e40643d..fb333d7a53 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -602,6 +602,40 @@ def output_type(self, *input_types): return dtypes.STRING_DTYPE +@dataclasses.dataclass(frozen=True) +class ArrayIndexOp(UnaryOp): 
+ name: typing.ClassVar[str] = "array_index" + index: int + + def output_type(self, *input_types): + input_type = input_types[0] + if dtypes.is_string_like(input_type): + return dtypes.STRING_DTYPE + elif dtypes.is_array_like(input_type): + return dtypes.arrow_dtype_to_bigframes_dtype( + input_type.pyarrow_dtype.value_type + ) + else: + raise TypeError("Input type must be an array or string-like type.") + + +@dataclasses.dataclass(frozen=True) +class ArraySliceOp(UnaryOp): + name: typing.ClassVar[str] = "array_slice" + start: int + stop: typing.Optional[int] = None + step: typing.Optional[int] = None + + def output_type(self, *input_types): + input_type = input_types[0] + if dtypes.is_string_like(input_type): + return dtypes.STRING_DTYPE + elif dtypes.is_array_like(input_type): + return input_type + else: + raise TypeError("Input type must be an array or string-like type.") + + ## JSON Ops @dataclasses.dataclass(frozen=True) class JSONExtract(UnaryOp): diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 22c325d7e0..d3e9c7edc6 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -38,6 +38,33 @@ class StringMethods(bigframes.operations.base.SeriesMethods, vendorstr.StringMethods): __doc__ = vendorstr.StringMethods.__doc__ + def __getitem__(self, key: Union[int, slice]) -> series.Series: + if isinstance(key, int): + if key < 0: + raise NotImplementedError("Negative indexing is not supported.") + return self._apply_unary_op(ops.ArrayIndexOp(index=key)) + elif isinstance(key, slice): + if key.step is not None and key.step != 1: + raise NotImplementedError( + f"Only a step of 1 is allowed, got {key.step}" + ) + if (key.start is not None and key.start < 0) or ( + key.stop is not None and key.stop < 0 + ): + raise NotImplementedError( + "Slicing with negative numbers is not allowed." 
+ ) + + return self._apply_unary_op( + ops.ArraySliceOp( + start=key.start if key.start is not None else 0, + stop=key.stop, + step=key.step, + ) + ) + else: + raise ValueError(f"key must be an int or slice, got {type(key).__name__}") + def find( self, sub: str, diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index b8a8ad2d1e..3191adf920 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -14,10 +14,13 @@ import re +import packaging.version import pandas as pd +import pyarrow as pa import pytest -import bigframes.series +import bigframes.dtypes as dtypes +import bigframes.pandas as bpd from ...utils import assert_series_equal @@ -25,7 +28,7 @@ def test_find(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.find("W").to_pandas() pd_result = scalars_pandas_df[col_name].str.find("W") @@ -50,7 +53,7 @@ def test_find(scalars_dfs): def test_str_contains(scalars_dfs, pat, case, flags, regex): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.contains( pat, case=case, flags=flags, regex=regex @@ -72,7 +75,7 @@ def test_str_contains(scalars_dfs, pat, case, flags, regex): def test_str_extract(scalars_dfs, pat): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.extract(pat).to_pandas() pd_result = scalars_pandas_df[col_name].str.extract(pat) @@ -101,7 +104,7 @@ def test_str_extract(scalars_dfs, pat): def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.replace( pat, repl=repl, case=case, flags=flags, regex=regex @@ -132,7 +135,7 @@ def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): def test_str_startswith(scalars_dfs, pat): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name].astype("object") bf_result = bf_series.str.startswith(pat).to_pandas() @@ -157,7 +160,7 @@ def test_str_startswith(scalars_dfs, pat): def test_str_endswith(scalars_dfs, pat): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name].astype("object") bf_result = bf_series.str.endswith(pat).to_pandas() @@ -169,7 +172,7 @@ def test_str_endswith(scalars_dfs, pat): def test_len(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.len().to_pandas() pd_result = scalars_pandas_df[col_name].str.len() @@ -188,7 +191,7 @@ def test_len_with_array_column(nested_df, nested_pandas_df): See: https://stackoverflow.com/a/41340543/101923 """ col_name = "event_sequence" - 
bf_series: bigframes.series.Series = nested_df[col_name] + bf_series: bpd.Series = nested_df[col_name] bf_result = bf_series.str.len().to_pandas() pd_result = nested_pandas_df[col_name].str.len() @@ -204,7 +207,7 @@ def test_len_with_array_column(nested_df, nested_pandas_df): def test_lower(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.lower().to_pandas() pd_result = scalars_pandas_df[col_name].str.lower() @@ -217,7 +220,7 @@ def test_lower(scalars_dfs): def test_reverse(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.reverse().to_pandas() pd_result = scalars_pandas_df[col_name].copy() for i in pd_result.index: @@ -239,7 +242,7 @@ def test_reverse(scalars_dfs): def test_slice(scalars_dfs, start, stop): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.slice(start, stop).to_pandas() pd_series = scalars_pandas_df[col_name] pd_result = pd_series.str.slice(start, stop) @@ -253,7 +256,7 @@ def test_slice(scalars_dfs, start, stop): def test_strip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.strip().to_pandas() pd_result = scalars_pandas_df[col_name].str.strip() @@ -266,7 +269,7 @@ def test_strip(scalars_dfs): def test_upper(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.upper().to_pandas() pd_result = scalars_pandas_df[col_name].str.upper() @@ -375,7 +378,7 @@ def test_isupper(weird_strings, weird_strings_pd): def test_rstrip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.rstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.rstrip() @@ -388,7 +391,7 @@ def test_rstrip(scalars_dfs): def test_lstrip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.lstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.lstrip() @@ -402,7 +405,7 @@ def test_lstrip(scalars_dfs): def test_repeat(scalars_dfs, repeats): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.repeat(repeats).to_pandas() pd_result = scalars_pandas_df[col_name].str.repeat(repeats) @@ -415,7 +418,7 @@ def test_repeat(scalars_dfs, repeats): def test_capitalize(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.capitalize().to_pandas() pd_result = 
scalars_pandas_df[col_name].str.capitalize() @@ -428,9 +431,9 @@ def test_capitalize(scalars_dfs): def test_cat_with_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_filter: bigframes.series.Series = scalars_df["bool_col"] - bf_left: bigframes.series.Series = scalars_df[col_name][bf_filter] - bf_right: bigframes.series.Series = scalars_df[col_name] + bf_filter: bpd.Series = scalars_df["bool_col"] + bf_left: bpd.Series = scalars_df[col_name][bf_filter] + bf_right: bpd.Series = scalars_df[col_name] bf_result = bf_left.str.cat(others=bf_right).to_pandas() pd_filter = scalars_pandas_df["bool_col"] pd_left = scalars_pandas_df[col_name][pd_filter] @@ -447,7 +450,7 @@ def test_str_match(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" pattern = "[A-Z].*" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.match(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.match(pattern) @@ -461,7 +464,7 @@ def test_str_fullmatch(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" pattern = "[A-Z].*!" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.fullmatch(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.fullmatch(pattern) @@ -474,7 +477,7 @@ def test_str_fullmatch(scalars_dfs): def test_str_get(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.get(8).to_pandas() pd_result = scalars_pandas_df[col_name].str.get(8) @@ -487,7 +490,7 @@ def test_str_get(scalars_dfs): def test_str_pad(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.pad(8, side="both", fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.pad(8, side="both", fillchar="%") @@ -510,7 +513,7 @@ def test_str_zfill(weird_strings, weird_strings_pd): def test_str_ljust(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.ljust(7, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.ljust(7, fillchar="%") @@ -523,7 +526,7 @@ def test_str_ljust(scalars_dfs): def test_str_rjust(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] + bf_series: bpd.Series = scalars_df[col_name] bf_result = bf_series.str.rjust(9, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.rjust(9, fillchar="%") @@ -562,3 +565,103 @@ def test_str_split_raise_errors(scalars_dfs, pat, regex): pd_result = pd_result.apply(lambda x: [] if pd.isnull(x) is True else x) assert_series_equal(pd_result, bf_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("index"), + [ + pytest.param( + "first", id="invalid_type", marks=pytest.mark.xfail(raises=ValueError) + ), + pytest.param( + -1, id="neg_index", marks=pytest.mark.xfail(raises=NotImplementedError) + ), + pytest.param( + slice(0, 2, 2), + id="only_allow_one_step", + 
marks=pytest.mark.xfail(raises=NotImplementedError), + ), + pytest.param( + slice(-1, None, None), + id="neg_slicing", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + ], +) +def test_getitem_raise_errors(scalars_dfs, index): + scalars_df, _ = scalars_dfs + col_name = "string_col" + scalars_df[col_name].str[index] + + +@pytest.mark.parametrize( + ("index"), + [ + pytest.param(2, id="int"), + pytest.param(slice(None, None, None), id="default_start_slice"), + pytest.param(slice(0, None, 1), id="default_stop_slice"), + pytest.param(slice(0, 2, None), id="default_step_slice"), + ], +) +def test_getitem_w_string(scalars_dfs, index): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].str[index].to_pandas() + pd_result = scalars_pandas_df[col_name].str[index] + + assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("index"), + [ + pytest.param(2, id="int"), + pytest.param(slice(None, None, None), id="default_start_slice"), + pytest.param(slice(0, None, 1), id="default_stop_slice"), + pytest.param(slice(0, 2, None), id="default_step_slice"), + pytest.param(slice(0, 0, None), id="single_one_slice"), + ], +) +def test_getitem_w_array(index): + data = [[1], [2, 3], [], [4, 5, 6]] + s = bpd.Series(data) + pd_s = pd.Series(data) + + bf_result = s.str[index].to_pandas() + pd_result = pd_s.str[index] + # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`. + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +def test_getitem_w_struct_array(): + if packaging.version.Version(pd.__version__) <= packaging.version.Version("1.5.0"): + pytest.skip("https://github.com/googleapis/python-bigquery/issues/1992") + + pa_struct = pa.struct( + [ + ("name", pa.string()), + ("age", pa.int64()), + ] + ) + data: list[list[dict]] = [ + [ + {"name": "Alice", "age": 30}, + {"name": "Bob", "age": 25}, + ], + [ + {"name": "Charlie", "age": 35}, + {"name": "David", "age": 40}, + {"name": "Eva", "age": 28}, + ], + [], + [{"name": "Frank", "age": 50}], + ] + s = bpd.Series(data, dtype=bpd.ArrowDtype(pa.list_(pa_struct))) + + result = s.str[1] + assert dtypes.is_struct_like(result.dtype) + + expected_data = [item[1] if len(item) > 1 else None for item in data] + expected = bpd.Series(expected_data, dtype=bpd.ArrowDtype((pa_struct))) + + assert_series_equal(result.to_pandas(), expected.to_pandas()) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index b02c23f945..bd5e78f415 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -13,6 +13,37 @@ class StringMethods: R's stringr package. """ + def __getitem__(self, key: typing.Union[int, slice]): + """ + Index or slice string or list in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Alice', 'Bob', 'Charlie']) + >>> s.str[0] + 0 A + 1 B + 2 C + dtype: string + + >>> s.str[0:3] + 0 Ali + 1 Bob + 2 Cha + dtype: string + + Args: + key (int | slice): + Index or slice of indices to access from each string or list. + + Returns: + bigframes.series.Series: The list at requested index. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def extract(self, pat: str, flags: int = 0): """ Extract capture groups in the regex `pat` as columns in a DataFrame. From ae07274ea3b49f0350da77c3f8fdb44e4cda6778 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 19:21:06 -0700 Subject: [PATCH 15/15] chore(main): release 1.14.0 (#882) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3209391f44..754658c5e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,32 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.14.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.13.0...v1.14.0) (2024-08-14) + + +### Features + +* Implement `bigframes.bigquery.json_extract` ([#868](https://github.com/googleapis/python-bigquery-dataframes/issues/868)) ([3dbf84b](https://github.com/googleapis/python-bigquery-dataframes/commit/3dbf84bd1531c1f8d41ba57c2c38b3ba6abfb812)) +* Implement `Series.str.__getitem__` ([#897](https://github.com/googleapis/python-bigquery-dataframes/issues/897)) ([e027b7e](https://github.com/googleapis/python-bigquery-dataframes/commit/e027b7e9d29f628d058611106014a1790459958c)) + + +### Bug Fixes + +* Fix caching from generating row numbers in partial ordering mode ([#872](https://github.com/googleapis/python-bigquery-dataframes/issues/872)) ([52b7786](https://github.com/googleapis/python-bigquery-dataframes/commit/52b7786c3a28da6c29e3ddf12629802215194ad9)) + + +### Performance Improvements + +* Generate SQL with fewer CTEs ([#877](https://github.com/googleapis/python-bigquery-dataframes/issues/877)) ([eb60804](https://github.com/googleapis/python-bigquery-dataframes/commit/eb6080460344aff2fabb7864536ea4fe24c5fbef)) +* Speed up compilation by reducing redundant type normalization ([#896](https://github.com/googleapis/python-bigquery-dataframes/issues/896)) ([e0b11bc](https://github.com/googleapis/python-bigquery-dataframes/commit/e0b11bc8c038db7b950b1653ed4cd44a6246c713)) + + +### Documentation + +* Add streaming html docs ([#884](https://github.com/googleapis/python-bigquery-dataframes/issues/884)) ([171da6c](https://github.com/googleapis/python-bigquery-dataframes/commit/171da6cb33165b49d46ea6528038342abd89e9fa)) +* Fix the `DisplayOptions` doc rendering ([#893](https://github.com/googleapis/python-bigquery-dataframes/issues/893)) ([3eb6a17](https://github.com/googleapis/python-bigquery-dataframes/commit/3eb6a17a5823faf5ecba92cb9a554df74477871d)) +* Update streaming notebook ([#887](https://github.com/googleapis/python-bigquery-dataframes/issues/887)) ([6e6f9df](https://github.com/googleapis/python-bigquery-dataframes/commit/6e6f9df55d435afe0b3ade728ca06826e92a6ee6)) + ## [1.13.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.12.0...v1.13.0) (2024-08-05) diff --git a/bigframes/version.py b/bigframes/version.py index b474f021d4..2e135689ed 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.13.0" +__version__ = "1.14.0" pFad - Phonifier reborn
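For readers who want to try one of the new features in this series, below is a minimal usage sketch of the `Series.str.__getitem__` support added in [PATCH 14/15], adapted from that patch's docstring and system tests. It is a sketch rather than part of the patches themselves: it assumes bigframes >= 1.14.0 (per the version bump above) and an authenticated Google Cloud project backing the BigQuery session, and the variable names are illustrative only.

import bigframes.pandas as bpd

bpd.options.display.progress_bar = None

# Index and slice string elements, as shown in the accessor docstring.
names = bpd.Series(["Alice", "Bob", "Charlie"])
first_letters = names.str[0]   # 'A', 'B', 'C'
prefixes = names.str[0:3]      # 'Ali', 'Bob', 'Cha'

# The same accessor indexes into list (array) columns; out-of-range
# positions come back as nulls, mirroring test_getitem_w_array above.
events = bpd.Series([[1], [2, 3], [], [4, 5, 6]])
first_events = events.str[0]
leading_pairs = events.str[0:2]

# Negative indices, negative slice bounds, and slice steps other than 1
# raise NotImplementedError, per the checks in bigframes/operations/strings.py.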
