From 2c1cd45707f539171a764a907f39ab692b25ed0c Mon Sep 17 00:00:00 2001 From: Stephanie A <129541811+DevStephanie@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:31:06 -0500 Subject: [PATCH 01/23] chore: add BigQuery locations to bigframes/constants.py (#578) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: add BigQuery locations to bigframes/constants.py * Apply suggestions from code review --------- Co-authored-by: Tim Sweña (Swast) --- bigframes/constants.py | 65 ++++++++++++++++++++++++++ tests/config.py | 72 ----------------------------- tests/system/large/test_location.py | 14 ++++-- 3 files changed, 75 insertions(+), 76 deletions(-) delete mode 100644 tests/config.py diff --git a/bigframes/constants.py b/bigframes/constants.py index a1ffd2b755..0751501085 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -27,3 +27,68 @@ ABSTRACT_METHOD_ERROR_MESSAGE = f"Abstract method. You have likely encountered a bug. Please share this stacktrace and how you reached it with the BigQuery DataFrames team. 
{FEEDBACK_LINK}" DEFAULT_EXPIRATION = datetime.timedelta(days=7) + +# https://cloud.google.com/bigquery/docs/locations +ALL_BIGQUERY_LOCATIONS = frozenset( + { + "us-east5", + "us-south1", + "us-central1", + "us-west4", + "us-west2", + "northamerica-northeast1", + "us-east4", + "us-west1", + "us-west3", + "southamerica-east1", + "southamerica-west1", + "us-east1", + "northamerica-northeast2", + "asia-south2", + "asia-east2", + "asia-southeast2", + "australia-southeast2", + "asia-south1", + "asia-northeast2", + "asia-northeast3", + "asia-southeast1", + "australia-southeast1", + "asia-east1", + "asia-northeast1", + "europe-west1", + "europe-west10", + "europe-north1", + "europe-west3", + "europe-west2", + "europe-southwest1", + "europe-west8", + "europe-west4", + "europe-west9", + "europe-west12", + "europe-central2", + "europe-west6", + "me-central2", + "me-central1", + "me-west1", + "me-central2", + "me-central1", + "me-west1", + "africa-south1", + } +) + +# https://cloud.google.com/storage/docs/regional-endpoints +REP_ENABLED_BIGQUERY_LOCATIONS = frozenset( + { + "me-central2", + "europe-west9", + "europe-west3", + "us-east4", + "us-west1", + } +) + +# https://cloud.google.com/storage/docs/locational-endpoints +LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset( + ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS +) diff --git a/tests/config.py b/tests/config.py deleted file mode 100644 index a885d7e71d..0000000000 --- a/tests/config.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# https://cloud.google.com/bigquery/docs/locations -ALL_BIGQUERY_LOCATIONS = [ - "us-east5", - "us-south1", - "us-central1", - "us-west4", - "us-west2", - "northamerica-northeast1", - "us-east4", - "us-west1", - "us-west3", - "southamerica-east1", - "southamerica-west1", - "us-east1", - "northamerica-northeast2", - "asia-south2", - "asia-east2", - "asia-southeast2", - "australia-southeast2", - "asia-south1", - "asia-northeast2", - "asia-northeast3", - "asia-southeast1", - "australia-southeast1", - "asia-east1", - "asia-northeast1", - "europe-west1", - "europe-west10", - "europe-north1", - "europe-west3", - "europe-west2", - "europe-southwest1", - "europe-west8", - "europe-west4", - "europe-west9", - "europe-west12", - "europe-central2", - "europe-west6", - "me-central2", - "me-central1", - "me-west1", - "me-central2", - "me-central1", - "me-west1", - "africa-south1", -] - -REP_ENABLED_BIGQUERY_LOCATIONS = [ - "me-central2", - "europe-west9", - "europe-west3", - "us-east4", - "us-west1", -] - -LEP_ENABLED_BIGQUERY_LOCATIONS = sorted( - set(ALL_BIGQUERY_LOCATIONS) - set(REP_ENABLED_BIGQUERY_LOCATIONS) -) diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index a4cf8919a0..204c6b7463 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -18,8 +18,8 @@ import pytest import bigframes +import bigframes.constants import bigframes.session.clients -from tests import config def _assert_bq_execution_location(session: bigframes.Session): @@ -66,7 +66,11 @@ def test_bq_location_default(): _assert_bq_execution_location(session) -@pytest.mark.parametrize("bigquery_location", config.ALL_BIGQUERY_LOCATIONS) +@pytest.mark.parametrize( + "bigquery_location", + # Sort the set to avoid nondeterminism. 
+ sorted(bigframes.constants.ALL_BIGQUERY_LOCATIONS), +) def test_bq_location(bigquery_location): session = bigframes.Session( context=bigframes.BigQueryOptions(location=bigquery_location) @@ -85,7 +89,8 @@ def test_bq_location(bigquery_location): @pytest.mark.parametrize( "bigquery_location", - config.REP_ENABLED_BIGQUERY_LOCATIONS, + # Sort the set to avoid nondeterminism. + sorted(bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS), ) def test_bq_rep_endpoints(bigquery_location): session = bigframes.Session( @@ -108,7 +113,8 @@ def test_bq_rep_endpoints(bigquery_location): @pytest.mark.parametrize( "bigquery_location", - config.LEP_ENABLED_BIGQUERY_LOCATIONS, + # Sort the set to avoid nondeterminism. + sorted(bigframes.constants.LEP_ENABLED_BIGQUERY_LOCATIONS), ) def test_bq_lep_endpoints(bigquery_location): # We are not testing BigFrames Session for LEP endpoints because it involves From c8da22ff951dd9ef65948bd56dbb7970d80d3018 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:14:16 -0700 Subject: [PATCH 02/23] chore: pin noxfile to the earlier plugin version (#583) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …e docs generation Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 4ac3a81723..6fa7dbead1 100644 --- a/noxfile.py +++ b/noxfile.py @@ -505,7 +505,7 @@ def docfx(session): SPHINX_VERSION, "alabaster", "recommonmark", - "gcp-sphinx-docfx-yaml", + "gcp-sphinx-docfx-yaml==3.0.1", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) From 57e1cca42190f603eb248916e241f6acf6b9b549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 5 Apr 2024 15:15:01 -0500 Subject: [PATCH 03/23] chore: add context and address some TODOs in noxfile (#572) * chore: add context and address some TODOs in noxfile * fix lint * remove dead code * mark test_query_job_dry_run as flaky * remove more dead code * escape [ * remove failing test * missing NO COVER --- noxfile.py | 11 ++-- .../system/small/operations/test_plotting.py | 11 ++-- tests/system/small/test_dataframe.py | 10 ---- tests/system/small/test_dataframe_io.py | 43 +++++--------- tests/system/small/test_encryption.py | 36 ++++++------ tests/system/small/test_multiindex.py | 19 ------- tests/system/small/test_pandas.py | 4 +- tests/system/small/test_progress_bar.py | 10 ---- tests/system/small/test_remote_function.py | 30 ++++++---- tests/system/small/test_series.py | 56 ------------------- tests/system/small/test_session.py | 10 +--- .../bigframes_vendored/cpython/_pprint.py | 1 + .../bigframes_vendored/pandas/core/generic.py | 21 ++++--- 13 files changed, 81 insertions(+), 181 deletions(-) diff --git a/noxfile.py b/noxfile.py index 6fa7dbead1..fa9c0a57d8 100644 --- a/noxfile.py +++ b/noxfile.py @@ -112,8 +112,7 @@ def lint(session): "--check", *LINT_PATHS, ) - # TODO(tswast): lint all 
LINT_PATHS - session.run("flake8", "bigframes", "tests") + session.run("flake8", *LINT_PATHS) @nox.session(python=DEFAULT_PYTHON_VERSION) @@ -411,8 +410,8 @@ def samples(session): CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) - # TODO(swast): Use `requirements.txt` files from the samples directories to - # test samples. + # TODO(b/332735129): Remove this session and use python_samples templates + # where each samples directory has its own noxfile.py file, instead. install_test_extra = True install_systemtest_dependencies(session, install_test_extra, "-c", constraints_path) @@ -434,12 +433,12 @@ def cover(session): session.run("coverage", "report", "--show-missing", "--fail-under=90") # Make sure there is no dead code in our test directories. - # TODO(swast): Cleanup dead code in the system tests directory. session.run( "coverage", "report", "--show-missing", "--include=tests/unit/*", + "--include=tests/system/small/*", "--fail-under=100", ) @@ -714,7 +713,7 @@ def notebook(session: nox.Session): "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb", # Needs DATASET. "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb", # Needs DATASET_ID. "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", # Needs CONNECTION. - # TODO(swast): investigate why we get 404 errors, even though + # TODO(b/332737009): investigate why we get 404 errors, even though # bq_dataframes_llm_code_generation creates a bucket in the sample. "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", # Needs BUCKET_URI. 
"notebooks/generative_ai/sentiment_analysis.ipynb", # Too slow diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index 6542ce6de3..faf7cb7e6b 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -27,13 +27,10 @@ def _check_legend_labels(ax, labels): """ assert ax.get_legend() is not None texts = ax.get_legend().get_texts() - if not isinstance(texts, list): - assert texts.get_text() == labels - else: - actual_labels = [t.get_text() for t in texts] - assert len(actual_labels) == len(labels) - for label, e in zip(actual_labels, labels): - assert label == e + actual_labels = [t.get_text() for t in texts] + assert len(actual_labels) == len(labels) + for label, e in zip(actual_labels, labels): + assert label == e def test_series_hist_bins(scalars_dfs): diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 5d6a859c11..0811defbc1 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -524,13 +524,6 @@ def test_repr_w_all_rows(scalars_dfs): scalars_df = scalars_df.drop(columns=["numeric_col"]) scalars_pandas_df = scalars_pandas_df.drop(columns=["numeric_col"]) - if scalars_pandas_df.index.name is None: - # Note: Not quite the same as no index / default index, but hopefully - # simulates it well enough while being consistent enough for string - # comparison to work. - scalars_df = scalars_df.set_index("rowindex", drop=False).sort_index() - scalars_df.index.name = None - # When there are 10 or fewer rows, the outputs should be identical. 
actual = repr(scalars_df.head(10)) @@ -3956,9 +3949,6 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): ("bottom", "dense", False, False), ], ) -@pytest.mark.skipif( - True, reason="Blocked by possible pandas rank() regression (b/283278923)" -) def test_df_rank_with_nulls( scalars_df_index, scalars_pandas_df_index, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 10d7408790..f26902f084 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -23,7 +23,8 @@ try: import pandas_gbq # type: ignore -except ImportError: +except ImportError: # pragma: NO COVER + # TODO(b/332758806): Run system tests without "extras" pandas_gbq = None import typing @@ -129,12 +130,9 @@ def test_to_csv_index( """Test the `to_csv` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs index_col = None - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_csv_index_{index}*.csv" - if index: - index_col = typing.cast(str, scalars_df.index.name) - else: - path = gcs_folder + f"test_default_index_df_to_csv_index_{index}*.csv" + path = gcs_folder + f"test_index_df_to_csv_index_{index}*.csv" + if index: + index_col = typing.cast(str, scalars_df.index.name) # TODO(swast): Support "date_format" parameter and make sure our # DATETIME/TIMESTAMP column export is the same format as pandas by default. 
@@ -386,11 +384,8 @@ def test_to_json_index_invalid_orient( gcs_folder: str, index: bool, ): - scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" - else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}*.jsonl" + scalars_df, _ = scalars_dfs + path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" with pytest.raises(ValueError): scalars_df.to_json(path, index=index, lines=True) @@ -404,11 +399,8 @@ def test_to_json_index_invalid_lines( gcs_folder: str, index: bool, ): - scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}.jsonl" - else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}.jsonl" + scalars_df, _ = scalars_dfs + path = gcs_folder + f"test_index_df_to_json_index_{index}.jsonl" with pytest.raises(NotImplementedError): scalars_df.to_json(path, index=index) @@ -422,14 +414,13 @@ def test_to_json_index_records_orient( gcs_folder: str, index: bool, ): - """Test the `to_json` API with the `index` parameter.""" + """Test the `to_json` API with the `index` parameter. + + Uses the scalable options orient='records' and lines=True. 
+ """ scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" - else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}*.jsonl" + path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" - """ Test the `to_json` API with `orient` is `records` and `lines` is True""" scalars_df.to_json(path, index=index, orient="records", lines=True) gcs_df = pd.read_json( @@ -460,11 +451,7 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index): """Test the `to_parquet` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs scalars_pandas_df = scalars_pandas_df.copy() - - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_parquet_{index}*.parquet" - else: - path = gcs_folder + f"test_default_index_df_to_parquet_{index}*.parquet" + path = gcs_folder + f"test_index_df_to_parquet_{index}*.parquet" # TODO(b/268693993): Type GEOGRAPHY is not currently supported for parquet. 
scalars_df = scalars_df.drop(columns="geography_col") diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 3389e5cd68..eae667dc9d 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -64,8 +64,8 @@ def _assert_bq_table_is_encrypted( def test_session_query_job(bq_cmek, session_with_bq_cmek): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER _, query_job = session_with_bq_cmek._start_query( "SELECT 123", job_config=bigquery.QueryJobConfig(use_query_cache=False) @@ -82,8 +82,8 @@ def test_session_query_job(bq_cmek, session_with_bq_cmek): def test_session_load_job(bq_cmek, session_with_bq_cmek): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Session should have cmek set in the default query and load job configs load_table = bigframes.session._io.bigquery.random_table( @@ -114,8 +114,8 @@ def test_session_load_job(bq_cmek, session_with_bq_cmek): def test_read_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read the BQ table df = session_with_bq_cmek.read_gbq(scalars_table_id) @@ -125,8 +125,8 @@ def test_read_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): def test_df_apis(bq_cmek, session_with_bq_cmek, scalars_table_id): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read a BQ table and assert encryption df = session_with_bq_cmek.read_gbq(scalars_table_id) @@ -152,8 +152,8 @@ def test_df_apis(bq_cmek, session_with_bq_cmek, scalars_table_id): def test_read_csv_gcs( bq_cmek, 
session_with_bq_cmek, scalars_df_index, gcs_folder, engine ): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Create a csv in gcs write_path = gcs_folder + "test_read_csv_gcs_bigquery_engine*.csv" @@ -170,8 +170,8 @@ def test_read_csv_gcs( def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read a BQ table and assert encryption df = session_with_bq_cmek.read_gbq(scalars_table_id) @@ -205,8 +205,8 @@ def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): def test_read_pandas(bq_cmek, session_with_bq_cmek): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read a pandas dataframe df = session_with_bq_cmek.read_pandas(pandas.DataFrame([1])) @@ -216,8 +216,8 @@ def test_read_pandas(bq_cmek, session_with_bq_cmek): def test_read_pandas_large(bq_cmek, session_with_bq_cmek): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read a pandas dataframe large enough to trigger a BQ load job df = session_with_bq_cmek.read_pandas(pandas.DataFrame(range(10_000))) @@ -227,8 +227,8 @@ def test_read_pandas_large(bq_cmek, session_with_bq_cmek): def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER model = bigframes.ml.linear_model.LinearRegression() df = session_with_bq_cmek.read_gbq(penguins_table_id).dropna() diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 
6aca7628cf..330fe44eb8 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -882,25 +882,6 @@ def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) -@pytest.mark.skip(reason="Pandas fails in newer versions.") -def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index): - columns = ["int64_too", "int64_col", "rowindex_2"] - level1 = pandas.Index(["b", pandas.NA, pandas.NA]) - # Need resulting column to be pyarrow string rather than object dtype - level2 = pandas.Index([pandas.NA, "b", "b"], dtype="string[pyarrow]") - multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) - bf_df = scalars_df_index[columns].copy() - bf_df.columns = multi_columns - pd_df = scalars_pandas_df_index[columns].copy() - pd_df.columns = multi_columns - - bf_result = bf_df.stack().to_pandas() - pd_result = pd_df.stack() - - # Pandas produces NaN, where bq dataframes produces pd.NA - pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "float64_col", "int64_col"] multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2])) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a080a969c8..95b34a56c5 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -136,7 +136,7 @@ def test_get_dummies_series(scalars_dfs): # adjust for expected dtype differences for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): - if type_name == "bool": + if type_name == "bool": # pragma: NO COVER pd_result[column_name] = pd_result[column_name].astype("boolean") pd_result.columns = pd_result.columns.astype(object) @@ -157,7 +157,7 @@ def test_get_dummies_series_nameless(scalars_dfs): # adjust for expected dtype 
differences for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): - if type_name == "bool": + if type_name == "bool": # pragma: NO COVER pd_result[column_name] = pd_result[column_name].astype("boolean") pd_result.columns = pd_result.columns.astype(object) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index ea139b9802..5ccc6db0ac 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -126,13 +126,3 @@ def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): ] for string in string_checks: assert string in query_job_repr - - -def test_query_job_dry_run(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - with bf.option_context("display.repr_mode", "deferred"): - repr(penguins_df_default_index) - repr(penguins_df_default_index["body_mass_g"]) - lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) - for line in lines: - assert "Computation deferred. Computation will process" in line diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index e7e434dbd0..106638cef3 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -105,7 +105,8 @@ def test_remote_function_direct_no_session_param( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER assert square.bigframes_remote_function assert square.bigframes_cloud_function @@ -157,7 +158,8 @@ def test_remote_function_direct_no_session_param_location_specified( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. 
+ return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -207,7 +209,8 @@ def test_remote_function_direct_no_session_param_location_mismatched( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER @pytest.mark.flaky(retries=2, delay=120) @@ -233,7 +236,8 @@ def test_remote_function_direct_no_session_param_location_project_specified( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -283,7 +287,8 @@ def test_remote_function_direct_no_session_param_project_mismatched( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER @pytest.mark.flaky(retries=2, delay=120) @@ -294,7 +299,8 @@ def test_remote_function_direct_session_param(session_with_bq_connection, scalar session=session_with_bq_connection, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -331,7 +337,8 @@ def test_remote_function_via_session_default(session_with_bq_connection, scalars # cloud function would be common and quickly reused. @session_with_bq_connection.remote_function([int], int) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -370,7 +377,8 @@ def test_remote_function_via_session_with_overrides( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. 
+ return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -497,7 +505,8 @@ def test_skip_bq_connection_check(dataset_id_permanent): @session.remote_function([int], int, dataset=dataset_id_permanent) def add_one(x): - return x + 1 + # This executes on a remote function, where coverage isn't tracked. + return x + 1 # pragma: NO COVER @pytest.mark.flaky(retries=2, delay=120) @@ -534,7 +543,8 @@ def test_read_gbq_function_like_original( reuse=True, ) def square1(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER square2 = rf.read_gbq_function( function_name=square1.bigframes_remote_function, diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index e350286940..539b45eb6a 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1276,8 +1276,6 @@ def test_numeric_literal(scalars_dfs): def test_repr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_pandas_df.index.name != "rowindex": - pytest.skip("Require index & ordering for consistent repr.") col_name = "int64_col" bf_series = scalars_df[col_name] @@ -1405,8 +1403,6 @@ def test_groupby_level_sum(scalars_dfs): # TODO(tbergeron): Use a non-unique index once that becomes possible in tests scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - if scalars_pandas_df.index.name != "rowindex": - pytest.skip("Require index for groupby level.") bf_series = scalars_df[col_name].groupby(level=0).sum() pd_series = scalars_pandas_df[col_name].groupby(level=0).sum() @@ -1421,8 +1417,6 @@ def test_groupby_level_list_sum(scalars_dfs): # TODO(tbergeron): Use a non-unique index once that becomes possible in tests scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - if scalars_pandas_df.index.name != "rowindex": - pytest.skip("Require index for groupby level.") bf_series = scalars_df[col_name].groupby(level=["rowindex"]).sum() 
pd_series = scalars_pandas_df[col_name].groupby(level=["rowindex"]).sum() @@ -1710,9 +1704,6 @@ def test_dtypes(scalars_dfs): def test_head(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is None: - pytest.skip("Require explicit index for offset ops.") - bf_result = scalars_df["string_col"].head(2).to_pandas() pd_result = scalars_pandas_df["string_col"].head(2) @@ -1725,9 +1716,6 @@ def test_head(scalars_dfs): def test_tail(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is None: - pytest.skip("Require explicit index for offset ops.") - bf_result = scalars_df["string_col"].tail(2).to_pandas() pd_result = scalars_pandas_df["string_col"].tail(2) @@ -1740,9 +1728,6 @@ def test_tail(scalars_dfs): def test_head_then_scalar_operation(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is None: - pytest.skip("Require explicit index for offset ops.") - bf_result = (scalars_df["float64_col"].head(1) + 4).to_pandas() pd_result = scalars_pandas_df["float64_col"].head(1) + 4 @@ -1755,9 +1740,6 @@ def test_head_then_scalar_operation(scalars_dfs): def test_head_then_series_operation(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is None: - pytest.skip("Require explicit index for offset ops.") - bf_result = ( scalars_df["float64_col"].head(4) + scalars_df["float64_col"].head(2) ).to_pandas() @@ -1841,44 +1823,6 @@ def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): ) -@pytest.mark.parametrize( - ("na_option",), - [ - ("keep",), - ("top",), - ("bottom",), - ], -) -@pytest.mark.parametrize( - ("method",), - [ - ("average",), - ("min",), - ("max",), - ("first",), - ("dense",), - ], -) -@pytest.mark.skipif( - True, reason="Blocked by possible pandas rank() regression (b/283278923)" -) -def test_rank_with_nulls(scalars_df_index, scalars_pandas_df_index, na_option, method): - col_name = "bool_col" - bf_result = ( - 
scalars_df_index[col_name].rank(na_option=na_option, method=method).to_pandas() - ) - pd_result = ( - scalars_pandas_df_index[col_name] - .rank(na_option=na_option, method=method) - .astype(pd.Float64Dtype()) - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - @pytest.mark.parametrize( ("keep",), [ diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index eb6a0a8dd9..d84244e5cf 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -493,10 +493,7 @@ def test_read_pandas_tokyo( @utils.skip_legacy_pandas def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv" - else: - path = gcs_folder + "test_read_csv_gcs_default_engine_wo_index*.csv" + path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv" read_path = utils.get_first_file_from_wildcard(path) scalars_df.to_csv(path, index=False) dtype = scalars_df.dtypes.to_dict() @@ -520,10 +517,7 @@ def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + "test_read_csv_gcs_bq_engine_w_index*.csv" - else: - path = gcs_folder + "test_read_csv_gcs_bq_engine_wo_index*.csv" + path = gcs_folder + "test_read_csv_gcs_bq_engine_w_index*.csv" scalars_df.to_csv(path, index=False) df = session.read_csv(path, engine="bigquery") diff --git a/third_party/bigframes_vendored/cpython/_pprint.py b/third_party/bigframes_vendored/cpython/_pprint.py index 617c14df0d..9b586c939b 100644 --- a/third_party/bigframes_vendored/cpython/_pprint.py +++ b/third_party/bigframes_vendored/cpython/_pprint.py @@ -110,6 +110,7 @@ def has_changed(k, v): # try to avoid calling repr on nested estimators if isinstance(v, BaseEstimator) and 
v.__class__ != init_params[k].__class__: return True + # Use repr as a last resort. It may be expensive. def is_scalar_nan(x): return isinstance(x, numbers.Real) and math.isnan(x) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 04cc3990a4..d14cbfaa52 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -165,13 +165,20 @@ def astype(self, dtype): Args: dtype (str or pandas.ExtensionDtype): - A dtype supported by BigQuery DataFrame include ``'boolean'``, ``'Float64'``, ``'Int64'``, - ``'int64[pyarrow]'``, ``'string'``, ``'string[pyarrow]'``, ``'timestamp[us, tz=UTC][pyarrow]'``, - ``'timestamp\[us\]\[pyarrow\]'``, ``'date32\[day\]\[pyarrow\]'``, ``'time64\[us\]\[pyarrow\]'``. - A pandas.ExtensionDtype include ``pandas.BooleanDtype()``, ``pandas.Float64Dtype()``, - ``pandas.Int64Dtype()``, ``pandas.StringDtype(storage="pyarrow")``, - ``pd.ArrowDtype(pa.date32())``, ``pd.ArrowDtype(pa.time64("us"))``, - ``pd.ArrowDtype(pa.timestamp("us"))``, ``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``. + A dtype supported by BigQuery DataFrame include ``'boolean'``, + ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``, + ``'string'``, ``'string\\[pyarrow\\]'``, + ``'timestamp\\[us, tz=UTC\\]\\[pyarrow\\]'``, + ``'timestamp\\[us\\]\\[pyarrow\\]'``, + ``'date32\\[day\\]\\[pyarrow\\]'``, + ``'time64\\[us\\]\\[pyarrow\\]'``. + A pandas.ExtensionDtype include ``pandas.BooleanDtype()``, + ``pandas.Float64Dtype()``, ``pandas.Int64Dtype()``, + ``pandas.StringDtype(storage="pyarrow")``, + ``pd.ArrowDtype(pa.date32())``, + ``pd.ArrowDtype(pa.time64("us"))``, + ``pd.ArrowDtype(pa.timestamp("us"))``, + ``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``. 
Returns: same type as caller From 5f1db8b270b32ab366be3690761da137d9fe65f5 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 5 Apr 2024 13:50:46 -0700 Subject: [PATCH 04/23] fix: Inverting int now does bitwise inversion rather than sign flip (#574) --- bigframes/core/compile/scalar_op_compiler.py | 2 +- tests/system/small/test_series.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 5c165fa1df..53a25d63ed 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -397,7 +397,7 @@ def expm1_op_impl(x: ibis_types.Value): @scalar_op_compiler.register_unary_op(ops.invert_op) def invert_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).negate() + return x.__invert__() ## String Operation diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 539b45eb6a..c882677508 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -284,6 +284,21 @@ def test_abs(scalars_dfs, col_name): assert_series_equal(pd_result, bf_result) +@pytest.mark.parametrize( + ("col_name",), + ( + ("bool_col",), + ("int64_col",), + ), +) +def test_series_invert(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (~scalars_df[col_name]).to_pandas() + pd_result = ~scalars_pandas_df[col_name] + + assert_series_equal(pd_result, bf_result) + + def test_fillna(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" From c8b137b4340c53454df9da67fa772e21ed704ea8 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:59:33 -0700 Subject: [PATCH 05/23] chore: reorganize the supported pandas apis page (#584) --- docs/templates/toc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/docs/templates/toc.yml b/docs/templates/toc.yml index 3c2c688d78..72d135cc96 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -12,6 +12,8 @@ name: Methods - href: summary_property.html name: Properties and Attributes + - href: supported_pandas_apis.html + name: Supported pandas APIs name: BigQuery DataFrames API - items: - items: @@ -79,8 +81,6 @@ name: Series - name: Window uid: bigframes.core.window.Window - - href: supported_pandas_apis.html - name: Supported pandas APIs name: bigframes.pandas - items: - items: From 4b08d9243272229f71688152dbeb69d0ab7c68b4 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 5 Apr 2024 16:19:44 -0700 Subject: [PATCH 06/23] fix: address more technical writers feedback (#581) --- bigframes/_config/compute_options.py | 4 ++-- bigframes/ml/base.py | 2 +- bigframes/ml/cluster.py | 2 +- bigframes/ml/decomposition.py | 2 +- bigframes/ml/ensemble.py | 16 ++++++------- bigframes/ml/forecasting.py | 4 ++-- bigframes/ml/imported.py | 24 +++++++++---------- bigframes/ml/linear_model.py | 4 ++-- bigframes/ml/llm.py | 6 ++--- .../pandas/core/config_init.py | 4 ++-- .../pandas/core/groupby/__init__.py | 6 ++--- .../pandas/core/indexes/base.py | 10 ++++---- .../bigframes_vendored/pandas/core/series.py | 2 +- .../bigframes_vendored/sklearn/base.py | 6 ++--- .../sklearn/decomposition/_pca.py | 6 ++--- 15 files changed, 49 insertions(+), 49 deletions(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 2b849c558a..81ef044f4d 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -21,7 +21,7 @@ @dataclasses.dataclass class ComputeOptions: """ - Encapsulates configuration for compute options. + Encapsulates the configuration for compute options. **Examples:** @@ -39,7 +39,7 @@ class ComputeOptions: Limits the bytes billed for query jobs. 
Queries that will have bytes billed beyond this limit will fail (without incurring a charge). If unspecified, this will be set to your project default. - See `maximum_bytes_billed `_. + See `maximum_bytes_billed`: https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed. enable_multi_query_execution (bool, Options): If enabled, large queries may be factored into multiple smaller queries in order to avoid generating queries that are too complex for the query diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index c57cb78791..6c81b66e55 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -193,7 +193,7 @@ def to_gbq(self: _T, model_name: str, replace: bool = False) -> _T: model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: Saved transformer.""" diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 1035def54d..e63764e7bb 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -177,7 +177,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> KMeans: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: KMeans: saved model.""" diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 475b4a046f..0dfb46efaa 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -171,7 +171,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PCA: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. 
+ Determine whether to replace if the model already exists. Default to False. Returns: PCA: saved model.""" diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index a8f0329145..b248c295f4 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -192,9 +192,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. - Returns: saved model.""" + Returns: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -345,10 +345,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - XGBClassifier: saved model.""" + XGBClassifier: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -508,10 +508,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - RandomForestRegressor: saved model.""" + RandomForestRegressor: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -671,10 +671,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestClassifi model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. 
+ Determine whether to replace if the model already exists. Default to False. Returns: - RandomForestClassifier: saved model.""" + RandomForestClassifier: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index e50a8ed35b..a7e0c3c0d9 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -363,10 +363,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ARIMAPlus: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - ARIMAPlus: saved model.""" + ARIMAPlus: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index b551150050..9198b4eafb 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -32,7 +32,7 @@ class TensorFlowModel(base.Predictor): Args: model_path (str): - GCS path that holds the model files. + Cloud Storage path that holds the model files. session (BigQuery Session): BQ session to create the model. """ @@ -69,10 +69,10 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Args: X (bigframes.dataframe.DataFrame): - Input DataFrame, schema is defined by the model. + Input DataFrame. Schema is defined by the model. Returns: - bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" + bigframes.dataframe.DataFrame: Output DataFrame. Schema is defined by the model.""" if not self._bqml_model: if self.model_path is None: @@ -91,10 +91,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. 
Default to False. + Default to False. Returns: - TensorFlowModel: saved model.""" + TensorFlowModel: Saved model.""" if not self._bqml_model: if self.model_path is None: raise ValueError("Model GCS path must be provided.") @@ -146,7 +146,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame or Series, schema is defined by the model. + Input DataFrame or Series. Schema is defined by the model. Returns: bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" @@ -168,10 +168,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - ONNXModel: saved model.""" + ONNXModel: Saved model.""" if not self._bqml_model: if self.model_path is None: raise ValueError("Model GCS path must be provided.") @@ -262,10 +262,10 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame or Series, schema is defined by the model. + Input DataFrame or Series. Schema is defined by the model. Returns: - bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" + bigframes.dataframe.DataFrame: Output DataFrame. Schema is defined by the model.""" if not self._bqml_model: if self.model_path is None: @@ -284,10 +284,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. 
Returns: - XGBoostModel: saved model.""" + XGBoostModel: Saved model.""" if not self._bqml_model: if self.model_path is None: raise ValueError("Model GCS path must be provided.") diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index c0abe77b9f..63462be09f 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -184,7 +184,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: LinearRegression: saved model.""" @@ -349,7 +349,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LogisticRegression: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: LogisticRegression: saved model.""" diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index ffaeb399bb..31c691fd51 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -248,7 +248,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: PaLM2TextGenerator: saved model.""" @@ -415,7 +415,7 @@ def to_gbq( model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. 
Returns: PaLM2TextEmbeddingGenerator: saved model.""" @@ -595,7 +595,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: GeminiTextGenerator: saved model.""" diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index a3178e2761..84ab90a322 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -13,7 +13,7 @@ from __future__ import annotations display_options_doc = """ -Encapsulates configuration for displaying objects. +Encapsulates the configuration for displaying objects. **Examples:** @@ -79,7 +79,7 @@ """ sampling_options_doc = """ -Encapsulates configuration for data sampling. +Encapsulates the configuration for data sampling. Attributes: max_download_size (int, default 500): diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index e1cc8c5a53..ed4ca66f38 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -24,7 +24,7 @@ def any(self): Returns: Series or DataFrame: DataFrame or Series of boolean values, where a value is True if any element is True within its - respective group, False otherwise. + respective group; otherwise False. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -35,7 +35,7 @@ def all(self): Returns: Series or DataFrame: DataFrame or Series of boolean values, where a value is True if all elements are True within its - respective group, False otherwise. + respective group; otherwise False. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -339,7 +339,7 @@ def expanding(self, *args, **kwargs): Provides expanding functionality. Returns: - Series or DataFrame: A expanding grouper, providing expanding functionality per group. + Series or DataFrame: An expanding grouper, providing expanding functionality per group. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 7f5761e45b..eb6b9161fc 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -83,7 +83,7 @@ def copy( name (Label, optional): Set name for new object. Returns: - Index: Index refer to new object which is a copy of this object. + Index: Index reference to new object, which is a copy of this object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -148,7 +148,7 @@ def isin(self, values): """ Return a boolean array where the index values are in `values`. - Compute boolean array of whether each index value is found in the + Compute boolean array to check whether each index value is found in the passed set of values. The length of the returned boolean array matches the length of the index. @@ -195,7 +195,7 @@ def max(self): def argmin(self) -> int: """ - Return int position of the smallest value in the Series. + Return int position of the smallest value in the series. If the minimum is achieved in multiple locations, the first row position is returned. @@ -264,7 +264,7 @@ def value_counts( Args: normalize (bool, default False): - If True then the object returned will contain the relative + If True, then the object returned will contain the relative frequencies of the unique values. sort (bool, default True): Sort by frequencies. 
@@ -316,7 +316,7 @@ def drop(self, labels) -> Index: labels (array-like or scalar): Returns: - Index: Will be same type as self + Index: Will be same type as self. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 785755a562..5426e434b3 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2836,7 +2836,7 @@ def unstack(self, level): def argmax(self): """ - Return int position of the smallest value in the Series. + Return int position of the smallest value in the series. If the minimum is achieved in multiple locations, the first row position is returned. diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 768328e552..fd8db7a227 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -81,9 +81,9 @@ class ClassifierMixin: def score(self, X, y): """Return the mean accuracy on the given test data and labels. - In multi-label classification, this is the subset accuracy - which is a harsh metric since you require for each sample that - each label set be correctly predicted. + In multi-label classification, this is the subset accuracy, + which is a harsh metric since you require that + each label set be correctly predicted for each sample. .. note:: diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index f126e0439d..71e53bf4a9 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -22,7 +22,7 @@ class PCA(BaseEstimator, metaclass=ABCMeta): Args: n_components (int, float or None, default None): - Number of components to keep. 
If n_components is not set all + Number of components to keep. If n_components is not set, all components are kept, n_components = min(n_samples, n_features). If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. svd_solver ("full", "randomized" or "auto", default "auto"): @@ -75,7 +75,7 @@ def predict(self, X): Series or a DataFrame to predict. Returns: - bigframes.dataframe.DataFrame: predicted DataFrames.""" + bigframes.dataframe.DataFrame: Predicted DataFrames.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property @@ -90,7 +90,7 @@ def components_(self): numerical_value: If feature is numeric, the value of feature for the principal component that principal_component_id identifies. If feature isn't numeric, the value is NULL. - categorical_value: An list of mappings containing information about categorical features. Each mapping contains the following fields: + categorical_value: A list of mappings containing information about categorical features. Each mapping contains the following fields: categorical_value.category: The name of each category. categorical_value.value: The value of categorical_value.category for the centroid that centroid_id identifies. 
From 3be4a2e784e046ca9a1fac8d386d072537b6c4de Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Mon, 8 Apr 2024 09:09:08 -0700 Subject: [PATCH 07/23] docs: add examples for at/iat (#582) * docs: add examples for at/iat * fix example * fix example * fix example * fix example --- .../bigframes_vendored/pandas/core/frame.py | 48 +++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 33 +++++++++++++ 2 files changed, 81 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e5aa47ad3e..ed615000c1 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -5362,6 +5362,30 @@ def loc(self): def iat(self): """Access a single value for a row/column pair by integer position. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=['A', 'B', 'C']) + >>> bpd.options.display.progress_bar = None + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + [3 rows x 3 columns] + + Get value at specified row/column pair + + >>> df.iat[1, 2] + 1 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + Returns: bigframes.core.indexers.IatDataFrameIndexer: Indexers object. """ @@ -5371,6 +5395,30 @@ def iat(self): def at(self): """Access a single value for a row/column label pair. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> bpd.options.display.progress_bar = None + >>> df + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + [3 rows x 3 columns] + + Get value at specified row/column pair + + >>> df.at[4, 'B'] + 2 + + Get value within a series + + >>> df.loc[5].at['B'] + 4 + Returns: bigframes.core.indexers.AtDataFrameIndexer: Indexers object. 
""" diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 5426e434b3..2d306fb05d 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3308,6 +3308,22 @@ def loc(self): def iat(self): """Access a single value for a row/column pair by integer position. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(bpd.Series([1, 2, 3])) + >>> bpd.options.display.progress_bar = None + >>> s + 0 1 + 1 2 + 2 3 + dtype: Int64 + + Get value at specified row number + + >>> s.iat[1] + 2 + Returns: bigframes.core.indexers.IatSeriesIndexer: Indexers object. """ @@ -3317,6 +3333,23 @@ def iat(self): def at(self): """Access a single value for a row/column label pair. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> s = bpd.Series([1, 2, 3], index=['A', 'B', 'C']) + >>> bpd.options.display.progress_bar = None + >>> s + A 1 + B 2 + C 3 + dtype: Int64 + + Get value at specified row label + + >>> s.at['B'] + 2 + + Returns: bigframes.core.indexers.AtSeriesIndexer: Indexers object. """ From eed12c181ff8724333b1c426a0eb442c627528b8 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:02:16 -0700 Subject: [PATCH 08/23] fix: toc menu missing plotting name (#591) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal #333374239 🦕 --- docs/templates/toc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 72d135cc96..4573296ec3 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -69,6 +69,7 @@ uid: bigframes.operations.plotting - name: PlotAccessor uid: bigframes.operations.plotting.PlotAccessor + name: Plotting - items: - name: Series uid: bigframes.series.Series From d048aa8248e008e70e8427d4e56f7833da284698 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 9 Apr 2024 08:57:33 -0700 Subject: [PATCH 09/23] refactor: Switch to using internal schema rules rather than ibis schema (#587) --- bigframes/core/__init__.py | 3 +- bigframes/core/blocks.py | 32 +++++++- bigframes/dataframe.py | 6 -- bigframes/dtypes.py | 30 +++---- bigframes/operations/aggregations.py | 112 ++++++++++++++++++--------- 5 files changed, 122 insertions(+), 61 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 9358dab1b1..3fa690ef37 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -106,8 +106,7 @@ def session(self) -> Session: @functools.cached_property def schema(self) -> schemata.ArraySchema: - # TODO: switch to use self.node.schema - return self._compiled_schema + return self.node.schema @functools.cached_property def _compiled_schema(self) -> schemata.ArraySchema: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c7b41e93eb..5b411e5416 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -24,6 +24,7 @@ import dataclasses import functools import itertools +import os import random import typing from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple @@ 
-41,10 +42,12 @@ import bigframes.core.guid as guid import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering +import bigframes.core.schema as bf_schema import bigframes.core.tree_properties as tree_properties import bigframes.core.utils import bigframes.core.utils as utils import bigframes.dtypes +import bigframes.features import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.session._io.pandas @@ -411,7 +414,32 @@ def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" dtypes = dict(zip(self.index_columns, self.index.dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) - return self.session._rows_to_dataframe(result, dtypes) + result_dataframe = self.session._rows_to_dataframe(result, dtypes) + # Runs strict validations to ensure internal type predictions and ibis are completely in sync + # Do not execute these validations outside of testing suite. + if "PYTEST_CURRENT_TEST" in os.environ: + self._validate_result_schema(result_dataframe) + return result_dataframe + + def _validate_result_schema(self, result_df: pd.DataFrame): + ibis_schema = self.expr._compiled_schema + internal_schema = self.expr.node.schema + actual_schema = bf_schema.ArraySchema( + tuple( + bf_schema.SchemaItem(name, dtype) # type: ignore + for name, dtype in result_df.dtypes.items() + ) + ) + if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + return + if internal_schema != actual_schema: + raise ValueError( + f"This error should only occur while testing. BigFrames internal schema: {internal_schema} does not match actual schema: {actual_schema}" + ) + if ibis_schema != actual_schema: + raise ValueError( + f"This error should only occur while testing. 
Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}" + ) def to_pandas( self, @@ -1204,7 +1232,7 @@ def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp # TODO: annotate aggregations themself with this information dtype = self.expr.get_column_type(column_id) stats: list[agg_ops.UnaryAggregateOp] = [agg_ops.count_op] - if dtype not in bigframes.dtypes.UNORDERED_DTYPES: + if bigframes.dtypes.is_orderable(dtype): stats += [agg_ops.min_op, agg_ops.max_op] if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: # Notable exclusions: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 460d1056a3..7b282783bd 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -17,7 +17,6 @@ from __future__ import annotations import datetime -import os import re import sys import textwrap @@ -175,11 +174,6 @@ def __init__( self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() self._query_job: Optional[bigquery.QueryJob] = None - # Runs strict validations to ensure internal type predictions and ibis are completely in sync - # Do not execute these validations outside of testing suite. 
- if "PYTEST_CURRENT_TEST" in os.environ: - self._block.expr.validate_schema() - def __dir__(self): return dir(type(self)) + [ label diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index c5bf5db2fe..3b2092bf85 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -57,13 +57,11 @@ TIME_DTYPE = pd.ArrowDtype(pa.time64("us")) DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us")) TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC")) +GEO_DTYPE = gpd.array.GeometryDtype() # Used when storing Null expressions DEFAULT_DTYPE = FLOAT_DTYPE -# On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable -UNORDERED_DTYPES = [gpd.array.GeometryDtype()] - # Type hints for dtype strings supported by BigQuery DataFrame DtypeString = Literal[ "boolean", @@ -134,6 +132,12 @@ def is_array_like(type: ExpressionType) -> bool: ) +def is_struct_like(type: ExpressionType) -> bool: + return isinstance(type, pd.ArrowDtype) and isinstance( + type.pyarrow_dtype, pa.StructType + ) + + def is_numeric(type: ExpressionType) -> bool: return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE @@ -143,18 +147,18 @@ def is_iterable(type: ExpressionType) -> bool: def is_comparable(type: ExpressionType) -> bool: - return (type is not None) and (type not in UNORDERED_DTYPES) + return (type is not None) and is_orderable(type) -# Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame -ReadOnlyIbisDtype = Union[ - ibis_dtypes.Binary, - ibis_dtypes.JSON, - ibis_dtypes.Decimal, - ibis_dtypes.GeoSpatial, - ibis_dtypes.Array, - ibis_dtypes.Struct, -] +def is_orderable(type: ExpressionType) -> bool: + # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable + return not is_array_like(type) and not is_struct_like(type) and (type != GEO_DTYPE) + + +def is_bool_coercable(type: ExpressionType) -> bool: + # TODO: Implement more bool coercions + return (type is None) or is_numeric(type) or is_string_like(type) + BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, 
Dtype]] = ( (ibis_dtypes.boolean, pd.BooleanDtype()), diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 76aa2a6112..36fa787644 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -23,6 +23,7 @@ import pyarrow as pa import bigframes.dtypes as dtypes +import bigframes.operations.type as signatures @dataclasses.dataclass(frozen=True) @@ -38,7 +39,7 @@ def handles_ties(self): return False @abc.abstractmethod - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: ... @@ -48,7 +49,7 @@ class UnaryWindowOp(WindowOp): def arguments(self) -> int: return 1 - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return input_types[0] @@ -85,7 +86,9 @@ def arguments(self) -> int: class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if not dtypes.is_numeric(input_types[0]): + raise TypeError(f"Type {input_types[0]} is not numeric") if pd.api.types.is_bool_dtype(input_types[0]): return dtypes.INT_DTYPE else: @@ -96,8 +99,10 @@ def output_type(self, *input_types: dtypes.ExpressionType): class MedianOp(UnaryAggregateOp): name: ClassVar[str] = "median" - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: # These will change if median is changed to exact implementation. 
+ if not dtypes.is_orderable(input_types[0]): + raise TypeError(f"Type {input_types[0]} is not orderable") if pd.api.types.is_bool_dtype(input_types[0]): return dtypes.INT_DTYPE else: @@ -112,7 +117,9 @@ class ApproxQuartilesOp(UnaryAggregateOp): def name(self): return f"{self.quartile*25}%" - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if not dtypes.is_orderable(input_types[0]): + raise TypeError(f"Type {input_types[0]} is not orderable") if pd.api.types.is_bool_dtype(input_types[0]) or pd.api.types.is_integer_dtype( input_types[0] ): @@ -125,55 +132,68 @@ def output_type(self, *input_types: dtypes.ExpressionType): class MeanOp(UnaryAggregateOp): name: ClassVar[str] = "mean" - def output_type(self, *input_types: dtypes.ExpressionType): - if pd.api.types.is_bool_dtype(input_types[0]) or pd.api.types.is_integer_dtype( - input_types[0] - ): - return dtypes.FLOAT_DTYPE - else: - return input_types[0] + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class ProductOp(UnaryAggregateOp): name: ClassVar[str] = "product" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class MaxOp(UnaryAggregateOp): name: ClassVar[str] = "max" + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.TypePreserving(dtypes.is_orderable, "orderable").output_type( + input_types[0] + ) + @dataclasses.dataclass(frozen=True) class MinOp(UnaryAggregateOp): name: ClassVar[str] = "min" + def output_type(self, *input_types: 
dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.TypePreserving(dtypes.is_orderable, "orderable").output_type( + input_types[0] + ) + @dataclasses.dataclass(frozen=True) class StdOp(UnaryAggregateOp): name: ClassVar[str] = "std" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class VarOp(UnaryAggregateOp): name: ClassVar[str] = "var" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class PopVarOp(UnaryAggregateOp): name: ClassVar[str] = "popvar" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -184,8 +204,10 @@ class CountOp(UnaryAggregateOp): def skips_nulls(self): return False - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.INT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + lambda x: True, dtypes.INT_DTYPE, "" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -202,7 +224,7 @@ def skips_nulls(self): def handles_ties(self): return True - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> 
dtypes.ExpressionType: if isinstance(self.bins, int) and (self.labels is False): return dtypes.INT_DTYPE else: @@ -237,8 +259,10 @@ def skips_nulls(self): def handles_ties(self): return True - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.INT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -249,7 +273,7 @@ class NuniqueOp(UnaryAggregateOp): def skips_nulls(self): return False - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return dtypes.INT_DTYPE @@ -276,8 +300,10 @@ def skips_nulls(self): def handles_ties(self): return True - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.INT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -290,8 +316,10 @@ def skips_nulls(self): def handles_ties(self): return True - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.INT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -340,32 +368,40 @@ def skips_nulls(self): class AllOp(UnaryAggregateOp): name: ClassVar[str] = "all" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.BOOL_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_bool_coercable, dtypes.BOOL_DTYPE, "convertible to 
boolean" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class AnyOp(UnaryAggregateOp): name: ClassVar[str] = "any" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.BOOL_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_bool_coercable, dtypes.BOOL_DTYPE, "convertible to boolean" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class CorrOp(BinaryAggregateOp): name: ClassVar[str] = "corr" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.BINARY_REAL_NUMERIC.output_type( + input_types[0], input_types[1] + ) @dataclasses.dataclass(frozen=True) class CovOp(BinaryAggregateOp): name: ClassVar[str] = "cov" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.BINARY_REAL_NUMERIC.output_type( + input_types[0], input_types[1] + ) sum_op = SumOp() From a43573d0457ddc6b8ec082df4c04109e34f12e89 Mon Sep 17 00:00:00 2001 From: Lily Zhang <32233490+junyazhang@users.noreply.github.com> Date: Tue, 9 Apr 2024 12:37:22 -0700 Subject: [PATCH 10/23] doc: add examples for DatetimeMethods (#577) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * doc: add examples for DatetimeMethods * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix presubmit failure * fix presubmit failure --------- Co-authored-by: Owl Bot --- .../pandas/core/arrays/datetimelike.py | 4 +- .../pandas/core/indexes/accessor.py | 238 ++++++++++++++++-- .../pandas/core/tools/datetimes.py | 2 +- 3 files changed, 223 insertions(+), 21 deletions(-) diff --git 
a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index ce5f8d55f3..0d910cec92 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -9,8 +9,8 @@ def strftime(self, date_format: str): Convert to string Series using specified date_format. Return a Series of formatted strings specified by date_format. Details - of the string format can be found in `BigQuery format elements doc - <%(https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements)s>`__. + of the string format can be found in BigQuery format elements doc: + https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time. **Examples:** diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 9490f4608b..3f0175359a 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -8,7 +8,27 @@ class DatetimeProperties: @property def day(self): - """The day of the datetime.""" + """The day of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="D") + ... ) + >>> s + 0 2000-01-01 00:00:00 + 1 2000-01-02 00:00:00 + 2 2000-01-03 00:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.day + 0 1 + 1 2 + 2 3 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -18,62 +38,187 @@ def dayofweek(self): Return the day of the week. It is assumed the week starts on Monday, which is denoted by 0 and ends on Sunday which is denoted - by 6. 
This method is available on both Series with datetime - values (using the `dt` accessor) or DatetimeIndex. + by 6. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() + ... ) + >>> s.dt.dayofweek + 2016-12-31 00:00:00 5 + 2017-01-01 00:00:00 6 + 2017-01-02 00:00:00 0 + 2017-01-03 00:00:00 1 + 2017-01-04 00:00:00 2 + 2017-01-05 00:00:00 3 + 2017-01-06 00:00:00 4 + 2017-01-07 00:00:00 5 + 2017-01-08 00:00:00 6 + dtype: Int64 Returns: - Series or Index: Containing integers indicating the day number. + Series: Containing integers indicating the day number. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def date(self): - """Returns numpy array of Python :class:`datetime.date` objects. - - Namely, the date part of Timestamps without time and + """Returns a Series with the date part of Timestamps without time and timezone information. .. warning:: This method returns a Series whereas pandas returns a numpy array. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-01-02 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.date + 0 2020-01-01 + 1 2020-01-02 + dtype: date32[day][pyarrow] """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def hour(self): - """The hours of the datetime.""" + """The hours of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="h") + ... 
) + >>> s + 0 2000-01-01 00:00:00 + 1 2000-01-01 01:00:00 + 2 2000-01-01 02:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.hour + 0 0 + 1 1 + 2 2 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def minute(self): - """The minutes of the datetime.""" + """The minutes of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="min") + ... ) + >>> s + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:01:00 + 2 2000-01-01 00:02:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.minute + 0 0 + 1 1 + 2 2 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def month(self): - """The month as January=1, December=12.""" + """The month as January=1, December=12. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="M") + ... ) + >>> s + 0 2000-01-31 00:00:00 + 1 2000-02-29 00:00:00 + 2 2000-03-31 00:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.month + 0 1 + 1 2 + 2 3 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def second(self): - """The seconds of the datetime.""" + """The seconds of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="s") + ... ) + >>> s + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + dtype: timestamp[us][pyarrow] + >>> s.dt.second + 0 0 + 1 1 + 2 2 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def time(self): - """Returns numpy array of :class:`datetime.time` objects. 
- - The time part of the Timestamps. + """Returns a Series with the time part of the Timestamps. .. warning:: This method returns a Series whereas pandas returns a numpy array. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.time + 0 10:00:00 + 1 11:00:00 + dtype: time64[us][pyarrow] """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -82,16 +227,47 @@ def time(self): def quarter(self): """The quarter of the date. - .. warning:: - This method returns a Series whereas pandas returns - a numpy array. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-04-01 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.quarter + 0 1 + 1 2 + dtype: Int64 """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def year(self): - """The year of the datetime.""" + """The year of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="Y") + ... ) + >>> s + 0 2000-12-31 00:00:00 + 1 2001-12-31 00:00:00 + 2 2002-12-31 00:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.year + 0 2000 + 1 2001 + 2 2002 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -99,6 +275,19 @@ def year(self): def tz(self): """Return the timezone. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.tz + datetime.timezone.utc + Returns: datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None """ @@ -109,6 +298,19 @@ def tz(self): def unit(self) -> str: """Returns the unit of time precision. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.unit + 'us' + Returns: Unit as string (eg. "us"). """ diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 442220f237..3d460b2b16 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -29,7 +29,7 @@ def to_datetime( .. note:: The format strings for specifying datetime representations in BigQuery and pandas are not completely identical. Ensure that the format string provided is compatible - with BigQuery. + with BigQuery (https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time). 
**Examples:** From edef48f7a93e19bc1f6d37fb041dfd6314d881d5 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Tue, 9 Apr 2024 13:55:21 -0700 Subject: [PATCH 11/23] docs: (Series|Dataframe).dtypes (#598) --- .../bigframes_vendored/pandas/core/generic.py | 12 ++++++++++++ .../bigframes_vendored/pandas/core/series.py | 14 ++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index d14cbfaa52..61bc39bb12 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -589,6 +589,18 @@ def dtypes(self): The result's index is the original DataFrame's columns. Columns with mixed types aren't supported yet in BigQuery DataFrames. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'float': [1.0], 'int': [1], 'string': ['foo']}) + >>> df.dtypes + float Float64 + int Int64 + string string[pyarrow] + dtype: object + Returns: A *pandas* Series with the data type of each column. """ diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 2d306fb05d..a75d6c2167 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -119,13 +119,15 @@ def shape(self): def dtype(self): """ Return the dtype object of the underlying data. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - @property - def dtypes(self): - """ - Return the dtype object of the underlying data. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> s.dtype + Int64Dtype() """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 5f1d670e6b839a30acdb495a05011c2ce4e0c7a4 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:53:57 -0700 Subject: [PATCH 12/23] feat: Add pivot_table for DataFrame. (#473) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Add pivot_table for DataFrame. * Update logic * Update comments * Remove code unused after merge. * Code update. * Update code example. * Update for Tuple type. * Update code logic * Update format --------- Co-authored-by: Tim Sweña (Swast) --- bigframes/dataframe.py | 60 ++++++++++++++ tests/system/small/test_dataframe.py | 28 +++++++ .../bigframes_vendored/pandas/core/frame.py | 82 +++++++++++++++++++ 3 files changed, 170 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7b282783bd..32f5a36f79 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2132,6 +2132,66 @@ def pivot( ) -> DataFrame: return self._pivot(columns=columns, index=index, values=values) + def pivot_table( + self, + values: typing.Optional[ + typing.Union[blocks.Label, Sequence[blocks.Label]] + ] = None, + index: typing.Optional[ + typing.Union[blocks.Label, Sequence[blocks.Label]] + ] = None, + columns: typing.Union[blocks.Label, Sequence[blocks.Label]] = None, + aggfunc: str = "mean", + ) -> DataFrame: + if isinstance(index, Iterable) and not ( + isinstance(index, blocks.Label) and index in self.columns + ): + index = list(index) + else: + index = [index] + + if isinstance(columns, Iterable) and not ( + isinstance(columns, blocks.Label) and columns in self.columns + ): + columns = list(columns) + else: + columns = [columns] + + if isinstance(values, Iterable) and not ( + 
isinstance(values, blocks.Label) and values in self.columns + ): + values = list(values) + else: + values = [values] + + # Unlike pivot, pivot_table has values always ordered. + values.sort() + + keys = index + columns + agged = self.groupby(keys, dropna=True)[values].agg(aggfunc) + + if isinstance(agged, bigframes.series.Series): + agged = agged.to_frame() + + agged = agged.dropna(how="all") + + if len(values) == 1: + agged = agged.rename(columns={agged.columns[0]: values[0]}) + + agged = agged.reset_index() + + pivoted = agged.pivot( + columns=columns, + index=index, + values=values if len(values) > 1 else None, + ).sort_index() + + # TODO: Remove the reordering step once the issue is resolved. + # The pivot_table method results in multi-index columns that are always ordered. + # However, the order of the pivoted result columns is not guaranteed to be sorted. + # Sort and reorder. + return pivoted[pivoted.columns.sort_values()] + def stack(self, level: LevelsType = -1): if not isinstance(self.columns, pandas.MultiIndex): if level not in [0, -1, self.columns.name]: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 0811defbc1..ba205078ed 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2606,6 +2606,34 @@ def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.parametrize( + ("values", "index", "columns", "aggfunc"), + [ + (("culmen_length_mm", "body_mass_g"), "species", "sex", "std"), + (["body_mass_g", "culmen_length_mm"], ("species", "island"), "sex", "sum"), + ("body_mass_g", "sex", ["island", "species"], "mean"), + ("culmen_depth_mm", "island", "species", "max"), + ], +) +def test_df_pivot_table( + penguins_df_default_index, + penguins_pandas_df_default_index, + values, + index, + columns, + aggfunc, +): + bf_result = penguins_df_default_index.pivot_table( + 
values=values, index=index, columns=columns, aggfunc=aggfunc + ).to_pandas() + pd_result = penguins_pandas_df_default_index.pivot_table( + values=values, index=index, columns=columns, aggfunc=aggfunc + ) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_column_type=False + ) + + def test_ipython_key_completions_with_drop(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_names = "string_col" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index ed615000c1..1fc80449d1 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4711,6 +4711,88 @@ def pivot(self, *, columns, index=None, values=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): + """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) + on the index and columns of the result DataFrame. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], + ... 'Region': ['East', 'West', 'East', 'West', 'West', 'East'], + ... 'Sales': [100, 200, 150, 100, 200, 150], + ... 'Rating': [3, 5, 4, 3, 3, 5] + ... }) + >>> df + Product Region Sales Rating + 0 Product A East 100 3 + 1 Product B West 200 5 + 2 Product A East 150 4 + 3 Product B West 100 3 + 4 Product A West 200 3 + 5 Product B East 150 5 + + [6 rows x 4 columns] + + Using `pivot_table` with default aggfunc "mean": + + >>> pivot_table = df.pivot_table( + ... values=['Sales', 'Rating'], + ... index='Product', + ... columns='Region' + ... 
) + >>> pivot_table + Rating Sales + Region East West East West + Product + Product A 3.5 3.0 125.0 200.0 + Product B 5.0 4.0 150.0 150.0 + + [2 rows x 4 columns] + + Using `pivot_table` with specified aggfunc "max": + + >>> pivot_table = df.pivot_table( + ... values=['Sales', 'Rating'], + ... index='Product', + ... columns='Region', + ... aggfunc="max" + ... ) + >>> pivot_table + Rating Sales + Region East West East West + Product + Product A 4 3 150 200 + Product B 5 5 150 200 + + [2 rows x 4 columns] + + Args: + values (str, object or a list of the previous, optional): + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + index (str or object or a list of str, optional): + Column to use to make new frame's index. If not given, uses existing index. + + columns (str or object or a list of str): + Column to use to make new frame's columns. + + aggfunc (str, default "mean"): + Aggregation function name to compute summary statistics (e.g., 'sum', 'mean'). + + Returns: + DataFrame: An Excel style pivot table. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def stack(self, level=-1): """ Stack the prescribed level(s) from columns to index. From a557ea2b64633932f730b56688f76806da6195fb Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 10 Apr 2024 02:00:26 +0000 Subject: [PATCH 13/23] docs: add code samples for `str` accessor methdos (#594) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 333397327 🦕 --- bigframes/operations/strings.py | 20 +- scripts/get_documentation_coverage.py | 4 + .../pandas/core/strings/accessor.py | 663 +++++++++++++++++- 3 files changed, 677 insertions(+), 10 deletions(-) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index abd45a1453..883d19a1e3 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -53,7 +53,25 @@ def lower(self) -> series.Series: return self._apply_unary_op(ops.lower_op) def reverse(self) -> series.Series: - """Reverse strings in the Series.""" + """Reverse strings in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) + >>> s.str.reverse() + 0 elppa + 1 ananab + 2 + 3 <NA> + dtype: string + + Returns: + bigframes.series.Series: A Series of strings in which the characters of + each element appear in reverse order. + """ # reverse method is in ibis, not pandas.
return self._apply_unary_op(ops.reverse_op) diff --git a/scripts/get_documentation_coverage.py b/scripts/get_documentation_coverage.py index 0b9417b2d3..a6566cafab 100755 --- a/scripts/get_documentation_coverage.py +++ b/scripts/get_documentation_coverage.py @@ -97,6 +97,10 @@ def get_coverage_summary( if name.startswith("_") and not name.startswith("__"): continue + # ignore constructor + if name == "__init__": + continue + def predicate(impl): return ( # This includes class methods like `from_dict`, `from_records` diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index ecdd9547d5..5bb69dc1f2 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -20,19 +20,57 @@ def extract(self, pat: str, flags: int = 0): For each subject string in the Series, extract groups from the first match of regular expression `pat`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + A pattern with two groups will return a DataFrame with two columns. + Non-matches will be `NaN`. + + >>> s = bpd.Series(['a1', 'b2', 'c3']) + >>> s.str.extract(r'([ab])(\\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 <NA> <NA> + <BLANKLINE> + [3 rows x 2 columns] + + Named groups will become column names in the result. + + >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\\d)') + letter digit + 0 a 1 + 1 b 2 + 2 <NA> <NA> + <BLANKLINE> + [3 rows x 2 columns] + + A pattern with one group will return a DataFrame with one column. + + >>> s.str.extract(r'[ab](\\d)') + 0 + 0 1 + 1 2 + 2 <NA> + <BLANKLINE> + [3 rows x 1 columns] + Args: - pat: + pat (str): Regular expression pattern with capturing groups. - flags: + flags (int, default 0 (no flags)): Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that modify regular expression matching for things like case, spaces, etc. For more details, see :mod:`re`.
Returns: - A DataFrame with one row for each subject string, and one - column for each group. Any capture group names in regular - expression pat will be used for column names; otherwise - capture group numbers will be used. + bigframes.dataframe.DataFrame: + A DataFrame with one row for each subject string, and one + column for each group. Any capture group names in regular + expression pat will be used for column names; otherwise + capture group numbers will be used. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -43,12 +81,24 @@ def find(self, sub, start: int = 0, end=None): substring is fully contained between [start:end]. Return -1 on failure. Equivalent to standard :meth:`str.find`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(["cow_", "duck_", "do_ve"]) + >>> ser.str.find("_") + 0 3 + 1 4 + 2 2 + dtype: Int64 + Args: - sub: + sub (str): Substring being searched. start (int, default 0): Left edge index. - end (None): + end (int, default None): Right edge index. Returns: @@ -62,6 +112,20 @@ def len(self): The element may be a sequence (such as a string, tuple or list) or a collection (such as a dictionary). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Returns the length (number of characters) in a string. + + >>> s = bpd.Series(['dog', '', bpd.NA]) + >>> s.str.len() + 0 3 + 1 0 + 2 <NA> + dtype: Int64 + Returns: bigframes.series.Series: A Series or Index of integer values indicating the length of each element in the Series or Index. @@ -74,6 +138,22 @@ def lower(self): Equivalent to :meth:`str.lower`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['lower', + ... 'CAPITALS', + ... 'this is a sentence', + ...
'SwApCaSe']) + >>> s.str.lower() + 0 lower + 1 capitals + 2 this is a sentence + 3 swapcase + dtype: string + Returns: bigframes.series.Series: Series with lowercase. """ @@ -83,6 +163,36 @@ def lower(self): def slice(self, start=None, stop=None): """Slice substrings from each element in the Series or Index. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["koala", "dog", "chameleon"]) + >>> s + 0 koala + 1 dog + 2 chameleon + dtype: string + + >>> s.str.slice(start=1) + 0 oala + 1 og + 2 hameleon + dtype: string + + >>> s.str.slice(stop=2) + 0 ko + 1 do + 2 ch + dtype: string + + >>> s.str.slice(start=2, stop=5) + 0 ala + 1 g + 2 ame + dtype: string + Args: start (int, optional): Start position for slice operation. @@ -106,6 +216,27 @@ def strip(self): Replaces any non-strings in Series with NaNs. Equivalent to :meth:`str.strip`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) + >>> s + 0 Ant + 1 Bee + 2 Cat + + 3 + dtype: string + + >>> s.str.strip() + 0 Ant + 1 Bee + 2 Cat + 3 + dtype: string + Returns: bigframes.series.Series: Series or Index without leading and trailing characters. @@ -118,6 +249,22 @@ def upper(self): Equivalent to :meth:`str.upper`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['lower', + ... 'CAPITALS', + ... 'this is a sentence', + ... 'SwApCaSe']) + >>> s.str.upper() + 0 LOWER + 1 CAPITALS + 2 THIS IS A SENTENCE + 3 SWAPCASE + dtype: string + Returns: bigframes.series.Series: Series with uppercase strings. """ @@ -131,6 +278,19 @@ def isnumeric(self): :meth:`str.isnumeric` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series(['one', 'one1', '1', '']) + >>> s1.str.isnumeric() + 0 False + 1 False + 2 True + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -145,6 +305,19 @@ def isalpha(self): :meth:`str.isalpha` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series(['one', 'one1', '1', '']) + >>> s1.str.isalpha() + 0 True + 1 False + 2 False + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series with the same length as the originalSeries/Index. """ @@ -158,6 +331,19 @@ def isdigit(self): :meth:`str.isdigit` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['23', '1a', '1/5', '']) + >>> s.str.isdigit() + 0 True + 1 False + 2 False + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series with the same length as the originalSeries/Index. """ @@ -171,6 +357,30 @@ def isalnum(self): :meth:`str.isalnum` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series(['one', 'one1', '1', '']) + >>> s1.str.isalnum() + 0 True + 1 True + 2 True + 3 False + dtype: boolean + + Note that checks against characters mixed with any additional + punctuation or whitespace will evaluate to false for an alphanumeric + check. 
+ + >>> s2 = bpd.Series(['A B', '1.5', '3,000']) + >>> s2.str.isalnum() + 0 False + 1 False + 2 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -185,6 +395,18 @@ def isspace(self): :meth:`str.isspace` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([' ', '\\t\\r\\n ', '']) + >>> s.str.isspace() + 0 True + 1 True + 2 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -199,6 +421,19 @@ def islower(self): :meth:`str.islower` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) + >>> s.str.islower() + 0 True + 1 False + 2 False + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -213,6 +448,19 @@ def isupper(self): :meth:`str.isupper` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) + >>> s.str.isupper() + 0 False + 1 False + 2 True + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -227,6 +475,22 @@ def isdecimal(self): :meth:`str.isdecimal` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + The `isdecimal` method checks for characters used to form numbers in + base 10. + + >>> s = bpd.Series(['23', '³', '⅕', '']) + >>> s.str.isdecimal() + 0 True + 1 False + 2 False + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -242,6 +506,27 @@ def rstrip(self): Replaces any non-strings in Series with NaNs. Equivalent to :meth:`str.rstrip`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) + >>> s + 0 Ant + 1 Bee + 2 Cat + + 3 + dtype: string + + >>> s.str.rstrip() + 0 Ant + 1 Bee + 2 Cat + 3 + dtype: string + Returns: bigframes.series.Series: Series without trailing characters. """ @@ -256,6 +541,28 @@ def lstrip(self): Replaces any non-strings in Series with NaNs. Equivalent to :meth:`str.lstrip`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) + >>> s + 0 Ant + 1 Bee + 2 Cat + + 3 + dtype: string + + >>> s.str.lstrip() + 0 Ant + 1 Bee + 2 Cat + + 3 + dtype: string + Returns: bigframes.series.Series: Series without leading characters. """ @@ -265,6 +572,24 @@ def lstrip(self): def repeat(self, repeats: int): """Duplicate each string in the Series or Index. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: string + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: string + Args: repeats : int or sequence of int Same value for all (int) or different value per (sequence). @@ -281,6 +606,22 @@ def capitalize(self): Equivalent to :meth:`str.capitalize`. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['lower', + ... 'CAPITALS', + ... 'this is a sentence', + ... 'SwApCaSe']) + >>> s.str.capitalize() + 0 Lower + 1 Capitals + 2 This is a sentence + 3 Swapcase + dtype: string + Returns: bigframes.series.Series: Series with captitalized strings. """ @@ -293,8 +634,43 @@ def cat(self, others, *, join): If `others` is specified, this function concatenates the Series/Index and elements of `others` element-wise. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can concatenate each string in a Series to another string. + + >>> s = bpd.Series(['Jane', 'John']) + >>> s.str.cat(" Doe") + 0 Jane Doe + 1 John Doe + dtype: string + + You can concatenate another Series. By default left join is performed to + align the corresponding elements. + + >>> s.str.cat(bpd.Series([" Doe", " Foe", " Roe"])) + 0 Jane Doe + 1 John Foe + dtype: string + + >>> s.str.cat(bpd.Series([" Doe", " Foe", " Roe"], index=[2, 0, 1])) + 0 Jane Foe + 1 John Roe + dtype: string + + You can enforce an outer join. + + >>> s.str.cat(bpd.Series([" Doe", " Foe", " Roe"]), join="outer") + 0 Jane Doe + 1 John Foe + 2 + dtype: string + Args: - others (Series): + others (str or Series): + A string or a Series of strings. join ({'left', 'outer'}, default 'left'): Determines the join-style between the calling Series and any @@ -315,6 +691,77 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True Return boolean Series or Index based on whether a given pattern or regex is contained within a string of a Series or Index. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Returning a Series of booleans using only a literal pattern. 
+ + >>> s1 = bpd.Series(['Mouse', 'dog', 'house and parrot', '23', None]) + >>> s1.str.contains('og') + 0 False + 1 True + 2 False + 3 False + 4 + dtype: boolean + + Specifying case sensitivity using `case`. + + >>> s1.str.contains('oG', case=True) + 0 False + 1 False + 2 False + 3 False + 4 + dtype: boolean + + Returning 'house' or 'dog' when either expression occurs in a string. + + >>> s1.str.contains('house|dog', regex=True) + 0 False + 1 True + 2 True + 3 False + 4 + dtype: boolean + + Ignoring case sensitivity using `flags` with regex. + + >>> import re + >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) + 0 False + 1 False + 2 True + 3 False + 4 + dtype: boolean + + Returning any digit using regular expression. + + >>> s1.str.contains('\\d', regex=True) + 0 False + 1 False + 2 False + 3 True + 4 + dtype: boolean + + Ensure `pat` is a not a literal pattern when `regex` is set to True. + Note in the following example one might expect only *s2[1]* and *s2[3]* + to return `True`. However, '.0' as a regex matches any character + followed by a 0. + + >>> s2 = bpd.Series(['40', '40.0', '41', '41.0', '35']) + >>> s2.str.contains('.0', regex=True) + 0 True + 1 True + 2 False + 3 True + 4 False + dtype: boolean + Args: pat (str, re.Pattern): Character sequence or regular expression. @@ -348,6 +795,32 @@ def replace( Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + When *pat* is a string and *regex* is True, the given *pat* is compiled + as a regex. When *repl* is a string, it replaces matching regex patterns + as with `re.sub()`. 
NaN value(s) in the Series are left as is: + + >>> s = bpd.Series(['foo', 'fuz', bpd.NA]) + >>> s.str.replace('f.', 'ba', regex=True) + 0 bao + 1 baz + 2 + dtype: string + + When *pat* is a string and *regex* is False, every *pat* is replaced + with *repl* as with `str.replace()`: + + >>> s = bpd.Series(['f.o', 'fuz', bpd.NA]) + >>> s.str.replace('f.', 'ba', regex=False) + 0 bao + 1 fuz + 2 + dtype: string + Args: pat (str, re.Pattern): String can be a character sequence or regular expression. @@ -384,6 +857,33 @@ def startswith( """ Test if the start of each string element matches a pattern. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bat', 'Bear', 'caT', bpd.NA]) + >>> s + 0 bat + 1 Bear + 2 caT + 3 + dtype: string + + >>> s.str.startswith('b') + 0 True + 1 False + 2 False + 3 + dtype: boolean + + >>> s.str.startswith(('b', 'B')) + 0 True + 1 True + 2 False + 3 + dtype: boolean + Args: pat (str, tuple[str, ...]): Character sequence or tuple of strings. Regular expressions are not @@ -402,6 +902,33 @@ def endswith( """ Test if the end of each string element matches a pattern. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bat', 'bear', 'caT', bpd.NA]) + >>> s + 0 bat + 1 bear + 2 caT + 3 + dtype: string + + >>> s.str.endswith('t') + 0 True + 1 False + 2 False + 3 + dtype: boolean + + >>> s.str.endswith(('t', 'T')) + 0 True + 1 False + 2 True + 3 + dtype: boolean + Args: pat (str, tuple[str, ...]): Character sequence or tuple of strings. Regular expressions are not @@ -417,6 +944,18 @@ def match(self, pat: str, case: bool = True, flags: int = 0): """ Determine if each string starts with a match of a regular expression. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(["horse", "eagle", "donkey"]) + >>> ser.str.match("e") + 0 False + 1 True + 2 False + dtype: boolean + Args: pat (str): Character sequence or regular expression. @@ -434,6 +973,18 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): """ Determine if each string entirely matches a regular expression. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(["cat", "duck", "dove"]) + >>> ser.str.fullmatch(r'd.+') + 0 False + 1 True + 2 True + dtype: boolean + Args: pat (str): Character sequence or regular expression. @@ -454,6 +1005,18 @@ def get(self, i: int): Extract element from lists, tuples, dict, or strings in each element in the Series/Index. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["apple", "banana", "fig"]) + >>> s.str.get(3) + 0 l + 1 a + 2 + dtype: string + Args: i (int): Position or key of element to extract. @@ -472,6 +1035,32 @@ def pad( """ Pad strings in the Series/Index up to width. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["caribou", "tiger"]) + >>> s + 0 caribou + 1 tiger + dtype: string + + >>> s.str.pad(width=10) + 0 caribou + 1 tiger + dtype: string + + >>> s.str.pad(width=10, side='right', fillchar='-') + 0 caribou--- + 1 tiger----- + dtype: string + + >>> s.str.pad(width=10, side='both', fillchar='-') + 0 -caribou-- + 1 --tiger--- + dtype: string + Args: width (int): Minimum width of resulting string; additional characters will be filled @@ -494,6 +1083,18 @@ def ljust( """ Pad right side of strings in the Series/Index up to width. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.ljust(8, fillchar='.') + 0 dog..... + 1 bird.... + 2 mouse... + dtype: string + Args: width (int): Minimum width of resulting string; additional characters will be filled @@ -514,6 +1115,18 @@ def rjust( """ Pad left side of strings in the Series/Index up to width. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.rjust(8, fillchar='.') + 0 .....dog + 1 ....bird + 2 ...mouse + dtype: string + Args: width (int): Minimum width of resulting string; additional characters will be filled @@ -538,6 +1151,26 @@ def zfill( in the Series/Index with length greater or equal to `width` are unchanged. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['-1', '1', '1000', bpd.NA]) + >>> s + 0 -1 + 1 1 + 2 1000 + 3 + dtype: string + + >>> s.str.zfill(3) + 0 -01 + 1 001 + 2 1000 + 3 + dtype: string + Args: width (int): Minimum length of resulting string; strings with length less @@ -558,6 +1191,18 @@ def center( Equivalent to :meth:`str.center`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.center(8, fillchar='.') + 0 ..dog... + 1 ..bird.. + 2 .mouse.. 
+ dtype: string + Args: width (int): Minimum width of resulting string; additional characters will be filled From 8702dcf54c0f2073e21df42eaef51927481da421 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Wed, 10 Apr 2024 13:37:02 -0700 Subject: [PATCH 14/23] fix: error for object dtype on read_pandas (#570) --- bigframes/session/__init__.py | 11 ++++++++++- tests/system/small/test_session.py | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 354352f1c9..b6d56006be 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1050,7 +1050,12 @@ def _read_pandas( inline_df = self._read_pandas_inline(pandas_dataframe) if inline_df is not None: return inline_df - return self._read_pandas_load_job(pandas_dataframe, api_name) + try: + return self._read_pandas_load_job(pandas_dataframe, api_name) + except pa.ArrowInvalid as e: + raise pa.ArrowInvalid( + f"Could not convert with a BigQuery type: `{e}`. " + ) from e def _read_pandas_inline( self, pandas_dataframe: pandas.DataFrame @@ -1064,6 +1069,10 @@ def _read_pandas_inline( inline_df = dataframe.DataFrame( blocks.Block.from_local(pandas_dataframe, self) ) + except pa.ArrowInvalid as e: + raise pa.ArrowInvalid( + f"Could not convert with a BigQuery type: `{e}`. " + ) from e except ValueError: # Thrown by ibis for some unhandled types return None except pa.ArrowTypeError: # Thrown by arrow for types without mapping (geo). 
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index d84244e5cf..ce415f9324 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -24,6 +24,7 @@ import google.cloud.bigquery as bigquery import numpy as np import pandas as pd +import pyarrow as pa import pytest import bigframes @@ -436,6 +437,11 @@ def test_read_pandas_index(session): pd.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) +def test_read_pandas_w_unsupported_mixed_dtype(session): + with pytest.raises(pa.ArrowInvalid, match="Could not convert"): + session.read_pandas(pd.DataFrame({"a": [1, "hello"]})) + + def test_read_pandas_inline_respects_location(): options = bigframes.BigQueryOptions(location="europe-west1") session = bigframes.Session(options) From 5d0f149dce5425098fcd154d96a302c1661ce5d3 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 10 Apr 2024 14:40:27 -0700 Subject: [PATCH 15/23] feat: Add MultiIndex subclass. (#596) --- bigframes/core/indexes/__init__.py | 2 + bigframes/core/indexes/base.py | 57 ++++++------ bigframes/core/indexes/multi.py | 48 ++++++++++ bigframes/pandas/__init__.py | 2 + tests/system/small/test_multiindex.py | 25 ++++++ .../pandas/core/indexes/multi.py | 88 +++++++++++++++++++ 6 files changed, 193 insertions(+), 29 deletions(-) create mode 100644 bigframes/core/indexes/multi.py create mode 100644 third_party/bigframes_vendored/pandas/core/indexes/multi.py diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index ae6011ffa5..0a95adcd83 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -13,7 +13,9 @@ # limitations under the License. 
from bigframes.core.indexes.base import Index +from bigframes.core.indexes.multi import MultiIndex __all__ = [ "Index", + "MultiIndex", ] diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index daa52a02b9..46a9e30637 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -42,9 +42,15 @@ class Index(vendored_pandas_index.Index): __doc__ = vendored_pandas_index.Index.__doc__ - - def __init__( - self, + _query_job = None + _block: blocks.Block + _linked_frame: Union[ + bigframes.dataframe.DataFrame, bigframes.series.Series, None + ] = None + + # Overrided on __new__ to create subclasses like pandas does + def __new__( + cls, data=None, dtype=None, *, @@ -73,18 +79,30 @@ def __init__( if dtype is not None: index = index.astype(dtype) block = index._block + elif isinstance(data, pandas.Index): + pd_df = pandas.DataFrame(index=data) + block = df.DataFrame(pd_df, session=session)._block else: pd_index = pandas.Index(data=data, dtype=dtype, name=name) pd_df = pandas.DataFrame(index=pd_index) block = df.DataFrame(pd_df, session=session)._block - self._query_job = None - self._block: blocks.Block = block + + # TODO: Support more index subtypes + from bigframes.core.indexes.multi import MultiIndex + + klass = MultiIndex if len(block._index_columns) > 1 else cls + result = typing.cast(Index, object.__new__(klass)) + result._query_job = None + result._block = block + return result @classmethod def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: - return FrameIndex(frame) + index = Index(frame._block) + index._linked_frame = frame + return index @property def name(self) -> blocks.Label: @@ -107,6 +125,10 @@ def names(self) -> typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): new_block = self._block.with_index_labels(values) + if self._linked_frame is not None: + self._linked_frame._set_block( + 
self._linked_frame._block.with_index_labels(values) + ) self._block = new_block @property @@ -452,26 +474,3 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: def __len__(self): return self.shape[0] - - -# Index that mutates the originating dataframe/series -class FrameIndex(Index): - def __init__( - self, - series_or_dataframe: typing.Union[ - bigframes.series.Series, bigframes.dataframe.DataFrame - ], - ): - super().__init__(series_or_dataframe._block) - self._whole_frame = series_or_dataframe - - @property - def names(self) -> typing.Sequence[blocks.Label]: - """Returns the names of the Index.""" - return self._block._index_labels - - @names.setter - def names(self, values: typing.Sequence[blocks.Label]): - new_block = self._whole_frame._get_block().with_index_labels(values) - self._whole_frame._set_block(new_block) - self._block = new_block diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py new file mode 100644 index 0000000000..182d1f101c --- /dev/null +++ b/bigframes/core/indexes/multi.py @@ -0,0 +1,48 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import cast, Hashable, Iterable, Sequence + +import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex +import pandas + +from bigframes.core.indexes.base import Index + + +class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): + __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ + + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return cast(MultiIndex, Index(pd_index)) + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return cast(MultiIndex, Index(pd_index)) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 4b0ac4310c..f5be4421e4 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -707,6 +707,7 @@ def to_datetime( # checking and docstrings. 
DataFrame = bigframes.dataframe.DataFrame Index = bigframes.core.indexes.Index +MultiIndex = bigframes.core.indexes.MultiIndex Series = bigframes.series.Series # Other public pandas attributes @@ -760,6 +761,7 @@ def to_datetime( # Class aliases "DataFrame", "Index", + "MultiIndex", "Series", # Other public pandas attributes "NamedAgg", diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 330fe44eb8..bb0af52976 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -20,6 +20,31 @@ from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +def test_multi_index_from_arrays(): + bf_idx = bpd.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + pd_idx = pandas.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + assert bf_idx.names == pd_idx.names + pandas.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + + @skip_legacy_pandas def test_read_pandas_multi_index_axes(): index = pandas.MultiIndex.from_arrays( diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py new file mode 100644 index 0000000000..a882aa40e3 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -0,0 +1,88 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/multi.py +from __future__ import annotations + +from typing import Hashable, Iterable, Sequence + +import bigframes_vendored.pandas.core.indexes.base + +from bigframes import constants + + +class 
MultiIndex(bigframes_vendored.pandas.core.indexes.base.Index): + """ + A multi-level, or hierarchical, index object for pandas objects. + """ + + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + """ + Convert list of tuples to MultiIndex. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> tuples = [(1, 'red'), (1, 'blue'), + ... (2, 'red'), (2, 'blue')] + >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + Args: + tuples (list / sequence of tuple-likes): + Each tuple is the index of one row/column. + sortorder (int or None): + Level of sortedness (must be lexicographically sorted by that + level). + names (list / sequence of str, optional): + Names for the levels in the index. + + Returns: + MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + """ + Convert arrays to MultiIndex. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + Args: + arrays (list / sequence of array-likes): + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder (int or None): + Level of sortedness (must be lexicographically sorted by that + level). + names (list / sequence of str, optional): + Names for the levels in the index. 
+ + Returns: + MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 86e0f38adc71d76e09dd832e5e33cb7c1aab02ac Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 10 Apr 2024 16:48:16 -0700 Subject: [PATCH 16/23] feat: Add hasnans, combine_first, update to Series (#600) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/convert.py | 28 +++- bigframes/series.py | 21 ++- tests/system/small/test_series.py | 51 +++++++ .../bigframes_vendored/pandas/core/series.py | 138 ++++++++++++++++++ 4 files changed, 236 insertions(+), 2 deletions(-) diff --git a/bigframes/core/convert.py b/bigframes/core/convert.py index 98f854ad72..1ef329b0c7 100644 --- a/bigframes/core/convert.py +++ b/bigframes/core/convert.py @@ -13,13 +13,27 @@ # limitations under the License. 
from __future__ import annotations +from typing import Optional + import pandas as pd import bigframes.core.indexes as index import bigframes.series as series -def to_bf_series(obj, default_index: index.Index) -> series.Series: +def to_bf_series(obj, default_index: Optional[index.Index]) -> series.Series: + """ + Convert a an object to a bigframes series + + Args: + obj (list-like or Series): + Object to convert to bigframes Series + default_index (list-like or Index or None): + Index to use if obj has no index + + Returns + bigframes.pandas.Series + """ if isinstance(obj, series.Series): return obj if isinstance(obj, pd.Series): @@ -35,6 +49,18 @@ def to_bf_series(obj, default_index: index.Index) -> series.Series: def to_pd_series(obj, default_index: pd.Index) -> pd.Series: + """ + Convert a an object to a pandas series + + Args: + obj (list-like or Series): + Object to convert to pandas Series + default_index (list-like or Index or None): + Index to use if obj has no index + + Returns + pandas.Series + """ if isinstance(obj, series.Series): return obj.to_pandas() if isinstance(obj, pd.Series): diff --git a/bigframes/series.py b/bigframes/series.py index 185891bc01..b975979eaf 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -22,7 +22,7 @@ import os import textwrap import typing -from typing import Any, Literal, Mapping, Optional, Tuple, Union +from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery @@ -130,6 +130,11 @@ def ndim(self) -> int: def empty(self) -> bool: return self.shape[0] == 0 + @property + def hasnans(self) -> bool: + # Note, hasnans is actually a null check, and NaNs don't count for nullable float + return self.isnull().any() + @property def values(self) -> numpy.ndarray: return self.to_numpy() @@ -753,6 +758,20 @@ def __matmul__(self, other): dot = __matmul__ + def combine_first(self, other: Series) -> 
Series: + result = self._apply_binary_op(other, ops.coalesce_op) + result.name = self.name + return result + + def update(self, other: Union[Series, Sequence, Mapping]) -> None: + import bigframes.core.convert + + other = bigframes.core.convert.to_bf_series(other, default_index=None) + result = self._apply_binary_op( + other, ops.coalesce_op, reverse=True, alignment="left" + ) + self._set_block(result._get_block()) + def abs(self) -> Series: return self._apply_unary_op(ops.abs_op) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c882677508..c93af1bf2f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1261,6 +1261,39 @@ def test_binop_right_filtered(scalars_dfs): ) +@skip_legacy_pandas +def test_series_combine_first(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7) + bf_result = int64_col.combine_first(float64_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7) + pd_result = pd_int64_col.combine_first(pd_float64_col) + + assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_update(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7).copy() + float64_col.update(int64_col) + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7).copy() + pd_float64_col.update(pd_int64_col) + + assert_series_equal( + float64_col.to_pandas(), + pd_float64_col, + ) + + def test_mean(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" @@ -1649,6 +1682,24 @@ def test_size(scalars_dfs): assert pd_result == bf_result +def test_series_hasnans_true(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result 
= scalars_df["string_col"].hasnans + pd_result = scalars_pandas_df["string_col"].hasnans + + assert pd_result == bf_result + + +def test_series_hasnans_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].dropna().hasnans + pd_result = scalars_pandas_df["string_col"].dropna().hasnans + + assert pd_result == bf_result + + def test_empty_false(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a75d6c2167..572f29ff17 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -175,6 +175,31 @@ def name(self) -> Hashable: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def hasnans(self) -> bool: + """ + Return True if there are any NaNs. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3, None]) + >>> s + 0 1.0 + 1 2.0 + 2 3.0 + 3 + dtype: Float64 + >>> s.hasnans + True + + Returns: + bool + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def T(self) -> Series: """Return the transpose, which is by definition self. @@ -2343,6 +2368,119 @@ def rdivmod(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def combine_first(self, other) -> Series: + """ + Update null elements with value in the same location in 'other'. + + Combine two Series objects by filling null values in one Series with + non-null values from the other Series. Result index will be the union + of the two indexes. 
+ + **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series([1, np.nan]) + >>> s2 = bpd.Series([3, 4, 5]) + >>> s1.combine_first(s2) + 0 1.0 + 1 4.0 + 2 5.0 + dtype: Float64 + + Null values still persist if the location of that null value + does not exist in `other` + + >>> s1 = bpd.Series({'falcon': np.nan, 'eagle': 160.0}) + >>> s2 = bpd.Series({'eagle': 200.0, 'duck': 30.0}) + >>> s1.combine_first(s2) + falcon + eagle 160.0 + duck 30.0 + dtype: Float64 + + Args: + other (Series): + The value(s) to be used for filling null values. + + Returns: + Series: The result of combining the provided Series with the other object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def update(self, other) -> None: + """ + Modify Series in place using values from passed Series. + + Uses non-NA values from passed Series to make updates. Aligns + on index. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: Int64 + + >>> s = bpd.Series(['a', 'b', 'c']) + >>> s.update(bpd.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: string + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: Int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series. 
+ + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, np.nan, 6], dtype=pd.Int64Dtype())) + >>> s + 0 4 + 1 2 + 2 6 + dtype: Int64 + + ``other`` can also be a non-Series object type + that is coercible into a Series + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update([4, np.nan, 6]) + >>> s + 0 4.0 + 1 2.0 + 2 6.0 + dtype: Float64 + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update({1: 9}) + >>> s + 0 1 + 1 9 + 2 3 + dtype: Int64 + + Args: + other (Series, or object coercible into Series) + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def all( self, ): From b94bae9892e0fa79dc4bde0f4f1427d00accda6d Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Thu, 11 Apr 2024 10:21:54 -0700 Subject: [PATCH 17/23] fix: loc setitem dtype issue. (#603) * fix: loc setitem dtype issue. * Update NaN selection * Update code example --- bigframes/core/indexers.py | 10 +++++++++- tests/system/small/test_dataframe.py | 14 +++++++++++--- .../bigframes_vendored/pandas/core/generic.py | 6 +++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index da6f3f3740..bc03bd1df0 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -192,7 +192,15 @@ def __setitem__( and isinstance(key[0], bigframes.series.Series) and key[0].dtype == "boolean" ) and pd.api.types.is_scalar(value): - new_column = key[0].map({True: value, False: None}) + # For integer scalar, if set value to a new column, the dtype would be default to float. + # But if set value to an existing Int64 column, the dtype would still be integer. + # So we need to use different NaN type to match this behavior. 
+ new_column = key[0].map( + { + True: value, + False: pd.NA if key[1] in self._dataframe.columns else None, + } + ) try: original_column = self._dataframe[key[1]] except KeyError: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ba205078ed..e70764fcc0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2918,15 +2918,23 @@ def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): ) -def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs): +@pytest.mark.parametrize( + ("col", "value"), + [ + ("string_col", "hello"), + ("int64_col", 3), + ("float64_col", 3.5), + ], +) +def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): if pd.__version__.startswith("1."): pytest.skip("this loc overload not supported in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() pd_df = scalars_pandas_df.copy() - bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = "hello" - pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = "hello" + bf_df.loc[bf_df["int64_too"] == 1, col] = value + pd_df.loc[pd_df["int64_too"] == 1, col] = value pd.testing.assert_frame_equal( bf_df.to_pandas(), diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 61bc39bb12..baa9534a0e 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -662,9 +662,9 @@ def copy(self): >>> df.loc[df["b"] == 2, "b"] = 22 >>> df - a b - 0 1 22.0 - 1 3 4.0 + a b + 0 1 22 + 1 3 4 [2 rows x 2 columns] >>> df_copy From 231cf298e1afee1a145aa8886f185a90cf64c93b Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 11 Apr 2024 10:32:48 -0700 Subject: [PATCH 18/23] refactor: Remove join name mapper code. (#597) * refactor: Remove join name mapper code. 
* remove dead import --- bigframes/core/compile/single_column.py | 12 +++++--- bigframes/core/joins/__init__.py | 5 +-- bigframes/core/joins/name_resolution.py | 41 ------------------------- bigframes/pandas/__init__.py | 1 + 4 files changed, 12 insertions(+), 47 deletions(-) delete mode 100644 bigframes/core/joins/name_resolution.py diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index f1a3d723ac..dbf25891bf 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,8 +23,8 @@ import ibis.expr.types as ibis_types import bigframes.core.compile.compiled as compiled +import bigframes.core.guid as guids import bigframes.core.join_def as join_defs -import bigframes.core.joins as joining import bigframes.core.ordering as orderings @@ -50,9 +50,13 @@ def join_by_column_ordered( finally, all the right columns. """ - l_hidden_mapping, r_hidden_mapping = joining.JoinNameRemapper(namespace="hidden")( - left._hidden_column_ids, right._hidden_column_ids - ) + l_hidden_mapping = { + id: guids.generate_guid("hidden_") for id in left._hidden_column_ids + } + r_hidden_mapping = { + id: guids.generate_guid("hidden_") for id in right._hidden_column_ids + } + l_mapping = {**join.get_left_mapping(), **l_hidden_mapping} r_mapping = {**join.get_right_mapping(), **r_hidden_mapping} diff --git a/bigframes/core/joins/__init__.py b/bigframes/core/joins/__init__.py index 415ee4e49d..3c5b9605a3 100644 --- a/bigframes/core/joins/__init__.py +++ b/bigframes/core/joins/__init__.py @@ -15,6 +15,7 @@ """Helpers to join ArrayValue objects.""" from bigframes.core.joins.merge import merge -from bigframes.core.joins.name_resolution import JoinNameRemapper -__all__ = ("merge", "JoinNameRemapper") +__all__ = [ + "merge", +] diff --git a/bigframes/core/joins/name_resolution.py b/bigframes/core/joins/name_resolution.py deleted file mode 100644 index f648d28ad2..0000000000 --- 
a/bigframes/core/joins/name_resolution.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations - -from typing import Mapping, Sequence, Tuple - - -class JoinNameRemapper: - def __init__(self, namespace: str) -> None: - self._namespace = namespace - - def __call__( - self, left_column_ids: Sequence[str], right_column_ids: Sequence[str] - ) -> Tuple[Mapping[str, str], Mapping[str, str]]: - """ - When joining column ids from different namespaces, this function defines how names are remapped. - - Take care to map value column ids and hidden column ids in separate namespaces. This is important because value - column ids must be deterministic as they are referenced by dependent operators. The generation of hidden ids is - dependent on compilation context, and should be completely separated from value column id mappings. - """ - # This naming strategy depends on the number of value columns in source tables. 
- # This means column id mappings must be adjusted if pushing operations above or below join in transformation - new_left_ids = { - col: f"{self._namespace}_l_{i}" for i, col in enumerate(left_column_ids) - } - new_right_ids = { - col: f"{self._namespace}_r_{i}" for i, col in enumerate(right_column_ids) - } - return new_left_ids, new_right_ids diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index f5be4421e4..91c3eb603b 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -59,6 +59,7 @@ import bigframes.core.expression as ex import bigframes.core.global_session as global_session import bigframes.core.indexes +import bigframes.core.joins import bigframes.core.reshape import bigframes.core.tools import bigframes.dataframe From 4ec80340459e675b82b437f6c48b2872d362bafe Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 11 Apr 2024 11:24:38 -0700 Subject: [PATCH 19/23] feat: Add Series.autocorr (#605) --- bigframes/series.py | 3 ++ tests/system/small/test_series.py | 8 +++++ .../bigframes_vendored/pandas/core/series.py | 32 +++++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/bigframes/series.py b/bigframes/series.py index b975979eaf..f11511f969 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -791,6 +791,9 @@ def corr(self, other: Series, method="pearson", min_periods=None) -> float: ) return self._apply_binary_aggregation(other, agg_ops.CorrOp()) + def autocorr(self, lag: int = 1) -> float: + return self.corr(self.shift(lag)) + def cov(self, other: Series) -> float: return self._apply_binary_aggregation(other, agg_ops.CovOp()) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c93af1bf2f..6e4a87df4f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -718,6 +718,14 @@ def test_series_corr(scalars_dfs): assert math.isclose(pd_result, bf_result) +@skip_legacy_pandas +def test_series_autocorr(scalars_dfs): + 
scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["float64_col"].autocorr(2) + pd_result = scalars_pandas_df["float64_col"].autocorr(2) + assert math.isclose(pd_result, bf_result) + + def test_series_cov(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["int64_too"].cov(scalars_df["int64_too"]) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 572f29ff17..192e19fa5a 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -844,6 +844,38 @@ def corr(self, other, method="pearson", min_periods=None) -> float: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def autocorr(self, lag: int = 1) -> float: + """ + Compute the lag-N autocorrelation. + + This method computes the Pearson correlation between + the Series and its shifted self. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) + >>> s.autocorr() # doctest: +ELLIPSIS + 0.10355... + >>> s.autocorr(lag=2) + -1.0 + + If the Pearson correlation is not well defined, then 'NaN' is returned. + + >>> s = bpd.Series([1, 0, 0, 0]) + >>> s.autocorr() + nan + + Args: + lag (int, default 1): + Number of lags to apply before performing autocorrelation. + + Returns: + float: The Pearson correlation between self and self.shift(lag). 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def cov( self, other, From 8fc26c424b29a8b78542372e402fcc4e8fface7b Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 12 Apr 2024 05:27:17 +0000 Subject: [PATCH 20/23] docs: add docs for `DataFrame` and `Series` dunder methods (#562) * docs: add docs for `DataFrame.{radd,__add__,__radd__}` * fix rendering, revert ineffective changes, add __eq__ * newline * docs for more df dunders * fix mypy errors and couple of wordings * fix sub and rmod, add docs for __bool__, __nonzero__, __getattr__ * add documentation for Series dunders * fix doctest failure with python 3.12 * move docstrings to third_party for compliance safety * add DataFrame.__getitem__ docstring and code samples * add dunder doc overrides from third_party --- bigframes/dataframe.py | 160 ++- bigframes/series.py | 71 +- .../bigframes_vendored/pandas/core/frame.py | 1050 ++++++++++++++++- .../bigframes_vendored/pandas/core/generic.py | 30 + .../bigframes_vendored/pandas/core/series.py | 567 ++++++++- 5 files changed, 1803 insertions(+), 75 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 32f5a36f79..2deef95277 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -17,6 +17,7 @@ from __future__ import annotations import datetime +import inspect import re import sys import textwrap @@ -314,6 +315,8 @@ def __len__(self): rows, _ = self.shape return rows + __len__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__len__) + def __iter__(self): return iter(self.columns) @@ -466,7 +469,6 @@ def __getitem__( bigframes.series.Series, ], ): # No return type annotations (like pandas) as type cannot always be determined statically - """Gets the specified column(s) from the DataFrame.""" # NOTE: This implements the operations described in # https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html @@ -498,6 +500,8 @@ def __getitem__( return 
DataFrame(self._block.select_columns(selected_ids)) + __getitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__getitem__) + def _getitem_label(self, key: blocks.Label): col_ids = self._block.cols_matching_label(key) if len(col_ids) == 0: @@ -642,14 +646,11 @@ def _repr_html_(self) -> str: return html_string def __setitem__(self, key: str, value: SingleItemValue): - """Modify or insert a column into the DataFrame. - - Note: This does **not** modify the original table the DataFrame was - derived from. - """ df = self._assign_single_item(key, value) self._set_block(df._get_block()) + __setitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__setitem__) + def _apply_binop( self, other: float | int | bigframes.series.Series | DataFrame, @@ -838,32 +839,50 @@ def _apply_dataframe_binop( def eq(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.eq_op, axis=axis) + def __eq__(self, other) -> DataFrame: # type: ignore + return self.eq(other) + + __eq__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__eq__) + def ne(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.ne_op, axis=axis) - __eq__ = eq # type: ignore + def __ne__(self, other) -> DataFrame: # type: ignore + return self.ne(other) - __ne__ = ne # type: ignore + __ne__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__ne__) def le(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.le_op, axis=axis) + def __le__(self, other) -> DataFrame: + return self.le(other) + + __le__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__le__) + def lt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.lt_op, axis=axis) + def __lt__(self, other) -> DataFrame: + return self.lt(other) + + __lt__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__lt__) + def ge(self, 
other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.ge_op, axis=axis) - def gt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: - return self._apply_binop(other, ops.gt_op, axis=axis) + def __ge__(self, other) -> DataFrame: + return self.ge(other) - __lt__ = lt + __ge__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__ge__) - __le__ = le + def gt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: + return self._apply_binop(other, ops.gt_op, axis=axis) - __gt__ = gt + def __gt__(self, other) -> DataFrame: + return self.gt(other) - __ge__ = ge + __gt__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__gt__) def add( self, @@ -874,7 +893,21 @@ def add( # TODO(swast): Support level parameter with MultiIndex. return self._apply_binop(other, ops.add_op, axis=axis) - __radd__ = __add__ = radd = add + def radd( + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", + ) -> DataFrame: + # TODO(swast): Support fill_value parameter. + # TODO(swast): Support level parameter with MultiIndex. 
+ return self.add(other, axis=axis) + + def __add__(self, other) -> DataFrame: + return self.add(other) + + __add__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__add__) + + __radd__ = __add__ def sub( self, @@ -883,7 +916,13 @@ def sub( ) -> DataFrame: return self._apply_binop(other, ops.sub_op, axis=axis) - __sub__ = subtract = sub + subtract = sub + subtract.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.sub) + + def __sub__(self, other): + return self.sub(other) + + __sub__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__sub__) def rsub( self, @@ -892,7 +931,10 @@ def rsub( ) -> DataFrame: return self._apply_binop(other, ops.sub_op, axis=axis, reverse=True) - __rsub__ = rsub + def __rsub__(self, other): + return self.rsub(other) + + __rsub__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rsub__) def mul( self, @@ -901,7 +943,25 @@ def mul( ) -> DataFrame: return self._apply_binop(other, ops.mul_op, axis=axis) - __rmul__ = __mul__ = rmul = multiply = mul + multiply = mul + multiply.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.mul) + + def __mul__(self, other): + return self.mul(other) + + __mul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__mul__) + + def rmul( + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", + ) -> DataFrame: + return self.mul(other, axis=axis) + + def __rmul__(self, other): + return self.rmul(other) + + __rmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rmul__) def truediv( self, @@ -910,7 +970,13 @@ def truediv( ) -> DataFrame: return self._apply_binop(other, ops.div_op, axis=axis) - div = divide = __truediv__ = truediv + truediv.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.truediv) + div = divide = truediv + + def __truediv__(self, other): + return self.truediv(other) + + __truediv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__truediv__) def rtruediv( self, @@ -919,7 +985,13 @@ 
def rtruediv( ) -> DataFrame: return self._apply_binop(other, ops.div_op, axis=axis, reverse=True) - __rtruediv__ = rdiv = rtruediv + rdiv = rtruediv + rdiv.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.rtruediv) + + def __rtruediv__(self, other): + return self.rtruediv(other) + + __rtruediv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rtruediv__) def floordiv( self, @@ -928,7 +1000,10 @@ def floordiv( ) -> DataFrame: return self._apply_binop(other, ops.floordiv_op, axis=axis) - __floordiv__ = floordiv + def __floordiv__(self, other): + return self.floordiv(other) + + __floordiv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__floordiv__) def rfloordiv( self, @@ -937,31 +1012,48 @@ def rfloordiv( ) -> DataFrame: return self._apply_binop(other, ops.floordiv_op, axis=axis, reverse=True) - __rfloordiv__ = rfloordiv + def __rfloordiv__(self, other): + return self.rfloordiv(other) + + __rfloordiv__.__doc__ = inspect.getdoc( + vendored_pandas_frame.DataFrame.__rfloordiv__ + ) def mod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore return self._apply_binop(other, ops.mod_op, axis=axis) + def __mod__(self, other): + return self.mod(other) + + __mod__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__mod__) + def rmod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore return self._apply_binop(other, ops.mod_op, axis=axis, reverse=True) - __mod__ = mod + def __rmod__(self, other): + return self.rmod(other) - __rmod__ = rmod + __rmod__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rmod__) def pow( self, other: int | bigframes.series.Series, axis: str | int = "columns" ) -> DataFrame: return self._apply_binop(other, ops.pow_op, axis=axis) + def __pow__(self, other): + return self.pow(other) + + __pow__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__pow__) + def rpow( self, 
other: int | bigframes.series.Series, axis: str | int = "columns" ) -> DataFrame: return self._apply_binop(other, ops.pow_op, axis=axis, reverse=True) - __pow__ = pow + def __rpow__(self, other): + return self.rpow(other) - __rpow__ = rpow + __rpow__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rpow__) def align( self, @@ -1971,6 +2063,7 @@ def prod( return bigframes.series.Series(block.select_column("values")) product = prod + product.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.prod) def count(self, *, numeric_only: bool = False) -> bigframes.series.Series: if not numeric_only: @@ -2010,6 +2103,7 @@ def agg( ) aggregate = agg + aggregate.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.agg) def idxmin(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmin(self._block)) @@ -2083,6 +2177,7 @@ def kurt(self, *, numeric_only: bool = False): return bigframes.series.Series(result_block) kurtosis = kurt + kurtosis.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.kurt) def _pivot( self, @@ -2542,11 +2637,13 @@ def isna(self) -> DataFrame: return self._apply_unary_op(ops.isnull_op) isnull = isna + isnull.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.isna) def notna(self) -> DataFrame: return self._apply_unary_op(ops.notnull_op) notnull = notna + notnull.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.notna) def cumsum(self): is_numeric_types = [ @@ -2860,7 +2957,10 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - __array__ = to_numpy + def __array__(self, dtype=None) -> numpy.ndarray: + return self.to_numpy(dtype=dtype) + + __array__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__array__) def to_parquet( self, @@ -3227,6 +3327,7 @@ def first_valid_index(self): return applymap = map + applymap.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.map) def _slice( self, @@ -3367,4 +3468,7 @@ def get_right_id(id): def 
plot(self): return plotting.PlotAccessor(self) - __matmul__ = dot + def __matmul__(self, other) -> DataFrame: + return self.dot(other) + + __matmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__matmul__) diff --git a/bigframes/series.py b/bigframes/series.py index f11511f969..2f9123f9a3 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -17,6 +17,7 @@ from __future__ import annotations import functools +import inspect import itertools import numbers import os @@ -180,6 +181,8 @@ def _set_internal_query_job(self, query_job: bigquery.QueryJob): def __len__(self): return self.shape[0] + __len__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__len__) + def __iter__(self) -> typing.Iterator: self._optimize_query_complexity() return itertools.chain.from_iterable( @@ -423,6 +426,7 @@ def ffill(self, *, limit: typing.Optional[int] = None) -> Series: return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill + pad.__doc__ = inspect.getdoc(vendored_pandas_series.Series.ffill) def bfill(self, *, limit: typing.Optional[int] = None) -> Series: window = bigframes.core.window_spec.WindowSpec(preceding=0, following=limit) @@ -609,28 +613,38 @@ def isna(self) -> "Series": return self._apply_unary_op(ops.isnull_op) isnull = isna + isnull.__doc__ = inspect.getdoc(vendored_pandas_series.Series.isna) def notna(self) -> "Series": return self._apply_unary_op(ops.notnull_op) notnull = notna + notnull.__doc__ = inspect.getdoc(vendored_pandas_series.Series.notna) def __and__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.and_op) + __and__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__and__) + __rand__ = __and__ def __or__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.or_op) + __or__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__or__) + __ror__ = __or__ def __add__(self, other: float | int | Series) -> Series: return self.add(other) + 
__add__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__add__) + def __radd__(self, other: float | int | Series) -> Series: return self.radd(other) + __radd__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__radd__) + def add(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.add_op) @@ -640,9 +654,13 @@ def radd(self, other: float | int | Series) -> Series: def __sub__(self, other: float | int | Series) -> Series: return self.sub(other) + __sub__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__sub__) + def __rsub__(self, other: float | int | Series) -> Series: return self.rsub(other) + __rsub__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rsub__) + def sub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.sub_op) @@ -650,13 +668,18 @@ def rsub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.sub_op, reverse=True) subtract = sub + subtract.__doc__ = inspect.getdoc(vendored_pandas_series.Series.sub) def __mul__(self, other: float | int | Series) -> Series: return self.mul(other) + __mul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__mul__) + def __rmul__(self, other: float | int | Series) -> Series: return self.rmul(other) + __rmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmul__) + def mul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.mul_op) @@ -664,31 +687,40 @@ def rmul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.mul_op, reverse=True) multiply = mul + multiply.__doc__ = inspect.getdoc(vendored_pandas_series.Series.mul) def __truediv__(self, other: float | int | Series) -> Series: return self.truediv(other) + __truediv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__truediv__) + def __rtruediv__(self, other: float | int | Series) -> Series: return self.rtruediv(other) + __rtruediv__.__doc__ = 
inspect.getdoc(vendored_pandas_series.Series.__rtruediv__) + def truediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.div_op) def rtruediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.div_op, reverse=True) - div = truediv - - divide = truediv + truediv.__doc__ = inspect.getdoc(vendored_pandas_series.Series.truediv) + div = divide = truediv rdiv = rtruediv + rdiv.__doc__ = inspect.getdoc(vendored_pandas_series.Series.rtruediv) def __floordiv__(self, other: float | int | Series) -> Series: return self.floordiv(other) + __floordiv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__floordiv__) + def __rfloordiv__(self, other: float | int | Series) -> Series: return self.rfloordiv(other) + __rfloordiv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rfloordiv__) + def floordiv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.floordiv_op) @@ -698,9 +730,13 @@ def rfloordiv(self, other: float | int | Series) -> Series: def __pow__(self, other: float | int | Series) -> Series: return self.pow(other) + __pow__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__pow__) + def __rpow__(self, other: float | int | Series) -> Series: return self.rpow(other) + __rpow__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rpow__) + def pow(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.pow_op) @@ -734,9 +770,13 @@ def ge(self, other) -> Series: def __mod__(self, other) -> Series: # type: ignore return self.mod(other) + __mod__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__mod__) + def __rmod__(self, other) -> Series: # type: ignore return self.rmod(other) + __rmod__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmod__) + def mod(self, other) -> Series: # type: ignore return self._apply_binary_op(other, ops.mod_op) @@ -753,10 +793,18 @@ def rdivmod(self, other) -> Tuple[Series, 
Series]: # type: ignore # the output should be dtype float, both floordiv and mod returns dtype int in this case. return (self.rfloordiv(other), self.rmod(other)) - def __matmul__(self, other): + def dot(self, other): return (self * other).sum() - dot = __matmul__ + def __matmul__(self, other): + return self.dot(other) + + __matmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__matmul__) + + def __rmatmul__(self, other): + return self.dot(other) + + __rmatmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmatmul__) def combine_first(self, other: Series) -> Series: result = self._apply_binary_op(other, ops.coalesce_op) @@ -849,6 +897,7 @@ def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: ) aggregate = agg + aggregate.__doc__ = inspect.getdoc(vendored_pandas_series.Series.agg) def skew(self): count = self.count() @@ -883,6 +932,7 @@ def kurt(self): return (numerator / denominator) - adjustment kurtosis = kurt + kurtosis.__doc__ = inspect.getdoc(vendored_pandas_series.Series.kurt) def mode(self) -> Series: block = self._block @@ -930,6 +980,7 @@ def prod(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.product_op)) product = prod + product.__doc__ = inspect.getdoc(vendored_pandas_series.Series.prod) def __eq__(self, other: object) -> Series: # type: ignore return self.eq(other) @@ -940,6 +991,8 @@ def __ne__(self, other: object) -> Series: # type: ignore def __invert__(self) -> Series: return self._apply_unary_op(ops.invert_op) + __invert__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__invert__) + def eq(self, other: object) -> Series: # TODO: enforce stricter alignment return self._apply_binary_op(other, ops.eq_op) @@ -1074,6 +1127,8 @@ def __getitem__(self, indexer): return Series(block) return self.loc[indexer] + __getitem__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__getitem__) + def __getattr__(self, key: str): if hasattr(pandas.Series, key): raise AttributeError( 
@@ -1461,6 +1516,7 @@ def tolist(self) -> list: return self.to_pandas().to_list() to_list = tolist + to_list.__doc__ = inspect.getdoc(vendored_pandas_series.Series.tolist) def to_markdown( self, @@ -1476,7 +1532,10 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - __array__ = to_numpy + def __array__(self, dtype=None) -> numpy.ndarray: + return self.to_numpy(dtype=dtype) + + __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) def to_pickle(self, path, **kwargs) -> None: return self.to_pandas().to_pickle(path, **kwargs) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 1fc80449d1..6707dc1403 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -339,6 +339,7 @@ def to_gbq( [2 rows x 2 columns] Write a DataFrame to a BigQuery table with clustering columns: + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}) >>> clustering_cols = ['col1', 'col3'] >>> df.to_gbq( @@ -910,28 +911,6 @@ def to_orc(self, path=None, **kwargs) -> bytes | None: # ---------------------------------------------------------------------- # Unsorted - def equals(self, other) -> bool: - """ - Test whether two objects contain the same elements. - - This function allows two Series or DataFrames to be compared against - each other to see if they have the same shape and elements. NaNs in - the same location are considered equal. - - The row/column index do not need to have the same type, as long - as the values are considered equal. Corresponding columns must be of - the same dtype. - - Args: - other (Series or DataFrame): - The other Series or DataFrame to be compared with the first. - - Returns: - bool: True if all elements are the same in both objects, False - otherwise. 
- """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def assign(self, **kwargs) -> DataFrame: r""" Assign new columns to a DataFrame. @@ -1208,7 +1187,6 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: Set the name of the axis for the index. .. note:: - Currently only accepts a single string parameter (the new name of the index). Args: @@ -1862,7 +1840,7 @@ def sort_index( raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- - # Arithmetic Methods + # Arithmetic and Logical Methods def eq(self, other, axis: str | int = "columns") -> DataFrame: """ @@ -1890,7 +1868,8 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: rectangle True Name: degrees, dtype: boolean - You can also use arithmetic operator ``==``: + You can also use logical operator `==`: + >>> df["degrees"] == 360 circle True triangle False @@ -1909,6 +1888,39 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __eq__(self, other): + """ + Check equality of DataFrame and other, element-wise, using logical + operator `==`. + + Equivalent to `DataFrame.eq(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, 3, 4], + ... 'b': [360, 0, 180] + ... }) + >>> df == 0 + a b + 0 True False + 1 False True + 2 False False + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame for equality. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def ne(self, other, axis: str | int = "columns") -> DataFrame: """ Get not equal to of DataFrame and other, element-wise (binary operator `ne`). 
@@ -1954,6 +1966,39 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __ne__(self, other): + """ + Check inequality of DataFrame and other, element-wise, using logical + operator `!=`. + + Equivalent to `DataFrame.ne(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, 3, 4], + ... 'b': [360, 0, 180] + ... }) + >>> df != 0 + a b + 0 False True + 1 True False + 2 True True + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame for inequality. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def le(self, other, axis: str | int = "columns") -> DataFrame: """Get 'less than or equal to' of dataframe and other, element-wise (binary operator `<=`). @@ -2004,6 +2049,39 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __le__(self, other): + """ + Check whether DataFrame is less than or equal to other, element-wise, + using logical operator `<=`. + + Equivalent to `DataFrame.le(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, -1, 1], + ... 'b': [1, 0, -1] + ... }) + >>> df <= 0 + a b + 0 True False + 1 True True + 2 False True + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def lt(self, other, axis: str | int = "columns") -> DataFrame: """Get 'less than' of DataFrame and other, element-wise (binary operator `<`). 
@@ -2054,6 +2132,39 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __lt__(self, other): + """ + Check whether DataFrame is less than other, element-wise, using logical + operator `<`. + + Equivalent to `DataFrame.lt(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, -1, 1], + ... 'b': [1, 0, -1] + ... }) + >>> df < 0 + a b + 0 False False + 1 True False + 2 False True + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def ge(self, other, axis: str | int = "columns") -> DataFrame: """Get 'greater than or equal to' of DataFrame and other, element-wise (binary operator `>=`). @@ -2104,6 +2215,39 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __ge__(self, other): + """ + Check whether DataFrame is greater than or equal to other, element-wise, + using logical operator `>=`. + + Equivalent to `DataFrame.ge(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, -1, 1], + ... 'b': [1, 0, -1] + ... }) + >>> df >= 0 + a b + 0 True True + 1 False True + 2 True False + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def gt(self, other, axis: str | int = "columns") -> DataFrame: """Get 'greater than' of DataFrame and other, element-wise (binary operator `>`). 
@@ -2152,6 +2296,39 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __gt__(self, other): + """ + Check whether DataFrame is greater than other, element-wise, using logical + operator `>`. + + Equivalent to `DataFrame.gt(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, -1, 1], + ... 'b': [1, 0, -1] + ... }) + >>> df > 0 + a b + 0 False True + 1 False False + 2 True False + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def add(self, other, axis: str | int = "columns") -> DataFrame: """Get addition of DataFrame and other, element-wise (binary operator `+`). @@ -2183,7 +2360,126 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: You can also use arithmetic operator ``+``: - >>> df['A'] + (df['B']) + >>> df['A'] + df['B'] + 0 5 + 1 7 + 2 9 + dtype: Int64 + + Args: + other (float, int, or Series): + Any single or multiple element data structure, or list-like object. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. + + Returns: + DataFrame: DataFrame result of the arithmetic operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __add__(self, other) -> DataFrame: + """Get addition of DataFrame and other, column-wise, using arithmetic + operator `+`. + + Equivalent to ``DataFrame.add(other)``. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'height': [1.5, 2.6], + ... 'weight': [500, 800] + ... }, + ...
index=['elk', 'moose']) + >>> df + height weight + elk 1.5 500 + moose 2.6 800 + + [2 rows x 2 columns] + + Adding a scalar affects all rows and columns. + + >>> df + 1.5 + height weight + elk 3.0 501.5 + moose 4.1 801.5 + + [2 rows x 2 columns] + + You can add another DataFrame with index and columns aligned. + + >>> delta = bpd.DataFrame({ + ... 'height': [0.5, 0.9], + ... 'weight': [50, 80] + ... }, + ... index=['elk', 'moose']) + >>> df + delta + height weight + elk 2.0 550 + moose 3.5 880 + + [2 rows x 2 columns] + + Adding any mis-aligned index and columns will result in invalid values. + + >>> delta = bpd.DataFrame({ + ... 'depth': [0.5, 0.9, 1.0], + ... 'weight': [50, 80, 100] + ... }, + ... index=['elk', 'moose', 'bison']) + >>> df + delta + depth height weight + elk 550 + moose 880 + bison + + [3 rows x 3 columns] + + Args: + other (scalar or DataFrame): + Object to be added to the DataFrame. + + Returns: + DataFrame: The result of adding `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def radd(self, other, axis: str | int = "columns") -> DataFrame: + """Get addition of DataFrame and other, element-wise (binary operator `+`). + + Equivalent to ``other + dataframe``. With reverse version, `add`. + + Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to + arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + + .. note:: + Mismatched indices will be unioned together. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 
}) + + You can use method name: + + >>> df['A'].radd(df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + + You can also use arithmetic operator ``+``: + + >>> df['A'] + df['B'] 0 5 1 7 2 9 @@ -2250,6 +2546,49 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __sub__(self, other): + """ + Get subtraction of other from DataFrame, element-wise, using operator `-`. + + Equivalent to `DataFrame.sub(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can subtract a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df - 2 + a b + 0 -1 2 + 1 0 3 + 2 1 4 + + [3 rows x 2 columns] + + You can also subtract another DataFrame with index and column labels + aligned: + + >>> df1 = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df - df1 + a b + 0 -1 1 + 1 0 2 + 2 1 3 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to subtract from the DataFrame. + + Returns: + DataFrame: The result of the subtraction. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rsub(self, other, axis: str | int = "columns") -> DataFrame: """Get subtraction of DataFrame and other, element-wise (binary operator `-`). @@ -2296,6 +2635,21 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rsub__(self, other): + """ + Get subtraction of DataFrame from other, element-wise, using operator `-`. + + Equivalent to `DataFrame.rsub(other)`. + + Args: + other (scalar or DataFrame): + Object to subtract the DataFrame from. + + Returns: + DataFrame: The result of the subtraction. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mul(self, other, axis: str | int = "columns") -> DataFrame: """Get multiplication of DataFrame and other, element-wise (binary operator `*`). @@ -2345,6 +2699,141 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __mul__(self, other): + """ + Get multiplication of DataFrame with other, element-wise, using operator `*`. + + Equivalent to `DataFrame.mul(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can multiply with a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df * 3 + a b + 0 3 12 + 1 6 15 + 2 9 18 + + [3 rows x 2 columns] + + You can also multiply with another DataFrame with index and column labels + aligned: + + >>> df1 = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df * df1 + a b + 0 2 12 + 1 4 15 + 2 6 18 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to multiply with the DataFrame. + + Returns: + DataFrame: The result of the multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def rmul(self, other, axis: str | int = "columns") -> DataFrame: + """Get multiplication of DataFrame and other, element-wise (binary operator `*`). + + Equivalent to ``other * dataframe``. With reverse version, `mul`. + + Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to + arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + + .. note:: + Mismatched indices will be unioned together. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 
}) + + You can use method name: + + >>> df['A'].rmul(df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + + You can also use arithmetic operator ``*``: + + >>> df['A'] * (df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + + Args: + other (float, int, or Series): + Any single or multiple element data structure, or list-like object. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. + + Returns: + DataFrame: DataFrame result of the arithmetic operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __rmul__(self, other): + """ + Get multiplication of DataFrame with other, element-wise, using operator `*`. + + Equivalent to `DataFrame.rmul(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can multiply with a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df * 3 + a b + 0 3 12 + 1 6 15 + 2 9 18 + + [3 rows x 2 columns] + + You can also multiply with another DataFrame with index and column labels + aligned: + + >>> df1 = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df * df1 + a b + 0 2 12 + 1 4 15 + 2 6 18 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to multiply the DataFrame with. + + Returns: + DataFrame: The result of the multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def truediv(self, other, axis: str | int = "columns") -> DataFrame: """Get floating division of DataFrame and other, element-wise (binary operator `/`). @@ -2383,14 +2872,57 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: dtype: Float64 Args: - other (float, int, or Series): - Any single or multiple element data structure, or list-like object. - axis ({0 or 'index', 1 or 'columns'}): - Whether to compare by the index (0 or 'index') or columns. 
- (1 or 'columns'). For Series input, axis to match Series index on. + other (float, int, or Series): + Any single or multiple element data structure, or list-like object. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. + + Returns: + DataFrame: DataFrame result of the arithmetic operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __truediv__(self, other): + """ + Get division of DataFrame by other, element-wise, using operator `/`. + + Equivalent to `DataFrame.truediv(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can divide by a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df / 2 + a b + 0 0.5 2.0 + 1 1.0 2.5 + 2 1.5 3.0 + + [3 rows x 2 columns] + + You can also divide by another DataFrame with index and column labels + aligned: + + >>> denominator = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df / denominator + a b + 0 0.5 1.333333 + 1 1.0 1.666667 + 2 1.5 2.0 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to divide the DataFrame by. Returns: - DataFrame: DataFrame result of the arithmetic operation. + DataFrame: The result of the division. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2440,6 +2972,21 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rtruediv__(self, other): + """ + Get division of other by DataFrame, element-wise, using operator `/`. + + Equivalent to `DataFrame.rtruediv(other)`. + + Args: + other (scalar or DataFrame): + Object to divide by the DataFrame. + + Returns: + DataFrame: The result of the division.
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def floordiv(self, other, axis: str | int = "columns") -> DataFrame: """Get integer division of DataFrame and other, element-wise (binary operator `//`). @@ -2489,6 +3036,49 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __floordiv__(self, other): + """ + Get integer division of DataFrame by other, using arithmetic operator `//`. + + Equivalent to `DataFrame.floordiv(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can divide by a scalar: + + >>> df = bpd.DataFrame({"a": [15, 15, 15], "b": [30, 30, 30]}) + >>> df // 2 + a b + 0 7 15 + 1 7 15 + 2 7 15 + + [3 rows x 2 columns] + + You can also divide by another DataFrame with index and column labels + aligned: + + >>> divisor = bpd.DataFrame({"a": [2, 3, 4], "b": [5, 6, 7]}) + >>> df // divisor + a b + 0 7 6 + 1 5 5 + 2 3 4 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to divide the DataFrame by. + + Returns: + DataFrame: The result of the integer division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: """Get integer division of DataFrame and other, element-wise (binary operator `//`). @@ -2535,6 +3125,21 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rfloordiv__(self, other): + """ + Get integer division of other by DataFrame. + + Equivalent to `DataFrame.rfloordiv(other)`. + + Args: + other (scalar or DataFrame): + Object to divide by the DataFrame. + + Returns: + DataFrame: The result of the integer division.
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mod(self, other, axis: str | int = "columns") -> DataFrame: """Get modulo of DataFrame and other, element-wise (binary operator `%`). @@ -2584,6 +3189,49 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __mod__(self, other): + """ + Get modulo of DataFrame with other, element-wise, using operator `%`. + + Equivalent to `DataFrame.mod(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can modulo with a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df % 3 + a b + 0 1 1 + 1 2 2 + 2 0 0 + + [3 rows x 2 columns] + + You can also modulo with another DataFrame with index and column labels + aligned: + + >>> modulo = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df % modulo + a b + 0 1 1 + 1 0 2 + 2 1 0 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to modulo the DataFrame by. + + Returns: + DataFrame: The result of the modulo. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rmod(self, other, axis: str | int = "columns") -> DataFrame: """Get modulo of DataFrame and other, element-wise (binary operator `%`). @@ -2630,6 +3278,21 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rmod__(self, other): + """ + Get modulo of other with DataFrame, element-wise, using operator `%`. + + Equivalent to `DataFrame.rmod(other)`. + + Args: + other (scalar or DataFrame): + Object to modulo by the DataFrame. + + Returns: + DataFrame: The result of the modulo. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pow(self, other, axis: str | int = "columns") -> DataFrame: """Get Exponential power of dataframe and other, element-wise (binary operator `**`).
@@ -2680,6 +3343,50 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __pow__(self, other): + """ + Get exponentiation of DataFrame with other, element-wise, using operator + `**`. + + Equivalent to `DataFrame.pow(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can exponentiate with a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df ** 2 + a b + 0 1 16 + 1 4 25 + 2 9 36 + + [3 rows x 2 columns] + + You can also exponentiate with another DataFrame with index and column + labels aligned: + + >>> exponent = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df ** exponent + a b + 0 1 64 + 1 4 125 + 2 9 216 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to exponentiate the DataFrame with. + + Returns: + DataFrame: The result of the exponentiation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rpow(self, other, axis: str | int = "columns") -> DataFrame: """Get Exponential power of dataframe and other, element-wise (binary operator `rpow`). @@ -2727,6 +3434,22 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rpow__(self, other): + """ + Get exponentiation of other with DataFrame, element-wise, using operator + `**`. + + Equivalent to `DataFrame.rpow(other)`. + + Args: + other (scalar or DataFrame): + Object to exponentiate with the DataFrame. + + Returns: + DataFrame: The result of the exponentiation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def combine( self, other, func, fill_value=None, overwrite: bool = True ) -> DataFrame: @@ -4102,7 +4825,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): performant. .. note:: - This function cannot be used with all column types. 
For example, when specifying columns with `object` or `category` dtypes, ``TypeError`` is raised. @@ -5074,6 +5796,7 @@ def eval(self, expr: str) -> DataFrame: injection if you pass user input to this function. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -5095,11 +5818,11 @@ def eval(self, expr: str) -> DataFrame: 4 7 dtype: Int64 - Assignment is allowed though by default the original DataFrame is not - modified. + Assignment is allowed though by default the original DataFrame is not + modified. >>> df.eval('C = A + B') - A B C + A B C 0 1 10 11 1 2 8 10 2 3 6 9 @@ -5108,7 +5831,7 @@ def eval(self, expr: str) -> DataFrame: [5 rows x 3 columns] >>> df - A B + A B 0 1 10 1 2 8 2 3 6 @@ -5117,7 +5840,7 @@ def eval(self, expr: str) -> DataFrame: [5 rows x 2 columns] - Multiple columns can be assigned to using multi-line expressions: + Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' @@ -5125,7 +5848,7 @@ def eval(self, expr: str) -> DataFrame: ... D = A - B ... ''' ... ) - A B C D + A B C D 0 1 10 11 -9 1 2 8 10 -6 2 3 6 9 -3 @@ -5149,6 +5872,7 @@ def query(self, expr: str) -> DataFrame | None: Query the columns of a DataFrame with a boolean expression. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -5521,6 +6245,7 @@ def dot(self, other): DataFrame and the index of other must contain the same values, as they will be aligned prior to the multiplication. + .. note:: The dot method for Series computes the inner product, instead of the matrix product here. @@ -5607,6 +6332,59 @@ def dot(self, other): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __matmul__(self, other): + """ + Compute the matrix multiplication between the DataFrame and other, using + operator `@`. + + Equivalent to `DataFrame.dot(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) + >>> left + 0 1 2 3 + 0 0 1 -2 -1 + 1 1 1 1 1 + + [2 rows x 4 columns] + >>> right = bpd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> right + 0 1 + 0 0 1 + 1 1 2 + 2 -1 -1 + 3 2 0 + + [4 rows x 2 columns] + >>> left @ right + 0 1 + 0 1 4 + 1 2 2 + + [2 rows x 2 columns] + + The operand can be a Series, in which case the result will also be a + Series: + + >>> right = bpd.Series([1, 2, -1,0]) + >>> left @ right + 0 4 + 1 2 + dtype: Int64 + + Args: + other (DataFrame or Series): + Object to be matrix multiplied with the DataFrame. + + Returns: + DataFrame or Series: The result of the matrix multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def plot(self): """ @@ -5617,3 +6395,197 @@ def plot(self): An accessor making plots. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __len__(self): + """Returns number of rows in the DataFrame, serves `len` operator. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, 1, 2], + ... 'b': [3, 4, 5] + ... }) + >>> len(df) + 3 + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __array__(self): + """ + Returns the rows as NumPy array. + + Equivalent to `DataFrame.to_numpy(dtype)`. + + Users should not call this directly. Rather, it is invoked by + `numpy.array` and `numpy.asarray`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) + + >>> np.array(df) + array([[1, 11], + [2, 22], + [3, 33]], dtype=object) + + >>> np.asarray(df) + array([[1, 11], + [2, 22], + [3, 33]], dtype=object) + + Args: + dtype (str or numpy.dtype, optional): + The dtype to use for the resulting NumPy array. By default, + the dtype is inferred from the data. + + Returns: + numpy.ndarray: + The rows in the DataFrame converted to a `numpy.ndarray` with + the specified dtype. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __getitem__(self, key): + """Gets the specified column(s) from the DataFrame. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... "name" : ["alpha", "beta", "gamma"], + ... "age": [20, 30, 40], + ... "location": ["WA", "NY", "CA"] + ... }) + >>> df + name age location + 0 alpha 20 WA + 1 beta 30 NY + 2 gamma 40 CA + + [3 rows x 3 columns] + + You can specify a column label to retrieve the corresponding Series. + + >>> df["name"] + 0 alpha + 1 beta + 2 gamma + Name: name, dtype: string + + You can specify a list of column labels to retrieve a Dataframe. + + >>> df[["name", "age"]] + name age + 0 alpha 20 + 1 beta 30 + 2 gamma 40 + + [3 rows x 2 columns] + + You can specify a condition as a series of booleans to retrieve matching + rows. + + >>> df[df["age"] > 25] + name age location + 1 beta 30 NY + 2 gamma 40 CA + + [2 rows x 3 columns] + + You can specify a pandas Index with desired column labels. + + >>> import pandas as pd + >>> df[pd.Index(["age", "location"])] + age location + 0 20 WA + 1 30 NY + 2 40 CA + + [3 rows x 2 columns] + + Args: + key (index): + Index or list of indices. 
It can be a column label, a list of + column labels, a Series of booleans or a pandas Index of desired + column labels + + Returns: + Series or DataFrame: The selected column(s) or matching rows. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __setitem__(self, key, value): + """Modify or insert a column into the DataFrame. + + .. note:: + This does **not** modify the original table the DataFrame was + derived from. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... "name" : ["alpha", "beta", "gamma"], + ... "age": [20, 30, 40], + ... "location": ["WA", "NY", "CA"] + ... }) + >>> df + name age location + 0 alpha 20 WA + 1 beta 30 NY + 2 gamma 40 CA + + [3 rows x 3 columns] + + You can assign a constant to a new column. + + >>> df["country"] = "USA" + >>> df + name age location country + 0 alpha 20 WA USA + 1 beta 30 NY USA + 2 gamma 40 CA USA + + [3 rows x 4 columns] + + You can assign a Series to a new column. + + >>> df["new_age"] = df["age"] + 5 + >>> df + name age location country new_age + 0 alpha 20 WA USA 25 + 1 beta 30 NY USA 35 + 2 gamma 40 CA USA 45 + + [3 rows x 5 columns] + + You can assign a Series to an existing column. + + >>> df["new_age"] = bpd.Series([29, 39, 19], index=[1, 2, 0]) + >>> df + name age location country new_age + 0 alpha 20 WA USA 19 + 1 beta 30 NY USA 29 + 2 gamma 40 CA USA 39 + + [3 rows x 5 columns] + + Args: + key (column index): + It can be a new column to be inserted, or an existing column to + be modified.
+ value (scalar or Series): + Value to be assigned to the column + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index baa9534a0e..9c6120fd6c 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -1120,9 +1120,39 @@ def pipe( return common.pipe(self, func, *args, **kwargs) def __nonzero__(self): + """Returns the truth value of the object.""" raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) __bool__ = __nonzero__ + + def __getattr__(self, name: str): + """ + After regular attribute access, try looking up the name + This allows simpler access to columns for interactive use. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def equals(self, other) -> bool: + """ + Test whether two objects contain the same elements. + + This function allows two Series or DataFrames to be compared against + each other to see if they have the same shape and elements. NaNs in + the same location are considered equal. + + The row/column index do not need to have the same type, as long + as the values are considered equal. Corresponding columns must be of + the same dtype. + + Args: + other (Series or DataFrame): + The other Series or DataFrame to be compared with the first. + + Returns: + bool: True if all elements are the same in both objects, False + otherwise. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 192e19fa5a..46bc9714f8 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -6,6 +6,7 @@ from typing import Hashable, IO, Literal, Mapping, Optional, Sequence, TYPE_CHECKING from bigframes_vendored.pandas.core.generic import NDFrame +import numpy import numpy as np from pandas._libs import lib from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer @@ -961,13 +962,13 @@ def dot(self, other) -> Series | np.ndarray: def __matmul__(self, other): """ - Matrix multiplication using binary `@` operator in Python>=3.5. + Matrix multiplication using binary `@` operator. """ return NotImplemented def __rmatmul__(self, other): """ - Matrix multiplication using binary `@` operator in Python>=3.5. + Matrix multiplication using binary `@` operator. """ return NotImplemented @@ -2173,6 +2174,55 @@ def add(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __add__(self, other): + """Get addition of Series and other, element-wise, using operator `+`. + + Equivalent to `Series.add(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) + >>> s + elk 1.5 + moose 2.6 + dtype: Float64 + + You can add a scalar. + + >>> s + 1.5 + elk 3.0 + moose 4.1 + dtype: Float64 + + You can add another Series with index aligned. + + >>> delta = bpd.Series([1.5, 2.6], index=['elk', 'moose']) + >>> s + delta + elk 3.0 + moose 5.2 + dtype: Float64 + + Adding any mis-aligned index will result in invalid values. 
+ + >>> delta = bpd.Series([1.5, 2.6], index=['moose', 'bison']) + >>> s + delta + elk + moose 4.1 + bison + dtype: Float64 + + Args: + other (scalar or Series): + Object to be added to the Series. + + Returns: + Series: The result of adding `other` to Series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def radd(self, other) -> Series: """Return addition of Series and other, element-wise (binary operator radd). @@ -2188,6 +2238,20 @@ def radd(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __radd__(self, other): + """Get addition of Series and other, element-wise, using operator `+`. + + Equivalent to `Series.radd(other)`. + + Args: + other (scalar or Series): + Object to which Series should be added. + + Returns: + Series: The result of adding Series to `other`. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def sub( self, other, @@ -2206,6 +2270,55 @@ def sub( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __sub__(self, other): + """Get subtraction of other from Series, element-wise, using operator `-`. + + Equivalent to `Series.sub(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) + >>> s + elk 1.5 + moose 2.6 + dtype: Float64 + + You can subtract a scalar. + + >>> s - 1.5 + elk 0.0 + moose 1.1 + dtype: Float64 + + You can subtract another Series with index aligned. + + >>> delta = bpd.Series([0.5, 1.0], index=['elk', 'moose']) + >>> s - delta + elk 1.0 + moose 1.6 + dtype: Float64 + + Subtracting any mis-aligned index will result in invalid values. + + >>> delta = bpd.Series([0.5, 1.0], index=['moose', 'bison']) + >>> s - delta + elk + moose 2.1 + bison + dtype: Float64 + + Args: + other (scalar or Series): + Object to subtract from the Series. + + Returns: + Series: The result of subtraction.
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rsub(self, other) -> Series: """Return subtraction of Series and other, element-wise (binary operator rsub). @@ -2221,6 +2334,20 @@ def rsub(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rsub__(self, other): + """Get subtraction of Series from other, element-wise, using operator `-`. + + Equivalent to `Series.rsub(other)`. + + Args: + other (scalar or Series): + Object to subtract the Series from. + + Returns: + Series: The result of subtraction. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mul(self, other) -> Series: """Return multiplication of Series and other, element-wise (binary operator mul). @@ -2236,6 +2363,44 @@ def mul(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __mul__(self, other): + """ + Get multiplication of Series with other, element-wise, using operator `*`. + + Equivalent to `Series.mul(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can multiply with a scalar: + + >>> s = bpd.Series([1, 2, 3]) + >>> s * 3 + 0 3 + 1 6 + 2 9 + dtype: Int64 + + You can also multiply with another Series: + + >>> s1 = bpd.Series([2, 3, 4]) + >>> s * s1 + 0 2 + 1 6 + 2 12 + dtype: Int64 + + Args: + other (scalar or Series): + Object to multiply with the Series. + + Returns: + Series: The result of the multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rmul(self, other) -> Series: """Return multiplication of Series and other, element-wise (binary operator mul). @@ -2250,6 +2415,21 @@ def rmul(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rmul__(self, other): + """ + Get multiplication of other with Series, element-wise, using operator `*`. 
+ + Equivalent to `Series.rmul(other)`. + + Args: + other (scalar or Series): + Object to multiply the Series with. + + Returns: + Series: The result of the multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def truediv(self, other) -> Series: """Return floating division of Series and other, element-wise (binary operator truediv). @@ -2265,6 +2445,44 @@ def truediv(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __truediv__(self, other): + """ + Get division of Series by other, element-wise, using operator `/`. + + Equivalent to `Series.truediv(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can divide by a scalar: + + >>> s = bpd.Series([1, 2, 3]) + >>> s / 2 + 0 0.5 + 1 1.0 + 2 1.5 + dtype: Float64 + + You can also divide by another Series: + + >>> denominator = bpd.Series([2, 3, 4]) + >>> s / denominator + 0 0.5 + 1 0.666667 + 2 0.75 + dtype: Float64 + + Args: + other (scalar or Series): + Object to divide the Series by. + + Returns: + Series: The result of the division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rtruediv(self, other) -> Series: """Return floating division of Series and other, element-wise (binary operator rtruediv). @@ -2280,6 +2498,21 @@ def rtruediv(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rtruediv__(self, other): + """ + Get division of other by Series, element-wise, using operator `/`. + + Equivalent to `Series.rtruediv(other)`. + + Args: + other (scalar or Series): + Object to divide by the Series. + + Returns: + Series: The result of the division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def floordiv(self, other) -> Series: """Return integer division of Series and other, element-wise (binary operator floordiv).
@@ -2295,6 +2528,44 @@ def floordiv(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __floordiv__(self, other): + """ + Get integer division of Series by other, using arithmetic operator `//`. + + Equivalent to `Series.floordiv(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can divide by a scalar: + + >>> s = bpd.Series([15, 30, 45]) + >>> s // 2 + 0 7 + 1 15 + 2 22 + dtype: Int64 + + You can also divide by another Series: + + >>> divisor = bpd.Series([3, 4, 4]) + >>> s // divisor + 0 5 + 1 7 + 2 11 + dtype: Int64 + + Args: + other (scalar or Series): + Object to divide the Series by. + + Returns: + Series: The result of the integer division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rfloordiv(self, other) -> Series: """Return integer division of Series and other, element-wise (binary operator rfloordiv). @@ -2310,6 +2581,21 @@ def rfloordiv(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rfloordiv__(self, other): + """ + Get integer division of other by Series, using arithmetic operator `//`. + + Equivalent to `Series.rfloordiv(other)`. + + Args: + other (scalar or Series): + Object to divide by the Series. + + Returns: + Series: The result of the integer division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mod(self, other) -> Series: """Return modulo of Series and other, element-wise (binary operator mod). @@ -2325,6 +2611,44 @@ def mod(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __mod__(self, other): + """ + Get modulo of Series with other, element-wise, using operator `%`. + + Equivalent to `Series.mod(other)`.
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can modulo with a scalar: + + >>> s = bpd.Series([1, 2, 3]) + >>> s % 3 + 0 1 + 1 2 + 2 0 + dtype: Int64 + + You can also modulo with another Series: + + >>> modulo = bpd.Series([3, 3, 3]) + >>> s % modulo + 0 1 + 1 2 + 2 0 + dtype: Int64 + + Args: + other (scalar or Series): + Object to modulo the Series by. + + Returns: + Series: The result of the modulo. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rmod(self, other) -> Series: """Return modulo of Series and other, element-wise (binary operator mod). @@ -2340,6 +2664,21 @@ def rmod(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rmod__(self, other): + """ + Get modulo of other with Series, element-wise, using operator `%`. + + Equivalent to `Series.rmod(other)`. + + Args: + other (scalar or Series): + Object to modulo by the Series. + + Returns: + Series: The result of the modulo. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pow(self, other) -> Series: """Return Exponential power of series and other, element-wise (binary operator `pow`). @@ -2355,6 +2694,45 @@ def pow(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __pow__(self, other): + """ + Get exponentiation of Series with other, element-wise, using operator + `**`. + + Equivalent to `Series.pow(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can exponentiate with a scalar: + + >>> s = bpd.Series([1, 2, 3]) + >>> s ** 2 + 0 1 + 1 4 + 2 9 + dtype: Int64 + + You can also exponentiate with another Series: + + >>> exponent = bpd.Series([3, 2, 1]) + >>> s ** exponent + 0 1 + 1 4 + 2 3 + dtype: Int64 + + Args: + other (scalar or Series): + Object to exponentiate the Series with. 
+ + Returns: + Series: The result of the exponentiation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rpow(self, other) -> Series: """Return Exponential power of series and other, element-wise (binary operator `rpow`). @@ -2370,6 +2748,22 @@ def rpow(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rpow__(self, other): + """ + Get exponentiation of other with Series, element-wise, using operator + `**`. + + Equivalent to `Series.rpow(other)`. + + Args: + other (scalar or Series): + Object to exponentiate with the Series. + + Returns: + Series: The result of the exponentiation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def divmod(self, other) -> Series: """Return integer division and modulo of Series and other, element-wise (binary operator divmod). @@ -3574,3 +3968,172 @@ def size(self) -> int: int: Return the number of elements in the underlying data. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __array__(self, dtype=None) -> numpy.ndarray: + """ + Returns the values as NumPy array. + + Equivalent to `Series.to_numpy(dtype)`. + + Users should not call this directly. Rather, it is invoked by + `numpy.array` and `numpy.asarray`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + + >>> ser = bpd.Series([1, 2, 3]) + + >>> np.asarray(ser) + array([1, 2, 3]) + + Args: + dtype (str or numpy.dtype, optional): + The dtype to use for the resulting NumPy array. By default, + the dtype is inferred from the data. + + Returns: + numpy.ndarray: + The values in the series converted to a `numpy.ndarray` with the + specified dtype. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __len__(self): + """Returns number of values in the Series, serves `len` operator. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> len(s) + 3 + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __invert__(self): + """ + Returns the logical inversion (binary NOT) of the Series, element-wise + using operator `~`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series([True, False, True]) + >>> ~ser + 0 False + 1 True + 2 False + dtype: boolean + + Returns: + Series: The inverted values in the series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __and__(self, other): + """Get bitwise AND of Series and other, element-wise, using operator `&`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0, 1, 2, 3]) + + You can operate with a scalar. + + >>> s & 6 + 0 0 + 1 0 + 2 2 + 3 2 + dtype: Int64 + + You can operate with another Series. + + >>> s1 = bpd.Series([5, 6, 7, 8]) + >>> s & s1 + 0 0 + 1 0 + 2 2 + 3 0 + dtype: Int64 + + Args: + other (scalar or Series): + Object to bitwise AND with the Series. + + Returns: + Series: The result of the operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __or__(self, other): + """Get bitwise OR of Series and other, element-wise, using operator `|`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0, 1, 2, 3]) + + You can operate with a scalar. + + >>> s | 6 + 0 6 + 1 7 + 2 6 + 3 7 + dtype: Int64 + + You can operate with another Series. + + >>> s1 = bpd.Series([5, 6, 7, 8]) + >>> s | s1 + 0 5 + 1 7 + 2 7 + 3 11 + dtype: Int64 + + Args: + other (scalar or Series): + Object to bitwise OR with the Series. + + Returns: + Series: The result of the operation. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __getitem__(self, indexer): + """Gets the specified index from the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([15, 30, 45]) + >>> s[1] + 30 + >>> s[0:2] + 0 15 + 1 30 + dtype: Int64 + + Args: + indexer (int or slice): + Index or slice of indices. + + Returns: + Series or Value: Value(s) at the requested index(es). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 290f95dc5198f9ab7cd9d726d40af704250c0449 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Fri, 12 Apr 2024 16:58:17 -0700 Subject: [PATCH 21/23] feat: support list of numerics in pandas.cut (#580) An internal user encountered this missing overload --- bigframes/core/reshape/__init__.py | 26 ++++++++-- bigframes/operations/aggregations.py | 6 +-- tests/system/small/test_pandas.py | 52 +++++++++++++++++++ .../pandas/core/reshape/tile.py | 16 +++++- 4 files changed, 93 insertions(+), 7 deletions(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index e3ed8edd21..6bcc25319b 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -14,7 +14,7 @@ from __future__ import annotations import typing -from typing import Iterable, Literal, Optional, Tuple, Union +from typing import Iterable, Literal, Optional, Union import pandas as pd @@ -113,7 +113,7 @@ def cut( bins: Union[ int, pd.IntervalIndex, - Iterable[Tuple[Union[int, float], Union[int, float]]], + Iterable, ], *, labels: Optional[bool] = None, @@ -125,9 +125,29 @@ def cut( if isinstance(bins, pd.IntervalIndex): as_index: pd.IntervalIndex = bins bins = tuple((bin.left.item(), bin.right.item()) for bin in bins) - else: + elif len(list(bins)) == 0: + raise ValueError("`bins` iterable should have at least one item") + elif isinstance(list(bins)[0], tuple): as_index = 
pd.IntervalIndex.from_tuples(list(bins)) bins = tuple(bins) + elif pd.api.types.is_number(list(bins)[0]): + bins_list = list(bins) + if len(bins_list) < 2: + raise ValueError( + "`bins` iterable of numeric breaks should have" + " at least two items" + ) + as_index = pd.IntervalIndex.from_breaks(bins_list) + single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list]) + numeric_type = type(bins_list[0]) if single_type else float + bins = tuple( + [ + (numeric_type(bins_list[i]), numeric_type(bins_list[i + 1])) + for i in range(len(bins_list) - 1) + ] + ) + else: + raise ValueError("`bins` iterable should contain tuples or numerics") if as_index.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 36fa787644..f33dc16e30 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -17,7 +17,7 @@ import abc import dataclasses import typing -from typing import ClassVar, Hashable, Optional, Tuple +from typing import ClassVar, Iterable, Optional import pandas as pd import pyarrow as pa @@ -213,7 +213,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? 
- bins: typing.Union[int, Tuple[Tuple[Hashable, Hashable], ...]] + bins: typing.Union[int, Iterable] labels: Optional[bool] @property @@ -232,7 +232,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT interval_dtype = ( pa.float64() if isinstance(self.bins, int) - else dtypes.infer_literal_arrow_type(self.bins[0][0]) + else dtypes.infer_literal_arrow_type(list(self.bins)[0][0]) ) pa_type = pa.struct( [ diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 95b34a56c5..d543f92655 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -424,6 +424,58 @@ def test_cut_default_labels(scalars_dfs): ) +@pytest.mark.parametrize( + ("breaks",), + [ + ([0, 5, 10, 15, 20, 100, 1000],), # ints + ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],), # floats + ([0, 5, 10.5, 15.5, 20, 100, 1000.5],), # mixed + ], +) +def test_cut_numeric_breaks(scalars_dfs, breaks): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks) + bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas() + + # Convert to match data format + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("bins",), + [ + (-1,), # negative integer bins argument + ([],), # empty iterable of bins + (["notabreak"],), # iterable of wrong type + ([1],), # numeric breaks with only one numeric + # this is supported by pandas but not by + # the bigquery operation and a bigframes workaround + # is not yet available. Should return column + # of structs with all NaN values. 
+ ], +) +def test_cut_errors(scalars_dfs, bins): + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + bpd.cut(scalars_df["float64_col"], bins) + + @pytest.mark.parametrize( ("bins",), [ diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index fbd1d2d052..6ba3950a76 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -76,10 +76,20 @@ def cut( 3 {'left_exclusive': 5, 'right_inclusive': 20} dtype: struct[pyarrow] + Cut with an iterable of ints: + + >>> bins_ints = [0, 1, 5, 20] + >>> bpd.cut(s, bins=bins_ints) + 0 + 1 {'left_exclusive': 0, 'right_inclusive': 1} + 2 {'left_exclusive': 1, 'right_inclusive': 5} + 3 {'left_exclusive': 5, 'right_inclusive': 20} + dtype: struct[pyarrow] + Args: x (Series): The input Series to be binned. Must be 1-dimensional. - bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]): + bins (int, pd.IntervalIndex, Iterable): The criteria to bin by. int: Defines the number of equal-width bins in the range of `x`. The @@ -88,6 +98,10 @@ def cut( pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used. It's important to ensure that these bins are non-overlapping. + + Iterable of numerics: Defines the exact bins by using the interval + between each item and its following item. The items must be monotonically + increasing. labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. 
If False, returns only integer indicators of the From 0e24036fe33c60cef08ba78e7ce69319338b6b03 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 15 Apr 2024 13:53:53 -0700 Subject: [PATCH 22/23] test: series explode test (#606) --- tests/system/small/test_series.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6e4a87df4f..d27cd0a236 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3463,9 +3463,8 @@ def foo(x: int, y: int, df): ], ) def test_series_explode(data): - data = [[1, 2, 3], [], numpy.nan, [3, 4]] s = bigframes.pandas.Series(data) - pd_s = pd.Series(data) + pd_s = s.to_pandas() pd.testing.assert_series_equal( s.explode().to_pandas(), pd_s.explode(), From 458bfb2b42c6fc0406489f19210a7d5a406ee90d Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 16 Apr 2024 10:08:11 -0700 Subject: [PATCH 23/23] chore(main): release 1.2.0 (#588) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 28 ++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcb062f08f..a3314c976e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,34 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.2.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.1.0...v1.2.0) (2024-04-15) + + +### Features + +* Add hasnans, combine_first, update to Series ([#600](https://github.com/googleapis/python-bigquery-dataframes/issues/600)) ([86e0f38](https://github.com/googleapis/python-bigquery-dataframes/commit/86e0f38adc71d76e09dd832e5e33cb7c1aab02ac)) +* Add MultiIndex subclass. 
([#596](https://github.com/googleapis/python-bigquery-dataframes/issues/596)) ([5d0f149](https://github.com/googleapis/python-bigquery-dataframes/commit/5d0f149dce5425098fcd154d96a302c1661ce5d3)) +* Add pivot_table for DataFrame. ([#473](https://github.com/googleapis/python-bigquery-dataframes/issues/473)) ([5f1d670](https://github.com/googleapis/python-bigquery-dataframes/commit/5f1d670e6b839a30acdb495a05011c2ce4e0c7a4)) +* Add Series.autocorr ([#605](https://github.com/googleapis/python-bigquery-dataframes/issues/605)) ([4ec8034](https://github.com/googleapis/python-bigquery-dataframes/commit/4ec80340459e675b82b437f6c48b2872d362bafe)) +* Support list of numerics in pandas.cut ([#580](https://github.com/googleapis/python-bigquery-dataframes/issues/580)) ([290f95d](https://github.com/googleapis/python-bigquery-dataframes/commit/290f95dc5198f9ab7cd9d726d40af704250c0449)) + + +### Bug Fixes + +* Address more technical writers feedback ([#581](https://github.com/googleapis/python-bigquery-dataframes/issues/581)) ([4b08d92](https://github.com/googleapis/python-bigquery-dataframes/commit/4b08d9243272229f71688152dbeb69d0ab7c68b4)) +* Error for object dtype on read_pandas ([#570](https://github.com/googleapis/python-bigquery-dataframes/issues/570)) ([8702dcf](https://github.com/googleapis/python-bigquery-dataframes/commit/8702dcf54c0f2073e21df42eaef51927481da421)) +* Inverting int now does bitwise inversion rather than sign flip ([#574](https://github.com/googleapis/python-bigquery-dataframes/issues/574)) ([5f1db8b](https://github.com/googleapis/python-bigquery-dataframes/commit/5f1db8b270b32ab366be3690761da137d9fe65f5)) +* Loc setitem dtype issue. 
([#603](https://github.com/googleapis/python-bigquery-dataframes/issues/603)) ([b94bae9](https://github.com/googleapis/python-bigquery-dataframes/commit/b94bae9892e0fa79dc4bde0f4f1427d00accda6d)) +* Toc menu missing plotting name ([#591](https://github.com/googleapis/python-bigquery-dataframes/issues/591)) ([eed12c1](https://github.com/googleapis/python-bigquery-dataframes/commit/eed12c181ff8724333b1c426a0eb442c627528b8)) + + +### Documentation + +* (Series|Dataframe).dtypes ([#598](https://github.com/googleapis/python-bigquery-dataframes/issues/598)) ([edef48f](https://github.com/googleapis/python-bigquery-dataframes/commit/edef48f7a93e19bc1f6d37fb041dfd6314d881d5)) +* Add code samples for `str` accessor methods ([#594](https://github.com/googleapis/python-bigquery-dataframes/issues/594)) ([a557ea2](https://github.com/googleapis/python-bigquery-dataframes/commit/a557ea2b64633932f730b56688f76806da6195fb)) +* Add docs for `DataFrame` and `Series` dunder methods ([#562](https://github.com/googleapis/python-bigquery-dataframes/issues/562)) ([8fc26c4](https://github.com/googleapis/python-bigquery-dataframes/commit/8fc26c424b29a8b78542372e402fcc4e8fface7b)) +* Add examples for at/iat ([#582](https://github.com/googleapis/python-bigquery-dataframes/issues/582)) ([3be4a2e](https://github.com/googleapis/python-bigquery-dataframes/commit/3be4a2e784e046ca9a1fac8d386d072537b6c4de)) + ## [1.1.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.0.0...v1.1.0) (2024-04-04) diff --git a/bigframes/version.py b/bigframes/version.py index 41a3895549..ec2105b648 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.1.0" +__version__ = "1.2.0" pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy