Skip to content

Commit 968020d

Browse files
authored
fix: avoid "Unable to determine type" warning with JSON columns in to_dataframe (#1876)
* add regression tests for empty dataframe * fix arrow test to be compatible with old pyarrow
1 parent 9acd9c1 commit 968020d

10 files changed

+230
-24
lines changed

google/cloud/bigquery/_helpers.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,21 @@ def range_to_py(self, value, field):
387387
CELL_DATA_PARSER = CellDataParser()
388388

389389

390+
class DataFrameCellDataParser(CellDataParser):
391+
"""Override of CellDataParser to handle differences in expectations of values in DataFrame-like outputs.
392+
393+
This is used to turn the output of the REST API into a pyarrow Table,
394+
emulating the serialized arrow from the BigQuery Storage Read API.
395+
"""
396+
397+
def json_to_py(self, value, _):
398+
"""No-op because DataFrame expects string for JSON output."""
399+
return value
400+
401+
402+
DATA_FRAME_CELL_DATA_PARSER = DataFrameCellDataParser()
403+
404+
390405
class ScalarQueryParamParser(CellDataParser):
391406
"""Override of CellDataParser to handle the differences in the response from query params.
392407

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def finish(self):
158158
b"ARROW:extension:metadata": b'{"encoding": "WKT"}',
159159
},
160160
"DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},
161+
"JSON": {b"ARROW:extension:name": b"google:sqlType:json"},
161162
}
162163

163164

google/cloud/bigquery/_pyarrow_helpers.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
"""Shared helper functions for connecting BigQuery and pyarrow.
1616
1717
NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
18-
instead. See: go/pandas-gbq-and-bigframes-redundancy and
18+
instead. See: go/pandas-gbq-and-bigframes-redundancy,
19+
https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/bigquery_to_pyarrow.py
20+
and
1921
https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
2022
"""
2123

@@ -26,6 +28,14 @@
2628
except ImportError:
2729
pyarrow = None
2830

31+
try:
32+
import db_dtypes # type: ignore
33+
34+
db_dtypes_import_exception = None
35+
except ImportError as exc:
36+
db_dtypes = None
37+
db_dtypes_import_exception = exc
38+
2939

3040
def pyarrow_datetime():
3141
return pyarrow.timestamp("us", tz=None)
@@ -67,12 +77,18 @@ def pyarrow_timestamp():
6777
"GEOGRAPHY": pyarrow.string,
6878
"INT64": pyarrow.int64,
6979
"INTEGER": pyarrow.int64,
80+
# Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
81+
# but we'd like this to map as closely to the BQ Storage API as
82+
# possible, which uses the string() dtype, as JSON support in BigQuery
83+
# predates JSON support in Arrow by several years.
84+
"JSON": pyarrow.string,
7085
"NUMERIC": pyarrow_numeric,
7186
"STRING": pyarrow.string,
7287
"TIME": pyarrow_time,
7388
"TIMESTAMP": pyarrow_timestamp,
7489
}
7590

91+
# DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
7692
_ARROW_SCALAR_IDS_TO_BQ = {
7793
# https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
7894
pyarrow.bool_().id: "BOOL",
@@ -97,6 +113,9 @@ def pyarrow_timestamp():
97113
pyarrow.large_string().id: "STRING",
98114
# The exact scale and precision don't matter, see below.
99115
pyarrow.decimal128(38, scale=9).id: "NUMERIC",
116+
# NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
117+
# have the same id (31 as of version 19.0.1), so these should not be
118+
# matched by id.
100119
}
101120

102121
_BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
@@ -107,6 +126,9 @@ def pyarrow_timestamp():
107126

108127
def bq_to_arrow_scalars(bq_scalar: str):
109128
"""
129+
DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is
130+
to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893.
131+
110132
Returns:
111133
The Arrow scalar type that the input BigQuery scalar type maps to.
112134
If it cannot find the BigQuery scalar, return None.
@@ -116,6 +138,8 @@ def bq_to_arrow_scalars(bq_scalar: str):
116138

117139
def arrow_scalar_ids_to_bq(arrow_scalar: Any):
118140
"""
141+
DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
142+
119143
Returns:
120144
The BigQuery scalar type that the input arrow scalar type maps to.
121145
If it cannot find the arrow scalar, return None.

google/cloud/bigquery/table.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3533,7 +3533,9 @@ def _row_iterator_page_columns(schema, response):
35333533

35343534
def get_column_data(field_index, field):
35353535
for row in rows:
3536-
yield _helpers.CELL_DATA_PARSER.to_py(row["f"][field_index]["v"], field)
3536+
yield _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py(
3537+
row["f"][field_index]["v"], field
3538+
)
35373539

35383540
for field_index, field in enumerate(schema):
35393541
columns.append(get_column_data(field_index, field))

tests/system/test_arrow.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,32 @@ def test_list_rows_range_csv(
194194

195195
range_type = schema.field("range_date").type
196196
assert range_type == expected_type
197+
198+
199+
def test_to_arrow_query_with_empty_results(bigquery_client):
200+
"""
201+
JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580.
202+
"""
203+
job = bigquery_client.query(
204+
"""
205+
select
206+
123 as int_col,
207+
'' as string_col,
208+
to_json('{}') as json_col,
209+
struct(to_json('[]') as json_field, -1 as int_field) as struct_col,
210+
[to_json('null')] as json_array_col,
211+
from unnest([])
212+
"""
213+
)
214+
table = job.to_arrow()
215+
assert list(table.column_names) == [
216+
"int_col",
217+
"string_col",
218+
"json_col",
219+
"struct_col",
220+
"json_array_col",
221+
]
222+
assert table.shape == (0, 5)
223+
struct_type = table.field("struct_col").type
224+
assert struct_type.get_field_index("json_field") == 0
225+
assert struct_type.get_field_index("int_field") == 1

tests/system/test_pandas.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,32 @@ def test_upload_time_and_datetime_56(bigquery_client, dataset_id):
13041304
]
13051305

13061306

1307+
def test_to_dataframe_query_with_empty_results(bigquery_client):
1308+
"""
1309+
JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580.
1310+
"""
1311+
job = bigquery_client.query(
1312+
"""
1313+
select
1314+
123 as int_col,
1315+
'' as string_col,
1316+
to_json('{}') as json_col,
1317+
struct(to_json('[]') as json_field, -1 as int_field) as struct_col,
1318+
[to_json('null')] as json_array_col,
1319+
from unnest([])
1320+
"""
1321+
)
1322+
df = job.to_dataframe()
1323+
assert list(df.columns) == [
1324+
"int_col",
1325+
"string_col",
1326+
"json_col",
1327+
"struct_col",
1328+
"json_array_col",
1329+
]
1330+
assert len(df.index) == 0
1331+
1332+
13071333
def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id):
13081334
wkt = pytest.importorskip("shapely.wkt")
13091335
bigquery_client.query(
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
import google.cloud.bigquery.schema
18+
19+
20+
def create_field(mode="NULLABLE", type_="IGNORED", name="test_field", **kwargs):
21+
return google.cloud.bigquery.schema.SchemaField(name, type_, mode=mode, **kwargs)
22+
23+
24+
@pytest.fixture
25+
def mut():
26+
from google.cloud.bigquery import _helpers
27+
28+
return _helpers
29+
30+
31+
@pytest.fixture
32+
def object_under_test(mut):
33+
return mut.DATA_FRAME_CELL_DATA_PARSER
34+
35+
36+
def test_json_to_py_doesnt_parse_json(object_under_test):
37+
coerced = object_under_test.json_to_py('{"key":"value"}', create_field())
38+
assert coerced == '{"key":"value"}'
39+
40+
41+
def test_json_to_py_repeated_doesnt_parse_json(object_under_test):
42+
coerced = object_under_test.json_to_py('{"key":"value"}', create_field("REPEATED"))
43+
assert coerced == '{"key":"value"}'
44+
45+
46+
def test_record_to_py_doesnt_parse_json(object_under_test):
47+
subfield = create_field(type_="JSON", name="json")
48+
field = create_field(fields=[subfield])
49+
value = {"f": [{"v": '{"key":"value"}'}]}
50+
coerced = object_under_test.record_to_py(value, field)
51+
assert coerced == {"json": '{"key":"value"}'}
52+
53+
54+
def test_record_to_py_doesnt_parse_repeated_json(object_under_test):
55+
subfield = create_field("REPEATED", "JSON", name="json")
56+
field = create_field("REQUIRED", fields=[subfield])
57+
value = {
58+
"f": [
59+
{
60+
"v": [
61+
{"v": '{"key":"value0"}'},
62+
{"v": '{"key":"value1"}'},
63+
{"v": '{"key":"value2"}'},
64+
]
65+
}
66+
]
67+
}
68+
coerced = object_under_test.record_to_py(value, field)
69+
assert coerced == {
70+
"json": ['{"key":"value0"}', '{"key":"value1"}', '{"key":"value2"}']
71+
}

tests/unit/test__pyarrow_helpers.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,16 @@ def module_under_test():
2727

2828
def test_bq_to_arrow_scalars(module_under_test):
2929
assert (
30-
module_under_test.bq_to_arrow_scalars("BIGNUMERIC")
31-
== module_under_test.pyarrow_bignumeric
30+
module_under_test.bq_to_arrow_scalars("BIGNUMERIC")()
31+
== module_under_test.pyarrow_bignumeric()
32+
)
33+
assert (
34+
# Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
35+
# but we'd like this to map as closely to the BQ Storage API as
36+
# possible, which uses the string() dtype, as JSON support in Arrow
37+
# predates JSON support in BigQuery by several years.
38+
module_under_test.bq_to_arrow_scalars("JSON")()
39+
== pyarrow.string()
3240
)
3341
assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
3442

tests/unit/test_table_arrow.py

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def test_to_arrow_with_jobs_query_response():
2828
"fields": [
2929
{"name": "name", "type": "STRING", "mode": "NULLABLE"},
3030
{"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
31+
{"name": "json", "type": "JSON", "mode": "NULLABLE"},
3132
]
3233
},
3334
"jobReference": {
@@ -37,15 +38,21 @@ def test_to_arrow_with_jobs_query_response():
3738
},
3839
"totalRows": "9",
3940
"rows": [
40-
{"f": [{"v": "Tiarra"}, {"v": "6"}]},
41-
{"f": [{"v": "Timothy"}, {"v": "325"}]},
42-
{"f": [{"v": "Tina"}, {"v": "26"}]},
43-
{"f": [{"v": "Tierra"}, {"v": "10"}]},
44-
{"f": [{"v": "Tia"}, {"v": "17"}]},
45-
{"f": [{"v": "Tiara"}, {"v": "22"}]},
46-
{"f": [{"v": "Tiana"}, {"v": "6"}]},
47-
{"f": [{"v": "Tiffany"}, {"v": "229"}]},
48-
{"f": [{"v": "Tiffani"}, {"v": "8"}]},
41+
{"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]},
42+
{"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]},
43+
{"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]},
44+
{
45+
"f": [
46+
{"v": "Tierra"},
47+
{"v": "10"},
48+
{"v": '{"aKey": {"bKey": {"cKey": -123}}}'},
49+
]
50+
},
51+
{"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]},
52+
{"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]},
53+
{"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]},
54+
{"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]},
55+
{"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]},
4956
],
5057
"totalBytesProcessed": "154775150",
5158
"jobComplete": True,
@@ -65,7 +72,7 @@ def test_to_arrow_with_jobs_query_response():
6572
)
6673
records = rows.to_arrow()
6774

68-
assert records.column_names == ["name", "number"]
75+
assert records.column_names == ["name", "number", "json"]
6976
assert records["name"].to_pylist() == [
7077
"Tiarra",
7178
"Timothy",
@@ -78,6 +85,17 @@ def test_to_arrow_with_jobs_query_response():
7885
"Tiffani",
7986
]
8087
assert records["number"].to_pylist() == [6, 325, 26, 10, 17, 22, 6, 229, 8]
88+
assert records["json"].to_pylist() == [
89+
"123",
90+
'{"key":"value"}',
91+
"[1,2,3]",
92+
'{"aKey": {"bKey": {"cKey": -123}}}',
93+
None,
94+
'"some-json-string"',
95+
'{"nullKey":null}',
96+
'""',
97+
"[]",
98+
]
8199

82100

83101
def test_to_arrow_with_jobs_query_response_and_max_results():
@@ -87,6 +105,7 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
87105
"fields": [
88106
{"name": "name", "type": "STRING", "mode": "NULLABLE"},
89107
{"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
108+
{"name": "json", "type": "JSON", "mode": "NULLABLE"},
90109
]
91110
},
92111
"jobReference": {
@@ -96,15 +115,21 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
96115
},
97116
"totalRows": "9",
98117
"rows": [
99-
{"f": [{"v": "Tiarra"}, {"v": "6"}]},
100-
{"f": [{"v": "Timothy"}, {"v": "325"}]},
101-
{"f": [{"v": "Tina"}, {"v": "26"}]},
102-
{"f": [{"v": "Tierra"}, {"v": "10"}]},
103-
{"f": [{"v": "Tia"}, {"v": "17"}]},
104-
{"f": [{"v": "Tiara"}, {"v": "22"}]},
105-
{"f": [{"v": "Tiana"}, {"v": "6"}]},
106-
{"f": [{"v": "Tiffany"}, {"v": "229"}]},
107-
{"f": [{"v": "Tiffani"}, {"v": "8"}]},
118+
{"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]},
119+
{"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]},
120+
{"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]},
121+
{
122+
"f": [
123+
{"v": "Tierra"},
124+
{"v": "10"},
125+
{"v": '{"aKey": {"bKey": {"cKey": -123}}}'},
126+
]
127+
},
128+
{"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]},
129+
{"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]},
130+
{"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]},
131+
{"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]},
132+
{"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]},
108133
],
109134
"totalBytesProcessed": "154775150",
110135
"jobComplete": True,
@@ -125,10 +150,11 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
125150
)
126151
records = rows.to_arrow()
127152

128-
assert records.column_names == ["name", "number"]
153+
assert records.column_names == ["name", "number", "json"]
129154
assert records["name"].to_pylist() == [
130155
"Tiarra",
131156
"Timothy",
132157
"Tina",
133158
]
134159
assert records["number"].to_pylist() == [6, 325, 26]
160+
assert records["json"].to_pylist() == ["123", '{"key":"value"}', "[1,2,3]"]

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy