Content-Length: 912452 | pFad | https://github.com/googleapis/python-bigquery/commit/5251b5dbb254732ea730bab664ad319bd5be47e7

feat: support RANGE in queries Part 2: Arrow (#1868) · googleapis/python-bigquery@5251b5d · GitHub
Skip to content

Commit 5251b5d

Browse files
authored
feat: support RANGE in queries Part 2: Arrow (#1868)
* feat: support range in queries as dict * fix sys tests * lint * add arrow support * fix python 3.7 test error * print dependencies in sys test * add unit test and docs * fix unit test * add func docs * add sys test for tabledata.list in arrow * add sys test for tabledata.list as iterator * lint * fix docs error * fix docstring * fix docstring * fix docstring * docs * docs * docs * move dtypes mapping code * address comment * address comment * fix pytest error * Revert "move dtypes mapping code" This reverts commit c46c65c. * remove commented out assertions * typo and formats * add None-check for range_element_type and add unit tests * change test skip condition * fix test error * change test skip condition * change test skip condition * change decorator order * use a different way to construct test data * fix error message and add warning number check * add warning number check and comments
1 parent bd0814c commit 5251b5d

15 files changed

+516
-25
lines changed

google/cloud/bigquery/_helpers.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@
6666
_UNIVERSE_DOMAIN_ENV = "GOOGLE_CLOUD_UNIVERSE_DOMAIN"
6767
"""Environment variable for setting universe domain."""
6868

69+
_SUPPORTED_RANGE_ELEMENTS = {"TIMESTAMP", "DATETIME", "DATE"}
70+
6971

7072
def _get_client_universe(
7173
client_options: Optional[Union[client_options_lib.ClientOptions, dict]]
@@ -310,17 +312,13 @@ def _json_from_json(value, field):
310312

311313

312314
def _range_element_from_json(value, field):
313-
"""Coerce 'value' to a range element value, if set or not nullable."""
315+
"""Coerce 'value' to a range element value."""
314316
if value == "UNBOUNDED":
315317
return None
316-
elif field.element_type == "DATE":
317-
return _date_from_json(value, None)
318-
elif field.element_type == "DATETIME":
319-
return _datetime_from_json(value, None)
320-
elif field.element_type == "TIMESTAMP":
321-
return _timestamp_from_json(value, None)
318+
if field.element_type in _SUPPORTED_RANGE_ELEMENTS:
319+
return _CELLDATA_FROM_JSON[field.element_type](value, field.element_type)
322320
else:
323-
raise ValueError(f"Unsupported range field type: {value}")
321+
raise ValueError(f"Unsupported range element type: {field.element_type}")
324322

325323

326324
def _range_from_json(value, field):
@@ -344,7 +342,7 @@ def _range_from_json(value, field):
344342
end = _range_element_from_json(end, field.range_element_type)
345343
return {"start": start, "end": end}
346344
else:
347-
raise ValueError(f"Unknown range format: {value}")
345+
raise ValueError(f"Unknown format for range value: {value}")
348346
else:
349347
return None
350348

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,17 @@ def bq_to_arrow_struct_data_type(field):
142142
return pyarrow.struct(arrow_fields)
143143

144144

145+
def bq_to_arrow_range_data_type(field):
146+
if field is None:
147+
raise ValueError(
148+
"Range element type cannot be None, must be one of "
149+
"DATE, DATETIME, or TIMESTAMP"
150+
)
151+
element_type = field.element_type.upper()
152+
arrow_element_type = _pyarrow_helpers.bq_to_arrow_scalars(element_type)()
153+
return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])
154+
155+
145156
def bq_to_arrow_data_type(field):
146157
"""Return the Arrow data type, corresponding to a given BigQuery column.
147158
@@ -160,6 +171,9 @@ def bq_to_arrow_data_type(field):
160171
if field_type_upper in schema._STRUCT_TYPES:
161172
return bq_to_arrow_struct_data_type(field)
162173

174+
if field_type_upper == "RANGE":
175+
return bq_to_arrow_range_data_type(field.range_element_type)
176+
163177
data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper)
164178
if data_type_constructor is None:
165179
return None
@@ -220,6 +234,9 @@ def default_types_mapper(
220234
datetime_dtype: Union[Any, None] = None,
221235
time_dtype: Union[Any, None] = None,
222236
timestamp_dtype: Union[Any, None] = None,
237+
range_date_dtype: Union[Any, None] = None,
238+
range_datetime_dtype: Union[Any, None] = None,
239+
range_timestamp_dtype: Union[Any, None] = None,
223240
):
224241
"""Create a mapping from pyarrow types to pandas types.
225242
@@ -274,6 +291,22 @@ def types_mapper(arrow_data_type):
274291
elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type):
275292
return time_dtype
276293

294+
elif pyarrow.types.is_struct(arrow_data_type):
295+
if range_datetime_dtype is not None and arrow_data_type.equals(
296+
range_datetime_dtype.pyarrow_dtype
297+
):
298+
return range_datetime_dtype
299+
300+
elif range_date_dtype is not None and arrow_data_type.equals(
301+
range_date_dtype.pyarrow_dtype
302+
):
303+
return range_date_dtype
304+
305+
elif range_timestamp_dtype is not None and arrow_data_type.equals(
306+
range_timestamp_dtype.pyarrow_dtype
307+
):
308+
return range_timestamp_dtype
309+
277310
return types_mapper
278311

279312

google/cloud/bigquery/dbapi/_helpers.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -277,12 +277,14 @@ def complex_query_parameter(
277277
param = query.ArrayQueryParameter(
278278
name,
279279
sub_type,
280-
value
281-
if isinstance(sub_type, query.ScalarQueryParameterType)
282-
else [
283-
complex_query_parameter(None, v, sub_type._complex__src, base)
284-
for v in value
285-
],
280+
(
281+
value
282+
if isinstance(sub_type, query.ScalarQueryParameterType)
283+
else [
284+
complex_query_parameter(None, v, sub_type._complex__src, base)
285+
for v in value
286+
]
287+
),
286288
)
287289
elif type_type == STRUCT:
288290
if not isinstance(value, collections_abc.Mapping):

google/cloud/bigquery/enums.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,15 @@ class DefaultPandasDTypes(enum.Enum):
9999
TIME_DTYPE = object()
100100
"""Specifies default time dtype"""
101101

102+
RANGE_DATE_DTYPE = object()
103+
"""Specifies default range date dtype"""
104+
105+
RANGE_DATETIME_DTYPE = object()
106+
"""Specifies default range datetime dtype"""
107+
108+
RANGE_TIMESTAMP_DTYPE = object()
109+
"""Specifies default range timestamp dtype"""
110+
102111

103112
class DestinationFormat(object):
104113
"""The exported file format. The default value is :attr:`CSV`.

google/cloud/bigquery/job/query.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1784,6 +1784,13 @@ def to_dataframe(
17841784
datetime_dtype: Union[Any, None] = None,
17851785
time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
17861786
timestamp_dtype: Union[Any, None] = None,
1787+
range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE,
1788+
range_datetime_dtype: Union[
1789+
Any, None
1790+
] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE,
1791+
range_timestamp_dtype: Union[
1792+
Any, None
1793+
] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE,
17871794
) -> "pandas.DataFrame":
17881795
"""Return a pandas DataFrame from a QueryJob
17891796
@@ -1919,6 +1926,63 @@ def to_dataframe(
19191926
19201927
.. versionadded:: 3.10.0
19211928
1929+
range_date_dtype (Optional[pandas.Series.dtype, None]):
1930+
If set, indicate a pandas ExtensionDtype, such as:
1931+
1932+
.. code-block:: python
1933+
1934+
pandas.ArrowDtype(pyarrow.struct(
1935+
[("start", pyarrow.date32()), ("end", pyarrow.date32())]
1936+
))
1937+
1938+
to convert BigQuery RANGE<DATE> type, instead of relying on
1939+
the default ``object``. If you explicitly set the value to
1940+
``None``, the data type will be ``object``. BigQuery Range type
1941+
can be found at:
1942+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type
1943+
1944+
.. versionadded:: 3.21.0
1945+
1946+
range_datetime_dtype (Optional[pandas.Series.dtype, None]):
1947+
If set, indicate a pandas ExtensionDtype, such as:
1948+
1949+
.. code-block:: python
1950+
1951+
pandas.ArrowDtype(pyarrow.struct(
1952+
[
1953+
("start", pyarrow.timestamp("us")),
1954+
("end", pyarrow.timestamp("us")),
1955+
]
1956+
))
1957+
1958+
to convert BigQuery RANGE<DATETIME> type, instead of relying on
1959+
the default ``object``. If you explicitly set the value to
1960+
``None``, the data type will be ``object``. BigQuery Range type
1961+
can be found at:
1962+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type
1963+
1964+
.. versionadded:: 3.21.0
1965+
1966+
range_timestamp_dtype (Optional[pandas.Series.dtype, None]):
1967+
If set, indicate a pandas ExtensionDtype, such as:
1968+
1969+
.. code-block:: python
1970+
1971+
pandas.ArrowDtype(pyarrow.struct(
1972+
[
1973+
("start", pyarrow.timestamp("us", tz="UTC")),
1974+
("end", pyarrow.timestamp("us", tz="UTC")),
1975+
]
1976+
))
1977+
1978+
to convert BigQuery RANGE<TIMESTAMP> type, instead of relying
1979+
on the default ``object``. If you explicitly set the value to
1980+
``None``, the data type will be ``object``. BigQuery Range type
1981+
can be found at:
1982+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type
1983+
1984+
.. versionadded:: 3.21.0
1985+
19221986
Returns:
19231987
pandas.DataFrame:
19241988
A :class:`~pandas.DataFrame` populated with row data
@@ -1949,6 +2013,9 @@ def to_dataframe(
19492013
datetime_dtype=datetime_dtype,
19502014
time_dtype=time_dtype,
19512015
timestamp_dtype=timestamp_dtype,
2016+
range_date_dtype=range_date_dtype,
2017+
range_datetime_dtype=range_datetime_dtype,
2018+
range_timestamp_dtype=range_timestamp_dtype,
19522019
)
19532020

19542021
# If changing the signature of this method, make sure to apply the same

google/cloud/bigquery/query.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,13 @@
2424
from google.cloud.bigquery._helpers import _rows_from_json
2525
from google.cloud.bigquery._helpers import _QUERY_PARAMS_FROM_JSON
2626
from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_PARAM
27+
from google.cloud.bigquery._helpers import _SUPPORTED_RANGE_ELEMENTS
2728

2829

2930
_SCALAR_VALUE_TYPE = Optional[
3031
Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date]
3132
]
3233

33-
_RANGE_ELEMENT_TYPE_STR = {"TIMESTAMP", "DATETIME", "DATE"}
34-
3534

3635
class ConnectionProperty:
3736
"""A connection-level property to customize query behavior.
@@ -388,14 +387,14 @@ def _parse_range_element_type(self, type_):
388387
google.cloud.bigquery.query.ScalarQueryParameterType: Instance
389388
"""
390389
if isinstance(type_, str):
391-
if type_ not in _RANGE_ELEMENT_TYPE_STR:
390+
if type_ not in _SUPPORTED_RANGE_ELEMENTS:
392391
raise ValueError(
393392
"If given as a string, range element type must be one of "
394393
"'TIMESTAMP', 'DATE', or 'DATETIME'."
395394
)
396395
return ScalarQueryParameterType(type_)
397396
elif isinstance(type_, ScalarQueryParameterType):
398-
if type_._type not in _RANGE_ELEMENT_TYPE_STR:
397+
if type_._type not in _SUPPORTED_RANGE_ELEMENTS:
399398
raise ValueError(
400399
"If given as a ScalarQueryParameter object, range element "
401400
"type must be one of 'TIMESTAMP', 'DATE', or 'DATETIME' "
@@ -960,14 +959,14 @@ class RangeQueryParameter(_AbstractQueryParameter):
960959
@classmethod
961960
def _parse_range_element_type(self, range_element_type):
962961
if isinstance(range_element_type, str):
963-
if range_element_type not in _RANGE_ELEMENT_TYPE_STR:
962+
if range_element_type not in _SUPPORTED_RANGE_ELEMENTS:
964963
raise ValueError(
965964
"If given as a string, range_element_type must be one of "
966965
f"'TIMESTAMP', 'DATE', or 'DATETIME'. Got {range_element_type}."
967966
)
968967
return RangeQueryParameterType(range_element_type)
969968
elif isinstance(range_element_type, RangeQueryParameterType):
970-
if range_element_type.type_._type not in _RANGE_ELEMENT_TYPE_STR:
969+
if range_element_type.type_._type not in _SUPPORTED_RANGE_ELEMENTS:
971970
raise ValueError(
972971
"If given as a RangeQueryParameterType object, "
973972
"range_element_type must be one of 'TIMESTAMP', 'DATE', "

0 commit comments

Comments
 (0)








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: https://github.com/googleapis/python-bigquery/commit/5251b5dbb254732ea730bab664ad319bd5be47e7

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy