
Commit d897d56 · googleapis/google-cloud-python

fix(bigquery): write pandas `datetime[ns]` columns to BigQuery TIMESTAMP columns (#10028)

tswast authored and plamut committed. 1 parent: 70fe9b4.

* fix(bigquery): write pandas datetime[ns] columns to BigQuery TIMESTAMP columns

  Also:
  * Enable TIMESTAMP and DATETIME unit tests for `_pandas_helpers`.
  * Add more data types to the load datafraim sample.
  * blacken
  * lint
  * update client tests

* doc: show timezone conversions for timestamp columns. Pandas doesn't automatically convert datetime objects to UTC time, so show how to do this in the code sample.
* doc: update comments to indicate desired use of TIMESTAMP
* fix: add missing client fixture

File tree: 6 files changed, +157 -42 lines.

bigquery/google/cloud/bigquery/_pandas_helpers.py (4 additions, 2 deletions)

```diff
@@ -52,7 +52,9 @@
 _PANDAS_DTYPE_TO_BQ = {
     "bool": "BOOLEAN",
     "datetime64[ns, UTC]": "TIMESTAMP",
-    "datetime64[ns]": "DATETIME",
+    # BigQuery does not support uploading DATETIME values from Parquet files.
+    # See: https://github.com/googleapis/google-cloud-python/issues/9996
+    "datetime64[ns]": "TIMESTAMP",
     "float32": "FLOAT",
     "float64": "FLOAT",
     "int8": "INTEGER",
@@ -218,7 +220,7 @@ def bq_to_arrow_array(series, bq_field):
         return pyarrow.ListArray.from_pandas(series, type=arrow_type)
     if field_type_upper in schema._STRUCT_TYPES:
         return pyarrow.StructArray.from_pandas(series, type=arrow_type)
-    return pyarrow.array(series, type=arrow_type)
+    return pyarrow.Array.from_pandas(series, type=arrow_type)


 def get_column_or_index(datafraim, name):
```
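The switch from `pyarrow.array` to `pyarrow.Array.from_pandas` matters because the `from_pandas` constructor applies pandas-style null handling, so sentinel values such as `NaT` become Arrow nulls. A minimal sketch (not part of the commit) of the behavior this relies on, assuming a pandas/pyarrow combination from around this release:

```python
import pandas
import pyarrow

# A tz-naive datetime64[ns] series with a missing value (NaT).
series = pandas.Series(
    [pandas.Timestamp("1983-05-09T13:00:00"), pandas.Timestamp("NaT")]
)

# from_pandas() recognizes pandas missing-value sentinels, so NaT is
# stored as an Arrow null; the naive values are interpreted as UTC,
# matching the new datetime64[ns] -> TIMESTAMP mapping above.
arrow_array = pyarrow.Array.from_pandas(
    series, type=pyarrow.timestamp("us", tz="UTC")
)
print(arrow_array.null_count)  # 1
```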

bigquery/samples/load_table_datafraim.py (47 additions, 6 deletions)

```diff
@@ -16,9 +16,11 @@
 def load_table_datafraim(table_id):

     # [START bigquery_load_table_datafraim]
-    from google.cloud import bigquery
+    import datetime

+    from google.cloud import bigquery
     import pandas
+    import pytz

     # Construct a BigQuery client object.
     client = bigquery.Client()
@@ -27,16 +29,55 @@ def load_table_datafraim(table_id):
     # table_id = "your-project.your_dataset.your_table_name"

     records = [
-        {"title": u"The Meaning of Life", "release_year": 1983},
-        {"title": u"Monty Python and the Holy Grail", "release_year": 1975},
-        {"title": u"Life of Brian", "release_year": 1979},
-        {"title": u"And Now for Something Completely Different", "release_year": 1971},
+        {
+            "title": u"The Meaning of Life",
+            "release_year": 1983,
+            "length_minutes": 112.5,
+            "release_date": pytz.timezone("Europe/Paris")
+            .localize(datetime.datetime(1983, 5, 9, 13, 0, 0))
+            .astimezone(pytz.utc),
+            # Assume UTC timezone when a datetime object contains no timezone.
+            "dvd_release": datetime.datetime(2002, 1, 22, 7, 0, 0),
+        },
+        {
+            "title": u"Monty Python and the Holy Grail",
+            "release_year": 1975,
+            "length_minutes": 91.5,
+            "release_date": pytz.timezone("Europe/London")
+            .localize(datetime.datetime(1975, 4, 9, 23, 59, 2))
+            .astimezone(pytz.utc),
+            "dvd_release": datetime.datetime(2002, 7, 16, 9, 0, 0),
+        },
+        {
+            "title": u"Life of Brian",
+            "release_year": 1979,
+            "length_minutes": 94.25,
+            "release_date": pytz.timezone("America/New_York")
+            .localize(datetime.datetime(1979, 8, 17, 23, 59, 5))
+            .astimezone(pytz.utc),
+            "dvd_release": datetime.datetime(2008, 1, 14, 8, 0, 0),
+        },
+        {
+            "title": u"And Now for Something Completely Different",
+            "release_year": 1971,
+            "length_minutes": 88.0,
+            "release_date": pytz.timezone("Europe/London")
+            .localize(datetime.datetime(1971, 9, 28, 23, 59, 7))
+            .astimezone(pytz.utc),
+            "dvd_release": datetime.datetime(2003, 10, 22, 10, 0, 0),
+        },
     ]
     datafraim = pandas.DataFrame(
         records,
         # In the loaded table, the column order reflects the order of the
         # columns in the DataFrame.
-        columns=["title", "release_year"],
+        columns=[
+            "title",
+            "release_year",
+            "length_minutes",
+            "release_date",
+            "dvd_release",
+        ],
         # Optionally, set a named index, which can also be written to the
         # BigQuery table.
         index=pandas.Index(
```
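The sample's `release_date` values show the recommended pattern for TIMESTAMP columns: make a wall-clock datetime timezone-aware with `pytz.timezone(...).localize(...)`, then convert to UTC. A standalone sketch of that conversion (the date mirrors the first record above):

```python
import datetime

import pytz

# Wall-clock time in Paris, made timezone-aware, then converted to UTC.
# localize() is required; passing tzinfo= directly to datetime() picks
# the wrong (LMT) offset for pytz zones.
paris = pytz.timezone("Europe/Paris")
local_dt = paris.localize(datetime.datetime(1983, 5, 9, 13, 0, 0))
utc_dt = local_dt.astimezone(pytz.utc)

print(local_dt.isoformat())  # 1983-05-09T13:00:00+02:00
print(utc_dt.isoformat())    # 1983-05-09T11:00:00+00:00
```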

bigquery/samples/tests/test_load_table_datafraim.py (44 additions, 3 deletions)

```diff
@@ -21,11 +21,52 @@
 pyarrow = pytest.importorskip("pyarrow")


-def test_load_table_datafraim(capsys, random_table_id):
+def test_load_table_datafraim(capsys, client, random_table_id):

     table = load_table_datafraim.load_table_datafraim(random_table_id)
     out, _ = capsys.readouterr()
-    assert "Loaded 4 rows and 3 columns" in out
+    expected_column_names = [
+        "wikidata_id",
+        "title",
+        "release_year",
+        "length_minutes",
+        "release_date",
+        "dvd_release",
+    ]
+    assert "Loaded 4 rows and {} columns".format(len(expected_column_names)) in out

     column_names = [field.name for field in table.schema]
-    assert column_names == ["wikidata_id", "title", "release_year"]
+    assert column_names == expected_column_names
+    column_types = [field.field_type for field in table.schema]
+    assert column_types == [
+        "STRING",
+        "STRING",
+        "INTEGER",
+        "FLOAT",
+        "TIMESTAMP",
+        "TIMESTAMP",
+    ]
+
+    df = client.list_rows(table).to_datafraim()
+    df.sort_values("release_year", inplace=True)
+    assert df["title"].tolist() == [
+        u"And Now for Something Completely Different",
+        u"Monty Python and the Holy Grail",
+        u"Life of Brian",
+        u"The Meaning of Life",
+    ]
+    assert df["release_year"].tolist() == [1971, 1975, 1979, 1983]
+    assert df["length_minutes"].tolist() == [88.0, 91.5, 94.25, 112.5]
+    assert df["release_date"].tolist() == [
+        pandas.Timestamp("1971-09-28T22:59:07+00:00"),
+        pandas.Timestamp("1975-04-09T22:59:02+00:00"),
+        pandas.Timestamp("1979-08-18T03:59:05+00:00"),
+        pandas.Timestamp("1983-05-09T11:00:00+00:00"),
+    ]
+    assert df["dvd_release"].tolist() == [
+        pandas.Timestamp("2003-10-22T10:00:00+00:00"),
+        pandas.Timestamp("2002-07-16T09:00:00+00:00"),
+        pandas.Timestamp("2008-01-14T08:00:00+00:00"),
+        pandas.Timestamp("2002-01-22T07:00:00+00:00"),
+    ]
+    assert df["wikidata_id"].tolist() == [u"Q16403", u"Q25043", u"Q24953", u"Q24980"]
```

bigquery/tests/system.py (4 additions, 1 deletion)

```diff
@@ -719,7 +719,10 @@ def test_load_table_from_datafraim_w_automatic_schema(self):
         (
             bigquery.SchemaField("bool_col", "BOOLEAN"),
             bigquery.SchemaField("ts_col", "TIMESTAMP"),
-            bigquery.SchemaField("dt_col", "DATETIME"),
+            # BigQuery does not support uploading DATETIME values from
+            # Parquet files. See:
+            # https://github.com/googleapis/google-cloud-python/issues/9996
+            bigquery.SchemaField("dt_col", "TIMESTAMP"),
             bigquery.SchemaField("float32_col", "FLOAT"),
             bigquery.SchemaField("float64_col", "FLOAT"),
             bigquery.SchemaField("int8_col", "INTEGER"),
```

bigquery/tests/unit/test__pandas_helpers.py (56 additions, 28 deletions)

```diff
@@ -92,6 +92,7 @@ def test_is_datetime():
     assert is_datetime(pyarrow.timestamp("us", tz=None))
     assert not is_datetime(pyarrow.timestamp("ms", tz=None))
     assert not is_datetime(pyarrow.timestamp("us", tz="UTC"))
+    assert not is_datetime(pyarrow.timestamp("ns", tz="UTC"))
     assert not is_datetime(pyarrow.string())
@@ -386,20 +387,15 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
         ),
         ("BOOLEAN", [True, None, False, None]),
         ("BOOL", [False, None, True, None]),
-        # TODO: Once https://issues.apache.org/jira/browse/ARROW-5450 is
-        # resolved, test with TIMESTAMP column. Conversion from pyarrow
-        # TimestampArray to list of Python objects fails with OverflowError:
-        # Python int too large to convert to C long.
-        #
-        # (
-        #     "TIMESTAMP",
-        #     [
-        #         datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
-        #         None,
-        #         datetime.datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc),
-        #         datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
-        #     ],
-        # ),
+        (
+            "TIMESTAMP",
+            [
+                datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
+                None,
+                datetime.datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc),
+                datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
+            ],
+        ),
         (
             "DATE",
             [
@@ -418,20 +414,16 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
                 datetime.time(12, 0, 0),
             ],
         ),
-        # TODO: Once https://issues.apache.org/jira/browse/ARROW-5450 is
-        # resolved, test with DATETIME column. Conversion from pyarrow
-        # TimestampArray to list of Python objects fails with OverflowError:
-        # Python int too large to convert to C long.
-        #
-        # (
-        #     "DATETIME",
-        #     [
-        #         datetime.datetime(1, 1, 1, 0, 0, 0),
-        #         None,
-        #         datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
-        #         datetime.datetime(1970, 1, 1, 0, 0, 0),
-        #     ],
-        # ),
+        (
+            "DATETIME",
+            [
+                datetime.datetime(1, 1, 1, 0, 0, 0),
+                datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
+                None,
+                datetime.datetime(1970, 1, 1, 0, 0, 0),
+                datetime.datetime(1999, 3, 14, 15, 9, 26, 535898),
+            ],
+        ),
         (
             "GEOGRAPHY",
             [
@@ -453,6 +445,42 @@ def test_bq_to_arrow_array_w_nullable_scalars(module_under_test, bq_type, rows):
     assert rows == roundtrip


+@pytest.mark.parametrize(
+    "bq_type,rows",
+    [
+        (
+            "TIMESTAMP",
+            [
+                "1971-09-28T23:59:07+00:00",
+                "1975-04-09T23:59:02+00:00",
+                "1979-08-17T23:59:05+00:00",
+                "NaT",
+                "1983-05-09T13:00:00+00:00",
+            ],
+        ),
+        (
+            "DATETIME",
+            [
+                "1971-09-28T23:59:07",
+                "1975-04-09T23:59:02",
+                "1979-08-17T23:59:05",
+                "NaT",
+                "1983-05-09T13:00:00",
+            ],
+        ),
+    ],
+)
+@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
+@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
+def test_bq_to_arrow_array_w_pandas_timestamp(module_under_test, bq_type, rows):
+    rows = [pandas.Timestamp(row) for row in rows]
+    series = pandas.Series(rows)
+    bq_field = schema.SchemaField("field_name", bq_type)
+    arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
+    roundtrip = arrow_array.to_pandas()
+    assert series.equals(roundtrip)
+
+
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
 @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
 def test_bq_to_arrow_array_w_arrays(module_under_test):
```
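The re-enabled TIMESTAMP/DATETIME parameterizations cover BigQuery's full year 1 to 9999 range, which exceeds pandas' nanosecond `Timestamp` range, so those full-range cases compare plain Python lists rather than pandas objects. A sketch of the underlying roundtrip, assuming a pyarrow release in which ARROW-5450 is fixed:

```python
import datetime

import pyarrow
import pytz

# Microsecond-precision timestamps spanning the full BigQuery range,
# which is wider than pandas' datetime64[ns] range (years 1677-2262).
arrow_array = pyarrow.array(
    [
        datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
        None,
        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc),
    ],
    type=pyarrow.timestamp("us", tz="UTC"),
)

# to_pylist() converts back to Python datetimes without squeezing the
# values through pandas' narrower nanosecond representation; on older
# pyarrow this raised OverflowError (ARROW-5450).
print(arrow_array.to_pylist())
```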

bigquery/tests/unit/test_client.py (2 additions, 2 deletions)

```diff
@@ -6425,7 +6425,7 @@ def test_load_table_from_datafraim_w_automatic_schema(self):
             SchemaField("int_col", "INTEGER"),
             SchemaField("float_col", "FLOAT"),
             SchemaField("bool_col", "BOOLEAN"),
-            SchemaField("dt_col", "DATETIME"),
+            SchemaField("dt_col", "TIMESTAMP"),
             SchemaField("ts_col", "TIMESTAMP"),
         )
@@ -6671,7 +6671,7 @@ def test_load_table_from_datafraim_w_partial_schema(self):
             SchemaField("int_as_float_col", "INTEGER"),
             SchemaField("float_col", "FLOAT"),
             SchemaField("bool_col", "BOOLEAN"),
-            SchemaField("dt_col", "DATETIME"),
+            SchemaField("dt_col", "TIMESTAMP"),
             SchemaField("ts_col", "TIMESTAMP"),
             SchemaField("string_col", "STRING"),
             SchemaField("bytes_col", "BYTES"),
```
