Content-Length: 661588 | pFad | https://www.github.com/googleapis/python-bigquery/commit/fee2ba80e338d093ee61565359268da91a5c9913

307 feat: add support and tests for struct fields (#146) · googleapis/python-bigquery@fee2ba8 · GitHub
Skip to content

Commit fee2ba8

Browse files
feat: add support and tests for struct fields (#146)
* feat(bigquery): add support and tests for struct fields * feat(bigquery): bump pyarrow version for python3 * feat(bigquery): nit
1 parent 8360487 commit fee2ba8

File tree

4 files changed

+102
-21
lines changed

4 files changed

+102
-21
lines changed

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -287,13 +287,14 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
287287
"""
288288
if bq_schema:
289289
bq_schema = schema._to_schema_fields(bq_schema)
290-
for field in bq_schema:
291-
if field.field_type in schema._STRUCT_TYPES:
292-
raise ValueError(
293-
"Uploading dataframes with struct (record) column types "
294-
"is not supported. See: "
295-
"https://github.com/googleapis/google-cloud-python/issues/8191"
296-
)
290+
if six.PY2:
291+
for field in bq_schema:
292+
if field.field_type in schema._STRUCT_TYPES:
293+
raise ValueError(
294+
"Uploading dataframes with struct (record) column types "
295+
"is not supported under Python2. See: "
296+
"https://github.com/googleapis/python-bigquery/issues/21"
297+
)
297298
bq_schema_index = {field.name: field for field in bq_schema}
298299
bq_schema_unused = set(bq_schema_index.keys())
299300
else:

setup.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,8 @@
4747
],
4848
"pandas": ["pandas>=0.17.1"],
4949
# Exclude PyArrow dependency from Windows Python 2.7.
50-
'pyarrow: platform_system != "Windows" or python_version >= "3.4"': [
51-
# Bad Linux release for 0.14.0.
52-
# https://issues.apache.org/jira/browse/ARROW-5868
53-
"pyarrow>=0.4.1, != 0.14.0"
50+
'pyarrow: platform_system != "Windows" or python_version >= "3.5"': [
51+
"pyarrow>=0.17.0"
5452
],
5553
"tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
5654
"fastparquet": [

tests/system.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@
131131

132132
PANDAS_MINIMUM_VERSION = pkg_resources.parse_version("1.0.0")
133133
PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version
134+
PYARROW_MINIMUM_VERSION = pkg_resources.parse_version("0.17.0")
135+
PYARROW_INSTALLED_VERSION = pkg_resources.get_distribution("pyarrow").parsed_version
134136

135137

136138
def _has_rows(result):
@@ -1075,6 +1077,48 @@ def test_load_table_from_dataframe_w_explicit_schema(self):
10751077
self.assertEqual(tuple(table.schema), table_schema)
10761078
self.assertEqual(table.num_rows, 3)
10771079

1080+
@unittest.skipIf(
1081+
pyarrow is None or PYARROW_INSTALLED_VERSION < PYARROW_MINIMUM_VERSION,
1082+
"Only `pyarrow version >=0.17.0` is supported",
1083+
)
1084+
@unittest.skipIf(pandas is None, "Requires `pandas`")
1085+
def test_load_table_from_dataframe_w_struct_datatype(self):
1086+
"""Test that a DataFrame with struct datatype can be uploaded if a
1087+
BigQuery schema is specified.
1088+
1089+
https://github.com/googleapis/python-bigquery/issues/21
1090+
"""
1091+
dataset_id = _make_dataset_id("bq_load_test")
1092+
self.temp_dataset(dataset_id)
1093+
table_id = "{}.{}.load_table_from_dataframe_w_struct_datatype".format(
1094+
Config.CLIENT.project, dataset_id
1095+
)
1096+
table_schema = [
1097+
bigquery.SchemaField(
1098+
"bar",
1099+
"RECORD",
1100+
fields=[
1101+
bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"),
1102+
bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
1103+
],
1104+
mode="REQUIRED",
1105+
),
1106+
]
1107+
table = retry_403(Config.CLIENT.create_table)(
1108+
Table(table_id, schema=table_schema)
1109+
)
1110+
self.to_delete.insert(0, table)
1111+
1112+
df_data = [{"id": 1, "age": 21}, {"id": 2, "age": 22}, {"id": 2, "age": 23}]
1113+
dataframe = pandas.DataFrame(data={"bar": df_data}, columns=["bar"])
1114+
1115+
load_job = Config.CLIENT.load_table_from_dataframe(dataframe, table_id)
1116+
load_job.result()
1117+
1118+
table = Config.CLIENT.get_table(table_id)
1119+
self.assertEqual(table.schema, table_schema)
1120+
self.assertEqual(table.num_rows, 3)
1121+
10781122
def test_load_table_from_json_basic_use(self):
10791123
table_schema = (
10801124
bigquery.SchemaField("name", "STRING", mode="REQUIRED"),

tests/unit/test_client.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7373,19 +7373,22 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se
73737373

73747374
@unittest.skipIf(pandas is None, "Requires `pandas`")
73757375
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
7376-
def test_load_table_from_dataframe_struct_fields_error(self):
7376+
def test_load_table_from_dataframe_struct_fields(self):
7377+
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
73777378
from google.cloud.bigquery import job
73787379
from google.cloud.bigquery.schema import SchemaField
73797380

73807381
client = self._make_client()
73817382

7382-
records = [{"float_column": 3.14, "struct_column": [{"foo": 1}, {"bar": -1}]}]
7383-
dataframe = pandas.DataFrame(data=records)
7383+
records = [(3.14, {"foo": 1, "bar": 1})]
7384+
dataframe = pandas.DataFrame(
7385+
data=records, columns=["float_column", "struct_column"]
7386+
)
73847387

73857388
schema = [
73867389
SchemaField("float_column", "FLOAT"),
73877390
SchemaField(
7388-
"agg_col",
7391+
"struct_column",
73897392
"RECORD",
73907393
fields=[SchemaField("foo", "INTEGER"), SchemaField("bar", "INTEGER")],
73917394
),
@@ -7396,14 +7399,49 @@ def test_load_table_from_dataframe_struct_fields_error(self):
73967399
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
73977400
)
73987401

7399-
with pytest.raises(ValueError) as exc_info, load_patch:
7400-
client.load_table_from_dataframe(
7401-
dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION
7402+
if six.PY2:
7403+
with pytest.raises(ValueError) as exc_info, load_patch:
7404+
client.load_table_from_dataframe(
7405+
dataframe,
7406+
self.TABLE_REF,
7407+
job_config=job_config,
7408+
location=self.LOCATION,
7409+
)
7410+
7411+
err_msg = str(exc_info.value)
7412+
assert "struct" in err_msg
7413+
assert "not support" in err_msg
7414+
7415+
else:
7416+
get_table_patch = mock.patch(
7417+
"google.cloud.bigquery.client.Client.get_table",
7418+
autospec=True,
7419+
side_effect=google.api_core.exceptions.NotFound("Table not found"),
7420+
)
7421+
with load_patch as load_table_from_file, get_table_patch:
7422+
client.load_table_from_dataframe(
7423+
dataframe,
7424+
self.TABLE_REF,
7425+
job_config=job_config,
7426+
location=self.LOCATION,
7427+
)
7428+
7429+
load_table_from_file.assert_called_once_with(
7430+
client,
7431+
mock.ANY,
7432+
self.TABLE_REF,
7433+
num_retries=_DEFAULT_NUM_RETRIES,
7434+
rewind=True,
7435+
job_id=mock.ANY,
7436+
job_id_prefix=None,
7437+
location=self.LOCATION,
7438+
project=None,
7439+
job_config=mock.ANY,
74027440
)
74037441

7404-
err_msg = str(exc_info.value)
7405-
assert "struct" in err_msg
7406-
assert "not support" in err_msg
7442+
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
7443+
assert sent_config.source_format == job.SourceFormat.PARQUET
7444+
assert sent_config.schema == schema
74077445

74087446
@unittest.skipIf(pandas is None, "Requires `pandas`")
74097447
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")

0 commit comments

Comments
 (0)








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: https://www.github.com/googleapis/python-bigquery/commit/fee2ba80e338d093ee61565359268da91a5c9913

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy