
Commit 8c488e6

feat: add reference file schema option for federated formats (#2269)
* feat: add reference file schema option for federated formats
* 🦉 Updates from OwlBot post-processor (see https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md)
* chore: fix clirr check
* chore: add assertion to tests
* chore: add create external table tests
* chore: delete table for external table after testing
* comment
* cleanup
* chore: remove enforced login from library code

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 4dd963b commit 8c488e6

File tree

4 files changed: +257 -1 lines changed

google-cloud-bigquery/clirr-ignored-differences.xml

Lines changed: 5 additions & 0 deletions
@@ -14,4 +14,9 @@
   <method>com.google.api.services.bigquery.model.GetQueryResultsResponse getQueryResultsWithRowLimit(java.lang.String, java.lang.String, java.lang.String, java.lang.Integer)</method>
   <justification>getQueryResultsWithRowLimit is just used by ConnectionImpl at the moment so it should be fine to update the signature instead of writing an overloaded method</justification>
 </difference>
+<difference>
+  <differenceType>7013</differenceType>
+  <className>com/google/cloud/bigquery/ExternalTableDefinition*</className>
+  <method>*ReferenceFileSchemaUri(*)</method>
+</difference>
 </differences>

google-cloud-bigquery/src/main/java/com/google/cloud/bigquery/ExternalTableDefinition.java

Lines changed: 22 additions & 0 deletions
@@ -157,6 +157,14 @@ public Builder setHivePartitioningOptions(HivePartitioningOptions hivePartitioni
       return setHivePartitioningOptionsInner(hivePartitioningOptions);
     };

+    /**
+     * When creating an external table, the user can provide a reference file with the table schema.
+     * This is enabled for the following formats: AVRO, PARQUET, ORC.
+     *
+     * @param referenceFileSchemaUri or {@code null} for none
+     */
+    public abstract Builder setReferenceFileSchemaUri(String referenceFileSchemaUri);
+
     abstract Builder setHivePartitioningOptionsInner(
         HivePartitioningOptions hivePartitioningOptions);
@@ -250,6 +258,9 @@ public <F extends FormatOptions> F getFormatOptions() {
   @Nullable
   public abstract Boolean getAutodetect();

+  @Nullable
+  public abstract String getReferenceFileSchemaUri();
+
   /**
    * [Experimental] Returns the HivePartitioningOptions when the data layout follows Hive
    * partitioning convention
@@ -317,6 +328,10 @@ com.google.api.services.bigquery.model.ExternalDataConfiguration toExternalDataC
     if (getAutodetect() != null) {
       externalConfigurationPb.setAutodetect(getAutodetect());
     }
+    if (getReferenceFileSchemaUri() != null) {
+      externalConfigurationPb.setReferenceFileSchemaUri(getReferenceFileSchemaUri());
+    }
+
     if (getHivePartitioningOptions() != null) {
       externalConfigurationPb.setHivePartitioningOptions(getHivePartitioningOptions().toPb());
     }
@@ -486,6 +501,9 @@ static ExternalTableDefinition fromPb(Table tablePb) {
       builder.setHivePartitioningOptions(
           HivePartitioningOptions.fromPb(externalDataConfiguration.getHivePartitioningOptions()));
     }
+    if (externalDataConfiguration.getReferenceFileSchemaUri() != null) {
+      builder.setReferenceFileSchemaUri(externalDataConfiguration.getReferenceFileSchemaUri());
+    }
   }
   return builder.build();
 }
@@ -538,10 +556,14 @@ static ExternalTableDefinition fromExternalDataConfiguration(
     if (externalDataConfiguration.getAutodetect() != null) {
       builder.setAutodetect(externalDataConfiguration.getAutodetect());
     }
+    if (externalDataConfiguration.getReferenceFileSchemaUri() != null) {
+      builder.setReferenceFileSchemaUri(externalDataConfiguration.getReferenceFileSchemaUri());
+    }
     if (externalDataConfiguration.getHivePartitioningOptions() != null) {
       builder.setHivePartitioningOptions(
           HivePartitioningOptions.fromPb(externalDataConfiguration.getHivePartitioningOptions()));
     }
+
     return builder.build();
   }
 }
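
For reference, here is how the new builder option can be used when defining an external table. This is a minimal sketch, not part of the commit: the dataset name, table name, and gs:// paths (my_dataset, my_external_table, my-bucket) are hypothetical placeholders, and error handling is omitted.

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.ExternalTableDefinition;
import com.google.cloud.bigquery.FormatOptions;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TableInfo;

public class CreateExternalTableWithReferenceSchema {
  public static void main(String[] args) {
    BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();

    // Hypothetical names and paths; replace with your own resources.
    TableId tableId = TableId.of("my_dataset", "my_external_table");
    String sourceUri = "gs://my-bucket/data/*.parquet";
    String referenceFile = "gs://my-bucket/data/reference.parquet";

    // Without a reference file, BigQuery infers the schema from the
    // lexicographically last file matching the source URI pattern (see the
    // integration tests below); the reference file pins the schema explicitly.
    ExternalTableDefinition definition =
        ExternalTableDefinition.newBuilder(sourceUri, FormatOptions.parquet())
            .setReferenceFileSchemaUri(referenceFile)
            .build();

    Table table = bigquery.create(TableInfo.of(tableId, definition));
    System.out.println("Created external table " + table.getTableId());
  }
}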

google-cloud-bigquery/src/main/java/com/google/cloud/bigquery/LoadJobConfiguration.java

Lines changed: 28 additions & 1 deletion
@@ -56,6 +56,7 @@ public final class LoadJobConfiguration extends JobConfiguration implements Load
   private final Long jobTimeoutMs;
   private final RangePartitioning rangePartitioning;
   private final HivePartitioningOptions hivePartitioningOptions;
+  private final String referenceFileSchemaUri;

   public static final class Builder extends JobConfiguration.Builder<LoadJobConfiguration, Builder>
       implements LoadConfiguration.Builder {
@@ -81,6 +82,7 @@ public static final class Builder extends JobConfiguration.Builder<LoadJobConfig
     private Long jobTimeoutMs;
     private RangePartitioning rangePartitioning;
     private HivePartitioningOptions hivePartitioningOptions;
+    private String referenceFileSchemaUri;

     private Builder() {
       super(Type.LOAD);
@@ -109,6 +111,7 @@ private Builder(LoadJobConfiguration loadConfiguration) {
       this.jobTimeoutMs = loadConfiguration.jobTimeoutMs;
       this.rangePartitioning = loadConfiguration.rangePartitioning;
       this.hivePartitioningOptions = loadConfiguration.hivePartitioningOptions;
+      this.referenceFileSchemaUri = loadConfiguration.referenceFileSchemaUri;
     }

     private Builder(com.google.api.services.bigquery.model.JobConfiguration configurationPb) {
@@ -199,6 +202,9 @@ private Builder(com.google.api.services.bigquery.model.JobConfiguration configur
         this.hivePartitioningOptions =
             HivePartitioningOptions.fromPb(loadConfigurationPb.getHivePartitioningOptions());
       }
+      if (loadConfigurationPb.getReferenceFileSchemaUri() != null) {
+        this.referenceFileSchemaUri = loadConfigurationPb.getReferenceFileSchemaUri();
+      }
     }

     @Override
@@ -351,6 +357,17 @@ public Builder setHivePartitioningOptions(HivePartitioningOptions hivePartitioni
       return this;
     }

+    /**
+     * When creating an external table, the user can provide a reference file with the table schema.
+     * This is enabled for the following formats: AVRO, PARQUET, ORC.
+     *
+     * @param referenceFileSchemaUri or {@code null} for none
+     */
+    public Builder setReferenceFileSchemaUri(String referenceFileSchemaUri) {
+      this.referenceFileSchemaUri = referenceFileSchemaUri;
+      return this;
+    }
+
     @Override
     public LoadJobConfiguration build() {
       return new LoadJobConfiguration(this);
@@ -379,6 +396,7 @@ private LoadJobConfiguration(Builder builder) {
     this.jobTimeoutMs = builder.jobTimeoutMs;
     this.rangePartitioning = builder.rangePartitioning;
     this.hivePartitioningOptions = builder.hivePartitioningOptions;
+    this.referenceFileSchemaUri = builder.referenceFileSchemaUri;
   }

   @Override
@@ -498,6 +516,10 @@ public HivePartitioningOptions getHivePartitioningOptions() {
     return hivePartitioningOptions;
   }

+  public String getReferenceFileSchemaUri() {
+    return referenceFileSchemaUri;
+  }
+
   @Override
   public Builder toBuilder() {
     return new Builder(this);
@@ -525,7 +547,8 @@ ToStringHelper toStringHelper() {
         .add("labels", labels)
         .add("jobTimeoutMs", jobTimeoutMs)
         .add("rangePartitioning", rangePartitioning)
-        .add("hivePartitioningOptions", hivePartitioningOptions);
+        .add("hivePartitioningOptions", hivePartitioningOptions)
+        .add("referenceFileSchemaUri", referenceFileSchemaUri);
   }

   @Override
@@ -628,6 +651,10 @@ com.google.api.services.bigquery.model.JobConfiguration toPb() {
     if (hivePartitioningOptions != null) {
       loadConfigurationPb.setHivePartitioningOptions(hivePartitioningOptions.toPb());
     }
+    if (referenceFileSchemaUri != null) {
+      loadConfigurationPb.setReferenceFileSchemaUri(referenceFileSchemaUri);
+    }
+
     jobConfiguration.setLoad(loadConfigurationPb);
     return jobConfiguration;
   }
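
The load-job surface mirrors the same option. Another minimal sketch, under the same assumptions (hypothetical dataset, table, and gs:// paths; Guava's ImmutableList as used elsewhere in this library):

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.FormatOptions;
import com.google.cloud.bigquery.Job;
import com.google.cloud.bigquery.JobInfo;
import com.google.cloud.bigquery.LoadJobConfiguration;
import com.google.cloud.bigquery.TableId;
import com.google.common.collect.ImmutableList;
import java.util.List;

public class LoadWithReferenceSchema {
  public static void main(String[] args) throws InterruptedException {
    BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();

    // Hypothetical names and paths; replace with your own resources.
    TableId tableId = TableId.of("my_dataset", "my_table");
    List<String> sourceUris = ImmutableList.of("gs://my-bucket/data/*.avro");
    String referenceFile = "gs://my-bucket/data/reference.avro";

    // The destination table takes its schema from the reference file rather
    // than from whichever source file BigQuery would pick by default.
    LoadJobConfiguration configuration =
        LoadJobConfiguration.newBuilder(tableId, sourceUris, FormatOptions.avro())
            .setReferenceFileSchemaUri(referenceFile)
            .build();

    Job job = bigquery.create(JobInfo.of(configuration));
    // waitFor() blocks until the load job either succeeds or fails.
    job = job.waitFor();
    System.out.println("Load job done: " + job.isDone());
  }
}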

google-cloud-bigquery/src/test/java/com/google/cloud/bigquery/it/ITBigQueryTest.java

Lines changed: 202 additions & 0 deletions
@@ -65,6 +65,7 @@
 import com.google.cloud.bigquery.ExternalTableDefinition;
 import com.google.cloud.bigquery.ExtractJobConfiguration;
 import com.google.cloud.bigquery.Field;
+import com.google.cloud.bigquery.Field.Mode;
 import com.google.cloud.bigquery.FieldList;
 import com.google.cloud.bigquery.FieldValue;
 import com.google.cloud.bigquery.FieldValue.Attribute;
@@ -4586,4 +4587,205 @@ public void testPreserveAsciiControlCharacters()
     assertEquals("\u0000", row.get(0).getStringValue());
     assertTrue(bigquery.delete(tableId));
   }
+
+  @Test
+  public void testReferenceFileSchemaUriForAvro() {
+    try {
+      String destinationTableName = "test_reference_file_schema_avro";
+      TableId tableId = TableId.of(DATASET, destinationTableName);
+      Schema expectedSchema =
+          Schema.of(
+              Field.newBuilder("username", StandardSQLTypeName.STRING)
+                  .setMode(Mode.NULLABLE)
+                  .build(),
+              Field.newBuilder("tweet", StandardSQLTypeName.STRING).setMode(Mode.NULLABLE).build(),
+              Field.newBuilder("timestamp", StandardSQLTypeName.STRING)
+                  .setMode(Mode.NULLABLE)
+                  .build(),
+              Field.newBuilder("likes", StandardSQLTypeName.INT64).setMode(Mode.NULLABLE).build());
+
+      // By default, the table should have c-twitter schema because it is lexicographically last.
+      // a-twitter schema (username, tweet, timestamp, likes)
+      // b-twitter schema (username, tweet, timestamp)
+      // c-twitter schema (username, tweet)
+      List<String> SOURCE_URIS =
+          ImmutableList.of(
+              "gs://"
+                  + CLOUD_SAMPLES_DATA
+                  + "/bigquery/federated-formats-reference-file-schema/a-twitter.avro",
+              "gs://"
+                  + CLOUD_SAMPLES_DATA
+                  + "/bigquery/federated-formats-reference-file-schema/b-twitter.avro",
+              "gs://"
+                  + CLOUD_SAMPLES_DATA
+                  + "/bigquery/federated-formats-reference-file-schema/c-twitter.avro");
+
+      // Because referenceFileSchemaUri is set as a-twitter, the table will have a-twitter schema
+      String referenceFileSchema =
+          "gs://"
+              + CLOUD_SAMPLES_DATA
+              + "/bigquery/federated-formats-reference-file-schema/a-twitter.avro";
+
+      LoadJobConfiguration loadJobConfiguration =
+          LoadJobConfiguration.newBuilder(tableId, SOURCE_URIS, FormatOptions.avro())
+              .setReferenceFileSchemaUri(referenceFileSchema)
+              .build();
+
+      Job job = bigquery.create(JobInfo.of(loadJobConfiguration));
+      // Blocks until this load table job completes its execution, either failing or succeeding.
+      job = job.waitFor();
+      assertEquals(true, job.isDone());
+
+      LoadJobConfiguration actualLoadJobConfiguration = job.getConfiguration();
+      Table generatedTable = bigquery.getTable(actualLoadJobConfiguration.getDestinationTable());
+
+      assertEquals(expectedSchema, generatedTable.getDefinition().getSchema());
+      // clean up after test to avoid conflict with other tests
+      boolean success = bigquery.delete(tableId);
+      assertEquals(true, success);
+    } catch (BigQueryException | InterruptedException e) {
+      System.out.println("Column not added during load append \n" + e.toString());
+    }
+  }
+
+  @Test
+  public void testReferenceFileSchemaUriForParquet() {
+    try {
+      String destinationTableName = "test_reference_file_schema_parquet";
+      TableId tableId = TableId.of(DATASET, destinationTableName);
+      Schema expectedSchema =
+          Schema.of(
+              Field.newBuilder("username", StandardSQLTypeName.STRING)
+                  .setMode(Mode.NULLABLE)
+                  .build(),
+              Field.newBuilder("tweet", StandardSQLTypeName.STRING).setMode(Mode.NULLABLE).build(),
+              Field.newBuilder("timestamp", StandardSQLTypeName.STRING)
+                  .setMode(Mode.NULLABLE)
+                  .build(),
+              Field.newBuilder("likes", StandardSQLTypeName.INT64).setMode(Mode.NULLABLE).build());
+
+      // By default, the table should have c-twitter schema because it is lexicographically last.
+      // a-twitter schema (username, tweet, timestamp, likes)
+      // b-twitter schema (username, tweet, timestamp)
+      // c-twitter schema (username, tweet)
+      List<String> SOURCE_URIS =
+          ImmutableList.of(
+              "gs://"
+                  + CLOUD_SAMPLES_DATA
+                  + "/bigquery/federated-formats-reference-file-schema/a-twitter.parquet",
+              "gs://"
+                  + CLOUD_SAMPLES_DATA
+                  + "/bigquery/federated-formats-reference-file-schema/b-twitter.parquet",
+              "gs://"
+                  + CLOUD_SAMPLES_DATA
+                  + "/bigquery/federated-formats-reference-file-schema/c-twitter.parquet");
+
+      // Because referenceFileSchemaUri is set as a-twitter, the table will have a-twitter schema
+      String referenceFileSchema =
+          "gs://"
+              + CLOUD_SAMPLES_DATA
+              + "/bigquery/federated-formats-reference-file-schema/a-twitter.parquet";
+
+      LoadJobConfiguration loadJobConfiguration =
+          LoadJobConfiguration.newBuilder(tableId, SOURCE_URIS, FormatOptions.parquet())
+              .setReferenceFileSchemaUri(referenceFileSchema)
+              .build();
+
+      Job job = bigquery.create(JobInfo.of(loadJobConfiguration));
+      // Blocks until this load table job completes its execution, either failing or succeeding.
+      job = job.waitFor();
+      assertEquals(true, job.isDone());
+      LoadJobConfiguration actualLoadJobConfiguration = job.getConfiguration();
+      Table generatedTable = bigquery.getTable(actualLoadJobConfiguration.getDestinationTable());
+
+      assertEquals(expectedSchema, generatedTable.getDefinition().getSchema());
+      // clean up after test to avoid conflict with other tests
+      boolean success = bigquery.delete(tableId);
+      assertEquals(true, success);
+    } catch (BigQueryException | InterruptedException e) {
+      System.out.println("Column not added during load append \n" + e.toString());
+    }
+  }
+
+  @Test
+  public void testCreateExternalTableWithReferenceFileSchemaAvro() {
+    String destinationTableName = "test_create_external_table_reference_file_schema_avro";
+    TableId tableId = TableId.of(DATASET, destinationTableName);
+    Schema expectedSchema =
+        Schema.of(
+            Field.newBuilder("username", StandardSQLTypeName.STRING).setMode(Mode.NULLABLE).build(),
+            Field.newBuilder("tweet", StandardSQLTypeName.STRING).setMode(Mode.NULLABLE).build(),
+            Field.newBuilder("timestamp", StandardSQLTypeName.STRING)
+                .setMode(Mode.NULLABLE)
+                .build(),
+            Field.newBuilder("likes", StandardSQLTypeName.INT64).setMode(Mode.NULLABLE).build());
+    String CLOUD_SAMPLES_DATA = "cloud-samples-data";
+
+    // By default, the table should have c-twitter schema because it is lexicographically last.
+    // a-twitter schema (username, tweet, timestamp, likes)
+    // b-twitter schema (username, tweet, timestamp)
+    // c-twitter schema (username, tweet)
+    String SOURCE_URI =
+        "gs://" + CLOUD_SAMPLES_DATA + "/bigquery/federated-formats-reference-file-schema/*.avro";
+
+    // Because referenceFileSchemaUri is set as a-twitter, the table will have a-twitter schema
+    String referenceFileSchema =
+        "gs://"
+            + CLOUD_SAMPLES_DATA
+            + "/bigquery/federated-formats-reference-file-schema/a-twitter.avro";
+
+    ExternalTableDefinition externalTableDefinition =
+        ExternalTableDefinition.newBuilder(SOURCE_URI, FormatOptions.avro())
+            .setReferenceFileSchemaUri(referenceFileSchema)
+            .build();
+    TableInfo tableInfo = TableInfo.of(tableId, externalTableDefinition);
+    Table createdTable = bigquery.create(tableInfo);
+    Table generatedTable = bigquery.getTable(createdTable.getTableId());
+    assertEquals(expectedSchema, generatedTable.getDefinition().getSchema());
+    // clean up after test to avoid conflict with other tests
+    boolean success = bigquery.delete(tableId);
+    assertEquals(true, success);
+  }
+
+  @Test
+  public void testCreateExternalTableWithReferenceFileSchemaParquet() {
+    String destinationTableName = "test_create_external_table_reference_file_schema_parquet";
+    TableId tableId = TableId.of(DATASET, destinationTableName);
+    Schema expectedSchema =
+        Schema.of(
+            Field.newBuilder("username", StandardSQLTypeName.STRING).setMode(Mode.NULLABLE).build(),
+            Field.newBuilder("tweet", StandardSQLTypeName.STRING).setMode(Mode.NULLABLE).build(),
+            Field.newBuilder("timestamp", StandardSQLTypeName.STRING)
+                .setMode(Mode.NULLABLE)
+                .build(),
+            Field.newBuilder("likes", StandardSQLTypeName.INT64).setMode(Mode.NULLABLE).build());
+    String CLOUD_SAMPLES_DATA = "cloud-samples-data";
+
+    // By default, the table should have c-twitter schema because it is lexicographically last.
+    // a-twitter schema (username, tweet, timestamp, likes)
+    // b-twitter schema (username, tweet, timestamp)
+    // c-twitter schema (username, tweet)
+    String SOURCE_URI =
+        "gs://"
+            + CLOUD_SAMPLES_DATA
+            + "/bigquery/federated-formats-reference-file-schema/*.parquet";
+
+    // Because referenceFileSchemaUri is set as a-twitter, the table will have a-twitter schema
+    String referenceFileSchema =
+        "gs://"
+            + CLOUD_SAMPLES_DATA
+            + "/bigquery/federated-formats-reference-file-schema/a-twitter.parquet";
+
+    ExternalTableDefinition externalTableDefinition =
+        ExternalTableDefinition.newBuilder(SOURCE_URI, FormatOptions.parquet())
+            .setReferenceFileSchemaUri(referenceFileSchema)
+            .build();
+    TableInfo tableInfo = TableInfo.of(tableId, externalTableDefinition);
+    Table createdTable = bigquery.create(tableInfo);
+    Table generatedTable = bigquery.getTable(createdTable.getTableId());
+    assertEquals(expectedSchema, generatedTable.getDefinition().getSchema());
+    // clean up after test to avoid conflict with other tests
+    boolean success = bigquery.delete(tableId);
+    assertEquals(true, success);
+  }
 }
