Skip to content

Commit f89df1f

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: add support for SharePoint as an ImportRagFiles source.
PiperOrigin-RevId: 677936135
1 parent b456ce3 commit f89df1f

File tree

6 files changed

+297
-5
lines changed

6 files changed

+297
-5
lines changed

tests/unit/vertex_rag/test_rag_constants.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
RagCorpus,
2525
RagFile,
2626
RagResource,
27+
SharePointSource,
28+
SharePointSources,
2729
SlackChannelsSource,
2830
SlackChannel,
2931
JiraSource,
@@ -42,6 +44,7 @@
4244
JiraSource as GapicJiraSource,
4345
RagCorpus as GapicRagCorpus,
4446
RagFile as GapicRagFile,
47+
SharePointSources as GapicSharePointSources,
4548
SlackSource as GapicSlackSource,
4649
RagContexts,
4750
RetrieveContextsResponse,
@@ -390,6 +393,122 @@
390393
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_JIRA_SOURCE,
391394
)
392395

396+
# SharePoint sources
397+
TEST_SHARE_POINT_SOURCE = SharePointSources(
398+
share_point_sources=[
399+
SharePointSource(
400+
sharepoint_folder_path="test-sharepoint-folder-path",
401+
drive_name="test-drive-name",
402+
client_id="test-client-id",
403+
client_secret="test-client-secret",
404+
tenant_id="test-tenant-id",
405+
sharepoint_site_name="test-sharepoint-site-name",
406+
)
407+
],
408+
)
409+
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE = ImportRagFilesConfig(
410+
rag_file_chunking_config=RagFileChunkingConfig(
411+
chunk_size=TEST_CHUNK_SIZE,
412+
chunk_overlap=TEST_CHUNK_OVERLAP,
413+
),
414+
share_point_sources=GapicSharePointSources(
415+
share_point_sources=[
416+
GapicSharePointSources.SharePointSource(
417+
sharepoint_folder_path="test-sharepoint-folder-path",
418+
drive_name="test-drive-name",
419+
client_id="test-client-id",
420+
client_secret=api_auth.ApiAuth.ApiKeyConfig(
421+
api_key_secret_version="test-client-secret"
422+
),
423+
tenant_id="test-tenant-id",
424+
sharepoint_site_name="test-sharepoint-site-name",
425+
)
426+
]
427+
),
428+
)
429+
430+
TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE = ImportRagFilesRequest(
431+
parent=TEST_RAG_CORPUS_RESOURCE_NAME,
432+
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE,
433+
)
434+
435+
TEST_SHARE_POINT_SOURCE_2_DRIVES = SharePointSources(
436+
share_point_sources=[
437+
SharePointSource(
438+
sharepoint_folder_path="test-sharepoint-folder-path",
439+
drive_name="test-drive-name",
440+
drive_id="test-drive-id",
441+
client_id="test-client-id",
442+
client_secret="test-client-secret",
443+
tenant_id="test-tenant-id",
444+
sharepoint_site_name="test-sharepoint-site-name",
445+
)
446+
],
447+
)
448+
449+
TEST_SHARE_POINT_SOURCE_NO_DRIVES = SharePointSources(
450+
share_point_sources=[
451+
SharePointSource(
452+
sharepoint_folder_path="test-sharepoint-folder-path",
453+
client_id="test-client-id",
454+
client_secret="test-client-secret",
455+
tenant_id="test-tenant-id",
456+
sharepoint_site_name="test-sharepoint-site-name",
457+
)
458+
],
459+
)
460+
461+
TEST_SHARE_POINT_SOURCE_2_FOLDERS = SharePointSources(
462+
share_point_sources=[
463+
SharePointSource(
464+
sharepoint_folder_path="test-sharepoint-folder-path",
465+
sharepoint_folder_id="test-sharepoint-folder-id",
466+
drive_name="test-drive-name",
467+
client_id="test-client-id",
468+
client_secret="test-client-secret",
469+
tenant_id="test-tenant-id",
470+
sharepoint_site_name="test-sharepoint-site-name",
471+
)
472+
],
473+
)
474+
475+
TEST_SHARE_POINT_SOURCE_NO_FOLDERS = SharePointSources(
476+
share_point_sources=[
477+
SharePointSource(
478+
drive_name="test-drive-name",
479+
client_id="test-client-id",
480+
client_secret="test-client-secret",
481+
tenant_id="test-tenant-id",
482+
sharepoint_site_name="test-sharepoint-site-name",
483+
)
484+
],
485+
)
486+
487+
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
488+
rag_file_chunking_config=RagFileChunkingConfig(
489+
chunk_size=TEST_CHUNK_SIZE,
490+
chunk_overlap=TEST_CHUNK_OVERLAP,
491+
),
492+
share_point_sources=GapicSharePointSources(
493+
share_point_sources=[
494+
GapicSharePointSources.SharePointSource(
495+
drive_name="test-drive-name",
496+
client_id="test-client-id",
497+
client_secret=api_auth.ApiAuth.ApiKeyConfig(
498+
api_key_secret_version="test-client-secret"
499+
),
500+
tenant_id="test-tenant-id",
501+
sharepoint_site_name="test-sharepoint-site-name",
502+
)
503+
]
504+
),
505+
)
506+
507+
TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesRequest(
508+
parent=TEST_RAG_CORPUS_RESOURCE_NAME,
509+
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS,
510+
)
511+
393512
# Retrieval
394513
TEST_QUERY_TEXT = "What happen to the fox and the dog?"
395514
TEST_CONTEXTS = RagContexts(

tests/unit/vertex_rag/test_rag_data.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,56 @@ def test_prepare_import_files_request_jira_source(self):
563563
)
564564
import_files_request_eq(request, tc.TEST_IMPORT_REQUEST_JIRA_SOURCE)
565565

566+
def test_prepare_import_files_request_sharepoint_source(self):
567+
request = prepare_import_files_request(
568+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
569+
source=tc.TEST_SHARE_POINT_SOURCE,
570+
chunk_size=tc.TEST_CHUNK_SIZE,
571+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
572+
)
573+
import_files_request_eq(request, tc.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE)
574+
575+
def test_prepare_import_files_request_sharepoint_source_2_drives(self):
576+
with pytest.raises(ValueError) as e:
577+
prepare_import_files_request(
578+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
579+
source=tc.TEST_SHARE_POINT_SOURCE_2_DRIVES,
580+
chunk_size=tc.TEST_CHUNK_SIZE,
581+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
582+
)
583+
e.match("drive_name and drive_id cannot both be set.")
584+
585+
def test_prepare_import_files_request_sharepoint_source_2_folders(self):
586+
with pytest.raises(ValueError) as e:
587+
prepare_import_files_request(
588+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
589+
source=tc.TEST_SHARE_POINT_SOURCE_2_FOLDERS,
590+
chunk_size=tc.TEST_CHUNK_SIZE,
591+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
592+
)
593+
e.match("sharepoint_folder_path and sharepoint_folder_id cannot both be set.")
594+
595+
def test_prepare_import_files_request_sharepoint_source_no_drives(self):
596+
with pytest.raises(ValueError) as e:
597+
prepare_import_files_request(
598+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
599+
source=tc.TEST_SHARE_POINT_SOURCE_NO_DRIVES,
600+
chunk_size=tc.TEST_CHUNK_SIZE,
601+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
602+
)
603+
e.match("Either drive_name and drive_id must be set.")
604+
605+
def test_prepare_import_files_request_sharepoint_source_no_folders(self):
606+
request = prepare_import_files_request(
607+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
608+
source=tc.TEST_SHARE_POINT_SOURCE_NO_FOLDERS,
609+
chunk_size=tc.TEST_CHUNK_SIZE,
610+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
611+
)
612+
import_files_request_eq(
613+
request, tc.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS
614+
)
615+
566616
def test_set_embedding_model_config_set_both_error(self):
567617
embedding_model_config = rag.EmbeddingModelConfig(
568618
publisher_model="whatever",

vertexai/preview/rag/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
RagCorpus,
4545
RagFile,
4646
RagResource,
47+
SharePointSource,
48+
SharePointSources,
4749
SlackChannel,
4850
SlackChannelsSource,
4951
VertexFeatureStore,
@@ -61,6 +63,8 @@
6163
"RagFile",
6264
"RagResource",
6365
"Retrieval",
66+
"SharePointSource",
67+
"SharePointSources",
6468
"SlackChannel",
6569
"SlackChannelsSource",
6670
"VertexFeatureStore",

vertexai/preview/rag/rag_data.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
Pinecone,
4949
RagCorpus,
5050
RagFile,
51+
SharePointSources,
5152
SlackChannelsSource,
5253
VertexFeatureStore,
5354
VertexVectorSearch,
@@ -290,7 +291,7 @@ def upload_file(
290291
def import_files(
291292
corpus_name: str,
292293
paths: Optional[Sequence[str]] = None,
293-
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
294+
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
294295
chunk_size: int = 1024,
295296
chunk_overlap: int = 200,
296297
timeout: int = 600,
@@ -354,6 +355,19 @@ def import_files(
354355
chunk_overlap=100,
355356
)
356357
358+
# SharePoint Example.
359+
sharepoint_query = rag.SharePointSource(
360+
sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
361+
sharepoint_site_name="my-sharepoint-site.com",
362+
client_id="my-client-id",
363+
client_secret="my-client-secret",
364+
tenant_id="my-tenant-id",
365+
drive_id="my-drive-id",
366+
)
367+
source = rag.SharePointSources(
368+
share_point_sources=[sharepoint_query],
369+
)
370+
357371
# Return the number of imported RagFiles after completion.
358372
print(response.imported_rag_files_count)
359373
@@ -420,7 +434,7 @@ def import_files(
420434
async def import_files_async(
421435
corpus_name: str,
422436
paths: Optional[Sequence[str]] = None,
423-
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
437+
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
424438
chunk_size: int = 1024,
425439
chunk_overlap: int = 200,
426440
max_embedding_requests_per_min: int = 1000,
@@ -484,6 +498,19 @@ async def import_files_async(
484498
chunk_overlap=100,
485499
)
486500
501+
# SharePoint Example.
502+
sharepoint_query = rag.SharePointSource(
503+
sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
504+
sharepoint_site_name="my-sharepoint-site.com",
505+
client_id="my-client-id",
506+
client_secret="my-client-secret",
507+
tenant_id="my-tenant-id",
508+
drive_id="my-drive-id",
509+
)
510+
source = rag.SharePointSources(
511+
share_point_sources=[sharepoint_query],
512+
)
513+
487514
# Get the result.
488515
await response.result()
489516

vertexai/preview/rag/utils/_gapic_utils.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
RagFileParsingConfig,
2727
RagCorpus as GapicRagCorpus,
2828
RagFile as GapicRagFile,
29+
SharePointSources as GapicSharePointSources,
2930
SlackSource as GapicSlackSource,
3031
JiraSource as GapicJiraSource,
3132
RagVectorDbConfig,
@@ -41,6 +42,7 @@
4142
Pinecone,
4243
RagCorpus,
4344
RagFile,
45+
SharePointSources,
4446
SlackChannelsSource,
4547
JiraSource,
4648
VertexFeatureStore,
@@ -222,7 +224,7 @@ def convert_path_to_resource_id(
222224

223225

224226
def convert_source_for_rag_import(
225-
source: Union[SlackChannelsSource, JiraSource]
227+
source: Union[SlackChannelsSource, JiraSource, SharePointSources]
226228
) -> Union[GapicSlackSource, GapicJiraSource]:
227229
"""Converts a SlackChannelsSource, JiraSource, or SharePointSources to its GAPIC equivalent (GapicSlackSource, GapicJiraSource, or GapicSharePointSources)."""
228230
if isinstance(source, SlackChannelsSource):
@@ -269,14 +271,57 @@ def convert_source_for_rag_import(
269271
return GapicJiraSource(
270272
jira_queries=result_source_queries,
271273
)
274+
elif isinstance(source, SharePointSources):
275+
result_source_share_point_sources = []
276+
for share_point_source in source.share_point_sources:
277+
sharepoint_folder_path = share_point_source.sharepoint_folder_path
278+
sharepoint_folder_id = share_point_source.sharepoint_folder_id
279+
drive_name = share_point_source.drive_name
280+
drive_id = share_point_source.drive_id
281+
client_id = share_point_source.client_id
282+
client_secret = share_point_source.client_secret
283+
tenant_id = share_point_source.tenant_id
284+
sharepoint_site_name = share_point_source.sharepoint_site_name
285+
result_share_point_source = GapicSharePointSources.SharePointSource(
286+
client_id=client_id,
287+
client_secret=api_auth.ApiAuth.ApiKeyConfig(
288+
api_key_secret_version=client_secret
289+
),
290+
tenant_id=tenant_id,
291+
sharepoint_site_name=sharepoint_site_name,
292+
)
293+
if sharepoint_folder_path is not None and sharepoint_folder_id is not None:
294+
raise ValueError(
295+
"sharepoint_folder_path and sharepoint_folder_id cannot both be set."
296+
)
297+
elif sharepoint_folder_path is not None:
298+
result_share_point_source.sharepoint_folder_path = (
299+
sharepoint_folder_path
300+
)
301+
elif sharepoint_folder_id is not None:
302+
result_share_point_source.sharepoint_folder_id = sharepoint_folder_id
303+
if drive_name is not None and drive_id is not None:
304+
raise ValueError("drive_name and drive_id cannot both be set.")
305+
elif drive_name is not None:
306+
result_share_point_source.drive_name = drive_name
307+
elif drive_id is not None:
308+
result_share_point_source.drive_id = drive_id
309+
else:
310+
raise ValueError("Either drive_name and drive_id must be set.")
311+
result_source_share_point_sources.append(result_share_point_source)
312+
return GapicSharePointSources(
313+
share_point_sources=result_source_share_point_sources,
314+
)
272315
else:
273-
raise TypeError("source must be a SlackChannelsSource or JiraSource.")
316+
raise TypeError(
317+
"source must be a SlackChannelsSource or JiraSource or SharePointSources."
318+
)
274319

275320

276321
def prepare_import_files_request(
277322
corpus_name: str,
278323
paths: Optional[Sequence[str]] = None,
279-
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
324+
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
280325
chunk_size: int = 1024,
281326
chunk_overlap: int = 200,
282327
max_embedding_requests_per_min: int = 1000,
@@ -307,6 +352,8 @@ def prepare_import_files_request(
307352
import_rag_files_config.slack_source = gapic_source
308353
if isinstance(gapic_source, GapicJiraSource):
309354
import_rag_files_config.jira_source = gapic_source
355+
if isinstance(gapic_source, GapicSharePointSources):
356+
import_rag_files_config.share_point_sources = gapic_source
310357
else:
311358
uris = []
312359
resource_ids = []

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy