diff --git a/definitions/reporting/test_predictions.ipynb b/definitions/reporting/test_predictions.ipynb
new file mode 100644
index 0000000..d7ec89f
--- /dev/null
+++ b/definitions/reporting/test_predictions.ipynb
@@ -0,0 +1,1123 @@
+### Overview
+Overview: This demo will show how to use the RAG (retrieval augmented generation) pattern directly within BigQuery on a set of PDFs (unstructured data).
+
+Process Flow:
+1. Create a Cloud Storage bucket and copy seed data
+2. Populate BigQuery
+3. Create the external connections (Gemini, Vertex AI Embeddings, Vertex AI Document Processors, BigLake)
+4. Set the IAM permissions
+5. Create an object table
+6. Extract the text from each PDF using Vertex AI Document Processor
+7. Create embeddings using Vertex AI Text Embeddings
+8. Use Gemini by searching the embeddings (Vector Search) and injecting them into the context window
+9. Use Gemini with results Grounded with Google Search
+10. Use Gemini with “Response Schema” which will return our response in a structured format (typically JSON, but in BigQuery a table will be returned)
+
+
+
+Cost:
+* Approximate cost: Less than $1
+
+Author:
+* Adam Paternostro
+# Architecture Diagram
+from IPython.display import Image
+Image(url='https://storage.googleapis.com/data-analytics-golden-demo/colab-diagrams/BigQuery-Analytics-using-GenAI.png', width=1200)
+### Video Walkthrough
+[Video walkthrough (MP4)](https://storage.googleapis.com/data-analytics-golden-demo/colab-videos/BigQuery-Analytics-with-GenAI.mp4)
+
+
+from IPython.display import HTML
+
+HTML("""
+
+""")
+### License
+```
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+```
+### Initialize
+from PIL import Image
+from IPython.display import HTML
+import IPython.display
+import google.auth
+import requests
+import json
+import uuid
+import base64
+import os
+import cv2
+import random
+import time
+import datetime
+import base64
+import random
+import logging
+# Set these (run this cell to verify the output)
+
+# Change to "us", "eu", etc. This value is used as the location for the bucket,
+# the BigQuery dataset, the external connection and the document processors.
+location = "us"
+
+# Get the current date and time
+now = datetime.datetime.now()
+
+# Format the date and time as desired (year-month-day-hour-minute)
+formatted_date = now.strftime("%Y-%m-%d-%H-%M")
+
+# The project id is read from the environment (raises KeyError when the
+# GOOGLE_CLOUD_PROJECT variable is not set); the active account comes from gcloud.
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+user = !(gcloud auth list --filter=status:ACTIVE --format="value(account)")
+
+# The shell capture returns a list of output lines; exactly one active account is expected.
+if len(user) != 1:
+ raise RuntimeError(f"user is not set: {user}")
+user = user[0]
+
+print(f"project_id = {project_id}")
+print(f"user = {user}")
+### Prerequisite to run this notebook
+!gcloud services enable compute.googleapis.com \
+ bigquery.googleapis.com \
+ aiplatform.googleapis.com \
+ dataform.googleapis.com \
+ documentai.googleapis.com \
+ --project "{project_id}"
+### Helper Methods
+##### restAPIHelper
+Calls the Google Cloud REST API using the current user's credentials.
+def restAPIHelper(url: str, http_verb: str, request_body: str) -> str:
+ """Calls the Google Cloud REST API passing in the current users credentials"""
+
+ import requests
+ import google.auth
+ import json
+
+ # Get an access token based upon the current user
+ creds, project = google.auth.default()
+ auth_req = google.auth.transport.requests.Request()
+ creds.refresh(auth_req)
+ access_token=creds.token
+
+ headers = {
+ "Content-Type" : "application/json",
+ "Authorization" : "Bearer " + access_token
+ }
+
+ if http_verb == "GET":
+ response = requests.get(url, headers=headers)
+ elif http_verb == "POST":
+ response = requests.post(url, json=request_body, headers=headers)
+ elif http_verb == "PUT":
+ response = requests.put(url, json=request_body, headers=headers)
+ elif http_verb == "PATCH":
+ response = requests.patch(url, json=request_body, headers=headers)
+ elif http_verb == "DELETE":
+ response = requests.delete(url, headers=headers)
+ else:
+ raise RuntimeError(f"Unknown HTTP verb: {http_verb}")
+
+ if response.status_code == 200:
+ return json.loads(response.content)
+ #image_data = json.loads(response.content)["predictions"][0]["bytesBase64Encoded"]
+ else:
+ error = f"Error restAPIHelper -> ' Status: '{response.status_code}' Text: '{response.text}'"
+ raise RuntimeError(error)
+##### RunQuery (on BigQuery)
+def RunQuery(sql):
+ import time
+ from google.cloud import bigquery
+ client = bigquery.Client()
+
+ if (sql.startswith("SELECT") or sql.startswith("WITH")):
+ df_result = client.query(sql).to_dataframe()
+ return df_result
+ else:
+ job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)
+ query_job = client.query(sql, job_config=job_config)
+
+ # Check on the progress by getting the job's updated state.
+ query_job = client.get_job(
+ query_job.job_id, location=query_job.location
+ )
+ print("Job {} is currently in state {} with error result of {}".format(query_job.job_id, query_job.state, query_job.error_result))
+
+ while query_job.state != "DONE":
+ time.sleep(2)
+ query_job = client.get_job(
+ query_job.job_id, location=query_job.location
+ )
+ print("Job {} is currently in state {} with error result of {}".format(query_job.job_id, query_job.state, query_job.error_result))
+
+ if query_job.error_result == None:
+ return True
+ else:
+ raise Exception(query_job.error_result)
+##### Create Vertex AI connection
+def createExternal_BigLake_VertexAI_RemoteFunctions_Connection(project_id, location, connection_name):
+ """Creates a BigLake, Vertex AI, Remote Function connection."""
+
+ # First find the connection
+ # https://cloud.google.com/bigquery/docs/reference/bigqueryconnection/rest/v1/projects.locations.connections/list
+ url = f"https://bigqueryconnection.googleapis.com/v1/projects/{project_id}/locations/{location}/connections"
+
+ # Gather existing connections
+ json_result = restAPIHelper(url, "GET", None)
+ print(f"createBigLakeConnection (GET) json_result: {json_result}")
+
+ # Test to see if connection exists, if so return
+ if "connections" in json_result:
+ for item in json_result["connections"]:
+ print(f"BigLake Connection: {item['name']}")
+ # "projects/756740881369/locations/us/connections/biglake-notebook-connection"
+ # NOTE: We cannot test the complete name since it contains the project number and not id
+ if item["name"].endswith(f"/locations/{location}/connections/{connection_name}"):
+ print("Connection already exists")
+ serviceAccountId = item["cloudResource"]["serviceAccountId"]
+ return serviceAccountId
+
+ # Create the connection
+ # https://cloud.google.com/bigquery/docs/reference/bigqueryconnection/rest/v1/projects.locations.connections/create
+ print("Creating Vertex AI Connection")
+
+ url = f"https://bigqueryconnection.googleapis.com/v1/projects/{project_id}/locations/{location}/connections?connectionId={connection_name}"
+
+ request_body = {
+ "friendlyName": connection_name,
+ "description": "BigLake, Vertex AI, Remote Function connection",
+ "cloudResource": {}
+ }
+
+ json_result = restAPIHelper(url, "POST", request_body)
+
+ serviceAccountId = json_result["cloudResource"]["serviceAccountId"]
+ print("BigLake Connection created: ", serviceAccountId)
+ return serviceAccountId
+
+##### Create a GCS bucket
+def createGoogleCloudStorageBucket(project_id, bucket_name, location):
+ """Creates a Google Cloud Storage bucket."""
+
+ # First find the bucket
+ # https://cloud.google.com/storage/docs/json_api/v1/buckets/list
+ url = f"https://storage.googleapis.com/storage/v1/b?project={project_id}"
+
+ # Gather existing buckets
+ json_result = restAPIHelper(url, "GET", None)
+ print(f"createGoogleCloudStorageBucket (GET) json_result: {json_result}")
+
+ # Test to see if connection exists, if so return
+ if "items" in json_result:
+ for item in json_result["items"]:
+ print(f"Bucket Id / Name: ({item['id']} / {item['name']}")
+ if item["id"] == bucket_name:
+ print("Bucket already exists")
+ return
+
+ # Create the bucket
+ # https://cloud.google.com/storage/docs/json_api/v1/buckets/insert
+ print("Creating Google Cloud Bucket")
+
+ url = f"https://storage.googleapis.com/storage/v1/b?project={project_id}&predefinedAcl=private&predefinedDefaultObjectAcl=private&projection=noAcl"
+
+ request_body = {
+ "name": bucket_name,
+ "location": location
+ }
+
+ json_result = restAPIHelper(url, "POST", request_body)
+ print()
+ print(f"json_result: {json_result}")
+ print()
+ print("BigLake Bucket created: ", bucket_name)
+##### Set IAM permissions on bucket
+def setBucketIamPolicy(bucket_name, accountWithPrefix, role):
+ """Sets the bucket IAM policy."""
+
+ # Get the current bindings (if the account has access then skip)
+ # https://cloud.google.com/storage/docs/json_api/v1/buckets/getIamPolicy
+
+ url = f"https://storage.googleapis.com/storage/v1/b/{bucket_name}/iam"
+ json_result = restAPIHelper(url, "GET", None)
+ print(f"setBucketIamPolicy (GET) json_result: {json_result}")
+
+ # Test to see if permissions exist
+ if "bindings" in json_result:
+ for item in json_result["bindings"]:
+ members = item["members"]
+ for member in members:
+ if member == accountWithPrefix:
+ print("Permissions exist")
+ return
+
+ # Take the existing bindings and we need to append the new permission
+ # Otherwise we loose the existing permissions
+
+ bindings = json_result["bindings"]
+ new_permission = {
+ "role": role,
+ "members": [ accountWithPrefix ]
+ }
+
+ bindings.append(new_permission)
+
+ # https://cloud.google.com/storage/docs/json_api/v1/buckets/setIamPolicy
+ url = f"https://storage.googleapis.com/storage/v1/b/{bucket_name}/iam"
+
+ request_body = { "bindings" : bindings }
+
+ print(f"Permission bindings: {bindings}")
+
+
+ json_result = restAPIHelper(url, "PUT", request_body)
+ print()
+ print(f"json_result: {json_result}")
+ print()
+ print(f"Bucket IAM Permissions set for {accountWithPrefix} {role}")
+##### Set Project Level IAM Permissions
+def setProjectLevelIamPolicy(project_id, accountWithPrefix, role):
+ """Sets the Project Level IAM policy."""
+
+ # Get the current bindings (if the account has access then skip)
+ # https://cloud.google.com/resource-manager/reference/rest/v1/projects/getIamPolicy
+ url = f"https://cloudresourcemanager.googleapis.com/v1/projects/{project_id}:getIamPolicy"
+
+ request_body = { }
+ json_result = restAPIHelper(url, "POST", request_body)
+ print(f"setProjectLevelIamPolicy (GET) json_result: {json_result}")
+
+ # Test to see if permissions exist
+ if "bindings" in json_result:
+ for item in json_result["bindings"]:
+ if item["role"] == role:
+ members = item["members"]
+ for member in members:
+ if member == accountWithPrefix:
+ print("Permissions exist")
+ return
+
+ # Take the existing bindings and we need to append the new permission
+ # Otherwise we loose the existing permissions
+ if "bindings" in json_result:
+ bindings = json_result["bindings"]
+ else:
+ bindings = []
+
+ new_permission = {
+ "role": role,
+ "members": [ accountWithPrefix ]
+ }
+
+ bindings.append(new_permission)
+
+ # https://cloud.google.com/resource-manager/reference/rest/v1/projects/setIamPolicy
+ url = f"https://cloudresourcemanager.googleapis.com/v1/projects/{project_id}:setIamPolicy"
+
+ request_body = { "policy" : {
+ "bindings" : bindings
+ }
+ }
+
+ print(f"Permission bindings: {bindings}")
+
+ json_result = restAPIHelper(url, "POST", request_body)
+ print()
+ print(f"json_result: {json_result}")
+ print()
+ print(f"Project Level IAM Permissions set for {accountWithPrefix} {role}")
+##### Delete Document Processor
+def deleteDocumentProcessor(project_id, location, processor_name, processor_type):
+ """Creates a Vertex AI document process if it does not exist."""
+
+ # First find the item
+ # https://cloud.google.com/document-ai/docs/reference/rest/v1/projects.locations.processors/list
+ url = f"https://{location}-documentai.googleapis.com/v1/projects/{project_id}/locations/{location}/processors"
+
+ # Gather existing items
+ json_result = restAPIHelper(url, "GET", None)
+ print(f"createDocumentProcessor (GET) json_result: {json_result}")
+
+ # Test to see if processor exists, if so return
+ if "processors" in json_result:
+ for item in json_result["processors"]:
+ print(f"Process Name: {item['name']}")
+ # "projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c"
+ # NOTE: We do not know the random number at the end so test the type and display name
+ if item["type"] == processor_type and item["displayName"] == processor_name:
+ print("Found processor")
+ name = item["name"]
+ url = f"https://{location}-documentai.googleapis.com/v1/{name}"
+ json_result = restAPIHelper(url, "DELETE", None)
+ print("Document Processor Deleted")
+
+ print("Document Processor not found to delete")
+##### Create a Document Processor (to process the PDFs)
+def createDocumentProcessor(project_id, location, processor_name, processor_type):
+ """Creates a Vertex AI document process if it does not exist."""
+
+ # First find the item
+ # https://cloud.google.com/document-ai/docs/reference/rest/v1/projects.locations.processors/list
+ url = f"https://{location}-documentai.googleapis.com/v1/projects/{project_id}/locations/{location}/processors"
+
+ # Gather existing items
+ json_result = restAPIHelper(url, "GET", None)
+ print(f"createDocumentProcessor (GET) json_result: {json_result}")
+
+ # Test to see if processor exists, if so return
+ if "processors" in json_result:
+ for item in json_result["processors"]:
+ print(f"Process Name: {item['name']}")
+ # "projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c"
+ # NOTE: We do not know the random number at the end so test the type and display name
+ if item["type"] == processor_type and item["displayName"] == processor_name:
+ print("Processor already exists")
+ defaultProcessorVersion = item["defaultProcessorVersion"]
+ return defaultProcessorVersion
+
+ # Create the processor
+ # https://cloud.google.com/document-ai/docs/reference/rest/v1/projects.locations.processors/create
+ print("Creating Document Processor")
+
+ url = f"https://{location}-documentai.googleapis.com/v1/projects/{project_id}/locations/{location}/processors"
+
+ request_body = {
+ "type": processor_type,
+ "displayName": processor_name
+ }
+
+ """
+ INVALID_ARGUMENT: Document pages in non-imageless mode exceed the limit: 15 got 58.
+ Try using imageless mode to increase the limit to 100. [type.googleapis.com/util.MessageSetPayload='[google.rpc.error_details_ext]
+ { message: "Document pages in non-imageless mode exceed the limit: 15 got 58. Try using imageless mode to increase the limit to 100."
+ details { [type.googleapis.com/google.rpc.ErrorInfo] { reason: "PAGE_LIMIT_EXCEEDED" domain: "documentai.googleapis.com" metadata
+ { key: "page_limit" value: "100" } metadata { key: "pages" value: "58" } } } }']
+ """
+
+ json_result = restAPIHelper(url, "POST", request_body)
+
+ """
+ {'name': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c',
+ 'type': 'OCR_PROCESSOR',
+ 'displayName': 'vertex_ai_ocr_processor',
+ 'state': 'ENABLED',
+ 'processEndpoint': 'https://us-documentai.googleapis.com/v1/projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c:process',
+ 'createTime': '2025-01-21T19:25:07.980401Z',
+ 'defaultProcessorVersion': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/pretrained-ocr-v2.0-2023-06-02',
+ 'processorVersionAliases': [{'alias': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/pretrained',
+ 'processorVersion': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/pretrained-ocr-v1.0-2020-09-23'},
+ {'alias': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/pretrained-next',
+ 'processorVersion': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/pretrained-ocr-v1.1-2022-09-12'},
+ {'alias': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/rc',
+ 'processorVersion': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/pretrained-ocr-v2.1-2024-08-07'},
+ {'alias': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/stable',
+ 'processorVersion': 'projects/530963301545/locations/us/processors/b7e8a9fe78cf7e9c/processorVersions/pretrained-ocr-v2.0-2023-06-02'}]}
+ """
+
+ defaultProcessorVersion = json_result["defaultProcessorVersion"]
+ print("Document Processor created: ", defaultProcessorVersion)
+ return defaultProcessorVersion
+
+### MAIN CODE - Setup the Environment
+- Create the bucket and copy data from a shared location
+- Create our BigLake / Vertex AI connection in BigQuery
+- The connection creates a service principal, so we will grant this principal access to our bucket and permissions to call Vertex AI endpoints
+- Create our tables and load with data
+- Create our models in BigQuery
+##### Create our bucket and copy data for BigQuery / Storage
+# Create a bucket. The bucket is named after the project id.
+bucket_name = project_id
+createGoogleCloudStorageBucket(project_id, bucket_name, location)
+# Copy our data (CSV files). We want the files in our own bucket, in our chosen location.
+source_path = "gs://data-analytics-golden-demo/cymbal-consumer-finance/*"
+dest_path = f"gs://{bucket_name}/cymbal-consumer-finance/"
+print(f"Copying data from {source_path} to {dest_path}")
+print("This may take a few minutes...")
+# IPython substitutes {source_path} and {dest_path} into the shell command.
+!gsutil -m -q cp -r {source_path} {dest_path}
+print("Copy [data] is complete")
+
+
+# Copy our data (PDF files). We want the files in our own bucket, in our chosen location.
+# source_path and dest_path are reused for the second copy.
+source_path = "gs://data-analytics-golden-demo/cymbal-consumer-finance-pdfs/*"
+dest_path = f"gs://{bucket_name}/pdfs/"
+print(f"Copying data from {source_path} to {dest_path}")
+print("This may take a few minutes...")
+!gsutil -m -q cp -r {source_path} {dest_path}
+print("Copy [pdfs] is complete")
+
+print(f"To view the files: https://console.cloud.google.com/storage/browser/{bucket_name}")
+##### Create the external BigQuery connection for BigLake / Vertex
+# Create our connection for BigLake / Vertex AI
+
+connection_name = "biglake_vertexai_connection"
+biglake_vertexai_connection_serviceAccountId = createExternal_BigLake_VertexAI_RemoteFunctions_Connection(project_id, location, connection_name)
+print(f"biglake_vertexai_connection_serviceAccountId: {biglake_vertexai_connection_serviceAccountId}")
+##### Grant the service account created by the external connection IAM permissions
+# Grant the Biglake / Vertex AI External connection Service Principal permissions to call Vertex Models / Endpoints
+
+################################################################################################################################################
+# NOTE: You might need to wait a minute or two before running this. It can fail if you run this too quickly after creating the connection
+################################################################################################################################################
+
+# To call Docuemnt API
+# Exception: {'reason': 'invalidQuery', 'location': 'query', 'message': "Permission denied for document processor 'projects/530963301545/locations/us/
+#processors/processor_id'. Please ensure that (1) The processor 'processor_id' exists in project 530963301545, region 'us', and is active. (2)
+# The connection's service account bqcx-530963301545-z8r3@gcp-sa-bigquery-condel.iam.gserviceaccount.com has roles/documentai.viewer role
+#in the project 530963301545. More details: Permission 'documentai.processors.get' denied on resource '//documentai.googleapis.com/projects/
+# 530963301545/locations/us/processors/processor_id' (or it may not exist)."}
+setProjectLevelIamPolicy(project_id, f"serviceAccount:{biglake_vertexai_connection_serviceAccountId}", "roles/documentai.viewer")
+
+
+# To call Vision API
+setProjectLevelIamPolicy(project_id, f"serviceAccount:{biglake_vertexai_connection_serviceAccountId}", "roles/serviceusage.serviceUsageConsumer")
+setProjectLevelIamPolicy(project_id, f"serviceAccount:{biglake_vertexai_connection_serviceAccountId}", "roles/serviceusage.serviceUsageConsumer")
+
+# To call GENERATE TEXT
+setProjectLevelIamPolicy(project_id, f"serviceAccount:{biglake_vertexai_connection_serviceAccountId}","roles/aiplatform.user")
+
+# Grant the current user and the service principal for our BigLake / Vertex AI connection access to the bucket
+# We want our BigLake / Vertex AI connections to have access to the files in our bucket
+
+setBucketIamPolicy(bucket_name, f"serviceAccount:{biglake_vertexai_connection_serviceAccountId}", "roles/storage.objectAdmin")
+setBucketIamPolicy(bucket_name, f"user:{user}", "roles/storage.admin")
+##### Create our BigQuery Dataset and populate it with data
+dataset_name = "cymbal_consumer_finance"
+
+sql = f"""
+CREATE SCHEMA IF NOT EXISTS {dataset_name} OPTIONS(location = '{location}');
+"""
+RunQuery(sql)
+
+sql = f"""LOAD DATA OVERWRITE `{dataset_name}.customers`
+(
+ customer_id STRING,
+ first_name STRING,
+ last_name STRING,
+ date_of_birth DATE,
+ email STRING,
+ phone_number STRING,
+ creation_date DATE,
+ life_event STRING
+)
+FROM FILES (format = 'CSV', skip_leading_rows = 1, uris = ['gs://{bucket_name}/cymbal-consumer-finance/ccf_csv_tables_customers.csv']);
+"""
+RunQuery(sql)
+
+sql = f"""
+LOAD DATA OVERWRITE `{dataset_name}.loan_applications`
+(
+ application_id STRING,
+ customer_id STRING,
+ application_date DATE,
+ product_type STRING,
+ sub_product STRING,
+ loan_amount FLOAT64,
+ description STRING,
+ application_status STRING,
+ approval_date DATE,
+ disbursement_date DATE,
+ application_channel STRING,
+ marketing_cost FLOAT64,
+)
+FROM FILES (format = 'CSV', skip_leading_rows = 1, uris = ['gs://{bucket_name}/cymbal-consumer-finance/ccf_csv_tables_loan_applications.csv']);
+"""
+RunQuery(sql)
+
+sql = f"""
+LOAD DATA OVERWRITE `{dataset_name}.loan_repayments`
+(
+ repayment_id STRING,
+ loan_id STRING,
+ repayment_date DATE,
+ amount_due FLOAT64,
+ amount_paid FLOAT64,
+ payment_status STRING,
+ days_past_due INT64
+)
+FROM FILES (format = 'CSV', skip_leading_rows = 1, uris = ['gs://{bucket_name}/cymbal-consumer-finance/ccf_csv_tables_loan_repayments.csv']);
+"""
+RunQuery(sql)
+
+sql = f"""
+LOAD DATA OVERWRITE `{dataset_name}.marketing_costs`
+(
+ cost_id STRING,
+ channel STRING,
+ product_type STRING,
+ cost_per_lead FLOAT64,
+)
+FROM FILES (format = 'CSV', skip_leading_rows = 1, uris = ['gs://{bucket_name}/cymbal-consumer-finance/ccf_csv_tables_marketing_costs.csv']);
+"""
+RunQuery(sql)
+
+print(f"You should now see a new dataset in BigQuery with several tables loaded with data.")
+##### Create our OCR and Layout Processors (to parse our PDF)
+# OCR processor
+processor_name = "vertex_ai_ocr_processor"
+processor_type = "OCR_PROCESSOR"
+
+# In case you need to change options, delete the processor first and then recreate it:
+# deleteDocumentProcessor(project_id, location, processor_name, processor_type)
+
+# createDocumentProcessor returns the processor's default processor *version* resource name
+vertex_processor_name = createDocumentProcessor(project_id, location, processor_name, processor_type)
+print(f"vertex_processor_name: {vertex_processor_name}")
+
+# Layout Parser (for chunks)
+layout_processor_name = "vertex_ai_layout_processor"
+layout_processor_type = "LAYOUT_PARSER_PROCESSOR"
+
+# NOTE: layout_processor_name is reused here. It goes in as the display name and is
+# overwritten with the returned processor version, which the CREATE MODEL statement reads later.
+layout_processor_name = createDocumentProcessor(project_id, location, layout_processor_name, layout_processor_type)
+print(f"layout_processor_name: {layout_processor_name}")
+##### Create our BQML Models in our BigQuery Dataset
+# Create the remote connection to each Vertex AI service. (Vision, Gemini Pro, Embeddings, etc.)
+# The models use the external connection
+
+#####################################################################################################################
+# NOTE: You might get an error that the processor does not exist (you should re-run this cell)
+#####################################################################################################################
+
+sql = f"""CREATE MODEL IF NOT EXISTS `{project_id}.{dataset_name}.layout-connection`
+REMOTE WITH CONNECTION `{project_id}.{location}.{connection_name}`
+OPTIONS (
+ remote_service_type = 'cloud_ai_document_v1',
+ document_processor='{layout_processor_name}');
+"""
+
+RunQuery(sql)
+
+sql = f"""CREATE MODEL IF NOT EXISTS `{project_id}.{dataset_name}.document-connection`
+REMOTE WITH CONNECTION `{project_id}.{location}.{connection_name}`
+OPTIONS (
+ remote_service_type = 'cloud_ai_document_v1',
+ document_processor='{vertex_processor_name}');
+"""
+
+RunQuery(sql)
+
+sql = f"""CREATE MODEL IF NOT EXISTS `{project_id}.{dataset_name}.vision-connection`
+REMOTE WITH CONNECTION `{project_id}.{location}.{connection_name}`
+OPTIONS (remote_service_type = 'cloud_ai_vision_v1');"""
+
+RunQuery(sql)
+
+print(f"Created cloud_ai_vision_v1: {sql}")
+
+sql = f"""CREATE MODEL IF NOT EXISTS `{project_id}.{dataset_name}.gemini-15-pro`
+REMOTE WITH CONNECTION `{project_id}.{location}.{connection_name}`
+OPTIONS (endpoint = 'gemini-1.5-pro');"""
+
+RunQuery(sql)
+
+print(f"Created gemini-1.5-pro: {sql}")
+
+#sql = f"""CREATE MODEL IF NOT EXISTS `{project_id}.{dataset_name}.gemini-2-0-flash`
+#REMOTE WITH CONNECTION `{project_id}.{location}.{connection_name}`
+#OPTIONS (endpoint = 'gemini-2.0-flash');"""
+
+#RunQuery(sql)
+
+#print(f"Created gemini-2.0-flash: {sql}")
+
+
+sql = f"""CREATE MODEL IF NOT EXISTS `{project_id}.{dataset_name}.vertexai-textembedding`
+REMOTE WITH CONNECTION `{project_id}.{location}.{connection_name}`
+OPTIONS (endpoint = 'text-embedding-005');"""
+
+RunQuery(sql)
+
+print(f"Created text-embedding-005: {sql}")
+### MAIN CODE - Demo
+1. Create an Object table over a set of PDFs
+2. Force a refresh of the object table
+3. Process the PDFs with the OCR processor which creates 1 large text extract
+4. Process the PDFs with the Layout processor which will chunk our PDF text
+5. Create embeddings on the chunked PDF text
+6. Search the embeddings
+7. Use Gemini and the RAG pattern to use our embedding search results to answer a question
+##### Create an Object table over a set of PDFs
+# Create the object table over the files
+
+object_table_name = "object_table_pdfs"
+
+sql = f"""
+CREATE OR REPLACE EXTERNAL TABLE `{project_id}.{dataset_name}.{object_table_name}`
+WITH CONNECTION `{project_id}.{location}.{connection_name}`
+OPTIONS (
+ object_metadata="DIRECTORY",
+ uris = ['gs://{bucket_name}/pdfs/*.pdf'],
+ max_staleness=INTERVAL 30 MINUTE,
+ metadata_cache_mode="MANUAL"
+ );
+"""
+
+RunQuery(sql)
+##### Call the Refresh on the Object table so it picks up the files in storage
+# Since the table is set to MANUAL refresh, refresh the table so we see the files
+
+sql = f"CALL BQ.REFRESH_EXTERNAL_METADATA_CACHE('{project_id}.{dataset_name}.{object_table_name}')"
+RunQuery(sql)
+
+# Show the data
+sql=f"""SELECT *
+ FROM `{project_id}.{dataset_name}.{object_table_name}`
+ LIMIT 20;"""
+
+result=RunQuery(sql)
+
+result
+# Get a signed url so we can show in this notebook
+
+sql=f"""SELECT *
+ FROM EXTERNAL_OBJECT_TRANSFORM(TABLE `{project_id}.{dataset_name}.{object_table_name}`,['SIGNED_URL'])
+ WHERE uri LIKE '%loan_application_0d2e87d5-6337-4fc6-b6ed-5e6f35df596b.pdf%'"""
+
+df=RunQuery(sql)
+
+for row in df.itertuples():
+ uri = row.uri
+ signed_url = row.signed_url
+
+print(f"uri: {uri}")
+print(f"signed_url: {signed_url}")
+
+print()
+print()
+print("Notice that we have handwriting and different types of fonts.")
+print()
+print()
+
+# Chrome shows a warning
+#iframe = IPython.display.IFrame(src=signed_url, width=800, height=600)
+#display(iframe)
+
+html = f"""
+
+"""
+IPython.display.HTML(html)
+
+##### We want to process each PDF document in our object table (OCR Processor)
+# This is 1 big pdf text block using the OCR processor
+
+pdfs_document_ocr_processor_table_name = "pdfs_document_ocr_processor"
+
+sql = f"""
+CREATE OR REPLACE TABLE `{project_id}.{dataset_name}.{pdfs_document_ocr_processor_table_name}` AS (
+ SELECT *
+ FROM ML.PROCESS_DOCUMENT(MODEL `{project_id}.{dataset_name}.document-connection`,
+ TABLE `{project_id}.{dataset_name}.{object_table_name}`)
+);"""
+
+RunQuery(sql)
+
+# Show the data
+sql=f"""SELECT *
+ FROM `{project_id}.{dataset_name}.{pdfs_document_ocr_processor_table_name}`
+ LIMIT 10;"""
+
+result=RunQuery(sql)
+
+result
+##### We want to process each PDF document in our object table (Layout Processor)
+# This is chunks of text blocks using the layout processor
+# We would need to "manually" (or semantic) chunk this to create embeddings
+
+pdfs_document_layout_processor_table_name = "pdfs_document_layout_processor"
+process_options = '{"layout_config": {"chunking_config": {"chunk_size": 100}}}'
+
+sql = f"""
+CREATE OR REPLACE TABLE `{project_id}.{dataset_name}.{pdfs_document_layout_processor_table_name}` AS (
+ SELECT *
+ FROM ML.PROCESS_DOCUMENT(MODEL `{project_id}.{dataset_name}.layout-connection`,
+ TABLE `{project_id}.{dataset_name}.{object_table_name}`,
+ PROCESS_OPTIONS => (JSON '{process_options}')
+ )
+);"""
+
+RunQuery(sql)
+
+# Show the data
+sql=f"""SELECT uri, ml_process_document_result
+ FROM `{project_id}.{dataset_name}.{pdfs_document_layout_processor_table_name}`
+ LIMIT 5;"""
+
+result=RunQuery(sql)
+
+result
+##### Parse the JSON from the PDF extraction. Use BQ JSON functions.
+pdfs_document_layout_processor_chunks_table_name = "pdfs_document_layout_processor_chunks"
+
+sql = f"""CREATE OR REPLACE TABLE `{project_id}.{dataset_name}.{pdfs_document_layout_processor_chunks_table_name}` AS
+SELECT uri,
+ JSON_EXTRACT_SCALAR(json , '$.chunkId') AS chunk_id,
+ JSON_EXTRACT_SCALAR(json , '$.content') AS content,
+ JSON_EXTRACT_SCALAR(json , '$.pageFooters[0].text') AS page_footers_text,
+ JSON_EXTRACT_SCALAR(json , '$.pageSpan.pageStart') AS page_span_start,
+ JSON_EXTRACT_SCALAR(json , '$.pageSpan.pageEnd') AS page_span_end
+ FROM `{project_id}.{dataset_name}.{pdfs_document_layout_processor_table_name}`,
+ UNNEST(JSON_EXTRACT_ARRAY(ml_process_document_result.chunkedDocument.chunks, '$')) json
+"""
+
+RunQuery(sql)
+
+# Show the data
+sql=f"""SELECT *
+ FROM `{project_id}.{dataset_name}.{pdfs_document_layout_processor_chunks_table_name}`
+ ORDER BY uri, chunk_id
+ LIMIT 10"""
+
+result=RunQuery(sql)
+
+result
+##### Use ML.GENERATE_EMBEDDING to create embeddings for our extracted PDF text
+# In a real-life scenario, you would not pass in TABLE `{project_id}.{dataset_name}.{pdfs_document_layout_processor_chunks_table_name}`
+# You would pass in a query for items that do not already EXIST in the embedding table. The TABLE parameter could also be a SQL statement.
+
+# The "content" field will automatically be passed to the model as the column to embed
+
+
+pdfs_document_layout_processor_chunks_table_name = "pdfs_document_layout_processor_chunks"
+pdfs_document_embeddings_table_name = "pdfs_document_embeddings"
+
+sql = f"""
+CREATE OR REPLACE TABLE `{project_id}.{dataset_name}.{pdfs_document_embeddings_table_name}` AS
+SELECT uri,
+ chunk_id,
+ content,
+ ml_generate_embedding_result as vector_embedding,
+ ml_generate_embedding_statistics,
+ ml_generate_embedding_status,
+ FROM ML.GENERATE_EMBEDDING(MODEL `{project_id}.{dataset_name}.vertexai-textembedding`,
+ TABLE `{project_id}.{dataset_name}.{pdfs_document_layout_processor_chunks_table_name}`,
+ STRUCT(
+ TRUE AS flatten_json_output,
+ 'SEMANTIC_SIMILARITY' as task_type,
+ 768 AS output_dimensionality
+ ))
+"""
+
+RunQuery(sql)
+##### Show our embedded data (first two elements of each embedding array)
+# Show the data
+
+sql=f"""SELECT uri, chunk_id, content, [vector_embedding[0],vector_embedding[1]] as vector_embedding, ml_generate_embedding_statistics, ml_generate_embedding_status
+ FROM `{project_id}.{dataset_name}.{pdfs_document_embeddings_table_name}`
+ ORDER BY uri, chunk_id
+ LIMIT 10;"""
+
+result=RunQuery(sql)
+
+result
+##### Search our embeddings table for a search string
+# The search results will contain all the data that is semantically close to our search string
+# The shorter the distance the more pertinent the result
+
+vector_search_string = 'always wanted to own a home'
+
+options = '{"fraction_lists_to_search": 0.01}'
+
+sql = f"""SELECT base.uri as uri,
+ base.chunk_id as chunk_id,
+ base.content as content,
+ distance
+ FROM VECTOR_SEARCH(TABLE `{project_id}.{dataset_name}.{pdfs_document_embeddings_table_name}`,
+ 'vector_embedding', -- column in table to search
+ (SELECT ml_generate_embedding_result,
+ content AS query
+ FROM ML.GENERATE_EMBEDDING(MODEL `{project_id}.{dataset_name}.vertexai-textembedding`,
+ (SELECT '{vector_search_string}' AS content),
+ STRUCT(TRUE AS flatten_json_output,
+ 'SEMANTIC_SIMILARITY' as task_type,
+ 768 AS output_dimensionality) -- struct
+ ) -- question embedding
+ ), -- vector search
+ top_k => 10,
+ OPTIONS => '{options}')
+ORDER BY distance;
+"""
+
+result = RunQuery(sql)
+
+result
+**Learning Item**
+- Change the search string for some different items
+##### Search our PDFs (embeddings) and return the results as JSON
+# Now let's return the results as a JSON string that we will inject into the context of Gemini
+# LLMs understand JSON so we will have each text with the source of the text
+
+vector_search_string = 'always wanted to own a home'
+
+options = '{"fraction_lists_to_search": 0.01}'
+
+sql = f"""SELECT TO_JSON_STRING(STRUCT(base.uri as uri,
+ base.chunk_id as chunk_id,
+ base.content as content,
+ distance as vector_search_distance)) as rag_json
+ FROM VECTOR_SEARCH(TABLE `{project_id}.{dataset_name}.{pdfs_document_embeddings_table_name}`,
+ 'vector_embedding', -- column in table to search
+ (SELECT ml_generate_embedding_result,
+ content AS query
+ FROM ML.GENERATE_EMBEDDING(MODEL `{project_id}.{dataset_name}.vertexai-textembedding`,
+ (SELECT '{vector_search_string}' AS content),
+ STRUCT(TRUE AS flatten_json_output,
+ 'SEMANTIC_SIMILARITY' as task_type,
+ 768 AS output_dimensionality) -- struct
+ ) -- question embedding
+ ), -- vector search
+ top_k => 10,
+ OPTIONS => '{options}')
+ORDER BY distance;
+"""
+
+result = RunQuery(sql)
+
+result
+##### Call Gemini directly in BigQuery using ML.GENERATE_TEXT
+llm_prompt = 'What type of LLM are you?'
+
+
+sql = f"""SELECT *
+ FROM ML.GENERATE_TEXT(MODEL`{project_id}.{dataset_name}.gemini-15-pro`,
+ (SELECT '{llm_prompt}' AS prompt),
+ STRUCT(
+ 0.8 AS temperature,
+ 1024 AS max_output_tokens,
+ 0.95 AS top_p,
+ 40 AS top_k)
+ )
+"""
+
+result = RunQuery(sql)
+
+result
+**Learning Item**
+- Parse the returned JSON from Gemini
+- Tip: Create a UDF to parse the JSON, that way if Gemini changes the response JSON you only have 1 place to update your code.
+##### Use Gemini (BQML) to process the data we retrieve from our Embedding Search to create a summary
+# RAG Pattern
+
+vector_search_string = 'own a home'
+
+llm_prompt = """Which people have wanted a house during the loan process?
+Site the sources by explaining your results using the uri and the chunk_id.
+Quote the orginal text from the content.
+