From ea7a94a7979fc62f56ffac553401d483ff6d807e Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Thu, 31 Aug 2023 10:04:14 -0700 Subject: [PATCH] Update azd flow for vectors and CosmosDB (#210) --- .devcontainer/Dockerfile | 6 -- .devcontainer/devcontainer.json | 10 +-- .env.sample | 2 +- README.md | 4 +- app.py | 2 +- .../core/database/cosmos/cosmos-account.bicep | 33 +++++++++ .../cosmos/sql/cosmos-sql-account.bicep | 18 +++++ .../database/cosmos/sql/cosmos-sql-db.bicep | 71 +++++++++++++++++++ .../cosmos/sql/cosmos-sql-role-assign.bicep | 19 +++++ .../cosmos/sql/cosmos-sql-role-def.bicep | 30 ++++++++ infra/core/host/appservice.bicep | 1 + infra/db.bicep | 33 +++++++++ infra/main.bicep | 41 +++++++++-- requirements.txt | 2 +- scripts/data_preparation.py | 16 ++--- scripts/data_utils.py | 48 ++++++++----- scripts/prepdocs.py | 31 +++++++- scripts/prepdocs.sh | 2 +- scripts/readme.md | 4 +- start.sh | 2 +- 20 files changed, 319 insertions(+), 56 deletions(-) delete mode 100644 .devcontainer/Dockerfile create mode 100644 infra/core/database/cosmos/cosmos-account.bicep create mode 100644 infra/core/database/cosmos/sql/cosmos-sql-account.bicep create mode 100644 infra/core/database/cosmos/sql/cosmos-sql-db.bicep create mode 100644 infra/core/database/cosmos/sql/cosmos-sql-role-assign.bicep create mode 100644 infra/core/database/cosmos/sql/cosmos-sql-role-def.bicep create mode 100644 infra/db.bicep diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile deleted file mode 100644 index a2cf933cc6..0000000000 --- a/.devcontainer/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -ARG IMAGE=bullseye -FROM --platform=amd64 mcr.microsoft.com/devcontainers/${IMAGE} -RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-get update && apt-get install -y xdg-utils \ - && apt-get clean -y && rm -rf /var/lib/apt/lists/* -RUN curl -fsSL https://aka.ms/install-azd.sh | bash diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 319e5ad5a0..62b58418fb 100644 
--- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,17 +1,13 @@ { "name": "Azure Developer CLI", - "build": { - "dockerfile": "Dockerfile", - "args": { - "IMAGE": "python:3.10" - } - }, + "image": "mcr.microsoft.com/devcontainers/python:3.10", "features": { "ghcr.io/devcontainers/features/node:1": { "version": "16", "nodeGypDependencies": false }, - "ghcr.io/devcontainers/features/azure-cli:1.0.8": {} + "ghcr.io/devcontainers/features/azure-cli:1.0.8": {}, + "ghcr.io/azure/azure-dev/azd:latest": {} }, "customizations": { "vscode": { diff --git a/.env.sample b/.env.sample index 4cf7e4438c..01dd5c1ef1 100644 --- a/.env.sample +++ b/.env.sample @@ -16,7 +16,7 @@ AZURE_SEARCH_PERMITTED_GROUPS_COLUMN= AZURE_OPENAI_RESOURCE= AZURE_OPENAI_MODEL= AZURE_OPENAI_KEY= -AZURE_OPENAI_MODEL_NAME=gpt-35-turbo +AZURE_OPENAI_MODEL_NAME=gpt-35-turbo-16k AZURE_OPENAI_TEMPERATURE=0 AZURE_OPENAI_TOP_P=1.0 AZURE_OPENAI_MAX_TOKENS=1000 diff --git a/README.md b/README.md index 1ff83dc8cd..623e1705ec 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ This repo contains sample code for a simple chat webapp that integrates with Azure OpenAI. Note: some portions of the app use preview APIs. ## Prerequisites -- An existing Azure OpenAI resource and model deployment of a chat model (e.g. `gpt-35-turbo`, `gpt-4`) +- An existing Azure OpenAI resource and model deployment of a chat model (e.g. `gpt-35-turbo-16k`, `gpt-4`) - To use Azure OpenAI on your data: an existing Azure Cognitive Search resource and index. 
## Deploy the app @@ -145,7 +145,7 @@ Note: settings starting with `AZURE_SEARCH` are only needed when using Azure Ope |AZURE_OPENAI_RESOURCE||the name of your Azure OpenAI resource| |AZURE_OPENAI_MODEL||The name of your model deployment| |AZURE_OPENAI_ENDPOINT||The endpoint of your Azure OpenAI resource.| -|AZURE_OPENAI_MODEL_NAME|gpt-35-turbo|The name of the model| +|AZURE_OPENAI_MODEL_NAME|gpt-35-turbo-16k|The name of the model| |AZURE_OPENAI_KEY||One of the API keys of your Azure OpenAI resource| |AZURE_OPENAI_TEMPERATURE|0|What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. A value of 0 is recommended when using your data.| |AZURE_OPENAI_TOP_P|1.0|An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. We recommend setting this to 1.0 when using your data.| diff --git a/app.py b/app.py index 460049440d..e1798e1cdf 100644 --- a/app.py +++ b/app.py @@ -56,7 +56,7 @@ def assets(path): AZURE_OPENAI_SYSTEM_MESSAGE = os.environ.get("AZURE_OPENAI_SYSTEM_MESSAGE", "You are an AI assistant that helps people find information.") AZURE_OPENAI_PREVIEW_API_VERSION = os.environ.get("AZURE_OPENAI_PREVIEW_API_VERSION", "2023-06-01-preview") AZURE_OPENAI_STREAM = os.environ.get("AZURE_OPENAI_STREAM", "true") -AZURE_OPENAI_MODEL_NAME = os.environ.get("AZURE_OPENAI_MODEL_NAME", "gpt-35-turbo") # Name of the model, e.g. 'gpt-35-turbo' or 'gpt-4' +AZURE_OPENAI_MODEL_NAME = os.environ.get("AZURE_OPENAI_MODEL_NAME", "gpt-35-turbo-16k") # Name of the model, e.g. 
'gpt-35-turbo-16k' or 'gpt-4' AZURE_OPENAI_EMBEDDING_ENDPOINT = os.environ.get("AZURE_OPENAI_EMBEDDING_ENDPOINT") AZURE_OPENAI_EMBEDDING_KEY = os.environ.get("AZURE_OPENAI_EMBEDDING_KEY") diff --git a/infra/core/database/cosmos/cosmos-account.bicep b/infra/core/database/cosmos/cosmos-account.bicep new file mode 100644 index 0000000000..a918b9e695 --- /dev/null +++ b/infra/core/database/cosmos/cosmos-account.bicep @@ -0,0 +1,33 @@ +metadata description = 'Creates an Azure Cosmos DB account.' +param name string +param location string = resourceGroup().location +param tags object = {} + +@allowed([ 'GlobalDocumentDB', 'MongoDB', 'Parse' ]) +param kind string + +resource cosmos 'Microsoft.DocumentDB/databaseAccounts@2022-08-15' = { + name: name + kind: kind + location: location + tags: tags + properties: { + consistencyPolicy: { defaultConsistencyLevel: 'Session' } + locations: [ + { + locationName: location + failoverPriority: 0 + isZoneRedundant: false + } + ] + databaseAccountOfferType: 'Standard' + enableAutomaticFailover: false + enableMultipleWriteLocations: false + apiProperties: (kind == 'MongoDB') ? { serverVersion: '4.0' } : {} + capabilities: [ { name: 'EnableServerless' } ] + } +} + +output endpoint string = cosmos.properties.documentEndpoint +output id string = cosmos.id +output name string = cosmos.name diff --git a/infra/core/database/cosmos/sql/cosmos-sql-account.bicep b/infra/core/database/cosmos/sql/cosmos-sql-account.bicep new file mode 100644 index 0000000000..1f4d5d728d --- /dev/null +++ b/infra/core/database/cosmos/sql/cosmos-sql-account.bicep @@ -0,0 +1,18 @@ +metadata description = 'Creates an Azure Cosmos DB for NoSQL account.' 
+param name string +param location string = resourceGroup().location +param tags object = {} + +module cosmos '../../cosmos/cosmos-account.bicep' = { + name: 'cosmos-account' + params: { + name: name + location: location + tags: tags + kind: 'GlobalDocumentDB' + } +} + +output endpoint string = cosmos.outputs.endpoint +output id string = cosmos.outputs.id +output name string = cosmos.outputs.name diff --git a/infra/core/database/cosmos/sql/cosmos-sql-db.bicep b/infra/core/database/cosmos/sql/cosmos-sql-db.bicep new file mode 100644 index 0000000000..8b9040d261 --- /dev/null +++ b/infra/core/database/cosmos/sql/cosmos-sql-db.bicep @@ -0,0 +1,71 @@ +metadata description = 'Creates an Azure Cosmos DB for NoSQL account with a database.' +param accountName string +param databaseName string +param location string = resourceGroup().location +param tags object = {} + +param containers array = [] +param principalIds array = [] + +module cosmos 'cosmos-sql-account.bicep' = { + name: 'cosmos-sql-account' + params: { + name: accountName + location: location + tags: tags + } +} + +resource database 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases@2022-05-15' = { + name: '${accountName}/${databaseName}' + properties: { + resource: { id: databaseName } + } + + resource list 'containers' = [for container in containers: { + name: container.name + properties: { + resource: { + id: container.id + partitionKey: { paths: [ container.partitionKey ] } + } + options: {} + } + }] + + dependsOn: [ + cosmos + ] +} + +module roleDefinition 'cosmos-sql-role-def.bicep' = { + name: 'cosmos-sql-role-definition' + params: { + accountName: accountName + } + dependsOn: [ + cosmos + database + ] +} + +// We need batchSize(1) here because sql role assignments have to be done sequentially +@batchSize(1) +module userRole 'cosmos-sql-role-assign.bicep' = [for principalId in principalIds: if (!empty(principalId)) { + name: 'cosmos-sql-user-role-${uniqueString(principalId)}' + params: { + accountName: 
accountName + roleDefinitionId: roleDefinition.outputs.id + principalId: principalId + } + dependsOn: [ + cosmos + database + ] +}] + +output accountId string = cosmos.outputs.id +output accountName string = cosmos.outputs.name +output databaseName string = databaseName +output endpoint string = cosmos.outputs.endpoint +output roleDefinitionId string = roleDefinition.outputs.id diff --git a/infra/core/database/cosmos/sql/cosmos-sql-role-assign.bicep b/infra/core/database/cosmos/sql/cosmos-sql-role-assign.bicep new file mode 100644 index 0000000000..3949efef0a --- /dev/null +++ b/infra/core/database/cosmos/sql/cosmos-sql-role-assign.bicep @@ -0,0 +1,19 @@ +metadata description = 'Creates a SQL role assignment under an Azure Cosmos DB account.' +param accountName string + +param roleDefinitionId string +param principalId string = '' + +resource role 'Microsoft.DocumentDB/databaseAccounts/sqlRoleAssignments@2022-05-15' = { + parent: cosmos + name: guid(roleDefinitionId, principalId, cosmos.id) + properties: { + principalId: principalId + roleDefinitionId: roleDefinitionId + scope: cosmos.id + } +} + +resource cosmos 'Microsoft.DocumentDB/databaseAccounts@2022-08-15' existing = { + name: accountName +} diff --git a/infra/core/database/cosmos/sql/cosmos-sql-role-def.bicep b/infra/core/database/cosmos/sql/cosmos-sql-role-def.bicep new file mode 100644 index 0000000000..778d6dc47b --- /dev/null +++ b/infra/core/database/cosmos/sql/cosmos-sql-role-def.bicep @@ -0,0 +1,30 @@ +metadata description = 'Creates a SQL role definition under an Azure Cosmos DB account.' 
+param accountName string + +resource roleDefinition 'Microsoft.DocumentDB/databaseAccounts/sqlRoleDefinitions@2022-08-15' = { + parent: cosmos + name: guid(cosmos.id, accountName, 'sql-role') + properties: { + assignableScopes: [ + cosmos.id + ] + permissions: [ + { + dataActions: [ + 'Microsoft.DocumentDB/databaseAccounts/readMetadata' + 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/items/*' + 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/*' + ] + notDataActions: [] + } + ] + roleName: 'Reader Writer' + type: 'CustomRole' + } +} + +resource cosmos 'Microsoft.DocumentDB/databaseAccounts@2022-08-15' existing = { + name: accountName +} + +output id string = roleDefinition.id diff --git a/infra/core/host/appservice.bicep b/infra/core/host/appservice.bicep index fe118a9f2d..b64fac1b4e 100644 --- a/infra/core/host/appservice.bicep +++ b/infra/core/host/appservice.bicep @@ -25,6 +25,7 @@ param alwaysOn bool = true param appCommandLine string = '' param appSettings object = {} param authClientId string +@secure() param authClientSecret string param authIssuerUri string param clientAffinityEnabled bool = false diff --git a/infra/db.bicep b/infra/db.bicep new file mode 100644 index 0000000000..11310419ee --- /dev/null +++ b/infra/db.bicep @@ -0,0 +1,33 @@ +param accountName string +param location string = resourceGroup().location +param tags object = {} + +param databaseName string = 'db_conversation_history' +param collectionName string = 'conversations' +param principalIds array = [] + +param containers array = [ + { + name: collectionName + id: collectionName + partitionKey: '/id' + } +] + +module cosmos 'core/database/cosmos/sql/cosmos-sql-db.bicep' = { + name: 'cosmos-sql' + params: { + accountName: accountName + databaseName: databaseName + location: location + containers: containers + tags: tags + principalIds: principalIds + } +} + + +output databaseName string = cosmos.outputs.databaseName +output containerName string = 
containers[0].name +output accountName string = cosmos.outputs.accountName +output endpoint string = cosmos.outputs.endpoint diff --git a/infra/main.bicep b/infra/main.bicep index 590bc7a1a8..520fb72d42 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -31,8 +31,8 @@ param openAiResourceName string = '' param openAiResourceGroupName string = '' param openAiResourceGroupLocation string = location param openAiSkuName string = '' -param openAIModel string = 'turbo' -param openAIModelName string = 'gpt-35-turbo' +param openAIModel string = 'turbo16k' +param openAIModelName string = 'gpt-35-turbo-16k' param openAITemperature int = 0 param openAITopP int = 1 param openAIMaxTokens int = 1000 @@ -40,6 +40,8 @@ param openAIStopSequence string = '' param openAISystemMessage string = 'You are an AI assistant that helps people find information.' param openAIApiVersion string = '2023-06-01-preview' param openAIStream bool = true +param embeddingDeploymentName string = 'embedding' +param embeddingModelName string = 'text-embedding-ada-002' // Used by prepdocs.py: Form recognizer param formRecognizerServiceName string = '' @@ -52,6 +54,9 @@ param authClientId string @secure() param authClientSecret string +// Used for Cosmos DB +param cosmosAccountName string = '' + @description('Id of the user or app to assign application roles') param principalId string = '' @@ -93,7 +98,7 @@ module appServicePlan 'core/host/appserviceplan.bicep' = { // The application frontend var appServiceName = !empty(backendServiceName) ? 
backendServiceName : '${abbrs.webSitesAppService}backend-${resourceToken}' -var authIssuerUri = 'https://login.microsoftonline.com/${tenant().tenantId}/v2.0' +var authIssuerUri = '${environment().authentication.loginEndpoint}${tenant().tenantId}/v2.0' module backend 'core/host/appservice.bicep' = { name: 'web' scope: resourceGroup @@ -155,7 +160,16 @@ module openAi 'core/ai/cognitiveservices.bicep' = { model: { format: 'OpenAI' name: openAIModelName - version: '0301' + version: '0613' + } + capacity: 30 + } + { + name: embeddingDeploymentName + model: { + format: 'OpenAI' + name: embeddingModelName + version: '2' } capacity: 30 } @@ -182,6 +196,17 @@ module searchService 'core/search/search-services.bicep' = { } } +// The application database +module cosmos 'db.bicep' = { + name: 'cosmos' + scope: resourceGroup + params: { + accountName: !empty(cosmosAccountName) ? cosmosAccountName : '${abbrs.documentDBDatabaseAccounts}${resourceToken}' + location: 'eastus' + tags: tags + principalIds: [principalId, backend.outputs.identityPrincipalId] + } +} // USER ROLES @@ -285,10 +310,13 @@ output AZURE_SEARCH_URL_COLUMN string = searchUrlColumn // openai output AZURE_OPENAI_RESOURCE string = openAi.outputs.name output AZURE_OPENAI_RESOURCE_GROUP string = openAiResourceGroup.name +output AZURE_OPENAI_ENDPOINT string = openAi.outputs.endpoint output AZURE_OPENAI_MODEL string = openAIModel output AZURE_OPENAI_MODEL_NAME string = openAIModelName output AZURE_OPENAI_SKU_NAME string = openAi.outputs.skuName output AZURE_OPENAI_KEY string = openAi.outputs.key +output AZURE_OPENAI_EMBEDDING_KEY string = openAi.outputs.key +output AZURE_OPENAI_EMBEDDING_ENDPOINT string = '${openAi.outputs.endpoint}/openai/deployments/${embeddingDeploymentName}/embeddings?api-version=2023-06-01-preview' output AZURE_OPENAI_TEMPERATURE int = openAITemperature output AZURE_OPENAI_TOP_P int = openAITopP output AZURE_OPENAI_MAX_TOKENS int = openAIMaxTokens @@ -302,4 +330,9 @@ output 
AZURE_FORMRECOGNIZER_SERVICE string = docPrepResources.outputs.AZURE_FORM output AZURE_FORMRECOGNIZER_RESOURCE_GROUP string = docPrepResources.outputs.AZURE_FORMRECOGNIZER_RESOURCE_GROUP output AZURE_FORMRECOGNIZER_SKU_NAME string = docPrepResources.outputs.AZURE_FORMRECOGNIZER_SKU_NAME +// cosmos +output AZURE_COSMOSDB_ACCOUNT string = cosmos.outputs.accountName +output AZURE_COSMOSDB_DATABASE string = cosmos.outputs.databaseName +output AZURE_COSMOSDB_CONVERSATIONS_CONTAINER string = cosmos.outputs.containerName + output AUTH_ISSUER_URI string = authIssuerUri diff --git a/requirements.txt b/requirements.txt index 0fc1f0971a..494ee703f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ openai==0.27.7 azure-search-documents==11.4.0b6 azure-storage-blob==12.17.0 python-dotenv==1.0.0 -azure-cosmos==4.3.1 \ No newline at end of file +azure-cosmos==4.5.0 \ No newline at end of file diff --git a/scripts/data_preparation.py b/scripts/data_preparation.py index 800df733d4..090e94d14c 100644 --- a/scripts/data_preparation.py +++ b/scripts/data_preparation.py @@ -332,7 +332,7 @@ def validate_index(service_name, subscription_id, resource_group, index_name): print(f"Request failed. Please investigate. 
Status code: {response.status_code}") break -def create_index(config, credential, form_recognizer_client=None, use_layout=False, njobs=4): +def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4): service_name = config["search_service_name"] subscription_id = config["subscription_id"] resource_group = config["resource_group"] @@ -361,9 +361,11 @@ def create_index(config, credential, form_recognizer_client=None, use_layout=Fal # chunk directory print("Chunking directory...") add_embeddings = False - if config.get("vector_config_name") and os.environ.get("EMBEDDING_MODEL_ENDPOINT") and os.environ.get("EMBEDDING_MODEL_KEY"): + if config.get("vector_config_name") and embedding_model_endpoint: add_embeddings = True - result = chunk_directory(config["data_path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0), form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs, add_embeddings=add_embeddings) + result = chunk_directory(config["data_path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0), + azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs, + add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint) if len(result.chunks) == 0: raise Exception("No chunks found. 
Please check the data path and chunk size.") @@ -414,16 +416,12 @@ def valid_range(n): form_recognizer_client = DocumentAnalysisClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key)) print(f"Using Form Recognizer resource {args.form_rec_resource} for PDF cracking, with the {'Layout' if args.form_rec_use_layout else 'Read'} model.") - if args.embedding_model_endpoint and args.embedding_model_key: - os.environ["EMBEDDING_MODEL_ENDPOINT"] = args.embedding_model_endpoint - os.environ["EMBEDDING_MODEL_KEY"] = args.embedding_model_key - for index_config in config: print("Preparing data for index:", index_config["index_name"]) - if index_config.get("vector_config_name") and not (args.embedding_model_endpoint and args.embedding_model_key): + if index_config.get("vector_config_name") and not args.embedding_model_endpoint: raise Exception("ERROR: Vector search is enabled in the config, but no embedding model endpoint and key were provided. Please provide these values or disable vector search.") - create_index(index_config, credential, form_recognizer_client, use_layout=args.form_rec_use_layout, njobs=args.njobs) + create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs) print("Data preparation for index", index_config["index_name"], "completed") print(f"Data preparation script completed. 
{len(config)} indexes updated.") \ No newline at end of file diff --git a/scripts/data_utils.py b/scripts/data_utils.py index 8d54aa35e7..f80c61bf6e 100644 --- a/scripts/data_utils.py +++ b/scripts/data_utils.py @@ -15,6 +15,7 @@ import markdown import tiktoken +from azure.identity import DefaultAzureCredential from azure.ai.formrecognizer import DocumentAnalysisClient from azure.core.credentials import AzureKeyCredential from bs4 import BeautifulSoup @@ -435,27 +436,22 @@ def merge_chunks_serially(chunked_content_list: List[str], num_tokens: int) -> G yield current_chunk, total_size -def get_embedding(text): - endpoint = os.environ.get("EMBEDDING_MODEL_ENDPOINT") - key = os.environ.get("EMBEDDING_MODEL_KEY") - if endpoint is None or key is None: - raise Exception("EMBEDDING_MODEL_ENDPOINT and EMBEDDING_MODEL_KEY are required for embedding") - +def get_embedding(text, azure_credential, embedding_endpoint): try: - endpoint_parts = endpoint.split("/openai/deployments/") + endpoint_parts = embedding_endpoint.split("/openai/deployments/") base_url = endpoint_parts[0] deployment_id = endpoint_parts[1].split("/embeddings")[0] openai.api_version = '2023-05-15' openai.api_base = base_url - openai.api_type = 'azure' - openai.api_key = os.environ.get("EMBEDDING_MODEL_KEY") + openai.api_key = azure_credential.get_token("https://cognitiveservices.azure.com/.default").token + openai.api_type = "azure_ad" embeddings = openai.Embedding.create(deployment_id=deployment_id, input=text) return embeddings['data'][0]["embedding"] except Exception as e: - raise Exception(f"Error getting embeddings with endpoint={endpoint} with error={e}") + raise Exception(f"Error getting embeddings with endpoint={embedding_endpoint} with error={e}") def chunk_content_helper( @@ -507,7 +503,9 @@ def chunk_content( extensions_to_process = FILE_FORMAT_DICT.keys(), cracked_pdf = False, use_layout = False, - add_embeddings = False + add_embeddings = False, + azure_credential = None, + embedding_endpoint = 
None ) -> ChunkingResult: """Chunks the given content. If ignore_errors is true, returns None in case of an error @@ -548,7 +546,7 @@ def chunk_content( if add_embeddings: for _ in range(RETRY_COUNT): try: - doc.contentVector = get_embedding(chunk) + doc.contentVector = get_embedding(chunk, azure_credential, embedding_endpoint) break except: sleep(30) @@ -595,7 +593,9 @@ def chunk_file( extensions_to_process = FILE_FORMAT_DICT.keys(), form_recognizer_client = None, use_layout = False, - add_embeddings=False + add_embeddings=False, + azure_credential = None, + embedding_endpoint = None ) -> ChunkingResult: """Chunks the given file. Args: @@ -641,7 +641,9 @@ def chunk_file( extensions_to_process=extensions_to_process, cracked_pdf=cracked_pdf, use_layout=use_layout, - add_embeddings=add_embeddings + add_embeddings=add_embeddings, + azure_credential=azure_credential, + embedding_endpoint=embedding_endpoint ) @@ -656,7 +658,9 @@ def process_file( extensions_to_process: List[str] = FILE_FORMAT_DICT.keys(), form_recognizer_client = None, use_layout = False, - add_embeddings = False + add_embeddings = False, + azure_credential = None, + embedding_endpoint = None ): if not form_recognizer_client: @@ -680,7 +684,9 @@ def process_file( extensions_to_process=extensions_to_process, form_recognizer_client=form_recognizer_client, use_layout=use_layout, - add_embeddings=add_embeddings + add_embeddings=add_embeddings, + azure_credential=azure_credential, + embedding_endpoint=embedding_endpoint ) for chunk_idx, chunk_doc in enumerate(result.chunks): chunk_doc.filepath = rel_file_path @@ -705,7 +711,9 @@ def chunk_directory( form_recognizer_client = None, use_layout = False, njobs=4, - add_embeddings = False + add_embeddings = False, + azure_credential = None, + embedding_endpoint = None ): """ Chunks the given directory recursively @@ -746,7 +754,8 @@ def chunk_directory( min_chunk_size=min_chunk_size, url_prefix=url_prefix, token_overlap=token_overlap, 
extensions_to_process=extensions_to_process, - form_recognizer_client=form_recognizer_client, use_layout=use_layout, add_embeddings=add_embeddings) + form_recognizer_client=form_recognizer_client, use_layout=use_layout, add_embeddings=add_embeddings, + azure_credential=azure_credential, embedding_endpoint=embedding_endpoint) if is_error: num_files_with_errors += 1 continue @@ -761,7 +770,8 @@ def chunk_directory( min_chunk_size=min_chunk_size, url_prefix=url_prefix, token_overlap=token_overlap, extensions_to_process=extensions_to_process, - form_recognizer_client=None, use_layout=use_layout, add_embeddings=add_embeddings) + form_recognizer_client=None, use_layout=use_layout, add_embeddings=add_embeddings, + azure_credential=azure_credential, embedding_endpoint=embedding_endpoint) with ProcessPoolExecutor(max_workers=njobs) as executor: futures = list(tqdm(executor.map(process_file_partial, files_to_process), total=len(files_to_process))) for result, is_error in futures: diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index 6c515f7294..6f4cc57db1 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -8,11 +8,16 @@ from azure.search.documents.indexes import SearchIndexClient from azure.search.documents.indexes.models import ( SearchableField, + SearchField, + SearchFieldDataType, SemanticField, SemanticSettings, SemanticConfiguration, SearchIndex, PrioritizedFields, + VectorSearch, + VectorSearchAlgorithmConfiguration, + HnswParameters ) from azure.search.documents import SearchClient from azure.ai.formrecognizer import DocumentAnalysisClient @@ -37,6 +42,9 @@ def create_search_index(index_name, index_client): SearchableField(name="filepath", type="Edm.String"), SearchableField(name="url", type="Edm.String"), SearchableField(name="metadata", type="Edm.String"), + SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + hidden=False, searchable=True, filterable=False, sortable=False, facetable=False, + 
vector_search_dimensions=1536, vector_search_configuration="default"), ], semantic_settings=SemanticSettings( configurations=[ @@ -51,6 +59,15 @@ def create_search_index(index_name, index_client): ) ] ), + vector_search=VectorSearch( + algorithm_configurations=[ + VectorSearchAlgorithmConfiguration( + name="default", + kind="hnsw", + hnsw_parameters=HnswParameters(metric="cosine") + ) + ] + ) ) print(f"Creating {index_name} search index") index_client.create_index(index) @@ -66,6 +83,8 @@ def upload_documents_to_index(docs, search_client, upload_batch_size=50): d = dataclasses.asdict(document) # add id to documents d.update({"@search.action": "upload", "id": str(id)}) + if "contentVector" in d and d["contentVector"] is None: + del d["contentVector"] to_upload_dicts.append(d) id += 1 @@ -108,7 +127,7 @@ def validate_index(index_name, index_client): def create_and_populate_index( - index_name, index_client, search_client, form_recognizer_client + index_name, index_client, search_client, form_recognizer_client, azure_credential, embedding_endpoint ): # create or update search index with compatible schema create_search_index(index_name, index_client) @@ -121,6 +140,9 @@ def create_and_populate_index( use_layout=True, ignore_errors=False, njobs=1, + add_embeddings=True, + azure_credential=azure_credential, + embedding_endpoint=embedding_endpoint ) if len(result.chunks) == 0: @@ -174,6 +196,11 @@ def create_and_populate_index( required=False, help="Optional. Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)", ) + parser.add_argument( + "--embeddingendpoint", + required=False, + help="Optional.
Use this OpenAI endpoint to generate embeddings for the documents", + ) args = parser.parse_args() # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them @@ -204,6 +231,6 @@ def create_and_populate_index( credential=formrecognizer_creds, ) create_and_populate_index( - args.index, index_client, search_client, form_recognizer_client + args.index, index_client, search_client, form_recognizer_client, azd_credential, args.embeddingendpoint ) print("Data preparation for index", args.index, "completed") diff --git a/scripts/prepdocs.sh b/scripts/prepdocs.sh index 86a6310f88..1f1a6b933a 100755 --- a/scripts/prepdocs.sh +++ b/scripts/prepdocs.sh @@ -3,4 +3,4 @@ . ./scripts/loadenv.sh echo 'Running "prepdocs.py"' -./.venv/bin/python ./scripts/prepdocs.py --searchservice "$AZURE_SEARCH_SERVICE" --index "$AZURE_SEARCH_INDEX" --formrecognizerservice "$AZURE_FORMRECOGNIZER_SERVICE" --tenantid "$AZURE_TENANT_ID" +./.venv/bin/python ./scripts/prepdocs.py --searchservice "$AZURE_SEARCH_SERVICE" --index "$AZURE_SEARCH_INDEX" --formrecognizerservice "$AZURE_FORMRECOGNIZER_SERVICE" --tenantid "$AZURE_TENANT_ID" --embeddingendpoint "$AZURE_OPENAI_EMBEDDING_ENDPOINT" diff --git a/scripts/readme.md b/scripts/readme.md index 4b632f5b67..160e1903c0 100644 --- a/scripts/readme.md +++ b/scripts/readme.md @@ -36,10 +36,10 @@ Azure Cognitive Search supports vector search in public preview. See [the docs]( To add vectors to your index, you will first need an [Azure OpenAI resource](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview) with an [Ada embedding model deployment](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#embeddings-models). The `text-embedding-ada-002` model is supported. -- Get the endpoint and API key for embedding model deployment. The API key can be found in the Azure Portal under Resource Management for your Azure OpenAI resource, you can use either Key 1 or Key 2. 
The endpoint will generally be of the format `https://.openai.azure.com/openai/deployments//embeddings?api-version=2023-06-01-preview`. +- Get the endpoint for embedding model deployment. The endpoint will generally be of the format `https://.openai.azure.com/openai/deployments//embeddings?api-version=2023-06-01-preview`. - Run the data preparation script, passing in your config file and the embedding endpoint and key as extra arguments: - `python data_preparation.py --config config.json --embedding-model-endpoint "" --embedding-model-key ""` + `python data_preparation.py --config config.json --embedding-model-endpoint ""` ## Optional: Crack PDFs to Text If your data is in PDF format, you'll first need to convert from PDF to .txt format. You can use your own script for this, or use the provided conversion code here. diff --git a/start.sh b/start.sh index 12fc5f9354..8784c4793c 100755 --- a/start.sh +++ b/start.sh @@ -25,7 +25,7 @@ cd .. echo "" echo "Starting backend" echo "" -./.venv/bin/python -m flask run --port=5000 --host=127.0.0.1 --reload --debug +./.venv/bin/python -m flask run --port=5000 --host=127.0.0.1 --reload --debug if [ $? -ne 0 ]; then echo "Failed to start backend" exit $?