diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000000..55b1852adb --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,8 @@ +ARG VARIANT=bullseye +FROM mcr.microsoft.com/vscode/devcontainers/base:0-${VARIANT} +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-get update && apt-get install -y xdg-utils \ + && apt-get clean -y && rm -rf /var/lib/apt/lists/* +RUN mkdir -p /opt/microsoft/azd \ + && curl -L https://github.com/Azure/azure-dev/releases/download/azure-dev-cli_0.9.0-beta.2/azd-linux-arm64-beta.tar.gz | tar zxvf - -C /opt/microsoft/azd \ + && ln -s /opt/microsoft/azd/azd-linux-arm64 /usr/local/bin/azd \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000000..ed2b84d7af --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,33 @@ +{ + "name": "Azure Developer CLI", + "build": { + "dockerfile": "Dockerfile", + "args": { + "VARIANT": "bullseye" + } + }, + "features": { + "ghcr.io/devcontainers/features/node:1": { + "version": "16", + "nodeGypDependencies": false + }, + "ghcr.io/devcontainers/features/azure-cli:1.0.8": {} + }, + "customizations": { + "vscode": { + "extensions": [ + "ms-azuretools.azure-dev", + "ms-azuretools.vscode-bicep", + "ms-python.python" + ] + } + }, + "forwardPorts": [ + 5000 + ], + "postCreateCommand": "", + "remoteUser": "vscode", + "hostRequirements": { + "memory": "8gb" + } +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index bfa28926b6..b74a4cf685 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ .venv frontend/node_modules .env -static \ No newline at end of file +static +.azure/ +__pycache__/ diff --git a/README_azd.md b/README_azd.md new file mode 100644 index 0000000000..31d58680ac --- /dev/null +++ b/README_azd.md @@ -0,0 +1,70 @@ +# (Preview) Sample Chat App with AOAI + +## Deploying with the Azure Developer CLI + +> **IMPORTANT:** In order to deploy and run this example, you'll need an **Azure subscription with access enabled for the Azure OpenAI service**. You can request access [here](https://aka.ms/oaiapply). You can also visit [here](https://azure.microsoft.com/free/cognitive-search/) to get some free Azure credits to get you started. + +> **AZURE RESOURCE COSTS** by default this sample will create Azure App Service and Azure Cognitive Search resources that have a monthly cost, as well as Form Recognizer resource that has cost per document page. You can switch them to free versions of each of them if you want to avoid this cost by changing the parameters file under the infra folder (though there are some limits to consider; for example, you can have up to 1 free Cognitive Search resource per subscription, and the free Form Recognizer resource only analyzes the first 2 pages of each document.) + +### Prerequisites + +If you open this project in GitHub Codespaces or a local Dev Container, these will be available in the environment. +Otherwise, you need to install them locally. + +- [Azure Developer CLI](https://aka.ms/azure-dev/install) +- [Python 3+](https://www.python.org/downloads/) + - **Important**: Python and the pip package manager must be in the path in Windows for the setup scripts to work. +- [Node.js](https://nodejs.org/en/download/) +- [Git](https://git-scm.com/downloads) +- [Powershell 7+ (pwsh)](https://github.com/powershell/powershell) - For Windows users only. + - **Important**: Ensure you can run `pwsh.exe` from a PowerShell command. 
If this fails, you likely need to upgrade PowerShell.
+
+>NOTE: Your Azure Account must have `Microsoft.Authorization/roleAssignments/write` permissions, such as [User Access Administrator](https://learn.microsoft.com/azure/role-based-access-control/built-in-roles#user-access-administrator) or [Owner](https://learn.microsoft.com/azure/role-based-access-control/built-in-roles#owner).
+
+### Starting from scratch:
+
+If you don't have any pre-existing Azure services (e.g. an OpenAI or Cognitive Search service), then you can provision
+all resources from scratch by following these steps:
+
+1. Run `azd up` - This will provision Azure resources and deploy this sample to those resources, including building the search index based on the files found in the `./data` folder.
+1. After the application has been successfully deployed, you will see a URL printed to the console. Click that URL to interact with the application in your browser.
+ > NOTE: It may take a minute for the application to be fully deployed. If you see a "Python Developer" welcome screen, then wait a minute and refresh the page.
+
+### Use existing resources:
+
+If you have existing Azure resources that you want to reuse, then you must first set `azd` environment variables _before_ running `azd up`.
+
+Run the following commands based on what you want to customize:
+
+* `azd env set AZURE_OPENAI_RESOURCE {Name of existing OpenAI service}`
+* `azd env set AZURE_OPENAI_RESOURCE_GROUP {Name of existing resource group that OpenAI service is provisioned to}`
+* `azd env set AZURE_OPENAI_SKU_NAME {Name of OpenAI SKU}`. Defaults to 'S0'.
+* `azd env set AZURE_SEARCH_SERVICE {Name of existing Cognitive Search service}`
+* `azd env set AZURE_SEARCH_SERVICE_RESOURCE_GROUP {Name of existing resource group that Cognitive Search service is provisioned to}`
+* `azd env set AZURE_SEARCH_SKU_NAME {Name of Cognitive Search SKU}`. Defaults to 'standard'.
+* `azd env set AZURE_STORAGE_ACCOUNT {Name of existing Storage account}`. Used by prepdocs.py for uploading docs.
+* `azd env set AZURE_STORAGE_ACCOUNT_RESOURCE_GROUP {Name of existing resource group that Storage account is provisioned to}`.
+* `azd env set AZURE_FORMRECOGNIZER_SERVICE {Name of existing Form Recognizer service}`. Used by prepdocs.py for text extraction from docs.
+* `azd env set AZURE_FORMRECOGNIZER_SERVICE_RESOURCE_GROUP {Name of existing resource group that Form Recognizer service is provisioned to}`.
+* `azd env set AZURE_FORMRECOGNIZER_SKU_NAME {Name of Form Recognizer SKU}`. Defaults to 'S0'.
+
+Then:
+
+1. Run `azd up`. This will provision any missing Azure resources and deploy this sample to those resources, including building the search index based on the files found in the `./data` folder.
+1. After the application has been successfully deployed, you will see a URL printed to the console. Click that URL to interact with the application in your browser.
+ > NOTE: It may take a minute for the application to be fully deployed. If you see a "Python Developer" welcome screen, then wait a minute and refresh the page.
+
+### Re-deploying changes
+
+If you make any changes to the app code (JS or Python), you can re-deploy the app code to App Service by running the `azd deploy` command.
+
+If you change any of the Bicep files in the infra folder, then you should re-run `azd up` to both provision resources and deploy code.
+
+### Running locally:
+
+1. Run `azd auth login`
+2. Run `./start.cmd` or `./start.sh` to start the project locally.
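The values you set with `azd env set` (and the outputs of provisioning) surface to the app as plain environment variables: the postprovision hook in `azure.yaml` below writes them to a local `.env` file, and App Service receives the same names as app settings. As a minimal sketch of how the backend consumes them for a local run — assuming `python-dotenv` is installed, which is an assumption and not something this change adds — the app simply reads them from `os.environ`:

```python
# Illustrative only: mirrors how app.py reads its configuration.
# Assumption: python-dotenv is available to load the .env file generated by the
# `azd env get-values > .env` postprovision hook for local development.
import os

from dotenv import load_dotenv

load_dotenv()  # effectively a no-op in App Service, where these arrive as app settings

AZURE_SEARCH_SERVICE = os.environ.get("AZURE_SEARCH_SERVICE")
AZURE_SEARCH_INDEX = os.environ.get("AZURE_SEARCH_INDEX")
AZURE_OPENAI_RESOURCE = os.environ.get("AZURE_OPENAI_RESOURCE")
AZURE_OPENAI_MODEL = os.environ.get("AZURE_OPENAI_MODEL")

print(f"Search: {AZURE_SEARCH_SERVICE}/{AZURE_SEARCH_INDEX}")
print(f"OpenAI: {AZURE_OPENAI_RESOURCE}, deployment {AZURE_OPENAI_MODEL}")
```

If these print as `None` locally, re-run `azd env get-values > .env` (or `azd up`) to refresh the generated file.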
+ +### Note + +>Note: The PDF documents used in this demo contain information generated using a language model (Azure OpenAI Service). The information contained in these documents is only for demonstration purposes and does not reflect the opinions or beliefs of Microsoft. Microsoft makes no representations or warranties of any kind, express or implied, about the completeness, accuracy, reliability, suitability or availability with respect to the information contained in this document. All rights reserved to Microsoft. diff --git a/app.py b/app.py index b1c48aab36..ec1f1ef3cd 100644 --- a/app.py +++ b/app.py @@ -10,6 +10,9 @@ app = Flask(__name__) +# setup basic logging +logging.basicConfig(level=logging.INFO) + @app.route("/", defaults={"path": "index.html"}) @app.route("/") def static_file(path): @@ -26,6 +29,7 @@ def static_file(path): AZURE_SEARCH_CONTENT_COLUMNS = os.environ.get("AZURE_SEARCH_CONTENT_COLUMNS") AZURE_SEARCH_FILENAME_COLUMN = os.environ.get("AZURE_SEARCH_FILENAME_COLUMN") AZURE_SEARCH_TITLE_COLUMN = os.environ.get("AZURE_SEARCH_TITLE_COLUMN") +print('title', AZURE_SEARCH_TITLE_COLUMN) AZURE_SEARCH_URL_COLUMN = os.environ.get("AZURE_SEARCH_URL_COLUMN") # AOAI Integration Settings @@ -44,7 +48,7 @@ def static_file(path): SHOULD_STREAM = True if AZURE_OPENAI_STREAM.lower() == "true" else False def is_chat_model(): - if 'gpt-4' in AZURE_OPENAI_MODEL_NAME.lower(): + if 'gpt-4' in AZURE_OPENAI_MODEL_NAME.lower() or 'gpt-35' in AZURE_OPENAI_MODEL_NAME.lower(): return True return False @@ -85,7 +89,7 @@ def prepare_body_headers_with_data(request): } ] } - + app.logger.info(body) chatgpt_url = f"https://{AZURE_OPENAI_RESOURCE}.openai.azure.com/openai/deployments/{AZURE_OPENAI_MODEL}" if is_chat_model(): chatgpt_url += "/chat/completions?api-version=2023-03-15-preview" @@ -138,7 +142,7 @@ def stream_with_data(body, headers, endpoint): deltaText = lineJson["choices"][0]["messages"][0]["delta"]["content"] if deltaText != "[DONE]": response["choices"][0]["messages"][1]["content"] += deltaText - + app.logger.info(response) yield json.dumps(response).replace("\n", "\\n") + "\n" except Exception as e: yield json.dumps({"error": str(e)}).replace("\n", "\\n") + "\n" diff --git a/azure.yaml b/azure.yaml new file mode 100644 index 0000000000..29ebf238d4 --- /dev/null +++ b/azure.yaml @@ -0,0 +1,34 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/Azure/azure-dev/main/schemas/v1.0/azure.yaml.json + +name: sample-app-aoai-chatgpt +metadata: + template: sample-app-aoai-chatgpt@0.0.1-beta +services: + backend: + project: . 
+ language: py + host: appservice + hooks: + prepackage: + windows: + shell: pwsh + run: cd ./frontend;npm install;npm run build + interactive: true + continueOnError: false + posix: + shell: sh + run: cd ./frontend;npm install;npm run build + interactive: true + continueOnError: false +hooks: + postprovision: + windows: + shell: pwsh + run: $output = azd env get-values; Add-Content -Path .env -Value $output; + interactive: true + continueOnError: false + posix: + shell: sh + run: azd env get-values > .env + interactive: true + continueOnError: false \ No newline at end of file diff --git a/data/employee_handbook.pdf b/data/employee_handbook.pdf new file mode 100644 index 0000000000..878f36f7dd Binary files /dev/null and b/data/employee_handbook.pdf differ diff --git a/infra/abbreviations.json b/infra/abbreviations.json new file mode 100644 index 0000000000..703e503867 --- /dev/null +++ b/infra/abbreviations.json @@ -0,0 +1,135 @@ +{ + "analysisServicesServers": "as", + "apiManagementService": "apim-", + "appConfigurationConfigurationStores": "appcs-", + "appManagedEnvironments": "cae-", + "appContainerApps": "ca-", + "authorizationPolicyDefinitions": "policy-", + "automationAutomationAccounts": "aa-", + "blueprintBlueprints": "bp-", + "blueprintBlueprintsArtifacts": "bpa-", + "cacheRedis": "redis-", + "cdnProfiles": "cdnp-", + "cdnProfilesEndpoints": "cdne-", + "cognitiveServicesAccounts": "cog-", + "cognitiveServicesFormRecognizer": "cog-fr-", + "cognitiveServicesTextAnalytics": "cog-ta-", + "computeAvailabilitySets": "avail-", + "computeCloudServices": "cld-", + "computeDiskEncryptionSets": "des", + "computeDisks": "disk", + "computeDisksOs": "osdisk", + "computeGalleries": "gal", + "computeSnapshots": "snap-", + "computeVirtualMachines": "vm", + "computeVirtualMachineScaleSets": "vmss-", + "containerInstanceContainerGroups": "ci", + "containerRegistryRegistries": "cr", + "containerServiceManagedClusters": "aks-", + "databricksWorkspaces": "dbw-", + "dataFactoryFactories": "adf-", + "dataLakeAnalyticsAccounts": "dla", + "dataLakeStoreAccounts": "dls", + "dataMigrationServices": "dms-", + "dBforMySQLServers": "mysql-", + "dBforPostgreSQLServers": "psql-", + "devicesIotHubs": "iot-", + "devicesProvisioningServices": "provs-", + "devicesProvisioningServicesCertificates": "pcert-", + "documentDBDatabaseAccounts": "cosmos-", + "eventGridDomains": "evgd-", + "eventGridDomainsTopics": "evgt-", + "eventGridEventSubscriptions": "evgs-", + "eventHubNamespaces": "evhns-", + "eventHubNamespacesEventHubs": "evh-", + "hdInsightClustersHadoop": "hadoop-", + "hdInsightClustersHbase": "hbase-", + "hdInsightClustersKafka": "kafka-", + "hdInsightClustersMl": "mls-", + "hdInsightClustersSpark": "spark-", + "hdInsightClustersStorm": "storm-", + "hybridComputeMachines": "arcs-", + "insightsActionGroups": "ag-", + "insightsComponents": "appi-", + "keyVaultVaults": "kv-", + "kubernetesConnectedClusters": "arck", + "kustoClusters": "dec", + "kustoClustersDatabases": "dedb", + "logicIntegrationAccounts": "ia-", + "logicWorkflows": "logic-", + "machineLearningServicesWorkspaces": "mlw-", + "managedIdentityUserAssignedIdentities": "id-", + "managementManagementGroups": "mg-", + "migrateAssessmentProjects": "migr-", + "networkApplicationGateways": "agw-", + "networkApplicationSecurityGroups": "asg-", + "networkAzureFirewalls": "afw-", + "networkBastionHosts": "bas-", + "networkConnections": "con-", + "networkDnsZones": "dnsz-", + "networkExpressRouteCircuits": "erc-", + "networkFirewallPolicies": "afwp-", + 
"networkFirewallPoliciesWebApplication": "waf", + "networkFirewallPoliciesRuleGroups": "wafrg", + "networkFrontDoors": "fd-", + "networkFrontdoorWebApplicationFirewallPolicies": "fdfp-", + "networkLoadBalancersExternal": "lbe-", + "networkLoadBalancersInternal": "lbi-", + "networkLoadBalancersInboundNatRules": "rule-", + "networkLocalNetworkGateways": "lgw-", + "networkNatGateways": "ng-", + "networkNetworkInterfaces": "nic-", + "networkNetworkSecurityGroups": "nsg-", + "networkNetworkSecurityGroupsSecurityRules": "nsgsr-", + "networkNetworkWatchers": "nw-", + "networkPrivateDnsZones": "pdnsz-", + "networkPrivateLinkServices": "pl-", + "networkPublicIPAddresses": "pip-", + "networkPublicIPPrefixes": "ippre-", + "networkRouteFilters": "rf-", + "networkRouteTables": "rt-", + "networkRouteTablesRoutes": "udr-", + "networkTrafficManagerProfiles": "traf-", + "networkVirtualNetworkGateways": "vgw-", + "networkVirtualNetworks": "vnet-", + "networkVirtualNetworksSubnets": "snet-", + "networkVirtualNetworksVirtualNetworkPeerings": "peer-", + "networkVirtualWans": "vwan-", + "networkVpnGateways": "vpng-", + "networkVpnGatewaysVpnConnections": "vcn-", + "networkVpnGatewaysVpnSites": "vst-", + "notificationHubsNamespaces": "ntfns-", + "notificationHubsNamespacesNotificationHubs": "ntf-", + "operationalInsightsWorkspaces": "log-", + "portalDashboards": "dash-", + "powerBIDedicatedCapacities": "pbi-", + "purviewAccounts": "pview-", + "recoveryServicesVaults": "rsv-", + "resourcesResourceGroups": "rg-", + "searchSearchServices": "srch-", + "serviceBusNamespaces": "sb-", + "serviceBusNamespacesQueues": "sbq-", + "serviceBusNamespacesTopics": "sbt-", + "serviceEndPointPolicies": "se-", + "serviceFabricClusters": "sf-", + "signalRServiceSignalR": "sigr", + "sqlManagedInstances": "sqlmi-", + "sqlServers": "sql-", + "sqlServersDataWarehouse": "sqldw-", + "sqlServersDatabases": "sqldb-", + "sqlServersDatabasesStretch": "sqlstrdb-", + "storageStorageAccounts": "st", + "storageStorageAccountsVm": "stvm", + "storSimpleManagers": "ssimp", + "streamAnalyticsCluster": "asa-", + "synapseWorkspaces": "syn", + "synapseWorkspacesAnalyticsWorkspaces": "synw", + "synapseWorkspacesSqlPoolsDedicated": "syndp", + "synapseWorkspacesSqlPoolsSpark": "synsp", + "timeSeriesInsightsEnvironments": "tsi-", + "webServerFarms": "plan-", + "webSitesAppService": "app-", + "webSitesAppServiceEnvironment": "ase-", + "webSitesFunctions": "func-", + "webStaticSites": "stapp-" +} diff --git a/infra/core/ai/cognitiveservices.bicep b/infra/core/ai/cognitiveservices.bicep new file mode 100644 index 0000000000..a9d0f49680 --- /dev/null +++ b/infra/core/ai/cognitiveservices.bicep @@ -0,0 +1,40 @@ +param name string +param location string = resourceGroup().location +param tags object = {} + +param customSubDomainName string = name +param deployments array = [] +param kind string = 'OpenAI' +param publicNetworkAccess string = 'Enabled' +param sku object = { + name: 'S0' +} + +resource account 'Microsoft.CognitiveServices/accounts@2022-10-01' = { + name: name + location: location + tags: tags + kind: kind + properties: { + customSubDomainName: customSubDomainName + publicNetworkAccess: publicNetworkAccess + } + sku: sku +} + +@batchSize(1) +resource deployment 'Microsoft.CognitiveServices/accounts/deployments@2022-10-01' = [for deployment in deployments: { + parent: account + name: deployment.name + properties: { + model: deployment.model + raiPolicyName: contains(deployment, 'raiPolicyName') ? 
deployment.raiPolicyName : null + scaleSettings: deployment.scaleSettings + } +}] + +output endpoint string = account.properties.endpoint +output id string = account.id +output name string = account.name +output skuName string = account.sku.name +output key string = account.listKeys().key1 diff --git a/infra/core/host/appservice.bicep b/infra/core/host/appservice.bicep new file mode 100644 index 0000000000..c90c2491a2 --- /dev/null +++ b/infra/core/host/appservice.bicep @@ -0,0 +1,100 @@ +param name string +param location string = resourceGroup().location +param tags object = {} + +// Reference Properties +param applicationInsightsName string = '' +param appServicePlanId string +param keyVaultName string = '' +param managedIdentity bool = !empty(keyVaultName) + +// Runtime Properties +@allowed([ + 'dotnet', 'dotnetcore', 'dotnet-isolated', 'node', 'python', 'java', 'powershell', 'custom' +]) +param runtimeName string +param runtimeNameAndVersion string = '${runtimeName}|${runtimeVersion}' +param runtimeVersion string + +// Microsoft.Web/sites Properties +param kind string = 'app,linux' + +// Microsoft.Web/sites/config +param allowedOrigins array = [] +param alwaysOn bool = true +param appCommandLine string = '' +param appSettings object = {} +param clientAffinityEnabled bool = false +param enableOryxBuild bool = contains(kind, 'linux') +param functionAppScaleLimit int = -1 +param linuxFxVersion string = runtimeNameAndVersion +param minimumElasticInstanceCount int = -1 +param numberOfWorkers int = -1 +param scmDoBuildDuringDeployment bool = false +param use32BitWorkerProcess bool = false +param ftpsState string = 'FtpsOnly' +param healthCheckPath string = '' + +resource appService 'Microsoft.Web/sites@2022-03-01' = { + name: name + location: location + tags: tags + kind: kind + properties: { + serverFarmId: appServicePlanId + siteConfig: { + linuxFxVersion: linuxFxVersion + alwaysOn: alwaysOn + ftpsState: ftpsState + appCommandLine: appCommandLine + numberOfWorkers: numberOfWorkers != -1 ? numberOfWorkers : null + minimumElasticInstanceCount: minimumElasticInstanceCount != -1 ? minimumElasticInstanceCount : null + use32BitWorkerProcess: use32BitWorkerProcess + functionAppScaleLimit: functionAppScaleLimit != -1 ? functionAppScaleLimit : null + healthCheckPath: healthCheckPath + cors: { + allowedOrigins: union([ 'https://portal.azure.com', 'https://ms.portal.azure.com' ], allowedOrigins) + } + } + clientAffinityEnabled: clientAffinityEnabled + httpsOnly: true + } + + identity: { type: managedIdentity ? 'SystemAssigned' : 'None' } + + resource configAppSettings 'config' = { + name: 'appsettings' + properties: union(appSettings, + { + SCM_DO_BUILD_DURING_DEPLOYMENT: string(scmDoBuildDuringDeployment) + ENABLE_ORYX_BUILD: string(enableOryxBuild) + }, + !empty(applicationInsightsName) ? { APPLICATIONINSIGHTS_CONNECTION_STRING: applicationInsights.properties.ConnectionString } : {}, + !empty(keyVaultName) ? 
{ AZURE_KEY_VAULT_ENDPOINT: keyVault.properties.vaultUri } : {}) + } + + resource configLogs 'config' = { + name: 'logs' + properties: { + applicationLogs: { fileSystem: { level: 'Verbose' } } + detailedErrorMessages: { enabled: true } + failedRequestsTracing: { enabled: true } + httpLogs: { fileSystem: { enabled: true, retentionInDays: 1, retentionInMb: 35 } } + } + dependsOn: [ + configAppSettings + ] + } +} + +resource keyVault 'Microsoft.KeyVault/vaults@2022-07-01' existing = if (!(empty(keyVaultName))) { + name: keyVaultName +} + +resource applicationInsights 'Microsoft.Insights/components@2020-02-02' existing = if (!empty(applicationInsightsName)) { + name: applicationInsightsName +} + +output identityPrincipalId string = managedIdentity ? appService.identity.principalId : '' +output name string = appService.name +output uri string = 'https://${appService.properties.defaultHostName}' diff --git a/infra/core/host/appserviceplan.bicep b/infra/core/host/appserviceplan.bicep new file mode 100644 index 0000000000..c444f40651 --- /dev/null +++ b/infra/core/host/appserviceplan.bicep @@ -0,0 +1,21 @@ +param name string +param location string = resourceGroup().location +param tags object = {} + +param kind string = '' +param reserved bool = true +param sku object + +resource appServicePlan 'Microsoft.Web/serverfarms@2022-03-01' = { + name: name + location: location + tags: tags + sku: sku + kind: kind + properties: { + reserved: reserved + } +} + +output id string = appServicePlan.id +output name string = appServicePlan.name diff --git a/infra/core/search/search-services.bicep b/infra/core/search/search-services.bicep new file mode 100644 index 0000000000..0c6081b76d --- /dev/null +++ b/infra/core/search/search-services.bicep @@ -0,0 +1,43 @@ +param name string +param location string = resourceGroup().location +param tags object = {} + +param sku object = { + name: 'standard' +} + +param authOptions object = {} +param semanticSearch string = 'disabled' + +resource search 'Microsoft.Search/searchServices@2021-04-01-preview' = { + name: name + location: location + tags: tags + identity: { + type: 'SystemAssigned' + } + properties: { + authOptions: authOptions + disableLocalAuth: false + disabledDataExfiltrationOptions: [] + encryptionWithCmk: { + enforcement: 'Unspecified' + } + hostingMode: 'default' + networkRuleSet: { + bypass: 'None' + ipRules: [] + } + partitionCount: 1 + publicNetworkAccess: 'Enabled' + replicaCount: 1 + semanticSearch: semanticSearch + } + sku: sku +} + +output id string = search.id +output endpoint string = 'https://${name}.search.windows.net/' +output name string = search.name +output skuName string = sku.name +output adminKey string = search.listAdminKeys().primaryKey diff --git a/infra/core/security/role.bicep b/infra/core/security/role.bicep new file mode 100644 index 0000000000..dca01e1839 --- /dev/null +++ b/infra/core/security/role.bicep @@ -0,0 +1,20 @@ +param principalId string + +@allowed([ + 'Device' + 'ForeignGroup' + 'Group' + 'ServicePrincipal' + 'User' +]) +param principalType string = 'ServicePrincipal' +param roleDefinitionId string + +resource role 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(subscription().id, resourceGroup().id, principalId, roleDefinitionId) + properties: { + principalId: principalId + principalType: principalType + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', roleDefinitionId) + } +} diff --git a/infra/core/storage/storage-account.bicep b/infra/core/storage/storage-account.bicep 
new file mode 100644 index 0000000000..b6dd989185 --- /dev/null +++ b/infra/core/storage/storage-account.bicep @@ -0,0 +1,58 @@ +param name string +param location string = resourceGroup().location +param tags object = {} + +@allowed([ 'Hot', 'Cool', 'Premium' ]) +param accessTier string = 'Hot' +param allowBlobPublicAccess bool = false +param allowCrossTenantReplication bool = true +param allowSharedKeyAccess bool = true +param defaultToOAuthAuthentication bool = false +param deleteRetentionPolicy object = {} +@allowed([ 'AzureDnsZone', 'Standard' ]) +param dnsEndpointType string = 'Standard' +param kind string = 'StorageV2' +param minimumTlsVersion string = 'TLS1_2' +@allowed([ 'Enabled', 'Disabled' ]) +param publicNetworkAccess string = 'Disabled' +param sku object = { name: 'Standard_LRS' } + +param containers array = [] + +resource storage 'Microsoft.Storage/storageAccounts@2022-05-01' = { + name: name + location: location + tags: tags + kind: kind + sku: sku + properties: { + accessTier: accessTier + allowBlobPublicAccess: allowBlobPublicAccess + allowCrossTenantReplication: allowCrossTenantReplication + allowSharedKeyAccess: allowSharedKeyAccess + defaultToOAuthAuthentication: defaultToOAuthAuthentication + dnsEndpointType: dnsEndpointType + minimumTlsVersion: minimumTlsVersion + networkAcls: { + bypass: 'AzureServices' + defaultAction: 'Allow' + } + publicNetworkAccess: publicNetworkAccess + } + + resource blobServices 'blobServices' = if (!empty(containers)) { + name: 'default' + properties: { + deleteRetentionPolicy: deleteRetentionPolicy + } + resource container 'containers' = [for container in containers: { + name: container.name + properties: { + publicAccess: contains(container, 'publicAccess') ? container.publicAccess : 'None' + } + }] + } +} + +output name string = storage.name +output primaryEndpoints object = storage.properties.primaryEndpoints diff --git a/infra/docprep.bicep b/infra/docprep.bicep new file mode 100644 index 0000000000..f31e62fa8f --- /dev/null +++ b/infra/docprep.bicep @@ -0,0 +1,111 @@ +targetScope = 'subscription' + +param resourceGroupName string +param location string +param tags object = {} +param principalId string +param resourceToken string + +// Storage and form recognizer: Used by document uploader / extractor +param storageAccountName string = '' +param storageResourceGroupName string = '' +param storageResourceGroupLocation string = location +param storageContainerName string = 'content' + +param formRecognizerServiceName string = '' +param formRecognizerResourceGroupName string = '' +param formRecognizerResourceGroupLocation string = location +param formRecognizerSkuName string = 'S0' + +var abbrs = loadJsonContent('abbreviations.json') + +resource resourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = { + name: resourceGroupName +} + +resource storageResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(storageResourceGroupName)) { + name: !empty(storageResourceGroupName) ? storageResourceGroupName : resourceGroup.name +} + +resource formRecognizerResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(formRecognizerResourceGroupName)) { + name: !empty(formRecognizerResourceGroupName) ? formRecognizerResourceGroupName : resourceGroup.name +} + +module storage 'core/storage/storage-account.bicep' = { + name: 'storage' + scope: storageResourceGroup + params: { + name: !empty(storageAccountName) ? 
storageAccountName : '${abbrs.storageStorageAccounts}${resourceToken}' + location: storageResourceGroupLocation + tags: tags + publicNetworkAccess: 'Enabled' + sku: { + name: 'Standard_ZRS' + } + deleteRetentionPolicy: { + enabled: true + days: 2 + } + containers: [ + { + name: storageContainerName + publicAccess: 'None' + } + ] + } +} + +module formRecognizer 'core/ai/cognitiveservices.bicep' = { + name: 'formrecognizer' + scope: formRecognizerResourceGroup + params: { + name: !empty(formRecognizerServiceName) ? formRecognizerServiceName : '${abbrs.cognitiveServicesFormRecognizer}${resourceToken}' + kind: 'FormRecognizer' + location: formRecognizerResourceGroupLocation + tags: tags + sku: { + name: formRecognizerSkuName + } + } +} + +module storageRoleUser 'core/security/role.bicep' = { + scope: storageResourceGroup + name: 'storage-role-user' + params: { + principalId: principalId + roleDefinitionId: '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1' + principalType: 'User' + } +} + +module storageContribRoleUser 'core/security/role.bicep' = { + scope: storageResourceGroup + name: 'storage-contribrole-user' + params: { + principalId: principalId + roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' + principalType: 'User' + } +} + +module formRecognizerRoleUser 'core/security/role.bicep' = { + scope: formRecognizerResourceGroup + name: 'formrecognizer-role-user' + params: { + principalId: principalId + roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' + principalType: 'User' + } +} + +// Used by prepdocs +// Form recognizer +output AZURE_FORMRECOGNIZER_SERVICE string = formRecognizer.outputs.name +output AZURE_FORMRECOGNIZER_RESOURCE_GROUP string = formRecognizerResourceGroup.name +output AZURE_FORMRECOGNIZER_SKU_NAME string = formRecognizerSkuName + +// Storage +output AZURE_STORAGE_ACCOUNT string = storage.outputs.name +output AZURE_STORAGE_CONTAINER string = storageContainerName +output AZURE_STORAGE_RESOURCE_GROUP string = storageResourceGroup.name diff --git a/infra/main.bicep b/infra/main.bicep new file mode 100644 index 0000000000..27f005f4a4 --- /dev/null +++ b/infra/main.bicep @@ -0,0 +1,307 @@ +targetScope = 'subscription' + +@minLength(1) +@maxLength(64) +@description('Name of the the environment which is used to generate a short unique hash used in all resources.') +param environmentName string + +@minLength(1) +@description('Primary location for all resources') +param location string + +param appServicePlanName string = '' +param backendServiceName string = '' +param resourceGroupName string = '' + +param searchServiceName string = '' +param searchServiceResourceGroupName string = '' +param searchServiceResourceGroupLocation string = location +param searchServiceSkuName string = '' +param searchIndexName string = 'gptkbindex' +param searchUseSemanticSearch bool = false +param searchSemanticSearchConfig string = 'default' +param searchTopK int = 5 +param searchEnableInDomain bool = true +param searchContentColumns string = 'content' +param searchFilenameColumn string = 'filepath' +param searchTitleColumn string = 'title' +param searchUrlColumn string = 'url' + +param openAiResourceName string = '' +param openAiResourceGroupName string = '' +param openAiResourceGroupLocation string = location +param openAiSkuName string = '' +param openAIModel string = 'chat' +param openAIModelName string = 'gpt-35-turbo' +param openAITemperature int = 0 +param openAITopP int = 1 +param openAIMaxTokens int = 1000 +param openAIStopSequence string = '\n' +param openAISystemMessage string = 'You 
are an AI assistant that helps people find information.' +param openAIApiVersion string = '2023-06-01-preview' +param openAIStream bool = true + +// Used by prepdocs.py: Storage and form recognizer +param storageAccountName string = '' +param storageResourceGroupName string = '' +param storageResourceGroupLocation string = location +param storageContainerName string = 'content' +param formRecognizerServiceName string = '' +param formRecognizerResourceGroupName string = '' +param formRecognizerResourceGroupLocation string = location +param formRecognizerSkuName string = '' + +@description('Id of the user or app to assign application roles') +param principalId string = '' + +var abbrs = loadJsonContent('abbreviations.json') +var resourceToken = toLower(uniqueString(subscription().id, environmentName, location)) +var tags = { 'azd-env-name': environmentName } + +// Organize resources in a resource group +resource resourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' = { + name: !empty(resourceGroupName) ? resourceGroupName : '${abbrs.resourcesResourceGroups}${environmentName}' + location: location + tags: tags +} + +resource openAiResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(openAiResourceGroupName)) { + name: !empty(openAiResourceGroupName) ? openAiResourceGroupName : resourceGroup.name +} + +resource searchServiceResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(searchServiceResourceGroupName)) { + name: !empty(searchServiceResourceGroupName) ? searchServiceResourceGroupName : resourceGroup.name +} + + +// Create an App Service Plan to group applications under the same payment plan and SKU +module appServicePlan 'core/host/appserviceplan.bicep' = { + name: 'appserviceplan' + scope: resourceGroup + params: { + name: !empty(appServicePlanName) ? appServicePlanName : '${abbrs.webServerFarms}${resourceToken}' + location: location + tags: tags + sku: { + name: 'B1' + capacity: 1 + } + kind: 'linux' + } +} + +// The application frontend +module backend 'core/host/appservice.bicep' = { + name: 'web' + scope: resourceGroup + params: { + name: !empty(backendServiceName) ? 
backendServiceName : '${abbrs.webSitesAppService}backend-${resourceToken}' + location: location + tags: union(tags, { 'azd-service-name': 'backend' }) + appServicePlanId: appServicePlan.outputs.id + runtimeName: 'python' + runtimeVersion: '3.10' + scmDoBuildDuringDeployment: true + managedIdentity: true + appSettings: { + // search + AZURE_SEARCH_INDEX: searchIndexName + AZURE_SEARCH_SERVICE: searchService.outputs.name + AZURE_SEARCH_KEY: searchService.outputs.adminKey + AZURE_SEARCH_USE_SEMANTIC_SEARCH: searchUseSemanticSearch + AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG: searchSemanticSearchConfig + AZURE_SEARCH_TOP_K: searchTopK + AZURE_SEARCH_ENABLE_IN_DOMAIN: searchEnableInDomain + AZURE_SEARCH_CONTENT_COLUMNS: searchContentColumns + AZURE_SEARCH_FILENAME_COLUMN: searchFilenameColumn + AZURE_SEARCH_TITLE_COLUMN: searchTitleColumn + AZURE_SEARCH_URL_COLUMN: searchUrlColumn + // openai + AZURE_OPENAI_RESOURCE: openAi.outputs.name + AZURE_OPENAI_MODEL: openAIModel + AZURE_OPENAI_MODEL_NAME: openAIModelName + AZURE_OPENAI_KEY: openAi.outputs.key + AZURE_OPENAI_TEMPERATURE: openAITemperature + AZURE_OPENAI_TOP_P: openAITopP + AZURE_OPENAI_MAX_TOKENS: openAIMaxTokens + AZURE_OPENAI_STOP_SEQUENCE: openAIStopSequence + AZURE_OPENAI_SYSTEM_MESSAGE: openAISystemMessage + AZURE_OPENAI_PREVIEW_API_VERSION: openAIApiVersion + AZURE_OPENAI_STREAM: openAIStream + } + } +} + + +module openAi 'core/ai/cognitiveservices.bicep' = { + name: 'openai' + scope: openAiResourceGroup + params: { + name: !empty(openAiResourceName) ? openAiResourceName : '${abbrs.cognitiveServicesAccounts}${resourceToken}' + location: openAiResourceGroupLocation + tags: tags + sku: { + name: !empty(openAiSkuName) ? openAiSkuName : 'S0' + } + deployments: [ + { + name: openAIModel + model: { + format: 'OpenAI' + name: openAIModelName + version: '0301' + } + scaleSettings: { + scaleType: 'Standard' + } + } + ] + } +} + +module searchService 'core/search/search-services.bicep' = { + name: 'search-service' + scope: searchServiceResourceGroup + params: { + name: !empty(searchServiceName) ? searchServiceName : 'gptkb-${resourceToken}' + location: searchServiceResourceGroupLocation + tags: tags + authOptions: { + aadOrApiKey: { + aadAuthFailureMode: 'http401WithBearerChallenge' + } + } + sku: { + name: !empty(searchServiceSkuName) ? 
searchServiceSkuName : 'standard' + } + semanticSearch: 'free' + } +} + + + +// USER ROLES +module openAiRoleUser 'core/security/role.bicep' = { + scope: openAiResourceGroup + name: 'openai-role-user' + params: { + principalId: principalId + roleDefinitionId: '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' + principalType: 'User' + } +} + +module searchRoleUser 'core/security/role.bicep' = { + scope: searchServiceResourceGroup + name: 'search-role-user' + params: { + principalId: principalId + roleDefinitionId: '1407120a-92aa-4202-b7e9-c0e197c71c8f' + principalType: 'User' + } +} + +module searchIndexDataContribRoleUser 'core/security/role.bicep' = { + scope: searchServiceResourceGroup + name: 'search-index-data-contrib-role-user' + params: { + principalId: principalId + roleDefinitionId: '8ebe5a00-799e-43f5-93ac-243d3dce84a7' + principalType: 'User' + } +} + +module searchServiceContribRoleUser 'core/security/role.bicep' = { + scope: searchServiceResourceGroup + name: 'search-service-contrib-role-user' + params: { + principalId: principalId + roleDefinitionId: '7ca78c08-252a-4471-8644-bb5ff32d4ba0' + principalType: 'User' + } +} + +// SYSTEM IDENTITIES +module openAiRoleBackend 'core/security/role.bicep' = { + scope: openAiResourceGroup + name: 'openai-role-backend' + params: { + principalId: backend.outputs.identityPrincipalId + roleDefinitionId: '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' + principalType: 'ServicePrincipal' + } +} + +module searchRoleBackend 'core/security/role.bicep' = { + scope: searchServiceResourceGroup + name: 'search-role-backend' + params: { + principalId: backend.outputs.identityPrincipalId + roleDefinitionId: '1407120a-92aa-4202-b7e9-c0e197c71c8f' + principalType: 'ServicePrincipal' + } +} + +// For doc prep + +module docPrepResources 'docprep.bicep' = { + name: 'docprep-resources' + params: { + location: location + resourceToken: resourceToken + tags: tags + principalId: principalId + resourceGroupName: resourceGroup.name + storageAccountName: storageAccountName + storageResourceGroupName: storageResourceGroupName + storageResourceGroupLocation: storageResourceGroupLocation + storageContainerName: storageContainerName + formRecognizerServiceName: formRecognizerServiceName + formRecognizerResourceGroupName: formRecognizerResourceGroupName + formRecognizerResourceGroupLocation: formRecognizerResourceGroupLocation + formRecognizerSkuName: !empty(formRecognizerSkuName) ? 
formRecognizerSkuName : 'S0' + } +} +output AZURE_LOCATION string = location +output AZURE_TENANT_ID string = tenant().tenantId +output AZURE_RESOURCE_GROUP string = resourceGroup.name + +output BACKEND_URI string = backend.outputs.uri + +// search +output AZURE_SEARCH_INDEX string = searchIndexName +output AZURE_SEARCH_SERVICE string = searchService.outputs.name +output AZURE_SEARCH_SERVICE_RESOURCE_GROUP string = searchServiceResourceGroup.name +output AZURE_SEARCH_SKU_NAME string = searchService.outputs.skuName +output AZURE_SEARCH_KEY string = searchService.outputs.adminKey +output AZURE_SEARCH_USE_SEMANTIC_SEARCH bool = searchUseSemanticSearch +output AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG string = searchSemanticSearchConfig +output AZURE_SEARCH_TOP_K int = searchTopK +output AZURE_SEARCH_ENABLE_IN_DOMAIN bool = searchEnableInDomain +output AZURE_SEARCH_CONTENT_COLUMNS string = searchContentColumns +output AZURE_SEARCH_FILENAME_COLUMN string = searchFilenameColumn +output AZURE_SEARCH_TITLE_COLUMN string = searchTitleColumn +output AZURE_SEARCH_URL_COLUMN string = searchUrlColumn + +// openai +output AZURE_OPENAI_RESOURCE string = openAi.outputs.name +output AZURE_OPENAI_RESOURCE_GROUP string = openAiResourceGroup.name +output AZURE_OPENAI_MODEL string = openAIModel +output AZURE_OPENAI_MODEL_NAME string = openAIModelName +output AZURE_OPENAI_SKU_NAME string = openAi.outputs.skuName +output AZURE_OPENAI_KEY string = openAi.outputs.key +output AZURE_OPENAI_TEMPERATURE int = openAITemperature +output AZURE_OPENAI_TOP_P int = openAITopP +output AZURE_OPENAI_MAX_TOKENS int = openAIMaxTokens +output AZURE_OPENAI_STOP_SEQUENCE string = openAIStopSequence +output AZURE_OPENAI_SYSTEM_MESSAGE string = openAISystemMessage +output AZURE_OPENAI_PREVIEW_API_VERSION string = openAIApiVersion +output AZURE_OPENAI_STREAM bool = openAIStream + +// Used by prepdocs.py: +output AZURE_FORMRECOGNIZER_SERVICE string = docPrepResources.outputs.AZURE_FORMRECOGNIZER_SERVICE +output AZURE_FORMRECOGNIZER_RESOURCE_GROUP string = docPrepResources.outputs.AZURE_FORMRECOGNIZER_RESOURCE_GROUP +output AZURE_FORMRECOGNIZER_SKU_NAME string = docPrepResources.outputs.AZURE_FORMRECOGNIZER_SKU_NAME +output AZURE_STORAGE_ACCOUNT string = docPrepResources.outputs.AZURE_STORAGE_ACCOUNT +output AZURE_STORAGE_CONTAINER string = docPrepResources.outputs.AZURE_STORAGE_CONTAINER +output AZURE_STORAGE_RESOURCE_GROUP string = docPrepResources.outputs.AZURE_STORAGE_RESOURCE_GROUP diff --git a/infra/main.parameters.json b/infra/main.parameters.json new file mode 100644 index 0000000000..8b5ab944e4 --- /dev/null +++ b/infra/main.parameters.json @@ -0,0 +1,48 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "environmentName": { + "value": "${AZURE_ENV_NAME}" + }, + "location": { + "value": "${AZURE_LOCATION}" + }, + "principalId": { + "value": "${AZURE_PRINCIPAL_ID}" + }, + "openAiResourceName": { + "value": "${AZURE_OPENAI_RESOURCE}" + }, + "openAiResourceGroupName": { + "value": "${AZURE_OPENAI_RESOURCE_GROUP}" + }, + "openAiSkuName": { + "value": "${AZURE_OPENAI_SKU_NAME}" + }, + "searchServiceName": { + "value": "${AZURE_SEARCH_SERVICE}" + }, + "searchServiceResourceGroupName": { + "value": "${AZURE_SEARCH_SERVICE_RESOURCE_GROUP}" + }, + "searchServiceSkuName": { + "value": "${AZURE_SEARCH_SKU_NAME}" + }, + "storageAccountName": { + "value": "${AZURE_STORAGE_ACCOUNT}" + }, + "storageResourceGroupName": { + "value": 
"${AZURE_STORAGE_RESOURCE_GROUP}" + }, + "formRecognizerServiceName": { + "value": "${AZURE_FORMRECOGNIZER_SERVICE}" + }, + "formRecognizerResourceGroupName": { + "value": "${AZURE_FORMRECOGNIZER_RESOURCE_GROUP}" + }, + "formRecognizerSkuName": { + "value": "${AZURE_FORMRECOGNIZER_SKU_NAME}" + } + } +} diff --git a/scripts/config.json b/scripts/config.json deleted file mode 100644 index bef31a51c6..0000000000 --- a/scripts/config.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "data_path": "", - "location": "", - "subscription_id": "", - "resource_group": "", - "search_service_name": "", - "index_name": "", - "chunk_size": 1024, - "token_overlap": 128, - "semantic_config_name": "default" - } -] \ No newline at end of file diff --git a/scripts/data_preparation.py b/scripts/data_preparation.py deleted file mode 100644 index 8be06381a3..0000000000 --- a/scripts/data_preparation.py +++ /dev/null @@ -1,328 +0,0 @@ -"""Data Preparation Script for an Azure Cognitive Search Index.""" -import argparse -import json -import time -import requests -import subprocess -import dataclasses -from tqdm import tqdm -from azure.core.credentials import AzureKeyCredential -from azure.identity import AzureCliCredential -from data_utils import chunk_directory -from azure.search.documents import SearchClient -from azure.ai.formrecognizer import DocumentAnalysisClient - -def check_if_search_service_exists(search_service_name: str, - subscription_id: str, - resource_group: str, - credential = None): - """_summary_ - - Args: - search_service_name (str): _description_ - subscription_id (str): _description_ - resource_group (str): _description_ - credential: Azure credential to use for getting acs instance - """ - if credential is None: - raise ValueError("credential cannot be None") - url = ( - f"https://management.azure.com/subscriptions/{subscription_id}" - f"/resourceGroups/{resource_group}/providers/Microsoft.Search/searchServices" - f"/{search_service_name}?api-version=2021-04-01-preview" - ) - - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {credential.get_token('https://management.azure.com/.default').token}", - } - - response = requests.get(url, headers=headers) - return response.status_code == 200 - - -def create_search_service( - search_service_name: str, - subscription_id: str, - resource_group: str, - location: str, - sku: str = "standard", - credential = None, -): - """_summary_ - - Args: - search_service_name (str): _description_ - subscription_id (str): _description_ - resource_group (str): _description_ - location (str): _description_ - credential: Azure credential to use for creating acs instance - - Raises: - Exception: _description_ - """ - if credential is None: - raise ValueError("credential cannot be None") - url = ( - f"https://management.azure.com/subscriptions/{subscription_id}" - f"/resourceGroups/{resource_group}/providers/Microsoft.Search/searchServices" - f"/{search_service_name}?api-version=2021-04-01-preview" - ) - - payload = { - "location": f"{location}", - "sku": {"name": sku}, - "properties": { - "replicaCount": 1, - "partitionCount": 1, - "hostingMode": "default", - "semanticSearch": "free", - }, - } - - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {credential.get_token('https://management.azure.com/.default').token}", - } - - response = requests.put(url, json=payload, headers=headers) - if response.status_code != 201: - raise Exception( - f"Failed to create search service. 
Error: {response.text}") - -def create_or_update_search_index(service_name, subscription_id, resource_group, index_name, semantic_config_name, credential): - if credential is None: - raise ValueError("credential cannot be None") - admin_key = json.loads( - subprocess.run( - f"az search admin-key show --subscription {subscription_id} --resource-group {resource_group} --service-name {service_name}", - shell=True, - capture_output=True, - ).stdout - )["primaryKey"] - - url = f"https://{service_name}.search.windows.net/indexes/{index_name}?api-version=2021-04-30-Preview" - headers = { - "Content-Type": "application/json", - "api-key": admin_key, - } - - body = { - "fields": [ - { - "name": "id", - "type": "Edm.String", - "searchable": True, - "analyzer": "en.lucene", - "key": True, - }, - { - "name": "content", - "type": "Edm.String", - "searchable": True, - "sortable": False, - "facetable": False, - "filterable": False, - "analyzer": "en.lucene", - }, - { - "name": "title", - "type": "Edm.String", - "searchable": True, - "sortable": False, - "facetable": False, - "filterable": False, - "analyzer": "en.lucene", - }, - { - "name": "filepath", - "type": "Edm.String", - "searchable": True, - "sortable": False, - "facetable": False, - "filterable": False, - }, - { - "name": "url", - "type": "Edm.String", - "searchable": True, - }, - { - "name": "metadata", - "type": "Edm.String", - "searchable": True, - }, - ], - "suggesters": [], - "scoringProfiles": [], - "semantic": { - "configurations": [ - { - "name": semantic_config_name, - "prioritizedFields": { - "titleField": {"fieldName": "title"}, - "prioritizedContentFields": [{"fieldName": "content"}], - "prioritizedKeywordsFields": [], - }, - } - ] - }, - } - - response = requests.put(url, json=body, headers=headers) - if response.status_code == 201: - print(f"Created search index {index_name}") - elif response.status_code == 204: - print(f"Updated existing search index {index_name}") - else: - raise Exception(f"Failed to create search index. Error: {response.text}") - - return True - -def upload_documents_to_index(service_name, subscription_id, resource_group, index_name, docs, credential, upload_batch_size = 50): - if credential is None: - raise ValueError("credential cannot be None") - - to_upload_dicts = [] - - id = 0 - for document in docs: - d = dataclasses.asdict(document) - # add id to documents - d.update({"@search.action": "upload", "id": str(id)}) - to_upload_dicts.append(d) - id += 1 - - endpoint = "https://{}.search.windows.net/".format(service_name) - admin_key = json.loads( - subprocess.run( - f"az search admin-key show --subscription {subscription_id} --resource-group {resource_group} --service-name {service_name}", - shell=True, - capture_output=True, - ).stdout - )["primaryKey"] - - search_client = SearchClient( - endpoint=endpoint, - index_name=index_name, - credential=AzureKeyCredential(admin_key), - ) - # Upload the documents in batches of upload_batch_size - for i in tqdm(range(0, len(to_upload_dicts), upload_batch_size), desc="Indexing Chunks..."): - batch = to_upload_dicts[i: i + upload_batch_size] - results = search_client.upload_documents(documents=batch) - num_failures = 0 - errors = set() - for result in results: - if not result.succeeded: - print(f"Indexing Failed for {result.key} with ERROR: {result.error_message}") - num_failures += 1 - errors.add(result.error_message) - if num_failures > 0: - raise Exception(f"INDEXING FAILED for {num_failures} documents. Please recreate the index." 
- f"To Debug: PLEASE CHECK chunk_size and upload_batch_size. \n Error Messages: {list(errors)}") - -def validate_index(service_name, subscription_id, resource_group, index_name): - api_version = "2021-04-30-Preview" - admin_key = json.loads( - subprocess.run( - f"az search admin-key show --subscription {subscription_id} --resource-group {resource_group} --service-name {service_name}", - shell=True, - capture_output=True, - ).stdout - )["primaryKey"] - - headers = { - "Content-Type": "application/json", - "api-key": admin_key} - params = {"api-version": api_version} - url = f"https://{service_name}.search.windows.net/indexes/{index_name}/stats" - for retry_count in range(5): - response = requests.get(url, headers=headers, params=params) - - if response.status_code == 200: - response = response.json() - num_chunks = response['documentCount'] - if num_chunks==0 and retry_count < 4: - print("Index is empty. Waiting 60 seconds to check again...") - time.sleep(60) - elif num_chunks==0 and retry_count == 4: - print("Index is empty. Please investigate and re-index.") - else: - print(f"The index contains {num_chunks} chunks.") - average_chunk_size = response['storageSize']/num_chunks - print(f"The average chunk size of the index is {average_chunk_size} bytes.") - break - else: - if response.status_code==404: - print(f"The index does not seem to exist. Please make sure the index was created correctly, and that you are using the correct service and index names") - elif response.status_code==403: - print(f"Authentication Failure: Make sure you are using the correct key") - else: - print(f"Request failed. Please investigate. Status code: {response.status_code}") - break - -def create_index(config, credential, form_recognizer_client=None, use_layout=False): - service_name = config["search_service_name"] - subscription_id = config["subscription_id"] - resource_group = config["resource_group"] - location = config["location"] - index_name = config["index_name"] - - # check if search service exists, create if not - if check_if_search_service_exists(service_name, subscription_id, resource_group, credential): - print(f"Using existing search service {service_name}") - else: - print(f"Creating search service {service_name}") - create_search_service(service_name, subscription_id, resource_group, location, credential=credential) - - # create or update search index with compatible schema - if not create_or_update_search_index(service_name, subscription_id, resource_group, index_name, config["semantic_config_name"], credential): - raise Exception(f"Failed to create or update index {index_name}") - - # chunk directory - print("Chunking directory...") - result = chunk_directory(config["data_path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0), form_recognizer_client=form_recognizer_client, use_layout=use_layout) - - if len(result.chunks) == 0: - raise Exception("No chunks found. 
Please check the data path and chunk size.") - - print(f"Processed {result.total_files} files") - print(f"Unsupported formats: {result.num_unsupported_format_files} files") - print(f"Files with errors: {result.num_files_with_errors} files") - print(f"Found {len(result.chunks)} chunks") - - # upload documents to index - print("Uploading documents to index...") - upload_documents_to_index(service_name, subscription_id, resource_group, index_name, result.chunks, credential) - - # check if index is ready/validate index - print("Validating index...") - validate_index(service_name, subscription_id, resource_group, index_name) - print("Index validation completed") - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--config", type=str, help="Path to config file containing settings for data preparation") - parser.add_argument("--form-rec-resource", type=str, help="Name of your Form Recognizer resource to use for PDF cracking.") - parser.add_argument("--form-rec-key", type=str, help="Key for your Form Recognizer resource to use for PDF cracking.") - parser.add_argument("--form-rec-use-layout", default=False, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.") - args = parser.parse_args() - - with open(args.config) as f: - config = json.load(f) - - credential = AzureCliCredential() - form_recognizer_client = None - - print("Data preparation script started") - if args.form_rec_resource and args.form_rec_key: - form_recognizer_client = DocumentAnalysisClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key)) - print(f"Using Form Recognizer resource {args.form_rec_resource} for PDF cracking, with the {'Layout' if args.form_rec_use_layout else 'Read'} model.") - - for index_config in config: - print("Preparing data for index:", index_config["index_name"]) - create_index(index_config, credential, form_recognizer_client, use_layout=args.form_rec_use_layout) - print("Data preparation for index", index_config["index_name"], "completed") - - print(f"Data preparation script completed. {len(config)} indexes updated.") \ No newline at end of file diff --git a/scripts/data_utils.py b/scripts/data_utils.py index f3526989d3..2c996b5afe 100644 --- a/scripts/data_utils.py +++ b/scripts/data_utils.py @@ -1,638 +1,640 @@ -"""Data utilities for index preparation.""" -import os -import ast -import markdown -import re -import tiktoken -import html -import json - -from tqdm import tqdm -from abc import ABC, abstractmethod -from bs4 import BeautifulSoup, Tag, NavigableString -from dataclasses import dataclass - -from typing import List, Dict, Optional, Generator, Tuple, Union -from langchain.text_splitter import MarkdownTextSplitter, RecursiveCharacterTextSplitter, PythonCodeTextSplitter - -FILE_FORMAT_DICT = { - "md": "markdown", - "txt": "text", - "html": "html", - "shtml": "html", - "htm": "html", - "py": "python", - "pdf": "pdf" - } - -SENTENCE_ENDINGS = [".", "!", "?"] -WORDS_BREAKS = list(reversed([",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"])) - -@dataclass -class Document(object): - """A data class for storing documents - - Attributes: - content (str): The content of the document. - id (Optional[str]): The id of the document. - title (Optional[str]): The title of the document. - filepath (Optional[str]): The filepath of the document. - url (Optional[str]): The url of the document. 
- metadata (Optional[Dict]): The metadata of the document. - """ - - content: str - id: Optional[str] = None - title: Optional[str] = None - filepath: Optional[str] = None - url: Optional[str] = None - metadata: Optional[Dict] = None - -def cleanup_content(content: str) -> str: - """Cleans up the given content using regexes - Args: - content (str): The content to clean up. - Returns: - str: The cleaned up content. - """ - output = re.sub(r"\n{2,}", "\n", content) - output = re.sub(r"[^\S\n]{2,}", " ", output) - output = re.sub(r"-{2,}", "--", output) - - return output.strip() - -class BaseParser(ABC): - """A parser parses content to produce a document.""" - - @abstractmethod - def parse(self, content: str, file_name: Optional[str] = None) -> Document: - """Parses the given content. - Args: - content (str): The content to parse. - file_name (str): The file name associated with the content. - Returns: - Document: The parsed document. - """ - pass - - def parse_file(self, file_path: str) -> Document: - """Parses the given file. - Args: - file_path (str): The file to parse. - Returns: - Document: The parsed document. - """ - with open(file_path, "r") as f: - return self.parse(f.read(), os.path.basename(file_path)) - - def parse_directory(self, directory_path: str) -> List[Document]: - """Parses the given directory. - Args: - directory_path (str): The directory to parse. - Returns: - List[Document]: List of parsed documents. - """ - documents = [] - for file_name in os.listdir(directory_path): - file_path = os.path.join(directory_path, file_name) - if os.path.isfile(file_path): - documents.append(self.parse_file(file_path)) - return documents - -class MarkdownParser(BaseParser): - """Parses Markdown content.""" - - def __init__(self) -> None: - super().__init__() - self._html_parser = HTMLParser() - - def parse(self, content: str, file_name: Optional[str] = None) -> Document: - """Parses the given content. - Args: - content (str): The content to parse. - file_name (str): The file name associated with the content. - Returns: - Document: The parsed document. - """ - html_content = markdown.markdown(content, extensions=['fenced_code', 'toc', 'tables', 'sane_lists']) - - return self._html_parser.parse(html_content, file_name) - - -class HTMLParser(BaseParser): - """Parses HTML content.""" - TITLE_MAX_TOKENS = 128 - NEWLINE_TEMPL = "" - - def __init__(self) -> None: - super().__init__() - self.token_estimator = TokenEstimator() - - def parse(self, content: str, file_name: Optional[str] = None) -> Document: - """Parses the given content. - Args: - content (str): The content to parse. - file_name (str): The file name associated with the content. - Returns: - Document: The parsed document. - """ - soup = BeautifulSoup(content, 'html.parser') - - # Extract the title - title = '' - if soup.title and soup.title.string: - title = soup.title.string - else: - # Try to find the first

tag - h1_tag = soup.find('h1') - if h1_tag: - title = h1_tag.get_text(strip=True) - else: - h2_tag = soup.find('h2') - if h2_tag: - title = h2_tag.get_text(strip=True) - if title is None or title == '': - # if title is still not found, guess using the next string - try: - title = next(soup.stripped_strings) - title = self.token_estimator.construct_tokens_with_size(title, self.TITLE_MAX_TOKENS) - - except StopIteration: - title = file_name - - # Helper function to process text nodes - def process_text(text): - return text.strip() - - # Helper function to process anchor tags - def process_anchor_tag(tag): - href = tag.get('href', '') - text = tag.get_text(strip=True) - return f'{text} ({href})' - - # Collect all text nodes and anchor tags in a list - elements = [] - - for elem in soup.descendants: - if isinstance(elem, (Tag, NavigableString)): - page_element: Union[Tag, NavigableString] = elem - if page_element.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code']: - if elements and not elements[-1].endswith('\n'): - elements.append(self.NEWLINE_TEMPL) - if isinstance(page_element, str): - elements.append(process_text(page_element)) - elif page_element.name == 'a': - elements.append(process_anchor_tag(page_element)) - - - # Join the list into a single string and return but ensure that either of newlines or space are used. - result = '' - is_prev_newline = False - for elem in elements: - if elem: - if elem == self.NEWLINE_TEMPL: - result += "\n" - is_prev_newline = True - else: - if not is_prev_newline: - result += " " - else: - is_prev_newline = False - result += f"{elem}" - - if title is None: - title = '' # ensure no 'None' type title - return Document(content=cleanup_content(result), title=str(title)) - -class TextParser(BaseParser): - """Parses text content.""" - - def __init__(self) -> None: - super().__init__() - - def _get_first_alphanum_line(self, content: str) -> Optional[str]: - title = None - for line in content.splitlines(): - if any([c.isalnum() for c in line]): - title = line.strip() - break - return title - - def _get_first_line_with_property( - self, content: str, property: str = "title: " - ) -> Optional[str]: - title = None - for line in content.splitlines(): - if line.startswith(property): - title = line[len(property) :].strip() - break - return title - - def parse(self, content: str, file_name: Optional[str] = None) -> Document: - """Parses the given content. - Args: - content (str): The content to parse. - file_name (str): The file name associated with the content. - Returns: - Document: The parsed document. - """ - title = self._get_first_line_with_property( - content - ) or self._get_first_alphanum_line(content) - - return Document(content=cleanup_content(content), title=title or file_name) - - -class PythonParser(BaseParser): - def _get_topdocstring(self, text): - tree = ast.parse(text) - docstring = ast.get_docstring(tree) # returns top docstring - return docstring - - def parse(self, content: str, file_name: Optional[str] = None) -> Document: - """Parses the given content. - Args: - content (str): The content to parse. - file_name (str): The file name associated with the content. - Returns: - Document: The parsed document. 
- """ - docstring = self._get_topdocstring(content) - if docstring: - title = f"{file_name}: {docstring}" - else: - title = file_name - return Document(content=content, title=title) - - def __init__(self) -> None: - super().__init__() - -class ParserFactory: - def __init__(self): - self._parsers = { - "html": HTMLParser(), - "text": TextParser(), - "markdown": MarkdownParser(), - "python": PythonParser() - } - - @property - def supported_formats(self) -> List[str]: - "Returns a list of supported formats" - return list(self._parsers.keys()) - - def __call__(self, file_format: str) -> BaseParser: - parser = self._parsers.get(file_format, None) - if parser is None: - raise UnsupportedFormatError(f"{file_format} is not supported") - - return parser - -class TokenEstimator(object): - GPT2_TOKENIZER = tiktoken.get_encoding("gpt2") - - def estimate_tokens(self, text: str) -> int: - return len(self.GPT2_TOKENIZER.encode(text)) - - def construct_tokens_with_size(self, tokens: str, numofTokens: int) -> str: - newTokens = self.GPT2_TOKENIZER.decode( - self.GPT2_TOKENIZER.encode(tokens)[:numofTokens] - ) - return newTokens - -parser_factory = ParserFactory() -TOKEN_ESTIMATOR = TokenEstimator() - -class UnsupportedFormatError(Exception): - """Exception raised when a format is not supported by a parser.""" - - pass - -@dataclass -class ChunkingResult: - """Data model for chunking result - - Attributes: - chunks (List[Document]): List of chunks. - total_files (int): Total number of files. - num_unsupported_format_files (int): Number of files with unsupported format. - num_files_with_errors (int): Number of files with errors. - skipped_chunks (int): Number of chunks skipped. - """ - chunks: List[Document] - total_files: int - num_unsupported_format_files: int = 0 - num_files_with_errors: int = 0 - # some chunks might be skipped to small number of tokens - skipped_chunks: int = 0 - -def get_files_recursively(directory_path: str) -> List[str]: - """Gets all files in the given directory recursively. - Args: - directory_path (str): The directory to get files from. - Returns: - List[str]: List of file paths. - """ - file_paths = [] - for dirpath, _, files in os.walk(directory_path): - for file_name in files: - file_path = os.path.join(dirpath, file_name) - file_paths.append(file_path) - return file_paths - -def convert_escaped_to_posix(escaped_path): - windows_path = escaped_path.replace("\\\\", "\\") - posix_path = windows_path.replace("\\", "/") - return posix_path - -def _get_file_format(file_name: str, extensions_to_process: List[str]) -> Optional[str]: - """Gets the file format from the file name. - Returns None if the file format is not supported. - Args: - file_name (str): The file name. - extensions_to_process (List[str]): List of extensions to process. - Returns: - str: The file format. 
- """ - - # in case the caller gives us a file path - file_name = os.path.basename(file_name) - file_extension = file_name.split(".")[-1] - if file_extension not in extensions_to_process: - return None - return FILE_FORMAT_DICT.get(file_extension, None) - -def table_to_html(table): - table_html = "" - rows = [sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index) for i in range(table.row_count)] - for row_cells in rows: - table_html += "" - for cell in row_cells: - tag = "th" if (cell.kind == "columnHeader" or cell.kind == "rowHeader") else "td" - cell_spans = "" - if cell.column_span > 1: cell_spans += f" colSpan={cell.column_span}" - if cell.row_span > 1: cell_spans += f" rowSpan={cell.row_span}" - table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}" - table_html +="" - table_html += "
" - return table_html - -def extract_pdf_content(file_path, form_recognizer_client, use_layout=False): - offset = 0 - page_map = [] - model = "prebuilt-layout" if use_layout else "prebuilt-read" - with open(file_path, "rb") as f: - poller = form_recognizer_client.begin_analyze_document(model, document = f) - form_recognizer_results = poller.result() - - for page_num, page in enumerate(form_recognizer_results.pages): - tables_on_page = [table for table in form_recognizer_results.tables if table.bounding_regions[0].page_number == page_num + 1] - - # (if using layout) mark all positions of the table spans in the page - page_offset = page.spans[0].offset - page_length = page.spans[0].length - table_chars = [-1]*page_length - for table_id, table in enumerate(tables_on_page): - for span in table.spans: - # replace all table spans with "table_id" in table_chars array - for i in range(span.length): - idx = span.offset - page_offset + i - if idx >=0 and idx < page_length: - table_chars[idx] = table_id - - # build page text by replacing charcters in table spans with table html if using layout - page_text = "" - added_tables = set() - for idx, table_id in enumerate(table_chars): - if table_id == -1: - page_text += form_recognizer_results.content[page_offset + idx] - elif not table_id in added_tables: - page_text += table_to_html(tables_on_page[table_id]) - added_tables.add(table_id) - - page_text += " " - page_map.append((page_num, offset, page_text)) - offset += len(page_text) - - full_text = "".join([page_text for _, _, page_text in page_map]) - return full_text - -def chunk_content_helper( - content: str, file_format: str, file_name: Optional[str], - token_overlap: int, - num_tokens: int = 256 -) -> Generator[Tuple[str, int, Document], None, None]: - parser = parser_factory(file_format) - doc = parser.parse(content, file_name=file_name) - if num_tokens == None: - num_tokens = 1000000000 - - if file_format == "markdown": - splitter = MarkdownTextSplitter.from_tiktoken_encoder(chunk_size=num_tokens, chunk_overlap=token_overlap) - elif file_format == "python": - splitter = PythonCodeTextSplitter.from_tiktoken_encoder(chunk_size=num_tokens, chunk_overlap=token_overlap) - else: - splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - separators=SENTENCE_ENDINGS + WORDS_BREAKS, - chunk_size=num_tokens, chunk_overlap=token_overlap) - chunked_content_list = splitter.split_text(doc.content) - for chunked_content in chunked_content_list: - chunk_size = TOKEN_ESTIMATOR.estimate_tokens(chunked_content) - yield chunked_content, chunk_size, doc - -def chunk_content( - content: str, - file_name: Optional[str] = None, - url: Optional[str] = None, - ignore_errors: bool = True, - num_tokens: int = 256, - min_chunk_size: int = 10, - token_overlap: int = 0, - extensions_to_process = FILE_FORMAT_DICT.keys(), - cracked_pdf = False -) -> ChunkingResult: - """Chunks the given content. If ignore_errors is true, returns None - in case of an error - Args: - content (str): The content to chunk. - file_name (str): The file name. used for title, file format detection. - url (str): The url. used for title. - ignore_errors (bool): If true, ignores errors and returns None. - num_tokens (int): The number of tokens in each chunk. - min_chunk_size (int): The minimum chunk size below which chunks will be filtered. - token_overlap (int): The number of tokens to overlap between chunks. - Returns: - List[Document]: List of chunked documents. 
- """ - - try: - if file_name is None or cracked_pdf: - file_format = "text" - else: - file_format = _get_file_format(file_name, extensions_to_process) - if file_format is None: - raise Exception( - f"{file_name} is not supported") - - chunked_context = chunk_content_helper( - content=content, - file_name=file_name, - file_format=file_format, - num_tokens=num_tokens, - token_overlap=token_overlap - ) - chunks = [] - skipped_chunks = 0 - for chunk, chunk_size, doc in chunked_context: - if chunk_size >= min_chunk_size: - chunks.append( - Document( - content=chunk, - title=doc.title, - url=url, - ) - ) - else: - skipped_chunks += 1 - - except UnsupportedFormatError as e: - if ignore_errors: - return ChunkingResult( - chunks=[], total_files=1, num_unsupported_format_files=1 - ) - else: - raise e - except Exception as e: - if ignore_errors: - return ChunkingResult(chunks=[], total_files=1, num_files_with_errors=1) - else: - raise e - return ChunkingResult( - chunks=chunks, - total_files=1, - skipped_chunks=skipped_chunks, - ) - -def chunk_file( - file_path: str, - ignore_errors: bool = True, - num_tokens=256, - min_chunk_size=10, - url = None, - token_overlap: int = 0, - extensions_to_process = FILE_FORMAT_DICT.keys(), - form_recognizer_client = None, - use_layout = False -) -> ChunkingResult: - """Chunks the given file. - Args: - file_path (str): The file to chunk. - Returns: - List[Document]: List of chunked documents. - """ - file_name = os.path.basename(file_path) - file_format = _get_file_format(file_name, extensions_to_process) - if not file_format: - if ignore_errors: - return ChunkingResult( - chunks=[], total_files=1, num_unsupported_format_files=1 - ) - else: - raise UnsupportedFormatError(f"{file_name} is not supported") - - cracked_pdf = False - if file_format == "pdf": - if form_recognizer_client is None: - raise UnsupportedFormatError("form_recognizer_client is required for pdf files") - content = extract_pdf_content(file_path, form_recognizer_client, use_layout=use_layout) - cracked_pdf = True - else: - with open(file_path, "r", encoding="utf8") as f: - content = f.read() - return chunk_content( - content=content, - file_name=file_name, - ignore_errors=ignore_errors, - num_tokens=num_tokens, - min_chunk_size=min_chunk_size, - url=url, - token_overlap=max(0, token_overlap), - extensions_to_process=extensions_to_process, - cracked_pdf=cracked_pdf - ) - -def chunk_directory( - directory_path: str, - ignore_errors: bool = True, - num_tokens: int = 1024, - min_chunk_size: int = 10, - url_prefix = None, - token_overlap: int = 0, - extensions_to_process: List[str] = FILE_FORMAT_DICT.keys(), - form_recognizer_client = None, - use_layout = False -): - """ - Chunks the given directory recursively - Args: - directory_path (str): The directory to chunk. - ignore_errors (bool): If true, ignores errors and returns None. - num_tokens (int): The number of tokens to use for chunking. - min_chunk_size (int): The minimum chunk size. - url_prefix (str): The url prefix to use for the files. If None, the url will be None. If not None, the url will be url_prefix + relpath. - For example, if the directory path is /home/user/data and the url_prefix is https://example.com/data, - then the url for the file /home/user/data/file1.txt will be https://example.com/data/file1.txt - token_overlap (int): The number of tokens to overlap between chunks. - extensions_to_process (List[str]): The list of extensions to process. - form_recognizer_client: Optional form recognizer client to use for pdf files. 
- use_layout (bool): If true, uses Layout model for pdf files. Otherwise, uses Read. - - Returns: - List[Document]: List of chunked documents. - """ - chunks = [] - total_files = 0 - num_unsupported_format_files = 0 - num_files_with_errors = 0 - skipped_chunks = 0 - for file_path in tqdm(get_files_recursively(directory_path)): - if os.path.isfile(file_path): - # get relpath - url_path = None - rel_file_path = os.path.relpath(file_path, directory_path) - if url_prefix: - url_path = url_prefix + rel_file_path - url_path = convert_escaped_to_posix(url_path) - try: - result = chunk_file( - file_path, - ignore_errors=ignore_errors, - num_tokens=num_tokens, - min_chunk_size=min_chunk_size, - url=url_path, - token_overlap=token_overlap, - extensions_to_process=extensions_to_process, - form_recognizer_client=form_recognizer_client, - use_layout=use_layout - ) - for chunk_idx, chunk_doc in enumerate(result.chunks): - chunk_doc.filepath = rel_file_path - chunk_doc.metadata = json.dumps({"chunk_id": str(chunk_idx)}) - chunks.extend(result.chunks) - num_unsupported_format_files += result.num_unsupported_format_files - num_files_with_errors += result.num_files_with_errors - skipped_chunks += result.skipped_chunks - except Exception as e: - if not ignore_errors: - raise - print(f"File ({file_path}) failed with ", e) - num_files_with_errors += 1 - total_files += 1 - - return ChunkingResult( - chunks=chunks, - total_files=total_files, - num_unsupported_format_files=num_unsupported_format_files, - num_files_with_errors=num_files_with_errors, - skipped_chunks=skipped_chunks, - ) +"""Data utilities for index preparation.""" +import os +import ast +import markdown +import re +import tiktoken +import html +import json + +from tqdm import tqdm +from abc import ABC, abstractmethod +from bs4 import BeautifulSoup, Tag, NavigableString +from dataclasses import dataclass + +from typing import List, Dict, Optional, Generator, Tuple, Union +from langchain.text_splitter import MarkdownTextSplitter, RecursiveCharacterTextSplitter, PythonCodeTextSplitter + +FILE_FORMAT_DICT = { + "md": "markdown", + "txt": "text", + "html": "html", + "shtml": "html", + "htm": "html", + "py": "python", + "pdf": "pdf" + } + +SENTENCE_ENDINGS = [".", "!", "?"] +WORDS_BREAKS = list(reversed([",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"])) + +@dataclass +class Document(object): + """A data class for storing documents + + Attributes: + content (str): The content of the document. + id (Optional[str]): The id of the document. + title (Optional[str]): The title of the document. + filepath (Optional[str]): The filepath of the document. + url (Optional[str]): The url of the document. + metadata (Optional[Dict]): The metadata of the document. + """ + + content: str + id: Optional[str] = None + title: Optional[str] = None + filepath: Optional[str] = None + url: Optional[str] = None + metadata: Optional[Dict] = None + +def cleanup_content(content: str) -> str: + """Cleans up the given content using regexes + Args: + content (str): The content to clean up. + Returns: + str: The cleaned up content. + """ + output = re.sub(r"\n{2,}", "\n", content) + output = re.sub(r"[^\S\n]{2,}", " ", output) + output = re.sub(r"-{2,}", "--", output) + + return output.strip() + +class BaseParser(ABC): + """A parser parses content to produce a document.""" + + @abstractmethod + def parse(self, content: str, file_name: Optional[str] = None) -> Document: + """Parses the given content. + Args: + content (str): The content to parse. 
+ file_name (str): The file name associated with the content. + Returns: + Document: The parsed document. + """ + pass + + def parse_file(self, file_path: str) -> Document: + """Parses the given file. + Args: + file_path (str): The file to parse. + Returns: + Document: The parsed document. + """ + with open(file_path, "r") as f: + return self.parse(f.read(), os.path.basename(file_path)) + + def parse_directory(self, directory_path: str) -> List[Document]: + """Parses the given directory. + Args: + directory_path (str): The directory to parse. + Returns: + List[Document]: List of parsed documents. + """ + documents = [] + for file_name in os.listdir(directory_path): + file_path = os.path.join(directory_path, file_name) + if os.path.isfile(file_path): + documents.append(self.parse_file(file_path)) + return documents + +class MarkdownParser(BaseParser): + """Parses Markdown content.""" + + def __init__(self) -> None: + super().__init__() + self._html_parser = HTMLParser() + + def parse(self, content: str, file_name: Optional[str] = None) -> Document: + """Parses the given content. + Args: + content (str): The content to parse. + file_name (str): The file name associated with the content. + Returns: + Document: The parsed document. + """ + html_content = markdown.markdown(content, extensions=['fenced_code', 'toc', 'tables', 'sane_lists']) + + return self._html_parser.parse(html_content, file_name) + + +class HTMLParser(BaseParser): + """Parses HTML content.""" + TITLE_MAX_TOKENS = 128 + NEWLINE_TEMPL = "" + + def __init__(self) -> None: + super().__init__() + self.token_estimator = TokenEstimator() + + def parse(self, content: str, file_name: Optional[str] = None) -> Document: + """Parses the given content. + Args: + content (str): The content to parse. + file_name (str): The file name associated with the content. + Returns: + Document: The parsed document. + """ + soup = BeautifulSoup(content, 'html.parser') + + # Extract the title + title = '' + if soup.title and soup.title.string: + title = soup.title.string + else: + # Try to find the first

tag + h1_tag = soup.find('h1') + if h1_tag: + title = h1_tag.get_text(strip=True) + else: + h2_tag = soup.find('h2') + if h2_tag: + title = h2_tag.get_text(strip=True) + if title is None or title == '': + # if title is still not found, guess using the next string + try: + title = next(soup.stripped_strings) + title = self.token_estimator.construct_tokens_with_size(title, self.TITLE_MAX_TOKENS) + + except StopIteration: + title = file_name + + # Helper function to process text nodes + def process_text(text): + return text.strip() + + # Helper function to process anchor tags + def process_anchor_tag(tag): + href = tag.get('href', '') + text = tag.get_text(strip=True) + return f'{text} ({href})' + + # Collect all text nodes and anchor tags in a list + elements = [] + + for elem in soup.descendants: + if isinstance(elem, (Tag, NavigableString)): + page_element: Union[Tag, NavigableString] = elem + if page_element.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code']: + if elements and not elements[-1].endswith('\n'): + elements.append(self.NEWLINE_TEMPL) + if isinstance(page_element, str): + elements.append(process_text(page_element)) + elif page_element.name == 'a': + elements.append(process_anchor_tag(page_element)) + + + # Join the list into a single string and return but ensure that either of newlines or space are used. + result = '' + is_prev_newline = False + for elem in elements: + if elem: + if elem == self.NEWLINE_TEMPL: + result += "\n" + is_prev_newline = True + else: + if not is_prev_newline: + result += " " + else: + is_prev_newline = False + result += f"{elem}" + + if title is None: + title = '' # ensure no 'None' type title + return Document(content=cleanup_content(result), title=str(title)) + +class TextParser(BaseParser): + """Parses text content.""" + + def __init__(self) -> None: + super().__init__() + + def _get_first_alphanum_line(self, content: str) -> Optional[str]: + title = None + for line in content.splitlines(): + if any([c.isalnum() for c in line]): + title = line.strip() + break + return title + + def _get_first_line_with_property( + self, content: str, property: str = "title: " + ) -> Optional[str]: + title = None + for line in content.splitlines(): + if line.startswith(property): + title = line[len(property) :].strip() + break + return title + + def parse(self, content: str, file_name: Optional[str] = None) -> Document: + """Parses the given content. + Args: + content (str): The content to parse. + file_name (str): The file name associated with the content. + Returns: + Document: The parsed document. + """ + title = self._get_first_line_with_property( + content + ) or self._get_first_alphanum_line(content) + + return Document(content=cleanup_content(content), title=title or file_name) + + +class PythonParser(BaseParser): + def _get_topdocstring(self, text): + tree = ast.parse(text) + docstring = ast.get_docstring(tree) # returns top docstring + return docstring + + def parse(self, content: str, file_name: Optional[str] = None) -> Document: + """Parses the given content. + Args: + content (str): The content to parse. + file_name (str): The file name associated with the content. + Returns: + Document: The parsed document. 
+ """ + docstring = self._get_topdocstring(content) + if docstring: + title = f"{file_name}: {docstring}" + else: + title = file_name + return Document(content=content, title=title) + + def __init__(self) -> None: + super().__init__() + +class ParserFactory: + def __init__(self): + self._parsers = { + "html": HTMLParser(), + "text": TextParser(), + "markdown": MarkdownParser(), + "python": PythonParser() + } + + @property + def supported_formats(self) -> List[str]: + "Returns a list of supported formats" + return list(self._parsers.keys()) + + def __call__(self, file_format: str) -> BaseParser: + parser = self._parsers.get(file_format, None) + if parser is None: + raise UnsupportedFormatError(f"{file_format} is not supported") + + return parser + +class TokenEstimator(object): + GPT2_TOKENIZER = tiktoken.get_encoding("gpt2") + + def estimate_tokens(self, text: str) -> int: + return len(self.GPT2_TOKENIZER.encode(text)) + + def construct_tokens_with_size(self, tokens: str, numofTokens: int) -> str: + newTokens = self.GPT2_TOKENIZER.decode( + self.GPT2_TOKENIZER.encode(tokens)[:numofTokens] + ) + return newTokens + +parser_factory = ParserFactory() +TOKEN_ESTIMATOR = TokenEstimator() + +class UnsupportedFormatError(Exception): + """Exception raised when a format is not supported by a parser.""" + + pass + +@dataclass +class ChunkingResult: + """Data model for chunking result + + Attributes: + chunks (List[Document]): List of chunks. + total_files (int): Total number of files. + num_unsupported_format_files (int): Number of files with unsupported format. + num_files_with_errors (int): Number of files with errors. + skipped_chunks (int): Number of chunks skipped. + """ + chunks: List[Document] + total_files: int + num_unsupported_format_files: int = 0 + num_files_with_errors: int = 0 + # some chunks might be skipped to small number of tokens + skipped_chunks: int = 0 + +def get_files_recursively(directory_path: str) -> List[str]: + """Gets all files in the given directory recursively. + Args: + directory_path (str): The directory to get files from. + Returns: + List[str]: List of file paths. + """ + file_paths = [] + for dirpath, _, files in os.walk(directory_path): + for file_name in files: + file_path = os.path.join(dirpath, file_name) + file_paths.append(file_path) + return file_paths + +def convert_escaped_to_posix(escaped_path): + windows_path = escaped_path.replace("\\\\", "\\") + posix_path = windows_path.replace("\\", "/") + return posix_path + +def _get_file_format(file_name: str, extensions_to_process: List[str]) -> Optional[str]: + """Gets the file format from the file name. + Returns None if the file format is not supported. + Args: + file_name (str): The file name. + extensions_to_process (List[str]): List of extensions to process. + Returns: + str: The file format. 
+ """ + + # in case the caller gives us a file path + file_name = os.path.basename(file_name) + file_extension = file_name.split(".")[-1] + if file_extension not in extensions_to_process: + return None + return FILE_FORMAT_DICT.get(file_extension, None) + +def table_to_html(table): + table_html = "" + rows = [sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index) for i in range(table.row_count)] + for row_cells in rows: + table_html += "" + for cell in row_cells: + tag = "th" if (cell.kind == "columnHeader" or cell.kind == "rowHeader") else "td" + cell_spans = "" + if cell.column_span > 1: cell_spans += f" colSpan={cell.column_span}" + if cell.row_span > 1: cell_spans += f" rowSpan={cell.row_span}" + table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}" + table_html +="" + table_html += "
" + return table_html + +def extract_pdf_content(file_path, form_recognizer_client, use_layout=False): + offset = 0 + page_map = [] + model = "prebuilt-layout" if use_layout else "prebuilt-read" + with open(file_path, "rb") as f: + poller = form_recognizer_client.begin_analyze_document(model, document = f) + form_recognizer_results = poller.result() + title = next((p.content for p in form_recognizer_results.paragraphs if p.role == "title"), None) + + for page_num, page in enumerate(form_recognizer_results.pages): + tables_on_page = [table for table in form_recognizer_results.tables if table.bounding_regions[0].page_number == page_num + 1] + + # (if using layout) mark all positions of the table spans in the page + page_offset = page.spans[0].offset + page_length = page.spans[0].length + table_chars = [-1]*page_length + for table_id, table in enumerate(tables_on_page): + for span in table.spans: + # replace all table spans with "table_id" in table_chars array + for i in range(span.length): + idx = span.offset - page_offset + i + if idx >=0 and idx < page_length: + table_chars[idx] = table_id + + # build page text by replacing charcters in table spans with table html if using layout + page_text = "" + added_tables = set() + for idx, table_id in enumerate(table_chars): + if table_id == -1: + page_text += form_recognizer_results.content[page_offset + idx] + elif not table_id in added_tables: + page_text += table_to_html(tables_on_page[table_id]) + added_tables.add(table_id) + + page_text += " " + page_map.append((page_num, offset, page_text)) + offset += len(page_text) + + full_text = "".join([page_text for _, _, page_text in page_map]) + return full_text + +def chunk_content_helper( + content: str, file_format: str, file_name: Optional[str], + token_overlap: int, + num_tokens: int = 256 +) -> Generator[Tuple[str, int, Document], None, None]: + parser = parser_factory(file_format) + doc = parser.parse(content, file_name=file_name) + if num_tokens == None: + num_tokens = 1000000000 + + if file_format == "markdown": + splitter = MarkdownTextSplitter.from_tiktoken_encoder(chunk_size=num_tokens, chunk_overlap=token_overlap) + elif file_format == "python": + splitter = PythonCodeTextSplitter.from_tiktoken_encoder(chunk_size=num_tokens, chunk_overlap=token_overlap) + else: + splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( + separators=SENTENCE_ENDINGS + WORDS_BREAKS, + chunk_size=num_tokens, chunk_overlap=token_overlap) + chunked_content_list = splitter.split_text(doc.content) + for chunked_content in chunked_content_list: + chunk_size = TOKEN_ESTIMATOR.estimate_tokens(chunked_content) + yield chunked_content, chunk_size, doc + +def chunk_content( + content: str, + file_name: Optional[str] = None, + url: Optional[str] = None, + ignore_errors: bool = True, + num_tokens: int = 256, + min_chunk_size: int = 10, + token_overlap: int = 0, + extensions_to_process = FILE_FORMAT_DICT.keys(), + cracked_pdf = False +) -> ChunkingResult: + """Chunks the given content. If ignore_errors is true, returns None + in case of an error + Args: + content (str): The content to chunk. + file_name (str): The file name. used for title, file format detection. + url (str): The url. used for title. + ignore_errors (bool): If true, ignores errors and returns None. + num_tokens (int): The number of tokens in each chunk. + min_chunk_size (int): The minimum chunk size below which chunks will be filtered. + token_overlap (int): The number of tokens to overlap between chunks. 
+ Returns: + List[Document]: List of chunked documents. + """ + + try: + if file_name is None or cracked_pdf: + file_format = "text" + else: + file_format = _get_file_format(file_name, extensions_to_process) + if file_format is None: + raise Exception( + f"{file_name} is not supported") + + chunked_context = chunk_content_helper( + content=content, + file_name=file_name, + file_format=file_format, + num_tokens=num_tokens, + token_overlap=token_overlap + ) + chunks = [] + skipped_chunks = 0 + for chunk, chunk_size, doc in chunked_context: + if chunk_size >= min_chunk_size: + chunks.append( + Document( + content=chunk, + title=doc.title, + url=url, + ) + ) + else: + skipped_chunks += 1 + + except UnsupportedFormatError as e: + if ignore_errors: + return ChunkingResult( + chunks=[], total_files=1, num_unsupported_format_files=1 + ) + else: + raise e + except Exception as e: + if ignore_errors: + return ChunkingResult(chunks=[], total_files=1, num_files_with_errors=1) + else: + raise e + return ChunkingResult( + chunks=chunks, + total_files=1, + skipped_chunks=skipped_chunks, + ) + +def chunk_file( + file_path: str, + ignore_errors: bool = False, + num_tokens=256, + min_chunk_size=10, + url = None, + token_overlap: int = 0, + extensions_to_process = FILE_FORMAT_DICT.keys(), + form_recognizer_client = None, + use_layout = False +) -> ChunkingResult: + """Chunks the given file. + Args: + file_path (str): The file to chunk. + Returns: + List[Document]: List of chunked documents. + """ + file_name = os.path.basename(file_path) + file_format = _get_file_format(file_name, extensions_to_process) + if not file_format: + if ignore_errors: + return ChunkingResult( + chunks=[], total_files=1, num_unsupported_format_files=1 + ) + else: + raise UnsupportedFormatError(f"{file_name} is not supported") + + cracked_pdf = False + if file_format == "pdf": + if form_recognizer_client is None: + raise UnsupportedFormatError("form_recognizer_client is required for pdf files") + content = extract_pdf_content(file_path, form_recognizer_client, use_layout=use_layout) + cracked_pdf = True + else: + with open(file_path, "r", encoding="utf8") as f: + content = f.read() + return chunk_content( + content=content, + file_name=file_name, + ignore_errors=ignore_errors, + num_tokens=num_tokens, + min_chunk_size=min_chunk_size, + url=url, + token_overlap=max(0, token_overlap), + extensions_to_process=extensions_to_process, + cracked_pdf=cracked_pdf + ) + +def chunk_directory( + directory_path: str, + ignore_errors: bool = False, + num_tokens: int = 1024, + min_chunk_size: int = 10, + url_prefix = None, + token_overlap: int = 0, + extensions_to_process: List[str] = FILE_FORMAT_DICT.keys(), + form_recognizer_client = None, + use_layout = False +): + """ + Chunks the given directory recursively + Args: + directory_path (str): The directory to chunk. + ignore_errors (bool): If true, ignores errors and returns None. + num_tokens (int): The number of tokens to use for chunking. + min_chunk_size (int): The minimum chunk size. + url_prefix (str): The url prefix to use for the files. If None, the url will be None. If not None, the url will be url_prefix + relpath. + For example, if the directory path is /home/user/data and the url_prefix is https://example.com/data, + then the url for the file /home/user/data/file1.txt will be https://example.com/data/file1.txt + token_overlap (int): The number of tokens to overlap between chunks. + extensions_to_process (List[str]): The list of extensions to process. 
+ form_recognizer_client: Optional form recognizer client to use for pdf files. + use_layout (bool): If true, uses Layout model for pdf files. Otherwise, uses Read. + + Returns: + List[Document]: List of chunked documents. + """ + chunks = [] + total_files = 0 + num_unsupported_format_files = 0 + num_files_with_errors = 0 + skipped_chunks = 0 + for file_path in tqdm(get_files_recursively(directory_path)): + print(file_path) + if os.path.isfile(file_path): + # get relpath + url_path = None + rel_file_path = os.path.relpath(file_path, directory_path) + if url_prefix: + url_path = url_prefix + rel_file_path + url_path = convert_escaped_to_posix(url_path) + try: + result = chunk_file( + file_path, + ignore_errors=ignore_errors, + num_tokens=num_tokens, + min_chunk_size=min_chunk_size, + url=url_path, + token_overlap=token_overlap, + extensions_to_process=extensions_to_process, + form_recognizer_client=form_recognizer_client, + use_layout=use_layout + ) + for chunk_idx, chunk_doc in enumerate(result.chunks): + chunk_doc.filepath = rel_file_path + chunk_doc.metadata = json.dumps({"chunk_id": str(chunk_idx)}) + chunks.extend(result.chunks) + num_unsupported_format_files += result.num_unsupported_format_files + num_files_with_errors += result.num_files_with_errors + skipped_chunks += result.skipped_chunks + except Exception as e: + if not ignore_errors: + raise + print(f"File ({file_path}) failed with ", e) + num_files_with_errors += 1 + total_files += 1 + + return ChunkingResult( + chunks=chunks, + total_files=total_files, + num_unsupported_format_files=num_unsupported_format_files, + num_files_with_errors=num_files_with_errors, + skipped_chunks=skipped_chunks, + ) \ No newline at end of file diff --git a/scripts/prepdocs.ps1 b/scripts/prepdocs.ps1 new file mode 100644 index 0000000000..e691293180 --- /dev/null +++ b/scripts/prepdocs.ps1 @@ -0,0 +1,39 @@ +Write-Host "" +Write-Host "Loading azd .env file from current environment" +Write-Host "" + +$output = azd env get-values + +foreach ($line in $output) { + if (!$line.Contains('=')) { + continue + } + + $name, $value = $line.Split("=") + $value = $value -replace '^\"|\"$' + [Environment]::SetEnvironmentVariable($name, $value) +} + +Write-Host "Environment variables set." 
+ +$pythonCmd = Get-Command python -ErrorAction SilentlyContinue +if (-not $pythonCmd) { + # fallback to python3 if python not found + $pythonCmd = Get-Command python3 -ErrorAction SilentlyContinue +} + +Write-Host 'Creating python virtual environment "scripts/.venv"' +Start-Process -FilePath ($pythonCmd).Source -ArgumentList "-m venv ./scripts/.venv" -Wait -NoNewWindow + +$venvPythonPath = "./scripts/.venv/scripts/python.exe" +if (Test-Path -Path "/usr") { + # fallback to Linux venv path + $venvPythonPath = "./scripts/.venv/bin/python" +} + +Write-Host 'Installing dependencies from "requirements.txt" into virtual environment' +Start-Process -FilePath $venvPythonPath -ArgumentList "-m pip install -r ./scripts/requirements.txt" -Wait -NoNewWindow + +Write-Host 'Running "prepdocs.py"' +$cwd = (Get-Location) +Start-Process -FilePath $venvPythonPath -ArgumentList "./scripts/prepdocs.py $cwd/data/* --storageaccount $env:AZURE_STORAGE_ACCOUNT --container $env:AZURE_STORAGE_CONTAINER --searchservice $env:AZURE_SEARCH_SERVICE --index $env:AZURE_SEARCH_INDEX --formrecognizerservice $env:AZURE_FORMRECOGNIZER_SERVICE --tenantid $env:AZURE_TENANT_ID -v" -Wait -NoNewWindow diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py new file mode 100644 index 0000000000..fcb6e82f90 --- /dev/null +++ b/scripts/prepdocs.py @@ -0,0 +1,138 @@ +import argparse +import dataclasses + +from tqdm import tqdm +from azure.identity import AzureDeveloperCliCredential +from azure.core.credentials import AzureKeyCredential +from azure.storage.blob import BlobServiceClient +from azure.search.documents.indexes import SearchIndexClient +from azure.search.documents.indexes.models import ( + SearchableField, + SemanticField, + SemanticSettings, + SemanticConfiguration, + SearchIndex, + PrioritizedFields +) +from azure.search.documents import SearchClient +from azure.ai.formrecognizer import DocumentAnalysisClient + + +from data_utils import chunk_directory + + + +def create_search_index(index_name, index_client): + print(f"Ensuring search index {index_name} exists") + if index_name not in index_client.list_index_names(): + index = SearchIndex( + name=index_name, + fields=[ + SearchableField(name="id", type="Edm.String", key=True), + SearchableField(name="content", type="Edm.String", analyzer_name="en.lucene"), + SearchableField(name="title", type="Edm.String", analyzer_name="en.lucene"), + SearchableField(name="filepath", type="Edm.String"), + SearchableField(name="url", type="Edm.String"), + SearchableField(name="metadata", type="Edm.String") + ], + semantic_settings=SemanticSettings( + configurations=[SemanticConfiguration( + name='default', + prioritized_fields=PrioritizedFields( + title_field=SemanticField(field_name='title'), + prioritized_content_fields=[SemanticField(field_name='content')]))]) + ) + print(f"Creating {index_name} search index") + index_client.create_index(index) + else: + print(f"Search index {index_name} already exists") + +def upload_documents_to_index(docs, search_client, upload_batch_size = 50): + to_upload_dicts = [] + + id = 0 + for document in docs: + d = dataclasses.asdict(document) + # add id to documents + d.update({"@search.action": "upload", "id": str(id)}) + to_upload_dicts.append(d) + id += 1 + + + # Upload the documents in batches of upload_batch_size + for i in tqdm(range(0, len(to_upload_dicts), upload_batch_size), desc="Indexing Chunks..."): + batch = to_upload_dicts[i: i + upload_batch_size] + results = search_client.upload_documents(documents=batch) + num_failures = 0 + errors = set() 
+ for result in results: + if not result.succeeded: + print(f"Indexing Failed for {result.key} with ERROR: {result.error_message}") + num_failures += 1 + errors.add(result.error_message) + if num_failures > 0: + raise Exception(f"INDEXING FAILED for {num_failures} documents. Please recreate the index." + f"To Debug: PLEASE CHECK chunk_size and upload_batch_size. \n Error Messages: {list(errors)}") + + +def create_and_populate_index(index_name, index_client, search_client, form_recognizer_client): + + # create or update search index with compatible schema + create_search_index(index_name, index_client) + + # chunk directory + print("Chunking directory...") + result = chunk_directory("./data", form_recognizer_client=form_recognizer_client, use_layout=True, ignore_errors=False) + + if len(result.chunks) == 0: + raise Exception("No chunks found. Please check the data path and chunk size.") + + print(f"Processed {result.total_files} files") + print(f"Unsupported formats: {result.num_unsupported_format_files} files") + print(f"Files with errors: {result.num_files_with_errors} files") + print(f"Found {len(result.chunks)} chunks") + + # upload documents to index + print("Uploading documents to index...") + upload_documents_to_index(result.chunks, search_client) + + # check if index is ready/validate index + # print("Validating index...") + # TODO: validate_index(index_name) - Port to Azure CLI + # print("Index validation completed") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.", + epilog="Example: prepdocs.py '..\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v" + ) + parser.add_argument("files", help="Files to be processed") + parser.add_argument("--storageaccount", help="Azure Blob Storage account name") + parser.add_argument("--container", help="Azure Blob Storage container name") + parser.add_argument("--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)") + parser.add_argument("--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)") + parser.add_argument("--searchservice", help="Name of the Azure Cognitive Search service where content should be indexed (must exist already)") + parser.add_argument("--index", help="Name of the Azure Cognitive Search index where content should be indexed (will be created if it doesn't exist)") + parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)") + parser.add_argument("--formrecognizerservice", required=False, help="Optional. Name of the Azure Form Recognizer service which will be used to extract text, tables and layout from the documents (must exist already)") + parser.add_argument("--formrecognizerkey", required=False, help="Optional. 
Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)") + args = parser.parse_args() + + # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them + azd_credential = AzureDeveloperCliCredential() if args.tenantid == None else AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60) + default_creds = azd_credential if args.searchkey == None or args.storagekey == None else None + search_creds = default_creds if args.searchkey == None else AzureKeyCredential(args.searchkey) + formrecognizer_creds = default_creds if args.formrecognizerkey == None else AzureKeyCredential(args.formrecognizerkey) + + print("Data preparation script started") + print("Preparing data for index:", args.index) + search_endpoint = f"https://{args.searchservice}.search.windows.net/" + index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_creds) + search_client = SearchClient(endpoint=search_endpoint, credential=search_creds, index_name=args.index) + form_recognizer_client = DocumentAnalysisClient( + endpoint=f"https://{args.formrecognizerservice}.cognitiveservices.azure.com/", + credential=formrecognizer_creds) + + create_and_populate_index(args.index, index_client, search_client, form_recognizer_client) + print("Data preparation for index", args.index, "completed") \ No newline at end of file diff --git a/scripts/prepdocs.sh b/scripts/prepdocs.sh new file mode 100755 index 0000000000..cc6b09f63b --- /dev/null +++ b/scripts/prepdocs.sh @@ -0,0 +1,21 @@ + #!/bin/sh + +echo "" +echo "Loading azd .env file from current environment" +echo "" + +while IFS='=' read -r key value; do + value=$(echo "$value" | sed 's/^"//' | sed 's/"$//') + export "$key=$value" +done <", - "location": "", - "subscription_id": "", - "resource_group": "", - "search_service_name": "", - "index_name": "", - "chunk_size": 1024, // set to null to disable chunking before ingestion - "token_overlap": 128 // number of tokens to overlap between chunks - "semantic_config_name": "default" - } -] -``` - -## Create Indexes and Ingest Data -Disclaimer: Make sure there are no duplicate pages in your data. That could impact the quality of the responses you get in a negative way. - -- Run the data preparation script, passing in your config file. - - `python data_preparation.py --config config.json` - -## Optional: Crack PDFs to Text -If your data is in PDF format, you'll first need to convert from PDF to .txt format. You can use your own script for this, or use the provided conversion code here. - -### Setup for PDF Cracking -- Create a [Form Recognizer](https://learn.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/create-a-form-recognizer-resource?view=form-recog-3.0.0) resource in your subscription -- Make sure you have the Form Recognizer SDK: `pip install azure-ai-formrecognizer` -- Run the following command to get an access key for your Form Recognizer resource: - `az cognitiveservices account keys list --name "" --resource-group ""` - - Copy one of the keys returned by this command. - -### Create Indexes and Ingest Data from PDF with Form Recognizer -Pass in your Form Recognizer resource name and key when running the data preparation script: - -`python data_preparation.py --config config.json --form-rec-resource --form-rec-key ` - -This will use the Form Recognizer Read model by default. 
If your documents have a lot of tables and relevant layout information, you can use the Form Recognizer Layout model, which is more costly and slower to run but will preserve table information with better quality. To use the Layout model instead of the default Read model, pass in the argument `--form-rec-use-layout`. - -`python data_preparation.py --config config.json --form-rec-resource --form-rec-key --form-rec-use-layout` \ No newline at end of file diff --git a/scripts/requirements.txt b/scripts/requirements.txt index aa677b4bb6..f8fc32c618 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,9 +1,10 @@ -markdown -requests -tqdm -azure-identity -azure-search-documents -tiktoken -langchain -bs4 -azure-ai-formrecognizer \ No newline at end of file +azure-identity==1.13.0b4 +azure-search-documents==11.4.0b3 +azure-ai-formrecognizer==3.2.1 +azure-storage-blob==12.14.1 +markdown +requests +tqdm +tiktoken +langchain +bs4 \ No newline at end of file diff --git a/start.sh b/start.sh new file mode 100755 index 0000000000..94fd92bf35 --- /dev/null +++ b/start.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +echo "" +echo "Restoring backend python packages" +echo "" +python3 -m pip install -r requirements.txt +if [ $? -ne 0 ]; then + echo "Failed to restore backend python packages" + exit $? +fi + +echo "" +echo "Restoring frontend npm packages" +echo "" +cd frontend +npm install +if [ $? -ne 0 ]; then + echo "Failed to restore frontend npm packages" + exit $? +fi + +echo "" +echo "Building frontend" +echo "" +npm run build +if [ $? -ne 0 ]; then + echo "Failed to build frontend" + exit $? +fi + +echo "" +echo "Starting backend" +echo "" +cd .. +python3 -m flask run --port=50505 --reload --debug +if [ $? -ne 0 ]; then + echo "Failed to start backend" + exit $? +fi \ No newline at end of file
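
As a quick way to sanity-check the chunking behavior introduced in `scripts/data_utils.py` without provisioning a search service, the sketch below calls `chunk_directory` directly and prints the resulting `ChunkingResult`. This is only an illustration based on the function signatures in this change, not part of the deployment flow; the `./data` path and the token sizes are assumed example values, and PDF files are counted as errors unless a Form Recognizer client is passed.

```python
# Minimal local sketch (assumed usage): chunk a folder of md/txt/html/py files
# using chunk_directory from scripts/data_utils.py and inspect the result.
from data_utils import chunk_directory

result = chunk_directory(
    "./data",            # illustrative path; any folder of supported files works
    num_tokens=1024,     # target chunk size in tokens
    token_overlap=128,   # overlap between consecutive chunks
    ignore_errors=True,  # count per-file failures instead of raising
)

print(f"Files processed: {result.total_files}")
print(f"Unsupported format: {result.num_unsupported_format_files}, errors: {result.num_files_with_errors}")
print(f"Chunks produced: {len(result.chunks)} (skipped: {result.skipped_chunks})")
for chunk in result.chunks[:3]:
    # Each chunk is a Document with content, title, filepath, url, and metadata fields.
    print(chunk.filepath, chunk.title, len(chunk.content))
```

Running this with the same `num_tokens` and `token_overlap` values you plan to use in `prepdocs.py` gives a rough preview of how many documents will be uploaded to the index.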