diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 00000000..395f20d6 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,350 @@ +name: Deployment Lifecycle Automation + +on: + push: + branches: + - main + schedule: + - cron: "0 9,21 * * *" # Runs at 9:00 AM and 9:00 PM GMT + workflow_dispatch: + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + + - name: Setup Azure CLI + run: | + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + az --version # Verify installation + + - name: Login to Azure + run: | + az login --service-principal -u ${{ secrets.AZURE_MAINTENANCE_CLIENT_ID }} -p ${{ secrets.AZURE_MAINTENANCE_CLIENT_SECRET }} --tenant ${{ secrets.AZURE_TENANT_ID }} + + - name: Run Quota Check + id: quota-check + run: | + export AZURE_MAINTENANCE_CLIENT_ID=${{ secrets.AZURE_MAINTENANCE_CLIENT_ID }} + export AZURE_TENANT_ID=${{ secrets.AZURE_TENANT_ID }} + export AZURE_MAINTENANCE_CLIENT_SECRET=${{ secrets.AZURE_MAINTENANCE_CLIENT_SECRET }} + export AZURE_MAINTENANCE_SUBSCRIPTION_ID="${{ secrets.AZURE_MAINTENANCE_SUBSCRIPTION_ID }}" + export GPT_MIN_CAPACITY="100" + export AZURE_REGIONS="${{ vars.AZURE_REGIONS }}" + + chmod +x infra/scripts/checkquota.sh + if ! infra/scripts/checkquota.sh; then + # If quota check fails due to insufficient quota, set the flag + if grep -q "No region with sufficient quota found" infra/scripts/checkquota.sh; then + echo "QUOTA_FAILED=true" >> $GITHUB_ENV + fi + exit 1 # Fail the pipeline if any other failure occurs + fi + + - name: Send Notification on Quota Failure + if: env.QUOTA_FAILED == 'true' + run: | + RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + EMAIL_BODY=$(cat <Dear Team,

The quota check has failed, and the pipeline cannot proceed.

Build URL: ${RUN_URL}

Please take necessary action.

Best regards,
Your Automation Team

" + } + EOF + ) + + curl -X POST "${{ secrets.LOGIC_APP_URL }}" \ + -H "Content-Type: application/json" \ + -d "$EMAIL_BODY" || echo "Failed to send notification" + + - name: Fail Pipeline if Quota Check Fails + if: env.QUOTA_FAILED == 'true' + run: exit 1 + + - name: Install Bicep CLI + run: az bicep install + + - name: Set Deployment Region + run: | + echo "Selected Region: $VALID_REGION" + echo "AZURE_LOCATION=$VALID_REGION" >> $GITHUB_ENV + + - name: Generate Resource Group Name + id: generate_rg_name + run: | + echo "Generating a unique resource group name..." + TIMESTAMP=$(date +%Y%m%d%H%M) + # Define the common part and add a "cps-" prefix + COMMON_PART="pslautomation" + UNIQUE_RG_NAME="cps-${COMMON_PART}${TIMESTAMP}" + echo "RESOURCE_GROUP_NAME=${UNIQUE_RG_NAME}" >> $GITHUB_ENV + echo "Generated Resource_GROUP_PREFIX: ${UNIQUE_RG_NAME}" + + - name: Check and Create Resource Group + id: check_create_rg + run: | + set -e + echo "Checking if resource group exists..." + rg_exists=$(az group exists --name ${{ env.RESOURCE_GROUP_NAME }}) + if [ "$rg_exists" = "false" ]; then + echo "Resource group does not exist. Creating..." + az group create --name ${{ env.RESOURCE_GROUP_NAME }} \ + --location ${{ env.AZURE_LOCATION }} \ + --tags "CreatedBy=Deployment Lifecycle Automation Pipeline" \ + "Purpose=Deploying and Cleaning Up Resources for Validation" \ + "ApplicationName=Content Processing Accelerator" \ + || { echo "Error creating resource group"; exit 1; } + else + echo "Resource group already exists." + fi + + - name: Generate Unique Solution Prefix + id: generate_solution_prefix + run: | + set -e + COMMON_PART="pslr" + TIMESTAMP=$(date +%s) + UPDATED_TIMESTAMP=$(echo $TIMESTAMP | tail -c 3) + UNIQUE_SOLUTION_PREFIX="${COMMON_PART}${UPDATED_TIMESTAMP}" + echo "SOLUTION_PREFIX=${UNIQUE_SOLUTION_PREFIX}" >> $GITHUB_ENV + echo "Generated SOLUTION_PREFIX: ${UNIQUE_SOLUTION_PREFIX}" + + - name: Deploy Bicep Template + id: deploy + run: | + set -e + az deployment group create \ + --resource-group ${{ env.RESOURCE_GROUP_NAME }} \ + --template-file infra/main.json \ + --parameters \ + environmentName="${{ env.SOLUTION_PREFIX }}" \ + secondaryLocation="EastUs2" \ + contentUnderstandingLocation="WestUS" \ + deploymentType="GlobalStandard" \ + gptModelName="gpt-4o" \ + gptModelVersion="2024-08-06" \ + gptDeploymentCapacity="30" \ + minReplicaContainerApp="1" \ + maxReplicaContainerApp="1" \ + minReplicaContainerApi="1" \ + maxReplicaContainerApi="1" \ + minReplicaContainerWeb="1" \ + maxReplicaContainerWeb="1" \ + useLocalBuild="false" + + - name: Delete Bicep Deployment + if: success() + run: | + set -e + echo "Checking if resource group exists..." + rg_exists=$(az group exists --name ${{ env.RESOURCE_GROUP_NAME }}) + if [ "$rg_exists" = "true" ]; then + echo "Resource group exists. Cleaning..." + az group delete \ + --name ${{ env.RESOURCE_GROUP_NAME }} \ + --yes \ + --no-wait + echo "Resource group deleted... ${{ env.RESOURCE_GROUP_NAME }}" + else + echo "Resource group does not exist." + fi + + - name: Wait for Resource Deletion to Complete + run: | + echo "Fetching resources in the resource group: ${{ env.RESOURCE_GROUP_NAME }}" + + # Ensure correct subscription is set + az account set --subscription "${{ secrets.AZURE_MAINTENANCE_SUBSCRIPTION_ID }}" + + # Fetch all resource IDs dynamically (instead of names) + resources_to_check=($(az resource list --resource-group ${{ env.RESOURCE_GROUP_NAME }} --query "[].id" -o tsv)) + + # Exit early if no resources found + if [ ${#resources_to_check[@]} -eq 0 ]; then + echo "No resources found in the resource group. Skipping deletion check." + exit 0 + fi + + echo "Resources to check: ${resources_to_check[@]}" + + # Extract only resource names and store them in a space-separated string + resources_to_purge="" + for resource_id in "${resources_to_check[@]}"; do + resource_name=$(basename "$resource_id") # Extract the last part of the ID as the name + resources_to_purge+="$resource_name " + done + + # Save the list for later use + echo "RESOURCES_TO_PURGE=$resources_to_purge" >> "$GITHUB_ENV" + + echo "Waiting for resources to be fully deleted..." + + # Maximum retries & retry intervals + max_retries=10 + retry_intervals=(150 180 210 240 270 300) # increased intervals for each retry for potentially long deletion times + retries=0 + + while true; do + all_deleted=true + + for resource_id in "${resources_to_check[@]}"; do + echo "Checking if resource '$resource_id' is deleted..." + + # Check resource existence using full ID + resource_status=$(az resource show --ids "$resource_id" --query "id" -o tsv 2>/dev/null || echo "NotFound") + + if [[ "$resource_status" != "NotFound" ]]; then + echo "Resource '$resource_id' is still present." + all_deleted=false + else + echo "Resource '$resource_id' is fully deleted." + fi + done + + # Break loop if all resources are deleted + if [ "$all_deleted" = true ]; then + echo "All resources are fully deleted. Proceeding with purging..." + break + fi + + # Stop retrying if max retries are reached + if [ $retries -ge $max_retries ]; then + echo "Some resources were not deleted after $max_retries retries. Failing the pipeline." + exit 1 + fi + + echo "Some resources are still present. Retrying in ${retry_intervals[$retries]} seconds..." + sleep ${retry_intervals[$retries]} + retries=$((retries + 1)) + done + + - name: Purging the Resources + if: success() + run: | + set -e + + echo "Using saved list of deleted resources from previous step..." + + # Ensure the correct subscription is set + az account set --subscription "${{ secrets.AZURE_MAINTENANCE_SUBSCRIPTION_ID }}" + + # Iterate over each deleted resource + for resource_name in $RESOURCES_TO_PURGE; do + echo "Checking for deleted resource: $resource_name" + + # Query Azure for deleted resources based on type + case "$resource_name" in + *"kv-cps"*) + deleted_resource=$(az keyvault list-deleted --query "[?name=='$resource_name'].{name:name, type:type, id:id}" -o json) + ;; + *"stcps"*) + deleted_resource=$(az storage account list --query "[?name=='$resource_name']" -o json || echo "{}") + ;; + *"cosmos-cps"*) + deleted_resource=$(az cosmosdb show --name "$resource_name" --query "{name:name, type:type, id:id}" -o json 2>/dev/null || echo "{}") + ;; + *"aisa-cps"*) + deleted_resource=$(az cognitiveservices account list-deleted --query "[?name=='$resource_name'].{name:name, type:type, id:id}" -o json) + ;; + *"appcs-cps"*) + deleted_resource=$(az resource list --query "[?starts_with(name, 'appcs') && type=='Microsoft.Insights/components'].{name:name, type:type, id:id}" -o json) + ;; + *"appi-cps"*) + deleted_resource=$(az resource list --query "[?starts_with(name, 'appi') && type=='Microsoft.Insights/components'].{name:name, type:type, id:id}" -o json) + ;; + *"ca-cps"*) + deleted_resource=$(az resource list --query "[?starts_with(name, 'ca') && type=='Microsoft.Web/containerApps'].{name:name, type:type, id:id}" -o json) + ;; + *) + deleted_resource=$(az resource list --query "[?name=='$resource_name'].{name:name, type:type, id:id}" -o json) + ;; + esac + + if [[ -z "$deleted_resource" || "$deleted_resource" == "[]" || "$deleted_resource" == "{}" ]]; then + echo "Resource $resource_name not found in deleted list. Skipping..." + continue + fi + + # Extract name, type, and ID from the JSON response + name=$(echo "$deleted_resource" | jq -r '.[0].name') + type=$(echo "$deleted_resource" | jq -r '.[0].type') + id=$(echo "$deleted_resource" | jq -r '.[0].id') + + echo "Purging resource: $name (Type: $type)" + + case "$type" in + "Microsoft.KeyVault/deletedVaults") + echo "Purging Key Vault: $name" + purge_output=$(az keyvault purge --name "$name" 2>&1 || true) + + if echo "$purge_output" | grep -q "MethodNotAllowed"; then + echo "WARNING: Soft Delete Protection is enabled for $name. Purge is not allowed. Skipping..." + else + echo "Key Vault $name purged successfully." + fi + ;; + + "Microsoft.ContainerRegistry/registries") + echo "Deleting Azure Container Registry (ACR): $name" + az acr delete --name "$name" --yes || echo "Failed to delete Azure Container Registry: $name" + ;; + + "Microsoft.Storage/storageAccounts") + echo "Purging Storage Account: $name" + az storage account delete --name "$name" --yes || echo "Failed to delete Storage Account: $name" + ;; + + "Microsoft.DocumentDB/databaseAccounts") + echo "Purging Cosmos DB: $name" + az cosmosdb delete --name "$name" --yes || echo "Failed to delete Cosmos DB Account: $name" + ;; + + "Microsoft.CognitiveServices/deletedAccounts") + echo "Purging Cognitive Services Account: $name" + az cognitiveservices account purge --location "${{ env.AZURE_LOCATION }}" --resource-group "${{ env.RESOURCE_GROUP_NAME }}" --name "$name" || echo "Failed to purge Cognitive Services Account: $name" + ;; + + "Microsoft.AppConfiguration/configurationStores") + echo "Deleting App Configuration: $name" + az appconfig delete --name "$name" --yes || echo "Failed to delete App Configuration: $name" + ;; + + "Microsoft.Insights/components") + echo "Deleting Application Insights: $name" + az monitor app-insights component delete --ids "$id" || echo "Failed to delete Application Insights: $name" + ;; + + "Microsoft.Web/containerApps") + echo "Deleting Container App: $name" + az containerapp delete --name "$name" --yes || echo "Failed to delete Container App: $name" + ;; + + *) + echo "Purging General Resource: $name" + if [[ -n "$id" && "$id" != "null" ]]; then + az resource delete --ids "$id" --verbose || echo "Failed to delete $name" + else + echo "Resource ID not found for $name. Skipping purge." + fi + ;; + esac + done + + echo "Resource purging completed successfully" + + - name: Send Notification on Failure + if: failure() + run: | + RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + EMAIL_BODY=$(cat <Dear Team,

We would like to inform you that the Content Processing Automation process has encountered an issue and has failed to complete successfully.

Build URL: ${RUN_URL}
${OUTPUT}

Please investigate the matter at your earliest convenience.

Best regards,
Your Automation Team

" + } + EOF + ) + + curl -X POST "${{ secrets.LOGIC_APP_URL }}" \ + -H "Content-Type: application/json" \ + -d "$EMAIL_BODY" || echo "Failed to send notification" diff --git a/infra/scripts/checkquota.sh b/infra/scripts/checkquota.sh new file mode 100644 index 00000000..e4aab3df --- /dev/null +++ b/infra/scripts/checkquota.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# List of Azure regions to check for quota (update as needed) +IFS=', ' read -ra REGIONS <<< "$AZURE_REGIONS" + +SUBSCRIPTION_ID="${AZURE_MAINTENANCE_SUBSCRIPTION_ID}" +GPT_MIN_CAPACITY="${GPT_MIN_CAPACITY}" +AZURE_MAINTENANCE_CLIENT_ID="${AZURE_MAINTENANCE_CLIENT_ID}" +AZURE_TENANT_ID="${AZURE_TENANT_ID}" +AZURE_MAINTENANCE_CLIENT_SECRET="${AZURE_MAINTENANCE_CLIENT_SECRET}" + +# Authenticate using Managed Identity +echo "Authentication using Managed Identity..." +if ! az login --service-principal -u "$AZURE_MAINTENANCE_CLIENT_ID" -p "$AZURE_MAINTENANCE_CLIENT_SECRET" --tenant "$AZURE_TENANT_ID"; then + echo "❌ Error: Failed to login using Managed Identity." + exit 1 +fi + +echo "🔄 Validating required environment variables..." +if [[ -z "$SUBSCRIPTION_ID" || -z "$GPT_MIN_CAPACITY" || -z "$REGIONS" ]]; then + echo "❌ ERROR: Missing required environment variables." + exit 1 +fi + +echo "🔄 Setting Azure subscription..." +if ! az account set --subscription "$SUBSCRIPTION_ID"; then + echo "❌ ERROR: Invalid subscription ID or insufficient permissions." + exit 1 +fi +echo "✅ Azure subscription set successfully." + +# Define models and their minimum required capacities +declare -A MIN_CAPACITY=( + ["OpenAI.Standard.gpt-4o"]=$GPT_MIN_CAPACITY +) + +VALID_REGION="" +for REGION in "${REGIONS[@]}"; do + echo "----------------------------------------" + echo "🔍 Checking region: $REGION" + + QUOTA_INFO=$(az cognitiveservices usage list --location "$REGION" --output json) + if [ -z "$QUOTA_INFO" ]; then + echo "⚠️ WARNING: Failed to retrieve quota for region $REGION. Skipping." + continue + fi + + INSUFFICIENT_QUOTA=false + for MODEL in "${!MIN_CAPACITY[@]}"; do + MODEL_INFO=$(echo "$QUOTA_INFO" | awk -v model="\"value\": \"$MODEL\"" ' + BEGIN { RS="},"; FS="," } + $0 ~ model { print $0 } + ') + + if [ -z "$MODEL_INFO" ]; then + echo "⚠️ WARNING: No quota information found for model: $MODEL in $REGION. Skipping." + continue + fi + + CURRENT_VALUE=$(echo "$MODEL_INFO" | awk -F': ' '/"currentValue"/ {print $2}' | tr -d ',' | tr -d ' ') + LIMIT=$(echo "$MODEL_INFO" | awk -F': ' '/"limit"/ {print $2}' | tr -d ',' | tr -d ' ') + + CURRENT_VALUE=${CURRENT_VALUE:-0} + LIMIT=${LIMIT:-0} + + CURRENT_VALUE=$(echo "$CURRENT_VALUE" | cut -d'.' -f1) + LIMIT=$(echo "$LIMIT" | cut -d'.' -f1) + + AVAILABLE=$((LIMIT - CURRENT_VALUE)) + + echo "✅ Model: $MODEL | Used: $CURRENT_VALUE | Limit: $LIMIT | Available: $AVAILABLE" + + if [ "$AVAILABLE" -lt "${MIN_CAPACITY[$MODEL]}" ]; then + echo "❌ ERROR: $MODEL in $REGION has insufficient quota." + INSUFFICIENT_QUOTA=true + break + fi + done + + if [ "$INSUFFICIENT_QUOTA" = false ]; then + VALID_REGION="$REGION" + break + fi + +done + +if [ -z "$VALID_REGION" ]; then + echo "❌ No region with sufficient quota found. Blocking deployment." + echo "QUOTA_FAILED=true" >> "$GITHUB_ENV" + exit 0 +else + echo "✅ Suggested Region: $VALID_REGION" + echo "VALID_REGION=$VALID_REGION" >> "$GITHUB_ENV" + exit 0 +fi