microsoft
diff --git a/‎data/agent_teams/legal_contract_team.json‎
Lines changed: 75 additions & 0 deletions b/‎data/agent_teams/legal_contract_team.json‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎data/datasets/legal_contract/Compliance.docx‎
143 KB b/‎data/datasets/legal_contract/Compliance.docx‎
143 KB
diff --git a/‎data/datasets/legal_contract/NDA.docx‎
139 KB b/‎data/datasets/legal_contract/NDA.docx‎
139 KB
diff --git a/‎data/datasets/legal_contract/Risks.docx‎
143 KB b/‎data/datasets/legal_contract/Risks.docx‎
143 KB
diff --git a/‎index_datasets.py‎
Lines changed: 140 additions & 0 deletions b/‎index_datasets.py‎
Lines changed: 140 additions & 0 deletions
diff --git a/‎infra/scripts/index_datasets.py‎
Lines changed: 2 additions & 0 deletions b/‎infra/scripts/index_datasets.py‎
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,75 @@
+{
+  "id": "1",
+  "team_id": "team-legal-1",
+  "name": "Legal Contract Review Team",
+  "status": "visible",
+  "created": "",
+  "created_by": "",
+  "description": "A multi-agent legal review team that summarizes NDAs, identifies risks, checks compliance, and recommends improvements using advanced legal reasoning and retrieval-augmented analysis.",
+  "logo": "",
+  "plan": "",
+  "agents": [
+    {
+      "input_key": "",
+      "type": "summary",
+      "name": "SummaryAgent",
+      "deployment_name": "gpt-4.1-mini",
+      "icon": "",
+      "system_message": "You are the Summary Agent for legal contract analysis. Your task is to produce a clear, accurate, and structured executive summary of NDA and legal agreement documents. You must deliver summaries organized into labeled sections including: Overview, Parties, Effective Date, Purpose, Definition of Confidential Information, Receiving Party Obligations, Term & Termination, Governing Law, Restrictions & Limitations, Miscellaneous Clauses, Notable or Unusual Terms, and Key Items for Risk & Compliance Agents. Highlight missing elements such as liability caps, dispute resolution mechanisms, data handling obligations, or ambiguous language. Maintain a precise, neutral legal tone. Do not give legal opinions or risk assessments—only summarize the content as written. Use retrieval results from the search index to ensure completeness and reference contextual definitions or standard clause expectations when needed.",
+      "description": "Produces comprehensive, structured summaries of NDAs and legal contracts, capturing all key terms, clauses, obligations, jurisdictions, and notable provisions.",
+      "use_rag": true,
+      "use_mcp": false,
+      "use_bing": false,
+      "use_reasoning": false,
+      "index_name": "macae-legal-index",
+      "index_foundry_name": "",
+      "index_endpoint": "",
+      "coding_tools": false
+    },
+    {
+      "input_key": "",
+      "type": "risk",
+      "name": "RiskAgent",
+      "deployment_name": "gpt-4.1-mini",
+      "icon": "",
+      "system_message": "You are the Risk Agent for NDA and legal contract analysis. Use the NDA Risk Assessment Reference document and retrieved context to identify High, Medium, and Low risk issues. Evaluate clauses for missing liability caps, ambiguous terms, overly broad confidentiality definitions, jurisdiction misalignment, missing termination rights, unclear data handling obligations, missing dispute resolution, and any incomplete or poorly scoped definitions. For every risk you identify, provide: (1) Risk Category (High/Medium/Low), (2) Clause or Section impacted, (3) Description of the issue, (4) Why it matters or what exposure it creates, and (5) Suggested edit or corrective language. Apply the risk scoring framework: High = escalate immediately; Medium = requires revision; Low = minor issue. Be precise, legally aligned, and practical. Reference retrieved examples or standards when appropriate. Your output must be structured and actionable.",
+      "description": "Identifies and classifies legal risks in NDAs and contracts using the organization's risk framework, and provides suggested edits to reduce exposure.",
+      "use_rag": true,
+      "use_mcp": false,
+      "use_bing": false,
+      "use_reasoning": false,
+      "index_name": "macae-legal-index",
+      "index_foundry_name": "",
+      "index_endpoint": "",
+      "coding_tools": false
+    },
+    {
+      "input_key": "",
+      "type": "compliance",
+      "name": "ComplianceAgent",
+      "deployment_name": "gpt-4.1-mini",
+      "icon": "",
+      "system_message": "You are the Compliance Agent responsible for validating NDAs and legal agreements against mandatory legal and policy requirements. Use the NDA Compliance Reference Document and retrieval results to evaluate whether the contract includes all required clauses: Confidentiality, Term & Termination, Governing Law aligned to approved jurisdictions, Non-Assignment, and Entire Agreement. Identify compliance gaps including ambiguous language, missing liability protections, improper jurisdiction, excessive term length, insufficient data protection obligations, missing dispute resolution mechanisms, or export control risks. For each issue provide: (1) Compliance Area (e.g., Term Length, Jurisdiction, Confidentiality), (2) Status (Pass/Fail), (3) Issue Description, (4) Whether it is Mandatory or Recommended, (5) Corrective Recommendation or Suggested Language. Deliver a final Compliance Status summary. Maintain professional, objective, legally accurate tone.",
+      "description": "Performs compliance validation of NDAs and contracts against legal policy requirements, identifies gaps, and provides corrective recommendations and compliance status.",
+      "use_rag": true,
+      "use_mcp": false,
+      "use_bing": false,
+      "use_reasoning": false,
+      "index_name": "macae-legal-index",
+      "index_foundry_name": "",
+      "index_endpoint": "",
+      "coding_tools": false
+    }
+  ],
+  "protected": false,
+  "starting_tasks": [
+    {
+      "id": "task-1",
+      "name": "NDA Contract Review",
+      "prompt": "Please review the following NDA contract for summary, risks, and compliance issues.",
+      "created": "",
+      "creator": "",
+      "logo": ""
+    }
+  ]
+}
@@ -0,0 +1,140 @@
+import os
+import sys
+import uuid
+from pathlib import Path
+
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import SearchIndex, SimpleField, SearchableField, SearchFieldDataType
+from azure.core.credentials import AzureKeyCredential
+
+# Optional import for DOCX extraction
+try:
+    from docx import Document
+    from docx.opc.exceptions import PackageNotFoundError
+except Exception:
+    Document = None
+    PackageNotFoundError = None
+
+# --------------------------
+# CONFIGURATION
+repo_root = Path(__file__).resolve().parent
+LOCAL_DOCX_FOLDER = repo_root / "data" / "datasets" / "legal_contract"
+
+AZURE_SEARCH_SERVICE = ""
+AZURE_SEARCH_ADMIN_KEY = ""
+AZURE_SEARCH_INDEX_NAME = ""
+# --------------------------
+
+# Ensure folder exists and show resolved path
+LOCAL_DOCX_FOLDER = LOCAL_DOCX_FOLDER.resolve()
+os.makedirs(LOCAL_DOCX_FOLDER, exist_ok=True)
+print(f"Using local DOCX folder: {LOCAL_DOCX_FOLDER}")
+
+if Document is None:
+    print("python-docx is not installed. Install with: pip install python-docx")
+    sys.exit(1)
+
+def extract_text_from_docx(file_path):
+    file_path = str(file_path)
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(file_path)
+    try:
+        doc = Document(file_path)
+        paragraphs = [p.text for p in doc.paragraphs if p.text]
+        return "\n".join(paragraphs)
+    except PackageNotFoundError:
+        # Not a valid .docx archive (possibly a renamed file) — treat as skip
+        raise
+    except Exception:
+        raise
+
+# Collect documents
+documents = []
+skipped = []
+
+for entry in sorted(os.listdir(LOCAL_DOCX_FOLDER)):
+    if not entry.lower().endswith(".docx"):
+        continue
+    if entry.startswith("~$"):
+        # skip Word temp files
+        continue
+
+    path = LOCAL_DOCX_FOLDER / entry
+    if not path.is_file():
+        continue
+
+    try:
+        text = extract_text_from_docx(path)
+    except PackageNotFoundError:
+        print(f"Skipping (invalid .docx): {entry}")
+        skipped.append(entry)
+        continue
+    except Exception as e:
+        print(f"Failed to extract {entry}: {e}")
+        skipped.append(entry)
+        continue
+
+    if not text or not text.strip():
+        print(f"Skipping (no text): {entry}")
+        skipped.append(entry)
+        continue
+
+    documents.append({
+        "id": str(uuid.uuid4()),
+        "title": entry,
+        "content": text,
+        "metadata_storage_path": str(path)
+    })
+
+if not documents:
+    print("No valid DOCX files found or extracted. Exiting.")
+    if skipped:
+        print("Skipped files:")
+        for s in skipped:
+            print(" -", s)
+    sys.exit(1)
+
+print(f"Extracted text from {len(documents)} DOCX file(s).")
+
+# --------------------------
+# Create / update Azure Search index
+index_client = SearchIndexClient(
+    endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
+    credential=AzureKeyCredential(AZURE_SEARCH_ADMIN_KEY)
+)
+
+fields = [
+    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+    SearchableField(name="title", type=SearchFieldDataType.String),
+    SearchableField(name="content", type=SearchFieldDataType.String, analyzer_name="standard.lucene"),
+    SimpleField(name="metadata_storage_path", type=SearchFieldDataType.String, filterable=True, sortable=True)
+]
+
+index = SearchIndex(name=AZURE_SEARCH_INDEX_NAME, fields=fields)
+index_client.create_or_update_index(index)
+print("Azure Search index created/updated.")
+
+# --------------------------
+# Upload documents
+search_client = SearchClient(
+    endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
+    index_name=AZURE_SEARCH_INDEX_NAME,
+    credential=AzureKeyCredential(AZURE_SEARCH_ADMIN_KEY)
+)
+
+upload_results = search_client.upload_documents(documents)
+succeeded = sum(1 for r in upload_results if getattr(r, "succeeded", False) is True)
+print(f"Uploaded {succeeded}/{len(documents)} documents to Azure Search.")
+
+# Optional quick search
+query = "Compliance"
+try:
+    results = search_client.search(query, top=5)
+    print(f"\nSearch results for '{query}':")
+    for r in results:
+        title = r.get("title") or r.get("metadata_storage_path")
+        snippet = (r.get("content") or "")[:150].replace("\n", " ")
+        print(f" - {title}: {snippet}...")
+except Exception as e:
+    print("Search query failed:", e)
@@ -96,6 +96,8 @@ def extract_pdf_text(pdf_bytes):
     title = blob.name.replace(".csv", "")
     title = title.replace(".json", "")
     title = title.replace(".pdf", "")
+    title = title.replace(".docx", "")
+    title = title.replace(".pptx", "")
     data = container_client.download_blob(blob.name).readall()
 
     try: