Skip to content

Commit d7db91a

Browse files
legal contract script
1 parent ccdef20 commit d7db91a

6 files changed

Lines changed: 217 additions & 0 deletions

File tree

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
{
2+
"id": "1",
3+
"team_id": "team-legal-1",
4+
"name": "Legal Contract Review Team",
5+
"status": "visible",
6+
"created": "",
7+
"created_by": "",
8+
"description": "A multi-agent legal review team that summarizes NDAs, identifies risks, checks compliance, and recommends improvements using advanced legal reasoning and retrieval-augmented analysis.",
9+
"logo": "",
10+
"plan": "",
11+
"agents": [
12+
{
13+
"input_key": "",
14+
"type": "summary",
15+
"name": "SummaryAgent",
16+
"deployment_name": "gpt-4.1-mini",
17+
"icon": "",
18+
"system_message": "You are the Summary Agent for legal contract analysis. Your task is to produce a clear, accurate, and structured executive summary of NDA and legal agreement documents. You must deliver summaries organized into labeled sections including: Overview, Parties, Effective Date, Purpose, Definition of Confidential Information, Receiving Party Obligations, Term & Termination, Governing Law, Restrictions & Limitations, Miscellaneous Clauses, Notable or Unusual Terms, and Key Items for Risk & Compliance Agents. Highlight missing elements such as liability caps, dispute resolution mechanisms, data handling obligations, or ambiguous language. Maintain a precise, neutral legal tone. Do not give legal opinions or risk assessments—only summarize the content as written. Use retrieval results from the search index to ensure completeness and reference contextual definitions or standard clause expectations when needed.",
19+
"description": "Produces comprehensive, structured summaries of NDAs and legal contracts, capturing all key terms, clauses, obligations, jurisdictions, and notable provisions.",
20+
"use_rag": true,
21+
"use_mcp": false,
22+
"use_bing": false,
23+
"use_reasoning": false,
24+
"index_name": "macae-legal-index",
25+
"index_foundry_name": "",
26+
"index_endpoint": "",
27+
"coding_tools": false
28+
},
29+
{
30+
"input_key": "",
31+
"type": "risk",
32+
"name": "RiskAgent",
33+
"deployment_name": "gpt-4.1-mini",
34+
"icon": "",
35+
"system_message": "You are the Risk Agent for NDA and legal contract analysis. Use the NDA Risk Assessment Reference document and retrieved context to identify High, Medium, and Low risk issues. Evaluate clauses for missing liability caps, ambiguous terms, overly broad confidentiality definitions, jurisdiction misalignment, missing termination rights, unclear data handling obligations, missing dispute resolution, and any incomplete or poorly scoped definitions. For every risk you identify, provide: (1) Risk Category (High/Medium/Low), (2) Clause or Section impacted, (3) Description of the issue, (4) Why it matters or what exposure it creates, and (5) Suggested edit or corrective language. Apply the risk scoring framework: High = escalate immediately; Medium = requires revision; Low = minor issue. Be precise, legally aligned, and practical. Reference retrieved examples or standards when appropriate. Your output must be structured and actionable.",
36+
"description": "Identifies and classifies legal risks in NDAs and contracts using the organization's risk framework, and provides suggested edits to reduce exposure.",
37+
"use_rag": true,
38+
"use_mcp": false,
39+
"use_bing": false,
40+
"use_reasoning": false,
41+
"index_name": "macae-legal-index",
42+
"index_foundry_name": "",
43+
"index_endpoint": "",
44+
"coding_tools": false
45+
},
46+
{
47+
"input_key": "",
48+
"type": "compliance",
49+
"name": "ComplianceAgent",
50+
"deployment_name": "gpt-4.1-mini",
51+
"icon": "",
52+
"system_message": "You are the Compliance Agent responsible for validating NDAs and legal agreements against mandatory legal and policy requirements. Use the NDA Compliance Reference Document and retrieval results to evaluate whether the contract includes all required clauses: Confidentiality, Term & Termination, Governing Law aligned to approved jurisdictions, Non-Assignment, and Entire Agreement. Identify compliance gaps including ambiguous language, missing liability protections, improper jurisdiction, excessive term length, insufficient data protection obligations, missing dispute resolution mechanisms, or export control risks. For each issue provide: (1) Compliance Area (e.g., Term Length, Jurisdiction, Confidentiality), (2) Status (Pass/Fail), (3) Issue Description, (4) Whether it is Mandatory or Recommended, (5) Corrective Recommendation or Suggested Language. Deliver a final Compliance Status summary. Maintain professional, objective, legally accurate tone.",
53+
"description": "Performs compliance validation of NDAs and contracts against legal policy requirements, identifies gaps, and provides corrective recommendations and compliance status.",
54+
"use_rag": true,
55+
"use_mcp": false,
56+
"use_bing": false,
57+
"use_reasoning": false,
58+
"index_name": "macae-legal-index",
59+
"index_foundry_name": "",
60+
"index_endpoint": "",
61+
"coding_tools": false
62+
}
63+
],
64+
"protected": false,
65+
"starting_tasks": [
66+
{
67+
"id": "task-1",
68+
"name": "NDA Contract Review",
69+
"prompt": "Please review the following NDA contract for summary, risks, and compliance issues.",
70+
"created": "",
71+
"creator": "",
72+
"logo": ""
73+
}
74+
]
75+
}
143 KB
Binary file not shown.
139 KB
Binary file not shown.
143 KB
Binary file not shown.

index_datasets.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
import os
2+
import sys
3+
import uuid
4+
from pathlib import Path
5+
6+
from azure.search.documents import SearchClient
7+
from azure.search.documents.indexes import SearchIndexClient
8+
from azure.search.documents.indexes.models import SearchIndex, SimpleField, SearchableField, SearchFieldDataType
9+
from azure.core.credentials import AzureKeyCredential
10+
11+
# Optional import for DOCX extraction
# python-docx is treated as optional here: when it cannot be imported, both
# names are bound to None so the script can detect that and print an install
# hint instead of dying with a traceback.
try:
    from docx import Document
    from docx.opc.exceptions import PackageNotFoundError
except ImportError:
    # Only a failed import means "not installed"; any other error raised while
    # importing should surface rather than be reported as a missing package.
    Document = None
    PackageNotFoundError = None
18+
19+
# --------------------------
# CONFIGURATION
# Folder holding the sample legal contracts, located relative to this script.
repo_root = Path(__file__).resolve().parent
LOCAL_DOCX_FOLDER = repo_root / "data" / "datasets" / "legal_contract"

# Azure AI Search settings. Each value is read from the environment variable
# of the same name when it is set; otherwise fill in the fallback string here.
# Reading the admin key from the environment keeps it out of source control.
AZURE_SEARCH_SERVICE = os.environ.get("AZURE_SEARCH_SERVICE", "")
AZURE_SEARCH_ADMIN_KEY = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")
AZURE_SEARCH_INDEX_NAME = os.environ.get("AZURE_SEARCH_INDEX_NAME", "")
# --------------------------

# Ensure folder exists and show resolved path
LOCAL_DOCX_FOLDER = LOCAL_DOCX_FOLDER.resolve()
os.makedirs(LOCAL_DOCX_FOLDER, exist_ok=True)
print(f"Using local DOCX folder: {LOCAL_DOCX_FOLDER}")

if Document is None:
    print("python-docx is not installed. Install with: pip install python-docx")
    sys.exit(1)

# Fail fast on missing settings: an empty service name would produce the
# endpoint "https://.search.windows.net", and the script would only break
# after all documents had already been extracted.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("AZURE_SEARCH_SERVICE", AZURE_SEARCH_SERVICE),
        ("AZURE_SEARCH_ADMIN_KEY", AZURE_SEARCH_ADMIN_KEY),
        ("AZURE_SEARCH_INDEX_NAME", AZURE_SEARCH_INDEX_NAME),
    )
    if not setting_value.strip()
]
if missing_settings:
    print("Missing Azure Search configuration: " + ", ".join(missing_settings))
    print("Set the environment variable(s) or edit the CONFIGURATION section of this script.")
    sys.exit(1)
37+
38+
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one non-empty paragraph per line.

    Args:
        file_path: Location of the document; converted with ``str()`` so both
            plain strings and ``pathlib.Path`` objects are accepted.

    Returns:
        The non-empty paragraph texts of the document joined with ``"\\n"``.
        An empty string is returned when the document has no non-empty
        paragraphs.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        Exception: Anything raised by ``Document`` while opening the file
            propagates unchanged, so the caller decides whether to skip the
            file or abort.
    """
    file_path = str(file_path)
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    # No try/except here on purpose: the previous handlers only re-raised, so
    # letting errors propagate directly is equivalent and easier to read.
    doc = Document(file_path)
    # NOTE(review): only doc.paragraphs is read; confirm whether text inside
    # tables also needs to be indexed for these contracts.
    return "\n".join(p.text for p in doc.paragraphs if p.text)
51+
52+
# Collect documents
# Walk the dataset folder in sorted order and turn every readable .docx file
# into one search document; files that cannot be used are recorded in
# `skipped` so they can be listed if nothing at all could be indexed.
documents = []
skipped = []

for entry in sorted(os.listdir(LOCAL_DOCX_FOLDER)):
    if not entry.lower().endswith(".docx"):
        continue
    if entry.startswith("~$"):
        # skip Word temp files
        continue

    path = LOCAL_DOCX_FOLDER / entry
    if not path.is_file():
        continue

    try:
        text = extract_text_from_docx(path)
    except PackageNotFoundError:
        print(f"Skipping (invalid .docx): {entry}")
        skipped.append(entry)
        continue
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"Failed to extract {entry}: {e}")
        skipped.append(entry)
        continue

    if not text or not text.strip():
        print(f"Skipping (no text): {entry}")
        skipped.append(entry)
        continue

    documents.append({
        # Deterministic key derived from the file name: re-running the script
        # then targets the same index document for a given file instead of
        # adding a new copy under a fresh random id on every run.
        "id": str(uuid.uuid5(uuid.NAMESPACE_URL, entry)),
        "title": entry,
        "content": text,
        "metadata_storage_path": str(path),
    })

if not documents:
    print("No valid DOCX files found or extracted. Exiting.")
    if skipped:
        print("Skipped files:")
        for s in skipped:
            print(" -", s)
    sys.exit(1)

print(f"Extracted text from {len(documents)} DOCX file(s).")
99+
100+
# --------------------------
# Create / update Azure Search index
# Schema: "id" is the document key, "title" and "content" are searchable text
# fields, and "metadata_storage_path" is a plain field that can be filtered
# and sorted on.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        analyzer_name="standard.lucene",
    ),
    SimpleField(
        name="metadata_storage_path",
        type=SearchFieldDataType.String,
        filterable=True,
        sortable=True,
    ),
]
index = SearchIndex(name=AZURE_SEARCH_INDEX_NAME, fields=fields)

# The admin key is used here because this client manages index definitions.
index_client = SearchIndexClient(
    endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
    credential=AzureKeyCredential(AZURE_SEARCH_ADMIN_KEY),
)
index_client.create_or_update_index(index)
print("Azure Search index created/updated.")
117+
118+
# --------------------------
# Upload documents
# Push every extracted document to the index and report how many the service
# accepted, listing each rejected document so a partial failure is visible.
search_client = SearchClient(
    endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
    index_name=AZURE_SEARCH_INDEX_NAME,
    credential=AzureKeyCredential(AZURE_SEARCH_ADMIN_KEY)
)

# Materialise the results so they can be inspected more than once.
upload_results = list(search_client.upload_documents(documents))
failed_uploads = [r for r in upload_results if getattr(r, "succeeded", False) is not True]
succeeded = len(upload_results) - len(failed_uploads)
print(f"Uploaded {succeeded}/{len(documents)} documents to Azure Search.")

# Name the failures instead of leaving only a lower count; getattr defaults
# keep the report working even if a result lacks one of these attributes.
for r in failed_uploads:
    failed_key = getattr(r, "key", "<unknown key>")
    failure_reason = getattr(r, "error_message", None) or "no error message returned"
    print(f" - Upload failed for document {failed_key}: {failure_reason}")
129+
130+
# Optional quick search
# Smoke test only: run one fixed query and print up to five hits. Any failure
# is reported and otherwise ignored, since indexing has already finished.
query = "Compliance"
try:
    results = search_client.search(query, top=5)
    print(f"\nSearch results for '{query}':")
    for hit in results:
        # Fall back to the stored path when a hit has no (or an empty) title.
        title = hit.get("title") or hit.get("metadata_storage_path")
        # Show at most the first 150 characters of the content on one line.
        content_preview = (hit.get("content") or "")[:150]
        snippet = content_preview.replace("\n", " ")
        print(f" - {title}: {snippet}...")
except Exception as err:
    print("Search query failed:", err)

infra/scripts/index_datasets.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ def extract_pdf_text(pdf_bytes):
9696
title = blob.name.replace(".csv", "")
9797
title = title.replace(".json", "")
9898
title = title.replace(".pdf", "")
99+
title = title.replace(".docx", "")
100+
title = title.replace(".pptx", "")
99101
data = container_client.download_blob(blob.name).readall()
100102

101103
try:

0 commit comments

Comments
 (0)