Skip to content

Commit 370b32f

Browse files
committed
Clean up redundant RAI refusal detection code
Removed unused code that was made obsolete by proactive input validation:
- RAI_REFUSAL_PATTERNS list (~50 patterns)
- _check_for_rai_refusal() function
- _check_message_for_rai_refusal() function
- 'ABSOLUTE RULE - NO HANDOFF AFTER REFUSAL' sections from agent instructions
- RAI refusal checks in process_message() and send_user_response()
- RAI refusal termination condition from workflow builder

The proactive harmful content detection at the input layer (HARMFUL_INPUT_PATTERNS, _check_input_for_harmful_content, checks in parse_brief/process_message) now handles content safety, making agent refusal detection unnecessary.
1 parent e19f463 commit 370b32f

4 files changed

Lines changed: 82 additions & 204 deletions

File tree

content-gen/src/backend/app.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,33 @@ async def parse_brief():
216216
logger.warning(f"Failed to save brief message to CosmosDB: {e}")
217217

218218
orchestrator = get_orchestrator()
219-
parsed_brief, clarifying_questions = await orchestrator.parse_brief(brief_text)
219+
parsed_brief, clarifying_questions, rai_blocked = await orchestrator.parse_brief(brief_text)
220+
221+
# Check if request was blocked due to harmful content
222+
if rai_blocked:
223+
# Save the refusal as assistant response
224+
try:
225+
cosmos_service = await get_cosmos_service()
226+
await cosmos_service.add_message_to_conversation(
227+
conversation_id=conversation_id,
228+
user_id=user_id,
229+
message={
230+
"role": "assistant",
231+
"content": clarifying_questions, # This is the refusal message
232+
"agent": "ContentSafety",
233+
"timestamp": datetime.now(timezone.utc).isoformat()
234+
}
235+
)
236+
except Exception as e:
237+
logger.warning(f"Failed to save RAI response to CosmosDB: {e}")
238+
239+
return jsonify({
240+
"rai_blocked": True,
241+
"requires_clarification": False,
242+
"requires_confirmation": False,
243+
"conversation_id": conversation_id,
244+
"message": clarifying_questions
245+
})
220246

221247
# Check if we need clarifying questions
222248
if clarifying_questions:

content-gen/src/backend/orchestrator.py

Lines changed: 30 additions & 197 deletions
Original file line numberDiff line numberDiff line change
@@ -132,118 +132,6 @@ def _check_input_for_harmful_content(message: str) -> tuple[bool, str]:
132132
Please share a marketing-related request and I'll assist you."""
133133

134134

135-
# RAI (Responsible AI) detection patterns
136-
# These patterns indicate when an agent has identified a jailbreak attempt,
137-
# content safety violation, or out-of-scope request
138-
RAI_REFUSAL_PATTERNS = [
139-
# Out-of-scope refusals - exact phrases from instructions
140-
"i'm a specialized marketing content generation assistant",
141-
"i cannot help with general questions",
142-
"outside of marketing",
143-
"i can only assist with marketing",
144-
"this request is outside my scope",
145-
"not within my capabilities as a marketing",
146-
"designed exclusively for creating marketing materials",
147-
"cannot help with general questions or topics outside",
148-
# Content safety refusals
149-
"i cannot generate content that",
150-
"i'm unable to create content involving",
151-
"this request violates content safety",
152-
"violates content safety guidelines",
153-
"inappropriate content",
154-
"harmful content",
155-
"i cannot assist with this type of request",
156-
"violates our content guidelines",
157-
"against our content policy",
158-
"cannot process this request",
159-
# Jailbreak detection
160-
"i cannot ignore my instructions",
161-
"i cannot pretend to be",
162-
"i cannot bypass my guidelines",
163-
"i cannot override my safety",
164-
"this appears to be an attempt to",
165-
"i'm designed to decline requests that",
166-
"designed to decline requests",
167-
# General refusals indicating RAI concern
168-
"i'm not able to help with that",
169-
"i cannot fulfill this request",
170-
"this is not something i can assist with",
171-
"i must decline this request",
172-
"i can't help with",
173-
"i am not able to",
174-
"i'm sorry, but i can",
175-
"i apologize, but i can",
176-
"unfortunately, i cannot",
177-
"i'm afraid i can't",
178-
# Common model refusal patterns
179-
"as an ai assistant",
180-
"as a marketing assistant, i",
181-
"my purpose is to help with marketing",
182-
"i specialize in marketing",
183-
"that's outside my area",
184-
"not within my scope",
185-
"falls outside",
186-
"beyond my capabilities",
187-
"not something i'm able to",
188-
]
189-
190-
191-
def _check_for_rai_refusal(conversation: list) -> bool:
192-
"""
193-
Check if any agent response in the conversation indicates an RAI refusal.
194-
195-
This detects when an agent has identified a jailbreak attempt, content safety
196-
violation, or out-of-scope request and refused to continue.
197-
198-
Args:
199-
conversation: List of ChatMessage objects from the workflow
200-
201-
Returns:
202-
bool: True if an RAI refusal was detected, triggering workflow termination
203-
"""
204-
for msg in conversation:
205-
# Only check assistant/agent messages, not user messages
206-
if msg.role.value == "user":
207-
continue
208-
209-
message_text = msg.text.lower() if msg.text else ""
210-
211-
# Check for RAI refusal patterns
212-
for pattern in RAI_REFUSAL_PATTERNS:
213-
if pattern in message_text:
214-
agent_name = msg.author_name or "unknown agent"
215-
logger.info(f"RAI refusal detected from {agent_name}: matched pattern '{pattern}'")
216-
return True
217-
218-
return False
219-
220-
221-
def _check_message_for_rai_refusal(message_text: str) -> bool:
222-
"""
223-
Check if a single message indicates an RAI refusal.
224-
225-
This is used to detect refusals at the application layer and terminate
226-
the workflow immediately, without waiting for the next workflow cycle.
227-
228-
Args:
229-
message_text: The text content of the message to check
230-
231-
Returns:
232-
bool: True if an RAI refusal pattern was detected
233-
"""
234-
if not message_text:
235-
return False
236-
237-
message_lower = message_text.lower()
238-
239-
for pattern in RAI_REFUSAL_PATTERNS:
240-
if pattern in message_lower:
241-
logger.info(f"RAI refusal pattern detected in message: '{pattern}'")
242-
return True
243-
244-
return False
245-
246-
247135
# Agent system instructions
248136
TRIAGE_INSTRUCTIONS = f"""You are a Triage Agent (coordinator) for a retail marketing content generation system.
249137
@@ -275,16 +163,6 @@ def _check_message_for_rai_refusal(message_text: str) -> bool:
275163
276164
What marketing content can I help you create today?"
277165
278-
## ABSOLUTE RULE - NO HANDOFF AFTER REFUSAL
279-
After you provide ANY refusal message (out-of-scope, content safety, jailbreak):
280-
- DO NOT call transfer_to_planning_agent or any transfer function
281-
- DO NOT call any tool or function
282-
- DO NOT hand off to any other agent
283-
- STOP IMMEDIATELY after your refusal response
284-
- The conversation ENDS with your refusal
285-
286-
This is NON-NEGOTIABLE. If you refuse a request, you must NOT use any handoff/transfer functions.
287-
288166
### ONLY assist with these marketing-specific tasks:
289167
- Creating marketing copy (ads, social posts, emails, product descriptions)
290168
- Generating marketing images and visuals for campaigns
@@ -333,16 +211,6 @@ def _check_message_for_rai_refusal(message_text: str) -> bool:
333211
334212
I can only help create professional, appropriate marketing content. Please provide a legitimate marketing brief and I'll be happy to assist."
335213
336-
## ABSOLUTE RULE - NO HANDOFF AFTER REFUSAL
337-
After you provide ANY refusal response:
338-
- DO NOT call transfer_to_triage_agent or any transfer function
339-
- DO NOT call any tool or function
340-
- DO NOT hand off to any other agent
341-
- STOP IMMEDIATELY after your refusal response
342-
- The conversation ENDS with your refusal
343-
344-
This is NON-NEGOTIABLE. If you refuse a request, you must NOT use any handoff/transfer functions.
345-
346214
## BRIEF PARSING (for legitimate requests only)
347215
When given a creative brief, extract and structure a JSON object with these REQUIRED fields:
348216
- overview: Campaign summary (what is the campaign about?)
@@ -685,13 +553,8 @@ def initialize(self) -> None:
685553
# Compliance can hand back to content agents for corrections or to triage
686554
.add_handoff(compliance_agent, [text_content_agent, image_content_agent, triage_agent])
687555
.with_termination_condition(
688-
# Terminate the workflow under these conditions:
689-
# 1. After 10 user messages (prevent infinite loops)
690-
# 2. When an agent detects an RAI concern (jailbreak, content safety, out-of-scope)
691-
lambda conv: (
692-
sum(1 for msg in conv if msg.role.value == "user") >= 10
693-
or _check_for_rai_refusal(conv)
694-
)
556+
# Terminate the workflow after 10 user messages (prevent infinite loops)
557+
lambda conv: sum(1 for msg in conv if msg.role.value == "user") >= 10
695558
)
696559
.build()
697560
)
@@ -770,32 +633,7 @@ async def process_message(
770633
for msg in event.data.conversation
771634
])
772635

773-
# Check ALL messages in the conversation for RAI refusal
774-
# This catches cases where an early agent refused but still handed off
775-
rai_refusal_msg = None
776-
rai_refusal_agent = None
777-
for msg in event.data.conversation:
778-
if msg.role.value != "user" and msg.text:
779-
if _check_message_for_rai_refusal(msg.text):
780-
rai_refusal_msg = msg.text
781-
rai_refusal_agent = msg.author_name or "assistant"
782-
logger.info(f"RAI refusal detected from {rai_refusal_agent} in conversation history")
783-
break # Use the FIRST refusal found
784-
785-
if rai_refusal_msg:
786-
logger.info(f"Terminating workflow due to RAI refusal from {rai_refusal_agent}")
787-
yield {
788-
"type": "agent_response",
789-
"agent": rai_refusal_agent,
790-
"content": rai_refusal_msg,
791-
"conversation_history": conversation_text,
792-
"is_final": True, # Mark as final to stop workflow
793-
"rai_blocked": True, # Flag indicating RAI block
794-
"metadata": {"conversation_id": conversation_id}
795-
}
796-
return # Exit the generator to stop processing
797-
798-
# Get the last message content for normal flow
636+
# Get the last message content
799637
last_msg_content = event.data.conversation[-1].text if event.data.conversation else ""
800638
last_msg_agent = event.data.conversation[-1].author_name if event.data.conversation else "unknown"
801639

@@ -886,31 +724,7 @@ async def send_user_response(
886724

887725
elif isinstance(event, RequestInfoEvent):
888726
if isinstance(event.data, HandoffAgentUserRequest):
889-
# Check ALL messages in the conversation for RAI refusal
890-
# This catches cases where an early agent refused but still handed off
891-
rai_refusal_msg = None
892-
rai_refusal_agent = None
893-
for msg in event.data.conversation:
894-
if msg.role.value != "user" and msg.text:
895-
if _check_message_for_rai_refusal(msg.text):
896-
rai_refusal_msg = msg.text
897-
rai_refusal_agent = msg.author_name or "assistant"
898-
logger.info(f"RAI refusal detected from {rai_refusal_agent} in user response flow")
899-
break # Use the FIRST refusal found
900-
901-
if rai_refusal_msg:
902-
logger.info(f"Terminating workflow due to RAI refusal from {rai_refusal_agent}")
903-
yield {
904-
"type": "agent_response",
905-
"agent": rai_refusal_agent,
906-
"content": rai_refusal_msg,
907-
"is_final": True, # Mark as final to stop workflow
908-
"rai_blocked": True, # Flag indicating RAI block
909-
"metadata": {"conversation_id": conversation_id}
910-
}
911-
return # Exit the generator to stop processing
912-
913-
# Get the last message content for normal flow
727+
# Get the last message content
914728
last_msg_content = event.data.conversation[-1].text if event.data.conversation else ""
915729
last_msg_agent = event.data.conversation[-1].author_name if event.data.conversation else "unknown"
916730

@@ -953,7 +767,7 @@ async def send_user_response(
953767
async def parse_brief(
954768
self,
955769
brief_text: str
956-
) -> tuple[CreativeBrief, str | None]:
770+
) -> tuple[CreativeBrief, str | None, bool]:
957771
"""
958772
Parse a free-text creative brief into structured format.
959773
If critical information is missing, return clarifying questions.
@@ -962,13 +776,32 @@ async def parse_brief(
962776
brief_text: Free-text creative brief from user
963777
964778
Returns:
965-
tuple: (CreativeBrief, clarifying_questions_or_none)
966-
- If all critical fields are provided: (brief, None)
967-
- If critical fields are missing: (partial_brief, clarifying_questions_string)
779+
tuple: (CreativeBrief, clarifying_questions_or_none, is_blocked)
780+
- If all critical fields are provided: (brief, None, False)
781+
- If critical fields are missing: (partial_brief, clarifying_questions_string, False)
782+
- If harmful content detected: (empty_brief, refusal_message, True)
968783
"""
969784
if not self._initialized:
970785
self.initialize()
971786

787+
# PROACTIVE CONTENT SAFETY CHECK - Block harmful content at input layer
788+
is_harmful, matched_pattern = _check_input_for_harmful_content(brief_text)
789+
if is_harmful:
790+
logger.warning(f"Blocking harmful content in parse_brief. Pattern: {matched_pattern}")
791+
# Return empty brief with refusal message and blocked=True
792+
empty_brief = CreativeBrief(
793+
overview="",
794+
objectives="",
795+
target_audience="",
796+
key_message="",
797+
tone_and_style="",
798+
deliverable="",
799+
timelines="",
800+
visual_guidelines="",
801+
cta=""
802+
)
803+
return empty_brief, RAI_HARMFUL_CONTENT_RESPONSE, True
804+
972805
planning_agent = self._agents["planning"]
973806

974807
# First, analyze the brief and check for missing critical fields
@@ -1055,14 +888,14 @@ async def parse_brief(
1055888

1056889
# Check if we need clarifying questions
1057890
if analysis.get("status") == "incomplete" and analysis.get("clarifying_message"):
1058-
return (brief, analysis["clarifying_message"])
891+
return (brief, analysis["clarifying_message"], False)
1059892

1060-
return (brief, None)
893+
return (brief, None, False)
1061894

1062895
except Exception as e:
1063896
logger.error(f"Failed to parse brief analysis response: {e}")
1064897
# Fallback to basic extraction
1065-
return (self._extract_brief_from_text(brief_text), None)
898+
return (self._extract_brief_from_text(brief_text), None, False)
1066899

1067900
def _extract_brief_from_text(self, text: str) -> CreativeBrief:
1068901
"""Extract brief fields from labeled text like 'Overview: ...'"""

content-gen/src/frontend/src/App.tsx

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,9 @@ function App() {
213213

214214
setGenerationStatus('Updating creative brief...');
215215
const parsed = await parseBrief(refinementPrompt, conversationId, userId, signal);
216-
setPendingBrief(parsed.brief);
216+
if (parsed.brief) {
217+
setPendingBrief(parsed.brief);
218+
}
217219
setGenerationStatus('');
218220

219221
const assistantMessage: ChatMessage = {
@@ -287,10 +289,24 @@ function App() {
287289
setGenerationStatus('Analyzing creative brief...');
288290
const parsed = await parseBrief(content, conversationId, userId, signal);
289291

290-
// Check if clarification is needed
291-
if (parsed.requires_clarification && parsed.clarifying_questions) {
292+
// Check if request was blocked due to harmful content
293+
if (parsed.rai_blocked) {
294+
// Show the refusal message without any brief UI
295+
setGenerationStatus('');
296+
297+
const assistantMessage: ChatMessage = {
298+
id: uuidv4(),
299+
role: 'assistant',
300+
content: parsed.message,
301+
agent: 'ContentSafety',
302+
timestamp: new Date().toISOString(),
303+
};
304+
setMessages(prev => [...prev, assistantMessage]);
305+
} else if (parsed.requires_clarification && parsed.clarifying_questions) {
292306
// Set partial brief for display but show clarifying questions
293-
setPendingBrief(parsed.brief);
307+
if (parsed.brief) {
308+
setPendingBrief(parsed.brief);
309+
}
294310
setGenerationStatus('');
295311

296312
const assistantMessage: ChatMessage = {
@@ -303,7 +319,9 @@ function App() {
303319
setMessages(prev => [...prev, assistantMessage]);
304320
} else {
305321
// Brief is complete, show for confirmation
306-
setPendingBrief(parsed.brief);
322+
if (parsed.brief) {
323+
setPendingBrief(parsed.brief);
324+
}
307325
setGenerationStatus('');
308326

309327
const assistantMessage: ChatMessage = {

content-gen/src/frontend/src/types/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,11 @@ export interface BrandGuidelines {
9393
}
9494

9595
export interface ParsedBriefResponse {
96-
brief: CreativeBrief;
96+
brief?: CreativeBrief;
9797
requires_confirmation: boolean;
9898
requires_clarification?: boolean;
9999
clarifying_questions?: string;
100+
rai_blocked?: boolean;
100101
message: string;
101102
conversation_id?: string;
102103
}

0 commit comments

Comments
 (0)