@@ -132,118 +132,6 @@ def _check_input_for_harmful_content(message: str) -> tuple[bool, str]:
132132Please share a marketing-related request and I'll assist you."""
133133
134134
# RAI (Responsible AI) detection patterns
# These patterns indicate when an agent has identified a jailbreak attempt,
# content safety violation, or out-of-scope request.
#
# Every entry is a lower-case phrase: the refusal checks in this module
# lower-case the agent message and test each phrase with a plain substring
# match, in list order, stopping at the first hit. Order therefore decides
# which phrase is reported in the log line (e.g. "i'm designed to decline
# requests that" is listed before the shorter "designed to decline requests",
# which would also match the same text).
RAI_REFUSAL_PATTERNS = [
    # Out-of-scope refusals - exact phrases from instructions
    "i'm a specialized marketing content generation assistant",
    "i cannot help with general questions",
    "outside of marketing",
    "i can only assist with marketing",
    "this request is outside my scope",
    "not within my capabilities as a marketing",
    "designed exclusively for creating marketing materials",
    "cannot help with general questions or topics outside",
    # Content safety refusals
    "i cannot generate content that",
    "i'm unable to create content involving",
    "this request violates content safety",
    "violates content safety guidelines",
    "inappropriate content",
    "harmful content",
    "i cannot assist with this type of request",
    "violates our content guidelines",
    "against our content policy",
    "cannot process this request",
    # Jailbreak detection
    "i cannot ignore my instructions",
    "i cannot pretend to be",
    "i cannot bypass my guidelines",
    "i cannot override my safety",
    "this appears to be an attempt to",
    "i'm designed to decline requests that",
    "designed to decline requests",
    # General refusals indicating RAI concern
    "i'm not able to help with that",
    "i cannot fulfill this request",
    "this is not something i can assist with",
    "i must decline this request",
    "i can't help with",
    "i am not able to",
    "i'm sorry, but i can",
    "i apologize, but i can",
    "unfortunately, i cannot",
    "i'm afraid i can't",
    # Common model refusal patterns
    "as an ai assistant",
    "as a marketing assistant, i",
    "my purpose is to help with marketing",
    "i specialize in marketing",
    "that's outside my area",
    "not within my scope",
    "falls outside",
    "beyond my capabilities",
    "not something i'm able to",
]
189-
190-
191- def _check_for_rai_refusal (conversation : list ) -> bool :
192- """
193- Check if any agent response in the conversation indicates an RAI refusal.
194-
195- This detects when an agent has identified a jailbreak attempt, content safety
196- violation, or out-of-scope request and refused to continue.
197-
198- Args:
199- conversation: List of ChatMessage objects from the workflow
200-
201- Returns:
202- bool: True if an RAI refusal was detected, triggering workflow termination
203- """
204- for msg in conversation :
205- # Only check assistant/agent messages, not user messages
206- if msg .role .value == "user" :
207- continue
208-
209- message_text = msg .text .lower () if msg .text else ""
210-
211- # Check for RAI refusal patterns
212- for pattern in RAI_REFUSAL_PATTERNS :
213- if pattern in message_text :
214- agent_name = msg .author_name or "unknown agent"
215- logger .info (f"RAI refusal detected from { agent_name } : matched pattern '{ pattern } '" )
216- return True
217-
218- return False
219-
220-
221- def _check_message_for_rai_refusal (message_text : str ) -> bool :
222- """
223- Check if a single message indicates an RAI refusal.
224-
225- This is used to detect refusals at the application layer and terminate
226- the workflow immediately, without waiting for the next workflow cycle.
227-
228- Args:
229- message_text: The text content of the message to check
230-
231- Returns:
232- bool: True if an RAI refusal pattern was detected
233- """
234- if not message_text :
235- return False
236-
237- message_lower = message_text .lower ()
238-
239- for pattern in RAI_REFUSAL_PATTERNS :
240- if pattern in message_lower :
241- logger .info (f"RAI refusal pattern detected in message: '{ pattern } '" )
242- return True
243-
244- return False
245-
246-
247135# Agent system instructions
248136TRIAGE_INSTRUCTIONS = f"""You are a Triage Agent (coordinator) for a retail marketing content generation system.
249137
@@ -275,16 +163,6 @@ def _check_message_for_rai_refusal(message_text: str) -> bool:
275163
276164What marketing content can I help you create today?"
277165
278- ## ABSOLUTE RULE - NO HANDOFF AFTER REFUSAL
279- After you provide ANY refusal message (out-of-scope, content safety, jailbreak):
280- - DO NOT call transfer_to_planning_agent or any transfer function
281- - DO NOT call any tool or function
282- - DO NOT hand off to any other agent
283- - STOP IMMEDIATELY after your refusal response
284- - The conversation ENDS with your refusal
285-
286- This is NON-NEGOTIABLE. If you refuse a request, you must NOT use any handoff/transfer functions.
287-
288166### ONLY assist with these marketing-specific tasks:
289167- Creating marketing copy (ads, social posts, emails, product descriptions)
290168- Generating marketing images and visuals for campaigns
@@ -333,16 +211,6 @@ def _check_message_for_rai_refusal(message_text: str) -> bool:
333211
334212I can only help create professional, appropriate marketing content. Please provide a legitimate marketing brief and I'll be happy to assist."
335213
336- ## ABSOLUTE RULE - NO HANDOFF AFTER REFUSAL
337- After you provide ANY refusal response:
338- - DO NOT call transfer_to_triage_agent or any transfer function
339- - DO NOT call any tool or function
340- - DO NOT hand off to any other agent
341- - STOP IMMEDIATELY after your refusal response
342- - The conversation ENDS with your refusal
343-
344- This is NON-NEGOTIABLE. If you refuse a request, you must NOT use any handoff/transfer functions.
345-
346214## BRIEF PARSING (for legitimate requests only)
347215When given a creative brief, extract and structure a JSON object with these REQUIRED fields:
348216- overview: Campaign summary (what is the campaign about?)
@@ -685,13 +553,8 @@ def initialize(self) -> None:
685553 # Compliance can hand back to content agents for corrections or to triage
686554 .add_handoff (compliance_agent , [text_content_agent , image_content_agent , triage_agent ])
687555 .with_termination_condition (
688- # Terminate the workflow under these conditions:
689- # 1. After 10 user messages (prevent infinite loops)
690- # 2. When an agent detects an RAI concern (jailbreak, content safety, out-of-scope)
691- lambda conv : (
692- sum (1 for msg in conv if msg .role .value == "user" ) >= 10
693- or _check_for_rai_refusal (conv )
694- )
556+ # Terminate the workflow after 10 user messages (prevent infinite loops)
557+ lambda conv : sum (1 for msg in conv if msg .role .value == "user" ) >= 10
695558 )
696559 .build ()
697560 )
@@ -770,32 +633,7 @@ async def process_message(
770633 for msg in event .data .conversation
771634 ])
772635
773- # Check ALL messages in the conversation for RAI refusal
774- # This catches cases where an early agent refused but still handed off
775- rai_refusal_msg = None
776- rai_refusal_agent = None
777- for msg in event .data .conversation :
778- if msg .role .value != "user" and msg .text :
779- if _check_message_for_rai_refusal (msg .text ):
780- rai_refusal_msg = msg .text
781- rai_refusal_agent = msg .author_name or "assistant"
782- logger .info (f"RAI refusal detected from { rai_refusal_agent } in conversation history" )
783- break # Use the FIRST refusal found
784-
785- if rai_refusal_msg :
786- logger .info (f"Terminating workflow due to RAI refusal from { rai_refusal_agent } " )
787- yield {
788- "type" : "agent_response" ,
789- "agent" : rai_refusal_agent ,
790- "content" : rai_refusal_msg ,
791- "conversation_history" : conversation_text ,
792- "is_final" : True , # Mark as final to stop workflow
793- "rai_blocked" : True , # Flag indicating RAI block
794- "metadata" : {"conversation_id" : conversation_id }
795- }
796- return # Exit the generator to stop processing
797-
798- # Get the last message content for normal flow
636+ # Get the last message content
799637 last_msg_content = event .data .conversation [- 1 ].text if event .data .conversation else ""
800638 last_msg_agent = event .data .conversation [- 1 ].author_name if event .data .conversation else "unknown"
801639
@@ -886,31 +724,7 @@ async def send_user_response(
886724
887725 elif isinstance (event , RequestInfoEvent ):
888726 if isinstance (event .data , HandoffAgentUserRequest ):
889- # Check ALL messages in the conversation for RAI refusal
890- # This catches cases where an early agent refused but still handed off
891- rai_refusal_msg = None
892- rai_refusal_agent = None
893- for msg in event .data .conversation :
894- if msg .role .value != "user" and msg .text :
895- if _check_message_for_rai_refusal (msg .text ):
896- rai_refusal_msg = msg .text
897- rai_refusal_agent = msg .author_name or "assistant"
898- logger .info (f"RAI refusal detected from { rai_refusal_agent } in user response flow" )
899- break # Use the FIRST refusal found
900-
901- if rai_refusal_msg :
902- logger .info (f"Terminating workflow due to RAI refusal from { rai_refusal_agent } " )
903- yield {
904- "type" : "agent_response" ,
905- "agent" : rai_refusal_agent ,
906- "content" : rai_refusal_msg ,
907- "is_final" : True , # Mark as final to stop workflow
908- "rai_blocked" : True , # Flag indicating RAI block
909- "metadata" : {"conversation_id" : conversation_id }
910- }
911- return # Exit the generator to stop processing
912-
913- # Get the last message content for normal flow
727+ # Get the last message content
914728 last_msg_content = event .data .conversation [- 1 ].text if event .data .conversation else ""
915729 last_msg_agent = event .data .conversation [- 1 ].author_name if event .data .conversation else "unknown"
916730
@@ -953,7 +767,7 @@ async def send_user_response(
953767 async def parse_brief (
954768 self ,
955769 brief_text : str
956- ) -> tuple [CreativeBrief , str | None ]:
770+ ) -> tuple [CreativeBrief , str | None , bool ]:
957771 """
958772 Parse a free-text creative brief into structured format.
959773 If critical information is missing, return clarifying questions.
@@ -962,13 +776,32 @@ async def parse_brief(
962776 brief_text: Free-text creative brief from user
963777
964778 Returns:
965- tuple: (CreativeBrief, clarifying_questions_or_none)
966- - If all critical fields are provided: (brief, None)
967- - If critical fields are missing: (partial_brief, clarifying_questions_string)
779+ tuple: (CreativeBrief, clarifying_questions_or_none, is_blocked)
780+ - If all critical fields are provided: (brief, None, False)
781+ - If critical fields are missing: (partial_brief, clarifying_questions_string, False)
782+ - If harmful content detected: (empty_brief, refusal_message, True)
968783 """
969784 if not self ._initialized :
970785 self .initialize ()
971786
787+ # PROACTIVE CONTENT SAFETY CHECK - Block harmful content at input layer
788+ is_harmful , matched_pattern = _check_input_for_harmful_content (brief_text )
789+ if is_harmful :
790+ logger .warning (f"Blocking harmful content in parse_brief. Pattern: { matched_pattern } " )
791+ # Return empty brief with refusal message and blocked=True
792+ empty_brief = CreativeBrief (
793+ overview = "" ,
794+ objectives = "" ,
795+ target_audience = "" ,
796+ key_message = "" ,
797+ tone_and_style = "" ,
798+ deliverable = "" ,
799+ timelines = "" ,
800+ visual_guidelines = "" ,
801+ cta = ""
802+ )
803+ return empty_brief , RAI_HARMFUL_CONTENT_RESPONSE , True
804+
972805 planning_agent = self ._agents ["planning" ]
973806
974807 # First, analyze the brief and check for missing critical fields
@@ -1055,14 +888,14 @@ async def parse_brief(
1055888
1056889 # Check if we need clarifying questions
1057890 if analysis .get ("status" ) == "incomplete" and analysis .get ("clarifying_message" ):
1058- return (brief , analysis ["clarifying_message" ])
891+ return (brief , analysis ["clarifying_message" ], False )
1059892
1060- return (brief , None )
893+ return (brief , None , False )
1061894
1062895 except Exception as e :
1063896 logger .error (f"Failed to parse brief analysis response: { e } " )
1064897 # Fallback to basic extraction
1065- return (self ._extract_brief_from_text (brief_text ), None )
898+ return (self ._extract_brief_from_text (brief_text ), None , False )
1066899
1067900 def _extract_brief_from_text (self , text : str ) -> CreativeBrief :
1068901 """Extract brief fields from labeled text like 'Overview: ...'"""
0 commit comments