Skip to content

Commit 428e635

Browse files
committed
fix: filter system prompt content from agent responses
- Add `_filter_system_prompt_from_response()` function to detect and filter leaked system prompt content.
- Apply filtering to all 4 response output points in `process_message()` and `send_user_response()`.
- Return a safe fallback message when system prompt patterns are detected.
- Prevents internal agent instructions from being exposed to users during follow-up queries.
1 parent feebc1e commit 428e635

1 file changed

Lines changed: 73 additions & 4 deletions

File tree

content-gen/src/backend/orchestrator.py

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,69 @@ def _check_input_for_harmful_content(message: str) -> tuple[bool, str]:
109109
return False, ""
110110

111111

112+
# Patterns that indicate system prompt content is being leaked in agent responses
113+
# These are key phrases from our agent instructions that should never appear in user-facing output
114+
SYSTEM_PROMPT_PATTERNS = [
115+
# Agent role descriptions
116+
r"You are an? \w+ Agent",
117+
r"You are a Triage Agent",
118+
r"You are a Planning Agent",
119+
r"You are a Research Agent",
120+
r"You are a Text Content Agent",
121+
r"You are an Image Content Agent",
122+
r"You are a Compliance Agent",
123+
# Handoff instructions
124+
r"hand off to \w+_agent",
125+
r"hand back to \w+_agent",
126+
r"may hand off to",
127+
r"After (?:generating|completing|validation|parsing)",
128+
# Internal workflow markers
129+
r"CRITICAL: SCOPE ENFORCEMENT",
130+
r"## CRITICAL:",
131+
r"### IMMEDIATELY REJECT",
132+
r"CONTENT SAFETY - CRITICAL",
133+
r"MANDATORY: ZERO TEXT IN IMAGE",
134+
# Instruction markers
135+
r"Return JSON with:",
136+
r"Your scope is (?:strictly |)limited to",
137+
r"When creating image prompts:",
138+
r"Check for:\s*\n\s*-",
139+
# RAI internal instructions
140+
r"NEVER generate images that contain:",
141+
r"Responsible AI - Image Generation Rules",
142+
# Agent framework references
143+
r"compliance_agent|triage_agent|planning_agent|research_agent|text_content_agent|image_content_agent",
144+
]
145+
146+
_SYSTEM_PROMPT_PATTERNS_COMPILED = [re.compile(pattern, re.IGNORECASE | re.DOTALL) for pattern in SYSTEM_PROMPT_PATTERNS]
147+
148+
149+
def _filter_system_prompt_from_response(response_text: str) -> str:
150+
"""
151+
Filter out any system prompt content that might have leaked into agent responses.
152+
153+
This is a safety measure to ensure internal agent instructions are never
154+
exposed to users, even if the LLM model accidentally includes them.
155+
156+
Args:
157+
response_text: The agent's response text
158+
159+
Returns:
160+
str: Cleaned response with any system prompt content removed
161+
"""
162+
if not response_text:
163+
return response_text
164+
165+
# Check if response contains system prompt patterns
166+
for pattern in _SYSTEM_PROMPT_PATTERNS_COMPILED:
167+
if pattern.search(response_text):
168+
logger.warning(f"System prompt content detected in agent response, filtering. Pattern: {pattern.pattern[:50]}")
169+
# Return a safe fallback message instead of the leaked content
170+
return "I understand your request. Could you please clarify what specific changes you'd like me to make to the marketing content? I'm here to help refine your campaign materials."
171+
172+
return response_text
173+
174+
112175
# Standard RAI refusal message for harmful content
113176
RAI_HARMFUL_CONTENT_RESPONSE = """I'm a specialized marketing content generation assistant designed exclusively for creating professional marketing materials.
114177
@@ -637,8 +700,9 @@ async def process_message(
637700
for msg in messages
638701
])
639702

640-
# Get the last message content
703+
# Get the last message content and filter any system prompt leakage
641704
last_msg_content = messages[-1].text if messages else (event.data.agent_response.text if hasattr(event.data, 'agent_response') and event.data.agent_response else "")
705+
last_msg_content = _filter_system_prompt_from_response(last_msg_content)
642706
last_msg_agent = messages[-1].author_name if messages and hasattr(messages[-1], 'author_name') else "unknown"
643707

644708
yield {
@@ -663,10 +727,12 @@ async def process_message(
663727
]
664728
if assistant_messages:
665729
last_msg = assistant_messages[-1]
730+
# Filter any system prompt leakage from the response
731+
filtered_content = _filter_system_prompt_from_response(last_msg.text)
666732
yield {
667733
"type": "agent_response",
668734
"agent": last_msg.author_name or "assistant",
669-
"content": last_msg.text,
735+
"content": filtered_content,
670736
"is_final": True,
671737
"metadata": {"conversation_id": conversation_id}
672738
}
@@ -733,8 +799,9 @@ async def send_user_response(
733799
if not isinstance(messages, list):
734800
messages = [messages] if messages else []
735801

736-
# Get the last message content
802+
# Get the last message content and filter any system prompt leakage
737803
last_msg_content = messages[-1].text if messages else (event.data.agent_response.text if hasattr(event.data, 'agent_response') and event.data.agent_response else "")
804+
last_msg_content = _filter_system_prompt_from_response(last_msg_content)
738805
last_msg_agent = messages[-1].author_name if messages and hasattr(messages[-1], 'author_name') else "unknown"
739806

740807
yield {
@@ -756,10 +823,12 @@ async def send_user_response(
756823
]
757824
if assistant_messages:
758825
last_msg = assistant_messages[-1]
826+
# Filter any system prompt leakage from the response
827+
filtered_content = _filter_system_prompt_from_response(last_msg.text)
759828
yield {
760829
"type": "agent_response",
761830
"agent": last_msg.author_name or "assistant",
762-
"content": last_msg.text,
831+
"content": filtered_content,
763832
"is_final": True,
764833
"metadata": {"conversation_id": conversation_id}
765834
}

0 commit comments

Comments
 (0)