Skip to content

Commit e19f463

Browse files
committed
Add proactive harmful content detection at input layer
- Add HARMFUL_INPUT_PATTERNS regex list to detect dangerous content in user input
- Add _check_input_for_harmful_content() function for proactive scanning
- Add RAI_HARMFUL_CONTENT_RESPONSE standard message for blocked requests
- Check user input BEFORE sending to agents in process_message()
- Check follow-up responses in send_user_response()
- Immediately return with rai_blocked=True when harmful content is detected
- Blocks requests containing: weapons, violence, illegal activities, hate, etc.

This is the first line of defense - catching harmful requests at the application layer rather than relying on the LLM agent to refuse (which it wasn't doing).
1 parent ba19f52 commit e19f463

1 file changed

Lines changed: 106 additions & 0 deletions

File tree

content-gen/src/backend/orchestrator.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,80 @@
5858
logger = logging.getLogger(__name__)
5959

6060

61+
# Harmful content patterns to detect in USER INPUT before processing
62+
# This provides proactive content safety by blocking harmful requests at the input layer
63+
HARMFUL_INPUT_PATTERNS = [
64+
# Violence and weapons
65+
r"\b(make|making|create|creating|build|building|how to make|how to build)\b.{0,30}\b(bomb|explosive|weapon|gun|firearm|knife attack|poison)\b",
66+
r"\b(bomb|explosive|weapon|gun|firearm)\b.{0,30}\b(make|making|create|creating|build|building)\b",
67+
r"\b(kill|murder|assassinate|harm|hurt|attack|shoot|stab)\b.{0,20}\b(people|person|someone|victims)\b",
68+
r"\b(terrorist|terrorism|mass shooting|school shooting|violence)\b",
69+
# Illegal activities
70+
r"\b(illegal drugs|drug trafficking|sell drugs|meth|cocaine|heroin|fentanyl)\b",
71+
r"\b(how to steal|stealing|robbery|burglary|break into)\b",
72+
r"\b(money laundering|fraud scheme|scam people|con people)\b",
73+
r"\b(hack|hacking|cyber attack|ddos|malware|ransomware)\b.{0,20}\b(create|make|build|deploy|spread)\b",
74+
# Hate and discrimination
75+
r"\b(racist|sexist|homophobic|transphobic|discriminat)\b.{0,20}\b(content|campaign|ad|message)\b",
76+
r"\b(hate speech|white supremac|nazi|ethnic cleansing)\b",
77+
# Self-harm
78+
r"\b(suicide|self.?harm|cut myself|kill myself)\b",
79+
# Sexual content
80+
r"\b(child porn|csam|minors|underage|pedophil)\b",
81+
r"\b(explicit|pornograph|sexual content)\b.{0,20}\b(create|make|generate)\b",
82+
# Misinformation
83+
r"\b(fake news|disinformation|misinformation)\b.{0,20}\b(campaign|spread|create)\b",
84+
# Specific harmful combinations
85+
r"\bbomb\b", # Direct mention of bomb in any context
86+
r"\bexplosive device\b",
87+
r"\bweapon of mass\b",
88+
]
89+
90+
# Compiled regex patterns for performance
91+
_HARMFUL_PATTERNS_COMPILED = [re.compile(pattern, re.IGNORECASE) for pattern in HARMFUL_INPUT_PATTERNS]
92+
93+
94+
def _check_input_for_harmful_content(message: str) -> tuple[bool, str]:
    """
    Proactively check user input for harmful content BEFORE sending to agents.

    This is the first line of defense - catching harmful requests at the input
    layer rather than relying on the agent to refuse.

    Args:
        message: The user's input message.

    Returns:
        tuple: (is_harmful, matched_pattern) where ``matched_pattern`` is the
        source text of the first matching entry in HARMFUL_INPUT_PATTERNS, or
        an empty string when nothing matched.
    """
    # Empty/None input cannot be harmful; bail out early.
    if not message:
        return False, ""

    # The patterns are compiled with re.IGNORECASE, so the message is scanned
    # as-is; lowercasing it first would be redundant work.
    for compiled, source in zip(_HARMFUL_PATTERNS_COMPILED, HARMFUL_INPUT_PATTERNS):
        if compiled.search(message):
            # Lazy %-formatting: the log string is only built if the record
            # is actually emitted.
            logger.warning("Harmful content detected in user input. Pattern: %s", source)
            return True, source

    return False, ""
119+
120+
121+
# Standard RAI (Responsible AI) refusal message for harmful content.
# Returned verbatim to the user whenever _check_input_for_harmful_content
# flags their request, so the wording here is user-facing text - keep it
# polite, explain the refusal, and redirect to supported use cases.
RAI_HARMFUL_CONTENT_RESPONSE = """I'm a specialized marketing content generation assistant designed exclusively for creating professional marketing materials.

I cannot help with this request as it involves content that violates our content safety guidelines. I'm designed to create positive, helpful marketing content only.

If you have a legitimate marketing request, I'd be happy to help you create:
- Product descriptions and campaigns
- Social media content
- Email marketing materials
- Brand messaging and taglines

Please share a marketing-related request and I'll assist you."""
133+
134+
61135
# RAI (Responsible AI) detection patterns
62136
# These patterns indicate when an agent has identified a jailbreak attempt,
63137
# content safety violation, or out-of-scope request
@@ -650,6 +724,23 @@ async def process_message(
650724

651725
logger.info(f"Processing message for conversation {conversation_id}")
652726

727+
# PROACTIVE CONTENT SAFETY CHECK - Block harmful content at input layer
728+
# This is the first line of defense, before any agent processes the request
729+
is_harmful, matched_pattern = _check_input_for_harmful_content(message)
730+
if is_harmful:
731+
logger.warning(f"Blocking harmful content for conversation {conversation_id}. Pattern: {matched_pattern}")
732+
yield {
733+
"type": "agent_response",
734+
"agent": "content_safety",
735+
"content": RAI_HARMFUL_CONTENT_RESPONSE,
736+
"conversation_history": f"user: {message}\ncontent_safety: {RAI_HARMFUL_CONTENT_RESPONSE}",
737+
"is_final": True,
738+
"rai_blocked": True,
739+
"blocked_reason": "harmful_content_detected",
740+
"metadata": {"conversation_id": conversation_id}
741+
}
742+
return # Exit immediately - do not process through agents
743+
653744
# Prepare the input with context
654745
full_input = message
655746
if context:
@@ -767,6 +858,21 @@ async def send_user_response(
767858
if not self._initialized:
768859
self.initialize()
769860

861+
# PROACTIVE CONTENT SAFETY CHECK - Block harmful content in follow-up messages too
862+
is_harmful, matched_pattern = _check_input_for_harmful_content(user_response)
863+
if is_harmful:
864+
logger.warning(f"Blocking harmful content in user response for conversation {conversation_id}. Pattern: {matched_pattern}")
865+
yield {
866+
"type": "agent_response",
867+
"agent": "content_safety",
868+
"content": RAI_HARMFUL_CONTENT_RESPONSE,
869+
"is_final": True,
870+
"rai_blocked": True,
871+
"blocked_reason": "harmful_content_detected",
872+
"metadata": {"conversation_id": conversation_id}
873+
}
874+
return # Exit immediately - do not continue workflow
875+
770876
try:
771877
responses = {request_id: user_response}
772878
async for event in self._workflow.send_responses_streaming(responses):

0 commit comments

Comments
 (0)