|
58 | 58 | logger = logging.getLogger(__name__) |
59 | 59 |
|
60 | 60 |
|
# Harmful content patterns to detect in USER INPUT before processing.
# This provides proactive content safety by blocking harmful requests at the
# input layer, before any agent sees the message.
# NOTE(review): these are plain keyword/phrase regexes; \b guards exclude
# substrings (e.g. "photobomb") but not benign phrases containing the word
# itself (e.g. "bath bomb", "bomb squad reporting") — confirm the
# false-positive rate is acceptable for a marketing assistant.
HARMFUL_INPUT_PATTERNS = [
    # Violence and weapons
    r"\b(make|making|create|creating|build|building|how to make|how to build)\b.{0,30}\b(bomb|explosive|weapon|gun|firearm|knife attack|poison)\b",
    r"\b(bomb|explosive|weapon|gun|firearm)\b.{0,30}\b(make|making|create|creating|build|building)\b",
    r"\b(kill|murder|assassinate|harm|hurt|attack|shoot|stab)\b.{0,20}\b(people|person|someone|victims)\b",
    r"\b(terrorist|terrorism|mass shooting|school shooting|violence)\b",
    # Illegal activities
    r"\b(illegal drugs|drug trafficking|sell drugs|meth|cocaine|heroin|fentanyl)\b",
    r"\b(how to steal|stealing|robbery|burglary|break into)\b",
    r"\b(money laundering|fraud scheme|scam people|con people)\b",
    r"\b(hack|hacking|cyber attack|ddos|malware|ransomware)\b.{0,20}\b(create|make|build|deploy|spread)\b",
    # Hate and discrimination
    r"\b(racist|sexist|homophobic|transphobic|discriminat)\b.{0,20}\b(content|campaign|ad|message)\b",
    r"\b(hate speech|white supremac|nazi|ethnic cleansing)\b",
    # Self-harm
    r"\b(suicide|self.?harm|cut myself|kill myself)\b",
    # Sexual content
    r"\b(child porn|csam|minors|underage|pedophil)\b",
    r"\b(explicit|pornograph|sexual content)\b.{0,20}\b(create|make|generate)\b",
    # Misinformation
    r"\b(fake news|disinformation|misinformation)\b.{0,20}\b(campaign|spread|create)\b",
    # Specific harmful combinations
    r"\bbomb\b",  # Direct mention of bomb in any context
    r"\bexplosive device\b",
    r"\bweapon of mass\b",
]

# Compiled once at import time (with IGNORECASE) so each request only pays
# for the search, not for recompilation.
_HARMFUL_PATTERNS_COMPILED = [re.compile(pattern, re.IGNORECASE) for pattern in HARMFUL_INPUT_PATTERNS]
| 92 | + |
| 93 | + |
def _check_input_for_harmful_content(message: str) -> tuple[bool, str]:
    """
    Proactively check user input for harmful content BEFORE sending to agents.

    This is the first line of defense - catching harmful requests at the input
    layer rather than relying on the agent to refuse.

    Args:
        message: The user's input message (may be empty or None-falsy).

    Returns:
        tuple: (is_harmful: bool, matched_pattern: str or empty).
        ``matched_pattern`` is the raw regex source from
        HARMFUL_INPUT_PATTERNS that triggered the match, for logging/audit.
    """
    # Empty/None input can never match; short-circuit before any regex work.
    if not message:
        return False, ""

    # Each compiled pattern already carries re.IGNORECASE, so the previous
    # explicit message.lower() pass was redundant and has been dropped.
    # zip() pairs each compiled pattern with its source string directly,
    # avoiding index bookkeeping across two parallel lists.
    for source, compiled in zip(HARMFUL_INPUT_PATTERNS, _HARMFUL_PATTERNS_COMPILED):
        if compiled.search(message):
            # Lazy %-style args: the string is only built if WARNING is emitted.
            logger.warning("Harmful content detected in user input. Pattern: %s", source)
            return True, source

    return False, ""
| 119 | + |
| 120 | + |
# Standard RAI (Responsible AI) refusal message returned verbatim to the user
# when _check_input_for_harmful_content flags their input (yielded as the
# "content" of the content_safety agent response). This is a runtime string
# that reaches end users — do not reword casually.
RAI_HARMFUL_CONTENT_RESPONSE = """I'm a specialized marketing content generation assistant designed exclusively for creating professional marketing materials.

I cannot help with this request as it involves content that violates our content safety guidelines. I'm designed to create positive, helpful marketing content only.

If you have a legitimate marketing request, I'd be happy to help you create:
- Product descriptions and campaigns
- Social media content
- Email marketing materials
- Brand messaging and taglines

Please share a marketing-related request and I'll assist you."""
| 133 | + |
| 134 | + |
61 | 135 | # RAI (Responsible AI) detection patterns |
62 | 136 | # These patterns indicate when an agent has identified a jailbreak attempt, |
63 | 137 | # content safety violation, or out-of-scope request |
@@ -650,6 +724,23 @@ async def process_message( |
650 | 724 |
|
651 | 725 | logger.info(f"Processing message for conversation {conversation_id}") |
652 | 726 |
|
| 727 | + # PROACTIVE CONTENT SAFETY CHECK - Block harmful content at input layer |
| 728 | + # This is the first line of defense, before any agent processes the request |
| 729 | + is_harmful, matched_pattern = _check_input_for_harmful_content(message) |
| 730 | + if is_harmful: |
| 731 | + logger.warning(f"Blocking harmful content for conversation {conversation_id}. Pattern: {matched_pattern}") |
| 732 | + yield { |
| 733 | + "type": "agent_response", |
| 734 | + "agent": "content_safety", |
| 735 | + "content": RAI_HARMFUL_CONTENT_RESPONSE, |
| 736 | + "conversation_history": f"user: {message}\ncontent_safety: {RAI_HARMFUL_CONTENT_RESPONSE}", |
| 737 | + "is_final": True, |
| 738 | + "rai_blocked": True, |
| 739 | + "blocked_reason": "harmful_content_detected", |
| 740 | + "metadata": {"conversation_id": conversation_id} |
| 741 | + } |
| 742 | + return # Exit immediately - do not process through agents |
| 743 | + |
653 | 744 | # Prepare the input with context |
654 | 745 | full_input = message |
655 | 746 | if context: |
@@ -767,6 +858,21 @@ async def send_user_response( |
767 | 858 | if not self._initialized: |
768 | 859 | self.initialize() |
769 | 860 |
|
| 861 | + # PROACTIVE CONTENT SAFETY CHECK - Block harmful content in follow-up messages too |
| 862 | + is_harmful, matched_pattern = _check_input_for_harmful_content(user_response) |
| 863 | + if is_harmful: |
| 864 | + logger.warning(f"Blocking harmful content in user response for conversation {conversation_id}. Pattern: {matched_pattern}") |
| 865 | + yield { |
| 866 | + "type": "agent_response", |
| 867 | + "agent": "content_safety", |
| 868 | + "content": RAI_HARMFUL_CONTENT_RESPONSE, |
| 869 | + "is_final": True, |
| 870 | + "rai_blocked": True, |
| 871 | + "blocked_reason": "harmful_content_detected", |
| 872 | + "metadata": {"conversation_id": conversation_id} |
| 873 | + } |
| 874 | + return # Exit immediately - do not continue workflow |
| 875 | + |
770 | 876 | try: |
771 | 877 | responses = {request_id: user_response} |
772 | 878 | async for event in self._workflow.send_responses_streaming(responses): |
|
0 commit comments