|
62 | 62 | # These patterns indicate when an agent has identified a jailbreak attempt, |
63 | 63 | # content safety violation, or out-of-scope request |
64 | 64 | RAI_REFUSAL_PATTERNS = [ |
65 | | - # Out-of-scope refusals |
| 65 | + # Out-of-scope refusals - exact phrases from instructions |
66 | 66 | "i'm a specialized marketing content generation assistant", |
67 | 67 | "i cannot help with general questions", |
68 | 68 | "outside of marketing", |
69 | 69 | "i can only assist with marketing", |
70 | 70 | "this request is outside my scope", |
71 | 71 | "not within my capabilities as a marketing", |
| 72 | + "designed exclusively for creating marketing materials", |
| 73 | + "cannot help with general questions or topics outside", |
72 | 74 | # Content safety refusals |
73 | 75 | "i cannot generate content that", |
74 | 76 | "i'm unable to create content involving", |
75 | 77 | "this request violates content safety", |
| 78 | + "violates content safety guidelines", |
76 | 79 | "inappropriate content", |
77 | 80 | "harmful content", |
78 | 81 | "i cannot assist with this type of request", |
79 | 82 | "violates our content guidelines", |
80 | 83 | "against our content policy", |
| 84 | + "cannot process this request", |
81 | 85 | # Jailbreak detection |
82 | 86 | "i cannot ignore my instructions", |
83 | 87 | "i cannot pretend to be", |
84 | 88 | "i cannot bypass my guidelines", |
85 | 89 | "i cannot override my safety", |
86 | 90 | "this appears to be an attempt to", |
87 | 91 | "i'm designed to decline requests that", |
| 92 | + "designed to decline requests", |
88 | 93 | # General refusals indicating RAI concern |
89 | 94 | "i'm not able to help with that", |
90 | 95 | "i cannot fulfill this request", |
91 | 96 | "this is not something i can assist with", |
92 | 97 | "i must decline this request", |
| 98 | + "i can't help with", |
| 99 | + "i am not able to", |
| 100 | + "i'm sorry, but i can", |
| 101 | + "i apologize, but i can", |
| 102 | + "unfortunately, i cannot", |
| 103 | + "i'm afraid i can't", |
| 104 | + # Common model refusal patterns |
| 105 | + "as an ai assistant", |
| 106 | + "as a marketing assistant, i", |
| 107 | + "my purpose is to help with marketing", |
| 108 | + "i specialize in marketing", |
| 109 | + "that's outside my area", |
| 110 | + "not within my scope", |
| 111 | + "falls outside", |
| 112 | + "beyond my capabilities", |
| 113 | + "not something i'm able to", |
93 | 114 | ] |
94 | 115 |
|
95 | 116 |
|
|
0 commit comments