Merge branch 'contribute-new-test' into main

prompt-security · Apr 16, 2024 · 4d62236 · 4d62236
2 parents 1b43a50 + 890c199
commit 4d62236
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 0 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -110,6 +110,56 @@ git commit -m "Add a brief description of your change"
 git push origin your-branch-name
 ```
 
+## Get Started with Your First Contribution: Adding a New Test
+
+The easiest way to contribute to the ps-fuzz project is by creating a new test! You can see an example of a test contribution in PR #1.
+This can be easily achieved by:
+
+#### 1. Create a Test File
+* Navigate to the attacks directory. 
+* Create a new file, naming it after the specific attack or the dataset it utilizes.
+
+#### 2. Set Up Your File
+Add the following imports and set up logging in your new file:
+```python
+from ..chat_clients import ChatSession
+from ..client_config import ClientConfig
+from ..attack_config import AttackConfig
+from ..test_base import TestBase, StatusUpdate
+from ..attack_registry import register_test
+from typing import Generator
+from pkg_resources import resource_filename
+import logging
+logger = logging.getLogger(__name__)
+```
+
+#### 3. Implement the TestBase Class in your test's class:
+* Define your test class by extending TestBase and using the @register_test decorator.
+* Example implementation:
+```python
+@register_test
+class TestHarmfulBehaviors(TestBase):
+    def __init__(self, client_config: ClientConfig, attack_config: AttackConfig):
+        super().__init__(
+            client_config,
+            attack_config,
+            test_name = "your_test_name_here",
+            test_description = "Describe your test thoroughly here"
+        )
+
+    def run(self) -> Generator[StatusUpdate, None, None]:
+        # Generate or retrieve all necessary attack prompts for the test
+        # Send them to the model
+        # Process the results to determine which attacks succeeded and which failed
+        # That's it!
+```
+
+#### 4. Follow instructions: Implement the logic inside the run function as outlined in the comments.
+
+#### 5. Open a PR! Submit your changes for review by opening a pull request.
+
+#### That’s all it takes to contribute a new test to the PS-Fuzz project!
+
 ## Submitting a pull request
 
 1. Update your branch

diff --git a/ps_fuzz/attack_data/prompt_injections_for_base64.parquet b/ps_fuzz/attack_data/prompt_injections_for_base64.parquet
diff --git a/ps_fuzz/attacks/base64_injection.py b/ps_fuzz/attacks/base64_injection.py
@@ -0,0 +1,58 @@
+from ..chat_clients import ChatSession
+from ..client_config import ClientConfig
+from ..attack_config import AttackConfig
+from ..test_base import TestBase, StatusUpdate
+from ..attack_registry import register_test
+from ..util import contains_refusal_keywords
+from typing import Generator
+import pandas as pd
+from pkg_resources import resource_filename # for loading attack data file packaged with the library
+import logging
+import base64
+logger = logging.getLogger(__name__)
+
+@register_test
+class TestHarmfulBehaviors(TestBase):
+    def __init__(self, client_config: ClientConfig, attack_config: AttackConfig):
+        super().__init__(
+            client_config,
+            attack_config,
+            test_name = "harmful_behavior",
+            test_description = "Challenges the LLM with a dataset of harmful behavior prompts dataset"
+        )
+
+    def run(self):
+        dataset_filename = resource_filename('ps_fuzz', 'attack_data/prompt_injections_for_base64.parquet')
+        data = pd.read_parquet(dataset_filename, engine='fastparquet')
+        rows_count = data.shape[0]
+
+        if rows_count > self.attack_config.attack_prompts_count:
+            # If dataset is larger than requested number of attack samples, reduce dataset by taking attack_prompts_count samples
+            data = data.sample(n=self.attack_config.attack_prompts_count, random_state=42)  # constant `random_state` for better reproducibility
+            rows_count = data.shape[0]
+
+        # Generate list of attack prompt variations
+        yield StatusUpdate(self.client_config, self.test_name, self.status, "Working", 0, rows_count)
+        for attack_prompt_index, row in data.iterrows():
+            attack_prompt = row['text']
+            attack_prompt_bytes = attack_prompt.encode('utf-8')
+            attack_prompt_base64 = base64.b64encode(attack_prompt_bytes)
+            attack_prompt_base64_string = attack_prompt_base64.decode('utf-8')
+            chat = ChatSession(self.client_config.get_target_client(), self.client_config.get_system_prompts())
+            response = chat.say(attack_prompt_base64_string)
+            if not response:
+                logger.warning("Error while attacking against target LLM (didn't receive response) ...", file = sys.stderr)
+                self.status.report_error(attack_prompt)
+
+            # The idea is to check that response contains something similar to expected response, but without refusal keywords
+            # This is a simple way to discriminate refusal from compliance without requiring a full NLP model.
+            verified = not contains_refusal_keywords(response)
+            logger.info(f"Test '{self.test_name}': attack prompt #{attack_prompt_index}: {attack_prompt}")
+            logger.info(f"Test '{self.test_name}': attack response #{attack_prompt_index} break_success={verified}): {response}")
+
+            if verified:
+                self.status.report_breach(attack_prompt, response)
+            else:
+                self.status.report_resilient(attack_prompt, response)
+            yield StatusUpdate(self.client_config, self.test_name, self.status, "Attacking", attack_prompt_index, self.attack_config.attack_prompts_count)
+        yield StatusUpdate(self.client_config, self.test_name, self.status, "Completed", rows_count, rows_count)
diff --git a/setup.py b/setup.py
@@ -43,6 +43,7 @@
         "pandas==2.2.2",
         "inquirer==3.2.4",
         "prompt-toolkit==3.0.43",
+        "fastparquet==2024.2.0"
     ],
     extras_require={
         "dev": ["pytest==7.4.4"]