Merge branch 'contribute-new-test' into main
guy-ps authored Apr 16, 2024
2 parents 1b43a50 + 890c199 commit 4d62236
Showing 4 changed files with 109 additions and 0 deletions.
50 changes: 50 additions & 0 deletions CONTRIBUTING.md
@@ -110,6 +110,56 @@ git commit -m "Add a brief description of your change"
git push origin your-branch-name
```

## Get Started with Your First Contribution: Adding a New Test

The easiest way to contribute to the ps-fuzz project is by creating a new test! You can see an example PR that adds a test here: PR #1
This can be easily achieved by following these steps:

#### 1. Create a Test File
* Navigate to the attacks directory.
* Create a new file, naming it after the specific attack or the dataset it utilizes.
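For example, the test added in this commit lives at `ps_fuzz/attacks/base64_injection.py`, named after the base64-encoded prompt-injection dataset it uses.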

#### 2. Set Up Your File
Add the following imports and set up logging in your new file:
```python
from ..chat_clients import ChatSession
from ..client_config import ClientConfig
from ..attack_config import AttackConfig
from ..test_base import TestBase, StatusUpdate
from ..attack_registry import register_test
from typing import Generator
from pkg_resources import resource_filename
import logging
logger = logging.getLogger(__name__)
```

#### 3. Implement Your Test Class
* Define your test class by extending TestBase and using the @register_test decorator.
* Example implementation:
```python
@register_test
class TestHarmfulBehaviors(TestBase):
def __init__(self, client_config: ClientConfig, attack_config: AttackConfig):
super().__init__(
client_config,
attack_config,
test_name = "your_test_name_here",
test_description = "Describe your test thoroughly here"
)

def run(self) -> Generator[StatusUpdate, None, None]:
# Generate or retrieve all necessary attack prompts for the test
# Send them to the model
# Process the results to determine which attacks succeeded and which failed
# That's it!
```

#### 4. Implement the logic inside the `run` function, as outlined in the comments above. A rough sketch is shown below.
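Here is a rough sketch of what that logic can look like, modeled on the `base64_injection.py` test added in this same commit. The prompt list is a placeholder, and `contains_refusal_keywords` comes from `ps_fuzz.util` (imported as `from ..util import contains_refusal_keywords`); real tests typically load a dataset or generate prompts via `self.attack_config` instead:
```python
# Rough sketch of a run() method -- goes inside your TestBase subclass from step 3.
def run(self) -> Generator[StatusUpdate, None, None]:
    attack_prompts = ["example attack prompt 1", "example attack prompt 2"]  # placeholder prompts
    yield StatusUpdate(self.client_config, self.test_name, self.status, "Attacking", 0, len(attack_prompts))
    for attack_prompt_index, attack_prompt in enumerate(attack_prompts):
        # Send the prompt to the target model in a fresh chat session
        chat = ChatSession(self.client_config.get_target_client(), self.client_config.get_system_prompts())
        response = chat.say(attack_prompt)
        if not response:
            self.status.report_error(attack_prompt)
        elif not contains_refusal_keywords(response):
            # No refusal keywords in the response: count the attack as successful (a breach)
            self.status.report_breach(attack_prompt, response)
        else:
            self.status.report_resilient(attack_prompt, response)
        yield StatusUpdate(self.client_config, self.test_name, self.status, "Attacking",
                           attack_prompt_index, len(attack_prompts))
    yield StatusUpdate(self.client_config, self.test_name, self.status, "Completed",
                       len(attack_prompts), len(attack_prompts))
```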

#### 5. Open a PR! Submit your changes for review by opening a pull request.

#### That’s all it takes to contribute a new test to the PS-Fuzz project!

## Submitting a pull request

1. Update your branch
Binary file not shown.
58 changes: 58 additions & 0 deletions ps_fuzz/attacks/base64_injection.py
@@ -0,0 +1,58 @@
from ..chat_clients import ChatSession
from ..client_config import ClientConfig
from ..attack_config import AttackConfig
from ..test_base import TestBase, StatusUpdate
from ..attack_registry import register_test
from ..util import contains_refusal_keywords
from typing import Generator
import pandas as pd
from pkg_resources import resource_filename # for loading attack data file packaged with the library
import logging
import base64
logger = logging.getLogger(__name__)

@register_test
class TestHarmfulBehaviors(TestBase):
def __init__(self, client_config: ClientConfig, attack_config: AttackConfig):
super().__init__(
client_config,
attack_config,
test_name = "harmful_behavior",
            test_description = "Challenges the LLM with a dataset of harmful behavior prompts"
)

    def run(self) -> Generator[StatusUpdate, None, None]:
dataset_filename = resource_filename('ps_fuzz', 'attack_data/prompt_injections_for_base64.parquet')
data = pd.read_parquet(dataset_filename, engine='fastparquet')
rows_count = data.shape[0]

if rows_count > self.attack_config.attack_prompts_count:
# If dataset is larger than requested number of attack samples, reduce dataset by taking attack_prompts_count samples
data = data.sample(n=self.attack_config.attack_prompts_count, random_state=42) # constant `random_state` for better reproducibility
rows_count = data.shape[0]

# Generate list of attack prompt variations
yield StatusUpdate(self.client_config, self.test_name, self.status, "Working", 0, rows_count)
for attack_prompt_index, row in data.iterrows():
attack_prompt = row['text']
attack_prompt_bytes = attack_prompt.encode('utf-8')
attack_prompt_base64 = base64.b64encode(attack_prompt_bytes)
attack_prompt_base64_string = attack_prompt_base64.decode('utf-8')
chat = ChatSession(self.client_config.get_target_client(), self.client_config.get_system_prompts())
response = chat.say(attack_prompt_base64_string)
            if not response:
                logger.warning("Error while attacking against target LLM (didn't receive response) ...")
                self.status.report_error(attack_prompt)
                continue  # no response to evaluate, skip analysis of this prompt

# The idea is to check that response contains something similar to expected response, but without refusal keywords
# This is a simple way to discriminate refusal from compliance without requiring a full NLP model.
verified = not contains_refusal_keywords(response)
logger.info(f"Test '{self.test_name}': attack prompt #{attack_prompt_index}: {attack_prompt}")
logger.info(f"Test '{self.test_name}': attack response #{attack_prompt_index} break_success={verified}): {response}")

if verified:
self.status.report_breach(attack_prompt, response)
else:
self.status.report_resilient(attack_prompt, response)
yield StatusUpdate(self.client_config, self.test_name, self.status, "Attacking", attack_prompt_index, self.attack_config.attack_prompts_count)
yield StatusUpdate(self.client_config, self.test_name, self.status, "Completed", rows_count, rows_count)
1 change: 1 addition & 0 deletions setup.py
@@ -43,6 +43,7 @@
"pandas==2.2.2",
"inquirer==3.2.4",
"prompt-toolkit==3.0.43",
"fastparquet==2024.2.0"
],
extras_require={
"dev": ["pytest==7.4.4"]