-
Notifications
You must be signed in to change notification settings - Fork 1
/
format_B.py
49 lines (35 loc) · 1.41 KB
/
format_B.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import argparse
import json
import os
def convert_format(old_data):
    """Flatten role-tagged conversations into single transcript strings.

    Every element of ``old_data`` is indexed by ``'conversations'``, which
    yields a sequence of messages that each provide ``'from'`` and
    ``'value'``. A message from ``'human'`` is rendered as ``USER: <value>``
    followed by a newline; a message from ``'gpt'`` is rendered as
    ``ASSISTANT: <value></s>`` followed by a newline. Messages from any other
    sender are skipped.

    Returns:
        A list with one ``{'conversation': text}`` dict per input element,
        where ``text`` is the concatenated transcript with leading and
        trailing whitespace stripped.
    """
    converted = []
    for record in old_data:
        pieces = []
        for turn in record['conversations']:
            sender = turn['from']
            if sender == 'human':
                pieces.append('USER: ' + turn['value'] + '\n')
            elif sender == 'gpt':
                pieces.append('ASSISTANT: ' + turn['value'] + '</s>\n')
        # Strip after joining so the final turn's trailing newline is removed.
        transcript = ''.join(pieces).strip()
        converted.append({'conversation': transcript})
    return converted
def filter_conversations(new_data):
    """Return only the entries whose conversation text has no blocked marker.

    An entry is kept when its ``'conversation'`` value contains neither
    ``<s>`` nor the four-character sequence made of ``<``, a backslash,
    ``s`` and ``>``. The forward-slash form ``</s>`` is not one of the
    blocked markers, so it does not cause an entry to be dropped.

    Returns:
        A new list holding the retained entries in their original order.
    """
    blocked_markers = ('<s>', '<\\s>')
    return [
        entry
        for entry in new_data
        if not any(marker in entry['conversation'] for marker in blocked_markers)
    ]
def main():
    """Convert the dataset named by ``--input`` and write the filtered result.

    Parses the required ``--input`` command-line option, loads that file as
    JSON, passes the data through ``convert_format`` and then
    ``filter_conversations``, and writes the outcome as indented JSON to
    ``datasets/vicuna_data_B.json``. The ``datasets`` directory is created
    if it does not already exist.
    """
    parser = argparse.ArgumentParser(description="Convert JSON dataset from old format to new format.")
    parser.add_argument('--input', type=str, required=True, help='Path to the input JSON file in old format.')
    args = parser.parse_args()

    # Read as UTF-8 explicitly: without it open() falls back to the platform
    # locale encoding, which can fail or mis-decode non-ASCII JSON text.
    with open(args.input, 'r', encoding='utf-8') as input_file:
        old_data = json.load(input_file)

    new_data = convert_format(old_data)
    filtered_data = filter_conversations(new_data)

    os.makedirs('datasets', exist_ok=True)
    # json.dump escapes non-ASCII by default, so the bytes written are the
    # same as before; the explicit encoding just removes locale dependence.
    with open(os.path.join('datasets', 'vicuna_data_B.json'), 'w', encoding='utf-8') as all_output_file:
        json.dump(filtered_data, all_output_file, indent=2)


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()