# clean_json_noexist_img.py
import glob
import json
import os
from datetime import datetime
import sys
# Optional override for the folder that holds the images. When set (for
# example to 'scrap_pdfs/output_pages_images'), filter_items looks each image
# up under this root after removing 'images/' from the item's recorded path.
# When None, images are looked up relative to the JSON directory instead.
images_root = None
def combine_jsons(directory, output_file):
    """Merge every ``*.json`` file in *directory*, filter the items, save the rest.

    The contents of each file are merged with ``list.extend``, so every file is
    assumed to contain a JSON array of items. The merged list is passed through
    ``filter_items`` and the surviving items are written as UTF-8, indented
    JSON. The output name carries no directory component of its own, so it is
    opened relative to the current working directory.

    Args:
        directory: Folder searched (non-recursively) for ``*.json`` files; it
            is also forwarded to ``filter_items``.
        output_file: Base name of the combined output. When exactly one JSON
            file is found this argument is ignored and the output is named
            ``<input stem>_fix.json``; otherwise the output is named
            ``<YYYYMMDD>_<number of kept items>_<output_file>``.
    """
    # Sorted so the order of the combined items does not depend on the
    # arbitrary order in which glob lists directory entries.
    json_files = sorted(glob.glob(os.path.join(directory, '*.json')))
    single_input = len(json_files) == 1

    combined_data = []
    for path in json_files:
        with open(path, 'r', encoding='utf-8') as f:
            combined_data.extend(json.load(f))

    filtered_data = filter_items(combined_data, directory)

    if single_input:
        # Swap only the trailing extension. str.replace would also rewrite a
        # '.json' occurring earlier in the name, and (being case-sensitive)
        # would leave a differently-cased extension untouched.
        stem, _ = os.path.splitext(os.path.basename(json_files[0]))
        output_filename = f"{stem}_fix.json"
    else:
        date_prefix = datetime.now().strftime("%Y%m%d")
        output_filename = f"{date_prefix}_{len(filtered_data)}_{output_file}"

    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)
    print(f"Combined JSON files saved to: {output_filename}, filtered: {len(combined_data)} {len(filtered_data)}")
def filter_items(data, directory):
    """Return the items whose gpt replies are valid and whose image file exists.

    An item is dropped, and a message is printed, when ``is_gpt_value_valid``
    rejects its ``'conversations'`` or when its image path does not exist.
    The image path is ``directory/<item['image']>`` unless the module-level
    ``images_root`` is set; in that case it is ``images_root`` joined with
    ``item['image']`` after every ``'images/'`` occurrence has been removed.

    Args:
        data: Iterable of dict items, each with ``'conversations'`` and
            ``'image'`` keys.
        directory: Base folder for image lookup when ``images_root`` is None.

    Returns:
        A new list with the kept items, in their original order.
    """
    kept = []
    for entry in data:
        # Guard 1: the conversation itself must be usable.
        if not is_gpt_value_valid(entry['conversations']):
            print(f"Skipping item with image name: {entry['image']}")
            continue

        if images_root is None:
            image_path = os.path.join(directory, entry['image'])
        else:
            image_path = os.path.join(images_root, entry['image'].replace('images/', ''))

        # Guard 2: the referenced image must be present on disk.
        if not os.path.exists(image_path):
            print(f"Skipping item with image name: {entry['image']} (file not found)")
            continue

        kept.append(entry)
    return kept
def is_gpt_value_valid(conversations):
    """Tell whether every 'gpt' turn in *conversations* has a usable reply.

    A reply is unusable when it is empty after stripping whitespace or when it
    contains the marker ``'&#$'``. Turns from any other speaker are ignored,
    so a conversation without 'gpt' turns is considered valid.

    Args:
        conversations: Iterable of dicts with a ``'from'`` key; turns whose
            ``'from'`` is ``'gpt'`` must also have a string ``'value'``.

    Returns:
        True if no 'gpt' reply is unusable, False otherwise.
    """
    gpt_replies = (turn['value'] for turn in conversations if turn['from'] == 'gpt')
    return all(reply.strip() != '' and '&#$' not in reply for reply in gpt_replies)
# Script entry point: the directory to clean is the first command-line argument.
# Guarded so that importing this module does not start a run.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit(f"Usage: {sys.argv[0]} <directory>")
    directory = sys.argv[1]
    output_file = 'combined.json'
    combine_jsons(directory, output_file)