-
Notifications
You must be signed in to change notification settings - Fork 0
/
jsonFiles_merging.py
90 lines (83 loc) · 3.99 KB
/
jsonFiles_merging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os,shutil,json,codecs
def merge_json_files(directory_path, output_json_file):
    """Merge every ``*.json`` file of a directory into one JSON array file.

    Each input file is loaded with ``json.load`` and iterated; every element
    is converted with ``dict()`` and written on its own tab-indented line of
    the output array. Non-ASCII characters are written as-is (UTF-8), not
    escaped.

    Args:
        directory_path: Directory whose direct ``.json`` entries are merged.
            Sub-directories are not descended into.
        output_json_file: Path of the merged file; created or overwritten.
            If it lives inside ``directory_path`` it is not read as an input.

    Raises:
        OSError: If the directory or one of the files cannot be opened.
        json.JSONDecodeError: If an input file does not contain valid JSON.
    """
    output_abs = os.path.normcase(os.path.abspath(output_json_file))
    # List (and sort) the inputs before the output is created, so the merge
    # order is deterministic and a missing directory leaves no stray output.
    filenames = sorted(
        name for name in os.listdir(directory_path) if name.endswith(".json")
    )
    wrote_any = False
    # newline='\n' disables platform newline translation, so the bytes on disk
    # are the same everywhere.
    with open(output_json_file, 'w', encoding='UTF-8', newline='\n') as json_file:
        json_file.write('[')
        for filename in filenames:
            json_file_path = os.path.join(directory_path, filename)
            if os.path.normcase(os.path.abspath(json_file_path)) == output_abs:
                # Never feed the (half-written) output back into itself.
                continue
            with open(json_file_path, 'r', encoding='UTF-8') as sub_json_file:
                data = json.load(sub_json_file)
            for record in data:
                item_json = json.dumps(dict(record), ensure_ascii=False)
                # Emit the separator *before* each record instead of trimming
                # a trailing comma afterwards; this also keeps the result
                # valid JSON ("[\n]") when there are no records at all.
                json_file.write((',\n\t' if wrote_any else '\n\t') + item_json)
                wrote_any = True
        json_file.write('\n]')
    print("ALL JSONs Merged")
import ijson
def count_records(json_path):
    """Count the elements of the top-level JSON array stored in a file.

    The file is streamed through ``ijson.items(..., "item")``, so the whole
    document is never loaded into memory at once.

    Args:
        json_path: Path of a JSON file whose top-level value is an array.

    Returns:
        The number of array elements yielded by ijson.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Read-only binary mode: counting must not require write permission
    # (the previous "r+" did), and ijson consumes bytes.
    with open(json_path, 'rb') as f:
        return sum(1 for _ in ijson.items(f, "item"))
def _urls_by_label(release_dtl):
    """Return ``{label: url}`` for the entries of one ``release_dtl`` mapping.

    Keys starting with ``'d_'`` are stored under ``'FANZA'``, keys starting
    with ``'RJ'`` under ``'DLsite'``; any other key is kept unchanged. When
    several keys map to the same label, the last one wins.
    """
    urls = {}
    for key, value in release_dtl.items():
        if key.startswith('d_'):
            label = 'FANZA'
        elif key.startswith('RJ'):
            label = 'DLsite'
        else:
            label = key
        urls[label] = value['url']
    return urls


def url_extract(json_path, output_json_file):
    """Write a JSON array of ``{ID: {label: url}}`` objects for every record.

    Records are streamed from the top-level array of ``json_path`` with ijson.
    For each record its ``'ID'`` and the ``'url'`` of every ``'release_dtl'``
    entry are written as one tab-indented line of the output array.
    Non-ASCII characters are written as-is (UTF-8), not escaped.

    Args:
        json_path: Path of the input JSON file (top-level array of records).
        output_json_file: Path of the file to write; created or overwritten.

    Returns:
        The number of records written.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If a record lacks ``'ID'``/``'release_dtl'`` or an entry
            lacks ``'url'``.
    """
    total_records = 0
    # The input is opened first (read-only, binary for ijson) so that a
    # missing input does not leave a half-written output file behind.
    # newline='\n' disables platform newline translation on the output.
    with open(json_path, 'rb') as f, \
            open(output_json_file, 'w', encoding='UTF-8', newline='\n') as output:
        output.write('[')
        for record in ijson.items(f, 'item'):
            url_info = {record['ID']: _urls_by_label(record['release_dtl'])}
            item_json = json.dumps(url_info, ensure_ascii=False)
            # Separator goes before each record, so no trailing comma has to
            # be trimmed and an empty input still yields valid JSON ("[\n]").
            output.write((',\n\t' if total_records else '\n\t') + item_json)
            total_records += 1
        output.write('\n]')
    return total_records
def unfold_json_files(directory_path):
    """Move the JSON files of every direct sub-folder up into the directory.

    For each sub-directory of ``directory_path``, every entry ending in
    ``.json`` except ``missing_ids.json`` is moved into ``directory_path``
    itself. A summary with the number of moved files is printed at the end.

    Args:
        directory_path: Directory whose immediate sub-folders are flattened.
    """
    moved = 0
    for entry in os.listdir(directory_path):
        subdir = os.path.join(directory_path, entry)
        if not os.path.isdir(subdir):
            continue
        for name in os.listdir(subdir):
            # 'missing_ids.json' stays in its folder; non-JSON entries are ignored.
            if name == 'missing_ids.json' or not name.endswith(".json"):
                continue
            shutil.move(os.path.join(subdir, name), directory_path)
            moved += 1
    print("ALL JSON Folders Unfolded\n{0} Files in total".format(moved))
# Script section: these statements run whenever the module is executed.
# NOTE(review): they also run on import, as there is no __main__ guard.
# Switch to the directory containing this file so that the relative paths
# below ('dojinDB.json', 'dojin_urls.json', ...) resolve next to the script.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# Disabled steps: merge all batches / count the records of a test batch.
#merge_json_files(r'D:\2024Spring\DLsite-Analysis\ALL Batches', 'dojinDB.json')
#print("{0} records loaded".format(count_records('test_batch\\test_batch.json')))
# Active step: move the JSON files out of each sub-folder of the batches directory.
unfold_json_files(r'D:\2024Spring\DLsite-Analysis\batches')
# Disabled alternative input/output paths (test batch).
#json_path = 'test_batch\\test_batch.json'
#output_json_file = 'test_batch\\test_url.json'
# Paths used by the (currently disabled) url_extract call below.
json_path = 'dojinDB.json'
output_json_file = 'dojin_urls.json'
#print("{0} records extracted".format(url_extract(json_path, output_json_file)))