From af01bc850039a862e42771a522eeb22c9c2769fa Mon Sep 17 00:00:00 2001 From: esbatmop Date: Thu, 23 May 2024 21:59:16 +0800 Subject: [PATCH] fix oom bug --- basic_dedup/write_meta_data_pkl.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/basic_dedup/write_meta_data_pkl.py b/basic_dedup/write_meta_data_pkl.py index 35837b0..1f87f44 100644 --- a/basic_dedup/write_meta_data_pkl.py +++ b/basic_dedup/write_meta_data_pkl.py @@ -2,17 +2,21 @@ import hashlib import pandas as pd import argparse +import time # 计算文件的 SHA256 哈希值 def sha256(filename): + hash_sha256 = hashlib.sha256() with open(filename, 'rb') as f: - content = f.read() - return hashlib.sha256(content).hexdigest() + for chunk in iter(lambda: f.read(1024*1024*256), b""): + hash_sha256.update(chunk) + return hash_sha256.hexdigest() # 递归遍历目录并输出文件路径、文件大小和 SHA256 哈希值 def get_all_files_list(dir_path): file_path_list = [] for root, _, files in os.walk(dir_path): + #print(files) for file in files: file_path = os.path.join(root, file) file_path_list.append(file_path) @@ -29,6 +33,7 @@ def write_to_csv(dir_path, pkl_file='files.pkl'): data = {'File': [], 'Size': [], 'SHA256': []} file_path_set = set(get_all_files_list(dir_path)) + #print(file_path_set) file_path_set -= set(existing_df['File']) @@ -40,8 +45,8 @@ def write_to_csv(dir_path, pkl_file='files.pkl'): data['Size'].append(file_size) data['SHA256'].append(file_sha256) except: - print('file not exist: {}'.format(filepath)) - + print("file not exist:" + filepath) + df = pd.concat([existing_df, pd.DataFrame(data)], ignore_index=True) # 将 DataFrame 写入 pickle 文件