diff --git a/scripts/README.md b/scripts/README.md index d6086a17..ee34f119 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -21,6 +21,7 @@ python hn_mine.py \ --input_file toy_finetune_data.jsonl \ --output_file toy_finetune_data_minedHN.jsonl \ --range_for_sampling 2-200 \ +--similarity_range 0.3-0.8 \ --negative_number 15 \ --use_gpu_for_searching ``` @@ -29,6 +30,7 @@ python hn_mine.py \ - **`output_file`**: path to save JSON data with mined hard negatives for finetuning - **`negative_number`**: the number of sampled negatives - **`range_for_sampling`**: where to sample negative. For example, `2-100` means sampling `negative_number` negatives from top2-top200 documents. **You can set larger value to reduce the difficulty of negatives (e.g., set it `60-300` to sample negatives from top60-300 passages)** +- **`similarity_range`**: the similarity score range for sampling negatives, given as `min-max`. Only candidates whose similarity score to the query falls within this range (inclusive) are kept as negatives, which lets you control the difficulty of the negatives based on their relevance to the query. For example, "0.3-0.8" samples only negatives with similarity scores between 0.3 and 0.8. This filter is applied to the candidates already selected by `range_for_sampling`. Widening the range (e.g., "0.1-0.9") reduces the difficulty by including more diverse and less relevant negatives, whereas narrowing it (e.g., "0.6-0.8") increases the difficulty by focusing on more relevant negatives. - **`candidate_pool`**: The pool to retrieval. The default value is None, and this script will retrieve from the combination of all `neg` in `input_file`. The format of this file is the same as [pretrain data](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/pretrain#2-data-format). If input a candidate_pool, this script will retrieve negatives from this file. - **`use_gpu_for_searching`**: whether to use faiss-gpu to retrieve negatives. 
diff --git a/scripts/hn_mine.py b/scripts/hn_mine.py index dfa82a75..5c024f57 100644 --- a/scripts/hn_mine.py +++ b/scripts/hn_mine.py @@ -15,6 +15,7 @@ def get_args(): parser.add_argument('--candidate_pool', default=None, type=str) parser.add_argument('--output_file', default=None, type=str) parser.add_argument('--range_for_sampling', default="10-210", type=str, help="range to sample negatives") + parser.add_argument('--similarity_range', default="0.0-1.0", type=str, help="similarity range to sample negatives") parser.add_argument('--use_gpu_for_searching', action='store_true', help='use faiss-gpu') parser.add_argument('--negative_number', default=15, type=int, help='the number of negatives') parser.add_argument('--query_instruction_for_retrieval', default="") @@ -55,7 +56,7 @@ def get_corpus(candidate_pool): return corpus -def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, negative_number, use_gpu): +def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, similarity_range, negative_number, use_gpu): corpus = [] queries = [] train_data = [] @@ -81,16 +82,21 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n print('create index and search------------------') index = create_index(p_vecs, use_gpu=use_gpu) - _, all_inxs = batch_search(index, q_vecs, topk=sample_range[-1]) + all_scores, all_inxs = batch_search(index, q_vecs, topk=sample_range[-1]) assert len(all_inxs) == len(train_data) + min_sim, max_sim = similarity_range for i, data in enumerate(train_data): query = data['query'] - inxs = all_inxs[i][sample_range[0]:sample_range[1]] + scores = all_scores[i] + inxs = all_inxs[i] + + inxs = inxs[sample_range[0]:sample_range[1]] + scores = scores[sample_range[0]:sample_range[1]] + filtered_inx = [] - for inx in inxs: - if inx == -1: break - if corpus[inx] not in data['pos'] and corpus[inx] != query: + for score, inx in zip(scores, inxs): + if min_sim <= score <= max_sim and corpus[inx] 
not in data['pos'] and corpus[inx] != query: filtered_inx.append(inx) if len(filtered_inx) > negative_number: @@ -108,8 +114,9 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n if __name__ == '__main__': args = get_args() - sample_range = args.range_for_sampling.split('-') - sample_range = [int(x) for x in sample_range] + + sample_range = list(map(int, args.range_for_sampling.split('-'))) + similarity_range = list(map(float, args.similarity_range.split('-'))) model = FlagModel(args.model_name_or_path, query_instruction_for_retrieval=args.query_instruction_for_retrieval) @@ -118,5 +125,6 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n candidate_pool=args.candidate_pool, output_file=args.output_file, sample_range=sample_range, + similarity_range=similarity_range, negative_number=args.negative_number, - use_gpu=args.use_gpu_for_searching) + use_gpu=args.use_gpu_for_searching) \ No newline at end of file