From f1ce7d1f2ffcda9f0ace4d0f013883764f08ef5e Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Thu, 28 Apr 2022 14:48:00 -0400 Subject: [PATCH 01/12] init wp-tokenizer for beir stats --- scripts/beir/tokenize_corpus.py | 97 +++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 scripts/beir/tokenize_corpus.py diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py new file mode 100644 index 000000000..5a987cb89 --- /dev/null +++ b/scripts/beir/tokenize_corpus.py @@ -0,0 +1,97 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import argparse +import multiprocessing +import json +import time +from joblib import Parallel, delayed +from transformers import AutoTokenizer +from ltr_msmarco.convert_common import get_retokenized + +'''Replace original contents fields with bert tokenization''' + + +parser = argparse.ArgumentParser(description='Convert MSMARCO-adhoc documents.') +parser.add_argument('--input', metavar='input file', help='input file', + type=str, required=True) +parser.add_argument('--output', metavar='output file', help='output file', + type=str, required=True) +parser.add_argument('--workers', metavar='# of processes', help='# of workers to spawn', + type=int, default=multiprocessing.cpu_count() - 2) + + +args = parser.parse_args() +print(args) +arg_vars = vars(args) + +def batch_file(iterable, n=10000): + batch = [] + for line in iterable: + batch.append(line) + if len(batch) == n: + yield batch + batch = [] + if len(batch) > 0: + yield batch + batch = [] + return + + +def batch_process(batch): + bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + + def process(line): + if not line: + return None + json_line = json.loads(line) + pid = json_line['id'] + body = json_line['contents'] + + doc = {"id": pid, + "contents": get_retokenized(bert_tokenizer, body.lower())} + return doc + + res = [] + start = time.time() + for line in batch: + res.append(process(line)) + if len(res) % 1000 == 0: + end = time.time() + print(f"finish {len(res)} using {end-start}") + start = end + return res + + +if __name__ == '__main__': + workers = args.workers + print(f"Spawning {workers} processes") + pool = Parallel(n_jobs=workers, verbose=10) + line_num = 0 + + with open(args.input) as inFile: + with open(args.output, 'w') as outFile: + for batch_json in pool([delayed(batch_process)(batch) for batch in batch_file(inFile)]): + for doc_json in batch_json: + line_num = line_num + 1 + if doc_json is not None: + outFile.write(json.dumps(doc_json) + '\n') + else: + print(f"Ignoring misformatted line {line_num}") + + if line_num % 100 == 0: + print(f"Processed {line_num} passages") + + print(f"Processed {line_num} passages") From 66ebe22f695033e935b8cf9258e442c4e87c25f9 Mon Sep 17 00:00:00 2001 From: yqxie Date: Thu, 28 Apr 2022 15:26:01 -0400 Subject: [PATCH 02/12] modify tokenize_corpus from msmarco to beir --- scripts/beir/tokenize_corpus.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py index 5a987cb89..5bfb7bdc8 100644 --- a/scripts/beir/tokenize_corpus.py +++ b/scripts/beir/tokenize_corpus.py @@ -19,12 +19,11 @@ import time from joblib import Parallel, delayed from transformers import AutoTokenizer -from ltr_msmarco.convert_common import get_retokenized '''Replace original contents fields with bert tokenization''' -parser = argparse.ArgumentParser(description='Convert MSMARCO-adhoc documents.') +parser = argparse.ArgumentParser(description='Convert BEIR original documents to word piece tokenized.') parser.add_argument('--input', metavar='input file', help='input file', type=str, required=True) parser.add_argument('--output', metavar='output file', help='output file', @@ -37,6 +36,17 @@ print(args) arg_vars = vars(args) +def get_retokenized(tokenizer, text): + """ + copy from pyserini.scripts.ltr_msmarco.convert_common.get_retokenized + Obtain a space separated re-tokenized text. + :param tokenizer: a tokenizer that has the function + tokenize that returns an array of tokens. + :param text: a text to re-tokenize. + """ + return ' '.join(tokenizer.tokenize(text)) + + def batch_file(iterable, n=10000): batch = [] for line in iterable: @@ -57,11 +67,13 @@ def process(line): if not line: return None json_line = json.loads(line) - pid = json_line['id'] - body = json_line['contents'] + pid = json_line['_id'] + title = json_line['title'] + body = json_line['text'] - doc = {"id": pid, - "contents": get_retokenized(bert_tokenizer, body.lower())} + doc = {"_id": pid, + "title": get_retokenized(bert_tokenizer, title.lower()), + "text": get_retokenized(bert_tokenizer, body.lower())} return doc res = [] From f463df831158efbb0205ef688b8d6db67406d388 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Thu, 28 Apr 2022 16:18:22 -0400 Subject: [PATCH 03/12] add query tokenizer --- scripts/beir/tokenize_queries.py | 109 +++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 scripts/beir/tokenize_queries.py diff --git a/scripts/beir/tokenize_queries.py b/scripts/beir/tokenize_queries.py new file mode 100644 index 000000000..583c44ffd --- /dev/null +++ b/scripts/beir/tokenize_queries.py @@ -0,0 +1,109 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import argparse +import multiprocessing +import json +import time +from joblib import Parallel, delayed +from transformers import AutoTokenizer + +'''Replace original contents fields with bert tokenization''' + + +parser = argparse.ArgumentParser(description='Convert BEIR original documents to word piece tokenized.') +parser.add_argument('--input', metavar='input file', help='input file', + type=str, required=True) +parser.add_argument('--output', metavar='output file', help='output file', + type=str, required=True) +parser.add_argument('--workers', metavar='# of processes', help='# of workers to spawn', + type=int, default=multiprocessing.cpu_count() - 2) + + +args = parser.parse_args() +print(args) +arg_vars = vars(args) + +def get_retokenized(tokenizer, text): + """ + copy from pyserini.scripts.ltr_msmarco.convert_common.get_retokenized + Obtain a space separated re-tokenized text. + :param tokenizer: a tokenizer that has the function + tokenize that returns an array of tokens. + :param text: a text to re-tokenize. + """ + return ' '.join(tokenizer.tokenize(text)) + + +def batch_file(iterable, n=10000): + batch = [] + for line in iterable: + batch.append(line) + if len(batch) == n: + yield batch + batch = [] + if len(batch) > 0: + yield batch + batch = [] + return + + +def batch_process(batch): + bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + + def process(line): + if not line: + return None + json_line = json.loads(line) + pid = json_line['_id'] + title = json_line['title'] + body = json_line['text'] + + doc = {"_id": pid, + "title": get_retokenized(bert_tokenizer, title.lower()), + "text": get_retokenized(bert_tokenizer, body.lower())} + return doc + + res = [] + start = time.time() + for line in batch: + res.append(process(line)) + if len(res) % 1000 == 0: + end = time.time() + print(f"finish {len(res)} using {end-start}") + start = end + return res + + +if __name__ == '__main__': + workers = args.workers + print(f"Spawning {workers} processes") + pool = Parallel(n_jobs=workers, verbose=10) + line_num = 0 + + with open(args.input) as inFile: + with open(args.output, 'w') as outFile: + for batch_json in pool([delayed(batch_process)(batch) for batch in batch_file(inFile)]): + for doc_json in batch_json: + line_num = line_num + 1 + if doc_json is not None: + outFile.write(json.dumps(doc_json) + '\n') + else: + print(f"Ignoring misformatted line {line_num}") + + if line_num % 10000 == 0: + print(f"Processed {line_num} passages") + + print(f"Processed {line_num} passages") From 4089fee31327269ac1d4d876cf8edc6a5197cb67 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Sun, 1 May 2022 14:58:23 -0400 Subject: [PATCH 04/12] runable compare domains --- scripts/beir/compare_domains.py | 127 +++++++++++++++++++++++++++ scripts/beir/index_bm25.sh | 10 +++ scripts/beir/test_compare_domains.sh | 3 + scripts/beir/tokenize_corpus.py | 8 +- scripts/beir/tokenize_corpus.sh | 33 +++++++ scripts/beir/tokenize_queries.py | 8 +- scripts/beir/tokenize_queries.sh | 38 ++++++++ 7 files changed, 219 insertions(+), 8 deletions(-) create mode 100644 scripts/beir/compare_domains.py create mode 100644 scripts/beir/index_bm25.sh create mode 100644 scripts/beir/test_compare_domains.sh create mode 100644 scripts/beir/tokenize_corpus.sh create mode 100644 scripts/beir/tokenize_queries.sh diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py new file mode 100644 index 000000000..95c55ceec --- /dev/null +++ b/scripts/beir/compare_domains.py @@ -0,0 +1,127 @@ +import argparse +from pyserini.index.lucene import IndexReader + + +def index2stats(index_path): + index_reader = IndexReader(index_path) + + terms = index_reader.terms() + + cf_dict = {} + df_dict = {} + for t in terms: + txt = t.term + df = t.df + cf = t.cf + cf_dict[txt] = int(cf) + df_dict[txt] = int(df) + + return cf_dict, df_dict, index_reader.stats() + +def count_total(d): + s = 0 + for t in d: + s += d[t] + return s + + +def jaccard(d1, d2): + ret = (float(len(set(d1).intersection(set(d2)))) / + float(len(set(d1).union(set(d2))))) + return ret + +def weighted_jaccard(d1, d2): + term_union = set(d1).union(set(d2)) + min_sum = max_sum = 0 + for t in term_union: + if t not in d1: + max_sum += d2[t] + elif t not in d2: + max_sum += d1[t] + else: + min_sum += min(d1[t], d2[t]) + max_sum += max(d1[t], d2[t]) + ret = float(min_sum) / float(max_sum) + return ret + +def cf2freq(d): + total = count_total(d) + new_d = {} + for t in d: + new_d[t] = float(d[t]) / float(total) + return new_d + +def df2idf(d, n): + total = n + new_d = {} + for t in d: + new_d[t] = float(n) / float(d[t]) + return new_d + +def filter_freq_dict(freq_d, threshold=0.0001): + new_d = {} + for t in freq_d: + if freq_d[t] > threshold: + new_d[t] = freq_d[t] + return new_d + +def print_results(datasets, results, save_file): + f = open(save_file, 'w') + + f.write("\t{}\n".format("\t".join(datasets))) + for d1 in datasets: + f.write(d1) + for d2 in datasets: + f.write("\t{:.4f}".format(results[d1][d2])) + f.write("\n") + f.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--index_path', type=str, help='path to indexes of all the beir dataset', required=True) + parser.add_argument('--index_name_format', type=str, help='define your own index dir path name', default="/lucene-index-beir-{}") + parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter', default="weight_jaccard") + parser.add_argument('--compare_threshold', type=float, help='when choosing df_filter, or tf_filter, you can choolse the threshold', default=0.0001) + parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True) + args = parser.parse_args() + + #beir_datasets = ['arguana', 'bioasq', 'climate-fever', 'dbpedia-entity', 'fever', 'fiqa', 'hotpotqa', 'nfcorpus', 'nq', 'quora', 'robust04', 'scidocs', 'scifact', 'signal1m', 'trec-covid', 'trec-news', 'webis-touche2020'] + beir_datasets = ['arguana', 'fiqa'] + cfs = dfs = stats = {} + for d in beir_datasets: + cf, df, stat = index2stats(args.index_path + args.index_name_format.format(d)) + cfs[d] = cf + dfs[d] = df + stat[d] = stat + + results = {} + for d1 in beir_datasets: + metric_d1 = {} + for d2 in beir_datasets: + if d1 == d2: + metric_d1[d2] = 1 + else: + if args.compare_metric == "jaccard": + metric_d1[d2] = jaccard(cfs[d1], cfs[d2]) + elif args.compare_metric == "weight_jaccard": + metric_d1[d2] = weighted_jaccard(cfs[d1], cfs[d2]) + elif args.compare_metric == "df_filter": + new_d1 = filter_freq_dict(cf2freq(cfs[d1])) + new_d2 = filter_freq_dict(cf2freq(cfs[d2])) + metric_d1[d2] = jaccard(new_d1, new_d2) + elif args.compare_metric == "tf_filter": + new_d1 = filter_freq_dict(df2idf(dfs[d1], stat[d1]['documents'])) + new_d2 = filter_freq_dict(df2idf(dfs[d2], stat[d2]['documents'])) + metric_d1[d2] = jaccard(new_d1, new_d2) + results[d1] = metric_d1 + + print_results(beir_datasets, results, args.output_path) + + + + + + + + diff --git a/scripts/beir/index_bm25.sh b/scripts/beir/index_bm25.sh new file mode 100644 index 000000000..d03e36b74 --- /dev/null +++ b/scripts/beir/index_bm25.sh @@ -0,0 +1,10 @@ +mkdir -p indexes + +for corpora in arguana bioasq climate-fever dbpedia-entity fever hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020 +#for corpora in fiqa +do + +python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \ + -threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/corpus \ + -index indexes/lucene-index-beir-${corpora} -storePositions -storeDocvectors -storeRaw +done diff --git a/scripts/beir/test_compare_domains.sh b/scripts/beir/test_compare_domains.sh new file mode 100644 index 000000000..17f442b3a --- /dev/null +++ b/scripts/beir/test_compare_domains.sh @@ -0,0 +1,3 @@ +python compare_domains.py \ + --index_path indexes \ + --output_path indexes_weight_jaccard diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py index 5bfb7bdc8..abe8352e9 100644 --- a/scripts/beir/tokenize_corpus.py +++ b/scripts/beir/tokenize_corpus.py @@ -71,16 +71,16 @@ def process(line): title = json_line['title'] body = json_line['text'] - doc = {"_id": pid, + doc = {"id": pid, "title": get_retokenized(bert_tokenizer, title.lower()), - "text": get_retokenized(bert_tokenizer, body.lower())} + "contents": get_retokenized(bert_tokenizer, body.lower())} return doc res = [] start = time.time() for line in batch: res.append(process(line)) - if len(res) % 1000 == 0: + if len(res) % 100000 == 0: end = time.time() print(f"finish {len(res)} using {end-start}") start = end @@ -103,7 +103,7 @@ def process(line): else: print(f"Ignoring misformatted line {line_num}") - if line_num % 100 == 0: + if line_num % 10000 == 0: print(f"Processed {line_num} passages") print(f"Processed {line_num} passages") diff --git a/scripts/beir/tokenize_corpus.sh b/scripts/beir/tokenize_corpus.sh new file mode 100644 index 000000000..7a3064741 --- /dev/null +++ b/scripts/beir/tokenize_corpus.sh @@ -0,0 +1,33 @@ +for corpora in arguana bioasq climate-fever dbpedia-entity fever fiqa hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020 +do +mkdir -p /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/ +python tokenize_corpus.py \ + --input /store/collections/beir-v1.0.0/original/${corpora}/corpus.jsonl \ + --output /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/corpus.jsonl + +python tokenize_queries.py \ + --input /store/collections/beir-v1.0.0/original/${corpora}/queries.jsonl \ + --output /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/queries.jsonl +done + +for corpora in android english gaming gis mathematica physics programmers stats tex unix webmasters wordpress +do +mkdir -p /store/scratch/y247xie/00_data/wp-tokenized-anserini/cqadupstack/${corpora}/ +python tokenize_corpus.py \ + --input /store/collections/beir-v1.0.0/original/cqadupstack/${corpora}/corpus.jsonl \ + --output /store/scratch/y247xie/00_data/wp-tokenized-anserini/cqadupstack/${corpora}/corpus.jsonl +done + +for corpora in nq +do +mkdir -p /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/ +python tokenize_corpus.py \ + --input /store/scratch/y247xie/00_data/nq/corpus.jsonl \ + --output /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/corpus.jsonl + +python tokenize_queries.py \ + --input /store/scratch/y247xie/00_data/nq/queries.jsonl \ + --output /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/queries.jsonl + +done + diff --git a/scripts/beir/tokenize_queries.py b/scripts/beir/tokenize_queries.py index 583c44ffd..02ab3e662 100644 --- a/scripts/beir/tokenize_queries.py +++ b/scripts/beir/tokenize_queries.py @@ -68,12 +68,12 @@ def process(line): return None json_line = json.loads(line) pid = json_line['_id'] - title = json_line['title'] body = json_line['text'] + metadata = json_line['metadata'] - doc = {"_id": pid, - "title": get_retokenized(bert_tokenizer, title.lower()), - "text": get_retokenized(bert_tokenizer, body.lower())} + doc = {"id": pid, + "contents": get_retokenized(bert_tokenizer, body.lower()), + "metadata":metadata, } return doc res = [] diff --git a/scripts/beir/tokenize_queries.sh b/scripts/beir/tokenize_queries.sh new file mode 100644 index 000000000..630b32ef4 --- /dev/null +++ b/scripts/beir/tokenize_queries.sh @@ -0,0 +1,38 @@ +for corpora in arguana bioasq climate-fever dbpedia-entity fever fiqa hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020 +do + +#for corpora in android english gaming gis mathematica physics programmers stats tex unix webmasters wordpress +#do +#for corpora in nq +#do +#mkdir -p /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/ +#mkdir -p /store/scratch/y247xie/00_data/wp-tokenized/cqadupstack/${corpora} +#python tokenize_corpus.py \ +# --input /store/collections/beir-v1.0.0/original/${corpora}/corpus.jsonl \ +# --output /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/corpus.jsonl +#python tokenize_corpus.py \ +# --input /store/collections/beir-v1.0.0/original/cqadupstack/${corpora}/corpus.jsonl \ +# --output /store/scratch/y247xie/00_data/wp-tokenized/cqadupstack/${corpora}/corpus.jsonl +python tokenize_queries.py \ + --input /store/collections/beir-v1.0.0/original/${corpora}/queries.jsonl \ + --output /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/queries.jsonl + +done + +for corpora in android english gaming gis mathematica physics programmers stats tex unix webmasters wordpress +do +python tokenize_queries.py \ + --input /store/collections/beir-v1.0.0/original/cqadupstack/${corpora}/queries.jsonl \ + --output /store/scratch/y247xie/00_data/wp-tokenized/cqadupstack/${corpora}/queries.jsonl +done + +for corpora in nq +do +python tokenize_queries.py \ + --input /store/scratch/y247xie/00_data/nq/queries.jsonl \ + --output /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/queries.jsonl +done + + + + From 6bae387be013d15f8ca4fe1fb803bf62e818c760 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Mon, 2 May 2022 12:15:49 -0400 Subject: [PATCH 05/12] test all the metrics --- scripts/beir/compare_domains.py | 8 ++++---- scripts/beir/test_compare_domains.sh | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py index 95c55ceec..c6cc94739 100644 --- a/scripts/beir/compare_domains.py +++ b/scripts/beir/compare_domains.py @@ -86,8 +86,8 @@ def print_results(datasets, results, save_file): parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True) args = parser.parse_args() - #beir_datasets = ['arguana', 'bioasq', 'climate-fever', 'dbpedia-entity', 'fever', 'fiqa', 'hotpotqa', 'nfcorpus', 'nq', 'quora', 'robust04', 'scidocs', 'scifact', 'signal1m', 'trec-covid', 'trec-news', 'webis-touche2020'] - beir_datasets = ['arguana', 'fiqa'] + beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq', 'hotpotqa', 'climate-fever', 'fever', 'dbpedia-entity', 'fiqa', 'signal1m', 'trec-news', 'robust04', 'arguana', 'webis-touche2020', 'quora', 'cqadupstack', 'scidocs', 'scifact'] + #beir_datasets = ['arguana', 'fiqa'] cfs = dfs = stats = {} for d in beir_datasets: cf, df, stat = index2stats(args.index_path + args.index_name_format.format(d)) @@ -111,8 +111,8 @@ def print_results(datasets, results, save_file): new_d2 = filter_freq_dict(cf2freq(cfs[d2])) metric_d1[d2] = jaccard(new_d1, new_d2) elif args.compare_metric == "tf_filter": - new_d1 = filter_freq_dict(df2idf(dfs[d1], stat[d1]['documents'])) - new_d2 = filter_freq_dict(df2idf(dfs[d2], stat[d2]['documents'])) + new_d1 = filter_freq_dict(df2idf(dfs[d1], 1)) + new_d2 = filter_freq_dict(df2idf(dfs[d2], 1)) metric_d1[d2] = jaccard(new_d1, new_d2) results[d1] = metric_d1 diff --git a/scripts/beir/test_compare_domains.sh b/scripts/beir/test_compare_domains.sh index 17f442b3a..ce4bdfc1b 100644 --- a/scripts/beir/test_compare_domains.sh +++ b/scripts/beir/test_compare_domains.sh @@ -1,3 +1,4 @@ python compare_domains.py \ --index_path indexes \ - --output_path indexes_weight_jaccard + --output_path indexes_df_filter.tsv \ + --compare_metric df_filter From c7eb404cde7b323921cc852f83a3e03f61e89a5d Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Mon, 2 May 2022 12:21:01 -0400 Subject: [PATCH 06/12] use beir format --- scripts/beir/tokenize_corpus.py | 4 ++-- scripts/beir/tokenize_queries.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py index abe8352e9..b7439ffbc 100644 --- a/scripts/beir/tokenize_corpus.py +++ b/scripts/beir/tokenize_corpus.py @@ -71,9 +71,9 @@ def process(line): title = json_line['title'] body = json_line['text'] - doc = {"id": pid, + doc = {"_id": pid, "title": get_retokenized(bert_tokenizer, title.lower()), - "contents": get_retokenized(bert_tokenizer, body.lower())} + "text": get_retokenized(bert_tokenizer, body.lower())} return doc res = [] diff --git a/scripts/beir/tokenize_queries.py b/scripts/beir/tokenize_queries.py index 02ab3e662..bc47df394 100644 --- a/scripts/beir/tokenize_queries.py +++ b/scripts/beir/tokenize_queries.py @@ -71,8 +71,8 @@ def process(line): body = json_line['text'] metadata = json_line['metadata'] - doc = {"id": pid, - "contents": get_retokenized(bert_tokenizer, body.lower()), + doc = {"_id": pid, + "text": get_retokenized(bert_tokenizer, body.lower()), "metadata":metadata, } return doc From 271ac2a2cdc5029396e71327adfe947a76b4d67e Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Mon, 2 May 2022 12:23:37 -0400 Subject: [PATCH 07/12] update with BeirFlatCollection --- scripts/beir/index_bm25.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/beir/index_bm25.sh b/scripts/beir/index_bm25.sh index d03e36b74..80b90efa4 100644 --- a/scripts/beir/index_bm25.sh +++ b/scripts/beir/index_bm25.sh @@ -1,10 +1,12 @@ -mkdir -p indexes +#mkdir -p indexes -for corpora in arguana bioasq climate-fever dbpedia-entity fever hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020 -#for corpora in fiqa -do +#for corpora in arguana bioasq climate-fever dbpedia-entity fever hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020 fiqa nq +#do -python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \ - -threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/corpus \ - -index indexes/lucene-index-beir-${corpora} -storePositions -storeDocvectors -storeRaw +#for corpora in android english gaming gis mathematica physics programmers stats tex unix webmasters wordpress +for corpora in fiqa +do +python -m pyserini.index -collection BeirFlatCollection -generator DefaultLuceneDocumentGenerator \ + -threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/corpus \ + -index indexes/lucene-index-beir-${corpora}_ -storePositions -storeDocvectors -storeRaw -pretokenized done From 7ece3468b6d4593f21ab85fd9b2dd9dcf203b25e Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Tue, 10 May 2022 17:04:20 -0400 Subject: [PATCH 08/12] add kl-divergency and msmarco --- scripts/beir/compare_domains.py | 20 ++++++++++++++++---- scripts/beir/index_bm25.sh | 9 +++++---- scripts/beir/indexes_df_filter.tsv | 20 ++++++++++++++++++++ scripts/beir/indexes_kl_divergence.tsv | 20 ++++++++++++++++++++ scripts/beir/indexes_tf_filter.tsv | 20 ++++++++++++++++++++ scripts/beir/indexes_weight_jaccard.tsv | 20 ++++++++++++++++++++ scripts/beir/test_compare_domains.sh | 7 +++++-- 7 files changed, 106 insertions(+), 10 deletions(-) create mode 100644 scripts/beir/indexes_df_filter.tsv create mode 100644 scripts/beir/indexes_kl_divergence.tsv create mode 100644 scripts/beir/indexes_tf_filter.tsv create mode 100644 scripts/beir/indexes_weight_jaccard.tsv diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py index c6cc94739..42c99128f 100644 --- a/scripts/beir/compare_domains.py +++ b/scripts/beir/compare_domains.py @@ -1,4 +1,6 @@ import argparse +import numpy as np + from pyserini.index.lucene import IndexReader @@ -24,6 +26,12 @@ def count_total(d): s += d[t] return s +def kl_divergence(d1, d2): + value = float(0) + for w in d1: + if w in d2: + value += d1[w] * np.log(d1[w] / d2[w]) + return value def jaccard(d1, d2): ret = (float(len(set(d1).intersection(set(d2)))) / @@ -81,18 +89,18 @@ def print_results(datasets, results, save_file): parser = argparse.ArgumentParser() parser.add_argument('--index_path', type=str, help='path to indexes of all the beir dataset', required=True) parser.add_argument('--index_name_format', type=str, help='define your own index dir path name', default="/lucene-index-beir-{}") - parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter', default="weight_jaccard") + parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter, kl_divergence', default="weight_jaccard") parser.add_argument('--compare_threshold', type=float, help='when choosing df_filter, or tf_filter, you can choolse the threshold', default=0.0001) parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True) args = parser.parse_args() - beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq', 'hotpotqa', 'climate-fever', 'fever', 'dbpedia-entity', 'fiqa', 'signal1m', 'trec-news', 'robust04', 'arguana', 'webis-touche2020', 'quora', 'cqadupstack', 'scidocs', 'scifact'] + beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq', 'hotpotqa', 'climate-fever', 'fever', 'dbpedia-entity', 'fiqa', 'signal1m', 'trec-news', 'robust04', 'arguana', 'webis-touche2020', 'quora', 'cqadupstack', 'scidocs', 'scifact', 'msmarco'] #beir_datasets = ['arguana', 'fiqa'] cfs = dfs = stats = {} for d in beir_datasets: cf, df, stat = index2stats(args.index_path + args.index_name_format.format(d)) - cfs[d] = cf - dfs[d] = df + cfs[d] = cf # count frequency -- int + dfs[d] = df # document frequency -- int stat[d] = stat results = {} @@ -114,6 +122,10 @@ def print_results(datasets, results, save_file): new_d1 = filter_freq_dict(df2idf(dfs[d1], 1)) new_d2 = filter_freq_dict(df2idf(dfs[d2], 1)) metric_d1[d2] = jaccard(new_d1, new_d2) + elif args.compare_metric == "kl_divergence": + new_d1 = filter_freq_dict(cf2freq(cfs[d1])) + new_d2 = filter_freq_dict(cf2freq(cfs[d2])) + metric_d1[d2] = kl_divergence(new_d1, new_d2) results[d1] = metric_d1 print_results(beir_datasets, results, args.output_path) diff --git a/scripts/beir/index_bm25.sh b/scripts/beir/index_bm25.sh index 80b90efa4..f46598388 100644 --- a/scripts/beir/index_bm25.sh +++ b/scripts/beir/index_bm25.sh @@ -4,9 +4,10 @@ #do #for corpora in android english gaming gis mathematica physics programmers stats tex unix webmasters wordpress -for corpora in fiqa +#for corpora in fiqa # BeirFlatCollection +for corpora in msmarco do -python -m pyserini.index -collection BeirFlatCollection -generator DefaultLuceneDocumentGenerator \ - -threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/corpus \ - -index indexes/lucene-index-beir-${corpora}_ -storePositions -storeDocvectors -storeRaw -pretokenized +python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \ + -threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized/${corpora} \ + -index indexes/lucene-index-beir-${corpora} -storePositions -storeDocvectors -storeRaw -pretokenized done diff --git a/scripts/beir/indexes_df_filter.tsv b/scripts/beir/indexes_df_filter.tsv new file mode 100644 index 000000000..0a408a408 --- /dev/null +++ b/scripts/beir/indexes_df_filter.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 1.0000 0.6805 0.5963 0.3362 0.2377 0.2603 0.2604 0.2334 0.2451 0.2251 0.2787 0.3180 0.2927 0.2760 0.2149 0.2772 0.4241 0.6367 0.3485 +bioasq 0.6805 1.0000 0.6533 0.3165 0.2267 0.2464 0.2465 0.2242 0.2259 0.2182 0.2541 0.2943 0.2567 0.2586 0.2031 0.2753 0.3823 0.7174 0.3335 +nfcorpus 0.5963 0.6533 1.0000 0.3165 0.2207 0.2400 0.2401 0.2187 0.2338 0.2096 0.2508 0.2908 0.2665 0.2540 0.2047 0.2594 0.3541 0.6115 0.3318 +nq 0.3362 0.3165 0.3165 1.0000 0.4995 0.5694 0.5689 0.4902 0.3926 0.3710 0.5455 0.5439 0.4421 0.4387 0.3717 0.3437 0.3329 0.2937 0.5508 +hotpotqa 0.2377 0.2267 0.2207 0.4995 1.0000 0.8334 0.8340 0.9355 0.2344 0.3446 0.3469 0.3274 0.2709 0.2543 0.2790 0.2427 0.2235 0.2107 0.3767 +climate-fever 0.2603 0.2464 0.2400 0.5694 0.8334 1.0000 0.9993 0.8253 0.2640 0.3549 0.3949 0.3762 0.3064 0.2862 0.2933 0.2677 0.2449 0.2280 0.4122 +fever 0.2604 0.2465 0.2401 0.5689 0.8340 0.9993 1.0000 0.8258 0.2636 0.3544 0.3945 0.3758 0.3060 0.2858 0.2935 0.2673 0.2450 0.2281 0.4117 +dbpedia-entity 0.2334 0.2242 0.2187 0.4902 0.9355 0.8253 0.8258 1.0000 0.2255 0.3425 0.3405 0.3231 0.2635 0.2463 0.2751 0.2402 0.2177 0.2084 0.3691 +fiqa 0.2451 0.2259 0.2338 0.3926 0.2344 0.2640 0.2636 0.2255 1.0000 0.3077 0.4699 0.4494 0.4356 0.5031 0.4116 0.4062 0.3054 0.2202 0.4680 +signal1m 0.2251 0.2182 0.2096 0.3710 0.3446 0.3549 0.3544 0.3425 0.3077 1.0000 0.4079 0.3431 0.2916 0.3211 0.3555 0.3037 0.2081 0.2048 0.3654 +trec-news 0.2787 0.2541 0.2508 0.5455 0.3469 0.3949 0.3945 0.3405 0.4699 0.4079 1.0000 0.5913 0.4754 0.5179 0.3765 0.3568 0.3016 0.2400 0.5027 +robust04 0.3180 0.2943 0.2908 0.5439 0.3274 0.3762 0.3758 0.3231 0.4494 0.3431 0.5913 1.0000 0.4845 0.4528 0.3275 0.3324 0.3199 0.2800 0.4757 +arguana 0.2927 0.2567 0.2665 0.4421 0.2709 0.3064 0.3060 0.2635 0.4356 0.2916 0.4754 0.4845 1.0000 0.5555 0.3461 0.3116 0.3413 0.2539 0.4240 +webis-touche2020 0.2760 0.2586 0.2540 0.4387 0.2543 0.2862 0.2858 0.2463 0.5031 0.3211 0.5179 0.4528 0.5555 1.0000 0.3838 0.3866 0.3364 0.2508 0.4530 +quora 0.2149 0.2031 0.2047 0.3717 0.2790 0.2933 0.2935 0.2751 0.4116 0.3555 0.3765 0.3275 0.3461 0.3838 1.0000 0.3551 0.2515 0.1920 0.4549 +cqadupstack 0.2772 0.2753 0.2594 0.3437 0.2427 0.2677 0.2673 0.2402 0.4062 0.3037 0.3568 0.3324 0.3116 0.3866 0.3551 1.0000 0.3632 0.2598 0.4230 +scidocs 0.4241 0.3823 0.3541 0.3329 0.2235 0.2449 0.2450 0.2177 0.3054 0.2081 0.3016 0.3199 0.3413 0.3364 0.2515 0.3632 1.0000 0.3796 0.3599 +scifact 0.6367 0.7174 0.6115 0.2937 0.2107 0.2280 0.2281 0.2084 0.2202 0.2048 0.2400 0.2800 0.2539 0.2508 0.1920 0.2598 0.3796 1.0000 0.3112 +msmarco 0.3485 0.3335 0.3318 0.5508 0.3767 0.4122 0.4117 0.3691 0.4680 0.3654 0.5027 0.4757 0.4240 0.4530 0.4549 0.4230 0.3599 0.3112 1.0000 diff --git a/scripts/beir/indexes_kl_divergence.tsv b/scripts/beir/indexes_kl_divergence.tsv new file mode 100644 index 000000000..476033474 --- /dev/null +++ b/scripts/beir/indexes_kl_divergence.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 1.0000 0.0448 0.0336 0.1378 0.1141 0.1268 0.1268 0.1026 0.0769 0.1147 0.2565 0.1991 0.0948 0.1706 0.1257 0.0665 0.0856 0.0569 0.1137 +bioasq 0.0609 1.0000 0.0194 0.1555 0.1295 0.1447 0.1447 0.1217 0.0999 0.1345 0.2773 0.2233 0.1397 0.1957 0.1517 0.0833 0.1371 0.0239 0.1279 +nfcorpus 0.1186 0.0863 1.0000 0.1908 0.1527 0.1706 0.1706 0.1435 0.1212 0.1509 0.3080 0.2669 0.1566 0.2258 0.1755 0.1044 0.1849 0.1257 0.1794 +nq 0.1812 0.1576 0.1643 1.0000 0.0616 0.0384 0.0383 0.0815 0.0574 0.1851 0.2639 0.1940 0.0632 0.1516 0.1579 0.0737 0.1527 0.1855 0.0489 +hotpotqa 0.2694 0.2197 0.2561 0.1654 1.0000 0.0588 0.0588 0.0050 0.1574 0.2180 0.4821 0.3698 0.2383 0.3233 0.1839 0.1503 0.2188 0.2563 0.1190 +climate-fever 0.1456 0.1090 0.1328 0.0707 -0.0202 1.0000 -0.0000 -0.0222 0.0586 0.1299 0.3001 0.2246 0.1211 0.1858 0.1071 0.0644 0.1062 0.1335 0.0523 +fever 0.1456 0.1090 0.1328 0.0708 -0.0202 0.0000 1.0000 -0.0222 0.0588 0.1300 0.3002 0.2247 0.1212 0.1858 0.1071 0.0645 0.1062 0.1335 0.0523 +dbpedia-entity 0.2553 0.2071 0.2423 0.1618 0.0013 0.0604 0.0604 1.0000 0.1551 0.2109 0.4672 0.3603 0.2314 0.3199 0.1803 0.1449 0.2111 0.2426 0.1218 +fiqa 0.3243 0.3136 0.3265 0.3440 0.3525 0.3592 0.3582 0.3496 1.0000 0.3400 0.4209 0.4279 0.2379 0.2018 0.2046 0.1204 0.3096 0.3201 0.2314 +signal1m 0.3184 0.2348 0.2533 0.2131 0.2702 0.2878 0.2878 0.2469 0.2958 1.0000 0.4628 0.2995 0.2113 0.3399 0.1300 0.2227 0.3798 0.2558 0.1895 +trec-news 0.0620 0.0436 0.0422 0.0681 0.0835 0.0679 0.0677 0.0795 -0.0693 0.1217 1.0000 0.0441 -0.0241 -0.0209 0.0574 -0.0204 0.0600 0.0454 0.0254 +robust04 0.0416 0.0256 0.0194 0.0370 0.0584 0.0599 0.0595 0.0594 -0.0747 0.0811 0.0656 1.0000 -0.0266 -0.0126 0.0488 -0.0377 0.0326 0.0366 0.0053 +arguana 0.2747 0.2292 0.2534 0.2178 0.2459 0.2374 0.2373 0.2517 0.0637 0.2840 0.3115 0.2616 1.0000 0.1559 0.1798 0.1154 0.2201 0.2417 0.1416 +webis-touche2020 0.1759 0.1677 0.1734 0.1959 0.2135 0.2161 0.2161 0.2093 -0.0209 0.2110 0.2116 0.2317 0.0718 1.0000 0.1017 0.0445 0.1768 0.1711 0.1255 +quora 0.6807 0.9262 0.7167 1.0242 0.5189 0.6385 0.6385 0.5243 0.5093 0.7571 0.9984 1.0744 0.7696 0.6971 1.0000 0.4465 1.0338 0.6816 0.7129 +cqadupstack 0.3405 0.3268 0.3328 0.3428 0.3404 0.3548 0.3544 0.3385 0.1287 0.3009 0.4360 0.4501 0.2323 0.2593 0.1877 1.0000 0.3545 0.3202 0.2640 +scidocs 0.1595 0.1440 0.1419 0.1746 0.1613 0.1796 0.1796 0.1526 0.1121 0.1258 0.3094 0.2685 0.1335 0.1828 0.1601 0.1383 1.0000 0.1584 0.1128 +scifact 0.0915 0.0563 0.0574 0.1600 0.1297 0.1442 0.1442 0.1215 0.0911 0.1241 0.2662 0.2197 0.1328 0.1859 0.1368 0.0746 0.1293 1.0000 0.1346 +msmarco 0.2116 0.1982 0.2083 0.1484 0.1556 0.1780 0.1779 0.1526 0.0356 0.1836 0.3661 0.3101 0.1353 0.1968 0.1138 0.0810 0.1784 0.2204 1.0000 diff --git a/scripts/beir/indexes_tf_filter.tsv b/scripts/beir/indexes_tf_filter.tsv new file mode 100644 index 000000000..957e1a473 --- /dev/null +++ b/scripts/beir/indexes_tf_filter.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 1.0000 0.5225 0.5367 0.7852 0.7695 0.6963 0.6962 0.7722 0.8216 0.8511 0.7654 0.7957 0.7335 0.8538 0.8450 0.8544 0.8116 0.5629 0.6933 +bioasq 0.5225 1.0000 0.1540 0.6365 0.6223 0.6332 0.6332 0.6220 0.4986 0.6195 0.6504 0.6609 0.4141 0.6220 0.5379 0.5949 0.4224 0.1639 0.7079 +nfcorpus 0.5367 0.1540 1.0000 0.3979 0.3977 0.3425 0.3424 0.3994 0.5425 0.4687 0.3922 0.4194 0.5547 0.4648 0.5216 0.4705 0.5858 0.7047 0.3258 +nq 0.7852 0.6365 0.3979 1.0000 0.9263 0.8870 0.8869 0.9260 0.7417 0.8924 0.8971 0.8946 0.6171 0.8980 0.8011 0.8941 0.6824 0.4205 0.8288 +hotpotqa 0.7695 0.6223 0.3977 0.9263 1.0000 0.9088 0.9086 0.9868 0.7290 0.8748 0.8540 0.8523 0.6093 0.8689 0.7865 0.8719 0.6740 0.4207 0.8053 +climate-fever 0.6963 0.6332 0.3425 0.8870 0.9088 1.0000 0.9998 0.9053 0.6556 0.8019 0.8279 0.8134 0.5391 0.8035 0.7134 0.7997 0.6032 0.3634 0.8089 +fever 0.6962 0.6332 0.3424 0.8869 0.9086 0.9998 1.0000 0.9051 0.6556 0.8019 0.8277 0.8135 0.5392 0.8034 0.7133 0.7996 0.6031 0.3633 0.8090 +dbpedia-entity 0.7722 0.6220 0.3994 0.9260 0.9868 0.9053 0.9051 1.0000 0.7312 0.8771 0.8534 0.8530 0.6119 0.8693 0.7896 0.8735 0.6768 0.4225 0.8040 +fiqa 0.8216 0.4986 0.5425 0.7417 0.7290 0.6556 0.6556 0.7312 1.0000 0.8267 0.7301 0.7637 0.7555 0.8191 0.8492 0.8219 0.7839 0.5677 0.6590 +signal1m 0.8511 0.6195 0.4687 0.8924 0.8748 0.8019 0.8019 0.8771 0.8267 1.0000 0.8723 0.8941 0.7018 0.9403 0.8771 0.9303 0.7535 0.4918 0.7968 +trec-news 0.7654 0.6504 0.3922 0.8971 0.8540 0.8279 0.8277 0.8534 0.7301 0.8723 1.0000 0.9119 0.6057 0.8787 0.7784 0.8435 0.6608 0.4139 0.8578 +robust04 0.7957 0.6609 0.4194 0.8946 0.8523 0.8134 0.8135 0.8530 0.7637 0.8941 0.9119 1.0000 0.6504 0.9016 0.8087 0.8625 0.6967 0.4421 0.8584 +arguana 0.7335 0.4141 0.5547 0.6171 0.6093 0.5391 0.5392 0.6119 0.7555 0.7018 0.6057 0.6504 1.0000 0.6962 0.7429 0.6963 0.7329 0.5798 0.5498 +webis-touche2020 0.8538 0.6220 0.4648 0.8980 0.8689 0.8035 0.8034 0.8693 0.8191 0.9403 0.8787 0.9016 0.6962 1.0000 0.8729 0.9344 0.7545 0.4876 0.8016 +quora 0.8450 0.5379 0.5216 0.8011 0.7865 0.7134 0.7133 0.7896 0.8492 0.8771 0.7784 0.8087 0.7429 0.8729 1.0000 0.8732 0.7931 0.5468 0.7044 +cqadupstack 0.8544 0.5949 0.4705 0.8941 0.8719 0.7997 0.7996 0.8735 0.8219 0.9303 0.8435 0.8625 0.6963 0.9344 0.8732 1.0000 0.7626 0.4936 0.7674 +scidocs 0.8116 0.4224 0.5858 0.6824 0.6740 0.6032 0.6031 0.6768 0.7839 0.7535 0.6608 0.6967 0.7329 0.7545 0.7931 0.7626 1.0000 0.6193 0.5921 +scifact 0.5629 0.1639 0.7047 0.4205 0.4207 0.3634 0.3633 0.4225 0.5677 0.4918 0.4139 0.4421 0.5798 0.4876 0.5468 0.4936 0.6193 1.0000 0.3492 +msmarco 0.6933 0.7079 0.3258 0.8288 0.8053 0.8089 0.8090 0.8040 0.6590 0.7968 0.8578 0.8584 0.5498 0.8016 0.7044 0.7674 0.5921 0.3492 1.0000 diff --git a/scripts/beir/indexes_weight_jaccard.tsv b/scripts/beir/indexes_weight_jaccard.tsv new file mode 100644 index 000000000..113587c1d --- /dev/null +++ b/scripts/beir/indexes_weight_jaccard.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 1.0000 0.0089 0.0303 0.1014 0.0750 0.0503 0.0503 0.0764 0.1929 0.1872 0.0872 0.1254 0.0505 0.2383 0.1906 0.2560 0.1527 0.0416 0.0442 +bioasq 0.0089 1.0000 0.0003 0.0728 0.0831 0.1248 0.1248 0.0804 0.0025 0.0240 0.0719 0.0542 0.0005 0.0244 0.0032 0.0192 0.0014 0.0004 0.1717 +nfcorpus 0.0303 0.0003 1.0000 0.0033 0.0026 0.0016 0.0016 0.0027 0.0860 0.0093 0.0030 0.0044 0.2769 0.0101 0.0691 0.0127 0.1684 0.5651 0.0014 +nq 0.1014 0.0728 0.0033 1.0000 0.5244 0.4535 0.4535 0.5248 0.0312 0.2665 0.5252 0.5363 0.0059 0.2781 0.0353 0.2028 0.0175 0.0046 0.3755 +hotpotqa 0.0750 0.0831 0.0026 0.5244 1.0000 0.6130 0.6130 0.9267 0.0237 0.2075 0.3535 0.3389 0.0046 0.1766 0.0263 0.1429 0.0136 0.0036 0.3808 +climate-fever 0.0503 0.1248 0.0016 0.4535 0.6130 1.0000 1.0000 0.5920 0.0152 0.1429 0.3620 0.3013 0.0028 0.1364 0.0172 0.1042 0.0085 0.0022 0.4879 +fever 0.0503 0.1248 0.0016 0.4535 0.6130 1.0000 1.0000 0.5921 0.0152 0.1429 0.3620 0.3013 0.0028 0.1364 0.0172 0.1042 0.0085 0.0022 0.4879 +dbpedia-entity 0.0764 0.0804 0.0027 0.5248 0.9267 0.5920 0.5921 1.0000 0.0244 0.2132 0.3528 0.3412 0.0047 0.1776 0.0271 0.1471 0.0140 0.0037 0.3719 +fiqa 0.1929 0.0025 0.0860 0.0312 0.0237 0.0152 0.0152 0.0244 1.0000 0.0878 0.0277 0.0411 0.1710 0.0938 0.4108 0.1154 0.3184 0.1097 0.0128 +signal1m 0.1872 0.0240 0.0093 0.2665 0.2075 0.1429 0.1429 0.2132 0.0878 1.0000 0.2393 0.3082 0.0170 0.3544 0.1020 0.3399 0.0439 0.0123 0.1260 +trec-news 0.0872 0.0719 0.0030 0.5252 0.3535 0.3620 0.3620 0.3528 0.0277 0.2393 1.0000 0.5535 0.0052 0.2749 0.0335 0.1939 0.0154 0.0040 0.3720 +robust04 0.1254 0.0542 0.0044 0.5363 0.3389 0.3013 0.3013 0.3412 0.0411 0.3082 0.5535 1.0000 0.0076 0.3681 0.0467 0.2616 0.0228 0.0060 0.2788 +arguana 0.0505 0.0005 0.2769 0.0059 0.0046 0.0028 0.0028 0.0047 0.1710 0.0170 0.0052 0.0076 1.0000 0.0179 0.1292 0.0225 0.2531 0.3048 0.0024 +webis-touche2020 0.2383 0.0244 0.0101 0.2781 0.1766 0.1364 0.1364 0.1776 0.0938 0.3544 0.2749 0.3681 0.0179 1.0000 0.1087 0.4355 0.0514 0.0137 0.1290 +quora 0.1906 0.0032 0.0691 0.0353 0.0263 0.0172 0.0172 0.0271 0.4108 0.1020 0.0335 0.0467 0.1292 0.1087 1.0000 0.1418 0.2441 0.0885 0.0158 +cqadupstack 0.2560 0.0192 0.0127 0.2028 0.1429 0.1042 0.1042 0.1471 0.1154 0.3399 0.1939 0.2616 0.0225 0.4355 0.1418 1.0000 0.0672 0.0171 0.0996 +scidocs 0.1527 0.0014 0.1684 0.0175 0.0136 0.0085 0.0085 0.0140 0.3184 0.0439 0.0154 0.0228 0.2531 0.0514 0.2441 0.0672 1.0000 0.2217 0.0072 +scifact 0.0416 0.0004 0.5651 0.0046 0.0036 0.0022 0.0022 0.0037 0.1097 0.0123 0.0040 0.0060 0.3048 0.0137 0.0885 0.0171 0.2217 1.0000 0.0019 +msmarco 0.0442 0.1717 0.0014 0.3755 0.3808 0.4879 0.4879 0.3719 0.0128 0.1260 0.3720 0.2788 0.0024 0.1290 0.0158 0.0996 0.0072 0.0019 1.0000 diff --git a/scripts/beir/test_compare_domains.sh b/scripts/beir/test_compare_domains.sh index ce4bdfc1b..e7ea8be5c 100644 --- a/scripts/beir/test_compare_domains.sh +++ b/scripts/beir/test_compare_domains.sh @@ -1,4 +1,7 @@ +for metric in weight_jaccard kl_divergence tf_filter df_filter +do python compare_domains.py \ --index_path indexes \ - --output_path indexes_df_filter.tsv \ - --compare_metric df_filter + --output_path indexes_${metric}.tsv \ + --compare_metric ${metric} +done From 1bf765faa628d9c3f69c42a28e34f64db1e54b55 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Wed, 11 May 2022 13:21:44 -0400 Subject: [PATCH 09/12] add js divergence, fix weighted_jaccard, add tokenizer option --- scripts/beir/compare_domains.py | 30 +++++++++++++++++-- scripts/beir/indexes_js_divergence.tsv | 20 +++++++++++++ scripts/beir/indexes_weight_jaccard.tsv | 38 ++++++++++++------------- scripts/beir/tokenize_corpus.py | 5 ++-- 4 files changed, 69 insertions(+), 24 deletions(-) create mode 100644 scripts/beir/indexes_js_divergence.tsv diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py index 42c99128f..247854616 100644 --- a/scripts/beir/compare_domains.py +++ b/scripts/beir/compare_domains.py @@ -33,6 +33,19 @@ def kl_divergence(d1, d2): value += d1[w] * np.log(d1[w] / d2[w]) return value +def js_divergence(d1, d2): + mean = {} + for w in d1: + mean[w] = d1[w] * 0.5 + for w in d2: + if w in mean: + mean[w] += d2[w] * 0.5 + else: + mean[w] = d2[w] * 0.5 + + jsd = 0.5 * (kl_divergence(d1, mean) + kl_divergence(d2, mean)) + return jsd + def jaccard(d1, d2): ret = (float(len(set(d1).intersection(set(d2)))) / float(len(set(d1).union(set(d2))))) @@ -89,7 +102,7 @@ def print_results(datasets, results, save_file): parser = argparse.ArgumentParser() parser.add_argument('--index_path', type=str, help='path to indexes of all the beir dataset', required=True) parser.add_argument('--index_name_format', type=str, help='define your own index dir path name', default="/lucene-index-beir-{}") - parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter, kl_divergence', default="weight_jaccard") + parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter, kl_divergence, js_divergence', default="weight_jaccard") parser.add_argument('--compare_threshold', type=float, help='when choosing df_filter, or tf_filter, you can choolse the threshold', default=0.0001) parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True) args = parser.parse_args() @@ -108,12 +121,17 @@ def print_results(datasets, results, save_file): metric_d1 = {} for d2 in beir_datasets: if d1 == d2: - metric_d1[d2] = 1 + if args.compare_metric in ["jaccard", "weight_jaccard", "df_filter", "tf_filter"]: + metric_d1[d2] = 1 + elif args.compare_metric in ["kl_divergence", "js_divergence"]: + metric_d1[d2] = 0 else: if args.compare_metric == "jaccard": metric_d1[d2] = jaccard(cfs[d1], cfs[d2]) elif args.compare_metric == "weight_jaccard": - metric_d1[d2] = weighted_jaccard(cfs[d1], cfs[d2]) + new_d1 = filter_freq_dict(cf2freq(cfs[d1])) + new_d2 = filter_freq_dict(cf2freq(cfs[d2])) + metric_d1[d2] = weighted_jaccard(new_d1, new_d2) elif args.compare_metric == "df_filter": new_d1 = filter_freq_dict(cf2freq(cfs[d1])) new_d2 = filter_freq_dict(cf2freq(cfs[d2])) @@ -126,6 +144,12 @@ def print_results(datasets, results, save_file): new_d1 = filter_freq_dict(cf2freq(cfs[d1])) new_d2 = filter_freq_dict(cf2freq(cfs[d2])) metric_d1[d2] = kl_divergence(new_d1, new_d2) + elif args.compare_metric == "js_divergence": + new_d1 = filter_freq_dict(cf2freq(cfs[d1])) + new_d2 = filter_freq_dict(cf2freq(cfs[d2])) + metric_d1[d2] = js_divergence(new_d1, new_d2) + else: + raise NotImplementedError results[d1] = metric_d1 print_results(beir_datasets, results, args.output_path) diff --git a/scripts/beir/indexes_js_divergence.tsv b/scripts/beir/indexes_js_divergence.tsv new file mode 100644 index 000000000..60a55173c --- /dev/null +++ b/scripts/beir/indexes_js_divergence.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 1.0000 0.0546 0.0839 0.1642 0.2218 0.2017 0.2016 0.2221 0.2382 0.2489 0.2108 0.1854 0.2013 0.2163 0.2808 0.2252 0.1392 0.0701 0.1624 +bioasq 0.0546 1.0000 0.0603 0.1658 0.2205 0.2011 0.2011 0.2198 0.2425 0.2449 0.2189 0.1922 0.2182 0.2217 0.2816 0.2191 0.1519 0.0423 0.1618 +nfcorpus 0.0839 0.0603 1.0000 0.1814 0.2409 0.2211 0.2211 0.2402 0.2556 0.2684 0.2329 0.2050 0.2236 0.2356 0.2987 0.2441 0.1775 0.0782 0.1776 +nq 0.1642 0.1658 0.1814 1.0000 0.0816 0.0633 0.0634 0.0821 0.1366 0.1482 0.0943 0.0854 0.1052 0.1224 0.1754 0.1559 0.1576 0.1849 0.0672 +hotpotqa 0.2218 0.2205 0.2409 0.0816 1.0000 0.0201 0.0201 0.0049 0.2208 0.1691 0.1765 0.1673 0.1931 0.2166 0.2170 0.2207 0.2222 0.2379 0.1214 +climate-fever 0.2017 0.2011 0.2211 0.0633 0.0201 1.0000 0.0000 0.0198 0.1984 0.1606 0.1437 0.1355 0.1701 0.1893 0.2078 0.2005 0.2019 0.2193 0.1060 +fever 0.2016 0.2011 0.2211 0.0634 0.0201 0.0000 1.0000 0.0198 0.1985 0.1606 0.1438 0.1356 0.1702 0.1894 0.2078 0.2006 0.2019 0.2193 0.1061 +dbpedia-entity 0.2221 0.2198 0.2402 0.0821 0.0049 0.0198 0.0198 1.0000 0.2229 0.1685 0.1764 0.1670 0.1943 0.2184 0.2183 0.2201 0.2237 0.2377 0.1218 +fiqa 0.2382 0.2425 0.2556 0.1366 0.2208 0.1984 0.1985 0.2229 1.0000 0.1826 0.1164 0.1233 0.1182 0.0883 0.1501 0.1238 0.2003 0.2568 0.0970 +signal1m 0.2489 0.2449 0.2684 0.1482 0.1691 0.1606 0.1606 0.1685 0.1826 1.0000 0.1560 0.1744 0.2006 0.1852 0.1916 0.1812 0.2486 0.2621 0.1393 +trec-news 0.2108 0.2189 0.2329 0.0943 0.1765 0.1437 0.1438 0.1764 0.1164 0.1560 1.0000 0.0696 0.1098 0.0908 0.1901 0.1623 0.1953 0.2341 0.1108 +robust04 0.1854 0.1922 0.2050 0.0854 0.1673 0.1355 0.1356 0.1670 0.1233 0.1744 0.0696 1.0000 0.1082 0.1147 0.2092 0.1736 0.1809 0.2089 0.1086 +arguana 0.2013 0.2182 0.2236 0.1052 0.1931 0.1701 0.1702 0.1943 0.1182 0.2006 0.1098 0.1082 1.0000 0.0857 0.1902 0.1766 0.1702 0.2274 0.1143 +webis-touche2020 0.2163 0.2217 0.2356 0.1224 0.2166 0.1893 0.1894 0.2184 0.0883 0.1852 0.0908 0.1147 0.0857 1.0000 0.1707 0.1352 0.1846 0.2341 0.1122 +quora 0.2808 0.2816 0.2987 0.1754 0.2170 0.2078 0.2078 0.2183 0.1501 0.1916 0.1901 0.2092 0.1902 0.1707 1.0000 0.1669 0.2471 0.2970 0.1350 +cqadupstack 0.2252 0.2191 0.2441 0.1559 0.2207 0.2005 0.2006 0.2201 0.1238 0.1812 0.1623 0.1736 0.1766 0.1352 0.1669 1.0000 0.1833 0.2358 0.1205 +scidocs 0.1392 0.1519 0.1775 0.1576 0.2222 0.2019 0.2019 0.2237 0.2003 0.2486 0.1953 0.1809 0.1702 0.1846 0.2471 0.1833 1.0000 0.1602 0.1482 +scifact 0.0701 0.0423 0.0782 0.1849 0.2379 0.2193 0.2193 0.2377 0.2568 0.2621 0.2341 0.2089 0.2274 0.2341 0.2970 0.2358 0.1602 1.0000 0.1805 +msmarco 0.1624 0.1618 0.1776 0.0672 0.1214 0.1060 0.1061 0.1218 0.0970 0.1393 0.1108 0.1086 0.1143 0.1122 0.1350 0.1205 0.1482 0.1805 1.0000 diff --git a/scripts/beir/indexes_weight_jaccard.tsv b/scripts/beir/indexes_weight_jaccard.tsv index 113587c1d..29da5e612 100644 --- a/scripts/beir/indexes_weight_jaccard.tsv +++ b/scripts/beir/indexes_weight_jaccard.tsv @@ -1,20 +1,20 @@ trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco -trec-covid 1.0000 0.0089 0.0303 0.1014 0.0750 0.0503 0.0503 0.0764 0.1929 0.1872 0.0872 0.1254 0.0505 0.2383 0.1906 0.2560 0.1527 0.0416 0.0442 -bioasq 0.0089 1.0000 0.0003 0.0728 0.0831 0.1248 0.1248 0.0804 0.0025 0.0240 0.0719 0.0542 0.0005 0.0244 0.0032 0.0192 0.0014 0.0004 0.1717 -nfcorpus 0.0303 0.0003 1.0000 0.0033 0.0026 0.0016 0.0016 0.0027 0.0860 0.0093 0.0030 0.0044 0.2769 0.0101 0.0691 0.0127 0.1684 0.5651 0.0014 -nq 0.1014 0.0728 0.0033 1.0000 0.5244 0.4535 0.4535 0.5248 0.0312 0.2665 0.5252 0.5363 0.0059 0.2781 0.0353 0.2028 0.0175 0.0046 0.3755 -hotpotqa 0.0750 0.0831 0.0026 0.5244 1.0000 0.6130 0.6130 0.9267 0.0237 0.2075 0.3535 0.3389 0.0046 0.1766 0.0263 0.1429 0.0136 0.0036 0.3808 -climate-fever 0.0503 0.1248 0.0016 0.4535 0.6130 1.0000 1.0000 0.5920 0.0152 0.1429 0.3620 0.3013 0.0028 0.1364 0.0172 0.1042 0.0085 0.0022 0.4879 -fever 0.0503 0.1248 0.0016 0.4535 0.6130 1.0000 1.0000 0.5921 0.0152 0.1429 0.3620 0.3013 0.0028 0.1364 0.0172 0.1042 0.0085 0.0022 0.4879 -dbpedia-entity 0.0764 0.0804 0.0027 0.5248 0.9267 0.5920 0.5921 1.0000 0.0244 0.2132 0.3528 0.3412 0.0047 0.1776 0.0271 0.1471 0.0140 0.0037 0.3719 -fiqa 0.1929 0.0025 0.0860 0.0312 0.0237 0.0152 0.0152 0.0244 1.0000 0.0878 0.0277 0.0411 0.1710 0.0938 0.4108 0.1154 0.3184 0.1097 0.0128 -signal1m 0.1872 0.0240 0.0093 0.2665 0.2075 0.1429 0.1429 0.2132 0.0878 1.0000 0.2393 0.3082 0.0170 0.3544 0.1020 0.3399 0.0439 0.0123 0.1260 -trec-news 0.0872 0.0719 0.0030 0.5252 0.3535 0.3620 0.3620 0.3528 0.0277 0.2393 1.0000 0.5535 0.0052 0.2749 0.0335 0.1939 0.0154 0.0040 0.3720 -robust04 0.1254 0.0542 0.0044 0.5363 0.3389 0.3013 0.3013 0.3412 0.0411 0.3082 0.5535 1.0000 0.0076 0.3681 0.0467 0.2616 0.0228 0.0060 0.2788 -arguana 0.0505 0.0005 0.2769 0.0059 0.0046 0.0028 0.0028 0.0047 0.1710 0.0170 0.0052 0.0076 1.0000 0.0179 0.1292 0.0225 0.2531 0.3048 0.0024 -webis-touche2020 0.2383 0.0244 0.0101 0.2781 0.1766 0.1364 0.1364 0.1776 0.0938 0.3544 0.2749 0.3681 0.0179 1.0000 0.1087 0.4355 0.0514 0.0137 0.1290 -quora 0.1906 0.0032 0.0691 0.0353 0.0263 0.0172 0.0172 0.0271 0.4108 0.1020 0.0335 0.0467 0.1292 0.1087 1.0000 0.1418 0.2441 0.0885 0.0158 -cqadupstack 0.2560 0.0192 0.0127 0.2028 0.1429 0.1042 0.1042 0.1471 0.1154 0.3399 0.1939 0.2616 0.0225 0.4355 0.1418 1.0000 0.0672 0.0171 0.0996 -scidocs 0.1527 0.0014 0.1684 0.0175 0.0136 0.0085 0.0085 0.0140 0.3184 0.0439 0.0154 0.0228 0.2531 0.0514 0.2441 0.0672 1.0000 0.2217 0.0072 -scifact 0.0416 0.0004 0.5651 0.0046 0.0036 0.0022 0.0022 0.0037 0.1097 0.0123 0.0040 0.0060 0.3048 0.0137 0.0885 0.0171 0.2217 1.0000 0.0019 -msmarco 0.0442 0.1717 0.0014 0.3755 0.3808 0.4879 0.4879 0.3719 0.0128 0.1260 0.3720 0.2788 0.0024 0.1290 0.0158 0.0996 0.0072 0.0019 1.0000 +trec-covid 1.0000 0.6702 0.5867 0.3619 0.2564 0.2955 0.2955 0.2574 0.2721 0.2390 0.2698 0.3307 0.3119 0.2911 0.2042 0.2993 0.4355 0.6190 0.3553 +bioasq 0.6702 1.0000 0.6530 0.3559 0.2588 0.2973 0.2973 0.2613 0.2633 0.2417 0.2546 0.3170 0.2876 0.2771 0.1991 0.3053 0.4019 0.7019 0.3516 +nfcorpus 0.5867 0.6530 1.0000 0.3325 0.2350 0.2689 0.2689 0.2371 0.2546 0.2178 0.2441 0.3039 0.2827 0.2703 0.1884 0.2767 0.3613 0.5820 0.3299 +nq 0.3619 0.3559 0.3325 1.0000 0.4992 0.5883 0.5882 0.4983 0.4031 0.3688 0.4455 0.4913 0.4459 0.4018 0.3214 0.3668 0.3770 0.3273 0.5709 +hotpotqa 0.2564 0.2588 0.2350 0.4992 1.0000 0.7719 0.7720 0.9361 0.2598 0.3286 0.2760 0.3020 0.2817 0.2387 0.2806 0.2604 0.2573 0.2395 0.4378 +climate-fever 0.2955 0.2973 0.2689 0.5883 0.7719 1.0000 0.9998 0.7741 0.2988 0.3495 0.3382 0.3659 0.3258 0.2803 0.2792 0.2969 0.2969 0.2736 0.4613 +fever 0.2955 0.2973 0.2689 0.5882 0.7720 0.9998 1.0000 0.7742 0.2987 0.3494 0.3381 0.3658 0.3257 0.2802 0.2792 0.2968 0.2970 0.2736 0.4612 +dbpedia-entity 0.2574 0.2613 0.2371 0.4983 0.9361 0.7741 0.7742 1.0000 0.2595 0.3332 0.2758 0.3038 0.2807 0.2372 0.2783 0.2624 0.2566 0.2410 0.4396 +fiqa 0.2721 0.2633 0.2546 0.4031 0.2598 0.2988 0.2987 0.2595 1.0000 0.3152 0.4142 0.4030 0.4821 0.5323 0.3748 0.4754 0.3243 0.2503 0.4781 +signal1m 0.2390 0.2417 0.2178 0.3688 0.3286 0.3495 0.3494 0.3332 0.3152 1.0000 0.3237 0.3063 0.2794 0.2937 0.2944 0.3244 0.2371 0.2291 0.3960 +trec-news 0.2698 0.2546 0.2441 0.4455 0.2760 0.3382 0.3381 0.2758 0.4142 0.3237 1.0000 0.5740 0.4239 0.4964 0.2756 0.3322 0.2825 0.2412 0.4140 +robust04 0.3307 0.3170 0.3039 0.4913 0.3020 0.3659 0.3658 0.3038 0.4030 0.3063 0.5740 1.0000 0.4421 0.4458 0.2602 0.3248 0.3196 0.2954 0.4329 +arguana 0.3119 0.2876 0.2827 0.4459 0.2817 0.3258 0.3257 0.2807 0.4821 0.2794 0.4239 0.4421 1.0000 0.5237 0.3105 0.3709 0.3706 0.2801 0.4409 +webis-touche2020 0.2911 0.2771 0.2703 0.4018 0.2387 0.2803 0.2802 0.2372 0.5323 0.2937 0.4964 0.4458 0.5237 1.0000 0.3264 0.4141 0.3237 0.2693 0.4234 +quora 0.2042 0.1991 0.1884 0.3214 0.2806 0.2792 0.2792 0.2783 0.3748 0.2944 0.2756 0.2602 0.3105 0.3264 1.0000 0.3405 0.2409 0.1884 0.4022 +cqadupstack 0.2993 0.3053 0.2767 0.3668 0.2604 0.2969 0.2968 0.2624 0.4754 0.3244 0.3322 0.3248 0.3709 0.4141 0.3405 1.0000 0.3477 0.2874 0.4328 +scidocs 0.4355 0.4019 0.3613 0.3770 0.2573 0.2969 0.2970 0.2566 0.3243 0.2371 0.2825 0.3196 0.3706 0.3237 0.2409 0.3477 1.0000 0.3946 0.3868 +scifact 0.6190 0.7019 0.5820 0.3273 0.2395 0.2736 0.2736 0.2410 0.2503 0.2291 0.2412 0.2954 0.2801 0.2693 0.1884 0.2874 0.3946 1.0000 0.3270 +msmarco 0.3553 0.3516 0.3299 0.5709 0.4378 0.4613 0.4612 0.4396 0.4781 0.3960 0.4140 0.4329 0.4409 0.4234 0.4022 0.4328 0.3868 0.3270 1.0000 diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py index b7439ffbc..69ecb03fa 100644 --- a/scripts/beir/tokenize_corpus.py +++ b/scripts/beir/tokenize_corpus.py @@ -30,7 +30,8 @@ type=str, required=True) parser.add_argument('--workers', metavar='# of processes', help='# of workers to spawn', type=int, default=multiprocessing.cpu_count() - 2) - +parser.add_argument('--tokenizer', metavar='tokenizer', help='tokenizer', + type=str, default='bert-base-cased') args = parser.parse_args() print(args) @@ -61,7 +62,7 @@ def batch_file(iterable, n=10000): def batch_process(batch): - bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + bert_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) def process(line): if not line: From a9ea6f43495da530fb713e0d76767f17531b493e Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Wed, 11 May 2022 13:22:36 -0400 Subject: [PATCH 10/12] add tokenizer option --- scripts/beir/tokenize_queries.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/beir/tokenize_queries.py b/scripts/beir/tokenize_queries.py index bc47df394..a875d67fd 100644 --- a/scripts/beir/tokenize_queries.py +++ b/scripts/beir/tokenize_queries.py @@ -30,6 +30,8 @@ type=str, required=True) parser.add_argument('--workers', metavar='# of processes', help='# of workers to spawn', type=int, default=multiprocessing.cpu_count() - 2) +parser.add_argument('--tokenizer', metavar='tokenizer', help='tokenizer', + type=str, default='bert-base-cased') args = parser.parse_args() @@ -61,7 +63,7 @@ def batch_file(iterable, n=10000): def batch_process(batch): - bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + bert_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) def process(line): if not line: From a2af11651fbe4830b741d6cd190f182ef1676af5 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Thu, 12 May 2022 15:27:04 -0400 Subject: [PATCH 11/12] fix bug and add query2query query2corpus comparison --- scripts/beir/compare_domains.py | 107 +++++++++++------- scripts/beir/similarities/q2c-df_filter.tsv | 20 ++++ .../beir/similarities/q2c-js_divergence.tsv | 20 ++++ .../beir/similarities/q2c-kl_divergence.tsv | 20 ++++ scripts/beir/similarities/q2c-tf_filter.tsv | 20 ++++ .../beir/similarities/q2c-weight_jaccard.tsv | 20 ++++ scripts/beir/test_compare_domains.sh | 10 +- 7 files changed, 177 insertions(+), 40 deletions(-) create mode 100644 scripts/beir/similarities/q2c-df_filter.tsv create mode 100644 scripts/beir/similarities/q2c-js_divergence.tsv create mode 100644 scripts/beir/similarities/q2c-kl_divergence.tsv create mode 100644 scripts/beir/similarities/q2c-tf_filter.tsv create mode 100644 scripts/beir/similarities/q2c-weight_jaccard.tsv diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py index 247854616..8a41cc67e 100644 --- a/scripts/beir/compare_domains.py +++ b/scripts/beir/compare_domains.py @@ -29,7 +29,7 @@ def count_total(d): def kl_divergence(d1, d2): value = float(0) for w in d1: - if w in d2: + if w in d2: # through out zero tokens for both sets value += d1[w] * np.log(d1[w] / d2[w]) return value @@ -101,55 +101,86 @@ def print_results(datasets, results, save_file): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--index_path', type=str, help='path to indexes of all the beir dataset', required=True) - parser.add_argument('--index_name_format', type=str, help='define your own index dir path name', default="/lucene-index-beir-{}") parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter, kl_divergence, js_divergence', default="weight_jaccard") parser.add_argument('--compare_threshold', type=float, help='when choosing df_filter, or tf_filter, you can choolse the threshold', default=0.0001) parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True) + parser.add_argument('--compare_sets', type=str, default="c2c", help="choose from c2c, q2q, q2c") args = parser.parse_args() + corpus_format = "/corpus/lucene-index-beir-{}" + queries_format = "/queries/lucene-index-beir-queires-{}" + beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq', 'hotpotqa', 'climate-fever', 'fever', 'dbpedia-entity', 'fiqa', 'signal1m', 'trec-news', 'robust04', 'arguana', 'webis-touche2020', 'quora', 'cqadupstack', 'scidocs', 'scifact', 'msmarco'] - #beir_datasets = ['arguana', 'fiqa'] - cfs = dfs = stats = {} - for d in beir_datasets: - cf, df, stat = index2stats(args.index_path + args.index_name_format.format(d)) - cfs[d] = cf # count frequency -- int - dfs[d] = df # document frequency -- int - stat[d] = stat + #beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq'] # Testing + cfs = {} + dfs = {} + summary = {} + cfs2 = {} + dfs2 = {} + summary2 = {} + if args.compare_sets == "c2c": + for d in beir_datasets: + cf, df, stat = index2stats(args.index_path + corpus_format.format(d)) + cfs[d] = cf # count frequency -- int + dfs[d] = df # document frequency -- int + summary[d] = stat +# stats[d] = stat + cfs2 = cfs + dfs2 = dfs + summary2 = summary + elif args.compare_sets == "q2q": + for d in beir_datasets: + cf, df, stat = index2stats(args.index_path + queries_format.format(d)) + cfs[d] = cf + dfs[d] = df + summary[d] = stat + cfs2 = cfs + dfs2 = dfs + dfs2 = dfs + summary2 = summary + elif args.compare_sets == "q2c": + for d in beir_datasets: + cf, df, stat = index2stats(args.index_path + queries_format.format(d)) + cfs[d] = cf + dfs[d] = df + summary[d] = stat + for d in beir_datasets: + cf, df, stat = index2stats(args.index_path + corpus_format.format(d)) + cfs2[d] = cf + dfs2[d] = df + summary2[d] = stat + else: + NotImplementedError("--compare_sets {}".format(args.compare_sets)) + results = {} for d1 in beir_datasets: metric_d1 = {} for d2 in beir_datasets: - if d1 == d2: - if args.compare_metric in ["jaccard", "weight_jaccard", "df_filter", "tf_filter"]: - metric_d1[d2] = 1 - elif args.compare_metric in ["kl_divergence", "js_divergence"]: - metric_d1[d2] = 0 + if args.compare_metric == "jaccard": + metric_d1[d2] = jaccard(cfs[d1], cfs2[d2]) + elif args.compare_metric == "weight_jaccard": + new_d1 = cf2freq(cfs[d1]) + new_d2 = cf2freq(cfs2[d2]) + metric_d1[d2] = weighted_jaccard(new_d1, new_d2) + elif args.compare_metric == "df_filter": + new_d1 = filter_freq_dict(cf2freq(cfs[d1])) + new_d2 = filter_freq_dict(cf2freq(cfs2[d2])) + metric_d1[d2] = jaccard(new_d1, new_d2) + elif args.compare_metric == "tf_filter": + new_d1 = filter_freq_dict(df2idf(dfs[d1], summary[d1]["documents"])) + new_d2 = filter_freq_dict(df2idf(dfs2[d2], summary2[d2]["documents"])) + metric_d1[d2] = jaccard(new_d1, new_d2) + elif args.compare_metric == "kl_divergence": + new_d1 = filter_freq_dict(cf2freq(cfs[d1])) + new_d2 = filter_freq_dict(cf2freq(cfs2[d2])) + metric_d1[d2] = kl_divergence(new_d1, new_d2) + elif args.compare_metric == "js_divergence": + new_d1 = filter_freq_dict(cf2freq(cfs[d1])) + new_d2 = filter_freq_dict(cf2freq(cfs2[d2])) + metric_d1[d2] = js_divergence(new_d1, new_d2) else: - if args.compare_metric == "jaccard": - metric_d1[d2] = jaccard(cfs[d1], cfs[d2]) - elif args.compare_metric == "weight_jaccard": - new_d1 = filter_freq_dict(cf2freq(cfs[d1])) - new_d2 = filter_freq_dict(cf2freq(cfs[d2])) - metric_d1[d2] = weighted_jaccard(new_d1, new_d2) - elif args.compare_metric == "df_filter": - new_d1 = filter_freq_dict(cf2freq(cfs[d1])) - new_d2 = filter_freq_dict(cf2freq(cfs[d2])) - metric_d1[d2] = jaccard(new_d1, new_d2) - elif args.compare_metric == "tf_filter": - new_d1 = filter_freq_dict(df2idf(dfs[d1], 1)) - new_d2 = filter_freq_dict(df2idf(dfs[d2], 1)) - metric_d1[d2] = jaccard(new_d1, new_d2) - elif args.compare_metric == "kl_divergence": - new_d1 = filter_freq_dict(cf2freq(cfs[d1])) - new_d2 = filter_freq_dict(cf2freq(cfs[d2])) - metric_d1[d2] = kl_divergence(new_d1, new_d2) - elif args.compare_metric == "js_divergence": - new_d1 = filter_freq_dict(cf2freq(cfs[d1])) - new_d2 = filter_freq_dict(cf2freq(cfs[d2])) - metric_d1[d2] = js_divergence(new_d1, new_d2) - else: - raise NotImplementedError + raise NotImplementedError results[d1] = metric_d1 print_results(beir_datasets, results, args.output_path) diff --git a/scripts/beir/similarities/q2c-df_filter.tsv b/scripts/beir/similarities/q2c-df_filter.tsv new file mode 100644 index 000000000..350757784 --- /dev/null +++ b/scripts/beir/similarities/q2c-df_filter.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 0.1393 0.1130 0.1093 0.1017 0.0794 0.0814 0.0814 0.0757 0.0898 0.0668 0.0924 0.0968 0.1050 0.1100 0.0902 0.0803 0.0946 0.1171 0.1067 +bioasq 0.3534 0.4330 0.3275 0.1742 0.1925 0.1925 0.1925 0.1912 0.1277 0.2033 0.1332 0.1551 0.1313 0.1427 0.1603 0.1912 0.2196 0.4567 0.2174 +nfcorpus 0.2330 0.2657 0.3053 0.1679 0.1696 0.1736 0.1736 0.1672 0.1372 0.1997 0.1447 0.1526 0.1424 0.1495 0.1730 0.1547 0.1556 0.2594 0.2213 +nq 0.1836 0.1822 0.1761 0.3999 0.3369 0.3645 0.3645 0.3307 0.2492 0.2935 0.3329 0.3104 0.2602 0.2879 0.3304 0.2253 0.2052 0.1710 0.3576 +hotpotqa 0.1839 0.1809 0.1677 0.4478 0.5847 0.5973 0.5973 0.5734 0.1987 0.3041 0.3199 0.3037 0.2234 0.2236 0.2652 0.2082 0.1839 0.1734 0.3127 +climate-fever 0.2352 0.2173 0.2252 0.3268 0.2276 0.2434 0.2434 0.2224 0.2962 0.2339 0.2994 0.3205 0.3119 0.3062 0.2621 0.2299 0.2490 0.2183 0.3345 +fever 0.1675 0.1664 0.1535 0.3497 0.4121 0.4207 0.4207 0.4116 0.1684 0.2767 0.2695 0.2489 0.2036 0.2022 0.2395 0.1747 0.1564 0.1569 0.2594 +dbpedia-entity 0.1006 0.0991 0.0987 0.1901 0.2187 0.2194 0.2194 0.2166 0.1079 0.1660 0.1562 0.1551 0.1330 0.1235 0.1559 0.1111 0.1092 0.0954 0.1574 +fiqa 0.2103 0.1905 0.1923 0.2921 0.1905 0.2092 0.2092 0.1872 0.5152 0.2450 0.2997 0.3302 0.3154 0.3223 0.3423 0.2577 0.2460 0.1849 0.3371 +signal1m 0.1051 0.1047 0.1023 0.1747 0.1546 0.1565 0.1565 0.1525 0.1548 0.1863 0.1787 0.1748 0.1570 0.1517 0.1719 0.1285 0.1132 0.1016 0.1651 +trec-news 0.1006 0.0916 0.0950 0.1550 0.1197 0.1261 0.1261 0.1182 0.1522 0.1408 0.1797 0.1549 0.1504 0.1605 0.1587 0.1150 0.0974 0.0896 0.1476 +robust04 0.1889 0.1672 0.1814 0.2069 0.1694 0.1759 0.1759 0.1647 0.1718 0.1658 0.1853 0.2250 0.2444 0.2039 0.1911 0.1300 0.1882 0.1680 0.2143 +arguana 0.2530 0.2128 0.2192 0.3942 0.2540 0.2738 0.2738 0.2477 0.3770 0.2650 0.3876 0.4195 0.7783 0.4592 0.3407 0.2437 0.2934 0.2134 0.3659 +webis-touche2020 0.0423 0.0391 0.0446 0.0597 0.0429 0.0455 0.0455 0.0419 0.0640 0.0469 0.0675 0.0605 0.0879 0.0886 0.0751 0.0404 0.0465 0.0402 0.0632 +quora 0.2167 0.2013 0.1944 0.3613 0.2628 0.2837 0.2837 0.2550 0.3660 0.3250 0.3784 0.3436 0.3442 0.3906 0.7351 0.2971 0.2467 0.1887 0.3978 +cqadupstack 0.2354 0.2261 0.1994 0.2831 0.2092 0.2272 0.2272 0.2042 0.2937 0.2456 0.2589 0.2650 0.2485 0.2837 0.3468 0.4928 0.3361 0.2148 0.3473 +scidocs 0.2158 0.2124 0.1799 0.1783 0.1539 0.1616 0.1616 0.1529 0.1460 0.1614 0.1439 0.1575 0.1532 0.1513 0.1785 0.1989 0.3926 0.2025 0.1985 +scifact 0.3480 0.4334 0.3390 0.1805 0.1742 0.1801 0.1801 0.1750 0.1333 0.1852 0.1379 0.1566 0.1501 0.1446 0.1569 0.1637 0.2130 0.4916 0.2146 +msmarco 0.2836 0.2972 0.2833 0.3513 0.3169 0.3344 0.3344 0.3105 0.2788 0.3285 0.2867 0.2934 0.2435 0.2749 0.3910 0.2775 0.2493 0.2734 0.5009 diff --git a/scripts/beir/similarities/q2c-js_divergence.tsv b/scripts/beir/similarities/q2c-js_divergence.tsv new file mode 100644 index 000000000..42af8cabb --- /dev/null +++ b/scripts/beir/similarities/q2c-js_divergence.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 0.3183 0.3676 0.3849 0.3597 0.3819 0.3684 0.3684 0.3820 0.3965 0.4042 0.3868 0.3549 0.3679 0.3751 0.3260 0.4248 0.3683 0.3671 0.3522 +bioasq 0.2216 0.1939 0.2452 0.2776 0.2744 0.2702 0.2702 0.2731 0.3335 0.2988 0.3278 0.2907 0.3208 0.3132 0.2495 0.3148 0.2702 0.1810 0.2579 +nfcorpus 0.3008 0.2820 0.2583 0.3178 0.3277 0.3191 0.3191 0.3272 0.3482 0.2987 0.3360 0.3254 0.3471 0.3345 0.3020 0.3600 0.3371 0.2873 0.2799 +nq 0.2869 0.2845 0.3068 0.1743 0.1953 0.1829 0.1829 0.1960 0.2434 0.2454 0.2198 0.2069 0.2260 0.2203 0.1841 0.3058 0.2603 0.2981 0.1856 +hotpotqa 0.2499 0.2482 0.2704 0.1275 0.0960 0.1008 0.1008 0.1018 0.2376 0.2236 0.1957 0.1734 0.2168 0.2159 0.1813 0.2764 0.2329 0.2610 0.1750 +climate-fever 0.2160 0.2250 0.2305 0.1478 0.1999 0.1911 0.1911 0.2025 0.1872 0.2395 0.1728 0.1546 0.1561 0.1659 0.2440 0.2633 0.1941 0.2328 0.1506 +fever 0.2465 0.2418 0.2618 0.1408 0.1121 0.1106 0.1106 0.1112 0.2412 0.2140 0.1955 0.1808 0.2184 0.2127 0.2471 0.2789 0.2337 0.2576 0.1670 +dbpedia-entity 0.3952 0.3931 0.4094 0.3073 0.2939 0.2892 0.2892 0.2932 0.3938 0.3597 0.3492 0.3332 0.3557 0.3660 0.3371 0.4233 0.3751 0.4050 0.3299 +fiqa 0.2888 0.2930 0.3090 0.2280 0.2771 0.2605 0.2605 0.2768 0.1349 0.2557 0.2288 0.2061 0.2181 0.2053 0.1374 0.2552 0.2568 0.3045 0.1928 +signal1m 0.3789 0.3756 0.3903 0.3191 0.3378 0.3278 0.3278 0.3369 0.3549 0.3094 0.3284 0.3129 0.3494 0.3549 0.3713 0.3845 0.3720 0.3886 0.3224 +trec-news 0.3678 0.3770 0.3844 0.3170 0.3488 0.3356 0.3356 0.3485 0.3306 0.3366 0.3052 0.3157 0.3212 0.3252 0.3403 0.3927 0.3572 0.3847 0.3051 +robust04 0.2822 0.2948 0.2968 0.2344 0.2638 0.2546 0.2546 0.2662 0.2772 0.3146 0.2741 0.2331 0.2282 0.2492 0.2574 0.3427 0.2604 0.3010 0.2303 +arguana 0.1881 0.2051 0.2091 0.0948 0.1652 0.1533 0.1533 0.1661 0.1285 0.2014 0.1132 0.1019 0.0152 0.0906 0.1844 0.2164 0.1505 0.2118 0.1084 +webis-touche2020 0.4870 0.4907 0.5016 0.4444 0.4620 0.4544 0.4544 0.4623 0.4601 0.4669 0.4590 0.4445 0.4284 0.4409 0.3754 0.4994 0.4657 0.4979 0.4337 +quora 0.2889 0.2929 0.3110 0.2058 0.2450 0.2341 0.2341 0.2478 0.1782 0.2259 0.2103 0.2086 0.2045 0.1707 0.0197 0.2490 0.2524 0.3046 0.1759 +cqadupstack 0.2505 0.2455 0.2745 0.1985 0.2368 0.2313 0.2313 0.2423 0.1895 0.2232 0.2225 0.2041 0.2267 0.1899 0.1315 0.1410 0.1996 0.2591 0.1715 +scidocs 0.2915 0.2898 0.3205 0.3076 0.3200 0.3082 0.3082 0.3189 0.3653 0.3241 0.3372 0.3121 0.3470 0.3534 0.3522 0.3487 0.1940 0.3017 0.2969 +scifact 0.1811 0.1440 0.1905 0.2613 0.2666 0.2561 0.2561 0.2626 0.3200 0.2871 0.3070 0.2688 0.3047 0.3041 0.3307 0.3117 0.2519 0.1162 0.2354 +msmarco 0.2524 0.2451 0.2665 0.2008 0.2152 0.2062 0.2062 0.2152 0.2202 0.2217 0.2298 0.2172 0.2313 0.2168 0.1290 0.2666 0.2421 0.2583 0.1500 diff --git a/scripts/beir/similarities/q2c-kl_divergence.tsv b/scripts/beir/similarities/q2c-kl_divergence.tsv new file mode 100644 index 000000000..b66d54ee2 --- /dev/null +++ b/scripts/beir/similarities/q2c-kl_divergence.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 1.3171 1.5038 1.1645 1.5557 1.1925 1.1408 1.1408 0.9532 1.0464 1.3154 1.2979 1.3355 1.3443 0.9990 0.7820 1.2226 1.3264 1.2179 1.4574 +bioasq 0.4752 0.7102 0.3978 0.8157 0.3905 0.3777 0.3777 0.2632 0.4918 0.6834 0.6380 0.6360 0.5517 0.4903 0.2398 0.4770 0.7853 0.3281 0.7250 +nfcorpus 0.2896 0.4219 0.3631 0.3448 0.1964 0.1872 0.1872 0.1796 0.1171 0.2434 0.1895 0.2277 0.2054 0.1432 0.1329 0.1255 0.2553 0.3123 0.4695 +nq 0.6697 0.6104 0.6476 0.6461 0.5398 0.5819 0.5819 0.4346 0.5066 0.7994 0.6049 0.5834 0.5205 0.4989 0.4141 0.7528 0.5870 0.6352 0.6111 +hotpotqa 0.3564 0.4486 0.2768 0.5484 0.2580 0.2638 0.2638 0.1741 0.3566 0.6152 0.4689 0.4241 0.4318 0.3451 0.2707 0.4518 0.5310 0.2513 0.4842 +climate-fever 0.3194 0.2791 0.3221 0.2604 0.2667 0.2332 0.2332 0.2684 0.1397 0.4865 0.1754 0.1666 0.2025 0.1652 0.3798 0.3445 0.2696 0.3130 0.2298 +fever 0.2779 0.2559 0.2806 0.3023 0.2154 0.2306 0.2306 0.2118 0.2123 0.4458 0.2788 0.2252 0.2366 0.1892 0.4571 0.3251 0.2422 0.2774 0.2645 +dbpedia-entity 0.3518 0.4480 0.3473 0.7123 0.4688 0.4821 0.4821 0.4532 0.4179 0.6805 0.5349 0.5395 0.4326 0.3993 0.4276 0.4748 0.5365 0.3164 0.6126 +fiqa 0.4496 0.6679 0.4018 0.8408 0.3873 0.3725 0.3725 0.3064 0.4936 0.6569 0.6042 0.7719 0.6030 0.4260 0.2688 0.4113 0.6733 0.3843 0.7004 +signal1m 0.3475 0.3260 0.3270 0.4788 0.4410 0.4079 0.4079 0.4372 0.3117 0.5070 0.4102 0.3603 0.4289 0.3510 0.5111 0.2662 0.3942 0.3434 0.4072 +trec-news 0.5208 0.4706 0.5086 0.6563 0.5731 0.4983 0.4983 0.5693 0.4899 0.6819 0.5679 0.5583 0.5521 0.5472 0.6004 0.5097 0.4776 0.4631 0.5420 +robust04 0.5762 0.5368 0.4200 0.5896 0.4293 0.3977 0.3977 0.2987 0.3907 0.6561 0.5068 0.5127 0.4510 0.3981 0.3908 0.4927 0.6771 0.4396 0.4966 +arguana 0.3261 0.3043 0.3251 0.2163 0.3057 0.2324 0.2324 0.3226 0.1342 0.4923 0.1779 0.1673 0.0086 0.1116 0.3763 0.3046 0.2496 0.3176 0.1932 +webis-touche2020 1.1044 1.8766 1.1320 2.1298 0.6137 0.6239 0.6239 0.6142 1.4826 1.6078 1.7456 1.7416 1.8355 1.6536 1.0439 1.2382 1.9108 1.1932 1.8591 +quora 0.7001 0.9831 0.5066 1.2133 0.5968 0.5866 0.5866 0.3828 0.5919 0.9419 0.8957 1.0112 0.8785 0.6004 0.0487 0.7040 1.0933 0.4753 0.9167 +cqadupstack 0.4205 0.6056 0.3868 0.6199 0.2873 0.2564 0.2564 0.2716 0.2657 0.4874 0.4727 0.4815 0.4274 0.2820 0.1423 0.3226 0.5857 0.3824 0.4424 +scidocs 0.3670 0.3062 0.2769 0.3921 0.3258 0.3291 0.3291 0.2623 0.3085 0.2957 0.2987 0.2691 0.3982 0.3358 0.4599 0.3927 0.4649 0.3636 0.3903 +scifact 0.3018 0.2893 0.2767 0.2787 0.2422 0.2234 0.2234 0.2265 0.1821 0.3003 0.2207 0.1837 0.2009 0.1968 0.3422 0.1784 0.3055 0.2355 0.3151 +msmarco 0.6714 0.4714 0.4600 0.6986 0.4999 0.4973 0.4973 0.1797 0.3832 0.6074 0.5625 0.6057 0.4861 0.4326 0.1259 0.5138 0.6222 0.4238 0.5427 diff --git a/scripts/beir/similarities/q2c-tf_filter.tsv b/scripts/beir/similarities/q2c-tf_filter.tsv new file mode 100644 index 000000000..a055c6921 --- /dev/null +++ b/scripts/beir/similarities/q2c-tf_filter.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 0.0116 0.0106 0.0206 0.0101 0.0101 0.0101 0.0101 0.0101 0.0121 0.0104 0.0105 0.0106 0.0139 0.0104 0.0113 0.0104 0.0131 0.0198 0.0106 +bioasq 0.2015 0.1849 0.3231 0.1763 0.1765 0.1768 0.1768 0.1765 0.2043 0.1810 0.1825 0.1855 0.2193 0.1818 0.1963 0.1807 0.2239 0.3228 0.1846 +nfcorpus 0.1358 0.1250 0.2271 0.1191 0.1193 0.1194 0.1195 0.1193 0.1392 0.1228 0.1234 0.1253 0.1524 0.1228 0.1330 0.1221 0.1495 0.2054 0.1248 +nq 0.2360 0.2275 0.2729 0.2170 0.2173 0.2176 0.2177 0.2173 0.2457 0.2236 0.2247 0.2282 0.2617 0.2238 0.2403 0.2216 0.2504 0.2711 0.2276 +hotpotqa 0.8068 0.8765 0.4888 0.8514 0.8526 0.8532 0.8532 0.8525 0.7796 0.8695 0.8748 0.8776 0.7027 0.8667 0.8236 0.8459 0.7325 0.5074 0.8775 +climate-fever 0.1841 0.1694 0.2715 0.1616 0.1618 0.1620 0.1619 0.1618 0.1907 0.1666 0.1674 0.1697 0.2159 0.1666 0.1793 0.1658 0.2029 0.2631 0.1692 +fever 0.5803 0.5947 0.4570 0.5674 0.5681 0.5692 0.5692 0.5680 0.5875 0.5846 0.5876 0.5961 0.5888 0.5845 0.5975 0.5750 0.5683 0.4695 0.5953 +dbpedia-entity 0.0510 0.0484 0.0658 0.0461 0.0462 0.0463 0.0463 0.0462 0.0531 0.0476 0.0478 0.0486 0.0596 0.0475 0.0512 0.0473 0.0546 0.0623 0.0484 +fiqa 0.2159 0.1991 0.2991 0.1898 0.1901 0.1903 0.1903 0.1900 0.2281 0.1957 0.1966 0.1994 0.2560 0.1957 0.2126 0.1948 0.2398 0.3014 0.1988 +signal1m 0.0321 0.0299 0.0452 0.0285 0.0285 0.0285 0.0285 0.0285 0.0339 0.0293 0.0295 0.0299 0.0383 0.0294 0.0319 0.0292 0.0352 0.0448 0.0297 +trec-news 0.0207 0.0191 0.0312 0.0182 0.0182 0.0182 0.0182 0.0182 0.0218 0.0188 0.0189 0.0190 0.0251 0.0188 0.0202 0.0187 0.0228 0.0304 0.0190 +robust04 0.0605 0.0556 0.0936 0.0530 0.0531 0.0532 0.0532 0.0531 0.0627 0.0547 0.0549 0.0559 0.0733 0.0547 0.0593 0.0544 0.0663 0.0894 0.0556 +arguana 0.5176 0.4919 0.5190 0.4693 0.4699 0.4701 0.4702 0.4698 0.5384 0.4838 0.4861 0.4931 0.6626 0.4840 0.5140 0.4798 0.5438 0.5342 0.4916 +webis-touche2020 0.0083 0.0076 0.0128 0.0072 0.0072 0.0073 0.0073 0.0072 0.0087 0.0075 0.0075 0.0076 0.0102 0.0075 0.0081 0.0074 0.0093 0.0122 0.0076 +quora 0.4320 0.4139 0.4550 0.3949 0.3954 0.3959 0.3960 0.3954 0.4545 0.4069 0.4089 0.4142 0.4703 0.4069 0.4426 0.4042 0.4586 0.4550 0.4136 +cqadupstack 0.3533 0.3317 0.4277 0.3168 0.3173 0.3175 0.3176 0.3172 0.3712 0.3259 0.3280 0.3320 0.3869 0.3267 0.3537 0.3252 0.3909 0.4426 0.3316 +scidocs 0.1310 0.1195 0.2000 0.1143 0.1145 0.1145 0.1145 0.1144 0.1352 0.1177 0.1182 0.1196 0.1471 0.1179 0.1280 0.1173 0.1474 0.2026 0.1193 +scifact 0.1346 0.1231 0.2317 0.1173 0.1175 0.1177 0.1177 0.1175 0.1369 0.1203 0.1215 0.1233 0.1518 0.1210 0.1309 0.1203 0.1505 0.2302 0.1228 +msmarco 0.8630 0.9449 0.5137 0.9058 0.9070 0.9074 0.9075 0.9068 0.8388 0.9290 0.9365 0.9466 0.7437 0.9286 0.8926 0.9034 0.7892 0.5389 0.9466 diff --git a/scripts/beir/similarities/q2c-weight_jaccard.tsv b/scripts/beir/similarities/q2c-weight_jaccard.tsv new file mode 100644 index 000000000..8a866ddb5 --- /dev/null +++ b/scripts/beir/similarities/q2c-weight_jaccard.tsv @@ -0,0 +1,20 @@ + trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco +trec-covid 0.1719 0.1443 0.1393 0.1287 0.1221 0.1296 0.1296 0.1218 0.1321 0.0978 0.1223 0.1465 0.1497 0.1391 0.1954 0.1076 0.1477 0.1469 0.1412 +bioasq 0.3219 0.3639 0.3002 0.2248 0.2324 0.2358 0.2358 0.2342 0.1864 0.2035 0.1812 0.2143 0.1991 0.1994 0.2814 0.2063 0.2644 0.3813 0.2576 +nfcorpus 0.2198 0.2377 0.2637 0.1794 0.1727 0.1779 0.1779 0.1736 0.1728 0.2131 0.1761 0.1787 0.1675 0.1824 0.2084 0.1745 0.1831 0.2333 0.2218 +nq 0.2180 0.2195 0.2024 0.3424 0.3196 0.3329 0.3329 0.3177 0.2481 0.2534 0.2774 0.2967 0.2710 0.2745 0.3265 0.1938 0.2436 0.2075 0.3209 +hotpotqa 0.2713 0.2697 0.2483 0.4692 0.5535 0.5367 0.5367 0.5396 0.2875 0.3059 0.3476 0.3780 0.3158 0.3138 0.3520 0.2287 0.2949 0.2569 0.3798 +climate-fever 0.3296 0.3124 0.3101 0.4065 0.3290 0.3428 0.3427 0.3245 0.3789 0.2633 0.3798 0.4140 0.4217 0.4005 0.2789 0.2697 0.3611 0.3090 0.4027 +fever 0.2453 0.2481 0.2315 0.3861 0.4406 0.4360 0.4360 0.4407 0.2627 0.3006 0.3221 0.3256 0.2839 0.3001 0.2795 0.2133 0.2604 0.2315 0.3487 +dbpedia-entity 0.1489 0.1503 0.1429 0.1958 0.2121 0.2123 0.2123 0.2122 0.1493 0.1477 0.1676 0.1831 0.1792 0.1614 0.1796 0.1247 0.1640 0.1449 0.1808 +fiqa 0.2326 0.2222 0.2163 0.2729 0.2223 0.2388 0.2388 0.2207 0.4274 0.2494 0.2873 0.3068 0.3024 0.3199 0.4379 0.2736 0.2727 0.2179 0.3274 +signal1m 0.1523 0.1523 0.1490 0.1589 0.1436 0.1550 0.1550 0.1446 0.1772 0.1903 0.1755 0.1858 0.1592 0.1639 0.1485 0.1555 0.1497 0.1496 0.1659 +trec-news 0.1535 0.1460 0.1506 0.1633 0.1393 0.1511 0.1511 0.1389 0.1879 0.1582 0.1905 0.1737 0.1802 0.1806 0.1715 0.1459 0.1605 0.1461 0.1780 +robust04 0.2498 0.2389 0.2438 0.2833 0.2472 0.2580 0.2580 0.2450 0.2603 0.1903 0.2435 0.2891 0.3081 0.2772 0.2556 0.1906 0.2800 0.2386 0.2829 +arguana 0.3670 0.3371 0.3356 0.5091 0.3863 0.4088 0.4088 0.3807 0.4765 0.3145 0.4913 0.5221 0.8406 0.5519 0.3583 0.3170 0.4357 0.3324 0.4938 +webis-touche2020 0.0744 0.0698 0.0730 0.0765 0.0742 0.0761 0.0761 0.0733 0.1007 0.0665 0.0846 0.0861 0.1031 0.1019 0.1541 0.0729 0.0886 0.0717 0.0922 +quora 0.2405 0.2314 0.2167 0.3287 0.2771 0.2889 0.2889 0.2726 0.3683 0.2895 0.3269 0.3217 0.3339 0.3808 0.7576 0.2739 0.2853 0.2196 0.3771 +cqadupstack 0.2822 0.2809 0.2573 0.3215 0.2708 0.2794 0.2794 0.2618 0.3554 0.2887 0.2943 0.3146 0.2962 0.3443 0.4415 0.4130 0.3488 0.2723 0.3691 +scidocs 0.2323 0.2341 0.2080 0.1852 0.1759 0.1844 0.1844 0.1759 0.1619 0.1785 0.1705 0.1950 0.1737 0.1638 0.1719 0.1728 0.3253 0.2271 0.2050 +scifact 0.3746 0.4260 0.3731 0.2371 0.2364 0.2488 0.2488 0.2401 0.2085 0.2209 0.2071 0.2425 0.2171 0.2182 0.1866 0.2287 0.2869 0.4823 0.2754 +msmarco 0.2866 0.2928 0.2678 0.3607 0.3343 0.3446 0.3446 0.3327 0.3005 0.3226 0.3118 0.3248 0.2909 0.3076 0.4459 0.2487 0.2965 0.2730 0.4578 diff --git a/scripts/beir/test_compare_domains.sh b/scripts/beir/test_compare_domains.sh index e7ea8be5c..5505bb612 100644 --- a/scripts/beir/test_compare_domains.sh +++ b/scripts/beir/test_compare_domains.sh @@ -1,7 +1,13 @@ -for metric in weight_jaccard kl_divergence tf_filter df_filter +rm -r similarities +mkdir similarities +for setmode in q2c +do +for metric in weight_jaccard kl_divergence tf_filter df_filter js_divergence do python compare_domains.py \ --index_path indexes \ - --output_path indexes_${metric}.tsv \ + --output_path similarities/${setmode}-${metric}.tsv \ + --compare_sets ${setmode} \ --compare_metric ${metric} done +done From c4b3c2efc168ac21951e3c59a2c9e77fc8ca4962 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Thu, 12 May 2022 15:29:41 -0400 Subject: [PATCH 12/12] remove previous results --- scripts/beir/indexes_df_filter.tsv | 20 -------------------- scripts/beir/indexes_js_divergence.tsv | 20 -------------------- scripts/beir/indexes_kl_divergence.tsv | 20 -------------------- scripts/beir/indexes_tf_filter.tsv | 20 -------------------- scripts/beir/indexes_weight_jaccard.tsv | 20 -------------------- 5 files changed, 100 deletions(-) delete mode 100644 scripts/beir/indexes_df_filter.tsv delete mode 100644 scripts/beir/indexes_js_divergence.tsv delete mode 100644 scripts/beir/indexes_kl_divergence.tsv delete mode 100644 scripts/beir/indexes_tf_filter.tsv delete mode 100644 scripts/beir/indexes_weight_jaccard.tsv diff --git a/scripts/beir/indexes_df_filter.tsv b/scripts/beir/indexes_df_filter.tsv deleted file mode 100644 index 0a408a408..000000000 --- a/scripts/beir/indexes_df_filter.tsv +++ /dev/null @@ -1,20 +0,0 @@ - trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco -trec-covid 1.0000 0.6805 0.5963 0.3362 0.2377 0.2603 0.2604 0.2334 0.2451 0.2251 0.2787 0.3180 0.2927 0.2760 0.2149 0.2772 0.4241 0.6367 0.3485 -bioasq 0.6805 1.0000 0.6533 0.3165 0.2267 0.2464 0.2465 0.2242 0.2259 0.2182 0.2541 0.2943 0.2567 0.2586 0.2031 0.2753 0.3823 0.7174 0.3335 -nfcorpus 0.5963 0.6533 1.0000 0.3165 0.2207 0.2400 0.2401 0.2187 0.2338 0.2096 0.2508 0.2908 0.2665 0.2540 0.2047 0.2594 0.3541 0.6115 0.3318 -nq 0.3362 0.3165 0.3165 1.0000 0.4995 0.5694 0.5689 0.4902 0.3926 0.3710 0.5455 0.5439 0.4421 0.4387 0.3717 0.3437 0.3329 0.2937 0.5508 -hotpotqa 0.2377 0.2267 0.2207 0.4995 1.0000 0.8334 0.8340 0.9355 0.2344 0.3446 0.3469 0.3274 0.2709 0.2543 0.2790 0.2427 0.2235 0.2107 0.3767 -climate-fever 0.2603 0.2464 0.2400 0.5694 0.8334 1.0000 0.9993 0.8253 0.2640 0.3549 0.3949 0.3762 0.3064 0.2862 0.2933 0.2677 0.2449 0.2280 0.4122 -fever 0.2604 0.2465 0.2401 0.5689 0.8340 0.9993 1.0000 0.8258 0.2636 0.3544 0.3945 0.3758 0.3060 0.2858 0.2935 0.2673 0.2450 0.2281 0.4117 -dbpedia-entity 0.2334 0.2242 0.2187 0.4902 0.9355 0.8253 0.8258 1.0000 0.2255 0.3425 0.3405 0.3231 0.2635 0.2463 0.2751 0.2402 0.2177 0.2084 0.3691 -fiqa 0.2451 0.2259 0.2338 0.3926 0.2344 0.2640 0.2636 0.2255 1.0000 0.3077 0.4699 0.4494 0.4356 0.5031 0.4116 0.4062 0.3054 0.2202 0.4680 -signal1m 0.2251 0.2182 0.2096 0.3710 0.3446 0.3549 0.3544 0.3425 0.3077 1.0000 0.4079 0.3431 0.2916 0.3211 0.3555 0.3037 0.2081 0.2048 0.3654 -trec-news 0.2787 0.2541 0.2508 0.5455 0.3469 0.3949 0.3945 0.3405 0.4699 0.4079 1.0000 0.5913 0.4754 0.5179 0.3765 0.3568 0.3016 0.2400 0.5027 -robust04 0.3180 0.2943 0.2908 0.5439 0.3274 0.3762 0.3758 0.3231 0.4494 0.3431 0.5913 1.0000 0.4845 0.4528 0.3275 0.3324 0.3199 0.2800 0.4757 -arguana 0.2927 0.2567 0.2665 0.4421 0.2709 0.3064 0.3060 0.2635 0.4356 0.2916 0.4754 0.4845 1.0000 0.5555 0.3461 0.3116 0.3413 0.2539 0.4240 -webis-touche2020 0.2760 0.2586 0.2540 0.4387 0.2543 0.2862 0.2858 0.2463 0.5031 0.3211 0.5179 0.4528 0.5555 1.0000 0.3838 0.3866 0.3364 0.2508 0.4530 -quora 0.2149 0.2031 0.2047 0.3717 0.2790 0.2933 0.2935 0.2751 0.4116 0.3555 0.3765 0.3275 0.3461 0.3838 1.0000 0.3551 0.2515 0.1920 0.4549 -cqadupstack 0.2772 0.2753 0.2594 0.3437 0.2427 0.2677 0.2673 0.2402 0.4062 0.3037 0.3568 0.3324 0.3116 0.3866 0.3551 1.0000 0.3632 0.2598 0.4230 -scidocs 0.4241 0.3823 0.3541 0.3329 0.2235 0.2449 0.2450 0.2177 0.3054 0.2081 0.3016 0.3199 0.3413 0.3364 0.2515 0.3632 1.0000 0.3796 0.3599 -scifact 0.6367 0.7174 0.6115 0.2937 0.2107 0.2280 0.2281 0.2084 0.2202 0.2048 0.2400 0.2800 0.2539 0.2508 0.1920 0.2598 0.3796 1.0000 0.3112 -msmarco 0.3485 0.3335 0.3318 0.5508 0.3767 0.4122 0.4117 0.3691 0.4680 0.3654 0.5027 0.4757 0.4240 0.4530 0.4549 0.4230 0.3599 0.3112 1.0000 diff --git a/scripts/beir/indexes_js_divergence.tsv b/scripts/beir/indexes_js_divergence.tsv deleted file mode 100644 index 60a55173c..000000000 --- a/scripts/beir/indexes_js_divergence.tsv +++ /dev/null @@ -1,20 +0,0 @@ - trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco -trec-covid 1.0000 0.0546 0.0839 0.1642 0.2218 0.2017 0.2016 0.2221 0.2382 0.2489 0.2108 0.1854 0.2013 0.2163 0.2808 0.2252 0.1392 0.0701 0.1624 -bioasq 0.0546 1.0000 0.0603 0.1658 0.2205 0.2011 0.2011 0.2198 0.2425 0.2449 0.2189 0.1922 0.2182 0.2217 0.2816 0.2191 0.1519 0.0423 0.1618 -nfcorpus 0.0839 0.0603 1.0000 0.1814 0.2409 0.2211 0.2211 0.2402 0.2556 0.2684 0.2329 0.2050 0.2236 0.2356 0.2987 0.2441 0.1775 0.0782 0.1776 -nq 0.1642 0.1658 0.1814 1.0000 0.0816 0.0633 0.0634 0.0821 0.1366 0.1482 0.0943 0.0854 0.1052 0.1224 0.1754 0.1559 0.1576 0.1849 0.0672 -hotpotqa 0.2218 0.2205 0.2409 0.0816 1.0000 0.0201 0.0201 0.0049 0.2208 0.1691 0.1765 0.1673 0.1931 0.2166 0.2170 0.2207 0.2222 0.2379 0.1214 -climate-fever 0.2017 0.2011 0.2211 0.0633 0.0201 1.0000 0.0000 0.0198 0.1984 0.1606 0.1437 0.1355 0.1701 0.1893 0.2078 0.2005 0.2019 0.2193 0.1060 -fever 0.2016 0.2011 0.2211 0.0634 0.0201 0.0000 1.0000 0.0198 0.1985 0.1606 0.1438 0.1356 0.1702 0.1894 0.2078 0.2006 0.2019 0.2193 0.1061 -dbpedia-entity 0.2221 0.2198 0.2402 0.0821 0.0049 0.0198 0.0198 1.0000 0.2229 0.1685 0.1764 0.1670 0.1943 0.2184 0.2183 0.2201 0.2237 0.2377 0.1218 -fiqa 0.2382 0.2425 0.2556 0.1366 0.2208 0.1984 0.1985 0.2229 1.0000 0.1826 0.1164 0.1233 0.1182 0.0883 0.1501 0.1238 0.2003 0.2568 0.0970 -signal1m 0.2489 0.2449 0.2684 0.1482 0.1691 0.1606 0.1606 0.1685 0.1826 1.0000 0.1560 0.1744 0.2006 0.1852 0.1916 0.1812 0.2486 0.2621 0.1393 -trec-news 0.2108 0.2189 0.2329 0.0943 0.1765 0.1437 0.1438 0.1764 0.1164 0.1560 1.0000 0.0696 0.1098 0.0908 0.1901 0.1623 0.1953 0.2341 0.1108 -robust04 0.1854 0.1922 0.2050 0.0854 0.1673 0.1355 0.1356 0.1670 0.1233 0.1744 0.0696 1.0000 0.1082 0.1147 0.2092 0.1736 0.1809 0.2089 0.1086 -arguana 0.2013 0.2182 0.2236 0.1052 0.1931 0.1701 0.1702 0.1943 0.1182 0.2006 0.1098 0.1082 1.0000 0.0857 0.1902 0.1766 0.1702 0.2274 0.1143 -webis-touche2020 0.2163 0.2217 0.2356 0.1224 0.2166 0.1893 0.1894 0.2184 0.0883 0.1852 0.0908 0.1147 0.0857 1.0000 0.1707 0.1352 0.1846 0.2341 0.1122 -quora 0.2808 0.2816 0.2987 0.1754 0.2170 0.2078 0.2078 0.2183 0.1501 0.1916 0.1901 0.2092 0.1902 0.1707 1.0000 0.1669 0.2471 0.2970 0.1350 -cqadupstack 0.2252 0.2191 0.2441 0.1559 0.2207 0.2005 0.2006 0.2201 0.1238 0.1812 0.1623 0.1736 0.1766 0.1352 0.1669 1.0000 0.1833 0.2358 0.1205 -scidocs 0.1392 0.1519 0.1775 0.1576 0.2222 0.2019 0.2019 0.2237 0.2003 0.2486 0.1953 0.1809 0.1702 0.1846 0.2471 0.1833 1.0000 0.1602 0.1482 -scifact 0.0701 0.0423 0.0782 0.1849 0.2379 0.2193 0.2193 0.2377 0.2568 0.2621 0.2341 0.2089 0.2274 0.2341 0.2970 0.2358 0.1602 1.0000 0.1805 -msmarco 0.1624 0.1618 0.1776 0.0672 0.1214 0.1060 0.1061 0.1218 0.0970 0.1393 0.1108 0.1086 0.1143 0.1122 0.1350 0.1205 0.1482 0.1805 1.0000 diff --git a/scripts/beir/indexes_kl_divergence.tsv b/scripts/beir/indexes_kl_divergence.tsv deleted file mode 100644 index 476033474..000000000 --- a/scripts/beir/indexes_kl_divergence.tsv +++ /dev/null @@ -1,20 +0,0 @@ - trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco -trec-covid 1.0000 0.0448 0.0336 0.1378 0.1141 0.1268 0.1268 0.1026 0.0769 0.1147 0.2565 0.1991 0.0948 0.1706 0.1257 0.0665 0.0856 0.0569 0.1137 -bioasq 0.0609 1.0000 0.0194 0.1555 0.1295 0.1447 0.1447 0.1217 0.0999 0.1345 0.2773 0.2233 0.1397 0.1957 0.1517 0.0833 0.1371 0.0239 0.1279 -nfcorpus 0.1186 0.0863 1.0000 0.1908 0.1527 0.1706 0.1706 0.1435 0.1212 0.1509 0.3080 0.2669 0.1566 0.2258 0.1755 0.1044 0.1849 0.1257 0.1794 -nq 0.1812 0.1576 0.1643 1.0000 0.0616 0.0384 0.0383 0.0815 0.0574 0.1851 0.2639 0.1940 0.0632 0.1516 0.1579 0.0737 0.1527 0.1855 0.0489 -hotpotqa 0.2694 0.2197 0.2561 0.1654 1.0000 0.0588 0.0588 0.0050 0.1574 0.2180 0.4821 0.3698 0.2383 0.3233 0.1839 0.1503 0.2188 0.2563 0.1190 -climate-fever 0.1456 0.1090 0.1328 0.0707 -0.0202 1.0000 -0.0000 -0.0222 0.0586 0.1299 0.3001 0.2246 0.1211 0.1858 0.1071 0.0644 0.1062 0.1335 0.0523 -fever 0.1456 0.1090 0.1328 0.0708 -0.0202 0.0000 1.0000 -0.0222 0.0588 0.1300 0.3002 0.2247 0.1212 0.1858 0.1071 0.0645 0.1062 0.1335 0.0523 -dbpedia-entity 0.2553 0.2071 0.2423 0.1618 0.0013 0.0604 0.0604 1.0000 0.1551 0.2109 0.4672 0.3603 0.2314 0.3199 0.1803 0.1449 0.2111 0.2426 0.1218 -fiqa 0.3243 0.3136 0.3265 0.3440 0.3525 0.3592 0.3582 0.3496 1.0000 0.3400 0.4209 0.4279 0.2379 0.2018 0.2046 0.1204 0.3096 0.3201 0.2314 -signal1m 0.3184 0.2348 0.2533 0.2131 0.2702 0.2878 0.2878 0.2469 0.2958 1.0000 0.4628 0.2995 0.2113 0.3399 0.1300 0.2227 0.3798 0.2558 0.1895 -trec-news 0.0620 0.0436 0.0422 0.0681 0.0835 0.0679 0.0677 0.0795 -0.0693 0.1217 1.0000 0.0441 -0.0241 -0.0209 0.0574 -0.0204 0.0600 0.0454 0.0254 -robust04 0.0416 0.0256 0.0194 0.0370 0.0584 0.0599 0.0595 0.0594 -0.0747 0.0811 0.0656 1.0000 -0.0266 -0.0126 0.0488 -0.0377 0.0326 0.0366 0.0053 -arguana 0.2747 0.2292 0.2534 0.2178 0.2459 0.2374 0.2373 0.2517 0.0637 0.2840 0.3115 0.2616 1.0000 0.1559 0.1798 0.1154 0.2201 0.2417 0.1416 -webis-touche2020 0.1759 0.1677 0.1734 0.1959 0.2135 0.2161 0.2161 0.2093 -0.0209 0.2110 0.2116 0.2317 0.0718 1.0000 0.1017 0.0445 0.1768 0.1711 0.1255 -quora 0.6807 0.9262 0.7167 1.0242 0.5189 0.6385 0.6385 0.5243 0.5093 0.7571 0.9984 1.0744 0.7696 0.6971 1.0000 0.4465 1.0338 0.6816 0.7129 -cqadupstack 0.3405 0.3268 0.3328 0.3428 0.3404 0.3548 0.3544 0.3385 0.1287 0.3009 0.4360 0.4501 0.2323 0.2593 0.1877 1.0000 0.3545 0.3202 0.2640 -scidocs 0.1595 0.1440 0.1419 0.1746 0.1613 0.1796 0.1796 0.1526 0.1121 0.1258 0.3094 0.2685 0.1335 0.1828 0.1601 0.1383 1.0000 0.1584 0.1128 -scifact 0.0915 0.0563 0.0574 0.1600 0.1297 0.1442 0.1442 0.1215 0.0911 0.1241 0.2662 0.2197 0.1328 0.1859 0.1368 0.0746 0.1293 1.0000 0.1346 -msmarco 0.2116 0.1982 0.2083 0.1484 0.1556 0.1780 0.1779 0.1526 0.0356 0.1836 0.3661 0.3101 0.1353 0.1968 0.1138 0.0810 0.1784 0.2204 1.0000 diff --git a/scripts/beir/indexes_tf_filter.tsv b/scripts/beir/indexes_tf_filter.tsv deleted file mode 100644 index 957e1a473..000000000 --- a/scripts/beir/indexes_tf_filter.tsv +++ /dev/null @@ -1,20 +0,0 @@ - trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco -trec-covid 1.0000 0.5225 0.5367 0.7852 0.7695 0.6963 0.6962 0.7722 0.8216 0.8511 0.7654 0.7957 0.7335 0.8538 0.8450 0.8544 0.8116 0.5629 0.6933 -bioasq 0.5225 1.0000 0.1540 0.6365 0.6223 0.6332 0.6332 0.6220 0.4986 0.6195 0.6504 0.6609 0.4141 0.6220 0.5379 0.5949 0.4224 0.1639 0.7079 -nfcorpus 0.5367 0.1540 1.0000 0.3979 0.3977 0.3425 0.3424 0.3994 0.5425 0.4687 0.3922 0.4194 0.5547 0.4648 0.5216 0.4705 0.5858 0.7047 0.3258 -nq 0.7852 0.6365 0.3979 1.0000 0.9263 0.8870 0.8869 0.9260 0.7417 0.8924 0.8971 0.8946 0.6171 0.8980 0.8011 0.8941 0.6824 0.4205 0.8288 -hotpotqa 0.7695 0.6223 0.3977 0.9263 1.0000 0.9088 0.9086 0.9868 0.7290 0.8748 0.8540 0.8523 0.6093 0.8689 0.7865 0.8719 0.6740 0.4207 0.8053 -climate-fever 0.6963 0.6332 0.3425 0.8870 0.9088 1.0000 0.9998 0.9053 0.6556 0.8019 0.8279 0.8134 0.5391 0.8035 0.7134 0.7997 0.6032 0.3634 0.8089 -fever 0.6962 0.6332 0.3424 0.8869 0.9086 0.9998 1.0000 0.9051 0.6556 0.8019 0.8277 0.8135 0.5392 0.8034 0.7133 0.7996 0.6031 0.3633 0.8090 -dbpedia-entity 0.7722 0.6220 0.3994 0.9260 0.9868 0.9053 0.9051 1.0000 0.7312 0.8771 0.8534 0.8530 0.6119 0.8693 0.7896 0.8735 0.6768 0.4225 0.8040 -fiqa 0.8216 0.4986 0.5425 0.7417 0.7290 0.6556 0.6556 0.7312 1.0000 0.8267 0.7301 0.7637 0.7555 0.8191 0.8492 0.8219 0.7839 0.5677 0.6590 -signal1m 0.8511 0.6195 0.4687 0.8924 0.8748 0.8019 0.8019 0.8771 0.8267 1.0000 0.8723 0.8941 0.7018 0.9403 0.8771 0.9303 0.7535 0.4918 0.7968 -trec-news 0.7654 0.6504 0.3922 0.8971 0.8540 0.8279 0.8277 0.8534 0.7301 0.8723 1.0000 0.9119 0.6057 0.8787 0.7784 0.8435 0.6608 0.4139 0.8578 -robust04 0.7957 0.6609 0.4194 0.8946 0.8523 0.8134 0.8135 0.8530 0.7637 0.8941 0.9119 1.0000 0.6504 0.9016 0.8087 0.8625 0.6967 0.4421 0.8584 -arguana 0.7335 0.4141 0.5547 0.6171 0.6093 0.5391 0.5392 0.6119 0.7555 0.7018 0.6057 0.6504 1.0000 0.6962 0.7429 0.6963 0.7329 0.5798 0.5498 -webis-touche2020 0.8538 0.6220 0.4648 0.8980 0.8689 0.8035 0.8034 0.8693 0.8191 0.9403 0.8787 0.9016 0.6962 1.0000 0.8729 0.9344 0.7545 0.4876 0.8016 -quora 0.8450 0.5379 0.5216 0.8011 0.7865 0.7134 0.7133 0.7896 0.8492 0.8771 0.7784 0.8087 0.7429 0.8729 1.0000 0.8732 0.7931 0.5468 0.7044 -cqadupstack 0.8544 0.5949 0.4705 0.8941 0.8719 0.7997 0.7996 0.8735 0.8219 0.9303 0.8435 0.8625 0.6963 0.9344 0.8732 1.0000 0.7626 0.4936 0.7674 -scidocs 0.8116 0.4224 0.5858 0.6824 0.6740 0.6032 0.6031 0.6768 0.7839 0.7535 0.6608 0.6967 0.7329 0.7545 0.7931 0.7626 1.0000 0.6193 0.5921 -scifact 0.5629 0.1639 0.7047 0.4205 0.4207 0.3634 0.3633 0.4225 0.5677 0.4918 0.4139 0.4421 0.5798 0.4876 0.5468 0.4936 0.6193 1.0000 0.3492 -msmarco 0.6933 0.7079 0.3258 0.8288 0.8053 0.8089 0.8090 0.8040 0.6590 0.7968 0.8578 0.8584 0.5498 0.8016 0.7044 0.7674 0.5921 0.3492 1.0000 diff --git a/scripts/beir/indexes_weight_jaccard.tsv b/scripts/beir/indexes_weight_jaccard.tsv deleted file mode 100644 index 29da5e612..000000000 --- a/scripts/beir/indexes_weight_jaccard.tsv +++ /dev/null @@ -1,20 +0,0 @@ - trec-covid bioasq nfcorpus nq hotpotqa climate-fever fever dbpedia-entity fiqa signal1m trec-news robust04 arguana webis-touche2020 quora cqadupstack scidocs scifact msmarco -trec-covid 1.0000 0.6702 0.5867 0.3619 0.2564 0.2955 0.2955 0.2574 0.2721 0.2390 0.2698 0.3307 0.3119 0.2911 0.2042 0.2993 0.4355 0.6190 0.3553 -bioasq 0.6702 1.0000 0.6530 0.3559 0.2588 0.2973 0.2973 0.2613 0.2633 0.2417 0.2546 0.3170 0.2876 0.2771 0.1991 0.3053 0.4019 0.7019 0.3516 -nfcorpus 0.5867 0.6530 1.0000 0.3325 0.2350 0.2689 0.2689 0.2371 0.2546 0.2178 0.2441 0.3039 0.2827 0.2703 0.1884 0.2767 0.3613 0.5820 0.3299 -nq 0.3619 0.3559 0.3325 1.0000 0.4992 0.5883 0.5882 0.4983 0.4031 0.3688 0.4455 0.4913 0.4459 0.4018 0.3214 0.3668 0.3770 0.3273 0.5709 -hotpotqa 0.2564 0.2588 0.2350 0.4992 1.0000 0.7719 0.7720 0.9361 0.2598 0.3286 0.2760 0.3020 0.2817 0.2387 0.2806 0.2604 0.2573 0.2395 0.4378 -climate-fever 0.2955 0.2973 0.2689 0.5883 0.7719 1.0000 0.9998 0.7741 0.2988 0.3495 0.3382 0.3659 0.3258 0.2803 0.2792 0.2969 0.2969 0.2736 0.4613 -fever 0.2955 0.2973 0.2689 0.5882 0.7720 0.9998 1.0000 0.7742 0.2987 0.3494 0.3381 0.3658 0.3257 0.2802 0.2792 0.2968 0.2970 0.2736 0.4612 -dbpedia-entity 0.2574 0.2613 0.2371 0.4983 0.9361 0.7741 0.7742 1.0000 0.2595 0.3332 0.2758 0.3038 0.2807 0.2372 0.2783 0.2624 0.2566 0.2410 0.4396 -fiqa 0.2721 0.2633 0.2546 0.4031 0.2598 0.2988 0.2987 0.2595 1.0000 0.3152 0.4142 0.4030 0.4821 0.5323 0.3748 0.4754 0.3243 0.2503 0.4781 -signal1m 0.2390 0.2417 0.2178 0.3688 0.3286 0.3495 0.3494 0.3332 0.3152 1.0000 0.3237 0.3063 0.2794 0.2937 0.2944 0.3244 0.2371 0.2291 0.3960 -trec-news 0.2698 0.2546 0.2441 0.4455 0.2760 0.3382 0.3381 0.2758 0.4142 0.3237 1.0000 0.5740 0.4239 0.4964 0.2756 0.3322 0.2825 0.2412 0.4140 -robust04 0.3307 0.3170 0.3039 0.4913 0.3020 0.3659 0.3658 0.3038 0.4030 0.3063 0.5740 1.0000 0.4421 0.4458 0.2602 0.3248 0.3196 0.2954 0.4329 -arguana 0.3119 0.2876 0.2827 0.4459 0.2817 0.3258 0.3257 0.2807 0.4821 0.2794 0.4239 0.4421 1.0000 0.5237 0.3105 0.3709 0.3706 0.2801 0.4409 -webis-touche2020 0.2911 0.2771 0.2703 0.4018 0.2387 0.2803 0.2802 0.2372 0.5323 0.2937 0.4964 0.4458 0.5237 1.0000 0.3264 0.4141 0.3237 0.2693 0.4234 -quora 0.2042 0.1991 0.1884 0.3214 0.2806 0.2792 0.2792 0.2783 0.3748 0.2944 0.2756 0.2602 0.3105 0.3264 1.0000 0.3405 0.2409 0.1884 0.4022 -cqadupstack 0.2993 0.3053 0.2767 0.3668 0.2604 0.2969 0.2968 0.2624 0.4754 0.3244 0.3322 0.3248 0.3709 0.4141 0.3405 1.0000 0.3477 0.2874 0.4328 -scidocs 0.4355 0.4019 0.3613 0.3770 0.2573 0.2969 0.2970 0.2566 0.3243 0.2371 0.2825 0.3196 0.3706 0.3237 0.2409 0.3477 1.0000 0.3946 0.3868 -scifact 0.6190 0.7019 0.5820 0.3273 0.2395 0.2736 0.2736 0.2410 0.2503 0.2291 0.2412 0.2954 0.2801 0.2693 0.1884 0.2874 0.3946 1.0000 0.3270 -msmarco 0.3553 0.3516 0.3299 0.5709 0.4378 0.4613 0.4612 0.4396 0.4781 0.3960 0.4140 0.4329 0.4409 0.4234 0.4022 0.4328 0.3868 0.3270 1.0000