From f1ce7d1f2ffcda9f0ace4d0f013883764f08ef5e Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Thu, 28 Apr 2022 14:48:00 -0400
Subject: [PATCH 01/12] init wp-tokenizer for beir stats

---
 scripts/beir/tokenize_corpus.py | 97 +++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 scripts/beir/tokenize_corpus.py

diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py
new file mode 100644
index 000000000..5a987cb89
--- /dev/null
+++ b/scripts/beir/tokenize_corpus.py
@@ -0,0 +1,97 @@
+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import multiprocessing
+import json
+import time
+from joblib import Parallel, delayed
+from transformers import AutoTokenizer
+from ltr_msmarco.convert_common import get_retokenized
+
+'''Replace original contents fields with bert tokenization'''
+
+
+parser = argparse.ArgumentParser(description='Convert MSMARCO-adhoc documents.')
+parser.add_argument('--input', metavar='input file', help='input file',
+                    type=str, required=True)
+parser.add_argument('--output', metavar='output file', help='output file',
+                    type=str, required=True)
+parser.add_argument('--workers', metavar='# of processes', help='# of workers to spawn',
+                    type=int, default=max(1, multiprocessing.cpu_count() - 2))  # joblib rejects n_jobs=0
+
+
+args = parser.parse_args()
+print(args)
+arg_vars = vars(args)
+
+def batch_file(iterable, n=10000):
+    batch = []
+    for line in iterable:
+        batch.append(line)
+        if len(batch) == n:
+            yield batch
+            batch = []
+    if len(batch) > 0:
+        yield batch
+        batch = []
+    return
+
+
+def batch_process(batch):
+    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+    def process(line):
+        if not line:
+            return None
+        json_line = json.loads(line)
+        pid = json_line['id']
+        body = json_line['contents']
+
+        doc = {"id": pid,
+               "contents": get_retokenized(bert_tokenizer, body.lower())}
+        return doc
+    
+    res = []
+    start = time.time()
+    for line in batch:
+        res.append(process(line))
+        if len(res) % 1000 == 0:
+            end = time.time()
+            print(f"finish {len(res)} using {end-start}")
+            start = end
+    return res
+
+
+if __name__ == '__main__':
+    workers = args.workers
+    print(f"Spawning {workers} processes")
+    pool = Parallel(n_jobs=workers, verbose=10)
+    line_num = 0
+
+    with open(args.input) as inFile:
+        with open(args.output, 'w') as outFile:
+            for batch_json in pool([delayed(batch_process)(batch) for batch in batch_file(inFile)]):
+                for doc_json in batch_json:
+                    line_num = line_num + 1
+                    if doc_json is not None:
+                        outFile.write(json.dumps(doc_json) + '\n')
+                    else:
+                        print(f"Ignoring misformatted line {line_num}")
+
+                    if line_num % 100 == 0:
+                        print(f"Processed {line_num} passages")
+
+    print(f"Processed {line_num} passages")

From 66ebe22f695033e935b8cf9258e442c4e87c25f9 Mon Sep 17 00:00:00 2001
From: yqxie <yqxie@rsvp.ai>
Date: Thu, 28 Apr 2022 15:26:01 -0400
Subject: [PATCH 02/12] modify tokenize_corpus from msmarco to beir

---
 scripts/beir/tokenize_corpus.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py
index 5a987cb89..5bfb7bdc8 100644
--- a/scripts/beir/tokenize_corpus.py
+++ b/scripts/beir/tokenize_corpus.py
@@ -19,12 +19,11 @@
 import time
 from joblib import Parallel, delayed
 from transformers import AutoTokenizer
-from ltr_msmarco.convert_common import get_retokenized
 
 '''Replace original contents fields with bert tokenization'''
 
 
-parser = argparse.ArgumentParser(description='Convert MSMARCO-adhoc documents.')
+parser = argparse.ArgumentParser(description='Convert BEIR original documents to word piece tokenized.')
 parser.add_argument('--input', metavar='input file', help='input file',
                     type=str, required=True)
 parser.add_argument('--output', metavar='output file', help='output file',
@@ -37,6 +36,17 @@
 print(args)
 arg_vars = vars(args)
 
+def get_retokenized(tokenizer, text):
+    """
+    copy from pyserini.scripts.ltr_msmarco.convert_common.get_retokenized
+    Obtain a space separated re-tokenized text.
+    :param tokenizer:  a tokenizer that has the function
+                       tokenize that returns an array of tokens.
+    :param text:       a text to re-tokenize.
+    """
+    return ' '.join(tokenizer.tokenize(text))
+
+
 def batch_file(iterable, n=10000):
     batch = []
     for line in iterable:
@@ -57,11 +67,13 @@ def process(line):
         if not line:
             return None
         json_line = json.loads(line)
-        pid = json_line['id']
-        body = json_line['contents']
+        pid = json_line['_id']
+        title = json_line['title']
+        body = json_line['text']
 
-        doc = {"id": pid,
-               "contents": get_retokenized(bert_tokenizer, body.lower())}
+        doc = {"_id": pid,
+               "title":  get_retokenized(bert_tokenizer, title.lower()),
+               "text": get_retokenized(bert_tokenizer, body.lower())}
         return doc
     
     res = []

From f463df831158efbb0205ef688b8d6db67406d388 Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Thu, 28 Apr 2022 16:18:22 -0400
Subject: [PATCH 03/12] add query tokenizer

---
 scripts/beir/tokenize_queries.py | 109 +++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 scripts/beir/tokenize_queries.py

diff --git a/scripts/beir/tokenize_queries.py b/scripts/beir/tokenize_queries.py
new file mode 100644
index 000000000..583c44ffd
--- /dev/null
+++ b/scripts/beir/tokenize_queries.py
@@ -0,0 +1,109 @@
+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import multiprocessing
+import json
+import time
+from joblib import Parallel, delayed
+from transformers import AutoTokenizer
+
+'''Replace original contents fields with bert tokenization'''
+
+
+parser = argparse.ArgumentParser(description='Convert BEIR original documents to word piece tokenized.')
+parser.add_argument('--input', metavar='input file', help='input file',
+                    type=str, required=True)
+parser.add_argument('--output', metavar='output file', help='output file',
+                    type=str, required=True)
+parser.add_argument('--workers', metavar='# of processes', help='# of workers to spawn',
+                    type=int, default=max(1, multiprocessing.cpu_count() - 2))  # joblib rejects n_jobs=0
+
+
+args = parser.parse_args()
+print(args)
+arg_vars = vars(args)
+
+def get_retokenized(tokenizer, text):
+    """
+    copy from pyserini.scripts.ltr_msmarco.convert_common.get_retokenized
+    Obtain a space separated re-tokenized text.
+    :param tokenizer:  a tokenizer that has the function
+                       tokenize that returns an array of tokens.
+    :param text:       a text to re-tokenize.
+    """
+    return ' '.join(tokenizer.tokenize(text))
+
+
+def batch_file(iterable, n=10000):
+    batch = []
+    for line in iterable:
+        batch.append(line)
+        if len(batch) == n:
+            yield batch
+            batch = []
+    if len(batch) > 0:
+        yield batch
+        batch = []
+    return
+
+
+def batch_process(batch):
+    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+    def process(line):
+        if not line.strip():  # blank lines arrive from file iteration as "\n", which is truthy
+            return None
+        json_line = json.loads(line)
+        pid = json_line['_id']
+        title = json_line['title']
+        body = json_line['text']
+
+        doc = {"_id": pid,
+               "title":  get_retokenized(bert_tokenizer, title.lower()),
+               "text": get_retokenized(bert_tokenizer, body.lower())}
+        return doc
+    
+    res = []
+    start = time.time()
+    for line in batch:
+        res.append(process(line))
+        if len(res) % 1000 == 0:
+            end = time.time()
+            print(f"finish {len(res)} using {end-start}")
+            start = end
+    return res
+
+
+if __name__ == '__main__':
+    workers = args.workers
+    print(f"Spawning {workers} processes")
+    pool = Parallel(n_jobs=workers, verbose=10)
+    line_num = 0
+
+    with open(args.input) as inFile:
+        with open(args.output, 'w') as outFile:
+            for batch_json in pool([delayed(batch_process)(batch) for batch in batch_file(inFile)]):
+                for doc_json in batch_json:
+                    line_num = line_num + 1
+                    if doc_json is not None:
+                        outFile.write(json.dumps(doc_json) + '\n')
+                    else:
+                        print(f"Ignoring misformatted line {line_num}")
+
+                    if line_num % 10000 == 0:
+                        print(f"Processed {line_num} passages")
+
+    print(f"Processed {line_num} passages")

From 4089fee31327269ac1d4d876cf8edc6a5197cb67 Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Sun, 1 May 2022 14:58:23 -0400
Subject: [PATCH 04/12] runnable compare domains

---
 scripts/beir/compare_domains.py      | 127 +++++++++++++++++++++++++++
 scripts/beir/index_bm25.sh           |  10 +++
 scripts/beir/test_compare_domains.sh |   3 +
 scripts/beir/tokenize_corpus.py      |   8 +-
 scripts/beir/tokenize_corpus.sh      |  33 +++++++
 scripts/beir/tokenize_queries.py     |   8 +-
 scripts/beir/tokenize_queries.sh     |  38 ++++++++
 7 files changed, 219 insertions(+), 8 deletions(-)
 create mode 100644 scripts/beir/compare_domains.py
 create mode 100644 scripts/beir/index_bm25.sh
 create mode 100644 scripts/beir/test_compare_domains.sh
 create mode 100644 scripts/beir/tokenize_corpus.sh
 create mode 100644 scripts/beir/tokenize_queries.sh

diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py
new file mode 100644
index 000000000..95c55ceec
--- /dev/null
+++ b/scripts/beir/compare_domains.py
@@ -0,0 +1,127 @@
+import argparse
+from pyserini.index.lucene import IndexReader
+
+
+def index2stats(index_path):
+    index_reader = IndexReader(index_path)
+
+    terms = index_reader.terms()
+
+    cf_dict = {}
+    df_dict = {}
+    for t in terms:
+        txt = t.term
+        df = t.df
+        cf = t.cf
+        cf_dict[txt] = int(cf)
+        df_dict[txt] = int(df)
+
+    return cf_dict, df_dict, index_reader.stats() 
+
+def count_total(d):
+    s = 0
+    for t in d:
+        s += d[t]
+    return s
+
+
+def jaccard(d1, d2):
+    ret = (float(len(set(d1).intersection(set(d2)))) / 
+           float(len(set(d1).union(set(d2)))))
+    return ret
+
+def weighted_jaccard(d1, d2):
+    term_union = set(d1).union(set(d2))
+    min_sum = max_sum = 0
+    for t in term_union:
+        if t not in d1:
+            max_sum += d2[t]
+        elif t not in d2:
+            max_sum += d1[t]
+        else:
+            min_sum += min(d1[t], d2[t])
+            max_sum += max(d1[t], d2[t])
+    ret = float(min_sum) / float(max_sum)
+    return ret
+
+def cf2freq(d):
+    total = count_total(d)
+    new_d = {}
+    for t in d:
+        new_d[t] = float(d[t]) / float(total)
+    return new_d
+
+def df2idf(d, n):
+    total = n
+    new_d = {}
+    for t in d:
+        new_d[t] = float(n) / float(d[t])
+    return new_d
+
+def filter_freq_dict(freq_d, threshold=0.0001):
+    new_d = {}
+    for t in freq_d:
+        if freq_d[t] > threshold:
+            new_d[t] = freq_d[t]
+    return new_d
+
+def print_results(datasets, results, save_file):
+    f = open(save_file, 'w')
+
+    f.write("\t{}\n".format("\t".join(datasets)))
+    for d1 in datasets:
+        f.write(d1)
+        for d2 in datasets:
+            f.write("\t{:.4f}".format(results[d1][d2]))
+        f.write("\n")
+    f.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--index_path', type=str, help='path to indexes of all the beir dataset', required=True)
+    parser.add_argument('--index_name_format', type=str, help='define your own index dir path name', default="/lucene-index-beir-{}")
+    parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter', default="weight_jaccard")
+    parser.add_argument('--compare_threshold', type=float, help='when choosing df_filter, or tf_filter, you can choolse the threshold', default=0.0001)
+    parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True)
+    args = parser.parse_args()
+
+    #beir_datasets = ['arguana', 'bioasq', 'climate-fever', 'dbpedia-entity', 'fever', 'fiqa', 'hotpotqa', 'nfcorpus', 'nq', 'quora', 'robust04', 'scidocs', 'scifact', 'signal1m', 'trec-covid', 'trec-news', 'webis-touche2020']
+    beir_datasets = ['arguana', 'fiqa']
+    cfs = dfs = stats = {}
+    for d in beir_datasets:
+        cf, df, stat = index2stats(args.index_path + args.index_name_format.format(d))
+        cfs[d] = cf
+        dfs[d] = df
+        stat[d] = stat
+
+    results = {}
+    for d1 in beir_datasets:
+        metric_d1 = {}
+        for d2 in beir_datasets:
+            if d1 == d2:
+                metric_d1[d2] = 1
+            else:
+                if args.compare_metric == "jaccard":
+                    metric_d1[d2] = jaccard(cfs[d1], cfs[d2])
+                elif args.compare_metric == "weight_jaccard":
+                    metric_d1[d2] = weighted_jaccard(cfs[d1], cfs[d2])
+                elif args.compare_metric == "df_filter":
+                    new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
+                    new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
+                    metric_d1[d2] = jaccard(new_d1, new_d2)
+                elif args.compare_metric == "tf_filter":
+                    new_d1 = filter_freq_dict(df2idf(dfs[d1], stat[d1]['documents']))
+                    new_d2 = filter_freq_dict(df2idf(dfs[d2], stat[d2]['documents']))
+                    metric_d1[d2] = jaccard(new_d1, new_d2)
+        results[d1] = metric_d1
+
+    print_results(beir_datasets, results, args.output_path)
+
+
+
+
+
+
+
+
diff --git a/scripts/beir/index_bm25.sh b/scripts/beir/index_bm25.sh
new file mode 100644
index 000000000..d03e36b74
--- /dev/null
+++ b/scripts/beir/index_bm25.sh
@@ -0,0 +1,10 @@
+mkdir -p indexes
+
+for corpora in arguana bioasq climate-fever dbpedia-entity fever hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020
+#for corpora in fiqa
+do
+
+python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
+	-threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/corpus \
+	-index indexes/lucene-index-beir-${corpora} -storePositions -storeDocvectors -storeRaw
+done
diff --git a/scripts/beir/test_compare_domains.sh b/scripts/beir/test_compare_domains.sh
new file mode 100644
index 000000000..17f442b3a
--- /dev/null
+++ b/scripts/beir/test_compare_domains.sh
@@ -0,0 +1,3 @@
+python compare_domains.py \
+    --index_path indexes \
+    --output_path indexes_weight_jaccard 
diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py
index 5bfb7bdc8..abe8352e9 100644
--- a/scripts/beir/tokenize_corpus.py
+++ b/scripts/beir/tokenize_corpus.py
@@ -71,16 +71,16 @@ def process(line):
         title = json_line['title']
         body = json_line['text']
 
-        doc = {"_id": pid,
+        doc = {"id": pid,
                "title":  get_retokenized(bert_tokenizer, title.lower()),
-               "text": get_retokenized(bert_tokenizer, body.lower())}
+               "contents": get_retokenized(bert_tokenizer, body.lower())}
         return doc
     
     res = []
     start = time.time()
     for line in batch:
         res.append(process(line))
-        if len(res) % 1000 == 0:
+        if len(res) % 100000 == 0:
             end = time.time()
             print(f"finish {len(res)} using {end-start}")
             start = end
@@ -103,7 +103,7 @@ def process(line):
                     else:
                         print(f"Ignoring misformatted line {line_num}")
 
-                    if line_num % 100 == 0:
+                    if line_num % 10000 == 0:
                         print(f"Processed {line_num} passages")
 
     print(f"Processed {line_num} passages")
diff --git a/scripts/beir/tokenize_corpus.sh b/scripts/beir/tokenize_corpus.sh
new file mode 100644
index 000000000..7a3064741
--- /dev/null
+++ b/scripts/beir/tokenize_corpus.sh
@@ -0,0 +1,33 @@
+for corpora in arguana bioasq climate-fever dbpedia-entity fever fiqa hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020 
+do
+mkdir -p /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/
+python tokenize_corpus.py \
+	--input /store/collections/beir-v1.0.0/original/${corpora}/corpus.jsonl \
+	--output /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/corpus.jsonl
+
+python tokenize_queries.py \
+	--input /store/collections/beir-v1.0.0/original/${corpora}/queries.jsonl \
+	--output /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/queries.jsonl
+done
+
+for corpora in android  english  gaming  gis  mathematica  physics  programmers  stats  tex  unix  webmasters  wordpress
+do
+mkdir -p /store/scratch/y247xie/00_data/wp-tokenized-anserini/cqadupstack/${corpora}/
+python tokenize_corpus.py \
+	--input /store/collections/beir-v1.0.0/original/cqadupstack/${corpora}/corpus.jsonl \
+	--output /store/scratch/y247xie/00_data/wp-tokenized-anserini/cqadupstack/${corpora}/corpus.jsonl
+done
+
+for corpora in nq
+do
+mkdir -p /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/
+python tokenize_corpus.py \
+	--input /store/scratch/y247xie/00_data/nq/corpus.jsonl \
+	--output /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/corpus.jsonl
+
+python tokenize_queries.py \
+	--input /store/scratch/y247xie/00_data/nq/queries.jsonl \
+	--output /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/queries.jsonl
+
+done
+	
diff --git a/scripts/beir/tokenize_queries.py b/scripts/beir/tokenize_queries.py
index 583c44ffd..02ab3e662 100644
--- a/scripts/beir/tokenize_queries.py
+++ b/scripts/beir/tokenize_queries.py
@@ -68,12 +68,12 @@ def process(line):
             return None
         json_line = json.loads(line)
         pid = json_line['_id']
-        title = json_line['title']
         body = json_line['text']
+        metadata = json_line['metadata']
 
-        doc = {"_id": pid,
-               "title":  get_retokenized(bert_tokenizer, title.lower()),
-               "text": get_retokenized(bert_tokenizer, body.lower())}
+        doc = {"id": pid,
+               "contents": get_retokenized(bert_tokenizer, body.lower()),
+               "metadata":metadata, }
         return doc
     
     res = []
diff --git a/scripts/beir/tokenize_queries.sh b/scripts/beir/tokenize_queries.sh
new file mode 100644
index 000000000..630b32ef4
--- /dev/null
+++ b/scripts/beir/tokenize_queries.sh
@@ -0,0 +1,38 @@
+for corpora in arguana bioasq climate-fever dbpedia-entity fever fiqa hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020 
+do
+
+#for corpora in android  english  gaming  gis  mathematica  physics  programmers  stats  tex  unix  webmasters  wordpress
+#do
+#for corpora in nq
+#do
+#mkdir -p /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/
+#mkdir -p /store/scratch/y247xie/00_data/wp-tokenized/cqadupstack/${corpora}
+#python tokenize_corpus.py \
+#	--input /store/collections/beir-v1.0.0/original/${corpora}/corpus.jsonl \
+#	--output /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/corpus.jsonl
+#python tokenize_corpus.py \
+#	--input /store/collections/beir-v1.0.0/original/cqadupstack/${corpora}/corpus.jsonl \
+#	--output /store/scratch/y247xie/00_data/wp-tokenized/cqadupstack/${corpora}/corpus.jsonl
+python tokenize_queries.py \
+	--input /store/collections/beir-v1.0.0/original/${corpora}/queries.jsonl \
+	--output /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/queries.jsonl
+
+done
+
+for corpora in android  english  gaming  gis  mathematica  physics  programmers  stats  tex  unix  webmasters  wordpress
+do
+python tokenize_queries.py \
+	--input /store/collections/beir-v1.0.0/original/cqadupstack/${corpora}/queries.jsonl \
+	 --output /store/scratch/y247xie/00_data/wp-tokenized/cqadupstack/${corpora}/queries.jsonl
+done
+
+for corpora in nq
+do
+python tokenize_queries.py \
+	--input /store/scratch/y247xie/00_data/nq/queries.jsonl \
+	--output /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/queries.jsonl
+done
+
+
+
+	

From 6bae387be013d15f8ca4fe1fb803bf62e818c760 Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Mon, 2 May 2022 12:15:49 -0400
Subject: [PATCH 05/12] test all the metrics

---
 scripts/beir/compare_domains.py      | 8 ++++----
 scripts/beir/test_compare_domains.sh | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py
index 95c55ceec..c6cc94739 100644
--- a/scripts/beir/compare_domains.py
+++ b/scripts/beir/compare_domains.py
@@ -86,8 +86,8 @@ def print_results(datasets, results, save_file):
     parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True)
     args = parser.parse_args()
 
-    #beir_datasets = ['arguana', 'bioasq', 'climate-fever', 'dbpedia-entity', 'fever', 'fiqa', 'hotpotqa', 'nfcorpus', 'nq', 'quora', 'robust04', 'scidocs', 'scifact', 'signal1m', 'trec-covid', 'trec-news', 'webis-touche2020']
-    beir_datasets = ['arguana', 'fiqa']
+    beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq', 'hotpotqa', 'climate-fever', 'fever', 'dbpedia-entity', 'fiqa', 'signal1m', 'trec-news',  'robust04', 'arguana', 'webis-touche2020', 'quora', 'cqadupstack', 'scidocs', 'scifact']
+    #beir_datasets = ['arguana', 'fiqa']
     cfs = dfs = stats = {}
     for d in beir_datasets:
         cf, df, stat = index2stats(args.index_path + args.index_name_format.format(d))
@@ -111,8 +111,8 @@ def print_results(datasets, results, save_file):
                     new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
                     metric_d1[d2] = jaccard(new_d1, new_d2)
                 elif args.compare_metric == "tf_filter":
-                    new_d1 = filter_freq_dict(df2idf(dfs[d1], stat[d1]['documents']))
-                    new_d2 = filter_freq_dict(df2idf(dfs[d2], stat[d2]['documents']))
+                    new_d1 = filter_freq_dict(df2idf(dfs[d1], 1))
+                    new_d2 = filter_freq_dict(df2idf(dfs[d2], 1))
                     metric_d1[d2] = jaccard(new_d1, new_d2)
         results[d1] = metric_d1
 
diff --git a/scripts/beir/test_compare_domains.sh b/scripts/beir/test_compare_domains.sh
index 17f442b3a..ce4bdfc1b 100644
--- a/scripts/beir/test_compare_domains.sh
+++ b/scripts/beir/test_compare_domains.sh
@@ -1,3 +1,4 @@
 python compare_domains.py \
     --index_path indexes \
-    --output_path indexes_weight_jaccard 
+    --output_path indexes_df_filter.tsv \
+    --compare_metric df_filter 

From c7eb404cde7b323921cc852f83a3e03f61e89a5d Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Mon, 2 May 2022 12:21:01 -0400
Subject: [PATCH 06/12] use beir format

---
 scripts/beir/tokenize_corpus.py  | 4 ++--
 scripts/beir/tokenize_queries.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py
index abe8352e9..b7439ffbc 100644
--- a/scripts/beir/tokenize_corpus.py
+++ b/scripts/beir/tokenize_corpus.py
@@ -71,9 +71,9 @@ def process(line):
         title = json_line['title']
         body = json_line['text']
 
-        doc = {"id": pid,
+        doc = {"_id": pid,
                "title":  get_retokenized(bert_tokenizer, title.lower()),
-               "contents": get_retokenized(bert_tokenizer, body.lower())}
+               "text": get_retokenized(bert_tokenizer, body.lower())}
         return doc
     
     res = []
diff --git a/scripts/beir/tokenize_queries.py b/scripts/beir/tokenize_queries.py
index 02ab3e662..bc47df394 100644
--- a/scripts/beir/tokenize_queries.py
+++ b/scripts/beir/tokenize_queries.py
@@ -71,8 +71,8 @@ def process(line):
         body = json_line['text']
         metadata = json_line['metadata']
 
-        doc = {"id": pid,
-               "contents": get_retokenized(bert_tokenizer, body.lower()),
+        doc = {"_id": pid,
+               "text": get_retokenized(bert_tokenizer, body.lower()),
                "metadata":metadata, }
         return doc
     

From 271ac2a2cdc5029396e71327adfe947a76b4d67e Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Mon, 2 May 2022 12:23:37 -0400
Subject: [PATCH 07/12] update with BeirFlatCollection

---
 scripts/beir/index_bm25.sh | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/scripts/beir/index_bm25.sh b/scripts/beir/index_bm25.sh
index d03e36b74..80b90efa4 100644
--- a/scripts/beir/index_bm25.sh
+++ b/scripts/beir/index_bm25.sh
@@ -1,10 +1,12 @@
-mkdir -p indexes
+#mkdir -p indexes
 
-for corpora in arguana bioasq climate-fever dbpedia-entity fever hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020
-#for corpora in fiqa
-do
+#for corpora in arguana bioasq climate-fever dbpedia-entity fever hotpotqa nfcorpus quora robust04 scidocs scifact signal1m trec-covid trec-news webis-touche2020 fiqa nq
+#do
 
-python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
-	-threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized-anserini/${corpora}/corpus \
-	-index indexes/lucene-index-beir-${corpora} -storePositions -storeDocvectors -storeRaw
+#for corpora in android  english  gaming  gis  mathematica  physics  programmers  stats  tex  unix  webmasters  wordpress
+for corpora in fiqa
+do
+python -m pyserini.index -collection BeirFlatCollection -generator DefaultLuceneDocumentGenerator \
+	-threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/corpus \
+	-index indexes/lucene-index-beir-${corpora}_ -storePositions -storeDocvectors -storeRaw -pretokenized
 done

From 7ece3468b6d4593f21ab85fd9b2dd9dcf203b25e Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Tue, 10 May 2022 17:04:20 -0400
Subject: [PATCH 08/12] add kl-divergence and msmarco

---
 scripts/beir/compare_domains.py         | 20 ++++++++++++++++----
 scripts/beir/index_bm25.sh              |  9 +++++----
 scripts/beir/indexes_df_filter.tsv      | 20 ++++++++++++++++++++
 scripts/beir/indexes_kl_divergence.tsv  | 20 ++++++++++++++++++++
 scripts/beir/indexes_tf_filter.tsv      | 20 ++++++++++++++++++++
 scripts/beir/indexes_weight_jaccard.tsv | 20 ++++++++++++++++++++
 scripts/beir/test_compare_domains.sh    |  7 +++++--
 7 files changed, 106 insertions(+), 10 deletions(-)
 create mode 100644 scripts/beir/indexes_df_filter.tsv
 create mode 100644 scripts/beir/indexes_kl_divergence.tsv
 create mode 100644 scripts/beir/indexes_tf_filter.tsv
 create mode 100644 scripts/beir/indexes_weight_jaccard.tsv

diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py
index c6cc94739..42c99128f 100644
--- a/scripts/beir/compare_domains.py
+++ b/scripts/beir/compare_domains.py
@@ -1,4 +1,6 @@
 import argparse
+import numpy as np
+
 from pyserini.index.lucene import IndexReader
 
 
@@ -24,6 +26,12 @@ def count_total(d):
         s += d[t]
     return s
 
+def kl_divergence(d1, d2):
+    value = float(0)
+    for w in d1:
+        if w in d2:
+            value += d1[w] * np.log(d1[w] / d2[w])
+    return value
 
 def jaccard(d1, d2):
     ret = (float(len(set(d1).intersection(set(d2)))) / 
@@ -81,18 +89,18 @@ def print_results(datasets, results, save_file):
     parser = argparse.ArgumentParser()
     parser.add_argument('--index_path', type=str, help='path to indexes of all the beir dataset', required=True)
     parser.add_argument('--index_name_format', type=str, help='define your own index dir path name', default="/lucene-index-beir-{}")
-    parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter', default="weight_jaccard")
+    parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter, kl_divergence', default="weight_jaccard")
     parser.add_argument('--compare_threshold', type=float, help='when choosing df_filter, or tf_filter, you can choolse the threshold', default=0.0001)
     parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True)
     args = parser.parse_args()
 
-    beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq', 'hotpotqa', 'climate-fever', 'fever', 'dbpedia-entity', 'fiqa', 'signal1m', 'trec-news',  'robust04', 'arguana', 'webis-touche2020', 'quora', 'cqadupstack', 'scidocs', 'scifact']
+    beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq', 'hotpotqa', 'climate-fever', 'fever', 'dbpedia-entity', 'fiqa', 'signal1m', 'trec-news',  'robust04', 'arguana', 'webis-touche2020', 'quora', 'cqadupstack', 'scidocs', 'scifact', 'msmarco']
     #beir_datasets = ['arguana', 'fiqa']
     cfs = dfs = stats = {}
     for d in beir_datasets:
         cf, df, stat = index2stats(args.index_path + args.index_name_format.format(d))
-        cfs[d] = cf
-        dfs[d] = df
+        cfs[d] = cf # count frequency -- int
+        dfs[d] = df # document frequency -- int
         stat[d] = stat
 
     results = {}
@@ -114,6 +122,10 @@ def print_results(datasets, results, save_file):
                     new_d1 = filter_freq_dict(df2idf(dfs[d1], 1))
                     new_d2 = filter_freq_dict(df2idf(dfs[d2], 1))
                     metric_d1[d2] = jaccard(new_d1, new_d2)
+                elif args.compare_metric == "kl_divergence":
+                    new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
+                    new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
+                    metric_d1[d2] = kl_divergence(new_d1, new_d2)
         results[d1] = metric_d1
 
     print_results(beir_datasets, results, args.output_path)
diff --git a/scripts/beir/index_bm25.sh b/scripts/beir/index_bm25.sh
index 80b90efa4..f46598388 100644
--- a/scripts/beir/index_bm25.sh
+++ b/scripts/beir/index_bm25.sh
@@ -4,9 +4,10 @@
 #do
 
 #for corpora in android  english  gaming  gis  mathematica  physics  programmers  stats  tex  unix  webmasters  wordpress
-for corpora in fiqa
+#for corpora in fiqa # BeirFlatCollection
+for corpora in msmarco
 do
-python -m pyserini.index -collection BeirFlatCollection -generator DefaultLuceneDocumentGenerator \
-	-threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized/${corpora}/corpus \
-	-index indexes/lucene-index-beir-${corpora}_ -storePositions -storeDocvectors -storeRaw -pretokenized
+python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
+	-threads 20 -input /store/scratch/y247xie/00_data/wp-tokenized/${corpora} \
+	-index indexes/lucene-index-beir-${corpora} -storePositions -storeDocvectors -storeRaw -pretokenized
 done
diff --git a/scripts/beir/indexes_df_filter.tsv b/scripts/beir/indexes_df_filter.tsv
new file mode 100644
index 000000000..0a408a408
--- /dev/null
+++ b/scripts/beir/indexes_df_filter.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	1.0000	0.6805	0.5963	0.3362	0.2377	0.2603	0.2604	0.2334	0.2451	0.2251	0.2787	0.3180	0.2927	0.2760	0.2149	0.2772	0.4241	0.6367	0.3485
+bioasq	0.6805	1.0000	0.6533	0.3165	0.2267	0.2464	0.2465	0.2242	0.2259	0.2182	0.2541	0.2943	0.2567	0.2586	0.2031	0.2753	0.3823	0.7174	0.3335
+nfcorpus	0.5963	0.6533	1.0000	0.3165	0.2207	0.2400	0.2401	0.2187	0.2338	0.2096	0.2508	0.2908	0.2665	0.2540	0.2047	0.2594	0.3541	0.6115	0.3318
+nq	0.3362	0.3165	0.3165	1.0000	0.4995	0.5694	0.5689	0.4902	0.3926	0.3710	0.5455	0.5439	0.4421	0.4387	0.3717	0.3437	0.3329	0.2937	0.5508
+hotpotqa	0.2377	0.2267	0.2207	0.4995	1.0000	0.8334	0.8340	0.9355	0.2344	0.3446	0.3469	0.3274	0.2709	0.2543	0.2790	0.2427	0.2235	0.2107	0.3767
+climate-fever	0.2603	0.2464	0.2400	0.5694	0.8334	1.0000	0.9993	0.8253	0.2640	0.3549	0.3949	0.3762	0.3064	0.2862	0.2933	0.2677	0.2449	0.2280	0.4122
+fever	0.2604	0.2465	0.2401	0.5689	0.8340	0.9993	1.0000	0.8258	0.2636	0.3544	0.3945	0.3758	0.3060	0.2858	0.2935	0.2673	0.2450	0.2281	0.4117
+dbpedia-entity	0.2334	0.2242	0.2187	0.4902	0.9355	0.8253	0.8258	1.0000	0.2255	0.3425	0.3405	0.3231	0.2635	0.2463	0.2751	0.2402	0.2177	0.2084	0.3691
+fiqa	0.2451	0.2259	0.2338	0.3926	0.2344	0.2640	0.2636	0.2255	1.0000	0.3077	0.4699	0.4494	0.4356	0.5031	0.4116	0.4062	0.3054	0.2202	0.4680
+signal1m	0.2251	0.2182	0.2096	0.3710	0.3446	0.3549	0.3544	0.3425	0.3077	1.0000	0.4079	0.3431	0.2916	0.3211	0.3555	0.3037	0.2081	0.2048	0.3654
+trec-news	0.2787	0.2541	0.2508	0.5455	0.3469	0.3949	0.3945	0.3405	0.4699	0.4079	1.0000	0.5913	0.4754	0.5179	0.3765	0.3568	0.3016	0.2400	0.5027
+robust04	0.3180	0.2943	0.2908	0.5439	0.3274	0.3762	0.3758	0.3231	0.4494	0.3431	0.5913	1.0000	0.4845	0.4528	0.3275	0.3324	0.3199	0.2800	0.4757
+arguana	0.2927	0.2567	0.2665	0.4421	0.2709	0.3064	0.3060	0.2635	0.4356	0.2916	0.4754	0.4845	1.0000	0.5555	0.3461	0.3116	0.3413	0.2539	0.4240
+webis-touche2020	0.2760	0.2586	0.2540	0.4387	0.2543	0.2862	0.2858	0.2463	0.5031	0.3211	0.5179	0.4528	0.5555	1.0000	0.3838	0.3866	0.3364	0.2508	0.4530
+quora	0.2149	0.2031	0.2047	0.3717	0.2790	0.2933	0.2935	0.2751	0.4116	0.3555	0.3765	0.3275	0.3461	0.3838	1.0000	0.3551	0.2515	0.1920	0.4549
+cqadupstack	0.2772	0.2753	0.2594	0.3437	0.2427	0.2677	0.2673	0.2402	0.4062	0.3037	0.3568	0.3324	0.3116	0.3866	0.3551	1.0000	0.3632	0.2598	0.4230
+scidocs	0.4241	0.3823	0.3541	0.3329	0.2235	0.2449	0.2450	0.2177	0.3054	0.2081	0.3016	0.3199	0.3413	0.3364	0.2515	0.3632	1.0000	0.3796	0.3599
+scifact	0.6367	0.7174	0.6115	0.2937	0.2107	0.2280	0.2281	0.2084	0.2202	0.2048	0.2400	0.2800	0.2539	0.2508	0.1920	0.2598	0.3796	1.0000	0.3112
+msmarco	0.3485	0.3335	0.3318	0.5508	0.3767	0.4122	0.4117	0.3691	0.4680	0.3654	0.5027	0.4757	0.4240	0.4530	0.4549	0.4230	0.3599	0.3112	1.0000
diff --git a/scripts/beir/indexes_kl_divergence.tsv b/scripts/beir/indexes_kl_divergence.tsv
new file mode 100644
index 000000000..476033474
--- /dev/null
+++ b/scripts/beir/indexes_kl_divergence.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	1.0000	0.0448	0.0336	0.1378	0.1141	0.1268	0.1268	0.1026	0.0769	0.1147	0.2565	0.1991	0.0948	0.1706	0.1257	0.0665	0.0856	0.0569	0.1137
+bioasq	0.0609	1.0000	0.0194	0.1555	0.1295	0.1447	0.1447	0.1217	0.0999	0.1345	0.2773	0.2233	0.1397	0.1957	0.1517	0.0833	0.1371	0.0239	0.1279
+nfcorpus	0.1186	0.0863	1.0000	0.1908	0.1527	0.1706	0.1706	0.1435	0.1212	0.1509	0.3080	0.2669	0.1566	0.2258	0.1755	0.1044	0.1849	0.1257	0.1794
+nq	0.1812	0.1576	0.1643	1.0000	0.0616	0.0384	0.0383	0.0815	0.0574	0.1851	0.2639	0.1940	0.0632	0.1516	0.1579	0.0737	0.1527	0.1855	0.0489
+hotpotqa	0.2694	0.2197	0.2561	0.1654	1.0000	0.0588	0.0588	0.0050	0.1574	0.2180	0.4821	0.3698	0.2383	0.3233	0.1839	0.1503	0.2188	0.2563	0.1190
+climate-fever	0.1456	0.1090	0.1328	0.0707	-0.0202	1.0000	-0.0000	-0.0222	0.0586	0.1299	0.3001	0.2246	0.1211	0.1858	0.1071	0.0644	0.1062	0.1335	0.0523
+fever	0.1456	0.1090	0.1328	0.0708	-0.0202	0.0000	1.0000	-0.0222	0.0588	0.1300	0.3002	0.2247	0.1212	0.1858	0.1071	0.0645	0.1062	0.1335	0.0523
+dbpedia-entity	0.2553	0.2071	0.2423	0.1618	0.0013	0.0604	0.0604	1.0000	0.1551	0.2109	0.4672	0.3603	0.2314	0.3199	0.1803	0.1449	0.2111	0.2426	0.1218
+fiqa	0.3243	0.3136	0.3265	0.3440	0.3525	0.3592	0.3582	0.3496	1.0000	0.3400	0.4209	0.4279	0.2379	0.2018	0.2046	0.1204	0.3096	0.3201	0.2314
+signal1m	0.3184	0.2348	0.2533	0.2131	0.2702	0.2878	0.2878	0.2469	0.2958	1.0000	0.4628	0.2995	0.2113	0.3399	0.1300	0.2227	0.3798	0.2558	0.1895
+trec-news	0.0620	0.0436	0.0422	0.0681	0.0835	0.0679	0.0677	0.0795	-0.0693	0.1217	1.0000	0.0441	-0.0241	-0.0209	0.0574	-0.0204	0.0600	0.0454	0.0254
+robust04	0.0416	0.0256	0.0194	0.0370	0.0584	0.0599	0.0595	0.0594	-0.0747	0.0811	0.0656	1.0000	-0.0266	-0.0126	0.0488	-0.0377	0.0326	0.0366	0.0053
+arguana	0.2747	0.2292	0.2534	0.2178	0.2459	0.2374	0.2373	0.2517	0.0637	0.2840	0.3115	0.2616	1.0000	0.1559	0.1798	0.1154	0.2201	0.2417	0.1416
+webis-touche2020	0.1759	0.1677	0.1734	0.1959	0.2135	0.2161	0.2161	0.2093	-0.0209	0.2110	0.2116	0.2317	0.0718	1.0000	0.1017	0.0445	0.1768	0.1711	0.1255
+quora	0.6807	0.9262	0.7167	1.0242	0.5189	0.6385	0.6385	0.5243	0.5093	0.7571	0.9984	1.0744	0.7696	0.6971	1.0000	0.4465	1.0338	0.6816	0.7129
+cqadupstack	0.3405	0.3268	0.3328	0.3428	0.3404	0.3548	0.3544	0.3385	0.1287	0.3009	0.4360	0.4501	0.2323	0.2593	0.1877	1.0000	0.3545	0.3202	0.2640
+scidocs	0.1595	0.1440	0.1419	0.1746	0.1613	0.1796	0.1796	0.1526	0.1121	0.1258	0.3094	0.2685	0.1335	0.1828	0.1601	0.1383	1.0000	0.1584	0.1128
+scifact	0.0915	0.0563	0.0574	0.1600	0.1297	0.1442	0.1442	0.1215	0.0911	0.1241	0.2662	0.2197	0.1328	0.1859	0.1368	0.0746	0.1293	1.0000	0.1346
+msmarco	0.2116	0.1982	0.2083	0.1484	0.1556	0.1780	0.1779	0.1526	0.0356	0.1836	0.3661	0.3101	0.1353	0.1968	0.1138	0.0810	0.1784	0.2204	1.0000
diff --git a/scripts/beir/indexes_tf_filter.tsv b/scripts/beir/indexes_tf_filter.tsv
new file mode 100644
index 000000000..957e1a473
--- /dev/null
+++ b/scripts/beir/indexes_tf_filter.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	1.0000	0.5225	0.5367	0.7852	0.7695	0.6963	0.6962	0.7722	0.8216	0.8511	0.7654	0.7957	0.7335	0.8538	0.8450	0.8544	0.8116	0.5629	0.6933
+bioasq	0.5225	1.0000	0.1540	0.6365	0.6223	0.6332	0.6332	0.6220	0.4986	0.6195	0.6504	0.6609	0.4141	0.6220	0.5379	0.5949	0.4224	0.1639	0.7079
+nfcorpus	0.5367	0.1540	1.0000	0.3979	0.3977	0.3425	0.3424	0.3994	0.5425	0.4687	0.3922	0.4194	0.5547	0.4648	0.5216	0.4705	0.5858	0.7047	0.3258
+nq	0.7852	0.6365	0.3979	1.0000	0.9263	0.8870	0.8869	0.9260	0.7417	0.8924	0.8971	0.8946	0.6171	0.8980	0.8011	0.8941	0.6824	0.4205	0.8288
+hotpotqa	0.7695	0.6223	0.3977	0.9263	1.0000	0.9088	0.9086	0.9868	0.7290	0.8748	0.8540	0.8523	0.6093	0.8689	0.7865	0.8719	0.6740	0.4207	0.8053
+climate-fever	0.6963	0.6332	0.3425	0.8870	0.9088	1.0000	0.9998	0.9053	0.6556	0.8019	0.8279	0.8134	0.5391	0.8035	0.7134	0.7997	0.6032	0.3634	0.8089
+fever	0.6962	0.6332	0.3424	0.8869	0.9086	0.9998	1.0000	0.9051	0.6556	0.8019	0.8277	0.8135	0.5392	0.8034	0.7133	0.7996	0.6031	0.3633	0.8090
+dbpedia-entity	0.7722	0.6220	0.3994	0.9260	0.9868	0.9053	0.9051	1.0000	0.7312	0.8771	0.8534	0.8530	0.6119	0.8693	0.7896	0.8735	0.6768	0.4225	0.8040
+fiqa	0.8216	0.4986	0.5425	0.7417	0.7290	0.6556	0.6556	0.7312	1.0000	0.8267	0.7301	0.7637	0.7555	0.8191	0.8492	0.8219	0.7839	0.5677	0.6590
+signal1m	0.8511	0.6195	0.4687	0.8924	0.8748	0.8019	0.8019	0.8771	0.8267	1.0000	0.8723	0.8941	0.7018	0.9403	0.8771	0.9303	0.7535	0.4918	0.7968
+trec-news	0.7654	0.6504	0.3922	0.8971	0.8540	0.8279	0.8277	0.8534	0.7301	0.8723	1.0000	0.9119	0.6057	0.8787	0.7784	0.8435	0.6608	0.4139	0.8578
+robust04	0.7957	0.6609	0.4194	0.8946	0.8523	0.8134	0.8135	0.8530	0.7637	0.8941	0.9119	1.0000	0.6504	0.9016	0.8087	0.8625	0.6967	0.4421	0.8584
+arguana	0.7335	0.4141	0.5547	0.6171	0.6093	0.5391	0.5392	0.6119	0.7555	0.7018	0.6057	0.6504	1.0000	0.6962	0.7429	0.6963	0.7329	0.5798	0.5498
+webis-touche2020	0.8538	0.6220	0.4648	0.8980	0.8689	0.8035	0.8034	0.8693	0.8191	0.9403	0.8787	0.9016	0.6962	1.0000	0.8729	0.9344	0.7545	0.4876	0.8016
+quora	0.8450	0.5379	0.5216	0.8011	0.7865	0.7134	0.7133	0.7896	0.8492	0.8771	0.7784	0.8087	0.7429	0.8729	1.0000	0.8732	0.7931	0.5468	0.7044
+cqadupstack	0.8544	0.5949	0.4705	0.8941	0.8719	0.7997	0.7996	0.8735	0.8219	0.9303	0.8435	0.8625	0.6963	0.9344	0.8732	1.0000	0.7626	0.4936	0.7674
+scidocs	0.8116	0.4224	0.5858	0.6824	0.6740	0.6032	0.6031	0.6768	0.7839	0.7535	0.6608	0.6967	0.7329	0.7545	0.7931	0.7626	1.0000	0.6193	0.5921
+scifact	0.5629	0.1639	0.7047	0.4205	0.4207	0.3634	0.3633	0.4225	0.5677	0.4918	0.4139	0.4421	0.5798	0.4876	0.5468	0.4936	0.6193	1.0000	0.3492
+msmarco	0.6933	0.7079	0.3258	0.8288	0.8053	0.8089	0.8090	0.8040	0.6590	0.7968	0.8578	0.8584	0.5498	0.8016	0.7044	0.7674	0.5921	0.3492	1.0000
diff --git a/scripts/beir/indexes_weight_jaccard.tsv b/scripts/beir/indexes_weight_jaccard.tsv
new file mode 100644
index 000000000..113587c1d
--- /dev/null
+++ b/scripts/beir/indexes_weight_jaccard.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	1.0000	0.0089	0.0303	0.1014	0.0750	0.0503	0.0503	0.0764	0.1929	0.1872	0.0872	0.1254	0.0505	0.2383	0.1906	0.2560	0.1527	0.0416	0.0442
+bioasq	0.0089	1.0000	0.0003	0.0728	0.0831	0.1248	0.1248	0.0804	0.0025	0.0240	0.0719	0.0542	0.0005	0.0244	0.0032	0.0192	0.0014	0.0004	0.1717
+nfcorpus	0.0303	0.0003	1.0000	0.0033	0.0026	0.0016	0.0016	0.0027	0.0860	0.0093	0.0030	0.0044	0.2769	0.0101	0.0691	0.0127	0.1684	0.5651	0.0014
+nq	0.1014	0.0728	0.0033	1.0000	0.5244	0.4535	0.4535	0.5248	0.0312	0.2665	0.5252	0.5363	0.0059	0.2781	0.0353	0.2028	0.0175	0.0046	0.3755
+hotpotqa	0.0750	0.0831	0.0026	0.5244	1.0000	0.6130	0.6130	0.9267	0.0237	0.2075	0.3535	0.3389	0.0046	0.1766	0.0263	0.1429	0.0136	0.0036	0.3808
+climate-fever	0.0503	0.1248	0.0016	0.4535	0.6130	1.0000	1.0000	0.5920	0.0152	0.1429	0.3620	0.3013	0.0028	0.1364	0.0172	0.1042	0.0085	0.0022	0.4879
+fever	0.0503	0.1248	0.0016	0.4535	0.6130	1.0000	1.0000	0.5921	0.0152	0.1429	0.3620	0.3013	0.0028	0.1364	0.0172	0.1042	0.0085	0.0022	0.4879
+dbpedia-entity	0.0764	0.0804	0.0027	0.5248	0.9267	0.5920	0.5921	1.0000	0.0244	0.2132	0.3528	0.3412	0.0047	0.1776	0.0271	0.1471	0.0140	0.0037	0.3719
+fiqa	0.1929	0.0025	0.0860	0.0312	0.0237	0.0152	0.0152	0.0244	1.0000	0.0878	0.0277	0.0411	0.1710	0.0938	0.4108	0.1154	0.3184	0.1097	0.0128
+signal1m	0.1872	0.0240	0.0093	0.2665	0.2075	0.1429	0.1429	0.2132	0.0878	1.0000	0.2393	0.3082	0.0170	0.3544	0.1020	0.3399	0.0439	0.0123	0.1260
+trec-news	0.0872	0.0719	0.0030	0.5252	0.3535	0.3620	0.3620	0.3528	0.0277	0.2393	1.0000	0.5535	0.0052	0.2749	0.0335	0.1939	0.0154	0.0040	0.3720
+robust04	0.1254	0.0542	0.0044	0.5363	0.3389	0.3013	0.3013	0.3412	0.0411	0.3082	0.5535	1.0000	0.0076	0.3681	0.0467	0.2616	0.0228	0.0060	0.2788
+arguana	0.0505	0.0005	0.2769	0.0059	0.0046	0.0028	0.0028	0.0047	0.1710	0.0170	0.0052	0.0076	1.0000	0.0179	0.1292	0.0225	0.2531	0.3048	0.0024
+webis-touche2020	0.2383	0.0244	0.0101	0.2781	0.1766	0.1364	0.1364	0.1776	0.0938	0.3544	0.2749	0.3681	0.0179	1.0000	0.1087	0.4355	0.0514	0.0137	0.1290
+quora	0.1906	0.0032	0.0691	0.0353	0.0263	0.0172	0.0172	0.0271	0.4108	0.1020	0.0335	0.0467	0.1292	0.1087	1.0000	0.1418	0.2441	0.0885	0.0158
+cqadupstack	0.2560	0.0192	0.0127	0.2028	0.1429	0.1042	0.1042	0.1471	0.1154	0.3399	0.1939	0.2616	0.0225	0.4355	0.1418	1.0000	0.0672	0.0171	0.0996
+scidocs	0.1527	0.0014	0.1684	0.0175	0.0136	0.0085	0.0085	0.0140	0.3184	0.0439	0.0154	0.0228	0.2531	0.0514	0.2441	0.0672	1.0000	0.2217	0.0072
+scifact	0.0416	0.0004	0.5651	0.0046	0.0036	0.0022	0.0022	0.0037	0.1097	0.0123	0.0040	0.0060	0.3048	0.0137	0.0885	0.0171	0.2217	1.0000	0.0019
+msmarco	0.0442	0.1717	0.0014	0.3755	0.3808	0.4879	0.4879	0.3719	0.0128	0.1260	0.3720	0.2788	0.0024	0.1290	0.0158	0.0996	0.0072	0.0019	1.0000
diff --git a/scripts/beir/test_compare_domains.sh b/scripts/beir/test_compare_domains.sh
index ce4bdfc1b..e7ea8be5c 100644
--- a/scripts/beir/test_compare_domains.sh
+++ b/scripts/beir/test_compare_domains.sh
@@ -1,4 +1,7 @@
+for metric in weight_jaccard kl_divergence tf_filter df_filter
+do
 python compare_domains.py \
     --index_path indexes \
-    --output_path indexes_df_filter.tsv \
-    --compare_metric df_filter 
+    --output_path indexes_${metric}.tsv \
+    --compare_metric ${metric}
+done

From 1bf765faa628d9c3f69c42a28e34f64db1e54b55 Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Wed, 11 May 2022 13:21:44 -0400
Subject: [PATCH 09/12] add js divergence, fix weighted_jaccard, add tokenizer
 option

---
 scripts/beir/compare_domains.py         | 30 +++++++++++++++++--
 scripts/beir/indexes_js_divergence.tsv  | 20 +++++++++++++
 scripts/beir/indexes_weight_jaccard.tsv | 38 ++++++++++++-------------
 scripts/beir/tokenize_corpus.py         |  5 ++--
 4 files changed, 69 insertions(+), 24 deletions(-)
 create mode 100644 scripts/beir/indexes_js_divergence.tsv

diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py
index 42c99128f..247854616 100644
--- a/scripts/beir/compare_domains.py
+++ b/scripts/beir/compare_domains.py
@@ -33,6 +33,19 @@ def kl_divergence(d1, d2):
             value += d1[w] * np.log(d1[w] / d2[w])
     return value
 
+def js_divergence(d1, d2):  # Jensen-Shannon divergence between two dicts mapping key -> numeric weight
+    mean = {}  # mixture: per-key average of d1 and d2 (a key missing on one side adds nothing from that side)
+    for w in d1:
+        mean[w] = d1[w] * 0.5
+    for w in d2:
+        if w in mean:
+            mean[w] += d2[w] * 0.5
+        else:
+            mean[w] = d2[w] * 0.5
+
+    jsd = 0.5 *  (kl_divergence(d1, mean) + kl_divergence(d2, mean))  # mean holds every key of d1 and d2
+    return jsd
+
 def jaccard(d1, d2):
     ret = (float(len(set(d1).intersection(set(d2)))) / 
            float(len(set(d1).union(set(d2)))))
@@ -89,7 +102,7 @@ def print_results(datasets, results, save_file):
     parser = argparse.ArgumentParser()
     parser.add_argument('--index_path', type=str, help='path to indexes of all the beir dataset', required=True)
     parser.add_argument('--index_name_format', type=str, help='define your own index dir path name', default="/lucene-index-beir-{}")
-    parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter, kl_divergence', default="weight_jaccard")
+    parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter, kl_divergence, js_divergence', default="weight_jaccard")
     parser.add_argument('--compare_threshold', type=float, help='when choosing df_filter, or tf_filter, you can choolse the threshold', default=0.0001)
     parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True)
     args = parser.parse_args()
@@ -108,12 +121,17 @@ def print_results(datasets, results, save_file):
         metric_d1 = {}
         for d2 in beir_datasets:
             if d1 == d2:
-                metric_d1[d2] = 1
+                if args.compare_metric in ["jaccard", "weight_jaccard", "df_filter", "tf_filter"]:
+                    metric_d1[d2] = 1
+                elif args.compare_metric in ["kl_divergence", "js_divergence"]:
+                    metric_d1[d2] = 0
             else:
                 if args.compare_metric == "jaccard":
                     metric_d1[d2] = jaccard(cfs[d1], cfs[d2])
                 elif args.compare_metric == "weight_jaccard":
-                    metric_d1[d2] = weighted_jaccard(cfs[d1], cfs[d2])
+                    new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
+                    new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
+                    metric_d1[d2] = weighted_jaccard(new_d1, new_d2)
                 elif args.compare_metric == "df_filter":
                     new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
                     new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
@@ -126,6 +144,12 @@ def print_results(datasets, results, save_file):
                     new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
                     new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
                     metric_d1[d2] = kl_divergence(new_d1, new_d2)
+                elif args.compare_metric == "js_divergence":
+                    new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
+                    new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
+                    metric_d1[d2] = js_divergence(new_d1, new_d2)
+                else:
+                    raise NotImplementedError
         results[d1] = metric_d1
 
     print_results(beir_datasets, results, args.output_path)
diff --git a/scripts/beir/indexes_js_divergence.tsv b/scripts/beir/indexes_js_divergence.tsv
new file mode 100644
index 000000000..60a55173c
--- /dev/null
+++ b/scripts/beir/indexes_js_divergence.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	1.0000	0.0546	0.0839	0.1642	0.2218	0.2017	0.2016	0.2221	0.2382	0.2489	0.2108	0.1854	0.2013	0.2163	0.2808	0.2252	0.1392	0.0701	0.1624
+bioasq	0.0546	1.0000	0.0603	0.1658	0.2205	0.2011	0.2011	0.2198	0.2425	0.2449	0.2189	0.1922	0.2182	0.2217	0.2816	0.2191	0.1519	0.0423	0.1618
+nfcorpus	0.0839	0.0603	1.0000	0.1814	0.2409	0.2211	0.2211	0.2402	0.2556	0.2684	0.2329	0.2050	0.2236	0.2356	0.2987	0.2441	0.1775	0.0782	0.1776
+nq	0.1642	0.1658	0.1814	1.0000	0.0816	0.0633	0.0634	0.0821	0.1366	0.1482	0.0943	0.0854	0.1052	0.1224	0.1754	0.1559	0.1576	0.1849	0.0672
+hotpotqa	0.2218	0.2205	0.2409	0.0816	1.0000	0.0201	0.0201	0.0049	0.2208	0.1691	0.1765	0.1673	0.1931	0.2166	0.2170	0.2207	0.2222	0.2379	0.1214
+climate-fever	0.2017	0.2011	0.2211	0.0633	0.0201	1.0000	0.0000	0.0198	0.1984	0.1606	0.1437	0.1355	0.1701	0.1893	0.2078	0.2005	0.2019	0.2193	0.1060
+fever	0.2016	0.2011	0.2211	0.0634	0.0201	0.0000	1.0000	0.0198	0.1985	0.1606	0.1438	0.1356	0.1702	0.1894	0.2078	0.2006	0.2019	0.2193	0.1061
+dbpedia-entity	0.2221	0.2198	0.2402	0.0821	0.0049	0.0198	0.0198	1.0000	0.2229	0.1685	0.1764	0.1670	0.1943	0.2184	0.2183	0.2201	0.2237	0.2377	0.1218
+fiqa	0.2382	0.2425	0.2556	0.1366	0.2208	0.1984	0.1985	0.2229	1.0000	0.1826	0.1164	0.1233	0.1182	0.0883	0.1501	0.1238	0.2003	0.2568	0.0970
+signal1m	0.2489	0.2449	0.2684	0.1482	0.1691	0.1606	0.1606	0.1685	0.1826	1.0000	0.1560	0.1744	0.2006	0.1852	0.1916	0.1812	0.2486	0.2621	0.1393
+trec-news	0.2108	0.2189	0.2329	0.0943	0.1765	0.1437	0.1438	0.1764	0.1164	0.1560	1.0000	0.0696	0.1098	0.0908	0.1901	0.1623	0.1953	0.2341	0.1108
+robust04	0.1854	0.1922	0.2050	0.0854	0.1673	0.1355	0.1356	0.1670	0.1233	0.1744	0.0696	1.0000	0.1082	0.1147	0.2092	0.1736	0.1809	0.2089	0.1086
+arguana	0.2013	0.2182	0.2236	0.1052	0.1931	0.1701	0.1702	0.1943	0.1182	0.2006	0.1098	0.1082	1.0000	0.0857	0.1902	0.1766	0.1702	0.2274	0.1143
+webis-touche2020	0.2163	0.2217	0.2356	0.1224	0.2166	0.1893	0.1894	0.2184	0.0883	0.1852	0.0908	0.1147	0.0857	1.0000	0.1707	0.1352	0.1846	0.2341	0.1122
+quora	0.2808	0.2816	0.2987	0.1754	0.2170	0.2078	0.2078	0.2183	0.1501	0.1916	0.1901	0.2092	0.1902	0.1707	1.0000	0.1669	0.2471	0.2970	0.1350
+cqadupstack	0.2252	0.2191	0.2441	0.1559	0.2207	0.2005	0.2006	0.2201	0.1238	0.1812	0.1623	0.1736	0.1766	0.1352	0.1669	1.0000	0.1833	0.2358	0.1205
+scidocs	0.1392	0.1519	0.1775	0.1576	0.2222	0.2019	0.2019	0.2237	0.2003	0.2486	0.1953	0.1809	0.1702	0.1846	0.2471	0.1833	1.0000	0.1602	0.1482
+scifact	0.0701	0.0423	0.0782	0.1849	0.2379	0.2193	0.2193	0.2377	0.2568	0.2621	0.2341	0.2089	0.2274	0.2341	0.2970	0.2358	0.1602	1.0000	0.1805
+msmarco	0.1624	0.1618	0.1776	0.0672	0.1214	0.1060	0.1061	0.1218	0.0970	0.1393	0.1108	0.1086	0.1143	0.1122	0.1350	0.1205	0.1482	0.1805	1.0000
diff --git a/scripts/beir/indexes_weight_jaccard.tsv b/scripts/beir/indexes_weight_jaccard.tsv
index 113587c1d..29da5e612 100644
--- a/scripts/beir/indexes_weight_jaccard.tsv
+++ b/scripts/beir/indexes_weight_jaccard.tsv
@@ -1,20 +1,20 @@
 	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
-trec-covid	1.0000	0.0089	0.0303	0.1014	0.0750	0.0503	0.0503	0.0764	0.1929	0.1872	0.0872	0.1254	0.0505	0.2383	0.1906	0.2560	0.1527	0.0416	0.0442
-bioasq	0.0089	1.0000	0.0003	0.0728	0.0831	0.1248	0.1248	0.0804	0.0025	0.0240	0.0719	0.0542	0.0005	0.0244	0.0032	0.0192	0.0014	0.0004	0.1717
-nfcorpus	0.0303	0.0003	1.0000	0.0033	0.0026	0.0016	0.0016	0.0027	0.0860	0.0093	0.0030	0.0044	0.2769	0.0101	0.0691	0.0127	0.1684	0.5651	0.0014
-nq	0.1014	0.0728	0.0033	1.0000	0.5244	0.4535	0.4535	0.5248	0.0312	0.2665	0.5252	0.5363	0.0059	0.2781	0.0353	0.2028	0.0175	0.0046	0.3755
-hotpotqa	0.0750	0.0831	0.0026	0.5244	1.0000	0.6130	0.6130	0.9267	0.0237	0.2075	0.3535	0.3389	0.0046	0.1766	0.0263	0.1429	0.0136	0.0036	0.3808
-climate-fever	0.0503	0.1248	0.0016	0.4535	0.6130	1.0000	1.0000	0.5920	0.0152	0.1429	0.3620	0.3013	0.0028	0.1364	0.0172	0.1042	0.0085	0.0022	0.4879
-fever	0.0503	0.1248	0.0016	0.4535	0.6130	1.0000	1.0000	0.5921	0.0152	0.1429	0.3620	0.3013	0.0028	0.1364	0.0172	0.1042	0.0085	0.0022	0.4879
-dbpedia-entity	0.0764	0.0804	0.0027	0.5248	0.9267	0.5920	0.5921	1.0000	0.0244	0.2132	0.3528	0.3412	0.0047	0.1776	0.0271	0.1471	0.0140	0.0037	0.3719
-fiqa	0.1929	0.0025	0.0860	0.0312	0.0237	0.0152	0.0152	0.0244	1.0000	0.0878	0.0277	0.0411	0.1710	0.0938	0.4108	0.1154	0.3184	0.1097	0.0128
-signal1m	0.1872	0.0240	0.0093	0.2665	0.2075	0.1429	0.1429	0.2132	0.0878	1.0000	0.2393	0.3082	0.0170	0.3544	0.1020	0.3399	0.0439	0.0123	0.1260
-trec-news	0.0872	0.0719	0.0030	0.5252	0.3535	0.3620	0.3620	0.3528	0.0277	0.2393	1.0000	0.5535	0.0052	0.2749	0.0335	0.1939	0.0154	0.0040	0.3720
-robust04	0.1254	0.0542	0.0044	0.5363	0.3389	0.3013	0.3013	0.3412	0.0411	0.3082	0.5535	1.0000	0.0076	0.3681	0.0467	0.2616	0.0228	0.0060	0.2788
-arguana	0.0505	0.0005	0.2769	0.0059	0.0046	0.0028	0.0028	0.0047	0.1710	0.0170	0.0052	0.0076	1.0000	0.0179	0.1292	0.0225	0.2531	0.3048	0.0024
-webis-touche2020	0.2383	0.0244	0.0101	0.2781	0.1766	0.1364	0.1364	0.1776	0.0938	0.3544	0.2749	0.3681	0.0179	1.0000	0.1087	0.4355	0.0514	0.0137	0.1290
-quora	0.1906	0.0032	0.0691	0.0353	0.0263	0.0172	0.0172	0.0271	0.4108	0.1020	0.0335	0.0467	0.1292	0.1087	1.0000	0.1418	0.2441	0.0885	0.0158
-cqadupstack	0.2560	0.0192	0.0127	0.2028	0.1429	0.1042	0.1042	0.1471	0.1154	0.3399	0.1939	0.2616	0.0225	0.4355	0.1418	1.0000	0.0672	0.0171	0.0996
-scidocs	0.1527	0.0014	0.1684	0.0175	0.0136	0.0085	0.0085	0.0140	0.3184	0.0439	0.0154	0.0228	0.2531	0.0514	0.2441	0.0672	1.0000	0.2217	0.0072
-scifact	0.0416	0.0004	0.5651	0.0046	0.0036	0.0022	0.0022	0.0037	0.1097	0.0123	0.0040	0.0060	0.3048	0.0137	0.0885	0.0171	0.2217	1.0000	0.0019
-msmarco	0.0442	0.1717	0.0014	0.3755	0.3808	0.4879	0.4879	0.3719	0.0128	0.1260	0.3720	0.2788	0.0024	0.1290	0.0158	0.0996	0.0072	0.0019	1.0000
+trec-covid	1.0000	0.6702	0.5867	0.3619	0.2564	0.2955	0.2955	0.2574	0.2721	0.2390	0.2698	0.3307	0.3119	0.2911	0.2042	0.2993	0.4355	0.6190	0.3553
+bioasq	0.6702	1.0000	0.6530	0.3559	0.2588	0.2973	0.2973	0.2613	0.2633	0.2417	0.2546	0.3170	0.2876	0.2771	0.1991	0.3053	0.4019	0.7019	0.3516
+nfcorpus	0.5867	0.6530	1.0000	0.3325	0.2350	0.2689	0.2689	0.2371	0.2546	0.2178	0.2441	0.3039	0.2827	0.2703	0.1884	0.2767	0.3613	0.5820	0.3299
+nq	0.3619	0.3559	0.3325	1.0000	0.4992	0.5883	0.5882	0.4983	0.4031	0.3688	0.4455	0.4913	0.4459	0.4018	0.3214	0.3668	0.3770	0.3273	0.5709
+hotpotqa	0.2564	0.2588	0.2350	0.4992	1.0000	0.7719	0.7720	0.9361	0.2598	0.3286	0.2760	0.3020	0.2817	0.2387	0.2806	0.2604	0.2573	0.2395	0.4378
+climate-fever	0.2955	0.2973	0.2689	0.5883	0.7719	1.0000	0.9998	0.7741	0.2988	0.3495	0.3382	0.3659	0.3258	0.2803	0.2792	0.2969	0.2969	0.2736	0.4613
+fever	0.2955	0.2973	0.2689	0.5882	0.7720	0.9998	1.0000	0.7742	0.2987	0.3494	0.3381	0.3658	0.3257	0.2802	0.2792	0.2968	0.2970	0.2736	0.4612
+dbpedia-entity	0.2574	0.2613	0.2371	0.4983	0.9361	0.7741	0.7742	1.0000	0.2595	0.3332	0.2758	0.3038	0.2807	0.2372	0.2783	0.2624	0.2566	0.2410	0.4396
+fiqa	0.2721	0.2633	0.2546	0.4031	0.2598	0.2988	0.2987	0.2595	1.0000	0.3152	0.4142	0.4030	0.4821	0.5323	0.3748	0.4754	0.3243	0.2503	0.4781
+signal1m	0.2390	0.2417	0.2178	0.3688	0.3286	0.3495	0.3494	0.3332	0.3152	1.0000	0.3237	0.3063	0.2794	0.2937	0.2944	0.3244	0.2371	0.2291	0.3960
+trec-news	0.2698	0.2546	0.2441	0.4455	0.2760	0.3382	0.3381	0.2758	0.4142	0.3237	1.0000	0.5740	0.4239	0.4964	0.2756	0.3322	0.2825	0.2412	0.4140
+robust04	0.3307	0.3170	0.3039	0.4913	0.3020	0.3659	0.3658	0.3038	0.4030	0.3063	0.5740	1.0000	0.4421	0.4458	0.2602	0.3248	0.3196	0.2954	0.4329
+arguana	0.3119	0.2876	0.2827	0.4459	0.2817	0.3258	0.3257	0.2807	0.4821	0.2794	0.4239	0.4421	1.0000	0.5237	0.3105	0.3709	0.3706	0.2801	0.4409
+webis-touche2020	0.2911	0.2771	0.2703	0.4018	0.2387	0.2803	0.2802	0.2372	0.5323	0.2937	0.4964	0.4458	0.5237	1.0000	0.3264	0.4141	0.3237	0.2693	0.4234
+quora	0.2042	0.1991	0.1884	0.3214	0.2806	0.2792	0.2792	0.2783	0.3748	0.2944	0.2756	0.2602	0.3105	0.3264	1.0000	0.3405	0.2409	0.1884	0.4022
+cqadupstack	0.2993	0.3053	0.2767	0.3668	0.2604	0.2969	0.2968	0.2624	0.4754	0.3244	0.3322	0.3248	0.3709	0.4141	0.3405	1.0000	0.3477	0.2874	0.4328
+scidocs	0.4355	0.4019	0.3613	0.3770	0.2573	0.2969	0.2970	0.2566	0.3243	0.2371	0.2825	0.3196	0.3706	0.3237	0.2409	0.3477	1.0000	0.3946	0.3868
+scifact	0.6190	0.7019	0.5820	0.3273	0.2395	0.2736	0.2736	0.2410	0.2503	0.2291	0.2412	0.2954	0.2801	0.2693	0.1884	0.2874	0.3946	1.0000	0.3270
+msmarco	0.3553	0.3516	0.3299	0.5709	0.4378	0.4613	0.4612	0.4396	0.4781	0.3960	0.4140	0.4329	0.4409	0.4234	0.4022	0.4328	0.3868	0.3270	1.0000
diff --git a/scripts/beir/tokenize_corpus.py b/scripts/beir/tokenize_corpus.py
index b7439ffbc..69ecb03fa 100644
--- a/scripts/beir/tokenize_corpus.py
+++ b/scripts/beir/tokenize_corpus.py
@@ -30,7 +30,8 @@
                     type=str, required=True)
 parser.add_argument('--workers', metavar='# of processes', help='# of workers to spawn',
                     type=int, default=multiprocessing.cpu_count() - 2)
-
+parser.add_argument('--tokenizer', metavar='tokenizer', help='tokenizer',
+                    type=str, default='bert-base-cased')
 
 args = parser.parse_args()
 print(args)
@@ -61,7 +62,7 @@ def batch_file(iterable, n=10000):
 
 
 def batch_process(batch):
-    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    bert_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
 
     def process(line):
         if not line:

From a9ea6f43495da530fb713e0d76767f17531b493e Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Wed, 11 May 2022 13:22:36 -0400
Subject: [PATCH 10/12] add tokenizer option

---
 scripts/beir/tokenize_queries.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/beir/tokenize_queries.py b/scripts/beir/tokenize_queries.py
index bc47df394..a875d67fd 100644
--- a/scripts/beir/tokenize_queries.py
+++ b/scripts/beir/tokenize_queries.py
@@ -30,6 +30,8 @@
                     type=str, required=True)
 parser.add_argument('--workers', metavar='# of processes', help='# of workers to spawn',
                     type=int, default=multiprocessing.cpu_count() - 2)
+parser.add_argument('--tokenizer', metavar='tokenizer', help='tokenizer',
+                    type=str, default='bert-base-cased')
 
 
 args = parser.parse_args()
@@ -61,7 +63,7 @@ def batch_file(iterable, n=10000):
 
 
 def batch_process(batch):
-    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    bert_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
 
     def process(line):
         if not line:

From a2af11651fbe4830b741d6cd190f182ef1676af5 Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Thu, 12 May 2022 15:27:04 -0400
Subject: [PATCH 11/12] fix bug and add query2query query2corpus comparison

---
 scripts/beir/compare_domains.py               | 107 +++++++++++-------
 scripts/beir/similarities/q2c-df_filter.tsv   |  20 ++++
 .../beir/similarities/q2c-js_divergence.tsv   |  20 ++++
 .../beir/similarities/q2c-kl_divergence.tsv   |  20 ++++
 scripts/beir/similarities/q2c-tf_filter.tsv   |  20 ++++
 .../beir/similarities/q2c-weight_jaccard.tsv  |  20 ++++
 scripts/beir/test_compare_domains.sh          |  10 +-
 7 files changed, 177 insertions(+), 40 deletions(-)
 create mode 100644 scripts/beir/similarities/q2c-df_filter.tsv
 create mode 100644 scripts/beir/similarities/q2c-js_divergence.tsv
 create mode 100644 scripts/beir/similarities/q2c-kl_divergence.tsv
 create mode 100644 scripts/beir/similarities/q2c-tf_filter.tsv
 create mode 100644 scripts/beir/similarities/q2c-weight_jaccard.tsv

diff --git a/scripts/beir/compare_domains.py b/scripts/beir/compare_domains.py
index 247854616..8a41cc67e 100644
--- a/scripts/beir/compare_domains.py
+++ b/scripts/beir/compare_domains.py
@@ -29,7 +29,7 @@ def count_total(d):
 def kl_divergence(d1, d2):
     value = float(0)
     for w in d1:
-        if w in d2:
+        if w in d2: # throw out tokens missing from either set (zero count)
             value += d1[w] * np.log(d1[w] / d2[w])
     return value
 
@@ -101,55 +101,86 @@ def print_results(datasets, results, save_file):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--index_path', type=str, help='path to indexes of all the beir dataset', required=True)
-    parser.add_argument('--index_name_format', type=str, help='define your own index dir path name', default="/lucene-index-beir-{}")
     parser.add_argument('--compare_metric', type=str, help='the metric for comparing two vocab, choose from: jaccard, weight_jaccard, df_filter, tf_filter, kl_divergence, js_divergence', default="weight_jaccard")
     parser.add_argument('--compare_threshold', type=float, help='when choosing df_filter, or tf_filter, you can choolse the threshold', default=0.0001)
     parser.add_argument('--output_path', type=str, help='path to save the stat results', required=True)
+    parser.add_argument('--compare_sets', type=str, default="c2c", help="choose from c2c, q2q, q2c")
     args = parser.parse_args()
 
+    corpus_format = "/corpus/lucene-index-beir-{}"
+    queries_format = "/queries/lucene-index-beir-queires-{}"
+
     beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq', 'hotpotqa', 'climate-fever', 'fever', 'dbpedia-entity', 'fiqa', 'signal1m', 'trec-news',  'robust04', 'arguana', 'webis-touche2020', 'quora', 'cqadupstack', 'scidocs', 'scifact', 'msmarco']
-    #beir_datasets = ['arguana', 'fiqa']
-    cfs = dfs = stats = {}
-    for d in beir_datasets:
-        cf, df, stat = index2stats(args.index_path + args.index_name_format.format(d))
-        cfs[d] = cf # count frequency -- int
-        dfs[d] = df # document frequency -- int
-        stat[d] = stat
+    #beir_datasets = ['trec-covid', 'bioasq', 'nfcorpus', 'nq'] # Testing
+    cfs = {}
+    dfs = {}
+    summary = {}
+    cfs2 = {}
+    dfs2 = {}
+    summary2 = {}
+    if args.compare_sets == "c2c":
+        for d in beir_datasets:
+            cf, df, stat = index2stats(args.index_path + corpus_format.format(d))
+            cfs[d] = cf # count frequency -- int
+            dfs[d] = df # document frequency -- int
+            summary[d] = stat
+#            stats[d] = stat
+        cfs2 = cfs
+        dfs2 = dfs
+        summary2 = summary
+    elif args.compare_sets == "q2q":
+        for d in beir_datasets:
+            cf, df, stat = index2stats(args.index_path + queries_format.format(d))
+            cfs[d] = cf
+            dfs[d] = df
+            summary[d] = stat
+        cfs2 = cfs
+        dfs2 = dfs
+        dfs2 = dfs
+        summary2 = summary
+    elif args.compare_sets == "q2c":
+        for d in beir_datasets:
+            cf, df, stat = index2stats(args.index_path + queries_format.format(d))
+            cfs[d] = cf
+            dfs[d] = df
+            summary[d] = stat
+        for d in beir_datasets:
+            cf, df, stat = index2stats(args.index_path + corpus_format.format(d))
+            cfs2[d] = cf
+            dfs2[d] = df
+            summary2[d] = stat
+    else:  # unknown --compare_sets value: fail fast instead of hitting a KeyError on cfs[d1] below
+        raise NotImplementedError("--compare_sets {}".format(args.compare_sets))
+
 
     results = {}
     for d1 in beir_datasets:
         metric_d1 = {}
         for d2 in beir_datasets:
-            if d1 == d2:
-                if args.compare_metric in ["jaccard", "weight_jaccard", "df_filter", "tf_filter"]:
-                    metric_d1[d2] = 1
-                elif args.compare_metric in ["kl_divergence", "js_divergence"]:
-                    metric_d1[d2] = 0
+            if args.compare_metric == "jaccard":
+                metric_d1[d2] = jaccard(cfs[d1], cfs2[d2])
+            elif args.compare_metric == "weight_jaccard":
+                new_d1 = cf2freq(cfs[d1])
+                new_d2 = cf2freq(cfs2[d2])
+                metric_d1[d2] = weighted_jaccard(new_d1, new_d2)
+            elif args.compare_metric == "df_filter":
+                new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
+                new_d2 = filter_freq_dict(cf2freq(cfs2[d2]))
+                metric_d1[d2] = jaccard(new_d1, new_d2)
+            elif args.compare_metric == "tf_filter":
+                new_d1 = filter_freq_dict(df2idf(dfs[d1], summary[d1]["documents"]))
+                new_d2 = filter_freq_dict(df2idf(dfs2[d2], summary2[d2]["documents"]))
+                metric_d1[d2] = jaccard(new_d1, new_d2)
+            elif args.compare_metric == "kl_divergence":
+                new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
+                new_d2 = filter_freq_dict(cf2freq(cfs2[d2]))
+                metric_d1[d2] = kl_divergence(new_d1, new_d2)
+            elif args.compare_metric == "js_divergence":
+                new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
+                new_d2 = filter_freq_dict(cf2freq(cfs2[d2]))
+                metric_d1[d2] = js_divergence(new_d1, new_d2)
             else:
-                if args.compare_metric == "jaccard":
-                    metric_d1[d2] = jaccard(cfs[d1], cfs[d2])
-                elif args.compare_metric == "weight_jaccard":
-                    new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
-                    new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
-                    metric_d1[d2] = weighted_jaccard(new_d1, new_d2)
-                elif args.compare_metric == "df_filter":
-                    new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
-                    new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
-                    metric_d1[d2] = jaccard(new_d1, new_d2)
-                elif args.compare_metric == "tf_filter":
-                    new_d1 = filter_freq_dict(df2idf(dfs[d1], 1))
-                    new_d2 = filter_freq_dict(df2idf(dfs[d2], 1))
-                    metric_d1[d2] = jaccard(new_d1, new_d2)
-                elif args.compare_metric == "kl_divergence":
-                    new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
-                    new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
-                    metric_d1[d2] = kl_divergence(new_d1, new_d2)
-                elif args.compare_metric == "js_divergence":
-                    new_d1 = filter_freq_dict(cf2freq(cfs[d1]))
-                    new_d2 = filter_freq_dict(cf2freq(cfs[d2]))
-                    metric_d1[d2] = js_divergence(new_d1, new_d2)
-                else:
-                    raise NotImplementedError
+                raise NotImplementedError
         results[d1] = metric_d1
 
     print_results(beir_datasets, results, args.output_path)
diff --git a/scripts/beir/similarities/q2c-df_filter.tsv b/scripts/beir/similarities/q2c-df_filter.tsv
new file mode 100644
index 000000000..350757784
--- /dev/null
+++ b/scripts/beir/similarities/q2c-df_filter.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	0.1393	0.1130	0.1093	0.1017	0.0794	0.0814	0.0814	0.0757	0.0898	0.0668	0.0924	0.0968	0.1050	0.1100	0.0902	0.0803	0.0946	0.1171	0.1067
+bioasq	0.3534	0.4330	0.3275	0.1742	0.1925	0.1925	0.1925	0.1912	0.1277	0.2033	0.1332	0.1551	0.1313	0.1427	0.1603	0.1912	0.2196	0.4567	0.2174
+nfcorpus	0.2330	0.2657	0.3053	0.1679	0.1696	0.1736	0.1736	0.1672	0.1372	0.1997	0.1447	0.1526	0.1424	0.1495	0.1730	0.1547	0.1556	0.2594	0.2213
+nq	0.1836	0.1822	0.1761	0.3999	0.3369	0.3645	0.3645	0.3307	0.2492	0.2935	0.3329	0.3104	0.2602	0.2879	0.3304	0.2253	0.2052	0.1710	0.3576
+hotpotqa	0.1839	0.1809	0.1677	0.4478	0.5847	0.5973	0.5973	0.5734	0.1987	0.3041	0.3199	0.3037	0.2234	0.2236	0.2652	0.2082	0.1839	0.1734	0.3127
+climate-fever	0.2352	0.2173	0.2252	0.3268	0.2276	0.2434	0.2434	0.2224	0.2962	0.2339	0.2994	0.3205	0.3119	0.3062	0.2621	0.2299	0.2490	0.2183	0.3345
+fever	0.1675	0.1664	0.1535	0.3497	0.4121	0.4207	0.4207	0.4116	0.1684	0.2767	0.2695	0.2489	0.2036	0.2022	0.2395	0.1747	0.1564	0.1569	0.2594
+dbpedia-entity	0.1006	0.0991	0.0987	0.1901	0.2187	0.2194	0.2194	0.2166	0.1079	0.1660	0.1562	0.1551	0.1330	0.1235	0.1559	0.1111	0.1092	0.0954	0.1574
+fiqa	0.2103	0.1905	0.1923	0.2921	0.1905	0.2092	0.2092	0.1872	0.5152	0.2450	0.2997	0.3302	0.3154	0.3223	0.3423	0.2577	0.2460	0.1849	0.3371
+signal1m	0.1051	0.1047	0.1023	0.1747	0.1546	0.1565	0.1565	0.1525	0.1548	0.1863	0.1787	0.1748	0.1570	0.1517	0.1719	0.1285	0.1132	0.1016	0.1651
+trec-news	0.1006	0.0916	0.0950	0.1550	0.1197	0.1261	0.1261	0.1182	0.1522	0.1408	0.1797	0.1549	0.1504	0.1605	0.1587	0.1150	0.0974	0.0896	0.1476
+robust04	0.1889	0.1672	0.1814	0.2069	0.1694	0.1759	0.1759	0.1647	0.1718	0.1658	0.1853	0.2250	0.2444	0.2039	0.1911	0.1300	0.1882	0.1680	0.2143
+arguana	0.2530	0.2128	0.2192	0.3942	0.2540	0.2738	0.2738	0.2477	0.3770	0.2650	0.3876	0.4195	0.7783	0.4592	0.3407	0.2437	0.2934	0.2134	0.3659
+webis-touche2020	0.0423	0.0391	0.0446	0.0597	0.0429	0.0455	0.0455	0.0419	0.0640	0.0469	0.0675	0.0605	0.0879	0.0886	0.0751	0.0404	0.0465	0.0402	0.0632
+quora	0.2167	0.2013	0.1944	0.3613	0.2628	0.2837	0.2837	0.2550	0.3660	0.3250	0.3784	0.3436	0.3442	0.3906	0.7351	0.2971	0.2467	0.1887	0.3978
+cqadupstack	0.2354	0.2261	0.1994	0.2831	0.2092	0.2272	0.2272	0.2042	0.2937	0.2456	0.2589	0.2650	0.2485	0.2837	0.3468	0.4928	0.3361	0.2148	0.3473
+scidocs	0.2158	0.2124	0.1799	0.1783	0.1539	0.1616	0.1616	0.1529	0.1460	0.1614	0.1439	0.1575	0.1532	0.1513	0.1785	0.1989	0.3926	0.2025	0.1985
+scifact	0.3480	0.4334	0.3390	0.1805	0.1742	0.1801	0.1801	0.1750	0.1333	0.1852	0.1379	0.1566	0.1501	0.1446	0.1569	0.1637	0.2130	0.4916	0.2146
+msmarco	0.2836	0.2972	0.2833	0.3513	0.3169	0.3344	0.3344	0.3105	0.2788	0.3285	0.2867	0.2934	0.2435	0.2749	0.3910	0.2775	0.2493	0.2734	0.5009
diff --git a/scripts/beir/similarities/q2c-js_divergence.tsv b/scripts/beir/similarities/q2c-js_divergence.tsv
new file mode 100644
index 000000000..42af8cabb
--- /dev/null
+++ b/scripts/beir/similarities/q2c-js_divergence.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	0.3183	0.3676	0.3849	0.3597	0.3819	0.3684	0.3684	0.3820	0.3965	0.4042	0.3868	0.3549	0.3679	0.3751	0.3260	0.4248	0.3683	0.3671	0.3522
+bioasq	0.2216	0.1939	0.2452	0.2776	0.2744	0.2702	0.2702	0.2731	0.3335	0.2988	0.3278	0.2907	0.3208	0.3132	0.2495	0.3148	0.2702	0.1810	0.2579
+nfcorpus	0.3008	0.2820	0.2583	0.3178	0.3277	0.3191	0.3191	0.3272	0.3482	0.2987	0.3360	0.3254	0.3471	0.3345	0.3020	0.3600	0.3371	0.2873	0.2799
+nq	0.2869	0.2845	0.3068	0.1743	0.1953	0.1829	0.1829	0.1960	0.2434	0.2454	0.2198	0.2069	0.2260	0.2203	0.1841	0.3058	0.2603	0.2981	0.1856
+hotpotqa	0.2499	0.2482	0.2704	0.1275	0.0960	0.1008	0.1008	0.1018	0.2376	0.2236	0.1957	0.1734	0.2168	0.2159	0.1813	0.2764	0.2329	0.2610	0.1750
+climate-fever	0.2160	0.2250	0.2305	0.1478	0.1999	0.1911	0.1911	0.2025	0.1872	0.2395	0.1728	0.1546	0.1561	0.1659	0.2440	0.2633	0.1941	0.2328	0.1506
+fever	0.2465	0.2418	0.2618	0.1408	0.1121	0.1106	0.1106	0.1112	0.2412	0.2140	0.1955	0.1808	0.2184	0.2127	0.2471	0.2789	0.2337	0.2576	0.1670
+dbpedia-entity	0.3952	0.3931	0.4094	0.3073	0.2939	0.2892	0.2892	0.2932	0.3938	0.3597	0.3492	0.3332	0.3557	0.3660	0.3371	0.4233	0.3751	0.4050	0.3299
+fiqa	0.2888	0.2930	0.3090	0.2280	0.2771	0.2605	0.2605	0.2768	0.1349	0.2557	0.2288	0.2061	0.2181	0.2053	0.1374	0.2552	0.2568	0.3045	0.1928
+signal1m	0.3789	0.3756	0.3903	0.3191	0.3378	0.3278	0.3278	0.3369	0.3549	0.3094	0.3284	0.3129	0.3494	0.3549	0.3713	0.3845	0.3720	0.3886	0.3224
+trec-news	0.3678	0.3770	0.3844	0.3170	0.3488	0.3356	0.3356	0.3485	0.3306	0.3366	0.3052	0.3157	0.3212	0.3252	0.3403	0.3927	0.3572	0.3847	0.3051
+robust04	0.2822	0.2948	0.2968	0.2344	0.2638	0.2546	0.2546	0.2662	0.2772	0.3146	0.2741	0.2331	0.2282	0.2492	0.2574	0.3427	0.2604	0.3010	0.2303
+arguana	0.1881	0.2051	0.2091	0.0948	0.1652	0.1533	0.1533	0.1661	0.1285	0.2014	0.1132	0.1019	0.0152	0.0906	0.1844	0.2164	0.1505	0.2118	0.1084
+webis-touche2020	0.4870	0.4907	0.5016	0.4444	0.4620	0.4544	0.4544	0.4623	0.4601	0.4669	0.4590	0.4445	0.4284	0.4409	0.3754	0.4994	0.4657	0.4979	0.4337
+quora	0.2889	0.2929	0.3110	0.2058	0.2450	0.2341	0.2341	0.2478	0.1782	0.2259	0.2103	0.2086	0.2045	0.1707	0.0197	0.2490	0.2524	0.3046	0.1759
+cqadupstack	0.2505	0.2455	0.2745	0.1985	0.2368	0.2313	0.2313	0.2423	0.1895	0.2232	0.2225	0.2041	0.2267	0.1899	0.1315	0.1410	0.1996	0.2591	0.1715
+scidocs	0.2915	0.2898	0.3205	0.3076	0.3200	0.3082	0.3082	0.3189	0.3653	0.3241	0.3372	0.3121	0.3470	0.3534	0.3522	0.3487	0.1940	0.3017	0.2969
+scifact	0.1811	0.1440	0.1905	0.2613	0.2666	0.2561	0.2561	0.2626	0.3200	0.2871	0.3070	0.2688	0.3047	0.3041	0.3307	0.3117	0.2519	0.1162	0.2354
+msmarco	0.2524	0.2451	0.2665	0.2008	0.2152	0.2062	0.2062	0.2152	0.2202	0.2217	0.2298	0.2172	0.2313	0.2168	0.1290	0.2666	0.2421	0.2583	0.1500
diff --git a/scripts/beir/similarities/q2c-kl_divergence.tsv b/scripts/beir/similarities/q2c-kl_divergence.tsv
new file mode 100644
index 000000000..b66d54ee2
--- /dev/null
+++ b/scripts/beir/similarities/q2c-kl_divergence.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	1.3171	1.5038	1.1645	1.5557	1.1925	1.1408	1.1408	0.9532	1.0464	1.3154	1.2979	1.3355	1.3443	0.9990	0.7820	1.2226	1.3264	1.2179	1.4574
+bioasq	0.4752	0.7102	0.3978	0.8157	0.3905	0.3777	0.3777	0.2632	0.4918	0.6834	0.6380	0.6360	0.5517	0.4903	0.2398	0.4770	0.7853	0.3281	0.7250
+nfcorpus	0.2896	0.4219	0.3631	0.3448	0.1964	0.1872	0.1872	0.1796	0.1171	0.2434	0.1895	0.2277	0.2054	0.1432	0.1329	0.1255	0.2553	0.3123	0.4695
+nq	0.6697	0.6104	0.6476	0.6461	0.5398	0.5819	0.5819	0.4346	0.5066	0.7994	0.6049	0.5834	0.5205	0.4989	0.4141	0.7528	0.5870	0.6352	0.6111
+hotpotqa	0.3564	0.4486	0.2768	0.5484	0.2580	0.2638	0.2638	0.1741	0.3566	0.6152	0.4689	0.4241	0.4318	0.3451	0.2707	0.4518	0.5310	0.2513	0.4842
+climate-fever	0.3194	0.2791	0.3221	0.2604	0.2667	0.2332	0.2332	0.2684	0.1397	0.4865	0.1754	0.1666	0.2025	0.1652	0.3798	0.3445	0.2696	0.3130	0.2298
+fever	0.2779	0.2559	0.2806	0.3023	0.2154	0.2306	0.2306	0.2118	0.2123	0.4458	0.2788	0.2252	0.2366	0.1892	0.4571	0.3251	0.2422	0.2774	0.2645
+dbpedia-entity	0.3518	0.4480	0.3473	0.7123	0.4688	0.4821	0.4821	0.4532	0.4179	0.6805	0.5349	0.5395	0.4326	0.3993	0.4276	0.4748	0.5365	0.3164	0.6126
+fiqa	0.4496	0.6679	0.4018	0.8408	0.3873	0.3725	0.3725	0.3064	0.4936	0.6569	0.6042	0.7719	0.6030	0.4260	0.2688	0.4113	0.6733	0.3843	0.7004
+signal1m	0.3475	0.3260	0.3270	0.4788	0.4410	0.4079	0.4079	0.4372	0.3117	0.5070	0.4102	0.3603	0.4289	0.3510	0.5111	0.2662	0.3942	0.3434	0.4072
+trec-news	0.5208	0.4706	0.5086	0.6563	0.5731	0.4983	0.4983	0.5693	0.4899	0.6819	0.5679	0.5583	0.5521	0.5472	0.6004	0.5097	0.4776	0.4631	0.5420
+robust04	0.5762	0.5368	0.4200	0.5896	0.4293	0.3977	0.3977	0.2987	0.3907	0.6561	0.5068	0.5127	0.4510	0.3981	0.3908	0.4927	0.6771	0.4396	0.4966
+arguana	0.3261	0.3043	0.3251	0.2163	0.3057	0.2324	0.2324	0.3226	0.1342	0.4923	0.1779	0.1673	0.0086	0.1116	0.3763	0.3046	0.2496	0.3176	0.1932
+webis-touche2020	1.1044	1.8766	1.1320	2.1298	0.6137	0.6239	0.6239	0.6142	1.4826	1.6078	1.7456	1.7416	1.8355	1.6536	1.0439	1.2382	1.9108	1.1932	1.8591
+quora	0.7001	0.9831	0.5066	1.2133	0.5968	0.5866	0.5866	0.3828	0.5919	0.9419	0.8957	1.0112	0.8785	0.6004	0.0487	0.7040	1.0933	0.4753	0.9167
+cqadupstack	0.4205	0.6056	0.3868	0.6199	0.2873	0.2564	0.2564	0.2716	0.2657	0.4874	0.4727	0.4815	0.4274	0.2820	0.1423	0.3226	0.5857	0.3824	0.4424
+scidocs	0.3670	0.3062	0.2769	0.3921	0.3258	0.3291	0.3291	0.2623	0.3085	0.2957	0.2987	0.2691	0.3982	0.3358	0.4599	0.3927	0.4649	0.3636	0.3903
+scifact	0.3018	0.2893	0.2767	0.2787	0.2422	0.2234	0.2234	0.2265	0.1821	0.3003	0.2207	0.1837	0.2009	0.1968	0.3422	0.1784	0.3055	0.2355	0.3151
+msmarco	0.6714	0.4714	0.4600	0.6986	0.4999	0.4973	0.4973	0.1797	0.3832	0.6074	0.5625	0.6057	0.4861	0.4326	0.1259	0.5138	0.6222	0.4238	0.5427
diff --git a/scripts/beir/similarities/q2c-tf_filter.tsv b/scripts/beir/similarities/q2c-tf_filter.tsv
new file mode 100644
index 000000000..a055c6921
--- /dev/null
+++ b/scripts/beir/similarities/q2c-tf_filter.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	0.0116	0.0106	0.0206	0.0101	0.0101	0.0101	0.0101	0.0101	0.0121	0.0104	0.0105	0.0106	0.0139	0.0104	0.0113	0.0104	0.0131	0.0198	0.0106
+bioasq	0.2015	0.1849	0.3231	0.1763	0.1765	0.1768	0.1768	0.1765	0.2043	0.1810	0.1825	0.1855	0.2193	0.1818	0.1963	0.1807	0.2239	0.3228	0.1846
+nfcorpus	0.1358	0.1250	0.2271	0.1191	0.1193	0.1194	0.1195	0.1193	0.1392	0.1228	0.1234	0.1253	0.1524	0.1228	0.1330	0.1221	0.1495	0.2054	0.1248
+nq	0.2360	0.2275	0.2729	0.2170	0.2173	0.2176	0.2177	0.2173	0.2457	0.2236	0.2247	0.2282	0.2617	0.2238	0.2403	0.2216	0.2504	0.2711	0.2276
+hotpotqa	0.8068	0.8765	0.4888	0.8514	0.8526	0.8532	0.8532	0.8525	0.7796	0.8695	0.8748	0.8776	0.7027	0.8667	0.8236	0.8459	0.7325	0.5074	0.8775
+climate-fever	0.1841	0.1694	0.2715	0.1616	0.1618	0.1620	0.1619	0.1618	0.1907	0.1666	0.1674	0.1697	0.2159	0.1666	0.1793	0.1658	0.2029	0.2631	0.1692
+fever	0.5803	0.5947	0.4570	0.5674	0.5681	0.5692	0.5692	0.5680	0.5875	0.5846	0.5876	0.5961	0.5888	0.5845	0.5975	0.5750	0.5683	0.4695	0.5953
+dbpedia-entity	0.0510	0.0484	0.0658	0.0461	0.0462	0.0463	0.0463	0.0462	0.0531	0.0476	0.0478	0.0486	0.0596	0.0475	0.0512	0.0473	0.0546	0.0623	0.0484
+fiqa	0.2159	0.1991	0.2991	0.1898	0.1901	0.1903	0.1903	0.1900	0.2281	0.1957	0.1966	0.1994	0.2560	0.1957	0.2126	0.1948	0.2398	0.3014	0.1988
+signal1m	0.0321	0.0299	0.0452	0.0285	0.0285	0.0285	0.0285	0.0285	0.0339	0.0293	0.0295	0.0299	0.0383	0.0294	0.0319	0.0292	0.0352	0.0448	0.0297
+trec-news	0.0207	0.0191	0.0312	0.0182	0.0182	0.0182	0.0182	0.0182	0.0218	0.0188	0.0189	0.0190	0.0251	0.0188	0.0202	0.0187	0.0228	0.0304	0.0190
+robust04	0.0605	0.0556	0.0936	0.0530	0.0531	0.0532	0.0532	0.0531	0.0627	0.0547	0.0549	0.0559	0.0733	0.0547	0.0593	0.0544	0.0663	0.0894	0.0556
+arguana	0.5176	0.4919	0.5190	0.4693	0.4699	0.4701	0.4702	0.4698	0.5384	0.4838	0.4861	0.4931	0.6626	0.4840	0.5140	0.4798	0.5438	0.5342	0.4916
+webis-touche2020	0.0083	0.0076	0.0128	0.0072	0.0072	0.0073	0.0073	0.0072	0.0087	0.0075	0.0075	0.0076	0.0102	0.0075	0.0081	0.0074	0.0093	0.0122	0.0076
+quora	0.4320	0.4139	0.4550	0.3949	0.3954	0.3959	0.3960	0.3954	0.4545	0.4069	0.4089	0.4142	0.4703	0.4069	0.4426	0.4042	0.4586	0.4550	0.4136
+cqadupstack	0.3533	0.3317	0.4277	0.3168	0.3173	0.3175	0.3176	0.3172	0.3712	0.3259	0.3280	0.3320	0.3869	0.3267	0.3537	0.3252	0.3909	0.4426	0.3316
+scidocs	0.1310	0.1195	0.2000	0.1143	0.1145	0.1145	0.1145	0.1144	0.1352	0.1177	0.1182	0.1196	0.1471	0.1179	0.1280	0.1173	0.1474	0.2026	0.1193
+scifact	0.1346	0.1231	0.2317	0.1173	0.1175	0.1177	0.1177	0.1175	0.1369	0.1203	0.1215	0.1233	0.1518	0.1210	0.1309	0.1203	0.1505	0.2302	0.1228
+msmarco	0.8630	0.9449	0.5137	0.9058	0.9070	0.9074	0.9075	0.9068	0.8388	0.9290	0.9365	0.9466	0.7437	0.9286	0.8926	0.9034	0.7892	0.5389	0.9466
diff --git a/scripts/beir/similarities/q2c-weight_jaccard.tsv b/scripts/beir/similarities/q2c-weight_jaccard.tsv
new file mode 100644
index 000000000..8a866ddb5
--- /dev/null
+++ b/scripts/beir/similarities/q2c-weight_jaccard.tsv
@@ -0,0 +1,20 @@
+	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
+trec-covid	0.1719	0.1443	0.1393	0.1287	0.1221	0.1296	0.1296	0.1218	0.1321	0.0978	0.1223	0.1465	0.1497	0.1391	0.1954	0.1076	0.1477	0.1469	0.1412
+bioasq	0.3219	0.3639	0.3002	0.2248	0.2324	0.2358	0.2358	0.2342	0.1864	0.2035	0.1812	0.2143	0.1991	0.1994	0.2814	0.2063	0.2644	0.3813	0.2576
+nfcorpus	0.2198	0.2377	0.2637	0.1794	0.1727	0.1779	0.1779	0.1736	0.1728	0.2131	0.1761	0.1787	0.1675	0.1824	0.2084	0.1745	0.1831	0.2333	0.2218
+nq	0.2180	0.2195	0.2024	0.3424	0.3196	0.3329	0.3329	0.3177	0.2481	0.2534	0.2774	0.2967	0.2710	0.2745	0.3265	0.1938	0.2436	0.2075	0.3209
+hotpotqa	0.2713	0.2697	0.2483	0.4692	0.5535	0.5367	0.5367	0.5396	0.2875	0.3059	0.3476	0.3780	0.3158	0.3138	0.3520	0.2287	0.2949	0.2569	0.3798
+climate-fever	0.3296	0.3124	0.3101	0.4065	0.3290	0.3428	0.3427	0.3245	0.3789	0.2633	0.3798	0.4140	0.4217	0.4005	0.2789	0.2697	0.3611	0.3090	0.4027
+fever	0.2453	0.2481	0.2315	0.3861	0.4406	0.4360	0.4360	0.4407	0.2627	0.3006	0.3221	0.3256	0.2839	0.3001	0.2795	0.2133	0.2604	0.2315	0.3487
+dbpedia-entity	0.1489	0.1503	0.1429	0.1958	0.2121	0.2123	0.2123	0.2122	0.1493	0.1477	0.1676	0.1831	0.1792	0.1614	0.1796	0.1247	0.1640	0.1449	0.1808
+fiqa	0.2326	0.2222	0.2163	0.2729	0.2223	0.2388	0.2388	0.2207	0.4274	0.2494	0.2873	0.3068	0.3024	0.3199	0.4379	0.2736	0.2727	0.2179	0.3274
+signal1m	0.1523	0.1523	0.1490	0.1589	0.1436	0.1550	0.1550	0.1446	0.1772	0.1903	0.1755	0.1858	0.1592	0.1639	0.1485	0.1555	0.1497	0.1496	0.1659
+trec-news	0.1535	0.1460	0.1506	0.1633	0.1393	0.1511	0.1511	0.1389	0.1879	0.1582	0.1905	0.1737	0.1802	0.1806	0.1715	0.1459	0.1605	0.1461	0.1780
+robust04	0.2498	0.2389	0.2438	0.2833	0.2472	0.2580	0.2580	0.2450	0.2603	0.1903	0.2435	0.2891	0.3081	0.2772	0.2556	0.1906	0.2800	0.2386	0.2829
+arguana	0.3670	0.3371	0.3356	0.5091	0.3863	0.4088	0.4088	0.3807	0.4765	0.3145	0.4913	0.5221	0.8406	0.5519	0.3583	0.3170	0.4357	0.3324	0.4938
+webis-touche2020	0.0744	0.0698	0.0730	0.0765	0.0742	0.0761	0.0761	0.0733	0.1007	0.0665	0.0846	0.0861	0.1031	0.1019	0.1541	0.0729	0.0886	0.0717	0.0922
+quora	0.2405	0.2314	0.2167	0.3287	0.2771	0.2889	0.2889	0.2726	0.3683	0.2895	0.3269	0.3217	0.3339	0.3808	0.7576	0.2739	0.2853	0.2196	0.3771
+cqadupstack	0.2822	0.2809	0.2573	0.3215	0.2708	0.2794	0.2794	0.2618	0.3554	0.2887	0.2943	0.3146	0.2962	0.3443	0.4415	0.4130	0.3488	0.2723	0.3691
+scidocs	0.2323	0.2341	0.2080	0.1852	0.1759	0.1844	0.1844	0.1759	0.1619	0.1785	0.1705	0.1950	0.1737	0.1638	0.1719	0.1728	0.3253	0.2271	0.2050
+scifact	0.3746	0.4260	0.3731	0.2371	0.2364	0.2488	0.2488	0.2401	0.2085	0.2209	0.2071	0.2425	0.2171	0.2182	0.1866	0.2287	0.2869	0.4823	0.2754
+msmarco	0.2866	0.2928	0.2678	0.3607	0.3343	0.3446	0.3446	0.3327	0.3005	0.3226	0.3118	0.3248	0.2909	0.3076	0.4459	0.2487	0.2965	0.2730	0.4578
diff --git a/scripts/beir/test_compare_domains.sh b/scripts/beir/test_compare_domains.sh
index e7ea8be5c..5505bb612 100644
--- a/scripts/beir/test_compare_domains.sh
+++ b/scripts/beir/test_compare_domains.sh
@@ -1,7 +1,13 @@
-for metric in weight_jaccard kl_divergence tf_filter df_filter
+rm -r similarities
+mkdir similarities
+for setmode in q2c
+do
+for metric in weight_jaccard kl_divergence tf_filter df_filter js_divergence
 do
 python compare_domains.py \
     --index_path indexes \
-    --output_path indexes_${metric}.tsv \
+    --output_path similarities/${setmode}-${metric}.tsv \
+    --compare_sets ${setmode} \
     --compare_metric ${metric}
 done
+done

From c4b3c2efc168ac21951e3c59a2c9e77fc8ca4962 Mon Sep 17 00:00:00 2001
From: amyxie361 <amyxie361@outlook.com>
Date: Thu, 12 May 2022 15:29:41 -0400
Subject: [PATCH 12/12] remove previous results

---
 scripts/beir/indexes_df_filter.tsv      | 20 --------------------
 scripts/beir/indexes_js_divergence.tsv  | 20 --------------------
 scripts/beir/indexes_kl_divergence.tsv  | 20 --------------------
 scripts/beir/indexes_tf_filter.tsv      | 20 --------------------
 scripts/beir/indexes_weight_jaccard.tsv | 20 --------------------
 5 files changed, 100 deletions(-)
 delete mode 100644 scripts/beir/indexes_df_filter.tsv
 delete mode 100644 scripts/beir/indexes_js_divergence.tsv
 delete mode 100644 scripts/beir/indexes_kl_divergence.tsv
 delete mode 100644 scripts/beir/indexes_tf_filter.tsv
 delete mode 100644 scripts/beir/indexes_weight_jaccard.tsv

diff --git a/scripts/beir/indexes_df_filter.tsv b/scripts/beir/indexes_df_filter.tsv
deleted file mode 100644
index 0a408a408..000000000
--- a/scripts/beir/indexes_df_filter.tsv
+++ /dev/null
@@ -1,20 +0,0 @@
-	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
-trec-covid	1.0000	0.6805	0.5963	0.3362	0.2377	0.2603	0.2604	0.2334	0.2451	0.2251	0.2787	0.3180	0.2927	0.2760	0.2149	0.2772	0.4241	0.6367	0.3485
-bioasq	0.6805	1.0000	0.6533	0.3165	0.2267	0.2464	0.2465	0.2242	0.2259	0.2182	0.2541	0.2943	0.2567	0.2586	0.2031	0.2753	0.3823	0.7174	0.3335
-nfcorpus	0.5963	0.6533	1.0000	0.3165	0.2207	0.2400	0.2401	0.2187	0.2338	0.2096	0.2508	0.2908	0.2665	0.2540	0.2047	0.2594	0.3541	0.6115	0.3318
-nq	0.3362	0.3165	0.3165	1.0000	0.4995	0.5694	0.5689	0.4902	0.3926	0.3710	0.5455	0.5439	0.4421	0.4387	0.3717	0.3437	0.3329	0.2937	0.5508
-hotpotqa	0.2377	0.2267	0.2207	0.4995	1.0000	0.8334	0.8340	0.9355	0.2344	0.3446	0.3469	0.3274	0.2709	0.2543	0.2790	0.2427	0.2235	0.2107	0.3767
-climate-fever	0.2603	0.2464	0.2400	0.5694	0.8334	1.0000	0.9993	0.8253	0.2640	0.3549	0.3949	0.3762	0.3064	0.2862	0.2933	0.2677	0.2449	0.2280	0.4122
-fever	0.2604	0.2465	0.2401	0.5689	0.8340	0.9993	1.0000	0.8258	0.2636	0.3544	0.3945	0.3758	0.3060	0.2858	0.2935	0.2673	0.2450	0.2281	0.4117
-dbpedia-entity	0.2334	0.2242	0.2187	0.4902	0.9355	0.8253	0.8258	1.0000	0.2255	0.3425	0.3405	0.3231	0.2635	0.2463	0.2751	0.2402	0.2177	0.2084	0.3691
-fiqa	0.2451	0.2259	0.2338	0.3926	0.2344	0.2640	0.2636	0.2255	1.0000	0.3077	0.4699	0.4494	0.4356	0.5031	0.4116	0.4062	0.3054	0.2202	0.4680
-signal1m	0.2251	0.2182	0.2096	0.3710	0.3446	0.3549	0.3544	0.3425	0.3077	1.0000	0.4079	0.3431	0.2916	0.3211	0.3555	0.3037	0.2081	0.2048	0.3654
-trec-news	0.2787	0.2541	0.2508	0.5455	0.3469	0.3949	0.3945	0.3405	0.4699	0.4079	1.0000	0.5913	0.4754	0.5179	0.3765	0.3568	0.3016	0.2400	0.5027
-robust04	0.3180	0.2943	0.2908	0.5439	0.3274	0.3762	0.3758	0.3231	0.4494	0.3431	0.5913	1.0000	0.4845	0.4528	0.3275	0.3324	0.3199	0.2800	0.4757
-arguana	0.2927	0.2567	0.2665	0.4421	0.2709	0.3064	0.3060	0.2635	0.4356	0.2916	0.4754	0.4845	1.0000	0.5555	0.3461	0.3116	0.3413	0.2539	0.4240
-webis-touche2020	0.2760	0.2586	0.2540	0.4387	0.2543	0.2862	0.2858	0.2463	0.5031	0.3211	0.5179	0.4528	0.5555	1.0000	0.3838	0.3866	0.3364	0.2508	0.4530
-quora	0.2149	0.2031	0.2047	0.3717	0.2790	0.2933	0.2935	0.2751	0.4116	0.3555	0.3765	0.3275	0.3461	0.3838	1.0000	0.3551	0.2515	0.1920	0.4549
-cqadupstack	0.2772	0.2753	0.2594	0.3437	0.2427	0.2677	0.2673	0.2402	0.4062	0.3037	0.3568	0.3324	0.3116	0.3866	0.3551	1.0000	0.3632	0.2598	0.4230
-scidocs	0.4241	0.3823	0.3541	0.3329	0.2235	0.2449	0.2450	0.2177	0.3054	0.2081	0.3016	0.3199	0.3413	0.3364	0.2515	0.3632	1.0000	0.3796	0.3599
-scifact	0.6367	0.7174	0.6115	0.2937	0.2107	0.2280	0.2281	0.2084	0.2202	0.2048	0.2400	0.2800	0.2539	0.2508	0.1920	0.2598	0.3796	1.0000	0.3112
-msmarco	0.3485	0.3335	0.3318	0.5508	0.3767	0.4122	0.4117	0.3691	0.4680	0.3654	0.5027	0.4757	0.4240	0.4530	0.4549	0.4230	0.3599	0.3112	1.0000
diff --git a/scripts/beir/indexes_js_divergence.tsv b/scripts/beir/indexes_js_divergence.tsv
deleted file mode 100644
index 60a55173c..000000000
--- a/scripts/beir/indexes_js_divergence.tsv
+++ /dev/null
@@ -1,20 +0,0 @@
-	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
-trec-covid	1.0000	0.0546	0.0839	0.1642	0.2218	0.2017	0.2016	0.2221	0.2382	0.2489	0.2108	0.1854	0.2013	0.2163	0.2808	0.2252	0.1392	0.0701	0.1624
-bioasq	0.0546	1.0000	0.0603	0.1658	0.2205	0.2011	0.2011	0.2198	0.2425	0.2449	0.2189	0.1922	0.2182	0.2217	0.2816	0.2191	0.1519	0.0423	0.1618
-nfcorpus	0.0839	0.0603	1.0000	0.1814	0.2409	0.2211	0.2211	0.2402	0.2556	0.2684	0.2329	0.2050	0.2236	0.2356	0.2987	0.2441	0.1775	0.0782	0.1776
-nq	0.1642	0.1658	0.1814	1.0000	0.0816	0.0633	0.0634	0.0821	0.1366	0.1482	0.0943	0.0854	0.1052	0.1224	0.1754	0.1559	0.1576	0.1849	0.0672
-hotpotqa	0.2218	0.2205	0.2409	0.0816	1.0000	0.0201	0.0201	0.0049	0.2208	0.1691	0.1765	0.1673	0.1931	0.2166	0.2170	0.2207	0.2222	0.2379	0.1214
-climate-fever	0.2017	0.2011	0.2211	0.0633	0.0201	1.0000	0.0000	0.0198	0.1984	0.1606	0.1437	0.1355	0.1701	0.1893	0.2078	0.2005	0.2019	0.2193	0.1060
-fever	0.2016	0.2011	0.2211	0.0634	0.0201	0.0000	1.0000	0.0198	0.1985	0.1606	0.1438	0.1356	0.1702	0.1894	0.2078	0.2006	0.2019	0.2193	0.1061
-dbpedia-entity	0.2221	0.2198	0.2402	0.0821	0.0049	0.0198	0.0198	1.0000	0.2229	0.1685	0.1764	0.1670	0.1943	0.2184	0.2183	0.2201	0.2237	0.2377	0.1218
-fiqa	0.2382	0.2425	0.2556	0.1366	0.2208	0.1984	0.1985	0.2229	1.0000	0.1826	0.1164	0.1233	0.1182	0.0883	0.1501	0.1238	0.2003	0.2568	0.0970
-signal1m	0.2489	0.2449	0.2684	0.1482	0.1691	0.1606	0.1606	0.1685	0.1826	1.0000	0.1560	0.1744	0.2006	0.1852	0.1916	0.1812	0.2486	0.2621	0.1393
-trec-news	0.2108	0.2189	0.2329	0.0943	0.1765	0.1437	0.1438	0.1764	0.1164	0.1560	1.0000	0.0696	0.1098	0.0908	0.1901	0.1623	0.1953	0.2341	0.1108
-robust04	0.1854	0.1922	0.2050	0.0854	0.1673	0.1355	0.1356	0.1670	0.1233	0.1744	0.0696	1.0000	0.1082	0.1147	0.2092	0.1736	0.1809	0.2089	0.1086
-arguana	0.2013	0.2182	0.2236	0.1052	0.1931	0.1701	0.1702	0.1943	0.1182	0.2006	0.1098	0.1082	1.0000	0.0857	0.1902	0.1766	0.1702	0.2274	0.1143
-webis-touche2020	0.2163	0.2217	0.2356	0.1224	0.2166	0.1893	0.1894	0.2184	0.0883	0.1852	0.0908	0.1147	0.0857	1.0000	0.1707	0.1352	0.1846	0.2341	0.1122
-quora	0.2808	0.2816	0.2987	0.1754	0.2170	0.2078	0.2078	0.2183	0.1501	0.1916	0.1901	0.2092	0.1902	0.1707	1.0000	0.1669	0.2471	0.2970	0.1350
-cqadupstack	0.2252	0.2191	0.2441	0.1559	0.2207	0.2005	0.2006	0.2201	0.1238	0.1812	0.1623	0.1736	0.1766	0.1352	0.1669	1.0000	0.1833	0.2358	0.1205
-scidocs	0.1392	0.1519	0.1775	0.1576	0.2222	0.2019	0.2019	0.2237	0.2003	0.2486	0.1953	0.1809	0.1702	0.1846	0.2471	0.1833	1.0000	0.1602	0.1482
-scifact	0.0701	0.0423	0.0782	0.1849	0.2379	0.2193	0.2193	0.2377	0.2568	0.2621	0.2341	0.2089	0.2274	0.2341	0.2970	0.2358	0.1602	1.0000	0.1805
-msmarco	0.1624	0.1618	0.1776	0.0672	0.1214	0.1060	0.1061	0.1218	0.0970	0.1393	0.1108	0.1086	0.1143	0.1122	0.1350	0.1205	0.1482	0.1805	1.0000
diff --git a/scripts/beir/indexes_kl_divergence.tsv b/scripts/beir/indexes_kl_divergence.tsv
deleted file mode 100644
index 476033474..000000000
--- a/scripts/beir/indexes_kl_divergence.tsv
+++ /dev/null
@@ -1,20 +0,0 @@
-	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
-trec-covid	1.0000	0.0448	0.0336	0.1378	0.1141	0.1268	0.1268	0.1026	0.0769	0.1147	0.2565	0.1991	0.0948	0.1706	0.1257	0.0665	0.0856	0.0569	0.1137
-bioasq	0.0609	1.0000	0.0194	0.1555	0.1295	0.1447	0.1447	0.1217	0.0999	0.1345	0.2773	0.2233	0.1397	0.1957	0.1517	0.0833	0.1371	0.0239	0.1279
-nfcorpus	0.1186	0.0863	1.0000	0.1908	0.1527	0.1706	0.1706	0.1435	0.1212	0.1509	0.3080	0.2669	0.1566	0.2258	0.1755	0.1044	0.1849	0.1257	0.1794
-nq	0.1812	0.1576	0.1643	1.0000	0.0616	0.0384	0.0383	0.0815	0.0574	0.1851	0.2639	0.1940	0.0632	0.1516	0.1579	0.0737	0.1527	0.1855	0.0489
-hotpotqa	0.2694	0.2197	0.2561	0.1654	1.0000	0.0588	0.0588	0.0050	0.1574	0.2180	0.4821	0.3698	0.2383	0.3233	0.1839	0.1503	0.2188	0.2563	0.1190
-climate-fever	0.1456	0.1090	0.1328	0.0707	-0.0202	1.0000	-0.0000	-0.0222	0.0586	0.1299	0.3001	0.2246	0.1211	0.1858	0.1071	0.0644	0.1062	0.1335	0.0523
-fever	0.1456	0.1090	0.1328	0.0708	-0.0202	0.0000	1.0000	-0.0222	0.0588	0.1300	0.3002	0.2247	0.1212	0.1858	0.1071	0.0645	0.1062	0.1335	0.0523
-dbpedia-entity	0.2553	0.2071	0.2423	0.1618	0.0013	0.0604	0.0604	1.0000	0.1551	0.2109	0.4672	0.3603	0.2314	0.3199	0.1803	0.1449	0.2111	0.2426	0.1218
-fiqa	0.3243	0.3136	0.3265	0.3440	0.3525	0.3592	0.3582	0.3496	1.0000	0.3400	0.4209	0.4279	0.2379	0.2018	0.2046	0.1204	0.3096	0.3201	0.2314
-signal1m	0.3184	0.2348	0.2533	0.2131	0.2702	0.2878	0.2878	0.2469	0.2958	1.0000	0.4628	0.2995	0.2113	0.3399	0.1300	0.2227	0.3798	0.2558	0.1895
-trec-news	0.0620	0.0436	0.0422	0.0681	0.0835	0.0679	0.0677	0.0795	-0.0693	0.1217	1.0000	0.0441	-0.0241	-0.0209	0.0574	-0.0204	0.0600	0.0454	0.0254
-robust04	0.0416	0.0256	0.0194	0.0370	0.0584	0.0599	0.0595	0.0594	-0.0747	0.0811	0.0656	1.0000	-0.0266	-0.0126	0.0488	-0.0377	0.0326	0.0366	0.0053
-arguana	0.2747	0.2292	0.2534	0.2178	0.2459	0.2374	0.2373	0.2517	0.0637	0.2840	0.3115	0.2616	1.0000	0.1559	0.1798	0.1154	0.2201	0.2417	0.1416
-webis-touche2020	0.1759	0.1677	0.1734	0.1959	0.2135	0.2161	0.2161	0.2093	-0.0209	0.2110	0.2116	0.2317	0.0718	1.0000	0.1017	0.0445	0.1768	0.1711	0.1255
-quora	0.6807	0.9262	0.7167	1.0242	0.5189	0.6385	0.6385	0.5243	0.5093	0.7571	0.9984	1.0744	0.7696	0.6971	1.0000	0.4465	1.0338	0.6816	0.7129
-cqadupstack	0.3405	0.3268	0.3328	0.3428	0.3404	0.3548	0.3544	0.3385	0.1287	0.3009	0.4360	0.4501	0.2323	0.2593	0.1877	1.0000	0.3545	0.3202	0.2640
-scidocs	0.1595	0.1440	0.1419	0.1746	0.1613	0.1796	0.1796	0.1526	0.1121	0.1258	0.3094	0.2685	0.1335	0.1828	0.1601	0.1383	1.0000	0.1584	0.1128
-scifact	0.0915	0.0563	0.0574	0.1600	0.1297	0.1442	0.1442	0.1215	0.0911	0.1241	0.2662	0.2197	0.1328	0.1859	0.1368	0.0746	0.1293	1.0000	0.1346
-msmarco	0.2116	0.1982	0.2083	0.1484	0.1556	0.1780	0.1779	0.1526	0.0356	0.1836	0.3661	0.3101	0.1353	0.1968	0.1138	0.0810	0.1784	0.2204	1.0000
diff --git a/scripts/beir/indexes_tf_filter.tsv b/scripts/beir/indexes_tf_filter.tsv
deleted file mode 100644
index 957e1a473..000000000
--- a/scripts/beir/indexes_tf_filter.tsv
+++ /dev/null
@@ -1,20 +0,0 @@
-	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
-trec-covid	1.0000	0.5225	0.5367	0.7852	0.7695	0.6963	0.6962	0.7722	0.8216	0.8511	0.7654	0.7957	0.7335	0.8538	0.8450	0.8544	0.8116	0.5629	0.6933
-bioasq	0.5225	1.0000	0.1540	0.6365	0.6223	0.6332	0.6332	0.6220	0.4986	0.6195	0.6504	0.6609	0.4141	0.6220	0.5379	0.5949	0.4224	0.1639	0.7079
-nfcorpus	0.5367	0.1540	1.0000	0.3979	0.3977	0.3425	0.3424	0.3994	0.5425	0.4687	0.3922	0.4194	0.5547	0.4648	0.5216	0.4705	0.5858	0.7047	0.3258
-nq	0.7852	0.6365	0.3979	1.0000	0.9263	0.8870	0.8869	0.9260	0.7417	0.8924	0.8971	0.8946	0.6171	0.8980	0.8011	0.8941	0.6824	0.4205	0.8288
-hotpotqa	0.7695	0.6223	0.3977	0.9263	1.0000	0.9088	0.9086	0.9868	0.7290	0.8748	0.8540	0.8523	0.6093	0.8689	0.7865	0.8719	0.6740	0.4207	0.8053
-climate-fever	0.6963	0.6332	0.3425	0.8870	0.9088	1.0000	0.9998	0.9053	0.6556	0.8019	0.8279	0.8134	0.5391	0.8035	0.7134	0.7997	0.6032	0.3634	0.8089
-fever	0.6962	0.6332	0.3424	0.8869	0.9086	0.9998	1.0000	0.9051	0.6556	0.8019	0.8277	0.8135	0.5392	0.8034	0.7133	0.7996	0.6031	0.3633	0.8090
-dbpedia-entity	0.7722	0.6220	0.3994	0.9260	0.9868	0.9053	0.9051	1.0000	0.7312	0.8771	0.8534	0.8530	0.6119	0.8693	0.7896	0.8735	0.6768	0.4225	0.8040
-fiqa	0.8216	0.4986	0.5425	0.7417	0.7290	0.6556	0.6556	0.7312	1.0000	0.8267	0.7301	0.7637	0.7555	0.8191	0.8492	0.8219	0.7839	0.5677	0.6590
-signal1m	0.8511	0.6195	0.4687	0.8924	0.8748	0.8019	0.8019	0.8771	0.8267	1.0000	0.8723	0.8941	0.7018	0.9403	0.8771	0.9303	0.7535	0.4918	0.7968
-trec-news	0.7654	0.6504	0.3922	0.8971	0.8540	0.8279	0.8277	0.8534	0.7301	0.8723	1.0000	0.9119	0.6057	0.8787	0.7784	0.8435	0.6608	0.4139	0.8578
-robust04	0.7957	0.6609	0.4194	0.8946	0.8523	0.8134	0.8135	0.8530	0.7637	0.8941	0.9119	1.0000	0.6504	0.9016	0.8087	0.8625	0.6967	0.4421	0.8584
-arguana	0.7335	0.4141	0.5547	0.6171	0.6093	0.5391	0.5392	0.6119	0.7555	0.7018	0.6057	0.6504	1.0000	0.6962	0.7429	0.6963	0.7329	0.5798	0.5498
-webis-touche2020	0.8538	0.6220	0.4648	0.8980	0.8689	0.8035	0.8034	0.8693	0.8191	0.9403	0.8787	0.9016	0.6962	1.0000	0.8729	0.9344	0.7545	0.4876	0.8016
-quora	0.8450	0.5379	0.5216	0.8011	0.7865	0.7134	0.7133	0.7896	0.8492	0.8771	0.7784	0.8087	0.7429	0.8729	1.0000	0.8732	0.7931	0.5468	0.7044
-cqadupstack	0.8544	0.5949	0.4705	0.8941	0.8719	0.7997	0.7996	0.8735	0.8219	0.9303	0.8435	0.8625	0.6963	0.9344	0.8732	1.0000	0.7626	0.4936	0.7674
-scidocs	0.8116	0.4224	0.5858	0.6824	0.6740	0.6032	0.6031	0.6768	0.7839	0.7535	0.6608	0.6967	0.7329	0.7545	0.7931	0.7626	1.0000	0.6193	0.5921
-scifact	0.5629	0.1639	0.7047	0.4205	0.4207	0.3634	0.3633	0.4225	0.5677	0.4918	0.4139	0.4421	0.5798	0.4876	0.5468	0.4936	0.6193	1.0000	0.3492
-msmarco	0.6933	0.7079	0.3258	0.8288	0.8053	0.8089	0.8090	0.8040	0.6590	0.7968	0.8578	0.8584	0.5498	0.8016	0.7044	0.7674	0.5921	0.3492	1.0000
diff --git a/scripts/beir/indexes_weight_jaccard.tsv b/scripts/beir/indexes_weight_jaccard.tsv
deleted file mode 100644
index 29da5e612..000000000
--- a/scripts/beir/indexes_weight_jaccard.tsv
+++ /dev/null
@@ -1,20 +0,0 @@
-	trec-covid	bioasq	nfcorpus	nq	hotpotqa	climate-fever	fever	dbpedia-entity	fiqa	signal1m	trec-news	robust04	arguana	webis-touche2020	quora	cqadupstack	scidocs	scifact	msmarco
-trec-covid	1.0000	0.6702	0.5867	0.3619	0.2564	0.2955	0.2955	0.2574	0.2721	0.2390	0.2698	0.3307	0.3119	0.2911	0.2042	0.2993	0.4355	0.6190	0.3553
-bioasq	0.6702	1.0000	0.6530	0.3559	0.2588	0.2973	0.2973	0.2613	0.2633	0.2417	0.2546	0.3170	0.2876	0.2771	0.1991	0.3053	0.4019	0.7019	0.3516
-nfcorpus	0.5867	0.6530	1.0000	0.3325	0.2350	0.2689	0.2689	0.2371	0.2546	0.2178	0.2441	0.3039	0.2827	0.2703	0.1884	0.2767	0.3613	0.5820	0.3299
-nq	0.3619	0.3559	0.3325	1.0000	0.4992	0.5883	0.5882	0.4983	0.4031	0.3688	0.4455	0.4913	0.4459	0.4018	0.3214	0.3668	0.3770	0.3273	0.5709
-hotpotqa	0.2564	0.2588	0.2350	0.4992	1.0000	0.7719	0.7720	0.9361	0.2598	0.3286	0.2760	0.3020	0.2817	0.2387	0.2806	0.2604	0.2573	0.2395	0.4378
-climate-fever	0.2955	0.2973	0.2689	0.5883	0.7719	1.0000	0.9998	0.7741	0.2988	0.3495	0.3382	0.3659	0.3258	0.2803	0.2792	0.2969	0.2969	0.2736	0.4613
-fever	0.2955	0.2973	0.2689	0.5882	0.7720	0.9998	1.0000	0.7742	0.2987	0.3494	0.3381	0.3658	0.3257	0.2802	0.2792	0.2968	0.2970	0.2736	0.4612
-dbpedia-entity	0.2574	0.2613	0.2371	0.4983	0.9361	0.7741	0.7742	1.0000	0.2595	0.3332	0.2758	0.3038	0.2807	0.2372	0.2783	0.2624	0.2566	0.2410	0.4396
-fiqa	0.2721	0.2633	0.2546	0.4031	0.2598	0.2988	0.2987	0.2595	1.0000	0.3152	0.4142	0.4030	0.4821	0.5323	0.3748	0.4754	0.3243	0.2503	0.4781
-signal1m	0.2390	0.2417	0.2178	0.3688	0.3286	0.3495	0.3494	0.3332	0.3152	1.0000	0.3237	0.3063	0.2794	0.2937	0.2944	0.3244	0.2371	0.2291	0.3960
-trec-news	0.2698	0.2546	0.2441	0.4455	0.2760	0.3382	0.3381	0.2758	0.4142	0.3237	1.0000	0.5740	0.4239	0.4964	0.2756	0.3322	0.2825	0.2412	0.4140
-robust04	0.3307	0.3170	0.3039	0.4913	0.3020	0.3659	0.3658	0.3038	0.4030	0.3063	0.5740	1.0000	0.4421	0.4458	0.2602	0.3248	0.3196	0.2954	0.4329
-arguana	0.3119	0.2876	0.2827	0.4459	0.2817	0.3258	0.3257	0.2807	0.4821	0.2794	0.4239	0.4421	1.0000	0.5237	0.3105	0.3709	0.3706	0.2801	0.4409
-webis-touche2020	0.2911	0.2771	0.2703	0.4018	0.2387	0.2803	0.2802	0.2372	0.5323	0.2937	0.4964	0.4458	0.5237	1.0000	0.3264	0.4141	0.3237	0.2693	0.4234
-quora	0.2042	0.1991	0.1884	0.3214	0.2806	0.2792	0.2792	0.2783	0.3748	0.2944	0.2756	0.2602	0.3105	0.3264	1.0000	0.3405	0.2409	0.1884	0.4022
-cqadupstack	0.2993	0.3053	0.2767	0.3668	0.2604	0.2969	0.2968	0.2624	0.4754	0.3244	0.3322	0.3248	0.3709	0.4141	0.3405	1.0000	0.3477	0.2874	0.4328
-scidocs	0.4355	0.4019	0.3613	0.3770	0.2573	0.2969	0.2970	0.2566	0.3243	0.2371	0.2825	0.3196	0.3706	0.3237	0.2409	0.3477	1.0000	0.3946	0.3868
-scifact	0.6190	0.7019	0.5820	0.3273	0.2395	0.2736	0.2736	0.2410	0.2503	0.2291	0.2412	0.2954	0.2801	0.2693	0.1884	0.2874	0.3946	1.0000	0.3270
-msmarco	0.3553	0.3516	0.3299	0.5709	0.4378	0.4613	0.4612	0.4396	0.4781	0.3960	0.4140	0.4329	0.4409	0.4234	0.4022	0.4328	0.3868	0.3270	1.0000