-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yml
64 lines (43 loc) · 2.41 KB
/
config.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# TODO: document in the README that, to regenerate something, the user only
# needs to delete the embeddings / clusters / diclist_nn.json files and run
# the script again. Otherwise the script reuses the existing files, so every
# run after the first one is much faster.
# Path to the captions/URLs file (a .tsv, judging by the extension;
# column layout is not shown here -- confirm against the loading script)
captions_urls_path: '/data/projects/data-decay-repr/captions/Train_GCC-training.tsv'
# Name of the model used to compute the embeddings
model_name: 'openai/clip-vit-large-patch14'
# Step size used while calculating the embeddings
# (presumably the number of samples processed per step -- TODO confirm)
step_size: 1000
# Where the dataset embeddings are located (.npy file)
dataset_embeddings_path: '/data/projects/data-decay-repr/embeddings2/text_embeddings_L14.npy'
# Number of clusters to create
cluster_count: 100
# Folder in which the clusters are saved
# Alternative location (currently disabled):
# clusters_folder: '/data/projects/data-decay-repr/clusters/'
clusters_folder: '/data/projects/data-decay/cc3m/script_tests/clusters/'
# Use torch kmeans instead of sklearn kmeans
use_torch_kmeans: true
# Location of the decayed indices file
decayed_indices_path: '/data/projects/data-decay-repr/cc3m_decayed_indices.txt'
# The decayed samples dictionary only needs to be recalculated when the
# decayed indices or nearby_sample_count have been updated
decayed_dict_calculate: true
# Location of the decayed samples dictionary
decayed_samples_dict_nn_path: '/data/projects/data-decay-repr/diclist_nn.json'
# Set to true to also consider the peripheral samples
consider_nns: true
# Similarity measure to use: distances or dot products
similarity_type: 'dot_products'
# Folder where the results are saved. Make sure the folder exists.
result_folder: '/data/projects/data-decay-repr/results/'
# How many nearest neighbors to consider
nearby_sample_count: 20
# Minimum number of those nearest neighbors that should be decayed
nearby_decayed_sample_count_threshold: 12
# Number of additional clusters to consider besides the closest one
# (ordered from closest to farthest)
closest_clusters_count: 0
# Checking other clusters for all decayed samples might be costly, so the
# number of decayed samples to check can be limited with this threshold
closest_cluster_check_threshold: 2
# Whether to check that the nearby_decayed_sample_count_threshold decayed
# neighbour (presumably the N-th one, with N being that threshold -- confirm)
# has at least lower_similarity_threshold similarity to the decayed sample
check_similarity: true
lower_similarity_threshold: 0.8
# Combine clusters if their centroids have at least
# group_similarity_threshold similarity
group_similarity_threshold: 0.8
# Minimum number of decayed samples a cluster must contain to be considered
# (including the decayed neighbours if consider_nns is true)
group_element_threshold: 0