#!/usr/bin/env python3
"""
Generates the list of which single-source adaptation problems to perform

For each dataset, generate 10 random source-target pairs (excluding pairs
where the source equals the target, since that would not be domain
adaptation)

Note: there are 3 runs for each problem (for mean +/- stdev), but those are
set up in the .srun scripts, not here

Usage: ./experiments_ssda.py > experiments_ssda.txt
"""
import random
import itertools

import datasets.datasets as datasets

from experiments_msda import natural_keys

def generate_single_source(dataset_name, users, max_number=5):
    """Return up to max_number random (dataset, source, target) pairs."""
    # Take a random subset of all possible (source, target) combinations
    combinations = list(itertools.combinations(users, 2))
    random.shuffle(combinations)
    combinations = combinations[:max_number]

    pairs = []

    for source_user, target_user in combinations:
        assert source_user != target_user
        pairs.append((dataset_name, str(source_user), str(target_user)))

    return pairs
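
# For illustration (hypothetical dataset name and users; real ones come from
# datasets.list_datasets() and datasets.get_dataset_users): calling
# generate_single_source("example", [1, 2, 3], max_number=2) might return
# [("example", "1", "2"), ("example", "2", "3")], depending on the shuffle.
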
if __name__ == "__main__":
    # Source-target pairs for training
    pairs = []
    uids = []

    for name in datasets.list_datasets():
        # Tune on "watch_noother" not "watch"
        if name == "watch":
            continue

        users = datasets.get_dataset_users(name)

        # Since source-target pairs aren't stored in the filename anymore
        # (too long), we would run into folder name conflicts if we didn't
        # append a unique ID to each source-target pair
        uid = 0

        # Make this repeatable
        random.seed(42)

        # Allows extra max_users for some datasets without changing uid's
        #
        # TODO get rid of all this confusing code once we decide what number
        # to set max_users to. If we don't need to change max_users, then
        # we can just increment uid's like before.
        bonus_uid = 0
        max_number = 10

        curr_pairs = generate_single_source(name, users, max_number=max_number)

        for i, (dataset_name, source_user, target_user) in enumerate(curr_pairs):
            # We want to allow increasing the number of max_users for
            # wisdm_at and watch without changing the uid's of the 0-4
            # targets for backwards compatibility (otherwise we'd have to
            # move all the saved models around...)
            set_of_five = i // 5

            # Before we had uids 0-4 (or 1-5), so do as before
            if max_number == 5 or set_of_five == 0:
                uids.append(uid)
                uid += 1
            else:
                uids.append(str(uid) + "_" + str(bonus_uid))
                bonus_uid += 1

        pairs += curr_pairs
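    # With max_number = 10, each dataset's ten pairs therefore get uids
    # 0, 1, 2, 3, 4, 5_0, 5_1, 5_2, 5_3, 5_4: the first set of five
    # increments uid as before, while the second set freezes uid at 5 and
    # appends bonus_uid, so the original 0-4 uids (and any models saved
    # under them) keep their names.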
    # Check that these make sense
    print("List of adaptations we'll perform:")

    for i, (dataset_name, source, target) in enumerate(pairs):
        print("    ", dataset_name, source, "to", target, "uid", uids[i])

    print()
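    # Each listing line looks like (values illustrative):
    #     wisdm_at 1 to 3 uid 0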
    #
    # kamiak_{train,eval}_ssda.srun
    #
    print("For kamiak_{train,eval}_ssda.srun:")
    dataset_names = []
    print_uids = []
    sources = []
    targets = []
    dataset_target_pairs = {}  # for upper bounds

    for i, (dataset_name, source, target) in enumerate(pairs):
        dataset_names.append("\"" + dataset_name + "\"")
        print_uids.append(str(uids[i]))
        sources.append("\"" + source + "\"")
        targets.append("\"" + target + "\"")

        # for upper bounds
        pair_name = ("\"" + dataset_name + "\"", "\"" + target + "\"")
        full_pair = ("\"" + dataset_name + "\"", str(uids[i]), "\"" + target + "\"")

        if pair_name not in dataset_target_pairs:
            dataset_target_pairs[pair_name] = full_pair

    print("# number of adaptation problems =", len(sources))
    print("uids=(", " ".join(print_uids), ")", sep="")
    print("datasets=(", " ".join(dataset_names), ")", sep="")
    print("sources=(", " ".join(sources), ")", sep="")
    print("targets=(", " ".join(targets), ")", sep="")
    print()
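    # The printed lines form bash arrays that the .srun scripts read, e.g.
    # (dataset and user names are illustrative):
    #   uids=(0 1 2 3 4 5_0 5_1 5_2 5_3 5_4 ...)
    #   datasets=("wisdm_at" "wisdm_at" ...)
    #   sources=("1" "3" ...)
    #   targets=("2" "5" ...)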
    #
    # kamiak_{train,eval}_ssda_upper.srun
    #
    print("For kamiak_{train,eval}_ssda_upper.srun:")
    targets_unique = list(set(dataset_target_pairs.values()))
    targets_unique.sort(key=natural_keys)
    sources_blank = ["\"\""] * len(targets_unique)

    targets_unique_uids = []
    targets_unique_dataset = []
    targets_unique_target = []

    for dataset_name, uid, target in targets_unique:
        # Uses the first uid from each dataset_name-target pair
        targets_unique_uids.append(uid)
        targets_unique_dataset.append(dataset_name)
        targets_unique_target.append(target)

    print("# number of adaptation problems =", len(targets_unique))
    print("uids=(", " ".join(["u" + str(x) for x in targets_unique_uids]), ")", sep="")
    print("datasets=(", " ".join(targets_unique_dataset), ")", sep="")
    print("sources=(", " ".join(sources_blank), ")", sep="")
    print("targets=(", " ".join(targets_unique_target), ")", sep="")
    print()
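    # Same array format as above, but for the target-only upper bounds:
    # sources are blank and each uid gets a "u" prefix, e.g.
    #   uids=(u0 u5 ...)
    #   sources=("" "" ...)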