# docs/example-analyses/simulation-finding-funniest.py
import numpy as np
from typing import List, Dict, Any

# R is half of the range of possible values. Here, it's (3-1)/2 = 2/2 = 1.
R = 1.0

# delta is the failure probability -- i.e. set delta = 0.05 for 1-delta = 95% success probability.
delta = 0.05

# ------------------- DEFINE CLASSES FOR DIFFERENT SAMPLERS -------------------


class Random:
    """Baseline sampler: every caption gets a uniformly random query score."""

    def __init__(self, n: int):
        """
        Parameters
        ----------
        n : int
            Number of targets (captions/videos/etc).

        Notes
        -----
        - ``self._initialized`` tracks whether the per-caption arrays exist yet.
        - ``self.name`` labels simulation results so that several models can
          be plotted together.
        """
        self.n = n
        self.name = self.model_name() + " " + str(n)
        self._initialized = False

    def model_name(self):
        return "Random"

    def get_queries(self, n_queries=100):
        """
        Parameters
        ----------
        n_queries : int
            Accepted for interface compatibility; this implementation does not
            use it and always returns one entry per target.

        Returns
        -------
        indices : np.ndarray of int, len(indices) == n
            The caption indices 0, 1, ..., n-1 in increasing order.
        scores : np.ndarray of float, len(scores) == n
            ``scores[i]`` is the query score of caption ``indices[i]``.
            For ``Random`` the scores are drawn uniformly from [0, 1), so they
            carry no information about caption quality.
        """
        indices = np.arange(self.n).astype(int)
        scores = np.random.uniform(0, 1, size=len(indices))

        return indices, scores

    def _initialize(self):
        """
        Create the per-caption arrays that store information about ratings.

        - ``self._reward_[i]``: sum of the ratings given to caption i.
        - ``self._reward_squared_[i]``: sum of the squared ratings of caption i.
        - ``self._hits_[i]``: number of ratings caption i has received.
        - ``self.emp_avg_ratings_[i]``: average rating of caption i.
        - ``self.ucbs_[i]``: distance from the average rating of caption i to
          its upper confidence bound (see ``calculate_ucbs``).
        - ``self.cis_[i]``: value returned by ``calculate_95_ci`` for caption i.

        Sets ``self._initialized = True`` so the arrays are only created once.
        """
        self._reward_ = np.zeros(self.n)
        self._reward_squared_ = np.zeros(self.n)
        self._hits_ = np.zeros(self.n)
        self.emp_avg_ratings_ = self._reward_ / np.maximum(self._hits_, 1)
        self.ucbs_ = self.calculate_ucbs(
            self._hits_, self.emp_avg_ratings_, np.arange(self.n), np.zeros(self.n)
        )
        self.cis_ = self.calculate_95_ci(
            self._reward_squared_, self.emp_avg_ratings_, self._hits_
        )
        self._initialized = True

    def partial_fit(self, answers: List[Dict[str, int]]):
        """
        Record a batch of ratings and refresh the derived statistics.

        Parameters
        ----------
        answers : List[Dict[str, int]]
            One dict per rating, of the form
            ``{"index": caption index, "rating": 1 <= rating <= 3}``.

        Returns
        -------
        self
            The model, with ``emp_avg_ratings_``, ``ucbs_`` and ``cis_``
            updated. ``emp_avg_ratings_[i] +/- ucbs_[i]`` is the "delta"
            confidence interval for the rating of caption i.
        """
        if not self._initialized:
            self._initialize()
        for answer in answers:
            idx = answer["index"]
            rating = answer["rating"]
            self._reward_[idx] += rating
            self._reward_squared_[idx] += rating ** 2
            self._hits_[idx] += 1
        self.emp_avg_ratings_ = self._reward_ / np.maximum(self._hits_, 1)
        answer_indices = [answer["index"] for answer in answers]
        self.ucbs_ = self.calculate_ucbs(
            self._hits_, self.emp_avg_ratings_, answer_indices, self.ucbs_
        )
        self.cis_ = self.calculate_95_ci(
            self._reward_squared_, self.emp_avg_ratings_, self._hits_
        )

        return self

    def calculate_ucbs(self, num_hits: List[int], emp_avg_ratings, answer_indices: List[int], old_ucbs: List[float]):
        """
        Compute the lil-UCB confidence width of every caption.

        Parameters
        ----------
        num_hits : List[int]
            ``num_hits[i]`` is the number of ratings received by caption i.
        emp_avg_ratings
            Average rating per caption. Unused here; part of the signature
            shared with ``lil_KLUCB.calculate_ucbs``.
        answer_indices : List[int]
            Captions rated in the most recent round. Unused here.
        old_ucbs : List[float]
            Previous widths. Unused here.

        Returns
        -------
        dist_to_upper : np.ndarray of float
            ``dist_to_upper[i]`` is the distance from the average rating of
            caption i to its upper confidence bound, i.e.
            ``sqrt(2 R^2 log(4 n_i^2 / delta) / n_i)``. Captions without any
            rating get the sentinel width 1e8.
        """
        # Formula from
        # https://github.com/nextml/NEXT/blob/4c8f4d5a66376a18c047f4c9409f73c75925bf07/apps/CardinalBanditsPureExploration/algs/LilUCB.py#L103
        # The width is the square root of the log term; without the root the
        # quantity is a squared deviation and is not comparable to a rating.
        num_hits = np.asarray(num_hits, dtype=float)
        dist_to_upper = np.full(len(num_hits), 1e8)
        rated = num_hits > 0
        hits = num_hits[rated]
        dist_to_upper[rated] = np.sqrt(2.0 * R * R * np.log(4 * hits * hits / delta) / hits)
        return dist_to_upper

    def calculate_95_ci(self, sum_rewards_squared, empirical_mean, num_hits):
        """
        Compute the half-width used for the "95% CI" of every caption.

        Parameters
        ----------
        sum_rewards_squared
            ``sum_rewards_squared[j]`` is Sum_i(rating_i^2) for caption j.
            (Stored separately because it cannot be recovered from the sum of
            the ratings.)
        empirical_mean
            ``empirical_mean[j]`` is the average rating of caption j.
        num_hits
            ``num_hits[j]`` is the number of ratings of caption j.

        Returns
        -------
        ci_95 : np.ndarray of float
            ``ci_95[j] = 1.96 * sqrt((Sum_i(rating_i^2) - n * mean^2) / ((n - 1) * n^2))``
            for captions with more than one rating, and the sentinel 1e8
            otherwise. The caller treats ``mean +/- ci_95`` as the interval.

        Notes
        -----
        See https://github.com/nextml/NEXT/blob/4c8f4d5a66376a18c047f4c9409f73c75925bf07/apps/CardinalBanditsPureExploration/algs/LilUCB.py#L73
        NOTE(review): this equals 1.96 * s / n (s = sample standard
        deviation), whereas the usual standard-error half-width is
        1.96 * s / sqrt(n). The formula is kept as the original author wrote
        it -- confirm which one is intended.
        """
        sum_rewards_squared = np.asarray(sum_rewards_squared, dtype=float)
        empirical_mean = np.asarray(empirical_mean, dtype=float)
        num_hits = np.asarray(num_hits, dtype=float)

        ci_95 = np.full(len(num_hits), 1e8)
        enough = num_hits > 1
        n = num_hits[enough]
        # Guard against a tiny negative value caused by floating-point rounding.
        numerator = np.maximum(
            sum_rewards_squared[enough] - n * empirical_mean[enough] ** 2, 0.0
        )
        ci_95[enough] = 1.96 * np.sqrt(numerator / ((n - 1) * n ** 2))
        return ci_95


class Active(Random):
    """Sampler that scores captions by their upper confidence bound."""

    def model_name(self):
        return "Active"

    def get_queries(self, n_queries=100):
        """
        Parameters
        ----------
        n_queries : int
            Accepted for interface compatibility; not used.

        Returns
        -------
        indices : np.ndarray of int, len(indices) == n
            The caption indices 0, 1, ..., n-1 in increasing order.
        scores : np.ndarray of float, len(scores) == n
            ``emp_avg_ratings_ + ucbs_``, i.e. the upper confidence bound of
            every caption. Higher scores are queried first by the caller.
        """
        if not self._initialized:
            self._initialize()

        indices = np.arange(self.n).astype(int)
        scores = self.emp_avg_ratings_ + self.ucbs_

        return indices, scores


class lil_KLUCB(Active):
    """Upper-confidence-bound sampler whose bound comes from a Bernoulli KL divergence."""

    def model_name(self):
        return "lil_KLUCB"

    def calculate_ucbs(self, num_hits: List[int], emp_avg_ratings, answer_indices: List[int], old_ucbs: List[float]):
        """
        Update the confidence widths of the captions rated in the last round.

        Parameters
        ----------
        num_hits : List[int]
            ``num_hits[i]`` is the number of ratings received by caption i.
        emp_avg_ratings
            ``emp_avg_ratings[i]`` is the average rating of caption i.
        answer_indices : List[int]
            Captions rated in the most recent round; only these entries are
            recomputed.
        old_ucbs : List[float]
            Previous widths. This container is updated in place and returned.

        Returns
        -------
        UCB : same container as ``old_ucbs``
            ``UCB[i]`` is the distance from ``emp_avg_ratings[i]`` to the upper
            confidence bound of caption i (1e8 for unrated captions).
            This follows the lil-KLUCB algorithm as stated in
            https://proceedings.neurips.cc/paper/2017/file/c02f9de3c2f3040751818aacc7f60b74-Paper.pdf.
        """
        mu = emp_avg_ratings
        UCB = old_ucbs
        for i in answer_indices:
            hits = num_hits[i]
            if hits == 0:
                UCB[i] = 1e8
                continue
            # Ratings take values in {1, 2, 3}: map the mean to [0, 1] before
            # the search, then map the bound back and subtract the mean.
            threshold = np.log(2 * hits * hits / delta) / hits
            bound = self.computeUCB(muhat=(mu[i] - 1) / 2, threshold=threshold)
            UCB[i] = bound * 2 + 1 - mu[i]
        return UCB

    # Inspired by code at
    # https://github.com/nextml/NEXT/blob/4c8f4d5a66376a18c047f4c9409f73c75925bf07/apps/CardinalBanditsPureExploration/algs/KLUCB.py#L128-L157
    def computeUCB(self, muhat, threshold, accuracy=(10 ** (-6))):
        """Bisect on [muhat, 1] until the bracket is narrower than ``accuracy``; return its midpoint."""
        lower = muhat
        upper = 1
        UCB = (lower + upper) / 2
        while (upper - lower) > accuracy:
            lower, upper, UCB = self.leftright(muhat, lower, upper, threshold)
        return UCB

    # Also inspired by code at
    # https://github.com/nextml/NEXT/blob/4c8f4d5a66376a18c047f4c9409f73c75925bf07/apps/CardinalBanditsPureExploration/algs/KLUCB.py#L128-L157
    def leftright(self, muhat, lower, upper, threshold):
        """
        Perform one bisection step of the KL upper-bound search.

        Returns ``[new_lower, new_upper, midpoint of the new bracket]``: the
        lower half is kept when the Bernoulli KL divergence between ``muhat``
        and the current midpoint is at least ``threshold``, the upper half
        otherwise.
        """
        if muhat == 1:
            return [1, 1, 1]
        silly = (upper + lower) / 2
        if muhat == 0:
            # The muhat * log(muhat / silly) term vanishes in the limit.
            KL = (1 - muhat) * np.log(1 / (1 - silly))
        else:
            KL = (muhat * np.log(muhat / silly)) + ((1 - muhat) * np.log((1 - muhat) / (1 - silly)))
        if KL >= threshold:
            return [lower, silly, (silly + lower) / 2]
        return [silly, upper, (silly + upper) / 2]

# ------------------- END OF CLASSES FOR DIFFERENT SAMPLERS -------------------
import datetime

import pandas as pd

# ------------------- FUNCTIONS FOR RUNNING THE SIMULATION -------------------


def simulate_human_answer(index: int, captions) -> int:
    """
    Draw one simulated rating for a caption.

    Parameters
    ----------
    index : int
        Which index do we want the score for?
    captions : List[List[int]]
        A list of caption scores. For any ``i``, ``len(captions[i]) == 3``
        because it represents the number of unfunny, somewhat funny and funny
        ratings.

    Returns
    -------
    rating : int
        A rating of 1, 2 or 3 representing the "funniness" of a caption, drawn
        with probabilities proportional to ``captions[index]``.
    """
    assert 0 <= index < len(captions)
    ratings = np.asarray(captions[index]).astype(float)
    ratings /= ratings.sum()
    return np.random.choice([1, 2, 3], p=ratings)


def num_potential_funniest(emp_avg_ratings, ucbs):
    """
    Count the captions whose upper bound exceeds the leader's lower bound.

    Parameters
    ----------
    emp_avg_ratings
        ``emp_avg_ratings[i]`` is the average rating of caption i.
    ucbs
        ``ucbs[i]`` is the confidence width of caption i.

    Returns
    -------
    Let j be the index of the caption with the highest empirical average
    rating. This returns
    ``Sum_i (emp_avg_ratings[i] + ucbs[i] > emp_avg_ratings[j] - ucbs[j])``.
    """
    best_caption_index = np.argmax(emp_avg_ratings)
    best_avg_lower = emp_avg_ratings[best_caption_index] - ucbs[best_caption_index]
    upper_bounds = emp_avg_ratings + ucbs
    return (upper_bounds > best_avg_lower).sum()


def num_geq_funniest(emp_avg_ratings):
    """
    Count the captions rated at least as high as caption 0.

    Parameters
    ----------
    emp_avg_ratings
        ``emp_avg_ratings[i]`` is the average rating of caption i.

    Returns
    -------
    ``Sum_i (emp_avg_ratings[i] >= emp_avg_ratings[0])``. Caption 0 itself is
    always counted, so the result is at least 1.
    NOTE(review): this presumes that caption 0 is the true funniest caption
    (e.g. the input table is sorted by rank) -- confirm against the data.
    """
    return (emp_avg_ratings >= emp_avg_ratings[0]).sum()


# simulation as a function
def simulate_model(model_name, top_n_scores: int, total_queries: int, captions=None):
    """
    Run one simulated caption contest with a sampler.

    Parameters
    ----------
    model_name
        The sampler to simulate (an object providing ``get_queries``,
        ``partial_fit``, ``name``, ``emp_avg_ratings_`` and ``cis_``). The
        parameter keeps its historical name. If a ``str`` is passed, the
        module-level ``the_model`` is used, which is what this function always
        did before.
    top_n_scores : int
        The number of captions each user should respond to. For example,
        ``top_n_scores = 5`` simulates the case in which each user rates 5
        captions.
    total_queries : int
        The number of ratings to collect in the contest. For example,
        ``total_queries = 400_000`` simulates a contest with 400,000 ratings.
    captions : array-like, optional
        ``captions[i]`` holds the three rating counts of caption i. Defaults
        to the last three columns of the module-level ``no_header``.

    Returns
    -------
    A list of flat dicts, one per batch of ``top_n_scores`` ratings, with the
    items listed below in ``datum``.
    """
    model = the_model if isinstance(model_name, str) else model_name
    if captions is None:
        captions = no_header[:, -3:]

    data = []
    for query_batch_num in range(total_queries // top_n_scores):
        query_indices, query_scores = model.get_queries()

        # positions of the top_n_scores largest scores (in no particular order)
        top_score_indices = np.argpartition(query_scores, -top_n_scores)[-top_n_scores:]

        # pose the selected queries to the simulated human
        answers = [
            {"index": index, "rating": simulate_human_answer(index, captions)}
            for index in query_indices[top_score_indices]
        ]

        model.partial_fit(answers)
        datum = {
            "num_funniest": num_potential_funniest(model.emp_avg_ratings_, model.cis_),
            "num_queries": (query_batch_num + 1) * top_n_scores,
            "query_batch": query_batch_num,
            "top_n_scores": top_n_scores,
            "sampler": model.name,
            "num_geq_funniest": num_geq_funniest(model.emp_avg_ratings_),
        }
        data.append(datum)

    return data

# ------------------- END OF FUNCTIONS FOR RUNNING THE SIMULATION -------------------


def list_to_csv(input_list, save_name):
    """Write a list of flat dicts to ``save_name`` as CSV."""
    df = pd.DataFrame(input_list)
    df.to_csv(save_name)


def csv_to_dataframe(input_csv):
    """Read a CSV file into a DataFrame."""
    return pd.read_csv(input_csv)


# ------------------- RUN THE SIMULATION -------------------

# Guarded so that the download and the long simulation only happen when this
# file is executed as a script.
if __name__ == "__main__":
    # Read dataset from online (alternatively: df = pd.read_csv('803.csv'))
    contest = 803
    dfs = pd.read_html(f"https://nextml.github.io/caption-contest-data/dashboards/{contest}.html")
    # Get the last table in the webpage
    df = dfs[-1]

    # .values removes the header row. (In this specific case, also want to remove first column.)
    no_header = df.values[:, 1:]

    # PARAMETRIZE THE SIMULATION
    total_queries = 400_000
    top_n_scores = 5
    num_samples = 10
    sampler_classes = [lil_KLUCB, Active, Random]

    data = []
    for _ in range(num_samples):
        for sampler_class in sampler_classes:
            # a fresh model per run so that no ratings leak between runs
            the_model = sampler_class(len(no_header))
            data.extend(
                simulate_model(the_model, top_n_scores, total_queries, captions=no_header[:, -3:])
            )

    # WRITE SIMULATION RESULTS TO CSV
    # To keep every run, add datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    # to the file name.
    list_to_csv(data, "seaborn_test_huge2.csv")
+The following stats are collected: +* Accuracy +* Distance from ground truth embedding +* Nearest neighbor accuracy +""" + +from time import time +from typing import Tuple, Union, Dict, List, Tuple +from numbers import Number as NumberType +import msgpack +import pandas as pd +from pprint import pprint + +import numpy as np +from scipy.spatial import procrustes +from scipy.spatial.distance import pdist, squareform +from sklearn.manifold import SpectralEmbedding +import numpy.linalg as LA + +ArrayLike = Union[list, np.ndarray] +Number = Union[NumberType, int, float, np.integer, np.floating] + + +def collect( + embedding: ArrayLike, targets: List[int], X_test: ArrayLike +) -> Dict[str, float]: + embedding = np.asarray(embedding) + X_test = np.asarray(X_test) + + accuracy = _get_acc(embedding, X_test) + nn_acc, nn_diffs = _get_nn_diffs(embedding, targets) + + diff_stats = { + f"nn_diff_p{k}": np.percentile(nn_diffs, k) + for k in [99, 98, 95, 90, 80, 70, 60, 50, 40, 30, 20, 10, 5, 2, 1] + } + nn_dists = {f"nn_acc_radius_{k}": (nn_diffs <= k).mean() for k in range(30)} + + n, d = embedding.shape + stats = {} + if d > 1: + reduce = SpectralEmbedding(n_components=1, affinity="nearest_neighbors") + embedding = reduce.fit_transform(embedding) + norm = np.linalg.norm + if targets: + ground_truth = np.array(targets) + assert (np.diff(ground_truth) > 0).all() + ground_truth = ground_truth.reshape(-1, 1) + else: + ground_truth = np.arange(n).reshape(-1, 1) + Y1, Y2, disparity = procrustes(ground_truth, embedding) + stats = { + "embedding_error": norm(Y1 - Y2), + "embedding_rel_error": norm(Y1 - Y2) / norm(Y1), + "procrustes_disparity": disparity, + } + + return { + "accuracy": accuracy, + "nn_diff_median": np.median(nn_diffs), + "nn_diff_mean": nn_diffs.mean(), + "nn_acc": nn_acc, + "avg_items_closer_than_NN": np.mean(items_closer_than_true_NN(embedding, targets)), + **diff_stats, + **stats, + **nn_dists, + **_dist_stats(ground_truth, embedding), + } + + +def 
_dist_stats(ground_truth: np.ndarray, em: np.ndarray) -> Dict[str, Number]: + D_star = pdist(ground_truth) + D_hat = pdist(em) + D_star /= D_star.max() + D_hat /= D_hat.max() + return {"dist_rel_error": LA.norm(D_hat - D_star) / LA.norm(D_star)} + + +def _get_acc(embedding: np.ndarray, X: np.ndarray) -> float: + assert isinstance(embedding, np.ndarray) + n, d = embedding.shape + # X[i] is always [h, w, l] so zero is the right choice. + y = np.zeros(len(X)).astype("int") + assert X.ndim == 2 and X.shape[1] == 3, f"{type(X)}, {X.shape}" + y_hat = TSTE_predict(X, embedding=embedding) + assert all(_.dtype.kind in ["u", "i"] for _ in [y, y_hat]) + acc = (y == y_hat).mean() + return acc + + +def nn_accs(em: np.ndarray, targets: List[int]): + nn_acc, nn_diffs = _get_nn_diffs(embedding, targets) + + diff_stats = { + f"nn_diff_p{k}": np.percentile(nn_diffs, k) + for k in [99, 98, 95, 90, 80, 70, 60, 50, 40, 30, 20, 10, 5, 2, 1] + } + nn_dists = {f"nn_acc_radius_{k}": (nn_diffs <= k).mean() for k in range(30)} + return nn_dists + +def _get_nn_diffs(embedding, targets: List[int]) -> Tuple[float, np.ndarray]: + """ + Get the NN accuracy and the number of objects that are closer than the + true NN. 
+ """ + true_nns = [] + t = np.array(targets) + for ti in targets: + true_dist = np.abs(t - ti).astype("float32") + true_dist[true_dist <= 0] = np.inf + true_nns.append(true_dist.argmin()) + true_nns = np.array(true_nns).astype("int") + + dists = distances(gram_matrix(embedding)) + dists[dists <= 0] = np.inf + + neighbors = dists.argmin(axis=1) + neighbor_dists = np.abs(neighbors - true_nns) + nn_acc = (neighbor_dists == 0).mean() + return nn_acc, neighbor_dists + +def items_closer_than_true_NN(embedding, targets) -> List[int]: + # find true nearest neighbors + true_nns = [] + t = np.array(targets) + for ti in targets: + true_dist = np.abs(t - ti).astype("float32") + true_dist[np.abs(true_dist) < 1e-3] = np.inf + true_nns.append(true_dist.argmin()) + + # for each item in the embedding, how many items are between + # it and the true NN? + + ## so, get distance matrix + dists = distances(gram_matrix(embedding)) + dists[dists <= 0] = np.inf + + items_closer_than_NNs = [] + for dist, true_nn in zip(dists, true_nns): + true_nn_dist = dist[true_nn] + items_closer_than_NN = (dist < true_nn_dist).sum() + items_closer_than_NNs.append(items_closer_than_NN) + return items_closer_than_NNs + + +################################################################# +## everything from here down copied from Salmon +# salmon/triplets/samplers/_adaptive_runners.py#Adaptive.predict +# salmon/triplets/samplers/adaptive/search/gram_utils.py +################################################################# +Array = np.ndarray +def gram_matrix(X: Array) -> Array: + """ + Get Gram matrix from embedding + Arguments + --------- + X : Array + Embedding. X.shape == (num_items, item_dim) + Returns + ------- + G : Array + Gram matrix. G.shape == (n, n) + """ +# if isinstance(X, torch.Tensor): +# return X @ X.transpose(0, 1) + return X @ X.T + + +def distances(G: Array) -> Array: + """ + Get distance matrix from gram matrix + Arguments + --------- + G : Array + Gram matrix. 
G.shape == (n, n) for n objects + Returns + ------- + D : Array + Distance matrix. D.shape == (n, n) + """ + G1 = np.diag(G).reshape(1, -1) + G2 = np.diag(G).reshape(-1, 1) + + D = -2 * G + G1 + G2 + return D + +def TSTE_predict(X, embedding=None): + """ + Predict the answers of queries from the current embedding. + Parameters + ---------- + X : array-like + Each row is ``[head, left, right]``. Each element in ``X`` or + ``X[i, j]`` is an index of the current embedding. + Returns + ------- + y : array-like + The winner of each query. An element of ``y`` is 0 if the left + item is the predicted winner, and 1 if the right element is the + predicted winner. + """ + head_idx = X[:, 0].flatten() + left_idx = X[:, 1].flatten() + right_idx = X[:, 2].flatten() + + head = embedding[head_idx] + left = embedding[left_idx] + right = embedding[right_idx] + + ldiff = LA.norm(head - left, axis=1) + rdiff = LA.norm(head - right, axis=1) + + # 1 if right closer; 0 if left closer + # (which matches the labeling scheme) + right_closer = rdiff < ldiff + return right_closer.astype("uint8") + +if __name__ == "__main__": + file = "salmon/io/2021-05-18/ARR-1_history.msgpack" + file = "next/io/2021-05-18/rate=1_history.msgpack" + with open(file, "rb") as f: + history = msgpack.load(f) + datum = history[-1] + em = datum["embedding"] + n_responses = datum["meta"]["n_responses"] + fnames = pd.read_csv("targets.csv.zip", header=None)[0].tolist() + targets = [int(f.strip("i.png")) for f in fnames] + targets = list(sorted(targets)) + + X_test = [ + [i_h, i_l, i_r] if abs(h - l) < abs(h - r) else [i_h, i_r, i_l] + for i_h, h in enumerate(targets) + for i_l, l in enumerate(targets) + for i_r, r in enumerate(targets) + if h not in [l, r] and l != r + ] + + s = collect(em, targets, X_test) + print(len(history)) + print(n_responses, s["accuracy"]) diff --git a/docs/example-analyses/visualization-finding-funniest.ipynb b/docs/example-analyses/visualization-finding-funniest.ipynb new file mode 100644 
index 0000000..b9fd9fa --- /dev/null +++ b/docs/example-analyses/visualization-finding-funniest.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b29449d7-13d2-4a80-b5d0-c706f9a4bd9a", + "metadata": {}, + "source": [ + "# Finding the funniest caption\n", + "\n", + "The visualizations in this notebook attempt to show how quickly a \"best caption\" rises to the top of the rankings. We perform two visualizations.\n", + "1. The graph titled \"# Captions within 95% CI of Current Funniest\" provides a visualization for how soon *a* caption (not necessarily the *true* funniest caption) can plausibly be identified as the funniest. First, the average user-provided rating is computed for each caption. Then, a 95% CI is computed for each of these average user-provided ratings (basically using the central limit theorem). The corresponding graph displays the number of captions with a 95% CI intersecting the 95% CI around the caption with the highest average user-provided rating.\n", + "1. The graph titled \"# Captions with Simulated Rating Higher than True Funniest\" provides a visualization for how quickly the funniest caption can be correctly identified. Recall that we have access to the ground truth for which caption is funniest. This graph displays how many captions, after a given number of queries, have received an average user-provided rating that is at least as high as the average user-provided rating received by the true funniest caption (the true funniest caption itself is included in the count).\n", + "\n", + "Each visualization is performed for three different learning strategies.\n", + "1. \"Random\" randomly selects captions for users to rate.\n", + "1. \"Active\" adaptively chooses captions for users to rate according to the upper confidence bound strategy described in https://arxiv.org/abs/1312.7308.\n", + "1. 
\"lil_KLUCB\" adaptively chooses captions for users to rate according to the upper confidence bound strategy described in https://arxiv.org/abs/1709.03570.\n", + "\n", + "The line on each graph is the median over 10 simulation runs. The shaded region around each line spans the 25th to 75th percentiles of those runs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f102aef8-fa04-43e7-8390-a37880067ef3", + "metadata": {}, + "outputs": [], + "source": [ + "!python simulation-finding-funniest.py" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "60a0aac3-b89f-4c1c-92fd-9e1709366c5a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "import viz" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1e88d592-f52d-4128-8552-82edc83cad42", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "def list_to_csv(input_list, save_name):\n", + " df = pd.DataFrame(input_list)\n", + " df.to_csv(save_name)\n", + " \n", + "def csv_to_dataframe(input_csv):\n", + " return pd.read_csv(input_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d373c4e3-15a7-4871-9fb5-cca15aeafc00", + "metadata": {}, + "outputs": [], + "source": [ + "df = csv_to_dataframe(\"seaborn_test_huge2.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "864c5d01-4f90-4a20-847e-37da471e0038", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.8, 11867.903583496167)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ax = viz.lineplot(\n", + " x=\"num_queries\",\n", + " y=\"num_funniest\",\n", + " hue=\"sampler\",\n", + " ci=.25,\n", + " data=df,\n", + " palette = sns.color_palette()\n", + ")\n", + "ax.set_yscale(\"log\")\n", + "# ax.set_xscale(\"log\")\n", + "ax.set_title(\"# Captions within 95% CI of Current Funniest\")\n", + "ax.set_ylim(bottom=.8)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1119e1a9-8127-444a-ba18-cac4d693210a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.8, 11867.903583496167)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ax = viz.lineplot(\n", + " x=\"num_queries\",\n", + " y=\"num_geq_funniest\",\n", + " hue=\"sampler\",\n", + " ci=.25,\n", + " data=df,\n", + " palette = sns.color_palette()\n", + ")\n", + "ax.set_yscale(\"log\")\n", + "# ax.set_xscale(\"log\")\n", + "ax.set_title(\"# Captions with Simulated Rating Higher than True Funniest\")\n", + "ax.set_ylim(bottom=.8)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/example-analyses/viz.py b/docs/example-analyses/viz.py new file mode 100644 index 0000000..04e47ca --- /dev/null +++ b/docs/example-analyses/viz.py @@ -0,0 +1,49 @@ +""" +From https://github.com/stsievert/salmon-experiments/blob/8daa4e23ca9960bc585a83828ff6ab71f1b90584/response-rate-next2/viz.py +""" + +import stats_next as stats +import pandas as pd +import msgpack +import warnings + +import matplotlib.pyplot as plt +import matplotlib as mpl +import numpy as np + +def lineplot( + data, x, y, hue, style="-", hue_order=None, ci=0.25, ax=None, estimator="median", palette="copper" +): + if ax is None: + fig, ax = plt.subplots() + if hue_order is None: + hue_order = sorted(data[hue].unique()) + if isinstance(palette, list): + colors = palette + else: + cmap = mpl.cm.get_cmap(palette) + colors = [cmap(x) for x in np.linspace(0, 1, num=len(hue_order))] + for k, (h, color) in enumerate(zip(hue_order, colors)): + show = data[data[hue] == h] + kwargs = dict(index=x, values=y) + middle = show.pivot_table(aggfunc=estimator, **kwargs) + if not len(middle): 
+ continue + _style = style if "C" not in style else style.format(k=k) + if isinstance(style, list): + _style = style[k] + + ax.plot(middle, _style, label=h, color=color) + if ci > 0: + lower = show.pivot_table(aggfunc=lambda x: x.quantile(q=ci), **kwargs) + upper = show.pivot_table(aggfunc=lambda x: x.quantile(q=1 - ci), **kwargs) + assert (lower.index == upper.index).all() + ax.fill_between( + lower.index.values, + y1=lower.values.flatten(), + y2=upper.values.flatten(), + color=color, + alpha=0.2, + ) + ax.legend(loc="best") + return ax