From ccfb70fea4b9e65c9baf47c30aa7155dd0202f3a Mon Sep 17 00:00:00 2001
From: sadda
Date: Mon, 10 May 2021 10:37:21 +0200
Subject: [PATCH] Lecture 11: Polishing touches

---
 docs/make.jl                     |   2 +-
 docs/src/lecture_11/monte.md     |  34 +++---
 scripts/lecture_11/script.jl     | 144 +++++++++++++------------
 scripts/lecture_11/script_sol.jl | 175 ++++++++++++++++---------------
 4 files changed, 180 insertions(+), 175 deletions(-)

diff --git a/docs/make.jl b/docs/make.jl
index 82cde0104..7ba7f51cd 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -100,9 +100,9 @@ lecture_10 = joinpath.("./lecture_10/", [
])

lecture_11 = joinpath.("./lecture_11/", [
+    "sparse.md",
    "monte.md",
    "glm.md",
-    "sparse.md",
])

lecture_12 = joinpath.("./lecture_12/", [
diff --git a/docs/src/lecture_11/monte.md b/docs/src/lecture_11/monte.md
index 69c9de0a7..14194194c 100644
--- a/docs/src/lecture_11/monte.md
+++ b/docs/src/lecture_11/monte.md
@@ -345,7 +345,7 @@ While the rejection sampling provides a good approximation for the first two dis
This exercise computes the expected value

```math
-\mathbb E_3 \cos(100x) = \int_{-\infty}^\infty \cos(100 x) f_3(x) dx,
+\mathbb E_3 \cos(100X) = \int_{-\infty}^\infty \cos(100 x) f_3(x) dx,
```

where we consider the expectation ``\mathbb E`` with respect to ``d_3\sim N(0, 0.01)`` with density ``f_3``. The first possibility to compute the expectation is to discretize the integral.

@@ -361,19 +361,15 @@ nothing # hide
The second possibility is to approximate the integral by

```math
-\mathbb E_3 \cos(100x) \approx \frac 1n\sum_{i=1}^n \cos(x_i),
+\mathbb E_3 \cos(100X) \approx \frac 1n\sum_{i=1}^n \cos(100 x_i),
```

-where ``x_i`` are sampled from ``d_3``. We do this in `expectation1`, and `expectation2`, where the formed generates from the Distributions package while the latter uses our rejection sampling.
+where ``x_i`` are sampled from ``d_3``. We do this in `expectation1` and `expectation2`, where the former generates from the Distributions package while the latter uses our rejection sampling. We use the method of the `mean` function, which takes a function as its first argument.

```@example monte
-function expectation1(h, d; n=1000000)
-    xs = rand(d, n)
-    return mean(h.(xs))
-end
+expectation1(h, d; n = 1000000) = mean(h, rand(d, n))

-function expectation2(h, f, x_max, xlims; n=1000000)
-    xs = rejection_sampling(f, f(x_max), xlims...; n=n)
-    return mean(h.(xs))
+function expectation2(h, f, f_max, xlims; n=1000000)
+    return mean(h, rejection_sampling(f, f_max, xlims...; n))
end

nothing # hide
```

@@ -382,21 +378,21 @@
If it is difficult to sample from ``d_3``, we can use a trick to sample from some other distribution. This is based on the following formula:

```math
-\mathbb E_3 h(x) = \int_{-\infty}^\infty h(x) f_3(x) dx = \int_{-\infty}^\infty h(x) \frac{f_3(x)}{f_1(x)}f_1(x) dx = \mathbb E_1 \frac{h(x)}{f_1(x)}.
+\mathbb E_3 h(x) = \int_{-\infty}^\infty h(x) f_3(x) dx = \int_{-\infty}^\infty h(x) \frac{f_3(x)}{f_1(x)}f_1(x) dx = \mathbb E_1 \frac{h(x)f_3(x)}{f_1(x)}.
```

This gives rise to another implementation of the same thing.

```@example monte
function expectation3(h, f, d_gen; n=1000000)
-    xs = rand(d_gen, n)
-    return mean(h.(xs) .* f.(xs) ./ pdf(d_gen, xs))
+    g(x) = h(x)*f(x)/pdf(d_gen, x)
+    return mean(g, rand(d_gen, n))
end

nothing # hide
```

-We run these three approaches for ``20`` repetition.s
+We run these three approaches for ``20`` repetitions.
```@example monte
n = 100000
n_rep = 20

Random.seed!(666)
e1 = [expectation1(h, d3; n=n) for _ in 1:n_rep]
-e2 = [expectation2(h, f3, d3.μ, xlims; n=n) for _ in 1:n_rep]
+e2 = [expectation2(h, f3, f3(d3.μ), xlims; n=n) for _ in 1:n_rep]
e3 = [expectation3(h, f3, d1; n=n) for _ in 1:n_rep]

nothing # hide
@@ -505,7 +501,7 @@ Quantiles form an important concept in statistics. Its definition is slightly co
The quantile at level ``\alpha=0.5`` is the mean. Quantiles play an important role in estimates, where they form upper and lower bounds for confidence intervals. They are also used in hypothesis testing.

-This part will investigate how quantiles on a finite sample differ from the true quantile. We will consider two ways of computing the quantile. Both of them sample ``n`` points from some distribution ``d``. The first one follows the statistical definition and selects the index of the ``n\alpha`` smallest observation. The second one uses the function `quantile`, which performs some interpolation.
+This part will investigate how quantiles on a finite sample differ from the true quantile. We will consider two ways of computing the quantile. Both of them sample ``n`` points from some distribution ``d``. The first one follows the statistical definition and selects the ``n\alpha``-th smallest observation via the `partialsort` function. The second one uses the function `quantile`, which performs some interpolation.

```@example monte
quantile_sampled1(d, n::Int, α) = partialsort(rand(d, n), floor(Int, α*n))
@@ -514,7 +510,7 @@ quantile_sampled2(d, n::Int, α) = quantile(rand(d, n), α)
nothing # hide
```

-We defined the vectorized version. This is not the efficient way because for every ``n``, the samples will be randomly generated again.
+We define the vectorized version. This is not efficient because for every ``n``, the samples will be randomly generated again.

```@example monte
quantile_sampled1(d, ns::AbstractVector, α) = quantile_sampled1.(d, ns, α)
@@ -554,7 +550,7 @@ plt2 = deepcopy(plt1)

nothing # hide
```

-Now we add the sampled quantiles and the mean over all repetitions. Since we work with two plots at the same time, we specify into which plot we want to add the new data. It would be better to create a function for plotting and call it for `qs1` and `qs2`, but we wanted to show how to work two plots at the same time.
+Now we add the sampled quantiles and the mean over all repetitions. Since we work with two plots, we specify into which plot we want to add the new data. It would be better to create a function for plotting and call it for `qs1` and `qs2`, but we wanted to show how to work with two plots simultaneously.

```@example monte
for i in 1:size(qs1,1)
@@ -589,7 +585,7 @@ savefig(plt2, "quantile2.svg") # hide
![](quantile1.svg)
![](quantile2.svg)

-Both sampled estimates give a lower estimate than the true quantile. In statistical methodology, these estimates are biased. We observe that the interpolated estimate is closer to the true value, and that computing the quantile even on ``10000`` points gives an uncertainty interval of approximately ``0.25``.
+Both sampled estimates are lower than the true quantile. In statistical terminology, these estimates are biased. We observe that the interpolated estimate is closer to the true value and that computing the quantile even on ``10000`` points gives an uncertainty interval of approximately ``0.25``.
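To make the bias in the last paragraph concrete, here is a small self-contained sketch; it is not part of the patched files. It mirrors the two estimator definitions from the text, while the distribution `Normal(0, 1)`, the level `α = 0.99`, and the sample sizes are illustrative assumptions, so the exact numbers will differ from the lecture's figures.

```julia
# Minimal bias check for the two quantile estimators discussed above.
# The distribution, the level α, and the sample sizes are illustrative choices.
using Distributions, Random, Statistics

# Mirrors the two definitions from the text: order statistic vs. interpolation.
quantile_sampled1(d, n::Int, α) = partialsort(rand(d, n), floor(Int, α*n))
quantile_sampled2(d, n::Int, α) = quantile(rand(d, n), α)

Random.seed!(666)
d = Normal(0, 1)
α = 0.99
n = 10000
n_rep = 1000

q_true = quantile(d, α)    # exact quantile of the distribution

bias1 = mean(quantile_sampled1(d, n, α) for _ in 1:n_rep) - q_true
bias2 = mean(quantile_sampled2(d, n, α) for _ in 1:n_rep) - q_true

# Negative values indicate the downward bias described above.
println("order statistic: ", bias1, "   interpolated: ", bias2)
```

How pronounced the bias is depends on the distribution, `n`, and `α`; the qualitative picture (both estimates biased downwards, the interpolated one less so) matches the plots discussed above.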
```@raw html diff --git a/scripts/lecture_11/script.jl b/scripts/lecture_11/script.jl index e5bdb4668..9afef4edf 100644 --- a/scripts/lecture_11/script.jl +++ b/scripts/lecture_11/script.jl @@ -9,6 +9,82 @@ using RDatasets using SpecialFunctions using Statistics + + + +# Ridge regression + +n = 10000 +m = 1000 + +Random.seed!(666) +X = randn(n, m) + +y = 10*X[:,1] + X[:,2] + randn(n) + +# Exercise + + + +# Comparison + +@btime ridge_reg(X, y, 10); + +@btime ridge_reg(X, y, 10, Q, Q_inv, λ); + +# Dependence on mu + +μs = range(0, 1000; length=50) + +ws = hcat(ridge_reg.(Ref(X), Ref(y), μs, Ref(Q), Ref(Q_inv), Ref(λ))...) + +plot(μs, abs.(ws'); + label="", + yscale=:log10, + xlabel="mu", + ylabel="weights: log scale", +) + +# LASSO + +S(x, η) = max(x-η, 0) - max(-x-η, 0) + +function lasso(X, y, μ, Q, Q_inv, λ; + max_iter = 100, + ρ = 1e3, + w = zeros(size(X,2)), + u = zeros(size(X,2)), + z = zeros(size(X,2)), + ) + + for i in 1:max_iter + w = Q * ( (Diagonal(1 ./ (λ .+ ρ)) * ( Q_inv * (X'*y + ρ*(z-u))))) + z = S.(w + u, μ / ρ) + u = u + w - z + end + return w, u, z +end + +ws = zeros(size(X,2), length(μs)) + +for (i, μ) in enumerate(μs) + global w, u, z + w, u, z = i > 1 ? lasso(X, y, μ, Q, Q_inv, λ; w, u, z) : lasso(X, y, μ, Q, Q_inv, λ) + ws[:,i] = w +end + +plot(μs, abs.(ws'); + label="", + yscale=:log10, + xlabel="mu", + ylabel="weights: log scale", +) + + + + + + plot(0:0.1:10, gamma; xlabel="x", ylabel="gamma(x): log scale", @@ -151,71 +227,3 @@ model = glm(@formula(W ~ 1 + N + Y + I + K + F), wages, InverseGaussian(), SqrtL # Exercise - -# Ridge regression - -n = 10000 -m = 1000 - -Random.seed!(666) -X = randn(n, m) - -y = 10*X[:,1] + X[:,2] + randn(n) - -# Exercise - - - -# Comparison - -@btime ridge_reg(X, y, 10); - -@btime ridge_reg(X, y, 10, Q, Q_inv, λ); - -# Dependence on mu - -μs = range(0, 1000; length=50) - -ws = hcat(ridge_reg.(Ref(X), Ref(y), μs, Ref(Q), Ref(Q_inv), Ref(λ))...) - -plot(μs, abs.(ws'); - label="", - yscale=:log10, - xlabel="mu", - ylabel="weights: log scale", -) - -# LASSO - -S(x, η) = max(x-η, 0) - max(-x-η, 0) - -function lasso(X, y, μ, Q, Q_inv, λ; - max_iter = 100, - ρ = 1e3, - w = zeros(size(X,2)), - u = zeros(size(X,2)), - z = zeros(size(X,2)), - ) - - for i in 1:max_iter - w = Q * ( (Diagonal(1 ./ (λ .+ ρ)) * ( Q_inv * (X'*y + ρ*(z-u))))) - z = S.(w + u, μ / ρ) - u = u + w - z - end - return w, u, z -end - -ws = zeros(size(X,2), length(μs)) - -for (i, μ) in enumerate(μs) - global w, u, z - w, u, z = i > 1 ? 
lasso(X, y, μ, Q, Q_inv, λ; w, u, z) : lasso(X, y, μ, Q, Q_inv, λ) - ws[:,i] = w -end - -plot(μs, abs.(ws'); - label="", - yscale=:log10, - xlabel="mu", - ylabel="weights: log scale", -) diff --git a/scripts/lecture_11/script_sol.jl b/scripts/lecture_11/script_sol.jl index 534469729..9294b8a1a 100644 --- a/scripts/lecture_11/script_sol.jl +++ b/scripts/lecture_11/script_sol.jl @@ -9,6 +9,94 @@ using RDatasets using SpecialFunctions using Statistics + + + +# Ridge regression + +n = 10000 +m = 1000 + +Random.seed!(666) +X = randn(n, m) + +y = 10*X[:,1] + X[:,2] + randn(n) + +# Exercise + +ridge_reg(X, y, μ) = (X'*X + μ*I) \ (X'*y) + +eigen_dec = eigen(X'*X) +Q = eigen_dec.vectors +Q_inv = Matrix(Q') +λ = eigen_dec.values + +ridge_reg(X, y, μ, Q, Q_inv, λ) = Q * ((Diagonal(1 ./ (λ .+ μ)) * ( Q_inv * (X'*y)))) + +w1 = ridge_reg(X, y, 10) +w2 = ridge_reg(X, y, 10, Q, Q_inv, λ) + +norm(w1 - w2) + +# Comparison + +@btime ridge_reg(X, y, 10); + +@btime ridge_reg(X, y, 10, Q, Q_inv, λ); + +# Dependence on mu + +μs = range(0, 1000; length=50) + +ws = hcat(ridge_reg.(Ref(X), Ref(y), μs, Ref(Q), Ref(Q_inv), Ref(λ))...) + +plot(μs, abs.(ws'); + label="", + yscale=:log10, + xlabel="mu", + ylabel="weights: log scale", +) + +# LASSO + +S(x, η) = max(x-η, 0) - max(-x-η, 0) + +function lasso(X, y, μ, Q, Q_inv, λ; + max_iter = 100, + ρ = 1e3, + w = zeros(size(X,2)), + u = zeros(size(X,2)), + z = zeros(size(X,2)), + ) + + for i in 1:max_iter + w = Q * ( (Diagonal(1 ./ (λ .+ ρ)) * ( Q_inv * (X'*y + ρ*(z-u))))) + z = S.(w + u, μ / ρ) + u = u + w - z + end + return w, u, z +end + +ws = zeros(size(X,2), length(μs)) + +for (i, μ) in enumerate(μs) + global w, u, z + w, u, z = i > 1 ? lasso(X, y, μ, Q, Q_inv, λ; w, u, z) : lasso(X, y, μ, Q, Q_inv, λ) + ws[:,i] = w +end + +plot(μs, abs.(ws'); + label="", + yscale=:log10, + xlabel="mu", + ylabel="weights: log scale", +) + + + + + + plot(0:0.1:10, gamma; xlabel="x", ylabel="gamma(x): log scale", @@ -199,90 +287,3 @@ scatter(y, y_hat; ) - - - - - - - - -# Ridge regression - -n = 10000 -m = 1000 - -Random.seed!(666) -X = randn(n, m) - -y = 10*X[:,1] + X[:,2] + randn(n) - -# Exercise - -ridge_reg(X, y, μ) = (X'*X + μ*I) \ (X'*y) - -eigen_dec = eigen(X'*X) -Q = eigen_dec.vectors -Q_inv = Matrix(Q') -λ = eigen_dec.values - -ridge_reg(X, y, μ, Q, Q_inv, λ) = Q * ((Diagonal(1 ./ (λ .+ μ)) * ( Q_inv * (X'*y)))) - -w1 = ridge_reg(X, y, 10) -w2 = ridge_reg(X, y, 10, Q, Q_inv, λ) - -norm(w1 - w2) - -# Comparison - -@btime ridge_reg(X, y, 10); - -@btime ridge_reg(X, y, 10, Q, Q_inv, λ); - -# Dependence on mu - -μs = range(0, 1000; length=50) - -ws = hcat(ridge_reg.(Ref(X), Ref(y), μs, Ref(Q), Ref(Q_inv), Ref(λ))...) - -plot(μs, abs.(ws'); - label="", - yscale=:log10, - xlabel="mu", - ylabel="weights: log scale", -) - -# LASSO - -S(x, η) = max(x-η, 0) - max(-x-η, 0) - -function lasso(X, y, μ, Q, Q_inv, λ; - max_iter = 100, - ρ = 1e3, - w = zeros(size(X,2)), - u = zeros(size(X,2)), - z = zeros(size(X,2)), - ) - - for i in 1:max_iter - w = Q * ( (Diagonal(1 ./ (λ .+ ρ)) * ( Q_inv * (X'*y + ρ*(z-u))))) - z = S.(w + u, μ / ρ) - u = u + w - z - end - return w, u, z -end - -ws = zeros(size(X,2), length(μs)) - -for (i, μ) in enumerate(μs) - global w, u, z - w, u, z = i > 1 ? lasso(X, y, μ, Q, Q_inv, λ; w, u, z) : lasso(X, y, μ, Q, Q_inv, λ) - ws[:,i] = w -end - -plot(μs, abs.(ws'); - label="", - yscale=:log10, - xlabel="mu", - ylabel="weights: log scale", -)
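Both scripts lean on the same two ideas: solving the regularized system through the precomputed eigendecomposition of `X'*X`, and an ADMM loop with soft-thresholding for LASSO. The sketch below is a small self-contained sanity check of those ideas, not part of the patched scripts; the problem size, `ρ`, and the regularization values are illustrative assumptions, and `ridge_direct`, `ridge_eigen`, and `lasso_admm` are names introduced here only to mirror `ridge_reg` and `lasso` above.

```julia
# Sketch: check that the eigendecomposition-based ridge solver matches the
# direct solve, and that a few ADMM iterations with soft-thresholding zero
# out the coefficients of irrelevant features. All constants are illustrative.
using LinearAlgebra, Random

ridge_direct(X, y, μ) = (X'*X + μ*I) \ (X'*y)
ridge_eigen(X, y, μ, Q, Q_inv, λ) = Q * (Diagonal(1 ./ (λ .+ μ)) * (Q_inv * (X'*y)))

S(x, η) = max(x - η, 0) - max(-x - η, 0)          # soft-thresholding

function lasso_admm(X, y, μ, Q, Q_inv, λ; max_iter=100, ρ=100.0)
    w = zeros(size(X, 2)); u = zero(w); z = zero(w)
    for _ in 1:max_iter
        w = Q * (Diagonal(1 ./ (λ .+ ρ)) * (Q_inv * (X'*y + ρ*(z - u))))
        z = S.(w + u, μ/ρ)
        u = u + w - z
    end
    return z
end

Random.seed!(0)
X = randn(200, 5)
y = 3*X[:, 1] - 2*X[:, 2] + 0.1*randn(200)        # only features 1 and 2 matter

eig = eigen(Symmetric(X'*X))                      # Symmetric keeps the result real
Q, λ = eig.vectors, eig.values
Q_inv = Matrix(Q')

w1 = ridge_direct(X, y, 1.0)
w2 = ridge_eigen(X, y, 1.0, Q, Q_inv, λ)
println("difference between ridge solvers: ", norm(w1 - w2))  # should be near machine precision

z = lasso_admm(X, y, 50.0, Q, Q_inv, λ)
println("LASSO coefficients: ", round.(z; digits=3))          # entries 3:5 should be (near) zero
```

The point of reusing `Q`, `Q_inv`, and `λ` is that every new value of `μ` (or the ADMM penalty `ρ`) only changes the diagonal scaling `1 ./ (λ .+ μ)`, so sweeping over many regularization values avoids refactorizing the matrix.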