From afa34c883b91741e315a01da868dca00f6fd3acd Mon Sep 17 00:00:00 2001 From: James Fairbanks Date: Tue, 24 Feb 2015 18:12:44 -0500 Subject: [PATCH] Extend varinfo to support normalization without breaking api with tests with docs --- doc/source/varinfo.rst | 17 +++++++++++------ src/varinfo.jl | 30 ++++++++++++++++++++++++------ test/varinfo.jl | 27 +++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 12 deletions(-) diff --git a/doc/source/varinfo.rst b/doc/source/varinfo.rst index c20d6b23..e68aaaeb 100644 --- a/doc/source/varinfo.rst +++ b/doc/source/varinfo.rst @@ -1,7 +1,7 @@ Variation of Information ========================== -`Variation of information `_ (also known as *shared information distance*) is a measure of the distance between two clusterings. It is devised based on mutual information, but it is a true metric, *i.e.* it satisfies symmetry and triangle inequality. +`Variation of information `_ (also known as *shared information distance*) is a measure of the distance between two clusterings. It is devised based on mutual information, but it is a true metric, *i.e.* it satisfies symmetry and triangle inequality. There are variants of variation of information that have been discussed. Vinh, Epps, and Bailey provide a survey of these variants. The names of the variants are taken from Table 3 of this paper. **References:** @@ -9,9 +9,14 @@ Variation of Information *Comparing Clusterings by the Variation of Information.* Learning Theory and Kernel Machines: 173–187. + Vinh, Nguyen Xuan and Epps, Julien and Bailey, James (2010) + *Information Theoretic Measures for Clusterings Comparison: + Variants, Properties, Normalization and Correction for Chance* + Journal of Machine Learning Research 11 2837-2854 + This package provides the ``varinfo`` function that implements this metric: -.. function:: varinfo(k1, a1, k2, a2) +.. function:: varinfo(k1, a1, k2, a2, variant) Compute the variation of information between two assignments. 
@@ -19,14 +24,14 @@ This package provides the ``varinfo`` function that implements this metric: :param a1: The assignment vector for the first clustering. :param k2: The number of clusters in the second clustering. :param a2: The assignment vector for the second clustering. + :param variant: The type of normalization to perform, given as a symbol: one of ``:Djoint``, ``:Dmax``, ``:djoint`` or ``:dmax``. Defaults to ``:Djoint``, which is unnormalized. :return: the value of variation of information. -.. function:: varinfo(R, k0, a0) +.. function:: varinfo(R, k0, a0, variant) This method takes ``R``, an instance of ``ClusteringResult``, as input, and computes the variation of information between its corresponding clustering with one given by ``(k0, a0)``, where ``k0`` is the number of clusters in the other clustering, while ``a0`` is the corresponding assignment vector. -.. function:: varinfo(R1, R2) - - This method takes ``R1`` and ``R2`` (both are instances of ``ClusteringResult``) and computes the variation of information between them. +.. function:: varinfo(R1, R2, variant) + This method takes ``R1`` and ``R2`` (both are instances of ``ClusteringResult``) and computes the variation of information between them. 
\ No newline at end of file diff --git a/src/varinfo.jl b/src/varinfo.jl index b6cf76bf..f893ec64 100644 --- a/src/varinfo.jl +++ b/src/varinfo.jl @@ -1,6 +1,7 @@ # Variation of Information +const _varinfo_default_variant = :Djoint -function varinfo(k1::Int, a1::AbstractVector{Int}, +function information(k1::Int, a1::AbstractVector{Int}, k2::Int, a2::AbstractVector{Int}) # check input arguments @@ -45,13 +46,30 @@ function varinfo(k1::Int, a1::AbstractVector{Int}, end end - return H1 + H2 - I * 2.0 + return I, H1, H2 end -varinfo(R::ClusteringResult, k0::Int, a0::AbstractVector{Int}) = - varinfo(nclusters(R), assignments(R), k0, a0) +function varinfo(k1::Int, a1::AbstractVector{Int}, + k2::Int, a2::AbstractVector{Int}, + variant::Symbol=_varinfo_default_variant) + I, H1, H2 = information(k1, a1, k2, a2) + if variant == :Djoint + v = H1 + H2 - I * 2.0 + elseif variant == :Dmax + v = max(H1,H2) - I + elseif variant == :djoint + v = (1 - 2*I/(H1+H2)) + elseif variant == :dmax + v = 1 - (I/max(H1, H2)) + end + return v +end + +varinfo(R::ClusteringResult, k0::Int, a0::AbstractVector{Int}, variant::Symbol=_varinfo_default_variant) = + varinfo(nclusters(R), assignments(R), k0, a0, variant) -varinfo(R1::ClusteringResult, R2::ClusteringResult) = +varinfo(R1::ClusteringResult, R2::ClusteringResult, variant::Symbol=_varinfo_default_variant) = varinfo(nclusters(R1), assignments(R1), - nclusters(R2), assignments(R2)) + nclusters(R2), assignments(R2), + variant) diff --git a/test/varinfo.jl b/test/varinfo.jl index abdb8e84..5c94b486 100644 --- a/test/varinfo.jl +++ b/test/varinfo.jl @@ -8,21 +8,48 @@ a2 = [1, 1, 1, 1, 2, 2, 2, 2, 2, 2] @test_approx_eq_eps varinfo(3, a1, 3, a1) 0.0 1.0e-12 @test_approx_eq_eps varinfo(2, a2, 2, a2) 0.0 1.0e-12 +@test_approx_eq_eps varinfo(3, a1, 3, a1, :dmax) 0.0 1.0e-12 +@test_approx_eq_eps varinfo(3, a1, 3, a1, :djoint) 0.0 1.0e-12 +@test_approx_eq_eps varinfo(3, a1, 3, a1, :Dmax) 0.0 1.0e-12 +@test_approx_eq_eps varinfo(3, a1, 3, a1, :Djoint) 
0.0 1.0e-12 v = varinfo(3, a1, 2, a2) v_ = varinfo(2, a2, 3, a1) @test 0.0 < v < log(3) @test_approx_eq v v_ +nid = varinfo(3, a1, 2, a2, :dmax) +nid_ = varinfo(2, a2, 3, a1, :dmax) +@test 0.0 < nid < 1 +@test_approx_eq nid nid_ + +nid = varinfo(3, a1, 2, a2, :djoint) +nid_ = varinfo(2, a2, 3, a1, :djoint) +@test 0.0 < nid < 1 +@test_approx_eq nid nid_ + a1 = [1, 2, 3, 4, 5] a2 = [1, 1, 1, 1, 1] @test_approx_eq varinfo(5, a1, 1, a2) log(5) @test_approx_eq varinfo(1, a2, 5, a1) log(5) +@test_approx_eq varinfo(5, a1, 1, a2, :dmax) 1.0 +@test_approx_eq varinfo(1, a2, 5, a1, :dmax) 1.0 + +@test_approx_eq varinfo(5, a1, 1, a2, :djoint) 1.0 +@test_approx_eq varinfo(1, a2, 5, a1, :djoint) 1.0 a1 = [1, 1, 1, 2, 2, 2] a2 = [2, 2, 2, 1, 1, 1] @test_approx_eq_eps varinfo(2, a1, 2, a2) 0.0 1.0e-12 @test_approx_eq_eps varinfo(2, a1, 3, a2) 0.0 1.0e-12 @test_approx_eq_eps varinfo(4, a1, 3, a2) 0.0 1.0e-12 + +@test_approx_eq_eps varinfo(2, a1, 2, a2, :dmax) 0.0 1.0e-12 +@test_approx_eq_eps varinfo(2, a1, 3, a2, :dmax) 0.0 1.0e-12 +@test_approx_eq_eps varinfo(4, a1, 3, a2, :dmax) 0.0 1.0e-12 + +@test_approx_eq_eps varinfo(2, a1, 2, a2, :djoint) 0.0 1.0e-12 +@test_approx_eq_eps varinfo(2, a1, 3, a2, :djoint) 0.0 1.0e-12 +@test_approx_eq_eps varinfo(4, a1, 3, a2, :djoint) 0.0 1.0e-12