diff --git a/Project.toml b/Project.toml
index a1a760b..33aebd5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TensorInference"
 uuid = "c2297e78-99bd-40ad-871d-f50e56b81012"
 authors = ["Jin-Guo Liu", "Martin Roa Villescas"]
-version = "0.3.0"
+version = "0.4.0"
 
 [deps]
 Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
diff --git a/examples/hard-core-lattice-gas/main.jl b/examples/hard-core-lattice-gas/main.jl
index 1981a41..b8cd6f3 100644
--- a/examples/hard-core-lattice-gas/main.jl
+++ b/examples/hard-core-lattice-gas/main.jl
@@ -55,14 +55,14 @@ partition_func[]
 # The marginal probabilities can be computed with the [`marginals`](@ref) function, which measures how likely a site is occupied.
 mars = marginals(pmodel)
-show_graph(graph; locs=sites, vertex_colors=[(1-b, 1-b, 1-b) for b in getindex.(mars, 2)], texts=fill("", nv(graph)))
+show_graph(graph; locs=sites, vertex_colors=[(b = mars[[i]][2]; (1-b, 1-b, 1-b)) for i in vertices(graph)], texts=fill("", nv(graph)))
 # We can see that the sites at the corners are more likely to be occupied.
 
 # To obtain two-site correlations, one can set the variables to query marginal probabilities manually.
 pmodel2 = TensorNetworkModel(problem, β; mars=[[e.src, e.dst] for e in edges(graph)])
 mars = marginals(pmodel2);
 
 # We show the probability that both sites on an edge are not occupied
-show_graph(graph; locs=sites, edge_colors=[(b=mar[1, 1]; (1-b, 1-b, 1-b)) for mar in mars], texts=fill("", nv(graph)), edge_line_width=5)
+show_graph(graph; locs=sites, edge_colors=[(b = mars[[e.src, e.dst]][1, 1]; (1-b, 1-b, 1-b)) for e in edges(graph)], texts=fill("", nv(graph)), edge_line_width=5)
 
 # ## The most likely configuration
 # The MAP and MMAP can be used to get the most likely configuration given the evidence.
@@ -91,4 +91,4 @@ sum(config2)
 # The return value is a matrix, with the columns corresponding to different samples.
 configs = sample(pmodel3, 1000)
 sizes = sum(configs; dims=1)
-[count(==(i), sizes) for i=0:34] # counting sizes
\ No newline at end of file
+[count(==(i), sizes) for i=0:34] # counting sizes
diff --git a/src/mar.jl b/src/mar.jl
index 9f858da..c83b998 100644
--- a/src/mar.jl
+++ b/src/mar.jl
@@ -124,16 +124,67 @@ end
 """
 $(TYPEDSIGNATURES)
 
-Returns the marginal probability distribution of variables.
-One can use `get_vars(tn)` to get the full list of variables in this tensor network.
+Query the marginals of the variables in a [`TensorNetworkModel`](@ref).
+The returned value is a dictionary mapping the queried variables to their marginals, where a marginal is a joint probability distribution over the associated variables.
+By default, the marginals of all individual variables are returned.
+The marginal variables to query can be specified when constructing a [`TensorNetworkModel`](@ref) via its `mars` field.
+Note that this choice affects the contraction order of the tensor network.
+
+### Arguments
+- `tn`: the [`TensorNetworkModel`](@ref) to query.
+- `usecuda`: whether to use CUDA for tensor contraction.
+- `rescale`: whether to rescale the tensors during contraction.
+
+### Example
+The following example is from [`examples/asia/main.jl`](@ref).
+
+```jldoctest; setup = :(using TensorInference, Random; Random.seed!(0))
+julia> model = read_model_file(pkgdir(TensorInference, "examples", "asia", "asia.uai"));
+
+julia> tn = TensorNetworkModel(model; evidence=Dict(1=>0))
+TensorNetworkModel{Int64, DynamicNestedEinsum{Int64}, Array{Float64}}
+variables: 1 (evidence → 0), 2, 3, 4, 5, 6, 7, 8
+contraction time = 2^6.022, space = 2^2.0, read-write = 2^7.077
+
+julia> marginals(tn)
+Dict{Vector{Int64}, Vector{Float64}} with 8 entries:
+  [8] => [0.450138, 0.549863]
+  [3] => [0.5, 0.5]
+  [1] => [1.0]
+  [5] => [0.45, 0.55]
+  [4] => [0.055, 0.945]
+  [6] => [0.10225, 0.89775]
+  [7] => [0.145092, 0.854908]
+  [2] => [0.05, 0.95]
+
+julia> tn2 = TensorNetworkModel(model; evidence=Dict(1=>0), mars=[[2, 3], [3, 4]])
+TensorNetworkModel{Int64, DynamicNestedEinsum{Int64}, Array{Float64}}
+variables: 1 (evidence → 0), 2, 3, 4, 5, 6, 7, 8
+contraction time = 2^7.781, space = 2^5.0, read-write = 2^8.443
+
+julia> marginals(tn2)
+Dict{Vector{Int64}, Matrix{Float64}} with 2 entries:
+  [2, 3] => [0.025 0.025; 0.475 0.475]
+  [3, 4] => [0.05 0.45; 0.005 0.495]
+```
+
+In this example, we first set the evidence of variable 1 to 0 and then query the marginals of all individual variables.
+The returned value is a dictionary whose keys are the query variables and whose values are the corresponding marginals.
+Each marginal is a vector whose entries correspond to the probabilities of the variable taking the values 0 and 1, respectively.
+For the evidence variable 1, the marginal is always `[1.0]`, since its value is fixed to 0.
+
+Then we query the joint marginals of variables 2 and 3, and of variables 3 and 4.
+Querying joint marginals may or may not increase the contraction time and space.
+Here, the contraction space complexity increases from 2^2.0 to 2^5.0, and the contraction time complexity increases from 2^6.022 to 2^7.781.
+The output marginals are the joint probabilities of the query variables, represented as tensors.
 """
-function marginals(tn::TensorNetworkModel; usecuda = false, rescale = true)::Vector
+function marginals(tn::TensorNetworkModel; usecuda = false, rescale = true)::Dict{Vector{Int}}
     # sometimes, the cost can overflow, then we need to rescale the tensors during contraction.
     cost, grads = cost_and_gradient(tn.code, adapt_tensors(tn; usecuda, rescale))
     @debug "cost = $cost"
     if rescale
-        return LinearAlgebra.normalize!.(getfield.(grads[1:length(tn.mars)], :normalized_value), 1)
+        return Dict(zip(tn.mars, LinearAlgebra.normalize!.(getfield.(grads[1:length(tn.mars)], :normalized_value), 1)))
     else
-        return LinearAlgebra.normalize!.(grads[1:length(tn.mars)], 1)
+        return Dict(zip(tn.mars, LinearAlgebra.normalize!.(grads[1:length(tn.mars)], 1)))
     end
 end
diff --git a/test/cuda.jl b/test/cuda.jl
index 3944441..528d643 100644
--- a/test/cuda.jl
+++ b/test/cuda.jl
@@ -11,11 +11,11 @@ CUDA.allowscalar(false)
     tn = TensorNetworkModel(model; optimizer = TreeSA(ntrials = 1, niters = 2, βs = 1:0.1:40), evidence)
     @debug contraction_complexity(tn)
     @time marginals2 = marginals(tn; usecuda = true)
-    @test all(x -> x isa CuArray, marginals2)
+    @test all(x -> x.second isa CuArray, marginals2)
     # for dangling vertices, the output size is 1.
     npass = 0
     for i in 1:(model.nvars)
-        npass += (length(marginals2[i]) == 1 && reference_solution[i] == [0.0, 1]) || isapprox(Array(marginals2[i]), reference_solution[i]; atol = 1e-6)
+        npass += (length(marginals2[[i]]) == 1 && reference_solution[i] == [0.0, 1]) || isapprox(Array(marginals2[[i]]), reference_solution[i]; atol = 1e-6)
     end
     @test npass == model.nvars
 end
diff --git a/test/generictensornetworks.jl b/test/generictensornetworks.jl
index 56454f2..fc7a190 100644
--- a/test/generictensornetworks.jl
+++ b/test/generictensornetworks.jl
@@ -7,7 +7,7 @@ using GenericTensorNetworks, TensorInference
     g = GenericTensorNetworks.Graphs.smallgraph(:petersen)
    problem = IndependentSet(g)
     model = TensorNetworkModel(problem, β; mars=[[2, 3]])
-    mars = marginals(model)[1]
+    mars = marginals(model)[[2, 3]]
     problem2 = IndependentSet(g; openvertices=[2,3])
     mars2 = TensorInference.normalize!(GenericTensorNetworks.solve(problem2, PartitionFunction(β)), 1)
     @test mars ≈ mars2
diff --git a/test/mar.jl b/test/mar.jl
index 352c080..7aa10b4 100644
--- a/test/mar.jl
+++ b/test/mar.jl
@@ -31,7 +31,7 @@
     # compute marginals
     ti_sol = marginals(tn)
     ref_sol[collect(keys(evidence))] .= fill([1.0], length(evidence)) # imitate dummy vars
-    @test isapprox(ti_sol, ref_sol; atol = 1e-5)
+    @test isapprox([ti_sol[[i]] for i=1:length(ref_sol)], ref_sol; atol = 1e-5)
 end
 
 @testset "UAI Reference Solution Comparison" begin
@@ -63,7 +63,7 @@
             @debug contraction_complexity(tn)
             ti_sol = marginals(tn)
             ref_sol[collect(keys(evidence))] .= fill([1.0], length(evidence)) # imitate dummy vars
-            @test isapprox(ti_sol, ref_sol; atol = 1e-4)
+            @test isapprox([ti_sol[[i]] for i=1:length(ref_sol)], ref_sol; atol = 1e-4)
         end
     end
 end
@@ -120,15 +120,18 @@
     mars = marginals(tnet)
     tnet23 = TensorNetworkModel(model; openvars=[2,3])
     tnet34 = TensorNetworkModel(model; openvars=[3,4])
-    @test mars[1] ≈ probability(tnet23)
-    @test mars[2] ≈ probability(tnet34)
+    @test mars[[2, 3]] ≈ probability(tnet23)
+    @test mars[[3, 4]] ≈ probability(tnet34)
 
-    tnet1 = TensorNetworkModel(model; mars=[[2, 3], [3, 4]], evidence=Dict(3=>1))
-    tnet2 = TensorNetworkModel(model; mars=[[2, 3], [3, 4]], evidence=Dict(3=>0))
+    vars = [[2, 4], [3, 5]]
+    tnet1 = TensorNetworkModel(model; mars=vars, evidence=Dict(3=>1))
+    tnet2 = TensorNetworkModel(model; mars=vars, evidence=Dict(3=>0))
     mars1 = marginals(tnet1)
     mars2 = marginals(tnet2)
     update_evidence!(tnet1, Dict(3=>0))
     mars1b = marginals(tnet1)
-    @test !(mars1 ≈ mars2)
-    @test mars1b ≈ mars2
+    for k in vars
+        @test !(mars1[k] ≈ mars2[k])
+        @test mars1b[k] ≈ mars2[k]
+    end
 end
\ No newline at end of file
diff --git a/test/sampling.jl b/test/sampling.jl
index eca855c..6ad254b 100644
--- a/test/sampling.jl
+++ b/test/sampling.jl
@@ -49,14 +49,14 @@ using TensorInference, Test
     n = 10000
     tnet = TensorNetworkModel(model)
     samples = sample(tnet, n)
-    mars = getindex.(marginals(tnet), 2)
+    mars = marginals(tnet)
     mars_sample = [count(s->s[k]==(1), samples) for k=1:8] ./ n
-    @test isapprox(mars, mars_sample, atol=0.05)
+    @test isapprox([mars[[i]][2] for i=1:8], mars_sample, atol=0.05)
 
     # fix the evidence
     tnet = TensorNetworkModel(model, optimizer=TreeSA(), evidence=Dict(7=>1))
     samples = sample(tnet, n)
-    mars = getindex.(marginals(tnet), 1)
+    mars = marginals(tnet)
     mars_sample = [count(s->s[k]==(0), samples) for k=1:8] ./ n
-    @test isapprox([mars[1:6]..., mars[8]], [mars_sample[1:6]..., mars_sample[8]], atol=0.05)
+    @test isapprox([[mars[[i]][1] for i=1:6]..., mars[[8]][1]], [mars_sample[1:6]..., mars_sample[8]], atol=0.05)
 end
\ No newline at end of file
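
A minimal usage sketch of the `Dict`-based `marginals` API introduced by this diff, reusing the asia network from the docstring example above (the model file path and evidence are the same assumptions made there):

```julia
using TensorInference

model = read_model_file(pkgdir(TensorInference, "examples", "asia", "asia.uai"))
tn = TensorNetworkModel(model; evidence = Dict(1 => 0))

mars = marginals(tn)
# v0.3.x: marginals returned a Vector indexed by position, e.g. mars[2].
# v0.4.0: index the returned Dict by the queried variable vector instead.
p2 = mars[[2]]   # marginal distribution of variable 2

# Joint marginals are keyed the same way, via the `mars` field:
tn2 = TensorNetworkModel(model; evidence = Dict(1 => 0), mars = [[2, 3]])
p23 = marginals(tn2)[[2, 3]]   # 2x2 joint marginal of variables 2 and 3
```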