Skip to content

Commit d7f1e57

Browse files
committed
add scaling to qmedist(), mmedist()
1 parent 4fa8de2 commit d7f1e57

File tree

7 files changed

+83
-17
lines changed

7 files changed

+83
-17
lines changed

NEWS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ BUG FIX
99

1010
- mgedist() may suffer a numerical issue for Anderson-Darling GoF metrics. All GoF metrics now take care of numerical issue, such as log(0) or 1/0, and are properly scaled by the sample sized to avoid large sample size issues. Thanks for Ethan Chapman for reporting the bug.
1111
- the default starting value for the gamma distribution was wrongly computed for the rate parameter. Thanks for Wendy Martin for reporting the bug.
12-
- mledist() may suffer a scaling issue and the objective function is properly scaled by the sample sized to avoid large sample size issues.
12+
- mledist(), mmedist(), qmedist() may suffer a scaling issue and the objective function is properly scaled by the sample sized to avoid large sample size issues.
1313

1414
# fitdistrplus 1.2-1
1515

R/mmedist.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ mmedist <- function (data, distr, order, memp, start=NULL, fix.arg=NULL,
386386
(momemp - momtheo)^2
387387
}
388388
fnobj <- function(par, fix.arg, obs, mdistnam, memp, weights)
389-
sum( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp)) )
389+
mean( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp)) )
390390
}else
391391
{
392392
DIFF2 <- function(par, fix.arg, order, obs, mdistnam, memp, weights)
@@ -396,7 +396,7 @@ mmedist <- function (data, distr, order, memp, start=NULL, fix.arg=NULL,
396396
(momemp - momtheo)^2
397397
}
398398
fnobj <- function(par, fix.arg, obs, mdistnam, memp, weights)
399-
sum( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp, weights)) )
399+
mean( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp, weights)) )
400400
}
401401

402402
cens <- FALSE

R/qmedist.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ qmedist <- function (data, distr, probs, start=NULL, fix.arg=NULL,
133133
}
134134

135135
fnobj <- function(par, fix.arg, obs, qdistnam, qtype)
136-
sum( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
136+
mean( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
137137

138138
}else if (!cens && !is.null(weights))
139139
{
@@ -144,7 +144,7 @@ qmedist <- function (data, distr, probs, start=NULL, fix.arg=NULL,
144144
(qemp - qtheo)^2
145145
}
146146
fnobj <- function(par, fix.arg, obs, qdistnam, qtype)
147-
sum( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
147+
mean( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
148148
}
149149

150150
# Function to calculate the loglikelihood to return

tests/t-mledist-nocens.R

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ x <- c(rnorm(1000, 5), rnorm(1000, 10))
114114
#MLE fit
115115
fit1 <- mledist(x, "norm2", start=list(w=1/3, m1=4, s1=2, m2=8, s2=2),
116116
lower=c(0, 0, 0, 0, 0))
117-
117+
fit1
118118

119119
#fitted coef around 0.5003298 4.9842719 0.9909527 10.0296973 1.0024444 , fitted loglik -4185.114
120120

@@ -307,8 +307,6 @@ x <- c(rep(0, n), rpois(n, 10), rpois(n, 50))
307307
mledist(x, "pois", optim.method="Nelder-Mead", control=list(maxit=10))
308308

309309

310-
311-
312310
# (15) basic fit of a normal distribution with new fix.arg/start.arg
313311
#
314312

tests/t-mmedist.R

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ set.seed(1234)
88
x1 <- rnorm(n=nsample)
99
mmedist(x1,"norm")
1010

11+
#fitted coef is -0.3831574 0.9446870, fitted loglik is -13.62037
12+
1113
try(mmedist(x1,"norm", fix.arg=list(mean=0)))
1214

1315
# (2) fit a discrete distribution (Poisson)
@@ -17,14 +19,16 @@ set.seed(1234)
1719
x2 <- rpois(n=nsample, lambda = 2)
1820
mmedist(x2,"pois")
1921

22+
#fitted coef is 1.7, fitted loglik is -15.31626
23+
2024
# (3) fit a finite-support distribution (beta)
2125
#
2226

2327
set.seed(1234)
2428
x3 <- rbeta(n=nsample, shape1=5, shape2=10)
2529
mmedist(x3,"beta")
2630

27-
31+
#fitted coef is 3.956081 9.512364, fitted loglik is 6.939847
2832

2933
# (4) fit a Pareto distribution
3034
#
@@ -42,6 +46,8 @@ if(any(installed.packages()[, "Package"] == "actuar"))
4246
#fit
4347
mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10),
4448
lower=1, upper=Inf)
49+
#fitted coef is 127.4319 64.2936, fitted loglik is -3.689438
50+
4551
mmedist(x4, "pareto", order=1, memp=memp, start=list(shape=10), fix.arg=list(scale=1.5),
4652
lower=2, upper=Inf)
4753
mmedist(x4, "pareto", order=1, memp=memp, start=function(x) list(shape=10), fix.arg=list(scale=1.5),
@@ -78,6 +84,7 @@ s2 <- log(1+var(x3)/mean(x3)^2*(n-1)/n)
7884
mu <- log(mean(x3)) - s2/2
7985
cbind(c(mu, s2), f2$estimate)
8086

87+
#fitted coef is -0.1123774 1.1839216, fitted loglik is -12.99279
8188

8289
c(truestim=exp(mu+s2/2),
8390
jensen=as.numeric(exp(f1$estimate["meanlog"]+f1$estimate["sdlog"]^2/2)),
@@ -124,6 +131,8 @@ fitdistrplus:::wtdvar(x1, w)
124131

125132
mmedist(exp(x1), "lnorm", weights=w)$estimate
126133

134+
#fitted coef is -0.4199772 0.6276551, fitted loglik is -26.82328
135+
127136
#test non integer weights
128137
try(mmedist(x1, "norm", weights=rep(1/3, length(x1))))
129138
try(mmedist(1:10, "pois", weights=c(rep(1, 9), 1.001), start=list(lambda=mean(x))))
@@ -132,7 +141,7 @@ try(mmedist(1:10, "pois", weights=c(rep(1, 9), 1.0000001), start=list(lambda=mea
132141

133142
# (8) fit of a neg binom distribution with weighted moment matching estimation
134143
#
135-
144+
set.seed(1234)
136145
x4 <- rnbinom(nsample, 5, 1/2)
137146
table(x4)
138147
w <- rep(1, length(x4))
@@ -141,6 +150,8 @@ w[x4 > 5] <- 2
141150
mmedist(x4, "nbinom", weights=w)$estimate
142151
mmedist(x4, "nbinom", weights=NULL)$estimate
143152

153+
#fitted coef is 12.980769 4.615385, fitted loglik is -29.67728
154+
144155
# (9) relevant example for zero modified geometric distribution
145156
#
146157

@@ -207,6 +218,9 @@ w[x5 > 5] <- 2
207218

208219
mmedist(x5, "zmgeom", order=1:2, memp=memp1, start=list(p1=mean(x5 == 0), p2=1/mean(x5[x5 > 0])),
209220
lower=0.01, upper=0.99)$estimate
221+
222+
#fitted coef is 0.4528775 0.1402878 , fitted loglik is -26.80729
223+
210224
mmedist(x5, "zmgeom", order=1:2, memp=memp2, start=list(p1=mean(x5 == 0), p2=1/mean(x5[x5 > 0])),
211225
weights=w)$estimate
212226

@@ -221,6 +235,7 @@ mmedist(x5, "zmgeom", order=1:2, memp=memp2, start=list(p1=mean(x5 == 0), p2=1/m
221235
if(any(installed.packages()[, "Package"] == "actuar"))
222236
{
223237
require("actuar")
238+
set.seed(1234)
224239
#simulate a sample
225240
x4 <- rpareto(nsample, 6, 2)
226241

@@ -231,4 +246,25 @@ if(any(installed.packages()[, "Package"] == "actuar"))
231246
#fit
232247
mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf, optim.method = "L-BFGS-B") #L-BFGS-B via optim
233248
mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf, optim.method = "Nelder") #Nelder Mead via constrOptim
249+
250+
#fitted coef is 4.294391 1.597783, fitted loglik is -2.173544
234251
}
252+
253+
254+
# (11) large sample size issue
255+
256+
if(FALSE)
257+
{
258+
set.seed(123)
259+
obs <- rlnorm(1e7, 3, 2)
260+
for(i in 2:7)
261+
cat(i, try(mmedist(obs[1:10^i], "lnorm", control=list(trace=0, REPORT=1))$estimate, silent=TRUE), "\n")
262+
263+
# 2 3.795472 1.370517
264+
# 3 3.569512 1.67947
265+
# 4 3.241516 1.881192
266+
# 5 3.179215 1.902745
267+
# 6 2.993209 2.004211
268+
# 7 2.996089 2.002622
269+
}
270+

tests/t-msedist.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,15 @@ legend("bottomright", lty=1:2, col=c("green", "blue"), leg=c("theoretical value"
3535

3636
msedist(x1, "exp", control=list(trace=0, REPORT=1))
3737

38+
#fitted coef is 1.336924, fitted loglik is -5.610213
39+
3840
mse_exp <- fitdist(x1, "exp", method="mse")
3941
plot(mse_exp)
4042
summary(mse_exp)
4143
gofstat(mse_exp)
4244

45+
#fitted coef is -0.1486435 0.9892013, fitted loglik is -13.92195
46+
4347
mse_exp_boot <- bootdist(mse_exp, niter = nboot)
4448
plot(mse_exp_boot)
4549
abline(v=1, col="green")
@@ -88,6 +92,8 @@ optim(c(2,2), Dn)
8892
msedist(x1, "lnorm", control=list(trace=0, REPORT=1))
8993

9094

95+
#fitted coef is -0.5939281 0.6723368, fitted loglik is -2.380166
96+
9197
mse_lnorm <- fitdist(x1, "lnorm", method="mse")
9298
mle_lnorm <- fitdist(x1, "lnorm", method="mle")
9399
plot(mse_lnorm)
@@ -113,6 +119,7 @@ optim(c(1,1,10), Dn)
113119

114120
msedist(x1, "burr", start=list(shape1=1, shape2=1, rate=10), control=list(trace=0, REPORT=1))
115121

122+
#fitted coef is 0.609447 3.343173 3.811287, fitted loglik is -3.023078
116123

117124
mse_burr <- fitdist(x1, "burr", method="mse", start=list(shape1=1, shape2=1, rate=10))
118125
mle_burr <- fitdist(x1, "burr", method="mle", start=list(shape1=1, shape2=1, rate=10))

tests/t-qmedist.R

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ set.seed(1234)
88
x1 <- rnorm(n=nsample)
99
qmedist(x1, "norm", probs=c(1/3, 2/3))
1010

11+
#fitted coef is -0.1486435 0.9892013, fitted loglik is -13.92195
1112

1213
# (2) defining your own distribution functions, here for the Gumbel
1314
# distribution for other distributions, see the CRAN task view dedicated
@@ -17,20 +18,25 @@ dgumbel <- function(x, a, b) 1/b*exp((a-x)/b)*exp(-exp((a-x)/b))
1718
qgumbel <- function(p, a, b) a - b*log(-log(p))
1819
qmedist(x1, "gumbel", probs=c(1/3, 2/3), start=list(a=10,b=5))
1920

21+
#fitted coef is -0.4935571 0.8557281, fitted loglik is -16.79828
22+
2023
# (3) fit a discrete distribution (Poisson)
2124
#
2225

2326
set.seed(1234)
2427
x2 <- rpois(n=nsample,lambda = 2)
2528
qmedist(x2, "pois", probs=1/2)
2629

30+
#fitted coef is 1.7, fitted loglik is -15.31626
31+
2732
# (4) fit a finite-support distribution (beta)
2833
#
2934

3035
set.seed(1234)
3136
x3 <- rbeta(n=nsample, shape1=5, shape2=10)
3237
qmedist(x3, "beta", probs=c(1/3, 2/3))
3338

39+
#fitted coef is 4.010999 8.397853, fitted loglik is 6.642124
3440

3541
# (5) fit frequency distributions on USArrests dataset.
3642
#
@@ -39,6 +45,8 @@ x4 <- USArrests$Assault
3945
qmedist(x4, "pois", probs=1/2)
4046
qmedist(x4, "nbinom", probs=c(1/3, 2/3))
4147

48+
#fitted coef is 2.518966 182.313344, fitted loglik is 292.5969
49+
4250
# (6) normal mixture
4351
#
4452

@@ -65,7 +73,8 @@ fit2 <- qmedist(x, "norm2", probs=c(1/6, 1/4, 1/3, 1/2, 2/3),
6573
start=list(poid=1/3, m1=4, s1=2, m2=8, s2=2),
6674
lower=c(0, 0, 0, 0, 0), upper=c(1/2, Inf, Inf, Inf, Inf))
6775

68-
76+
fit2
77+
#fitted coef is 0.3433528 4.2872449 0.3891135 9.2598612 1.7948554, fitted loglik is -38.14106
6978

7079
# (7) test error messages
7180
#
@@ -106,14 +115,11 @@ x <- rexp(nsample)
106115
f3 <- qmedist(x, "exp", probs=1/2)
107116
w4 <- c(rep(1, nsample/2), round(sqrt(1:(nsample/2))))
108117
f4 <- qmedist(x, "exp", weights=w4, probs=1/2)
109-
f3$estimate
110-
f4$estimate
118+
c(f3$estimate, f4$estimate)
111119

112-
f3$loglik
113-
f4$loglik
120+
c(f3$loglik, f4$loglik)
114121

115-
median(x)
116-
median(tail(x, 50))
122+
#fitted coef is 0.4816191, fitted loglik is -16.95355
117123

118124

119125
#try non integer weights
@@ -128,3 +134,22 @@ qmedist(x, "norm", probs=1:2/3, control=list(maxit=2), start=list(mean=1e5, sd=1
128134
x <- rnorm(nsample)
129135
qmedist(x, "norm", probs=1:2/3, optim.method="L-BFGS-B", lower=c(-Inf, 0)) #via optim
130136
qmedist(x, "norm", probs=1:2/3, optim.method="Nelder", lower=c(-Inf, 0)) #via constrOptim
137+
138+
139+
# (11) large sample size issue
140+
141+
if(FALSE)
142+
{
143+
set.seed(123)
144+
obs <- rlnorm(1e7, 3, 2)
145+
for(i in 2:7)
146+
cat(i, try(qmedist(obs[1:10^i], "lnorm", probs=1:2/3, control=list(trace=0, REPORT=1))$estimate, silent=TRUE), "\n")
147+
148+
# 2 3.109257 1.767013
149+
# 3 3.022615 1.857885
150+
# 4 2.999376 1.978701
151+
# 5 3.003344 2.00941
152+
# 6 2.99881 2.001733
153+
# 7 2.999859 2.000227
154+
}
155+

0 commit comments

Comments
 (0)