Skip to content

Commit 851eca9

Browse files
authored
Add loglikelihood and nullloglikelihood (#239)
* Add loglikelihood and nullloglikelihood along with tests Add StatsAPI functions related to loglikelihood and nullloglikelihood. These are necessary for computing some R-squared values (McFadden, etc.), so those are also tested. * correct description * fixes for dof * fix tests * add dof_fes and calculate pseudo adjr2 based on that * Small dof correction * add r2 and adjr2 based on StatsAPI * change deviance -> rss and add nulldeviance
1 parent f4cb313 commit 851eca9

File tree

4 files changed

+91
-15
lines changed

4 files changed

+91
-15
lines changed

src/FixedEffectModel.jl

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,10 @@ struct FixedEffectModel <: RegressionModel
2525

2626
nobs::Int64 # Number of observations
2727
dof::Int64 # Number parameters estimated - has_intercept. Used for p-value of F-stat.
28+
dof_fes::Int64 # Number of fixed effects
2829
dof_residual::Int64 # dof used for t-test and p-value of F-stat. nobs - degrees of freedoms with simple std
2930
rss::Float64 # Sum of squared residuals
3031
tss::Float64 # Total sum of squares
31-
r2::Float64 # R squared
32-
adjr2::Float64 # R squared adjusted
3332

3433
F::Float64 # F statistics
3534
p::Float64 # p value for the F statistics
@@ -56,13 +55,55 @@ StatsAPI.vcov(m::FixedEffectModel) = m.vcov
5655
StatsAPI.nobs(m::FixedEffectModel) = m.nobs
5756
StatsAPI.dof(m::FixedEffectModel) = m.dof
5857
StatsAPI.dof_residual(m::FixedEffectModel) = m.dof_residual
59-
StatsAPI.r2(m::FixedEffectModel) = m.r2
60-
StatsAPI.adjr2(m::FixedEffectModel) = m.adjr2
58+
StatsAPI.r2(m::FixedEffectModel) = r2(m, :devianceratio)
6159
StatsAPI.islinear(m::FixedEffectModel) = true
62-
StatsAPI.deviance(m::FixedEffectModel) = m.tss
60+
StatsAPI.deviance(m::FixedEffectModel) = rss(m)
61+
StatsAPI.nulldeviance(m::FixedEffectModel) = m.tss
6362
StatsAPI.rss(m::FixedEffectModel) = m.rss
64-
StatsAPI.mss(m::FixedEffectModel) = deviance(m) - rss(m)
63+
StatsAPI.mss(m::FixedEffectModel) = nulldeviance(m) - rss(m)
6564
StatsModels.formula(m::FixedEffectModel) = m.formula_schema
65+
dof_fes(m::FixedEffectModel) = m.dof_fes
66+
67+
function StatsAPI.loglikelihood(m::FixedEffectModel)
68+
n = nobs(m)
69+
-n/2 * (log(2π * deviance(m) / n) + 1)
70+
end
71+
72+
function StatsAPI.nullloglikelihood(m::FixedEffectModel)
73+
n = nobs(m)
74+
-n/2 * (log(2π * nulldeviance(m) / n) + 1)
75+
end
76+
77+
# Stata reghdfe reports nullloglikelood after fixed effects are dealt with
78+
# and some of R fixest estimates also use loglikelihood with only fixed
79+
# effects in the regression
80+
function nullloglikelihood_within(m::FixedEffectModel)
81+
n = nobs(m)
82+
tss_within = deviance(m) / (1 - m.r2_within)
83+
-n/2 * (log(2π * tss_within / n) + 1)
84+
end
85+
86+
function StatsAPI.adjr2(model::FixedEffectModel, variant::Symbol=:devianceratio)
87+
#dof(model) = parameters - has_intercept
88+
#dof_fes(model) = total degrees of freedom for all fixed effects, including the intercept
89+
has_int = hasintercept(formula(model))
90+
k = dof(model) + dof_fes(model) + has_int
91+
if variant == :McFadden
92+
# there seems to be some inconsistency as to whether the intercept is included in the dof
93+
# these values match R fixest
94+
k = k - has_int - has_fe(model)
95+
ll = loglikelihood(model)
96+
ll0 = nullloglikelihood(model)
97+
1 - (ll - k)/ll0
98+
elseif variant == :devianceratio
99+
n = nobs(model)
100+
dev = deviance(model)
101+
dev0 = nulldeviance(model)
102+
1 - (dev*(n - (has_int | has_fe(model)))) / (dev0 * max(n - k, 1))
103+
else
104+
throw(ArgumentError("variant must be one of :McFadden or :devianceratio"))
105+
end
106+
end
66107

67108
function StatsAPI.confint(m::FixedEffectModel; level::Real = 0.95)
68109
scale = tdistinvcdf(StatsAPI.dof_residual(m), 1 - (1 - level) / 2)

src/FixedEffectModels.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@ include("fit.jl")
2525
include("partial_out.jl")
2626

2727
# Export from StatsBase
28-
export coef, coefnames, coeftable, responsename, vcov, stderror, nobs, dof, dof_residual, r2, r², adjr2, adjr², islinear, deviance, rss, mss, confint, predict, residuals, fit
28+
export coef, coefnames, coeftable, responsename, vcov, stderror, nobs, dof, dof_residual, r2, r², adjr2, adjr², islinear, deviance, nulldeviance, rss, mss, confint, predict, residuals, fit,
29+
loglikelihood, nullloglikelihood, dof_fes
30+
2931

3032
export reg,
3133
partial_out,

src/fit.jl

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ Estimate a linear model with high dimensional categorical variables / instrument
1818
* `drop_singletons::Bool = true`: Should singletons be dropped?
1919
* `progress_bar::Bool = true`: Should the regression show a progressbar?
2020
* `first_stage::Bool = true`: Should the first-stage F-stat and p-value be computed?
21-
* `dof_add::Integer = 0`:
2221
* `subset::Union{Nothing, AbstractVector} = nothing`: select specific rows.
2322
2423
@@ -429,6 +428,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
429428
##
430429
##############################################################################
431430
# Compute degrees of freedom
431+
dof_fes_total = 0
432432
dof_fes = 0
433433
if has_fes
434434
for fe in fes
@@ -439,6 +439,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
439439
#only count groups that exists
440440
dof_fes += nunique(fe)
441441
end
442+
dof_fes_total += nunique(fe)
442443
end
443444
end
444445

@@ -448,7 +449,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
448449
end
449450

450451
# Compute standard error
451-
vcov_data = Vcov.VcovData(Xhat, crossx, residuals, nobs - size(X, 2) - dof_fes - dof_add)
452+
vcov_data = Vcov.VcovData(Xhat, crossx, residuals, nobs - size(X, 2) - dof_fes)
452453
matrix_vcov = StatsAPI.vcov(vcov_data, vcov_method)
453454

454455
# Compute Fstat
@@ -471,11 +472,9 @@ function StatsAPI.fit(::Type{FixedEffectModel},
471472
end
472473
end
473474

474-
# Compute rss, tss, r2, r2 adjusted
475+
# Compute rss, tss
475476
rss = sum(abs2, residuals)
476477
mss = tss_total - rss
477-
r2 = 1 - rss / tss_total
478-
adjr2 = 1 - rss / tss_total * (nobs - (has_intercept | has_fe_intercept)) / max(nobs - size(X, 2) - dof_fes - dof_add, 1)
479478
if has_fes
480479
r2_within = 1 - rss / tss_partial
481480
end
@@ -515,6 +514,5 @@ function StatsAPI.fit(::Type{FixedEffectModel},
515514
if esample == Colon()
516515
esample = trues(N)
517516
end
518-
519-
return FixedEffectModel(coef, matrix_vcov, vcov, nclusters, esample, residuals2, augmentdf, fekeys, coef_names, response_name, formula_origin, formula_schema, contrasts, nobs, dof_, dof_tstat_, rss, tss_total, r2, adjr2, F, p, iterations, converged, r2_within, F_kp, p_kp)
517+
return FixedEffectModel(coef, matrix_vcov, vcov, nclusters, esample, residuals2, augmentdf, fekeys, coef_names, response_name, formula_origin, formula_schema, contrasts, nobs, dof_, dof_fes_total, dof_tstat_, rss, tss_total, F, p, iterations, converged, r2_within, F_kp, p_kp)
520518
end

test/fit.jl

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
using CUDA, FixedEffectModels, CategoricalArrays, CSV, DataFrames, Test, LinearAlgebra
2-
2+
using FixedEffectModels: nullloglikelihood_within
33

44

55
##############################################################################
@@ -458,6 +458,10 @@ end
458458
x = reg(df, m)
459459
@test r2(x) 0.0969 atol = 1e-4
460460
@test adjr2(x) 0.09622618 atol = 1e-4
461+
m = @formula Sales ~ Price + Pimin + fe(State)
462+
x = reg(df, m, Vcov.cluster(:State))
463+
@test r2(x) 0.77472 atol = 1e-4
464+
@test adjr2(x) 0.766768 atol = 1e-4
461465

462466

463467
##############################################################################
@@ -561,6 +565,37 @@ end
561565
m = @formula Sales ~ (Price ~ Pimin + CPI)
562566
x = reg(df, m, Vcov.cluster(:State, :Year))
563567
@test x.F_kp 2873.1405 atol = 1e-4
568+
569+
############################################
570+
##
571+
## loglikelihood and related
572+
##
573+
## tested with clusters since those should not
574+
## affect the results
575+
############################################
576+
577+
m = @formula(Sales ~ Price)
578+
x = reg(df, m, Vcov.cluster(:State))
579+
@test loglikelihood(x) -6625.8266 atol = 1e-4
580+
@test nullloglikelihood(x) -6696.1387 atol = 1e-4
581+
@test r2(x, :McFadden) 0.01050 atol = 1e-4 # Pseudo R2 in R fixest
582+
@test adjr2(x, :McFadden) 0.01035 atol = 1e-4
583+
584+
m = @formula(Sales ~ Price + Pimin)
585+
x = reg(df, m, Vcov.cluster(:State))
586+
@test loglikelihood(x) -6598.6300 atol = 1e-4
587+
@test nullloglikelihood(x) -6696.1387 atol = 1e-4
588+
@test r2(x, :McFadden) 0.01456 atol = 1e-4 # Pseudo R2 in R fixest
589+
@test adjr2(x, :McFadden) 0.01426 atol = 1e-4
590+
591+
m = @formula(Sales ~ Price + Pimin + fe(State))
592+
x = reg(df, m, Vcov.cluster(:State))
593+
@test loglikelihood(x) -5667.7629 atol = 1e-4
594+
@test nullloglikelihood(x) -6696.1387 atol = 1e-4
595+
@test nullloglikelihood_within(x) -5891.2836 atol = 1e-4
596+
@test r2(x, :McFadden) 0.15358 atol = 1e-4 # Pseudo R2 in R fixest
597+
@test adjr2(x, :McFadden) 0.14656 atol = 1e-4
598+
564599
end
565600

566601

0 commit comments

Comments
 (0)