Add loglikelihood and nullloglikelihood (#239)

junder873 · web-flow · commit 851eca929981 · 2023-06-29T10:35:48.000-05:00
* Add loglikelihood and nullloglikelihood along with tests

Add StatsAPI functions related to loglikelihood and nullloglikelihood. These are necessary for computing some R-squared values (McFadden, etc.), so those are also tested.

* correct description

* fixes for dof

* fix tests

* add dof_fes and calculate pseudo adjr2 based on that

* Small dof correction

* add r2 and adjr2 based on StatsAPI

* change deviance -> rss and add nulldeviance
diff --git a/src/FixedEffectModel.jl b/src/FixedEffectModel.jl
@@ -25,11 +25,10 @@ struct FixedEffectModel <: RegressionModel
 
     nobs::Int64             # Number of observations
     dof::Int64              # Number parameters estimated - has_intercept. Used for p-value of F-stat.
+    dof_fes::Int64          # Number of fixed effects
     dof_residual::Int64     # dof used for t-test and p-value of F-stat. nobs - degrees of freedoms with simple std
     rss::Float64            # Sum of squared residuals
     tss::Float64            # Total sum of squares
-    r2::Float64             # R squared
-    adjr2::Float64          # R squared adjusted
 
     F::Float64              # F statistics
     p::Float64              # p value for the F statistics
@@ -56,13 +55,55 @@ StatsAPI.vcov(m::FixedEffectModel) = m.vcov
 StatsAPI.nobs(m::FixedEffectModel) = m.nobs
 StatsAPI.dof(m::FixedEffectModel) = m.dof
 StatsAPI.dof_residual(m::FixedEffectModel) = m.dof_residual
-StatsAPI.r2(m::FixedEffectModel) = m.r2
-StatsAPI.adjr2(m::FixedEffectModel) = m.adjr2
+StatsAPI.r2(m::FixedEffectModel) = r2(m, :devianceratio)  # default variant: :devianceratio
 StatsAPI.islinear(m::FixedEffectModel) = true
-StatsAPI.deviance(m::FixedEffectModel) = m.tss
+StatsAPI.deviance(m::FixedEffectModel) = rss(m)  # residual sum of squares
+StatsAPI.nulldeviance(m::FixedEffectModel) = m.tss  # stored total sum of squares
 StatsAPI.rss(m::FixedEffectModel) = m.rss
-StatsAPI.mss(m::FixedEffectModel) = deviance(m) - rss(m)
+StatsAPI.mss(m::FixedEffectModel) = nulldeviance(m) - rss(m)  # tss - rss
 StatsModels.formula(m::FixedEffectModel) = m.formula_schema
+dof_fes(m::FixedEffectModel) = m.dof_fes  # fixed-effect dof recorded when the model was fit
+
+function StatsAPI.loglikelihood(m::FixedEffectModel)
+    n_obs = nobs(m)  # Gaussian log-likelihood with error variance deviance(m) / n_obs
+    return -n_obs/2 * (log(2π * deviance(m) / n_obs) + 1)
+end
+
+function StatsAPI.nullloglikelihood(m::FixedEffectModel)
+    n_obs = nobs(m)  # same Gaussian form, with nulldeviance(m) in place of the deviance
+    return -n_obs/2 * (log(2π * nulldeviance(m) / n_obs) + 1)
+end
+
+# Stata's reghdfe reports the null log-likelihood after the fixed effects have
+# been dealt with, and some R fixest statistics also use the log-likelihood of
+# a regression containing only the fixed effects
+function nullloglikelihood_within(m::FixedEffectModel)
+    n_obs = nobs(m)  # Gaussian log-likelihood evaluated at the within total sum of squares
+    within_tss = deviance(m) / (1 - m.r2_within)  # inverts r2_within = 1 - rss / tss_partial
+    return -n_obs/2 * (log(2π * within_tss / n_obs) + 1)
+end
+
+function StatsAPI.adjr2(model::FixedEffectModel, variant::Symbol=:devianceratio)
+    # dof(model) = number of estimated parameters minus has_intercept
+    # dof_fes(model) = total degrees of freedom for all fixed effects, including the intercept
+    has_int = hasintercept(formula(model))
+    k = dof(model) + dof_fes(model) + has_int  # parameter count used as the penalty
+    if variant == :McFadden
+        # there seems to be some inconsistency as to whether the intercept is included in the dof
+        # these values match R fixest
+        k = k - has_int - has_fe(model)  # removes has_int and has_fe(model) from the count
+        ll = loglikelihood(model)
+        ll0 = nullloglikelihood(model)
+        1 - (ll - k)/ll0  # log-likelihood penalized by k, relative to the null model
+    elseif variant == :devianceratio
+        n = nobs(model)
+        dev  = deviance(model)
+        dev0 = nulldeviance(model)
+        1 - (dev*(n - (has_int | has_fe(model)))) / (dev0 * max(n - k, 1))  # max() guards n <= k
+    else
+        throw(ArgumentError("variant must be one of :McFadden or :devianceratio"))
+    end
+end
 
 function StatsAPI.confint(m::FixedEffectModel; level::Real = 0.95)
     scale = tdistinvcdf(StatsAPI.dof_residual(m), 1 - (1 - level) / 2)
diff --git a/src/FixedEffectModels.jl b/src/FixedEffectModels.jl
@@ -25,7 +25,9 @@ include("fit.jl")
 include("partial_out.jl")
 
 # Export from StatsBase
-export coef, coefnames, coeftable, responsename, vcov, stderror, nobs, dof, dof_residual, r2, r², adjr2, adjr², islinear, deviance, rss, mss, confint, predict, residuals, fit
+export coef, coefnames, coeftable, responsename, vcov, stderror, nobs, dof, dof_residual, r2,  r², adjr2, adjr², islinear, deviance, nulldeviance, rss, mss, confint, predict, residuals, fit,
+    loglikelihood, nullloglikelihood, dof_fes
+
 
 export reg,
 partial_out,
diff --git a/src/fit.jl b/src/fit.jl
@@ -18,7 +18,6 @@ Estimate a linear model with high dimensional categorical variables / instrument
 * `drop_singletons::Bool = true`: Should singletons be dropped?
 * `progress_bar::Bool = true`: Should the regression show a progressbar?
 * `first_stage::Bool = true`: Should the first-stage F-stat and p-value be computed?
-* `dof_add::Integer = 0`: 
 * `subset::Union{Nothing, AbstractVector} = nothing`: select specific rows. 
 
 
@@ -429,6 +428,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
     ##
     ##############################################################################
     # Compute degrees of freedom
+    dof_fes_total = 0
     dof_fes = 0
     if has_fes
         for fe in fes
@@ -439,6 +439,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
                 #only count groups that exists
                 dof_fes += nunique(fe)
             end
+            dof_fes_total += nunique(fe)
         end
     end
 
@@ -448,7 +449,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
     end
 
     # Compute standard error
-    vcov_data = Vcov.VcovData(Xhat, crossx, residuals, nobs - size(X, 2) - dof_fes - dof_add)
+    vcov_data = Vcov.VcovData(Xhat, crossx, residuals, nobs - size(X, 2) - dof_fes)
     matrix_vcov = StatsAPI.vcov(vcov_data, vcov_method)
 
     # Compute Fstat
@@ -471,11 +472,9 @@ function StatsAPI.fit(::Type{FixedEffectModel},
         end
     end
 
-    # Compute rss, tss, r2, r2 adjusted
+    # Compute rss, tss
     rss = sum(abs2, residuals)
     mss = tss_total - rss
-    r2 = 1 - rss / tss_total
-    adjr2 = 1 - rss / tss_total * (nobs - (has_intercept | has_fe_intercept)) / max(nobs - size(X, 2) - dof_fes - dof_add, 1)
     if has_fes
         r2_within = 1 - rss / tss_partial
     end
@@ -515,6 +514,5 @@ function StatsAPI.fit(::Type{FixedEffectModel},
     if esample == Colon()
         esample = trues(N)
     end
-
-    return FixedEffectModel(coef, matrix_vcov, vcov, nclusters, esample, residuals2, augmentdf, fekeys, coef_names, response_name, formula_origin, formula_schema, contrasts, nobs, dof_, dof_tstat_, rss, tss_total, r2, adjr2, F, p, iterations, converged, r2_within, F_kp, p_kp)
+    return FixedEffectModel(coef, matrix_vcov, vcov, nclusters, esample, residuals2, augmentdf, fekeys, coef_names, response_name, formula_origin, formula_schema, contrasts, nobs, dof_, dof_fes_total, dof_tstat_, rss, tss_total, F, p, iterations, converged, r2_within, F_kp, p_kp)
 end
diff --git a/test/fit.jl b/test/fit.jl
@@ -1,5 +1,5 @@
 using CUDA, FixedEffectModels, CategoricalArrays, CSV, DataFrames, Test, LinearAlgebra
-
+using FixedEffectModels: nullloglikelihood_within
 
 
 ##############################################################################
@@ -458,6 +458,10 @@ end
 	x = reg(df, m)
 	@test r2(x) ≈ 0.0969 atol = 1e-4
 	@test adjr2(x) ≈ 0.09622618 atol = 1e-4
+	m = @formula Sales ~ Price + Pimin + fe(State)
+	x = reg(df, m, Vcov.cluster(:State))
+	@test r2(x) ≈ 0.77472 atol = 1e-4
+	@test adjr2(x) ≈ 0.766768 atol = 1e-4
 
 
 	##############################################################################
@@ -561,6 +565,37 @@ end
 	m = @formula Sales ~ (Price ~ Pimin + CPI)
 	x = reg(df, m, Vcov.cluster(:State, :Year))
 	@test x.F_kp  ≈ 2873.1405 atol = 1e-4
+
+	############################################
+	##
+	## loglikelihood and related
+	##
+	## tested with clusters since those should not
+	## affect the results
+	############################################
+
+	m = @formula(Sales ~ Price)
+	x = reg(df, m, Vcov.cluster(:State))
+	@test loglikelihood(x) ≈ -6625.8266 atol = 1e-4
+	@test nullloglikelihood(x) ≈ -6696.1387 atol = 1e-4
+	@test r2(x, :McFadden) ≈ 0.01050 atol = 1e-4 # Pseudo R2 in R fixest
+	@test adjr2(x, :McFadden) ≈ 0.01035 atol = 1e-4
+
+	m = @formula(Sales ~ Price + Pimin)
+	x = reg(df, m, Vcov.cluster(:State))
+	@test loglikelihood(x) ≈ -6598.6300 atol = 1e-4
+	@test nullloglikelihood(x) ≈ -6696.1387 atol = 1e-4
+	@test r2(x, :McFadden) ≈ 0.01456 atol = 1e-4 # Pseudo R2 in R fixest
+	@test adjr2(x, :McFadden) ≈ 0.01426 atol = 1e-4
+
+	m = @formula(Sales ~ Price + Pimin + fe(State))
+	x = reg(df, m, Vcov.cluster(:State))
+	@test loglikelihood(x) ≈ -5667.7629 atol = 1e-4
+	@test nullloglikelihood(x) ≈ -6696.1387 atol = 1e-4
+	@test nullloglikelihood_within(x) ≈ -5891.2836 atol = 1e-4
+	@test r2(x, :McFadden) ≈ 0.15358 atol = 1e-4 # Pseudo R2 in R fixest
+	@test adjr2(x, :McFadden) ≈ 0.14656 atol = 1e-4
+
 end