Skip to content

Commit 8f11f6e

Browse files
Simplify logic about collinearity and fixed effects (#254)
* remove cholesky * speed up * update * rmv find_cols * patched version * Update Project.toml * Update fit.jl * remove crossprod * Update fit.jl * simplify logic about fixedeffects collinerarity * rmv getcols * Update fit.jl * Update fit.jl * correct collinearity in endogeneous
1 parent 79a372b commit 8f11f6e

File tree

4 files changed

+117
-91
lines changed

4 files changed

+117
-91
lines changed

benchmark/benchmark.jl

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,21 @@ y= 3 .* x1 .+ 5 .* x2 .+ cos.(id1) .+ cos.(id2).^2 .+ randn(N)
1212
df = DataFrame(id1 = id1, id2 = id2, x1 = x1, x2 = x2, y = y)
1313
# first time
1414
@time reg(df, @formula(y ~ x1 + x2))
15-
# 3.5s
15+
# 1.823s
1616
@time reg(df, @formula(y ~ x1 + x2))
17-
# 0.497374 seconds (450 allocations: 691.441 MiB, 33.18% gc time)
17+
# 0.353469 seconds (441 allocations: 691.439 MiB, 3.65% gc time)
1818
@time reg(df, @formula(y ~ x1 + x2), Vcov.cluster(:id2))
19-
# 1.898018 seconds (7.10 M allocations: 1.220 GiB, 8.20% gc time, 4.46% compilation time)
19+
# 0.763999 seconds (2.96 M allocations: 967.418 MiB, 2.29% gc time, 54.39% compilation time: 5% of which was recompilation)
2020
@time reg(df, @formula(y ~ x1 + x2), Vcov.cluster(:id2))
21-
# 0.605172 seconds (591 allocations: 768.939 MiB, 42.38% gc time)
21+
# 0.401544 seconds (622 allocations: 768.943 MiB, 3.52% gc time)
2222
@time reg(df, @formula(y ~ x1 + x2 + fe(id1)))
2323
# 0.893835 seconds (1.03 k allocations: 929.130 MiB, 54.19% gc time)
24+
@time reg(df, @formula(y ~ x1 + x2 + fe(id1)))
25+
# 0.474160 seconds (1.13 k allocations: 933.340 MiB, 1.74% gc time)
2426
@time reg(df, @formula(y ~ x1 + x2 + fe(id1)), Vcov.cluster(:id1))
25-
# 1.015078 seconds (1.18 k allocations: 1008.532 MiB, 56.50% gc time)
27+
# 0.598816 seconds (261.08 k allocations: 1.007 GiB, 8.29% gc time, 9.21% compilation time)
2628
@time reg(df, @formula(y ~ x1 + x2 + fe(id1) + fe(id2)))
27-
# 1.835464 seconds (4.02 k allocations: 1.057 GiB, 35.59% gc time)
29+
# 1.584573 seconds (489.64 k allocations: 1.094 GiB, 2.10% gc time, 8.53% compilation time)
2830

2931
# More complicated setup
3032
N = 800000 # number of observations
@@ -38,6 +40,10 @@ y= 3 .* x1 .+ 5 .* x2 .+ cos.(id1) .+ cos.(id2).^2 .+ randn(N)
3840
df = DataFrame(id1 = id1, id2 = id2, x1 = x1, x2 = x2, y = y)
3941
@time reg(df, @formula(y ~ x1 + x2 + fe(id1) + fe(id2)))
4042
# 2.504294 seconds (75.83 k allocations: 95.525 MiB, 0.23% gc time)
43+
# for some reason in 1.10 I now get worse time (iter 200)
44+
# 4.709078 seconds (108.98 k allocations: 101.417 MiB)
45+
46+
4147

4248

4349
+# fixest

src/fit.jl

Lines changed: 86 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
215215
end
216216

217217
# collect all variable names (outcome, exo [, endo, iv])
218-
var_names_all = [response_name; coef_names]
218+
var_names_all = vcat(response_name, coef_names)
219219

220220
if has_iv
221221
subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in endo_vars)...))
@@ -281,7 +281,9 @@ function StatsAPI.fit(::Type{FixedEffectModel},
281281
if i == 1
282282
@info "Dependent variable $(var_names_all[1]) is probably perfectly explained by fixed effects (tol = $collinear_tol)."
283283
else
284-
@info "RHS-variable $(var_names_all[i]) is probably collinear with the fixed effects (tol = $collinear_tol)."
284+
@info "RHS-variable $(var_names_all[i]) is collinear with the fixed effects (tol = $collinear_tol)."
285+
# set to zero so that removed when taking basis
286+
cols[i] .= 0.0
285287
end
286288
end
287289

@@ -305,86 +307,114 @@ function StatsAPI.fit(::Type{FixedEffectModel},
305307
end
306308
end
307309

308-
309-
310-
311310
##############################################################################
312311
##
313312
## Get Linearly Independent Components of Matrix
314313
##
315314
##############################################################################
316315
# Compute linearly independent columns + create the Xhat matrix
317316
if has_iv
318-
n_exo = size(Xexo, 2)
319-
n_endo = size(Xendo, 2)
320-
n_z = size(Z, 2)
321-
perm = 1:(n_exo + n_endo)
322-
323-
# first pass: remove colinear variables in Xendo
324-
notcollinear_fe_endo = collinear_fe[(n_exo+2):(n_exo+n_endo+1)] .== false
325-
basis_endo = basis(Xendo; has_intercept = false) .* notcollinear_fe_endo
326-
Xendo = getcols(Xendo, basis_endo)
327-
328-
# second pass: remove colinear variable in Xexo, Z, and Xendo
329-
notcollinear_fe_exo = collinear_fe[2:(n_exo+1)] .== false
330-
notcollinear_fe_z = collinear_fe[(n_exo+n_endo+2):(n_exo+n_endo+n_z+1)] .== false
331-
notcollinear_fe_endo_small = notcollinear_fe_endo[basis_endo]
332-
333-
334-
basis_all = basis(Xexo, Z, Xendo; has_intercept = has_intercept)
335-
basis_Xexo = basis_all[1:size(Xexo, 2)] .* notcollinear_fe_exo
336-
basis_Z = basis_all[(size(Xexo, 2) +1):(size(Xexo, 2) + size(Z, 2))] .* notcollinear_fe_z
337-
basis_endo_small = basis_all[(size(Xexo, 2) + size(Z, 2) + 1):end] .* notcollinear_fe_endo_small
317+
perm = 1:(size(Xexo, 2) +size(Xendo, 2))
318+
319+
# first pass: remove collinear variables in Xendo
320+
XendoXendo = Xendo' * Xendo
321+
basis_endo = basis!(Symmetric(deepcopy(XendoXendo)); has_intercept = false)
322+
if !all(basis_endo)
323+
Xendo = Xendo[:, basis_endo]
324+
XendoXendo = XendoXendo[basis_endo, basis_endo]
325+
end
338326

327+
# second pass: remove collinear variable in Xexo, Z, and Xendo
328+
XexoXexo = Xexo'Xexo
329+
XexoZ = Xexo'Z
330+
XexoXendo = Xexo'Xendo
331+
ZZ = Z'Z
332+
ZXendo = Z'Xendo
333+
XexoZXendo = Symmetric(hvcat(3, XexoXexo, XexoZ, XexoXendo,
334+
zeros(size(Z, 2), size(Xexo, 2)), ZZ, ZXendo,
335+
zeros(size(Xendo, 2), size(Xexo, 2)), zeros(size(Xendo, 2), size(Z, 2)), XendoXendo))
336+
basis_all = basis!(XexoZXendo; has_intercept = has_intercept)
337+
basis_Xexo, basis_Z, basis_endo_small = basis_all[1:size(Xexo, 2)], basis_all[(size(Xexo, 2) +1):(size(Xexo, 2) + size(Z, 2))], basis_all[(size(Xexo, 2) + size(Z, 2) + 1):end]
338+
# basis_endo_small has same length as number of basis_endo who are true
339339
if !all(basis_endo_small)
340340
# if adding Xexo and Z makes Xendo collinear, consider these variables are exogeneous
341-
Xexo = hcat(Xexo, getcols(Xendo, .!basis_endo_small))
342-
Xendo = getcols(Xendo, basis_endo_small)
341+
Xexo = hcat(Xexo, Xendo[:, .!basis_endo_small])
342+
Xendo = Xendo[:, basis_endo_small]
343+
XexoXexo = Xexo'Xexo
344+
XexoZ = Xexo'Z
345+
XexoXendo = Xexo'Xendo
346+
ZXendo = Z'Xendo
347+
XendoXendo = Xendo'Xendo
343348

344349
# out returns false for endo collinear with instruments
345350
basis_endo2 = trues(length(basis_endo))
346351
basis_endo2[basis_endo] = basis_endo_small
347-
348-
# Change coef_names and oldX
349352
# TODO: I should probably also change formula in this case so that predict still works
350353
ans = 1:length(basis_endo)
351354
ans = vcat(ans[.!basis_endo2], ans[basis_endo2])
352355
perm = vcat(1:length(basis_Xexo), length(basis_Xexo) .+ ans)
353-
356+
# there are basis_endo - basis_endo_small in endo
354357
out = join(coefendo_names[.!basis_endo2], " ")
355358
@info "Endogenous vars collinear with ivs. Recategorized as exogenous: $(out)"
356-
359+
357360
# third pass
358-
basis_all = basis(Xexo, Z, Xendo; has_intercept = has_intercept)
359-
basis_Xexo = basis_all[1:size(Xexo, 2)]
360-
basis_Z = basis_all[(size(Xexo, 2) +1):(size(Xexo, 2) + size(Z, 2))]
361+
XexoZXendo = Symmetric(hvcat(3, XexoXexo, XexoZ, XexoXendo,
362+
zeros(size(Z, 2), size(Xexo, 2)), ZZ, ZXendo,
363+
zeros(size(Xendo, 2), size(Xexo, 2)), zeros(size(Xendo, 2), size(Z, 2)), XendoXendo))
364+
basis_all = basis!(XexoZXendo; has_intercept = has_intercept)
365+
basis_Xexo, basis_Z, basis_endo_small2 = basis_all[1:size(Xexo, 2)], basis_all[(size(Xexo, 2) +1):(size(Xexo, 2) + size(Z, 2))], basis_all[(size(Xexo, 2) + size(Z, 2) + 1):end]
361366
end
362-
363-
Xexo = getcols(Xexo, basis_Xexo)
364-
Z = getcols(Z, basis_Z)
365-
size(Z, 2) >= size(Xendo, 2) || throw("Model not identified. There must be at least as many ivs as endogeneous variables")
366-
basis_coef = vcat(basis_Xexo, basis_endo[basis_endo_small])
367+
if !all(basis_Xexo)
368+
Xexo = Xexo[:, basis_Xexo]
369+
XexoXexo = XexoXexo[basis_Xexo, basis_Xexo]
370+
XexoXendo = XexoXendo[basis_Xexo, :]
371+
end
372+
if !all(basis_Z)
373+
Z = Z[:, basis_Z]
374+
ZZ = ZZ[basis_Z, basis_Z]
375+
ZXendo = ZXendo[basis_Z, :]
376+
end
377+
XexoZ = XexoZ[basis_Xexo, basis_Z]
378+
size(ZXendo, 1) >= size(ZXendo, 2) || throw("Model not identified. There must be at least as many ivs as endogeneous variables")
379+
# basis_endo is true for stuff non colinear
380+
# I need to have same vector but removeing the true that have been reclassified as exo and replace them by nothing. so i need to create a vector equal to false if non endo and non basis_endo_small, which is basis_endo2
381+
basis_endo2 = trues(length(basis_endo))
382+
basis_endo2[basis_endo] = basis_endo_small
383+
basis_coef = vcat(basis_Xexo, basis_endo[basis_endo2])
367384

368385
# Build
369386
newZ = hcat(Xexo, Z)
370-
Pi = ls_solve(newZ, Xendo)
371-
Xhat = hcat(Xexo, newZ * Pi)
387+
# now create Pi = newZ \ Xendo
388+
newZnewZ = hvcat(2, XexoXexo, XexoZ,
389+
XexoZ', ZZ)
390+
newZXendo = vcat(XexoXendo, ZXendo)
391+
Pi = ls_solve!(Symmetric(hvcat(2, newZnewZ, newZXendo,
392+
zeros(size(newZXendo')), zeros(size(Xendo, 2), size(Xendo, 2)))),
393+
size(newZnewZ, 2))
394+
newnewZ = newZ * Pi
395+
Xhat = hcat(Xexo, newnewZ)
396+
XhatXhat = Symmetric(hvcat(2, XexoXexo, Xexo'newnewZ,
397+
zeros(size(newnewZ, 2), size(Xexo, 2)), newnewZ'newnewZ))
372398
X = hcat(Xexo, Xendo)
373-
374399
# prepare residuals used for first stage F statistic
375400
## partial out Xendo in place wrt (Xexo, Z)
376401
Xendo_res = BLAS.gemm!('N', 'N', -1.0, newZ, Pi, 1.0, Xendo)
377402
## partial out Z in place wrt Xexo
378-
Pi2 = ls_solve(Xexo, Z)
403+
# Now create Pi2 = Xexo \ Z
404+
Pi2 = ls_solve!(Symmetric(hvcat(2, XexoXexo, XexoZ,
405+
zeros(size(Z, 2), size(Xexo, 2)), ZZ)), size(Xexo, 2))
379406
Z_res = BLAS.gemm!('N', 'N', -1.0, Xexo, Pi2, 1.0, Z)
380407
else
381408
# get linearly independent columns
382-
n_exo = size(Xexo, 2)
383-
perm = 1:n_exo
384-
notcollinear_fe_exo = collinear_fe[2:(n_exo+1)] .== false
385-
basis_Xexo = basis(Xexo; has_intercept = has_intercept) .* notcollinear_fe_exo
386-
Xexo = getcols(Xexo, basis_Xexo)
409+
perm = 1:size(Xexo, 2)
410+
XexoXexo = Xexo'Xexo
411+
basis_Xexo = basis!(Symmetric(deepcopy(XexoXexo)); has_intercept = has_intercept)
412+
if !all(basis_Xexo)
413+
Xexo = Xexo[:, basis_Xexo]
414+
XexoXexo = XexoXexo[basis_Xexo, basis_Xexo]
415+
end
387416
Xhat = Xexo
417+
XhatXhat = Symmetric(XexoXexo)
388418
X = Xexo
389419
basis_coef = basis_Xexo
390420
end
@@ -394,11 +424,10 @@ function StatsAPI.fit(::Type{FixedEffectModel},
394424
## Do the regression
395425
##
396426
##############################################################################
397-
398-
crossx = Xhat' * Xhat
399-
Xy = Symmetric(hvcat(2, crossx, Xhat'y, zeros(size(Xhat, 2))', [0.0]))
427+
Xy = Symmetric(hvcat(2, XhatXhat, Xhat'y,
428+
zeros(size(Xhat, 2))', [0.0]))
400429
invsym!(Xy; diagonal = 1:size(Xhat, 2))
401-
invcrossx = Symmetric(.- Xy[1:(end-1),1:(end-1)])
430+
invXhatXhat = Symmetric(.- Xy[1:(end-1),1:(end-1)])
402431
coef = Xy[1:(end-1),end]
403432

404433
##############################################################################
@@ -419,7 +448,10 @@ function StatsAPI.fit(::Type{FixedEffectModel},
419448

420449
augmentdf = DataFrame()
421450
if save_fe
422-
oldX = getcols(oldX[:, perm], basis_coef)
451+
oldX = oldX[:, perm]
452+
if !all(basis_coef)
453+
oldX = oldX[:, basis_coef]
454+
end
423455
newfes, b, c = solve_coefficients!(oldy - oldX * coef, feM; tol = tol, maxiter = maxiter)
424456
for fekey in fekeys
425457
augmentdf[!, fekey] = df[:, fekey]
@@ -457,7 +489,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
457489
end
458490

459491
# Compute standard error
460-
vcov_data = Vcov.VcovData(Xhat, crossx, invcrossx, residuals, nobs - size(X, 2) - dof_fes)
492+
vcov_data = Vcov.VcovData(Xhat, XhatXhat, invXhatXhat, residuals, nobs - size(X, 2) - dof_fes)
461493
matrix_vcov = StatsAPI.vcov(vcov_data, vcov_method)
462494
# Compute Fstat
463495
F = Fstat(coef, matrix_vcov, has_intercept)

src/utils/basecol.jl

Lines changed: 15 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,3 @@
1-
# create the matrix X'X
2-
crossprod(x1) = Symmetric(x1'x1)
3-
4-
function crossprod(x1, x2)
5-
Symmetric(hvcat(2, x1'x1, x1'x2,
6-
zeros(size(x2, 2), size(x1, 2)), x2'x2))
7-
end
8-
9-
function crossprod(x1, x2, x3)
10-
Symmetric(hvcat(3, x1'x1, x1'x2, x1'x3,
11-
zeros(size(x2, 2), size(x1, 2)), x2'x2, x2'x3,
12-
zeros(size(x3, 2), size(x1, 2)), zeros(size(x3, 2), size(x2, 2)), x3'x3))
13-
end
14-
151
# generalized 2inverse
162
#actually return minus the symmetric
173
function invsym!(X::Symmetric; has_intercept = false, setzeros = false, diagonal = 1:size(X, 2))
@@ -38,25 +24,23 @@ function invsym!(X::Symmetric; has_intercept = false, setzeros = false, diagonal
3824
return X
3925
end
4026

41-
## Returns base of [A B C ...]
42-
## Important: it must be the case that it returns the in order, that is [A B A] returns [true true false] not [false true true]
43-
function basis(xs...; has_intercept = false)
44-
invXX = invsym!(crossprod(xs...); has_intercept = has_intercept, setzeros = true)
27+
## Returns base of X = [A B C ...]. Takes as input the matrix X'X (actuallyjust its right upper-triangular)
28+
## Important: it must be the case that colinear are first columbs in the bsae in the order of columns
29+
## that is [A B A] returns [true true false] not [false true true]
30+
function basis!(XX::Symmetric; has_intercept = false)
31+
invXX = invsym!(XX; has_intercept = has_intercept, setzeros = true)
4532
return diag(invXX) .< 0
4633
end
4734

48-
function getcols(X::AbstractMatrix, basecolX::AbstractVector)
49-
sum(basecolX) == size(X, 2) ? X : X[:, basecolX]
50-
end
51-
52-
function ls_solve(X, y::AbstractVector)
53-
Xy = crossprod(X, y)
54-
invsym!(Xy, diagonal = 1:size(X, 2))
55-
return Xy[1:size(X, 2),end]
56-
end
5735

58-
function ls_solve(X, Y::AbstractMatrix)
59-
XY = crossprod(X, Y)
60-
invsym!(XY, diagonal = 1:size(X, 2))
61-
return XY[1:size(X, 2),(end-size(Y, 2)+1):end]
36+
#solve X \ y. Take as input the matrix [X'X, X'y
37+
# y'X, y'y]
38+
# (but only upper matters)
39+
function ls_solve!(Xy::Symmetric, nx)
40+
if nx > 0
41+
invsym!(Xy, diagonal = 1:nx)
42+
return Xy[1:nx, (nx+1):end]
43+
else
44+
return zeros(Float64, 0, size(Xy, 2) - nx)
45+
end
6246
end

test/fit.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,10 @@ using CUDA, Metal
250250
x = reg(df, m)
251251
@test iszero(coef(x)[2]) || iszero(coef(x)[3])
252252

253+
m = @formula Sales ~ (Price + Price2 ~ Pimin + NDI)
254+
x = reg(df, m)
255+
@test iszero(coef(x)[2]) || iszero(coef(x)[3])
256+
253257

254258
## endogeneous variables collinear with instruments are reclassified
255259
df.zPimin = df.Pimin

0 commit comments

Comments
 (0)