Skip to content

Commit 1e2c359

Browse files
ksheddenararslan
andauthored
ENH: Add hypothesis tests for CCA (#214)
--------- Co-authored-by: Alex Arslan <ararslan@comcast.net>
1 parent c3067e2 commit 1e2c359

File tree

4 files changed

+154
-12
lines changed

4 files changed

+154
-12
lines changed

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ version = "0.10.2"
88

99
[deps]
1010
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
11+
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
1112
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1213
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
1314
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

src/MultivariateStats.jl

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@ module MultivariateStats
33
using LinearAlgebra
44
using SparseArrays
55
using Statistics: middle
6-
using StatsAPI: RegressionModel
6+
using Distributions: cdf, FDist
7+
using StatsAPI: RegressionModel, HypothesisTest
78
using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform,
89
ConvergenceException, pairwise, pairwise!, CoefTable
910

1011
import Statistics: mean, var, cov, covm, cor
1112
import Base: length, size, show
12-
import StatsAPI: fit, predict, coef, weights, dof, r2
13+
import StatsAPI: fit, predict, coef, weights, dof, r2, pvalue
1314
import LinearAlgebra: eigvals, eigvecs
1415

1516
export
@@ -28,6 +29,7 @@ module MultivariateStats
2829
eigvecs, # eignenvectors of the transformation
2930
loadings, # model loadings
3031
var, # model variance
32+
pvalue, # p-values for hypothesis tests
3133

3234
# lreg
3335
llsq, # Linear Least Square regression
@@ -65,7 +67,10 @@ module MultivariateStats
6567
KernelPCA, # Type: the Kernel PCA model
6668

6769
## cca
68-
CCA, # Type: Correlation Component Analysis model
70+
CCA, # Type: Correlation Component Analysis model
71+
WilksLambdaTest, # Wilks lambda statistics and tests
72+
PillaiTraceTest, # Pillai trace statistics and tests
73+
LawleyHotellingTest, # Lawley-Hotelling statistics and tests
6974

7075
ccacov, # CCA based on covariances
7176
ccasvd, # CCA based on singular value decomposition of input data

src/cca.jl

Lines changed: 119 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,16 @@ struct CCA{T<:Real} <: RegressionModel
1010
xproj::Matrix{T} # projection matrix for X, of size (dx, p)
1111
yproj::Matrix{T} # projection matrix for Y, of size (dy, p)
1212
corrs::Vector{T} # correlations, of length p
13+
eigs::Vector{T} # eigenvalues
14+
nobs::Int64 # number of observations
1315

1416
function CCA(xm::Vector{T},
1517
ym::Vector{T},
1618
xp::Matrix{T},
1719
yp::Matrix{T},
18-
crs::Vector{T}) where T<:Real
20+
crs::Vector{T},
21+
eigs::Vector{T},
22+
nobs::Int) where T<:Real
1923

2024
dx, px = size(xp)
2125
dy, py = size(yp)
@@ -32,7 +36,7 @@ struct CCA{T<:Real} <: RegressionModel
3236
length(crs) == px ||
3337
throw(DimensionMismatch("Incorrect length of corrs."))
3438

35-
new{T}(xm, ym, xp, yp, crs)
39+
new{T}(xm, ym, xp, yp, crs, eigs, nobs)
3640
end
3741
end
3842

@@ -177,7 +181,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
177181
G = cholesky(Cyy) \ Cxy'
178182
Ex = eigen(Symmetric(Cxy * G), Symmetric(Cxx))
179183
ord = sortperm(Ex.values; rev=true)
180-
vx, Px = extract_kv(Ex, ord, p)
184+
eigs, Px = extract_kv(Ex, ord, p)
181185
Py = qnormalize!(G * Px, Cyy)
182186
else
183187
# solve Py: (Cyx * inv(Cxx) * Cxy) Py = λ Cyy Py
@@ -186,7 +190,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
186190
H = cholesky(Cxx) \ Cxy
187191
Ey = eigen(Symmetric(Cxy'H), Symmetric(Cyy))
188192
ord = sortperm(Ey.values; rev=true)
189-
vy, Py = extract_kv(Ey, ord, p)
193+
eigs, Py = extract_kv(Ey, ord, p)
190194
Px = qnormalize!(H * Py, Cxx)
191195
end
192196

@@ -196,7 +200,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
196200
crs = coldot(Px, Cxy * Py)
197201

198202
# construct CCA model
199-
CCA(xmean, ymean, Px, Py, crs)
203+
CCA(xmean, ymean, Px, Py, crs, sqrt.(eigs), -1)
200204
end
201205

202206
"""
@@ -275,7 +279,7 @@ function _ccasvd(Zx::DenseMatrix{T}, Zy::DenseMatrix{T}, xmean::Vector{T}, ymean
275279
crs = rmul!(coldot(Zx'Px, Zy'Py), one(T)/(n-1))
276280

277281
# construct CCA model
278-
CCA(xmean, ymean, Px, Py, crs)
282+
CCA(xmean, ymean, Px, Py, crs, S.S[si], n)
279283
end
280284

281285
## interface functions
@@ -336,3 +340,112 @@ function fit(::Type{CCA}, X::AbstractMatrix{T}, Y::AbstractMatrix{T};
336340

337341
return M::CCA
338342
end
343+
344+
abstract type MultivariateTest <: HypothesisTest end
345+
346+
struct WilksLambdaTest <: MultivariateTest
347+
stat::Float64
348+
fstat::Float64
349+
df1::Float64
350+
df2::Float64
351+
end
352+
353+
struct LawleyHotellingTest <: MultivariateTest
354+
stat::Float64
355+
fstat::Float64
356+
df1::Float64
357+
df2::Float64
358+
end
359+
360+
struct PillaiTraceTest <: MultivariateTest
361+
stat::Float64
362+
fstat::Float64
363+
df1::Float64
364+
df2::Float64
365+
end
366+
367+
function pvalue(ct::MultivariateTest)
368+
return ccdf(FDist(ct.df1, ct.df2), ct.fstat)
369+
end
370+
371+
function dof(ct::MultivariateTest)
372+
return (ct.df1, ct.df2)
373+
end
374+
375+
function _testprep(cca::CCA, n, k)
376+
377+
r = cca.eigs[k:end]
378+
dx = length(cca.xmean)
379+
dy = length(cca.ymean)
380+
if isnothing(n) && cca.nobs == -1
381+
throw(ArgumentError("If CCA was fit using :cov, n must be provided to tests"))
382+
end
383+
if n != -1 && cca.nobs != -1 && cca.nobs != n
384+
throw("Provided n is different from actual n")
385+
end
386+
n = n == -1 ? cca.nobs : n
387+
388+
p = dx - k + 1
389+
q = dy - k + 1
390+
n = n - k + 1
391+
392+
m = (abs(p - q) - 1) / 2
393+
N = (n - p - q - 2) / 2
394+
s = min(p, q)
395+
396+
return r, s, m, N, n, dx, dy, p, q
397+
end
398+
399+
"""
400+
WilksLambdaTest(cca; n=-1, k=1)
401+
402+
Use Wilks Lambda to test the dimension of a CCA. The null hypothesis of
403+
the test is that canonical correlations k, k+1, ... are zero. If the
404+
CCA was fit with a covariance matrix then the sample size n must be provided.
405+
"""
406+
function WilksLambdaTest(cca::CCA; n=-1, k=1)
407+
408+
# Reference: Rencher and Christensen (2012)
409+
410+
r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
411+
stat = prod(1 .- r.^2)
412+
w = n - (p + q + 3) / 2
413+
t = p*q == 2 ? 1.0 : sqrt((p^2*q^2 - 4) / (p^2 + q^2 - 5))
414+
df1 = p*q
415+
df2 = w*t - p*q/2 + 1
416+
fstat = ((1 - stat^(1/t)) / stat^(1/t)) * (df2 / df1)
417+
return WilksLambdaTest(stat, fstat, df1, df2)
418+
end
419+
420+
"""
421+
PillaiTraceTest(cca; n=-1, k=1)
422+
423+
Use Pillai's trace to test the dimension of a CCA. The null hypothesis of
424+
the test is that canonical correlations k, k+1, ... are zero. If the
425+
CCA was fit with a covariance matrix then the sample size n must be provided.
426+
"""
427+
function PillaiTraceTest(cca::CCA; n=-1, k=1)
428+
r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
429+
stat = sum(abs2, r)
430+
fstat = (2*N + s + 1)*stat / ((2*m + s + 1) * (s - stat))
431+
df1 = s*(2*m + s + 1)
432+
df2 = s*(2*N + s + 1)
433+
return PillaiTraceTest(stat, fstat, df1, df2)
434+
end
435+
436+
"""
437+
LawleyHotellingTest(cca; n=-1, k=1)
438+
439+
Use the Lawley Hotelling statistics to test the dimension of a CCA. The
440+
null hypothesis of the test is that canonical correlations k, k+1, ... are
441+
zero. If the CCA was fit with a covariance matrix then the sample size n
442+
must be provided.
443+
"""
444+
function LawleyHotellingTest(cca::CCA; n=-1, k=1)
445+
r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
446+
stat = sum(r.^2 ./ (1 .- r.^2))
447+
fstat = 2*(s*N + 1) * stat / (s^2 * (2*m + s + 1))
448+
df1 = s*(2*m + s + 1)
449+
df2 = 2*(s*N + 1)
450+
return LawleyHotellingTest(stat, fstat, df1, df2)
451+
end

test/cca.jl

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ using Statistics: mean, cov, cor
2020
Px = qr(randn(rng, dx, dx)).Q[:, 1:p]
2121
Py = qr(randn(rng, dy, dy)).Q[:, 1:p]
2222

23-
M = CCA(Float64[], Float64[], Px, Py, [0.8, 0.6, 0.4])
23+
M = CCA(Float64[], Float64[], Px, Py, [0.8, 0.6, 0.4], zeros(3), -1)
2424

2525
@test size(M)[1] == dx
2626
@test size(M)[2] == dy
@@ -40,7 +40,7 @@ using Statistics: mean, cov, cor
4040
ux = randn(rng, dx)
4141
uy = randn(rng, dy)
4242

43-
M = CCA(ux, uy, Px, Py, [0.8, 0.6, 0.4])
43+
M = CCA(ux, uy, Px, Py, [0.8, 0.6, 0.4], zeros(3), -1)
4444

4545
@test size(M)[1] == dx
4646
@test size(M)[2] == dy
@@ -145,12 +145,35 @@ using Statistics: mean, cov, cor
145145
predict(M, YY, :y)
146146
predict(MM, XX, :x)
147147
predict(MM, YY, :y)
148-
148+
149149
# type stability
150150
for func in (M->mean(M, :x), M->mean(M, :y),
151151
M->projection(M, :x),
152152
M->projection(M, :y), cor)
153153
@test eltype(func(M)) == Float64
154154
@test eltype(func(MM)) == Float32
155155
end
156+
157+
M1 = fit(CCA, X, Y; method=:svd, outdim=5)
158+
M2 = fit(CCA, X, Y; method=:cov, outdim=5)
159+
160+
# Compare hypothesis tests with Stata
161+
stats = (WilksLambdaTest=0.000384245, PillaiTraceTest=2.81275, LawleyHotellingTest=55.1432)
162+
df1 = (WilksLambdaTest=30, PillaiTraceTest=30, LawleyHotellingTest=30)
163+
df2 = (WilksLambdaTest=3958, PillaiTraceTest=4965, LawleyHotellingTest=4937)
164+
fstats = (WilksLambdaTest=810.3954, PillaiTraceTest=212.8296, LawleyHotellingTest=1814.9480)
165+
166+
for f in [WilksLambdaTest, PillaiTraceTest, LawleyHotellingTest]
167+
168+
ct1 = f(M1)
169+
ct2 = f(M2; n=size(X, 2))
170+
s = Symbol(f)
171+
172+
for ct in [ct1, ct2]
173+
@test isapprox(ct.stat, stats[s], atol=1e-5, rtol=1e-5)
174+
@test isapprox(ct.fstat, fstats[s], atol=1e-5, rtol=1e-5)
175+
@test isapprox(ct.df1, df1[s], atol=1e-5, rtol=1e-5)
176+
@test isapprox(ct.df2, df2[s], atol=1e-5, rtol=1e-5)
177+
end
178+
end
156179
end

0 commit comments

Comments
 (0)