ENH: Add hypothesis tests for CCA (#214)

kshedden · ararslan · web-flow · commit 1e2c3591069c · 2024-06-05T10:57:20.000-05:00
---------

Co-authored-by: Alex Arslan &lt;ararslan@comcast.net&gt;
diff --git a/Project.toml b/Project.toml
@@ -8,6 +8,7 @@ version = "0.10.2"
 
 [deps]
 Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl
@@ -3,13 +3,14 @@ module MultivariateStats
     using LinearAlgebra
     using SparseArrays
     using Statistics: middle
-    using StatsAPI: RegressionModel
+    using Distributions: cdf, FDist
+    using StatsAPI: RegressionModel, HypothesisTest
     using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform,
                      ConvergenceException, pairwise, pairwise!, CoefTable
 
     import Statistics: mean, var, cov, covm, cor
     import Base: length, size, show
-    import StatsAPI: fit, predict, coef, weights, dof, r2
+    import StatsAPI: fit, predict, coef, weights, dof, r2, pvalue
     import LinearAlgebra: eigvals, eigvecs
 
     export
@@ -28,6 +29,7 @@ module MultivariateStats
     eigvecs,            # eignenvectors of the transformation
     loadings,           # model loadings
     var,                # model variance
+    pvalue,             # p-values for hypothesis tests
 
     # lreg
     llsq,               # Linear Least Square regression
@@ -65,7 +67,10 @@ module MultivariateStats
     KernelPCA,          # Type: the Kernel PCA model
 
     ## cca
-    CCA,                # Type: Correlation Component Analysis model
+    CCA,                 # Type: Correlation Component Analysis model
+    WilksLambdaTest,     # Wilks lambda statistics and tests
+    PillaiTraceTest,     # Pillai trace statistics and tests
+    LawleyHotellingTest, # Lawley-Hotelling statistics and tests
 
     ccacov,             # CCA based on covariances
     ccasvd,             # CCA based on singular value decomposition of input data
diff --git a/src/cca.jl b/src/cca.jl
@@ -10,12 +10,16 @@ struct CCA{T<:Real} <: RegressionModel
     xproj::Matrix{T}  # projection matrix for X, of size (dx, p)
     yproj::Matrix{T}  # projection matrix for Y, of size (dy, p)
     corrs::Vector{T}  # correlations, of length p
+    eigs::Vector{T}   # eigenvalues
+    nobs::Int64       # number of observations
 
     function CCA(xm::Vector{T},
                  ym::Vector{T},
                  xp::Matrix{T},
                  yp::Matrix{T},
-                 crs::Vector{T}) where T<:Real
+                 crs::Vector{T},
+                 eigs::Vector{T},
+                 nobs::Int) where T<:Real
 
         dx, px = size(xp)
         dy, py = size(yp)
@@ -32,7 +36,7 @@ struct CCA{T<:Real} <: RegressionModel
         length(crs) == px ||
             throw(DimensionMismatch("Incorrect length of corrs."))
 
-        new{T}(xm, ym, xp, yp, crs)
+        new{T}(xm, ym, xp, yp, crs, eigs, nobs)
     end
 end
 
@@ -177,7 +181,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
         G = cholesky(Cyy) \ Cxy'
         Ex = eigen(Symmetric(Cxy * G), Symmetric(Cxx))
         ord = sortperm(Ex.values; rev=true)
-        vx, Px = extract_kv(Ex, ord, p)
+        eigs, Px = extract_kv(Ex, ord, p)
         Py = qnormalize!(G * Px, Cyy)
     else
         # solve Py: (Cyx * inv(Cxx) * Cxy) Py = λ Cyy Py
@@ -186,7 +190,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
         H = cholesky(Cxx) \ Cxy
         Ey = eigen(Symmetric(Cxy'H), Symmetric(Cyy))
         ord = sortperm(Ey.values; rev=true)
-        vy, Py = extract_kv(Ey, ord, p)
+        eigs, Py = extract_kv(Ey, ord, p)
         Px = qnormalize!(H * Py, Cxx)
     end
 
@@ -196,7 +200,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
     crs = coldot(Px, Cxy * Py)
 
     # construct CCA model
-    CCA(xmean, ymean, Px, Py, crs)
+    CCA(xmean, ymean, Px, Py, crs, sqrt.(eigs), -1)
 end
 
 """
@@ -275,7 +279,7 @@ function _ccasvd(Zx::DenseMatrix{T}, Zy::DenseMatrix{T}, xmean::Vector{T}, ymean
     crs = rmul!(coldot(Zx'Px, Zy'Py), one(T)/(n-1))
 
     # construct CCA model
-    CCA(xmean, ymean, Px, Py, crs)
+    CCA(xmean, ymean, Px, Py, crs, S.S[si], n)
 end
 
 ## interface functions
@@ -336,3 +340,112 @@ function fit(::Type{CCA}, X::AbstractMatrix{T}, Y::AbstractMatrix{T};
 
     return M::CCA
 end
+
+abstract type MultivariateTest <: HypothesisTest end
+
+struct WilksLambdaTest <: MultivariateTest
+    stat::Float64
+    fstat::Float64
+    df1::Float64
+    df2::Float64
+end
+
+struct LawleyHotellingTest <: MultivariateTest
+    stat::Float64
+    fstat::Float64
+    df1::Float64
+    df2::Float64
+end
+
+struct PillaiTraceTest <: MultivariateTest
+    stat::Float64
+    fstat::Float64
+    df1::Float64
+    df2::Float64
+end
+
+function pvalue(ct::MultivariateTest)
+    return ccdf(FDist(ct.df1, ct.df2), ct.fstat)
+end
+
+function dof(ct::MultivariateTest)
+    return (ct.df1, ct.df2)
+end
+
+function _testprep(cca::CCA, n, k)
+
+    r = cca.eigs[k:end]
+    dx = length(cca.xmean)
+    dy = length(cca.ymean)
+    if isnothing(n) && cca.nobs == -1
+        throw(ArgumentError("If CCA was fit using :cov, n must be provided to tests"))
+    end
+    if n != -1 && cca.nobs != -1 && cca.nobs != n
+        throw("Provided n is different from actual n")
+    end
+    n = n == -1 ? cca.nobs : n
+
+    p = dx - k + 1
+    q = dy - k + 1
+    n = n - k + 1
+
+    m = (abs(p - q) - 1) / 2
+    N = (n - p - q - 2) / 2
+    s = min(p, q)
+
+    return r, s, m, N, n, dx, dy, p, q
+end
+
+"""
+    WilksLambdaTest(cca; n=-1, k=1)
+
+Use Wilks Lambda to test the dimension of a CCA.  The null hypothesis of
+the test is that canonical correlations k, k+1, ... are zero.  If the
+CCA was fit with a covariance matrix then the sample size n must be provided.
+"""
+function WilksLambdaTest(cca::CCA; n=-1, k=1)
+
+    # Reference: Rencher and Christensen (2012)
+
+    r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
+    stat = prod(1 .- r.^2)
+    w = n - (p + q + 3) / 2
+    t = p*q == 2 ? 1.0 : sqrt((p^2*q^2 - 4) / (p^2 + q^2 - 5))
+    df1 = p*q
+    df2 = w*t - p*q/2 + 1
+    fstat = ((1 - stat^(1/t)) / stat^(1/t)) * (df2 / df1)
+    return WilksLambdaTest(stat, fstat, df1, df2)
+end
+
+"""
+    PillaiTraceTest(cca; n=-1, k=1)
+
+Use Pillai's trace to test the dimension of a CCA.  The null hypothesis of
+the test is that canonical correlations k, k+1, ... are zero.  If the
+CCA was fit with a covariance matrix then the sample size n must be provided.
+"""
+function PillaiTraceTest(cca::CCA; n=-1, k=1)
+    r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
+    stat = sum(abs2, r)
+    fstat = (2*N + s + 1)*stat / ((2*m + s + 1) * (s - stat))
+    df1 = s*(2*m + s + 1)
+    df2 = s*(2*N + s + 1)
+    return PillaiTraceTest(stat, fstat, df1, df2)
+end
+
+"""
+    LawleyHotellingTest(cca; n=-1, k=1)
+
+Use the Lawley Hotelling statistics to test the dimension of a CCA.  The
+null hypothesis of the test is that canonical correlations k, k+1, ... are
+zero.  If the CCA was fit with a covariance matrix then the sample size n
+must be provided.
+"""
+function LawleyHotellingTest(cca::CCA; n=-1, k=1)
+    r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
+    stat = sum(r.^2 ./ (1 .- r.^2))
+    fstat = 2*(s*N + 1) * stat / (s^2 * (2*m + s + 1))
+    df1 = s*(2*m + s + 1)
+    df2 = 2*(s*N + 1)
+    return LawleyHotellingTest(stat, fstat, df1, df2)
+end
diff --git a/test/cca.jl b/test/cca.jl
@@ -20,7 +20,7 @@ using Statistics: mean, cov, cor
     Px = qr(randn(rng, dx, dx)).Q[:, 1:p]
     Py = qr(randn(rng, dy, dy)).Q[:, 1:p]
 
-    M = CCA(Float64[], Float64[], Px, Py, [0.8, 0.6, 0.4])
+    M = CCA(Float64[], Float64[], Px, Py, [0.8, 0.6, 0.4], zeros(3), -1)
 
     @test size(M)[1] == dx
     @test size(M)[2] == dy
@@ -40,7 +40,7 @@ using Statistics: mean, cov, cor
     ux = randn(rng, dx)
     uy = randn(rng, dy)
 
-    M = CCA(ux, uy, Px, Py, [0.8, 0.6, 0.4])
+    M = CCA(ux, uy, Px, Py, [0.8, 0.6, 0.4], zeros(3), -1)
 
     @test size(M)[1] == dx
     @test size(M)[2] == dy
@@ -145,12 +145,35 @@ using Statistics: mean, cov, cor
     predict(M, YY, :y)
     predict(MM, XX, :x)
     predict(MM, YY, :y)
-    
+
     # type stability
     for func in (M->mean(M, :x), M->mean(M, :y),
                  M->projection(M, :x),
                  M->projection(M, :y), cor)
         @test eltype(func(M)) == Float64
         @test eltype(func(MM)) == Float32
     end
+
+    M1 = fit(CCA, X, Y; method=:svd, outdim=5)
+    M2 = fit(CCA, X, Y; method=:cov, outdim=5)
+
+    # Compare hypothesis tests with Stata
+    stats = (WilksLambdaTest=0.000384245, PillaiTraceTest=2.81275, LawleyHotellingTest=55.1432)
+    df1 = (WilksLambdaTest=30, PillaiTraceTest=30, LawleyHotellingTest=30)
+    df2 = (WilksLambdaTest=3958, PillaiTraceTest=4965, LawleyHotellingTest=4937)
+    fstats = (WilksLambdaTest=810.3954, PillaiTraceTest=212.8296, LawleyHotellingTest=1814.9480)
+
+    for f in [WilksLambdaTest, PillaiTraceTest, LawleyHotellingTest]
+
+        ct1 = f(M1)
+        ct2 = f(M2; n=size(X, 2))
+        s = Symbol(f)
+
+        for ct in [ct1, ct2]
+            @test isapprox(ct.stat, stats[s], atol=1e-5, rtol=1e-5)
+            @test isapprox(ct.fstat, fstats[s], atol=1e-5, rtol=1e-5)
+            @test isapprox(ct.df1, df1[s], atol=1e-5, rtol=1e-5)
+            @test isapprox(ct.df2, df2[s], atol=1e-5, rtol=1e-5)
+        end
+    end
 end