diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2841539..51be777 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,12 +1,21 @@
 cmake_minimum_required(VERSION 3.18)
 
+# Set CUDA flags early for the compiler ID test.
+# Check if the cuda backend is requested before the project() call.
+if(DEFINED QOCO_ALGEBRA_BACKEND AND QOCO_ALGEBRA_BACKEND STREQUAL "cuda")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler")
+  set(CMAKE_CUDA_FLAGS_INIT "${CMAKE_CUDA_FLAGS_INIT} -allow-unsupported-compiler")
+  # Also set via the cache to ensure it is picked up during compiler detection.
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler" CACHE STRING "Flags for CUDA compiler")
+endif()
+
 set(QOCO_VERSION_MAJOR "0")
 set(QOCO_VERSION_MINOR "1")
 set(QOCO_VERSION_PATCH "6")
 set(QOCO_VERSION ${QOCO_VERSION_MAJOR}.${QOCO_VERSION_MINOR}.${QOCO_VERSION_PATCH})
 
 # Project name
-project(qoco VERSION ${QOCO_VERSION})
+project(qoco VERSION ${QOCO_VERSION} LANGUAGES C CXX)
 
 message(STATUS "Building QOCO v${QOCO_VERSION}")
 
 # Detect operating system.
@@ -112,15 +121,21 @@ endif()
 
 set(QOCO_ALGEBRA_BACKEND "builtin" CACHE STRING "The Algebra to use")
+
+# CUDA flags are set early (before project()) if needed.
 add_subdirectory(algebra)
 get_property(qoco_sources GLOBAL PROPERTY QOCO_SOURCES)
 get_property(qoco_headers GLOBAL PROPERTY QOCO_HEADERS)
 get_property(qoco_include GLOBAL PROPERTY QOCO_INCLUDE)
 
+# Get CUDA libraries if using the CUDA backend.
+get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS)
+
 # Build qoco shared library.
 add_library(qoco SHARED)
 target_link_libraries(qoco qdldlobject amd)
+target_link_libraries(qoco ${qoco_cuda_libs})
 
 if(IS_LINUX OR IS_MACOS)
   target_link_libraries(qoco m)
@@ -129,9 +144,25 @@ endif()
 target_include_directories(qoco PUBLIC ${qoco_include})
 target_sources(qoco PRIVATE ${qoco_sources})
 
+# Enable CUDA if using the CUDA backend.
+if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda")
+  enable_language(CUDA)
+  set_target_properties(qoco PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+  )
+  # Add the HAVE_CUDSS definition if the cuDSS library was found.
+  get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS)
+  if("${qoco_cuda_libs}" MATCHES "cudss")
+    target_compile_definitions(qoco PRIVATE HAVE_CUDSS)
+    message(STATUS "Added HAVE_CUDSS to qoco target")
+  endif()
+endif()
+
 # Build qoco static library.
 add_library(qocostatic STATIC)
 target_link_libraries(qocostatic qdldlobject amd)
+target_link_libraries(qocostatic ${qoco_cuda_libs})
 
 if(IS_LINUX OR IS_MACOS)
   target_link_libraries(qocostatic m)
@@ -140,6 +171,22 @@ endif()
 target_include_directories(qocostatic PUBLIC ${qoco_include})
 target_sources(qocostatic PRIVATE ${qoco_sources})
 
+# Enable CUDA if using the CUDA backend.
+if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda")
+  set_target_properties(qocostatic PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+  )
+  # Add the HAVE_CUDSS definition if the cuDSS library was found.
+  get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS)
+  if("${qoco_cuda_libs}" MATCHES "cudss")
+    target_compile_definitions(qocostatic PRIVATE HAVE_CUDSS)
+  endif()
+endif()
+
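+# Example (illustrative): selecting the CUDA backend at configure time. The
+# backend must be passed on the first configure so the early flag block at
+# the top of this file runs before project():
+#
+#   cmake -B build -DQOCO_ALGEBRA_BACKEND=cuda
+#   cmake --build build
+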
 # Build qoco demo.
 if(BUILD_QOCO_DEMO)
   add_executable(qoco_demo ${PROJECT_SOURCE_DIR}/examples/qoco_demo.c)
diff --git a/algebra/CMakeLists.txt b/algebra/CMakeLists.txt
index 5d5560d..9d2b2fd 100644
--- a/algebra/CMakeLists.txt
+++ b/algebra/CMakeLists.txt
@@ -3,4 +3,13 @@ if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${QOCO_ALGEBRA_BACKEND})
 endif()
 
 set_property(GLOBAL APPEND PROPERTY QOCO_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/${QOCO_ALGEBRA_BACKEND})
+set_property(GLOBAL APPEND PROPERTY QOCO_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Set backend define
+if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda")
+  add_compile_definitions(QOCO_ALGEBRA_BACKEND_CUDA)
+else()
+  add_compile_definitions(QOCO_ALGEBRA_BACKEND_BUILTIN)
+endif()
+
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${QOCO_ALGEBRA_BACKEND})
diff --git a/algebra/backend.h b/algebra/backend.h
new file mode 100644
index 0000000..adeb4c5
--- /dev/null
+++ b/algebra/backend.h
@@ -0,0 +1,27 @@
+/**
+ * @file backend.h
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ *
+ * @section DESCRIPTION
+ *
+ * Includes the appropriate backend header based on QOCO_ALGEBRA_BACKEND.
+ */
+
+#ifndef QOCO_BACKEND_H
+#define QOCO_BACKEND_H
+
+#if defined(QOCO_ALGEBRA_BACKEND_BUILTIN) || !defined(QOCO_ALGEBRA_BACKEND_CUDA)
+#include "builtin/qdldl_backend.h"
+#elif defined(QOCO_ALGEBRA_BACKEND_CUDA)
+#include "cuda/cudss_backend.h"
+#else
+#error "Unknown algebra backend. Define QOCO_ALGEBRA_BACKEND_BUILTIN or QOCO_ALGEBRA_BACKEND_CUDA"
+#endif
+
+#endif /* #ifndef QOCO_BACKEND_H */
diff --git a/algebra/builtin/builtin_linalg.c b/algebra/builtin/builtin_linalg.c
index 153ff3e..8aaaba3 100644
--- a/algebra/builtin/builtin_linalg.c
+++ b/algebra/builtin/builtin_linalg.c
@@ -8,7 +8,162 @@
  * This source code is licensed under the BSD 3-Clause License
  */
 
-#include "qoco_linalg.h"
+#include "builtin_types.h"
+
+QOCOMatrix* new_qoco_matrix(const QOCOCscMatrix* A)
+{
+  QOCOMatrix* M = qoco_malloc(sizeof(QOCOMatrix));
+  QOCOCscMatrix* Mcsc = qoco_malloc(sizeof(QOCOCscMatrix));
+
+  if (A) {
+    QOCOInt m = A->m;
+    QOCOInt n = A->n;
+    QOCOInt nnz = A->nnz;
+
+    QOCOFloat* x = qoco_malloc(nnz * sizeof(QOCOFloat));
+    QOCOInt* p = qoco_malloc((n + 1) * sizeof(QOCOInt));
+    QOCOInt* i = qoco_malloc(nnz * sizeof(QOCOInt));
+
+    copy_arrayf(A->x, x, nnz);
+    copy_arrayi(A->i, i, nnz);
+    copy_arrayi(A->p, p, n + 1);
+
+    Mcsc->m = m;
+    Mcsc->n = n;
+    Mcsc->nnz = nnz;
+    Mcsc->x = x;
+    Mcsc->i = i;
+    Mcsc->p = p;
+  }
+  else {
+    Mcsc->m = 0;
+    Mcsc->n = 0;
+    Mcsc->nnz = 0;
+    Mcsc->x = NULL;
+    Mcsc->i = NULL;
+    Mcsc->p = NULL;
+  }
+
+  M->csc = Mcsc;
+
+  return M;
+}
+
+QOCOVectorf* new_qoco_vectorf(const QOCOFloat* x, QOCOInt n)
+{
+  QOCOVectorf* v = qoco_malloc(sizeof(QOCOVectorf));
+  QOCOFloat* vdata = qoco_malloc(sizeof(QOCOFloat) * n);
+  if (x) {
+    copy_arrayf(x, vdata, n);
+  }
+  else {
+    // Initialize to zero if x is NULL.
+    for (QOCOInt i = 0; i < n; ++i) {
+      vdata[i] = 0.0;
+    }
+  }
+
+  v->len = n;
+  v->data = vdata;
+
+  return v;
+}
+
+void free_qoco_matrix(QOCOMatrix* A)
+{
+  free_qoco_csc_matrix(A->csc);
+  qoco_free(A);
+}
+
+void free_qoco_vectorf(QOCOVectorf* x)
+{
+  qoco_free(x->data);
+  qoco_free(x);
+}
+
+QOCOInt get_nnz(const QOCOMatrix* A) { return A->csc->nnz; }
+
+QOCOFloat get_element_vectorf(const QOCOVectorf* x, QOCOInt idx)
+{
+  return x->data[idx];
+}
+
+QOCOFloat* get_pointer_vectorf(const QOCOVectorf* x, QOCOInt idx)
+{
+  return &x->data[idx];
+}
+
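+/* Example (illustrative; variable names and values are hypothetical) of the
+ * opaque vector API implemented in this file:
+ *
+ *   QOCOFloat xdata[3] = {1.0, 2.0, 3.0};
+ *   QOCOVectorf* v = new_qoco_vectorf(xdata, 3); // deep copy of xdata
+ *   set_element_vectorf(v, 0, 4.0);              // v = [4, 2, 3]
+ *   QOCOFloat x0 = get_element_vectorf(v, 0);    // x0 == 4.0
+ *   free_qoco_vectorf(v);
+ */
+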
+QOCOFloat* get_data_vectorf(const QOCOVectorf* x)
+{
+  return x->data;
+}
+
+QOCOInt get_length_vectorf(const QOCOVectorf* x)
+{
+  return x->len;
+}
+
+void sync_vector_to_device_if_needed(QOCOVectorf* v)
+{
+  // No-op for builtin backend (no device memory)
+  (void)v;
+}
+
+void set_solve_phase(int active)
+{
+  // No-op for builtin backend (no device memory)
+  (void)active;
+}
+
+int get_solve_phase(void)
+{
+  // Always return 0 for builtin backend (no device memory)
+  return 0;
+}
+
+void copy_vector_from_device(QOCOVectorf* src, QOCOFloat* dst, QOCOInt n)
+{
+  // For builtin backend, just copy from host memory
+  if (src && dst) {
+    copy_arrayf(src->data, dst, n);
+  }
+}
+
+QOCOCscMatrix* get_csc_matrix(const QOCOMatrix* M)
+{
+  return M->csc;
+}
+
+void col_inf_norm_USymm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  col_inf_norm_USymm(get_csc_matrix(M), norm);
+}
+
+void col_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  col_inf_norm(get_csc_matrix(M), norm);
+}
+
+void row_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  row_inf_norm(get_csc_matrix(M), norm);
+}
+
+void row_col_scale_matrix(QOCOMatrix* M, const QOCOFloat* E, const QOCOFloat* D)
+{
+  row_col_scale(get_csc_matrix(M), (QOCOFloat*)E, (QOCOFloat*)D);
+}
+
+void set_element_vectorf(QOCOVectorf* x, QOCOInt idx, QOCOFloat data)
+{
+  x->data[idx] = data;
+}
+
+void reciprocal_vectorf(const QOCOVectorf* input, QOCOVectorf* output)
+{
+  for (QOCOInt i = 0; i < input->len; ++i) {
+    output->data[i] = safe_div(1.0, input->data[i]);
+  }
+}
 
 QOCOCscMatrix* new_qoco_csc_matrix(const QOCOCscMatrix* A)
 {
@@ -181,6 +336,21 @@ void SpMtv(const QOCOCscMatrix* M, const QOCOFloat* v, QOCOFloat* r)
   }
 }
 
+void USpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  USpMv(M->csc, v, r);
+}
+
+void SpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  SpMv(M->csc, v, r);
+}
+
+void SpMtv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  SpMtv(M->csc, v, r);
+}
+
 QOCOFloat inf_norm(const QOCOFloat* x, QOCOInt n)
 {
   qoco_assert(x || n == 0);
diff --git a/algebra/builtin/builtin_types.h b/algebra/builtin/builtin_types.h
new file mode 100644
index 0000000..8321ba3
--- /dev/null
+++ b/algebra/builtin/builtin_types.h
@@ -0,0 +1,36 @@
+/**
+ * @file builtin_types.h
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ *
+ * @section DESCRIPTION
+ *
+ * Defines the vector and matrix types for builtin linear algebra.
+ */
+
+#ifndef BUILTIN_TYPES_H
+#define BUILTIN_TYPES_H
+
+#include "common_linalg.h"
+#include "definitions.h"
+#include "qoco_linalg.h"
+
+struct QOCOVectori_ {
+  QOCOInt* data;
+  QOCOInt len;
+};
+
+struct QOCOVectorf_ {
+  QOCOFloat* data;
+  QOCOInt len;
+};
+
+struct QOCOMatrix_ {
+  QOCOCscMatrix* csc;
+};
+
+#endif /* ifndef BUILTIN_TYPES_H */
diff --git a/algebra/builtin/qdldl_backend.c b/algebra/builtin/qdldl_backend.c
index cd7c3e0..309c0e9 100644
--- a/algebra/builtin/qdldl_backend.c
+++ b/algebra/builtin/qdldl_backend.c
@@ -9,7 +9,6 @@
  */
 
 #include "qdldl_backend.h"
-#include
 
 // Contains data for linear system.
 struct LinSysData {
@@ -95,20 +94,22 @@ static LinSysData* qdldl_setup(QOCOProblemData* data, QOCOSettings* settings,
 
   // Allocate memory for mappings to KKT matrix.
   linsys_data->nt2kkt = qoco_calloc(Wnnz, sizeof(QOCOInt));
   linsys_data->ntdiag2kkt = qoco_calloc(data->m, sizeof(QOCOInt));
-  linsys_data->PregtoKKT = qoco_calloc(data->P->nnz, sizeof(QOCOInt));
-  linsys_data->AttoKKT = qoco_calloc(data->A->nnz, sizeof(QOCOInt));
-  linsys_data->GttoKKT = qoco_calloc(data->G->nnz, sizeof(QOCOInt));
+  linsys_data->PregtoKKT =
+      data->P ? qoco_calloc(get_nnz(data->P), sizeof(QOCOInt)) : NULL;
+  linsys_data->AttoKKT = qoco_calloc(get_nnz(data->A), sizeof(QOCOInt));
+  linsys_data->GttoKKT = qoco_calloc(get_nnz(data->G), sizeof(QOCOInt));
 
   QOCOInt* nt2kkt_temp = qoco_calloc(Wnnz, sizeof(QOCOInt));
   QOCOInt* ntdiag2kkt_temp = qoco_calloc(data->m, sizeof(QOCOInt));
-  QOCOInt* PregtoKKT_temp = qoco_calloc(data->P->nnz, sizeof(QOCOInt));
-  QOCOInt* AttoKKT_temp = qoco_calloc(data->A->nnz, sizeof(QOCOInt));
-  QOCOInt* GttoKKT_temp = qoco_calloc(data->G->nnz, sizeof(QOCOInt));
+  QOCOInt* PregtoKKT_temp =
+      data->P ? qoco_calloc(get_nnz(data->P), sizeof(QOCOInt)) : NULL;
+  QOCOInt* AttoKKT_temp = qoco_calloc(get_nnz(data->A), sizeof(QOCOInt));
+  QOCOInt* GttoKKT_temp = qoco_calloc(get_nnz(data->G), sizeof(QOCOInt));
 
   linsys_data->K = construct_kkt(
-      data->P, data->A, data->G, data->At, data->Gt, settings->kkt_static_reg,
-      data->n, data->m, data->p, data->l, data->nsoc, data->q, PregtoKKT_temp,
-      AttoKKT_temp, GttoKKT_temp, nt2kkt_temp, ntdiag2kkt_temp, Wnnz);
+      data->P ? get_csc_matrix(data->P) : NULL, get_csc_matrix(data->A),
+      get_csc_matrix(data->G), get_csc_matrix(data->At),
+      get_csc_matrix(data->Gt), settings->kkt_static_reg, data->n, data->m,
+      data->p, data->l, data->nsoc, data->q, PregtoKKT_temp, AttoKKT_temp,
+      GttoKKT_temp, nt2kkt_temp, ntdiag2kkt_temp, Wnnz);
 
   // Compute AMD ordering.
   linsys_data->p = qoco_malloc(linsys_data->K->n * sizeof(QOCOInt));
@@ -133,15 +134,17 @@ static LinSysData* qdldl_setup(QOCOProblemData* data, QOCOSettings* settings,
     linsys_data->ntdiag2kkt[i] = KtoPKPt[ntdiag2kkt_temp[i]];
   }
 
-  for (QOCOInt i = 0; i < data->P->nnz; ++i) {
-    linsys_data->PregtoKKT[i] = KtoPKPt[PregtoKKT_temp[i]];
+  if (data->P && PregtoKKT_temp) {
+    for (QOCOInt i = 0; i < get_nnz(data->P); ++i) {
+      linsys_data->PregtoKKT[i] = KtoPKPt[PregtoKKT_temp[i]];
+    }
   }
 
-  for (QOCOInt i = 0; i < data->A->nnz; ++i) {
+  for (QOCOInt i = 0; i < get_nnz(data->A); ++i) {
     linsys_data->AttoKKT[i] = KtoPKPt[AttoKKT_temp[i]];
   }
 
-  for (QOCOInt i = 0; i < data->G->nnz; ++i) {
+  for (QOCOInt i = 0; i < get_nnz(data->G); ++i) {
     linsys_data->GttoKKT[i] = KtoPKPt[GttoKKT_temp[i]];
   }
 
@@ -253,18 +256,25 @@ static void qdldl_update_nt(LinSysData* linsys_data, QOCOFloat* WtW,
 static void qdldl_update_data(LinSysData* linsys_data, QOCOProblemData* data)
 {
   // Update P in KKT matrix.
-  for (QOCOInt i = 0; i < data->P->nnz; ++i) {
-    linsys_data->K->x[linsys_data->PregtoKKT[i]] = data->P->x[i];
+  if (data->P && linsys_data->PregtoKKT) {
+    QOCOCscMatrix* Pcsc = get_csc_matrix(data->P);
+    for (QOCOInt i = 0; i < get_nnz(data->P); ++i) {
+      linsys_data->K->x[linsys_data->PregtoKKT[i]] = Pcsc->x[i];
+    }
   }
 
   // Update A in KKT matrix.
-  for (QOCOInt i = 0; i < data->A->nnz; ++i) {
-    linsys_data->K->x[linsys_data->AttoKKT[data->AtoAt[i]]] = data->A->x[i];
+  QOCOCscMatrix* Acsc = get_csc_matrix(data->A);
+  for (QOCOInt i = 0; i < get_nnz(data->A); ++i) {
+    linsys_data->K->x[linsys_data->AttoKKT[data->AtoAt[i]]] = Acsc->x[i];
   }
 
   // Update G in KKT matrix.
-  for (QOCOInt i = 0; i < data->G->nnz; ++i) {
-    linsys_data->K->x[linsys_data->GttoKKT[data->GtoGt[i]]] = data->G->x[i];
+  QOCOCscMatrix* Gcsc = get_csc_matrix(data->G);
+  for (QOCOInt i = 0; i < get_nnz(data->G); ++i) {
+    linsys_data->K->x[linsys_data->GttoKKT[data->GtoGt[i]]] = Gcsc->x[i];
   }
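+
+  /* Illustration (hypothetical values): the updates above go through two
+   * index maps. For A, entry i of A (CSC order) first maps to its position
+   * in the cached transpose At via data->AtoAt[i], and that At position
+   * maps to a slot in the KKT values array via linsys_data->AttoKKT. E.g.
+   * with AtoAt = {2, 0, 1} and AttoKKT = {7, 9, 4}, A->x[0] lands in
+   * K->x[AttoKKT[AtoAt[0]]] = K->x[4]. P needs no transpose map because
+   * only its upper triangle enters K directly. */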
 }
diff --git a/algebra/builtin/qdldl_backend.h b/algebra/builtin/qdldl_backend.h
index dc7b2b5..cb13dbd 100644
--- a/algebra/builtin/qdldl_backend.h
+++ b/algebra/builtin/qdldl_backend.h
@@ -16,6 +16,7 @@
 #define QDLDL_BACKEND_H
 
 #include "amd.h"
+#include "builtin_types.h"
 #include "common_linalg.h"
 #include "kkt.h"
 #include "qdldl.h"
diff --git a/algebra/cuda/CMakeLists.txt b/algebra/cuda/CMakeLists.txt
new file mode 100644
index 0000000..faae4e6
--- /dev/null
+++ b/algebra/cuda/CMakeLists.txt
@@ -0,0 +1,54 @@
+# Set CUDA flags before enabling language to allow unsupported GCC.
+# Set CMAKE_CUDA_FLAGS_INIT before enable_language so it's used during the
+# compiler ID test.
+set(CMAKE_CUDA_FLAGS_INIT "-allow-unsupported-compiler")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler")
+set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+# Set environment variable as fallback for compiler ID test.
+set(ENV{CMAKE_CUDA_FLAGS} "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler")
+
+enable_language(CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+find_library(CUBLAS_LIB cublas PATHS ${CUDAToolkit_LIBRARY_DIR})
+find_library(CUSPARSE_LIB cusparse PATHS ${CUDAToolkit_LIBRARY_DIR})
+
+# Search for cuDSS in CU_DSS_ROOT first, then system paths.
+if(DEFINED ENV{CU_DSS_ROOT})
+  find_library(CUDSS_LIB cudss PATHS "$ENV{CU_DSS_ROOT}/lib" NO_DEFAULT_PATH)
+endif()
+if(NOT CUDSS_LIB)
+  find_library(CUDSS_LIB cudss PATHS ${CUDAToolkit_LIBRARY_DIR})
+endif()
+
+if(NOT CUDSS_LIB)
+  message(WARNING "cuDSS library not found. You may need to install cuDSS separately.")
+  message(STATUS "cuDSS can be obtained from the NVIDIA Developer website")
+  message(STATUS "Building with cuDSS stubs - full functionality requires a cuDSS installation")
+  # Create a dummy library variable to avoid errors.
+  set(CUDSS_LIB "")
+else()
+  # Add HAVE_CUDSS definition - use CMAKE_CUDA_FLAGS to ensure it is passed
+  # to the CUDA compiler.
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DHAVE_CUDSS")
+  add_compile_definitions(HAVE_CUDSS)
+  # Add include directory for cuDSS headers.
+  if(DEFINED ENV{CU_DSS_ROOT})
+    include_directories("$ENV{CU_DSS_ROOT}/include")
+  endif()
+  message(STATUS "Found cuDSS library: ${CUDSS_LIB}")
+  message(STATUS "Added HAVE_CUDSS definition to CUDA flags")
+endif()
+
+set_property(GLOBAL APPEND PROPERTY QOCO_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/cudss_backend.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_linalg.cu)
+set_property(GLOBAL APPEND PROPERTY QOCO_HEADERS
+  ${CMAKE_CURRENT_SOURCE_DIR}/cudss_backend.h)
+
+# Add CUDA libraries to a global property (used in the main CMakeLists.txt).
+set_property(GLOBAL APPEND PROPERTY QOCO_CUDA_LIBS
+  ${CUBLAS_LIB}
+  ${CUSPARSE_LIB}
+)
+
+if(CUDSS_LIB)
+  set_property(GLOBAL APPEND PROPERTY QOCO_CUDA_LIBS ${CUDSS_LIB})
+endif()
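+
+# Example (illustrative; the install path is hypothetical): pointing the
+# cuDSS search above at an installation outside the default paths:
+#
+#   export CU_DSS_ROOT=/opt/nvidia/cudss
+#   cmake -B build -DQOCO_ALGEBRA_BACKEND=cuda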
diff --git a/algebra/cuda/cuda_linalg.cu b/algebra/cuda/cuda_linalg.cu
new file mode 100644
index 0000000..5ebad7c
--- /dev/null
+++ b/algebra/cuda/cuda_linalg.cu
@@ -0,0 +1,811 @@
+/**
+ * @file cuda_linalg.cu
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ */
+
+#include "cuda_types.h"
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+
+extern "C" {
+#include "common_linalg.h"
+}
+
+#define CUDA_CHECK(call)                                                       \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (err != cudaSuccess) {                                                  \
+      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,         \
+              cudaGetErrorString(err));                                        \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+// CUDA kernels
+__global__ void copy_arrayf_kernel(const QOCOFloat* x, QOCOFloat* y, QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = x[idx];
+  }
+}
+
+__global__ void copy_and_negate_arrayf_kernel(const QOCOFloat* x, QOCOFloat* y,
+                                              QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = -x[idx];
+  }
+}
+
+__global__ void copy_arrayi_kernel(const QOCOInt* x, QOCOInt* y, QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = x[idx];
+  }
+}
+
+__global__ void scale_arrayf_kernel(const QOCOFloat* x, QOCOFloat* y,
+                                    QOCOFloat s, QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = s * x[idx];
+  }
+}
+
+__global__ void axpy_kernel(const QOCOFloat* x, const QOCOFloat* y,
+                            QOCOFloat* z, QOCOFloat a, QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    z[idx] = a * x[idx] + y[idx];
+  }
+}
+
+__global__ void inf_norm_kernel(const QOCOFloat* x, QOCOFloat* result,
+                                QOCOInt n)
+{
+  extern __shared__ QOCOFloat sdata[];
+  QOCOInt tid = threadIdx.x;
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  QOCOFloat val = (idx < n) ? fabs(x[idx]) : 0.0;
+  sdata[tid] = val;
+  __syncthreads();
+
+  for (QOCOInt s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      // fmax (not fmaxf) so QOCOFloat == double is not truncated to float.
+      sdata[tid] = fmax(sdata[tid], sdata[tid + s]);
+    }
+    __syncthreads();
+  }
+
+  if (tid == 0) {
+    result[blockIdx.x] = sdata[0];
+  }
+}
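+
+// Example launch (illustrative; d_x and d_partial are hypothetical device
+// arrays): inf_norm_kernel uses dynamically sized shared memory, so the
+// launch must pass blockSize * sizeof(QOCOFloat) as the third launch
+// parameter:
+//
+//   const QOCOInt blockSize = 256;
+//   const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+//   inf_norm_kernel<<<numBlocks, blockSize, blockSize * sizeof(QOCOFloat)>>>(
+//       d_x, d_partial, n);
+//   // d_partial then holds one per-block maximum; the host reduces it.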
+
+// Round n up to a multiple of 256, with a minimum of 256.
+static inline QOCOInt get_block_size(QOCOInt n)
+{
+  QOCOInt rounded = ((n + 255) / 256) * 256;
+  return rounded < 256 ? 256 : rounded;
+}
+
+// Convert CSC to CSR format (host conversion, then copy to device).
+// Used for converting matrices to GPU format after setup/equilibration.
+static void csc_to_csr(const QOCOCscMatrix* csc, QOCOInt** csr_row_ptr,
+                       QOCOInt** csr_col_ind, QOCOFloat** csr_val)
+{
+  QOCOInt m = csc->m;
+  QOCOInt n = csc->n;
+  QOCOInt nnz = csc->nnz;
+
+  // Allocate CSR arrays on host.
+  QOCOInt* h_csr_row_ptr = (QOCOInt*)qoco_calloc((m + 1), sizeof(QOCOInt));
+  QOCOInt* h_csr_col_ind = (QOCOInt*)qoco_malloc(nnz * sizeof(QOCOInt));
+  QOCOFloat* h_csr_val = (QOCOFloat*)qoco_malloc(nnz * sizeof(QOCOFloat));
+
+  // Count nonzeros per row.
+  for (QOCOInt col = 0; col < n; ++col) {
+    for (QOCOInt idx = csc->p[col]; idx < csc->p[col + 1]; ++idx) {
+      QOCOInt row = csc->i[idx];
+      h_csr_row_ptr[row + 1]++;
+    }
+  }
+
+  // Compute row pointers (prefix sum).
+  for (QOCOInt i = 0; i < m; ++i) {
+    h_csr_row_ptr[i + 1] += h_csr_row_ptr[i];
+  }
+
+  // Temporary array to track the current position in each row.
+  QOCOInt* row_pos = (QOCOInt*)qoco_malloc(m * sizeof(QOCOInt));
+  for (QOCOInt i = 0; i < m; ++i) {
+    row_pos[i] = h_csr_row_ptr[i];
+  }
+
+  // Fill CSR arrays.
+  for (QOCOInt col = 0; col < n; ++col) {
+    for (QOCOInt idx = csc->p[col]; idx < csc->p[col + 1]; ++idx) {
+      QOCOInt row = csc->i[idx];
+      QOCOInt csr_idx = row_pos[row]++;
+      h_csr_col_ind[csr_idx] = col;
+      h_csr_val[csr_idx] = csc->x[idx];
+    }
+  }
+
+  qoco_free(row_pos);
+
+  // Allocate device memory and copy.
+  CUDA_CHECK(cudaMalloc(csr_row_ptr, (m + 1) * sizeof(QOCOInt)));
+  CUDA_CHECK(cudaMalloc(csr_col_ind, nnz * sizeof(QOCOInt)));
+  CUDA_CHECK(cudaMalloc(csr_val, nnz * sizeof(QOCOFloat)));
+
+  CUDA_CHECK(cudaMemcpy(*csr_row_ptr, h_csr_row_ptr, (m + 1) * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(*csr_col_ind, h_csr_col_ind, nnz * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(*csr_val, h_csr_val, nnz * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+
+  // Free host memory.
+  qoco_free(h_csr_row_ptr);
+  qoco_free(h_csr_col_ind);
+  qoco_free(h_csr_val);
+}
+
+extern "C" {
+
+// Host implementation that manages GPU memory.
+QOCOMatrix* new_qoco_matrix(const QOCOCscMatrix* A)
+{
+  QOCOMatrix* M = (QOCOMatrix*)qoco_malloc(sizeof(QOCOMatrix));
+
+  if (A) {
+    QOCOInt m = A->m;
+    QOCOInt n = A->n;
+    QOCOInt nnz = A->nnz;
+
+    // Allocate host memory.
+    QOCOCscMatrix* Mcsc = (QOCOCscMatrix*)qoco_malloc(sizeof(QOCOCscMatrix));
+    QOCOFloat* x = (QOCOFloat*)qoco_malloc(nnz * sizeof(QOCOFloat));
+    QOCOInt* p = (QOCOInt*)qoco_malloc((n + 1) * sizeof(QOCOInt));
+    QOCOInt* i = (QOCOInt*)qoco_malloc(nnz * sizeof(QOCOInt));
+
+    copy_arrayf(A->x, x, nnz);
+    copy_arrayi(A->i, i, nnz);
+    copy_arrayi(A->p, p, n + 1);
+
+    Mcsc->m = m;
+    Mcsc->n = n;
+    Mcsc->nnz = nnz;
+    Mcsc->x = x;
+    Mcsc->i = i;
+    Mcsc->p = p;
+
+    // For problem matrices (P, A, G), don't convert to CSR yet.
+    // Equilibration happens on the CPU using CSC, then the KKT matrix is
+    // formed; only the KKT matrix needs to be converted to CSR for cuDSS.
+    // So leave d_csr NULL - it is set up when the KKT matrix is created.
+    M->csc = Mcsc;
+    M->d_csr = NULL;
+  }
+  else {
+    QOCOCscMatrix* Mcsc = (QOCOCscMatrix*)qoco_malloc(sizeof(QOCOCscMatrix));
+    Mcsc->m = 0;
+    Mcsc->n = 0;
+    Mcsc->nnz = 0;
+    Mcsc->x = NULL;
+    Mcsc->i = NULL;
+    Mcsc->p = NULL;
+    M->csc = Mcsc;
+    M->d_csr = NULL;
+  }
+
+  return M;
+}
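+
+/* Worked example (illustrative) of the CSC-to-CSR conversion above for the
+ * 2x2 matrix [1 2; 0 3]:
+ *   CSC input:  p = {0, 1, 3},  i = {0, 0, 1},  x = {1, 2, 3}
+ *   CSR output: row_ptr = {0, 2, 3}, col_ind = {0, 1, 1}, val = {1, 2, 3}
+ * Column indices within each CSR row come out sorted because the CSC
+ * columns are traversed in increasing order. */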
+
+QOCOVectorf* new_qoco_vectorf(const QOCOFloat* x, QOCOInt n)
+{
+  QOCOVectorf* v = (QOCOVectorf*)qoco_malloc(sizeof(QOCOVectorf));
+  QOCOFloat* vdata = (QOCOFloat*)qoco_malloc(sizeof(QOCOFloat) * n);
+
+  if (x) {
+    copy_arrayf(x, vdata, n);
+  }
+  else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      vdata[i] = 0.0;
+    }
+  }
+
+  QOCOFloat* d_vdata;
+  CUDA_CHECK(cudaMalloc(&d_vdata, sizeof(QOCOFloat) * n));
+  CUDA_CHECK(cudaMemcpy(d_vdata, vdata, sizeof(QOCOFloat) * n, cudaMemcpyHostToDevice));
+
+  v->len = n;
+  v->data = vdata;
+  v->d_data = d_vdata;
+
+  return v;
+}
+
+void free_qoco_matrix(QOCOMatrix* A)
+{
+  if (A) {
+    if (A->csc) {
+      free_qoco_csc_matrix(A->csc);
+    }
+    if (A->d_csr) {
+      if (A->d_csr->row_ptr) cudaFree(A->d_csr->row_ptr);
+      if (A->d_csr->col_ind) cudaFree(A->d_csr->col_ind);
+      if (A->d_csr->val) cudaFree(A->d_csr->val);
+      qoco_free(A->d_csr);
+    }
+    qoco_free(A);
+  }
+}
+
+void free_qoco_vectorf(QOCOVectorf* x)
+{
+  if (x) {
+    if (x->data) qoco_free(x->data);
+    if (x->d_data) cudaFree(x->d_data);
+    qoco_free(x);
+  }
+}
+
+QOCOInt get_nnz(const QOCOMatrix* A) { return A->csc->nnz; }
+
+QOCOFloat get_element_vectorf(const QOCOVectorf* x, QOCOInt idx)
+{
+  return x->data[idx];
+}
+
+QOCOFloat* get_pointer_vectorf(const QOCOVectorf* x, QOCOInt idx)
+{
+  return &x->data[idx];
+}
+
+// Track the solve phase to determine whether to return device or host
+// pointers.
+static int solve_phase_active = 0;
+
+void set_solve_phase(int active)
+{
+  solve_phase_active = active;
+}
+
+int get_solve_phase(void)
+{
+  return solve_phase_active;
+}
+
+// Helper function to copy solution vectors from device to host (called from
+// C code).
+void copy_vector_from_device(QOCOVectorf* src, QOCOFloat* dst, QOCOInt n)
+{
+  if (src && src->d_data && dst) {
+    CUDA_CHECK(cudaMemcpy(dst, src->d_data, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+  }
+}
+
+QOCOFloat* get_data_vectorf(const QOCOVectorf* x)
+{
+  // During equilibration/setup (CPU phase), return the host pointer.
+  // During solve (GPU phase), return the device pointer to avoid CPU-GPU
+  // copies.
+  if (solve_phase_active && x->d_data) {
+    return x->d_data;
+  }
+  return x->data;
+}
+
+QOCOInt get_length_vectorf(const QOCOVectorf* x)
+{
+  return x->len;
+}
+
+void sync_vector_to_device_if_needed(QOCOVectorf* v)
+{
+  if (v && v->data && v->d_data) {
+    // Sync from device to host (at solver termination, copy device data to
+    // host).
+    CUDA_CHECK(cudaMemcpy(v->data, v->d_data, v->len * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+  }
+}
+
+QOCOCscMatrix* get_csc_matrix(const QOCOMatrix* M)
+{
+  return M->csc;
+}
+
+// These functions are only called during equilibration on the CPU, so call
+// the C functions directly.
+void col_inf_norm_USymm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  col_inf_norm_USymm(get_csc_matrix(M), norm);
+}
+
+void col_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  col_inf_norm(get_csc_matrix(M), norm);
+}
+
+void row_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  row_inf_norm(get_csc_matrix(M), norm);
+}
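+
+/* Example (illustrative; names are hypothetical) of the solve-phase switch
+ * implemented by get_data_vectorf above:
+ *
+ *   QOCOVectorf* v = new_qoco_vectorf(NULL, n);
+ *   QOCOFloat* h = get_data_vectorf(v);  // host pointer (setup phase)
+ *   set_solve_phase(1);
+ *   QOCOFloat* d = get_data_vectorf(v);  // device pointer (solve phase)
+ *   set_solve_phase(0);
+ *   sync_vector_to_device_if_needed(v);  // device -> host copy at termination
+ */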
+
+void row_col_scale_matrix(QOCOMatrix* M, const QOCOFloat* E, const QOCOFloat* D)
+{
+  // Called during setup on the CPU - scale the host CSC.
+  row_col_scale(get_csc_matrix(M), (QOCOFloat*)E, (QOCOFloat*)D);
+
+  // Clear the device CSR if it exists - it will be recreated from the
+  // updated CSC when the KKT matrix is formed.
+  if (M->d_csr) {
+    if (M->d_csr->row_ptr) cudaFree(M->d_csr->row_ptr);
+    if (M->d_csr->col_ind) cudaFree(M->d_csr->col_ind);
+    if (M->d_csr->val) cudaFree(M->d_csr->val);
+    qoco_free(M->d_csr);
+    M->d_csr = NULL;
+  }
+}
+
+void set_element_vectorf(QOCOVectorf* x, QOCOInt idx, QOCOFloat data)
+{
+  // Update host (during setup/equilibration, host is primary).
+  x->data[idx] = data;
+  // Also update the device copy to keep the two in sync.
+  CUDA_CHECK(cudaMemcpy(&x->d_data[idx], &data, sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+}
+
+void reciprocal_vectorf(const QOCOVectorf* input, QOCOVectorf* output)
+{
+  // Setup phase only: compute on the host, then mirror to the device.
+  for (QOCOInt i = 0; i < input->len; ++i) {
+    output->data[i] = safe_div(1.0, input->data[i]);
+  }
+  CUDA_CHECK(cudaMemcpy(output->d_data, output->data, output->len * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+}
+
+QOCOCscMatrix* new_qoco_csc_matrix(const QOCOCscMatrix* A)
+{
+  QOCOCscMatrix* M = (QOCOCscMatrix*)qoco_malloc(sizeof(QOCOCscMatrix));
+
+  if (A) {
+    QOCOInt m = A->m;
+    QOCOInt n = A->n;
+    QOCOInt nnz = A->nnz;
+
+    QOCOFloat* x = (QOCOFloat*)qoco_malloc(nnz * sizeof(QOCOFloat));
+    QOCOInt* p = (QOCOInt*)qoco_malloc((n + 1) * sizeof(QOCOInt));
+    QOCOInt* i = (QOCOInt*)qoco_malloc(nnz * sizeof(QOCOInt));
+
+    copy_arrayf(A->x, x, nnz);
+    copy_arrayi(A->i, i, nnz);
+    copy_arrayi(A->p, p, n + 1);
+
+    M->m = m;
+    M->n = n;
+    M->nnz = nnz;
+    M->x = x;
+    M->i = i;
+    M->p = p;
+  }
+  else {
+    M->m = 0;
+    M->n = 0;
+    M->nnz = 0;
+    M->x = NULL;
+    M->i = NULL;
+    M->p = NULL;
+  }
+
+  return M;
+}
+
+void free_qoco_csc_matrix(QOCOCscMatrix* A)
+{
+  if (A) {
+    free(A->x);
+    free(A->i);
+    free(A->p);
+    free(A);
+  }
+}
+
+void copy_arrayf(const QOCOFloat* x, QOCOFloat* y, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+
+  // If either pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice)) {
+    const QOCOInt blockSize = 256;
+    const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+    if (err_x == cudaSuccess && err_y == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice) {
+      // Both on device - use the kernel.
+      copy_arrayf_kernel<<<numBlocks, blockSize>>>(x, y, n);
+    } else if (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) {
+      // x on device, y on host.
+      CUDA_CHECK(cudaMemcpy(y, x, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+    } else if (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice) {
+      // y on device, x on host.
+      CUDA_CHECK(cudaMemcpy(y, x, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    // Both on host - use the CPU.
+    for (QOCOInt i = 0; i < n; ++i) {
+      y[i] = x[i];
+    }
+  }
+}
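+
+// The host/device dispatch above reappears in most functions below. An
+// equivalent helper (hypothetical sketch, shown only to document the
+// pattern; not used by this file) would be:
+//
+//   static bool qoco_is_device_ptr(const void* p) {
+//     cudaPointerAttributes attrs;
+//     return cudaPointerGetAttributes(&attrs, p) == cudaSuccess &&
+//            attrs.type == cudaMemoryTypeDevice;
+//   }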
+
+void copy_and_negate_arrayf(const QOCOFloat* x, QOCOFloat* y, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  const QOCOInt blockSize = 256;
+  const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+
+  // If either pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice)) {
+    if (err_x == cudaSuccess && err_y == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice) {
+      copy_and_negate_arrayf_kernel<<<numBlocks, blockSize>>>(x, y, n);
+    } else if (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) {
+      // x on device, y on host: stage x on the host and negate there.
+      QOCOFloat* temp = (QOCOFloat*)qoco_malloc(n * sizeof(QOCOFloat));
+      CUDA_CHECK(cudaMemcpy(temp, x, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+      for (QOCOInt i = 0; i < n; ++i) y[i] = -temp[i];
+      qoco_free(temp);
+    } else {
+      // x on host, y on device: stage x on the device and use the kernel.
+      QOCOFloat* d_x;
+      CUDA_CHECK(cudaMalloc(&d_x, n * sizeof(QOCOFloat)));
+      CUDA_CHECK(cudaMemcpy(d_x, x, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+      copy_and_negate_arrayf_kernel<<<numBlocks, blockSize>>>(d_x, y, n);
+      cudaFree(d_x);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      y[i] = -x[i];
+    }
+  }
+}
+
+void copy_arrayi(const QOCOInt* x, QOCOInt* y, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  const QOCOInt blockSize = 256;
+  const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+
+  // If either pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice)) {
+    if (err_x == cudaSuccess && err_y == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice) {
+      copy_arrayi_kernel<<<numBlocks, blockSize>>>(x, y, n);
+    } else if (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) {
+      CUDA_CHECK(cudaMemcpy(y, x, n * sizeof(QOCOInt), cudaMemcpyDeviceToHost));
+    } else if (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice) {
+      CUDA_CHECK(cudaMemcpy(y, x, n * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      y[i] = x[i];
+    }
+  }
+}
+
+QOCOFloat qoco_dot(const QOCOFloat* u, const QOCOFloat* v, QOCOInt n)
+{
+  qoco_assert(u || n == 0);
+  qoco_assert(v || n == 0);
+
+  if (n == 0) return 0.0;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_u, attrs_v;
+  cudaError_t err_u = cudaPointerGetAttributes(&attrs_u, u);
+  cudaError_t err_v = cudaPointerGetAttributes(&attrs_v, v);
+
+  // If either pointer check succeeds and reports device memory, use cuBLAS.
+  if ((err_u == cudaSuccess && attrs_u.type == cudaMemoryTypeDevice) ||
+      (err_v == cudaSuccess && attrs_v.type == cudaMemoryTypeDevice)) {
+    cublasHandle_t handle;
+    cublasCreate(&handle);
+    QOCOFloat result;
+    cublasDdot(handle, n, u, 1, v, 1, &result);
+    cublasDestroy(handle);
+    return result;
+  } else {
+    QOCOFloat x = 0.0;
+    for (QOCOInt i = 0; i < n; ++i) {
+      x += u[i] * v[i];
+    }
+    return x;
+  }
+}
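+
+// Note: the cuBLAS calls in this file (cublasDdot, cublasDcopy, cublasDscal,
+// cublasDaxpy) are the double-precision entry points, so this backend
+// assumes QOCOFloat is double. A single-precision build would need the
+// cublasS* counterparts instead, e.g. (sketch, assuming QOCOFloat == float):
+//
+//   float result;
+//   cublasSdot(handle, n, u, 1, v, 1, &result);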
+
+QOCOInt max_arrayi(const QOCOInt* x, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+
+  if (n == 0) return -QOCOInt_MAX;
+
+  cudaPointerAttributes attrs;
+  cudaError_t err = cudaPointerGetAttributes(&attrs, x);
+
+  if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) {
+    // Copy to the host and compute there.
+    QOCOInt* h_x = (QOCOInt*)qoco_malloc(n * sizeof(QOCOInt));
+    CUDA_CHECK(cudaMemcpy(h_x, x, n * sizeof(QOCOInt), cudaMemcpyDeviceToHost));
+    QOCOInt max = -QOCOInt_MAX;
+    for (QOCOInt i = 0; i < n; ++i) {
+      max = qoco_max(max, h_x[i]);
+    }
+    qoco_free(h_x);
+    return max;
+  } else {
+    QOCOInt max = -QOCOInt_MAX;
+    for (QOCOInt i = 0; i < n; ++i) {
+      max = qoco_max(max, x[i]);
+    }
+    return max;
+  }
+}
+
+void scale_arrayf(const QOCOFloat* x, QOCOFloat* y, QOCOFloat s, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  const QOCOInt blockSize = 256;
+  const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+
+  // If either pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice)) {
+    if (err_x == cudaSuccess && err_y == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice) {
+      scale_arrayf_kernel<<<numBlocks, blockSize>>>(x, y, s, n);
+    } else {
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+      if (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice &&
+          err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeHost) {
+        // x on device, y on (registered) host memory.
+        QOCOFloat* d_y;
+        CUDA_CHECK(cudaMalloc(&d_y, n * sizeof(QOCOFloat)));
+        cublasDcopy(handle, n, x, 1, d_y, 1);
+        cublasDscal(handle, n, &s, d_y, 1);
+        CUDA_CHECK(cudaMemcpy(y, d_y, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+        cudaFree(d_y);
+      } else {
+        // x on host, y on device.
+        QOCOFloat* d_x;
+        CUDA_CHECK(cudaMalloc(&d_x, n * sizeof(QOCOFloat)));
+        CUDA_CHECK(cudaMemcpy(d_x, x, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+        cublasDcopy(handle, n, d_x, 1, y, 1);
+        cublasDscal(handle, n, &s, y, 1);
+        cudaFree(d_x);
+      }
+      cublasDestroy(handle);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      y[i] = s * x[i];
+    }
+  }
+}
+
+void qoco_axpy(const QOCOFloat* x, const QOCOFloat* y, QOCOFloat* z,
+               QOCOFloat a, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  const QOCOInt blockSize = 256;
+  const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y, attrs_z;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+  cudaError_t err_z = cudaPointerGetAttributes(&attrs_z, z);
+
+  // If any pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice) ||
+      (err_z == cudaSuccess && attrs_z.type == cudaMemoryTypeDevice)) {
+    if (err_x == cudaSuccess && err_y == cudaSuccess && err_z == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice &&
+        attrs_z.type == cudaMemoryTypeDevice) {
+      axpy_kernel<<<numBlocks, blockSize>>>(x, y, z, a, n);
+    } else {
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+      // For mixed cases, use cuBLAS.
+      if (err_z == cudaSuccess && attrs_z.type == cudaMemoryTypeDevice) {
+        cublasDcopy(handle, n, y, 1, z, 1);
+        cublasDaxpy(handle, n, &a, x, 1, z, 1);
+      } else {
+        // z is on the host: stage the operands on the device, compute there,
+        // and copy the result back.
+        QOCOFloat* d_z;
+        CUDA_CHECK(cudaMalloc(&d_z, n * sizeof(QOCOFloat)));
+        const QOCOFloat* d_x = (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ? x : NULL;
+        const QOCOFloat* d_y = (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice) ? y : NULL;
+        int x_staged = 0, y_staged = 0;
+        if (!d_x) {
+          CUDA_CHECK(cudaMalloc((void**)&d_x, n * sizeof(QOCOFloat)));
+          CUDA_CHECK(cudaMemcpy((void*)d_x, x, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+          x_staged = 1;
+        }
+        if (!d_y) {
+          CUDA_CHECK(cudaMalloc((void**)&d_y, n * sizeof(QOCOFloat)));
+          CUDA_CHECK(cudaMemcpy((void*)d_y, y, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+          y_staged = 1;
+        }
+        cublasDcopy(handle, n, d_y, 1, d_z, 1);
+        cublasDaxpy(handle, n, &a, d_x, 1, d_z, 1);
+        CUDA_CHECK(cudaMemcpy(z, d_z, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+        // Free only the staging copies, never caller-owned device arrays.
+        if (x_staged) cudaFree((void*)d_x);
+        if (y_staged) cudaFree((void*)d_y);
+        cudaFree(d_z);
+      }
+      cublasDestroy(handle);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      z[i] = a * x[i] + y[i];
+    }
+  }
+}
+
+// Sparse matrix-vector multiplication. These currently run on the CPU.
+// TODO: Implement a CUDA version using cuSPARSE.
+void USpMv(const QOCOCscMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  qoco_assert(M);
+  qoco_assert(v);
+  qoco_assert(r);
+
+  for (QOCOInt i = 0; i < M->n; i++) {
+    r[i] = 0.0;
+    for (QOCOInt j = M->p[i]; j < M->p[i + 1]; j++) {
+      QOCOInt row = M->i[j];
+      r[row] += M->x[j] * v[i];
+      if (row != i)
+        r[i] += M->x[j] * v[row];
+    }
+  }
+}
+
+void SpMv(const QOCOCscMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  qoco_assert(M);
+  qoco_assert(v);
+  qoco_assert(r);
+
+  for (QOCOInt i = 0; i < M->m; ++i) {
+    r[i] = 0.0;
+  }
+
+  for (QOCOInt j = 0; j < M->n; j++) {
+    for (QOCOInt i = M->p[j]; i < M->p[j + 1]; i++) {
+      r[M->i[i]] += M->x[i] * v[j];
+    }
+  }
+}
+
+void SpMtv(const QOCOCscMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  qoco_assert(M);
+  qoco_assert(v);
+  qoco_assert(r);
+
+  for (QOCOInt i = 0; i < M->n; ++i) {
+    r[i] = 0.0;
+  }
+
+  for (QOCOInt i = 0; i < M->n; i++) {
+    for (QOCOInt j = M->p[i]; j < M->p[i + 1]; j++) {
+      r[i] += M->x[j] * v[M->i[j]];
+    }
+  }
+}
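+
+// Sketch (illustrative, untested) of the cuSPARSE path the TODO above refers
+// to, using the generic SpMV API available since CUDA 11; it assumes
+// QOCOFloat == double and device-resident CSR data (d_row_ptr, d_col_ind,
+// d_val) plus device vectors d_v and d_r, all hypothetical names:
+//
+//   cusparseSpMatDescr_t matA;
+//   cusparseDnVecDescr_t vecX, vecY;
+//   double alpha = 1.0, beta = 0.0;
+//   cusparseCreateCsr(&matA, m, n, nnz, d_row_ptr, d_col_ind, d_val,
+//                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+//                     CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+//   cusparseCreateDnVec(&vecX, n, d_v, CUDA_R_64F);
+//   cusparseCreateDnVec(&vecY, m, d_r, CUDA_R_64F);
+//   size_t bufSize = 0;
+//   cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+//                           matA, vecX, &beta, vecY, CUDA_R_64F,
+//                           CUSPARSE_SPMV_ALG_DEFAULT, &bufSize);
+//   void* dBuf; cudaMalloc(&dBuf, bufSize);
+//   cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA,
+//                vecX, &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT,
+//                dBuf);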
+
+} // extern "C"
+
+// These definitions inherit C linkage from their declarations in
+// qoco_linalg.h, which cuda_types.h includes inside an extern "C" block.
+void USpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  USpMv(M->csc, v, r);
+}
+
+void SpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  SpMv(M->csc, v, r);
+}
+
+void SpMtv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  SpMtv(M->csc, v, r);
+}
+
+QOCOFloat inf_norm(const QOCOFloat* x, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+
+  if (n == 0) return 0.0;
+
+  cudaPointerAttributes attrs;
+  cudaError_t err = cudaPointerGetAttributes(&attrs, x);
+
+  if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) {
+    const QOCOInt blockSize = 256;
+    const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+    QOCOFloat* d_result;
+    CUDA_CHECK(cudaMalloc(&d_result, numBlocks * sizeof(QOCOFloat)));
+
+    // Pass the dynamic shared memory size required by inf_norm_kernel.
+    inf_norm_kernel<<<numBlocks, blockSize, blockSize * sizeof(QOCOFloat)>>>(
+        x, d_result, n);
+
+    QOCOFloat* h_result = (QOCOFloat*)qoco_malloc(numBlocks * sizeof(QOCOFloat));
+    CUDA_CHECK(cudaMemcpy(h_result, d_result, numBlocks * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+
+    QOCOFloat result = 0.0;
+    for (QOCOInt i = 0; i < numBlocks; ++i) {
+      result = qoco_max(result, h_result[i]);
+    }
+
+    qoco_free(h_result);
+    cudaFree(d_result);
+    return result;
+  } else {
+    QOCOFloat norm = 0.0;
+    QOCOFloat xi;
+    for (QOCOInt i = 0; i < n; ++i) {
+      xi = qoco_abs(x[i]);
+      norm = qoco_max(norm, xi);
+    }
+    return norm;
+  }
+}
diff --git a/algebra/cuda/cuda_types.h b/algebra/cuda/cuda_types.h
new file mode 100644
index 0000000..1ffa4a0
--- /dev/null
+++ b/algebra/cuda/cuda_types.h
@@ -0,0 +1,68 @@
+/**
+ * @file cuda_types.h
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ *
+ * @section DESCRIPTION
+ *
+ * Defines the vector and matrix types for CUDA linear algebra.
+ * On the GPU, matrices are stored in CSR format (as required by
+ * cuSPARSE/cuDSS).
+ */
+
+#ifndef CUDA_TYPES_H
+#define CUDA_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "common_linalg.h"
+#include "definitions.h"
+#include "qoco_linalg.h"
+#ifdef __cplusplus
+}
+#endif
+#include <cuda_runtime.h>
+
+// CSR matrix format (for GPU storage).
+typedef struct {
+  /** Number of rows. */
+  QOCOInt m;
+
+  /** Number of columns. */
+  QOCOInt n;
+
+  /** Number of nonzero elements. */
+  QOCOInt nnz;
+
+  /** Row pointers (length: m+1). */
+  QOCOInt* row_ptr;
+
+  /** Column indices (length: nnz). */
+  QOCOInt* col_ind;
+
+  /** Data (length: nnz). */
+  QOCOFloat* val;
+} QOCOCsrMatrix;
+
+struct QOCOVectori_ {
+  QOCOInt* data;   // Host pointer (kept for compatibility; not used during solve)
+  QOCOInt* d_data; // Device pointer (primary storage)
+  QOCOInt len;
+};
+
+struct QOCOVectorf_ {
+  QOCOFloat* data;   // Host pointer (kept for compatibility; not used during solve)
+  QOCOFloat* d_data; // Device pointer (primary storage)
+  QOCOInt len;
+};
+
+struct QOCOMatrix_ {
+  QOCOCscMatrix* csc;   // Host CSC (used for setup/CPU operations)
+  QOCOCsrMatrix* d_csr; // Device CSR (primary GPU storage)
+};
+
+#endif /* ifndef CUDA_TYPES_H */
diff --git a/algebra/cuda/cudss_backend.cu b/algebra/cuda/cudss_backend.cu
new file mode 100644
index 0000000..b6c0732
--- /dev/null
+++ b/algebra/cuda/cudss_backend.cu
@@ -0,0 +1,792 @@
+/**
+ * @file cudss_backend.cu
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ */
+
+#include "cudss_backend.h"
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cudss.h>
+#include <cusparse.h>
+#include <cstdio>
+#include <cstdlib>
+
+#include "kkt.h"
+#include "common_linalg.h"
+
+#define CUDA_CHECK(call)                                                       \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (err != cudaSuccess) {                                                  \
+      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,         \
+              cudaGetErrorString(err));                                        \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+// Map a cudssStatus_t to a printable name (used by CUDSS_CHECK and by the
+// analysis/factorization error paths below).
+static const char* cudss_status_str(cudssStatus_t err)
+{
+  switch (err) {
+    case CUDSS_STATUS_NOT_INITIALIZED: return "NOT_INITIALIZED";
+    case CUDSS_STATUS_ALLOC_FAILED: return "ALLOC_FAILED";
+    case CUDSS_STATUS_INVALID_VALUE: return "INVALID_VALUE";
+    case CUDSS_STATUS_NOT_SUPPORTED: return "NOT_SUPPORTED";
+    case CUDSS_STATUS_EXECUTION_FAILED: return "EXECUTION_FAILED";
+    case CUDSS_STATUS_INTERNAL_ERROR: return "INTERNAL_ERROR";
+    default: return "UNKNOWN";
+  }
+}
+
+#define CUDSS_CHECK(call)                                                      \
+  do {                                                                         \
+    cudssStatus_t err = call;                                                  \
+    if (err != CUDSS_STATUS_SUCCESS) {                                         \
+      fprintf(stderr, "cuDSS error at %s:%d: status %d (%s)\n", __FILE__,      \
+              __LINE__, (int)err, cudss_status_str(err));                      \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+// Contains data for the linear system.
+struct LinSysData {
+  /** KKT matrix in CSC form (host). */
+  QOCOCscMatrix* K;
+
+  /** KKT matrix in CSR form (device) for cuDSS. */
+  cudssMatrix_t K_csr;
+
+  /** Permutation vector (not used by the CUDA backend - kept for
+   * compatibility). */
+  QOCOInt* p;
+
+  /** Inverse of the permutation vector (not used by the CUDA backend - kept
+   * for compatibility). */
+  QOCOInt* pinv;
+
+  /** cuDSS handle. */
+  cudssHandle_t handle;
+
+  /** cuDSS config. */
+  cudssConfig_t config;
+
+  /** cuDSS data. */
+  cudssData_t data;
+
+  /** Buffer of size n + m + p. */
+  QOCOFloat* xyzbuff1;
+
+  /** Buffer of size n + m + p (device) - used for the b and x vectors. */
+  QOCOFloat* d_xyzbuff1;
+
+  /** Buffer of size n + m + p. */
+  QOCOFloat* xyzbuff2;
+
+  /** Buffer of size n + m + p (device). */
+  QOCOFloat* d_xyzbuff2;
+
+  /** Device buffer for rhs (RHS of the KKT system) - used during the solve
+   * phase. */
+  QOCOFloat* d_rhs;
+
+  /** Device buffer for xyz (solution of the KKT system) - used during the
+   * solve phase. */
+  QOCOFloat* d_xyz;
+
+  /** Mapping from elements in the Nesterov-Todd scaling matrix to elements in
+   * the KKT matrix. */
+  QOCOInt* nt2kkt;
+
+  /** Mapping from elements on the main diagonal of the Nesterov-Todd scaling
+   * matrices to elements in the KKT matrix. Used for regularization. */
+  QOCOInt* ntdiag2kkt;
+
+  /** Mapping from elements in regularized P to elements in the KKT matrix. */
+  QOCOInt* PregtoKKT;
+
+  /** Mapping from elements in At to elements in the KKT matrix. */
+  QOCOInt* AttoKKT;
+
+  /** Mapping from elements in Gt to elements in the KKT matrix. */
+  QOCOInt* GttoKKT;
+
+  QOCOInt Wnnz;
+
+  /** cuSPARSE handle. */
+  cusparseHandle_t cusparse_handle;
+
+  /** Matrix description. */
+  cusparseMatDescr_t descr;
+
+  /** cuBLAS handle (for axpy operations). */
+  cublasHandle_t cublas_handle;
+
+  /** cuDSS dense matrix wrappers for the solution and RHS vectors. */
+  cudssMatrix_t d_rhs_matrix;
+  cudssMatrix_t d_xyz_matrix;
+
+  /** CSR matrix arrays on the GPU (persistent, updated in place). */
+  QOCOInt* d_csr_row_ptr;
+  QOCOInt* d_csr_row_end;
+  QOCOInt* d_csr_col_ind;
+  QOCOFloat* d_csr_val;
+
+  /** Mappings on the GPU for direct updates. */
+  QOCOInt* d_nt2kkt;
+  QOCOInt* d_ntdiag2kkt;
+  QOCOInt* d_PregtoKKT;
+  QOCOInt* d_AttoKKT;
+  QOCOInt* d_GttoKKT;
+
+  /** CSC structure on the GPU (for converting updates). */
+  QOCOInt* d_csc_p;
+  QOCOInt* d_csc_i;
+
+  /** Flag tracking whether symbolic analysis has been done (per instance). */
+  int analysis_done;
+};
+
+// Convert CSC to CSR on the host and copy to the device.
+static void csc_to_csr_device(const QOCOCscMatrix* csc, QOCOInt** csr_row_ptr,
+                              QOCOInt** csr_col_ind, QOCOFloat** csr_val,
+                              cusparseHandle_t handle)
+{
+  QOCOInt n = csc->n;
+  QOCOInt m = csc->m;
+  QOCOInt nnz = csc->nnz;
+
+  // Allocate CSR arrays on host.
+  QOCOInt* h_csr_row_ptr = (QOCOInt*)qoco_calloc((m + 1), sizeof(QOCOInt));
+  QOCOInt* h_csr_col_ind = (QOCOInt*)qoco_malloc(nnz * sizeof(QOCOInt));
+  QOCOFloat* h_csr_val = (QOCOFloat*)qoco_malloc(nnz * sizeof(QOCOFloat));
+
+  // Count nonzeros per row.
+  for (QOCOInt col = 0; col < n; ++col) {
+    for (QOCOInt idx = csc->p[col]; idx < csc->p[col + 1]; ++idx) {
+      QOCOInt row = csc->i[idx];
+      h_csr_row_ptr[row + 1]++;
+    }
+  }
+
+  // Compute row pointers (prefix sum).
+  for (QOCOInt i = 0; i < m; ++i) {
+    h_csr_row_ptr[i + 1] += h_csr_row_ptr[i];
+  }
+
+  // Temporary array to track the current position in each row.
+  QOCOInt* row_pos = (QOCOInt*)qoco_malloc(m * sizeof(QOCOInt));
+  for (QOCOInt i = 0; i < m; ++i) {
+    row_pos[i] = h_csr_row_ptr[i];
+  }
+
+  // Fill CSR arrays.
+  for (QOCOInt col = 0; col < n; ++col) {
+    for (QOCOInt idx = csc->p[col]; idx < csc->p[col + 1]; ++idx) {
+      QOCOInt row = csc->i[idx];
+      QOCOInt csr_idx = row_pos[row]++;
+      h_csr_col_ind[csr_idx] = col;
+      h_csr_val[csr_idx] = csc->x[idx];
+    }
+  }
+
+  qoco_free(row_pos);
+
+  // Allocate device memory.
+  CUDA_CHECK(cudaMalloc(csr_row_ptr, (m + 1) * sizeof(QOCOInt)));
+  CUDA_CHECK(cudaMalloc(csr_col_ind, nnz * sizeof(QOCOInt)));
+  CUDA_CHECK(cudaMalloc(csr_val, nnz * sizeof(QOCOFloat)));
+
+  // Copy to device.
+  CUDA_CHECK(cudaMemcpy(*csr_row_ptr, h_csr_row_ptr, (m + 1) * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(*csr_col_ind, h_csr_col_ind, nnz * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(*csr_val, h_csr_val, nnz * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+
+  // Free host memory.
+  qoco_free(h_csr_row_ptr);
+  qoco_free(h_csr_col_ind);
+  qoco_free(h_csr_val);
+}
+
+static LinSysData* cudss_setup(QOCOProblemData* data, QOCOSettings* settings,
+                               QOCOInt Wnnz)
+{
+  QOCOInt Kn = data->n + data->m + data->p;
+
+  LinSysData* linsys_data = (LinSysData*)qoco_malloc(sizeof(LinSysData));
+
+  // Initialize cuDSS.
+#ifdef HAVE_CUDSS
+  CUDSS_CHECK(cudssCreate(&linsys_data->handle));
+  CUDSS_CHECK(cudssConfigCreate(&linsys_data->config));
+  CUDSS_CHECK(cudssDataCreate(linsys_data->handle, &linsys_data->data));
+#else
+  // cuDSS not available - set the handles to NULL.
+  linsys_data->handle = NULL;
+  linsys_data->config = NULL;
+  linsys_data->data = NULL;
+#endif
+
+  // Initialize cuSPARSE.
+  cusparseCreate(&linsys_data->cusparse_handle);
+  cusparseCreateMatDescr(&linsys_data->descr);
+  cusparseSetMatType(linsys_data->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+  cusparseSetMatIndexBase(linsys_data->descr, CUSPARSE_INDEX_BASE_ZERO);
+
+  // Initialize cuBLAS.
+  cublasCreate(&linsys_data->cublas_handle);
+
+  // Allocate vector buffers.
+  linsys_data->xyzbuff1 = (QOCOFloat*)qoco_malloc(sizeof(QOCOFloat) * Kn);
+  linsys_data->xyzbuff2 = (QOCOFloat*)qoco_malloc(sizeof(QOCOFloat) * Kn);
+  CUDA_CHECK(cudaMalloc(&linsys_data->d_xyzbuff1, sizeof(QOCOFloat) * Kn));
+  CUDA_CHECK(cudaMalloc(&linsys_data->d_xyzbuff2, sizeof(QOCOFloat) * Kn));
+  linsys_data->Wnnz = Wnnz;
+
+  // Allocate memory for the mappings to the KKT matrix.
+  linsys_data->nt2kkt = (QOCOInt*)qoco_calloc(Wnnz, sizeof(QOCOInt));
+  linsys_data->ntdiag2kkt = (QOCOInt*)qoco_calloc(data->m, sizeof(QOCOInt));
+  linsys_data->PregtoKKT = data->P ? (QOCOInt*)qoco_calloc(get_nnz(data->P), sizeof(QOCOInt)) : NULL;
+  linsys_data->AttoKKT = (QOCOInt*)qoco_calloc(get_nnz(data->A), sizeof(QOCOInt));
+  linsys_data->GttoKKT = (QOCOInt*)qoco_calloc(get_nnz(data->G), sizeof(QOCOInt));
+
+  QOCOInt* nt2kkt_temp = (QOCOInt*)qoco_calloc(Wnnz, sizeof(QOCOInt));
+  QOCOInt* ntdiag2kkt_temp = (QOCOInt*)qoco_calloc(data->m, sizeof(QOCOInt));
+  QOCOInt* PregtoKKT_temp = data->P ? (QOCOInt*)qoco_calloc(get_nnz(data->P), sizeof(QOCOInt)) : NULL;
+  QOCOInt* AttoKKT_temp = (QOCOInt*)qoco_calloc(get_nnz(data->A), sizeof(QOCOInt));
+  QOCOInt* GttoKKT_temp = (QOCOInt*)qoco_calloc(get_nnz(data->G), sizeof(QOCOInt));
+
+  // Construct the KKT matrix (no permutation for the CUDA backend).
+  linsys_data->K = construct_kkt(
+      data->P ? get_csc_matrix(data->P) : NULL, get_csc_matrix(data->A),
+      get_csc_matrix(data->G), get_csc_matrix(data->At),
+      get_csc_matrix(data->Gt), settings->kkt_static_reg, data->n, data->m,
+      data->p, data->l, data->nsoc, data->q, PregtoKKT_temp, AttoKKT_temp,
+      GttoKKT_temp, nt2kkt_temp, ntdiag2kkt_temp, Wnnz);
+
+  // No AMD ordering or permutation - use the KKT matrix directly with an
+  // identity permutation (no-op).
+  linsys_data->p = (QOCOInt*)qoco_malloc(linsys_data->K->n * sizeof(QOCOInt));
+  linsys_data->pinv = (QOCOInt*)qoco_malloc(linsys_data->K->n * sizeof(QOCOInt));
+  for (QOCOInt i = 0; i < linsys_data->K->n; ++i) {
+    linsys_data->p[i] = i;
+    linsys_data->pinv[i] = i;
+  }
+
+  // Copy the mappings directly (no permutation).
+  for (QOCOInt i = 0; i < Wnnz; ++i) {
+    linsys_data->nt2kkt[i] = nt2kkt_temp[i];
+  }
+  for (QOCOInt i = 0; i < data->m; ++i) {
+    linsys_data->ntdiag2kkt[i] = ntdiag2kkt_temp[i];
+  }
+
+  if (data->P && PregtoKKT_temp) {
+    for (QOCOInt i = 0; i < get_nnz(data->P); ++i) {
+      linsys_data->PregtoKKT[i] = PregtoKKT_temp[i];
+    }
+  }
+
+  for (QOCOInt i = 0; i < get_nnz(data->A); ++i) {
+    linsys_data->AttoKKT[i] = AttoKKT_temp[i];
+  }
+
+  for (QOCOInt i = 0; i < get_nnz(data->G); ++i) {
+    linsys_data->GttoKKT[i] = GttoKKT_temp[i];
+  }
+
+  qoco_free(nt2kkt_temp);
+  qoco_free(ntdiag2kkt_temp);
+  qoco_free(PregtoKKT_temp);
+  qoco_free(AttoKKT_temp);
+  qoco_free(GttoKKT_temp);
+
+  // Convert the KKT matrix from CSC (CPU) to CSR (GPU) for cuDSS - done once
+  // during setup. The KKT matrix is formed on the CPU in CSC form.
+  QOCOInt* csr_row_ptr;
+  QOCOInt* csr_col_ind;
+  QOCOFloat* csr_val;
+
+  csc_to_csr_device(linsys_data->K, &csr_row_ptr, &csr_col_ind, &csr_val,
+                    linsys_data->cusparse_handle);
+
+  // Store the CSR arrays on the GPU persistently (they are updated in place,
+  // not recreated).
+  linsys_data->d_csr_row_ptr = csr_row_ptr;
+  linsys_data->d_csr_col_ind = csr_col_ind;
+  linsys_data->d_csr_val = csr_val;
+
+  // Create the cuDSS matrix in CSR format.
+#ifdef HAVE_CUDSS
+  // cuDSS CSR format: rowEnd may be NULL (cuDSS computes it from rowStart).
+  // Store row_end for potential future use, but pass NULL to cuDSS.
but pass NULL to cuDSS + QOCOInt* csr_row_end; + CUDA_CHECK(cudaMalloc(&csr_row_end, Kn * sizeof(QOCOInt))); + // Copy rowStart[1..n] to rowEnd[0..n-1] + CUDA_CHECK(cudaMemcpy(csr_row_end, &csr_row_ptr[1], Kn * sizeof(QOCOInt), cudaMemcpyDeviceToDevice)); + linsys_data->d_csr_row_end = csr_row_end; + + // Determine data types + cudaDataType_t indexType = CUDA_R_32I; // QOCOInt is int32_t + cudaDataType_t valueType_setup = (sizeof(QOCOFloat) == 8) ? CUDA_R_64F : CUDA_R_32F; + + // KKT matrix is symmetric (upper triangular stored) + // Use CUDSS_MTYPE_SPD (symmetric positive definite) like the working example + // Pass NULL for rowEnd - cuDSS will compute it from rowStart + CUDSS_CHECK(cudssMatrixCreateCsr(&linsys_data->K_csr, + (int64_t)Kn, (int64_t)Kn, (int64_t)linsys_data->K->nnz, + csr_row_ptr, NULL, csr_col_ind, csr_val, + indexType, valueType_setup, + CUDSS_MTYPE_SYMMETRIC, CUDSS_MVIEW_UPPER, CUDSS_BASE_ZERO)); + + // Run analysis once during setup (data structure already created above) + linsys_data->analysis_done = 0; // Will be set to 1 after first analysis + #else + // cuDSS not available - set to NULL (solve will use fallback) + linsys_data->K_csr = NULL; + linsys_data->d_rhs_matrix = NULL; + linsys_data->d_xyz_matrix = NULL; + linsys_data->d_csr_row_ptr = NULL; + linsys_data->d_csr_row_end = NULL; + linsys_data->d_csr_col_ind = NULL; + linsys_data->d_csr_val = NULL; + #endif + + // Copy mappings to GPU for direct updates + CUDA_CHECK(cudaMalloc(&linsys_data->d_nt2kkt, Wnnz * sizeof(QOCOInt))); + CUDA_CHECK(cudaMalloc(&linsys_data->d_ntdiag2kkt, data->m * sizeof(QOCOInt))); + if (data->P && linsys_data->PregtoKKT) { + CUDA_CHECK(cudaMalloc(&linsys_data->d_PregtoKKT, get_nnz(data->P) * sizeof(QOCOInt))); + CUDA_CHECK(cudaMemcpy(linsys_data->d_PregtoKKT, linsys_data->PregtoKKT, get_nnz(data->P) * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + } else { + linsys_data->d_PregtoKKT = NULL; + } + CUDA_CHECK(cudaMalloc(&linsys_data->d_AttoKKT, get_nnz(data->A) * sizeof(QOCOInt))); + CUDA_CHECK(cudaMalloc(&linsys_data->d_GttoKKT, get_nnz(data->G) * sizeof(QOCOInt))); + + CUDA_CHECK(cudaMemcpy(linsys_data->d_nt2kkt, linsys_data->nt2kkt, Wnnz * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(linsys_data->d_ntdiag2kkt, linsys_data->ntdiag2kkt, data->m * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(linsys_data->d_AttoKKT, linsys_data->AttoKKT, get_nnz(data->A) * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(linsys_data->d_GttoKKT, linsys_data->GttoKKT, get_nnz(data->G) * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + + // Copy CSC structure to GPU (for converting updates in factor) + CUDA_CHECK(cudaMalloc(&linsys_data->d_csc_p, (linsys_data->K->n + 1) * sizeof(QOCOInt))); + CUDA_CHECK(cudaMalloc(&linsys_data->d_csc_i, linsys_data->K->nnz * sizeof(QOCOInt))); + CUDA_CHECK(cudaMemcpy(linsys_data->d_csc_p, linsys_data->K->p, (linsys_data->K->n + 1) * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(linsys_data->d_csc_i, linsys_data->K->i, linsys_data->K->nnz * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + + // Allocate device vectors for rhs and xyz + // These will be used by the cuDSS dense matrix wrappers + QOCOFloat* d_rhs_ptr = NULL; + QOCOFloat* d_xyz_ptr = NULL; + CUDA_CHECK(cudaMalloc(&d_rhs_ptr, Kn * sizeof(QOCOFloat))); + CUDA_CHECK(cudaMalloc(&d_xyz_ptr, Kn * sizeof(QOCOFloat))); + + // Store in struct for cleanup + linsys_data->d_xyzbuff1 = d_rhs_ptr; // Used for rhs buffer + linsys_data->d_xyzbuff2 = d_xyz_ptr; 
+
+  // Create dense matrix wrappers for the solution and RHS vectors (column
+  // vectors). d_rhs_matrix wraps d_xyzbuff1 and d_xyz_matrix wraps
+  // d_xyzbuff2.
+#ifdef HAVE_CUDSS
+  CUDSS_CHECK(cudssMatrixCreateDn(&linsys_data->d_rhs_matrix, (int64_t)Kn, 1, (int64_t)Kn, linsys_data->d_xyzbuff1, valueType_setup, CUDSS_LAYOUT_COL_MAJOR));
+  CUDSS_CHECK(cudssMatrixCreateDn(&linsys_data->d_xyz_matrix, (int64_t)Kn, 1, (int64_t)Kn, linsys_data->d_xyzbuff2, valueType_setup, CUDSS_LAYOUT_COL_MAJOR));
+#else
+  linsys_data->d_rhs_matrix = NULL;
+  linsys_data->d_xyz_matrix = NULL;
+#endif
+
+  return linsys_data;
+}
+
+// CUDA kernel to update the CSR values from the updated CSC KKT matrix.
+// Maps CSC indices to CSR indices and updates the values.
+__global__ void update_csr_from_csc_kernel(
+    const QOCOFloat* csc_val,   // Updated CSC values (on GPU, copied from the CPU KKT)
+    QOCOFloat* csr_val,         // CSR values to update (on GPU)
+    const QOCOInt* csr_row_ptr, // CSR row pointers
+    const QOCOInt* csr_col_ind, // CSR column indices
+    const QOCOInt* csc_p,       // CSC column pointers (on GPU)
+    const QOCOInt* csc_i,       // CSC row indices (on GPU)
+    QOCOInt n,                  // Matrix dimension
+    QOCOInt nnz)                // Number of nonzeros
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= nnz) return;
+
+  // For the CSR element at index idx, find the corresponding CSC element:
+  //   CSR: row = the row with csr_row_ptr[row] <= idx < csr_row_ptr[row+1],
+  //        col = csr_col_ind[idx]
+  //   CSC: the idx_csc in column col with csc_i[idx_csc] == row.
+
+  // Binary search for the row (largest row with csr_row_ptr[row] <= idx).
+  QOCOInt row = 0;
+  QOCOInt left = 0, right = n - 1;
+  while (left <= right) {
+    QOCOInt mid = (left + right) / 2;
+    if (csr_row_ptr[mid] <= idx) {
+      row = mid;
+      left = mid + 1;
+    } else {
+      right = mid - 1;
+    }
+  }
+
+  QOCOInt col = csr_col_ind[idx];
+
+  // Find the corresponding CSC element: search column col for the row.
+  for (QOCOInt csc_idx = csc_p[col]; csc_idx < csc_p[col + 1]; csc_idx++) {
+    if (csc_i[csc_idx] == row) {
+      csr_val[idx] = csc_val[csc_idx];
+      return;
+    }
+  }
+}
+
+static void cudss_factor(LinSysData* linsys_data, QOCOInt n,
+                         QOCOFloat kkt_dynamic_reg)
+{
+  // Update the CSR matrix values directly on the GPU - no structural
+  // CPU-GPU transfers during solve. The KKT matrix was updated on the CPU
+  // (in CSC form), so copy the updated CSC values to the GPU and update the
+  // CSR values with a kernel.
+#ifdef HAVE_CUDSS
+  // Copy the updated CSC KKT values to the GPU (values only, not structure).
+  // This is the only CPU-GPU transfer during solve iterations.
+  QOCOFloat* d_csc_val;
+  CUDA_CHECK(cudaMalloc(&d_csc_val, linsys_data->K->nnz * sizeof(QOCOFloat)));
+  CUDA_CHECK(cudaMemcpy(d_csc_val, linsys_data->K->x, linsys_data->K->nnz * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+
+  // Update the CSR values from CSC using a CUDA kernel (the CSC structure is
+  // already on the GPU from setup).
+  QOCOInt threadsPerBlock = 256;
+  QOCOInt numBlocks = (linsys_data->K->nnz + threadsPerBlock - 1) / threadsPerBlock;
+  update_csr_from_csc_kernel<<<numBlocks, threadsPerBlock>>>(
+      d_csc_val, linsys_data->d_csr_val,
+      linsys_data->d_csr_row_ptr, linsys_data->d_csr_col_ind,
+      linsys_data->d_csc_p, linsys_data->d_csc_i,
+      linsys_data->K->n, linsys_data->K->nnz);
+  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  // Free the temporary CSC values buffer.
+  cudaFree(d_csc_val);
+
+  // Update the cuDSS matrix with the new values.
+  CUDSS_CHECK(cudssMatrixSetValues(linsys_data->K_csr, linsys_data->d_csr_val));
linsys_data->d_csr_val)); + + // Run factorization - data structure persists from setup + // Analysis only needed once (first factorization), then just factorization + if (!linsys_data->analysis_done) { + cudssStatus_t status_analysis = cudssExecute(linsys_data->handle, CUDSS_PHASE_ANALYSIS, + linsys_data->config, linsys_data->data, + linsys_data->K_csr, linsys_data->d_xyz_matrix, linsys_data->d_rhs_matrix); + if (status_analysis != CUDSS_STATUS_SUCCESS) { + const char* err_str = (status_analysis == CUDSS_STATUS_NOT_INITIALIZED) ? "NOT_INITIALIZED" : \ + (status_analysis == CUDSS_STATUS_ALLOC_FAILED) ? "ALLOC_FAILED" : \ + (status_analysis == CUDSS_STATUS_INVALID_VALUE) ? "INVALID_VALUE" : \ + (status_analysis == CUDSS_STATUS_NOT_SUPPORTED) ? "NOT_SUPPORTED" : \ + (status_analysis == CUDSS_STATUS_EXECUTION_FAILED) ? "EXECUTION_FAILED" : \ + (status_analysis == CUDSS_STATUS_INTERNAL_ERROR) ? "INTERNAL_ERROR" : "UNKNOWN"; + fprintf(stderr, "ERROR: cuDSS analysis failed with status %d (%s)\n", (int)status_analysis, err_str); + exit(1); + } + linsys_data->analysis_done = 1; + } + + cudssStatus_t status_factor = cudssExecute(linsys_data->handle, CUDSS_PHASE_FACTORIZATION, + linsys_data->config, linsys_data->data, + linsys_data->K_csr, linsys_data->d_xyz_matrix, linsys_data->d_rhs_matrix); + if (status_factor != CUDSS_STATUS_SUCCESS) { + const char* err_str = (status_factor == CUDSS_STATUS_NOT_INITIALIZED) ? "NOT_INITIALIZED" : \ + (status_factor == CUDSS_STATUS_ALLOC_FAILED) ? "ALLOC_FAILED" : \ + (status_factor == CUDSS_STATUS_INVALID_VALUE) ? "INVALID_VALUE" : \ + (status_factor == CUDSS_STATUS_NOT_SUPPORTED) ? "NOT_SUPPORTED" : \ + (status_factor == CUDSS_STATUS_EXECUTION_FAILED) ? "EXECUTION_FAILED" : \ + (status_factor == CUDSS_STATUS_INTERNAL_ERROR) ? 
"INTERNAL_ERROR" : "UNKNOWN"; + fprintf(stderr, "ERROR: cuDSS factorization failed with status %d (%s)\n", (int)status_factor, err_str); + exit(1); + } + #else + // cuDSS not available - skip + #endif +} + +// Helper function to map work->rhs and work->xyz to device buffers during solve phase +extern "C" { +void map_work_buffers_to_device(void* linsys_data_ptr, QOCOWorkspace* work) +{ + // During solve phase, functions constructing rhs/xyz will operate on device buffers + // This function is called at start of solve phase + // Map work->rhs->d_data and work->xyz->d_data to point to our device buffers + // so that RHS construction and solution storage happen directly in cuDSS buffers + LinSysData* linsys_data = (LinSysData*)linsys_data_ptr; + QOCOInt n = work->data->n + work->data->m + work->data->p; + + // Store original device pointers (if any) so we can restore them later + // For now, just point d_data to our buffers + if (work->rhs) { + QOCOFloat* old_rhs = work->rhs->d_data; + if (old_rhs && old_rhs != linsys_data->d_xyzbuff1) { + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff1, old_rhs, + n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + cudaFree(old_rhs); + } else if (!old_rhs) { + CUDA_CHECK(cudaMemset(linsys_data->d_xyzbuff1, 0, n * sizeof(QOCOFloat))); + } + work->rhs->d_data = linsys_data->d_xyzbuff1; + } + + if (work->xyz) { + QOCOFloat* old_xyz = work->xyz->d_data; + if (old_xyz && old_xyz != linsys_data->d_xyzbuff2) { + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff2, old_xyz, + n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + cudaFree(old_xyz); + } else if (!old_xyz) { + CUDA_CHECK(cudaMemset(linsys_data->d_xyzbuff2, 0, n * sizeof(QOCOFloat))); + } + work->xyz->d_data = linsys_data->d_xyzbuff2; + } +} + +QOCOFloat* get_device_rhs(void* linsys_data_ptr) +{ + LinSysData* linsys_data = (LinSysData*)linsys_data_ptr; + // Return the device buffer where RHS should be constructed + return linsys_data->d_xyzbuff1; +} + +QOCOFloat* get_device_xyz(void* linsys_data_ptr) +{ + LinSysData* linsys_data = (LinSysData*)linsys_data_ptr; + // Return the device buffer where solution will be written + return linsys_data->d_xyzbuff2; +} + +void unmap_work_buffers_from_device(void* linsys_data_ptr, QOCOWorkspace* work) +{ + // Copy final results from device buffers back to host buffers + // This is called AFTER solve completes, so it's the only CPU-GPU transfer after solve + LinSysData* linsys_data = (LinSysData*)linsys_data_ptr; + QOCOInt n = work->data->n + work->data->m + work->data->p; + + // Restore original device pointers (reallocate them) + if (work->rhs) { + // Allocate new device buffer for work->rhs->d_data + CUDA_CHECK(cudaMalloc(&work->rhs->d_data, n * sizeof(QOCOFloat))); + // Copy from our buffer to the restored buffer + CUDA_CHECK(cudaMemcpy(work->rhs->d_data, linsys_data->d_xyzbuff1, n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + // Copy to host + CUDA_CHECK(cudaMemcpy(work->rhs->data, work->rhs->d_data, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost)); + } + + if (work->xyz) { + // Allocate new device buffer for work->xyz->d_data + CUDA_CHECK(cudaMalloc(&work->xyz->d_data, n * sizeof(QOCOFloat))); + // Copy solution from d_xyzbuff2 (where cuDSS wrote it) to restored buffer + CUDA_CHECK(cudaMemcpy(work->xyz->d_data, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + // Copy to host + CUDA_CHECK(cudaMemcpy(work->xyz->data, work->xyz->d_data, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost)); + } +} +} + +static void cudss_solve(LinSysData* 
linsys_data, QOCOWorkspace* work, + QOCOFloat* b, QOCOFloat* x, QOCOInt iter_ref_iters) +{ + QOCOInt n = linsys_data->K->n; + (void)iter_ref_iters; // No iterative refinement for CUDA backend + // b is copied to d_xyzbuff1 if provided + // x will receive the solution from d_xyzbuff2 after cudssExecute + + // During solve phase, ALL data is on GPU - NO CPU-GPU transfers + // work->rhs->d_data points to d_xyzbuff1 (mapped in map_work_buffers_to_device) + // work->xyz->d_data points to d_xyzbuff2 (mapped in map_work_buffers_to_device) + // RHS was constructed directly into d_xyzbuff1, solution will be written to d_xyzbuff2 + + // Ensure RHS buffer contains the contents of b (in case caller passed a different pointer) + if (b) { + if (b != linsys_data->d_xyzbuff1) { + cudaPointerAttributes attrs; + cudaError_t err = cudaPointerGetAttributes(&attrs, b); + if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) { + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff1, b, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToDevice)); + } else { + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff1, b, n * sizeof(QOCOFloat), + cudaMemcpyHostToDevice)); + } + } + } else { + CUDA_CHECK(cudaMemset(linsys_data->d_xyzbuff1, 0, n * sizeof(QOCOFloat))); + } + + // Clear solution buffer (d_xyz_matrix points to d_xyzbuff2) + CUDA_CHECK(cudaMemset(linsys_data->d_xyzbuff2, 0, n * sizeof(QOCOFloat))); + + // cuDSS API signature: cudssExecute(handle, phase, config, data, matrix, solution, rhs) + // where solution is the output and rhs is the input + // Note: d_rhs_matrix points to d_xyzbuff1, d_xyz_matrix points to d_xyzbuff2 + #ifdef HAVE_CUDSS + + cudssStatus_t status = cudssExecute(linsys_data->handle, CUDSS_PHASE_SOLVE, + linsys_data->config, linsys_data->data, + linsys_data->K_csr, linsys_data->d_xyz_matrix, linsys_data->d_rhs_matrix); + if (status != CUDSS_STATUS_SUCCESS) { + const char* err_str = (status == CUDSS_STATUS_NOT_INITIALIZED) ? "NOT_INITIALIZED" : \ + (status == CUDSS_STATUS_ALLOC_FAILED) ? "ALLOC_FAILED" : \ + (status == CUDSS_STATUS_INVALID_VALUE) ? "INVALID_VALUE" : \ + (status == CUDSS_STATUS_NOT_SUPPORTED) ? "NOT_SUPPORTED" : \ + (status == CUDSS_STATUS_EXECUTION_FAILED) ? "EXECUTION_FAILED" : \ + (status == CUDSS_STATUS_INTERNAL_ERROR) ? 
"INTERNAL_ERROR" : "UNKNOWN"; + fprintf(stderr, "ERROR: cuDSS solve failed with status %d (%s)\n", (int)status, err_str); + } else { + // Solution is now in d_xyzbuff2 (pointed to by d_xyz_matrix) + // Copy solution from d_xyz_matrix (d_xyzbuff2) to x + if (x) { + cudaPointerAttributes attrs; + cudaError_t err = cudaPointerGetAttributes(&attrs, x); + if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) { + // x is on device - device-to-device copy + CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToDevice)); + } else { + // x is on host - device-to-host copy + CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToHost)); + } + } + } + + // Solution is now in d_xyzbuff2 (pointed to by d_xyz_matrix and work->xyz->d_data) + // Solution has been copied to x (if provided) after successful cudssExecute + #else + // cuDSS not available - use fallback: copy solution from RHS (will fail convergence but won't crash) + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff2, linsys_data->d_xyzbuff1, n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + // Copy to x if provided + if (x) { + cudaPointerAttributes attrs; + cudaError_t err = cudaPointerGetAttributes(&attrs, x); + if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) { + CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToDevice)); + } else { + CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToHost)); + } + } + #endif + + // During solve phase, solution stays on device in d_xyzbuff2 (and work->xyz->d_data) + // It will be copied back to CPU only after solve completes (in unmap_work_buffers_from_device) +} + +static void cudss_initialize_nt(LinSysData* linsys_data, QOCOInt m) +{ + for (QOCOInt i = 0; i < linsys_data->Wnnz; ++i) { + linsys_data->K->x[linsys_data->nt2kkt[i]] = 0.0; + } + + // Set Nesterov-Todd block in KKT matrix to -I + for (QOCOInt i = 0; i < m; ++i) { + linsys_data->K->x[linsys_data->ntdiag2kkt[i]] = -1.0; + } + + // Update device matrix - redo CSR conversion + // In production, we'd update device matrix directly + // For now, we'll need to redo the conversion in factor function +} + +static void cudss_update_nt(LinSysData* linsys_data, QOCOFloat* WtW, + QOCOFloat kkt_static_reg, QOCOInt m) +{ + for (QOCOInt i = 0; i < linsys_data->Wnnz; ++i) { + linsys_data->K->x[linsys_data->nt2kkt[i]] = -WtW[i]; + } + + // Regularize Nesterov-Todd block of KKT matrix + for (QOCOInt i = 0; i < m; ++i) { + linsys_data->K->x[linsys_data->ntdiag2kkt[i]] -= kkt_static_reg; + } +} + +static void cudss_update_data(LinSysData* linsys_data, QOCOProblemData* data) +{ + // Update P in KKT matrix + if (data->P && linsys_data->PregtoKKT) { + QOCOCscMatrix* Pcsc = get_csc_matrix(data->P); + for (QOCOInt i = 0; i < get_nnz(data->P); ++i) { + linsys_data->K->x[linsys_data->PregtoKKT[i]] = Pcsc->x[i]; + } + } + + // Update A in KKT matrix + QOCOCscMatrix* Acsc = get_csc_matrix(data->A); + for (QOCOInt i = 0; i < get_nnz(data->A); ++i) { + linsys_data->K->x[linsys_data->AttoKKT[data->AtoAt[i]]] = Acsc->x[i]; + } + + // Update G in KKT matrix + QOCOCscMatrix* Gcsc = get_csc_matrix(data->G); + for (QOCOInt i = 0; i < get_nnz(data->G); ++i) { + linsys_data->K->x[linsys_data->GttoKKT[data->GtoGt[i]]] = Gcsc->x[i]; + } + + // Update device matrix - convert to CSR and update + // Note: This is simplified; in production we'd maintain device copy +} + +static void cudss_cleanup(LinSysData* 
linsys_data) +{ + if (linsys_data->K_csr) { + cudssMatrixDestroy(linsys_data->K_csr); + } + if (linsys_data->d_rhs_matrix) { + cudssMatrixDestroy(linsys_data->d_rhs_matrix); + } + if (linsys_data->d_xyz_matrix) { + cudssMatrixDestroy(linsys_data->d_xyz_matrix); + } + if (linsys_data->data) { + cudssDataDestroy(linsys_data->handle, linsys_data->data); + } + if (linsys_data->config) { + cudssConfigDestroy(linsys_data->config); + } + if (linsys_data->handle) { + cudssDestroy(linsys_data->handle); + } + if (linsys_data->cusparse_handle) { + cusparseDestroy(linsys_data->cusparse_handle); + } + if (linsys_data->descr) { + cusparseDestroyMatDescr(linsys_data->descr); + } + if (linsys_data->cublas_handle) { + cublasDestroy(linsys_data->cublas_handle); + } + free_qoco_csc_matrix(linsys_data->K); + qoco_free(linsys_data->p); + qoco_free(linsys_data->pinv); + qoco_free(linsys_data->xyzbuff1); + qoco_free(linsys_data->xyzbuff2); + if (linsys_data->d_xyzbuff1) cudaFree(linsys_data->d_xyzbuff1); + if (linsys_data->d_xyzbuff2) cudaFree(linsys_data->d_xyzbuff2); + qoco_free(linsys_data->nt2kkt); + qoco_free(linsys_data->ntdiag2kkt); + qoco_free(linsys_data->PregtoKKT); + qoco_free(linsys_data->AttoKKT); + qoco_free(linsys_data->GttoKKT); + if (linsys_data->d_csr_row_ptr) cudaFree(linsys_data->d_csr_row_ptr); + if (linsys_data->d_csr_row_end) cudaFree(linsys_data->d_csr_row_end); + if (linsys_data->d_csr_col_ind) cudaFree(linsys_data->d_csr_col_ind); + if (linsys_data->d_csr_val) cudaFree(linsys_data->d_csr_val); + if (linsys_data->d_nt2kkt) cudaFree(linsys_data->d_nt2kkt); + if (linsys_data->d_ntdiag2kkt) cudaFree(linsys_data->d_ntdiag2kkt); + if (linsys_data->d_PregtoKKT) cudaFree(linsys_data->d_PregtoKKT); + if (linsys_data->d_AttoKKT) cudaFree(linsys_data->d_AttoKKT); + if (linsys_data->d_GttoKKT) cudaFree(linsys_data->d_GttoKKT); + if (linsys_data->d_csc_p) cudaFree(linsys_data->d_csc_p); + if (linsys_data->d_csc_i) cudaFree(linsys_data->d_csc_i); + qoco_free(linsys_data); +} + +// Export the backend struct +LinSysBackend backend = {.linsys_setup = cudss_setup, + .linsys_initialize_nt = cudss_initialize_nt, + .linsys_update_nt = cudss_update_nt, + .linsys_update_data = cudss_update_data, + .linsys_factor = cudss_factor, + .linsys_solve = cudss_solve, + .linsys_cleanup = cudss_cleanup}; + diff --git a/algebra/cuda/cudss_backend.h b/algebra/cuda/cudss_backend.h new file mode 100644 index 0000000..7467e75 --- /dev/null +++ b/algebra/cuda/cudss_backend.h @@ -0,0 +1,54 @@ +/** + * @file cudss_backend.h + * @author Govind M. Chari + * + * @section LICENSE + * + * Copyright (c) 2025, Govind M. Chari + * This source code is licensed under the BSD 3-Clause License + * + * @section DESCRIPTION + * + * Defines the cuDSS backend. 
+ */ + +#ifndef CUDSS_BACKEND_H +#define CUDSS_BACKEND_H + +#include "cuda_types.h" +#ifdef __cplusplus +extern "C" { +#endif +#include "common_linalg.h" +#include "kkt.h" +#ifdef __cplusplus +} +#endif +#include "structs.h" + +#ifdef HAVE_CUDSS +#include <cudss.h> +#else +// Stub definitions for cuDSS types if not available +typedef void* cudssHandle_t; +typedef void* cudssConfig_t; +typedef void* cudssData_t; +typedef void* cudssMatrix_t; +typedef enum { CUDSS_STATUS_SUCCESS } cudssStatus_t; +typedef enum { CUDSS_PHASE_ANALYSIS, CUDSS_PHASE_FACTORIZATION, CUDSS_PHASE_SOLVE } cudssPhase_t; +#define cudssCreate(x) (CUDSS_STATUS_SUCCESS) +#define cudssConfigCreate(x) (CUDSS_STATUS_SUCCESS) +#define cudssDataCreate(h, x) (CUDSS_STATUS_SUCCESS) +#define cudssMatrixCreateCsr(...) (CUDSS_STATUS_SUCCESS) /* variadic: the real call passes 13 arguments */ +#define cudssMatrixSetValues(m, v) (CUDSS_STATUS_SUCCESS) +#define cudssExecute(h, p, c, d, m, x, b) (CUDSS_STATUS_SUCCESS) +#define cudssMatrixDestroy(m) ((void)0) +#define cudssDataDestroy(h, d) ((void)0) /* two arguments, matching the call in cudss_cleanup */ +#define cudssConfigDestroy(c) ((void)0) +#define cudssDestroy(h) ((void)0) +#endif + +extern LinSysBackend backend; + +#endif /* #ifndef CUDSS_BACKEND_H */ + diff --git a/devtools/run_tests.sh b/devtools/run_tests.sh index 450c81e..a6d22f7 100755 --- a/devtools/run_tests.sh +++ b/devtools/run_tests.sh @@ -1 +1,6 @@ -export CXX=/usr/local/bin/clang++ && export CC=/usr/local/bin/clang && cd build && cmake -DQOCO_BUILD_TYPE:STR=Release -DENABLE_TESTING:BOOL=True -DBUILD_QOCO_BENCHMARK_RUNNER:BOOL=True .. && make && ctest --verbose && cd .. \ No newline at end of file +#!/bin/bash +# Note: cuDSS was compiled against cuBLAS 12, but system has cuBLAS 13 +# The build will succeed with --allow-shlib-undefined, but runtime may fail +# To fix: either install cuBLAS 12 or get cuDSS compiled against cuBLAS 13 + +export CXX=/usr/bin/clang++ && export CC=/usr/bin/clang && cd build && cmake -DQOCO_BUILD_TYPE:STR=Release -DENABLE_TESTING:BOOL=True -DBUILD_QOCO_BENCHMARK_RUNNER:BOOL=False -DQOCO_ALGEBRA_BACKEND:STR=builtin .. && make -j$(nproc) && cd tests && ./markowitz_test \ No newline at end of file diff --git a/include/common_linalg.h b/include/common_linalg.h index ac3180a..2adb5a5 100644 --- a/include/common_linalg.h +++ b/include/common_linalg.h @@ -69,6 +69,14 @@ void unregularize(QOCOCscMatrix* M, QOCOFloat lambda); */ void col_inf_norm_USymm(const QOCOCscMatrix* M, QOCOFloat* norm); +/** + * @brief Computes the infinity norm of each column of M and stores in norm. + * + * @param M An m by n sparse matrix in CSC format. + * @param norm Result vector of length n. + */ +void col_inf_norm(const QOCOCscMatrix* M, QOCOFloat* norm); + /** * @brief Computes the infinity norm of each row of M and stores in norm. * diff --git a/include/qoco_linalg.h b/include/qoco_linalg.h index 8bbe7f2..1ab2212 100644 --- a/include/qoco_linalg.h +++ b/include/qoco_linalg.h @@ -16,6 +16,10 @@ #define QOCO_LINALG_H #include "definitions.h" +typedef struct QOCOMatrix_ QOCOMatrix; +typedef struct QOCOVectorf_ QOCOVectorf; +typedef struct QOCOVectori_ QOCOVectori; + /** * @brief Compressed sparse column format matrices. * @@ -49,6 +53,210 @@ typedef struct { */ QOCOCscMatrix* new_qoco_csc_matrix(const QOCOCscMatrix* A); + +/** + * @brief Allocates a new QOCOMatrix and copies A to it. + * + * @param A Matrix to copy. + * @return Pointer to new constructed matrix. + */ +QOCOMatrix* new_qoco_matrix(const QOCOCscMatrix* A); +
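A minimal usage sketch of the container API being declared here (hypothetical values; not part of the patch):

/* Hypothetical usage (not part of the patch): wrap a 2x2 upper-triangular CSC
 * matrix and a zero-initialized vector in the new containers, then release
 * both. */
static void container_demo(void)
{
  QOCOFloat Ax[] = {2.0, 1.0, 3.0};
  QOCOInt Ai[] = {0, 0, 1};                   /* row indices   */
  QOCOInt Ap[] = {0, 1, 3};                   /* column starts */
  QOCOCscMatrix A = {.m = 2, .n = 2, .nnz = 3, .x = Ax, .i = Ai, .p = Ap};

  QOCOMatrix* M = new_qoco_matrix(&A);        /* deep copy into the backend */
  QOCOVectorf* v = new_qoco_vectorf(NULL, 2); /* NULL input -> zero fill    */
  set_element_vectorf(v, 0, 1.0);
  QOCOFloat v0 = get_element_vectorf(v, 0);   /* reads back 1.0 */
  (void)v0;
  free_qoco_vectorf(v);
  free_qoco_matrix(M);
}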
+/** + * @brief Frees QOCOMatrix. + * + * @param A Matrix to free. + */ +void free_qoco_matrix(QOCOMatrix* A); + +/** + * @brief Allocates a new QOCOVectorf and copies x to it. + * + * @param x vector to copy. + * @param n length of vector. + * @return Pointer to new constructed vector. + */ +QOCOVectorf* new_qoco_vectorf(const QOCOFloat* x, QOCOInt n); + +/** + * @brief Frees QOCOVectorf. + * + * @param x Vector to free. + */ +void free_qoco_vectorf(QOCOVectorf* x); + +/** + * @brief Returns the number of nonzeros in a QOCOMatrix. + * + * @param A Input matrix. + * @return Number of nonzeros. + */ +QOCOInt get_nnz(const QOCOMatrix* A); + +/** + * @brief Returns x[idx]. + * + * @param x Input vector. + * @param idx Index. + * @return x[idx]. + */ +QOCOFloat get_element_vectorf(const QOCOVectorf* x, QOCOInt idx); + +/** + * @brief Performs x[idx] = data. + * + * @param x Input vector. + * @param idx Index. + * @param data Data to store. + */ +void set_element_vectorf(QOCOVectorf* x, QOCOInt idx, QOCOFloat data); + +/** + * @brief Performs output .= 1.0 ./ input. + * + * @param input Input vector. + * @param output Output vector. + */ +void reciprocal_vectorf(const QOCOVectorf* input, QOCOVectorf* output); + +/** + * @brief Returns &x[idx]. + * + * @param x Input vector. + * @param idx Index. + * @return &x[idx]. + */ +QOCOFloat* get_pointer_vectorf(const QOCOVectorf* x, QOCOInt idx); + +/** + * @brief Returns the underlying data array of a QOCOVectorf. + * + * @param x Input vector. + * @return Pointer to underlying data array. + */ +QOCOFloat* get_data_vectorf(const QOCOVectorf* x); + +/** + * @brief Syncs vector data from host to device if needed (CUDA backend only). + * This is a no-op for non-CUDA backends. + * Note: This should NOT be called during qoco_solve to avoid CPU-GPU copies. + * + * @param v Input vector. + */ +void sync_vector_to_device_if_needed(QOCOVectorf* v); + +/** + * @brief Sets the solve phase flag (CUDA backend only). + * During solve phase, get_data_vectorf returns device pointers. + * This prevents CPU-GPU copies during the solve loop. + * + * @param active 1 if solve phase is active, 0 otherwise. + */ +void set_solve_phase(int active); + +/** + * @brief Gets the solve phase flag (CUDA backend only). + * Returns 1 if solve phase is active, 0 otherwise. + * + * @return 1 if solve phase is active, 0 otherwise. + */ +int get_solve_phase(void); + +/** + * @brief Copies vector data from device to host (CUDA backend only). + * This is a no-op for non-CUDA backends. Used to copy solution vectors + * after solver terminates. + * + * @param src Source vector (on device). + * @param dst Destination buffer (on host). + * @param n Number of elements to copy. + */ +void copy_vector_from_device(QOCOVectorf* src, QOCOFloat* dst, QOCOInt n); + +/** + * @brief Returns the length of a QOCOVectorf. + * + * @param x Input vector. + * @return Length of vector. + */ +QOCOInt get_length_vectorf(const QOCOVectorf* x); + +/** + * @brief Returns the underlying CSC matrix from a QOCOMatrix. + * This function provides access to implementation-specific data. + * For builtin implementation, returns the internal CSC matrix. + * + * @param M Input matrix. + * @return Pointer to underlying CSC matrix. + */ +QOCOCscMatrix* get_csc_matrix(const QOCOMatrix* M); + +/** + * @brief Sparse matrix vector multiplication for QOCOMatrix where M is + * symmetric and only the upper triangular part is given. Computes r = M * v. + * + * @param M Upper triangular part of M. + * @param v Vector. + * @param r Result.
+ */ +void USpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r); + +/** + * @brief Sparse matrix vector multiplication for QOCOMatrix. Computes r = M * v. + * + * @param M Matrix. + * @param v Vector. + * @param r Result. + */ +void SpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r); + +/** + * @brief Sparse matrix vector multiplication for QOCOMatrix where M is first + * transposed. Computes r = M^T * v. + * + * @param M Matrix. + * @param v Vector. + * @param r Result. + */ +void SpMtv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r); + +/** + * @brief Computes the infinity norm of each column (or equivalently row) of a + * symmetric sparse matrix M where only the upper triangular portion of M is + * given. Works with QOCOMatrix*. + * + * @param M Upper triangular part of sparse symmetric matrix. + * @param norm Result vector of length n. + */ +void col_inf_norm_USymm_matrix(const QOCOMatrix* M, QOCOFloat* norm); + +/** + * @brief Computes the infinity norm of each column of M and stores in norm. + * Works with QOCOMatrix*. + * + * @param M An m by n sparse matrix. + * @param norm Result vector of length n. + */ +void col_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm); + +/** + * @brief Computes the infinity norm of each row of M and stores in norm. + * Works with QOCOMatrix*. + * + * @param M An m by n sparse matrix. + * @param norm Result vector of length m. + */ +void row_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm); + +/** + * @brief Scales the rows of M by E and columns of M by D. + * M = diag(E) * M * diag(D). Works with QOCOMatrix*. + * + * @param M An m by n sparse matrix. + * @param E Vector of length m. + * @param D Vector of length n. + */ +void row_col_scale_matrix(QOCOMatrix* M, const QOCOFloat* E, const QOCOFloat* D); + /** * @brief Frees all the internal arrays and the pointer to the QOCOCscMatrix. * Should only be used if QOCOCscMatrix and all internal arrays were malloc'ed. diff --git a/include/structs.h b/include/structs.h index c439d84..571286e 100644 --- a/include/structs.h +++ b/include/structs.h @@ -25,25 +25,25 @@ */ typedef struct { /** Quadratic cost term. */ - QOCOCscMatrix* P; // + QOCOMatrix* P; // /** Linear cost term. */ - QOCOFloat* c; + QOCOVectorf* c; /** Affine equality constraint matrix. */ - QOCOCscMatrix* A; + QOCOMatrix* A; /** Transpose of A (used in Ruiz for fast row norm calculations of A). */ - QOCOCscMatrix* At; + QOCOMatrix* At; /** Affine equality constraint offset. */ - QOCOFloat* b; + QOCOVectorf* b; /** Conic constraint matrix. */ - QOCOCscMatrix* G; + QOCOMatrix* G; /** Transpose of G (used in Ruiz for fast row norm calculations of G). */ - QOCOCscMatrix* Gt; + QOCOMatrix* Gt; /** Mapping from A to At. */ QOCOInt* AtoAt; @@ -52,7 +52,7 @@ typedef struct { QOCOInt* GtoGt; /** Conic constraint offset. */ - QOCOFloat* h; + QOCOVectorf* h; /** Dimension of non-negative orthant in cone C. */ QOCOInt l; @@ -126,25 +126,25 @@ typedef struct { typedef struct { /** Diagonal of scaling matrix. */ - QOCOFloat* delta; + QOCOVectorf* delta; /** Diagonal of scaling matrix. */ - QOCOFloat* Druiz; + QOCOVectorf* Druiz; /** Diagonal of scaling matrix. */ - QOCOFloat* Eruiz; + QOCOVectorf* Eruiz; /** Diagonal of scaling matrix. */ - QOCOFloat* Fruiz; + QOCOVectorf* Fruiz; /** Inverse of Druiz. */ - QOCOFloat* Dinvruiz; + QOCOVectorf* Dinvruiz; /** Inverse of Eruiz. */ - QOCOFloat* Einvruiz; + QOCOVectorf* Einvruiz; /** Inverse of Fruiz. 
*/ - QOCOFloat* Finvruiz; + QOCOVectorf* Finvruiz; /** Cost scaling factor. */ QOCOFloat k; @@ -168,16 +168,16 @@ typedef struct { QOCOScaling* scaling; /** Iterate of primal variables. */ - QOCOFloat* x; + QOCOVectorf* x; /** Iterate of slack variables associated with conic constraint. */ - QOCOFloat* s; + QOCOVectorf* s; /** Iterate of dual variables associated with affine equality constraint. */ - QOCOFloat* y; + QOCOVectorf* y; /** Iterate of dual variables associated with conic constraint. */ - QOCOFloat* z; + QOCOVectorf* z; /** Gap (s'*z / m) */ QOCOFloat mu; @@ -239,10 +239,10 @@ typedef struct { QOCOFloat* Ds; /** RHS of KKT system. */ - QOCOFloat* rhs; + QOCOVectorf* rhs; /** Solution of KKT system. */ - QOCOFloat* xyz; + QOCOVectorf* xyz; /** Buffer of size n + m + p. */ QOCOFloat* xyzbuff1; diff --git a/src/common_linalg.c b/src/common_linalg.c index d7634aa..107e4d3 100644 --- a/src/common_linalg.c +++ b/src/common_linalg.c @@ -150,6 +150,24 @@ void col_inf_norm_USymm(const QOCOCscMatrix* M, QOCOFloat* norm) } } +void col_inf_norm(const QOCOCscMatrix* M, QOCOFloat* norm) +{ + // Initialize norm array to zero + for (QOCOInt j = 0; j < M->n; j++) { + norm[j] = 0.0; + } + + // For CSC format, column norms are computed efficiently by iterating over columns + for (QOCOInt j = 0; j < M->n; j++) { + for (QOCOInt idx = M->p[j]; idx < M->p[j + 1]; idx++) { + QOCOFloat val = qoco_abs(M->x[idx]); + if (val > norm[j]) { + norm[j] = val; + } + } + } +} + void row_inf_norm(const QOCOCscMatrix* M, QOCOFloat* norm) { for (QOCOInt i = 0; i < M->m; ++i) { diff --git a/src/cone.c b/src/cone.c index c30ee79..48fea77 100644 --- a/src/cone.c +++ b/src/cone.c @@ -174,7 +174,8 @@ void compute_nt_scaling(QOCOWorkspace* work) // Compute Nesterov-Todd scaling for LP cone. QOCOInt idx; for (idx = 0; idx < work->data->l; ++idx) { - work->WtW[idx] = safe_div(work->s[idx], work->z[idx]); + work->WtW[idx] = safe_div(get_element_vectorf(work->s, idx), + get_element_vectorf(work->z, idx)); work->W[idx] = qoco_sqrt(work->WtW[idx]); work->Wfull[idx] = work->W[idx]; work->Winv[idx] = safe_div(1.0, work->W[idx]); @@ -186,15 +187,19 @@ void compute_nt_scaling(QOCOWorkspace* work) QOCOInt nt_idx_full = idx; for (QOCOInt i = 0; i < work->data->nsoc; ++i) { // Compute normalized vectors. - QOCOFloat s_scal = soc_residual2(&work->s[idx], work->data->q[i]); + QOCOFloat s_scal = + soc_residual2(get_pointer_vectorf(work->s, idx), work->data->q[i]); s_scal = qoco_sqrt(s_scal); QOCOFloat f = safe_div(1.0, s_scal); - scale_arrayf(&work->s[idx], work->sbar, f, work->data->q[i]); + scale_arrayf(get_pointer_vectorf(work->s, idx), work->sbar, f, + work->data->q[i]); - QOCOFloat z_scal = soc_residual2(&work->z[idx], work->data->q[i]); + QOCOFloat z_scal = + soc_residual2(get_pointer_vectorf(work->z, idx), work->data->q[i]); z_scal = qoco_sqrt(z_scal); f = safe_div(1.0, z_scal); - scale_arrayf(&work->z[idx], work->zbar, f, work->data->q[i]); + scale_arrayf(get_pointer_vectorf(work->z, idx), work->zbar, f, + work->data->q[i]); QOCOFloat gamma = qoco_sqrt( 0.5 * (1 + qoco_dot(work->sbar, work->zbar, work->data->q[i]))); @@ -264,22 +269,27 @@ void compute_nt_scaling(QOCOWorkspace* work) } // Compute scaled variable lambda. lambda = W * z. 
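For reference, on the LP block the scaling computed above reduces to a closed form (standard Nesterov-Todd identities, stated here only for orientation):

$$W_{ii} = \sqrt{s_i / z_i} \;\Longrightarrow\; \lambda_i = W_{ii} z_i = \sqrt{s_i z_i}, \qquad 0 \le i < l,$$

and on every cone the defining NT property $W z = W^{-\top} s$ makes $\lambda$ the common scaled image of $s$ and $z$.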
- nt_multiply(work->Wfull, work->z, work->lambda, work->data->l, work->data->m, - work->data->nsoc, work->data->q); + nt_multiply(work->Wfull, get_pointer_vectorf(work->z, 0), work->lambda, + work->data->l, work->data->m, work->data->nsoc, work->data->q); } void compute_centering(QOCOSolver* solver) { QOCOWorkspace* work = solver->work; - QOCOFloat* Dzaff = &work->xyz[work->data->n + work->data->p]; - QOCOFloat a = qoco_min(linesearch(work->z, Dzaff, 1.0, solver), - linesearch(work->s, work->Ds, 1.0, solver)); + QOCOFloat* xyz = get_data_vectorf(work->xyz); + QOCOFloat* Dzaff = &xyz[work->data->n + work->data->p]; + QOCOFloat a = qoco_min( + linesearch(get_pointer_vectorf(work->z, 0), Dzaff, 1.0, solver), + linesearch(get_pointer_vectorf(work->s, 0), work->Ds, 1.0, solver)); // Compute rho. rho = ((s + a * Ds)'*(z + a * Dz)) / (s'*z). - qoco_axpy(Dzaff, work->z, work->ubuff1, a, work->data->m); - qoco_axpy(work->Ds, work->s, work->ubuff2, a, work->data->m); + qoco_axpy(Dzaff, get_pointer_vectorf(work->z, 0), work->ubuff1, a, + work->data->m); + qoco_axpy(work->Ds, get_pointer_vectorf(work->s, 0), work->ubuff2, a, + work->data->m); QOCOFloat rho = qoco_dot(work->ubuff1, work->ubuff2, work->data->m) / - qoco_dot(work->z, work->s, work->data->m); + qoco_dot(get_pointer_vectorf(work->z, 0), + get_pointer_vectorf(work->s, 0), work->data->m); // Compute sigma. sigma = max(0, min(1, rho))^3. QOCOFloat sigma = qoco_min(1.0, rho); diff --git a/src/equilibration.c b/src/equilibration.c index 96e5306..a7b4e0c 100644 --- a/src/equilibration.c +++ b/src/equilibration.c @@ -5,33 +5,41 @@ void ruiz_equilibration(QOCOProblemData* data, QOCOScaling* scaling, { // Initialize ruiz data. for (QOCOInt i = 0; i < data->n; ++i) { - scaling->Druiz[i] = 1.0; - scaling->Dinvruiz[i] = 1.0; + set_element_vectorf(scaling->Druiz, i, 1.0); + set_element_vectorf(scaling->Dinvruiz, i, 1.0); } for (QOCOInt i = 0; i < data->p; ++i) { - scaling->Eruiz[i] = 1.0; - scaling->Einvruiz[i] = 1.0; + set_element_vectorf(scaling->Eruiz, i, 1.0); + set_element_vectorf(scaling->Einvruiz, i, 1.0); } for (QOCOInt i = 0; i < data->m; ++i) { - scaling->Fruiz[i] = 1.0; - scaling->Finvruiz[i] = 1.0; + set_element_vectorf(scaling->Fruiz, i, 1.0); + set_element_vectorf(scaling->Finvruiz, i, 1.0); } QOCOFloat g = 1.0; scaling->k = 1.0; scaling->kinv = 1.0; + QOCOFloat* delta_data = get_data_vectorf(scaling->delta); + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat* Druiz_data = get_data_vectorf(scaling->Druiz); + QOCOFloat* Eruiz_data = get_data_vectorf(scaling->Eruiz); + QOCOFloat* Fruiz_data = get_data_vectorf(scaling->Fruiz); + QOCOFloat* bdata = get_data_vectorf(data->b); + QOCOFloat* hdata = get_data_vectorf(data->h); + for (QOCOInt i = 0; i < ruiz_iters; ++i) { // Compute infinity norm of rows of [P A' G'] for (QOCOInt j = 0; j < data->n; ++j) { - scaling->delta[j] = 0.0; + set_element_vectorf(scaling->delta, j, 0.0); } - g = inf_norm(data->c, data->n); + g = inf_norm(cdata, data->n); QOCOFloat Pinf_mean = 0.0; if (data->P) { - col_inf_norm_USymm(data->P, scaling->delta); - for (QOCOInt j = 0; j < data->P->n; ++j) { - Pinf_mean += scaling->delta[j]; + col_inf_norm_USymm_matrix(data->P, delta_data); + for (QOCOInt j = 0; j < data->n; ++j) { + Pinf_mean += get_element_vectorf(scaling->delta, j); } Pinf_mean /= data->n; } @@ -41,61 +49,61 @@ void ruiz_equilibration(QOCOProblemData* data, QOCOScaling* scaling, g = safe_div(1.0, g); scaling->k *= g; - if (data->A->nnz > 0) { - for (QOCOInt j = 0; j < data->A->n; ++j) { - QOCOFloat 
nrm = inf_norm(&data->A->x[data->A->p[j]], - data->A->p[j + 1] - data->A->p[j]); - scaling->delta[j] = qoco_max(scaling->delta[j], nrm); + // Compute column infinity norms of A and G + // For CSC format, column norms are computed efficiently + QOCOFloat Anorm[data->n]; + QOCOFloat Gnorm[data->n]; + if (get_nnz(data->A) > 0) { + col_inf_norm_matrix(data->A, Anorm); + for (QOCOInt j = 0; j < data->n; ++j) { + QOCOFloat nrm = qoco_max(get_element_vectorf(scaling->delta, j), Anorm[j]); + set_element_vectorf(scaling->delta, j, nrm); } } - if (data->G->nnz > 0) { - for (QOCOInt j = 0; j < data->G->n; ++j) { - QOCOFloat nrm = inf_norm(&data->G->x[data->G->p[j]], - data->G->p[j + 1] - data->G->p[j]); - scaling->delta[j] = qoco_max(scaling->delta[j], nrm); + if (get_nnz(data->G) > 0) { + col_inf_norm_matrix(data->G, Gnorm); + for (QOCOInt j = 0; j < data->n; ++j) { + QOCOFloat nrm = qoco_max(get_element_vectorf(scaling->delta, j), Gnorm[j]); + set_element_vectorf(scaling->delta, j, nrm); } } // d(i) = 1 / sqrt(max([Pinf(i), Atinf(i), Gtinf(i)])); for (QOCOInt j = 0; j < data->n; ++j) { - QOCOFloat temp = qoco_sqrt(scaling->delta[j]); + QOCOFloat temp = qoco_sqrt(get_element_vectorf(scaling->delta, j)); temp = safe_div(1.0, temp); - scaling->delta[j] = temp; + set_element_vectorf(scaling->delta, j, temp); } // Compute infinity norm of rows of [A 0 0]. - if (data->A->nnz > 0) { - for (QOCOInt j = 0; j < data->At->n; ++j) { - QOCOFloat nrm = inf_norm(&data->At->x[data->At->p[j]], - data->At->p[j + 1] - data->At->p[j]); - scaling->delta[data->n + j] = nrm; - } + // For row norms, compute column norms of the transpose (At is stored in CSC format) + if (get_nnz(data->A) > 0) { + col_inf_norm_matrix(data->At, &delta_data[data->n]); // d(i) = 1 / sqrt(Ainf(i)); for (QOCOInt k = 0; k < data->p; ++k) { - QOCOFloat temp = qoco_sqrt(scaling->delta[data->n + k]); + QOCOFloat temp = + qoco_sqrt(get_element_vectorf(scaling->delta, data->n + k)); temp = safe_div(1.0, temp); - scaling->delta[data->n + k] = temp; + set_element_vectorf(scaling->delta, data->n + k, temp); } } // Compute infinity norm of rows of [G 0 0]. - if (data->G->nnz > 0) { - for (QOCOInt j = 0; j < data->Gt->n; ++j) { - QOCOFloat nrm = inf_norm(&data->Gt->x[data->Gt->p[j]], - data->Gt->p[j + 1] - data->Gt->p[j]); - scaling->delta[data->n + data->p + j] = nrm; - } + // For row norms, compute column norms of the transpose (Gt is stored in CSC format) + if (get_nnz(data->G) > 0) { + col_inf_norm_matrix(data->Gt, &delta_data[data->n + data->p]); // d(i) = 1 / sqrt(Ginf(i)); for (QOCOInt k = 0; k < data->m; ++k) { - QOCOFloat temp = qoco_sqrt(scaling->delta[data->n + data->p + k]); + QOCOFloat temp = qoco_sqrt( + get_element_vectorf(scaling->delta, data->n + data->p + k)); temp = safe_div(1.0, temp); - scaling->delta[data->n + data->p + k] = temp; + set_element_vectorf(scaling->delta, data->n + data->p + k, temp); } } - QOCOFloat* D = scaling->delta; - QOCOFloat* E = &scaling->delta[data->n]; - QOCOFloat* F = &scaling->delta[data->n + data->p]; + QOCOFloat* D = delta_data; + QOCOFloat* E = &delta_data[data->n]; + QOCOFloat* F = &delta_data[data->n + data->p]; // Make scalings for all variables in a second-order cone equal. QOCOInt idx = data->l; @@ -108,53 +116,59 @@ void ruiz_equilibration(QOCOProblemData* data, QOCOScaling* scaling, // Scale P. 
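In matrix form, one pass of the loop above applies

$$P \leftarrow g\, D P D, \qquad c \leftarrow g\, D c, \qquad A \leftarrow E A D, \qquad G \leftarrow F G D,$$

where $D$, $E$, $F$ are the diagonal blocks of delta (each entry $1/\sqrt{\|\cdot\|_\infty}$ of the matching column or row norm) and $g$ is the cost scaling; the running products Druiz, Eruiz, Fruiz accumulate $D$, $E$, $F$ so that $b \leftarrow E_{\text{ruiz}}\, b$ and $h \leftarrow F_{\text{ruiz}}\, h$ need to be applied only once after the loop.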
if (data->P) { - scale_arrayf(data->P->x, data->P->x, g, data->P->nnz); - row_col_scale(data->P, D, D); + QOCOCscMatrix* Pcsc = get_csc_matrix(data->P); + scale_arrayf(Pcsc->x, Pcsc->x, g, get_nnz(data->P)); + row_col_scale_matrix(data->P, D, D); } // Scale c. - scale_arrayf(data->c, data->c, g, data->n); - ew_product(data->c, D, data->c, data->n); + scale_arrayf(cdata, cdata, g, data->n); + ew_product(cdata, D, cdata, data->n); // Scale A and G. - row_col_scale(data->A, E, D); - row_col_scale(data->G, F, D); - row_col_scale(data->At, D, E); - row_col_scale(data->Gt, D, F); + row_col_scale_matrix(data->A, E, D); + row_col_scale_matrix(data->G, F, D); + row_col_scale_matrix(data->At, D, E); + row_col_scale_matrix(data->Gt, D, F); // Update scaling matrices with delta. - ew_product(scaling->Druiz, D, scaling->Druiz, data->n); - ew_product(scaling->Eruiz, E, scaling->Eruiz, data->p); - ew_product(scaling->Fruiz, F, scaling->Fruiz, data->m); + ew_product(Druiz_data, D, Druiz_data, data->n); + ew_product(Eruiz_data, E, Eruiz_data, data->p); + ew_product(Fruiz_data, F, Fruiz_data, data->m); } // Scale b. - ew_product(data->b, scaling->Eruiz, data->b, data->p); + ew_product(bdata, Eruiz_data, bdata, data->p); // Scale h. - ew_product(data->h, scaling->Fruiz, data->h, data->m); + ew_product(hdata, Fruiz_data, hdata, data->m); // Compute Dinv, Einv, Finv. - for (QOCOInt i = 0; i < data->n; ++i) { - scaling->Dinvruiz[i] = safe_div(1.0, scaling->Druiz[i]); - } - for (QOCOInt i = 0; i < data->p; ++i) { - scaling->Einvruiz[i] = safe_div(1.0, scaling->Eruiz[i]); - } - for (QOCOInt i = 0; i < data->m; ++i) { - scaling->Finvruiz[i] = safe_div(1.0, scaling->Fruiz[i]); - } + reciprocal_vectorf(scaling->Druiz, scaling->Dinvruiz); + reciprocal_vectorf(scaling->Eruiz, scaling->Einvruiz); + reciprocal_vectorf(scaling->Fruiz, scaling->Finvruiz); scaling->kinv = safe_div(1.0, scaling->k); } void unscale_variables(QOCOWorkspace* work) { - ew_product(work->x, work->scaling->Druiz, work->x, work->data->n); - ew_product(work->s, work->scaling->Finvruiz, work->s, work->data->m); - - ew_product(work->y, work->scaling->Eruiz, work->y, work->data->p); - scale_arrayf(work->y, work->y, work->scaling->kinv, work->data->p); - - ew_product(work->z, work->scaling->Fruiz, work->z, work->data->m); - scale_arrayf(work->z, work->z, work->scaling->kinv, work->data->m); -} \ No newline at end of file + ew_product(get_pointer_vectorf(work->x, 0), + get_pointer_vectorf(work->scaling->Druiz, 0), + get_pointer_vectorf(work->x, 0), work->data->n); + + ew_product(get_pointer_vectorf(work->s, 0), + get_pointer_vectorf(work->scaling->Finvruiz, 0), + get_pointer_vectorf(work->s, 0), work->data->m); + + ew_product(get_pointer_vectorf(work->y, 0), + get_pointer_vectorf(work->scaling->Eruiz, 0), + get_pointer_vectorf(work->y, 0), work->data->p); + scale_arrayf(get_pointer_vectorf(work->y, 0), get_pointer_vectorf(work->y, 0), + work->scaling->kinv, work->data->p); + + ew_product(get_pointer_vectorf(work->z, 0), + get_pointer_vectorf(work->scaling->Fruiz, 0), + get_pointer_vectorf(work->z, 0), work->data->m); + scale_arrayf(get_pointer_vectorf(work->z, 0), get_pointer_vectorf(work->z, 0), + work->scaling->kinv, work->data->m); +} diff --git a/src/kkt.c b/src/kkt.c index cf1ccd3..32fdffa 100644 --- a/src/kkt.c +++ b/src/kkt.c @@ -23,7 +23,8 @@ QOCOCscMatrix* construct_kkt(QOCOCscMatrix* P, QOCOCscMatrix* A, KKT->m = n + m + p; KKT->n = n + m + p; - KKT->nnz = P->nnz + A->nnz + G->nnz + Wnnz + p; + QOCOInt Pnnz = P ? 
P->nnz : 0; + KKT->nnz = Pnnz + A->nnz + G->nnz + Wnnz + p; KKT->x = qoco_calloc(KKT->nnz, sizeof(QOCOFloat)); KKT->i = qoco_calloc(KKT->nnz, sizeof(QOCOInt)); @@ -32,15 +33,25 @@ QOCOCscMatrix* construct_kkt(QOCOCscMatrix* P, QOCOCscMatrix* A, QOCOInt nz = 0; QOCOInt col = 0; // Add P block - for (QOCOInt k = 0; k < P->nnz; ++k) { - PregtoKKT[k] = nz; - KKT->x[nz] = P->x[k]; - KKT->i[nz] = P->i[k]; - nz += 1; - } - for (QOCOInt k = 0; k < P->n + 1; ++k) { - KKT->p[col] = P->p[k]; - col += 1; + if (P) { + for (QOCOInt k = 0; k < P->nnz; ++k) { + if (PregtoKKT) { + PregtoKKT[k] = nz; + } + KKT->x[nz] = P->x[k]; + KKT->i[nz] = P->i[k]; + nz += 1; + } + for (QOCOInt k = 0; k < P->n + 1; ++k) { + KKT->p[col] = P->p[k]; + col += 1; + } + } else { + // P is NULL, so just set column pointers for n columns + for (QOCOInt k = 0; k < n + 1; ++k) { + KKT->p[col] = 0; + col += 1; + } } // Add A^T block @@ -189,9 +200,14 @@ void initialize_ipm(QOCOSolver* solver) // Construct rhs of KKT system. // Construct rhs of KKT system = [-c;b;h]. QOCOProblemData* data = solver->work->data; - copy_and_negate_arrayf(data->c, solver->work->rhs, data->n); - copy_arrayf(data->b, &solver->work->rhs[data->n], data->p); - copy_arrayf(data->h, &solver->work->rhs[data->n + data->p], data->m); + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat* bdata = get_data_vectorf(data->b); + QOCOFloat* hdata = get_data_vectorf(data->h); + QOCOFloat* rhs = get_data_vectorf(solver->work->rhs); + QOCOFloat* xyz = get_data_vectorf(solver->work->xyz); + copy_and_negate_arrayf(cdata, rhs, data->n); + copy_arrayf(bdata, &rhs[data->n], data->p); + copy_arrayf(hdata, &rhs[data->n + data->p], data->m); // Factor KKT matrix. solver->linsys->linsys_factor(solver->linsys_data, solver->work->data->n, @@ -199,28 +215,30 @@ void initialize_ipm(QOCOSolver* solver) // Solve KKT system. solver->linsys->linsys_solve(solver->linsys_data, solver->work, - solver->work->rhs, solver->work->xyz, + rhs, xyz, solver->settings->iter_ref_iters); - // Copy x part of solution to x. - copy_arrayf(solver->work->xyz, solver->work->x, solver->work->data->n); + // Copy x part of solution to x (GPU-to-GPU copy). + // During solve phase, get_data_vectorf returns device pointers, so copy_arrayf + // will detect both xyz and work->x are on device and use a CUDA kernel. + copy_arrayf(xyz, get_data_vectorf(solver->work->x), solver->work->data->n); - // Copy y part of solution to y. - copy_arrayf(&solver->work->xyz[solver->work->data->n], solver->work->y, + // Copy y part of solution to y (GPU-to-GPU copy). + copy_arrayf(&xyz[solver->work->data->n], get_data_vectorf(solver->work->y), solver->work->data->p); - // Copy z part of solution to z. - copy_arrayf(&solver->work->xyz[solver->work->data->n + solver->work->data->p], - solver->work->z, solver->work->data->m); + // Copy z part of solution to z (GPU-to-GPU copy). + copy_arrayf(&xyz[solver->work->data->n + solver->work->data->p], + get_data_vectorf(solver->work->z), solver->work->data->m); - // Copy and negate z part of solution to s. + // Copy and negate z part of solution to s (GPU-to-GPU copy). copy_and_negate_arrayf( - &solver->work->xyz[solver->work->data->n + solver->work->data->p], - solver->work->s, solver->work->data->m); + &xyz[solver->work->data->n + solver->work->data->p], + get_data_vectorf(solver->work->s), solver->work->data->m); // Bring s and z to cone C. 
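bring2cone itself lives in src/cone.c; purely as a mental model (illustrative, not the actual implementation, which also handles the second-order cone blocks), its nonnegative-orthant part can be pictured as:

/* Illustrative only: shift the LP-cone block of u into the interior of the
 * nonnegative orthant. The real bring2cone in src/cone.c also treats the
 * second-order cone blocks and may choose a different shift. */
static void bring2orthant_sketch(QOCOFloat* u, QOCOInt l)
{
  if (l == 0)
    return;
  QOCOFloat umin = u[0];
  for (QOCOInt i = 1; i < l; ++i) {
    umin = qoco_min(umin, u[i]);
  }
  if (umin <= 0.0) {
    for (QOCOInt i = 0; i < l; ++i) {
      u[i] += 1.0 - umin; /* afterwards every component is >= 1 */
    }
  }
}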
- bring2cone(solver->work->s, solver->work->data); - bring2cone(solver->work->z, solver->work->data); + bring2cone(get_data_vectorf(solver->work->s), solver->work->data); + bring2cone(get_data_vectorf(solver->work->z), solver->work->data); } void compute_kkt_residual(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* y, @@ -240,14 +258,17 @@ void compute_kkt_residual(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* y, // rhs += [c;-b;-h+s] // Add c and account for regularization of P. - qoco_axpy(data->c, kktres, kktres, 1.0, data->n); + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat* bdata = get_data_vectorf(data->b); + QOCOFloat* hdata = get_data_vectorf(data->h); + qoco_axpy(cdata, kktres, kktres, 1.0, data->n); qoco_axpy(x, kktres, kktres, -static_reg, data->n); // Add -b. - qoco_axpy(data->b, &kktres[data->n], &kktres[data->n], -1.0, data->p); + qoco_axpy(bdata, &kktres[data->n], &kktres[data->n], -1.0, data->p); // Add -h + s. - qoco_axpy(data->h, &kktres[data->n + data->p], &kktres[data->n + data->p], + qoco_axpy(hdata, &kktres[data->n + data->p], &kktres[data->n + data->p], -1.0, data->m); qoco_axpy(s, &kktres[data->n + data->p], &kktres[data->n + data->p], 1.0, data->m); @@ -256,8 +277,15 @@ void compute_kkt_residual(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* y, QOCOFloat compute_objective(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* nbuff, QOCOFloat static_reg, QOCOFloat k) { - QOCOFloat obj = qoco_dot(x, data->c, data->n); - USpMv(data->P, x, nbuff); + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat obj = qoco_dot(x, cdata, data->n); + if (data->P) { + USpMv_matrix(data->P, x, nbuff); + } else { + for (QOCOInt i = 0; i < data->n; ++i) { + nbuff[i] = 0.0; + } + } // Correct for regularization in P. QOCOFloat regularization_correction = 0.0; @@ -270,8 +298,11 @@ QOCOFloat compute_objective(QOCOProblemData* data, QOCOFloat* x, } void construct_kkt_aff_rhs(QOCOWorkspace* work) { + // During solve phase, get_data_vectorf returns device pointer automatically + QOCOFloat* rhs = get_data_vectorf(work->rhs); + // Negate the kkt residual and store in rhs. - copy_and_negate_arrayf(work->kktres, work->rhs, + copy_and_negate_arrayf(work->kktres, rhs, work->data->n + work->data->p + work->data->m); // Compute W*lambda @@ -279,15 +310,18 @@ void construct_kkt_aff_rhs(QOCOWorkspace* work) work->data->m, work->data->nsoc, work->data->q); // Add W*lambda to z portion of rhs. - qoco_axpy(work->ubuff1, &work->rhs[work->data->n + work->data->p], - &work->rhs[work->data->n + work->data->p], 1.0, work->data->m); + qoco_axpy(work->ubuff1, &rhs[work->data->n + work->data->p], + &rhs[work->data->n + work->data->p], 1.0, work->data->m); } void construct_kkt_comb_rhs(QOCOWorkspace* work) { + // During solve phase, get_data_vectorf returns device pointer automatically + QOCOFloat* rhs = get_data_vectorf(work->rhs); + QOCOFloat* xyz = get_data_vectorf(work->xyz); // Negate the kkt residual and store in rhs. - copy_and_negate_arrayf(work->kktres, work->rhs, + copy_and_negate_arrayf(work->kktres, rhs, work->data->n + work->data->p + work->data->m); /// ds = -cone_product(lambda, lambda) - settings.mehrotra * @@ -298,7 +332,7 @@ void construct_kkt_comb_rhs(QOCOWorkspace* work) work->data->m, work->data->nsoc, work->data->q); // ubuff2 = W * Dzaff. 
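Spelled out, the correction assembled over the next several statements is the standard Mehrotra combined-step right-hand side matching the ds comment above:

$$d_s = -\lambda \circ \lambda \;-\; \texttt{mehrotra} \cdot \big(W^{-\top} \Delta s^{\text{aff}}\big) \circ \big(W \Delta z^{\text{aff}}\big) \;+\; \sigma \mu\, e,$$

after which the $z$-block of rhs is decremented by $W^{\top} (\lambda \setminus d_s)$, with $\circ$ the cone product and $\lambda \setminus d_s$ the cone division computed by cone_division().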
- QOCOFloat* Dzaff = &work->xyz[work->data->n + work->data->p]; + QOCOFloat* Dzaff = &xyz[work->data->n + work->data->p]; nt_multiply(work->Wfull, Dzaff, work->ubuff2, work->data->l, work->data->m, work->data->nsoc, work->data->q); @@ -335,8 +369,8 @@ void construct_kkt_comb_rhs(QOCOWorkspace* work) work->data->m, work->data->nsoc, work->data->q); // rhs = [dx;dy;dz-W'*cone_division(lambda, ds, pdata)]; - qoco_axpy(work->ubuff1, &work->rhs[work->data->n + work->data->p], - &work->rhs[work->data->n + work->data->p], -1.0, work->data->m); + qoco_axpy(work->ubuff1, &rhs[work->data->n + work->data->p], + &rhs[work->data->n + work->data->p], -1.0, work->data->m); } void predictor_corrector(QOCOSolver* solver) @@ -351,12 +385,15 @@ void predictor_corrector(QOCOSolver* solver) construct_kkt_aff_rhs(work); // Solve to get affine scaling direction. + // During solve phase, get_data_vectorf returns device pointers automatically + QOCOFloat* rhs = get_data_vectorf(work->rhs); + QOCOFloat* xyz = get_data_vectorf(work->xyz); solver->linsys->linsys_solve(solver->linsys_data, solver->work, - solver->work->rhs, solver->work->xyz, + rhs, xyz, solver->settings->iter_ref_iters); // Compute Dsaff. Dsaff = W' * (-lambda - W * Dzaff). - QOCOFloat* Dzaff = &work->xyz[work->data->n + work->data->p]; + QOCOFloat* Dzaff = &xyz[work->data->n + work->data->p]; nt_multiply(work->Wfull, Dzaff, work->ubuff1, work->data->l, work->data->m, work->data->nsoc, work->data->q); for (QOCOInt i = 0; i < work->data->m; ++i) { @@ -368,25 +405,64 @@ void predictor_corrector(QOCOSolver* solver) // Compute centering parameter. compute_centering(solver); - // Construct rhs for affine scaling direction. + // Construct rhs for combined direction. construct_kkt_comb_rhs(work); // Solve to get combined direction. + // During solve phase, get_data_vectorf returns device pointers automatically + rhs = get_data_vectorf(work->rhs); + xyz = get_data_vectorf(work->xyz); solver->linsys->linsys_solve(solver->linsys_data, solver->work, - solver->work->rhs, solver->work->xyz, + rhs, xyz, solver->settings->iter_ref_iters); + // Check if solution has NaNs. If NaNs are present, early exit and set a to // 0.0 to trigger reduced tolerance optimality checks. 
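A possible refinement, sketched here only: the device-to-host transfer performed below for the NaN check could be avoided entirely with a one-int reduction on the GPU. This kernel is illustrative and not part of the patch:

// Sketch only (not in the patch): flag NaNs on the device so that a single
// int, rather than the full xyz vector, crosses the PCIe bus. d_flag is a
// device-resident int that the caller zeroes before the launch and reads
// back afterwards.
__global__ void has_nan_kernel(const QOCOFloat* v, QOCOInt n, int* d_flag)
{
  QOCOInt i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && isnan(v[i])) {
    atomicExch(d_flag, 1); // any thread that sees a NaN raises the flag
  }
}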
+ // During solve phase, xyz lives on the device, so sync it to the host for the NaN check + // This is error checking only, not part of the normal solve flow + #ifdef QOCO_ALGEBRA_BACKEND_CUDA + extern int get_solve_phase(void); + extern void copy_vector_from_device(QOCOVectorf* src, QOCOFloat* dst, QOCOInt n); + if (get_solve_phase()) { + // Temporarily sync xyz from device to host for NaN check + // This creates a temporary copy, but it's only for error checking + QOCOInt n = work->data->n + work->data->m + work->data->p; + QOCOFloat* xyz_host = qoco_malloc(n * sizeof(QOCOFloat)); + copy_vector_from_device(work->xyz, xyz_host, n); + for (QOCOInt i = 0; i < n; ++i) { + if (isnan(xyz_host[i])) { + qoco_free(xyz_host); + work->a = 0.0; + return; + } + } + qoco_free(xyz_host); + } else { + // Not in solve phase, use host pointer directly + QOCOFloat* xyz_host = get_data_vectorf(work->xyz); + for (QOCOInt i = 0; i < work->data->n + work->data->p + work->data->m; ++i) { + if (isnan(xyz_host[i])) { + work->a = 0.0; + return; + } + } + } + #else + // Builtin backend - use host pointer directly + QOCOFloat* xyz_host = get_data_vectorf(work->xyz); for (QOCOInt i = 0; i < work->data->n + work->data->p + work->data->m; ++i) { - if (isnan(work->xyz[i])) { + if (isnan(xyz_host[i])) { work->a = 0.0; return; } } + #endif // Compute Ds. Ds = W' * (cone_division(lambda, ds, pdata) - W * Dz). ds // computed in construct_kkt_comb_rhs() and stored in work->Ds. - QOCOFloat* Dz = &work->xyz[work->data->n + work->data->p]; + // Use device pointer during solve phase + QOCOFloat* Dz = &xyz[work->data->n + work->data->p]; cone_division(work->lambda, work->Ds, work->ubuff1, work->data->l, work->data->nsoc, work->data->q); nt_multiply(work->Wfull, Dz, work->ubuff2, work->data->l, work->data->m, @@ -397,21 +473,25 @@ work->data->nsoc, work->data->q); // Compute step-size. - QOCOFloat a = qoco_min(linesearch(work->s, work->Ds, 0.99, solver), - linesearch(work->z, Dz, 0.99, solver)); + QOCOFloat a = qoco_min(linesearch(get_data_vectorf(work->s), work->Ds, 0.99, solver), + linesearch(get_data_vectorf(work->z), Dz, 0.99, solver)); // Save step-size. work->a = a; // Update iterates. - QOCOFloat* Dx = work->xyz; - QOCOFloat* Dy = &work->xyz[work->data->n]; + // During solve phase, get_data_vectorf returns device pointer automatically + QOCOFloat* Dx = xyz; + QOCOFloat* Dy = &xyz[work->data->n]; QOCOFloat* Ds = work->Ds; - qoco_axpy(Dx, work->x, work->x, a, work->data->n); - qoco_axpy(Ds, work->s, work->s, a, work->data->m); - qoco_axpy(Dy, work->y, work->y, a, work->data->p); - qoco_axpy(Dz, work->z, work->z, a, work->data->m); + qoco_axpy(Dx, get_data_vectorf(work->x), get_data_vectorf(work->x), a, work->data->n); + qoco_axpy(Ds, get_data_vectorf(work->s), get_data_vectorf(work->s), a, work->data->m); + qoco_axpy(Dy, get_data_vectorf(work->y), get_data_vectorf(work->y), a, work->data->p); + qoco_axpy(Dz, get_data_vectorf(work->z), get_data_vectorf(work->z), a, work->data->m); + + // Note: No sync needed here - get_data_vectorf returns device pointer during solve phase, + // so qoco_axpy operates directly on device memory } void kkt_multiply(QOCOFloat* x, QOCOFloat* y, QOCOProblemData* data, @@ -420,18 +500,24 @@ { // Compute y[1:n] = P * x[1:n] + A^T * x[n+1:n+p] + G^T * x[n+p+1:n+p+m].
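In block form (the convention the trailing comment spells out componentwise), this routine applies

$$\begin{bmatrix} y_1 \\ y_2 \\ y_3 \end{bmatrix} = \begin{bmatrix} P & A^\top & G^\top \\ A & 0 & 0 \\ G & 0 & -W^\top W \end{bmatrix} \begin{bmatrix} x_1 \\ x_2 \\ x_3 \end{bmatrix},$$

where the $-W^\top W$ Nesterov-Todd block (the same values cudss_update_nt writes into the KKT matrix) is applied only when a Wfull argument is supplied.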
- USpMv(data->P, x, y); + if (data->P) { + USpMv_matrix(data->P, x, y); + } else { + for (QOCOInt i = 0; i < data->n; ++i) { + y[i] = 0.0; + } + } if (data->p > 0) { - SpMtv(data->A, &x[data->n], nbuff); + SpMtv_matrix(data->A, &x[data->n], nbuff); qoco_axpy(y, nbuff, y, 1.0, data->n); - SpMv(data->A, x, &y[data->n]); + SpMv_matrix(data->A, x, &y[data->n]); } if (data->m > 0) { - SpMtv(data->G, &x[data->n + data->p], nbuff); + SpMtv_matrix(data->G, &x[data->n + data->p], nbuff); qoco_axpy(y, nbuff, y, 1.0, data->n); - SpMv(data->G, x, &y[data->n + data->p]); + SpMv_matrix(data->G, x, &y[data->n + data->p]); } if (Wfull) { diff --git a/src/qoco_api.c b/src/qoco_api.c index de81a2a..7f95414 100644 --- a/src/qoco_api.c +++ b/src/qoco_api.c @@ -10,7 +10,7 @@ #include "qoco_api.h" #include "amd.h" -#include "qdldl_backend.h" // TODO: make this modular so we can use any backend. +#include "backend.h" QOCOInt qoco_setup(QOCOSolver* solver, QOCOInt n, QOCOInt m, QOCOInt p, QOCOCscMatrix* P, QOCOFloat* c, QOCOCscMatrix* A, @@ -44,60 +44,69 @@ QOCOInt qoco_setup(QOCOSolver* solver, QOCOInt n, QOCOInt m, QOCOInt p, data->m = m; data->n = n; data->p = p; - data->A = new_qoco_csc_matrix(A); - data->G = new_qoco_csc_matrix(G); - data->c = qoco_malloc(n * sizeof(QOCOFloat)); - data->b = qoco_malloc(p * sizeof(QOCOFloat)); - data->h = qoco_malloc(m * sizeof(QOCOFloat)); + solver->work->data->A = new_qoco_matrix(A); + solver->work->data->G = new_qoco_matrix(G); data->q = qoco_malloc(nsoc * sizeof(QOCOInt)); - copy_arrayf(c, data->c, n); - copy_arrayf(b, data->b, p); - copy_arrayf(h, data->h, m); copy_arrayi(q, data->q, nsoc); + solver->work->data->c = new_qoco_vectorf(c, n); + solver->work->data->b = new_qoco_vectorf(b, p); + solver->work->data->h = new_qoco_vectorf(h, m); + data->l = l; data->nsoc = nsoc; // Copy P. if (P) { - data->P = new_qoco_csc_matrix(P); + data->P = new_qoco_matrix(P); } else { data->P = NULL; } // Equilibrate data. - QOCOInt Annz = A ? A->nnz : 0; - QOCOInt Gnnz = G ? G->nnz : 0; + QOCOInt Annz = A ? get_nnz(data->A) : 0; + QOCOInt Gnnz = G ? 
get_nnz(data->G) : 0; solver->work->scaling = qoco_malloc(sizeof(QOCOScaling)); - solver->work->scaling->delta = qoco_malloc((n + p + m) * sizeof(QOCOFloat)); - solver->work->scaling->Druiz = qoco_malloc(n * sizeof(QOCOFloat)); - solver->work->scaling->Eruiz = qoco_malloc(p * sizeof(QOCOFloat)); - solver->work->scaling->Fruiz = qoco_malloc(m * sizeof(QOCOFloat)); - solver->work->scaling->Dinvruiz = qoco_malloc(n * sizeof(QOCOFloat)); - solver->work->scaling->Einvruiz = qoco_malloc(p * sizeof(QOCOFloat)); - solver->work->scaling->Finvruiz = qoco_malloc(m * sizeof(QOCOFloat)); + solver->work->scaling->delta = new_qoco_vectorf(NULL, n + p + m); + solver->work->scaling->Druiz = new_qoco_vectorf(NULL, n); + solver->work->scaling->Eruiz = new_qoco_vectorf(NULL, p); + solver->work->scaling->Fruiz = new_qoco_vectorf(NULL, m); + solver->work->scaling->Dinvruiz = new_qoco_vectorf(NULL, n); + solver->work->scaling->Einvruiz = new_qoco_vectorf(NULL, p); + solver->work->scaling->Finvruiz = new_qoco_vectorf(NULL, m); solver->work->data->AtoAt = qoco_malloc(Annz * sizeof(QOCOInt)); solver->work->data->GtoGt = qoco_malloc(Gnnz * sizeof(QOCOInt)); - solver->work->data->At = create_transposed_matrix(data->A, data->AtoAt); - solver->work->data->Gt = create_transposed_matrix(data->G, data->GtoGt); + QOCOCscMatrix* Atcsc = create_transposed_matrix(get_csc_matrix(data->A), data->AtoAt); + solver->work->data->At = new_qoco_matrix(Atcsc); + free_qoco_csc_matrix(Atcsc); + QOCOCscMatrix* Gtcsc = create_transposed_matrix(get_csc_matrix(data->G), data->GtoGt); + solver->work->data->Gt = new_qoco_matrix(Gtcsc); + free_qoco_csc_matrix(Gtcsc); ruiz_equilibration(data, solver->work->scaling, solver->settings->ruiz_iters); // Regularize P. data->Pnzadded_idx = qoco_calloc(n, sizeof(QOCOInt)); if (P) { - QOCOInt num_diagP = count_diag(P); + QOCOCscMatrix* Pcsc = get_csc_matrix(data->P); + QOCOInt num_diagP = count_diag(Pcsc); data->Pnum_nzadded = n - num_diagP; + // Create a copy of the CSC matrix since regularize_P will free it + QOCOCscMatrix* Pcsc_copy = new_qoco_csc_matrix(Pcsc); QOCOCscMatrix* Preg = - regularize_P(num_diagP, data->P, solver->settings->kkt_static_reg, + regularize_P(num_diagP, Pcsc_copy, solver->settings->kkt_static_reg, data->Pnzadded_idx); - data->P = Preg; + free_qoco_matrix(data->P); + data->P = new_qoco_matrix(Preg); + free_qoco_csc_matrix(Preg); } else { - data->P = construct_identity(n, solver->settings->kkt_static_reg); + QOCOCscMatrix* Pid = construct_identity(n, solver->settings->kkt_static_reg); + data->P = new_qoco_matrix(Pid); + free_qoco_csc_matrix(Pid); data->Pnum_nzadded = n; } @@ -120,10 +129,10 @@ QOCOInt qoco_setup(QOCOSolver* solver, QOCOInt n, QOCOInt m, QOCOInt p, } // Allocate primal and dual variables. - solver->work->x = qoco_malloc(n * sizeof(QOCOFloat)); - solver->work->s = qoco_malloc(m * sizeof(QOCOFloat)); - solver->work->y = qoco_malloc(p * sizeof(QOCOFloat)); - solver->work->z = qoco_malloc(m * sizeof(QOCOFloat)); + solver->work->x = new_qoco_vectorf(NULL, n); + solver->work->s = new_qoco_vectorf(NULL, m); + solver->work->y = new_qoco_vectorf(NULL, p); + solver->work->z = new_qoco_vectorf(NULL, m); solver->work->mu = 0.0; // Allocate Nesterov-Todd scalings and scaled variables. 
@@ -154,9 +163,9 @@ QOCOInt qoco_setup(QOCOSolver* solver, QOCOInt n, QOCOInt m, QOCOInt p, solver->work->ubuff2 = qoco_malloc(m * sizeof(QOCOFloat)); solver->work->ubuff3 = qoco_malloc(m * sizeof(QOCOFloat)); solver->work->Ds = qoco_malloc(m * sizeof(QOCOFloat)); - solver->work->rhs = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); + solver->work->rhs = new_qoco_vectorf(NULL, n + m + p); solver->work->kktres = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); - solver->work->xyz = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); + solver->work->xyz = new_qoco_vectorf(NULL, n + m + p); solver->work->xyzbuff1 = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); solver->work->xyzbuff2 = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); @@ -231,23 +240,29 @@ void update_vector_data(QOCOSolver* solver, QOCOFloat* cnew, QOCOFloat* bnew, // Update cost vector. if (cnew) { + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat* Druiz_data = get_data_vectorf(solver->work->scaling->Druiz); for (QOCOInt i = 0; i < data->n; ++i) { - data->c[i] = - solver->work->scaling->k * solver->work->scaling->Druiz[i] * cnew[i]; + cdata[i] = + solver->work->scaling->k * Druiz_data[i] * cnew[i]; } } // Update equality constraint vector. if (bnew) { + QOCOFloat* bdata = get_data_vectorf(data->b); + QOCOFloat* Eruiz_data = get_data_vectorf(solver->work->scaling->Eruiz); for (QOCOInt i = 0; i < data->p; ++i) { - data->b[i] = solver->work->scaling->Eruiz[i] * bnew[i]; + bdata[i] = Eruiz_data[i] * bnew[i]; } } // Update conic constraint vector. if (hnew) { + QOCOFloat* hdata = get_data_vectorf(data->h); + QOCOFloat* Fruiz_data = get_data_vectorf(solver->work->scaling->Fruiz); for (QOCOInt i = 0; i < data->m; ++i) { - data->h[i] = solver->work->scaling->Fruiz[i] * hnew[i]; + hdata[i] = Fruiz_data[i] * hnew[i]; } } } @@ -260,57 +275,76 @@ void update_matrix_data(QOCOSolver* solver, QOCOFloat* Pxnew, QOCOFloat* Axnew, QOCOScaling* scaling = solver->work->scaling; // Undo regularization. - unregularize(data->P, solver->settings->kkt_static_reg); + QOCOCscMatrix* Pcsc = get_csc_matrix(data->P); + unregularize(Pcsc, solver->settings->kkt_static_reg); // Unequilibrate P. - scale_arrayf(data->P->x, data->P->x, scaling->kinv, data->P->nnz); - row_col_scale(data->P, scaling->Dinvruiz, scaling->Dinvruiz); + QOCOFloat* Px = Pcsc->x; + QOCOInt Pnnz = get_nnz(data->P); + QOCOFloat* Dinvruiz_data = get_data_vectorf(scaling->Dinvruiz); + scale_arrayf(Px, Px, scaling->kinv, Pnnz); + row_col_scale(Pcsc, Dinvruiz_data, Dinvruiz_data); // Unequilibrate c. - scale_arrayf(data->c, data->c, scaling->kinv, data->n); - ew_product(data->c, scaling->Dinvruiz, data->c, data->n); + QOCOFloat* cdata = get_data_vectorf(data->c); + scale_arrayf(cdata, cdata, scaling->kinv, data->n); + ew_product(cdata, Dinvruiz_data, cdata, data->n); // Unequilibrate A. - row_col_scale(data->A, scaling->Einvruiz, scaling->Dinvruiz); + QOCOCscMatrix* Acsc = get_csc_matrix(data->A); + QOCOFloat* Einvruiz_data = get_data_vectorf(scaling->Einvruiz); + row_col_scale(Acsc, Einvruiz_data, Dinvruiz_data); // Unequilibrate G. - row_col_scale(data->G, scaling->Finvruiz, scaling->Dinvruiz); + QOCOCscMatrix* Gcsc = get_csc_matrix(data->G); + QOCOFloat* Finvruiz_data = get_data_vectorf(scaling->Finvruiz); + row_col_scale(Gcsc, Finvruiz_data, Dinvruiz_data); // Unequilibrate b. - ew_product(data->b, scaling->Einvruiz, data->b, data->p); + QOCOFloat* bdata = get_data_vectorf(data->b); + ew_product(bdata, Einvruiz_data, bdata, data->p); // Unequilibrate h. 
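For reference, the sequence above inverts the Ruiz scaling exactly. With hats denoting the equilibrated quantities held internally,

$$P = k^{-1} D^{-1} \hat{P} D^{-1}, \quad c = k^{-1} D^{-1} \hat{c}, \quad A = E^{-1} \hat{A} D^{-1}, \quad G = F^{-1} \hat{G} D^{-1}, \quad b = E^{-1} \hat{b}, \quad h = F^{-1} \hat{h},$$

where $D = \mathrm{diag}(\texttt{Druiz})$, $E = \mathrm{diag}(\texttt{Eruiz})$, $F = \mathrm{diag}(\texttt{Fruiz})$, and $k$ is the cost scaling.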
- ew_product(data->h, scaling->Finvruiz, data->h, data->m); + QOCOFloat* hdata = get_data_vectorf(data->h); + ew_product(hdata, Finvruiz_data, hdata, data->m); // Update P and avoid nonzeros that were added for regularization. if (Pxnew) { QOCOInt avoid = - data->Pnum_nzadded > 0 ? data->Pnzadded_idx[0] : data->P->nnz + 1; + data->Pnum_nzadded > 0 ? data->Pnzadded_idx[0] : Pnnz + 1; QOCOInt offset = 0; - for (QOCOInt i = 0; i < data->P->nnz - data->Pnum_nzadded; ++i) { + for (QOCOInt i = 0; i < Pnnz - data->Pnum_nzadded; ++i) { if (i == avoid) { offset++; avoid = data->Pnzadded_idx[offset]; } else { - data->P->x[i + offset] = Pxnew[i]; + Px[i + offset] = Pxnew[i]; } } } // Update A. if (Axnew) { - for (QOCOInt i = 0; i < data->A->nnz; ++i) { - data->A->x[i] = Axnew[i]; - data->At->x[i] = Axnew[data->AtoAt[i]]; + QOCOCscMatrix* Atcsc = get_csc_matrix(data->At); + QOCOFloat* Ax = Acsc->x; + QOCOFloat* Atx = Atcsc->x; + QOCOInt Annz = get_nnz(data->A); + for (QOCOInt i = 0; i < Annz; ++i) { + Ax[i] = Axnew[i]; + Atx[i] = Axnew[data->AtoAt[i]]; } } // Update G. if (Gxnew) { - for (QOCOInt i = 0; i < data->G->nnz; ++i) { - data->G->x[i] = Gxnew[i]; - data->Gt->x[i] = Gxnew[data->GtoGt[i]]; + QOCOCscMatrix* Gtcsc = get_csc_matrix(data->Gt); + QOCOFloat* Gx = Gcsc->x; + QOCOFloat* Gtx = Gtcsc->x; + QOCOInt Gnnz = get_nnz(data->G); + for (QOCOInt i = 0; i < Gnnz; ++i) { + Gx[i] = Gxnew[i]; + Gtx[i] = Gxnew[data->GtoGt[i]]; } } @@ -319,7 +353,7 @@ void update_matrix_data(QOCOSolver* solver, QOCOFloat* Pxnew, QOCOFloat* Axnew, solver->settings->ruiz_iters); // Regularize P. - unregularize(data->P, -solver->settings->kkt_static_reg); + unregularize(get_csc_matrix(data->P), -solver->settings->kkt_static_reg); solver->linsys->linsys_update_data(solver->linsys_data, solver->work->data); } @@ -340,28 +374,40 @@ QOCOInt qoco_solve(QOCOSolver* solver) print_header(solver); } + // Set solve phase flag for CUDA backend (prevents CPU-GPU copies during solve) + // During solve phase, get_data_vectorf returns device pointers automatically + #ifdef QOCO_ALGEBRA_BACKEND_CUDA + extern void set_solve_phase(int active); + set_solve_phase(1); + #endif + // Get initializations for primal and dual variables. initialize_ipm(solver); for (QOCOInt i = 1; i <= solver->settings->max_iters; ++i) { // Compute kkt residual. - compute_kkt_residual(data, work->x, work->y, work->s, work->z, work->kktres, + compute_kkt_residual(data, get_data_vectorf(work->x), get_data_vectorf(work->y), get_data_vectorf(work->s), get_data_vectorf(work->z), work->kktres, solver->settings->kkt_static_reg, work->xyzbuff1, work->xbuff, work->ubuff1, work->ubuff2); // Compute objective function. solver->sol->obj = - compute_objective(data, work->x, work->xbuff, + compute_objective(data, get_data_vectorf(work->x), work->xbuff, solver->settings->kkt_static_reg, work->scaling->k); // Compute mu = s'*z / m. work->mu = (data->m > 0) - ? safe_div(qoco_dot(work->s, work->z, data->m), data->m) + ? safe_div(qoco_dot(get_data_vectorf(work->s), get_data_vectorf(work->z), data->m), data->m) : 0; // Check stopping criteria. 
if (check_stopping(solver)) { stop_timer(&(work->solve_timer)); + // Clear solve phase flag before copying solution (allows copy from device to host) + #ifdef QOCO_ALGEBRA_BACKEND_CUDA + extern void set_solve_phase(int active); + set_solve_phase(0); + #endif unscale_variables(work); copy_solution(solver); if (solver->settings->verbose) { @@ -389,9 +435,14 @@ QOCOInt qoco_solve(QOCOSolver* solver) } } - stop_timer(&(work->solve_timer)); - unscale_variables(work); - copy_solution(solver); + stop_timer(&(work->solve_timer)); + // Clear solve phase flag before copying solution (allows copy from device to host) + #ifdef QOCO_ALGEBRA_BACKEND_CUDA + extern void set_solve_phase(int active); + set_solve_phase(0); + #endif + unscale_variables(work); + copy_solution(solver); solver->sol->status = QOCO_MAX_ITER; if (solver->settings->verbose) { print_footer(solver->sol, solver->sol->status); @@ -403,17 +454,19 @@ QOCOInt qoco_cleanup(QOCOSolver* solver) { // Free problem data. - free_qoco_csc_matrix(solver->work->data->P); - free_qoco_csc_matrix(solver->work->data->A); - free_qoco_csc_matrix(solver->work->data->G); - free_qoco_csc_matrix(solver->work->data->At); - free_qoco_csc_matrix(solver->work->data->Gt); + if (solver->work->data->P) { + free_qoco_matrix(solver->work->data->P); + } + free_qoco_matrix(solver->work->data->A); + free_qoco_matrix(solver->work->data->G); + free_qoco_matrix(solver->work->data->At); + free_qoco_matrix(solver->work->data->Gt); qoco_free(solver->work->data->AtoAt); qoco_free(solver->work->data->GtoGt); - qoco_free(solver->work->data->b); - qoco_free(solver->work->data->c); - qoco_free(solver->work->data->h); + free_qoco_vectorf(solver->work->data->b); + free_qoco_vectorf(solver->work->data->c); + free_qoco_vectorf(solver->work->data->h); qoco_free(solver->work->data->q); qoco_free(solver->work->data->Pnzadded_idx); qoco_free(solver->work->data); @@ -422,15 +475,15 @@ QOCOInt qoco_cleanup(QOCOSolver* solver) solver->linsys->linsys_cleanup(solver->linsys_data); // Free primal and dual variables. - qoco_free(solver->work->rhs); + free_qoco_vectorf(solver->work->rhs); qoco_free(solver->work->kktres); - qoco_free(solver->work->xyz); + free_qoco_vectorf(solver->work->xyz); qoco_free(solver->work->xyzbuff1); qoco_free(solver->work->xyzbuff2); - qoco_free(solver->work->x); - qoco_free(solver->work->s); - qoco_free(solver->work->y); - qoco_free(solver->work->z); + free_qoco_vectorf(solver->work->x); + free_qoco_vectorf(solver->work->s); + free_qoco_vectorf(solver->work->y); + free_qoco_vectorf(solver->work->z); // Free Nesterov-Todd scalings and scaled variables. qoco_free(solver->work->W); @@ -449,13 +502,13 @@ QOCOInt qoco_cleanup(QOCOSolver* solver) qoco_free(solver->work->Ds); // Free scaling struct. - qoco_free(solver->work->scaling->delta); - qoco_free(solver->work->scaling->Druiz); - qoco_free(solver->work->scaling->Eruiz); - qoco_free(solver->work->scaling->Fruiz); - qoco_free(solver->work->scaling->Dinvruiz); - qoco_free(solver->work->scaling->Einvruiz); - qoco_free(solver->work->scaling->Finvruiz); + free_qoco_vectorf(solver->work->scaling->delta); + free_qoco_vectorf(solver->work->scaling->Druiz); + free_qoco_vectorf(solver->work->scaling->Eruiz); + free_qoco_vectorf(solver->work->scaling->Fruiz); + free_qoco_vectorf(solver->work->scaling->Dinvruiz); + free_qoco_vectorf(solver->work->scaling->Einvruiz); + free_qoco_vectorf(solver->work->scaling->Finvruiz); qoco_free(solver->work->scaling); // Free solution struct. 
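The set_solve_phase(1)/set_solve_phase(0) pairing added to qoco_solve above is the synchronization contract with the CUDA backend: while the flag is set, get_data_vectorf hands out device pointers so the interior-point loop never triggers host-device traffic; once it is cleared, host pointers come back so the solution can be copied out. The cuda/ backend sources are not part of the hunks shown here, so the sketch below only illustrates that contract; the typedefs and the d_data field name are assumptions, and a real implementation would track staleness rather than copy eagerly (presumably the job of the sync_vector_to_device_if_needed helper declared in src/qoco_utils.c below).

```c
/* Illustration only: the real cuda/ backend is not shown in this patch.
 * Typedefs and the d_data field name are assumptions. */
#include <cuda_runtime.h>

typedef double QOCOFloat; /* assumed default float type */
typedef int QOCOInt;

typedef struct {
  QOCOInt len;
  QOCOFloat* data;   /* host copy */
  QOCOFloat* d_data; /* device copy (assumed field name) */
} QOCOVectorf;

static int solve_phase_active = 0;

void set_solve_phase(int active) { solve_phase_active = active; }

QOCOFloat* get_data_vectorf(const QOCOVectorf* v)
{
  if (solve_phase_active) {
    /* Hot path: the IPM iterations stay entirely on the device. */
    return v->d_data;
  }
  /* Outside the solve phase, refresh the host copy before handing it out,
   * e.g. so copy_solution and the tests read current values. */
  cudaMemcpy(v->data, v->d_data, (size_t)v->len * sizeof(QOCOFloat),
             cudaMemcpyDeviceToHost);
  return v->data;
}
```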
diff --git a/src/qoco_utils.c b/src/qoco_utils.c index e19b092..488c427 100644 --- a/src/qoco_utils.c +++ b/src/qoco_utils.c @@ -10,6 +10,10 @@ #include "qoco_utils.h" +#ifdef QOCO_ALGEBRA_BACKEND_CUDA +extern void sync_vector_to_device_if_needed(QOCOVectorf* v); +#endif + void print_qoco_csc_matrix(QOCOCscMatrix* M) { printf("\nPrinting CSC Matrix:\n"); @@ -87,9 +91,9 @@ void print_header(QOCOSolver* solver) printf("| eq constraints: %-9d |\n", data->p); printf("| ineq constraints: %-9d |\n", data->l); printf("| soc constraints: %-9d |\n", data->nsoc); - printf("| nnz(P): %-9d |\n", data->P->nnz - solver->work->data->Pnum_nzadded); - printf("| nnz(A): %-9d |\n", data->A->nnz); - printf("| nnz(G): %-9d |\n", data->G->nnz); + printf("| nnz(P): %-9d |\n", (data->P ? get_nnz(data->P) : 0) - solver->work->data->Pnum_nzadded); + printf("| nnz(A): %-9d |\n", get_nnz(data->A)); + printf("| nnz(G): %-9d |\n", get_nnz(data->G)); printf("| Solver Settings: |\n"); printf("| max_iters: %-3d abstol: %3.2e reltol: %3.2e |\n", settings->max_iters, settings->abstol, settings->reltol); printf("| abstol_inacc: %3.2e reltol_inacc: %3.2e |\n", settings->abstol_inacc, settings->reltol_inacc); @@ -133,55 +137,71 @@ unsigned char check_stopping(QOCOSolver* solver) QOCOFloat eabsinacc = solver->settings->abstol_inacc; QOCOFloat erelinacc = solver->settings->reltol_inacc; - ew_product(work->scaling->Einvruiz, data->b, work->ybuff, data->p); + QOCOFloat* Einvruiz_data = get_data_vectorf(work->scaling->Einvruiz); + QOCOFloat* bdata = get_data_vectorf(data->b); + ew_product(Einvruiz_data, bdata, work->ybuff, data->p); QOCOFloat binf = data->p > 0 ? inf_norm(work->ybuff, data->p) : 0; - ew_product(work->scaling->Fruiz, work->s, work->ubuff1, data->m); + QOCOFloat* Fruiz_data = get_data_vectorf(work->scaling->Fruiz); + QOCOFloat* sdata = get_data_vectorf(work->s); + ew_product(Fruiz_data, sdata, work->ubuff1, data->m); QOCOFloat sinf = data->m > 0 ? inf_norm(work->ubuff1, data->m) : 0; - ew_product(work->scaling->Dinvruiz, work->x, work->xbuff, data->n); + QOCOFloat* Dinvruiz_data = get_data_vectorf(work->scaling->Dinvruiz); + QOCOFloat* xdata = get_data_vectorf(work->x); + ew_product(Dinvruiz_data, xdata, work->xbuff, data->n); QOCOFloat cinf = inf_norm(work->xbuff, data->n); - ew_product(work->scaling->Finvruiz, data->h, work->ubuff3, data->m); + QOCOFloat* Finvruiz_data = get_data_vectorf(work->scaling->Finvruiz); + QOCOFloat* hdata = get_data_vectorf(data->h); + ew_product(Finvruiz_data, hdata, work->ubuff3, data->m); QOCOFloat hinf = data->m > 0 ? inf_norm(work->ubuff3, data->m) : 0; // Compute ||A^T * y||_\infty. If equality constraints aren't present, A->m = // A->n = 0 and SpMtv is a nullop. - SpMtv(data->A, work->y, work->xbuff); - ew_product(work->xbuff, work->scaling->Dinvruiz, work->xbuff, data->n); + QOCOFloat* ydata = get_data_vectorf(work->y); + SpMtv_matrix(data->A, ydata, work->xbuff); + ew_product(work->xbuff, Dinvruiz_data, work->xbuff, data->n); QOCOFloat Atyinf = data->p ? inf_norm(work->xbuff, data->n) : 0; // Compute ||G^T * z||_\infty. If inequality constraints aren't present, G->m // = G->n = 0 and SpMtv is a nullop. - SpMtv(data->G, work->z, work->xbuff); - ew_product(work->xbuff, work->scaling->Dinvruiz, work->xbuff, data->n); + QOCOFloat* zdata = get_data_vectorf(work->z); + SpMtv_matrix(data->G, zdata, work->xbuff); + ew_product(work->xbuff, Dinvruiz_data, work->xbuff, data->n); QOCOFloat Gtzinf = data->m > 0 ? 
inf_norm(work->xbuff, data->n) : 0; // Compute ||P * x||_\infty - USpMv(data->P, work->x, work->xbuff); + if (data->P) { + USpMv_matrix(data->P, xdata, work->xbuff); + } else { + for (QOCOInt i = 0; i < data->n; ++i) { + work->xbuff[i] = 0.0; + } + } for (QOCOInt i = 0; i < data->n; ++i) { - work->xbuff[i] -= solver->settings->kkt_static_reg * work->x[i]; + work->xbuff[i] -= solver->settings->kkt_static_reg * xdata[i]; } - ew_product(work->xbuff, work->scaling->Dinvruiz, work->xbuff, data->n); + ew_product(work->xbuff, Dinvruiz_data, work->xbuff, data->n); QOCOFloat Pxinf = inf_norm(work->xbuff, data->n); - QOCOFloat xPx = qoco_dot(work->x, work->xbuff, work->data->n); + QOCOFloat xPx = qoco_dot(xdata, work->xbuff, work->data->n); // Compute ||A * x||_\infty - SpMv(data->A, work->x, work->ybuff); - ew_product(work->ybuff, work->scaling->Einvruiz, work->ybuff, data->p); + SpMv_matrix(data->A, xdata, work->ybuff); + ew_product(work->ybuff, Einvruiz_data, work->ybuff, data->p); QOCOFloat Axinf = data->p ? inf_norm(work->ybuff, data->p) : 0; // Compute ||G * x||_\infty - SpMv(data->G, work->x, work->ubuff1); - ew_product(work->ubuff1, work->scaling->Finvruiz, work->ubuff1, data->m); + SpMv_matrix(data->G, xdata, work->ubuff1); + ew_product(work->ubuff1, Finvruiz_data, work->ubuff1, data->m); QOCOFloat Gxinf = data->m ? inf_norm(work->ubuff1, data->m) : 0; // Compute primal residual. - ew_product(&work->kktres[data->n], work->scaling->Einvruiz, work->ybuff, + ew_product(&work->kktres[data->n], Einvruiz_data, work->ybuff, data->p); QOCOFloat eq_res = inf_norm(work->ybuff, data->p); - ew_product(&work->kktres[data->n + data->p], work->scaling->Finvruiz, + ew_product(&work->kktres[data->n + data->p], Finvruiz_data, work->ubuff1, data->m); QOCOFloat conic_res = inf_norm(work->ubuff1, data->m); @@ -189,14 +209,14 @@ unsigned char check_stopping(QOCOSolver* solver) solver->sol->pres = pres; // Compute dual residual. - ew_product(work->kktres, work->scaling->Dinvruiz, work->xbuff, data->n); + ew_product(work->kktres, Dinvruiz_data, work->xbuff, data->n); scale_arrayf(work->xbuff, work->xbuff, work->scaling->kinv, data->n); QOCOFloat dres = inf_norm(work->xbuff, data->n); solver->sol->dres = dres; // Compute complementary slackness residual. - ew_product(work->s, work->scaling->Fruiz, work->ubuff1, data->m); - ew_product(work->z, work->scaling->Fruiz, work->ubuff2, data->m); + ew_product(sdata, Fruiz_data, work->ubuff1, data->m); + ew_product(zdata, Fruiz_data, work->ubuff2, data->m); QOCOFloat gap = qoco_dot(work->ubuff1, work->ubuff2, data->m); gap *= work->scaling->kinv; solver->sol->gap = gap; @@ -214,9 +234,10 @@ unsigned char check_stopping(QOCOSolver* solver) dres_rel *= work->scaling->kinv; // Compute max{1, abs(pobj), abs(dobj)}. - QOCOFloat ctx = qoco_dot(work->data->c, work->x, work->data->n); - QOCOFloat bty = qoco_dot(work->data->b, work->y, work->data->p); - QOCOFloat htz = qoco_dot(work->data->h, work->z, work->data->m); + QOCOFloat* cdata = get_data_vectorf(work->data->c); + QOCOFloat ctx = qoco_dot(cdata, xdata, work->data->n); + QOCOFloat bty = qoco_dot(bdata, ydata, work->data->p); + QOCOFloat htz = qoco_dot(hdata, zdata, work->data->m); QOCOFloat pobj = 0.5 * xPx + ctx; QOCOFloat dobj = -0.5 * xPx - bty - htz; @@ -251,11 +272,21 @@ unsigned char check_stopping(QOCOSolver* solver) void copy_solution(QOCOSolver* solver) { - // Copy optimization variables. 
- copy_arrayf(solver->work->x, solver->sol->x, solver->work->data->n); - copy_arrayf(solver->work->s, solver->sol->s, solver->work->data->m); - copy_arrayf(solver->work->y, solver->sol->y, solver->work->data->p); - copy_arrayf(solver->work->z, solver->sol->z, solver->work->data->m); + // Copy optimization variables into the solution struct. The solve-phase + // flag was cleared before this call, so get_data_vectorf returns host + // pointers; the sync calls below first reconcile host and device copies. +#ifdef QOCO_ALGEBRA_BACKEND_CUDA + sync_vector_to_device_if_needed(solver->work->rhs); + sync_vector_to_device_if_needed(solver->work->xyz); + sync_vector_to_device_if_needed(solver->work->x); + sync_vector_to_device_if_needed(solver->work->s); + sync_vector_to_device_if_needed(solver->work->y); + sync_vector_to_device_if_needed(solver->work->z); +#endif + copy_arrayf(get_data_vectorf(solver->work->x), solver->sol->x, solver->work->data->n); + copy_arrayf(get_data_vectorf(solver->work->s), solver->sol->s, solver->work->data->m); + copy_arrayf(get_data_vectorf(solver->work->y), solver->sol->y, solver->work->data->p); + copy_arrayf(get_data_vectorf(solver->work->z), solver->sol->z, solver->work->data->m); solver->sol->solve_time_sec = get_elapsed_time_sec(&(solver->work->solve_timer)); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index be0cd37..f942ef2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -64,6 +64,11 @@ foreach(file ${unit_tests}) "${PROJECT_SOURCE_DIR}/tests/main.cpp" "${PROJECT_SOURCE_DIR}/tests/utils/test_utils.cpp") target_link_libraries("${name}" qocostatic gtest_main) + # Link CUDA libraries for CUDA backend + if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda") + get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS) + target_link_libraries("${name}" ${qoco_cuda_libs}) + endif() add_test(NAME ${name} COMMAND "${name}") endforeach() @@ -78,6 +83,11 @@ foreach(file ${simple_tests}) "${PROJECT_SOURCE_DIR}/tests/main.cpp" "${PROJECT_SOURCE_DIR}/tests/utils/test_utils.cpp") target_link_libraries("${name}" qocostatic gtest_main) + # Link CUDA libraries for CUDA backend + if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda") + get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS) + target_link_libraries("${name}" ${qoco_cuda_libs}) + endif() add_test(NAME ${name} COMMAND "${name}") endforeach() @@ -95,6 +105,11 @@ foreach(file ${ocp}) "${PROJECT_SOURCE_DIR}/tests/main.cpp" "${PROJECT_SOURCE_DIR}/tests/utils/test_utils.cpp") target_link_libraries("${name}" qocostatic gtest_main) + # Link CUDA libraries for CUDA backend + if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda") + get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS) + target_link_libraries("${name}" ${qoco_cuda_libs}) + endif() add_test(NAME ${name} COMMAND "${name}") endforeach() @@ -108,5 +123,10 @@ foreach(file ${portfolio}) "${PROJECT_SOURCE_DIR}/tests/main.cpp" "${PROJECT_SOURCE_DIR}/tests/utils/test_utils.cpp") target_link_libraries("${name}" qocostatic gtest_main) + # Link CUDA libraries for CUDA backend + if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda") + get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS) + target_link_libraries("${name}" ${qoco_cuda_libs}) + endif() add_test(NAME ${name} COMMAND "${name}") endforeach() \ No newline at end of file diff --git a/tests/simple_tests/missing_constraints_test.cpp b/tests/simple_tests/missing_constraints_test.cpp index 60f4330..ef0fbf3 100644 --- a/tests/simple_tests/missing_constraints_test.cpp +++ 
b/tests/simple_tests/missing_constraints_test.cpp @@ -59,7 +59,7 @@ TEST(missing_constraints_test, no_soc_constraints) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -169,7 +169,7 @@ TEST(missing_constraints_test, no_eq_constraints) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -263,7 +263,7 @@ TEST(missing_constraints_test, lp_test_no_P) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -328,7 +328,7 @@ TEST(simple_socp_test, p1) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -395,7 +395,7 @@ TEST(simple_socp_test, p2) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -462,7 +462,7 @@ TEST(simple_socp_test, p3) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -591,7 +591,7 @@ TEST(simple_socp_test, reduced_tolerance) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED_INACCURATE); qoco_cleanup(solver); @@ -666,7 +666,7 @@ TEST(simple_socp_test, update_vector_data_test) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -738,7 +738,7 @@ TEST(simple_socp_test, update_constraint_data_test) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); diff --git a/tests/unit_tests/linalg_test.cpp b/tests/unit_tests/linalg_test.cpp index 8f0418a..8135472 100644 --- a/tests/unit_tests/linalg_test.cpp +++ b/tests/unit_tests/linalg_test.cpp @@ -409,9 +409,9 @@ TEST(linalg, ruiz_test) qoco_setup(solver, n, m, p, P, c, A, b, G, h, l, nsoc, q, settings); - 
expect_eq_vectorf(solver->work->scaling->Druiz, Dexp, n, tol); - expect_eq_vectorf(solver->work->scaling->Eruiz, Eexp, p, tol); - expect_eq_vectorf(solver->work->scaling->Fruiz, Fexp, m, tol); + expect_eq_vectorf(get_data_vectorf(solver->work->scaling->Druiz), Dexp, n, tol); + expect_eq_vectorf(get_data_vectorf(solver->work->scaling->Eruiz), Eexp, p, tol); + expect_eq_vectorf(get_data_vectorf(solver->work->scaling->Fruiz), Fexp, m, tol); EXPECT_NEAR(solver->work->scaling->k, kexp, tol); qoco_cleanup(solver);
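Two things are worth noting about the test changes. First, the repeated zexp comparisons in missing_constraints_test.cpp were genuine bugs: the conic dual z (like the slack s) has m entries, one per row of G, so comparing n entries read past the end of zexp whenever n != m. Second, ruiz_test must now go through get_data_vectorf because the scaling vectors are QOCOVectorf wrappers rather than raw arrays. A plain-C stand-in for the comparison helper is sketched below to make the role of the length argument explicit; the real expect_eq_vectorf lives in tests/utils/test_utils.cpp, which this patch does not show.

```c
/* Plain-C stand-in for the expect_eq_vectorf gtest helper (the real helper
 * in tests/utils/test_utils.cpp is not shown in this patch). */
#include <math.h>

typedef double QOCOFloat; /* assumed default float type */
typedef int QOCOInt;

static int arrays_match(const QOCOFloat* actual, const QOCOFloat* expected,
                        QOCOInt len, QOCOFloat tol)
{
  /* len must match the variable's dimension: n for x, p for y, and m for
   * s and z -- hence the zexp fixes from n to m above. */
  for (QOCOInt i = 0; i < len; ++i) {
    if (fabs(actual[i] - expected[i]) > tol) {
      return 0; /* mismatch at index i */
    }
  }
  return 1;
}
```

Called as arrays_match(get_data_vectorf(solver->work->scaling->Druiz), Dexp, n, tol), this mirrors the updated ruiz_test: the accessor yields a host pointer outside the solve phase on every backend, so the comparison itself stays backend-agnostic.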