diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2841539..51be777 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,12 +1,21 @@
 cmake_minimum_required(VERSION 3.18)
 
+# Set CUDA flags early for the compiler ID test.
+# Check if the cuda backend is requested before the project() call.
+if(DEFINED QOCO_ALGEBRA_BACKEND AND QOCO_ALGEBRA_BACKEND STREQUAL "cuda")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler")
+  set(CMAKE_CUDA_FLAGS_INIT "${CMAKE_CUDA_FLAGS_INIT} -allow-unsupported-compiler")
+  # Also set via the cache to ensure it is picked up during compiler detection.
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler" CACHE STRING "Flags for CUDA compiler")
+endif()
+
 set(QOCO_VERSION_MAJOR "0")
 set(QOCO_VERSION_MINOR "1")
 set(QOCO_VERSION_PATCH "6")
 set(QOCO_VERSION ${QOCO_VERSION_MAJOR}.${QOCO_VERSION_MINOR}.${QOCO_VERSION_PATCH})
 
 # Project name
-project(qoco VERSION ${QOCO_VERSION})
+project(qoco VERSION ${QOCO_VERSION} LANGUAGES C CXX)
 
 message(STATUS "Building QOCO v${QOCO_VERSION}")
 
 # Detect operating system.
@@ -112,15 +121,21 @@ endif()
 
 set(QOCO_ALGEBRA_BACKEND "builtin" CACHE STRING "The Algebra to use")
+
+# CUDA flags are set early (before project()) if needed.
 add_subdirectory(algebra)
 get_property(qoco_sources GLOBAL PROPERTY QOCO_SOURCES)
 get_property(qoco_headers GLOBAL PROPERTY QOCO_HEADERS)
 get_property(qoco_include GLOBAL PROPERTY QOCO_INCLUDE)
 
+# Get CUDA libraries if using the CUDA backend.
+get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS)
+
 # Build qoco shared library.
 add_library(qoco SHARED)
 target_link_libraries(qoco qdldlobject amd)
+target_link_libraries(qoco ${qoco_cuda_libs})
 
 if(IS_LINUX OR IS_MACOS)
   target_link_libraries(qoco m)
@@ -129,9 +144,25 @@ endif()
 target_include_directories(qoco PUBLIC ${qoco_include})
 target_sources(qoco PRIVATE ${qoco_sources})
 
+# Enable CUDA if using the CUDA backend.
+if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda")
+  enable_language(CUDA)
+  set_target_properties(qoco PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+  )
+  # Add the HAVE_CUDSS definition if the cuDSS library was found.
+  get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS)
+  if("${qoco_cuda_libs}" MATCHES "cudss")
+    target_compile_definitions(qoco PRIVATE HAVE_CUDSS)
+    message(STATUS "Added HAVE_CUDSS to qoco target")
+  endif()
+endif()
+
 # Build qoco static library.
 add_library(qocostatic STATIC)
 target_link_libraries(qocostatic qdldlobject amd)
+target_link_libraries(qocostatic ${qoco_cuda_libs})
 
 if(IS_LINUX OR IS_MACOS)
   target_link_libraries(qocostatic m)
@@ -140,6 +171,22 @@ endif()
 target_include_directories(qocostatic PUBLIC ${qoco_include})
 target_sources(qocostatic PRIVATE ${qoco_sources})
 
+# Enable CUDA if using the CUDA backend.
+if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda")
+  set_target_properties(qocostatic PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+  )
+  # Add the HAVE_CUDSS definition if the cuDSS library was found.
+  get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS)
+  if("${qoco_cuda_libs}" MATCHES "cudss")
+    target_compile_definitions(qocostatic PRIVATE HAVE_CUDSS)
+  endif()
+endif()
+
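+# Example (illustrative): selecting the CUDA backend at configure time. The
+# backend must be passed on the first configure so the early flag block at
+# the top of this file runs before project():
+#
+#   cmake -B build -DQOCO_ALGEBRA_BACKEND=cuda
+#   cmake --build build
+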
 # Build qoco demo.
 if(BUILD_QOCO_DEMO)
   add_executable(qoco_demo ${PROJECT_SOURCE_DIR}/examples/qoco_demo.c)
diff --git a/algebra/CMakeLists.txt b/algebra/CMakeLists.txt
index 5d5560d..9d2b2fd 100644
--- a/algebra/CMakeLists.txt
+++ b/algebra/CMakeLists.txt
@@ -3,4 +3,13 @@ if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${QOCO_ALGEBRA_BACKEND})
 endif()
 
 set_property(GLOBAL APPEND PROPERTY QOCO_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/${QOCO_ALGEBRA_BACKEND})
+set_property(GLOBAL APPEND PROPERTY QOCO_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Set backend define
+if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda")
+  add_compile_definitions(QOCO_ALGEBRA_BACKEND_CUDA)
+else()
+  add_compile_definitions(QOCO_ALGEBRA_BACKEND_BUILTIN)
+endif()
+
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${QOCO_ALGEBRA_BACKEND})
diff --git a/algebra/backend.h b/algebra/backend.h
new file mode 100644
index 0000000..adeb4c5
--- /dev/null
+++ b/algebra/backend.h
@@ -0,0 +1,27 @@
+/**
+ * @file backend.h
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ *
+ * @section DESCRIPTION
+ *
+ * Includes the appropriate backend header based on QOCO_ALGEBRA_BACKEND.
+ */
+
+#ifndef QOCO_BACKEND_H
+#define QOCO_BACKEND_H
+
+#if defined(QOCO_ALGEBRA_BACKEND_BUILTIN) || !defined(QOCO_ALGEBRA_BACKEND_CUDA)
+#include "builtin/qdldl_backend.h"
+#elif defined(QOCO_ALGEBRA_BACKEND_CUDA)
+#include "cuda/cudss_backend.h"
+#else
+#error "Unknown algebra backend. Define QOCO_ALGEBRA_BACKEND_BUILTIN or QOCO_ALGEBRA_BACKEND_CUDA"
+#endif
+
+#endif /* #ifndef QOCO_BACKEND_H */
diff --git a/algebra/builtin/builtin_linalg.c b/algebra/builtin/builtin_linalg.c
index 153ff3e..8aaaba3 100644
--- a/algebra/builtin/builtin_linalg.c
+++ b/algebra/builtin/builtin_linalg.c
@@ -8,7 +8,162 @@
  * This source code is licensed under the BSD 3-Clause License
  */
 
-#include "qoco_linalg.h"
+#include "builtin_types.h"
+
+QOCOMatrix* new_qoco_matrix(const QOCOCscMatrix* A)
+{
+  QOCOMatrix* M = qoco_malloc(sizeof(QOCOMatrix));
+  QOCOCscMatrix* Mcsc = qoco_malloc(sizeof(QOCOCscMatrix));
+
+  if (A) {
+    QOCOInt m = A->m;
+    QOCOInt n = A->n;
+    QOCOInt nnz = A->nnz;
+
+    QOCOFloat* x = qoco_malloc(nnz * sizeof(QOCOFloat));
+    QOCOInt* p = qoco_malloc((n + 1) * sizeof(QOCOInt));
+    QOCOInt* i = qoco_malloc(nnz * sizeof(QOCOInt));
+
+    copy_arrayf(A->x, x, nnz);
+    copy_arrayi(A->i, i, nnz);
+    copy_arrayi(A->p, p, n + 1);
+
+    Mcsc->m = m;
+    Mcsc->n = n;
+    Mcsc->nnz = nnz;
+    Mcsc->x = x;
+    Mcsc->i = i;
+    Mcsc->p = p;
+  }
+  else {
+    Mcsc->m = 0;
+    Mcsc->n = 0;
+    Mcsc->nnz = 0;
+    Mcsc->x = NULL;
+    Mcsc->i = NULL;
+    Mcsc->p = NULL;
+  }
+
+  M->csc = Mcsc;
+
+  return M;
+}
+
+QOCOVectorf* new_qoco_vectorf(const QOCOFloat* x, QOCOInt n)
+{
+  QOCOVectorf* v = qoco_malloc(sizeof(QOCOVectorf));
+  QOCOFloat* vdata = qoco_malloc(sizeof(QOCOFloat) * n);
+  if (x) {
+    copy_arrayf(x, vdata, n);
+  }
+  else {
+    // Initialize to zero if x is NULL.
+    for (QOCOInt i = 0; i < n; ++i) {
+      vdata[i] = 0.0;
+    }
+  }
+
+  v->len = n;
+  v->data = vdata;
+
+  return v;
+}
+
+void free_qoco_matrix(QOCOMatrix* A)
+{
+  free_qoco_csc_matrix(A->csc);
+  qoco_free(A);
+}
+
+void free_qoco_vectorf(QOCOVectorf* x)
+{
+  qoco_free(x->data);
+  qoco_free(x);
+}
+
+QOCOInt get_nnz(const QOCOMatrix* A) { return A->csc->nnz; }
+
+QOCOFloat get_element_vectorf(const QOCOVectorf* x, QOCOInt idx)
+{
+  return x->data[idx];
+}
+
+QOCOFloat* get_pointer_vectorf(const QOCOVectorf* x, QOCOInt idx)
+{
+  return &x->data[idx];
+}
+
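+/* Example (illustrative; variable names and values are hypothetical) of the
+ * opaque vector API implemented in this file:
+ *
+ *   QOCOFloat xdata[3] = {1.0, 2.0, 3.0};
+ *   QOCOVectorf* v = new_qoco_vectorf(xdata, 3); // deep copy of xdata
+ *   set_element_vectorf(v, 0, 4.0);              // v = [4, 2, 3]
+ *   QOCOFloat x0 = get_element_vectorf(v, 0);    // x0 == 4.0
+ *   free_qoco_vectorf(v);
+ */
+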
+QOCOFloat* get_data_vectorf(const QOCOVectorf* x)
+{
+  return x->data;
+}
+
+QOCOInt get_length_vectorf(const QOCOVectorf* x)
+{
+  return x->len;
+}
+
+void sync_vector_to_device_if_needed(QOCOVectorf* v)
+{
+  // No-op for builtin backend (no device memory)
+  (void)v;
+}
+
+void set_solve_phase(int active)
+{
+  // No-op for builtin backend (no device memory)
+  (void)active;
+}
+
+int get_solve_phase(void)
+{
+  // Always return 0 for builtin backend (no device memory)
+  return 0;
+}
+
+void copy_vector_from_device(QOCOVectorf* src, QOCOFloat* dst, QOCOInt n)
+{
+  // For builtin backend, just copy from host memory
+  if (src && dst) {
+    copy_arrayf(src->data, dst, n);
+  }
+}
+
+QOCOCscMatrix* get_csc_matrix(const QOCOMatrix* M)
+{
+  return M->csc;
+}
+
+void col_inf_norm_USymm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  col_inf_norm_USymm(get_csc_matrix(M), norm);
+}
+
+void col_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  col_inf_norm(get_csc_matrix(M), norm);
+}
+
+void row_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  row_inf_norm(get_csc_matrix(M), norm);
+}
+
+void row_col_scale_matrix(QOCOMatrix* M, const QOCOFloat* E, const QOCOFloat* D)
+{
+  row_col_scale(get_csc_matrix(M), (QOCOFloat*)E, (QOCOFloat*)D);
+}
+
+void set_element_vectorf(QOCOVectorf* x, QOCOInt idx, QOCOFloat data)
+{
+  x->data[idx] = data;
+}
+
+void reciprocal_vectorf(const QOCOVectorf* input, QOCOVectorf* output)
+{
+  for (QOCOInt i = 0; i < input->len; ++i) {
+    output->data[i] = safe_div(1.0, input->data[i]);
+  }
+}
 
 QOCOCscMatrix* new_qoco_csc_matrix(const QOCOCscMatrix* A)
 {
@@ -181,6 +336,21 @@ void SpMtv(const QOCOCscMatrix* M, const QOCOFloat* v, QOCOFloat* r)
   }
 }
 
+void USpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  USpMv(M->csc, v, r);
+}
+
+void SpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  SpMv(M->csc, v, r);
+}
+
+void SpMtv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  SpMtv(M->csc, v, r);
+}
+
 QOCOFloat inf_norm(const QOCOFloat* x, QOCOInt n)
 {
   qoco_assert(x || n == 0);
diff --git a/algebra/builtin/builtin_types.h b/algebra/builtin/builtin_types.h
new file mode 100644
index 0000000..8321ba3
--- /dev/null
+++ b/algebra/builtin/builtin_types.h
@@ -0,0 +1,36 @@
+/**
+ * @file builtin_types.h
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ *
+ * @section DESCRIPTION
+ *
+ * Defines the vector and matrix types for builtin linear algebra.
+ */
+
+#ifndef BUILTIN_TYPES_H
+#define BUILTIN_TYPES_H
+
+#include "common_linalg.h"
+#include "definitions.h"
+#include "qoco_linalg.h"
+
+struct QOCOVectori_ {
+  QOCOInt* data;
+  QOCOInt len;
+};
+
+struct QOCOVectorf_ {
+  QOCOFloat* data;
+  QOCOInt len;
+};
+
+struct QOCOMatrix_ {
+  QOCOCscMatrix* csc;
+};
+
+#endif /* ifndef BUILTIN_TYPES_H */
diff --git a/algebra/builtin/qdldl_backend.c b/algebra/builtin/qdldl_backend.c
index cd7c3e0..309c0e9 100644
--- a/algebra/builtin/qdldl_backend.c
+++ b/algebra/builtin/qdldl_backend.c
@@ -9,7 +9,6 @@
  */
 
 #include "qdldl_backend.h"
-#include
 
 // Contains data for linear system.
 struct LinSysData {
@@ -95,20 +94,22 @@ static LinSysData* qdldl_setup(QOCOProblemData* data, QOCOSettings* settings,
 
   // Allocate memory for mappings to KKT matrix.
   linsys_data->nt2kkt = qoco_calloc(Wnnz, sizeof(QOCOInt));
   linsys_data->ntdiag2kkt = qoco_calloc(data->m, sizeof(QOCOInt));
-  linsys_data->PregtoKKT = qoco_calloc(data->P->nnz, sizeof(QOCOInt));
-  linsys_data->AttoKKT = qoco_calloc(data->A->nnz, sizeof(QOCOInt));
-  linsys_data->GttoKKT = qoco_calloc(data->G->nnz, sizeof(QOCOInt));
+  linsys_data->PregtoKKT =
+      data->P ? qoco_calloc(get_nnz(data->P), sizeof(QOCOInt)) : NULL;
+  linsys_data->AttoKKT = qoco_calloc(get_nnz(data->A), sizeof(QOCOInt));
+  linsys_data->GttoKKT = qoco_calloc(get_nnz(data->G), sizeof(QOCOInt));
 
   QOCOInt* nt2kkt_temp = qoco_calloc(Wnnz, sizeof(QOCOInt));
   QOCOInt* ntdiag2kkt_temp = qoco_calloc(data->m, sizeof(QOCOInt));
-  QOCOInt* PregtoKKT_temp = qoco_calloc(data->P->nnz, sizeof(QOCOInt));
-  QOCOInt* AttoKKT_temp = qoco_calloc(data->A->nnz, sizeof(QOCOInt));
-  QOCOInt* GttoKKT_temp = qoco_calloc(data->G->nnz, sizeof(QOCOInt));
+  QOCOInt* PregtoKKT_temp =
+      data->P ? qoco_calloc(get_nnz(data->P), sizeof(QOCOInt)) : NULL;
+  QOCOInt* AttoKKT_temp = qoco_calloc(get_nnz(data->A), sizeof(QOCOInt));
+  QOCOInt* GttoKKT_temp = qoco_calloc(get_nnz(data->G), sizeof(QOCOInt));
 
   linsys_data->K = construct_kkt(
-      data->P, data->A, data->G, data->At, data->Gt, settings->kkt_static_reg,
-      data->n, data->m, data->p, data->l, data->nsoc, data->q, PregtoKKT_temp,
-      AttoKKT_temp, GttoKKT_temp, nt2kkt_temp, ntdiag2kkt_temp, Wnnz);
+      data->P ? get_csc_matrix(data->P) : NULL, get_csc_matrix(data->A),
+      get_csc_matrix(data->G), get_csc_matrix(data->At),
+      get_csc_matrix(data->Gt), settings->kkt_static_reg, data->n, data->m,
+      data->p, data->l, data->nsoc, data->q, PregtoKKT_temp, AttoKKT_temp,
+      GttoKKT_temp, nt2kkt_temp, ntdiag2kkt_temp, Wnnz);
 
   // Compute AMD ordering.
   linsys_data->p = qoco_malloc(linsys_data->K->n * sizeof(QOCOInt));
@@ -133,15 +134,17 @@ static LinSysData* qdldl_setup(QOCOProblemData* data, QOCOSettings* settings,
     linsys_data->ntdiag2kkt[i] = KtoPKPt[ntdiag2kkt_temp[i]];
   }
 
-  for (QOCOInt i = 0; i < data->P->nnz; ++i) {
-    linsys_data->PregtoKKT[i] = KtoPKPt[PregtoKKT_temp[i]];
+  if (data->P && PregtoKKT_temp) {
+    for (QOCOInt i = 0; i < get_nnz(data->P); ++i) {
+      linsys_data->PregtoKKT[i] = KtoPKPt[PregtoKKT_temp[i]];
+    }
   }
 
-  for (QOCOInt i = 0; i < data->A->nnz; ++i) {
+  for (QOCOInt i = 0; i < get_nnz(data->A); ++i) {
     linsys_data->AttoKKT[i] = KtoPKPt[AttoKKT_temp[i]];
   }
 
-  for (QOCOInt i = 0; i < data->G->nnz; ++i) {
+  for (QOCOInt i = 0; i < get_nnz(data->G); ++i) {
     linsys_data->GttoKKT[i] = KtoPKPt[GttoKKT_temp[i]];
   }
 
@@ -253,18 +256,25 @@ static void qdldl_update_nt(LinSysData* linsys_data, QOCOFloat* WtW,
 static void qdldl_update_data(LinSysData* linsys_data, QOCOProblemData* data)
 {
   // Update P in KKT matrix.
-  for (QOCOInt i = 0; i < data->P->nnz; ++i) {
-    linsys_data->K->x[linsys_data->PregtoKKT[i]] = data->P->x[i];
+  if (data->P && linsys_data->PregtoKKT) {
+    QOCOCscMatrix* Pcsc = get_csc_matrix(data->P);
+    for (QOCOInt i = 0; i < get_nnz(data->P); ++i) {
+      linsys_data->K->x[linsys_data->PregtoKKT[i]] = Pcsc->x[i];
+    }
   }
 
   // Update A in KKT matrix.
-  for (QOCOInt i = 0; i < data->A->nnz; ++i) {
-    linsys_data->K->x[linsys_data->AttoKKT[data->AtoAt[i]]] = data->A->x[i];
+  QOCOCscMatrix* Acsc = get_csc_matrix(data->A);
+  for (QOCOInt i = 0; i < get_nnz(data->A); ++i) {
+    linsys_data->K->x[linsys_data->AttoKKT[data->AtoAt[i]]] = Acsc->x[i];
   }
 
   // Update G in KKT matrix.
-  for (QOCOInt i = 0; i < data->G->nnz; ++i) {
-    linsys_data->K->x[linsys_data->GttoKKT[data->GtoGt[i]]] = data->G->x[i];
+  QOCOCscMatrix* Gcsc = get_csc_matrix(data->G);
+  for (QOCOInt i = 0; i < get_nnz(data->G); ++i) {
+    linsys_data->K->x[linsys_data->GttoKKT[data->GtoGt[i]]] = Gcsc->x[i];
   }
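+
+  /* Illustration (hypothetical values): the updates above go through two
+   * index maps. For A, entry i of A (CSC order) first maps to its position
+   * in the cached transpose At via data->AtoAt[i], and that At position
+   * maps to a slot in the KKT values array via linsys_data->AttoKKT. E.g.
+   * with AtoAt = {2, 0, 1} and AttoKKT = {7, 9, 4}, A->x[0] lands in
+   * K->x[AttoKKT[AtoAt[0]]] = K->x[4]. P needs no transpose map because
+   * only its upper triangle enters K directly. */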
 }
diff --git a/algebra/builtin/qdldl_backend.h b/algebra/builtin/qdldl_backend.h
index dc7b2b5..cb13dbd 100644
--- a/algebra/builtin/qdldl_backend.h
+++ b/algebra/builtin/qdldl_backend.h
@@ -16,6 +16,7 @@
 #define QDLDL_BACKEND_H
 
 #include "amd.h"
+#include "builtin_types.h"
 #include "common_linalg.h"
 #include "kkt.h"
 #include "qdldl.h"
diff --git a/algebra/cuda/CMakeLists.txt b/algebra/cuda/CMakeLists.txt
new file mode 100644
index 0000000..faae4e6
--- /dev/null
+++ b/algebra/cuda/CMakeLists.txt
@@ -0,0 +1,54 @@
+# Set CUDA flags before enabling language to allow unsupported GCC.
+# Set CMAKE_CUDA_FLAGS_INIT before enable_language so it's used during the
+# compiler ID test.
+set(CMAKE_CUDA_FLAGS_INIT "-allow-unsupported-compiler")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler")
+set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+# Set environment variable as fallback for compiler ID test.
+set(ENV{CMAKE_CUDA_FLAGS} "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler")
+
+enable_language(CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+find_library(CUBLAS_LIB cublas PATHS ${CUDAToolkit_LIBRARY_DIR})
+find_library(CUSPARSE_LIB cusparse PATHS ${CUDAToolkit_LIBRARY_DIR})
+
+# Search for cuDSS in CU_DSS_ROOT first, then system paths.
+if(DEFINED ENV{CU_DSS_ROOT})
+  find_library(CUDSS_LIB cudss PATHS "$ENV{CU_DSS_ROOT}/lib" NO_DEFAULT_PATH)
+endif()
+if(NOT CUDSS_LIB)
+  find_library(CUDSS_LIB cudss PATHS ${CUDAToolkit_LIBRARY_DIR})
+endif()
+
+if(NOT CUDSS_LIB)
+  message(WARNING "cuDSS library not found. You may need to install cuDSS separately.")
+  message(STATUS "cuDSS can be obtained from the NVIDIA Developer website")
+  message(STATUS "Building with cuDSS stubs - full functionality requires a cuDSS installation")
+  # Create a dummy library variable to avoid errors.
+  set(CUDSS_LIB "")
+else()
+  # Add HAVE_CUDSS definition - use CMAKE_CUDA_FLAGS to ensure it is passed
+  # to the CUDA compiler.
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DHAVE_CUDSS")
+  add_compile_definitions(HAVE_CUDSS)
+  # Add include directory for cuDSS headers.
+  if(DEFINED ENV{CU_DSS_ROOT})
+    include_directories("$ENV{CU_DSS_ROOT}/include")
+  endif()
+  message(STATUS "Found cuDSS library: ${CUDSS_LIB}")
+  message(STATUS "Added HAVE_CUDSS definition to CUDA flags")
+endif()
+
+set_property(GLOBAL APPEND PROPERTY QOCO_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/cudss_backend.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_linalg.cu)
+set_property(GLOBAL APPEND PROPERTY QOCO_HEADERS
+  ${CMAKE_CURRENT_SOURCE_DIR}/cudss_backend.h)
+
+# Add CUDA libraries to a global property (used in the main CMakeLists.txt).
+set_property(GLOBAL APPEND PROPERTY QOCO_CUDA_LIBS
+  ${CUBLAS_LIB}
+  ${CUSPARSE_LIB}
+)
+
+if(CUDSS_LIB)
+  set_property(GLOBAL APPEND PROPERTY QOCO_CUDA_LIBS ${CUDSS_LIB})
+endif()
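+
+# Example (illustrative; the install path is hypothetical): pointing the
+# cuDSS search above at an installation outside the default paths:
+#
+#   export CU_DSS_ROOT=/opt/nvidia/cudss
+#   cmake -B build -DQOCO_ALGEBRA_BACKEND=cuda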
diff --git a/algebra/cuda/cuda_linalg.cu b/algebra/cuda/cuda_linalg.cu
new file mode 100644
index 0000000..5ebad7c
--- /dev/null
+++ b/algebra/cuda/cuda_linalg.cu
@@ -0,0 +1,811 @@
+/**
+ * @file cuda_linalg.cu
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ */
+
+#include "cuda_types.h"
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+
+extern "C" {
+#include "common_linalg.h"
+}
+
+#define CUDA_CHECK(call)                                                       \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (err != cudaSuccess) {                                                  \
+      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,         \
+              cudaGetErrorString(err));                                        \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+// CUDA kernels
+__global__ void copy_arrayf_kernel(const QOCOFloat* x, QOCOFloat* y, QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = x[idx];
+  }
+}
+
+__global__ void copy_and_negate_arrayf_kernel(const QOCOFloat* x, QOCOFloat* y,
+                                              QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = -x[idx];
+  }
+}
+
+__global__ void copy_arrayi_kernel(const QOCOInt* x, QOCOInt* y, QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = x[idx];
+  }
+}
+
+__global__ void scale_arrayf_kernel(const QOCOFloat* x, QOCOFloat* y,
+                                    QOCOFloat s, QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = s * x[idx];
+  }
+}
+
+__global__ void axpy_kernel(const QOCOFloat* x, const QOCOFloat* y,
+                            QOCOFloat* z, QOCOFloat a, QOCOInt n)
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    z[idx] = a * x[idx] + y[idx];
+  }
+}
+
+__global__ void inf_norm_kernel(const QOCOFloat* x, QOCOFloat* result,
+                                QOCOInt n)
+{
+  extern __shared__ QOCOFloat sdata[];
+  QOCOInt tid = threadIdx.x;
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  QOCOFloat val = (idx < n) ? fabs(x[idx]) : 0.0;
+  sdata[tid] = val;
+  __syncthreads();
+
+  for (QOCOInt s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      // fmax (not fmaxf) so QOCOFloat == double is not truncated to float.
+      sdata[tid] = fmax(sdata[tid], sdata[tid + s]);
+    }
+    __syncthreads();
+  }
+
+  if (tid == 0) {
+    result[blockIdx.x] = sdata[0];
+  }
+}
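+
+// Example launch (illustrative; d_x and d_partial are hypothetical device
+// arrays): inf_norm_kernel uses dynamically sized shared memory, so the
+// launch must pass blockSize * sizeof(QOCOFloat) as the third launch
+// parameter:
+//
+//   const QOCOInt blockSize = 256;
+//   const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+//   inf_norm_kernel<<<numBlocks, blockSize, blockSize * sizeof(QOCOFloat)>>>(
+//       d_x, d_partial, n);
+//   // d_partial then holds one per-block maximum; the host reduces it.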
+
+// Round n up to a multiple of 256, with a minimum of 256.
+static inline QOCOInt get_block_size(QOCOInt n)
+{
+  QOCOInt rounded = ((n + 255) / 256) * 256;
+  return rounded < 256 ? 256 : rounded;
+}
+
+// Convert CSC to CSR format (host conversion, then copy to device).
+// Used for converting matrices to GPU format after setup/equilibration.
+static void csc_to_csr(const QOCOCscMatrix* csc, QOCOInt** csr_row_ptr,
+                       QOCOInt** csr_col_ind, QOCOFloat** csr_val)
+{
+  QOCOInt m = csc->m;
+  QOCOInt n = csc->n;
+  QOCOInt nnz = csc->nnz;
+
+  // Allocate CSR arrays on host.
+  QOCOInt* h_csr_row_ptr = (QOCOInt*)qoco_calloc((m + 1), sizeof(QOCOInt));
+  QOCOInt* h_csr_col_ind = (QOCOInt*)qoco_malloc(nnz * sizeof(QOCOInt));
+  QOCOFloat* h_csr_val = (QOCOFloat*)qoco_malloc(nnz * sizeof(QOCOFloat));
+
+  // Count nonzeros per row.
+  for (QOCOInt col = 0; col < n; ++col) {
+    for (QOCOInt idx = csc->p[col]; idx < csc->p[col + 1]; ++idx) {
+      QOCOInt row = csc->i[idx];
+      h_csr_row_ptr[row + 1]++;
+    }
+  }
+
+  // Compute row pointers (prefix sum).
+  for (QOCOInt i = 0; i < m; ++i) {
+    h_csr_row_ptr[i + 1] += h_csr_row_ptr[i];
+  }
+
+  // Temporary array to track the current position in each row.
+  QOCOInt* row_pos = (QOCOInt*)qoco_malloc(m * sizeof(QOCOInt));
+  for (QOCOInt i = 0; i < m; ++i) {
+    row_pos[i] = h_csr_row_ptr[i];
+  }
+
+  // Fill CSR arrays.
+  for (QOCOInt col = 0; col < n; ++col) {
+    for (QOCOInt idx = csc->p[col]; idx < csc->p[col + 1]; ++idx) {
+      QOCOInt row = csc->i[idx];
+      QOCOInt csr_idx = row_pos[row]++;
+      h_csr_col_ind[csr_idx] = col;
+      h_csr_val[csr_idx] = csc->x[idx];
+    }
+  }
+
+  qoco_free(row_pos);
+
+  // Allocate device memory and copy.
+  CUDA_CHECK(cudaMalloc(csr_row_ptr, (m + 1) * sizeof(QOCOInt)));
+  CUDA_CHECK(cudaMalloc(csr_col_ind, nnz * sizeof(QOCOInt)));
+  CUDA_CHECK(cudaMalloc(csr_val, nnz * sizeof(QOCOFloat)));
+
+  CUDA_CHECK(cudaMemcpy(*csr_row_ptr, h_csr_row_ptr, (m + 1) * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(*csr_col_ind, h_csr_col_ind, nnz * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(*csr_val, h_csr_val, nnz * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+
+  // Free host memory.
+  qoco_free(h_csr_row_ptr);
+  qoco_free(h_csr_col_ind);
+  qoco_free(h_csr_val);
+}
+
+extern "C" {
+
+// Host implementation that manages GPU memory.
+QOCOMatrix* new_qoco_matrix(const QOCOCscMatrix* A)
+{
+  QOCOMatrix* M = (QOCOMatrix*)qoco_malloc(sizeof(QOCOMatrix));
+
+  if (A) {
+    QOCOInt m = A->m;
+    QOCOInt n = A->n;
+    QOCOInt nnz = A->nnz;
+
+    // Allocate host memory.
+    QOCOCscMatrix* Mcsc = (QOCOCscMatrix*)qoco_malloc(sizeof(QOCOCscMatrix));
+    QOCOFloat* x = (QOCOFloat*)qoco_malloc(nnz * sizeof(QOCOFloat));
+    QOCOInt* p = (QOCOInt*)qoco_malloc((n + 1) * sizeof(QOCOInt));
+    QOCOInt* i = (QOCOInt*)qoco_malloc(nnz * sizeof(QOCOInt));
+
+    copy_arrayf(A->x, x, nnz);
+    copy_arrayi(A->i, i, nnz);
+    copy_arrayi(A->p, p, n + 1);
+
+    Mcsc->m = m;
+    Mcsc->n = n;
+    Mcsc->nnz = nnz;
+    Mcsc->x = x;
+    Mcsc->i = i;
+    Mcsc->p = p;
+
+    // For problem matrices (P, A, G), don't convert to CSR yet.
+    // Equilibration happens on the CPU using CSC, then the KKT matrix is
+    // formed; only the KKT matrix needs to be converted to CSR for cuDSS.
+    // So leave d_csr NULL - it is set up when the KKT matrix is created.
+    M->csc = Mcsc;
+    M->d_csr = NULL;
+  }
+  else {
+    QOCOCscMatrix* Mcsc = (QOCOCscMatrix*)qoco_malloc(sizeof(QOCOCscMatrix));
+    Mcsc->m = 0;
+    Mcsc->n = 0;
+    Mcsc->nnz = 0;
+    Mcsc->x = NULL;
+    Mcsc->i = NULL;
+    Mcsc->p = NULL;
+    M->csc = Mcsc;
+    M->d_csr = NULL;
+  }
+
+  return M;
+}
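+
+/* Worked example (illustrative) of the CSC-to-CSR conversion above for the
+ * 2x2 matrix [1 2; 0 3]:
+ *   CSC input:  p = {0, 1, 3},  i = {0, 0, 1},  x = {1, 2, 3}
+ *   CSR output: row_ptr = {0, 2, 3}, col_ind = {0, 1, 1}, val = {1, 2, 3}
+ * Column indices within each CSR row come out sorted because the CSC
+ * columns are traversed in increasing order. */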
+
+QOCOVectorf* new_qoco_vectorf(const QOCOFloat* x, QOCOInt n)
+{
+  QOCOVectorf* v = (QOCOVectorf*)qoco_malloc(sizeof(QOCOVectorf));
+  QOCOFloat* vdata = (QOCOFloat*)qoco_malloc(sizeof(QOCOFloat) * n);
+
+  if (x) {
+    copy_arrayf(x, vdata, n);
+  }
+  else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      vdata[i] = 0.0;
+    }
+  }
+
+  QOCOFloat* d_vdata;
+  CUDA_CHECK(cudaMalloc(&d_vdata, sizeof(QOCOFloat) * n));
+  CUDA_CHECK(cudaMemcpy(d_vdata, vdata, sizeof(QOCOFloat) * n, cudaMemcpyHostToDevice));
+
+  v->len = n;
+  v->data = vdata;
+  v->d_data = d_vdata;
+
+  return v;
+}
+
+void free_qoco_matrix(QOCOMatrix* A)
+{
+  if (A) {
+    if (A->csc) {
+      free_qoco_csc_matrix(A->csc);
+    }
+    if (A->d_csr) {
+      if (A->d_csr->row_ptr) cudaFree(A->d_csr->row_ptr);
+      if (A->d_csr->col_ind) cudaFree(A->d_csr->col_ind);
+      if (A->d_csr->val) cudaFree(A->d_csr->val);
+      qoco_free(A->d_csr);
+    }
+    qoco_free(A);
+  }
+}
+
+void free_qoco_vectorf(QOCOVectorf* x)
+{
+  if (x) {
+    if (x->data) qoco_free(x->data);
+    if (x->d_data) cudaFree(x->d_data);
+    qoco_free(x);
+  }
+}
+
+QOCOInt get_nnz(const QOCOMatrix* A) { return A->csc->nnz; }
+
+QOCOFloat get_element_vectorf(const QOCOVectorf* x, QOCOInt idx)
+{
+  return x->data[idx];
+}
+
+QOCOFloat* get_pointer_vectorf(const QOCOVectorf* x, QOCOInt idx)
+{
+  return &x->data[idx];
+}
+
+// Track the solve phase to determine whether to return device or host
+// pointers.
+static int solve_phase_active = 0;
+
+void set_solve_phase(int active)
+{
+  solve_phase_active = active;
+}
+
+int get_solve_phase(void)
+{
+  return solve_phase_active;
+}
+
+// Helper function to copy solution vectors from device to host (called from
+// C code).
+void copy_vector_from_device(QOCOVectorf* src, QOCOFloat* dst, QOCOInt n)
+{
+  if (src && src->d_data && dst) {
+    CUDA_CHECK(cudaMemcpy(dst, src->d_data, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+  }
+}
+
+QOCOFloat* get_data_vectorf(const QOCOVectorf* x)
+{
+  // During equilibration/setup (CPU phase), return the host pointer.
+  // During solve (GPU phase), return the device pointer to avoid CPU-GPU
+  // copies.
+  if (solve_phase_active && x->d_data) {
+    return x->d_data;
+  }
+  return x->data;
+}
+
+QOCOInt get_length_vectorf(const QOCOVectorf* x)
+{
+  return x->len;
+}
+
+void sync_vector_to_device_if_needed(QOCOVectorf* v)
+{
+  if (v && v->data && v->d_data) {
+    // Sync from device to host (at solver termination, copy device data to
+    // host).
+    CUDA_CHECK(cudaMemcpy(v->data, v->d_data, v->len * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+  }
+}
+
+QOCOCscMatrix* get_csc_matrix(const QOCOMatrix* M)
+{
+  return M->csc;
+}
+
+// These functions are only called during equilibration on the CPU, so call
+// the C functions directly.
+void col_inf_norm_USymm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  col_inf_norm_USymm(get_csc_matrix(M), norm);
+}
+
+void col_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  col_inf_norm(get_csc_matrix(M), norm);
+}
+
+void row_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm)
+{
+  row_inf_norm(get_csc_matrix(M), norm);
+}
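+
+/* Example (illustrative; names are hypothetical) of the solve-phase switch
+ * implemented by get_data_vectorf above:
+ *
+ *   QOCOVectorf* v = new_qoco_vectorf(NULL, n);
+ *   QOCOFloat* h = get_data_vectorf(v);  // host pointer (setup phase)
+ *   set_solve_phase(1);
+ *   QOCOFloat* d = get_data_vectorf(v);  // device pointer (solve phase)
+ *   set_solve_phase(0);
+ *   sync_vector_to_device_if_needed(v);  // device -> host copy at termination
+ */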
+
+void row_col_scale_matrix(QOCOMatrix* M, const QOCOFloat* E, const QOCOFloat* D)
+{
+  // Called during setup on the CPU - scale the host CSC.
+  row_col_scale(get_csc_matrix(M), (QOCOFloat*)E, (QOCOFloat*)D);
+
+  // Clear the device CSR if it exists - it will be recreated from the
+  // updated CSC when the KKT matrix is formed.
+  if (M->d_csr) {
+    if (M->d_csr->row_ptr) cudaFree(M->d_csr->row_ptr);
+    if (M->d_csr->col_ind) cudaFree(M->d_csr->col_ind);
+    if (M->d_csr->val) cudaFree(M->d_csr->val);
+    qoco_free(M->d_csr);
+    M->d_csr = NULL;
+  }
+}
+
+void set_element_vectorf(QOCOVectorf* x, QOCOInt idx, QOCOFloat data)
+{
+  // Update host (during setup/equilibration, host is primary).
+  x->data[idx] = data;
+  // Also update the device copy to keep the two in sync.
+  CUDA_CHECK(cudaMemcpy(&x->d_data[idx], &data, sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+}
+
+void reciprocal_vectorf(const QOCOVectorf* input, QOCOVectorf* output)
+{
+  // Setup phase only: compute on the host, then mirror to the device.
+  for (QOCOInt i = 0; i < input->len; ++i) {
+    output->data[i] = safe_div(1.0, input->data[i]);
+  }
+  CUDA_CHECK(cudaMemcpy(output->d_data, output->data, output->len * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+}
+
+QOCOCscMatrix* new_qoco_csc_matrix(const QOCOCscMatrix* A)
+{
+  QOCOCscMatrix* M = (QOCOCscMatrix*)qoco_malloc(sizeof(QOCOCscMatrix));
+
+  if (A) {
+    QOCOInt m = A->m;
+    QOCOInt n = A->n;
+    QOCOInt nnz = A->nnz;
+
+    QOCOFloat* x = (QOCOFloat*)qoco_malloc(nnz * sizeof(QOCOFloat));
+    QOCOInt* p = (QOCOInt*)qoco_malloc((n + 1) * sizeof(QOCOInt));
+    QOCOInt* i = (QOCOInt*)qoco_malloc(nnz * sizeof(QOCOInt));
+
+    copy_arrayf(A->x, x, nnz);
+    copy_arrayi(A->i, i, nnz);
+    copy_arrayi(A->p, p, n + 1);
+
+    M->m = m;
+    M->n = n;
+    M->nnz = nnz;
+    M->x = x;
+    M->i = i;
+    M->p = p;
+  }
+  else {
+    M->m = 0;
+    M->n = 0;
+    M->nnz = 0;
+    M->x = NULL;
+    M->i = NULL;
+    M->p = NULL;
+  }
+
+  return M;
+}
+
+void free_qoco_csc_matrix(QOCOCscMatrix* A)
+{
+  if (A) {
+    free(A->x);
+    free(A->i);
+    free(A->p);
+    free(A);
+  }
+}
+
+void copy_arrayf(const QOCOFloat* x, QOCOFloat* y, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+
+  // If either pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice)) {
+    const QOCOInt blockSize = 256;
+    const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+    if (err_x == cudaSuccess && err_y == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice) {
+      // Both on device - use the kernel.
+      copy_arrayf_kernel<<<numBlocks, blockSize>>>(x, y, n);
+    } else if (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) {
+      // x on device, y on host.
+      CUDA_CHECK(cudaMemcpy(y, x, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+    } else if (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice) {
+      // y on device, x on host.
+      CUDA_CHECK(cudaMemcpy(y, x, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    // Both on host - use the CPU.
+    for (QOCOInt i = 0; i < n; ++i) {
+      y[i] = x[i];
+    }
+  }
+}
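+
+// The host/device dispatch above reappears in most functions below. An
+// equivalent helper (hypothetical sketch, shown only to document the
+// pattern; not used by this file) would be:
+//
+//   static bool qoco_is_device_ptr(const void* p) {
+//     cudaPointerAttributes attrs;
+//     return cudaPointerGetAttributes(&attrs, p) == cudaSuccess &&
+//            attrs.type == cudaMemoryTypeDevice;
+//   }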
+
+void copy_and_negate_arrayf(const QOCOFloat* x, QOCOFloat* y, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  const QOCOInt blockSize = 256;
+  const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+
+  // If either pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice)) {
+    if (err_x == cudaSuccess && err_y == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice) {
+      copy_and_negate_arrayf_kernel<<<numBlocks, blockSize>>>(x, y, n);
+    } else if (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) {
+      // x on device, y on host: stage x on the host and negate there.
+      QOCOFloat* temp = (QOCOFloat*)qoco_malloc(n * sizeof(QOCOFloat));
+      CUDA_CHECK(cudaMemcpy(temp, x, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+      for (QOCOInt i = 0; i < n; ++i) y[i] = -temp[i];
+      qoco_free(temp);
+    } else {
+      // x on host, y on device: stage x on the device and use the kernel.
+      QOCOFloat* d_x;
+      CUDA_CHECK(cudaMalloc(&d_x, n * sizeof(QOCOFloat)));
+      CUDA_CHECK(cudaMemcpy(d_x, x, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+      copy_and_negate_arrayf_kernel<<<numBlocks, blockSize>>>(d_x, y, n);
+      cudaFree(d_x);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      y[i] = -x[i];
+    }
+  }
+}
+
+void copy_arrayi(const QOCOInt* x, QOCOInt* y, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  const QOCOInt blockSize = 256;
+  const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+
+  // If either pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice)) {
+    if (err_x == cudaSuccess && err_y == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice) {
+      copy_arrayi_kernel<<<numBlocks, blockSize>>>(x, y, n);
+    } else if (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) {
+      CUDA_CHECK(cudaMemcpy(y, x, n * sizeof(QOCOInt), cudaMemcpyDeviceToHost));
+    } else if (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice) {
+      CUDA_CHECK(cudaMemcpy(y, x, n * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      y[i] = x[i];
+    }
+  }
+}
+
+QOCOFloat qoco_dot(const QOCOFloat* u, const QOCOFloat* v, QOCOInt n)
+{
+  qoco_assert(u || n == 0);
+  qoco_assert(v || n == 0);
+
+  if (n == 0) return 0.0;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_u, attrs_v;
+  cudaError_t err_u = cudaPointerGetAttributes(&attrs_u, u);
+  cudaError_t err_v = cudaPointerGetAttributes(&attrs_v, v);
+
+  // If either pointer check succeeds and reports device memory, use cuBLAS.
+  if ((err_u == cudaSuccess && attrs_u.type == cudaMemoryTypeDevice) ||
+      (err_v == cudaSuccess && attrs_v.type == cudaMemoryTypeDevice)) {
+    cublasHandle_t handle;
+    cublasCreate(&handle);
+    QOCOFloat result;
+    cublasDdot(handle, n, u, 1, v, 1, &result);
+    cublasDestroy(handle);
+    return result;
+  } else {
+    QOCOFloat x = 0.0;
+    for (QOCOInt i = 0; i < n; ++i) {
+      x += u[i] * v[i];
+    }
+    return x;
+  }
+}
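+
+// Note: the cuBLAS calls in this file (cublasDdot, cublasDcopy, cublasDscal,
+// cublasDaxpy) are the double-precision entry points, so this backend
+// assumes QOCOFloat is double. A single-precision build would need the
+// cublasS* counterparts instead, e.g. (sketch, assuming QOCOFloat == float):
+//
+//   float result;
+//   cublasSdot(handle, n, u, 1, v, 1, &result);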
+
+QOCOInt max_arrayi(const QOCOInt* x, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+
+  if (n == 0) return -QOCOInt_MAX;
+
+  cudaPointerAttributes attrs;
+  cudaError_t err = cudaPointerGetAttributes(&attrs, x);
+
+  if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) {
+    // Copy to the host and compute there.
+    QOCOInt* h_x = (QOCOInt*)qoco_malloc(n * sizeof(QOCOInt));
+    CUDA_CHECK(cudaMemcpy(h_x, x, n * sizeof(QOCOInt), cudaMemcpyDeviceToHost));
+    QOCOInt max = -QOCOInt_MAX;
+    for (QOCOInt i = 0; i < n; ++i) {
+      max = qoco_max(max, h_x[i]);
+    }
+    qoco_free(h_x);
+    return max;
+  } else {
+    QOCOInt max = -QOCOInt_MAX;
+    for (QOCOInt i = 0; i < n; ++i) {
+      max = qoco_max(max, x[i]);
+    }
+    return max;
+  }
+}
+
+void scale_arrayf(const QOCOFloat* x, QOCOFloat* y, QOCOFloat s, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  const QOCOInt blockSize = 256;
+  const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+
+  // If either pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice)) {
+    if (err_x == cudaSuccess && err_y == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice) {
+      scale_arrayf_kernel<<<numBlocks, blockSize>>>(x, y, s, n);
+    } else {
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+      if (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice &&
+          err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeHost) {
+        // x on device, y on (registered) host memory.
+        QOCOFloat* d_y;
+        CUDA_CHECK(cudaMalloc(&d_y, n * sizeof(QOCOFloat)));
+        cublasDcopy(handle, n, x, 1, d_y, 1);
+        cublasDscal(handle, n, &s, d_y, 1);
+        CUDA_CHECK(cudaMemcpy(y, d_y, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+        cudaFree(d_y);
+      } else {
+        // x on host, y on device.
+        QOCOFloat* d_x;
+        CUDA_CHECK(cudaMalloc(&d_x, n * sizeof(QOCOFloat)));
+        CUDA_CHECK(cudaMemcpy(d_x, x, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+        cublasDcopy(handle, n, d_x, 1, y, 1);
+        cublasDscal(handle, n, &s, y, 1);
+        cudaFree(d_x);
+      }
+      cublasDestroy(handle);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      y[i] = s * x[i];
+    }
+  }
+}
+
+void qoco_axpy(const QOCOFloat* x, const QOCOFloat* y, QOCOFloat* z,
+               QOCOFloat a, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+  qoco_assert(y || n == 0);
+
+  if (n == 0) return;
+
+  const QOCOInt blockSize = 256;
+  const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+
+  // Check whether the pointers are on the device - handle errors gracefully.
+  cudaPointerAttributes attrs_x, attrs_y, attrs_z;
+  cudaError_t err_x = cudaPointerGetAttributes(&attrs_x, x);
+  cudaError_t err_y = cudaPointerGetAttributes(&attrs_y, y);
+  cudaError_t err_z = cudaPointerGetAttributes(&attrs_z, z);
+
+  // If any pointer check succeeds and reports device memory, use CUDA.
+  if ((err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ||
+      (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice) ||
+      (err_z == cudaSuccess && attrs_z.type == cudaMemoryTypeDevice)) {
+    if (err_x == cudaSuccess && err_y == cudaSuccess && err_z == cudaSuccess &&
+        attrs_x.type == cudaMemoryTypeDevice &&
+        attrs_y.type == cudaMemoryTypeDevice &&
+        attrs_z.type == cudaMemoryTypeDevice) {
+      axpy_kernel<<<numBlocks, blockSize>>>(x, y, z, a, n);
+    } else {
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+      // For mixed cases, use cuBLAS.
+      if (err_z == cudaSuccess && attrs_z.type == cudaMemoryTypeDevice) {
+        cublasDcopy(handle, n, y, 1, z, 1);
+        cublasDaxpy(handle, n, &a, x, 1, z, 1);
+      } else {
+        // z is on the host: stage the operands on the device, compute there,
+        // and copy the result back.
+        QOCOFloat* d_z;
+        CUDA_CHECK(cudaMalloc(&d_z, n * sizeof(QOCOFloat)));
+        const QOCOFloat* d_x = (err_x == cudaSuccess && attrs_x.type == cudaMemoryTypeDevice) ? x : NULL;
+        const QOCOFloat* d_y = (err_y == cudaSuccess && attrs_y.type == cudaMemoryTypeDevice) ? y : NULL;
+        int x_staged = 0, y_staged = 0;
+        if (!d_x) {
+          CUDA_CHECK(cudaMalloc((void**)&d_x, n * sizeof(QOCOFloat)));
+          CUDA_CHECK(cudaMemcpy((void*)d_x, x, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+          x_staged = 1;
+        }
+        if (!d_y) {
+          CUDA_CHECK(cudaMalloc((void**)&d_y, n * sizeof(QOCOFloat)));
+          CUDA_CHECK(cudaMemcpy((void*)d_y, y, n * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+          y_staged = 1;
+        }
+        cublasDcopy(handle, n, d_y, 1, d_z, 1);
+        cublasDaxpy(handle, n, &a, d_x, 1, d_z, 1);
+        CUDA_CHECK(cudaMemcpy(z, d_z, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+        // Free only the staging copies, never caller-owned device arrays.
+        if (x_staged) cudaFree((void*)d_x);
+        if (y_staged) cudaFree((void*)d_y);
+        cudaFree(d_z);
+      }
+      cublasDestroy(handle);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+  } else {
+    for (QOCOInt i = 0; i < n; ++i) {
+      z[i] = a * x[i] + y[i];
+    }
+  }
+}
+
+// Sparse matrix-vector multiplication. These currently run on the CPU.
+// TODO: Implement a CUDA version using cuSPARSE.
+void USpMv(const QOCOCscMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  qoco_assert(M);
+  qoco_assert(v);
+  qoco_assert(r);
+
+  for (QOCOInt i = 0; i < M->n; i++) {
+    r[i] = 0.0;
+    for (QOCOInt j = M->p[i]; j < M->p[i + 1]; j++) {
+      QOCOInt row = M->i[j];
+      r[row] += M->x[j] * v[i];
+      if (row != i)
+        r[i] += M->x[j] * v[row];
+    }
+  }
+}
+
+void SpMv(const QOCOCscMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  qoco_assert(M);
+  qoco_assert(v);
+  qoco_assert(r);
+
+  for (QOCOInt i = 0; i < M->m; ++i) {
+    r[i] = 0.0;
+  }
+
+  for (QOCOInt j = 0; j < M->n; j++) {
+    for (QOCOInt i = M->p[j]; i < M->p[j + 1]; i++) {
+      r[M->i[i]] += M->x[i] * v[j];
+    }
+  }
+}
+
+void SpMtv(const QOCOCscMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  qoco_assert(M);
+  qoco_assert(v);
+  qoco_assert(r);
+
+  for (QOCOInt i = 0; i < M->n; ++i) {
+    r[i] = 0.0;
+  }
+
+  for (QOCOInt i = 0; i < M->n; i++) {
+    for (QOCOInt j = M->p[i]; j < M->p[i + 1]; j++) {
+      r[i] += M->x[j] * v[M->i[j]];
+    }
+  }
+}
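+
+// Sketch (illustrative, untested) of the cuSPARSE path the TODO above refers
+// to, using the generic SpMV API available since CUDA 11; it assumes
+// QOCOFloat == double and device-resident CSR data (d_row_ptr, d_col_ind,
+// d_val) plus device vectors d_v and d_r, all hypothetical names:
+//
+//   cusparseSpMatDescr_t matA;
+//   cusparseDnVecDescr_t vecX, vecY;
+//   double alpha = 1.0, beta = 0.0;
+//   cusparseCreateCsr(&matA, m, n, nnz, d_row_ptr, d_col_ind, d_val,
+//                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+//                     CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+//   cusparseCreateDnVec(&vecX, n, d_v, CUDA_R_64F);
+//   cusparseCreateDnVec(&vecY, m, d_r, CUDA_R_64F);
+//   size_t bufSize = 0;
+//   cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+//                           matA, vecX, &beta, vecY, CUDA_R_64F,
+//                           CUSPARSE_SPMV_ALG_DEFAULT, &bufSize);
+//   void* dBuf; cudaMalloc(&dBuf, bufSize);
+//   cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA,
+//                vecX, &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT,
+//                dBuf);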
+
+} // extern "C"
+
+// These definitions inherit C linkage from their declarations in
+// qoco_linalg.h, which cuda_types.h includes inside an extern "C" block.
+void USpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  USpMv(M->csc, v, r);
+}
+
+void SpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  SpMv(M->csc, v, r);
+}
+
+void SpMtv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r)
+{
+  SpMtv(M->csc, v, r);
+}
+
+QOCOFloat inf_norm(const QOCOFloat* x, QOCOInt n)
+{
+  qoco_assert(x || n == 0);
+
+  if (n == 0) return 0.0;
+
+  cudaPointerAttributes attrs;
+  cudaError_t err = cudaPointerGetAttributes(&attrs, x);
+
+  if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) {
+    const QOCOInt blockSize = 256;
+    const QOCOInt numBlocks = (n + blockSize - 1) / blockSize;
+    QOCOFloat* d_result;
+    CUDA_CHECK(cudaMalloc(&d_result, numBlocks * sizeof(QOCOFloat)));
+
+    // Pass the dynamic shared memory size required by inf_norm_kernel.
+    inf_norm_kernel<<<numBlocks, blockSize, blockSize * sizeof(QOCOFloat)>>>(
+        x, d_result, n);
+
+    QOCOFloat* h_result = (QOCOFloat*)qoco_malloc(numBlocks * sizeof(QOCOFloat));
+    CUDA_CHECK(cudaMemcpy(h_result, d_result, numBlocks * sizeof(QOCOFloat), cudaMemcpyDeviceToHost));
+
+    QOCOFloat result = 0.0;
+    for (QOCOInt i = 0; i < numBlocks; ++i) {
+      result = qoco_max(result, h_result[i]);
+    }
+
+    qoco_free(h_result);
+    cudaFree(d_result);
+    return result;
+  } else {
+    QOCOFloat norm = 0.0;
+    QOCOFloat xi;
+    for (QOCOInt i = 0; i < n; ++i) {
+      xi = qoco_abs(x[i]);
+      norm = qoco_max(norm, xi);
+    }
+    return norm;
+  }
+}
diff --git a/algebra/cuda/cuda_types.h b/algebra/cuda/cuda_types.h
new file mode 100644
index 0000000..1ffa4a0
--- /dev/null
+++ b/algebra/cuda/cuda_types.h
@@ -0,0 +1,68 @@
+/**
+ * @file cuda_types.h
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ *
+ * @section DESCRIPTION
+ *
+ * Defines the vector and matrix types for CUDA linear algebra.
+ * On the GPU, matrices are stored in CSR format (as required by
+ * cuSPARSE/cuDSS).
+ */
+
+#ifndef CUDA_TYPES_H
+#define CUDA_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "common_linalg.h"
+#include "definitions.h"
+#include "qoco_linalg.h"
+#ifdef __cplusplus
+}
+#endif
+#include <cuda_runtime.h>
+
+// CSR matrix format (for GPU storage).
+typedef struct {
+  /** Number of rows. */
+  QOCOInt m;
+
+  /** Number of columns. */
+  QOCOInt n;
+
+  /** Number of nonzero elements. */
+  QOCOInt nnz;
+
+  /** Row pointers (length: m+1). */
+  QOCOInt* row_ptr;
+
+  /** Column indices (length: nnz). */
+  QOCOInt* col_ind;
+
+  /** Data (length: nnz). */
+  QOCOFloat* val;
+} QOCOCsrMatrix;
+
+struct QOCOVectori_ {
+  QOCOInt* data;   // Host pointer (kept for compatibility; not used during solve)
+  QOCOInt* d_data; // Device pointer (primary storage)
+  QOCOInt len;
+};
+
+struct QOCOVectorf_ {
+  QOCOFloat* data;   // Host pointer (kept for compatibility; not used during solve)
+  QOCOFloat* d_data; // Device pointer (primary storage)
+  QOCOInt len;
+};
+
+struct QOCOMatrix_ {
+  QOCOCscMatrix* csc;   // Host CSC (used for setup/CPU operations)
+  QOCOCsrMatrix* d_csr; // Device CSR (primary GPU storage)
+};
+
+#endif /* ifndef CUDA_TYPES_H */
diff --git a/algebra/cuda/cudss_backend.cu b/algebra/cuda/cudss_backend.cu
new file mode 100644
index 0000000..b6c0732
--- /dev/null
+++ b/algebra/cuda/cudss_backend.cu
@@ -0,0 +1,792 @@
+/**
+ * @file cudss_backend.cu
+ * @author Govind M. Chari
+ *
+ * @section LICENSE
+ *
+ * Copyright (c) 2025, Govind M. Chari
+ * This source code is licensed under the BSD 3-Clause License
+ */
+
+#include "cudss_backend.h"
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cudss.h>
+#include <cusparse.h>
+#include <cstdio>
+#include <cstdlib>
+
+#include "kkt.h"
+#include "common_linalg.h"
+
+#define CUDA_CHECK(call)                                                       \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (err != cudaSuccess) {                                                  \
+      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,         \
+              cudaGetErrorString(err));                                        \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+// Map a cudssStatus_t to a printable name (used by CUDSS_CHECK and by the
+// analysis/factorization error paths below).
+static const char* cudss_status_str(cudssStatus_t err)
+{
+  switch (err) {
+    case CUDSS_STATUS_NOT_INITIALIZED: return "NOT_INITIALIZED";
+    case CUDSS_STATUS_ALLOC_FAILED: return "ALLOC_FAILED";
+    case CUDSS_STATUS_INVALID_VALUE: return "INVALID_VALUE";
+    case CUDSS_STATUS_NOT_SUPPORTED: return "NOT_SUPPORTED";
+    case CUDSS_STATUS_EXECUTION_FAILED: return "EXECUTION_FAILED";
+    case CUDSS_STATUS_INTERNAL_ERROR: return "INTERNAL_ERROR";
+    default: return "UNKNOWN";
+  }
+}
+
+#define CUDSS_CHECK(call)                                                      \
+  do {                                                                         \
+    cudssStatus_t err = call;                                                  \
+    if (err != CUDSS_STATUS_SUCCESS) {                                         \
+      fprintf(stderr, "cuDSS error at %s:%d: status %d (%s)\n", __FILE__,      \
+              __LINE__, (int)err, cudss_status_str(err));                      \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+// Contains data for the linear system.
+struct LinSysData {
+  /** KKT matrix in CSC form (host). */
+  QOCOCscMatrix* K;
+
+  /** KKT matrix in CSR form (device) for cuDSS. */
+  cudssMatrix_t K_csr;
+
+  /** Permutation vector (not used by the CUDA backend - kept for
+   * compatibility). */
+  QOCOInt* p;
+
+  /** Inverse of the permutation vector (not used by the CUDA backend - kept
+   * for compatibility). */
+  QOCOInt* pinv;
+
+  /** cuDSS handle. */
+  cudssHandle_t handle;
+
+  /** cuDSS config. */
+  cudssConfig_t config;
+
+  /** cuDSS data. */
+  cudssData_t data;
+
+  /** Buffer of size n + m + p. */
+  QOCOFloat* xyzbuff1;
+
+  /** Buffer of size n + m + p (device) - used for the b and x vectors. */
+  QOCOFloat* d_xyzbuff1;
+
+  /** Buffer of size n + m + p. */
+  QOCOFloat* xyzbuff2;
+
+  /** Buffer of size n + m + p (device). */
+  QOCOFloat* d_xyzbuff2;
+
+  /** Device buffer for rhs (RHS of the KKT system) - used during the solve
+   * phase. */
+  QOCOFloat* d_rhs;
+
+  /** Device buffer for xyz (solution of the KKT system) - used during the
+   * solve phase. */
+  QOCOFloat* d_xyz;
+
+  /** Mapping from elements in the Nesterov-Todd scaling matrix to elements in
+   * the KKT matrix. */
+  QOCOInt* nt2kkt;
+
+  /** Mapping from elements on the main diagonal of the Nesterov-Todd scaling
+   * matrices to elements in the KKT matrix. Used for regularization. */
+  QOCOInt* ntdiag2kkt;
+
+  /** Mapping from elements in regularized P to elements in the KKT matrix. */
+  QOCOInt* PregtoKKT;
+
+  /** Mapping from elements in At to elements in the KKT matrix. */
+  QOCOInt* AttoKKT;
+
+  /** Mapping from elements in Gt to elements in the KKT matrix. */
+  QOCOInt* GttoKKT;
+
+  QOCOInt Wnnz;
+
+  /** cuSPARSE handle. */
+  cusparseHandle_t cusparse_handle;
+
+  /** Matrix description. */
+  cusparseMatDescr_t descr;
+
+  /** cuBLAS handle (for axpy operations). */
+  cublasHandle_t cublas_handle;
+
+  /** cuDSS dense matrix wrappers for the solution and RHS vectors. */
+  cudssMatrix_t d_rhs_matrix;
+  cudssMatrix_t d_xyz_matrix;
+
+  /** CSR matrix arrays on the GPU (persistent, updated in place). */
+  QOCOInt* d_csr_row_ptr;
+  QOCOInt* d_csr_row_end;
+  QOCOInt* d_csr_col_ind;
+  QOCOFloat* d_csr_val;
+
+  /** Mappings on the GPU for direct updates. */
+  QOCOInt* d_nt2kkt;
+  QOCOInt* d_ntdiag2kkt;
+  QOCOInt* d_PregtoKKT;
+  QOCOInt* d_AttoKKT;
+  QOCOInt* d_GttoKKT;
+
+  /** CSC structure on the GPU (for converting updates). */
+  QOCOInt* d_csc_p;
+  QOCOInt* d_csc_i;
+
+  /** Flag tracking whether symbolic analysis has been done (per instance). */
+  int analysis_done;
+};
+
+// Convert CSC to CSR on the host and copy to the device.
+static void csc_to_csr_device(const QOCOCscMatrix* csc, QOCOInt** csr_row_ptr,
+                              QOCOInt** csr_col_ind, QOCOFloat** csr_val,
+                              cusparseHandle_t handle)
+{
+  QOCOInt n = csc->n;
+  QOCOInt m = csc->m;
+  QOCOInt nnz = csc->nnz;
+
+  // Allocate CSR arrays on host.
+  QOCOInt* h_csr_row_ptr = (QOCOInt*)qoco_calloc((m + 1), sizeof(QOCOInt));
+  QOCOInt* h_csr_col_ind = (QOCOInt*)qoco_malloc(nnz * sizeof(QOCOInt));
+  QOCOFloat* h_csr_val = (QOCOFloat*)qoco_malloc(nnz * sizeof(QOCOFloat));
+
+  // Count nonzeros per row.
+  for (QOCOInt col = 0; col < n; ++col) {
+    for (QOCOInt idx = csc->p[col]; idx < csc->p[col + 1]; ++idx) {
+      QOCOInt row = csc->i[idx];
+      h_csr_row_ptr[row + 1]++;
+    }
+  }
+
+  // Compute row pointers (prefix sum).
+  for (QOCOInt i = 0; i < m; ++i) {
+    h_csr_row_ptr[i + 1] += h_csr_row_ptr[i];
+  }
+
+  // Temporary array to track the current position in each row.
+  QOCOInt* row_pos = (QOCOInt*)qoco_malloc(m * sizeof(QOCOInt));
+  for (QOCOInt i = 0; i < m; ++i) {
+    row_pos[i] = h_csr_row_ptr[i];
+  }
+
+  // Fill CSR arrays.
+  for (QOCOInt col = 0; col < n; ++col) {
+    for (QOCOInt idx = csc->p[col]; idx < csc->p[col + 1]; ++idx) {
+      QOCOInt row = csc->i[idx];
+      QOCOInt csr_idx = row_pos[row]++;
+      h_csr_col_ind[csr_idx] = col;
+      h_csr_val[csr_idx] = csc->x[idx];
+    }
+  }
+
+  qoco_free(row_pos);
+
+  // Allocate device memory.
+  CUDA_CHECK(cudaMalloc(csr_row_ptr, (m + 1) * sizeof(QOCOInt)));
+  CUDA_CHECK(cudaMalloc(csr_col_ind, nnz * sizeof(QOCOInt)));
+  CUDA_CHECK(cudaMalloc(csr_val, nnz * sizeof(QOCOFloat)));
+
+  // Copy to device.
+  CUDA_CHECK(cudaMemcpy(*csr_row_ptr, h_csr_row_ptr, (m + 1) * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(*csr_col_ind, h_csr_col_ind, nnz * sizeof(QOCOInt), cudaMemcpyHostToDevice));
+  CUDA_CHECK(cudaMemcpy(*csr_val, h_csr_val, nnz * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+
+  // Free host memory.
+  qoco_free(h_csr_row_ptr);
+  qoco_free(h_csr_col_ind);
+  qoco_free(h_csr_val);
+}
+
+static LinSysData* cudss_setup(QOCOProblemData* data, QOCOSettings* settings,
+                               QOCOInt Wnnz)
+{
+  QOCOInt Kn = data->n + data->m + data->p;
+
+  LinSysData* linsys_data = (LinSysData*)qoco_malloc(sizeof(LinSysData));
+
+  // Initialize cuDSS.
+#ifdef HAVE_CUDSS
+  CUDSS_CHECK(cudssCreate(&linsys_data->handle));
+  CUDSS_CHECK(cudssConfigCreate(&linsys_data->config));
+  CUDSS_CHECK(cudssDataCreate(linsys_data->handle, &linsys_data->data));
+#else
+  // cuDSS not available - set the handles to NULL.
+  linsys_data->handle = NULL;
+  linsys_data->config = NULL;
+  linsys_data->data = NULL;
+#endif
+
+  // Initialize cuSPARSE.
+  cusparseCreate(&linsys_data->cusparse_handle);
+  cusparseCreateMatDescr(&linsys_data->descr);
+  cusparseSetMatType(linsys_data->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+  cusparseSetMatIndexBase(linsys_data->descr, CUSPARSE_INDEX_BASE_ZERO);
+
+  // Initialize cuBLAS.
+  cublasCreate(&linsys_data->cublas_handle);
+
+  // Allocate vector buffers.
+  linsys_data->xyzbuff1 = (QOCOFloat*)qoco_malloc(sizeof(QOCOFloat) * Kn);
+  linsys_data->xyzbuff2 = (QOCOFloat*)qoco_malloc(sizeof(QOCOFloat) * Kn);
+  CUDA_CHECK(cudaMalloc(&linsys_data->d_xyzbuff1, sizeof(QOCOFloat) * Kn));
+  CUDA_CHECK(cudaMalloc(&linsys_data->d_xyzbuff2, sizeof(QOCOFloat) * Kn));
+  linsys_data->Wnnz = Wnnz;
+
+  // Allocate memory for the mappings to the KKT matrix.
+  linsys_data->nt2kkt = (QOCOInt*)qoco_calloc(Wnnz, sizeof(QOCOInt));
+  linsys_data->ntdiag2kkt = (QOCOInt*)qoco_calloc(data->m, sizeof(QOCOInt));
+  linsys_data->PregtoKKT = data->P ? (QOCOInt*)qoco_calloc(get_nnz(data->P), sizeof(QOCOInt)) : NULL;
+  linsys_data->AttoKKT = (QOCOInt*)qoco_calloc(get_nnz(data->A), sizeof(QOCOInt));
+  linsys_data->GttoKKT = (QOCOInt*)qoco_calloc(get_nnz(data->G), sizeof(QOCOInt));
+
+  QOCOInt* nt2kkt_temp = (QOCOInt*)qoco_calloc(Wnnz, sizeof(QOCOInt));
+  QOCOInt* ntdiag2kkt_temp = (QOCOInt*)qoco_calloc(data->m, sizeof(QOCOInt));
+  QOCOInt* PregtoKKT_temp = data->P ? (QOCOInt*)qoco_calloc(get_nnz(data->P), sizeof(QOCOInt)) : NULL;
+  QOCOInt* AttoKKT_temp = (QOCOInt*)qoco_calloc(get_nnz(data->A), sizeof(QOCOInt));
+  QOCOInt* GttoKKT_temp = (QOCOInt*)qoco_calloc(get_nnz(data->G), sizeof(QOCOInt));
+
+  // Construct the KKT matrix (no permutation for the CUDA backend).
+  linsys_data->K = construct_kkt(
+      data->P ? get_csc_matrix(data->P) : NULL, get_csc_matrix(data->A),
+      get_csc_matrix(data->G), get_csc_matrix(data->At),
+      get_csc_matrix(data->Gt), settings->kkt_static_reg, data->n, data->m,
+      data->p, data->l, data->nsoc, data->q, PregtoKKT_temp, AttoKKT_temp,
+      GttoKKT_temp, nt2kkt_temp, ntdiag2kkt_temp, Wnnz);
+
+  // No AMD ordering or permutation - use the KKT matrix directly with an
+  // identity permutation (no-op).
+  linsys_data->p = (QOCOInt*)qoco_malloc(linsys_data->K->n * sizeof(QOCOInt));
+  linsys_data->pinv = (QOCOInt*)qoco_malloc(linsys_data->K->n * sizeof(QOCOInt));
+  for (QOCOInt i = 0; i < linsys_data->K->n; ++i) {
+    linsys_data->p[i] = i;
+    linsys_data->pinv[i] = i;
+  }
+
+  // Copy the mappings directly (no permutation).
+  for (QOCOInt i = 0; i < Wnnz; ++i) {
+    linsys_data->nt2kkt[i] = nt2kkt_temp[i];
+  }
+  for (QOCOInt i = 0; i < data->m; ++i) {
+    linsys_data->ntdiag2kkt[i] = ntdiag2kkt_temp[i];
+  }
+
+  if (data->P && PregtoKKT_temp) {
+    for (QOCOInt i = 0; i < get_nnz(data->P); ++i) {
+      linsys_data->PregtoKKT[i] = PregtoKKT_temp[i];
+    }
+  }
+
+  for (QOCOInt i = 0; i < get_nnz(data->A); ++i) {
+    linsys_data->AttoKKT[i] = AttoKKT_temp[i];
+  }
+
+  for (QOCOInt i = 0; i < get_nnz(data->G); ++i) {
+    linsys_data->GttoKKT[i] = GttoKKT_temp[i];
+  }
+
+  qoco_free(nt2kkt_temp);
+  qoco_free(ntdiag2kkt_temp);
+  qoco_free(PregtoKKT_temp);
+  qoco_free(AttoKKT_temp);
+  qoco_free(GttoKKT_temp);
+
+  // Convert the KKT matrix from CSC (CPU) to CSR (GPU) for cuDSS - done once
+  // during setup. The KKT matrix is formed on the CPU in CSC form.
+  QOCOInt* csr_row_ptr;
+  QOCOInt* csr_col_ind;
+  QOCOFloat* csr_val;
+
+  csc_to_csr_device(linsys_data->K, &csr_row_ptr, &csr_col_ind, &csr_val,
+                    linsys_data->cusparse_handle);
+
+  // Store the CSR arrays on the GPU persistently (they are updated in place,
+  // not recreated).
+  linsys_data->d_csr_row_ptr = csr_row_ptr;
+  linsys_data->d_csr_col_ind = csr_col_ind;
+  linsys_data->d_csr_val = csr_val;
+
+  // Create the cuDSS matrix in CSR format.
+#ifdef HAVE_CUDSS
+  // cuDSS CSR format: rowEnd may be NULL (cuDSS computes it from rowStart).
+  // Store row_end for potential future use, but pass NULL to cuDSS.
but pass NULL to cuDSS + QOCOInt* csr_row_end; + CUDA_CHECK(cudaMalloc(&csr_row_end, Kn * sizeof(QOCOInt))); + // Copy rowStart[1..n] to rowEnd[0..n-1] + CUDA_CHECK(cudaMemcpy(csr_row_end, &csr_row_ptr[1], Kn * sizeof(QOCOInt), cudaMemcpyDeviceToDevice)); + linsys_data->d_csr_row_end = csr_row_end; + + // Determine data types + cudaDataType_t indexType = CUDA_R_32I; // QOCOInt is int32_t + cudaDataType_t valueType_setup = (sizeof(QOCOFloat) == 8) ? CUDA_R_64F : CUDA_R_32F; + + // KKT matrix is symmetric (upper triangular stored) + // Use CUDSS_MTYPE_SPD (symmetric positive definite) like the working example + // Pass NULL for rowEnd - cuDSS will compute it from rowStart + CUDSS_CHECK(cudssMatrixCreateCsr(&linsys_data->K_csr, + (int64_t)Kn, (int64_t)Kn, (int64_t)linsys_data->K->nnz, + csr_row_ptr, NULL, csr_col_ind, csr_val, + indexType, valueType_setup, + CUDSS_MTYPE_SYMMETRIC, CUDSS_MVIEW_UPPER, CUDSS_BASE_ZERO)); + + // Run analysis once during setup (data structure already created above) + linsys_data->analysis_done = 0; // Will be set to 1 after first analysis + #else + // cuDSS not available - set to NULL (solve will use fallback) + linsys_data->K_csr = NULL; + linsys_data->d_rhs_matrix = NULL; + linsys_data->d_xyz_matrix = NULL; + linsys_data->d_csr_row_ptr = NULL; + linsys_data->d_csr_row_end = NULL; + linsys_data->d_csr_col_ind = NULL; + linsys_data->d_csr_val = NULL; + #endif + + // Copy mappings to GPU for direct updates + CUDA_CHECK(cudaMalloc(&linsys_data->d_nt2kkt, Wnnz * sizeof(QOCOInt))); + CUDA_CHECK(cudaMalloc(&linsys_data->d_ntdiag2kkt, data->m * sizeof(QOCOInt))); + if (data->P && linsys_data->PregtoKKT) { + CUDA_CHECK(cudaMalloc(&linsys_data->d_PregtoKKT, get_nnz(data->P) * sizeof(QOCOInt))); + CUDA_CHECK(cudaMemcpy(linsys_data->d_PregtoKKT, linsys_data->PregtoKKT, get_nnz(data->P) * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + } else { + linsys_data->d_PregtoKKT = NULL; + } + CUDA_CHECK(cudaMalloc(&linsys_data->d_AttoKKT, get_nnz(data->A) * sizeof(QOCOInt))); + CUDA_CHECK(cudaMalloc(&linsys_data->d_GttoKKT, get_nnz(data->G) * sizeof(QOCOInt))); + + CUDA_CHECK(cudaMemcpy(linsys_data->d_nt2kkt, linsys_data->nt2kkt, Wnnz * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(linsys_data->d_ntdiag2kkt, linsys_data->ntdiag2kkt, data->m * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(linsys_data->d_AttoKKT, linsys_data->AttoKKT, get_nnz(data->A) * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(linsys_data->d_GttoKKT, linsys_data->GttoKKT, get_nnz(data->G) * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + + // Copy CSC structure to GPU (for converting updates in factor) + CUDA_CHECK(cudaMalloc(&linsys_data->d_csc_p, (linsys_data->K->n + 1) * sizeof(QOCOInt))); + CUDA_CHECK(cudaMalloc(&linsys_data->d_csc_i, linsys_data->K->nnz * sizeof(QOCOInt))); + CUDA_CHECK(cudaMemcpy(linsys_data->d_csc_p, linsys_data->K->p, (linsys_data->K->n + 1) * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(linsys_data->d_csc_i, linsys_data->K->i, linsys_data->K->nnz * sizeof(QOCOInt), cudaMemcpyHostToDevice)); + + // Allocate device vectors for rhs and xyz + // These will be used by the cuDSS dense matrix wrappers + QOCOFloat* d_rhs_ptr = NULL; + QOCOFloat* d_xyz_ptr = NULL; + CUDA_CHECK(cudaMalloc(&d_rhs_ptr, Kn * sizeof(QOCOFloat))); + CUDA_CHECK(cudaMalloc(&d_xyz_ptr, Kn * sizeof(QOCOFloat))); + + // Store in struct for cleanup + linsys_data->d_xyzbuff1 = d_rhs_ptr; // Used for rhs buffer + linsys_data->d_xyzbuff2 = d_xyz_ptr; 
+
+  // Create dense matrix wrappers for the solution and RHS vectors (column
+  // vectors). d_rhs_matrix wraps d_xyzbuff1 and d_xyz_matrix wraps
+  // d_xyzbuff2.
+#ifdef HAVE_CUDSS
+  CUDSS_CHECK(cudssMatrixCreateDn(&linsys_data->d_rhs_matrix, (int64_t)Kn, 1, (int64_t)Kn, linsys_data->d_xyzbuff1, valueType_setup, CUDSS_LAYOUT_COL_MAJOR));
+  CUDSS_CHECK(cudssMatrixCreateDn(&linsys_data->d_xyz_matrix, (int64_t)Kn, 1, (int64_t)Kn, linsys_data->d_xyzbuff2, valueType_setup, CUDSS_LAYOUT_COL_MAJOR));
+#else
+  linsys_data->d_rhs_matrix = NULL;
+  linsys_data->d_xyz_matrix = NULL;
+#endif
+
+  return linsys_data;
+}
+
+// CUDA kernel to update the CSR values from the updated CSC KKT matrix.
+// Maps CSC indices to CSR indices and updates the values.
+__global__ void update_csr_from_csc_kernel(
+    const QOCOFloat* csc_val,   // Updated CSC values (on GPU, copied from the CPU KKT)
+    QOCOFloat* csr_val,         // CSR values to update (on GPU)
+    const QOCOInt* csr_row_ptr, // CSR row pointers
+    const QOCOInt* csr_col_ind, // CSR column indices
+    const QOCOInt* csc_p,       // CSC column pointers (on GPU)
+    const QOCOInt* csc_i,       // CSC row indices (on GPU)
+    QOCOInt n,                  // Matrix dimension
+    QOCOInt nnz)                // Number of nonzeros
+{
+  QOCOInt idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= nnz) return;
+
+  // For the CSR element at index idx, find the corresponding CSC element:
+  //   CSR: row = the row with csr_row_ptr[row] <= idx < csr_row_ptr[row+1],
+  //        col = csr_col_ind[idx]
+  //   CSC: the idx_csc in column col with csc_i[idx_csc] == row.
+
+  // Binary search for the row (largest row with csr_row_ptr[row] <= idx).
+  QOCOInt row = 0;
+  QOCOInt left = 0, right = n - 1;
+  while (left <= right) {
+    QOCOInt mid = (left + right) / 2;
+    if (csr_row_ptr[mid] <= idx) {
+      row = mid;
+      left = mid + 1;
+    } else {
+      right = mid - 1;
+    }
+  }
+
+  QOCOInt col = csr_col_ind[idx];
+
+  // Find the corresponding CSC element: search column col for the row.
+  for (QOCOInt csc_idx = csc_p[col]; csc_idx < csc_p[col + 1]; csc_idx++) {
+    if (csc_i[csc_idx] == row) {
+      csr_val[idx] = csc_val[csc_idx];
+      return;
+    }
+  }
+}
+
+static void cudss_factor(LinSysData* linsys_data, QOCOInt n,
+                         QOCOFloat kkt_dynamic_reg)
+{
+  // Update the CSR matrix values directly on the GPU - no structural
+  // CPU-GPU transfers during solve. The KKT matrix was updated on the CPU
+  // (in CSC form), so copy the updated CSC values to the GPU and update the
+  // CSR values with a kernel.
+#ifdef HAVE_CUDSS
+  // Copy the updated CSC KKT values to the GPU (values only, not structure).
+  // This is the only CPU-GPU transfer during solve iterations.
+  QOCOFloat* d_csc_val;
+  CUDA_CHECK(cudaMalloc(&d_csc_val, linsys_data->K->nnz * sizeof(QOCOFloat)));
+  CUDA_CHECK(cudaMemcpy(d_csc_val, linsys_data->K->x, linsys_data->K->nnz * sizeof(QOCOFloat), cudaMemcpyHostToDevice));
+
+  // Update the CSR values from CSC using a CUDA kernel (the CSC structure is
+  // already on the GPU from setup).
+  QOCOInt threadsPerBlock = 256;
+  QOCOInt numBlocks = (linsys_data->K->nnz + threadsPerBlock - 1) / threadsPerBlock;
+  update_csr_from_csc_kernel<<<numBlocks, threadsPerBlock>>>(
+      d_csc_val, linsys_data->d_csr_val,
+      linsys_data->d_csr_row_ptr, linsys_data->d_csr_col_ind,
+      linsys_data->d_csc_p, linsys_data->d_csc_i,
+      linsys_data->K->n, linsys_data->K->nnz);
+  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  // Free the temporary CSC values buffer.
+  cudaFree(d_csc_val);
+
+  // Update the cuDSS matrix with the new values.
+  CUDSS_CHECK(cudssMatrixSetValues(linsys_data->K_csr, linsys_data->d_csr_val));
linsys_data->d_csr_val)); + + // Run factorization - data structure persists from setup + // Analysis only needed once (first factorization), then just factorization + if (!linsys_data->analysis_done) { + cudssStatus_t status_analysis = cudssExecute(linsys_data->handle, CUDSS_PHASE_ANALYSIS, + linsys_data->config, linsys_data->data, + linsys_data->K_csr, linsys_data->d_xyz_matrix, linsys_data->d_rhs_matrix); + if (status_analysis != CUDSS_STATUS_SUCCESS) { + const char* err_str = (status_analysis == CUDSS_STATUS_NOT_INITIALIZED) ? "NOT_INITIALIZED" : \ + (status_analysis == CUDSS_STATUS_ALLOC_FAILED) ? "ALLOC_FAILED" : \ + (status_analysis == CUDSS_STATUS_INVALID_VALUE) ? "INVALID_VALUE" : \ + (status_analysis == CUDSS_STATUS_NOT_SUPPORTED) ? "NOT_SUPPORTED" : \ + (status_analysis == CUDSS_STATUS_EXECUTION_FAILED) ? "EXECUTION_FAILED" : \ + (status_analysis == CUDSS_STATUS_INTERNAL_ERROR) ? "INTERNAL_ERROR" : "UNKNOWN"; + fprintf(stderr, "ERROR: cuDSS analysis failed with status %d (%s)\n", (int)status_analysis, err_str); + exit(1); + } + linsys_data->analysis_done = 1; + } + + cudssStatus_t status_factor = cudssExecute(linsys_data->handle, CUDSS_PHASE_FACTORIZATION, + linsys_data->config, linsys_data->data, + linsys_data->K_csr, linsys_data->d_xyz_matrix, linsys_data->d_rhs_matrix); + if (status_factor != CUDSS_STATUS_SUCCESS) { + const char* err_str = (status_factor == CUDSS_STATUS_NOT_INITIALIZED) ? "NOT_INITIALIZED" : \ + (status_factor == CUDSS_STATUS_ALLOC_FAILED) ? "ALLOC_FAILED" : \ + (status_factor == CUDSS_STATUS_INVALID_VALUE) ? "INVALID_VALUE" : \ + (status_factor == CUDSS_STATUS_NOT_SUPPORTED) ? "NOT_SUPPORTED" : \ + (status_factor == CUDSS_STATUS_EXECUTION_FAILED) ? "EXECUTION_FAILED" : \ + (status_factor == CUDSS_STATUS_INTERNAL_ERROR) ? 
"INTERNAL_ERROR" : "UNKNOWN"; + fprintf(stderr, "ERROR: cuDSS factorization failed with status %d (%s)\n", (int)status_factor, err_str); + exit(1); + } + #else + // cuDSS not available - skip + #endif +} + +// Helper function to map work->rhs and work->xyz to device buffers during solve phase +extern "C" { +void map_work_buffers_to_device(void* linsys_data_ptr, QOCOWorkspace* work) +{ + // During solve phase, functions constructing rhs/xyz will operate on device buffers + // This function is called at start of solve phase + // Map work->rhs->d_data and work->xyz->d_data to point to our device buffers + // so that RHS construction and solution storage happen directly in cuDSS buffers + LinSysData* linsys_data = (LinSysData*)linsys_data_ptr; + QOCOInt n = work->data->n + work->data->m + work->data->p; + + // Store original device pointers (if any) so we can restore them later + // For now, just point d_data to our buffers + if (work->rhs) { + QOCOFloat* old_rhs = work->rhs->d_data; + if (old_rhs && old_rhs != linsys_data->d_xyzbuff1) { + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff1, old_rhs, + n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + cudaFree(old_rhs); + } else if (!old_rhs) { + CUDA_CHECK(cudaMemset(linsys_data->d_xyzbuff1, 0, n * sizeof(QOCOFloat))); + } + work->rhs->d_data = linsys_data->d_xyzbuff1; + } + + if (work->xyz) { + QOCOFloat* old_xyz = work->xyz->d_data; + if (old_xyz && old_xyz != linsys_data->d_xyzbuff2) { + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff2, old_xyz, + n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + cudaFree(old_xyz); + } else if (!old_xyz) { + CUDA_CHECK(cudaMemset(linsys_data->d_xyzbuff2, 0, n * sizeof(QOCOFloat))); + } + work->xyz->d_data = linsys_data->d_xyzbuff2; + } +} + +QOCOFloat* get_device_rhs(void* linsys_data_ptr) +{ + LinSysData* linsys_data = (LinSysData*)linsys_data_ptr; + // Return the device buffer where RHS should be constructed + return linsys_data->d_xyzbuff1; +} + +QOCOFloat* get_device_xyz(void* linsys_data_ptr) +{ + LinSysData* linsys_data = (LinSysData*)linsys_data_ptr; + // Return the device buffer where solution will be written + return linsys_data->d_xyzbuff2; +} + +void unmap_work_buffers_from_device(void* linsys_data_ptr, QOCOWorkspace* work) +{ + // Copy final results from device buffers back to host buffers + // This is called AFTER solve completes, so it's the only CPU-GPU transfer after solve + LinSysData* linsys_data = (LinSysData*)linsys_data_ptr; + QOCOInt n = work->data->n + work->data->m + work->data->p; + + // Restore original device pointers (reallocate them) + if (work->rhs) { + // Allocate new device buffer for work->rhs->d_data + CUDA_CHECK(cudaMalloc(&work->rhs->d_data, n * sizeof(QOCOFloat))); + // Copy from our buffer to the restored buffer + CUDA_CHECK(cudaMemcpy(work->rhs->d_data, linsys_data->d_xyzbuff1, n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + // Copy to host + CUDA_CHECK(cudaMemcpy(work->rhs->data, work->rhs->d_data, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost)); + } + + if (work->xyz) { + // Allocate new device buffer for work->xyz->d_data + CUDA_CHECK(cudaMalloc(&work->xyz->d_data, n * sizeof(QOCOFloat))); + // Copy solution from d_xyzbuff2 (where cuDSS wrote it) to restored buffer + CUDA_CHECK(cudaMemcpy(work->xyz->d_data, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + // Copy to host + CUDA_CHECK(cudaMemcpy(work->xyz->data, work->xyz->d_data, n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost)); + } +} +} + +static void cudss_solve(LinSysData* 
linsys_data, QOCOWorkspace* work, + QOCOFloat* b, QOCOFloat* x, QOCOInt iter_ref_iters) +{ + QOCOInt n = linsys_data->K->n; + (void)iter_ref_iters; // No iterative refinement for CUDA backend + // b is copied to d_xyzbuff1 if provided + // x will receive the solution from d_xyzbuff2 after cudssExecute + + // During solve phase, ALL data is on GPU - NO CPU-GPU transfers + // work->rhs->d_data points to d_xyzbuff1 (mapped in map_work_buffers_to_device) + // work->xyz->d_data points to d_xyzbuff2 (mapped in map_work_buffers_to_device) + // RHS was constructed directly into d_xyzbuff1, solution will be written to d_xyzbuff2 + + // Ensure RHS buffer contains the contents of b (in case caller passed a different pointer) + if (b) { + if (b != linsys_data->d_xyzbuff1) { + cudaPointerAttributes attrs; + cudaError_t err = cudaPointerGetAttributes(&attrs, b); + if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) { + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff1, b, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToDevice)); + } else { + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff1, b, n * sizeof(QOCOFloat), + cudaMemcpyHostToDevice)); + } + } + } else { + CUDA_CHECK(cudaMemset(linsys_data->d_xyzbuff1, 0, n * sizeof(QOCOFloat))); + } + + // Clear solution buffer (d_xyz_matrix points to d_xyzbuff2) + CUDA_CHECK(cudaMemset(linsys_data->d_xyzbuff2, 0, n * sizeof(QOCOFloat))); + + // cuDSS API signature: cudssExecute(handle, phase, config, data, matrix, solution, rhs) + // where solution is the output and rhs is the input + // Note: d_rhs_matrix points to d_xyzbuff1, d_xyz_matrix points to d_xyzbuff2 + #ifdef HAVE_CUDSS + + cudssStatus_t status = cudssExecute(linsys_data->handle, CUDSS_PHASE_SOLVE, + linsys_data->config, linsys_data->data, + linsys_data->K_csr, linsys_data->d_xyz_matrix, linsys_data->d_rhs_matrix); + if (status != CUDSS_STATUS_SUCCESS) { + const char* err_str = (status == CUDSS_STATUS_NOT_INITIALIZED) ? "NOT_INITIALIZED" : \ + (status == CUDSS_STATUS_ALLOC_FAILED) ? "ALLOC_FAILED" : \ + (status == CUDSS_STATUS_INVALID_VALUE) ? "INVALID_VALUE" : \ + (status == CUDSS_STATUS_NOT_SUPPORTED) ? "NOT_SUPPORTED" : \ + (status == CUDSS_STATUS_EXECUTION_FAILED) ? "EXECUTION_FAILED" : \ + (status == CUDSS_STATUS_INTERNAL_ERROR) ? 
"INTERNAL_ERROR" : "UNKNOWN"; + fprintf(stderr, "ERROR: cuDSS solve failed with status %d (%s)\n", (int)status, err_str); + } else { + // Solution is now in d_xyzbuff2 (pointed to by d_xyz_matrix) + // Copy solution from d_xyz_matrix (d_xyzbuff2) to x + if (x) { + cudaPointerAttributes attrs; + cudaError_t err = cudaPointerGetAttributes(&attrs, x); + if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) { + // x is on device - device-to-device copy + CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToDevice)); + } else { + // x is on host - device-to-host copy + CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToHost)); + } + } + } + + // Solution is now in d_xyzbuff2 (pointed to by d_xyz_matrix and work->xyz->d_data) + // Solution has been copied to x (if provided) after successful cudssExecute + #else + // cuDSS not available - use fallback: copy solution from RHS (will fail convergence but won't crash) + CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff2, linsys_data->d_xyzbuff1, n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice)); + // Copy to x if provided + if (x) { + cudaPointerAttributes attrs; + cudaError_t err = cudaPointerGetAttributes(&attrs, x); + if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) { + CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToDevice)); + } else { + CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat), + cudaMemcpyDeviceToHost)); + } + } + #endif + + // During solve phase, solution stays on device in d_xyzbuff2 (and work->xyz->d_data) + // It will be copied back to CPU only after solve completes (in unmap_work_buffers_from_device) +} + +static void cudss_initialize_nt(LinSysData* linsys_data, QOCOInt m) +{ + for (QOCOInt i = 0; i < linsys_data->Wnnz; ++i) { + linsys_data->K->x[linsys_data->nt2kkt[i]] = 0.0; + } + + // Set Nesterov-Todd block in KKT matrix to -I + for (QOCOInt i = 0; i < m; ++i) { + linsys_data->K->x[linsys_data->ntdiag2kkt[i]] = -1.0; + } + + // Update device matrix - redo CSR conversion + // In production, we'd update device matrix directly + // For now, we'll need to redo the conversion in factor function +} + +static void cudss_update_nt(LinSysData* linsys_data, QOCOFloat* WtW, + QOCOFloat kkt_static_reg, QOCOInt m) +{ + for (QOCOInt i = 0; i < linsys_data->Wnnz; ++i) { + linsys_data->K->x[linsys_data->nt2kkt[i]] = -WtW[i]; + } + + // Regularize Nesterov-Todd block of KKT matrix + for (QOCOInt i = 0; i < m; ++i) { + linsys_data->K->x[linsys_data->ntdiag2kkt[i]] -= kkt_static_reg; + } +} + +static void cudss_update_data(LinSysData* linsys_data, QOCOProblemData* data) +{ + // Update P in KKT matrix + if (data->P && linsys_data->PregtoKKT) { + QOCOCscMatrix* Pcsc = get_csc_matrix(data->P); + for (QOCOInt i = 0; i < get_nnz(data->P); ++i) { + linsys_data->K->x[linsys_data->PregtoKKT[i]] = Pcsc->x[i]; + } + } + + // Update A in KKT matrix + QOCOCscMatrix* Acsc = get_csc_matrix(data->A); + for (QOCOInt i = 0; i < get_nnz(data->A); ++i) { + linsys_data->K->x[linsys_data->AttoKKT[data->AtoAt[i]]] = Acsc->x[i]; + } + + // Update G in KKT matrix + QOCOCscMatrix* Gcsc = get_csc_matrix(data->G); + for (QOCOInt i = 0; i < get_nnz(data->G); ++i) { + linsys_data->K->x[linsys_data->GttoKKT[data->GtoGt[i]]] = Gcsc->x[i]; + } + + // Update device matrix - convert to CSR and update + // Note: This is simplified; in production we'd maintain device copy +} + +static void cudss_cleanup(LinSysData* 
linsys_data) +{ + if (linsys_data->K_csr) { + cudssMatrixDestroy(linsys_data->K_csr); + } + if (linsys_data->d_rhs_matrix) { + cudssMatrixDestroy(linsys_data->d_rhs_matrix); + } + if (linsys_data->d_xyz_matrix) { + cudssMatrixDestroy(linsys_data->d_xyz_matrix); + } + if (linsys_data->data) { + cudssDataDestroy(linsys_data->handle, linsys_data->data); + } + if (linsys_data->config) { + cudssConfigDestroy(linsys_data->config); + } + if (linsys_data->handle) { + cudssDestroy(linsys_data->handle); + } + if (linsys_data->cusparse_handle) { + cusparseDestroy(linsys_data->cusparse_handle); + } + if (linsys_data->descr) { + cusparseDestroyMatDescr(linsys_data->descr); + } + if (linsys_data->cublas_handle) { + cublasDestroy(linsys_data->cublas_handle); + } + free_qoco_csc_matrix(linsys_data->K); + qoco_free(linsys_data->p); + qoco_free(linsys_data->pinv); + qoco_free(linsys_data->xyzbuff1); + qoco_free(linsys_data->xyzbuff2); + if (linsys_data->d_xyzbuff1) cudaFree(linsys_data->d_xyzbuff1); + if (linsys_data->d_xyzbuff2) cudaFree(linsys_data->d_xyzbuff2); + qoco_free(linsys_data->nt2kkt); + qoco_free(linsys_data->ntdiag2kkt); + qoco_free(linsys_data->PregtoKKT); + qoco_free(linsys_data->AttoKKT); + qoco_free(linsys_data->GttoKKT); + if (linsys_data->d_csr_row_ptr) cudaFree(linsys_data->d_csr_row_ptr); + if (linsys_data->d_csr_row_end) cudaFree(linsys_data->d_csr_row_end); + if (linsys_data->d_csr_col_ind) cudaFree(linsys_data->d_csr_col_ind); + if (linsys_data->d_csr_val) cudaFree(linsys_data->d_csr_val); + if (linsys_data->d_nt2kkt) cudaFree(linsys_data->d_nt2kkt); + if (linsys_data->d_ntdiag2kkt) cudaFree(linsys_data->d_ntdiag2kkt); + if (linsys_data->d_PregtoKKT) cudaFree(linsys_data->d_PregtoKKT); + if (linsys_data->d_AttoKKT) cudaFree(linsys_data->d_AttoKKT); + if (linsys_data->d_GttoKKT) cudaFree(linsys_data->d_GttoKKT); + if (linsys_data->d_csc_p) cudaFree(linsys_data->d_csc_p); + if (linsys_data->d_csc_i) cudaFree(linsys_data->d_csc_i); + qoco_free(linsys_data); +} + +// Export the backend struct +LinSysBackend backend = {.linsys_setup = cudss_setup, + .linsys_initialize_nt = cudss_initialize_nt, + .linsys_update_nt = cudss_update_nt, + .linsys_update_data = cudss_update_data, + .linsys_factor = cudss_factor, + .linsys_solve = cudss_solve, + .linsys_cleanup = cudss_cleanup}; + diff --git a/algebra/cuda/cudss_backend.h b/algebra/cuda/cudss_backend.h new file mode 100644 index 0000000..7467e75 --- /dev/null +++ b/algebra/cuda/cudss_backend.h @@ -0,0 +1,54 @@ +/** + * @file cudss_backend.h + * @author Govind M. Chari + * + * @section LICENSE + * + * Copyright (c) 2025, Govind M. Chari + * This source code is licensed under the BSD 3-Clause License + * + * @section DESCRIPTION + * + * Defines the cuDSS backend. 
+ */ + +#ifndef CUDSS_BACKEND_H +#define CUDSS_BACKEND_H + +#include "cuda_types.h" +#ifdef __cplusplus +extern "C" { +#endif +#include "common_linalg.h" +#include "kkt.h" +#ifdef __cplusplus +} +#endif +#include "structs.h" + +#ifdef HAVE_CUDSS +#include <cudss.h> +#else +// Stub definitions for cuDSS types if not available +typedef void* cudssHandle_t; +typedef void* cudssConfig_t; +typedef void* cudssData_t; +typedef void* cudssMatrix_t; +typedef enum { CUDSS_STATUS_SUCCESS } cudssStatus_t; +typedef enum { CUDSS_PHASE_ANALYSIS, CUDSS_PHASE_FACTORIZATION, CUDSS_PHASE_SOLVE } cudssPhase_t; +#define cudssCreate(x) (CUDSS_STATUS_SUCCESS) +#define cudssConfigCreate(x) (CUDSS_STATUS_SUCCESS) +#define cudssDataCreate(h, x) (CUDSS_STATUS_SUCCESS) +#define cudssMatrixCreateCsr(...) (CUDSS_STATUS_SUCCESS) /* variadic: the real call passes 13 arguments */ +#define cudssMatrixSetValues(m, v) (CUDSS_STATUS_SUCCESS) +#define cudssExecute(h, p, c, d, m, x, b) (CUDSS_STATUS_SUCCESS) +#define cudssMatrixDestroy(m) ((void)0) +#define cudssDataDestroy(h, d) ((void)0) /* two arguments, matching the call in cudss_cleanup */ +#define cudssConfigDestroy(c) ((void)0) +#define cudssDestroy(h) ((void)0) +#endif + +extern LinSysBackend backend; + +#endif /* #ifndef CUDSS_BACKEND_H */ + diff --git a/devtools/run_tests.sh b/devtools/run_tests.sh index 450c81e..a6d22f7 100755 --- a/devtools/run_tests.sh +++ b/devtools/run_tests.sh @@ -1 +1,6 @@ -export CXX=/usr/local/bin/clang++ && export CC=/usr/local/bin/clang && cd build && cmake -DQOCO_BUILD_TYPE:STR=Release -DENABLE_TESTING:BOOL=True -DBUILD_QOCO_BENCHMARK_RUNNER:BOOL=True .. && make && ctest --verbose && cd .. \ No newline at end of file +#!/bin/bash +# Note: cuDSS was compiled against cuBLAS 12, but system has cuBLAS 13 +# The build will succeed with --allow-shlib-undefined, but runtime may fail +# To fix: either install cuBLAS 12 or get cuDSS compiled against cuBLAS 13 + +export CXX=/usr/bin/clang++ && export CC=/usr/bin/clang && cd build && cmake -DQOCO_BUILD_TYPE:STR=Release -DENABLE_TESTING:BOOL=True -DBUILD_QOCO_BENCHMARK_RUNNER:BOOL=False -DQOCO_ALGEBRA_BACKEND:STR=builtin .. && make -j$(nproc) && cd tests && ./markowitz_test \ No newline at end of file diff --git a/include/common_linalg.h b/include/common_linalg.h index ac3180a..2adb5a5 100644 --- a/include/common_linalg.h +++ b/include/common_linalg.h @@ -69,6 +69,14 @@ void unregularize(QOCOCscMatrix* M, QOCOFloat lambda); */ void col_inf_norm_USymm(const QOCOCscMatrix* M, QOCOFloat* norm); +/** + * @brief Computes the infinity norm of each column of M and stores in norm. + * + * @param M An m by n sparse matrix in CSC format. + * @param norm Result vector of length n. + */ +void col_inf_norm(const QOCOCscMatrix* M, QOCOFloat* norm); + /** * @brief Computes the infinity norm of each row of M and stores in norm. * diff --git a/include/qoco_linalg.h b/include/qoco_linalg.h index 8bbe7f2..1ab2212 100644 --- a/include/qoco_linalg.h +++ b/include/qoco_linalg.h @@ -16,6 +16,10 @@ #define QOCO_LINALG_H #include "definitions.h" +typedef struct QOCOMatrix_ QOCOMatrix; +typedef struct QOCOVectorf_ QOCOVectorf; +typedef struct QOCOVectori_ QOCOVectori; + /** * @brief Compressed sparse column format matrices. * @@ -49,6 +53,210 @@ typedef struct { */ QOCOCscMatrix* new_qoco_csc_matrix(const QOCOCscMatrix* A); + +/** + * @brief Allocates a new QOCOMatrix and copies A to it. + * + * @param A Matrix to copy. + * @return Pointer to new constructed matrix. + */ +QOCOMatrix* new_qoco_matrix(const QOCOCscMatrix* A); +
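A minimal usage sketch of the container API being declared here (hypothetical values; not part of the patch):

/* Hypothetical usage (not part of the patch): wrap a 2x2 upper-triangular CSC
 * matrix and a zero-initialized vector in the new containers, then release
 * both. */
static void container_demo(void)
{
  QOCOFloat Ax[] = {2.0, 1.0, 3.0};
  QOCOInt Ai[] = {0, 0, 1};                   /* row indices   */
  QOCOInt Ap[] = {0, 1, 3};                   /* column starts */
  QOCOCscMatrix A = {.m = 2, .n = 2, .nnz = 3, .x = Ax, .i = Ai, .p = Ap};

  QOCOMatrix* M = new_qoco_matrix(&A);        /* deep copy into the backend */
  QOCOVectorf* v = new_qoco_vectorf(NULL, 2); /* NULL input -> zero fill    */
  set_element_vectorf(v, 0, 1.0);
  QOCOFloat v0 = get_element_vectorf(v, 0);   /* reads back 1.0 */
  (void)v0;
  free_qoco_vectorf(v);
  free_qoco_matrix(M);
}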
+/** + * @brief Frees QOCOMatrix. + * + * @param A Matrix to free. + */ +void free_qoco_matrix(QOCOMatrix* A); + +/** + * @brief Allocates a new QOCOVectorf and copies x to it. + * + * @param x vector to copy. + * @param n length of vector. + * @return Pointer to new constructed vector. + */ +QOCOVectorf* new_qoco_vectorf(const QOCOFloat* x, QOCOInt n); + +/** + * @brief Frees QOCOVectorf. + * + * @param x Vector to free. + */ +void free_qoco_vectorf(QOCOVectorf* x); + +/** + * @brief Returns the number of nonzeros in a QOCOMatrix. + * + * @param A Input matrix. + * @return Number of nonzeros. + */ +QOCOInt get_nnz(const QOCOMatrix* A); + +/** + * @brief Returns x[idx]. + * + * @param x Input vector. + * @param idx Index. + * @return x[idx]. + */ +QOCOFloat get_element_vectorf(const QOCOVectorf* x, QOCOInt idx); + +/** + * @brief Performs x[idx] = data. + * + * @param x Input vector. + * @param idx Index. + * @param data Data to store. + */ +void set_element_vectorf(QOCOVectorf* x, QOCOInt idx, QOCOFloat data); + +/** + * @brief Performs output .= 1.0 ./ input. + * + * @param input Input vector. + * @param output Output vector. + */ +void reciprocal_vectorf(const QOCOVectorf* input, QOCOVectorf* output); + +/** + * @brief Returns &x[idx]. + * + * @param x Input vector. + * @param idx Index. + * @return &x[idx]. + */ +QOCOFloat* get_pointer_vectorf(const QOCOVectorf* x, QOCOInt idx); + +/** + * @brief Returns the underlying data array of a QOCOVectorf. + * + * @param x Input vector. + * @return Pointer to underlying data array. + */ +QOCOFloat* get_data_vectorf(const QOCOVectorf* x); + +/** + * @brief Syncs vector data from host to device if needed (CUDA backend only). + * This is a no-op for non-CUDA backends. + * Note: This should NOT be called during qoco_solve to avoid CPU-GPU copies. + * + * @param v Input vector. + */ +void sync_vector_to_device_if_needed(QOCOVectorf* v); + +/** + * @brief Sets the solve phase flag (CUDA backend only). + * During solve phase, get_data_vectorf returns device pointers. + * This prevents CPU-GPU copies during the solve loop. + * + * @param active 1 if solve phase is active, 0 otherwise. + */ +void set_solve_phase(int active); + +/** + * @brief Gets the solve phase flag (CUDA backend only). + * Returns 1 if solve phase is active, 0 otherwise. + * + * @return 1 if solve phase is active, 0 otherwise. + */ +int get_solve_phase(void); + +/** + * @brief Copies vector data from device to host (CUDA backend only). + * This is a no-op for non-CUDA backends. Used to copy solution vectors + * after solver terminates. + * + * @param src Source vector (on device). + * @param dst Destination buffer (on host). + * @param n Number of elements to copy. + */ +void copy_vector_from_device(QOCOVectorf* src, QOCOFloat* dst, QOCOInt n); + +/** + * @brief Returns the length of a QOCOVectorf. + * + * @param x Input vector. + * @return Length of vector. + */ +QOCOInt get_length_vectorf(const QOCOVectorf* x); + +/** + * @brief Returns the underlying CSC matrix from a QOCOMatrix. + * This function provides access to implementation-specific data. + * For builtin implementation, returns the internal CSC matrix. + * + * @param M Input matrix. + * @return Pointer to underlying CSC matrix. + */ +QOCOCscMatrix* get_csc_matrix(const QOCOMatrix* M); + +/** + * @brief Sparse matrix vector multiplication for QOCOMatrix where M is + * symmetric and only the upper triangular part is given. Computes r = M * v. + * + * @param M Upper triangular part of M. + * @param v Vector. + * @param r Result.
+ */ +void USpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r); + +/** + * @brief Sparse matrix vector multiplication for QOCOMatrix. Computes r = M * v. + * + * @param M Matrix. + * @param v Vector. + * @param r Result. + */ +void SpMv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r); + +/** + * @brief Sparse matrix vector multiplication for QOCOMatrix where M is first + * transposed. Computes r = M^T * v. + * + * @param M Matrix. + * @param v Vector. + * @param r Result. + */ +void SpMtv_matrix(const QOCOMatrix* M, const QOCOFloat* v, QOCOFloat* r); + +/** + * @brief Computes the infinity norm of each column (or equivalently row) of a + * symmetric sparse matrix M where only the upper triangular portion of M is + * given. Works with QOCOMatrix*. + * + * @param M Upper triangular part of sparse symmetric matrix. + * @param norm Result vector of length n. + */ +void col_inf_norm_USymm_matrix(const QOCOMatrix* M, QOCOFloat* norm); + +/** + * @brief Computes the infinity norm of each column of M and stores in norm. + * Works with QOCOMatrix*. + * + * @param M An m by n sparse matrix. + * @param norm Result vector of length n. + */ +void col_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm); + +/** + * @brief Computes the infinity norm of each row of M and stores in norm. + * Works with QOCOMatrix*. + * + * @param M An m by n sparse matrix. + * @param norm Result vector of length m. + */ +void row_inf_norm_matrix(const QOCOMatrix* M, QOCOFloat* norm); + +/** + * @brief Scales the rows of M by E and columns of M by D. + * M = diag(E) * M * diag(D). Works with QOCOMatrix*. + * + * @param M An m by n sparse matrix. + * @param E Vector of length m. + * @param D Vector of length n. + */ +void row_col_scale_matrix(QOCOMatrix* M, const QOCOFloat* E, const QOCOFloat* D); + /** * @brief Frees all the internal arrays and the pointer to the QOCOCscMatrix. * Should only be used if QOCOCscMatrix and all internal arrays were malloc'ed. diff --git a/include/structs.h b/include/structs.h index c439d84..571286e 100644 --- a/include/structs.h +++ b/include/structs.h @@ -25,25 +25,25 @@ */ typedef struct { /** Quadratic cost term. */ - QOCOCscMatrix* P; // + QOCOMatrix* P; // /** Linear cost term. */ - QOCOFloat* c; + QOCOVectorf* c; /** Affine equality constraint matrix. */ - QOCOCscMatrix* A; + QOCOMatrix* A; /** Transpose of A (used in Ruiz for fast row norm calculations of A). */ - QOCOCscMatrix* At; + QOCOMatrix* At; /** Affine equality constraint offset. */ - QOCOFloat* b; + QOCOVectorf* b; /** Conic constraint matrix. */ - QOCOCscMatrix* G; + QOCOMatrix* G; /** Transpose of G (used in Ruiz for fast row norm calculations of G). */ - QOCOCscMatrix* Gt; + QOCOMatrix* Gt; /** Mapping from A to At. */ QOCOInt* AtoAt; @@ -52,7 +52,7 @@ typedef struct { QOCOInt* GtoGt; /** Conic constraint offset. */ - QOCOFloat* h; + QOCOVectorf* h; /** Dimension of non-negative orthant in cone C. */ QOCOInt l; @@ -126,25 +126,25 @@ typedef struct { typedef struct { /** Diagonal of scaling matrix. */ - QOCOFloat* delta; + QOCOVectorf* delta; /** Diagonal of scaling matrix. */ - QOCOFloat* Druiz; + QOCOVectorf* Druiz; /** Diagonal of scaling matrix. */ - QOCOFloat* Eruiz; + QOCOVectorf* Eruiz; /** Diagonal of scaling matrix. */ - QOCOFloat* Fruiz; + QOCOVectorf* Fruiz; /** Inverse of Druiz. */ - QOCOFloat* Dinvruiz; + QOCOVectorf* Dinvruiz; /** Inverse of Eruiz. */ - QOCOFloat* Einvruiz; + QOCOVectorf* Einvruiz; /** Inverse of Fruiz. 
*/ - QOCOFloat* Finvruiz; + QOCOVectorf* Finvruiz; /** Cost scaling factor. */ QOCOFloat k; @@ -168,16 +168,16 @@ typedef struct { QOCOScaling* scaling; /** Iterate of primal variables. */ - QOCOFloat* x; + QOCOVectorf* x; /** Iterate of slack variables associated with conic constraint. */ - QOCOFloat* s; + QOCOVectorf* s; /** Iterate of dual variables associated with affine equality constraint. */ - QOCOFloat* y; + QOCOVectorf* y; /** Iterate of dual variables associated with conic constraint. */ - QOCOFloat* z; + QOCOVectorf* z; /** Gap (s'*z / m) */ QOCOFloat mu; @@ -239,10 +239,10 @@ typedef struct { QOCOFloat* Ds; /** RHS of KKT system. */ - QOCOFloat* rhs; + QOCOVectorf* rhs; /** Solution of KKT system. */ - QOCOFloat* xyz; + QOCOVectorf* xyz; /** Buffer of size n + m + p. */ QOCOFloat* xyzbuff1; diff --git a/src/common_linalg.c b/src/common_linalg.c index d7634aa..107e4d3 100644 --- a/src/common_linalg.c +++ b/src/common_linalg.c @@ -150,6 +150,24 @@ void col_inf_norm_USymm(const QOCOCscMatrix* M, QOCOFloat* norm) } } +void col_inf_norm(const QOCOCscMatrix* M, QOCOFloat* norm) +{ + // Initialize norm array to zero + for (QOCOInt j = 0; j < M->n; j++) { + norm[j] = 0.0; + } + + // For CSC format, column norms are computed efficiently by iterating over columns + for (QOCOInt j = 0; j < M->n; j++) { + for (QOCOInt idx = M->p[j]; idx < M->p[j + 1]; idx++) { + QOCOFloat val = qoco_abs(M->x[idx]); + if (val > norm[j]) { + norm[j] = val; + } + } + } +} + void row_inf_norm(const QOCOCscMatrix* M, QOCOFloat* norm) { for (QOCOInt i = 0; i < M->m; ++i) { diff --git a/src/cone.c b/src/cone.c index c30ee79..48fea77 100644 --- a/src/cone.c +++ b/src/cone.c @@ -174,7 +174,8 @@ void compute_nt_scaling(QOCOWorkspace* work) // Compute Nesterov-Todd scaling for LP cone. QOCOInt idx; for (idx = 0; idx < work->data->l; ++idx) { - work->WtW[idx] = safe_div(work->s[idx], work->z[idx]); + work->WtW[idx] = safe_div(get_element_vectorf(work->s, idx), + get_element_vectorf(work->z, idx)); work->W[idx] = qoco_sqrt(work->WtW[idx]); work->Wfull[idx] = work->W[idx]; work->Winv[idx] = safe_div(1.0, work->W[idx]); @@ -186,15 +187,19 @@ void compute_nt_scaling(QOCOWorkspace* work) QOCOInt nt_idx_full = idx; for (QOCOInt i = 0; i < work->data->nsoc; ++i) { // Compute normalized vectors. - QOCOFloat s_scal = soc_residual2(&work->s[idx], work->data->q[i]); + QOCOFloat s_scal = + soc_residual2(get_pointer_vectorf(work->s, idx), work->data->q[i]); s_scal = qoco_sqrt(s_scal); QOCOFloat f = safe_div(1.0, s_scal); - scale_arrayf(&work->s[idx], work->sbar, f, work->data->q[i]); + scale_arrayf(get_pointer_vectorf(work->s, idx), work->sbar, f, + work->data->q[i]); - QOCOFloat z_scal = soc_residual2(&work->z[idx], work->data->q[i]); + QOCOFloat z_scal = + soc_residual2(get_pointer_vectorf(work->z, idx), work->data->q[i]); z_scal = qoco_sqrt(z_scal); f = safe_div(1.0, z_scal); - scale_arrayf(&work->z[idx], work->zbar, f, work->data->q[i]); + scale_arrayf(get_pointer_vectorf(work->z, idx), work->zbar, f, + work->data->q[i]); QOCOFloat gamma = qoco_sqrt( 0.5 * (1 + qoco_dot(work->sbar, work->zbar, work->data->q[i]))); @@ -264,22 +269,27 @@ void compute_nt_scaling(QOCOWorkspace* work) } // Compute scaled variable lambda. lambda = W * z. 
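For reference, on the LP block the scaling computed above reduces to a closed form (standard Nesterov-Todd identities, stated here only for orientation):

$$W_{ii} = \sqrt{s_i / z_i} \;\Longrightarrow\; \lambda_i = W_{ii} z_i = \sqrt{s_i z_i}, \qquad 0 \le i < l,$$

and on every cone the defining NT property $W z = W^{-\top} s$ makes $\lambda$ the common scaled image of $s$ and $z$.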
- nt_multiply(work->Wfull, work->z, work->lambda, work->data->l, work->data->m, - work->data->nsoc, work->data->q); + nt_multiply(work->Wfull, get_pointer_vectorf(work->z, 0), work->lambda, + work->data->l, work->data->m, work->data->nsoc, work->data->q); } void compute_centering(QOCOSolver* solver) { QOCOWorkspace* work = solver->work; - QOCOFloat* Dzaff = &work->xyz[work->data->n + work->data->p]; - QOCOFloat a = qoco_min(linesearch(work->z, Dzaff, 1.0, solver), - linesearch(work->s, work->Ds, 1.0, solver)); + QOCOFloat* xyz = get_data_vectorf(work->xyz); + QOCOFloat* Dzaff = &xyz[work->data->n + work->data->p]; + QOCOFloat a = qoco_min( + linesearch(get_pointer_vectorf(work->z, 0), Dzaff, 1.0, solver), + linesearch(get_pointer_vectorf(work->s, 0), work->Ds, 1.0, solver)); // Compute rho. rho = ((s + a * Ds)'*(z + a * Dz)) / (s'*z). - qoco_axpy(Dzaff, work->z, work->ubuff1, a, work->data->m); - qoco_axpy(work->Ds, work->s, work->ubuff2, a, work->data->m); + qoco_axpy(Dzaff, get_pointer_vectorf(work->z, 0), work->ubuff1, a, + work->data->m); + qoco_axpy(work->Ds, get_pointer_vectorf(work->s, 0), work->ubuff2, a, + work->data->m); QOCOFloat rho = qoco_dot(work->ubuff1, work->ubuff2, work->data->m) / - qoco_dot(work->z, work->s, work->data->m); + qoco_dot(get_pointer_vectorf(work->z, 0), + get_pointer_vectorf(work->s, 0), work->data->m); // Compute sigma. sigma = max(0, min(1, rho))^3. QOCOFloat sigma = qoco_min(1.0, rho); diff --git a/src/equilibration.c b/src/equilibration.c index 96e5306..a7b4e0c 100644 --- a/src/equilibration.c +++ b/src/equilibration.c @@ -5,33 +5,41 @@ void ruiz_equilibration(QOCOProblemData* data, QOCOScaling* scaling, { // Initialize ruiz data. for (QOCOInt i = 0; i < data->n; ++i) { - scaling->Druiz[i] = 1.0; - scaling->Dinvruiz[i] = 1.0; + set_element_vectorf(scaling->Druiz, i, 1.0); + set_element_vectorf(scaling->Dinvruiz, i, 1.0); } for (QOCOInt i = 0; i < data->p; ++i) { - scaling->Eruiz[i] = 1.0; - scaling->Einvruiz[i] = 1.0; + set_element_vectorf(scaling->Eruiz, i, 1.0); + set_element_vectorf(scaling->Einvruiz, i, 1.0); } for (QOCOInt i = 0; i < data->m; ++i) { - scaling->Fruiz[i] = 1.0; - scaling->Finvruiz[i] = 1.0; + set_element_vectorf(scaling->Fruiz, i, 1.0); + set_element_vectorf(scaling->Finvruiz, i, 1.0); } QOCOFloat g = 1.0; scaling->k = 1.0; scaling->kinv = 1.0; + QOCOFloat* delta_data = get_data_vectorf(scaling->delta); + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat* Druiz_data = get_data_vectorf(scaling->Druiz); + QOCOFloat* Eruiz_data = get_data_vectorf(scaling->Eruiz); + QOCOFloat* Fruiz_data = get_data_vectorf(scaling->Fruiz); + QOCOFloat* bdata = get_data_vectorf(data->b); + QOCOFloat* hdata = get_data_vectorf(data->h); + for (QOCOInt i = 0; i < ruiz_iters; ++i) { // Compute infinity norm of rows of [P A' G'] for (QOCOInt j = 0; j < data->n; ++j) { - scaling->delta[j] = 0.0; + set_element_vectorf(scaling->delta, j, 0.0); } - g = inf_norm(data->c, data->n); + g = inf_norm(cdata, data->n); QOCOFloat Pinf_mean = 0.0; if (data->P) { - col_inf_norm_USymm(data->P, scaling->delta); - for (QOCOInt j = 0; j < data->P->n; ++j) { - Pinf_mean += scaling->delta[j]; + col_inf_norm_USymm_matrix(data->P, delta_data); + for (QOCOInt j = 0; j < data->n; ++j) { + Pinf_mean += get_element_vectorf(scaling->delta, j); } Pinf_mean /= data->n; } @@ -41,61 +49,61 @@ void ruiz_equilibration(QOCOProblemData* data, QOCOScaling* scaling, g = safe_div(1.0, g); scaling->k *= g; - if (data->A->nnz > 0) { - for (QOCOInt j = 0; j < data->A->n; ++j) { - QOCOFloat 
nrm = inf_norm(&data->A->x[data->A->p[j]], - data->A->p[j + 1] - data->A->p[j]); - scaling->delta[j] = qoco_max(scaling->delta[j], nrm); + // Compute column infinity norms of A and G + // For CSC format, column norms are computed efficiently + QOCOFloat Anorm[data->n]; + QOCOFloat Gnorm[data->n]; + if (get_nnz(data->A) > 0) { + col_inf_norm_matrix(data->A, Anorm); + for (QOCOInt j = 0; j < data->n; ++j) { + QOCOFloat nrm = qoco_max(get_element_vectorf(scaling->delta, j), Anorm[j]); + set_element_vectorf(scaling->delta, j, nrm); } } - if (data->G->nnz > 0) { - for (QOCOInt j = 0; j < data->G->n; ++j) { - QOCOFloat nrm = inf_norm(&data->G->x[data->G->p[j]], - data->G->p[j + 1] - data->G->p[j]); - scaling->delta[j] = qoco_max(scaling->delta[j], nrm); + if (get_nnz(data->G) > 0) { + col_inf_norm_matrix(data->G, Gnorm); + for (QOCOInt j = 0; j < data->n; ++j) { + QOCOFloat nrm = qoco_max(get_element_vectorf(scaling->delta, j), Gnorm[j]); + set_element_vectorf(scaling->delta, j, nrm); } } // d(i) = 1 / sqrt(max([Pinf(i), Atinf(i), Gtinf(i)])); for (QOCOInt j = 0; j < data->n; ++j) { - QOCOFloat temp = qoco_sqrt(scaling->delta[j]); + QOCOFloat temp = qoco_sqrt(get_element_vectorf(scaling->delta, j)); temp = safe_div(1.0, temp); - scaling->delta[j] = temp; + set_element_vectorf(scaling->delta, j, temp); } // Compute infinity norm of rows of [A 0 0]. - if (data->A->nnz > 0) { - for (QOCOInt j = 0; j < data->At->n; ++j) { - QOCOFloat nrm = inf_norm(&data->At->x[data->At->p[j]], - data->At->p[j + 1] - data->At->p[j]); - scaling->delta[data->n + j] = nrm; - } + // For row norms, compute column norms of the transpose (At is stored in CSC format) + if (get_nnz(data->A) > 0) { + col_inf_norm_matrix(data->At, &delta_data[data->n]); // d(i) = 1 / sqrt(Ainf(i)); for (QOCOInt k = 0; k < data->p; ++k) { - QOCOFloat temp = qoco_sqrt(scaling->delta[data->n + k]); + QOCOFloat temp = + qoco_sqrt(get_element_vectorf(scaling->delta, data->n + k)); temp = safe_div(1.0, temp); - scaling->delta[data->n + k] = temp; + set_element_vectorf(scaling->delta, data->n + k, temp); } } // Compute infinity norm of rows of [G 0 0]. - if (data->G->nnz > 0) { - for (QOCOInt j = 0; j < data->Gt->n; ++j) { - QOCOFloat nrm = inf_norm(&data->Gt->x[data->Gt->p[j]], - data->Gt->p[j + 1] - data->Gt->p[j]); - scaling->delta[data->n + data->p + j] = nrm; - } + // For row norms, compute column norms of the transpose (Gt is stored in CSC format) + if (get_nnz(data->G) > 0) { + col_inf_norm_matrix(data->Gt, &delta_data[data->n + data->p]); // d(i) = 1 / sqrt(Ginf(i)); for (QOCOInt k = 0; k < data->m; ++k) { - QOCOFloat temp = qoco_sqrt(scaling->delta[data->n + data->p + k]); + QOCOFloat temp = qoco_sqrt( + get_element_vectorf(scaling->delta, data->n + data->p + k)); temp = safe_div(1.0, temp); - scaling->delta[data->n + data->p + k] = temp; + set_element_vectorf(scaling->delta, data->n + data->p + k, temp); } } - QOCOFloat* D = scaling->delta; - QOCOFloat* E = &scaling->delta[data->n]; - QOCOFloat* F = &scaling->delta[data->n + data->p]; + QOCOFloat* D = delta_data; + QOCOFloat* E = &delta_data[data->n]; + QOCOFloat* F = &delta_data[data->n + data->p]; // Make scalings for all variables in a second-order cone equal. QOCOInt idx = data->l; @@ -108,53 +116,59 @@ void ruiz_equilibration(QOCOProblemData* data, QOCOScaling* scaling, // Scale P. 
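In matrix form, one pass of the loop above applies

$$P \leftarrow g\, D P D, \qquad c \leftarrow g\, D c, \qquad A \leftarrow E A D, \qquad G \leftarrow F G D,$$

where $D$, $E$, $F$ are the diagonal blocks of delta (each entry $1/\sqrt{\|\cdot\|_\infty}$ of the matching column or row norm) and $g$ is the cost scaling; the running products Druiz, Eruiz, Fruiz accumulate $D$, $E$, $F$ so that $b \leftarrow E_{\text{ruiz}}\, b$ and $h \leftarrow F_{\text{ruiz}}\, h$ need to be applied only once after the loop.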
if (data->P) { - scale_arrayf(data->P->x, data->P->x, g, data->P->nnz); - row_col_scale(data->P, D, D); + QOCOCscMatrix* Pcsc = get_csc_matrix(data->P); + scale_arrayf(Pcsc->x, Pcsc->x, g, get_nnz(data->P)); + row_col_scale_matrix(data->P, D, D); } // Scale c. - scale_arrayf(data->c, data->c, g, data->n); - ew_product(data->c, D, data->c, data->n); + scale_arrayf(cdata, cdata, g, data->n); + ew_product(cdata, D, cdata, data->n); // Scale A and G. - row_col_scale(data->A, E, D); - row_col_scale(data->G, F, D); - row_col_scale(data->At, D, E); - row_col_scale(data->Gt, D, F); + row_col_scale_matrix(data->A, E, D); + row_col_scale_matrix(data->G, F, D); + row_col_scale_matrix(data->At, D, E); + row_col_scale_matrix(data->Gt, D, F); // Update scaling matrices with delta. - ew_product(scaling->Druiz, D, scaling->Druiz, data->n); - ew_product(scaling->Eruiz, E, scaling->Eruiz, data->p); - ew_product(scaling->Fruiz, F, scaling->Fruiz, data->m); + ew_product(Druiz_data, D, Druiz_data, data->n); + ew_product(Eruiz_data, E, Eruiz_data, data->p); + ew_product(Fruiz_data, F, Fruiz_data, data->m); } // Scale b. - ew_product(data->b, scaling->Eruiz, data->b, data->p); + ew_product(bdata, Eruiz_data, bdata, data->p); // Scale h. - ew_product(data->h, scaling->Fruiz, data->h, data->m); + ew_product(hdata, Fruiz_data, hdata, data->m); // Compute Dinv, Einv, Finv. - for (QOCOInt i = 0; i < data->n; ++i) { - scaling->Dinvruiz[i] = safe_div(1.0, scaling->Druiz[i]); - } - for (QOCOInt i = 0; i < data->p; ++i) { - scaling->Einvruiz[i] = safe_div(1.0, scaling->Eruiz[i]); - } - for (QOCOInt i = 0; i < data->m; ++i) { - scaling->Finvruiz[i] = safe_div(1.0, scaling->Fruiz[i]); - } + reciprocal_vectorf(scaling->Druiz, scaling->Dinvruiz); + reciprocal_vectorf(scaling->Eruiz, scaling->Einvruiz); + reciprocal_vectorf(scaling->Fruiz, scaling->Finvruiz); scaling->kinv = safe_div(1.0, scaling->k); } void unscale_variables(QOCOWorkspace* work) { - ew_product(work->x, work->scaling->Druiz, work->x, work->data->n); - ew_product(work->s, work->scaling->Finvruiz, work->s, work->data->m); - - ew_product(work->y, work->scaling->Eruiz, work->y, work->data->p); - scale_arrayf(work->y, work->y, work->scaling->kinv, work->data->p); - - ew_product(work->z, work->scaling->Fruiz, work->z, work->data->m); - scale_arrayf(work->z, work->z, work->scaling->kinv, work->data->m); -} \ No newline at end of file + ew_product(get_pointer_vectorf(work->x, 0), + get_pointer_vectorf(work->scaling->Druiz, 0), + get_pointer_vectorf(work->x, 0), work->data->n); + + ew_product(get_pointer_vectorf(work->s, 0), + get_pointer_vectorf(work->scaling->Finvruiz, 0), + get_pointer_vectorf(work->s, 0), work->data->m); + + ew_product(get_pointer_vectorf(work->y, 0), + get_pointer_vectorf(work->scaling->Eruiz, 0), + get_pointer_vectorf(work->y, 0), work->data->p); + scale_arrayf(get_pointer_vectorf(work->y, 0), get_pointer_vectorf(work->y, 0), + work->scaling->kinv, work->data->p); + + ew_product(get_pointer_vectorf(work->z, 0), + get_pointer_vectorf(work->scaling->Fruiz, 0), + get_pointer_vectorf(work->z, 0), work->data->m); + scale_arrayf(get_pointer_vectorf(work->z, 0), get_pointer_vectorf(work->z, 0), + work->scaling->kinv, work->data->m); +} diff --git a/src/kkt.c b/src/kkt.c index cf1ccd3..32fdffa 100644 --- a/src/kkt.c +++ b/src/kkt.c @@ -23,7 +23,8 @@ QOCOCscMatrix* construct_kkt(QOCOCscMatrix* P, QOCOCscMatrix* A, KKT->m = n + m + p; KKT->n = n + m + p; - KKT->nnz = P->nnz + A->nnz + G->nnz + Wnnz + p; + QOCOInt Pnnz = P ? 
P->nnz : 0; + KKT->nnz = Pnnz + A->nnz + G->nnz + Wnnz + p; KKT->x = qoco_calloc(KKT->nnz, sizeof(QOCOFloat)); KKT->i = qoco_calloc(KKT->nnz, sizeof(QOCOInt)); @@ -32,15 +33,25 @@ QOCOCscMatrix* construct_kkt(QOCOCscMatrix* P, QOCOCscMatrix* A, QOCOInt nz = 0; QOCOInt col = 0; // Add P block - for (QOCOInt k = 0; k < P->nnz; ++k) { - PregtoKKT[k] = nz; - KKT->x[nz] = P->x[k]; - KKT->i[nz] = P->i[k]; - nz += 1; - } - for (QOCOInt k = 0; k < P->n + 1; ++k) { - KKT->p[col] = P->p[k]; - col += 1; + if (P) { + for (QOCOInt k = 0; k < P->nnz; ++k) { + if (PregtoKKT) { + PregtoKKT[k] = nz; + } + KKT->x[nz] = P->x[k]; + KKT->i[nz] = P->i[k]; + nz += 1; + } + for (QOCOInt k = 0; k < P->n + 1; ++k) { + KKT->p[col] = P->p[k]; + col += 1; + } + } else { + // P is NULL, so just set column pointers for n columns + for (QOCOInt k = 0; k < n + 1; ++k) { + KKT->p[col] = 0; + col += 1; + } } // Add A^T block @@ -189,9 +200,14 @@ void initialize_ipm(QOCOSolver* solver) // Construct rhs of KKT system. // Construct rhs of KKT system = [-c;b;h]. QOCOProblemData* data = solver->work->data; - copy_and_negate_arrayf(data->c, solver->work->rhs, data->n); - copy_arrayf(data->b, &solver->work->rhs[data->n], data->p); - copy_arrayf(data->h, &solver->work->rhs[data->n + data->p], data->m); + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat* bdata = get_data_vectorf(data->b); + QOCOFloat* hdata = get_data_vectorf(data->h); + QOCOFloat* rhs = get_data_vectorf(solver->work->rhs); + QOCOFloat* xyz = get_data_vectorf(solver->work->xyz); + copy_and_negate_arrayf(cdata, rhs, data->n); + copy_arrayf(bdata, &rhs[data->n], data->p); + copy_arrayf(hdata, &rhs[data->n + data->p], data->m); // Factor KKT matrix. solver->linsys->linsys_factor(solver->linsys_data, solver->work->data->n, @@ -199,28 +215,30 @@ void initialize_ipm(QOCOSolver* solver) // Solve KKT system. solver->linsys->linsys_solve(solver->linsys_data, solver->work, - solver->work->rhs, solver->work->xyz, + rhs, xyz, solver->settings->iter_ref_iters); - // Copy x part of solution to x. - copy_arrayf(solver->work->xyz, solver->work->x, solver->work->data->n); + // Copy x part of solution to x (GPU-to-GPU copy). + // During solve phase, get_data_vectorf returns device pointers, so copy_arrayf + // will detect both xyz and work->x are on device and use a CUDA kernel. + copy_arrayf(xyz, get_data_vectorf(solver->work->x), solver->work->data->n); - // Copy y part of solution to y. - copy_arrayf(&solver->work->xyz[solver->work->data->n], solver->work->y, + // Copy y part of solution to y (GPU-to-GPU copy). + copy_arrayf(&xyz[solver->work->data->n], get_data_vectorf(solver->work->y), solver->work->data->p); - // Copy z part of solution to z. - copy_arrayf(&solver->work->xyz[solver->work->data->n + solver->work->data->p], - solver->work->z, solver->work->data->m); + // Copy z part of solution to z (GPU-to-GPU copy). + copy_arrayf(&xyz[solver->work->data->n + solver->work->data->p], + get_data_vectorf(solver->work->z), solver->work->data->m); - // Copy and negate z part of solution to s. + // Copy and negate z part of solution to s (GPU-to-GPU copy). copy_and_negate_arrayf( - &solver->work->xyz[solver->work->data->n + solver->work->data->p], - solver->work->s, solver->work->data->m); + &xyz[solver->work->data->n + solver->work->data->p], + get_data_vectorf(solver->work->s), solver->work->data->m); // Bring s and z to cone C. 
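bring2cone itself lives in src/cone.c; purely as a mental model (illustrative, not the actual implementation, which also handles the second-order cone blocks), its nonnegative-orthant part can be pictured as:

/* Illustrative only: shift the LP-cone block of u into the interior of the
 * nonnegative orthant. The real bring2cone in src/cone.c also treats the
 * second-order cone blocks and may choose a different shift. */
static void bring2orthant_sketch(QOCOFloat* u, QOCOInt l)
{
  if (l == 0)
    return;
  QOCOFloat umin = u[0];
  for (QOCOInt i = 1; i < l; ++i) {
    umin = qoco_min(umin, u[i]);
  }
  if (umin <= 0.0) {
    for (QOCOInt i = 0; i < l; ++i) {
      u[i] += 1.0 - umin; /* afterwards every component is >= 1 */
    }
  }
}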
- bring2cone(solver->work->s, solver->work->data); - bring2cone(solver->work->z, solver->work->data); + bring2cone(get_data_vectorf(solver->work->s), solver->work->data); + bring2cone(get_data_vectorf(solver->work->z), solver->work->data); } void compute_kkt_residual(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* y, @@ -240,14 +258,17 @@ void compute_kkt_residual(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* y, // rhs += [c;-b;-h+s] // Add c and account for regularization of P. - qoco_axpy(data->c, kktres, kktres, 1.0, data->n); + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat* bdata = get_data_vectorf(data->b); + QOCOFloat* hdata = get_data_vectorf(data->h); + qoco_axpy(cdata, kktres, kktres, 1.0, data->n); qoco_axpy(x, kktres, kktres, -static_reg, data->n); // Add -b. - qoco_axpy(data->b, &kktres[data->n], &kktres[data->n], -1.0, data->p); + qoco_axpy(bdata, &kktres[data->n], &kktres[data->n], -1.0, data->p); // Add -h + s. - qoco_axpy(data->h, &kktres[data->n + data->p], &kktres[data->n + data->p], + qoco_axpy(hdata, &kktres[data->n + data->p], &kktres[data->n + data->p], -1.0, data->m); qoco_axpy(s, &kktres[data->n + data->p], &kktres[data->n + data->p], 1.0, data->m); @@ -256,8 +277,15 @@ void compute_kkt_residual(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* y, QOCOFloat compute_objective(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* nbuff, QOCOFloat static_reg, QOCOFloat k) { - QOCOFloat obj = qoco_dot(x, data->c, data->n); - USpMv(data->P, x, nbuff); + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat obj = qoco_dot(x, cdata, data->n); + if (data->P) { + USpMv_matrix(data->P, x, nbuff); + } else { + for (QOCOInt i = 0; i < data->n; ++i) { + nbuff[i] = 0.0; + } + } // Correct for regularization in P. QOCOFloat regularization_correction = 0.0; @@ -270,8 +298,11 @@ QOCOFloat compute_objective(QOCOProblemData* data, QOCOFloat* x, } void construct_kkt_aff_rhs(QOCOWorkspace* work) { + // During solve phase, get_data_vectorf returns device pointer automatically + QOCOFloat* rhs = get_data_vectorf(work->rhs); + // Negate the kkt residual and store in rhs. - copy_and_negate_arrayf(work->kktres, work->rhs, + copy_and_negate_arrayf(work->kktres, rhs, work->data->n + work->data->p + work->data->m); // Compute W*lambda @@ -279,15 +310,18 @@ void construct_kkt_aff_rhs(QOCOWorkspace* work) work->data->m, work->data->nsoc, work->data->q); // Add W*lambda to z portion of rhs. - qoco_axpy(work->ubuff1, &work->rhs[work->data->n + work->data->p], - &work->rhs[work->data->n + work->data->p], 1.0, work->data->m); + qoco_axpy(work->ubuff1, &rhs[work->data->n + work->data->p], + &rhs[work->data->n + work->data->p], 1.0, work->data->m); } void construct_kkt_comb_rhs(QOCOWorkspace* work) { + // During solve phase, get_data_vectorf returns device pointer automatically + QOCOFloat* rhs = get_data_vectorf(work->rhs); + QOCOFloat* xyz = get_data_vectorf(work->xyz); // Negate the kkt residual and store in rhs. - copy_and_negate_arrayf(work->kktres, work->rhs, + copy_and_negate_arrayf(work->kktres, rhs, work->data->n + work->data->p + work->data->m); /// ds = -cone_product(lambda, lambda) - settings.mehrotra * @@ -298,7 +332,7 @@ void construct_kkt_comb_rhs(QOCOWorkspace* work) work->data->m, work->data->nsoc, work->data->q); // ubuff2 = W * Dzaff. 
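Spelled out, the correction assembled over the next several statements is the standard Mehrotra combined-step right-hand side matching the ds comment above:

$$d_s = -\lambda \circ \lambda \;-\; \texttt{mehrotra} \cdot \big(W^{-\top} \Delta s^{\text{aff}}\big) \circ \big(W \Delta z^{\text{aff}}\big) \;+\; \sigma \mu\, e,$$

after which the $z$-block of rhs is decremented by $W^{\top} (\lambda \setminus d_s)$, with $\circ$ the cone product and $\lambda \setminus d_s$ the cone division computed by cone_division().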
- QOCOFloat* Dzaff = &work->xyz[work->data->n + work->data->p]; + QOCOFloat* Dzaff = &xyz[work->data->n + work->data->p]; nt_multiply(work->Wfull, Dzaff, work->ubuff2, work->data->l, work->data->m, work->data->nsoc, work->data->q); @@ -335,8 +369,8 @@ void construct_kkt_comb_rhs(QOCOWorkspace* work) work->data->m, work->data->nsoc, work->data->q); // rhs = [dx;dy;dz-W'*cone_division(lambda, ds, pdata)]; - qoco_axpy(work->ubuff1, &work->rhs[work->data->n + work->data->p], - &work->rhs[work->data->n + work->data->p], -1.0, work->data->m); + qoco_axpy(work->ubuff1, &rhs[work->data->n + work->data->p], + &rhs[work->data->n + work->data->p], -1.0, work->data->m); } void predictor_corrector(QOCOSolver* solver) @@ -351,12 +385,15 @@ void predictor_corrector(QOCOSolver* solver) construct_kkt_aff_rhs(work); // Solve to get affine scaling direction. + // During solve phase, get_data_vectorf returns device pointers automatically + QOCOFloat* rhs = get_data_vectorf(work->rhs); + QOCOFloat* xyz = get_data_vectorf(work->xyz); solver->linsys->linsys_solve(solver->linsys_data, solver->work, - solver->work->rhs, solver->work->xyz, + rhs, xyz, solver->settings->iter_ref_iters); // Compute Dsaff. Dsaff = W' * (-lambda - W * Dzaff). - QOCOFloat* Dzaff = &work->xyz[work->data->n + work->data->p]; + QOCOFloat* Dzaff = &xyz[work->data->n + work->data->p]; nt_multiply(work->Wfull, Dzaff, work->ubuff1, work->data->l, work->data->m, work->data->nsoc, work->data->q); for (QOCOInt i = 0; i < work->data->m; ++i) { @@ -368,25 +405,64 @@ void predictor_corrector(QOCOSolver* solver) // Compute centering parameter. compute_centering(solver); - // Construct rhs for affine scaling direction. + // Construct rhs for combined direction. construct_kkt_comb_rhs(work); // Solve to get combined direction. + // During solve phase, get_data_vectorf returns device pointers automatically + rhs = get_data_vectorf(work->rhs); + xyz = get_data_vectorf(work->xyz); solver->linsys->linsys_solve(solver->linsys_data, solver->work, - solver->work->rhs, solver->work->xyz, + rhs, xyz, solver->settings->iter_ref_iters); + // Check if solution has NaNs. If NaNs are present, early exit and set a to // 0.0 to trigger reduced tolerance optimality checks. 
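A possible refinement, sketched here only: the device-to-host transfer performed below for the NaN check could be avoided entirely with a one-int reduction on the GPU. This kernel is illustrative and not part of the patch:

// Sketch only (not in the patch): flag NaNs on the device so that a single
// int, rather than the full xyz vector, crosses the PCIe bus. d_flag is a
// device-resident int that the caller zeroes before the launch and reads
// back afterwards.
__global__ void has_nan_kernel(const QOCOFloat* v, QOCOInt n, int* d_flag)
{
  QOCOInt i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && isnan(v[i])) {
    atomicExch(d_flag, 1); // any thread that sees a NaN raises the flag
  }
}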
+ // During solve phase, xyz lives on the device, so sync it to the host for the NaN check + // This is error checking only, not part of the normal solve flow + #ifdef QOCO_ALGEBRA_BACKEND_CUDA + extern int get_solve_phase(void); + extern void copy_vector_from_device(QOCOVectorf* src, QOCOFloat* dst, QOCOInt n); + if (get_solve_phase()) { + // Temporarily sync xyz from device to host for NaN check + // This creates a temporary copy, but it's only for error checking + QOCOInt n = work->data->n + work->data->m + work->data->p; + QOCOFloat* xyz_host = qoco_malloc(n * sizeof(QOCOFloat)); + copy_vector_from_device(work->xyz, xyz_host, n); + for (QOCOInt i = 0; i < n; ++i) { + if (isnan(xyz_host[i])) { + qoco_free(xyz_host); + work->a = 0.0; + return; + } + } + qoco_free(xyz_host); + } else { + // Not in solve phase, use host pointer directly + QOCOFloat* xyz_host = get_data_vectorf(work->xyz); + for (QOCOInt i = 0; i < work->data->n + work->data->p + work->data->m; ++i) { + if (isnan(xyz_host[i])) { + work->a = 0.0; + return; + } + } + } + #else + // Builtin backend - use host pointer directly + QOCOFloat* xyz_host = get_data_vectorf(work->xyz); for (QOCOInt i = 0; i < work->data->n + work->data->p + work->data->m; ++i) { - if (isnan(work->xyz[i])) { + if (isnan(xyz_host[i])) { work->a = 0.0; return; } } + #endif // Compute Ds. Ds = W' * (cone_division(lambda, ds, pdata) - W * Dz). ds // computed in construct_kkt_comb_rhs() and stored in work->Ds. - QOCOFloat* Dz = &work->xyz[work->data->n + work->data->p]; + // Use device pointer during solve phase + QOCOFloat* Dz = &xyz[work->data->n + work->data->p]; cone_division(work->lambda, work->Ds, work->ubuff1, work->data->l, work->data->nsoc, work->data->q); nt_multiply(work->Wfull, Dz, work->ubuff2, work->data->l, work->data->m, @@ -397,21 +473,25 @@ work->data->nsoc, work->data->q); // Compute step-size. - QOCOFloat a = qoco_min(linesearch(work->s, work->Ds, 0.99, solver), - linesearch(work->z, Dz, 0.99, solver)); + QOCOFloat a = qoco_min(linesearch(get_data_vectorf(work->s), work->Ds, 0.99, solver), + linesearch(get_data_vectorf(work->z), Dz, 0.99, solver)); // Save step-size. work->a = a; // Update iterates. - QOCOFloat* Dx = work->xyz; - QOCOFloat* Dy = &work->xyz[work->data->n]; + // During solve phase, get_data_vectorf returns device pointer automatically + QOCOFloat* Dx = xyz; + QOCOFloat* Dy = &xyz[work->data->n]; QOCOFloat* Ds = work->Ds; - qoco_axpy(Dx, work->x, work->x, a, work->data->n); - qoco_axpy(Ds, work->s, work->s, a, work->data->m); - qoco_axpy(Dy, work->y, work->y, a, work->data->p); - qoco_axpy(Dz, work->z, work->z, a, work->data->m); + qoco_axpy(Dx, get_data_vectorf(work->x), get_data_vectorf(work->x), a, work->data->n); + qoco_axpy(Ds, get_data_vectorf(work->s), get_data_vectorf(work->s), a, work->data->m); + qoco_axpy(Dy, get_data_vectorf(work->y), get_data_vectorf(work->y), a, work->data->p); + qoco_axpy(Dz, get_data_vectorf(work->z), get_data_vectorf(work->z), a, work->data->m); + + // Note: No sync needed here - get_data_vectorf returns device pointer during solve phase, + // so qoco_axpy operates directly on device memory } void kkt_multiply(QOCOFloat* x, QOCOFloat* y, QOCOProblemData* data, @@ -420,18 +500,24 @@ { // Compute y[1:n] = P * x[1:n] + A^T * x[n+1:n+p] + G^T * x[n+p+1:n+p+m].
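In block form (the convention the trailing comment spells out componentwise), this routine applies

$$\begin{bmatrix} y_1 \\ y_2 \\ y_3 \end{bmatrix} = \begin{bmatrix} P & A^\top & G^\top \\ A & 0 & 0 \\ G & 0 & -W^\top W \end{bmatrix} \begin{bmatrix} x_1 \\ x_2 \\ x_3 \end{bmatrix},$$

where the $-W^\top W$ Nesterov-Todd block (the same values cudss_update_nt writes into the KKT matrix) is applied only when a Wfull argument is supplied.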
- USpMv(data->P, x, y); + if (data->P) { + USpMv_matrix(data->P, x, y); + } else { + for (QOCOInt i = 0; i < data->n; ++i) { + y[i] = 0.0; + } + } if (data->p > 0) { - SpMtv(data->A, &x[data->n], nbuff); + SpMtv_matrix(data->A, &x[data->n], nbuff); qoco_axpy(y, nbuff, y, 1.0, data->n); - SpMv(data->A, x, &y[data->n]); + SpMv_matrix(data->A, x, &y[data->n]); } if (data->m > 0) { - SpMtv(data->G, &x[data->n + data->p], nbuff); + SpMtv_matrix(data->G, &x[data->n + data->p], nbuff); qoco_axpy(y, nbuff, y, 1.0, data->n); - SpMv(data->G, x, &y[data->n + data->p]); + SpMv_matrix(data->G, x, &y[data->n + data->p]); } if (Wfull) { diff --git a/src/qoco_api.c b/src/qoco_api.c index de81a2a..7f95414 100644 --- a/src/qoco_api.c +++ b/src/qoco_api.c @@ -10,7 +10,7 @@ #include "qoco_api.h" #include "amd.h" -#include "qdldl_backend.h" // TODO: make this modular so we can use any backend. +#include "backend.h" QOCOInt qoco_setup(QOCOSolver* solver, QOCOInt n, QOCOInt m, QOCOInt p, QOCOCscMatrix* P, QOCOFloat* c, QOCOCscMatrix* A, @@ -44,60 +44,69 @@ QOCOInt qoco_setup(QOCOSolver* solver, QOCOInt n, QOCOInt m, QOCOInt p, data->m = m; data->n = n; data->p = p; - data->A = new_qoco_csc_matrix(A); - data->G = new_qoco_csc_matrix(G); - data->c = qoco_malloc(n * sizeof(QOCOFloat)); - data->b = qoco_malloc(p * sizeof(QOCOFloat)); - data->h = qoco_malloc(m * sizeof(QOCOFloat)); + solver->work->data->A = new_qoco_matrix(A); + solver->work->data->G = new_qoco_matrix(G); data->q = qoco_malloc(nsoc * sizeof(QOCOInt)); - copy_arrayf(c, data->c, n); - copy_arrayf(b, data->b, p); - copy_arrayf(h, data->h, m); copy_arrayi(q, data->q, nsoc); + solver->work->data->c = new_qoco_vectorf(c, n); + solver->work->data->b = new_qoco_vectorf(b, p); + solver->work->data->h = new_qoco_vectorf(h, m); + data->l = l; data->nsoc = nsoc; // Copy P. if (P) { - data->P = new_qoco_csc_matrix(P); + data->P = new_qoco_matrix(P); } else { data->P = NULL; } // Equilibrate data. - QOCOInt Annz = A ? A->nnz : 0; - QOCOInt Gnnz = G ? G->nnz : 0; + QOCOInt Annz = A ? get_nnz(data->A) : 0; + QOCOInt Gnnz = G ? 
get_nnz(data->G) : 0; solver->work->scaling = qoco_malloc(sizeof(QOCOScaling)); - solver->work->scaling->delta = qoco_malloc((n + p + m) * sizeof(QOCOFloat)); - solver->work->scaling->Druiz = qoco_malloc(n * sizeof(QOCOFloat)); - solver->work->scaling->Eruiz = qoco_malloc(p * sizeof(QOCOFloat)); - solver->work->scaling->Fruiz = qoco_malloc(m * sizeof(QOCOFloat)); - solver->work->scaling->Dinvruiz = qoco_malloc(n * sizeof(QOCOFloat)); - solver->work->scaling->Einvruiz = qoco_malloc(p * sizeof(QOCOFloat)); - solver->work->scaling->Finvruiz = qoco_malloc(m * sizeof(QOCOFloat)); + solver->work->scaling->delta = new_qoco_vectorf(NULL, n + p + m); + solver->work->scaling->Druiz = new_qoco_vectorf(NULL, n); + solver->work->scaling->Eruiz = new_qoco_vectorf(NULL, p); + solver->work->scaling->Fruiz = new_qoco_vectorf(NULL, m); + solver->work->scaling->Dinvruiz = new_qoco_vectorf(NULL, n); + solver->work->scaling->Einvruiz = new_qoco_vectorf(NULL, p); + solver->work->scaling->Finvruiz = new_qoco_vectorf(NULL, m); solver->work->data->AtoAt = qoco_malloc(Annz * sizeof(QOCOInt)); solver->work->data->GtoGt = qoco_malloc(Gnnz * sizeof(QOCOInt)); - solver->work->data->At = create_transposed_matrix(data->A, data->AtoAt); - solver->work->data->Gt = create_transposed_matrix(data->G, data->GtoGt); + QOCOCscMatrix* Atcsc = create_transposed_matrix(get_csc_matrix(data->A), data->AtoAt); + solver->work->data->At = new_qoco_matrix(Atcsc); + free_qoco_csc_matrix(Atcsc); + QOCOCscMatrix* Gtcsc = create_transposed_matrix(get_csc_matrix(data->G), data->GtoGt); + solver->work->data->Gt = new_qoco_matrix(Gtcsc); + free_qoco_csc_matrix(Gtcsc); ruiz_equilibration(data, solver->work->scaling, solver->settings->ruiz_iters); // Regularize P. data->Pnzadded_idx = qoco_calloc(n, sizeof(QOCOInt)); if (P) { - QOCOInt num_diagP = count_diag(P); + QOCOCscMatrix* Pcsc = get_csc_matrix(data->P); + QOCOInt num_diagP = count_diag(Pcsc); data->Pnum_nzadded = n - num_diagP; + // Create a copy of the CSC matrix since regularize_P will free it + QOCOCscMatrix* Pcsc_copy = new_qoco_csc_matrix(Pcsc); QOCOCscMatrix* Preg = - regularize_P(num_diagP, data->P, solver->settings->kkt_static_reg, + regularize_P(num_diagP, Pcsc_copy, solver->settings->kkt_static_reg, data->Pnzadded_idx); - data->P = Preg; + free_qoco_matrix(data->P); + data->P = new_qoco_matrix(Preg); + free_qoco_csc_matrix(Preg); } else { - data->P = construct_identity(n, solver->settings->kkt_static_reg); + QOCOCscMatrix* Pid = construct_identity(n, solver->settings->kkt_static_reg); + data->P = new_qoco_matrix(Pid); + free_qoco_csc_matrix(Pid); data->Pnum_nzadded = n; } @@ -120,10 +129,10 @@ QOCOInt qoco_setup(QOCOSolver* solver, QOCOInt n, QOCOInt m, QOCOInt p, } // Allocate primal and dual variables. - solver->work->x = qoco_malloc(n * sizeof(QOCOFloat)); - solver->work->s = qoco_malloc(m * sizeof(QOCOFloat)); - solver->work->y = qoco_malloc(p * sizeof(QOCOFloat)); - solver->work->z = qoco_malloc(m * sizeof(QOCOFloat)); + solver->work->x = new_qoco_vectorf(NULL, n); + solver->work->s = new_qoco_vectorf(NULL, m); + solver->work->y = new_qoco_vectorf(NULL, p); + solver->work->z = new_qoco_vectorf(NULL, m); solver->work->mu = 0.0; // Allocate Nesterov-Todd scalings and scaled variables. 
@@ -154,9 +163,9 @@ QOCOInt qoco_setup(QOCOSolver* solver, QOCOInt n, QOCOInt m, QOCOInt p, solver->work->ubuff2 = qoco_malloc(m * sizeof(QOCOFloat)); solver->work->ubuff3 = qoco_malloc(m * sizeof(QOCOFloat)); solver->work->Ds = qoco_malloc(m * sizeof(QOCOFloat)); - solver->work->rhs = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); + solver->work->rhs = new_qoco_vectorf(NULL, n + m + p); solver->work->kktres = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); - solver->work->xyz = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); + solver->work->xyz = new_qoco_vectorf(NULL, n + m + p); solver->work->xyzbuff1 = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); solver->work->xyzbuff2 = qoco_malloc((n + m + p) * sizeof(QOCOFloat)); @@ -231,23 +240,29 @@ void update_vector_data(QOCOSolver* solver, QOCOFloat* cnew, QOCOFloat* bnew, // Update cost vector. if (cnew) { + QOCOFloat* cdata = get_data_vectorf(data->c); + QOCOFloat* Druiz_data = get_data_vectorf(solver->work->scaling->Druiz); for (QOCOInt i = 0; i < data->n; ++i) { - data->c[i] = - solver->work->scaling->k * solver->work->scaling->Druiz[i] * cnew[i]; + cdata[i] = + solver->work->scaling->k * Druiz_data[i] * cnew[i]; } } // Update equality constraint vector. if (bnew) { + QOCOFloat* bdata = get_data_vectorf(data->b); + QOCOFloat* Eruiz_data = get_data_vectorf(solver->work->scaling->Eruiz); for (QOCOInt i = 0; i < data->p; ++i) { - data->b[i] = solver->work->scaling->Eruiz[i] * bnew[i]; + bdata[i] = Eruiz_data[i] * bnew[i]; } } // Update conic constraint vector. if (hnew) { + QOCOFloat* hdata = get_data_vectorf(data->h); + QOCOFloat* Fruiz_data = get_data_vectorf(solver->work->scaling->Fruiz); for (QOCOInt i = 0; i < data->m; ++i) { - data->h[i] = solver->work->scaling->Fruiz[i] * hnew[i]; + hdata[i] = Fruiz_data[i] * hnew[i]; } } } @@ -260,57 +275,76 @@ void update_matrix_data(QOCOSolver* solver, QOCOFloat* Pxnew, QOCOFloat* Axnew, QOCOScaling* scaling = solver->work->scaling; // Undo regularization. - unregularize(data->P, solver->settings->kkt_static_reg); + QOCOCscMatrix* Pcsc = get_csc_matrix(data->P); + unregularize(Pcsc, solver->settings->kkt_static_reg); // Unequilibrate P. - scale_arrayf(data->P->x, data->P->x, scaling->kinv, data->P->nnz); - row_col_scale(data->P, scaling->Dinvruiz, scaling->Dinvruiz); + QOCOFloat* Px = Pcsc->x; + QOCOInt Pnnz = get_nnz(data->P); + QOCOFloat* Dinvruiz_data = get_data_vectorf(scaling->Dinvruiz); + scale_arrayf(Px, Px, scaling->kinv, Pnnz); + row_col_scale(Pcsc, Dinvruiz_data, Dinvruiz_data); // Unequilibrate c. - scale_arrayf(data->c, data->c, scaling->kinv, data->n); - ew_product(data->c, scaling->Dinvruiz, data->c, data->n); + QOCOFloat* cdata = get_data_vectorf(data->c); + scale_arrayf(cdata, cdata, scaling->kinv, data->n); + ew_product(cdata, Dinvruiz_data, cdata, data->n); // Unequilibrate A. - row_col_scale(data->A, scaling->Einvruiz, scaling->Dinvruiz); + QOCOCscMatrix* Acsc = get_csc_matrix(data->A); + QOCOFloat* Einvruiz_data = get_data_vectorf(scaling->Einvruiz); + row_col_scale(Acsc, Einvruiz_data, Dinvruiz_data); // Unequilibrate G. - row_col_scale(data->G, scaling->Finvruiz, scaling->Dinvruiz); + QOCOCscMatrix* Gcsc = get_csc_matrix(data->G); + QOCOFloat* Finvruiz_data = get_data_vectorf(scaling->Finvruiz); + row_col_scale(Gcsc, Finvruiz_data, Dinvruiz_data); // Unequilibrate b. - ew_product(data->b, scaling->Einvruiz, data->b, data->p); + QOCOFloat* bdata = get_data_vectorf(data->b); + ew_product(bdata, Einvruiz_data, bdata, data->p); // Unequilibrate h. 
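For reference, the sequence above inverts the Ruiz scaling exactly. With hats denoting the equilibrated quantities held internally,

$$P = k^{-1} D^{-1} \hat{P} D^{-1}, \quad c = k^{-1} D^{-1} \hat{c}, \quad A = E^{-1} \hat{A} D^{-1}, \quad G = F^{-1} \hat{G} D^{-1}, \quad b = E^{-1} \hat{b}, \quad h = F^{-1} \hat{h},$$

where $D = \mathrm{diag}(\texttt{Druiz})$, $E = \mathrm{diag}(\texttt{Eruiz})$, $F = \mathrm{diag}(\texttt{Fruiz})$, and $k$ is the cost scaling.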
- ew_product(data->h, scaling->Finvruiz, data->h, data->m); + QOCOFloat* hdata = get_data_vectorf(data->h); + ew_product(hdata, Finvruiz_data, hdata, data->m); // Update P and avoid nonzeros that were added for regularization. if (Pxnew) { QOCOInt avoid = - data->Pnum_nzadded > 0 ? data->Pnzadded_idx[0] : data->P->nnz + 1; + data->Pnum_nzadded > 0 ? data->Pnzadded_idx[0] : Pnnz + 1; QOCOInt offset = 0; - for (QOCOInt i = 0; i < data->P->nnz - data->Pnum_nzadded; ++i) { + for (QOCOInt i = 0; i < Pnnz - data->Pnum_nzadded; ++i) { if (i == avoid) { offset++; avoid = data->Pnzadded_idx[offset]; } else { - data->P->x[i + offset] = Pxnew[i]; + Px[i + offset] = Pxnew[i]; } } } // Update A. if (Axnew) { - for (QOCOInt i = 0; i < data->A->nnz; ++i) { - data->A->x[i] = Axnew[i]; - data->At->x[i] = Axnew[data->AtoAt[i]]; + QOCOCscMatrix* Atcsc = get_csc_matrix(data->At); + QOCOFloat* Ax = Acsc->x; + QOCOFloat* Atx = Atcsc->x; + QOCOInt Annz = get_nnz(data->A); + for (QOCOInt i = 0; i < Annz; ++i) { + Ax[i] = Axnew[i]; + Atx[i] = Axnew[data->AtoAt[i]]; } } // Update G. if (Gxnew) { - for (QOCOInt i = 0; i < data->G->nnz; ++i) { - data->G->x[i] = Gxnew[i]; - data->Gt->x[i] = Gxnew[data->GtoGt[i]]; + QOCOCscMatrix* Gtcsc = get_csc_matrix(data->Gt); + QOCOFloat* Gx = Gcsc->x; + QOCOFloat* Gtx = Gtcsc->x; + QOCOInt Gnnz = get_nnz(data->G); + for (QOCOInt i = 0; i < Gnnz; ++i) { + Gx[i] = Gxnew[i]; + Gtx[i] = Gxnew[data->GtoGt[i]]; } } @@ -319,7 +353,7 @@ void update_matrix_data(QOCOSolver* solver, QOCOFloat* Pxnew, QOCOFloat* Axnew, solver->settings->ruiz_iters); // Regularize P. - unregularize(data->P, -solver->settings->kkt_static_reg); + unregularize(get_csc_matrix(data->P), -solver->settings->kkt_static_reg); solver->linsys->linsys_update_data(solver->linsys_data, solver->work->data); } @@ -340,28 +374,40 @@ QOCOInt qoco_solve(QOCOSolver* solver) print_header(solver); } + // Set solve phase flag for CUDA backend (prevents CPU-GPU copies during solve) + // During solve phase, get_data_vectorf returns device pointers automatically + #ifdef QOCO_ALGEBRA_BACKEND_CUDA + extern void set_solve_phase(int active); + set_solve_phase(1); + #endif + // Get initializations for primal and dual variables. initialize_ipm(solver); for (QOCOInt i = 1; i <= solver->settings->max_iters; ++i) { // Compute kkt residual. - compute_kkt_residual(data, work->x, work->y, work->s, work->z, work->kktres, + compute_kkt_residual(data, get_data_vectorf(work->x), get_data_vectorf(work->y), get_data_vectorf(work->s), get_data_vectorf(work->z), work->kktres, solver->settings->kkt_static_reg, work->xyzbuff1, work->xbuff, work->ubuff1, work->ubuff2); // Compute objective function. solver->sol->obj = - compute_objective(data, work->x, work->xbuff, + compute_objective(data, get_data_vectorf(work->x), work->xbuff, solver->settings->kkt_static_reg, work->scaling->k); // Compute mu = s'*z / m. work->mu = (data->m > 0) - ? safe_div(qoco_dot(work->s, work->z, data->m), data->m) + ? safe_div(qoco_dot(get_data_vectorf(work->s), get_data_vectorf(work->z), data->m), data->m) : 0; // Check stopping criteria. 
if (check_stopping(solver)) { stop_timer(&(work->solve_timer)); + // Clear solve phase flag before copying solution (allows copy from device to host) + #ifdef QOCO_ALGEBRA_BACKEND_CUDA + extern void set_solve_phase(int active); + set_solve_phase(0); + #endif unscale_variables(work); copy_solution(solver); if (solver->settings->verbose) { @@ -389,9 +435,14 @@ QOCOInt qoco_solve(QOCOSolver* solver) } } - stop_timer(&(work->solve_timer)); - unscale_variables(work); - copy_solution(solver); + stop_timer(&(work->solve_timer)); + // Clear solve phase flag before copying solution (allows copy from device to host) + #ifdef QOCO_ALGEBRA_BACKEND_CUDA + extern void set_solve_phase(int active); + set_solve_phase(0); + #endif + unscale_variables(work); + copy_solution(solver); solver->sol->status = QOCO_MAX_ITER; if (solver->settings->verbose) { print_footer(solver->sol, solver->sol->status); @@ -403,17 +454,19 @@ QOCOInt qoco_cleanup(QOCOSolver* solver) { // Free problem data. - free_qoco_csc_matrix(solver->work->data->P); - free_qoco_csc_matrix(solver->work->data->A); - free_qoco_csc_matrix(solver->work->data->G); - free_qoco_csc_matrix(solver->work->data->At); - free_qoco_csc_matrix(solver->work->data->Gt); + if (solver->work->data->P) { + free_qoco_matrix(solver->work->data->P); + } + free_qoco_matrix(solver->work->data->A); + free_qoco_matrix(solver->work->data->G); + free_qoco_matrix(solver->work->data->At); + free_qoco_matrix(solver->work->data->Gt); qoco_free(solver->work->data->AtoAt); qoco_free(solver->work->data->GtoGt); - qoco_free(solver->work->data->b); - qoco_free(solver->work->data->c); - qoco_free(solver->work->data->h); + free_qoco_vectorf(solver->work->data->b); + free_qoco_vectorf(solver->work->data->c); + free_qoco_vectorf(solver->work->data->h); qoco_free(solver->work->data->q); qoco_free(solver->work->data->Pnzadded_idx); qoco_free(solver->work->data); @@ -422,15 +475,15 @@ QOCOInt qoco_cleanup(QOCOSolver* solver) solver->linsys->linsys_cleanup(solver->linsys_data); // Free primal and dual variables. - qoco_free(solver->work->rhs); + free_qoco_vectorf(solver->work->rhs); qoco_free(solver->work->kktres); - qoco_free(solver->work->xyz); + free_qoco_vectorf(solver->work->xyz); qoco_free(solver->work->xyzbuff1); qoco_free(solver->work->xyzbuff2); - qoco_free(solver->work->x); - qoco_free(solver->work->s); - qoco_free(solver->work->y); - qoco_free(solver->work->z); + free_qoco_vectorf(solver->work->x); + free_qoco_vectorf(solver->work->s); + free_qoco_vectorf(solver->work->y); + free_qoco_vectorf(solver->work->z); // Free Nesterov-Todd scalings and scaled variables. qoco_free(solver->work->W); @@ -449,13 +502,13 @@ QOCOInt qoco_cleanup(QOCOSolver* solver) qoco_free(solver->work->Ds); // Free scaling struct. - qoco_free(solver->work->scaling->delta); - qoco_free(solver->work->scaling->Druiz); - qoco_free(solver->work->scaling->Eruiz); - qoco_free(solver->work->scaling->Fruiz); - qoco_free(solver->work->scaling->Dinvruiz); - qoco_free(solver->work->scaling->Einvruiz); - qoco_free(solver->work->scaling->Finvruiz); + free_qoco_vectorf(solver->work->scaling->delta); + free_qoco_vectorf(solver->work->scaling->Druiz); + free_qoco_vectorf(solver->work->scaling->Eruiz); + free_qoco_vectorf(solver->work->scaling->Fruiz); + free_qoco_vectorf(solver->work->scaling->Dinvruiz); + free_qoco_vectorf(solver->work->scaling->Einvruiz); + free_qoco_vectorf(solver->work->scaling->Finvruiz); qoco_free(solver->work->scaling); // Free solution struct. 
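The set_solve_phase(1)/set_solve_phase(0) pairing added to qoco_solve above is the synchronization contract with the CUDA backend: while the flag is set, get_data_vectorf hands out device pointers so the interior-point loop never triggers host-device traffic; once it is cleared, host pointers come back so the solution can be copied out. The cuda/ backend sources are not part of the hunks shown here, so the sketch below only illustrates that contract; the typedefs and the d_data field name are assumptions, and a real implementation would track staleness rather than copy eagerly (presumably the job of the sync_vector_to_device_if_needed helper declared in src/qoco_utils.c below).

```c
/* Illustration only: the real cuda/ backend is not shown in this patch.
 * Typedefs and the d_data field name are assumptions. */
#include <cuda_runtime.h>

typedef double QOCOFloat; /* assumed default float type */
typedef int QOCOInt;

typedef struct {
  QOCOInt len;
  QOCOFloat* data;   /* host copy */
  QOCOFloat* d_data; /* device copy (assumed field name) */
} QOCOVectorf;

static int solve_phase_active = 0;

void set_solve_phase(int active) { solve_phase_active = active; }

QOCOFloat* get_data_vectorf(const QOCOVectorf* v)
{
  if (solve_phase_active) {
    /* Hot path: the IPM iterations stay entirely on the device. */
    return v->d_data;
  }
  /* Outside the solve phase, refresh the host copy before handing it out,
   * e.g. so copy_solution and the tests read current values. */
  cudaMemcpy(v->data, v->d_data, (size_t)v->len * sizeof(QOCOFloat),
             cudaMemcpyDeviceToHost);
  return v->data;
}
```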
diff --git a/src/qoco_utils.c b/src/qoco_utils.c index e19b092..488c427 100644 --- a/src/qoco_utils.c +++ b/src/qoco_utils.c @@ -10,6 +10,10 @@ #include "qoco_utils.h" +#ifdef QOCO_ALGEBRA_BACKEND_CUDA +extern void sync_vector_to_device_if_needed(QOCOVectorf* v); +#endif + void print_qoco_csc_matrix(QOCOCscMatrix* M) { printf("\nPrinting CSC Matrix:\n"); @@ -87,9 +91,9 @@ void print_header(QOCOSolver* solver) printf("| eq constraints: %-9d |\n", data->p); printf("| ineq constraints: %-9d |\n", data->l); printf("| soc constraints: %-9d |\n", data->nsoc); - printf("| nnz(P): %-9d |\n", data->P->nnz - solver->work->data->Pnum_nzadded); - printf("| nnz(A): %-9d |\n", data->A->nnz); - printf("| nnz(G): %-9d |\n", data->G->nnz); + printf("| nnz(P): %-9d |\n", (data->P ? get_nnz(data->P) : 0) - solver->work->data->Pnum_nzadded); + printf("| nnz(A): %-9d |\n", get_nnz(data->A)); + printf("| nnz(G): %-9d |\n", get_nnz(data->G)); printf("| Solver Settings: |\n"); printf("| max_iters: %-3d abstol: %3.2e reltol: %3.2e |\n", settings->max_iters, settings->abstol, settings->reltol); printf("| abstol_inacc: %3.2e reltol_inacc: %3.2e |\n", settings->abstol_inacc, settings->reltol_inacc); @@ -133,55 +137,71 @@ unsigned char check_stopping(QOCOSolver* solver) QOCOFloat eabsinacc = solver->settings->abstol_inacc; QOCOFloat erelinacc = solver->settings->reltol_inacc; - ew_product(work->scaling->Einvruiz, data->b, work->ybuff, data->p); + QOCOFloat* Einvruiz_data = get_data_vectorf(work->scaling->Einvruiz); + QOCOFloat* bdata = get_data_vectorf(data->b); + ew_product(Einvruiz_data, bdata, work->ybuff, data->p); QOCOFloat binf = data->p > 0 ? inf_norm(work->ybuff, data->p) : 0; - ew_product(work->scaling->Fruiz, work->s, work->ubuff1, data->m); + QOCOFloat* Fruiz_data = get_data_vectorf(work->scaling->Fruiz); + QOCOFloat* sdata = get_data_vectorf(work->s); + ew_product(Fruiz_data, sdata, work->ubuff1, data->m); QOCOFloat sinf = data->m > 0 ? inf_norm(work->ubuff1, data->m) : 0; - ew_product(work->scaling->Dinvruiz, work->x, work->xbuff, data->n); + QOCOFloat* Dinvruiz_data = get_data_vectorf(work->scaling->Dinvruiz); + QOCOFloat* xdata = get_data_vectorf(work->x); + ew_product(Dinvruiz_data, xdata, work->xbuff, data->n); QOCOFloat cinf = inf_norm(work->xbuff, data->n); - ew_product(work->scaling->Finvruiz, data->h, work->ubuff3, data->m); + QOCOFloat* Finvruiz_data = get_data_vectorf(work->scaling->Finvruiz); + QOCOFloat* hdata = get_data_vectorf(data->h); + ew_product(Finvruiz_data, hdata, work->ubuff3, data->m); QOCOFloat hinf = data->m > 0 ? inf_norm(work->ubuff3, data->m) : 0; // Compute ||A^T * y||_\infty. If equality constraints aren't present, A->m = // A->n = 0 and SpMtv is a nullop. - SpMtv(data->A, work->y, work->xbuff); - ew_product(work->xbuff, work->scaling->Dinvruiz, work->xbuff, data->n); + QOCOFloat* ydata = get_data_vectorf(work->y); + SpMtv_matrix(data->A, ydata, work->xbuff); + ew_product(work->xbuff, Dinvruiz_data, work->xbuff, data->n); QOCOFloat Atyinf = data->p ? inf_norm(work->xbuff, data->n) : 0; // Compute ||G^T * z||_\infty. If inequality constraints aren't present, G->m // = G->n = 0 and SpMtv is a nullop. - SpMtv(data->G, work->z, work->xbuff); - ew_product(work->xbuff, work->scaling->Dinvruiz, work->xbuff, data->n); + QOCOFloat* zdata = get_data_vectorf(work->z); + SpMtv_matrix(data->G, zdata, work->xbuff); + ew_product(work->xbuff, Dinvruiz_data, work->xbuff, data->n); QOCOFloat Gtzinf = data->m > 0 ? 
inf_norm(work->xbuff, data->n) : 0; // Compute ||P * x||_\infty - USpMv(data->P, work->x, work->xbuff); + if (data->P) { + USpMv_matrix(data->P, xdata, work->xbuff); + } else { + for (QOCOInt i = 0; i < data->n; ++i) { + work->xbuff[i] = 0.0; + } + } for (QOCOInt i = 0; i < data->n; ++i) { - work->xbuff[i] -= solver->settings->kkt_static_reg * work->x[i]; + work->xbuff[i] -= solver->settings->kkt_static_reg * xdata[i]; } - ew_product(work->xbuff, work->scaling->Dinvruiz, work->xbuff, data->n); + ew_product(work->xbuff, Dinvruiz_data, work->xbuff, data->n); QOCOFloat Pxinf = inf_norm(work->xbuff, data->n); - QOCOFloat xPx = qoco_dot(work->x, work->xbuff, work->data->n); + QOCOFloat xPx = qoco_dot(xdata, work->xbuff, work->data->n); // Compute ||A * x||_\infty - SpMv(data->A, work->x, work->ybuff); - ew_product(work->ybuff, work->scaling->Einvruiz, work->ybuff, data->p); + SpMv_matrix(data->A, xdata, work->ybuff); + ew_product(work->ybuff, Einvruiz_data, work->ybuff, data->p); QOCOFloat Axinf = data->p ? inf_norm(work->ybuff, data->p) : 0; // Compute ||G * x||_\infty - SpMv(data->G, work->x, work->ubuff1); - ew_product(work->ubuff1, work->scaling->Finvruiz, work->ubuff1, data->m); + SpMv_matrix(data->G, xdata, work->ubuff1); + ew_product(work->ubuff1, Finvruiz_data, work->ubuff1, data->m); QOCOFloat Gxinf = data->m ? inf_norm(work->ubuff1, data->m) : 0; // Compute primal residual. - ew_product(&work->kktres[data->n], work->scaling->Einvruiz, work->ybuff, + ew_product(&work->kktres[data->n], Einvruiz_data, work->ybuff, data->p); QOCOFloat eq_res = inf_norm(work->ybuff, data->p); - ew_product(&work->kktres[data->n + data->p], work->scaling->Finvruiz, + ew_product(&work->kktres[data->n + data->p], Finvruiz_data, work->ubuff1, data->m); QOCOFloat conic_res = inf_norm(work->ubuff1, data->m); @@ -189,14 +209,14 @@ unsigned char check_stopping(QOCOSolver* solver) solver->sol->pres = pres; // Compute dual residual. - ew_product(work->kktres, work->scaling->Dinvruiz, work->xbuff, data->n); + ew_product(work->kktres, Dinvruiz_data, work->xbuff, data->n); scale_arrayf(work->xbuff, work->xbuff, work->scaling->kinv, data->n); QOCOFloat dres = inf_norm(work->xbuff, data->n); solver->sol->dres = dres; // Compute complementary slackness residual. - ew_product(work->s, work->scaling->Fruiz, work->ubuff1, data->m); - ew_product(work->z, work->scaling->Fruiz, work->ubuff2, data->m); + ew_product(sdata, Fruiz_data, work->ubuff1, data->m); + ew_product(zdata, Fruiz_data, work->ubuff2, data->m); QOCOFloat gap = qoco_dot(work->ubuff1, work->ubuff2, data->m); gap *= work->scaling->kinv; solver->sol->gap = gap; @@ -214,9 +234,10 @@ unsigned char check_stopping(QOCOSolver* solver) dres_rel *= work->scaling->kinv; // Compute max{1, abs(pobj), abs(dobj)}. - QOCOFloat ctx = qoco_dot(work->data->c, work->x, work->data->n); - QOCOFloat bty = qoco_dot(work->data->b, work->y, work->data->p); - QOCOFloat htz = qoco_dot(work->data->h, work->z, work->data->m); + QOCOFloat* cdata = get_data_vectorf(work->data->c); + QOCOFloat ctx = qoco_dot(cdata, xdata, work->data->n); + QOCOFloat bty = qoco_dot(bdata, ydata, work->data->p); + QOCOFloat htz = qoco_dot(hdata, zdata, work->data->m); QOCOFloat pobj = 0.5 * xPx + ctx; QOCOFloat dobj = -0.5 * xPx - bty - htz; @@ -251,11 +272,21 @@ unsigned char check_stopping(QOCOSolver* solver) void copy_solution(QOCOSolver* solver) { - // Copy optimization variables. 
- copy_arrayf(solver->work->x, solver->sol->x, solver->work->data->n); - copy_arrayf(solver->work->s, solver->sol->s, solver->work->data->m); - copy_arrayf(solver->work->y, solver->sol->y, solver->work->data->p); - copy_arrayf(solver->work->z, solver->sol->z, solver->work->data->m); + // Copy optimization variables into the solution struct. The solve-phase + // flag was cleared before this call, so get_data_vectorf returns host + // pointers; the sync calls below first reconcile host and device copies. +#ifdef QOCO_ALGEBRA_BACKEND_CUDA + sync_vector_to_device_if_needed(solver->work->rhs); + sync_vector_to_device_if_needed(solver->work->xyz); + sync_vector_to_device_if_needed(solver->work->x); + sync_vector_to_device_if_needed(solver->work->s); + sync_vector_to_device_if_needed(solver->work->y); + sync_vector_to_device_if_needed(solver->work->z); +#endif + copy_arrayf(get_data_vectorf(solver->work->x), solver->sol->x, solver->work->data->n); + copy_arrayf(get_data_vectorf(solver->work->s), solver->sol->s, solver->work->data->m); + copy_arrayf(get_data_vectorf(solver->work->y), solver->sol->y, solver->work->data->p); + copy_arrayf(get_data_vectorf(solver->work->z), solver->sol->z, solver->work->data->m); solver->sol->solve_time_sec = get_elapsed_time_sec(&(solver->work->solve_timer)); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index be0cd37..f942ef2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -64,6 +64,11 @@ foreach(file ${unit_tests}) "${PROJECT_SOURCE_DIR}/tests/main.cpp" "${PROJECT_SOURCE_DIR}/tests/utils/test_utils.cpp") target_link_libraries("${name}" qocostatic gtest_main) + # Link CUDA libraries for CUDA backend + if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda") + get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS) + target_link_libraries("${name}" ${qoco_cuda_libs}) + endif() add_test(NAME ${name} COMMAND "${name}") endforeach() @@ -78,6 +83,11 @@ foreach(file ${simple_tests}) "${PROJECT_SOURCE_DIR}/tests/main.cpp" "${PROJECT_SOURCE_DIR}/tests/utils/test_utils.cpp") target_link_libraries("${name}" qocostatic gtest_main) + # Link CUDA libraries for CUDA backend + if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda") + get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS) + target_link_libraries("${name}" ${qoco_cuda_libs}) + endif() add_test(NAME ${name} COMMAND "${name}") endforeach() @@ -95,6 +105,11 @@ foreach(file ${ocp}) "${PROJECT_SOURCE_DIR}/tests/main.cpp" "${PROJECT_SOURCE_DIR}/tests/utils/test_utils.cpp") target_link_libraries("${name}" qocostatic gtest_main) + # Link CUDA libraries for CUDA backend + if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda") + get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS) + target_link_libraries("${name}" ${qoco_cuda_libs}) + endif() add_test(NAME ${name} COMMAND "${name}") endforeach() @@ -108,5 +123,10 @@ foreach(file ${portfolio}) "${PROJECT_SOURCE_DIR}/tests/main.cpp" "${PROJECT_SOURCE_DIR}/tests/utils/test_utils.cpp") target_link_libraries("${name}" qocostatic gtest_main) + # Link CUDA libraries for CUDA backend + if(${QOCO_ALGEBRA_BACKEND} STREQUAL "cuda") + get_property(qoco_cuda_libs GLOBAL PROPERTY QOCO_CUDA_LIBS) + target_link_libraries("${name}" ${qoco_cuda_libs}) + endif() add_test(NAME ${name} COMMAND "${name}") endforeach() \ No newline at end of file diff --git a/tests/simple_tests/missing_constraints_test.cpp b/tests/simple_tests/missing_constraints_test.cpp index 60f4330..ef0fbf3 100644 --- a/tests/simple_tests/missing_constraints_test.cpp +++ 
b/tests/simple_tests/missing_constraints_test.cpp @@ -59,7 +59,7 @@ TEST(missing_constraints_test, no_soc_constraints) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -169,7 +169,7 @@ TEST(missing_constraints_test, no_eq_constraints) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -263,7 +263,7 @@ TEST(missing_constraints_test, lp_test_no_P) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -328,7 +328,7 @@ TEST(simple_socp_test, p1) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -395,7 +395,7 @@ TEST(simple_socp_test, p2) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -462,7 +462,7 @@ TEST(simple_socp_test, p3) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -591,7 +591,7 @@ TEST(simple_socp_test, reduced_tolerance) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED_INACCURATE); qoco_cleanup(solver); @@ -666,7 +666,7 @@ TEST(simple_socp_test, update_vector_data_test) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); @@ -738,7 +738,7 @@ TEST(simple_socp_test, update_constraint_data_test) expect_eq_vectorf(solver->sol->x, xexp, n, tol); expect_eq_vectorf(solver->sol->s, sexp, m, tol); expect_eq_vectorf(solver->sol->y, yexp, p, tol); - expect_eq_vectorf(solver->sol->z, zexp, n, tol); + expect_eq_vectorf(solver->sol->z, zexp, m, tol); ASSERT_EQ(exit, QOCO_SOLVED); qoco_cleanup(solver); diff --git a/tests/unit_tests/linalg_test.cpp b/tests/unit_tests/linalg_test.cpp index 8f0418a..8135472 100644 --- a/tests/unit_tests/linalg_test.cpp +++ b/tests/unit_tests/linalg_test.cpp @@ -409,9 +409,9 @@ TEST(linalg, ruiz_test) qoco_setup(solver, n, m, p, P, c, A, b, G, h, l, nsoc, q, settings); - 
expect_eq_vectorf(solver->work->scaling->Druiz, Dexp, n, tol); - expect_eq_vectorf(solver->work->scaling->Eruiz, Eexp, p, tol); - expect_eq_vectorf(solver->work->scaling->Fruiz, Fexp, m, tol); + expect_eq_vectorf(get_data_vectorf(solver->work->scaling->Druiz), Dexp, n, tol); + expect_eq_vectorf(get_data_vectorf(solver->work->scaling->Eruiz), Eexp, p, tol); + expect_eq_vectorf(get_data_vectorf(solver->work->scaling->Fruiz), Fexp, m, tol); EXPECT_NEAR(solver->work->scaling->k, kexp, tol); qoco_cleanup(solver);
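Two things are worth noting about the test changes. First, the repeated zexp comparisons in missing_constraints_test.cpp were genuine bugs: the conic dual z (like the slack s) has m entries, one per row of G, so comparing n entries read past the end of zexp whenever n != m. Second, ruiz_test must now go through get_data_vectorf because the scaling vectors are QOCOVectorf wrappers rather than raw arrays. A plain-C stand-in for the comparison helper is sketched below to make the role of the length argument explicit; the real expect_eq_vectorf lives in tests/utils/test_utils.cpp, which this patch does not show.

```c
/* Plain-C stand-in for the expect_eq_vectorf gtest helper (the real helper
 * in tests/utils/test_utils.cpp is not shown in this patch). */
#include <math.h>

typedef double QOCOFloat; /* assumed default float type */
typedef int QOCOInt;

static int arrays_match(const QOCOFloat* actual, const QOCOFloat* expected,
                        QOCOInt len, QOCOFloat tol)
{
  /* len must match the variable's dimension: n for x, p for y, and m for
   * s and z -- hence the zexp fixes from n to m above. */
  for (QOCOInt i = 0; i < len; ++i) {
    if (fabs(actual[i] - expected[i]) > tol) {
      return 0; /* mismatch at index i */
    }
  }
  return 1;
}
```

Called as arrays_match(get_data_vectorf(solver->work->scaling->Druiz), Dexp, n, tol), this mirrors the updated ruiz_test: the accessor yields a host pointer outside the solve phase on every backend, so the comparison itself stays backend-agnostic.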