Skip to content

Commit 93618d7

Browse files
committed
Working on everything but lcvx_bad_scale
1 parent 4a3f0e7 commit 93618d7

File tree

3 files changed

+188
-11
lines changed

3 files changed

+188
-11
lines changed

algebra/cuda/cudss_backend.cu

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -586,8 +586,8 @@ static void cudss_solve(LinSysData* linsys_data, QOCOWorkspace* work,
586586
{
587587
QOCOInt n = linsys_data->K->n;
588588
(void)iter_ref_iters; // No iterative refinement for CUDA backend
589-
(void)b; // b is ignored - RHS is already in d_xyzbuff1 via work->rhs->d_data mapping
590-
(void)x; // x is ignored - solution will be in d_xyzbuff2 via work->xyz->d_data mapping
589+
// b is copied to d_xyzbuff1 if provided
590+
// x will receive the solution from d_xyzbuff2 after cudssExecute
591591

592592
// During solve phase, ALL data is on GPU - NO CPU-GPU transfers
593593
// work->rhs->d_data points to d_xyzbuff1 (mapped in map_work_buffers_to_device)
@@ -643,6 +643,22 @@ static void cudss_solve(LinSysData* linsys_data, QOCOWorkspace* work,
643643
(status == CUDSS_STATUS_INTERNAL_ERROR) ? "INTERNAL_ERROR" : "UNKNOWN";
644644
fprintf(stderr, "ERROR: cuDSS solve failed with status %d (%s)\n", (int)status, err_str);
645645
} else {
646+
// Solution is now in d_xyzbuff2 (pointed to by d_xyz_matrix)
647+
// Copy solution from d_xyz_matrix (d_xyzbuff2) to x
648+
if (x) {
649+
cudaPointerAttributes attrs;
650+
cudaError_t err = cudaPointerGetAttributes(&attrs, x);
651+
if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) {
652+
// x is on device - device-to-device copy
653+
CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat),
654+
cudaMemcpyDeviceToDevice));
655+
} else {
656+
// x is on host - device-to-host copy
657+
CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat),
658+
cudaMemcpyDeviceToHost));
659+
}
660+
}
661+
646662
// Debug: print solution after solve
647663
QOCOFloat* sol_host = (QOCOFloat*)malloc(n * sizeof(QOCOFloat));
648664
if (sol_host) {
@@ -662,10 +678,22 @@ static void cudss_solve(LinSysData* linsys_data, QOCOWorkspace* work,
662678
}
663679

664680
// Solution is now in d_xyzbuff2 (pointed to by d_xyz_matrix and work->xyz->d_data)
665-
// No need to copy - work->xyz->d_data already points to d_xyzbuff2
681+
// Solution has been copied to x (if provided) after successful cudssExecute
666682
#else
667683
// cuDSS not available - use fallback: copy solution from RHS (will fail convergence but won't crash)
668684
CUDA_CHECK(cudaMemcpy(linsys_data->d_xyzbuff2, linsys_data->d_xyzbuff1, n * sizeof(QOCOFloat), cudaMemcpyDeviceToDevice));
685+
// Copy to x if provided
686+
if (x) {
687+
cudaPointerAttributes attrs;
688+
cudaError_t err = cudaPointerGetAttributes(&attrs, x);
689+
if (err == cudaSuccess && attrs.type == cudaMemoryTypeDevice) {
690+
CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat),
691+
cudaMemcpyDeviceToDevice));
692+
} else {
693+
CUDA_CHECK(cudaMemcpy(x, linsys_data->d_xyzbuff2, n * sizeof(QOCOFloat),
694+
cudaMemcpyDeviceToHost));
695+
}
696+
}
669697
#endif
670698

671699
// During solve phase, solution stays on device in d_xyzbuff2 (and work->xyz->d_data)

src/kkt.c

Lines changed: 156 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -218,28 +218,177 @@ void initialize_ipm(QOCOSolver* solver)
218218
rhs, xyz,
219219
solver->settings->iter_ref_iters);
220220

221-
// Copy x part of solution to x.
221+
// Copy x part of solution to x (GPU-to-GPU copy).
222+
// During solve phase, get_data_vectorf returns device pointers, so copy_arrayf
223+
// will detect both xyz and work->x are on device and use a CUDA kernel.
222224
copy_arrayf(xyz, get_data_vectorf(solver->work->x), solver->work->data->n);
223225

224-
// Copy y part of solution to y.
226+
// Copy y part of solution to y (GPU-to-GPU copy).
225227
copy_arrayf(&xyz[solver->work->data->n], get_data_vectorf(solver->work->y),
226228
solver->work->data->p);
227229

228-
// Copy z part of solution to z.
230+
// Copy z part of solution to z (GPU-to-GPU copy).
229231
copy_arrayf(&xyz[solver->work->data->n + solver->work->data->p],
230232
get_data_vectorf(solver->work->z), solver->work->data->m);
231233

232-
// Copy and negate z part of solution to s.
234+
// Copy and negate z part of solution to s (GPU-to-GPU copy).
233235
copy_and_negate_arrayf(
234236
&xyz[solver->work->data->n + solver->work->data->p],
235237
get_data_vectorf(solver->work->s), solver->work->data->m);
236238

237-
// Note: No sync needed here - get_data_vectorf will return device pointer
238-
// during solve phase, so vectors are already on device
239-
240239
// Bring s and z to cone C.
241240
bring2cone(get_data_vectorf(solver->work->s), solver->work->data);
242241
bring2cone(get_data_vectorf(solver->work->z), solver->work->data);
242+
243+
// DEBUG: Print xyz and work->x, work->y, work->z, work->s to verify copies
244+
fprintf(stderr, "DEBUG initialize_ipm: Starting debug output\n");
245+
fflush(stderr);
246+
247+
#ifdef QOCO_ALGEBRA_BACKEND_CUDA
248+
fprintf(stderr, "DEBUG initialize_ipm: CUDA backend detected\n");
249+
fflush(stderr);
250+
{
251+
QOCOInt n = solver->work->data->n;
252+
QOCOInt p = solver->work->data->p;
253+
QOCOInt m = solver->work->data->m;
254+
QOCOInt total = n + p + m;
255+
256+
fprintf(stderr, "DEBUG initialize_ipm: n=%d, p=%d, m=%d, total=%d\n", n, p, m, total);
257+
fflush(stderr);
258+
259+
// Allocate host buffers
260+
QOCOFloat* xyz_host = (QOCOFloat*)malloc(total * sizeof(QOCOFloat));
261+
QOCOFloat* x_host = (QOCOFloat*)malloc(n * sizeof(QOCOFloat));
262+
QOCOFloat* y_host = (QOCOFloat*)malloc(p * sizeof(QOCOFloat));
263+
QOCOFloat* z_host = (QOCOFloat*)malloc(m * sizeof(QOCOFloat));
264+
QOCOFloat* s_host = (QOCOFloat*)malloc(m * sizeof(QOCOFloat));
265+
266+
fprintf(stderr, "DEBUG initialize_ipm: Allocated buffers: xyz_host=%p, x_host=%p, y_host=%p, z_host=%p, s_host=%p\n",
267+
(void*)xyz_host, (void*)x_host, (void*)y_host, (void*)z_host, (void*)s_host);
268+
fflush(stderr);
269+
270+
if (xyz_host && x_host && y_host && z_host && s_host) {
271+
// Copy from device to host
272+
extern void* cudaMemcpy(void* dst, const void* src, size_t count, int kind);
273+
#define cudaMemcpyDeviceToHost 2
274+
275+
fprintf(stderr, "DEBUG initialize_ipm: Copying from device to host...\n");
276+
fflush(stderr);
277+
278+
cudaMemcpy(xyz_host, xyz, total * sizeof(QOCOFloat), cudaMemcpyDeviceToHost);
279+
cudaMemcpy(x_host, get_data_vectorf(solver->work->x), n * sizeof(QOCOFloat), cudaMemcpyDeviceToHost);
280+
cudaMemcpy(y_host, get_data_vectorf(solver->work->y), p * sizeof(QOCOFloat), cudaMemcpyDeviceToHost);
281+
cudaMemcpy(z_host, get_data_vectorf(solver->work->z), m * sizeof(QOCOFloat), cudaMemcpyDeviceToHost);
282+
cudaMemcpy(s_host, get_data_vectorf(solver->work->s), m * sizeof(QOCOFloat), cudaMemcpyDeviceToHost);
283+
284+
fprintf(stderr, "DEBUG initialize_ipm: xyz = {");
285+
for (QOCOInt i = 0; i < total; ++i) {
286+
fprintf(stderr, "%.17g", xyz_host[i]);
287+
if (i != total - 1) fprintf(stderr, ", ");
288+
}
289+
fprintf(stderr, "}\n");
290+
fflush(stderr);
291+
292+
fprintf(stderr, "DEBUG initialize_ipm: work->x = {");
293+
for (QOCOInt i = 0; i < n; ++i) {
294+
fprintf(stderr, "%.17g", x_host[i]);
295+
if (i != n - 1) fprintf(stderr, ", ");
296+
}
297+
fprintf(stderr, "}\n");
298+
fflush(stderr);
299+
300+
fprintf(stderr, "DEBUG initialize_ipm: work->y = {");
301+
for (QOCOInt i = 0; i < p; ++i) {
302+
fprintf(stderr, "%.17g", y_host[i]);
303+
if (i != p - 1) fprintf(stderr, ", ");
304+
}
305+
fprintf(stderr, "}\n");
306+
fflush(stderr);
307+
308+
fprintf(stderr, "DEBUG initialize_ipm: work->z = {");
309+
for (QOCOInt i = 0; i < m; ++i) {
310+
fprintf(stderr, "%.17g", z_host[i]);
311+
if (i != m - 1) fprintf(stderr, ", ");
312+
}
313+
fprintf(stderr, "}\n");
314+
fflush(stderr);
315+
316+
fprintf(stderr, "DEBUG initialize_ipm: work->s = {");
317+
for (QOCOInt i = 0; i < m; ++i) {
318+
fprintf(stderr, "%.17g", s_host[i]);
319+
if (i != m - 1) fprintf(stderr, ", ");
320+
}
321+
fprintf(stderr, "}\n");
322+
fflush(stderr);
323+
324+
free(xyz_host);
325+
free(x_host);
326+
free(y_host);
327+
free(z_host);
328+
free(s_host);
329+
} else {
330+
fprintf(stderr, "DEBUG initialize_ipm: ERROR - malloc failed!\n");
331+
fflush(stderr);
332+
}
333+
}
334+
#else
335+
fprintf(stderr, "DEBUG initialize_ipm: Builtin backend (not CUDA)\n");
336+
fflush(stderr);
337+
{
338+
QOCOInt n = solver->work->data->n;
339+
QOCOInt p = solver->work->data->p;
340+
QOCOInt m = solver->work->data->m;
341+
QOCOInt total = n + p + m;
342+
QOCOFloat* xyz_ptr = xyz;
343+
QOCOFloat* x_ptr = get_data_vectorf(solver->work->x);
344+
QOCOFloat* y_ptr = get_data_vectorf(solver->work->y);
345+
QOCOFloat* z_ptr = get_data_vectorf(solver->work->z);
346+
QOCOFloat* s_ptr = get_data_vectorf(solver->work->s);
347+
348+
fprintf(stderr, "DEBUG initialize_ipm: xyz = {");
349+
for (QOCOInt i = 0; i < total; ++i) {
350+
fprintf(stderr, "%.17g", xyz_ptr[i]);
351+
if (i != total - 1) fprintf(stderr, ", ");
352+
}
353+
fprintf(stderr, "}\n");
354+
fflush(stderr);
355+
356+
fprintf(stderr, "DEBUG initialize_ipm: work->x = {");
357+
for (QOCOInt i = 0; i < n; ++i) {
358+
fprintf(stderr, "%.17g", x_ptr[i]);
359+
if (i != n - 1) fprintf(stderr, ", ");
360+
}
361+
fprintf(stderr, "}\n");
362+
fflush(stderr);
363+
364+
fprintf(stderr, "DEBUG initialize_ipm: work->y = {");
365+
for (QOCOInt i = 0; i < p; ++i) {
366+
fprintf(stderr, "%.17g", y_ptr[i]);
367+
if (i != p - 1) fprintf(stderr, ", ");
368+
}
369+
fprintf(stderr, "}\n");
370+
fflush(stderr);
371+
372+
fprintf(stderr, "DEBUG initialize_ipm: work->z = {");
373+
for (QOCOInt i = 0; i < m; ++i) {
374+
fprintf(stderr, "%.17g", z_ptr[i]);
375+
if (i != m - 1) fprintf(stderr, ", ");
376+
}
377+
fprintf(stderr, "}\n");
378+
fflush(stderr);
379+
380+
fprintf(stderr, "DEBUG initialize_ipm: work->s = {");
381+
for (QOCOInt i = 0; i < m; ++i) {
382+
fprintf(stderr, "%.17g", s_ptr[i]);
383+
if (i != m - 1) fprintf(stderr, ", ");
384+
}
385+
fprintf(stderr, "}\n");
386+
fflush(stderr);
387+
}
388+
#endif
389+
390+
fprintf(stderr, "DEBUG initialize_ipm: Finished debug output\n");
391+
fflush(stderr);
243392
}
244393

245394
void compute_kkt_residual(QOCOProblemData* data, QOCOFloat* x, QOCOFloat* y,

src/qoco_api.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ void qoco_set_csc(QOCOCscMatrix* A, QOCOInt m, QOCOInt n, QOCOInt Annz,
197197

198198
void set_default_settings(QOCOSettings* settings)
199199
{
200-
settings->max_iters = 1;
200+
settings->max_iters = 200;
201201
settings->bisect_iters = 5;
202202
settings->ruiz_iters = 0;
203203
settings->iter_ref_iters = 1;

0 commit comments

Comments
 (0)