@@ -218,28 +218,177 @@ void initialize_ipm(QOCOSolver* solver)
218218 rhs , xyz ,
219219 solver -> settings -> iter_ref_iters );
220220
221- // Copy x part of solution to x.
221+ // Copy x part of solution to x (GPU-to-GPU copy).
222+ // During solve phase, get_data_vectorf returns device pointers, so copy_arrayf
223+ // will detect both xyz and work->x are on device and use a CUDA kernel.
222224 copy_arrayf (xyz , get_data_vectorf (solver -> work -> x ), solver -> work -> data -> n );
223225
224- // Copy y part of solution to y.
226+ // Copy y part of solution to y (GPU-to-GPU copy) .
225227 copy_arrayf (& xyz [solver -> work -> data -> n ], get_data_vectorf (solver -> work -> y ),
226228 solver -> work -> data -> p );
227229
228- // Copy z part of solution to z.
230+ // Copy z part of solution to z (GPU-to-GPU copy) .
229231 copy_arrayf (& xyz [solver -> work -> data -> n + solver -> work -> data -> p ],
230232 get_data_vectorf (solver -> work -> z ), solver -> work -> data -> m );
231233
232- // Copy and negate z part of solution to s.
234+ // Copy and negate z part of solution to s (GPU-to-GPU copy) .
233235 copy_and_negate_arrayf (
234236 & xyz [solver -> work -> data -> n + solver -> work -> data -> p ],
235237 get_data_vectorf (solver -> work -> s ), solver -> work -> data -> m );
236238
237- // Note: No sync needed here - get_data_vectorf will return device pointer
238- // during solve phase, so vectors are already on device
239-
240239 // Bring s and z to cone C.
241240 bring2cone (get_data_vectorf (solver -> work -> s ), solver -> work -> data );
242241 bring2cone (get_data_vectorf (solver -> work -> z ), solver -> work -> data );
242+
243+ // DEBUG: Print xyz and work->x, work->y, work->z, work->s to verify copies
244+ fprintf (stderr , "DEBUG initialize_ipm: Starting debug output\n" );
245+ fflush (stderr );
246+
247+ #ifdef QOCO_ALGEBRA_BACKEND_CUDA
248+ fprintf (stderr , "DEBUG initialize_ipm: CUDA backend detected\n" );
249+ fflush (stderr );
250+ {
251+ QOCOInt n = solver -> work -> data -> n ;
252+ QOCOInt p = solver -> work -> data -> p ;
253+ QOCOInt m = solver -> work -> data -> m ;
254+ QOCOInt total = n + p + m ;
255+
256+ fprintf (stderr , "DEBUG initialize_ipm: n=%d, p=%d, m=%d, total=%d\n" , n , p , m , total );
257+ fflush (stderr );
258+
259+ // Allocate host buffers
260+ QOCOFloat * xyz_host = (QOCOFloat * )malloc (total * sizeof (QOCOFloat ));
261+ QOCOFloat * x_host = (QOCOFloat * )malloc (n * sizeof (QOCOFloat ));
262+ QOCOFloat * y_host = (QOCOFloat * )malloc (p * sizeof (QOCOFloat ));
263+ QOCOFloat * z_host = (QOCOFloat * )malloc (m * sizeof (QOCOFloat ));
264+ QOCOFloat * s_host = (QOCOFloat * )malloc (m * sizeof (QOCOFloat ));
265+
266+ fprintf (stderr , "DEBUG initialize_ipm: Allocated buffers: xyz_host=%p, x_host=%p, y_host=%p, z_host=%p, s_host=%p\n" ,
267+ (void * )xyz_host , (void * )x_host , (void * )y_host , (void * )z_host , (void * )s_host );
268+ fflush (stderr );
269+
270+ if (xyz_host && x_host && y_host && z_host && s_host ) {
271+ // Copy from device to host
272+ extern void * cudaMemcpy (void * dst , const void * src , size_t count , int kind );
273+ #define cudaMemcpyDeviceToHost 2
274+
275+ fprintf (stderr , "DEBUG initialize_ipm: Copying from device to host...\n" );
276+ fflush (stderr );
277+
278+ cudaMemcpy (xyz_host , xyz , total * sizeof (QOCOFloat ), cudaMemcpyDeviceToHost );
279+ cudaMemcpy (x_host , get_data_vectorf (solver -> work -> x ), n * sizeof (QOCOFloat ), cudaMemcpyDeviceToHost );
280+ cudaMemcpy (y_host , get_data_vectorf (solver -> work -> y ), p * sizeof (QOCOFloat ), cudaMemcpyDeviceToHost );
281+ cudaMemcpy (z_host , get_data_vectorf (solver -> work -> z ), m * sizeof (QOCOFloat ), cudaMemcpyDeviceToHost );
282+ cudaMemcpy (s_host , get_data_vectorf (solver -> work -> s ), m * sizeof (QOCOFloat ), cudaMemcpyDeviceToHost );
283+
284+ fprintf (stderr , "DEBUG initialize_ipm: xyz = {" );
285+ for (QOCOInt i = 0 ; i < total ; ++ i ) {
286+ fprintf (stderr , "%.17g" , xyz_host [i ]);
287+ if (i != total - 1 ) fprintf (stderr , ", " );
288+ }
289+ fprintf (stderr , "}\n" );
290+ fflush (stderr );
291+
292+ fprintf (stderr , "DEBUG initialize_ipm: work->x = {" );
293+ for (QOCOInt i = 0 ; i < n ; ++ i ) {
294+ fprintf (stderr , "%.17g" , x_host [i ]);
295+ if (i != n - 1 ) fprintf (stderr , ", " );
296+ }
297+ fprintf (stderr , "}\n" );
298+ fflush (stderr );
299+
300+ fprintf (stderr , "DEBUG initialize_ipm: work->y = {" );
301+ for (QOCOInt i = 0 ; i < p ; ++ i ) {
302+ fprintf (stderr , "%.17g" , y_host [i ]);
303+ if (i != p - 1 ) fprintf (stderr , ", " );
304+ }
305+ fprintf (stderr , "}\n" );
306+ fflush (stderr );
307+
308+ fprintf (stderr , "DEBUG initialize_ipm: work->z = {" );
309+ for (QOCOInt i = 0 ; i < m ; ++ i ) {
310+ fprintf (stderr , "%.17g" , z_host [i ]);
311+ if (i != m - 1 ) fprintf (stderr , ", " );
312+ }
313+ fprintf (stderr , "}\n" );
314+ fflush (stderr );
315+
316+ fprintf (stderr , "DEBUG initialize_ipm: work->s = {" );
317+ for (QOCOInt i = 0 ; i < m ; ++ i ) {
318+ fprintf (stderr , "%.17g" , s_host [i ]);
319+ if (i != m - 1 ) fprintf (stderr , ", " );
320+ }
321+ fprintf (stderr , "}\n" );
322+ fflush (stderr );
323+
324+ free (xyz_host );
325+ free (x_host );
326+ free (y_host );
327+ free (z_host );
328+ free (s_host );
329+ } else {
330+ fprintf (stderr , "DEBUG initialize_ipm: ERROR - malloc failed!\n" );
331+ fflush (stderr );
332+ }
333+ }
334+ #else
335+ fprintf (stderr , "DEBUG initialize_ipm: Builtin backend (not CUDA)\n" );
336+ fflush (stderr );
337+ {
338+ QOCOInt n = solver -> work -> data -> n ;
339+ QOCOInt p = solver -> work -> data -> p ;
340+ QOCOInt m = solver -> work -> data -> m ;
341+ QOCOInt total = n + p + m ;
342+ QOCOFloat * xyz_ptr = xyz ;
343+ QOCOFloat * x_ptr = get_data_vectorf (solver -> work -> x );
344+ QOCOFloat * y_ptr = get_data_vectorf (solver -> work -> y );
345+ QOCOFloat * z_ptr = get_data_vectorf (solver -> work -> z );
346+ QOCOFloat * s_ptr = get_data_vectorf (solver -> work -> s );
347+
348+ fprintf (stderr , "DEBUG initialize_ipm: xyz = {" );
349+ for (QOCOInt i = 0 ; i < total ; ++ i ) {
350+ fprintf (stderr , "%.17g" , xyz_ptr [i ]);
351+ if (i != total - 1 ) fprintf (stderr , ", " );
352+ }
353+ fprintf (stderr , "}\n" );
354+ fflush (stderr );
355+
356+ fprintf (stderr , "DEBUG initialize_ipm: work->x = {" );
357+ for (QOCOInt i = 0 ; i < n ; ++ i ) {
358+ fprintf (stderr , "%.17g" , x_ptr [i ]);
359+ if (i != n - 1 ) fprintf (stderr , ", " );
360+ }
361+ fprintf (stderr , "}\n" );
362+ fflush (stderr );
363+
364+ fprintf (stderr , "DEBUG initialize_ipm: work->y = {" );
365+ for (QOCOInt i = 0 ; i < p ; ++ i ) {
366+ fprintf (stderr , "%.17g" , y_ptr [i ]);
367+ if (i != p - 1 ) fprintf (stderr , ", " );
368+ }
369+ fprintf (stderr , "}\n" );
370+ fflush (stderr );
371+
372+ fprintf (stderr , "DEBUG initialize_ipm: work->z = {" );
373+ for (QOCOInt i = 0 ; i < m ; ++ i ) {
374+ fprintf (stderr , "%.17g" , z_ptr [i ]);
375+ if (i != m - 1 ) fprintf (stderr , ", " );
376+ }
377+ fprintf (stderr , "}\n" );
378+ fflush (stderr );
379+
380+ fprintf (stderr , "DEBUG initialize_ipm: work->s = {" );
381+ for (QOCOInt i = 0 ; i < m ; ++ i ) {
382+ fprintf (stderr , "%.17g" , s_ptr [i ]);
383+ if (i != m - 1 ) fprintf (stderr , ", " );
384+ }
385+ fprintf (stderr , "}\n" );
386+ fflush (stderr );
387+ }
388+ #endif
389+
390+ fprintf (stderr , "DEBUG initialize_ipm: Finished debug output\n" );
391+ fflush (stderr );
243392}
244393
245394void compute_kkt_residual (QOCOProblemData * data , QOCOFloat * x , QOCOFloat * y ,
0 commit comments