oneAPI backend simulation support.

haoyanwa · haoyanwa · commit c3077151e369 · 2025-02-24T16:28:29.000-08:00
diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py
@@ -170,11 +170,12 @@ def definition_cpp(self, name_suffix='', as_reference=False):
         else:
             return f'{self.type.name} {self.name}{name_suffix}'
 
-    def declare_cpp(self, pipe_min_size=0, indent=''):
+    # Updated pipe min size to be 32 for simulation.
+    def declare_cpp(self, pipe_min_size=32, indent=''):
         # Updated to use streaming beat for restartable streaming kernel.
         # Streaming beat is a wrapper type of the actual type with sideband control signals.
         # Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat<DataT, eop, empty>;
-        streaming_beat_t = f"{self.type.name}BeatT"
+        streaming_beat_t = f"{self.pipe_name}BeatT"
         lines = (
             f"{indent}class {self.pipe_id};\n"
             f"{indent}using {streaming_beat_t} = "
diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt
@@ -38,12 +38,13 @@ set(LIBRARY_NAME myproject-${LIB_STAMP})
 # You can also specify a device family (E.g. "Arria10" or "Stratix10") or a
 # specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP.
 if(NOT DEFINED FPGA_DEVICE)
-    set(FPGA_DEVICE "Arria10")
+    set(FPGA_DEVICE "Agilex7")
 endif()
 
 # Use cmake -DUSER_FPGA_FLAGS=<flags> to set extra flags for FPGA backend
 # compilation.
-set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS})
+# -Xsoptimize=latency turns off the hyper-optimized handshake.
+set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency)
 
 # Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation.
 set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS})
diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h
@@ -37,7 +37,7 @@ struct DMA_convert_data {
     sycl::ext::oneapi::experimental::annotated_arg<src_T *, 
       decltype(sycl::ext::oneapi::experimental::properties{
           sycl::ext::intel::experimental::latency<0>,
-          sycl::ext::intel::experimental::dwidth<8>,
+          sycl::ext::intel::experimental::dwidth<16>,
           sycl::ext::intel::experimental::buffer_location<kInputBufferLocation>,
           sycl::ext::intel::experimental::read_write_mode_read,
           sycl::ext::intel::experimental::wait_request_requested})>
@@ -91,7 +91,7 @@ struct DMA_convert_data_back {
     sycl::ext::oneapi::experimental::annotated_arg<dst_T *, 
       decltype(sycl::ext::oneapi::experimental::properties{
           sycl::ext::intel::experimental::latency<0>,
-          sycl::ext::intel::experimental::dwidth<8>,
+          sycl::ext::intel::experimental::dwidth<16>,
           sycl::ext::intel::experimental::buffer_location<kOutputBufferLocation>,
           sycl::ext::intel::experimental::read_write_mode_write,
           sycl::ext::intel::experimental::wait_request_requested})>
diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp
@@ -88,7 +88,7 @@ int main(int argc, char **argv) {
 #define NUM_ITERATIONS 100
     auto selector = sycl::ext::intel::fpga_selector_v;
 #else // #if FPGA_EMULATOR
-#define NUM_ITERATIONS 100
+#define NUM_ITERATIONS 10
     auto selector = sycl::ext::intel::fpga_emulator_selector_v;
 #endif
 
@@ -124,83 +124,90 @@ int main(int argc, char **argv) {
 
     // hls-fpga-machine-learning insert runtime contant
 
+    try {
 #if defined(IS_BSP)
-    // Allocate host memory if BSP is in use.
-    float *vals = sycl::malloc_host<float>(kInputSz, q);
-    if (vals == nullptr) {
-        std::cerr << "ERROR: host allocation failed for input\n";
-        fout.close();
-        return 1;
-    }
-    float *outputs = sycl::malloc_host<float>(kOutputSz, q);
-    if (output == nullptr) {
-        std::cerr << "ERROR: host allocation failed for output\n";
-        fout.close();
-        return 1;
-    }    
+        // Allocate host memory if BSP is in use.
+        float *vals = sycl::malloc_host<float>(kInputSz, q);
+        if (vals == nullptr) {
+            std::cerr << "ERROR: host allocation failed for input\n";
+            fout.close();
+            return 1;
+        }
+        float *outputs = sycl::malloc_host<float>(kOutputSz, q);
+        if (outputs == nullptr) {
+            std::cerr << "ERROR: host allocation failed for output\n";
+            fout.close();
+            return 1;
+        }    
 #else
-    float *vals = new float[kInputSz];
-    float *outputs = new float[kOutputSz];
+        float *vals = sycl::malloc_shared<float>(kInputSz, q, sycl::property_list{buffer_location(nnet::kInputBufferLocation)});
+        float *outputs = sycl::malloc_shared<float>(kOutputSz, q, sycl::property_list{buffer_location(nnet::kOutputBufferLocation)});
 #endif
 
-    if (file_valid) {
-        // Start always-run streaming kernel here, instead of inside a loop.
-        q.single_task(MyProject{});
+        if (file_valid) {
+            // Start always-run streaming kernel here, instead of inside a loop.
+            q.single_task(MyProject{});
 
-        // hls-fpga-machine-learning insert data
+            // hls-fpga-machine-learning insert data
 
-        // hls-fpga-machine-learning convert output
+            // hls-fpga-machine-learning convert output
 
-        // Print output from kernel and from prediction file.
-        for (int i = 0; i < num_iterations; i++) {
-            for (int j = 0; j < kOutLayerSize; j++) {
-                fout << outputs[i * kOutLayerSize + j] << " ";
-            }
-            fout << std::endl;
-            if (i % CHECKPOINT == 0) {
-                std::cout << "Predictions" << std::endl;
-                // hls-fpga-machine-learning insert predictions
-                for (auto predval : predictions[i]) {
-                    std::cout << predval << " ";
+            // Print output from kernel and from prediction file.
+            for (int i = 0; i < num_iterations; i++) {
+                for (int j = 0; j < kOutLayerSize; j++) {
+                    fout << outputs[i * kOutLayerSize + j] << " ";
                 }
-                std::cout << std::endl;
-                std::cout << "Quantized predictions" << std::endl;
-                // hls-fpga-machine-learning insert quantized
+                fout << std::endl;
+                if (i % CHECKPOINT == 0) {
+                    std::cout << "Predictions" << std::endl;
+                    // hls-fpga-machine-learning insert predictions
+                    for (auto predval : predictions[i]) {
+                        std::cout << predval << " ";
+                    }
+                    std::cout << std::endl;
+                    std::cout << "Quantized predictions" << std::endl;
+                    // hls-fpga-machine-learning insert quantized
+                    for (int j = 0; j < kOutLayerSize; j++) {
+                        std::cout << outputs[i * kOutLayerSize + j] << " ";
+                    }
+                    std::cout << std::endl;
+                }
+            }
+        } else {
+            std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations
+                    << " invocations." << std::endl;
+            q.single_task(MyProject{});
+            // hls-fpga-machine-learning insert top-level-function
+            // hls-fpga-machine-learning insert zero
+            // hls-fpga-machine-learning convert output
+            for (int i = 0; i < num_iterations; i++) {
                 for (int j = 0; j < kOutLayerSize; j++) {
                     std::cout << outputs[i * kOutLayerSize + j] << " ";
+                    fout << outputs[i * kOutLayerSize + j] << " ";
                 }
                 std::cout << std::endl;
+                fout << std::endl;
             }
         }
-    } else {
-        std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations
-                  << " invocations." << std::endl;
-
-        // hls-fpga-machine-learning insert top-level-function
-
-        // hls-fpga-machine-learning insert zero
-        q.single_task(MyProject{});
-        // hls-fpga-machine-learning convert output
-        for (int i = 0; i < num_iterations; i++) {
-            for (int j = 0; j < kOutLayerSize; j++) {
-                std::cout << outputs[i * kOutLayerSize + j] << " ";
-                fout << outputs[i * kOutLayerSize + j] << " ";
-            }
-            std::cout << std::endl;
-            fout << std::endl;
+        sycl::free(vals, q);
+        sycl::free(outputs, q);
+        fout.close();
+        std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl;
+    } catch (sycl::exception const &e) {
+        // Catches exceptions in the host code.
+        std::cerr << "Caught a SYCL host exception:\n"
+                  << e.what() << "\n";
+
+        // Most likely the runtime couldn't find FPGA hardware!
+        if (e.code().value() == CL_DEVICE_NOT_FOUND)
+        {
+            std::cerr << "If you are targeting an FPGA, please ensure that your "
+                         "system has a correctly configured FPGA board.\n";
+            std::cerr << "Run sys_check in the oneAPI root directory to verify.\n";
+            std::cerr << "If you are targeting the FPGA emulator, compile with "
+                         "-DFPGA_EMULATOR.\n";
         }
+        std::terminate();
     }
-
-    // Free up resources.
-#if defined(IS_BSP)
-    free(vals);
-    free(outputs);
-#else
-    delete[] vals;
-    delete[] outputs;
-#endif
-    fout.close();
-    std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl;
-
     return 0;
 }
diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
@@ -205,6 +205,8 @@ def write_project_cpp(self, model):
                             newline += indent * 2 + f'typename nnet::ExtractPipeType<{out.pipe_name}>::value_type {out_beat};\n'
                             newline += indent * 2 + f'{out_beat}.data = {out.name};\n'
                             newline += indent * 2 + f'{out.pipe_name}::write({out_beat});\n'
+                        newline += indent * 2 + '// stops the kernel when the last input seen.\n'
+                        newline += indent * 2 + f'keep_going = !{model_inputs[0].name}_beat.eop;\n'
                         newline += f"{indent}}}\n"
                     # don't need to add anything in io_stream