|
| 1 | +#include <Python.h> |
1 | 2 | #include <torch/script.h> |
2 | 3 | #include <torch/torch.h> |
3 | 4 |
|
4 | 5 | #include <algorithm> |
5 | 6 | #include <utility> |
6 | 7 | #include <vector> |
7 | 8 |
|
| 9 | +extern "C" { |
| 10 | +/* Creates a dummy empty _C module that can be imported from Python. |
| 11 | + The import from Python will load the .so associated with this extension |
| 12 | + built from this file, so that all the TORCH_LIBRARY calls below are run.*/ |
| 13 | +PyObject *PyInit__C(void) { |
| 14 | + static struct PyModuleDef module_def = { |
| 15 | + PyModuleDef_HEAD_INIT, |
| 16 | + "_C", /* name of module */ |
| 17 | + NULL, /* module documentation, may be NULL */ |
| 18 | + -1, /* size of per-interpreter state of the module, |
| 19 | + or -1 if the module keeps state in global variables. */ |
| 20 | + NULL, /* methods */ |
| 21 | + }; |
| 22 | + return PyModule_Create(&module_def); |
| 23 | +} |
| 24 | +} |
| 25 | + |
8 | 26 | template <typename scalar_t> |
9 | 27 | void scan_cpu(const at::Tensor &input, const at::Tensor &weights, |
10 | 28 | const at::Tensor &initials, const at::Tensor &output) { |
@@ -34,10 +52,11 @@ void scan_cpu(const at::Tensor &input, const at::Tensor &weights, |
34 | 52 |
|
35 | 53 | std::pair<scalar_t, scalar_t> buffer[total_size]; |
36 | 54 |
|
37 | | - const scalar_t *input_ptr = input_contiguous.data_ptr<scalar_t>(); |
38 | | - const scalar_t *initials_ptr = initials_contiguous.data_ptr<scalar_t>(); |
39 | | - const scalar_t *weights_ptr = weights_contiguous.data_ptr<scalar_t>(); |
40 | | - scalar_t *output_ptr = output.data_ptr<scalar_t>(); |
| 55 | + const scalar_t *input_ptr = input_contiguous.const_data_ptr<scalar_t>(); |
| 56 | + const scalar_t *initials_ptr = |
| 57 | + initials_contiguous.const_data_ptr<scalar_t>(); |
| 58 | + const scalar_t *weights_ptr = weights_contiguous.const_data_ptr<scalar_t>(); |
| 59 | + scalar_t *output_ptr = output.mutable_data_ptr<scalar_t>(); |
41 | 60 |
|
42 | 61 | std::transform(weights_ptr, weights_ptr + total_size, input_ptr, buffer, |
43 | 62 | [](const scalar_t &a, const scalar_t &b) { |
@@ -84,8 +103,8 @@ void lpc_cpu_core(const torch::Tensor &a, const torch::Tensor &padded_out) { |
84 | 103 |
|
85 | 104 | auto a_contiguous = a.contiguous(); |
86 | 105 |
|
87 | | - const scalar_t *a_ptr = a_contiguous.data_ptr<scalar_t>(); |
88 | | - scalar_t *out_ptr = padded_out.data_ptr<scalar_t>(); |
| 106 | + const scalar_t *a_ptr = a_contiguous.const_data_ptr<scalar_t>(); |
| 107 | +  scalar_t *out_ptr = padded_out.mutable_data_ptr<scalar_t>(); |
89 | 108 |
|
90 | 109 | at::parallel_for(0, B, 1, [&](int64_t start, int64_t end) { |
91 | 110 | for (auto b = start; b < end; b++) { |
|
0 commit comments