Introduction
Pyvorin Edge allows you to drop below the Python layer and execute hand-written C kernels on the edge device. This is useful when the compiler cannot auto-vectorize a hot path, or when you need explicit control over SIMD registers on ARM64.
In this article you will:
- Understand the ABIContract class from module_host/abi.py.
- Write a simple C kernel that adds two float arrays.
- Rewrite it with ARM NEON intrinsics (vaddq_f32, vld1q_f32, vst1q_f32).
- Build a .so with GCC and load it through ModuleLoader.
- Register the kernel with CompilerBridge so the pipeline can call it.
The ABI Contract
Every shared object that Pyvorin Edge loads must declare an ABIContract. This dataclass lives in edge_runtime/pyv_edge_agent/module_host/abi.py and tells the runtime:
- Which symbol to call (function_name)
- The ctypes signature (arg_types and return_type)
- The calling convention (always "cdecl" on Linux)
from pyv_edge_agent.module_host.abi import ABIContract
import ctypes
# All three buffer arguments of the kernel are `float *`; name the ctypes
# pointer type once instead of repeating it per argument.
float_ptr = ctypes.POINTER(ctypes.c_float)

# Contract for the C symbol `vec_add_f32(a, b, out, n)`.
abi = ABIContract(
    function_name="vec_add_f32",
    arg_types=[
        float_ptr,        # a
        float_ptr,        # b
        float_ptr,        # out
        ctypes.c_size_t,  # n
    ],
    return_type=None,
    calling_convention="cdecl",
)
Writing a Simple C Kernel
Start with a scalar fallback kernel so you have a reference implementation to validate against.
Scalar Reference (vec_add_scalar.c)
#include <stddef.h>
/*
 * Scalar reference kernel: out[k] = a[k] + b[k] for every k in [0, n).
 *
 * All three buffers are declared restrict, so callers must not pass
 * overlapping arrays. Nothing is read or written when n is 0.
 */
void vec_add_f32(
    const float *restrict a,
    const float *restrict b,
    float *restrict out,
    size_t n
) {
    /* Walk the three buffers in lockstep, one element per step. */
    for (size_t remaining = n; remaining != 0; --remaining) {
        *out++ = *a++ + *b++;
    }
}
NEON-Optimized Version (vec_add_neon.c)
#include <arm_neon.h>
#include <stddef.h>
/*
 * NEON kernel: out[k] = a[k] + b[k] for every k in [0, n).
 *
 * The bulk of the array is handled 16 floats at a time with four 128-bit
 * vector adds; whatever is left over (fewer than 16 elements) falls through
 * to a scalar loop. The buffers are declared restrict, so callers must not
 * pass overlapping arrays.
 */
void vec_add_f32(
    const float *restrict a,
    const float *restrict b,
    float *restrict out,
    size_t n
) {
    /* Largest multiple of 16 not exceeding n: where the vector loop stops. */
    const size_t vec_end = n - (n % 16);
    size_t idx = 0;

    /* Main loop: four 128-bit vectors (16 floats) per iteration. */
    while (idx < vec_end) {
        /* Issue all eight loads before the first store. */
        const float32x4_t lhs0 = vld1q_f32(a + idx);
        const float32x4_t lhs1 = vld1q_f32(a + idx + 4);
        const float32x4_t lhs2 = vld1q_f32(a + idx + 8);
        const float32x4_t lhs3 = vld1q_f32(a + idx + 12);
        const float32x4_t rhs0 = vld1q_f32(b + idx);
        const float32x4_t rhs1 = vld1q_f32(b + idx + 4);
        const float32x4_t rhs2 = vld1q_f32(b + idx + 8);
        const float32x4_t rhs3 = vld1q_f32(b + idx + 12);

        vst1q_f32(out + idx,      vaddq_f32(lhs0, rhs0));
        vst1q_f32(out + idx + 4,  vaddq_f32(lhs1, rhs1));
        vst1q_f32(out + idx + 8,  vaddq_f32(lhs2, rhs2));
        vst1q_f32(out + idx + 12, vaddq_f32(lhs3, rhs3));

        idx += 16;
    }

    /* Scalar tail: the remaining n % 16 elements. */
    while (idx < n) {
        out[idx] = a[idx] + b[idx];
        idx++;
    }
}
Building the Shared Object
Compile for the target ARM64 edge device. The flags below enable position-independent code, maximum optimization, and the correct architecture level.
gcc -shared -fPIC -O3 -march=armv8-a+fp+simd \
-o libvecadd.so vec_add_neon.c
Verify the symbol is exported:
nm -D libvecadd.so | grep vec_add_f32
Loading and Testing with ModuleLoader
ModuleLoader in module_host/loader.py is thread-safe and caches symbol lookups after the first call.
import ctypes
import numpy as np
from pyv_edge_agent.module_host.abi import ABIContract
from pyv_edge_agent.module_host.loader import ModuleLoader
# Contract for the C symbol `vec_add_f32(a, b, out, n)`.
abi = ABIContract(
    function_name="vec_add_f32",
    arg_types=[
        ctypes.POINTER(ctypes.c_float),  # a
        ctypes.POINTER(ctypes.c_float),  # b
        ctypes.POINTER(ctypes.c_float),  # out
        ctypes.c_size_t,                 # n
    ],
    return_type=None,
)

loader = ModuleLoader()
loader.load("./libvecadd.so", abi)
try:
    # Prepare data
    n = 1024
    a = np.arange(n, dtype=np.float32)
    b = np.arange(n, dtype=np.float32) * 2.0
    out = np.empty(n, dtype=np.float32)

    # Get ctypes pointers. They alias the NumPy buffers, so a, b and out
    # must stay alive until the call has returned.
    pa = a.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    pb = b.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    po = out.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

    loader.call("vec_add_f32", pa, pb, po, n)
    np.testing.assert_allclose(out, a + b)
finally:
    # Release the shared object even when the call or the comparison raises;
    # otherwise a failing test would leave the module loaded.
    loader.unload()

print("Kernel test passed.")
Registering with CompilerBridge
For pipeline integration, register the kernel through CompilerBridge so it appears in the deployment manifest. The bridge also provides validate_compilation() to compare the native output against a Python reference.
from pyvorin_edge.compiler_bridge import CompilerBridge
def python_reference(a, b):
    """Return the element-wise sums of *a* and *b* as a list.

    Elements are paired with ``zip``, so the result is as long as the
    shorter of the two inputs.
    """
    return [x + y for x, y in zip(a, b)]
bridge = CompilerBridge()
so_path = "./libvecadd.so"

# If you already have a .so from hand-written C, generate its ABI
abi = bridge.generate_abi(so_path, "vec_add_f32")

# One (a, b) input pair per validation case.
cases = [
    (list(range(64)), list(range(64))),
    ([1.5] * 128, [2.5] * 128),
]

# Validate against Python reference using ModuleLoader under the hood
report = bridge.validate_compilation(
    so_path,
    python_reference,
    test_inputs=cases,
)
print(report)
# {'passed': 2, 'failed': 0, 'mismatches': []}
Complete Test Script
Copy-paste the script below to validate your kernel end-to-end.
#!/usr/bin/env python3
"""End-to-end test for a custom C kernel on Pyvorin Edge."""
import ctypes
import numpy as np
from pathlib import Path
from pyv_edge_agent.module_host.abi import ABIContract
from pyv_edge_agent.module_host.loader import ModuleLoader
from pyvorin_edge.compiler_bridge import CompilerBridge
# The shared object is looked up next to this script.
SO_PATH = Path(__file__).with_name("libvecadd.so")

# All three buffer arguments of the kernel are `float *`.
_FLOAT_PTR = ctypes.POINTER(ctypes.c_float)

# Contract for the C symbol `vec_add_f32(a, b, out, n)`.
abi = ABIContract(
    function_name="vec_add_f32",
    arg_types=[
        _FLOAT_PTR,       # a
        _FLOAT_PTR,       # b
        _FLOAT_PTR,       # out
        ctypes.c_size_t,  # n
    ],
    return_type=None,
)
def python_reference(a, b):
    """Return the element-wise sums of *a* and *b* as a list.

    Elements are paired with ``zip``, so the result is as long as the
    shorter of the two inputs.
    """
    return [x + y for x, y in zip(a, b)]
def main() -> None:
    """Validate the kernel through CompilerBridge, then call it directly.

    First asks the bridge to compare the shared object against
    ``python_reference``; then loads the library with ModuleLoader, runs the
    kernel on random float32 data and compares the result with NumPy.

    Raises:
        AssertionError: if the bridge reports a failed case, or if the
            kernel output differs from ``a + b``.
    """
    # Hand both APIs a plain string path, as the earlier examples do.
    so_path = str(SO_PATH)

    bridge = CompilerBridge()
    report = bridge.validate_compilation(
        so_path,
        python_reference,
        test_inputs=[
            (list(map(float, range(256))), list(map(float, range(256)))),
        ],
    )
    # Explicit raise rather than `assert`, so the check survives `python -O`.
    if report["failed"] != 0:
        raise AssertionError(report["mismatches"])

    loader = ModuleLoader()
    loader.load(so_path, abi)
    try:
        n = 4096
        a = np.random.rand(n).astype(np.float32)
        b = np.random.rand(n).astype(np.float32)
        out = np.empty(n, dtype=np.float32)

        float_ptr = ctypes.POINTER(ctypes.c_float)
        loader.call(
            "vec_add_f32",
            a.ctypes.data_as(float_ptr),
            b.ctypes.data_as(float_ptr),
            out.ctypes.data_as(float_ptr),
            n,
        )
        np.testing.assert_allclose(out, a + b, rtol=1e-6)
    finally:
        # Release the shared object even when the call or comparison raises.
        loader.unload()

    print("All checks passed.")


if __name__ == "__main__":
    main()
Summary
You now know how to write a C kernel, vectorize it with NEON, build a .so, describe its ABI with ABIContract, load it through ModuleLoader, and validate it with CompilerBridge. In the next article we cover the plugin architecture that lets you wire custom kernels into live pipelines.