diff --git a/Python/cython/Makefile b/Python/cython/Makefile
new file mode 100644
index 00000000..618641b4
--- /dev/null
+++ b/Python/cython/Makefile
@@ -0,0 +1,11 @@
+.PHONY: build clean test
+
+build:
+	python setup.py build_ext --inplace
+
+clean:
+	rm -f compute.c hpc_kernels.o compute*.so compute*.pyd
+	rm -rf build *.egg-info
+
+test: build
+	python driver.py
diff --git a/Python/cython/README.md b/Python/cython/README.md
new file mode 100644
index 00000000..d6e0d612
--- /dev/null
+++ b/Python/cython/README.md
@@ -0,0 +1,61 @@
+# Cython Shared Library - HPCTrainingExamples Kernels
+
+This example wraps **actual C computation kernels from the HPCTrainingExamples repo** into a Python-callable shared library (`.so` / `.pyd`) using Cython.
+
+## Wrapped Kernels
+
+| Function | Original Source | Description |
+|---|---|---|
+| `py_cpu_func(inp)` | `ManagedMemory/CPU_Code/cpu_code.c` | Doubles every element: `out[i] = in[i] * 2.0` |
+| `py_saxpy(a, x, y)` | `Pragma_Examples/OpenMP/C/1_saxpy` | SAXPY: `y = a*x + y` |
+| `py_vecadd(a, b)` | `Pragma_Examples/OpenMP/C/3_vecadd` | Vector addition: `c = a + b` |
+| `py_reduction(x)` | `Pragma_Examples/OpenMP/C/2_reduction` | Sum-reduction of an array |
+
+The core loops in [hpc_kernels.c](hpc_kernels.c) are extracted directly from the original repo sources (with `main()` and OpenMP timing scaffolding removed). The Cython wrapper in [compute.pyx](compute.pyx) calls into these C functions and handles NumPy array ↔ C pointer conversion.
+
+## Prerequisites
+
+```bash
+pip install cython numpy
+```
+
+## Build
+
+```bash
+# Option 1 - Makefile
+make build
+
+# Option 2 - setup.py directly
+python setup.py build_ext --inplace
+```
+
+This compiles `hpc_kernels.c` + `compute.pyx` into the extension module `compute.*.so` (`compute.*.pyd` on Windows); the `*` stands for the interpreter/platform tag chosen by the build.
+
+## Run
+
+```bash
+make test
+# or
+python driver.py
+```
+
+The driver benchmarks each Cython-wrapped kernel against its NumPy equivalent and validates correctness using the same expected values as the original C programs.
+
+## Clean
+
+```bash
+make clean
+```
+
+## File Layout
+
+```
+Python/cython/
+├── hpc_kernels.h    C declarations for the kernels
+├── hpc_kernels.c    C kernel implementations (from repo)
+├── compute.pyx      Cython wrapper module
+├── setup.py         Build script (setuptools + Cython)
+├── driver.py        Benchmark / validation driver
+├── Makefile         Build automation
+└── README.md
+```
diff --git a/Python/cython/compute.pyx b/Python/cython/compute.pyx
new file mode 100644
index 00000000..3df3a2f5
--- /dev/null
+++ b/Python/cython/compute.pyx
@@ -0,0 +1,120 @@
+# cython: boundscheck=False, wraparound=False, cdivision=True
+"""
+Cython wrappers around C kernels from HPCTrainingExamples.
+
+Wraps the following repo examples as a shared library:
+  - cpu_func (ManagedMemory/CPU_Code/cpu_code.c)
+  - saxpy (Pragma_Examples/OpenMP/C/1_saxpy)
+  - vecadd (Pragma_Examples/OpenMP/C/3_vecadd)
+  - reduction (Pragma_Examples/OpenMP/C/2_reduction)
+"""
+
+import numpy as np
+cimport numpy as np
+
+ctypedef np.float64_t DOUBLE_t
+ctypedef np.float32_t FLOAT_t
+
+
+cdef extern from "hpc_kernels.h":
+    void cpu_func(double *inp, double *out, int M)
+    void saxpy(float a, float *x, float *y, int N)
+    void vecadd(double *a, double *b, double *c, int N)
+    double reduction(double *x, int n)
+
+
+# Python wrappers around the C kernels.
+#
+# Each wrapper follows the same three steps:
+#   1. np.ascontiguousarray() guarantees C-contiguous memory (copying only when
+#      the input is not already contiguous); the typed signature already fixes
+#      dtype and ndim.
+#   2. The length is read from the untyped array object, so the Python int ->
+#      C int conversion raises OverflowError rather than silently truncating.
+#   3. The array is bound to a typed, mode="c" buffer variable, because
+#      &buf[0] is only a C pointer when buf is a typed buffer.
+
+def py_cpu_func(np.ndarray[DOUBLE_t, ndim=1] inp):
+    """Double every element (ManagedMemory/CPU_Code/cpu_code.c cpu_func).
+
+    Returns a new float64 array; ``inp`` is left unmodified.
+    """
+    cdef int M
+    cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] inp_buf
+    cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] out
+
+    inp_c = np.ascontiguousarray(inp, dtype=np.float64)
+    M = inp_c.shape[0]
+    if M == 0:
+        return np.empty(0, dtype=np.float64)
+    inp_buf = inp_c
+    out = np.empty(M, dtype=np.float64)
+    cpu_func(&inp_buf[0], &out[0], M)
+    return out
+
+
+def py_saxpy(float a,
+             np.ndarray[FLOAT_t, ndim=1] x,
+             np.ndarray[FLOAT_t, ndim=1] y):
+    """y = a*x + y (Pragma_Examples/OpenMP/C/1_saxpy saxpy).
+
+    Returns the result as a new float32 array; ``x`` and ``y`` are left
+    unmodified. Raises ValueError if ``x`` and ``y`` differ in length.
+    """
+    cdef int N
+    cdef np.ndarray[FLOAT_t, ndim=1, mode="c"] x_buf
+    cdef np.ndarray[FLOAT_t, ndim=1, mode="c"] y_out
+
+    x_c = np.ascontiguousarray(x, dtype=np.float32)
+    y_c = np.ascontiguousarray(y, dtype=np.float32)
+    N = x_c.shape[0]
+    if y_c.shape[0] != N:
+        raise ValueError("x and y must have the same length")
+    # saxpy() overwrites its y argument, so hand it a private copy.
+    y_out = y_c.copy()
+    if N == 0:
+        return y_out
+    x_buf = x_c
+    saxpy(a, &x_buf[0], &y_out[0], N)
+    return y_out
+
+
+def py_vecadd(np.ndarray[DOUBLE_t, ndim=1] a,
+              np.ndarray[DOUBLE_t, ndim=1] b):
+    """c = a + b (Pragma_Examples/OpenMP/C/3_vecadd vecadd).
+
+    Returns a new float64 array. Raises ValueError if ``a`` and ``b``
+    differ in length.
+    """
+    cdef int N
+    cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] a_buf
+    cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] b_buf
+    cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] c
+
+    a_c = np.ascontiguousarray(a, dtype=np.float64)
+    b_c = np.ascontiguousarray(b, dtype=np.float64)
+    N = a_c.shape[0]
+    if b_c.shape[0] != N:
+        raise ValueError("a and b must have the same length")
+    if N == 0:
+        return np.empty(0, dtype=np.float64)
+    a_buf = a_c
+    b_buf = b_c
+    c = np.empty(N, dtype=np.float64)
+    vecadd(&a_buf[0], &b_buf[0], &c[0], N)
+    return c
+
+
+def py_reduction(np.ndarray[DOUBLE_t, ndim=1] x):
+    """Sum all elements (Pragma_Examples/OpenMP/C/2_reduction reduction).
+
+    Returns the sum as a Python float; an empty array sums to 0.0.
+    """
+    cdef int n
+    cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] x_buf
+
+    x_c = np.ascontiguousarray(x, dtype=np.float64)
+    n = x_c.shape[0]
+    if n == 0:
+        return 0.0
+    x_buf = x_c
+    return reduction(&x_buf[0], n)
diff --git a/Python/cython/driver.py b/Python/cython/driver.py
new file mode 100644
index 00000000..207b4d8f
--- /dev/null
+++ b/Python/cython/driver.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+Driver for the Cython-wrapped HPC Training Examples kernels.
+
+Exercises every kernel from the shared library and validates
+results against NumPy equivalents, matching the behaviour of
+the original C programs in the repo.
+"""
+
+import time
+import numpy as np
+
+try:
+    import compute
+except ImportError as exc:
+    raise ImportError(
+        "Could not import the 'compute' shared library.\n"
+        "Build it first: python setup.py build_ext --inplace"
+    ) from exc
+
+
+def bench(label, func, *args, repeats=50):
+    """Print the mean time of ``repeats`` calls to ``func(*args)``; return the warm-up call's result."""
+    result = func(*args)  # warm-up
+    t0 = time.perf_counter()
+    for _ in range(repeats):
+        func(*args)
+    elapsed = (time.perf_counter() - t0) / repeats
+    print(f" {label:40s} {elapsed*1e6:10.1f} us")
+    return result
+
+
+def test_cpu_func():
+    """ManagedMemory/CPU_Code/cpu_code.c — doubles every element."""
+    M = 100_000
+    inp = np.ones(M, dtype=np.float64)
+
+    out = bench("Cython cpu_func", compute.py_cpu_func, inp)
+    ref = bench("NumPy inp * 2.0", np.multiply, inp, 2.0)
+
+    # The original C program expects sum(out) == 200000; np.testing still checks under -O
+    np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-8, err_msg="cpu_func mismatch!")
+    print(f" Result is {out.sum():.6f} (expected {M * 2.0:.6f})")
+
+
+def test_saxpy():
+    """Pragma_Examples/OpenMP/C/1_saxpy — y = a*x + y."""
+    N = 1_000_000
+    a = np.float32(2.0)
+    x = np.ones(N, dtype=np.float32)
+    y = np.full(N, 2.0, dtype=np.float32)
+
+    y_out = bench("Cython saxpy", compute.py_saxpy, a, x, y)
+    y_ref = bench("NumPy a*x + y", lambda: a * x + y)
+
+    # Original program expects y[0] == 4.0, y[N-1] == 4.0
+    np.testing.assert_allclose(y_out, y_ref, rtol=1e-5, atol=1e-8, err_msg="saxpy mismatch!")
+    print(f" y[0] {y_out[0]:.6f} y[N-1] {y_out[-1]:.6f} (expected 4.0)")
+
+
+def test_vecadd():
+    """Pragma_Examples/OpenMP/C/3_vecadd — c = a + b."""
+    N = 100_000
+    a = np.sin(np.arange(1, N + 1, dtype=np.float64)) ** 2  # vectorised: no per-element Python loop
+    b = np.cos(np.arange(1, N + 1, dtype=np.float64)) ** 2
+
+    c_cy = bench("Cython vecadd", compute.py_vecadd, a, b)
+    c_np = bench("NumPy a + b", np.add, a, b)
+
+    np.testing.assert_allclose(c_cy, c_np, rtol=1e-5, atol=1e-8, err_msg="vecadd mismatch!")
+    # Original expects mean(c) ≈ 1.0 (sin²+cos²=1)
+    avg = c_cy.mean()
+    print(f" Final result: {avg:.6f} (expected ≈1.0)")
+
+
+def test_reduction():
+    """Pragma_Examples/OpenMP/C/2_reduction — sum of array."""
+    n = 100_000
+    x = np.full(n, 2.0, dtype=np.float64)
+
+    s_cy = bench("Cython reduction", compute.py_reduction, x)
+    s_np = bench("NumPy np.sum", np.sum, x)
+
+    # Original expects sum == 200000
+    np.testing.assert_allclose(s_cy, s_np, rtol=0, atol=1e-6, err_msg="reduction mismatch!")
+    print(f" Sum={s_cy:.6f} (expected {n * 2.0:.6f})")
+
+
+def main():
+    """Run every kernel test in order, printing a banner before each one."""
+    rule = "=" * 62
+    print(rule)
+    print("HPCTrainingExamples — Cython Shared Library Tests")
+    print(rule)
+
+    sections = (
+        ("cpu_func (ManagedMemory/CPU_Code)", test_cpu_func),
+        ("saxpy (Pragma_Examples/OpenMP/C/1_saxpy)", test_saxpy),
+        ("vecadd (Pragma_Examples/OpenMP/C/3_vecadd)", test_vecadd),
+        ("reduction (Pragma_Examples/OpenMP/C/2_reduction)", test_reduction),
+    )
+    for title, run_test in sections:
+        print(f"\n--- {title} ---")
+        run_test()
+
+    print(f"\n{rule}")
+    print("All tests passed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Python/cython/hpc_kernels.c b/Python/cython/hpc_kernels.c
new file mode 100644
index 00000000..0a5bd376
--- /dev/null
+++ b/Python/cython/hpc_kernels.c
@@ -0,0 +1,43 @@
+/*
+ * hpc_kernels.c
+ *
+ * CPU computation kernels extracted from HPCTrainingExamples.
+ * Each function mirrors the core loop from the original source
+ * (with the main() and OpenMP/timing scaffolding stripped out so
+ * the pure kernel can be called from Python via Cython).
+ *
+ * Original sources:
+ *   cpu_func  - ManagedMemory/CPU_Code/cpu_code.c
+ *   saxpy     - Pragma_Examples/OpenMP/C/1_saxpy/0_saxpy_portyourself/saxpy.c
+ *   vecadd    - Pragma_Examples/OpenMP/C/3_vecadd/0_vecadd_portyourself/vecadd.c
+ *   reduction - Pragma_Examples/OpenMP/C/2_reduction/0_reduction_portyourself/reduction.c
+ */
+
+#include "hpc_kernels.h"
+
+/* cpu_func (ManagedMemory/CPU_Code/cpu_code.c) */
+void cpu_func(const double *in, double *out, int M) {
+    for (int i = 0; i < M; i++) {
+        out[i] = in[i] * 2.0;
+    }
+}
+
+void saxpy(float a, const float *x, float *y, int N) {
+    for (int i = 0; i < N; i++) {
+        y[i] = a * x[i] + y[i];
+    }
+}
+
+void vecadd(const double *a, const double *b, double *c, int N) {
+    for (int i = 0; i < N; i++) {
+        c[i] = a[i] + b[i];
+    }
+}
+
+double reduction(const double *x, int n) {
+    double sum = 0.0;
+    for (int i = 0; i < n; i++) {
+        sum = sum + x[i];
+    }
+    return sum;
+}
diff --git a/Python/cython/hpc_kernels.h b/Python/cython/hpc_kernels.h
new file mode 100644
index 00000000..c7859514
--- /dev/null
+++ b/Python/cython/hpc_kernels.h
@@ -0,0 +1,30 @@
+/*
+ * hpc_kernels.h
+ *
+ * Declarations for CPU computation kernels extracted from
+ * HPCTrainingExamples so they can be compiled into a shared
+ * library via Cython.
+ *
+ * Sources:
+ *   cpu_func  – ManagedMemory/CPU_Code/cpu_code.c
+ *   saxpy     – Pragma_Examples/OpenMP/C/1_saxpy/0_saxpy_portyourself/saxpy.c
+ *   vecadd    – Pragma_Examples/OpenMP/C/3_vecadd/0_vecadd_portyourself/vecadd.c
+ *   reduction – Pragma_Examples/OpenMP/C/2_reduction/0_reduction_portyourself/reduction.c
+ */
+
+#ifndef HPC_KERNELS_H
+#define HPC_KERNELS_H
+
+/* Double every element: out[i] = in[i] * 2.0 for i in [0, M) */
+void cpu_func(const double *in, double *out, int M);
+
+/* SAXPY: y[i] = a * x[i] + y[i] for i in [0, N); y is updated in place */
+void saxpy(float a, const float *x, float *y, int N);
+
+/* Vector addition: c[i] = a[i] + b[i] for i in [0, N) */
+void vecadd(const double *a, const double *b, double *c, int N);
+
+/* Sum-reduction: returns x[0] + ... + x[n-1] (0.0 when n <= 0) */
+double reduction(const double *x, int n);
+
+#endif /* HPC_KERNELS_H */
diff --git a/Python/cython/setup.py b/Python/cython/setup.py
new file mode 100644
index 00000000..c7abdf84
--- /dev/null
+++ b/Python/cython/setup.py
@@ -0,0 +1,18 @@
+from setuptools import setup, Extension
+from Cython.Build import cythonize
+import numpy as np
+
+extensions = [
+    Extension(
+        "compute",
+        # hpc_kernels.c is listed so the C kernels are built into the same extension module.
+        sources=["compute.pyx", "hpc_kernels.c"],
+        # NumPy headers for `cimport numpy`; "." presumably so hpc_kernels.h is found.
+        include_dirs=[np.get_include(), "."],
+    )
+]
+
+setup(
+    name="hpc-cython-kernels",
+    ext_modules=cythonize(extensions, compiler_directives={"language_level": "3"}),
+)