From cccf2f746781f7eab446b44b74a2ae780433962a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?chant=28=CF=88=CE=B1=CE=BB=CE=BC=CF=8C=CF=82=29?= <126305902+theonlychant@users.noreply.github.com> Date: Sat, 18 Apr 2026 21:46:31 -0500 Subject: [PATCH 1/2] cython-sample::HPC --- Python/cython/Makefile | 11 ++++ Python/cython/README.md | 61 ++++++++++++++++++++ Python/cython/compute.pyx | 58 +++++++++++++++++++ Python/cython/driver.py | 111 ++++++++++++++++++++++++++++++++++++ Python/cython/hpc_kernels.c | 46 +++++++++++++++ Python/cython/hpc_kernels.h | 30 ++++++++++ Python/cython/setup.py | 16 ++++++ 7 files changed, 333 insertions(+) create mode 100644 Python/cython/Makefile create mode 100644 Python/cython/README.md create mode 100644 Python/cython/compute.pyx create mode 100644 Python/cython/driver.py create mode 100644 Python/cython/hpc_kernels.c create mode 100644 Python/cython/hpc_kernels.h create mode 100644 Python/cython/setup.py diff --git a/Python/cython/Makefile b/Python/cython/Makefile new file mode 100644 index 00000000..618641b4 --- /dev/null +++ b/Python/cython/Makefile @@ -0,0 +1,11 @@ +.PHONY: build clean test + +build: + python setup.py build_ext --inplace + +clean: + rm -f compute.c hpc_kernels.o compute*.so compute*.pyd + rm -rf build *.egg-info + +test: build + python driver.py diff --git a/Python/cython/README.md b/Python/cython/README.md new file mode 100644 index 00000000..ceb6ebaa --- /dev/null +++ b/Python/cython/README.md @@ -0,0 +1,61 @@ +# Cython Shared Library — HPCTrainingExamples Kernels + +This example wraps **actual C computation kernels from the HPCTrainingExamples repo** into a Python-callable shared library (`.so` / `.pyd`) using Cython. 
+ +## Wrapped Kernels + +| Function | Original Source | Description | +|---|---|---| +| `py_cpu_func(inp)` | `ManagedMemory/CPU_Code/cpu_code.c` | Doubles every element: `out[i] = in[i] * 2.0` | +| `py_saxpy(a, x, y)` | `Pragma_Examples/OpenMP/C/1_saxpy` | SAXPY: `y = a*x + y` | +| `py_vecadd(a, b)` | `Pragma_Examples/OpenMP/C/3_vecadd` | Vector addition: `c = a + b` | +| `py_reduction(x)` | `Pragma_Examples/OpenMP/C/2_reduction` | Sum-reduction of an array | + +The core loops in [hpc_kernels.c](hpc_kernels.c) are extracted directly from the original repo sources (with `main()` and OpenMP timing scaffolding removed). The Cython wrapper in [compute.pyx](compute.pyx) calls into these C functions and handles NumPy array ↔ C pointer conversion. + +## Prerequisites + +```bash +pip install cython numpy +``` + +## Build + +```bash +# Option 1 – Makefile +make build + +# Option 2 – setup.py directly +python setup.py build_ext --inplace +``` + +This compiles `hpc_kernels.c` + `compute.pyx` into a platform-tagged extension module, `compute.<abi-tag>.so` (e.g. `compute.cpython-312-x86_64-linux-gnu.so`; `.pyd` on Windows). + +## Run + +```bash +make test +# or +python driver.py +``` + +The driver benchmarks each Cython-wrapped kernel against its NumPy equivalent and validates correctness using the same expected values as the original C programs. + +## Clean + +```bash +make clean +``` + +## File Layout + +``` +Python/cython/ +├── hpc_kernels.h C declarations for the kernels +├── hpc_kernels.c C kernel implementations (from repo) +├── compute.pyx Cython wrapper module +├── setup.py Build script (setuptools + Cython) +├── driver.py Benchmark / validation driver +├── Makefile Build automation +└── README.md +``` diff --git a/Python/cython/compute.pyx b/Python/cython/compute.pyx new file mode 100644 index 00000000..5a118ade --- /dev/null +++ b/Python/cython/compute.pyx @@ -0,0 +1,58 @@ +# cython: boundscheck=False, wraparound=False, cdivision=True +""" +Cython wrappers around C kernels from HPCTrainingExamples. 
+ +Wraps the following repo examples as a shared library: + - cpu_func (ManagedMemory/CPU_Code/cpu_code.c) + - saxpy (Pragma_Examples/OpenMP/C/1_saxpy) + - vecadd (Pragma_Examples/OpenMP/C/3_vecadd) + - reduction (Pragma_Examples/OpenMP/C/2_reduction) +""" + +import numpy as np +cimport numpy as np + +ctypedef np.float64_t DOUBLE_t +ctypedef np.float32_t FLOAT_t + +# ── C declarations (from hpc_kernels.h) ────────────────────────── +cdef extern from "hpc_kernels.h": + void cpu_func(double *inp, double *out, int M) + void saxpy(float a, float *x, float *y, int N) + void vecadd(double *a, double *b, double *c, int N) + double reduction(double *x, int n) + + +# ── Python-visible wrappers ────────────────────────────────────── + +def py_cpu_func(np.ndarray[DOUBLE_t, ndim=1] inp): + """Double every element (ManagedMemory/CPU_Code/cpu_code.c cpu_func).""" + cdef int M = inp.shape[0] + cdef np.ndarray[DOUBLE_t, ndim=1] out = np.empty(M, dtype=np.float64) + cpu_func(&inp[0], &out[0], M) + return out + + +def py_saxpy(float a, + np.ndarray[FLOAT_t, ndim=1] x, + np.ndarray[FLOAT_t, ndim=1] y): + """y = a*x + y (Pragma_Examples/OpenMP/C/1_saxpy saxpy).""" + cdef int N = x.shape[0] + cdef np.ndarray[FLOAT_t, ndim=1] y_out = y.copy() + saxpy(a, &x[0], &y_out[0], N) + return y_out + + +def py_vecadd(np.ndarray[DOUBLE_t, ndim=1] a, + np.ndarray[DOUBLE_t, ndim=1] b): + """c = a + b (Pragma_Examples/OpenMP/C/3_vecadd vecadd).""" + cdef int N = a.shape[0] + cdef np.ndarray[DOUBLE_t, ndim=1] c = np.empty(N, dtype=np.float64) + vecadd(&a[0], &b[0], &c[0], N) + return c + + +def py_reduction(np.ndarray[DOUBLE_t, ndim=1] x): + """Sum all elements (Pragma_Examples/OpenMP/C/2_reduction reduction).""" + cdef int n = x.shape[0] + return reduction(&x[0], n) diff --git a/Python/cython/driver.py b/Python/cython/driver.py new file mode 100644 index 00000000..207b4d8f --- /dev/null +++ b/Python/cython/driver.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +""" +Driver for the Cython-wrapped HPC 
Training Examples kernels. + +Exercises every kernel from the shared library and validates +results against NumPy equivalents, matching the behaviour of +the original C programs in the repo. +""" + +import time +import numpy as np + +try: + import compute +except ImportError: + raise ImportError( + "Could not import the 'compute' shared library.\n" + "Build it first: python setup.py build_ext --inplace" + ) + + +def bench(label, func, *args, repeats=50): + """Time a function and return its result.""" + result = func(*args) # warm-up + t0 = time.perf_counter() + for _ in range(repeats): + func(*args) + elapsed = (time.perf_counter() - t0) / repeats + print(f" {label:40s} {elapsed*1e6:10.1f} us") + return result + + +def test_cpu_func(): + """ManagedMemory/CPU_Code/cpu_code.c — doubles every element.""" + M = 100_000 + inp = np.ones(M, dtype=np.float64) + + out = bench("Cython cpu_func", compute.py_cpu_func, inp) + + # The original C program expects sum(out) == 200000 + assert np.allclose(out, inp * 2.0), "cpu_func mismatch!" + total = out.sum() + print(f" Result is {total:.6f} (expected {M * 2.0:.6f})") + + +def test_saxpy(): + """Pragma_Examples/OpenMP/C/1_saxpy — y = a*x + y.""" + N = 1_000_000 + a = np.float32(2.0) + x = np.ones(N, dtype=np.float32) + y = np.full(N, 2.0, dtype=np.float32) + + y_out = bench("Cython saxpy", compute.py_saxpy, a, x, y) + y_ref = bench("NumPy a*x + y", lambda: a * x + y) + + # Original program expects y[0] == 4.0, y[N-1] == 4.0 + assert np.allclose(y_out, a * x + y), "saxpy mismatch!" 
+ print(f" y[0] {y_out[0]:.6f} y[N-1] {y_out[-1]:.6f} (expected 4.0)") + + +def test_vecadd(): + """Pragma_Examples/OpenMP/C/3_vecadd — c = a + b.""" + N = 100_000 + a = np.array([np.sin(i+1)**2 for i in range(N)], dtype=np.float64) + b = np.array([np.cos(i+1)**2 for i in range(N)], dtype=np.float64) + + c_cy = bench("Cython vecadd", compute.py_vecadd, a, b) + c_np = bench("NumPy a + b", np.add, a, b) + + assert np.allclose(c_cy, c_np), "vecadd mismatch!" + # Original expects mean(c) ≈ 1.0 (sin²+cos²=1) + avg = c_cy.mean() + print(f" Final result: {avg:.6f} (expected ≈1.0)") + + +def test_reduction(): + """Pragma_Examples/OpenMP/C/2_reduction — sum of array.""" + n = 100_000 + x = np.full(n, 2.0, dtype=np.float64) + + s_cy = bench("Cython reduction", compute.py_reduction, x) + s_np = bench("NumPy np.sum", np.sum, x) + + # Original expects sum == 200000 + assert abs(s_cy - s_np) < 1e-6, "reduction mismatch!" + print(f" Sum={s_cy:.6f} (expected {n * 2.0:.6f})") + + +def main(): + print("=" * 62) + print("HPCTrainingExamples — Cython Shared Library Tests") + print("=" * 62) + + print(f"\n--- cpu_func (ManagedMemory/CPU_Code) ---") + test_cpu_func() + + print(f"\n--- saxpy (Pragma_Examples/OpenMP/C/1_saxpy) ---") + test_saxpy() + + print(f"\n--- vecadd (Pragma_Examples/OpenMP/C/3_vecadd) ---") + test_vecadd() + + print(f"\n--- reduction (Pragma_Examples/OpenMP/C/2_reduction) ---") + test_reduction() + + print(f"\n{'=' * 62}") + print("All tests passed.") + + +if __name__ == "__main__": + main() diff --git a/Python/cython/hpc_kernels.c b/Python/cython/hpc_kernels.c new file mode 100644 index 00000000..083a06b7 --- /dev/null +++ b/Python/cython/hpc_kernels.c @@ -0,0 +1,46 @@ +/* + * hpc_kernels.c + * + * CPU computation kernels extracted from HPCTrainingExamples. + * Each function mirrors the core loop from the original source + * (with the main() and OpenMP/timing scaffolding stripped out so + * the pure kernel can be called from Python via Cython). 
+ * + * Original sources: + * cpu_func – ManagedMemory/CPU_Code/cpu_code.c + * saxpy – Pragma_Examples/OpenMP/C/1_saxpy/0_saxpy_portyourself/saxpy.c + * vecadd – Pragma_Examples/OpenMP/C/3_vecadd/0_vecadd_portyourself/vecadd.c + * reduction – Pragma_Examples/OpenMP/C/2_reduction/0_reduction_portyourself/reduction.c + */ + +#include "hpc_kernels.h" + +/* ── cpu_func (ManagedMemory/CPU_Code/cpu_code.c) ──────────────── */ +void cpu_func(double *in, double *out, int M) { + for (int i = 0; i < M; i++) { + out[i] = in[i] * 2.0; + } +} + +/* ── saxpy (Pragma_Examples/OpenMP/C/1_saxpy) ──────────────────── */ +void saxpy(float a, float *x, float *y, int N) { + for (int i = 0; i < N; i++) { + y[i] = a * x[i] + y[i]; + } +} + +/* ── vecadd (Pragma_Examples/OpenMP/C/3_vecadd) ────────────────── */ +void vecadd(double *a, double *b, double *c, int N) { + for (int i = 0; i < N; i++) { + c[i] = a[i] + b[i]; + } +} + +/* ── reduction (Pragma_Examples/OpenMP/C/2_reduction) ──────────── */ +double reduction(double *x, int n) { + double sum = 0.0; + for (int i = 0; i < n; i++) { + sum = sum + x[i]; + } + return sum; +} diff --git a/Python/cython/hpc_kernels.h b/Python/cython/hpc_kernels.h new file mode 100644 index 00000000..35f56088 --- /dev/null +++ b/Python/cython/hpc_kernels.h @@ -0,0 +1,30 @@ +/* + * hpc_kernels.h + * + * Declarations for CPU computation kernels extracted from + * HPCTrainingExamples so they can be compiled into a shared + * library via Cython. 
+ * + * Sources: + * cpu_func – ManagedMemory/CPU_Code/cpu_code.c + * saxpy – Pragma_Examples/OpenMP/C/1_saxpy/0_saxpy_portyourself/saxpy.c + * vecadd – Pragma_Examples/OpenMP/C/3_vecadd/0_vecadd_portyourself/vecadd.c + * reduction – Pragma_Examples/OpenMP/C/2_reduction/0_reduction_portyourself/reduction.c + */ + +#ifndef HPC_KERNELS_H +#define HPC_KERNELS_H + +/* Double every element: out[i] = in[i] * 2.0 */ +void cpu_func(double *in, double *out, int M); + +/* SAXPY: y[i] = a * x[i] + y[i] */ +void saxpy(float a, float *x, float *y, int N); + +/* Vector add: c[i] = a[i] + b[i] */ +void vecadd(double *a, double *b, double *c, int N); + +/* Sum-reduction: returns sum of x[0..n-1] */ +double reduction(double *x, int n); + +#endif /* HPC_KERNELS_H */ diff --git a/Python/cython/setup.py b/Python/cython/setup.py new file mode 100644 index 00000000..c7abdf84 --- /dev/null +++ b/Python/cython/setup.py @@ -0,0 +1,16 @@ +from setuptools import setup, Extension +from Cython.Build import cythonize +import numpy as np + +extensions = [ + Extension( + "compute", + sources=["compute.pyx", "hpc_kernels.c"], + include_dirs=[np.get_include(), "."], + ) +] + +setup( + name="hpc-cython-kernels", + ext_modules=cythonize(extensions, compiler_directives={"language_level": "3"}), +) From 2c2bab555dee8edbd87b6155acf586d07c47ded7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?chant=28=CF=88=CE=B1=CE=BB=CE=BC=CF=8C=CF=82=29?= <126305902+theonlychant@users.noreply.github.com> Date: Tue, 21 Apr 2026 03:34:11 -0500 Subject: [PATCH 2/2] added::validation + contiguity/dtype wrappers to prevent crashes and ensure correct dtypes. 
--- Python/cython/README.md | 6 +++--- Python/cython/compute.pyx | 42 +++++++++++++++++++++++++++---------- Python/cython/hpc_kernels.c | 13 +++++------- Python/cython/hpc_kernels.h | 2 -- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/Python/cython/README.md b/Python/cython/README.md index ceb6ebaa..d6e0d612 100644 --- a/Python/cython/README.md +++ b/Python/cython/README.md @@ -1,4 +1,4 @@ -# Cython Shared Library — HPCTrainingExamples Kernels +# Cython Shared Library - HPCTrainingExamples Kernels This example wraps **actual C computation kernels from the HPCTrainingExamples repo** into a Python-callable shared library (`.so` / `.pyd`) using Cython. @@ -22,10 +22,10 @@ pip install cython numpy ## Build ```bash -# Option 1 – Makefile +# Option 1 - Makefile make build -# Option 2 – setup.py directly +# Option 2 - setup.py directly python setup.py build_ext --inplace ``` diff --git a/Python/cython/compute.pyx b/Python/cython/compute.pyx index 5a118ade..3df3a2f5 100644 --- a/Python/cython/compute.pyx +++ b/Python/cython/compute.pyx @@ -15,7 +15,7 @@ cimport numpy as np ctypedef np.float64_t DOUBLE_t ctypedef np.float32_t FLOAT_t -# ── C declarations (from hpc_kernels.h) ────────────────────────── + cdef extern from "hpc_kernels.h": void cpu_func(double *inp, double *out, int M) void saxpy(float a, float *x, float *y, int N) @@ -23,13 +23,17 @@ cdef extern from "hpc_kernels.h": double reduction(double *x, int n) -# ── Python-visible wrappers ────────────────────────────────────── +# Python wrappers around the c kernels. 
def py_cpu_func(np.ndarray[DOUBLE_t, ndim=1] inp): """Double every element (ManagedMemory/CPU_Code/cpu_code.c cpu_func).""" - cdef int M = inp.shape[0] + # Typed C-contiguous buffer: &inp_c[0] requires a typed array (not a Python object) with unit stride + cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] inp_c = np.ascontiguousarray(inp, dtype=np.float64) + cdef int M = inp_c.shape[0] + if M == 0: + return np.empty(0, dtype=np.float64) cdef np.ndarray[DOUBLE_t, ndim=1] out = np.empty(M, dtype=np.float64) - cpu_func(&inp[0], &out[0], M) + cpu_func(&inp_c[0], &out[0], M) return out @@ -37,22 +41,38 @@ def py_saxpy(float a, np.ndarray[FLOAT_t, ndim=1] x, np.ndarray[FLOAT_t, ndim=1] y): """y = a*x + y (Pragma_Examples/OpenMP/C/1_saxpy saxpy).""" - cdef int N = x.shape[0] - cdef np.ndarray[FLOAT_t, ndim=1] y_out = y.copy() - saxpy(a, &x[0], &y_out[0], N) + # Typed C-contiguous x_c so &x_c[0] compiles and is unit-stride; x and y lengths must match + cdef np.ndarray[FLOAT_t, ndim=1, mode="c"] x_c = np.ascontiguousarray(x, dtype=np.float32) + y_c = np.ascontiguousarray(y, dtype=np.float32) + cdef int N = x_c.shape[0] + if y_c.shape[0] != N: + raise ValueError("x and y must have the same length") + if N == 0: + return y_c.copy() + cdef np.ndarray[FLOAT_t, ndim=1] y_out = y_c.copy() + saxpy(a, &x_c[0], &y_out[0], N) return y_out def py_vecadd(np.ndarray[DOUBLE_t, ndim=1] a, np.ndarray[DOUBLE_t, ndim=1] b): """c = a + b (Pragma_Examples/OpenMP/C/3_vecadd vecadd).""" - cdef int N = a.shape[0] + cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] a_c = np.ascontiguousarray(a, dtype=np.float64) + cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] b_c = np.ascontiguousarray(b, dtype=np.float64) + cdef int N = a_c.shape[0] + if b_c.shape[0] != N: + raise ValueError("a and b must have the same length") + if N == 0: + return np.empty(0, dtype=np.float64) cdef np.ndarray[DOUBLE_t, ndim=1] c = np.empty(N, dtype=np.float64) - vecadd(&a[0], &b[0], &c[0], N) + vecadd(&a_c[0], &b_c[0], &c[0], N) return c def py_reduction(np.ndarray[DOUBLE_t, ndim=1] x): """Sum all elements (Pragma_Examples/OpenMP/C/2_reduction reduction).""" - cdef int n = x.shape[0] - return reduction(&x[0], n) + cdef np.ndarray[DOUBLE_t, ndim=1, mode="c"] x_c = np.ascontiguousarray(x, dtype=np.float64) + cdef int n = 
x_c.shape[0] + if n == 0: + return 0.0 + return reduction(&x_c[0], n) diff --git a/Python/cython/hpc_kernels.c b/Python/cython/hpc_kernels.c index 083a06b7..0a5bd376 100644 --- a/Python/cython/hpc_kernels.c +++ b/Python/cython/hpc_kernels.c @@ -7,36 +7,33 @@ * the pure kernel can be called from Python via Cython). * * Original sources: - * cpu_func – ManagedMemory/CPU_Code/cpu_code.c - * saxpy – Pragma_Examples/OpenMP/C/1_saxpy/0_saxpy_portyourself/saxpy.c - * vecadd – Pragma_Examples/OpenMP/C/3_vecadd/0_vecadd_portyourself/vecadd.c - * reduction – Pragma_Examples/OpenMP/C/2_reduction/0_reduction_portyourself/reduction.c + * cpu_func - ManagedMemory/CPU_Code/cpu_code.c + * saxpy - Pragma_Examples/OpenMP/C/1_saxpy/0_saxpy_portyourself/saxpy.c + * vecadd - Pragma_Examples/OpenMP/C/3_vecadd/0_vecadd_portyourself/vecadd.c + * reduction - Pragma_Examples/OpenMP/C/2_reduction/0_reduction_portyourself/reduction.c */ #include "hpc_kernels.h" -/* ── cpu_func (ManagedMemory/CPU_Code/cpu_code.c) ──────────────── */ +/* cpu_func (ManagedMemory/CPU_Code/cpu_code.c) */ void cpu_func(double *in, double *out, int M) { for (int i = 0; i < M; i++) { out[i] = in[i] * 2.0; } } -/* ── saxpy (Pragma_Examples/OpenMP/C/1_saxpy) ──────────────────── */ void saxpy(float a, float *x, float *y, int N) { for (int i = 0; i < N; i++) { y[i] = a * x[i] + y[i]; } } -/* ── vecadd (Pragma_Examples/OpenMP/C/3_vecadd) ────────────────── */ void vecadd(double *a, double *b, double *c, int N) { for (int i = 0; i < N; i++) { c[i] = a[i] + b[i]; } } -/* ── reduction (Pragma_Examples/OpenMP/C/2_reduction) ──────────── */ double reduction(double *x, int n) { double sum = 0.0; for (int i = 0; i < n; i++) { diff --git a/Python/cython/hpc_kernels.h b/Python/cython/hpc_kernels.h index 35f56088..c7859514 100644 --- a/Python/cython/hpc_kernels.h +++ b/Python/cython/hpc_kernels.h @@ -21,10 +21,8 @@ void cpu_func(double *in, double *out, int M); /* SAXPY: y[i] = a * x[i] + y[i] */ void saxpy(float a, float *x, 
float *y, int N); -/* Vector add: c[i] = a[i] + b[i] */ void vecadd(double *a, double *b, double *c, int N); -/* Sum-reduction: returns sum of x[0..n-1] */ double reduction(double *x, int n); #endif /* HPC_KERNELS_H */