KernelTuner
diff --git a/‎.github/workflows/cmake-action.yml‎
Lines changed: 47 additions & 0 deletions b/‎.github/workflows/cmake-action.yml‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎.github/workflows/cmake.yml‎
Lines changed: 28 additions & 0 deletions b/‎.github/workflows/cmake.yml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎combine.py‎
Lines changed: 20 additions & 1 deletion b/‎combine.py‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎docs/api.rst‎
Lines changed: 3 additions & 1 deletion b/‎docs/api.rst‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎docs/build_api.py‎
Lines changed: 35 additions & 30 deletions b/‎docs/build_api.py‎
Lines changed: 35 additions & 30 deletions
diff --git a/‎examples/vector_add/main.cu‎
Lines changed: 8 additions & 14 deletions b/‎examples/vector_add/main.cu‎
Lines changed: 8 additions & 14 deletions
diff --git a/‎include/kernel_float.h‎
Lines changed: 6 additions & 6 deletions b/‎include/kernel_float.h‎
Lines changed: 6 additions & 6 deletions
@@ -0,0 +1,47 @@
+name: CMake
+
+on:
+  workflow_call:
+    inputs:
+      cuda-version:
+        required: true
+        type: string
+
+env:
+  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
+  BUILD_TYPE: Debug
+
+jobs:
+  build:
+    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
+    # You can convert this to a matrix build if you need cross-platform coverage.
+    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: Jimver/cuda-toolkit@v0.2.11
+      id: cuda-toolkit
+      with:
+        method: network
+        sub-packages: '["nvcc"]'
+        cuda:  ${{ inputs.cuda-version }}
+
+    - uses: actions/checkout@v3
+      with:
+        submodules: 'true'
+
+    - name: Configure CMake
+      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
+      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
+      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1
+
+    - name: Build
+      # Build your program with the given configuration
+      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+
+    - name: Test
+      working-directory: ${{github.workspace}}/build
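+      # Run the test executable directly (not through ctest); the `~[GPU]` argument presumably excludes GPU-tagged tests, since this runner has no GPU.
+      # The flags look like Catch2 command-line options; see https://github.com/catchorg/Catch2/blob/devel/docs/command-line.md (confirm against the test framework in use).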
+      run: ./tests/kernel_float_tests --durations=yes --success --verbosity=high ~[GPU]
+
@@ -0,0 +1,28 @@
+name: CMake
+
+on:
+  push:
+  pull_request:
+    branches: [ "main" ]
+
+env:
+  # NOTE: workflow-level env is not propagated to reusable workflows called via `uses:`; the build type actually used is BUILD_TYPE in cmake-action.yml.
+  BUILD_TYPE: Debug
+
+jobs:
+  build-cuda:
+    uses: ./.github/workflows/cmake-action.yml
+    with:
+      cuda-version: "12.2.0"
+
+  build-cuda-11-7:
+    needs: build-cuda
+    uses: ./.github/workflows/cmake-action.yml
+    with:
+      cuda-version: "11.7.0"
+
+  build-cuda-12-0:
+    needs: build-cuda
+    uses: ./.github/workflows/cmake-action.yml
+    with:
+      cuda-version: "12.0.0"
@@ -2,6 +2,24 @@
 import subprocess
 from datetime import datetime
 
+license_boilerplate = """/*
+ * Kernel Float: Header-only library for vector types and reduced precision floating-point math.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"""
+
 directory = "include/kernel_float"
 contents = dict()
 
@@ -28,7 +46,8 @@
 except Exception as e:
     print(f"warning: {e}")
 
-output = "\n".join([
+output = license_boilerplate
+output += "\n".join([
     "//" + "=" * 80,
     "// this file has been auto-generated, do not modify its contents!",
     f"// date: {date}",
 
@@ -3,8 +3,10 @@ API Reference
 .. toctree::
    api/types.rst
    api/primitives.rst
+   api/generation.rst
    api/unary_operators.rst
    api/binary_operators.rst
    api/reductions.rst
-   api/shuffling.rst
    api/mathematical.rst
+   api/conditional.rst
+
@@ -65,51 +65,51 @@ def build_index_page(groups):
 
     return filename
 
-aliases = []
-for ty in ["vec", "float", "double", "half", "bfloat16x", ""]:
-    if ty != "vec":
-        aliases.append(f"{ty}X")
-
+aliases = ["scalar", "vec"]
+for ty in ["vec"]:
     for i in range(2, 8 + 1):
         aliases.append(f"{ty}{i}")
 
 groups = {
         "Types": [
             ("vector", "vector", "struct"),
-            ("Aliases", [
-                "unaligned_vec",
-                "vec",
-            ] + aliases,
-            "typedef"),
+            ("Aliases", aliases, "typedef"),
         ],
         "Primitives": [
-            ("range", "range()"),
-            ("range", "range(F)"),
             "map",
             "reduce",
             "zip",
             "zip_common",
             "cast",
             "broadcast",
-            "resize",
-            "for_each",
-        ],
-        "Shuffling": [
+            "convert",
+            "make_vec",
+            "into_vector",
             "concat",
-            "swizzle",
-            "first",
-            "last",
-            "reversed",
-            "rotate_left",
-            "rotate_right",
+            "select",
+            "for_each",
         ],
-        "Unary Operators": [
+        "Generation": [
+            "range",
+            "range_like",
+            "each_index",
             "fill",
             "fill_like",
             "zeros",
             "zeros_like",
             "ones",
             "ones_like",
+        ],
+        "Shuffling": [
+        #    "concat",
+        #    "swizzle",
+        #    "first",
+        #    "last",
+        #    "reversed",
+        #    "rotate_left",
+        #    "rotate_right",
+        ],
+        "Unary Operators": [
             "negate",
             "bit_not",
             "logical_not",
@@ -135,21 +135,21 @@ def build_index_page(groups):
             ("min", "min(L&&, R&&)"),
             "nextafter",
             "modf",
-            "pow",
+            ("pow", "pow(L&&, R&&)"),
             "remainder",
             #"rhypot",
         ],
         "Reductions": [
             "sum",
-            ("max", "max(V&&)"),
-            ("min", "min(V&&)"),
+            ("max", "max(const V&)"),
+            ("min", "min(const V&)"),
             "product",
             "all",
             "any",
             "count",
         ],
         "Mathematical": [
-            "abs",
+            ("abs", "abs(const V&)"),
             "acos",
             "acosh",
             "asin",
@@ -166,22 +166,22 @@ def build_index_page(groups):
             "erfcinv",
             "erfcx",
             "erfinv",
-            "exp",
+            ("exp", "exp(const V&)"),
             "exp10",
             "exp2",
             "fabs",
             "floor",
             "ilogb",
             "lgamma",
-            "log",
+            ("log", "log(const V&)"),
             "log10",
             "logb",
             "nearbyint",
             "normcdf",
             "rcbrt",
             "sin",
             "sinh",
-            "sqrt",
+            ("sqrt", "sqrt(const V&)"),
             "tan",
             "tanh",
             "tgamma",
@@ -193,6 +193,11 @@ def build_index_page(groups):
             "isinf",
             "isnan",
         ],
+        "Conditional": [
+            ("where", "where(const C&, const L&, const R&)"),
+            ("where", "where(const C&, const L&)"),
+            ("where", "where(const C&)"),
+        ]
 }
 
 build_index_page(groups)
@@ -4,9 +4,7 @@
 #include <vector>
 
 #include "kernel_float.h"
-namespace kf = kernel_float;
-
-using x = kf::half;
+using namespace kernel_float::prelude;
 
 void cuda_check(cudaError_t code) {
     if (code != cudaSuccess) {
@@ -15,11 +13,7 @@ void cuda_check(cudaError_t code) {
 }
 
 template<int N>
-__global__ void my_kernel(
-    int length,
-    const kf::unaligned_vec<__half, N>* input,
-    double constant,
-    kf::unaligned_vec<float, N>* output) {
+__global__ void my_kernel(int length, const khalf<N>* input, double constant, kfloat<N>* output) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (i * N < length) {
@@ -30,24 +24,24 @@ __global__ void my_kernel(
 template<int items_per_thread>
 void run_kernel(int n) {
     double constant = 1.0;
-    std::vector<__half> input(n);
+    std::vector<half> input(n);
     std::vector<float> output_expected;
     std::vector<float> output_result;
 
     // Generate input data
     for (int i = 0; i < n; i++) {
-        input[i] = __half(i);
+        input[i] = half(i);
         output_expected[i] = float(i + constant);
     }
 
     // Allocate device memory
-    kf::unaligned_vec<__half, items_per_thread>* input_dev;
-    kf::unaligned_vec<float, items_per_thread>* output_dev;
-    cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
+    khalf<items_per_thread>* input_dev;
+    kfloat<items_per_thread>* output_dev;
+    cuda_check(cudaMalloc(&input_dev, sizeof(half) * n));
     cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));
 
     // Copy device memory
-    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(__half) * n, cudaMemcpyDefault));
+    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));
 
     // Launch kernel!
     int block_size = 256;
 
@@ -1,18 +1,18 @@
 #ifndef KERNEL_FLOAT_H
 #define KERNEL_FLOAT_H
 
+#include "kernel_float/base.h"
 #include "kernel_float/bf16.h"
 #include "kernel_float/binops.h"
-#include "kernel_float/cast.h"
+#include "kernel_float/conversion.h"
 #include "kernel_float/fp16.h"
-#include "kernel_float/fp8.h"
-#include "kernel_float/interface.h"
 #include "kernel_float/iterate.h"
 #include "kernel_float/macros.h"
 #include "kernel_float/meta.h"
+#include "kernel_float/prelude.h"
 #include "kernel_float/reduce.h"
-#include "kernel_float/storage.h"
-#include "kernel_float/swizzle.h"
+#include "kernel_float/triops.h"
 #include "kernel_float/unops.h"
+#include "kernel_float/vector.h"
 
-#endif
+#endif