Commit e1d9c23

Code review

1 parent 9cf7226 commit e1d9c23
4 files changed, +71 -69 lines changed

ggml/src/ggml-cpu/ops.cpp

Lines changed: 11 additions & 1 deletion
@@ -2207,6 +2207,16 @@ static void ggml_compute_forward_tri_f32(const ggml_compute_params * params, ggm
 
     const auto [ir0, ir1] = get_thread_range(params, src0);
 
+    bool (*bipred)(int, int);
+
+    switch (ttype) {
+        case GGML_TRI_TYPE_LOWER:      bipred = [](int i, int r) { return i <  r; }; break;
+        case GGML_TRI_TYPE_LOWER_DIAG: bipred = [](int i, int r) { return i <= r; }; break;
+        case GGML_TRI_TYPE_UPPER:      bipred = [](int i, int r) { return i >  r; }; break;
+        case GGML_TRI_TYPE_UPPER_DIAG:
+        default:                       bipred = [](int i, int r) { return i >= r; }; break;
+    }
+
     for (int64_t ir = ir0; ir < ir1; ++ir) {
         const int64_t i03 = ir/(ne02*ne01);
         const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
@@ -2215,7 +2225,7 @@ static void ggml_compute_forward_tri_f32(const ggml_compute_params * params, ggm
         float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
         float * src_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-        ggml_vec_tri_f32(ne0, i01, dst_ptr, src_ptr, keep_org_val, c, ttype);
+        ggml_vec_tri_f32(ne0, i01, dst_ptr, src_ptr, keep_org_val, c, bipred);
     }
 
 }
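
With this change the switch over `ttype` runs once per thread range instead of once per element; each branch selects a captureless lambda, which converts implicitly to the plain `bool (*)(int, int)` pointer that `ggml_vec_tri_f32` now takes (see the vec.h hunk below). A minimal standalone sketch of that pattern, using a stand-in enum rather than the real ggml types:

    // Standalone illustration only (not ggml code): a captureless lambda converts
    // to an ordinary function pointer, so the mask predicate can be chosen once
    // and the hot loop just calls through the pointer.
    #include <cstdio>

    enum tri_type { TRI_LOWER, TRI_LOWER_DIAG, TRI_UPPER, TRI_UPPER_DIAG };

    int main() {
        const tri_type ttype = TRI_LOWER_DIAG;

        bool (*bipred)(int, int);
        switch (ttype) {
            case TRI_LOWER:      bipred = [](int i, int r) { return i <  r; }; break;
            case TRI_LOWER_DIAG: bipred = [](int i, int r) { return i <= r; }; break;
            case TRI_UPPER:      bipred = [](int i, int r) { return i >  r; }; break;
            case TRI_UPPER_DIAG:
            default:             bipred = [](int i, int r) { return i >= r; }; break;
        }

        // apply the selected predicate across one row (r = 2) of width 5
        for (int i = 0; i < 5; ++i) {
            printf("%d", bipred(i, 2) ? 1 : 0);   // prints 11100 for lower + diagonal
        }
        printf("\n");
        return 0;
    }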

ggml/src/ggml-cpu/vec.h

Lines changed: 3 additions & 11 deletions
@@ -1424,18 +1424,10 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 // src          - input array
 // keep_org_val - if true, keep original value where mask applies; otherwise use constant 'c'
 // c            - constant value to use when not keeping original value
-// type         - type of triangular mask (lower, upper, etc.)
-inline static void ggml_vec_tri_f32(const int n, const int r, float * dst, const float * src, bool keep_org_val, float c, enum ggml_tri_type type) {
+// bipred       - the predicate on coordinates, derived from tri_type
+inline static void ggml_vec_tri_f32(const int n, const int r, float * dst, const float * src, bool keep_org_val, float c, bool (*bipred)(int, int)) {
     for (int i = 0; i < n; ++i) {
-        bool cmp = false;
-        switch (type) {
-            case GGML_TRI_TYPE_LOWER:      cmp = i <  r; break;
-            case GGML_TRI_TYPE_LOWER_DIAG: cmp = i <= r; break;
-            case GGML_TRI_TYPE_UPPER:      cmp = i >  r; break;
-            case GGML_TRI_TYPE_UPPER_DIAG:
-            default:                       cmp = i >= r; break;
-        }
-        dst[i] = cmp ? (keep_org_val ? src[i] : c) : 0.0f;
+        dst[i] = bipred(i, r) ? (keep_org_val ? src[i] : c) : 0.0f;
     }
 }
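
Since `ggml_vec_tri_f32` now just calls the supplied predicate per element, the refactor is behaviour-preserving as long as the hoisted switch picks the same comparison as the old inline one. A small self-contained equivalence check (illustrative only, again with stand-in names rather than the ggml enum):

    // Illustrative check (not part of the commit): the hoisted predicate must
    // agree with the old per-element switch for every mask type and index pair.
    #include <cassert>

    enum tri_type { TRI_LOWER, TRI_LOWER_DIAG, TRI_UPPER, TRI_UPPER_DIAG };

    // old behaviour: decide per element
    static bool old_cmp(tri_type type, int i, int r) {
        switch (type) {
            case TRI_LOWER:      return i <  r;
            case TRI_LOWER_DIAG: return i <= r;
            case TRI_UPPER:      return i >  r;
            case TRI_UPPER_DIAG:
            default:             return i >= r;
        }
    }

    // new behaviour: decide once, return a function pointer
    static bool (*select_pred(tri_type type))(int, int) {
        switch (type) {
            case TRI_LOWER:      return [](int i, int r) { return i <  r; };
            case TRI_LOWER_DIAG: return [](int i, int r) { return i <= r; };
            case TRI_UPPER:      return [](int i, int r) { return i >  r; };
            case TRI_UPPER_DIAG:
            default:             return [](int i, int r) { return i >= r; };
        }
    }

    int main() {
        for (int t = TRI_LOWER; t <= TRI_UPPER_DIAG; ++t) {
            bool (*bipred)(int, int) = select_pred((tri_type) t);
            for (int r = 0; r < 8; ++r) {
                for (int i = 0; i < 8; ++i) {
                    assert(bipred(i, r) == old_cmp((tri_type) t, i, r));
                }
            }
        }
        return 0;
    }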

ggml/src/ggml.c

Lines changed: 10 additions & 6 deletions
@@ -5082,6 +5082,9 @@ struct ggml_tensor * ggml_tri(
         float constant,
         enum ggml_tri_type tritype) {
 
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->ne[0] == a->ne[1]);
+
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params_i32(result, 0, tritype);
@@ -5956,6 +5959,7 @@ struct ggml_tensor * ggml_opt_step_sgd(
 }
 
 // solve_tri
+
 struct ggml_tensor * ggml_solve_tri(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -5966,9 +5970,9 @@ struct ggml_tensor * ggml_solve_tri(
     // B must have same outer dimension as A
     GGML_ASSERT(a->ne[1] == b->ne[1]);
 
-    // B must be broadcastable to A
-    GGML_ASSERT(a->ne[2] % b->ne[2] == 0);
-    GGML_ASSERT(a->ne[3] % b->ne[3] == 0);
+    // batch dimensions must be equal
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(a->ne[3] == b->ne[3]);
 
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_is_contiguous(b));
@@ -6565,12 +6569,12 @@ static void ggml_compute_backward(
                 struct ggml_tensor * neg_src0 = ggml_neg(ctx, src0);
                 struct ggml_tensor * exp_neg  = ggml_exp(ctx, neg_src0);
                 struct ggml_tensor * ones =
-                    ggml_exp(ctx, ggml_new_tensor_4d(ctx, src0->type, src0->ne[0], src0->ne[1], src0->ne[2],
-                                                     src0->ne[3]));
+                    ggml_scale_bias(ctx, ggml_new_tensor_4d(ctx, src0->type, src0->ne[0], src0->ne[1], src0->ne[2],
+                                                            src0->ne[3]), 0.0f, 1.0f);
                 struct ggml_tensor * one_plus_exp = ggml_add(ctx, ones, exp_neg);
                 struct ggml_tensor * sigmoid = ggml_div(ctx, ones, one_plus_exp);
                 ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, sigmoid));
-            }
+                }
             } break;
         default: {
             fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n",
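
In the backward pass above, `ggml_scale_bias(..., 0.0f, 1.0f)` builds the all-ones tensor as 0*x + 1, which is 1 regardless of what the freshly allocated buffer contains, whereas the previous `ggml_exp` of a new tensor only yields 1 if that buffer happens to be zero. The ones tensor then forms 1 / (1 + exp(-src0)) = sigmoid(src0), i.e. the derivative of softplus, which scales the incoming gradient. A standalone numeric check of that identity (plain C++, no ggml dependency):

    // Finite-difference check that d/dx softplus(x) = sigmoid(x) = 1 / (1 + exp(-x)).
    #include <cassert>
    #include <cmath>
    #include <cstdio>

    static double softplus(double x) { return std::log1p(std::exp(x)); }
    static double sigmoid (double x) { return 1.0 / (1.0 + std::exp(-x)); }

    int main() {
        const double h = 1e-6;
        for (double x = -4.0; x <= 4.0; x += 0.5) {
            // central difference of softplus vs. the analytic sigmoid
            const double fd = (softplus(x + h) - softplus(x - h)) / (2.0 * h);
            assert(std::fabs(fd - sigmoid(x)) < 1e-6);
        }
        printf("softplus' matches sigmoid on the sampled grid\n");
        return 0;
    }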

tests/test-backend-ops.cpp

Lines changed: 47 additions & 51 deletions
@@ -188,12 +188,11 @@ static void init_tensor_causal(ggml_tensor * tensor, float min = -1.0f, float ma
     std::mt19937 gen(rd());
     std::uniform_real_distribution<float> dis(min, max);
 
-    for (int64_t i0 = 0; i0 < tensor->ne[0]; i0++) {
-        for (int64_t i1 = 0; i1 < tensor->ne[1]; i1++) {
-            for (int64_t i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int64_t i3 = 0; i3 < tensor->ne[3]; i3++) {
-                    int64_t idx = i0 * tensor->nb[0] / sizeof(float) + i1 * tensor->nb[1] / sizeof(float) +
-                                  i2 * tensor->nb[2] / sizeof(float) + i3 * tensor->nb[3] / sizeof(float);
+    for (int64_t i3 = 0; i3 < tensor->ne[3]; i3++) {
+        for (int64_t i2 = 0; i2 < tensor->ne[2]; i2++) {
+            for (int64_t i1 = 0; i1 < tensor->ne[1]; i1++) {
+                for (int64_t i0 = 0; i0 < tensor->ne[0]; i0++) {
+                    int64_t idx = (i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]) / sizeof(float);
                     if (i0 <= i1) {
                         data_f32[idx] = dis(gen);
                     } else {
@@ -4785,7 +4784,6 @@ struct test_argsort : public test_case {
     }
 };
 
-// GGML_OP_TOPK_MOE
 struct test_topk_moe: public test_case {
     const std::array<int64_t, 4> ne;
     const int n_expert_used;
@@ -4843,7 +4841,6 @@ struct test_topk_moe: public test_case {
     }
 };
 
-// GGML_MOE_EXPERT_REDUCE
 struct test_moe_expert_reduce : public test_case {
     const int64_t n_embd;
     const int64_t n_tokens;
@@ -5349,7 +5346,7 @@ struct test_pad : public test_case {
     }
 };
 
-// GGML_OP_EXT
+// GGML_OP_PAD (with extension)
 struct test_pad_ext : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne_a;
@@ -5797,49 +5794,53 @@ struct test_opt_step_sgd : public test_case {
     }
 };
 
-// GGML_OP_ADD
-// GGML_OP_SUB
-// GGML_OP_DIV
-// GGML_OP_MUL
-struct test_op_arith : public test_case {
+// GGML_OP_CUMSUM
+struct test_cumsum : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
-    const ggml_op op;
 
-    std::string vars() override { return VARS_TO_STR3(type, ne, op); }
+    std::string vars() override { return VARS_TO_STR2(type, ne); }
 
-    test_op_arith(ggml_op op, ggml_type type = GGML_TYPE_F32,
+    test_cumsum(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = { 10, 5, 4, 3 })
-        : type(type), ne(ne), op(op) {
-        GGML_ASSERT(op == GGML_OP_ADD || op == GGML_OP_SUB || op == GGML_OP_DIV || op == GGML_OP_MUL);
-    }
+        : type(type), ne(ne) {}
 
-    ggml_tensor * build_graph(ggml_context * ctx) override {
+    ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
         ggml_set_param(a);
         ggml_set_name(a, "a");
 
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
-        ggml_set_name(b, "b");
+        ggml_tensor * out = ggml_cumsum(ctx, a);
 
-        ggml_tensor * out;
+        ggml_set_name(out, "out");
 
-        switch (op) {
-            case GGML_OP_ADD:
-                out = ggml_add(ctx, a, b);
-                break;
-            case GGML_OP_SUB:
-                out = ggml_sub(ctx, a, b);
-                break;
-            case GGML_OP_DIV:
-                out = ggml_div(ctx, a, b);
-                break;
-            case GGML_OP_MUL:
-                out = ggml_mul(ctx, a, b);
-                break;
-            default:
-                GGML_ABORT("This test only supports ADD, SUB, DIV and MUL");
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -1.0f, 1.0f);
         }
+    }
+};
+
+// GGML_OP_EXPM1
+struct test_expm1 : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override { return VARS_TO_STR2(type, ne); }
+
+    test_expm1(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = { 10, 5, 4, 3 })
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_expm1(ctx, a);
 
         ggml_set_name(out, "out");
 
@@ -5848,20 +5849,19 @@ struct test_op_arith : public test_case {
 
     void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t, 0.1f, 1.0f); // no zeroes because div might complain
+            init_tensor_uniform(t, -1.0f, 1.0f);
        }
     }
-
 };
 
-// GGML_OP_CUMSUM
-struct test_cumsum : public test_case {
+// GGML_OP_SOFTPLUS
+struct test_softplus : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
 
     std::string vars() override { return VARS_TO_STR2(type, ne); }
 
-    test_cumsum(ggml_type type = GGML_TYPE_F32,
+    test_softplus(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = { 10, 5, 4, 3 })
         : type(type), ne(ne) {}
 
@@ -5870,7 +5870,7 @@ struct test_cumsum : public test_case {
         ggml_set_param(a);
         ggml_set_name(a, "a");
 
-        ggml_tensor * out = ggml_cumsum(ctx, a);
+        ggml_tensor * out = ggml_softplus(ctx, a);
 
         ggml_set_name(out, "out");
 
@@ -7256,6 +7256,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_ceil (type));
         test_cases.emplace_back(new test_round (type));
         test_cases.emplace_back(new test_trunc (type));
+        test_cases.emplace_back(new test_expm1 (type));
+        test_cases.emplace_back(new test_softplus (type));
         test_cases.emplace_back(new test_sqr (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_sqrt (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_log (type, {7, 1, 5, 3}));
@@ -7269,12 +7271,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_trunc (type, {7, 1, 5, 3}));
     }
 
-    // basic arithmetic, have to do them manually now that fusion is not supported
-    test_cases.emplace_back(new test_op_arith(GGML_OP_ADD, GGML_TYPE_F32));
-    test_cases.emplace_back(new test_op_arith(GGML_OP_SUB, GGML_TYPE_F32));
-    test_cases.emplace_back(new test_op_arith(GGML_OP_DIV, GGML_TYPE_F32));
-    test_cases.emplace_back(new test_op_arith(GGML_OP_MUL, GGML_TYPE_F32));
-
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5));
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5));
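
The reworked `init_tensor_causal` now walks the dimensions from i3 (outermost) down to i0 (innermost) and sums the byte offsets before dividing by `sizeof(float)` once, drawing random values only where i0 <= i1. A self-contained sketch of the same indexing and causal fill; the sizes, contiguous strides, and 0/1 fill here are illustrative stand-ins:

    // Standalone sketch (not test code): flat float index from byte strides,
    // filled only where i0 <= i1, mirroring the reordered loop above.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne[4] = { 4, 4, 1, 1 };                 // logical sizes
        const size_t  nb[4] = { sizeof(float),                // contiguous byte strides
                                4 * sizeof(float),
                                16 * sizeof(float),
                                16 * sizeof(float) };
        float data[16] = { 0 };

        for (int64_t i3 = 0; i3 < ne[3]; i3++)
        for (int64_t i2 = 0; i2 < ne[2]; i2++)
        for (int64_t i1 = 0; i1 < ne[1]; i1++)
        for (int64_t i0 = 0; i0 < ne[0]; i0++) {
            // sum the byte offsets first, divide by the element size once
            const int64_t idx = (i0 * nb[0] + i1 * nb[1] + i2 * nb[2] + i3 * nb[3]) / sizeof(float);
            data[idx] = (i0 <= i1) ? 1.0f : 0.0f;             // stand-in for the random fill
        }

        for (int64_t i1 = 0; i1 < ne[1]; i1++) {              // prints a lower-triangular 0/1 mask
            for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                printf("%.0f ", data[i1 * ne[0] + i0]);
            }
            printf("\n");
        }
        return 0;
    }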
