diff --git a/src/parse_moi.jl b/src/parse_moi.jl
index 26b003b..31b09f1 100644
--- a/src/parse_moi.jl
+++ b/src/parse_moi.jl
@@ -147,13 +147,24 @@ function _parse_moi_stack!(
     parent_index::Int,
 )
     m, n = x.size
-    # Build vcat(row(v11, v12, ...), row(v21, v22, ...), ...)
+    # Build vcat(row(v11, v12, ...), row(v21, v22, ...), ...).
+    #
+    # The outer loop is `1:m` (forward order), NOT `m:-1:1`. The `:row` nodes
+    # we push end up at consecutive positions in `expr.nodes`, and `:vcat`
+    # later reads its children in tape-index order (CSC `nzrange`) — so the
+    # row with the smallest tape index becomes row 1 of the output matrix.
+    # If the outer loop ran in reverse, `row_m` would land at the smallest
+    # tape index and `:vcat` would silently place it as row 1, producing a
+    # row-flipped matrix on the tape (a latent bug, fixed here).
+    #
+    # The inner loop stays `n:-1:1` because the items go on the stack and pop
+    # in LIFO order — pushing in reverse j order gives forward j-order on
+    # pop, which matches the column-major layout below.
     vcat_id = data.operators.multivariate_operator_to_id[:vcat]
     row_id = data.operators.multivariate_operator_to_id[:row]
     push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index))
     vcat_idx = length(expr.nodes)
-    # Push rows in reverse order for stack processing
-    for i in m:-1:1
+    for i in 1:m
         push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx))
         row_idx = length(expr.nodes)
         for j in n:-1:1
@@ -192,11 +203,14 @@ function _parse_moi_stack!(
     parent_index::Int,
 )
     m, n = size(x)
+    # See the `ArrayOfContiguousVariables{2}` overload for the rationale on
+    # the `1:m` outer loop (the previous `m:-1:1` produced a row-flipped
+    # matrix on the tape).
     vcat_id = data.operators.multivariate_operator_to_id[:vcat]
     row_id = data.operators.multivariate_operator_to_id[:row]
     push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index))
     vcat_idx = length(expr.nodes)
-    for i in m:-1:1
+    for i in 1:m
         push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx))
         row_idx = length(expr.nodes)
         for j in n:-1:1
diff --git a/test/JuMP.jl b/test/JuMP.jl
index 2774d81..535c0d2 100644
--- a/test/JuMP.jl
+++ b/test/JuMP.jl
@@ -241,29 +241,57 @@ function _test_neural(
     W1_val = [0.3 -0.2; 0.1 0.4]
     W2_val = [-0.1 0.5; 0.2 -0.3]
     obj, g = _eval(model, loss, [vec(W1_val); vec(W2_val)])
-    obj_val = 0.8516435891643307
+    # Reference computed from the same hand-written forward/reverse formulas
+    # as `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, adapted to
+    # this test's loss `sum((Y - target).^2)` (no `/ n` scaling, full gradient
+    # over both `W1` and `W2`). `_eval` evaluates the objective at `xstart`
+    # and the gradient at `x = [1, ..., 8]`, so we need the references at the
+    # corresponding inputs.
+    X_const = [1.0 0.5; 0.3 0.8]
+    target_const = [0.5 0.2; 0.1 0.7]
+    obj_val = _ref_objective(W1_val, W2_val, X_const, target_const)
     if with_norm
         obj_val = sqrt(obj_val)
     end
     @test obj ≈ obj_val
-    grad = [
-        12.3913945850742
-        0.6880048864793
-        9.4322503589489
-        0.5223651220724
-        46.2269560438734
-        53.9729454980064
-        45.7401048264386
-        53.4195902684781
-    ]
+    W1_at_grad = reshape([1.0, 2.0, 3.0, 4.0], 2, 2)
+    W2_at_grad = reshape([5.0, 6.0, 7.0, 8.0], 2, 2)
+    grad_sumsq = _ref_gradient(W1_at_grad, W2_at_grad, X_const, target_const)
     if with_norm
-        @test g ≈ grad * 0.019879429552408144
+        # `d/dx ‖E‖₂ = (1/(2‖E‖₂)) · d/dx ‖E‖₂² = grad_sumsq / (2 sqrt(sumsq))`,
+        # taken at the gradient evaluation point.
+        norm_at_grad =
+            sqrt(_ref_objective(W1_at_grad, W2_at_grad, X_const, target_const))
+        @test g ≈ grad_sumsq ./ (2 * norm_at_grad)
     else
-        @test g ≈ grad
+        @test g ≈ grad_sumsq
     end
     return
 end
 
+# Hand-written forward + reverse for the 2-layer MLP `loss = sum((W2 *
+# tanh.(W1 * X) - target).^2)`. Same shape as `perf/cuda_vs_pytorch.jl`'s
+# `forward_pass` / `reverse_diff` but adapted to this test (no `/ n` scaling
+# and gradient over both `W1` and `W2`). Returned gradient is flattened with
+# the JuMP variable convention `[vec(grad_W1); vec(grad_W2)]`.
+function _ref_forward(W1, W2, X, target)
+    y_1 = tanh.(W1 * X)
+    J_1 = 1 .- y_1 .^ 2
+    J_2 = 2 .* (W2 * y_1 .- target)
+    return y_1, J_1, J_2
+end
+
+function _ref_objective(W1, W2, X, target)
+    return sum((W2 * tanh.(W1 * X) .- target) .^ 2)
+end
+
+function _ref_gradient(W1, W2, X, target)
+    y_1, J_1, J_2 = _ref_forward(W1, W2, X, target)
+    grad_W1 = (J_1 .* (W2' * J_2)) * X'
+    grad_W2 = J_2 * y_1'
+    return [vec(grad_W1); vec(grad_W2)]
+end
+
 function test_neural()
     bin = [false, true]
     @testset "$(with_norm ? "norm" : "sum")" for with_norm in bin
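# Toy model of the tape-ordering argument in the src/parse_moi.jl comment
# above (plain Julia; `nodes` and `children` are illustrative stand-ins, not
# the package's Node/expression types). Children of a node come back in the
# order they were appended to the tape, so appending rows with `for i in 1:m`
# makes row 1 the first child of `:vcat`, i.e. row 1 of the result, while
# `m:-1:1` would flip it.
nodes = Tuple{Symbol,Int}[]      # (operation, parent index); parent 0 = root
push!(nodes, (:vcat, 0))
vcat_idx = length(nodes)
m = 3
for i in 1:m
    push!(nodes, (Symbol("row_$i"), vcat_idx))
end
# Children recovered in tape-index (append) order:
children = [op for (op, parent) in nodes if parent == vcat_idx]
@assert children == [:row_1, :row_2, :row_3]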
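# Standalone sanity check, a minimal sketch and not part of the patch: it
# verifies that the hand-written reverse-mode formulas used by `_ref_gradient`
# match central finite differences of the same loss, and that the
# `1 / (2‖E‖₂)` chain-rule factor used in the `with_norm` branch matches a
# finite difference of the norm objective. Plain Base Julia; the helper names
# (sumsq_loss, ref_grad, fd_grad, unpack) are illustrative, not package API.
sumsq_loss(W1, W2, X, target) = sum((W2 * tanh.(W1 * X) .- target) .^ 2)

# Same reverse-pass formulas as `_ref_gradient` in the patch above.
function ref_grad(W1, W2, X, target)
    y1 = tanh.(W1 * X)              # forward: hidden activations
    J1 = 1 .- y1 .^ 2               # tanh'
    J2 = 2 .* (W2 * y1 .- target)   # d(sum of squares) / d(W2 * y1)
    return [vec((J1 .* (W2' * J2)) * X'); vec(J2 * y1')]
end

# Central finite differences of a scalar function of a flat parameter vector.
function fd_grad(f, x; h = 1e-6)
    return [
        (f([x[1:k-1]; x[k] + h; x[k+1:end]]) -
         f([x[1:k-1]; x[k] - h; x[k+1:end]])) / (2h) for k in eachindex(x)
    ]
end

X = [1.0 0.5; 0.3 0.8]                      # matches X_const in the test
target = [0.5 0.2; 0.1 0.7]                 # matches target_const
W1 = reshape([1.0, 2.0, 3.0, 4.0], 2, 2)    # gradient evaluation point
W2 = reshape([5.0, 6.0, 7.0, 8.0], 2, 2)
x0 = [vec(W1); vec(W2)]
unpack(x) = (reshape(x[1:4], 2, 2), reshape(x[5:8], 2, 2))

f_sumsq(x) = sumsq_loss(unpack(x)..., X, target)
g_hand = ref_grad(W1, W2, X, target)
@assert isapprox(g_hand, fd_grad(f_sumsq, x0); rtol = 1e-5)

# Norm objective: d sqrt(s)/dx = (ds/dx) / (2 sqrt(s)).
f_norm(x) = sqrt(f_sumsq(x))
@assert isapprox(g_hand ./ (2 * sqrt(f_sumsq(x0))), fd_grad(f_norm, x0); rtol = 1e-5)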