diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index a59ff23e8..6db01d9b5 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -3275,6 +3275,17 @@ static std::optional prepare_video_generation_latents(sd latents.denoise_mask = sd::full({latents.init_latent.shape()[0], latents.init_latent.shape()[1], latents.init_latent.shape()[2], 1, 1}, 1.f); sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], 0.0f); + if (!end_image.empty()) { + auto end_img = end_image.reshape({end_image.shape()[0], end_image.shape()[1], 1, end_image.shape()[2], 1}); + auto end_image_latent = sd_ctx->sd->encode_first_stage(end_img); // [b, c, 1, h/vae_scale_factor, w/vae_scale_factor] + if (end_image_latent.empty()) { + LOG_ERROR("failed to encode end video frame"); + return std::nullopt; + } + sd::ops::slice_assign(&latents.init_latent, 2, latents.init_latent.shape()[2] - 1, latents.init_latent.shape()[2], end_image_latent); + sd::ops::fill_slice(&latents.denoise_mask, 2, latents.init_latent.shape()[2] - 1, latents.init_latent.shape()[2], 0.0f); + } + int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" ||