# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: Apache-2.0
"""Example inference script for the CogVideoX image-to-video MOT pipeline.

Loads the VAE, transformer and pipeline from a local checkpoint directory,
samples frames from a reference video, generates a video conditioned on a
target image plus the reference frames, and writes the result to disk.
"""
import os
import random
import shutil
import tempfile
from pathlib import Path
from typing import List

import numpy as np
import PIL
import torch
from PIL import Image

from diffusers import (
    AutoencoderKLCogVideoX,
    CogVideoXImageToVideoMOTPipeline,
    CogVideoXTransformer3DMOTModel,
)
from diffusers.utils import export_to_video, load_video


def _remove_if_exists(path) -> None:
    """Delete *path*, ignoring the case where it does not exist."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def export_via_tmp(frames, final_path, fps):
    """Encode *frames* to a local temp file, then move the result to *final_path*.

    The video is first written into a scratch directory (``$LOCAL_TMP`` if
    set, otherwise ``/dev/shm`` when it exists, otherwise ``/tmp``), copied
    next to the destination as ``<name><suffix>.partial`` and finally renamed
    over *final_path* with ``os.replace``. The destination therefore only
    appears once the copy has completed.

    Args:
        frames: Frames to encode; passed unchanged to ``export_to_video``.
        final_path: Destination path. Missing parent directories are created.
            When it has no suffix, the temp file is created with ``.mp4``.
        fps: Frame rate passed to ``export_to_video``.

    Raises:
        Whatever ``export_to_video``, ``shutil.copyfile`` or ``os.replace``
        raise. The temp file and any leftover ``.partial`` file are removed
        before the exception propagates.
    """
    final_path = Path(final_path)
    # Computed up front so the ``finally`` clause below can always clean it up.
    partial = final_path.with_suffix(final_path.suffix + ".partial")
    final_path.parent.mkdir(parents=True, exist_ok=True)

    default_tmp = "/dev/shm" if os.path.isdir("/dev/shm") else "/tmp"
    local_tmp_dir = os.environ.get("LOCAL_TMP", default_tmp)
    suffix = final_path.suffix or ".mp4"

    # mkstemp only reserves a unique name here; the encoder reopens the path
    # itself, so the descriptor is closed immediately.
    fd, tmp_path = tempfile.mkstemp(prefix="vid_", suffix=suffix, dir=local_tmp_dir)
    os.close(fd)

    try:
        export_to_video(frames, tmp_path, fps=fps)
        shutil.copyfile(tmp_path, partial)
        os.replace(partial, final_path)
    finally:
        _remove_if_exists(tmp_path)
        # After a successful os.replace the partial file is already gone; this
        # only removes a half-copied file left behind by a failure.
        _remove_if_exists(partial)


def select_frames(video_frames: List[PIL.Image.Image], num: int, mode: str) -> List[PIL.Image.Image]:
    """Pick a subset of *video_frames* according to *mode*.

    Args:
        video_frames: Source frames.
        num: Number of frames requested.
        mode: Selection strategy:
            ``"first"``  - the first *num* frames;
            ``"evenly"`` - *num* indices spread from the first to the last
            frame via ``torch.linspace`` (fractional indices are truncated, so
            frames repeat when *num* exceeds the number of frames);
            ``"random"`` - a contiguous window of *num* frames at a random
            start, or all frames if there are at most *num*.
            Any other value returns all frames unchanged.

    Returns:
        The selected frames; an empty list when *video_frames* is empty.
    """
    if not video_frames:
        return []

    total = len(video_frames)

    if mode == "first":
        return video_frames[:num]

    if mode == "evenly":
        indices = torch.linspace(0, total - 1, num).long().tolist()
        return [video_frames[i] for i in indices]

    if mode == "random":
        if total <= num:
            return video_frames
        start = random.randint(0, total - num)
        return video_frames[start:start + num]

    # Unknown mode: fall back to returning every frame.
    return video_frames


def set_global_seed(seed: int):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def main() -> None:
    """Run the demo: load the models, generate one video and save it."""
    model_id = "ckpts/Video-As-Prompt-CogVideoX-5B"
    output_root = "outputs_infer"
    target_image_path = "assets/images/demo/animal-2.jpg"
    ref_video_path = "assets/videos/demo/object-725.mp4"

    set_global_seed(42)

    vae = AutoencoderKLCogVideoX.from_pretrained(
        model_id, subfolder="vae", torch_dtype=torch.bfloat16
    )
    transformer = CogVideoXTransformer3DMOTModel.from_pretrained(
        model_id, subfolder="transformer", torch_dtype=torch.bfloat16
    )
    pipe = CogVideoXImageToVideoMOTPipeline.from_pretrained(
        model_id, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
    ).to("cuda")

    # Lower-memory alternative: build the pipeline without ``.to("cuda")`` and
    # enable one of the CPU offload modes instead.
    # pipe = CogVideoXImageToVideoMOTPipeline.from_pretrained(
    #     model_id,
    #     vae=vae,
    #     transformer=transformer,
    #     torch_dtype=torch.bfloat16
    # )
    # # offload based on module, max around 30GB
    # pipe.enable_model_cpu_offload()
    # # offload based on layer, max around 7.5GB
    # # pipe.enable_sequential_cpu_offload()

    ref_video = load_video(ref_video_path)
    image = Image.open(target_image_path).convert("RGB")
    ref_frames = select_frames(ref_video, num=49, mode="evenly")

    output_frames = pipe(
        image=image,
        ref_videos=[ref_frames],
        prompt="A chestnut-colored horse stands on a grassy hill against a backdrop of distant, snow-dusted mountains. The horse begins to inflate, its defined, muscular body swelling and rounding into a smooth, balloon-like form while retaining its rich, brown hide color. Without changing its orientation, the now-buoyant horse lifts silently from the ground. It begins a steady vertical ascent, rising straight up and eventually floating out of the top of the frame. The camera remains completely static throughout the entire sequence, holding a fixed shot on the landscape as the horse transforms and departs, ensuring the verdant hill and mountain range in the background stay perfectly still.",
        prompt_mot_ref=[
            "A hand holds up a single beige sneaker decorated with gold calligraphy and floral illustrations, with small green plants tucked inside. The sneaker immediately begins to inflate like a balloon, its shape distorting as the decorative details stretch and warp across the expanding surface. It rapidly transforms into a perfectly smooth, matte beige sphere, inheriting the primary color from the original shoe. Once the transformation is complete, the new balloon-like object quickly ascends, moving straight up and exiting the top of the frame. The camera remains completely static and the plain white background is unchanged throughout the entire sequence."
        ],
        height=480,
        width=720,
        num_frames=49,
        frames_selection="evenly",
        use_dynamic_cfg=True,
        # generator=torch.Generator(device="cuda").manual_seed(42),
    ).frames[0]

    export_via_tmp(output_frames, os.path.join(output_root, "cog_vap.mp4"), fps=16)


if __name__ == "__main__":
    main()