当前位置：首页 > article >正文

inpainting 语言驱动

article 2025/4/2 15:16:35

language 驱动

安装xformers

结论： inpaint Anything好：

分辨率宽和高需要是64的倍数。

推理代码：

AVID：截至 2024.11.15 尚未开源。

language 驱动

GitHub - jianzongwu/Language-Driven-Video-Inpainting: (CVPR 2024) Official code for paper "Towards Language-Driven Video Inpainting via Multimodal Large Language Models"

安装xformers

(RECOMMENDED, linux) Install latest stable with conda: Requires PyTorch 2.5.1 installed with conda

# (python 3.10/3.11 only)
conda install xformers -c xformers

(RECOMMENDED, linux & win) Install latest stable with pip: Requires PyTorch 2.5.1

# [linux only] cuda 11.8 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118
# [linux only] cuda 12.1 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
# [linux & win] cuda 12.4 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu124
# [linux only] (EXPERIMENTAL) rocm 6.1 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/rocm6.1

Development binaries:

# Use either conda or pip, same requirements as for the stable version above
conda install xformers -c xformers/label/dev
pip install --pre -U xformers

Install from source: If you want to use with another version of PyTorch for instance (including nightly-releases)

# (Optional) Makes the build much faster
pip install ninja
# Set TORCH_CUDA_ARCH_LIST if running and building on different GPU types
pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
# (this can take dozens of minutes)

提示词：remove the person on middle

效果图：

结论：相比之下，Inpaint Anything 的效果更好。

分辨率宽和高需要是64的倍数。

huggingface-cli download --resume-download jianzongwu/lgvi-i --local-dir ckpt

推理代码：

import os
import argparse
from einops import rearrange
import random
import numpy as np
from PIL import Image

import torch
import cv2
import sys
import os
os.chdir(os.path.dirname(os.path.abspath(__file__)))
import torchvision
from rovi.models.unet import RoviModel
from rovi.pipelines.pipeline_rovi import RoviPipeline


import pdb
# WIDTH = 768
# HEIGHT = 384

def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=4, fps=25):
    """Tile a batch of videos into one grid per time step and write it as a video.

    Args:
        videos: Tensor laid out as (batch, channel, time, height, width),
            as required by the rearrange pattern below.
        path: Output file path. Its parent directory is created when missing.
        rescale: When true, values are mapped from [-1, 1] to [0, 1] before
            the conversion to 8-bit.
        n_rows: Forwarded to ``torchvision.utils.make_grid`` as ``nrow``.
        fps: Frame rate handed to the writer. The default of 25 is the rate
            that used to be hard-coded, so default calls produce the same file.
    """
    # imageio is only needed here and is not imported at module level.
    import imageio

    per_frame = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for x in per_frame:
        x = torchvision.utils.make_grid(x, nrow=n_rows)
        # (C, H, W) -> (H, W, C), the layout the video writer expects.
        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
        # .cpu() is a no-op for CPU tensors and makes CUDA tensors convertible.
        x = (x * 255).cpu().numpy().astype(np.uint8)
        outputs.append(x)

    # os.makedirs('') raises, so only create a directory when path names one.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    imageio.mimsave(path, outputs, fps=fps)
    
def resize_image(image, max_area=600 * 400):
    """Resize ``image`` so both sides are multiples of 64 and the area fits ``max_area``.

    The scale is searched from 100% down to 21% in 1% steps; the first scale
    whose 64-aligned size has a non-zero area strictly below ``max_area`` wins.
    The image is never enlarged by this search.

    When no scale in that range fits (the frame is too large even at 21%, or a
    side is shorter than 64 pixels), the target size is computed directly from
    ``max_area`` with each side clamped to at least 64, instead of returning
    ``None``. In that fallback the clamping can push the area above
    ``max_area`` for extreme aspect ratios.

    Args:
        image: Array whose first two dimensions are (height, width).
        max_area: Upper bound for ``width * height`` of the result.

    Returns:
        A new array with the chosen size. The input is not modified.

    Raises:
        ValueError: If the image has a zero-length side or ``max_area`` is not positive.
    """
    align = 64
    # Lower bound of the search so the frame is not shrunk excessively.
    min_scale = 0.2

    src_h, src_w = image.shape[:2]
    if src_h <= 0 or src_w <= 0:
        raise ValueError(f"cannot resize an empty image of size {src_w}x{src_h}")
    if max_area <= 0:
        raise ValueError(f"max_area must be positive, got {max_area}")

    target = None
    for percent in range(100, int(min_scale * 100), -1):
        scale = percent / 100
        cand_w = int(src_w * scale / align) * align
        cand_h = int(src_h * scale / align) * align
        if 0 < cand_w * cand_h < max_area:
            target = (cand_w, cand_h)
            break

    if target is None:
        # Nothing in the searched range fits: derive the scale from the area
        # budget and keep every side at least one alignment unit long.
        scale = min(1.0, (max_area / (src_w * src_h)) ** 0.5)
        target = (
            max(align, int(src_w * scale / align) * align),
            max(align, int(src_h * scale / align) * align),
        )

    if target == (src_w, src_h):
        # Already the right size; skip the no-op resize but still return a copy.
        return image.copy()
    return cv2.resize(image, target)
# Expose only physical GPU 4 to this process. With a single visible device,
# CUDA addresses it as index 0, hence the "cuda:0" handle below.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
device = torch.device("cuda", 0)

def load_video(video_path, max_num_frames=80):
    """Read the leading frames of a video into a normalized half-precision CUDA tensor.

    Each frame is passed through ``resize_image`` and converted from BGR to
    RGB. The frames are then stacked and rearranged to
    (1, frames, channels, height, width) with values scaled to [-1, 1].

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_num_frames: Maximum number of frames to read from the start.

    Returns:
        A float16 tensor on the current CUDA device.

    Raises:
        ValueError: If no frame could be read, or a frame could not be resized.
    """
    frames = []
    capture = cv2.VideoCapture(video_path)
    try:
        while len(frames) < max_num_frames:
            _, frame = capture.read()
            if frame is None:
                # End of stream, or the source could not be opened/decoded.
                break
            frame = resize_image(frame)
            if frame is None:
                raise ValueError(f"could not find a valid target size for frames of {video_path}")
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the decoder handle, even when a frame fails to process.
        capture.release()

    if not frames:
        raise ValueError(f"no frames could be read from {video_path}")

    print("frames[0].shape", frames[0].shape)
    print("len(frames)", len(frames))

    # (H, W, C) frames stacked on axis 2 -> (H, W, T, C); permute -> (T, C, H, W).
    frames = np.stack(frames, axis=2)
    frames = torch.from_numpy(frames).permute(2, 3, 0, 1).contiguous().unsqueeze(0)
    # uint8 [0, 255] -> [0, 1] -> fp16 on GPU -> [-1, 1].
    frames = frames.float().div(255).clamp(0, 1).half().cuda() * 2.0 - 1.0
    return frames

@torch.no_grad()
def main(args):
    """Run language-driven video inpainting on one video and save the result.

    Reads ``args.seed``, ``args.ckpt_path``, ``args.video_path``, ``args.expr``,
    ``args.gs`` and ``args.igs``. The output is written to
    ``./results/<args.expr>.mp4``.

    Args:
        args: Parsed command-line namespace carrying the attributes above.
    """
    # Seed every RNG in use so repeated runs start from the same state.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    generator = torch.Generator(device)
    generator.manual_seed(args.seed)

    # Load the UNet separately, then build the pipeline around it in fp16.
    weight_dtype = torch.float16
    unet = RoviModel.from_pretrained(args.ckpt_path, subfolder='unet')
    unet.to(device).to(weight_dtype)
    pipe = RoviPipeline.from_pretrained(args.ckpt_path, unet=unet, torch_dtype=weight_dtype).to(device)

    pixel_values = load_video(args.video_path)
    video_length = pixel_values.shape[1]

    # Fold frames into the batch axis for the VAE, then restore the frame axis.
    pixel_values = rearrange(pixel_values, "b f c h w -> (b f) c h w")
    condition_latents = pipe.vae.encode(pixel_values).latent_dist.sample()
    condition_latents = rearrange(condition_latents, "(b f) c h w -> b c f h w", f=video_length)
    # NOTE(review): 0.18215 is presumably the VAE latent scaling factor;
    # confirm against the checkpoint's VAE config.
    condition_latents = condition_latents * 0.18215

    guidance_scale = args.gs
    image_guidance_scale = args.igs
    num_inference_steps = 50

    pipe_output = pipe(args.expr, img_condition=condition_latents, num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale, image_guidance_scale=image_guidance_scale, generator=generator)

    video = pipe_output.videos

    # The leftover pdb breakpoint was removed here: it aborts non-interactive
    # runs (nohup/srun) before the result is written.
    output_path = f"./results/{args.expr}.mp4"
    save_videos_grid(video, output_path)

if __name__ == '__main__':
    # Example cluster launch command:
    #   srun --partition=s1_mm_research --job-name=layout_multi --nodes=1 --gres=gpu:1 --ntasks-per-node=1 --cpus-per-task=8 --kill-on-bad-exit=1 --quotatype=auto \
    #   nohup python -m inference_referring \
    #       --expr "remove the bird on left" \
    #       > nohup.out 2>&1 &
    #
    # Other sample clips previously used as the --video_path default (same
    # directory as the current one): 1_5691.mp4,
    # face_171927256762865370_raw.mp4, eating_171917109089910622_raw.mp4.
    option_table = (
        ('--video_path', {
            'help': 'path of the video folder',
            'default': '/mnt/pfs/users/lbg/project/Language-Inpainting/code/Language-Driven-Video-Inpainting-main/1_5620.mp4',
        }),
        ('--ckpt_path', {
            'help': 'path of the checkpoint folder',
            'default': '/mnt/pfs/users/lbg/project/Language-Inpainting/code/lgvi',
        }),
        ('--expr', {
            'help': 'referring expression',
            'default': 'remove the person on middle',
        }),
        ('--gs', {'type': float, 'help': 'language guidance scale', 'default': 3.0}),
        ('--igs', {'type': float, 'help': 'image guidance scale', 'default': 1.5}),
        ('--seed', {'type': int, 'help': 'random seed', 'default': 0}),
    )

    # Register the options in table order so the generated --help is unchanged.
    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    main(cli.parse_args())