inpainting 语言驱动
目录
language 驱动
安装xformers
结论: inpaint Anything好:
分辨率 宽和高需要是64的倍数。
推理代码:
AVID 2024.11.15 没开源
language 驱动
GitHub - jianzongwu/Language-Driven-Video-Inpainting: (CVPR 2024) Official code for paper "Towards Language-Driven Video Inpainting via Multimodal Large Language Models"
安装xformers
- (RECOMMENDED, linux) Install latest stable with conda: Requires PyTorch 2.5.1 installed with conda
# (python 3.10/3.11 only)
conda install xformers -c xformers
- (RECOMMENDED, linux & win) Install latest stable with pip: Requires PyTorch 2.5.1
# [linux only] cuda 11.8 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118
# [linux only] cuda 12.1 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
# [linux & win] cuda 12.4 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu124
# [linux only] (EXPERIMENTAL) rocm 6.1 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/rocm6.1
- Development binaries:
# Use either conda or pip, same requirements as for the stable version above
conda install xformers -c xformers/label/dev
pip install --pre -U xformers
- Install from source: If you want to use with another version of PyTorch for instance (including nightly-releases)
# (Optional) Makes the build much faster
pip install ninja
# Set TORCH_CUDA_ARCH_LIST if running and building on different GPU types
pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
# (this can take dozens of minutes)
提示词:remove the person on middle
效果图:
结论: inpaint Anything好:
分辨率 宽和高需要是64的倍数。
huggingface-cli download --resume-download jianzongwu/lgvi-i --local-dir ckpt
推理代码:
import os
import argparse
from einops import rearrange
import random
import numpy as np
from PIL import Image
import torch
import cv2
import sys
import os
os.chdir(os.path.dirname(os.path.abspath(__file__)))
import torchvision
from rovi.models.unet import RoviModel
from rovi.pipelines.pipeline_rovi import RoviPipeline
import pdb
# WIDTH = 768
# HEIGHT = 384
def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=4, fps=8):
    """Tile a batch of videos into one grid per time step and write it as a video.

    Args:
        videos: Tensor laid out as (batch, channels, time, height, width).
        path: Output file path. Its parent directory is created when missing.
        rescale: When True, values are mapped from [-1, 1] to [0, 1] before the
            8-bit conversion; otherwise they are assumed to already be in [0, 1].
        n_rows: Forwarded to ``torchvision.utils.make_grid`` as ``nrow``.
        fps: Frame rate of the written video.
    """
    # Imported here because the file's import section does not import imageio.
    import imageio

    per_frame = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for frame_batch in per_frame:
        grid = torchvision.utils.make_grid(frame_batch, nrow=n_rows)
        # (C, H, W) -> (H, W, C), the layout expected for image frames.
        grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1
        # .cpu() is a no-op for CPU tensors and makes GPU tensors convertible.
        outputs.append((grid * 255).cpu().numpy().astype(np.uint8))

    # os.makedirs("") raises, so only create a directory when the path has one.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Honour the caller-supplied frame rate instead of a hard-coded value.
    imageio.mimsave(path, outputs, fps=fps)
def resize_image(image, max_area=600 * 400):
    """Shrink ``image`` so both sides are multiples of 64 and the area is below ``max_area``.

    Scales from 100% down to 21% in 1% steps are tried in order; the first
    scale whose 64-aligned size has a non-zero area strictly smaller than
    ``max_area`` is used, i.e. the largest admissible size. The image is never
    enlarged, and scales of 20% or less are not considered so that the frame is
    not shrunk excessively.

    Args:
        image: Array whose first two dimensions are (height, width).
        max_area: Exclusive upper bound on ``new_width * new_height``.

    Returns:
        The resized image as returned by ``cv2.resize``.

    Raises:
        ValueError: If no tried scale yields a valid size (for example a side
            shorter than 64 pixels, or an image still too large at 21%).
            Previously this case silently returned ``None``.
    """
    original_height, original_width = image.shape[:2]
    min_scale = 0.2

    for percent in range(100, int(min_scale * 100), -1):
        scale = percent / 100
        # Round each side down to the nearest multiple of 64.
        new_width = int(original_width * scale / 64) * 64
        new_height = int(original_height * scale / 64) * 64
        if 0 < new_width * new_height < max_area:
            return cv2.resize(image, (new_width, new_height))

    raise ValueError(
        f"cannot fit a {original_width}x{original_height} image under "
        f"max_area={max_area} with sides that are multiples of 64"
    )
# Expose only GPU index 4 to this process, so the single visible device is
# addressed as cuda:0 below. NOTE(review): the index is machine-specific.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
device = torch.device("cuda", 0)
def load_video(video_path):
    """Read up to 80 frames from a video file into a normalized CUDA tensor.

    Each frame is resized with ``resize_image`` and converted from BGR to RGB.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.

    Returns:
        A half-precision CUDA tensor with values in [-1, 1], laid out as
        (1, frames, channels, height, width) assuming each decoded frame is
        (height, width, channels).

    Raises:
        ValueError: If no frame could be read from ``video_path``.
    """
    max_num_frames = 80
    frames = []

    cap = cv2.VideoCapture(video_path)
    try:
        while len(frames) < max_num_frames:
            _, img0 = cap.read()
            if img0 is None:
                # End of stream or unreadable source.
                break
            img0 = resize_image(img0)
            frames.append(cv2.cvtColor(img0, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the decoder handle, even if resizing fails.
        cap.release()

    if not frames:
        raise ValueError(f"no frames could be read from {video_path!r}")

    print("frame shape", frames[0].shape)
    print("len(frames)", len(frames))

    # (H, W, C) frames stacked on axis 2 -> (H, W, T, C) -> (T, C, H, W) -> (1, T, C, H, W).
    frames = np.stack(frames, axis=2)
    frames = torch.from_numpy(frames).permute(2, 3, 0, 1).contiguous().unsqueeze(0)
    # uint8 [0, 255] -> float [0, 1] -> half on GPU -> [-1, 1].
    frames = frames.float().div(255).clamp(0, 1).half().cuda() * 2.0 - 1.0
    return frames
@torch.no_grad()
def main(args):
    """Run language-driven inpainting on one video and save the result as an mp4.

    Args:
        args: Parsed command-line namespace providing ``seed``, ``ckpt_path``,
            ``video_path``, ``expr``, ``gs`` and ``igs``.

    The output is written to ``./results/<expr>.mp4``.
    """
    # Seed every RNG in use so repeated runs start from the same state.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    generator = torch.Generator(device)
    generator.manual_seed(args.seed)

    weight_dtype = torch.float16
    unet = RoviModel.from_pretrained(args.ckpt_path, subfolder='unet')
    unet.to(device).to(weight_dtype)
    pipe = RoviPipeline.from_pretrained(args.ckpt_path, unet=unet, torch_dtype=weight_dtype).to(device)

    pixel_values = load_video(args.video_path)
    video_length = pixel_values.shape[1]

    # Encode every frame independently, then restore the frame axis.
    pixel_values = rearrange(pixel_values, "b f c h w -> (b f) c h w")
    condition_latents = pipe.vae.encode(pixel_values).latent_dist.sample()
    condition_latents = rearrange(condition_latents, "(b f) c h w -> b c f h w", f=video_length)
    # Presumably the VAE latent scaling factor expected by the pipeline - TODO confirm.
    condition_latents = condition_latents * 0.18215

    guidance_scale = args.gs
    image_guidance_scale = args.igs
    num_inference_steps = 50
    pipe_output = pipe(args.expr, img_condition=condition_latents, num_inference_steps=num_inference_steps,
                       guidance_scale=guidance_scale, image_guidance_scale=image_guidance_scale, generator=generator)
    video = pipe_output.videos

    output_path = f"./results/{args.expr}.mp4"
    # No debugger breakpoint here: it would stall or abort non-interactive runs.
    # fps is passed explicitly so the output stays at 25 frames per second.
    save_videos_grid(video, output_path, fps=25)
if __name__ == '__main__':
    # Example cluster launch:
    #   srun --partition=s1_mm_research --job-name=layout_multi --nodes=1 --gres=gpu:1 \
    #       --ntasks-per-node=1 --cpus-per-task=8 --kill-on-bad-exit=1 --quotatype=auto \
    #   nohup python -m inference_referring \
    #       --expr "remove the bird on left" \
    #       > nohup.out 2>&1 &
    parser = argparse.ArgumentParser()
    # Other sample clips previously used as the default, in the same directory:
    # 1_5691.mp4, face_171927256762865370_raw.mp4, eating_171917109089910622_raw.mp4
    parser.add_argument('--video_path', help='path of the input video file', default='/mnt/pfs/users/lbg/project/Language-Inpainting/code/Language-Driven-Video-Inpainting-main/1_5620.mp4')
    parser.add_argument('--ckpt_path', help='path of the checkpoint folder', default='/mnt/pfs/users/lbg/project/Language-Inpainting/code/lgvi')
    parser.add_argument('--expr', help='referring expression', default='remove the person on middle')
    parser.add_argument('--gs', type=float, help='language guidance scale', default=3.0)
    parser.add_argument('--igs', type=float, help='image guidance scale', default=1.5)
    parser.add_argument('--seed', type=int, help='random seed', default=0)
    args = parser.parse_args()
    main(args)
AVID 2024.11.15 没开源
https://github.com/zhang-zx/AVID