Spaces:

innat
/

VideoMAE

Running

App Files Files Community

VideoMAE / utils.py

innat

Update utils.py

93ca8bb 12 months ago

raw

history blame

No virus

3.61 kB


	import tensorflow as tf
	import numpy as np
	from einops import rearrange
	from decord import VideoReader

	num_frames = 16
	input_size = 224
	patch_size = (16, 16)
	IMAGENET_MEAN = np.array([0.45, 0.45, 0.45])
	IMAGENET_STD = np.array([0.225, 0.225, 0.225])

	def format_frames(frame, output_size):
	frame = tf.image.convert_image_dtype(frame, tf.uint8)
	frame = tf.image.resize(frame, size=output_size)
	frame = frame / 255.
	frame = frame - IMAGENET_MEAN
	frame = frame / IMAGENET_STD
	return frame

	def read_video(file_path):
	container = VideoReader(file_path)
	return container

	def frame_sampling(container, num_frames):
	interval = len(container) // num_frames
	bids = np.arange(num_frames) * interval
	offset = np.random.randint(interval, size=bids.shape)
	frame_index = bids + offset
	frames = container.get_batch(frame_index).asnumpy()
	frames = np.stack(frames)
	frames = format_frames(frames, [input_size] * 2)
	return frames

	def denormalize(image):
	image = image.numpy() if not isinstance(image, np.ndarray) else image
	image = image * IMAGENET_STD + IMAGENET_MEAN
	image = (image * 255).clip(0, 255).astype('uint8')
	return image

	def reconstrunction(input_frame, bool_mask, pretrained_pred):
	img_squeeze = rearrange(
	input_frame.numpy(),
	'b (t p0) (h p1) (w p2) c -> b (t h w) (p0 p1 p2) c',
	p0=2, p1=patch_size[0], p2=patch_size[0]
	)
	img_mean = np.mean(img_squeeze, axis=-2, keepdims=True)
	img_variance = np.var(img_squeeze, axis=-2, ddof=1, keepdims=True)
	img_norm = (img_squeeze - img_mean) / (np.sqrt(img_variance) + 1e-6)
	img_patch = rearrange(img_norm, 'b n p c -> b n (p c)')
	img_patch[bool_mask] = pretrained_pred

	# make mask
	mask = np.ones_like(img_patch)
	mask[bool_mask] = 0
	mask = rearrange(
	mask, 'b n (p c) -> b n p c', c=3
	)
	mask = rearrange(
	mask,
	'b (t h w) (p0 p1 p2) c -> b (t p0) (h p1) (w p2) c',
	p0=2, p1=patch_size[0], p2=patch_size[1], h=14, w=14
	)

	#save reconstruction video
	rec_img = rearrange(img_patch, 'b n (p c) -> b n p c', c=3)

	# Notice: To visualize the reconstruction video, we add the predict and the original mean and var of each patch.
	img_mean = np.mean(img_squeeze, axis=-2, keepdims=True)
	img_std = np.sqrt(np.var(img_squeeze, axis=-2, ddof=1, keepdims=True) + 1e-6)
	rec_img = rec_img * img_std + img_mean
	rec_img = rearrange(
	rec_img,
	'b (t h w) (p0 p1 p2) c -> b (t p0) (h p1) (w p2) c',
	p0=2, p1=patch_size[0], p2=patch_size[1], h=14, w=14
	)

	return (
	rec_img[0],
	mask[0]
	)


	class TubeMaskingGenerator:
	def __init__(self, input_size, mask_ratio):
	self.frames, self.height, self.width = input_size
	self.num_patches_per_frame = self.height * self.width
	self.total_patches = self.frames * self.num_patches_per_frame
	self.num_masks_per_frame = int(mask_ratio * self.num_patches_per_frame)
	self.total_masks = self.frames * self.num_masks_per_frame

	def __repr__(self):
	repr_str = "Maks: total patches {}, mask patches {}".format(
	self.total_patches, self.total_masks
	)
	return repr_str

	def __call__(self):
	mask_per_frame = np.hstack([
	np.zeros(self.num_patches_per_frame - self.num_masks_per_frame),
	np.ones(self.num_masks_per_frame),
	])
	np.random.shuffle(mask_per_frame)
	mask = np.tile(mask_per_frame, (self.frames,1)).flatten()
	return mask