diff --git a/perceptor/losses/__init__.py b/perceptor/losses/__init__.py
index 013a233..e11ec3f 100644
--- a/perceptor/losses/__init__.py
+++ b/perceptor/losses/__init__.py
@@ -6,7 +6,6 @@
 from .lpips import LPIPS
 from .super_resolution import SuperResolution, SuperResolutionDiscriminator
 from .memorability import Memorability
-from .midas_depth import MidasDepth
 from .open_clip import OpenCLIP
 from .resize import Resize
 from .ruclip import RuCLIP
diff --git a/perceptor/losses/midas_depth.py b/perceptor/losses/midas_depth.py
deleted file mode 100644
index c4cf3d6..0000000
--- a/perceptor/losses/midas_depth.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import torch.nn.functional as F
-
-from perceptor.transforms.resize import resize
-from perceptor.losses.interface import LossInterface
-from perceptor.models.midas_depth import (
-    MidasDepth as MidasDepthModel,
-)
-
-
-class MidasDepth(LossInterface):
-    def __init__(self, name="dpt_large"):
-        super().__init__()
-        self.model = MidasDepthModel(name=name)
-
-    def forward(self, images, depth_maps):
-        mask = depth_maps != -1
-        predicted_depth_maps = (
-            resize(
-                self.model(images)[:, None],
-                out_shape=depth_maps.shape[-2:],
-            )[:, 0]
-            * mask
-        )
-        return (
-            (normalize(predicted_depth_maps[mask]) - normalize(depth_maps[mask]))
-            .square()
-            .mean()
-        )
-
-
-def normalize(x):
-    return (x - x.mean()) / x.std()
-
-
-def test_midas_depth_loss():
-    import torch
-
-    loss = MidasDepth().cuda()
-    images = torch.zeros((1, 3, 256, 256)).cuda()
-    depth_maps = torch.zeros((1, 3, 256, 256)).cuda()
-    loss(images, depth_maps)
diff --git a/perceptor/losses/open_clip.py b/perceptor/losses/open_clip.py
index b27100b..a6b1627 100644
--- a/perceptor/losses/open_clip.py
+++ b/perceptor/losses/open_clip.py
@@ -5,7 +5,31 @@
 
 
 class OpenCLIP(LossInterface):
-    def __init__(self, architecture="ViT-B-32-quickgelu", weights="laion400m_e31"):
+    def __init__(self, architecture="ViT-B-32", weights="laion2b_e16"):
+        """
+        Args:
+            architecture (str): name of the clip model
+            weights (str): name of the weights
+
+        Available weight/model combinations are (in order of relevance):
+        - ("ViT-B-32", "laion2b_e16") (65.62%)
+        - ("ViT-B-16-plus-240", "laion400m_e32") (69.21%)
+        - ("ViT-B-16", "laion400m_e32") (67.07%)
+        - ("ViT-B-32", "laion400m_e32") (62.96%)
+        - ("ViT-L-14", "laion400m_e32") (72.77%)
+        - ("RN101", "yfcc15m") (34.8%)
+        - ("RN50", "yfcc15m") (32.7%)
+        - ("RN50", "cc12m") (36.45%)
+        - ("RN50-quickgelu", "openai")
+        - ("RN101-quickgelu", "openai")
+        - ("RN50x4", "openai")
+        - ("RN50x16", "openai")
+        - ("RN50x64", "openai")
+        - ("ViT-B-32-quickgelu", "openai")
+        - ("ViT-B-16", "openai")
+        - ("ViT-L-14", "openai")
+        - ("ViT-L-14-336", "openai")
+        """
         super().__init__()
         self.architecture = architecture
         self.weights = weights
diff --git a/perceptor/models/monster_diffusion/monster_diffusion.py b/perceptor/models/monster_diffusion/monster_diffusion.py
index c8f2a41..4a4e1aa 100644
--- a/perceptor/models/monster_diffusion/monster_diffusion.py
+++ b/perceptor/models/monster_diffusion/monster_diffusion.py
@@ -41,23 +41,15 @@
         random_ts = (diffusion.P_mean + torch.randn(size) * diffusion.P_std).exp()
         return random_ts
 
-    @staticmethod
-    def schedule_ts(n_steps):
-        ramp = torch.linspace(0, 1, n_steps)
+    def _schedule_ts(self, n_steps):
+        ramp = torch.linspace(0, 1, n_steps).to(self.device)
         min_inv_rho = diffusion.sigma_min ** (1 / diffusion.rho)
         max_inv_rho = diffusion.sigma_max ** (1 / diffusion.rho)
         return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** diffusion.rho
 
-    @staticmethod
-    def evaluation_ts():
-        n_steps = 1000
-        schedule_ts = MonsterDiffusion.schedule_ts(n_steps)
-        return torch.cat(
-            [
-                schedule_ts,
-                MonsterDiffusion.reversed_ts(schedule_ts, n_steps),
-            ]
-        ).unique()
+    def schedule_ts(self, n_steps):
+        schedule_ts = self._schedule_ts(n_steps)
+        return zip(schedule_ts[:-1], schedule_ts[1:])
 
     @staticmethod
     def sigmas(ts):
@@ -70,8 +62,7 @@ def alphas(ts):
     @staticmethod
     def random_noise(size):
         return standardize.decode(
-            torch.randn(size, *settings.INPUT_SHAPE)
-            * MonsterDiffusion.sigmas(MonsterDiffusion.schedule_ts(100)[:1])
+            torch.randn(size, *settings.INPUT_SHAPE) * diffusion.sigma_max
         )
 
     @staticmethod
@@ -147,7 +138,7 @@ def forward(
         return PredictionBatch(
             denoised_xs=denoised_xs,
             diffused_images=diffused_images,
-            ts=ts,
+            ts=torch.as_tensor(ts).flatten().to(self.device),
         )
 
     def predictions_(
@@ -237,11 +228,12 @@ def elucidated_sample(
         )
 
         n_steps = n_evaluations // 2
-        schedule_ts = self.schedule_ts(n_steps)[:, None].repeat(1, size).to(self.device)
         i = 0
         progress = tqdm(total=n_steps, disable=not progress, leave=False)
-        for from_ts, to_ts in zip(schedule_ts[:-1], schedule_ts[1:]):
-            reversed_ts = self.reversed_ts(from_ts, n_steps).clamp(max=schedule_ts[0])
+        for from_ts, to_ts in self.schedule_ts(n_steps):
+            reversed_ts = self.reversed_ts(from_ts, n_steps).clamp(
+                max=diffusion.sigma_max
+            )
             reversed_diffused_images = self.inject_noise(
                 diffused_images, from_ts, reversed_ts
             )
@@ -317,13 +309,11 @@ def linear_multistep_sample(
         diffused_images = diffused_images.to(self.device)
 
         n_steps = n_evaluations
-        schedule_ts = self.schedule_ts(n_steps)[:, None].repeat(1, size).to(self.device)
+        schedule_ts = self._schedule_ts(n_steps)
        epses = list()
 
        progress = tqdm(total=n_steps, disable=not progress, leave=False)
-        for from_index, from_ts, to_ts in zip(
-            range(n_steps), schedule_ts[:-1], schedule_ts[1:]
-        ):
+        for from_index, (from_ts, to_ts) in enumerate(self.schedule_ts(n_steps)):
             predictions = self.predictions(
                 diffused_images,
@@ -338,7 +328,7 @@
             coeffs = [
                 self.linear_multistep_coeff(
                     current_order,
-                    self.sigmas(schedule_ts[:, 0]).cpu().flatten(),
+                    self.sigmas(schedule_ts).cpu().flatten(),
                     from_index,
                     to_index,
                 )
@@ -364,5 +354,18 @@
 
 
 def test_monster_diffusion():
+    from perceptor import utils
+
+    model = MonsterDiffusion().cuda()
+    for images in model.sample(size=1, n_evaluations=50):
+        pass
+    utils.pil_image(images).save("tests/monster_diffusion.png")
+
+
+def test_monster_diffusion_lms():
+    from perceptor import utils
+
     model = MonsterDiffusion().cuda()
-    model.sample(size=1, n_evaluations=4)
+    for images in model.linear_multistep_sample(size=1, n_evaluations=50):
+        pass
+    utils.pil_image(images).save("tests/monster_diffusion_lms.png")
diff --git a/perceptor/models/monster_diffusion/prediction.py b/perceptor/models/monster_diffusion/prediction.py
index 734da5a..c494f15 100644
--- a/perceptor/models/monster_diffusion/prediction.py
+++ b/perceptor/models/monster_diffusion/prediction.py
@@ -60,6 +60,10 @@ def __iter__(self):
 
     @staticmethod
     def sigmas(ts):
+        if isinstance(ts, float):
+            ts = torch.as_tensor(ts)
+        if ts.ndim == 0:
+            return torch.full((1,), ts).to(ts.device)
         return ts[:, None, None, None]
 
     @staticmethod
diff --git a/perceptor/models/open_clip.py b/perceptor/models/open_clip.py
index 468eea3..6ae4862 100644
--- a/perceptor/models/open_clip.py
+++ b/perceptor/models/open_clip.py
@@ -12,8 +12,8 @@ class OpenCLIP(torch.nn.Module):
     def __init__(self, archicture="ViT-B-32", weights="laion2b_e16"):
         """
         Args:
-            archicture: name of the clip model
-            weights: name of the weights
+            archicture (str): name of the clip model
+            weights (str): name of the weights
 
         Available weight/model combinations are (in order of relevance):
         - ("ViT-B-32", "laion2b_e16") (65.62%)
diff --git a/perceptor/models/velocity_diffusion/velocity_diffusion.py b/perceptor/models/velocity_diffusion/velocity_diffusion.py
index 143f010..f90153c 100644
--- a/perceptor/models/velocity_diffusion/velocity_diffusion.py
+++ b/perceptor/models/velocity_diffusion/velocity_diffusion.py
@@ -45,13 +45,17 @@ def shape(self):
         return self.model.shape
 
     @staticmethod
-    def schedule_ts(n_steps, from_sigma=1, to_sigma=1e-2, rho=0.7):
+    def schedule_ts(n_steps=500, from_sigma=1, to_sigma=1e-2, rho=0.7):
         ramp = torch.linspace(0, 1, n_steps + 1)
         min_inv_rho = to_sigma ** (1 / rho)
         max_inv_rho = from_sigma ** (1 / rho)
-        return Model.sigmas_to_ts(
+        schedule_ts = Model.sigmas_to_ts(
             (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
         )
+        return zip(schedule_ts[:-1], schedule_ts[1:])
+
+    def random_diffused(self, shape):
+        return diffusion_space.decode(torch.randn(shape)).to(self.device)
 
     @staticmethod
     def sigmas_to_ts(sigmas):
@@ -143,11 +147,9 @@ def test_velocity_diffusion():
 
     n_iterations = 3
 
-    steps = diffusion.schedule_ts(n_iterations, from_sigma=1.0, rho=0.7)
-    diffused_images = torch.randn((1, 3, 512, 512)).to(device).add(1).div(2)
-    for from_ts, to_ts in zip(steps[:-1], steps[1:]):
+    for from_ts, to_ts in diffusion.schedule_ts(n_iterations, from_sigma=1.0, rho=0.7):
         if (from_ts < 1.0).all():
             new_from_ts = from_ts * 1.003
 
         diffused_images = diffusion.predictions(
diff --git a/perceptor/utils/pil_image.py b/perceptor/utils/pil_image.py
index 2a95e38..bb51aed 100644
--- a/perceptor/utils/pil_image.py
+++ b/perceptor/utils/pil_image.py
@@ -1,7 +1,9 @@
+from PIL import Image
 from torchvision.transforms.functional import to_pil_image
+from lantern import Tensor
 
 
-def pil_image(images):
+def pil_image(images: Tensor) -> Image:
     if images.max() > 1 or images.min() < 0:
         print("Warning: images are not in range [0, 1]")
     n, c, h, w = images.shape
diff --git a/pyproject.toml b/pyproject.toml
index 9404f87..513369c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "perceptor"
-version = "0.4.0"
+version = "0.5.0"
 description = ""
 authors = ["Richard Löwenström ", "dribnet"]
 readme = "README.md"
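A minimal usage sketch of the sampling API after this change, mirroring the updated tests above; the module import path for MonsterDiffusion is assumed from the file layout rather than taken from the package's public exports, and the output filename is only illustrative.

from perceptor import utils
from perceptor.models.monster_diffusion.monster_diffusion import MonsterDiffusion

model = MonsterDiffusion().cuda()

# schedule_ts(n_steps) is now an instance method that yields (from_ts, to_ts)
# pairs on the model's device instead of returning a flat tensor of timesteps.
for from_ts, to_ts in model.schedule_ts(25):
    pass  # each pair describes one denoising step from the noisier from_ts down to to_ts

# The samplers are consumed as generators; as in the updated tests, the last
# yielded batch is kept and saved via utils.pil_image.
for images in model.sample(size=1, n_evaluations=50):
    pass
utils.pil_image(images).save("monster_diffusion_sample.png")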