Commit

feat: merge

AshishKumar4 committed Aug 10, 2024
2 parents 76c40ce + 5984fab commit 68ba3f7

Showing 14 changed files with 1,697 additions and 504 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ good models
.env
tensorboard
wandb
gs:
gcs_mount
datacache
*.deb
17 changes: 17 additions & 0 deletions README.md
@@ -221,6 +221,23 @@ plotImages(samples, dpi=300)

## Gallery

### Images generated by Euler Ancestral Sampler in 200 Steps [text2image with CFG]
Model trained on Laion-Aesthetics 12M + CC12M + MS COCO + 1M aesthetic 6+ subset of COYO-700M on TPU-v4-32:
`a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful landscape with a river with mountains, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a beautiful forest with a river and sunlight, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden, a big mansion with a garden`

**Params**:
`Dataset: Laion-Aesthetics 12M + CC12M + MS COCO + 1M aesthetic 6+ subset of COYO-700M`
`Batch size: 256`
`Image Size: 128`
`Training Epochs: 5`
`Steps per epoch: 74573`
`Model Configurations: feature_depths=[128, 256, 512, 1024]`

`Training Noise Schedule: EDMNoiseScheduler`
`Inference Noise Schedule: KarrasEDMPredictor`

![EulerA with CFG](images/medium_epoch5.png)
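
The samples above are produced with classifier-free guidance (CFG) at inference time. As a rough illustration of what CFG does at each of the 200 Euler Ancestral steps (a generic sketch, not the flaxdiff API; the function name and arguments below are made up):

```python
import jax.numpy as jnp

def cfg_combine(eps_uncond: jnp.ndarray, eps_cond: jnp.ndarray, guidance_scale: float) -> jnp.ndarray:
    """Classifier-free guidance: push the conditional prediction away from
    the unconditional one by the guidance factor."""
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)
```

At every sampler step the model is evaluated twice, once with the text embedding and once with the unconditional (null) embedding, and the two predictions are merged as above before the Euler Ancestral update.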

### Images generated by Euler Ancestral Sampler in 200 Steps [text2image with CFG]
Images generated by the following prompts using classifier free guidance with guidance factor = 2:
`'water tulip, a water lily, a water lily, a water lily, a photo of a marigold, a water lily, a water lily, a photo of a lotus, a photo of a lotus, a photo of a lotus, a photo of a rose, a photo of a rose, a photo of a rose, a photo of a rose, a photo of a rose'`
194 changes: 188 additions & 6 deletions datasets/dataset preparations.ipynb

Large diffs are not rendered by default.

1,430 changes: 1,125 additions & 305 deletions evaluate.ipynb

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions flaxdiff/models/attention.py
@@ -156,7 +156,9 @@ def __call__(self, x, context=None):
value = self.value(context)

hidden_states = nn.dot_product_attention(
query, key, value, dtype=self.dtype, broadcast_dropout=False, dropout_rng=None, precision=self.precision
query, key, value, dtype=self.dtype, broadcast_dropout=False,
dropout_rng=None, precision=self.precision, force_fp32_for_softmax=True,
deterministic=True
)
proj = self.proj_attn(hidden_states)
proj = proj.reshape(orig_x_shape)
@@ -187,7 +189,7 @@ def setup(self):

def __call__(self, hidden_states):
hidden_states = self.proj(hidden_states)
hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=3)
hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=-1)
return hidden_linear * nn.gelu(hidden_gelu)

class FlaxFeedForward(nn.Module):
@@ -291,14 +293,14 @@ class TransformerBlock(nn.Module):
dtype: Optional[Dtype] = None
precision: PrecisionLike = None
use_projection: bool = False
use_flash_attention:bool = True
use_self_and_cross:bool = False
use_flash_attention:bool = False
use_self_and_cross:bool = True
only_pure_attention:bool = False

@nn.compact
def __call__(self, x, context=None):
inner_dim = self.heads * self.dim_head
B, H, W, C = x.shape
C = x.shape[-1]
normed_x = nn.RMSNorm(epsilon=1e-5, dtype=self.dtype)(x)
if self.use_projection == True:
if self.use_linear_attention:
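
The GEGLU split fix above (axis=3 to axis=-1) matters because the same feed-forward projection is applied both to 4-D feature maps and to flattened (B, N, C) token sequences. A minimal sketch of the pattern, illustrative only and not the exact flaxdiff module:

```python
import jax.numpy as jnp
from flax import linen as nn

class GEGLUSketch(nn.Module):
    """Project to twice the output width, split on the last axis,
    and gate one half with GELU."""
    dim_out: int

    @nn.compact
    def __call__(self, x):
        projected = nn.Dense(self.dim_out * 2)(x)
        # axis=-1 stays correct whether x is (B, H, W, C) or (B, N, C),
        # which is exactly what the axis=3 -> axis=-1 change guards against.
        linear, gate = jnp.split(projected, 2, axis=-1)
        return linear * nn.gelu(gate)
```

The same file also forces the attention softmax to run in fp32 and flips the TransformerBlock defaults to use_flash_attention=False and use_self_and_cross=True, matching the text-conditioning path used elsewhere in this commit.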
14 changes: 12 additions & 2 deletions flaxdiff/models/common.py
@@ -5,6 +5,7 @@
from flax.typing import Dtype, PrecisionLike
from typing import Dict, Callable, Sequence, Any, Union
import einops
from functools import partial

# Kernel initializer to use
def kernel_init(scale, dtype=jnp.float32):
@@ -266,11 +267,20 @@ class ResidualBlock(nn.Module):
kernel_init:Callable=kernel_init(1.0)
dtype: Optional[Dtype] = None
precision: PrecisionLike = None

def setup(self):
if self.norm_groups > 0:
norm = partial(nn.GroupNorm, self.norm_groups)
else:
norm = partial(nn.RMSNorm, 1e-5)

self.norm1 = norm()
self.norm2 = norm()

@nn.compact
def __call__(self, x:jax.Array, temb:jax.Array, textemb:jax.Array=None, extra_features:jax.Array=None):
residual = x
out = nn.GroupNorm(self.norm_groups)(x)
out = self.norm1(x)
# out = nn.RMSNorm()(x)
out = self.activation(out)

@@ -295,7 +305,7 @@ def __call__(self, x:jax.Array, temb:jax.Array, textemb:jax.Array=None, extra_fe
# out = out * (1 + scale) + shift
out = out + temb

out = nn.GroupNorm(self.norm_groups)(out)
out = self.norm2(out)
# out = nn.RMSNorm()(out)
out = self.activation(out)

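
For context, the ResidualBlock change moves normalization selection into setup() so the block can switch between GroupNorm and RMSNorm based on norm_groups, and reuses the chosen constructor for both norm layers. A self-contained sketch of that pattern (module and field names here are illustrative):

```python
from functools import partial
from flax import linen as nn

class NormSelectSketch(nn.Module):
    """Pick the normalization layer once in setup(), reuse it twice."""
    norm_groups: int = 8

    def setup(self):
        if self.norm_groups > 0:
            # GroupNorm takes num_groups as its first argument.
            norm = partial(nn.GroupNorm, self.norm_groups)
        else:
            # RMSNorm takes epsilon as its first argument.
            norm = partial(nn.RMSNorm, 1e-5)
        self.norm1 = norm()
        self.norm2 = norm()

    def __call__(self, x):
        return self.norm2(nn.relu(self.norm1(x)))
```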
24 changes: 17 additions & 7 deletions flaxdiff/models/simple_unet.py
@@ -6,6 +6,7 @@
import einops
from .common import kernel_init, ConvLayer, Downsample, Upsample, FourierEmbedding, TimeProjection, ResidualBlock
from .attention import TransformerBlock
from functools import partial

class Unet(nn.Module):
output_channels:int=3
@@ -19,6 +20,15 @@ class Unet(nn.Module):
dtype: Optional[Dtype] = None
precision: PrecisionLike = None

def setup(self):
if self.norm_groups > 0:
norm = partial(nn.GroupNorm, self.norm_groups)
else:
norm = partial(nn.RMSNorm, 1e-5)

# self.last_up_norm = norm()
self.conv_out_norm = norm()

@nn.compact
def __call__(self, x, temb, textcontext):
# print("embedding features", self.emb_features)
@@ -69,7 +79,7 @@ def __call__(self, x, temb, textcontext):
use_projection=attention_config.get("use_projection", False),
use_self_and_cross=attention_config.get("use_self_and_cross", True),
precision=attention_config.get("precision", self.precision),
only_pure_attention=True,
only_pure_attention=attention_config.get("only_pure_attention", True),
name=f"down_{i}_attention_{j}")(x, textcontext)
# print("down residual for feature level", i, "is of shape", x.shape, "features", dim_in)
downs.append(x)
@@ -107,8 +117,8 @@ def __call__(self, x, temb, textcontext):
use_linear_attention=False,
use_projection=middle_attention.get("use_projection", False),
use_self_and_cross=False,
precision=attention_config.get("precision", self.precision),
only_pure_attention=True,
precision=middle_attention.get("precision", self.precision),
only_pure_attention=middle_attention.get("only_pure_attention", True),
name=f"middle_attention_{j}")(x, textcontext)
x = ResidualBlock(
middle_conv_type,
@@ -150,7 +160,7 @@ def __call__(self, x, temb, textcontext):
use_projection=attention_config.get("use_projection", False),
use_self_and_cross=attention_config.get("use_self_and_cross", True),
precision=attention_config.get("precision", self.precision),
only_pure_attention=True,
only_pure_attention=attention_config.get("only_pure_attention", True),
name=f"up_{i}_attention_{j}")(x, textcontext)
# print("Upscaling ", i, x.shape)
if i != len(feature_depths) - 1:
@@ -163,13 +173,13 @@ def __call__(self, x, temb, textcontext):
precision=self.precision
)(x)

# x = nn.GroupNorm(8)(x)
# x = self.last_up_norm(x)
x = ConvLayer(
conv_type,
features=self.feature_depths[0],
kernel_size=(3, 3),
strides=(1, 1),
kernel_init=kernel_init(0.0),
kernel_init=kernel_init(1.0),
dtype=self.dtype,
precision=self.precision
)(x)
@@ -189,7 +199,7 @@ def __call__(self, x, temb, textcontext):
precision=self.precision
)(x, temb)

x = nn.GroupNorm(self.norm_groups)(x)
x = self.conv_out_norm(x)
x = self.activation(x)

noise_out = ConvLayer(
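
The attention changes in this file replace the hard-coded only_pure_attention=True with attention_config.get("only_pure_attention", True), so the flag can be overridden per feature level while the old behaviour stays the default. A hedged sketch of what such a per-level config list could look like (only the keys visible in the .get(...) calls above are shown; the overall structure is an assumption):

```python
# Hypothetical per-level attention configs; None could mean "no attention here".
attention_configs = [
    None,
    {"use_projection": False, "use_self_and_cross": True,
     "only_pure_attention": False},  # explicit override of the new default
    {"use_projection": False, "use_self_and_cross": True},
    # "only_pure_attention" omitted -> falls back to True via .get(..., True)
]
```

The file also mirrors the norm-selection refactor from common.py and switches the penultimate ConvLayer from a zero to a unit kernel initializer.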
29 changes: 13 additions & 16 deletions flaxdiff/models/simple_vit.py
@@ -4,7 +4,7 @@
import jax.numpy as jnp
from flax import linen as nn
from typing import Callable, Any
from .simply_unet import FourierEmbedding, TimeProjection, ConvLayer, kernel_init
from .simple_unet import FourierEmbedding, TimeProjection, ConvLayer, kernel_init
from .attention import TransformerBlock

class PatchEmbedding(nn.Module):
@@ -40,34 +40,35 @@ def __call__(self, x):
class TransformerEncoder(nn.Module):
num_layers: int
num_heads: int
mlp_dim: int
dropout_rate: float = 0.1
dtype: Any = jnp.float32
precision: Any = jax.lax.Precision.HIGH
use_projection: bool = False

@nn.compact
def __call__(self, x, training=True):
def __call__(self, x, context=None):
for _ in range(self.num_layers):
x = TransformerBlock(
heads=self.num_heads,
dim_head=x.shape[-1] // self.num_heads,
mlp_dim=self.mlp_dim,
dropout_rate=self.dropout_rate,
dtype=self.dtype,
precision=self.precision
)(x)
precision=self.precision,
use_self_and_cross=True,
use_projection=self.use_projection,
)(x, context)
return x

class VisionTransformer(nn.Module):
patch_size: int = 16
embedding_dim: int = 768
num_layers: int = 12
num_heads: int = 12
mlp_dim: int = 3072
emb_features: int = 256
dropout_rate: float = 0.1
dtype: Any = jnp.float32
precision: Any = jax.lax.Precision.HIGH
use_projection: bool = False

@nn.compact
def __call__(self, x, temb, textcontext=None):
@@ -81,27 +82,23 @@ def __call__(self, x, temb, textcontext=None):

# Add positional encoding
x = PositionalEncoding(max_len=x.shape[1], embedding_dim=self.embedding_dim)(x)

num_patches = x.shape[1]

# Add time embedding
temb = jnp.expand_dims(temb, axis=1)
x = jnp.concatenate([x, temb], axis=1)

# Add text context
if textcontext is not None:
x = jnp.concatenate([x, textcontext], axis=1)

# Transformer encoder
x = TransformerEncoder(
num_layers=self.num_layers,
num_heads=self.num_heads,
mlp_dim=self.mlp_dim,
dropout_rate=self.dropout_rate,
dtype=self.dtype,
precision=self.precision
)(x)
precision=self.precision,
use_projection=self.use_projection
)(x, textcontext)

# Extract the image tokens (exclude time and text embeddings)
num_patches = (x.shape[1] - 1 - (0 if textcontext is None else textcontext.shape[1]))
x = x[:, :num_patches, :]

# Reshape to image dimensions
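
The rewritten tail of the ViT forward pass recovers the patch tokens by subtracting the time token and the text tokens from the sequence length instead of relying on a patch count captured before concatenation. A shape-only sketch of that bookkeeping (all sizes below are made-up examples):

```python
import jax.numpy as jnp

B, N, D, T = 2, 64, 768, 77            # batch, patches, width, text tokens (illustrative)
patches = jnp.zeros((B, N, D))
time_token = jnp.zeros((B, 1, D))      # temb after jnp.expand_dims(temb, axis=1)
text_tokens = jnp.zeros((B, T, D))

x = jnp.concatenate([patches, time_token, text_tokens], axis=1)   # (B, N + 1 + T, D)

# Mirrors the new computation in the diff:
num_patches = x.shape[1] - 1 - text_tokens.shape[1]               # == N
image_tokens = x[:, :num_patches, :]                              # (B, N, D)
```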
51 changes: 41 additions & 10 deletions flaxdiff/trainer/diffusion_trainer.py
@@ -29,6 +29,8 @@ def apply_ema(self, decay: float = 0.999):
)
return self.replace(ema_params=new_ema_params)

from flaxdiff.models.autoencoder.autoencoder import AutoEncoder

class DiffusionTrainer(SimpleTrainer):
noise_schedule: NoiseScheduler
model_output_transform: DiffusionPredictionTransform
@@ -40,7 +42,7 @@ def __init__(self,
optimizer: optax.GradientTransformation,
noise_schedule: NoiseScheduler,
rngs: jax.random.PRNGKey,
unconditional_prob: float = 0.2,
unconditional_prob: float = 0.12,
name: str = "Diffusion",
model_output_transform: DiffusionPredictionTransform = EpsilonPredictionTransform(),
autoencoder: AutoEncoder = None,
@@ -67,7 +69,8 @@ def generate_states(
existing_state: dict = None,
existing_best_state: dict = None,
model: nn.Module = None,
param_transforms: Callable = None
param_transforms: Callable = None,
use_dynamic_scale: bool = False
) -> Tuple[TrainState, TrainState]:
print("Generating states for DiffusionTrainer")
rngs, subkey = jax.random.split(rngs)
@@ -88,7 +91,8 @@ def generate_states(
ema_params=new_state['ema_params'],
tx=optimizer,
rngs=rngs,
metrics=Metrics.empty()
metrics=Metrics.empty(),
dynamic_scale = flax.training.dynamic_scale.DynamicScale() if use_dynamic_scale else None
)

if existing_best_state is not None:
@@ -125,14 +129,14 @@ def train_step(train_state: TrainState, rng_state: RandomMarkovState, batch, loc
local_rng_state = RandomMarkovState(subkey)

images = batch['image']
images = jnp.array(images, dtype=jnp.bfloat16)
images = jnp.array(images, dtype=jnp.float32)
# normalize image
images = (images - 127.5) / 127.5

if autoencoder is not None:
# Convert the images to latent space
# local_rng_state, rngs = local_rng_state.get_random_key()
images = autoencoder.encode(images)#, rngs)
local_rng_state, rngs = local_rng_state.get_random_key()
images = autoencoder.encode(images, rngs)

output = text_embedder(
input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
@@ -163,12 +167,39 @@ def model_loss(params):
loss = nloss
return loss

loss, grads = jax.value_and_grad(model_loss)(train_state.params)

if train_state.dynamic_scale is not None:
# dynamic scale takes care of averaging gradients across replicas
grad_fn = train_state.dynamic_scale.value_and_grad(
model_loss, axis_name="data"
)
dynamic_scale, is_fin, loss, grads = grad_fn(train_state.params)
train_state = train_state.replace(dynamic_scale=dynamic_scale)
else:
grad_fn = jax.value_and_grad(model_loss)
loss, grads = grad_fn(train_state.params)
if distributed_training:
grads = jax.lax.pmean(grads, "data")

new_state = train_state.apply_gradients(grads=grads)

if train_state.dynamic_scale:
# if is_fin == False the gradients contain Inf/NaNs and optimizer state and
# params should be restored (= skip this step).
select_fn = functools.partial(jnp.where, is_fin)
new_state = train_state.replace(
opt_state=jax.tree_util.tree_map(
select_fn, new_state.opt_state, train_state.opt_state
),
params=jax.tree_util.tree_map(
select_fn, new_state.params, train_state.params
),
)

train_state = new_state.apply_ema(self.ema_decay)

if distributed_training:
grads = jax.lax.pmean(grads, "data")
loss = jax.lax.pmean(loss, "data")
train_state = train_state.apply_gradients(grads=grads)
train_state = train_state.apply_ema(self.ema_decay)
return train_state, loss, rng_state

if distributed_training:
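
The new mixed-precision branch in train_step follows the standard Flax DynamicScale recipe: scale the loss, check the unscaled gradients for Inf/NaN, and roll the update back when they are not finite. A stripped-down sketch of that recipe (the toy loss and state layout are assumptions, not the flaxdiff trainer):

```python
import functools
import jax
import jax.numpy as jnp
from flax.training import dynamic_scale as dynamic_scale_lib
from flax.training import train_state

class TrainState(train_state.TrainState):
    # None disables dynamic loss scaling (full-precision training).
    dynamic_scale: dynamic_scale_lib.DynamicScale = None

def train_step(state: TrainState, batch):
    def loss_fn(params):
        preds = state.apply_fn(params, batch["x"])
        return jnp.mean((preds - batch["y"]) ** 2)

    if state.dynamic_scale is not None:
        # Scales the loss, unscales the grads, and reports whether they are finite.
        grad_fn = state.dynamic_scale.value_and_grad(loss_fn)
        dynamic_scale, is_fin, loss, grads = grad_fn(state.params)
        state = state.replace(dynamic_scale=dynamic_scale)
    else:
        loss, grads = jax.value_and_grad(loss_fn)(state.params)
        is_fin = True

    new_state = state.apply_gradients(grads=grads)

    if state.dynamic_scale is not None:
        # On overflow (is_fin == False) keep the old params and optimizer state,
        # effectively skipping this step while the loss scale is reduced.
        keep = functools.partial(jnp.where, is_fin)
        new_state = new_state.replace(
            params=jax.tree_util.tree_map(keep, new_state.params, state.params),
            opt_state=jax.tree_util.tree_map(keep, new_state.opt_state, state.opt_state),
        )
    return new_state, loss
```

In the trainer above the same pattern additionally passes axis_name="data" to value_and_grad so the dynamic-scale wrapper averages gradients across replicas during distributed training.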