# aggr.py (forked from vkola-lab/ajpa2024)
import math
import warnings
from functools import wraps
from math import pi
from typing import Optional

import torch
from einops import rearrange, repeat
from torch import Tensor, einsum
from torch_geometric.utils import to_dense_batch
def cache_fn(f):
    cache = dict()

    @wraps(f)
    def cached_fn(*args, _cache=True, key=None, **kwargs):
        if not _cache:
            return f(*args, **kwargs)
        nonlocal cache
        if key in cache:
            return cache[key]
        result = f(*args, **kwargs)
        cache[key] = result
        return result

    return cached_fn
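
# Usage sketch for cache_fn (illustrative; the decorated function and key
# names below are assumptions, not part of the original repo). Results are
# memoized per explicit `key`, and `_cache=False` bypasses the cache:
#
#     @cache_fn
#     def make_proj(dim):
#         return torch.nn.Linear(dim, dim, bias=False)
#
#     a = make_proj(64, key='shared')   # computed, then cached under 'shared'
#     b = make_proj(64, key='shared')   # cache hit: b is a
#     c = make_proj(64, _cache=False)   # fresh module, cache untouched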
def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
                      "The distribution of values may be incorrect.",
                      stacklevel=2)

    with torch.no_grad():
        # Values are generated by using a truncated uniform distribution and
        # then using the inverse CDF for the normal distribution.
        # Get upper and lower cdf values
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)
        # Uniformly fill tensor with values from [l, u], then translate to
        # [2l-1, 2u-1].
        tensor.uniform_(2 * l - 1, 2 * u - 1)
        # Use inverse cdf transform for normal distribution to get truncated
        # standard normal
        tensor.erfinv_()
        # Transform to proper mean, std
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)
        # Clamp to ensure it's in the proper range
        tensor.clamp_(min=a, max=b)
        return tensor
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    # type: (Tensor, float, float, float, float) -> Tensor
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value

    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
def exists(val):
    return val is not None
def posemb_sincos_2d(gdata, node_coords, mask, temperature=10000, dtype=torch.float32):
    b, n, dim, device, dtype = *gdata.shape, gdata.device, gdata.dtype
    assert (dim % 4) == 0, 'feature dimension must be multiple of 4 for sincos emb'
    omega = torch.arange(dim // 4, device=device) / (dim // 4 - 1)
    omega = 1. / (temperature ** omega)
    # Build a sin/cos embedding from each node's (y, x) coordinates, one
    # batch element at a time.
    pos_emb = []
    for i in range(b):
        y = node_coords[i, :, 0]
        x = node_coords[i, :, 1]
        y = y.flatten()[:, None] * omega[None, :]
        x = x.flatten()[:, None] * omega[None, :]
        pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)
        pos_emb.append(pe)
    pos_emb = torch.stack(pos_emb)
    if exists(mask):
        mask = repeat(mask, 'b n -> b n d', d=dim)
        fill_value = 0
        pos_emb.masked_fill_(~mask, fill_value)
    return pos_emb.type(dtype)
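
# Minimal shape check for posemb_sincos_2d (an illustrative sketch, not part
# of the original repo): the embedding matches the feature dimension, so it
# can be added elementwise to the node features.
def _demo_posemb_sincos_2d():
    b, n, dim = 2, 5, 128  # dim must be a multiple of 4
    gdata = torch.randn(b, n, dim)
    coords = torch.randint(0, 10, (b, n, 2))  # integer (y, x) grid positions
    mask = torch.ones(b, n, dtype=torch.bool)
    pe = posemb_sincos_2d(gdata, coords, mask)
    assert pe.shape == (b, n, dim)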
def fourier_encode(node_coords, mask, max_freq=10, num_bands=6):
    device = node_coords.device
    # Grid extents; torch.linspace needs Python ints, so cast the tensor maxima.
    y_max = int(node_coords[:, :, 0].max().item()) + 1
    x_max = int(node_coords[:, :, 1].max().item()) + 1
    axis_pos = list(map(lambda size: torch.linspace(-1., 1., steps=size, device=device, dtype=torch.float32), (y_max, x_max)))
    x = torch.stack(torch.meshgrid(*axis_pos, indexing='ij'), dim=-1)
    x = x.unsqueeze(-1)
    device, dtype, orig_x = x.device, x.dtype, x
    scales = torch.linspace(1., max_freq / 2, num_bands, device=device, dtype=dtype)
    scales = scales[(*((None,) * (len(x.shape) - 1)), Ellipsis)]
    x = x * scales * pi
    # Gather the per-node encodings from the dense grid via advanced indexing.
    x_enc = x[node_coords[:, :, 0], node_coords[:, :, 1]]
    orig_x_enc = orig_x[node_coords[:, :, 0], node_coords[:, :, 1]]
    x_enc = torch.cat([x_enc.sin(), x_enc.cos()], dim=-1)
    x_enc = torch.cat((x_enc, orig_x_enc), dim=-1)
    mask = rearrange(mask, 'b ... -> b (...)')
    fill_value = 0
    mask = repeat(mask, 'b n -> b n c d', c=2, d=2 * num_bands + 1)
    x_enc.masked_fill_(~mask, fill_value)
    return x_enc
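
# Shape sketch for fourier_encode (illustrative assumption about usage): each
# node gets 2 axes x (2 * num_bands sin/cos terms + 1 raw coordinate), i.e.
# 2 * (2 * 6 + 1) = 26 features once the last two axes are flattened.
def _demo_fourier_encode():
    b, n = 2, 5
    coords = torch.randint(0, 10, (b, n, 2))  # integer (y, x) grid positions
    mask = torch.ones(b, n, dtype=torch.bool)
    enc = fourier_encode(coords, mask, max_freq=10, num_bands=6)
    assert enc.shape == (b, n, 2, 13)  # flattens to (b, n, 26) downstream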
class PreNorm(torch.nn.Module):
    def __init__(self, dim, fn, context_dim=None):
        super().__init__()
        self.norm = torch.nn.LayerNorm(dim)
        self.norm_context = torch.nn.LayerNorm(context_dim) if context_dim is not None else None
        self.fn = fn
        torch.nn.init.constant_(self.norm.weight, 1)
        torch.nn.init.constant_(self.norm.bias, 0)
        if context_dim is not None:
            torch.nn.init.constant_(self.norm_context.weight, 1)
            torch.nn.init.constant_(self.norm_context.bias, 0)

    def forward(self, x, **kwargs):
        x = self.norm(x)
        if exists(self.norm_context):
            K = kwargs['K']
            normed_K = self.norm_context(K)
            kwargs.update(K=normed_K)
        return self.fn(x, **kwargs)

    def get_attention_map(self):
        return self.fn.get_attention_map()

    def get_attn_gradients(self):
        return self.fn.get_attn_gradients()
class FeedForward(torch.nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(dim, hidden_dim, bias=False),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_dim, dim, bias=False),
            torch.nn.Dropout(dropout)
        )
        for m in self.net.modules():
            if isinstance(m, torch.nn.Linear):
                trunc_normal_(m.weight, std=.02)
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0)

    def forward(self, x):
        return self.net(x)
class MAB(torch.nn.Module):
    def __init__(self, query_dim, context_dim=None, heads: int = 1, dropout: float = 0.0):
        super(MAB, self).__init__()
        self.query_dim = query_dim
        self.context_dim = context_dim if context_dim is not None else query_dim
        self.latent_dim = query_dim
        self.heads = heads
        # Q, K, V embedding dimensions
        self.dim_head = self.latent_dim // self.heads
        self.scale_factor = self.dim_head ** -0.5
        # Temperature term for the softmax
        self.T = 1.0
        self.attend = torch.nn.Softmax(dim=-1)
        self.dropout = torch.nn.Dropout(dropout)
        self.fc_q = torch.nn.Linear(self.query_dim, self.latent_dim, bias=False)
        self.fc_kv = torch.nn.Linear(self.context_dim, self.latent_dim * 2, bias=False)
        trunc_normal_(self.fc_q.weight, std=.02)
        trunc_normal_(self.fc_kv.weight, std=.02)
        self.attn_gradients = None
        self.attention_map = None

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def forward(self, Q: Tensor, K: Tensor, mask: Optional[Tensor] = None, register_hook: bool = False) -> Tensor:
        Q = self.fc_q(Q)
        K, V = self.fc_kv(K).chunk(2, dim=-1)
        Q, K, V = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=self.heads), (Q, K, V))
        scaled_dot_prod = einsum('b h c d, b h n d -> b h c n', Q, K) * self.scale_factor
        if exists(mask):
            mask = rearrange(mask, 'b ... -> b (...)')
            max_neg_value = -torch.finfo(scaled_dot_prod.dtype).max
            mask = repeat(mask, 'b n -> b h () n', h=self.heads)
            scaled_dot_prod.masked_fill_(~mask, max_neg_value)
        A = self.attend(scaled_dot_prod / self.T)
        A = self.dropout(A)
        self.save_attention_map(A)
        if register_hook:
            A.register_hook(self.save_attn_gradients)
        # Weighted sum of the values: O = A @ V
        O = einsum('b h c n, b h n d -> b h c d', A, V)
        # Merge the attention heads back into a single feature dimension
        O = rearrange(O, 'b h c d -> b c (h d)')
        return O
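
# Cross-attention shape sketch for MAB (illustrative): c query vectors attend
# over a context of n elements; the output keeps the query length.
def _demo_mab():
    b, c, n, d = 2, 4, 7, 32
    mab = MAB(query_dim=d, context_dim=d, heads=4)
    Q, K = torch.randn(b, c, d), torch.randn(b, n, d)
    mask = torch.ones(b, n, dtype=torch.bool)
    assert mab(Q, K, mask=mask).shape == (b, c, d)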
class SetAttentionBlock(torch.nn.Module):
    r"""The Set Attention Block (SAB) from the `"Set Transformer: A
    Framework for Attention-based Permutation-Invariant Neural Networks"
    <https://arxiv.org/abs/1810.00825>`_ paper

    .. math::
        \mathrm{SAB}(\mathbf{X}) = \mathrm{MAB}(\mathbf{X}, \mathbf{X})

    Args:
        query_dim (int): Size of each input sample.
        heads (int, optional): Number of multi-head-attentions.
            (default: :obj:`1`)
        dropout (float, optional): Dropout probability of attention weights.
            (default: :obj:`0`)
    """
    def __init__(self, query_dim: int, heads: int = 1, dropout: float = 0.0):
        super().__init__()
        self.mab = PreNorm(query_dim, MAB(query_dim, heads=heads, dropout=dropout))
        self.ff = PreNorm(query_dim, FeedForward(query_dim, query_dim, dropout=dropout))

    def forward(self, x: Tensor, register_hook=False) -> Tensor:
        kwargs = {'K': x, 'register_hook': register_hook}
        x = self.mab(x, **kwargs) + x
        x = self.ff(x) + x
        return x
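
# Self-attention sketch for SetAttentionBlock (illustrative): SAB maps a set
# to a set of the same size, so it can be stacked freely.
def _demo_sab():
    b, n, d = 2, 6, 32
    sab = SetAttentionBlock(query_dim=d, heads=4)
    x = torch.randn(b, n, d)
    assert sab(x).shape == (b, n, d)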
class PoolingByMultiheadAttentionBlock(torch.nn.Module):
    r"""The Pooling by Multihead Attention (PMA) layer from the `"Set
    Transformer: A Framework for Attention-based Permutation-Invariant Neural
    Networks" <https://arxiv.org/abs/1810.00825>`_ paper

    .. math::
        \mathrm{PMA}(\mathbf{X}) = \mathrm{MAB}(\mathbf{S}, \mathbf{X})

    where :math:`\mathbf{S}` denotes :obj:`num_seed_points` learnable vectors.

    Args:
        query_dim (int): Size of each output sample.
        context_dim (int, optional): Size of each input (context) sample.
            Defaults to :obj:`query_dim`. (default: :obj:`None`)
        num_seed_points (int, optional): Number of seed points.
            (default: :obj:`1`)
        heads (int, optional): Number of multi-head-attentions.
            (default: :obj:`1`)
        dropout (float, optional): Dropout probability of attention weights.
            (default: :obj:`0`)
    """
    def __init__(self, query_dim, context_dim=None, num_seed_points: int = 1, heads: int = 1, dropout: float = 0.0):
        super().__init__()
        self.query_dim = query_dim
        self.context_dim = context_dim if context_dim is not None else query_dim
        self.latent_dim = self.query_dim
        self.num_seed_points = num_seed_points
        self.seed = torch.nn.Parameter(torch.randn(self.num_seed_points, self.latent_dim))
        self.mab = PreNorm(self.query_dim, MAB(self.query_dim, self.context_dim, heads, dropout), context_dim=self.context_dim)
        self.ff = PreNorm(self.query_dim, FeedForward(self.query_dim, self.query_dim, dropout))

    def forward(self, x: Tensor, mask: Tensor, seeds=None, register_hook=False) -> Tensor:
        b, _, _ = x.size()
        if seeds is None:
            seeds = repeat(self.seed, 'c d -> b c d', b=b)
        kwargs = {'K': x, 'mask': mask, 'register_hook': register_hook}
        x_pool = self.mab(seeds, **kwargs) + seeds  # residual over the seed queries
        x_pool = self.ff(x_pool) + x_pool
        # x_pool is the pooled cluster representation (one row per seed point)
        return x_pool
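
# Pooling sketch for PoolingByMultiheadAttentionBlock (illustrative): n input
# elements are compressed onto num_seed_points learned seed vectors.
def _demo_pma():
    b, n, d, k = 2, 9, 32, 3
    pma = PoolingByMultiheadAttentionBlock(query_dim=d, num_seed_points=k, heads=4)
    x = torch.randn(b, n, d)
    mask = torch.ones(b, n, dtype=torch.bool)
    assert pma(x, mask).shape == (b, k, d)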
class GraphMultisetAggregation(torch.nn.Module):
    r"""The Graph Multiset Transformer pooling operator from the
    `"Accurate Learning of Graph Representations
    with Graph Multiset Pooling" <https://arxiv.org/abs/2102.11533>`_ paper.

    The :class:`GraphMultisetTransformer` aggregates elements into
    :math:`k` representative elements via attention-based pooling, computes the
    interaction among them via :obj:`num_encoder_blocks` self-attention blocks,
    and finally pools the representative elements via attention-based pooling
    into a single cluster.

    Args:
        dim (int): Size of each input sample.
        k (int): Number of :math:`k` representative nodes after pooling.
        num_pma_blocks (int, optional): Number of Pooling by Multihead
            Attention (PMA) blocks. (default: :obj:`1`)
        num_encoder_blocks (int, optional): Number of Set Attention Blocks
            (SABs) after each pooling block. (default: :obj:`3`)
        heads (int, optional): Number of multi-head-attentions.
            (default: :obj:`1`)
        dropout (float, optional): Dropout probability of attention weights.
            (default: :obj:`0`)
    """
    def __init__(
        self,
        dim: int,
        k: int,
        num_pma_blocks: int = 1,
        num_encoder_blocks: int = 3,
        heads: int = 1,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.k = k  # number of pooled nodes (generally set to 100)
        self.heads = heads
        self.num_pma_blocks = num_pma_blocks
        self.pos_enc_type = 'fourier'
        if self.pos_enc_type == 'fourier':
            self.num_freq_bands = 6
            self.max_freq = 10
            # 2 axes x (2 * num_bands sin/cos terms + 1 raw coordinate)
            fourier_dim = 2 * ((self.num_freq_bands * 2) + 1)
            context_dim = fourier_dim + dim
        elif self.pos_enc_type == 'sincos':
            context_dim = dim
        self.query_dim = dim
        self.context_dim = context_dim
        self.layers = torch.nn.ModuleList([])
        for i in range(self.num_pma_blocks):
            if i > 0:
                # Later PMA blocks attend over features without positional
                # encodings, so the context falls back to query_dim.
                self.context_dim = None
            pma = PoolingByMultiheadAttentionBlock(self.query_dim, self.context_dim, k, heads=1, dropout=dropout)
            self.encoders = torch.nn.ModuleList([])
            for _ in range(num_encoder_blocks):
                self.encoders.append(SetAttentionBlock(self.query_dim, heads, dropout=dropout))
            self.layers.append(torch.nn.ModuleList([pma, self.encoders]))
        self.cls_token = torch.nn.Parameter(torch.randn(1, self.query_dim))
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x: Tensor, batch=None, node_coords=None, register_hook=False) -> Tensor:
        x_g, mask = to_dense_batch(x, batch)  # x_g shape: (b, n, d); no adjacency matrix is used
        node_coords, mask = to_dense_batch(node_coords, batch)
        if self.pos_enc_type == 'sincos':
            # Position embeddings are added to the node features (still d-dimensional).
            pe = posemb_sincos_2d(x_g, node_coords, mask)
            x_g_pos = x_g + pe
        elif self.pos_enc_type == 'fourier':
            # Position embeddings are concatenated to the node features,
            # giving d + 26 dimensions for the default 6 frequency bands.
            enc_pos = fourier_encode(node_coords, mask, max_freq=self.max_freq, num_bands=self.num_freq_bands)
            enc_pos = rearrange(enc_pos, '... n d -> ... (n d)')
            x_g_pos = torch.cat((x_g, enc_pos), dim=-1)
        # Pooling by Multihead Attention
        x_p = None
        for idx, (pma, encoders) in enumerate(self.layers):
            x = x_g_pos if idx == 0 else x_g
            x_p = pma(x=x, mask=mask, seeds=x_p, register_hook=register_hook)
            if idx == len(self.layers) - 1:
                # Prepend a CLS token for graph-level classification.
                b, n, _ = x_p.shape
                cls_tokens = repeat(self.cls_token, '1 d -> b 1 d', b=b)
                x_p = torch.cat((cls_tokens, x_p), dim=1)
            x_p = self.dropout(x_p)
            for layer in encoders:
                x_p = layer(x_p, register_hook=register_hook)
        # x_p has shape (b, k + 1, d); return the CLS token as the graph embedding.
        return x_p[:, 0]
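
# End-to-end sketch for GraphMultisetAggregation (an illustrative assumption
# about usage, matching the Fourier branch's integer (y, x) coordinates):
# two graphs with 4 and 6 nodes each pool down to one d-dim CLS embedding.
def _demo_graph_multiset_aggregation():
    dim, k = 64, 3
    aggr = GraphMultisetAggregation(dim=dim, k=k)
    x = torch.randn(10, dim)  # 10 nodes across both graphs
    batch = torch.tensor([0] * 4 + [1] * 6)  # graph assignment per node
    node_coords = torch.randint(0, 8, (10, 2))
    out = aggr(x, batch=batch, node_coords=node_coords)
    assert out.shape == (2, dim)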