-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
100 lines (79 loc) · 3.5 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import torch
import itertools
import utils
import numpy as np
import pandas as pd
from enum import Enum
from torch_geometric.data import Data, DataLoader, InMemoryDataset
from torch.utils.data import DataLoader, Dataset
# NOTE(review): NUM_ATOMS is not referenced anywhere in the visible part of
# this module; confirm whether other modules import it before removing.
NUM_ATOMS=9
# Default value of the ``max_atoms`` argument of the dataset classes below.
MAX_ATOMS=27
# Default target columns read from each input record (used as the default
# ``target`` argument of the dataset classes below).
TARGET=['E1-CC2', 'E2-CC2', 'f1-CC2', 'f2-CC2']
def gaussian_expansion(D, mu_min=-1, delta_mu=0.2, mu_max=10, sigma=0.2):
    """Expand distances onto a grid of Gaussian basis functions.

    Every entry ``d`` of ``D`` is replaced by the vector
    ``exp(-(d - mu_k) ** 2 / (2 * sigma ** 2))`` where the centers ``mu_k``
    are ``np.arange(mu_min, mu_max + delta_mu, delta_mu)``.

    Args:
        D: Array-like of distances. Any shape is accepted; the callers in
            this module pass a square 2-D distance matrix.
        mu_min: First Gaussian center.
        delta_mu: Spacing between consecutive centers.
        mu_max: Upper end of the center grid (``delta_mu`` is added to the
            ``arange`` stop value so that this end is meant to be included).
        sigma: Standard deviation of each Gaussian.

    Returns:
        ``numpy.ndarray`` of shape ``D.shape + (len(centers),)``.
    """
    D = np.asarray(D)
    mu = np.arange(mu_min, mu_max + delta_mu, delta_mu)
    # A trailing axis on D broadcasts against the 1-D grid of centers, which
    # works for any number of leading axes (not only 2-D matrices).
    diff = D[..., np.newaxis] - mu
    # The Gaussian denominator is twice the *variance*; the previous code
    # divided by 2 * sigma, which made the basis functions far too wide.
    return np.exp(-diff ** 2 / (2 * sigma ** 2))
def get_distance_matrix(X, dist_method):
    """Pick the distance matrix stored on record ``X``.

    Args:
        X: Record supporting item access (``X['euclid_D']`` /
            ``X['graph_D']``) and, for the ``'is_3D'`` method, attribute
            access to ``is_3D``, ``euclid_D`` and ``graph_D``.
        dist_method: One of ``'is_3D'``, ``'euclid'`` or ``'graph'``.
            ``'euclid'`` / ``'graph'`` return the matching ``*_D`` entry;
            ``'is_3D'`` returns ``euclid_D`` when ``X.is_3D`` is truthy and
            ``graph_D`` otherwise.

    Returns:
        Whatever object is stored under the selected field.

    Raises:
        AssertionError: If ``dist_method`` is not one of the three names
            (only when assertions are enabled).
    """
    assert dist_method in ('is_3D', 'euclid', 'graph')

    # Explicit methods map directly onto the "<method>_D" field.
    if dist_method != 'is_3D':
        return X[f'{dist_method}_D']

    # 'is_3D': let the record itself decide which matrix to use.
    if X.is_3D:
        return X.euclid_D
    return X.graph_D
class QM8Dataset(Dataset):
    """Map-style dataset built eagerly from a JSON-lines file.

    Every record contributes its atom list ``Z`` (padded with
    ``utils.pad_``), the Gaussian expansion of its distance matrix (padded
    with ``utils.pad_Dhat``), its atom count and its target values.
    """

    def __init__(self, fname, target=TARGET, max_atoms=MAX_ATOMS, mu_min=-1, delta_mu=0.2,
                 mu_max=10, sigma=0.2, nrows=None, dist_method='euclid'):
        """Read ``fname`` and precompute all tensors.

        Args:
            fname: Path of the JSON-lines file, read with ``pandas.read_json``.
            target: Column name(s) whose values become ``self.target``.
            max_atoms: Size forwarded to the ``utils`` padding helpers.
            mu_min, delta_mu, mu_max, sigma: Forwarded to
                ``gaussian_expansion``.
            nrows: Optional limit on the number of lines read.
            dist_method: Forwarded to ``get_distance_matrix``.
        """
        frame = pd.read_json(fname, lines=True, orient='records', nrows=nrows)
        self.target = torch.FloatTensor(frame[target].values)

        padded_atoms = []
        padded_features = []
        atom_counts = []
        for _, mol in frame.iterrows():
            padded_atoms.append(utils.pad_(torch.LongTensor(mol.Z), max_atoms))

            distances = np.array(get_distance_matrix(mol, dist_method))
            expanded = gaussian_expansion(distances, mu_min, delta_mu, mu_max, sigma)
            padded_features.append(utils.pad_Dhat(torch.FloatTensor(expanded), max_atoms))

            # Keep the unpadded atom count alongside the padded tensors.
            atom_counts.append(len(mol.Z))

        self.Zs = torch.stack(padded_atoms)
        self.Ds = torch.stack(padded_features)
        self.sizes = torch.LongTensor(atom_counts)

    def __getitem__(self, idx):
        """Return ``(Z, D_hat, size, target)`` for the sample(s) at ``idx``."""
        sample = (self.Zs[idx], self.Ds[idx], self.sizes[idx], self.target[idx])
        return sample

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.Zs)
class GraphQM8(InMemoryDataset):
    """Fully connected molecular graphs stored as a PyG ``InMemoryDataset``.

    Each molecule becomes a ``Data`` object with atom list ``Z``, a directed
    edge for every ordered pair of distinct atoms, Gaussian-expanded
    distances as ``edge_attr`` and the target values as ``y``.
    """

    def __init__(self, fname, target=TARGET, max_atoms=MAX_ATOMS, mu_min=-1, delta_mu=0.2,
                 mu_max=10, sigma=0.2, nrows=None, dist_method='euclid'):
        """Store the configuration, then build or load the processed file.

        Args:
            fname: Stored on the instance. NOTE(review): ``process`` reads
                ``raw_file_names[0]`` instead, so this value is currently
                unused - confirm the intended source file.
            target: Column name(s) used to build ``y``.
            max_atoms: Stored on the instance; not used by ``process``.
            mu_min, delta_mu, mu_max, sigma: Forwarded to
                ``gaussian_expansion``.
            nrows: Optional limit on the number of lines read.
            dist_method: Stored on the instance. NOTE(review): ``process``
                always uses ``'is_3D'`` - confirm which one is intended.
        """
        # Attributes are assigned before calling the base constructor because
        # ``process`` reads them; presumably the base class invokes
        # ``process`` during construction when the processed file is missing.
        self.fname = fname
        self.target = target
        self.max_atoms = max_atoms
        self.mu_min = mu_min
        self.delta_mu = delta_mu
        self.mu_max = mu_max
        self.sigma = sigma
        self.nrows = nrows
        self.dist_method = dist_method
        super().__init__('', None, None)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Return the list of raw input files."""
        return ['data/sdf.json']

    @property
    def processed_file_names(self):
        """Return the list of processed output files."""
        return ['processed.pt']

    def process(self):
        """Convert the raw JSON-lines records to graphs and save them."""
        data_df = pd.read_json(self.raw_file_names[0],
                               lines=True, nrows=self.nrows)

        # Honour the configured targets (the default equals the columns that
        # used to be hard-coded here); accept a single column name as well.
        if isinstance(self.target, str):
            target_cols = [self.target]
        else:
            target_cols = list(self.target)

        data_list = []
        for _, row in data_df.iterrows():
            Z = torch.LongTensor(row.Z)

            # All ordered pairs (i, j), i != j. The reshape keeps an (E, 2)
            # layout even when there are no pairs, so ``edge_index`` is
            # always 2 x E.
            edges = np.array(list(itertools.permutations(range(len(row.Z)), 2)),
                             dtype=np.int64).reshape(-1, 2)

            D = np.array(get_distance_matrix(row, 'is_3D'))
            D_hat = gaussian_expansion(D, self.mu_min, self.delta_mu,
                                       self.mu_max, self.sigma)
            # One fancy-indexing call gathers the feature vector of every
            # edge, in the same order as ``edges``.
            edge_features = D_hat[edges[:, 0], edges[:, 1], :]

            # Explicit floats avoid handing a pandas object to torch.
            y = torch.FloatTensor([[float(row[col]) for col in target_cols]])

            data_list.append(Data(Z=Z,
                                  edge_index=torch.LongTensor(edges.T),
                                  num_nodes=len(Z),
                                  edge_attr=torch.FloatTensor(edge_features),
                                  y=y))

        self.data, self.slices = self.collate(data_list)
        # Save to the same path that ``__init__`` loads from.
        torch.save((self.data, self.slices), self.processed_paths[0])