Merge pull request #26 from joshhan619/ltsm-stack
Informer model and baseline benchmark scripts
LSC2204 authored Jan 19, 2025
2 parents 1034501 + 236b9dd commit 8239910
Showing 13 changed files with 353 additions and 94 deletions.
34 changes: 25 additions & 9 deletions ltsm/data_pipeline/data_pipeline.py
@@ -1,11 +1,12 @@
 import numpy as np
 import torch
 import argparse
+import json
 import random
 import ipdb
 
 from ltsm.data_provider.data_factory import get_datasets
-from ltsm.data_provider.data_loader import HF_Dataset
+from ltsm.data_provider.data_loader import HF_Dataset, HF_Timestamp_Dataset
 from ltsm.data_pipeline.model_manager import ModelManager
 
 import logging
@@ -72,11 +73,10 @@ def run(self):
         )
 
         train_dataset, eval_dataset, test_datasets, _ = get_datasets(self.args)
-        train_dataset, eval_dataset= HF_Dataset(train_dataset), HF_Dataset(eval_dataset)
-
-        if self.args.model == 'PatchTST' or self.args.model == 'DLinear':
-            # Set the patch number to the size of the input sequence including the prompt sequence
-            self.model_manager.args.seq_len = train_dataset[0]["input_data"].size()[0]
+        if self.args.model == "Informer":
+            train_dataset, eval_dataset = HF_Timestamp_Dataset(train_dataset), HF_Timestamp_Dataset(eval_dataset)
+        else:
+            train_dataset, eval_dataset= HF_Dataset(train_dataset), HF_Dataset(eval_dataset)
 
         model = self.model_manager.create_model()
 
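Which wrapper is chosen here determines what each batch carries: HF_Timestamp_Dataset keeps the timestamp marks that the Informer path consumes, while every other model keeps the plain HF_Dataset items (see the matching collate_fn and loss changes in ltsm/data_pipeline/model_manager.py below).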
@@ -103,16 +103,24 @@ def run(self):
 
         # Testing settings
         for test_dataset in test_datasets:
+            if self.args.model == "Informer":
+                test_ds = HF_Timestamp_Dataset(test_dataset)
+            else:
+                test_ds = HF_Dataset(test_dataset)
+
             trainer.compute_loss = self.model_manager.compute_loss
             trainer.prediction_step = self.model_manager.prediction_step
-            test_dataset = HF_Dataset(test_dataset)
 
-            metrics = trainer.evaluate(test_dataset)
+            metrics = trainer.evaluate(test_ds)
             trainer.log_metrics("Test", metrics)
             trainer.save_metrics("Test", metrics)
 
 def get_args():
     parser = argparse.ArgumentParser(description='LTSM')
 
+    # Load JSON config file
+    parser.add_argument('--config', type=str, help='Path to JSON configuration file')
+
     # Basic Config
     parser.add_argument('--model_id', type=str, default='test_run', help='model id')
@@ -122,8 +130,9 @@ def get_args():
     parser.add_argument('--checkpoints', type=str, default='./checkpoints/')
 
     # Data Settings
+    parser.add_argument('--data', help='dataset type')
     parser.add_argument('--data_path', nargs='+', default='dataset/weather.csv', help='data files')
-    parser.add_argument('--test_data_path_list', nargs='+', required=True, help='test data file')
+    parser.add_argument('--test_data_path_list', nargs='+', help='test data file')
     parser.add_argument('--prompt_data_path', type=str, default='./weather.csv', help='prompt data file')
     parser.add_argument('--data_processing', type=str, default="standard_scaler", help='data processing method')
     parser.add_argument('--train_ratio', type=float, default=0.7, help='train data ratio')
@@ -153,7 +162,6 @@ def get_args():
     parser.add_argument('--model', type=str, default='model', help='model name, options:[LTSM, LTSM_WordPrompt, LTSM_Tokenizer, DLinear, PatchTST, Informer]')
     parser.add_argument('--stride', type=int, default=8, help='stride')
     parser.add_argument('--tmax', type=int, default=10, help='tmax')
-    parser.add_argument('--dropout', type=float, default=0.05, help='dropout')
     parser.add_argument('--embed', type=str, default='timeF',
                         help='time features encoding, options:[timeF, fixed, learned]')
     parser.add_argument('--activation', type=str, default='gelu', help='activation')
@@ -200,6 +208,14 @@ def get_args():
 
     args, unknown = parser.parse_known_args()
 
+    if args.config:
+        with open(args.config, 'r') as f:
+            config = json.load(f)
+        json_args = argparse.Namespace(**config)
+
+        for key, value in vars(json_args).items():
+            setattr(args, key, value)
+
     return args


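Note: since the setattr loop runs after parse_known_args, values from the --config JSON override any flags given on the command line.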
22 changes: 20 additions & 2 deletions ltsm/data_pipeline/model_manager.py
@@ -126,7 +126,12 @@ def compute_loss(self, model, inputs, return_outputs=False):
         Returns:
             torch.Tensor or tuple: The computed loss, and optionally the outputs.
         """
-        outputs = model(inputs["input_data"])
+        if self.args.model == 'Informer':
+            input_data_mark = inputs["timestamp_input"].to(model.module.device)
+            label_mark = inputs["timestamp_labels"].to(model.module.device)
+            outputs = model(inputs["input_data"], input_data_mark, inputs["labels"], label_mark)
+        else:
+            outputs = model(inputs["input_data"])
         loss = nn.functional.mse_loss(outputs, inputs["labels"])
         return (loss, outputs) if return_outputs else loss

@@ -146,7 +151,12 @@ def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
         """
         input_data = inputs["input_data"].to(model.module.device)
        labels = inputs["labels"].to(model.module.device)
-        outputs = model(input_data)
+        if self.args.model == 'Informer':
+            input_data_mark = inputs["timestamp_input"].to(model.module.device)
+            label_mark = inputs["timestamp_labels"].to(model.module.device)
+            outputs = model(input_data, input_data_mark, labels, label_mark)
+        else:
+            outputs = model(input_data)
         loss = nn.functional.mse_loss(outputs, labels)
         return (loss, outputs, labels)

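The four-tensor call presumably follows the standard Informer forward signature, model(x_enc, x_mark_enc, x_dec, x_mark_dec), with the label window standing in as the decoder input; the diff only shows the call site, so the signature is inferred.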
@@ -160,6 +170,14 @@ def collate_fn(self, batch):
         Returns:
             dict: Collated batch with 'input_data' and 'labels' tensors.
         """
+        if self.args.model == 'Informer':
+            return {
+                'input_data': torch.from_numpy(np.stack([x['input_data'] for x in batch])).type(torch.float32),
+                'labels': torch.from_numpy(np.stack([x['labels'] for x in batch])).type(torch.float32),
+                'timestamp_input': torch.from_numpy(np.stack([x['timestamp_input'] for x in batch])).type(torch.float32),
+                'timestamp_labels': torch.from_numpy(np.stack([x['timestamp_labels'] for x in batch])).type(torch.float32)
+            }
+
         return {
             'input_data': torch.from_numpy(np.stack([x['input_data'] for x in batch])).type(torch.float32),
             'labels': torch.from_numpy(np.stack([x['labels'] for x in batch])).type(torch.float32),
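For orientation, a minimal sketch of what the Informer branch of collate_fn yields; this hand-runs the same np.stack/torch.from_numpy logic, and the shapes are illustrative, not taken from the diff:

import numpy as np
import torch

# Two hypothetical dataset items carrying the four keys the Informer branch expects
batch = [{"input_data": np.zeros((336, 7), dtype=np.float32),
          "labels": np.zeros((96, 7), dtype=np.float32),
          "timestamp_input": np.zeros((336, 4), dtype=np.float32),
          "timestamp_labels": np.zeros((96, 4), dtype=np.float32)}
         for _ in range(2)]
collated = {key: torch.from_numpy(np.stack([item[key] for item in batch])).type(torch.float32)
            for key in batch[0]}
# collated["input_data"]: (2, 336, 7); "labels": (2, 96, 7); the marks: (2, 336, 4) / (2, 96, 4)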
85 changes: 55 additions & 30 deletions ltsm/data_provider/data_factory.py
@@ -7,6 +7,7 @@
 from ltsm.data_provider.data_splitter import SplitterByTimestamp
 from ltsm.data_provider.tokenizer import processor_dict
 from ltsm.data_provider.dataset import TSDataset, TSPromptDataset, TSTokenDataset
+from ltsm.data_provider.data_loader import Dataset_Custom, Dataset_ETT_hour, Dataset_ETT_minute
 
 from typing import Tuple, List, Union, Dict
 import logging
@@ -341,37 +342,61 @@ def getDatasets(self)->Tuple[TSDataset, TSDataset, List[TSDataset]]:
         return train_ds, val_ds, test_ds_list
 
 def get_datasets(args):
-    ds_factory = DatasetFactory(
-        data_paths=args.data_path,
-        prompt_data_path=args.prompt_data_path,
-        data_processing=args.data_processing,
-        seq_len=args.seq_len,
-        pred_len=args.pred_len,
-        train_ratio=args.train_ratio,
-        val_ratio=args.val_ratio,
-        model=args.model,
-        downsample_rate=args.downsample_rate,
-        do_anomaly=args.do_anomaly
-    )
-    train_ds, val_ds, test_ds_list= ds_factory.getDatasets()
-
-    return train_ds, val_ds, test_ds_list, ds_factory.processor
+    if "LTSM" in args.model:
+        # Create datasets
+        dataset_factory = DatasetFactory(
+            data_paths=args.data_path,
+            prompt_data_path=args.prompt_data_path,
+            data_processing=args.data_processing,
+            seq_len=args.seq_len,
+            pred_len=args.pred_len,
+            train_ratio=args.train_ratio,
+            val_ratio=args.val_ratio,
+            model=args.model,
+            split_test_sets=False,
+            downsample_rate=args.downsample_rate,
+            do_anomaly=args.do_anomaly
+        )
+        train_dataset, val_dataset, test_datasets = dataset_factory.getDatasets()
+        processor = dataset_factory.processor
+    else:
+        timeenc = 0 if args.embed != 'timeF' else 1
+        Data = Dataset_Custom
+        if args.data == "ETTh1" or args.data == "ETTh2":
+            Data = Dataset_ETT_hour
+        elif args.data == "ETTm1" or args.data == "ETTm2":
+            Data = Dataset_ETT_minute
+
+        train_dataset = Data(
+            data_path=args.data_path[0],
+            split='train',
+            size=[args.seq_len, args.pred_len],
+            freq=args.freq,
+            timeenc=timeenc,
+            features=args.features
+        )
+        val_dataset = Data(
+            data_path=args.data_path[0],
+            split='val',
+            size=[args.seq_len, args.pred_len],
+            freq=args.freq,
+            timeenc=timeenc,
+            features=args.features
+        )
+        test_datasets = [Data(
+            data_path=args.data_path[0],
+            split='test',
+            size=[args.seq_len, args.pred_len],
+            freq=args.freq,
+            timeenc=timeenc,
+            features=args.features
+        )]
+        processor = train_dataset.scaler
+
+    return train_dataset, val_dataset, test_datasets, processor
 
 def get_data_loaders(args):
     # Create datasets
-    dataset_factory = DatasetFactory(
-        data_paths=args.data_path,
-        prompt_data_path=args.prompt_data_path,
-        data_processing=args.data_processing,
-        seq_len=args.seq_len,
-        pred_len=args.pred_len,
-        train_ratio=args.train_ratio,
-        val_ratio=args.val_ratio,
-        model=args.model,
-        split_test_sets=False,
-        do_anomaly=args.do_anomaly
-    )
-    train_dataset, val_dataset, test_datasets = dataset_factory.getDatasets()
+    train_dataset, val_dataset, test_datasets, processor = get_datasets(args)
     print(f"Data loaded, train size {len(train_dataset)}, val size {len(val_dataset)}")
 
     train_loader = DataLoader(
@@ -396,4 +421,4 @@ def get_data_loaders(args):
         num_workers=0,
     )
 
-    return train_loader, val_loader, test_loader, dataset_factory.processor
+    return train_loader, val_loader, test_loader, processor
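A hedged usage sketch of the new baseline branch in get_datasets; the Namespace fields mirror the arguments the code reads, while the model choice and CSV path are illustrative and the file must exist on disk:

from argparse import Namespace
from ltsm.data_provider.data_factory import get_datasets

args = Namespace(model="Informer", data="ETTh1", data_path=["dataset/ETTh1.csv"],
                 seq_len=336, pred_len=96, embed="timeF", freq="h", features="M")
# "LTSM" not in "Informer", so the Dataset_ETT_hour branch is taken
train_ds, val_ds, test_ds_list, scaler = get_datasets(args)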
62 changes: 53 additions & 9 deletions ltsm/data_provider/data_loader.py
@@ -31,14 +31,42 @@ def inverse_transform(self, data):
     def add_data(self, df):
         return self.dataset.add_data(df)
 
-    def __getitem__(self, index):
-
-        seq_x, seq_y = self.dataset.__getitem__(index)
+    def __getitem__(self, index):
+        outputs = self.dataset.__getitem__(index)
+        seq_x = outputs[0]
+        seq_y = outputs[1]
 
         return {
             "input_data": seq_x,
             "labels": seq_y
         }
 
+class HF_Timestamp_Dataset(Dataset):
+    def __init__(self, dataset):
+        super().__init__()
+        self.dataset = dataset
+
+    def __read_data__(self):
+        return self.dataset.__read_data__()
+
+    def __len__(self):
+        return self.dataset.__len__()
+
+    def inverse_transform(self, data):
+        return self.dataset.inverse_transform(data)
+
+    def add_data(self, df):
+        return self.dataset.add_data(df)
+
+    def __getitem__(self, index):
+        seq_x, seq_y, seq_x_mark, seq_y_mark = self.dataset.__getitem__(index)
+
+        return {
+            "input_data": seq_x,
+            "labels": seq_y,
+            "timestamp_input": seq_x_mark,
+            "timestamp_labels": seq_y_mark
+        }
 
 class Dataset_ETT_hour(Dataset):
     def __init__(
@@ -131,8 +159,13 @@ def __getitem__(self, index):
         s_end = s_begin + self.seq_len
         r_begin = s_end
         r_end = r_begin + self.pred_len
-        seq_x = self.data_x[s_begin:s_end, feat_id:feat_id+1]
-        seq_y = self.data_y[r_begin:r_end, feat_id:feat_id+1]
+        if self.enc_in > 1:
+            seq_x = self.data_x[s_begin:s_end]
+            seq_y = self.data_y[r_begin:r_end]
+        else:
+            seq_x = self.data_x[s_begin:s_end, feat_id:feat_id+1]
+            seq_y = self.data_y[r_begin:r_end, feat_id:feat_id+1]
+
         seq_x_mark = self.data_stamp[s_begin:s_end]
         seq_y_mark = self.data_stamp[r_begin:r_end]
 
@@ -233,8 +266,13 @@ def __getitem__(self, index):
         s_end = s_begin + self.seq_len
         r_begin = s_end
         r_end = r_begin + self.pred_len
-        seq_x = self.data_x[s_begin:s_end, feat_id:feat_id+1]
-        seq_y = self.data_y[r_begin:r_end, feat_id:feat_id+1]
+        if self.enc_in > 1:
+            seq_x = self.data_x[s_begin:s_end]
+            seq_y = self.data_y[r_begin:r_end]
+        else:
+            seq_x = self.data_x[s_begin:s_end, feat_id:feat_id+1]
+            seq_y = self.data_y[r_begin:r_end, feat_id:feat_id+1]
+
         seq_x_mark = self.data_stamp[s_begin:s_end]
         seq_y_mark = self.data_stamp[r_begin:r_end]
 
@@ -345,8 +383,13 @@ def __getitem__(self, index):
         s_end = s_begin + self.seq_len
         r_begin = s_end
         r_end = r_begin + self.pred_len
-        seq_x = self.data_x[s_begin:s_end, feat_id:feat_id+1]
-        seq_y = self.data_y[r_begin:r_end, feat_id:feat_id+1]
+        if self.enc_in > 1:
+            seq_x = self.data_x[s_begin:s_end]
+            seq_y = self.data_y[r_begin:r_end]
+        else:
+            seq_x = self.data_x[s_begin:s_end, feat_id:feat_id+1]
+            seq_y = self.data_y[r_begin:r_end, feat_id:feat_id+1]
+
         seq_x_mark = self.data_stamp[s_begin:s_end]
         seq_y_mark = self.data_stamp[r_begin:r_end]
 
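The repeated __getitem__ change above makes the loaders multivariate-aware: with enc_in > 1 they return all feature columns instead of slicing out a single channel per sample. A minimal usage sketch of the new wrapper, assuming a loader that yields (seq_x, seq_y, seq_x_mark, seq_y_mark) tuples as these do (constructor values are illustrative):

from ltsm.data_provider.data_loader import Dataset_ETT_hour, HF_Timestamp_Dataset

raw = Dataset_ETT_hour(data_path="dataset/ETTh1.csv", split="train",
                       size=[336, 96], freq="h", timeenc=1, features="M")
sample = HF_Timestamp_Dataset(raw)[0]
# dict with keys "input_data", "labels", "timestamp_input", "timestamp_labels"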
4 changes: 2 additions & 2 deletions tests/model/DLinear_test.py
@@ -20,8 +20,8 @@ def config(tmp_path):
         "test_data_path_list": [str(data_path)],
         "prompt_data_path": str(prompt_data_path),
         "enc_in": 1,
-        "seq_len": 336+133, # Equal to the sequence length + the length of prompt
-        "train_epochs": 1000,
+        "seq_len": 336,
+        "train_epochs": 100,
         "patience": 10,
         "lradj": 'TST',
         "pct_start": 0.2,
2 changes: 1 addition & 1 deletion tests/model/Informer_test.py
@@ -28,7 +28,7 @@ def config(tmp_path):
         "dropout": 0.2,
         "fc_dropout": 0.2,
         "head_dropout": 0,
-        "seq_len": 336+133, # Equal to the sequence length + the length of prompt
+        "seq_len": 336,
         "patch_len": 16,
         "stride": 8,
         "des": 'Exp',
2 changes: 1 addition & 1 deletion tests/model/PatchTST_test.py
@@ -27,7 +27,7 @@ def config(tmp_path):
         "dropout": 0.2,
         "fc_dropout": 0.2,
         "head_dropout": 0,
-        "seq_len": 336+133, # Equal to the sequence length + the length of prompt
+        "seq_len": 336,
         "patch_len": 16,
         "stride": 8,
         "des": 'Exp',
16 changes: 16 additions & 0 deletions tests/test_scripts/dlinear.json
@@ -0,0 +1,16 @@
+{
+    "model": "DLinear",
+    "model_name_or_path": "gpt2-medium",
+    "pred_len": 96,
+    "gradient_accumulation_steps": 64,
+    "seq_len": 336,
+    "des": "Exp",
+    "train_epochs": 100,
+    "freeze": 0,
+    "itr": 1,
+    "learning_rate": 1e-3,
+    "downsample_rate": 20,
+    "output_dir": "output/dlinear/",
+    "eval": 0,
+    "features": "M"
+}
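A small sketch of how get_args() consumes this file; the override loop mirrors the diff above, the path is relative to the repo root, and the CLI value here is deliberately contradicted by the JSON to show precedence:

import argparse
import json

parser = argparse.ArgumentParser(description='LTSM')
parser.add_argument('--config', type=str, help='Path to JSON configuration file')
parser.add_argument('--model', type=str, default='model')
args, _ = parser.parse_known_args(['--config', 'tests/test_scripts/dlinear.json', '--model', 'PatchTST'])

# Mirrors get_args(): JSON values overwrite whatever the CLI supplied
with open(args.config, 'r') as f:
    for key, value in json.load(f).items():
        setattr(args, key, value)

print(args.model, args.seq_len, args.pred_len)  # DLinear 336 96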
