transformerVSxgboost时间序列任务

transformer结构在近几年可谓大放异彩,这几天正好有个时间序列预测的小场景,所以动手试了一下transformer的时序预测能力。同时,为了作为对比,也使用了xgboost树模型对预测效果进行了对比。

数据准备

在数据方面,手动生成了具有一定周期和趋势的一维序列数据,数据形态和代码如下:

logo

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import numpy as np
np.random.seed(1)
def generate_data(sequence_length = 2000):
# 生成时间序列数据和目标值
feature_dim = 3 # 决定序列值的维度
features = np.zeros((sequence_length, feature_dim))
targets = np.zeros(sequence_length)
# 生成特征数据和目标值
for i in range(sequence_length):
# 生成特征数据
features[i, 0] = np.sin(0.1 * i) # 特征1:正弦函数
features[i, 1] = 0.5 * np.random.randn() + 2 # 特征2:高斯噪声偏移
features[i, 2] = i/200 # 特征3:趋势
# 生成目标值
targets[i] = 0.7 * features[i, 0] + 0.2 * features[i, 1] + 0.1 * features[i, 2]
return targets
data = generate_data(10000)

transformer时间序列模型

本文使用的transformer预测时序任务架构参考的是 https://arxiv.org/abs/2001.08317, 整体结构如下:

logo

由于我们的数据是时序的,因此训练的时候需要对src和tgt做mask,同时tgt的输入和输出是一对一的关系。
在做推理时,我们往往只知道需要预测的tgt序列当中最早的一个数据,因此我们需要循环着做推理,即把上一时刻的预测值加入到tgt输入中去预测下一时刻的值,再把下一时刻的值加入tgt如此循环。

最终transformer时间序列预测的效果图如下,预测值mae大概0.122:
logo

xgboost时间序列模型

作为对比,使用了传统的提升树模型xgboost训练了同样一组数据,简单的做了一些调参,预测值mae大概0.124,
指标上看xgboost只比transformer模型弱了一点点,但是从可视化的图片来看,tranformer模型的预测结果方差要比xgboost小一些,从数据上看也是这样,transformer模型的预测误差方差为0.084,而xgboost预测误差方差为0.106。
logo

具体实现代码如下:

TimeSeriesTransformer类:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
class TimeSeriesTransformer(nn.Module):
def __init__(self,
input_size: int,
dec_seq_len: int,
batch_first: bool,
out_seq_len: int = 58,
dim_val: int = 512,
n_encoder_layers: int = 4,
n_decoder_layers: int = 4,
n_heads: int = 8,
dropout_encoder: float = 0.2,
dropout_decoder: float = 0.2,
dropout_pos_enc: float = 0.1,
dim_feedforward_encoder: int = 2048,
dim_feedforward_decoder: int = 2048,
num_predicted_features: int = 1
):
"""
Args:
input_size: int, number of input variables. 1 if univariate.
dec_seq_len: int, the length of the input sequence fed to the decoder
dim_val: int, aka d_model. All sub-layers in the model produce
outputs of dimension dim_val
n_encoder_layers: int, number of stacked encoder layers in the encoder
n_decoder_layers: int, number of stacked encoder layers in the decoder
n_heads: int, the number of attention heads (aka parallel attention layers)
dropout_encoder: float, the dropout rate of the encoder
dropout_decoder: float, the dropout rate of the decoder
dropout_pos_enc: float, the dropout rate of the positional encoder
dim_feedforward_encoder: int, number of neurons in the linear layer
of the encoder
dim_feedforward_decoder: int, number of neurons in the linear layer
of the decoder
num_predicted_features: int, the number of features you want to predict.
Most of the time, this will be 1 because we're
only forecasting FCR-N prices in DK2, but in
we wanted to also predict FCR-D with the same
model, num_predicted_features should be 2.
"""
super().__init__()
self.dec_seq_len = dec_seq_len
self.encoder_input_layer = nn.Linear(
in_features=input_size,
out_features=dim_val
)
self.decoder_input_layer = nn.Linear(
in_features=num_predicted_features,
out_features=dim_val
)
self.linear_mapping = nn.Linear(
in_features=dim_val,
out_features=num_predicted_features
)
# Create positional encoder
self.positional_encoding_layer = PositionalEncoder(
d_model=dim_val,
dropout=dropout_pos_enc
)
# The encoder layer used in the paper is identical to the one used by
# Vaswani et al (2017) on which the PyTorch module is based.
encoder_layer = nn.TransformerEncoderLayer(
d_model=dim_val,
nhead=n_heads,
dim_feedforward=dim_feedforward_encoder,
dropout=dropout_encoder,
batch_first=batch_first
)
self.encoder = nn.TransformerEncoder(
encoder_layer=encoder_layer,
num_layers=n_encoder_layers,
norm=None
)
decoder_layer = nn.TransformerDecoderLayer(
d_model=dim_val,
nhead=n_heads,
dim_feedforward=dim_feedforward_decoder,
dropout=dropout_decoder,
batch_first=batch_first
)
self.decoder = nn.TransformerDecoder(
decoder_layer=decoder_layer,
num_layers=n_decoder_layers,
norm=None
)
def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor = None,
tgt_mask: Tensor = None) -> Tensor:
"""
Returns a tensor of shape:
[target_sequence_length, batch_size, num_predicted_features]
Args:
src: the encoder's output sequence. Shape: (S,E) for unbatched input,
(S, N, E) if batch_first=False or (N, S, E) if
batch_first=True, where S is the source sequence length,
N is the batch size, and E is the number of features (1 if univariate)
tgt: the sequence to the decoder. Shape: (T,E) for unbatched input,
(T, N, E)(T,N,E) if batch_first=False or (N, T, E) if
batch_first=True, where T is the target sequence length,
N is the batch size, and E is the number of features (1 if univariate)
src_mask: the mask for the src sequence to prevent the model from
using data points from the target sequence
tgt_mask: the mask for the tgt sequence to prevent the model from
using data points from the target sequence
"""
# Pass throguh the input layer right before the encoder
src = self.encoder_input_layer(
src) # src shape: [batch_size, src length, dim_val] regardless of number of input features
# Pass through the positional encoding layer
src = self.positional_encoding_layer(
src) # src shape: [batch_size, src length, dim_val] regardless of number of input features
src = self.encoder( # src shape: [batch_size, enc_seq_len, dim_val]
src=src
)
# print("From model.forward(): Size of src after encoder: {}".format(src.size()))
# Pass decoder input through decoder input layer
decoder_output = self.decoder_input_layer(
tgt) # src shape: [target sequence length, batch_size, dim_val] regardless of number of input features
# Pass throguh decoder - output shape: [batch_size, target seq len, dim_val]
decoder_output = self.decoder(
tgt=decoder_output,
memory=src,
tgt_mask=tgt_mask,
memory_mask=src_mask
)
# Pass through linear mapping
decoder_output = self.linear_mapping(decoder_output) # shape [batch_size, target seq len]
return decoder_output

TransformerDataset类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class TransformerDataset(Dataset):
"""
Dataset class used for transformer models.
"""
def __init__(self,
data: torch.tensor,
indices: list,
enc_seq_len: int,
dec_seq_len: int,
target_seq_len: int
) -> None:
"""
Args:
data: tensor, the entire train, validation or test data sequence
before any slicing. If univariate, data.size() will be
[number of samples, number of variables]
where the number of variables will be equal to 1 + the number of
exogenous variables. Number of exogenous variables would be 0
if univariate.
indices: a list of tuples. Each tuple has two elements:
1) the start index of a sub-sequence
2) the end index of a sub-sequence.
The sub-sequence is split into src, trg and trg_y later.
enc_seq_len: int, the desired length of the input sequence given to the
the first layer of the transformer model.
"""
super().__init__()
self.indices = indices
self.data = data.astype(np.double)
self.enc_seq_len = enc_seq_len
self.dec_seq_len = dec_seq_len
self.target_seq_len = target_seq_len
def __len__(self):
return len(self.indices)
def __getitem__(self, index):
"""
Returns a tuple with 3 elements:
1) src (the encoder input)
2) trg (the decoder input)
3) trg_y (the target)
"""
# Get the first element of the i'th tuple in the list self.indicesasdfas
start_idx = self.indices[index][0]
# Get the second (and last) element of the i'th tuple in the list self.indices
end_idx = self.indices[index][1]
sequence = self.data[start_idx:end_idx]
# print("From __getitem__: sequence length = {}".format(len(sequence)))
src, trg, trg_y = self.get_src_trg(
sequence=sequence,
enc_seq_len=self.enc_seq_len,
dec_seq_len=self.dec_seq_len,
target_seq_len=self.target_seq_len
)
return src, trg, trg_y
def get_src_trg(
self,
sequence: torch.Tensor,
enc_seq_len: int,
dec_seq_len: int,
target_seq_len: int
) -> Tuple[torch.tensor, torch.tensor, torch.tensor]:
"""
Generate the src (encoder input), trg (decoder input) and trg_y (the target)
sequences from a sequence.
Args:
sequence: tensor, a 1D tensor of length n where
n = encoder input length + target sequence length
enc_seq_len: int, the desired length of the input to the transformer encoder
target_seq_len: int, the desired length of the target sequence (the
one against which the model output is compared)
Return:
src: tensor, 1D, used as input to the transformer model
trg: tensor, 1D, used as input to the transformer model
trg_y: tensor, 1D, the target sequence against which the model output
is compared when computing loss.
"""
assert len(
sequence) == enc_seq_len + target_seq_len, "Sequence length does not equal (input length + target length)"
# encoder input
src = sequence[:enc_seq_len]
# decoder input. As per the paper, it must have the same dimension as the
# target sequence, and it must contain the last value of src, and all
# values of trg_y except the last (i.e. it must be shifted right by 1)
trg = sequence[enc_seq_len - 1:len(sequence) - 1]
assert len(trg) == target_seq_len, "Length of trg does not match target sequence length"
# The target sequence against which the model output will be compared to compute loss
trg_y = sequence[-target_seq_len:]
assert len(trg_y) == target_seq_len, "Length of trg_y does not match target sequence length"
return src, trg, trg_y

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
class PositionalEncoder(nn.Module):
def __init__(
self,
dropout: float = 0.1,
max_seq_len: int = 5000,
d_model: int = 512,
batch_first: bool = False
):
"""
Parameters:
dropout: the dropout rate
max_seq_len: the maximum length of the input sequences
d_model: The dimension of the output of sub-layers in the model
(Vaswani et al, 2017)
"""
super().__init__()
self.d_model = d_model
self.dropout = nn.Dropout(p=dropout)
self.batch_first = batch_first
self.x_dim = 1 if batch_first else 0
# copy pasted from PyTorch tutorial
position = torch.arange(max_seq_len).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
pe = torch.zeros(max_seq_len, 1, d_model)
pe[:, 0, 0::2] = torch.sin(position * div_term)
pe[:, 0, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe)
def forward(self, x: Tensor) -> Tensor:
"""
Args:
x: Tensor, shape [batch_size, enc_seq_len, dim_val] or
[enc_seq_len, batch_size, dim_val]
"""
x = x + self.pe[:x.size(self.x_dim)]
return self.dropout(x)

训练和推理代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
def run_encoder_decoder_inference(
model: nn.Module,
src: torch.Tensor,
forecast_window: int,
batch_size: int,
batch_first: bool = False
) -> torch.Tensor:
"""
NB! This function is currently only tested on models that work with
batch_first = False
This function is for encoder-decoder type models in which the decoder requires
an input, tgt, which - during training - is the target sequence. During inference,
the values of tgt are unknown, and the values therefore have to be generated
iteratively.
This function returns a prediction of length forecast_window for each batch in src
NB! If you want the inference to be done without gradient calculation,
make sure to call this function inside the context manager torch.no_grad like:
with torch.no_grad:
run_encoder_decoder_inference()
The context manager is intentionally not called inside this function to make
it usable in cases where the function is used to compute loss that must be
backpropagated during training and gradient calculation hence is required.
If use_predicted_tgt = True:
To begin with, tgt is equal to the last value of src. Then, the last element
in the model's prediction is iteratively concatenated with tgt, such that
at each step in the for-loop, tgt's size increases by 1. Finally, tgt will
have the correct length (target sequence length) and the final prediction
will be produced and returned.
Args:
model: An encoder-decoder type model where the decoder requires
target values as input. Should be set to evaluation mode before
passed to this function.
src: The input to the model
forecast_horizon: The desired length of the model's output, e.g. 58 if you
want to predict the next 58 hours of FCR prices.
batch_size: batch size
batch_first: If true, the shape of the model input should be
[batch size, input sequence length, number of features].
If false, [input sequence length, batch size, number of features]
"""
# Dimension of a batched model input that contains the target sequence values
target_seq_dim = 0 if batch_first == False else 1
# Take the last value of thetarget variable in all batches in src and make it tgt
# as per the Influenza paper
tgt = src[-1, :, 0] if batch_first == False else src[:, -1, 0] # shape [1, batch_size, 1]
# Change shape from [batch_size] to [1, batch_size, 1]
if batch_size == 1:
tgt = tgt.unsqueeze(0).unsqueeze(0) # change from [1] to [1, 1, 1]
# Iteratively concatenate tgt with the first element in the prediction
for _ in range(forecast_window - 1):
# Create masks
dim_a = tgt.shape[1] if batch_first == True else tgt.shape[0]
dim_b = src.shape[1] if batch_first == True else src.shape[0]
tgt_mask = generate_square_subsequent_mask(
dim1=dim_a,
dim2=dim_a,
)
src_mask = generate_square_subsequent_mask(
dim1=dim_a,
dim2=dim_b,
)
tgt_mask = tgt_mask.to("cuda:0")
src_mask = src_mask.to("cuda:0")
#print(src.device, tgt.device, src_mask.device, tgt_mask.device)
# Make prediction
prediction = model(src, tgt, src_mask, tgt_mask)
# If statement simply makes sure that the predicted value is
# extracted and reshaped correctly
if batch_first == False:
# Obtain the predicted value at t+1 where t is the last time step
# represented in tgt
last_predicted_value = prediction[-1, :, :]
# Reshape from [batch_size, 1] --> [1, batch_size, 1]
last_predicted_value = last_predicted_value.unsqueeze(0)
else:
# Obtain predicted value
last_predicted_value = prediction[:, -1, :]
# Reshape from [batch_size, 1] --> [batch_size, 1, 1]
last_predicted_value = last_predicted_value.unsqueeze(-1)
# Detach the predicted element from the graph and concatenate with
# tgt in dimension 1 or 0
tgt = torch.cat((tgt, last_predicted_value.detach()), target_seq_dim)
# Create masks
dim_a = tgt.shape[1] if batch_first == True else tgt.shape[0]
dim_b = src.shape[1] if batch_first == True else src.shape[0]
tgt_mask = generate_square_subsequent_mask(
dim1=dim_a,
dim2=dim_a,
)
src_mask = generate_square_subsequent_mask(
dim1=dim_a,
dim2=dim_b,
)
tgt_mask = tgt_mask.to("cuda:0")
src_mask = src_mask.to("cuda:0")
# Make final prediction
final_prediction = model(src, tgt, src_mask, tgt_mask)
return final_prediction
# 构造时间序列数据
forecast_window = 60
enc_seq_len = 120
epochs = 70
train_indices = [(i, i+enc_seq_len+forecast_window) for i in range(0, 9000, 1)]# 生成1000条样本 9170
train_dataset = TransformerDataset(data, indices=train_indices, enc_seq_len=enc_seq_len, dec_seq_len=forecast_window, target_seq_len=forecast_window)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_indices = [(i, i+enc_seq_len+forecast_window) for i in range(9180, 9181, 5)]# 生成1000条样本
val_dataset = TransformerDataset(data, indices=val_indices, enc_seq_len=enc_seq_len, dec_seq_len=forecast_window, target_seq_len=forecast_window)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)
## Model parameters
dim_val = 56 # This can be any value divisible by n_heads. 512 is used in the original transformer paper.
n_heads = 2 # The number of attention heads (aka parallel attention layers). dim_val must be divisible by this number
n_decoder_layers = 2 # Number of times the decoder layer is stacked in the decoder
n_encoder_layers = 2 # Number of times the encoder layer is stacked in the encoder
input_size = 1 # The number of input variables. 1 if univariate forecasting.
model = TimeSeriesTransformer(
input_size=input_size,
dim_val=dim_val,
batch_first=True,
dec_seq_len=forecast_window,
out_seq_len=forecast_window,
n_decoder_layers=n_decoder_layers,
n_encoder_layers=n_encoder_layers,
n_heads=n_heads)
model.to(device="cuda:0")
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.MSELoss()
# Iterate over all epochs
for epoch in range(epochs):
train_loss = 0.0
# Iterate over all (x,y) pairs in training dataloader
if epoch>0:
for i, (src, tgt, tgt_y) in enumerate(train_dataloader):
src = src.unsqueeze(-1)
src = src.to(torch.float32)
src = src.to("cuda:0")
tgt = tgt.unsqueeze(-1)
tgt = tgt.to(torch.float32)
tgt = tgt.to("cuda:0")
tgt_y = tgt_y.to(torch.float32)
tgt_y = tgt_y.to("cuda:0")
# zero the parameter gradients
optimizer.zero_grad()
# Generate masks
tgt_mask = generate_square_subsequent_mask(
dim1=forecast_window,
dim2=forecast_window
)
src_mask = generate_square_subsequent_mask(
dim1=forecast_window,
dim2=enc_seq_len
)
tgt_mask = tgt_mask.to("cuda:0")
src_mask = src_mask.to("cuda:0")
# Make forecasts
prediction = model(src, tgt, src_mask, tgt_mask)
prediction = prediction.squeeze(-1)
# Compute and backprop loss
loss = criterion(tgt_y, prediction)
loss.backward()
# Take optimizer step
optimizer.step()
train_loss += loss.item() * src.size(0)
print("Epoch: {} Loss: {}".format(epoch, train_loss / len(train_dataloader.dataset)))
# Iterate over all (x,y) pairs in validation dataloader
model.eval()
with torch.no_grad():
for i, (src, _, tgt_y) in enumerate(val_dataloader):
src = src.unsqueeze(-1)
src = src.to(torch.float32)
src = src.to("cuda:0")
tgt_y = tgt_y.to(torch.float32)
tgt_y = tgt_y.to("cuda:0")
#print(model,src)
prediction = run_encoder_decoder_inference(
model=model,
src=src,
forecast_window=forecast_window,
batch_size=src.shape[0],
batch_first=True,
)
loss = criterion(tgt_y, prediction)
#print(f"tgt_y:{tgt_y},prediction:{prediction}")
print(f"val loss:{loss}")
break

xgboost训练代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# 构造时间序列数据
data = generate_data(10000)
forecast_window = 60
enc_seq_len = 120
train_indices = [(i, i+enc_seq_len+forecast_window) for i in range(0, 9000, 1)]# 生成1000条样本 9170
val_indices = [(i, i+enc_seq_len+forecast_window) for i in range(9180, 9181, 5)]# 生成1000条样本
train_x_list = []
train_y_list = []
for (s,e) in train_indices:
train_x_list.append(data[s:s+enc_seq_len])
train_y_list.append(data[e-forecast_window:e])
test_x_list = []
test_y_list = []
for (s,e) in val_indices:
test_x_list.append(data[s:s+enc_seq_len])
test_y_list.append(data[e-forecast_window:e])
train_x = np.stack(train_x_list)
train_y = np.stack(train_y_list)
test_x = np.stack(test_x_list)
test_y = np.stack(test_y_list)
import xgboost as xgb
params = {
#'booster': 'gbtree',
'objective': 'reg:squarederror',
'tree_method': 'hist',
'eval_metric':'mae',
#'gamma': 0.02,
'max_depth': 4,
#'subsample': 0.8,
#'colsample_bytree': 0.8,
#'scale_pos_weights': 1,
#'min_child_weight': 4,
'silent': 0,
'eta': 0.015,
}
dtrain = xgb.DMatrix(train_x, label=train_y[:,0])
dval = xgb.DMatrix(test_x, label=test_y[:,0])
evallist = [(dtrain,'train'),(dval, 'eval')]
xgb_model = xgb.train(params, dtrain, num_boost_round=500,
early_stopping_rounds=60, evals=evallist)
def inference(model,test_x,test_y):
pred_y = []
fea = test_x[0][:].reshape(1,-1)
for i in range(len(test_y[0])):
_y = model.predict(xgb.DMatrix(fea[:,-120:]))[0]
pred_y.append(_y)
fea = np.append(fea,np.array([[_y]]),axis=1)
#print(_y,fea[:,-120:])
return pred_y
pred_y = inference(xgb_model,test_x,test_y)
def plot_curves(x, y1, y2, y3):
plt.plot(x, np.concatenate([y1,y2]), label='predict',color='red')
plt.plot(x, np.concatenate([y1,y3]), label='gt',color='yellow')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('xgboost mae:0.124')
plt.legend()
plt.show()
plot_curves(np.array([i for i in range(180)]),test_x[0],pred_y,test_y[0])