transformer结构在近几年可谓大放异彩，这几天正好有个时间序列预测的小场景，所以动手试了一下transformer的时序预测能力。同时，为了作为对比，也使用了xgboost树模型对预测效果进行了对比。

数据准备

在数据方面，手动生成了具有一定周期和趋势的一维序列数据，数据形态和代码如下：

logo

import numpy as np
np.random.seed(1)
def generate_data(sequence_length = 2000):
    # 生成时间序列数据和目标值
    feature_dim = 3  # 决定序列值的维度
    features = np.zeros((sequence_length, feature_dim))
    targets = np.zeros(sequence_length)
    # 生成特征数据和目标值
    for i in range(sequence_length):
        # 生成特征数据
        features[i, 0] = np.sin(0.1 * i)  # 特征1：正弦函数
        features[i, 1] = 0.5 * np.random.randn() + 2  # 特征2：高斯噪声偏移
        features[i, 2] = i/200 # 特征3：趋势
        # 生成目标值
        targets[i] = 0.7 * features[i, 0] + 0.2 * features[i, 1] + 0.1 * features[i, 2]
    return targets
data = generate_data(10000)

transformer时间序列模型

本文使用的transformer预测时序任务架构参考的是 https://arxiv.org/abs/2001.08317, 整体结构如下：

logo

由于我们的数据是时序的，因此训练的时候需要对src和tgt做mask，同时tgt的输入和输出是一对一的关系。
在做推理时，我们往往只知道需要预测的tgt序列当中最早的一个数据，因此我们需要循环着做推理，即把上一时刻的预测值加入到tgt输入中去预测下一时刻的值，再把下一时刻的值加入tgt如此循环。

最终transformer时间序列预测的效果图如下,预测值mae大概0.122:
logo

xgboost时间序列模型

作为对比，使用了传统的提升树模型xgboost训练了同样一组数据，简单的做了一些调参,预测值mae大概0.124,
指标上看xgboost只比transformer模型弱了一点点，但是从可视化的图片来看，tranformer模型的预测结果方差要比xgboost小一些,从数据上看也是这样，transformer模型的预测误差方差为0.084，而xgboost预测误差方差为0.106。
logo

具体实现代码如下：

TimeSeriesTransformer类:


class TimeSeriesTransformer(nn.Module):
    def __init__(self,
                 input_size: int,
                 dec_seq_len: int,
                 batch_first: bool,
                 out_seq_len: int = 58,
                 dim_val: int = 512,
                 n_encoder_layers: int = 4,
                 n_decoder_layers: int = 4,
                 n_heads: int = 8,
                 dropout_encoder: float = 0.2,
                 dropout_decoder: float = 0.2,
                 dropout_pos_enc: float = 0.1,
                 dim_feedforward_encoder: int = 2048,
                 dim_feedforward_decoder: int = 2048,
                 num_predicted_features: int = 1
                 ):
        """
        Args:
            input_size: int, number of input variables. 1 if univariate.
            dec_seq_len: int, the length of the input sequence fed to the decoder
            dim_val: int, aka d_model. All sub-layers in the model produce
                     outputs of dimension dim_val
            n_encoder_layers: int, number of stacked encoder layers in the encoder
            n_decoder_layers: int, number of stacked encoder layers in the decoder
            n_heads: int, the number of attention heads (aka parallel attention layers)
            dropout_encoder: float, the dropout rate of the encoder
            dropout_decoder: float, the dropout rate of the decoder
            dropout_pos_enc: float, the dropout rate of the positional encoder
            dim_feedforward_encoder: int, number of neurons in the linear layer
                                     of the encoder
            dim_feedforward_decoder: int, number of neurons in the linear layer
                                     of the decoder
            num_predicted_features: int, the number of features you want to predict.
                                    Most of the time, this will be 1 because we're
                                    only forecasting FCR-N prices in DK2, but in
                                    we wanted to also predict FCR-D with the same
                                    model, num_predicted_features should be 2.
        """
        super().__init__()
        self.dec_seq_len = dec_seq_len
        self.encoder_input_layer = nn.Linear(
            in_features=input_size,
            out_features=dim_val
        )
        self.decoder_input_layer = nn.Linear(
            in_features=num_predicted_features,
            out_features=dim_val
        )
        self.linear_mapping = nn.Linear(
            in_features=dim_val,
            out_features=num_predicted_features
        )
        # Create positional encoder
        self.positional_encoding_layer = PositionalEncoder(
            d_model=dim_val,
            dropout=dropout_pos_enc
        )
        # The encoder layer used in the paper is identical to the one used by
        # Vaswani et al (2017) on which the PyTorch module is based.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout_encoder,
            batch_first=batch_first
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=n_encoder_layers,
            norm=None
        )
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=dim_val,
            nhead=n_heads,
            dim_feedforward=dim_feedforward_decoder,
            dropout=dropout_decoder,
            batch_first=batch_first
        )
        self.decoder = nn.TransformerDecoder(
            decoder_layer=decoder_layer,
            num_layers=n_decoder_layers,
            norm=None
        )
    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor = None,
                tgt_mask: Tensor = None) -> Tensor:
        """
        Returns a tensor of shape:
        [target_sequence_length, batch_size, num_predicted_features]
        Args:
            src: the encoder's output sequence. Shape: (S,E) for unbatched input,
                 (S, N, E) if batch_first=False or (N, S, E) if
                 batch_first=True, where S is the source sequence length,
                 N is the batch size, and E is the number of features (1 if univariate)
            tgt: the sequence to the decoder. Shape: (T,E) for unbatched input,
                 (T, N, E)(T,N,E) if batch_first=False or (N, T, E) if
                 batch_first=True, where T is the target sequence length,
                 N is the batch size, and E is the number of features (1 if univariate)
            src_mask: the mask for the src sequence to prevent the model from
                      using data points from the target sequence
            tgt_mask: the mask for the tgt sequence to prevent the model from
                      using data points from the target sequence
        """
        # Pass throguh the input layer right before the encoder
        src = self.encoder_input_layer(
            src)  # src shape: [batch_size, src length, dim_val] regardless of number of input features
        # Pass through the positional encoding layer
        src = self.positional_encoding_layer(
            src)  # src shape: [batch_size, src length, dim_val] regardless of number of input features
        src = self.encoder(  # src shape: [batch_size, enc_seq_len, dim_val]
            src=src
        )
        # print("From model.forward(): Size of src after encoder: {}".format(src.size()))
        # Pass decoder input through decoder input layer
        decoder_output = self.decoder_input_layer(
            tgt)  # src shape: [target sequence length, batch_size, dim_val] regardless of number of input features
        # Pass throguh decoder - output shape: [batch_size, target seq len, dim_val]
        decoder_output = self.decoder(
            tgt=decoder_output,
            memory=src,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
        )
        # Pass through linear mapping
        decoder_output = self.linear_mapping(decoder_output)  # shape [batch_size, target seq len]
        return decoder_output

TransformerDataset类

class TransformerDataset(Dataset):
    """
    Dataset class used for transformer models.
    """
    def __init__(self,
                 data: torch.tensor,
                 indices: list,
                 enc_seq_len: int,
                 dec_seq_len: int,
                 target_seq_len: int
                 ) -> None:
        """
        Args:
            data: tensor, the entire train, validation or test data sequence
                        before any slicing. If univariate, data.size() will be
                        [number of samples, number of variables]
                        where the number of variables will be equal to 1 + the number of
                        exogenous variables. Number of exogenous variables would be 0
                        if univariate.
            indices: a list of tuples. Each tuple has two elements:
                     1) the start index of a sub-sequence
                     2) the end index of a sub-sequence.
                     The sub-sequence is split into src, trg and trg_y later.
            enc_seq_len: int, the desired length of the input sequence given to the
                     the first layer of the transformer model.
        """
        super().__init__()
        self.indices = indices
        self.data = data.astype(np.double)
        self.enc_seq_len = enc_seq_len
        self.dec_seq_len = dec_seq_len
        self.target_seq_len = target_seq_len
    def __len__(self):
        return len(self.indices)
    def __getitem__(self, index):
        """
        Returns a tuple with 3 elements:
        1) src (the encoder input)
        2) trg (the decoder input)
        3) trg_y (the target)
        """
        # Get the first element of the i'th tuple in the list self.indicesasdfas
        start_idx = self.indices[index][0]
        # Get the second (and last) element of the i'th tuple in the list self.indices
        end_idx = self.indices[index][1]
        sequence = self.data[start_idx:end_idx]
        # print("From __getitem__: sequence length = {}".format(len(sequence)))
        src, trg, trg_y = self.get_src_trg(
            sequence=sequence,
            enc_seq_len=self.enc_seq_len,
            dec_seq_len=self.dec_seq_len,
            target_seq_len=self.target_seq_len
        )
        return src, trg, trg_y
    def get_src_trg(
            self,
            sequence: torch.Tensor,
            enc_seq_len: int,
            dec_seq_len: int,
            target_seq_len: int
    ) -> Tuple[torch.tensor, torch.tensor, torch.tensor]:
        """
        Generate the src (encoder input), trg (decoder input) and trg_y (the target)
        sequences from a sequence.
        Args:
            sequence: tensor, a 1D tensor of length n where
                    n = encoder input length + target sequence length
            enc_seq_len: int, the desired length of the input to the transformer encoder
            target_seq_len: int, the desired length of the target sequence (the
                            one against which the model output is compared)
        Return:
            src: tensor, 1D, used as input to the transformer model
            trg: tensor, 1D, used as input to the transformer model
            trg_y: tensor, 1D, the target sequence against which the model output
                is compared when computing loss.
        """
        assert len(
            sequence) == enc_seq_len + target_seq_len, "Sequence length does not equal (input length + target length)"
        # encoder input
        src = sequence[:enc_seq_len]
        # decoder input. As per the paper, it must have the same dimension as the
        # target sequence, and it must contain the last value of src, and all
        # values of trg_y except the last (i.e. it must be shifted right by 1)
        trg = sequence[enc_seq_len - 1:len(sequence) - 1]
        assert len(trg) == target_seq_len, "Length of trg does not match target sequence length"
        # The target sequence against which the model output will be compared to compute loss
        trg_y = sequence[-target_seq_len:]
        assert len(trg_y) == target_seq_len, "Length of trg_y does not match target sequence length"
        return src, trg, trg_y

class PositionalEncoder(nn.Module):
    def __init__(
            self,
            dropout: float = 0.1,
            max_seq_len: int = 5000,
            d_model: int = 512,
            batch_first: bool = False
    ):
        """
        Parameters:
            dropout: the dropout rate
            max_seq_len: the maximum length of the input sequences
            d_model: The dimension of the output of sub-layers in the model
                     (Vaswani et al, 2017)
        """
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first
        self.x_dim = 1 if batch_first else 0
        # copy pasted from PyTorch tutorial
        position = torch.arange(max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_seq_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, enc_seq_len, dim_val] or
               [enc_seq_len, batch_size, dim_val]
        """
        x = x + self.pe[:x.size(self.x_dim)]
        return self.dropout(x)

训练和推理代码

def run_encoder_decoder_inference(
        model: nn.Module,
        src: torch.Tensor,
        forecast_window: int,
        batch_size: int,
        batch_first: bool = False
) -> torch.Tensor:
    """
    NB! This function is currently only tested on models that work with
    batch_first = False
    This function is for encoder-decoder type models in which the decoder requires
    an input, tgt, which - during training - is the target sequence. During inference,
    the values of tgt are unknown, and the values therefore have to be generated
    iteratively.
    This function returns a prediction of length forecast_window for each batch in src
    NB! If you want the inference to be done without gradient calculation,
    make sure to call this function inside the context manager torch.no_grad like:
    with torch.no_grad:
        run_encoder_decoder_inference()
    The context manager is intentionally not called inside this function to make
    it usable in cases where the function is used to compute loss that must be
    backpropagated during training and gradient calculation hence is required.
    If use_predicted_tgt = True:
    To begin with, tgt is equal to the last value of src. Then, the last element
    in the model's prediction is iteratively concatenated with tgt, such that
    at each step in the for-loop, tgt's size increases by 1. Finally, tgt will
    have the correct length (target sequence length) and the final prediction
    will be produced and returned.
    Args:
        model: An encoder-decoder type model where the decoder requires
               target values as input. Should be set to evaluation mode before
               passed to this function.
        src: The input to the model
        forecast_horizon: The desired length of the model's output, e.g. 58 if you
                         want to predict the next 58 hours of FCR prices.
        batch_size: batch size
        batch_first: If true, the shape of the model input should be
                     [batch size, input sequence length, number of features].
                     If false, [input sequence length, batch size, number of features]
    """
    # Dimension of a batched model input that contains the target sequence values
    target_seq_dim = 0 if batch_first == False else 1
    # Take the last value of thetarget variable in all batches in src and make it tgt
    # as per the Influenza paper
    tgt = src[-1, :, 0] if batch_first == False else src[:, -1, 0]  # shape [1, batch_size, 1]
    # Change shape from [batch_size] to [1, batch_size, 1]
    if batch_size == 1:
        tgt = tgt.unsqueeze(0).unsqueeze(0)  # change from [1] to [1, 1, 1]
    # Iteratively concatenate tgt with the first element in the prediction
    for _ in range(forecast_window - 1):
        # Create masks
        dim_a = tgt.shape[1] if batch_first == True else tgt.shape[0]
        dim_b = src.shape[1] if batch_first == True else src.shape[0]
        tgt_mask = generate_square_subsequent_mask(
            dim1=dim_a,
            dim2=dim_a,
        )
        src_mask = generate_square_subsequent_mask(
            dim1=dim_a,
            dim2=dim_b,
        )
        
        tgt_mask = tgt_mask.to("cuda:0")
        src_mask = src_mask.to("cuda:0")
        #print(src.device, tgt.device, src_mask.device, tgt_mask.device)
        # Make prediction
        
        prediction = model(src, tgt, src_mask, tgt_mask)
        # If statement simply makes sure that the predicted value is
        # extracted and reshaped correctly
        if batch_first == False:
            # Obtain the predicted value at t+1 where t is the last time step
            # represented in tgt
            last_predicted_value = prediction[-1, :, :]
            # Reshape from [batch_size, 1] --> [1, batch_size, 1]
            last_predicted_value = last_predicted_value.unsqueeze(0)
        else:
            # Obtain predicted value
            last_predicted_value = prediction[:, -1, :]
            # Reshape from [batch_size, 1] --> [batch_size, 1, 1]
            last_predicted_value = last_predicted_value.unsqueeze(-1)
        # Detach the predicted element from the graph and concatenate with
        # tgt in dimension 1 or 0
        tgt = torch.cat((tgt, last_predicted_value.detach()), target_seq_dim)
    # Create masks
    dim_a = tgt.shape[1] if batch_first == True else tgt.shape[0]
    dim_b = src.shape[1] if batch_first == True else src.shape[0]
    tgt_mask = generate_square_subsequent_mask(
        dim1=dim_a,
        dim2=dim_a,
    )
    src_mask = generate_square_subsequent_mask(
        dim1=dim_a,
        dim2=dim_b,
    )
    tgt_mask = tgt_mask.to("cuda:0")
    src_mask = src_mask.to("cuda:0")
    # Make final prediction
    final_prediction = model(src, tgt, src_mask, tgt_mask)
    return final_prediction
# 构造时间序列数据
forecast_window = 60
enc_seq_len = 120
epochs = 70
train_indices = [(i, i+enc_seq_len+forecast_window) for i in range(0, 9000, 1)]# 生成1000条样本 9170
train_dataset = TransformerDataset(data, indices=train_indices, enc_seq_len=enc_seq_len, dec_seq_len=forecast_window, target_seq_len=forecast_window)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_indices = [(i, i+enc_seq_len+forecast_window) for i in range(9180, 9181, 5)]# 生成1000条样本
val_dataset = TransformerDataset(data, indices=val_indices, enc_seq_len=enc_seq_len, dec_seq_len=forecast_window, target_seq_len=forecast_window)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)
## Model parameters
dim_val = 56 # This can be any value divisible by n_heads. 512 is used in the original transformer paper.
n_heads = 2 # The number of attention heads (aka parallel attention layers). dim_val must be divisible by this number
n_decoder_layers = 2 # Number of times the decoder layer is stacked in the decoder
n_encoder_layers = 2 # Number of times the encoder layer is stacked in the encoder
input_size = 1 # The number of input variables. 1 if univariate forecasting.
model = TimeSeriesTransformer(
    input_size=input_size,
    dim_val=dim_val,
    batch_first=True,
    dec_seq_len=forecast_window,
    out_seq_len=forecast_window,
    n_decoder_layers=n_decoder_layers,
    n_encoder_layers=n_encoder_layers,
    n_heads=n_heads)
model.to(device="cuda:0")
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.MSELoss()
# Iterate over all epochs
for epoch in range(epochs):
    train_loss = 0.0
    # Iterate over all (x,y) pairs in training dataloader
    if epoch>0:
        for i, (src, tgt, tgt_y) in enumerate(train_dataloader):
            src = src.unsqueeze(-1)
            src = src.to(torch.float32)
            src = src.to("cuda:0")
            tgt = tgt.unsqueeze(-1)
            tgt = tgt.to(torch.float32)
            tgt = tgt.to("cuda:0")
            tgt_y = tgt_y.to(torch.float32)
            tgt_y = tgt_y.to("cuda:0")
            # zero the parameter gradients
            optimizer.zero_grad()
            # Generate masks
            tgt_mask = generate_square_subsequent_mask(
                dim1=forecast_window,
                dim2=forecast_window
                )
            
            src_mask = generate_square_subsequent_mask(
                dim1=forecast_window,
                dim2=enc_seq_len
                )
            tgt_mask = tgt_mask.to("cuda:0")
            src_mask = src_mask.to("cuda:0")
            
            # Make forecasts
            prediction = model(src, tgt, src_mask, tgt_mask)
            prediction = prediction.squeeze(-1)
            # Compute and backprop loss
            loss = criterion(tgt_y, prediction)
            loss.backward()
            # Take optimizer step
            optimizer.step()
            train_loss += loss.item() * src.size(0)
        print("Epoch: {} Loss: {}".format(epoch, train_loss / len(train_dataloader.dataset)))
    # Iterate over all (x,y) pairs in validation dataloader
    model.eval()
    with torch.no_grad():
        for i, (src, _, tgt_y) in enumerate(val_dataloader):
            src = src.unsqueeze(-1)
            src = src.to(torch.float32)
            src = src.to("cuda:0")
            tgt_y = tgt_y.to(torch.float32)
            tgt_y = tgt_y.to("cuda:0")
            #print(model,src)
            prediction = run_encoder_decoder_inference(
                model=model,
                src=src,
                forecast_window=forecast_window,
                batch_size=src.shape[0],
                batch_first=True,
            )
            loss = criterion(tgt_y, prediction)
            #print(f"tgt_y:{tgt_y},prediction:{prediction}")
            print(f"val loss:{loss}")
            break

xgboost训练代码:

# 构造时间序列数据
data = generate_data(10000)
forecast_window = 60
enc_seq_len = 120
train_indices = [(i, i+enc_seq_len+forecast_window) for i in range(0, 9000, 1)]# 生成1000条样本 9170
val_indices = [(i, i+enc_seq_len+forecast_window) for i in range(9180, 9181, 5)]# 生成1000条样本
train_x_list = []
train_y_list = []
for (s,e) in train_indices:
    train_x_list.append(data[s:s+enc_seq_len])
    train_y_list.append(data[e-forecast_window:e])
test_x_list = []
test_y_list = []
for (s,e) in val_indices:
    test_x_list.append(data[s:s+enc_seq_len])
    test_y_list.append(data[e-forecast_window:e])
train_x = np.stack(train_x_list)
train_y = np.stack(train_y_list)
test_x = np.stack(test_x_list)
test_y = np.stack(test_y_list)    
import xgboost as xgb
params = {
            #'booster': 'gbtree',
            'objective': 'reg:squarederror',
            'tree_method': 'hist',
            'eval_metric':'mae',
            #'gamma': 0.02,
            'max_depth': 4,
            #'subsample': 0.8,
            #'colsample_bytree': 0.8,
            #'scale_pos_weights': 1,
            #'min_child_weight': 4,
            'silent': 0,
            'eta': 0.015,
        }
dtrain = xgb.DMatrix(train_x, label=train_y[:,0])
dval = xgb.DMatrix(test_x, label=test_y[:,0])
evallist = [(dtrain,'train'),(dval, 'eval')]
xgb_model = xgb.train(params, dtrain, num_boost_round=500,
                      early_stopping_rounds=60, evals=evallist)
def inference(model,test_x,test_y):
    pred_y = []
    fea = test_x[0][:].reshape(1,-1)
    for i in range(len(test_y[0])):
        _y = model.predict(xgb.DMatrix(fea[:,-120:]))[0]
        pred_y.append(_y)
        fea = np.append(fea,np.array([[_y]]),axis=1)
        #print(_y,fea[:,-120:])
        
    return pred_y
pred_y = inference(xgb_model,test_x,test_y)
def plot_curves(x, y1, y2, y3):
    plt.plot(x, np.concatenate([y1,y2]), label='predict',color='red')
    plt.plot(x, np.concatenate([y1,y3]), label='gt',color='yellow')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('xgboost mae:0.124')
    plt.legend()
    plt.show()
    
plot_curves(np.array([i for i in range(180)]),test_x[0],pred_y,test_y[0])