使用LSTM预测天气

2020-01-17 16:18:52 浏览数 (2)

本篇使用的数据集是由Max-Planck-Institute for Biogeochemistry记录的天气数据。每10分钟观测一次气压、气温、风速等天气数据。数据集共有420551条记录,历时八年(2009~2016)。训练集取前30万条记录,余下的记录作为验证集。

本篇中的长短时记忆网络(LSTM)使用144个温度数据点(一天的数据)历史记录来预测未来(接下来)6个温度数据点(一个小时的数据)。

代码如下:

代码语言:python
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  5 21:08:46 2020
@author: Administrator
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
# Fetch the zipped Jena climate CSV through Keras' get_file helper and ask it
# to extract the archive.
zip_path = tf.keras.utils.get_file(
    fname='jena_climate_2009_2016.csv.zip',
    origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
    extract=True,
)
# Dropping the ".zip" suffix from the returned path gives the CSV path.
csv_path = os.path.splitext(zip_path)[0]
df = pd.read_csv(csv_path)
# Peek at the first rows to see what the dataset looks like.
print(df.head())
# One observation every 10 minutes: 6 per hour, 6 x 24 = 144 per day.
# Expected shape: (420551, 15), i.e. 2920 days (8 years) of weather data.
print(df.shape)
'''
假设我们需要预测未来6小时的气温,为了做预测,我们可以选择5天的观测数据,这样我们就选择144x5 = 720个数据作为窗口来训练模型。
下面的函数就是返回类似这样的窗口。参数history_size是需要的历史数据个数,target_size 为需要预测的数据点个数。
'''
def univariate_data(dataset, start_index, end_index, history_size, target_size):
    """Cut a 1-D series into (history window, target value) training pairs.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    one sample is produced:

    * the window ``dataset[i - history_size : i]`` reshaped to
      ``(history_size, 1)``;
    * the label ``dataset[i + target_size]``.

    Args:
        dataset: 1-D array-like of values (converted with ``np.asarray``).
        start_index: First index that may be used as history.
        end_index: Exclusive upper bound for ``i``. ``None`` means
            ``len(dataset) - target_size``, the last position whose label
            is still inside the series.
        history_size: Number of past points in each window.
        target_size: Offset of the label relative to ``i``.

    Returns:
        Tuple ``(data, labels)`` of numpy arrays; ``data`` has shape
        ``(n_samples, history_size, 1)`` and ``labels`` has ``n_samples``
        entries.

    Raises:
        IndexError: If an explicit ``end_index`` lets ``i + target_size``
            run past the end of ``dataset``.
    """
    dataset = np.asarray(dataset)
    # Skip the first history_size points so the first window is complete.
    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    data = []
    labels = []
    for i in range(start_index, end_index):
        window = dataset[i - history_size:i]
        # Reshape from (history_size,) to (history_size, 1).
        data.append(np.reshape(window, (history_size, 1)))
        labels.append(dataset[i + target_size])
    return np.array(data), np.array(labels)
# The first 300,000 records form the training set; the rest is validation.
TRAIN_SPLIT = 300000
tf.random.set_seed(13)
# Forecast a univariate time series: only the temperature column is used.
uni_data = df['T (degC)']
uni_data.index = df['Date Time']
print(uni_data.head())
#uni_data.plot(subplots=True)  # plot the historical data
uni_data = uni_data.values

# Standardize (subtract the mean, divide by the standard deviation). Both
# statistics are computed on the training part only.
uni_train_mean = uni_data[:TRAIN_SPLIT].mean()
uni_train_std = uni_data[:TRAIN_SPLIT].std()
uni_data = (uni_data-uni_train_mean)/uni_train_std
univariate_past_history = 144  # history window: 144 points
univariate_future_target = 6  # passed to univariate_data as target_size
x_train_uni, y_train_uni = univariate_data(uni_data, 0, TRAIN_SPLIT,
                                           univariate_past_history,
                                           univariate_future_target)
x_val_uni, y_val_uni = univariate_data(uni_data, TRAIN_SPLIT, None,
                                       univariate_past_history,
                                       univariate_future_target)
print ('Single window of past history')
print (x_train_uni[0])
# The leading "\n" (lost in the scraped listing) separates the two printouts.
print ('\n Target temperature to predict')
print (y_train_uni[0])

def create_time_steps(length):
    """Return the negative time-step indices ``[-length, ..., -1]`` as a list."""
    steps = range(-length, 0)
    return [*steps]
  
def show_plot(plot_data, delta, title):
    """Plot a history window with its true and/or predicted future values.

    Args:
        plot_data: Sequence whose first entry is the history window (must
            provide ``.shape`` and ``.flatten()``), optionally followed by
            the true future value(s) and then the model prediction(s).
        delta: Number of future time steps; future values are drawn at
            x = 0 .. delta - 1. A falsy value (0 / None) draws a single
            future point at x = 0.
        title: Title of the figure.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can call ``.show()``.
    """
    labels = ['History', 'True Future', 'Model Prediction']
    # History is a dotted line, truth a cross, prediction a circle.
    marker = ['.-', 'rx', 'go']
    time_steps = create_time_steps(plot_data[0].shape[0])
    # range(0) would be empty and could not be paired with any y value, so
    # fall back to one x position when no horizon is given.
    future_steps = range(delta) if delta else [0]
    plt.title(title)

    history_line, = plt.plot(time_steps, plot_data[0].flatten(), marker[0],
                             label=labels[0])
    for i, series in enumerate(plot_data[1:], start=1):
        # color= is passed explicitly so that truth and prediction reuse the
        # history line's colour; only the marker shape tells them apart.
        plt.plot(future_steps, series, marker[i],
                 color=history_line.get_c(),
                 markersize=10, label=labels[i])
    plt.legend()
    # plt.xlim([time_steps[0], (len(future_steps) + 5) * 2])  # optional
    plt.xlabel('Time-Step')
    return plt
  
#show_plot([x_train_uni[0], y_train_uni[0]], 0, 'Sample Example')
def baseline(history):
    """Naive baseline: predict the mean of the given history values."""
    average = np.mean(history)
    return average
#baseline 结果绘图
#show_plot([x_train_uni[0], y_train_uni[0], baseline(x_train_uni[0])], 0,'Baseline Prediction Example')
代码语言:python
BATCH_SIZE = 256
BUFFER_SIZE = 10000

# Training set: cache, shuffle with a BUFFER_SIZE buffer, batch, then repeat
# indefinitely.
train_univariate = tf.data.Dataset.from_tensor_slices((x_train_uni, y_train_uni))
train_univariate = train_univariate.cache()
train_univariate = train_univariate.shuffle(BUFFER_SIZE)
train_univariate = train_univariate.batch(BATCH_SIZE)
train_univariate = train_univariate.repeat()

# Validation set: batched and repeated only. No shuffle is applied here.
val_univariate = tf.data.Dataset.from_tensor_slices((x_val_uni, y_val_uni))
val_univariate = val_univariate.batch(BATCH_SIZE)
val_univariate = val_univariate.repeat()

# Build a simple LSTM network: one LSTM layer followed by a single-unit
# Dense output layer.
simple_lstm_model = tf.keras.models.Sequential()
simple_lstm_model.add(
    tf.keras.layers.LSTM(
        units=8,  # dimensionality of the LSTM output space
        activation="tanh",
        input_shape=x_train_uni.shape[-2:],
    )
)
simple_lstm_model.add(tf.keras.layers.Dense(1))
# Compile the model: Adam optimizer, mean-absolute-error loss.
simple_lstm_model.compile(loss='mae', optimizer='adam')
# Run one validation batch through the untrained model to check the shape
# of its output.
for x, y in val_univariate.take(1):
    print(simple_lstm_model.predict(x).shape)

# The dataset is large, so to save time each epoch runs only 300 steps
# instead of going through all of the training data.
EVALUATION_INTERVAL = 300
EPOCHS = 10
simple_lstm_model.fit(
    train_univariate,
    epochs=EPOCHS,
    steps_per_epoch=EVALUATION_INTERVAL,
    validation_data=val_univariate,
    validation_steps=50,
)
# Draw three figures, one per validation batch.
for x, y in val_univariate.take(3):
    # NOTE(review): the [0:univariate_future_target] slices run along the
    # batch axis, so they pick the targets/predictions of the first six
    # samples in the batch; they are drawn next to the history of sample 0.
    plot = show_plot(
        [
            x[0].numpy(),
            y[0:univariate_future_target].numpy(),
            simple_lstm_model.predict(x)[0:univariate_future_target],
        ],
        delta=univariate_future_target,
        title='Simple LSTM model',
    )
    plot.show()

在验证集中取了3组数据,预测的结果如下。其中,历史数据(144个点)用线表示,真实值(6个点)用X表示,预测值(6个点)用O表示。最简单的,可以增大EVALUATION_INTERVAL和EPOCHS来提高预测精度。

0 人点赞