作者:RSJ & 杰
谷歌大脑-Ventilator Pressure Prediction金牌方案分享
简介
在刚刚结束的谷歌大脑Ventilator Pressure Prediction大赛中,前排选手CHRIS DEOTTE开源了教科书版的基于Transformer对序列问题进行建模的方案。
本篇文章共有两大非常值得学习借鉴的地方:
- 目标函数的设计;
- Transformer框架的使用;
该处的目标函数设计对于类似的问题是通用的,在早期的序列化问题建模中,我们也曾经尝试对单个目标预测和几个target一起训练预测,发现后者的效果往往更为稳定,所以非常有借鉴意义;而Transformer的框架使用虽然简单,但是是非常好的一个baseline,非常值得借鉴。
后面我们结合代码一起学习,详细的代码大家可以参考文末作者的Notebook,此处我们仅介绍核心的三大模块。
代码解读
此处我们直接结合代码一起学习,我们将代码拆分为下面几大核心的部分:
- 特征工程;
- Loss设计;
- Transformer Block使用。
01
特征工程
特征工程分为:
- 交叉特征,主要是乘法和cumsum为主;
- lag特征(lag1-4);
- 与局部统计特征的差值;
- 基于lag特征的diff特征;
- 时间戳&滑窗统计特征;
- 类别变量的dummy;
def add_features(df):
    """Add engineered features for the ventilator-pressure dataset.

    Expects columns: breath_id, time_step, u_in, u_out, R, C.
    Adds interaction/cumulative features, per-breath lag and lead
    features (1-4 steps), per-breath aggregates and deviations, diff
    features, EWM/rolling window statistics, and one-hot encodes the
    categorical R / C attributes. Returns the augmented DataFrame.
    """
    # Interaction and per-breath cumulative features.
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['time_step_cumsum'] = df.groupby(['breath_id'])['time_step'].cumsum()
    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()
    print("Step-1...Completed")

    # Lag (past) and "lag_back" (future) features, 1-4 steps, computed
    # per breath so values never cross breath boundaries.
    for lag in range(1, 5):
        df[f'u_in_lag{lag}'] = df.groupby('breath_id')['u_in'].shift(lag)
        df[f'u_out_lag{lag}'] = df.groupby('breath_id')['u_out'].shift(lag)
        df[f'u_in_lag_back{lag}'] = df.groupby('breath_id')['u_in'].shift(-lag)
        df[f'u_out_lag_back{lag}'] = df.groupby('breath_id')['u_out'].shift(-lag)
    df = df.fillna(0)
    print("Step-2...Completed")

    # Per-breath aggregates of u_in and deviations from them
    # (aggregates computed once, reused for the diffs).
    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_in__mean'] = df.groupby(['breath_id'])['u_in'].transform('mean')
    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = df['breath_id__u_in__mean'] - df['u_in']
    print("Step-3...Completed")

    # Differences between the current value and its lags.
    for lag in range(1, 5):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']
    print("Step-4...Completed")

    # Running count and running mean of u_in within each breath.
    df['one'] = 1
    df['count'] = (df['one']).groupby(df['breath_id']).cumsum()
    df['u_in_cummean'] = df['u_in_cumsum'] / df['count']

    # Global (non-grouped) shifts of u_in; the *same flags zero out rows
    # where the shifted value leaked in from a different breath.
    df['breath_id_lag'] = df['breath_id'].shift(1).fillna(0)
    df['breath_id_lag2'] = df['breath_id'].shift(2).fillna(0)
    df['breath_id_lagsame'] = np.select([df['breath_id_lag'] == df['breath_id']], [1], 0)
    df['breath_id_lag2same'] = np.select([df['breath_id_lag2'] == df['breath_id']], [1], 0)
    df['breath_id__u_in_lag'] = df['u_in'].shift(1).fillna(0)
    df['breath_id__u_in_lag'] = df['breath_id__u_in_lag'] * df['breath_id_lagsame']
    df['breath_id__u_in_lag2'] = df['u_in'].shift(2).fillna(0)
    df['breath_id__u_in_lag2'] = df['breath_id__u_in_lag2'] * df['breath_id_lag2same']
    print("Step-5...Completed")

    # Time delta plus exponentially-weighted and rolling window statistics.
    df['time_step_diff'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    df['ewm_u_in_mean'] = (df
                           .groupby('breath_id')['u_in']
                           .ewm(halflife=9)
                           .mean()
                           .reset_index(level=0, drop=True))
    df[["15_in_sum", "15_in_min", "15_in_max", "15_in_mean"]] = (df
        .groupby('breath_id')['u_in']
        .rolling(window=15, min_periods=1)
        .agg({"15_in_sum": "sum",
              "15_in_min": "min",
              "15_in_max": "max",
              "15_in_mean": "mean"})
        .reset_index(level=0, drop=True))
    print("Step-6...Completed")

    # Differences with the lead (future) values.
    for lag in (1, 2):
        df[f'u_in_lagback_diff{lag}'] = df['u_in'] - df[f'u_in_lag_back{lag}']
        df[f'u_out_lagback_diff{lag}'] = df['u_out'] - df[f'u_out_lag_back{lag}']
    print("Step-7...Completed")

    # Categorical encoding of lung attributes R, C and their combination.
    # Fix: the '+' operators of this concatenation were lost in extraction.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df["R"].astype(str) + '__' + df["C"].astype(str)
    df = pd.get_dummies(df)
    print("Step-8...Completed")
    return df
# Apply the same feature pipeline to both splits so train and test
# end up with identical feature columns.
train = add_features(train)
test = add_features(test)
02
辅助Loss设计
- 相邻pressure的差值;
- pressure的cumsum;
# ---- Auxiliary-loss targets -------------------------------------------
# Besides the raw pressure, the model also predicts:
#   * pressure_diff     - first difference of pressure within a breath;
#   * pressure_integral - cumulative pressure, scaled by 1/200 to keep it
#     in a range comparable to the other two targets.
train['pressure_diff'] = train.groupby('breath_id').pressure.diff().fillna(0)
train['pressure_integral'] = train.groupby('breath_id').pressure.cumsum() / 200
targets = train[['pressure', 'pressure_diff', 'pressure_integral']].to_numpy().reshape(-1, 80, 3)

# Per-timestep loss weights (fix: stripped the "代码语言:javascript复制"
# extraction artifact that was fused onto the U_OUT_IDX line).
# NOTE(review): `train[:, :, U_OUT_IDX]` only works if `train` has already
# been converted to a 3-D numpy array (samples, 80, features) with u_out at
# column 2 — as a DataFrame this indexing would fail; confirm against the
# full notebook.
U_OUT_IDX = 2
y_weight = np.ones_like(targets)
u_out_values = train[:, :, U_OUT_IDX]
# NOTE(review): comparing against 0 (not 1) reportedly because the robust
# scaler remaps the original u_out values — the source comment was
# truncated; confirm which scaled value marks the unscored phase.
y_weight[u_out_values == 0] = 0
03
Transformer
此处使用了传统的MHA,细节不再详述;https://keras.io/examples/nlp/text_classification_with_transformer/
class TransformerBlock(layers.Layer):
    """A single post-norm Transformer encoder block.

    Multi-head self-attention followed by a two-layer feed-forward
    network, each wrapped with dropout, a residual connection and layer
    normalization (after the Keras text-classification example).

    Args:
        embed_dim: key dimension used inside each attention head.
        feat_dim: width of the block's input/output; the FFN projects
            back to this size so the residual addition is shape-valid.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate.
    """

    def __init__(self, embed_dim, feat_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="gelu"), layers.Dense(feat_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        # Self-attention: query, key and value are all `inputs`.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # Fix: the '+' of both residual additions was lost in extraction.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
# Model hyper-parameters.
# NOTE(review): the extraction that produced this article stripped every
# '+' operator; restoring `+ 32` here to match that pattern — confirm the
# operator against the original notebook.
feat_dim = train.shape[-1] + 32  # model width: raw feature count widened by 32
embed_dim = 64      # Embedding size for attention
num_heads = 8       # Number of attention heads
ff_dim = 128        # Hidden layer size in feed forward network inside transformer
dropout_rate = 0.0
num_blocks = 12
def build_model():
    """Build the Transformer regression model.

    Input shape is (seq_len, n_features) taken from `train`; the output is
    a (seq_len, 3) tensor — one value per timestep for each of the three
    targets (pressure, pressure_diff, pressure_integral).
    """
    inputs = layers.Input(shape=train.shape[-2:])
    # "EMBEDDING LAYER": project raw features to the model width.
    x = layers.Dense(feat_dim)(inputs)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    # TRANSFORMER BLOCKS, each followed by a weighted skip connection.
    for k in range(num_blocks):
        x_old = x
        transformer_block = TransformerBlock(embed_dim, feat_dim, num_heads, ff_dim, dropout_rate)
        x = transformer_block(x)
        # Fix: restore the '+' (dropped in extraction) of the skip connection.
        x = 0.7 * x + 0.3 * x_old  # SKIP CONNECTION
    # REGRESSION HEAD
    x = layers.Dense(128, activation="selu")(x)
    x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(3, activation="linear")(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model
小结
本文基于Transformer的方案简洁明了,非常值得借鉴,包括Transformer用于序列建模的方式,以及此类问题Loss的设计等等。
参考文献
- https://www.kaggle.com/cdeotte/tensorflow-transformer-0-112/comments?scriptVersionId=78792844