Training a Reinforcement Learning Model from Images

Extracting the observation and reward from images

When running Atari environments in gym, you can choose between the RAM version and the image version of the same game: the RAM version returns the game's internal state directly, while the image version returns the current frame of the screen.

In the previous articles the observation and reward were read directly from MATLAB or Simulink. Here we instead let the environment return an image and use a neural network to extract the relevant information from that image.

%% Load the environment
ccc   % author's shortcut, presumably clear; clc; close all
env = rlPredefinedEnv('SimplePendulumWithImage-Continuous');
obsInfo = getObservationInfo(env);   % two channels: pendulum image and angular rate
actInfo = getActionInfo(env);        % continuous torque
rng(0)                               % fix the random seed for reproducibility

%% Network layer sizes
hiddenLayerSize1 = 400;
hiddenLayerSize2 = 300;
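
With the environment loaded, a quick check of the specifications confirms that the first observation channel is an image and the second is the scalar angular rate; the channel names are the ones wired into the networks below. This is an optional sketch, and the dimensions shown in the comments (for example the 50×50 image) are assumptions about the predefined environment rather than guaranteed values.

% Optional sanity check on the environment specifications (sketch)
{obsInfo.Name}          % expected channel names, e.g. {'pendImage','angularRate'}
obsInfo(1).Dimension    % image observation size, e.g. [50 50 1] (version dependent)
obsInfo(2).Dimension    % angular rate, a 1-by-1 signal
actInfo(1).UpperLimit   % torque limit, reused later by the actor's scaling layer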

%% Initialize the agent: build the critic network
% Image path: convolutional features extracted from the pendulum image
imgPath = [
    imageInputLayer(obsInfo(1).Dimension,'Normalization','none','Name',obsInfo(1).Name)
    convolution2dLayer(10,2,'Name','conv1','Stride',5,'Padding',0)
    reluLayer('Name','relu1')
    fullyConnectedLayer(2,'Name','fc1')
    concatenationLayer(3,2,'Name','cat1')
    fullyConnectedLayer(hiddenLayerSize1,'Name','fc2')
    reluLayer('Name','relu2')
    fullyConnectedLayer(hiddenLayerSize2,'Name','fc3')
    additionLayer(2,'Name','add')
    reluLayer('Name','relu3')
    fullyConnectedLayer(1,'Name','fc4')
    ];

% Angular-rate path, concatenated with the image features at 'cat1'
dthetaPath = [
    imageInputLayer(obsInfo(2).Dimension,'Normalization','none','Name',obsInfo(2).Name)
    fullyConnectedLayer(1,'Name','fc5','BiasLearnRateFactor',0,'Bias',0)
    ];

% Action path, added to the observation features at 'add'
actPath = [
    imageInputLayer(actInfo(1).Dimension,'Normalization','none','Name','action')
    fullyConnectedLayer(hiddenLayerSize2,'Name','fc6','BiasLearnRateFactor',0,'Bias',zeros(hiddenLayerSize2,1))
    ];

criticNetwork = layerGraph(imgPath);
criticNetwork = addLayers(criticNetwork,dthetaPath);
criticNetwork = addLayers(criticNetwork,actPath);
criticNetwork = connectLayers(criticNetwork,'fc5','cat1/in2');
criticNetwork = connectLayers(criticNetwork,'fc6','add/in2');

%% Visualize the critic network
figure
plot(criticNetwork)

%% Create the critic representation
criticOptions = rlRepresentationOptions('LearnRate',1e-03,'GradientThreshold',1);
% criticOptions.UseDevice = 'gpu';   % uncomment to train the critic on a GPU
critic = rlRepresentation(criticNetwork,obsInfo,actInfo,...
    'Observation',{'pendImage','angularRate'},'Action',{'action'},criticOptions);
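
To double-check that the three paths are wired together correctly, you can run the Deep Learning Toolbox network analyzer on the layer graph. This is an optional check; the 50×50 input size used in the comment below is an assumption about the predefined environment.

% Optional: inspect the assembled critic graph for disconnected or
% size-mismatched layers.
analyzeNetwork(criticNetwork)
% With an assumed 50x50x1 input, conv1 (10x10 filters, stride 5, no padding)
% yields (50-10)/5+1 = 9, i.e. a 9x9x2 feature map feeding 'fc1'.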

%% Build the actor network
% Image path: same convolutional front end, ending in a tanh plus scaling
% layer so the output stays within the torque limits
imgPath = [
    imageInputLayer(obsInfo(1).Dimension,'Normalization','none','Name',obsInfo(1).Name)
    convolution2dLayer(10,2,'Name','conv1','Stride',5,'Padding',0)
    reluLayer('Name','relu1')
    fullyConnectedLayer(2,'Name','fc1')
    concatenationLayer(3,2,'Name','cat1')
    fullyConnectedLayer(hiddenLayerSize1,'Name','fc2')
    reluLayer('Name','relu2')
    fullyConnectedLayer(hiddenLayerSize2,'Name','fc3')
    reluLayer('Name','relu3')
    fullyConnectedLayer(1,'Name','fc4')
    tanhLayer('Name','tanh1')
    scalingLayer('Name','scale1','Scale',max(actInfo.UpperLimit))
    ];

% Angular-rate path, concatenated with the image features at 'cat1'
dthetaPath = [
    imageInputLayer(obsInfo(2).Dimension,'Normalization','none','Name',obsInfo(2).Name)
    fullyConnectedLayer(1,'Name','fc5','BiasLearnRateFactor',0,'Bias',0)
    ];

%% Assemble the actor layer graph
actorNetwork = layerGraph(imgPath);
actorNetwork = addLayers(actorNetwork,dthetaPath);
actorNetwork = connectLayers(actorNetwork,'fc5','cat1/in2');

%% Create the actor representation
actorOptions = rlRepresentationOptions('LearnRate',1e-04,'GradientThreshold',1);
% actorOptions.UseDevice = 'gpu';   % uncomment to train the actor on a GPU
actor = rlRepresentation(actorNetwork,obsInfo,actInfo,...
    'Observation',{'pendImage','angularRate'},'Action',{'scale1'},actorOptions);

figure
plot(actorNetwork)

%% Create the DDPG agent
agentOptions = rlDDPGAgentOptions(...
    'SampleTime',env.Ts,...
    'TargetSmoothFactor',1e-3,...
    'ExperienceBufferLength',1e6,...
    'DiscountFactor',0.99,...
    'MiniBatchSize',128);
agentOptions.NoiseOptions.Variance = 0.6;           % exploration noise variance
agentOptions.NoiseOptions.VarianceDecayRate = 1e-6; % decay the noise over time
agent = rlDDPGAgent(actor,critic,agentOptions);
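
As a quick smoke test, the untrained agent can be queried with a randomly generated observation that matches the spec dimensions. This is a sketch only; depending on the toolbox release, getAction may return the action wrapped in a cell array.

% Optional: query the untrained agent with a dummy observation (sketch)
obsSample = {rand(obsInfo(1).Dimension), rand(obsInfo(2).Dimension)};
act = getAction(agent,obsSample);   % torque command (possibly cell-wrapped)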

%% Set the training parameters
maxepisodes = 5000;
maxsteps = 400;
trainingOptions = rlTrainingOptions(...
    'MaxEpisodes',maxepisodes,...
    'MaxStepsPerEpisode',maxsteps,...
    'Plots','training-progress',...
    'StopTrainingCriteria','AverageReward',...
    'StopTrainingValue',-740);
plot(env)   % open the environment visualization so training can be watched

%% Parallel training settings
trainingOptions.UseParallel = true;
trainingOptions.ParallelizationOptions.Mode = "async";
trainingOptions.ParallelizationOptions.DataToSendFromWorkers = "Experiences";
trainingOptions.ParallelizationOptions.StepsUntilDataIsSent = -1;  % send data at the end of each episode
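
Parallel training requires Parallel Computing Toolbox. If no parallel pool is open, MATLAB can start one when training begins, but you can also open one explicitly beforehand; a minimal sketch, assuming the default cluster profile is what you want:

% Optional: start a parallel pool before training instead of letting
% train() open one on demand (requires Parallel Computing Toolbox)
if isempty(gcp('nocreate'))
    parpool;   % pass a worker count here to control the pool size
end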

%% Train
trainingStats = train(agent,env,trainingOptions);

%% Show the results
simOptions = rlSimulationOptions('MaxSteps',500);
experience = sim(env,agent,simOptions);
% bdclose(mdl)   % leftover from a Simulink-based example; not needed here
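
To reduce the simulation output to a single score, you can sum the logged rewards. The sketch below assumes the usual Reinforcement Learning Toolbox layout in which experience.Reward is a timeseries; treat the exact field structure as release dependent.

% Optional: total reward collected during the evaluation episode (sketch)
totalReward = sum(experience.Reward.Data);
disp(totalReward)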
