利用化合物的结构与活性数据,基于RDKit和Python3的机器学习活性预测模型小示例。
代码示例:
代码语言:javascript复制#导入必须的包
#!/usr/bin/env python3
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem as ch
from rdkit.Chem import Draw as d
from rdkit import DataStructs
import pandas as pd
from rdkit.Chem import rdMolDescriptors as rdescriptors
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import csv
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
import sklearn
from rdkit.Chem import PandasTools, Descriptors, MolFromSmiles
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn import svm
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
代码语言:javascript复制#载入数据
with open ('receptor-bioactivity.txt', 'r') as f:
content_raw = list((csv.reader(f, delimiter = 't')))
len(content_raw)
代码语言:javascript复制#移除重复数据
content=[]
for i in range(0,len(content_raw)):
if i == 0:
chembl_id=content_raw[i][0]
content.append(content_raw[i])
elif content_raw[i][0]!=chembl_id:
chembl_id=content_raw[i][0]
content.append(content_raw[i])
代码语言:javascript复制#提取有用信息
names = []
smiles = []
activity = []
mols = []
for i in range(1,len(content)):
names.append(content[i][0])
smiles.append(content[i][5])
activity.append(content[i][7])
mols.append(ch.MolFromSmiles(content[i][5]))
代码语言:javascript复制#为深入分析创建数据集
def HBA(mol):
return rdescriptors.CalcNumLipinskiHBA(mol)
def HBD(mol):
return rdescriptors.CalcNumLipinskiHBD(mol)
def logP(mol):
return Descriptors.MolLogP(mol)
def TPSA(mol):
return Descriptors.TPSA(mol)
def num_rotatable_bonds(mol):
return Descriptors.NumRotatableBonds(mol)
def num_heavy_atoms(mol):
return mol.GetNumHeavyAtoms()
def MW(mol):
return Descriptors.MolWt(mol)
data=[]
for i,mol in enumerate(mols):
data.append([names[i], float(activity[i]), HBA(mol), HBD(mol), float(MW(mol)), logP(mol),float(TPSA(mol)),num_rotatable_bonds(mol),num_heavy_atoms(mol),smiles[i]])
dataframe=pd.DataFrame(data,columns=["CHEMBL_ID", "activity", "HBA", "HBD", "MW", "logP", "TPSA","rb",'smiles'])
dataframe.set_index("CHEMBL_ID",inplace=True)
PCA分析,数据降维也称主成分分析
代码语言:javascript复制#PCA分析
pca1=PCA(dataframe.drop(['smiles'],axis=1))
plt.rcParams["figure.figsize"] = [15, 15]
plt.style.use('ggplot')
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title('Ghrelin Receptor Homo sapiens PCA')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
X = [x[0] for x in pca1.Y]
Y = [y[1] for y in pca1.Y]
plt.scatter(X,Y)
plt.show()
代码语言:javascript复制#运用随机森林模型,并为其选择有用数据
model=dataframe.loc[:,["smiles", "activity"]]
desc_list = Descriptors.descList
model["pic50"] = model.activity.apply(lambda x : -1.0 * np.log10(x / 1.0e9))
for desc_name, function in desc_list:
values = []
for smiles in model["smiles"]:
mol = MolFromSmiles(smiles)
values.append(function(mol))
model[desc_name] = values
columns = [x[0] for x in desc_list[:30]]
代码语言:javascript复制#划分数据集,训练模型
train_data, test_data = train_test_split(model, test_size=0.3)
model2 = RandomForestRegressor(n_estimators=15)
model2.fit(train_data[columns], train_data["pic50"])
代码语言:javascript复制#测试模型,绘图
plt.rcParams["figure.figsize"] = [15, 15]
span = (1,12)
axes = plt.gca()
axes.set_xlim(span)
axes.set_ylim(span)
plt.plot((span[0],span[1]), (span[0],span[1]), linestyle='--')
plt.scatter(
test_data["pic50"]
, model2.predict(test_data[columns])
, c='blue'
, s=20
)
plt.show()