RDKit & Scikitlearn

Let’s try to build model using RDKit and Scikit-learn .
Scikit-learn is simple and efficient tools for data mining and data analysis.
I referred this page.
here
At first, build model using molecular descriptors .

#qsar_desc.py
import sys, cPickle
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn import cross_validation
from sklearn import tree, metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report

trainset_file = sys.argv[1]
testset_file = sys.argv[2]
trainset = [ x for x in Chem.SDMolSupplier(trainset_file) ]
testset = [ x for x in Chem.SDMolSupplier(testset_file) ]

classes = { '(A) low':0, '(B) medium':1, '(C) high':2 }

def calclator( mols ):
    nms = [ x[0] for x in Descriptors._descList ]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator( nms )
    desc = [ calc.CalcDescriptors(mol) for mol in mols ]
    return desc

#################################
# make training set and test set#
#################################
trainActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in trainset ])
testActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in testset ])
trainDescrs = calclator( trainset )
testDescrs = calclator( testset )
cPickle.dump(trainDescrs,file('trainDescrs.pkl','wb+'))
cPickle.dump(testDescrs,file('testDescrs.pkl','wb+'))


print "RandomForest"
clf_RF = RandomForestClassifier( n_estimators=100, max_depth=5,  min_samples_split=5, random_state=0,n_jobs=1 )
scores_RF = cross_validation.cross_val_score(clf_RF, trainDescrs, trainActs)
print scores_RF
nclf_RF = RandomForestClassifier( n_estimators=100, max_depth=5,  min_samples_split=5, random_state=0,n_jobs=1 )
nclf_RF = nclf_RF.fit( testDescrs, testActs )
pred_RF = nclf_RF.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_RF )
accuracy = nclf_RF.score( testDescrs, testActs )
print accuracy

print "Naive_Bayse"
gnb = GaussianNB()
clf_NB = gnb.fit( trainDescrs, trainActs )
pred_NB = clf_NB.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_NB )
accuracy = clf_NB.score( testDescrs, testActs )
print accuracy
print "K-NN"
clf_KNN = neighbors.KNeighborsClassifier( 15, weights="uniform" )
clf_KNN.fit( trainDescrs, trainActs )
pred_KNN = clf_KNN.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_KNN )
accuracy = clf_KNN.score( testDescrs, testActs )
print accuracy

print "DecsionTreeClassifier"
clf_DT = tree.DecisionTreeClassifier()
clf_DT.fit( trainDescrs, trainActs )
pred_DT = clf_DT.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_DT )
accuracy = clf_DT.score( testDescrs, testActs )
print accuracy
export_file = tree.export_graphviz( clf_DT, out_file='tree.png' )

print "SVM"
clf_svm = svm.SVC( gamma=0.001, C=100. )
clf_svm = clf_svm.fit( trainDescrs, trainActs )
pred_SVM = clf_svm.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_SVM )
accuracy = clf_svm.score( testDescrs, testActs )
print accuracy
print "finished"

Here we go.

$ python qsar_desc.py solubility.train.sdf solubility.test.sdf > result.txt
$ cat result.txt
SVM
  1 RandomForest
  2 [ 0.80994152  0.83333333  0.81524927]
  3 [[ 97   5   0]
  4  [  3 112   0]
  5  [  1   4  35]]
  6 0.949416342412
  7 
  8 
  9 Naive_Bayse
 10 [[58 38  6]
 11  [ 7 50 58]
 12  [ 1  5 34]]
 13 0.552529182879
 14 
 15 
 16 K-NN
 17 [[83 19  0]
 18  [32 72 11]
 19  [ 3  4 33]]
 20 0.731517509728
 21 
 22 
 23 DecsionTreeClassifier
 24 [[85 16  1]
 25  [14 92  9]
 26  [ 0  7 33]]
 27 0.817120622568
 28 

OK!
Next, build model using molecular fingerprint .

import sys, cPickle
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn import cross_validation
from sklearn import tree, metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report

trainset_file = sys.argv[1]
testset_file = sys.argv[2]
trainset = [ x for x in Chem.SDMolSupplier(trainset_file) ]
testset = [ x for x in Chem.SDMolSupplier(testset_file) ]

classes = { '(A) low':0, '(B) medium':1, '(C) high':2 }

def getfp( mols ):
    fps =  [AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024).ToBitString() for mol in mols ]
    bits = []
    for fp in fps:
        bit_list = []
        for bit in fp:
            bit_list.append( int(bit) )
        bits.append( bit_list )
    return bits

#################################
# make training set and test set#
#################################
trainActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in trainset ])
testActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in testset ])
trainDescrs = getfp( trainset )
testDescrs = getfp( testset )
cPickle.dump(trainDescrs,file('trainDescrs.pkl','wb+'))
cPickle.dump(testDescrs,file('testDescrs.pkl','wb+'))


print "RandomForest"
clf_RF = RandomForestClassifier( n_estimators=100, max_depth=5,  min_samples_split=5, random_state=0,n_jobs=1 )
scores_RF = cross_validation.cross_val_score(clf_RF, trainDescrs, trainActs)
print scores_RF
nclf_RF = RandomForestClassifier( n_estimators=100, max_depth=5,  min_samples_split=5, random_state=0,n_jobs=1 )
nclf_RF = nclf_RF.fit( testDescrs, testActs )
pred_RF = nclf_RF.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_RF )
accuracy = nclf_RF.score( testDescrs, testActs )
print accuracy

print "Naive_Bayse"
gnb = GaussianNB()
clf_NB = gnb.fit( trainDescrs, trainActs )
pred_NB = clf_NB.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_NB )
accuracy = clf_NB.score( testDescrs, testActs )
print accuracy

print "K-NN"
clf_KNN = neighbors.KNeighborsClassifier( 15, weights="uniform" )
clf_KNN.fit( trainDescrs, trainActs )
pred_KNN = clf_KNN.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_KNN )
accuracy = clf_KNN.score( testDescrs, testActs )
print accuracy

print "DecsionTreeClassifier"
clf_DT = tree.DecisionTreeClassifier()
clf_DT.fit( trainDescrs, trainActs )
pred_DT = clf_DT.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_DT )
accuracy = clf_DT.score( testDescrs, testActs )
print accuracy
export_file = tree.export_graphviz( clf_DT, out_file='tree.png' )

print "SVM"
clf_svm = svm.SVC( gamma=0.001, C=100. )
clf_svm = clf_svm.fit( trainDescrs, trainActs )
pred_SVM = clf_svm.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_SVM )
accuracy = clf_svm.score( testDescrs, testActs )
print accuracy

print "finished"

Let’s try !

$ python qsar_fp.py solubility.train.sdf solubility.test.sdf > result.txt
$ cat result.txt
RandomForest
[ 0.58187135  0.56725146  0.59530792]
[[ 71  31   0]
 [  3 112   0]
 [  0  40   0]]
0.712062256809


Naive_Bayse
[[48 43 11]
 [23 62 30]
 [ 2 24 14]]
0.482490272374


K-NN
[[73 26  3]
 [36 61 18]
 [ 8 15 17]]
0.587548638132


DecsionTreeClassifier
[[72 25  5]
 [43 55 17]
 [ 4 14 22]]
0.579766536965


SVM
[[78 21  3]
 [26 78 11]
 [ 1 10 29]]
0.719844357977
finished

RF method using molecular descriptors showed good result in this case.
I think scikit-learn is very powerful tools and easy to use.
So, I’ll try to build useful tools to QSAR analysis.

広告

コメントを残す

以下に詳細を記入するか、アイコンをクリックしてログインしてください。

WordPress.com ロゴ

WordPress.com アカウントを使ってコメントしています。 ログアウト / 変更 )

Twitter 画像

Twitter アカウントを使ってコメントしています。 ログアウト / 変更 )

Facebook の写真

Facebook アカウントを使ってコメントしています。 ログアウト / 変更 )

Google+ フォト

Google+ アカウントを使ってコメントしています。 ログアウト / 変更 )

%s と連携中