RDKit & Scikit-learn

Let’s try to build models using RDKit and Scikit-learn.
Scikit-learn provides simple and efficient tools for data mining and data analysis.
I referred to this page.
First, let’s build models using molecular descriptors.
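
To get a feel for what the descriptor calculator returns, here is a minimal sketch (phenol is just an arbitrary example molecule):

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

nms = [ x[0] for x in Descriptors._descList ]               # names of all available 2D descriptors
calc = MoleculeDescriptors.MolecularDescriptorCalculator( nms )
mol = Chem.MolFromSmiles( 'c1ccccc1O' )                     # phenol
values = calc.CalcDescriptors( mol )                        # one value per descriptor name
print( len(nms) )                                           # how many descriptors are available
print( values[:5] )                                         # the first few values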

#qsar_desc.py
import sys, cPickle
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn import cross_validation
from sklearn import tree, metrics

trainset_file = sys.argv[1]
testset_file = sys.argv[2]
trainset = [ x for x in Chem.SDMolSupplier(trainset_file) if x is not None ]  # skip molecules RDKit failed to parse
testset = [ x for x in Chem.SDMolSupplier(testset_file) if x is not None ]

classes = { '(A) low':0, '(B) medium':1, '(C) high':2 }

def calculator( mols ):
    # calculate every 2D descriptor RDKit knows about, for each molecule
    nms = [ x[0] for x in Descriptors._descList ]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator( nms )
    desc = [ calc.CalcDescriptors(mol) for mol in mols ]
    return desc

#################################
# make training set and test set#
#################################
trainActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in trainset ])
testActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in testset ])
trainDescrs = calculator( trainset )
testDescrs = calculator( testset )
cPickle.dump( trainDescrs, open( 'trainDescrs.pkl', 'wb' ) )
cPickle.dump( testDescrs, open( 'testDescrs.pkl', 'wb' ) )


print "RandomForest"
clf_RF = RandomForestClassifier( n_estimators=100, max_depth=5,  min_samples_split=5, random_state=0,n_jobs=1 )
scores_RF = cross_validation.cross_val_score(clf_RF, trainDescrs, trainActs)
print scores_RF
nclf_RF = RandomForestClassifier( n_estimators=100, max_depth=5,  min_samples_split=5, random_state=0,n_jobs=1 )
nclf_RF = nclf_RF.fit( trainDescrs, trainActs )  # fit on the training set, not the test set
pred_RF = nclf_RF.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_RF )
accuracy = nclf_RF.score( testDescrs, testActs )
print accuracy

print "Naive_Bayse"
gnb = GaussianNB()
clf_NB = gnb.fit( trainDescrs, trainActs )
pred_NB = clf_NB.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_NB )
accuracy = clf_NB.score( testDescrs, testActs )
print accuracy

print "K-NN"
clf_KNN = neighbors.KNeighborsClassifier( 15, weights="uniform" )
clf_KNN.fit( trainDescrs, trainActs )
pred_KNN = clf_KNN.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_KNN )
accuracy = clf_KNN.score( testDescrs, testActs )
print accuracy

print "DecsionTreeClassifier"
clf_DT = tree.DecisionTreeClassifier()
clf_DT.fit( trainDescrs, trainActs )
pred_DT = clf_DT.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_DT )
accuracy = clf_DT.score( testDescrs, testActs )
print accuracy
tree.export_graphviz( clf_DT, out_file='tree.dot' )  # writes GraphViz dot format; render with: dot -Tpng tree.dot -o tree.png

print "SVM"
clf_svm = svm.SVC( gamma=0.001, C=100. )
clf_svm = clf_svm.fit( trainDescrs, trainActs )
pred_SVM = clf_svm.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_SVM )
accuracy = clf_svm.score( testDescrs, testActs )
print accuracy
print "finished"

Here we go.

$ python qsar_desc.py solubility.train.sdf solubility.test.sdf > result.txt
$ cat result.txt
RandomForest
[ 0.80994152  0.83333333  0.81524927]
[[ 97   5   0]
 [  3 112   0]
 [  1   4  35]]
0.949416342412


Naive_Bayes
[[58 38  6]
 [ 7 50 58]
 [ 1  5 34]]
0.552529182879


K-NN
[[83 19  0]
 [32 72 11]
 [ 3  4 33]]
0.731517509728


DecisionTreeClassifier
[[85 16  1]
 [14 92  9]
 [ 0  7 33]]
0.817120622568

OK!
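
By the way, GridSearchCV can tune the SVM hyperparameters instead of fixing gamma and C by hand. Here is a minimal sketch that reloads the descriptors pickled by qsar_desc.py; the grid values are just my guesses, and note that sklearn.grid_search moved to sklearn.model_selection in later scikit-learn versions:

import cPickle
import numpy as np
from rdkit import Chem
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report

classes = { '(A) low':0, '(B) medium':1, '(C) high':2 }

def get_acts( sdf ):
    # rebuild the class labels the same way qsar_desc.py does
    return np.array([ classes[m.GetProp( 'SOL_classification' )] for m in Chem.SDMolSupplier( sdf ) if m is not None ])

trainDescrs = cPickle.load( open( 'trainDescrs.pkl', 'rb' ) )   # pickled by qsar_desc.py
testDescrs = cPickle.load( open( 'testDescrs.pkl', 'rb' ) )
trainActs = get_acts( 'solubility.train.sdf' )
testActs = get_acts( 'solubility.test.sdf' )

param_grid = { 'C':[ 1., 10., 100. ], 'gamma':[ 0.01, 0.001, 0.0001 ] }
grid = GridSearchCV( svm.SVC(), param_grid, cv=5 )              # 5-fold CV over each parameter combination
grid.fit( trainDescrs, trainActs )
print( grid.best_params_ )                                      # best C / gamma found on the training set
print( classification_report( testActs, grid.best_estimator_.predict( testDescrs ) ) )
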
Next, let’s build models using molecular fingerprints.
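
A Morgan fingerprint (radius 2, roughly equivalent to ECFP4) hashes circular atom environments into a fixed-length bit vector. A quick look at a single molecule (ethanol, as an arbitrary example):

from rdkit import Chem
from rdkit.Chem import AllChem

mol = Chem.MolFromSmiles( 'CCO' )                           # ethanol
fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2, nBits=1024 )
print( fp.GetNumOnBits() )                                  # number of bits set to 1
print( fp.ToBitString()[:32] )                              # first 32 bits as a '0'/'1' string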

#qsar_fp.py
import sys, cPickle
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn import cross_validation
from sklearn import tree, metrics

trainset_file = sys.argv[1]
testset_file = sys.argv[2]
trainset = [ x for x in Chem.SDMolSupplier(trainset_file) if x is not None ]  # skip molecules RDKit failed to parse
testset = [ x for x in Chem.SDMolSupplier(testset_file) if x is not None ]

classes = { '(A) low':0, '(B) medium':1, '(C) high':2 }

def getfp( mols ):
    # Morgan fingerprint (radius 2) folded to 1024 bits,
    # returned as a list of 0/1 integers per molecule
    fps = [ AllChem.GetMorganFingerprintAsBitVect( mol, 2, nBits=1024 ).ToBitString() for mol in mols ]
    return [ [ int(bit) for bit in fp ] for fp in fps ]

#################################
# make training set and test set#
#################################
trainActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in trainset ])
testActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in testset ])
trainDescrs = getfp( trainset )
testDescrs = getfp( testset )
cPickle.dump( trainDescrs, open( 'trainDescrs.pkl', 'wb' ) )
cPickle.dump( testDescrs, open( 'testDescrs.pkl', 'wb' ) )


print "RandomForest"
clf_RF = RandomForestClassifier( n_estimators=100, max_depth=5,  min_samples_split=5, random_state=0,n_jobs=1 )
scores_RF = cross_validation.cross_val_score(clf_RF, trainDescrs, trainActs)
print scores_RF
nclf_RF = RandomForestClassifier( n_estimators=100, max_depth=5,  min_samples_split=5, random_state=0,n_jobs=1 )
nclf_RF = nclf_RF.fit( trainDescrs, trainActs )  # fit on the training set, not the test set
pred_RF = nclf_RF.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_RF )
accuracy = nclf_RF.score( testDescrs, testActs )
print accuracy

print "Naive_Bayse"
gnb = GaussianNB()
clf_NB = gnb.fit( trainDescrs, trainActs )
pred_NB = clf_NB.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_NB )
accuracy = clf_NB.score( testDescrs, testActs )
print accuracy

print "K-NN"
clf_KNN = neighbors.KNeighborsClassifier( 15, weights="uniform" )
clf_KNN.fit( trainDescrs, trainActs )
pred_KNN = clf_KNN.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_KNN )
accuracy = clf_KNN.score( testDescrs, testActs )
print accuracy

print "DecsionTreeClassifier"
clf_DT = tree.DecisionTreeClassifier()
clf_DT.fit( trainDescrs, trainActs )
pred_DT = clf_DT.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_DT )
accuracy = clf_DT.score( testDescrs, testActs )
print accuracy
tree.export_graphviz( clf_DT, out_file='tree.dot' )  # writes GraphViz dot format; render with: dot -Tpng tree.dot -o tree.png

print "SVM"
clf_svm = svm.SVC( gamma=0.001, C=100. )
clf_svm = clf_svm.fit( trainDescrs, trainActs )
pred_SVM = clf_svm.predict( testDescrs )
print metrics.confusion_matrix( testActs, pred_SVM )
accuracy = clf_svm.score( testDescrs, testActs )
print accuracy

print "finished"

Let’s try!

$ python qsar_fp.py solubility.train.sdf solubility.test.sdf > result.txt
$ cat result.txt
RandomForest
[ 0.58187135  0.56725146  0.59530792]
[[ 71  31   0]
 [  3 112   0]
 [  0  40   0]]
0.712062256809


Naive_Bayes
[[48 43 11]
 [23 62 30]
 [ 2 24 14]]
0.482490272374


K-NN
[[73 26  3]
 [36 61 18]
 [ 8 15 17]]
0.587548638132


DecisionTreeClassifier
[[72 25  5]
 [43 55 17]
 [ 4 14 22]]
0.579766536965


SVM
[[78 21  3]
 [26 78 11]
 [ 1 10 29]]
0.719844357977
finished

The random forest built on molecular descriptors showed good results in this case, and the descriptor-based models generally outperformed the fingerprint-based ones.
I think scikit-learn is a very powerful library and easy to use.
So, I’ll try to build useful tools for QSAR analysis.


2 thoughts on “RDKit & Scikit-learn”

  1. We have found that Gradient Boosted Trees do well for cheminformatics problems, AdaBoost less so. Another method that works well is an SVM with a kernel made from a Morgan fingerprint and Tanimoto similarity (a sketch of this follows below).

    1. Thanks for your comment. I haven’t tried those before. Did you compare gradient boosting with deep learning? If yes, I would like to know the result.
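
For anyone curious about the kernel idea in the comment above, here is a minimal sketch of an SVM with a precomputed Tanimoto kernel, built from svm.SVC( kernel='precomputed' ) and RDKit's BulkTanimotoSimilarity. It reuses the solubility files and class mapping from the scripts above; C=100. is just a guess:

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn import svm

classes = { '(A) low':0, '(B) medium':1, '(C) high':2 }

def tanimoto_gram( fps_a, fps_b ):
    # Gram matrix of pairwise Tanimoto similarities
    return np.array([ DataStructs.BulkTanimotoSimilarity( fp, fps_b ) for fp in fps_a ])

train = [ m for m in Chem.SDMolSupplier( 'solubility.train.sdf' ) if m is not None ]
test = [ m for m in Chem.SDMolSupplier( 'solubility.test.sdf' ) if m is not None ]
train_fps = [ AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=1024 ) for m in train ]
test_fps = [ AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=1024 ) for m in test ]
trainActs = np.array([ classes[m.GetProp( 'SOL_classification' )] for m in train ])
testActs = np.array([ classes[m.GetProp( 'SOL_classification' )] for m in test ])

clf = svm.SVC( kernel='precomputed', C=100. )
clf.fit( tanimoto_gram( train_fps, train_fps ), trainActs )
# for new molecules, each kernel row holds similarities against the *training* fingerprints
pred = clf.predict( tanimoto_gram( test_fps, train_fps ) )
print( (pred == testActs).mean() )                          # plain accuracy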
