Let’s try to build model using RDKit and Scikit-learn .
Scikit-learn is simple and efficient tools for data mining and data analysis.
I referred this page.
here
At first, build model using molecular descriptors .
#qsar_desc.py import sys, cPickle import numpy as np from rdkit import Chem from rdkit.Chem import Descriptors from rdkit.ML.Descriptors import MoleculeDescriptors from sklearn.ensemble import RandomForestClassifier from sklearn import svm from sklearn.naive_bayes import GaussianNB from sklearn import neighbors from sklearn import cross_validation from sklearn import tree, metrics from sklearn.cross_validation import StratifiedKFold from sklearn.preprocessing import StandardScaler from sklearn.grid_search import GridSearchCV from sklearn.metrics import classification_report trainset_file = sys.argv[1] testset_file = sys.argv[2] trainset = [ x for x in Chem.SDMolSupplier(trainset_file) ] testset = [ x for x in Chem.SDMolSupplier(testset_file) ] classes = { '(A) low':0, '(B) medium':1, '(C) high':2 } def calclator( mols ): nms = [ x[0] for x in Descriptors._descList ] calc = MoleculeDescriptors.MolecularDescriptorCalculator( nms ) desc = [ calc.CalcDescriptors(mol) for mol in mols ] return desc ################################# # make training set and test set# ################################# trainActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in trainset ]) testActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in testset ]) trainDescrs = calclator( trainset ) testDescrs = calclator( testset ) cPickle.dump(trainDescrs,file('trainDescrs.pkl','wb+')) cPickle.dump(testDescrs,file('testDescrs.pkl','wb+')) print "RandomForest" clf_RF = RandomForestClassifier( n_estimators=100, max_depth=5, min_samples_split=5, random_state=0,n_jobs=1 ) scores_RF = cross_validation.cross_val_score(clf_RF, trainDescrs, trainActs) print scores_RF nclf_RF = RandomForestClassifier( n_estimators=100, max_depth=5, min_samples_split=5, random_state=0,n_jobs=1 ) nclf_RF = nclf_RF.fit( testDescrs, testActs ) pred_RF = nclf_RF.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_RF ) accuracy = nclf_RF.score( testDescrs, testActs ) print accuracy print "Naive_Bayse" gnb = GaussianNB() clf_NB = gnb.fit( trainDescrs, trainActs ) pred_NB = clf_NB.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_NB ) accuracy = clf_NB.score( testDescrs, testActs ) print accuracy print "K-NN" clf_KNN = neighbors.KNeighborsClassifier( 15, weights="uniform" ) clf_KNN.fit( trainDescrs, trainActs ) pred_KNN = clf_KNN.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_KNN ) accuracy = clf_KNN.score( testDescrs, testActs ) print accuracy print "DecsionTreeClassifier" clf_DT = tree.DecisionTreeClassifier() clf_DT.fit( trainDescrs, trainActs ) pred_DT = clf_DT.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_DT ) accuracy = clf_DT.score( testDescrs, testActs ) print accuracy export_file = tree.export_graphviz( clf_DT, out_file='tree.png' ) print "SVM" clf_svm = svm.SVC( gamma=0.001, C=100. ) clf_svm = clf_svm.fit( trainDescrs, trainActs ) pred_SVM = clf_svm.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_SVM ) accuracy = clf_svm.score( testDescrs, testActs ) print accuracy print "finished"
Here we go.
$ python qsar_desc.py solubility.train.sdf solubility.test.sdf > result.txt $ cat result.txt
SVM 1 RandomForest 2 [ 0.80994152 0.83333333 0.81524927] 3 [[ 97 5 0] 4 [ 3 112 0] 5 [ 1 4 35]] 6 0.949416342412 7 8 9 Naive_Bayse 10 [[58 38 6] 11 [ 7 50 58] 12 [ 1 5 34]] 13 0.552529182879 14 15 16 K-NN 17 [[83 19 0] 18 [32 72 11] 19 [ 3 4 33]] 20 0.731517509728 21 22 23 DecsionTreeClassifier 24 [[85 16 1] 25 [14 92 9] 26 [ 0 7 33]] 27 0.817120622568 28
OK!
Next, build model using molecular fingerprint .
import sys, cPickle import numpy as np from rdkit import Chem from rdkit.Chem import Descriptors from rdkit.Chem import AllChem from sklearn.ensemble import RandomForestClassifier from sklearn import svm from sklearn.naive_bayes import GaussianNB from sklearn import neighbors from sklearn import cross_validation from sklearn import tree, metrics from sklearn.cross_validation import StratifiedKFold from sklearn.preprocessing import StandardScaler from sklearn.grid_search import GridSearchCV from sklearn.metrics import classification_report trainset_file = sys.argv[1] testset_file = sys.argv[2] trainset = [ x for x in Chem.SDMolSupplier(trainset_file) ] testset = [ x for x in Chem.SDMolSupplier(testset_file) ] classes = { '(A) low':0, '(B) medium':1, '(C) high':2 } def getfp( mols ): fps = [AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024).ToBitString() for mol in mols ] bits = [] for fp in fps: bit_list = [] for bit in fp: bit_list.append( int(bit) ) bits.append( bit_list ) return bits ################################# # make training set and test set# ################################# trainActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in trainset ]) testActs = np.array([ classes[x.GetProp( 'SOL_classification' )] for x in testset ]) trainDescrs = getfp( trainset ) testDescrs = getfp( testset ) cPickle.dump(trainDescrs,file('trainDescrs.pkl','wb+')) cPickle.dump(testDescrs,file('testDescrs.pkl','wb+')) print "RandomForest" clf_RF = RandomForestClassifier( n_estimators=100, max_depth=5, min_samples_split=5, random_state=0,n_jobs=1 ) scores_RF = cross_validation.cross_val_score(clf_RF, trainDescrs, trainActs) print scores_RF nclf_RF = RandomForestClassifier( n_estimators=100, max_depth=5, min_samples_split=5, random_state=0,n_jobs=1 ) nclf_RF = nclf_RF.fit( testDescrs, testActs ) pred_RF = nclf_RF.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_RF ) accuracy = nclf_RF.score( testDescrs, testActs ) print accuracy print "Naive_Bayse" gnb = GaussianNB() clf_NB = gnb.fit( trainDescrs, trainActs ) pred_NB = clf_NB.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_NB ) accuracy = clf_NB.score( testDescrs, testActs ) print accuracy print "K-NN" clf_KNN = neighbors.KNeighborsClassifier( 15, weights="uniform" ) clf_KNN.fit( trainDescrs, trainActs ) pred_KNN = clf_KNN.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_KNN ) accuracy = clf_KNN.score( testDescrs, testActs ) print accuracy print "DecsionTreeClassifier" clf_DT = tree.DecisionTreeClassifier() clf_DT.fit( trainDescrs, trainActs ) pred_DT = clf_DT.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_DT ) accuracy = clf_DT.score( testDescrs, testActs ) print accuracy export_file = tree.export_graphviz( clf_DT, out_file='tree.png' ) print "SVM" clf_svm = svm.SVC( gamma=0.001, C=100. ) clf_svm = clf_svm.fit( trainDescrs, trainActs ) pred_SVM = clf_svm.predict( testDescrs ) print metrics.confusion_matrix( testActs, pred_SVM ) accuracy = clf_svm.score( testDescrs, testActs ) print accuracy print "finished"
Let’s try !
$ python qsar_fp.py solubility.train.sdf solubility.test.sdf > result.txt $ cat result.txt
RandomForest [ 0.58187135 0.56725146 0.59530792] [[ 71 31 0] [ 3 112 0] [ 0 40 0]] 0.712062256809 Naive_Bayse [[48 43 11] [23 62 30] [ 2 24 14]] 0.482490272374 K-NN [[73 26 3] [36 61 18] [ 8 15 17]] 0.587548638132 DecsionTreeClassifier [[72 25 5] [43 55 17] [ 4 14 22]] 0.579766536965 SVM [[78 21 3] [26 78 11] [ 1 10 29]] 0.719844357977 finished
RF method using molecular descriptors showed good result in this case.
I think scikit-learn is very powerful tools and easy to use.
So, I’ll try to build useful tools to QSAR analysis.
We have found the Gradient Boosted Trees do well for cheminformatic problems, AdaBoost less so. Another method that works well is SVMs with a kernel made with a Morgan Fingerprint and Tanimoto Similarity.
Thanks for your comment. I’ve not tried before. Did you compare with Gradient boost and Deep learning ? If yes, I would like to know the result.