Recently there are lots of report about deep learning to predict biological activity(QSAR).
I think almost of these predictors use MLP. I wonder if I could use another method like 2DCNN, I could get good predictor.
So, I tried to build 2DCNN QSAR model.
Fortunately, 1024 bit fingerprint is easily convert to 2D 32 x 32 fingerprint.
Let’s start.
At first, I converted molecules to 2D bit image. Data source is ChEMBL.
rom rdkit import Chem from rdkit.Chem import AllChem from rdkit.Chem import DataStructs from sklearn.cross_validation import train_test_split from rdkit.Chem import Draw import pandas as pd import numpy as np import glob import pickle import matplotlib.pyplot as plt from sklearn.cross_validation import train_test_split df = pd.read_table('bioactivity-15_13-09-28.txt', header=0) df_bind = df[ df.ASSAY_TYPE=="B" ] df_bind = df_bind[ df_bind.STANDARD_VALUE != None ] df_bind = df_bind[ df_bind.STANDARD_VALUE >= 0 ] rows = df_bind.shape[ 0 ] mols = [ ] act = [ ] X = [ ] def generate_fparr( mol ): arr = np.zeros( (1,) ) fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2, nBits = 1024, useFeatures = True ) DataStructs.ConvertToNumpyArray( fp, arr ) size = 32 return arr.reshape( 1, size, size ) def save_fig( fparr, filepath, size=32 ): X, Y = np.meshgrid( range(size), range(size) ) Z = fparr Z = Z[::-1,:] plt.xlim( 0, 31 ) plt.ylim( 0, 31 ) plt.pcolor(X, Y, Z[0]) plt.gray() plt.savefig( filepath ) plt.close() def act2bin( val ): if val > 10000: return 0 else: return 1 for i in range( rows ): try: smi = df_bind.CANONICAL_SMILES[i] mol = Chem.MolFromSmiles( smi ) if mol != None: mols.append( mol ) act.append( act2bin( df_bind.STANDARD_VALUE[i]) ) else: pass except: pass # save mols image dataset for idx, mol in enumerate( mols ): X.append( generate_fparr( mol ) ) if act[ idx ] == 1: save_fig( X[ idx ], "./posi/idx_{}.png".format( idx ) ) elif act[ idx ] == 0: save_fig( X[ idx ], "./nega/idx_{}.png".format( idx ) ) X = np.asarray(X) Y = np.asarray(act) x_train, x_test, y_train, y_test = train_test_split( X,Y, test_size=0.2, random_state=123 ) f = open( 'fpimagedataset.pkl', 'wb' ) pickle.dump([ ( x_train,y_train ), ( x_test, y_test ) ], f) f.close()
Now I got 2D Fingerprint dataset and 2D fingerprint molecular bit image. Following image is one of positive bit image. Of course I can not understand from the image why the molecule was active.
Next, I wrote predictor using Keras.
import numpy as np from keras.models import Sequential from keras.layers.core import Dense, Activation, Dropout, Flatten from keras.layers.normalization import BatchNormalization from keras.layers.convolutional import Convolution2D, MaxPooling2D from keras.utils import np_utils import pickle import matplotlib import matplotlib.pyplot as plt ( x_train, y_train ), ( x_test, y_test ) = pickle.load( open('fpimagedataset.pkl', 'rb') ) batch_size = 200 nb_classes = 2 nb_epoch = 100 nb_filters = 32 nb_pool = 2 nb_conv = 3 im_rows , im_cols = 32, 32 im_channels = 1 x_train = x_train.astype( "float32" ) x_test = x_test.astype( "float32" ) y_train = np_utils.to_categorical( y_train, nb_classes ) y_test = np_utils.to_categorical( y_test, nb_classes ) print( x_train.shape[0], 'train samples' ) print( x_test.shape[0], 'test samples' ) model = Sequential() model.add( Convolution2D( 32, 3, 3, input_shape = ( im_channels, im_rows, im_cols ) ) ) model.add( BatchNormalization() ) model.add( Activation( 'relu' ) ) model.add( Convolution2D( 32, 3, 3, )) model.add( BatchNormalization() ) model.add( Activation( 'relu' ) ) model.add( Convolution2D( 32, 3, 3 ) ) model.add( BatchNormalization() ) model.add( Activation( 'relu' ) ) model.add( MaxPooling2D( pool_size=( 2, 2 ) ) ) model.add( Flatten() ) model.add( Dense( 200 ) ) model.add( Activation( 'relu' ) ) model.add( Dropout( 0.5 ) ) model.add( Dense(nb_classes) ) model.add( Activation('softmax') ) model.compile( loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'], ) hist = model.fit( x_train, y_train, batch_size = batch_size, nb_epoch = nb_epoch, verbose = 1, validation_data = ( x_test, y_test )) print( model.summary() ) score = model.evaluate( x_test, y_test, verbose=0 ) loss = hist.history[ 'loss' ] acc = hist.history[ 'acc' ] val_loss = hist.history[ 'val_loss' ] val_acc = hist.history[ 'val_acc' ] plt.plot( range(len( loss )), loss, label='loss' ) plt.plot( range(len( val_loss )), val_loss, label='val_loss' ) plt.xlabel( 'epoch' ) plt.ylabel( 'loss' ) plt.savefig( 'loss.png' ) plt.close() plt.plot( range(len( acc )), acc, label='accuracy' ) plt.plot( range(len( val_acc )), val_acc, label='val_accuracy' ) plt.xlabel( 'epoch' ) plt.ylabel( 'acc' ) plt.savefig( 'acc.png' ) plt.close()
Finally, I run the code.
It took long time to finish the calculation and I got 2 images.
Accuracy of training set was increasing depend on number of epochs, but accuracy of test set was not same.
It was same as loss score. I think the predictor is overfitting. ;-(
I used drop out, normalisation but I couldn’t avoid over fitting. Hmm……