Use convolution2D in QSAR.

Recently there are lots of report about deep learning to predict biological activity(QSAR).
I think almost of these predictors use MLP. I wonder if I could use another method like 2DCNN, I could get good predictor.
So, I tried to build 2DCNN QSAR model.
Fortunately, 1024 bit fingerprint is easily convert to 2D 32 x 32 fingerprint.
Let’s start.
At first, I converted molecules to 2D bit image. Data source is ChEMBL.

rom rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.cross_validation import train_test_split
from rdkit.Chem import Draw
import pandas as pd
import numpy as np
import glob
import pickle
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split


df = pd.read_table('bioactivity-15_13-09-28.txt', header=0)
df_bind = df[ df.ASSAY_TYPE=="B" ]
df_bind = df_bind[ df_bind.STANDARD_VALUE != None ]
df_bind = df_bind[ df_bind.STANDARD_VALUE >= 0 ]

rows = df_bind.shape[ 0 ]
mols = [ ]
act = [ ]
X = [ ]

def generate_fparr( mol ):
    arr = np.zeros( (1,) )
    fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2, nBits = 1024, useFeatures = True )
    DataStructs.ConvertToNumpyArray( fp, arr )
    size = 32
    return arr.reshape( 1, size, size )

def save_fig( fparr, filepath, size=32 ):
    X, Y = np.meshgrid( range(size), range(size) )
    Z = fparr
    Z = Z[::-1,:]
    plt.xlim( 0, 31 )
    plt.ylim( 0, 31 )
    plt.pcolor(X, Y, Z[0])
    plt.gray()
    plt.savefig( filepath )
    plt.close()

def act2bin( val ):
    if val > 10000:
        return 0
    else:
        return 1

for i in range( rows ):
    try:
        smi = df_bind.CANONICAL_SMILES[i]
        mol = Chem.MolFromSmiles( smi )
        if mol != None:
            mols.append( mol )
            act.append( act2bin( df_bind.STANDARD_VALUE[i]) )
        else:
            pass
    except:
        pass

# save mols image dataset
for idx, mol in enumerate( mols ):
    X.append( generate_fparr( mol ) )
    if act[ idx ] == 1:
        save_fig( X[ idx ], "./posi/idx_{}.png".format( idx ) )
    elif act[ idx ] == 0:
        save_fig( X[ idx ], "./nega/idx_{}.png".format( idx ) )


X = np.asarray(X)
Y = np.asarray(act)

x_train, x_test, y_train, y_test = train_test_split( X,Y, test_size=0.2, random_state=123 )

f = open( 'fpimagedataset.pkl', 'wb' )
pickle.dump([ ( x_train,y_train ), ( x_test, y_test ) ], f)
f.close()

Now I got 2D Fingerprint dataset and 2D fingerprint molecular bit image. Following image is one of positive bit image. Of course I can not understand from the image why the molecule was active.
idx_0

Next, I wrote predictor using Keras.

import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils
import pickle
import matplotlib
import matplotlib.pyplot as plt

( x_train, y_train ), ( x_test, y_test ) = pickle.load( open('fpimagedataset.pkl', 'rb') )

batch_size = 200
nb_classes = 2
nb_epoch = 100
nb_filters = 32
nb_pool = 2
nb_conv = 3
im_rows , im_cols = 32, 32
im_channels = 1

x_train = x_train.astype( "float32" )
x_test = x_test.astype( "float32" )

y_train = np_utils.to_categorical( y_train, nb_classes )
y_test = np_utils.to_categorical( y_test, nb_classes )

print( x_train.shape[0], 'train samples' )
print( x_test.shape[0], 'test samples' )


model = Sequential()
model.add( Convolution2D( 32, 3, 3,
                            input_shape = ( im_channels, im_rows, im_cols ) ) )
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )

model.add( Convolution2D( 32, 3, 3,    ))
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )

model.add( Convolution2D( 32, 3, 3 ) )
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )
model.add( MaxPooling2D( pool_size=( 2, 2 ) ) )
model.add( Flatten() )
model.add( Dense( 200 ) )
model.add( Activation( 'relu' ) )
model.add( Dropout( 0.5 ) )
model.add( Dense(nb_classes) )
model.add( Activation('softmax') )
model.compile( loss='categorical_crossentropy',
               optimizer='adadelta',
               metrics=['accuracy'],
               )

hist = model.fit( x_train, y_train,
                  batch_size = batch_size,
                  nb_epoch = nb_epoch,
                  verbose = 1,
                  validation_data = ( x_test, y_test ))


print( model.summary() )
score = model.evaluate( x_test, y_test, verbose=0 )

loss = hist.history[ 'loss' ]
acc = hist.history[ 'acc' ]
val_loss = hist.history[ 'val_loss' ]
val_acc = hist.history[ 'val_acc' ]
plt.plot( range(len( loss )), loss, label='loss' )
plt.plot( range(len( val_loss )), val_loss, label='val_loss' )
plt.xlabel( 'epoch' )
plt.ylabel( 'loss' )
plt.savefig( 'loss.png' )
plt.close()
plt.plot( range(len( acc )), acc, label='accuracy' )
plt.plot( range(len( val_acc )), val_acc, label='val_accuracy' )
plt.xlabel( 'epoch' )
plt.ylabel( 'acc' )
plt.savefig( 'acc.png' )
plt.close()

Finally, I run the code.
It took long time to finish the calculation and I got 2 images.
Accuracy of training set was increasing depend on number of epochs, but accuracy of test set was not same.
It was same as loss score. I think the predictor is overfitting. ;-(
I used drop out, normalisation but I couldn’t avoid over fitting. Hmm……

loss

acc

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s