Use convolution2D in QSAR.

Recently there are lots of report about deep learning to predict biological activity(QSAR).
I think almost of these predictors use MLP. I wonder if I could use another method like 2DCNN, I could get good predictor.
So, I tried to build 2DCNN QSAR model.
Fortunately, 1024 bit fingerprint is easily convert to 2D 32 x 32 fingerprint.
Let’s start.
At first, I converted molecules to 2D bit image. Data source is ChEMBL.

rom rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.cross_validation import train_test_split
from rdkit.Chem import Draw
import pandas as pd
import numpy as np
import glob
import pickle
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split


df = pd.read_table('bioactivity-15_13-09-28.txt', header=0)
df_bind = df[ df.ASSAY_TYPE=="B" ]
df_bind = df_bind[ df_bind.STANDARD_VALUE != None ]
df_bind = df_bind[ df_bind.STANDARD_VALUE >= 0 ]

rows = df_bind.shape[ 0 ]
mols = [ ]
act = [ ]
X = [ ]

def generate_fparr( mol ):
    arr = np.zeros( (1,) )
    fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2, nBits = 1024, useFeatures = True )
    DataStructs.ConvertToNumpyArray( fp, arr )
    size = 32
    return arr.reshape( 1, size, size )

def save_fig( fparr, filepath, size=32 ):
    X, Y = np.meshgrid( range(size), range(size) )
    Z = fparr
    Z = Z[::-1,:]
    plt.xlim( 0, 31 )
    plt.ylim( 0, 31 )
    plt.pcolor(X, Y, Z[0])
    plt.gray()
    plt.savefig( filepath )
    plt.close()

def act2bin( val ):
    if val > 10000:
        return 0
    else:
        return 1

for i in range( rows ):
    try:
        smi = df_bind.CANONICAL_SMILES[i]
        mol = Chem.MolFromSmiles( smi )
        if mol != None:
            mols.append( mol )
            act.append( act2bin( df_bind.STANDARD_VALUE[i]) )
        else:
            pass
    except:
        pass

# save mols image dataset
for idx, mol in enumerate( mols ):
    X.append( generate_fparr( mol ) )
    if act[ idx ] == 1:
        save_fig( X[ idx ], "./posi/idx_{}.png".format( idx ) )
    elif act[ idx ] == 0:
        save_fig( X[ idx ], "./nega/idx_{}.png".format( idx ) )


X = np.asarray(X)
Y = np.asarray(act)

x_train, x_test, y_train, y_test = train_test_split( X,Y, test_size=0.2, random_state=123 )

f = open( 'fpimagedataset.pkl', 'wb' )
pickle.dump([ ( x_train,y_train ), ( x_test, y_test ) ], f)
f.close()

Now I got 2D Fingerprint dataset and 2D fingerprint molecular bit image. Following image is one of positive bit image. Of course I can not understand from the image why the molecule was active.
idx_0

Next, I wrote predictor using Keras.

import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils
import pickle
import matplotlib
import matplotlib.pyplot as plt

( x_train, y_train ), ( x_test, y_test ) = pickle.load( open('fpimagedataset.pkl', 'rb') )

batch_size = 200
nb_classes = 2
nb_epoch = 100
nb_filters = 32
nb_pool = 2
nb_conv = 3
im_rows , im_cols = 32, 32
im_channels = 1

x_train = x_train.astype( "float32" )
x_test = x_test.astype( "float32" )

y_train = np_utils.to_categorical( y_train, nb_classes )
y_test = np_utils.to_categorical( y_test, nb_classes )

print( x_train.shape[0], 'train samples' )
print( x_test.shape[0], 'test samples' )


model = Sequential()
model.add( Convolution2D( 32, 3, 3,
                            input_shape = ( im_channels, im_rows, im_cols ) ) )
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )

model.add( Convolution2D( 32, 3, 3,    ))
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )

model.add( Convolution2D( 32, 3, 3 ) )
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )
model.add( MaxPooling2D( pool_size=( 2, 2 ) ) )
model.add( Flatten() )
model.add( Dense( 200 ) )
model.add( Activation( 'relu' ) )
model.add( Dropout( 0.5 ) )
model.add( Dense(nb_classes) )
model.add( Activation('softmax') )
model.compile( loss='categorical_crossentropy',
               optimizer='adadelta',
               metrics=['accuracy'],
               )

hist = model.fit( x_train, y_train,
                  batch_size = batch_size,
                  nb_epoch = nb_epoch,
                  verbose = 1,
                  validation_data = ( x_test, y_test ))


print( model.summary() )
score = model.evaluate( x_test, y_test, verbose=0 )

loss = hist.history[ 'loss' ]
acc = hist.history[ 'acc' ]
val_loss = hist.history[ 'val_loss' ]
val_acc = hist.history[ 'val_acc' ]
plt.plot( range(len( loss )), loss, label='loss' )
plt.plot( range(len( val_loss )), val_loss, label='val_loss' )
plt.xlabel( 'epoch' )
plt.ylabel( 'loss' )
plt.savefig( 'loss.png' )
plt.close()
plt.plot( range(len( acc )), acc, label='accuracy' )
plt.plot( range(len( val_acc )), val_acc, label='val_accuracy' )
plt.xlabel( 'epoch' )
plt.ylabel( 'acc' )
plt.savefig( 'acc.png' )
plt.close()

Finally, I run the code.
It took long time to finish the calculation and I got 2 images.
Accuracy of training set was increasing depend on number of epochs, but accuracy of test set was not same.
It was same as loss score. I think the predictor is overfitting. ;-(
I used drop out, normalisation but I couldn’t avoid over fitting. Hmm……

loss

acc

広告

コメントを残す

以下に詳細を記入するか、アイコンをクリックしてログインしてください。

WordPress.com ロゴ

WordPress.com アカウントを使ってコメントしています。 ログアウト / 変更 )

Twitter 画像

Twitter アカウントを使ってコメントしています。 ログアウト / 変更 )

Facebook の写真

Facebook アカウントを使ってコメントしています。 ログアウト / 変更 )

Google+ フォト

Google+ アカウントを使ってコメントしています。 ログアウト / 変更 )

%s と連携中