Ultra fast clustering script with RDKit #RDKit

Some years ago, I got very useful information for molecular clustering. Bayon is ultra fast clustering tool. The author made not only Japanese-tutorial but also English-tutorial.
This tools is easy to use but to use bayon in chemoinformatics area, user needs data preparation.

I wrote simple script that converts smiles to bayon input format and then perform clustering and parse output data.
Main code is following.

import argparse
import subprocess
import pickle
import os
from rdkit import Chem
from rdkit.Chem import AllChem

parser = argparse.ArgumentParser( "Fast clustering for chemoinformatics" )
parser.add_argument( "input" )
parser.add_argument( "n", help = "the number of clusters" )
#parser.add_argument( "-l", help = "limit value of cluster bisection" )
#parser.add_argument( "-p", help = "output similarity points" )
parser.add_argument( "--output", default = "clustered.tsv" )
parser.add_argument( "-c", help = "filename of centroid", default = "centroid.tsv" )

def smi2fp( molid, smiles ):
    mol = Chem.MolFromSmiles( smiles )
    onbits = AllChem.GetMorganFingerprintAsBitVect( mol, 2 ).GetOnBits()
    row = molid
    for bit in onbits:
        row += "\tFP_{}\t1.0".format( bit )
    row += "\n"
    return row


if __name__ == "__main__":
    args = parser.parse_args()
    inputf = open( args.input, "r" )
    n = args.n
    c = args.c
    output = args.output
    tempf = open( "fp.tsv", "w" )
    for line in inputf:
        line = line.rstrip().split( "\t" )
        tempf.write( smi2fp( line[0], line[1] ))
    tempf.close()
    res = subprocess.call( "time bayon -p -c {} -n {} fp.tsv > {}".format( c, n, output ), shell=True )

    #parse results
    parsefile = open( output.split(".")[0]+"_parse.tsv", "w" )
    inputf = open( output, "r" )
    for line in inputf:
        line = line.rstrip().split( "\t" )
        cluster_id = line[0]
        for i in range( 1, len( line )-1, 2 ) :
            molid = line[ i ]
            point = line[ i + 1 ]
            parsefile.write("{}\t{}\tCLS_ID_{}\n".format( molid, point, cluster_id ))
    parsefile.close()

The code perform clustering molecules and output cluster with point ( similarity ) and parse default bayon format.

I ran the code with rdkit cdk2.sdf data.
47 compound clustered into 5 clusters within 0.006s!

iwatobipen$ python fastcluster.py cdk2.smi 5

real	0m0.015s
user	0m0.006s
sys	0m0.002s
Done!

It seems work fine. 😉

Advertisements

plot data using RDKit-PandasTools

I often use SDMolSupplier method to read SDF. But, PandasTool is another useful way to read SDF.
But, to handle property data, I need to convert data to float.
Today I read a major supplier’s SDF. And plot data using seaborn.

%matplotlib inline
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit.Chem import PandasTools

IPythonConsole.ipython_useSVG=True
nx = [ mol for mol in Chem.SDMolSupplier( 'dataset.sdf' ) ]
print( m = nx[0][n for n in m.GetPropNames()] )

[out]
[------------
 'CLogP',
 ------------
 'FSP3',
 -------------
'MolWeight',
 -------------
 'Purity_percent',
 'ROT',
-------------
 'Salt',
 'TPSA']

nx = PandasTools.LoadSDF( 'dataset.sdf' )
#convert data
nx.FSP3 = natx.FSP3.apply( np.float32 )
nx.ROT = natx.ROT.apply( np.float32 )
nx.TPSA = natx.TPSA.apply( np.float32 )
nx.CLogP = natx.CLogP.apply( np.float32 )
nx.MolWeight = natx.MolWeight.apply( np.float32 )

sns.lmplot( 'ROT','FSP3', data=natx, )
sns.distplot(natx.MolWeight)
sns.jointplot(natx.MolWeight, natx.FSP3, kind='hex')

Fig 3 indicates that the dataset have many FSp3 rich molecules.
I’m interested in the dataset and will analyse more details of the Dataset.
What is the most important thing in the library design ? Novelty, synthetic accessibility, cost, diversity, druggability, etc. How do you define the quality of compound / library ?

fig1
fsp3rot
fig2
mw
fig3
fsp3_molwt

Generate diamine ring systems using RDKit # RDKit

Somedays ago, I posted new way to represent chemical space using exit vector.
So, I want to generate set of diamine ring systems. It’s easy to generate molecule set using RDKit.
Let’s try it.
At first I defined ring systems.

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

c5 = Chem.MolFromSmiles( 'C1CCCC1' )
c6 = Chem.MolFromSmiles( 'C1CCCCC1' )
c7 = Chem.MolFromSmiles( 'C1CCCCCC1' )
c45 = Chem.MolFromSmiles( 'C2CC1CCC1C2' )
c46 = Chem.MolFromSmiles( 'C2CCC1CCC1C2' )
c55 = Chem.MolFromSmiles( 'C1CC2CCCC2C1' )
c56 = Chem.MolFromSmiles( 'C2CCC1CCCC1C2' )
c66 = Chem.MolFromSmiles( 'C1CCC2CCCCC2C1' )
sp35 = Chem.MolFromSmiles( 'C1CC11CCCC1' )
sp36 =  Chem.MolFromSmiles( 'C1CC11CCCCC1' )
sp45 = Chem.MolFromSmiles( 'C1CCC11CCCC1' )
sp46 =  Chem.MolFromSmiles( 'C1CCC11CCCCC1' )
sp55 = Chem.MolFromSmiles( 'C1CCC11CCCC1' )
sp55 = Chem.MolFromSmiles( 'C1CCCC11CCCC1' )
sp65 = Chem.MolFromSmiles( 'C1CCCC11CCCCC1' )
sp66 =  Chem.MolFromSmiles( 'C1CCCCC11CCCCC1' )

ringsets = [
    c5, c6, c7,
    c45, c46, c55, c56, c66,
    sp35, sp36, sp45, sp46, sp55, sp65, sp66]
rxn = AllChem.ReactionFromSmarts( '[CH2&$(C(CC)(CC)):2]>>[NH:2]' )
monoamine = []

Then, I defined function that returns only unique mols.

def get_unipro( ps ):
    uniq = set( [ Chem.MolToSmiles( mol[0], isomericSmiles =True ) for mol in ps ] )
    mols = [ Chem.MolFromSmiles( smi ) for smi in uniq ]
    return mols

def get_unimol( mols ):
    uniq = set( [ Chem.MolToSmiles( mol, isomericSmiles =True ) for mol in mols ] )
    mols = [ Chem.MolFromSmiles( smi ) for smi in uniq ]
    return mols

OK, Ready. Run reaction!

for ring in ringsets:
    ps = rxn.RunReactants( (ring,) )
    res = get_unipro( ps )
    monoamine.extend( res )
diamine = []
for ring in monoamine:
    ps = rxn.RunReactants( (ring,) )
    res = get_unipro( ps )
    diamine.extend( res )
diamine = get_unimol( diamine )
Draw.MolsToGridImage( ringsets, molsPerRow = 5  )

frame

Draw.MolsToGridImage(diamine, molsPerRow= 5 )

diamine

It’s works fine.

calculate exit vector distance of each molecule

Some days ago, I participated a seminar about drug discovery.
All presentations were nice, and I interested in one of unique method to representation chemical space, named “exit vector plot”.
ref is following
Following Ramachandran: exit vector plots (EVP) as a tool to navigate chemical space covered by 3D bifunctional scaffolds. The case of cycloalkanes
http://pubs.rsc.org/en/content/articlelanding/2016/ra/c5ra19958a

The concept is, chemical space is represented as two vectors n1, n2 with 4 parameters phi1, phi2, theta( dihedral angle ), r.
It’s like a ramachandran plot that is used to plot structure of protein.

image

In this work, the author calculate exit vector about several ring systems and clustering them.
I think this concept is very useful because if chemist want to replace a ring system to another one, it’s difficult to search molecules that shows low structure similarity but has high exit vector similarity.

So, if we can search molecules use exit vector similarity, it will be new way to expand chemical space.
In the paper, the author clustering molecules using Euclidean distance of exit vector. Distance is almost dissimilarity.
Now let’s try to write exit vector calculator using RDKit!
In following code, I focused in 2nd-diamine set because it was easy to define two vectors(n1, n2).
First I prepare sample sdf that has energy minimized 3D structure.(there is not in the code)
Then write code!
RDKit has cool function to calculate distance and angle in rdMolTransform class.

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolTransforms
import numpy as np
from rdkit.Chem import Draw
sdf = Chem.SDMolSupplier( "3d_amine.sdf", removeHs=False )
mols = [ m for m in sdf ]
mol = mols[1]
mol.GetSubstructMatch( Chem.MolFromSmarts( "N[H]" ) )
atoms = [atom for atom in mol.GetAtoms()]

def getev( mol ):
    if mol.GetNumConformers() >= 1:
        matches = mol.GetSubstructMatches( Chem.MolFromSmarts( "N[H]" ) )
        conf = mol.GetConformer()
        theta = rdMolTransforms.GetDihedralDeg( conf,
                                                matches[0][1],
                                                matches[0][0],
                                                matches[1][0],
                                                matches[1][1]  )
        temp_phi1 = 180 - rdMolTransforms.GetAngleDeg(conf,
                                           matches[1][0],
                                           matches[0][0],
                                           matches[0][1]
                                          )
        temp_phi2 = 180 - rdMolTransforms.GetAngleDeg(conf,
                                           matches[0][0],
                                           matches[1][0],
                                           matches[1][1]
                                          )
        if temp_phi1 >= temp_phi2:
            phi1 = temp_phi1
            phi2 = temp_phi2
        else:
            phi1 = temp_phi2
            phi2 = temp_phi1

        r = rdMolTransforms.GetBondLength( conf, matches[0][0], matches[1][0] )
        return theata, phi1, phi2, r
    else:
        print( "No conformer!" )

def transform_cartegian( theta, phi1, phi2, r ):
    theta = np.deg2rad( theta )
    phi1 = np.deg2rad( phi1 )
    phi2 = np.deg2rad( phi2 )
    x = np.sin( theta ) * np.sin( phi1 ) * np.sin( phi2 ) *r
    y = np.sin( theta ) * np.sin( phi1 ) * np.cos( phi2 ) *r
    z = np.sin( theta ) * np.cos( phi1 ) *r
    t = np.cos( theta ) *r
    return x, y, z, t

def get_dist(v1,v2):
    v1 = np.asarray( v1 )
    v2 = np.asarray( v2 )
    delta =  v1 - v2
    d = np.linalg.norm( delta )
    return d

def calc_distance( mol1, mol2 ):
    theta1, phi11, phi21, r1 = getev( mol1 )
    theta2, phi12, phi22, r2 = getev( mol2 )
    cart1 = transform_cartegian( theta1, phi11, phi21, r1 )
    cart2 = transform_cartegian( theta2, phi12, phi22, r2 )
    d = get_dist( cart1, cart2 )
    return d

All done!
Then run code.

for i, mol in enumerate( mols ):
    d = calc_distance(mols[3],mol)
    print( 3, i, d )

I got following response.
idx1 idx2 distace
3 0 4.90798883048
3 1 1.48017700543
3 2 6.08814365316
3 3 0.0
3 4 3.93873942603
3 5 5.45068906229
3 6 1.40146560569
3 7 0.0619944778129
3 8 6.92182054937
3 9 6.22560061852
3 10 6.19913412456
3 11 5.75093874617
3 12 5.38531808804
3 13 1.85179121146
3 14 1.80235655185
idx1, 6, 7 are near to idx4.
Let see…

Draw.MolsToGridImage( [mols[3],mols[1], mols[6], mols[7]] )
Draw.MolsToGridImage( mols )

I got following images.
It’s seems works fine.

selected mols
all mols

I uploaded the code to github.
https://github.com/iwatobipen/rdkit_evp

Get molecular angle information using RDKit

Now, I’m thinking new molecular similarity score.
It uses molecular angle information.

RDKit has some functions that calculate dihedral or some angles.
Like this..

from rdkit import Chem
from rdkit.Chem import rdMolTransforms
mol = Chem.MolFromSmiles( "N1CCNCC1" )
from rdkit.Chem import AllChem
#generate one confomer.
AllChem.EmbedMolecule(mol)
conf=mol.GetConformer(0)
atoms=[a for a in mol.GetAtoms()]
for a in atoms:
print( a.GetIdx(), a.GetSymbol() )

out put is…
0 N
1 C
2 C
3 N
4 C
5 C
Calculate torsion angle.

rdMolTransforms.GetAngleDeg(conf,0,1,2)
#113.92146321507079 ( degree )

Next, calculate dihedral angle between N-C / C-N.

rdMolTransforms.GetDihedralDeg(conf, 0,1,2,3)

I got -57.29664523229906 ( deg )

Works fine.

I’ll go to next step. 😉

New way to plot chemical space.

I often use Principal Component Analysis(PCA) to reduction dimension.
Of course, there are several method to reduction dimension, not only PCA but also MDS, kernel PCA, isomap,sammon mapping, ICA, etc.
Some days ago, I found interesting method that called t-SNE t-Distributed Stochastic Neighbor Embedding.
Key of t-SNE is that use t-Distribution instead of normal distribution.
PCA is liner method but t-SNE is none liner method.
If reader who is interested in t-SNE, please refer following url.
https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf
t-SNE works well in image recognition. BYW, there are not information about chemistry.
So I tried to visualise chemical space using t-SNE. Fortunately Scikit-learn is implemented t-SNE!
I wrote a example to do that.
Code is following….
I used gsk3b inhibitor as dataset.
in case PCA

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.decomposition import PCA
import pickle, sys
import seaborn as sns
drugs = [ mol for mol in Chem.SDMolSupplier( "gsk3b.sdf" ) if mol != None ]

def calc_fp_arr( mols ):
    fplist = []
    for mol in mols:
        arr = np.zeros( (1,) )
        fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2 )
        DataStructs.ConvertToNumpyArray( fp, arr )
        fplist.append( arr )
    return np.asarray( fplist )

res=calc_fp_arr(drugs)
pca = PCA( n_components = 3 )
pca.fit( res )
f = open( 'pca.pkl', 'wb' )
pickle.dump( pca, f )
f.close()


x = pca.transform( res )

f = open( "pcares.txt", "w" )
act = []
for i in range( x.shape[0] ):
    line = ""
    line = Chem.MolToSmiles( drugs[i] ) + "," + drugs[i].GetProp( "STANDARD_VALUE" ) + "," + str( x[i][0] )  + "," + str( x[i][1] ) + "\n"
    f.write( line )
    act.append( float(drugs[i].GetProp( "STANDARD_VALUE" )) )
f.close()
x = pd.DataFrame( x )
x.columns = [ 'PC1', 'PC2', 'PC3' ]
x["IC50"] = act

g = sns.jointplot( "PC1", "PC2", data = x )

g.savefig( "PCA.png" )

In t-SNE

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.manifold import TSNE
import pickle, sys
import seaborn as sns
drugs = [ mol for mol in Chem.SDMolSupplier( "gsk3b.sdf" ) if mol != None ]

def calc_fp_arr( mols ):
    fplist = []
    for mol in mols:
        arr = np.zeros( (1,) )
        fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2 )
        DataStructs.ConvertToNumpyArray( fp, arr )
        fplist.append( arr )
    return np.asarray( fplist )

res=calc_fp_arr(drugs)
model = TSNE( n_components=3, init="pca",  n_iter=2000 )
#model = TSNE( n_components=2,random_state=42 ,  n_iter=1000 )
x = model.fit_transform( res )
f = open( 'pca_tsne.pkl', 'wb' )
pickle.dump( model, f )
f.close()



f = open( "pca_tsneres.txt", "w" )
act = []
for i in range( x.shape[0] ):
    line = ""
    line = Chem.MolToSmiles( drugs[i] ) + "," + drugs[i].GetProp( "STANDARD_VALUE" ) + "," + str( x[i][0] )  + "," + str( x[i][1] ) + "\n"
    act.append(  float(drugs[i].GetProp( "STANDARD_VALUE" ))  )
    f.write( line )
f.close()
x = pd.DataFrame( x )
x.columns = [ 'A1', 'A2', 'A3'  ]

g = sns.jointplot( "A1", "A2", data = x )
g.savefig( "TSNE.png" )

Now I got following 2 images.
In this case, PCA seems better than t-SNE, but I tried in-house dataset, t-SNE showed good performance.
I want to implement GTM with python.
Which method do you use to represent chemical space ?

PCA
TSNE

Code is pushed my repo.
https://github.com/iwatobipen/chemo_info/tree/master/chemicalspace2
😉

Use convolution2D in QSAR.

Recently there are lots of report about deep learning to predict biological activity(QSAR).
I think almost of these predictors use MLP. I wonder if I could use another method like 2DCNN, I could get good predictor.
So, I tried to build 2DCNN QSAR model.
Fortunately, 1024 bit fingerprint is easily convert to 2D 32 x 32 fingerprint.
Let’s start.
At first, I converted molecules to 2D bit image. Data source is ChEMBL.

rom rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.cross_validation import train_test_split
from rdkit.Chem import Draw
import pandas as pd
import numpy as np
import glob
import pickle
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split


df = pd.read_table('bioactivity-15_13-09-28.txt', header=0)
df_bind = df[ df.ASSAY_TYPE=="B" ]
df_bind = df_bind[ df_bind.STANDARD_VALUE != None ]
df_bind = df_bind[ df_bind.STANDARD_VALUE >= 0 ]

rows = df_bind.shape[ 0 ]
mols = [ ]
act = [ ]
X = [ ]

def generate_fparr( mol ):
    arr = np.zeros( (1,) )
    fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2, nBits = 1024, useFeatures = True )
    DataStructs.ConvertToNumpyArray( fp, arr )
    size = 32
    return arr.reshape( 1, size, size )

def save_fig( fparr, filepath, size=32 ):
    X, Y = np.meshgrid( range(size), range(size) )
    Z = fparr
    Z = Z[::-1,:]
    plt.xlim( 0, 31 )
    plt.ylim( 0, 31 )
    plt.pcolor(X, Y, Z[0])
    plt.gray()
    plt.savefig( filepath )
    plt.close()

def act2bin( val ):
    if val > 10000:
        return 0
    else:
        return 1

for i in range( rows ):
    try:
        smi = df_bind.CANONICAL_SMILES[i]
        mol = Chem.MolFromSmiles( smi )
        if mol != None:
            mols.append( mol )
            act.append( act2bin( df_bind.STANDARD_VALUE[i]) )
        else:
            pass
    except:
        pass

# save mols image dataset
for idx, mol in enumerate( mols ):
    X.append( generate_fparr( mol ) )
    if act[ idx ] == 1:
        save_fig( X[ idx ], "./posi/idx_{}.png".format( idx ) )
    elif act[ idx ] == 0:
        save_fig( X[ idx ], "./nega/idx_{}.png".format( idx ) )


X = np.asarray(X)
Y = np.asarray(act)

x_train, x_test, y_train, y_test = train_test_split( X,Y, test_size=0.2, random_state=123 )

f = open( 'fpimagedataset.pkl', 'wb' )
pickle.dump([ ( x_train,y_train ), ( x_test, y_test ) ], f)
f.close()

Now I got 2D Fingerprint dataset and 2D fingerprint molecular bit image. Following image is one of positive bit image. Of course I can not understand from the image why the molecule was active.
idx_0

Next, I wrote predictor using Keras.

import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils
import pickle
import matplotlib
import matplotlib.pyplot as plt

( x_train, y_train ), ( x_test, y_test ) = pickle.load( open('fpimagedataset.pkl', 'rb') )

batch_size = 200
nb_classes = 2
nb_epoch = 100
nb_filters = 32
nb_pool = 2
nb_conv = 3
im_rows , im_cols = 32, 32
im_channels = 1

x_train = x_train.astype( "float32" )
x_test = x_test.astype( "float32" )

y_train = np_utils.to_categorical( y_train, nb_classes )
y_test = np_utils.to_categorical( y_test, nb_classes )

print( x_train.shape[0], 'train samples' )
print( x_test.shape[0], 'test samples' )


model = Sequential()
model.add( Convolution2D( 32, 3, 3,
                            input_shape = ( im_channels, im_rows, im_cols ) ) )
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )

model.add( Convolution2D( 32, 3, 3,    ))
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )

model.add( Convolution2D( 32, 3, 3 ) )
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )
model.add( MaxPooling2D( pool_size=( 2, 2 ) ) )
model.add( Flatten() )
model.add( Dense( 200 ) )
model.add( Activation( 'relu' ) )
model.add( Dropout( 0.5 ) )
model.add( Dense(nb_classes) )
model.add( Activation('softmax') )
model.compile( loss='categorical_crossentropy',
               optimizer='adadelta',
               metrics=['accuracy'],
               )

hist = model.fit( x_train, y_train,
                  batch_size = batch_size,
                  nb_epoch = nb_epoch,
                  verbose = 1,
                  validation_data = ( x_test, y_test ))


print( model.summary() )
score = model.evaluate( x_test, y_test, verbose=0 )

loss = hist.history[ 'loss' ]
acc = hist.history[ 'acc' ]
val_loss = hist.history[ 'val_loss' ]
val_acc = hist.history[ 'val_acc' ]
plt.plot( range(len( loss )), loss, label='loss' )
plt.plot( range(len( val_loss )), val_loss, label='val_loss' )
plt.xlabel( 'epoch' )
plt.ylabel( 'loss' )
plt.savefig( 'loss.png' )
plt.close()
plt.plot( range(len( acc )), acc, label='accuracy' )
plt.plot( range(len( val_acc )), val_acc, label='val_accuracy' )
plt.xlabel( 'epoch' )
plt.ylabel( 'acc' )
plt.savefig( 'acc.png' )
plt.close()

Finally, I run the code.
It took long time to finish the calculation and I got 2 images.
Accuracy of training set was increasing depend on number of epochs, but accuracy of test set was not same.
It was same as loss score. I think the predictor is overfitting. ;-(
I used drop out, normalisation but I couldn’t avoid over fitting. Hmm……

loss

acc