Molecular Drawing code in RDKit

Using RDKit and Ipython notebook is useful for interactive coding.
User can check molecules directly in notebook
It can not only one molecule but also list of molecules. If user want to draw molecules as grid image, MolsToGridImage method is good choice.
Also the method highlight atoms.
Example is following.

from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDConfig
import os
sdf = Chem.SDMolSupplier(  os.path.join(RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'))
mols = [  m for m in sdf ]
core = Chem.MolFromSmiles( 'c1ncc2nc[nH]c2n1' )
img = Draw.MolsToGridImage( mols, molsPerRow=3, highlightAtomLists=[ mol.GetSubstructMatch(core) for mol in mols], useSVG=True )
img

img returns following image.
gridimage

User can set Highlight atoms for each molecules. I think it’s very flexible for development.

Use convolution2D in QSAR.

Recently there are lots of report about deep learning to predict biological activity(QSAR).
I think almost of these predictors use MLP. I wonder if I could use another method like 2DCNN, I could get good predictor.
So, I tried to build 2DCNN QSAR model.
Fortunately, 1024 bit fingerprint is easily convert to 2D 32 x 32 fingerprint.
Let’s start.
At first, I converted molecules to 2D bit image. Data source is ChEMBL.

rom rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.cross_validation import train_test_split
from rdkit.Chem import Draw
import pandas as pd
import numpy as np
import glob
import pickle
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split


df = pd.read_table('bioactivity-15_13-09-28.txt', header=0)
df_bind = df[ df.ASSAY_TYPE=="B" ]
df_bind = df_bind[ df_bind.STANDARD_VALUE != None ]
df_bind = df_bind[ df_bind.STANDARD_VALUE >= 0 ]

rows = df_bind.shape[ 0 ]
mols = [ ]
act = [ ]
X = [ ]

def generate_fparr( mol ):
    arr = np.zeros( (1,) )
    fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2, nBits = 1024, useFeatures = True )
    DataStructs.ConvertToNumpyArray( fp, arr )
    size = 32
    return arr.reshape( 1, size, size )

def save_fig( fparr, filepath, size=32 ):
    X, Y = np.meshgrid( range(size), range(size) )
    Z = fparr
    Z = Z[::-1,:]
    plt.xlim( 0, 31 )
    plt.ylim( 0, 31 )
    plt.pcolor(X, Y, Z[0])
    plt.gray()
    plt.savefig( filepath )
    plt.close()

def act2bin( val ):
    if val > 10000:
        return 0
    else:
        return 1

for i in range( rows ):
    try:
        smi = df_bind.CANONICAL_SMILES[i]
        mol = Chem.MolFromSmiles( smi )
        if mol != None:
            mols.append( mol )
            act.append( act2bin( df_bind.STANDARD_VALUE[i]) )
        else:
            pass
    except:
        pass

# save mols image dataset
for idx, mol in enumerate( mols ):
    X.append( generate_fparr( mol ) )
    if act[ idx ] == 1:
        save_fig( X[ idx ], "./posi/idx_{}.png".format( idx ) )
    elif act[ idx ] == 0:
        save_fig( X[ idx ], "./nega/idx_{}.png".format( idx ) )


X = np.asarray(X)
Y = np.asarray(act)

x_train, x_test, y_train, y_test = train_test_split( X,Y, test_size=0.2, random_state=123 )

f = open( 'fpimagedataset.pkl', 'wb' )
pickle.dump([ ( x_train,y_train ), ( x_test, y_test ) ], f)
f.close()

Now I got 2D Fingerprint dataset and 2D fingerprint molecular bit image. Following image is one of positive bit image. Of course I can not understand from the image why the molecule was active.
idx_0

Next, I wrote predictor using Keras.

import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils
import pickle
import matplotlib
import matplotlib.pyplot as plt

( x_train, y_train ), ( x_test, y_test ) = pickle.load( open('fpimagedataset.pkl', 'rb') )

batch_size = 200
nb_classes = 2
nb_epoch = 100
nb_filters = 32
nb_pool = 2
nb_conv = 3
im_rows , im_cols = 32, 32
im_channels = 1

x_train = x_train.astype( "float32" )
x_test = x_test.astype( "float32" )

y_train = np_utils.to_categorical( y_train, nb_classes )
y_test = np_utils.to_categorical( y_test, nb_classes )

print( x_train.shape[0], 'train samples' )
print( x_test.shape[0], 'test samples' )


model = Sequential()
model.add( Convolution2D( 32, 3, 3,
                            input_shape = ( im_channels, im_rows, im_cols ) ) )
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )

model.add( Convolution2D( 32, 3, 3,    ))
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )

model.add( Convolution2D( 32, 3, 3 ) )
model.add( BatchNormalization() )
model.add( Activation( 'relu' ) )
model.add( MaxPooling2D( pool_size=( 2, 2 ) ) )
model.add( Flatten() )
model.add( Dense( 200 ) )
model.add( Activation( 'relu' ) )
model.add( Dropout( 0.5 ) )
model.add( Dense(nb_classes) )
model.add( Activation('softmax') )
model.compile( loss='categorical_crossentropy',
               optimizer='adadelta',
               metrics=['accuracy'],
               )

hist = model.fit( x_train, y_train,
                  batch_size = batch_size,
                  nb_epoch = nb_epoch,
                  verbose = 1,
                  validation_data = ( x_test, y_test ))


print( model.summary() )
score = model.evaluate( x_test, y_test, verbose=0 )

loss = hist.history[ 'loss' ]
acc = hist.history[ 'acc' ]
val_loss = hist.history[ 'val_loss' ]
val_acc = hist.history[ 'val_acc' ]
plt.plot( range(len( loss )), loss, label='loss' )
plt.plot( range(len( val_loss )), val_loss, label='val_loss' )
plt.xlabel( 'epoch' )
plt.ylabel( 'loss' )
plt.savefig( 'loss.png' )
plt.close()
plt.plot( range(len( acc )), acc, label='accuracy' )
plt.plot( range(len( val_acc )), val_acc, label='val_accuracy' )
plt.xlabel( 'epoch' )
plt.ylabel( 'acc' )
plt.savefig( 'acc.png' )
plt.close()

Finally, I run the code.
It took long time to finish the calculation and I got 2 images.
Accuracy of training set was increasing depend on number of epochs, but accuracy of test set was not same.
It was same as loss score. I think the predictor is overfitting. ;-(
I used drop out, normalisation but I couldn’t avoid over fitting. Hmm……

loss

acc

某ソフトのユーザー会に参加した話

先週末某ソフトのユーザーに参加させていただきました。
ソフトのユーザー会ですが、あまりソフトよりな話にならず、良い会だと思います。
まあ年3回やるとして、毎回アップデートを出そうと思うとそれはそれで開発的にはしんどいのかもしれません。

すべての演題が素敵でした。特に企業の課題を掘り下げるためのアンケートの話題が個人的に興味深い話題でした。アンケートって、質問が多すぎると疲れてしまうし、項目が細かすぎても適当に解答書くかもしれないし難しいなって思いますが、プレゼンは非常にクリアに問題を浮き出す構成になっておりました。

そんなプレゼンを聴きながら思ったのは、アンケートは調査対象がヒトなのですが、創薬は化合物に対するアンケートなのかと、、、
 Q1で活性=>あればQ2ADME=>あればQ3PK、、、、とか。
PDCAのサイクルは当然ですがSARの方が早いと思いますが。

違いは一回のサーベイで全部の回答が集まるのかどうか。施作を打つ上で、データが多いほど問題を深掘りできるはずですが、実際には同じデータを見てもどこまで掘り下げるかは解析のスキルに依存すると思います。

初期からリソースをかけて多くのデータを取ることが成功への近道か、一つ一つ課題を解決しつつステップワイズにデータを取る方が効率的なのか。

そもそも創薬化学者としての感性が求められるのか。
色々と考えされられます。

Use rdkitjs in local application

Electron is tool to build cross platform desktop apps with javascript, HTML, and CSS.
http://electron.atom.io/
Users can develop their own app like web app. I think it’s interesting, because the developed application will run local environment.

Today I tried to use electron. My code (index.js) is almost same as quick start.
I installed browserfiy, rdkitjs, jquery and highcharts via using ‘npm install command’. ( highcharts was not used following code.)

To start, at first type…

npm init

Then I got packages.json

{
  "name": "electron_test",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "iwatobipen",
  "license": "GPL",
  "dependencies": {
    "highcharts": "^4.2.5",
    "jquery": "^3.0.0",
    "rdkit": "^0.1.1"
  },
  "devDependencies": {
    "electron-prebuilt": "^1.2.2"
  }
}

Then made index.js ( same as quick start ).

const electron = require('electron');
const $ = require( 'jquery' );
const Highcharts = require('highcharts');
// Module to control application life.
const {app} = electron;
// Module to create native browser window.
const {BrowserWindow} = electron;

// Keep a global reference of the window object, if you don't, the window will
// be closed automatically when the JavaScript object is garbage collected.
let win;

function createWindow() {
  // Create the browser window.
  win = new BrowserWindow({width: 800, height: 600, 'node-integration': false});

  // and load the index.html of the app.
  win.loadURL(`file://${__dirname}/index.html`);

  // Open the DevTools.
  win.webContents.openDevTools();

  // Emitted when the window is closed.
  win.on('closed', () => {
    // Dereference the window object, usually you would store windows
    // in an array if your app supports multi windows, this is the time
    // when you should delete the corresponding element.
    win = null;
  });
}

// This method will be called when Electron has finished
// initialization and is ready to create browser windows.
// Some APIs can only be used after this event occurs.
app.on('ready', createWindow);

// Quit when all windows are closed.
app.on('window-all-closed', () => {
  // On OS X it is common for applications and their menu bar
  // to stay active until the user quits explicitly with Cmd + Q
  if (process.platform !== 'darwin') {
    app.quit();
  }
});

app.on('activate', () => {
  // On OS X it's common to re-create a window in the app when the
  // dock icon is clicked and there are no other windows open.
  if (win === null) {
    createWindow();
  }
});

// In this file you can include the rest of your app's specific main process
// code. You can also put them in separate files and require them here.

Next, I wrote index.html.
This code gets smiles as input and write molecule as SVG image.
It’s very simple.
Before writing code, I installed browserify, so the code can call library using require( ‘package name’ ).
BTW, I want to use jQuery but following it couldn’t call jQuery. So I wrote document.getElementById ….. instead of $( ‘#hogehoge’ )…. ;-(

<!DOCTYPE html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>RDKIT js with electron</title>
    <!-- jQuery is not loaded first run. Why???  -->
    <script> window.$ = window.jQuery =  require( 'jquery' );</script>
    <script> var Highcharts = require( 'highcharts' );</script>
    <script> var RDKit = require( 'rdkit' );</script>
    <script> var drawmol = function() {
                        var smi = document.getElementById('smi').value;
                        console.log( smi );
			var mol = RDKit.Molecule.fromSmiles( smi );
			mol.Kekulize();
			var svg = mol.Drawing2D();
			var svg = svg.split('svg:').join('');
                        document.getElementById("drawer").innerHTML = svg;         
               };
    </script>

  </head>
  <body>
    <h1>Smiles parser</h1>
    We are using node <script>document.write(process.versions.node)</script>,
    Chrome <script>document.write(process.versions.chrome)</script>,
    and Electron <script>document.write(process.versions.electron)</script>.<br>
    <p>This is simple smiles converter using RDKit js.</p>
    <p>Input smiles to text box and push the button.</p>
    <input id='smi' type='text' ><br>
    <button type='button' onclick='drawmol()' >draw</button><br>

    <div id='drawer'></div>
  </body>
</html>

Next run electron.

electron .

Then app will run. And input smiles, push button I got following image.
electron-test

It’s seems work well.
Electron-packager can make local app for multi platforms.
For convenient, just type
$ electron-packager . –all
I got electron-test.app. and the code run local environment.
;-)
I uploaded the code to my repo.
https://github.com/iwatobipen/chemo_info/tree/master/electron_test/electron_test
Maybe zip archive will run without installing further js library.

Electron can make application like a web app it’s amazing for me.

Visualize chemical space using RDKit-Scikitlearn-Highchart

I often use Principle component analysis (PCA) to visualize chemical space. PCA is useful to describe chemical diversity. I wonder if I could project new designed molecules to reference current chemical space.
I think that sci-kitlearn and rdkit is suitable to do that. Recently I often use seaborn to visualization, but today I used highcharts to visualize data. Because highchart can handle data interactively in web app.
Flask was used web-app framework, and rdkit was used to fingerprint calculation.

My first example was following. All function and data were embedded ‘app.py’.
Structures.sdf is data of DrugBank.
Following code is ….
1st- calculate fingerprints about reference molecules and test molecules.
2nd- Do PCA against reference molecules.
3rd- projection test mols to reference molecules chemical space.
4th- Convert molecue to svg text. ( It is nice work of RDKIT! )
5th- pass datas( PC1, PC2, SVG ) to highcharts.js
To convert molecules to SVG is important to visualize molecules in tooltip.

from flask import Flask, render_template
app = Flask( __name__ )

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolDraw2DSVG
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import rdDepictor, Descriptors, AllChem, DataStructs
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import pickle
#structures from drug bank
drugs = [ mol for mol in Chem.SDMolSupplier( &amp;amp;quot;structures.sdf&amp;amp;quot; ) if mol != None ][:500]
#testset
test = [ mol for mol in Chem.SDMolSupplier( &amp;amp;quot;testset.sdf&amp;amp;quot; ) if mol != None ]

def calc_fp_arr( mols ):
    fplist = []
    for mol in mols:
        arr = np.zeros( (1,) )
        fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2 )
        DataStructs.ConvertToNumpyArray( fp, arr )
        fplist.append( arr )
    return np.asarray( fplist )

def getsvgtext( mol ):
    d2d = rdMolDraw2D.MolDraw2DSVG(200,200)
    d2d.DrawMolecule( mol )
    d2d.FinishDrawing()
    svg = d2d.GetDrawingText()
    return svg.replace( &amp;amp;quot;svg:&amp;amp;quot;,&amp;amp;quot;&amp;amp;quot; )

drugfparr = calc_fp_arr( drugs )
testfparr = calc_fp_arr( test )

#do PCA
pca = PCA( n_components=2 )
pca.fit( drugfparr )
f = open( 'drugpca.pkl', 'wb' )
pickle.dump( pca, f )
f.close()

drugsX = pca.transform( drugfparr )
data1 = [ { 'x' : drugsX[i][0], 'y':drugsX[i][1], 'svg': getsvgtext( drugs[i] ) } for i in range(len(drugsX)) ]
testX = pca.transform( testfparr )
data2 = [ {  'x': testX[i][0], 'y':testX[i][1], 'svg': getsvgtext( test[i] ) } for i in range(len(testX)) ]

@app.route( '/' )
@app.route( '/chart' )
def chart():
    return render_template( 'chart.html', data1 = data1, data2 = data2 )

if __name__ == '__main__':
    app.debug = True
    app.run(  )

Next, wrote template ‘chart.html’.
It’s important to load jquery at first, if highcharts is loaded at first following code did not run.
I embedded SVG in tooltip, so useHTML set true.
And another option is almost default settings.
Highcharts can access attribute of dataset like ‘ this.point.hogehoge’.
So, I used this.point.svg to get the svgtext from dataset.

&amp;amp;lt;!DOCTYPE html&amp;amp;gt;
&amp;amp;lt;html&amp;amp;gt;
&amp;amp;lt;head&amp;amp;gt;

    &amp;amp;lt;title&amp;amp;gt; test &amp;amp;lt;/title&amp;amp;gt;
    &amp;amp;lt;script type='text/javascript' src =&amp;amp;quot;{{ url_for('static', filename='jquery-2.2.4.min.js') }}&amp;amp;quot;&amp;amp;gt;&amp;amp;lt;/script&amp;amp;gt;
    &amp;amp;lt;script type='text/javascript' src = &amp;amp;quot;{{ url_for( 'static', filename='highcharts/js/highcharts.js' ) }}&amp;amp;quot;  &amp;amp;gt;&amp;amp;lt;/script&amp;amp;gt;
    &amp;amp;lt;script type='text/javascript' src = &amp;amp;quot;{{ url_for( 'static', filename='highcharts/js/modules/exporting.js' ) }}&amp;amp;quot;  &amp;amp;gt;&amp;amp;lt;/script&amp;amp;gt;

    &amp;amp;lt;script&amp;amp;gt;
    $(function(){
    $('#container').highcharts({
      chart :{
        type : 'scatter',
        zoomType : 'xy'
      },
      title : {
        text : 'chemical space mapping'
      },
      xAxis : {
        title : { text : 'PC1'},
        gridLineWidth : 2,
      },
      yAxis : {
        title : { text : 'PCA2'}
      },
      tooltip :{
        useHTML : true,
        formatter : function(){
          return this.point.svg
        }
      },
      series : [{
        turboThreshold:0,
        name : 'drugs',
        color : 'rgba( 223, 83, 83, .3 )',
        data : {{ data1|safe }}
      },{
        name : 'testmol',
        color : 'rgba( 119, 152, 191, .8 )',
        data : {{ data2|safe }}
      }]
    });
    });
    &amp;amp;lt;/script&amp;amp;gt;

  &amp;amp;lt;/head&amp;amp;gt;
  &amp;amp;lt;body&amp;amp;gt;
    &amp;amp;lt;p&amp;amp;gt; scatter plot &amp;amp;lt;/p&amp;amp;gt;&amp;amp;lt;/br&amp;amp;gt;
    &amp;amp;lt;div id = &amp;amp;quot;container&amp;amp;quot; style = &amp;amp;quot;width:500px; height:500px;&amp;amp;quot;&amp;amp;gt;&amp;amp;lt;/div&amp;amp;gt;

  &amp;amp;lt;/body&amp;amp;gt;
&amp;amp;lt;/html&amp;amp;gt;

Then run code.

python app.py

I got interactive scatter plot.
chemicalspace_pca
Easy to zoom!!
zoom
It works fine. I pushed all code to my github repo.
https://github.com/iwatobipen/chemo_info/tree/master/highcharts_app

plot descriptors in jupyter notebook using highcharts.

Last Saturday, I enjoyed mishima.syk#8.
Main topic was scikit-learn hans-on and . y-sama made great tutorial.
https://github.com/y-sama/sklearn-tutorial
I recommend check the tutorial.

Jupyter notebook is very powerful and flexible tool to analyse and visualise data.
And I found that jupyter notebook can handle html directly.
It’s means I can embed javascript to notebook.
So, I tried to plot molecular data using highcharts.
Highcharts is one of the JS library to visualise data.
Code is very simple.
Lode sdf, calculate molwt and logp, and plot them.

from IPython.display import HTML
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
import json

%%html
<script src="./Highcharts-4.2.5/js/highcharts.js"></script>

mols = PandasTools.LoadSDF( "testset.sdf" )
data =zip(
         list(map(Descriptors.MolWt, mols.ROMol)),  list(map(Descriptors.MolLogP, mols.ROMol ))
      )
data = [ list(i) for i in data ]
chartdict = {
    "chart" : { "type" : 'scatter', 'zoomType' : 'xy' },
    "title" : { "text" : 'scatter_test' },
    "xAxis" : { "title" : { "text" : "mol wt" } },
    "yAxis" : { "title" : { "text" : "logp" } },
    "series": [{ "name" : "mol/logp", "data" : data }]
 }

template = """
           <div id="container" style="width:100%; height:400px;"></div>
           <script type="text/javascript">
           $(function(){{
               $("#container").highcharts({data});
           }}
           );
           </script>
"""
HTML(template.format( data=json.dumps(chartdict)))

That’s all.
And then I got following interactive plot.
Screen Shot 2016-06-04 at 2.07.03 PM
Jupyter notebook is cool!!!
You can check my snipets in following url.
http://nbviewer.jupyter.org/github/iwatobipen/chemo_info/blob/master/highcharts_py/highchartstest.ipynb

memorandum/PandasTools

RDKit has a method called PandasTools that can use molecules object as PandasDataFrame.
It is quite powerful method I think.
I got following error yesterday.

from rdkit import Chem
from rdkit.Chem import PandasTools
df = PandasTools.LoadSDF("testset.sdf")
df

....
--> 131 formatter = pd.core.format.DataFrameFormatter(self,buf=None,columns=None,col_space=None,colSpace=None,header=True,index=True,
132 na_rep='NaN',formatters=None,float_format=None,sparsify=None,index_names=True,
133 justify = None, force_unicode=None,bold_rows=True,classes=None,escape=False)
AttributeError: module 'pandas.core' has no attribute 'format'
...

I could not solve the problem, but member of RDKit user group gave me advise, the problem was depend on pandas.
My pandas was version 0.18.1 and the “format” module got moved from pandas.core to pandas.formats.

So to fix the problem, change version is the easiest way.
I reinstalled pandas 0.18.0 instead of 0.18.1. And All worked fine. ;-)